Example #1
0
def main():
    usage = "usage: %prog [Options]"
    parser = OptionParser(usage=usage)
    parser.add_option("-o", metavar="EXTENSION", help="Old EXTENSION", action="store", type="string", dest="old")
    parser.add_option("-n", metavar="EXTENSION", help="New EXTENSION", action="store", type="string", dest="new")
    parser.add_option("--convert", help="Do convert genbank file into embl", action="store_true", dest="convert")

    (options, args) = parser.parse_args()
    
    # Print help if no argument given
    if util.printHelp(options):
        parser.print_help()
        sys.exit()
    
    for file in os.listdir('.'):
        if options.old in file:
            oldfile = file
            newfile = "%s.%s" % (oldfile.split(".")[0], options.new)
            print "Convert file %s into %s" % (oldfile, newfile)
            if options.convert:
                cmd = "seqret -sequence gb::%s -feature Yes -outseq embl::%s" % (oldfile, newfile)
                try:
                    util.runProcess(cmd)
                except Exception, e:
                    print "Error to convert %s" % oldfile
                    print e
Example #2
0
def doValidate():
    cmd = "%s/tbl2asn -p /Users/ap12/Documents/metahit_data/EMBLValidation -t /Users/ap12/Documents/metahit_data/EMBLValidation/template -V v" % os.path.realpath(os.path.dirname(__file__))
    try:
        util.runProcess(cmd)
    except Exception, e:
        log.error(traceback.extract_stack())
        log.error(e)
Example #3
0
def main():
    usage = "usage: %prog [Options]"
    parser = OptionParser(usage=usage)
    parser.add_option("-o", metavar="EXTENSION", help="val EXTENSION", action="store", type="string", dest="old")
    parser.add_option("-n", metavar="EXTENSION", help="err EXTENSION", action="store", type="string", dest="new")
    parser.add_option("--extract", help="Extract ERRORs only", action="store_true", dest="extract")

    (options, args) = parser.parse_args()
    
    # Print help if no argument given
    if util.printHelp(options):
        parser.print_help()
        sys.exit()
    
    for file in os.listdir('.'):
        if options.old in file:
            oldfile = file
            newfile = "%s.%s" % (oldfile.split(".")[0], options.new)
            print "Convert file %s into %s" % (oldfile, newfile)
            if options.extract:
                cmd = "grep ERROR %s > %s" % (oldfile, newfile)
                try:
                    util.runProcess(cmd)
                except Exception, e:
                    print "Error to extract %s" % oldfile
                    print e
Example #4
0
def has_same_md5(ori_file, copied_file):
    """Return True when both files have identical md5 checksums.

    Parameters:
        ori_file    -- path of the original file
        copied_file -- path of the copy to verify

    Shells out to md5sum; output is "<digest>  <filename>", so only the
    first whitespace-separated token (the digest) is compared.
    """
    cmd = "md5sum %s"
    ori_md5 = util.runProcess(cmd % ori_file).split()[0]
    copied_md5 = util.runProcess(cmd % copied_file).split()[0]
    # Fixed: the if/else returning True/False was redundant -- return
    # the boolean comparison directly.
    return ori_md5 == copied_md5
Example #5
0
def infoseq(file):
    """
    Display basic information about the sequences in *file* by running
    EMBOSS infoseq (length only, no heading); the report is written to
    <file>.infoseq.
    """
    util.checkFile(file)
    util.checkSoft("infoseq")
    report = "%s.infoseq" % file
    util.runProcess("infoseq -only -length -noheading %s -outfile %s" % (file, report))
Example #6
0
def main():
    """Load each organism's EMBL file into a chado database.

    The -l file lists '||'-separated records of the form
    common_name||taxon_id||filename; each file is loaded with chado_load
    using connection parameters taken from the -D style arguments.
    """
    parser = OptionParser(usage="usage: %prog [Options]")
    parser.add_option("-l", "--list", metavar="FILE", help="FILE containing the list of all organism common names and its associated file to load", action="store", type="string", dest="list")
    parser.add_option("-D", action="store", dest="dbhost")

    options, args = parser.parse_args()

    # Show usage and quit when invoked without arguments.
    if util.printHelp(options):
        parser.print_help()
        sys.exit()

    # Echo the full command line into the log.
    logger.info("$ python " + " ".join(sys.argv) + " ")

    # Record where the log file lives.
    logger.info(logsetup.logpath)

    # Database connection parameters (password deliberately not read here).
    host = ropy.util.getDArg("dbhost", raiseOnEmpty = True)
    database = ropy.util.getDArg("dbname", raiseOnEmpty = True)
    port = ropy.util.getDArg("dbport", raiseOnEmpty = True)
    user = ropy.util.getDArg("dbuser", raiseOnEmpty = True)

    # Check if chado_load is installed
    util.isSoftInstalled("chado_load")

    # Read organism common names and load the related embl files.
    for record in open(options.list, "r"):
        # '!' marks a comment line; lines without '||' are ignored.
        if record[0] == '!':
            continue
        if record.count('||') < 1:
            continue
        # ! common_name||taxon_id||filename
        fields = record.strip().split('||')
        common_name = fields[0]
        filename = fields[2]
        util.checkFile(filename)
        loader_cmd = "chado_load embl -o %s -t contig -D %s:%s/%s -U %s %s" % (common_name, host, port, database, user, filename)
        util.runProcess(loader_cmd)
Example #7
0
def doConvert(embl_file, tbl_file, locus_tag):
    """Produce an NCBI feature table (.tbl) from an EMBL file.

    seqret first rewrites the .4dep.embl file as .4val.embl to obtain a
    standard ID line; every feature except 'source' and 'gap' is then
    written out with its qualifiers (minus 'translation') followed by a
    fixed ab-initio inference line.
    """
    # convert .4dep.embl into .4val.embl to get a std ID line
    embl_file_val = embl_file.replace('4dep','4val')
    util.runProcess("seqret -sequence embl::%s -feature Yes -outseq embl::%s" % (embl_file, embl_file_val))
    # convert .4val.embl into tbl file
    record = SeqIO.read(open(embl_file_val), "embl")
    table = open(tbl_file, 'w')
    table.write('>Feature %s\n' % locus_tag)
    for feature in record.features:
        if 'source' in feature.type or 'gap' in feature.type:
            continue
        table.write('%s\t%s\n' % (getLocation(feature), feature.type))
        for qualifier in feature.qualifiers:
            if 'translation' in qualifier:
                continue
            table.write('\t\t%s\t%s\n' % (qualifier, feature.qualifiers[qualifier][0]))
        table.write('\t\tinference\tab initio prediction:IMG/ER\n')
    table.close()
Example #8
0
def doRun():
    usage = "usage: %prog [Options]"
    parser = OptionParser(usage=usage)
    parser.add_option("-o", metavar="NAME", help="organism common name", action="store", type="string", dest="name")
    parser.add_option("-i", metavar="FILE", help="input organism sequence file in FASTA format", action="store", type="string", dest="input")
    (options, args) = parser.parse_args()

    try:
        common_name = options.name
        input_file = checkValidInput(options.input, common_name)
    
        # Print info
        log.info("Running Glimmer3 on %s\n" % common_name)
        log.info("Getting sequence from %s\n" % input_file)
    
        # Run glimmer3 iterated
        script = "/software/pathogen/external/applications/glimmer/glimmer/scripts/g3-iterated.csh"
        util.checkFile(script)
        cmd = "%s %s %s" % (script, input_file, common_name)
        util.runProcess(cmd)
    
        # Run the conversion only if g3 successful 
        g3_predict_file = "%s.predict" % common_name
        if os.path.exists(g3_predict_file):
            # Convert output results into a feature table EMBL file.
            g3_tab = convertToTab(g3_predict_file, common_name)
        
            # Tidy up
            util.rmFile(common_name + ".longorfs")
            util.rmFile(common_name + ".train")
            util.rmFile(common_name + ".icm")
            util.rmFile(common_name + ".run1.detail")
            util.rmFile(common_name + ".run1.predict")
            util.rmFile(common_name + ".coords")
            util.rmFile(common_name + ".upstream")
            util.rmFile(common_name + ".motif")
            util.rmFile(common_name + ".detail")
            util.rmFile(g3_predict_file)
    
            log.info("%s is the final feature table Glimmer3 predictions\n" % g3_tab)
        else:
            log.info("%s file does not exists\n" % g3_predict_file)
    except Exception, e:
        log.error(e)
        raise e
Example #9
0
def union(file, common_name, locus_tag, organism_name, strain):
    """
    Merge scaffolds into one sequence file.

    When the FASTA file holds more than one record (more than one '>'),
    EMBOSS union piped through descseq writes a merged, renamed sequence
    to <common_name>.fsa and that file name is returned; otherwise the
    input file is returned unchanged.
    """
    util.checkFile(file)
    record_count = int(util.runProcess("grep '>' %s | wc -l" % file))
    if record_count <= 1:
        return file
    merged_file = "%s.fsa" % common_name
    util.checkSoft("union")
    util.checkSoft("descseq")
    name = "%s [organism=%s] [strain=%s] [gcode=11]" % (locus_tag, organism_name, strain)
    util.runProcess("union -sequence %s -stdout Yes -auto Yes | descseq -filter Yes -name '%s' -auto Yes > %s" % (file, name, merged_file))
    return merged_file
Example #10
0
def checkValidInput(input_file, common_name):
    """
    Check that the input fasta sequence file is suitable for submission.

    RAST re-arranges the scaffolds when a split sequence set is
    submitted, so if the file holds more than one '>' record it is merged
    into <common_name>.fna via EMBOSS union/descseq and that file name is
    returned; otherwise the input file is returned unchanged.
    """
    util.checkFile(input_file)
    record_count = int(util.runProcess("grep '>' %s | wc -l" % input_file))
    if record_count <= 1:
        return input_file
    merged_file = "%s.fna" % common_name
    util.checkSoft("union")
    util.checkSoft("descseq")
    util.runProcess("union -sequence %s -stdout Yes -auto Yes | descseq -filter Yes -name '%s' -auto Yes > %s" % (input_file, common_name, merged_file))
    return merged_file
Example #11
0
def doRun():
    usage = "usage: %prog [Options]"
    parser = OptionParser(usage=usage)
    parser.add_option("-o", metavar="NAME", help="organism common name", action="store", type="string", dest="name")
    parser.add_option(
        "-i",
        metavar="FILE",
        help="input organism sequence file in FASTA format",
        action="store",
        type="string",
        dest="input",
    )
    (options, args) = parser.parse_args()

    try:
        common_name = options.name
        input_file = checkValidInput(options.input, common_name)
        output_file = "%s.prodigal" % common_name

        # Print info
        log.info("Running prodigal on %s\n" % common_name)
        log.info("Getting sequence from %s\n" % input_file)

        # Run prodigal
        softname = "prodigal"
        util.checkSoft(softname)
        cmd = "%s < %s > %s" % (softname, input_file, output_file)
        util.runProcess(cmd)

        # Run the conversion only if successful
        if os.path.exists(output_file):
            # Convert output results into a feature table EMBL file.
            tab_file = convertToTab(output_file, common_name)

            # Tidy up
            util.rmFile(common_name + ".fna")
            util.rmFile(output_file)

            log.info("%s is the final feature table Prodigal predictions\n" % tab_file)
        else:
            log.info("%s file does not exists\n" % output_file)
    except Exception, e:
        log.error(e)
        raise e
Example #12
0
def checkValidInput(input_file, common_name):
    """
    Check if the input fasta sequence file is of correct format.
    Segmentation fault while running glimmer on splitted sequences with a fasta file
    Run EMBOSS union before if more than one '>' is found

    Returns the (possibly merged) sequence file name: <common_name>.fna
    when a merge was needed, the original input_file otherwise.
    Re-raises any util.UtilException unchanged.
    """
    try:
        util.checkFile(input_file)
        cmd = "grep '>' %s | wc -l" % input_file
        result = util.runProcess(cmd)
        if int(result) > 1:
            new_input_file = "%s.fna" % common_name
            util.checkSoft("union")
            util.checkSoft("descseq")
            cmd_union = "union -sequence %s -stdout Yes -auto Yes | descseq -filter Yes -name '%s' -auto Yes > %s" % (input_file, common_name, new_input_file)
            util.runProcess(cmd_union)
            return new_input_file
        else:
            return input_file
    except util.UtilException:
        # Bug fix: use a bare raise so the original traceback is kept --
        # "raise ue" would rebind the traceback to this frame in Python 2.
        raise
Example #13
0
def main():
    usage = "usage: %prog [Options]"
    parser = OptionParser(usage=usage)
    parser.add_option("-o", metavar="EXTENSION", help="Old EXTENSION", action="store", type="string", dest="old")
    parser.add_option("-n", metavar="EXTENSION", help="New EXTENSION", action="store", type="string", dest="new")
    parser.add_option("--rename", help="Do rename", action="store_true", dest="rename")

    (options, args) = parser.parse_args()
    
    # Print help if no argument given
    if util.printHelp(options):
        parser.print_help()
        sys.exit()
    
    for file in os.listdir('.'):
        if options.old in file:
            oldfile = file
            newfile = "%s.%s" % (oldfile.split(".")[0], options.new)
            print "Rename old file %s into %s" % (oldfile, newfile)
            if options.rename:
                cmd = "mv %s %s" % (oldfile, newfile)
                util.runProcess(cmd)
    if not options.rename:
        print "To perform the action, please use --rename"
Example #14
0
def main():
    """Build metadata index files for a set of 454 sequencing runs.

    Reads the '||'-separated run list given with -l and writes three
    files under <outpath>/metadata: sequence.index, samples.info and
    assembly.index.  On LSF, with --fastq, a job is submitted per run to
    convert its sff file into gzipped fastq files under <outpath>/fastq.
    With --md5, sequence.index is rewritten with real md5 checksums in
    place of the 'md5' placeholders once the fastq files exist.
    """
    usage = "usage: %prog [Options]"
    parser = OptionParser(usage=usage)
    parser.add_option("-l", metavar="FILE", help="FILE containing the list of all 454 runs and its associated information", action="store", type="string", dest="list")
    parser.add_option("-o", "--outpath", metavar="PATH", help="PATH where to generate indexes and temporary fastq files", action="store", type="string", dest="outpath")
    parser.add_option("--fastq", help="Do generate fastq files.", action="store_true", dest="fastq")
    parser.add_option("--md5", help="Do run md5sum on generated fastq files.", action="store_true", dest="md5")
    
    (options, args) = parser.parse_args()

    # NOTE(review): 'and' means help is only shown when BOTH options are
    # missing -- possibly intended to be 'or'; confirm before changing.
    if not options.list and not options.outpath:
        parser.print_help()
        sys.exit()
    
    # input file
    input_file = options.list
    util.checkFile(input_file)
    input_lines = open(input_file, "r").readlines()

    # output path
    output_path = options.outpath
    util.checkDir(output_path)
    out_metadata = "%s/metadata" % output_path
    util.checkDir(out_metadata)
    out_fastq = "%s/fastq" % output_path
    util.checkDir(out_fastq)

    # checking file format first before processing it
    # ('!' marks a comment line; valid lines have exactly 9 '||'-separated fields)
    lines = []
    for line in input_lines:
        if line[0] == '!':
            continue
        elif not line.count('||') == 8:
            log.error("line is not well formated. Please check your input file.")
            log.error(line.count('||'))
            log.error(line)
            sys.exit()
        else:
            lines.append(line)
    log.debug(lines)

    # opening output files
    sequence_index_filename = '%s/sequence.index' % out_metadata
    sequence_index = open(sequence_index_filename, 'w')
    samples_info = open('%s/samples.info' % out_metadata, 'w')
    assembly_index = open('%s/assembly.index' % out_metadata, 'w')

    # processing input file
    sample_count = 0
    for line in lines:
        line = line.strip()
        values = line.split('||')
        log.info(line)
        # Field layout (inferred from the indexing below -- TODO confirm):
        # project||genus||species||strain||run||paired||insert_size||sff_path||assembly_dir
        genus = values[1]
        species = values[2]
        strain = values[3]
        organism_name = "%s %s %s" % (genus, species, strain)
        sample_count = sample_count + 1
        run = values[4]
        # NOTE(review): 'paired' is computed but never used below.
        if values[5] == '1':
            paired = 'PAIRED'
        else:
            paired = 'SINGLE'
        insert_size = values[6]
        sff_file = "/nfs/%s" % values[7]
        trim_status = "/nfs/%s/454TrimStatus.txt" % values[8]
        contigs_file = "/nfs/%s/454LargeContigs.fna" %values[8]
        scaffolds_file = "/nfs/%s/454Scaffolds.fna" %values[8]

        # Sanitise species/strain names for use in paths and project names.
        # NOTE(review): strain_4_hierarchy is computed but never used below.
        species_strain = sub('[^a-zA-Z0-9_]', '_', "%s_%s" % (species, strain)).replace('__', '_')
        strain_4_hierarchy = sub('[^a-zA-Z0-9_]', '_', strain).replace('__', '_')
        study = "%s_%s%s" % (values[0], genus[0], species_strain)

        # check that project name (study) is less than 40 char
        # mysql> desc project;
        # | name           | varchar(40)           | NO   | MUL |         |                |
        # | hierarchy_name | varchar(40)           | NO   | MUL |         |                |
        if len(study) > 40:
            log.warning("Project name %s has more than 40 char." % study)

        # checking files
        util.checkFile(sff_file)
        util.checkFile(trim_status)
        util.checkFile(contigs_file)
        util.checkFile(scaffolds_file)

        # convert sff into fastq
        outprefix = "%s/%s" % (out_fastq, run)
        cmd_sff2fastq = "/nfs/users/nfs_m/mh12/svn-repository/pathogen/user/mh12/python/454TrimStatus2reads.py --pair_suffix=/1,/2 --sff %s %s %s" % (sff_file, trim_status, outprefix)
        fastq_pairs = "%s-pairs.fastq" % outprefix
        fastq_single = "%s-single.fastq" % outprefix

        # split fastq pairs file
        fastq_1 = "%s_1.fastq" % outprefix
        fastq_2 = "%s_2.fastq" % outprefix
        cmd_splitfastq = "/nfs/users/nfs_m/mh12/svn-repository/pathogen/user/mh12/python/fastn_unshuffle.py %s %s %s" % (fastq_pairs, fastq_1, fastq_2)

        # rename fastq single file
        fastq_0 = "%s.fastq" % outprefix
        cmd_rename = "mv %s %s" % (fastq_single, fastq_0)

        # tidy-up
        cmd_remove = "rm %s-info.txt; rm %s-pairs.fastq" % (outprefix, outprefix)

        # gzip fastq files
        cmd_gzip = "gzip %s; gzip %s; gzip %s" % (fastq_1, fastq_2, fastq_0)

        # all commands, chained into one shell invocation for one LSF job
        cmd = "%s; %s; %s; %s; %s" % (cmd_sff2fastq, cmd_splitfastq, cmd_rename, cmd_remove, cmd_gzip)


        # NOTE(review): IS_LSF looks like a module-level flag defined
        # outside this function -- confirm where it is set.
        if IS_LSF:
            if not (os.path.exists("%s.gz" % fastq_1) and os.path.exists("%s.gz" % fastq_2) and os.path.exists("%s.gz" % fastq_0)):
                if options.fastq:
                    util.submitJob(jobname='sff2fastq_%s' % run, cmd=cmd, outdir=out_metadata)
                else:
                    log.info("fastq files do not exist, use '--fastq' to generate them.")
            else:
                log.info("fastq files already exist.")
        else:
            log.info("Need to be run on LSF.")

        instrument_platform = '454'
        empty = 'n/a'
        fastq_1_gz = "%s.gz" % fastq_1
        fastq_2_gz = "%s.gz" % fastq_2
        fastq_0_gz = "%s.gz" % fastq_0

        # write to output files ('md5' is a placeholder replaced by --md5 below)
        # sequence.index: fastq_file|md5|run_id|study_id|(study_name)|center_name|(submission_id)|(submission_date)|sample_id|sample_name|
        #                 (population)|(experiment_id)|instrument_platform|(instrument_model)|library_name|(run_name)|(run_block_name)|
        #                 insert_size|(library_layout)|paired_fastq|withdrawn|(withdrawn_date)|(comment)|read_count|base_count
        sequence_index.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n"
                             % (fastq_1_gz, 'md5', run, study, study, 'SC', empty, empty, strain, strain, strain, empty, instrument_platform,
                                empty, run, empty, empty, insert_size, 'PAIRED', fastq_2_gz, '0', empty, empty, '0', '0'))
        sequence_index.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n"
                             % (fastq_2_gz, 'md5', run, study, study, 'SC', empty, empty, strain, strain, strain, empty, instrument_platform,
                                empty, run, empty, empty, insert_size, 'PAIRED', fastq_1_gz, '0', empty, empty, '0', '0'))
        sequence_index.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n"
                             % (fastq_0_gz, 'md5', run, study, study, 'SC', empty, empty, strain, strain, strain, empty, instrument_platform,
                                empty, run, empty, empty, insert_size, 'SINGLE', '', '0', empty, empty, '0', '0'))

        # samples.info: lookup_name|acc|individual_name|alias|population_name|species_name|taxon_id|sex
        samples_info.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (strain, strain, strain, strain, strain, organism_name, empty, empty))

        # assembly.index: study||genus||species_strain||assembly_id||contig||scaffold||fastq(run)
        # assembly_id is derived from the assembly dir's timestamp, e.g.
        # /pyrodata01/assemblies/Ruminococcus/obeum/A2162/P_2009_07_12_22_16_23_runAssembly/454TrimStatus.txt
        assembly_id = "newbler_%s" % trim_status.split('/P_')[1][:10]
        assembly_index.write("%s||%s||%s||%s||%s||%s||%s\n" % (study, genus, species_strain, assembly_id, contigs_file, scaffolds_file, run))

    # close files
    sequence_index.close()
    samples_info.close()
    assembly_index.close()

    if not options.fastq:
        log.info("Use '--fastq' for generating fastq files")

    if options.md5:
        # calculate md5 and modify sequence.index
        # (read everything back first, then reopen for writing -- truncates the file)
        util.checkFile(sequence_index_filename)
        seq_lines = open(sequence_index_filename, "r").readlines()
        sequence_index = open(sequence_index_filename, 'w')
        for line in seq_lines:
            values = line.split('\t')
            fastq = values[0]
            run = values[2]
            if os.path.exists(fastq):
                md5 = util.runProcess("md5sum %s | cut -d ' ' -f 1" % fastq).strip()
                line = line.replace('md5', md5)
                sequence_index.write(line)
            else:
                # NOTE(review): a missing fastq drops its line from the
                # rewritten sequence.index entirely -- confirm intended.
                log.info("fastq file %s does not exist, use '--fastq' for generating it." % fastq)

        # close file
        sequence_index.close()
    else:
        log.info("When all submitted jobs end, use '--md5' for updating sequence.index with md5sum.")
Example #15
0
def main():
    """Build metadata index files for 454 runs listed in a Google spreadsheet.

    For the chosen category, fetch the '||'-separated run records, check
    whether each run's fastq files are loaded in the tracking database
    and/or already present in the hierarchy or the .tmp staging area, and
    write sequence.index / samples.info / assembly.index under
    <root>/.tmp/metadata/<category> accordingly.  Single-ended runs are
    skipped (not implemented).
    """
    usage = "usage: %prog [Options]"
    parser = OptionParser(usage=usage)
    parser.add_option("-r", "--root", metavar="PATH", help="PATH to the root of the hierarchy", action="store", type="string", dest="root")
    parser.add_option("-c", "--category", metavar="CATEGORY", help="name of the category from %s" % constants.CATEGORY, action="store", choices=constants.CATEGORY, dest="category")
    
    (options, args) = parser.parse_args()

    if not (options.root and options.category):
        parser.print_help()
        sys.exit()
    
    # get the data from Google spreadsheet
    lines = gdocs.getValues(doc='%s_454_Projects' % options.category.title())

    # check output path
    util.checkDir(options.root)
    out_metadata = "%s/.tmp/metadata/%s" % (options.root, options.category)
    util.checkDir(out_metadata)
    out_fastq = "%s/.tmp/fastq/%s" % (options.root, options.category)
    util.checkDir(out_fastq)

    # open output files
    sequence_index = open('%s/sequence.index' % out_metadata, 'w')
    samples_info = open('%s/samples.info' % out_metadata, 'w')
    assembly_index = open('%s/assembly.index' % out_metadata, 'w')

    # process input data
    sample_count = 0
    for line in lines:
        line = line.strip()
        values = line.split('||')
        # Field layout (inferred from the indexing below -- TODO confirm):
        # project||genus||species||strain||sample||library||run||paired||insert_size||sff_path||assembly_dir
        genus = values[1]
        species = values[2]
        strain = values[3]
        organism_name = "%s %s %s" % (genus, species, strain)
        sample_count = sample_count + 1
        sample = values[4]
        # 'None' in the spreadsheet means "fall back to the default".
        if sample == 'None':
            sample = strain
        library = values[5]
        run = values[6]
        if library == 'None':
            library = run
        if values[7] == '1':
            paired = 'PAIRED'
        else:
            paired = 'SINGLE'
            log.error("Single read set for %s. Not implemented." % run)
            continue
        insert_size = values[8]
        sff_file = "/nfs/%s" % values[9]
        trim_status = "/nfs/%s/454TrimStatus.txt" % values[10]
        contigs_file = "/nfs/%s/454LargeContigs.fna" %values[10]
        scaffolds_file = "/nfs/%s/454Scaffolds.fna" %values[10]

        # Sanitise species/strain names for use in paths and project names.
        # NOTE(review): strain_4_hierarchy is computed but never used below.
        species_strain = sub('[^a-zA-Z0-9_]', '_', "%s_%s" % (species, strain)).replace('__', '_')
        strain_4_hierarchy = sub('[^a-zA-Z0-9_]', '_', strain).replace('__', '_')
        study = "%s_%s%s" % (values[0], genus[0], species_strain)

        instrument_platform = '454'
        empty = 'n/a'
        fastq_1_gz = "%s/%s_1.fastq.gz" % (out_fastq, run)
        fastq_2_gz = "%s/%s_2.fastq.gz" % (out_fastq, run)
        fastq_0_gz = "%s/%s.fastq.gz" % (out_fastq, run)

        # check that project name (study) is less than 40 char
        # mysql> desc project;
        # | name           | varchar(40)           | NO   | MUL |         |                |
        # | hierarchy_name | varchar(40)           | NO   | MUL |         |                |
        if len(study) > 40:
            log.warning("Project name %s has more than 40 char." % study)

        # checking files
        util.checkFile(sff_file)
        util.checkFile(trim_status)
        util.checkFile(contigs_file)
        util.checkFile(scaffolds_file)

        log.info("> checking fastq files: %s" % run)
        # get lane hierarchy path (run); "undefined" means not in the db
        run_path = util.runProcess("/software/bin/perl /nfs/users/nfs_a/ap12/genlibpy/genepy/pathtrack/get_lane_hierarchy_path.pl --lane=%s --db=%s" % (run, constants.DATABASE[options.category])).strip()

        # check if fastq files have been loaded in db
        do_generate_indexes  = False
        do_generate_assembly_indexes = False
        if run_path != "undefined":
            log.info("  loaded in db.")
            fastq_path = "%s/%s/seq-pipelines/%s" % (options.root, options.category, run_path)
            util.checkDir(fastq_path)
            # check if fastq files have been imported into the hierarchy
            if not (os.path.exists("%s/%s_1.fastq.gz" % (fastq_path, run)) and os.path.exists("%s/%s_2.fastq.gz" % (fastq_path, run)) and os.path.exists("%s/%s.fastq.gz" % (fastq_path, run))):
                log.info("  not imported into hierarchy.")
                # check if fastq files have been generated from sff into tmp dir
                if not (os.path.exists(fastq_1_gz) and os.path.exists(fastq_2_gz) and os.path.exists(fastq_0_gz)):
                    log.info("  not generated from sff files.")
                else:
                    log.info("  generate indexes.")
                    do_generate_indexes = True
            else:
                log.info("  already imported into hierarchy.")
                do_generate_assembly_indexes = True
        else:
            log.info("  not loaded in db.")
            # check if fastq files have been generated from sff into tmp dir
            if not (os.path.exists(fastq_1_gz) and os.path.exists(fastq_2_gz) and os.path.exists(fastq_0_gz)):
                log.info("  not generated from sff files.")
            else:
                log.info("  generate indexes.")
                do_generate_indexes = True
        
        # generate sequence and sample indexes
        if do_generate_indexes:
            # calculate md5
            md5_1 = util.runProcess("md5sum %s | cut -d ' ' -f 1" % fastq_1_gz).strip()
            md5_2 = util.runProcess("md5sum %s | cut -d ' ' -f 1" % fastq_2_gz).strip()
            md5_0 = util.runProcess("md5sum %s | cut -d ' ' -f 1" % fastq_0_gz).strip()

            # write to output files
            # sequence.index: fastq_file|md5|run_id|study_id|(study_name)|center_name|(submission_id)|(submission_date)|sample_id|sample_name|
            #                 (population)|(experiment_id)|instrument_platform|(instrument_model)|library_name|(run_name)|(run_block_name)|
            #                 insert_size|(library_layout)|paired_fastq|withdrawn|(withdrawn_date)|(comment)|read_count|base_count
            sequence_index.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n"
                                 % (fastq_1_gz, md5_1, run, study, study, 'SC', empty, empty, sample, sample, strain, empty, instrument_platform,
                                    empty, library, empty, empty, insert_size, 'PAIRED', fastq_2_gz, '0', empty, empty, '0', '0'))
            sequence_index.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n"
                                 % (fastq_2_gz, md5_2, run, study, study, 'SC', empty, empty, sample, sample, strain, empty, instrument_platform,
                                    empty, library, empty, empty, insert_size, 'PAIRED', fastq_1_gz, '0', empty, empty, '0', '0'))
            sequence_index.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n"
                                 % (fastq_0_gz, md5_0, run, study, study, 'SC', empty, empty, sample, sample, strain, empty, instrument_platform,
                                    empty, library, empty, empty, insert_size, 'SINGLE', '', '0', empty, empty, '0', '0'))

            # samples.info: lookup_name|acc|individual_name|alias|population_name|species_name|taxon_id|sex
            samples_info.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (strain, strain, strain, strain, strain, organism_name, empty, empty))
        
        # generate assembly indexes
        if do_generate_indexes or do_generate_assembly_indexes:
            # assembly.index: study||genus||species_strain||assembly_id||contig||scaffold||fastq(run)
            # assembly_id is derived from the assembly dir's timestamp, e.g.
            # /pyrodata01/assemblies/Ruminococcus/obeum/A2162/P_2009_07_12_22_16_23_runAssembly/454TrimStatus.txt
            assembly_id = "newbler_%s" % trim_status.split('/P_')[1][:10]
            assembly_index.write("%s||%s||%s||%s||%s||%s||%s\n" % (study, genus, species_strain, assembly_id, contigs_file, scaffolds_file, run))

    # close files
    sequence_index.close()
    samples_info.close()
    assembly_index.close()
Example #16
0
def _ensure_dir(path):
    """Create directory *path* if it does not exist yet; log either outcome."""
    if not os.path.exists(path):
        os.makedirs(path)
        log.info("%s created" % path)
    else:
        log.info("%s path already exists" % path)


def _copy_and_verify(source, destination):
    """Copy *source* to *destination* (only if absent) and verify the copy.

    The md5 of both files is compared in either case; a mismatch is logged
    as an error.
    """
    util.checkFile(source)
    if not os.path.exists(destination):
        util.runProcess("cp %s %s" % (source, destination))
    else:
        log.info("%s file already exists" % destination)
    if not has_same_md5(source, destination):
        # BUG FIX: the original calls were missing the '%' operator
        # ("..." (a, b)), which raised a TypeError instead of logging.
        # Argument order also corrected: first %s is the copy, second the original.
        log.error("Copied file %s is not the same as original file %s" % (destination, source))


def main():
    """Import assembly results (contigs and scaffolds) into the hierarchy.

    Reads an assembly index file (one '||'-separated line per run/lane),
    copies the contig and scaffold fasta files into the ASSEMBLY area of the
    hierarchy, symlinks the run fastqs next to them, and submits one
    stats/indexing job per project for the QC pipeline.
    """
    usage = "usage: %prog [Options]"
    parser = OptionParser(usage=usage)
    parser.add_option("-a", "--assembly", metavar="FILE", help="FILE containing the list of all contigs and scaffolds to import", action="store", type="string", dest="assembly")
    parser.add_option("-r", "--root", metavar="PATH", help="PATH to the root of the hierarchy", action="store", type="string", dest="root")
    parser.add_option("-c", "--category", metavar="CATEGORY", help="name of the category from %s" % constants.CATEGORY, action="store", choices=constants.CATEGORY, dest="category")

    (options, args) = parser.parse_args()

    if not (options.assembly and options.root and options.category):
        parser.print_help()
        sys.exit()

    # The hierarchy root must already exist (it is created by the fastq pipeline).
    if not os.path.exists(options.root):
        log.error("%s path do not exist" % options.root)
        log.error("Create root path first, then run pipeline before importing assembly files.")
        sys.exit()

    # Read the input assembly file - one line per run (lane).
    # FIX: close the file handle instead of leaking it.
    util.checkFile(options.assembly)
    assembly_file = open(options.assembly, "r")
    try:
        assembly_lines = assembly_file.readlines()
    finally:
        assembly_file.close()

    # A project may have more than one run; jobs are submitted once per project.
    previous_project = ""
    is_new_project = True
    for line in assembly_lines:
        # Lines starting with '!' are commented out; lines that do not have
        # exactly seven '||'-separated fields are silently skipped.
        if line[0] == '!':
            continue
        if not line.count('||') == 6:
            continue
        line = line.strip()
        values = line.split('||')
        project = values[0]
        genus = values[1]
        species = values[2]
        assembly_id = values[3]
        contig_file = values[4]
        scaffold_file = values[5]
        run = values[6]

        # Resolve the lane hierarchy path for this run via the pathtrack helper.
        run_path = util.runProcess("/software/bin/perl /nfs/users/nfs_a/ap12/genlibpy/genepy/pathtrack/get_lane_hierarchy_path.pl --lane=%s --db=%s" % (run, constants.DATABASE[options.category])).strip()
        fastq_file = "%s/%s/seq-pipelines/%s" % (options.root, options.category, run_path)

        # Track project changes so jobs are only submitted for the first run
        # of each project.
        if project == previous_project:
            is_new_project = False
        else:
            previous_project = project
            is_new_project = True

        # The species path must have been created by the fastq import pipeline.
        species_path = "%s/%s/seq-pipelines/%s/%s" % (options.root, options.category, genus, species)
        if not os.path.exists(species_path):
            log.error("%s path do not exist" % species_path)
            log.error("Run fastq import pipeline before importing assembly files.")
            continue

        # Create ASSEMBLY and assembly_id paths (e.g. ASSEMBLY/newbler_2009_06_29).
        assembly_path = "%s/ASSEMBLY" % species_path
        _ensure_dir(assembly_path)
        assembly_id_path = "%s/%s" % (assembly_path, assembly_id)
        _ensure_dir(assembly_id_path)

        # Copy contigs and scaffolds files into the hierarchy and md5-check them.
        contig_file_hierarchy = "%s/LargeContigs.fna" % assembly_id_path
        _copy_and_verify(contig_file, contig_file_hierarchy)
        scaffold_file_hierarchy = "%s/Scaffolds.fna" % assembly_id_path
        _copy_and_verify(scaffold_file, scaffold_file_hierarchy)

        # Create the fastqs path and symlink the run fastqs into it.
        fastqs_path = "%s/fastqs" % assembly_id_path
        _ensure_dir(fastqs_path)
        util.checkDir(fastq_file)
        symlink = "%s/%s" % (fastqs_path, run)
        if not os.path.exists(symlink):
            os.symlink(fastq_file, symlink)
            log.info("%s symlink created" % symlink)
        else:
            log.info("%s symlink already exists" % symlink)

        # Build one shell command chaining samtools faidx, ref-stats and
        # bwa index to generate the extra files required by the QC pipeline.
        cmd = ""
        if not os.path.exists("%s.fai" % scaffold_file_hierarchy):
            cmd = "samtools faidx %s; " % scaffold_file_hierarchy
        else:
            log.info("%s.fai already exists" % scaffold_file_hierarchy)

        if not os.path.exists("%s.refstats" % scaffold_file_hierarchy):
            cmd = cmd + "ref-stats -r %s > %s.refstats; " % (scaffold_file_hierarchy, scaffold_file_hierarchy)
        else:
            log.info("%s.refstats already exists" % scaffold_file_hierarchy)

        if not os.path.exists("%s.bwt" % scaffold_file_hierarchy):
            cmd = cmd + "bwa index %s; " % scaffold_file_hierarchy
        else:
            log.info("%s.bwt already exists" % scaffold_file_hierarchy)

        # Append stats.py runs on contigs and scaffolds when not yet done.
        cmd_stats = "python /nfs/users/nfs_a/ap12/genlibpy/genepy/pathtrack/stats.py -f %s; "
        if not os.path.exists("%s.stats" % contig_file_hierarchy):
            cmd = cmd + cmd_stats % contig_file_hierarchy
        else:
            log.info("%s.stats already exists" % contig_file_hierarchy)
        if not os.path.exists("%s.stats" % scaffold_file_hierarchy):
            cmd = cmd + cmd_stats % scaffold_file_hierarchy
        else:
            log.info("%s.stats already exists" % scaffold_file_hierarchy)

        # Submit the accumulated commands as a single job, once per project.
        if is_new_project and not cmd == "":
            util.submitJob(jobname='stats_%s_%s' % (project, assembly_id), cmd=cmd, outdir=assembly_id_path)
Example #17
0
### ---------------------------------------------------------------------------
def doValidate():
    """Run NCBI tbl2asn over the EMBLValidation directory, then reduce each
    validation report (any file in the working directory whose name contains
    'val') to a '.err' file holding only its ERROR lines.

    Failures of either step are logged (with a stack trace) and do not abort
    the remaining files.
    """
    # -p: source directory of submission files, -t: template file, -V v: validate
    cmd = "%s/tbl2asn -p /Users/ap12/Documents/metahit_data/EMBLValidation -t /Users/ap12/Documents/metahit_data/EMBLValidation/template -V v" % os.path.realpath(os.path.dirname(__file__))
    try:
        util.runProcess(cmd)
    except Exception as e:  # 'as' form: valid on Python 2.6+ and 3.x
        log.error(traceback.extract_stack())
        log.error(e)
    # extract errors only
    for filename in os.listdir('.'):  # renamed from 'file' (shadowed builtin)
        if 'val' in filename:
            errfile = "%s.err" % filename.split(".")[0]
            grep_cmd = "grep ERROR %s > %s" % (filename, errfile)
            try:
                util.runProcess(grep_cmd)
            except Exception as e:
                log.error(traceback.extract_stack())
                log.error(e)

### ---------------------------------------------------------------------------
def doClean():
    """Cleanup hook for the validation pipeline; currently a no-op."""
    return None

### ---------------------------------------------------------------------------
def main():
    usage = "usage: %prog [Options]"
    parser = OptionParser(usage=usage)
    parser.add_option("-l", metavar="FILE", help="FILE containing the list of all organism common names and its associated information (common_name||organim_name||strain||locus_tag||genome_project_id||coverage)", action="store", type="string", dest="list")
    parser.add_option("--convert", help="Do convert embl file into tbl", action="store_true", dest="convert")