Exemple #1
0
def main():
    """
    Submit every organism named in the list file given with -l.

    Usable lines of the list file look like ``common_name||sequence_file``.
    Lines starting with '!' and lines without a '||' separator are ignored.
    doSubmit is called once per usable line.
    """
    usage = "usage: %prog [Options]"
    parser = OptionParser(usage=usage)
    parser.add_option("-l", metavar="FILE", help="FILE containing the list of all organism common names and its associated sequence file", action="store", type="string", dest="list")

    (options, args) = parser.parse_args()

    # Print help if no argument given
    if util.printHelp(options):
        parser.print_help()
        sys.exit()

    # Read organism common name and related fasta sequence file
    list_file = options.list
    util.checkFile(list_file)
    # The context manager closes the list file even when doSubmit raises
    with open(list_file, "r") as handle:
        for line in handle:
            if line.startswith('!') or '||' not in line:
                continue
            # ! common_name||sequence_file
            values = line.strip().split('||')
            common_name = values[0]
            input_file = values[1]
            # The sequence file path is handed over without an existence check
            doSubmit(common_name, input_file)
Exemple #2
0
def main():
    """
    Merge the scaffolds of every organism named in the list file given with -l.

    Usable lines of the list file look like
    ``common_name||organism_name||strain||locus_tag||fasta_file``.
    Lines starting with '!' and lines without a '||' separator are ignored.
    union() is called once per usable line.  Nothing is done when -l is not
    given.
    """
    usage = "usage: %prog [Options]"
    parser = OptionParser(usage=usage)
    parser.add_option("-l", metavar="FILE", help="FILE containing the list of all organism common names and its associated sequence file", action="store", type="string", dest="list")

    (options, args) = parser.parse_args()

    # Print help if no argument given
    if util.printHelp(options):
        parser.print_help()
        sys.exit()

    # Get and check input arguments
    if options.list:
        # Read organism common name and related fasta sequence file
        list_file = options.list
        util.checkFile(list_file)
        # The context manager closes the list file even when union() raises
        with open(list_file, "r") as handle:
            for line in handle:
                if line.startswith('!') or '||' not in line:
                    continue
                # ! common_name||organism_name||strain||locus_tag||fasta_file
                values = line.strip().split('||')
                print("Processing %s" % values[0])
                union(file=values[4], common_name=values[0], locus_tag=values[3], organism_name=values[1], strain=values[2])
Exemple #3
0
def convertToTab(result_file, common_name):
    """
    Convert tRNA scan results into an EMBL feature table file.

    Each non-blank line of result_file is split on whitespace.  Columns 3 and
    4 are the tRNA bounds, column 5 the tRNA type, column 6 the anticodon and
    column 9 the Cove score, e.g.

        name=Alistipes_shahii  1  89017  88933  Undet  ???  0  0  61.20

    A tRNA whose first bound is greater than its second is written as a
    complement() location.

    Returns the name of the written file, "<common_name>.trna.tab".
    Errors from util.checkFile (presumably util.UtilException) propagate to
    the caller unchanged.
    """
    tab_file = "%s.trna.tab" % common_name
    util.checkFile(result_file)
    # Nested 'with' blocks close both files even if a line fails to parse
    with open(result_file, 'r') as f_input:
        with open(tab_file, 'w') as f_output:
            for line in f_input:
                values = line.split()
                if not values:
                    # blank line: nothing to convert
                    continue
                start = int(values[2])
                end = int(values[3])
                trna_type = values[4]
                anti_codon = values[5]
                score = values[8]

                if start <= end:
                    location = "%s..%s" % (start, end)
                else:
                    location = "complement(%s..%s)" % (end, start)

                # Feature key starts in column 6 and is padded so that the
                # location starts in column 22, like the qualifier lines below
                f_output.write("FT   %-16s%s\n" % ("tRNA", location))
                f_output.write("FT                   /note=\"tRNA-%s(%s) Cove Score %s\"\n" % (trna_type, anti_codon, score))
                f_output.write("FT                   /colour=4\n")
                f_output.write("FT                   /method=\"tRNAscan-SE\"\n")
    return tab_file
Exemple #4
0
def infoseq(file):
    """
    Run EMBOSS infoseq on a sequence file.

    Only the sequence lengths are requested, without heading, and they are
    written to "<file>.infoseq".
    """
    util.checkFile(file)
    util.checkSoft("infoseq")
    infoseq_template = "infoseq -only -length -noheading %s -outfile %s.infoseq"
    util.runProcess(infoseq_template % (file, file))
Exemple #5
0
def main():
    """
    Load the EMBL file of every organism named in the list file into the database.

    Usable lines of the list file look like ``common_name||taxon_id||filename``.
    Lines starting with '!' and lines without a '||' separator are ignored.
    For each usable line, the file is checked and ``chado_load embl`` is run
    with the connection settings read through ropy.util.getDArg.
    """
    usage = "usage: %prog [Options]"
    parser = OptionParser(usage=usage)
    parser.add_option("-l", "--list", metavar="FILE", help="FILE containing the list of all organism common names and its associated file to load", action="store", type="string", dest="list")
    parser.add_option("-D", action="store", dest="dbhost")

    (options, args) = parser.parse_args()

    # Print help if no argument given
    if util.printHelp(options):
        parser.print_help()
        sys.exit()

    # Print command line (every argument is followed by a single space)
    logger.info("$ python " + "".join(arg + " " for arg in sys.argv))

    # Print logger file info
    logger.info(logsetup.logpath)

    # Setup database connection
    host = ropy.util.getDArg("dbhost", raiseOnEmpty = True)
    database = ropy.util.getDArg("dbname", raiseOnEmpty = True)
    port = ropy.util.getDArg("dbport", raiseOnEmpty = True)
    user = ropy.util.getDArg("dbuser", raiseOnEmpty = True)
    # dbpassword is not read here; the loader command below does not take it

    # Check if chado_load is installed
    util.isSoftInstalled("chado_load")

    # Read organism common name and load related embl file into the database
    data_path = options.list
    # The context manager closes the list file even when a load fails
    with open(data_path, "r") as handle:
        for line in handle:
            if line.startswith('!') or '||' not in line:
                continue
            # ! common_name||taxon_id||filename
            fields = line.strip().split('||')
            common_name = fields[0]
            filename = fields[2]
            util.checkFile(filename)
            # Loader command
            cmd = "chado_load embl -o %s -t contig -D %s:%s/%s -U %s %s" % (common_name, host, port, database, user, filename)
            # Run command
            util.runProcess(cmd)
Exemple #6
0
def main():
    """
    Call doSubmit for every organism named in the list file given with -l.

    Usable lines of the list file look like
    ``organism_name||strain||locus_tag||seq_size||seq_depth||dna_source||description``.
    Lines starting with '!' and lines without a '||' separator are ignored.
    The dna_source column is a short code (GHP, INRA, DSMZ or NCTC) that is
    expanded to a full description; lines with an unknown code are reported
    and skipped.  The --submit flag is passed through to doSubmit.
    """
    usage = "usage: %prog [Options]"
    parser = OptionParser(usage=usage)
    parser.add_option("-l", metavar="FILE", help="FILE containing the list of all organism common names, its associated information", action="store", type="string", dest="list")
    parser.add_option("--submit", help="To submit data, not only checking locus_tag", action="store_true", dest="submit")

    (options, args) = parser.parse_args()

    # Print help if no argument given
    if util.printHelp(options):
        parser.print_help()
        sys.exit()

    # Short DNA source code -> full description handed to doSubmit
    dna_sources = {
        'GHP': 'Gut Health Programme, Rowett Institute of Nutrition and Health, University of Aberdeen. http://www.rowett.ac.uk/divisions/ghp/',
        'INRA': 'INRA Clermont-Ferrand-Theix. http://www.clermont.inra.fr/',
        'DSMZ': 'Deutsche Sammlung von Mikroorganismen und Zellkulturen. GmbH http://www.dsmz.de/',
        'NCTC': "Health Protection Agency's National Collection of Type Cultures. http://www.hpacultures.org.uk/",
    }

    # Get and check input arguments
    # Read organism common name and related fasta sequence file
    list_file = options.list
    util.checkFile(list_file)
    # The context manager closes the list file even when doSubmit raises
    with open(list_file, "r") as handle:
        for line in handle:
            if line.startswith('!') or '||' not in line:
                continue
            # ! organism_name||strain||locus_tag||seq_size||seq_depth||dna_source||description
            values = line.strip().split('||')
            organism_name = values[0]
            strain = values[1]
            locus_tag = values[2]
            seq_size = values[3]
            seq_depth = values[4]
            source_code = values[5]
            if source_code not in dna_sources:
                print("DNA source %s not found! Please provide details..." % source_code)
                continue
            dna_source = dna_sources[source_code]

            description = values[6]
            doSubmit(organism_name=organism_name, strain=strain, locus_tag=locus_tag,
                     seq_size=seq_size, seq_depth=seq_depth, dna_source=dna_source,
                     description=description, submit=options.submit)
Exemple #7
0
def doRun():
    """
    Run the iterated Glimmer3 script on one organism and convert its output.

    The organism common name (-o) and the FASTA input (-i) come from the
    command line.  When "<common_name>.predict" exists after the run, it is
    converted with convertToTab and the intermediate Glimmer3 files are
    removed; otherwise the missing file is logged.

    Any exception is logged and then re-raised with its original traceback.
    """
    usage = "usage: %prog [Options]"
    parser = OptionParser(usage=usage)
    parser.add_option("-o", metavar="NAME", help="organism common name", action="store", type="string", dest="name")
    parser.add_option("-i", metavar="FILE", help="input organism sequence file in FASTA format", action="store", type="string", dest="input")
    (options, args) = parser.parse_args()

    # Suffixes of the intermediate files removed after a successful run
    temporary_suffixes = (".longorfs", ".train", ".icm", ".run1.detail", ".run1.predict",
                          ".coords", ".upstream", ".motif", ".detail")

    try:
        common_name = options.name
        input_file = checkValidInput(options.input, common_name)

        # Print info
        log.info("Running Glimmer3 on %s\n" % common_name)
        log.info("Getting sequence from %s\n" % input_file)

        # Run glimmer3 iterated
        script = "/software/pathogen/external/applications/glimmer/glimmer/scripts/g3-iterated.csh"
        util.checkFile(script)
        cmd = "%s %s %s" % (script, input_file, common_name)
        util.runProcess(cmd)

        # Run the conversion only if g3 successful
        g3_predict_file = "%s.predict" % common_name
        if os.path.exists(g3_predict_file):
            # Convert output results into a feature table EMBL file.
            g3_tab = convertToTab(g3_predict_file, common_name)

            # Tidy up
            for suffix in temporary_suffixes:
                util.rmFile(common_name + suffix)
            util.rmFile(g3_predict_file)

            log.info("%s is the final feature table Glimmer3 predictions\n" % g3_tab)
        else:
            log.info("%s file does not exists\n" % g3_predict_file)
    except Exception as e:
        log.error(e)
        # Bare raise keeps the traceback of the original failure
        raise
Exemple #8
0
def union(file, common_name, locus_tag, organism_name, strain):
    """
    Merge the scaffolds of a FASTA file into a single sequence file.

    The lines containing '>' are counted with grep.  When there is at most
    one, the input path is returned unchanged.  Otherwise EMBOSS union and
    descseq write "<common_name>.fsa", named after the locus tag, organism
    and strain, and that new path is returned.
    """
    util.checkFile(file)
    header_count = int(util.runProcess("grep '>' %s | wc -l" % file))
    if header_count <= 1:
        # Nothing to merge
        return file

    new_file = "%s.fsa" % common_name
    util.checkSoft("union")
    util.checkSoft("descseq")
    name = "%s [organism=%s] [strain=%s] [gcode=11]" % (locus_tag, organism_name, strain)
    union_template = "union -sequence %s -stdout Yes -auto Yes | descseq -filter Yes -name '%s' -auto Yes > %s"
    util.runProcess(union_template % (file, name, new_file))
    return new_file
Exemple #9
0
def checkValidInput(input_file, common_name):
    """
    Return a FASTA file holding a single sequence for the given input.

    RAST re-arranges the scaffolds when a split sequence is submitted, so
    EMBOSS union is run first when more than one line containing '>' is
    found.  The merged sequence is written to "<common_name>.fna" and that
    path is returned; otherwise the input path is returned unchanged.
    """
    util.checkFile(input_file)
    header_count = int(util.runProcess("grep '>' %s | wc -l" % input_file))
    if header_count <= 1:
        # Nothing to merge
        return input_file

    new_input_file = "%s.fna" % common_name
    util.checkSoft("union")
    util.checkSoft("descseq")
    union_template = "union -sequence %s -stdout Yes -auto Yes | descseq -filter Yes -name '%s' -auto Yes > %s"
    util.runProcess(union_template % (input_file, common_name, new_input_file))
    return new_input_file
Exemple #10
0
def main():
    """
    Submit sequences, or fetch the results of earlier jobs.

    With -l, every usable line of the list file is processed.  Lines look
    like ``common_name||sequence_file`` with a third ``job_id`` column that is
    read only with --fetch.  Lines starting with '!' and lines without a '||'
    separator are ignored.

    Without -l, a single organism is processed from -o and either -i
    (submit) or -j (fetch).
    """
    usage = "usage: %prog [Options]"
    parser = OptionParser(usage=usage)
    parser.add_option("-o", metavar="NAME", help="organism common name", action="store", type="string", dest="name")
    parser.add_option("-i", metavar="FILE", help="input organism sequence file in FASTA format", action="store", type="string", dest="input")
    parser.add_option("-j", metavar="ID", help="input job ID to fetch results", action="store", type="string", dest="jobid")
    parser.add_option("-l", metavar="FILE", help="FILE containing the list of all organism common names and its associated sequence file", action="store", type="string", dest="list")
    parser.add_option("--fetch", help="To fetch results, job id must be provided", action="store_true", dest="fetch")

    (options, args) = parser.parse_args()

    # Print help if no argument given
    if util.printHelp(options):
        parser.print_help()
        sys.exit()

    # Get and check input arguments
    if options.list:
        # Read organism common name and related fasta sequence file
        list_file = options.list
        util.checkFile(list_file)
        # The context manager closes the list file even when a job fails
        with open(list_file, "r") as handle:
            for line in handle:
                if line.startswith('!') or '||' not in line:
                    continue
                # ! common_name||sequence_file[||job_id]
                values = line.strip().split('||')
                common_name = values[0]
                if options.fetch:
                    job_id = values[2]
                    doFetch(common_name, job_id)
                else:
                    input_file = checkValidInput(values[1], common_name)
                    doSubmit(common_name, input_file)
    else:
        common_name = options.name
        if options.fetch:
            job_id = options.jobid
            doFetch(common_name, job_id)
        else:
            input_file = checkValidInput(options.input, common_name)
            doSubmit(common_name, input_file)
Exemple #11
0
def convertToTab(result_file, common_name):
    """
    Convert CDS lines into an EMBL feature table file.

    Each non-blank line of result_file is split on whitespace and its second
    column is taken as the feature location, e.g.

        CDS             complement(14682..18617)

    Returns the name of the written file, "<common_name>.prodigal.tab".
    Errors from util.checkFile (presumably util.UtilException) propagate to
    the caller unchanged.
    """
    tab_file = "%s.prodigal.tab" % common_name
    util.checkFile(result_file)
    # Nested 'with' blocks close both files even if a line fails to parse
    with open(result_file, "r") as f_input:
        with open(tab_file, "w") as f_output:
            for line in f_input:
                values = line.split()
                if not values:
                    # blank line: nothing to convert
                    continue
                location = values[1]

                f_output.write("FT   CDS             %s\n" % location)
                f_output.write("FT                   /colour=4\n")
                f_output.write('FT                   /method="PRODIGAL"\n')
    return tab_file
Exemple #12
0
def checkValidInput(input_file, common_name):
    """
    Return a FASTA file holding a single sequence for the given input.

    Glimmer segfaults on a FASTA file with split sequences, so EMBOSS union
    is run first when more than one line containing '>' is found.  The merged
    sequence is written to "<common_name>.fna" and that path is returned;
    otherwise the input path is returned unchanged.

    Errors raised by the util helpers (presumably util.UtilException)
    propagate to the caller with their original traceback.
    """
    util.checkFile(input_file)
    cmd = "grep '>' %s | wc -l" % input_file
    result = util.runProcess(cmd)
    if int(result) > 1:
        new_input_file = "%s.fna" % common_name
        util.checkSoft("union")
        util.checkSoft("descseq")
        cmd_union = "union -sequence %s -stdout Yes -auto Yes | descseq -filter Yes -name '%s' -auto Yes > %s" % (input_file, common_name, new_input_file)
        util.runProcess(cmd_union)
        return new_input_file
    return input_file
Exemple #13
0
def main():
    """
    Convert the EMBL file of every organism named in the list file into tbl.

    Only lines with exactly six '||' separators are used:
    ``common_name||organim_name||strain||locus_tag||genome_project_id||coverage||source``.
    Lines starting with '!' are ignored.  For each usable line
    "../IMG/<common_name>.4dep.embl" is checked and, with --convert,
    converted into "<common_name>.tbl" by doConvert.  A failed conversion is
    logged with its traceback and the next line is processed.
    """
    usage = "usage: %prog [Options]"
    parser = OptionParser(usage=usage)
    parser.add_option("-l", metavar="FILE", help="FILE containing the list of all organism common names and its associated information (common_name||organim_name||strain||locus_tag||genome_project_id||coverage||source)", action="store", type="string", dest="list")
    parser.add_option("--convert", help="Do convert embl file into tbl", action="store_true", dest="convert")

    (options, args) = parser.parse_args()

    # Print help if no argument given
    if util.printHelp(options):
        parser.print_help()
        sys.exit()

    # Get and check input arguments
    if options.list:
        # Read organism common name and related locus tag
        list_file = options.list
        util.checkFile(list_file)
        # The context manager closes the list file even when a check fails
        with open(list_file, "r") as handle:
            for line in handle:
                if line.startswith('!'):
                    continue
                if line.count('||') != 6:
                    continue
                # ! common_name||organim_name||strain||locus_tag||genome_project_id||coverage||source
                values = line.strip().split('||')
                common_name = values[0]
                locus_tag = values[3]

                embl_file = "../IMG/%s.4dep.embl" % common_name
                util.checkFile(embl_file)
                tbl_file = "%s.tbl" % common_name
                log.info("Convert file %s into %s" % (embl_file, tbl_file))
                if options.convert:
                    try:
                        doConvert(embl_file, tbl_file, locus_tag)
                    except Exception as e:
                        log.error("Converting %s" % embl_file)
                        # format_exc() reports where the exception was raised;
                        # extract_stack() would only describe this handler
                        log.error(traceback.format_exc())
                        log.error(e)
Exemple #14
0
def main():
    """
    Convert the annotation file of every organism named in the list file into tbl.

    Usable lines of the list file look like
    ``common_name||organism_name||strain||locus_tag||fasta_file``.
    Lines starting with '!' and lines without a '||' separator are ignored.
    For each usable line "<common_name>.img.embl" is checked and, with
    --convert, converted into "<common_name>.tbl" by doConvert.  A failed
    conversion is printed and the next line is processed.
    """
    usage = "usage: %prog [Options]"
    parser = OptionParser(usage=usage)
    parser.add_option("-l", metavar="FILE", help="FILE containing the list of all organism common names and its associated locus tag", action="store", type="string", dest="list")
    parser.add_option("--convert", help="Do convert genbank file into embl", action="store_true", dest="convert")

    (options, args) = parser.parse_args()

    # Print help if no argument given
    if util.printHelp(options):
        parser.print_help()
        sys.exit()

    # Get and check input arguments
    if options.list:
        # Read organism common name and related locus tag
        list_file = options.list
        util.checkFile(list_file)
        # The context manager closes the list file even when a check fails
        with open(list_file, "r") as handle:
            for line in handle:
                if line.startswith('!') or '||' not in line:
                    continue
                # ! common_name||organism_name||strain||locus_tag||fasta_file
                values = line.strip().split('||')
                common_name = values[0]
                locus_tag = values[3]

                gbk_file = "%s.img.embl" % common_name
                util.checkFile(gbk_file)
                tbl_file = "%s.tbl" % common_name
                print("Convert file %s into %s" % (gbk_file, tbl_file))
                if options.convert:
                    try:
                        doConvert(gbk_file, tbl_file, locus_tag)
                    except Exception as e:
                        print("ERROR to convert %s" % gbk_file)
                        print(e)
Exemple #15
0
def convertToTab(g3_predict_file, common_name):
    """
    Convert a Glimmer3 predict file into an EMBL feature table file.

    Lines starting with '>' and blank lines are skipped.  Every other line is
    split on whitespace into ``id start end direction score``.  Only the
    first character of the direction is used: '+' gives a plain location,
    anything else a complement() location.

    Predictions whose coordinates contradict their strand ('+' with
    start > end, or '-' with start < end) are not written.

    Returns the name of the written file, "<common_name>.g3.tab".
    Errors from util.checkFile (presumably util.UtilException) propagate to
    the caller unchanged.
    """
    g3_tab = "%s.g3.tab" % common_name
    util.checkFile(g3_predict_file)
    # Nested 'with' blocks close both files even if a line fails to parse
    with open(g3_predict_file, 'r') as f_input:
        with open(g3_tab, 'w') as f_output:
            for line in f_input:
                if line[0] == '>':
                    continue
                # id    start    end    direction    score
                values = line.split()
                if not values:
                    # blank line: nothing to convert
                    continue
                gene_id = values[0]
                start = int(values[1])
                end = int(values[2])
                strand = values[3][0]
                score = values[4]

                # Coordinates that run against the strand are left out
                # (presumably genes wrapping around the sequence end - TODO confirm)
                if (strand == '+' and start > end) or (strand == '-' and start < end):
                    continue

                if strand == "+":
                    location = "%s..%s" % (start, end)
                else:
                    location = "complement(%s..%s)" % (end, start)

                f_output.write("FT   CDS             %s\n" % location)
                f_output.write("FT                   /note=\"Raw score %s\"\n" % score)
                f_output.write("FT                   /label=%s\n" % gene_id)
                f_output.write("FT                   /colour=4\n")
                f_output.write("FT                   /method=\"GLIMMER\"\n")
    return g3_tab
Exemple #16
0
def main():
    """
    Call doSubmit for one organism, or for every organism of a list file.

    With -l, usable lines of the list file look like
    ``common_name||sequence_file||project_id``.  Lines starting with '!' and
    lines without a '||' separator are ignored.  Without -l, the organism is
    described by -o, -i and -p.  The sequence file is checked before each
    submission.
    """
    usage = "usage: %prog [Options]"
    parser = OptionParser(usage=usage)
    parser.add_option("-o", metavar="NAME", help="organism common name", action="store", type="string", dest="name")
    parser.add_option("-i", metavar="FILE", help="input organism sequence file in FASTA format", action="store", type="string", dest="input")
    parser.add_option("-p", metavar="ID", help="IMG project ID (GOLD Stamp ID)", action="store", type="string", dest="id")
    parser.add_option("-l", metavar="FILE", help="FILE containing the list of all organism common names, its associated sequence file and IMG project ID", action="store", type="string", dest="list")

    (options, args) = parser.parse_args()

    # Print help if no argument given
    if util.printHelp(options):
        parser.print_help()
        sys.exit()

    # Get and check input arguments
    if options.list:
        # Read organism common name and related fasta sequence file
        list_file = options.list
        util.checkFile(list_file)
        # The context manager closes the list file even when a submission fails
        with open(list_file, "r") as handle:
            for line in handle:
                if line.startswith('!') or '||' not in line:
                    continue
                # ! common_name||sequence_file||project_id
                values = line.strip().split('||')
                common_name = values[0]
                input_file = values[1]
                project_id = values[2]
                util.checkFile(input_file)
                doSubmit(common_name, input_file, project_id)
    else:
        common_name = options.name
        input_file = options.input
        project_id = options.id
        util.checkFile(input_file)
        doSubmit(common_name, input_file, project_id)
def _sequence_index_row(fastq, run, study, strain, insert_size, layout, mate):
    """
    Build one tab-separated, newline-terminated sequence.index line (25 columns).

    The second column is the literal placeholder 'md5'; main() overwrites it
    with the real checksum when run with --md5.
    """
    empty = 'n/a'
    columns = (fastq, 'md5', run, study, study, 'SC', empty, empty, strain, strain, strain, empty, '454',
               empty, run, empty, empty, insert_size, layout, mate, '0', empty, empty, '0', '0')
    return "\t".join(columns) + "\n"


def main():
    """
    Generate index files, and optionally fastq files, for a list of 454 runs.

    Every line of the list file (-l) that does not start with '!' must hold
    exactly nine '||'-separated fields; otherwise the error is logged and the
    program exits.  The fields are used as: study prefix, genus, species,
    strain, run, paired flag (unused), insert size, sff path and assembly
    directory (the last two relative to /nfs).

    For each run, sequence.index, samples.info and assembly.index entries are
    written under "<outpath>/metadata".  With --fastq, and when IS_LSF is set
    and the gzipped fastq files are missing, a job that converts the sff file
    is submitted.  With --md5, sequence.index is rewritten with the md5sum of
    every fastq file that exists; entries whose fastq file is missing are
    logged and dropped.
    """
    usage = "usage: %prog [Options]"
    parser = OptionParser(usage=usage)
    parser.add_option("-l", metavar="FILE", help="FILE containing the list of all 454 runs and its associated information", action="store", type="string", dest="list")
    parser.add_option("-o", "--outpath", metavar="PATH", help="PATH where to generate indexes and temporary fastq files", action="store", type="string", dest="outpath")
    parser.add_option("--fastq", help="Do generate fastq files.", action="store_true", dest="fastq")
    parser.add_option("--md5", help="Do run md5sum on generated fastq files.", action="store_true", dest="md5")

    (options, args) = parser.parse_args()

    # Both options are used unconditionally below, so either one missing
    # must show the help
    if not (options.list and options.outpath):
        parser.print_help()
        sys.exit()

    # input file
    input_file = options.list
    util.checkFile(input_file)
    with open(input_file, "r") as handle:
        input_lines = handle.readlines()

    # output path
    output_path = options.outpath
    util.checkDir(output_path)
    out_metadata = "%s/metadata" % output_path
    util.checkDir(out_metadata)
    out_fastq = "%s/fastq" % output_path
    util.checkDir(out_fastq)

    # checking file format first before processing it
    lines = []
    for line in input_lines:
        if line[0] == '!':
            continue
        elif line.count('||') != 8:
            log.error("line is not well formated. Please check your input file.")
            log.error(line.count('||'))
            log.error(line)
            sys.exit()
        else:
            lines.append(line)
    log.debug(lines)

    # opening output files
    sequence_index_filename = '%s/sequence.index' % out_metadata
    sequence_index = open(sequence_index_filename, 'w')
    samples_info = open('%s/samples.info' % out_metadata, 'w')
    assembly_index = open('%s/assembly.index' % out_metadata, 'w')

    # processing input file; the output files are closed even if a run fails
    try:
        for line in lines:
            line = line.strip()
            values = line.split('||')
            log.info(line)
            genus = values[1]
            species = values[2]
            strain = values[3]
            organism_name = "%s %s %s" % (genus, species, strain)
            run = values[4]
            insert_size = values[6]
            sff_file = "/nfs/%s" % values[7]
            trim_status = "/nfs/%s/454TrimStatus.txt" % values[8]
            contigs_file = "/nfs/%s/454LargeContigs.fna" % values[8]
            scaffolds_file = "/nfs/%s/454Scaffolds.fna" % values[8]

            species_strain = sub('[^a-zA-Z0-9_]', '_', "%s_%s" % (species, strain)).replace('__', '_')
            study = "%s_%s%s" % (values[0], genus[0], species_strain)

            # check that project name (study) is less than 40 char
            # mysql> desc project;
            # | name           | varchar(40)           | NO   | MUL |         |                |
            # | hierarchy_name | varchar(40)           | NO   | MUL |         |                |
            if len(study) > 40:
                log.warning("Project name %s has more than 40 char." % study)

            # checking files
            util.checkFile(sff_file)
            util.checkFile(trim_status)
            util.checkFile(contigs_file)
            util.checkFile(scaffolds_file)

            # convert sff into fastq
            outprefix = "%s/%s" % (out_fastq, run)
            cmd_sff2fastq = "/nfs/users/nfs_m/mh12/svn-repository/pathogen/user/mh12/python/454TrimStatus2reads.py --pair_suffix=/1,/2 --sff %s %s %s" % (sff_file, trim_status, outprefix)
            fastq_pairs = "%s-pairs.fastq" % outprefix
            fastq_single = "%s-single.fastq" % outprefix

            # split fastq pairs file
            fastq_1 = "%s_1.fastq" % outprefix
            fastq_2 = "%s_2.fastq" % outprefix
            cmd_splitfastq = "/nfs/users/nfs_m/mh12/svn-repository/pathogen/user/mh12/python/fastn_unshuffle.py %s %s %s" % (fastq_pairs, fastq_1, fastq_2)

            # rename fastq single file
            fastq_0 = "%s.fastq" % outprefix
            cmd_rename = "mv %s %s" % (fastq_single, fastq_0)

            # tidy-up
            cmd_remove = "rm %s-info.txt; rm %s-pairs.fastq" % (outprefix, outprefix)

            # gzip fastq files
            cmd_gzip = "gzip %s; gzip %s; gzip %s" % (fastq_1, fastq_2, fastq_0)

            # all commands
            cmd = "%s; %s; %s; %s; %s" % (cmd_sff2fastq, cmd_splitfastq, cmd_rename, cmd_remove, cmd_gzip)

            fastq_1_gz = "%s.gz" % fastq_1
            fastq_2_gz = "%s.gz" % fastq_2
            fastq_0_gz = "%s.gz" % fastq_0

            if IS_LSF:
                if not (os.path.exists(fastq_1_gz) and os.path.exists(fastq_2_gz) and os.path.exists(fastq_0_gz)):
                    if options.fastq:
                        util.submitJob(jobname='sff2fastq_%s' % run, cmd=cmd, outdir=out_metadata)
                    else:
                        log.info("fastq files do not exist, use '--fastq' to generate them.")
                else:
                    log.info("fastq files already exist.")
            else:
                log.info("Need to be run on LSF.")

            empty = 'n/a'

            # write to output files
            # sequence.index: fastq_file|md5|run_id|study_id|(study_name)|center_name|(submission_id)|(submission_date)|sample_id|sample_name|
            #                 (population)|(experiment_id)|instrument_platform|(instrument_model)|library_name|(run_name)|(run_block_name)|
            #                 insert_size|(library_layout)|paired_fastq|withdrawn|(withdrawn_date)|(comment)|read_count|base_count
            # Three rows per run: both mates of the pair, then the single reads
            sequence_index.write(_sequence_index_row(fastq_1_gz, run, study, strain, insert_size, 'PAIRED', fastq_2_gz))
            sequence_index.write(_sequence_index_row(fastq_2_gz, run, study, strain, insert_size, 'PAIRED', fastq_1_gz))
            sequence_index.write(_sequence_index_row(fastq_0_gz, run, study, strain, insert_size, 'SINGLE', ''))

            # samples.info: lookup_name|acc|individual_name|alias|population_name|species_name|taxon_id|sex
            samples_info.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (strain, strain, strain, strain, strain, organism_name, empty, empty))

            # assembly.index: genus||species_strain||assembly_id||contig||scaffold||fastq
            # /pyrodata01/assemblies/Ruminococcus/obeum/A2162/P_2009_07_12_22_16_23_runAssembly/454TrimStatus.txt
            # The assembly id is the first 10 characters after '/P_' in the path
            assembly_id = "newbler_%s" % trim_status.split('/P_')[1][:10]
            assembly_index.write("%s||%s||%s||%s||%s||%s||%s\n" % (study, genus, species_strain, assembly_id, contigs_file, scaffolds_file, run))
    finally:
        # close files
        sequence_index.close()
        samples_info.close()
        assembly_index.close()

    if not options.fastq:
        log.info("Use '--fastq' for generating fastq files")

    if options.md5:
        # calculate md5 and modify sequence.index
        util.checkFile(sequence_index_filename)
        with open(sequence_index_filename, "r") as handle:
            seq_lines = handle.readlines()
        sequence_index = open(sequence_index_filename, 'w')
        try:
            for line in seq_lines:
                values = line.split('\t')
                fastq = values[0]
                if os.path.exists(fastq):
                    md5 = util.runProcess("md5sum %s | cut -d ' ' -f 1" % fastq).strip()
                    # Only the checksum column is replaced, so a path or name
                    # that happens to contain 'md5' is left intact
                    values[1] = md5
                    sequence_index.write('\t'.join(values))
                else:
                    log.info("fastq file %s does not exist, use '--fastq' for generating it." % fastq)
        finally:
            # close file
            sequence_index.close()
    else:
        log.info("When all submitted jobs end, use '--md5' for updating sequence.index with md5sum.")
Exemple #18
0
def _all_exist(*paths):
    """Return True only when every path in *paths* exists (stops at the first miss)."""
    return all(os.path.exists(path) for path in paths)


def _md5sum(path):
    """Return the stripped output of ``md5sum <path> | cut -d ' ' -f 1``."""
    return util.runProcess("md5sum %s | cut -d ' ' -f 1" % path).strip()


def _sequence_index_line(fastq, md5, run, study, sample, strain, library,
                         insert_size, layout, mate_fastq):
    """Format one tab-separated, newline-terminated sequence.index record.

    Column order:
    fastq_file|md5|run_id|study_id|(study_name)|center_name|(submission_id)|(submission_date)|sample_id|sample_name|
    (population)|(experiment_id)|instrument_platform|(instrument_model)|library_name|(run_name)|(run_block_name)|
    insert_size|(library_layout)|paired_fastq|withdrawn|(withdrawn_date)|(comment)|read_count|base_count
    """
    empty = 'n/a'
    fields = (fastq, md5, run, study, study, 'SC', empty, empty, sample, sample, strain, empty, '454',
              empty, library, empty, empty, insert_size, layout, mate_fastq, '0', empty, empty, '0', '0')
    # one "%s" per column keeps the formatting identical to a hand-written format string
    return ("\t".join(["%s"] * len(fields)) + "\n") % fields


def _index_run(line, options, out_fastq, sequence_index, samples_info, assembly_index):
    """Process one '||'-separated project record and append its index entries.

    Field positions used: 0 project prefix, 1 genus, 2 species, 3 strain,
    4 sample, 5 library, 6 run, 7 paired flag, 8 insert size, 9 sff path,
    10 assembly directory (paths 9 and 10 are relative to /nfs).
    Records whose paired flag is not '1' are reported and skipped.
    """
    values = line.strip().split('||')
    genus = values[1]
    species = values[2]
    strain = values[3]
    organism_name = "%s %s %s" % (genus, species, strain)

    # sample and library fall back to strain and run when recorded as 'None'
    sample = values[4]
    if sample == 'None':
        sample = strain
    library = values[5]
    run = values[6]
    if library == 'None':
        library = run

    if values[7] != '1':
        log.error("Single read set for %s. Not implemented." % run)
        return

    insert_size = values[8]
    sff_file = "/nfs/%s" % values[9]
    trim_status = "/nfs/%s/454TrimStatus.txt" % values[10]
    contigs_file = "/nfs/%s/454LargeContigs.fna" % values[10]
    scaffolds_file = "/nfs/%s/454Scaffolds.fna" % values[10]

    # anything outside [a-zA-Z0-9_] becomes '_' so the name is safe in paths
    species_strain = sub('[^a-zA-Z0-9_]', '_', "%s_%s" % (species, strain)).replace('__', '_')
    study = "%s_%s%s" % (values[0], genus[0], species_strain)

    empty = 'n/a'
    fastq_1_gz = "%s/%s_1.fastq.gz" % (out_fastq, run)
    fastq_2_gz = "%s/%s_2.fastq.gz" % (out_fastq, run)
    fastq_0_gz = "%s/%s.fastq.gz" % (out_fastq, run)

    # check that project name (study) is less than 40 char
    # mysql> desc project;
    # | name           | varchar(40)           | NO   | MUL |         |                |
    # | hierarchy_name | varchar(40)           | NO   | MUL |         |                |
    if len(study) > 40:
        log.warning("Project name %s has more than 40 char." % study)

    # checking files
    util.checkFile(sff_file)
    util.checkFile(trim_status)
    util.checkFile(contigs_file)
    util.checkFile(scaffolds_file)

    log.info("> checking fastq files: %s" % run)
    # get lane hierarchy path (run); the script output is "undefined" for unknown lanes
    run_path = util.runProcess("/software/bin/perl /nfs/users/nfs_a/ap12/genlibpy/genepy/pathtrack/get_lane_hierarchy_path.pl --lane=%s --db=%s" % (run, constants.DATABASE[options.category])).strip()

    # check if fastq files have been loaded in db and imported into the hierarchy
    imported = False
    if run_path != "undefined":
        log.info("  loaded in db.")
        fastq_path = "%s/%s/seq-pipelines/%s" % (options.root, options.category, run_path)
        util.checkDir(fastq_path)
        imported = _all_exist("%s/%s_1.fastq.gz" % (fastq_path, run),
                              "%s/%s_2.fastq.gz" % (fastq_path, run),
                              "%s/%s.fastq.gz" % (fastq_path, run))
        if imported:
            log.info("  already imported into hierarchy.")
        else:
            log.info("  not imported into hierarchy.")
    else:
        log.info("  not loaded in db.")

    # sequence/sample indexes are only needed while the fastq files still sit
    # in the tmp dir; once imported only the assembly index is (re)written
    do_generate_indexes = False
    if not imported:
        if _all_exist(fastq_1_gz, fastq_2_gz, fastq_0_gz):
            log.info("  generate indexes.")
            do_generate_indexes = True
        else:
            log.info("  not generated from sff files.")

    # generate sequence and sample indexes
    if do_generate_indexes:
        # calculate md5
        md5_1 = _md5sum(fastq_1_gz)
        md5_2 = _md5sum(fastq_2_gz)
        md5_0 = _md5sum(fastq_0_gz)

        # the two paired files reference each other; the third file is unpaired
        sequence_index.write(_sequence_index_line(fastq_1_gz, md5_1, run, study, sample, strain, library,
                                                  insert_size, 'PAIRED', fastq_2_gz))
        sequence_index.write(_sequence_index_line(fastq_2_gz, md5_2, run, study, sample, strain, library,
                                                  insert_size, 'PAIRED', fastq_1_gz))
        sequence_index.write(_sequence_index_line(fastq_0_gz, md5_0, run, study, sample, strain, library,
                                                  insert_size, 'SINGLE', ''))

        # samples.info: lookup_name|acc|individual_name|alias|population_name|species_name|taxon_id|sex
        samples_info.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (strain, strain, strain, strain, strain, organism_name, empty, empty))

    # generate assembly indexes
    if do_generate_indexes or imported:
        # assembly.index: genus||species_strain||assembly_id||contig||scaffold||fastq
        # /pyrodata01/assemblies/Ruminococcus/obeum/A2162/P_2009_07_12_22_16_23_runAssembly/454TrimStatus.txt
        # NOTE: raises IndexError when the assembly path has no '/P_' component
        assembly_id = "newbler_%s" % trim_status.split('/P_')[1][:10]
        assembly_index.write("%s||%s||%s||%s||%s||%s||%s\n" % (study, genus, species_strain, assembly_id, contigs_file, scaffolds_file, run))


def main():
    """Write sequence.index, samples.info and assembly.index for one category.

    Project records are fetched with gdocs.getValues from the
    '<Category>_454_Projects' document; the three index files are written
    under <root>/.tmp/metadata/<category>.
    """
    usage = "usage: %prog [Options]"
    parser = OptionParser(usage=usage)
    parser.add_option("-r", "--root", metavar="PATH", help="PATH to the root of the hierarchy", action="store", type="string", dest="root")
    parser.add_option("-c", "--category", metavar="CATEGORY", help="name of the category from %s" % constants.CATEGORY, action="store", choices=constants.CATEGORY, dest="category")

    (options, args) = parser.parse_args()

    if not (options.root and options.category):
        parser.print_help()
        sys.exit()

    # get the data from Google spreadsheet
    lines = gdocs.getValues(doc='%s_454_Projects' % options.category.title())

    # check output path
    util.checkDir(options.root)
    out_metadata = "%s/.tmp/metadata/%s" % (options.root, options.category)
    util.checkDir(out_metadata)
    out_fastq = "%s/.tmp/fastq/%s" % (options.root, options.category)
    util.checkDir(out_fastq)

    # open output files
    sequence_index = open('%s/sequence.index' % out_metadata, 'w')
    samples_info = open('%s/samples.info' % out_metadata, 'w')
    assembly_index = open('%s/assembly.index' % out_metadata, 'w')

    # process input data; the finally clause closes the files even when a
    # record raises or a helper exits the interpreter
    try:
        for line in lines:
            _index_run(line, options, out_fastq, sequence_index, samples_info, assembly_index)
    finally:
        # close files
        sequence_index.close()
        samples_info.close()
        assembly_index.close()
Exemple #19
0
def _ensure_hierarchy_dir(path):
    """Create *path* (including parents) when it is missing and log which case applied."""
    if not os.path.exists(path):
        os.makedirs(path)
        log.info("%s created" % path)
    else:
        log.info("%s path already exists" % path)


def _copy_into_hierarchy(source_file, hierarchy_file):
    """Copy *source_file* to *hierarchy_file* unless it is already there.

    In both cases the pair is then passed to has_same_md5 and an error is
    logged when that check fails.
    """
    util.checkFile(source_file)
    if not os.path.exists(hierarchy_file):
        util.runProcess("cp %s %s" % (source_file, hierarchy_file))
    else:
        log.info("%s file already exists" % hierarchy_file)
    if not has_same_md5(source_file, hierarchy_file):
        # the '%' operator is required here: without it the string literal is
        # *called* and raises TypeError instead of reporting the mismatch
        log.error("Copied file %s is not the same as original file %s" % (hierarchy_file, source_file))


def main():
    """Import assembly files (contigs and scaffolds) into the hierarchy.

    Reads the '||'-separated assembly list given with --assembly, one line
    per run: project||genus||species||assembly_id||contigs||scaffolds||run.
    For each line the files are copied under
    <root>/<category>/seq-pipelines/<genus>/<species>/ASSEMBLY/<assembly_id>,
    a symlink to the run's fastq directory is created, and one job per
    project is submitted to build the missing index and stats files.
    """
    usage = "usage: %prog [Options]"
    parser = OptionParser(usage=usage)
    parser.add_option("-a", "--assembly", metavar="FILE", help="FILE containing the list of all contigs and scaffolds to import", action="store", type="string", dest="assembly")
    parser.add_option("-r", "--root", metavar="PATH", help="PATH to the root of the hierarchy", action="store", type="string", dest="root")
    parser.add_option("-c", "--category", metavar="CATEGORY", help="name of the category from %s" % constants.CATEGORY, action="store", choices=constants.CATEGORY, dest="category")

    (options, args) = parser.parse_args()

    if not (options.assembly and options.root and options.category):
        parser.print_help()
        sys.exit()

    # check root path
    if not os.path.exists(options.root):
        log.error("%s path do not exist" % options.root)
        log.error("Create root path first, then run pipeline before importing assembly files.")
        sys.exit()

    # check input assembly file and read it - one line per run (lane)
    util.checkFile(options.assembly)
    assembly_handle = open(options.assembly, "r")
    try:
        assembly_lines = assembly_handle.readlines()
    finally:
        assembly_handle.close()

    # compare project name - could have more than one run per project
    previous_project = ""
    for line in assembly_lines:
        # skip comment lines and lines without exactly seven fields
        if line[0] == '!':
            continue
        if line.count('||') != 6:
            continue
        values = line.strip().split('||')
        project = values[0]
        genus = values[1]
        species = values[2]
        assembly_id = values[3]
        contig_file = values[4]
        scaffold_file = values[5]
        run = values[6]

        # get lane hierarchy path (run)
        run_path = util.runProcess("/software/bin/perl /nfs/users/nfs_a/ap12/genlibpy/genepy/pathtrack/get_lane_hierarchy_path.pl --lane=%s --db=%s" % (run, constants.DATABASE[options.category])).strip()
        fastq_file = "%s/%s/seq-pipelines/%s" % (options.root, options.category, run_path)

        # check if new project (consecutive lines of one project share a job)
        is_new_project = project != previous_project
        previous_project = project

        # check species path
        species_path = "%s/%s/seq-pipelines/%s/%s" % (options.root, options.category, genus, species)
        if not os.path.exists(species_path):
            log.error("%s path do not exist" % species_path)
            log.error("Run fastq import pipeline before importing assembly files.")
            continue

        # create assembly path
        assembly_path = "%s/ASSEMBLY" % species_path
        _ensure_hierarchy_dir(assembly_path)

        # create assembly_id path (newbler_2009_06_29)
        assembly_id_path = "%s/%s" % (assembly_path, assembly_id)
        _ensure_hierarchy_dir(assembly_id_path)

        # copy contigs file
        contig_file_hierarchy = "%s/LargeContigs.fna" % assembly_id_path
        _copy_into_hierarchy(contig_file, contig_file_hierarchy)

        # copy scaffolds file
        scaffold_file_hierarchy = "%s/Scaffolds.fna" % assembly_id_path
        _copy_into_hierarchy(scaffold_file, scaffold_file_hierarchy)

        # create fastqs path
        fastqs_path = "%s/fastqs" % assembly_id_path
        _ensure_hierarchy_dir(fastqs_path)

        # create symlinks to fastqs
        util.checkDir(fastq_file)
        symlink = "%s/%s" % (fastqs_path, run)
        if not os.path.exists(symlink):
            os.symlink(fastq_file, symlink)
            log.info("%s symlink created" % symlink)
        else:
            log.info("%s symlink already exists" % symlink)

        # run samtools faidx, refstats, bwa to generate extra files required for the QC pipeline
        cmd = ""
        if not os.path.exists("%s.fai" % scaffold_file_hierarchy):
            cmd = "samtools faidx %s; " % scaffold_file_hierarchy
        else:
            log.info("%s.fai already exists" % scaffold_file_hierarchy)

        if not os.path.exists("%s.refstats" % scaffold_file_hierarchy):
            cmd = cmd + "ref-stats -r %s > %s.refstats; " % (scaffold_file_hierarchy, scaffold_file_hierarchy)
        else:
            log.info("%s.refstats already exists" % scaffold_file_hierarchy)

        if not os.path.exists("%s.bwt" % scaffold_file_hierarchy):
            cmd = cmd + "bwa index %s; " % scaffold_file_hierarchy
        else:
            log.info("%s.bwt already exists" % scaffold_file_hierarchy)

        # run stats.py on contigs and scaffolds
        cmd_stats = "python /nfs/users/nfs_a/ap12/genlibpy/genepy/pathtrack/stats.py -f %s; "
        for fasta_in_hierarchy in (contig_file_hierarchy, scaffold_file_hierarchy):
            if not os.path.exists("%s.stats" % fasta_in_hierarchy):
                cmd = cmd + cmd_stats % fasta_in_hierarchy
            else:
                log.info("%s.stats already exists" % fasta_in_hierarchy)

        # submit all jobs - once per project, and only when something is missing
        if is_new_project and cmd != "":
            util.submitJob(jobname='stats_%s_%s' % (project, assembly_id), cmd=cmd, outdir=assembly_id_path)
Exemple #20
0
def main():
    """Generate one QC conf file per project plus the shared qc_pipeline.conf.

    Reads the '||'-separated assembly list given with --assembly, one line
    per run: project||genus||species||assembly_id||contigs||scaffolds||run.
    For the first line of each run of consecutive lines sharing a project
    name, the scaffold reference and its .fai/.refstats/.bwt companions are
    checked, <root>/conf/<category>/<project>_qc.conf is written from
    constants.QC_CONF_TEMPLATE, and an entry pointing at it is appended to
    <root>/conf/<category>/qc_pipeline.conf.
    """
    usage = "usage: %prog [Options]"
    parser = OptionParser(usage=usage)
    parser.add_option("-a", "--assembly", metavar="FILE", help="FILE containing the list of all contigs and scaffolds to import", action="store", type="string", dest="assembly")
    parser.add_option("-r", "--root", metavar="PATH", help="PATH to the root of the hierarchy", action="store", type="string", dest="root")
    parser.add_option("-c", "--category", metavar="CATEGORY", help="name of the category from %s" % constants.CATEGORY, action="store", choices=constants.CATEGORY, dest="category")

    (options, args) = parser.parse_args()

    if not (options.assembly and options.root and options.category):
        parser.print_help()
        sys.exit()

    # check root path
    if not os.path.exists(options.root):
        log.error("%s path do not exist" % options.root)
        log.error("Create root path first, then run pipeline before importing assembly files.")
        sys.exit()

    # check log directory exists
    out_log = "%s/log/%s" % (options.root, options.category)
    util.checkDir(out_log)

    # check input assembly file and read it - one line per run (lane).
    # This happens before qc_pipeline.conf is opened for writing so that a
    # missing or unreadable list cannot leave a truncated conf file behind.
    util.checkFile(options.assembly)
    assembly_handle = open(options.assembly, "r")
    try:
        assembly_lines = assembly_handle.readlines()
    finally:
        assembly_handle.close()

    # open qc_pipeline.conf
    pipeline_qc = open('%s/conf/%s/qc_pipeline.conf' % (options.root, options.category), 'w')
    try:
        # compare project name - could have more than one run per project
        previous_project = ""
        for line in assembly_lines:
            # skip comment lines and lines without exactly seven fields
            if line[0] == '!':
                continue
            if line.count('||') != 6:
                continue
            values = line.strip().split('||')
            project = values[0]
            genus = values[1]
            species = values[2]
            assembly_id = values[3]

            # only the first of consecutive lines for a project is processed
            if project == previous_project:
                continue

            # check if files are in place in the hierarchy
            species_path = "%s/%s/seq-pipelines/%s/%s" % (options.root, options.category, genus, species)
            assembly_path = "%s/ASSEMBLY" % species_path
            assembly_id_path = "%s/%s" % (assembly_path, assembly_id)
            scaffold_file_hierarchy = "%s/Scaffolds.fna" % assembly_id_path
            util.checkFile(scaffold_file_hierarchy)
            util.checkFile("%s.fai" % scaffold_file_hierarchy)
            util.checkFile("%s.refstats" % scaffold_file_hierarchy)
            util.checkFile("%s.bwt" % scaffold_file_hierarchy)

            # create one qc conf file specific per project; database
            # connection settings are taken from the VRTRACK_* environment
            qc_conf_filename = '%s/conf/%s/%s_qc.conf' % (options.root, options.category, project)
            qc_conf = open(qc_conf_filename, 'w')
            try:
                qc_conf.write(constants.QC_CONF_TEMPLATE % {'root':options.root,
                                                            'category':options.category,
                                                            'db':constants.DATABASE[options.category],
                                                            'db_host':os.getenv('VRTRACK_HOST'),
                                                            'db_port':os.getenv('VRTRACK_PORT'),
                                                            'db_rw_user':os.getenv('VRTRACK_RW_USER'),
                                                            'db_password':os.getenv('VRTRACK_PASSWORD'),
                                                            'project':project,
                                                            'ref':scaffold_file_hierarchy})
            finally:
                qc_conf.close()

            log.info("QC conf file %s has been generated." % qc_conf_filename)

            # update qc_pipeline.conf
            pipeline_qc.write("__VRTrack_QC__\t%s\n" % (qc_conf_filename))

            # update previous project name
            previous_project = project
    finally:
        pipeline_qc.close()