def main():
    """Submit every organism listed in the input file.

    The list file has one '||'-separated record per line
    (common_name||sequence_file); lines starting with '!' are comments.
    """
    parser = OptionParser(usage="usage: %prog [Options]")
    parser.add_option("-l", metavar="FILE",
                      help="FILE containing the list of all organism common names and its associated sequence file",
                      action="store", type="string", dest="list")
    (options, args) = parser.parse_args()

    # Print help if no argument given
    if util.printHelp(options):
        parser.print_help()
        sys.exit()

    # Read organism common name and related fasta sequence file
    list_file = options.list
    util.checkFile(list_file)
    for raw in open(list_file, "r"):
        # ! common_name||sequence_file
        if raw[0] == '!' or raw.count('||') < 1:
            continue
        fields = raw.strip().split('||')
        # NOTE(review): the existence check on the sequence file is
        # deliberately disabled here (other drivers in this file do call
        # util.checkFile on it) — confirm this is intentional.
        # util.checkFile(fields[1])
        doSubmit(fields[0], fields[1])
def main(): usage = "usage: %prog [Options]" parser = OptionParser(usage=usage) parser.add_option("-l", metavar="FILE", help="FILE containing the list of all organism common names and its associated sequence file", action="store", type="string", dest="list") (options, args) = parser.parse_args() # Print help if no argument given if util.printHelp(options): parser.print_help() sys.exit() # Get and check input arguments if options.list: # Read organism common name and related fasta sequence file list_file = options.list util.checkFile(list_file) for line in open(list_file, "r"): if line[0] == '!': continue if line.count('||') < 1: continue # ! common_name||organim_name||strain||locus_tag||fasta_file line = line.strip() values = line.split('||') print "Processing %s" % values[0] union(file=values[4], common_name=values[0], locus_tag=values[3], organism_name=values[1], strain=values[2])
def convertToTab(result_file, common_name):
    """Convert tRNAscan-SE output into an EMBL-style feature table.

    result_file -- raw tRNAscan-SE result file, one tRNA per line, e.g.
                   name=Alistipes_shahii 1 89017 88933 Undet ??? 0 0 61.20
    common_name -- organism common name used to build the output file name.
    Returns the name of the written file (<common_name>.trna.tab).
    Raises util.UtilException (from util.checkFile) if result_file is missing.
    """
    tab_file = "%s.trna.tab" % common_name
    util.checkFile(result_file)
    f_input = open(result_file, 'r')
    try:
        f_output = open(tab_file, 'w')
        try:
            for line in f_input:
                values = line.strip().split()
                start = int(values[2])
                end = int(values[3])
                trna_type = values[4]
                anti_codon = values[5]
                score = values[8]
                # reverse-strand tRNAs are reported with start > end
                if start <= end:
                    location = "%s..%s" % (start, end)
                else:
                    location = "complement(%s..%s)" % (end, start)
                # NOTE(review): feature-table column padding may have been
                # lost upstream of this copy — verify against an EMBL example.
                f_output.write("FT tRNA %s\n" % location)
                f_output.write("FT /note=\"tRNA-%s(%s) Cove Score %s\"\n" % (trna_type, anti_codon, score))
                f_output.write("FT /colour=4\n")
                f_output.write("FT /method=\"tRNAscan-SE\"\n")
        finally:
            # the original leaked both handles when a malformed line raised;
            # close unconditionally so a partial file is never left open
            f_output.close()
    finally:
        f_input.close()
    return tab_file
def infoseq(file):
    """Run EMBOSS infoseq to display basic information about sequences.

    Writes the sequence lengths (no heading) to <file>.infoseq.
    Raises via util if the input file or the infoseq binary is missing.
    """
    util.checkFile(file)
    util.checkSoft("infoseq")
    command = "infoseq -only -length -noheading %s -outfile %s.infoseq" % (file, file)
    util.runProcess(command)
def main():
    """Load each listed organism's EMBL file into a chado database.

    List file lines: common_name||taxon_id||filename ('!' = comment).
    Database coordinates come from ropy -D arguments; loading is done by
    shelling out to chado_load.
    """
    parser = OptionParser(usage="usage: %prog [Options]")
    parser.add_option("-l", "--list", metavar="FILE",
                      help="FILE containing the list of all organism common names and its associated file to load",
                      action="store", type="string", dest="list")
    parser.add_option("-D", action="store", dest="dbhost")
    (options, args) = parser.parse_args()

    # Print help if no argument given
    if util.printHelp(options):
        parser.print_help()
        sys.exit()

    # Print command line (join instead of the original += loop)
    cmdline = "$ python " + " ".join(sys.argv) + " "
    logger.info(cmdline)
    # Print logger file info
    logger.info(logsetup.logpath)

    # Setup database connection
    host = ropy.util.getDArg("dbhost", raiseOnEmpty=True)
    database = ropy.util.getDArg("dbname", raiseOnEmpty=True)
    port = ropy.util.getDArg("dbport", raiseOnEmpty=True)
    user = ropy.util.getDArg("dbuser", raiseOnEmpty=True)
    # password = ropy.util.getDArg("dbpassword", raiseOnEmpty = True)

    # Check if chado_load is installed
    util.isSoftInstalled("chado_load")

    # Read organism common name and load related embl file into the database
    data_path = options.list
    # consistency fix: the other drivers validate the list file before use
    util.checkFile(data_path)
    for line in open(data_path, "r"):
        if line[0] == '!':
            continue
        if line.count('||') < 1:
            continue
        # ! common_name||taxon_id||filename
        line = line.strip()
        # renamed from `list`, which shadowed the builtin
        fields = line.split('||')
        common_name = fields[0]
        filename = fields[2]
        util.checkFile(filename)
        # Loader command
        cmd = "chado_load embl -o %s -t contig -D %s:%s/%s -U %s %s" % (common_name, host, port, database, user, filename)
        # Run command
        util.runProcess(cmd)
def main(): usage = "usage: %prog [Options]" parser = OptionParser(usage=usage) parser.add_option("-l", metavar="FILE", help="FILE containing the list of all organism common names, its associated information", action="store", type="string", dest="list") parser.add_option("--submit", help="To submit data, not only checking locus_tag", action="store_true", dest="submit") (options, args) = parser.parse_args() # Print help if no argument given if util.printHelp(options): parser.print_help() sys.exit() # Get and check input arguments # Read organism common name and related fasta sequence file list_file = options.list util.checkFile(list_file) for line in open(list_file, "r"): if line[0] == '!': continue if line.count('||') < 1: continue # ! organism_name||strain||locus_tag||seq_size||seq_depth||dna_source||description line = line.strip() values = line.split('||') organism_name = values[0] strain = values[1] locus_tag = values[2] seq_size = values[3] seq_depth = values[4] if values[5] == 'GHP': dna_source = 'Gut Health Programme, Rowett Institute of Nutrition and Health, University of Aberdeen. http://www.rowett.ac.uk/divisions/ghp/' elif values[5] == 'INRA': dna_source = 'INRA Clermont-Ferrand-Theix. http://www.clermont.inra.fr/' elif values[5] == 'DSMZ': dna_source = 'Deutsche Sammlung von Mikroorganismen und Zellkulturen. GmbH http://www.dsmz.de/' elif values[5] == 'NCTC': dna_source = "Health Protection Agency's National Collection of Type Cultures. http://www.hpacultures.org.uk/" else: print "DNA source %s not found! Please provide details..." % values[5] continue #print dna_source description = values[6] doSubmit(organism_name=organism_name, strain=strain, locus_tag=locus_tag, seq_size=seq_size, seq_depth=seq_depth, dna_source=dna_source, description=description, submit=options.submit)
def doRun(): usage = "usage: %prog [Options]" parser = OptionParser(usage=usage) parser.add_option("-o", metavar="NAME", help="organism common name", action="store", type="string", dest="name") parser.add_option("-i", metavar="FILE", help="input organism sequence file in FASTA format", action="store", type="string", dest="input") (options, args) = parser.parse_args() try: common_name = options.name input_file = checkValidInput(options.input, common_name) # Print info log.info("Running Glimmer3 on %s\n" % common_name) log.info("Getting sequence from %s\n" % input_file) # Run glimmer3 iterated script = "/software/pathogen/external/applications/glimmer/glimmer/scripts/g3-iterated.csh" util.checkFile(script) cmd = "%s %s %s" % (script, input_file, common_name) util.runProcess(cmd) # Run the conversion only if g3 successful g3_predict_file = "%s.predict" % common_name if os.path.exists(g3_predict_file): # Convert output results into a feature table EMBL file. g3_tab = convertToTab(g3_predict_file, common_name) # Tidy up util.rmFile(common_name + ".longorfs") util.rmFile(common_name + ".train") util.rmFile(common_name + ".icm") util.rmFile(common_name + ".run1.detail") util.rmFile(common_name + ".run1.predict") util.rmFile(common_name + ".coords") util.rmFile(common_name + ".upstream") util.rmFile(common_name + ".motif") util.rmFile(common_name + ".detail") util.rmFile(g3_predict_file) log.info("%s is the final feature table Glimmer3 predictions\n" % g3_tab) else: log.info("%s file does not exists\n" % g3_predict_file) except Exception, e: log.error(e) raise e
def union(file, common_name, locus_tag, organism_name, strain):
    """Merge scaffolds into one sequence file.

    If the FASTA file holds more than one record ('>' count > 1), run EMBOSS
    union piped through descseq to produce <common_name>.fsa with a single
    renamed header; otherwise return the input file unchanged.
    Returns the path of the (possibly new) single-sequence file.
    """
    util.checkFile(file)
    record_count = util.runProcess("grep '>' %s | wc -l" % file)
    if int(record_count) <= 1:
        # already a single sequence: nothing to merge
        return file
    merged_file = "%s.fsa" % common_name
    util.checkSoft("union")
    util.checkSoft("descseq")
    header = "%s [organism=%s] [strain=%s] [gcode=11]" % (locus_tag, organism_name, strain)
    util.runProcess("union -sequence %s -stdout Yes -auto Yes | descseq -filter Yes -name '%s' -auto Yes > %s" % (file, header, merged_file))
    return merged_file
def checkValidInput(input_file, common_name):
    """Check if the input fasta sequence file is of correct format.

    RAST re-arranges the scaffolds if a splitted sequence is submitted, so
    when the file holds more than one record ('>' count > 1) it is first
    merged with EMBOSS union/descseq into <common_name>.fna.
    Returns the path of the file to submit.
    """
    util.checkFile(input_file)
    record_count = util.runProcess("grep '>' %s | wc -l" % input_file)
    if int(record_count) <= 1:
        return input_file
    merged_file = "%s.fna" % common_name
    util.checkSoft("union")
    util.checkSoft("descseq")
    util.runProcess("union -sequence %s -stdout Yes -auto Yes | descseq -filter Yes -name '%s' -auto Yes > %s" % (input_file, common_name, merged_file))
    return merged_file
def main():
    """Submit sequences or fetch results, singly or in batch.

    Presumably drives the RAST service (checkValidInput's docstring mentions
    RAST) — confirm. With -l, processes every line of the list file; without
    it, acts on the single organism given by -o/-i/-j. --fetch switches from
    submission to result retrieval.
    """
    usage = "usage: %prog [Options]"
    parser = OptionParser(usage=usage)
    parser.add_option("-o", metavar="NAME", help="organism common name", action="store", type="string", dest="name")
    parser.add_option("-i", metavar="FILE", help="input organism sequence file in FASTA format", action="store", type="string", dest="input")
    parser.add_option("-j", metavar="ID", help="input job ID to fetch results", action="store", type="string", dest="jobid")
    parser.add_option("-l", metavar="FILE", help="FILE containing the list of all organism common names and its associated sequence file", action="store", type="string", dest="list")
    parser.add_option("--fetch", help="To fetch results, job id must be provided", action="store_true", dest="fetch")
    (options, args) = parser.parse_args()
    # Print help if no argument given
    if util.printHelp(options):
        parser.print_help()
        sys.exit()
    # Get and check input arguments
    if options.list:
        # Batch mode: read organism common name and related fasta sequence file
        list_file = options.list
        util.checkFile(list_file)
        for line in open(list_file, "r"):
            if line[0] == '!':
                continue
            if line.count('||') < 1:
                continue
            # ! common_name||sequence_file
            line = line.strip()
            values = line.split('||')
            common_name = values[0]
            if options.fetch:
                # in fetch mode the third field is expected to be the job id
                job_id = values[2]
                doFetch(common_name, job_id)
            else:
                input_file = checkValidInput(values[1], common_name)
                doSubmit(common_name, input_file)
    else:
        # Single-organism mode using -o/-i/-j directly
        common_name = options.name
        if options.fetch:
            job_id = options.jobid
            doFetch(common_name, job_id)
        else:
            input_file = checkValidInput(options.input, common_name)
            doSubmit(common_name, input_file)
def convertToTab(result_file, common_name): try: tab_file = "%s.prodigal.tab" % common_name util.checkFile(result_file) f_input = open(result_file, "r") f_output = open(tab_file, "w") for line in f_input: line = line.strip() # CDS complement(14682..18617) values = line.split() location = values[1] f_output.write("FT CDS %s\n" % location) f_output.write("FT /colour=4\n") f_output.write('FT /method="PRODIGAL"\n') f_input.close() f_output.close() return tab_file except util.UtilException, ue: raise ue
def checkValidInput(input_file, common_name): """ Check if the input fasta sequence file is of correct format. Segmentation fault while running glimmer on splitted sequences with a fasta file Run EMBOSS union before if more than on '>' is found """ try: util.checkFile(input_file) cmd = "grep '>' %s | wc -l" % input_file result = util.runProcess(cmd) if int(result) > 1: new_input_file = "%s.fna" % common_name util.checkSoft("union") util.checkSoft("descseq") cmd_union = "union -sequence %s -stdout Yes -auto Yes | descseq -filter Yes -name '%s' -auto Yes > %s" % (input_file, common_name, new_input_file) util.runProcess(cmd_union) return new_input_file else: return input_file except util.UtilException, ue: raise ue
def main():
    """Convert each listed organism's ../IMG/<name>.4dep.embl into <name>.tbl.

    Without --convert it only logs what would be converted; with --convert it
    calls doConvert and logs (but does not abort on) conversion failures.
    """
    usage = "usage: %prog [Options]"
    parser = OptionParser(usage=usage)
    parser.add_option("-l", metavar="FILE", help="FILE containing the list of all organism common names and its associated information (common_name||organim_name||strain||locus_tag||genome_project_id||coverage)", action="store", type="string", dest="list")
    parser.add_option("--convert", help="Do convert embl file into tbl", action="store_true", dest="convert")
    (options, args) = parser.parse_args()
    # Print help if no argument given
    if util.printHelp(options):
        parser.print_help()
        sys.exit()
    # Get and check input arguments
    if options.list:
        # Read organism common name and related locus tag
        list_file = options.list
        util.checkFile(list_file)
        for line in open(list_file, "r"):
            if line[0] == '!':
                continue
            # require exactly 6 '||' separators, i.e. 7 fields:
            # ! common_name||organim_name||strain||locus_tag||genome_project_id||coverage||source
            if not line.count('||') == 6:
                continue
            line = line.strip()
            values = line.split('||')
            common_name = values[0]
            locus_tag = values[3]
            embl_file = "../IMG/%s.4dep.embl" % common_name
            util.checkFile(embl_file)
            tbl_file = "%s.tbl" % common_name
            log.info("Convert file %s into %s" % (embl_file, tbl_file))
            if options.convert:
                try:
                    doConvert(embl_file, tbl_file, locus_tag)
                except Exception, e:
                    # log and continue with the next organism
                    log.error("Converting %s" % embl_file)
                    log.error(traceback.extract_stack())
                    log.error(e)
def main():
    """Convert each listed organism's <name>.img.embl into <name>.tbl.

    NOTE(review): the variable is named gbk_file and the --convert help says
    "genbank file into embl", but the filename pattern is *.img.embl — the
    naming looks stale; confirm which format is really read.
    """
    usage = "usage: %prog [Options]"
    parser = OptionParser(usage=usage)
    parser.add_option("-l", metavar="FILE", help="FILE containing the list of all organism common names and its associated locus tag", action="store", type="string", dest="list")
    parser.add_option("--convert", help="Do convert genbank file into embl", action="store_true", dest="convert")
    (options, args) = parser.parse_args()
    # Print help if no argument given
    if util.printHelp(options):
        parser.print_help()
        sys.exit()
    # Get and check input arguments
    if options.list:
        # Read organism common name and related locus tag
        list_file = options.list
        util.checkFile(list_file)
        for line in open(list_file, "r"):
            if line[0] == '!':
                continue
            if line.count('||') < 1:
                continue
            # ! common_name||organim_name||strain||locus_tag||fasta_file
            line = line.strip()
            values = line.split('||')
            common_name = values[0]
            locus_tag = values[3]
            gbk_file = "%s.img.embl" % common_name
            util.checkFile(gbk_file)
            tbl_file = "%s.tbl" % common_name
            print "Convert file %s into %s" % (gbk_file, tbl_file)
            if options.convert:
                try:
                    doConvert(gbk_file, tbl_file, locus_tag)
                except Exception, e:
                    # report and continue with the next organism
                    print "ERROR to convert %s" % gbk_file
                    print e
def convertToTab(g3_predict_file, common_name):
    """Convert Glimmer3 .predict output into an EMBL-style feature table.

    g3_predict_file -- Glimmer3 prediction file; '>' lines are headers and the
                       data lines are: id start end direction score
    common_name     -- organism common name used for the output file name.
    Predictions where the direction contradicts the coordinate order
    (likely origin-wrapping calls) are skipped.
    Returns the name of the written file (<common_name>.g3.tab).
    Raises util.UtilException (from util.checkFile) if the input is missing.
    """
    g3_tab = "%s.g3.tab" % common_name
    util.checkFile(g3_predict_file)
    f_input = open(g3_predict_file, 'r')
    try:
        f_output = open(g3_tab, 'w')
        try:
            for line in f_input:
                if line[0] == '>':
                    continue
                # id start end direction score
                values = line.strip().split()
                gene_id = values[0]  # renamed from `id`, which shadowed the builtin
                start = int(values[1])
                end = int(values[2])
                direction = values[3]
                score = values[4]
                if direction[0] == "+":
                    location = "%s..%s" % (start, end)
                else:
                    location = "complement(%s..%s)" % (end, start)
                # keep only predictions whose strand agrees with start/end order
                if not ((direction[0] == '+' and start > end) or (direction[0] == '-' and start < end)):
                    f_output.write("FT CDS %s\n" % location)
                    f_output.write("FT /note=\"Raw score %s\"\n" % score)
                    f_output.write("FT /label=%s\n" % gene_id)
                    f_output.write("FT /colour=4\n")
                    f_output.write("FT /method=\"GLIMMER\"\n")
        finally:
            # the original leaked both handles when a malformed line raised
            f_output.close()
    finally:
        f_input.close()
    return g3_tab
def main():
    """Submit organisms with their IMG project ID, singly or in batch.

    Batch mode (-l) reads common_name||sequence_file||project_id lines;
    single mode uses -o/-i/-p directly.
    """
    parser = OptionParser(usage="usage: %prog [Options]")
    parser.add_option("-o", metavar="NAME", help="organism common name",
                      action="store", type="string", dest="name")
    parser.add_option("-i", metavar="FILE",
                      help="input organism sequence file in FASTA format",
                      action="store", type="string", dest="input")
    parser.add_option("-p", metavar="ID", help="IMG project ID (GOLD Stamp ID)",
                      action="store", type="string", dest="id")
    parser.add_option("-l", metavar="FILE",
                      help="FILE containing the list of all organism common names, its associated sequence file and IMG project ID",
                      action="store", type="string", dest="list")
    (options, args) = parser.parse_args()

    # Print help if no argument given
    if util.printHelp(options):
        parser.print_help()
        sys.exit()

    # Get and check input arguments
    if options.list:
        # Batch mode: read organism common name and related fasta sequence file
        list_file = options.list
        util.checkFile(list_file)
        for raw in open(list_file, "r"):
            # ! common_name||sequence_file
            if raw[0] == '!' or raw.count('||') < 1:
                continue
            fields = raw.strip().split('||')
            common_name = fields[0]
            input_file = fields[1]
            project_id = fields[2]
            util.checkFile(input_file)
            doSubmit(common_name, input_file, project_id)
    else:
        # Single-organism mode
        util.checkFile(options.input)
        doSubmit(options.name, options.input, options.id)
def main():
    """Build sequence/sample/assembly index files for a set of 454 runs.

    Reads a '||'-separated list of runs (-l), converts each run's sff file
    to gzipped fastq files on LSF (--fastq), writes sequence.index,
    samples.info and assembly.index under <outpath>/metadata, and with
    --md5 rewrites sequence.index replacing the 'md5' placeholder with the
    real md5sum of each fastq file.
    """
    usage = "usage: %prog [Options]"
    parser = OptionParser(usage=usage)
    parser.add_option("-l", metavar="FILE", help="FILE containing the list of all 454 runs and its associated information", action="store", type="string", dest="list")
    parser.add_option("-o", "--outpath", metavar="PATH", help="PATH where to generate indexes and temporary fastq files", action="store", type="string", dest="outpath")
    parser.add_option("--fastq", help="Do generate fastq files.", action="store_true", dest="fastq")
    parser.add_option("--md5", help="Do run md5sum on generated fastq files.", action="store_true", dest="md5")
    (options, args) = parser.parse_args()
    if not options.list and not options.outpath:
        parser.print_help()
        sys.exit()
    # input file
    input_file = options.list
    util.checkFile(input_file)
    input_lines = open(input_file, "r").readlines()
    # output path
    output_path = options.outpath
    util.checkDir(output_path)
    out_metadata = "%s/metadata" % output_path
    util.checkDir(out_metadata)
    out_fastq = "%s/fastq" % output_path
    util.checkDir(out_fastq)
    # checking file format first before processing it: every data line must
    # have exactly 8 '||' separators (9 fields); abort on the first bad line
    lines = []
    for line in input_lines:
        if line[0] == '!':
            continue
        elif not line.count('||') == 8:
            log.error("line is not well formated. Please check your input file.")
            log.error(line.count('||'))
            log.error(line)
            sys.exit()
        else:
            lines.append(line)
    log.debug(lines)
    # opening output files
    sequence_index_filename = '%s/sequence.index' % out_metadata
    sequence_index = open(sequence_index_filename, 'w')
    samples_info = open('%s/samples.info' % out_metadata, 'w')
    assembly_index = open('%s/assembly.index' % out_metadata, 'w')
    # processing input file
    sample_count = 0
    for line in lines:
        line = line.strip()
        values = line.split('||')
        log.info(line)
        genus = values[1]
        species = values[2]
        strain = values[3]
        organism_name = "%s %s %s" % (genus, species, strain)
        sample_count = sample_count + 1
        run = values[4]
        # field 5 flags paired reads; `paired` itself is never used later —
        # the PAIRED/SINGLE layout written below is hard-coded per file
        if values[5] == '1':
            paired = 'PAIRED'
        else:
            paired = 'SINGLE'
        insert_size = values[6]
        sff_file = "/nfs/%s" % values[7]
        trim_status = "/nfs/%s/454TrimStatus.txt" % values[8]
        contigs_file = "/nfs/%s/454LargeContigs.fna" % values[8]
        scaffolds_file = "/nfs/%s/454Scaffolds.fna" % values[8]
        # sanitize species/strain for use in names and hierarchy paths
        species_strain = sub('[^a-zA-Z0-9_]', '_', "%s_%s" % (species, strain)).replace('__', '_')
        strain_4_hierarchy = sub('[^a-zA-Z0-9_]', '_', strain).replace('__', '_')
        study = "%s_%s%s" % (values[0], genus[0], species_strain)
        # check that project name (study) is less than 40 char
        # mysql> desc project;
        # | name           | varchar(40) | NO | MUL | | |
        # | hierarchy_name | varchar(40) | NO | MUL | | |
        if len(study) > 40:
            log.warning("Project name %s has more than 40 char." % study)
        # checking files
        util.checkFile(sff_file)
        util.checkFile(trim_status)
        util.checkFile(contigs_file)
        util.checkFile(scaffolds_file)
        # convert sff into fastq
        outprefix = "%s/%s" % (out_fastq, run)
        cmd_sff2fastq = "/nfs/users/nfs_m/mh12/svn-repository/pathogen/user/mh12/python/454TrimStatus2reads.py --pair_suffix=/1,/2 --sff %s %s %s" % (sff_file, trim_status, outprefix)
        fastq_pairs = "%s-pairs.fastq" % outprefix
        fastq_single = "%s-single.fastq" % outprefix
        # split fastq pairs file
        fastq_1 = "%s_1.fastq" % outprefix
        fastq_2 = "%s_2.fastq" % outprefix
        cmd_splitfastq = "/nfs/users/nfs_m/mh12/svn-repository/pathogen/user/mh12/python/fastn_unshuffle.py %s %s %s" % (fastq_pairs, fastq_1, fastq_2)
        # rename fastq single file
        fastq_0 = "%s.fastq" % outprefix
        cmd_rename = "mv %s %s" % (fastq_single, fastq_0)
        # tidy-up
        cmd_remove = "rm %s-info.txt; rm %s-pairs.fastq" % (outprefix, outprefix)
        # gzip fastq files
        cmd_gzip = "gzip %s; gzip %s; gzip %s" % (fastq_1, fastq_2, fastq_0)
        # all commands chained into a single LSF job
        cmd = "%s; %s; %s; %s; %s" % (cmd_sff2fastq, cmd_splitfastq, cmd_rename, cmd_remove, cmd_gzip)
        if IS_LSF:
            if not (os.path.exists("%s.gz" % fastq_1) and os.path.exists("%s.gz" % fastq_2) and os.path.exists("%s.gz" % fastq_0)):
                if options.fastq:
                    util.submitJob(jobname='sff2fastq_%s' % run, cmd=cmd, outdir=out_metadata)
                else:
                    log.info("fastq files do not exist, use '--fastq' to generate them.")
            else:
                log.info("fastq files already exist.")
        else:
            log.info("Need to be run on LSF.")
        instrument_platform = '454'
        empty = 'n/a'
        fastq_1_gz = "%s.gz" % fastq_1
        fastq_2_gz = "%s.gz" % fastq_2
        fastq_0_gz = "%s.gz" % fastq_0
        # write to output files; 'md5' is a placeholder fixed up by --md5 later
        # sequence.index: fastq_file|md5|run_id|study_id|(study_name)|center_name|(submission_id)|(submission_date)|sample_id|sample_name|
        #                 (population)|(experiment_id)|instrument_platform|(instrument_model)|library_name|(run_name)|(run_block_name)|
        #                 insert_size|(library_layout)|paired_fastq|withdrawn|(withdrawn_date)|(comment)|read_count|base_count
        sequence_index.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (fastq_1_gz, 'md5', run, study, study, 'SC', empty, empty, strain, strain, strain, empty, instrument_platform, empty, run, empty, empty, insert_size, 'PAIRED', fastq_2_gz, '0', empty, empty, '0', '0'))
        sequence_index.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (fastq_2_gz, 'md5', run, study, study, 'SC', empty, empty, strain, strain, strain, empty, instrument_platform, empty, run, empty, empty, insert_size, 'PAIRED', fastq_1_gz, '0', empty, empty, '0', '0'))
        sequence_index.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (fastq_0_gz, 'md5', run, study, study, 'SC', empty, empty, strain, strain, strain, empty, instrument_platform, empty, run, empty, empty, insert_size, 'SINGLE', '', '0', empty, empty, '0', '0'))
        # samples.info: lookup_name|acc|individual_name|alias|population_name|species_name|taxon_id|sex
        samples_info.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (strain, strain, strain, strain, strain, organism_name, empty, empty))
        # assembly.index: study||genus||species_strain||assembly_id||contig||scaffold||run
        # assembly id is derived from the dated assembly directory, e.g.
        # /pyrodata01/assemblies/Ruminococcus/obeum/A2162/P_2009_07_12_22_16_23_runAssembly/454TrimStatus.txt
        assembly_id = "newbler_%s" % trim_status.split('/P_')[1][:10]
        assembly_index.write("%s||%s||%s||%s||%s||%s||%s\n" % (study, genus, species_strain, assembly_id, contigs_file, scaffolds_file, run))
    # close files
    sequence_index.close()
    samples_info.close()
    assembly_index.close()
    if not options.fastq:
        log.info("Use '--fastq' for generating fastq files")
    if options.md5:
        # calculate md5 and modify sequence.index in place
        util.checkFile(sequence_index_filename)
        seq_lines = open(sequence_index_filename, "r").readlines()
        sequence_index = open(sequence_index_filename, 'w')
        for line in seq_lines:
            values = line.split('\t')
            fastq = values[0]
            run = values[2]
            if os.path.exists(fastq):
                md5 = util.runProcess("md5sum %s | cut -d ' ' -f 1" % fastq).strip()
                line = line.replace('md5', md5)
                sequence_index.write(line)
            else:
                # NOTE(review): this branch does not write the line back, so
                # entries whose fastq is missing are silently dropped from
                # sequence.index — confirm that is intended
                log.info("fastq file %s does not exist, use '--fastq' for generating it." % fastq)
        # close file
        sequence_index.close()
    else:
        log.info("When all submitted jobs end, use '--md5' for updating sequence.index with md5sum.")
def main():
    """Build sequence/sample/assembly index files for 454 runs of a category.

    Run metadata comes from a Google spreadsheet ('<Category>_454_Projects').
    For each run it decides, based on what already exists in the tracking
    database and in the <root>/<category> hierarchy, whether to emit
    sequence/sample index entries and/or assembly index entries under
    <root>/.tmp/metadata/<category>.
    """
    usage = "usage: %prog [Options]"
    parser = OptionParser(usage=usage)
    parser.add_option("-r", "--root", metavar="PATH", help="PATH to the root of the hierarchy", action="store", type="string", dest="root")
    parser.add_option("-c", "--category", metavar="CATEGORY", help="name of the category from %s" % constants.CATEGORY, action="store", choices=constants.CATEGORY, dest="category")
    (options, args) = parser.parse_args()
    if not (options.root and options.category):
        parser.print_help()
        sys.exit()
    # get the data from Goodgle spreadsheet
    lines = gdocs.getValues(doc='%s_454_Projects' % options.category.title())
    # check output path
    util.checkDir(options.root)
    out_metadata = "%s/.tmp/metadata/%s" % (options.root, options.category)
    util.checkDir(out_metadata)
    out_fastq = "%s/.tmp/fastq/%s" % (options.root, options.category)
    util.checkDir(out_fastq)
    # open output files
    sequence_index = open('%s/sequence.index' % out_metadata, 'w')
    samples_info = open('%s/samples.info' % out_metadata, 'w')
    assembly_index = open('%s/assembly.index' % out_metadata, 'w')
    # process input data
    sample_count = 0
    for line in lines:
        line = line.strip()
        values = line.split('||')
        genus = values[1]
        species = values[2]
        strain = values[3]
        organism_name = "%s %s %s" % (genus, species, strain)
        sample_count = sample_count + 1
        sample = values[4]
        if sample == 'None':
            sample = strain
        library = values[5]
        run = values[6]
        if library == 'None':
            library = run
        # only paired runs are supported; single runs are skipped
        if values[7] == '1':
            paired = 'PAIRED'
        else:
            paired = 'SINGLE'
            log.error("Single read set for %s. Not implemented." % run)
            continue
        insert_size = values[8]
        sff_file = "/nfs/%s" % values[9]
        trim_status = "/nfs/%s/454TrimStatus.txt" % values[10]
        contigs_file = "/nfs/%s/454LargeContigs.fna" % values[10]
        scaffolds_file = "/nfs/%s/454Scaffolds.fna" % values[10]
        # sanitize species/strain for names and hierarchy paths
        species_strain = sub('[^a-zA-Z0-9_]', '_', "%s_%s" % (species, strain)).replace('__', '_')
        strain_4_hierarchy = sub('[^a-zA-Z0-9_]', '_', strain).replace('__', '_')
        study = "%s_%s%s" % (values[0], genus[0], species_strain)
        instrument_platform = '454'
        empty = 'n/a'
        # expected locations of the fastq files generated from the sff file
        fastq_1_gz = "%s/%s_1.fastq.gz" % (out_fastq, run)
        fastq_2_gz = "%s/%s_2.fastq.gz" % (out_fastq, run)
        fastq_0_gz = "%s/%s.fastq.gz" % (out_fastq, run)
        # check that project name (study) is less than 40 char
        # mysql> desc project;
        # | name           | varchar(40) | NO | MUL | | |
        # | hierarchy_name | varchar(40) | NO | MUL | | |
        if len(study) > 40:
            log.warning("Project name %s has more than 40 char." % study)
        # checking files
        util.checkFile(sff_file)
        util.checkFile(trim_status)
        util.checkFile(contigs_file)
        util.checkFile(scaffolds_file)
        log.info("> checking fastq files: %s" % run)
        # get lane hierarchy path (run); "undefined" means not loaded in db
        run_path = util.runProcess("/software/bin/perl /nfs/users/nfs_a/ap12/genlibpy/genepy/pathtrack/get_lane_hierarchy_path.pl --lane=%s --db=%s" % (run, constants.DATABASE[options.category])).strip()
        # check if fastq files have been loaded in db
        do_generate_indexes = False
        do_generate_assembly_indexes = False
        if run_path != "undefined":
            log.info(" loaded in db.")
            fastq_path = "%s/%s/seq-pipelines/%s" % (options.root, options.category, run_path)
            util.checkDir(fastq_path)
            # check if fastq files have been imported into the hierarchy
            if not (os.path.exists("%s/%s_1.fastq.gz" % (fastq_path, run)) and os.path.exists("%s/%s_2.fastq.gz" % (fastq_path, run)) and os.path.exists("%s/%s.fastq.gz" % (fastq_path, run))):
                log.info(" not imported into hierarchy.")
                # check if fastq files have been generated from sff into tmp dir
                if not (os.path.exists(fastq_1_gz) and os.path.exists(fastq_2_gz) and os.path.exists(fastq_0_gz)):
                    log.info(" not generated from sff files.")
                else:
                    log.info(" generate indexes.")
                    do_generate_indexes = True
            else:
                log.info(" already imported into hierarchy.")
                do_generate_assembly_indexes = True
        else:
            log.info(" not loaded in db.")
            # check if fastq files have been generated from sff into tmp dir
            if not (os.path.exists(fastq_1_gz) and os.path.exists(fastq_2_gz) and os.path.exists(fastq_0_gz)):
                log.info(" not generated from sff files.")
            else:
                log.info(" generate indexes.")
                do_generate_indexes = True
        # generate sequence and sample indexes
        if do_generate_indexes:
            # calculate md5
            md5_1 = util.runProcess("md5sum %s | cut -d ' ' -f 1" % fastq_1_gz).strip()
            md5_2 = util.runProcess("md5sum %s | cut -d ' ' -f 1" % fastq_2_gz).strip()
            md5_0 = util.runProcess("md5sum %s | cut -d ' ' -f 1" % fastq_0_gz).strip()
            # write to output files
            # sequence.index: fastq_file|md5|run_id|study_id|(study_name)|center_name|(submission_id)|(submission_date)|sample_id|sample_name|
            #                 (population)|(experiment_id)|instrument_platform|(instrument_model)|library_name|(run_name)|(run_block_name)|
            #                 insert_size|(library_layout)|paired_fastq|withdrawn|(withdrawn_date)|(comment)|read_count|base_count
            sequence_index.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (fastq_1_gz, md5_1, run, study, study, 'SC', empty, empty, sample, sample, strain, empty, instrument_platform, empty, library, empty, empty, insert_size, 'PAIRED', fastq_2_gz, '0', empty, empty, '0', '0'))
            sequence_index.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (fastq_2_gz, md5_2, run, study, study, 'SC', empty, empty, sample, sample, strain, empty, instrument_platform, empty, library, empty, empty, insert_size, 'PAIRED', fastq_1_gz, '0', empty, empty, '0', '0'))
            sequence_index.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (fastq_0_gz, md5_0, run, study, study, 'SC', empty, empty, sample, sample, strain, empty, instrument_platform, empty, library, empty, empty, insert_size, 'SINGLE', '', '0', empty, empty, '0', '0'))
            # samples.info: lookup_name|acc|individual_name|alias|population_name|species_name|taxon_id|sex
            samples_info.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (strain, strain, strain, strain, strain, organism_name, empty, empty))
        # generate assembly indexes
        if do_generate_indexes or do_generate_assembly_indexes:
            # assembly.index: study||genus||species_strain||assembly_id||contig||scaffold||run
            # assembly id derives from the dated assembly directory, e.g.
            # /pyrodata01/assemblies/Ruminococcus/obeum/A2162/P_2009_07_12_22_16_23_runAssembly/454TrimStatus.txt
            assembly_id = "newbler_%s" % trim_status.split('/P_')[1][:10]
            assembly_index.write("%s||%s||%s||%s||%s||%s||%s\n" % (study, genus, species_strain, assembly_id, contigs_file, scaffolds_file, run))
    # close files
    sequence_index.close()
    samples_info.close()
    assembly_index.close()
def main():
    """Import assembly files (contigs/scaffolds) into the sequencing hierarchy.

    Reads a '||'-separated assembly list file (-a), copies each contig and
    scaffold fasta into
    <root>/<category>/seq-pipelines/<genus>/<species>/ASSEMBLY/<assembly_id>/,
    symlinks the run fastq directory, and submits one stats job per project
    to generate faidx/refstats/bwa-index/stats files for the QC pipeline.
    Exits with the option parser help when required options are missing.
    """
    usage = "usage: %prog [Options]"
    parser = OptionParser(usage=usage)
    parser.add_option("-a", "--assembly", metavar="FILE",
                      help="FILE containing the list of all contigs and scaffolds to import",
                      action="store", type="string", dest="assembly")
    parser.add_option("-r", "--root", metavar="PATH",
                      help="PATH to the root of the hierarchy",
                      action="store", type="string", dest="root")
    parser.add_option("-c", "--category", metavar="CATEGORY",
                      help="name of the category from %s" % constants.CATEGORY,
                      action="store", choices=constants.CATEGORY, dest="category")
    (options, args) = parser.parse_args()
    if not (options.assembly and options.root and options.category):
        parser.print_help()
        sys.exit()
    # check root path
    if not os.path.exists(options.root):
        log.error("%s path do not exist" % options.root)
        log.error("Create root path first, then run pipeline before importing assembly files.")
        sys.exit()
    # check input assembly file and read it - one line per run (lane)
    util.checkFile(options.assembly)
    assembly_lines = open(options.assembly, "r").readlines()
    # compare project name - could have more than one run per project
    previous_project = ""
    is_new_project = True
    for line in assembly_lines:
        # skip empty lines, comments ('!') and malformed lines
        # expected: project||genus||species||assembly_id||contig_file||scaffold_file||run
        if not line or line[0] == '!':
            continue
        if not line.count('||') == 6:
            continue
        line = line.strip()
        values = line.split('||')
        project = values[0]
        genus = values[1]
        species = values[2]
        assembly_id = values[3]
        contig_file = values[4]
        scaffold_file = values[5]
        run = values[6]
        # get lane hierarchy path (run) from the tracking database
        run_path = util.runProcess("/software/bin/perl /nfs/users/nfs_a/ap12/genlibpy/genepy/pathtrack/get_lane_hierarchy_path.pl --lane=%s --db=%s" % (run, constants.DATABASE[options.category])).strip()
        fastq_file = "%s/%s/seq-pipelines/%s" % (options.root, options.category, run_path)
        # check if new project - stats job is submitted only once per project
        if project == previous_project:
            is_new_project = False
        else:
            previous_project = project
            is_new_project = True
        # check species path exists (created by the fastq import pipeline)
        species_path = "%s/%s/seq-pipelines/%s/%s" % (options.root, options.category, genus, species)
        if not os.path.exists(species_path):
            log.error("%s path do not exist" % species_path)
            log.error("Run fastq import pipeline before importing assembly files.")
        else:
            # create assembly path and assembly_id path (e.g. ASSEMBLY/newbler_2009_06_29)
            assembly_path = "%s/ASSEMBLY" % species_path
            _make_dir(assembly_path)
            assembly_id_path = "%s/%s" % (assembly_path, assembly_id)
            _make_dir(assembly_id_path)
            # copy contigs and scaffolds files into the hierarchy and verify checksums
            contig_file_hierarchy = "%s/LargeContigs.fna" % assembly_id_path
            _copy_and_verify(contig_file, contig_file_hierarchy)
            scaffold_file_hierarchy = "%s/Scaffolds.fna" % assembly_id_path
            _copy_and_verify(scaffold_file, scaffold_file_hierarchy)
            # create fastqs path and symlink the run fastqs into it
            fastqs_path = "%s/fastqs" % assembly_id_path
            _make_dir(fastqs_path)
            util.checkDir(fastq_file)
            fastq_name = run
            symlink = "%s/%s" % (fastqs_path, fastq_name)
            if not os.path.exists(symlink):
                os.symlink(fastq_file, symlink)
                log.info("%s symlink created" % symlink)
            else:
                log.info("%s symlink already exists" % symlink)
            # run samtools faidx, refstats, bwa to generate extra files required for the QC pipeline
            cmd = ""
            if not os.path.exists("%s.fai" % scaffold_file_hierarchy):
                cmd = "samtools faidx %s; " % scaffold_file_hierarchy
            else:
                log.info("%s.fai already exists" % scaffold_file_hierarchy)
            if not os.path.exists("%s.refstats" % scaffold_file_hierarchy):
                cmd = cmd + "ref-stats -r %s > %s.refstats; " % (scaffold_file_hierarchy, scaffold_file_hierarchy)
            else:
                log.info("%s.refstats already exists" % scaffold_file_hierarchy)
            if not os.path.exists("%s.bwt" % scaffold_file_hierarchy):
                cmd = cmd + "bwa index %s; " % scaffold_file_hierarchy
            else:
                log.info("%s.bwt already exists" % scaffold_file_hierarchy)
            # run stats.py on contigs and scaffolds
            cmd_stats = "python /nfs/users/nfs_a/ap12/genlibpy/genepy/pathtrack/stats.py -f %s; "
            if not os.path.exists("%s.stats" % contig_file_hierarchy):
                cmd = cmd + cmd_stats % contig_file_hierarchy
            else:
                log.info("%s.stats already exists" % contig_file_hierarchy)
            if not os.path.exists("%s.stats" % scaffold_file_hierarchy):
                cmd = cmd + cmd_stats % scaffold_file_hierarchy
            else:
                log.info("%s.stats already exists" % scaffold_file_hierarchy)
            # submit all jobs - only once per project
            if is_new_project and not cmd == "":
                util.submitJob(jobname='stats_%s_%s' % (project, assembly_id), cmd=cmd, outdir=assembly_id_path)


def _make_dir(path):
    """Create `path` (with parents) if it does not exist; log either way."""
    if not os.path.exists(path):
        os.makedirs(path)
        log.info("%s created" % path)
    else:
        log.info("%s path already exists" % path)


def _copy_and_verify(src, dst):
    """Copy fasta `src` to hierarchy location `dst` (unless already there),
    then verify the md5 of the copy matches the original.

    Bug fix vs original: the error log call was missing the '%' operator
    ("..." (src, dst) instead of "..." % (dst, src)), which raised
    TypeError: 'str' object is not callable on any checksum mismatch;
    the arguments were also swapped relative to the message wording.
    """
    util.checkFile(src)
    if not os.path.exists(dst):
        util.runProcess("cp %s %s" % (src, dst))
    else:
        log.info("%s file already exists" % dst)
    if not has_same_md5(src, dst):
        # dst is the copied file, src is the original
        log.error("Copied file %s is not the same as original file %s" % (dst, src))
def main():
    """Generate QC pipeline configuration files for imported assemblies.

    Reads a '||'-separated assembly list file (-a), checks that the scaffold
    reference (Scaffolds.fna) and its derived files (.fai, .refstats, .bwt)
    are present in the hierarchy, writes one <project>_qc.conf per project
    from constants.QC_CONF_TEMPLATE, and registers each conf file in
    <root>/conf/<category>/qc_pipeline.conf.

    Database connection settings are taken from the VRTRACK_* environment
    variables (may be None if unset - TODO confirm template tolerates that).
    """
    usage = "usage: %prog [Options]"
    parser = OptionParser(usage=usage)
    parser.add_option("-a", "--assembly", metavar="FILE",
                      help="FILE containing the list of all contigs and scaffolds to import",
                      action="store", type="string", dest="assembly")
    parser.add_option("-r", "--root", metavar="PATH",
                      help="PATH to the root of the hierarchy",
                      action="store", type="string", dest="root")
    parser.add_option("-c", "--category", metavar="CATEGORY",
                      help="name of the category from %s" % constants.CATEGORY,
                      action="store", choices=constants.CATEGORY, dest="category")
    (options, args) = parser.parse_args()
    if not (options.assembly and options.root and options.category):
        parser.print_help()
        sys.exit()
    # check root path
    if not os.path.exists(options.root):
        log.error("%s path do not exist" % options.root)
        log.error("Create root path first, then run pipeline before importing assembly files.")
        sys.exit()
    # check log directory exists
    out_log = "%s/log/%s" % (options.root, options.category)
    util.checkDir(out_log)
    # check input assembly file and read it - one line per run (lane)
    # (original left this handle open; close it explicitly)
    util.checkFile(options.assembly)
    assembly_list = open(options.assembly, "r")
    try:
        assembly_lines = assembly_list.readlines()
    finally:
        assembly_list.close()
    # open qc_pipeline.conf - closed in finally so a failure mid-loop
    # (e.g. util.checkFile raising) no longer leaks the handle
    pipeline_qc = open('%s/conf/%s/qc_pipeline.conf' % (options.root, options.category), 'w')
    try:
        # compare project name - could have more than one run per project,
        # but only one qc conf file is generated per project
        previous_project = ""
        for line in assembly_lines:
            # skip empty lines, comments ('!') and malformed lines
            # expected: project||genus||species||assembly_id||contig_file||scaffold_file||run
            if not line or line[0] == '!':
                continue
            if not line.count('||') == 6:
                continue
            line = line.strip()
            values = line.split('||')
            project = values[0]
            genus = values[1]
            species = values[2]
            assembly_id = values[3]
            # values[4] (contig file), values[5] (scaffold file) and
            # values[6] (run) are unused here: the QC reference is the
            # scaffold already imported into the hierarchy
            # check if new project
            if project != previous_project:
                # check if files are in place in the hierarchy
                species_path = "%s/%s/seq-pipelines/%s/%s" % (options.root, options.category, genus, species)
                assembly_path = "%s/ASSEMBLY" % species_path
                assembly_id_path = "%s/%s" % (assembly_path, assembly_id)
                scaffold_file_hierarchy = "%s/Scaffolds.fna" % assembly_id_path
                util.checkFile(scaffold_file_hierarchy)
                util.checkFile("%s.fai" % scaffold_file_hierarchy)
                util.checkFile("%s.refstats" % scaffold_file_hierarchy)
                util.checkFile("%s.bwt" % scaffold_file_hierarchy)
                # create one qc conf file specific per project
                qc_conf_filename = '%s/conf/%s/%s_qc.conf' % (options.root, options.category, project)
                qc_conf = open(qc_conf_filename, 'w')
                try:
                    qc_conf.write(constants.QC_CONF_TEMPLATE % {
                        'root': options.root,
                        'category': options.category,
                        'db': constants.DATABASE[options.category],
                        'db_host': os.getenv('VRTRACK_HOST'),
                        'db_port': os.getenv('VRTRACK_PORT'),
                        'db_rw_user': os.getenv('VRTRACK_RW_USER'),
                        'db_password': os.getenv('VRTRACK_PASSWORD'),
                        'project': project,
                        'ref': scaffold_file_hierarchy})
                finally:
                    qc_conf.close()
                log.info("QC conf file %s has been generated." % qc_conf_filename)
                # update qc_pipeline.conf
                pipeline_qc.write("__VRTrack_QC__\t%s\n" % (qc_conf_filename))
                # update previous project name
                previous_project = project
    finally:
        pipeline_qc.close()