def process_input(input, output, input_type, gene_list, append, errorlogger=None):
    commentPATT = re.compile(r'^#')
    count = 0

    mode = 'w'
    if append:
        mode = 'a'

    gene_list = read_gene_list(gene_list)
    gene_dict = {}
    for gene in gene_list:
        gene_dict[gene.lower()] = gene
        # re.compile(r'[\/\s]' + gene + '[\/\s]')

    # column positions of the query (q) and target (t) fields
    # for each supported input format
    if input_type == 'LAST2':
        q = 0
        t = 9
    if input_type == 'LAST1':
        q = 0
        t = 1
    if input_type == 'HMM':
        q = 2
        t = 0

    try:
        inputfile = open(input, 'r')
        outputfile = open(output, mode)
    except:
        if errorlogger:
            errorlogger.write("PARSE_BLAST\tERROR\tCannot open input file %s or output file %s\n" % (input, output))
        exit_process("PARSE_BLAST\tERROR\tCannot open input file %s or output file %s\n" % (input, output))

    for line in inputfile:
        result = commentPATT.search(line)
        if result:
            continue

        fields = [x.strip() for x in line.split('\t')]
        if len(fields) < 3:
            continue

        orfid = fields[q]
        target = find_gene_name(fields[t], gene_list, gene_dict)
        if target == None:
            continue

        fprintf(outputfile, "%s\t%s\n", orfid, gene_dict[target])

    outputfile.close()
    inputfile.close()
    # rename(output_blastoutput_parsed_tmp, output_blastoutput_parsed)
    return count
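# A minimal usage sketch for process_input (hedged: the file names below are
# made up, and read_gene_list/find_gene_name come from the surrounding
# module). For 'LAST2' input the ORF id is taken from column 1 (q=0) and the
# candidate gene name from column 10 (t=9); matched ORFs are written out as
# "<orfid>\t<gene>" pairs.
#
#   process_input('sample.lastout', 'sample.genes.txt', 'LAST2',
#                 'genes_of_interest.txt', append=False)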
def write_selected_sequences(selected_sequences, output_file_name):
    output_file = open(output_file_name, 'w')
    for read in selected_sequences:
        fprintf(output_file, ">%s\n", read)
        fprintf(output_file, "%s\n", selected_sequences[read])
    output_file.close()
def consolidateSplitResults(self, P, split_results):
    sourceParentDir = self.base_output_folder + PATHDELIM + P[0] + PATHDELIM + 'blast_results' + PATHDELIM + 'grid' + PATHDELIM + 'split_results'
    targetParentDir = self.base_output_folder + PATHDELIM + P[0] + PATHDELIM + 'blast_results'
    targetFileName = targetParentDir + PATHDELIM + P[0] + '.' + P[1] + '.' + self.algorithm + "out"
    try:
        targetfile = open(targetFileName, 'w')
    except:
        self.messagelogger.write("ERROR: Cannot create consolidated search results file %s!\n" % (targetFileName))
        sys.exit(0)

    for filename in split_results:
        sourceFileName = sourceParentDir + PATHDELIM + filename
        try:
            sourcefile = open(sourceFileName, 'r')
            resultLines = sourcefile.readlines()
            sourcefile.close()
        except:
            self.messagelogger.write("ERROR: Cannot read split search results file %s!\n" % (sourceFileName))
            sys.exit(0)

        try:
            for line in resultLines:
                fprintf(targetfile, "%s", line)
        except:
            self.messagelogger.write("ERROR: Cannot write result from file %s to the consolidated file!\n" % (sourceFileName))
            sys.exit(0)

    self.messagelogger.write("SUCCESS: Successfully consolidated search results into file %s!\n" % (targetFileName))
    targetfile.close()

    """ Now delete the consolidated split_results files """
    for filename in split_results:
        sourceFileName = sourceParentDir + PATHDELIM + filename
        os.remove(sourceFileName)
def main(argv):
    (opts, args) = parser.parse_args()
    if check_arguments(opts, args):
        print usage
        sys.exit(0)

    input_folder = opts.input_folder
    output_file = opts.output_file

    filePATTERN = re.compile(r'.*COG[0-9]*.*\.fa')
    cogSeqMatchesPATTERN = re.compile(r'[a-zA-Z]*_(.*)__[0-9]*__*(COG[0-9]*).*.fa')

    cog_hits = []   # (sequence name, COG id) pairs
    for file in listdir(input_folder):
        if filePATTERN.match(file):
            hits = cogSeqMatchesPATTERN.search(file)
            if hits:
                cog_hits.append((hits.group(1), hits.group(2)))

    try:
        outputfile = open(output_file, 'w')
    except:
        print "Cannot open file to write MLTreeMap hits"
        sys.exit(0)

    fprintf(outputfile, "Sequences\tCOG\n")
    for seq, cog in cog_hits:
        fprintf(outputfile, "%s\t%s\n", seq, cog)
    outputfile.close()
def write_refscores(refscore_file, refscores, compact_output=False):
    for key, value in refscores.items():
        orfid = key
        if compact_output:
            orfid = ShortenORFId(key)
        fprintf(refscore_file, "%s\t%s\n", orfid, value)
def write_annotation_for_orf(outputgff_file, candidatedbname, dbname_weight, results_dictionary, orf_dictionary, contig, candidate_orf_pos, orfid):
    try:
        fields = ['source', 'feature', 'start', 'end', 'score', 'strand', 'frame']

        output_line = orf_dictionary[contig][candidate_orf_pos]['seqname']
        for field in fields:
            # printf("\t%s", orf_dictionary[contig][candidate_orf_pos][field])
            output_line += "\t" + str(orf_dictionary[contig][candidate_orf_pos][field])

        attributes = "ID=" + orf_dictionary[contig][candidate_orf_pos]['id']
        attributes += ";" + "locus_tag=" + orf_dictionary[contig][candidate_orf_pos]['locus_tag']
        attributes += ";" + "contig_length=" + orf_dictionary[contig][candidate_orf_pos]['contig_length']
        attributes += ";" + "orf_length=" + orf_dictionary[contig][candidate_orf_pos]['orf_length']
        attributes += ";" + "partial=" + orf_dictionary[contig][candidate_orf_pos]['partial']
        attributes += ";" + "sourcedb=" + candidatedbname

        if candidatedbname in results_dictionary:
            attributes += ";" + "annotvalue=" + str(results_dictionary[candidatedbname][orfid]['value'])
            attributes += ";" + "ec=" + str(results_dictionary[candidatedbname][orfid]['ec'])
            attributes += ";" + "product=" + results_dictionary[candidatedbname][orfid]['product']
        else:
            attributes += ";" + "annotvalue=" + str('0')
            attributes += ";" + "ec=" + str('')
            attributes += ";" + "product=" + 'hypothetical protein'

        output_line += '\t' + attributes
        fprintf(outputgff_file, "%s\n", output_line)
    except:
        eprintf("ERROR : Failure to annotate in contig %s\n", contig)
        #print orf_dictionary[contig]
        print traceback.print_exc(10)
        exit_process()
def process_parsed_blastoutput(dbname, blastoutput, opts, orf_read_counts):
    blastparser = BlastOutputTsvParser(dbname, blastoutput, shortenorfid=False)

    hit_counts = {}
    for data in blastparser:
        if isWithinCutoffs(data, opts):
            target = getFunctionName(dbname, data)
            if not target in hit_counts:
                hit_counts[target] = 0
            if data['query'] in orf_read_counts:
                hit_counts[target] += orf_read_counts[data['query']]
            else:
                #print 'query', data['query']
                hit_counts[target] += 1

    filename = opts.outputdir + PATHDELIM + opts.sample_name + "." + dbname
    filename_txt = filename + ".read_counts.txt"
    filename_biom = filename + ".read_counts.biom"
    with open(filename_txt, 'w') as fout:
        fprintf(fout, "# Gene\tCounts\n")
        for name in hit_counts:
            fprintf(fout, "%s\t%d\n", name, hit_counts[name])

    runBIOMCommand(filename_txt, filename_biom, biomExec="biom")
    return len(hit_counts)
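# runBIOMCommand above is assumed to shell out to the standard 'biom'
# command-line tool to convert the tab-separated counts table into BIOM
# format. A rough, hypothetical equivalent (flags per biom-format 2.x):
#
#   import subprocess
#   subprocess.call(["biom", "convert",
#                    "-i", filename_txt, "-o", filename_biom,
#                    "--table-type", "Gene table", "--to-json"])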
def make_sure_map_file_exists(config_settings, dbname, globallogger=None):
    dbmapFile = config_settings['REFDBS'] + PATHDELIM + 'functional' + PATHDELIM + 'formatted' + PATHDELIM + dbname + "-names.txt"
    seqFilePath = config_settings['REFDBS'] + PATHDELIM + 'functional' + PATHDELIM + dbname

    if not doFilesExist([dbmapFile]):
        eprintf("WARNING: Trying to create database map file for %s\n", dbname)
        if globallogger != None:
            globallogger.write("WARNING: Trying to create database map file for %s\n" % (dbname))

        if not doFilesExist([seqFilePath]):
            eprintf("ERROR : You do not even have the raw sequence for Database %s to format!\n", dbname)
            eprintf("      : Make sure you have the file %s\n", seqFilePath)
            if globallogger != None:
                globallogger.write("ERROR \t You do not even have the raw sequence for Database %s to format!\n" % (dbname))
                globallogger.write("Make sure you have the file %s\n" % (seqFilePath))
            exit_process()

        mapfile = open(dbmapFile, 'w')
        seqFile = open(seqFilePath, 'r')
        for line in seqFile:
            if re.match(r'>', line):
                fprintf(mapfile, "%s\n", line.strip())
        seqFile.close()
        mapfile.close()

    return dbmapFile
def write_annotation_for_orf(outputgff_file, candidatedbname, dbname_weight, results_dictionary, orf_dictionary, contig, candidate_orf_pos, orfid, compact_output):
    global errorcode
    try:
        fields = ['source', 'feature', 'start', 'end', 'score', 'strand', 'frame']

        output_line = orf_dictionary[contig][candidate_orf_pos]['seqname']
        #if compact_output:
        #    output_line = ShortenContigId(output_line)

        for field in fields:
            output_line += "\t" + str(orf_dictionary[contig][candidate_orf_pos][field])

        try:
            attributes = "ID=" + ShortenORFId(orf_dictionary[contig][candidate_orf_pos]['id'])
            attributes += ";" + "locus_tag=" + ShortenORFId(orf_dictionary[contig][candidate_orf_pos]['locus_tag'])
        except:
            attributes = "ID=" + orf_dictionary[contig][candidate_orf_pos]['id']
            attributes += ";" + "locus_tag=" + orf_dictionary[contig][candidate_orf_pos]['locus_tag']

        attributes += ";" + "contig_length=" + orf_dictionary[contig][candidate_orf_pos]['contig_length']
        attributes += ";" + "orf_length=" + orf_dictionary[contig][candidate_orf_pos]['orf_length']
        attributes += ";" + "partial=" + orf_dictionary[contig][candidate_orf_pos]['partial']
        attributes += ";" + "sourcedb=" + candidatedbname

        if candidatedbname in results_dictionary:
            attributes += ";" + "annotvalue=" + str(results_dictionary[candidatedbname][orfid]['value'])
            attributes += ";" + "ec=" + str(results_dictionary[candidatedbname][orfid]['ec'])
            attributes += ";" + "product=" + results_dictionary[candidatedbname][orfid]['product']
        else:
            attributes += ";" + "annotvalue=" + str('0')
            attributes += ";" + "ec=" + str('')
            attributes += ";" + "product=" + 'hypothetical protein'

        output_line += '\t' + attributes

        if candidatedbname in results_dictionary:
            fprintf(outputgff_file, "%s\n", output_line)
    except:
        eprintf("ERROR : Failure to annotate in contig %s\n", contig)
        #print orf_dictionary[contig]
        print traceback.print_exc(10)
        insert_error(errorcode)
        exit_process()
def process_rRNA_16S_stats(dbname, rRNA_16S_file, orf_read_rpkgs, opts, shortenorfid=False):
    print "Processing rRNA database : ", dbname
    counter_rRNA = {}
    if not doesFileExist(rRNA_16S_file):
        return
    try:
        taxonomy_file = open(rRNA_16S_file, 'r')
    except IOError:
        eprintf("Cannot read file %s!\n", rRNA_16S_file)
        exit_process()

    tax_lines = taxonomy_file.readlines()
    similarity_pattern = re.compile("similarity")
    evalue_pattern = re.compile("evalue")
    bitscore_pattern = re.compile("bitscore")
    taxonomy_pattern = re.compile("taxonomy")
    headerScanned = False

    seencounter = {}
    for line in tax_lines:
        if headerScanned == False:
            if similarity_pattern.search(line) and evalue_pattern.search(line) and bitscore_pattern.search(line) and taxonomy_pattern.search(line):
                headerScanned = True
            continue

        fields = [x.strip() for x in line.split('\t')]
        # the taxonomy column is fields[6], so at least 7 fields are needed
        # (the original ">= 6" check allowed an IndexError on 6-field rows)
        if len(fields) >= 7:
            if not fields[0] in seencounter:
                seencounter[fields[0]] = 0
            else:
                seencounter[fields[0]] += 1

            _name = fields[0] + "_" + str(seencounter[fields[0]]) + "_rRNA"
            if not fields[6] in counter_rRNA:
                counter_rRNA[fields[6]] = 0.0

            name = ShortenrRNAId(_name)
            if name in orf_read_rpkgs:
                counter_rRNA[fields[6]] += orf_read_rpkgs[name]
            else:
                counter_rRNA[fields[6]] += 0

    taxonomy_file.close()

    with open(opts.outputdir + PATHDELIM + opts.sample_name + "." + dbname + ".read_rpkgs.txt", 'w') as fout:
        fprintf(fout, "# Gene\tCounts\n")
        for name in counter_rRNA:
            fprintf(fout, "%s\t%0.2f\n", name, counter_rRNA[name])

    return len(counter_rRNA)
def main(argv, errorlogger=None, runcommand=None, runstatslogger=None):
    global parser
    options, args = parser.parse_args(argv)

    # is there a pathwaytools executable installed
    if False and not path.exists(options.ptoolsExec):
        eprintf("ERROR\tPathwayTools executable %s not found!\n", options.ptoolsExec)
        if errorlogger:
            errorlogger.printf("ERROR\tPathwayTools executable %s not found!\n", options.ptoolsExec)
        exit_process("ERROR\tPathwayTools executable %s not found!\n" % (options.ptoolsExec))

    # command to build the ePGDB
    command = "%s " % (options.ptoolsExec)
    command += " -api"

    pythonCyc = startPathwayTools(options.sample_name.lower(), options.ptoolsExec, True)
    #resultLines = pythonCyc.getReactionListLines()
    resultLines = pythonCyc.getFlatFiles()
    StopPathwayTools()

    try:
        if False:
            pythonCyc = startPathwayTools(options.sample_name.lower(), options.ptoolsExec, True)
            pythonCyc.setDebug()  # disable pathway debug statements
            printf("INFO\tExtracting the reaction list from ePGDB " + options.sample_name + "\n")
            resultLines = pythonCyc.getReactionListLines()
            #pythonCyc.stopPathwayTools()
            reaction_list_file = open(options.reactions_list + ".tmp", 'w')
            for line in resultLines:
                fprintf(reaction_list_file, "%s\n", line.strip())
            reaction_list_file.close()
            StopPathwayTools()
    except:
        print traceback.print_exc(10)
        eprintf("ERROR\tFailed to run extract pathways for %s : \n" % (options.sample_name))
        eprintf("INFO\tKill any other PathwayTools instance running on the machine and try again")
        if errorlogger:
            errorlogger.write("ERROR\tFailed to run extract pathways for %s : " % (options.sample_name))
            errorlogger.write("INFO\tKill any other PathwayTools instance running on the machine and try again\n")
        StopPathwayTools()
def copy_faa_gff_orf_prediction(source_files, target_files):
    for source, target in zip(source_files, target_files):
        #print source + ' ' + target
        sourcefile = open(source, 'r')
        targetfile = open(target, 'w')
        sourcelines = sourcefile.readlines()
        for line in sourcelines:
            fprintf(targetfile, "%s\n", line.strip())
        sourcefile.close()
        targetfile.close()
def make_sure_map_file_exists(dbmapfile):
    if not doFilesExist([dbmapfile]):
        print 'WARNING: ' + 'Creating the database map file'
        fullRefDbName = re.sub(r'-names.txt', '', dbmapfile)
        mapfile = open(dbmapfile, 'w')
        fullRefDbFile = open(fullRefDbName, 'r')
        for line in fullRefDbFile:
            if re.match(r'>', line):
                fprintf(mapfile, "%s\n", line.strip())
        mapfile.close()
        fullRefDbFile.close()
def write_new_file(lines, output_file):
    print "Fixing file " + output_file
    try:
        outputfile = open(output_file, 'w')
    except IOError:
        # exit on failure; the original fell through and hit a NameError
        # on the undefined outputfile below
        print "ERROR : Cannot open output file " + output_file
        sys.exit(0)

    for line in lines:
        fprintf(outputfile, "%s\n", line)
    outputfile.close()
def createMapFile(seqFilePath, dbMapFile):
    """ Creates the dbMapFile from sequence file seqFilePath """
    try:
        mapfile = open(dbMapFile, 'w')
        seqFile = open(seqFilePath, 'r')
        for line in seqFile:
            if re.match(r'>', line):
                fprintf(mapfile, "%s\n", line.strip())
        seqFile.close()
        mapfile.close()
    except:
        return False
    return True
def __addToStatusList(self, server, J, list_file_name, list_to_add_to):
    parentDir = self.base_output_folder + PATHDELIM + J.S + PATHDELIM + 'blast_results' + PATHDELIM + 'grid'
    list_jobs_stats_file = parentDir + PATHDELIM + list_file_name
    try:
        if not doesFileExist(list_jobs_stats_file):
            self.messagelogger.write("WARNING: Cannot find file \"%s\" for sample \"%s\"!\n" % (list_file_name, J.S))
            self.messagelogger.write("SUCCESS: Created file \"%s\" for sample \"%s\"!\n" % (list_file_name, J.S))
            listfile = open(list_jobs_stats_file, 'w')
            listfile.close()
    except:
        self.messagelogger.write("ERROR: Cannot open job list %s file for sample \"%s\"!\n" % (list_file_name, J.S))
        print "ERROR: Cannot open job list %s file for sample \"%s\"!\n" % (list_file_name, J.S)
        sys.exit(1)

    try:
        listfile = open(list_jobs_stats_file, 'a')
        eventTime = int(time.time())
        fprintf(listfile, "%s\t%s\t%s\t%s\t%s\t%s\n" % (J.S, J.d, J.a, J.m, server, str(eventTime)))
        listfile.close()
    except:
        self.messagelogger.write("ERROR: Cannot open job list %s file for sample \"%s\"!\n" % (list_file_name, J.S))
        print "ERROR: Cannot open job list %s file for sample \"%s\"!\n" % (list_file_name, J.S)
        sys.exit(1)

    if not J.S in list_to_add_to:
        list_to_add_to[J.S] = {}
    if not J.d in list_to_add_to[J.S]:
        list_to_add_to[J.S][J.d] = {}
    if not J.a in list_to_add_to[J.S][J.d]:
        list_to_add_to[J.S][J.d][J.a] = {}
    if not J.m in list_to_add_to[J.S][J.d][J.a]:
        list_to_add_to[J.S][J.d][J.a][J.m] = {}
    list_to_add_to[J.S][J.d][J.a][J.m][server] = eventTime
    return True
def write_16S_tRNA_gene_info(rRNA_dictionary, outputgff_file, tag):
    fields = ['source', 'feature', 'start', 'end', 'score', 'strand', 'frame']
    for rRNA in rRNA_dictionary:
        output_line = rRNA_dictionary[rRNA]['id']
        for field in fields:
            output_line += "\t" + str(rRNA_dictionary[rRNA][field])

        attributes = "ID=" + ShortenORFId(rRNA_dictionary[rRNA]['seqname']) + tag
        attributes += ";" + "locus_tag=" + ShortenORFId(rRNA_dictionary[rRNA]['seqname']) + tag
        attributes += ";" + "orf_length=" + str(rRNA_dictionary[rRNA]['orf_length'])
        attributes += ";" + "contig_length=" + str(rRNA_dictionary[rRNA]['contig_length'])
        attributes += ";" + "ec="
        attributes += ";" + "product=" + rRNA_dictionary[rRNA]['product']
        output_line += '\t' + attributes
        fprintf(outputgff_file, "%s\n", output_line)
def write_16S_tRNA_gene_info(rRNA_dictionary, outputgff_file, tag):
    fields = ['source', 'feature', 'start', 'end', 'score', 'strand', 'frame']
    for rRNA in rRNA_dictionary:
        output_line = rRNA_dictionary[rRNA]['seqname']
        for field in fields:
            output_line += "\t" + str(rRNA_dictionary[rRNA][field])

        attributes = "ID=" + rRNA_dictionary[rRNA]['seqname'] + tag
        attributes += ";" + "locus_tag=" + rRNA_dictionary[rRNA]['seqname'] + tag
        attributes += ";" + "orf_length=" + str(rRNA_dictionary[rRNA]['orf_length'])
        attributes += ";" + "contig_length=" + str(rRNA_dictionary[rRNA]['contig_length'])
        attributes += ";" + "ec="
        attributes += ";" + "product=" + rRNA_dictionary[rRNA]['product']
        output_line += '\t' + attributes
        fprintf(outputgff_file, "%s\n", output_line)
def create_blast_splits(self, target_folder, blocks_list_filename, maxSize=500, maxBytes=40000000):
    blockno = 0
    currblocksize = 0
    currblockbyteSize = 0

    fastareader = FastaReader(self.fastaFile)
    # Read sequences from sorted sequence file and write them to block files
    try:
        blocklistfile = open(blocks_list_filename, 'w')
    except:
        print "ERROR: Cannot open " + blocks_list_filename
        sys.exit(0)

    sample_name = 'split'
    fragments = []
    for name in fastareader:
        fragments.append(fastareader.seqname)
        fragments.append(fastareader.sequence)

        if currblocksize >= maxSize - 1 or currblockbyteSize >= maxBytes:
            #TODO adjust the 000 to match the format
            blockfile = open(target_folder + PATHDELIM + sample_name + '.000' + str(blockno) + '.fasta', 'w')
            fprintf(blockfile, "%s", '\n'.join(fragments))
            fragments = []
            blockfile.close()
            # Add this block name to the blocklistfile
            #TODO adjust the 000 to match the format
            fprintf(blocklistfile, "%s\n", sample_name + ".000" + str(blockno))
            blockno += 1
            currblocksize = 0
            currblockbyteSize = 0
        else:
            currblocksize += 1
            currblockbyteSize += len(fastareader.sequence)

    if fragments:
        #TODO adjust the 000 to match the format
        blockfile = open(target_folder + PATHDELIM + sample_name + '.000' + str(blockno) + '.fasta', 'w')
        fprintf(blockfile, "%s", '\n'.join(fragments))
        blockfile.close()
        fragments = []
        # Add this block name to the blocklistfile
        #TODO adjust the 000 to match the format
        fprintf(blocklistfile, "%s\n", sample_name + ".000" + str(blockno))
        blockno += 1

    blocklistfile.close()
    currblocksize = 0
    currblockbyteSize = 0
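# The repeated "TODO adjust the 000 to match the format" notes above flag
# that the hard-coded '.000' + str(blockno) prefix loses its fixed width once
# blockno reaches double digits. A minimal sketch of a fixed-width
# alternative (the helper name and width are assumptions, not part of the
# original module):
def block_basename(sample_name, blockno, width=4):
    # e.g. 'split.0000', 'split.0001', ...; keeps lexicographic order
    # consistent with numeric order for any block count below 10**width
    return "%s.%0*d" % (sample_name, width, blockno)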
def add_refscore_to_file(blast_table_out, refscore_file, allNames):
    infile = open(blast_table_out, 'r')
    refscores = {}
    lines = infile.readlines()
    for line in lines:
        line = line.rstrip()
        fields = line.split('\t')
        if len(fields) != 12:
            eprintf("ERROR: Error in line \n%s\n of the blastout file %s" % (line, blast_table_out))
            exit_process("ERROR: Error in line \n%s\n of the blastout file %s" % (line, blast_table_out))
        # record the self-hit bit score as the reference score; this step was
        # missing in the original body, which left refscores empty, and is
        # assumed to mirror add_blast_refscore_to_file below
        if fields[0].rstrip() == fields[1].rstrip():
            refscores[fields[0]] = fields[11]

    for key, value in refscores.iteritems():
        allNames[key] = True
        fprintf(refscore_file, "%s\t%s\n", key, value)
    infile.close()
def write_annotation_for_orf(outputgff_file, candidatedbname, dbname_weight, results_dictionary, contig, candidate_orf_pos, orfid, compact_output):
    try:
        fields = ['source', 'feature', 'start', 'end', 'score', 'strand', 'frame']
        _values = ["M", "CDS", "0", "30", "100.0", "+", "0"]
        values = {}
        for key, value in zip(fields, _values):
            values[key] = value

        output_line = contig
        for field in fields:
            output_line += "\t" + values[field]

        attributes = "ID=" + orfid
        attributes += ";" + "locus_tag=" + orfid
        attributes += ";" + "contig_length=" + str(100)
        attributes += ";" + "orf_length=" + str(30)
        attributes += ";" + "partial=" + "00"
        attributes += ";" + "sourcedb=" + candidatedbname
        attributes += ";" + "annotvalue=" + str(results_dictionary[candidatedbname][orfid]['value'])
        attributes += ";" + "ec=" + str(results_dictionary[candidatedbname][orfid]['ec'])
        attributes += ";" + "product=" + results_dictionary[candidatedbname][orfid]['product']

        output_line += '\t' + attributes
        if candidatedbname in results_dictionary:
            fprintf(outputgff_file, "%s\n", output_line)
    except:
        eprintf("ERROR : Failure to annotate in contig %s\n", contig)
        print traceback.print_exc(10)
        exit_process()
def add_blast_refscore_to_file(blast_table_out, refscore_file, allNames):
    infile = open(blast_table_out, 'r')
    refscores = {}
    lines = infile.readlines()
    for line in lines:
        line = line.rstrip()
        fields = line.split('\t')
        if len(fields) != 12:
            print('Error in the blastout file')
            sys.exit(1)
        # a self-hit (query == subject) gives the maximum attainable bit
        # score for the query, used later to compute the bit score ratio
        if fields[0].rstrip() == fields[1].rstrip():
            refscores[fields[0]] = fields[11]

    for key, value in refscores.items():
        allNames[key] = True
        fprintf(refscore_file, "%s\t%s\n", key, value)
    infile.close()
def process_gbk_file(input_gbk, output_gbk, headers, gff_dictionary):
    tag = re.sub(r'[.]gbk', '', input_gbk)
    tag = re.sub(r'.*/', '', tag)

    output_gbk_file = open(output_gbk, 'w')
    serializer = genbank.GenBankRecordSerializer()
    with open(input_gbk, 'r') as genbank_file:
        out_list = []
        count = 0
        for record in genbank.GenBankRecordParser(genbank_file.read()):
            count += 1
            record.locus = tag + str(count)
            if count % 1000 == 0:
                print('Count = ' + str(count))

            if headers and 'REFERENCES' in headers:
                record.references_ = headers['REFERENCES']

            i = 0
            for feature in record.features:
                if feature.type == "CDS":
                    if feature.locus_tag in gff_dictionary:
                        record.features[i].product = 'aaaaa ' + gff_dictionary[feature.locus_tag]['product']
                i += 1

            #record.locus = "hello"
            out_list.append(serializer.serialize(record))

            if count % 1000 == 0:
                output_str = '\n'.join(out_list)
                out_list = []
                fprintf(output_gbk_file, '%s\n', output_str)

        output_str = '\n'.join(out_list)
        fprintf(output_gbk_file, '%s\n', output_str)
    output_gbk_file.close()
def print_counts_at_level(hierarchical_map, field_to_description, depth, level, outputfile, printKey=True, header=None): if type(hierarchical_map) is type(0): return hierarchical_map if header: fprintf(outputfile, "%s\n", header) count = 0 for key in hierarchical_map: tempcount = print_counts_at_level(hierarchical_map[key], field_to_description, depth + 1, level, outputfile, printKey=printKey) if depth == level: if key in field_to_description: if printKey: fprintf( outputfile, "%s\n", key + '\t' + field_to_description[key] + '\t' + str(tempcount)) else: fprintf(outputfile, "%s\n", field_to_description[key] + '\t' + str(tempcount)) else: if printKey: fprintf(outputfile, "%s\n", key + '\t' + ' ' + '\t' + str(tempcount)) else: fprintf(outputfile, "%s\n", key + '\t' + str(tempcount)) count += tempcount return count
def print_counts_at_level(hierarchical_map, field_to_description, depth, level, outputfile, printKey=True, header=None): if type(hierarchical_map) is type(0): return hierarchical_map if header: fprintf(outputfile, "%s\n",header ) count = 0 for key in hierarchical_map: tempcount = print_counts_at_level(hierarchical_map[key],field_to_description, depth+1, level, outputfile, printKey=printKey) if depth==level: if key in field_to_description: if printKey: fprintf(outputfile, "%s\n", key + '\t' + field_to_description[key] + '\t' + str(tempcount) ) else: fprintf(outputfile, "%s\n", field_to_description[key] + '\t' + str(tempcount) ) else: if printKey: fprintf(outputfile, "%s\n", key + '\t' + ' ' + '\t' + str(tempcount)) else: fprintf(outputfile, "%s\n", key + '\t' + str(tempcount)) count+=tempcount return count
def add_last_refscore_to_file(blast_table_out, refscore_file, allNames):
    commentPATTERN = re.compile(r'^#')

    infile = open(blast_table_out, 'r')
    refscores = {}
    lines = infile.readlines()
    for line in lines:
        if commentPATTERN.match(line):
            continue
        line = line.rstrip()
        fields = line.split('\t')
        if len(fields) != 12:
            print('Error in the blastout file')
            sys.exit(1)
        # in LAST tabular output the alignment score is in column 1, the
        # reference name in column 2, and the query name in column 7; a
        # self-hit (query == reference) therefore gives the reference score
        if fields[6].rstrip() == fields[1].rstrip():
            refscores[fields[1]] = fields[0]

    for key, value in refscores.items():
        allNames[key] = True
        fprintf(refscore_file, "%s\t%s\n", key, value)
    infile.close()
def create_gff_faa(tempfile, gfffile, faafile):
    patt = re.compile(r'>(.*)_(\d+)_(\d+)_([+-])')
    idpatt = re.compile(r'.*_(\d+_\d+)')

    with open(gfffile, 'w') as gffout:
        with open(faafile, 'w') as faaout:
            fastareader = FastaReader(tempfile)
            for fasta in fastareader:
                res = patt.search(fasta.name)
                if res:
                    #nameprint(res.group(1), res.group(2), res.group(3), res.group(4))
                    orfname = res.group(1)
                    start = res.group(2)
                    end = res.group(3)
                    strand = res.group(4)

                    res = idpatt.search(orfname)
                    id = ''
                    if res:
                        id = res.group(1)

                    attr = "ID=" + id + ";partial=00"
                    fields = [orfname, 'FGS+', 'CDS', start, end, '0', strand, "0", attr]
                    fprintf(faaout, '>' + orfname + "\n" + fasta.sequence + "\n")
                    fprintf(gffout, '\t'.join(fields) + '\n')
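# The two patterns above expect FragGeneScan-style headers. A worked,
# hypothetical example of what they extract:
#
#   >sample_0_12_55_394_+
#   patt   -> orfname='sample_0_12', start='55', end='394', strand='+'
#   idpatt -> id='0_12'  (the trailing "<contig>_<orf>" pair of orfname)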
def writeParsedLines(fieldmapHeaderline, parsedLines, list, names, outputfilename):
    try:
        outputfile = open(outputfilename, 'w')
    except IOError:
        print "ERROR: Cannot create sequence file : " + outputfilename
        sys.exit(0)

    outputStr = fieldmapHeaderline + "\n"
    fprintf(outputfile, "%s", outputStr)

    # buffer roughly 1000 lines at a time before flushing to disk
    outputStr = ""
    i = 0
    for item in list:
        outputStr += parsedLines[item[0]] + '\n'
        if i % 1000 == 0 and i > 0:
            fprintf(outputfile, "%s", outputStr)
            outputStr = ""
        i += 1

    if len(outputStr) > 0:
        fprintf(outputfile, "%s", outputStr)
    outputfile.close()
def process_blastoutput(dbname, blastoutput, mapfile, refscore_file, opts, errorlogger=None):
    blastparser = BlastOutputParser(dbname, blastoutput, mapfile, refscore_file, opts, errorlogger=errorlogger)
    blastparser.setMaxErrorsLimit(100)
    blastparser.setErrorAndWarningLogger(errorlogger)
    blastparser.setSTEP_NAME('PARSE BLAST')

    fields = ['target', 'q_length', 'bitscore', 'bsr', 'expect', 'aln_length', 'identity', 'ec']
    if opts.taxonomy:
        fields.append('taxonomy')
    fields.append('product')

    output_blastoutput_parsed = opts.parsed_output
    # a temporary file is used to deal with incomplete processing of the file
    output_blastoutput_parsed_tmp = output_blastoutput_parsed + ".tmp"
    try:
        outputfile = open(output_blastoutput_parsed_tmp, 'w')
    except:
        if errorlogger:
            errorlogger.write("PARSE_BLAST\tERROR\tCannot open temp file %s to sort\tfor reference db %s\n" % (output_blastoutput_parsed_tmp, dbname))
        exit_process("PARSE_BLAST\tERROR\tCannot open temp file %s to sort\tfor reference db %s\n" % (output_blastoutput_parsed_tmp, dbname))

    # write the headers out
    fprintf(outputfile, "#%s", 'query')
    for field in fields:
        fprintf(outputfile, "\t%s", field)
    fprintf(outputfile, "\n")

    pattern = re.compile(r'(\d+_\d+)$')

    count = 0
    uniques = {}
    for data in blastparser:
        if not data:
            continue
        try:
            fprintf(outputfile, "%s", data['query'])
            result = pattern.search(data['query'])
            if result:
                name = result.group(1)
                uniques[name] = True
        except:
            print 'data is : ', data, '\n'
            return count, len(uniques)

        for field in fields:
            fprintf(outputfile, "\t%s", data[field])
        fprintf(outputfile, "\n")
        count += 1

    outputfile.close()
    rename(output_blastoutput_parsed_tmp, output_blastoutput_parsed)
    return count, len(uniques)
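# A minimal sketch of reading the parsed table back in (an assumption: this
# helper is not part of the original module; the column names are exactly
# those written by process_blastoutput, with '#' prefixed to 'query'):
def read_parsed_blastoutput(path):
    rows = []
    with open(path) as fin:
        header = fin.readline().lstrip('#').rstrip('\n').split('\t')
        for line in fin:
            rows.append(dict(zip(header, line.rstrip('\n').split('\t'))))
    return rows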
def main(argv, errorlogger=None, runstatslogger=None):
    global parser
    (opts, args) = parser.parse_args(argv)
    if not valid_arguments(opts, args):
        print usage
        sys.exit(0)

    min_length = 0
    #inputfile = open(opts.input_fasta,'r')
    outfile = open(opts.output_fasta, 'w')
    outfilefna = open(opts.output_fna, 'w')
    outfilefaa = open(opts.output_faa, 'w')
    outfilegff = open(opts.output_gff, 'w')
    logfile = open(opts.log_file, 'w')
    lengthsfile = open(opts.lengths_file, 'w')

    if opts.map_file:
        mapfile = open(opts.map_file, 'w')
    else:
        mapfile = None

    sample_name = opts.input_fasta
    sample_name = re.sub(r'^.*/', '', sample_name, re.I)
    sample_name = re.sub(r'^.*\\', '', sample_name, re.I)
    sample_name = re.sub(r'\.fasta$', '', sample_name, re.I)
    sample_name = re.sub(r'\.fna$', '', sample_name, re.I)
    sample_name = re.sub(r'\.faa$', '', sample_name, re.I)
    sample_name = re.sub(r'\.fas$', '', sample_name, re.I)
    sample_name = re.sub(r'\.fa$', '', sample_name, re.I)

    BEFORE = 'BEFORE'
    AFTER = 'AFTER'
    NUMSEQ = "#INFO\tNumber of sequences :"
    NUMSEQ_SHORTER = "@INFO\tNumber of sequences shorter than minimum length of sequences"
    AVG_LENGTH = "@INFO\tAverage length of sequences:"
    MIN_LENGTH = "@INFO\tMinimum length of sequences:"
    MAX_LENGTH = "@INFO\tMaximum length of sequences:"

    _MAX = 1000000000000
    stats = {
        MIN_LENGTH: {'BEFORE': _MAX, 'AFTER': _MAX},
        MAX_LENGTH: {'BEFORE': 0, 'AFTER': 0},
        NUMSEQ: {'BEFORE': 0, 'AFTER': 0},
        NUMSEQ_SHORTER: {'BEFORE': 0, 'AFTER': 0},
        AVG_LENGTH: {'BEFORE': 0, 'AFTER': 0},
    }

    length_distribution = {}
    length_cumulative_distribution = {}
    for i in range(0, 31):
        length_distribution[i] = 0
        length_cumulative_distribution[i] = 0

    seq_count = 0
    allNames = dict()
    outputStr = ""
    outputLines = []
    fastareader = FastaReader(opts.input_fasta)

    """ process one fasta sequence at a time """
    lengths_str = ""
    for record in fastareader:
        seqname = record.name
        seq = record.sequence
        length = len(seq)

        index = int(len(seq) / 50)
        if index >= 30:
            index = 30
        length_distribution[index] += 1

        if length < stats[MIN_LENGTH][BEFORE]:
            stats[MIN_LENGTH][BEFORE] = length
        if length > stats[MAX_LENGTH][BEFORE]:
            stats[MAX_LENGTH][BEFORE] = length
        if length < min_length:   # was "length < MIN_LENGTH", comparing against the label string
            stats[NUMSEQ_SHORTER][BEFORE] += 1
        stats[AVG_LENGTH][BEFORE] = stats[AVG_LENGTH][BEFORE] + length

        seqvalue = filter_sequence(seq)
        stats[NUMSEQ][BEFORE] += 1

        seqlen = len(seqvalue)
        if seqlen >= min_length:
            if len(lengths_str) > 100:
                fprintf(lengthsfile, "%s\n", lengths_str)
                lengths_str = str(seqlen)
            else:
                lengths_str += '\t' + str(seqlen)

            stats[NUMSEQ][AFTER] += 1
            stats[AVG_LENGTH][AFTER] = stats[AVG_LENGTH][AFTER] + seqlen
            if mapfile == None:
                fprintf(outfile, "%s\n", seqname)
            else:
                contigID = sample_name + '_' + str(seq_count)
                orfID = sample_name + '_' + str(seq_count) + "_0"
                fprintf(outfile, ">%s\n", contigID)
                fprintf(outfilefna, ">%s\n", orfID)
                fprintf(outfilefaa, ">%s\n", orfID)

                gffString = sample_name + '_' + str(seq_count)
                gffString += "\t" + "AMINO_ACID_SEQ"
                gffString += "\t" + "CDS"
                gffString += "\t" + "0"
                gffString += "\t" + str(3 * seqlen)
                gffString += "\t" + "0"
                gffString += "\t" + "+"
                gffString += "\t" + "0"
                gffString += "\t" + "ID=" + orfID + ";"
                gffString += "locus_tag=" + orfID + ";"
                gffString += "partial=00;"
                gffString += "orf_length=" + str(seqlen) + ";"
                gffString += "contig_length=" + str(3 * seqlen)
                fprintf(outfilegff, "%s\n", gffString)

                key = re.sub(r'^>', '', seqname)
                fprintf(mapfile, "%s\n", sample_name + '_' + str(seq_count) + '\t' + key + '\t' + str(seqlen))
                seq_count += 1

            fprintf(outfile, "%s\n", "DUMMY CONTIGS FOR AMINO ACID SEQUENCES")
            fprintf(outfilefna, "%s\n", "DUMMY ORFS FOR AMINO ACID SEQUENCES")
            fprintf(outfilefaa, "%s\n", seqvalue)

            if seqlen < stats[MIN_LENGTH][AFTER]:
                stats[MIN_LENGTH][AFTER] = seqlen
            if seqlen > stats[MAX_LENGTH][AFTER]:
                stats[MAX_LENGTH][AFTER] = seqlen

    print 'done'
    fprintf(lengthsfile, "%s\n", lengths_str)

    if stats[NUMSEQ][BEFORE] > 0:
        stats[AVG_LENGTH][BEFORE] = stats[AVG_LENGTH][BEFORE] / stats[NUMSEQ][BEFORE]
    else:
        stats[AVG_LENGTH][BEFORE] = 0
    if stats[NUMSEQ][AFTER] > 0:
        stats[AVG_LENGTH][AFTER] = stats[AVG_LENGTH][AFTER] / stats[NUMSEQ][AFTER]
    else:
        stats[AVG_LENGTH][AFTER] = 0

    lengthsfile.close()
    outfile.close()
    outfilefna.close()
    outfilefaa.close()
    outfilegff.close()
    #inputfile.close()
    if mapfile != None:
        mapfile.close()

    """ min length """
    if stats[MIN_LENGTH][BEFORE] == _MAX:
        stats[MIN_LENGTH][BEFORE] = 0
    if stats[MIN_LENGTH][AFTER] == _MAX:
        stats[MIN_LENGTH][AFTER] = 0

    fprintf(logfile, "@INFO\tBEFORE\tAFTER\n")
    fprintf(logfile, "%s\n", NUMSEQ + '\t' + str(stats[NUMSEQ][BEFORE]) + '\t' + str(stats[NUMSEQ][AFTER]))
    fprintf(logfile, "%s\n", NUMSEQ_SHORTER + '\t' + str(stats[NUMSEQ_SHORTER][BEFORE]) + '\t' + str(stats[NUMSEQ_SHORTER][AFTER]))
    fprintf(logfile, "%s\n", AVG_LENGTH + '\t' + str(stats[AVG_LENGTH][BEFORE]) + '\t' + str(stats[AVG_LENGTH][AFTER]))
    fprintf(logfile, "%s\n", MIN_LENGTH + '\t' + str(stats[MIN_LENGTH][BEFORE]) + '\t' + str(stats[MIN_LENGTH][AFTER]))
    fprintf(logfile, "%s\n", MAX_LENGTH + '\t' + str(stats[MAX_LENGTH][BEFORE]) + '\t' + str(stats[MAX_LENGTH][AFTER]))

    fprintf(logfile, "@INFO\tLOW\tHIGH\tFREQUENCY\tCUMULATIVE_FREQUENCY\n")
    # fprintf(logfile, "# ---\t-----\t--------\t---------\t----------\n")
    i = 30
    length_cumulative_distribution[i] = length_distribution[i]  # seed the tail bin (the original self-assignment left it at 0)
    i -= 1
    while i >= 0:
        length_cumulative_distribution[i] = length_cumulative_distribution[i + 1] + length_distribution[i]
        i -= 1
    for i in range(0, 31):
        fprintf(logfile, " %s\n", str(i * 50) + '\t' + str((i + 1) * 50) + '\t' +
                str(length_distribution[i]) + '\t' + str(length_cumulative_distribution[i]))
    logfile.close()

    seqtype = 'amino'
    """ priority is used to sort the output to print in the right order """
    priority = 2000
    if runstatslogger != None:
        runstatslogger.write("%s\tSequences BEFORE Filtering (%s)\t%s\n" % (str(priority), seqtype, str(stats[NUMSEQ][BEFORE])))
        runstatslogger.write("%s\tmin length\t%s\n" % (str(priority + 1), str(stats[MIN_LENGTH][BEFORE])))
        runstatslogger.write("%s\tavg length\t%s\n" % (str(priority + 2), str(int(stats[AVG_LENGTH][BEFORE]))))
        runstatslogger.write("%s\tmax length\t%s\n" % (str(priority + 3), str(stats[MAX_LENGTH][BEFORE])))
        runstatslogger.write("%s\ttot length\t%s\n" % (str(priority + 4), str(int(stats[AVG_LENGTH][BEFORE] * stats[NUMSEQ][BEFORE]))))
        runstatslogger.write("%s\tSequences AFTER Filtering (%s)\t%s\n" % (str(priority + 5), seqtype, str(stats[NUMSEQ][AFTER])))
        runstatslogger.write("%s\tmin length\t%s\n" % (str(priority + 6), str(stats[MIN_LENGTH][AFTER])))
        runstatslogger.write("%s\tavg length\t%s\n" % (str(priority + 7), str(int(stats[AVG_LENGTH][AFTER]))))
        runstatslogger.write("%s\tmax length\t%s\n" % (str(priority + 8), str(stats[MAX_LENGTH][AFTER])))
        runstatslogger.write("%s\ttot length\t%s\n" % (str(priority + 9), str(int(stats[AVG_LENGTH][AFTER] * stats[NUMSEQ][AFTER]))))
def create_annotation(dbname_weight, results_dictionary, input_gff, rRNA_16S_stats_files, tRNA_stats_files, output_gff, output_comparative_annotation, contig_lengths, compact_output=False):
    orf_dictionary = {}
    # process_gff_file(input_gff, orf_dictionary)
    gffreader = GffFileParser(input_gff)

    output_gff_tmp = output_gff + ".tmp"
    outputgff_file = open(output_gff_tmp, 'w')
    output_comp_annot_file1 = open(output_comparative_annotation + '.1.txt', 'w')
    output_comp_annot_file2 = open(output_comparative_annotation + '.2.txt', 'w')

    output_comp_annot_file1_Str = 'orf_id\tref dbname\tEC\tproduct\tvalue'
    fprintf(output_comp_annot_file1, '%s\n', output_comp_annot_file1_Str)

    output_comp_annot_file2_Str = 'orf_id'
    dbnames = dbname_weight.keys()
    for dbname in dbnames:
        weight = dbname_weight[dbname]
        output_comp_annot_file2_Str += '\t{0}(EC) \t{0}(product)\t{0}(value)'.format(dbname)
    fprintf(output_comp_annot_file2, '%s\n', output_comp_annot_file2_Str)

    # gffreader = GffReader(input_gff)
    i = 0
    for contig in gffreader:
        count = 0
        for orf in gffreader.orf_dictionary[contig]:
            value = 0.0001
            success = False
            output_comp_annot_file1_Str = ''
            output_comp_annot_file2_Str = ''
            for dbname in dbnames:
                weight = dbname_weight[dbname]
                value = 0
                orf_id = orf['id']
                if orf_id in results_dictionary[dbname]:
                    if value < results_dictionary[dbname][orf_id]['value']:
                        value = results_dictionary[dbname][orf_id]['value']
                        candidatedbname = dbname
                        success = True
                        candidate_orf_pos = count

                    if output_comp_annot_file1_Str:
                        output_comp_annot_file1_Str += '{0}\t{1}\t{2}\t{3}\t{4}\n'.format('', dbname,
                            results_dictionary[dbname][orf['id']]['ec'],
                            results_dictionary[dbname][orf['id']]['product'],
                            str(results_dictionary[dbname][orf['id']]['value'] * float(weight)))
                    else:
                        output_comp_annot_file1_Str += '{0}\t{1}\t{2}\t{3}\t{4}\n'.format(orf_id, dbname,
                            results_dictionary[dbname][orf['id']]['ec'],
                            results_dictionary[dbname][orf['id']]['product'],
                            str(results_dictionary[dbname][orf['id']]['value'] * float(weight)))

                    if output_comp_annot_file2_Str:
                        output_comp_annot_file2_Str += '\t{0}\t{1}\t{2}'.format(
                            results_dictionary[dbname][orf['id']]['ec'],
                            results_dictionary[dbname][orf['id']]['product'],
                            str(results_dictionary[dbname][orf['id']]['value'] * float(weight)))
                    else:
                        output_comp_annot_file2_Str += '{0}\t{1}\t{2}\t{3}'.format(orf_id,
                            results_dictionary[dbname][orf['id']]['ec'],
                            results_dictionary[dbname][orf['id']]['product'],
                            str(results_dictionary[dbname][orf['id']]['value'] * float(weight)))
                else:
                    if not output_comp_annot_file1_Str:
                        output_comp_annot_file1_Str += '{0}\t{1}\t{2}\t{3}\t{4}\n'.format(orf_id, '', '', '', '')
                    if output_comp_annot_file2_Str:
                        output_comp_annot_file2_Str += '\t{0}\t{1}\t{2}'.format('', '', '')
                    else:
                        output_comp_annot_file2_Str += '{0}\t{1}\t{2}\t{3}'.format(orf_id, '', '', '')

            if success:  # there was a database hit
                fprintf(output_comp_annot_file1, '%s\n', output_comp_annot_file1_Str)
                fprintf(output_comp_annot_file2, '%s\n', output_comp_annot_file2_Str)
                write_annotation_for_orf(outputgff_file, candidatedbname, dbname_weight, results_dictionary,
                                         gffreader.orf_dictionary, contig, candidate_orf_pos, orf_id,
                                         compact_output=compact_output)
            else:  # no database hit: annotate as a hypothetical protein
                #print gffreader.orf_dictionary
                write_annotation_for_orf(outputgff_file, 'None', '0', results_dictionary,
                                         gffreader.orf_dictionary, contig, count, orf_id,
                                         compact_output=compact_output)

            count += 1  # move to the next orf
        #del orf_dictionary[contig]

    output_comp_annot_file1.close()
    output_comp_annot_file2.close()

    # now deal with the rRNA sequences if there is an rRNA stats file
    if len(rRNA_16S_stats_files) > 0 and contig_lengths:
        rRNA_16S_dictionary = {}
        for rRNA_16S_stats_file in rRNA_16S_stats_files:
            process_rRNA_16S_stats(rRNA_16S_stats_file, rRNA_16S_dictionary)

        rRNA_dictionary = {}
        add_16S_genes(rRNA_16S_dictionary, rRNA_dictionary, contig_lengths)
        write_16S_tRNA_gene_info(rRNA_dictionary, outputgff_file, '_rRNA')

    # now deal with the tRNA sequences if there is a tRNA stats file
    if len(tRNA_stats_files) > 0 and contig_lengths:
        tRNA_dictionary = {}
        for tRNA_stats_file in tRNA_stats_files:
            process_tRNA_stats(tRNA_stats_file, tRNA_dictionary)

        tRNA_gff_dictionary = {}
        add_tRNA_genes(tRNA_dictionary, tRNA_gff_dictionary, contig_lengths)
        write_16S_tRNA_gene_info(tRNA_gff_dictionary, outputgff_file, '_tRNA')
        #print tRNA_dictionary

    outputgff_file.close()
    rename(output_gff_tmp, output_gff)
def formatDB(tools, db, refdbspath, seqType, dbType, algorithm, configs, logger=None):
    """ Formats the sequences for the specified algorithm """
    EXECUTABLES_DIR = configs['METAPATHWAYS_PATH'] + PATHDELIM + configs['EXECUTABLES_DIR']
    formatdb_executable = EXECUTABLES_DIR + PATHDELIM + tools['FUNC_SEARCH']['exec']['BLAST']['FORMATDB_EXECUTABLE']

    if seqType == 'nucl':
        if algorithm == 'LAST':
            formatdb_executable = EXECUTABLES_DIR + PATHDELIM + tools['FUNC_SEARCH']['exec']['LAST']['LASTDB_EXECUTABLE']
        if algorithm == 'BLAST':
            formatdb_executable = EXECUTABLES_DIR + PATHDELIM + tools['FUNC_SEARCH']['exec']['BLAST']['FORMATDB_EXECUTABLE']

    if seqType == 'prot':
        if algorithm == 'LAST':
            formatdb_executable = EXECUTABLES_DIR + PATHDELIM + tools['FUNC_SEARCH']['exec']['LAST']['LASTDB_EXECUTABLE']
        if algorithm == 'BLAST':
            formatdb_executable = EXECUTABLES_DIR + PATHDELIM + tools['FUNC_SEARCH']['exec']['BLAST']['FORMATDB_EXECUTABLE']

    formatted_db = refdbspath + PATHDELIM + dbType + PATHDELIM + 'formatted' + PATHDELIM + db
    raw_sequence_file = refdbspath + PATHDELIM + dbType + PATHDELIM + db
    _temp_formatted_db = formatted_db + "__temp__"

    """ format with 4GB file size """
    cmd = ""
    if algorithm == 'BLAST':
        cmd = '%s -dbtype %s -max_file_sz 4294967296 -in %s -out %s' % (formatdb_executable, seqType, raw_sequence_file, _temp_formatted_db)
        #cmd='%s -dbtype %s -max_file_sz 20267296 -in %s -out %s' %(formatdb_executable, seqType, raw_sequence_file, _temp_formatted_db)

    if algorithm == 'LAST':
        # dirname = os.path.dirname(raw_sequence_file)
        cmd = ""
        if seqType == "prot":
            cmd = '%s -s 4000M -p -c %s %s' % (formatdb_executable, _temp_formatted_db, raw_sequence_file)
        if seqType == "nucl":
            cmd = '%s -s 4000M -c %s %s' % (formatdb_executable, _temp_formatted_db, raw_sequence_file)

    eprintf("INFO\tCommand to format \"%s\"\n", cmd)
    logger.printf("INFO\tCommand to format \"%s\"\n", cmd)
    result = getstatusoutput(cmd)

    temp_fileList = glob(_temp_formatted_db + '*')
    _formatted_db_pal = _temp_formatted_db + ".pal"
    if algorithm == 'BLAST' and path.exists(_formatted_db_pal):
        try:
            formatted_db_pal = formatted_db + ".pal"
            if seqType == "nucl":
                formatted_db_pal = formatted_db + ".nal"

            _openpal = open(_formatted_db_pal, 'r')
            openpal = open(formatted_db_pal, 'w')
            lines = _openpal.readlines()
            tempPATT = re.compile(r'__temp__')
            for line in lines:
                _result = tempPATT.search(line)
                modline = line.strip()
                if _result:
                    modline = re.sub('__temp__', '', modline)
                fprintf(openpal, "%s\n", modline)
            openpal.close()
            _openpal.close()
            remove(_formatted_db_pal)
        except:
            return False

    try:
        temp_fileList = glob(_temp_formatted_db + '*')
        for tempFile in temp_fileList:
            file = re.sub('__temp__', '', tempFile)
            rename(tempFile, file)
    except:
        return False

    if result[0] == 0:
        eprintf("INFO\tFormatted database %s successfully for %s\n", sQuote(db), sQuote(algorithm))
        logger.printf("INFO\tFormatted database %s successfully for %s\n", sQuote(db), sQuote(algorithm))
        return True
    else:
        eprintf("INFO\tFailed to Format database %s for %s\n", sQuote(db), sQuote(algorithm))
        eprintf("INFO\tReason for failure %s\n", result[1])
        logger.printf("INFO\tReason for failure %s\n", result[1])
        logger.printf("INFO\tFailed to Format database %s for %s\n", sQuote(db), sQuote(algorithm))
        return False
def create_annotation(results_dictionary, dbname, annotated_gff, output_dir, Taxons, orfsPicked, orfToContig, lca):
    meganTree = None
    #lca.set_results_dictionary(results_dictionary)
    if not path.exists(output_dir):
        makedirs(output_dir)

    orf_dictionary = {}
    #process_gff_file(annotated_gff, orf_dictionary)
    gffreader = GffFileParser(annotated_gff)

    output_table_file = open(output_dir + '/functional_and_taxonomic_table.txt', 'a')

    count = 0
    for contig in gffreader:
        for orf in gffreader.orf_dictionary[contig]:
            shortORFId = getShortORFId(orf['id'])
            if shortORFId not in orfsPicked:
                continue

            orfToContig[shortORFId] = contig

            taxonomy = None
            #_results = re.search(r'refseq', opts_global.database_name, re.I)
            if shortORFId in Taxons:
                taxonomy1 = Taxons[shortORFId]
                taxonomy_id = lca.get_supported_taxon(taxonomy1, return_id=True)
                # print taxonomy_id
                preferred_taxonomy = lca.get_preferred_taxonomy(taxonomy_id)
                if preferred_taxonomy:
                    taxonomy = preferred_taxonomy
                else:
                    taxonomy = Taxons[shortORFId]
            else:
                taxonomy = 'root'

            # product = re.sub(r'\[{1,2}.+?\]{1,2}','', orf['product']).strip()
            product = re.sub(r'\[[^\[]+?\]', '', orf['product']).strip()

            fprintf(output_table_file, "%s", orf['id'])
            fprintf(output_table_file, "\t%s", orf['orf_length'])
            fprintf(output_table_file, "\t%s", orf['start'])
            fprintf(output_table_file, "\t%s", orf['end'])
            fprintf(output_table_file, "\t%s", orf['seqname'])
            fprintf(output_table_file, "\t%s", orf['contig_length'])
            fprintf(output_table_file, "\t%s", orf['strand'])
            fprintf(output_table_file, "\t%s", orf['ec'])
            fprintf(output_table_file, "\t%s", taxonomy)
            fprintf(output_table_file, "\t%s\n", product)

            # adding taxons to the megan tree
            #if meganTree and taxonomy != '':
            #    meganTree.insertTaxon(taxonomy)

    output_table_file.close()
def add_job_to_list_jobs(self, J, listfile):
    fprintf(listfile, "%s\t%s\t%s\t%s\n" % (J.S, J.d, J.a, self.getAlgorithm(J.S)))
    return True
def main(argv, errorlogger=None, runstatslogger=None):
    global parser
    global errorcode
    (opts, args) = parser.parse_args(argv)
    if not valid_arguments(opts, args):
        print usage
        sys.exit(0)

    min_length = opts.min_length
    outfile = open(opts.output_fasta + '.tmp', 'w')
    logfile = open(opts.log_file, 'w')
    lengthsfile = open(opts.lengths_file + '.tmp', 'w')

    if opts.map_file:
        mapfile = open(opts.map_file, 'w')
    else:
        mapfile = None

    if opts.seqtype == 'nucleotide':
        errorcode = 1
    else:
        errorcode = 3

    # strip the directory and the fasta extension to get the sample name
    # BUG FIX: re.I was being passed positionally as the `count` argument of
    # re.sub; it must be passed as `flags`
    sample_name = opts.input_fasta
    sample_name = re.sub(r'^.*/', '', sample_name, flags=re.I)
    sample_name = re.sub(r'^.*\\', '', sample_name, flags=re.I)
    sample_name = re.sub(r'\.fasta$', '', sample_name, flags=re.I)
    sample_name = re.sub(r'\.fna$', '', sample_name, flags=re.I)
    sample_name = re.sub(r'\.faa$', '', sample_name, flags=re.I)
    sample_name = re.sub(r'\.fas$', '', sample_name, flags=re.I)
    sample_name = re.sub(r'\.fa$', '', sample_name, flags=re.I)

    BEFORE = 'BEFORE'
    AFTER = 'AFTER'
    NUMSEQ = "#INFO\tNumber of sequences :"
    NUMSEQ_SHORTER = "@INFO\tNumber of sequences shorter than minimum length of sequences"
    AVG_LENGTH = "@INFO\tAverage length of sequences:"
    MIN_LENGTH = "@INFO\tMinimum length of sequences:"
    MAX_LENGTH = "@INFO\tMaximum length of sequences:"

    _MAX = 1000000000000

    stats = {
        MIN_LENGTH: {'BEFORE': _MAX, 'AFTER': _MAX},
        MAX_LENGTH: {'BEFORE': 0, 'AFTER': 0},
        NUMSEQ: {'BEFORE': 0, 'AFTER': 0},
        NUMSEQ_SHORTER: {'BEFORE': 0, 'AFTER': 0},
        AVG_LENGTH: {'BEFORE': 0, 'AFTER': 0},
    }

    # 50-bp length bins 0..30; the last bin collects everything of length >= 1500
    length_distribution = {}
    length_cumulative_distribution = {}
    for i in range(0, 31):
        length_distribution[i] = 0
        length_cumulative_distribution[i] = 0

    seq_count = 0
    fastareader = FastaReader(opts.input_fasta)

    # process one fasta sequence at a time
    lengths_str = ""
    for record in fastareader:
        seqname = record.name
        seq = record.sequence
        length = len(seq)

        index = int(len(seq) / 50)
        if index >= 30:
            index = 30
        length_distribution[index] += 1

        if length < stats[MIN_LENGTH][BEFORE]:
            stats[MIN_LENGTH][BEFORE] = length
        if length > stats[MAX_LENGTH][BEFORE]:
            stats[MAX_LENGTH][BEFORE] = length
        # BUG FIX: the original compared against the label string MIN_LENGTH,
        # which in Python 2 is always true for an int; compare against the
        # numeric cutoff instead
        if length < min_length:
            stats[NUMSEQ_SHORTER][BEFORE] += 1

        stats[AVG_LENGTH][BEFORE] = stats[AVG_LENGTH][BEFORE] + length

        # the filtering step is intentionally disabled; sequences are only
        # upper-cased
        # seqvalue = filter_sequence(seq)
        seqvalue = seq.upper()

        stats[NUMSEQ][BEFORE] += 1

        seqlen = len(seqvalue)
        if seqlen >= min_length:
            if len(lengths_str) > 100:
                fprintf(lengthsfile, "%s\n", lengths_str)
                lengths_str = str(seqlen)
            else:
                lengths_str += '\t' + str(seqlen)

            stats[NUMSEQ][AFTER] += 1
            stats[AVG_LENGTH][AFTER] = stats[AVG_LENGTH][AFTER] + seqlen
            if mapfile == None:
                fprintf(outfile, "%s\n", seqname)
            else:
                fprintf(outfile, ">%s\n", sample_name + '_' + str(seq_count))
                key = re.sub(r'^>', '', seqname)
                fprintf(mapfile, "%s\n", sample_name + '_' + str(seq_count) + '\t' + key + '\t' + str(seqlen))
                seq_count += 1
            fprintf(outfile, "%s\n", seqvalue)

            if seqlen < stats[MIN_LENGTH][AFTER]:
                stats[MIN_LENGTH][AFTER] = seqlen
            if seqlen > stats[MAX_LENGTH][AFTER]:
                stats[MAX_LENGTH][AFTER] = seqlen

    fprintf(lengthsfile, "%s\n", lengths_str)

    if stats[NUMSEQ][BEFORE] > 0:
        stats[AVG_LENGTH][BEFORE] = stats[AVG_LENGTH][BEFORE] / stats[NUMSEQ][BEFORE]
    else:
        stats[AVG_LENGTH][BEFORE] = 0

    if stats[NUMSEQ][AFTER] > 0:
        stats[AVG_LENGTH][AFTER] = stats[AVG_LENGTH][AFTER] / stats[NUMSEQ][AFTER]
    else:
        stats[AVG_LENGTH][AFTER] = 0

    lengthsfile.close()
    outfile.close()
    rename(opts.output_fasta + ".tmp", opts.output_fasta)
    rename(opts.lengths_file + ".tmp", opts.lengths_file)

    if mapfile != None:
        mapfile.close()

    # if no sequence was seen, reset the min-length sentinels to 0
    if stats[MIN_LENGTH][BEFORE] == _MAX:
        stats[MIN_LENGTH][BEFORE] = 0
    if stats[MIN_LENGTH][AFTER] == _MAX:
        stats[MIN_LENGTH][AFTER] = 0

    fprintf(logfile, "@INFO\tBEFORE\tAFTER\n")
    fprintf(logfile, "%s\n", NUMSEQ + '\t' + str(stats[NUMSEQ][BEFORE]) + '\t' + str(stats[NUMSEQ][AFTER]))
    fprintf(logfile, "%s\n", NUMSEQ_SHORTER + '\t' + str(stats[NUMSEQ_SHORTER][BEFORE]) + '\t' + str(stats[NUMSEQ_SHORTER][AFTER]))
    fprintf(logfile, "%s\n", AVG_LENGTH + '\t' + str(stats[AVG_LENGTH][BEFORE]) + '\t' + str(stats[AVG_LENGTH][AFTER]))
    fprintf(logfile, "%s\n", MIN_LENGTH + '\t' + str(stats[MIN_LENGTH][BEFORE]) + '\t' + str(stats[MIN_LENGTH][AFTER]))
    fprintf(logfile, "%s\n", MAX_LENGTH + '\t' + str(stats[MAX_LENGTH][BEFORE]) + '\t' + str(stats[MAX_LENGTH][AFTER]))

    fprintf(logfile, "@INFO\tLOW\tHIGH\tFREQUENCY\tCUMULATIVE_FREQUENCY\n")

    # fold the histogram into a cumulative distribution, from the top bin down
    # BUG FIX: the top bin was assigned to itself (leaving it 0); it must be
    # seeded from the histogram
    i = 30
    length_cumulative_distribution[i] = length_distribution[i]
    i -= 1
    while i >= 0:
        length_cumulative_distribution[i] = length_cumulative_distribution[i + 1] + length_distribution[i]
        i -= 1

    for i in range(0, 31):
        fprintf(logfile, " %s\n", str(i * 50) + '\t' + str((i + 1) * 50) + '\t' +
                str(length_distribution[i]) + '\t' + str(length_cumulative_distribution[i]))
    logfile.close()

    if opts.seqtype == 'nucleotide':
        priority = 1000
    else:
        priority = 2000

    if runstatslogger != None:
        if opts.seqtype == 'nucleotide':
            runstatslogger.write("%s\tNumber of sequences in input file BEFORE QC (%s)\t%s\n" % (str(priority), opts.seqtype, str(stats[NUMSEQ][BEFORE])))
            runstatslogger.write("%s\t-min length\t%s\n" % (str(priority + 1), str(stats[MIN_LENGTH][BEFORE])))
            runstatslogger.write("%s\t-avg length\t%s\n" % (str(priority + 2), str(int(stats[AVG_LENGTH][BEFORE]))))
            runstatslogger.write("%s\t-max length\t%s\n" % (str(priority + 3), str(stats[MAX_LENGTH][BEFORE])))
            runstatslogger.write("%s\t-total base pairs (bp)\t%s\n" % (str(priority + 4), str(int(stats[AVG_LENGTH][BEFORE] * stats[NUMSEQ][BEFORE]))))
            runstatslogger.write("%s\tNumber of sequences AFTER QC (%s)\t%s\n" % (str(priority + 5), opts.seqtype, str(stats[NUMSEQ][AFTER])))
            runstatslogger.write("%s\t-min length\t%s\n" % (str(priority + 6), str(stats[MIN_LENGTH][AFTER])))
            runstatslogger.write("%s\t-avg length\t%s\n" % (str(priority + 7), str(int(stats[AVG_LENGTH][AFTER]))))
            runstatslogger.write("%s\t-max length\t%s\n" % (str(priority + 8), str(stats[MAX_LENGTH][AFTER])))
            runstatslogger.write("%s\t-total base pairs (bp)\t%s\n" % (str(priority + 9), str(int(stats[AVG_LENGTH][AFTER] * stats[NUMSEQ][AFTER]))))
        else:
            runstatslogger.write("%s\tNumber of translated ORFs BEFORE QC (%s)\t%s\n" % (str(priority), opts.seqtype, str(stats[NUMSEQ][BEFORE])))
            runstatslogger.write("%s\t-min length\t%s\n" % (str(priority + 1), str(stats[MIN_LENGTH][BEFORE])))
            runstatslogger.write("%s\t-avg length\t%s\n" % (str(priority + 2), str(int(stats[AVG_LENGTH][BEFORE]))))
            runstatslogger.write("%s\t-max length\t%s\n" % (str(priority + 3), str(stats[MAX_LENGTH][BEFORE])))
            runstatslogger.write("%s\t-total base pairs (bp)\t%s\n" % (str(priority + 4), str(int(stats[AVG_LENGTH][BEFORE] * stats[NUMSEQ][BEFORE]))))
            runstatslogger.write("%s\tNumber of translated ORFs AFTER QC (%s)\t%s\n" % (str(priority + 5), opts.seqtype, str(stats[NUMSEQ][AFTER])))
            runstatslogger.write("%s\t-min length\t%s\n" % (str(priority + 6), str(stats[MIN_LENGTH][AFTER])))
            runstatslogger.write("%s\t-avg length\t%s\n" % (str(priority + 7), str(int(stats[AVG_LENGTH][AFTER]))))
            runstatslogger.write("%s\t-max length\t%s\n" % (str(priority + 8), str(stats[MAX_LENGTH][AFTER])))
            runstatslogger.write("%s\t-total base pairs (bp)\t%s\n" % (str(priority + 9), str(int(stats[AVG_LENGTH][AFTER] * stats[NUMSEQ][AFTER]))))
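# Sketch of how the 50-bp length histogram above folds into the cumulative
# distribution written to the log, top bin down. A standalone illustration of
# the same computation, not part of the pipeline:
def _cumulative_from_bins(length_distribution):
    # expects a dict with integer keys 0..30, as built in main() above
    cumulative = {30: length_distribution[30]}
    for i in range(29, -1, -1):
        cumulative[i] = cumulative[i + 1] + length_distribution[i]
    return cumulative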
def main(argv, errorlogger=None):
    global parser
    (opts, args) = parser.parse_args(argv)
    if not valid_arguments(opts, args):
        print usage
        sys.exit(0)

    sample_name = opts.sample_name
    folder_path = opts.folder_path
    results = []

    try:
        STEP_NAME = "GATHER_STATS"

        # read the nucleotide sequence stats
        status = get_stats_from_stats_file(sample_name, folder_path, 'nuc')
        if status != None:
            results += status
        else:
            errorlogger.write("%s\tERROR\tCannot read nuc stats file\t%s" % (STEP_NAME, folder_path + PATHDELIM + sample_name))
            exit_process()

        # read the amino acid sequence stats
        status = get_stats_from_stats_file(sample_name, folder_path, 'amino')
        if status != None:
            results += status
        else:
            errorlogger.write("%s\tERROR\tCannot read amino stats file\t%s" % (STEP_NAME, folder_path + PATHDELIM + sample_name))
            exit_process()

        # read the BLAST/LAST hits
        status = get_BLAST_LAST_hits(sample_name, folder_path)
        if status != None:
            results += status
        else:
            errorlogger.write("%s\tERROR\tReading BLAST HITS\t%s" % (STEP_NAME, folder_path + PATHDELIM + sample_name))
            exit_process()

        # read the selected parsed BLAST/LAST hits
        status = get_BLAST_LAST_parsed_hits(sample_name, folder_path)
        if status != None:
            results += status
        else:
            errorlogger.write("%s\tERROR\tReading parsed BLAST HITS\t%s" % (STEP_NAME, folder_path + PATHDELIM + sample_name))
            exit_process()

        # read the annotated gff hits
        status = get_annotation_hits(sample_name, folder_path)
        if status != None:
            results += status

        # read the functional and taxonomic hits
        status = get_functional_taxonomic_hits(sample_name, folder_path)
        if status != None:
            results += status

        # read the number of ORFs that are mapped to functional categories
        status = get_ORF_annotations_hits(sample_name, folder_path)
        if status != None:
            results += status

        # get the rRNA hits
        status = get_rRNA_hits(sample_name, folder_path)
        if status != None:
            results += status

        # get the tRNA hits
        status = get_tRNA_hits(sample_name, folder_path)
        if status != None:
            results += status

        stats_file_name = folder_path + PATHDELIM + 'run_statistics' + PATHDELIM + sample_name + '.run.stats.txt'
        try:
            statsfile = open(stats_file_name, 'w')
        except:
            print "ERROR: Cannot open stats file " + stats_file_name
            sys.exit(0)

        for pair in results:
            fprintf(statsfile, '%s\t%s\n', pair[0], pair[1])
        statsfile.close()
    except:
        exit_process()
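# The run-stats file written above is a two-column, tab-separated list of
# (statistic, value) pairs. A minimal reader sketch for that format follows;
# read_run_stats is a hypothetical helper, not part of the pipeline:
def read_run_stats(stats_file_name):
    pairs = []
    with open(stats_file_name) as statsfile:
        for line in statsfile:
            fields = line.rstrip('\n').split('\t')
            if len(fields) == 2:
                pairs.append((fields[0], fields[1]))
    return pairs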
def create_annotation(dbname_weight, results_dictionary, input_gff, rRNA_16S_stats_files, tRNA_stats_files,
                      output_gff, output_comparative_annotation, contig_lengths, compact_output=False):
    orf_dictionary = {}
    gffreader = GffFileParser(input_gff)

    output_gff_tmp = output_gff + ".tmp"
    outputgff_file = open(output_gff_tmp, 'w')
    output_comp_annot_file1 = open(output_comparative_annotation + '.1.txt', 'w')
    output_comp_annot_file2 = open(output_comparative_annotation + '.2.txt', 'w')

    output_comp_annot_file1_Str = 'orf_id\tref dbname\tEC\tproduct\tvalue'
    fprintf(output_comp_annot_file1, '%s\n', output_comp_annot_file1_Str)

    output_comp_annot_file2_Str = 'orf_id'
    dbnames = dbname_weight.keys()
    for dbname in dbnames:
        weight = dbname_weight[dbname]
        output_comp_annot_file2_Str += '\t{0}(EC)\t{0}(product)\t{0}(value)'.format(dbname)
    fprintf(output_comp_annot_file2, '%s\n', output_comp_annot_file2_Str)

    for contig in gffreader:
        count = 0
        for orf in gffreader.orf_dictionary[contig]:
            success = False
            output_comp_annot_file1_Str = ''
            output_comp_annot_file2_Str = ''
            for dbname in dbnames:
                weight = dbname_weight[dbname]
                # `value` is reset for every database, so the last database in
                # iteration order with a hit becomes the candidate annotation
                value = 0
                orf_id = orf['id']
                if orf_id in results_dictionary[dbname]:
                    if value < results_dictionary[dbname][orf_id]['value']:
                        value = results_dictionary[dbname][orf_id]['value']
                        candidatedbname = dbname
                        success = True
                        candidate_orf_pos = count

                    if output_comp_annot_file1_Str:
                        output_comp_annot_file1_Str += '{0}\t{1}\t{2}\t{3}\t{4}\n'.format(
                            '', dbname,
                            results_dictionary[dbname][orf_id]['ec'],
                            results_dictionary[dbname][orf_id]['product'],
                            str(results_dictionary[dbname][orf_id]['value'] * float(weight)))
                    else:
                        output_comp_annot_file1_Str += '{0}\t{1}\t{2}\t{3}\t{4}\n'.format(
                            orf_id, dbname,
                            results_dictionary[dbname][orf_id]['ec'],
                            results_dictionary[dbname][orf_id]['product'],
                            str(results_dictionary[dbname][orf_id]['value'] * float(weight)))

                    if output_comp_annot_file2_Str:
                        output_comp_annot_file2_Str += '\t{0}\t{1}\t{2}'.format(
                            results_dictionary[dbname][orf_id]['ec'],
                            results_dictionary[dbname][orf_id]['product'],
                            str(results_dictionary[dbname][orf_id]['value'] * float(weight)))
                    else:
                        output_comp_annot_file2_Str += '{0}\t{1}\t{2}\t{3}'.format(
                            orf_id,
                            results_dictionary[dbname][orf_id]['ec'],
                            results_dictionary[dbname][orf_id]['product'],
                            str(results_dictionary[dbname][orf_id]['value'] * float(weight)))
                else:
                    if not output_comp_annot_file1_Str:
                        output_comp_annot_file1_Str += '{0}\t{1}\t{2}\t{3}\t{4}\n'.format(orf_id, '', '', '', '')

                    if output_comp_annot_file2_Str:
                        output_comp_annot_file2_Str += '\t{0}\t{1}\t{2}'.format('', '', '')
                    else:
                        # BUG FIX: the original passed five arguments for four
                        # placeholders here
                        output_comp_annot_file2_Str += '{0}\t{1}\t{2}\t{3}'.format(orf_id, '', '', '')

            if success:  # there was a database hit
                fprintf(output_comp_annot_file1, '%s\n', output_comp_annot_file1_Str)
                fprintf(output_comp_annot_file2, '%s\n', output_comp_annot_file2_Str)
                write_annotation_for_orf(outputgff_file, candidatedbname, dbname_weight, results_dictionary,
                                         gffreader.orf_dictionary, contig, candidate_orf_pos, orf_id,
                                         compact_output=compact_output)
            else:  # no hit anywhere: annotate the ORF as a hypothetical protein
                write_annotation_for_orf(outputgff_file, 'None', '0', results_dictionary,
                                         gffreader.orf_dictionary, contig, count, orf_id,
                                         compact_output=compact_output)

            count += 1  # move to the next orf

    output_comp_annot_file1.close()
    output_comp_annot_file2.close()

    # now deal with the rRNA sequences if there is an rRNA stats file
    if len(rRNA_16S_stats_files) > 0 and contig_lengths:
        rRNA_16S_dictionary = {}
        for rRNA_16S_stats_file in rRNA_16S_stats_files:
            process_rRNA_16S_stats(rRNA_16S_stats_file, rRNA_16S_dictionary)
        rRNA_dictionary = {}
        add_16S_genes(rRNA_16S_dictionary, rRNA_dictionary, contig_lengths)
        write_16S_tRNA_gene_info(rRNA_dictionary, outputgff_file, '_rRNA')

    # now deal with the tRNA sequences if there is a tRNA stats file
    if len(tRNA_stats_files) > 0 and contig_lengths:
        tRNA_dictionary = {}
        for tRNA_stats_file in tRNA_stats_files:
            process_tRNA_stats(tRNA_stats_file, tRNA_dictionary)
        tRNA_gff_dictionary = {}
        add_tRNA_genes(tRNA_dictionary, tRNA_gff_dictionary, contig_lengths)
        write_16S_tRNA_gene_info(tRNA_gff_dictionary, outputgff_file, '_tRNA')

    outputgff_file.close()
    rename(output_gff_tmp, output_gff)
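# A hypothetical invocation sketch for create_annotation; the paths, database
# names, and weights below are illustrative only, not taken from any pipeline
# configuration:
#
#   dbname_weight = {'refseq': 1.0, 'kegg': 0.8}
#   create_annotation(dbname_weight, results_dictionary, 'sample.annotated.gff',
#                     ['sample.16S.stats.txt'], ['sample.tRNA.stats.txt'],
#                     'sample.annot.gff', 'sample.comparative_annotation',
#                     contig_lengths, compact_output=False)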
def merge_sorted_parsed_files(dbname, filenames, outputfilename, orfRanks, verbose=False, errorlogger=None):
    readerhandles = []

    if verbose:
        eprintf("Processing for database : %s\n", dbname)

    if len(filenames) == 0:
        eprintf("WARNING : Cannot find any B/LAST output file for database : %s\n", dbname)
        exit_process()

    try:
        for i in range(len(filenames)):
            readerhandles.append(BlastOutputTsvParser(dbname, filenames[i]))
    except (IOError, OSError):
        eprintf("ERROR: Cannot read sequence file : %s\n", filenames[i])
        exit_process()

    # set error and warning parameters
    for readerhandle in readerhandles:
        readerhandle.setMaxErrorsLimit(5)
        readerhandle.setErrorAndWarningLogger(errorlogger)
        readerhandle.setSTEP_NAME('PARSE BLAST')

    try:
        outputfile = open(outputfilename, 'w')
        fieldmapHeaderLine = readerhandles[0].getHeaderLine()
        fprintf(outputfile, "%s\n", fieldmapHeaderLine)
    except (IOError, OSError):
        eprintf("ERROR: Cannot create sequence file : %s\n", outputfilename)
        exit_process()

    # prime the heap with the first line of every input file; if any file is
    # empty the merge gives up early, as in the original logic
    values = []
    for i in range(len(filenames)):
        iterate = iter(readerhandles[i])
        try:
            next(iterate)
            line = readerhandles[i].getProcessedLine()
            fields = [x.strip() for x in line.split('\t')]
            shortORFId = getShortORFId(fields[0])
            values.append((i, orfRanks[shortORFId], line))
        except:
            outputfile.close()
            return

    # k-way merge: repeatedly emit the smallest-ranked line at the heap root,
    # then refill the root from the same input file
    S = len(filenames)
    BuildHeap(S, values)

    while S > 0:
        try:
            iterate = iter(readerhandles[values[0][0]])
            line = readerhandles[values[0][0]].getProcessedLine()
            fprintf(outputfile, "%s\n", line)
            next(iterate)
            line = readerhandles[values[0][0]].getProcessedLine()
            fields = [x.strip() for x in line.split('\t')]
            shortORFId = getShortORFId(fields[0])
            values[0] = (values[0][0], orfRanks[shortORFId], line)
        except:
            # this input file is exhausted: shrink the heap
            values[0] = values[S - 1]
            S = S - 1
        if S > 0:
            Heapify(values, 0, S)

    outputfile.close()
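# merge_sorted_parsed_files above is a heap-based k-way merge keyed on each
# ORF's rank, using the module's BuildHeap/Heapify helpers. For reference,
# a minimal sketch of the same idea with the standard-library heapq, over
# plain iterators that yield (rank, line) pairs; kway_merge is hypothetical,
# not pipeline code:
import heapq

def kway_merge(iterators):
    heap = []
    for i, it in enumerate(iterators):
        try:
            rank, line = next(it)
            # the index i breaks ties so lines themselves are never compared
            heap.append((rank, i, line, it))
        except StopIteration:
            pass
    heapq.heapify(heap)
    while heap:
        rank, i, line, it = heapq.heappop(heap)
        yield line
        try:
            next_rank, next_line = next(it)
            heapq.heappush(heap, (next_rank, i, next_line, it))
        except StopIteration:
            pass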
def main(argv, errorlogger=None, runstatslogger=None):
    global parser
    (opts, args) = parser.parse_args(argv)
    global opts_global
    opts_global = opts
    if not check_arguments(opts, args):
        print usage
        sys.exit(0)

    db_to_map_Maps = {'cog': opts.input_cog_maps, 'seed': opts.input_seed_maps,
                      'kegg': opts.input_kegg_maps, 'cazy': opts.input_cazy_maps}

    results_dictionary = {}
    dbname_weight = {}

    checkOrCreateFolder(opts.output_dir)

    output_table_file = open(opts.output_dir + PATHDELIM + 'functional_and_taxonomic_table.txt', 'w')
    fprintf(output_table_file, "ORF_ID\tORF_length\tstart\tend\tContig_Name\tContig_length\tstrand\tec\ttaxonomy\tproduct\n")
    output_table_file.close()

    listOfOrfs = get_list_of_queries(opts.input_annotated_gff)
    listOfOrfs.sort()

    # create the sorted, parsed B/LAST files, one per database
    for dbname, blastoutput in zip(opts.database_name, opts.input_blastout):
        create_sorted_parse_blast_files(dbname, blastoutput, listOfOrfs, verbose=opts.verbose, errorlogger=errorlogger)

    # process in blocks of size _stride
    lca = LCAComputation(opts.ncbi_taxonomy_map, opts.ncbi_megan_map)
    lca.setParameters(opts.lca_min_score, opts.lca_top_percent, opts.lca_min_support)

    blastParsers = {}
    for dbname, blastoutput in zip(opts.database_name, opts.input_blastout):
        blastParsers[dbname] = BlastOutputTsvParser(dbname, blastoutput + '.tmp')
        blastParsers[dbname].setMaxErrorsLimit(5)
        blastParsers[dbname].setErrorAndWarningLogger(errorlogger)

    # This pass computes the occurrence of each taxon, which is later used to
    # evaluate the min support, as in the MEGAN software
    start = 0
    Length = len(listOfOrfs)
    _stride = 100000
    Taxons = {}
    while start < Length:
        pickorfs = {}
        last = min(Length, start + _stride)
        for i in range(start, last):
            pickorfs[listOfOrfs[i]] = 'root'
        start = last

        results_dictionary = {}
        for dbname, blastoutput in zip(opts.database_name, opts.input_blastout):
            results = re.search(r'refseq', dbname, re.I)
            if results:
                try:
                    results_dictionary[dbname] = {}
                    process_parsed_blastoutput(dbname, blastParsers[dbname], opts, results_dictionary[dbname], pickorfs)
                    lca.set_results_dictionary(results_dictionary)
                    lca.compute_min_support_tree(opts.input_annotated_gff, pickorfs, dbname=dbname)
                    for key, taxon in pickorfs.iteritems():
                        Taxons[key] = taxon
                except:
                    eprintf("ERROR: while training for min support tree %s\n", dbname)
                    import traceback
                    traceback.print_exc()

    blastParsers = {}
    for dbname, blastoutput in zip(opts.database_name, opts.input_blastout):
        blastParsers[dbname] = BlastOutputTsvParser(dbname, blastoutput + '.tmp')

    # This loop determines the actual/final taxonomy of each ORF, taking the
    # min support into consideration
    start = 0
    outputfile = open(opts.output_dir + '/ORF_annotation_table.txt', 'w')

    short_to_long_dbnames = {}
    for dbname in opts.database_name:
        for short_db in ['seed', 'cog', 'kegg', 'cazy']:
            if re.search(r'^' + short_db, dbname, re.IGNORECASE):
                short_to_long_dbnames[short_db] = dbname

    standard_dbs = ['cog', 'seed', 'kegg', 'cazy']
    standard_db_maps = [opts.input_cog_maps, opts.input_seed_maps, opts.input_kegg_maps, opts.input_cazy_maps]
    field_to_description = {}
    hierarchical_map = {}

    for db in standard_dbs:
        if db in short_to_long_dbnames:
            field_to_description[db] = {}
            hierarchical_map[db] = {}

    for dbname in standard_dbs:
        if dbname in short_to_long_dbnames:
            try:
                read_map_file(db_to_map_Maps[dbname], field_to_description[dbname], hierarchical_map[dbname])
            except:
                raise

    while start < Length:
        pickorfs = {}
        last = min(Length, start + _stride)
        for i in range(start, last):
            pickorfs[listOfOrfs[i]] = True
        start = last

        gc.collect()
        eprintf("\nMemory used = %s MB\n", str(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1000000))

        results_dictionary = {}
        for dbname, blastoutput in zip(opts.database_name, opts.input_blastout):
            try:
                results_dictionary[dbname] = {}
                eprintf("Processing database %s...", dbname)
                process_parsed_blastoutput(dbname, blastParsers[dbname], opts, results_dictionary[dbname], pickorfs)
                eprintf("done\n")
            except:
                import traceback
                traceback.print_exc()
                eprintf("ERROR: %s\n", dbname)

        eprintf("Num orfs processed : %s\n", str(start))

        # create the annotations now
        orfToContig = {}
        create_annotation(results_dictionary, opts.database_name, opts.input_annotated_gff,
                          opts.output_dir, Taxons, pickorfs, orfToContig, lca)

        for std_dbname, db_map_filename in zip(standard_dbs, standard_db_maps):
            if std_dbname in short_to_long_dbnames:
                create_table(results_dictionary[short_to_long_dbnames[std_dbname]], std_dbname,
                             opts.output_dir, hierarchical_map, field_to_description)

        print_orf_table(results_dictionary, orfToContig, opts.output_dir, outputfile)

    for std_dbname, db_map_filename in zip(standard_dbs, standard_db_maps):
        if std_dbname in short_to_long_dbnames:
            print_kegg_cog_tables(std_dbname, opts.output_dir, hierarchical_map,
                                  field_to_description, filePermType='w')

    outputfile.close()

    # now remove the temporary files
    for dbname, blastoutput in zip(opts.database_name, opts.input_blastout):
        try:
            remove(blastoutput + '.tmp')
        except:
            pass
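# Sketch of the block-wise ("stride") iteration used twice above: ORFs are
# visited in windows of _stride ids so that only one window's parsed results
# are held in memory at a time. iter_orf_blocks is a hypothetical helper, not
# part of the pipeline:
def iter_orf_blocks(list_of_orfs, stride=100000):
    start = 0
    while start < len(list_of_orfs):
        last = min(len(list_of_orfs), start + stride)
        yield list_of_orfs[start:last]
        start = last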
def main(argv, errorlogger=None, runcommand=None, runstatslogger=None):
    global parser
    options, args = parser.parse_args(argv)

    if not len(options.blast_files):
        parser.error('At least one taxonomic BLAST output is required')

    if runBlastCommandrRNA(runcommand=runcommand) != 0:
        if errorlogger:
            errorlogger.write("ERROR: Failed to BLAST the sequences against database %s : " % (options.tax_databases[0]))
            errorlogger.write(" : " + runcommand)
        exit_process("ERROR: Failed to BLAST the sequences against database %s : " % (options.tax_databases[0]) +
                     " : " + runcommand)

    if not (len(options.tax_databases) == len(options.blast_files)):
        parser.error('Number of taxonomic databases and BLAST outputs should be the same')

    if not options.output:
        parser.error('Output file must be specified')

    # sanity check: make sure the inputs actually exist
    if not files_exist(options.blast_files):
        sys.exit(0)
    if not files_exist(options.tax_databases):
        sys.exit(0)

    params = {
        'length': int(options.length),
        'similarity': float(options.similarity),
        'evalue': float(options.evalue),
        'bitscore': float(options.bitscore)
    }

    # parse each BLAST output against its corresponding taxonomic database
    table = {}
    for x in range(0, len(options.blast_files)):
        table[options.tax_databases[x]] = {}
        process_blastout_file(options.blast_files[x], options.tax_databases[x],
                              table[options.tax_databases[x]], errorlogger=errorlogger)

    priority = 7000
    reads = {}
    for x in range(0, len(options.blast_files)):
        append_taxonomic_information(options.tax_databases[x], table[options.tax_databases[x]], params)
        for key in table[options.tax_databases[x]]:
            if len(table[options.tax_databases[x]][key][6]) > 1:
                reads[key] = True
        dbname = re.sub(r'^.*' + PATHDELIM, '', options.tax_databases[x])
        # BUG FIX: guard the logger, which defaults to None in the signature
        if runstatslogger:
            runstatslogger.write("%s\tTaxonomic hits in %s\t%s\n" % (str(priority), dbname, str(len(reads))))
        priority += 1

    outputfile = open(options.output, 'w')
    fprintf(outputfile, "#Similarity cutoff :\t" + str(params['similarity']) + '\n')
    fprintf(outputfile, "#Length cutoff :\t" + str(params['length']) + '\n')
    fprintf(outputfile, "#Evalue cutoff :\t" + str(params['evalue']) + '\n')
    fprintf(outputfile, "#Bit score cutoff :\t" + str(params['bitscore']) + '\n')
    fprintf(outputfile, "#Number of rRNA sequences detected:\t" + str(len(reads)) + '\n\n')

    # write one group of columns per taxonomic database
    for x in range(0, len(options.tax_databases)):
        fprintf(outputfile, '\t%s\t\t\t', re.sub(r'^.*' + PATHDELIM, '', options.tax_databases[x]))
    fprintf(outputfile, '\n')

    for x in range(0, len(options.blast_files)):
        fprintf(outputfile, '%s\t%s\t%s\t%s\t%s\t%s\t%s', 'sequence', 'start', 'end',
                'similarity', 'evalue', 'bitscore', 'taxonomy')
    fprintf(outputfile, '\n')

    for read in reads:
        fprintf(outputfile, '%s', read)
        for x in range(0, len(options.blast_files)):
            if read in table[options.tax_databases[x]]:
                fprintf(outputfile, '\t%s\t%s\t%s\t%s\t%s\t%s',
                        str(table[options.tax_databases[x]][read][4]),
                        str(table[options.tax_databases[x]][read][5]),
                        str(table[options.tax_databases[x]][read][0]),
                        str(table[options.tax_databases[x]][read][1]),
                        str(table[options.tax_databases[x]][read][2]),
                        str(table[options.tax_databases[x]][read][6]))
            else:
                fprintf(outputfile, '\t-\t-\t-\t-\t-\t-')
        fprintf(outputfile, '\n')
    outputfile.close()

    # collect the hit coordinates of the reads
    database_hits = {}
    for read in reads:
        for x in range(0, len(options.blast_files)):
            if read in table[options.tax_databases[x]]:
                database_hits[read] = [table[options.tax_databases[x]][read][4],
                                       table[options.tax_databases[x]][read][5]]

    # pick the hits, trim them according to the match, and write them out
    if options.fasta:
        selected_sequences = {}
        read_select_fasta_sequences(database_hits, selected_sequences, options.fasta)
        for read in database_hits:
            selected_sequences[read] = selected_sequences[read][database_hits[read][0]:database_hits[read][1]]
        write_selected_sequences(selected_sequences, options.output + '.fasta')
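# Sketch of the final trim step above: each selected sequence is sliced down to
# the [start:end) coordinates recorded for its database hit; this assumes the
# coordinates are 0-based and end-exclusive, as the slice implies.
# trim_to_hits is a hypothetical helper, not part of the pipeline:
def trim_to_hits(selected_sequences, database_hits):
    trimmed = {}
    for read in database_hits:
        start, end = database_hits[read]
        trimmed[read] = selected_sequences[read][start:end]
    return trimmed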
def process_blastoutput(dbname, blastoutput, mapfile, refscore_file, opts, errorlogger=None):
    blastparser = BlastOutputParser(dbname, blastoutput, mapfile, refscore_file, opts, errorlogger=errorlogger)
    blastparser.setMaxErrorsLimit(100)
    blastparser.setErrorAndWarningLogger(errorlogger)
    blastparser.setSTEP_NAME('PARSE BLAST')

    fields = ['target', 'q_length', 'bitscore', 'bsr', 'expect', 'aln_length', 'identity', 'ec']
    if opts.taxonomy:
        fields.append('taxonomy')
    fields.append('product')

    output_blastoutput_parsed = blastoutput + '.parsed.txt'
    # a temporary file is used to guard against incomplete processing of the file
    output_blastoutput_parsed_tmp = output_blastoutput_parsed + ".tmp"
    try:
        outputfile = open(output_blastoutput_parsed_tmp, 'w')
    except:
        # BUG FIX: the original message referenced a misspelled variable and
        # supplied two arguments for one %s placeholder
        message = "PARSE_BLAST\tERROR\tCannot open temp file %s to sort\tfor reference db %s\n" % (output_blastoutput_parsed_tmp, dbname)
        if errorlogger:
            errorlogger.write(message)
        exit_process(message)

    # write the headers out
    fprintf(outputfile, "#%s", 'query')
    for field in fields:
        fprintf(outputfile, "\t%s", field)
    fprintf(outputfile, "\n")

    count = 0
    for data in blastparser:
        if not data:
            continue
        try:
            fprintf(outputfile, "%s", data['query'])
        except:
            print 'data is : ', data, '\n'
            sys.exit()
        for field in fields:
            fprintf(outputfile, "\t%s", data[field])
        fprintf(outputfile, "\n")
        count += 1

    outputfile.close()
    rename(output_blastoutput_parsed_tmp, output_blastoutput_parsed)
    return count
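# process_blastoutput above emits a TSV whose header row is '#query' followed
# by the parsed field names. A minimal reader sketch for that format;
# read_parsed_blastoutput is a hypothetical helper, not part of the pipeline:
def read_parsed_blastoutput(parsed_filename):
    rows = []
    with open(parsed_filename) as parsed_file:
        header = parsed_file.readline().lstrip('#').rstrip('\n').split('\t')
        for line in parsed_file:
            values = line.rstrip('\n').split('\t')
            rows.append(dict(zip(header, values)))
    return rows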