def process_parsed_blastoutput(dbname, blastparser, cutoffs, annotation_results, pickorfs):
    """Collect annotations for picked ORFs from a parsed B/LAST hit stream.

    Iterates ``blastparser`` and, for every hit that passes the cutoffs,
    records an annotation dict under the hit's short ORF id. Iteration stops
    as soon as a hit whose short ORF id is NOT in ``pickorfs`` appears: the
    parser is rewound so that hit can be re-read when the next batch of ORFs
    is processed.

    Args:
        dbname: name of the reference database the hits came from.
        blastparser: iterable parser yielding hit dicts; must support rewind().
        cutoffs: threshold object checked via isWithinCutoffs().
        annotation_results: dict of short ORF id -> list of annotation dicts;
            updated in place.
        pickorfs: collection of short ORF ids to annotate in this batch.

    Returns:
        None. Results accumulate in ``annotation_results``.
    """
    fields = ['target', 'q_length', 'bitscore', 'bsr', 'expect',
              'identity', 'ec', 'query', 'product']
    for data in blastparser:
        # Skip hits that are absent or fail the user-specified cutoffs.
        if data is None or not isWithinCutoffs(data, cutoffs):
            continue
        annotation = {}
        shortORFId = None
        for field in fields:
            if field in data:
                if field == 'query':
                    # Store the short form of the ORF id, not the raw query name.
                    shortORFId = getShortORFId(data[field])
                    annotation[field] = shortORFId
                else:
                    annotation[field] = data[field]
        if shortORFId not in pickorfs:
            # Hit belongs to the next batch: rewind so it is re-read later.
            blastparser.rewind()
            return None
        annotation['dbname'] = dbname
        if shortORFId not in annotation_results:
            annotation_results[shortORFId] = []
        annotation_results[shortORFId].append(annotation)
    return None
def process_parsed_blastoutput(dbname, blastparser, cutoffs, annotation_results, pickorfs):
    """Accumulate per-ORF annotations from a parsed B/LAST stream.

    Each qualifying hit becomes one annotation dict appended under its short
    ORF id in ``annotation_results``. As soon as a hit for an ORF outside
    ``pickorfs`` is seen, the parser is rewound and the function returns, so
    the caller can resume parsing for the next batch of ORFs.
    """
    wanted = ('target', 'q_length', 'bitscore', 'bsr', 'expect',
              'identity', 'ec', 'query', 'product')
    for hit in blastparser:
        # Guard clause: ignore missing hits and hits below the cutoffs.
        if hit == None or not isWithinCutoffs(hit, cutoffs):
            continue
        annot = {}
        short_id = None
        for key in wanted:
            if key not in hit:
                continue
            if key == 'query':
                # The query name is replaced by its short ORF id.
                short_id = getShortORFId(hit[key])
                annot[key] = short_id
            else:
                annot[key] = hit[key]
        if short_id not in pickorfs:
            # Out-of-batch ORF: push the hit back and stop here.
            blastparser.rewind()
            return None
        annot['dbname'] = dbname
        annotation_results.setdefault(short_id, []).append(annot)
    return None
def create_annotation(results_dictionary, dbname, annotated_gff, output_dir, Taxons, orfsPicked, orfToContig, lca):
    """Append one functional/taxonomic table row per picked ORF.

    Reads the annotated GFF file and, for every ORF whose short id is in
    ``orfsPicked``, appends a tab-separated row to
    ``<output_dir>/functional_and_taxonomic_table.txt`` and records the
    ORF -> contig mapping in ``orfToContig`` (both mutated in place).

    Args:
        results_dictionary: parsed hit results (currently unused here;
            kept for interface compatibility with callers).
        dbname: reference database name (currently unused here).
        annotated_gff: path to the annotated GFF file.
        output_dir: output directory; created if missing.
        Taxons: dict of short ORF id -> raw taxonomy string.
        orfsPicked: collection of short ORF ids to include.
        orfToContig: dict updated in place with short ORF id -> contig.
        lca: LCA helper providing get_supported_taxon()/get_preferred_taxonomy().
    """
    if not path.exists(output_dir):
        makedirs(output_dir)
    gffreader = GffFileParser(annotated_gff)
    # Append mode: successive calls (one per database/batch) extend the table.
    output_table_file = open(output_dir + '/functional_and_taxonomic_table.txt', 'a')
    try:
        for contig in gffreader:
            for orf in gffreader.orf_dictionary[contig]:
                shortORFId = getShortORFId(orf['id'])
                if shortORFId not in orfsPicked:
                    continue
                orfToContig[shortORFId] = contig
                if shortORFId in Taxons:
                    # Prefer the supported/preferred taxonomy when the LCA
                    # object can supply one; otherwise keep the raw taxon.
                    taxonomy_id = lca.get_supported_taxon(Taxons[shortORFId], return_id=True)
                    preferred_taxonomy = lca.get_preferred_taxonomy(taxonomy_id)
                    if preferred_taxonomy:
                        taxonomy = preferred_taxonomy
                    else:
                        taxonomy = Taxons[shortORFId]
                else:
                    taxonomy = 'root'
                product = orf['product']  # written verbatim, no bracket stripping
                fprintf(output_table_file, "%s", orf['id'])
                fprintf(output_table_file, "\t%s", orf['orf_length'])
                fprintf(output_table_file, "\t%s", orf['start'])
                fprintf(output_table_file, "\t%s", orf['end'])
                fprintf(output_table_file, "\t%s", orf['seqname'])
                fprintf(output_table_file, "\t%s", orf['contig_length'])
                fprintf(output_table_file, "\t%s", orf['strand'])
                fprintf(output_table_file, "\t%s", orf['ec'])
                fprintf(output_table_file, "\t%s", taxonomy)
                fprintf(output_table_file, "\t%s\n", product)
    finally:
        # BUGFIX: close the table file even if GFF parsing or a taxonomy
        # lookup raises, so partially written rows are flushed to disk.
        output_table_file.close()
def compute_min_support_tree(self, annotate_gff_file, pickorfs, dbname='refseq'):
    """Assign an LCA-based taxonomy to every picked ORF in a GFF file.

    For each ORF listed in ``pickorfs``, hits in
    ``self.results_dictionary[dbname]`` are filtered in two passes: first the
    top bitscore at or above ``self.lca_min_score`` is located, then every hit
    scoring within ``self.lca_top_percent`` percent of that top score
    contributes its species names. The LCA of those species becomes the ORF's
    taxonomy; per-taxon support counters are updated and
    ``pickorfs[shortORFId]`` is overwritten with the taxonomy in place.

    Args:
        annotate_gff_file: path to the annotated GFF file to scan.
        pickorfs: dict of short ORF ids; values are replaced by taxonomies.
        dbname: taxonomic reference database whose hits are used.
    """
    self.tax_dbname = dbname
    gffreader = GffFileParser(annotate_gff_file)
    try:
        for contig in gffreader:
            for orf in gffreader.orf_dictionary[contig]:
                shortORFId = getShortORFId(orf['id'])
                if shortORFId not in pickorfs:
                    continue
                species = []
                if self.tax_dbname in self.results_dictionary:
                    if shortORFId in self.results_dictionary[self.tax_dbname]:
                        hits = self.results_dictionary[self.tax_dbname][shortORFId]
                        # Pass 1: best bitscore that clears the minimum score.
                        top_score = 0
                        for hit in hits:
                            if hit['bitscore'] >= self.lca_min_score and hit['bitscore'] >= top_score:
                                top_score = hit['bitscore']
                        # Pass 2: collect species from hits within
                        # lca_top_percent percent of the top score.
                        for hit in hits:
                            if (100 - self.lca_top_percent) * top_score / 100 < hit['bitscore']:
                                names = self.get_species(hit)
                                if names:
                                    species.append(names)
                # NOTE(review): these three lines are placed at the per-ORF
                # level; with no qualifying hits getTaxonomy([]) presumably
                # falls back to the root taxon — confirm against the LCA class.
                taxonomy = self.getTaxonomy(species)
                self.update_taxon_support_count(taxonomy)
                pickorfs[shortORFId] = taxonomy
    except Exception:
        # BUGFIX: narrowed from a bare "except:" so SystemExit and
        # KeyboardInterrupt are no longer swallowed; print statement made
        # py2/py3 compatible.
        import traceback
        traceback.print_exc()
        print("ERROR : Cannot read annotated gff file ")
def get_list_of_queries(annotated_gff):
    """Return the unique short ORF ids present in an annotated GFF file.

    Args:
        annotated_gff: path to the annotated GFF file.

    Returns:
        A list of unique short ORF ids (order unspecified, as before).
    """
    # A set replaces the original dict-with-dummy-values idiom.
    orf_ids = set()
    gffreader = GffFileParser(annotated_gff)
    for contig in gffreader:
        for orf in gffreader.orf_dictionary[contig]:
            orf_ids.add(getShortORFId(orf['id']))
    return list(orf_ids)
def compute_min_support_tree(self, annotate_gff_file, pickorfs, dbname='refseq'):
    """Compute an LCA taxonomy for each picked ORF and tally taxon support.

    Walks the annotated GFF file; for every ORF present in ``pickorfs`` the
    recorded hits for ``dbname`` are reduced to a species list (top bitscore
    above ``self.lca_min_score``, then all hits within
    ``self.lca_top_percent`` percent of it). The resulting LCA taxonomy is
    stored back into ``pickorfs[shortORFId]`` and counted via
    update_taxon_support_count().

    Args:
        annotate_gff_file: annotated GFF file to read ORFs from.
        pickorfs: dict of short ORF ids, mutated to hold taxonomies.
        dbname: name of the taxonomic database (default 'refseq').
    """
    self.tax_dbname = dbname
    gffreader = GffFileParser(annotate_gff_file)
    try:
        for contig in gffreader:
            for orf in gffreader.orf_dictionary[contig]:
                shortORFId = getShortORFId(orf['id'])
                if shortORFId not in pickorfs:
                    continue
                species = []
                db_results = self.results_dictionary.get(self.tax_dbname, {})
                if shortORFId in db_results:
                    orf_hits = db_results[shortORFId]
                    # Highest bitscore meeting the minimum-score threshold.
                    top_score = 0
                    for hit in orf_hits:
                        if hit['bitscore'] >= self.lca_min_score and hit['bitscore'] >= top_score:
                            top_score = hit['bitscore']
                    # Species from hits close enough to the top score.
                    for hit in orf_hits:
                        if (100 - self.lca_top_percent) * top_score / 100 < hit['bitscore']:
                            names = self.get_species(hit)
                            if names:
                                species.append(names)
                # NOTE(review): per-ORF placement inferred from the flattened
                # source; an empty species list is expected to resolve to the
                # root taxon — confirm against getTaxonomy().
                taxonomy = self.getTaxonomy(species)
                self.update_taxon_support_count(taxonomy)
                pickorfs[shortORFId] = taxonomy
    except Exception:
        # BUGFIX: bare "except:" narrowed; py2-only print replaced with a
        # form valid in both py2 and py3.
        import traceback
        traceback.print_exc()
        print("ERROR : Cannot read annotated gff file ")
def process_parsed_blastoutput(dbname, blastparser, cutoffs, annotation_results, pickorfs):
    """Collect annotations for picked ORFs from a parsed B/LAST stream.

    Unlike the rewind-based variant, this version simply skips hits whose
    short ORF id is not in ``pickorfs`` and keeps reading to the end of the
    stream.

    Args:
        dbname: name of the reference database the hits came from.
        blastparser: iterable parser yielding hit dicts.
        cutoffs: threshold object checked via isWithinCutoffs().
        annotation_results: dict of short ORF id -> list of annotation dicts;
            updated in place.
        pickorfs: collection of short ORF ids to annotate.

    Returns:
        None. Results accumulate in ``annotation_results``.
    """
    fields = ['target', 'q_length', 'bitscore', 'bsr', 'expect',
              'identity', 'ec', 'query', 'product']
    try:
        for data in blastparser:
            if data is None or not isWithinCutoffs(data, cutoffs):
                continue
            annotation = {}
            shortORFId = None
            for field in fields:
                if field in data:
                    if field == 'query':
                        # Replace the raw query name with its short ORF id.
                        shortORFId = getShortORFId(data[field])
                        annotation[field] = shortORFId
                    else:
                        annotation[field] = data[field]
            if shortORFId not in pickorfs:
                continue
            annotation['dbname'] = dbname
            if shortORFId not in annotation_results:
                annotation_results[shortORFId] = []
            annotation_results[shortORFId].append(annotation)
    except Exception:
        # BUGFIX: the original did "print traceback.print_exc()", which
        # prints None (print_exc writes to stderr itself and returns None)
        # and relied on a module-level traceback import that may not exist;
        # also narrowed from a bare "except:".
        import traceback
        traceback.print_exc()
    return None
def merge_sorted_parsed_files(dbname, filenames, outputfilename, orfRanks, verbose=False, errorlogger=None):
    """K-way merge of per-split parsed B/LAST files into one sorted file.

    Each input file is assumed sorted by ORF; a min-heap over the current
    head line of every file (keyed by ``orfRanks``) yields a globally sorted
    merged output prefixed with the field-map header line.

    Args:
        dbname: database name (used in diagnostics and parser construction).
        filenames: list of parsed B/LAST output files to merge.
        outputfilename: path of the merged output file.
        orfRanks: dict mapping short ORF id -> sort rank.
        verbose: if True, print progress via eprintf.
        errorlogger: optional logger attached to each parser.
    """
    readerhandles = []
    if verbose:
        eprintf("Processing database : %s\n", dbname)
    if len(filenames) == 0:
        # BUGFIX: format specifier was the invalid "%"; "%s" actually
        # substitutes the database name into the warning.
        eprintf("WARNING : Cannot find any B/LAST output file for database : %s\n", dbname)
        exit_process()
    try:
        for i in range(len(filenames)):
            readerhandles.append(BlastOutputTsvParser(dbname, filenames[i]))
    except OSError:
        eprintf("ERROR: Cannot read sequence file : %s\n", filenames[i])
        exit_process()
    # Configure error/warning behaviour on every parser.
    for readerhandle in readerhandles:
        readerhandle.setMaxErrorsLimit(5)
        readerhandle.setErrorAndWarningLogger(errorlogger)
        readerhandle.setSTEP_NAME('PARSE BLAST')
    try:
        outputfile = open(outputfilename, 'w')
        fieldmapHeaderLine = readerhandles[0].getHeaderLine()
        fprintf(outputfile, "%s\n", fieldmapHeaderLine)
    except OSError:
        eprintf("ERROR: Cannot create sequence file : %s\n", outputfilename)
        exit_process()
    # Prime the heap with the first processed line of every input file.
    values = []
    for i in range(len(filenames)):
        iterate = iter(readerhandles[i])
        try:
            next(iterate)
            line = readerhandles[i].getProcessedLine()
            fields = [x.strip() for x in line.split('\t')]
            shortORFId = getShortORFId(fields[0])
            values.append((i, orfRanks[shortORFId], line))
        except Exception:
            # An empty input or a missing rank aborts the merge, as before.
            outputfile.close()
            return
    S = len(filenames)
    BuildHeap(S, values)
    # Repeatedly emit the smallest head line, then advance that file;
    # a failure to advance means the file is exhausted and the heap shrinks.
    while S > 0:
        try:
            iterate = iter(readerhandles[values[0][0]])
            line = readerhandles[values[0][0]].getProcessedLine()
            fprintf(outputfile, "%s\n", line)
            next(iterate)
            line = readerhandles[values[0][0]].getProcessedLine()
            fields = [x.strip() for x in line.split('\t')]
            shortORFId = getShortORFId(fields[0])
            values[0] = (values[0][0], orfRanks[shortORFId], line)
        except Exception:
            values[0] = values[S - 1]
            S = S - 1
        if S > 0:
            Heapify(values, 0, S)
    outputfile.close()
def create_annotation(results_dictionary, dbname, annotated_gff, output_dir, Taxons, orfsPicked, orfToContig, lca):
    """Write functional and taxonomic table rows for the picked ORFs.

    Scans the annotated GFF file; every ORF whose short id is in
    ``orfsPicked`` produces one tab-separated row appended to
    ``<output_dir>/functional_and_taxonomic_table.txt``, and its contig is
    recorded in ``orfToContig`` (mutated in place).

    Args:
        results_dictionary: parsed hit results (unused here; retained for
            caller compatibility).
        dbname: reference database name (unused here).
        annotated_gff: path to the annotated GFF file.
        output_dir: output directory; created if missing.
        Taxons: dict of short ORF id -> raw taxonomy string.
        orfsPicked: collection of short ORF ids to include.
        orfToContig: dict updated in place with short ORF id -> contig.
        lca: LCA helper providing get_supported_taxon()/get_preferred_taxonomy().
    """
    if not path.exists(output_dir):
        makedirs(output_dir)
    gffreader = GffFileParser(annotated_gff)
    # BUGFIX: "with" guarantees the table file is closed even when parsing
    # or a taxonomy lookup raises; append mode preserved so successive
    # calls keep extending the table.
    with open(output_dir + '/functional_and_taxonomic_table.txt', 'a') as table:
        for contig in gffreader:
            for orf in gffreader.orf_dictionary[contig]:
                shortORFId = getShortORFId(orf['id'])
                if shortORFId not in orfsPicked:
                    continue
                orfToContig[shortORFId] = contig
                taxonomy = 'root'  # default when no taxon was assigned
                if shortORFId in Taxons:
                    # Use the preferred taxonomy when available, else the
                    # raw taxon recorded for this ORF.
                    taxonomy_id = lca.get_supported_taxon(Taxons[shortORFId], return_id=True)
                    preferred = lca.get_preferred_taxonomy(taxonomy_id)
                    taxonomy = preferred if preferred else Taxons[shortORFId]
                product = orf['product']  # emitted verbatim
                fprintf(table, "%s", orf['id'])
                fprintf(table, "\t%s", orf['orf_length'])
                fprintf(table, "\t%s", orf['start'])
                fprintf(table, "\t%s", orf['end'])
                fprintf(table, "\t%s", orf['seqname'])
                fprintf(table, "\t%s", orf['contig_length'])
                fprintf(table, "\t%s", orf['strand'])
                fprintf(table, "\t%s", orf['ec'])
                fprintf(table, "\t%s", taxonomy)
                fprintf(table, "\t%s\n", product)
def merge_sorted_parsed_files(dbname, filenames, outputfilename, orfRanks, verbose=False, errorlogger=None):
    """Merge several ORF-sorted parsed B/LAST files into one sorted output.

    A heap keyed by ``orfRanks`` over each file's current head line drives a
    classic k-way merge; the header line of the first parser is written
    first, then lines are emitted in global rank order.

    Args:
        dbname: database name used for diagnostics and parser construction.
        filenames: parsed B/LAST output files to merge.
        outputfilename: destination path for the merged file.
        orfRanks: dict mapping short ORF id -> sort rank.
        verbose: if True, report progress via eprintf.
        errorlogger: optional logger attached to each parser.
    """
    readerhandles = []
    if verbose:
        eprintf("Processing for database : %s\n", dbname)
    if len(filenames) == 0:
        # BUGFIX: "%" was not a valid placeholder; "%s" makes the warning
        # actually include the database name.
        eprintf("WARNING : Cannot find any B/LAST output file for database : %s\n", dbname)
        exit_process()
    try:
        for i in range(len(filenames)):
            readerhandles.append(BlastOutputTsvParser(dbname, filenames[i]))
    except OSError:
        eprintf("ERROR: Cannot read sequence file : %s\n", filenames[i])
        exit_process()
    # Attach error limits and loggers to every parser.
    for handle in readerhandles:
        handle.setMaxErrorsLimit(5)
        handle.setErrorAndWarningLogger(errorlogger)
        handle.setSTEP_NAME('PARSE BLAST')
    try:
        outputfile = open(outputfilename, 'w')
        fieldmapHeaderLine = readerhandles[0].getHeaderLine()
        fprintf(outputfile, "%s\n", fieldmapHeaderLine)
    except OSError:
        eprintf("ERROR: Cannot create sequence file : %s\n", outputfilename)
        exit_process()
    # Seed the heap with each file's first processed line.
    values = []
    for i in range(len(filenames)):
        iterate = iter(readerhandles[i])
        try:
            next(iterate)
            line = readerhandles[i].getProcessedLine()
            fields = [x.strip() for x in line.split('\t')]
            shortORFId = getShortORFId(fields[0])
            values.append((i, orfRanks[shortORFId], line))
        except Exception:
            # Empty input or unranked ORF: abort the merge, as before.
            outputfile.close()
            return
    S = len(filenames)
    BuildHeap(S, values)
    # Pop the minimum head line, write it, and refill from the same file;
    # when a file runs dry the heap shrinks by one.
    while S > 0:
        try:
            iterate = iter(readerhandles[values[0][0]])
            line = readerhandles[values[0][0]].getProcessedLine()
            fprintf(outputfile, "%s\n", line)
            next(iterate)
            line = readerhandles[values[0][0]].getProcessedLine()
            fields = [x.strip() for x in line.split('\t')]
            shortORFId = getShortORFId(fields[0])
            values[0] = (values[0][0], orfRanks[shortORFId], line)
        except Exception:
            values[0] = values[S - 1]
            S = S - 1
        if S > 0:
            Heapify(values, 0, S)
    outputfile.close()