def process_parsed_blastoutput(dbname, weight, blastoutput, cutoffs, annotation_results):
    """Fold the best-scoring annotation per query from one parsed BLAST/LAST
    tabular output into *annotation_results*.

    :param dbname: name of the reference database the hits came from.
    :param weight: multiplier applied to each hit's computed annotation value.
    :param blastoutput: path to the parsed tabular BLAST/LAST output file.
    :param cutoffs: threshold object consumed by ``isWithinCutoffs``.
    :param annotation_results: dict mapping query id -> best annotation seen
        so far; updated in place.  A hit replaces the stored entry when its
        weighted value is greater than or equal to the stored value.
    :return: number of distinct queries annotated so far.
    """
    blastparser = BlastOutputTsvParser(dbname, blastoutput)

    for data in blastparser:
        if not isWithinCutoffs(data, cutoffs):
            continue

        # Build a fresh record per hit so stored entries never share state.
        annotation = {
            'bsr': data['bsr'],
            'ec': data['ec'],
            'product': strip_taxonomy(process_product(data['product'], dbname)),
        }
        annotation['value'] = compute_annotation_value(annotation) * weight

        query = data['query']
        if query not in annotation_results:
            annotation_results[query] = {'value': 0}

        # Keep the highest-valued annotation for this query (ties: latest wins,
        # matching the original <= comparison).
        if annotation_results[query]['value'] <= annotation['value']:
            annotation_results[query] = annotation

    return len(annotation_results)
def process_parsed_blastoutput(dbname, weight, blastoutput, cutoffs, annotation_results):
    """Scan a parsed BLAST/LAST tabular output (ORF ids kept unshortened) and
    record, for every query, the annotation with the highest weighted value.

    :param dbname: reference database the hits belong to.
    :param weight: scale factor for each hit's annotation value.
    :param blastoutput: path to the parsed tabular output file.
    :param cutoffs: threshold object passed to ``isWithinCutoffs``.
    :param annotation_results: query id -> best annotation mapping, mutated
        in place; an incoming hit wins when its weighted value is >= the
        currently stored one.
    :return: count of distinct annotated queries accumulated so far.
    """
    # shortenorfid=False: keep the full ORF identifiers from the input file.
    blastparser = BlastOutputTsvParser(dbname, blastoutput, shortenorfid=False)

    for data in blastparser:
        if not isWithinCutoffs(data, cutoffs):
            continue

        # Fresh dict per hit; no shared mutable state between stored entries.
        annotation = {
            'bsr': data['bsr'],
            'ec': data['ec'],
            'product': strip_taxonomy(process_product(data['product'], dbname)),
        }
        annotation['value'] = compute_annotation_value(annotation) * weight

        query = data['query']
        if query not in annotation_results:
            annotation_results[query] = {'value': 0}

        # <= preserves the original tie-breaking: a later equal-valued hit
        # overwrites an earlier one.
        if annotation_results[query]['value'] <= annotation['value']:
            annotation_results[query] = annotation

    return len(annotation_results)
def process_product(product, database, similarity_threshold=0.9):
    """Trim a raw hit description down to a clean product name.

    Database-specific noise (organism:ID pairs, accession codes, KO numbers,
    bracketed taxonomy, parenthesised annotations) is stripped according to
    which reference *database* the hit came from; any unrecognised database
    falls through to a generic taxonomy strip.  Tokens containing '_' or '>'
    are then discarded and repeated words collapsed.

    :param product: raw product/description string from the hit.
    :param database: source database ('cog', 'kegg', 'refseq', 'metacyc',
        'seed', 'cazy', or anything else for the generic path).
    :param similarity_threshold: unused; retained for caller compatibility.
    :return: cleaned product string (possibly empty).
    """
    processed_product = ''

    if database == 'cog':
        # COG: the useful text sits between "Function:" and "#".
        results = re.search(r'Function: (.+?) #', product)
        if results:
            processed_product = results.group(1)

    elif database == 'kegg':
        # KEGG: split on ';', drop organism:ID pairs, gene names, KO ids and
        # bracketed text; keep the last non-empty fragment.
        for kegg_product in re.split(r'\s*;\s+', product):
            kegg_product = re.sub(r'^lcl[|]', '', kegg_product)
            kegg_product = re.sub(r'[a-z]{3}:\S+', '', kegg_product)
            kegg_product = kegg_product.strip()
            kegg_product = re.sub(r'(, \b[a-z]{3}[A-Z]?\b)+', '', kegg_product)
            kegg_product = re.sub(r'^\b[a-z]{3}[A-Z]?\b', '', kegg_product)
            kegg_product = re.sub(r'\bK\d{5}\b', '', kegg_product)
            kegg_product = re.sub(r'\[.*\]', '', kegg_product)
            if kegg_product.strip():
                processed_product = kegg_product.strip()

    elif database == 'refseq':
        # RefSeq: drop accession ids and bracketed organism names; keep the
        # last non-empty fragment.
        for subproduct in product.split('; '):
            subproduct = re.sub(r'[a-z]{2,}\|(.+?)\|\S*', '', subproduct)
            subproduct = re.sub(r'\[.+?\]', '', subproduct)
            if subproduct.strip():
                processed_product = subproduct.strip()

    elif database == 'metacyc':
        # MetaCyc: first name after the accession code, minus any " OS=" tail.
        product_name = product.split('#')[0].strip()
        product_name = re.sub(r'^[^ ]* ', '', product_name)
        product_name = re.sub(r' OS=.*', '', product_name)
        if product_name:
            processed_product = product_name

    elif database in ('seed', 'cazy'):
        # Seed and CAZy share identical cleanup (branches merged; a leftover
        # debug print in the cazy branch was removed): drop bracketed and
        # parenthesised annotations, keep the last non-empty fragment.
        for subproduct in product.split('; '):
            subproduct = re.sub(r'\[.+?\]', '', subproduct)
            subproduct = re.sub(r'\(.+?\)', '', subproduct)
            if subproduct.strip():
                processed_product = subproduct.strip()

    else:
        # Generic fallback: only strip trailing taxonomy.
        processed_product = strip_taxonomy(product)

    # Discard id-looking tokens (containing '_' or '>'), then collapse
    # repeated words and strip stray semicolons.
    words = [x.strip() for x in processed_product.split()]
    filtered_words = [w for w in words if '_' not in w and '>' not in w]
    processed_product = remove_repeats(filtered_words)
    processed_product = re.sub(';', '', processed_product)

    return processed_product
def process_product(product, database, similarity_threshold=0.9):
    """Reduce a raw hit description to a tidy product name.

    Each supported *database* gets its own cleanup: COG keeps the text after
    "Function:", KEGG/RefSeq/Seed/CAZy split on ';' and strip ids, accessions
    and bracketed/parenthesised text, MetaCyc keeps the first name after the
    accession code; anything else is handed to the generic taxonomy strip.
    Finally, tokens containing '_' or '>' are dropped, repeats collapsed and
    semicolons removed.

    :param product: raw product/description string from the hit.
    :param database: source database name; unrecognised names take the
        generic path.
    :param similarity_threshold: unused; kept so existing callers still work.
    :return: cleaned product string (possibly empty).
    """
    processed_product = ''

    if database == 'cog':
        # Useful COG text lies between "Function:" and "#".
        match = re.search(r'Function: (.+?) #', product)
        if match:
            processed_product = match.group(1)

    elif database == 'kegg':
        # Remove organism:ID pairs, gene names, KO numbers and bracketed
        # text from each ';'-separated piece; last non-empty piece wins.
        for piece in re.split(r'\s*;\s+', product):
            piece = re.sub(r'^lcl[|]', '', piece)
            piece = re.sub(r'[a-z]{3}:\S+', '', piece)
            piece = piece.strip()
            piece = re.sub(r'(, \b[a-z]{3}[A-Z]?\b)+', '', piece)
            piece = re.sub(r'^\b[a-z]{3}[A-Z]?\b', '', piece)
            piece = re.sub(r'\bK\d{5}\b', '', piece)
            piece = re.sub(r'\[.*\]', '', piece)
            if piece.strip():
                processed_product = piece.strip()

    elif database == 'refseq':
        # Strip accession ids and bracketed organism names.
        for piece in product.split('; '):
            piece = re.sub(r'[a-z]{2,}\|(.+?)\|\S*', '', piece)
            piece = re.sub(r'\[.+?\]', '', piece)
            if piece.strip():
                processed_product = piece.strip()

    elif database == 'metacyc':
        # First name after the accession code, without any " OS=" suffix.
        name = product.split('#')[0].strip()
        name = re.sub(r'^[^ ]* ', '', name)
        name = re.sub(r' OS=.*', '', name)
        if name:
            processed_product = name

    elif database in ('seed', 'cazy'):
        # Identical handling for both databases (previously two duplicated
        # branches; the cazy copy also carried a stray debug print, removed
        # here): drop bracketed and parenthesised annotations.
        for piece in product.split('; '):
            piece = re.sub(r'\[.+?\]', '', piece)
            piece = re.sub(r'\(.+?\)', '', piece)
            if piece.strip():
                processed_product = piece.strip()

    else:
        # Unknown database: generic taxonomy strip only.
        processed_product = strip_taxonomy(product)

    # Filter out id-like tokens ('_' or '>'), collapse repeated words,
    # and remove semicolons.
    tokens = [t.strip() for t in processed_product.split()]
    kept = [t for t in tokens if '_' not in t and '>' not in t]
    processed_product = remove_repeats(kept)
    processed_product = re.sub(';', '', processed_product)

    return processed_product