def process_parsed_blastoutput(dbname, weight,  blastoutput, cutoffs, annotation_results):
    """Parses a BLAST tabular output file and keeps, for every query, the
    highest-value annotation seen so far in *annotation_results*. Returns the
    number of queries annotated so far (across all calls that share
    *annotation_results*)."""
    blastparser = BlastOutputTsvParser(dbname, blastoutput)

    fields = ['q_length', 'bitscore', 'bsr', 'expect', 'aln_length', 'identity', 'ec']
    if cutoffs.taxonomy:
        fields.append('taxonomy')
    fields.append('product')

    annotation = {}
    for data in blastparser:
        if isWithinCutoffs(data, cutoffs):
            annotation['bsr'] = data['bsr']
            annotation['ec'] = data['ec']
            annotation['product'] = strip_taxonomy(process_product(data['product'], dbname))
            annotation['value'] = compute_annotation_value(annotation) * weight

            if data['query'] not in annotation_results:
                annotation_results[data['query']] = {'value': 0}

            # Keep only the best-scoring annotation per query
            if annotation_results[data['query']]['value'] <= annotation['value']:
                annotation_results[data['query']] = annotation.copy()

    count = len(annotation_results)
    return count
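The heart of the loop above is a keep-the-best reduction keyed by query ID. Below is a minimal, self-contained sketch of that pattern; the rows are invented stand-ins for parsed BLAST hits that already passed the cutoffs, so none of the parser or scoring helpers are needed for the illustration.

# Invented stand-ins for parsed BLAST hits that passed the cutoffs.
rows = [
    {'query': 'orf_1', 'value': 120.0, 'product': 'DNA polymerase I'},
    {'query': 'orf_1', 'value': 95.5, 'product': 'hypothetical protein'},
    {'query': 'orf_2', 'value': 60.0, 'product': 'ABC transporter'},
]

annotation_results = {}
for row in rows:
    if row['query'] not in annotation_results:
        annotation_results[row['query']] = {'value': 0}
    # Overwrite only when the new hit scores at least as well as the stored one
    if annotation_results[row['query']]['value'] <= row['value']:
        annotation_results[row['query']] = row.copy()

print(len(annotation_results))  # -> 2 queries end up annotated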
Example #2
def process_parsed_blastoutput(dbname, weight,  blastoutput, cutoffs, annotation_results):
    """Variant of the function above that passes shortenorfid=False to the
    parser; otherwise it keeps the best-scoring annotation per query in
    *annotation_results* and returns the number of queries annotated so far."""
    blastparser = BlastOutputTsvParser(dbname, blastoutput, shortenorfid=False)

    fields = ['q_length', 'bitscore', 'bsr', 'expect', 'aln_length', 'identity', 'ec']
    if cutoffs.taxonomy:
        fields.append('taxonomy')
    fields.append('product')

    annotation = {}
    for data in blastparser:
        if isWithinCutoffs(data, cutoffs):
            annotation['bsr'] = data['bsr']
            annotation['ec'] = data['ec']
            annotation['product'] = strip_taxonomy(process_product(data['product'], dbname))
            annotation['value'] = compute_annotation_value(annotation) * weight

            if data['query'] not in annotation_results:
                annotation_results[data['query']] = {'value': 0}

            # Keep only the best-scoring annotation per query
            if annotation_results[data['query']]['value'] <= annotation['value']:
                annotation_results[data['query']] = annotation.copy()

    count = len(annotation_results)
    return count
import re  # process_product() below relies on the re module


def process_product(product, database, similarity_threshold=0.9):
    """Trims database-specific decoration (accession codes, organism tags,
    bracketed taxonomy, gene names, KO numbers, and similar) from a single
    *product* string, using a dedicated rule for each supported *database*
    ('cog', 'kegg', 'refseq', 'metacyc', 'seed', 'cazy'); any other database
    is handled generically via strip_taxonomy(). Words containing underscores
    or '>' are filtered out and repeats are removed before the cleaned product
    string is returned. *similarity_threshold* is currently unused.
    """

    processed_product = ''

    # print 'dbase', database
    # COG
    if database == 'cog':
        results = re.search(r'Function: (.+?) #', product)
        if results:
            processed_product = results.group(1)

    # KEGG: split and process

    elif database == 'kegg':
        kegg_products = re.split(r'\s*;\s+', product)
        for kegg_product in kegg_products:
            # Toss out organism:ID pairs, gene names, and KO IDs
            kegg_product = re.sub(r'^lcl[|]', '', kegg_product)
            kegg_product = re.sub(r'[a-z]{3}:\S+', '', kegg_product)
            kegg_product = kegg_product.strip()
            kegg_product = re.sub(r'(, \b[a-z]{3}[A-Z]?\b)+', '', kegg_product)
            kegg_product = re.sub(r'^\b[a-z]{3}[A-Z]?\b', '', kegg_product)
            # Drop the KO number
            kegg_product = re.sub(r'\bK\d{5}\b', '', kegg_product)

            # Also toss out anything between square brackets
            kegg_product = re.sub(r'\[.*\]', '', kegg_product)

            if kegg_product.strip():
                processed_product = kegg_product.strip()

    # RefSeq: split and process

    elif database == 'refseq':
        for subproduct in product.split('; '):
            subproduct = re.sub(r'[a-z]{2,}\|(.+?)\|\S*', '', subproduct)
            subproduct = re.sub(r'\[.+?\]', '', subproduct)
            if subproduct.strip():
                processed_product = subproduct.strip()

    # MetaCyc: split and process

    elif database == 'metacyc':
        # Pull out first name after the accession code:
        product_name = product.split('#')[0].strip()
        product_name = re.sub(r'^[^ ]* ', '', product_name)
        product_name = re.sub(r' OS=.*', '', product_name)
        if product_name:
            processed_product = product_name

    # Seed: split and process

    elif database == 'seed':
        for subproduct in product.split('; '):
            #subproduct = re.sub(r'[a-z]{2,}\|(.+?)\|\S*', '', subproduct)
            subproduct = re.sub(r'\[.+?\]', '', subproduct)
            subproduct = re.sub(r'\(.+?\)', '', subproduct)
            if subproduct.strip():
                processed_product = subproduct.strip()

    elif database == 'cazy':
        for subproduct in product.split('; '):
            #subproduct = re.sub(r'[a-z]{2,}\|(.+?)\|\S*', '', subproduct)
            subproduct = re.sub(r'\[.+?\]', '', subproduct)
            subproduct = re.sub(r'\(.+?\)', '', subproduct)
            if subproduct.strip():
                processed_product = subproduct.strip()
                print(processed_product)

    # Generic
    else:
        processed_product = strip_taxonomy(product)

    words = [x.strip() for x in processed_product.split()]
    filtered_words = []
    underscore_pattern = re.compile("_")
    arrow_pattern = re.compile(">")
    for word in words:
        if not underscore_pattern.search(word) and not arrow_pattern.search(
                word):
            filtered_words.append(word)

    #processed_product = ' '.join(filtered_words)
    # Collapse the filtered words back into a single product string
    processed_product = remove_repeats(filtered_words)
    processed_product = re.sub(';', '', processed_product)

    # can actually be a proper annotation
    # processed_product = re.sub(r'hypothetical protein','', processed_product)

    return processed_product
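To make the COG branch above concrete, here is a small, self-contained illustration of the same regular expression applied to an invented COG-style header (the header text is made up for the example).

import re

# Invented COG-style product header; only the 'Function: ... #' span matters.
cog_header = 'COG0587 Function: DNA polymerase III, alpha subunit # Organism: Escherichia coli'
match = re.search(r'Function: (.+?) #', cog_header)
if match:
    print(match.group(1))  # -> DNA polymerase III, alpha subunit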