def parse_hmm_evidence(log_fh, polypeptides, htab_list, cursor):
    '''
    Reads a list file of HMM evidence and dict of polypeptides, populating each with
    Annotation evidence where appropriate.  Each file in the list can have results
    for multiple queries, but it's assumed that ALL candidate matches for any given
    query are grouped together, best hit first.

    Currently only the top hit for any given query polypeptide is used, and only
    rows whose total score (column 12) reaches the total trusted cutoff (column 17)
    are considered.

    Args:
        log_fh: writable file-like object; receives one INFO line per product name
            assigned and one WARN line per row whose score/cutoff is not numeric.
        polypeptides: mapping of query ID (column 5) to an object exposing an
            ``annotation`` attribute.
        htab_list: path of a list file, read via utils.read_list_file(), naming the
            tab-delimited HMM result files.
        cursor: database cursor passed through to the get_hmmdb_* lookups.

    Raises:
        IndexError: if a non-blank row has fewer columns than the ones read here.
        KeyError: presumably, if a hit names a query ID absent from polypeptides
            (depends on the mapping type supplied by the caller).
    '''
    # PFAM accessions carry a version suffix (PF00001.12); lookups use the bare ID
    pfam_versioned = re.compile(r"^(PF\d+)\.\d+")

    for path in utils.read_list_file(htab_list):
        last_qry_id = None

        # 'with' closes the handle even if a row raises part-way through the file
        with open(path) as htab_fh:
            for line in htab_fh:
                line = line.rstrip()

                # tolerate blank lines rather than failing on the column lookups
                if not line:
                    continue

                cols = line.split("\t")

                ## only consider the row if the total score reaches the total trusted
                #  cutoff.  The comparison must be numeric: as strings, "9.5" sorts
                #  above "100.0".
                try:
                    total_score = float(cols[12])
                    trusted_cutoff = float(cols[17])
                except ValueError:
                    log_fh.write("WARN: skipping HMM row with non-numeric score or trusted cutoff: {0}\n".format(line))
                    continue

                if total_score < trusted_cutoff:
                    continue

                this_qry_id = cols[5]
                accession = cols[0]

                # if this is a PFAM accession, drop the version
                m = pfam_versioned.match(accession)
                if m:
                    accession = m.group(1)

                ## the HMM hits are sorted already with the top hit for each query first
                if last_qry_id != this_qry_id:
                    ## save it
                    annot = polypeptides[this_qry_id].annotation
                    annot.product_name = cols[15]
                    log_fh.write("INFO: {0}: Updated product name to '{1}' based on HMM hit to accession '{2}'\n".format(this_qry_id, annot.product_name, accession))

                    # does our hmm database provide GO terms for this accession?
                    for go_annot in get_hmmdb_go_terms(accession, cursor):
                        annot.add_go_annotation(go_annot)

                    # do we have a gene symbol for this accession?
                    annot.gene_symbol = get_hmmdb_gene_symbol(accession, cursor)

                    # do we have an EC number?
                    for ec_annot in get_hmmdb_ec_nums(accession, cursor):
                        annot.add_ec_number(ec_annot)

                    ## remember the ID we just saw
                    last_qry_id = this_qry_id
def parse_hmm_evidence( log_fh, polypeptides, htab_list, cursor ):
    '''
    Reads a list file of HMM evidence and dict of polypeptides, populating each with
    Annotation evidence where appropriate.  Each file in the list can have results
    for multiple queries, but it's assumed that ALL candidate matches for any given
    query are grouped together, best hit first.

    Currently only the top hit for any given query polypeptide is used, and only
    rows whose total score (column 12) reaches the total trusted cutoff (column 17)
    are considered.

    Args:
        log_fh: writable file-like object; receives one INFO line per product name
            assigned and one WARN line per row whose score/cutoff is not numeric.
        polypeptides: mapping of query ID (column 5) to an object exposing an
            ``annotation`` attribute.
        htab_list: path of a list file, read via utils.read_list_file(), naming the
            tab-delimited HMM result files.
        cursor: database cursor passed through to the get_hmmdb_* lookups.

    Raises:
        IndexError: if a non-blank row has fewer columns than the ones read here.
        KeyError: presumably, if a hit names a query ID absent from polypeptides
            (depends on the mapping type supplied by the caller).
    '''
    # PFAM accessions carry a version suffix (PF00001.12); lookups use the bare ID
    pfam_versioned = re.compile(r"^(PF\d+)\.\d+")

    for path in utils.read_list_file(htab_list):
        last_qry_id = None

        # 'with' closes the handle even if a row raises part-way through the file
        with open(path) as htab_fh:
            for line in htab_fh:
                line = line.rstrip()

                # tolerate blank lines rather than failing on the column lookups
                if not line:
                    continue

                cols = line.split("\t")

                ## only consider the row if the total score reaches the total trusted
                #  cutoff.  The comparison must be numeric: as strings, "9.5" sorts
                #  above "100.0".
                try:
                    total_score = float(cols[12])
                    trusted_cutoff = float(cols[17])
                except ValueError:
                    log_fh.write("WARN: skipping HMM row with non-numeric score or trusted cutoff: {0}\n".format(line))
                    continue

                if total_score < trusted_cutoff:
                    continue

                this_qry_id = cols[5]
                accession = cols[0]

                # if this is a PFAM accession, drop the version
                m = pfam_versioned.match(accession)
                if m:
                    accession = m.group(1)

                ## the HMM hits are sorted already with the top hit for each query first
                if last_qry_id != this_qry_id:
                    ## save it
                    annot = polypeptides[this_qry_id].annotation
                    annot.product_name = cols[15]
                    log_fh.write("INFO: {0}: Updated product name to '{1}' based on HMM hit to accession '{2}'\n".format(this_qry_id, annot.product_name, accession))

                    # does our hmm database provide GO terms for this accession?
                    for go_annot in get_hmmdb_go_terms( accession, cursor ):
                        annot.add_go_annotation(go_annot)

                    # do we have a gene symbol for this accession?
                    annot.gene_symbol = get_hmmdb_gene_symbol( accession, cursor )

                    # do we have an EC number?
                    for ec_annot in get_hmmdb_ec_nums( accession, cursor ):
                        annot.add_ec_number(ec_annot)

                    ## remember the ID we just saw
                    last_qry_id = this_qry_id
def parse_trembl_blast_evidence(polypeptides, blast_list, eval_cutoff):
    '''
    Reads a list file of NCBI BLAST evidence against TrEMBL and a dict of polypeptides,
    populating each with Annotation evidence where appropriate.  Only attaches evidence if
    the product name is the default.

    Currently only considers the top BLAST hit for each query which doesn't have
    'uncharacterized' in the product name.

    Args:
        polypeptides: mapping of query ID (column 0) to an object exposing an
            ``annotation`` attribute.
        blast_list: path of a list file, read via utils.read_list_file(), naming the
            tab-delimited BLAST result files.
        eval_cutoff: rows whose column 19, parsed as a float, exceeds this value
            are ignored.

    Raises:
        IndexError: if a non-blank row has fewer columns than the ones read here.
        ValueError: if column 19 of a row cannot be parsed as a float.
    '''
    for path in utils.read_list_file(blast_list):
        last_qry_id = None

        # 'with' closes the handle even if a row raises part-way through the file
        with open(path) as blast_fh:
            for line in blast_fh:
                line = line.rstrip()

                # tolerate blank lines rather than failing on the column lookups
                if not line:
                    continue

                cols = line.split("\t")
                product_field = cols[15]

                # We're going to ignore any lines which have 'uncharacterized' in the name
                #  (first character left off so both capitalisations match)
                if 'ncharacterized' in product_field:
                    continue

                this_qry_id = cols[0]

                # skip this line if it doesn't meet the cutoff
                if float(cols[19]) > eval_cutoff:
                    continue

                # the BLAST hits are sorted already with the top hit for each query first
                if last_qry_id != this_qry_id:
                    annot = polypeptides[this_qry_id].annotation

                    # save it, unless the gene product name has already changed from the default
                    if annot.product_name == DEFAULT_PRODUCT_NAME:
                        # current hack until DB is updated:
                        # some products look like this:
                        #    Coatomer subunit gamma-2 OS=Bos taurus GN=COPG2 PE=2 SV=1
                        # take off everything after the OS=
                        m = re.search("(.+) OS=", product_field)
                        if m:
                            annot.product_name = m.group(1)
                        else:
                            annot.product_name = product_field

                    # remember the ID we just saw
                    last_qry_id = this_qry_id
def parse_uniref100_blast_evidence(log_fh, polypeptides, blast_list, cursor, eval_cutoff, algorithm, uniref100_fasta_path):
    '''
    Reads a list file of NCBI BLAST (or RAPSearch2) evidence and a dict of polypeptides,
    populating each with Annotation evidence where appropriate.  Only attaches a product
    name if the current one is the default; EC numbers, GO terms and gene symbol are
    inherited only where none has been set yet.

    Currently only considers the top hit for each query.

    Args:
        log_fh: writable file-like object; receives an INFO line for each product
            name assigned from a 'blast' hit.
        polypeptides: mapping of query ID (column 0) to an object exposing an
            ``annotation`` attribute.
        blast_list: path of a list file, read via utils.read_list_file(), naming the
            tab-delimited result files.  Lines starting with '#' are skipped.
        cursor: database cursor passed through to the get_uniref_* lookups.
        eval_cutoff: hits with an E-value above this are ignored.
        algorithm: 'blast' (E-value read from column 19) or 'rapsearch2' (column 10
            is used as a base-10 exponent of the E-value).
        uniref100_fasta_path: UniRef100 FASTA file; its headers are only read when
            algorithm is 'rapsearch2'.

    Raises:
        Exception: if algorithm is not one of the two supported values, or if a
            'blast' product field lacks the expected ' n=' / 'RepID=' parts.
    '''
    if algorithm not in ['blast', 'rapsearch2']:
        raise Exception("algorithm argument must be either blast or rapsearch2")

    # Product-name keywords that disqualify a BLAST hit.
    #  First character left off for initcap reasons
    skip_products = ('ncharacterized', 'ypothetical', 'enomic scaffold')

    ## need to load the UniRef100 to TREMBL accession lookup from the FASTA
    #   like UniRef100_K1T359 -> K1T359_9ZZZZ
    uniref2acc = dict()
    print("INFO: parsing UniRef100 FASTA headers for annotation")

    if algorithm == 'rapsearch2':
        with open(uniref100_fasta_path) as fasta_fh:
            for line in fasta_fh:
                if line[0] == '>':
                    m = re.match(r">(\S+) (.+) n=.+RepID=(\S+)", line)
                    if m:
                        uniref2acc[m.group(1)] = {'acc': m.group(3), 'prod': m.group(2)}

    for path in utils.read_list_file(blast_list):
        last_qry_id = None

        # 'with' closes the handle even if a row raises part-way through the file
        with open(path) as blast_fh:
            for line in blast_fh:
                # 0 indexing is faster than startswith()
                if line[0] == '#':
                    continue

                line = line.rstrip()

                # tolerate blank lines rather than failing on the column lookups
                if not line:
                    continue

                cols = line.split("\t")
                this_qry_id = cols[0]

                if algorithm == 'blast':
                    # We're going to ignore any lines which have a few keywords in the name
                    if any(keyword in cols[15] for keyword in skip_products):
                        continue

                    e_value = float(cols[19])
                elif algorithm == 'rapsearch2':
                    ## math.pow() only raises OverflowError when the exponent is too
                    #  large and positive (tiny values quietly underflow to 0.0), so an
                    #  overflow means an astronomically bad E-value.  Treat it as
                    #  infinite so it can never pass the cutoff.
                    try:
                        e_value = math.pow(10, float(cols[10]))
                    except OverflowError:
                        print("WARN: couldn't handle E-value math on the following line (treating as infinite):\n{0}".format(line))
                        e_value = float('inf')

                # skip this line if it doesn't meet the cutoff
                if e_value > eval_cutoff:
                    continue

                # the BLAST hits are sorted already with the top hit for each query first
                if last_qry_id != this_qry_id:
                    annot = polypeptides[this_qry_id].annotation

                    # get the accession then process for known accession types
                    accession = None

                    if algorithm == 'blast':
                        # UniRef100_K1T359 -> K1T359_9ZZZZ
                        m = re.search(r"RepID=(\S+)", cols[15])
                        if m:
                            accession = m.group(1)
                        else:
                            raise Exception("ERROR: Unexpected product format in UniRef BLAST results: {0}".format(cols[15]))
                    elif algorithm == 'rapsearch2':
                        accession = uniref2acc[cols[1]]['acc']

                    assertions = get_uniref_annot(accession, cursor)

                    # save it, unless the gene product name has already changed from the default
                    if annot.product_name == DEFAULT_PRODUCT_NAME:
                        if algorithm == 'blast':
                            # these hits look like this:
                            #  AD-specific glutamate dehydrogenase n=1 Tax=Ceriporiopsis subvermispora (strain B) RepID=M2RLB9_CERS8
                            m = re.match(r"(.+) n=.+", cols[15])
                            if m:
                                annot.product_name = m.group(1)
                            else:
                                raise Exception("ERROR: Unexpected product format in UniRef BLAST results: {0}".format(cols[15]))

                            log_fh.write("INFO: {0}: Updated product name to '{1}' based on BLAST hit to UniRef100 accession '{2}'\n".format(this_qry_id, annot.product_name, accession))
                        elif algorithm == 'rapsearch2':
                            annot.product_name = uniref2acc[cols[1]]['prod']

                    # if no EC numbers have been set, they can inherit from this
                    if len(annot.ec_numbers) == 0:
                        for ec_annot in get_uniref_ec_nums(accession, cursor):
                            annot.add_ec_number(ec_annot)

                    # if no GO IDs have been set, they can inherit from this
                    if len(annot.go_annotations) == 0:
                        for go_annot in get_uniref_go_terms(accession, cursor):
                            annot.add_go_annotation(go_annot)

                    # if no gene symbol has been set, it can inherit from this
                    if annot.gene_symbol is None:
                        annot.gene_symbol = assertions['symbol']

                    # remember the ID we just saw
                    last_qry_id = this_qry_id
def parse_tmhmm_evidence(log_fh, polypeptides, htab_list):
    '''
    Reads a list of raw TMHMM evidence and a dict of polypeptides, adding annotation
    attributes where possible.

    A record starts at its '# <id> Length: <n>' line; every following five-column
    row whose third column is 'TMhelix' counts as one helix.  A polypeptide with at
    least MIN_HELICAL_SPANS helices gets GO:0016021 and, if its product name is
    still the default, GENE_PRODUCT_NAME.

    Notes from the esteemed M Giglio:

    The GO term to use would be GO:0016021 "integral component of membrane"
    Or if you want to be more conservative you could go with GO:0016020 "membrane"

    Depends on the evidence. For the prok pipe we are pretty conservative, we require
    five TMHMM domains and then we call it putative integral membrane protein.

    On ECO - in fact Marcus and I are the developers of ECO.  It is an ontology of
    evidence types.  An annotation to an ECO term is used in conjunction with another
    annotation, like a GO term (but many other types of annotation can, and are, used
    with ECO).  It provides additional information about the annotation.  In fact for
    GO, the assignment of an evidence term along with a GO term is a required part of
    a GO annotation. (ECO terms are the "evidence codes" in GO.)

    INPUT: Expected TMHMM input (all HTML lines are skipped)
    # CHARM010_V2.mRNA.887 Length: 904
    # CHARM010_V2.mRNA.887 Number of predicted TMHs:  6
    # CHARM010_V2.mRNA.887 Exp number of AAs in TMHs: 133.07638
    # CHARM010_V2.mRNA.887 Exp number, first 60 AAs:  21.83212
    # CHARM010_V2.mRNA.887 Total prob of N-in:        0.99994
    # CHARM010_V2.mRNA.887 POSSIBLE N-term signal sequence
    CHARM010_V2.mRNA.887	TMHMM2.0	inside	     1    11
    CHARM010_V2.mRNA.887	TMHMM2.0	TMhelix	    12    34
    CHARM010_V2.mRNA.887	TMHMM2.0	outside	    35   712
    CHARM010_V2.mRNA.887	TMHMM2.0	TMhelix	   713   735
    CHARM010_V2.mRNA.887	TMHMM2.0	inside	   736   755
    CHARM010_V2.mRNA.887	TMHMM2.0	TMhelix	   756   773
    CHARM010_V2.mRNA.887	TMHMM2.0	outside	   774   782
    CHARM010_V2.mRNA.887	TMHMM2.0	TMhelix	   783   805
    CHARM010_V2.mRNA.887	TMHMM2.0	inside	   806   809
    CHARM010_V2.mRNA.887	TMHMM2.0	TMhelix	   810   832
    CHARM010_V2.mRNA.887	TMHMM2.0	outside	   833   871
    CHARM010_V2.mRNA.887	TMHMM2.0	TMhelix	   872   894
    CHARM010_V2.mRNA.887	TMHMM2.0	inside	   895   904

    Args:
        log_fh: writable file-like object; receives one INFO line per polypeptide
            that reaches the helix threshold.
        polypeptides: mapping of query ID to an object exposing an ``annotation``
            attribute.
        htab_list: path of a list file, read via utils.read_list_file(), naming the
            raw TMHMM output files.
    '''
    # The number of helices spanning the membrane required before counted as a membrane protein
    MIN_HELICAL_SPANS = 3

    # For successful matches, this is the product name which gets applied
    GENE_PRODUCT_NAME = 'Putative integral membrane protein'

    header_pattern = re.compile(r"# (.+?)\s+Length: \d+")

    def apply_result(qry_id, helix_count):
        # Record the membrane-protein call for one completed TMHMM record.
        if helix_count < MIN_HELICAL_SPANS:
            return

        annot = polypeptides[qry_id].annotation

        if annot.product_name == DEFAULT_PRODUCT_NAME:
            annot.product_name = GENE_PRODUCT_NAME
            log_fh.write("INFO: {0}: Updated product name to '{1}' because it had {2} TMHelix domains predicted by TMHMM\n".format(qry_id, annot.product_name, helix_count))
        else:
            log_fh.write("INFO: {0}: TMHMM predicted {1} TMHelix domains but gene product name unchanged because of previous assignment\n".format(qry_id, helix_count))

        ## we add the GO terms no matter what
        annot.add_go_annotation(annotation.GOAnnotation(go_id='0016021'))

    for path in utils.read_list_file(htab_list):
        last_qry_id = None
        current_helix_count = 0

        # 'with' closes the handle even if a row raises part-way through the file
        with open(path) as tmhmm_fh:
            for line in tmhmm_fh:
                if line.startswith('<'):
                    continue

                m = header_pattern.match(line)

                if m:
                    # a new record begins: purge the previous result, then reset
                    apply_result(last_qry_id, current_helix_count)
                    last_qry_id = m.group(1)
                    current_helix_count = 0
                    continue

                cols = line.split()
                if len(cols) == 5 and cols[2] == 'TMhelix':
                    current_helix_count += 1

        # No header follows the final record, so it has to be purged here;
        #  otherwise the last polypeptide of every file is silently ignored.
        if last_qry_id is not None:
            apply_result(last_qry_id, current_helix_count)
def parse_sprot_blast_evidence(log_fh, polypeptides, blast_org, blast_list, cursor, eval_cutoff, algorithm):
    '''
    Reads a list file of NCBI BLAST (or RAPSearch2) evidence and a dict of polypeptides,
    populating each with Annotation evidence where appropriate.  Only attaches a product
    name if the current one is the default; EC numbers, GO terms and gene symbol are
    inherited only where none has been set yet.

    Currently only considers the top hit for each query.

    Args:
        log_fh: writable file-like object; receives an INFO line for each product
            name assigned.
        polypeptides: mapping of query ID (column 0) to an object exposing an
            ``annotation`` attribute.
        blast_org: dict updated in place, query ID -> assertions['organism'] of the
            hit used for that query.
        blast_list: path of a list file, read via utils.read_list_file(), naming the
            tab-delimited result files.  Lines starting with '#' are skipped.
        cursor: database cursor passed through to the get_uspdb_* lookups.
        eval_cutoff: hits with an E-value above this are ignored.
        algorithm: 'blast' (E-value in column 19, accession in column 5) or
            'rapsearch2' (column 10 is used as a base-10 exponent of the E-value,
            accession in column 1).

    Raises:
        Exception: if algorithm is not one of the two supported values.
    '''
    if algorithm not in ['blast', 'rapsearch2']:
        raise Exception("algorithm argument must be either blast or rapsearch2")

    for path in utils.read_list_file(blast_list):
        last_qry_id = None

        # 'with' closes the handle even if a row raises part-way through the file
        with open(path) as blast_fh:
            for line in blast_fh:
                # 0 indexing is faster than startswith()
                if line[0] == '#':
                    continue

                line = line.rstrip()

                # tolerate blank lines rather than failing on the column lookups
                if not line:
                    continue

                cols = line.split("\t")
                this_qry_id = cols[0]

                if algorithm == 'blast':
                    e_value = float(cols[19])
                elif algorithm == 'rapsearch2':
                    ## math.pow() only raises OverflowError when the exponent is too
                    #  large and positive (tiny values quietly underflow to 0.0), so an
                    #  overflow means an astronomically bad E-value.  Treat it as
                    #  infinite so it can never pass the cutoff.
                    try:
                        e_value = math.pow(10, float(cols[10]))
                    except OverflowError:
                        print("WARN: couldn't handle E-value math on the following line (treating as infinite):\n{0}".format(line))
                        e_value = float('inf')

                # skip this line if it doesn't meet the cutoff
                if e_value > eval_cutoff:
                    continue

                # the BLAST hits are sorted already with the top hit for each query first
                if last_qry_id != this_qry_id:
                    annot = polypeptides[this_qry_id].annotation

                    # get the accession, then process for known accession types
                    if algorithm == 'blast':
                        accession = cols[5]
                    elif algorithm == 'rapsearch2':
                        accession = cols[1]

                    if accession.startswith('sp|'):
                        # pluck the second part out of this:
                        #  sp|Q4PEV8|EIF3M_USTMA
                        accession = accession.split('|')[1]

                    assertions = get_uspdb_annot(accession, cursor)
                    blast_org[this_qry_id] = assertions['organism']

                    # save it, unless the gene product name has already changed from the default
                    if annot.product_name == DEFAULT_PRODUCT_NAME:
                        if algorithm == 'blast':
                            # current hack until DB is updated:
                            # some products look like this:
                            #    Coatomer subunit gamma-2 OS=Bos taurus GN=COPG2 PE=2 SV=1
                            # take off everything after the OS=
                            m = re.search("(.+) OS=", cols[15])
                            if m:
                                annot.product_name = m.group(1)
                            else:
                                annot.product_name = cols[15]
                        elif algorithm == 'rapsearch2':
                            annot.product_name = assertions['product']

                        log_fh.write("INFO: {0}: Updated product name to '{1}' based on BLAST hit to SPROT accession '{2}'\n".format(this_qry_id, annot.product_name, accession))

                    # if no EC numbers have been set, they can inherit from this
                    if len(annot.ec_numbers) == 0:
                        for ec_annot in get_uspdb_ec_nums(accession, cursor):
                            annot.add_ec_number(ec_annot)

                    # if no GO IDs have been set, they can inherit from this
                    if len(annot.go_annotations) == 0:
                        for go_annot in get_uspdb_go_terms(accession, cursor):
                            annot.add_go_annotation(go_annot)

                    # if no gene symbol has been set, it can inherit from this
                    if annot.gene_symbol is None:
                        annot.gene_symbol = assertions['symbol']

                    # remember the ID we just saw
                    last_qry_id = this_qry_id
def parse_kegg_blast_evidence(log_fh, polypeptides, blast_list, eval_cutoff):
    '''
    Reads a list file of NCBI BLAST evidence against KEGG and a dict of polypeptides,
    populating each with Annotation evidence where appropriate.  Only attaches evidence if
    the product name is the default.

    Currently only considers the top BLAST hit for each query which doesn't have
    'uncharacterized' or hypothetical in the product name.

    When the product field (column 15) contains a '; K<digits> <name>' part, the
    name becomes the product name, the K number is attached as a KEGG dbxref, and
    a trailing '[EC:...]' value, if present, is attached as an EC number.

    Args:
        log_fh: writable file-like object; receives an INFO line for each product
            name assigned.
        polypeptides: mapping of query ID (column 0) to an object exposing an
            ``annotation`` attribute.
        blast_list: path of a list file, read via utils.read_list_file(), naming the
            tab-delimited BLAST result files.
        eval_cutoff: rows whose column 19, parsed as a float, exceeds this value
            are ignored.

    Raises:
        IndexError: if a non-blank row has fewer columns than the ones read here.
        ValueError: if column 19 of a row cannot be parsed as a float.
    '''
    for path in utils.read_list_file(blast_list):
        last_qry_id = None

        # 'with' closes the handle even if a row raises part-way through the file
        with open(path) as blast_fh:
            for line in blast_fh:
                line = line.rstrip()

                # tolerate blank lines rather than failing on the column lookups
                if not line:
                    continue

                cols = line.split("\t")
                product_field = cols[15]

                # We're going to ignore any lines which have a few keywords in the name
                # First character left off for initcap reasons
                if 'ncharacterized' in product_field or 'ypothetical' in product_field:
                    continue

                this_qry_id = cols[0]

                # skip this line if it doesn't meet the cutoff
                if float(cols[19]) > eval_cutoff:
                    continue

                # the BLAST hits are sorted already with the top hit for each query first
                if last_qry_id != this_qry_id:
                    annot = polypeptides[this_qry_id].annotation
                    accession = cols[5]

                    # save it, unless the gene product name has already changed from the default
                    if annot.product_name == DEFAULT_PRODUCT_NAME:
                        # the product field looks like this:
                        # dam; adenine-specific DNA methyltransferase; K06223 DNA adenine methylase [EC:2.1.1.72]
                        # troponin I type 1 (skeletal, slow); K10371 troponin I, slow skeletal muscle
                        if ' [EC' in product_field and product_field.endswith(']'):
                            m = re.search(r"; (K\d+)\s+(.+) \[EC:(.+)\]", product_field)
                        else:
                            m = re.search(r"; (K\d+)\s+(.+)", product_field)

                        if m:
                            kegg_id = m.group(1)
                            product = m.group(2)

                            # only the EC-aware pattern has a third group
                            ec_num = m.group(3) if len(m.groups()) == 3 else None

                            annot.product_name = product
                            log_fh.write("INFO: {0}: Updated product name to '{1}' based on BLAST hit to KEGG accession '{2}'\n".format(this_qry_id, annot.product_name, accession))

                            # truthiness covers both None and '' (an identity test
                            #  against a string literal is not reliable)
                            if ec_num:
                                ec = annotation.ECAnnotation(number=ec_num)
                                annot.add_ec_number(ec)

                            kegg_dbxref = annotation.Dbxref(db='KEGG', identifier=kegg_id)
                            annot.add_dbxref(kegg_dbxref)

                    # remember the ID we just saw
                    last_qry_id = this_qry_id
def parse_uniref100_blast_evidence( log_fh, polypeptides, blast_list, cursor, eval_cutoff, algorithm, uniref100_fasta_path ):
    '''
    Reads a list file of NCBI BLAST (or RAPSearch2) evidence and a dict of polypeptides,
    populating each with Annotation evidence where appropriate.  Only attaches a product
    name if the current one is the default; EC numbers, GO terms and gene symbol are
    inherited only where none has been set yet.

    Currently only considers the top hit for each query.

    Args:
        log_fh: writable file-like object; receives an INFO line for each product
            name assigned from a 'blast' hit.
        polypeptides: mapping of query ID (column 0) to an object exposing an
            ``annotation`` attribute.
        blast_list: path of a list file, read via utils.read_list_file(), naming the
            tab-delimited result files.  Lines starting with '#' are skipped.
        cursor: database cursor passed through to the get_uniref_* lookups.
        eval_cutoff: hits with an E-value above this are ignored.
        algorithm: 'blast' (E-value read from column 19) or 'rapsearch2' (column 10
            is used as a base-10 exponent of the E-value).
        uniref100_fasta_path: UniRef100 FASTA file; its headers are only read when
            algorithm is 'rapsearch2'.

    Raises:
        Exception: if algorithm is not one of the two supported values, or if a
            'blast' product field lacks the expected ' n=' / 'RepID=' parts.
    '''
    if algorithm not in ['blast', 'rapsearch2']:
        raise Exception("algorithm argument must be either blast or rapsearch2")

    # Product-name keywords that disqualify a BLAST hit.
    #  First character left off for initcap reasons
    skip_products = ('ncharacterized', 'ypothetical', 'enomic scaffold')

    ## need to load the UniRef100 to TREMBL accession lookup from the FASTA
    #   like UniRef100_K1T359 -> K1T359_9ZZZZ
    uniref2acc = dict()
    print("INFO: parsing UniRef100 FASTA headers for annotation")

    if algorithm == 'rapsearch2':
        with open(uniref100_fasta_path) as fasta_fh:
            for line in fasta_fh:
                if line[0] == '>':
                    m = re.match(r">(\S+) (.+) n=.+RepID=(\S+)", line)
                    if m:
                        uniref2acc[m.group(1)] = {'acc': m.group(3), 'prod': m.group(2)}

    for path in utils.read_list_file(blast_list):
        last_qry_id = None

        # 'with' closes the handle even if a row raises part-way through the file
        with open(path) as blast_fh:
            for line in blast_fh:
                # 0 indexing is faster than startswith()
                if line[0] == '#':
                    continue

                line = line.rstrip()

                # tolerate blank lines rather than failing on the column lookups
                if not line:
                    continue

                cols = line.split("\t")
                this_qry_id = cols[0]

                if algorithm == 'blast':
                    # We're going to ignore any lines which have a few keywords in the name
                    if any(keyword in cols[15] for keyword in skip_products):
                        continue

                    e_value = float(cols[19])
                elif algorithm == 'rapsearch2':
                    ## math.pow() only raises OverflowError when the exponent is too
                    #  large and positive (tiny values quietly underflow to 0.0), so an
                    #  overflow means an astronomically bad E-value.  Treat it as
                    #  infinite so it can never pass the cutoff.
                    try:
                        e_value = math.pow(10, float(cols[10]))
                    except OverflowError:
                        print("WARN: couldn't handle E-value math on the following line (treating as infinite):\n{0}".format(line))
                        e_value = float('inf')

                # skip this line if it doesn't meet the cutoff
                if e_value > eval_cutoff:
                    continue

                # the BLAST hits are sorted already with the top hit for each query first
                if last_qry_id != this_qry_id:
                    annot = polypeptides[this_qry_id].annotation

                    # get the accession then process for known accession types
                    accession = None

                    if algorithm == 'blast':
                        # UniRef100_K1T359 -> K1T359_9ZZZZ
                        m = re.search(r"RepID=(\S+)", cols[15])
                        if m:
                            accession = m.group(1)
                        else:
                            raise Exception("ERROR: Unexpected product format in UniRef BLAST results: {0}".format(cols[15]))
                    elif algorithm == 'rapsearch2':
                        accession = uniref2acc[cols[1]]['acc']

                    assertions = get_uniref_annot( accession, cursor )

                    # save it, unless the gene product name has already changed from the default
                    if annot.product_name == DEFAULT_PRODUCT_NAME:
                        if algorithm == 'blast':
                            # these hits look like this:
                            #  AD-specific glutamate dehydrogenase n=1 Tax=Ceriporiopsis subvermispora (strain B) RepID=M2RLB9_CERS8
                            m = re.match(r"(.+) n=.+", cols[15])
                            if m:
                                annot.product_name = m.group(1)
                            else:
                                raise Exception("ERROR: Unexpected product format in UniRef BLAST results: {0}".format(cols[15]))

                            log_fh.write("INFO: {0}: Updated product name to '{1}' based on BLAST hit to UniRef100 accession '{2}'\n".format(this_qry_id, annot.product_name, accession))
                        elif algorithm == 'rapsearch2':
                            annot.product_name = uniref2acc[cols[1]]['prod']

                    # if no EC numbers have been set, they can inherit from this
                    if len(annot.ec_numbers) == 0:
                        for ec_annot in get_uniref_ec_nums( accession, cursor ):
                            annot.add_ec_number(ec_annot)

                    # if no GO IDs have been set, they can inherit from this
                    if len(annot.go_annotations) == 0:
                        for go_annot in get_uniref_go_terms( accession, cursor ):
                            annot.add_go_annotation(go_annot)

                    # if no gene symbol has been set, it can inherit from this
                    if annot.gene_symbol is None:
                        annot.gene_symbol = assertions['symbol']

                    # remember the ID we just saw
                    last_qry_id = this_qry_id
def parse_tmhmm_evidence( log_fh, polypeptides, htab_list ):
    '''
    Reads a list of raw TMHMM evidence and a dict of polypeptides, adding annotation
    attributes where possible.

    A record starts at its '# <id> Length: <n>' line; every following five-column
    row whose third column is 'TMhelix' counts as one helix.  A polypeptide with at
    least MIN_HELICAL_SPANS helices gets GO:0016021 and, if its product name is
    still the default, GENE_PRODUCT_NAME.

    Notes from the esteemed M Giglio:

    The GO term to use would be GO:0016021 "integral component of membrane"
    Or if you want to be more conservative you could go with GO:0016020 "membrane"

    Depends on the evidence. For the prok pipe we are pretty conservative, we require
    five TMHMM domains and then we call it putative integral membrane protein.

    On ECO - in fact Marcus and I are the developers of ECO.  It is an ontology of
    evidence types.  An annotation to an ECO term is used in conjunction with another
    annotation, like a GO term (but many other types of annotation can, and are, used
    with ECO).  It provides additional information about the annotation.  In fact for
    GO, the assignment of an evidence term along with a GO term is a required part of
    a GO annotation. (ECO terms are the "evidence codes" in GO.)

    INPUT: Expected TMHMM input (all HTML lines are skipped)
    # CHARM010_V2.mRNA.887 Length: 904
    # CHARM010_V2.mRNA.887 Number of predicted TMHs:  6
    # CHARM010_V2.mRNA.887 Exp number of AAs in TMHs: 133.07638
    # CHARM010_V2.mRNA.887 Exp number, first 60 AAs:  21.83212
    # CHARM010_V2.mRNA.887 Total prob of N-in:        0.99994
    # CHARM010_V2.mRNA.887 POSSIBLE N-term signal sequence
    CHARM010_V2.mRNA.887	TMHMM2.0	inside	     1    11
    CHARM010_V2.mRNA.887	TMHMM2.0	TMhelix	    12    34
    CHARM010_V2.mRNA.887	TMHMM2.0	outside	    35   712
    CHARM010_V2.mRNA.887	TMHMM2.0	TMhelix	   713   735
    CHARM010_V2.mRNA.887	TMHMM2.0	inside	   736   755
    CHARM010_V2.mRNA.887	TMHMM2.0	TMhelix	   756   773
    CHARM010_V2.mRNA.887	TMHMM2.0	outside	   774   782
    CHARM010_V2.mRNA.887	TMHMM2.0	TMhelix	   783   805
    CHARM010_V2.mRNA.887	TMHMM2.0	inside	   806   809
    CHARM010_V2.mRNA.887	TMHMM2.0	TMhelix	   810   832
    CHARM010_V2.mRNA.887	TMHMM2.0	outside	   833   871
    CHARM010_V2.mRNA.887	TMHMM2.0	TMhelix	   872   894
    CHARM010_V2.mRNA.887	TMHMM2.0	inside	   895   904

    Args:
        log_fh: writable file-like object; receives one INFO line per polypeptide
            that reaches the helix threshold.
        polypeptides: mapping of query ID to an object exposing an ``annotation``
            attribute.
        htab_list: path of a list file, read via utils.read_list_file(), naming the
            raw TMHMM output files.
    '''
    # The number of helices spanning the membrane required before counted as a membrane protein
    MIN_HELICAL_SPANS = 3

    # For successful matches, this is the product name which gets applied
    GENE_PRODUCT_NAME = 'Putative integral membrane protein'

    header_pattern = re.compile(r"# (.+?)\s+Length: \d+")

    def apply_result(qry_id, helix_count):
        # Record the membrane-protein call for one completed TMHMM record.
        if helix_count < MIN_HELICAL_SPANS:
            return

        annot = polypeptides[qry_id].annotation

        if annot.product_name == DEFAULT_PRODUCT_NAME:
            annot.product_name = GENE_PRODUCT_NAME
            log_fh.write("INFO: {0}: Updated product name to '{1}' because it had {2} TMHelix domains predicted by TMHMM\n".format(qry_id, annot.product_name, helix_count))
        else:
            log_fh.write("INFO: {0}: TMHMM predicted {1} TMHelix domains but gene product name unchanged because of previous assignment\n".format(qry_id, helix_count))

        ## we add the GO terms no matter what
        annot.add_go_annotation(annotation.GOAnnotation(go_id='0016021'))

    for path in utils.read_list_file(htab_list):
        last_qry_id = None
        current_helix_count = 0

        # 'with' closes the handle even if a row raises part-way through the file
        with open(path) as tmhmm_fh:
            for line in tmhmm_fh:
                if line.startswith('<'):
                    continue

                m = header_pattern.match(line)

                if m:
                    # a new record begins: purge the previous result, then reset
                    apply_result(last_qry_id, current_helix_count)
                    last_qry_id = m.group(1)
                    current_helix_count = 0
                    continue

                cols = line.split()
                if len(cols) == 5 and cols[2] == 'TMhelix':
                    current_helix_count += 1

        # No header follows the final record, so it has to be purged here;
        #  otherwise the last polypeptide of every file is silently ignored.
        if last_qry_id is not None:
            apply_result(last_qry_id, current_helix_count)
def parse_sprot_blast_evidence( log_fh, polypeptides, blast_org, blast_list, cursor, eval_cutoff, algorithm ):
    '''
    Reads a list file of NCBI BLAST (or RAPSearch2) evidence and a dict of polypeptides,
    populating each with Annotation evidence where appropriate.  Only attaches a product
    name if the current one is the default; EC numbers, GO terms and gene symbol are
    inherited only where none has been set yet.

    Currently only considers the top hit for each query.

    Args:
        log_fh: writable file-like object; receives an INFO line for each product
            name assigned.
        polypeptides: mapping of query ID (column 0) to an object exposing an
            ``annotation`` attribute.
        blast_org: dict updated in place, query ID -> assertions['organism'] of the
            hit used for that query.
        blast_list: path of a list file, read via utils.read_list_file(), naming the
            tab-delimited result files.  Lines starting with '#' are skipped.
        cursor: database cursor passed through to the get_uspdb_* lookups.
        eval_cutoff: hits with an E-value above this are ignored.
        algorithm: 'blast' (E-value in column 19, accession in column 5) or
            'rapsearch2' (column 10 is used as a base-10 exponent of the E-value,
            accession in column 1).

    Raises:
        Exception: if algorithm is not one of the two supported values.
    '''
    if algorithm not in ['blast', 'rapsearch2']:
        raise Exception("algorithm argument must be either blast or rapsearch2")

    for path in utils.read_list_file(blast_list):
        last_qry_id = None

        # 'with' closes the handle even if a row raises part-way through the file
        with open(path) as blast_fh:
            for line in blast_fh:
                # 0 indexing is faster than startswith()
                if line[0] == '#':
                    continue

                line = line.rstrip()

                # tolerate blank lines rather than failing on the column lookups
                if not line:
                    continue

                cols = line.split("\t")
                this_qry_id = cols[0]

                if algorithm == 'blast':
                    e_value = float(cols[19])
                elif algorithm == 'rapsearch2':
                    ## math.pow() only raises OverflowError when the exponent is too
                    #  large and positive (tiny values quietly underflow to 0.0), so an
                    #  overflow means an astronomically bad E-value.  Treat it as
                    #  infinite so it can never pass the cutoff.
                    try:
                        e_value = math.pow(10, float(cols[10]))
                    except OverflowError:
                        print("WARN: couldn't handle E-value math on the following line (treating as infinite):\n{0}".format(line))
                        e_value = float('inf')

                # skip this line if it doesn't meet the cutoff
                if e_value > eval_cutoff:
                    continue

                # the BLAST hits are sorted already with the top hit for each query first
                if last_qry_id != this_qry_id:
                    annot = polypeptides[this_qry_id].annotation

                    # get the accession, then process for known accession types
                    if algorithm == 'blast':
                        accession = cols[5]
                    elif algorithm == 'rapsearch2':
                        accession = cols[1]

                    if accession.startswith('sp|'):
                        # pluck the second part out of this:
                        #  sp|Q4PEV8|EIF3M_USTMA
                        accession = accession.split('|')[1]

                    assertions = get_uspdb_annot( accession, cursor )
                    blast_org[this_qry_id] = assertions['organism']

                    # save it, unless the gene product name has already changed from the default
                    if annot.product_name == DEFAULT_PRODUCT_NAME:
                        if algorithm == 'blast':
                            # current hack until DB is updated:
                            # some products look like this:
                            #    Coatomer subunit gamma-2 OS=Bos taurus GN=COPG2 PE=2 SV=1
                            # take off everything after the OS=
                            m = re.search("(.+) OS=", cols[15])
                            if m:
                                annot.product_name = m.group(1)
                            else:
                                annot.product_name = cols[15]
                        elif algorithm == 'rapsearch2':
                            annot.product_name = assertions['product']

                        log_fh.write("INFO: {0}: Updated product name to '{1}' based on BLAST hit to SPROT accession '{2}'\n".format(this_qry_id, annot.product_name, accession))

                    # if no EC numbers have been set, they can inherit from this
                    if len(annot.ec_numbers) == 0:
                        for ec_annot in get_uspdb_ec_nums( accession, cursor ):
                            annot.add_ec_number(ec_annot)

                    # if no GO IDs have been set, they can inherit from this
                    if len(annot.go_annotations) == 0:
                        for go_annot in get_uspdb_go_terms( accession, cursor ):
                            annot.add_go_annotation(go_annot)

                    # if no gene symbol has been set, it can inherit from this
                    if annot.gene_symbol is None:
                        annot.gene_symbol = assertions['symbol']

                    # remember the ID we just saw
                    last_qry_id = this_qry_id
def parse_kegg_blast_evidence(log_fh, polypeptides, blast_list, eval_cutoff):
    '''
    Reads a list file of NCBI BLAST evidence against KEGG and a dict of polypeptides,
    populating each with Annotation evidence where appropriate.  Only attaches evidence if
    the product name is the default.

    Currently only considers the top BLAST hit for each query which doesn't have
    'uncharacterized' or hypothetical in the product name.

    When the product field (column 15) contains a '; K<digits> <name>' part, the
    name becomes the product name, the K number is attached as a KEGG dbxref, and
    a trailing '[EC:...]' value, if present, is attached as an EC number.

    Args:
        log_fh: writable file-like object; receives an INFO line for each product
            name assigned.
        polypeptides: mapping of query ID (column 0) to an object exposing an
            ``annotation`` attribute.
        blast_list: path of a list file, read via utils.read_list_file(), naming the
            tab-delimited BLAST result files.
        eval_cutoff: rows whose column 19, parsed as a float, exceeds this value
            are ignored.

    Raises:
        IndexError: if a non-blank row has fewer columns than the ones read here.
        ValueError: if column 19 of a row cannot be parsed as a float.
    '''
    for path in utils.read_list_file(blast_list):
        last_qry_id = None

        # 'with' closes the handle even if a row raises part-way through the file
        with open(path) as blast_fh:
            for line in blast_fh:
                line = line.rstrip()

                # tolerate blank lines rather than failing on the column lookups
                if not line:
                    continue

                cols = line.split("\t")
                product_field = cols[15]

                # We're going to ignore any lines which have a few keywords in the name
                # First character left off for initcap reasons
                if 'ncharacterized' in product_field or 'ypothetical' in product_field:
                    continue

                this_qry_id = cols[0]

                # skip this line if it doesn't meet the cutoff
                if float(cols[19]) > eval_cutoff:
                    continue

                # the BLAST hits are sorted already with the top hit for each query first
                if last_qry_id != this_qry_id:
                    annot = polypeptides[this_qry_id].annotation
                    accession = cols[5]

                    # save it, unless the gene product name has already changed from the default
                    if annot.product_name == DEFAULT_PRODUCT_NAME:
                        # the product field looks like this:
                        # dam; adenine-specific DNA methyltransferase; K06223 DNA adenine methylase [EC:2.1.1.72]
                        # troponin I type 1 (skeletal, slow); K10371 troponin I, slow skeletal muscle
                        if ' [EC' in product_field and product_field.endswith(']'):
                            m = re.search(r"; (K\d+)\s+(.+) \[EC:(.+)\]", product_field)
                        else:
                            m = re.search(r"; (K\d+)\s+(.+)", product_field)

                        if m:
                            kegg_id = m.group(1)
                            product = m.group(2)

                            # only the EC-aware pattern has a third group
                            ec_num = m.group(3) if len(m.groups()) == 3 else None

                            annot.product_name = product
                            log_fh.write("INFO: {0}: Updated product name to '{1}' based on BLAST hit to KEGG accession '{2}'\n".format(this_qry_id, annot.product_name, accession))

                            # truthiness covers both None and '' (an identity test
                            #  against a string literal is not reliable)
                            if ec_num:
                                ec = annotation.ECAnnotation(number=ec_num)
                                annot.add_ec_number(ec)

                            kegg_dbxref = annotation.Dbxref(db='KEGG', identifier=kegg_id)
                            annot.add_dbxref(kegg_dbxref)

                    # remember the ID we just saw
                    last_qry_id = this_qry_id
PROJECT_DIR = Project_Directory( ARGS.project_dir, ARGS.project_name, ["summary", "logs", "history", "minimum_spanning_set"], ["patterns", "flags", "history"]) config_logging( os.path.join( PROJECT_DIR.get_sub_directory("logs"), "{0}_pattern_selection.log".format(PROJECT_DIR.project_name)), ARGS.log) LOGGER = logging.getLogger(__name__) try: if ARGS.req_loci_file is not None: ARGS.required_loci = read_list_file(ARGS.req_loci_file) except IOError: LOGGER.error("Cannot open required loci file: %s", ARGS.req_loci_file) raise try: if ARGS.excl_loci_file is not None: ARGS.exclude_loci = read_list_file(ARGS.excl_loci_file) except IOError: LOGGER.error("Cannot open excluded loci file: %s", ARGS.excl_loci_file) raise try: if ARGS.excl_strains_file is not None: ARGS.exclude_strains = read_list_file(ARGS.excl_strains_file) except IOError: