def get_hmmdb_ec_nums(acc, c):
    """
    Look up the EC numbers linked to an HMM accession.

    Executes a query on cursor 'c' that joins hmm_ec to hmm on the HMM id and keeps
    the rows whose hmm.accession equals 'acc'.  Returns a list holding one
    bioannotation:ECAnnotation object per row (an empty list when nothing matches).
    """
    sql = "SELECT he.ec_id FROM hmm_ec he JOIN hmm ON he.hmm_id=hmm.id WHERE hmm.accession = ?"
    c.execute(sql, (acc,))

    # the cursor is iterated directly; the first column of each row is the EC number
    return [annotation.ECAnnotation(number=record[0]) for record in c]
def get_uniref_ec_nums(acc, c):
    """
    Look up the EC numbers stored for a UniRef entry.

    Executes a query on cursor 'c' against the uniref_ec table, keeping the rows whose
    id column equals 'acc'.  Returns a list holding one bioannotation:ECAnnotation
    object per row (an empty list when nothing matches).
    """
    sql = """ SELECT us_ec.ec_num FROM uniref_ec us_ec WHERE us_ec.id = ? """
    c.execute(sql, (acc,))

    # the cursor is iterated directly; the first column of each row is the EC number
    return [annotation.ECAnnotation(number=record[0]) for record in c]
def get_uspdb_ec_nums(acc, c):
    """
    Look up the EC numbers linked to a UniProt/SwissProt accession.

    Executes a query on cursor 'c' that joins uniprot_sprot_ec to uniprot_sprot_acc on
    their id columns and keeps the rows whose accession equals 'acc'.  Returns a list
    holding one bioannotation:ECAnnotation object per row (an empty list when nothing
    matches).
    """
    sql = """ SELECT us_ec.ec_num FROM uniprot_sprot_ec us_ec JOIN uniprot_sprot_acc us_acc ON us_ec.id=us_acc.id WHERE us_acc.accession = ? """
    c.execute(sql, (acc,))

    # the cursor is iterated directly; the first column of each row is the EC number
    return [annotation.ECAnnotation(number=record[0]) for record in c]
def _parse_kegg_product_field(product_field):
    """
    Split the product field of a KEGG BLAST hit into (kegg_id, product, ec_num).

    Returns None when the field has no '; K<digits> <name>' section.  The ec_num
    element is None when the field does not end with an '[EC:...]' suffix.
    """
    # the product field looks like this:
    #  dam; adenine-specific DNA methyltransferase; K06223 DNA adenine methylase [EC:2.1.1.72]
    #  troponin I type 1 (skeletal, slow); K10371 troponin I, slow skeletal muscle
    if ' [EC' in product_field and product_field.endswith(']'):
        m = re.search(r"\; (K\d+)\s+(.+) \[EC\:(.+)\]", product_field)
    else:
        m = re.search(r"\; (K\d+)\s+(.+)", product_field)

    if m is None:
        return None

    # only the first pattern carries a third (EC number) group
    ec_num = m.group(3) if len(m.groups()) == 3 else None
    return m.group(1), m.group(2), ec_num


def parse_kegg_blast_evidence(log_fh, polypeptides, blast_list, eval_cutoff):
    '''
    Reads a list file of NCBI BLAST evidence against KEGG and a dict of polypeptides,
    populating each with Annotation evidence where appropriate.  Only attaches evidence
    if the product name is the default.

    Currently only considers the top BLAST hit for each query which doesn't have
    'uncharacterized' or hypothetical in the product name.

    Arguments:
      log_fh       - writable handle; one INFO line is written per product name update
      polypeptides - dict keyed on the query ID (first column of each row); each value
                     must expose an 'annotation' attribute
      blast_list   - passed to utils.read_list_file() to get the tab-delimited files
      eval_cutoff  - rows whose 20th column, read as a float, exceeds this are skipped

    Returns nothing; the annotation objects are modified in place.  Blank lines in the
    BLAST files are ignored.
    '''
    for blast_file in utils.read_list_file(blast_list):
        last_qry_id = None

        # 'with' guarantees the handle is closed even if a row fails to parse
        with open(blast_file) as blast_fh:
            for line in blast_fh:
                line = line.rstrip()

                # nothing to parse on an empty line
                if not line:
                    continue

                cols = line.split("\t")
                product_field = cols[15]

                # We're going to ignore any lines which have a few keywords in the name
                # First character left off for initcap reasons
                if 'ncharacterized' in product_field or 'ypothetical' in product_field:
                    continue

                this_qry_id = cols[0]

                # skip this line if it doesn't meet the cutoff
                if float(cols[19]) > eval_cutoff:
                    continue

                # the BLAST hits are sorted already with the top hit for each query first,
                # so any further row for the same query is a lower-ranked hit
                if last_qry_id != this_qry_id:
                    annot = polypeptides[this_qry_id].annotation
                    accession = cols[5]

                    # save it, unless the gene product name has already changed from the default
                    if annot.product_name == DEFAULT_PRODUCT_NAME:
                        parsed = _parse_kegg_product_field(product_field)

                        if parsed is not None:
                            kegg_id, product, ec_num = parsed

                            annot.product_name = product
                            log_fh.write(
                                "INFO: {0}: Updated product name to '{1}' based on BLAST hit to KEGG accession '{2}'\n"
                                .format(this_qry_id, annot.product_name, accession))

                            # compare by value: identity checks against '' are unreliable
                            if ec_num:
                                ec = annotation.ECAnnotation(number=ec_num)
                                annot.add_ec_number(ec)

                            kegg_dbxref = annotation.Dbxref(db='KEGG', identifier=kegg_id)
                            annot.add_dbxref(kegg_dbxref)

                    # remember the ID we just saw
                    last_qry_id = this_qry_id