def _get_name_by_id(self, entity_id):
    """Return a valid name for the TERM element whose id is ``entity_id``.

    The TERM's ``dbid`` attribute is split on ``'|'`` and inspected:

    * If any entry starts with ``HGNC``, the name is looked up with
      ``self._get_hgnc_name``. When several HGNC entries are listed, the
      one with the highest ``:SCORE`` in the TERM's ``lisp`` attribute is
      preferred; if no score can be read, the first listed entry is used.
    * Otherwise, if any entry starts with ``UP``, ``up_client`` is asked
      for the HGNC name and then the gene name of the first entry.
    * Otherwise (or if UniProt yields nothing) the text of the ``name``
      child, stripped of ``'|'``, is used.

    Every returned name is passed through ``self._get_valid_name``.

    Parameters
    ----------
    entity_id : str
        Value of the ``id`` attribute of the TERM to look up in
        ``self.tree``.

    Returns
    -------
    The value returned by ``self._get_valid_name``, or None if the TERM
    or its ``name`` child cannot be found.
    """
    entity_term = self.tree.find("TERM/[@id='%s']" % entity_id)
    if entity_term is None:
        logger.debug('Term %s for entity not found' % entity_id)
        return None
    name = entity_term.find("name")
    if name is None:
        logger.debug('Entity without a name')
        return None

    # Only a missing attribute means "no grounding"; a bare except here
    # would also have hidden unrelated errors.
    dbid = entity_term.attrib.get("dbid")
    if dbid is None:
        return self._get_valid_name(name.text)

    dbids = dbid.split('|')
    hgnc_ids = [i for i in dbids if i.startswith('HGNC')]
    up_ids = [i for i in dbids if i.startswith('UP')]
    # TODO: handle protein families like 14-3-3 with IDs like
    # XFAM:PF00244.15, FA:00007
    if hgnc_ids:
        hgnc_id = None
        if len(hgnc_ids) > 1:
            # Several candidates: prefer the one with the best score
            # reported in the lisp attribute, when one is available.
            lisp_str = entity_term.attrib.get('lisp')
            if lisp_str is not None:
                scores = {}
                for part in lisp_str.split('(TERM :ID '):
                    found_ids = re.findall(r'HGNC::\|(.*)\|', part)
                    if not found_ids:
                        continue
                    try:
                        score = float(
                            re.findall(':SCORE ([^ ]+)', part)[0])
                    except (IndexError, ValueError):
                        # No usable score for this candidate; skip it
                        # instead of aborting the whole lookup.
                        continue
                    scores[found_ids[0]] = score
                if scores:
                    sorted_ids = sorted(scores.items(),
                                        key=operator.itemgetter(1))
                    hgnc_id = sorted_ids[-1][0]
        if hgnc_id is None:
            # Single candidate, or no score-based choice was possible:
            # fall back to the first listed HGNC ID so that hgnc_id is
            # always bound before the lookup below.
            hgnc_id = re.match(r'HGNC\:([0-9]*)', hgnc_ids[0]).groups()[0]
        hgnc_name = self._get_hgnc_name(hgnc_id)
        return self._get_valid_name(hgnc_name)
    elif up_ids:
        # Report on the UniProt list (the original tested hgnc_ids here,
        # which is always empty in this branch).
        if len(up_ids) > 1:
            logger.debug('%d UniProt IDs reported.' % len(up_ids))
        up_id = re.match(r'UP\:([A-Z0-9]*)', up_ids[0]).groups()[0]
        # First try to get HGNC name
        hgnc_name = up_client.get_hgnc_name(up_id)
        if hgnc_name is not None:
            return self._get_valid_name(hgnc_name)
        # Next, try to get the gene name
        gene_name = up_client.get_gene_name(up_id)
        if gene_name is not None:
            return self._get_valid_name(gene_name)
    # By default, return the text of the name tag
    name_txt = name.text.strip('|')
    return self._get_valid_name(name_txt)
def _get_name_by_id(self, entity_id):
    """Return a valid component name for the TERM with id ``entity_id``.

    The TERM's ``dbid`` attribute is split on ``'|'``. If any entry starts
    with ``HGNC``, the first one is resolved with ``self._get_hgnc_name``.
    Otherwise, if any entry starts with ``UP``, the first one is queried
    through ``up_client`` for an HGNC name and then a gene name. If none
    of these yields a name, the text of the ``name`` child (stripped of
    ``'|'``) is used. Every returned name is passed through
    ``self._get_valid_component_name``.

    Parameters
    ----------
    entity_id : str
        Value of the ``id`` attribute of the TERM to look up in
        ``self.tree``.

    Returns
    -------
    The value returned by ``self._get_valid_component_name``, or ``''``
    (after a warning) if the TERM or its ``name`` child is missing.
    """
    entity_term = self.tree.find("TERM/[@id='%s']" % entity_id)
    if entity_term is None:
        # Without this guard the .find() call below raises AttributeError.
        warnings.warn('Term %s for entity not found' % entity_id)
        return ''
    name = entity_term.find("name")
    if name is None:
        warnings.warn('Entity without a name')
        return ''
    try:
        dbid = entity_term.attrib["dbid"]
    except KeyError:
        # Only a missing attribute means "no grounding".
        warnings.warn('No grounding information for %s' % name.text)
        return self._get_valid_component_name(name.text)

    dbids = dbid.split('|')
    hgnc_ids = [i for i in dbids if i.startswith('HGNC')]
    up_ids = [i for i in dbids if i.startswith('UP')]
    # TODO: handle protein families like 14-3-3 with IDs like
    # XFAM:PF00244.15, FA:00007
    if hgnc_ids:
        if len(hgnc_ids) > 1:
            warnings.warn('%d HGNC IDs reported.' % len(hgnc_ids))
        hgnc_id = re.match(r'HGNC\:([0-9]*)', hgnc_ids[0]).groups()[0]
        hgnc_name = self._get_hgnc_name(hgnc_id)
        return self._get_valid_component_name(hgnc_name)
    elif up_ids:
        # Report on the UniProt list (the original tested hgnc_ids here,
        # which is always empty in this branch).
        if len(up_ids) > 1:
            warnings.warn('%d UniProt IDs reported.' % len(up_ids))
        up_id = re.match(r'UP\:([A-Z0-9]*)', up_ids[0]).groups()[0]
        up_rdf = up_client.query_protein(up_id)
        # First try to get HGNC name
        hgnc_name = up_client.get_hgnc_name(up_rdf)
        if hgnc_name is not None:
            return self._get_valid_component_name(hgnc_name)
        # Next, try to get the gene name
        gene_name = up_client.get_gene_name(up_rdf)
        if gene_name is not None:
            return self._get_valid_component_name(gene_name)
    # By default, return the text of the name tag
    name_txt = name.text.strip('|')
    return self._get_valid_component_name(name_txt)
def _get_name_by_id(self, entity_id):
    """Return a valid component name for the TERM with id ``entity_id``.

    The TERM's ``dbid`` attribute is split on ``'|'``. If any entry starts
    with ``HGNC``, the first one is resolved with ``self._get_hgnc_name``.
    Otherwise, if any entry starts with ``UP``, the first one is queried
    through ``up_client`` for an HGNC name and then a gene name. If none
    of these yields a name, the text of the ``name`` child (stripped of
    ``'|'``) is used. Every returned name is passed through
    ``self._get_valid_component_name``.

    Parameters
    ----------
    entity_id : str
        Value of the ``id`` attribute of the TERM to look up in
        ``self.tree``.

    Returns
    -------
    The value returned by ``self._get_valid_component_name``, or ``''``
    (after a warning) if the TERM or its ``name`` child is missing.
    """
    entity_term = self.tree.find("TERM/[@id='%s']" % entity_id)
    if entity_term is None:
        # Without this guard the .find() call below raises AttributeError.
        warnings.warn('Term %s for entity not found' % entity_id)
        return ''
    name = entity_term.find("name")
    if name is None:
        warnings.warn('Entity without a name')
        return ''
    try:
        dbid = entity_term.attrib["dbid"]
    except KeyError:
        # Only a missing attribute means "no grounding".
        warnings.warn('No grounding information for %s' % name.text)
        return self._get_valid_component_name(name.text)

    dbids = dbid.split('|')
    hgnc_ids = [i for i in dbids if i.startswith('HGNC')]
    up_ids = [i for i in dbids if i.startswith('UP')]
    # TODO: handle protein families like 14-3-3 with IDs like
    # XFAM:PF00244.15, FA:00007
    if hgnc_ids:
        if len(hgnc_ids) > 1:
            warnings.warn('%d HGNC IDs reported.' % len(hgnc_ids))
        hgnc_id = re.match(r'HGNC\:([0-9]*)', hgnc_ids[0]).groups()[0]
        hgnc_name = self._get_hgnc_name(hgnc_id)
        return self._get_valid_component_name(hgnc_name)
    elif up_ids:
        # Report on the UniProt list (the original tested hgnc_ids here,
        # which is always empty in this branch).
        if len(up_ids) > 1:
            warnings.warn('%d UniProt IDs reported.' % len(up_ids))
        up_id = re.match(r'UP\:([A-Z0-9]*)', up_ids[0]).groups()[0]
        up_rdf = up_client.query_protein(up_id)
        # First try to get HGNC name
        hgnc_name = up_client.get_hgnc_name(up_rdf)
        if hgnc_name is not None:
            return self._get_valid_component_name(hgnc_name)
        # Next, try to get the gene name
        gene_name = up_client.get_gene_name(up_rdf)
        if gene_name is not None:
            return self._get_valid_component_name(gene_name)
    # By default, return the text of the name tag
    name_txt = name.text.strip('|')
    return self._get_valid_component_name(name_txt)
def _get_agent_from_entity(self, entity_id):
    """Build an Agent from the entity frame whose frame_id is ``entity_id``.

    The frame is looked up by running a query on ``self.tree``. The agent
    name defaults to the frame's ``text`` and is replaced by the HGNC name
    (or, failing that, the gene name) from ``up_client`` when the frame
    carries a ``uniprot`` cross-reference. Cross-references in recognized
    namespaces are collected into ``db_refs``, and the frame's
    ``modifications`` are turned into mutation and modification
    conditions.

    Parameters
    ----------
    entity_id : str
        The ``frame_id`` of the entity frame to look up.

    Returns
    -------
    An Agent, or None if the query returns None or yields no frame.
    """
    qstr = "$.entities.frames[(@.frame_id is '%s')]" % entity_id
    res = self.tree.execute(qstr)
    if res is None:
        return None
    try:
        # The next() builtin works with both the Python 2 and Python 3
        # iterator protocols, unlike calling res.next() directly.
        entity_term = next(res)
    except StopIteration:
        logger.debug(' %s is not an entity' % entity_id)
        return None
    # This is the default name, which can be overwritten
    # below for specific database entries
    agent_name = self._get_valid_name(entity_term['text'])
    db_refs = {}
    for xr in entity_term['xrefs']:
        ns = xr['namespace']
        if ns == 'uniprot':
            up_id = xr['id']
            db_refs['UP'] = up_id
            # Look up official names in UniProt
            hgnc_name = up_client.get_hgnc_name(up_id)
            if hgnc_name is not None:
                agent_name = self._get_valid_name(hgnc_name)
            else:
                gene_name = up_client.get_gene_name(up_id)
                if gene_name is not None:
                    agent_name = self._get_valid_name(gene_name)
        elif ns == 'interpro':
            db_refs['IP'] = xr['id']
        # NOTE(review): the slices below presumably drop a namespace
        # prefix from the id (e.g. 6 characters for chebi) -- confirm
        # against the reader's output format.
        elif ns == 'chebi':
            db_refs['CHEBI'] = xr['id'][6:]
        elif ns == 'go':
            db_refs['GO'] = xr['id'][3:]
        elif ns == 'hmdb':
            db_refs['HMDB'] = xr['id'][4:]
    db_refs['TEXT'] = entity_term['text']

    mod_terms = entity_term.get('modifications')
    mods = []
    muts = []
    if mod_terms is not None:
        for m in mod_terms:
            # Normalize once instead of lower-casing in every comparison
            mod_type = m['type'].lower()
            if mod_type == 'mutation':
                # Evidence is usually something like "V600E"
                # We could parse this to get the amino acid
                # change that happened.
                mutation_str = m.get('evidence')
                # TODO: sometimes mutation_str is "mutant", "Mutant",
                # "mutants" - this indicates that there is a mutation
                # but not the specific type. We should encode this
                # somehow as a "blank" mutation condition
                mut = self._parse_mutation(mutation_str)
                if mut is not None:
                    muts.append(mut)
            elif mod_type in ('phosphorylation', 'phosphorylated'):
                site = m.get('site')
                if site is not None:
                    mod_res, mod_pos = self._parse_site_text(site)
                    mod = ModCondition('phosphorylation', mod_res, mod_pos)
                    mods.append(mod)
                else:
                    mods.append(ModCondition('phosphorylation'))
            elif mod_type == 'ubiquitination':
                mods.append(ModCondition('ubiquitination'))
            else:
                logger.warning('Unhandled entity modification type: %s' %
                               m['type'])

    agent = Agent(agent_name, db_refs=db_refs, mods=mods, mutations=muts)
    return agent
def test_get_hgnc_name_nonhuman():
    """No HGNC name should be returned for UniProt ID P31938."""
    up_id = 'P31938'
    result = uniprot_client.get_hgnc_name(up_id)
    assert result is None
def test_get_hgnc_name_human():
    """UniProt ID P00533 should resolve to the HGNC name EGFR."""
    up_id = 'P00533'
    result = uniprot_client.get_hgnc_name(up_id)
    assert result == 'EGFR'
def test_query_protein_deprecated():
    """Q8NHX1 should still be queryable and resolve to the name MAPK3."""
    up_id = 'Q8NHX1'
    graph = uniprot_client.query_protein(up_id)
    assert graph is not None
    resolved_name = uniprot_client.get_hgnc_name(up_id)
    assert resolved_name == 'MAPK3'