def run_adeft_disambiguation(stmt, agent_list, idx, new_agent, agent_txt):
    """Re-ground an Agent in place using the Adeft model for its text.

    Parameters
    ----------
    stmt : object
        Statement the agent belongs to. The ``annotations`` dict of its
        first Evidence (if it has any evidence) receives the Adeft scores,
        and the statement is passed to ``_get_text_for_grounding`` to obtain
        the text used for disambiguation.
    agent_list : list
        The agents of the statement. Only its length is used here, to size
        the per-agent list of Adeft scores.
    idx : int
        Position of the agent within ``agent_list``.
    new_agent : object
        The agent to update. Its ``db_refs`` and ``name`` are overwritten
        when Adeft returns a grounding other than 'ungrounded'.
    agent_txt : str
        The agent text, used as the key into ``adeft_disambiguators``.

    Returns
    -------
    None
        The agent and the evidence annotations are modified in place.
    """
    # Initialize annotations if needed so Adeft predicted
    # probabilities can be added to Agent annotations
    annots = stmt.evidence[0].annotations if stmt.evidence else {}
    agent_annots = annots.setdefault('agents', {})
    if 'adeft' not in agent_annots:
        # One slot per agent. This must be a plain list in every case so that
        # the index assignment at the end of this function addresses the
        # agent's slot rather than creating a key in a nested dict.
        agent_annots['adeft'] = [None for _ in agent_list]

    grounding_text = _get_text_for_grounding(stmt, agent_txt)
    if not grounding_text:
        return

    res = adeft_disambiguators[agent_txt].disambiguate([grounding_text])
    ns_and_id, standard_name, disamb_scores = res[0]
    # If the highest score is ungrounded we don't do anything
    # TODO: should we explicitly remove grounding if we conclude it
    # doesn't match any of the choices?
    if ns_and_id == 'ungrounded':
        return
    db_ns, db_id = ns_and_id.split(':', maxsplit=1)
    new_agent.db_refs = {'TEXT': agent_txt, db_ns: db_id}
    new_agent.name = standard_name
    logger.info('Disambiguated %s to: %s, %s:%s' %
                (agent_txt, standard_name, db_ns, db_id))
    if db_ns == 'HGNC':
        hgnc_sym = hgnc_client.get_hgnc_name(db_id)
        GroundingMapper.standardize_agent_db_refs(new_agent,
                                                  {'HGNC': hgnc_sym},
                                                  do_rename=False)
    annots['agents']['adeft'][idx] = disamb_scores
def _get_hgnc_name(self, hgnc_id): try: hgnc_name = self._hgnc_cache[hgnc_id] except KeyError: hgnc_name = hgnc_client.get_hgnc_name(hgnc_id) self._hgnc_cache[hgnc_id] = hgnc_name return hgnc_name
def _make_db_refs(self, entrez_id, text_id):
    """Looks up the HGNC ID and name, as well as the Uniprot ID.

    Parameters
    ----------
    entrez_id : str
        Entrez gene ID.
    text_id : str or None
        A plain text systematic name, or None if not listed in the
        Biogrid data. The placeholder '-' is treated like None.

    Returns
    -------
    hgnc_name : str or None
        Official HGNC symbol for the gene, or None if the Entrez ID could
        not be mapped to an HGNC ID.
    db_refs : dict
        db_refs grounding dictionary, used when constructing the Agent
        object.
    """
    db_refs = {}
    if text_id != '-' and text_id is not None:
        db_refs['TEXT'] = text_id

    hgnc_name = None
    hgnc_id = hgnc_client.get_hgnc_from_entrez(entrez_id)
    # Only query the name and Uniprot ID once we know the Entrez ID mapped
    # to an HGNC ID; there is nothing to look up for a missing ID.
    if hgnc_id is not None:
        db_refs['HGNC'] = hgnc_id
        hgnc_name = hgnc_client.get_hgnc_name(hgnc_id)
        up_id = hgnc_client.get_uniprot_id(hgnc_id)
        if up_id is not None:
            db_refs['UP'] = up_id
    return (hgnc_name, db_refs)
def _get_agent_grounding(agent): """Convert an agent to the corresponding PyBEL DSL object (to be filled with variants later).""" def _get_id(_agent, key): _id = _agent.db_refs.get(key) if isinstance(_id, list): _id = _id[0] return _id hgnc_id = _get_id(agent, 'HGNC') if hgnc_id: hgnc_name = hgnc_client.get_hgnc_name(hgnc_id) if not hgnc_name: logger.warning('Agent %s with HGNC ID %s has no HGNC name.', agent, hgnc_id) return return protein('HGNC', hgnc_name) uniprot_id = _get_id(agent, 'UP') if uniprot_id: return protein('UP', uniprot_id) fplx_id = _get_id(agent, 'FPLX') if fplx_id: return protein('FPLX', fplx_id) pfam_id = _get_id(agent, 'PF') if pfam_id: return protein('PFAM', pfam_id) ip_id = _get_id(agent, 'IP') if ip_id: return protein('IP', ip_id) fa_id = _get_id(agent, 'FA') if fa_id: return protein('NXPFA', fa_id) chebi_id = _get_id(agent, 'CHEBI') if chebi_id: if chebi_id.startswith('CHEBI:'): chebi_id = chebi_id[len('CHEBI:'):] return abundance('CHEBI', chebi_id) pubchem_id = _get_id(agent, 'PUBCHEM') if pubchem_id: return abundance('PUBCHEM', pubchem_id) go_id = _get_id(agent, 'GO') if go_id: return bioprocess('GO', go_id) mesh_id = _get_id(agent, 'MESH') if mesh_id: return bioprocess('MESH', mesh_id) return
def _extract_protein(self, name, gene_id):
    """Build an Agent for a gene given its name and Entrez gene ID.

    The Entrez ID is always stored under 'EGID'. If it maps to an HGNC ID,
    the HGNC ID (and Uniprot ID, when found) are added and the agent is
    named by its HGNC name instead of the given name.
    """
    db_refs = {'EGID': gene_id}
    hgnc_id = hgnc_client.get_hgnc_from_entrez(gene_id)
    if hgnc_id is None:
        return Agent(name, db_refs=db_refs)

    db_refs['HGNC'] = hgnc_id
    uniprot_id = hgnc_client.get_uniprot_id(hgnc_id)
    if uniprot_id:
        db_refs['UP'] = uniprot_id
    # If there is a HGNC ID, we standardize the gene name
    standard_name = hgnc_client.get_hgnc_name(hgnc_id)
    return Agent(standard_name, db_refs=db_refs)
def get_db_refs(egid):
    """Map an Entrez gene ID to an HGNC name and HGNC/Uniprot db_refs.

    Returns a tuple ``(hgnc_name, db_refs)``. All three lookups (HGNC ID,
    HGNC name, Uniprot ID) must succeed; otherwise the failure is logged
    and ``(None, {})`` is returned.
    """
    hgnc_id = hgnc_client.get_hgnc_from_entrez(egid)
    if hgnc_id:
        symbol = hgnc_client.get_hgnc_name(hgnc_id)
        if symbol:
            uniprot_id = hgnc_client.get_uniprot_id(hgnc_id)
            if uniprot_id:
                return (symbol, {'HGNC': hgnc_id, 'UP': uniprot_id})
            logger.info("No Uniprot ID for EGID / HGNC ID / Symbol "
                        "%s / %s / %s" % (egid, hgnc_id, symbol))
        else:
            logger.info("No HGNC name for HGNC ID: %s" % hgnc_id)
    else:
        logger.info("No HGNC ID for Entrez ID: %s" % egid)
    return (None, {})
def go_gene_pair(stmt):
    """Extract a (process name, GO ID, gene symbol) triple from a statement.

    Agents grounded to HGNC supply the gene symbol; other agents grounded
    to GO supply the GO ID and process name. When several agents qualify,
    the last one of each kind wins. If either part is missing the result
    is ``(None, None, None)``.
    """
    go_id = None
    symbol = None
    for agent in stmt.agent_list():
        if agent is None:
            continue
        refs = agent.db_refs
        if 'HGNC' in refs:
            symbol = hgnc_client.get_hgnc_name(refs.get('HGNC'))
        elif 'GO' in refs:
            go_id = refs.get('GO')
            bp_name = agent.name
    if go_id is None or symbol is None:
        return (None, None, None)
    return (bp_name, go_id, symbol)
def get_grounding(self):
    """Return the preferred grounding of this object as a (namespace, ID) pair.

    Precedence is FPLX, then HGNC (reported by HGNC name), then UP. A human
    Uniprot ID is reported as ('HGNC', gene name) when a gene name is found;
    a non-human Uniprot ID is reported as ('UP', id). Returns (None, None)
    when none of these apply. List-valued db_refs entries contribute only
    their first element.
    """
    import indra.databases.hgnc_client as hgc
    import indra.databases.uniprot_client as upc

    refs = self.db_refs
    fplx_id = refs.get('FPLX')
    if fplx_id:
        return ('FPLX', fplx_id)

    hgnc_id = refs.get('HGNC')
    if hgnc_id:
        first_hgnc = hgnc_id[0] if isinstance(hgnc_id, list) else hgnc_id
        return ('HGNC', hgc.get_hgnc_name(str(first_hgnc)))

    up_id = refs.get('UP')
    if up_id:
        if isinstance(up_id, list):
            up_id = up_id[0]
        if not upc.is_human(up_id):
            return ('UP', up_id)
        # Human protein: report it by gene name; if no name is available
        # we fall through to the "no grounding" result below
        gene_name = upc.get_gene_name(up_id, web_fallback=False)
        if gene_name:
            return ('HGNC', gene_name)
    return (None, None)
def get_agent(concept, entity):
    """Create an Agent from a BEL concept URI and its entity URI.

    The term is extracted from ``concept`` and the namespace from
    ``entity``; the namespace decides how the agent name and db_refs are
    derived. An unhandled namespace is logged and printed, and yields an
    agent named after the raw term with empty db_refs.
    """
    term = term_from_uri(concept)
    namespace = namespace_from_uri(entity)
    refs = {}
    # The raw term is the name unless a branch below finds a better one
    agent_name = term

    if namespace == 'HGNC':
        hgnc_id = hgnc_client.get_hgnc_id(term)
        if hgnc_id is None:
            logger.warning("Couldn't get HGNC ID for HGNC symbol %s" % term)
        else:
            refs['HGNC'] = str(hgnc_id)
            uniprot_id = hgnc_client.get_uniprot_id(hgnc_id)
            if uniprot_id:
                refs['UP'] = uniprot_id
            else:
                logger.warning('HGNC entity %s with HGNC ID %s has no '
                               'corresponding Uniprot ID.' % (term, hgnc_id))
    elif namespace in ('MGI', 'RGD'):
        refs[namespace] = term
    elif namespace in ('PFH', 'SFAM'):
        mapped = bel_to_indra.get(term)
        refs[namespace] = term
        if mapped is None:
            logger.warning('Could not find mapping for BEL family: %s' % term)
        else:
            refs['BE'] = mapped
            refs['TEXT'] = term
            agent_name = mapped
    elif namespace in ('NCH', 'SCOMP'):
        mapped = bel_to_indra.get(term)
        refs[namespace] = term
        if mapped is None:
            logger.warning('Could not find mapping for BEL complex: %s' % term)
        else:
            refs['BE'] = mapped
            refs['TEXT'] = term
            agent_name = mapped
    elif namespace == 'CHEBI':
        chebi_id = chebi_name_id.get(term)
        if chebi_id:
            refs['CHEBI'] = chebi_id
        else:
            logger.warning('CHEBI name %s not found in map.' % term)
    elif namespace == 'EGID':
        hgnc_id = hgnc_client.get_hgnc_from_entrez(term)
        refs['EGID'] = term
        if hgnc_id is None:
            logger.warning('Could not map EGID%s to HGNC.' % term)
            agent_name = 'E%s' % term
        else:
            refs['HGNC'] = str(hgnc_id)
            agent_name = hgnc_client.get_hgnc_name(hgnc_id)
            uniprot_id = hgnc_client.get_uniprot_id(hgnc_id)
            if uniprot_id:
                refs['UP'] = uniprot_id
            else:
                logger.warning('HGNC entity %s with HGNC ID %s has no '
                               'corresponding Uniprot ID.' % (term, hgnc_id))
    else:
        logger.warning('Unhandled entity namespace: %s' % namespace)
        print('%s, %s' % (concept, entity))

    return Agent(agent_name, db_refs=refs)
def _urn_to_db_refs(urn): """Converts a Medscan URN to an INDRA db_refs dictionary with grounding information. Parameters ---------- urn : str A Medscan URN Returns ------- db_refs : dict A dictionary with grounding information, mapping databases to database identifiers. If the Medscan URN is not recognized, returns an empty dictionary. db_name : str The Famplex name, if available; otherwise the HGNC name if available; otherwise None """ # Convert a urn to a db_refs dictionary if urn is None: return {}, None m = URN_PATT.match(urn) if m is None: return None, None urn_type, urn_id = m.groups() db_refs = {} db_name = None # TODO: support more types of URNs if urn_type == 'agi-cas': # Identifier is CAS, convert to CHEBI chebi_id = get_chebi_id_from_cas(urn_id) if chebi_id: db_refs['CHEBI'] = 'CHEBI:%s' % chebi_id db_name = get_chebi_name_from_id(chebi_id) elif urn_type == 'agi-llid': # This is an Entrez ID, convert to HGNC hgnc_id = get_hgnc_from_entrez(urn_id) if hgnc_id is not None: db_refs['HGNC'] = hgnc_id # Convert the HGNC ID to a Uniprot ID uniprot_id = get_uniprot_id(hgnc_id) if uniprot_id is not None: db_refs['UP'] = uniprot_id # Try to lookup HGNC name; if it's available, set it to the # agent name db_name = get_hgnc_name(hgnc_id) elif urn_type in [ 'agi-meshdis', 'agi-ncimorgan', 'agi-ncimtissue', 'agi-ncimcelltype' ]: if urn_id.startswith('C') and urn_id[1:].isdigit(): # Identifier is probably UMLS db_refs['UMLS'] = urn_id else: # Identifier is MESH urn_mesh_name = unquote(urn_id) mesh_id, mesh_name = mesh_client.get_mesh_id_name(urn_mesh_name) if mesh_id: db_refs['MESH'] = mesh_id db_name = mesh_name else: db_name = urn_mesh_name elif urn_type == 'agi-gocomplex': # Identifier is GO db_refs['GO'] = 'GO:%s' % urn_id elif urn_type == 'agi-go': # Identifier is GO db_refs['GO'] = 'GO:%s' % urn_id # If we have a GO or MESH grounding, see if there is a corresponding # Famplex grounding db_sometimes_maps_to_famplex = ['GO', 'MESH'] for db in 
db_sometimes_maps_to_famplex: if db in db_refs: key = (db, db_refs[db]) if key in famplex_map: db_refs['FPLX'] = famplex_map[key] # If the urn corresponds to an eccode, groudn to famplex if that eccode # is in the Famplex equivalences table if urn.startswith('urn:agi-enz'): tokens = urn.split(':') eccode = tokens[2] key = ('ECCODE', eccode) if key in famplex_map: db_refs['FPLX'] = famplex_map[key] # If the Medscan URN itself maps to a Famplex id, add a Famplex grounding key = ('MEDSCAN', urn) if key in famplex_map: db_refs['FPLX'] = famplex_map[key] # If there is a Famplex grounding, use Famplex for entity name if 'FPLX' in db_refs: db_name = db_refs['FPLX'] elif 'GO' in db_refs: db_name = go_client.get_go_label(db_refs['GO']) return db_refs, db_name
cxa = CxAssembler(stmts) cxa.make_model(add_indra_json=False) cxa.save_model(save_file) return cxa if __name__ == '__main__': # Load NDEx credentials with open('ndex_cred.json', 'rt') as f: ndex_cred = json.load(f) # Get the network ncp = ndex_cx.process_ndex_network('df1fea48-8cfb-11e7-a10d-0ac135e8bacf', username=ndex_cred['user'], password=ndex_cred['password']) gene_names = [ hgnc_client.get_hgnc_name(ag.db_refs['HGNC']) for ag in ncp.get_agents() ] """ # Get PMIDs for reading entrez_pmids = get_pmids(gene_names) network_pmids = ncp.get_pmids() pmids = list(set(entrez_pmids + network_pmids)) save_pmids_for_reading(pmids, 'dna_damage_pmids.txt') """ # Build the model prior_stmts = build_prior(gene_names, 'prior_stmts.pkl') reach_stmts = ac.load_statements('reach_stmts.pkl') stmts = ncp.statements + reach_stmts + prior_stmts stmts = run_assembly(stmts, 'unfiltered_assembled_stmts.pkl')
def test_get_hgnc_name():
    """HGNC ID 3236 resolves to the symbol EGFR, which passes unicode_strs."""
    symbol = hgnc_client.get_hgnc_name('3236')
    assert symbol == 'EGFR'
    assert unicode_strs(symbol)
def _get_hgnc_name(hgnc_id):
    """Return the HGNC name for the given HGNC ID via ``hgnc_client``."""
    return hgnc_client.get_hgnc_name(hgnc_id)
def _get_agent_from_ref(self, ref): # TODO: handle collections if ref.attrib.get('category') == 'collection': #logger.warning('Skipping collection Agent.') return None # Find the name, uid and raw-text tags first and get their text # content if available uid_tag = ref.find("var/[@name='uid']") name_tag = ref.find("var/[@name='name']") text_tag = ref.find("var/[@name='raw-text']") if name_tag is not None and name_tag.text: name = name_tag.text else: name = None if uid_tag is not None and uid_tag.text: uid = uid_tag.text else: uid = None if text_tag is not None and text_tag.text: raw_text = text_tag.text else: raw_text = None db_refs = {} # Save raw text if available if raw_text: db_refs['TEXT'] = raw_text agent_name = raw_text # If we have a proper UID then we try to reconstruct an Agent from that if uid is not None and len(uid.split(':')) == 2: db_ns, db_id = uid.split(':') be_id = get_bioentities_mapping(db_ns, db_id) if be_id: db_refs[db_ns] = db_id db_refs['BE'] = be_id agent_name = be_id elif db_ns in ['UP', 'Uniprot']: db_refs['UP'] = db_id gene_name = uniprot_client.get_gene_name(db_id) if gene_name: agent_name = gene_name hgnc_id = hgnc_client.get_hgnc_id(gene_name) if hgnc_id: db_refs['HGNC'] = hgnc_id elif db_ns == 'NCIT': db_refs['NCIT'] = db_id target = ncit_map.get(db_id) if target: db_refs[target[0]] = target[1] if target[0] == 'HGNC': up_id = hgnc_client.get_uniprot_id(target[1]) agent_name = hgnc_client.get_hgnc_name(target[1]) if up_id: db_refs['UP'] = up_id elif target[0] == 'UP': agent_name = uniprot_client.get_gene_name(target[1]) if agent_name: hgnc_id = hgnc_client.get_hgnc_id(agent_name) if hgnc_id: db_refs['HGNC'] = hgnc_id elif db_ns == 'FA': db_refs['NXPFA'] = db_id elif db_ns == 'XFAM': db_refs['PF'] = db_id.split('.')[0] elif db_ns == 'CHEBI': db_refs['CHEBI'] = 'CHEBI:' + db_id elif db_ns in ['GO', 'MESH', 'BE']: db_refs[db_ns] = db_id elif db_ns in ['PR', 'CO', 'CVCL', 'EFO', 'ORPHANET']: db_refs[db_ns] = db_id else: 
logger.warning('Unknown database name space %s' % db_ns) if not agent_name: if raw_text is not None: agent_name = raw_text else: return None assert (agent_name) agent = Agent(agent_name, db_refs=db_refs) return agent
def test_get_hgnc_name_nonexistent():
    """Looking up an HGNC ID that does not exist yields None."""
    missing_name = hgnc_client.get_hgnc_name('123456')
    assert missing_name is None
def _get_agent(concept, entity):
    """Create an Agent from a BEL concept URI and its entity URI.

    The term is extracted from ``concept`` and the namespace from
    ``entity``; the namespace decides how the agent name and db_refs are
    derived. BEL families and complexes with a known mapping are grounded
    to FPLX. An unhandled namespace is logged and printed, and yields an
    agent named after the raw term with empty db_refs.
    """
    term = term_from_uri(concept)
    namespace = namespace_from_uri(entity)
    refs = {}
    # The raw term is the name unless a branch below finds a better one
    agent_name = term

    if namespace == 'HGNC':
        hgnc_id = hgnc_client.get_hgnc_id(term)
        if hgnc_id is None:
            logger.warning("Couldn't get HGNC ID for HGNC symbol %s" % term)
        else:
            refs['HGNC'] = str(hgnc_id)
            uniprot_id = hgnc_client.get_uniprot_id(hgnc_id)
            if uniprot_id:
                refs['UP'] = uniprot_id
            else:
                logger.warning('HGNC entity %s with HGNC ID %s has no '
                               'corresponding Uniprot ID.' % (term, hgnc_id))
    elif namespace in ('MGI', 'RGD'):
        refs[namespace] = term
    elif namespace in ('PFH', 'SFAM'):
        mapped = bel_to_indra.get(term)
        refs[namespace] = term
        if mapped is None:
            logger.warning('Could not find mapping for BEL family: %s' % term)
        else:
            refs['FPLX'] = mapped
            refs['TEXT'] = term
            agent_name = mapped
    elif namespace in ('NCH', 'SCOMP'):
        mapped = bel_to_indra.get(term)
        refs[namespace] = term
        if mapped is None:
            logger.warning('Could not find mapping for BEL complex: %s' % term)
        else:
            refs['FPLX'] = mapped
            refs['TEXT'] = term
            agent_name = mapped
    elif namespace == 'CHEBI':
        chebi_id = chebi_name_id.get(term)
        if chebi_id:
            refs['CHEBI'] = chebi_id
        else:
            logger.warning('CHEBI name %s not found in map.' % term)
    elif namespace == 'EGID':
        hgnc_id = hgnc_client.get_hgnc_from_entrez(term)
        refs['EGID'] = term
        if hgnc_id is None:
            logger.warning('Could not map EGID%s to HGNC.' % term)
            agent_name = 'E%s' % term
        else:
            refs['HGNC'] = str(hgnc_id)
            agent_name = hgnc_client.get_hgnc_name(hgnc_id)
            uniprot_id = hgnc_client.get_uniprot_id(hgnc_id)
            if uniprot_id:
                refs['UP'] = uniprot_id
            else:
                logger.warning('HGNC entity %s with HGNC ID %s has no '
                               'corresponding Uniprot ID.' % (term, hgnc_id))
    else:
        logger.warning('Unhandled entity namespace: %s' % namespace)
        print('%s, %s' % (concept, entity))

    return Agent(agent_name, db_refs=refs)
def test_get_hgnc_name_nonexistent():
    """Looking up an HGNC ID that does not exist yields None, and that
    result passes the unicode_strs check."""
    missing_name = hgnc_client.get_hgnc_name('123456')
    assert missing_name is None
    assert unicode_strs(missing_name)
def _get_db_refs(entity_term): agent_name = entity_term['text'] db_refs = {} for xr in entity_term['xrefs']: ns = xr['namespace'] if ns == 'uniprot': up_id = xr['id'] db_refs['UP'] = up_id # Look up official names in UniProt gene_name = up_client.get_gene_name(up_id) if gene_name is not None: agent_name = gene_name # If the gene name corresponds to an HGNC ID, add it to the # db_refs if up_client.is_human(up_id): hgnc_id = hgnc_client.get_hgnc_id(gene_name) if hgnc_id: db_refs['HGNC'] = hgnc_id elif ns == 'hgnc': hgnc_id = xr['id'] db_refs['HGNC'] = hgnc_id # Look up the standard gene symbol and set as name hgnc_name = hgnc_client.get_hgnc_name(hgnc_id) if hgnc_name: agent_name = hgnc_name # Look up the corresponding uniprot id up_id = hgnc_client.get_uniprot_id(hgnc_id) if up_id: db_refs['UP'] = up_id elif ns == 'pfam': be_id = famplex_map.get(('PF', xr['id'])) if be_id: db_refs['FPLX'] = be_id agent_name = be_id db_refs['PF'] = xr['id'] elif ns == 'interpro': be_id = famplex_map.get(('IP', xr['id'])) if be_id: db_refs['FPLX'] = be_id agent_name = be_id db_refs['IP'] = xr['id'] elif ns == 'chebi': db_refs['CHEBI'] = xr['id'] elif ns == 'pubchem': db_refs['PUBCHEM'] = xr['id'] elif ns == 'go': go_id = xr['id'] # Handle secondary to primary mapping if necessary pri = go_client.get_primary_id(go_id) if pri: go_id = pri db_refs['GO'] = go_id elif ns == 'mesh': db_refs['MESH'] = xr['id'] elif ns == 'hmdb': db_refs['HMDB'] = xr['id'] elif ns == 'simple_chemical': if xr['id'].startswith('HMDB'): db_refs['HMDB'] = xr['id'] elif ns == 'be': db_refs['FPLX'] = xr['id'] agent_name = db_refs['FPLX'] # These name spaces are ignored elif ns in ['uaz']: pass else: logger.warning('Unhandled xref namespace: %s' % ns) db_refs['TEXT'] = entity_term['text'] return agent_name, db_refs
def get_agent(node_data, node_modifier_data=None):
    """Build an Agent from PyBEL node data, or return None if unsupported.

    Parameters
    ----------
    node_data : dict
        Node data, read through the ``pc`` constants (function, namespace,
        name, identifier, members, fusion).
    node_modifier_data : object, optional
        Modifier data for the node; it is passed on to the helpers that
        extract the activity condition, the translocation target and the
        check for unhandled modifiers.

    Returns
    -------
    Agent or None
        None is returned for unhandled node functions, fusions, complexes
        without members or with a member that cannot be converted, nodes
        for which no identifier information could be obtained, and nodes
        with unhandled modifiers.

    Raises
    ------
    ValueError
        If an MGI or RGD node comes with an identifier.
    AssertionError
        If a node has neither an identifier nor a name, or if no gene name
        is found for a Uniprot identifier.
    """
    # FIXME: Handle translocations on the agent for ActiveForms, turn into
    # location conditions
    # Check the node type/function
    node_func = node_data[pc.FUNCTION]
    if node_func not in (pc.PROTEIN, pc.RNA, pc.BIOPROCESS, pc.COMPLEX,
                         pc.PATHOLOGY, pc.ABUNDANCE, pc.MIRNA):
        mod_data = node_modifier_data or 'No node data'
        logger.info("Nodes of type %s not handled: %s",
                    node_func, mod_data)
        return None
    # Skip gene/protein fusions
    if pc.FUSION in node_data:
        logger.info("Gene and protein fusions not handled: %s" % str(node_data))
        return None
    # COMPLEXES ------------
    # First, handle complexes, which will consist recursively of other agents
    if node_func == pc.COMPLEX:
        # First, check for members: if there are no members, we assume this
        # is a named complex
        members = node_data.get(pc.MEMBERS)
        if members is None:
            return None
        # Otherwise, get the "main" agent, to which the other members will be
        # attached as bound conditions
        main_agent = get_agent(members[0])
        # If we can't get the main agent, return None
        if main_agent is None:
            return None
        bound_conditions = [BoundCondition(get_agent(m), True)
                            for m in members[1:]]
        # Check the bound_conditions for any None agents
        if any([bc.agent is None for bc in bound_conditions]):
            return None
        main_agent.bound_conditions = bound_conditions
        # Get activity of main agent
        ac = _get_activity_condition(node_modifier_data)
        main_agent.activity = ac
        return main_agent
    # OTHER NODE TYPES -----
    # Get node identifier information
    name = node_data.get(pc.NAME)
    ns = node_data[pc.NAMESPACE]
    ident = node_data.get(pc.IDENTIFIER)
    # No ID present, get identifier using the name, namespace
    # (db_refs stays None if the namespace branch cannot produce grounding,
    # which makes the function return None further below)
    db_refs = None
    if not ident:
        assert name, "Node must have a name if lacking an identifier."
        if ns == 'HGNC':
            hgnc_id = hgnc_client.get_hgnc_id(name)
            if not hgnc_id:
                logger.info("Invalid HGNC name: %s (%s)" % (name, node_data))
                return None
            db_refs = {'HGNC': hgnc_id}
            up_id = _get_up_id(hgnc_id)
            if up_id:
                db_refs['UP'] = up_id
        # FIXME: Look up go ID in ontology lookup service
        # FIXME: Look up MESH IDs from name
        # FIXME: For now, just use node name
        elif ns in ('GOBP', 'MESHPP', 'MESHD'):
            db_refs = {}
        # For now, handle MGI/RGD but putting the name into the db_refs so
        # it's clear what namespace the name belongs to
        # FIXME: Full implementation would look up MGI/RGD identifiers from
        # the names, and obtain corresponding Uniprot IDs
        elif ns in ('MGI', 'RGD'):
            db_refs = {ns: name}
        # Map Selventa families to FamPlexes
        elif ns == 'SFAM':
            db_refs = {'SFAM': name}
            indra_name = bel_to_indra.get(name)
            if indra_name is None:
                logger.info('Could not find mapping for BEL/SFAM family: '
                            '%s (%s)' % (name, node_data))
            else:
                db_refs['FPLX'] = indra_name
                name = indra_name
        # Map Entrez genes to HGNC/UP
        elif ns == 'EGID':
            hgnc_id = hgnc_client.get_hgnc_from_entrez(name)
            db_refs = {'EGID': name}
            if hgnc_id is not None:
                db_refs['HGNC'] = hgnc_id
                name = hgnc_client.get_hgnc_name(hgnc_id)
                up_id = hgnc_client.get_uniprot_id(hgnc_id)
                if up_id:
                    db_refs['UP'] = up_id
                else:
                    logger.info('HGNC entity %s with HGNC ID %s has no '
                                'corresponding Uniprot ID.',
                                name, hgnc_id)
            else:
                logger.info('Could not map EGID%s to HGNC.' % name)
                name = 'E%s' % name
        # CHEBI
        elif ns == 'CHEBI':
            chebi_id = chebi_name_id.get(name)
            if chebi_id:
                db_refs = {'CHEBI': chebi_id}
            else:
                logger.info('CHEBI name %s not found in map.' % name)
        # SDIS, SCHEM: Include the name as the ID for the namespace
        elif ns in ('SDIS', 'SCHEM'):
            db_refs = {ns: name}
        else:
            print("Unhandled namespace: %s: %s (%s)" % (ns, name, node_data))
    # We've already got an identifier, look up other identifiers if necessary
    else:
        # Get the name, overwriting existing name if necessary
        if ns == 'HGNC':
            name = hgnc_client.get_hgnc_name(ident)
            db_refs = {'HGNC': ident}
            up_id = _get_up_id(ident)
            if up_id:
                db_refs['UP'] = up_id
        elif ns == 'UP':
            db_refs = {'UP': ident}
            name = uniprot_client.get_gene_name(ident)
            assert name
            if uniprot_client.is_human(ident):
                hgnc_id = hgnc_client.get_hgnc_id(name)
                if not hgnc_id:
                    logger.info('Uniprot ID linked to invalid human gene '
                                'name %s' % name)
                else:
                    db_refs['HGNC'] = hgnc_id
        elif ns in ('MGI', 'RGD'):
            raise ValueError('Identifiers for MGI and RGD databases are not '
                             'currently handled: %s' % node_data)
        else:
            print("Unhandled namespace with identifier: %s: %s (%s)" %
                  (ns, name, node_data))
    if db_refs is None:
        logger.info('Unable to get identifier information for node: %s',
                    node_data)
        return None
    # Get modification conditions
    mods, muts = _get_all_pmods(node_data)
    # Get activity condition
    ac = _get_activity_condition(node_modifier_data)
    to_loc = _get_translocation_target(node_modifier_data)
    # Check for unhandled node modifiers, skip if so
    if _has_unhandled_modifiers(node_modifier_data):
        return None
    # Make the agent
    ag = Agent(name, db_refs=db_refs, mods=mods, mutations=muts, activity=ac,
               location=to_loc)
    return ag
def analyze(filename):
    """Compare Complex statements loaded from a file against BioGrid.

    Statements are loaded with ``load_file``, grounding-mapped, renamed and
    de-duplicated. Complexes whose agents all have HGNC grounding are then
    combined with the BioGrid complexes among the same genes, and the unique
    statements are split by evidence source. As a side effect the complexes
    supported only by 'reach' evidence are written to
    'unmatched_complexes.tsv'.

    Parameters
    ----------
    filename : str
        File passed to ``load_file``; the result is expected to be a dict
        whose values are lists of statements.

    Returns
    -------
    dict
        Lists of statements under the keys 'indra_only', 'bg_only' and
        'indra_and_bg'.
    """
    results = load_file(filename)
    all_stmts = [stmt for paper_stmts in results.values()
                 for stmt in paper_stmts]

    # Map grounding
    logger.info('Mapping grounding...')
    gmap = gm.GroundingMapper(gm.default_grounding_map)
    map_stmts = gmap.map_agents(all_stmts)
    map_stmts = gmap.rename_agents(map_stmts)

    # Combine duplicates
    logger.info('Removing duplicates...')
    pa = Preassembler(hierarchies, map_stmts)
    pa.combine_duplicates()

    # Get complexes
    complexes = [s for s in pa.unique_stmts if isinstance(s, Complex)]
    # Keep only the complexes in which every agent has HGNC grounding
    protein_complexes = [s for s in complexes
                         if all('HGNC' in ag.db_refs
                                for ag in s.agent_list())]

    logger.info('Mapping gene IDs to gene symbols')
    gene_ids = list({ag.db_refs['HGNC'] for stmt in protein_complexes
                     for ag in stmt.members})
    genes = [hgnc_client.get_hgnc_name(gene_id) for gene_id in gene_ids]

    # Get complexes from BioGrid, querying a fixed number of genes at a time
    num_genes_per_query = 50
    bg_complexes = []
    for start in range(0, len(genes), num_genes_per_query):
        # Slicing clamps at the end of the list, so the last batch may be
        # shorter than num_genes_per_query
        batch = genes[start:start + num_genes_per_query]
        logger.info("Querying biogrid for %s" % str(batch))
        bg_complexes += bg.get_statements(batch)

    # Filter out Biogrid statements not involving genes in the gene list
    # (this will make duplicate removal more efficient). A set gives
    # constant-time membership tests inside the loop.
    gene_set = set(genes)
    bg_filt = [stmt for stmt in bg_complexes
               if stmt.members[0].name in gene_set
               and stmt.members[1].name in gene_set]
    # Might as well free up some memory
    del bg_complexes

    logger.info("Combining duplicates with biogrid...")
    pa = Preassembler(hierarchies, bg_filt + protein_complexes)
    pa.combine_duplicates()

    indra_only = []
    bg_only = []
    indra_and_bg = []
    for stmt in pa.unique_stmts:
        sources = {ev.source_api for ev in stmt.evidence}
        has_reach = 'reach' in sources
        has_biogrid = 'biogrid' in sources
        if has_reach and has_biogrid:
            indra_and_bg.append(stmt)
        elif has_reach:
            indra_only.append(stmt)
        elif has_biogrid:
            bg_only.append(stmt)

    rows = [[stmt.members[0].name, stmt.members[1].name,
             str(len(stmt.evidence))]
            for stmt in indra_only]
    write_unicode_csv('unmatched_complexes.tsv', rows, delimiter='\t')

    return {'indra_only': indra_only,
            'bg_only': bg_only,
            'indra_and_bg': indra_and_bg}
def _get_agent_from_entity(self, entity_id): qstr = "$.entities.frames[(@.frame_id is \'%s\')]" % entity_id res = self.tree.execute(qstr) if res is None: return None try: entity_term = next(res) except StopIteration: logger.debug(' %s is not an entity' % entity_id) return None # This is the default name, which can be overwritten # below for specific database entries agent_name = self._get_valid_name(entity_term['text']) db_refs = {} for xr in entity_term['xrefs']: ns = xr['namespace'] if ns == 'uniprot': up_id = xr['id'] db_refs['UP'] = up_id # Look up official names in UniProt gene_name = up_client.get_gene_name(up_id) if gene_name is not None: agent_name = self._get_valid_name(gene_name) # If the gene name corresponds to an HGNC ID, add it to the # db_refs hgnc_id = hgnc_client.get_hgnc_id(gene_name) if hgnc_id: db_refs['HGNC'] = hgnc_id elif ns == 'hgnc': hgnc_id = xr['id'] db_refs['HGNC'] = hgnc_id # Look up the standard gene symbol and set as name hgnc_name = hgnc_client.get_hgnc_name(hgnc_id) if hgnc_name: agent_name = hgnc_name # Look up the corresponding uniprot id up_id = hgnc_client.get_uniprot_id(hgnc_id) if up_id: db_refs['UP'] = up_id elif ns == 'pfam': be_id = bioentities_map.get(('PF', xr['id'])) if be_id: db_refs['BE'] = be_id db_refs['PF'] = xr['id'] elif ns == 'interpro': be_id = bioentities_map.get(('IP', xr['id'])) if be_id: db_refs['BE'] = be_id db_refs['PF'] = xr['id'] elif ns == 'chebi': db_refs['CHEBI'] = xr['id'] elif ns == 'pubchem': db_refs['PUBCHEM'] = 'PUBCHEM:%s' % xr['id'] elif ns == 'go': db_refs['GO'] = xr['id'] elif ns == 'mesh': db_refs['MESH'] = xr['id'] elif ns == 'hmdb': db_refs['HMDB'] = xr['id'] elif ns == 'simple_chemical': if xr['id'].startswith('HMDB'): db_refs['HMDB'] = xr['id'] elif ns == 'be': db_refs['BE'] = xr['id'] # These name spaces are ignored elif ns in ['uaz']: pass else: logger.warning('Unhandled xref namespace: %s' % ns) db_refs['TEXT'] = entity_term['text'] mod_terms = entity_term.get('modifications') mods = 
[] muts = [] if mod_terms is not None: for m in mod_terms: if m['type'].lower() == 'mutation': # Evidence is usualy something like "V600E" # We could parse this to get the amino acid # change that happened. mutation_str = m.get('evidence') # TODO: sometimes mutation_str is "mutant", "Mutant", # "mutants" - this indicates that there is a mutation # but not the specific type. We should encode this # somehow as a "blank" mutation condition mut = self._parse_mutation(mutation_str) if mut is not None: muts.append(mut) else: mc = self._get_mod_condition(m) if mc is not None: mods.append(mc) agent = Agent(agent_name, db_refs=db_refs, mods=mods, mutations=muts) return agent
def _fix_agent(agent): if agent is None: return # First we fix some name spaces db_refs_tmp = copy(agent.db_refs) for db_ns, db_id in agent.db_refs.items(): # Change FA name space if db_ns == 'FA': db_refs_tmp.pop('FA', None) db_refs_tmp['NXPFA'] = db_id # Change IPR name space elif db_ns == 'IPR': db_refs_tmp.pop('IPR', None) db_refs_tmp['IP'] = db_id # Change XFAM name space elif db_ns == 'XFAM': db_refs_tmp.pop('XFAM', None) db_refs_tmp['PF'] = db_id.split('.')[0] elif db_ns == 'GO': if db_id.startswith('GO:'): db_refs_tmp['GO'] = db_id else: db_refs_tmp['GO'] = 'GO:' + db_id # Change PCID name space elif db_ns == 'PCID': db_refs_tmp.pop('PCID', None) db_refs_tmp['PUBCHEM'] = db_id agent.db_refs = db_refs_tmp # Check if we have a FPLX entry and handle old BE mappings if 'BE' in agent.db_refs: agent.db_refs['FPLX'] = agent.db_refs.pop('BE') be_id = agent.db_refs.get('FPLX') # Try to map to FPLX from NXP, IPR, PF, NCIT if not be_id: for db_ns, db_id in agent.db_refs.items(): be_id = famplex_map.get((db_ns, db_id)) if be_id: break # Try mapping NCIT to specific genes if possible if not be_id and 'NCIT' in agent.db_refs: target = ncit_map.get(agent.db_refs['NCIT']) if target: agent.db_refs[target[0]] = target[1] # Check what entries we have up_id = agent.db_refs.get('UP') hgnc_id = agent.db_refs.get('HGNC') # FPLX takes precedence if we have it if be_id: agent.db_refs['FPLX'] = be_id agent.name = be_id elif hgnc_id: gene_name = hgnc_client.get_hgnc_name(hgnc_id) if gene_name: agent.name = gene_name if not up_id: up_id = hgnc_client.get_uniprot_id(hgnc_id) if up_id: agent.db_refs['UP'] = up_id elif up_id: gene_name = uniprot_client.get_gene_name(up_id) if gene_name: agent.name = gene_name hgnc_id = hgnc_client.get_hgnc_id(gene_name) if hgnc_id: agent.db_refs['HGNC'] = hgnc_id # If it doesn't have a gene name, it's better to just # use the raw string name otherwise Sparser sets # has Uniprot IDs or mnemonics as the name else: name = agent.db_refs.get('TEXT', 
agent.name) agent.name = name
# Load the statements whose agent groundings are to be collected
statement_list = ac.load_statements(args.input_file)

# Make a dictionary mapping the raw text mention to db_refs
logger.info('Extracting grounding information')
text_to_refs = {}
counter = 0
percent_done = 0
start_time = time.time()
for statement in statement_list:
    for a in statement.agent_list():
        # An agent list can contain None for missing agents; there is
        # no grounding to collect from those
        if a is None:
            continue
        # Work on a copy so the agent's own db_refs are left untouched
        db_refs = copy.copy(a.db_refs)
        text = db_refs.pop('TEXT', None)
        # Convert HGNC ids to names
        if 'HGNC' in db_refs and string_is_integer(db_refs['HGNC']):
            db_refs['HGNC'] = get_hgnc_name(db_refs['HGNC'])
        # Only record mentions that have some grounding besides the text
        if db_refs:
            text_to_refs[text] = db_refs
    # Report progress each time another whole percent of the statements
    # has been processed
    counter = counter + 1
    progress = math.floor(100.0 * float(counter) /
                          float(len(statement_list)))
    if progress > percent_done:
        percent_done = progress
        ellapsed_min = (time.time()-start_time) / 60.0
        logger.info(('%d%% done with processing statements '
                     '(%f minutes elapsed)') % (percent_done, ellapsed_min))
logger.info('\tDone!')
def _get_agent_from_ref(self, ref):
    """Return an Agent built from a ``ref`` XML element, or None.

    Parameters
    ----------
    ref : xml element
        The element describing the entity. Its 'category' attribute and its
        ``var`` children named 'uid', 'name' and 'raw-text' are read.

    Returns
    -------
    Agent or None
        An Agent named and grounded from the UID (given as ``NS:ID``) and
        the raw text. None is returned for collections and when no name
        can be determined.
    """
    # TODO: handle collections
    if ref.attrib.get('category') == 'collection':
        #logger.warning('Skipping collection Agent.')
        return None

    def _var_text(path):
        # Text content of the tag at the given path; None if the tag is
        # missing or has no text
        tag = ref.find(path)
        if tag is None or not tag.text:
            return None
        return tag.text

    uid = _var_text("var/[@name='uid']")
    # The name tag is read but not used below
    name = _var_text("var/[@name='name']")
    raw_text = _var_text("var/[@name='raw-text']")

    # TODO: factor this out and reuse fix_agents
    db_refs = {}
    # Save raw text if available
    if raw_text:
        db_refs['TEXT'] = raw_text
    agent_name = raw_text

    # If we have a proper UID then we try to reconstruct an Agent from that
    uid_parts = uid.split(':') if uid is not None else []
    if len(uid_parts) == 2:
        db_ns, db_id = uid_parts
        be_id = famplex_map.get((db_ns, db_id))
        if be_id:
            # A FamPlex mapping for the UID takes precedence over the rest
            db_refs[db_ns] = db_id
            db_refs['FPLX'] = be_id
            agent_name = be_id
        elif db_ns in ('UP', 'Uniprot'):
            db_refs['UP'] = db_id
            gene_name = uniprot_client.get_gene_name(db_id)
            if gene_name:
                agent_name = gene_name
                hgnc_id = hgnc_client.get_hgnc_id(gene_name)
                if hgnc_id:
                    db_refs['HGNC'] = hgnc_id
        elif db_ns == 'NCIT':
            db_refs['NCIT'] = db_id
            target = ncit_map.get(db_id)
            if target:
                target_ns = target[0]
                target_id = target[1]
                db_refs[target_ns] = target_id
                if target_ns == 'HGNC':
                    up_id = hgnc_client.get_uniprot_id(target_id)
                    agent_name = hgnc_client.get_hgnc_name(target_id)
                    if up_id:
                        db_refs['UP'] = up_id
                elif target_ns == 'UP':
                    agent_name = uniprot_client.get_gene_name(target_id)
                    if agent_name:
                        hgnc_id = hgnc_client.get_hgnc_id(agent_name)
                        if hgnc_id:
                            db_refs['HGNC'] = hgnc_id
        elif db_ns == 'FA':
            db_refs['NXP'] = 'FA:' + db_id
        elif db_ns == 'XFAM':
            db_refs['PF'] = db_id.split('.')[0]
        elif db_ns == 'CHEBI':
            db_refs['CHEBI'] = 'CHEBI:' + db_id
        # Handle old BE mappings and add them as FPLX
        elif db_ns == 'BE':
            db_refs['FPLX'] = db_id
        # Name spaces that are taken over as is
        elif db_ns in ('GO', 'MESH', 'FPLX',
                       'PR', 'CO', 'CVCL', 'EFO', 'ORPHANET'):
            db_refs[db_ns] = db_id
        else:
            logger.warning('Unknown database name space %s' % db_ns)

    # A lookup above may have left the name empty; fall back to the raw
    # text and give up if there is none
    if not agent_name:
        if raw_text is None:
            return None
        agent_name = raw_text

    assert agent_name
    return Agent(agent_name, db_refs=db_refs)
def test_get_hgnc_name_nonexistent():
    """Check that the name lookup for HGNC ID 123456 gives None."""
    missing_id = '123456'
    looked_up = hgnc_client.get_hgnc_name(missing_id)
    assert looked_up is None
    assert unicode_strs(looked_up)
def test_get_hgnc_name():
    """Check that HGNC ID 3236 resolves to the symbol EGFR."""
    # NOTE(review): a second function named test_get_hgnc_name is defined
    # further down in this file; if both really live in one module, the
    # later definition replaces this one -- confirm which is intended.
    egfr_id = '3236'
    symbol = hgnc_client.get_hgnc_name(egfr_id)
    assert symbol == 'EGFR'
def get_agent_from_entity_info(entity_info):
    """Return an INDRA Agent by processing an entity_info dict.

    Parameters
    ----------
    entity_info : dict
        A dict with the entries 'entityText', 'entityId', 'charStart' and
        'charEnd'. 'entityId' is either None or a list of dicts that each
        have a 'source' and an 'idString' entry (and an 'entityType' entry
        when the source is 'Unk').

    Returns
    -------
    agent : Agent or None
        The Agent with the groundings and mutations collected from the
        entries. None if there is more than one Entrez or more than one
        UniProt entry for the entity.
    raw_coords : tuple or None
        The ('charStart', 'charEnd') values of the entity, or None if no
        Agent was made.
    """
    # This will be the default name. If we get a gene name, it will
    # override this rawtext name.
    raw_text = entity_info['entityText']
    name = raw_text

    # Get the db refs.
    refs = {'TEXT': raw_text}

    entries = entity_info['entityId']
    if entries is None:
        entries = []

    # Ambiguous gene/protein groundings cannot be resolved here, skip them
    ref_counts = Counter(entry['source'] for entry in entries)
    for source, count in ref_counts.items():
        if source in ('Entrez', 'UniProt') and count > 1:
            logger.info('%s has %d entries for %s, skipping'
                        % (raw_text, count, source))
            return None, None

    muts = []
    for id_dict in entries:
        if id_dict['source'] == 'Entrez':
            refs['EGID'] = id_dict['idString']
            hgnc_id = hgnc_client.get_hgnc_from_entrez(id_dict['idString'])
            if hgnc_id is not None:
                # Check against what we may have already inferred from
                # UniProt. If it disagrees with this, let it be. Inference
                # from Entrez isn't as reliable.
                if 'HGNC' in refs:
                    if refs['HGNC'] != hgnc_id:
                        msg = ('HGNC:%s previously set does not'
                               ' match HGNC:%s from EGID:%s') % \
                            (refs['HGNC'], hgnc_id, refs['EGID'])
                        logger.info(msg)
                else:
                    refs['HGNC'] = hgnc_id
        elif id_dict['source'] == 'UniProt':
            refs['UP'] = id_dict['idString']
            hgnc_id = uniprot_client.get_hgnc_id(id_dict['idString'])
            if hgnc_id:
                # Check to see if we have a conflict with an HGNC id
                # found from the Entrez id. If so, overwrite with this
                # one, in which we have greater faith.
                if 'HGNC' in refs and refs['HGNC'] != hgnc_id:
                    # The first ID in the message is the one just inferred
                    # from UniProt, the second is the one set from Entrez
                    msg = ('Inferred HGNC:%s from UP:%s does not'
                           ' match HGNC:%s from EGID:%s') % \
                        (hgnc_id, refs['UP'], refs['HGNC'],
                         refs.get('EGID'))
                    logger.info(msg)
                refs['HGNC'] = hgnc_id
                # The name lookup can come back empty (None); in that case
                # keep the name we already have instead of overwriting it
                hgnc_name = hgnc_client.get_hgnc_name(hgnc_id)
                if hgnc_name is not None:
                    name = hgnc_name
            else:
                gene_name = uniprot_client.get_gene_name(id_dict['idString'])
                if gene_name is not None:
                    name = gene_name
        elif id_dict['source'] in ('Tax', 'NCBI'):
            refs['TAX'] = id_dict['idString']
        elif id_dict['source'] == 'CHEBI':
            refs['CHEBI'] = 'CHEBI:%s' % id_dict['idString']
        # These we take as is
        elif id_dict['source'] in ('MESH', 'OMIM', 'CTD'):
            refs[id_dict['source']] = id_dict['idString']
        # Handle mutations
        elif id_dict['source'] == 'Unk' and \
                id_dict['entityType'] == 'ProteinMutation':
            # {'idString': 'p|SUB|Y|268|A', 'source': 'Unk',
            #  'tool': 'PubTator', 'entityType': 'ProteinMutation'}
            # Mpk1(Y268A)'
            if id_dict['idString'].startswith('p|SUB|'):
                try:
                    # Handle special cases like p|SUB|A|30|P;RS#:104893878
                    parts = id_dict['idString'].split(';')[0].split('|')
                    residue_from, pos, residue_to = parts[2:5]
                    mut = MutCondition(pos, residue_from, residue_to)
                    muts.append(mut)
                # Deliberately broad: a malformed mutation string should be
                # logged and skipped, not abort the whole Agent
                except Exception:
                    logger.info('Could not process mutation %s' %
                                id_dict['idString'])
            else:
                logger.info('Unhandled mutation: %s' % id_dict['idString'])
        else:
            logger.warning("Unhandled id type: {source}={idString}"
                           .format(**id_dict))

    raw_coords = (entity_info['charStart'], entity_info['charEnd'])
    return Agent(name, db_refs=refs, mutations=muts), raw_coords
def test_get_hgnc_name():
    """Check that HGNC ID 3236 resolves to EGFR and passes unicode_strs."""
    # NOTE(review): an earlier function in this file has the same name; if
    # both really live in one module, this definition replaces it -- confirm.
    egfr_id = '3236'
    symbol = hgnc_client.get_hgnc_name(egfr_id)
    assert symbol == 'EGFR'
    assert unicode_strs(symbol)
def _urn_to_db_refs(urn):
    """Converts a Medscan URN to an INDRA db_refs dictionary with grounding
    information.

    Parameters
    ----------
    urn : str
        A Medscan URN

    Returns
    -------
    db_refs : dict or None
        A dictionary with grounding information, mapping databases to
        database identifiers. It is empty if the URN is None or if no
        grounding could be derived from it, and None if the URN does not
        match the expected URN pattern.
    db_name : str or None
        The Famplex name, if available; otherwise the name looked up for
        the grounding that was found (ChEBI, HGNC, MESH or GO), if any;
        otherwise None
    """
    if urn is None:
        return {}, None

    match = URN_PATT.match(urn)
    if match is None:
        return None, None
    urn_type, urn_id = match.groups()

    db_refs = {}
    db_name = None

    def _add_famplex(key):
        # Add a Famplex grounding if the key has a Famplex equivalence
        if key in famplex_map:
            db_refs['FPLX'] = famplex_map[key]

    # TODO: support more types of URNs
    if urn_type == 'agi-cas':
        # Identifier is CAS, convert to CHEBI
        chebi_id = get_chebi_id_from_cas(urn_id)
        if chebi_id:
            db_refs['CHEBI'] = 'CHEBI:%s' % chebi_id
            db_name = get_chebi_name_from_id(chebi_id)
    elif urn_type == 'agi-llid':
        # This is an Entrez ID, convert to HGNC
        hgnc_id = get_hgnc_from_entrez(urn_id)
        if hgnc_id is not None:
            db_refs['HGNC'] = hgnc_id

            # Convert the HGNC ID to a Uniprot ID
            uniprot_id = get_uniprot_id(hgnc_id)
            if uniprot_id is not None:
                db_refs['UP'] = uniprot_id

            # Try to lookup HGNC name; if it's available, set it to the
            # agent name
            db_name = get_hgnc_name(hgnc_id)
    elif urn_type in ('agi-meshdis', 'agi-ncimorgan', 'agi-ncimtissue',
                      'agi-ncimcelltype'):
        looks_like_umls = urn_id.startswith('C') and urn_id[1:].isdigit()
        if looks_like_umls:
            # Identifier is probably UMLS
            db_refs['UMLS'] = urn_id
        else:
            # Identifier is MESH
            urn_mesh_name = unquote(urn_id)
            mesh_id, mesh_name = mesh_client.get_mesh_id_name(urn_mesh_name)
            if mesh_id:
                db_refs['MESH'] = mesh_id
                db_name = mesh_name
            else:
                db_name = urn_mesh_name
    elif urn_type in ('agi-gocomplex', 'agi-go'):
        # Identifier is GO
        db_refs['GO'] = 'GO:%s' % urn_id

    # If we have a GO or MESH grounding, see if there is a corresponding
    # Famplex grounding
    for db in ('GO', 'MESH'):
        if db in db_refs:
            _add_famplex((db, db_refs[db]))

    # If the urn corresponds to an eccode, ground to famplex if that eccode
    # is in the Famplex equivalences table
    if urn.startswith('urn:agi-enz'):
        eccode = urn.split(':')[2]
        _add_famplex(('ECCODE', eccode))

    # If the Medscan URN itself maps to a Famplex id, add a Famplex grounding
    _add_famplex(('MEDSCAN', urn))

    # If there is a Famplex grounding, use Famplex for entity name
    if 'FPLX' in db_refs:
        db_name = db_refs['FPLX']
    elif 'GO' in db_refs:
        db_name = go_client.get_go_label(db_refs['GO'])

    return db_refs, db_name
def _make_agent(self, hprd_id, refseq_id=None): if hprd_id is None or hprd_id is nan: return None # Get the basic info (HGNC name/symbol, Entrez ID) from the # ID mappings dataframe try: egid = self.id_df.loc[hprd_id].EGID except KeyError: logger.info('HPRD ID %s not found in mappings table.' % hprd_id) return None if not egid: logger.info('No Entrez ID for HPRD ID %s' % hprd_id) return None # Get the HGNC ID hgnc_id = hgnc_client.get_hgnc_from_entrez(egid) # If we couldn't get an HGNC ID for the Entrez ID, this means that # the Entrez ID has been discontinued or replaced. if not hgnc_id: self.no_hgnc_for_egid.append(egid) return None # Get the (possibly updated) HGNC Symbol hgnc_name = hgnc_client.get_hgnc_name(hgnc_id) assert hgnc_name is not None # See if we can get a Uniprot ID from the HGNC symbol--if there is # a RefSeq ID we wil also try to use it to get an isoform specific # UP ID, but we will have this one to fall back on. But if we can't # get one here, then we skip the Statement up_id_from_hgnc = hgnc_client.get_uniprot_id(hgnc_id) if not up_id_from_hgnc: self.no_up_for_hgnc.append((egid, hgnc_name, hgnc_id)) return None # If we have provided the RefSeq ID, it's because we need to make # sure that we are getting the right isoform-specific ID (for sequence # positions of PTMs). Here we try to get the Uniprot ID from the # Refseq->UP mappings in the protmapper.uniprot_client. 
if refseq_id is not None: # Get the Uniprot IDs from the uniprot client up_ids = uniprot_client.get_ids_from_refseq(refseq_id, reviewed_only=True) # Nothing for this RefSeq ID (quite likely because the RefSeq ID # is obsolete; take the UP ID from HGNC if len(up_ids) == 0: self.no_up_for_refseq.append(refseq_id) up_id = up_id_from_hgnc # More than one reviewed entry--no thanks, we'll take the one from # HGNC instead elif len(up_ids) > 1: self.many_ups_for_refseq.append(refseq_id) up_id = up_id_from_hgnc # We got a unique, reviewed UP entry for the RefSeq ID else: up_id = up_ids[0] # If it's the canonical isoform, strip off the '-1' if up_id.endswith('-1'): up_id = up_id.split('-')[0] # For completeness, get the Refseq ID from the HPRD ID table else: refseq_id = self.id_df.loc[hprd_id].REFSEQ_PROTEIN up_id = up_id_from_hgnc # Make db_refs, return Agent db_refs = {'HGNC': hgnc_id, 'UP': up_id, 'EGID': egid, 'REFSEQ_PROT': refseq_id} return Agent(hgnc_name, db_refs=db_refs)
def get_db_refs_by_name(ns, name, node_data):
    """Return standard name and grounding based on a namespace and a name.

    Parameters
    ----------
    ns : str
        A name space in which the given name is interpreted.
    name : str
        The name in the given name space to get grounding for.
    node_data : dict
        Node data for logging purposes.

    Returns
    -------
    name : str
        The standardized name for the given entity. If no standardization
        applies, this is the name that was passed in.
    db_refs : dict or None
        The grounding for the given entity, or None if the name could not
        be grounded or the name space is not handled.
    """
    # Name spaces for which the name itself serves as the identifier.
    # FIXME: Full implementation for MGI/RGD would look up MGI/RGD
    # identifiers from the names, and obtain corresponding Uniprot IDs
    if ns in ('FPLX', 'MGI', 'RGD', 'SDIS', 'SCHEM', 'TEXT'):
        return name, {ns: name}

    db_refs = None
    if ns == 'HGNC':
        # Assumption: name is an HGNC symbol
        hgnc_id = hgnc_client.get_current_hgnc_id(name)
        if not hgnc_id:
            logger.info("Invalid HGNC name: %s (%s)" % (name, node_data))
            return name, None
        if isinstance(hgnc_id, list):
            logger.info('More than one current HGNC ID for %s, choosing %s'
                        % (name, hgnc_id[0]))
            hgnc_id = hgnc_id[0]
        name = hgnc_client.get_hgnc_name(hgnc_id)
        db_refs = {'HGNC': hgnc_id}
        up_id = _get_up_id(hgnc_id)
        if up_id:
            db_refs['UP'] = up_id
        mirbase_id = mirbase_client.get_mirbase_id_from_hgnc_id(hgnc_id)
        if mirbase_id:
            db_refs['MIRBASE'] = mirbase_id
    elif ns in ('UNIPROT', 'UP'):
        # This is a simple test to see if name is a valid UniProt ID,
        # if we can't get a mnemonic, we assume it's not a UP ID
        if uniprot_client.get_mnemonic(name, web_fallback=False):
            up_id = name
        # We next check if it's a mnemonic
        else:
            up_id = uniprot_client.get_id_from_mnemonic(name) or None
        if not up_id:
            logger.info('Couldn\'t get UP ID from %s' % name)
            return name, None
        db_refs = {'UP': up_id}
        hgnc_id = uniprot_client.get_hgnc_id(up_id)
        if hgnc_id:
            db_refs['HGNC'] = hgnc_id
            name = hgnc_client.get_hgnc_name(hgnc_id)
        else:
            name = uniprot_client.get_gene_name(up_id)
    elif ns in ('GO', 'GOBP', 'GOCC'):
        go_id = go_client.get_go_id_from_label(name)
        if not go_id:
            logger.info('Could not find GO ID for %s' % name)
            return name, None
        db_refs = {'GO': go_id}
        name = go_client.get_go_label(go_id)
    elif ns in ('MESHPP', 'MESHD', 'MESH'):
        mesh_id, mesh_name = mesh_client.get_mesh_id_name(name)
        if not mesh_id:
            logger.info('Could not find MESH ID from %s' % name)
            return name, None
        name = mesh_name
        db_refs = {'MESH': mesh_id}
    # Map Selventa families to FamPlexes
    elif ns == 'SFAM':
        db_refs = {'SFAM': name}
        indra_name = bel_to_indra.get(name)
        if indra_name is not None:
            db_refs['FPLX'] = indra_name
            name = indra_name
        else:
            logger.info('Could not find mapping for BEL/SFAM family: '
                        '%s (%s)' % (name, node_data))
    # Map Entrez genes to HGNC/UP
    elif ns in ('EGID', 'ENTREZ', 'NCBIGENE'):
        hgnc_id = hgnc_client.get_hgnc_from_entrez(name)
        db_refs = {'EGID': name}
        if hgnc_id is None:
            logger.info('Could not map EGID%s to HGNC.' % name)
            name = 'E%s' % name
        else:
            db_refs['HGNC'] = hgnc_id
            name = hgnc_client.get_hgnc_name(hgnc_id)
            up_id = hgnc_client.get_uniprot_id(hgnc_id)
            if up_id:
                db_refs['UP'] = up_id
            else:
                logger.info('HGNC entity %s with HGNC ID %s has no '
                            'corresponding Uniprot ID.',
                            name, hgnc_id)
            mirbase_id = mirbase_client.get_mirbase_id_from_hgnc_id(hgnc_id)
            if mirbase_id:
                db_refs['MIRBASE'] = mirbase_id
    elif ns == 'MIRBASE':
        mirbase_id = mirbase_client.get_mirbase_id_from_mirbase_name(name)
        if not mirbase_id:
            logger.info('Could not map miRBase name %s to ID', name)
            return name, None
        db_refs = {'MIRBASE': mirbase_id}
        hgnc_id = mirbase_client.get_hgnc_id_from_mirbase_id(mirbase_id)
        if hgnc_id:
            db_refs['HGNC'] = hgnc_id
            name = hgnc_client.get_hgnc_name(hgnc_id)
    # CHEBI
    elif ns == 'CHEBI':
        # We first look up BEL's own namespace map for ChEBI names to IDs
        # and if that fails, we look up INDRA's ChEBI name to ID mapping
        chebi_id = (chebi_name_id.get(name)
                    or chebi_client.get_chebi_id_from_name(name))
        if chebi_id:
            db_refs = {'CHEBI': chebi_id}
        else:
            logger.info('CHEBI name %s not found in map.' % name)
    elif ns == 'TAX':
        tid = taxonomy_client.get_taxonomy_id(name)
        if tid:
            db_refs = {'TAXONOMY': tid}
        else:
            logger.info('Could not get taxonomy ID for %s' % name)
    else:
        logger.info("Unhandled namespace: %s: %s (%s)" % (ns, name,
                                                          node_data))
    return name, db_refs