def _get_location_by_id(self, loc_id):
    """Return the GO label of the location entity with the given frame ID.

    The entity frame is looked up in ``self.tree``. A GO ID is taken from
    the frame's cross-references if one is present; otherwise the frame's
    text is grounded to a GO ID by label or synonym.

    Parameters
    ----------
    loc_id : str
        The frame ID of the location entity.

    Returns
    -------
    str or None
        The GO label of the location, or None if the frame cannot be
        found, cannot be grounded to a GO ID, or the GO ID has no label.
    """
    qstr = "$.entities.frames[(@.frame_id is \'%s\')]" % loc_id
    res = self.tree.execute(qstr)
    if res is None:
        return None
    try:
        entity_term = next(res)
    except StopIteration:
        logger.debug(' %s is not an entity' % loc_id)
        return None
    name = entity_term.get('text')
    go_id = None
    # A frame without cross-references is treated as having none rather
    # than raising a KeyError. If several GO xrefs exist, the last one wins.
    for xr in entity_term.get('xrefs') or []:
        if xr['namespace'] == 'go':
            go_id = xr['id']
    # If there is no GO ID, we try to "ground" the name to an ID; this is
    # only possible when the frame actually carries a text.
    if go_id is None and name is not None:
        go_id = go_client.get_go_id_from_label_or_synonym(name.lower())
    if go_id is None:
        return None
    # Try to get a valid location based on the GO ID; an empty label is
    # reported as None.
    return go_client.get_go_label(go_id) or None
def standardize_agent_name(agent, standardize_refs=True):
    """Standardize the name of an Agent based on grounding information.

    The namespace and ID returned by ``agent.get_grounding()`` decide which
    name is used. For FPLX the FamPlex ID itself becomes the name. For
    HGNC, UP, CHEBI, MESH and GO the name is looked up through the
    corresponding client, and the Agent's name is replaced only if the
    lookup returns a non-empty value. For any other namespace, or if there
    is no grounding, the name is not changed.

    Parameters
    ----------
    agent : indra.statements.Agent
        An INDRA Agent whose name attribute should be standardized based
        on grounding information. The Agent is modified in place.
    standardize_refs : Optional[bool]
        If True, the Agent's db_refs are first passed through
        GroundingMapper.standardize_db_refs. Default: True

    Returns
    -------
    None
    """
    # We return immediately for None Agents
    if agent is None:
        return
    if standardize_refs:
        agent.db_refs = GroundingMapper.standardize_db_refs(agent.db_refs)
    # We next look for prioritized grounding, if missing, we return
    db_ns, db_id = agent.get_grounding()
    if not db_ns or not db_id:
        return

    # If there's a FamPlex ID, prefer that for the name
    if db_ns == 'FPLX':
        agent.name = agent.db_refs['FPLX']
        return

    if db_ns == 'HGNC':
        # The value returned by get_grounding is used for the lookup here,
        # unlike the branches below which read from db_refs.
        new_name = hgnc_client.get_hgnc_name(db_id)
    elif db_ns == 'UP':
        # Try for the gene name
        new_name = uniprot_client.get_gene_name(agent.db_refs['UP'],
                                                web_fallback=False)
    elif db_ns == 'CHEBI':
        new_name = chebi_client.get_chebi_name_from_id(agent.db_refs['CHEBI'])
    elif db_ns == 'MESH':
        new_name = mesh_client.get_mesh_name(agent.db_refs['MESH'], False)
    elif db_ns == 'GO':
        new_name = go_client.get_go_label(agent.db_refs['GO'])
    else:
        new_name = None

    # Never overwrite the existing name with an empty lookup result; this
    # now also holds for HGNC, which previously assigned unconditionally.
    if new_name:
        agent.name = new_name
    return
def generate_adeft_terms():
    """Return Terms for the groundings of all available Adeft shortforms.

    Each available shortform's disambiguator is loaded and every grounding
    of the form ``NAMESPACE:ID`` is turned into a synonym Term whose text
    is the shortform. Groundings that are ``'ungrounded'``, that contain no
    colon, or whose namespace is not one of HGNC, GO, MESH, CHEBI, FPLX or
    UP are skipped (the last case with a warning).

    Returns
    -------
    list of Term
        The de-duplicated terms, sorted by normalized text, then shortform,
        namespace and ID so that the order is reproducible across runs.
    """
    from adeft import available_shortforms
    from adeft.disambiguate import load_disambiguator
    all_term_args = set()
    for shortform in available_shortforms:
        da = load_disambiguator(shortform)
        for grounding in da.names.keys():
            if grounding == 'ungrounded' or ':' not in grounding:
                continue
            db_ns, db_id = grounding.split(':', maxsplit=1)
            if db_ns == 'HGNC':
                standard_name = hgnc_client.get_hgnc_name(db_id)
            elif db_ns == 'GO':
                standard_name = go_client.get_go_label(db_id)
            elif db_ns == 'MESH':
                standard_name = mesh_client.get_mesh_name(db_id)
            elif db_ns == 'CHEBI':
                standard_name = chebi_client.get_chebi_name_from_id(db_id)
            elif db_ns == 'FPLX':
                standard_name = db_id
            elif db_ns == 'UP':
                standard_name = uniprot_client.get_gene_name(db_id)
            else:
                logger.warning('Unknown grounding namespace from Adeft: %s'
                               % db_ns)
                continue
            term_args = (normalize(shortform), shortform, db_ns, db_id,
                         standard_name, 'synonym', 'adeft')
            all_term_args.add(term_args)
    # Sorting a set on the normalized text alone leaves entries that share
    # it in hash-dependent order. The first four fields are all strings, so
    # they give a total, run-independent order without comparing the
    # standard name, which is a lookup result and may not be a string.
    terms = [Term(*term_args)
             for term_args in sorted(all_term_args, key=lambda x: x[:4])]
    return terms
def generate_famplex_terms(ignore_mappings=False):
    """Return Terms built from the FamPlex grounding map.

    Each row of ``famplex/grounding_map.csv`` holds a text followed by
    alternating namespace/ID columns. One Term is produced per row from the
    first namespace present in this order: FPLX, HGNC, UP, CHEBI, GO, MESH.
    Rows with none of these namespaces are skipped.

    Parameters
    ----------
    ignore_mappings : Optional[bool]
        If True, MESH groundings are kept as MESH even when an entry for
        the ID exists in ``mesh_mappings``. Default: False

    Returns
    -------
    list of Term
        The terms, in the order of the rows of the grounding map.
    """
    fname = os.path.join(indra_resources, 'famplex', 'grounding_map.csv')
    logger.info('Loading %s' % fname)
    terms = []
    for row in read_csv(fname, delimiter=','):
        txt = row[0]
        norm_txt = normalize(txt)
        # Pair up namespace and ID columns, dropping empty cells
        groundings = {}
        for ref_ns, ref_id in zip(row[1::2], row[2::2]):
            if ref_ns and ref_id:
                groundings[ref_ns] = ref_id

        if 'FPLX' in groundings:
            fplx_id = groundings['FPLX']
            term = Term(norm_txt, txt, 'FPLX', fplx_id, fplx_id,
                        'assertion', 'famplex')
        elif 'HGNC' in groundings:
            # The map's HGNC entry is used as the name and converted to an
            # ID for the Term
            hgnc_entry = groundings['HGNC']
            term = Term(norm_txt, txt, 'HGNC',
                        hgnc_client.get_hgnc_id(hgnc_entry), hgnc_entry,
                        'assertion', 'famplex', '9606')
        elif 'UP' in groundings:
            db = 'UP'
            db_id = groundings['UP']
            name = db_id
            organism = None
            if uniprot_client.is_human(db_id):
                organism = '9606'
                hgnc_id = uniprot_client.get_hgnc_id(db_id)
                if hgnc_id:
                    # Human proteins with an HGNC ID are represented as
                    # HGNC entries
                    name = hgnc_client.get_hgnc_name(hgnc_id)
                    db = 'HGNC'
                    db_id = hgnc_id
                else:
                    logger.warning('No gene name for %s' % db_id)
            # TODO: should we add organism info here?
            term = Term(norm_txt, txt, db, db_id, name, 'assertion',
                        'famplex', organism)
        elif 'CHEBI' in groundings:
            chebi_id = groundings['CHEBI']
            # The first six characters of the ID are dropped for the lookup
            name = chebi_client.get_chebi_name_from_id(chebi_id[6:])
            term = Term(norm_txt, txt, 'CHEBI', chebi_id, name,
                        'assertion', 'famplex')
        elif 'GO' in groundings:
            go_id = groundings['GO']
            term = Term(norm_txt, txt, 'GO', go_id,
                        go_client.get_go_label(go_id),
                        'assertion', 'famplex')
        elif 'MESH' in groundings:
            mesh_id = groundings['MESH']
            mesh_mapping = mesh_mappings.get(mesh_id)
            if mesh_mapping and not ignore_mappings:
                db, db_id, name = mesh_mapping
            else:
                db, db_id, name = \
                    'MESH', mesh_id, mesh_client.get_mesh_name(mesh_id)
            term = Term(norm_txt, txt, db, db_id, name,
                        'assertion', 'famplex')
        else:
            # TODO: handle HMDB, PUBCHEM, CHEMBL
            continue
        terms.append(term)
    return terms
def _get_go_type(go_id):
    """Return an entity type string for a GO term based on its namespace.

    Parameters
    ----------
    go_id : str
        The GO ID to classify.

    Returns
    -------
    str or None
        'protein_family_complex' for a cellular component whose label
        contains 'complex', 'cellular_location' for any other cellular
        component, 'biological_process' for biological processes and
        molecular functions, and None for any other namespace.
    """
    from indra.databases import go_client
    go_namespace = go_client.get_namespace(go_id)
    if go_namespace == 'cellular_component':
        # The label is only needed to tell complexes from locations. A
        # missing label must not raise on the substring test, so the term
        # is then treated as a plain location.
        term_name = go_client.get_go_label(go_id)
        if term_name and 'complex' in term_name:
            return 'protein_family_complex'
        return 'cellular_location'
    if go_namespace in {'biological_process', 'molecular_function'}:
        return 'biological_process'
    return None
def get_translocates(self):
    """Extract Translocation statements from the index cards.

    Every card in ``self.index_cards`` whose interaction type is
    'translocates' is turned into a Translocation of its ``participant_b``
    agent. Location IDs that are present are converted to GO labels, and a
    card is skipped when neither a from- nor a to-location results. The
    statements are appended to ``self.statements``.
    """
    for index_card in self.index_cards:
        interaction = index_card.get('interaction')
        if interaction['interaction_type'] != 'translocates':
            continue
        evidence = self._get_evidence(index_card)
        moved_agent = self._get_agent(interaction.get('participant_b'))
        source_id = interaction.get('from_location_id')
        target_id = interaction.get('to_location_id')
        # Only look up labels for locations that were actually given
        source = go_client.get_go_label(source_id) if source_id else source_id
        target = go_client.get_go_label(target_id) if target_id else target_id
        if source or target:
            self.statements.append(
                Translocation(moved_agent, source, target, evidence=evidence))
def add_agent_node(self, agent):
    """Add a node corresponding to an INDRA Agent.

    If the Agent has a GO grounding, the node is keyed by the GO ID (with
    a 'GO:' prefix added if it is missing) and named with the GO label.
    Otherwise the node is keyed and named by the Agent's name. In both
    cases the Agent's db_refs are stored as node attributes and the key is
    recorded in ``self.indra_nodes``.

    Parameters
    ----------
    agent : indra.statements.Agent
        The Agent to add to the graph.

    Returns
    -------
    str
        The key of the node that was added.
    """
    refs = agent.db_refs
    go_ref = refs.get('GO')
    if not go_ref:
        key = agent.name
        self.graph.add_node(key, name=agent.name, **refs, source='indra')
    else:
        key = go_ref if go_ref.startswith('GO:') else 'GO:%s' % go_ref
        label = go_client.get_go_label(key)
        self.graph.add_node(key, name=label, source='indra', **refs)
    self.indra_nodes.add(key)
    return key
def test_go_id_lookup():
    """Check that a known GO ID resolves to its expected label."""
    expected_label = 'establishment of T cell polarity'
    assert go_client.get_go_label('GO:0001768') == expected_label
def test_invalid_id():
    """Check that the label lookup returns None for a malformed ID."""
    assert go_client.get_go_label('34jkgfh') is None
def get_db_refs_by_name(ns, name, node_data):
    """Return standard name and grounding based on a namespace and a name.

    Parameters
    ----------
    ns : str
        A name space in which the given name is interpreted.
    name : str
        The name in the given name space to get grounding for.
    node_data : dict
        Node data for logging purposes.

    Returns
    -------
    name : str
        The standardized name for the given entity. This is the input name
        wherever no standard name is looked up.
    db_refs : dict or None
        The grounding for the given entity, or None if the namespace is
        not handled or no grounding could be determined.
    """
    db_refs = None

    if ns == 'HGNC':
        # Assumption: name is an HGNC symbol
        current_id = hgnc_client.get_current_hgnc_id(name)
        if not current_id:
            logger.info("Invalid HGNC name: %s (%s)" % (name, node_data))
            return name, None
        if isinstance(current_id, list):
            logger.info('More than one current HGNC ID for %s, choosing %s' %
                        (name, current_id[0]))
            current_id = current_id[0]
        name = hgnc_client.get_hgnc_name(current_id)
        db_refs = {'HGNC': current_id}
        uniprot_id = _get_up_id(current_id)
        if uniprot_id:
            db_refs['UP'] = uniprot_id
        mirbase_ref = mirbase_client.get_mirbase_id_from_hgnc_id(current_id)
        if mirbase_ref:
            db_refs['MIRBASE'] = mirbase_ref

    elif ns in ('UNIPROT', 'UP'):
        # This is a simple test to see if name is a valid UniProt ID,
        # if we can't get a mnemonic, we assume it's not a UP ID
        if uniprot_client.get_mnemonic(name, web_fallback=False):
            uniprot_id = name
        else:
            # We next check if it's a mnemonic
            uniprot_id = uniprot_client.get_id_from_mnemonic(name) or None
        if not uniprot_id:
            logger.info('Couldn\'t get UP ID from %s' % name)
            return name, None
        db_refs = {'UP': uniprot_id}
        hgnc_ref = uniprot_client.get_hgnc_id(uniprot_id)
        if hgnc_ref:
            db_refs['HGNC'] = hgnc_ref
            name = hgnc_client.get_hgnc_name(hgnc_ref)
        else:
            name = uniprot_client.get_gene_name(uniprot_id)

    elif ns == 'FPLX':
        db_refs = {'FPLX': name}

    elif ns in ('GO', 'GOBP', 'GOCC'):
        # This one label is replaced before the lookup
        if name == 'cell proliferation':
            name = 'cell population proliferation'
        go_ref = go_client.get_go_id_from_label(name)
        if not go_ref:
            logger.info('Could not find GO ID for %s' % name)
            return name, None
        db_refs = {'GO': go_ref}
        name = go_client.get_go_label(go_ref)

    elif ns in ('MESHPP', 'MESHD', 'MESH'):
        mesh_ref, mesh_label = mesh_client.get_mesh_id_name(name)
        if not mesh_ref:
            logger.info('Could not find MESH ID from %s' % name)
            return name, None
        name = mesh_label
        db_refs = {'MESH': mesh_ref}

    # For now, MGI/RGD names are only mapped to UniProt IDs through the
    # mouse/rat lookup tables; names missing from the tables stay
    # ungrounded.
    # FIXME: Full implementation would look up MGI/RGD identifiers from
    # the names, and obtain corresponding Uniprot IDs
    elif ns in ('MGI', 'RGD'):
        lookup = mouse_lookup if ns == 'MGI' else rat_lookup
        uniprot_id = lookup.get(name)
        if uniprot_id:
            db_refs = {'UP': uniprot_id}

    # Map Selventa families and complexes to FamPlex
    elif ns == 'SFAM':
        db_refs = {'SFAM': name}
        fplx_name = bel_to_indra.get(name)
        if fplx_name is not None:
            db_refs['FPLX'] = fplx_name
            name = fplx_name
        else:
            logger.info('Could not find mapping for BEL/SFAM family: '
                        '%s (%s)' % (name, node_data))

    elif ns == 'SCOMP':
        db_refs = {'SCOMP': name}
        fplx_name = bel_to_indra.get(name)
        if fplx_name is not None:
            db_refs['FPLX'] = fplx_name
            name = fplx_name
        else:
            logger.info('Could not find mapping for BEL/SCOMP complex: '
                        '%s (%s)' % (name, node_data))

    # Map Entrez genes to HGNC/UP
    elif ns in ('EGID', 'ENTREZ', 'NCBIGENE'):
        hgnc_ref = hgnc_client.get_hgnc_from_entrez(name)
        db_refs = {'EGID': name}
        if hgnc_ref is None:
            logger.debug('Could not map EGID%s to HGNC.' % name)
            name = 'E%s' % name
        else:
            db_refs['HGNC'] = hgnc_ref
            name = hgnc_client.get_hgnc_name(hgnc_ref)
            uniprot_id = hgnc_client.get_uniprot_id(hgnc_ref)
            if uniprot_id:
                db_refs['UP'] = uniprot_id
            else:
                logger.info(
                    'HGNC entity %s with HGNC ID %s has no '
                    'corresponding Uniprot ID.',
                    name, hgnc_ref)
            mirbase_ref = mirbase_client.get_mirbase_id_from_hgnc_id(hgnc_ref)
            if mirbase_ref:
                db_refs['MIRBASE'] = mirbase_ref

    elif ns == 'MIRBASE':
        mirbase_ref = mirbase_client.get_mirbase_id_from_mirbase_name(name)
        if not mirbase_ref:
            logger.info('Could not map miRBase name %s to ID', name)
            return name, None
        db_refs = {'MIRBASE': mirbase_ref}
        hgnc_ref = mirbase_client.get_hgnc_id_from_mirbase_id(mirbase_ref)
        if hgnc_ref:
            db_refs['HGNC'] = hgnc_ref
            name = hgnc_client.get_hgnc_name(hgnc_ref)

    # CHEBI
    elif ns == 'CHEBI':
        # We first look up BEL's own namespace map for ChEBI names to IDs
        chebi_ref = chebi_name_id.get(name)
        # If that fails, we look up INDRA's ChEBI name to ID mapping
        if not chebi_ref:
            chebi_ref = chebi_client.get_chebi_id_from_name(name)
        if not chebi_ref:
            logger.info('CHEBI name %s not found in map.' % name)
        else:
            db_refs = {'CHEBI': chebi_ref}

    # These appear in the name slot but are actually IDs
    elif ns == 'CHEBIID':
        chebi_ref = identifiers.ensure_chebi_prefix(name)
        db_refs = {'CHEBI': chebi_ref}
        name = chebi_client.get_chebi_name_from_id(chebi_ref)

    # SDIS, SCHEM: Include the name as the ID for the namespace
    elif ns in ('SDIS', 'SCHEM', 'TEXT'):
        db_refs = {ns: name}

    elif ns == 'TAX':
        taxonomy_id = taxonomy_client.get_taxonomy_id(name)
        if not taxonomy_id:
            logger.info('Could not get taxonomy ID for %s' % name)
        else:
            db_refs = {'TAXONOMY': taxonomy_id}

    else:
        logger.info("Unhandled namespace: %s: %s (%s)" % (ns, name,
                                                          node_data))

    return name, db_refs
def _urn_to_db_refs(urn):
    """Convert a Medscan URN to an INDRA db_refs dictionary and a name.

    Parameters
    ----------
    urn : str
        A Medscan URN

    Returns
    -------
    db_refs : dict or None
        A dictionary with grounding information, mapping databases to
        database identifiers. An empty dictionary is returned if the URN
        is None, and None is returned if the URN does not match the
        expected URN pattern.
    db_name : str or None
        The Famplex name if a Famplex grounding was found; otherwise the
        GO label if a GO grounding was found; otherwise the name looked up
        for the URN's own grounding (CHEBI, HGNC or MESH), if any.
    """
    if urn is None:
        return {}, None

    match = URN_PATT.match(urn)
    if match is None:
        return None, None

    urn_type, urn_id = match.groups()
    db_refs = {}
    db_name = None

    # TODO: support more types of URNs
    if urn_type == 'agi-cas':
        # Identifier is CAS, convert to CHEBI
        chebi_id = get_chebi_id_from_cas(urn_id)
        if chebi_id:
            db_refs['CHEBI'] = chebi_id
            db_name = get_chebi_name_from_id(chebi_id)
    elif urn_type == 'agi-llid':
        # This is an Entrez ID, convert to HGNC
        hgnc_id = get_hgnc_from_entrez(urn_id)
        if hgnc_id is not None:
            db_refs['HGNC'] = hgnc_id
            # Convert the HGNC ID to a Uniprot ID
            up_id = get_uniprot_id(hgnc_id)
            if up_id is not None:
                db_refs['UP'] = up_id
            # The HGNC name becomes the entity name
            db_name = get_hgnc_name(hgnc_id)
    elif urn_type in ['agi-meshdis', 'agi-ncimorgan', 'agi-ncimtissue',
                      'agi-ncimcelltype']:
        if urn_id.startswith('C') and urn_id[1:].isdigit():
            # Identifier is probably UMLS
            db_refs['UMLS'] = urn_id
        else:
            # Identifier is MESH; fall back to the decoded text as the
            # name when it cannot be mapped to a MESH ID
            decoded_name = unquote(urn_id)
            mesh_id, mesh_name = mesh_client.get_mesh_id_name(decoded_name)
            if mesh_id:
                db_refs['MESH'] = mesh_id
                db_name = mesh_name
            else:
                db_name = decoded_name
    elif urn_type in ('agi-gocomplex', 'agi-go'):
        # Identifier is GO
        db_refs['GO'] = 'GO:%s' % urn_id

    # Collect every key that may map to Famplex, in increasing order of
    # precedence: GO and MESH groundings, then the EC code of enzyme URNs,
    # then the Medscan URN itself.
    famplex_keys = [(db, db_refs[db]) for db in ('GO', 'MESH')
                    if db in db_refs]
    if urn.startswith('urn:agi-enz'):
        famplex_keys.append(('ECCODE', urn.split(':')[2]))
    famplex_keys.append(('MEDSCAN', urn))
    for famplex_key in famplex_keys:
        if famplex_key in famplex_map:
            db_refs['FPLX'] = famplex_map[famplex_key]

    # If there is a Famplex grounding, use Famplex for entity name
    if 'FPLX' in db_refs:
        db_name = db_refs['FPLX']
    elif 'GO' in db_refs:
        db_name = go_client.get_go_label(db_refs['GO'])
    return db_refs, db_name
def _urn_to_db_refs(urn):
    """Convert a Medscan URN to an INDRA db_refs dictionary and a name.

    Parameters
    ----------
    urn : str
        A Medscan URN

    Returns
    -------
    db_refs : dict or None
        A dictionary with grounding information, mapping databases to
        database identifiers. CHEBI identifiers are stored with a
        'CHEBI:' prefix. An empty dictionary is returned if the URN is
        None, and None is returned if the URN does not match the expected
        URN pattern.
    db_name : str or None
        The Famplex name if a Famplex grounding was found; otherwise the
        GO label if a GO grounding was found; otherwise the name looked up
        for the URN's own grounding (CHEBI, HGNC or MESH), if any.
    """
    if urn is None:
        return {}, None

    match = URN_PATT.match(urn)
    if match is None:
        return None, None

    urn_type, urn_id = match.groups()
    db_refs = {}
    db_name = None

    # TODO: support more types of URNs
    if urn_type == 'agi-cas':
        # Identifier is CAS, convert to CHEBI
        chebi_id = get_chebi_id_from_cas(urn_id)
        if chebi_id:
            # The prefixed form is stored; the name lookup uses the ID as
            # returned by the CAS conversion
            db_refs['CHEBI'] = 'CHEBI:%s' % chebi_id
            db_name = get_chebi_name_from_id(chebi_id)
    elif urn_type == 'agi-llid':
        # This is an Entrez ID, convert to HGNC
        hgnc_id = get_hgnc_from_entrez(urn_id)
        if hgnc_id is not None:
            db_refs['HGNC'] = hgnc_id
            # Convert the HGNC ID to a Uniprot ID
            up_id = get_uniprot_id(hgnc_id)
            if up_id is not None:
                db_refs['UP'] = up_id
            # The HGNC name becomes the entity name
            db_name = get_hgnc_name(hgnc_id)
    elif urn_type in ['agi-meshdis', 'agi-ncimorgan', 'agi-ncimtissue',
                      'agi-ncimcelltype']:
        if urn_id.startswith('C') and urn_id[1:].isdigit():
            # Identifier is probably UMLS
            db_refs['UMLS'] = urn_id
        else:
            # Identifier is MESH; fall back to the decoded text as the
            # name when it cannot be mapped to a MESH ID
            decoded_name = unquote(urn_id)
            mesh_id, mesh_name = mesh_client.get_mesh_id_name(decoded_name)
            if mesh_id:
                db_refs['MESH'] = mesh_id
                db_name = mesh_name
            else:
                db_name = decoded_name
    elif urn_type in ('agi-gocomplex', 'agi-go'):
        # Identifier is GO
        db_refs['GO'] = 'GO:%s' % urn_id

    # Collect every key that may map to Famplex, in increasing order of
    # precedence: GO and MESH groundings, then the EC code of enzyme URNs,
    # then the Medscan URN itself.
    famplex_keys = [(db, db_refs[db]) for db in ('GO', 'MESH')
                    if db in db_refs]
    if urn.startswith('urn:agi-enz'):
        famplex_keys.append(('ECCODE', urn.split(':')[2]))
    famplex_keys.append(('MEDSCAN', urn))
    for famplex_key in famplex_keys:
        if famplex_key in famplex_map:
            db_refs['FPLX'] = famplex_map[famplex_key]

    # If there is a Famplex grounding, use Famplex for entity name
    if 'FPLX' in db_refs:
        db_name = db_refs['FPLX']
    elif 'GO' in db_refs:
        db_name = go_client.get_go_label(db_refs['GO'])
    return db_refs, db_name