def test_mirna_standardize():
    """Check miRNA standardization starting from HGNC and from miRBase."""
    # Starting from an HGNC ID, the miRBase ID is expected to be added
    # and the standard name is expected to be the gene symbol.
    std_name, refs = standardize_name_db_refs({'HGNC': '31476'})
    assert refs['HGNC'] == '31476'
    assert refs['MIRBASE'] == 'MI0000060'
    assert std_name == 'MIRLET7A1'

    # Starting from a miRBase ID only, the ID is expected to be kept and
    # the standard name is expected to be the miRBase name.
    std_name, refs = standardize_name_db_refs({'MIRBASE': 'MI0001730'})
    assert refs['MIRBASE'] == 'MI0001730'
    assert std_name == 'mmu-mir-451a'
def _get_agents_from_singular_entity(self, bpe: bp.PhysicalEntity):
    """Return the Agents corresponding to a singular PhysicalEntity.

    This is for extracting one or more Agents from a PhysicalEntity which
    doesn't have member_physical_entities.

    Parameters
    ----------
    bpe : bp.PhysicalEntity
        The physical entity to extract Agents from.

    Returns
    -------
    list
        A deep copy of the Agents stored under ``bpe.uid`` in
        ``self._agents`` if there is such an entry, otherwise a list of
        newly constructed Agents. Agents whose name is None are left out.
    """
    try:
        return copy.deepcopy(self._agents[bpe.uid])
    except KeyError:
        pass

    mcs = BiopaxProcessor._get_entity_mods(bpe) if _is_protein(bpe) else []
    display_name = bpe.display_name

    def make_agent(raw_db_refs):
        # The display name is the fallback for each Agent independently.
        # Reusing a single mutable name across a loop would make an ID that
        # fails to standardize inherit the standard name of the previous ID.
        standard_name, db_refs = standardize_name_db_refs(raw_db_refs)
        agent_name = standard_name if standard_name else display_name
        return Agent(agent_name, db_refs=db_refs, mods=mcs)

    # We first get processed xrefs
    xrefs = BiopaxProcessor._get_processed_xrefs(bpe)

    # We now need to harmonize UP and HGNC
    nhgnc_ids = len(xrefs.get('HGNC', {}))
    nup_ids = len(xrefs.get('UP', {}))

    # One protein coded by many genes
    if nhgnc_ids > 1 and nup_ids == 1:
        agents = [make_agent({'HGNC': hgnc_id})
                  for hgnc_id in xrefs['HGNC']]
    # One gene coding for many proteins
    elif nhgnc_ids == 1 and nup_ids > 1:
        agents = [make_agent({'UP': up_id}) for up_id in xrefs['UP']]
    # This is secretly a family, i.e., we have more than one
    # gene/protein IDs and so we can go by one of the ID sets and
    # standardize from there
    elif nhgnc_ids > 1 and nhgnc_ids == nup_ids:
        agents = [make_agent({'UP': up_id}) for up_id in xrefs['UP']]
    # Otherwise it's just a regular Agent
    else:
        agents = [make_agent(clean_up_xrefs(xrefs))]

    # There is a potential here that an Agent name was set to None
    # if both the display name and the standard name are missing.
    # We filter these out
    return [a for a in agents if a.name is not None]
def test_drugbank_mappings():
    """Check DrugBank standardization with default and custom ns_order."""
    # With the default prioritization we expect ChEBI and ChEMBL mappings
    # to be added and the lower-case name to be chosen.
    std_name, refs = standardize_name_db_refs({'DRUGBANK': 'DB00001'})
    assert refs.get('CHEBI') == 'CHEBI:142437', refs
    assert refs.get('CHEMBL') == 'CHEMBL1201666', refs
    assert std_name == 'lepirudin'

    # Here we test for alternative prioritization of name spaces, in
    # which case we expect to get the Drugbank standard name
    std_name, refs = standardize_name_db_refs(
        {'DRUGBANK': 'DB00001'},
        ns_order=['DRUGBANK', 'CHEBI'],
    )
    assert std_name == 'Lepirudin'
def _get_target_agent(target_element):
    """Return an Agent built from a DrugBank target element.

    Parameters
    ----------
    target_element :
        The element whose ``db:name``, ``db:id`` and polypeptide external
        identifiers are read.

    Returns
    -------
    Agent
        An Agent grounded to the target ID (under 'DRUGBANKV4.TARGET')
        plus any HGNC and UniProt identifiers found, named by the
        standard name if there is one and by the ``db:name`` text
        otherwise.
    """
    fallback_name = db_find(target_element, 'db:name').text

    # The Drugbank target ID is always part of the grounding
    grounding = {
        'DRUGBANKV4.TARGET': db_find(target_element, 'db:id').text,
    }

    # Extract other xrefs
    xref_path = ('db:polypeptide/'
                 'db:external-identifiers/'
                 'db:external-identifier')
    for xref in db_findall(target_element, xref_path):
        source = db_find(xref, 'db:resource').text
        value = db_find(xref, 'db:identifier').text
        if source == 'UniProtKB':
            grounding['UP'] = value
        elif source == 'HUGO Gene Nomenclature Committee (HGNC)':
            # Drops the first five characters, presumably an 'HGNC:'
            # prefix -- TODO confirm against the input data
            grounding['HGNC'] = value[5:]

    standard_name, grounding = standardize_name_db_refs(grounding)
    return Agent(standard_name if standard_name else fallback_name,
                 db_refs=grounding)
def get_gene_agent(name, gene_entrez_id):
    """Return an Agent for a gene given its name and Entrez ID.

    Parameters
    ----------
    name :
        The name of the gene, used to look up an HGNC ID and as the Agent
        name if no standard name is found.
    gene_entrez_id :
        The Entrez ID of the gene, stored under 'EGID'.

    Returns
    -------
    Agent
        The Agent with standardized grounding.
    """
    grounding = {'EGID': gene_entrez_id}
    # Add an HGNC ID if one can be found by name
    hgnc_from_name = hgnc_client.get_hgnc_id(name)
    if hgnc_from_name:
        grounding['HGNC'] = hgnc_from_name
    standard_name, grounding = standardize_name_db_refs(grounding)
    agent_name = standard_name if standard_name else name
    return Agent(agent_name, db_refs=grounding)
def _extract_protein(self, name, gene_id):
    """Return an Agent for a protein given its name and Entrez gene ID.

    Parameters
    ----------
    name :
        The name used for the Agent if no standard name is found.
    gene_id :
        The Entrez gene ID, stored under 'EGID' and used to look up an
        HGNC ID.

    Returns
    -------
    Agent
        The Agent with standardized grounding.
    """
    raw_refs = {'EGID': gene_id}
    # Add the HGNC ID whenever the Entrez ID maps to one
    hgnc_from_entrez = hgnc_client.get_hgnc_from_entrez(gene_id)
    if hgnc_from_entrez is not None:
        raw_refs['HGNC'] = hgnc_from_entrez
    standard_name, std_refs = standardize_name_db_refs(raw_refs)
    agent_name = standard_name if standard_name else name
    return Agent(agent_name, db_refs=std_refs)
def get_disease_agent(name, disease_id):
    """Return an Agent for a disease given its name and grounding string.

    Parameters
    ----------
    name :
        The name used for the Agent if no standard name is found.
    disease_id : str
        One or more ``NAMESPACE:ID`` groundings separated by ``|``.

    Returns
    -------
    Agent
        The Agent with standardized grounding.

    Raises
    ------
    ValueError
        If one of the groundings contains no ``:`` separator.
    """
    db_refs = {}
    for grounding in disease_id.split('|'):
        # Split on the first colon only so that identifiers which
        # themselves contain a colon are kept whole instead of failing
        # to unpack.
        db_ns, db_id = grounding.split(':', 1)
        db_refs[db_ns] = db_id
    standard_name, db_refs = standardize_name_db_refs(db_refs)
    if standard_name:
        name = standard_name
    return Agent(name, db_refs=db_refs)
def get_db_refs_by_ident(ns, ident, node_data):
    """Return standard name and grounding based on a namespace and an ID.

    Parameters
    ----------
    ns : str
        A name space in which the given identifier is interpreted.
    ident : str
        The identifier in the given name space to get grounding for.
    node_data : pybel.dsl.BaseAbundance
        Node data for logging purposes.

    Returns
    -------
    name : str
        The standardized name for the given entity, or the name of the
        node if no standard name is available.
    db_refs : dict or None
        The grounding for the given entity, or None if the name space is
        not one of the handled ones.
    """
    # Every entry needs its own trailing comma: adjacent string literals
    # are concatenated by Python, which would silently merge two name
    # spaces into a single bogus entry.
    ns_list = [
        'HGNC', 'UNIPROT', 'UP', 'FPLX', 'GO', 'GOBP', 'GOCC', 'MESHPP',
        'MESHD', 'MESH', 'MGI', 'RGD', 'SFAM', 'EGID', 'ENTREZ',
        'NCBIGENE', 'MIRBASE', 'CHEBI', 'ECCODE', 'SDIS', 'SCHEM',
        'TEXT', 'DOID', 'EFO', 'HP', 'PFAM', 'HGNC.GENEFAMILY',
        'HGNC_GROUP', 'NCBITAXON', 'PUBCHEM',
    ]
    # NOTE(review): 'CHEBIID' is mapped below but is not in ns_list, so
    # that mapping is currently unreachable -- confirm whether CHEBIID
    # should be handled.
    ns_mappings = {
        'UNIPROT': 'UP',
        'GOBP': 'GO',
        'GOCC': 'GO',
        'MESHPP': 'MESH',
        'MESHD': 'MESH',
        'ENTREZ': 'EGID',
        'NCBIGENE': 'EGID',
        'NCBITAXON': 'TAXONOMY',
        'HGNC.GENEFAMILY': 'HGNC_GROUP',
        'CHEBIID': 'CHEBI',
    }
    raw_name = node_data.name
    if ns in ns_list:
        mapped_ns = ns_mappings.get(ns, ns)
        raw_db_refs = {mapped_ns: ident}
        std_name, std_db_refs = standardize_name_db_refs(raw_db_refs)
        # Fall back to the raw values if standardization yields nothing
        if std_name is None:
            std_name = raw_name
        if std_db_refs is None:
            std_db_refs = raw_db_refs
    else:
        logger.info("Unhandled namespace %s with name %s and "
                    "identifier %s (%s).", ns, raw_name, ident, node_data)
        std_name = raw_name
        std_db_refs = None
    return std_name, std_db_refs
def _get_agent(self, ent_name, ent_type, id, database):
    """Return a single Agent corresponding to the given entity.

    If the entity is a SIGNOR complex, the first Agent of the complex is
    returned with the remaining Agents attached to it as bound conditions.

    Parameters
    ----------
    ent_name :
        The name of the entity, used as the Agent name unless a standard
        name is found.
    ent_type :
        The type of the entity; together with ``database`` it selects the
        grounding type.
    id :
        The identifier of the entity in ``database``.
    database :
        The name of the database the identifier comes from.

    Returns
    -------
    Agent
        The Agent for the entity.

    Raises
    ------
    ValueError
        If a grounding type is defined for the entity but the database is
        not one of the expected ones.
    """
    if database == 'SIGNOR' and id in self.complex_map:
        agents = self._get_complex_agents(id)
        # Return the first agent with the remaining agents as a bound
        # condition
        agent = agents[0]
        agent.bound_conditions = \
            [BoundCondition(a, True) for a in agents[1:]]
        return agent

    name = ent_name
    gnd_type = _type_db_map[(ent_type, database)]
    if gnd_type == 'UP':
        db_refs = process_uniprot_entry(id)
    # Map SIGNOR protein families to FamPlex families
    elif ent_type == 'proteinfamily':
        # Keep the SIGNOR family ID in db_refs
        db_refs = {database: id}
        key = (database, id)
        # Use SIGNOR name unless we have a mapping in FamPlex
        famplex_id = famplex_map.get(key)
        if famplex_id is None:
            logger.info('Could not find %s in FamPlex map', str(key))
        else:
            db_refs['FPLX'] = famplex_id
    # Other possible groundings are PUBCHEM, SIGNOR, etc.
    elif gnd_type is not None:
        if database not in ('PUBCHEM', 'SIGNOR', 'ChEBI', 'miRBase',
                            'DRUGBANK'):
            raise ValueError('Unexpected database %s' % database)
        if database == 'PUBCHEM' and id.startswith('CID:'):
            # We take off the CID: prefix plus fix an issue with
            # SIGNOR's format in which it leaves extra spaces around
            # the ID, as in 'CID: 923'
            id = id[4:].strip()
        elif database == 'ChEBI' and id.startswith('SID:'):
            gnd_type = 'PUBCHEM.SUBSTANCE'
            id = id[4:].strip()
        db_refs = {gnd_type: id}
    # If no grounding, include as an untyped/ungrounded node
    else:
        db_refs = {}

    standard_name, db_refs = standardize_name_db_refs(db_refs)
    if standard_name:
        name = standard_name
    return Agent(name, db_refs=db_refs)
def get_ref_context(lst):
    """Return a RefContext built from the first entry of a list.

    Parameters
    ----------
    lst : list of str
        Entries of the form ``namespace:id``; only the first one is used.

    Returns
    -------
    RefContext or None
        None if the list is empty or if the name space of the first entry
        is UAZ, otherwise a RefContext with standardized name and
        grounding.
    """
    if not lst:
        return None
    prefix, identifier = lst[0].split(':', 1)
    namespace = prefix.upper()
    # These aren't real groundings
    if namespace == 'UAZ':
        return None
    # Here we are dealing with UniProt subcellular components
    # so we use a different namespace for those
    if namespace == 'UNIPROT':
        namespace = 'UPLOC'
    std_name, std_refs = standardize_name_db_refs({namespace: identifier})
    return RefContext(std_name, db_refs=std_refs)
def _make_agent(self, symbol, entrez_id, swissprot_id, trembl_id):
    """Make an Agent object, appropriately grounded.

    Parameters
    ----------
    symbol : str
        A plain text symbol, or None if not listed.
    entrez_id : str
        Entrez id number
    swissprot_id : str
        Swissprot (reviewed UniProt) ID.
    trembl_id : str
        Trembl (unreviewed UniProt) ID.

    Returns
    -------
    agent : indra.statements.Agent
        A grounded agent object, or None if neither a standard name nor a
        symbol is available.
    """
    # The Swissprot ID takes precedence over the Trembl one; the Trembl ID
    # is only considered when there is no Swissprot ID at all.
    if swissprot_id:
        uniprot_candidate = swissprot_id
    elif trembl_id:
        uniprot_candidate = trembl_id
    else:
        uniprot_candidate = None

    grounding = {}
    # Values containing a '|' are not used as a UniProt grounding
    if uniprot_candidate is not None and '|' not in uniprot_candidate:
        grounding['UP'] = uniprot_candidate
    if entrez_id:
        grounding['EGID'] = entrez_id

    standard_name, grounding = standardize_name_db_refs(grounding)
    agent_name = standard_name if standard_name else symbol
    # At the time of writing this, the name was never None but
    # just in case
    if agent_name is None:
        return None
    return Agent(agent_name, db_refs=grounding)
def _extract_drugs(self, compound_ids, lspci_id):
    """Return a list of unique drug Agents for a set of compound IDs.

    Parameters
    ----------
    compound_ids : str
        Compound IDs separated by ``|``.
    lspci_id :
        The ID stored under 'LSPCI' in the grounding of every Agent.

    Returns
    -------
    list
        The Agents, with at most one Agent per ``matches_key()``.
    """
    collected = []
    for compound_id in compound_ids.split('|'):
        refs = {'LSPCI': lspci_id}
        if compound_id.startswith('CHEMBL'):
            refs['CHEMBL'] = compound_id
        elif compound_id.startswith('HMSL'):
            refs['HMS-LINCS'] = compound_id.split('HMSL')[1]
        else:
            logger.warning('Unhandled ID type: %s' % compound_id)

        # Name standardization finds correct names but because
        # ChEMBL is incomplete as a local resource, we don't
        # universally standardize its names, instead, we look
        # it up explicitly when necessary.
        drug_name, refs = standardize_name_db_refs(refs)
        if drug_name is None:
            # This is one way to detect that the drug could not be
            # standardized beyond just its name so in the
            # standardized_only condition, we skip this drug
            if self.standardized_only:
                continue
            if 'HMS-LINCS' in refs:
                drug_name = lincs_client_obj.get_small_molecule_name(
                    refs['HMS-LINCS'])
            elif 'CHEMBL' in refs:
                drug_name = chembl_client.get_chembl_name(refs['CHEMBL'])

        # If name is still None, we just use the ID as the name
        if drug_name is None:
            # With the named_only restriction, we skip drugs without
            # a proper name.
            if self.named_only:
                continue
            drug_name = compound_id

        assert_valid_db_refs(refs)
        collected.append(Agent(drug_name, db_refs=refs))

    # Keep one Agent per matches key; a later Agent replaces an earlier
    # one with the same key
    by_key = {}
    for drug in collected:
        by_key[drug.matches_key()] = drug
    return list(by_key.values())
def _get_drug_agent(drug_element):
    """Return an Agent built from a DrugBank drug element.

    Parameters
    ----------
    drug_element :
        The element whose ``db:name``, ``db:drugbank-id``,
        ``db:cas-number`` and external identifiers are read.

    Returns
    -------
    Agent
        An Agent grounded to DrugBank plus any CAS, ChEMBL, PubChem and
        ChEBI identifiers found, named by the standard name if there is
        one and by the ``db:name`` text otherwise.
    """
    fallback_name = db_find(drug_element, 'db:name').text
    grounding = {}

    # Extract the DrugBank ID. Sometimes there's more than one DrugBank
    # ID, in which case we choose the one that sorts first.
    candidate_ids = [
        id_tag.text
        for id_tag in db_findall(drug_element, 'db:drugbank-id')
        if id_tag.text.startswith('DB')
    ]
    candidate_ids.sort()
    grounding['DRUGBANK'] = candidate_ids[0]

    # Extract CAS ID
    cas_tag = db_find(drug_element, 'db:cas-number')
    if cas_tag is not None:
        cas_number = cas_tag.text
        if cas_number is not None:
            grounding['CAS'] = cas_number

    # Extract other xrefs
    xref_path = ('db:external-identifiers/'
                 'db:external-identifier')
    for xref in db_findall(drug_element, xref_path):
        source = db_find(xref, 'db:resource').text
        value = db_find(xref, 'db:identifier').text
        if source == 'PubChem Compound':
            grounding['PUBCHEM'] = value
        elif source == 'ChEMBL':
            grounding['CHEMBL'] = ensure_chembl_prefix(value)
        elif source == 'ChEBI':
            grounding['CHEBI'] = ensure_chebi_prefix(value)

    # Validate the grounding both before and after standardization
    assert_valid_db_refs(grounding)
    standard_name, grounding = standardize_name_db_refs(grounding)
    assert_valid_db_refs(grounding)

    agent_name = standard_name if standard_name else fallback_name
    return Agent(agent_name, db_refs=grounding)
def map_readout(stmts):
    """Re-ground the object of each statement to a fixed MESH ID.

    The object's name and grounding are replaced in place by the result
    of standardizing its 'TEXT' entry together with MESH 'D014779'.

    Parameters
    ----------
    stmts :
        The statements whose ``obj`` is modified in place.

    Returns
    -------
    The same ``stmts`` object that was passed in.
    """
    for statement in stmts:
        raw_refs = {
            'TEXT': statement.obj.db_refs['TEXT'],
            'MESH': 'D014779',
        }
        std_name, std_refs = standardize_name_db_refs(raw_refs)
        statement.obj.name = std_name
        statement.obj.db_refs = std_refs
    return stmts
def get_ref_context(db_ns, db_id):
    """Return a RefContext for an identifier in a given name space.

    Parameters
    ----------
    db_ns : str
        The name space of the identifier.
    db_id : str
        The identifier; surrounding whitespace is stripped.

    Returns
    -------
    RefContext
        A RefContext with standardized name and grounding.
    """
    # Name spaces whose identifiers are passed through ensure_prefix
    prefixed_namespaces = ('BTO',)
    identifier = db_id.strip()
    if db_ns in prefixed_namespaces:
        identifier = ensure_prefix(db_ns, identifier)
    std_name, std_refs = standardize_name_db_refs({db_ns: identifier})
    return RefContext(std_name, std_refs)
def test_nonhuman_entrez():
    """Check that an Entrez ID alone yields the expected name and UniProt ID."""
    std_name, refs = standardize_name_db_refs({'EGID': '109880'})
    # The objects themselves serve as the failure messages
    assert std_name == 'Braf', std_name
    assert refs['UP'] == 'P28028', refs
def test_drugbank_mappings():
    """Check that a DrugBank ID is mapped to ChEBI and ChEMBL and named."""
    std_name, refs = standardize_name_db_refs({'DRUGBANK': 'DB00001'})
    # The full grounding is used as the failure message for the mappings
    assert refs.get('CHEBI') == 'CHEBI:142437', refs
    assert refs.get('CHEMBL') == 'CHEMBL1201666', refs
    assert std_name == 'lepirudin'