def get_agent(self, acsn_agent: str) -> Union[Agent, None]:
    """Build an INDRA Agent for a given ACSN agent name.

    Parameters
    ----------
    acsn_agent :
        Agent extracted from the relations statement data frame

    Returns
    -------
    :
        An INDRA Agent carrying an HGNC ID (when the ACSN agent maps to a
        single entry for which an HGNC ID is found) or a FamPlex ID (when
        it maps to several entries whose sorted tuple is a key of
        ``self.fplx_lookup``). If no grounding is available, None is
        returned.
    """
    members = self.correspondence_dict.get(acsn_agent)
    if not members:
        return None

    if len(members) != 1:
        # Several members: only usable if this exact member set is known
        # to the FamPlex lookup
        family = self.fplx_lookup.get(tuple(sorted(members)))
        if not family:
            return None
        return get_standard_agent(family, db_refs={'FPLX': family})

    # Exactly one member: ground it via its HGNC ID
    gene_name = members[0]
    hgnc_id = get_hgnc_id(gene_name)
    if not hgnc_id:
        return None
    return get_standard_agent(gene_name, db_refs={'HGNC': hgnc_id})
def test_get_agent():
    """Check agent lookup for a gene, a family and an unknown name."""
    # Reference Agents whose db_refs the processor should reproduce
    expected_gene = get_standard_agent('VEGFA', db_refs={'HGNC': '12680'})
    expected_family = get_standard_agent('MIRLET7A',
                                         db_refs={'FPLX': 'MIRLET7A'})

    gene_refs = ap.get_agent('VEGFA').db_refs
    assert gene_refs == expected_gene.db_refs, expected_gene.db_refs

    family_refs = ap.get_agent('MIRLET7A*').db_refs
    assert family_refs == expected_family.db_refs, expected_family.db_refs

    # A name without any mapping yields no Agent at all
    assert ap.get_agent('XyZ') is None
def _get_agents_from_singular_entity(self, bpe: bp.PhysicalEntity): """This is for extracting one or more Agents from a PhysicalEntity which doesn't have member_physical_entities.""" try: return copy.deepcopy(self._agents[bpe.uid]) except KeyError: pass mcs = BiopaxProcessor._get_entity_mods(bpe) if _is_protein(bpe) else [] name = bpe.display_name agents = [] # We first get processed xrefs xrefs = BiopaxProcessor._get_processed_xrefs(bpe) # We now need to harmonize UP and HGNC # Case 1. Multiple genes coding for one protein nhgnc_ids = len(xrefs.get('HGNC', {})) nup_ids = len(xrefs.get('UP', {})) # One protein coded by many genes if nhgnc_ids > 1 and nup_ids == 1: for hgnc_id in xrefs['HGNC']: agent = get_standard_agent(name, {'HGNC': hgnc_id}, mods=mcs) agents.append(agent) # One gene coding for many proteins elif nhgnc_ids == 1 and nup_ids > 1: for up_id in xrefs['UP']: agent = get_standard_agent(name, {'UP': up_id}, mods=mcs) agents.append(agent) # This is secretly a family, i.e., we have more than one # gene/protein IDs and so we can go by one of the ID sets and # standardize from there elif nhgnc_ids > 1 and nhgnc_ids == nup_ids: for up_id in xrefs['UP']: agent = get_standard_agent(name, {'UP': up_id}, mods=mcs) agents.append(agent) # Otherwise it's just a regular Agent else: agent = get_standard_agent(name, clean_up_xrefs(xrefs), mods=mcs) agents.append(agent) # Since there are so many cases above, we fix UP / UPISO issues # in a single loop here for agent in agents: up_id = agent.db_refs.get('UP') if up_id is not None and '-' in up_id: base_id = up_id.split('-')[0] agent.db_refs['UP'] = base_id agent.db_refs['UPISO'] = up_id # There is a potential here that an Agent name was set to None # if both the display name and the standard name are missing. # We filter these out agents = [a for a in agents if a.name is not None] return agents
def _process_record_helper(
    record, subject, up_stmt_cls, down_stmt_cls
) -> Iterable[Statement]:
    """Yield one statement per regulated gene found in a record.

    Parameters
    ----------
    record :
        The record passed on to ``_get_regulations`` and ``_get_evidence``.
    subject :
        The first argument given to every statement that is created.
    up_stmt_cls :
        Callable used to build statements for the first (up) gene
        collection returned by ``_get_regulations``.
    down_stmt_cls :
        Callable used to build statements for the second (down) gene
        collection returned by ``_get_regulations``.

    Yields
    ------
    :
        Statements for all up genes first, then for all down genes. Each
        statement receives its own shallow copy of the record's evidence.
    """
    up_genes, down_genes = _get_regulations(record)
    evidence = _get_evidence(record)
    groups = ((up_stmt_cls, up_genes), (down_stmt_cls, down_genes))
    for stmt_cls, genes in groups:
        for prefix, identifier, name in genes:
            target = get_standard_agent(name, {prefix: identifier})
            yield stmt_cls(subject, target, copy(evidence))
def _get_agent(self, ent_name, ent_type, id, database): # Returns a list of agents corresponding to this id # (If it is a signor complex, returns an Agent object with complex # constituents as BoundConditions name = ent_name if database == 'SIGNOR' and id in self.complex_map: components = self.complex_map[id] agents = self._get_complex_agents(id) # Return the first agent with the remaining agents as a bound # condition agent = agents[0] agent.bound_conditions = \ [BoundCondition(a, True) for a in agents[1:]] return agent elif ent_type == 'mirna' and id.startswith('URS'): db_refs = {'RNACENTRAL': id} return get_standard_agent(name, db_refs=db_refs) else: gnd_type = _type_db_map[(ent_type, database)] if gnd_type == 'UP': db_refs = process_uniprot_entry(id) # Map SIGNOR protein families to FamPlex families elif ent_type == 'proteinfamily': db_refs = {database: id} # Keep the SIGNOR family ID in db_refs key = (database, id) # Use SIGNOR name unless we have a mapping in FamPlex famplex_id = famplex_map.get(key) if famplex_id is None: logger.info('Could not find %s in FamPlex map' % str(key)) else: db_refs['FPLX'] = famplex_id # Other possible groundings are PUBCHEM, SIGNOR, etc. elif gnd_type is not None: if database not in ('PUBCHEM', 'SIGNOR', 'ChEBI', 'miRBase', 'DRUGBANK'): raise ValueError('Unexpected database %s' % database) if database == 'PUBCHEM' and id.startswith('CID:'): # We take off the CID: prefix plus fix an issue with # SIGNOR's format in which it leaves extra spaces around # the ID, as in 'CID: 923' id = id[4:].strip() elif database == 'ChEBI' and id.startswith('SID:'): gnd_type = 'PUBCHEM.SUBSTANCE' id = id[4:].strip() db_refs = {gnd_type: id} # If no grounding, include as an untyped/ungrounded node else: name = ent_name db_refs = {} return get_standard_agent(name, db_refs=db_refs)
def get_disease_agent(name, disease_id):
    """Return an Agent for a disease from its name and grounding string.

    Parameters
    ----------
    name :
        Name of the disease.
    disease_id :
        One or more ``namespace:identifier`` groundings separated by
        ``|``. Each grounding has to contain exactly one colon, otherwise
        a ValueError is raised.

    Returns
    -------
    :
        The Agent produced by ``get_standard_agent`` with one db_refs entry
        per grounding.
    """
    # Every grounding splits into a (namespace, identifier) pair
    db_refs = dict(grounding.split(':')
                   for grounding in disease_id.split('|'))
    return get_standard_agent(name, db_refs)
def _get_drug_agent(drug_element):
    """Return an Agent for a drug element.

    The Agent is named after the element's ``db:name`` and grounded to a
    DrugBank ID plus, where available, CAS, ChEMBL, PubChem and ChEBI
    identifiers found among the element's external identifiers.
    """
    name = db_find(drug_element, 'db:name').text

    # Sometimes there is more than one DrugBank ID, in which case we
    # choose the "smallest" one in sort order
    drugbank_ids = [tag.text
                    for tag in db_findall(drug_element, 'db:drugbank-id')
                    if tag.text.startswith('DB')]
    drugbank_ids.sort()
    db_refs = {'DRUGBANK': drugbank_ids[0]}

    # The CAS number is optional and its tag can be present but empty
    cas_tag = db_find(drug_element, 'db:cas-number')
    if cas_tag is not None and cas_tag.text is not None:
        db_refs['CAS'] = cas_tag.text

    # Remaining groundings come from the external identifiers
    xref_tags = db_findall(drug_element, 'db:external-identifiers/'
                                         'db:external-identifier')
    for xref_tag in xref_tags:
        resource = db_find(xref_tag, 'db:resource').text
        identifier = db_find(xref_tag, 'db:identifier').text
        if resource == 'ChEMBL':
            db_refs['CHEMBL'] = ensure_chembl_prefix(identifier)
        elif resource == 'PubChem Compound':
            db_refs['PUBCHEM'] = identifier
        elif resource == 'ChEBI':
            db_refs['CHEBI'] = ensure_chebi_prefix(identifier)

    assert_valid_db_refs(db_refs)
    return get_standard_agent(name, db_refs)
def get_std_disease(raw_string: str, db_id: str) -> List[Agent]:
    """Standardize disease names.

    Parameters
    ----------
    raw_string :
        Name of the agent in the GNBR dataset.
    db_id :
        OMIM or MeSH identifier of the agent, matched with or without a
        namespace prefix (the first 5 characters are dropped from prefixed
        identifiers).

    Returns
    -------
    :
        A list containing a single standardized Agent object, or an empty
        list if neither a name nor an identifier is given.

    Raises
    ------
    ValueError
        If an identifier is given but matches none of the known OMIM or
        MeSH patterns.
    """
    has_text = not pd.isna(raw_string)
    has_id = not pd.isna(db_id)
    # If neither a name nor a DB ID is given, we return empty instead of
    # creating an Agent whose name is a missing value
    if not has_text and not has_id:
        return []

    # We add TEXT to db_refs if there is a raw_string, and fall back to the
    # identifier as a name otherwise
    db_refs = {'TEXT': raw_string} if has_text else {}
    name = raw_string if has_text else db_id

    if not has_id:
        pass
    elif omim_no_prefix_pattern.match(db_id):
        db_refs['OMIM'] = db_id
    elif omim_pattern.match(db_id):
        db_refs['OMIM'] = db_id[5:]
    elif mesh_no_prefix_pattern.match(db_id):
        db_refs['MESH'] = db_id
    elif mesh_pattern.match(db_id):
        db_refs['MESH'] = db_id[5:]
    else:
        raise ValueError('Unexpected disease identifier: %s' % db_id)
    return [get_standard_agent(name, db_refs)]
def _process_row(row, stmt_type): # Note that even in the DUB table the subject of the statement # is called "E3" # There are some examples where a complex is implied (e.g., BMI1-RNF2), # for simplicity we just ignore these if '-' in row['E3AC']: return None subj_agent = get_standard_agent(row['E3GENE'], {'UP': row['E3AC']}) obj_agent = get_standard_agent(row['SUBGENE'], {'UP': row['SUBAC']}) if row['SOURCE'] == 'MEDLINE' and row['SOURCEID'] != 'UNIPROT': # Note: we sometimes get int here pmid = str(row['SOURCEID']) text = row['SENTENCE'] else: pmid = None text = None ev = Evidence(source_api='ubibrowser', pmid=pmid, text=text) stmt = stmt_type(subj_agent, obj_agent, evidence=[ev]) return stmt
def get_subject(record) -> Agent:
    """Return an Agent for the disease described by a record.

    The record's ``do_id`` and ``umls_cui`` entries are used as DOID and
    UMLS groundings when they are truthy, and ``disease_name`` is used as
    the name.
    """
    db_refs = {}
    for namespace, field in (("DOID", "do_id"), ("UMLS", "umls_cui")):
        identifier = record[field]
        # Empty identifiers are left out of db_refs
        if identifier:
            db_refs[namespace] = identifier
    return get_standard_agent(record["disease_name"], db_refs)
def get_subject(record) -> Agent:
    """Return an Agent for the drug described by a record.

    The record's ``smiles``, ``pubchem_cid`` and ``drugbank_id`` entries
    are used as SMILES, PUBCHEM and DRUGBANK groundings when they are
    truthy, and ``drug_name`` is used as the name.
    """
    # (namespace, record field, optional converter applied to the value)
    grounding_fields = (
        ("SMILES", "smiles", None),
        ("PUBCHEM", "pubchem_cid", str),
        ("DRUGBANK", "drugbank_id", None),
    )
    db_refs = {}
    for namespace, field, convert in grounding_fields:
        value = record[field]
        if not value:
            continue
        db_refs[namespace] = value if convert is None else convert(value)
    return get_standard_agent(record["drug_name"], db_refs)
def get_std_chemical(raw_string: str, db_id: str) -> List[Agent]:
    """Standardize chemical names.

    Parameters
    ----------
    raw_string :
        Name of the agent in the GNBR dataset.
    db_id :
        One or more identifiers of the agent separated by ``|``. Each one
        is matched against the ChEBI pattern and the MeSH patterns (with
        or without prefix).

    Returns
    -------
    :
        A list of standardized Agent objects, one per identifier that is
        kept. The list is empty if neither a name nor an identifier is
        given, or if every identifier is filtered out.

    Raises
    ------
    ValueError
        If an identifier matches none of the known patterns.
    """
    has_text = not pd.isna(raw_string)
    has_id = not pd.isna(db_id)

    # Nothing to work with at all
    if not has_id and not has_text:
        return []

    # TEXT is part of db_refs whenever there is a raw_string
    base_refs = {'TEXT': raw_string} if has_text else {}

    # Without an identifier, the raw_string alone defines the Agent
    if not has_id:
        return [Agent(raw_string, db_refs=base_refs)]

    agents = []
    for single_db_id in db_id.split('|'):
        refs = deepcopy(base_refs)
        if cheby_pattern.match(single_db_id):
            refs['CHEBI'] = single_db_id
        else:
            if mesh_pattern.match(single_db_id):
                # Prefixed form: drop the first 5 characters
                mesh_id = single_db_id[5:]
            elif mesh_no_prefix_pattern.match(single_db_id):
                mesh_id = single_db_id
            else:
                raise ValueError('Unexpected chemical identifier: %s'
                                 % single_db_id)
            # There are often non-existent MESH IDs here for some reason;
            # those are skipped when no name is found for them
            if not mesh_client.get_mesh_name(mesh_id, offline=True):
                continue
            refs['MESH'] = mesh_id
        name = raw_string if has_text else single_db_id
        agents.append(get_standard_agent(name, refs))
    return agents
def get_subject(record) -> Optional[Agent]:
    """Return an Agent for the gene described by a record.

    The record's ``id`` entry, with its first ``len("gene:")`` characters
    removed, is used as the Entrez gene ID and converted to a UniProt ID.
    The Agent is named after the gene name of that UniProt ID. None is
    returned if the conversion to UniProt fails.
    """
    prefix_length = len("gene:")
    ncbigene_id = record["id"][prefix_length:]
    uniprot_id = uniprot_client.get_id_from_entrez(ncbigene_id)
    if uniprot_id is not None:
        gene_name = uniprot_client.get_gene_name(uniprot_id)
        db_refs = {"EGID": ncbigene_id, "UP": uniprot_id}
        return get_standard_agent(gene_name, db_refs)
    logger.debug(f"Could not convert ncbigene:{ncbigene_id} to UniProt")
    return None
def get_agent_from_entity(self, entity):
    """Return an Agent built from an Entity XML element.

    Parameters
    ----------
    entity :
        An element whose ``attrib`` provides at least ``name``, ``text``
        and ``cui``.

    Returns
    -------
    :
        An Agent named after the entity's name with TEXT and UMLS
        groundings, plus a Gilda grounding if ``self.use_gilda_grounding``
        is set and standardization added nothing beyond TEXT and UMLS.
    """
    # Note: entities can be negated ("negated") and have a semantic type
    # (semtype) and score (score)
    # <Entity id="Dtest.txt.E8" cui="C3192263" name="Vemurafenib"
    # semtypes="orch,phsu" text="vemurafenib" score="851" negated="false"
    # begin="147" end="158" />
    name = entity.attrib['name']
    db_refs = {'TEXT': entity.attrib['text'],
               'UMLS': entity.attrib['cui']}
    agent = get_standard_agent(name, db_refs)
    # We optionally add groundings from Gilda if standardization didn't
    # yield any additional references beyond UMLS. The Agent's own db_refs
    # are inspected and updated (rather than the local dict) so that this
    # works whether or not get_standard_agent keeps the dict it was given.
    if self.use_gilda_grounding and set(agent.db_refs) == {'TEXT', 'UMLS'}:
        # Imported here so that gilda is only needed when this is enabled
        import gilda
        matches = gilda.ground(name)
        if matches:
            top_term = matches[0].term
            agent.db_refs[top_term.db] = top_term.id
            standardize_agent_name(agent, standardize_refs=True)
    return agent
def get_std_gene(raw_string: str, db_id: str) -> List[Agent]:
    """Standardize gene names.

    Parameters
    ----------
    raw_string :
        Name of the agent in the GNBR dataset.
    db_id :
        One or more Entrez identifiers of the agent separated by ``;``.
        An identifier that does not match the plain Entrez pattern has to
        match the Entrez-with-taxonomy pattern, whose first group is then
        used as the Entrez ID.

    Returns
    -------
    :
        A list of standardized Agent objects, one per identifier, or an
        empty list if neither a name nor an identifier is given.

    Raises
    ------
    ValueError
        If an identifier matches neither pattern.
    """
    has_text = not pd.isna(raw_string)
    has_id = not pd.isna(db_id)

    # Nothing to work with at all
    if not has_id and not has_text:
        return []

    # TEXT is part of db_refs whenever there is a raw_string
    base_refs = {'TEXT': raw_string} if has_text else {}

    # Without an identifier, the raw_string alone defines the Agent
    if not has_id:
        return [Agent(raw_string, db_refs=base_refs)]

    agents = []
    for single_db_id in db_id.split(';'):
        refs = deepcopy(base_refs)
        if entrez_pattern.match(single_db_id):
            entrez_id = single_db_id
        else:
            tax_match = entrez_with_tax_pattern.match(single_db_id)
            if not tax_match:
                raise ValueError('Unexpected gene identifier: %s'
                                 % single_db_id)
            entrez_id = tax_match.groups()[0]
        refs['EGID'] = entrez_id
        name = raw_string if has_text else single_db_id
        agents.append(get_standard_agent(name, refs))
    return agents
def _get_target_agent(target_element):
    """Return an Agent for a target element.

    The Agent is named after the element's ``db:name`` and grounded to
    the element's ``db:id`` (as DRUGBANKV4.TARGET) plus any HGNC and
    UniProt identifiers found among the polypeptide's external
    identifiers.
    """
    name = db_find(target_element, 'db:name').text
    # The DrugBank target ID is always part of the groundings
    db_refs = {'DRUGBANKV4.TARGET': db_find(target_element, 'db:id').text}

    xref_tags = db_findall(target_element, 'db:polypeptide/'
                                           'db:external-identifiers/'
                                           'db:external-identifier')
    for xref_tag in xref_tags:
        resource = db_find(xref_tag, 'db:resource').text
        identifier = db_find(xref_tag, 'db:identifier').text
        if resource == 'HUGO Gene Nomenclature Committee (HGNC)':
            # The first 5 characters of the identifier are dropped
            db_refs['HGNC'] = identifier[5:]
        elif resource == 'UniProtKB':
            db_refs['UP'] = identifier
    return get_standard_agent(name, db_refs=db_refs)
def get_chemical_agent(name, mesh_id, cas_id):
    """Return an Agent for a chemical grounded to MESH and, if a truthy
    CAS ID is given, to CAS as well."""
    if cas_id:
        db_refs = {'MESH': mesh_id, 'CAS': cas_id}
    else:
        db_refs = {'MESH': mesh_id}
    return get_standard_agent(name, db_refs)
def get_gene_agent(name, gene_entrez_id):
    """Return an Agent for a gene grounded to its Entrez ID (EGID) and,
    if an HGNC ID can be looked up from the name, to HGNC as well."""
    hgnc_id = hgnc_client.get_hgnc_id(name)
    if hgnc_id:
        db_refs = {'EGID': gene_entrez_id, 'HGNC': hgnc_id}
    else:
        db_refs = {'EGID': gene_entrez_id}
    return get_standard_agent(name, db_refs)
def _extract_protein(self, name, gene_id):
    """Return an Agent for a protein grounded to its Entrez gene ID (EGID)
    and, if an HGNC ID can be looked up from that ID, to HGNC as well."""
    hgnc_id = hgnc_client.get_hgnc_from_entrez(gene_id)
    if hgnc_id is None:
        db_refs = {'EGID': gene_id}
    else:
        db_refs = {'EGID': gene_id, 'HGNC': hgnc_id}
    return get_standard_agent(name, db_refs=db_refs)