def process_fplx_groundings(df):
    """Extract curated grounding entries from a curation table.

    Parameters
    ----------
    df : pandas.DataFrame
        Curation table. The columns read here are 'Text', 'EntityType',
        'Grounding', 'Sentence' and the namespace/ID column pairs
        'DB_Ns1'/'DB_Id1' through 'DB_Ns3'/'DB_Id3'.

    Returns
    -------
    list of dict
        One dict per curated row with the keys 'text', 'entity_type',
        'db_refs', 'correct' and 'context'.
    """
    groundings = []
    # Iterate over the rows of the curation table and extract groundings
    for _, row in df.iterrows():
        # Processing stops at the first row without a curation value; any
        # rows after it are not looked at.
        if pandas.isnull(row['Grounding']):
            break
        # Here we get the original entity text, its type, and the
        # correct/incorrect curation
        grounding = {
            'text': row['Text'],
            'entity_type': row['EntityType'],
            'db_refs': {},
            'correct': bool(int(row['Grounding'])),
            'context': row['Sentence'],
        }
        db_refs = grounding['db_refs']
        # We then extract the groundings (up to 3) that were considered
        # for the curation
        for idx in (1, 2, 3):
            db_ns = row['DB_Ns%d' % idx]
            if not pandas.isnull(db_ns):
                db_refs[db_ns] = row['DB_Id%d' % idx]
        # We standardize some of the grounding entries to match up with
        # Gilda's format. We iterate over a snapshot since entries are
        # replaced/added while looping.
        for db_ns, db_id in list(db_refs.items()):
            # Strip off extra GO prefixes; non-string cells (e.g. NaN)
            # are left untouched instead of raising.
            if isinstance(db_id, str) and db_id.startswith('GO:GO'):
                db_refs[db_ns] = db_id[3:]
            # Get CHEBI IDs from PUBCHEM
            if db_ns == 'PUBCHEM':
                chebi_id = chebi_client.get_chebi_id_from_pubchem(db_id)
                if chebi_id:
                    # The client may already return a prefixed ID, so only
                    # add the prefix when it is missing.
                    if not chebi_id.startswith('CHEBI:'):
                        chebi_id = 'CHEBI:%s' % chebi_id
                    db_refs['CHEBI'] = chebi_id
        groundings.append(grounding)
    return groundings
def _extract_drug(self, hms_id):
    """Return an Agent for the small molecule with the given ID.

    Parameters
    ----------
    hms_id : str
        Identifier passed to ``self._lc`` to look up the small molecule's
        references and name.

    Returns
    -------
    Agent
        Agent named after the small molecule, carrying the looked-up
        references plus a CHEBI entry when the PUBCHEM reference can be
        mapped.
    """
    refs = self._lc.get_small_molecule_refs(hms_id)
    name = self._lc.get_small_molecule_name(hms_id)
    if 'PUBCHEM' in refs:
        chebi_id = chebi_client.get_chebi_id_from_pubchem(refs['PUBCHEM'])
        if chebi_id:
            # The client may already return a prefixed ID; only add the
            # prefix when missing so we never store 'CHEBI:CHEBI:...'.
            if not chebi_id.startswith('CHEBI:'):
                chebi_id = 'CHEBI:%s' % chebi_id
            refs['CHEBI'] = chebi_id
    return Agent(name, db_refs=refs)
def _extract_drug(self, line):
    """Return an Agent for the small molecule described by a data row.

    Parameters
    ----------
    line : dict
        Row providing the 'Small Molecule Name' and
        'Small Molecule HMS LINCS ID' entries.

    Returns
    -------
    Agent
        Agent named after the small molecule, carrying the references
        looked up via ``self._lc`` plus a CHEBI entry when the PUBCHEM
        reference can be mapped.
    """
    drug_name = line['Small Molecule Name']
    lincs_id = line['Small Molecule HMS LINCS ID']
    refs = self._lc.get_small_molecule_refs(lincs_id)
    if 'PUBCHEM' in refs:
        chebi_id = chebi_client.get_chebi_id_from_pubchem(refs['PUBCHEM'])
        if chebi_id:
            # Store the ID in prefixed 'CHEBI:<number>' form whether or
            # not the client already returned it with the prefix.
            if not chebi_id.startswith('CHEBI:'):
                chebi_id = 'CHEBI:%s' % chebi_id
            refs['CHEBI'] = chebi_id
    return Agent(drug_name, db_refs=refs)
def _extract_drug(self, line):
    """Return an Agent for the small molecule described by a data row.

    Parameters
    ----------
    line : dict
        Row providing the 'Small Molecule Name' and
        'Small Molecule HMS LINCS ID' entries.

    Returns
    -------
    Agent
        Agent named after the small molecule, carrying the references
        looked up via ``self._lc`` plus a CHEBI entry when the PUBCHEM
        reference can be mapped.
    """
    drug_name = line['Small Molecule Name']
    lincs_id = line['Small Molecule HMS LINCS ID']
    refs = self._lc.get_small_molecule_refs(lincs_id)
    if 'PUBCHEM' in refs:
        chebi_id = chebi_client.get_chebi_id_from_pubchem(refs['PUBCHEM'])
        if chebi_id:
            # The client may already return a prefixed ID; only add the
            # prefix when missing so we never store 'CHEBI:CHEBI:...'.
            if not chebi_id.startswith('CHEBI:'):
                chebi_id = 'CHEBI:%s' % chebi_id
            refs['CHEBI'] = chebi_id
    return Agent(drug_name, db_refs=refs)
def _get_agent(self, participant): dbid = participant.get('identifier') text = participant.get('entity_text')[0] if dbid == 'GENERIC': if not text: return None else: return Agent(text) db_refs = {} entity_type = participant.get('entity_type') if entity_type in ['protein', 'chemical', 'gene']: # TODO: standardize name here name = participant.get('entity_text')[0] db_refs['TEXT'] = text if dbid: db_name, db_id = dbid.split(':') if db_name.lower() == 'uniprot': uniprot_id = uniprot_client.get_id_from_mnemonic(db_id) db_refs['UP'] = uniprot_id elif db_name.lower() == 'pubchem': chebi_id = chebi_client.get_chebi_id_from_pubchem(db_id) db_refs['CHEBI'] = chebi_id elif db_name.lower() == 'hgnc': db_refs['HGNC'] = db_id elif entity_type == 'protein_family': name = text else: return None # TODO: handle other participant types agent = Agent(name, db_refs=db_refs) features = participant.get('features') if features: for feature in features: feature_type = feature.get('feature_type') if feature_type == 'modification_feature': mc = self._get_mod_condition(feature) agent.mods.append(mc) elif feature_type == 'binding_feature': bc = self._get_bound_condition(feature) agent.bound_conditions.append(bc) elif feature_type == 'mutation_feature': mc = self._get_mut_condition(feature) agent.mutations.append(mc) elif feature_type == 'location_feature': agent.location = feature.get('location') not_features = participant.get('features') if not_features: for feature in not_features: feature_type = feature.get('feature_type') if feature_type == 'modification_feature': mc = self._get_mod_condition(feature) mc.is_modified = False agent.mods.append(mc) elif feature_type == 'binding_feature': bc = self._get_bound_condition(feature) bc.is_bound = False agent.bound_conditions.append(bc) return agent
def _get_agent(self, participant): dbid = participant.get('identifier') text = participant.get('entity_text')[0] if dbid == 'GENERIC': if not text: return None else: return Agent(text) db_refs = {} entity_type = participant.get('entity_type') if entity_type in ['protein', 'chemical', 'gene']: # TODO: standardize name here name = participant.get('entity_text')[0] db_refs['TEXT'] = text if dbid: db_name, db_id = dbid.split(':') if db_name.lower() == 'uniprot': uniprot_id = uniprot_client.get_id_from_mnemonic(db_id) db_refs['UP'] = uniprot_id elif db_name.lower() == 'pubchem': chebi_id = chebi_client.get_chebi_id_from_pubchem(db_id) db_refs['CHEBI'] = chebi_id elif db_name.lower() == 'hgnc': db_refs['HGNC'] = db_id elif entity_type == 'protein_family': name = text else: return None # TODO: handle other participant types agent = Agent(name, db_refs=db_refs) features = participant.get('features') if features: for feature in features: feature_type = feature.get('feature_type') if feature_type == 'modification_feature': mc = self._get_mod_condition(feature) agent.mods.append(mc) elif feature_type == 'binding_feature': bc = self._get_bound_condition(feature) agent.bound_conditions.append(bc) elif feature_type == 'mutation_feature': mc = self._get_mut_condition(feature) agent.mutations.append(mc) elif feature_type == 'location_feature': agent.location = feature.get('location') not_features = participant.get('features') if not_features: for feature in not_features: feature_type = feature.get('feature_type') if feature_type == 'modification_feature': mc = self._get_mod_condition(feature) mc.is_modified = False agent.mods.append(mc) elif feature_type == 'binding_feature': bc = self._get_bound_condition(feature) bc.is_bound = False agent.bound_conditions.append(bc) return agent
def test_chebi_pubchem_mapping():
    """Check the PubChem <-> ChEBI mapping in both directions."""
    # ChEBI reports several candidate mappings for this compound, which
    # makes the case non-trivial: the right one has to be chosen based on
    # InChIKey matches.
    pubchem_id = '5287993'
    expected_chebi_id = 'CHEBI:3528'
    mapped_chebi_id = chebi_client.get_chebi_id_from_pubchem(pubchem_id)
    assert mapped_chebi_id == expected_chebi_id
    mapped_pubchem_id = chebi_client.get_pubchem_id(expected_chebi_id)
    assert mapped_pubchem_id == pubchem_id
"""This script helps identify entries in PubChem.tsv that systematically lead to incorrect groundings and should therefore be removed.""" import os import re from indra.databases import chebi_client if __name__ == '__main__': # Basic positioning here = os.path.dirname(os.path.abspath(__file__)) kb_dir = os.path.join(here, os.pardir, 'src', 'main', 'resources', 'org', 'clulab', 'reach', 'kb') resource_fname = os.path.join(kb_dir, 'PubChem.tsv') keep_rows = [] with open(resource_fname, 'r') as fh: for row in fh.readlines(): if '\t' not in row: continue txt, id = [x.strip() for x in row.split('\t')] if re.match(r'^[A-Z][A-Z]$', txt): chebi_id = chebi_client.get_chebi_id_from_pubchem(id) name = chebi_client.get_chebi_name_from_id(chebi_id) if name and '-' in name and len(name) == 7: continue keep_rows.append(row) with open(resource_fname, 'w') as fh: for row in keep_rows: fh.write(row)
def standardize_db_refs(db_refs):
    """Return a standardized db refs dict for a given db refs dict.

    The given dict is updated in place and the same object is returned.

    Parameters
    ----------
    db_refs : dict
        A dict of db refs that may not be standardized, i.e., may be
        missing an available UP ID corresponding to an existing HGNC ID.

    Returns
    -------
    dict
        The db_refs dict with standardized entries.
    """
    up_id = db_refs.get('UP')
    hgnc_id = db_refs.get('HGNC')
    # If we have a UP ID and no HGNC ID, we try to get a gene name,
    # and if possible, a HGNC ID from that
    if up_id and not hgnc_id and uniprot_client.is_human(up_id):
        gene_name = uniprot_client.get_gene_name(up_id, False)
        if gene_name:
            hgnc_id = hgnc_client.get_hgnc_id(gene_name)
            if hgnc_id:
                db_refs['HGNC'] = hgnc_id
    # Otherwise, if we have an HGNC ID (with or without a UP ID), we try
    # to get the UP ID mapped from it
    elif hgnc_id:
        # Now get the Uniprot ID for the gene
        mapped_up_id = hgnc_client.get_uniprot_id(hgnc_id)
        if mapped_up_id:
            # If we find an inconsistency, we explain it in a debug
            # message and fall back on the mapped ID
            if up_id and up_id != mapped_up_id:
                # We handle a special case here in which mapped_up_id is
                # actually a list of UP IDs that we skip and just keep
                # the original up_id
                if ', ' not in mapped_up_id:
                    # If we got a proper single protein mapping, we use
                    # the mapped_up_id to standardize to.
                    logger.debug('Inconsistent groundings UP:%s not equal to '
                                 'UP:%s mapped from HGNC:%s, standardizing to '
                                 'UP:%s', up_id, mapped_up_id, hgnc_id,
                                 mapped_up_id)
                    db_refs['UP'] = mapped_up_id
            # If there is no conflict, we can update the UP entry
            else:
                db_refs['UP'] = mapped_up_id

    # Now try to improve chemical groundings
    pc_id = db_refs.get('PUBCHEM')
    chebi_id = db_refs.get('CHEBI')
    hmdb_id = db_refs.get('HMDB')
    mapped_chebi_id = None
    mapped_pc_id = None
    hmdb_mapped_chebi_id = None
    # Look up the cross-mappings for whichever IDs we were given; mapped
    # CHEBI IDs are normalized to carry the 'CHEBI:' prefix.
    if pc_id:
        mapped_chebi_id = chebi_client.get_chebi_id_from_pubchem(pc_id)
        if mapped_chebi_id and not mapped_chebi_id.startswith('CHEBI:'):
            mapped_chebi_id = 'CHEBI:%s' % mapped_chebi_id
    if chebi_id:
        mapped_pc_id = chebi_client.get_pubchem_id(chebi_id)
    if hmdb_id:
        hmdb_mapped_chebi_id = chebi_client.get_chebi_id_from_hmdb(hmdb_id)
        if hmdb_mapped_chebi_id and \
                not hmdb_mapped_chebi_id.startswith('CHEBI:'):
            hmdb_mapped_chebi_id = 'CHEBI:%s' % hmdb_mapped_chebi_id

    # Diagnostics only: we always keep the originals if they are present
    # but report (at most one) inconsistency with the mapped IDs.
    if pc_id and chebi_id and mapped_pc_id and pc_id != mapped_pc_id:
        logger.debug('Inconsistent groundings PUBCHEM:%s not equal to '
                     'PUBCHEM:%s mapped from %s, standardizing to '
                     'PUBCHEM:%s.', pc_id, mapped_pc_id, chebi_id, pc_id)
    elif pc_id and chebi_id and mapped_chebi_id and \
            chebi_id != mapped_chebi_id:
        logger.debug('Inconsistent groundings %s not equal to '
                     '%s mapped from PUBCHEM:%s, standardizing to '
                     '%s.', chebi_id, mapped_chebi_id, pc_id, chebi_id)
    elif hmdb_id and chebi_id and hmdb_mapped_chebi_id and \
            hmdb_mapped_chebi_id != chebi_id:
        logger.debug('Inconsistent groundings %s not equal to '
                     '%s mapped from %s, standardizing to '
                     '%s.', chebi_id, hmdb_mapped_chebi_id, hmdb_id, chebi_id)

    # Mappings: these are decided independently of the diagnostics above
    # so that a reported HMDB inconsistency cannot suppress adding an
    # available PUBCHEM mapping.
    if not chebi_id:
        # If we have PC and not CHEBI but can map to CHEBI, we do that;
        # the PUBCHEM-based mapping takes precedence over the HMDB one.
        if pc_id and mapped_chebi_id:
            db_refs['CHEBI'] = mapped_chebi_id
        elif hmdb_id and hmdb_mapped_chebi_id:
            db_refs['CHEBI'] = hmdb_mapped_chebi_id
    # If we have CHEBI and not PC but can map to PC, we do that
    elif not pc_id and mapped_pc_id:
        db_refs['PUBCHEM'] = mapped_pc_id
    # Otherwise there is no useful mapping that we can add and no
    # further conflict to resolve.
    return db_refs