def update_kinases():
    """Download the reviewed human kinase list and add extra known kinases.

    Fetches reviewed human UniProt entries annotated with the protein
    kinase InterPro domain (IPR011009) as a TSV file, appends a curated
    list of kinases missing from that query, and writes the combined
    table to ``kinases.tsv`` under ``path``.
    """
    logger.info('--Updating kinase list------')
    url = 'http://www.uniprot.org/uniprot/?' + \
        'sort=entry_name&desc=no&compress=no&query=database:(type:' + \
        'interpro%20ipr011009)%20AND%20reviewed:yes%20AND%20organism:' + \
        '%22Homo%20sapiens%20(Human)%20[9606]%22&fil=&force=no' + \
        '&format=tab&columns=id,genes(PREFERRED),organism-id,entry%20name'
    fname = os.path.join(path, 'kinases.tsv')
    save_from_http(url, fname)

    from indra.databases import hgnc_client, uniprot_client
    # Kinases not covered by the IPR011009 InterPro query above.
    add_kinases = ['PGK1', 'PKM', 'TAF1', 'NME1', 'BCKDK', 'PDK1', 'PDK2',
                   'PDK3', 'PDK4', 'BCR', 'FAM20C', 'BAZ1B', 'PIKFYVE']
    df = pandas.read_csv(fname, sep='\t')
    extra_rows = []
    for kinase in add_kinases:
        hgnc_id = hgnc_client.get_hgnc_id(kinase)
        up_id = hgnc_client.get_uniprot_id(hgnc_id)
        up_mnemonic = uniprot_client.get_mnemonic(up_id)
        # NOTE: 'Gene names  (primary )' mirrors the header UniProt emits
        # for the genes(PREFERRED) column -- do not "fix" the spacing.
        extra_rows.append({'Entry': up_id,
                           'Gene names (primary )': kinase,
                           'Organism ID': '9606',
                           'Entry name': up_mnemonic})
    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
    # build the extra rows once and concatenate instead.
    df = pandas.concat([df, pandas.DataFrame(extra_rows)],
                       ignore_index=True)
    df.to_csv(fname, sep='\t', index=False)
def save_base_map(filename, grouped_by_text):
    """Write agent texts with their groundings and counts to a csv file.

    Parameters
    ----------
    filename : str
        Filepath for output file
    grouped_by_text : list of tuple
        List of tuples of the form output by agent_texts_with_grounding
    """
    rows = []
    for group in grouped_by_text:
        agent_text = group[0]
        for db_ns, db_id, count in group[1]:
            # Only UniProt groundings get a human-readable mnemonic
            mnemonic = uniprot_client.get_mnemonic(db_id) \
                if db_ns == 'UP' else ''
            rows.append([agent_text, db_ns, db_id, count, mnemonic])
    write_unicode_csv(filename, rows, delimiter=',', quotechar='"',
                      quoting=csv.QUOTE_MINIMAL, lineterminator='\r\n')
def protein_map_from_twg(twg):
    """Build map of entity texts to validate protein grounding.

    Looks at the grounding of the entity texts extracted from the
    statements and finds proteins where there is grounding to a human
    protein that maps to an HGNC name that is an exact match to the
    entity text. Returns a dict that can be used to update/expand the
    grounding map.

    Parameters
    ----------
    twg : list of tuple
        list of tuples of the form output by agent_texts_with_grounding

    Returns
    -------
    protein_map : dict
        dict keyed on agent text with associated values
        {'TEXT': agent_text, 'UP': uniprot_id}. Entries are for agent
        texts where the grounding map was able to find human protein
        grounded to this agent_text in Uniprot.
    """
    protein_map = {}
    n_unmatched = 0
    n_matched = 0
    logger.info('Building grounding map for human proteins')
    for agent_text, grounding_list, _ in twg:
        # Collect all UniProt IDs grounded to this text; skip the text
        # entirely if there are none.
        up_ids = [entry[1] for entry in grounding_list if entry[0] == 'UP']
        if not up_ids:
            continue
        for up_id in up_ids:
            # Only human proteins are considered; the mnemonic encodes
            # the species suffix.
            mnemonic = uniprot_client.get_mnemonic(up_id)
            if not (mnemonic and mnemonic.endswith('_HUMAN')):
                continue
            # Compare the HGNC gene name against the agent text,
            # case-insensitively.
            gene_name = uniprot_client.get_gene_name(up_id)
            if gene_name is None:
                n_unmatched += 1
                continue
            if agent_text.upper() == gene_name.upper():
                n_matched += 1
                protein_map[agent_text] = {'TEXT': agent_text, 'UP': up_id}
            else:
                n_unmatched += 1
    logger.info('Exact matches for %d proteins' % n_matched)
    logger.info('No match (or no gene name) for %d proteins' % n_unmatched)
    return protein_map
def protein_map_from_twg(twg):
    """Build map of entity texts to validate protein grounding.

    Looks at the grounding of the entity texts extracted from the
    statements and finds proteins where there is grounding to a human
    protein that maps to an HGNC name that is an exact match to the
    entity text. Returns a dict that can be used to update/expand the
    grounding map.

    Parameters
    ----------
    twg : list of tuple
        list of tuples of the form output by agent_texts_with_grounding

    Returns
    -------
    protein_map : dict
        dict keyed on agent text with associated values
        {'TEXT': agent_text, 'UP': uniprot_id}. Entries are for agent
        texts where the grounding map was able to find human protein
        grounded to this agent_text in Uniprot.
    """
    protein_map = {}
    num_unmatched = 0
    num_matched = 0
    logger.info('Building grounding map for human proteins')
    for agent_text, grounding_list, _ in twg:
        # Gather the UniProt groundings for this agent text, if any
        uniprot_ids = [db_id for db_ns, db_id in
                       ((e[0], e[1]) for e in grounding_list)
                       if db_ns == 'UP']
        if not uniprot_ids:
            continue
        for uniprot_id in uniprot_ids:
            # Restrict to human proteins via the mnemonic species suffix
            mnemonic = uniprot_client.get_mnemonic(uniprot_id)
            if mnemonic is None or not mnemonic.endswith('_HUMAN'):
                continue
            gene_name = uniprot_client.get_gene_name(uniprot_id)
            if gene_name is None:
                num_unmatched += 1
                continue
            # Count an exact (case-insensitive) symbol match
            if agent_text.upper() == gene_name.upper():
                num_matched += 1
                protein_map[agent_text] = {'TEXT': agent_text,
                                           'UP': uniprot_id}
            else:
                num_unmatched += 1
    logger.info('Exact matches for %d proteins' % num_matched)
    logger.info('No match (or no gene name) for %d proteins' % num_unmatched)
    return protein_map
def save_base_map(filename, grouped_by_text):
    """Dump a list of agents along with groundings and counts into a csv file

    Parameters
    ----------
    filename : str
        Filepath for output file
    grouped_by_text : list of tuple
        List of tuples of the form output by agent_texts_with_grounding
    """
    rows = []
    for group in grouped_by_text:
        text_string = group[0]
        # Renamed from `id` to `db_id` to avoid shadowing the builtin
        for db, db_id, count in group[1]:
            # For UniProt groundings include the mnemonic as a readable name
            if db == 'UP':
                name = uniprot_client.get_mnemonic(db_id)
            else:
                name = ''
            row = [text_string, db, db_id, count, name]
            rows.append(row)
    write_unicode_csv(filename, rows, delimiter=',', quotechar='"',
                      quoting=csv.QUOTE_MINIMAL, lineterminator='\r\n')
def sanitize_up_ids(up_ids):
    """Normalize a collection of UniProt IDs and return the usable subset.

    Secondary accessions are mapped to primary ones and mnemonics (which
    are mixed in undifferentiated) are dropped. Human IDs are preferred;
    failing that, non-human IDs with a locally known (reviewed) entry are
    returned; otherwise an empty list.
    """
    # Map any secondary IDs to primary IDs, deduplicating as we go
    primary_ids = {uniprot_client.get_primary_id(up_id) for up_id in up_ids}
    # Mnemonics contain an underscore; regular accessions do not
    primary_ids = {up_id for up_id in primary_ids if '_' not in up_id}
    # TODO: should we do anything about isoforms?
    human_ids = []
    reviewed_non_human_ids = []
    for up_id in primary_ids:
        if uniprot_client.is_human(up_id):
            human_ids.append(up_id)
        # get_mnemonic is just a quick way to see if we have this entry
        elif uniprot_client.get_mnemonic(up_id, web_fallback=False):
            reviewed_non_human_ids.append(up_id)
    if human_ids:
        return human_ids
    if reviewed_non_human_ids:
        return reviewed_non_human_ids
    return []
def get_grounding(self):
    """Return a (namespace, grounding) tuple for this object.

    Preference order: FamPlex ('FPLX'), then HGNC (returned as the HGNC
    symbol), then UniProt. Human UniProt entries are mapped to their
    HGNC gene name when available; non-human entries are returned as
    ('UP', id). Falls back to (None, None) when nothing applies.
    """
    be = self.db_refs.get('FPLX')
    if be:
        return ('FPLX', be)
    hgnc = self.db_refs.get('HGNC')
    if hgnc:
        # db_refs values can be lists; take the first entry
        if isinstance(hgnc, list):
            hgnc = hgnc[0]
        return ('HGNC', hgc.get_hgnc_name(str(hgnc)))
    up = self.db_refs.get('UP')
    if up:
        if isinstance(up, list):
            up = up[0]
        up_mnemonic = upc.get_mnemonic(up)
        # Mnemonics end in a species code, e.g. '_HUMAN'
        if up_mnemonic and up_mnemonic.endswith('HUMAN'):
            gene_name = upc.get_gene_name(up, web_fallback=False)
            if gene_name:
                return ('HGNC', gene_name)
            # NOTE(review): a human protein with no locally known gene
            # name falls through to (None, None) rather than ('UP', up)
        else:
            return ('UP', up)
    return (None, None)
def update_kinases():
    """Download the reviewed human kinase list and add extra known kinases.

    Fetches reviewed human UniProt entries annotated with the protein
    kinase InterPro domain (IPR011009) as a TSV file, appends a curated
    list of kinases missing from that query, and writes the combined
    table to ``kinases.tsv`` under ``path``.
    """
    logger.info('--Updating kinase list------')
    url = 'http://www.uniprot.org/uniprot/?' + \
        'sort=entry_name&desc=no&compress=no&query=database:(type:' + \
        'interpro%20ipr011009)%20AND%20reviewed:yes%20AND%20organism:' + \
        '%22Homo%20sapiens%20(Human)%20[9606]%22&fil=&force=no' + \
        '&format=tab&columns=id,genes(PREFERRED),organism-id,entry%20name'
    fname = os.path.join(path, 'kinases.tsv')
    save_from_http(url, fname)

    from indra.databases import hgnc_client, uniprot_client
    # Kinases not covered by the IPR011009 InterPro query above.
    add_kinases = ['PGK1', 'PKM', 'TAF1', 'NME1', 'BCKDK', 'PDK1', 'PDK2',
                   'PDK3', 'PDK4', 'BCR', 'FAM20C', 'BAZ1B', 'PIKFYVE']
    df = pandas.read_csv(fname, sep='\t')
    extra_rows = []
    for kinase in add_kinases:
        hgnc_id = hgnc_client.get_hgnc_id(kinase)
        up_id = hgnc_client.get_uniprot_id(hgnc_id)
        up_mnemonic = uniprot_client.get_mnemonic(up_id)
        extra_rows.append({'Entry': up_id,
                           'Gene names (primary )': kinase,
                           'Organism ID': '9606',
                           'Entry name': up_mnemonic})
    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
    # build the extra rows once and concatenate instead.
    df = pandas.concat([df, pandas.DataFrame(extra_rows)],
                       ignore_index=True)
    df.to_csv(fname, sep='\t', index=False)
def test_get_mnemonic():
    """The UniProt entry Q02750 (MAP2K1) should resolve to MP2K1_HUMAN."""
    result = uniprot_client.get_mnemonic('Q02750')
    assert result == 'MP2K1_HUMAN'
    assert unicode_strs(result)
def get_participant(agent):
    """Build a participant dictionary describing an INDRA Agent.

    The returned dict carries the agent's entity text, an identifier
    derived from its grounding (UniProt mnemonic, PubChem ID, or a
    protein-family expansion from PFAM-DEF), an entity type, and any
    binding/modification/mutation features from the agent's state.

    Parameters
    ----------
    agent : indra.statements.Agent or None
        The agent to convert; None is rendered as a generic protein.

    Returns
    -------
    dict
        The participant dictionary.
    """
    # Handle missing Agent as generic protein
    if agent is None:
        return get_generic('protein')
    # The Agent is not missing
    text_name = agent.db_refs.get('TEXT')
    if text_name is None:
        text_name = agent.name
    participant = {}
    participant['entity_text'] = [text_name]
    hgnc_id = agent.db_refs.get('HGNC')
    uniprot_id = agent.db_refs.get('UP')
    chebi_id = agent.db_refs.get('CHEBI')
    pfam_def_ids = agent.db_refs.get('PFAM-DEF')
    # If HGNC grounding is available, that is the first choice
    if hgnc_id:
        uniprot_id = hgnc_client.get_uniprot_id(hgnc_id)
    if uniprot_id:
        uniprot_mnemonic = str(uniprot_client.get_mnemonic(uniprot_id))
        participant['identifier'] = 'UNIPROT:%s' % uniprot_mnemonic
        participant['entity_type'] = 'protein'
    elif chebi_id:
        # Chemicals are identified by their PubChem ID
        pubchem_id = chebi_client.get_pubchem_id(chebi_id)
        participant['identifier'] = 'PUBCHEM:%s' % pubchem_id
        participant['entity_type'] = 'chemical'
    elif pfam_def_ids:
        # A protein family expands into a list of member entities.
        # PFAM-DEF is a '|'-separated list of 'dbname:dbid' entries.
        participant['entity_type'] = 'protein_family'
        participant['entities'] = []
        pfam_def_list = []
        for p in pfam_def_ids.split('|'):
            dbname, dbid = p.split(':')
            pfam_def_list.append({dbname: dbid})
        for pdi in pfam_def_list:
            # TODO: handle non-uniprot protein IDs here
            uniprot_id = pdi.get('UP')
            if uniprot_id:
                entity_dict = {}
                uniprot_mnemonic = \
                    str(uniprot_client.get_mnemonic(uniprot_id))
                gene_name = uniprot_client.get_gene_name(uniprot_id)
                if gene_name is None:
                    gene_name = ""
                entity_dict['entity_text'] = [gene_name]
                entity_dict['identifier'] = 'UNIPROT:%s' % uniprot_mnemonic
                entity_dict['entity_type'] = 'protein'
                participant['entities'].append(entity_dict)
    else:
        # No usable grounding: default to an unidentified protein
        participant['identifier'] = ''
        participant['entity_type'] = 'protein'
    features = []
    not_features = []
    # Binding features
    for bc in agent.bound_conditions:
        feature = {
            'feature_type': 'binding_feature',
            'bound_to': {
                # NOTE: get type and identifier for bound to protein
                'entity_type': 'protein',
                'entity_text': [bc.agent.name],
                'identifier': ''
            }
        }
        if bc.is_bound:
            features.append(feature)
        else:
            not_features.append(feature)
    # Modification features
    for mc in agent.mods:
        feature = {
            'feature_type': 'modification_feature',
            'modification_type': mc.mod_type.lower(),
        }
        if mc.position is not None:
            pos = int(mc.position)
            feature['location'] = pos
        if mc.residue is not None:
            feature['aa_code'] = mc.residue
        if mc.is_modified:
            features.append(feature)
        else:
            not_features.append(feature)
    # Mutation features
    for mc in agent.mutations:
        feature = {}
        feature['feature_type'] = 'mutation_feature'
        if mc.residue_from is not None:
            feature['from_aa'] = mc.residue_from
        if mc.residue_to is not None:
            feature['to_aa'] = mc.residue_to
        if mc.position is not None:
            pos = int(mc.position)
            feature['location'] = pos
        features.append(feature)
    # Only attach feature lists when non-empty
    if features:
        participant['features'] = features
    if not_features:
        participant['not_features'] = not_features
    return participant
def add_source_urls(stmts):
    """Add a 'source_url' annotation to each Evidence based on its source.

    For each statement's evidences, constructs a database-specific URL
    (HPRD, SIGNOR, CTD, BioGRID, Phospho.ELM, VirHostNet, DrugBank,
    TRRUST, BioPax/PhosphoSite, TAS) and stores it in
    ``ev.annotations['source_url']``. Statements are modified in place
    and the same list is returned.
    """
    for stmt in stmts:
        for ev in stmt.evidence:
            if ev.source_api == 'hprd':
                # HPRD source_ids are sometimes already URLs
                if ev.source_id and ev.source_id.startswith('http'):
                    ev.annotations['source_url'] = ev.source_id
            elif ev.source_api == 'signor':
                # Not clear how to use the source_id like SIGNOR-252627 to
                # link directly to the reaction
                up_id = stmt.real_agent_list()[0].db_refs.get('UP')
                if up_id:
                    ev.annotations['source_url'] = \
                        ('https://signor.uniroma2.it/relation_result.php?'
                         'id=%s' % up_id)
            elif ev.source_api == 'ctd':
                # CTD links differ for genes (EGID) vs chemicals (MESH)
                agent = stmt.real_agent_list()[0]
                egid = agent.db_refs.get('EGID')
                meshid = agent.db_refs.get('MESH')
                if egid:
                    ev.annotations['source_url'] = \
                        'http://ctdbase.org/detail.go?type=gene&acc=%s' % egid
                elif meshid:
                    ev.annotations['source_url'] = \
                        'http://ctdbase.org/detail.go?type=chem&acc=%s' % meshid
            elif ev.source_api == 'biogrid':
                ev.annotations['source_url'] = \
                    'https://thebiogrid.org/interaction/%s' % ev.source_id
            elif ev.source_api == 'phosphoelm':
                ev.annotations['source_url'] = \
                    'http://phospho.elm.eu.org/byKinase/%s.html' % \
                    ev.annotations['phosphoelm_kinase_name']
            elif ev.source_api == 'virhostnet':
                # VirHostNet pages are keyed on the UniProt mnemonic
                agent = stmt.real_agent_list()[0]
                upid = agent.db_refs.get('UP')
                if upid:
                    mnemonic = \
                        uniprot_client.get_mnemonic(upid, web_fallback=False)
                    if mnemonic:
                        ev.annotations['source_url'] = \
                            ('https://virhostnet.prabi.fr/pathostscape3.html'
                             '?protein=%s' % mnemonic)
            elif ev.source_api == 'drugbank':
                agent = stmt.real_agent_list()[0]
                dbid = agent.db_refs.get('DRUGBANK')
                if dbid:
                    ev.annotations['source_url'] = \
                        'https://go.drugbank.com/drugs/%s' % dbid
            elif ev.source_api == 'trrust':
                # TRRUST is queried by the target gene's name
                target = stmt.obj
                ev.annotations['source_url'] = \
                    ('https://www.grnpedia.org/trrust/result_tonly.php?gene=%s'
                     '&species=human') % target.name
            elif ev.source_api == 'biopax':
                if not ev.source_id:
                    continue
                elif 'phosphosite' in ev.source_id:
                    ev.annotations[
                        'source_url'] = 'https://www.phosphosite.org/'
                else:
                    ev.annotations['source_url'] = ev.source_id
            elif ev.source_api == 'tas':
                # TAS links into the Small Molecule Suite by LSPCI ID;
                # %% escapes produce literal %22 (quote) in the final URL
                lspcid = stmt.subj.db_refs.get('LSPCI')
                if lspcid:
                    url = (
                        'https://labsyspharm.shinyapps.io/smallmoleculesuite/'
                        '?_inputs_&binding-table-selectivity_nav=%%22tas%%22&'
                        'binding-query-select_compound=%%22%s-1%%22&'
                        'tab=%%22binding%%22') % lspcid
                    ev.annotations['source_url'] = url
    return stmts
def get_db_refs_by_name(ns, name, node_data):
    """Return standard name and grounding based on a namespace and a name.

    Parameters
    ----------
    ns : str
        A name space in which the given name is interpreted.
    name : str
        The name in the given name space to get grounding for.
    node_data : dict
        Node data for logging purposes.

    Returns
    -------
    name : str
        The standardized name for the given entity.
    db_refs : dict
        The grounding for the given entity.
    """
    db_refs = None
    if ns == 'HGNC':
        # Assumption: name is an HGNC symbol
        hgnc_id = hgnc_client.get_current_hgnc_id(name)
        if not hgnc_id:
            logger.info("Invalid HGNC name: %s (%s)" % (name, node_data))
            return name, None
        elif isinstance(hgnc_id, list):
            logger.info('More than one current HGNC ID for %s, choosing %s'
                        % (name, hgnc_id[0]))
            hgnc_id = hgnc_id[0]
        name = hgnc_client.get_hgnc_name(hgnc_id)
        db_refs = {'HGNC': hgnc_id}
        up_id = _get_up_id(hgnc_id)
        if up_id:
            db_refs['UP'] = up_id
        mirbase_id = mirbase_client.get_mirbase_id_from_hgnc_id(hgnc_id)
        if mirbase_id:
            db_refs['MIRBASE'] = mirbase_id
    elif ns in ('UNIPROT', 'UP'):
        up_id = None
        # This is a simple test to see if name is a valid UniProt ID,
        # if we can't get a mnemonic, we assume it's not a UP ID
        if uniprot_client.get_mnemonic(name, web_fallback=False):
            up_id = name
        # We next check if it's a mnemonic
        else:
            up_id_from_mnem = uniprot_client.get_id_from_mnemonic(name)
            if up_id_from_mnem:
                up_id = up_id_from_mnem
        if not up_id:
            logger.info('Couldn\'t get UP ID from %s' % name)
            return name, None
        db_refs = {'UP': up_id}
        hgnc_id = uniprot_client.get_hgnc_id(up_id)
        if hgnc_id:
            db_refs['HGNC'] = hgnc_id
            name = hgnc_client.get_hgnc_name(hgnc_id)
        else:
            name = uniprot_client.get_gene_name(up_id)
    elif ns == 'FPLX':
        # FamPlex names are used directly as IDs
        db_refs = {'FPLX': name}
    elif ns in ('GO', 'GOBP', 'GOCC'):
        # Handle a known renamed GO label before lookup
        if name == 'cell proliferation':
            name = 'cell population proliferation'
        go_id = go_client.get_go_id_from_label(name)
        if not go_id:
            logger.info('Could not find GO ID for %s' % name)
            return name, None
        db_refs = {'GO': go_id}
        name = go_client.get_go_label(go_id)
    elif ns in ('MESHPP', 'MESHD', 'MESH'):
        mesh_id, mesh_name = mesh_client.get_mesh_id_name(name)
        if not mesh_id:
            logger.info('Could not find MESH ID from %s' % name)
            return name, None
        name = mesh_name
        db_refs = {'MESH': mesh_id}
    # For now, handle MGI/RGD but putting the name into the db_refs so
    # it's clear what namespace the name belongs to
    # FIXME: Full implementation would look up MGI/RGD identifiers from
    # the names, and obtain corresponding Uniprot IDs
    elif ns == 'MGI':
        up_id = mouse_lookup.get(name)
        if up_id:
            db_refs = {'UP': up_id}
    elif ns == 'RGD':
        up_id = rat_lookup.get(name)
        if up_id:
            db_refs = {'UP': up_id}
    # Map Selventa families and complexes to FamPlex
    elif ns == 'SFAM':
        db_refs = {'SFAM': name}
        indra_name = bel_to_indra.get(name)
        if indra_name is None:
            logger.info('Could not find mapping for BEL/SFAM family: '
                        '%s (%s)' % (name, node_data))
        else:
            db_refs['FPLX'] = indra_name
            name = indra_name
    elif ns == 'SCOMP':
        db_refs = {'SCOMP': name}
        indra_name = bel_to_indra.get(name)
        if indra_name is None:
            logger.info('Could not find mapping for BEL/SCOMP complex: '
                        '%s (%s)' % (name, node_data))
        else:
            db_refs['FPLX'] = indra_name
            name = indra_name
    # Map Entrez genes to HGNC/UP
    elif ns in ('EGID', 'ENTREZ', 'NCBIGENE'):
        hgnc_id = hgnc_client.get_hgnc_from_entrez(name)
        db_refs = {'EGID': name}
        if hgnc_id is not None:
            db_refs['HGNC'] = hgnc_id
            name = hgnc_client.get_hgnc_name(hgnc_id)
            up_id = hgnc_client.get_uniprot_id(hgnc_id)
            if up_id:
                db_refs['UP'] = up_id
            else:
                logger.info('HGNC entity %s with HGNC ID %s has no '
                            'corresponding Uniprot ID.', name, hgnc_id)
            mirbase_id = mirbase_client.get_mirbase_id_from_hgnc_id(hgnc_id)
            if mirbase_id:
                db_refs['MIRBASE'] = mirbase_id
        else:
            logger.debug('Could not map EGID%s to HGNC.' % name)
            # Fall back to a synthetic name prefixed with 'E'
            name = 'E%s' % name
    elif ns == 'MIRBASE':
        mirbase_id = mirbase_client.get_mirbase_id_from_mirbase_name(name)
        if not mirbase_id:
            logger.info('Could not map miRBase name %s to ID', name)
            return name, None
        db_refs = {'MIRBASE': mirbase_id}
        hgnc_id = mirbase_client.get_hgnc_id_from_mirbase_id(mirbase_id)
        if hgnc_id:
            db_refs['HGNC'] = hgnc_id
            name = hgnc_client.get_hgnc_name(hgnc_id)
    # CHEBI
    elif ns == 'CHEBI':
        # We first look up BEL's own namespace map for ChEBI names to IDs
        chebi_id = chebi_name_id.get(name)
        # If that fails, we look up INDRA's ChEBI name to ID mapping
        if not chebi_id:
            chebi_id = chebi_client.get_chebi_id_from_name(name)
        if chebi_id:
            db_refs = {'CHEBI': chebi_id}
        else:
            logger.info('CHEBI name %s not found in map.' % name)
    # These appear in the name slot but are actually IDs
    elif ns == 'CHEBIID':
        chebi_id = identifiers.ensure_chebi_prefix(name)
        db_refs = {'CHEBI': chebi_id}
        name = chebi_client.get_chebi_name_from_id(chebi_id)
    # SDIS, SCHEM: Include the name as the ID for the namespace
    elif ns in ('SDIS', 'SCHEM', 'TEXT'):
        db_refs = {ns: name}
    elif ns == 'TAX':
        tid = taxonomy_client.get_taxonomy_id(name)
        if tid:
            db_refs = {'TAXONOMY': tid}
        else:
            logger.info('Could not get taxonomy ID for %s' % name)
    else:
        logger.info("Unhandled namespace: %s: %s (%s)" %
                    (ns, name, node_data))
    return name, db_refs
def test_get_mnemonic():
    """The UniProt entry Q02750 should have the mnemonic MP2K1_HUMAN."""
    result = uniprot_client.get_mnemonic('Q02750')
    assert result == 'MP2K1_HUMAN'