def _get_db_refs(bpe):
    """Return the db_refs dict for a BioPAX physical entity.

    Proteins get HGNC and UP entries, with a missing one derived from the
    other where possible. Small molecules get a CHEBI entry. Any other
    entity gets whichever of CHEBI, HGNC and UP can be found.

    Parameters
    ----------
    bpe
        The BioPAX element to collect references for.

    Returns
    -------
    dict
        Mapping from name space (``HGNC``, ``UP``, ``CHEBI``) to ID. Only
        IDs that are not None are included.
    """
    refs = {}

    def _store(key, value):
        # Only record references that were actually found
        if value is not None:
            refs[key] = value

    if _is_protein(bpe):
        hgnc = BiopaxProcessor._get_hgnc_id(bpe)
        uniprot = BiopaxProcessor._get_uniprot_id(bpe)
        # Handle missing HGNC/UP ids
        if hgnc and not uniprot:
            uniprot = hgnc_client.get_uniprot_id(hgnc)
        if uniprot and not hgnc and uniprot_client.is_human(uniprot):
            gene_symbol = uniprot_client.get_gene_name(uniprot, False)
            if gene_symbol:
                hgnc = hgnc_client.get_hgnc_id(gene_symbol)
        _store('HGNC', hgnc)
        _store('UP', uniprot)
    elif _is_small_molecule(bpe):
        _store('CHEBI', BiopaxProcessor._get_chebi_id(bpe))
    else:
        # Unknown entity type: collect every kind of reference available
        _store('CHEBI', BiopaxProcessor._get_chebi_id(bpe))
        _store('HGNC', BiopaxProcessor._get_hgnc_id(bpe))
        _store('UP', BiopaxProcessor._get_uniprot_id(bpe))
    return refs
def filter_human_only(stmts_in, **kwargs):
    """Keep only statements whose UniProt-grounded agents are all human.

    An agent without a UP grounding never disqualifies a statement; only
    an agent whose UP ID is reported as non-human does.

    Parameters
    ----------
    stmts_in : list[indra.statements.Statement]
        A list of statements to filter.
    save : Optional[str]
        The name of a pickle file to save the results (stmts_out) into.

    Returns
    -------
    stmts_out : list[indra.statements.Statement]
        A list of filtered statements.
    """
    def _all_agents_human(stmt):
        # Stop at the first agent grounded to a non-human UniProt entry
        for agent in stmt.agent_list():
            if agent is None:
                continue
            up_id = agent.db_refs.get('UP')
            if up_id and not uniprot_client.is_human(up_id):
                return False
        return True

    pkl_path = kwargs.get('save')
    logger.info('Filtering %d statements for human genes only...' %
                len(stmts_in))
    stmts_out = [stmt for stmt in stmts_in if _all_agents_human(stmt)]
    logger.info('%d statements after filter...' % len(stmts_out))
    if pkl_path:
        dump_statements(stmts_out, pkl_path)
    return stmts_out
def read_phosphosite(fname):
    """Read a PhosphoSite CSV table into Phosphorylation statements.

    Each row describes a kinase, a substrate and a modified site. Rows
    whose kinase has a non-human UniProt ID are skipped for statement
    extraction, but their substrate readout is still recorded in the
    antibody map because that happens before the kinase is examined.

    Parameters
    ----------
    fname : str
        Path of the CSV file. The columns read are ``SUB_ID``,
        ``SUB_GENE``, ``Actual_site``, ``phosphosite``, ``KIN_ID`` and
        ``KINASE_GENE_SYMBOL``.

    Returns
    -------
    statements : list
        One Phosphorylation statement per retained row.
    antibody_map : dict
        Maps each ``phosphosite`` value to a list of substrate Agents
        carrying the corresponding phosphorylation ModCondition, without
        duplicates.
    """
    df = pandas.read_csv(fname, index_col=None)
    statements = []
    antibody_map = {}
    for _, row in df.iterrows():
        # Ground the substrate, preferring the UniProt ID when present
        sub_upid = row['SUB_ID']
        if not pandas.isnull(sub_upid):
            sub_hgnc_symbol = uniprot_client.get_gene_name(sub_upid)
            sub_hgnc = hgnc_client.get_hgnc_id(sub_hgnc_symbol)
        else:
            sub_hgnc_symbol = row['SUB_GENE']
            # The original stored this in a differently named variable, so
            # the Agent below was built from an undefined or stale HGNC ID.
            sub_hgnc = hgnc_client.get_hgnc_id(sub_hgnc_symbol)
            sub_upid = hgnc_client.get_uniprot_id(sub_hgnc)
        sub = Agent(sub_hgnc_symbol,
                    db_refs={'UP': sub_upid, 'HGNC': sub_hgnc})

        # The site is a residue letter optionally followed by a position
        site = row['Actual_site']
        residue = site[0]
        position = site[1:] if len(site) > 1 else None

        sub_readout = deepcopy(sub)
        sub_readout.mods = [ModCondition('phosphorylation', residue,
                                         position)]

        # Record the readout under its phosphosite key unless an equivalent
        # one (same name, residue and position) is already there
        ps = row['phosphosite']
        readouts = antibody_map.setdefault(ps, [])
        already_known = any(
            p.name == sub.name and p.mods[0].residue == residue and
            p.mods[0].position == position
            for p in readouts
        )
        if not already_known:
            readouts.append(sub_readout)

        # Ground the kinase, skipping rows with a non-human kinase
        kin_upid = row['KIN_ID']
        if not pandas.isnull(kin_upid):
            if not uniprot_client.is_human(kin_upid):
                print('%s non human' % kin_upid)
                continue
            kin_hgnc_symbol = uniprot_client.get_gene_name(kin_upid)
            kin_hgnc = hgnc_client.get_hgnc_id(kin_hgnc_symbol)
        else:
            kin_hgnc_symbol = row['KINASE_GENE_SYMBOL']
            # Same variable-name fix as for the substrate above
            kin_hgnc = hgnc_client.get_hgnc_id(kin_hgnc_symbol)
            kin_upid = hgnc_client.get_uniprot_id(kin_hgnc)
        kin = Agent(kin_hgnc_symbol,
                    db_refs={'UP': kin_upid, 'HGNC': kin_hgnc})

        ev = Evidence(source_api='phosphosite')
        statements.append(
            Phosphorylation(kin, sub, residue, position, evidence=[ev]))
    return statements, antibody_map
def _human_only_filter(stmts_in): stmts_out = [] for st in stmts_in: agents = [a for a in st.agent_list() if a is not None] non_human = False for a in agents: hgnc_id = a.db_refs.get("HGNC") up_id = a.db_refs.get("UP") if not hgnc_id: if up_id and not uniprot_client.is_human(up_id): non_human = True break if not non_human: stmts_out.append(st) return stmts_out
def test_all_protein_db_refs():
    """Check that few human proteins in the model lack an HGNC mapping.

    Collects the distinct UniProt IDs of human proteins whose db_refs have
    no HGNC entry and asserts that their number stays below a fixed bound.
    """
    unmapped = set()
    for obj in bp.model.getObjects().toArray():
        bpe = bpc._cast_biopax_element(obj)
        if not bpc._is_protein(bpe):
            continue
        db_refs = bpc.BiopaxProcessor._get_db_refs(bpe)
        uniprot_id = db_refs.get('UP')
        if not uniprot_id or not uniprot_client.is_human(uniprot_id):
            continue
        if not db_refs.get('HGNC'):
            unmapped.add(uniprot_id)
    unmapped_uniprot_ids = sorted(unmapped)
    # The number of unmapped entries should not increase
    # so we check for an upper limit here
    assert len(unmapped_uniprot_ids) < 95
def get_grounding(self):
    """Return a (name space, identifier) pair describing this entity.

    The db_refs are consulted in the order FPLX, HGNC, UP. An HGNC ID is
    reported as its HGNC name. A human UP ID is reported as an HGNC gene
    name when one can be found; a non-human UP ID is reported as is.

    Returns
    -------
    tuple
        ``('FPLX', id)``, ``('HGNC', name)``, ``('UP', id)``, or
        ``(None, None)`` when no grounding could be determined.
    """
    import indra.databases.hgnc_client as hgc
    import indra.databases.uniprot_client as upc

    def _first(ref):
        # A grounding may be stored as a list; only its first entry is used
        return ref[0] if isinstance(ref, list) else ref

    famplex_id = self.db_refs.get('FPLX')
    if famplex_id:
        return ('FPLX', famplex_id)

    hgnc_ref = self.db_refs.get('HGNC')
    if hgnc_ref:
        return ('HGNC', hgc.get_hgnc_name(str(_first(hgnc_ref))))

    up_ref = self.db_refs.get('UP')
    if not up_ref:
        return (None, None)
    up_id = _first(up_ref)
    if not upc.is_human(up_id):
        return ('UP', up_id)
    gene_name = upc.get_gene_name(up_id, web_fallback=False)
    if gene_name:
        return ('HGNC', gene_name)
    return (None, None)
def is_non_human_protein(bio_ontology, node):
    """Return True if the node is a UP entry that is not a human protein.

    Parameters
    ----------
    bio_ontology
        Ontology object providing ``get_ns`` and ``get_id`` for a node.
    node
        The ontology node to examine.

    Returns
    -------
    bool
        True only for nodes in the UP name space whose ID the UniProt
        client does not report as human.
    """
    # Nodes outside the UP name space are never considered
    if bio_ontology.get_ns(node) != 'UP':
        return False
    return not uniprot_client.is_human(bio_ontology.get_id(node))
def test_not_is_human():
    """Check that UniProt entry P31938 is not classified as human."""
    classified_human = uniprot_client.is_human('P31938')
    assert not classified_human
def test_noentry_is_human():
    """Check that the unknown ID 'XXXX' is not classified as human."""
    classified_human = uniprot_client.is_human('XXXX')
    assert not classified_human
def test_is_human():
    """Check that UniProt entry P00533 is classified as human."""
    classified_human = uniprot_client.is_human('P00533')
    assert classified_human
def test_is_human():
    """Check that UniProt entry P00533 is classified as human."""
    classified_human = uniprot_client.is_human('P00533')
    assert classified_human
def test_not_is_human():
    """Check that UniProt entry P31938 is not classified as human."""
    classified_human = uniprot_client.is_human('P31938')
    assert not classified_human
def _get_db_refs(entity_term): agent_name = entity_term['text'] db_refs = {} for xr in entity_term['xrefs']: ns = xr['namespace'] if ns == 'uniprot': up_id = xr['id'] db_refs['UP'] = up_id # Look up official names in UniProt gene_name = up_client.get_gene_name(up_id) if gene_name is not None: agent_name = gene_name # If the gene name corresponds to an HGNC ID, add it to the # db_refs if up_client.is_human(up_id): hgnc_id = hgnc_client.get_hgnc_id(gene_name) if hgnc_id: db_refs['HGNC'] = hgnc_id elif ns == 'hgnc': hgnc_id = xr['id'] db_refs['HGNC'] = hgnc_id # Look up the standard gene symbol and set as name hgnc_name = hgnc_client.get_hgnc_name(hgnc_id) if hgnc_name: agent_name = hgnc_name # Look up the corresponding uniprot id up_id = hgnc_client.get_uniprot_id(hgnc_id) if up_id: db_refs['UP'] = up_id elif ns == 'pfam': be_id = famplex_map.get(('PF', xr['id'])) if be_id: db_refs['FPLX'] = be_id agent_name = be_id db_refs['PF'] = xr['id'] elif ns == 'interpro': be_id = famplex_map.get(('IP', xr['id'])) if be_id: db_refs['FPLX'] = be_id agent_name = be_id db_refs['IP'] = xr['id'] elif ns == 'chebi': db_refs['CHEBI'] = xr['id'] elif ns == 'pubchem': db_refs['PUBCHEM'] = xr['id'] elif ns == 'go': db_refs['GO'] = xr['id'] elif ns == 'mesh': db_refs['MESH'] = xr['id'] elif ns == 'hmdb': db_refs['HMDB'] = xr['id'] elif ns == 'simple_chemical': if xr['id'].startswith('HMDB'): db_refs['HMDB'] = xr['id'] elif ns == 'be': db_refs['FPLX'] = xr['id'] agent_name = db_refs['FPLX'] # These name spaces are ignored elif ns in ['uaz']: pass else: logger.warning('Unhandled xref namespace: %s' % ns) db_refs['TEXT'] = entity_term['text'] return agent_name, db_refs
def _get_agent(node_data, node_modifier_data=None):
    """Return an Agent built from BEL node data, or None if unhandled.

    Parameters
    ----------
    node_data : dict
        Node data keyed by the constants in ``pc`` (function, name space,
        name, identifier, members, fusion).
    node_modifier_data : Optional[dict]
        Modifier data attached to the node, used to derive the activity
        condition and the translocation target.

    Returns
    -------
    Agent or None
        The agent for the node. None is returned for unhandled node
        types, fusions, complexes without usable members, nodes whose
        grounding cannot be determined, and nodes with unhandled
        modifiers.

    Raises
    ------
    AssertionError
        If the node has neither an identifier nor a name, or if a UniProt
        identifier has no gene name.
    ValueError
        If an MGI or RGD node is given by identifier rather than by name.
    """
    # FIXME: Handle translocations on the agent for ActiveForms, turn into
    # location conditions
    # Check the node type/function
    node_func = node_data[pc.FUNCTION]
    if node_func not in (pc.PROTEIN, pc.RNA, pc.BIOPROCESS, pc.COMPLEX,
                         pc.PATHOLOGY, pc.ABUNDANCE, pc.MIRNA):
        mod_data = ('No node data' if not node_modifier_data
                    else node_modifier_data.get(pc.CNAME))
        logger.info("Nodes of type %s not handled: %s" %
                    (node_func, mod_data))
        return None
    # Skip gene/protein fusions
    if pc.FUSION in node_data:
        logger.info("Gene and protein fusions not handled: %s" %
                    str(node_data))
        return None
    # COMPLEXES ------------
    # First, handle complexes, which will consist recursively of other agents
    if node_func == pc.COMPLEX:
        # First, check for members: if there are no members, we assume this
        # is a named complex. An empty member list is treated the same way
        # so that members[0] below cannot raise an IndexError.
        members = node_data.get(pc.MEMBERS)
        if not members:
            return None
        # Otherwise, get the "main" agent, to which the other members will be
        # attached as bound conditions
        main_agent = _get_agent(members[0])
        # If we can't get the main agent, return None
        if main_agent is None:
            return None
        bound_conditions = [BoundCondition(_get_agent(m), True)
                            for m in members[1:]]
        # Check the bound_conditions for any None agents
        if any(bc.agent is None for bc in bound_conditions):
            return None
        main_agent.bound_conditions = bound_conditions
        # Get activity of main agent
        ac = _get_activity_condition(node_modifier_data)
        main_agent.activity = ac
        return main_agent
    # OTHER NODE TYPES -----
    # Get node identifier information
    name = node_data.get(pc.NAME)
    ns = node_data[pc.NAMESPACE]
    ident = node_data.get(pc.IDENTIFIER)
    # No ID present, get identifier using the name, namespace
    db_refs = None
    if not ident:
        # NOTE: assert statements are removed when Python runs with -O
        assert name, "Node must have a name if lacking an identifier."
        if ns == 'HGNC':
            hgnc_id = hgnc_client.get_hgnc_id(name)
            if not hgnc_id:
                logger.info("Invalid HGNC name: %s (%s)" % (name, node_data))
                return None
            db_refs = {'HGNC': hgnc_id}
            up_id = _get_up_id(hgnc_id)
            if up_id:
                db_refs['UP'] = up_id
        # FIXME: Look up go ID in ontology lookup service
        # FIXME: Look up MESH IDs from name
        # FIXME: For now, just use node name
        elif ns in ('GOBP', 'MESHPP', 'MESHD'):
            db_refs = {}
        # For now, handle MGI/RGD but putting the name into the db_refs so
        # it's clear what namespace the name belongs to
        # FIXME: Full implementation would look up MGI/RGD identifiers from
        # the names, and obtain corresponding Uniprot IDs
        elif ns in ('MGI', 'RGD'):
            db_refs = {ns: name}
        # Map Selventa families to FamPlexes
        elif ns == 'SFAM':
            db_refs = {'SFAM': name}
            indra_name = bel_to_indra.get(name)
            if indra_name is None:
                logger.info('Could not find mapping for BEL/SFAM family: '
                            '%s (%s)' % (name, node_data))
            else:
                db_refs['FPLX'] = indra_name
                name = indra_name
        # Map Entrez genes to HGNC/UP
        elif ns == 'EGID':
            hgnc_id = hgnc_client.get_hgnc_from_entrez(name)
            db_refs = {'EGID': name}
            if hgnc_id is not None:
                db_refs['HGNC'] = hgnc_id
                name = hgnc_client.get_hgnc_name(hgnc_id)
                up_id = hgnc_client.get_uniprot_id(hgnc_id)
                if up_id:
                    db_refs['UP'] = up_id
                else:
                    logger.info('HGNC entity %s with HGNC ID %s has no '
                                'corresponding Uniprot ID.' %
                                (name, hgnc_id))
            else:
                logger.info('Could not map EGID%s to HGNC.' % name)
                name = 'E%s' % name
        # CHEBI
        elif ns == 'CHEBI':
            chebi_id = chebi_name_id.get(name)
            if chebi_id:
                db_refs = {'CHEBI': chebi_id}
            else:
                logger.info('CHEBI name %s not found in map.' % name)
        # SDIS, SCHEM: Include the name as the ID for the namespace
        elif ns in ('SDIS', 'SCHEM'):
            db_refs = {ns: name}
        else:
            # Reported through the logger like every other diagnostic here
            logger.info("Unhandled namespace: %s: %s (%s)" %
                        (ns, name, node_data))
    # We've already got an identifier, look up other identifiers if necessary
    else:
        # Get the name, overwriting existing name if necessary
        if ns == 'HGNC':
            name = hgnc_client.get_hgnc_name(ident)
            db_refs = {'HGNC': ident}
            up_id = _get_up_id(ident)
            if up_id:
                db_refs['UP'] = up_id
        elif ns == 'UP':
            db_refs = {'UP': ident}
            name = uniprot_client.get_gene_name(ident)
            assert name
            if uniprot_client.is_human(ident):
                hgnc_id = hgnc_client.get_hgnc_id(name)
                if not hgnc_id:
                    logger.info('Uniprot ID linked to invalid human gene '
                                'name %s' % name)
                else:
                    db_refs['HGNC'] = hgnc_id
        elif ns in ('MGI', 'RGD'):
            raise ValueError('Identifiers for MGI and RGD databases are not '
                             'currently handled: %s' % node_data)
        else:
            logger.info("Unhandled namespace with identifier: %s: %s (%s)" %
                        (ns, name, node_data))
    # db_refs is still None when no branch above produced a grounding
    if db_refs is None:
        logger.info('Unable to get identifier information for node: %s' %
                    node_data)
        return None
    # Get modification conditions
    mods, muts = _get_all_pmods(node_data)
    # Get activity condition
    ac = _get_activity_condition(node_modifier_data)
    to_loc = _get_translocation_target(node_modifier_data)
    # Check for unhandled node modifiers, skip if so
    if _has_unhandled_modifiers(node_modifier_data):
        return None
    # Make the agent
    ag = Agent(name, db_refs=db_refs, mods=mods, mutations=muts, activity=ac,
               location=to_loc)
    return ag
def test_noentry_is_human():
    """Check that the unknown ID 'XXXX' is not classified as human."""
    classified_human = uniprot_client.is_human('XXXX')
    assert not classified_human
def get_agent(node_data, node_modifier_data=None):
    """Return an Agent built from BEL node data, or None if unhandled.

    Parameters
    ----------
    node_data : dict
        Node data keyed by the constants in ``pc`` (function, name space,
        name, identifier, members, fusion).
    node_modifier_data : Optional[dict]
        Modifier data attached to the node, used to derive the activity
        condition and the translocation target.

    Returns
    -------
    Agent or None
        The agent for the node. None is returned for unhandled node
        types, fusions, complexes without usable members, nodes whose
        grounding cannot be determined, and nodes with unhandled
        modifiers.

    Raises
    ------
    AssertionError
        If the node has neither an identifier nor a name, or if a UniProt
        identifier has no gene name.
    ValueError
        If an MGI or RGD node is given by identifier rather than by name.
    """
    # FIXME: Handle translocations on the agent for ActiveForms, turn into
    # location conditions
    # Check the node type/function
    node_func = node_data[pc.FUNCTION]
    if node_func not in (pc.PROTEIN, pc.RNA, pc.BIOPROCESS, pc.COMPLEX,
                         pc.PATHOLOGY, pc.ABUNDANCE, pc.MIRNA):
        mod_data = node_modifier_data or 'No node data'
        logger.info("Nodes of type %s not handled: %s", node_func, mod_data)
        return None
    # Skip gene/protein fusions
    if pc.FUSION in node_data:
        logger.info("Gene and protein fusions not handled: %s" %
                    str(node_data))
        return None
    # COMPLEXES ------------
    # First, handle complexes, which will consist recursively of other agents
    if node_func == pc.COMPLEX:
        # First, check for members: if there are no members, we assume this
        # is a named complex. An empty member list is treated the same way
        # so that members[0] below cannot raise an IndexError.
        members = node_data.get(pc.MEMBERS)
        if not members:
            return None
        # Otherwise, get the "main" agent, to which the other members will be
        # attached as bound conditions
        main_agent = get_agent(members[0])
        # If we can't get the main agent, return None
        if main_agent is None:
            return None
        bound_conditions = [BoundCondition(get_agent(m), True)
                            for m in members[1:]]
        # Check the bound_conditions for any None agents
        if any(bc.agent is None for bc in bound_conditions):
            return None
        main_agent.bound_conditions = bound_conditions
        # Get activity of main agent
        ac = _get_activity_condition(node_modifier_data)
        main_agent.activity = ac
        return main_agent
    # OTHER NODE TYPES -----
    # Get node identifier information
    name = node_data.get(pc.NAME)
    ns = node_data[pc.NAMESPACE]
    ident = node_data.get(pc.IDENTIFIER)
    # No ID present, get identifier using the name, namespace
    db_refs = None
    if not ident:
        # NOTE: assert statements are removed when Python runs with -O
        assert name, "Node must have a name if lacking an identifier."
        if ns == 'HGNC':
            hgnc_id = hgnc_client.get_hgnc_id(name)
            if not hgnc_id:
                logger.info("Invalid HGNC name: %s (%s)" % (name, node_data))
                return None
            db_refs = {'HGNC': hgnc_id}
            up_id = _get_up_id(hgnc_id)
            if up_id:
                db_refs['UP'] = up_id
        # FIXME: Look up go ID in ontology lookup service
        # FIXME: Look up MESH IDs from name
        # FIXME: For now, just use node name
        elif ns in ('GOBP', 'MESHPP', 'MESHD'):
            db_refs = {}
        # For now, handle MGI/RGD but putting the name into the db_refs so
        # it's clear what namespace the name belongs to
        # FIXME: Full implementation would look up MGI/RGD identifiers from
        # the names, and obtain corresponding Uniprot IDs
        elif ns in ('MGI', 'RGD'):
            db_refs = {ns: name}
        # Map Selventa families to FamPlexes
        elif ns == 'SFAM':
            db_refs = {'SFAM': name}
            indra_name = bel_to_indra.get(name)
            if indra_name is None:
                logger.info('Could not find mapping for BEL/SFAM family: '
                            '%s (%s)' % (name, node_data))
            else:
                db_refs['FPLX'] = indra_name
                name = indra_name
        # Map Entrez genes to HGNC/UP
        elif ns == 'EGID':
            hgnc_id = hgnc_client.get_hgnc_from_entrez(name)
            db_refs = {'EGID': name}
            if hgnc_id is not None:
                db_refs['HGNC'] = hgnc_id
                name = hgnc_client.get_hgnc_name(hgnc_id)
                up_id = hgnc_client.get_uniprot_id(hgnc_id)
                if up_id:
                    db_refs['UP'] = up_id
                else:
                    logger.info('HGNC entity %s with HGNC ID %s has no '
                                'corresponding Uniprot ID.', name, hgnc_id)
            else:
                logger.info('Could not map EGID%s to HGNC.' % name)
                name = 'E%s' % name
        # CHEBI
        elif ns == 'CHEBI':
            chebi_id = chebi_name_id.get(name)
            if chebi_id:
                db_refs = {'CHEBI': chebi_id}
            else:
                logger.info('CHEBI name %s not found in map.' % name)
        # SDIS, SCHEM: Include the name as the ID for the namespace
        elif ns in ('SDIS', 'SCHEM'):
            db_refs = {ns: name}
        else:
            # Reported through the logger like every other diagnostic here
            logger.info("Unhandled namespace: %s: %s (%s)" %
                        (ns, name, node_data))
    # We've already got an identifier, look up other identifiers if necessary
    else:
        # Get the name, overwriting existing name if necessary
        if ns == 'HGNC':
            name = hgnc_client.get_hgnc_name(ident)
            db_refs = {'HGNC': ident}
            up_id = _get_up_id(ident)
            if up_id:
                db_refs['UP'] = up_id
        elif ns == 'UP':
            db_refs = {'UP': ident}
            name = uniprot_client.get_gene_name(ident)
            assert name
            if uniprot_client.is_human(ident):
                hgnc_id = hgnc_client.get_hgnc_id(name)
                if not hgnc_id:
                    logger.info('Uniprot ID linked to invalid human gene '
                                'name %s' % name)
                else:
                    db_refs['HGNC'] = hgnc_id
        elif ns in ('MGI', 'RGD'):
            raise ValueError('Identifiers for MGI and RGD databases are not '
                             'currently handled: %s' % node_data)
        else:
            logger.info("Unhandled namespace with identifier: %s: %s (%s)" %
                        (ns, name, node_data))
    # db_refs is still None when no branch above produced a grounding
    if db_refs is None:
        logger.info('Unable to get identifier information for node: %s',
                    node_data)
        return None
    # Get modification conditions
    mods, muts = _get_all_pmods(node_data)
    # Get activity condition
    ac = _get_activity_condition(node_modifier_data)
    to_loc = _get_translocation_target(node_modifier_data)
    # Check for unhandled node modifiers, skip if so
    if _has_unhandled_modifiers(node_modifier_data):
        return None
    # Make the agent
    ag = Agent(name, db_refs=db_refs, mods=mods, mutations=muts, activity=ac,
               location=to_loc)
    return ag
def get_db_refs_by_name(ns, name, node_data):
    """Return standard name and grounding based on a namespace and a name.

    Parameters
    ----------
    ns : str
        A name space in which the given name is interpreted.
    name : str
        The name in the given name space to get grounding for.
    node_data : dict
        Node data for logging purposes.

    Returns
    -------
    name : str
        The standardized name for the given entity.
    db_refs : dict or None
        The grounding for the given entity, or None if the name could not
        be grounded or the name space is not handled. A two-element
        result is returned on every path so callers can always unpack it.
    """
    db_refs = None
    if ns == 'HGNC':
        hgnc_id = hgnc_client.get_hgnc_id(name)
        if not hgnc_id:
            logger.info("Invalid HGNC name: %s (%s)" % (name, node_data))
            return name, None
        db_refs = {'HGNC': hgnc_id}
        up_id = _get_up_id(hgnc_id)
        if up_id:
            db_refs['UP'] = up_id
        mirbase_id = mirbase_client.get_mirbase_id_from_hgnc_id(hgnc_id)
        if mirbase_id:
            db_refs['MIRBASE'] = mirbase_id
    elif ns in ('UNIPROT', 'UP'):
        # The name is tried as a UniProt ID first, then as a mnemonic
        up_id = None
        gene_name = uniprot_client.get_gene_name(name)
        if gene_name:
            up_id = name
        else:
            up_id_from_mnem = uniprot_client.get_id_from_mnemonic(name)
            if up_id_from_mnem:
                up_id = up_id_from_mnem
                gene_name = uniprot_client.get_gene_name(up_id)
        if not up_id:
            logger.info("Couldn't get UP ID from %s" % name)
            return name, None
        db_refs = {'UP': up_id}
        if uniprot_client.is_human(up_id):
            hgnc_id = hgnc_client.get_hgnc_id(gene_name)
            if not hgnc_id:
                logger.info('Uniprot ID linked to invalid human gene '
                            'name %s' % name)
            else:
                db_refs['HGNC'] = hgnc_id
    elif ns == 'FPLX':
        db_refs = {'FPLX': name}
    elif ns in ('GO', 'GOBP', 'GOCC'):
        go_id = go_client.get_go_id_from_label(name)
        if not go_id:
            logger.info('Could not find GO ID for %s' % name)
            return name, None
        db_refs = {'GO': go_id}
    elif ns in ('MESHPP', 'MESHD', 'MESH'):
        mesh_id = mesh_client.get_mesh_id_name(name)
        # NOTE(review): get_mesh_id_name appears to return an (id, name)
        # pair rather than a bare ID; a non-empty tuple is always truthy,
        # which would defeat the check below and store the tuple as the
        # grounding. Handle both shapes -- confirm against mesh_client.
        if isinstance(mesh_id, tuple):
            mesh_id = mesh_id[0]
        if not mesh_id:
            logger.info('Could not find MESH ID for %s' % name)
            return name, None
        db_refs = {'MESH': mesh_id}
    # For now, handle MGI/RGD but putting the name into the db_refs so
    # it's clear what namespace the name belongs to
    # FIXME: Full implementation would look up MGI/RGD identifiers from
    # the names, and obtain corresponding Uniprot IDs
    elif ns in ('MGI', 'RGD'):
        db_refs = {ns: name}
    # Map Selventa families to FamPlexes
    elif ns == 'SFAM':
        db_refs = {'SFAM': name}
        indra_name = bel_to_indra.get(name)
        if indra_name is None:
            logger.info('Could not find mapping for BEL/SFAM family: '
                        '%s (%s)' % (name, node_data))
        else:
            db_refs['FPLX'] = indra_name
            name = indra_name
    # Map Entrez genes to HGNC/UP
    elif ns in ('EGID', 'ENTREZ', 'NCBIGENE'):
        hgnc_id = hgnc_client.get_hgnc_from_entrez(name)
        db_refs = {'EGID': name}
        if hgnc_id is not None:
            db_refs['HGNC'] = hgnc_id
            name = hgnc_client.get_hgnc_name(hgnc_id)
            up_id = hgnc_client.get_uniprot_id(hgnc_id)
            if up_id:
                db_refs['UP'] = up_id
            else:
                logger.info('HGNC entity %s with HGNC ID %s has no '
                            'corresponding Uniprot ID.', name, hgnc_id)
            mirbase_id = mirbase_client.get_mirbase_id_from_hgnc_id(hgnc_id)
            if mirbase_id:
                db_refs['MIRBASE'] = mirbase_id
        else:
            logger.info('Could not map EGID%s to HGNC.' % name)
            name = 'E%s' % name
    elif ns == 'MIRBASE':
        mirbase_id = mirbase_client.get_mirbase_id_from_mirbase_name(name)
        if not mirbase_id:
            logger.info('Could not map miRBase name %s to ID', name)
            # Previously a bare return, which made tuple unpacking of the
            # result fail for unknown miRBase names
            return name, None
        db_refs = {'MIRBASE': mirbase_id}
        hgnc_id = mirbase_client.get_hgnc_id_from_mirbase_id(mirbase_id)
        if hgnc_id:
            db_refs['HGNC'] = hgnc_id
    # CHEBI
    elif ns == 'CHEBI':
        # Try the local name map first, then the ChEBI client
        chebi_id = chebi_name_id.get(name)
        if not chebi_id:
            chebi_id = chebi_client.get_chebi_id_from_name(name)
        if chebi_id:
            db_refs = {'CHEBI': chebi_id}
        else:
            logger.info('CHEBI name %s not found in map.' % name)
    # SDIS, SCHEM: Include the name as the ID for the namespace
    elif ns in ('SDIS', 'SCHEM'):
        db_refs = {ns: name}
    else:
        logger.info("Unhandled namespace: %s: %s (%s)" % (ns, name,
                                                          node_data))
    return name, db_refs
def is_non_human_protein(bio_ontology, node):
    """Return True if the given ontology node is a non-human protein.

    Only nodes in the UP name space are examined; for those, the result
    is the negation of the UniProt client's human check on the node ID.
    """
    # Anything outside the UP name space is not treated as a protein here
    if bio_ontology.get_ns(node) != 'UP':
        return False
    return not uniprot_client.is_human(bio_ontology.get_id(node))
def standardize_db_refs(db_refs):
    """Return a standardized db refs dict for a given db refs dict.

    The dict is updated in place and also returned. Protein groundings
    (UP, HGNC) are completed from each other, and chemical groundings
    (PUBCHEM, CHEBI, HMDB) are completed where a mapping exists.
    Existing chemical IDs are never overwritten; inconsistencies are only
    logged at debug level.

    Parameters
    ----------
    db_refs : dict
        A dict of db refs that may not be standardized, i.e., may be
        missing an available UP ID corresponding to an existing HGNC ID.

    Returns
    -------
    dict
        The db_refs dict with standardized entries.
    """
    def _with_chebi_prefix(mapped_id):
        # Mapped ChEBI IDs may come back without the CHEBI: prefix
        if mapped_id and not mapped_id.startswith('CHEBI:'):
            return 'CHEBI:%s' % mapped_id
        return mapped_id

    uniprot = db_refs.get('UP')
    hgnc = db_refs.get('HGNC')
    # With a human UP ID and no HGNC ID, go through the gene name to find
    # an HGNC ID
    if uniprot and not hgnc and uniprot_client.is_human(uniprot):
        symbol = uniprot_client.get_gene_name(uniprot, False)
        if symbol:
            hgnc = hgnc_client.get_hgnc_id(symbol)
            if hgnc:
                db_refs['HGNC'] = hgnc
    # Otherwise, with an HGNC ID available, look up the UP ID mapped to it
    elif hgnc:
        uniprot_from_hgnc = hgnc_client.get_uniprot_id(hgnc)
        if uniprot_from_hgnc:
            conflicting = uniprot and uniprot != uniprot_from_hgnc
            if not conflicting:
                # No conflict, so the UP entry can simply be set
                db_refs['UP'] = uniprot_from_hgnc
            elif ', ' not in uniprot_from_hgnc:
                # A single mapped ID wins over the conflicting original. A
                # comma-separated list of IDs is skipped, keeping the
                # original UP ID.
                logger.debug(
                    'Inconsistent groundings UP:%s not equal to '
                    'UP:%s mapped from HGNC:%s, standardizing to '
                    'UP:%s' % (uniprot, uniprot_from_hgnc, hgnc,
                               uniprot_from_hgnc))
                db_refs['UP'] = uniprot_from_hgnc

    # Now try to improve chemical groundings
    pubchem = db_refs.get('PUBCHEM')
    chebi = db_refs.get('CHEBI')
    hmdb = db_refs.get('HMDB')
    chebi_from_pubchem = None
    pubchem_from_chebi = None
    chebi_from_hmdb = None
    if pubchem:
        chebi_from_pubchem = _with_chebi_prefix(
            chebi_client.get_chebi_id_from_pubchem(pubchem))
    if chebi:
        pubchem_from_chebi = chebi_client.get_pubchem_id(chebi)
    if hmdb:
        chebi_from_hmdb = _with_chebi_prefix(
            chebi_client.get_chebi_id_from_hmdb(hmdb))

    # Originals are always kept when both are present; inconsistencies are
    # only reported. At most one of the branches below applies.
    if pubchem and chebi and pubchem_from_chebi and \
            pubchem != pubchem_from_chebi:
        logger.debug(
            'Inconsistent groundings PUBCHEM:%s not equal to '
            'PUBCHEM:%s mapped from %s, standardizing to '
            'PUBCHEM:%s.' % (pubchem, pubchem_from_chebi, chebi, pubchem))
    elif pubchem and chebi and chebi_from_pubchem and \
            chebi != chebi_from_pubchem:
        logger.debug(
            'Inconsistent groundings %s not equal to '
            '%s mapped from PUBCHEM:%s, standardizing to '
            '%s.' % (chebi, chebi_from_pubchem, pubchem, chebi))
    # PUBCHEM without CHEBI: add the mapped CHEBI ID
    elif pubchem and not chebi and chebi_from_pubchem:
        db_refs['CHEBI'] = chebi_from_pubchem
    elif hmdb and chebi and chebi_from_hmdb and chebi_from_hmdb != chebi:
        logger.debug(
            'Inconsistent groundings %s not equal to '
            '%s mapped from %s, standardizing to '
            '%s.' % (chebi, chebi_from_hmdb, hmdb, chebi))
    # HMDB without CHEBI: add the mapped CHEBI ID
    elif hmdb and not chebi and chebi_from_hmdb:
        db_refs['CHEBI'] = chebi_from_hmdb
    # CHEBI without PUBCHEM: add the mapped PUBCHEM ID
    elif chebi and not pubchem and pubchem_from_chebi:
        db_refs['PUBCHEM'] = pubchem_from_chebi
    # Otherwise there is no useful mapping that we can add and no
    # further conflict to resolve.
    return db_refs