def standardize_agent_db_refs(agent, map_db_refs, do_rename=True):
    """Reconcile the UP/HGNC entries of a grounding record and apply it.

    The 'HGNC' entry of ``map_db_refs`` is read as a gene symbol. Where a
    lookup succeeds it is overwritten, in place, with the HGNC ID, and a
    missing 'UP' or 'HGNC' entry is filled in from the other one. The
    (mutated) ``map_db_refs`` is then assigned to ``agent.db_refs``.

    Parameters
    ----------
    agent : object
        Object whose ``db_refs`` (and optionally ``name``) attribute is set.
    map_db_refs : dict
        Grounding entries; modified in place.
    do_rename : bool
        If True, ``agent.name`` is set to the 'FPLX' entry if there is one,
        otherwise to the HGNC symbol, otherwise to the gene name found via
        UniProt. Default: True

    Raises
    ------
    ValueError
        If only a symbol is given and it has no HGNC ID, or if both a
        UniProt ID and a symbol are given and the UniProt gene name is
        missing or differs from the symbol.
    """
    gene_name = None
    up_id = map_db_refs.get('UP')
    hgnc_sym = map_db_refs.get('HGNC')

    if up_id and not hgnc_sym:
        # Only a UniProt ID: derive the HGNC ID through the gene name.
        gene_name = uniprot_client.get_gene_name(up_id, False)
        hgnc_id = hgnc_client.get_hgnc_id(gene_name) if gene_name else None
        if hgnc_id:
            map_db_refs['HGNC'] = hgnc_id
    elif hgnc_sym and not up_id:
        # Only a symbol: it must resolve to an HGNC ID, which replaces it.
        hgnc_id = hgnc_client.get_hgnc_id(hgnc_sym)
        if not hgnc_id:
            raise ValueError('No HGNC ID corresponding to gene '
                             'symbol %s in grounding map.' % hgnc_sym)
        map_db_refs['HGNC'] = hgnc_id
        up_id = hgnc_client.get_uniprot_id(hgnc_id)
        if up_id:
            map_db_refs['UP'] = up_id
    elif up_id and hgnc_sym:
        # Both given: the UniProt gene name has to agree with the symbol.
        gene_name = uniprot_client.get_gene_name(up_id)
        if not gene_name:
            raise ValueError('No gene name found for Uniprot '
                             'ID %s (expected %s)' % (up_id, hgnc_sym))
        if gene_name != hgnc_sym:
            raise ValueError('Gene name %s for Uniprot ID '
                             '%s does not match HGNC '
                             'symbol %s given in grounding '
                             'map.' % (gene_name, up_id, hgnc_sym))
        hgnc_id = hgnc_client.get_hgnc_id(hgnc_sym)
        if hgnc_id:
            map_db_refs['HGNC'] = hgnc_id
        else:
            # Unlike the symbol-only case this is logged, not raised.
            logger.error('No HGNC ID corresponding to gene '
                         'symbol %s in grounding map.' % hgnc_sym)

    # The agent receives the very same dict object that was passed in.
    agent.db_refs = map_db_refs

    if not do_rename:
        return
    # Name priority: FamPlex ID, then the given symbol, then the gene name.
    if agent.db_refs.get('FPLX'):
        agent.name = agent.db_refs.get('FPLX')
    elif hgnc_sym is not None:
        agent.name = hgnc_sym
    elif gene_name is not None:
        agent.name = gene_name
    return
def test_query_protein_deprecated():
    """Check that 'Q8NHX1' can be queried and maps to gene name 'MAPK3'.

    The gene name is looked up twice: once with default arguments and once
    with ``web_fallback=False``; both must give the same unicode result.
    """
    up_id = 'Q8NHX1'
    assert uniprot_client.query_protein(up_id) is not None
    for lookup_kwargs in ({}, {'web_fallback': False}):
        gene_name = uniprot_client.get_gene_name(up_id, **lookup_kwargs)
        assert gene_name == 'MAPK3'
        assert unicode_strs(gene_name)
def read_phosphosite(fname):
    """Read a kinase-substrate CSV into Phosphorylation statements.

    Parameters
    ----------
    fname : str or file-like
        Anything accepted by ``pandas.read_csv``. The columns read are
        'SUB_ID', 'SUB_GENE', 'Actual_site', 'phosphosite', 'KIN_ID' and
        'KINASE_GENE_SYMBOL'.

    Returns
    -------
    statements : list
        One Phosphorylation statement per row that is not skipped.
    antibody_map : dict
        Maps each 'phosphosite' value to a list of substrate Agents carrying
        the phosphorylation as a ModCondition, without duplicates (same
        name, residue and position).
    """
    df = pandas.read_csv(fname, index_col=None)
    statements = []
    antibody_map = {}
    for _, row in df.iterrows():
        # Substrate: prefer the UniProt ID; fall back to the gene symbol.
        sub_upid = row['SUB_ID']
        if not pandas.isnull(sub_upid):
            sub_hgnc_symbol = uniprot_client.get_gene_name(sub_upid)
            sub_hgnc = hgnc_client.get_hgnc_id(sub_hgnc_symbol)
        else:
            sub_hgnc_symbol = row['SUB_GENE']
            # Fixed: this used to be stored in `sub_hgnc_id` while the Agent
            # below read `sub_hgnc` (unbound, or left over from an earlier
            # row).
            sub_hgnc = hgnc_client.get_hgnc_id(sub_hgnc_symbol)
            sub_upid = hgnc_client.get_uniprot_id(sub_hgnc)
        sub = Agent(sub_hgnc_symbol,
                    db_refs={'UP': sub_upid, 'HGNC': sub_hgnc})

        # First character is the residue, the rest (if any) the position.
        site = row['Actual_site']
        residue = site[0]
        position = site[1:] if len(site) > 1 else None

        sub_readout = deepcopy(sub)
        sub_readout.mods = [ModCondition('phosphorylation', residue,
                                         position)]
        readouts = antibody_map.setdefault(row['phosphosite'], [])
        already_listed = any(
            p.name == sub.name and p.mods[0].residue == residue and
            p.mods[0].position == position
            for p in readouts)
        if not already_listed:
            readouts.append(sub_readout)

        # Kinase: same strategy, but rows with a non-human kinase ID are
        # skipped (after the antibody map was updated).
        kin_upid = row['KIN_ID']
        if not pandas.isnull(kin_upid):
            if not uniprot_client.is_human(kin_upid):
                print('%s non human' % kin_upid)
                continue
            kin_hgnc_symbol = uniprot_client.get_gene_name(kin_upid)
            kin_hgnc = hgnc_client.get_hgnc_id(kin_hgnc_symbol)
        else:
            kin_hgnc_symbol = row['KINASE_GENE_SYMBOL']
            # Fixed: same `kin_hgnc_id` / `kin_hgnc` mismatch as above.
            kin_hgnc = hgnc_client.get_hgnc_id(kin_hgnc_symbol)
            kin_upid = hgnc_client.get_uniprot_id(kin_hgnc)
        kin = Agent(kin_hgnc_symbol,
                    db_refs={'UP': kin_upid, 'HGNC': kin_hgnc})

        ev = Evidence(source_api='phosphosite')
        statements.append(Phosphorylation(kin, sub, residue, position,
                                          evidence=[ev]))
    return statements, antibody_map
def get_all_gene_names(data):
    """Return the sorted, valid gene names found in the antibody table.

    Parameters
    ----------
    data : dict
        ``data['antibody']`` must provide the parallel columns 'Gene Name'
        and 'UniProt ID'; each cell may hold several comma-separated
        entries.

    Returns
    -------
    list of str
        Union of the listed gene names and the gene names derived from the
        UniProt IDs, minus the names for which no HGNC ID was found.
    """
    gene_names = data['antibody']['Gene Name']
    uniprot_ids = data['antibody']['UniProt ID']
    all_genes = set()
    invalid_genes = set()
    for gn, upid in zip(gene_names, uniprot_ids):
        # Some entries are lists separated by commas, with stray spaces
        names = [x.strip() for x in gn.split(',')]
        ids = [x.strip() for x in upid.split(',')]
        names_from_ids = [uniprot_client.get_gene_name(x) for x in ids]
        # Find invalid gene names
        for name in names:
            if not hgnc_client.get_hgnc_id(name):
                print('Invalid or deprecated gene symbol: %s' % name)
                invalid_genes.add(name)
        # Find inconsistent gene names and UniProt IDs
        if set(names) != set(names_from_ids):
            print('Inconsistent entries:')
            print('- Given gene names: %s' % ','.join(names))
            # A failed lookup gives None, which str.join cannot handle.
            print('- Genes from uniprot IDs: %s' %
                  ','.join(str(n) for n in names_from_ids))
        # Collect both kinds of names; failed lookups are left out so the
        # final sort never has to compare None with str.
        all_genes.update(names)
        all_genes.update(n for n in names_from_ids if n)
    # Finally remove the invalid gene names
    return sorted(all_genes - invalid_genes)
def _get_complex_agents(self, complex_id):
    """Return one Agent per constituent of a SIGNOR complex.

    Each component is first treated as a UniProt ID. If a gene name is found
    the agent is grounded to 'UP' (plus 'HGNC' when available), otherwise to
    'SIGNOR'. A FamPlex mapping, if present, is added as 'FPLX'.
    """
    agents = []
    for component in self._recursively_lookup_complex(complex_id):
        name = uniprot_client.get_gene_name(component)
        if name is not None:
            db_refs = {'UP': component}
            hgnc_id = hgnc_client.get_hgnc_id(name)
            if hgnc_id:
                db_refs['HGNC'] = hgnc_id
        else:
            db_refs = {'SIGNOR': component}

        signor_key = ('SIGNOR', component)
        if signor_key in famplex_map:
            db_refs['FPLX'] = famplex_map[signor_key]
            # The FamPlex ID is only a name of last resort
            if not name:
                name = db_refs['FPLX']
        elif not name:
            # Neither UniProt nor FamPlex: fall back to the SIGNOR ID
            logger.info('Have neither a Uniprot nor Famplex grounding ' +
                        'for ' + component)
            name = db_refs['SIGNOR']

        assert name is not None
        agents.append(Agent(name, db_refs=db_refs))
    return agents
def _get_agent_from_ref(self, ref):
    """Build an Agent from a ``ref`` XML element, or return None.

    None is returned for elements whose 'category' attribute is
    'collection' and for elements without a ``var`` child named 'name'.
    A 'uid' of the form 'UP:<mnemonic>' is resolved to a UniProt ID, which
    is stored under 'UP'; the matching gene name, if found, replaces the
    name taken from the XML.
    """
    # TODO: handle collections
    if ref.attrib.get('category') == 'collection':
        logger.warning('Skipping collection Agent.')
        return None

    name_tag = ref.find("var/[@name='name']")
    if name_tag is None:
        return None
    name = name_tag.text

    uid_tag = ref.find("var/[@name='uid']")
    uid = None if uid_tag is None else uid_tag.text

    db_refs = {}
    text_tag = ref.find("var/[@name='raw-text']")
    if text_tag is not None:
        db_refs['TEXT'] = text_tag.text

    if uid is not None and uid.startswith('UP:'):
        # Everything after the 'UP:' prefix is the UniProt mnemonic
        up_id = uniprot_client.get_id_from_mnemonic(uid[3:])
        if up_id is not None:
            up_name = uniprot_client.get_gene_name(up_id)
            if up_name is not None:
                name = up_name
            db_refs['UP'] = up_id

    assert name is not None
    return Agent(name, db_refs=db_refs)
def _extract_protein(self, line):
    """Return an Agent for the protein described by ``line``, or None.

    ``line`` is indexed by 'Protein Name' and 'Protein HMS LINCS ID'.
    None is returned when the LINCS ID is empty.
    """
    prot_name = line['Protein Name']
    prot_id = line['Protein HMS LINCS ID']

    # Some lines are missing protein information altogether
    if not prot_id:
        return None

    db_refs = {}
    db_refs.update(self._lc.get_protein_refs(prot_id))

    # Only an UP ID is expected from the resource, so the gene name is used
    # both as the standard name and to look up the HGNC ID.
    up_id = db_refs.get('UP')
    gene_name = uniprot_client.get_gene_name(up_id) if up_id else None
    if gene_name:
        prot_name = gene_name
        hgnc_id = hgnc_client.get_hgnc_id(gene_name)
        if hgnc_id:
            db_refs['HGNC'] = hgnc_id

    return Agent(prot_name, db_refs=db_refs)
def _get_db_refs(bpe):
    """Collect the db_refs dict ('HGNC', 'UP', 'CHEBI') for a BioPAX entity.

    Proteins get HGNC and UniProt IDs, each completed from the other when
    only one is present (the UniProt-to-HGNC direction only for human
    entries). Small molecules get a ChEBI ID. Any other entity gets
    whichever of the three IDs can be found.
    """
    db_refs = {}

    def _store(namespace, value):
        # Only IDs that are not None are recorded
        if value is not None:
            db_refs[namespace] = value

    if _is_protein(bpe):
        hgnc_id = BiopaxProcessor._get_hgnc_id(bpe)
        uniprot_id = BiopaxProcessor._get_uniprot_id(bpe)
        # Handle missing HGNC/UP ids
        if hgnc_id and not uniprot_id:
            uniprot_id = hgnc_client.get_uniprot_id(hgnc_id)
        if uniprot_id and not hgnc_id and \
                uniprot_client.is_human(uniprot_id):
            hgnc_name = uniprot_client.get_gene_name(uniprot_id, False)
            if hgnc_name:
                hgnc_id = hgnc_client.get_hgnc_id(hgnc_name)
        _store('HGNC', hgnc_id)
        _store('UP', uniprot_id)
    elif _is_small_molecule(bpe):
        _store('CHEBI', BiopaxProcessor._get_chebi_id(bpe))
    else:
        _store('CHEBI', BiopaxProcessor._get_chebi_id(bpe))
        _store('HGNC', BiopaxProcessor._get_hgnc_id(bpe))
        _store('UP', BiopaxProcessor._get_uniprot_id(bpe))
    return db_refs
def _extract_protein(self, line):
    """Return an Agent for the protein described by ``line``, or None.

    ``line`` is indexed by 'Protein Name' and 'Protein HMS LINCS ID'.
    None is returned when the LINCS ID is empty. The agent is named after
    the HGNC entry reached from the UniProt ID if there is one, else after
    the UniProt gene name, else it keeps the name given in ``line``.
    """
    prot_name = line['Protein Name']
    prot_id = line['Protein HMS LINCS ID']

    # Some lines are missing protein information altogether
    if not prot_id:
        return None

    db_refs = {}
    db_refs.update(self._lc.get_protein_refs(prot_id))

    up_id = db_refs.get('UP')
    if up_id:
        hgnc_id = uniprot_client.get_hgnc_id(up_id)
        if hgnc_id:
            db_refs['HGNC'] = hgnc_id
            prot_name = hgnc_client.get_hgnc_name(hgnc_id)
        else:
            # No HGNC mapping: settle for the UniProt gene name
            gene_name = uniprot_client.get_gene_name(up_id)
            if gene_name:
                prot_name = gene_name

    return Agent(prot_name, db_refs=db_refs)
def fix_stmts(stmts):
    """Clean up a list of statements and return the ones worth keeping.

    For every statement the 'PMID' prefix is stripped from evidence PMIDs.
    RegulateActivity statements without a subject and Translocation
    statements without any location are dropped. Agents with a 'UP'
    grounding are renamed to the UniProt gene name and, for human entries,
    get the matching 'HGNC' ID. Statements and agents are modified in
    place.

    Parameters
    ----------
    stmts : list
        Statements to fix.

    Returns
    -------
    list
        The statements that were not dropped, in their original order.
    """
    new_stmts = []
    for stmt in stmts:
        for ev in stmt.evidence:
            if ev.pmid and ev.pmid.startswith('PMID'):
                # Fixed: `[:-4]` removed the last four characters and kept
                # the prefix; the intent is to drop the leading 'PMID'.
                ev.pmid = ev.pmid[4:]
        # Skip if no subject
        if isinstance(stmt, RegulateActivity) and stmt.subj is None:
            continue
        # Skip if no locations
        if isinstance(stmt, Translocation) and \
                not (stmt.from_location or stmt.to_location):
            continue
        for agent in stmt.agent_list():
            if agent is None:
                continue
            upid = agent.db_refs.get('UP')
            if not upid:
                continue
            gene_name = uniprot_client.get_gene_name(upid)
            if not gene_name:
                continue
            agent.name = gene_name
            # HGNC IDs are only looked up for human proteins
            if uniprot_client.is_human(upid):
                hgnc_id = hgnc_client.get_hgnc_id(gene_name)
                if hgnc_id:
                    agent.db_refs['HGNC'] = hgnc_id
        new_stmts.append(stmt)
    return new_stmts
def get_genes_for_go_ids(go_ids):
    """Return the set of gene names annotated with any of the given GO IDs.

    Rows of the module-level ``goa`` table whose 'GO_ID' is in ``go_ids``
    are selected and their 'DB_ID' values are mapped to gene names through
    UniProt. IDs without a gene name are left out.
    """
    wanted = set(go_ids)
    matching_rows = goa[goa['GO_ID'].isin(wanted)]
    found = set()
    for up_id in sorted(list(set(matching_rows['DB_ID']))):
        gene_name = uniprot_client.get_gene_name(up_id)
        if gene_name:
            found.add(gene_name)
    return found
def rename_agents(self, stmts):
    """Return a deep copy of ``stmts`` with standardized agent names.

    An agent with a 'BE' (Bioentities) grounding is named after that ID.
    Otherwise, if it has a 'UP' grounding and a gene name can be found
    (without web fallback), the gene name becomes the agent name and the
    matching 'HGNC' ID, if any, is added to its db_refs. All other agents
    keep their name.

    Parameters
    ----------
    stmts : list
        Statements whose agents should be renamed; not modified.

    Returns
    -------
    list
        The renamed deep copies.
    """
    mapped_stmts = deepcopy(stmts)
    for stmt in mapped_stmts:
        for agent in stmt.agent_list():
            if agent is None:
                continue
            be_id = agent.db_refs.get('BE')
            up_id = agent.db_refs.get('UP')
            if be_id:
                # A Bioentities ID takes precedence
                agent.name = be_id
            elif up_id:
                gene_name = uniprot_client.get_gene_name(
                    up_id, web_fallback=False)
                if gene_name:
                    agent.name = gene_name
                    hgnc_id = hgnc_client.get_hgnc_id(gene_name)
                    if hgnc_id:
                        agent.db_refs['HGNC'] = hgnc_id
                # If the lookup fails the name is left unchanged
    return mapped_stmts
def generate_adeft_terms():
    """Return Term objects for every grounding known to Adeft.

    Each available shortform's disambiguator is loaded and each of its
    groundings of the form ``NS:ID`` is turned into a Term. Entries that
    are 'ungrounded', lack a colon or use an unknown namespace are skipped
    (the latter with a warning). The result is de-duplicated and sorted by
    the normalized shortform.
    """
    from adeft import available_shortforms
    from adeft.disambiguate import load_disambiguator

    # One standard-name lookup per supported namespace. The lambdas defer
    # attribute access on the clients until a lookup is actually needed.
    name_lookups = {
        'HGNC': lambda db_id: hgnc_client.get_hgnc_name(db_id),
        'GO': lambda db_id: go_client.get_go_label(db_id),
        'MESH': lambda db_id: mesh_client.get_mesh_name(db_id),
        'CHEBI': lambda db_id: chebi_client.get_chebi_name_from_id(db_id),
        'FPLX': lambda db_id: db_id,
        'UP': lambda db_id: uniprot_client.get_gene_name(db_id),
    }

    all_term_args = set()
    for shortform in available_shortforms:
        da = load_disambiguator(shortform)
        for grounding in da.names.keys():
            if grounding == 'ungrounded' or ':' not in grounding:
                continue
            db_ns, db_id = grounding.split(':', maxsplit=1)
            lookup = name_lookups.get(db_ns)
            if lookup is None:
                logger.warning('Unknown grounding namespace from Adeft: %s'
                               % db_ns)
                continue
            standard_name = lookup(db_id)
            all_term_args.add((normalize(shortform), shortform, db_ns,
                               db_id, standard_name, 'synonym', 'adeft'))

    ordered_args = sorted(list(all_term_args), key=lambda x: x[0])
    return [Term(*term_args) for term_args in ordered_args]
def _get_complex_agents(self, complex_id):
    """Return a list of Agents, one for each member of a SIGNOR complex.

    Members with a UniProt gene name are grounded to 'UP' (and 'HGNC' when
    an ID exists) and named after the gene. All others are grounded to
    'SIGNOR' and named after their FamPlex mapping if one exists, else
    after the SIGNOR ID itself. A FamPlex mapping is always recorded under
    'FPLX'.
    """
    members = self._recursively_lookup_complex(complex_id)
    result = []
    for member in members:
        refs = {}
        agent_name = uniprot_client.get_gene_name(member)
        if agent_name is None:
            refs['SIGNOR'] = member
        else:
            refs['UP'] = member
            hgnc = hgnc_client.get_hgnc_id(agent_name)
            if hgnc:
                refs['HGNC'] = hgnc

        fplx_key = ('SIGNOR', member)
        has_fplx = fplx_key in famplex_map
        if has_fplx:
            refs['FPLX'] = famplex_map[fplx_key]
        if not agent_name:
            if has_fplx:
                # No Uniprot name available: use the Famplex one
                agent_name = refs['FPLX']
            else:
                # No Uniprot or Famplex name available: use the Signor ID
                logger.info('Have neither a Uniprot nor Famplex grounding ' +
                            'for ' + member)
                agent_name = refs['SIGNOR']

        assert (agent_name is not None)
        result.append(Agent(agent_name, db_refs=refs))
    return result
def get_agent_from_grounding(grounding, up_web_fallback=False):
    """Return an Agent for a ``namespace:id`` grounding annotation.

    Parameters
    ----------
    grounding : str
        Annotation with exactly one colon. The namespace must be
        'uniprotkb', 'refseq' or 'ddbj/embl/genbank'. A UniProt ID may
        carry a '-PRO...' feature suffix, which is stored under 'UPPRO'.
    up_web_fallback : bool
        If True and name standardization did not report success, the gene
        name of a UniProt-grounded agent is looked up with
        ``web_fallback=True``. Default: False

    Returns
    -------
    Agent
        Agent named after the raw ID unless a better name was found.
    """
    db_ns, db_id = grounding.split(':')
    # Assume UniProt or RefSeq IDs
    assert db_ns in {'uniprotkb', 'refseq', 'ddbj/embl/genbank'}, db_ns

    if db_ns == 'refseq':
        db_refs = {'REFSEQ_PROT': db_id}
    elif db_ns != 'uniprotkb':
        db_refs = {'GENBANK': db_id}
    elif '-' not in db_id:
        db_refs = {'UP': db_id}
    else:
        up_id, feat_id = db_id.split('-')
        # Assume it's a feature ID
        assert feat_id.startswith('PRO'), feat_id
        db_refs = {'UP': up_id, 'UPPRO': feat_id}

    agent = Agent(db_id, db_refs=db_refs)
    standardized = standardize_agent_name(agent)

    # Handle special case of unreviewed UP entries
    if up_web_fallback and not standardized and 'UP' in db_refs:
        name = uniprot_client.get_gene_name(db_refs['UP'],
                                            web_fallback=True)
        if name:
            agent.name = name
    return agent
def standardize_agent_name(agent, standardize_refs=True):
    """Set an Agent's name from its highest-priority grounding.

    The grounding reported by ``agent.get_grounding()`` selects the name:
    the FamPlex ID itself for 'FPLX', and the result of the matching name
    lookup for 'HGNC', 'UP', 'CHEBI', 'MESH' and 'GO'. For any other
    namespace, or when there is no grounding, the name is left unchanged.

    Parameters
    ----------
    agent : indra.statements.Agent
        Agent to rename in place. None is accepted and ignored.
    standardize_refs : Optional[bool]
        If True, ``agent.db_refs`` is first replaced by the output of
        ``GroundingMapper.standardize_db_refs``. Default: True
    """
    if agent is None:
        return

    if standardize_refs:
        agent.db_refs = GroundingMapper.standardize_db_refs(agent.db_refs)

    db_ns, db_id = agent.get_grounding()
    if not (db_ns and db_id):
        return

    if db_ns == 'FPLX':
        agent.name = agent.db_refs['FPLX']
        return
    if db_ns == 'HGNC':
        # NOTE(review): the name is assigned without a check here, so a
        # failed lookup would overwrite it; an earlier comment also said
        # that get_grounding returns an HGNC symbol -- confirm what
        # get_hgnc_name expects.
        agent.name = hgnc_client.get_hgnc_name(db_id)
        return

    # The remaining namespaces only rename when the lookup gives a name
    if db_ns == 'UP':
        new_name = uniprot_client.get_gene_name(agent.db_refs['UP'],
                                                web_fallback=False)
    elif db_ns == 'CHEBI':
        new_name = \
            chebi_client.get_chebi_name_from_id(agent.db_refs['CHEBI'])
    elif db_ns == 'MESH':
        new_name = mesh_client.get_mesh_name(agent.db_refs['MESH'], False)
    elif db_ns == 'GO':
        new_name = go_client.get_go_label(agent.db_refs['GO'])
    else:
        return
    if new_name:
        agent.name = new_name
    return
def _get_name_by_id(self, entity_id):
    """Return a valid name for the TERM element with the given ID.

    The TERM's 'dbid' attribute is a '|'-separated list of groundings.
    An HGNC grounding is preferred: with several of them, the one with the
    highest ':SCORE' in the 'lisp' attribute wins, otherwise the first one
    is used. Failing that, the first UniProt grounding is mapped to an HGNC
    name or a gene name. If none of this applies, the text of the ``name``
    child (stripped of '|') is used.

    Parameters
    ----------
    entity_id : str
        Value of the 'id' attribute of the TERM to look up.

    Returns
    -------
    The output of ``self._get_valid_name`` for the chosen name, or None if
    the TERM or its ``name`` child does not exist.
    """
    entity_term = self.tree.find("TERM/[@id='%s']" % entity_id)
    if entity_term is None:
        logger.debug('Term %s for entity not found' % entity_id)
        return None
    name = entity_term.find("name")
    if name is None:
        logger.debug('Entity without a name')
        return None
    try:
        dbid = entity_term.attrib["dbid"]
    except KeyError:
        # No grounding information: use the name text as it is
        return self._get_valid_name(name.text)

    dbids = dbid.split('|')
    hgnc_ids = [i for i in dbids if i.startswith('HGNC')]
    up_ids = [i for i in dbids if i.startswith('UP')]
    # TODO: handle protein families like 14-3-3 with IDs like
    # XFAM:PF00244.15, FA:00007
    if hgnc_ids:
        hgnc_id = None
        if len(hgnc_ids) > 1:
            lisp_str = entity_term.attrib.get('lisp')
            if lisp_str is not None:
                # Collect the score of every HGNC entry in the lisp string
                scores = {}
                for part in lisp_str.split('(TERM :ID '):
                    res = re.findall(r'HGNC::\|(.*)\|', part)
                    if res:
                        score = re.findall(':SCORE ([^ ]+)', part)[0]
                        scores[res[0]] = float(score)
                if scores:
                    sorted_ids = sorted(scores.items(),
                                        key=operator.itemgetter(1))
                    hgnc_id = sorted_ids[-1][0]
        if hgnc_id is None:
            # Single ID, no lisp attribute, or (fixed: used to leave
            # hgnc_id unbound) a lisp attribute without any scored HGNC
            # entry: take the first listed HGNC ID.
            hgnc_id = re.match(r'HGNC\:([0-9]*)', hgnc_ids[0]).groups()[0]
        hgnc_name = self._get_hgnc_name(hgnc_id)
        return self._get_valid_name(hgnc_name)
    elif up_ids:
        # Fixed: this used to test len(hgnc_ids), which is always 0 here
        if len(up_ids) > 1:
            logger.debug('%d UniProt IDs reported.' % len(up_ids))
        up_id = re.match(r'UP\:([A-Z0-9]*)', up_ids[0]).groups()[0]
        # First try to get HGNC name
        hgnc_name = up_client.get_hgnc_name(up_id)
        if hgnc_name is not None:
            return self._get_valid_name(hgnc_name)
        # Next, try to get the gene name
        gene_name = up_client.get_gene_name(up_id)
        if gene_name is not None:
            return self._get_valid_name(gene_name)
    # By default, return the text of the name tag
    name_txt = name.text.strip('|')
    return self._get_valid_name(name_txt)
def print_reading_contribs(reader_sites, psp_sites):
    """Print the sites present in ``reader_sites`` but not in ``psp_sites``.

    Both arguments are dicts keyed by 5-tuples, unpacked here as
    (ctrl_id, ctrl_ns, up_id, residue, pos). The keys found only in
    ``reader_sites`` are printed as 'ctrl -> target-residuepos', ordered by
    controller ID, UniProt ID and position. A notice is printed first when
    no gene name is found for the target, and the line is still printed.
    """
    only_in_reader = set(reader_sites.keys()) - set(psp_sites.keys())
    ordered_keys = sorted(only_in_reader,
                          key=lambda x: (x[0], x[2], x[4]))
    for site_key in ordered_keys:
        ctrl_id, ctrl_ns, up_id, residue, pos = site_key
        target_name = uniprot_client.get_gene_name(up_id,
                                                   web_fallback=False)
        if target_name is None:
            print('Could not get gene name for %s' % up_id)
        print('%s -> %s-%s%s' % (ctrl_id, target_name, residue, int(pos)))
def get_db_refs_by_ident(ns, ident, node_data):
    """Return standard name and grounding based on a namespace and an ID.

    Parameters
    ----------
    ns : str
        A name space in which the given identifier is interpreted.
    ident : str
        The identifier in the given name space to get grounding for.
    node_data : dict
        Node data; supplies the default name (under the ``pc.NAME`` key)
        and is included in log messages.

    Returns
    -------
    name : str
        The standardized name for the given entity. For 'HGNC' and 'UP'
        this is the looked-up gene name; otherwise the name from
        ``node_data``.
    db_refs : dict
        The grounding for the given entity. None if the name space is not
        handled.

    Notes
    -----
    ``(None, None)`` is returned when an 'HGNC' or 'UP' identifier cannot be
    mapped to a name.
    """
    name = node_data.get(pc.NAME)
    db_refs = None
    if ns == 'HGNC':
        name = hgnc_client.get_hgnc_name(ident)
        if not name:
            return None, None
        db_refs = {'HGNC': ident}
        up_id = _get_up_id(ident)
        if up_id:
            db_refs['UP'] = up_id
        mirbase_id = mirbase_client.get_mirbase_id_from_hgnc_id(ident)
        if mirbase_id:
            db_refs['MIRBASE'] = mirbase_id
    elif ns == 'UP':
        db_refs = {'UP': ident}
        name = uniprot_client.get_gene_name(ident)
        if not name:
            return None, None
        # HGNC IDs are only looked up for human entries
        if uniprot_client.is_human(ident):
            hgnc_id = hgnc_client.get_hgnc_id(name)
            if not hgnc_id:
                logger.info('Uniprot ID linked to invalid human gene '
                            'name %s' % name)
            else:
                db_refs['HGNC'] = hgnc_id
    elif ns == 'MIRBASE':
        db_refs = {'MIRBASE': ident}
    elif ns in ('MGI', 'RGD', 'CHEBI', 'HMDB', 'MESH'):
        # These name spaces are passed through unchanged
        db_refs = {ns: ident}
    elif ns == 'PUBCHEM.COMPOUND':
        db_refs = {'PUBCHEM': ident}
    else:
        # Fixed: the identifier used to be read as `node_data.identifier`,
        # an attribute a plain dict does not have; the `ident` argument is
        # logged instead.
        logger.info("Unhandled namespace %s with name %s and "
                    "identifier %s (%s)." % (ns, name, ident, node_data))
    return name, db_refs
def protein_map_from_twg(twg):
    """Build a grounding map from texts that exactly match a gene name.

    For every agent text, each of its UniProt groundings whose mnemonic
    ends in '_HUMAN' is checked: if the gene name of the entry equals the
    agent text (ignoring case), the text is mapped to that UniProt ID.

    Parameters
    ----------
    twg : list of tuple
        Tuples of (agent_text, grounding_list, third_item); the third item
        is ignored. ``grounding_list`` holds entries whose first two items
        are the name space and the ID.

    Returns
    -------
    protein_map : dict
        Keyed on agent text, with values
        {'TEXT': agent_text, 'UP': uniprot_id}. A later match for the same
        text replaces an earlier one.
    """
    protein_map = {}
    matched = 0
    unmatched = 0
    logger.info('Building grounding map for human proteins')
    for agent_text, grounding_list, _ in twg:
        # Texts without any UP grounding contribute nothing
        uniprot_ids = [db_id for db_ns, db_id, *_rest in
                       (tuple(entry) for entry in grounding_list)
                       if db_ns == 'UP'] \
            if 'UP' in [entry[0] for entry in grounding_list] else []
        for uniprot_id in uniprot_ids:
            # Non-human (or unknown) entries are not counted at all
            mnemonic = uniprot_client.get_mnemonic(uniprot_id)
            if mnemonic is None or not mnemonic.endswith('_HUMAN'):
                continue
            gene_name = uniprot_client.get_gene_name(uniprot_id)
            is_match = gene_name is not None and \
                agent_text.upper() == gene_name.upper()
            if is_match:
                matched += 1
                protein_map[agent_text] = {'TEXT': agent_text,
                                           'UP': uniprot_id}
            else:
                unmatched += 1
    logger.info('Exact matches for %d proteins' % matched)
    logger.info('No match (or no gene name) for %d proteins' % unmatched)
    return protein_map
def get_drug_targets(fname='drug_grounding.csv'):
    """Map drug abbreviations to the gene names of their targets.

    The header-less CSV ``fname`` is read; column 1 holds the abbreviation
    and column 6 a comma-separated list of target UniProt IDs. Each ID is
    translated to a gene name (whatever the UniProt lookup returns).
    """
    table = pandas.read_csv(fname, index_col=None, header=None)
    targets = {}
    for abbrev, upid_list in zip(table[1], table[6]):
        gene_names = []
        for up_id in upid_list.split(','):
            gene_names.append(uniprot_client.get_gene_name(up_id))
        targets[abbrev] = gene_names
    return targets
def get_drug_inhibition_stmts(drug):
    """Return Inhibition statements for a drug from ChEMBL activity data.

    Parameters
    ----------
    drug : Agent
        Agent representing the drug, with a 'CHEBI' or 'MESH' grounding
        (ChEBI is preferred when both exist).

    Returns
    -------
    stmts : list of INDRA statements
        One Inhibition per UniProt accession of every protein target for
        which at least one piece of evidence could be built. None is
        returned when the drug has neither grounding.
    """
    chebi_id = drug.db_refs.get('CHEBI')
    mesh_id = drug.db_refs.get('MESH')
    if chebi_id:
        drug_chembl_id = chebi_client.get_chembl_id(chebi_id)
    elif mesh_id:
        drug_chembl_id = get_chembl_id(mesh_id)
    else:
        logger.error('Drug missing ChEBI or MESH grounding.')
        return None
    logger.info('Drug: %s' % (drug_chembl_id))

    params = {'molecule_chembl_id': drug_chembl_id, 'limit': 10000}
    res = send_query({'query': 'activity', 'params': params})
    activities = res['activities']

    # Group the activities by target and keep the protein targets only
    targ_act_dict = activities_by_target(activities)
    protein_targets = get_protein_targets_only(list(targ_act_dict))
    protein_act_ids = {target: targ_act_dict[target]
                       for target in protein_targets}

    stmts = []
    for target_chembl_id, activity_ids in protein_act_ids.items():
        assays = [act for act in activities
                  if act['activity_id'] in activity_ids]
        components = protein_targets[target_chembl_id]['target_components']
        target_upids = [comp['accession'] for comp in components]

        evidence = []
        for assay in assays:
            ev = get_evidence(assay)
            if ev:
                evidence.append(ev)
        if not evidence:
            continue

        # Every component of the target shares the same evidence list
        for target_upid in target_upids:
            agent_name = uniprot_client.get_gene_name(target_upid)
            target_agent = Agent(agent_name, db_refs={'UP': target_upid})
            stmts.append(Inhibition(drug, target_agent, evidence=evidence))
    return stmts
def get_unannotated_antibody_genes(data):
    """Return the gene names corresponding to unannotated ABs.

    The values of the module-level ``unannotated_ab_map`` are
    comma-separated UniProt IDs; each is mapped to its gene name.

    Parameters
    ----------
    data : object
        Not used; kept so existing callers keep working.

    Returns
    -------
    list of str
        Sorted, de-duplicated gene names. IDs without a gene name are
        skipped, because a None entry cannot be sorted together with
        strings on Python 3.
    """
    all_genes = set()
    for up_id_list in unannotated_ab_map.values():
        for up_id in up_id_list.split(','):
            gene_name = uniprot_client.get_gene_name(up_id)
            if gene_name:
                all_genes.add(gene_name)
    return sorted(all_genes)
def load_brca_sites():
    """Return the set of (gene name, site) pairs listed in the BRCA dataset.

    The first row of the CSV is skipped. In each remaining row the site is
    the second '_'-separated field of the first column and the UniProt ID
    is the last column; the ID is translated to a gene name.
    """
    filename = 'sources/Merged_dataset_normalized_subset.csv'
    sites = set()
    for row in read_unicode_csv(filename, skiprows=1):
        site_info = row[0].split('_')[1]
        gene_name = uniprot_client.get_gene_name(row[-1])
        sites.add((gene_name, site_info))
    return sites
def get_drug_targets(fname='drug_grounding.csv'):
    """Return a dict from drug abbreviation to a list of target gene names.

    ``fname`` is a CSV without header row. Its column 1 gives the
    abbreviation and its column 6 the target UniProt IDs, separated by
    commas; every ID is replaced by the result of the gene name lookup.
    """
    df = pandas.read_csv(fname, index_col=None, header=None)
    return {
        abbrev: [uniprot_client.get_gene_name(up_id)
                 for up_id in up_ids.split(',')]
        for abbrev, up_ids in zip(df[1], df[6])
    }
def _get_agent(self, ent_name, ent_type, id, database):
    """Return a single Agent for a SIGNOR entity.

    Parameters
    ----------
    ent_name : str
        Name of the entity as given by SIGNOR.
    ent_type : str
        SIGNOR entity type; together with ``database`` it selects the
        grounding name space through ``_type_db_map``.
    id : str
        Identifier of the entity in ``database``.
    database : str
        Name of the database ``id`` belongs to.

    Returns
    -------
    Agent
        For a SIGNOR complex, the first constituent with the remaining
        constituents attached as bound conditions. Otherwise an Agent
        grounded according to the entity type.

    Raises
    ------
    ValueError
        If the entity has a grounding type other than 'UP' but comes from
        an unexpected database.
    """
    if database == 'SIGNOR' and id in self.complex_map:
        # Return the first agent with the remaining agents as a bound
        # condition
        agents = self._get_complex_agents(id)
        agent = agents[0]
        agent.bound_conditions = \
            [BoundCondition(a, True) for a in agents[1:]]
        return agent

    gnd_type = _type_db_map[(ent_type, database)]
    if gnd_type == 'UP':
        up_id = id
        db_refs = {'UP': up_id}
        hgnc_id = uniprot_client.get_hgnc_id(up_id)
        if hgnc_id:
            db_refs['HGNC'] = hgnc_id
            name = hgnc_client.get_hgnc_name(hgnc_id)
        else:
            name = uniprot_client.get_gene_name(up_id)
        # Never create an unnamed Agent: keep the SIGNOR name when no
        # standard name could be found
        if not name:
            name = ent_name
    # Map SIGNOR protein families to FamPlex families
    elif ent_type == 'proteinfamily':
        db_refs = {database: id}  # Keep the SIGNOR family ID in db_refs
        key = (database, id)
        # Use SIGNOR name unless we have a mapping in FamPlex
        name = ent_name
        famplex_id = famplex_map.get(key)
        if famplex_id is None:
            logger.info('Could not find %s in FamPlex map' % str(key))
        else:
            db_refs['FPLX'] = famplex_id
            name = famplex_id
    # Other possible groundings are PUBCHEM, SIGNOR, etc.
    elif gnd_type is not None:
        if database not in ('PUBCHEM', 'SIGNOR', 'ChEBI', 'miRBase',
                            'DRUGBANK'):
            raise ValueError('Unexpected database %s' % database)
        if database == 'PUBCHEM' and id.startswith('CID:'):
            # We take off the CID: prefix plus fix an issue with
            # SIGNOR's format in which it leaves extra spaces around
            # the ID, as in 'CID: 923'
            id = id[4:].strip()
        db_refs = {gnd_type: id}
        name = ent_name
    # If no grounding, include as an untyped/ungrounded node
    else:
        name = ent_name
        db_refs = {}
    return Agent(name, db_refs=db_refs)
def protein_map_from_twg(twg):
    """Derive grounding-map entries for texts that equal a human gene name.

    Only UniProt ('UP') groundings are looked at. An entry counts as human
    when its mnemonic ends in '_HUMAN'. If the gene name of such an entry
    is the same as the agent text, compared case-insensitively, the text is
    added to the returned map.

    Parameters
    ----------
    twg : list of tuple
        Three-item tuples whose first item is the agent text and whose
        second item is the list of groundings, each starting with the name
        space followed by the ID.

    Returns
    -------
    protein_map : dict
        Maps an agent text to {'TEXT': agent_text, 'UP': uniprot_id}.
    """
    protein_map = {}
    counts = {'matched': 0, 'unmatched': 0}
    logger.info('Building grounding map for human proteins')

    for agent_text, grounding_list, _ in twg:
        namespaces = [entry[0] for entry in grounding_list]
        if 'UP' not in namespaces:
            continue
        for entry in grounding_list:
            if entry[0] != 'UP':
                continue
            uniprot_id = entry[1]
            # Skip anything that is not known to be a human protein
            mnemonic = uniprot_client.get_mnemonic(uniprot_id)
            if mnemonic is None or not mnemonic.endswith('_HUMAN'):
                continue
            # Compare the gene name with the agent text
            gene_name = uniprot_client.get_gene_name(uniprot_id)
            if gene_name is None:
                counts['unmatched'] += 1
            elif agent_text.upper() != gene_name.upper():
                counts['unmatched'] += 1
            else:
                counts['matched'] += 1
                protein_map[agent_text] = {'TEXT': agent_text,
                                           'UP': uniprot_id}

    logger.info('Exact matches for %d proteins' % counts['matched'])
    logger.info('No match (or no gene name) for %d proteins' %
                counts['unmatched'])
    return protein_map
def get_drug_targets(fname=None):
    """Return the target gene names of each drug abbreviation.

    Parameters
    ----------
    fname : str, optional
        UTF-8 encoded CSV without header; column 1 is the abbreviation and
        column 6 the comma-separated target UniProt IDs. When empty or
        None, the module-level ``drug_grounding_file`` is read.

    Returns
    -------
    dict
        Abbreviation -> list with one gene name lookup result per ID.
    """
    path = fname if fname else drug_grounding_file
    grounding = pandas.read_csv(path, index_col=None, header=None,
                                encoding='utf-8')
    targets = {}
    for abbrev, up_id_field in zip(grounding[1], grounding[6]):
        up_ids = up_id_field.split(',')
        targets[abbrev] = [uniprot_client.get_gene_name(up_id)
                           for up_id in up_ids]
    return targets
def _fix_agent(agent):
    """Normalize the name spaces, grounding and name of an Agent in place.

    The name spaces 'FA', 'IPR' and 'XFAM' are renamed to 'NXPFA', 'IP' and
    'PF' (the XFAM ID loses everything after the first '.'). A Bioentities
    ('BE') grounding is looked up from the other groundings if missing, and
    an 'NCIT' grounding may be mapped to another name space through
    ``ncit_map``. The name is then taken from BE, else HGNC, else UniProt,
    and the HGNC/UP entries complete each other where possible.
    None is accepted and ignored.
    """
    if agent is None:
        return

    # First we fix some name spaces
    renamed_ns = {'FA': 'NXPFA', 'IPR': 'IP', 'XFAM': 'PF'}
    db_refs_tmp = copy(agent.db_refs)
    for db_ns, db_id in agent.db_refs.items():
        new_ns = renamed_ns.get(db_ns)
        if new_ns is None:
            continue
        db_refs_tmp.pop(db_ns, None)
        # Only the XFAM ID is shortened to the part before the first dot
        db_refs_tmp[new_ns] = db_id.split('.')[0] if db_ns == 'XFAM' \
            else db_id
    agent.db_refs = db_refs_tmp

    # Use the BE entry if there is one, else try to map to BE from the
    # other entries (NXP, IPR, PF, NCIT)
    be_id = agent.db_refs.get('BE')
    if not be_id:
        for db_ns, db_id in agent.db_refs.items():
            be_id = bioentities_map.get((db_ns, db_id))
            if be_id:
                break

    # Try mapping NCIT to specific genes if possible
    if not be_id and 'NCIT' in agent.db_refs:
        target = ncit_map.get(agent.db_refs['NCIT'])
        if target:
            target_ns, target_id = target[0], target[1]
            agent.db_refs[target_ns] = target_id

    up_id = agent.db_refs.get('UP')
    hgnc_id = agent.db_refs.get('HGNC')

    # BE takes precedence if we have it
    if be_id:
        agent.db_refs['BE'] = be_id
        agent.name = be_id
        return
    if hgnc_id:
        gene_name = hgnc_client.get_hgnc_name(hgnc_id)
        if gene_name:
            agent.name = gene_name
        if not up_id:
            up_id = hgnc_client.get_uniprot_id(hgnc_id)
            if up_id:
                agent.db_refs['UP'] = up_id
        return
    if up_id:
        gene_name = uniprot_client.get_gene_name(up_id)
        if gene_name:
            agent.name = gene_name
            hgnc_id = hgnc_client.get_hgnc_id(gene_name)
            if hgnc_id:
                agent.db_refs['HGNC'] = hgnc_id
def rename_agents(self, stmts):
    """Return copies of the given statements with standardized agent names.

    The input list is deep-copied, so the original statements are never
    touched. In the copies, an agent with a FamPlex ('FPLX') grounding is
    named after that ID. Otherwise, an agent with a Uniprot ('UP')
    grounding is named after the associated gene name if one is found
    (without web fallback), and the HGNC ID belonging to that gene name is
    added to its db_refs. Every other agent keeps its original name.

    Parameters
    ----------
    stmts : list of :py:class:`indra.statements.Statement`
        List of statements whose Agents need their names updated.

    Returns
    -------
    mapped_stmts : list of :py:class:`indra.statements.Statement`
        A new list of Statements with updated Agent names
    """
    mapped_stmts = deepcopy(stmts)
    agents = (agent for stmt in mapped_stmts
              for agent in stmt.agent_list() if agent is not None)
    for agent in agents:
        # If there's a FamPlex ID, prefer that for the name
        if agent.db_refs.get('FPLX'):
            agent.name = agent.db_refs.get('FPLX')
            continue
        # Without a Uniprot ID there is nothing else to go by
        if not agent.db_refs.get('UP'):
            continue
        gene_name = uniprot_client.get_gene_name(agent.db_refs.get('UP'),
                                                 web_fallback=False)
        if not gene_name:
            continue
        agent.name = gene_name
        hgnc_id = hgnc_client.get_hgnc_id(gene_name)
        if hgnc_id:
            agent.db_refs['HGNC'] = hgnc_id
    return mapped_stmts
def _get_agent(self, ent_name, ent_type, id, database):
    """Return a single Agent for an entity described by name, type and ID.

    If the ID is a SIGNOR complex listed in `self.complex_map`, the first
    Agent returned by `self._get_complex_agents` is returned with the
    remaining ones attached to it as bound conditions.

    Parameters
    ----------
    ent_name : str
        The entity name, used as the Agent name unless a better one is found.
    ent_type : str
        The entity type, used with `database` to look up the grounding type.
    id : str
        The ID of the entity in `database`. (The name shadows the builtin
        but is kept since it is part of the existing signature.)
    database : str
        The name of the database that `id` belongs to.

    Returns
    -------
    Agent
        The Agent constructed for the entity.

    Raises
    ------
    ValueError
        If the grounding type is not UP, the entity is not a protein family,
        and the database is not one of PUBCHEM, SIGNOR, ChEBI or miRBase.
    KeyError
        If (ent_type, database) is not a key of `_type_db_map`.
    """
    if database == 'SIGNOR' and id in self.complex_map:
        agents = self._get_complex_agents(id)
        # Return the first agent with the remaining agents as a bound
        # condition
        agent = agents[0]
        agent.bound_conditions = \
            [BoundCondition(a, True) for a in agents[1:]]
        return agent

    gnd_type = _type_db_map[(ent_type, database)]
    if gnd_type == 'UP':
        db_refs = {'UP': id}
        name = uniprot_client.get_gene_name(id)
        if name:
            hgnc_id = hgnc_client.get_hgnc_id(name)
            if hgnc_id:
                db_refs['HGNC'] = hgnc_id
        else:
            # No gene name could be found for this UniProt ID: keep the
            # name given for the entity instead of creating an Agent
            # without a name
            name = ent_name
    # Map SIGNOR protein families to FamPlex families
    elif ent_type == 'proteinfamily':
        db_refs = {database: id}  # Keep the SIGNOR family ID in db_refs
        key = (database, id)
        # Use SIGNOR name unless we have a mapping in FamPlex
        name = ent_name
        famplex_id = famplex_map.get(key)
        if famplex_id is None:
            logger.info('Could not find %s in FamPlex map' % str(key))
        else:
            db_refs['FPLX'] = famplex_id
            name = famplex_id
    # Other possible groundings are PUBCHEM, SIGNOR, etc.
    elif gnd_type is not None:
        if database not in ('PUBCHEM', 'SIGNOR', 'ChEBI', 'miRBase'):
            raise ValueError('Unexpected database %s' % database)
        if database == 'PUBCHEM' and id.startswith('CID:'):
            # We take off the CID: prefix plus fix an issue with
            # SIGNOR's format in which it leaves extra spaces around
            # the ID, as in 'CID: 923'
            id = id[4:].strip()
        db_refs = {gnd_type: id}
        name = ent_name
    # If no grounding, include as an untyped/ungrounded node
    else:
        name = ent_name
        db_refs = {}
    return Agent(name, db_refs=db_refs)
def fix_protein_grounding(agent):
    """Standardize the db_refs of a protein Agent in place.

    All db_refs keys are upper-cased, a TEXT entry is added from the Agent
    name if missing, and any UP entry is cut at the first '-'. If a gene
    name and an HGNC ID can be found for the UP entry, the Agent is renamed
    to the gene name and the HGNC ID is added to db_refs.

    Parameters
    ----------
    agent : Agent
        The Agent to fix; it is modified in place.
    """
    # Build the upper-cased mapping first and only then update the dict:
    # popping and adding keys while iterating over the same dict is not
    # safe. The dict object itself is kept so other references stay valid.
    upper_refs = {k.upper(): v for k, v in agent.db_refs.items()}
    agent.db_refs.clear()
    agent.db_refs.update(upper_refs)
    if not agent.db_refs.get('TEXT'):
        agent.db_refs['TEXT'] = agent.name
    up_id = agent.db_refs.get('UP')
    if not up_id:
        return
    # Keep only the part of the ID before the first '-'
    up_id = up_id.split('-')[0]
    agent.db_refs['UP'] = up_id
    hgnc_symbol = uniprot_client.get_gene_name(up_id)
    if not hgnc_symbol:
        # Without a gene name there is nothing to look up in HGNC
        return
    hgnc_id = hgnc_client.get_hgnc_id(hgnc_symbol)
    if hgnc_id:
        agent.name = hgnc_symbol
        agent.db_refs['HGNC'] = hgnc_id
def get_genes_for_go_ids(go_ids, goa):
    """Return the gene names annotated with the given GO IDs or their children.

    Rows of `goa` whose GO_ID is one of the given IDs, or one of the children
    reported for them by `bio_ontology`, are selected, and the gene name is
    looked up for each distinct DB_ID among them. IDs for which no gene name
    is found are left out.

    Parameters
    ----------
    go_ids : iterable of str
        The GO IDs to collect genes for.
    goa : pandas.DataFrame
        A table with at least GO_ID and DB_ID columns.

    Returns
    -------
    set
        The set of gene names found.
    """
    relevant_go_ids = set()
    for go_id in go_ids:
        children = bio_ontology.get_children('GO', go_id)
        relevant_go_ids.add(go_id)
        relevant_go_ids.update({child[1] for child in children})
    selected = goa[goa['GO_ID'].isin(relevant_go_ids)]
    unique_up_ids = sorted(set(selected['DB_ID']))
    found_names = set()
    for up_id in unique_up_ids:
        gene_name = uniprot_client.get_gene_name(up_id)
        if gene_name:
            found_names.add(gene_name)
    return found_names
def _agent_from_id(db_id):
    """Return an Agent for an Ensembl protein ID or a UniProt ID.

    IDs starting with ENSP are kept as ENSEMBL groundings and are also used
    as the Agent name. Every other ID is treated as a UniProt ID: None is
    returned if no gene name is found for it, otherwise the Agent is named
    after the gene and gets a UP grounding plus an HGNC one if available.
    """
    # There are some Ensembl protein IDs which we currently can't normalize
    # to anything else (unlike ENSG).
    if db_id.startswith('ENSP'):
        return Agent(db_id, db_refs={'ENSEMBL': db_id})

    # All other entries are UniProt IDs
    gene_name = uniprot_client.get_gene_name(db_id)
    if not gene_name:
        return None
    refs = {'UP': db_id}
    hgnc_id = uniprot_client.get_hgnc_id(db_id)
    if hgnc_id:
        refs['HGNC'] = hgnc_id
    return Agent(gene_name, db_refs=refs)
def get_name(bpe):
    """Return a name for a BioPAX element.

    The HGNC name is used if the element has an HGNC ID, otherwise the gene
    name for its UniProt ID if it has one. The element's display name is
    returned when neither ID is available or the lookup gives None.
    """
    # FIXME Deal with case when HGNC entry is not name
    # Deal with case when multiple Uniprot IDs marked as
    # primary
    hgnc_id = BiopaxProcessor._get_hgnc_id(bpe)
    uniprot_id = BiopaxProcessor._get_uniprot_id(bpe)
    resolved = None
    if hgnc_id is not None:
        resolved = BiopaxProcessor._get_hgnc_name(hgnc_id)
    elif uniprot_id is not None:
        resolved = uniprot_client.get_gene_name(uniprot_id)
    # Fall back to the display name whenever no name was resolved
    if resolved is None:
        resolved = bpe.getDisplayName()
    return resolved
def get_antibody_map(data):
    """Return a dict mapping antibody names to lists of target Agents.

    Antibodies that appear in the map returned by
    `get_phospho_antibody_map` are not processed here; the entries of that
    map are added to the result as they are.

    Parameters
    ----------
    data : dict
        A dict whose 'antibody' entry is a table with 'Protein Data ID' and
        'UniProt ID' columns, the latter holding comma-separated IDs.

    Returns
    -------
    dict
        A dict with antibody names as keys and lists of Agents as values.
    """
    phos_ab_map = get_phospho_antibody_map()
    ab_map = {}
    for _, row in data['antibody'].iterrows():
        ab_name = row['Protein Data ID']
        if ab_name in phos_ab_map:
            continue
        for upid in row['UniProt ID'].split(','):
            # The comma-separated entries can have spaces around them
            upid = upid.strip()
            if not upid:
                continue
            # NOTE(review): hgnc_symbol can presumably be None if no gene
            # name is known for the ID, in which case the Agent gets no
            # name - confirm whether such targets should be skipped.
            hgnc_symbol = uniprot_client.get_gene_name(upid)
            db_refs = {'UP': upid}
            hgnc_id = (hgnc_client.get_hgnc_id(hgnc_symbol)
                       if hgnc_symbol else None)
            # Only add an HGNC grounding if there actually is one
            if hgnc_id:
                db_refs['HGNC'] = hgnc_id
            target = Agent(hgnc_symbol, db_refs=db_refs)
            ab_map.setdefault(ab_name, []).append(target)
    ab_map.update(phos_ab_map)
    return ab_map
def get_all_enzymes():
    """Return the gene names of human enzymes other than kinases/phosphatases.

    The nodes of the ec-code OBO file under the user's home directory whose
    name starts with 'uniprot' are collected, limited to human UniProt IDs,
    mapped to gene names, and filtered to remove kinases and phosphatases.

    Returns
    -------
    set of str
        The set of remaining gene names.
    """
    obo_path = os.path.join(str(Path.home()), '.obo/ec-code/ec-code.obo')
    if not os.path.exists(obo_path):
        # Called only for its side effect; presumably this makes pyobo
        # fetch the ec-code resource and create the OBO file - TODO confirm
        _ = pyobo.get_id_name_mapping('ec-code')
    obo = obonet.read_obo(obo_path)
    # Strip the 8-character prefix from the node names
    up_nodes = {node[8:] for node in obo.nodes
                if node.startswith('uniprot')}
    human_ups = {u for u in up_nodes if uniprot_client.is_human(u)}
    gene_names = {uniprot_client.get_gene_name(u) for u in human_ups}
    # IDs for which no gene name is found are dropped so that the result
    # only contains actual gene names
    enzymes = {g for g in gene_names
               if g and not hgnc_client.is_kinase(g)
               and not hgnc_client.is_phosphatase(g)}
    logger.info(f'Filtered {len(enzymes)} enzymes in total')
    return enzymes
def get_all_gene_names(data, out_file='prior_genes.txt'):
    """Return all gene names corresponding to all ABs.

    Gene names are taken from the 'Gene Name' column of the antibody table
    and derived from its 'UniProt ID' column. Names without an HGNC ID are
    reported and removed. Genes of unannotated antibodies, drug target
    genes and RAS pathway genes are then added, and the sorted list is
    written to `out_file`, one gene per line.

    Parameters
    ----------
    data : dict
        A dict whose 'antibody' entry is a table with 'Protein Data ID',
        'Gene Name' and 'UniProt ID' columns.
    out_file : str
        Path of the file the gene names are written to.

    Returns
    -------
    list of str
        The sorted list of unique gene names.
    """
    filt = pandas.notnull(data['antibody']['Protein Data ID'])
    data_filt = data['antibody'][filt]
    gene_names = data_filt['Gene Name']
    uniprot_ids = data_filt['UniProt ID']
    all_genes = set()
    invalid_genes = set()
    for gn, upid in zip(gene_names, uniprot_ids):
        # Some entries are lists of genes separated by commas
        # and we also strip off extra spaces
        names = [x.strip() for x in gn.split(',')]
        ids = [x.strip() for x in upid.split(',')]
        # The gene name lookup can come back empty; such results are dropped
        # since they would break the join and the sorting below
        names_from_ids = [
            name for name in (uniprot_client.get_gene_name(x) for x in ids)
            if name
        ]
        # Find invalid gene names
        for name in names:
            if not hgnc_client.get_hgnc_id(name):
                print('Invalid or deprecated gene symbol: %s' % name)
                invalid_genes.add(name)
        # Find inconsistent gene names and UniProt IDs
        if set(names) != set(names_from_ids):
            print('Inconsistent entries:')
            print('- Given gene names: %s' % ','.join(names))
            print('- Genes from uniprot IDs: %s' % ','.join(names_from_ids))
        # Add both the gene names and the gene names derived from UniProt IDs
        all_genes.update(names, names_from_ids)
    # Finally remove the invalid gene names
    all_genes = list(all_genes.difference(invalid_genes))
    # Add the unannotated genes
    unannotated_ab_genes = get_unannotated_antibody_genes(data)
    all_genes += unannotated_ab_genes
    # Add drug target genes
    drug_targets = get_drug_targets()
    for targets in drug_targets.values():
        all_genes += targets
    # Add other important genes, for now, the RAS pathway
    all_genes += get_ras227_genes()
    all_genes = sorted(set(all_genes))
    print('%d genes in total' % len(all_genes))
    with open(out_file, 'wb') as fh:
        for gene in all_genes:
            fh.write(('%s\n' % gene).encode('utf-8'))
    return all_genes
def _initialize_node_agents(self):
    """Initialize internal dicts containing node information.

    For each node, an entry is added to `self._node_names` and, unless the
    node is ungrounded and `self.require_grounding` is set, an Agent is
    added to `self._node_agents`. Nodes with a UP alias are named after the
    corresponding gene; all others are looked up by name in HGNC.
    """
    nodes = _get_dict_from_list('nodes', self.cx)
    invalid_genes = []
    for node in nodes:
        node_id = node['@id']
        cx_db_refs = self.get_aliases(node)
        node_name = node['n']
        up_id = cx_db_refs.get('UP')
        if up_id:
            db_refs = {'UP': up_id, 'TEXT': node_name}
            hgnc_id = uniprot_client.get_hgnc_id(up_id)
            if hgnc_id:
                db_refs['HGNC'] = hgnc_id
                gene_name = hgnc_client.get_hgnc_name(hgnc_id)
            else:
                gene_name = uniprot_client.get_gene_name(up_id)
            # If neither lookup gave a name, keep the node's own name
            # rather than creating an Agent without a name
            if not gene_name:
                gene_name = node_name
            self._node_names[node_id] = gene_name
            self._node_agents[node_id] = Agent(gene_name, db_refs=db_refs)
            continue

        self._node_names[node_id] = node_name
        hgnc_id = hgnc_client.get_hgnc_id(node_name)
        db_refs = {'TEXT': node_name}
        if not hgnc_id:
            # Ungrounded nodes only get an Agent if grounding is optional
            if not self.require_grounding:
                self._node_agents[node_id] = \
                    Agent(node_name, db_refs=db_refs)
            invalid_genes.append(node_name)
        else:
            db_refs['HGNC'] = hgnc_id
            up_id = hgnc_client.get_uniprot_id(hgnc_id)
            # It's possible that a valid HGNC ID will not have a
            # Uniprot ID, as in the case of HOTAIR (HOX transcript
            # antisense RNA, HGNC:33510)
            if up_id:
                db_refs['UP'] = up_id
            self._node_agents[node_id] = Agent(node_name, db_refs=db_refs)
    if invalid_genes:
        verb = 'Skipped' if self.require_grounding else 'Included'
        logger.info('%s invalid gene symbols: %s' %
                    (verb, ', '.join(invalid_genes)))
def _initialize_node_agents(self):
    """Initialize internal dicts containing node information.

    For each node, an entry is added to `self._node_names` and, unless the
    node is ungrounded and `self.require_grounding` is set, an Agent is
    added to `self._node_agents`. Nodes with a UP alias are named after the
    gene name found for it; all others are looked up by name in HGNC.
    """
    nodes = _get_dict_from_list('nodes', self.cx)
    invalid_genes = []
    for node in nodes:
        node_id = node['@id']
        cx_db_refs = self.get_aliases(node)
        up_id = cx_db_refs.get('UP')
        if up_id:
            gene_name = uniprot_client.get_gene_name(up_id)
            hgnc_id = None
            if gene_name:
                hgnc_id = hgnc_client.get_hgnc_id(gene_name)
            else:
                # No gene name was found for the UniProt ID: use the node's
                # own name, if it has one, instead of an empty name
                gene_name = node.get('n')
            db_refs = {'UP': up_id, 'TEXT': gene_name}
            # Only add an HGNC grounding if there actually is one
            if hgnc_id:
                db_refs['HGNC'] = hgnc_id
            self._node_names[node_id] = gene_name
            self._node_agents[node_id] = Agent(gene_name, db_refs=db_refs)
            continue

        node_name = node['n']
        self._node_names[node_id] = node_name
        hgnc_id = hgnc_client.get_hgnc_id(node_name)
        db_refs = {'TEXT': node_name}
        if not hgnc_id:
            # Ungrounded nodes only get an Agent if grounding is optional
            if not self.require_grounding:
                self._node_agents[node_id] = \
                    Agent(node_name, db_refs=db_refs)
            invalid_genes.append(node_name)
        else:
            db_refs['HGNC'] = hgnc_id
            up_id = hgnc_client.get_uniprot_id(hgnc_id)
            # It's possible that a valid HGNC ID will not have a
            # Uniprot ID, as in the case of HOTAIR (HOX transcript
            # antisense RNA, HGNC:33510)
            if up_id:
                db_refs['UP'] = up_id
            self._node_agents[node_id] = Agent(node_name, db_refs=db_refs)
    if invalid_genes:
        verb = 'Skipped' if self.require_grounding else 'Included'
        logger.info('%s invalid gene symbols: %s' %
                    (verb, ', '.join(invalid_genes)))
def get_grounding(self):
    """Return a (name space, ID) tuple with the preferred grounding.

    The entries of `self.db_refs` are checked in the order FPLX, HGNC, UP.
    If an HGNC or UP entry is a list, its first element is used.

    Returns
    -------
    tuple
        ('FPLX', <ID>) if there is an FPLX entry. Otherwise ('HGNC', <name>),
        where the name is looked up from the HGNC entry or, if
        `upc.is_human` holds for the UP entry, is the gene name found for
        it. In this layout, ('UP', <ID>) is returned for a UP entry for
        which `upc.is_human` does not hold, and (None, None) in all other
        cases.
    """
    # The database clients are imported inside the method
    import indra.databases.hgnc_client as hgc
    import indra.databases.uniprot_client as upc
    be = self.db_refs.get('FPLX')
    if be:
        return ('FPLX', be)
    hgnc = self.db_refs.get('HGNC')
    if hgnc:
        if isinstance(hgnc, list):
            hgnc = hgnc[0]
        # NOTE: the second element here is the name looked up for the HGNC
        # entry, not the entry itself
        return ('HGNC', hgc.get_hgnc_name(str(hgnc)))
    up = self.db_refs.get('UP')
    if up:
        if isinstance(up, list):
            up = up[0]
        if upc.is_human(up):
            gene_name = upc.get_gene_name(up, web_fallback=False)
            if gene_name:
                return ('HGNC', gene_name)
            # With no gene name found, this falls through to (None, None)
        # NOTE(review): this else is assumed to pair with the is_human
        # check rather than with the gene_name check - confirm against the
        # original layout
        else:
            return ('UP', up)
    return (None, None)
def _get_name_by_id(self, entity_id): entity_term = self.tree.find("TERM/[@id='%s']" % entity_id) name = entity_term.find("name") if name is None: warnings.warn('Entity without a name') return '' try: dbid = entity_term.attrib["dbid"] except: warnings.warn('No grounding information for %s' % name.text) return self._get_valid_component_name(name.text) dbids = dbid.split('|') hgnc_ids = [i for i in dbids if i.startswith('HGNC')] up_ids = [i for i in dbids if i.startswith('UP')] #TODO: handle protein families like 14-3-3 with IDs like # XFAM:PF00244.15, FA:00007 if hgnc_ids: if len(hgnc_ids) > 1: warnings.warn('%d HGNC IDs reported.' % len(hgnc_ids)) hgnc_id = re.match(r'HGNC\:([0-9]*)', hgnc_ids[0]).groups()[0] hgnc_name = self._get_hgnc_name(hgnc_id) return self._get_valid_component_name(hgnc_name) elif up_ids: if len(hgnc_ids) > 1: warnings.warn('%d UniProt IDs reported.' % len(up_ids)) up_id = re.match(r'UP\:([A-Z0-9]*)', up_ids[0]).groups()[0] up_rdf = up_client.query_protein(up_id) # First try to get HGNC name hgnc_name = up_client.get_hgnc_name(up_rdf) if hgnc_name is not None: return self._get_valid_component_name(hgnc_name) # Next, try to get the gene name gene_name = up_client.get_gene_name(up_rdf) if gene_name is not None: return self._get_valid_component_name(gene_name) # By default, return the text of the name tag name_txt = name.text.strip('|') return self._get_valid_component_name(name_txt)
def _get_element_name(bpe):
    """Return a name for a BioPAX element.

    For proteins, the HGNC name is used if there is an HGNC ID, otherwise
    the gene name for the UniProt ID if there is one, falling back to the
    display name. All other elements get their display name; a message is
    logged for those that are neither small molecules nor physical entities.
    """
    if not _is_protein(bpe):
        if not (_is_small_molecule(bpe) or _is_physical_entity(bpe)):
            logger.info('Unhandled entity type %s' %
                        bpe.getModelInterface().getName())
        return bpe.getDisplayName()

    hgnc_id = BiopaxProcessor._get_hgnc_id(bpe)
    uniprot_id = BiopaxProcessor._get_uniprot_id(bpe)
    resolved = None
    if hgnc_id is not None:
        resolved = BiopaxProcessor._get_hgnc_name(hgnc_id)
    elif uniprot_id is not None:
        resolved = uniprot_client.get_gene_name(uniprot_id)
    # Use the display name whenever no name could be resolved
    if resolved is None:
        resolved = bpe.getDisplayName()
    return resolved
def _get_agent_from_ref(self, ref):
    """Return an Agent constructed from a ref element, or None.

    The name of the Agent defaults to the text of the element's raw-text
    variable, and is replaced by a standard name if the uid variable has
    the form <name space>:<ID> and can be mapped. The db_refs of the Agent
    are filled in based on the name space of the uid.

    Parameters
    ----------
    ref : element
        An element with optional var children named uid, name and raw-text.

    Returns
    -------
    Agent or None
        The Agent, or None if the element's category is 'collection' or no
        name could be determined for the Agent.
    """
    # TODO: handle collections
    if ref.attrib.get('category') == 'collection':
        #logger.warning('Skipping collection Agent.')
        return None

    # Find the name, uid and raw-text tags first and get their text
    # content if available
    uid_tag = ref.find("var/[@name='uid']")
    name_tag = ref.find("var/[@name='name']")
    text_tag = ref.find("var/[@name='raw-text']")
    # NOTE(review): name is assigned here but not read anywhere below
    if name_tag is not None and name_tag.text:
        name = name_tag.text
    else:
        name = None
    if uid_tag is not None and uid_tag.text:
        uid = uid_tag.text
    else:
        uid = None
    if text_tag is not None and text_tag.text:
        raw_text = text_tag.text
    else:
        raw_text = None

    # TODO: factor this out and reuse fix_agents
    db_refs = {}
    # Save raw text if available
    if raw_text:
        db_refs['TEXT'] = raw_text
    # NOTE(review): this assignment is assumed to be unconditional, since
    # agent_name is read below even when there is no raw text - confirm
    # against the original layout
    agent_name = raw_text
    # If we have a proper UID then we try to reconstruct an Agent from that
    if uid is not None and len(uid.split(':')) == 2:
        db_ns, db_id = uid.split(':')
        # A FamPlex mapping of the UID takes priority over everything else
        be_id = famplex_map.get((db_ns, db_id))
        if be_id:
            db_refs[db_ns] = db_id
            db_refs['FPLX'] = be_id
            agent_name = be_id
        elif db_ns in ['UP', 'Uniprot']:
            db_refs['UP'] = db_id
            gene_name = uniprot_client.get_gene_name(db_id)
            if gene_name:
                agent_name = gene_name
                hgnc_id = hgnc_client.get_hgnc_id(gene_name)
                if hgnc_id:
                    db_refs['HGNC'] = hgnc_id
        elif db_ns == 'NCIT':
            db_refs['NCIT'] = db_id
            # target is indexed as a (name space, ID) pair below
            target = ncit_map.get(db_id)
            if target:
                db_refs[target[0]] = target[1]
                if target[0] == 'HGNC':
                    up_id = hgnc_client.get_uniprot_id(target[1])
                    agent_name = hgnc_client.get_hgnc_name(target[1])
                    if up_id:
                        db_refs['UP'] = up_id
                elif target[0] == 'UP':
                    agent_name = uniprot_client.get_gene_name(target[1])
                    if agent_name:
                        hgnc_id = hgnc_client.get_hgnc_id(agent_name)
                        if hgnc_id:
                            db_refs['HGNC'] = hgnc_id
        elif db_ns == 'FA':
            db_refs['NXP'] = 'FA:' + db_id
        elif db_ns == 'XFAM':
            # Only the part of the ID before the first '.' is kept
            db_refs['PF'] = db_id.split('.')[0]
        elif db_ns == 'CHEBI':
            db_refs['CHEBI'] = 'CHEBI:' + db_id
        elif db_ns in ['GO', 'MESH', 'FPLX']:
            db_refs[db_ns] = db_id
        # Handle old BE mappings and add them as FPLX
        elif db_ns == 'BE':
            db_refs['FPLX'] = db_id
        elif db_ns in ['PR', 'CO', 'CVCL', 'EFO', 'ORPHANET']:
            db_refs[db_ns] = db_id
        else:
            logger.warning('Unknown database name space %s' % db_ns)
    # The name lookups above can leave agent_name empty, in which case
    # the raw text is used again if there is any
    if not agent_name:
        if raw_text is not None:
            agent_name = raw_text
        else:
            return None

    assert(agent_name)
    agent = Agent(agent_name, db_refs=db_refs)
    return agent
def test_get_gene_name_nonhuman():
    """Check the gene name returned for the UniProt ID P31938."""
    symbol = uniprot_client.get_gene_name('P31938')
    assert symbol == 'Map2k1'
    assert unicode_strs(symbol)
def test_get_gene_name_human():
    """Check the gene name returned for the UniProt ID P00533."""
    symbol = uniprot_client.get_gene_name('P00533')
    assert symbol == 'EGFR'
    assert unicode_strs(symbol)
def map_agents(self, stmts, do_rename=True):
    """Return a new list of statements with Agents mapped via the grounding map.

    Each statement is deep-copied and the db_refs of each of its Agents
    whose TEXT entry is a key of `self.gm` are replaced by a copy of the
    corresponding entry of the map, with HGNC and UP entries standardized
    and cross-checked. Statements with at least one Agent whose text maps
    to None are left out of the result.

    Parameters
    ----------
    stmts : list
        The statements to map. The objects in the list are not modified.
    do_rename : bool
        If True, mapped Agents are also renamed: to the BE entry if there is
        one, otherwise to the HGNC symbol from the map or the gene name
        found for the UP entry.

    Returns
    -------
    mapped_stmts : list
        The list of mapped copies of the statements that were not skipped.

    Raises
    ------
    ValueError
        If an HGNC symbol in the grounding map has no HGNC ID, if no gene
        name is found for a UP entry given together with an HGNC symbol,
        or if that gene name differs from the HGNC symbol.
    """
    # Make a copy of the stmts
    mapped_stmts = []
    num_skipped = 0
    # Iterate over the statements
    for stmt in stmts:
        mapped_stmt = deepcopy(stmt)
        # Iterate over the agents
        skip_stmt = False
        for agent in mapped_stmt.agent_list():
            if agent is None or agent.db_refs.get('TEXT') is None:
                continue
            agent_text = agent.db_refs.get('TEXT')
            # Look this string up in the grounding map
            # If not in the map, leave agent alone and continue
            try:
                map_db_refs = self.gm[agent_text]
            except KeyError:
                continue
            # If it's in the map but it maps to None, then filter out
            # this statement by skipping it
            if map_db_refs is None:
                # Increase counter if this statement has not already
                # been skipped via another agent
                if not skip_stmt:
                    num_skipped += 1
                logger.debug("Skipping %s" % agent_text)
                skip_stmt = True
            # If it has a value that's not None, map it and add it
            else:
                # Otherwise, update the agent's db_refs field
                gene_name = None
                # Work on a copy so that the entry in the grounding map
                # itself is not changed by the updates below
                map_db_refs = deepcopy(self.gm.get(agent_text))
                up_id = map_db_refs.get('UP')
                # The HGNC entry of the map is handled as a gene symbol
                # here; it is replaced by an HGNC ID below
                hgnc_sym = map_db_refs.get('HGNC')
                if up_id and not hgnc_sym:
                    # The second positional argument is False here, unlike
                    # in the other call to get_gene_name below
                    gene_name = uniprot_client.get_gene_name(up_id, False)
                    if gene_name:
                        hgnc_id = hgnc_client.get_hgnc_id(gene_name)
                        if hgnc_id:
                            map_db_refs['HGNC'] = hgnc_id
                elif hgnc_sym and not up_id:
                    # Override the HGNC symbol entry from the grounding
                    # map with an HGNC ID
                    hgnc_id = hgnc_client.get_hgnc_id(hgnc_sym)
                    if hgnc_id:
                        map_db_refs['HGNC'] = hgnc_id
                        # Now get the Uniprot ID for the gene
                        up_id = hgnc_client.get_uniprot_id(hgnc_id)
                        if up_id:
                            map_db_refs['UP'] = up_id
                    # If there's no HGNC ID for this symbol, raise an
                    # Exception
                    else:
                        raise ValueError('No HGNC ID corresponding to gene '
                                         'symbol %s in grounding map.' %
                                         hgnc_sym)
                # If we have both, check the gene symbol ID against the
                # mapping from Uniprot
                elif up_id and hgnc_sym:
                    # Get HGNC Symbol from Uniprot
                    gene_name = uniprot_client.get_gene_name(up_id)
                    if not gene_name:
                        raise ValueError('No gene name found for Uniprot '
                                         'ID %s (expected %s)' %
                                         (up_id, hgnc_sym))
                    # We got gene name, compare it to the HGNC name
                    else:
                        if gene_name != hgnc_sym:
                            raise ValueError('Gene name %s for Uniprot ID '
                                             '%s does not match HGNC '
                                             'symbol %s given in grounding '
                                             'map.' %
                                             (gene_name, up_id, hgnc_sym))
                        else:
                            hgnc_id = hgnc_client.get_hgnc_id(hgnc_sym)
                            if not hgnc_id:
                                raise ValueError('No HGNC ID '
                                                 'corresponding to gene '
                                                 'symbol %s in grounding '
                                                 'map.' % hgnc_sym)
                            # NOTE(review): unlike in the other branches,
                            # the HGNC entry is not replaced by hgnc_id
                            # here - confirm whether that is intended
                # Assign the DB refs from the grounding map to the agent
                agent.db_refs = map_db_refs
                # Are we renaming right now?
                if do_rename:
                    # If there's a Bioentities ID, prefer that for the name
                    if agent.db_refs.get('BE'):
                        agent.name = agent.db_refs.get('BE')
                    # Get the HGNC symbol or gene name (retrieved above)
                    elif hgnc_sym is not None:
                        agent.name = hgnc_sym
                    elif gene_name is not None:
                        agent.name = gene_name
        # Check if we should skip the statement
        if not skip_stmt:
            mapped_stmts.append(mapped_stmt)
    logger.info('%s statements filtered out' % num_skipped)
    return mapped_stmts
def get_agent_from_entity_info(entity_info):
    """Return an INDRA Agent by processing an entity_info dict.

    Parameters
    ----------
    entity_info : dict
        A dict with 'entityText', 'charStart' and 'charEnd' entries and an
        'entityId' entry holding a list of dicts, each with at least
        'source' and 'idString' keys.

    Returns
    -------
    agent : Agent or None
        The Agent built from the entity, or None if the entity has more
        than one Entrez or more than one UniProt entry.
    raw_coords : tuple or None
        The (charStart, charEnd) values of the entity, or None if no Agent
        is returned.
    """
    # This will be the default name. If we get a gene name, it will
    # override this rawtext name.
    raw_text = entity_info['entityText']
    name = raw_text

    # Get the db refs.
    refs = {'TEXT': raw_text}

    ref_counts = Counter([entry['source'] for entry in
                          entity_info['entityId']])
    for source, count in ref_counts.items():
        if source in ('Entrez', 'UniProt') and count > 1:
            logger.info('%s has %d entries for %s, skipping'
                        % (raw_text, count, source))
            return None, None
    muts = []
    for id_dict in entity_info['entityId']:
        if id_dict['source'] == 'Entrez':
            refs['EGID'] = id_dict['idString']
            hgnc_id = hgnc_client.get_hgnc_from_entrez(id_dict['idString'])
            if hgnc_id is not None:
                # Check against what we may have already inferred from
                # UniProt. If it disagrees with this, let it be. Inference
                # from Entrez isn't as reliable.
                if 'HGNC' in refs:
                    if refs['HGNC'] != hgnc_id:
                        msg = ('HGNC:%s previously set does not'
                               ' match HGNC:%s from EGID:%s') % \
                               (refs['HGNC'], hgnc_id, refs['EGID'])
                        logger.info(msg)
                else:
                    refs['HGNC'] = hgnc_id
        elif id_dict['source'] == 'UniProt':
            refs['UP'] = id_dict['idString']
            gene_name = uniprot_client.get_gene_name(id_dict['idString'])
            if gene_name is not None:
                name = gene_name
                hgnc_id = hgnc_client.get_hgnc_id(gene_name)
                if hgnc_id is not None:
                    # Check to see if we have a conflict with an HGNC id
                    # found from the Entrez id. If so, overwrite with this
                    # one, in which we have greater faith.
                    if 'HGNC' in refs and refs['HGNC'] != hgnc_id:
                        # At this point hgnc_id is the ID inferred from
                        # UniProt and refs['HGNC'] the one set earlier
                        # from Entrez, so they are reported in that order
                        msg = ('Inferred HGNC:%s from UP:%s does not'
                               ' match HGNC:%s from EGID:%s') % \
                               (hgnc_id, refs['UP'], refs['HGNC'],
                                refs['EGID'])
                        logger.info(msg)
                    refs['HGNC'] = hgnc_id
        elif id_dict['source'] in ('Tax', 'NCBI'):
            refs['TAX'] = id_dict['idString']
        elif id_dict['source'] == 'CHEBI':
            refs['CHEBI'] = 'CHEBI:%s' % id_dict['idString']
        # These we take as is
        elif id_dict['source'] in ('MESH', 'OMIM', 'CTD'):
            refs[id_dict['source']] = id_dict['idString']
        # Handle mutations
        elif id_dict['source'] == 'Unk' and \
                id_dict['entityType'] == 'ProteinMutation':
            # {'idString': 'p|SUB|Y|268|A', 'source': 'Unk',
            #  'tool': 'PubTator', 'entityType': 'ProteinMutation'}
            # Mpk1(Y268A)'
            if id_dict['idString'].startswith('p|SUB|'):
                try:
                    # Handle special cases like p|SUB|A|30|P;RS#:104893878
                    parts = id_dict['idString'].split(';')[0].split('|')
                    residue_from, pos, residue_to = parts[2:5]
                    mut = MutCondition(pos, residue_from, residue_to)
                    muts.append(mut)
                except Exception:
                    # Deliberately best-effort: a mutation that cannot be
                    # processed is logged and left out
                    logger.info('Could not process mutation %s' %
                                id_dict['idString'])
            else:
                logger.info('Unhandled mutation: %s' % id_dict['idString'])
        else:
            logger.warning("Unhandled id type: {source}={idString}"
                           .format(**id_dict))

    raw_coords = (entity_info['charStart'], entity_info['charEnd'])
    return Agent(name, db_refs=refs, mutations=muts), raw_coords
def test_get_gene_name_no_gene_name():
    """Check that P04434 has no gene name, without and with web fallback."""
    for use_web in (False, True):
        symbol = uniprot_client.get_gene_name('P04434', web_fallback=use_web)
        assert symbol is None
def test_get_gene_name_multiple_gene_names():
    """Check the gene name returned for the UniProt ID Q5VWM5."""
    symbol = uniprot_client.get_gene_name('Q5VWM5')
    assert symbol == 'PRAMEF9'
def get_participant(agent):
    """Return a participant dict describing an Agent.

    The dict has an 'entity_text' entry and, depending on the groundings
    available (HGNC/UP, CHEBI or PFAM-DEF, in this order), 'identifier',
    'entity_type' and 'entities' entries. Bound conditions, modifications
    and mutations of the Agent are listed under 'features', or under
    'not_features' for unbound/unmodified conditions; these two entries are
    only present if they are not empty. A None agent is represented as a
    generic protein.
    """
    # Handle missing Agent as generic protein
    if agent is None:
        return get_generic('protein')

    refs = agent.db_refs
    text_name = refs.get('TEXT')
    if text_name is None:
        text_name = agent.name
    participant = {'entity_text': [text_name]}

    hgnc_id = refs.get('HGNC')
    uniprot_id = refs.get('UP')
    chebi_id = refs.get('CHEBI')
    pfam_def_ids = refs.get('PFAM-DEF')
    # If HGNC grounding is available, that is the first choice
    if hgnc_id:
        uniprot_id = hgnc_client.get_uniprot_id(hgnc_id)

    if uniprot_id:
        mnemonic = str(uniprot_client.get_mnemonic(uniprot_id))
        participant['identifier'] = 'UNIPROT:%s' % mnemonic
        participant['entity_type'] = 'protein'
    elif chebi_id:
        pubchem_id = chebi_client.get_pubchem_id(chebi_id)
        participant['identifier'] = 'PUBCHEM:%s' % pubchem_id
        participant['entity_type'] = 'chemical'
    elif pfam_def_ids:
        participant['entity_type'] = 'protein_family'
        participant['entities'] = []
        # All the '|'-separated <name space>:<ID> entries are parsed
        # before any of them is looked up
        pairs = (entry.split(':') for entry in pfam_def_ids.split('|'))
        members = [{ns: ident} for ns, ident in pairs]
        for member in members:
            # TODO: handle non-uniprot protein IDs here
            member_up_id = member.get('UP')
            if not member_up_id:
                continue
            mnemonic = str(uniprot_client.get_mnemonic(member_up_id))
            gene_name = uniprot_client.get_gene_name(member_up_id)
            participant['entities'].append({
                'entity_text': ["" if gene_name is None else gene_name],
                'identifier': 'UNIPROT:%s' % mnemonic,
                'entity_type': 'protein',
            })
    else:
        participant['identifier'] = ''
        participant['entity_type'] = 'protein'

    features = []
    not_features = []
    # Binding features
    for bc in agent.bound_conditions:
        binding = {
            'feature_type': 'binding_feature',
            'bound_to': {
                # NOTE: get type and identifier for bound to protein
                'entity_type': 'protein',
                'entity_text': [bc.agent.name],
                'identifier': '',
            },
        }
        (features if bc.is_bound else not_features).append(binding)
    # Modification features
    for mod in agent.mods:
        modification = {
            'feature_type': 'modification_feature',
            'modification_type': mod.mod_type.lower(),
        }
        if mod.position is not None:
            modification['location'] = int(mod.position)
        if mod.residue is not None:
            modification['aa_code'] = mod.residue
        (features if mod.is_modified else not_features).append(modification)
    # Mutation features
    for mut in agent.mutations:
        mutation = {'feature_type': 'mutation_feature'}
        if mut.residue_from is not None:
            mutation['from_aa'] = mut.residue_from
        if mut.residue_to is not None:
            mutation['to_aa'] = mut.residue_to
        if mut.position is not None:
            mutation['location'] = int(mut.position)
        features.append(mutation)

    if features:
        participant['features'] = features
    if not_features:
        participant['not_features'] = not_features
    return participant
def _fix_agent(agent): if agent is None: return # First we fix some name spaces db_refs_tmp = copy(agent.db_refs) for db_ns, db_id in agent.db_refs.items(): # Change FA name space if db_ns == 'FA': db_refs_tmp.pop('FA', None) db_refs_tmp['NXPFA'] = db_id # Change IPR name space elif db_ns == 'IPR': db_refs_tmp.pop('IPR', None) db_refs_tmp['IP'] = db_id # Change XFAM name space elif db_ns == 'XFAM': db_refs_tmp.pop('XFAM', None) db_refs_tmp['PF'] = db_id.split('.')[0] elif db_ns == 'GO': if db_id.startswith('GO:'): db_refs_tmp['GO'] = db_id else: db_refs_tmp['GO'] = 'GO:' + db_id # Change PCID name space elif db_ns == 'PCID': db_refs_tmp.pop('PCID', None) db_refs_tmp['PUBCHEM'] = db_id agent.db_refs = db_refs_tmp # Check if we have a FPLX entry and handle old BE mappings if 'BE' in agent.db_refs: agent.db_refs['FPLX'] = agent.db_refs.pop('BE') be_id = agent.db_refs.get('FPLX') # Try to map to FPLX from NXP, IPR, PF, NCIT if not be_id: for db_ns, db_id in agent.db_refs.items(): be_id = famplex_map.get((db_ns, db_id)) if be_id: break # Try mapping NCIT to specific genes if possible if not be_id and 'NCIT' in agent.db_refs: target = ncit_map.get(agent.db_refs['NCIT']) if target: agent.db_refs[target[0]] = target[1] # Check what entries we have up_id = agent.db_refs.get('UP') hgnc_id = agent.db_refs.get('HGNC') # FPLX takes precedence if we have it if be_id: agent.db_refs['FPLX'] = be_id agent.name = be_id elif hgnc_id: gene_name = hgnc_client.get_hgnc_name(hgnc_id) if gene_name: agent.name = gene_name if not up_id: up_id = hgnc_client.get_uniprot_id(hgnc_id) if up_id: agent.db_refs['UP'] = up_id elif up_id: gene_name = uniprot_client.get_gene_name(up_id) if gene_name: agent.name = gene_name hgnc_id = hgnc_client.get_hgnc_id(gene_name) if hgnc_id: agent.db_refs['HGNC'] = hgnc_id # If it doesn't have a gene name, it's better to just # use the raw string name otherwise Sparser sets # has Uniprot IDs or mnemonics as the name else: name = agent.db_refs.get('TEXT', 
agent.name) agent.name = name
def test_get_gene_name_unreviewed():
    """Check the gene name returned for X6RK18 without web fallback."""
    symbol = uniprot_client.get_gene_name('X6RK18', web_fallback=False)
    assert symbol == 'EXO5'
    assert unicode_strs(symbol)