def parse_context_entry(entry, grounder, sentence=None):
    """Turn a ``<type>: <text>`` entry into a single-item context dict.

    Parameters
    ----------
    entry : str
        Text of the form ``<context type>: <context text>``.
    grounder : callable
        Called as ``grounder(context_txt, context=sentence)``. Its result is
        used as a (possibly empty) sequence whose first item exposes
        ``term.db`` and ``term.id``.
    sentence : str, optional
        Passed through to the grounder as ``context``.

    Returns
    -------
    dict or None
        ``{allowed_contexts[context_type]: RefContext}``, or None if the
        entry does not match the pattern or its type is not in
        ``allowed_contexts``.
    """
    parsed = re.match(r'(.*): (.*)', entry)
    if parsed is None:
        return None

    context_type, context_txt = parsed.groups()
    if context_type not in allowed_contexts:
        logger.warning('Unknown context type %s' % context_type)
        return None

    terms = grounder(context_txt, context=sentence)
    if terms:
        # Only the first term returned by the grounder is used
        best = terms[0].term
        db_refs = standardize_db_refs({best.db: best.id})
        standard_name = bio_ontology.get_name(best.db, best.id)
    else:
        logger.warning('Could not ground %s context: %s'
                       % (context_type, context_txt))
        db_refs = {}
        standard_name = None

    # The raw text is always kept, grounded or not
    db_refs['TEXT'] = context_txt
    name = standard_name or context_txt
    return {allowed_contexts[context_type]: RefContext(name=name,
                                                       db_refs=db_refs)}
def test_lspci():
    """Check the name and a known subset of the children of LSPCI:18."""
    ns, lspci_id = 'LSPCI', '18'
    assert bio_ontology.get_name(ns, lspci_id) == 'Pentane-1,5-Diamine'

    actual_members = set(bio_ontology.get_children(ns, lspci_id))
    # Only a sample of the members is listed here, hence the strict-subset
    # comparison rather than equality
    known_members = {
        ('CAS', '462-94-2'),
        ('CHEBI', 'CHEBI:18127'),
        ('CHEMBL', 'CHEMBL119296'),
        ('PUBCHEM', '273'),
    }
    assert known_members < actual_members
def get_categories(fplx_node):
    """Return the set of category labels of a family node's HGNC children.

    Children whose name has no entry in ``categories`` are ignored.
    """
    node_ref = bio_ontology.get_ns_id(fplx_node)
    hgnc_children = bio_ontology.get_children(*node_ref, ns_filter='HGNC')

    labels = set()
    for child in hgnc_children:
        child_name = bio_ontology.get_name(*child)
        if child_name in categories:
            labels.add(categories[child_name])
    return labels
def get_category(node):
    """Return the category node mapped to a specific ontology node.

    The node's name is looked up in ``categories``; a truthy category is
    translated through ``category_map``. None is returned otherwise.
    """
    node_ref = bio_ontology.get_ns_id(node)
    node_name = bio_ontology.get_name(*node_ref)
    label = categories.get(node_name)
    return category_map[label] if label else None
def test_mesh_replacements():
    """Check that a replaced MeSH ID resolves to the COVID-19 descriptor."""
    old_id, new_id = 'C000657245', 'D000086382'

    assert bio_ontology.get_name('MESH', new_id) == 'COVID-19'
    assert bio_ontology.isrel('MESH', old_id, 'MESH', new_id,
                              {'replaced_by'})

    replacement = bio_ontology.get_replacement('MESH', old_id)
    assert replacement == ('MESH', new_id)

    # Standardization is expected to apply the replacement as well
    standardized = standardize_db_refs({'MESH': old_id})
    assert standardized.get('MESH') == new_id
def get_go_receptors():
    """Return the genes annotated with receptor-like GO terms.

    The GO term 'signaling receptor activity' is expanded to its child
    terms, and only terms whose name contains 'receptor', 'sensor' or
    'channel' are kept. Genes for the kept terms are collected, then genes
    annotated under GO:0004879 and its children (the nuclear receptor
    terms) are removed, along with 'NR2C2' and 'EGF'.

    Returns
    -------
    set
        The result of the set difference described above; the elements are
        whatever ``get_genes_for_go_ids`` yields.
    """
    receptor_terms = ['signaling receptor activity']
    receptor_go_ids = [
        bio_ontology.get_id_from_name('GO', term)[1]
        for term in receptor_terms
    ]
    receptor_go_ids = expand_with_child_go_terms(receptor_go_ids)

    # Keep only terms whose name looks receptor-like. The name is looked up
    # once per term, and a missing name is treated as an empty string so
    # that the substring test cannot raise a TypeError.
    keywords = ('receptor', 'sensor', 'channel')
    filtered_go_ids = set()
    for go_id in receptor_go_ids:
        go_name = bio_ontology.get_name('GO', go_id) or ''
        if any(keyword in go_name for keyword in keywords):
            filtered_go_ids.add(go_id)

    # Filtering out the nuclear receptors from the receptor list
    nuclear_receptor_go_ids = expand_with_child_go_terms(['GO:0004879'])
    receptor_genes_go = get_genes_for_go_ids(filtered_go_ids) - \
        get_genes_for_go_ids(nuclear_receptor_go_ids)
    receptor_genes_go -= {'NR2C2', 'EGF'}
    return receptor_genes_go
def get_binding_site_name(agent):
    """Return a lower-cased binding site name for the given agent.

    The name of the agent's first top-level ontology parent is used when
    the agent is grounded and such a parent with a name exists. Otherwise
    the agent's own name is used.
    """
    parent_name = None
    grounding = agent.get_grounding()
    if grounding != (None, None):
        top_parents = bio_ontology.get_top_level_parents(*grounding)
        if top_parents:
            parent_name = bio_ontology.get_name(*top_parents[0])

    # Fall back to the agent's own name when no parent name was found
    return _n(parent_name or agent.name).lower()
def rename_chemical(agent):
    """Rename a chemical agent in place using the best available name.

    If the agent's CHEBI ID has a truthy entry in ``chebi_to_selleck_dict``,
    that entry becomes the name. Otherwise the remaining ``db_refs``
    (skipping TEXT, TEXT_NORM and CHEBI) are tried in sorted namespace
    order, and the first one for which the ontology returns a name wins.
    The agent keeps its current name if nothing is found.

    Parameters
    ----------
    agent :
        Object with a ``db_refs`` dict and a writable ``name`` attribute.
        An agent without a CHEBI reference is accepted and goes straight
        to the namespace fallback.
    """
    from indra.ontology.bio import bio_ontology

    # .get() rather than indexing: an agent without CHEBI grounding should
    # fall through to the other namespaces instead of raising KeyError
    chebi_id = agent.db_refs.get('CHEBI')
    if chebi_id is not None and chebi_id in chebi_to_selleck_dict:
        selleckname = chebi_to_selleck_dict[chebi_id]
        if selleckname:
            agent.name = selleckname
            return

    for db_ns, db_id in sorted(agent.db_refs.items()):
        if db_ns in {'TEXT', 'TEXT_NORM', 'CHEBI'}:
            continue
        name = bio_ontology.get_name(db_ns, db_id)
        if name:
            agent.name = name
            return
def get_binding_site_name(agent):
    """Return a lower-cased binding site name for the given agent.

    For agents grounded to HGNC or FPLX, the name of the first top-level
    ontology parent is used when one with a name exists. In every other
    case the agent's own name is used.
    """
    own_name = _n(agent.name).lower

    grounding = agent.get_grounding()
    if grounding == (None, None):
        return own_name()
    # Parent lookup is limited to HGNC/FPLX so that very deep ontologies
    # such as CHEBI (e.g., GTP), which take thousands of lookups to
    # resolve, are never traversed here
    if grounding[0] not in {'HGNC', 'FPLX'}:
        return own_name()

    top_parents = bio_ontology.get_top_level_parents(*grounding)
    if not top_parents:
        return own_name()

    parent_name = bio_ontology.get_name(*top_parents[0])
    if not parent_name:
        return own_name()
    return _n(parent_name).lower()
def _require_agent(self, ag, ns, num=None):
    """Record an agent name in ``agent_set`` when in strict mode.

    Parameters
    ----------
    ag :
        The agent identifier; used directly as the name for the NAME and
        FPLX namespaces, and passed to the ontology name lookup otherwise.
    ns : str
        Namespace of ``ag``. Nothing is recorded for 'TEXT'.
    num : optional
        If not None, the name is also stored in ``agent_dict`` under this
        key.
    """
    if not self.strict:
        return
    if ns == 'TEXT':
        # Open question kept from the original: it is undecided what should
        # happen for plain TEXT, so nothing is recorded
        return

    if ns in ('NAME', 'FPLX'):
        name = ag
    else:
        name = bio_ontology.get_name(ns, ag)

    self.agent_set.add(name)
    if num is not None:
        self.agent_dict[num] = name
def applicable(model, test):
    """Return True if all test entities are in the set of model entities.

    Each test entity is expanded into a group holding the entity itself
    plus one Agent per ontology child of its grounding. The groups are then
    compared against the model entities by
    ``RefinementTestConnector._overlap``.
    """
    model_entities = model.entities

    entity_groups = []
    for entity in test.get_entities():
        entity_ns, entity_id = entity.get_grounding()
        child_agents = [
            Agent(bio_ontology.get_name(child_ns, child_id),
                  db_refs={child_ns: child_id})
            for child_ns, child_id
            in bio_ontology.get_children(entity_ns, entity_id)
        ]
        entity_groups.append([entity] + child_agents)

    return RefinementTestConnector._overlap(model_entities, entity_groups)
def _make_famplex_lookup():
    """Create a famplex lookup dictionary.

    Keys are sorted tuples of the names of a FamPlex node's HGNC children
    and values are the corresponding FamPlex ID.
    """
    bio_ontology.initialize()

    lookup = {}
    for node in bio_ontology.nodes:
        ns, node_id = bio_ontology.get_ns_id(node)
        if ns != 'FPLX':
            continue
        gene_names = sorted(
            bio_ontology.get_name(*child)
            for child in bio_ontology.get_children(ns, node_id)
            if child[0] == 'HGNC'
        )
        lookup[tuple(gene_names)] = node_id
    return lookup
def _add_node(self, agent, uuid=None):
    """Return the node id for an agent, creating the node if necessary.

    Nodes are keyed by agent name. For an existing node only the given
    uuid is added to its ``uuid_list`` (if not already there). For a new
    node, a node dict is appended to ``self._nodes``; when the agent has
    FPLX grounding, its HGNC children are listed under ``members`` with
    identifiers URLs for their standardized db_refs.
    """
    key = agent.name
    known_id = self._existing_nodes.get(key)

    # An existing node is never added again, but the uuid of the statement
    # that mentions it still has to be registered on it
    if known_id is not None:
        matches = [nd for nd in self._nodes if nd['data']['id'] == known_id]
        uuids = matches[0]['data']['uuid_list']
        if uuid not in uuids:
            uuids.append(uuid)
        return known_id

    db_refs = _get_db_refs(agent)
    new_id = self._get_new_id()
    self._existing_nodes[key] = new_id
    display_name = agent.name.replace('_', ' ')

    family_members = []
    if 'FPLX' in db_refs:
        family_members = bio_ontology.get_children(
            *agent.get_grounding(), ns_filter={'HGNC'})

    members = {}
    for member in family_members:
        member_refs = standardize_db_refs({member[0]: member[1]})
        gene_name = bio_ontology.get_name(*member)
        url_refs = {}
        members[gene_name] = {'db_refs': url_refs}
        # Only references that resolve to a URL are listed
        for ref_ns, ref_id in member_refs.items():
            url = get_identifiers_url(ref_ns, ref_id)
            if url:
                url_refs[ref_ns] = url

    self._nodes.append({
        'data': {
            'id': new_id,
            'name': display_name,
            'db_refs': db_refs,
            'parent': '',
            'members': members,
            'uuid_list': [uuid],
        }
    })
    return new_id
def get_pain_mol():
    """Return the names of the ontology children of pain signal molecules.

    Returns
    -------
    dict
        Maps each compound label to the list of names the ontology returns
        for the CHEBI children of that compound.
    """
    # The keys are part of the returned mapping and are kept exactly as
    # spelled, since callers may look them up
    pain_signal_mol = {
        "Prostaglandins": "CHEBI:26333",
        "Brandykinin": "CHEBI:3165"
    }

    names_by_compound = {}
    for compound, chebi_id in pain_signal_mol.items():
        child_ids = [
            child[1]
            for child in bio_ontology.get_children('CHEBI', chebi_id)
        ]
        names_by_compound[compound] = [
            bio_ontology.get_name('CHEBI', child_id)
            for child_id in child_ids
        ]
    return names_by_compound
def get_receptors():
    """Return receptor genes from GO plus the listed ion channels.

    The GO term 'signaling receptor activity' is expanded to its child
    terms. Terms whose name contains 'receptor' are kept, and those under
    GO:0004879 are removed. Genes for the remaining terms, minus 'NR2C2'
    and 'EGF', are merged with the entries read line by line from the
    ``ION_CHANNELS`` file.
    """
    receptor_terms = ['signaling receptor activity']
    root_ids = [bio_ontology.get_id_from_name('GO', term)[1]
                for term in receptor_terms]
    candidate_ids = expand_with_child_go_terms(root_ids)

    named_receptor_ids = {
        go_id for go_id in candidate_ids
        if 'receptor' in bio_ontology.get_name('GO', go_id)
    }
    # Filtering out the nuclear receptors from the receptor list
    nuclear_ids = expand_with_child_go_terms(['GO:0004879'])
    receptor_go_ids = named_receptor_ids - nuclear_ids

    receptor_genes_go = get_genes_for_go_ids(receptor_go_ids)
    receptor_genes_go -= {'NR2C2', 'EGF'}

    # Add ION channels to the receptor list
    with open(ION_CHANNELS, 'r') as fh:
        ion_channels = {line.strip() for line in fh}
    receptor_genes_go |= ion_channels
    return receptor_genes_go
def get_enzyme_products(de_enzymes):
    """Return the products of the given enzymes, sorted by logFC.

    The SIF file at ``PC_SIF_URL`` is read as a headerless, tab-separated
    table. Rows are kept when the first column is one of the enzymes and
    the second column starts with 'controls-production-of'.

    Parameters
    ----------
    de_enzymes : dict
        Mapping whose keys are logFC values and whose values are enzyme
        names (this is how the keys and values are used below). If an
        enzyme occurs more than once, the first logFC is used.

    Returns
    -------
    pandas.DataFrame
        Columns 'Enzyme', 'Interaction', 'product' and 'logFC', sorted by
        'logFC' in descending order. Products that start with 'CHEBI' are
        replaced by their ontology name when one is available. The frame
        has these columns even when no row matches.
    """
    df = pd.read_csv(PC_SIF_URL, sep='\t', header=None)

    # One dict lookup per enzyme instead of a list.index() scan per row;
    # setdefault keeps the first logFC, as list.index() did
    enzyme_to_logfc = {}
    for logfc, enzyme in de_enzymes.items():
        enzyme_to_logfc.setdefault(enzyme, logfc)

    is_production = \
        df[1].astype(str).str.startswith('controls-production-of')
    hits = df[df[0].isin(list(enzyme_to_logfc)) & is_production]

    # Building from columns guarantees that 'logFC' exists even with zero
    # matching rows, so the final sort cannot fail with a KeyError
    filtered_df = pd.DataFrame({
        'Enzyme': hits[0].values,
        'Interaction': hits[1].values,
        'product': hits[2].values,
        'logFC': hits[0].map(enzyme_to_logfc).values,
    })

    # If there are any CHEBI ID's, then convert their ID's to names.
    # The column is addressed by label, not position, and each distinct ID
    # is looked up only once.
    is_chebi = filtered_df['product'].astype(str).str.startswith('CHEBI')
    if is_chebi.any():
        chebi_products = filtered_df.loc[is_chebi, 'product']
        chebi_names = {
            chebi_id: bio_ontology.get_name('CHEBI', chebi_id)
            for chebi_id in chebi_products.unique()
        }
        # Keep the ID itself when the ontology has no name for it
        filtered_df.loc[is_chebi, 'product'] = chebi_products.map(
            lambda chebi_id: chebi_names[chebi_id] or chebi_id)

    return filtered_df.sort_values(by='logFC', ascending=False)
def unify_lspci(stmts):
    """Unify agents that share an LSPCI grounding and run preassembly.

    Agents are grouped by their LSPCI ID, taken from ``db_refs['LSPCI']``
    or, failing that, from a unique LSPCI ontology parent of their
    grounding. All agents of a group are modified in place to carry the
    same LSPCI ID and name: the ontology name of the LSPCI entry if there
    is one, otherwise the name of the first agent in the group. The
    statements are then passed through ``ac.run_preassembly`` with
    refinement disabled.

    While this runs, ``indra.statements.agent.default_ns_order`` is
    temporarily prefixed with 'LSPCI'. It is restored on exit, including
    when an exception is raised.

    Parameters
    ----------
    stmts : list
        Statements whose agents are unified in place.

    Returns
    -------
    list
        The statements returned by preassembly.
    """
    from indra.statements.agent import default_ns_order
    from indra.ontology.bio import bio_ontology
    logger.info('Unifying by LSPCI with %d statements', len(stmts))

    orig_ns_order = indra.statements.agent.default_ns_order[:]
    indra.statements.agent.default_ns_order = ['LSPCI'] + \
        indra.statements.agent.default_ns_order
    # The module-level order is global state; try/finally guarantees it is
    # put back even if grounding lookup or preassembly fails
    try:
        agents_by_lspci = defaultdict(list)
        # default_ns_order was imported before the module attribute was
        # rebound above, so this order does not include the LSPCI prefix
        ns_order = default_ns_order + ['CHEMBL', 'DRUGBANK', 'HMS-LINCS',
                                       'CAS']
        for stmt in stmts:
            for agent in stmt.real_agent_list():
                if 'LSPCI' in agent.db_refs:
                    agents_by_lspci[agent.db_refs['LSPCI']].append(agent)
                    continue
                agent_gr = agent.get_grounding(ns_order=ns_order)
                if agent_gr[0] is None:
                    continue
                parents = bio_ontology.get_parents(*agent_gr)
                lspci_parents = [p[1] for p in parents if p[0] == 'LSPCI']
                # Only an unambiguous LSPCI parent is used
                if len(lspci_parents) != 1:
                    continue
                agents_by_lspci[lspci_parents[0]].append(agent)

        for lspci, agents in agents_by_lspci.items():
            lspci_name = bio_ontology.get_name('LSPCI', lspci)
            standard_name = lspci_name if lspci_name else agents[0].name
            for agent in agents:
                agent.db_refs['LSPCI'] = lspci
                agent.name = standard_name

        unique_stmts = ac.run_preassembly(stmts, run_refinement=False)
    finally:
        indra.statements.agent.default_ns_order = orig_ns_order

    logger.info('Finished unification with %d statements',
                len(unique_stmts))
    return unique_stmts
def normalize_sif_names(sif_df: DataFrame):
    """Try to normalize names in the sif dump dataframe

    This function tries to normalize the names of the entities in the sif
    dump. The 'bio_ontology' is the arbiter of what constitutes a
    normalized name. If no name exists, no further attempt to change the
    name is made.

    The dataframe is modified in place and nothing is returned. The new
    names are written back by row position, so the dataframe does not need
    a default integer index.

    Parameters
    ----------
    sif_df :
        The sif dataframe. The columns agA_ns, agA_id, agA_name, agB_ns,
        agB_id and agB_name are read; agA_name and agB_name are updated.

    Raises
    ------
    AssertionError
        If one of the sanity checks run after renaming fails.
    """
    from indra.ontology.bio import bio_ontology
    bio_ontology.initialize()
    logger.info('Getting ns, id, name tuples')

    # Get the set of grounded entities
    ns_id_name_tups = set(
        zip(sif_df.agA_ns, sif_df.agA_id, sif_df.agA_name)).union(
        set(zip(sif_df.agB_ns, sif_df.agB_id, sif_df.agB_name))
    )

    # Get the ontology name, if it exists, and check if the name in the
    # dataframe needs update
    logger.info('Checking which names need updating')
    inserted_set = set()
    for ns_, id_, cur_name in tqdm(ns_id_name_tups):
        oname = bio_ontology.get_name(ns_, id_)
        # If there is a name in the ontology and it is different than the
        # original, insert it (the set takes care of duplicates)
        if oname and oname != cur_name:
            inserted_set.add((ns_, id_, oname))

    if not inserted_set:
        logger.info('No names need renaming')
        return

    logger.info(f'Found {len(inserted_set)} names in dataframe that need '
                f'renaming')

    # Make dataframe of rename dict
    logger.info('Making rename dataframe')
    rename_df = pd.DataFrame(list(inserted_set),
                             columns=['ns', 'id', 'name'])

    # Do merge on with relevant columns from sif for both A and B
    logger.info('Getting temporary dataframes for renaming')
    for prefix in ('agA', 'agB'):
        ns_col, id_col = prefix + '_ns', prefix + '_id'
        name_col = prefix + '_name'
        # Each (ns, id) pair occurs once in rename_df, so the left merge
        # returns exactly one row per sif_df row, in the same order
        merged = sif_df[[ns_col, id_col]].merge(
            right=rename_df,
            left_on=[ns_col, id_col],
            right_on=['ns', 'id'],
            how='left'
        )
        # The merge result carries a fresh integer index. Plain arrays are
        # used so that the assignment is positional and cannot be
        # misaligned against whatever index sif_df has.
        new_names = merged['name'].to_numpy()
        has_new_name = pd.notna(new_names)
        sif_df.loc[has_new_name, name_col] = new_names[has_new_name]

    # Check that there are no missing names
    logger.info('Performing sanity checks')
    assert sum(pd.isna(sif_df.agA_name)) == 0
    assert sum(pd.isna(sif_df.agB_name)) == 0

    # Get the set of ns, id, name tuples and check diff
    ns_id_name_tups_after = set(
        zip(sif_df.agA_ns, sif_df.agA_id, sif_df.agA_name)).union(
        set(zip(sif_df.agB_ns, sif_df.agB_id, sif_df.agB_name))
    )
    # Check that rename took place
    assert ns_id_name_tups_after != ns_id_name_tups
    # Check that all new names are used
    assert set(rename_df.name).issubset(
        {n for _, _, n in ns_id_name_tups_after})
    logger.info('Sif dataframe renamed successfully')
def set_style_expression_mutation(self, model, cell_line='A375_SKIN'):
    """Sets the fill color of each node based on its expression level on
    the given cell line, and the stroke color based on whether it is a
    mutation.

    Agents without HGNC or FPLX grounding get a fixed fill color. For the
    others, the mean expression level (over family members for FPLX) is
    rescaled to the range spanned by all agents and mapped to a color.

    Parameters
    ----------
    model: list<indra.statements.Statement>
        A list of INDRA statements
    cell_line: str
        A cell line for which we're interested in protein expression level
    """
    def _is_gene(agent):
        return 'HGNC' in agent.db_refs or 'FPLX' in agent.db_refs

    # Find an agent for each glyph label; the last matching agent wins
    label_to_agent = {}
    for label in self.label_to_glyph_ids.keys():
        for statement in model:
            for agent in statement.agent_list():
                if agent is None:
                    continue
                if _n(agent.name) == label:
                    label_to_agent[label] = agent

    agent_to_expression_level = {}
    for agent in label_to_agent.values():
        if not _is_gene(agent):
            # This is not a gene
            agent_to_expression_level[agent] = 0
            continue

        if 'FPLX' in agent.db_refs:
            children = bio_ontology.get_children('FPLX',
                                                 agent.db_refs['FPLX'])
            gene_names = [bio_ontology.get_name(*child)
                          for child in children]
        else:
            gene_names = [agent.name]

        # Compute mean expression level over all reported, non-missing
        # values
        logger.info('Getting expression status of proteins: %s'
                    % str(gene_names))
        expression = self.get_expression(gene_names, cell_line)
        known_levels = [
            expression[line][element]
            for line in expression
            for element in expression[line]
            if expression[line][element] is not None
        ]
        if known_levels:
            mean_level = sum(known_levels) / len(known_levels)
        else:
            mean_level = None
        agent_to_expression_level[agent] = mean_level

    # Create a normalized expression score between 0 and 1
    observed = [level for level in agent_to_expression_level.values()
                if level is not None]
    agent_to_score = {}
    if observed:
        min_level = min(observed)
        level_span = max(observed) - min_level
        for agent, level in agent_to_expression_level.items():
            if level is None or level_span == 0:
                agent_to_score[agent] = 0
            else:
                agent_to_score[agent] = (level - min_level) / level_span

    # Map scores to colors and assign colors to labels
    for agent, score in agent_to_score.items():
        if _is_gene(agent):
            # color = cm.plasma(score)
            color = cm.Greens(0.6*score + 0.2)
            color_str = colors.to_hex(color[:3])
            assert(len(color_str) == 7)
        else:
            color = cm.Blues(0.3)
            color_str = colors.to_hex(color[:3])
        stroke_color = \
            self._choose_stroke_color_from_mutation_status(agent.name,
                                                           cell_line)
        self.set_style(agent.name, stroke_color, color_str)
def get_name(namespace: str, identifier: str) -> str:
    """Return the name bio_ontology gives for a namespace/identifier pair."""
    return bio_ontology.get_name(namespace, identifier)
for compound, names in PAIN_MOL_NAMES.items() if rows[2] in names] df = pd.DataFrame(celltype_pain_interaction) return df if __name__ == "__main__": df = pd.read_csv(PC_SIF_URL, sep='\t', header=None) df = df[df[1] == 'controls-production-of'] pain_signal_mol = { "Prostaglandins": "CHEBI:26333", "Brandykinin": "CHEBI:3165" } chebi_list = {} for compounds, chebi_id in pain_signal_mol.items(): chebi_list[compounds] = [ children[1] for children in bio_ontology.get_children('CHEBI', chebi_id) ] df = df[df[2].isin(chebi_list)] chebi_stmts = [{ 'Enzyme': row[0], 'Statement': row[1], 'CHEBI_ID': row[2], 'CHEBI_Name': bio_ontology.get_name('CHEBI', row[2]) } for _, row in df.iterrows()] df = pd.DataFrame(chebi_stmts) df.to_csv("enzyme_interactions.tsv", sep="\t", header=True, index=False)