def get_famplex_links_from_lists(genes_appearing, fplx_appearing):
    """Return gene-family and family-family links among the given entities."""
    links = []
    # Link genes to any of their FamPlex parents that appear in the given set
    for gene in genes_appearing:
        parent_ids = [p[1] for p in bio_ontology.get_parents('HGNC', gene)]
        parents_appearing = fplx_appearing & set(parent_ids)
        links += [(gene, parent) for parent in parents_appearing]
    # Link FamPlex entries to their own FamPlex parents in the given set
    for fplx_child in fplx_appearing:
        parent_ids = [p[1] for p in
                      bio_ontology.get_parents('FPLX', fplx_child)]
        parents_appearing = fplx_appearing & set(parent_ids)
        links += [(fplx_child, parent) for parent in parents_appearing]
    return links
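# Usage sketch (illustrative, not from the original source): assumes the
# bio_ontology singleton from indra.ontology.bio. The HGNC IDs below are
# MAPK1 ('6871') and MAPK3 ('6877'), whose FamPlex parents include ERK.
from indra.ontology.bio import bio_ontology

genes = {'6871', '6877'}
families = {'ERK', 'MAPK'}
links = get_famplex_links_from_lists(genes, families)
# Expected to contain gene->family links such as ('6871', 'ERK') as well
# as family->family links such as ('ERK', 'MAPK'), since ERK isa MAPK
# in FamPlex.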
def add_ido_parents(bio_ontology: BioOntology):
    """Add an artificial IDO root and link orphan IDO terms to it."""
    ido_root = bio_ontology.label('IDO', '0')
    bio_ontology.add_node(ido_root, name='infectious disease concept')
    edges_to_add = []
    for node in bio_ontology.nodes():
        if bio_ontology.get_ns(node) == 'IDO' and not \
                bio_ontology.get_parents(*bio_ontology.get_ns_id(node)):
            edges_to_add.append((node, ido_root, {'type': 'isa'}))
    bio_ontology.add_edges_from(edges_to_add)
def add_efo_parents(bio_ontology):
    """Link orphan EFO terms to the EFO root term with isa edges."""
    edges_to_add = []
    efo_root = 'EFO:0000001'
    for node in bio_ontology.nodes():
        if bio_ontology.get_ns(node) == 'EFO' and \
                not bio_ontology.get_parents(*bio_ontology.get_ns_id(node)):
            edges_to_add.append((node, efo_root, {'type': 'isa'}))
    print('Adding %d EFO isa edges.' % len(edges_to_add))
    bio_ontology.add_edges_from(edges_to_add)
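# Usage sketch (illustrative, not from the original source): these patch
# functions are meant to run against a freshly built ontology instance;
# the BioOntology class is assumed to come from indra.ontology.bio.ontology.
from indra.ontology.bio.ontology import BioOntology

ontology = BioOntology()
ontology.initialize()
add_ido_parents(ontology)
add_efo_parents(ontology)
# Formerly orphaned EFO/IDO terms should now report the artificial roots,
# e.g. ('EFO', '0000001'), among their parents.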
def get_famplex_terms(genes):
    """Get a list of associated FamPlex IDs from a list of gene IDs."""
    all_parents = set()
    for hgnc_id in genes:
        parent_ids = {p[1] for p in bio_ontology.get_parents('HGNC', hgnc_id)}
        all_parents |= parent_ids
    fplx_terms = sorted(all_parents)
    logger.info('Found %d relevant FamPlex terms.' % len(fplx_terms))
    return fplx_terms
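# Usage sketch (illustrative): with HGNC IDs for PRKAA1 ('9376') and
# PRKAA2 ('9377'), the result should include FamPlex terms such as 'AMPK'.
fplx_terms = get_famplex_terms(['9376', '9377'])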
def get_family(agents):
    """Get a FamPlex family if all of its members are given."""
    family_sets = []
    ag_groundings = []
    for ag in agents:
        gr = ag.get_grounding()
        ag_groundings.append(gr)
        parents = bio_ontology.get_parents(*gr)
        families = {p for p in parents if p[0] == 'FPLX'}
        family_sets.append(families)
    common_families = family_sets[0].intersection(*family_sets)
    if not common_families:
        return
    for fam in common_families:
        children = bio_ontology.get_children(*fam)
        # Check if all family members are present
        if set(children) == set(ag_groundings):
            return fam[1]
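# Usage sketch (illustrative): if MAPK1 and MAPK3 are the only members of
# the FamPlex ERK family in the ontology, passing both agents should return
# 'ERK'. The HGNC IDs here are assumptions stated for the example, not
# taken from the source.
from indra.statements import Agent

mapk1 = Agent('MAPK1', db_refs={'HGNC': '6871'})
mapk3 = Agent('MAPK3', db_refs={'HGNC': '6877'})
family = get_family([mapk1, mapk3])  # -> 'ERK' if all members are covered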
def unify_lspci(stmts):
    """Unify agents that share an LSPCI grounding, then preassemble."""
    from indra.statements.agent import default_ns_order
    from indra.ontology.bio import bio_ontology
    logger.info('Unifying by LSPCI with %d statements' % len(stmts))
    # Temporarily prioritize LSPCI in the global namespace order
    orig_ns_order = indra.statements.agent.default_ns_order[:]
    indra.statements.agent.default_ns_order = \
        ['LSPCI'] + indra.statements.agent.default_ns_order
    agents_by_lspci = defaultdict(list)
    ns_order = default_ns_order + ['CHEMBL', 'DRUGBANK', 'HMS-LINCS', 'CAS']
    for stmt in stmts:
        for agent in stmt.real_agent_list():
            if 'LSPCI' in agent.db_refs:
                agents_by_lspci[agent.db_refs['LSPCI']].append(agent)
            else:
                agent_gr = agent.get_grounding(ns_order=ns_order)
                if agent_gr[0] is None:
                    continue
                parents = bio_ontology.get_parents(*agent_gr)
                lspci_parents = [p[1] for p in parents if p[0] == 'LSPCI']
                # Only unify if there is exactly one LSPCI parent
                if len(lspci_parents) != 1:
                    continue
                lspci_parent = lspci_parents[0]
                agents_by_lspci[lspci_parent].append(agent)

    # Standardize db_refs and names across all agents sharing an LSPCI ID
    for lspci, agents in agents_by_lspci.items():
        lspci_name = bio_ontology.get_name('LSPCI', lspci)
        standard_name = lspci_name if lspci_name else agents[0].name
        for agent in agents:
            agent.db_refs['LSPCI'] = lspci
            agent.name = standard_name

    unique_stmts = ac.run_preassembly(stmts, run_refinement=False)
    indra.statements.agent.default_ns_order = orig_ns_order
    logger.info('Finished unification with %d statements' % len(unique_stmts))
    return unique_stmts
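# Usage sketch (illustrative): unify_lspci operates on a flat list of INDRA
# Statements; 'drug_stmts.pkl' is a hypothetical input file.
import pickle

with open('drug_stmts.pkl', 'rb') as fh:
    drug_stmts = pickle.load(fh)
unified = unify_lspci(drug_stmts)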
def sif_dump_df_to_digraph(df: Union[pd.DataFrame, str], date: str,
                           mesh_id_dict: Optional[Dict] = None,
                           graph_type: GraphTypes = 'digraph',
                           include_entity_hierarchies: bool = True,
                           sign_dict: Optional[Dict[str, int]] = None,
                           stmt_types: Optional[List[str]] = None,
                           z_sc_path: Optional[Union[str, pd.DataFrame]] = None,
                           verbosity: int = 0) \
        -> Union[DiGraph, MultiDiGraph, Tuple[MultiDiGraph, DiGraph]]:
    """Return a NetworkX digraph from a pandas dataframe of a db dump

    Parameters
    ----------
    df : Union[str, pd.DataFrame]
        A dataframe, either as a file path to a file (.pkl or .csv) or a
        pandas DataFrame object.
    date : str
        A date string specifying when the data was dumped from the database.
    mesh_id_dict : dict
        A dict object mapping statement hashes to all mesh ids sharing a
        common PMID
    graph_type : str
        Return type for the returned graph. Currently supports:
            - 'digraph': DiGraph (Default)
            - 'multidigraph': MultiDiGraph
            - 'signed': Tuple[MultiDiGraph, DiGraph]
            - 'signed-expanded': Tuple[MultiDiGraph, DiGraph]
            - 'digraph-signed-types': DiGraph
    include_entity_hierarchies : bool
        If True, add edges between nodes if they are related ontologically
        with stmt type 'fplx': e.g. BRCA1 is in the BRCA family, so an edge
        is added between the nodes BRCA and BRCA1. Default: True. Note that
        this option is only available for the directed/unsigned graph and
        multidigraph options.
    sign_dict : Dict[str, int]
        A dictionary mapping a Statement type to a sign to be used for the
        edge. By default only Activation and IncreaseAmount are added as
        positive edges and Inhibition and DecreaseAmount as negative
        edges, but a user can pass any other Statement types in a
        dictionary.
    stmt_types : List[str]
        A list of statement types to expand out to other signs
    z_sc_path : Optional[Union[str, pd.DataFrame]]
        If provided, must be, or be a path to, a square dataframe with HGNC
        symbols as names on the axes and floats as entries
    verbosity : int
        Output various messages if > 0. For all messages, set to 4.

    Returns
    -------
    Union[DiGraph, MultiDiGraph, Tuple[MultiDiGraph, DiGraph]]
        The type is determined by the graph_type argument
    """
    graph_options = ('digraph', 'multidigraph', 'signed', 'signed-expanded',
                     'digraph-signed-types')
    if graph_type.lower() not in graph_options:
        raise ValueError(f'Graph type {graph_type} not supported. Can only '
                         f'choose between {graph_options}')
    sign_dict = sign_dict if sign_dict else default_sign_dict

    graph_type = graph_type.lower()
    date = date if date else datetime.now().strftime('%Y-%m-%d')

    if isinstance(df, str):
        sif_df = file_opener(df)
    else:
        sif_df = df

    if z_sc_path is not None:
        if isinstance(z_sc_path, str):
            if z_sc_path.endswith('h5'):
                logger.info(f'Loading z-scores from {z_sc_path}')
                z_sc_df = pd.read_hdf(z_sc_path)
            elif z_sc_path.endswith('pkl'):
                logger.info(f'Loading z-scores from {z_sc_path}')
                z_sc_df: pd.DataFrame = file_opener(z_sc_path)
            else:
                raise ValueError(f'Unrecognized file: {z_sc_path}')
        elif isinstance(z_sc_path, pd.DataFrame):
            z_sc_df = z_sc_path
        else:
            raise ValueError('Only file paths and data frames allowed as '
                             'arguments to z_sc_path')
    else:
        z_sc_df = None

    # If signed types: filter out rows of unsigned types
    if graph_type == 'digraph-signed-types':
        sif_df = sif_df[sif_df.stmt_type.isin(sign_dict.keys())]

    sif_df = sif_dump_df_merger(sif_df, graph_type, sign_dict, stmt_types,
                                mesh_id_dict, verbosity=verbosity)

    # Map ns:id to node name
    logger.info('Creating dictionary mapping (ns, id) to node name')
    ns_id_name_tups = set(zip(
        sif_df.agA_ns, sif_df.agA_id, sif_df.agA_name)).union(
        set(zip(sif_df.agB_ns, sif_df.agB_id, sif_df.agB_name)))
    ns_id_to_nodename = {(ns, _id): name for ns, _id, name in ns_id_name_tups}

    # Map hashes to edge for non-signed graphs
    if graph_type in {'multidigraph', 'digraph', 'digraph-signed-types'}:
        logger.info('Creating dictionary mapping hashes to edges for '
                    'unsigned graph')
        hash_edge_dict = {
            h: (a, b) for a, b, h in
            zip(sif_df.agA_name, sif_df.agB_name, sif_df.stmt_hash)
        }

    # Create graph from df
    if graph_type == 'multidigraph':
        indranet_graph = IndraNet.from_df(sif_df)
    elif graph_type in ('digraph', 'digraph-signed-types'):
        # Flatten
        indranet_graph = IndraNet.digraph_from_df(sif_df,
                                                  'complementary_belief',
                                                  _weight_mapping)
    elif graph_type in ('signed', 'signed-expanded'):
        signed_edge_graph: MultiDiGraph = IndraNet.signed_from_df(
            df=sif_df,
            flattening_method='complementary_belief',
            weight_mapping=_weight_mapping)
        signed_node_graph: DiGraph = signed_edges_to_signed_nodes(
            graph=signed_edge_graph, copy_edge_data=True)
        signed_edge_graph.graph['date'] = date
        signed_node_graph.graph['date'] = date
        signed_edge_graph.graph['node_by_ns_id'] = ns_id_to_nodename
        signed_node_graph.graph['node_by_ns_id'] = ns_id_to_nodename

        # Get hash to signed edge mapping
        logger.info('Creating dictionary mapping hashes to edges for '
                    'signed graphs')
        seg_hash_edge_dict = {} if graph_type == 'signed' else defaultdict(set)
        for edge in signed_edge_graph.edges:
            for es in signed_edge_graph.edges[edge]['statements']:
                if graph_type == 'signed':
                    seg_hash_edge_dict[es['stmt_hash']] = edge
                else:
                    seg_hash_edge_dict[es['stmt_hash']].add(edge)
        signed_edge_graph.graph['edge_by_hash'] = seg_hash_edge_dict

        sng_hash_edge_dict = {} if graph_type == 'signed' else defaultdict(set)
        for edge in signed_node_graph.edges:
            for es in signed_node_graph.edges[edge]['statements']:
                if graph_type == 'signed':
                    sng_hash_edge_dict[es['stmt_hash']] = edge
                else:
                    sng_hash_edge_dict[es['stmt_hash']].add(edge)
        signed_node_graph.graph['edge_by_hash'] = sng_hash_edge_dict

        if z_sc_df is not None:
            # Set z-score attributes
            add_corr_to_edges(graph=signed_edge_graph, z_corr=z_sc_df)
            add_corr_to_edges(graph=signed_node_graph, z_corr=z_sc_df)

        return signed_edge_graph, signed_node_graph
    else:
        raise ValueError(f'Unrecognized graph type {graph_type}. Must be one '
                         f'of: {", ".join(graph_options)}')

    if z_sc_df is not None:
        # Set z-score attributes
        add_corr_to_edges(graph=indranet_graph, z_corr=z_sc_df)

    # Add hierarchy relations to graph (not applicable for signed graphs)
    if include_entity_hierarchies and graph_type in ('multidigraph',
                                                     'digraph'):
        from depmap_analysis.network_functions.famplex_functions import \
            get_all_entities
        logger.info('Fetching entity hierarchy relationships')
        full_entity_list = get_all_entities()
        logger.info('Adding entity hierarchy manager as graph attribute')
        node_by_uri = {uri: _id for (ns, _id, uri) in full_entity_list}
        added_pairs = set()  # Save (A, B, URI)
        logger.info('Building entity relations to be added to data frame')
        entities = 0
        non_corr_weight = None
        if z_sc_df is not None:
            # Get non-corr weight
            for edge in indranet_graph.edges:
                if indranet_graph.edges[edge]['z_score'] == 0:
                    non_corr_weight = indranet_graph.edges[edge]['corr_weight']
                    break
            assert non_corr_weight is not None
            z_sc_attrs = {'z_score': 0, 'corr_weight': non_corr_weight}
        else:
            z_sc_attrs = {}

        for ns, _id, uri in full_entity_list:
            node = _id
            # Get name in case it's different than id
            if ns_id_to_nodename.get((ns, _id), None):
                node = ns_id_to_nodename[(ns, _id)]
            else:
                ns_id_to_nodename[(ns, _id)] = node

            # Add famplex edge
            for pns, pid in bio_ontology.get_parents(ns, _id):
                puri = get_identifiers_url(pns, pid)
                pnode = pid
                if ns_id_to_nodename.get((pns, pid), None):
                    pnode = ns_id_to_nodename[(pns, pid)]
                else:
                    ns_id_to_nodename[(pns, pid)] = pnode
                # Check if edge already exists
                if (node, pnode, puri) not in added_pairs:
                    entities += 1
                    # Belief and evidence are conditional
                    added_pairs.add((node, pnode, puri))  # A, B, uri of B
                    ed = {'agA_name': node, 'agA_ns': ns, 'agA_id': _id,
                          'agB_name': pnode, 'agB_ns': pns, 'agB_id': pid,
                          'stmt_type': 'fplx', 'evidence_count': 1,
                          'source_counts': {'fplx': 1}, 'stmt_hash': puri,
                          'belief': 1.0, 'weight': MIN_WEIGHT,
                          'curated': True,
                          'english': f'{pns}:{pid} is an ontological parent '
                                     f'of {ns}:{_id}',
                          'z_score': 0, 'corr_weight': 1}

                    # Add non-existing nodes
                    if ed['agA_name'] not in indranet_graph.nodes:
                        indranet_graph.add_node(ed['agA_name'],
                                                ns=ed['agA_ns'],
                                                id=ed['agA_id'])
                    if ed['agB_name'] not in indranet_graph.nodes:
                        indranet_graph.add_node(ed['agB_name'],
                                                ns=ed['agB_ns'],
                                                id=ed['agB_id'])
                    # Add edges
                    ed.pop('agA_id')
                    ed.pop('agA_ns')
                    ed.pop('agB_id')
                    ed.pop('agB_ns')
                    if indranet_graph.is_multigraph():
                        # MultiDiGraph
                        indranet_graph.add_edge(ed['agA_name'],
                                                ed['agB_name'],
                                                **ed)
                    else:
                        # DiGraph
                        u = ed.pop('agA_name')
                        v = ed.pop('agB_name')

                        # Check edge
                        if indranet_graph.has_edge(u, v):
                            indranet_graph.edges[(u, v)]['statements'].append(ed)
                        else:
                            indranet_graph.add_edge(u, v, belief=1.0,
                                                    weight=1.0,
                                                    statements=[ed],
                                                    **z_sc_attrs)
        logger.info('Loaded %d entity relations into dataframe' % entities)
        indranet_graph.graph['node_by_uri'] = node_by_uri
    indranet_graph.graph['node_by_ns_id'] = ns_id_to_nodename
    indranet_graph.graph['edge_by_hash'] = hash_edge_dict
    indranet_graph.graph['date'] = date
    return indranet_graph
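# Usage sketch (illustrative): 'sif_dump.pkl' is a hypothetical path to a
# SIF dump of the INDRA DB; the return type follows the graph_type argument.
unsigned_graph = sif_dump_df_to_digraph('sif_dump.pkl', date='2021-01-01',
                                        graph_type='digraph')
signed_edge_g, signed_node_g = sif_dump_df_to_digraph(
    'sif_dump.pkl', date='2021-01-01', graph_type='signed')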
def test_mtorc_get_parents():
    p = bio_ontology.get_parents('HGNC', hgnc_client.get_hgnc_id('RICTOR'))
    assert len(p) == 1
    assert p == [('FPLX', 'mTORC2')]
def test_ido_parents():
    parents = bio_ontology.get_parents('IDO', '0000514')
    assert ('IDO', '0000509') in parents
def test_efo_bfo_relations():
    assert set(bio_ontology.get_parents('EFO', '0004542')) == \
        {('BFO', '0000015'), ('EFO', '0000001')}
def test_get_parents():
    prkaa1 = ('HGNC', '9376')
    ampk = ('FPLX', 'AMPK')

    p1 = bio_ontology.get_parents(*prkaa1)
    assert len(p1) == 8, p1
    assert ampk in p1
def has_fplx_parents(bio_ontology, node):
    """Return True if the given ontology node has FamPlex parents."""
    parents = bio_ontology.get_parents(*bio_ontology.get_ns_id(node))
    return any(p[0] == 'FPLX' for p in parents)
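# Usage sketch (illustrative): count ontology nodes with FamPlex parents;
# iterating over all nodes of the full ontology can take a while.
from indra.ontology.bio import bio_ontology

n_fplx_covered = sum(1 for node in bio_ontology.nodes()
                     if has_fplx_parents(bio_ontology, node))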
def test_eccode_isa():
    parents = set(bio_ontology.get_parents('ECCODE', '1.1.1.1'))
    assert parents == {('ECCODE', '1.1.1'), ('ECCODE', '1.1'),
                       ('ECCODE', '1')}, parents
    assert bio_ontology.isa('ECCODE', '1.1.1.1', 'ECCODE', '1.1.1')
def get_fplx_stmts(fplx_id):
    """Get filtered statements for a FamPlex entity from the INDRA DB."""
    ip = indra_db_rest.get_statements(agents=['%s@FPLX' % fplx_id],
                                      ev_limit=10000)
    stmts = filter_out_medscan(ip.statements)
    stmts = ac.filter_human_only(stmts)
    return stmts


if __name__ == '__main__':
    with open(kinase_pkl, 'rb') as fh:
        kinase_stmts = pickle.load(fh)

    # Map each kinase to its FamPlex parents and vice versa
    fplx_by_kinase = defaultdict(set)
    kinase_by_fplx = defaultdict(set)
    kinase_counts = {}
    for kinase, stmts in kinase_stmts.items():
        hgnc_id = hgnc_client.get_hgnc_id(kinase)
        parents = bio_ontology.get_parents('HGNC', hgnc_id)
        fplx_by_kinase[kinase] |= {fplx_id for _, fplx_id in parents}
        for _, fplx_id in parents:
            kinase_by_fplx[fplx_id].add(kinase)
        kinase_counts[kinase] = len(stmts)
    kinase_by_fplx = dict(kinase_by_fplx)
    fplx_by_kinase = dict(fplx_by_kinase)

    # Fetch statements for each FamPlex family that has a kinase member
    fplx_stmts = {}
    fplx_counts = {}
    for fplx_id in tqdm.tqdm(kinase_by_fplx):
        fplx_stmts[fplx_id] = get_fplx_stmts(fplx_id)
        fplx_counts[fplx_id] = len(fplx_stmts[fplx_id])

    with open('fplx_stmts.pkl', 'wb') as fh:
        pickle.dump(fplx_stmts, fh)
    with open('fplx_counts.json', 'w') as fh:
        json.dump(fplx_counts, fh)
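# Usage sketch (illustrative): the dumped files can be read back for
# downstream analysis of per-family statement counts.
import json
import pickle

with open('fplx_stmts.pkl', 'rb') as fh:
    fplx_stmts = pickle.load(fh)
with open('fplx_counts.json', 'r') as fh:
    fplx_counts = json.load(fh)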