def get_drug_statements(groundings):
    """Collect de-duplicated drug statements for a set of groundings.

    For each ``(db_ns, db_id)`` pair, statements with that entity as the
    subject are requested through ``indra_db_rest`` (with ``ev_limit=100``),
    narrowed to Inhibition and Complex statements, and stripped of any
    evidence whose ``source_api`` is ``'medscan'``. Statements that end up
    with no evidence are discarded. Survivors are keyed by
    ``stmt.get_hash()`` so that a statement seen for several groundings is
    kept only once (the last occurrence wins).

    Parameters
    ----------
    groundings : iterable of (db_ns, db_id) pairs
        The groundings to query for.

    Returns
    -------
    The output of ``fix_invalid(filter_db_support(...))`` applied to the
    de-duplicated statements.
    """
    by_hash = {}
    for db_ns, db_id in groundings:
        print('Searching for %s@%s' % (db_id, db_ns))
        processor = indra_db_rest.get_statements(
            subject='%s@%s' % (db_id, db_ns), ev_limit=100)
        fetched = processor.statements
        candidates = (ac.filter_by_type(fetched, Inhibition) +
                      ac.filter_by_type(fetched, Complex))
        kept = []
        for stmt in candidates:
            # Drop medscan evidence; the statement's evidence list is
            # replaced in place on the statement object
            evidence = [ev for ev in stmt.evidence
                        if ev.source_api != 'medscan']
            if evidence:
                stmt.evidence = evidence
                kept.append(stmt)
        for stmt in kept:
            by_hash[stmt.get_hash()] = stmt
    unique_stmts = list(by_hash.values())
    supported = filter_db_support(unique_stmts)
    return fix_invalid(supported)
def get_omnipath_stmts():
    """Return OmniPath (de)phosphorylation statements.

    All modifications are obtained from ``omnipath_client``, restricted to
    Phosphorylation and Dephosphorylation statements (in that order), then
    passed through ``ac.map_sequence`` and ``ac.filter_human_only``.
    """
    modifications = omnipath_client.get_all_modifications()
    phos = ac.filter_by_type(modifications, Phosphorylation)
    dephos = ac.filter_by_type(modifications, Dephosphorylation)
    mapped = ac.map_sequence(phos + dephos)
    # A further ac.filter_genes_only(..., specific_only=True) step is
    # intentionally not applied here
    return ac.filter_human_only(mapped)
def normalize_active_forms(stmts):
    """Preassemble the relevant ActiveForm statements among `stmts`.

    An ActiveForm is considered relevant when its agent carries at least
    one modification or mutation; all other ActiveForms are dropped. The
    relevant ones are run through ``ac.run_preassembly`` and returned
    ahead of the untouched non-ActiveForm statements.
    """
    active_forms = ac.filter_by_type(stmts, ActiveForm)
    relevant = [af for af in active_forms
                if af.agent.mods or af.agent.mutations]
    print('%d relevant ActiveForms' % len(relevant))
    others = ac.filter_by_type(stmts, ActiveForm, invert=True)
    assembled = ac.run_preassembly(relevant)
    return assembled + others
def get_curation_texts():
    """Return activity/amount evidence texts based on curations.

    Returns
    -------
    A pair ``(act_txts, amt_txts)`` holding the evidence texts collected
    for activity regulation and amount regulation respectively.
    """
    # FIXME: get_statements_from_hashes will get the right statements but we
    # collect _all_ evidences of these statements, not just the ones that
    # were specifically curated. It might make more sense to filter down
    # the set of evidence to the specific evidence hashes to which each
    # curation corresponds.
    flipped_curations = get_curations(tag='act_vs_amt')
    flipped_stmts = get_statements_from_hashes(
        [cur.pa_hash for cur in flipped_curations])
    # For 'act_vs_amt' curations the curation implies the opposite type, so
    # activity statements feed the amount texts and vice versa
    amt_txts = get_ev_texts(
        ac.filter_by_type(flipped_stmts, RegulateActivity))
    act_txts = get_ev_texts(
        ac.filter_by_type(flipped_stmts, RegulateAmount))

    # 'correct' curations contribute texts under their own statement type
    correct_curations = get_curations(tag='correct')
    correct_stmts = get_statements_from_hashes(
        [cur.pa_hash for cur in correct_curations])
    amt_txts += get_ev_texts(
        ac.filter_by_type(correct_stmts, RegulateAmount))
    act_txts += get_ev_texts(
        ac.filter_by_type(correct_stmts, RegulateActivity))
    return act_txts, amt_txts
def preprocess_stmts(stmts, data_genes):
    """Filter and simplify INDRA Statements to be put into the model.

    Parameters
    ----------
    stmts : list
        The statements to preprocess.
    data_genes : list
        Gene list handed to ``ac.filter_gene_list`` with the 'all' policy.

    Returns
    -------
    The statements held by the final MechLinker once repeated removal of
    inconsequential modifications/activities no longer shrinks the list.
    """
    # Filtering stage; the order of the filters is significant
    filtered = ac.filter_mutation_status(
        stmts, {'BRAF': [('V', '600', 'E')]}, ['PTEN'])
    filtered = ac.filter_by_type(filtered, Complex, invert=True)
    filtered = ac.filter_direct(filtered)
    filtered = ac.filter_belief(filtered, 0.95)
    filtered = ac.filter_top_level(filtered)
    filtered = ac.filter_gene_list(filtered, data_genes, 'all')
    filtered = ac.filter_enzyme_kinase(filtered)
    filtered = ac.filter_mod_nokinase(filtered)
    filtered = ac.filter_transcription_factor(filtered)

    # Simplify activity types
    reducer = MechLinker(filtered)
    reducer.gather_explicit_activities()
    reducer.reduce_activities()
    reducer.gather_modifications()
    reducer.reduce_modifications()
    active_forms = ac.filter_by_type(reducer.statements, ActiveForm)
    others = ac.filter_by_type(reducer.statements, ActiveForm, invert=True)
    active_forms = ac.run_preassembly(active_forms)

    # Replace activations when possible
    linker = MechLinker(active_forms + others)
    linker.gather_explicit_activities()
    linker.replace_activations()
    # Require active forms
    linker.require_active_forms()

    # Remove inconsequential PTMs until a pass no longer reduces the count
    previous_count = len(linker.statements)
    while True:
        linker.statements = ac.filter_inconsequential_mods(
            linker.statements, get_mod_whitelist())
        linker.statements = ac.filter_inconsequential_acts(
            linker.statements, get_mod_whitelist())
        current_count = len(linker.statements)
        if current_count >= previous_count:
            break
        previous_count = current_count
    return linker.statements
def filter_by_type():
    """Filter to a given INDRA Statement type.

    The JSON request body is read for two keys: ``statements`` (statements
    in JSON form) and ``type`` (the name of the statement type to keep).
    An ``OPTIONS`` request is answered with an empty dict.

    The type name is resolved against the attributes of this module. For
    backward compatibility the ``str.capitalize()`` form is tried first
    (so ``'phosphorylation'`` and ``'PHOSPHORYLATION'`` keep working),
    followed by the name exactly as given and the name with only its first
    letter upper-cased. The latter two make CamelCase types such as
    ``'IncreaseAmount'`` resolvable, which ``capitalize()`` alone would
    turn into ``'Increaseamount'``. Only attributes that are classes are
    accepted, so the request cannot select arbitrary module attributes.

    Raises
    ------
    AttributeError
        If ``type`` is missing/empty or does not name a class in this
        module.
    """
    if request.method == 'OPTIONS':
        return {}
    response = request.body.read().decode('utf-8')
    body = json.loads(response)
    stmts_json = body.get('statements')
    stmt_type_str = body.get('type')
    if not stmt_type_str:
        raise AttributeError(
            "Request body has no statement 'type' to filter by")
    module = sys.modules[__name__]
    candidates = (
        stmt_type_str.capitalize(),
        stmt_type_str,
        stmt_type_str[:1].upper() + stmt_type_str[1:],
    )
    stmt_type = None
    for candidate in candidates:
        resolved = getattr(module, candidate, None)
        if isinstance(resolved, type):
            stmt_type = resolved
            break
    if stmt_type is None:
        raise AttributeError('Unknown statement type: %s' % stmt_type_str)
    stmts = stmts_from_json(stmts_json)
    stmts_out = ac.filter_by_type(stmts, stmt_type)
    return _return_stmts(stmts_out)
def filter_by_type():
    """Filter to a given INDRA Statement type.

    The JSON request body is read for two keys: ``statements`` (statements
    in JSON form) and ``type`` (the name of the statement type to keep).

    The type name is resolved against the attributes of this module. For
    backward compatibility the ``str.capitalize()`` form is tried first
    (so ``'phosphorylation'`` and ``'PHOSPHORYLATION'`` keep working),
    followed by the name exactly as given and the name with only its first
    letter upper-cased. The latter two make CamelCase types such as
    ``'IncreaseAmount'`` resolvable, which ``capitalize()`` alone would
    turn into ``'Increaseamount'``. Only attributes that are classes are
    accepted, so the request cannot select arbitrary module attributes.

    Returns
    -------
    dict
        ``{'statements': [...]}`` with the JSON of the matching statements,
        or an empty list when nothing matched.

    Raises
    ------
    AttributeError
        If ``type`` is missing/empty or does not name a class in this
        module.
    """
    response = request.body.read().decode('utf-8')
    body = json.loads(response)
    stmts_json = body.get('statements')
    stmt_type_str = body.get('type')
    if not stmt_type_str:
        raise AttributeError(
            "Request body has no statement 'type' to filter by")
    module = sys.modules[__name__]
    candidates = (
        stmt_type_str.capitalize(),
        stmt_type_str,
        stmt_type_str[:1].upper() + stmt_type_str[1:],
    )
    stmt_type = None
    for candidate in candidates:
        resolved = getattr(module, candidate, None)
        if isinstance(resolved, type):
            stmt_type = resolved
            break
    if stmt_type is None:
        raise AttributeError('Unknown statement type: %s' % stmt_type_str)
    stmts = stmts_from_json(stmts_json)
    stmts_out = ac.filter_by_type(stmts, stmt_type)
    # An empty result is reported as an explicit empty list without going
    # through stmts_to_json
    if stmts_out:
        return {'statements': stmts_to_json(stmts_out)}
    return {'statements': []}
def assemble_sif(stmts, data, out_file):
    """Return an assembled SIF.

    Parameters
    ----------
    stmts : list
        INDRA Statements to assemble from.
    data
        Data object handed to ``rewrite_ab_stmts`` and
        ``process_data.get_midas_data``.
    out_file : str
        Path the SIF string is written to (UTF-8 encoded).

    Returns
    -------
    str
        The SIF model string produced by the SifAssembler.
    """
    def make_drug_statements():
        # One Inhibition per (drug, target) pair; the drug agent is shared
        # by all of that drug's statements
        inhibitions = []
        for drug_name, target_names in \
                process_data.get_drug_targets().items():
            drug_agent = Agent(drug_name + ':Drugs')
            for target_name in target_names:
                inhibitions.append(
                    Inhibition(drug_agent, Agent(target_name)))
        return inhibitions

    def rename_and_nodes(statements):
        # Rename agents in place so no node name contains 'AND'
        for statement in statements:
            for agent in statement.agent_list():
                if agent is not None and 'AND' in agent.name:
                    agent.name = agent.name.replace('AND', 'A_ND')

    def filter_ab_edges(statements, policy='all'):
        # 'all': every non-None agent must be an antibody ('_p') or a drug
        # 'one': at least one non-None agent must be an antibody
        selected = []
        for statement in statements:
            present = [agent for agent in statement.agent_list()
                       if agent is not None]
            if policy == 'all':
                keep = all('_p' in agent.name or 'Drugs' in agent.name
                           for agent in present)
            elif policy == 'one':
                keep = any('_p' in agent.name for agent in present)
            else:
                keep = False
            if keep:
                selected.append(statement)
        return selected

    def get_ab_names(statements):
        # Sorted, de-duplicated names of all antibody ('_p') agents
        names = {agent.name
                 for statement in statements
                 for agent in statement.agent_list()
                 if agent is not None and '_p' in agent.name}
        return sorted(names)

    # Filter for high-belief statements
    stmts = ac.filter_top_level(ac.filter_belief(stmts, 0.99))
    # Filter for Activation / Inhibition
    activations = ac.filter_by_type(stmts, Activation)
    inhibitions = ac.filter_by_type(stmts, Inhibition)
    stmts = activations + inhibitions
    # Get Ras227 and filter statements
    ras_genes = [gene for gene in process_data.get_ras227_genes()
                 if gene != 'YAP1']
    stmts = ac.filter_gene_list(stmts, ras_genes, 'all')
    # Add the drugs inhibiting their targets as INDRA statements
    stmts = stmts + make_drug_statements()
    # Because of a bug in CNO, node names containing AND
    # need to be replaced
    rename_and_nodes(stmts)
    # Rewrite statements to replace genes with their corresponding
    # antibodies when possible
    stmts = rewrite_ab_stmts(stmts, data)
    stmts = filter_ab_edges(stmts, 'all')
    # Get a list of the AB names that end up being covered in the prior
    # network. This is important because other ABs will need to be taken
    # out of the MIDAS file to work.
    pkn_abs = get_ab_names(stmts)
    print('Boolean PKN contains these antibodies: %s' % ', '.join(pkn_abs))
    # Make the SIF model
    assembler = SifAssembler(stmts)
    assembler.make_model(use_name_as_key=True)
    sif_str = assembler.print_model()
    with open(out_file, 'wb') as fh:
        fh.write(sif_str.encode('utf-8'))
    # Make the MIDAS data file used for training the model
    midas_data = process_data.get_midas_data(data, pkn_abs)
    return sif_str
def test_filter_by_type():
    """Filtering [st1, st14] for Phosphorylation yields one statement."""
    phos_only = ac.filter_by_type([st1, st14], Phosphorylation)
    assert len(phos_only) == 1
# Collect one (namespace, id) grounding per drug agent; the set removes
# duplicates across agents
groundings = set()
for agent in drug_agents:
    db_ns, db_id = agent.get_grounding()
    if db_ns is None:
        # No grounding available: fall back to the agent's raw text
        print('No grounding for %s (%s)' % (agent, str(agent.db_refs)))
        db_ns, db_id = ('TEXT', agent.db_refs['TEXT'])
    groundings.add((db_ns, db_id))
# Query statements with each grounding as subject and accumulate them.
# No de-duplication happens here, so a statement returned for several
# groundings is appended once per grounding.
all_stmts = []
for db_ns, db_id in groundings:
    print('Searching for %s@%s' % (db_id, db_ns))
    idp = indra_db_rest.get_statements(subject='%s@%s' % (db_id, db_ns),
                                       ev_limit=100)
    stmts = idp.statements
    # Keep only Inhibition and Complex statements
    stmts = ac.filter_by_type(stmts, Inhibition) + \
        ac.filter_by_type(stmts, Complex)
    new_stmts = []
    for stmt in stmts:
        # Discard medscan evidence; skip statements left without evidence
        new_ev = []
        for ev in stmt.evidence:
            if ev.source_api != 'medscan':
                new_ev.append(ev)
        if not new_ev:
            continue
        stmt.evidence = new_ev
        new_stmts.append(stmt)
    all_stmts += new_stmts
# Persist the collected statements as a pickle file
with open('../stmts/drug_stmts.pkl', 'wb') as fh:
    pickle.dump(all_stmts, fh)
def make_model_by_preassembly(self, exclude_stmts=None, complex_members=3,
                              graph_type='multi_graph', sign_dict=None,
                              belief_scorer=None, weight_flattening=None,
                              extra_columns=None):
    """Assemble an IndraNet graph object by preassembling the statements
    according to selected graph type.

    Parameters
    ----------
    exclude_stmts : list[str]
        A list of statement type names to not include in the graph.
    complex_members : int
        Maximum allowed size of a complex to be included in the graph.
        All complexes larger than complex_members will be rejected. For
        accepted complexes, all permutations of their members will be
        added as edges. Default is `3`.
    graph_type : str
        Specify the type of graph to assemble. Choose from 'multi_graph'
        (default), 'digraph', 'signed'. Default is `multi_graph`.
    sign_dict : dict
        A dictionary mapping a Statement type to a sign to be used for
        the edge. This parameter is only used with the 'signed' option.
        See IndraNet.to_signed_graph for more info.
    belief_scorer : Optional[indra.belief.BeliefScorer]
        Instance of BeliefScorer class to use in calculating edge
        probabilities. If None is provided (default), then the default
        scorer is used.
    weight_flattening : function(networkx.DiGraph)
        A function taking at least the graph G as an argument and
        returning G after adding edge weights as an edge attribute to the
        flattened edges using the reserved keyword 'weight'.

        Example:

        >>> def weight_flattening(G):
        ...     # Sets the flattened weight to the average of the
        ...     # inverse source count
        ...     for edge in G.edges:
        ...         w = [1/s['evidence_count']
        ...             for s in G.edges[edge]['statements']]
        ...         G.edges[edge]['weight'] = sum(w)/len(w)
        ...     return G
    extra_columns
        Passed through unchanged to the ``_store_edge_data`` and
        ``_get_edge_data`` helpers, which define its expected format.

    Returns
    -------
    G : networkx graph
        The graph built here: ``nx.MultiDiGraph`` for 'signed',
        ``nx.DiGraph`` for 'digraph' and ``nx.MultiGraph`` for
        'multi_graph' (or whatever `weight_flattening` returns when it
        is given).

    Raises
    ------
    ValueError
        If `graph_type` is not one of the supported options.
    """
    # Fail early with a clear message; previously an unsupported value
    # skipped both assembly branches and crashed later on an unbound name
    if graph_type not in ('multi_graph', 'digraph', 'signed'):
        raise ValueError(
            "Unsupported graph_type %r, choose from 'multi_graph', "
            "'digraph' or 'signed'" % (graph_type,))

    def _expand_conversions(conv_stmts):
        # A Conversion with a subject becomes a DecreaseAmount on every
        # consumed object and an IncreaseAmount on every produced object
        expanded = []
        for conv in conv_stmts:
            if conv.subj:
                for obj in conv.obj_from:
                    expanded.append(
                        DecreaseAmount(conv.subj, obj, conv.evidence))
                for obj in conv.obj_to:
                    expanded.append(
                        IncreaseAmount(conv.subj, obj, conv.evidence))
        return expanded

    # Filter out statements with one agent or with None subject
    stmts = [stmt for stmt in self.statements
             if len(stmt.real_agent_list()) > 1]
    if exclude_stmts:
        exclude_types = tuple(
            get_statement_by_name(st_type) for st_type in exclude_stmts)
        stmts = [stmt for stmt in stmts
                 if not isinstance(stmt, exclude_types)]
    # Store edge data in statement annotations
    stmts = _store_edge_data(stmts, extra_columns)
    if graph_type == 'signed':
        if not sign_dict:
            sign_dict = default_sign_dict
        graph_stmts = []
        # Only keep statements with explicit signs
        for stmt_type in sign_dict:
            graph_stmts += ac.filter_by_type(stmts, stmt_type)
        graph_stmts += ac.filter_by_type(stmts, Influence)
        # Conversion statements can also be turned into two types of signed
        graph_stmts += _expand_conversions(
            ac.filter_by_type(stmts, Conversion))
        # Merge statements by agent name and polarity
        graph_stmts = ac.run_preassembly(
            graph_stmts, return_toplevel=False,
            belief_scorer=belief_scorer,
            matches_fun=partial(agent_name_polarity_matches,
                                sign_dict=sign_dict),
            run_refinement=False)
        G = nx.MultiDiGraph()
    else:
        # 'digraph' or 'multi_graph': keep Complex and Conversion aside
        complex_stmts = ac.filter_by_type(stmts, Complex)
        conv_stmts = ac.filter_by_type(stmts, Conversion)
        graph_stmts = [
            stmt for stmt in stmts
            if stmt not in complex_stmts and stmt not in conv_stmts]
        for stmt in complex_stmts:
            agents = stmt.real_agent_list()
            if len(agents) > complex_members:
                continue
            # Every ordered pair of complex members becomes an edge
            for a, b in permutations(agents, 2):
                graph_stmts.append(IncreaseAmount(a, b, stmt.evidence))
        graph_stmts += _expand_conversions(conv_stmts)
        if graph_type == 'digraph':
            # Merge statements by agent names
            graph_stmts = ac.run_preassembly(
                graph_stmts, return_toplevel=False,
                belief_scorer=belief_scorer,
                matches_fun=agent_name_stmt_matches,
                run_refinement=False)
            G = nx.DiGraph()
        else:
            # NOTE(review): nx.MultiGraph is undirected, so the
            # subject -> object direction of each statement is not
            # represented for 'multi_graph' - confirm this is intended
            G = nx.MultiGraph()
    for stmt in graph_stmts:
        agents = stmt.agent_list()
        for ag in agents:
            ag_ns, ag_id = get_ag_ns_id(ag)
            G.add_node(ag.name, ns=ag_ns, id=ag_id)
        # We merged some different statements together based on their
        # agent names and polarity, we can retrieve the original
        # statements data back from annotations
        unique_stmts = {}
        for evid in stmt.evidence:
            edge_data = evid.annotations['indranet_edge']
            if edge_data['stmt_hash'] not in unique_stmts:
                unique_stmts[edge_data['stmt_hash']] = edge_data
        statement_data = list(unique_stmts.values())
        if graph_type == 'signed':
            if isinstance(stmt, Influence):
                stmt_pol = stmt.overall_polarity()
                if stmt_pol == 1:
                    sign = 0
                elif stmt_pol == -1:
                    sign = 1
                else:
                    # No usable polarity: no edge is added for this one
                    continue
            else:
                sign = sign_dict[type(stmt).__name__]
            # The sign doubles as the multigraph edge key
            G.add_edge(agents[0].name, agents[1].name, sign,
                       statements=statement_data,
                       belief=stmt.belief, sign=sign)
        elif graph_type == 'digraph':
            G.add_edge(agents[0].name, agents[1].name,
                       statements=statement_data,
                       belief=stmt.belief)
        else:
            if statement_data:
                edge_data = statement_data[0]
            else:
                edge_data = _get_edge_data(stmt, extra_columns)
            G.add_edge(agents[0].name, agents[1].name, **edge_data)
    if weight_flattening:
        G = weight_flattening(G)
    return G
def filter_neg(stmts):
    """Return the Inhibition statements followed by the DecreaseAmount ones."""
    inhibitions, decreases = (
        ac.filter_by_type(stmts, stmt_type)
        for stmt_type in (Inhibition, DecreaseAmount)
    )
    return inhibitions + decreases
def assemble_pysb(stmts, data_genes, contextualize=False):
    """Assemble and pickle a PySB model from the given statements.

    The statements are filtered and simplified, dumped with
    ``ac.dump_statements``, extended with the drug target statements and
    assembled into a generic model that is pickled to disk. When
    `contextualize` is True, a statement dump and a model are additionally
    written for every entry of the module-level ``cell_lines``.

    Parameters
    ----------
    stmts : list
        INDRA Statements to build the model from.
    data_genes : list
        Gene list used for filtering and for contextualization.
    contextualize : bool
        Whether to also produce the per-cell-line outputs. Default: False.

    Returns
    -------
    None
    """
    # Filter the INDRA Statements to be put into the model
    filtered = ac.filter_by_type(stmts, Complex, invert=True)
    filtered = ac.filter_direct(filtered)
    filtered = ac.filter_belief(filtered, 0.95)
    filtered = ac.filter_top_level(filtered)
    # Strip the extraneous supports/supported by here
    strip_supports(filtered)
    filtered = ac.filter_gene_list(filtered, data_genes, 'all')
    filtered = ac.filter_enzyme_kinase(filtered)
    filtered = ac.filter_mod_nokinase(filtered)
    filtered = ac.filter_transcription_factor(filtered)

    # Simplify activity types
    reducer = MechLinker(filtered)
    reducer.gather_explicit_activities()
    reducer.reduce_activities()
    reducer.gather_modifications()
    reducer.reduce_modifications()
    normalized = normalize_active_forms(reducer.statements)

    # Replace activations when possible
    linker = MechLinker(normalized)
    linker.gather_explicit_activities()
    linker.replace_activations()
    # Require active forms
    linker.require_active_forms()

    # Remove inconsequential PTMs until a pass no longer reduces the count
    previous_count = len(linker.statements)
    while True:
        linker.statements = ac.filter_inconsequential_mods(
            linker.statements, get_mod_whitelist())
        linker.statements = ac.filter_inconsequential_acts(
            linker.statements, get_mod_whitelist())
        current_count = len(linker.statements)
        if current_count >= previous_count:
            break
        previous_count = current_count
    model_stmts = linker.statements

    # Save the Statements here
    ac.dump_statements(model_stmts, prefixed_pkl('pysb_stmts'))
    # Add drug target Statements (extends the list in place, after the dump)
    model_stmts.extend(get_drug_target_statements())

    # Just generate the generic model
    generic_assembler = PysbAssembler()
    generic_assembler.add_statements(model_stmts)
    generic_model = generic_assembler.make_model()
    with open(prefixed_pkl('pysb_model'), 'wb') as f:
        pickle.dump(generic_model, f)

    # Run this extra part only if contextualize is set to True
    if not contextualize:
        return
    cell_lines_no_data = ['COLO858', 'K2', 'MMACSF', 'MZ7MEL', 'WM1552C']
    for cell_line in cell_lines:
        has_data = cell_line not in cell_lines_no_data
        # Cell lines without data reuse the generic statements as they are
        if has_data:
            line_stmts = contextualize_stmts(model_stmts, cell_line,
                                             data_genes)
        else:
            line_stmts = model_stmts
        line_assembler = PysbAssembler()
        line_assembler.add_statements(line_stmts)
        line_model = line_assembler.make_model()
        if has_data:
            contextualize_model(line_model, cell_line, data_genes)
        ac.dump_statements(line_stmts,
                           prefixed_pkl('pysb_stmts_%s' % cell_line))
        with open(prefixed_pkl('pysb_model_%s' % cell_line), 'wb') as f:
            pickle.dump(line_model, f)
for stmt in stmts: st = get_text(stmt.subj) ot = get_text(stmt.obj) if text_too_long(st, k) or text_too_long(ot, k): continue new_stmts.append(stmt) logger.info(f'{len(new_stmts)} statements after filter.') return new_stmts if __name__ == '__main__': wm_ont = load_world_ontology(wm_ont_url) # Load all raw statements eidos_stmts = load_eidos() eidos_stmts = ac.filter_by_type(eidos_stmts, Influence) hume_stmts = load_hume() hume_stmts = ac.filter_by_type(hume_stmts, Influence) hume_stmts = remove_hume_redundant(hume_stmts, None) #sofia_stmts = load_sofia() #cwms_stmts = load_cwms() # Reground where needed # sofia_stmts = reground_stmts(sofia_stmts, wm_ont, 'WM') # cwms_stmts = reground_stmts(cwms_stmts, wm_ont, 'WM') # Put statements together and filter to influence #stmts = eidos_stmts + hume_stmts + sofia_stmts + cwms_stmts stmts = eidos_stmts + hume_stmts # Remove name spaces that aren't needed in CauseMos remove_namespaces(stmts, ['WHO', 'MITRE12', 'UN'])
def test_filter_by_type():
    """Filtering [st1, st14] for Phosphorylation yields one statement."""
    candidates = [st1, st14]
    filtered = ac.filter_by_type(candidates, Phosphorylation)
    assert len(filtered) == 1
def get_signor_stmts():
    """Return a list of activity and a list of amount regulation stmts.

    The statements come from ``signor.process_from_web()``; the first
    element of the returned pair holds the RegulateActivity statements and
    the second the RegulateAmount statements.
    """
    processor = signor.process_from_web()
    activity_stmts = ac.filter_by_type(processor.statements,
                                       RegulateActivity)
    amount_stmts = ac.filter_by_type(processor.statements, RegulateAmount)
    return activity_stmts, amount_stmts
agent = Agent(concept.name, db_refs={gr[0]: gr[1], 'TEXT': concept.name}) standardize_agent_name(agent, standardize_refs=True) return agent def get_regulate_activity(stmt): subj = get_agent(stmt.subj.concept) obj = get_agent(stmt.obj.concept) if not subj or not obj: return None pol = stmt.overall_polarity() stmt_type = Activation if pol == 1 or not pol else Inhibition bio_stmt = stmt_type(subj, obj, evidence=stmt.evidence) return bio_stmt if __name__ == '__main__': root = os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir, os.pardir) with open(os.path.join(root, 'stmts', 'eidos_statements.pkl'), 'rb') as fh: stmts = pickle.load(fh) stmts = ac.filter_by_type(stmts, Influence) bio_stmts = [] for stmt in tqdm.tqdm(stmts): bio_stmt = get_regulate_activity(stmt) if bio_stmt: bio_stmts.append(bio_stmt) with open(os.path.join(root, 'stmts', 'eidos_bio_statements.pkl'), 'wb') as fh: pickle.dump(bio_stmts, fh)
if __name__ == "__main__": stmts = "../work/phospho_stmts.pkl" prize_outpath = "../work/pybel_prize.tsv" interactome_path = "../work/big_pybel_interactome2.tsv" site_file = "../work/gsea_sites.rnk" # Load the statements linking kinases/regulators to phospho sites # in the data stmts = ac.load_statements(stmts) # Employ filters to reduce network size stmts = ac.filter_grounded_only(stmts) stmts = ac.filter_human_only(stmts) stmts = ac.filter_genes_only(stmts) # In this data, statements of these two types will not act on # a short enough timescale to play a meaningful role stmts = ac.filter_by_type(stmts, DecreaseAmount, invert=True) stmts = ac.filter_by_type(stmts, IncreaseAmount, invert=True) stmts = ac.filter_by_type(stmts, Complex, invert=True) stmts = ac.filter_enzyme_kinase(stmts) # Assemble a pybel graph from statements pba = PybelAssembler(stmts) pb_graph = make_model(pba) signed_graph = to_signed_nodes(pb_graph) gn_dict = get_gene_node_dict(signed_graph) # Next we have to load the data file and assign values to site_data = read_site_file(site_file) dump_steiner_files(signed_graph, site_data, prize_outpath,