def main(args):
    """Assemble a pickled set of INDRA Statements into a two-column SIF file.

    Parameters
    ----------
    args : object
        Namespace-like object with the attributes ``infile``, ``outfile``,
        ``human_only`` and ``filter_direct``. Falsy ``infile``/``outfile``
        values are replaced (on ``args`` itself) by the default paths below.
    """
    # NOTE: the default input file takes about 32 GB to load
    if not args.infile:
        args.infile = './Data/indra_raw/bioexp_all_raw.pkl'
    if not args.outfile:
        args.outfile = './filtered_indra_network.sif'

    # Load statements from file
    stmts_raw = assemble_corpus.load_statements(args.infile)

    # Expand families, fix grounding errors and run preassembly
    stmts_expanded = assemble_corpus.expand_families(stmts_raw)
    stmts_mapped = assemble_corpus.map_grounding(stmts_expanded)
    stmts_fixed = assemble_corpus.run_preassembly(stmts_mapped)

    # Default filtering: specific (unique) genes that are grounded.
    stmts_genes = assemble_corpus.filter_genes_only(stmts_fixed,
                                                    specific_only=True)
    stmts_filtered = assemble_corpus.filter_grounded_only(stmts_genes)

    # Custom filters
    if args.human_only:
        stmts_filtered = assemble_corpus.filter_human_only(stmts_filtered)
    if args.filter_direct:
        stmts_filtered = assemble_corpus.filter_direct(stmts_filtered)

    # Keep binary statements only. Every agent must be present: checking
    # just the first one would let a statement with a missing second agent
    # through and fail on ``ag.name`` below.
    rows = []
    for stmt in stmts_filtered:
        agents = stmt.agent_list()
        if len(agents) != 2 or any(ag is None for ag in agents):
            continue
        rows.append([ag.name for ag in agents])

    # Write rows to .sif file
    with open(args.outfile, 'w', newline='') as csvfile:
        csv.writer(csvfile, delimiter='\t').writerows(rows)
def run_assembly(self):
    """Run INDRA's assembly pipeline on the Statements.

    Which steps run is decided by the entries of ``self.assembly_config``.
    The resulting list is stored in ``self.assembled_stmts``; nothing is
    returned.
    """
    self.eliminate_copies()
    stmts = self.get_indra_stmts()
    stmts = self.filter_event_association(stmts)
    stmts = ac.filter_no_hypothesis(stmts)

    config = self.assembly_config

    # Grounding related steps
    if not config.get('skip_map_grounding'):
        stmts = ac.map_grounding(stmts)
    if config.get('standardize_names'):
        # The return value is deliberately not re-assigned here
        ac.standardize_names_groundings(stmts)
    if config.get('filter_ungrounded'):
        stmts = ac.filter_grounded_only(
            stmts, score_threshold=config.get('score_threshold'))
    if config.get('merge_groundings'):
        stmts = ac.merge_groundings(stmts)
    if config.get('merge_deltas'):
        stmts = ac.merge_deltas(stmts)

    # Relevance, organism and sequence steps
    relevance_policy = config.get('filter_relevance')
    if relevance_policy:
        stmts = self.filter_relevance(stmts, relevance_policy)
    if not config.get('skip_filter_human'):
        stmts = ac.filter_human_only(stmts)
    if not config.get('skip_map_sequence'):
        stmts = ac.map_sequence(stmts)

    # Use WM hierarchies and belief scorer for WM preassembly
    preassembly_kwargs = {'return_toplevel': False}
    if config.get('preassembly_mode') == 'wm':
        preassembly_kwargs['hierarchies'] = get_wm_hierarchies()
        preassembly_kwargs['belief_scorer'] = get_eidos_scorer()
    stmts = ac.run_preassembly(stmts, **preassembly_kwargs)

    # Belief and top-level filtering
    belief_cutoff = config.get('belief_cutoff')
    if belief_cutoff is not None:
        stmts = ac.filter_belief(stmts, belief_cutoff)
    stmts = ac.filter_top_level(stmts)

    if config.get('filter_direct'):
        for direct_filter in (ac.filter_direct,
                              ac.filter_enzyme_kinase,
                              ac.filter_mod_nokinase,
                              ac.filter_transcription_factor):
            stmts = direct_filter(stmts)

    if config.get('mechanism_linking'):
        linker = MechLinker(stmts)
        linker.gather_explicit_activities()
        linker.reduce_activities()
        linker.gather_modifications()
        linker.reduce_modifications()
        # Explicit activities are gathered a second time, after the
        # reduction steps above
        linker.gather_explicit_activities()
        linker.replace_activations()
        linker.require_active_forms()
        stmts = linker.statements

    self.assembled_stmts = stmts
def get_omnipath_stmts():
    """Return the (de)phosphorylation Statements obtained from OmniPath.

    All modification statements are fetched through ``omnipath_client``,
    narrowed to Phosphorylation and Dephosphorylation, then passed through
    sequence mapping and the human-only filter.
    """
    all_mods = omnipath_client.get_all_modifications()
    phos = ac.filter_by_type(all_mods, Phosphorylation)
    dephos = ac.filter_by_type(all_mods, Dephosphorylation)
    mapped = ac.map_sequence(phos + dephos)
    human = ac.filter_human_only(mapped)
    # Gene-only filtering is currently disabled:
    #human = ac.filter_genes_only(human, specific_only=True)
    return human
def run_assembly(stmts, filename):
    """Assemble `stmts`, dump the result to `filename` and return it.

    Relies on the module-level ``gene_names`` list for the gene filter.
    """
    grounded = ac.filter_grounded_only(ac.map_grounding(stmts))
    human = ac.filter_human_only(grounded)
    # Family expansion is currently disabled:
    #human = ac.expand_families(human)
    relevant = ac.filter_gene_list(human, gene_names, 'one',
                                   allow_families=True)
    site_mapped = ac.map_sequence(relevant)
    assembled = ac.run_preassembly(site_mapped, return_toplevel=False,
                                   poolsize=4)
    ac.dump_statements(assembled, filename)
    return assembled
def test_readme_pipeline():
    """Check that the pipeline from the README yields a non-empty result."""
    stmts = gn_stmts  # Added only here, not in docs

    from indra.tools import assemble_corpus as ac
    no_hypothesis = ac.filter_no_hypothesis(stmts)
    grounded = ac.filter_grounded_only(ac.map_grounding(no_hypothesis))
    human = ac.filter_human_only(grounded)
    site_mapped = ac.map_sequence(human)
    preassembled = ac.run_preassembly(site_mapped, return_toplevel=False)
    believed = ac.filter_belief(preassembled, 0.8)
    assert believed, \
        'Update example to yield statements list of non-zero length'
def test_filter_human_only():
    """Exercise ac.filter_human_only with and without bound conditions."""
    kept = ac.filter_human_only([st1, st5])
    assert len(kept) == 2
    kept = ac.filter_human_only([st8, st9])
    assert len(kept) == 0

    # Can we filter out statements with bound conditions grounded to
    # non-human genes?
    kept = ac.filter_human_only([st20], remove_bound=False)
    assert len(kept) == 0

    # When we do such filtering, do we keep statements bounded to human
    # genes?
    kept = ac.filter_human_only([st21], remove_bound=False)
    assert len(kept) == 1

    # Can we remove bound conditions grounded to non-human genes?
    kept = ac.filter_human_only([st20], remove_bound=True)
    assert len(kept) == 1
    remaining_bound = kept[0].sub.bound_conditions
    assert len(remaining_bound) == 0

    # When we do so, do we keep bound conditions not grounded to non-human
    # genes?
    kept = ac.filter_human_only([st21], remove_bound=True)
    assert len(kept) == 1
    remaining_bound = kept[0].sub.bound_conditions
    assert len(remaining_bound) == 1
def preassemble(self, filters=None, grounding_map=None):
    """Preassemble the Statements collected in the model.

    Hypotheses are removed, grounding and sites are mapped, the optional
    filters are applied, preassembly is run and the relevance filter is
    applied last. The result is stored in ``self.assembled_stmts``.

    Currently the following filter options are implemented:

    - grounding: require that all Agents in statements are grounded
    - human_only: require that all proteins are human proteins
    - prior_one: require that at least one Agent is in the prior model
    - prior_all: require that all Agents are in the prior model

    Parameters
    ----------
    filters : Optional[list[str]]
        A list of filter options to apply when choosing the statements.
        See description above for more details. Default: None
    grounding_map : Optional[dict]
        A user supplied grounding map which maps a string to a dictionary
        of database IDs (in the format used by Agents' db_refs). When None,
        ``ac.map_grounding`` is called without a grounding map argument.
    """
    # Membership tests below are made against an empty tuple when no
    # filters were given; `filters` itself is passed on unchanged.
    requested = filters if filters else ()

    stmts = self.get_statements()

    # Filter out hypotheses
    stmts = ac.filter_no_hypothesis(stmts)

    # Fix grounding
    gmap_kwargs = {}
    if grounding_map is not None:
        gmap_kwargs['grounding_map'] = grounding_map
    stmts = ac.map_grounding(stmts, **gmap_kwargs)

    if 'grounding' in requested:
        stmts = ac.filter_grounded_only(stmts)

    # Fix sites
    stmts = ac.map_sequence(stmts)

    if 'human_only' in requested:
        stmts = ac.filter_human_only(stmts)

    # Run preassembly
    stmts = ac.run_preassembly(stmts, return_toplevel=False)

    # Run relevance filter and save Statements
    self.assembled_stmts = self._relevance_filter(stmts, filters)
def run_assembly(stmts, save_file):
    """Assemble `stmts`, dump the result to `save_file` and return it.

    Relies on the module-level ``gene_names`` list for the gene filter.
    """
    # Grounding and organism filters
    grounded = ac.filter_grounded_only(ac.map_grounding(stmts))
    human = ac.filter_human_only(grounded)

    # Restrict to statements involving at least one gene of interest
    expanded = ac.expand_families(human)
    relevant = ac.filter_gene_list(expanded, gene_names, 'one')

    # Site mapping and preassembly
    site_mapped = ac.map_sequence(relevant)
    preassembled = ac.run_preassembly(site_mapped, return_toplevel=False)

    # Post-assembly filters
    believed = ac.filter_belief(preassembled, 0.95)
    top_level = ac.filter_top_level(believed)
    direct = ac.filter_direct(top_level)
    assembled = ac.filter_enzyme_kinase(direct)

    ac.dump_statements(assembled, save_file)
    return assembled
def run_assembly(self):
    """Run INDRA's assembly pipeline on the Statements.

    Returns
    -------
    stmts : list[indra.statements.Statement]
        The list of assembled INDRA Statements.
    """
    # NOTE(review): the accessor is spelled ``get_indra_smts`` here; confirm
    # this matches the method name defined on the class.
    collected = self.get_indra_smts()
    no_hypothesis = ac.filter_no_hypothesis(collected)
    grounded = ac.map_grounding(no_hypothesis)
    site_mapped = ac.map_sequence(grounded)
    human = ac.filter_human_only(site_mapped)
    return ac.run_preassembly(human, return_toplevel=False)
def get_indra_phos_stmts():
    """Collect, assemble and cache (de)phosphorylation Statements.

    Intermediate and final results are written to pickle files under
    ``sources/``; the final filtered list is also returned.
    """
    stmts = by_gene_role_type(stmt_type='Phosphorylation')
    stmts += by_gene_role_type(stmt_type='Dephosphorylation')

    grounded = ac.map_grounding(stmts)
    # Expand families before site mapping
    expanded = ac.expand_families(grounded)
    expanded = ac.filter_grounded_only(expanded)
    site_mapped = ac.map_sequence(expanded)
    ac.dump_statements(site_mapped, 'sources/indra_phos_sitemap.pkl')

    preassembled = ac.run_preassembly(
        site_mapped, poolsize=4, save='sources/indra_phos_stmts_pre.pkl')

    human = ac.filter_human_only(preassembled)
    final_stmts = ac.filter_genes_only(human, specific_only=True)
    ac.dump_statements(final_stmts, 'sources/indra_phos_stmts.pkl')
    return final_stmts
def get_indra_expression():
    """Load preassembled amount-regulation Statements and filter them.

    The statements are read from ``indra_regulate_amount_pre.pkl``, limited
    to human genes, and those whose first agent is missing are dropped.
    """
    # Disabled steps that were used to produce the pickle loaded below:
    #inc_stmts = by_gene_role_type(stmt_type='IncreaseAmount')
    #dec_stmts = by_gene_role_type(stmt_type='DecreaseAmount')
    #stmts = inc_stmts + dec_stmts
    #ac.dump_statements(stmts, 'indra_regulate_amount_stmts.pkl')
    #stmts = ac.load_statements('indra_regulate_amount_stmts.pkl')
    #stmts = ac.map_grounding(stmts)
    # Expand families before site mapping
    #stmts = ac.expand_families(stmts)
    #stmts = ac.filter_grounded_only(stmts)
    #stmts = ac.map_sequence(stmts)
    #stmts = ac.run_preassembly(stmts, poolsize=4,
    #                           save='indra_regulate_amount_pre.pkl')
    preassembled = ac.load_statements('indra_regulate_amount_pre.pkl')
    human = ac.filter_human_only(preassembled)
    genes_only = ac.filter_genes_only(human)

    with_first_agent = []
    for stmt in genes_only:
        if stmt.agent_list()[0] is not None:
            with_first_agent.append(stmt)
    return with_first_agent
def regulons_from_stmts(stmts, filename):
    """Write kinase regulons derived from `stmts` to a tab-separated file.

    Each output row has the form ``kinase, 'Description', site, site, ...``
    where a site is ``<substrate>_<residue><position>``. Only statements
    that pass the genes-only and human-only filters and that carry both a
    residue and a position contribute a site.

    Parameters
    ----------
    stmts : list
        Statements exposing ``enz``, ``sub``, ``residue`` and ``position``.
    filename : str
        Path of the file to write.
    """
    regulons = defaultdict(set)
    stmts = ac.filter_genes_only(stmts)
    stmts = ac.filter_human_only(stmts)
    for stmt in stmts:
        # A regulon entry needs both a kinase and a substrate; skip
        # statements missing either instead of failing on ``.name``.
        if stmt.enz is None or stmt.sub is None:
            continue
        kinase = stmt.enz.name
        # Blacklist annoying stmts from NCI-PID
        if kinase in ('BRAF', 'RAF1') and \
                stmt.sub.name in ('MAPK1', 'MAPK3'):
            continue
        if stmt.residue and stmt.position:
            site = '%s_%s%s' % (stmt.sub.name, stmt.residue, stmt.position)
            regulons[kinase].add(site)

    # Sites are sorted so that the file content is reproducible between
    # runs (set iteration order of strings is not).
    rows = [[kinase, 'Description'] + sorted(sites)
            for kinase, sites in regulons.items()]

    # newline='' is what the csv module expects; without it, extra blank
    # lines appear on platforms that translate line endings.
    with open(filename, 'wt', newline='') as f:
        csv.writer(f, delimiter='\t').writerows(rows)
def get_indra_reg_act_stmts():
    """Return activity-regulation Statements, using a pickle cache.

    If ``sources/indra_reg_act_stmts.pkl`` can be loaded, its content is
    returned directly. Otherwise Activation, Inhibition and ActiveForm
    statements are fetched with ``by_gene_role_type``, assembled, filtered,
    written to that same file and returned.
    """
    cache_file = 'sources/indra_reg_act_stmts.pkl'
    try:
        return ac.load_statements(cache_file)
    except Exception as err:
        # Best effort: any failure to read the cache triggers a rebuild.
        # Catching Exception (not a bare except) lets KeyboardInterrupt
        # and SystemExit propagate.
        print("Could not load %s (%s), rebuilding" % (cache_file, err))

    stmts = []
    for stmt_type in ('Activation', 'Inhibition', 'ActiveForm'):
        print("Getting %s statements from INDRA DB" % stmt_type)
        stmts += by_gene_role_type(stmt_type=stmt_type)
    stmts = ac.map_grounding(stmts, save='sources/indra_reg_act_gmap.pkl')
    stmts = ac.filter_grounded_only(stmts)
    stmts = ac.run_preassembly(stmts, poolsize=4,
                               save='sources/indra_reg_act_pre.pkl')
    stmts = ac.filter_human_only(stmts)
    stmts = ac.filter_genes_only(stmts, specific_only=True)
    ac.dump_statements(stmts, cache_file)
    return stmts
def load_statements_from_synapse(synapse_id='syn11273504'):
    """Build Phosphorylation Statements from a table stored on Synapse.

    Each tab-separated row is read as ``<substrate>:<site>`` in the first
    column and a comma-separated list of enzyme names in the second; one
    statement is created per enzyme. The statements are then site-mapped
    and filtered to specific human genes.

    Parameters
    ----------
    synapse_id : str
        Identifier of the Synapse entity to download.
    """
    client = synapseclient.Synapse()
    client.login()
    # Obtain a pointer and download the data
    entity = client.get(synapse_id)

    stmts = []
    for row in read_unicode_csv(entity.path, delimiter='\t'):
        sub_name, site_info = row[0].split(':')
        # First character is the residue, the remainder the position
        res, pos = site_info[0], site_info[1:]
        for enz_name in row[1].split(','):
            enz = Agent(enz_name, db_refs=get_ids(enz_name))
            # A fresh substrate Agent is created for every statement
            sub = Agent(sub_name, db_refs=get_ids(sub_name))
            stmts.append(Phosphorylation(enz, sub, res, pos))

    site_mapped = ac.map_sequence(stmts)
    human = ac.filter_human_only(site_mapped)
    return ac.filter_genes_only(human, specific_only=True)
def assemble_statements(kinase, stmts, curs):
    """Run assembly steps on statements.

    The assembled list is pickled to ``data/assembled/<kinase>.pkl`` and
    returned.

    Parameters
    ----------
    kinase : str
        Used to look up CTD statements and to name the output file.
    stmts : list
        The statements to assemble.
    curs : list
        Curations passed on to ``ac.filter_by_curation``.
    """
    # Remove unary statements and ones with many agents
    stmts = [st for st in stmts if len(st.real_agent_list()) in (2, 3)]
    stmts = replace_ctd(stmts, ctd_stmts_by_gene.get(kinase, []))

    # We do this at this point to make sure we capture the original DB
    # hashes before modifying statements to allow lookup
    for st in stmts:
        for evidence in st.evidence:
            evidence.annotations['prior_hash'] = st.get_hash()

    stmts = fix_invalidities(stmts)
    stmts = ac.filter_grounded_only(stmts)
    stmts = ac.filter_human_only(stmts)
    stmts = ac.filter_by_curation(stmts, curations=curs)
    stmts = unify_lspci(stmts)
    stmts = remove_contradictions(stmts)

    # Rename chemicals
    logger.info('Renaming chemicals')
    for st in stmts:
        for agent in st.real_agent_list():
            if agent.db_refs.get('CHEBI') and len(agent.name) > 25:
                rename_chemical(agent)

    # Remove long names
    logger.info('Removing statements with long names')
    stmts = [st for st in stmts
             if not any(len(ag.name) >= 20 for ag in st.real_agent_list())]
    logger.info('%d statements remaining' % len(stmts))

    # Remove microRNAs
    logger.info('Removing microRNA statements')
    stmts = [st for st in stmts
             if all('miR' not in ag.name for ag in st.real_agent_list())]
    logger.info('%d statements remaining' % len(stmts))

    stmts = add_source_urls(stmts)
    with open('data/assembled/%s.pkl' % kinase, 'wb') as fh:
        pickle.dump(stmts, fh)
    return stmts
def get_fplx_stmts(fplx_id):
    """Fetch Statements for a FamPlex entry from the INDRA DB REST API.

    Statements are requested for the agent ``<fplx_id>@FPLX`` with an
    evidence limit of 10000, then passed through ``filter_out_medscan``
    and the human-only filter.
    """
    agent_key = '%s@FPLX' % fplx_id
    processor = indra_db_rest.get_statements(agents=[agent_key],
                                             ev_limit=10000)
    without_medscan = filter_out_medscan(processor.statements)
    return ac.filter_human_only(without_medscan)
return pd.DataFrame(tf_df) wd = __file__ INDRA_SIF = os.path.join(os.pardir, 'input', 'sif.pkl') with open(INDRA_SIF, 'rb') as fh: SIF = pickle.load(fh) n_stmt_type = list(SIF.columns).index('stmt_type') n_stmt_hash = list(SIF.columns).index('stmt_hash') hash_set = set() for r, c in SIF.iterrows(): if c[n_stmt_type] == 'IncreaseAmount' or c[n_stmt_type] == 'DecreaseAmount': hash_set.add(c[n_stmt_hash]) #stmts = download_statements(hash_set) indra_stmts = list(stmts.values()) with open('../output/all_stmts.pkl', 'wb') as fh: pickle.dump(indra_stmts, fh) indra_stmts = filter_human_only(indra_stmts) indra_stmts = filter_genes_only(indra_stmts) indra_stmts = filter_transcription_factor(indra_stmts) indra_stmts_db_only = filter_db_only(indra_stmts) indra_stmts_df = make_dataframe(indra_stmts) indra_stmts_df.to_csv('../output/indra_all_tf.csv') indra_stmts_db_only_df = make_dataframe(indra_stmts_db_only) indra_stmts_db_only_df.to_csv('../output/indra_db_only_tf.csv')
def test_filter_human_only():
    """Check how many statements survive ac.filter_human_only."""
    kept = ac.filter_human_only([st1, st5])
    assert len(kept) == 2

    kept = ac.filter_human_only([st8, st9])
    assert len(kept) == 0
def get_phosphosite_stmts():
    """Load the pickled PhosphoSite Statements, filtered to human genes.

    Statements are read from ``sources/phosphosite_stmts.pkl`` and passed
    through the human-only and specific-genes-only filters.
    """
    loaded = ac.load_statements('sources/phosphosite_stmts.pkl')
    human = ac.filter_human_only(loaded)
    return ac.filter_genes_only(human, specific_only=True)
# Either reuse the cached preassembled statements or rebuild them from the
# prior and reading pickles found in `outf`.
if not reassemble:
    stmts = ac.load_statements(pjoin(outf, 'preassembled.pkl'))
    #stmts = ac.load_statements(pjoin(outf, 'prior.pkl'))
else:
    #prior_stmts = build_prior(data_genes, pjoin(outf, 'prior.pkl'))
    prior_stmts = ac.load_statements(pjoin(outf, 'prior.pkl'))
    prior_stmts = ac.map_grounding(
        prior_stmts, save=pjoin(outf, 'gmapped_prior.pkl'))
    reading_stmts = ac.load_statements(pjoin(outf, 'phase3_stmts.pkl'))
    reading_stmts = ac.map_grounding(
        reading_stmts, save=pjoin(outf, 'gmapped_reading.pkl'))

    # Combined filtering and assembly of both sources
    stmts = prior_stmts + reading_stmts
    stmts = ac.filter_grounded_only(stmts)
    stmts = ac.filter_genes_only(stmts, specific_only=False)
    stmts = ac.filter_human_only(stmts)
    stmts = ac.expand_families(stmts)
    stmts = ac.filter_gene_list(stmts, data_genes, 'one')
    stmts = ac.map_sequence(stmts, save=pjoin(outf, 'smapped.pkl'))
    stmts = ac.run_preassembly(
        stmts, return_toplevel=False,
        save=pjoin(outf, 'preassembled.pkl'))

# Model types to assemble
assemble_models = ['sif', 'pysb', 'cx']

### PySB assembly
if 'pysb' in assemble_models:
    pysb_model = assemble_pysb(
        stmts, data_genes, pjoin(outf, 'korkut_model_pysb.py'))
prior_stmts = ac.load_statements(pjoin(outf, 'prior.pkl')) prior_stmts = ac.map_grounding(prior_stmts, save=pjoin(outf, 'gmapped_prior.pkl')) reach_stmts = ac.load_statements(pjoin(outf, 'phase3_stmts.pkl')) reach_stmts = ac.filter_no_hypothesis(reach_stmts) #extra_stmts = ac.load_statements(pjoin(outf, 'extra_stmts.pkl')) extra_stmts = read_extra_sources(pjoin(outf, 'extra_stmts.pkl')) reading_stmts = reach_stmts + extra_stmts reading_stmts = ac.map_grounding(reading_stmts, save=pjoin(outf, 'gmapped_reading.pkl')) stmts = prior_stmts + reading_stmts + extra_stmts stmts = ac.filter_grounded_only(stmts) stmts = ac.filter_genes_only(stmts, specific_only=False) stmts = ac.filter_human_only(stmts) stmts = ac.expand_families(stmts) stmts = ac.filter_gene_list(stmts, data_genes, 'one') stmts = ac.map_sequence(stmts, save=pjoin(outf, 'smapped.pkl')) #stmts = ac.load_statements(pjoin(outf, 'smapped.pkl')) stmts = ac.run_preassembly(stmts, return_toplevel=False, save=pjoin(outf, 'preassembled.pkl'), poolsize=4) ### PySB assembly if 'pysb' in assemble_models: pysb_model = assemble_pysb(stmts, data_genes, pjoin(outf, 'korkut_model_pysb.py')) ### SIF assembly if 'sif' in assemble_models: