def test_filter_gene_list_one():
    """Exercise filter_gene_list 'one'/'all' policies and bound conditions."""
    # Policy 'one': a single matching agent is enough to keep a statement.
    assert len(ac.filter_gene_list([st1, st2], ['a'], 'one')) == 2
    # Policy 'all': every agent must be on the filter list.
    assert len(ac.filter_gene_list([st1, st2], ['a'], 'all')) == 0
    assert len(ac.filter_gene_list([st1, st2], ['a', 'b'], 'all')) == 1
    # An invalid policy leaves the statements untouched.
    assert len(ac.filter_gene_list([st1, st2], ['a', 'b'], 'invalid')) == 2
    # Bound condition agents also count toward the filter.
    # All genes, including the bound condition, are on the list.
    assert len(ac.filter_gene_list([st18], ['a', 'b', 'd'], 'all')) == 1
    # Bound condition for sub is not on the list, so 'all' excludes it.
    assert len(ac.filter_gene_list([st18], ['a', 'b'], 'all')) == 0
    # With 'one', a single match suffices even without the bound condition.
    assert len(ac.filter_gene_list([st18], ['a', 'b'], 'one')) == 1
    # Only the bound condition agent is on the list; 'one' still matches.
    assert len(ac.filter_gene_list([st18], ['d'], 'one')) == 1
    # remove_bound=True strips bound conditions not on the filter list.
    filtered = ac.filter_gene_list([st18], ['a', 'b', 'd'], 'all',
                                   remove_bound=True)
    assert len(filtered[0].sub.bound_conditions) == 1
    filtered = ac.filter_gene_list([st18], ['a', 'b'], 'all',
                                   remove_bound=True)
    assert len(filtered[0].sub.bound_conditions) == 0
def test_filter_gene_list_families():
    """Check the allow_families flag of filter_gene_list."""
    # Without family expansion, only the specific MAPK1 statement survives.
    filtered = ac.filter_gene_list([st16, st17], ['MAPK1'], 'one',
                                   allow_families=False)
    assert len(filtered) == 1
    assert filtered[0] == st16
    # With allow_families=True, the family-level statement is kept as well.
    filtered = ac.filter_gene_list([st16, st17], ['MAPK1'], 'one',
                                   allow_families=True)
    assert len(filtered) == 2
    assert st16 in filtered
    assert st17 in filtered
def test_filter_gene_list_invert():
    """Check inverted filtering behavior of filter_gene_list."""
    # Inverted 'one' drops any statement mentioning a listed gene.
    assert len(ac.filter_gene_list([st1, st2], ['a'], 'one',
                                   invert=True)) == 0
    kept = ac.filter_gene_list([st1, st2], ['d'], 'one', invert=True)
    assert len(kept) == 1
    assert kept[0].sub.name == 'b'
    # Inverted 'all' drops statements whose agents are all on the list.
    kept = ac.filter_gene_list([st1, st2], ['a', 'd'], 'all', invert=True)
    assert len(kept) == 1
    assert kept[0].sub.name == 'b'
    kept = ac.filter_gene_list([st1, st2], ['a', 'b', 'd'], 'all',
                               invert=True)
    assert len(kept) == 0
def get_bel_stmts(self, filter=False):
    """Get relevant statements from the BEL large corpus.

    Performs a series of neighborhood queries and then takes the union of
    all the statements. Because the query process can take a long time for
    large gene lists, the resulting list of statements are cached in a
    pickle file with the filename `<basename>_bel_stmts.pkl`. If the
    pickle file is present, it is used by default; if not present, the
    queries are performed and the results are cached.

    Parameters
    ----------
    filter : bool
        If True, includes only those statements that exclusively mention
        genes in :py:attr:`gene_list`. Default is False. Note that the
        full (unfiltered) set of statements are cached.

    Returns
    -------
    list of :py:class:`indra.statements.Statement`
        List of INDRA statements extracted from the BEL large corpus.
    """
    # Build the cache path only when caching is enabled; every later use
    # of bel_stmt_path is guarded by the same basename check.
    if self.basename is not None:
        bel_stmt_path = '%s_bel_stmts.pkl' % self.basename
    # Check for cached BEL stmt file
    if self.basename is not None and os.path.isfile(bel_stmt_path):
        logger.info("Loading BEL statements from %s" % bel_stmt_path)
        with open(bel_stmt_path, 'rb') as f:
            bel_statements = pickle.load(f)
    # No cache, so perform the queries
    else:
        bel_statements = []
        # Query each gene's neighborhood individually and pool the results
        for gene in self.gene_list:
            logger.info("Getting BEL statements for gene %s" % gene)
            bel_proc = bel.process_ndex_neighborhood([gene])
            if bel_proc is not None:
                bel_statements += bel_proc.statements
        # Save to pickle file if we're caching
        if self.basename is not None:
            with open(bel_stmt_path, 'wb') as f:
                pickle.dump(bel_statements, f, protocol=2)
    # Optionally filter out statements not involving only our gene set
    if filter:
        if len(self.gene_list) > 1:
            # NOTE(review): policy 'one' keeps statements with *any* listed
            # gene, which is looser than the docstring's "exclusively
            # mention" wording -- confirm this is intended.
            bel_statements = ac.filter_gene_list(bel_statements,
                                                 self.gene_list, 'one')
        else:
            bel_statements = ac.filter_gene_list(bel_statements,
                                                 self.gene_list, 'all')
    return bel_statements
def get_bel_stmts(self, filter=False):
    """Get relevant statements from the BEL large corpus.

    Performs a series of neighborhood queries and then takes the union of
    all the statements. Because the query process can take a long time for
    large gene lists, the resulting list of statements are cached in a
    pickle file with the filename `<basename>_bel_stmts.pkl`. If the
    pickle file is present, it is used by default; if not present, the
    queries are performed and the results are cached.

    Parameters
    ----------
    filter : bool
        If True, includes only those statements that exclusively mention
        genes in :py:attr:`gene_list`. Default is False. Note that the
        full (unfiltered) set of statements are cached.

    Returns
    -------
    list of :py:class:`indra.statements.Statement`
        List of INDRA statements extracted from the BEL large corpus.
    """
    bel_proc = bel.process_pybel_neighborhood(self.gene_list)
    bel_statements = bel_proc.statements
    # Save to pickle file if we're caching
    if self.basename is not None:
        with open('%s_bel_stmts.pkl' % self.basename, 'wb') as f:
            pickle.dump(bel_statements, f)
    # Optionally filter out statements not involving only our gene set.
    # Fix: previously the filter was silently skipped for single-gene lists
    # because of a `len(self.gene_list) > 1` guard with no else branch,
    # contradicting the documented behavior of filter=True. Apply the
    # 'all' policy unconditionally instead.
    if filter:
        bel_statements = ac.filter_gene_list(bel_statements,
                                             self.gene_list, 'all')
    return bel_statements
def get_stmts_for_gene_list(gene_list, other_entities):
    """Return all Statements between genes in a given list.

    Parameters
    ----------
    gene_list : list[str]
        A list of HGNC symbols for genes to query.
    other_entities : list[str]
        A list of other entities to keep as part of the set of Statements.

    Returns
    -------
    list[indra.statements.Statement]
        A list of INDRA Statements between the given list of genes and
        other entities specified.
    """
    collected = []
    for gene in gene_list:
        logger.info(f'Querying {gene}')
        st = get_stmts_for_gene(gene)
        logger.info(f'Got {len(st)} statements for {gene}')
        collected.extend(st)
    # Keep only statements whose agents are all in the combined entity set
    return ac.filter_gene_list(collected, gene_list + other_entities,
                               policy='all')
def run_assembly(stmts, filename):
    """Ground, filter, and preassemble statements, then dump to filename."""
    # Each pipeline step maps a statement list to a statement list.
    pipeline = [
        ac.map_grounding,
        ac.filter_grounded_only,
        ac.filter_human_only,
        # ac.expand_families,
        lambda s: ac.filter_gene_list(s, gene_names, 'one',
                                      allow_families=True),
        ac.map_sequence,
        lambda s: ac.run_preassembly(s, return_toplevel=False, poolsize=4),
    ]
    for step in pipeline:
        stmts = step(stmts)
    ac.dump_statements(stmts, filename)
    return stmts
def run_assembly(stmts, save_file):
    """Run the full assembly/filter pipeline and save results to save_file."""
    # Each step takes and returns a list of statements.
    steps = (
        ac.map_grounding,
        ac.filter_grounded_only,
        ac.filter_human_only,
        ac.expand_families,
        lambda s: ac.filter_gene_list(s, gene_names, 'one'),
        ac.map_sequence,
        lambda s: ac.run_preassembly(s, return_toplevel=False),
        lambda s: ac.filter_belief(s, 0.95),
        ac.filter_top_level,
        ac.filter_direct,
        ac.filter_enzyme_kinase,
    )
    for step in steps:
        stmts = step(stmts)
    ac.dump_statements(stmts, save_file)
    return stmts
def assemble_pysb(stmts, data_genes, out_file):
    """Return an assembled PySB model.

    Filters the statements, assembles a PySB model, attaches phospho-site
    observables, sets cell-line context, saves the model to out_file and
    additionally exports a Kappa version alongside it.

    Parameters
    ----------
    stmts : list of indra.statements.Statement
        Statements to assemble.
    data_genes : list[str]
        Gene names; only statements whose agents are all in this list are
        kept ('all' policy).
    out_file : str
        Path where the PySB model is saved; the Kappa export uses the same
        base name with a .ka extension.

    Returns
    -------
    pysb.Model
        The assembled model.
    """
    stmts = ac.filter_direct(stmts)
    stmts = ac.filter_belief(stmts, 0.95)
    stmts = ac.filter_top_level(stmts)
    stmts = ac.filter_gene_list(stmts, data_genes, 'all')
    stmts = ac.reduce_activities(stmts)
    pa = PysbAssembler()
    pa.add_statements(stmts)
    model = pa.make_model()
    # Add observables for the readout sites; data-driven to avoid the
    # repeated Observable/add_component boilerplate.
    observables = [
        ('MAPK1p', 'MAPK1', {'T185': 'p', 'Y187': 'p'}),
        ('MAPK3p', 'MAPK3', {'T202': 'p', 'Y204': 'p'}),
        ('GSK3Ap', 'GSK3A', {'S21': 'p'}),
        ('GSK3Bp', 'GSK3B', {'S9': 'p'}),
        ('RPS6p', 'RPS6', {'S235': 'p'}),
        ('EIF4EBP1p', 'EIF4EBP1', {'S65': 'p'}),
        ('JUNp', 'JUN', {'S73': 'p'}),
        ('FOXO3p', 'FOXO3', {'S315': 'p'}),
        ('AKT1p', 'AKT1', {'S473': 'p'}),
        ('AKT2p', 'AKT2', {'S474': 'p'}),
        ('AKT3p', 'AKT3', {'S': 'p'}),
        ('ELK1', 'ELK1', {'S383': 'p'}),
    ]
    for obs_name, monomer_name, site_states in observables:
        o = Observable(obs_name, model.monomers[monomer_name](**site_states))
        model.add_component(o)
    # Set context
    pa.set_context('SKMEL28_SKIN')
    pa.save_model(out_file)
    ke = KappaExporter(model)
    # Fix: base_file must be computed *before* it is used to build the
    # .ka path (the original assigned it inside the with block, after the
    # open() call had already referenced it -- a NameError at runtime).
    base_file, _ = os.path.splitext(out_file)
    with open('%s.ka' % base_file, 'wb') as fh:
        fh.write(ke.export().encode('utf-8'))
    return model
def get_bel_stmts(self, filter=False):
    """Get relevant statements from the BEL large corpus.

    Performs a series of neighborhood queries and then takes the union of
    all the statements. Because the query process can take a long time for
    large gene lists, the resulting list of statements are cached in a
    pickle file with the filename `<basename>_bel_stmts.pkl`. If the
    pickle file is present, it is used by default; if not present, the
    queries are performed and the results are cached.

    Parameters
    ----------
    filter : bool
        If True, includes only those statements that exclusively mention
        genes in :py:attr:`gene_list`. Default is False. Note that the
        full (unfiltered) set of statements are cached.

    Returns
    -------
    list of :py:class:`indra.statements.Statement`
        List of INDRA statements extracted from the BEL large corpus.
    """
    # Build the cache path only when caching is enabled; every later use
    # of bel_stmt_path is guarded by the same basename check.
    if self.basename is not None:
        bel_stmt_path = '%s_bel_stmts.pkl' % self.basename
    # Check for cached BEL stmt file
    if self.basename is not None and os.path.isfile(bel_stmt_path):
        logger.info("Loading BEL statements from %s" % bel_stmt_path)
        with open(bel_stmt_path, 'rb') as f:
            bel_statements = pickle.load(f)
    # No cache, so perform the queries
    else:
        bel_proc = bel.process_pybel_neighborhood(
            self.gene_list, network_file=self.bel_corpus)
        bel_statements = bel_proc.statements
        # Save to pickle file if we're caching
        if self.basename is not None:
            with open(bel_stmt_path, 'wb') as f:
                pickle.dump(bel_statements, f)
    # Optionally filter out statements not involving only our gene set.
    # Fix: previously the filter was silently skipped for single-gene
    # lists because of a `len(self.gene_list) > 1` guard with no else
    # branch, contradicting the documented behavior of filter=True.
    if filter:
        bel_statements = ac.filter_gene_list(bel_statements,
                                             self.gene_list, 'all')
    return bel_statements
def preprocess_stmts(stmts, data_genes):
    """Filter and simplify statements in preparation for model assembly.

    Applies mutation/type/belief/gene filters, reduces activity and
    modification types via MechLinker, and iteratively removes
    inconsequential modifications and activities until a fixed point.

    Parameters
    ----------
    stmts : list
        INDRA Statements to preprocess.
    data_genes : list[str]
        Gene names; only statements whose agents are all in this list are
        kept ('all' policy).

    Returns
    -------
    list
        The filtered and simplified statements.
    """
    # Filter the INDRA Statements to be put into the model
    stmts = ac.filter_mutation_status(stmts,
                                      {'BRAF': [('V', '600', 'E')]}, ['PTEN'])
    stmts = ac.filter_by_type(stmts, Complex, invert=True)
    stmts = ac.filter_direct(stmts)
    stmts = ac.filter_belief(stmts, 0.95)
    stmts = ac.filter_top_level(stmts)
    stmts = ac.filter_gene_list(stmts, data_genes, 'all')
    stmts = ac.filter_enzyme_kinase(stmts)
    stmts = ac.filter_mod_nokinase(stmts)
    stmts = ac.filter_transcription_factor(stmts)
    # Simplify activity types
    ml = MechLinker(stmts)
    ml.gather_explicit_activities()
    ml.reduce_activities()
    ml.gather_modifications()
    ml.reduce_modifications()
    # Preassemble only the ActiveForm statements, then recombine
    af_stmts = ac.filter_by_type(ml.statements, ActiveForm)
    non_af_stmts = ac.filter_by_type(ml.statements, ActiveForm, invert=True)
    af_stmts = ac.run_preassembly(af_stmts)
    stmts = af_stmts + non_af_stmts
    # Replace activations when possible
    ml = MechLinker(stmts)
    ml.gather_explicit_activities()
    ml.replace_activations()
    # Require active forms
    ml.require_active_forms()
    num_stmts = len(ml.statements)
    while True:
        # Remove inconsequential PTMs
        ml.statements = ac.filter_inconsequential_mods(ml.statements,
                                                       get_mod_whitelist())
        ml.statements = ac.filter_inconsequential_acts(ml.statements,
                                                       get_mod_whitelist())
        # Stop once a pass removes nothing more (fixed point reached)
        if num_stmts <= len(ml.statements):
            break
        num_stmts = len(ml.statements)
    stmts = ml.statements
    return stmts
def assemble_sif(stmts, data, out_file):
    """Return an assembled SIF.

    Filters statements to high-belief Activation/Inhibition among Ras227
    genes, adds drug-target inhibitions, rewrites genes to their antibody
    readouts, writes the SIF network to out_file, and builds the MIDAS
    data file for training.
    """
    # Filter for high-belief statements
    stmts = ac.filter_belief(stmts, 0.99)
    stmts = ac.filter_top_level(stmts)
    # Filter for Activation / Inhibition
    stmts_act = ac.filter_by_type(stmts, Activation)
    stmts_inact = ac.filter_by_type(stmts, Inhibition)
    stmts = stmts_act + stmts_inact
    # Get Ras227 and filter statments
    ras_genes = process_data.get_ras227_genes()
    # YAP1 is explicitly excluded from the Ras227 set here
    ras_genes = [x for x in ras_genes if x not in ['YAP1']]
    stmts = ac.filter_gene_list(stmts, ras_genes, 'all')

    # Get the drugs inhibiting their targets as INDRA
    # statements
    def get_drug_statements():
        # One Inhibition statement per (drug, target) pair; drug agents
        # are tagged with a ':Drugs' suffix so they are recognizable later
        drug_targets = process_data.get_drug_targets()
        drug_stmts = []
        for dn, tns in drug_targets.items():
            da = Agent(dn + ':Drugs')
            for tn in tns:
                ta = Agent(tn)
                drug_stmt = Inhibition(da, ta)
                drug_stmts.append(drug_stmt)
        return drug_stmts
    drug_stmts = get_drug_statements()
    stmts = stmts + drug_stmts

    # Because of a bug in CNO, node names containing AND
    # need to be replaced
    def rename_and_nodes(st):
        # Mutates agent names in place
        for s in st:
            for a in s.agent_list():
                if a is not None:
                    if a.name.find('AND') != -1:
                        a.name = a.name.replace('AND', 'A_ND')
    rename_and_nodes(stmts)
    # Rewrite statements to replace genes with their corresponding
    # antibodies when possible
    stmts = rewrite_ab_stmts(stmts, data)

    def filter_ab_edges(st, policy='all'):
        # Keep statements whose agents are antibody ('_p') or drug nodes;
        # 'all' requires every agent to qualify, 'one' requires any one
        st_out = []
        for s in st:
            if policy == 'all':
                all_ab = True
                for a in s.agent_list():
                    if a is not None:
                        if a.name.find('_p') == -1 and \
                           a.name.find('Drugs') == -1:
                            all_ab = False
                            break
                if all_ab:
                    st_out.append(s)
            elif policy == 'one':
                any_ab = False
                for a in s.agent_list():
                    if a is not None and a.name.find('_p') != -1:
                        any_ab = True
                        break
                if any_ab:
                    st_out.append(s)
        return st_out
    stmts = filter_ab_edges(stmts, 'all')

    # Get a list of the AB names that end up being covered in the prior
    # network. This is important because other ABs will need to be taken
    # out of the MIDAS file to work.
    def get_ab_names(st):
        # Collect the distinct antibody ('_p') agent names, sorted
        prior_abs = set()
        for s in st:
            for a in s.agent_list():
                if a is not None:
                    if a.name.find('_p') != -1:
                        prior_abs.add(a.name)
        return sorted(list(prior_abs))
    pkn_abs = get_ab_names(stmts)
    print('Boolean PKN contains these antibodies: %s' % ', '.join(pkn_abs))
    # Make the SIF model
    sa = SifAssembler(stmts)
    sa.make_model(use_name_as_key=True)
    sif_str = sa.print_model()
    with open(out_file, 'wb') as fh:
        fh.write(sif_str.encode('utf-8'))
    # Make the MIDAS data file used for training the model
    midas_data = process_data.get_midas_data(data, pkn_abs)
    return sif_str
#stmts = ac.load_statements(pjoin(outf, 'prior.pkl')) else: #prior_stmts = build_prior(data_genes, pjoin(outf, 'prior.pkl')) prior_stmts = ac.load_statements(pjoin(outf, 'prior.pkl')) prior_stmts = ac.map_grounding(prior_stmts, save=pjoin(outf, 'gmapped_prior.pkl')) reading_stmts = ac.load_statements(pjoin(outf, 'phase3_stmts.pkl')) reading_stmts = ac.map_grounding(reading_stmts, save=pjoin(outf, 'gmapped_reading.pkl')) stmts = prior_stmts + reading_stmts stmts = ac.filter_grounded_only(stmts) stmts = ac.filter_genes_only(stmts, specific_only=False) stmts = ac.filter_human_only(stmts) stmts = ac.expand_families(stmts) stmts = ac.filter_gene_list(stmts, data_genes, 'one') stmts = ac.map_sequence(stmts, save=pjoin(outf, 'smapped.pkl')) stmts = ac.run_preassembly(stmts, return_toplevel=False, save=pjoin(outf, 'preassembled.pkl')) assemble_models = [] assemble_models.append('sif') assemble_models.append('pysb') assemble_models.append('cx') ### PySB assembly if 'pysb' in assemble_models: pysb_model = assemble_pysb(stmts, data_genes, pjoin(outf, 'korkut_model_pysb.py')) ### SIF assembly if 'sif' in assemble_models:
def get_biopax_stmts(self, filter=False, query='pathsbetween', database_filter=None): """Get relevant statements from Pathway Commons. Performs a "paths between" query for the genes in :py:attr:`gene_list` and uses the results to build statements. This function caches two files: the list of statements built from the query, which is cached in `<basename>_biopax_stmts.pkl`, and the OWL file returned by the Pathway Commons Web API, which is cached in `<basename>_pc_pathsbetween.owl`. If these cached files are found, then the results are returned based on the cached file and Pathway Commons is not queried again. Parameters ---------- filter : Optional[bool] If True, includes only those statements that exclusively mention genes in :py:attr:`gene_list`. Default is False. query : Optional[str] Defined what type of query is executed. The two options are 'pathsbetween' which finds paths between the given list of genes and only works if more than 1 gene is given, and 'neighborhood' which searches the immediate neighborhood of each given gene. Note that for pathsbetween queries with more thatn 60 genes, the query will be executed in multiple blocks for scalability. database_filter: Optional[list[str]] A list of PathwayCommons databases to include in the query. Returns ------- list of :py:class:`indra.statements.Statement` List of INDRA statements extracted from Pathway Commons. 
""" # If we're using a cache, initialize the appropriate filenames if self.basename is not None: biopax_stmt_path = '%s_biopax_stmts.pkl' % self.basename biopax_ras_owl_path = '%s_pc_pathsbetween.owl' % self.basename # Check for cached Biopax stmt file at the given path # if it's there, return the statements from the cache if self.basename is not None and os.path.exists(biopax_stmt_path): logger.info("Loading Biopax statements from %s" % biopax_stmt_path) with open(biopax_stmt_path, 'rb') as f: bp_statements = pickle.load(f) return bp_statements # Check for cached file before querying Pathway Commons Web API if self.basename is not None and os.path.exists(biopax_ras_owl_path): logger.info("Loading Biopax from OWL file %s" % biopax_ras_owl_path) bp = biopax.process_owl(biopax_ras_owl_path) # OWL file not found; do query and save to file else: if (len(self.gene_list) < 2) and (query == 'pathsbetween'): logger.warning('Using neighborhood query for one gene.') query = 'neighborhood' if query == 'pathsbetween': if len(self.gene_list) > 60: block_size = 60 else: block_size = None bp = biopax.process_pc_pathsbetween( self.gene_list, database_filter=database_filter, block_size=block_size) elif query == 'neighborhood': bp = biopax.process_pc_neighborhood( self.gene_list, database_filter=database_filter) else: logger.error('Invalid query type: %s' % query) return [] # Save the file if we're caching if self.basename is not None: bp.save_model(biopax_ras_owl_path) # Save statements to pickle file if we're caching if self.basename is not None: with open(biopax_stmt_path, 'wb') as f: pickle.dump(bp.statements, f) # Optionally filter out statements not involving only our gene set if filter: policy = 'one' if len(self.gene_list) > 1 else 'all' stmts = ac.filter_gene_list(bp.statements, self.gene_list, policy) else: stmts = bp.statements return stmts
if __name__ == '__main__':
    # Build per-target drug-statement HTML pages and upload them to S3.
    db = get_db('primary')
    db_curations = get_curations(db=db)
    tp = tas.process_from_web()
    #targets = ['TMPRSS2', 'ACE2', 'FURIN', 'CTSB', 'CTSL']
    targets = [
        'PIKFYVE', 'INPP5E', 'PIK3C2A', 'PIK3C2B', 'PIK3C2G',
        'PI4K2A', 'PI4K2B', 'PI4KB', 'EHD3', 'PIK3C3'
    ]
    all_stmts = []
    all_ev_counts = {}
    # NOTE(review): pickle.load on a local file -- assumes the pickle is
    # trusted; never load pickles from untrusted sources.
    with open('ctd_drugbank_tas_pikfyve.pkl', 'rb') as f:
        all_ctd_stmts = pickle.load(f)
    all_ctd_stmts = filter_neg(all_ctd_stmts)
    for target in targets:
        stmts = get_statements(target)
        fname = '%s.html' % target
        # Add the CTD statements that mention this target ('one' policy)
        ctd_stmts = ac.filter_gene_list(all_ctd_stmts, [target],
                                        policy='one')
        stmts += ctd_stmts
        all_stmts += stmts
        make_html(stmts, fname)
        s3_client = boto3.client('s3')
        with open(fname, 'r') as fh:
            html_str = fh.read()
        # Publish the page publicly under the drugs_for_target/ prefix
        s3_client.put_object(Bucket='indra-covid19',
                             Key='drugs_for_target/%s' % fname,
                             Body=html_str.encode('utf-8'),
                             ContentType='text/html',
                             ACL='public-read')
    make_drug_list(all_stmts, all_ev_counts)
# The file in which the preassembled statements will be saved pre_stmts_file = prefixed_pkl('preassembled') if reassemble: # Load various files that were previously produced sources = [ 'indradb', 'trips', 'bel', 'biopax', 'phosphosite', 'r3', 'sparser' ] stmts = [] for source in sources: stmts += ac.load_statements(prefixed_pkl(source)) stmts = ac.filter_no_hypothesis(stmts) # Fix grounding and filter to grounded entities and for proteins, # filter to the human ones stmts = ac.map_grounding(stmts) stmts = ac.filter_grounded_only(stmts) stmts = ac.filter_human_only(stmts) # Combinatorially expand protein families stmts = ac.expand_families(stmts) # Apply a strict filter to statements based on the gene names stmts = ac.filter_gene_list(stmts, gene_names, 'all') # Fix errors in references to protein sequences stmts = ac.map_sequence(stmts) # Run preassembly and save result stmts = ac.run_preassembly(stmts, return_toplevel=False) ac.dump_statements(stmts, pre_stmts_file) # Load the preassembled statements stmts = ac.load_statements(pre_stmts_file) # Run assembly into a PySB model assemble_pysb.assemble_pysb(stmts, gene_names, contextualize=True)
save=pjoin(outf, 'gmapped_prior.pkl')) reach_stmts = ac.load_statements(pjoin(outf, 'phase3_stmts.pkl')) reach_stmts = ac.filter_no_hypothesis(reach_stmts) #extra_stmts = ac.load_statements(pjoin(outf, 'extra_stmts.pkl')) extra_stmts = read_extra_sources(pjoin(outf, 'extra_stmts.pkl')) reading_stmts = reach_stmts + extra_stmts reading_stmts = ac.map_grounding(reading_stmts, save=pjoin(outf, 'gmapped_reading.pkl')) stmts = prior_stmts + reading_stmts + extra_stmts stmts = ac.filter_grounded_only(stmts) stmts = ac.filter_genes_only(stmts, specific_only=False) stmts = ac.filter_human_only(stmts) stmts = ac.expand_families(stmts) stmts = ac.filter_gene_list(stmts, data_genes, 'one') stmts = ac.map_sequence(stmts, save=pjoin(outf, 'smapped.pkl')) #stmts = ac.load_statements(pjoin(outf, 'smapped.pkl')) stmts = ac.run_preassembly(stmts, return_toplevel=False, save=pjoin(outf, 'preassembled.pkl'), poolsize=4) ### PySB assembly if 'pysb' in assemble_models: pysb_model = assemble_pysb(stmts, data_genes, pjoin(outf, 'korkut_model_pysb.py')) ### SIF assembly if 'sif' in assemble_models: sif_str = assemble_sif(stmts, data, pjoin(outf, 'PKN-korkut_all_ab.sif'))
def get_biopax_stmts(self, filter=False, query='pathsbetween', database_filter=None): """Get relevant statements from Pathway Commons. Performs a "paths between" query for the genes in :py:attr:`gene_list` and uses the results to build statements. This function caches two files: the list of statements built from the query, which is cached in `<basename>_biopax_stmts.pkl`, and the OWL file returned by the Pathway Commons Web API, which is cached in `<basename>_pc_pathsbetween.owl`. If these cached files are found, then the results are returned based on the cached file and Pathway Commons is not queried again. Parameters ---------- filter : Optional[bool] If True, includes only those statements that exclusively mention genes in :py:attr:`gene_list`. Default is False. query : Optional[str] Defined what type of query is executed. The two options are 'pathsbetween' which finds paths between the given list of genes and only works if more than 1 gene is given, and 'neighborhood' which searches the immediate neighborhood of each given gene. Note that for pathsbetween queries with more thatn 60 genes, the query will be executed in multiple blocks for scalability. database_filter: Optional[list[str]] A list of PathwayCommons databases to include in the query. Returns ------- list of :py:class:`indra.statements.Statement` List of INDRA statements extracted from Pathway Commons. 
""" # If we're using a cache, initialize the appropriate filenames if self.basename is not None: biopax_stmt_path = '%s_biopax_stmts.pkl' % self.basename biopax_ras_owl_path = '%s_pc_pathsbetween.owl' % self.basename # Check for cached Biopax stmt file at the given path # if it's there, return the statements from the cache if self.basename is not None and os.path.isfile(biopax_stmt_path): logger.info("Loading Biopax statements from %s" % biopax_stmt_path) with open(biopax_stmt_path, 'rb') as f: bp_statements = pickle.load(f) return bp_statements # Check for cached file before querying Pathway Commons Web API if self.basename is not None and os.path.isfile(biopax_ras_owl_path): logger.info("Loading Biopax from OWL file %s" % biopax_ras_owl_path) bp = biopax.process_owl(biopax_ras_owl_path) # OWL file not found; do query and save to file else: if (len(self.gene_list) < 2) and (query == 'pathsbetween'): logger.warning('Using neighborhood query for one gene.') query = 'neighborhood' if query == 'pathsbetween': if len(self.gene_list) > 60: block_size = 60 else: block_size = None bp = biopax.process_pc_pathsbetween(self.gene_list, database_filter=database_filter, block_size=block_size) elif query == 'neighborhood': bp = biopax.process_pc_neighborhood(self.gene_list, database_filter=database_filter) else: logger.error('Invalid query type: %s' % query) return [] # Save the file if we're caching if self.basename is not None: bp.save_model(biopax_ras_owl_path) # Save statements to pickle file if we're caching if self.basename is not None: with open(biopax_stmt_path, 'wb') as f: pickle.dump(bp.statements, f) # Optionally filter out statements not involving only our gene set if filter: policy = 'one' if len(self.gene_list) > 1 else 'all' stmts = ac.filter_gene_list(bp.statements, self.gene_list, policy) else: stmts = bp.statements return stmts
def assemble_pysb(stmts, data_genes, contextualize=False):
    """Assemble a PySB model from the given statements.

    Filters and simplifies the statements, saves them, assembles a generic
    PySB model, and, if contextualize is True, also builds a
    cell-line-specific model per entry in `cell_lines`.

    Parameters
    ----------
    stmts : list
        INDRA Statements to assemble.
    data_genes : list[str]
        Gene names; only statements whose agents are all in this list are
        kept ('all' policy).
    contextualize : Optional[bool]
        If True, additionally assemble per-cell-line models.
    """
    # Filter the INDRA Statements to be put into the model
    stmts = ac.filter_by_type(stmts, Complex, invert=True)
    stmts = ac.filter_direct(stmts)
    stmts = ac.filter_belief(stmts, 0.95)
    stmts = ac.filter_top_level(stmts)
    # Strip the extraneous supports/supported by here
    strip_supports(stmts)
    stmts = ac.filter_gene_list(stmts, data_genes, 'all')
    stmts = ac.filter_enzyme_kinase(stmts)
    stmts = ac.filter_mod_nokinase(stmts)
    stmts = ac.filter_transcription_factor(stmts)
    # Simplify activity types
    ml = MechLinker(stmts)
    ml.gather_explicit_activities()
    ml.reduce_activities()
    ml.gather_modifications()
    ml.reduce_modifications()
    stmts = normalize_active_forms(ml.statements)
    # Replace activations when possible
    ml = MechLinker(stmts)
    ml.gather_explicit_activities()
    ml.replace_activations()
    # Require active forms
    ml.require_active_forms()
    num_stmts = len(ml.statements)
    while True:
        # Remove inconsequential PTMs
        ml.statements = ac.filter_inconsequential_mods(ml.statements,
                                                       get_mod_whitelist())
        ml.statements = ac.filter_inconsequential_acts(ml.statements,
                                                       get_mod_whitelist())
        # Stop once a pass removes nothing more (fixed point reached)
        if num_stmts <= len(ml.statements):
            break
        num_stmts = len(ml.statements)
    stmts = ml.statements
    # Save the Statements here
    ac.dump_statements(stmts, prefixed_pkl('pysb_stmts'))
    # Add drug target Statements
    drug_target_stmts = get_drug_target_statements()
    stmts += drug_target_stmts
    # Just generate the generic model
    pa = PysbAssembler()
    pa.add_statements(stmts)
    model = pa.make_model()
    with open(prefixed_pkl('pysb_model'), 'wb') as f:
        pickle.dump(model, f)
    # Run this extra part only if contextualize is set to True
    if not contextualize:
        return
    # Cell lines without data get the generic (uncontextualized) statements
    cell_lines_no_data = ['COLO858', 'K2', 'MMACSF', 'MZ7MEL', 'WM1552C']
    for cell_line in cell_lines:
        if cell_line not in cell_lines_no_data:
            stmtsc = contextualize_stmts(stmts, cell_line, data_genes)
        else:
            stmtsc = stmts
        pa = PysbAssembler()
        pa.add_statements(stmtsc)
        model = pa.make_model()
        if cell_line not in cell_lines_no_data:
            contextualize_model(model, cell_line, data_genes)
        ac.dump_statements(stmtsc, prefixed_pkl('pysb_stmts_%s' % cell_line))
        with open(prefixed_pkl('pysb_model_%s' % cell_line), 'wb') as f:
            pickle.dump(model, f)