def test_map_sequence():
    """Check ``ac.map_sequence`` on three MAPK1 phosphorylation statements.

    The T182 and T185 statements are each expected to come back as exactly
    one statement whose position is '185'; the Y999 statement is expected to
    produce an empty result.
    """
    mapk1 = Agent('MAPK1', db_refs={'UP': 'P28482', 'HGNC': '6871'})
    # Each case pairs an input statement with the positions expected in the
    # output of map_sequence when that statement is passed alone.
    cases = (
        (Phosphorylation(None, mapk1, 'T', '182'), ['185']),
        (Phosphorylation(None, mapk1, 'T', '185'), ['185']),
        (Phosphorylation(None, mapk1, 'Y', '999'), []),
    )
    for stmt, expected_positions in cases:
        mapped = ac.map_sequence([stmt])
        assert [s.position for s in mapped] == expected_positions
def test_map_sequence():
    """Check ``ac.map_sequence`` on three MAPK1 phosphorylation statements.

    Both threonine statements (T182, T185) must yield a single statement at
    position '185'; the Y999 statement must yield nothing. The output list is
    attached to the length assertions so a failure shows what came back.
    """
    agent = Agent('MAPK1', db_refs={'UP': 'P28482', 'HGNC': '6871'})
    t182 = Phosphorylation(None, agent, 'T', '182')
    t185 = Phosphorylation(None, agent, 'T', '185')
    y999 = Phosphorylation(None, agent, 'Y', '999')

    for stmt in (t182, t185):
        result = ac.map_sequence([stmt])
        assert len(result) == 1, result
        assert result[0].position == '185'

    result = ac.map_sequence([y999])
    assert len(result) == 0, result
def _do_old_fashioned_preassembly(stmts):
    """Run grounding mapping, sequence mapping and preassembly in order.

    Parameters
    ----------
    stmts : list
        Statements handed to ``ac.map_grounding``.

    Returns
    -------
    The value returned by ``ac.run_preassembly`` when called with
    ``return_toplevel=False``.
    """
    return ac.run_preassembly(
        ac.map_sequence(
            ac.map_grounding(stmts, use_adeft=True, gilda_mode='local'),
            use_cache=True,
        ),
        return_toplevel=False,
    )
def respond_get_paper_model(self, content):
    """Get and display the model from a paper, indicated by pmid.

    Parameters
    ----------
    content : object
        Request content; ``content.gets('pmid')`` is expected to yield a
        string of the form ``'PMID-<digits>'``.

    Returns
    -------
    The result of ``self.make_failure('BAD_INPUT')`` when the pmid argument
    is missing or malformed, the result of
    ``self.make_failure('MISSING_MECHANISM')`` when the lookup reports a 404
    with an 'Invalid or unavailable' reason, and otherwise a
    ``KQMLPerformative('SUCCESS')`` carrying 'relations-found' (plus
    'dump-limit' when statements were found).

    Raises
    ------
    IndraDBRestAPIError
        Re-raised unchanged for any lookup error other than the 404 case
        described above.
    """
    prefix = 'PMID-'
    pmid_raw = content.gets('pmid')
    # A missing 'pmid' argument (None) is bad input, not a reason to crash
    # with AttributeError on the string checks below.
    if not pmid_raw or not pmid_raw.startswith(prefix):
        return self.make_failure('BAD_INPUT')
    pmid = pmid_raw[len(prefix):]
    if not pmid.isdigit():
        return self.make_failure('BAD_INPUT')

    try:
        stmts = get_statements_for_paper([('pmid', pmid)])
    except IndraDBRestAPIError as e:
        if e.status_code == 404 and 'Invalid or unavailable' in e.reason:
            logger.error("Could not find pmid: %s", e.reason)
            return self.make_failure('MISSING_MECHANISM')
        # Bare raise keeps the original traceback intact.
        raise

    if not stmts:
        resp = KQMLPerformative('SUCCESS')
        resp.set('relations-found', 0)
        return resp

    stmts = ac.map_grounding(stmts)
    stmts = ac.map_sequence(stmts)
    unique_stmts = ac.run_preassembly(stmts, return_toplevel=True)
    # NOTE(review): diagrams are built from the sequence-mapped statements,
    # while the reported count uses the preassembled ones -- confirm that
    # this difference is intended.
    diagrams = _make_diagrams(stmts)
    self.send_display_model(diagrams)
    resp = KQMLPerformative('SUCCESS')
    resp.set('relations-found', len(unique_stmts))
    resp.set('dump-limit', str(DUMP_LIMIT))
    return resp
def respond_get_paper_model(self, content):
    """Get and display the model from a paper, indicated by pmid.

    Parameters
    ----------
    content : object
        Request content; ``content.gets('pmid')`` is expected to yield a
        string of the form ``'PMID-<digits>'``.

    Returns
    -------
    The result of ``self.make_failure('BAD_INPUT')`` when the pmid argument
    is missing or malformed, the result of
    ``self.make_failure('MISSING_MECHANISM')`` when the lookup reports a 404
    with an 'Invalid or unavailable' reason, and otherwise a
    ``KQMLPerformative('SUCCESS')`` carrying 'relations-found' (plus
    'dump-limit' when statements were found).

    Raises
    ------
    IndraDBRestAPIError
        Re-raised unchanged for any lookup error other than the 404 case
        described above.
    """
    prefix = 'PMID-'
    pmid_raw = content.gets('pmid')
    # A missing 'pmid' argument (None) is bad input, not a reason to crash
    # with AttributeError on the string checks below.
    if not pmid_raw or not pmid_raw.startswith(prefix):
        return self.make_failure('BAD_INPUT')
    pmid = pmid_raw[len(prefix):]
    if not pmid.isdigit():
        return self.make_failure('BAD_INPUT')

    try:
        stmts = get_statements_for_paper([('pmid', pmid)],
                                         simple_response=True)
    except IndraDBRestAPIError as e:
        if e.status_code == 404 and 'Invalid or unavailable' in e.reason:
            logger.error("Could not find pmid: %s", e.reason)
            return self.make_failure('MISSING_MECHANISM')
        # Bare raise keeps the original traceback intact.
        raise

    if not stmts:
        resp = KQMLPerformative('SUCCESS')
        resp.set('relations-found', 0)
        return resp

    stmts = ac.map_grounding(stmts)
    stmts = ac.map_sequence(stmts)
    unique_stmts = ac.run_preassembly(stmts, return_toplevel=True)
    # NOTE(review): diagrams are built from the sequence-mapped statements,
    # while the reported count uses the preassembled ones -- confirm that
    # this difference is intended.
    diagrams = _make_diagrams(stmts)
    self.send_display_model(diagrams)
    resp = KQMLPerformative('SUCCESS')
    resp.set('relations-found', len(unique_stmts))
    resp.set('dump-limit', str(DUMP_LIMIT))
    return resp
def _make_unique_statement_set(self, stmt_tpls):
    """Ground, sequence-map and de-duplicate a set of raw statements.

    Parameters
    ----------
    stmt_tpls : iterable of (sid, stmt) pairs
        Raw statements together with the id each one is tracked under.

    Returns
    -------
    unique_stmts : list
        One generic copy per matching group, taken from the group's first
        member.
    evidence_links
        The result of ``flatten_evidence_dict`` applied to a mapping from
        each unique statement's shallow hash to the set of ids of the raw
        statements in its group.
    """
    sid_by_uuid = {}
    raw_stmts = []
    for sid, raw in stmt_tpls:
        raw_stmts.append(raw)
        sid_by_uuid[raw.uuid] = sid

    mapped = ac.map_sequence(ac.map_grounding(raw_stmts))
    groups = self.pa._get_stmt_matching_groups(mapped)

    unique_stmts = []
    links = defaultdict(set)
    for _, group in groups:
        # The first member of each group is the representative; every
        # member (including the first) contributes an evidence link to it.
        for position, member in enumerate(group):
            if position == 0:
                representative = member.make_generic_copy()
                rep_hash = representative.get_hash(shallow=True)
            links[rep_hash].add(sid_by_uuid[member.uuid])
        # This should never be None or anything else
        assert isinstance(representative, type(member))
        unique_stmts.append(representative)
    return unique_stmts, flatten_evidence_dict(links)
def run_assembly(self):
    """Run INDRA's assembly pipeline on the Statements.

    Optional steps are switched on or off by keys read from
    ``self.assembly_config``. The final list of statements is stored in
    ``self.assembled_stmts``; nothing is returned.
    """
    self.eliminate_copies()
    stmts = self.get_indra_stmts()
    stmts = self.filter_event_association(stmts)
    stmts = ac.filter_no_hypothesis(stmts)
    # Grounding mapping runs unless explicitly skipped
    if not self.assembly_config.get('skip_map_grounding'):
        stmts = ac.map_grounding(stmts)
    if self.assembly_config.get('standardize_names'):
        # The return value is not used here
        ac.standardize_names_groundings(stmts)
    if self.assembly_config.get('filter_ungrounded'):
        # score_threshold is passed through as-is (None if not configured)
        score_threshold = self.assembly_config.get('score_threshold')
        stmts = ac.filter_grounded_only(stmts,
                                        score_threshold=score_threshold)
    if self.assembly_config.get('merge_groundings'):
        stmts = ac.merge_groundings(stmts)
    if self.assembly_config.get('merge_deltas'):
        stmts = ac.merge_deltas(stmts)
    relevance_policy = self.assembly_config.get('filter_relevance')
    if relevance_policy:
        stmts = self.filter_relevance(stmts, relevance_policy)
    # Human-only filtering and sequence mapping run unless explicitly skipped
    if not self.assembly_config.get('skip_filter_human'):
        stmts = ac.filter_human_only(stmts)
    if not self.assembly_config.get('skip_map_sequence'):
        stmts = ac.map_sequence(stmts)
    # Use WM hierarchies and belief scorer for WM preassembly
    preassembly_mode = self.assembly_config.get('preassembly_mode')
    if preassembly_mode == 'wm':
        hierarchies = get_wm_hierarchies()
        belief_scorer = get_eidos_scorer()
        stmts = ac.run_preassembly(stmts, return_toplevel=False,
                                   belief_scorer=belief_scorer,
                                   hierarchies=hierarchies)
    else:
        stmts = ac.run_preassembly(stmts, return_toplevel=False)
    # A cutoff of 0 is still applied; only a missing (None) cutoff is skipped
    belief_cutoff = self.assembly_config.get('belief_cutoff')
    if belief_cutoff is not None:
        stmts = ac.filter_belief(stmts, belief_cutoff)
    stmts = ac.filter_top_level(stmts)
    # NOTE(review): the three filters after filter_direct are assumed to be
    # part of the 'filter_direct' branch -- confirm against the intended
    # nesting.
    if self.assembly_config.get('filter_direct'):
        stmts = ac.filter_direct(stmts)
        stmts = ac.filter_enzyme_kinase(stmts)
        stmts = ac.filter_mod_nokinase(stmts)
        stmts = ac.filter_transcription_factor(stmts)
    if self.assembly_config.get('mechanism_linking'):
        ml = MechLinker(stmts)
        ml.gather_explicit_activities()
        ml.reduce_activities()
        ml.gather_modifications()
        ml.reduce_modifications()
        # Explicit activities are gathered a second time after the
        # reduction steps above
        ml.gather_explicit_activities()
        ml.replace_activations()
        ml.require_active_forms()
        stmts = ml.statements
    self.assembled_stmts = stmts
def process_statements(stmts, **generate_id_map_kwargs):
    """Map, de-duplicate and compute match-key maps for statements.

    Parameters
    ----------
    stmts : list
        Statements passed through ``ac.map_grounding`` and
        ``ac.map_sequence`` before de-duplication.
    **generate_id_map_kwargs
        Forwarded unchanged to ``get_match_key_maps``.

    Returns
    -------
    tuple
        ``(unique_stmts, match_key_maps)`` as produced by
        ``make_unique_statement_set`` and ``get_match_key_maps``.
    """
    seq_mapped = ac.map_sequence(ac.map_grounding(stmts))
    preassembler = Preassembler(hierarchies)
    unique = make_unique_statement_set(preassembler, seq_mapped)
    key_maps = get_match_key_maps(preassembler, unique,
                                  **generate_id_map_kwargs)
    return unique, key_maps
def get_omnipath_stmts():
    """Return sequence-mapped, human-only (de)phosphorylations from OmniPath.

    All modifications are fetched via ``omnipath_client``, narrowed to
    Phosphorylation and Dephosphorylation statements (in that order), then
    passed through ``ac.map_sequence`` and ``ac.filter_human_only``.
    """
    all_mods = omnipath_client.get_all_modifications()
    by_type = [ac.filter_by_type(all_mods, stmt_type)
               for stmt_type in (Phosphorylation, Dephosphorylation)]
    site_stmts = ac.map_sequence(by_type[0] + by_type[1])
    # Genes-only filtering is currently disabled:
    # ac.filter_genes_only(stmts, specific_only=True)
    return ac.filter_human_only(site_stmts)
def map_sequence():
    """Map sequence on a list of INDRA Statements.

    Reads a JSON body from the current request, takes its 'statements'
    entry, runs ``ac.map_sequence`` on the deserialized statements and
    returns the result via ``_return_stmts``. An OPTIONS request is answered
    with an empty dict.
    """
    if request.method == 'OPTIONS':
        return {}
    payload = json.loads(request.body.read().decode('utf-8'))
    stmts = stmts_from_json(payload.get('statements'))
    return _return_stmts(ac.map_sequence(stmts))
def run_assembly(stmts, filename):
    """Assemble statements restricted to ``gene_names`` and dump them.

    Parameters
    ----------
    stmts : list
        Input statements.
    filename : str
        Destination handed to ``ac.dump_statements``.

    Returns
    -------
    The preassembled statements (``return_toplevel=False``).
    """
    grounded = ac.filter_grounded_only(ac.map_grounding(stmts))
    human = ac.filter_human_only(grounded)
    # Family expansion is currently disabled: ac.expand_families(stmts)
    relevant = ac.filter_gene_list(human, gene_names, 'one',
                                   allow_families=True)
    assembled = ac.run_preassembly(ac.map_sequence(relevant),
                                   return_toplevel=False, poolsize=4)
    ac.dump_statements(assembled, filename)
    return assembled
def test_readme_pipeline():
    """Run the README assembly pipeline on ``gn_stmts``.

    The test passes when the pipeline, ending in a belief filter at 0.8,
    leaves a non-empty result.
    """
    # Added only here, not in docs
    from indra.tools import assemble_corpus as ac

    # Single-argument steps, applied in this order.
    simple_steps = (
        ac.filter_no_hypothesis,
        ac.map_grounding,
        ac.filter_grounded_only,
        ac.filter_human_only,
        ac.map_sequence,
    )
    stmts = gn_stmts
    for step in simple_steps:
        stmts = step(stmts)
    stmts = ac.run_preassembly(stmts, return_toplevel=False)
    stmts = ac.filter_belief(stmts, 0.8)
    assert stmts, 'Update example to yield statements list of non-zero length'
def _clean_statements(self, stmts):
    """Perform grounding mapping and sequence mapping on the statements.

    Progress is reported through ``self._log`` before each step.

    Returns
    -------
    The output of ``ac.map_sequence`` (called with ``use_cache=True``) on
    the grounding-mapped statements.
    """
    self._log("Map grounding...")
    grounded = ac.map_grounding(stmts)
    self._log("Map sequences...")
    return ac.map_sequence(grounded, use_cache=True)
def test_map_sequence_blank_entries():
    """Make sure sites curated as erroneous with no mappings don't get
    treated as valid mappings."""
    mapk1_refs = {'UP': 'P28482'}
    rps6_refs = {'UP': 'P62753'}
    kinase = Agent('MAPK1', db_refs=mapk1_refs)
    substrate = Agent('RPS6', db_refs=rps6_refs)
    # RPS6 carrying the T389 site as an agent condition rather than as the
    # statement's own site.
    modified_rps6 = Agent(
        'RPS6',
        mods=[ModCondition('phosphorylation', 'T', '389')],
        db_refs=rps6_refs,
    )
    stmts = [
        Phosphorylation(kinase, substrate, 'T', '389'),
        Phosphorylation(modified_rps6, kinase, 'T', '185'),
    ]
    assert len(ac.map_sequence(stmts)) == 0
def preassemble(self, filters=None, grounding_map=None):
    """Preassemble the Statements collected in the model.

    Runs hypothesis filtering, grounding mapping, sequence mapping and
    preassembly on the model's statements, applies the relevance filter,
    and stores the result in ``self.assembled_stmts``.

    Currently the following filter options are implemented:
    - grounding: require that all Agents in statements are grounded
    - human_only: require that all proteins are human proteins
    - prior_one: require that at least one Agent is in the prior model
    - prior_all: require that all Agents are in the prior model

    Parameters
    ----------
    filters : Optional[list[str]]
        A list of filter options to apply when choosing the statements.
        See description above for more details. Default: None
    grounding_map : Optional[dict]
        A user supplied grounding map which maps a string to a
        dictionary of database IDs (in the format used by Agents'
        db_refs).
    """
    requested = filters or ()

    # Filter out hypotheses
    stmts = ac.filter_no_hypothesis(self.get_statements())

    # Fix grounding; the custom map is only forwarded when one was given
    grounding_kwargs = {}
    if grounding_map is not None:
        grounding_kwargs['grounding_map'] = grounding_map
    stmts = ac.map_grounding(stmts, **grounding_kwargs)
    if 'grounding' in requested:
        stmts = ac.filter_grounded_only(stmts)

    # Fix sites
    stmts = ac.map_sequence(stmts)
    if 'human_only' in requested:
        stmts = ac.filter_human_only(stmts)

    # Run preassembly, then the relevance filter, and save the outcome
    stmts = ac.run_preassembly(stmts, return_toplevel=False)
    self.assembled_stmts = self._relevance_filter(stmts, filters)
def map_grounding():
    """Map sequence on a list of INDRA Statements.

    Reads a JSON body from the current request, runs ``ac.map_sequence`` on
    its 'statements' entry and returns ``{'statements': [...]}`` with the
    serialized output (an empty list when nothing comes back).

    NOTE(review): the function is named ``map_grounding`` but performs
    sequence mapping -- confirm whether the name or the call is the
    intended one.
    """
    payload = json.loads(request.body.read().decode('utf-8'))
    stmts = stmts_from_json(payload.get('statements'))
    mapped = ac.map_sequence(stmts)
    return {'statements': stmts_to_json(mapped) if mapped else []}
def run_assembly(stmts, save_file):
    """Assemble statements restricted to ``gene_names`` and dump them.

    Parameters
    ----------
    stmts : list
        Input statements.
    save_file : str
        Destination handed to ``ac.dump_statements``.

    Returns
    -------
    The statements remaining after the final filter.
    """
    # Steps before the gene-list restriction
    for step in (ac.map_grounding, ac.filter_grounded_only,
                 ac.filter_human_only, ac.expand_families):
        stmts = step(stmts)
    stmts = ac.filter_gene_list(stmts, gene_names, 'one')
    stmts = ac.run_preassembly(ac.map_sequence(stmts),
                               return_toplevel=False)
    stmts = ac.filter_belief(stmts, 0.95)
    # Post-preassembly filters
    for step in (ac.filter_top_level, ac.filter_direct,
                 ac.filter_enzyme_kinase):
        stmts = step(stmts)
    ac.dump_statements(stmts, save_file)
    return stmts
def run_assembly(self):
    """Run INDRA's assembly pipeline on the Statements.

    Returns
    -------
    stmts : list[indra.statements.Statement]
        The list of assembled INDRA Statements.
    """
    # NOTE(review): 'get_indra_smts' looks like a misspelling of
    # 'get_indra_stmts' -- it must match the method defined on the class.
    raw = self.get_indra_smts()
    mapped = ac.map_sequence(ac.map_grounding(ac.filter_no_hypothesis(raw)))
    return ac.run_preassembly(ac.filter_human_only(mapped),
                              return_toplevel=False)
def get_indra_phos_stmts():
    """Collect, assemble and dump (de)phosphorylation statements.

    Intermediate results are written to 'sources/indra_phos_sitemap.pkl'
    (after sequence mapping) and 'sources/indra_phos_stmts_pre.pkl' (by
    preassembly); the final filtered list is written to
    'sources/indra_phos_stmts.pkl' and returned.
    """
    collected = by_gene_role_type(stmt_type='Phosphorylation')
    collected += by_gene_role_type(stmt_type='Dephosphorylation')

    grounded = ac.map_grounding(collected)
    # Expand families before site mapping
    expanded = ac.filter_grounded_only(ac.expand_families(grounded))
    site_mapped = ac.map_sequence(expanded)
    ac.dump_statements(site_mapped, 'sources/indra_phos_sitemap.pkl')

    assembled = ac.run_preassembly(site_mapped, poolsize=4,
                                   save='sources/indra_phos_stmts_pre.pkl')
    final = ac.filter_genes_only(ac.filter_human_only(assembled),
                                 specific_only=True)
    ac.dump_statements(final, 'sources/indra_phos_stmts.pkl')
    return final
def _clean_statements(self, stmts):
    """Perform grounding and sequence mapping, tracking dropped statements.

    Returns
    -------
    stmts
        The output of ``ac.map_sequence`` on the grounding-mapped input.
    eliminated_uuids : dict
        Maps 'grounding' to the uuids present in the input but absent after
        grounding mapping, and 'sequence mapping' to the uuids present
        after grounding mapping but absent after sequence mapping.
    """
    def uuids_of(stmt_list):
        return {s.uuid for s in stmt_list}

    input_uuids = uuids_of(stmts)

    self._log("Map grounding...")
    grounded = ac.map_grounding(stmts, use_adeft=True, gilda_mode='local')
    grounded_uuids = uuids_of(grounded)

    self._log("Map sequences...")
    seq_mapped = ac.map_sequence(grounded, use_cache=True)
    seq_mapped_uuids = uuids_of(seq_mapped)

    eliminated = {
        'grounding': input_uuids - grounded_uuids,
        'sequence mapping': grounded_uuids - seq_mapped_uuids,
    }
    return seq_mapped, eliminated
def load_statements_from_synapse(synapse_id='syn11273504'):
    """Build Phosphorylation statements from a tab-separated Synapse file.

    Each row is read as ``<substrate>:<residue><position>`` in the first
    column and a comma-separated list of enzyme names in the second; one
    statement is created per enzyme.

    Parameters
    ----------
    synapse_id : str
        Identifier of the Synapse entity to download.

    Returns
    -------
    The statements after ``ac.map_sequence``, ``ac.filter_human_only`` and
    ``ac.filter_genes_only(..., specific_only=True)``.
    """
    client = synapseclient.Synapse()
    client.login()
    # Obtain a pointer and download the data
    entity = client.get(synapse_id)

    phos_stmts = []
    for row in read_unicode_csv(entity.path, delimiter='\t'):
        substrate_name, site = row[0].split(':')
        residue, position = site[0], site[1:]
        for enzyme_name in row[1].split(','):
            # Fresh Agent objects are created for every statement
            enzyme = Agent(enzyme_name, db_refs=get_ids(enzyme_name))
            substrate = Agent(substrate_name,
                              db_refs=get_ids(substrate_name))
            phos_stmts.append(
                Phosphorylation(enzyme, substrate, residue, position))

    mapped = ac.map_sequence(phos_stmts)
    human = ac.filter_human_only(mapped)
    return ac.filter_genes_only(human, specific_only=True)
def run_preassembly(self, stmts, print_summary=True):
    """Run grounding mapping, sequence mapping and preassembly.

    The output of ``ac.run_preassembly`` is stored in the attribute
    :py:attr:`results` and returned. When ``self.basename`` is not None the
    results are also pickled to `<basename>_results.pkl`.

    Parameters
    ----------
    stmts : list of :py:class:`indra.statements.Statement`
        Statements to preassemble.
    print_summary : bool
        Accepted as part of the signature; this method does not read it.

    Returns
    -------
    The value returned by ``ac.run_preassembly`` for the mapped statements.
    """
    mapped = ac.map_sequence(ac.map_grounding(stmts))
    self.results = ac.run_preassembly(mapped)
    # Nothing to save unless we're caching
    if self.basename is None:
        return self.results
    with open('%s_results.pkl' % self.basename, 'wb') as fh:
        pickle.dump(self.results, fh)
    return self.results
def pa_filter_unique_evidence(stmts):
    """Chain grounding mapping, sequence mapping and preassembly.

    Parameters
    ----------
    stmts : list[:py:class:`indra.statements.Statement`]
        Statements to process.

    Returns
    -------
    stmts : list[:py:class:`indra.statements.Statement`]
        The output of ``ac.run_preassembly`` called with
        ``return_toplevel=False``.
    """
    pipeline_input = stmts
    # Grounding first, then site standardization
    for mapper in (ac.map_grounding, ac.map_sequence):
        pipeline_input = mapper(pipeline_input)
    return ac.run_preassembly(pipeline_input, return_toplevel=False)
def test_gene_network():
    """Walk through the gene-network example chunk by chunk.

    The chunk comments mirror the numbered snippets of the example being
    tested; imports are kept inside the function, next to the chunk that
    uses them.
    """
    # Chunk 1: this is tested in _get_gene_network_stmts
    # from indra.tools.gene_network import GeneNetwork
    # gn = GeneNetwork(['H2AX'])
    # biopax_stmts = gn.get_biopax_stmts()
    # bel_stmts = gn.get_bel_stmts()

    # Chunk 2
    from indra import literature
    pmids = literature.pubmed_client.get_ids_for_gene('H2AX')

    # Chunk 3
    from indra import literature
    paper_contents = {}
    for pmid in pmids:
        content, content_type = literature.get_full_text(pmid, 'pmid')
        if content_type == 'abstract':
            paper_contents[pmid] = content
        if len(paper_contents) == 5:  # Is 10 in actual code
            break

    # Chunk 4
    from indra.sources import reach
    literature_stmts = []
    for pmid, content in paper_contents.items():
        rp = reach.process_text(content, url=reach.local_text_url)
        literature_stmts += rp.statements
    print('Got %d statements' % len(literature_stmts))
    assert literature_stmts  # replaces a print statement

    # Chunk 6
    from indra.tools import assemble_corpus as ac
    # stmts = biopax_stmts + bel_stmts + literature_stmts  # tested elsewhere
    stmts = gn_stmts + literature_stmts  # Added instead of above line
    stmts = ac.map_grounding(stmts)
    stmts = ac.map_sequence(stmts)
    stmts = ac.run_preassembly(stmts)
    assert stmts

    # Chunk 7
    from indra.assemblers.cx import CxAssembler
    from indra.databases import ndex_client
    cxa = CxAssembler(stmts)
    cx_str = cxa.make_model()
    assert cx_str

    # Chunk 8
    # ndex_cred = {'user': '******', 'password': '******'}
    # network_id = ndex_client.create_network(cx_str, ndex_cred)
    # print(network_id)

    # Chunk 9
    from indra.assemblers.indranet import IndraNetAssembler
    indranet_assembler = IndraNetAssembler(statements=stmts)
    indranet = indranet_assembler.make_model()
    assert len(indranet.nodes) > 0, 'indranet contains no nodes'
    assert len(indranet.edges) > 0, 'indranet contains no edges'

    # Chunk 10
    import networkx as nx
    paths = nx.single_source_shortest_path(G=indranet, source='H2AX',
                                           cutoff=1)
    assert paths

    # Chunk 11
    from indra.assemblers.pysb import PysbAssembler
    pysb = PysbAssembler(statements=stmts)
    pysb_model = pysb.make_model()
    assert pysb_model
else: #prior_stmts = build_prior(data_genes, pjoin(outf, 'prior.pkl')) prior_stmts = ac.load_statements(pjoin(outf, 'prior.pkl')) prior_stmts = ac.map_grounding(prior_stmts, save=pjoin(outf, 'gmapped_prior.pkl')) reading_stmts = ac.load_statements(pjoin(outf, 'phase3_stmts.pkl')) reading_stmts = ac.map_grounding(reading_stmts, save=pjoin(outf, 'gmapped_reading.pkl')) stmts = prior_stmts + reading_stmts stmts = ac.filter_grounded_only(stmts) stmts = ac.filter_genes_only(stmts, specific_only=False) stmts = ac.filter_human_only(stmts) stmts = ac.expand_families(stmts) stmts = ac.filter_gene_list(stmts, data_genes, 'one') stmts = ac.map_sequence(stmts, save=pjoin(outf, 'smapped.pkl')) stmts = ac.run_preassembly(stmts, return_toplevel=False, save=pjoin(outf, 'preassembled.pkl')) assemble_models = [] assemble_models.append('sif') assemble_models.append('pysb') assemble_models.append('cx') ### PySB assembly if 'pysb' in assemble_models: pysb_model = assemble_pysb(stmts, data_genes, pjoin(outf, 'korkut_model_pysb.py')) ### SIF assembly if 'sif' in assemble_models: sif_str = assemble_sif(stmts, data, pjoin(outf, 'PKN-korkut_all_ab.sif'))
# The file in which the preassembled statements will be saved pre_stmts_file = prefixed_pkl('preassembled') if reassemble: # Load various files that were previously produced sources = [ 'indradb', 'trips', 'bel', 'biopax', 'phosphosite', 'r3', 'sparser' ] stmts = [] for source in sources: stmts += ac.load_statements(prefixed_pkl(source)) stmts = ac.filter_no_hypothesis(stmts) # Fix grounding and filter to grounded entities and for proteins, # filter to the human ones stmts = ac.map_grounding(stmts) stmts = ac.filter_grounded_only(stmts) stmts = ac.filter_human_only(stmts) # Combinatorially expand protein families stmts = ac.expand_families(stmts) # Apply a strict filter to statements based on the gene names stmts = ac.filter_gene_list(stmts, gene_names, 'all') # Fix errors in references to protein sequences stmts = ac.map_sequence(stmts) # Run preassembly and save result stmts = ac.run_preassembly(stmts, return_toplevel=False) ac.dump_statements(stmts, pre_stmts_file) # Load the preassembled statements stmts = ac.load_statements(pre_stmts_file) # Run assembly into a PySB model assemble_pysb.assemble_pysb(stmts, gene_names, contextualize=True)
# Load the reading output and drop hypotheses before combining sources
reach_stmts = ac.load_statements(pjoin(outf, 'phase3_stmts.pkl'))
reach_stmts = ac.filter_no_hypothesis(reach_stmts)
#extra_stmts = ac.load_statements(pjoin(outf, 'extra_stmts.pkl'))
extra_stmts = read_extra_sources(pjoin(outf, 'extra_stmts.pkl'))
reading_stmts = reach_stmts + extra_stmts
reading_stmts = ac.map_grounding(reading_stmts,
                                 save=pjoin(outf, 'gmapped_reading.pkl'))
# NOTE(review): extra_stmts was already concatenated into reading_stmts
# above, so it appears to enter the combined list twice (once
# grounding-mapped, once raw) -- confirm whether that is intended.
stmts = prior_stmts + reading_stmts + extra_stmts
stmts = ac.filter_grounded_only(stmts)
stmts = ac.filter_genes_only(stmts, specific_only=False)
stmts = ac.filter_human_only(stmts)
stmts = ac.expand_families(stmts)
stmts = ac.filter_gene_list(stmts, data_genes, 'one')
stmts = ac.map_sequence(stmts, save=pjoin(outf, 'smapped.pkl'))
#stmts = ac.load_statements(pjoin(outf, 'smapped.pkl'))
stmts = ac.run_preassembly(stmts, return_toplevel=False,
                           save=pjoin(outf, 'preassembled.pkl'),
                           poolsize=4)

### PySB assembly
if 'pysb' in assemble_models:
    pysb_model = assemble_pysb(stmts, data_genes,
                               pjoin(outf, 'korkut_model_pysb.py'))
### SIF assembly
if 'sif' in assemble_models:
    sif_str = assemble_sif(stmts, data,
                           pjoin(outf, 'PKN-korkut_all_ab.sif'))
### CX assembly
def _do_old_fashioned_preassembly(stmts):
    """Run grounding mapping, sequence mapping and preassembly in order.

    Parameters
    ----------
    stmts : list
        Statements handed to ``ac.map_grounding``.

    Returns
    -------
    The value returned by ``ac.run_preassembly`` when called with
    ``return_toplevel=False``.
    """
    current = stmts
    for stage in (ac.map_grounding, ac.map_sequence):
        current = stage(current)
    return ac.run_preassembly(current, return_toplevel=False)