def test_filter_emmaa_stmts():
    """Check metadata filtering on statements with and without metadata."""
    # NOTE(review): ``stmt``, ``date`` and ``search_terms`` are not defined
    # in this function; presumably module-level fixtures -- confirm.
    internal_stmt = EmmaaStatement(
        stmt, date, search_terms, {'internal': True})
    external_stmt = EmmaaStatement(
        stmt, date, search_terms, {'internal': False})
    legacy_stmt = EmmaaStatement(stmt, date, search_terms)
    # Imitate an older style statement without a metadata attribute.
    del legacy_stmt.metadata

    candidates = [internal_stmt, external_stmt, legacy_stmt]
    # Only the statement with internal False should be filtered out.
    filtered_estmts = filter_emmaa_stmts_by_metadata(
        candidates, {'internal': True})
    assert len(filtered_estmts) == 2
    assert internal_stmt in filtered_estmts
    assert legacy_stmt in filtered_estmts
def test_model_extend():
    """Check EmmaaModel.extend_unique with repeated and new statements."""
    first_ev = Evidence(pmid='1234', text='abcd', source_api='x')
    other_ev = Evidence(pmid='1234', text='abcde', source_api='x')
    # Built from the same arguments as first_ev.
    repeat_ev = Evidence(pmid='1234', text='abcd', source_api='x')

    phos_stmts = []
    for evidence in (first_ev, other_ev, repeat_ev):
        phos_stmts.append(
            Phosphorylation(None, Agent('a'), evidence=evidence))
    wrapped = []
    for phos in phos_stmts:
        wrapped.append(EmmaaStatement(phos, datetime.datetime.now(), []))

    model = EmmaaModel('x', {'search_terms': [], 'ndex': {'network': None}})
    head, *tail = wrapped
    model.add_statements([head])
    # Of the two remaining statements only one is expected to be added.
    model.extend_unique(tail)
    assert len(model.stmts) == 2

    # A statement about a different agent is expected to be added.
    new_agent_stmt = EmmaaStatement(
        Phosphorylation(None, Agent('b'), evidence=first_ev),
        datetime.datetime.now(), [])
    model.extend_unique([new_agent_stmt])
    assert len(model.stmts) == 3
def generate_model(model_name):
    """Generate a simple model for end-to-end testing using natural language.

    Parameters
    ----------
    model_name
        Name handed to the EmmaaModel constructor (presumably a str).

    Returns
    -------
    tuple
        The EmmaaModel holding the statements read from the example text,
        followed by the config dict the model was created with.
    """
    # The network id is hard-coded here; the code that once assembled a CX
    # model and uploaded it to NDEx to obtain an id was disabled.
    network_id = 'a08479d1-24ce-11e9-bb6a-0ac135e8bacf'
    mapk1_term = {
        'db_refs': {'HGNC': '20974'},
        'name': 'MAPK1',
        'search_term': 'MAPK1',
        'type': 'gene',
    }
    config_dict = {
        'ndex': {'network': network_id},
        'search_terms': [mapk1_term],
    }

    processor = trips.process_text('BRAF activates MAP2K1. '
                                   'Active MAP2K1 activates MAPK1.')
    emmaa_stmts = []
    for indra_stmt in processor.statements:
        # NOTE(review): the search terms argument is a plain string here,
        # while other call sites pass a list -- confirm this is intended.
        emmaa_stmts.append(
            EmmaaStatement(indra_stmt, datetime.datetime.now(), 'MAPK1'))

    emmaa_model = EmmaaModel(model_name, config_dict)
    emmaa_model.add_statements(emmaa_stmts)
    return emmaa_model, config_dict
def read_elsevier_eidos_search_terms(piis_to_terms):
    """Return extracted EmmaaStatements given a dict of PIIs to SearchTerms.

    Parameters
    ----------
    piis_to_terms : dict
        A dict representing a set of PIIs pointing to search terms that
        produced them.

    Returns
    -------
    list[:py:class:`emmaa.model.EmmaaStatement`]
        A list of EmmaaStatements extracted from the given PIIs.
    """
    # A single timestamp, taken before reading, is shared by all statements.
    timestamp = datetime.datetime.utcnow()
    texts = read_piis(list(piis_to_terms))
    stmts_by_pii = process_texts(texts)

    estmts = []
    for pii, stmts in stmts_by_pii.items():
        for stmt in stmts:
            # Record the source PII on every evidence of the statement.
            for evidence in stmt.evidence:
                evidence.annotations['pii'] = pii
            estmts.append(
                EmmaaStatement(stmt, timestamp, piis_to_terms[pii]))
    return estmts
def read_db_ids_search_terms(id_search_terms, id_type):
    """Return EmmaaStatements from the INDRA database for an ID-term dict.

    Parameters
    ----------
    id_search_terms : dict
        A dict representing a set of IDs pointing to search terms that
        produced them.
    id_type
        Forwarded as ``id_type`` to the raw statement JSON lookup;
        presumably names the kind of ID used as keys -- confirm.

    Returns
    -------
    list[:py:class:`emmaa.model.EmmaaStatement`]
        A list of EmmaaStatements extracted from the given IDs.
    """
    # A single timestamp, taken before the query, is shared by all results.
    timestamp = datetime.datetime.utcnow()
    raw_jsons_by_id = get_raw_stmt_jsons_from_papers(
        list(id_search_terms), id_type=id_type, db=get_primary_db())

    estmts = []
    for paper_id, stmt_jsons in raw_jsons_by_id.items():
        estmts.extend(
            EmmaaStatement(stmt, timestamp, id_search_terms[paper_id])
            for stmt in stmts_from_json(stmt_jsons))
    return estmts
def read_db_pmid_search_terms(pmid_search_terms):
    """Return EmmaaStatements from the INDRA database for a PMID-term dict.

    Parameters
    ----------
    pmid_search_terms : dict
        A dict representing a set of PMIDs pointing to search terms that
        produced them.

    Returns
    -------
    list[:py:class:`emmaa.model.EmmaaStatement`]
        A list of EmmaaStatements extracted from the given PMIDs.
    """
    # A single timestamp, taken before the query, is shared by all results.
    timestamp = datetime.datetime.utcnow()
    stmts_by_pmid = get_statements_by_paper(
        list(pmid_search_terms), id_type='pmid', db=get_primary_db(),
        preassembled=False)
    return [
        EmmaaStatement(stmt, timestamp, pmid_search_terms[pmid])
        for pmid, stmts in stmts_by_pmid.items()
        for stmt in stmts
    ]
def test_model_json():
    """Test the json structure and content of EmmaaModel.to_json() output"""
    braf_text = 'BRAF activates MAP2K1.'
    map2k1_text = 'Active MAP2K1 activates MAPK1.'
    network_id = 'a08479d1-24ce-11e9-bb6a-0ac135e8bacf'

    indra_stmts = [
        Activation(Agent('BRAF', db_refs={'HGNC': '20974'}),
                   Agent('MAP2K1'),
                   evidence=[Evidence(text=braf_text)]),
        Activation(Agent('MAP2K1',
                         activity=ActivityCondition('activity', True)),
                   Agent('MAPK1'),
                   evidence=[Evidence(text=map2k1_text)]),
    ]
    st = SearchTerm('gene', 'MAP2K1', db_refs={}, search_term='MAP2K1')
    emmaa_stmts = [
        EmmaaStatement(stmt, datetime.datetime.now(), [st])
        for stmt in indra_stmts
    ]
    config_dict = {
        'ndex': {'network': network_id},
        'search_terms': [{
            'db_refs': {'HGNC': '20974'},
            'name': 'MAPK1',
            'search_term': 'MAPK1',
            'type': 'gene',
        }],
    }
    emmaa_model = EmmaaModel('test', config_dict)
    emmaa_model.add_statements(emmaa_stmts)
    emmaa_model_json = emmaa_model.to_json()

    # Test json structure
    assert emmaa_model_json['name'] == 'test'
    assert isinstance(emmaa_model_json['stmts'], list)
    assert emmaa_model_json['ndex_network'] == network_id

    # Test config
    first_term = emmaa_model_json['search_terms'][0]
    assert first_term['type'] == 'gene'
    assert first_term['db_refs'] == {'HGNC': '20974'}

    # Test json statements (each evidence text is checked exactly once)
    first_stmt = emmaa_model_json['stmts'][0]['stmt']
    second_stmt = emmaa_model_json['stmts'][1]['stmt']
    assert braf_text == first_stmt['evidence'][0]['text']
    assert map2k1_text == second_stmt['evidence'][0]['text']
    assert first_stmt['subj']['name'] == 'BRAF'
    assert second_stmt['subj']['name'] == 'MAP2K1'
    assert second_stmt['obj']['name'] == 'MAPK1'

    # Need hashes to be strings so that javascript can read them
    assert isinstance(first_stmt['evidence'][0]['source_hash'], str)
def make_gene_statements(self):
    """Generate Statements from the gene list.

    Looks up INDRA Statements for ``self.gene_list`` together with the
    names of all search terms of type 'drug', wraps each one in an
    EmmaaStatement with an empty search term list, and stores the result
    on ``self.stmts``. Nothing is returned.
    """
    drug_names = []
    for term in self.search_terms:
        if term.type == 'drug':
            drug_names.append(term.name)

    self.stmts = [
        EmmaaStatement(indra_stmt, datetime.datetime.now(), [])
        for indra_stmt in get_stmts_for_gene_list(self.gene_list, drug_names)
    ]
def create_model(relevance=None, paper_ids=None):
    """Build a small two-statement EmmaaModel named 'test' for the tests.

    Parameters
    ----------
    relevance : optional
        When truthy, a 'filter_relevance' step with this value as its
        'policy' kwarg is appended to the assembly configuration.
    paper_ids : optional
        Handed to EmmaaModel as its third positional argument.

    Returns
    -------
    EmmaaModel
        The model with both statements added.
    """
    braf_map2k1 = Activation(
        Agent('BRAF', db_refs={'HGNC': '1097'}),
        Agent('MAP2K1', db_refs={'HGNC': '6840'}),
        evidence=[Evidence(text='BRAF activates MAP2K1.',
                           source_api='assertion',
                           text_refs={'TRID': '1234'})])
    map2k1_mapk1 = Activation(
        Agent('MAP2K1', db_refs={'HGNC': '6840'},
              activity=ActivityCondition('activity', True)),
        Agent('MAPK1', db_refs={'HGNC': '6871'}),
        evidence=[Evidence(text='Active MAP2K1 activates MAPK1.',
                           source_api='assertion',
                           text_refs={'TRID': '2345'})])

    st = SearchTerm('gene', 'MAP2K1', db_refs={}, search_term='MAP2K1')
    # The two statements differ only in their 'curated' metadata flag.
    emmaa_stmts = [
        EmmaaStatement(braf_map2k1, datetime.datetime.now(), [st],
                       {'internal': True, 'curated': False}),
        EmmaaStatement(map2k1_mapk1, datetime.datetime.now(), [st],
                       {'internal': True, 'curated': True}),
    ]

    assembly_steps = [
        {'function': 'filter_no_hypothesis'},
        {'function': 'map_grounding'},
        {'function': 'filter_grounded_only'},
        {'function': 'filter_human_only'},
        {'function': 'map_sequence'},
        {'function': 'run_preassembly',
         'kwargs': {'return_toplevel': False}},
    ]
    if relevance:
        assembly_steps.append(
            {'function': 'filter_relevance',
             'kwargs': {'policy': relevance}})

    config_dict = {
        'ndex': {'network': 'a08479d1-24ce-11e9-bb6a-0ac135e8bacf'},
        'search_terms': [{'db_refs': {'HGNC': '20974'},
                          'name': 'MAPK1',
                          'search_term': 'MAPK1',
                          'type': 'gene'}],
        'human_readable_name': 'Test Model',
        'test': {
            'statement_checking': {'max_path_length': 5, 'max_paths': 1},
            'test_corpus': 'simple_tests',
            'mc_types': ['pysb', 'pybel', 'signed_graph',
                         'unsigned_graph'],
        },
        'assembly': assembly_steps,
    }

    emmaa_model = EmmaaModel('test', config_dict, paper_ids)
    emmaa_model.add_statements(emmaa_stmts)
    return emmaa_model
def upload_prior(ctype, config):
    """Load pickled prior statements into a model and update it on NDEx.

    Parameters
    ----------
    ctype
        Used both as the directory name under ``../models`` holding
        ``prior_stmts.pkl`` and as the name given to the EmmaaModel.
    config
        Configuration handed to the EmmaaModel constructor.
    """
    # NOTE(review): pickle.load can execute arbitrary code; this presumably
    # only ever reads locally generated prior files -- confirm.
    with open(f'../models/{ctype}/prior_stmts.pkl', 'rb') as pkl_file:
        prior_stmts = pickle.load(pkl_file)

    estmts = []
    for prior_stmt in prior_stmts:
        estmts.append(
            EmmaaStatement(prior_stmt, datetime.datetime.now(), []))

    model = EmmaaModel(ctype, config)
    model.add_statements(estmts)
    model.update_to_ndex()
def get_emmaa_statements(stmts, gene_names):
    """Wrap statements as EmmaaStatements flagged internal or external.

    Parameters
    ----------
    stmts
        Iterable of statements; each must provide ``real_agent_list()``.
    gene_names
        Container of names against which agent names are checked.

    Returns
    -------
    list
        One EmmaaStatement per input statement, with an empty search term
        list and an 'internal' metadata flag.
    """
    def is_internal(stmt):
        # If all the agents are gene names, this is an internal statement.
        # We classify any statements with drugs in them as external.
        agent_is_gene = [agent.name in gene_names
                         for agent in stmt.real_agent_list()]
        return all(agent_is_gene)

    estmts = []
    for stmt in stmts:
        estmts.append(
            EmmaaStatement(stmt, datetime.datetime.now(), [],
                           {'internal': is_internal(stmt)}))
    return estmts
def test_filter_relevance():
    """Check assembly output size under no, prior_one and prior_all filters."""
    config_dict = {
        'ndex': {'network': 'a08479d1-24ce-11e9-bb6a-0ac135e8bacf'},
        'search_terms': [{
            'db_refs': {'HGNC': '20974'},
            'name': 'MAPK1',
            'search_term': 'MAPK1',
            'type': 'gene',
        }],
    }
    braf_map2k1 = Activation(
        Agent('BRAF', db_refs={'HGNC': '20974'}),
        Agent('MAP2K1'),
        evidence=[Evidence(text='BRAF activates MAP2K1.',
                           source_api='assertion')])
    map2k1_mapk1 = Activation(
        Agent('MAP2K1', activity=ActivityCondition('activity', True)),
        Agent('MAPK1'),
        evidence=[Evidence(text='Active MAP2K1 activates MAPK1.',
                           source_api='assertion')])
    st = SearchTerm('gene', 'MAP2K1', db_refs={}, search_term='MAP2K1')
    emmaa_stmts = []
    for indra_stmt in (braf_map2k1, map2k1_mapk1):
        emmaa_stmts.append(
            EmmaaStatement(indra_stmt, datetime.datetime.now(), [st]))

    def assemble():
        # Build a fresh model from the current state of config_dict.
        model = EmmaaModel('test', config_dict)
        model.extend_unique(emmaa_stmts)
        model.run_assembly()
        return model.assembled_stmts

    # Try no filter first
    assembled = assemble()
    assert len(assembled) == 2, assembled

    # Next do a prior_one filter
    config_dict['assembly'] = {'filter_relevance': 'prior_one'}
    assembled = assemble()
    assert len(assembled) == 1, assembled
    assert assembled[0].obj.name == 'MAPK1'

    # Next do a prior_all filter
    config_dict['assembly'] = {'filter_relevance': 'prior_all'}
    assembled = assemble()
    assert len(assembled) == 0
def make_gene_statements(self) -> List[EmmaaStatement]:
    """Generate Statements from the gene list.

    Returns
    -------
    List[EmmaaStatement]
        ``self.stmts``. If it is already non-empty it is returned as is;
        otherwise it is first filled with statements looked up for the
        gene list and the names of search terms of type 'drug'.
    """
    if not self.stmts:
        def is_internal(stmt):
            # If all the agents are gene names, this is an internal
            # statement. We classify any statements with drugs in them
            # as external.
            agent_is_gene = [agent.name in self.gene_list
                             for agent in stmt.real_agent_list()]
            return all(agent_is_gene)

        drug_names = []
        for term in self.search_terms:
            if term.type == 'drug':
                drug_names.append(term.name)

        found_stmts = get_stmts_for_gene_list(self.gene_list, drug_names)
        self.stmts = [
            EmmaaStatement(found, datetime.datetime.now(), [],
                           {'internal': is_internal(found)})
            for found in found_stmts
        ]
    return self.stmts
def get_statements(self, mode='all', batch_size=100):
    """Return EMMAA Statements for this prior's literature set.

    Parameters
    ----------
    mode : 'all' or 'distilled'
        The 'distilled' mode makes sure that the "best", non-redundant
        set of raw statements are found across potentially redundant text
        contents and reader versions. The 'all' mode doesn't do such
        distillation but is significantly faster.
    batch_size : Optional[int]
        Determines how many PMIDs to fetch statements for in each
        iteration. Default: 100.

    Returns
    -------
    list of EmmaaStatement
        A list of EMMAA Statements corresponding to extractions from the
        subset of literature defined by this prior's search terms.
    """
    # Statements already present on the instance are returned unchanged.
    if self.stmts:
        return self.stmts

    pmids_by_term = EmmaaModel.search_pubmed(
        search_terms=self.search_terms, date_limit=None)

    # Invert the mapping so each PMID lists every term that produced it.
    pmids_to_terms = {}
    for term, pmids in pmids_by_term.items():
        for pmid in pmids:
            pmids_to_terms.setdefault(pmid, []).append(term)

    raw_statements_by_pmid = get_raw_statements_for_pmids(
        set(pmids_to_terms), mode=mode, batch_size=batch_size)

    # One timestamp is shared by every statement created in this call.
    timestamp = datetime.datetime.now()
    for pmid, raw_stmts in raw_statements_by_pmid.items():
        self.stmts.extend(
            EmmaaStatement(raw_stmt, timestamp, pmids_to_terms[pmid],
                           {'internal': True})
            for raw_stmt in raw_stmts)
    return self.stmts
def update_cancer(cancer_type):
    """Update the model for the given cancer.

    A JSON config file must be present for the given cancer type, located
    in the models/<cancer_type>/config.json. The pickled prior statements
    are read from models/<cancer_type>/prior_stmts.pkl, added to a new
    EmmaaModel, and the model is saved to S3.

    Parameters
    ----------
    cancer_type : str
        A short string which is the name of the cancer, and corresponds
        to a directory in the models directory, as described above.
    """
    print(cancer_type)
    # NOTE(review): pickle.load can execute arbitrary code; this presumably
    # only ever reads locally generated prior files -- confirm.
    with open(f'models/{cancer_type}/prior_stmts.pkl', 'rb') as fh:
        stmts = pickle.load(fh)
    # Use a context manager so the config file handle is always closed.
    with open(f'models/{cancer_type}/config.json', 'r') as fh:
        config = json.load(fh)
    em = EmmaaModel(cancer_type, config)
    ess = [EmmaaStatement(st, datetime.datetime.now(), []) for st in stmts]
    em.add_statements(ess)
    em.save_to_s3()
def test_direct_path_tests():
    """Check path lengths found with and without direct paths allowed."""
    checked_mc_types = ('pysb', 'signed_graph', 'unsigned_graph')

    model = create_model()
    direct_stmt = Activation(Agent('BRAF', db_refs={'HGNC': '1097'}),
                             Agent('MAPK1', db_refs={'UP': 'P28482'}))
    # The statement under test is also added to the model itself.
    direct_metadata = {'internal': True, 'curated': False}
    model.stmts.append(
        EmmaaStatement(direct_stmt, datetime.datetime.now(), [],
                       direct_metadata))

    mm = ModelManager(model)
    tm = TestManager([mm], [StatementCheckingTest(direct_stmt)])
    tm.make_tests(ScopeTestConnector())

    tm.run_tests(allow_direct=True)
    for mc_type in checked_mc_types:
        res = mm.mc_types[mc_type]['test_results'][0]
        print(res.paths)
        first_path = res.paths[0]
        assert len(first_path) == 2, (mc_type, first_path)  # 1 edge

    tm.run_tests(allow_direct=False)
    for mc_type in checked_mc_types:
        # Look at the second test result here
        res = mm.mc_types[mc_type]['test_results'][1]
        first_path = res.paths[0]
        assert len(first_path) == 3, (mc_type, first_path)  # 2 edges
def read_pmid_search_terms(pmid_search_terms):
    """Return extracted EmmaaStatements given a PMID-search term dict.

    Parameters
    ----------
    pmid_search_terms : dict
        A dict representing a set of PMIDs pointing to search terms that
        produced them.

    Returns
    -------
    list[:py:class:`emmaa.model.EmmaaStatement`]
        A list of EmmaaStatements extracted from the given PMIDs.
    """
    # The same timestamp is passed to the reader and stored on each result.
    timestamp = datetime.datetime.utcnow()
    stmts_by_pmid = read_pmids(list(pmid_search_terms), timestamp)

    estmts = []
    for pmid, stmts in stmts_by_pmid.items():
        estmts.extend(
            EmmaaStatement(stmt, timestamp, pmid_search_terms[pmid])
            for stmt in stmts)
    return estmts