def search_terms_from_nodes(node_list):
    """Build a list of Pubmed search terms from the nodes returned by make_prior.

    Parameters
    ----------
    node_list : list of str
        Node identifiers of the form ``<namespace>:<id>``. Only nodes
        prefixed with ``HGNC:`` or ``MESH:`` are turned into search terms;
        every other node is logged with a warning and skipped.

    Returns
    -------
    list of SearchTerm
        The search terms that could be created, sorted by their name.
    """
    terms = []
    for node in node_list:
        if node.startswith('HGNC:'):
            hgnc_id = node.split(':')[1]
            hgnc_name = get_hgnc_name(hgnc_id)
            if hgnc_name is None:
                # Logger.log() requires an explicit level as its first
                # argument; calling it with only a message raises a
                # TypeError, so use the level-specific helper instead.
                logger.warning(f'{node} is not a valid HGNC ID')
                continue
            terms.append(SearchTerm(type='gene', name=hgnc_name,
                                    search_term=f'"{hgnc_name}"',
                                    db_refs={'HGNC': hgnc_id}))
        elif node.startswith('MESH:'):
            mesh_id = node.split(':')[1]
            # TODO: get actual process name here
            terms.append(SearchTerm(type='bioprocess', name=mesh_id,
                                    search_term=f'{mesh_id}[MeSH Terms]',
                                    db_refs={'MESH': mesh_id}))
        # TODO: handle GO here
        else:
            logger.warning(f'Could not create search term from {node}')
    return sorted(terms, key=lambda x: x.name)
def make_search_terms(terms, ontology_file):
    """Make SearchTerm objects standardized to a given ontology from terms.

    Parameters
    ----------
    terms : list[str]
        A list of terms corresponding to suffixes of entries in the ontology.
    ontology_file : str
        A path to a file containing ontology.

    Returns
    -------
    search_terms : set
        A set of SearchTerm objects constructed from given terms and ontology
        having standardized names.
    """
    ontologies = []
    with open(ontology_file, 'r') as f:
        for line in f:
            # Only the first link on each line is inspected.
            link = line.split('> <')[0]
            ont_start = link.find('UN')
            if ont_start < 0:
                # str.find returns -1 when the marker is absent; slicing
                # with it would keep just the last character of the link
                # as a bogus ontology entry, so skip such lines.
                continue
            ontologies.append(link[ont_start:])

    search_terms = set()
    for ont in ontologies:
        for term in terms:
            if not ont.endswith(term):
                continue
            # Underscores in ontology suffixes stand for spaces.
            search_term = term.replace('_', ' ')
            st = SearchTerm(type='concept',
                            name=search_term.capitalize(),
                            db_refs={'UN': ont},
                            search_term=f'"{search_term}"')
            search_terms.add(st)
    return search_terms
def test_model_json():
    """Test the json structure and content of EmmaaModel.to_json() output"""
    indra_stmts = [
        Activation(Agent('BRAF', db_refs={'HGNC': '20974'}),
                   Agent('MAP2K1'),
                   evidence=[Evidence(text='BRAF activates MAP2K1.')]),
        Activation(Agent('MAP2K1',
                         activity=ActivityCondition('activity', True)),
                   Agent('MAPK1'),
                   evidence=[Evidence(text='Active MAP2K1 activates MAPK1.')]),
    ]
    st = SearchTerm('gene', 'MAP2K1', db_refs={}, search_term='MAP2K1')
    emmaa_stmts = [
        EmmaaStatement(stmt, datetime.datetime.now(), [st])
        for stmt in indra_stmts
    ]
    config_dict = {
        'ndex': {
            'network': 'a08479d1-24ce-11e9-bb6a-0ac135e8bacf'
        },
        'search_terms': [{
            'db_refs': {'HGNC': '20974'},
            'name': 'MAPK1',
            'search_term': 'MAPK1',
            'type': 'gene'
        }]
    }
    emmaa_model = EmmaaModel('test', config_dict)
    emmaa_model.add_statements(emmaa_stmts)

    emmaa_model_json = emmaa_model.to_json()

    # Test json structure
    assert emmaa_model_json['name'] == 'test'
    assert isinstance(emmaa_model_json['stmts'], list)
    assert emmaa_model_json['ndex_network'] == \
        'a08479d1-24ce-11e9-bb6a-0ac135e8bacf'

    # Test config
    first_term = emmaa_model_json['search_terms'][0]
    assert first_term['type'] == 'gene'
    assert first_term['db_refs'] == {'HGNC': '20974'}

    # Test json statements (each statement's evidence text is checked once;
    # the original repeated the first check verbatim)
    first_stmt = emmaa_model_json['stmts'][0]['stmt']
    second_stmt = emmaa_model_json['stmts'][1]['stmt']
    assert 'BRAF activates MAP2K1.' == first_stmt['evidence'][0]['text']
    assert 'Active MAP2K1 activates MAPK1.' == \
        second_stmt['evidence'][0]['text']
    assert first_stmt['subj']['name'] == 'BRAF'
    assert second_stmt['subj']['name'] == 'MAP2K1'
    assert second_stmt['obj']['name'] == 'MAPK1'

    # Need hashes to be strings so that javascript can read them
    assert isinstance(first_stmt['evidence'][0]['source_hash'], str)
def test_filter_relevance():
    """Check how the 'filter_relevance' assembly setting affects the number
    of statements left after assembly (none set, 'prior_one', 'prior_all').
    """
    model_config = {
        'ndex': {'network': 'a08479d1-24ce-11e9-bb6a-0ac135e8bacf'},
        'search_terms': [{
            'db_refs': {'HGNC': '20974'},
            'name': 'MAPK1',
            'search_term': 'MAPK1',
            'type': 'gene',
        }],
    }
    braf_to_map2k1 = Activation(
        Agent('BRAF', db_refs={'HGNC': '20974'}),
        Agent('MAP2K1'),
        evidence=[Evidence(text='BRAF activates MAP2K1.',
                           source_api='assertion')])
    map2k1_to_mapk1 = Activation(
        Agent('MAP2K1', activity=ActivityCondition('activity', True)),
        Agent('MAPK1'),
        evidence=[Evidence(text='Active MAP2K1 activates MAPK1.',
                           source_api='assertion')])
    term = SearchTerm('gene', 'MAP2K1', db_refs={}, search_term='MAP2K1')
    wrapped_stmts = [
        EmmaaStatement(raw, datetime.datetime.now(), [term])
        for raw in (braf_to_map2k1, map2k1_to_mapk1)
    ]

    def assemble():
        # Build a fresh model from the current config and assemble it.
        model = EmmaaModel('test', model_config)
        model.extend_unique(wrapped_stmts)
        model.run_assembly()
        return model.assembled_stmts

    # Try no filter first
    assembled = assemble()
    assert len(assembled) == 2, assembled

    # Next do a prior_one filter
    model_config['assembly'] = {'filter_relevance': 'prior_one'}
    assembled = assemble()
    assert len(assembled) == 1, assembled
    assert assembled[0].obj.name == 'MAPK1'

    # Next do a prior_all filter
    model_config['assembly'] = {'filter_relevance': 'prior_all'}
    assembled = assemble()
    assert len(assembled) == 0
def _load_config(self, config):
    """Set attributes on ``self`` from the given configuration dict.

    ``config['search_terms']`` is required and is deserialized with
    ``SearchTerm.from_json``. ``ndex_network`` is taken from
    ``config['ndex']['network']`` or set to None when there is no 'ndex'
    entry. The 'reading', 'assembly' and 'test' entries are copied to
    ``reading_config``, ``assembly_config`` and ``test_config`` only when
    they are present.
    """
    self.search_terms = [SearchTerm.from_json(term_json)
                         for term_json in config['search_terms']]

    self.ndex_network = (config['ndex']['network']
                         if 'ndex' in config else None)

    # Optional sections: config key -> attribute name.
    optional_sections = {
        'reading': 'reading_config',
        'assembly': 'assembly_config',
        'test': 'test_config',
    }
    for key, attr_name in optional_sections.items():
        if key in config:
            setattr(self, attr_name, config[key])
def test_read_db_pmid_search_terms():
    """Check read_db_pmid_search_terms() function with different inputs."""
    search_terms = [
        SearchTerm('gene', 'AKT2', {'HGNC': '392', 'UP': 'P31751'}, 'AKT2'),
        SearchTerm('gene', 'ACOX2', {'HGNC': '120', 'UP': 'Q99424'}, 'ACOX2'),
    ]

    # Check for empty input.
    assert len(read_db_pmid_search_terms({})) == 0

    # Check for PMIDs that do not have any statements.
    nostmts_pmid = "22178463"
    assert len(read_db_pmid_search_terms({nostmts_pmid: search_terms})) == 0

    # Check for PMIDs that have statements.
    stmts_pmid = "23431386"
    estmts = read_db_pmid_search_terms({stmts_pmid: search_terms})
    assert len(estmts) > 0
    assert isinstance(estmts[0], EmmaaStatement)
    # This comparison was previously a bare expression and never checked.
    assert estmts[0].search_terms == search_terms
def create_model(relevance=None, paper_ids=None):
    """Build a small EmmaaModel named 'test' holding two Activation statements.

    Parameters
    ----------
    relevance : optional
        When truthy, a 'filter_relevance' step with this value as its
        'policy' kwarg is appended to the assembly pipeline.
    paper_ids : optional
        Passed through as the third argument to EmmaaModel.

    Returns
    -------
    EmmaaModel
        The model with both statements added via add_statements.
    """
    braf = Agent('BRAF', db_refs={'HGNC': '1097'})
    map2k1 = Agent('MAP2K1', db_refs={'HGNC': '6840'})
    active_map2k1 = Agent('MAP2K1', db_refs={'HGNC': '6840'},
                          activity=ActivityCondition('activity', True))
    mapk1 = Agent('MAPK1', db_refs={'HGNC': '6871'})
    indra_stmts = [
        Activation(braf, map2k1,
                   evidence=[Evidence(text='BRAF activates MAP2K1.',
                                      source_api='assertion',
                                      text_refs={'TRID': '1234'})]),
        Activation(active_map2k1, mapk1,
                   evidence=[Evidence(text='Active MAP2K1 activates MAPK1.',
                                      source_api='assertion',
                                      text_refs={'TRID': '2345'})]),
    ]

    term = SearchTerm('gene', 'MAP2K1', db_refs={}, search_term='MAP2K1')
    # The first statement is marked as not curated, the second as curated.
    curated_flags = (False, True)
    emmaa_stmts = [
        EmmaaStatement(stmt, datetime.datetime.now(), [term],
                       {'internal': True, 'curated': is_curated})
        for stmt, is_curated in zip(indra_stmts, curated_flags)
    ]

    assembly_steps = [
        {'function': 'filter_no_hypothesis'},
        {'function': 'map_grounding'},
        {'function': 'filter_grounded_only'},
        {'function': 'filter_human_only'},
        {'function': 'map_sequence'},
        {'function': 'run_preassembly',
         'kwargs': {'return_toplevel': False}},
    ]
    if relevance:
        assembly_steps.append({'function': 'filter_relevance',
                               'kwargs': {'policy': relevance}})

    config_dict = {
        'ndex': {'network': 'a08479d1-24ce-11e9-bb6a-0ac135e8bacf'},
        'search_terms': [{'db_refs': {'HGNC': '20974'},
                          'name': 'MAPK1',
                          'search_term': 'MAPK1',
                          'type': 'gene'}],
        'human_readable_name': 'Test Model',
        'test': {
            'statement_checking': {'max_path_length': 5, 'max_paths': 1},
            'test_corpus': 'simple_tests',
            'mc_types': ['pysb', 'pybel', 'signed_graph', 'unsigned_graph'],
        },
        'assembly': assembly_steps,
    }

    model = EmmaaModel('test', config_dict, paper_ids)
    model.add_statements(emmaa_stmts)
    return model
def _load_config(self, config):
    """Set attributes on ``self`` from the given configuration dict.

    ``config['search_terms']`` is required and is deserialized with
    ``SearchTerm.from_json``. ``ndex_network`` comes from
    ``config['ndex']['network']`` and is None when 'ndex' is absent.
    The optional entries 'reading', 'assembly', 'test', 'query' and
    'human_readable_name' are copied onto ``self`` only when present.
    ``export_formats`` is always set and defaults to an empty list.
    """
    self.search_terms = [SearchTerm.from_json(term_json)
                         for term_json in config['search_terms']]

    self.ndex_network = (config['ndex']['network']
                         if 'ndex' in config else None)

    # (config key, attribute name) pairs that are copied only if present.
    optional_sections = (
        ('reading', 'reading_config'),
        ('assembly', 'assembly_config'),
        ('test', 'test_config'),
        ('query', 'query_config'),
        ('human_readable_name', 'human_readable_name'),
    )
    for key, attr_name in optional_sections:
        if key in config:
            setattr(self, attr_name, config[key])

    self.export_formats = config.get('export_formats', [])
def test_find_drugs_for_genes():
    """Check that find_drugs_for_genes returns drug search terms for SRC."""
    # SearchTerm for SRC
    src_term = SearchTerm(type='gene', name='SRC', search_term='"SRC"',
                          db_refs={'HGNC': '11283'})

    # drugs targeting SRC
    drug_terms = find_drugs_for_genes([src_term])

    # make sure there are results
    assert drug_terms

    # make sure the result is a list of search terms
    assert all(isinstance(drug, SearchTerm) for drug in drug_terms)

    # something is wrong if there are fewer than 10 drugs
    assert len(drug_terms) > 10

    # test that some example drugs are included
    found_names = {drug.name for drug in drug_terms}
    expected_names = {'Dasatinib', 'Tozasertib', 'Ponatinib'}
    assert expected_names.issubset(found_names)
def search_biorxiv(collection_id, date_limit):
    """Search BioRxiv within date_limit.

    Parameters
    ----------
    collection_id : str
        ID of a collection to search BioRxiv for.
    date_limit : int
        The number of days to search back from today.

    Returns
    -------
    terms_to_dois : dict
        A dict with a single entry: a SearchTerm of type 'other' named
        after the biorxiv collection ID as key, and the DOIs returned by
        the search as value.
    """
    lookback = datetime.timedelta(days=date_limit)
    start_date = datetime.datetime.utcnow() - lookback
    dois = biorxiv_client.get_collection_dois(collection_id, start_date)
    logger.info(f'{len(dois)} DOIs found')
    # The term carries no db_refs and no search string.
    collection_term = SearchTerm('other', f'biorxiv: {collection_id}',
                                 {}, None)
    return {collection_term: dois}
def test_find_drugs_for_genes():
    """Check find_drugs_for_genes against two explicit Inhibition statements
    whose object is SRC.
    """
    src_refs = {'HGNC': '11283'}
    # SearchTerm for SRC
    src_term = SearchTerm(type='gene', name='SRC', search_term='"SRC"',
                          db_refs={'HGNC': '11283'})

    # statements describing drugs targeting SRC
    dasatinib_stmt = Inhibition(
        Agent('Dasatinib', db_refs={'CHEBI': 'CHEBI:49375'}),
        Agent('SRC', db_refs=dict(src_refs)))
    ponatinib_stmt = Inhibition(
        Agent('Ponatinib', db_refs={'CHEBI': 'CHEBI:78543'}),
        Agent('SRC', db_refs=dict(src_refs)))
    drug_terms = find_drugs_for_genes([src_term],
                                      [dasatinib_stmt, ponatinib_stmt])

    # make sure there are results
    assert drug_terms

    # make sure the result is a list of search terms
    assert all(isinstance(drug, SearchTerm) for drug in drug_terms)

    # test that exactly the example drugs are included
    found_names = {drug.name for drug in drug_terms}
    assert found_names == {'Dasatinib', 'Ponatinib'}
filter_emmaa_stmts_by_metadata, filter_indra_stmts_by_metadata from emmaa.priors import SearchTerm from indra.statements import Activation, Agent, Evidence braf = Agent('BRAF', db_refs={'HGNC': '1097'}) map2k1 = Agent('MAP2K1', db_refs={'HGNC': '6840'}) stmt = Activation(braf, map2k1, evidence=[ Evidence(text='BRAF activates MAP2K1.', source_api='assertion', text_refs={'TRID': '1234'}) ]) date = datetime.datetime.now() search_terms = [ SearchTerm('gene', braf.name, braf.db_refs, '"BRAF"'), SearchTerm('gene', map2k1.name, map2k1.db_refs, '"MAP2K1"') ] def test_to_emmaa_stmts(): estmts = to_emmaa_stmts([stmt], date=date, search_terms=search_terms, metadata={'internal': True}) assert estmts estmt = estmts[0] assert isinstance(estmt, EmmaaStatement) assert estmt.stmt == stmt assert estmt.metadata == {'internal': True} emmaa_anns = estmt.stmt.evidence[0].annotations.get('emmaa')
def make_prior_from_genes(gene_list):
    """Return reactome prior based on a list of genes

    Parameters
    ----------
    gene_list : list of str
        List of HGNC symbols for genes

    Returns
    -------
    res : list of :py:class:`emmaa.priors.SearchTerm`
        Search terms, sorted by name, for every gene found in any reactome
        pathway that contains one of the genes in the input gene list
    """
    human_id_pattern = re.compile('^R-HSA-[0-9]')

    # Step 1: HGNC symbol -> UniProt ID -> Reactome IDs
    reactome_ids_seen = set()
    for symbol in gene_list:
        up_id = get_uniprot_id(get_hgnc_id(symbol))
        if not up_id:
            logger.warning('Could not get Uniprot ID for HGNC symbol'
                           f' {symbol}')
            continue
        rx_ids = rx_id_from_up_id(up_id)
        if not rx_ids:
            logger.warning('Could not get Reactome ID for Uniprot ID'
                           f' {up_id} with corresonding HGNC symbol'
                           f' {symbol}')
            continue
        reactome_ids_seen.update(rx_ids)

    # Step 2: Reactome IDs -> pathways (only IDs matching the human prefix)
    pathways_seen = set()
    for rx_id in reactome_ids_seen:
        if human_id_pattern.match(rx_id):
            pathways = get_pathways_containing_gene(rx_id)
            if pathways is not None:
                pathways_seen.update(pathways)

    # Step 3: pathways -> all genes they contain
    member_genes = set()
    for pathway in pathways_seen:
        members = get_genes_contained_in_pathway(pathway)
        if members is not None:
            member_genes.update(members)

    # Step 4: one gene SearchTerm per member that maps back to HGNC
    gene_terms = []
    for up_id in member_genes:
        symbol = get_gene_name(up_id)
        if symbol is None:
            logger.warning('Could not get HGNC name for UniProt ID'
                           f' {up_id}')
            continue
        hgnc_id = get_hgnc_id(symbol)
        if not hgnc_id:
            logger.warning('Could not find HGNC ID for HGNC symbol'
                           f' {symbol} with corresonding Uniprot ID'
                           f' {up_id}')
            continue
        gene_terms.append(SearchTerm(type='gene', name=symbol,
                                     search_term=f'"{symbol}"',
                                     db_refs={'HGNC': hgnc_id,
                                              'UP': up_id}))

    gene_terms.sort(key=lambda term: term.name)
    return gene_terms