def read_elsevier_eidos_search_terms(piis_to_terms):
    """Return extracted EmmaaStatements given a dict of PIIs to SearchTerms.

    Parameters
    ----------
    piis_to_terms : dict
        A dict representing a set of PIIs pointing to search terms that
        produced them.

    Returns
    -------
    list[:py:class:`emmaa.model.EmmaaStatement`]
        A list of EmmaaStatements extracted from the given PIIs.
    """
    piis = list(piis_to_terms.keys())
    date = datetime.datetime.utcnow()
    texts = read_piis(piis)
    pii_stmts = process_texts(texts)
    estmts = []
    for pii, stmts in pii_stmts.items():
        # Record the source PII on each evidence so provenance survives
        # downstream assembly.
        for stmt in stmts:
            for evid in stmt.evidence:
                evid.annotations['pii'] = pii
        pii_estmts = to_emmaa_stmts(stmts, date, piis_to_terms[pii],
                                    {'internal': True})
        estmts += pii_estmts
    return estmts
def update_from_files(self, files_config): """Add custom statements from files. Relevant part of reading config should look similar to: {"other_files": [ { "bucket": "indra-covid19", "filename": "ctd_stmts.pkl", "metadata": {"internal": true, "curated": true} } ] } """ new_estmts = [] for file_dict in files_config: bucket = file_dict['bucket'] fname = file_dict['filename'] metadata = file_dict['metadata'] file_stmts = load_pickle_from_s3(bucket, fname) logger.info(f'Loaded {len(file_stmts)} statements from {fname}.') file_estmts = to_emmaa_stmts(file_stmts, datetime.datetime.now(), [], metadata) new_estmts += file_estmts return new_estmts
def read_db_ids_search_terms(id_search_terms, id_type):
    """Return extracted EmmaaStatements from INDRA database given an
    ID-search term dict.

    Parameters
    ----------
    id_search_terms : dict
        A dict representing a set of IDs pointing to search terms that
        produced them.
    id_type : str
        The type of the paper IDs used as keys in `id_search_terms`; passed
        through to the raw statement lookup in the INDRA database.

    Returns
    -------
    list[:py:class:`emmaa.model.EmmaaStatement`]
        A list of EmmaaStatements extracted from the given IDs.
    """
    ids = list(id_search_terms.keys())
    date = datetime.datetime.utcnow()
    db = get_db('primary')
    id_stmts = get_raw_stmt_jsons_from_papers(ids, id_type=id_type, db=db)
    estmts = []
    for _id, stmt_jsons in id_stmts.items():
        stmts = stmts_from_json(stmt_jsons)
        id_estmts = to_emmaa_stmts(stmts, date, id_search_terms[_id],
                                   {'internal': True})
        estmts += id_estmts
    return estmts
def update_with_cord19(self, cord19_config): """Update model with new CORD19 dataset statements. Relevant part of reading config should look similar to: {"cord19_update": { "metadata": { "internal": true, "curated": false }, "date_limit": 5 } } """ # Using local import to avoid dependency from covid_19.emmaa_update import make_model_stmts current_stmts = self.get_indra_stmts() metadata = cord19_config['metadata'] date_limit = cord19_config['date_limit'] new_stmts, paper_ids = make_model_stmts(current_stmts, date_limit=date_limit) new_estmts = to_emmaa_stmts(new_stmts, datetime.datetime.now(), [], metadata=metadata) self.add_paper_ids(paper_ids, 'TRID') return new_estmts
def test_to_emmaa_stmts():
    """Check that to_emmaa_stmts wraps a statement and annotates evidence."""
    estmts = to_emmaa_stmts([stmt], date=date, search_terms=search_terms,
                            metadata={'internal': True})
    assert estmts
    first = estmts[0]
    assert isinstance(first, EmmaaStatement)
    assert first.stmt == stmt
    assert first.metadata == {'internal': True}
    # Search terms and metadata are mirrored into the 'emmaa' evidence
    # annotations.
    anns = first.stmt.evidence[0].annotations.get('emmaa')
    assert anns
    assert len(anns['search_terms']) == 2
    assert anns['metadata'] == {'internal': True}
def update_with_cord19(self): """Update model with new CORD19 dataset statements.""" current_stmts = self.get_indra_stmts() drug_stmts = load_pickle_from_s3('indra-covid19', 'drug_stmts.pkl') gordon_stmts = load_pickle_from_s3('indra-covid19', 'gordon_ndex_stmts.pkl') virhostnet_stmts = load_pickle_from_s3('indra-covid19', 'virhostnet_stmts.pkl') ctd_stmts = load_pickle_from_s3('indra-covid19', 'ctd_stmts.pkl') logger.info(f'Loaded {len(current_stmts)} current model statements, ' f'{len(drug_stmts)} drug statements, {len(gordon_stmts)} ' f'Gordon statements, {len(virhostnet_stmts)} ' f'VirHostNet statements, {len(ctd_stmts)} CTD statements.') other_stmts = drug_stmts + gordon_stmts + virhostnet_stmts + ctd_stmts new_stmts = make_model_stmts(current_stmts, other_stmts) self.stmts = to_emmaa_stmts(new_stmts, datetime.datetime.now(), [])
def create_upload_model(model_name, full_name, indra_stmts, ndex_id=None):
    """Make and upload an EMMAA model from a list of INDRA Statements.

    Parameters
    ----------
    model_name : str
        Short name of the model to use on S3.
    full_name : str
        Human-readable model name to use in EMMAA dashboard.
    indra_stmts : list of indra.statement
        INDRA Statements to be used to populate the EMMAA model.
    ndex_id : str
        UUID of the network corresponding to the model on NDex. If provided,
        the NDex network will be updated with the latest model content.
        If None (default), a new network will be created and the UUID stored
        in the model config files on S3.
    """
    emmaa_stmts = to_emmaa_stmts(indra_stmts, datetime.datetime.now(), [])
    # Get updated CX content for the INDRA Statements
    cxa = CxAssembler(indra_stmts)
    cx_str = cxa.make_model()
    # If we don't have an NDex ID, create network and upload to Ndex
    if ndex_id is None:
        ndex_id = cxa.upload_model(private=False)
        print(f'NDex ID for {model_name} is {ndex_id}.')
    # If the NDEx ID is provided, update the existing network
    else:
        ndex_client.update_network(cx_str, ndex_id)
    # Create the config dictionary
    config_dict = {'ndex': {'network': ndex_id}, 'search_terms': []}
    # Create EMMAA model
    emmaa_model = EmmaaModel(model_name, config_dict)
    emmaa_model.add_statements(emmaa_stmts)
    # Upload model to S3 with config as JSON. The original code serialized
    # and uploaded the identical config object twice; one upload suffices.
    emmaa_model.save_to_s3()
    s3_client = boto3.client('s3')
    config_json = json.dumps(config_dict)
    s3_client.put_object(Body=config_json.encode('utf8'),
                         Key='models/%s/config.json' % model_name,
                         Bucket='emmaa')
def update_with_cord19(self): """Update model with new CORD19 dataset statements.""" # Using local import to avoid dependency from covid_19.emmaa_update import make_model_stmts current_stmts = self.get_indra_stmts() default_filenames = [ 'drug_stmts_v2.pkl', 'gordon_ndex_stmts.pkl', 'virhostnet_stmts.pkl', 'ctd_stmts.pkl'] if isinstance(self.reading_config['cord19_update'], dict): fnames = self.reading_config['cord19_update'].get( 'filenames', default_filenames) else: # if it's a boolean fnames = default_filenames other_stmts = [] for fname in fnames: file_stmts = load_pickle_from_s3('indra-covid19', fname) logger.info(f'Loaded {len(file_stmts)} statements from {fname}.') other_stmts += file_stmts new_stmts, paper_ids = make_model_stmts(current_stmts, other_stmts) self.stmts = to_emmaa_stmts(new_stmts, datetime.datetime.now(), []) self.add_paper_ids(paper_ids, 'TRID')
def create_upload_model(model_name, indra_stmts, config_file):
    """Make and upload an EMMAA model from a list of INDRA Statements.

    Parameters
    ----------
    model_name : str
        Name of the model to use on S3.
    indra_stmts : list of indra.statement
        INDRA Statements to be used to populate the EMMAA model.
    config_file : str
        Path to the local config.json file.
    """
    emmaa_stmts = to_emmaa_stmts(indra_stmts, datetime.datetime.now(), [],
                                 {'internal': True})
    # Load config information
    with open(config_file, 'rt') as f:
        config_json = json.load(f)
    # Without an ndex entry in the config, create a new network and write
    # an updated copy of the config file containing the new network ID.
    if 'ndex' not in config_json:
        cxa = CxAssembler(indra_stmts)
        cxa.make_model()
        ndex_id = cxa.upload_model(private=False)
        print(f'NDex ID for {model_name} is {ndex_id}.')
        config_json['ndex'] = {'network': ndex_id}
        updated_config_file = f'{config_file}.updated'
        with open(updated_config_file, 'wt') as f:
            json.dump(config_json, f, indent=2)
    # If the NDEx ID is already present we don't update the network here:
    # that happens as part of the model assembly/update procedure on EMMAA.
    emmaa_model = EmmaaModel(model_name, config_json)
    emmaa_model.add_statements(emmaa_stmts)
    # Upload model and its config to S3
    emmaa_model.save_to_s3()
    s3_client = boto3.client('s3')
    save_config_to_s3(model_name, config_json)
def read_pmid_search_terms(pmid_search_terms):
    """Return extracted EmmaaStatements given a PMID-search term dict.

    Parameters
    ----------
    pmid_search_terms : dict
        A dict representing a set of PMIDs pointing to search terms that
        produced them.

    Returns
    -------
    list[:py:class:`emmaa.model.EmmaaStatement`]
        A list of EmmaaStatements extracted from the given PMIDs.
    """
    date = datetime.datetime.utcnow()
    pmid_stmts = read_pmids(list(pmid_search_terms), date)
    estmts = []
    for pmid, stmts in pmid_stmts.items():
        estmts.extend(to_emmaa_stmts(stmts, date, pmid_search_terms[pmid],
                                     {'internal': True}))
    return estmts
def update_from_disease_map(self, disease_map_config): """Update model by processing MINERVA Disease Map. Relevant part of reading config should look similar to: {"disease_map": { "map_name": "covid19map", "filenames" : "all", # or a list of filenames "metadata": { "internal": true } } } """ filenames = disease_map_config['filenames'] map_name = disease_map_config['map_name'] metadata = disease_map_config['metadata'] logger.info('Loading Statements from %s Disease Map' % map_name) sp = process_from_web(filenames=filenames, map_name=map_name) new_estmts = to_emmaa_stmts(sp.statements, datetime.datetime.now(), [], metadata) logger.info('Got %d EMMAA Statements from %s Disease Map' % (len(new_estmts), map_name)) return new_estmts