def format_stmts(stmts, output_format):
    if output_format == 'tsv':
        msg = ''
        for stmt in stmts:
            if not stmt.evidence:
                logger.warning('Statement %s without evidence' % stmt.uuid)
                txt = ''
                pmid = ''
            else:
                txt = stmt.evidence[0].text if stmt.evidence[0].text else ''
                pmid = stmt.evidence[0].pmid if stmt.evidence[0].pmid else ''
            line = '%s\t%s\t%s\n' % (stmt, txt, pmid)
            msg += line
        return msg
    elif output_format == 'pkl':
        fname = 'indrabot.pkl'
        with open(fname, 'wb') as fh:
            pickle.dump(stmts, fh)
        return fname
    elif output_format == 'pdf':
        fname = 'indrabot.pdf'
        ga = GraphAssembler(stmts)
        ga.make_model()
        ga.save_pdf(fname)
        return fname
    elif output_format == 'json':
        msg = json.dumps(stmts_to_json(stmts), indent=1)
        return msg
    return None
def _stmts_from_proc(proc):
    if proc and proc.statements:
        stmts = stmts_to_json(proc.statements)
        res = {'statements': stmts}
    else:
        res = {'statements': []}
    return res
def assemble_one_corpus():
    """For assembling one of the four corpora."""
    path = '/home/bmg16/data/wm/2-Jsonld'
    corpus_size = '16k'
    prefix = '%s%s' % (path, corpus_size)
    fnames = glob.glob('%s/*.jsonld' % prefix)  # For large corpus
    all_statements = []
    for idx, fname in enumerate(fnames):
        ep = eidos.process_json_file(fname)
        for stmt in ep.statements:
            for ev in stmt.evidence:
                ev.annotations['provenance'][0]['document']['@id'] = \
                    os.path.basename(fname)
        all_statements += ep.statements
        print('%d: %d' % (idx, len(all_statements)))
    with open('%s/3-Indra%s.pkl' % (prefix, corpus_size), 'wb') as fh:
        pickle.dump(all_statements, fh)
    scorer = get_eidos_scorer()
    assembled_stmts = ac.run_preassembly(all_statements, belief_scorer=scorer,
                                         return_toplevel=False)
    jd = stmts_to_json(assembled_stmts, use_sbo=False)
    with open('%s/3-Indra%s.json' % (prefix, corpus_size), 'w') as fh:
        json.dump(jd, fh, indent=1)
def test_respond_expand_model_from_json():
    mm = MRA_Module(testing=True)
    st = sts.Phosphorylation(sts.Agent('MEK'), sts.Agent('ERK'))
    msg = KQMLList('BUILD-MODEL')
    msg.sets('description', json.dumps(sts.stmts_to_json([st])))
    msg.sets('format', 'indra_json')
    reply = mm.respond_build_model(msg)
    assert reply.get('model')
    assert reply.get('model-id') == '1'
    st = sts.Phosphorylation(sts.Agent('RAF'), sts.Agent('MEK'))
    msg = KQMLList('EXPAND-MODEL')
    msg.sets('description', json.dumps(sts.stmts_to_json([st])))
    msg.sets('format', 'indra_json')
    msg.set('model-id', '1')
    reply = mm.respond_expand_model(msg)
    assert reply.get('model')
    assert reply.get('model-id') == '2'
def get_and_write_statements_from_pmids(
    pmids: Union[str, Iterable[str]],
    file: Union[None, str, TextIO] = None,
    json_file: Union[None, str, TextIO] = None,
    sep: Optional[str] = None,
    limit: Optional[int] = None,
    duplicates: bool = False,
    keep_only_query_pmids: bool = False,
    minimum_belief: Optional[float] = None,
    extra_columns: Optional[List[str]] = None,
) -> None:
    """Get INDRA statements for the given PMIDs and write them to a TSV
    for BEL curation.

    :param pmids: A finite iterable of PubMed identifiers
    :param file: The file to write curation sheets to
    :param json_file: The file to output structured INDRA statement JSON to
    :param sep: The separator for the CSV. Defaults to a tab.
    :param limit: The optional limit of statements to write
    :param duplicates: Should duplicate statements be written (with multiple
        evidences)?
    :param keep_only_query_pmids: If set, only keeps evidences from these
        PMIDs. Warning: there might still be multiple evidences.
    :param minimum_belief: The minimum belief score to keep
    :param extra_columns: Headers of extra columns for curation
    """
    if isinstance(pmids, str):
        pmids = [pmids]
    statements = get_statements_from_pmids(pmids)
    if isinstance(json_file, str):
        with open(json_file, 'w') as _json_file:
            json.dump(stmts_to_json(statements), _json_file, indent=2)
    elif json_file is not None:
        json.dump(stmts_to_json(statements), json_file, indent=2)
    print_statements(
        statements,
        file=file,
        sep=sep,
        limit=limit,
        allow_duplicates=duplicates,
        keep_only_pmids=pmids if keep_only_query_pmids else None,
        minimum_belief=minimum_belief,
        extra_columns=extra_columns,
    )
def _get_gk_model_indra():
    kras = Agent('KRAS', db_refs={'HGNC': '6407', 'UP': 'P01116'})
    braf = Agent('BRAF', db_refs={'HGNC': '1097', 'UP': 'P15056'})
    pp2a = Agent('PPP2CA')
    st1 = Phosphorylation(kras, braf)
    st2 = Dephosphorylation(pp2a, braf)
    stmts = [st1, st2]
    stmts_json = json.dumps(stmts_to_json(stmts))
    return stmts_json
def combine_all_stmts(pkl_list, output_file):
    all_stmts = []
    for pkl_file in pkl_list:
        all_stmts.extend(ac.load_statements(pkl_file))
    ac.dump_statements(all_stmts, output_file)
    stmt_json = stmts_to_json(all_stmts)
    output_json = f"{output_file.rsplit('.', maxsplit=1)[0]}.json"
    with open(output_json, 'wt') as f:
        json.dump(stmt_json, f, indent=2)
    return all_stmts
def update_groundings():
    if request.json is None:
        abort(Response('Missing application/json header.', 415))
    # Get input parameters
    corpus_id = request.json.get('corpus_id')
    # Run the actual regrounding
    stmts = curator.update_groundings(corpus_id)
    stmts_json = stmts_to_json(stmts)
    return jsonify(stmts_json)
def save_tests_to_s3(tests, bucket, key, save_format='pkl'):
    """Save tests in pkl, json or jsonl format."""
    if save_format == 'pkl':
        save_pickle_to_s3(tests, bucket, key)
    elif save_format in ['json', 'jsonl']:
        if isinstance(tests, list):
            stmts = [test.stmt for test in tests]
        elif isinstance(tests, dict):
            stmts = [test.stmt for test in tests['tests']]
        stmts_json = stmts_to_json(stmts)
        save_json_to_s3(stmts_json, bucket, key, save_format)
def s3_put(self, name, bucket=default_bucket,
           key_base_name=default_base_name):
    """Push a corpus object to S3 in the form of three json files

    The json files representing the object have S3 keys of the format
    <key_base_name>/<name>/<file>.json

    Parameters
    ----------
    name : str
        The name of the model to upload. Is part of the S3 key.
    bucket : str
        The S3 bucket to upload the Corpus to. Default: 'world-modelers'.
    key_base_name : str
        The base object path to upload the json files to. Is part of the
        S3 key. Default: 'indra_models'.

    Returns
    -------
    keys : tuple(str)
        A tuple of three strings giving the S3 keys of the pushed objects
    """
    key_base = key_base_name + '/' + name + '/'
    key_base = key_base.replace('//', '/')  # Replace double slashes
    try:
        s3 = self._get_s3_client()
        # Structure and upload raw statements
        s3.put_object(Body=json.dumps(stmts_to_json(self.raw_statements)),
                      Bucket=bucket, Key=key_base + 'raw_statements.json')
        # Structure and upload assembled statements
        s3.put_object(Body=_stmts_dict_to_json_str(self.statements),
                      Bucket=bucket, Key=key_base + 'statements.json')
        # Structure and upload curations
        s3.put_object(Body=json.dumps(self.curations),
                      Bucket=bucket, Key=key_base + 'curations.json')
        keys = tuple(key_base + s + '.json' for s in
                     ['raw_statements', 'statements', 'curations'])
        logger.info('Corpus uploaded as %s, %s and %s at %s.' %
                    (*keys, key_base))
        return keys
    except Exception as e:
        logger.exception('Failed to put on s3: %s' % e)
        return None
def test_standalone_event():
    se_jsonld = os.path.join(path_this, 'eidos_standalone_event.json')
    ep = eidos.process_json_file(se_jsonld)
    assert len(ep.statements) == 1
    st = ep.statements[0]
    assert isinstance(st, Event)
    assert hasattr(st, 'evidence')
    ev = st.evidence[0]
    assert ev.text is not None
    js = st.to_json()
    assert js['evidence']
    from indra.statements import stmts_to_json
    js2 = stmts_to_json([st])[0]
    assert 'evidence' in js2
def test_path_counts():
    db = _get_test_db('stmt')
    # Put statements in the database
    model_id = 'test'
    date = '2021-01-01'
    stmts = [
        Activation(Agent('A', db_refs={'HGNC': '1234'}),
                   Agent('B', db_refs={'HGNC': '2345'}),
                   evidence=[
                       Evidence(text='A activates B.',
                                source_api='assertion',
                                text_refs={'TRID': '1234'}),
                       Evidence(text='A activates B.',
                                source_api='assertion',
                                text_refs={'TRID': '1235'})
                   ]),
        Phosphorylation(Agent('B', db_refs={'HGNC': '2345'}),
                        Agent('C', db_refs={'HGNC': '3456'}),
                        evidence=[
                            Evidence(text='B phosphorylates C.',
                                     source_api='assertion',
                                     text_refs={'TRID': '2345'})
                        ])
    ]
    hash0 = str(stmts[0].get_hash())
    hash1 = str(stmts[1].get_hash())
    stmt_jsons = stmts_to_json(stmts)
    db.add_statements(model_id, date, stmt_jsons)
    # All path counts should be 0
    path_counts = db.get_path_counts(model_id, date)
    assert len(path_counts) == 0
    # Can update path counts multiple times, can be a subset of hashes
    db.update_statements_path_counts(model_id, date, {hash0: 7})
    path_counts = db.get_path_counts(model_id, date)
    assert len(path_counts) == 1, path_counts
    assert path_counts[hash0] == 7
    db.update_statements_path_counts(model_id, date, {hash0: 1, hash1: 5})
    path_counts = db.get_path_counts(model_id, date)
    assert len(path_counts) == 2
    assert path_counts[hash0] == 8  # 7 + 1
    assert path_counts[hash1] == 5
    db.update_statements_path_counts(model_id, date, {hash0: 3})
    path_counts = db.get_path_counts(model_id, date)
    assert len(path_counts) == 2
    assert path_counts[hash0] == 11  # 7 + 1 + 3
    assert path_counts[hash1] == 5  # Only added 5
def save_stmts(stmts, model_name):
    stmts_json = stmts_to_json(stmts)
    # Save a timestamped version and a generic latest version of files
    dated_key = f'assembled/{model_name}/statements_{self.date_str}'
    latest_key = f'assembled/{model_name}/' \
                 f'latest_statements_{model_name}'
    for ext in ('json', 'jsonl'):
        latest_obj_key = latest_key + '.' + ext
        logger.info('Uploading assembled statements to '
                    f'{latest_obj_key}')
        save_json_to_s3(stmts_json, bucket, latest_obj_key, ext)
    dated_jsonl = dated_key + '.jsonl'
    dated_zip = dated_key + '.gz'
    logger.info(f'Uploading assembled statements to {dated_jsonl}')
    save_json_to_s3(stmts_json, bucket, dated_jsonl, 'jsonl')
    logger.info(f'Uploading assembled statements to {dated_zip}')
    save_gzip_json_to_s3(stmts_json, bucket, dated_zip, 'json')
def add_statements_for_record(self, record_key, stmts, indra_version):
    """Add a set of prepared statements for a given document."""
    if not stmts:
        return None
    op = insert(wms_schema.PreparedStatements).values([
        {
            'record_key': record_key,
            'indra_version': indra_version,
            'stmt': stmt
        }
        # Note: the deepcopy here is done because when dumping
        # statements into JSON, the hash is overwritten, potentially
        # with an inadequate one (due to a custom matches_fun not being
        # given here).
        for stmt in stmts_to_json(deepcopy(stmts))
    ])
    return self.execute(op)
def save_assembled_statements(self, bucket=EMMAA_BUCKET_NAME):
    """Upload assembled statements jsons to S3 bucket."""
    stmts = self.model.assembled_stmts
    stmts_json = stmts_to_json(stmts)
    # Save a timestamped version and a generic latest version of files
    dated_key = f'assembled/{self.model.name}/statements_{self.date_str}'
    latest_key = f'assembled/{self.model.name}/' \
                 f'latest_statements_{self.model.name}'
    for ext in ('json', 'jsonl'):
        latest_obj_key = latest_key + '.' + ext
        logger.info(f'Uploading assembled statements to {latest_obj_key}')
        save_json_to_s3(stmts_json, bucket, latest_obj_key, ext)
    dated_jsonl = dated_key + '.jsonl'
    dated_zip = dated_key + '.gz'
    logger.info(f'Uploading assembled statements to {dated_jsonl}')
    save_json_to_s3(stmts_json, bucket, dated_jsonl, 'jsonl')
    logger.info(f'Uploading assembled statements to {dated_zip}')
    save_gzip_json_to_s3(stmts_json, bucket, dated_zip, 'json')
def format_stmts(stmts, output_format, ev_counts=None, source_counts=None):
    if output_format == 'tsv':
        msg = ''
        for stmt in stmts:
            if not stmt.evidence:
                logger.warning('Statement %s without evidence' % stmt.uuid)
                txt = ''
                pmid = ''
            else:
                txt = '"%s"' % stmt.evidence[0].text if \
                    stmt.evidence[0].text else ''
                pmid = stmt.evidence[0].pmid if stmt.evidence[0].pmid else ''
            try:
                ea_txt = EnglishAssembler([stmt]).make_model()
            except Exception as e:
                ea_txt = ''
                logger.error('English assembly failed for %s' % stmt)
                logger.error(e)
            line = '%s\t%s\t%s\tPMID%s\n' % (stmt, ea_txt, txt, pmid)
            msg += line
        return msg
    elif output_format == 'pkl':
        fname = 'indrabot.pkl'
        with open(fname, 'wb') as fh:
            pickle.dump(stmts, fh)
        return fname
    elif output_format == 'pdf':
        fname = 'indrabot.pdf'
        ga = GraphAssembler(stmts)
        ga.make_model()
        ga.save_pdf(fname)
        return fname
    elif output_format == 'json':
        msg = json.dumps(stmts_to_json(stmts), indent=1)
        return msg
    elif output_format == 'html':
        ev_counts = {} if not ev_counts else ev_counts
        ha = HtmlAssembler(stmts, ev_totals=ev_counts,
                           source_counts=source_counts)
        fname = 'indrabot.html'
        ha.save_model(fname)
        return fname
    return None
def test_get_dates_and_delete():
    db = _get_test_db('stmt')
    model_id = 'test'
    # At first there are no statements in the database
    assert db.get_number_of_dates(model_id) == 0
    assert db.get_oldest_date(model_id) is None
    # Put statements in the database
    date = '2021-01-01'
    stmts = [
        Activation(Agent('A', db_refs={'HGNC': '1234'}),
                   Agent('B', db_refs={'HGNC': '2345'}),
                   evidence=[
                       Evidence(text='A activates B.',
                                source_api='assertion',
                                text_refs={'TRID': '1234'}),
                       Evidence(text='A activates B.',
                                source_api='assertion',
                                text_refs={'TRID': '1235'})
                   ]),
        Phosphorylation(Agent('B', db_refs={'HGNC': '2345'}),
                        Agent('C', db_refs={'HGNC': '3456'}),
                        evidence=[
                            Evidence(text='B phosphorylates C.',
                                     source_api='assertion',
                                     text_refs={'TRID': '2345'})
                        ])
    ]
    stmt_jsons = stmts_to_json(stmts)
    db.add_statements(model_id, date, stmt_jsons)
    # There should be one date
    assert db.get_number_of_dates(model_id) == 1
    assert db.get_oldest_date(model_id) == date
    # Add another date
    date2 = '2022-01-01'
    db.add_statements(model_id, date2, stmt_jsons)
    assert db.get_number_of_dates(model_id) == 2
    # Oldest date is still the first one
    assert db.get_oldest_date(model_id) == date
    # Delete statements from the first date
    db.delete_statements(model_id, date)
    # There should be one date left
    assert db.get_number_of_dates(model_id) == 1
    assert db.get_oldest_date(model_id) == date2
def from_agents(
    agents: List[str],
    output: TextIO,
    statement_file: TextIO,
    belief_cutoff: float,
    no_duplicates: bool,
    no_ungrounded: bool,
):
    """Make a sheet for the given agents."""
    statements = get_and_write_statements_from_agents(
        agents=agents,
        file=output,
        allow_duplicates=(not no_duplicates),
        allow_ungrounded=(not no_ungrounded),
        minimum_belief=belief_cutoff,
    )
    if statement_file:
        json.dump(stmts_to_json(statements), statement_file, indent=2)
def test_get_statements_by_hash():
    db = _get_test_db('stmt')
    # Put statements in the database
    model_id = 'test'
    date = '2021-01-01'
    stmts = [
        Activation(Agent('A', db_refs={'HGNC': '1234'}),
                   Agent('B', db_refs={'HGNC': '2345'}),
                   evidence=[
                       Evidence(text='A activates B.',
                                source_api='assertion',
                                text_refs={'TRID': '1234'}),
                       Evidence(text='A activates B.',
                                source_api='assertion',
                                text_refs={'TRID': '1235'})
                   ]),
        Phosphorylation(Agent('B', db_refs={'HGNC': '2345'}),
                        Agent('C', db_refs={'HGNC': '3456'}),
                        evidence=[
                            Evidence(text='B phosphorylates C.',
                                     source_api='assertion',
                                     text_refs={'TRID': '2345'})
                        ])
    ]
    hash0 = stmts[0].get_hash()
    hash1 = stmts[1].get_hash()
    stmt_jsons = stmts_to_json(stmts)
    db.add_statements(model_id, date, stmt_jsons)
    # Load statements by hash
    stmts_loaded = db.get_statements_by_hash(model_id, date, [hash0, hash1])
    assert len(stmts_loaded) == 2
    assert stmts_loaded[0].get_hash() == hash0
    assert stmts_loaded[1].get_hash() == hash1
    stmts_loaded = db.get_statements_by_hash(model_id, date, [hash0])
    assert len(stmts_loaded) == 1
    assert stmts_loaded[0].get_hash() == hash0
def stmts_json_from_text(text):
    """Return an INDRA Statements JSON from text."""
    stmts_json = stmts_to_json(stmts_from_text(text))
    return stmts_json
def encode_indra_stmts(stmts):
    stmts_json = stmts_to_json(stmts)
    json_str = json.dumps(stmts_json)
    return json_str
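# A minimal decoding counterpart (a sketch, not taken from the original
# source): it assumes indra.statements.stmts_from_json is available to turn
# the JSON string produced by encode_indra_stmts back into Statement objects.
import json
from indra.statements import stmts_from_json

def decode_indra_stmts(json_str):
    # Parse the string back into a list of statement JSON dicts
    stmts_json = json.loads(json_str)
    # Reconstruct INDRA Statement objects from the JSON representation
    return stmts_from_json(stmts_json)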
def get_json(self):
    """Generate statement jsons and return the json bytes."""
    msg = json.dumps(stmts_to_json(self.get_statements()), indent=1)
    return msg
def create_statements(self):
    content = KQMLList('INDRA-TO-NL')
    content.sets('statements', json.dumps(stmts_to_json(self.statements)))
    return get_request(content), content
def test_get_statements():
    db = _get_test_db('stmt')
    # Put statements and path counts in the database
    model_id = 'test'
    date = '2021-01-01'
    stmts = [
        Activation(Agent('A', db_refs={'HGNC': '1234'}),
                   Agent('B', db_refs={'HGNC': '2345'}),
                   evidence=[
                       Evidence(text='A activates B.',
                                source_api='assertion',
                                text_refs={'TRID': '1234'}),
                       Evidence(text='A activates B.',
                                source_api='assertion',
                                text_refs={'TRID': '1235'})
                   ]),
        Phosphorylation(Agent('B', db_refs={'HGNC': '2345'}),
                        Agent('C', db_refs={'HGNC': '3456'}),
                        evidence=[
                            Evidence(text='B phosphorylates C.',
                                     source_api='assertion',
                                     text_refs={'TRID': '2345'})
                        ]),
        IncreaseAmount(Agent('A', db_refs={'HGNC': '1234'}),
                       Agent('C', db_refs={'HGNC': '3456'}))
    ]
    stmts[0].belief = 0.8
    stmts[1].belief = 0.9
    stmts[2].belief = 0.5
    hash0 = stmts[0].get_hash()
    hash1 = stmts[1].get_hash()
    hash2 = stmts[2].get_hash()
    stmt_jsons = stmts_to_json(stmts)
    db.add_statements(model_id, date, stmt_jsons)
    db.update_statements_path_counts(model_id, date, {
        str(hash0): 1,
        str(hash1): 5
    })
    # Load statements with different sort/filter options
    # Sort by evidence count
    stmts_loaded = db.get_statements(model_id, date, sort_by='evidence')
    assert len(stmts_loaded) == 3
    assert stmts_loaded[0].get_hash() == hash0, len(stmts_loaded[0].evidence)
    assert stmts_loaded[1].get_hash() == hash1, len(stmts_loaded[1].evidence)
    assert stmts_loaded[2].get_hash() == hash2  # stmt with no evidence is last
    # Sort by belief
    stmts_loaded = db.get_statements(model_id, date, sort_by='belief')
    assert len(stmts_loaded) == 3
    assert stmts_loaded[0].get_hash() == hash1
    assert stmts_loaded[1].get_hash() == hash0
    # Sort by path count
    stmts_loaded = db.get_statements(model_id, date, sort_by='paths')
    assert len(stmts_loaded) == 3
    assert stmts_loaded[0].get_hash() == hash1, stmts_loaded
    assert stmts_loaded[1].get_hash() == hash0
    # Filter by statement type
    stmts_loaded = db.get_statements(model_id, date,
                                     stmt_types=['Activation'])
    assert len(stmts_loaded) == 1
    assert stmts_loaded[0].get_hash() == hash0
    stmts_loaded = db.get_statements(model_id, date,
                                     stmt_types=['Phosphorylation'])
    assert len(stmts_loaded) == 1
    assert stmts_loaded[0].get_hash() == hash1
    # Filter by belief
    stmts_loaded = db.get_statements(model_id, date, min_belief=0.85)
    assert len(stmts_loaded) == 1
    assert stmts_loaded[0].get_hash() == hash1
    stmts_loaded = db.get_statements(model_id, date, max_belief=0.85)
    assert len(stmts_loaded) == 2
    assert set([stmt.get_hash() for stmt in stmts_loaded]) == {hash0, hash2}
    stmts_loaded = db.get_statements(model_id, date, min_belief=0.85,
                                     max_belief=0.85)
    assert len(stmts_loaded) == 0
    # Use offset and limit
    stmts_loaded = db.get_statements(model_id, date)
    assert len(stmts_loaded) == 3
    stmts_loaded = db.get_statements(model_id, date, offset=1)
    assert len(stmts_loaded) == 2, stmts_loaded
    stmts_loaded = db.get_statements(model_id, date, limit=1)
    assert len(stmts_loaded) == 1
    # Returns only remaining statements after offset even if limit is larger
    stmts_loaded = db.get_statements(model_id, date, offset=1, limit=5)
    assert len(stmts_loaded) == 2
corpora = {
    #'50': '/home/bmg16/Dropbox/postdoc/darpa/src/indra_apps/' + \
    #      'wm_fao/20181101/2-Jsonld50',
    '500': '/home/bmg16/Dropbox/postdoc/darpa/src/indra_apps/' + \
           'wm_fao/20181101/2-Jsonld500',
    '16k': '/home/bmg16/data/wm/2-Jsonld16k',
}

all_statements = []
for corpus_size, path in corpora.items():
    fnames = glob.glob('%s/*.jsonld' % path)
    for idx, fname in enumerate(fnames):
        ep = eidos.process_json_file(fname)
        for stmt in ep.statements:
            for ev in stmt.evidence:
                ev.annotations['provenance'][0]['document']['@id'] = \
                    os.path.basename(fname)
                ev.annotations['provenance'][0]['document']['corpus'] = \
                    corpus_size
        all_statements += ep.statements
        print('%d: %d' % (idx, len(all_statements)))

scorer = get_eidos_scorer()
assembled_stmts = ac.run_preassembly(all_statements, belief_scorer=scorer,
                                     return_toplevel=False)
jd = stmts_to_json(assembled_stmts, use_sbo=False)
with open('3-Indra-merged-500-16k.json', 'w') as fh:
    json.dump(jd, fh, indent=1)

# assemble_all()
POLYPHENOLS_LIST = 'input/list_polyphenols.xlsx'

# Load the list of polyphenols
df = pd.read_excel(POLYPHENOLS_LIST)

results_dict = {}
for name, pubchem_id in df[['polyphenols', 'pubchem_id']].values:
    # Query the INDRA DB web service using the INDRA Python API
    idrp = idr.get_statements(agents=[f'{pubchem_id}@PUBCHEM'],
                              ev_limit=100000)
    # Run preassembly
    # 1. Fix common named entity normalization ("grounding") errors
    stmts = ac.map_grounding(idrp.statements)
    # 2. Fix inconsistent sites of post-translational modifications
    stmts = ac.map_sequence(stmts)
    # 3. Identify duplicate/overlapping statements, calculate belief
    stmts = ac.run_preassembly(stmts)
    # Convert statements to JSON
    stmts_json = stmts_to_json(stmts)
    # Store results in dict indexed by Pubchem ID
    results_dict[str(pubchem_id)] = {
        'name': name,
        'statements': stmts_json
    }

# Save to file
with open('output/polyphenol_stmts.json', 'wt') as f:
    json.dump(results_dict, f, indent=2)
def stmts_to_jsonl_str(stmts):
    return '\n'.join([json.dumps(stmt) for stmt in stmts_to_json(stmts)])
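# A hedged companion sketch (not part of the original source): read a JSONL
# string of the kind produced by stmts_to_jsonl_str back into Statement
# objects, again assuming indra.statements.stmts_from_json is available.
import json
from indra.statements import stmts_from_json

def stmts_from_jsonl_str(jsonl_str):
    # Each non-empty line holds one statement's JSON representation
    stmt_jsons = [json.loads(line) for line in jsonl_str.splitlines()
                  if line.strip()]
    return stmts_from_json(stmt_jsons)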
def stmts_json_from_text(text):
    ekb_xml = read_or_load(text)
    tp = trips.process_xml(ekb_xml)
    stmts_json = stmts_to_json(tp.statements)
    return stmts_json
def dump_stmts_json(stmts, fname):
    print('Dumping statements into JSON')
    jd = stmts_to_json(stmts, use_sbo=False)
    with open(fname, 'w') as fh:
        json.dump(jd, fh, indent=1)