def dump_local(self, base_folder):
    """Dump assembled corpus into local files."""
    corpus_folder = os.path.join(base_folder, self.corpus_id)
    os.makedirs(corpus_folder, exist_ok=True)
    stmts_to_json_file(self.assembled_stmts,
                       os.path.join(corpus_folder, 'statements.json'),
                       format='jsonl')
    with open(os.path.join(corpus_folder, 'metadata.json'), 'w') as fh:
        # Note: json.dump takes the object first, then the file handle
        json.dump(self.metadata, fh)
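# Usage sketch for dump_local (assumption: `corpus` is an instance of the
# class this method belongs to, which is not shown here, with corpus_id,
# assembled_stmts and metadata already populated):
#
#     corpus.dump_local('corpora')
#
# writes corpora/<corpus_id>/statements.json (JSONL, one statement per line)
# and corpora/<corpus_id>/metadata.json.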
#stmts = load_eidos()
#stmts = ac.filter_by_type(stmts, Influence)
#remove_namespaces(stmts, ['WHO', 'MITRE12', 'UN', 'PROPS',
#                          'INTERVENTIONS'])
ont = load_world_ontology(ont_url)
if key != 'no_regrounding':
    stmts = reground_stmts(stmts, ont, 'WM', None, True)
scorer = get_eidos_scorer()
matches_fun, refinement_fun = None, None
assembled_stmts = ac.run_preassembly(stmts,
                                     belief_scorer=scorer,
                                     matches_fun=matches_fun,
                                     refinement_fun=refinement_fun,
                                     normalize_equivalences=True,
                                     normalize_opposites=True,
                                     normalize_ns='WM',
                                     ontology=ont,
                                     return_toplevel=False,
                                     poolsize=4)
print('-----Finished assembly-----')
remove_raw_grounding(assembled_stmts)
corpus_name = 'eidos-regrounding-20191214-%s' % key
fname = os.path.join('.', corpus_name + '.json')
sj = stmts_to_json_file(assembled_stmts, fname, matches_fun=matches_fun)
corpus = Corpus(corpus_name, assembled_stmts, raw_statements=stmts)
corpus.s3_put()
from indra.tools import assemble_corpus as ac
from indra.statements import stmts_to_json_file
from indra.assemblers.html import HtmlAssembler
from indra.sources import reach

tp = reach.process_pmc('PMC4455820', url=reach.local_nxml_url)
if tp:
    stmts = tp.statements
    print(stmts)
    stmts = ac.filter_grounded_only(stmts)  # Filter out ungrounded agents
    stmts = ac.run_preassembly(
        stmts,  # Run preassembly
        return_toplevel=False,
        normalize_equivalences=True,  # Optional: rewrite equivalent groundings to one standard
        normalize_opposites=True,  # Optional: rewrite opposite groundings to one standard
        normalize_ns='WM')  # Use 'WM' namespace to normalize equivalences and opposites
    stmts = ac.filter_belief(stmts, 0.8)  # Apply belief cutoff of e.g., 0.8
    stmts_to_json_file(stmts, 'PMC4455820.json')
    ha = HtmlAssembler(stmts)
    ha.save_model('PMC4455820.html')
# Attempt to combine statements from many papers
from indra.tools import assemble_corpus as ac
from indra.statements import stmts_to_json_file
from indra.assemblers.html import HtmlAssembler
from indra.sources import reach

pmcids = ["PMC3717945", "PMC5906628"]
stmts = []
for pmcid in pmcids:
    tp = reach.process_pmc(pmcid)
    if tp:  # process_pmc returns None if processing fails
        stmts += tp.statements
stmts = ac.filter_grounded_only(stmts)  # Filter out ungrounded agents
stmts = ac.run_preassembly(
    stmts,  # Run preassembly
    return_toplevel=False,
    normalize_equivalences=True,  # Optional: rewrite equivalent groundings to one standard
    normalize_opposites=True,  # Optional: rewrite opposite groundings to one standard
    normalize_ns='WM')  # Use 'WM' namespace to normalize equivalences and opposites
stmts = ac.filter_belief(stmts, 0.8)  # Apply belief cutoff of e.g., 0.8
stmts_to_json_file(stmts, 'bigresults.json')
ha = HtmlAssembler(stmts)
ha.save_model('bigresults.html')
def export_json(statements, fname):
    """Export statements into JSON."""
    stmts_to_json_file(statements, fname)
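# Round-trip sketch for export_json: the matching loader in indra.statements
# is stmts_from_json_file. The empty list is a placeholder for any list of
# INDRA Statements; the file name is illustrative.
from indra.statements import stmts_from_json_file

export_json([], 'exported.json')
reloaded = stmts_from_json_file('exported.json')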
# Querying for and assembling statements
all_stmts = []
for db_ns, db_id, name in groundings:
    if db_id in black_list:
        print('Skipping %s in black list' % name)
        continue
    print('Looking up %s' % name)
    db_stmts = get_db_stmts_by_grounding(db_ns, db_id)
    tas_stmts = get_tas_stmts(db_ns, db_id) if db_ns == 'HGNC' else []
    stmts = db_stmts + tas_stmts
    stmts = ac.filter_by_curation(stmts, db_curations)
    stmts = reground_stmts(stmts, grounding_map, misgrounding_map)
    all_stmts += stmts
all_stmts = make_unique_hashes(all_stmts)
all_stmts = ac.run_preassembly(all_stmts)

########################################
# Dump results
with open('disease_map_indra_stmts_full.pkl', 'wb') as fh:
    pickle.dump(all_stmts, fh)
stmts_to_json_file(all_stmts, 'disease_map_indra_stmts_full.json')
filtered_stmts = filter_prior_all(all_stmts, groundings)
with open('disease_map_indra_stmts_filtered.pkl', 'wb') as fh:
    pickle.dump(filtered_stmts, fh)
stmts_to_json_file(filtered_stmts, 'disease_map_indra_stmts_filtered.json')
##################
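# Loading the dumped results back (a minimal sketch using the file names
# written above; the pickle preserves the Statement objects exactly, while
# the JSON dump is portable across tools):
import pickle

with open('disease_map_indra_stmts_full.pkl', 'rb') as fh:
    all_stmts = pickle.load(fh)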
from indra.statements import stmts_to_json_file
import indra.tools.assemble_corpus as ac
from assemble_model import process_eidos, assemble_stmts

if __name__ == '__main__':
    stmts = process_eidos()
    stmts_to_json_file(stmts, 'eidos_500m_raw.json')
    stmts = assemble_stmts(stmts)
    stmts = ac.merge_groundings(stmts)
    stmts = ac.merge_deltas(stmts)
    stmts = ac.standardize_names_groundings(stmts)
    stmts_to_json_file(stmts, 'eidos_500m_assembled.json')
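# Quick sanity check on the two dumps written above (a sketch, assuming the
# script has already run): stmts_from_json_file is the loader counterpart to
# stmts_to_json_file.
from indra.statements import stmts_from_json_file

raw = stmts_from_json_file('eidos_500m_raw.json')
assembled = stmts_from_json_file('eidos_500m_assembled.json')
print('%d raw -> %d assembled statements' % (len(raw), len(assembled)))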
                    '@id': doc_id
                }
            }]
        else:
            prov = ev.annotations['provenance'][0]['document']
            prov['@id'] = doc_id
    stmts += pp.statements

if grounding == 'compositional':
    validate_grounding_format(stmts)

ap = AssemblyPipeline.from_json_file('assembly_%s.json' % grounding)
assembled_stmts = ap.run(stmts)

if do_upload:
    corpus_id = 'compositional_v4'
    stmts_to_json_file(assembled_stmts, '%s.json' % corpus_id)
    meta_data = {
        'corpus_id': corpus_id,
        'description': ('Assembly of 4 reader outputs with the '
                        'compositional ontology (%s).' % ont_url),
        'display_name': 'Compositional ontology assembly v3',
        'readers': readers,
        'assembly': {
            'level': 'grounding',
            'grounding_threshold': 0.6,
        },
        'num_statements':
all_stmts = []
for db_ns, db_id, name in tqdm.tqdm(groundings):
    if db_id in black_list:
        print('Skipping %s in black list' % name)
        continue
    print('Looking up %s' % name)
    db_stmts = get_db_stmts_by_grounding(db_ns, db_id)
    tas_stmts = get_tas_stmts(db_ns, db_id) if db_ns == 'HGNC' else []
    stmts = db_stmts + tas_stmts
    stmts = ac.filter_by_curation(stmts, db_curations)
    stmts = reground_stmts(stmts, grounding_map, misgrounding_map)
    all_stmts += stmts
all_stmts = make_unique_hashes(all_stmts)
all_stmts = ac.run_preassembly(all_stmts)

########################################
# Dump results
with open(f'disease_map_indra_stmts_full_{version}.pkl', 'wb') as fh:
    pickle.dump(all_stmts, fh)
stmts_to_json_file(all_stmts, f'disease_map_indra_stmts_full_{version}.json')
filtered_stmts = filter_prior_all(all_stmts, groundings)
with open(f'disease_map_indra_stmts_filtered_{version}.pkl', 'wb') as fh:
    pickle.dump(filtered_stmts, fh)
stmts_to_json_file(filtered_stmts,
                   f'disease_map_indra_stmts_filtered_{version}.json')
##################
def do_regrounding(stmts):
    # Collect the original text of every concept, in traversal order
    concepts = []
    for stmt in stmts:
        for concept in stmt.agent_list():
            concept_txt = concept.db_refs.get('TEXT')
            concepts.append(concept_txt)
    groundings = er.reground_texts(concepts)
    # Update the corpus with new groundings
    idx = 0
    for stmt in stmts:
        for concept in stmt.agent_list():
            concept.db_refs['UN'] = groundings[idx]
            idx += 1
    return stmts


if __name__ == '__main__':
    config = load_config()
    fnames = config['files']
    for fname in fnames:
        print('Processing %s' % fname)
        hp = hume.process_jsonld_file(fname)
        parts = fname.split('/')
        new_fname = '%s_%s' % (parts[-2], parts[-1])
        new_fname = new_fname.replace('json-ld', 'json')
        print('Running regrounding')
        stmts = do_regrounding(hp.statements)
        print('Saving into JSON')
        stmts_to_json_file(stmts, new_fname)
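# Note on do_regrounding: it relies on er.reground_texts returning exactly
# one grounding per input text, in input order, and on the second pass over
# stmt.agent_list() visiting concepts in the same order as the first. A
# cheap guard (a sketch, not part of the original script) would be:
#
#     assert len(groundings) == len(concepts)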