def test_prior_prob_one_two():
    """Check the prior belief of a statement with evidence [ev1, ev1, ev2].

    The expected value is built from the module-level ``default_probs``:
    the 'reach' random error enters squared, the 'trips' random error
    enters once, and each of the two sources adds its systematic error.
    """
    engine = BeliefEngine()
    rand_probs = default_probs['rand']
    syst_probs = default_probs['syst']
    reach_term = rand_probs['reach']**2 + syst_probs['reach']
    trips_term = rand_probs['trips'] + syst_probs['trips']
    expected = 1 - reach_term * trips_term
    stmt = Phosphorylation(None, Agent('a'), evidence=[ev1, ev1, ev2])
    # Before any scoring the statement is expected to carry belief 1
    assert stmt.belief == 1
    engine.set_prior_probs([stmt])
    assert stmt.belief == expected
def test_prior_prob_two_different():
    """Check the prior belief of a statement with evidence [ev1, ev2].

    The expected value is read from the engine's own ``prior_probs``
    table: (random + systematic) error for 'reach' times
    (random + systematic) error for 'trips', subtracted from 1.
    """
    engine = BeliefEngine()
    rand_probs = engine.prior_probs['rand']
    syst_probs = engine.prior_probs['syst']
    reach_term = rand_probs['reach'] + syst_probs['reach']
    trips_term = rand_probs['trips'] + syst_probs['trips']
    expected = 1 - reach_term * trips_term
    stmt = Phosphorylation(None, Agent('a'), evidence=[ev1, ev2])
    # Before any scoring the statement is expected to carry belief 1
    assert stmt.belief == 1
    engine.set_prior_probs([stmt])
    assert stmt.belief == expected
def test_wm_scorer():
    """Smoke-test the Eidos scorer from ``wm_scorer`` with a belief engine.

    Checks that 'hume' and 'biopax' are present in the scorer's random
    and systematic prior tables, respectively, then runs prior scoring on
    an Influence statement backed by a single 'eidos' evidence.
    """
    scorer = wm_scorer.get_eidos_scorer()
    stmt = Influence(Concept('a'), Concept('b'),
                     evidence=[Evidence(source_api='eidos')])
    # Sources other than Eidos must still be present in the map
    for err_type, source in (('rand', 'hume'), ('syst', 'biopax')):
        assert source in scorer.prior_probs[err_type]
    engine = BeliefEngine(scorer)
    # Only checks that scoring runs; no belief value is asserted
    engine.set_prior_probs([stmt])
def test_evidence_random_noise_prior():
    """Check random-noise priors at the source and source-subtype level.

    Three evidences are scored directly with
    ``evidence_random_noise_prior`` and then again through a
    ``BeliefEngine`` whose systematic error is zero for both sources, so
    each statement's belief is expected to be one minus its noise prior.
    """
    type_probs = {'biopax': 0.9, 'geneways': 0.2}
    subtype_probs = {
        'biopax': {'reactome': 0.4, 'biogrid': 0.2},
        'geneways': {'phosphorylate': 0.5, 'bind': 0.7},
    }

    def make_evidence(source_api, annotations):
        # All evidences share the same dummy metadata
        return Evidence(source_api=source_api, source_id=0, pmid=0,
                        text=None, epistemics={}, annotations=annotations)

    ev_geneways_bind = make_evidence('geneways', {'actiontype': 'bind'})
    ev_biopax_reactome = make_evidence('biopax',
                                       {'source_sub_id': 'reactome'})
    ev_biopax_pid = make_evidence('biopax', {'source_sub_id': 'pid'})

    cases = [
        # 'bind' has its own geneways subtype entry
        (ev_geneways_bind, 0.7),
        # 'reactome' has its own biopax subtype entry
        (ev_biopax_reactome, 0.4),
        # 'pid' has no biopax subtype entry; the expected value equals
        # the source-level biopax entry
        (ev_biopax_pid, 0.9),
    ]
    for ev, expected_noise in cases:
        assert (evidence_random_noise_prior(ev, type_probs, subtype_probs)
                == expected_noise)

    # Make sure this all still works when we go through the belief engine
    members = [Agent('a'), Agent('b')]
    statements = [Complex(members, evidence=ev) for ev, _ in cases]
    p = {'rand': type_probs, 'syst': {'biopax': 0, 'geneways': 0}}
    engine = BeliefEngine(p, subtype_probs)
    engine.set_prior_probs(statements)
    for stmt, (_, expected_noise) in zip(statements, cases):
        assert stmt.belief == 1 - expected_noise
def test_hierarchy_probs1():
    """Check hierarchy beliefs for a two-statement support chain.

    The statement on agent 'b' lists the statement on agent 'a' in its
    ``supports``. Starting from beliefs 0.5 and 0.8, the first is
    expected to stay at 0.5 and the second to become 0.9.
    """
    engine = BeliefEngine()
    stmt_a = Phosphorylation(None, Agent('a'), evidence=[ev1])
    stmt_b = Phosphorylation(None, Agent('b'), evidence=[ev2])
    stmt_b.supports = [stmt_a]
    stmt_a.supported_by = [stmt_b]
    stmt_a.belief = 0.5
    stmt_b.belief = 0.8
    engine.set_hierarchy_probs([stmt_a, stmt_b])
    for stmt, expected in ((stmt_a, 0.5), (stmt_b, 0.9)):
        assert stmt.belief == expected
def calculate_belief(stmts):
    """Score statements and return their beliefs keyed by hash string.

    A ``SimpleScorer`` with custom 'biopax' subtype probabilities backs
    the engine; prior and then hierarchy probabilities are set in place
    on the given statements.

    Parameters
    ----------
    stmts : iterable
        Statements to score; each must provide ``get_hash()`` and
        ``belief``.

    Returns
    -------
    dict
        Maps ``str(stmt.get_hash())`` to the statement's belief.
    """
    biopax_subtypes = {'pc11': 0.2, 'phosphosite': 0.01}
    scorer = SimpleScorer(subtype_probs={'biopax': biopax_subtypes})
    engine = BeliefEngine(scorer=scorer)
    engine.set_prior_probs(stmts)
    engine.set_hierarchy_probs(stmts)
    beliefs = {}
    for stmt in stmts:
        beliefs[str(stmt.get_hash())] = stmt.belief
    return beliefs
def test_default_probs_override():
    """Make sure default probs are overridden by constructor argument."""
    prior_probs = {'rand': {'assertion': 0.5}}
    scorer = SimpleScorer(prior_probs)
    # The engine is constructed but not inspected; the assertions below
    # look at the scorer's table only
    BeliefEngine(scorer)
    for err_type in ('rand', 'syst'):
        for source, prob in scorer.prior_probs[err_type].items():
            if (err_type, source) == ('rand', 'assertion'):
                # The single overridden entry
                assert prob == 0.5
                continue
            # Everything else must still equal the default
            assert default_probs[err_type][source] == prob
def test_hierarchy_probs3():
    """Check hierarchy beliefs when one statement supports two others.

    The statement on agent 'c' lists the statements on 'a' and 'b' in
    its ``supports``; beliefs are compared with ``assert_close_enough``.
    """
    engine = BeliefEngine()
    stmt_a = Phosphorylation(None, Agent('a'), evidence=[ev1])
    stmt_b = Phosphorylation(None, Agent('b'), evidence=[ev2])
    stmt_c = Phosphorylation(None, Agent('c'), evidence=[ev4])
    stmt_c.supports = [stmt_a, stmt_b]
    for stmt in (stmt_a, stmt_b):
        stmt.supported_by = [stmt_c]
    engine.set_hierarchy_probs([stmt_a, stmt_b, stmt_c])
    expectations = (
        (stmt_a, 1 - 0.35),
        (stmt_b, 1 - 0.35),
        (stmt_c, 1 - 0.35 * 0.35 * 0.21),
    )
    for stmt, expected in expectations:
        assert_close_enough(stmt.belief, expected)
def test_belief_calc_up_to_prior():
    """Prior scoring on mock statements yields one belief per statement,
    each strictly between 0 and 1."""
    engine = BeliefEngine()
    test_stmts = [
        MockStatement(1, [MockEvidence('sparser'), MockEvidence('reach')]),
        MockStatement(2, MockEvidence('biopax')),
        MockStatement(3, MockEvidence('signor')),
        MockStatement(4, MockEvidence('biogrid')),
        MockStatement(5, MockEvidence('bel')),
        MockStatement(6, [MockEvidence('phosphosite'),
                          MockEvidence('trips')]),
    ]
    engine.set_prior_probs(test_stmts)
    results = {}
    for stmt in test_stmts:
        results[stmt.matches_key()] = stmt.belief
    print(results)
    # A size mismatch would mean two statements shared a matches key
    n_results, n_stmts = len(results), len(test_stmts)
    assert n_results == n_stmts, (n_results, n_stmts)
    in_range = [0 < belief < 1 for belief in results.values()]
    assert all(in_range), 'Beliefs out of range.'
def test_default_probs_extend():
    """Make sure default probs are extended by constructor argument."""
    prior_probs = {'rand': {'new_source': 0.1}, 'syst': {'new_source': 0.05}}
    scorer = SimpleScorer(prior_probs)
    # The engine is constructed but not inspected; the assertions below
    # look at the scorer's table only
    BeliefEngine(scorer)
    # Expected value of the added source, per error type
    added = {'rand': 0.1, 'syst': 0.05}
    for err_type in ('rand', 'syst'):
        table = scorer.prior_probs[err_type]
        assert 'new_source' in table
        for source, prob in table.items():
            if source == 'new_source':
                assert prob == added[err_type]
            else:
                # Pre-existing sources keep their default value
                assert default_probs[err_type][source] == prob
def run_preassembly(stmts_in, **kwargs):
    """Preassemble a list of statements: duplicates first, then related.

    Parameters
    ----------
    stmts_in : list[indra.statements.Statement]
        The statements to preassemble.
    return_toplevel : Optional[bool]
        Passed on to the related-statement step. If True, only top-level
        statements are returned; if False, all statements are returned
        regardless of specificity. Default: True
    poolsize : Optional[int]
        Number of worker processes used to parallelize comparisons.
        None (default) means no parallelization. NOTE: parallelization
        is only available on Python 3.4 and above.
    size_cutoff : Optional[int]
        Groups with at least this many statements go to worker
        processes; smaller groups are compared in the parent process.
        Default: 100. Irrelevant without parallelization.
    save : Optional[str]
        Pickle file name for the final results (stmts_out).
    save_unique : Optional[str]
        Pickle file name for the unique statements.

    Returns
    -------
    stmts_out : list[indra.statements.Statement]
        The preassembled statements.
    """
    unique_pkl = kwargs.get('save_unique')
    related_options = {
        'save': kwargs.get('save'),
        'return_toplevel': kwargs.get('return_toplevel', True),
        'poolsize': kwargs.get('poolsize', None),
        'size_cutoff': kwargs.get('size_cutoff', 100)
    }
    be = BeliefEngine()
    pa = Preassembler(hierarchies, stmts_in)
    # The same preassembler and belief engine are used for both steps
    run_preassembly_duplicate(pa, be, save=unique_pkl)
    return run_preassembly_related(pa, be, **related_options)
def test_belief_calc_up_to_hierarchy():
    """Check how hierarchy scoring changes beliefs relative to the priors.

    After support links are populated and hierarchy probabilities set,
    every statement with a non-empty ``supports`` must have gained
    belief and every other statement must be unchanged.
    """
    engine = BeliefEngine()
    test_stmts = [
        MockStatement(1, [MockEvidence('sparser'), MockEvidence('reach')]),
        MockStatement(2, MockEvidence('biopax')),
        MockStatement(3, MockEvidence('signor')),
        MockStatement(4, MockEvidence('biogrid')),
        MockStatement(5, MockEvidence('bel')),
        MockStatement(6, [MockEvidence('phosphosite'),
                          MockEvidence('trips')]),
    ]
    engine.set_prior_probs(test_stmts)
    init_results = {}
    for stmt in test_stmts:
        init_results[stmt.matches_key()] = stmt.belief
    print(init_results)

    supp_links = [(1, 2), (1, 3), (2, 3), (1, 5), (4, 3)]
    populate_support(test_stmts, supp_links)
    engine.set_hierarchy_probs(test_stmts)
    results = {}
    for stmt in test_stmts:
        results[stmt.matches_key()] = stmt.belief
    print(results)

    # Test a couple very simple properties.
    n_results, n_stmts = len(results), len(test_stmts)
    assert n_results == n_stmts, (n_results, n_stmts)
    in_range = [0 < belief < 1 for belief in results.values()]
    assert all(in_range), 'Beliefs out of range.'

    # Test the change from the initial.
    deltas_dict = {}
    all_deltas_correct = True
    for stmt in test_stmts:
        key = stmt.matches_key()
        delta = stmt.belief - init_results[key]
        if stmt.supports:
            expected = 'increase'
            wrong = delta <= 0
        else:
            expected = 'no change'
            wrong = delta != 0
        if wrong:
            all_deltas_correct = False
        deltas_dict[key] = {'actual': delta, 'expected': expected}
    # The full table is the failure message, so every delta is visible
    assert all_deltas_correct, deltas_dict
def test_hierarchy_probs4():
    """Check hierarchy beliefs on a four-statement support graph.

    The statement on 'd' lists the other three in ``supports``; those on
    'b' and 'c' each list the one on 'a'. The statements on 'c' and 'd'
    use deep copies of ``ev1`` as evidence.
    """
    engine = BeliefEngine()
    stmt_a = Phosphorylation(None, Agent('a'), evidence=[ev1])
    stmt_b = Phosphorylation(None, Agent('b'), evidence=[ev2])
    stmt_c = Phosphorylation(None, Agent('c'), evidence=[deepcopy(ev1)])
    stmt_d = Phosphorylation(None, Agent('d'), evidence=[deepcopy(ev1)])

    stmt_d.supports = [stmt_a, stmt_b, stmt_c]
    stmt_c.supports = [stmt_a]
    stmt_b.supports = [stmt_a]
    stmt_a.supported_by = [stmt_b, stmt_c, stmt_d]
    stmt_b.supported_by = [stmt_d]
    stmt_c.supported_by = [stmt_d]

    engine.set_hierarchy_probs([stmt_a, stmt_b, stmt_c, stmt_d])
    expectations = (
        (stmt_a, 1-0.35),
        (stmt_b, 1-0.35*0.35),
        (stmt_c, 1-(0.05 + 0.3*0.3)),
        (stmt_d, 1-0.35*(0.05 + 0.3*0.3*0.3)),
    )
    for stmt, expected in expectations:
        assert_close_enough(stmt.belief, expected)
def test_negative_evidence():
    """Check prior beliefs when some or all evidences are negated.

    Three statements are scored with a custom 'new_source' prior: one
    with two negated and one plain evidence, one with three plain
    evidences, and one with three negated evidences. The last must end
    up with belief exactly 0.
    """
    prior_probs = {'rand': {'new_source': 0.1}, 'syst': {'new_source': 0.05}}

    def make_evidence(negated):
        return Evidence(source_api='new_source',
                        epistemics={'negated': negated})

    negation_patterns = (
        [True, True, False],
        [False, False, False],
        [True, True, True],
    )
    stmts = []
    for pattern in negation_patterns:
        evidences = [make_evidence(flag) for flag in pattern]
        stmts.append(Phosphorylation(None, Agent('a'), evidence=evidences))

    scorer = SimpleScorer(prior_probs)
    engine = BeliefEngine(scorer)
    engine.set_prior_probs(stmts)

    pr = prior_probs['rand']['new_source']
    ps = prior_probs['syst']['new_source']
    one_evidence = (1-pr)-ps
    two_evidences = (1-pr*pr)-ps
    three_evidences = (1-pr*pr*pr)-ps
    # One plain evidence in favour, discounted by two negated ones
    assert_close_enough(stmts[0].belief, one_evidence*(1-two_evidences))
    assert_close_enough(stmts[1].belief, three_evidences)
    assert stmts[2].belief == 0
def setup_belief():
    """Train a CountsScorer on the curated statements and wrap it in an
    engine.

    Returns
    -------
    tuple
        ``(be, test_stmts_copy, probs)``: a ``BeliefEngine`` using the
        trained scorer, a shallow copy of ``test_stmts_cur``, and column
        1 of the scorer's ``predict_proba`` output for those statements.
    """
    # Logistic regression model over all sources found in the statements
    cs = CountsScorer(LogisticRegression(),
                      CountsScorer.get_all_sources(test_stmts_cur))
    # Train on curated stmt data
    cs.fit(test_stmts_cur, y_arr_stmts_cur)
    # Reference predictions to compare against the beliefs the engine
    # sets
    probs = cs.predict_proba(test_stmts_cur)[:, 1]
    be = BeliefEngine(scorer=cs)
    # NOTE(review): the original comment says this shallow copy keeps the
    # global instances' beliefs from changing; a shallow copy shares the
    # contained statement objects, so confirm that is the intent.
    test_stmts_copy = copy(test_stmts_cur)
    return be, test_stmts_copy, probs
def run_preassembly(statements, hierarchies):
    """Filter, deduplicate and score statements; return the top level.

    Parameters
    ----------
    statements : list
        Input statements; those failing the grounding filter
        (score threshold 0.4) are dropped first.
    hierarchies : object
        Passed to the ``Preassembler`` constructor unchanged.

    Returns
    -------
    list
        The top-level statements after combining related ones.
    """
    print('%d total statements' % len(statements))
    grounded = ac.filter_grounded_only(statements, score_threshold=0.4)
    preassembler = Preassembler(hierarchies, grounded)
    engine = BeliefEngine()

    # Duplicates are merged first and receive prior beliefs
    unique_stmts = preassembler.combine_duplicates()
    print('%d unique statements' % len(unique_stmts))
    engine.set_prior_probs(unique_stmts)

    # All levels are requested so that hierarchy beliefs are set on
    # every statement; top-level filtering happens afterwards
    related_stmts = preassembler.combine_related(return_toplevel=False)
    engine.set_hierarchy_probs(related_stmts)

    top_stmts = ac.filter_top_level(related_stmts)
    print('%d top-level statements' % len(top_stmts))
    return top_stmts
def run_preassembly(statements, hierarchies):
    """Map, filter, preassemble and score statements for the demo corpus.

    Two pickles are written along the way: 'pi_mtg_demo_unfiltered.pkl'
    after mapping and 'pi_mtg_demo.pkl' with the final result.

    Parameters
    ----------
    statements : list
        Input statements.
    hierarchies : object
        Passed to the ``Preassembler`` constructor unchanged.

    Returns
    -------
    list
        Top-level statements after ``remove_contradicts`` has been
        applied with the conflicts found by the preassembler.
    """
    print('%d total statements' % len(statements))
    mapped = map_onto(statements)
    ac.dump_statements(mapped, 'pi_mtg_demo_unfiltered.pkl')

    # Filter to grounded only
    filtered = ac.filter_grounded_only(mapped, score_threshold=0.7)
    # Alternative, narrower topic filter kept from the original:
    #filtered = ac.filter_by_db_refs(filtered, 'UN',
    #    ['conflict', 'food_security', 'precipitation'], policy='one',
    #    match_suffix=True)
    un_topics = [
        'conflict', 'food_security', 'flooding', 'food_production',
        'human_migration', 'drought', 'food_availability', 'market',
        'food_insecurity'
    ]
    filtered = ac.filter_by_db_refs(filtered, 'UN', un_topics,
                                    policy='all', match_suffix=True)
    # assume_polarity is called for its effect on the statements; its
    # return value is not used
    assume_polarity(filtered)
    filtered = filter_has_polarity(filtered)

    preassembler = Preassembler(hierarchies, filtered)
    engine = BeliefEngine()
    unique_stmts = preassembler.combine_duplicates()
    print('%d unique statements' % len(unique_stmts))
    engine.set_prior_probs(unique_stmts)

    related_stmts = preassembler.combine_related(return_toplevel=False)
    engine.set_hierarchy_probs(related_stmts)
    #related_stmts = ac.filter_belief(related_stmts, 0.8)

    top_stmts = ac.filter_top_level(related_stmts)
    # Contradictions are searched among the top-level statements only
    preassembler.stmts = top_stmts
    print('%d top-level statements' % len(top_stmts))
    conflicts = preassembler.find_contradicts()
    top_stmts = remove_contradicts(top_stmts, conflicts)
    ac.dump_statements(top_stmts, 'pi_mtg_demo.pkl')
    return top_stmts
def test_hierarchy_probs4():
    """Check hierarchy beliefs on a four-statement graph with fixed
    starting beliefs.

    Only the first three statements are passed to
    ``set_hierarchy_probs``; the fourth is linked to them through
    ``supports``/``supported_by`` and its belief is asserted as well.
    """
    engine = BeliefEngine()
    stmt_a = Phosphorylation(None, Agent('a'), evidence=[ev1])
    stmt_b = Phosphorylation(None, Agent('b'), evidence=[ev2])
    stmt_c = Phosphorylation(None, Agent('c'), evidence=[ev3])
    stmt_d = Phosphorylation(None, Agent('d'), evidence=[ev1])

    stmt_d.supports = [stmt_a, stmt_b, stmt_c]
    stmt_c.supports = [stmt_a]
    stmt_b.supports = [stmt_a]
    stmt_a.supported_by = [stmt_b, stmt_c, stmt_d]
    stmt_b.supported_by = [stmt_d]
    stmt_c.supported_by = [stmt_d]

    # Starting beliefs
    stmt_a.belief = 0.5
    stmt_b.belief = 0.8
    stmt_c.belief = 0.2
    stmt_d.belief = 0.6

    engine.set_hierarchy_probs([stmt_a, stmt_b, stmt_c])
    expectations = (
        (stmt_a, 0.5),
        (stmt_b, 0.9),
        (stmt_c, 0.6),
        (stmt_d, 0.968),
    )
    for stmt, expected in expectations:
        assert stmt.belief == expected
def run_preassembly(stmts_in, **kwargs):
    """Preassemble a list of statements and log the related-step time.

    Parameters
    ----------
    stmts_in : list[indra.statements.Statement]
        The statements to preassemble.
    return_toplevel : Optional[bool]
        Passed on to the related-statement step. If True, only top-level
        statements are returned; if False, all statements are returned
        regardless of specificity. Default: True
    save : Optional[str]
        Pickle file name for the final results (stmts_out).
    save_unique : Optional[str]
        Pickle file name for the unique statements.

    Returns
    -------
    stmts_out : list[indra.statements.Statement]
        The preassembled statements.
    """
    unique_options = {'save': kwargs.get('save_unique')}
    related_options = {
        'save': kwargs.get('save'),
        'return_toplevel': kwargs.get('return_toplevel', True),
    }
    be = BeliefEngine()
    pa = Preassembler(hierarchies, stmts_in)
    run_preassembly_duplicate(pa, be, **unique_options)

    # Only the related-statement step is timed
    start = time.time()
    stmts_out = run_preassembly_related(pa, be, **related_options)
    elapsed = time.time() - start
    logger.debug("Time elapsed, run_preassembly_related: %s" % elapsed)
    return stmts_out
if __name__ == '__main__':
    # Command-line entry point:
    #     assemble_corpus.py <pickle_file> <output_folder>
    # Loads statements from the pickle file, runs grounding mapping,
    # sequence mapping and both preassembly steps, and caches each
    # intermediate result as a pickle inside the output folder.
    if len(sys.argv) < 3:
        logger.error('Usage: assemble_corpus.py <pickle_file> <output_folder>')
        # Exit with a non-zero status so shells and calling scripts can
        # tell that the run failed (a bare sys.exit() reports success).
        sys.exit(1)
    stmts_fname = sys.argv[1]
    out_folder = sys.argv[2]

    stmts = load_statements(stmts_fname)
    logger.info('All statements: %d' % len(stmts))

    # Grounding mapping, with agent renaming enabled
    cache_pkl = os.path.join(out_folder, 'mapped_stmts.pkl')
    options = {'save': cache_pkl, 'do_rename': True}
    stmts = map_grounding(stmts, **options)

    # Sequence mapping
    cache_pkl = os.path.join(out_folder, 'sequence_valid_stmts.pkl')
    options = {'save': cache_pkl}
    mapped_stmts = map_sequence(stmts, **options)

    # One belief engine and one preassembler are shared by both
    # preassembly steps
    be = BeliefEngine()
    pa = Preassembler(hierarchies, mapped_stmts)

    cache_pkl = os.path.join(out_folder, 'unique_stmts.pkl')
    options = {'save': cache_pkl}
    unique_stmts = run_preassembly_duplicate(pa, be, **options)

    cache_pkl = os.path.join(out_folder, 'top_stmts.pkl')
    options = {'save': cache_pkl}
    stmts = run_preassembly_related(pa, be, **options)
def test_default_probs():
    """Make sure default probs are set with empty constructor."""
    be = BeliefEngine()
    # The original loop compared default_probs with itself and could
    # never fail; the comparison must be against the engine's table.
    # NOTE(review): the table is read from ``be.scorer.prior_probs`` when
    # the engine has a ``scorer`` attribute and from ``be.prior_probs``
    # otherwise -- confirm which attribute this BeliefEngine version
    # exposes and simplify accordingly.
    engine_probs = getattr(be, 'scorer', be).prior_probs
    for err_type in ('rand', 'syst'):
        for k, v in default_probs[err_type].items():
            # A default missing from the engine raises KeyError, which
            # also fails the test
            assert engine_probs[err_type][k] == v
def run_assembly(stmts, folder, pmcid, background_assertions=None):
    """Run assembly on a list of statements, for a given PMCID.

    Parameters
    ----------
    stmts : list
        Statements to assemble.
    folder : str
        Base output folder. Index cards go under 'index_cards/' and all
        other outputs under 'other_outputs/', each prefixed by the PMCID.
    pmcid : str
        Identifier used in output file names and as the PMC override of
        the index card assembler.
    background_assertions : Optional[collection]
        If given, top-level statements contained in it are removed
        before output.

    Returns
    -------
    None
        Results are printed and written to files.
    """
    # Index cards are the scored submission; the other outputs are for
    # analysis and debugging
    indexcard_prefix = folder + '/index_cards/' + pmcid
    otherout_prefix = folder + '/other_outputs/' + pmcid

    # Grounding mapping: the TRIPS-specific map is merged into the
    # default (REACH-oriented) map. This updates the shared
    # default_grounding_map object in place.
    trips_gm = load_grounding_map('trips_grounding_map.csv')
    default_grounding_map.update(trips_gm)
    gm = GroundingMapper(default_grounding_map)
    renamed_agent_stmts = gm.rename_agents(gm.map_agents(stmts))

    # Keep only statements whose agents all pass is_protein_or_chemical
    grounded_stmts = [
        st for st in renamed_agent_stmts
        if all([is_protein_or_chemical(a) for a in st.agent_list()])
    ]

    pa = Preassembler(hierarchies)
    pa.add_statements(grounded_stmts)
    print('== %s ====================' % pmcid)
    print('%d statements collected in total.' % len(pa.stmts))

    unique_stmts = pa.combine_duplicates()
    print('%d statements after combining duplicates.' % len(unique_stmts))

    # Prior beliefs on unique statements, then hierarchy beliefs once
    # the statement hierarchy has been built
    belief_engine = BeliefEngine()
    belief_engine.set_prior_probs(pa.unique_stmts)
    related_stmts = pa.combine_related()
    belief_engine.set_hierarchy_probs(related_stmts)
    print('%d statements after combining related.' % len(related_stmts))

    # Mechanism linking, followed by beliefs for the linked statements
    linker = MechLinker(related_stmts)
    linked_stmts = linker.link_statements()
    belief_engine.set_linked_probs(linked_stmts)
    # Print linked statements for debugging purposes
    print('Linked\n=====')
    for linked in linked_stmts:
        print(linked.inferred_stmt.belief, linked.inferred_stmt)
    print('=============')

    # Second preassembly pass over the linker's statements plus the
    # inferred ones
    all_statements = linker.statements + \
        [linked.inferred_stmt for linked in linked_stmts]
    pa = Preassembler(hierarchies, all_statements)
    pa.combine_duplicates()
    related_stmts = pa.combine_related()

    # Remove top-level statements that came only from the prior
    if background_assertions is None:
        nonbg_stmts = related_stmts
    else:
        nonbg_stmts = [stmt for stmt in related_stmts
                       if stmt not in background_assertions]

    # Dump top-level statements in a pickle
    with open(otherout_prefix + '.pkl', 'wb') as fh:
        pickle.dump(nonbg_stmts, fh, protocol=2)

    flattened_evidence_stmts = flatten_evidence(nonbg_stmts)
    card_counter = 1
    # We don't limit the number of cards reported in this round
    card_lim = float('inf')
    top_stmts = []
    # The belief cutoff for statements
    belief_cutoff = 0.3

    # Highest belief first
    ranked_stmts = sorted(flattened_evidence_stmts,
                          key=lambda x: x.belief, reverse=True)
    for st in ranked_stmts:
        # NOTE(review): statements below the cutoff are only labelled
        # 'SKIP' in the printout; they still go through the steps below.
        # Confirm whether they were meant to be skipped.
        if st.belief >= belief_cutoff:
            print(st.belief, st)
        elif st.belief < belief_cutoff:
            print('SKIP', st.belief, st)

        if is_background_knowledge(st):
            print('This statement is background knowledge - skipping.')
            continue

        card_assembler = IndexCardAssembler([st], pmc_override=pmcid)
        card_assembler.make_model()
        # Not every statement can be assembled into an index card
        if not card_assembler.cards:
            continue
        card_assembler.save_model(indexcard_prefix +
                                  '-%d.json' % card_counter)
        card_counter += 1
        top_stmts.append(st)
        if card_counter > card_lim:
            break

    # Print the English-assembled model for debugging purposes
    ea = EnglishAssembler(top_stmts)
    print('=======================')
    print(ea.make_model())
    print('=======================')

    # Draw the statement graph
    graph = render_stmt_graph(nonbg_stmts)
    graph.draw(otherout_prefix + '_graph.pdf', prog='dot')
    # Write statement diagnostics
    print_stmts(pa.stmts, otherout_prefix + '_statements.tsv')
    print_stmts(related_stmts, otherout_prefix + '_related_statements.tsv')
def calculate_belief(stmts):
    """Score statements with a default engine and return their beliefs.

    Prior and then hierarchy probabilities are set in place on the given
    statements.

    Parameters
    ----------
    stmts : iterable
        Statements to score; each must provide ``matches_key()`` and
        ``belief``.

    Returns
    -------
    dict
        Maps each statement's ``matches_key()`` to its belief.
    """
    engine = BeliefEngine()
    engine.set_prior_probs(stmts)
    engine.set_hierarchy_probs(stmts)
    beliefs = {}
    for stmt in stmts:
        beliefs[stmt.matches_key()] = stmt.belief
    return beliefs
def test_prior_prob_assertion():
    """Belief stays at 1 after prior scoring of [ev1, ev1, ev2, ev3].

    NOTE(review): presumably ``ev3`` is an assertion-type evidence, as
    the test name suggests -- confirm against its definition.
    """
    engine = BeliefEngine()
    stmt = Phosphorylation(None, Agent('a'),
                           evidence=[ev1, ev1, ev2, ev3])
    assert stmt.belief == 1
    engine.set_prior_probs([stmt])
    assert stmt.belief == 1
def test_check_prior_probs():
    """Run prior scoring on a statement whose only evidence has
    source_api 'xxx'.

    NOTE(review): there is no assertion in the body; presumably the
    expected outcome (e.g. an exception for the unknown source) is
    checked by a decorator or harness outside this function -- confirm.
    """
    engine = BeliefEngine()
    unknown_source_ev = Evidence(source_api='xxx')
    stmt = Phosphorylation(None, Agent('ERK'), evidence=[unknown_source_ev])
    engine.set_prior_probs([stmt])
def update_beliefs():
    """Record curations for a corpus and optionally return new beliefs.

    Reads the JSON body of the current request:

    - ``corpus_id``: key into the module-level ``corpora`` mapping.
    - ``curations``: mapping of statement UUID to a correctness value
      (default: empty).
    - ``return_beliefs``: whether to recompute and return beliefs
      (default: False).

    Aborts with status 415 if the request has no JSON body and with 400
    if the corpus ID is unknown. Curations for UUIDs that are not in the
    corpus are logged and ignored.

    Returns
    -------
    A JSON response: an empty object if ``return_beliefs`` is falsy,
    otherwise a mapping of statement UUID to belief for the whole corpus.
    """
    payload = request.json
    if payload is None:
        abort(Response('Missing application/json header.', 415))

    # Get input parameters
    corpus_id = payload.get('corpus_id')
    curations = payload.get('curations', {})
    return_beliefs = payload.get('return_beliefs', False)

    # Get the right corpus
    try:
        corpus = corpora[corpus_id]
    except KeyError:
        abort(Response('The corpus_id "%s" is unknown.' % corpus_id, 400))
        return

    # Curation counts per source and per (source, extraction rule);
    # each entry is a two-element list: [correct, incorrect]
    prior_counts = {}
    subtype_counts = {}
    for uuid, correct in curations.items():
        # TODO: handle already existing curation
        stmt = corpus.statements.get(uuid)
        if stmt is None:
            logger.warning('%s is not in the corpus.' % uuid)
            continue
        corpus.curations[uuid] = correct
        # Every evidence of the statement is assumed to share the
        # statement's correctness
        idx = 0 if correct else 1
        for ev in stmt.evidence:
            extraction_rule = ev.annotations.get('found_by')
            if extraction_rule:
                # Score the specific extraction rule
                rule_counts = subtype_counts.setdefault(ev.source_api, {})
                counts = rule_counts.setdefault(extraction_rule, [0, 0])
            else:
                # No extraction rule: score the source as a whole
                counts = prior_counts.setdefault(ev.source_api, [0, 0])
            counts[idx] += 1

    # Finally, we update the scorer with the new curation counts
    scorer.update_counts(prior_counts, subtype_counts)

    # If no belief return is needed, we just stop here
    if not return_beliefs:
        return jsonify({})

    # Otherwise rerun the belief calculation on the corpus with the
    # updated scorer
    be = BeliefEngine(scorer)
    stmts = list(corpus.statements.values())
    be.set_prior_probs(stmts)
    # Curated statements get their curation value as belief, replacing
    # the computed prior
    for uuid, correct in corpus.curations.items():
        stmt = corpus.statements.get(uuid)
        if stmt is None:
            logger.warning('%s is not in the corpus.' % uuid)
            continue
        stmt.belief = correct
    return jsonify({st.uuid: st.belief for st in stmts})