def _condense_statements(self, cleaned_stmts, mk_done, new_mk_set, uuid_sid_dict): self._log("Condense into unique statements...") new_unique_stmts = [] evidence_links = defaultdict(lambda: set()) agent_tuples = set() for s in cleaned_stmts: h = s.get_hash(refresh=True) # If this statement is new, make it. if h not in mk_done and h not in new_mk_set: new_unique_stmts.append(s.make_generic_copy()) new_mk_set.add(h) # Add the evidence to the dict. evidence_links[h].add(uuid_sid_dict[s.uuid]) # Add any db refs to the agents. ref_data, _, _ = extract_agent_data(s, h) agent_tuples |= set(ref_data) return new_unique_stmts, evidence_links, agent_tuples
def _build_test_set(): agents = [{ 'NAME': 'ERK', 'FPLX': 'ERK', 'TEXT': 'MAPK' }, { 'NAME': 'TP53', 'HGNC': '11998' }, { 'NAME': 'MEK', 'FPLX': 'MEK' }, { 'NAME': 'Vemurafenib', 'CHEBI': 'CHEBI:63637' }] stypes = ['Phosphorylation', 'Activation', 'Inhibition', 'Complex'] sources = [('medscan', 'rd'), ('reach', 'rd'), ('pc11', 'db'), ('signor', 'db')] mesh_ids = ['D000225', 'D002352', 'D015536'] mesh_combos = [] for num_mesh in range(0, 3): if num_mesh == 1: mesh_groups = [[mid] for mid in mesh_ids] else: mesh_groups = combinations(mesh_ids, num_mesh) mesh_combos.extend(list(mesh_groups)) random.shuffle(mesh_combos) source_data = [] for num_srcs in range(1, 5): if num_srcs == 1: src_iter = [[src] for src in sources] else: src_iter = combinations(sources, num_srcs) for src_list in src_iter: only_src = None if len(src_list) > 1 else src_list[0][0] has_rd = any(t == 'rd' for _, t in src_list) if has_rd: mesh_ids = mesh_combos[len(source_data) % len(mesh_combos)] else: mesh_ids = [] source_data.append({ 'sources': {src: random.randint(1, 50) for src, _ in src_list}, 'has_rd': any(t == 'rd' for _, t in src_list), 'has_db': any(t == 'db' for _, t in src_list), 'only_src': only_src, 'mesh_ids': mesh_ids }) random.shuffle(source_data) stmts = [ tuple(tpl) + (None, None) for tpl in product(stypes, permutations(agents, 2)) ] stmts += [('ActiveForm', (ref, ), activity, is_active) for activity, is_active, ref in product( ['transcription', 'activity'], [True, False], agents)] complex_pairs = [] name_meta_rows = [] name_meta_cols = ('mk_hash', 'ag_num', 'db_id', 'role_num', 'type_num', 'ev_count', 'activity', 'is_active', 'agent_count') text_meta_rows = [] text_meta_cols = ('mk_hash', 'ag_num', 'db_id', 'role_num', 'type_num', 'ev_count', 'activity', 'is_active', 'agent_count') other_meta_rows = [] other_meta_cols = ('mk_hash', 'ag_num', 'db_name', 'db_id', 'role_num', 'type_num', 'ev_count', 'activity', 'is_active', 'agent_count') source_meta_rows = [] source_meta_cols = ('mk_hash', 'reach', 'medscan', 'pc11', 'signor', 'ev_count', 'type_num', 'activity', 'is_active', 'agent_count', 'num_srcs', 'src_json', 'only_src', 'has_rd', 'has_db') mesh_meta_rows = [] mesh_meta_cols = ('mk_hash', 'ev_count', 'mesh_num', 'type_num', 'activity', 'is_active', 'agent_count') for stype, refs, activity, is_active in stmts: # Extract agents, and make a Statement. StmtClass = get_statement_by_name(stype) if stype == 'ActiveForm': ag = make_agent_from_ref(refs[0]) stmt = StmtClass(ag, activity=activity, is_active=is_active) else: ag1 = make_agent_from_ref(refs[0]) ag2 = make_agent_from_ref(refs[1]) if stype == 'Complex': if {ag1.name, ag2.name} in complex_pairs: continue stmt = StmtClass([ag1, ag2]) complex_pairs.append({ag1.name, ag2.name}) else: stmt = StmtClass(ag1, ag2) # Connect with a source. source_dict = source_data[len(source_meta_rows) % len(source_data)] ev_count = sum(source_dict['sources'].values()) src_row = (stmt.get_hash(), ) for src_name in ['reach', 'medscan', 'pc11', 'signor']: src_row += (source_dict['sources'].get(src_name), ) src_row += (ev_count, ro_type_map.get_int(stype), activity, is_active, len(refs), len(source_dict['sources']), json.dumps(source_dict['sources']), source_dict['only_src'], source_dict['has_rd'], source_dict['has_db']) source_meta_rows.append(src_row) # Add mesh rows for mesh_id in source_dict['mesh_ids']: mesh_meta_rows.append( (stmt.get_hash(), ev_count, int(mesh_id[1:]), ro_type_map.get_int(stype), activity, is_active, len(refs))) # Generate agent rows. ref_rows, _, _ = extract_agent_data(stmt, stmt.get_hash()) for row in ref_rows: row = row[:4] + (ro_role_map.get_int( row[4]), ro_type_map.get_int(stype), ev_count, activity, is_active, len(refs)) if row[2] == 'NAME': row = row[:2] + row[3:] name_meta_rows.append(row) elif row[2] == 'TEXT': row = row[:2] + row[3:] text_meta_rows.append(row) else: other_meta_rows.append(row) db = get_temp_db(clear=True) src_meta_cols = [{'name': col} for col, _ in sources] db.SourceMeta.load_cols(db.engine, src_meta_cols) for tbl in [ db.SourceMeta, db.MeshMeta, db.NameMeta, db.TextMeta, db.OtherMeta ]: tbl.__table__.create(db.engine) db.copy('readonly.source_meta', source_meta_rows, source_meta_cols) db.copy('readonly.mesh_meta', mesh_meta_rows, mesh_meta_cols) db.copy('readonly.name_meta', name_meta_rows, name_meta_cols) db.copy('readonly.text_meta', text_meta_rows, text_meta_cols) db.copy('readonly.other_meta', other_meta_rows, other_meta_cols) return db