def get_statement_object(db_stmt):
    """Get an INDRA Statement object from a db_stmt."""
    if isinstance(db_stmt, bytes):
        jb = db_stmt
    else:
        jb = db_stmt.json
    return Statement._from_json(json.loads(jb.decode('utf-8')))
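# A minimal usage sketch for get_statement_object, assuming INDRA is
# installed; the Activation statement below is a hypothetical example used
# only to build raw JSON bytes of the kind stored in the database.
import json
from indra.statements import Activation, Agent

raw_bytes = json.dumps(
    Activation(Agent('BRAF'), Agent('MAP2K1')).to_json()).encode('utf-8')
stmt = get_statement_object(raw_bytes)
assert stmt.subj.name == 'BRAF'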
def test_conversion_keying():
    stmt_json = {
        "type": "Conversion",
        "subj": {
            "name": "inflammatory response",
            "db_refs": {}
        },
        "obj_from": [{
            "name": "KNG1",
            "db_refs": {
                "HGNC": "6383",
                "UP": "P01042"
            }
        }],
        "obj_to": [{
            "name": "Kallidin",
            "db_refs": {
                "SCHEM": "Kallidin"
            }
        }],
        "id": "d2361669-dfe5-45e0-914a-c96a82ad25fb"
    }
    stmt_list = [Statement._from_json(stmt_json)]
    stmt_list[0].agent_list()
    list(_get_keyed_stmts(stmt_list))
    return
def _choose_unique(not_duplicates, get_full_stmts, stmt_tpl_grp):
    """Choose one of the statements from a redundant set."""
    assert stmt_tpl_grp, "This cannot be empty."
    if len(stmt_tpl_grp) == 1:
        s_tpl = stmt_tpl_grp[0]
        duplicate_ids = set()
    else:
        stmt_tpl_set = set(stmt_tpl_grp)
        preferred_tpls = {tpl for tpl in stmt_tpl_set
                          if tpl[1] in not_duplicates}
        if not preferred_tpls:
            s_tpl = stmt_tpl_set.pop()
        elif len(preferred_tpls) == 1:
            s_tpl = preferred_tpls.pop()
        else:  # len(preferred_tpls) > 1
            assert False, \
                ("Duplicate deduplicated statements found: %s"
                 % str(preferred_tpls))
        duplicate_ids = {tpl[1] for tpl in stmt_tpl_set
                         if tpl[1] not in not_duplicates}

    if get_full_stmts:
        stmt_json = json.loads(s_tpl[2].decode('utf-8'))
        ret_stmt = Statement._from_json(stmt_json)
    else:
        ret_stmt = s_tpl[1]
    return ret_stmt, duplicate_ids
def get_statement(cls, cl_statement):
    """Get an INDRA Statement from cl-json."""
    stmt_json = cls.converter.cl_to_json(cl_statement)
    if not stmt_json:
        return None
    elif isinstance(stmt_json, list):
        return stmts_from_json(stmt_json)
    else:
        return Statement._from_json(stmt_json)
def _get_reading_statement_dict(db, clauses=None, get_full_stmts=True):
    """Get a nested dict of statements, keyed by ref, content, and reading."""
    # Construct the query for metadata from the database.
    q = (db.session.query(db.TextRef, db.TextContent.id,
                          db.TextContent.source, db.Reading.id,
                          db.Reading.reader_version, db.RawStatements.id,
                          db.RawStatements.json)
         .filter(db.RawStatements.reading_id == db.Reading.id,
                 db.Reading.text_content_id == db.TextContent.id,
                 db.TextContent.text_ref_id == db.TextRef.id))
    if clauses:
        q = q.filter(*clauses)

    # Prime some counters.
    num_duplicate_evidence = 0
    num_unique_evidence = 0

    # Populate a dict with all the data.
    stmt_nd = NestedDict()
    for tr, tcid, src, rid, rv, sid, sjson in q.yield_per(1000):
        # Back out the reader name.
        for reader, rv_list in reader_versions.items():
            if rv in rv_list:
                break
        else:
            raise Exception("rv %s not recognized." % rv)

        # Get the json for comparison and/or storage.
        stmt_json = json.loads(sjson.decode('utf8'))
        stmt = Statement._from_json(stmt_json)
        _set_evidence_text_ref(stmt, tr)

        # Hash the combined stmt and evidence matches key.
        stmt_hash = stmt.get_hash(shallow=False)

        # For convenience, get the endpoint statement dict.
        s_dict = stmt_nd[tr.id][src][tcid][reader][rv][rid]

        # Initialize the value to a set, and count duplicates.
        if stmt_hash not in s_dict.keys():
            s_dict[stmt_hash] = set()
            num_unique_evidence += 1
        else:
            num_duplicate_evidence += 1

        # Either store the statement, or the statement id.
        if get_full_stmts:
            s_dict[stmt_hash].add((sid, stmt))
        else:
            s_dict[stmt_hash].add((sid, None))

    # Report on the results.
    print("Found %d relevant text refs with statements." % len(stmt_nd))
    print("number of statement exact duplicates: %d" % num_duplicate_evidence)
    print("number of unique statements: %d" % num_unique_evidence)
    return stmt_nd
def dump_statements(self, db):
    tc_rows = set(self.text_content.values())
    tc_cols = ('text_ref_id', 'source', 'format', 'text_type', 'preprint')
    logger.info(f"Dumping {len(tc_rows)} text content.")
    db.copy_lazy('text_content', tc_rows, tc_cols)

    # Look up tcids for newly entered content.
    tcids = db.select_all(
        [db.TextContent.text_ref_id, db.TextContent.id],
        db.TextContent.text_ref_id.in_(self.statements.keys()),
        db.TextContent.format == 'xdd'
    )
    tcid_lookup = {trid: tcid for trid, tcid in tcids}

    # Compile reading and statements into rows.
    r_rows = set()
    r_cols = ('id', 'text_content_id', 'reader', 'reader_version',
              'format', 'batch_id')
    s_rows = set()
    rd_batch_id = db.make_copy_batch_id()
    stmt_batch_id = db.make_copy_batch_id()
    stmts = []
    for trid, trid_set in self.statements.items():
        for reader, stmt_list in trid_set.items():
            tcid = tcid_lookup[trid]
            reader_version = self.reader_versions[reader.upper()]
            reading_id = generate_reading_id(tcid, reader, reader_version)
            r_rows.add((reading_id, tcid, reader.upper(), reader_version,
                        'xdd', rd_batch_id))
            for sj in stmt_list:
                stmt = Statement._from_json(sj)
                stmts.append(stmt)
                sd = DatabaseStatementData(
                    stmt, reading_id, indra_version=self.indra_version
                )
                s_rows.add(sd.make_tuple(stmt_batch_id))

    logger.info(f"Dumping {len(r_rows)} readings.")
    db.copy_lazy('reading', r_rows, r_cols, commit=False)

    logger.info(f"Dumping {len(s_rows)} raw statements.")
    db.copy_lazy('raw_statements', s_rows, DatabaseStatementData.get_cols(),
                 commit=False)

    if len(stmts):
        insert_raw_agents(db, stmt_batch_id, stmts, verbose=False,
                          commit=False)

    update_rows = [(json.dumps(self.reader_versions), self.indra_version,
                    group.key[:-1])
                   for group in self.groups]
    db.copy('xdd_updates', update_rows,
            ('reader_versions', 'indra_version', 'day_str'))
    return
def insert_the_statements(self, input_tuples):
    print("Loading %d statements..." % len(input_tuples))
    cols = self.test_data['raw_statements']['cols'] + ('source_hash',)
    new_input_tuples = []
    for t in input_tuples:
        s = Statement._from_json(json.loads(t[-1].decode('utf-8')))
        t += (s.evidence[0].get_source_hash(),)
        new_input_tuples.append(t)

    self.test_db.copy('raw_statements', new_input_tuples, cols)
    print("Inserting agents...")
    dbu.insert_agents(self.test_db, 'raw')
    return
def _get_input_stmt_tuples(num_stmts):
    print("\tPrepping the raw statements...")
    stmt_tuples, col_names = _get_stmt_tuples(num_stmts)
    copy_col_names = ('uuid', 'mk_hash', 'type', 'indra_version', 'json',
                      'reading_id', 'db_info_id')
    copy_stmt_tuples = []
    for tpl in stmt_tuples:
        entry_dict = dict(zip(col_names, tpl))
        json_bytes = entry_dict['json']
        stmt = Statement._from_json(json.loads(json_bytes.decode('utf-8')))
        entry_dict['mk_hash'] = stmt.get_hash()
        ret_tpl = tuple([entry_dict[col] for col in copy_col_names])
        copy_stmt_tuples.append(ret_tpl)
    return copy_stmt_tuples, copy_col_names
def _json_str_to_stmts_dict(json_str):
    """Make a dict of statements keyed by their uuids from a json string.

    This function is the inverse of _stmts_dict_to_json_str().

    Parameters
    ----------
    json_str : str
        A json compatible string

    Returns
    -------
    stmt_dict : dict
        Dict with statements keyed by their uuids: {uuid: stmt}
    """
    stmt_jsons = json.loads(json_str)
    stmts = [Statement._from_json(s) for s in stmt_jsons]
    return {s.uuid: s for s in stmts}
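# A minimal usage sketch for _json_str_to_stmts_dict, assuming INDRA is
# installed; the Phosphorylation statement below is a hypothetical example.
# Statement.to_json() includes the uuid (as 'id'), so it survives the
# round trip.
import json
from indra.statements import Phosphorylation, Agent

stmt = Phosphorylation(Agent('MAP2K1'), Agent('MAPK1'))
json_str = json.dumps([stmt.to_json()])
stmt_dict = _json_str_to_stmts_dict(json_str)
assert stmt.uuid in stmt_dict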
def str_imp(o, uuid=None, other_stmt_keys=None):
    if o is None:
        return '~'

    cname = o.__class__.__name__
    if cname == 'TextRef':
        return ('<TextRef: trid: %s, pmid: %s, pmcid: %s>'
                % (o.id, o.pmid, o.pmcid))
    if cname == 'TextContent':
        return ('<TextContent: tcid: %s, trid: %s, src: %s>'
                % (o.id, o.text_ref_id, o.source))
    if cname == 'Reading':
        return ('<Reading: rid: %s, tcid: %s, reader: %s, rv: %s>'
                % (o.id, o.text_content_id, o.reader, o.reader_version))
    if cname == 'RawStatements':
        s = Statement._from_json(json.loads(o.json.decode()))
        s_str = ('<RawStmt: %s sid: %s, uuid: %s, type: %s, iv: %s, hash: %s>'
                 % (str(s), o.id, o.uuid, o.type, o.indra_version, o.mk_hash))
        if other_stmt_keys and s.get_hash(shallow=True) in other_stmt_keys:
            s_str = '+' + s_str
        if s.uuid == uuid:
            s_str = '*' + s_str
        return s_str
def get_filtered_db_stmts(db, get_full_stmts=False, clauses=None):
    """Get the set of statements/ids from databases minus exact duplicates."""
    # Only get the json if it's going to be used.
    if get_full_stmts:
        tbl_list = [db.RawStatements.json]
    else:
        tbl_list = [db.RawStatements.id]

    db_s_q = db.filter_query(tbl_list, db.RawStatements.db_info_id.isnot(None))

    # Add any other criterion specified at higher levels.
    if clauses:
        db_s_q = db_s_q.filter(*clauses)

    # Produce a generator of statement groups.
    db_stmt_data = db_s_q.yield_per(10000)
    if get_full_stmts:
        return {Statement._from_json(json.loads(s_json.decode('utf-8')))
                for s_json, in db_stmt_data}
    else:
        return {sid for sid, in db_stmt_data}
def _get_statement_object(db_stmt):
    """Get an INDRA Statement object from a db_stmt."""
    return Statement._from_json(json.loads(db_stmt.json.decode('utf-8')))
def _stmt_from_json(stmt_json_bytes):
    return Statement._from_json(json.loads(stmt_json_bytes.decode('utf-8')))
def distill_stmts_from_reading(db, get_full_stmts=False, clauses=None):
    """Get a corpus of statements from clauses and filter duplicate evidence.

    Note that this will only get statements from reading.

    Parameters
    ----------
    db : :py:class:`DatabaseManager`
        A database manager instance to access the database.
    get_full_stmts : bool
        By default (False), only Statement ids (the primary index of
        Statements on the database) are returned. However, if set to True,
        serialized INDRA Statements will be returned. Note that this will
        in general be VERY large in memory, and therefore should be used
        with caution.
    clauses : None or list of sqlalchemy clauses
        By default None. Specify sqlalchemy clauses to reduce the scope of
        statements, e.g. `clauses=[db.Statements.type == 'Phosphorylation']`
        or `clauses=[db.Statements.uuid.in_([<uuids>])]`.

    Returns
    -------
    stmt_nd : NestedDict
        A deeply nested recursive dictionary, carrying the metadata for the
        Statements.
    stmt_ret : set
        A set of either statement ids or serialized statements, depending on
        `get_full_stmts`.
    """
    # Construct the query for metadata from the database.
    q = (db.session.query(db.TextContent.text_ref_id, db.TextContent.id,
                          db.TextContent.source, db.Readings.id,
                          db.Readings.reader_version, db.Statements.id,
                          db.Statements.json)
         .filter(db.TextContent.id == db.Readings.text_content_id,
                 db.Readings.id == db.Statements.reader_ref))
    if clauses:
        q = q.filter(*clauses)

    # Specify sources of fulltext content, and order priorities.
    full_text_content = ['manuscripts', 'pmc_oa', 'elsevier']

    # Specify versions of readers, and preference.
    sparser_versions = ['sept14-linux\n', 'sept14-linux']
    reach_versions = ['61059a-biores-e9ee36', '1.3.3-61059a-biores-']

    # Prime some counters.
    num_duplicate_evidence = 0
    num_unique_evidence = 0

    # Populate a dict with all the data.
    stmt_nd = NestedDict()
    for trid, tcid, src, rid, rv, sid, sjson in q.yield_per(1000):
        # Back out the reader name.
        if rv in sparser_versions:
            reader = 'sparser'
        elif rv in reach_versions:
            reader = 'reach'
        else:
            raise Exception("rv %s not recognized." % rv)

        # Get the json for comparison and/or storage.
        stmt_json = json.loads(sjson.decode('utf8'))
        stmt = Statement._from_json(stmt_json)

        # Hash the combined stmt and evidence matches key.
        m_key = stmt.matches_key() + stmt.evidence[0].matches_key()
        stmt_hash = hash(m_key)

        # For convenience, get the endpoint statement dict.
        s_dict = stmt_nd[trid][src][tcid][reader][rv][rid]

        # Initialize the value to a set, and count duplicates.
        if stmt_hash not in s_dict.keys():
            s_dict[stmt_hash] = set()
            num_unique_evidence += 1
        else:
            num_duplicate_evidence += 1

        # Either store the statement, or the statement id.
        if get_full_stmts:
            s_dict[stmt_hash].add(stmt)
        else:
            s_dict[stmt_hash].add(sid)

    # Report on the results.
    print("Found %d relevant text refs with statements." % len(stmt_nd))
    print("number of statement exact duplicates: %d" % num_duplicate_evidence)
    print("number of unique statements: %d" % num_unique_evidence)

    # Now we filter and get the set of statements/statement ids.
    stmts = set()
    for trid, src_dict in stmt_nd.items():
        # Filter out unneeded fulltext.
        while sum([k != 'pubmed' for k in src_dict.keys()]) > 1:
            worst_src = min(src_dict,
                            key=lambda x: full_text_content.index(x[0]))
            del src_dict[worst_src]

        # Filter out the older reader versions.
        for reader, rv_list in [('reach', reach_versions),
                                ('sparser', sparser_versions)]:
            for rv_dict in src_dict.gets(reader):
                best_rv = max(rv_dict, key=lambda x: rv_list.index(x))

                # Take any one of the duplicates. Statements/Statement ids are
                # already grouped into sets of duplicates keyed by the
                # Statement and Evidence matches key hashes. We only want one
                # of each.
                stmts |= {(ev_hash, list(ev_set)[0])
                          for ev_hash, ev_set in rv_dict[best_rv].items()}

    return stmt_nd, stmts
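# A hypothetical usage sketch for distill_stmts_from_reading; it assumes a
# configured indra_db installation so that get_db('primary') returns a
# DatabaseManager, and reuses the clause example from the docstring above.
from indra_db import get_db

db = get_db('primary')
stmt_nd, stmt_set = distill_stmts_from_reading(
    db, get_full_stmts=False,
    clauses=[db.Statements.type == 'Phosphorylation'])
print("Distilled %d (evidence hash, statement id) pairs." % len(stmt_set))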
def dump_statements(self, db):
    from indra_db.reading.read_db import DatabaseStatementData, \
        generate_reading_id

    tc_rows = set(self.text_content.values())
    tc_cols = ('text_ref_id', 'source', 'format', 'text_type', 'preprint')
    logger.info(f"Dumping {len(tc_rows)} text content.")
    db.copy_lazy('text_content', tc_rows, tc_cols)

    # Look up tcids for newly entered content.
    tcids = db.select_all(
        [db.TextContent.text_ref_id, db.TextContent.id],
        db.TextContent.text_ref_id.in_(self.statements.keys()),
        db.TextContent.format == 'xdd'
    )
    tcid_lookup = {trid: tcid for trid, tcid in tcids}

    # Compile reading and statements into rows.
    r_rows = set()
    r_cols = ('id', 'text_content_id', 'reader', 'reader_version',
              'format', 'batch_id')
    s_rows = set()
    rd_batch_id = db.make_copy_batch_id()
    stmt_batch_id = db.make_copy_batch_id()
    stmts = []
    for trid, trid_set in self.statements.items():
        for reader, stmt_list in trid_set.items():
            tcid = tcid_lookup[trid]
            reader_version = self.reader_versions[reader.upper()]
            reading_id = generate_reading_id(tcid, reader, reader_version)
            r_rows.add((reading_id, tcid, reader.upper(), reader_version,
                        'xdd', rd_batch_id))
            for sj in stmt_list:
                stmt = Statement._from_json(sj)
                stmts.append(stmt)
                sd = DatabaseStatementData(
                    stmt, reading_id, indra_version=self.indra_version)
                s_rows.add(sd.make_tuple(stmt_batch_id))

    logger.info(f"Dumping {len(r_rows)} readings.")
    db.copy_lazy('reading', r_rows, r_cols, commit=False,
                 constraint='reading-uniqueness')

    logger.info(f"Dumping {len(s_rows)} raw statements.")
    skipped = db.copy_report_lazy('raw_statements', s_rows,
                                  DatabaseStatementData.get_cols(),
                                  commit=False)
    skipped_uuids = {t[DatabaseStatementData.get_cols().index('uuid')]
                     for t in skipped}
    new_stmts = [s for s in stmts if s.uuid not in skipped_uuids]
    if len(new_stmts):
        insert_raw_agents(db, stmt_batch_id, new_stmts, verbose=False,
                          commit=False)
    return