def get_statements_stats(fname=None, db=None, indra_version=None):
    if db is None:
        db = get_primary_db()
    tc_rdng_link = db.TextContent.id == db.Reading.text_content_id
    stmt_rdng_link = db.Reading.id == db.RawStatements.reader_ref

    __report_stat('\nStatement Statistics:', fname)
    __report_stat('---------------------', fname)
    if indra_version is not None:
        filters = [db.RawStatements.indra_version == indra_version]
    else:
        filters = []
    total_raw_statements = db.count(db.RawStatements, *filters)
    __report_stat("Total number of raw statements: %d" % total_raw_statements,
                  fname)
    readers = db.session.query(db.Reading.reader).distinct().all()
    sources = db.session.query(db.TextContent.source).distinct().all()
    stats = ''
    for reader, in readers:
        for src, in sources:
            cnt = db.count(db.RawStatements, stmt_rdng_link, tc_rdng_link,
                           db.Reading.reader == reader,
                           db.TextContent.source == src, *filters)
            stats += ('    Raw statements from %s reading %s: %d\n'
                      % (reader, src, cnt))
    __report_stat("Statements by reader and content source:\n%s" % stats,
                  fname)
    _report_groups(db, db.RawStatements.id, db.DBInfo.db_name, fname,
                   db.RawStatements.db_info_id == db.DBInfo.id)
    if indra_version is None:
        _report_groups(db, db.RawStatements.id,
                       db.RawStatements.indra_version, fname)
    return
def main():
    parser = get_parser()
    args = parser.parse_args()

    if args.test:
        if 'test' not in args.database:
            from indra_db.tests.util import get_temp_db
            db = get_temp_db()
        else:
            db = get_db(args.database)
    elif args.database == 'primary':
        db = get_primary_db()
    else:
        db = get_db(args.database)

    readers = ['SPARSER', 'REACH', 'TRIPS', 'ISI', 'EIDOS', 'MTI']
    if args.method == 'local':
        bulk_manager = BulkLocalReadingManager(readers,
                                               buffer_days=args.buffer,
                                               n_procs=args.num_procs)
    elif args.method == 'aws':
        bulk_manager = BulkAwsReadingManager(readers,
                                             buffer_days=args.buffer,
                                             project_name=args.project_name)
    else:
        assert False, "This shouldn't be allowed."

    if args.task == 'read_all':
        bulk_manager.read_all(db)
    elif args.task == 'read_new':
        bulk_manager.read_new(db)
    return
def get_db_statistics(fname=None, db=None, tables=None):
    """Get statistics on the contents of the database."""
    if db is None:
        db = get_primary_db()

    task_dict = {
        'text_ref': get_text_ref_stats,
        'text_content': get_text_content_stats,
        'readings': get_readings_stats,
        'raw_statements': get_statements_stats,
        'pa_statements': get_pa_statement_stats
    }
    task_order = ['text_ref', 'text_content', 'readings', 'raw_statements',
                  'pa_statements']

    # Get the statistics
    if tables is None:
        for task_name in task_order:
            stat_meth = task_dict[task_name]
            stat_meth(fname, db)
    else:
        table_set = set(tables)
        for task_name in [tn for tn in task_order if tn in table_set]:
            task_dict[task_name](fname, db)
    return
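
# Usage sketch for get_db_statistics (hedged): the file name is a
# hypothetical placeholder; any subset of the task_order keys above may be
# passed as `tables` to restrict which reports are generated.
def _example_db_statistics():
    # Write statistics for just the text tables to a report file.
    get_db_statistics(fname='db_stats.txt',
                      tables=['text_ref', 'text_content'])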
def get_text_content_stats(fname=None, db=None):
    if db is None:
        db = get_primary_db()
    tc_rdng_link = db.TextContent.id == db.Reading.text_content_id
    __report_stat("\nText Content statistics:", fname)
    __report_stat('------------------------', fname)
    total_content = db.count(db.TextContent)
    __report_stat("Total number of text content entries: %d" % total_content,
                  fname)
    latest_updates = (db.session.query(db.Updates.source,
                                       func.max(db.Updates.datetime))
                      .group_by(db.Updates.source)
                      .all())
    __report_stat(("Latest updates:\n    %s"
                   % '\n    '.join(['%s: %s' % (s, d)
                                    for s, d in latest_updates])),
                  fname)
    content_read = db.count(db.Reading.text_content_id)
    __report_stat("Total content read: %d" % content_read, fname)
    fulltext_content = db.count(db.TextContent,
                                db.TextContent.text_type == 'fulltext')
    __report_stat("Number of fulltext entries: %d" % fulltext_content, fname)
    fulltext_read = db.count(db.TextContent,
                             db.TextContent.text_type == 'fulltext',
                             tc_rdng_link)
    __report_stat("Number of fulltext entries read: %d" % fulltext_read,
                  fname)
    _report_groups(db, db.TextContent.id, db.TextContent.source, fname)
    _report_groups(db, db.TextContent.id, db.TextContent.source, fname,
                   tc_rdng_link)
    return
def read_db_ids_search_terms(id_search_terms, id_type):
    """Return extracted EmmaaStatements from INDRA database given an
    ID-search term dict.

    Parameters
    ----------
    id_search_terms : dict
        A dict representing a set of IDs pointing to search terms that
        produced them.
    id_type : str
        The type of the IDs in the given dict (e.g. 'pmid').

    Returns
    -------
    list[:py:class:`emmaa.model.EmmaaStatement`]
        A list of EmmaaStatements extracted from the given IDs.
    """
    ids = list(id_search_terms.keys())
    date = datetime.datetime.utcnow()
    db = get_primary_db()
    id_stmts = get_raw_stmt_jsons_from_papers(ids, id_type=id_type, db=db)
    estmts = []
    for _id, stmt_jsons in id_stmts.items():
        stmts = stmts_from_json(stmt_jsons)
        for stmt in stmts:
            es = EmmaaStatement(stmt, date, id_search_terms[_id])
            estmts.append(es)
    return estmts
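
# Usage sketch (hedged): the IDs below are hypothetical, and the search-term
# values are only carried through to the EmmaaStatements, so plain strings
# stand in here for whatever term objects the caller tracks.
def _example_read_db_ids():
    id_terms = {'12345678': ['BRAF'], '23456789': ['MAP2K1']}
    estmts = read_db_ids_search_terms(id_terms, id_type='pmid')
    print('Extracted %d EmmaaStatements' % len(estmts))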
def get_db_agent_mod_stmts(filename, cached=True):
    # If a cached pickle is available, load and return it directly.
    if cached:
        with open(filename, 'rb') as fh:
            site_stmts = pickle.load(fh)
        return site_stmts

    def has_mod_agents(stmt):
        for agent in stmt.agent_list():
            if agent is not None:
                for mc in agent.mods:
                    if has_site_pos(mc):
                        return True
        return False

    def has_site_pos(mc):
        return mc.position is not None and mc.residue is not None

    batch_size = 100000
    db = get_primary_db()
    site_stmts = []
    for idx, db_stmt_batch in db.select_all_batched(
            batch_size, db.RawStatements,
            db.RawStatements.reading_id.isnot(None)):
        stmt_tuples = get_raw_stmts_frm_db_list(db, db_stmt_batch,
                                                fix_refs=False)
        stmts = [s[1] for s in stmt_tuples]
        for stmt in stmts:
            if has_mod_agents(stmt):
                site_stmts.append(stmt)
        print('Finished batch %d' % idx)
        print('Currently have %d site statements' % len(site_stmts))
    with open(filename, 'wb') as f:
        pickle.dump(site_stmts, f)
    return site_stmts
def read_db_pmid_search_terms(pmid_search_terms):
    """Return extracted EmmaaStatements from INDRA database given a
    PMID-search term dict.

    Parameters
    ----------
    pmid_search_terms : dict
        A dict representing a set of PMIDs pointing to search terms that
        produced them.

    Returns
    -------
    list[:py:class:`emmaa.model.EmmaaStatement`]
        A list of EmmaaStatements extracted from the given PMIDs.
    """
    pmids = list(pmid_search_terms.keys())
    date = datetime.datetime.utcnow()
    db = get_primary_db()
    pmid_stmts = get_statements_by_paper(pmids, id_type='pmid', db=db,
                                         preassembled=False)
    estmts = []
    for pmid, stmts in pmid_stmts.items():
        for stmt in stmts:
            es = EmmaaStatement(stmt, date, pmid_search_terms[pmid])
            estmts.append(es)
    return estmts
def get_statement_jsons_from_papers(paper_refs, db=None, **kwargs):
    """Get the statements from a list of papers.

    Parameters
    ----------
    paper_refs : list[(<id_type>, <paper_id>)]
        A list of tuples, where each tuple indicates an id-type (e.g.
        'pmid') and an id value for a particular paper.
    db : :py:class:`DatabaseManager`
        Optionally specify a database manager that attaches to something
        besides the primary database, for example a local database instance.

    Some keyword arguments are passed directly to a lower level function:

    Other Parameters (kwargs)
    -------------------------
    max_stmts : int or None
        Limit the number of statements queried. If None, no restriction is
        applied.
    offset : int or None
        Start reading statements at a given offset. If None, no offset is
        applied. Most commonly used in conjunction with `max_stmts`.
    ev_limit : int or None
        Limit the amount of evidence returned per Statement.
    best_first : bool
        If True, the preassembled statements will be sorted by the amount of
        evidence they have, and those with the most evidence will be
        prioritized. When using `max_stmts`, this means you will get the
        "best" statements. If False, statements will be queried in arbitrary
        order.

    Returns
    -------
    A dictionary data structure containing, among other metadata, a dict of
    statement jsons under the key 'statements', themselves keyed by their
    shallow matches-key hashes.
    """
    if db is None:
        db = get_primary_db()

    # Create a sub-query on the reading metadata.
    q = db.session.query(db.ReadingRefLink.rid.label('rid'))
    conditions = []
    for id_type, paper_id in paper_refs:
        tbl_attr = getattr(db.ReadingRefLink, id_type)
        if id_type in ['trid', 'tcid']:
            conditions.append(tbl_attr == paper_id)
        else:
            conditions.append(tbl_attr.like(paper_id))
    q = q.filter(or_(*conditions))
    sub_al = q.subquery('reading_ids')

    # Map the reading metadata query to mk_hashes with statement counts.
    mk_hashes_q = (db.session.query(db.PaMeta.mk_hash.label('mk_hash'),
                                    db.PaMeta.ev_count.label('ev_count'))
                   .filter(db.PaMeta.mk_hash == db.FastRawPaLink.mk_hash,
                           db.FastRawPaLink.reading_id == sub_al.c.rid))

    return _get_pa_stmt_jsons_w_mkhash_subquery(db, mk_hashes_q, **kwargs)
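
# Usage sketch (hedged): the paper ids below are hypothetical placeholders.
# Different id types may be mixed in one call; results come back under the
# 'statements' key, keyed by shallow matches-key hash as documented above.
def _example_stmts_from_papers():
    refs = [('pmid', '12345678'), ('pmcid', 'PMC1234567')]
    result = get_statement_jsons_from_papers(refs, max_stmts=100,
                                             ev_limit=10, best_first=True)
    print('Got %d statement jsons' % len(result['statements']))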
def get_statement_jsons_from_hashes(mk_hashes, db=None, **kwargs):
    """Get statement jsons using the appropriate hashes."""
    if db is None:
        db = get_primary_db()
    mk_hashes_q = (db.session.query(db.PaMeta.mk_hash, db.PaMeta.ev_count)
                   .filter(db.PaMeta.mk_hash.in_(mk_hashes)))
    return _get_pa_stmt_jsons_w_mkhash_subquery(db, mk_hashes_q, **kwargs)
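
# Usage sketch (hedged): the hash values are hypothetical placeholders for
# shallow matches-key hashes, e.g. as returned by the agent or paper queries
# above.
def _example_stmts_from_hashes():
    result = get_statement_jsons_from_hashes([-30134498802737337,
                                              1234567890123456],
                                             ev_limit=5)
    print('Got %d statement jsons' % len(result['statements']))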
def get_stmt_count_from_db():
    """Not recommended, very slow."""
    hgnc_entries = get_hgnc_entries()
    random.seed(1)
    random.shuffle(hgnc_entries)
    db = get_primary_db()
    CHECKPOINT_FILE = 'checkpoint.pkl'
    if os.path.exists(CHECKPOINT_FILE):
        print("Loading from checkpoint")
        with open(CHECKPOINT_FILE, 'rb') as f:
            start_ix, stmt_counts = pickle.load(f)
        if start_ix == len(hgnc_entries):
            return stmt_counts
    else:
        start_ix = 0
        stmt_counts = {}
    start = time.time()
    CHECKPOINT_INTERVAL = 100
    for ix in range(start_ix, len(hgnc_entries)):
        hgnc_name, hgnc_id = hgnc_entries[ix]
        # Save the state of the dict
        if ix != 0 and ix % CHECKPOINT_INTERVAL == 0:
            print("Saving checkpoint")
            with open(CHECKPOINT_FILE, 'wb') as f:
                pickle.dump((ix, stmt_counts), f)
        # Run the query
        q = db.filter_query(db.RawStatements,
                            db.RawAgents.stmt_id == db.RawStatements.id,
                            db.RawAgents.db_name.like('HGNC'),
                            db.RawAgents.db_id.like(str(hgnc_id)))
        # Get the statement count
        stmt_count = q.count()
        # Print some stats
        elapsed = time.time() - start
        time_per_gene = elapsed / (ix - start_ix + 1)
        num_remaining = len(hgnc_entries) - (ix + 1)
        sec_remaining = time_per_gene * num_remaining
        min_remaining = sec_remaining / 60.
        print("%d of %d: %d statements for %s (%s): Est %.2f min remaining"
              % (ix + 1, len(hgnc_entries), stmt_count, hgnc_name, hgnc_id,
                 min_remaining))
        # Put count into dict
        stmt_counts[hgnc_name] = stmt_count
    # Save final results
    with open(CHECKPOINT_FILE, 'wb') as f:
        pickle.dump((len(hgnc_entries), stmt_counts), f)
    return stmt_counts
def get_pa_statement_stats(fname=None, db=None):
    if db is None:
        db = get_primary_db()
    __report_stat('\nStatement Statistics:', fname)
    __report_stat('---------------------', fname)
    stmt_q = db.filter_query(db.PAStatements)
    __report_stat("Total number of statements: %d" % stmt_q.count(), fname)
    statements_produced_by_indra_version = \
        (db.session.query(db.PAStatements.indra_version,
                          func.count(db.PAStatements.id))
         .group_by(db.PAStatements.indra_version)
         .all())
    __report_stat(("Number of statements by indra version:\n    %s"
                   % '\n    '.join(
                       ['%s: %d' % (s, n) for s, n
                        in statements_produced_by_indra_version])),
                  fname)
    return
def get_statement_essentials(clauses, count=1000, db=None, preassembled=True):
    """Get the type, agents, and id data for the specified statements.

    This function is useful for light-weight searches of basic mechanistic
    information, without the need to follow as many links in the database to
    populate the Statement objects.

    To get full statements, use `get_statements`.

    Parameters
    ----------
    clauses : list
        list of sqlalchemy WHERE clauses to pass to the filter query.
    count : int
        Number of statements to retrieve and process in each batch.
    db : :py:class:`DatabaseManager`
        Optionally specify a database manager that attaches to something
        besides the primary database, for example a local database instance.
    preassembled : bool
        If true, statements will be selected from the table of pre-assembled
        statements. Otherwise, they will be selected from the raw statements.
        Default is True.

    Returns
    -------
    A list of tuples containing:
    `(uuid, sid, hash, type, (agent_1, agent_2, ...))`.
    """
    if db is None:
        db = get_primary_db()

    stmts_tblname = 'pa_statements' if preassembled else 'raw_statements'

    stmt_data = []
    db_stmts = db.select_all(stmts_tblname, *clauses, yield_per=count)
    for db_stmt in db_stmts:
        stmt = get_statement_object(db_stmt)
        sid = db_stmt.id if hasattr(db_stmt, 'id') else None
        stmt_data.append((db_stmt.uuid, sid, stmt.get_hash(shallow=True),
                          db_stmt.type, stmt.agent_list()))
    return stmt_data
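
# Usage sketch (hedged): select essentials for all preassembled
# Phosphorylation statements. The clause mirrors those used with
# get_statements below; unpacking follows the documented tuple layout.
def _example_statement_essentials():
    db = get_primary_db()
    data = get_statement_essentials(
        [db.PAStatements.type == 'Phosphorylation'], db=db)
    for uuid, sid, mk_hash, stmt_type, agents in data[:5]:
        print(stmt_type, agents)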
def get_curations(db=None, **params):
    """Get all curations for a certain level given certain criteria."""
    if db is None:
        db = get_primary_db()
    cur = db.Curation

    constraints = []
    for key, val in params.items():
        if key == 'hash_val':
            key = 'pa_hash'
        if key == 'ev_hash':
            key = 'source_hash'
        if isinstance(val, (list, set, tuple)):
            constraints.append(getattr(cur, key).in_(val))
        else:
            constraints.append(getattr(cur, key) == val)
    return db.select_all(cur, *constraints)
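
# Usage sketch (hedged): hash values are hypothetical. Note that `hash_val`
# and `ev_hash` are aliases for the pa_hash/source_hash columns, and list
# values become IN constraints.
def _example_get_curations():
    curs = get_curations(tag='grounding',
                         hash_val=[-12345678901234567, 9876543210987654])
    print('Found %d curations' % len(curs))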
def get_curator_counts(db=None):
    """Return a Counter of the number of curations submitted by each user.

    Parameters
    ----------
    db : Optional[DatabaseManager]
        A database manager object used to access the database. If not given,
        the database configured as primary is used.

    Returns
    -------
    collections.Counter
        A Counter of curator users by the number of curations they have
        submitted.
    """
    if db is None:
        db = get_primary_db()
    res = db.select_all(db.Curation)
    curators = [r.curator for r in res]
    counter = Counter(curators)
    return counter
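
# Usage sketch: print the five most prolific curators using the standard
# collections.Counter API on the returned value.
def _example_curator_counts():
    for curator, count in get_curator_counts().most_common(5):
        print('%s: %d' % (curator, count))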
def get_text_ref_stats(fname=None, db=None):
    if db is None:
        db = get_primary_db()
    tc_rdng_link = db.TextContent.id == db.Reading.text_content_id
    __report_stat("Text ref statistics:", fname)
    __report_stat("--------------------", fname)
    total_refs = db.count(db.TextRef)
    __report_stat('Total number of text refs: %d' % total_refs, fname)
    refs_with_content = db.count(db.TextContent.text_ref_id)
    __report_stat('Total number of refs with content: %d' % refs_with_content,
                  fname)
    refs_by_type = _report_groups(db, db.TextContent.text_ref_id,
                                  db.TextContent.text_type, fname)
    __report_stat(('Number of refs with only abstract: %d'
                   % (refs_with_content - refs_by_type['fulltext'])), fname)
    refs_with_reading = db.count(db.TextContent.text_ref_id, tc_rdng_link)
    __report_stat('Number of refs that have been read: %d'
                  % refs_with_reading, fname)
    _report_groups(db, db.TextContent.text_ref_id, db.TextContent.text_type,
                   fname, tc_rdng_link)
    return
def get_readings_stats(fname=None, db=None):
    if db is None:
        db = get_primary_db()

    __report_stat('\nReading statistics:', fname)
    __report_stat('-------------------', fname)
    total_readings = db.count(db.Reading)
    __report_stat('Total number of readings: %d' % total_readings, fname)
    # There may be a way to do this more neatly with a group_by clause,
    # however the naive way of doing it leaves us with a miscount due to
    # indistinct entries.
    reader_versions = (db.session.query(db.Reading.reader_version)
                       .distinct().all())
    sources = db.session.query(db.TextContent.source).distinct().all()
    stats = ''
    for rv, in reader_versions:
        for src, in sources:
            cnt = db.count(db.Reading,
                           db.TextContent.id == db.Reading.text_content_id,
                           db.TextContent.source == src,
                           db.Reading.reader_version == rv)
            stats += '    Readings by %s from %s: %d\n' % (rv, src, cnt)
    __report_stat("Readings by reader version and content source:\n%s"
                  % stats, fname)
    return
def get_support(statements, db=None, recursive=False):
    """Populate the supports and supported_by lists of the given statements."""
    warnings.warn(('This module is being taken out of service, as the tools '
                   'have become deprecated. Moreover, the service has been '
                   're-implemented to use newer tools as best as possible, '
                   'but some results may be subtly different.'),
                  DeprecationWarning)
    # TODO: Allow recursive mode (argument should probably be an integer
    # level)
    if db is None:
        db = get_primary_db()

    if not isinstance(statements, dict):
        stmt_dict = {s.get_hash(shallow=True): s for s in statements}
    else:
        stmt_dict = statements

    logger.info("Populating support links.")
    support_links = db.select_all(
        [db.PASupportLinks.supported_mk_hash,
         db.PASupportLinks.supporting_mk_hash],
        or_(db.PASupportLinks.supported_mk_hash.in_(stmt_dict.keys()),
            db.PASupportLinks.supporting_mk_hash.in_(stmt_dict.keys())))
    for supped_hash, supping_hash in set(support_links):
        if supped_hash == supping_hash:
            assert False, 'Self-support found on-load.'
        supped_stmt = stmt_dict.get(supped_hash)
        if supped_stmt is None:
            supped_stmt = Unresolved(shallow_hash=supped_hash)
        supping_stmt = stmt_dict.get(supping_hash)
        if supping_stmt is None:
            supping_stmt = Unresolved(shallow_hash=supping_hash)
        supped_stmt.supported_by.append(supping_stmt)
        supping_stmt.supports.append(supped_stmt)
    return
def submit_curation(hash_val, tag, curator, ip, api_key, text=None,
                    ev_hash=None, source='direct_client', db=None):
    """Submit a curation for a given preassembled or raw extraction.

    Parameters
    ----------
    hash_val : int
        The hash corresponding to the statement.
    tag : str
        A very short phrase categorizing the error or type of curation.
    curator : str
        The name or identifier for the curator.
    ip : str
        The ip address of the user's computer.
    api_key : str
        If you have one, this can help identify you as a curator, and may
        lend extra weight to your curation(s).
    text : str
        A brief description of the problem.
    ev_hash : int
        A hash of the sentence and other evidence information. Elsewhere
        referred to as `source_hash`.
    source : str
        The name of the access point through which the curation was
        performed. The default is 'direct_client', meaning this function was
        used directly. Any higher-level application should identify itself
        here.
    db : DatabaseManager
        A database manager object used to access the database.
    """
    if db is None:
        db = get_primary_db()

    inp = {'tag': tag, 'text': text, 'curator': curator, 'ip': ip,
           'source': source, 'pa_hash': hash_val, 'source_hash': ev_hash}

    auth = db._get_auth_info(api_key)
    if auth is None:
        raise NoAuthError(api_key, 'curation')
    inp['auth_id'] = auth[0]

    logger.info("Adding curation: %s" % str(inp))
    try:
        dbid = db.insert(db.Curation, **inp)
    except IntegrityError as e:
        logger.error("Got a bad entry.")
        msg = e.args[0]
        detail_line = msg.splitlines()[1]
        m = re.match(
            r"DETAIL: .*?\(pa_hash\)=\((\d+)\).*?not present.*?pa.*?",
            detail_line)
        if m is None:
            raise e
        else:
            h = m.groups()[0]
            assert int(h) == int(hash_val), \
                "Erred hash %s does not match input hash %s." % (h, hash_val)
            logger.error("Bad hash: %s" % h)
            raise BadHashError(h)
    return dbid
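
# Usage sketch (hedged): all values below are hypothetical. A valid api_key
# with a matching auth entry in the database is required, otherwise
# NoAuthError is raised as shown above.
def _example_submit_curation():
    dbid = submit_curation(hash_val=-30134498802737337, tag='correct',
                           curator='test_user', ip='127.0.0.1',
                           api_key='my-api-key',
                           text='Verified against the source sentence.')
    print('Inserted curation %s' % dbid)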
def get_statement_jsons_from_agents(agents=None, stmt_type=None, db=None,
                                    **kwargs):
    """Get json's for statements given agent refs and Statement type.

    Parameters
    ----------
    agents : list[(<role>, <id>, <namespace>)]
        A list of agents, each specified by a tuple of information including:
        the `role`, which can be 'subject', 'object', or None, an `id`, such
        as the HGNC id, a CHEMBL id, or a FPLX id, etc, and the `namespace`
        which specifies which of the above is given in `id`.

        Some examples:
            (None, 'MEK', 'FPLX')
            ('object', '11998', 'HGNC')
            ('subject', 'MAP2K1', 'TEXT')

        Note that you will get the logical AND of the conditions given, in
        other words, each Statement will satisfy all constraints.
    stmt_type : str or None
        The type of statement to retrieve, e.g. 'Phosphorylation'. If None,
        no type restriction is imposed.
    db : :py:class:`DatabaseManager`
        Optionally specify a database manager that attaches to something
        besides the primary database, for example a local database instance.

    Some keyword arguments are passed directly to a lower level function:

    Other Parameters (kwargs)
    -------------------------
    max_stmts : int or None
        Limit the number of statements queried. If None, no restriction is
        applied.
    offset : int or None
        Start reading statements at a given offset. If None, no offset is
        applied. Most commonly used in conjunction with `max_stmts`.
    ev_limit : int or None
        Limit the amount of evidence returned per Statement.
    best_first : bool
        If True, the preassembled statements will be sorted by the amount of
        evidence they have, and those with the most evidence will be
        prioritized. When using `max_stmts`, this means you will get the
        "best" statements. If False, statements will be queried in arbitrary
        order.

    Returns
    -------
    A dictionary data structure containing, among other metadata, a dict of
    statement jsons under the key 'statements', themselves keyed by their
    shallow matches-key hashes.
    """
    # First look for statements matching the role'd agents.
    if db is None:
        db = get_primary_db()

    # TODO: Extend this to allow retrieval of raw statements.
    mk_hashes_q = None
    mk_hash_c = db.PaMeta.mk_hash.label('mk_hash')
    ev_count_c = db.PaMeta.ev_count.label('ev_count')
    for role, ag_dbid, ns in agents:
        # Make the id match paradigms for the database.
        ag_dbid = regularize_agent_id(ag_dbid, ns)

        # Create this query (for this agent)
        q = (db.session.query(mk_hash_c, ev_count_c)
             .filter(db.PaMeta.db_id.like(ag_dbid),
                     db.PaMeta.db_name.like(ns)))
        if stmt_type is not None:
            q = q.filter(db.PaMeta.type.like(stmt_type))
        if role is not None:
            q = q.filter(db.PaMeta.role == role.upper())

        # Intersect with the previous query.
        if mk_hashes_q:
            mk_hashes_q = mk_hashes_q.intersect(q)
        else:
            mk_hashes_q = q
    assert mk_hashes_q, "No conditions imposed."

    return _get_pa_stmt_jsons_w_mkhash_subquery(db, mk_hashes_q, **kwargs)
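
# Usage sketch, following the tuple format documented above: find
# Phosphorylation statements with MAP2K1 (HGNC:6840) as subject and MAPK1
# (HGNC:6871) as object. The ids are real HGNC ids; the call is illustrative.
def _example_stmts_from_agents():
    agents = [('subject', '6840', 'HGNC'), ('object', '6871', 'HGNC')]
    res = get_statement_jsons_from_agents(agents,
                                          stmt_type='Phosphorylation',
                                          max_stmts=50, best_first=True)
    print('Got %d statement jsons' % len(res['statements']))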
                              prioritize=True, verbose=self.verbose)
        logger.info("Made %d readings." % len(outputs))
        logger.info("Making statements...")
        rdb.produce_statements(outputs, n_proc=self.n_proc, db=db)
        return


if __name__ == '__main__':
    if args.test:
        if 'test' not in args.database:
            db = get_test_db()
        else:
            db = get_db(args.database)
    elif args.database == 'primary':
        db = get_primary_db()
    else:
        db = get_db(args.database)

    if args.method == 'local':
        bulk_managers = [BulkLocalReadingManager(reader_name,
                                                 buffer_days=args.buffer,
                                                 n_proc=args.num_procs)
                         for reader_name in ['SPARSER', 'REACH']]
    elif args.method == 'aws':
        bulk_managers = [BulkAwsReadingManager(reader_name,
                                               buffer_days=args.buffer,
                                               project_name=args.project_name)
def get_statements(clauses, count=1000, do_stmt_count=False, db=None,
                   preassembled=True, with_support=False, fix_refs=True,
                   with_evidence=True):
    """Select statements according to a given set of clauses.

    Parameters
    ----------
    clauses : list
        list of sqlalchemy WHERE clauses to pass to the filter query.
    count : int
        Number of statements to retrieve and process in each batch.
    do_stmt_count : bool
        Whether or not to perform an initial statement counting step to give
        more meaningful progress messages.
    db : :py:class:`DatabaseManager`
        Optionally specify a database manager that attaches to something
        besides the primary database, for example a local database instance.
    preassembled : bool
        If true, statements will be selected from the table of pre-assembled
        statements. Otherwise, they will be selected from the raw statements.
        Default is True.
    with_support : bool
        Choose whether to populate the supports and supported_by list
        attributes of the Statement objects. Generally results in slower
        queries.
    with_evidence : bool
        Choose whether or not to populate the evidence list attribute of the
        Statements. As with `with_support`, setting this to True will take
        longer.
    fix_refs : bool
        The paper refs within the evidence objects are not populated in the
        database, and thus must be filled using the relations in the
        database. If True (default), the `pmid` field of each Statement
        Evidence object is set to the correct PMIDs, or None if no PMID is
        available. If False, the `pmid` field defaults to the value populated
        by the reading system.

    Returns
    -------
    list of Statements from the database corresponding to the query.
    """
    warnings.warn(('This module is being taken out of service, as the tools '
                   'have become deprecated. Moreover, the service has been '
                   're-implemented to use newer tools as best as possible, '
                   'but some results may be subtly different.'),
                  DeprecationWarning)
    cnt = count
    if db is None:
        db = get_primary_db()

    stmts_tblname = 'pa_statements' if preassembled else 'raw_statements'

    if not preassembled:
        stmts = []
        q = db.filter_query(stmts_tblname, *clauses)
        if do_stmt_count:
            logger.info("Counting statements...")
            num_stmts = q.count()
            logger.info("Total of %d statements" % num_stmts)
        db_stmts = q.yield_per(cnt)
        for subset in batch_iter(db_stmts, cnt):
            stmts.extend(get_raw_stmts_frm_db_list(db, subset,
                                                   with_sids=False,
                                                   fix_refs=fix_refs))
            if do_stmt_count:
                logger.info("%d of %d statements" % (len(stmts), num_stmts))
            else:
                logger.info("%d statements" % len(stmts))
    else:
        logger.info("Getting preassembled statements.")
        if with_evidence:
            # Get pairs of pa statements with their linked raw statements.
            clauses += [
                db.PAStatements.mk_hash == db.RawUniqueLinks.pa_stmt_mk_hash,
                db.RawStatements.id == db.RawUniqueLinks.raw_stmt_id
            ]
            pa_raw_stmt_pairs = \
                db.select_all([db.PAStatements, db.RawStatements],
                              *clauses, yield_per=cnt)
            stmt_dict = _process_pa_statement_res_wev(db, pa_raw_stmt_pairs,
                                                      count=cnt,
                                                      fix_refs=fix_refs)
        else:
            # Get just pa statements without their supporting raw
            # statement(s).
            pa_stmts = db.select_all(db.PAStatements, *clauses, yield_per=cnt)
            stmt_dict = _process_pa_statement_res_nev(pa_stmts, count=cnt)

        # Populate the supports/supported by fields.
        if with_support:
            get_support(stmt_dict, db=db)

        stmts = list(stmt_dict.values())
        logger.info("In all, there are %d pa statements." % len(stmts))

    return stmts
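
# Usage sketch (hedged): retrieve preassembled Phosphorylation statements
# with evidence and support links populated; the clause style matches the
# docstring above.
def _example_get_statements():
    db = get_primary_db()
    stmts = get_statements([db.PAStatements.type == 'Phosphorylation'],
                           db=db, with_evidence=True, with_support=True)
    print('Loaded %d statements' % len(stmts))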
def get_evidence(pa_stmt_list, db=None, fix_refs=True, use_views=True):
    """Fill in the evidence for a list of pre-assembled statements.

    Parameters
    ----------
    pa_stmt_list : list[Statement]
        A list of unique statements, generally drawn from the database
        pa_statement table (via `get_statements`).
    db : DatabaseManager instance or None
        An instance of a database manager. If None, defaults to the
        "primary" database, as defined in the db_config.ini file in
        .config/indra.
    fix_refs : bool
        The paper refs within the evidence objects are not populated in the
        database, and thus must be filled using the relations in the
        database. If True (default), the `pmid` field of each Statement
        Evidence object is set to the correct PMIDs, or None if no PMID is
        available. If False, the `pmid` field defaults to the value populated
        by the reading system.

    Returns
    -------
    None - modifications are made to the Statements "in-place".
    """
    warnings.warn(('This module is being taken out of service, as the tools '
                   'have become deprecated. Moreover, the service has been '
                   're-implemented to use newer tools as best as possible, '
                   'but some results may be subtly different.'),
                  DeprecationWarning)
    if db is None:
        db = get_primary_db()

    # Turn the list into a dict.
    stmt_dict = {s.get_hash(shallow=True): s for s in pa_stmt_list}

    if use_views:
        if fix_refs:
            raw_links = db.select_all(
                [db.FastRawPaLink.mk_hash, db.FastRawPaLink.raw_json,
                 db.FastRawPaLink.reading_id],
                db.FastRawPaLink.mk_hash.in_(stmt_dict.keys()))
            rel_refs = ['pmid', 'rid']
            ref_cols = [getattr(db.ReadingRefLink, k) for k in rel_refs]
        else:
            raw_links = db.select_all(
                [db.FastRawPaLink.mk_hash, db.FastRawPaLink.raw_json],
                db.FastRawPaLink.mk_hash.in_(stmt_dict.keys()))
        rid_ref_dict = {}
        myst_rid_rs_dict = defaultdict(list)
        for info in raw_links:
            if fix_refs:
                mk_hash, raw_json, rid = info
            else:
                mk_hash, raw_json = info
                rid = None
            json_dict = json.loads(raw_json.decode('utf-8'))
            ev_json = json_dict.get('evidence', [])
            assert len(ev_json) == 1, \
                "Raw statements must have one evidence, got %d." \
                % len(ev_json)
            ev = Evidence._from_json(ev_json[0])
            stmt_dict[mk_hash].evidence.append(ev)

            if fix_refs:
                ref_dict = rid_ref_dict.get(rid)
                if ref_dict is None:
                    myst_rid_rs_dict[rid].append(ev)
                    if len(myst_rid_rs_dict) >= 1000:
                        ref_data_list = db.select_all(
                            ref_cols,
                            db.ReadingRefLink.rid.in_(
                                myst_rid_rs_dict.keys()))
                        for pmid, rid in ref_data_list:
                            rid_ref_dict[rid] = pmid
                            for ev in myst_rid_rs_dict[rid]:
                                ev.pmid = pmid
                        myst_rid_rs_dict.clear()
                else:
                    ev.pmid = rid_ref_dict[rid]
    else:
        # Get the data from the database.
        raw_list = db.select_all(
            [db.PAStatements.mk_hash, db.RawStatements],
            db.PAStatements.mk_hash.in_(stmt_dict.keys()),
            db.PAStatements.mk_hash == db.RawUniqueLinks.pa_stmt_mk_hash,
            db.RawUniqueLinks.raw_stmt_id == db.RawStatements.id)

        # Note that this step depends on the ordering being maintained.
        mk_hashes, raw_stmt_objs = zip(*raw_list)
        raw_stmts = get_raw_stmts_frm_db_list(db, raw_stmt_objs, fix_refs,
                                              with_sids=False)
        raw_stmt_mk_pairs = zip(mk_hashes, raw_stmts)

        # Now attach the evidence.
        for mk_hash, raw_stmt in raw_stmt_mk_pairs:
            # Each raw statement can have just one piece of evidence.
            stmt_dict[mk_hash].evidence.append(raw_stmt.evidence[0])
    return
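
# Usage sketch (hedged): this pairing of get_statements (without evidence)
# followed by get_evidence is an assumed workflow, not one the source
# prescribes; the statement type is a placeholder.
def _example_get_evidence():
    db = get_primary_db()
    stmts = get_statements([db.PAStatements.type == 'Complex'], db=db,
                           with_evidence=False)
    get_evidence(stmts, db=db)
    print('First statement has %d evidences' % len(stmts[0].evidence))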