def get_statements(clauses, count=1000, do_stmt_count=True, db=None,
                   preassembled=True):
    """Retrieve the statements that match a set of query clauses.

    Parameters
    ----------
    clauses : list
        sqlalchemy WHERE clauses that are passed on to the filter query.
    count : int
        Batch size: rows are streamed from the query and converted into
        Statements `count` at a time.
    do_stmt_count : bool
        If True, count the matching rows first so that the progress
        messages can report a total.
    db : :py:class:`DatabaseManager`
        Database manager to query. When None, the primary database is used.
    preassembled : bool
        If True (the default) the 'pa_statements' table is queried,
        otherwise the 'statements' table.

    Returns
    -------
    list of Statements built from the database rows matching the query.
    """
    if db is None:
        db = db_util.get_primary_db()

    if preassembled:
        table_name = 'pa_statements'
    else:
        table_name = 'statements'
    query = db.filter_query(table_name, *clauses)

    if do_stmt_count:
        logger.info("Counting statements...")
        num_stmts = query.count()
        logger.info("Total of %d statements" % num_stmts)

    results = []
    batch = []
    for n_seen, row in enumerate(query.yield_per(count), start=1):
        batch.append(row)
        # Convert a full batch right away so raw rows do not pile up.
        if len(batch) == count:
            results.extend(db_util.make_stmts_from_db_list(batch))
            batch = []
        # Emit a progress message once per completed batch.
        if n_seen % count == 0:
            if do_stmt_count:
                logger.info("%d of %d statements" % (n_seen, num_stmts))
            else:
                logger.info("%d statements" % n_seen)

    # Convert whatever is left over from the final, partial batch.
    results.extend(db_util.make_stmts_from_db_list(batch))
    return results
def get_pa_statement_stats(fname=None, db=None):
    """Report statistics on the pre-assembled statements table.

    Reports the total number of rows in `db.PAStatements` and the number of
    rows per INDRA version. Every report is handed to `__report_stat`
    together with `fname`.

    Parameters
    ----------
    fname : str or None
        Passed through to `__report_stat` with every statistic.
    db : :py:class:`DatabaseManager`
        Database manager to query. When None, the primary database is used.
    """
    if db is None:
        db = get_primary_db()

    __report_stat('\nStatement Statistics:', fname)
    __report_stat('---------------------', fname)

    total = db.filter_query(db.PAStatements).count()
    __report_stat("Total number of statments: %d" % total, fname)

    # One (indra_version, row count) pair per version present in the table.
    version_counts = (
        db.session
        .query(db.PAStatements.indra_version,
               func.count(db.PAStatements.id))
        .group_by(db.PAStatements.indra_version)
        .all()
    )
    version_lines = ['%s: %d' % (version, num)
                     for version, num in version_counts]
    version_report = ("Number of statements by indra version:\n %s"
                      % '\n '.join(version_lines))
    __report_stat(version_report, fname)
def get_statements_by_paper(id_val, id_type='pmid', count=1000, db=None,
                            do_stmt_count=True):
    """Get the raw statements extracted from a particular paper.

    Only raw (not pre-assembled) statements are retrieved: the lookup always
    calls :py:func:`get_statements` with ``preassembled=False``.

    Parameters
    ----------
    id_val : int or str
        The value of the id of the paper whose statements are wanted.
    id_type : str
        The kind of id given in `id_val` (default 'pmid'). Options include
        pmid, pmcid, doi, pii, url, or manuscript_id.
    count : int
        Batch size passed on to :py:func:`get_statements`.
    db : :py:class:`DatabaseManager`
        Database manager to query. When None, the primary database is used.
    do_stmt_count : bool
        Passed on to :py:func:`get_statements`; whether to count the
        statements first for more meaningful progress messages.

    Returns
    -------
    A list of Statements for the paper, or None if no text ref ids were
    found for the given id.
    """
    if db is None:
        db = db_util.get_primary_db()

    text_ref_ids = _get_trids(db, id_val, id_type)
    if not text_ref_ids:
        return None

    def paper_clauses(trid):
        # Link text ref -> text content -> reading -> raw statement.
        return [
            db.TextContent.text_ref_id == trid,
            db.Readings.text_content_id == db.TextContent.id,
            db.Statements.reader_ref == db.Readings.id,
        ]

    found = []
    for trid in text_ref_ids:
        found += get_statements(paper_clauses(trid), count=count,
                                preassembled=False,
                                do_stmt_count=do_stmt_count, db=db)
    return found
def get_text_content_stats(fname=None, db=None):
    """Report statistics on the text content held in the database.

    The following are reported, each through `__report_stat` with `fname`:
    the total number of text content entries, the latest update time per
    source, how much content has at least one reading, the number of
    fulltext entries (in total and read), and content counts by source (in
    total and read).

    Parameters
    ----------
    fname : str or None
        Passed through to `__report_stat` with every statistic.
    db : :py:class:`DatabaseManager`
        Database manager to query. When None, the primary database is used.
    """
    if db is None:
        db = get_primary_db()

    # Join condition selecting text content that has at least one reading.
    tc_rdng_link = db.TextContent.id == db.Readings.text_content_id

    __report_stat("\nText Content statistics:", fname)
    __report_stat('------------------------', fname)

    tc_q = db.filter_query(db.TextContent)
    total_content = tc_q.count()
    # `fname` must be passed here like everywhere else; without it this
    # statistic does not receive the report target the caller asked for.
    __report_stat("Total number of text content entries: %d" % total_content,
                  fname)

    latest_updates = (
        db.session
        .query(db.Updates.source, func.max(db.Updates.datetime))
        .group_by(db.Updates.source)
        .all()
    )
    __report_stat(
        ("Latest updates:\n %s"
         % '\n '.join(['%s: %s' % (s, d) for s, d in latest_updates])),
        fname)

    tc_w_reading_q = tc_q.filter(tc_rdng_link)
    content_read = tc_w_reading_q.distinct().count()
    __report_stat("Total content read: %d" % content_read, fname)

    tc_fulltext_q = tc_q.filter(db.TextContent.text_type == 'fulltext')
    fulltext_content = tc_fulltext_q.distinct().count()
    __report_stat("Number of fulltext entries: %d" % fulltext_content, fname)

    tc_fulltext_read_q = tc_fulltext_q.filter(tc_rdng_link)
    fulltext_read = tc_fulltext_read_q.distinct().count()
    __report_stat("Number of fulltext entries read: %d" % fulltext_read,
                  fname)

    content_by_source = (
        db.session
        .query(db.TextContent.source, func.count(db.TextContent.id))
        .distinct()
        .group_by(db.TextContent.source)
        .all()
    )
    __report_stat(
        ("Content by source:\n %s"
         % '\n '.join(['%s: %d' % (s, n) for s, n in content_by_source])),
        fname)

    content_read_by_source = (
        db.session
        .query(db.TextContent.source, func.count(db.TextContent.id))
        .filter(tc_rdng_link)
        .distinct()
        .group_by(db.TextContent.source)
        .all()
    )
    __report_stat(
        ("Content read by source:\n %s"
         % '\n '.join(['%s: %d' % (s, n)
                       for s, n in content_read_by_source])),
        fname)
    return
def get_statements_stats(fname=None, db=None, indra_version=None):
    """Report statistics on the raw statements table.

    Reports the total number of raw statements and a count for every
    (reader, content source) pair. When `indra_version` is None, a breakdown
    by source database and by INDRA version is reported as well. Every
    report is handed to `__report_stat` together with `fname`.

    Parameters
    ----------
    fname : str or None
        Passed through to `__report_stat` with every statistic.
    db : :py:class:`DatabaseManager`
        Database manager to query. When None, the primary database is used.
    indra_version : str or None
        If given, the total and the per-reader counts are restricted to raw
        statements with this INDRA version.
    """
    if db is None:
        db = get_primary_db()

    # Join conditions: text content -> reading -> raw statement.
    content_to_reading = db.TextContent.id == db.Reading.text_content_id
    reading_to_stmt = db.Reading.id == db.RawStatements.reader_ref

    __report_stat('\nStatement Statistics:', fname)
    __report_stat('---------------------', fname)

    base_q = db.filter_query(db.RawStatements)
    if indra_version is not None:
        base_q = base_q.filter(
            db.RawStatements.indra_version == indra_version)
    __report_stat("Total number of statments: %d" % base_q.count(), fname)

    reader_rows = db.session.query(db.Reading.reader).distinct().all()
    source_rows = db.session.query(db.TextContent.source).distinct().all()

    # Each row is a 1-tuple, hence the unpacking in the loop headers.
    pair_lines = []
    for (reader,) in reader_rows:
        for (source,) in source_rows:
            pair_q = base_q.filter(reading_to_stmt,
                                   content_to_reading,
                                   db.Reading.reader == reader,
                                   db.TextContent.source == source)
            num = pair_q.distinct().count()
            pair_lines.append(' Statements from %s reading %s: %d\n'
                              % (reader, source, num))
    __report_stat(
        "Statements by reader and content source:\n%s" % ''.join(pair_lines),
        fname)

    if indra_version is None:
        by_database = (
            db.session
            .query(db.DBInfo.db_name, func.count(db.RawStatements.id))
            .filter(db.RawStatements.db_ref == db.DBInfo.id)
            .distinct()
            .group_by(db.DBInfo.db_name)
            .all()
        )
        database_lines = ['%s: %d' % (name, num) for name, num in by_database]
        __report_stat(
            "Statements by database:\n %s" % '\n '.join(database_lines),
            fname)

        by_version = (
            db.session
            .query(db.RawStatements.indra_version,
                   func.count(db.RawStatements.id))
            .group_by(db.RawStatements.indra_version)
            .all()
        )
        version_lines = ['%s: %d' % (ver, num) for ver, num in by_version]
        __report_stat(
            ("Number of statements by indra version:\n %s"
             % '\n '.join(version_lines)),
            fname)
def get_db_statistics(fname=None, db=None, tables=None):
    """Get statistics on the contents of the database.

    Parameters
    ----------
    fname : str or None
        Passed through to each of the per-table statistics functions.
    db : :py:class:`DatabaseManager`
        Database manager to query. When None, the primary database is used.
    tables : iterable of str or None
        Keys of the tables to report on: 'text_ref', 'text_content',
        'readings', 'statements', 'pa_statements'. Duplicates are ignored.
        When None, every table is reported. An unknown key raises KeyError.
    """
    if db is None:
        db = get_primary_db()

    reporters = {
        'text_ref': get_text_ref_stats,
        'text_content': get_text_content_stats,
        'readings': get_readings_stats,
        'statements': get_statements_stats,
        'pa_statements': get_pa_statement_stats,
    }

    # Get the statistics
    if tables is None:
        selected = reporters.values()
    else:
        # Resolved lazily, so reports preceding an unknown key still run
        # before the KeyError is raised.
        selected = (reporters[key] for key in set(tables))

    for report in selected:
        report(fname, db)
def get_text_ref_stats(fname=None, db=None):
    """Report statistics on the text refs held in the database.

    Reports how many text refs exist, how many have content, fulltext or an
    abstract, and how many have been read. Every report is handed to
    `__report_stat` together with `fname`.

    Parameters
    ----------
    fname : str or None
        Passed through to `__report_stat` with every statistic.
    db : :py:class:`DatabaseManager`
        Database manager to query. When None, the primary database is used.
    """
    if db is None:
        db = get_primary_db()

    # Join conditions: text ref -> text content -> reading.
    has_content = db.TextRef.id == db.TextContent.text_ref_id
    has_reading = db.TextContent.id == db.Readings.text_content_id

    __report_stat("Text ref statistics:", fname)
    __report_stat("--------------------", fname)

    all_refs_q = db.filter_query(db.TextRef)
    n_refs = all_refs_q.count()
    __report_stat('Total number of text refs: %d' % n_refs, fname)

    content_q = all_refs_q.filter(has_content)
    n_content = content_q.distinct().count()
    __report_stat('Total number of refs with content: %d' % n_content, fname)

    fulltext_q = content_q.filter(db.TextContent.text_type == 'fulltext')
    n_fulltext = fulltext_q.distinct().count()
    __report_stat('Number of refs with fulltext: %d' % n_fulltext, fname)

    abstract_q = content_q.filter(db.TextContent.text_type == 'abstract')
    n_abstract = abstract_q.distinct().count()
    __report_stat('Number of refs with abstract: %d' % n_abstract, fname)

    # Computed as refs with content minus refs with fulltext.
    n_only_abstract = n_content - n_fulltext
    __report_stat('Number of refs with only abstract: %d' % n_only_abstract,
                  fname)

    read_q = content_q.filter(has_reading)
    n_read = read_q.distinct().count()
    __report_stat('Number of refs that have been read: %d' % n_read, fname)

    fulltext_read_q = fulltext_q.filter(has_reading)
    n_fulltext_read = fulltext_read_q.distinct().count()
    __report_stat('Number of refs with fulltext read: %d' % n_fulltext_read,
                  fname)
def get_readings_stats(fname=None, db=None):
    """Report statistics on the readings held in the database.

    Reports the total number of readings and a count for every
    (reader version, content source) pair. Every report is handed to
    `__report_stat` together with `fname`.

    Parameters
    ----------
    fname : str or None
        Passed through to `__report_stat` with every statistic.
    db : :py:class:`DatabaseManager`
        Database manager to query. When None, the primary database is used.
    """
    if db is None:
        db = get_primary_db()

    __report_stat('\nReading statistics:', fname)
    __report_stat('-------------------', fname)

    n_readings = db.filter_query(db.Readings).count()
    __report_stat('Total number or readings: %d' % n_readings, fname)

    # There may be a neater way to do this with a group_by clause, but the
    # naive version miscounts because the rows are not distinct, so each
    # (reader version, source) pair gets its own query.
    version_rows = (db.session.query(db.Readings.reader_version)
                    .distinct().all())
    source_rows = db.session.query(db.TextContent.source).distinct().all()

    # Each row is a 1-tuple, hence the unpacking in the loop headers.
    pair_lines = []
    for (version,) in version_rows:
        for (source,) in source_rows:
            pair_q = db.filter_query(
                db.Readings,
                db.TextContent.id == db.Readings.text_content_id,
                db.TextContent.source == source,
                db.Readings.reader_version == version)
            num = pair_q.distinct().count()
            pair_lines.append(' Readings by %s from %s: %d\n'
                              % (version, source, num))

    __report_stat(
        ("Readings by reader version and content source:\n%s"
         % ''.join(pair_lines)),
        fname)
def get_statements_by_gene_role_type(agent_id=None, agent_ns='HGNC-SYMBOL',
                                     role=None, stmt_type=None, count=1000,
                                     db=None, do_stmt_count=True,
                                     preassembled=True):
    """Get statements from the DB by stmt type, agent, and/or agent role.

    Parameters
    ----------
    agent_id : str
        String representing the identifier of the agent from the given
        namespace. Note: if the agent namespace argument, `agent_ns`, is set
        to 'HGNC-SYMBOL', this function will treat `agent_id` as an HGNC gene
        symbol and perform an internal lookup of the corresponding HGNC ID.
    agent_ns : str
        Namespace for the identifier given in `agent_id`. Default is
        'HGNC-SYMBOL'.
    role : str
        String corresponding to the role of the agent in the statement.
        Options are 'SUBJECT', 'OBJECT', or 'OTHER' (in the case of
        `Complex`, `SelfModification`, and `ActiveForm` Statements).
    stmt_type : str
        Name of the Statement class.
    count : int
        Number of statements to retrieve in each batch (passed to
        :py:func:`get_statements`).
    db : :py:class:`DatabaseManager`
        Optionally specify a database manager that attaches to something
        besides the primary database, for example a local database instance.
    do_stmt_count : bool
        Whether or not to perform an initial statement counting step to give
        more meaningful progress messages.
    preassembled : bool
        If true, statements will be selected from the table of pre-assembled
        statements. Otherwise, they will be selected from the raw statements.
        Default is True.

    Returns
    -------
    list of Statements from the database corresponding to the query. An
    empty list is returned if `agent_id` is not a known HGNC symbol while
    `agent_ns` is 'HGNC-SYMBOL'.

    Raises
    ------
    ValueError
        If none of `agent_id`, `role`, or `stmt_type` is specified.
    """
    # Validate the arguments first, so that an unusable call fails fast
    # without connecting to (or depending on) any database.
    if not (agent_id or role or stmt_type):
        raise ValueError('At least one of agent_id, role, or stmt_type '
                         'must be specified.')

    if db is None:
        db = db_util.get_primary_db()

    if preassembled:
        Statements = db.PAStatements
        Agents = db.PAAgents
    else:
        Statements = db.Statements
        Agents = db.Agents

    clauses = []
    if agent_id and agent_ns == 'HGNC-SYMBOL':
        # Gene symbols are stored by HGNC ID, so translate the symbol first.
        hgnc_id = hgnc_client.get_hgnc_id(agent_id)
        if not hgnc_id:
            logger.warning('Invalid gene name: %s' % agent_id)
            return []
        clauses.extend([Agents.db_name.like('HGNC'),
                        Agents.db_id.like(hgnc_id)])
    elif agent_id:
        clauses.extend([Agents.db_name.like(agent_ns),
                        Agents.db_id.like(agent_id)])

    if role:
        clauses.append(Agents.role == role)

    # The agents table only needs to be joined in when it is filtered on.
    if agent_id or role:
        clauses.append(Agents.stmt_id == Statements.id)

    if stmt_type:
        clauses.append(Statements.type == stmt_type)

    stmts = get_statements(clauses, count=count, do_stmt_count=do_stmt_count,
                           db=db, preassembled=preassembled)
    return stmts
outputs = rdb.produce_readings({'trid': trids}, [reader_inst], read_mode='unread_unread', db=db, prioritize=True, verbose=self.verbose) logger.info("Made %d readings." % len(outputs)) logger.info("Making statements...") rdb.produce_statements(outputs, n_proc=self.n_proc, db=db) return if __name__ == '__main__': if args.test: db = get_test_db() else: db = get_primary_db() if args.method == 'local': bulk_managers = [ BulkLocalReadingManager(reader_name, buffer_days=args.buffer, n_proc=args.num_procs) for reader_name in ['SPARSER', 'REACH'] ] elif args.method == 'aws': bulk_managers = [ BulkAwsReadingManager(reader_name, buffer_days=args.buffer, project_name=args.project_name) for reader_name in ['SPARSER', 'REACH'] ]
from indra.preassembler import hierarchy_manager as hm from indra.preassembler import Preassembler as pa from indra.tools import assemble_corpus as ac from indra.sources.indra_db_rest import client_api as capi from indra.sources.indra_db_rest.client_api import IndraDBRestError from collections import defaultdict from math import ceil, log10 import itertools as itt import logging from indra.db import client as dbc from indra.db import util as dbu from sqlalchemy.exc import StatementError import pdb db_prim = dbu.get_primary_db() dnf_logger = logging.getLogger('DepMapFunctionsLogger') def agent_name_set(stmt): """Returns the list of agent names in a statement. stmt : :py:class:`indra.statements.Statement` Returns ------- ags : list[agent names] """ ags = [] try: ags.update(list(map(lambda ag: ag.name, stmt.agent_list()))) except AttributeError: