Beispiel #1
0
def get_statements(clauses, count=1000, do_stmt_count=True, db=None,
                   preassembled=True):
    """Retrieve the statements matching a set of filter clauses.

    Rows are streamed from the query ``count`` at a time and converted into
    Statements one full batch at a time via
    ``db_util.make_stmts_from_db_list``; a final, possibly partial (or empty)
    batch is converted after the stream is exhausted.

    Parameters
    ----------
    clauses : list
        sqlalchemy WHERE clauses, unpacked into ``db.filter_query``.
    count : int
        Batch size: used both as the ``yield_per`` argument and as the number
        of rows converted (and logged) at a time.
    do_stmt_count : bool
        If True, run ``count()`` on the query first so that progress messages
        can include the total number of statements.
    db : :py:class:`DatabaseManager`
        Optionally specify a database manager that attaches to something
        besides the primary database, for example a local database instance.
        If None, ``db_util.get_primary_db()`` is used.
    preassembled : bool
        If True, query the 'pa_statements' table; otherwise query the
        'statements' table. Default is True.

    Returns
    -------
    list of Statements from the database corresponding to the query.
    """
    if db is None:
        db = db_util.get_primary_db()

    if preassembled:
        table_name = 'pa_statements'
    else:
        table_name = 'statements'

    query = db.filter_query(table_name, *clauses)
    if do_stmt_count:
        logger.info("Counting statements...")
        num_stmts = query.count()
        logger.info("Total of %d statements" % num_stmts)

    results = []
    batch = []
    for n_seen, db_stmt in enumerate(query.yield_per(count), start=1):
        batch.append(db_stmt)

        # Convert and flush the batch as soon as it is full.
        if len(batch) == count:
            results.extend(db_util.make_stmts_from_db_list(batch))
            batch = []

        # Emit a progress message once per completed batch.
        if n_seen % count == 0:
            if do_stmt_count:
                logger.info("%d of %d statements" % (n_seen, num_stmts))
            else:
                logger.info("%d statements" % n_seen)

    # Convert whatever is left over after the last full batch.
    results.extend(db_util.make_stmts_from_db_list(batch))
    return results
Beispiel #2
0
def get_pa_statement_stats(fname=None, db=None):
    """Report summary statistics for the pre-assembled statements table.

    Reports the total number of rows in ``db.PAStatements`` and the number of
    rows per ``indra_version``.

    Parameters
    ----------
    fname
        Forwarded unchanged to every ``__report_stat`` call; presumably the
        name of a file to write the report to (the helper is not visible
        here).
    db : :py:class:`DatabaseManager`
        Database manager to query. If None, ``get_primary_db()`` is used.
    """
    if db is None:
        db = get_primary_db()

    __report_stat('\nStatement Statistics:', fname)
    __report_stat('---------------------', fname)

    # Overall row count.
    total_query = db.filter_query(db.PAStatements)
    __report_stat("Total number of statments: %d" % total_query.count(), fname)

    # Row counts grouped by the indra version recorded on each row.
    per_version_query = db.session.query(db.PAStatements.indra_version,
                                         func.count(db.PAStatements.id))
    per_version = per_version_query.group_by(
        db.PAStatements.indra_version).all()
    version_lines = ['%s: %d' % (version, num) for version, num in per_version]
    __report_stat(
        "Number of statements by indra version:\n    %s"
        % '\n    '.join(version_lines),
        fname)
    return
Beispiel #3
0
def get_statements_by_paper(id_val, id_type='pmid', count=1000, db=None,
                            do_stmt_count=True):
    """Get the raw statements extracted from a particular paper.

    The paper id is resolved to text ref ids with ``_get_trids``; for each of
    them the raw statements linked through text content and readings are
    retrieved with :py:func:`get_statements`.

    Note: currently this can only retrieve raw statements, because of the
    partially implemented configuration of the pre-assembled Statement table.

    Parameters
    ----------
    id_val : int or str
        The value of the id for the paper whose statements you wish to retrieve.
    id_type : str
        The type of id used (default is pmid). Options include pmid, pmcid, doi,
        pii, url, or manuscript_id. Note that pmid is generally the best means
        of getting a paper.
    count : int
        Number of statements to retrieve in each batch (passed to
        :py:func:`get_statements`).
    db : :py:class:`DatabaseManager`
        Optionally specify a database manager that attaches to something
        besides the primary database, for example a local database instance.
    do_stmt_count : bool
        Whether or not to perform an initial statement counting step to give
        more meaningful progress messages.

    Returns
    -------
    A list of Statements from the database corresponding to the paper id
    given, or None if no text ref id was found for it.
    """
    if db is None:
        db = db_util.get_primary_db()

    trids = _get_trids(db, id_val, id_type)
    if not trids:
        return None

    def _clauses_for(trid):
        # Join text content -> readings -> raw statements for one text ref.
        return [
            db.TextContent.text_ref_id == trid,
            db.Readings.text_content_id == db.TextContent.id,
            db.Statements.reader_ref == db.Readings.id
        ]

    collected = []
    for trid in trids:
        found = get_statements(_clauses_for(trid), count=count,
                               preassembled=False,
                               do_stmt_count=do_stmt_count, db=db)
        collected.extend(found)
    return collected
Beispiel #4
0
def get_text_content_stats(fname=None, db=None):
    """Report summary statistics for the text content table.

    Reports, in order: the total number of text content entries, the latest
    update time per source, how much content has an associated reading, the
    number of fulltext entries (overall and read), and the content counts
    (overall and read) grouped by source.

    Parameters
    ----------
    fname
        Forwarded unchanged to every ``__report_stat`` call; presumably the
        name of a file to write the report to (the helper is not visible
        here).
    db : :py:class:`DatabaseManager`
        Database manager to query. If None, ``get_primary_db()`` is used.
    """
    if db is None:
        db = get_primary_db()

    # Join condition tying a text content entry to its readings.
    tc_rdng_link = db.TextContent.id == db.Readings.text_content_id

    __report_stat("\nText Content statistics:", fname)
    __report_stat('------------------------', fname)

    tc_q = db.filter_query(db.TextContent)
    total_content = tc_q.count()
    # fname is forwarded here as for every other line of the report, so this
    # statistic is handled the same way as the rest.
    __report_stat("Total number of text content entries: %d" % total_content,
                  fname)

    latest_updates = (db.session.query(db.Updates.source,
                                       func.max(db.Updates.datetime)).group_by(
                                           db.Updates.source).all())
    __report_stat(
        ("Latest updates:\n    %s" %
         '\n    '.join(['%s: %s' % (s, d) for s, d in latest_updates])), fname)

    # Content with at least one reading.
    tc_w_reading_q = tc_q.filter(tc_rdng_link)
    content_read = tc_w_reading_q.distinct().count()
    __report_stat("Total content read: %d" % content_read, fname)

    # Fulltext content, overall and restricted to entries with a reading.
    tc_fulltext_q = tc_q.filter(db.TextContent.text_type == 'fulltext')
    fulltext_content = tc_fulltext_q.distinct().count()
    __report_stat("Number of fulltext entries: %d" % fulltext_content, fname)
    tc_fulltext_read_q = tc_fulltext_q.filter(tc_rdng_link)
    fulltext_read = tc_fulltext_read_q.distinct().count()
    __report_stat("Number of fulltext entries read: %d" % fulltext_read, fname)

    # Per-source breakdowns.
    content_by_source = (db.session.query(
        db.TextContent.source,
        func.count(db.TextContent.id)).distinct().group_by(
            db.TextContent.source).all())
    __report_stat(
        ("Content by source:\n    %s" %
         '\n    '.join(['%s: %d' % (s, n) for s, n in content_by_source])),
        fname)
    content_read_by_source = (db.session.query(
        db.TextContent.source, func.count(
            db.TextContent.id)).filter(tc_rdng_link).distinct().group_by(
                db.TextContent.source).all())
    __report_stat(
        ("Content read by source:\n    %s" %
         '\n    '.join(['%s: %d' % (s, n)
                        for s, n in content_read_by_source])), fname)
    return
Beispiel #5
0
def get_statements_stats(fname=None, db=None, indra_version=None):
    """Report summary statistics for the raw statements table.

    Reports the total number of raw statements and a count for every
    (reader, content source) pair. When ``indra_version`` is None, it also
    reports counts per database source and per indra version.

    Parameters
    ----------
    fname
        Forwarded unchanged to every ``__report_stat`` call; presumably the
        name of a file to write the report to (the helper is not visible
        here).
    db : :py:class:`DatabaseManager`
        Database manager to query. If None, ``get_primary_db()`` is used.
    indra_version
        If not None, only raw statements whose ``indra_version`` equals this
        value are counted in the total and in the per-reader breakdown, and
        the per-database and per-version breakdowns are skipped.
    """
    if db is None:
        db = get_primary_db()

    # Join conditions: text content -> reading -> raw statement.
    tc_rdng_link = db.TextContent.id == db.Reading.text_content_id
    stmt_rdng_link = db.Reading.id == db.RawStatements.reader_ref

    __report_stat('\nStatement Statistics:', fname)
    __report_stat('---------------------', fname)

    stmt_q = db.filter_query(db.RawStatements)
    if indra_version is not None:
        stmt_q = stmt_q.filter(db.RawStatements.indra_version == indra_version)
    __report_stat("Total number of statments: %d" % stmt_q.count(), fname)

    # One count query per (reader, source) combination.
    readers = db.session.query(db.Reading.reader).distinct().all()
    sources = db.session.query(db.TextContent.source).distinct().all()
    pair_lines = []
    for (reader,) in readers:
        for (src,) in sources:
            pair_q = stmt_q.filter(
                stmt_rdng_link, tc_rdng_link, db.Reading.reader == reader,
                db.TextContent.source == src)
            cnt = pair_q.distinct().count()
            pair_lines.append('    Statements from %s reading %s: %d\n'
                              % (reader, src, cnt))
    __report_stat(
        "Statements by reader and content source:\n%s" % ''.join(pair_lines),
        fname)

    # The remaining breakdowns are only reported across all indra versions.
    if indra_version is not None:
        return

    by_db_source = (db.session.query(
        db.DBInfo.db_name, func.count(db.RawStatements.id)).filter(
            db.RawStatements.db_ref == db.DBInfo.id).distinct().group_by(
                db.DBInfo.db_name).all())
    db_lines = ['%s: %d' % (name, num) for name, num in by_db_source]
    __report_stat("Statements by database:\n    %s" % '\n    '.join(db_lines),
                  fname)

    by_version = (db.session.query(
        db.RawStatements.indra_version,
        func.count(db.RawStatements.id)).group_by(
            db.RawStatements.indra_version).all())
    version_lines = ['%s: %d' % (version, num) for version, num in by_version]
    __report_stat(
        "Number of statements by indra version:\n    %s"
        % '\n    '.join(version_lines),
        fname)
    return
Beispiel #6
0
def get_db_statistics(fname=None, db=None, tables=None):
    """Get statistics on the contents of the database.

    Parameters
    ----------
    fname
        Forwarded unchanged as the first argument of each statistics
        function.
    db : :py:class:`DatabaseManager`
        Database manager to query. If None, ``get_primary_db()`` is used.
    tables : iterable of str or None
        Which statistics to produce, as keys among 'text_ref',
        'text_content', 'readings', 'statements' and 'pa_statements'.
        Duplicates are collapsed; an unknown key raises a KeyError. If None,
        all of the statistics are produced.
    """
    if db is None:
        db = get_primary_db()

    reporters = {
        'text_ref': get_text_ref_stats,
        'text_content': get_text_content_stats,
        'readings': get_readings_stats,
        'statements': get_statements_stats,
        'pa_statements': get_pa_statement_stats
    }

    # Choose which reporters to run. The generator keeps each key lookup
    # lazy, so it happens right before the corresponding reporter is called.
    if tables is None:
        selected = reporters.values()
    else:
        selected = (reporters[key] for key in set(tables))

    for reporter in selected:
        reporter(fname, db)

    return
Beispiel #7
0
def get_text_ref_stats(fname=None, db=None):
    """Report summary statistics for the text ref table.

    Reports the total number of text refs and how many of them have content,
    fulltext content, abstract content, content that has been read, and
    fulltext content that has been read. The "only abstract" figure is
    computed as refs with content minus refs with fulltext.

    Parameters
    ----------
    fname
        Forwarded unchanged to every ``__report_stat`` call; presumably the
        name of a file to write the report to (the helper is not visible
        here).
    db : :py:class:`DatabaseManager`
        Database manager to query. If None, ``get_primary_db()`` is used.
    """
    if db is None:
        db = get_primary_db()

    # Join conditions: text ref -> text content -> reading.
    ref_to_content = db.TextRef.id == db.TextContent.text_ref_id
    content_to_reading = db.TextContent.id == db.Readings.text_content_id

    def emit(message):
        __report_stat(message, fname)

    emit("Text ref statistics:")
    emit("--------------------")

    all_refs_q = db.filter_query(db.TextRef)
    emit('Total number of text refs: %d' % all_refs_q.count())

    with_content_q = all_refs_q.filter(ref_to_content)
    n_with_content = with_content_q.distinct().count()
    emit('Total number of refs with content: %d' % n_with_content)

    with_fulltext_q = with_content_q.filter(
        db.TextContent.text_type == 'fulltext')
    n_with_fulltext = with_fulltext_q.distinct().count()
    emit('Number of refs with fulltext: %d' % n_with_fulltext)

    with_abstract_q = with_content_q.filter(
        db.TextContent.text_type == 'abstract')
    n_with_abstract = with_abstract_q.distinct().count()
    emit('Number of refs with abstract: %d' % n_with_abstract)

    # Derived figure: everything with content that has no fulltext.
    emit('Number of refs with only abstract: %d'
         % (n_with_content - n_with_fulltext))

    read_q = with_content_q.filter(content_to_reading)
    n_read = read_q.distinct().count()
    emit('Number of refs that have been read: %d' % n_read)

    fulltext_read_q = with_fulltext_q.filter(content_to_reading)
    n_fulltext_read = fulltext_read_q.distinct().count()
    emit('Number of refs with fulltext read: %d' % n_fulltext_read)
    return
Beispiel #8
0
def get_readings_stats(fname=None, db=None):
    """Report summary statistics for the readings table.

    Reports the total number of readings and a count for every
    (reader version, content source) pair.

    Parameters
    ----------
    fname
        Forwarded unchanged to every ``__report_stat`` call; presumably the
        name of a file to write the report to (the helper is not visible
        here).
    db : :py:class:`DatabaseManager`
        Database manager to query. If None, ``get_primary_db()`` is used.
    """
    if db is None:
        db = get_primary_db()

    __report_stat('\nReading statistics:', fname)
    __report_stat('-------------------', fname)

    all_readings_q = db.filter_query(db.Readings)
    __report_stat('Total number or readings: %d' % all_readings_q.count(),
                  fname)

    # A group_by clause might do this more neatly, but the naive version
    # miscounts because the rows are not distinct, so each
    # (reader version, source) pair gets its own distinct count query.
    versions = db.session.query(db.Readings.reader_version).distinct().all()
    sources = db.session.query(db.TextContent.source).distinct().all()
    pair_lines = []
    for (rv,) in versions:
        for (src,) in sources:
            pair_q = db.filter_query(
                db.Readings, db.TextContent.id == db.Readings.text_content_id,
                db.TextContent.source == src,
                db.Readings.reader_version == rv)
            cnt = pair_q.distinct().count()
            pair_lines.append('    Readings by %s from %s: %d\n'
                              % (rv, src, cnt))
    __report_stat(
        "Readings by reader version and content source:\n%s"
        % ''.join(pair_lines),
        fname)
    return
Beispiel #9
0
def get_statements_by_gene_role_type(agent_id=None, agent_ns='HGNC-SYMBOL',
                                     role=None, stmt_type=None, count=1000,
                                     db=None, do_stmt_count=True,
                                     preassembled=True):
    """Get statements from the DB by stmt type, agent, and/or agent role.

    Parameters
    ----------
    agent_id : str
        String representing the identifier of the agent from the given
        namespace. Note: if the agent namespace argument, `agent_ns`, is set
        to 'HGNC-SYMBOL', this function will treat `agent_id` as an HGNC gene
        symbol and perform an internal lookup of the corresponding HGNC ID.
    agent_ns : str
        Namespace for the identifier given in `agent_id`. Default is
        'HGNC-SYMBOL'.
    role : str
        String corresponding to the role of the agent in the statement.
        Options are 'SUBJECT', 'OBJECT', or 'OTHER' (in the case of `Complex`,
        `SelfModification`, and `ActiveForm` Statements).
    stmt_type : str
        Name of the Statement class.
    count : int
        Number of statements to retrieve in each batch (passed to
        :py:func:`get_statements`).
    db : :py:class:`DatabaseManager`
        Optionally specify a database manager that attaches to something
        besides the primary database, for example a local database instance.
    do_stmt_count : bool
        Whether or not to perform an initial statement counting step to give
        more meaningful progress messages.
    preassembled : bool
        If true, statements will be selected from the table of pre-assembled
        statements. Otherwise, they will be selected from the raw statements.
        Default is True.

    Returns
    -------
    list of Statements from the database corresponding to the query. An empty
    list is returned if `agent_id` cannot be resolved to an HGNC ID.

    Raises
    ------
    ValueError
        If none of `agent_id`, `role`, or `stmt_type` is given.
    """
    if db is None:
        db = db_util.get_primary_db()

    # Pick the statement/agent table pair matching the requested flavor.
    Statements, Agents = ((db.PAStatements, db.PAAgents) if preassembled
                          else (db.Statements, db.Agents))

    if not any((agent_id, role, stmt_type)):
        raise ValueError('At least one of agent_id, role, or stmt_type '
                         'must be specified.')

    clauses = []
    if agent_id:
        if agent_ns == 'HGNC-SYMBOL':
            # Gene symbols are stored by HGNC ID, so translate first.
            hgnc_id = hgnc_client.get_hgnc_id(agent_id)
            if not hgnc_id:
                logger.warning('Invalid gene name: %s' % agent_id)
                return []
            ns_pattern, id_pattern = 'HGNC', hgnc_id
        else:
            ns_pattern, id_pattern = agent_ns, agent_id
        clauses += [Agents.db_name.like(ns_pattern),
                    Agents.db_id.like(id_pattern)]

    if role:
        clauses.append(Agents.role == role)

    # Any agent-based constraint needs the agents joined to the statements.
    if agent_id or role:
        clauses.append(Agents.stmt_id == Statements.id)

    if stmt_type:
        clauses.append(Statements.type == stmt_type)

    return get_statements(clauses, count=count, do_stmt_count=do_stmt_count,
                          db=db, preassembled=preassembled)
Beispiel #10
0
        outputs = rdb.produce_readings({'trid': trids}, [reader_inst],
                                       read_mode='unread_unread',
                                       db=db,
                                       prioritize=True,
                                       verbose=self.verbose)
        logger.info("Made %d readings." % len(outputs))
        logger.info("Making statements...")
        rdb.produce_statements(outputs, n_proc=self.n_proc, db=db)
        return


if __name__ == '__main__':
    # Work against the test database when requested, else the primary one.
    db = get_test_db() if args.test else get_primary_db()

    # Both back ends get one manager per reader, in this order.
    reader_names = ('SPARSER', 'REACH')

    if args.method == 'local':
        bulk_managers = [
            BulkLocalReadingManager(name, buffer_days=args.buffer,
                                    n_proc=args.num_procs)
            for name in reader_names
        ]
    elif args.method == 'aws':
        bulk_managers = [
            BulkAwsReadingManager(name, buffer_days=args.buffer,
                                  project_name=args.project_name)
            for name in reader_names
        ]
from indra.preassembler import hierarchy_manager as hm
from indra.preassembler import Preassembler as pa
from indra.tools import assemble_corpus as ac
from indra.sources.indra_db_rest import client_api as capi
from indra.sources.indra_db_rest.client_api import IndraDBRestError
from collections import defaultdict
from math import ceil, log10
import itertools as itt
import logging
from indra.db import client as dbc
from indra.db import util as dbu
from sqlalchemy.exc import StatementError
import pdb
# Database handle returned by dbu.get_primary_db(), obtained once when this
# module is imported.
db_prim = dbu.get_primary_db()
# Logger for this module, registered under the name 'DepMapFunctionsLogger'.
dnf_logger = logging.getLogger('DepMapFunctionsLogger')


def agent_name_set(stmt):
    """Returns the list of agent names in a statement.

    stmt : :py:class:`indra.statements.Statement`

    Returns
    -------
    ags : list[agent names]

    """
    ags = []
    try:
        ags.update(list(map(lambda ag: ag.name, stmt.agent_list())))
    except AttributeError: