Exemple #1
0
def get_statements_from_hashes(statement_hashes,
                               preassembled=True,
                               db=None,
                               **kwargs):
    """Retrieve statement objects given only statement hashes.

    WARNING: This function will be removed in the future. Please look to
    indra_db.client.readonly.query and indra_db.client.principal.raw_statements
    for alternatives.

    Parameters
    ----------
    statement_hashes : iterable
        The statement hashes to look up. They are passed unchanged to
        `HasHash`.
    preassembled : bool
        Must be True (the default). Looking up raw statements through this
        function is no longer supported.
    db : Optional[DatabaseManager]
        The database handle the query is run against. If None, the handle
        returned by `get_ro('primary')` is used.
    **kwargs
        Only `with_evidence` is consulted: when it is explicitly False the
        query is run with an evidence limit of 0. Any other keyword is
        accepted and ignored.

    Returns
    -------
    The statements produced by the query, i.e. `result.statements()`.

    Raises
    ------
    DeprecationWarning
        If `preassembled` is False.
    """
    # stacklevel=2 attributes the warning to the calling code, which is the
    # code that actually needs to migrate.
    warnings.warn(('This module is being taken out of service, as the tools '
                   'have become deprecated. Moreover, the service has been '
                   're-implemented to use newer tools as best as possible, '
                   'but some results may be subtly different.'),
                  DeprecationWarning, stacklevel=2)

    # Reject unsupported usage before acquiring a database handle, so an
    # invalid call never opens a connection it will not use.
    if not preassembled:
        raise DeprecationWarning("This functionality is no longer supported. "
                                 "indra_db.client.principal.raw_statements "
                                 "has more functional features to search for "
                                 "raw statements.")

    if db is None:
        db = get_ro('primary')

    # Only an explicit `with_evidence=False` suppresses evidence; a missing
    # key (or any other value) leaves the evidence limit unset.
    ev_lim = 0 if kwargs.get('with_evidence') is False else None

    query = HasHash(statement_hashes)
    result = query.get_statements(db, ev_limit=ev_lim)
    return result.statements()
Exemple #2
0
def get_ro_source_info():
    """Build a metadata dict for every source named by the readonly database.

    The source names are read from the primary readonly database via
    `get_source_names()`, and each one is looked up in INDRA's `SOURCE_INFO`.

    Returns
    -------
    dict
        Keyed by the readonly source name. Each value is a dict holding the
        source name under 'id' plus every entry of the matching `SOURCE_INFO`
        record. For 'eidos' the 'domain' entry is set to 'biology'.
    """
    from indra.sources import SOURCE_INFO
    from indra_db import get_ro

    # Readonly source names that are spelled differently from their
    # SOURCE_INFO key; every other name is used for the lookup as-is.
    info_key_by_alias = {
        'vhn': 'virhostnet',
        'bel_lc': 'bel',
        'pe': 'phosphoelm',
        'psp': 'phosphosite',
    }

    ro = get_ro('primary')
    sources = {}
    for src_id in ro.get_source_names():
        info_key = info_key_by_alias.get(src_id, src_id)

        # Start from the id so that SOURCE_INFO values take precedence if
        # the record happens to carry an 'id' entry of its own.
        src_info = {'id': src_id}
        src_info.update(SOURCE_INFO[info_key])

        # The eidos entry is always reported under the biology domain here.
        if src_id == 'eidos':
            src_info['domain'] = 'biology'

        sources[src_id] = src_info
    return sources
Exemple #3
0
def get_mesh_ref_counts(mesh_terms, require_all=False, ro=None):
    """Get the number of distinct pmids by mesh term for each hash.

    The readonly database holds tables that count the number of distinct
    PMIDs for each mesh ID/hash pair; this function queries them directly.
    IDs starting with "C" are looked up in `MeshConceptRefCounts` and IDs
    starting with "D" in `MeshTermRefCounts`.

    Parameters
    ----------
    mesh_terms : list
        A list of mesh ID strings, each beginning with "C" or "D" followed
        by digits, e.g. "D000#####".
    require_all : Optional[bool]
        If True, only keep hashes that have an entry for every one of the
        given mesh IDs, not just some of them. Default is False.
    ro : Optional[DatabaseManager]
        A database manager handle. The default is the primary readonly, as
        indicated by environment variables or the config file.

    Returns
    -------
    dict
        Keyed by statement hash. Each value is a dict mapping the matching
        input mesh IDs to their reference counts, plus a 'total' entry.

    Raises
    ------
    ValueError
        If any entry of `mesh_terms` starts with neither "C" nor "D".
    """
    # Fall back to the primary readonly database when no handle is given.
    if ro is None:
        ro = get_ro('primary')

    # Reject any ID that belongs to neither supported family.
    if not all(term.startswith(('D', 'C')) for term in mesh_terms):
        raise ValueError("All mesh terms must begin with C or D.")

    tables_by_prefix = (('C', ro.MeshConceptRefCounts),
                        ('D', ro.MeshTermRefCounts))

    result = {}
    for prefix, table in tables_by_prefix:
        # The tables store the numeric part of the ID, so remember which
        # input string each number came from.
        term_by_num = {int(term[1:]): term
                       for term in mesh_terms if term.startswith(prefix)}
        if not term_by_num:
            continue

        # One row per hash, with the matched numbers and their counts
        # aggregated into parallel arrays.
        num_agg = func.array_agg(table.mesh_num)
        count_agg = func.array_agg(table.ref_count)
        q = ro.session.query(table.mk_hash, num_agg.label('nums'),
                             count_agg.label('ref_counts'), table.pmid_count)
        if len(term_by_num) == 1:
            (only_num,) = term_by_num
            q = q.filter(table.mesh_num == only_num)
        else:
            q = q.filter(table.mesh_num.in_(term_by_num.keys()))
        q = q.group_by(table.mk_hash, table.pmid_count)

        # With require_all, a hash must match every number asked of this
        # table, i.e. its aggregated array is as long as the input set.
        if require_all:
            q = q.having(func.cardinality(num_agg) == len(term_by_num))

        # Fold the rows into the result, translating numbers back to IDs.
        for mk_hash, row_nums, row_counts, pmid_count in q.all():
            count_dict = {term_by_num[num]: ref_count
                          for num, ref_count in zip(row_nums, row_counts)}
            entry = result.get(mk_hash)
            if entry is None:
                count_dict['total'] = pmid_count
                result[mk_hash] = count_dict
            else:
                # NOTE(review): the first table seeds 'total' from
                # pmid_count while the second adds the sum of its ref
                # counts -- confirm this asymmetry is intended.
                entry.update(count_dict)
                entry['total'] += sum(row_counts)

    # The per-table HAVING clause cannot see the other table, so drop any
    # hash that is still missing one of the requested IDs. Each complete
    # entry holds one key per distinct ID plus 'total'.
    if require_all:
        expected_size = len(set(mesh_terms)) + 1
        for mk_hash in list(result):
            if len(result[mk_hash]) != expected_size:
                del result[mk_hash]
    return result
Exemple #4
0
def get_statements_by_gene_role_type(agent_id=None,
                                     agent_ns='HGNC-SYMBOL',
                                     role=None,
                                     stmt_type=None,
                                     count=1000,
                                     db=None,
                                     do_stmt_count=False,
                                     preassembled=True,
                                     fix_refs=True,
                                     with_evidence=True,
                                     with_support=False,
                                     essentials_only=False):
    """Get statements from the DB by stmt type, agent, and/or agent role.

    WARNING: This function will be removed in the future. Please look to
    indra_db.client.readonly.query and indra_db.client.principal.raw_statements
    for alternatives.

    Parameters
    ----------
    agent_id : str
        String representing the identifier of the agent from the given
        namespace. Note: if the agent namespace argument, `agent_ns`, is set
        to 'HGNC-SYMBOL', this function will treat `agent_id` as an HGNC gene
        symbol and perform an internal lookup of the corresponding HGNC ID.
    agent_ns : str
        Namespace for the identifier given in `agent_id`. Default is
        'HGNC-SYMBOL'.
    role : str
        String corresponding to the role of the agent in the statement.
        Options are 'SUBJECT', 'OBJECT', or 'OTHER' (in the case of `Complex`,
        `SelfModification`, and `ActiveForm` Statements). Only used together
        with `agent_id`.
    stmt_type : str
        Name of the Statement class.
    count : int (DEPRECATED)
        Accepted for backward compatibility; not used by this implementation.
    db : :py:class:`DatabaseManager`
        Optionally specify a database manager that attaches to something
        besides the primary database, for example a local database instance.
        If None, the handle returned by `get_ro('primary')` is used.
    do_stmt_count : bool (DEPRECATED)
        Accepted for backward compatibility; not used by this implementation.
    preassembled : bool (DEPRECATED)
        Must be True (the default). Selecting from the raw statements is no
        longer supported by this function.
    fix_refs : bool (DEPRECATED)
        Accepted for backward compatibility; not used by this implementation.
    with_evidence : bool
        Choose whether or not to populate the evidence list attribute of the
        Statements. If False, the query is run with an evidence limit of 0.
        Default is True.
    with_support : bool (DEPRECATED)
        Must be False (the default); populating support is not supported.
    essentials_only : bool (DEPRECATED)
        Must be False (the default); the metadata-only mode is no longer
        supported.

    Returns
    -------
    The statements produced by the query, i.e. `result.statements()`, or an
    empty list if `agent_id` is given as an HGNC symbol that cannot be
    resolved to an HGNC ID.

    Raises
    ------
    DeprecationWarning
        If `preassembled` is False, or if `essentials_only` or `with_support`
        is True.
    ValueError
        If none of `agent_id`, `role` and `stmt_type` is given, or if the
        assembled query is not a `QueryCore` instance.
    """
    # stacklevel=2 attributes the warning to the calling code, which is the
    # code that actually needs to migrate.
    warnings.warn(('This module is being taken out of service, as the tools '
                   'have become deprecated. Moreover, the service has been '
                   're-implemented to use newer tools as best as possible, '
                   'but some results may be subtly different.'),
                  DeprecationWarning, stacklevel=2)

    if not preassembled:
        raise DeprecationWarning("This functionality is no longer supported. "
                                 "indra_db.client.principal.raw_statements "
                                 "has more functional features to search for "
                                 "raw statements.")

    if not (agent_id or role or stmt_type):
        raise ValueError('At least one of agent_id, role, or stmt_type '
                         'must be specified.')

    # Gene symbols are translated to HGNC IDs before querying.
    if agent_id and agent_ns == 'HGNC-SYMBOL':
        hgnc_symbol = agent_id
        agent_id = hgnc_client.get_hgnc_id(hgnc_symbol)
        if not agent_id:
            logger.warning('Invalid gene name: %s', hgnc_symbol)
            return []
        agent_ns = 'HGNC'

    # Combine whichever constraints were supplied into a single query.
    query = EmptyQuery()
    if agent_id:
        query &= HasAgent(agent_id, agent_ns, role)
    if stmt_type:
        query &= HasType([stmt_type])

    if not isinstance(query, QueryCore):
        raise ValueError("Either agent_id or stmt_type must be given.")

    if essentials_only:
        raise DeprecationWarning("This functionality is no longer supported. "
                                 "Similar features are available in "
                                 "indra_db.client.readonly.query, especially "
                                 "the `get_interactions` methods.")

    if with_support:
        raise DeprecationWarning("This feature is not supported at this "
                                 "time, and was never truly supported.")

    # An evidence limit of None leaves evidence unrestricted; 0 drops it.
    ev_lim = None if with_evidence else 0

    # The database handle is only acquired once the query is known to run,
    # so rejected or empty requests never open a connection.
    if db is None:
        db = get_ro('primary')

    result = query.get_statements(db, ev_limit=ev_lim)
    return result.statements()
Exemple #5
0
def get_statements_by_paper(id_list,
                            id_type='pmid',
                            db=None,
                            preassembled=True):
    """Get the statements from a list of paper ids.

    WARNING: This function will be removed in the future. Please look to
    indra_db.client.readonly.query and indra_db.client.principal.raw_statements
    for alternatives.

    Parameters
    ----------
    id_list : list or set
        A list of ints or strs that are ids of papers of type `id_type`.
    id_type : str
        The type of id used (default is pmid). Options include pmid, pmcid,
        doi, pii, url, or manuscript_id. Note that pmid is generally the
        best means of getting a paper.
    db : :py:class:`DatabaseManager`
        Optionally specify a database manager that attaches to something
        besides the primary database, for example a local database instance.
        If None, the handle returned by `get_ro('primary')` is used.
    preassembled : bool
        Must be True (the default). Selecting from the raw statements is no
        longer supported by this function.

    Returns
    -------
    stmt_dict : dict
        A dict of lists of Statements from the database, keyed by the value
        each evidence's `text_refs` holds for `id_type`. Papers that yielded
        no statements are not included. There may be ids which were not
        present in `id_list`, and there may be a key None for statements that
        have evidence from refs that did not have that `id_type` of
        reference. Each statement appears at most once under a given key.

    Raises
    ------
    DeprecationWarning
        If `preassembled` is False.
    """
    # stacklevel=2 attributes the warning to the calling code, which is the
    # code that actually needs to migrate.
    warnings.warn(('This module is being taken out of service, as the tools '
                   'have become deprecated. Moreover, the service has been '
                   're-implemented to use newer tools as best as possible, '
                   'but some results may be subtly different.'),
                  DeprecationWarning, stacklevel=2)
    if not preassembled:
        raise DeprecationWarning("This functionality is no longer supported. "
                                 "indra_db.client.principal.raw_statements "
                                 "has more functional features to search for "
                                 "raw statements by paper.")
    # Test identity rather than truthiness so that a supplied handle is
    # never replaced just because it evaluates as falsy.
    if db is None:
        db = get_ro('primary')

    query = FromPapers([(id_type, pid) for pid in id_list])
    result = query.get_statements(db)

    # A statement shows up for all papers that it references. Several
    # evidences of one statement may come from the same paper, so collect the
    # distinct paper ids per statement (dict.fromkeys keeps first-seen order)
    # to avoid listing the statement more than once under the same key.
    result_dict = defaultdict(list)
    for stmt in result.statements():
        paper_ids = dict.fromkeys(ev.text_refs.get(id_type)
                                  for ev in stmt.evidence)
        for paper_id in paper_ids:
            result_dict[paper_id].append(stmt)

    # Convert from defaultdict to ordinary dict so that lookups of missing
    # ids raise KeyError instead of silently creating empty lists.
    return dict(result_dict)