Example #1
0
 def process_args(self, args_json):
     """Convert the JSON values of known arguments into Python objects.

     The dictionary is modified in place and is also returned. Arguments
     whose names are not handled below keep their original values.

     Parameters
     ----------
     args_json : dict
         Mapping of argument names to their JSON-compatible values.

     Returns
     -------
     dict
         The same `args_json` object with the recognized values replaced.
     """
     for name in args_json:
         raw = args_json[name]
         if name == 'stmt_type':
             # Statement type name -> Statement class
             converted = get_statement_by_name(raw)
         elif name in ('matches_fun', 'refinement_fun'):
             # Function name -> registered pipeline function
             converted = pipeline_functions[raw]
         elif name == 'curations':
             # List of dicts -> list of Curation named tuples
             Curation = namedtuple(
                 'Curation', ['pa_hash', 'source_hash', 'tag'])
             converted = [
                 Curation(entry['pa_hash'], entry['source_hash'],
                          entry['tag'])
                 for entry in raw]
         elif name == 'belief_scorer':
             # Only the 'wm' option maps to a scorer; anything else is None
             converted = get_eidos_scorer() if raw == 'wm' else None
         elif name == 'ontology':
             # 'wm' selects the world ontology, anything else the bio one
             converted = world_ontology if raw == 'wm' else bio_ontology
         elif name in ('whitelist', 'mutations'):
             # JSON has no tuples, so the inner lists are turned into tuples
             converted = {
                 gene: [tuple(mod) for mod in mods]
                 for gene, mods in raw.items()}
         else:
             # Unrecognized arguments are left as they are
             continue
         args_json[name] = converted
     return args_json
def make_stmt_from_sort_key(key, verb, agents=None):
    """Build a Statement from a sort key and a statement type name.

    The key is the sort key used by `group_and_sort_statements`; only
    ``key[1]`` (the inputs) is read here.

    Parameters
    ----------
    key : sequence
        The sort key; ``key[1]`` holds the agent names (and, for some
        statement types, further arguments).
    verb : str
        Name of the statement type to construct.
    agents : list or None
        If a list is given, the agents created here are appended to it.

    Returns
    -------
    The constructed statement.
    """
    def _agent_or_none(name):
        # Both None and the literal string 'None' stand for a missing agent.
        if name is None or name == 'None':
            return None
        return Agent(name)

    stmt_cls = get_statement_by_name(verb)
    inputs = list(key[1])
    agents = [] if agents is None else agents

    # NOTE(review): every branch below builds the statement from `agents`
    # after extending it, so a non-empty list passed in by the caller ends
    # up in the statement too -- presumably callers pass an empty list;
    # confirm.
    if verb == 'Complex':
        agents.extend([_agent_or_none(name) for name in inputs])
        return stmt_cls(agents[:])

    if verb == 'Conversion':
        objs_from = [_agent_or_none(name) for name in inputs[1]]
        objs_to = [_agent_or_none(name) for name in inputs[2]]
        agents.extend(objs_from + objs_to)
        return stmt_cls(_agent_or_none(inputs[0]), objs_from, objs_to)

    if verb in ('ActiveForm', 'HasActivity'):
        agents.extend([_agent_or_none(inputs[0])])
        return stmt_cls(agents[0], inputs[1], inputs[2])

    if verb == 'Influence':
        # Only the first two inputs are agents; they are wrapped in Events.
        agents.extend([_agent_or_none(inp) for inp in inputs[:2]])
        return Influence(*[Event(ag) for ag in agents])

    if verb == 'Association':
        agents.extend([_agent_or_none(inp) for inp in inputs])
        return stmt_cls([Event(ag) for ag in agents])

    # Default: every input is an agent passed positionally.
    agents.extend([_agent_or_none(name) for name in inputs])
    return stmt_cls(*agents)
Example #3
0
def _english_from_agents_type(agA_name, agB_name, stmt_type):
    """Return the English rendering of a statement between two named agents.

    A statement of type `stmt_type` is built from agents named `agA_name`
    and `agB_name` and handed to the EnglishAssembler. 'Complex'
    (case-insensitive) takes the agents as one list; every other type
    takes them as two positional arguments.
    """
    agent_pair = [Agent(agA_name), Agent(agB_name)]
    stmt_cls = get_statement_by_name(stmt_type)
    is_complex = stmt_type.lower() == 'complex'
    stmt = stmt_cls(agent_pair) if is_complex else stmt_cls(*agent_pair)
    assembler = EnglishAssembler([stmt])
    return assembler.make_model()
Example #4
0
File: api.py Project: steppi/emmaa
def _make_query(query_dict, use_grouding_service=True):
    """Build a PathProperty query from a query dictionary.

    Parameters
    ----------
    query_dict : dict
        Must contain 'typeSelection', 'subjectSelection' and
        'objectSelection'.
    use_grouding_service : bool
        Passed through to `get_agent_from_text` for both agents.

    Returns
    -------
    PathProperty
        A query whose path statement is of the selected type, with the
        selected subject and object.
    """
    stmt_class = get_statement_by_name(query_dict['typeSelection'])
    # Subject is resolved before object, one lookup and call at a time.
    subj, obj = (
        get_agent_from_text(query_dict[field], use_grouding_service)
        for field in ('subjectSelection', 'objectSelection'))
    return PathProperty(path_stmt=stmt_class(subj, obj))
Example #5
0
    def _run(self, subject=None, object=None, agents=None, stmt_type=None,
             use_exact_type=False, persist=True, strict_stop=False,
             **api_params):
        """Reset the query state and run the queries in a thread.

        Parameters
        ----------
        subject, object : str or None
            Added to the query parameters when not None.
        agents : list or None
            Each entry becomes an 'agent<i>=<value>' string.
        stmt_type : str or None
            Statement type name; unless `use_exact_type` is set, the names
            of all its descendant classes are queried as well.
        use_exact_type : bool
            If True, descendant statement types are not added.
        persist : bool
            Passed on to `_run_queries`.
        strict_stop : bool
            If False, 'timeout' is removed from `api_params` before they
            are passed on; if True it is kept in them. Either way its
            value controls how long this method waits for the thread.
        **api_params
            Further query parameters; 'max_stmts' is required.

        Raises
        ------
        ValueError
            If no subject, object or agents were given.
        """
        # Reset the per-run bookkeeping.
        self.__started = False
        self.__done_dict = defaultdict(lambda: False)
        self.__page_dict = defaultdict(lambda: 0)
        self.__th = None
        self.__quota = api_params['max_stmts']

        # The remote API will error if we proceed with no agents at all.
        has_positional = subject is not None or object is not None
        if not has_positional and not agents:
            raise ValueError("At least one agent must be specified, or else "
                             "the scope will be too large.")

        # With a strict stop the timeout stays among the API parameters,
        # otherwise it is taken out and only used locally.
        fetch_timeout = api_params.get if strict_stop else api_params.pop
        timeout = fetch_timeout('timeout', None)

        # Formulate inputs for the agents.
        params = {}
        if subject is not None:
            params['subject'] = subject
        if object is not None:
            params['object'] = object
        params.update(api_params)

        agent_strs = []
        if agents is not None:
            agent_strs = ['agent%d=%s' % pair for pair in enumerate(agents)]

        # Handle the type(s).
        stmt_types = []
        if stmt_type:
            stmt_types.append(stmt_type)
        if stmt_type is not None and not use_exact_type:
            base_class = get_statement_by_name(stmt_type)
            for descendant in get_all_descendants(base_class):
                stmt_types.append(descendant.__name__)

        # Hand the actual querying over to a thread.
        logger.debug("The remainder of the query will be performed in a "
                     "thread...")
        self.__th = Thread(target=self._run_queries,
                           args=[agent_strs, stmt_types, params, persist])
        self.__th.start()

        if timeout is None:
            logger.debug("Waiting for thread to complete...")
            self.__th.join()
        elif timeout:  # a timeout of 0 means: do not wait at all
            logger.debug("Waiting at most %d seconds for thread to complete..."
                         % timeout)
            self.__th.join(timeout)
Example #6
0
    def __init__(self, subject=None, object=None, agents=None, stmt_type=None,
                 use_exact_type=False, persist=True, timeout=None, ev_limit=10,
                 best_first=True, tries=2, max_stmts=None):
        """Set up the state and start the queries in a thread.

        Parameters
        ----------
        subject, object : str or None
            Added to the query parameters when not None.
        agents : list or None
            Each entry becomes an 'agent<i>=<value>' string.
        stmt_type : str or None
            Statement type name; unless `use_exact_type` is set, the names
            of all its descendant classes are queried as well.
        use_exact_type : bool
            If True, descendant statement types are not added.
        persist : bool
            Passed on to `_run_queries`.
        timeout : number or None
            None waits for the thread to finish, 0 does not wait at all,
            any other value is the maximum number of seconds to wait.
        ev_limit, best_first, tries
            Stored in the query parameters under the same names.
        max_stmts : int or None
            Stored as the quota.

        Raises
        ------
        ValueError
            If no subject, object or agents were given.
        """
        # Result containers and bookkeeping.
        self.statements = []
        self.statements_sample = None
        self.__statement_jsons = {}
        self.__done_dict = defaultdict(lambda: False)
        self.__evidence_counts = {}
        self.__started = False
        self.__page_dict = defaultdict(lambda: 0)
        self.__th = None
        self.__quota = max_stmts

        # The remote API will error if we proceed with no agents at all.
        has_positional = subject is not None or object is not None
        if not has_positional and not agents:
            raise ValueError("At least one agent must be specified, or else "
                             "the scope will be too large.")

        # Formulate inputs for the agents.
        agent_strs = []
        if agents is not None:
            agent_strs = ['agent%d=%s' % pair for pair in enumerate(agents)]
        params = {}
        if subject is not None:
            params['subject'] = subject
        if object is not None:
            params['object'] = object
        params.update(best_first=best_first, ev_limit=ev_limit, tries=tries)

        # Handle the type(s).
        stmt_types = []
        if stmt_type:
            stmt_types.append(stmt_type)
        if stmt_type is not None and not use_exact_type:
            base_class = get_statement_by_name(stmt_type)
            for descendant in get_all_descendants(base_class):
                stmt_types.append(descendant.__name__)

        # Hand the actual querying over to a thread.
        logger.info("The remainder of the query will be performed in a "
                    "thread...")
        self.__th = Thread(target=self._run_queries,
                           args=[agent_strs, stmt_types, params, persist])
        self.__th.start()

        if timeout is None:
            logger.info("Waiting for thread to complete...")
            self.__th.join()
        elif timeout:  # a timeout of 0 means: do not wait at all
            logger.info("Waiting at most %d seconds for thread to complete..."
                        % timeout)
            self.__th.join(timeout)
Example #7
0
 def get_argument_value(self, arg_json):
     """Return the Python value represented by a JSON argument.

     Three cases are distinguished, in this order:

     - `is_function(arg_json, 'function')` holds: the named function
       itself is returned if the 'no_run' entry is truthy, otherwise the
       result of running it.
     - `is_function(arg_json, 'stmt_type')` holds: the statement class
       with the given name is returned.
     - anything else (str, int, boolean, etc.) is returned unchanged.
     """
     if self.is_function(arg_json, 'function'):
         if arg_json.get('no_run', False):
             # The function object itself is the argument
             return self.get_function_from_name(arg_json['function'])
         # The function's result is the argument
         return self.run_function(arg_json)
     if self.is_function(arg_json, 'stmt_type'):
         return get_statement_by_name(arg_json.get('stmt_type'))
     return arg_json
Example #8
0
def _make_query(query_dict):
    if 'typeSelection' in query_dict.keys():
        stmt_type = query_dict['typeSelection']
        stmt_class = get_statement_by_name(stmt_type)
        subj = get_agent_from_text(query_dict['subjectSelection'])
        obj = get_agent_from_text(query_dict['objectSelection'])
        stmt = stmt_class(subj, obj)
        query = PathProperty(path_stmt=stmt)
        tab = 'static'
    elif 'agentSelection' in query_dict.keys():
        agent = get_agent_from_trips(query_dict['agentSelection'])
        value = query_dict['valueSelection']
        if not value:
            value = None
        pattern = query_dict['patternSelection']
        query = DynamicProperty(agent, pattern, value)
        tab = 'dynamic'
    return query, tab
Example #9
0
def test_has_type():
    """Check that HasType queries only return the requested types."""
    ro = get_db('primary')

    # Exact matching: only the two listed types may come back.
    exact_names = ('Phosphorylation', 'Activation')
    query = HasType(['Phosphorylation', 'Activation'])
    result = query.get_statements(ro, limit=5, ev_limit=8)
    returned = result.statements()
    assert all(stmt.__class__.__name__ in exact_names for stmt in returned)

    # With subclasses: the listed types and all of their descendants.
    type_list = ['SelfModification', 'RegulateAmount', 'Translocation']
    query = HasType(type_list, include_subclasses=True)
    result = query.get_statements(ro, limit=5, ev_limit=8)
    returned = result.statements()
    allowed = set()
    for type_name in type_list:
        base_type = get_statement_by_name(type_name)
        allowed.update([base_type] + get_all_descendants(base_type))
    assert all(type(stmt) in allowed for stmt in returned)
Example #10
0
 def process_args(self, args_json):
     """Convert the JSON values of known arguments into Python objects.

     The dictionary is modified in place and is also returned. Arguments
     whose names are not handled below keep their original values.

     Parameters
     ----------
     args_json : dict
         Mapping of argument names to their JSON-compatible values.

     Returns
     -------
     dict
         The same `args_json` object with the recognized values replaced.
     """
     for name in args_json:
         raw = args_json[name]
         if name == 'stmt_type':
             # Statement type name -> Statement class
             replacement = get_statement_by_name(raw)
         elif name in ('matches_fun', 'refinement_fun'):
             # Function name -> registered pipeline function
             replacement = pipeline_functions[raw]
         elif name == 'belief_scorer':
             # Various string values could be handled here, but there
             # currently aren't any specific options
             replacement = None
         elif name == 'ontology':
             # Various string values could be handled here, but there
             # currently aren't any specific options
             replacement = bio_ontology
         elif name in ('whitelist', 'mutations'):
             # JSON has no tuples, so the inner lists are turned into tuples
             replacement = {
                 gene: [tuple(mod) for mod in mods]
                 for gene, mods in raw.items()}
         else:
             # Unrecognized arguments are left as they are
             continue
         args_json[name] = replacement
     return args_json
Example #11
0
def stmt_from_interaction(interaction):
    """Get a shell statement from an interaction.

    Parameters
    ----------
    interaction : dict
        Must contain 'type' (a statement type name) and 'agents'. An
        'ActiveForm' interaction must also contain 'activity' and
        'is_active'.

    Returns
    -------
    A statement of the given type whose agents only carry a name.
    """
    stmt_type = interaction['type']
    StmtClass = get_statement_by_name(stmt_type)

    if stmt_type == 'Complex':
        # All members are passed together as one list.
        members = [Agent(name) for name in interaction['agents'].values()]
        return StmtClass(members)

    if stmt_type == 'ActiveForm':
        agent = Agent(interaction['agents'][0])
        return StmtClass(agent, interaction['activity'],
                         interaction['is_active'])

    # One positional argument per agent role of the statement class; roles
    # without a (truthy) entry among the interaction's agents become None.
    positional = []
    for idx in range(len(StmtClass._agent_order)):
        if interaction['agents'].get(idx):
            positional.append(Agent(interaction['agents'][idx]))
        else:
            positional.append(None)
    return StmtClass(*positional)
Example #12
0
def make_stmt_from_sort_key(key, verb):
    """Build a Statement from a sort key and a statement type name.

    The key is the sort key used by `group_and_sort_statements`; only
    ``key[1]`` (the inputs) is read here.
    """
    def _agent_or_none(name):
        # Both None and the literal string 'None' stand for a missing agent.
        if name is None or name == 'None':
            return None
        return Agent(name)

    stmt_cls = get_statement_by_name(verb)
    inputs = list(key[1])

    if verb == 'Complex':
        # All members are passed together as one list.
        return stmt_cls([_agent_or_none(name) for name in inputs])

    if verb == 'Conversion':
        # One agent followed by two lists of agent names.
        subj = _agent_or_none(inputs[0])
        objs_from = [_agent_or_none(name) for name in inputs[1]]
        objs_to = [_agent_or_none(name) for name in inputs[2]]
        return stmt_cls(subj, objs_from, objs_to)

    if verb in ('ActiveForm', 'HasActivity'):
        # Only the first input is an agent; the rest is passed on as is.
        return stmt_cls(_agent_or_none(inputs[0]), inputs[1], inputs[2])

    # Default: every input is an agent passed positionally.
    return stmt_cls(*[_agent_or_none(name) for name in inputs])
def make_stmt_from_sort_key(key, verb):
    """Make a Statement of type `verb` from the inputs in a sort key.

    Specifically, the sort key used by `group_and_sort_statements`, of
    which only the second element is used.
    """
    def to_agent(name):
        # A missing agent is encoded as None or as the string 'None'.
        return None if (name == 'None' or name is None) else Agent(name)

    StmtClass = get_statement_by_name(verb)
    inps = list(key[1])

    # Work out the constructor arguments first, then build the statement.
    if verb == 'Complex':
        ctor_args = ([to_agent(name) for name in inps],)
    elif verb == 'Conversion':
        ctor_args = (to_agent(inps[0]),
                     [to_agent(name) for name in inps[1]],
                     [to_agent(name) for name in inps[2]])
    elif verb == 'ActiveForm' or verb == 'HasActivity':
        ctor_args = (to_agent(inps[0]), inps[1], inps[2])
    else:
        ctor_args = tuple(to_agent(name) for name in inps)
    return StmtClass(*ctor_args)
Example #14
0
def get_statements(subject=None,
                   object=None,
                   agents=None,
                   stmt_type=None,
                   use_exact_type=False,
                   persist=True,
                   timeout=None,
                   simple_response=True,
                   ev_limit=10,
                   best_first=True,
                   tries=2,
                   max_stmts=None):
    """Get Statements from the INDRA DB web API matching given agents and type.

    There are two types of response available. You can just get a list of
    INDRA Statements, or you can get an IndraDBRestResponse object, which
    allows Statements to be loaded in a background thread, providing a sample
    of the best* content available promptly in the sample_statements
    attribute, and populates the statements attribute when the paged load is
    complete.

    *In the sense of having the most supporting evidence.

    Parameters
    ----------
    subject/object : str
        Optionally specify the subject and/or object of the statements
        you wish to get from the database. By default, the namespace is assumed
        to be HGNC gene names, however you may specify another namespace by
        including `@<namespace>` at the end of the name string. For example, if
        you want to specify an agent by chebi, you could use `CHEBI:6801@CHEBI`,
        or if you wanted to use the HGNC id, you could use `6871@HGNC`.
    agents : list[str]
        A list of agents, specified in the same manner as subject and object,
        but without specifying their grammatical position. An empty list is
        treated the same as None.
    stmt_type : str
        Specify the types of interactions you are interested in, as indicated
        by the sub-classes of INDRA's Statements. This argument is *not* case
        sensitive. If the statement class given has sub-classes
        (e.g. RegulateAmount has IncreaseAmount and DecreaseAmount), then both
        the class itself, and its subclasses, will be queried, by default. If
        you do not want this behavior, set use_exact_type=True. Note that if
        max_stmts is set, it is possible only the exact statement type will
        be returned, as this is the first searched. The processor then cycles
        through the types, getting a page of results for each type and adding it
        to the quota, until the max number of statements is reached.
    use_exact_type : bool
        If stmt_type is given, and you only want to search for that specific
        statement type, set this to True. Default is False.
    persist : bool
        Default is True. When False, if a query comes back limited (not all
        results returned), just give up and pass along what was returned.
        Otherwise, make further queries to get the rest of the data (which may
        take some time).
    timeout : positive int or None
        If an int, block until the work is done and statements are retrieved, or
        until the timeout has expired, in which case the results so far will be
        returned in the response object, and further results will be added in
        a separate thread as they become available. If simple_response is True,
        all statements available will be returned. Otherwise (if None), block
        indefinitely until all statements are retrieved. Default is None.
    simple_response : bool
        If True, a simple list of statements (the `statements` attribute of
        the response object) is returned, otherwise the response object
        itself. If a timeout is set and it expires, the list will only hold
        what was loaded so far (as though persist was False), until the
        statements are done loading, in which case the rest should appear in
        the list. This behavior is not encouraged. Default is True (for the
        sake of backwards compatibility).
    ev_limit : int or None
        Limit the amount of evidence returned per Statement. Default is 10.
    best_first : bool
        If True, the preassembled statements will be sorted by the amount of
        evidence they have, and those with the most evidence will be
        prioritized. When using `max_stmts`, this means you will get the "best"
        statements. If False, statements will be queried in arbitrary order.
    tries : int > 0
        Set the number of times to try the query. The database often caches
        results, so if a query times out the first time, trying again after a
        timeout will often succeed fast enough to avoid a timeout. This can also
        help gracefully handle an unreliable connection, if you're willing to
        wait. Default is 2.
    max_stmts : int or None
        Select the maximum number of statements to return. When set less than
        1000 the effect is much the same as setting persist to false, and will
        guarantee a faster response. Default is None.

    Returns
    -------
    stmts : list[:py:class:`indra.statements.Statement`] or IndraDBRestResponse
        If simple_response is True, a list of INDRA Statement instances,
        otherwise the response object. Note that if a supporting or
        supported Statement was not included in your query, it will simply be
        instantiated as an `Unresolved` statement, with `uuid` of the statement.

    Raises
    ------
    ValueError
        If neither subject, object nor any agents were given.
    """
    # Make sure we got at least SOME agents (the remote API will error if
    # we proceed with no arguments). An empty agents list gives no agents
    # either, so it is rejected just like None.
    if subject is None and object is None and not agents:
        raise ValueError("At least one agent must be specified, or else "
                         "the scope will be too large.")

    # Formulate inputs for the agents..
    agent_strs = [] if agents is None else [
        'agent%d=%s' % (i, ag) for i, ag in enumerate(agents)
    ]
    key_val_list = [('subject', subject), ('object', object)]
    params = {
        param_key: param_val
        for param_key, param_val in key_val_list if param_val is not None
    }
    params['best_first'] = best_first
    params['ev_limit'] = ev_limit
    params['tries'] = tries

    # Handle the type(s): the given type comes first, followed by its
    # descendants unless only the exact type was asked for.
    stmt_types = [stmt_type] if stmt_type else []
    if stmt_type is not None and not use_exact_type:
        stmt_class = get_statement_by_name(stmt_type)
        descendant_classes = get_all_descendants(stmt_class)
        stmt_types += [cls.__name__ for cls in descendant_classes]

    # Get the response object
    resp = IndraDBRestResponse(max_stmts=max_stmts)
    resp.make_stmts_queries(agent_strs, stmt_types, params, persist, timeout)

    # Format the result appropriately.
    return resp.statements if simple_response else resp
Example #15
0
def get_statements(subject=None, object=None, agents=None, stmt_type=None,
                   use_exact_type=False, on_limit='sample'):
    """Get statements from INDRA's database using the web api.

    Parameters
    ----------
    subject/object : str
        Optionally specify the subject and/or object of the statements
        you wish to get from the database. By default, the namespace is assumed
        to be HGNC gene names, however you may specify another namespace by
        including `@<namespace>` at the end of the name string. For example, if
        you want to specify an agent by chebi, you could use `CHEBI:6801@CHEBI`,
        or if you wanted to use the HGNC id, you could use `6871@HGNC`.
    agents : list[str]
        A list of agents, specified in the same manner as subject and object,
        but without specifying their grammatical position. An empty list is
        treated the same as None.
    stmt_type : str
        Specify the types of interactions you are interested in, as indicated
        by the sub-classes of INDRA's Statements. This argument is *not* case
        sensitive. If the statement class given has sub-classes
        (e.g. RegulateAmount has IncreaseAmount and DecreaseAmount), then both
        the class itself, and its subclasses, will be queried, by default. If
        you do not want this behavior, set use_exact_type=True.
    use_exact_type : bool
        If stmt_type is given, and you only want to search for that specific
        statement type, set this to True. Default is False.
    on_limit : str
        There are four options for handling a query that is too large:
        `sample` - (default) take a sample of statements from the result,
        `truncate` - simply return the first 10,000 statements of the result,
        `error` - raise an error if the query is too large, or
        `persist` - perform as many queries as needed to get all the statements.
        Note that this last option generally takes much much longer to execute.

    Returns
    -------
    stmts : list[:py:class:`indra.statements.Statement`]
        A list of INDRA Statement instances. Note that if a supporting or
        supported Statement was not included in your query, it will simply be
        instantiated as an `Unresolved` statement, with `uuid` of the statement.

    Raises
    ------
    ValueError
        If neither subject, object nor any agents were given.
    """
    # Make sure we got at least SOME agents (the remote API will error if
    # we proceed with no arguments). An empty agents list gives no agents
    # either, so it is rejected just like None.
    if subject is None and object is None and not agents:
        raise ValueError("At least one agent must be specified, or else "
                         "the scope will be too large.")

    # Formulate inputs for the agents..
    agent_strs = [] if agents is None else ['agent=%s' % ag for ag in agents]
    key_val_list = [('subject', subject), ('object', object)]
    params = {param_key: param_val for param_key, param_val in key_val_list
              if param_val is not None}
    params['on_limit'] = on_limit

    # Without a type (or with only the exact type) a single query suffices.
    if stmt_type is None:
        return _make_stmts_query(agent_strs, params)
    if use_exact_type:
        params['type'] = stmt_type
        return _make_stmts_query(agent_strs, params)

    # Otherwise query the descendants of the given type and the type itself.
    stmt_class = get_statement_by_name(stmt_type)
    descendant_classes = get_all_descendants(stmt_class)
    stmt_types = [cls.__name__ for cls in descendant_classes] + [stmt_type]
    return _query_stmt_types(agent_strs, params, stmt_types)
Example #16
0
def _get_pa_stmt_jsons_w_mkhash_subquery(db,
                                         mk_hashes_q,
                                         best_first=True,
                                         max_stmts=None,
                                         offset=None,
                                         ev_limit=None):
    """Get statement JSONs, with evidence, for the hashes of a query.

    Parameters
    ----------
    db
        Database handle. The attributes used here are `PaMeta`,
        `FastRawPaLink`, `ReadingRefLink` and `session`.
    mk_hashes_q
        Query selecting the statement hashes. Its `mk_hash` and `ev_count`
        columns are read from the resulting subquery.
    best_first : bool
        If True, the hash query is ordered by descending
        `db.PaMeta.ev_count`.
    max_stmts : int or None
        If not None, applied as a limit to the hash query.
    offset : int or None
        If not None, applied as an offset to the hash query.
    ev_limit : int or None
        If not None, applied as a limit to the content query that is
        laterally joined to the hashes (presumably making it a limit per
        hash -- TODO confirm).

    Returns
    -------
    dict
        'statements': OrderedDict of statement JSON keyed by hash, each with
        an 'evidence' list; 'evidence_totals': OrderedDict of `ev_count`
        keyed by hash; 'total_evidence': sum of `ev_count` over the hashes
        seen; 'evidence_returned': number of rows processed.
    """
    # Handle limiting.
    mk_hashes_q = mk_hashes_q.distinct()
    if best_first:
        mk_hashes_q = mk_hashes_q.order_by(desc(db.PaMeta.ev_count))
    if max_stmts is not None:
        mk_hashes_q = mk_hashes_q.limit(max_stmts)
    if offset is not None:
        mk_hashes_q = mk_hashes_q.offset(offset)

    # Create the link
    mk_hashes_al = mk_hashes_q.subquery('mk_hashes')
    raw_json_c = db.FastRawPaLink.raw_json.label('raw_json')
    pa_json_c = db.FastRawPaLink.pa_json.label('pa_json')
    reading_id_c = db.FastRawPaLink.reading_id.label('rid')
    cont_q = db.session.query(raw_json_c, pa_json_c, reading_id_c)
    cont_q = cont_q.filter(db.FastRawPaLink.mk_hash == mk_hashes_al.c.mk_hash)

    if ev_limit is not None:
        cont_q = cont_q.limit(ev_limit)

    # TODO: Only make a lateral-joined query when evidence is limited.
    json_content_al = cont_q.subquery().lateral('json_content')

    # Hashes -> content (lateral) -> reading reference info, all outer joins.
    stmts_q = (mk_hashes_al.outerjoin(json_content_al, true()).outerjoin(
        db.ReadingRefLink, db.ReadingRefLink.rid == json_content_al.c.rid))

    # Every non-underscore attribute of ReadingRefLink is selected as a
    # column, after the four fixed columns.
    ref_link_keys = [
        k for k in db.ReadingRefLink.__dict__.keys() if not k.startswith('_')
    ]
    selection = (select([
        mk_hashes_al.c.mk_hash, mk_hashes_al.c.ev_count,
        json_content_al.c.raw_json, json_content_al.c.pa_json
    ] + [getattr(db.ReadingRefLink, k)
         for k in ref_link_keys]).select_from(stmts_q))
    logger.debug("Executing sql to get statements:\n%s" % str(selection))

    proxy = db.session.connection().execute(selection)
    res = proxy.fetchall()

    stmts_dict = OrderedDict()
    ev_totals = OrderedDict()
    total_evidence = 0
    returned_evidence = 0
    if res:
        logger.debug("res is %d row by %d cols." % (len(res), len(res[0])))
    else:
        logger.debug("res is empty.")

    # Each row carries one piece of evidence (one raw statement).
    for row in res:
        mk_hash, ev_count, raw_json_bts, pa_json_bts = row[:4]
        # The remaining columns are the ReadingRefLink values, in key order.
        ref_dict = {
            ref_link_keys[i]: row[4 + i]
            for i in range(len(ref_link_keys))
        }
        returned_evidence += 1
        # NOTE(review): the content is outer-joined, so these bytes could
        # presumably be None for a hash without content, which would make
        # the decode fail -- confirm whether that can happen.
        raw_json = json.loads(raw_json_bts.decode('utf-8'))
        ev_json = raw_json['evidence'][0]

        # Add a new statement if the hash is new
        if mk_hash not in stmts_dict.keys():
            total_evidence += ev_count
            ev_totals[mk_hash] = ev_count
            stmts_dict[mk_hash] = json.loads(pa_json_bts.decode('utf-8'))
            stmts_dict[mk_hash]['evidence'] = []

        # Fix the pmid
        if ref_dict['pmid']:
            ev_json['pmid'] = ref_dict['pmid']

        # Add agents' raw text to annotations.
        raw_text = []
        for ag_name in get_statement_by_name(raw_json['type'])._agent_order:
            ag_value = raw_json.get(ag_name, None)
            if isinstance(ag_value, dict):
                # A single agent
                raw_text.append(ag_value['db_refs'].get('TEXT'))
            elif ag_value is None:
                raw_text.append(None)
            else:
                # Anything else is iterated as a collection of agents
                for ag in ag_value:
                    raw_text.append(ag['db_refs'].get('TEXT'))
        if 'annotations' not in ev_json.keys():
            ev_json['annotations'] = {}
        ev_json['annotations']['agents'] = {'raw_text': raw_text}
        # Record the id of the raw statement this evidence came from.
        if 'prior_uuids' not in ev_json['annotations'].keys():
            ev_json['annotations']['prior_uuids'] = []
        ev_json['annotations']['prior_uuids'].append(raw_json['id'])
        # All non-None reference values go into text_refs, keys upper-cased.
        if 'text_refs' not in ev_json.keys():
            ev_json['text_refs'] = {}
        ev_json['text_refs'].update(
            {k.upper(): v
             for k, v in ref_dict.items() if v is not None})

        if ref_dict['source']:
            ev_json['annotations']['content_source'] = ref_dict['source']

        # TODO: Remove this eventually. This is a patch!
        # Evidence without a source hash gets one made from its source api,
        # source id and its text (or else its pmid).
        if 'source_hash' not in ev_json.keys():
            s = str(ev_json.get('source_api')) + str(ev_json.get('source_id'))
            if ev_json.get('text') and isinstance(ev_json['text'], str):
                s += ev_json['text']
            elif ev_json.get('pmid') and isinstance(ev_json['pmid'], str):
                s += ev_json['pmid']
            ev_json['source_hash'] = _make_hash(s, 16)

        stmts_dict[mk_hash]['evidence'].append(ev_json)

    ret = {
        'statements': stmts_dict,
        'evidence_totals': ev_totals,
        'total_evidence': total_evidence,
        'evidence_returned': returned_evidence
    }
    return ret
Example #17
0
def expand_signed(df: pd.DataFrame, sign_dict: Dict[str, int],
                  stmt_types: List[str], use_descendants: bool = True) \
        -> pd.DataFrame:
    """Expands out which statements should be added to the signed graph

    The statements types provided in 'stmt_types' will be added for both
    signs. To add more statement types of just one sign, add it to 'sign_dict'.

    Note that the column 'initial_sign' is added to the DataFrame that is
    passed in, while the returned DataFrame is a new object.

    Parameters
    ----------
    df : pd.DataFrame
        Must have a 'stmt_type' column.
    sign_dict : Dict[str, int]
        A dictionary mapping a Statement type to a sign to be used for the
        edge. By default only Activation and IncreaseAmount are added as
        positive edges and Inhibition and DecreaseAmount are added as
        negative edges, but a user can pass any other Statement types in a
        dictionary.
    stmt_types : List[str]
        The statement types to match to expand signs to. The rows matching
        these types will be duplicated and each copy gets a distinct sign.
    use_descendants : bool
        If True, also match descendants of the statements provided in
        'stmt_types' when adding the extended signs.

    Returns
    -------
    pd.DataFrame
        The rows that were assigned a sign, including the duplicated rows
        with the opposite sign.

    Raises
    ------
    AssertionError
        If no row matches a type in either `sign_dict` or `stmt_types`.
    """
    if use_descendants:
        logger.info('Getting descendants to match for expanded signed graph')
        # Get name of descendants
        more_stmt_types = set(stmt_types)
        for stmt_type in stmt_types:
            stmt_class = get_statement_by_name(stmt_type)
            more_stmt_types.update(
                cls.__name__ for cls in get_all_descendants(stmt_class))
        stmt_types = list(more_stmt_types)

    # Add new sign column, set to None. Using 'initial_sign' allows usage of
    # IndraNet.to_signed_graph
    df['initial_sign'] = None

    # Locate relevant rows
    standard_sign = df.stmt_type.isin(sign_dict.keys())
    expand_sign = df.stmt_type.isin(stmt_types)
    assert standard_sign.any() or expand_sign.any(), \
        'All rows filtered out from DataFrame. Check that statement types ' \
        'in sign_dict and stmt_types exist in the DataFrame.'
    if not expand_sign.any():
        logger.warning('No rows can be used for expanded signed edges. Check '
                       'that statement types in stmt_types exist in the '
                       'DataFrame.')

    # Add sign for signed statements
    logger.info('Setting initial sign for signed types')
    df.loc[standard_sign, 'initial_sign'] = \
        df.loc[standard_sign, 'stmt_type'].apply(lambda st: sign_dict.get(st))

    # Add positive sign to the rows with types in stmt_types
    df.loc[expand_sign, 'initial_sign'] = INT_PLUS

    # Copy rows for expand sign and switch sign
    logger.info('Setting initial sign for expanded signed types')
    add_rows = [
        [INT_MINUS if col == 'initial_sign' else val
         for col, val in expand_row.items()]
        for _, expand_row in df[expand_sign].iterrows()
    ]

    # DataFrame.append no longer exists in pandas 2.0+; concat with its
    # default arguments does the same thing.
    logger.info('Appending extended signed rows')
    extra_df = pd.DataFrame(add_rows, columns=df.columns.values)
    df = pd.concat([df, extra_df])

    # Remove all rows without assigned sign
    logger.info('Removing rows without signed')
    df = df[~df.initial_sign.isna()]

    # Re-cast sign column as int; this is best effort, on failure the
    # column keeps its dtype and a warning is logged.
    try:
        df.initial_sign = df.initial_sign.astype(pd.Int32Dtype())
    except Exception:
        link = 'https://pandas.pydata.org/pandas-docs/stable/user_guide' \
          '/integer_na.html'
        logger.warning(f'Could not set sign column as Nullable Integer Data '
                       f'Type. MAke sure to use pandas v0.24+. See {link}')

    return df
Example #18
0
def _build_test_set():
    """Populate a temporary database with synthetic readonly meta rows.

    A fixed pool of agent references, statement types, sources and MeSH IDs
    is combined into statements. For every statement one source_meta row is
    produced, plus mesh_meta rows and per-agent name/text/other meta rows.
    The rows are then copied into the ``readonly.*`` tables of the database
    obtained from ``get_temp_db(clear=True)``.

    Returns
    -------
    db
        The temporary database handle with the meta tables created and
        filled.
    """
    agent_refs = [
        {'NAME': 'ERK', 'FPLX': 'ERK', 'TEXT': 'MAPK'},
        {'NAME': 'TP53', 'HGNC': '11998'},
        {'NAME': 'MEK', 'FPLX': 'MEK'},
        {'NAME': 'Vemurafenib', 'CHEBI': 'CHEBI:63637'},
    ]
    stypes = ['Phosphorylation', 'Activation', 'Inhibition', 'Complex']
    sources = [('medscan', 'rd'), ('reach', 'rd'), ('pc11', 'db'),
               ('signor', 'db')]
    mesh_pool = ['D000225', 'D002352', 'D015536']

    # Every group of 0, 1 or 2 MeSH IDs: the empty group is an empty tuple,
    # singletons are lists and pairs are tuples.
    mesh_combos = [()]
    mesh_combos.extend([mid] for mid in mesh_pool)
    mesh_combos.extend(combinations(mesh_pool, 2))
    random.shuffle(mesh_combos)

    # Every non-empty group of sources, smallest groups first.
    src_groups = [[src] for src in sources]
    for size in range(2, 5):
        src_groups.extend(combinations(sources, size))

    source_data = []
    for idx, group in enumerate(src_groups):
        kinds = [kind for _, kind in group]
        has_rd = 'rd' in kinds
        # Only groups that include an 'rd' source are given MeSH IDs; the
        # MeSH groups are cycled through by position.
        group_mesh = mesh_combos[idx % len(mesh_combos)] if has_rd else []
        source_data.append({
            'sources': {name: random.randint(1, 50) for name, _ in group},
            'has_rd': has_rd,
            'has_db': 'db' in kinds,
            'only_src': group[0][0] if len(group) == 1 else None,
            'mesh_ids': group_mesh,
        })
    random.shuffle(source_data)

    # Statement specs are (type, agent refs, activity, is_active) tuples.
    stmts = [
        (stype, pair, None, None)
        for stype, pair in product(stypes, permutations(agent_refs, 2))
    ]
    stmts.extend(
        ('ActiveForm', (ref, ), activity, is_active)
        for activity, is_active, ref in product(
            ['transcription', 'activity'], [True, False], agent_refs)
    )

    # Column layouts of the readonly meta tables.
    agent_meta_cols = ('mk_hash', 'ag_num', 'db_id', 'role_num', 'type_num',
                       'ev_count', 'activity', 'is_active', 'agent_count')
    name_meta_cols = agent_meta_cols
    text_meta_cols = agent_meta_cols
    other_meta_cols = ('mk_hash', 'ag_num', 'db_name', 'db_id', 'role_num',
                       'type_num', 'ev_count', 'activity', 'is_active',
                       'agent_count')
    source_meta_cols = ('mk_hash', 'reach', 'medscan', 'pc11', 'signor',
                        'ev_count', 'type_num', 'activity', 'is_active',
                        'agent_count', 'num_srcs', 'src_json', 'only_src',
                        'has_rd', 'has_db')
    mesh_meta_cols = ('mk_hash', 'ev_count', 'mesh_num', 'type_num',
                      'activity', 'is_active', 'agent_count')

    name_meta_rows = []
    text_meta_rows = []
    other_meta_rows = []
    source_meta_rows = []
    mesh_meta_rows = []

    # Unordered name pairs for which a Complex has already been generated.
    seen_complexes = set()

    for stype, refs, activity, is_active in stmts:
        # Build the Statement from its agent references.
        stmt_cls = get_statement_by_name(stype)
        if stype == 'ActiveForm':
            stmt = stmt_cls(make_agent_from_ref(refs[0]),
                            activity=activity, is_active=is_active)
        else:
            first = make_agent_from_ref(refs[0])
            second = make_agent_from_ref(refs[1])
            if stype != 'Complex':
                stmt = stmt_cls(first, second)
            else:
                pair_key = frozenset((first.name, second.name))
                if pair_key in seen_complexes:
                    # The same pair in the other order was already handled.
                    continue
                stmt = stmt_cls([first, second])
                seen_complexes.add(pair_key)

        # Pick the source info by cycling over the shuffled source data;
        # skipped statements do not advance the cycle.
        picked = source_data[len(source_meta_rows) % len(source_data)]
        src_counts = picked['sources']
        ev_count = sum(src_counts.values())
        stmt_hash = stmt.get_hash()
        type_num = ro_type_map.get_int(stype)
        agent_count = len(refs)

        # One source_meta row per statement.
        per_source = tuple(src_counts.get(src_name) for src_name
                           in ('reach', 'medscan', 'pc11', 'signor'))
        source_meta_rows.append(
            (stmt_hash, ) + per_source
            + (ev_count, type_num, activity, is_active, agent_count,
               len(src_counts), json.dumps(src_counts), picked['only_src'],
               picked['has_rd'], picked['has_db'])
        )

        # One mesh_meta row per MeSH ID; the leading 'D' is stripped and the
        # remainder stored as an integer.
        mesh_meta_rows.extend(
            (stmt_hash, ev_count, int(mesh_id[1:]), type_num, activity,
             is_active, agent_count)
            for mesh_id in picked['mesh_ids']
        )

        # Agent rows are routed by the namespace found at index 2.
        ref_rows, _, _ = extract_agent_data(stmt, stmt_hash)
        for ref_row in ref_rows:
            meta_row = ref_row[:4] + (
                ro_role_map.get_int(ref_row[4]), type_num, ev_count,
                activity, is_active, agent_count)
            if meta_row[2] == 'NAME':
                # The name table has no db_name column, so drop index 2.
                name_meta_rows.append(meta_row[:2] + meta_row[3:])
            elif meta_row[2] == 'TEXT':
                # Likewise for the text table.
                text_meta_rows.append(meta_row[:2] + meta_row[3:])
            else:
                other_meta_rows.append(meta_row)

    # Create the tables in a fresh temporary database and load the rows.
    db = get_temp_db(clear=True)
    db.SourceMeta.load_cols(db.engine,
                            [{'name': src_name} for src_name, _ in sources])
    for table in (db.SourceMeta, db.MeshMeta, db.NameMeta, db.TextMeta,
                  db.OtherMeta):
        table.__table__.create(db.engine)
    for target, rows, cols in (
            ('readonly.source_meta', source_meta_rows, source_meta_cols),
            ('readonly.mesh_meta', mesh_meta_rows, mesh_meta_cols),
            ('readonly.name_meta', name_meta_rows, name_meta_cols),
            ('readonly.text_meta', text_meta_rows, text_meta_cols),
            ('readonly.other_meta', other_meta_rows, other_meta_cols),
    ):
        db.copy(target, rows, cols)
    return db