Ejemplo n.º 1
0
 def __init__(self):
     """Index all public StatementFinder subclasses by snake_case name."""
     # Private finder classes (leading underscore) are intentionally not
     # registered as options.
     self.__option_dict = {
         un_camel(sub.__name__): sub
         for sub in get_all_descendants(StatementFinder)
         if not sub.__name__.startswith('_')
     }
Ejemplo n.º 2
0
 def __init__(
     self,
     model: BaseEstimator,
     source_list: List[str],
     include_more_specific: bool = False,
     use_stmt_type: bool = False,
     use_num_members: bool = False,
     use_num_pmids: bool = False,
     use_promoter: bool = False,
     use_avg_evidence_len: bool = False,
 ):
     """Store the wrapped model, evidence sources, and feature flags."""
     # The superclass is responsible for holding on to the model itself.
     super(CountsScorer, self).__init__(model)
     self.source_list = source_list
     self.include_more_specific = include_more_specific
     self.use_stmt_type = use_stmt_type
     self.use_num_members = use_num_members
     self.use_num_pmids = use_num_pmids
     self.use_promoter = use_promoter
     self.use_avg_evidence_len = use_avg_evidence_len
     # When statement types are used as features, assign each INDRA
     # Statement subclass a stable integer index. Note the attribute is
     # only created when use_stmt_type is set, as in the original contract.
     if use_stmt_type:
         self.stmt_type_map = {}
         for index, stmt_cls in enumerate(get_all_descendants(Statement)):
             self.stmt_type_map[stmt_cls.__name__] = index
Ejemplo n.º 3
0
 def __init__(self):
     """Build the option dict mapping snake_case finder names to classes."""
     self.__option_dict = {}
     # Register every StatementFinder subclass under its un-camel-cased
     # (snake_case) class name; private classes (leading underscore) are
     # skipped.
     for cls in get_all_descendants(StatementFinder):
         if cls.__name__.startswith('_'):
             continue
         self.__option_dict[un_camel(cls.__name__)] = cls
     return
Ejemplo n.º 4
0
def get_class_from_name(cls_name, parent_cls):
    """Resolve a name to a descendant class of `parent_cls`.

    The comparison is case-insensitive after camelizing `cls_name`.
    Raises NotAClassName when no descendant class matches.
    """
    # Camelize once up front; every candidate is compared against this.
    target = camelize(cls_name).lower()
    for candidate in get_all_descendants(parent_cls):
        if candidate.__name__.lower() == target:
            return candidate
    raise NotAClassName(f'{cls_name} is not recognized as a '
                        f'{parent_cls.__name__} type!')
Ejemplo n.º 5
0
def search():
    """Render the Search page, choosing Vue asset URLs by deployment mode."""
    # All Statement subclass names, minus the excluded non-binary types.
    excluded = {"Influence", "Event", "Unresolved"}
    stmt_types_json = json.dumps(sorted(
        c.__name__ for c in get_all_descendants(Statement)
        if c.__name__ not in excluded
    ))
    if not TESTING["status"]:
        # Production: serve the Vue bundle from the configured root.
        vue_src = f"{VUE_ROOT}/IndralabVue.umd.js"
        vue_style = f"{VUE_ROOT}/IndralabVue.css"
    elif TESTING["deployment"]:
        # Testing against deployed assets at an explicit root.
        vue_root = TESTING["vue-root"]
        logging.info(f"Testing deployed vue files at: {vue_root}")
        vue_src = f"{vue_root}/IndralabVue.umd.js"
        vue_style = f"{vue_root}/IndralabVue.css"
    else:
        # Local testing: serve the bundle through the dedicated endpoint.
        vue_src = url_for("serve_indralab_vue", file="IndralabVue.umd.js")
        vue_style = url_for("serve_indralab_vue", file="IndralabVue.css")
    return render_my_template(
        "search.html",
        "Search",
        source_colors=DEFAULT_SOURCE_COLORS,
        source_info=SOURCE_INFO,
        search_active=True,
        vue_src=vue_src,
        vue_style=vue_style,
        stmt_types_json=stmt_types_json,
        reverse_source_mapping=rev_source_mapping,
        sources_dict=sources_dict,
    )
Ejemplo n.º 6
0
    def __init__(self):
        """Create bidirectional maps between statement-type names and ints."""
        # Sorting the class names makes the integer assignment deterministic.
        sorted_names = sorted(
            cls.__name__ for cls in get_all_descendants(Statement))
        self._int_to_str = dict(enumerate(sorted_names))
        self._str_to_int = {name: num
                            for num, name in enumerate(sorted_names)}
Ejemplo n.º 7
0
    def _run(self, subject=None, object=None, agents=None, stmt_type=None,
             use_exact_type=False, persist=True, strict_stop=False,
             **api_params):
        """Launch the paged statement queries in a background thread.

        The query is constrained by subject/object/agents and stmt_type
        (expanded to descendant types unless use_exact_type is True).
        `api_params` must contain 'max_stmts' and may contain 'timeout'.
        Raises ValueError if no agent constraint at all is given.
        """
        # Reset paging/bookkeeping state for a fresh run.
        self.__started = False
        self.__done_dict = defaultdict(lambda: False)
        self.__page_dict = defaultdict(lambda: 0)
        self.__th = None
        self.__quota = api_params['max_stmts']

        # Make sure we got at least SOME agents (the remote API will error if
        # we proceed with no arguments).
        if subject is None and object is None and not agents:
            raise ValueError("At least one agent must be specified, or else "
                             "the scope will be too large.")

        # Make timeouts apply differently in this case: without strict_stop
        # the timeout is popped, so it only bounds the join below and is not
        # forwarded to the queries; with strict_stop it is also left in
        # api_params and passed through.
        if not strict_stop:
            timeout = api_params.pop('timeout', None)
        else:
            timeout = api_params.get('timeout', None)

        # Formulate inputs for the agents..
        key_val_list = [('subject', subject), ('object', object)]
        params = {param_key: param_val for param_key, param_val in key_val_list
                  if param_val is not None}
        params.update(api_params)

        agent_strs = [] if agents is None else ['agent%d=%s' % (i, ag)
                                                for i, ag in enumerate(agents)]

        # Handle the type(s): expand to descendant classes unless an exact
        # type match was requested.
        stmt_types = [stmt_type] if stmt_type else []
        if stmt_type is not None and not use_exact_type:
            stmt_class = get_statement_by_name(stmt_type)
            descendant_classes = get_all_descendants(stmt_class)
            stmt_types += [cls.__name__ for cls in descendant_classes]

        # Handle the content if we were limited.
        args = [agent_strs, stmt_types, params, persist]
        logger.debug("The remainder of the query will be performed in a "
                     "thread...")
        self.__th = Thread(target=self._run_queries, args=args)
        self.__th.start()

        # Block on the worker: indefinitely if timeout is None, for at most
        # `timeout` seconds if positive, and not at all if timeout == 0.
        if timeout is None:
            logger.debug("Waiting for thread to complete...")
            self.__th.join()
        elif timeout:  # is not 0
            logger.debug("Waiting at most %d seconds for thread to complete..."
                         % timeout)
            self.__th.join(timeout)
        return
Ejemplo n.º 8
0
class End(Dumper):
    """Mark the dump as complete."""
    # Dumper configuration: a small JSON marker that needs no database
    # and requires every other Dumper subclass to have run first.
    name = 'end'
    fmt = 'json'
    db_required = False
    requires = get_all_descendants(Dumper)

    def dump(self, continuing=False):
        """Upload a JSON timestamp of the completion time to S3."""
        s3 = boto3.client('s3')
        self.get_s3_path().upload(
            s3,
            json.dumps(
                {'datetime': datetime.now().strftime('%Y-%m-%d %H:%M:%S')}))
Ejemplo n.º 9
0
def dump_hierarchy():
    """Dump hierarchy of Dumper classes to S3."""
    # Map dashed command names to each dumper's JSON config, skipping the
    # FullPaStmts dumper.
    hierarchy = {
        dumper.name.replace('_', '-'): dumper.config_to_json()
        for dumper in get_all_descendants(Dumper)
        if dumper.name != 'full_pa_stmts'
    }
    s3_path = get_s3_dump().get_element_path('hierarchy.json')
    s3_path.upload(boto3.client('s3'),
                   json.dumps(hierarchy).encode('utf-8'))
Ejemplo n.º 10
0
    def __init__(self, subject=None, object=None, agents=None, stmt_type=None,
                 use_exact_type=False, persist=True, timeout=None, ev_limit=10,
                 best_first=True, tries=2, max_stmts=None):
        """Start querying the INDRA DB REST API in a background thread.

        The query is constrained by subject/object/agents and stmt_type
        (expanded to descendant types unless use_exact_type is True).
        Depending on `timeout`, this constructor blocks until the worker
        thread finishes, for a bounded time, or not at all.
        Raises ValueError if no agent constraint at all is given.
        """
        # Result containers and paging/bookkeeping state.
        self.statements = []
        self.statements_sample = None
        self.__statement_jsons = {}
        self.__done_dict = defaultdict(lambda: False)
        self.__evidence_counts = {}
        self.__started = False
        self.__page_dict = defaultdict(lambda: 0)
        self.__th = None
        self.__quota = max_stmts

        # Make sure we got at least SOME agents (the remote API will error if
        # we proceed with no arguments).
        if subject is None and object is None and not agents:
            raise ValueError("At least one agent must be specified, or else "
                             "the scope will be too large.")

        # Formulate inputs for the agents..
        agent_strs = [] if agents is None else ['agent%d=%s' % (i, ag)
                                                for i, ag in enumerate(agents)]
        key_val_list = [('subject', subject), ('object', object)]
        params = {param_key: param_val for param_key, param_val in key_val_list
                  if param_val is not None}
        params['best_first'] = best_first
        params['ev_limit'] = ev_limit
        params['tries'] = tries

        # Handle the type(s): expand to all descendant statement classes
        # unless the caller asked for the exact type only.
        stmt_types = [stmt_type] if stmt_type else []
        if stmt_type is not None and not use_exact_type:
            stmt_class = get_statement_by_name(stmt_type)
            descendant_classes = get_all_descendants(stmt_class)
            stmt_types += [cls.__name__ for cls in descendant_classes]

        # Handle the content if we were limited.
        args = [agent_strs, stmt_types, params, persist]
        logger.info("The remainder of the query will be performed in a "
                    "thread...")
        self.__th = Thread(target=self._run_queries, args=args)
        self.__th.start()

        # Block on the worker: indefinitely if timeout is None, for at most
        # `timeout` seconds if positive, and not at all if timeout == 0.
        if timeout is None:
            logger.info("Waiting for thread to complete...")
            self.__th.join()
        elif timeout:  # is not 0
            logger.info("Waiting at most %d seconds for thread to complete..."
                        % timeout)
            self.__th.join(timeout)
        return
Ejemplo n.º 11
0
    def __init__(self, corpus_config=None):
        """Choose the query backend and index the available finders."""
        self.corpus_config = corpus_config
        if corpus_config:
            # A local corpus is configured: query through the resource
            # manager rather than the INDRA DB REST service.
            logging.info('Loading MSA with configuration: %s' % corpus_config)
            from bioagents.msa.local_query import resource_manager
            self.idbr = resource_manager.get_resoure(corpus_config)
        else:
            logging.info('Using MSA with INDRA DB REST')
            from indra.sources import indra_db_rest as idbr
            self.idbr = idbr

        # Index every public StatementFinder subclass by snake_case name;
        # private classes (leading underscore) are skipped.
        self.__option_dict = {
            un_camel(sub.__name__): sub
            for sub in get_all_descendants(StatementFinder)
            if not sub.__name__.startswith('_')
        }
Ejemplo n.º 12
0
def test_has_type():
    """Check HasType queries, with and without subclass expansion."""
    ro = get_db('primary')

    # Exact-type query: every returned statement must be one of the types.
    result = HasType(['Phosphorylation', 'Activation'])\
        .get_statements(ro, limit=5, ev_limit=8)
    for stmt in result.statements():
        assert stmt.__class__.__name__ in ('Phosphorylation', 'Activation')

    # Subclass-expanded query: any descendant type is acceptable.
    type_list = ['SelfModification', 'RegulateAmount', 'Translocation']
    result = HasType(type_list, include_subclasses=True)\
        .get_statements(ro, limit=5, ev_limit=8)
    allowed = set()
    for base_cls in (get_statement_by_name(n) for n in type_list):
        allowed.add(base_cls)
        allowed.update(get_all_descendants(base_cls))
    for stmt in result.statements():
        assert type(stmt) in allowed
Ejemplo n.º 13
0
def search():
    """Render the search page with statement types and Vue asset URLs."""
    type_names = {c.__name__ for c in get_all_descendants(Statement)}
    # Drop statement classes that should not appear in the search menu.
    for skipped in ('Influence', 'Event', 'Unresolved'):
        type_names.discard(skipped)
    stmt_types_json = json.dumps(sorted(type_names))
    source_info, source_colors = get_html_source_info()
    if TESTING['status']:
        # Local testing: serve the Vue bundle via the dedicated endpoint.
        vue_src = url_for("serve_indralab_vue", file='IndralabVue.umd.js')
        vue_style = url_for("serve_indralab_vue", file='IndralabVue.css')
    else:
        vue_src = f'{VUE_ROOT}/IndralabVue.umd.js'
        vue_style = f'{VUE_ROOT}/IndralabVue.css'
    return render_my_template('search.html',
                              'Search',
                              source_colors=source_colors,
                              source_info=source_info,
                              search_active=True,
                              vue_src=vue_src,
                              vue_style=vue_style,
                              stmt_types_json=stmt_types_json)
Ejemplo n.º 14
0
class End(Dumper):
    """Mark the dump as complete."""
    # Dumper configuration: a small JSON marker that needs no database
    # and no heavy compute, and runs after all other (non-FullPaStmts)
    # dumpers have finished.
    name = 'end'
    fmt = 'json'
    db_required = False
    # We don't need a FullPaStmts as a pickle because we already have the
    # jsonl (keeping the class definition if ever need to save a pickle)
    requires = [
        dumper for dumper in get_all_descendants(Dumper)
        if dumper.name != 'full_pa_stmts'
    ]
    heavy_compute = False

    def dump(self, continuing=False):
        """Upload a UTF-8 JSON timestamp of the completion time to S3."""
        s3 = boto3.client('s3')
        self.get_s3_path().upload(
            s3,
            json.dumps({
                'datetime': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            }).encode('utf-8'))
Ejemplo n.º 15
0
def load_res_pos(ro=None):
    """Return residue/position data keyed by hash"""
    logger.info('Getting residue and position info')
    ro = get_ro('primary') if ro is None else ro
    res = {'residue': {}, 'position': {}}
    # Abstract modification class names that are skipped.
    skip_names = ('Modification', 'AddModification', 'RemoveModification')
    for stmt_type in get_all_descendants(Modification):
        stmt_name = stmt_type.__name__
        if stmt_name in skip_names:
            continue
        logger.info(f'Getting statements for type {stmt_name}')
        type_num = ro_type_map.get_int(stmt_name)
        query = ro.select_all(ro.FastRawPaLink.pa_json,
                              ro.FastRawPaLink.type_num == type_num)
        for (jsb,) in query:
            js = json.loads(jsb)
            # Record whichever of residue/position the JSON carries, keyed
            # by the statement's matches hash.
            for field in ('residue', 'position'):
                if field in js:
                    res[field][int(js['matches_hash'])] = js[field]
    return res
Ejemplo n.º 16
0
def _build_verb_map():
    """Build a map from verb forms to their statement type and verb form.

    Returns
    -------
    dict
        Maps each verb string to a dict with keys 'stmt' (the Statement
        class name) and 'type' (which verb form it is: 'base', e.g.
        "phosphorylate"; 'present', e.g. "inhibits"; or 'passive', e.g.
        "activated").
    """
    # These are statement types that aren't binary and therefore don't need
    # to be included in the verb map.
    non_binary = ('hasactivity', 'activeform', 'selfmodification',
                  'autophosphorylation', 'transphosphorylation', 'event',
                  'unresolved', 'association', 'complex')
    verb_map = {}
    for stmt in get_all_descendants(Statement):
        name = stmt.__name__
        # Compute the lowercase form once; it is needed for the skip test
        # and for each of the three verb-form lookups below.
        lower_name = name.lower()
        if lower_name in non_binary:
            continue
        # Register each grammatical form of the statement's verb.
        for verb_fn, form in ((statement_base_verb, 'base'),
                              (statement_present_verb, 'present'),
                              (statement_passive_verb, 'passive')):
            verb_map[verb_fn(lower_name)] = {'stmt': name, 'type': form}
    return verb_map
Ejemplo n.º 17
0
 def get_sorted_descendants(cls):
     """Return the sorted names of all descendant classes of `cls`."""
     descendant_names = get_names(get_all_descendants(cls))
     return sorted(descendant_names)
Ejemplo n.º 18
0
def get_statements(subject=None, object=None, agents=None, stmt_type=None,
                   use_exact_type=False, on_limit='sample'):
    """Get statements from INDRA's database using the web api.

    Parameters
    ----------
    subject/object : str
        Optionally specify the subject and/or object of the statements
        you wish to get from the database. By default, the namespace is assumed
        to be HGNC gene names, however you may specify another namespace by
        including `@<namespace>` at the end of the name string. For example, if
        you want to specify an agent by chebi, you could use `CHEBI:6801@CHEBI`,
        or if you wanted to use the HGNC id, you could use `6871@HGNC`.
    agents : list[str]
        A list of agents, specified in the same manner as subject and object,
        but without specifying their grammatical position.
    stmt_type : str
        Specify the types of interactions you are interested in, as indicated
        by the sub-classes of INDRA's Statements. This argument is *not* case
        sensitive. If the statement class given has sub-classes
        (e.g. RegulateAmount has IncreaseAmount and DecreaseAmount), then both
        the class itself, and its subclasses, will be queried, by default. If
        you do not want this behavior, set use_exact_type=True.
    use_exact_type : bool
        If stmt_type is given, and you only want to search for that specific
        statement type, set this to True. Default is False.
    on_limit : str
        There are four options for handling a query that is too large:
        `sample` - (default) take a sample of statements from the result,
        `truncate` - simply return the first 10,000 statements of the result,
        `error` - raise an error if the query is too large, or
        `persist` - perform as many queries as needed to get all the statements.
        Note that this last option generally takes much much longer to execute.

    Returns
    -------
    stmts : list[:py:class:`indra.statements.Statement`]
        A list of INDRA Statement instances. Note that if a supporting or
        supported Statement was not included in your query, it will simply be
        instantiated as an `Unresolved` statement, with `uuid` of the statement.
    """
    # Make sure we got at least SOME agents (the remote API will error if
    # we proceed with no arguments). The falsy check on `agents` also
    # rejects an empty list, not just None, matching the other API entry
    # points in this project.
    if subject is None and object is None and not agents:
        raise ValueError("At least one agent must be specified, or else "
                         "the scope will be too large.")

    # Formulate inputs for the agents.
    agent_strs = [] if agents is None else ['agent=%s' % ag for ag in agents]
    key_val_list = [('subject', subject), ('object', object)]
    params = {param_key: param_val for param_key, param_val in key_val_list
              if param_val is not None}
    params['on_limit'] = on_limit

    # Handle the type(s): query the exact type only, or expand the query to
    # include all descendant statement types as well.
    if stmt_type is not None:
        if use_exact_type:
            params['type'] = stmt_type
            stmts = _make_stmts_query(agent_strs, params)
        else:
            stmt_class = get_statement_by_name(stmt_type)
            descendant_classes = get_all_descendants(stmt_class)
            stmt_types = [cls.__name__ for cls in descendant_classes] \
                + [stmt_type]
            stmts = _query_stmt_types(agent_strs, params, stmt_types)
    else:
        stmts = _make_stmts_query(agent_strs, params)
    return stmts
Ejemplo n.º 19
0
def get_statements(subject=None,
                   object=None,
                   agents=None,
                   stmt_type=None,
                   use_exact_type=False,
                   persist=True,
                   timeout=None,
                   simple_response=True,
                   ev_limit=10,
                   best_first=True,
                   tries=2,
                   max_stmts=None):
    """Get Statements from the INDRA DB web API matching given agents and type.

    There are two types of response available. You can just get a list of
    INDRA Statements, or you can get an IndraRestResponse object, which allows
    Statements to be loaded in a background thread, providing a sample of the
    best* content available promptly in the sample_statements attribute, and
    populates the statements attribute when the paged load is complete.

    *In the sense of having the most supporting evidence.

    Parameters
    ----------
    subject/object : str
        Optionally specify the subject and/or object of the statements
        you wish to get from the database. By default, the namespace is assumed
        to be HGNC gene names, however you may specify another namespace by
        including `@<namespace>` at the end of the name string. For example, if
        you want to specify an agent by chebi, you could use `CHEBI:6801@CHEBI`,
        or if you wanted to use the HGNC id, you could use `6871@HGNC`.
    agents : list[str]
        A list of agents, specified in the same manner as subject and object,
        but without specifying their grammatical position.
    stmt_type : str
        Specify the types of interactions you are interested in, as indicated
        by the sub-classes of INDRA's Statements. This argument is *not* case
        sensitive. If the statement class given has sub-classes
        (e.g. RegulateAmount has IncreaseAmount and DecreaseAmount), then both
        the class itself, and its subclasses, will be queried, by default. If
        you do not want this behavior, set use_exact_type=True. Note that if
        max_stmts is set, it is possible only the exact statement type will
        be returned, as this is the first searched. The processor then cycles
        through the types, getting a page of results for each type and adding it
        to the quota, until the max number of statements is reached.
    use_exact_type : bool
        If stmt_type is given, and you only want to search for that specific
        statement type, set this to True. Default is False.
    persist : bool
        Default is True. When False, if a query comes back limited (not all
        results returned), just give up and pass along what was returned.
        Otherwise, make further queries to get the rest of the data (which may
        take some time).
    timeout : positive int or None
        If an int, block until the work is done and statements are retrieved, or
        until the timeout has expired, in which case the results so far will be
        returned in the response object, and further results will be added in
        a separate thread as they become available. If simple_response is True,
        all statements available will be returned. Otherwise (if None), block
        indefinitely until all statements are retrieved. Default is None.
    simple_response : bool
        If True, a simple list of statements is returned (thus block should also
        be True). If block is False, only the original sample will be returned
        (as though persist was False), until the statements are done loading, in
        which case the rest should appear in the list. This behavior is not
        encouraged. Default is True (for the sake of backwards compatibility).
    ev_limit : int or None
        Limit the amount of evidence returned per Statement. Default is 10.
    best_first : bool
        If True, the preassembled statements will be sorted by the amount of
        evidence they have, and those with the most evidence will be
        prioritized. When using `max_stmts`, this means you will get the "best"
        statements. If False, statements will be queried in arbitrary order.
    tries : int > 0
        Set the number of times to try the query. The database often caches
        results, so if a query times out the first time, trying again after a
        timeout will often succeed fast enough to avoid a timeout. This can also
        help gracefully handle an unreliable connection, if you're willing to
        wait. Default is 2.
    max_stmts : int or None
        Select the maximum number of statements to return. When set less than
        1000 the effect is much the same as setting persist to false, and will
        guarantee a faster response. Default is None.

    Returns
    -------
    stmts : list[:py:class:`indra.statements.Statement`]
        A list of INDRA Statement instances. Note that if a supporting or
        supported Statement was not included in your query, it will simply be
        instantiated as an `Unresolved` statement, with `uuid` of the statement.
    """
    # Make sure we got at least SOME agents (the remote API will error if
    # we proceed with no arguments). The falsy check on `agents` also
    # rejects an empty list, not just None, matching the other API entry
    # points in this project.
    if subject is None and object is None and not agents:
        raise ValueError("At least one agent must be specified, or else "
                         "the scope will be too large.")

    # Formulate inputs for the agents.
    agent_strs = [] if agents is None else [
        'agent%d=%s' % (i, ag) for i, ag in enumerate(agents)
    ]
    key_val_list = [('subject', subject), ('object', object)]
    params = {
        param_key: param_val
        for param_key, param_val in key_val_list if param_val is not None
    }
    params['best_first'] = best_first
    params['ev_limit'] = ev_limit
    params['tries'] = tries

    # Handle the type(s): expand to all descendant statement classes unless
    # the caller asked for the exact type only.
    stmt_types = [stmt_type] if stmt_type else []
    if stmt_type is not None and not use_exact_type:
        stmt_class = get_statement_by_name(stmt_type)
        descendant_classes = get_all_descendants(stmt_class)
        stmt_types += [cls.__name__ for cls in descendant_classes]

    # Get the response object, which performs the (possibly threaded) queries.
    resp = IndraDBRestResponse(max_stmts=max_stmts)
    resp.make_stmts_queries(agent_strs, stmt_types, params, persist, timeout)

    # Format the result appropriately.
    if simple_response:
        ret = resp.statements
    else:
        ret = resp
    return ret
Ejemplo n.º 20
0
        self._set_lambda_env({'INDRAROOVERRIDE': str(self.principal.url)})

    def __exit__(self, exc_type, value, traceback):
        """Restore the lambda environment when leaving the context.

        Only redirects the service back to the readonly database if the
        `with` block exited without an exception.
        """
        # Check for exceptions. Only change back over if there were no
        # exceptions.
        if exc_type is None:
            logger.info("Directing the service back to %s." %
                        self.readonly.url)
            self._set_lambda_env({})
        else:
            logger.warning("An error %s occurred. Assuming the database is "
                           "not usable, and not transfering the service back "
                           "to Readonly." % exc_type)


# Registry of all Dumper subclasses, keyed by each class's `name` attribute.
dumpers = {dumper.name: dumper for dumper in get_all_descendants(Dumper)}


def dump(principal_db,
         readonly_db,
         delete_existing=False,
         allow_continue=True,
         load_only=False,
         dump_only=False):
    """Run the dump pipeline from the principal to the readonly database.

    NOTE(review): this function appears to continue beyond the visible
    excerpt; only the initial steps are documented here.
    """
    # Optionally wipe any pre-existing readonly schema before dumping.
    if delete_existing and 'readonly' in principal_db.get_schemas():
        principal_db.drop_schema('readonly')

    if not load_only:
        # Record the start of a dump (continuing a prior one if allowed).
        starter = Start()
        starter.dump(continuing=allow_continue)
Ejemplo n.º 21
0
from indra.statements import get_all_descendants, Statement
from indra_reading.batch.submitters.submitter import Submitter
from indra_reading.batch.util import bucket_name

# Statement types to skip by default.
DEFAULT_AVOID_STATEMENTS = ['Event', 'Influence', 'Unresolved']
# Names of all Statement subclasses except the avoided ones.
VALID_STATEMENTS = [st.__name__ for st in get_all_descendants(Statement)
                    if st.__name__ not in DEFAULT_AVOID_STATEMENTS]


class PreassemblySubmitter(Submitter):
    # Submitter configuration — presumably consumed by the Submitter base
    # class (not visible here); TODO confirm against its definition.
    job_class = 'preassembly'
    _purpose = 'db_preassembly'
    # Queue / job-definition names mapped to the tasks they serve.
    _job_queue_dict = {'run_db_reading_queue': ['create', 'update']}
    _job_def_dict = {'run_db_reading_jobdef': ['create', 'update']}

    def __init__(self, basename, task, *args, **kwargs):
        """Validate the task mode and initialize the submitter.

        Raises ValueError if `task` is not 'create' or 'update'.
        """
        valid_tasks = ['create', 'update']
        if task not in valid_tasks:
            raise ValueError(f"Invalid task '{task}': expected 'create' or "
                             f"'update'.")
        self.task = task
        super(PreassemblySubmitter, self).__init__(basename, *args, **kwargs)

    def _iter_over_select_queues(self):
        """Yield the job queues whose task list includes the current task."""
        for queue_name, applicable_tasks in self._job_queue_dict.items():
            if self.task in applicable_tasks:
                yield queue_name

    def _get_command(self, job_type_set, *args):
        if len(args) == 2:
            stmt_type, batch_size = args
Ejemplo n.º 22
0
def expand_signed(df: pd.DataFrame, sign_dict: Dict[str, int],
                  stmt_types: List[str], use_descendants: bool = True) \
        -> pd.DataFrame:
    """Expands out which statements should be added to the signed graph

    The statements types provided in 'stmt_types' will be added for both
    signs. To add more statement types of just one sign, add it to 'sign_dict'.

    Parameters
    ----------
    df : pd.DataFrame
    sign_dict : Dict[str, int]
        A dictionary mapping a Statement type to a sign to be used for the
        edge. By default only Activation and IncreaseAmount are added as
        positive edges and Inhibition and DecreaseAmount are added as
        negative edges, but a user can pass any other Statement types in a
        dictionary.
    stmt_types : List[str]
        The statement types to match to expand signs to. The rows matching
        these types will be duplicated and each copy gets a distinct sign.
    use_descendants : bool
        If True, also match descendants of the statements provided in
        'stmt_types' when adding the extended signs.

    Returns
    -------
    pd.DataFrame
    """
    if use_descendants:
        logger.info('Getting descendants to match for expanded signed graph')
        # Collect the names of all descendant statement classes as well.
        # (The inner loop variable is named distinctly so it does not
        # shadow the outer name, as it previously did.)
        more_stmt_types = set(stmt_types)
        for type_name in stmt_types:
            more_stmt_types.update({
                desc.__name__
                for desc in get_all_descendants(
                    get_statement_by_name(type_name))
            })
        stmt_types = list(more_stmt_types)

    # Add new sign column, set to None. Using 'initial_sign' allows usage of
    # IndraNet.to_signed_graph
    df['initial_sign'] = None

    # Locate relevant rows
    standard_sign = df.stmt_type.isin(sign_dict.keys())
    expand_sign = df.stmt_type.isin(stmt_types)
    assert sum(standard_sign) + sum(expand_sign) > 0, \
        'All rows filtered out from DataFrame. Check that statement types ' \
        'in sign_dict and stmt_types exist in the DataFrame.'
    if sum(expand_sign) == 0:
        logger.warning('No rows can be used for expanded signed edges. Check '
                       'that statement types in stmt_types exist in the '
                       'DataFrame.')

    # Add sign for signed statements
    logger.info('Setting initial sign for signed types')
    df.loc[standard_sign, 'initial_sign'] = \
        df.loc[standard_sign, 'stmt_type'].apply(lambda st: sign_dict.get(st))

    # Add positive sign to the rows with types in stmt_types
    df.loc[expand_sign, 'initial_sign'] = INT_PLUS

    # Copy rows for expand sign and switch sign
    logger.info('Setting initial sign for expanded signed types')
    add_rows = []
    for _, expand_row in df[expand_sign].iterrows():
        exp_row = [
            INT_MINUS if col == 'initial_sign' else val
            for col, val in expand_row.items()
        ]
        add_rows.append(exp_row)

    logger.info('Appending extended signed rows')
    extra_df = pd.DataFrame(add_rows, columns=df.columns.values)
    # DataFrame.append was removed in pandas 2.0; pd.concat is the
    # equivalent replacement (same default index behavior).
    df = pd.concat([df, extra_df])

    # Remove all rows without assigned sign
    logger.info('Removing rows without signed')
    df = df[~df.initial_sign.isna()]

    # Re-cast sign column as a nullable integer type
    try:
        df.initial_sign = df.initial_sign.astype(pd.Int32Dtype())
    except Exception:
        link = 'https://pandas.pydata.org/pandas-docs/stable/user_guide' \
          '/integer_na.html'
        logger.warning(f'Could not set sign column as Nullable Integer Data '
                       f'Type. Make sure to use pandas v0.24+. See {link}')

    return df
Ejemplo n.º 23
0
from indra.statements import get_all_descendants, Statement
from indra_reading.batch.submitters.submitter import Submitter
from indra_reading.batch.util import bucket_name

# Statement types to skip by default.
DEFAULT_AVOID_STATEMENTS = ['Event', 'Influence', 'Unresolved']
# Names of all Statement subclasses except the avoided ones.
VALID_STATEMENTS = [
    st.__name__ for st in get_all_descendants(Statement)
    if st.__name__ not in DEFAULT_AVOID_STATEMENTS
]


class PreassemblySubmitter(Submitter):
    # Submitter configuration — presumably consumed by the Submitter base
    # class (not visible here); TODO confirm against its definition.
    job_class = 'preassembly'
    _purpose = 'db_preassembly'
    # Queue / job-definition names mapped to the tasks they serve.
    _job_queue_dict = {'run_db_reading_queue': ['create', 'update']}
    _job_def_dict = {'run_db_reading_jobdef': ['create', 'update']}

    def __init__(self, basename, task, *args, **kwargs):
        """Initialize the submitter, validating the task mode.

        Raises ValueError if `task` is not 'create' or 'update'.
        """
        if task not in ['create', 'update']:
            raise ValueError(f"Invalid task '{task}': expected 'create' or "
                             f"'update'.")
        self.task = task
        super(PreassemblySubmitter, self).__init__(basename, *args, **kwargs)

    def _iter_over_select_queues(self):
        """Yield the job queues whose task list includes the current task."""
        for jq, tasks in self._job_queue_dict.items():
            if self.task not in tasks:
                continue
            yield jq

    def _get_command(self, job_type_set, *args):