Example #1
def test_list_dumps():
    """Test the dump listing feature."""
    _build_s3_test_dump({
        '2020-01-01': ['start'],         # a dump that is unfinished.
        '2020-02-01': ['start', 'end'],  # a dump that is finished.
        '2020-03-01': ['sif']            # something strange but possible.
    })

    dump_head = config.get_s3_dump()

    def check_list(dumps, expected_timestamps):
        assert all(isinstance(s3p, S3Path) for s3p in dumps)
        assert all(s3p.key.startswith(dump_head.key) for s3p in dumps)
        time_stamps = [s3p.key.split('/')[-2] for s3p in dumps]
        assert expected_timestamps == time_stamps,\
            f"Expected: {expected_timestamps}, Got: {time_stamps}"

    all_dumps = dm.list_dumps()
    check_list(all_dumps, ['2020-01-01', '2020-02-01', '2020-03-01'])

    started_dumps = dm.list_dumps(started=True)
    check_list(started_dumps, ['2020-01-01', '2020-02-01'])

    done_dumps = dm.list_dumps(started=True, ended=True)
    check_list(done_dumps, ['2020-02-01'])

    unfinished_dumps = dm.list_dumps(started=True, ended=False)
    check_list(unfinished_dumps, ['2020-01-01'])
Example #2
def list_dumps():
    s3_base = get_s3_dump()
    s3 = boto3.client('s3')
    res = s3.list_objects_v2(Delimiter='/', **s3_base.kw(prefix=True))
    # 'CommonPrefixes' is absent when nothing matches the prefix.
    return [
        S3Path.from_key_parts(s3_base.bucket, d['Prefix'])
        for d in res.get('CommonPrefixes', [])
    ]
Example #3
@classmethod
def is_dump_path(cls, s3_path):
    s3_base = get_s3_dump()
    if s3_base.bucket != s3_path.bucket:
        return False
    if s3_base.key not in s3_path.key:
        return False
    if cls.name not in s3_path.key:
        return False
    return True
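
A quick, hypothetical usage sketch of the check above. It assumes get_s3_dump() returns S3Path(bigmech, indra-db/dumps/), as in the list_dumps docstring below, and that Sif is a Dumper subclass whose name is 'sif'; the keys are illustrative only.

# Hypothetical paths; bucket, prefix, and file names are illustrative.
good = S3Path('bigmech', 'indra-db/dumps/2020-01-01/sif.pkl')
wrong_bucket = S3Path('other-bucket', 'indra-db/dumps/2020-01-01/sif.pkl')
other_dumper = S3Path('bigmech', 'indra-db/dumps/2020-01-01/belief.json')

assert Sif.is_dump_path(good)              # bucket, base key, and name all match
assert not Sif.is_dump_path(wrong_bucket)  # bucket differs
assert not Sif.is_dump_path(other_dumper)  # 'sif' not in the key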
Example #4
    def dump_readonly(self, dump_file=None):
        """Dump the readonly schema to s3."""

        # Form the name of the s3 file, if not given.
        if dump_file is None:
            from indra_db.config import get_s3_dump
            now_str = datetime.utcnow().strftime('%Y-%m-%d-%H-%M-%S')
            dump_loc = get_s3_dump()
            dump_file = dump_loc.get_element_path('readonly-%s.dump' % now_str)
        return self.pg_dump(dump_file, schema='readonly')
Example #5
def get_filled_ro(num_stmts):
    db = get_prepped_db(num_stmts, with_pa=True, with_agents=True)
    db.generate_readonly()
    s3_base = get_s3_dump()
    assert s3_base, "No s3 config available for db dumps."
    s3_path = dbu.S3Path.from_string(s3_base.to_string() + '-test')
    now_str = datetime.utcnow().strftime('%Y-%m-%d-%H-%M-%S')
    # Point at the dump file itself, mirroring dump_readonly's default naming.
    s3_path = s3_path.get_element_path('readonly-%s.dump' % now_str)
    db.dump_readonly(s3_path)
    ro = get_temp_ro()
    ro.load_dump(s3_path)
    return ro
Example #6
def dump_hierarchy():
    """Dump hierarchy of Dumper classes to S3."""
    hierarchy = {}
    for d in get_all_descendants(Dumper):
        # Skip the FullPaStmts here.
        if d.name == 'full_pa_stmts':
            continue
        command_name = d.name.replace('_', '-')
        hierarchy[command_name] = d.config_to_json()
    s3_base = get_s3_dump()
    s3_path = s3_base.get_element_path('hierarchy.json')
    s3 = boto3.client('s3')
    s3_path.upload(s3, json.dumps(hierarchy).encode('utf-8'))
Example #7
def list_dumps(started=None, ended=None):
    """List all dumps, optionally filtered by their status.

    Parameters
    ----------
    started : Optional[bool]
        If True, find dumps that have started. If False, find dumps that have
        NOT been started. If None, do not filter by start status.
    ended : Optional[bool]
        The same as `started`, but checking whether the dump is ended or not.

    Returns
    -------
    list of S3Path objects
        Each S3Path object contains the bucket and key prefix information for
        a set of dump files, e.g.

            [S3Path(bigmech, indra-db/dumps/2020-07-16/),
             S3Path(bigmech, indra-db/dumps/2020-08-28/),
             S3Path(bigmech, indra-db/dumps/2020-09-18/),
             S3Path(bigmech, indra-db/dumps/2020-11-12/),
             S3Path(bigmech, indra-db/dumps/2020-11-13/)]
    """
    # Get all the dump "directories".
    s3_base = get_s3_dump()
    s3 = boto3.client('s3')
    res = s3.list_objects_v2(Delimiter='/', **s3_base.kw(prefix=True))
    if res['KeyCount'] == 0:
        return []
    dumps = [
        S3Path.from_key_parts(s3_base.bucket, d['Prefix'])
        for d in res['CommonPrefixes']
    ]

    # Filter to those that have "started"
    if started is not None:
        dumps = [
            p for p in dumps
            if p.get_element_path(Start.file_name()).exists(s3) == started
        ]

    # Filter to those that have "ended"
    if ended is not None:
        dumps = [
            p for p in dumps
            if p.get_element_path(End.file_name()).exists(s3) == ended
        ]

    return dumps
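
Read together with the test in Example #1, the filters compose as follows (a minimal usage sketch, assuming the bucket holds the three dumps built there):

# Assumes the fixture from Example #1: 2020-01-01 has only a start
# marker, 2020-02-01 has start and end, 2020-03-01 has neither.
list_dumps()                           # all three dumps
list_dumps(started=True)               # 2020-01-01 and 2020-02-01
list_dumps(started=True, ended=True)   # 2020-02-01 only
list_dumps(started=True, ended=False)  # 2020-01-01 only
list_dumps(started=False)              # 2020-03-01 only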
Example #8
    def get_latest_dump_file():
        import boto3
        from indra.util.aws import iter_s3_keys
        from indra_db.config import get_s3_dump

        s3 = boto3.client('s3')
        s3_path = get_s3_dump()

        logger.debug("Looking for the latest dump file on s3 to %s." % s3_path)

        # Get the most recent file from s3.
        max_date_str = None
        max_lm_date = None
        latest_key = None
        for key, lm_date in iter_s3_keys(s3, with_dt=True, **s3_path.kw()):

            # Get the date string from the name, ignoring non-standard files.
            suffix = key.split('/')[-1]
            m = re.match(r'readonly-(\S+)\.dump', suffix)
            if m is None:
                logger.debug("{key} is not a standard key, will not be "
                             "considered.".format(key=key))
                continue
            date_str, = m.groups()

            # Compare to the current maxes. If the date_str and the
            # last-modified date don't agree, raise an error.
            if not max_lm_date \
                    or (date_str > max_date_str and lm_date > max_lm_date):
                max_date_str = date_str
                max_lm_date = lm_date
                latest_key = key
            elif max_lm_date \
                    and (date_str > max_date_str or lm_date > max_lm_date):
                raise S3DumpTimeAmbiguityError(key, date_str > max_date_str,
                                               lm_date > max_lm_date)
        logger.debug("Latest dump file from %s was found to be %s."
                     % (s3_path, latest_key))

        return S3Path(s3_path.bucket, latest_key)
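
The ambiguity guard fires when the date parsed from the key name and the S3 last-modified time disagree about which file is newest. A toy illustration with hypothetical values:

from datetime import datetime

# Hypothetical: the key name says this file is newer, but its
# last-modified time says it is older than the current best.
date_str, max_date_str = '2020-09-18', '2020-08-28'
lm_date, max_lm_date = datetime(2020, 8, 1), datetime(2020, 9, 1)

newer_by_name = date_str > max_date_str   # True
newer_by_mtime = lm_date > max_lm_date    # False

# The orderings disagree, so the elif branch above would raise
# S3DumpTimeAmbiguityError rather than silently pick a winner.
assert newer_by_name != newer_by_mtime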
Example #9
def _build_s3_test_dump(structure):
    """Build an s3 dump for testing.

    The input is a structure of the following form:
    ```
    structure = {
       '2020-01-01': [
            'start',
            'readonly',
            'sif',
       ],
       '2020-02-01': [
            'start',
            'readonly',
            'sif',
            'belief',
            'end'
       ]
    }
    ```
    where the names given are the canonical names of dumpers (see class
    definitions or `dumpers` global for details).
    """
    s3 = boto3.client('s3')
    dump_head = config.get_s3_dump()
    s3.create_bucket(Bucket=dump_head.bucket)
    for date_stamp, contents in structure.items():
        for dump_name in contents:
            dumper_class = dm.dumpers[dump_name]
            kwargs = {}
            if dumper_class.db_required:
                if 'readonly' in dumper_class.db_options:
                    kwargs['ro'] = None
                else:
                    kwargs['db'] = None
            dumper_class(date_stamp=date_stamp, **kwargs).shallow_mock_dump()
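
A sketch of how such a test can be kept off real S3, assuming the moto library (the docstring of Example #12 mentions a fake moto bucket; note that moto 5.x renames mock_s3 to mock_aws):

from moto import mock_s3

@mock_s3  # all boto3 S3 calls below hit an in-memory fake, not AWS
def test_list_dumps_isolated():
    _build_s3_test_dump({
        '2020-01-01': ['start'],
        '2020-02-01': ['start', 'end'],
    })
    assert len(dm.list_dumps()) == 2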
Example #10
def _gen_s3_name(self):
    s3_base = get_s3_dump()
    s3_path = s3_base.get_element_path(self.date_stamp,
                                       '%s.%s' % (self.name, self.fmt))
    return s3_path
Example #11
def _gen_s3_name(self):
    s3_base = get_s3_dump()
    s3_path = s3_base.get_element_path(self.date_stamp, self.file_name())
    return s3_path
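
For orientation, a sketch of what these helpers produce, following the constructor pattern from Example #9 (the 'pkl' format and the ro=None keyword are assumptions, not confirmed by these snippets):

# Illustrative only; assumes get_s3_dump() returns
# S3Path(bigmech, indra-db/dumps/) and that the sif dumper takes
# ro=None here, as in _build_s3_test_dump above.
sif = dm.dumpers['sif'](date_stamp='2020-01-01', ro=None)
s3_path = sif._gen_s3_name()
# e.g. S3Path(bigmech, indra-db/dumps/2020-01-01/sif.pkl), if fmt == 'pkl'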
Example #12
def test_dump_build():
    """Test the dump pipeline.

    Method
    ------
    CREATE CONTEXT:
    - Create a local principal database with a small amount of content.
      Aim for representation of stmt motifs and sources.
    - Create a local readonly database.
    - Create a fake bucket (moto).

    RUN THE DUMP

    CHECK THE RESULTS
    """
    assert config.is_db_testing()

    # Create the dump locale.
    s3 = boto3.client('s3')
    dump_head = config.get_s3_dump()
    s3.create_bucket(Bucket=dump_head.bucket)
    assert dump_head.bucket == S3_DATA_LOC['bucket']

    # Create the principal database.
    db = get_temp_db(clear=True)

    db.copy('text_ref', [        # trid
        ('1', 1, 'PMC1', 1),     # 1
        ('2', 2, 'PMC2', 2),     # 2
        ('3', 3, None, None),    # 3
        (None, None, 'PMC4', 4)  # 4
    ], ('pmid', 'pmid_num', 'pmcid', 'pmcid_num'))

    db.copy('mesh_ref_annotations', [
        (1, 11, False),
        (1, 13, False),
        (1, 12, True),
        (2, 12, True),
        (3, 13, False),
        (3, 33, True)
    ], ('pmid_num', 'mesh_num', 'is_concept'))

    db.copy('text_content', [              # tcid
        (1, 'pubmed', 'txt', 'abstract'),  # 1
        (1, 'pmc', 'xml', 'fulltext'),     # 2
        (2, 'pubmed', 'txt', 'title'),     # 3
        (3, 'pubmed', 'txt', 'abstract'),  # 4
        (3, 'pmc', 'xml', 'fulltext'),     # 5
        (4, 'pmc', 'xml', 'fulltext')      # 6
    ], ('text_ref_id', 'source', 'format', 'text_type'))

    db.copy('reading', [(tcid, rdr, 1, reader_versions[rdr][-1], 'empty')
                        for tcid, rdr in [
        # 1             2             3
        (1, 'reach'), (1, 'eidos'), (1, 'isi'),

        # 4
        (2, 'reach'),

        # 5             6            7
        (3, 'reach'), (3, 'eidos'), (3, 'trips'),

        # 8
        (4, 'reach'),

        # 9
        (5, 'reach'),

        # 10
        (6, 'reach')
    ]], ('text_content_id', 'reader', 'batch_id', 'reader_version', 'format'))

    db.copy('db_info', [
        ('signor', 'signor', 'Signor'),       # 1
        ('pc', 'biopax', 'Pathway Commons'),  # 2
        ('medscan', 'medscan', 'MedScan')     # 3
    ], ('db_name', 'source_api', 'db_full_name'))

    raw_stmts = {
        'reading': {
            2: [
                Inhibition(
                    Agent('Fever', db_refs={'TEXT': 'fever', 'MESH': 'D005334'}),
                    Agent('Cough', db_refs={'TEXT': 'cough', 'MESH': 'D003371'}),
                    evidence=Evidence(text="We found fever inhibits cough.")
                )
            ],
            4: [
                Phosphorylation(
                    Agent('MEK', db_refs={'FPLX': 'MEK', 'TEXT': 'mek'}),
                    Agent('ERK', db_refs={'FPLX': 'ERK', 'TEXT': 'erk'}),
                    evidence=Evidence(text="mek phosphorylates erk, so say I.")
                ),
                Activation(
                    Agent('MAP2K1', db_refs={'HGNC': '6840', 'TEXT': 'MEK1'}),
                    Agent('MAPK1', db_refs={'HGNC': '6871', 'TEXT': 'ERK1'}),
                    evidence=Evidence(text="MEK1 activates ERK1, or os I'm told.")
                ),
                Activation(
                    Agent('ERK', db_refs={'FPLX': 'ERK', 'TEXT': 'ERK'}),
                    Agent('JNK', db_refs={'FPLX': 'JNK', 'TEXT': 'JNK'}),
                    evidence=Evidence(text="ERK activates JNK, maybe.")
                ),
                Complex([
                    Agent('MEK', db_refs={'FPLX': 'MEK', 'TEXT': 'MAP2K'}),
                    Agent('ERK', db_refs={'FPLX': 'ERK', 'TEXT': 'MAPK'}),
                    Agent('RAF', db_refs={'FPLX': 'RAF', 'TEXT': 'RAF'})
                ], evidence=Evidence(text="MAP2K, MAPK, and RAF form a complex."))
            ],
            7: [
                Activation(
                    Agent('ERK', db_refs={'FPLX': 'ERK', 'TEXT': 'ERK'}),
                    Agent('JNK', db_refs={'FPLX': 'JNK', 'TEXT': 'JNK'}),
                    evidence=Evidence(text='ERK activates JNK, maybe.')
                )
            ],
            8: [
                Complex([
                    Agent('MEK', db_refs={'FPLX': 'MEK', 'TEXT': 'mek'}),
                    Agent('ERK', db_refs={'FPLX': 'ERK', 'TEXT': 'erk'})
                ], evidence=Evidence(text="...in the mek-erk complex."))
            ],
        },
        'databases': {
            2: [
                Conversion(
                    Agent('FRK', db_refs={'HGNC': '3955'}),
                    [Agent('ATP', db_refs={'MESH': 'D000255'})],
                    [Agent('hydron', db_refs={'CHEBI': 'CHEBI:15378'})]
                )
            ],
            3: [
                Phosphorylation(
                    Agent('MEK', db_refs={'FPLX': 'MEK', 'TEXT': 'MEK'}),
                    Agent('ERK', db_refs={'FPLX': 'ERK', 'TEXT': 'ERK'}),
                    evidence=Evidence(text="...MEK phosphorylates ERK medscan.")
                )
            ]
        }
    }
    simple_insert_stmts(db, raw_stmts)

    # Run preassembly.
    prass.create_corpus(db)

    # Do the dump procedure.
    ro = get_temp_ro(clear=True)
    dump(db, ro)

    # Check that the s3 dump exists.
    all_dumps = dm.list_dumps()
    assert len(all_dumps) == 1

    # Check to make sure all the dump files are present.
    dump_path = all_dumps[0]
    file_list = dump_path.list_objects(s3)
    assert dm.Start.from_list(file_list)
    assert dm.Readonly.from_list(file_list)
    assert dm.Belief.from_list(file_list)
    assert dm.Sif.from_list(file_list)
    assert dm.StatementHashMeshId.from_list(file_list)
    assert dm.FullPaStmts.from_list(file_list)
    assert dm.End.from_list(file_list)

    # Check what tables are active in the readonly database.
    active_tables = ro.get_active_tables()
    for tbl in ro.get_tables():
        if ro.tables[tbl]._temp:
            # If it was temp, it should be gone.
            assert tbl not in active_tables
        else:
            # Otherwise, it should be there.
            assert tbl in active_tables

    # Check that the principal db has no more ro schema.
    assert 'readonly' not in db.get_schemas()

    # Check contents of the readonly database.
    assert len(ro.select_all(ro.FastRawPaLink)) \
           == len(db.select_all(db.RawUniqueLinks))

    # Check that a query basically works.
    from indra_db.client.readonly import HasAgent
    res = HasAgent('MEK').get_statements(ro)
    assert len(res.statements()) == 2, len(res.statements())

    # Check that belief is represented in the table.
    bdict = {h: b for h, b in ro.select_all([ro.SourceMeta.mk_hash,
                                             ro.SourceMeta.belief])}
    assert all(1 >= b > 0 for b in bdict.values())

    # Check to make sure lambda was diverted correctly.
    call_records = config.get_test_call_records()
    assert len(call_records) == 2
    assert all(rec.func_name == '_set_lambda_env' for rec in call_records)
    assert all(isinstance(rec.args[1], dict) for rec in call_records)
    assert 'INDRAROOVERRIDE' in call_records[0].args[1]
    assert call_records[0].args[1]['INDRAROOVERRIDE'] == str(db.url)
    assert not call_records[1].args[1]