def test_list_dumps():
    """Test the dump listing feature."""
    _build_s3_test_dump({
        '2020-01-01': ['start'],         # a dump that is unfinished.
        '2020-02-01': ['start', 'end'],  # a dump that is finished.
        '2020-03-01': ['sif']            # something strange but possible.
    })
    dump_head = config.get_s3_dump()

    def check_list(dumps, expected_timestamps):
        assert all(isinstance(s3p, S3Path) for s3p in dumps)
        assert all(s3p.key.startswith(dump_head.key) for s3p in dumps)
        time_stamps = [s3p.key.split('/')[-2] for s3p in dumps]
        assert expected_timestamps == time_stamps, \
            f"Expected: {expected_timestamps}, Got: {time_stamps}"

    all_dumps = dm.list_dumps()
    check_list(all_dumps, ['2020-01-01', '2020-02-01', '2020-03-01'])

    started_dumps = dm.list_dumps(started=True)
    check_list(started_dumps, ['2020-01-01', '2020-02-01'])

    done_dumps = dm.list_dumps(started=True, ended=True)
    check_list(done_dumps, ['2020-02-01'])

    unfinished_dumps = dm.list_dumps(started=True, ended=False)
    check_list(unfinished_dumps, ['2020-01-01'])
def list_dumps():
    s3_base = get_s3_dump()
    s3 = boto3.client('s3')
    res = s3.list_objects_v2(Delimiter='/', **s3_base.kw(prefix=True))
    # When no dumps exist, 'CommonPrefixes' is absent from the response.
    return [S3Path.from_key_parts(s3_base.bucket, d['Prefix'])
            for d in res.get('CommonPrefixes', [])]
@classmethod
def is_dump_path(cls, s3_path):
    s3_base = get_s3_dump()
    if s3_base.bucket != s3_path.bucket:
        return False
    if s3_base.key not in s3_path.key:
        return False
    if cls.name not in s3_path.key:
        return False
    return True
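# A minimal usage sketch of is_dump_path. The bucket and prefix mirror the
# example in the list_dumps docstring below; the Readonly dumper class (with
# name 'readonly') and the exact file names are assumptions for illustration.
path = S3Path.from_string(
    's3://bigmech/indra-db/dumps/2020-11-12/readonly-2020-11-12.dump')
assert Readonly.is_dump_path(path)       # bucket, base key, and name match.

other = S3Path.from_string('s3://bigmech/indra-db/dumps/2020-11-12/sif.pkl')
assert not Readonly.is_dump_path(other)  # 'readonly' is not in this key.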
def dump_readonly(self, dump_file=None):
    """Dump the readonly schema to s3."""
    # Form the name of the s3 file, if not given.
    if dump_file is None:
        from indra_db.config import get_s3_dump

        now_str = datetime.utcnow().strftime('%Y-%m-%d-%H-%M-%S')
        dump_loc = get_s3_dump()
        dump_file = dump_loc.get_element_path('readonly-%s.dump' % now_str)

    return self.pg_dump(dump_file, schema='readonly')
def get_filled_ro(num_stmts):
    db = get_prepped_db(num_stmts, with_pa=True, with_agents=True)
    db.generate_readonly()

    s3_base = get_s3_dump()
    assert s3_base, "No s3 config available for db dumps."
    s3_path = dbu.S3Path.from_string(s3_base.to_string() + '-test')
    now_str = datetime.utcnow().strftime('%Y-%m-%d-%H-%M-%S')
    # Point at a specific dump file (the original discarded this result).
    s3_path = s3_path.get_element_path('readonly-%s.dump' % now_str)
    db.dump_readonly(s3_path)

    ro = get_temp_ro()
    ro.load_dump(s3_path)
    return ro
def dump_hierarchy():
    """Dump hierarchy of Dumper classes to S3."""
    hierarchy = {}
    for d in get_all_descendants(Dumper):
        # Skip the FullPaStmts here.
        if d.name == 'full_pa_stmts':
            continue
        command_name = d.name.replace('_', '-')
        hierarchy[command_name] = d.config_to_json()
    s3_base = get_s3_dump()
    s3_path = s3_base.get_element_path('hierarchy.json')
    s3 = boto3.client('s3')
    s3_path.upload(s3, json.dumps(hierarchy).encode('utf-8'))
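# For reference, a hypothetical sketch of what the uploaded hierarchy.json
# might look like; the per-dumper entries depend entirely on what each
# class's config_to_json() returns, so the values below are placeholders.
# Note how underscored dumper names become dashed command names, e.g.
# 'statement_hash_mesh_id' -> 'statement-hash-mesh-id'.
#
# {
#     "readonly": {...},
#     "sif": {...},
#     "belief": {...},
#     "statement-hash-mesh-id": {...}
# }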
def list_dumps(started=None, ended=None):
    """List all dumps, optionally filtered by their status.

    Parameters
    ----------
    started : Optional[bool]
        If True, find dumps that have started. If False, find dumps that
        have NOT been started. If None, do not filter by start status.
    ended : Optional[bool]
        The same as `started`, but checking whether the dump is ended
        or not.

    Returns
    -------
    list of S3Path objects
        Each S3Path object contains the bucket and key prefix information
        for a set of dump files, e.g.

            [S3Path(bigmech, indra-db/dumps/2020-07-16/),
             S3Path(bigmech, indra-db/dumps/2020-08-28/),
             S3Path(bigmech, indra-db/dumps/2020-09-18/),
             S3Path(bigmech, indra-db/dumps/2020-11-12/),
             S3Path(bigmech, indra-db/dumps/2020-11-13/)]
    """
    # Get all the dump "directories".
    s3_base = get_s3_dump()
    s3 = boto3.client('s3')
    res = s3.list_objects_v2(Delimiter='/', **s3_base.kw(prefix=True))
    if res['KeyCount'] == 0:
        return []
    dumps = [S3Path.from_key_parts(s3_base.bucket, d['Prefix'])
             for d in res['CommonPrefixes']]

    # Filter to those that have "started".
    if started is not None:
        dumps = [p for p in dumps
                 if p.get_element_path(Start.file_name()).exists(s3)
                 == started]

    # Filter to those that have "ended".
    if ended is not None:
        dumps = [p for p in dumps
                 if p.get_element_path(End.file_name()).exists(s3) == ended]

    return dumps
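# Usage, mirroring test_list_dumps above: the started/ended flags compose to
# select dumps by status.
all_dumps = list_dumps()                           # every dump prefix.
finished = list_dumps(started=True, ended=True)    # complete dumps only.
stale = list_dumps(started=True, ended=False)      # started but never ended.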
def get_latest_dump_file():
    import boto3
    from indra.util.aws import iter_s3_keys
    from indra_db.config import get_s3_dump

    s3 = boto3.client('s3')
    s3_path = get_s3_dump()

    logger.debug("Looking for the latest dump file on s3 at %s." % s3_path)

    # Get the most recent file from s3.
    max_date_str = None
    max_lm_date = None
    latest_key = None
    for key, lm_date in iter_s3_keys(s3, with_dt=True, **s3_path.kw()):
        # Get the date string from the name, ignoring non-standard files.
        suffix = key.split('/')[-1]
        m = re.match(r'readonly-(\S+)\.dump', suffix)
        if m is None:
            logger.debug("{key} is not a standard key, will not be "
                         "considered.".format(key=key))
            continue
        date_str, = m.groups()

        # Compare to the current maxes. If the date_str and the
        # last-modified date don't agree, raise an error.
        if not max_lm_date or (date_str > max_date_str
                               and lm_date > max_lm_date):
            max_date_str = date_str
            max_lm_date = lm_date
            latest_key = key
        elif max_lm_date and (date_str > max_date_str
                              or lm_date > max_lm_date):
            raise S3DumpTimeAmbiguityError(key, date_str > max_date_str,
                                           lm_date > max_lm_date)

    logger.debug("Latest dump file from %s was found to be %s."
                 % (s3_path, latest_key))

    return S3Path(s3_path.bucket, latest_key)
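# A sketch of the intended use: restore the most recent dump into a readonly
# database. Assumes a handle with load_dump(), as used in get_filled_ro above.
latest = get_latest_dump_file()
ro = get_temp_ro()
ro.load_dump(latest)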
def _build_s3_test_dump(structure):
    """Build an s3 dump for testing.

    The input is a structure of the following form:

    ```
    structure = {
        '2020-01-01': [
            'start',
            'readonly',
            'sif',
        ],
        '2020-02-01': [
            'start',
            'readonly',
            'sif',
            'belief',
            'end'
        ]
    }
    ```

    where the names given are the canonical names of dumpers (see class
    definitions or `dumpers` global for details).
    """
    s3 = boto3.client('s3')
    dump_head = config.get_s3_dump()
    s3.create_bucket(Bucket=dump_head.bucket)
    for date_stamp, contents in structure.items():
        for dump_name in contents:
            dumper_class = dm.dumpers[dump_name]
            kwargs = {}
            if dumper_class.db_required:
                if 'readonly' in dumper_class.db_options:
                    kwargs['ro'] = None
                else:
                    kwargs['db'] = None
            dumper_class(date_stamp=date_stamp, **kwargs).shallow_mock_dump()
def _gen_s3_name(self):
    s3_base = get_s3_dump()
    s3_path = s3_base.get_element_path(self.date_stamp,
                                       '%s.%s' % (self.name, self.fmt))
    return s3_path
def _gen_s3_name(self):
    s3_base = get_s3_dump()
    s3_path = s3_base.get_element_path(self.date_stamp, self.file_name())
    return s3_path
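# A hypothetical example of the resulting key, assuming the configured dump
# location is s3://bigmech/indra-db/dumps/ (as in the list_dumps docstring),
# date_stamp is '2020-11-12', and file_name() returns 'belief.json':
#
#   _gen_s3_name() -> S3Path(bigmech, indra-db/dumps/2020-11-12/belief.json)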
def test_dump_build():
    """Test the dump pipeline.

    Method
    ------
    CREATE CONTEXT:
    - Create a local principal database with a small amount of content.
      Aim for representation of stmt motifs and sources.
    - Create a local readonly database.
    - Create a fake bucket (moto)

    RUN THE DUMP

    CHECK THE RESULTS
    """
    assert config.is_db_testing()

    # Create the dump locale.
    s3 = boto3.client('s3')
    dump_head = config.get_s3_dump()
    s3.create_bucket(Bucket=dump_head.bucket)
    assert dump_head.bucket == S3_DATA_LOC['bucket']

    # Create the principal database.
    db = get_temp_db(clear=True)

    db.copy('text_ref', [        # trid
        ('1', 1, 'PMC1', 1),     # 1
        ('2', 2, 'PMC2', 2),     # 2
        ('3', 3, None, None),    # 3
        (None, None, 'PMC4', 4)  # 4
    ], ('pmid', 'pmid_num', 'pmcid', 'pmcid_num'))

    db.copy('mesh_ref_annotations', [
        (1, 11, False),
        (1, 13, False),
        (1, 12, True),
        (2, 12, True),
        (3, 13, False),
        (3, 33, True)
    ], ('pmid_num', 'mesh_num', 'is_concept'))

    db.copy('text_content', [              # tcid
        (1, 'pubmed', 'txt', 'abstract'),  # 1
        (1, 'pmc', 'xml', 'fulltext'),     # 2
        (2, 'pubmed', 'txt', 'title'),     # 3
        (3, 'pubmed', 'txt', 'abstract'),  # 4
        (3, 'pmc', 'xml', 'fulltext'),     # 5
        (4, 'pmc', 'xml', 'fulltext')      # 6
    ], ('text_ref_id', 'source', 'format', 'text_type'))

    db.copy('reading', [(tcid, rdr, 1, reader_versions[rdr][-1], 'empty')
                        for tcid, rdr in [
                            # 1            2            3
                            (1, 'reach'), (1, 'eidos'), (1, 'isi'),
                            # 4
                            (2, 'reach'),
                            # 5            6            7
                            (3, 'reach'), (3, 'eidos'), (3, 'trips'),
                            # 8
                            (4, 'reach'),
                            # 9
                            (5, 'reach'),
                            # 10
                            (6, 'reach')
                        ]],
            ('text_content_id', 'reader', 'batch_id', 'reader_version',
             'format'))

    db.copy('db_info', [
        ('signor', 'signor', 'Signor'),       # 1
        ('pc', 'biopax', 'Pathway Commons'),  # 2
        ('medscan', 'medscan', 'MedScan')     # 3
    ], ('db_name', 'source_api', 'db_full_name'))

    raw_stmts = {
        'reading': {
            2: [
                Inhibition(
                    Agent('Fever',
                          db_refs={'TEXT': 'fever', 'MESH': 'D005334'}),
                    Agent('Cough',
                          db_refs={'TEXT': 'cough', 'MESH': 'D003371'}),
                    evidence=Evidence(text="We found fever inhibits cough.")
                )
            ],
            4: [
                Phosphorylation(
                    Agent('MEK', db_refs={'FPLX': 'MEK', 'TEXT': 'mek'}),
                    Agent('ERK', db_refs={'FPLX': 'MEK', 'TEXT': 'erk'}),
                    evidence=Evidence(text="mek phosphorylates erk, so "
                                           "say I.")
                ),
                Activation(
                    Agent('MAP2K1',
                          db_refs={'HGNC': '6840', 'TEXT': 'MEK1'}),
                    Agent('MAPK1',
                          db_refs={'HGNC': '6871', 'TEXT': 'ERK1'}),
                    evidence=Evidence(text="MEK1 activates ERK1, or so "
                                           "I'm told.")
                ),
                Activation(
                    Agent('ERK', db_refs={'FPLX': 'ERK', 'TEXT': 'ERK'}),
                    Agent('JNK', db_refs={'FPLX': 'JNK', 'TEXT': 'JNK'}),
                    evidence=Evidence(text="ERK activates JNK, maybe.")
                ),
                Complex([
                    Agent('MEK', db_refs={'FPLX': 'MEK', 'TEXT': 'MAP2K'}),
                    Agent('ERK', db_refs={'FPLX': 'ERK', 'TEXT': 'MAPK'}),
                    Agent('RAF', db_refs={'FPLX': 'RAF', 'TEXT': 'RAF'})
                ], evidence=Evidence(text="MAP2K, MAPK, and RAF form a "
                                          "complex."))
            ],
            7: [
                Activation(
                    Agent('ERK', db_refs={'FPLX': 'ERK', 'TEXT': 'ERK'}),
                    Agent('JNK', db_refs={'FPLX': 'JNK', 'TEXT': 'JNK'}),
                    evidence=Evidence(text='ERK activates JNK, maybe.')
                )
            ],
            8: [
                Complex([
                    Agent('MEK', db_refs={'FPLX': 'MEK', 'TEXT': 'mek'}),
                    Agent('ERK', db_refs={'FPLX': 'ERK', 'TEXT': 'erk'})
                ], evidence=Evidence(text="...in the mek-erk complex."))
            ],
        },
        'databases': {
            2: [
                Conversion(
                    Agent('FRK', db_refs={'HGNC': '3955'}),
                    [Agent('ATP', db_refs={'MESH': 'D000255'})],
                    [Agent('hydron', db_refs={'CHEBI': 'CHEBI:15378'})]
                )
            ],
            3: [
                Phosphorylation(
                    Agent('MEK', db_refs={'FPLX': 'MEK', 'TEXT': 'MEK'}),
                    Agent('ERK', db_refs={'FPLX': 'ERK', 'TEXT': 'ERK'}),
                    evidence=Evidence(text="...MEK phosphorylates ERK "
                                           "medscan.")
                )
            ]
        }
    }
    simple_insert_stmts(db, raw_stmts)
    # Run preassembly.
    prass.create_corpus(db)

    # Do the dump procedure.
    ro = get_temp_ro(clear=True)
    dump(db, ro)

    # Check that the s3 dump exists.
    all_dumps = dm.list_dumps()
    assert len(all_dumps) == 1

    # Check to make sure all the dump files are present.
    dump_path = all_dumps[0]
    file_list = dump_path.list_objects(s3)
    assert dm.Start.from_list(file_list)
    assert dm.Readonly.from_list(file_list)
    assert dm.Belief.from_list(file_list)
    assert dm.Sif.from_list(file_list)
    assert dm.StatementHashMeshId.from_list(file_list)
    assert dm.FullPaStmts.from_list(file_list)
    assert dm.End.from_list(file_list)

    # Check which tables are active in the readonly database.
    active_tables = ro.get_active_tables()
    for tbl in ro.get_tables():
        if ro.tables[tbl]._temp:
            # If it was temp, it should be gone.
            assert tbl not in active_tables
        else:
            # Otherwise, it should be there.
            assert tbl in active_tables

    # Check that the principal db has no more ro schema.
    assert 'readonly' not in db.get_schemas()

    # Check contents of the readonly database.
    assert len(ro.select_all(ro.FastRawPaLink)) \
        == len(db.select_all(db.RawUniqueLinks))

    # Check that a query basically works.
    from indra_db.client.readonly import HasAgent
    res = HasAgent('MEK').get_statements(ro)
    assert len(res.statements()) == 2, len(res.statements())

    # Check that belief is represented in the table.
    bdict = {h: b for h, b in ro.select_all([ro.SourceMeta.mk_hash,
                                             ro.SourceMeta.belief])}
    assert all(1 >= b > 0 for b in bdict.values())

    # Check to make sure lambda was diverted correctly.
    call_records = config.get_test_call_records()
    assert len(call_records) == 2
    assert all(rec.func_name == '_set_lambda_env' for rec in call_records)
    assert all(isinstance(rec.args[1], dict) for rec in call_records)
    assert 'INDRAROOVERRIDE' in call_records[0].args[1]
    assert call_records[0].args[1]['INDRAROOVERRIDE'] == str(db.url)
    assert not call_records[1].args[1]