def test_util_find_on_s3_functions():
    """Exercise the S3 file-discovery helpers against a mocked bucket."""
    # Local imports are recommended when using moto
    from emmaa.util import (sort_s3_files_by_date_str, find_latest_s3_file,
                            find_nth_latest_s3_file,
                            find_number_of_files_on_s3)
    # Bucket has mm (pkl) and results (json) files, both in results folder
    client = setup_bucket(add_mm=True, add_results=True)
    # No extension filter: both files under the prefix are returned
    found = sort_s3_files_by_date_str(TEST_BUCKET_NAME, 'results/test/')
    assert len(found) == 2
    # Filtering by extension narrows to the json file only
    found = sort_s3_files_by_date_str(TEST_BUCKET_NAME, 'results/test/',
                                      '.json')
    assert len(found) == 1
    # A longer prefix matches only the results file
    found = sort_s3_files_by_date_str(TEST_BUCKET_NAME,
                                      'results/test/results_')
    assert len(found) == 1
    assert find_latest_s3_file(TEST_BUCKET_NAME, 'results/test/results_')
    # There is no second-latest file under the narrow prefix
    assert not find_nth_latest_s3_file(1, TEST_BUCKET_NAME,
                                       'results/test/results_')
    # But there is one under the broad prefix (two files total)
    assert find_nth_latest_s3_file(1, TEST_BUCKET_NAME, 'results/test/')
    assert find_number_of_files_on_s3(TEST_BUCKET_NAME, 'results/test/') == 2
    assert find_number_of_files_on_s3(TEST_BUCKET_NAME,
                                      'results/test/results_') == 1
    assert find_number_of_files_on_s3(TEST_BUCKET_NAME, 'results/test/',
                                      '.json') == 1
def get_model_stats(model, extension='.json'):
    """Gets the latest statistics for the given model

    Parameters
    ----------
    model : str
        Model name to look for
    extension : str
        File extension of the statistics file. Default: '.json'.

    Returns
    -------
    model_data : json
        The json formatted data containing the statistics for the model
    """
    # Need jsons for model meta data and test statistics. File name examples:
    # stats/skcm/stats_2019-08-20-17-34-40.json
    stats_prefix = f'stats/{model}/stats_'
    latest_key = find_latest_s3_file(bucket=EMMAA_BUCKET_NAME,
                                     prefix=stats_prefix,
                                     extension=extension)
    s3 = get_s3_client()
    resp = s3.get_object(Bucket=EMMAA_BUCKET_NAME, Key=latest_key)
    return json.loads(resp['Body'].read().decode('utf8'))
def model_last_updated(model, extension='.pkl'):
    """Find the most recent pickle file of model and return its creation date

    Example file name: models/aml/model_2018-12-13-18-11-54.pkl

    Parameters
    ----------
    model : str
        Model name to look for
    extension : str
        The extension the model file needs to have. Default is '.pkl'

    Returns
    -------
    last_updated : str
        A string of the format "YYYY-MM-DD-HH-mm-ss"
    """
    model_prefix = f'models/{model}/model_'
    try:
        latest_key = find_latest_s3_file(bucket=EMMAA_BUCKET_NAME,
                                         prefix=model_prefix,
                                         extension=extension)
        return strip_out_date(latest_key)
    except TypeError:
        # No matching file: strip_out_date gets None and raises TypeError
        logger.info('Could not find latest update date')
        return ''
def get_emmaa_model_statements(model):
    """Return INDRA Statements from the latest assembled model JSON on S3."""
    client = get_s3_client(unsigned=True)
    key = find_latest_s3_file('emmaa', 'assembled/%s' % model)
    resp = client.get_object(Bucket='emmaa', Key=key)
    stmt_jsons = json.loads(resp['Body'].read().decode('utf-8'))
    return stmts_from_json(stmt_jsons)
def _get_latest_round(self):
    # Results files are versioned by date under results/<model>/.
    key = find_latest_s3_file('emmaa',
                              f'results/{self.model_name}/results_',
                              extension='.json')
    if key is None:
        logger.info(f'Could not find a key to the latest test results '
                    f'for {self.model_name} model.')
        return
    return TestRound.load_from_s3_key(key)
def load_test_results_from_s3(model_name):
    """Load the latest JSON test results for the given model from S3."""
    latest_result_key = find_latest_s3_file(
        'emmaa', f'results/{model_name}/results_', extension='.json')
    logger.info(f'Loading test results from {latest_result_key}')
    client = get_s3_client()
    body = client.get_object(Bucket='emmaa', Key=latest_result_key)['Body']
    return json.loads(body.read().decode('utf8'))
def get_assembled_statements(model, date=None, bucket=EMMAA_BUCKET_NAME):
    """Load and return a list of assembled statements.

    Parameters
    ----------
    model : str
        A name of a model.
    date : str or None
        Date in "YYYY-MM-DD" format for which to load the statements. If
        None, loads the latest available statements.
    bucket : str
        Name of S3 bucket to look for a file. Defaults to 'emmaa'.

    Returns
    -------
    stmts : list[indra.statements.Statement]
        A list of assembled statements.
    latest_file_key : str
        Key of a file with statements on s3.
    """
    prefix = f'assembled/{model}/statements_'
    if date:
        prefix += date
    # Compressed formats are preferred: first .gz, then a legacy .zip
    latest_file_key = find_latest_s3_file(bucket, prefix, '.gz')
    if not latest_file_key:
        latest_file_key = find_latest_s3_file(bucket, prefix, '.zip')
    if latest_file_key:
        stmt_jsons = load_gzip_json_from_s3(bucket, latest_file_key)
    else:
        # Fall back to a plain json file
        latest_file_key = find_latest_s3_file(bucket, prefix, '.json')
        if not latest_file_key:
            # Didn't get gzip, zip or json
            logger.info(f'No assembled statements found for {model}.')
            return None, None
        stmt_jsons = load_json_from_s3(bucket, latest_file_key)
    return stmts_from_json(stmt_jsons), latest_file_key
def _load_stmts_from_cache(model):
    """Return assembled statements for a model, refreshing the cache.

    The cache maps model name to (stmts, file_key); a cache entry is reused
    only when its file key matches the latest statements file on S3.

    Parameters
    ----------
    model : str
        Name of the model whose statements should be returned.

    Returns
    -------
    stmts : list
        Assembled statements for the model (freshly loaded or cached).
    """
    stmts, file_key = stmts_cache.get(model, (None, None))
    latest_on_s3 = find_latest_s3_file(EMMAA_BUCKET_NAME,
                                       f'assembled/{model}/statements_',
                                       '.json')
    if file_key != latest_on_s3:
        # Pass the bucket by keyword: get_assembled_statements' second
        # positional parameter is the date, so passing the bucket name
        # positionally would be misread as a date filter.
        stmts, file_key = get_assembled_statements(
            model, bucket=EMMAA_BUCKET_NAME)
        stmts_cache[model] = (stmts, file_key)
    else:
        logger.info(f'Loaded assembled stmts for {model} from cache.')
    return stmts
def get_assembled_statements(model, bucket=EMMAA_BUCKET_NAME):
    """Return (statements, s3_key) for the latest assembled model file."""
    key = find_latest_s3_file(bucket, f'assembled/{model}/statements_',
                              '.json')
    if not key:
        logger.info(f'No assembled statements found for {model}.')
        return None, None
    logger.info(f'Loading assembled statements for {model} from S3.')
    stmt_jsons = load_json_from_s3(bucket, key)
    return stmts_from_json(stmt_jsons), key
def _get_latest_round(self):
    # Model manager pickles are versioned by date in the file name.
    key = find_latest_s3_file(
        self.bucket, f'results/{self.model_name}/model_manager_',
        extension='.pkl')
    if key is None:
        logger.info(f'Could not find a key to the latest model manager '
                    f'for {self.model_name} model.')
        return
    logger.info(f'Loading latest round from {key}')
    return ModelRound.load_from_s3_key(key, bucket=self.bucket)
def get_latest_statements_url(model):
    """Return {'link': url} pointing at the model's latest statements file.

    The link is None when neither a 'latest_statements' file nor any
    date-versioned statements file exists for the model.
    """
    link = None
    if does_exist(EMMAA_BUCKET_NAME,
                  f'assembled/{model}/latest_statements_{model}'):
        fkey = f'assembled/{model}/latest_statements_{model}.json'
        link = f'https://{EMMAA_BUCKET_NAME}.s3.amazonaws.com/{fkey}'
    elif does_exist(EMMAA_BUCKET_NAME, f'assembled/{model}/statements_'):
        fkey = find_latest_s3_file(EMMAA_BUCKET_NAME,
                                   f'assembled/{model}/statements_', '.json')
        link = f'https://{EMMAA_BUCKET_NAME}.s3.amazonaws.com/{fkey}'
    return {'link': link}
def _get_latest_round(self):
    # Results files are versioned per test corpus and date.
    key = find_latest_s3_file(
        self.bucket,
        f'results/{self.model_name}/results_{self.test_corpus}',
        extension='.json')
    if key is None:
        logger.info(f'Could not find a key to the latest test results '
                    f'for {self.model_name} model.')
        return
    logger.info(f'Loading latest round from {key}')
    return TestRound.load_from_s3_key(key, bucket=self.bucket)
def _load_tests_from_cache(test_corpus):
    """Return the tests of a corpus, reloading only when S3 has a newer file.

    Cache entries are (tests, s3_key) keyed by corpus name.
    """
    cached_tests, cached_key = tests_cache.get(test_corpus, (None, None))
    newest_key = find_latest_s3_file(EMMAA_BUCKET_NAME,
                                     f'tests/{test_corpus}', '.pkl')
    if cached_key == newest_key:
        logger.info(f'Loaded {test_corpus} from cache.')
        return cached_tests
    tests, file_key = load_tests_from_s3(test_corpus, EMMAA_BUCKET_NAME)
    # Some corpora are stored as a dict with metadata; unwrap the tests
    if isinstance(tests, dict):
        tests = tests['tests']
    tests_cache[test_corpus] = (tests, file_key)
    return tests
def _get_previous_json_stats(self):
    # Statistics files are versioned by date under stats/<model>/.
    key = find_latest_s3_file('emmaa', f'stats/{self.model_name}/stats_',
                              extension='.json')
    if key is None:
        logger.info(f'Could not find a key to the previous statistics '
                    f'for {self.model_name} model.')
        return
    logger.info(f'Loading earlier statistics from {key}')
    client = get_s3_client()
    resp = client.get_object(Bucket='emmaa', Key=key)
    return json.loads(resp['Body'].read().decode('utf8'))
def load_model_manager_from_cache(model_name, bucket=EMMAA_BUCKET_NAME):
    """Return the ModelManager for a model, reusing the cache when fresh.

    Parameters
    ----------
    model_name : str
        Name of the model whose manager should be returned.
    bucket : str
        Name of the S3 bucket to check for newer model manager files.

    Returns
    -------
    model_manager
        The cached ModelManager if its date matches the latest file on S3,
        otherwise a freshly loaded one (which replaces the cache entry).
    """
    model_manager = model_manager_cache.get(model_name)
    if model_manager:
        latest_on_s3 = find_latest_s3_file(
            bucket, f'results/{model_name}/model_manager_', '.pkl')
        cached_date = model_manager.date_str
        logger.info(f'Found model manager cached on {cached_date} and '
                    f'latest file on S3 is {latest_on_s3}')
        # find_latest_s3_file returns None when no file matches; guard it
        # so that `cached_date in None` cannot raise a TypeError.
        if latest_on_s3 and cached_date in latest_on_s3:
            logger.info(f'Loaded model manager for {model_name} from cache.')
            return model_manager
    logger.info(f'Loading model manager for {model_name} from S3.')
    model_manager = load_model_manager_from_s3(model_name=model_name,
                                               bucket=bucket)
    model_manager_cache[model_name] = model_manager
    return model_manager
def load_latest_statements(model):
    """Return the latest assembled statement JSONs and their public URL.

    Falls back from the 'latest_statements' convenience file to the newest
    date-versioned statements file; returns empty values if neither exists.
    """
    fkey = None
    if does_exist(EMMAA_BUCKET_NAME,
                  f'assembled/{model}/latest_statements_{model}'):
        fkey = f'assembled/{model}/latest_statements_{model}.json'
    elif does_exist(EMMAA_BUCKET_NAME, f'assembled/{model}/statements_'):
        fkey = find_latest_s3_file(EMMAA_BUCKET_NAME,
                                   f'assembled/{model}/statements_', '.json')
    if not fkey:
        return {'statements': [], 'link': ''}
    stmt_jsons = load_json_from_s3(EMMAA_BUCKET_NAME, fkey)
    link = f'https://{EMMAA_BUCKET_NAME}.s3.amazonaws.com/{fkey}'
    return {'statements': stmt_jsons, 'link': link}
def load_model_manager_from_s3(model_name=None, key=None,
                               bucket=EMMAA_BUCKET_NAME):
    """Load a ModelManager from S3 by explicit key or by model name.

    Parameters
    ----------
    model_name : str or None
        Name of the model. Used to find the latest model manager key when
        `key` is not given, and as a fallback label when rebuilding from
        statements.
    key : str or None
        Exact S3 key of a pickled model manager. Takes precedence over
        `model_name`.
    bucket : str
        Name of the S3 bucket to look in.

    Returns
    -------
    ModelManager or None
        The loaded (or rebuilt) model manager, or None if every strategy
        failed.
    """
    # First try find the file from specified key
    if key:
        try:
            model_manager = load_pickle_from_s3(bucket, key)
            # Older pickles may lack assembled statements; backfill them
            # from the assembled statements file for the manager's date.
            if not model_manager.model.assembled_stmts:
                stmts, _ = get_assembled_statements(
                    model_manager.model.name,
                    strip_out_date(model_manager.date_str, 'date'),
                    bucket=bucket)
                model_manager.model.assembled_stmts = stmts
            return model_manager
        except Exception as e:
            logger.info('Could not load the model manager directly')
            logger.info(e)
            # Fallback: rebuild the manager from assembled statements.
            # Derive the model name and date from the key if needed
            # (keys look like results/<model>/model_manager_<date>.pkl).
            if not model_name:
                model_name = key.split('/')[1]
            date = strip_out_date(key, 'date')
            logger.info('Trying to load model manager from statements')
            try:
                model_manager = ModelManager.load_from_statements(
                    model_name, date=date, bucket=bucket)
                return model_manager
            except Exception as e:
                logger.info('Could not load the model manager from '
                            'statements')
                logger.info(e)
                return None
    # Now try find the latest key for given model
    if model_name:
        # Versioned
        key = find_latest_s3_file(bucket,
                                  f'results/{model_name}/model_manager_',
                                  '.pkl')
        if key is None:
            # Non-versioned
            key = f'results/{model_name}/latest_model_manager.pkl'
        # Recurse with the resolved key; model_name is kept so the
        # statements fallback above can still use it.
        return load_model_manager_from_s3(model_name=model_name, key=key,
                                          bucket=bucket)
    # Could not find either from key or from model name.
    logger.info('Could not find the model manager.')
    return None
def _get_previous_json_stats(self):
    # Model stats files are versioned by date under model_stats/<model>/.
    stats_prefix = f'model_stats/{self.model_name}/model_stats_'
    key = find_latest_s3_file(self.bucket, stats_prefix, '.json')
    if key is None:
        # This is the first time statistics is generated for this model
        logger.info(f'Could not find a key to the previous statistics ')
        return
    if strip_out_date(key) == self.latest_round.date_str:
        # If stats for this date exists, previous stats is the second latest
        logger.info(f'Statistics for latest round already exists')
        key = find_nth_latest_s3_file(1, self.bucket, stats_prefix, '.json')
    # Store the date string to find previous round with it
    self.previous_date_str = strip_out_date(key)
    logger.info(f'Loading earlier statistics from {key}')
    return load_json_from_s3(self.bucket, key)
def load_stmts_from_s3(model_name, bucket=EMMAA_BUCKET_NAME):
    """Return the list of EMMAA Statements constituting the latest model.

    Parameters
    ----------
    model_name : str
        The name of the model whose config should be loaded.
    bucket : str
        Name of the S3 bucket to look in. Defaults to the EMMAA bucket.

    Returns
    -------
    stmts : list of emmaa.statements.EmmaaStatement
        The list of EMMAA Statements in the latest model version.
    latest_model_key : str
        The S3 key of the pickle the statements were loaded from.
    """
    latest_model_key = find_latest_s3_file(
        bucket, f'models/{model_name}/model_', extension='.pkl')
    logger.info(f'Loading model state from {latest_model_key}')
    return load_pickle_from_s3(bucket, latest_model_key), latest_model_key
def load_from_statements(cls, model_name, mode='local', date=None,
                         bucket=EMMAA_BUCKET_NAME):
    """Create a model manager from assembled statements stored on S3.

    Parameters
    ----------
    model_name : str
        Name of the model to load.
    mode : str
        Mode in which the model manager is run. Default: 'local'.
    date : str or None
        Date in "YYYY-MM-DD" format for which to load statements and paper
        IDs. If None, the latest available files are used.
    bucket : str
        Name of the S3 bucket to look in.

    Returns
    -------
    mm
        A manager instance wrapping an EmmaaModel with assembled
        statements preloaded (avoiding reassembly).
    """
    config = load_config_from_s3(model_name, bucket=bucket)
    if date:
        prefix = f'papers/{model_name}/paper_ids_{date}'
    else:
        prefix = f'papers/{model_name}/paper_ids_'
    # Use '.json' (with the dot), consistent with every other
    # find_latest_s3_file call site in the codebase; the bare 'json'
    # previously passed here was an outlier.
    paper_key = find_latest_s3_file(bucket, prefix, '.json')
    if paper_key:
        paper_ids = load_json_from_s3(bucket, paper_key)
    else:
        paper_ids = None
    model = EmmaaModel(model_name, config, paper_ids)
    # Loading assembled statements to avoid reassembly
    stmts, fname = get_assembled_statements(model_name, date, bucket)
    model.assembled_stmts = stmts
    model.date_str = strip_out_date(fname, 'datetime')
    mm = cls(model, mode=mode)
    return mm
def load_stmts_from_s3(model_name):
    """Return the list of EMMAA Statements constituting the latest model.

    Parameters
    ----------
    model_name : str
        The name of the model whose config should be loaded.

    Returns
    -------
    stmts : list of emmaa.statements.EmmaaStatement
        The list of EMMAA Statements in the latest model version.
    """
    latest_model_key = find_latest_s3_file(
        'emmaa', f'models/{model_name}/model_', extension='.pkl')
    logger.info(f'Loading model state from {latest_model_key}')
    client = get_s3_client()
    resp = client.get_object(Bucket='emmaa', Key=latest_model_key)
    return pickle.loads(resp['Body'].read())
def load_tests_from_s3(test_name, bucket=EMMAA_BUCKET_NAME):
    """Load Emmaa Tests with the given name from S3.

    Parameters
    ----------
    test_name : str
        Looks for a test file in the emmaa bucket on S3 with key
        'tests/{test_name}'.
    bucket : str
        Name of the S3 bucket to look in.

    Return
    ------
    list of EmmaaTest
        List of EmmaaTest objects loaded from S3.
    """
    try:
        # Prefer a date-versioned pickle if one exists
        test_key = find_latest_s3_file(bucket, f'tests/{test_name}', '.pkl')
    except ValueError:
        # Fall back to the non-versioned file name
        test_key = f'tests/{test_name}.pkl'
    logger.info(f'Loading tests from {test_key}')
    return load_pickle_from_s3(bucket, test_key), test_key
def load_model_manager_from_s3(model_name=None, key=None,
                               bucket=EMMAA_BUCKET_NAME):
    """Load a pickled model manager from S3 by explicit key or model name."""
    # An explicit key takes precedence over the model name
    if key:
        try:
            return load_pickle_from_s3(bucket, key)
        except Exception as e:
            logger.info('Could not load the model manager')
            logger.info(e)
    if model_name:
        # Try the date-versioned file first, then the non-versioned one.
        key = find_latest_s3_file(bucket,
                                  f'results/{model_name}/model_manager_',
                                  '.pkl')
        if key is None:
            key = f'results/{model_name}/latest_model_manager.pkl'
        # NOTE: model_name is intentionally not forwarded here — if the
        # load by key fails, the recursive call returns None instead of
        # recursing forever.
        return load_model_manager_from_s3(key=key, bucket=bucket)
    # Could not find either from key or from model name.
    logger.info('Could not find the model manager.')
    return None
def get_model_stats(model, mode, tests=None, date=None, extension='.json',
                    n=0, bucket=EMMAA_BUCKET_NAME):
    """Gets the latest statistics for the given model

    Parameters
    ----------
    model : str
        Model name to look for
    mode : str
        Type of stats to generate (model or test)
    tests : str
        A name of a test corpus. Default is large_corpus_tests.
    date : str or None
        Date for which the stats will be returned in "YYYY-MM-DD" format.
    extension : str
        Extension of the file.
    n : int
        Index of the file in list of S3 files sorted by date (0-indexed).
    bucket : str
        Name of bucket on S3.

    Returns
    -------
    model_data : json
        The json formatted data containing the statistics for the model
    """
    if not tests:
        tests = _default_test(model, bucket=bucket)
    # If date is not specified, get the latest or the nth
    if not date:
        if mode == 'model':
            date = last_updated_date(model, 'model_stats', 'date',
                                     extension=extension, n=n, bucket=bucket)
        elif mode == 'test':
            date = last_updated_date(model, 'test_stats', 'date', tests=tests,
                                     extension=extension, n=n, bucket=bucket)
        else:
            raise TypeError('Mode must be either model or tests')
    # Try find new formatted stats (separate for model and tests)
    if mode == 'model':
        # File name example:
        # model_stats/skcm/model_stats_2019-08-20-17-34-40.json
        prefix = f'model_stats/{model}/model_stats_{date}'
        latest_file_key = find_latest_s3_file(bucket=bucket, prefix=prefix,
                                              extension=extension)
    elif mode == 'test':
        # File name example:
        # stats/skcm/test_stats_large_corpus_tests_2019-08-20-17-34-40.json
        prefix = f'stats/{model}/test_stats_{tests}_{date}'
        latest_file_key = find_latest_s3_file(bucket=bucket, prefix=prefix,
                                              extension=extension)
    else:
        raise TypeError('Mode must be either model or tests')
    # This might be an older file with model and test stats combined.
    # File name example: stats/skcm/stats_2019-08-20-17-34-40.json
    # NOTE: pass bucket=bucket here too, consistent with the call at the
    # top of the function; otherwise a non-default bucket would be
    # compared against the default bucket's test corpus.
    if not latest_file_key and (
            mode == 'model' or
            (mode == 'test' and tests == _default_test(model,
                                                       bucket=bucket))):
        prefix = f'stats/{model}/stats_{date}'
        latest_file_key = find_latest_s3_file(bucket=bucket, prefix=prefix,
                                              extension=extension)
    # If we still didn't find the file it probably does not exist
    if not latest_file_key:
        return None, None
    return (load_json_from_s3(bucket, latest_file_key), latest_file_key)
def get_all_statements_page(model):
    """Render the paginated 'all statements' page for a model.

    Query parameters read from the request:
    - sort_by: 'evidence' (default) or 'paths'
    - page: 1-based page number; pages hold 1000 statements
    - filter_curated: 'true' to hide statements that have curations
    """
    sort_by = request.args.get('sort_by', 'evidence')
    page = int(request.args.get('page', 1))
    filter_curated = request.args.get('filter_curated', False)
    # Query string values are text; only the literal 'true' enables it
    filter_curated = (filter_curated == 'true')
    # 1000 statements per page
    offset = (page - 1) * 1000
    stmts = _load_stmts_from_cache(model)
    # Index statements by their (stringified) hash for curation and
    # path-count lookups below
    stmts_by_hash = {}
    for stmt in stmts:
        stmts_by_hash[str(stmt.get_hash())] = stmt
    msg = None
    curations = get_curations()
    cur_counts = _count_curations(curations, stmts_by_hash)
    if filter_curated:
        # Keep only statements that have no curations
        stmts = [
            stmt for stmt in stmts
            if str(stmt.get_hash()) not in cur_counts
        ]
    # Path statement counts come from the latest test statistics
    test_stats, _ = get_model_stats(model, 'test')
    stmt_counts = test_stats['test_round_summary'].get('path_stmt_counts', [])
    stmt_counts_dict = dict(stmt_counts)
    # Ceiling division for the page count
    if len(stmts) % 1000 == 0:
        total_pages = len(stmts) // 1000
    else:
        total_pages = len(stmts) // 1000 + 1
    if page + 1 <= total_pages:
        next_page = page + 1
    else:
        next_page = None
    if page != 1:
        prev_page = page - 1
    else:
        prev_page = None
    if sort_by == 'evidence':
        # Most-evidenced statements first; slice out the current page
        stmts = sorted(stmts, key=lambda x: len(x.evidence),
                       reverse=True)[offset:offset + 1000]
    elif sort_by == 'paths':
        if not stmt_counts:
            # No path counts available; fall back to evidence sorting
            msg = 'Sorting by paths is not available, sorting by evidence'
            stmts = sorted(stmts, key=lambda x: len(x.evidence),
                           reverse=True)[offset:offset + 1000]
        else:
            # stmt_counts is already ordered by path count; map hashes
            # back to statements, skipping hashes no longer in the model
            stmts = []
            for (stmt_hash, count) in stmt_counts[offset:offset + 1000]:
                try:
                    stmts.append(stmts_by_hash[stmt_hash])
                except KeyError:
                    continue
    stmt_rows = []
    for stmt in stmts:
        stmt_row = _get_stmt_row(stmt, 'model_statement', model, cur_counts,
                                 None, stmt_counts_dict)
        stmt_rows.append(stmt_row)
    table_title = f'All statements in {model.upper()} model.'
    # Build a public download link for the statements file, preferring the
    # 'latest_statements' convenience file over the date-versioned one
    if does_exist(EMMAA_BUCKET_NAME,
                  f'assembled/{model}/latest_statements_{model}'):
        fkey = f'assembled/{model}/latest_statements_{model}.json'
        link = f'https://{EMMAA_BUCKET_NAME}.s3.amazonaws.com/{fkey}'
    elif does_exist(EMMAA_BUCKET_NAME, f'assembled/{model}/statements_'):
        fkey = find_latest_s3_file(EMMAA_BUCKET_NAME,
                                   f'assembled/{model}/statements_', '.json')
        link = f'https://{EMMAA_BUCKET_NAME}.s3.amazonaws.com/{fkey}'
    else:
        link = None
    return render_template('evidence_template.html', stmt_rows=stmt_rows,
                           model=model, source='model_statement',
                           table_title=table_title, msg=msg,
                           is_all_stmts=True, prev=prev_page, next=next_page,
                           filter_curated=filter_curated, sort_by=sort_by,
                           link=link)