def add_model_from_s3(self, model_id, config=None, number_of_updates=3,
                      bucket=EMMAA_BUCKET_NAME):
    """Add data for one model from S3 files."""
    # Fall back to the model's config on S3 when none is provided
    if not config:
        config = load_config_from_s3(model_id)
    test_corpora = config['test']['test_corpus']
    # Normalize a single corpus name into a one-element list
    if isinstance(test_corpora, str):
        test_corpora = [test_corpora]
    stmt_files = sort_s3_files_by_date_str(
        bucket, f'assembled/{model_id}/statements_', '.gz')
    # Only process the most recent number_of_updates statement dumps
    for stmt_file in stmt_files[:number_of_updates]:
        date = strip_out_date(stmt_file, 'date')
        dt = strip_out_date(stmt_file, 'datetime')
        # Load and store the assembled statements for this date
        stmt_jsons = load_gzip_json_from_s3(bucket, stmt_file)
        self.add_statements(model_id, date, stmt_jsons)
        # Pull in path counts from each configured test corpus
        for test_corpus in test_corpora:
            key = f'results/{model_id}/results_{test_corpus}_{dt}.json'
            try:
                results = load_json_from_s3(bucket, key)
            except ClientError as e:
                # A missing results file is expected for some corpora;
                # anything else is a real S3 error and is re-raised
                if e.response['Error']['Code'] == 'NoSuchKey':
                    logger.warning(f'No results file for {key}, skipping')
                    continue
                raise e
            path_counts = results[0].get('path_stmt_counts')
            if path_counts:
                self.update_statements_path_counts(
                    model_id, date, path_counts)
def test_statements_url(self):
    """Check the statements_url endpoint returns (un)dated S3 links."""
    rj = self.call_api('get', 'latest/statements_url/marm_model')
    assert rj
    # Bug fix: this comparison was previously a bare expression whose
    # result was discarded, so the key check was never enforced
    assert set(rj.keys()) == {'link'}
    # Undated link: the file name should carry no date string
    key = rj['link'].split('/')[-1]
    assert not strip_out_date(key)
    # Dated link: the file name should carry a date string
    dated_rj = self.call_api(
        'get', 'latest/statements_url/marm_model?dated=true')
    key = dated_rj['link'].split('/')[-1]
    assert strip_out_date(key)
def _make_path_stmts(self, stmts, merge=False):
    """Render path statements into (link, sentence, '') triples.

    When merge is True and the input contains INDRA Statements, group
    them by relation and emit one sentence per relation group;
    otherwise emit one sentence per statement/edge.
    """
    date = strip_out_date(self.date_str, 'date')
    entries = []
    if merge and isinstance(stmts[0], Statement):
        groups = group_and_sort_statements(stmts, grouping_level='relation')
        for _, rel_key, group_stmts, _ in groups:
            sentence = make_string_from_relation_key(rel_key) + '.'
            hashes = [member.get_hash() for _, _, member, _ in group_stmts]
            query = parse.urlencode(
                {'stmt_hash': hashes, 'source': 'model_statement',
                 'model': self.model.name, 'date': date}, doseq=True)
            entries.append((f'/evidence?{query}', sentence, ''))
        return entries
    for stmt in stmts:
        # PyBEL and reference edges get plain sentences with no link
        if isinstance(stmt, PybelEdge):
            entries.append(('', pybel_edge_to_english(stmt), ''))
        elif isinstance(stmt, RefEdge):
            entries.append(('', stmt.to_english(), ''))
        else:
            sentence = EnglishAssembler([stmt]).make_model()
            query = parse.urlencode(
                {'stmt_hash': [stmt.get_hash()], 'source': 'model_statement',
                 'model': self.model.name, 'date': date}, doseq=True)
            entries.append((f'/evidence?{query}', sentence, ''))
    return entries
def model_last_updated(model, extension='.pkl'):
    """Return the creation date of the most recent model file on S3.

    Example file name: models/aml/model_2018-12-13-18-11-54.pkl

    Parameters
    ----------
    model : str
        Model name to look for
    extension : str
        The extension the model file needs to have. Default is '.pkl'

    Returns
    -------
    last_updated : str
        A string of the format "YYYY-MM-DD-HH-mm-ss", or an empty string
        when no matching file exists.
    """
    prefix = f'models/{model}/model_'
    try:
        latest_key = find_latest_s3_file(
            bucket=EMMAA_BUCKET_NAME, prefix=prefix, extension=extension)
        # strip_out_date raises TypeError when latest_key is None
        return strip_out_date(latest_key)
    except TypeError:
        logger.info('Could not find latest update date')
        return ''
def load_from_s3(klass, model_name, bucket=EMMAA_BUCKET_NAME):
    """Load the latest model state from S3.

    Parameters
    ----------
    model_name : str
        Name of model to load. This function expects the latest model
        to be found on S3 in the emmaa bucket with key
        'models/{model_name}/model_{date_string}', and the model config
        file at 'models/{model_name}/config.json'.

    Returns
    -------
    emmaa.model.EmmaaModel
        Latest instance of EmmaaModel with the given name, loaded from S3.
    """
    config = load_config_from_s3(model_name, bucket=bucket)
    stmts, stmts_key = load_stmts_from_s3(model_name, bucket=bucket)
    # Paper IDs are stored under the same date as the statements
    date = strip_out_date(stmts_key)
    paper_key = f'papers/{model_name}/paper_ids_{date}.json'
    try:
        paper_ids = load_json_from_s3(bucket, paper_key)
    except ClientError as e:
        logger.warning(f'Could not find paper IDs mapping due to: {e}')
        paper_ids = None
    model = klass(model_name, config, paper_ids)
    model.stmts = stmts
    # Reconstruct paper IDs from the statements when the mapping is missing
    if not paper_ids:
        model.paper_ids = model.get_paper_ids_from_stmts(stmts)
    return model
def load_model_manager_from_s3(model_name=None, key=None,
                               bucket=EMMAA_BUCKET_NAME):
    """Load a ModelManager from S3 by explicit key or by model name.

    An explicit key takes precedence. If the pickle at that key cannot
    be loaded, rebuilding the manager from assembled statements is
    attempted. Given only a model name, the latest manager key is looked
    up and the function recurses with it. Returns None on failure.
    """
    if key:
        try:
            model_manager = load_pickle_from_s3(bucket, key)
            # Older pickles may lack assembled statements; fetch them
            if not model_manager.model.assembled_stmts:
                date = strip_out_date(model_manager.date_str, 'date')
                stmts, _ = get_assembled_statements(
                    model_manager.model.name, date, bucket=bucket)
                model_manager.model.assembled_stmts = stmts
            return model_manager
        except Exception as e:
            logger.info('Could not load the model manager directly')
            logger.info(e)
            if not model_name:
                # Keys look like 'results/{model_name}/...'
                model_name = key.split('/')[1]
            date = strip_out_date(key, 'date')
            logger.info('Trying to load model manager from statements')
            try:
                return ModelManager.load_from_statements(
                    model_name, date=date, bucket=bucket)
            except Exception as e:
                logger.info('Could not load the model manager from '
                            'statements')
                logger.info(e)
                return None
    if model_name:
        # Prefer a versioned manager file, fall back to the non-versioned one
        latest_key = find_latest_s3_file(
            bucket, f'results/{model_name}/model_manager_', '.pkl')
        if latest_key is None:
            latest_key = f'results/{model_name}/latest_model_manager.pkl'
        return load_model_manager_from_s3(model_name=model_name,
                                          key=latest_key, bucket=bucket)
    # Could not find either from key or from model name.
    logger.info('Could not find the model manager.')
    return None
def _get_previous_json_stats(self):
    """Load the most recent previous model statistics JSON from S3.

    Returns None when no previous statistics exist. As a side effect,
    stores the previous round's date string in self.previous_date_str.
    """
    key = find_latest_s3_file(
        self.bucket, f'model_stats/{self.model_name}/model_stats_', '.json')
    # This is the first time statistics is generated for this model
    if key is None:
        # Fix: these log messages were f-strings with no placeholders
        # (and a stray trailing space) — plain literals now
        logger.info('Could not find a key to the previous statistics')
        return
    # If stats for this date exists, previous stats is the second latest
    if strip_out_date(key) == self.latest_round.date_str:
        logger.info('Statistics for latest round already exists')
        key = find_nth_latest_s3_file(
            1, self.bucket,
            f'model_stats/{self.model_name}/model_stats_', '.json')
    # Store the date string to find previous round with it
    self.previous_date_str = strip_out_date(key)
    logger.info(f'Loading earlier statistics from {key}')
    previous_json_stats = load_json_from_s3(self.bucket, key)
    return previous_json_stats
def load_from_statements(cls, model_name, mode='local', date=None,
                         bucket=EMMAA_BUCKET_NAME):
    """Build a model manager from pre-assembled statements on S3.

    Loads the model config, the (optionally dated) paper ID mapping and
    the assembled statements, then wraps the resulting EmmaaModel in a
    manager instance.
    """
    config = load_config_from_s3(model_name, bucket=bucket)
    # A date narrows the paper IDs lookup to that specific file
    if date:
        prefix = f'papers/{model_name}/paper_ids_{date}'
    else:
        prefix = f'papers/{model_name}/paper_ids_'
    paper_key = find_latest_s3_file(bucket, prefix, 'json')
    paper_ids = load_json_from_s3(bucket, paper_key) if paper_key else None
    model = EmmaaModel(model_name, config, paper_ids)
    # Loading assembled statements to avoid reassembly
    stmts, fname = get_assembled_statements(model_name, date, bucket)
    model.assembled_stmts = stmts
    model.date_str = strip_out_date(fname, 'datetime')
    return cls(model, mode=mode)
def load_from_s3_key(cls, key, bucket=EMMAA_BUCKET_NAME):
    """Instantiate the class from a results JSON file stored on S3.

    The date string is read from the first result entry when available;
    otherwise it is parsed out of the S3 key itself.
    """
    logger.info(f'Loading json from {key}')
    json_results = load_json_from_s3(bucket, key)
    # Guard against an empty results list: indexing [0] on an empty
    # list raised IndexError before; fall back to the key's date
    if json_results:
        date_str = json_results[0].get('date_str', strip_out_date(key))
    else:
        date_str = strip_out_date(key)
    return cls(json_results, date_str)
def last_updated_date(model, file_type='model', date_format='date',
                      tests='large_corpus_tests', extension='.pkl', n=0,
                      bucket=EMMAA_BUCKET_NAME):
    """Find the most recent or the nth file of given type on S3 and return
    its creation date.

    Example file name: models/aml/model_2018-12-13-18-11-54.pkl

    Parameters
    ----------
    model : str
        Model name to look for
    file_type : str
        Type of a file to find the latest file for. Accepted values:
        'model', 'test_results', 'model_stats', 'test_stats'.
    date_format : str
        Format of the returned date. Accepted values are 'datetime'
        (returns a date in the format "YYYY-MM-DD-HH-mm-ss") and 'date'
        (returns a date in the format "YYYY-MM-DD"). Default is 'date'.
    tests : str
        Name of the test corpus used in 'test_results' and 'test_stats'
        file names. Default is 'large_corpus_tests'.
    extension : str
        The extension the model file needs to have. Default is '.pkl'
    n : int
        Index of the file in list of S3 files sorted by date (0-indexed).
    bucket : str
        Name of bucket on S3.

    Returns
    -------
    last_updated : str
        A string of the selected format, or an empty string when no
        matching file is found.

    Raises
    ------
    TypeError
        If `file_type` is not one of the accepted values.
    """
    # Map each file type to (current prefix, legacy prefix); the legacy
    # prefix is tried as a fallback for older S3 layouts. For 'model'
    # both are the same, preserving the original behavior. This also
    # removes the previously unused `folder_name` local.
    prefixes = {
        'model': (f'models/{model}/model_', f'models/{model}/model_'),
        'test_results': (f'results/{model}/results_{tests}',
                         f'results/{model}/results_'),
        'model_stats': (f'model_stats/{model}/model_stats_',
                        f'stats/{model}/stats_'),
        'test_stats': (f'stats/{model}/test_stats_{tests}',
                       f'stats/{model}/stats_'),
    }
    if file_type not in prefixes:
        raise TypeError(f'Files of type {file_type} are not supported')
    # strip_out_date raises TypeError when find_nth_latest_s3_file
    # returns None (no matching file); try the new prefix first, then
    # fall back to the old one.
    for prefix in prefixes[file_type]:
        try:
            return strip_out_date(
                find_nth_latest_s3_file(n=n, bucket=bucket, prefix=prefix,
                                        extension=extension),
                date_format=date_format)
        except TypeError:
            continue
    logger.info('Could not find latest update date')
    return ''