def test_util_find_on_s3_functions():
    """Check the S3 lookup helpers from emmaa.util against the test bucket."""
    # Local imports are recommended when using moto
    from emmaa.util import (
        sort_s3_files_by_date_str,
        find_latest_s3_file,
        find_nth_latest_s3_file,
        find_number_of_files_on_s3,
    )

    folder_prefix = 'results/test/'
    results_prefix = 'results/test/results_'

    # Bucket has mm (pkl) and results (json) files, both in results folder
    client = setup_bucket(add_mm=True, add_results=True)

    # Without any filter both files are listed
    assert len(sort_s3_files_by_date_str(TEST_BUCKET_NAME, folder_prefix)) == 2
    # Restricting by extension leaves only the json file
    assert len(sort_s3_files_by_date_str(
        TEST_BUCKET_NAME, folder_prefix, '.json')) == 1
    # A longer prefix leaves only the results file
    assert len(sort_s3_files_by_date_str(
        TEST_BUCKET_NAME, results_prefix)) == 1

    # Latest / nth-latest lookups; index 1 needs at least two matching files
    assert find_latest_s3_file(TEST_BUCKET_NAME, results_prefix)
    assert not find_nth_latest_s3_file(1, TEST_BUCKET_NAME, results_prefix)
    assert find_nth_latest_s3_file(1, TEST_BUCKET_NAME, folder_prefix)

    # Counts must agree with the listings above
    assert find_number_of_files_on_s3(TEST_BUCKET_NAME, folder_prefix) == 2
    assert find_number_of_files_on_s3(TEST_BUCKET_NAME, results_prefix) == 1
    assert find_number_of_files_on_s3(
        TEST_BUCKET_NAME, folder_prefix, '.json') == 1
def _get_previous_json_stats(self):
    """Load the previous model statistics JSON for this model from S3.

    The latest ``.json`` file under
    ``model_stats/<model_name>/model_stats_`` is looked up in
    ``self.bucket``. If its date string equals
    ``self.latest_round.date_str``, statistics for the latest round were
    already generated, so the second latest file is used instead.

    On success ``self.previous_date_str`` is set to the date string
    stripped from the selected key.

    Returns
    -------
    previous_json_stats or None
        Whatever ``load_json_from_s3`` returns for the selected key, or
        None if no earlier statistics file could be found.
    """
    stats_prefix = f'model_stats/{self.model_name}/model_stats_'
    key = find_latest_s3_file(self.bucket, stats_prefix, '.json')
    # This is the first time statistics is generated for this model
    if key is None:
        logger.info('Could not find a key to the previous statistics ')
        return None
    # If stats for this date exists, previous stats is the second latest
    if strip_out_date(key) == self.latest_round.date_str:
        logger.info('Statistics for latest round already exists')
        key = find_nth_latest_s3_file(1, self.bucket, stats_prefix, '.json')
        # Only the latest round's statistics exist: there is no second
        # latest file, so there is nothing earlier to load. Without this
        # guard the missing key would be passed on to strip_out_date and
        # load_json_from_s3.
        if not key:
            logger.info('Could not find a key to the previous statistics ')
            return None
    # Store the date string to find previous round with it
    self.previous_date_str = strip_out_date(key)
    logger.info(f'Loading earlier statistics from {key}')
    previous_json_stats = load_json_from_s3(self.bucket, key)
    return previous_json_stats
def last_updated_date(model, file_type='model', date_format='date',
                      tests='large_corpus_tests', extension='.pkl', n=0,
                      bucket=EMMAA_BUCKET_NAME):
    """Find the most recent or the nth file of given type on S3 and return
    its creation date.

    Example file name:
    models/aml/model_2018-12-13-18-11-54.pkl

    Parameters
    ----------
    model : str
        Model name to look for
    file_type : str
        Type of a file to find the latest file for. Accepted values: 'model',
        'test_results', 'model_stats', 'test_stats'.
    date_format : str
        Format of the returned date. Accepted values are 'datetime' (returns a
        date in the format "YYYY-MM-DD-HH-mm-ss") and 'date' (returns a date
        in the format "YYYY-MM-DD"). Default is 'date'.
    tests : str
        Name of the test corpus. It is appended to the key prefix when
        looking up 'test_results' and 'test_stats' files and is ignored for
        the other file types. Default is 'large_corpus_tests'.
    extension : str
        The extension the model file needs to have. Default is '.pkl'
    n : int
        Index of the file in list of S3 files sorted by date (0-indexed).
    bucket : str
        Name of bucket on S3.

    Returns
    -------
    last_updated : str
        A string of the selected format, or an empty string if no matching
        file could be found under either the new or the old key prefix.

    Raises
    ------
    TypeError
        If `file_type` is not one of the accepted values.
    """
    if file_type == 'model':
        prefix_new = prefix_old = f'models/{model}/model_'
    elif file_type == 'test_results':
        prefix_new = f'results/{model}/results_{tests}'
        prefix_old = f'results/{model}/results_'
    elif file_type == 'model_stats':
        prefix_new = f'model_stats/{model}/model_stats_'
        prefix_old = f'stats/{model}/stats_'
    elif file_type == 'test_stats':
        prefix_new = f'stats/{model}/test_stats_{tests}'
        prefix_old = f'stats/{model}/stats_'
    else:
        raise TypeError(f'Files of type {file_type} are not supported')
    # Try the current key layout first and fall back to the older one.
    # dict.fromkeys keeps the order and drops the duplicate when both
    # prefixes are identical, so the same listing is not requested twice.
    for prefix in dict.fromkeys((prefix_new, prefix_old)):
        try:
            return strip_out_date(
                find_nth_latest_s3_file(n=n, bucket=bucket, prefix=prefix,
                                        extension=extension),
                date_format=date_format)
        except TypeError:
            # A TypeError here is treated as "no file under this prefix";
            # move on to the next candidate prefix.
            continue
    logger.info('Could not find latest update date')
    return ''