Example #1
 def save_to_s3(self, bucket=EMMAA_BUCKET_NAME):
     """Dump the model state to S3."""
     date_str = make_date_str()
     fname = f'models/{self.name}/model_{date_str}'
     # Dump as pickle
     save_pickle_to_s3(self.stmts, bucket, key=fname + '.pkl')
     # Dump as json
     save_json_to_s3(self.stmts, bucket, key=fname + '.json')
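Every example on this page funnels through the same save_json_to_s3 helper
from emmaa.util. For orientation, here is a minimal sketch of what such a
helper could look like on top of boto3; the signature is inferred from the
calls on this page (including the save_format and intelligent_tiering
keywords used further down), and the actual EMMAA implementation may differ.

import json
import boto3

def save_json_to_s3(obj, bucket, key, save_format='json',
                    intelligent_tiering=False):
    """Sketch: serialize obj and upload it to s3://bucket/key."""
    if save_format == 'jsonl':
        # One JSON object per line
        body = '\n'.join(json.dumps(item) for item in obj)
    else:
        body = json.dumps(obj, indent=1)
    extra = {}
    if intelligent_tiering:
        # Let S3 move rarely accessed objects to cheaper storage tiers
        extra['StorageClass'] = 'INTELLIGENT_TIERING'
    boto3.client('s3').put_object(
        Body=body.encode('utf-8'), Bucket=bucket, Key=key, **extra)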
Example #2
 def save_to_s3(self, bucket=EMMAA_BUCKET_NAME):
     """Dump the model state to S3."""
     fname = f'models/{self.name}/model_{self.date_str}'
     # Dump as pickle
     save_pickle_to_s3(self.stmts, bucket, key=fname + '.pkl')
     # Save ids to stmt hashes mapping as json
     id_fname = f'papers/{self.name}/paper_ids_{self.date_str}.json'
     save_json_to_s3(list(self.paper_ids), bucket, key=id_fname)
Example #3
 def upload_results(self,
                    test_corpus='large_corpus_tests',
                    test_data=None,
                    bucket=EMMAA_BUCKET_NAME):
     """Upload results to s3 bucket."""
     json_dict = self.results_to_json(test_data)
     result_key = (f'results/{self.model.name}/results_'
                   f'{test_corpus}_{self.date_str}.json')
     logger.info(f'Uploading test results to {result_key}')
     save_json_to_s3(json_dict, bucket, result_key)
Example #4
def save_tests_to_s3(tests, bucket, key, save_format='pkl'):
    """Save tests in pkl, json or jsonl format."""
    if save_format == 'pkl':
        save_pickle_to_s3(tests, bucket, key)
    elif save_format in ['json', 'jsonl']:
        if isinstance(tests, list):
            stmts = [test.stmt for test in tests]
    elif isinstance(tests, dict):
        stmts = [test.stmt for test in tests['tests']]
    else:
        raise TypeError(f'Expected a list or dict of tests, got {type(tests)}')
        stmts_json = stmts_to_json(stmts)
        save_json_to_s3(stmts_json, bucket, key, save_format)
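A hypothetical call to save_tests_to_s3, assuming a list of test objects
with a .stmt attribute and an illustrative key name:

save_tests_to_s3(tests, EMMAA_BUCKET_NAME,
                 'tests/large_corpus_tests.jsonl', save_format='jsonl')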
Example #5
 def save_assembled_statements(self, bucket=EMMAA_BUCKET_NAME):
     """Upload assembled statements jsons to S3 bucket."""
     stmt_jsons = self.assembled_stmts_to_json()
     # Save a timestamped version and a generic latest version of the files
     key1 = f'assembled/{self.model.name}/statements_{self.date_str}.json'
     key2 = f'assembled/{self.model.name}/' \
            f'latest_statements_{self.model.name}.json'
     logger.info(f'Uploading assembled statements to {key1}')
     save_json_to_s3(stmt_jsons, bucket, key1)
     logger.info(f'Uploading assembled statements to {key2}')
     save_json_to_s3(stmt_jsons, bucket, key2)
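The dated key preserves an immutable history of uploads, while the
latest_* key gives downstream consumers a stable pointer to the newest
version. A hypothetical read-back of the stable key, assuming a
load_json_from_s3 counterpart to save_json_to_s3:

latest_key = f'assembled/{model_name}/latest_statements_{model_name}.json'
stmt_jsons = load_json_from_s3(EMMAA_BUCKET_NAME, latest_key)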
Example #6
def save_config_to_s3(model_name, config, bucket=EMMAA_BUCKET_NAME):
    """Upload config settings for a model to S3.

    Parameters
    ----------
    model_name : str
        The name of the model whose config should be saved to S3.
    config : dict
        A JSON dict of configurations for the model.
    bucket : str
        The name of the S3 bucket to save the config to.
    """
    base_key = f'models/{model_name}'
    config_key = f'{base_key}/config.json'
    logger.info(f'Saving model config to {config_key}')
    save_json_to_s3(config, bucket, config_key)
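A hypothetical round trip with this function, assuming a
load_config_from_s3 counterpart and an illustrative model name and key:

config = load_config_from_s3('covid19')  # assumed counterpart loader
config['run_model_update'] = True        # illustrative config key
save_config_to_s3('covid19', config)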
Example #7
 def save_stmts(stmts, model_name, date_str, bucket=EMMAA_BUCKET_NAME):
     """Save assembled statements to S3 in JSON, JSONL and gzip formats."""
     stmts_json = stmts_to_json(stmts)
     # Save a timestamped version and a generic latest version of the files
     dated_key = f'assembled/{model_name}/statements_{date_str}'
     latest_key = f'assembled/{model_name}/' \
                  f'latest_statements_{model_name}'
     for ext in ('json', 'jsonl'):
         latest_obj_key = latest_key + '.' + ext
         logger.info('Uploading assembled statements to '
                     f'{latest_obj_key}')
         save_json_to_s3(stmts_json, bucket, latest_obj_key, ext)
     dated_jsonl = dated_key + '.jsonl'
     dated_zip = dated_key + '.gz'
     logger.info(f'Uploading assembled statements to {dated_jsonl}')
     save_json_to_s3(stmts_json, bucket, dated_jsonl, 'jsonl')
     logger.info(f'Uploading assembled statements to {dated_zip}')
     save_gzip_json_to_s3(stmts_json, bucket, dated_zip, 'json')
Example #8
 def save_assembled_statements(self, bucket=EMMAA_BUCKET_NAME):
     """Upload assembled statements jsons to S3 bucket."""
     stmts = self.model.assembled_stmts
     stmts_json = stmts_to_json(stmts)
     # Save a timestamped version and a generic latest version of the files
     dated_key = f'assembled/{self.model.name}/statements_{self.date_str}'
     latest_key = f'assembled/{self.model.name}/' \
                  f'latest_statements_{self.model.name}'
     for ext in ('json', 'jsonl'):
         latest_obj_key = latest_key + '.' + ext
         logger.info(f'Uploading assembled statements to {latest_obj_key}')
         save_json_to_s3(stmts_json, bucket, latest_obj_key, ext)
     dated_jsonl = dated_key + '.jsonl'
     dated_zip = dated_key + '.gz'
     logger.info(f'Uploading assembled statements to {dated_jsonl}')
     save_json_to_s3(stmts_json, bucket, dated_jsonl, 'jsonl')
     logger.info(f'Uploading assembled statements to {dated_zip}')
     save_gzip_json_to_s3(stmts_json, bucket, dated_zip, 'json')
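save_gzip_json_to_s3 appears only for the dated .gz copy. A minimal sketch
of such a gzip variant, assuming boto3 and mirroring the save_json_to_s3
sketch above; not the verified EMMAA implementation:

import gzip
import json
import boto3

def save_gzip_json_to_s3(obj, bucket, key, save_format='json'):
    # Compress the serialized JSON before upload to reduce storage size
    body = gzip.compress(json.dumps(obj).encode('utf-8'))
    boto3.client('s3').put_object(
        Body=body, Bucket=bucket, Key=key,
        ContentType='application/json', ContentEncoding='gzip')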
Example #9
def test_util_s3_intelligent_tiering():
    from emmaa.util import save_json_to_s3, get_s3_client, get_s3_archive_status
    # Generate a payload that's larger than 128 kB
    large_file = 'a' * (1024 * 129)
    # Put it in a dict in order to serialize it as JSON
    large_file_dict = {'large_file': large_file}
    # Save it to S3
    client = get_s3_client()
    bucket = client.create_bucket(Bucket=TEST_BUCKET_NAME, ACL='public-read')
    key = 'intelligent_tiering_test/large_file.json'
    save_json_to_s3(obj=large_file_dict,
                    bucket=TEST_BUCKET_NAME,
                    key=key,
                    intelligent_tiering=True)

    # Check the archive status
    status = get_s3_archive_status(TEST_BUCKET_NAME, key)
    assert status['intelligent_tiering'] is True
    assert status['archived'] is False
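The test asserts on two keys of the returned status dict. A plausible
sketch of get_s3_archive_status, assuming it wraps the boto3 head_object
response, where StorageClass reports INTELLIGENT_TIERING and ArchiveStatus
only appears once the object has actually been archived; the real helper
may differ:

import boto3

def get_s3_archive_status(bucket, key):
    head = boto3.client('s3').head_object(Bucket=bucket, Key=key)
    return {
        'intelligent_tiering':
            head.get('StorageClass') == 'INTELLIGENT_TIERING',
        # ArchiveStatus is only present for archived objects
        'archived': 'ArchiveStatus' in head,
    }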
Example #10
 def upload_results(self, test_corpus='large_corpus_tests',
                    test_data=None, bucket=EMMAA_BUCKET_NAME):
     """Upload results to s3 bucket."""
     json_dict, json_lines = self.results_to_json(test_data)
     result_key = (f'results/{self.model.name}/results_'
                   f'{test_corpus}_{self.date_str}.json')
     paths_key = (f'paths/{self.model.name}/paths_{test_corpus}_'
                  f'{self.date_str}.jsonl')
     latest_paths_key = (f'paths/{self.model.name}/{test_corpus}'
                         '_latest_paths.jsonl')
     logger.info(f'Uploading test results to {result_key}')
     save_json_to_s3(json_dict, bucket, result_key)
     logger.info(f'Uploading test paths to {paths_key}')
     save_json_to_s3(json_lines, bucket, paths_key, save_format='jsonl')
     save_json_to_s3(json_lines, bucket, latest_paths_key, 'jsonl')
Example #11
    def upload_results(self, test_corpus='large_corpus_tests',
                       test_data=None, upload_to_db=True,
                       bucket=EMMAA_BUCKET_NAME):
        """Upload results to s3 bucket."""
        json_dict, json_lines = self.results_to_json(test_data)
        result_key = (f'results/{self.model.name}/results_'
                      f'{test_corpus}_{self.date_str}.json')
        paths_key = (f'paths/{self.model.name}/paths_{test_corpus}_'
                     f'{self.date_str}.jsonl')
        latest_paths_key = (f'paths/{self.model.name}/{test_corpus}'
                            '_latest_paths.jsonl')
        logger.info(f'Uploading test results to {result_key}')
        save_json_to_s3(json_dict, bucket, result_key)
        logger.info(f'Uploading test paths to {paths_key}')
        save_json_to_s3(json_lines, bucket, paths_key, save_format='jsonl')
        save_json_to_s3(json_lines, bucket, latest_paths_key, 'jsonl')

        # Also save the path counts to the database if requested
        if upload_to_db:
            db = get_db('stmt')
            db.update_statements_path_counts(
                self.model.name, self.date_str[:10], self.path_stmt_counts)
Example #12
 def save_to_s3_key(self, stats_key):
     """Upload the statistics JSON to S3 if it has been generated."""
     if self.json_stats:
         logger.info(f'Uploading statistics to {stats_key}')
         save_json_to_s3(self.json_stats, self.bucket, stats_key)