Example #1
 def add_model_from_s3(self, model_id, config=None, number_of_updates=3,
                       bucket=EMMAA_BUCKET_NAME):
     """Add data for one model from S3 files."""
     if not config:
         config = load_config_from_s3(model_id)
     test_corpora = config['test']['test_corpus']
     if isinstance(test_corpora, str):
         test_corpora = [test_corpora]
     stmt_files = sort_s3_files_by_date_str(
         bucket, f'assembled/{model_id}/statements_', '.gz')
     stmt_files_to_use = stmt_files[:number_of_updates]
     for stmt_file in stmt_files_to_use:
         date = strip_out_date(stmt_file, 'date')
         dt = strip_out_date(stmt_file, 'datetime')
         # First get and add statements
         stmt_jsons = load_gzip_json_from_s3(bucket, stmt_file)
         self.add_statements(model_id, date, stmt_jsons)
         # Also update the path counts from each test corpus
         for test_corpus in test_corpora:
             key = f'results/{model_id}/results_{test_corpus}_{dt}.json'
             try:
                 results = load_json_from_s3(bucket, key)
                 path_counts = results[0].get('path_stmt_counts')
                 if path_counts:
                     self.update_statements_path_counts(
                         model_id, date, path_counts)
             except ClientError as e:
                 if e.response['Error']['Code'] == 'NoSuchKey':
                     logger.warning(f'No results file for {key}, skipping')
                     continue
                 else:
                     raise e
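A minimal usage sketch for the method above. The class that owns add_model_from_s3 is not shown in this excerpt, so `loader` below stands in for an instance of that class, and 'skcm' is an illustrative model id.

# Hypothetical usage; `loader` is an instance of the (unshown) class that
# defines add_model_from_s3 above, and 'skcm' is an illustrative model id.
loader.add_model_from_s3('skcm', number_of_updates=3)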
Example #2
    def load_from_s3(klass, model_name, bucket=EMMAA_BUCKET_NAME):
        """Load the latest model state from S3.

        Parameters
        ----------
        model_name : str
            Name of model to load. This function expects the latest model
            to be found on S3 in the emmaa bucket with key
            'models/{model_name}/model_{date_string}', and the model config
            file at 'models/{model_name}/config.json'.

        Returns
        -------
        emmaa.model.EmmaaModel
            Latest instance of EmmaaModel with the given name, loaded from S3.
        """
        config = load_config_from_s3(model_name, bucket=bucket)
        stmts, stmts_key = load_stmts_from_s3(model_name, bucket=bucket)
        date = strip_out_date(stmts_key)
        # Stmts and papers should be from the same date
        key = f'papers/{model_name}/paper_ids_{date}.json'
        try:
            paper_ids = load_json_from_s3(bucket, key)
        except ClientError as e:
            logger.warning(f'Could not find paper IDs mapping due to: {e}')
            paper_ids = None
        em = klass(model_name, config, paper_ids)
        em.stmts = stmts
        if not paper_ids:
            em.paper_ids = em.get_paper_ids_from_stmts(stmts)
        return em
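A usage sketch, assuming load_from_s3 is exposed as a classmethod of emmaa.model.EmmaaModel (the return type named in its docstring); the model name is illustrative.

# Hypothetical usage; EmmaaModel is assumed to expose the classmethod above,
# and 'skcm' to be an existing model on S3.
model = EmmaaModel.load_from_s3('skcm')
print(f'Loaded {len(model.stmts)} statements')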
Example #3
def get_assembled_statements(model, bucket=EMMAA_BUCKET_NAME):
    """Load and return the latest assembled statements for a model."""
    latest_file_key = find_latest_s3_file(bucket,
                                          f'assembled/{model}/statements_',
                                          '.json')
    if not latest_file_key:
        logger.info(f'No assembled statements found for {model}.')
        return None, None
    logger.info(f'Loading assembled statements for {model} from S3.')
    stmt_jsons = load_json_from_s3(bucket, latest_file_key)
    stmts = stmts_from_json(stmt_jsons)
    return stmts, latest_file_key
Example #4
def load_latest_statements(model):
    """Return the latest statement JSONs for a model along with an S3 link."""
    if does_exist(EMMAA_BUCKET_NAME,
                  f'assembled/{model}/latest_statements_{model}'):
        fkey = f'assembled/{model}/latest_statements_{model}.json'
    elif does_exist(EMMAA_BUCKET_NAME, f'assembled/{model}/statements_'):
        fkey = find_latest_s3_file(EMMAA_BUCKET_NAME,
                                   f'assembled/{model}/statements_', '.json')
    else:
        fkey = None
    if fkey:
        stmt_jsons = load_json_from_s3(EMMAA_BUCKET_NAME, fkey)
        link = f'https://{EMMAA_BUCKET_NAME}.s3.amazonaws.com/{fkey}'
    else:
        stmt_jsons = []
        link = ''
    return {'statements': stmt_jsons, 'link': link}
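A short sketch of consuming the dictionary returned above, assuming load_latest_statements is in scope; the model name is illustrative.

# Hypothetical usage of load_latest_statements defined above.
data = load_latest_statements('skcm')
print(f"{len(data['statements'])} statement JSONs from {data['link'] or 'no file found'}")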
Example #5
def load_config_from_s3(model_name, bucket=EMMAA_BUCKET_NAME):
    """Return a JSON dict of config settings for a model from S3.

    Parameters
    ----------
    model_name : str
        The name of the model whose config should be loaded.

    Returns
    -------
    config : dict
        A JSON dictionary of the model configuration loaded from S3.
    """
    base_key = f'models/{model_name}'
    config_key = f'{base_key}/config.json'
    logger.info(f'Loading model config from {config_key}')
    config = load_json_from_s3(bucket, config_key)
    return config
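A minimal sketch, assuming the function above is in scope and a config exists for the model; the nested 'test'/'test_corpus' keys mirror the access pattern in Example #1.

# Hypothetical usage of load_config_from_s3 defined above; the nested keys
# follow the access pattern shown in Example #1.
config = load_config_from_s3('skcm')
test_corpora = config['test']['test_corpus']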
Example #6
 def _get_previous_json_stats(self):
     key = find_latest_s3_file(
         self.bucket, f'model_stats/{self.model_name}/model_stats_',
         '.json')
     # This is the first time statistics are generated for this model
     if key is None:
         logger.info('Could not find a key to the previous statistics')
         return
     # If stats for this date already exist, the previous stats are the second latest
     if strip_out_date(key) == self.latest_round.date_str:
         logger.info('Statistics for the latest round already exist')
         key = find_nth_latest_s3_file(
             1, self.bucket, f'model_stats/{self.model_name}/model_stats_',
             '.json')
     # Store the date string to find previous round with it
     self.previous_date_str = strip_out_date(key)
     logger.info(f'Loading earlier statistics from {key}')
     previous_json_stats = load_json_from_s3(self.bucket, key)
     return previous_json_stats
Example #7
 def load_from_statements(cls, model_name, mode='local', date=None,
                          bucket=EMMAA_BUCKET_NAME):
     config = load_config_from_s3(model_name, bucket=bucket)
     if date:
         prefix = f'papers/{model_name}/paper_ids_{date}'
     else:
         prefix = f'papers/{model_name}/paper_ids_'
     paper_key = find_latest_s3_file(bucket, prefix, 'json')
     if paper_key:
         paper_ids = load_json_from_s3(bucket, paper_key)
     else:
         paper_ids = None
     model = EmmaaModel(model_name, config, paper_ids)
     # Loading assembled statements to avoid reassembly
     stmts, fname = get_assembled_statements(model_name, date, bucket)
     model.assembled_stmts = stmts
     model.date_str = strip_out_date(fname, 'datetime')
     mm = cls(model, mode=mode)
     return mm
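A sketch of calling this classmethod; ModelManager is a placeholder name for the class that defines it (not shown here), and the model name and date are illustrative.

# Hypothetical usage; ModelManager stands in for the class that defines
# load_from_statements above.
mm = ModelManager.load_from_statements('skcm', mode='local', date='2021-01-01')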
Example #8
def get_assembled_statements(model, date=None, bucket=EMMAA_BUCKET_NAME):
    """Load and return a list of assembled statements.

    Parameters
    ----------
    model : str
        A name of a model.
    date : str or None
        Date in "YYYY-MM-DD" format for which to load the statements. If None,
        loads the latest available statements.
    bucket : str
        Name of S3 bucket to look for a file. Defaults to 'emmaa'.

    Returns
    -------
    stmts : list[indra.statements.Statement]
        A list of assembled statements.
    latest_file_key : str
        Key of the file with statements on S3.
    """
    if not date:
        prefix = f'assembled/{model}/statements_'
    else:
        prefix = f'assembled/{model}/statements_{date}'
    # Try loading gzip file
    latest_file_key = find_latest_s3_file(bucket, prefix, '.gz')
    if not latest_file_key:
        # Could be saved with .zip extension
        latest_file_key = find_latest_s3_file(bucket, prefix, '.zip')
    if latest_file_key:
        stmt_jsons = load_gzip_json_from_s3(bucket, latest_file_key)
    else:
        # Try loading json file
        latest_file_key = find_latest_s3_file(bucket, prefix, '.json')
        if latest_file_key:
            stmt_jsons = load_json_from_s3(bucket, latest_file_key)
        # Didn't get gzip, zip or json
        else:
            logger.info(f'No assembled statements found for {model}.')
            return None, None
    stmts = stmts_from_json(stmt_jsons)
    return stmts, latest_file_key
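A usage sketch, assuming the function above is in scope; the date is illustrative and uses the "YYYY-MM-DD" format the docstring asks for.

# Hypothetical usage of get_assembled_statements defined above.
stmts, key = get_assembled_statements('skcm', date='2021-01-01')
if stmts is None:
    print('No assembled statements found for that date')
else:
    print(f'Loaded {len(stmts)} statements from {key}')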
Example #9
 def load_from_s3_key(cls, key, bucket=EMMAA_BUCKET_NAME):
     logger.info(f'Loading json from {key}')
     json_results = load_json_from_s3(bucket, key)
     date_str = json_results[0].get('date_str', strip_out_date(key))
     return cls(json_results, date_str)
Example #10
def load_custom_grounding_map(model, bucket=EMMAA_BUCKET_NAME):
    """Load a model-specific grounding map from S3."""
    key = f'models/{model}/grounding_map.json'
    gr_map = load_json_from_s3(bucket, key)
    return gr_map
Example #11
def get_model_stats(model,
                    mode,
                    tests=None,
                    date=None,
                    extension='.json',
                    n=0,
                    bucket=EMMAA_BUCKET_NAME):
    """Gets the latest statistics for the given model

    Parameters
    ----------
    model : str
        Model name to look for.
    mode : str
        Type of stats to load ('model' or 'test').
    tests : str
        A name of a test corpus. Default is large_corpus_tests.
    date : str or None
        Date for which the stats will be returned in "YYYY-MM-DD" format.
    extension : str
        Extension of the file.
    n : int
        Index of the file in the list of S3 files sorted by date (0-indexed).
    bucket : str
        Name of the bucket on S3.

    Returns
    -------
    model_data : dict
        The JSON-formatted statistics for the model.
    latest_file_key : str
        Key of the statistics file on S3.
    """
    if not tests:
        tests = _default_test(model, bucket=bucket)
    # If date is not specified, get the latest or the nth
    if not date:
        if mode == 'model':
            date = last_updated_date(model,
                                     'model_stats',
                                     'date',
                                     extension=extension,
                                     n=n,
                                     bucket=bucket)
        elif mode == 'test':
            date = last_updated_date(model,
                                     'test_stats',
                                     'date',
                                     tests=tests,
                                     extension=extension,
                                     n=n,
                                     bucket=bucket)
        else:
            raise TypeError("Mode must be either 'model' or 'test'")

    # Try to find stats in the new format (separate files for model and test stats)
    if mode == 'model':
        # File name example:
        # model_stats/skcm/model_stats_2019-08-20-17-34-40.json
        prefix = f'model_stats/{model}/model_stats_{date}'
        latest_file_key = find_latest_s3_file(bucket=bucket,
                                              prefix=prefix,
                                              extension=extension)
    elif mode == 'test':
        # File name example:
        # stats/skcm/test_stats_large_corpus_tests_2019-08-20-17-34-40.json
        prefix = f'stats/{model}/test_stats_{tests}_{date}'
        latest_file_key = find_latest_s3_file(bucket=bucket,
                                              prefix=prefix,
                                              extension=extension)
    else:
        raise TypeError("Mode must be either 'model' or 'test'")
    # This might be an older file with model and test stats combined.
    # File name example: stats/skcm/stats_2019-08-20-17-34-40.json
    if not latest_file_key and (mode == 'model' or
                                (mode == 'test'
                                 and tests == _default_test(model))):
        prefix = f'stats/{model}/stats_{date}'
        latest_file_key = find_latest_s3_file(bucket=bucket,
                                              prefix=prefix,
                                              extension=extension)
    # If we still didn't find the file, it probably does not exist
    if not latest_file_key:
        return None, None
    return (load_json_from_s3(bucket, latest_file_key), latest_file_key)
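A usage sketch, assuming the function above is in scope; the model and test corpus names are taken from the file-name examples in the comments above.

# Hypothetical usage of get_model_stats defined above.
model_stats, stats_key = get_model_stats('skcm', mode='model')
test_stats, _ = get_model_stats('skcm', mode='test', tests='large_corpus_tests')
if model_stats is not None:
    print(f'Model stats loaded from {stats_key}')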