Example 1
 def add_model_from_s3(self, model_id, config=None, number_of_updates=3,
                       bucket=EMMAA_BUCKET_NAME):
     """Add data for one model from S3 files."""
     if not config:
         config = load_config_from_s3(model_id)
     test_corpora = config['test']['test_corpus']
     if isinstance(test_corpora, str):
         test_corpora = [test_corpora]
     stmt_files = sort_s3_files_by_date_str(
         bucket, f'assembled/{model_id}/statements_', '.gz')
     stmt_files_to_use = stmt_files[:number_of_updates]
     for stmt_file in stmt_files_to_use:
         date = strip_out_date(stmt_file, 'date')
         dt = strip_out_date(stmt_file, 'datetime')
         # First get and add statements
         stmt_jsons = load_gzip_json_from_s3(bucket, stmt_file)
         self.add_statements(model_id, date, stmt_jsons)
         # Also update the path counts from each test corpus
         for test_corpus in test_corpora:
             key = f'results/{model_id}/results_{test_corpus}_{dt}.json'
             try:
                 results = load_json_from_s3(bucket, key)
                 path_counts = results[0].get('path_stmt_counts')
                 if path_counts:
                     self.update_statements_path_counts(
                         model_id, date, path_counts)
             except ClientError as e:
                 if e.response['Error']['Code'] == 'NoSuchKey':
                     logger.warning(f'No results file for {key}, skipping')
                     continue
                 else:
                     raise e
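
Note that strip_out_date is called on the same statements key in two formats: the 'date' form indexes the statements by day, while the 'datetime' form is used to reconstruct the exact results key. As a purely illustrative example (the key below is invented), the two calls would presumably return:

    stmt_file = 'assembled/aml/statements_2020-01-15-20-14-52.gz'
    strip_out_date(stmt_file, 'date')      # -> '2020-01-15'
    strip_out_date(stmt_file, 'datetime')  # -> '2020-01-15-20-14-52'
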
Example 2
 def test_statements_url(self):
     rj = self.call_api('get', 'latest/statements_url/marm_model')
     assert rj
     assert set(rj.keys()) == {'link'}
     key = rj['link'].split('/')[-1]
     assert not strip_out_date(key)
     dated_rj = self.call_api(
         'get', 'latest/statements_url/marm_model?dated=true')
     key = dated_rj['link'].split('/')[-1]
     assert strip_out_date(key)
Example 3
 def _make_path_stmts(self, stmts, merge=False):
     sentences = []
     date = strip_out_date(self.date_str, 'date')
     if merge and isinstance(stmts[0], Statement):
         groups = group_and_sort_statements(stmts, grouping_level='relation')
         for _, rel_key, group_stmts, _ in groups:
             sentence = make_string_from_relation_key(rel_key) + '.'
             stmt_hashes = [gr_st.get_hash()
                            for _, _, gr_st, _ in group_stmts]
             url_param = parse.urlencode(
                 {'stmt_hash': stmt_hashes, 'source': 'model_statement',
                  'model': self.model.name, 'date': date}, doseq=True)
             link = f'/evidence?{url_param}'
             sentences.append((link, sentence, ''))
     else:
         for stmt in stmts:
             if isinstance(stmt, PybelEdge):
                 sentence = pybel_edge_to_english(stmt)
                 sentences.append(('', sentence, ''))
             elif isinstance(stmt, RefEdge):
                 sentence = stmt.to_english()
                 sentences.append(('', sentence, ''))
             else:
                 ea = EnglishAssembler([stmt])
                 sentence = ea.make_model()
                 stmt_hashes = [stmt.get_hash()]
                 url_param = parse.urlencode(
                     {'stmt_hash': stmt_hashes, 'source': 'model_statement',
                      'model': self.model.name, 'date': date}, doseq=True)
                 link = f'/evidence?{url_param}'
                 sentences.append((link, sentence, ''))
     return sentences
Example 4
File: api.py Project: steppi/emmaa
def model_last_updated(model, extension='.pkl'):
    """Find the most recent pickle file of model and return its creation date

    Example file name:
    models/aml/model_2018-12-13-18-11-54.pkl

    Parameters
    ----------
    model : str
        Model name to look for
    extension : str
        The extension the model file needs to have. Default is '.pkl'

    Returns
    -------
    last_updated : str
        A string of the format "YYYY-MM-DD-HH-mm-ss"
    """
    prefix = f'models/{model}/model_'
    try:
        return strip_out_date(
            find_latest_s3_file(bucket=EMMAA_BUCKET_NAME,
                                prefix=prefix,
                                extension=extension))
    except TypeError:
        logger.info('Could not find latest update date')
        return ''
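
The documented file-name pattern and return format suggest what strip_out_date does: extract the timestamp substring from an S3 key, optionally truncated to the date part. Below is a minimal sketch under that assumption (regex-based, illustrative only, not the actual emmaa implementation):

    import re

    def extract_date_sketch(key, date_format='datetime'):
        # Illustrative sketch: pull a YYYY-MM-DD-HH-mm-ss timestamp from a key.
        match = re.search(r'\d{4}-\d{2}-\d{2}-\d{2}-\d{2}-\d{2}', key)
        if match is None:
            return None
        if date_format == 'date':
            # Keep only the YYYY-MM-DD part
            return match.group()[:10]
        return match.group()

If key is None (e.g. when find_latest_s3_file finds nothing), re.search raises a TypeError, which is consistent with model_last_updated above catching TypeError.
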
Example 5
    def load_from_s3(klass, model_name, bucket=EMMAA_BUCKET_NAME):
        """Load the latest model state from S3.

        Parameters
        ----------
        model_name : str
            Name of model to load. This function expects the latest model
            to be found on S3 in the emmaa bucket with key
            'models/{model_name}/model_{date_string}', and the model config
            file at 'models/{model_name}/config.json'.

        Returns
        -------
        emmaa.model.EmmaaModel
            Latest instance of EmmaaModel with the given name, loaded from S3.
        """
        config = load_config_from_s3(model_name, bucket=bucket)
        stmts, stmts_key = load_stmts_from_s3(model_name, bucket=bucket)
        date = strip_out_date(stmts_key)
        # Stmts and papers should be from the same date
        key = f'papers/{model_name}/paper_ids_{date}.json'
        try:
            paper_ids = load_json_from_s3(bucket, key)
        except ClientError as e:
            logger.warning(f'Could not find paper IDs mapping due to: {e}')
            paper_ids = None
        em = klass(model_name, config, paper_ids)
        em.stmts = stmts
        if not paper_ids:
            em.paper_ids = em.get_paper_ids_from_stmts(stmts)
        return em
Example 6
def load_model_manager_from_s3(model_name=None,
                               key=None,
                               bucket=EMMAA_BUCKET_NAME):
    # First try to find the file from the specified key
    if key:
        try:
            model_manager = load_pickle_from_s3(bucket, key)
            if not model_manager.model.assembled_stmts:
                stmts, _ = get_assembled_statements(model_manager.model.name,
                                                    strip_out_date(
                                                        model_manager.date_str,
                                                        'date'),
                                                    bucket=bucket)
                model_manager.model.assembled_stmts = stmts
            return model_manager
        except Exception as e:
            logger.info('Could not load the model manager directly')
            logger.info(e)
            if not model_name:
                model_name = key.split('/')[1]
            date = strip_out_date(key, 'date')
            logger.info('Trying to load model manager from statements')
            try:
                model_manager = ModelManager.load_from_statements(
                    model_name, date=date, bucket=bucket)
                return model_manager
            except Exception as e:
                logger.info('Could not load the model manager from '
                            'statements')
                logger.info(e)
                return None
    # Now try to find the latest key for the given model
    if model_name:
        # Versioned
        key = find_latest_s3_file(bucket,
                                  f'results/{model_name}/model_manager_',
                                  '.pkl')
        if key is None:
            # Non-versioned
            key = f'results/{model_name}/latest_model_manager.pkl'
        return load_model_manager_from_s3(model_name=model_name,
                                          key=key,
                                          bucket=bucket)
    # Could not find a model manager either from a key or from a model name.
    logger.info('Could not find the model manager.')
    return None
Example 7
 def _get_previous_json_stats(self):
     key = find_latest_s3_file(
         self.bucket, f'model_stats/{self.model_name}/model_stats_',
         '.json')
     # This is the first time statistics are generated for this model
     if key is None:
         logger.info('Could not find a key to the previous statistics')
         return
     # If stats for this date exist, the previous stats are the second latest
     if strip_out_date(key) == self.latest_round.date_str:
         logger.info('Statistics for the latest round already exist')
         key = find_nth_latest_s3_file(
             1, self.bucket, f'model_stats/{self.model_name}/model_stats_',
             '.json')
     # Store the date string to find previous round with it
     self.previous_date_str = strip_out_date(key)
     logger.info(f'Loading earlier statistics from {key}')
     previous_json_stats = load_json_from_s3(self.bucket, key)
     return previous_json_stats
Example 8
 def load_from_statements(cls, model_name, mode='local', date=None,
                          bucket=EMMAA_BUCKET_NAME):
     config = load_config_from_s3(model_name, bucket=bucket)
     if date:
         prefix = f'papers/{model_name}/paper_ids_{date}'
     else:
         prefix = f'papers/{model_name}/paper_ids_'
     paper_key = find_latest_s3_file(bucket, prefix, 'json')
     if paper_key:
         paper_ids = load_json_from_s3(bucket, paper_key)
     else:
         paper_ids = None
     model = EmmaaModel(model_name, config, paper_ids)
     # Loading assembled statements to avoid reassembly
     stmts, fname = get_assembled_statements(model_name, date, bucket)
     model.assembled_stmts = stmts
     model.date_str = strip_out_date(fname, 'datetime')
     mm = cls(model, mode=mode)
     return mm
Example 9
 def load_from_s3_key(cls, key, bucket=EMMAA_BUCKET_NAME):
     logger.info(f'Loading json from {key}')
     json_results = load_json_from_s3(bucket, key)
     date_str = json_results[0].get('date_str', strip_out_date(key))
     return cls(json_results, date_str)
Example 10
def last_updated_date(model,
                      file_type='model',
                      date_format='date',
                      tests='large_corpus_tests',
                      extension='.pkl',
                      n=0,
                      bucket=EMMAA_BUCKET_NAME):
    """Find the most recent or the nth file of given type on S3 and return its
    creation date.

    Example file name:
    models/aml/model_2018-12-13-18-11-54.pkl

    Parameters
    ----------
    model : str
        Model name to look for
    file_type : str
        Type of a file to find the latest file for. Accepted values: 'model',
        'test_results', 'model_stats', 'test_stats'.
    date_format : str
        Format of the returned date. Accepted values are 'datetime' (returns a
        date in the format "YYYY-MM-DD-HH-mm-ss") and 'date' (returns a date
        in the format "YYYY-MM-DD"). Default is 'date'.
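    tests : str
        Name of the test corpus to look for results or test stats files for.
        Default is 'large_corpus_tests'.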
    extension : str
        The extension the model file needs to have. Default is '.pkl'
    n : int
        Index of the file in list of S3 files sorted by date (0-indexed).
    bucket : str
        Name of bucket on S3.

    Returns
    -------
    last_updated : str
        A string of the selected format.
    """
    if file_type == 'model':
        folder_name = 'models'
        prefix_new = prefix_old = f'models/{model}/model_'
    elif file_type == 'test_results':
        prefix_new = f'results/{model}/results_{tests}'
        prefix_old = f'results/{model}/results_'
    elif file_type == 'model_stats':
        prefix_new = f'model_stats/{model}/model_stats_'
        prefix_old = f'stats/{model}/stats_'
    elif file_type == 'test_stats':
        prefix_new = f'stats/{model}/test_stats_{tests}'
        prefix_old = f'stats/{model}/stats_'
    else:
        raise TypeError(f'Files of type {file_type} are not supported')
    try:
        return strip_out_date(find_nth_latest_s3_file(n=n,
                                                      bucket=bucket,
                                                      prefix=prefix_new,
                                                      extension=extension),
                              date_format=date_format)
    except TypeError:
        try:
            return strip_out_date(find_nth_latest_s3_file(n=n,
                                                          bucket=bucket,
                                                          prefix=prefix_old,
                                                          extension=extension),
                                  date_format=date_format)
        except TypeError:
            logger.info('Could not find latest update date')
            return ''
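
Given the example file name from the docstring, and assuming it is the most recent model file, the two date_format options would presumably return (illustrative values only):

    last_updated_date('aml', file_type='model', date_format='datetime')
    # -> '2018-12-13-18-11-54'
    last_updated_date('aml', file_type='model', date_format='date')
    # -> '2018-12-13'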