def get_model_metrics() -> pd.DataFrame:
    """
    Load every JSON file in the metrics directory and combine them into a single dataframe.

    :return: Pandas dataframe of metrics, with one column per metrics file.
    """
    metrics_path = get_path(dirs=['metrics'])

    dfs = []
    for filename in sorted(os.listdir(metrics_path)):
        with open(get_path(filename, dirs=['metrics']), 'r') as fp:
            d = json.load(fp)
            # Key the metrics by filename so each file becomes its own column.
            dfs.append(pd.DataFrame.from_dict({filename: d}))

    return pd.concat(dfs, axis=1, sort=False)
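
# Minimal usage sketch: each metrics file becomes its own column, so runs can be
# compared side by side. The 'val_accuracy' key is an assumption about what the
# metrics JSON files contain.
metrics_df = get_model_metrics()
print(metrics_df)  # rows = metric names, columns = metrics files
print(metrics_df.loc['val_accuracy'].sort_values(ascending=False))  # best runs first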
Code example #2
def ensemble_analyzer() -> pd.DataFrame:
    """
    Compare consecutive ensemble submissions; useful to skip a new submission when
    only a few predictions have changed since the previous ensemble.

    :return: Pandas dataframe with the test observations whose predicted category
             changed between the last two ensembles compared.
    """
    test_set = pd.read_csv(get_resources_path('test.csv'))
    base_ensemble = None
    # Walk the submissions directory and compare each ensemble file with the previous one.
    for filename in sorted(next(os.walk(get_path(dirs=['submissions'])))[2]):
        if 'ensemble' in filename:
            if base_ensemble is None:
                base_ensemble = pd.read_csv(get_path(filename, dirs=['submissions']))
            else:
                print('Analyzing ensemble {}'.format(filename))
                current_ensemble = pd.read_csv(get_path(filename, dirs=['submissions']))
                merged_df = pd.merge(base_ensemble, current_ensemble,
                                     suffixes=('_base', '_curr'), on='id', how='inner')
                dif = merged_df.category_base != merged_df.category_curr
                print('Different predictions:', np.sum(dif))
                base_ensemble = current_ensemble

    # Join the changed rows from the last comparison with the test set for inspection.
    return pd.merge(merged_df[merged_df.category_base != merged_df.category_curr],
                    test_set, on='id', how='inner')
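
# Hedged usage sketch, assuming at least two files whose names contain 'ensemble'
# exist in the submissions directory.
changed = ensemble_analyzer()
print('{} observations changed between the last two ensembles'.format(len(changed)))
print(changed[['id', 'category_base', 'category_curr']].head())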
Code example #3
def distribution_analysis(submission_fname: str) -> pd.DataFrame:
    """
    Analyze the mismatch between the training-set category distribution and the
    predicted category distribution in a submission.

    :param submission_fname: Filename of the submission, relative to the submissions directory.
    :return: Pandas dataframe comparing per-category shares, sorted by their difference.
    """
    training_distribution = pd.read_csv(get_resources_path('train.csv')).groupby('category').count()[['title']]
    training_distribution = training_distribution.rename(columns={"title": "train_count"})
    # Per-category share of the training set, in percent.
    training_distribution['pmf_train'] = (training_distribution['train_count']
                                          / training_distribution.train_count.sum() * 100)

    submission_distribution = pd.read_csv(get_path(submission_fname, dirs=['submissions'])).groupby('category').count()
    submission_distribution = submission_distribution.rename(columns={"id": "val_count"})
    # Per-category share of the submission predictions, in percent.
    submission_distribution['pmf_val'] = (submission_distribution['val_count']
                                          / submission_distribution.val_count.sum() * 100)

    # Positive 'dif' means the category is over-represented in the submission relative to training.
    dist_comp = submission_distribution.join(training_distribution)
    dist_comp['dif'] = dist_comp['pmf_val'] - dist_comp['pmf_train']
    return dist_comp.sort_values('dif')
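
# Hedged usage sketch; 'ensemble_submission.csv' is an illustrative filename, not
# a file known to exist in the project.
dist = distribution_analysis('ensemble_submission.csv')
print(dist[['pmf_train', 'pmf_val', 'dif']].head())  # categories under-represented in the submission
print(dist[['pmf_train', 'pmf_val', 'dif']].tail())  # categories over-represented in the submission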
Code example #4
utils.make_deterministic()

# load and pre-process data
print('Loading dataset')
dataset, submission_set = data.get_training_test_set()

print('Pre-processing titles')
dataset = vocab.pre_process_titles(dataset)
submission_set = vocab.pre_process_titles(submission_set)

# get vocabulary and encoders
vocabulary = embeddings.get_embeddings_vocabulary()
label_encoder = vocab.get_label_encoder(dataset['category'])

# try to load tokenizer from file
tokenizer_path = get_path('tokenizer.json')
if os.path.exists(tokenizer_path):
    with open(tokenizer_path) as f:
        # Use a dedicated name instead of re-binding `data`, which shadows the data module used above.
        tokenizer_json = json.load(f)
        tokenizer = tokenizer_from_json(tokenizer_json)
    print('Loaded tokenizer from file')
else:
    tokenizer = vocab.get_tokenizer(
        pd.concat([dataset['title'], submission_set['title']]), vocabulary)
    tokenizer_json = tokenizer.to_json()
    with open(tokenizer_path, 'w', encoding='utf-8') as f:
        f.write(json.dumps(tokenizer_json, ensure_ascii=False))

# pre-process dataset for training
labels = label_encoder.transform(dataset['category'])
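
# Hedged sketch of a possible next step: tokenize the training titles with the same
# helper applied to the submission set in code example #5, then split into stratified
# folds. Assumes vocab.get_padded_sequences returns an indexable NumPy array and that
# config.N_SPLITS matches the stage_2_fold_{i}.ckpt checkpoints of code example #5.
from sklearn.model_selection import StratifiedKFold

sequences = vocab.get_padded_sequences(dataset['title'], tokenizer)

skf = StratifiedKFold(n_splits=config.N_SPLITS, shuffle=True, random_state=42)
for fold, (train_idx, val_idx) in enumerate(skf.split(sequences, labels), start=1):
    x_train, x_val = sequences[train_idx], sequences[val_idx]
    y_train, y_val = labels[train_idx], labels[val_idx]
    # ... per-fold model training would go here ...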
Code example #5
utils.check_gpu_usage()
utils.make_deterministic()
timestamp = datetime.now().strftime("%d_%m_%Y_%Hh_%Mm_%Ss")

# load and pre-process data
dataset, submission_set = data.get_training_test_set()
forensics = submission_set.copy()
submission_set = vocab.pre_process_titles(submission_set)

# load category encoder
label_encoder = vocab.get_label_encoder(dataset['category'])
labels = label_encoder.transform(dataset['category'])

# reload tokenizer
with open(get_path('tokenizer.json')) as f:
    # Use a dedicated name instead of re-binding `data`, which shadows the data module used above.
    tokenizer_json = json.load(f)
    tokenizer = tokenizer_from_json(tokenizer_json)

# tokenize submission set
sequences_submission = vocab.get_padded_sequences(submission_set['title'],
                                                  tokenizer)

# ensemble prediction
yhat_ensemble = None
qty_ensembled_models = 0
for i in range(1, config.N_SPLITS + 1):
    model_path = get_path('stage_2_fold_{}.ckpt'.format(i),
                          dirs=['checkpoints'])

    if not os.path.exists(model_path):