def get_model_metrics() -> pd.DataFrame:
    """Collect every per-model JSON metrics file into a single dataframe."""
    metrics_path = get_path(dirs=['metrics'])
    dfs = []
    for filename in sorted(os.listdir(metrics_path)):
        with open(get_path(filename, dirs=['metrics']), 'r') as fp:
            d = json.load(fp)
        # Key each metrics dict by its source filename so columns stay identifiable.
        d = {filename: d}
        dfs.append(pd.DataFrame.from_dict(d))
    return pd.concat(dfs, axis=1, sort=False)
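# `get_path` and `get_resources_path` are project helpers that these excerpts
# call but never define. A minimal sketch of what they plausibly look like,
# reconstructed from the call sites (hypothetical, not the project's actual
# code; `PROJECT_ROOT` is an assumed constant):
import os

PROJECT_ROOT = os.path.dirname(os.path.abspath(__file__))

def get_path(filename: str = '', dirs: list = None) -> str:
    """Resolve a filename under optional subdirectories of the project root."""
    parts = [PROJECT_ROOT] + list(dirs or []) + ([filename] if filename else [])
    return os.path.join(*parts)

def get_resources_path(filename: str = '') -> str:
    """Resolve a filename under the project's resources directory."""
    return get_path(filename, dirs=['resources'])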
def ensemble_analyzer() -> pd.DataFrame:
    """
    Compare consecutive ensemble submissions, useful to avoid spending a
    submission when only a handful of predictions have changed.

    :return: Pandas dataframe with the observations that changed between the
        last two ensemble submissions, joined with the test set.
    """
    test_set = pd.read_csv(get_resources_path('test.csv'))
    base_ensemble = None
    # next(os.walk(path))[2] yields only the files directly under path.
    for filename in sorted(next(os.walk(get_path(dirs=['submissions'])))[2]):
        if 'ensemble' not in filename:
            continue
        if base_ensemble is None:
            base_ensemble = pd.read_csv(get_path(filename, dirs=['submissions']))
        else:
            print('Analyzing ensemble {}'.format(filename))
            current_ensemble = pd.read_csv(get_path(filename, dirs=['submissions']))
            merged_df = pd.merge(base_ensemble, current_ensemble,
                                 suffixes=('_base', '_curr'), on='id', how='inner')
            dif = merged_df.category_base != merged_df.category_curr
            print('Different predictions:', np.sum(dif))
            base_ensemble = current_ensemble
    return pd.merge(merged_df[merged_df.category_base != merged_df.category_curr],
                    test_set, on='id', how='inner')
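# The core comparison above is a column-wise inequality after an inner merge
# on `id`. A tiny self-contained demo of that pattern with fake data:
import pandas as pd

base = pd.DataFrame({'id': [1, 2, 3], 'category': ['A', 'B', 'C']})
curr = pd.DataFrame({'id': [1, 2, 3], 'category': ['A', 'X', 'C']})
merged = pd.merge(base, curr, suffixes=('_base', '_curr'), on='id', how='inner')
changed = merged[merged.category_base != merged.category_curr]
print(len(changed))  # 1 -- only id 2 flipped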
def distribution_analysis(submission_fname: str) -> pd.DataFrame:
    """
    Analyze the category-distribution mismatch between the training set and a
    submission's predictions.

    :param submission_fname: Filename of the submission.
    :return: Pandas dataframe with per-category counts, percentages and the
        percentage difference, sorted by that difference.
    """
    training_distribution = pd.read_csv(get_resources_path('train.csv')).groupby('category').count()[['title']]
    training_distribution = training_distribution.rename(columns={"title": "train_count"})
    training_distribution['pmf_train'] = (training_distribution['train_count']
                                          / training_distribution.train_count.sum() * 100)
    submission_distribution = pd.read_csv(get_path(submission_fname, dirs=['submissions'])).groupby('category').count()
    submission_distribution = submission_distribution.rename(columns={"id": "val_count"})
    submission_distribution['pmf_val'] = (submission_distribution['val_count']
                                          / submission_distribution.val_count.sum() * 100)
    dist_comp = submission_distribution.join(training_distribution)
    dist_comp['dif'] = dist_comp['pmf_val'] - dist_comp['pmf_train']
    return dist_comp.sort_values('dif')
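# Self-contained demo of the same percentage-difference comparison with fake
# counts (category names are illustrative):
import pandas as pd

train = pd.DataFrame({'train_count': [60, 40]},
                     index=pd.Index(['PHONES', 'SHOES'], name='category'))
train['pmf_train'] = train['train_count'] / train.train_count.sum() * 100
sub = pd.DataFrame({'val_count': [70, 30]},
                   index=pd.Index(['PHONES', 'SHOES'], name='category'))
sub['pmf_val'] = sub['val_count'] / sub.val_count.sum() * 100
comp = sub.join(train)
comp['dif'] = comp['pmf_val'] - comp['pmf_train']
print(comp.sort_values('dif'))  # SHOES under-predicted (-10), PHONES over (+10)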
utils.make_deterministic()

# load and pre-process data
print('Loading dataset')
dataset, submission_set = data.get_training_test_set()
print('Pre-processing titles')
dataset = vocab.pre_process_titles(dataset)
submission_set = vocab.pre_process_titles(submission_set)

# get vocabulary and encoders
vocabulary = embeddings.get_embeddings_vocabulary()
label_encoder = vocab.get_label_encoder(dataset['category'])

# try to load the tokenizer from file (named `tokenizer_json`, not `data`,
# to avoid shadowing the `data` module imported above)
tokenizer_path = get_path('tokenizer.json')
if os.path.exists(tokenizer_path):
    with open(tokenizer_path) as f:
        tokenizer_json = json.load(f)
    tokenizer = tokenizer_from_json(tokenizer_json)
    print('Loaded tokenizer from file')
else:
    tokenizer = vocab.get_tokenizer(
        pd.concat([dataset['title'], submission_set['title']]), vocabulary)
    tokenizer_json = tokenizer.to_json()
    with open(tokenizer_path, 'w', encoding='utf-8') as f:
        f.write(json.dumps(tokenizer_json, ensure_ascii=False))

# pre-process dataset for training
labels = label_encoder.transform(dataset['category'])
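# The block above round-trips the Keras tokenizer through JSON. A minimal
# self-contained demo of that round trip (assumes TensorFlow's bundled Keras;
# the sample titles are illustrative):
from tensorflow.keras.preprocessing.text import Tokenizer, tokenizer_from_json

tok = Tokenizer(oov_token='<unk>')
tok.fit_on_texts(['celular samsung galaxy', 'zapatilla running hombre'])
restored = tokenizer_from_json(tok.to_json())
# The restored tokenizer reproduces the original vocabulary mapping.
assert (restored.texts_to_sequences(['celular running'])
        == tok.texts_to_sequences(['celular running']))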
utils.check_gpu_usage()
utils.make_deterministic()
timestamp = datetime.now().strftime("%d_%m_%Y_%Hh_%Mm_%Ss")

# load and pre-process data
dataset, submission_set = data.get_training_test_set()
forensics = submission_set.copy()
submission_set = vocab.pre_process_titles(submission_set)

# load category encoder
label_encoder = vocab.get_label_encoder(dataset['category'])
labels = label_encoder.transform(dataset['category'])

# reload tokenizer (named `tokenizer_json` to avoid shadowing the `data` module)
with open(get_path('tokenizer.json')) as f:
    tokenizer_json = json.load(f)
tokenizer = tokenizer_from_json(tokenizer_json)

# tokenize submission set
sequences_submission = vocab.get_padded_sequences(submission_set['title'], tokenizer)

# ensemble prediction over the k fold checkpoints
yhat_ensemble = None
qty_ensembled_models = 0
for i in range(1, config.N_SPLITS + 1):
    model_path = get_path('stage_2_fold_{}.ckpt'.format(i), dirs=['checkpoints'])
    if not os.path.exists(model_path):
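# The (truncated) loop above accumulates per-fold predictions. A
# self-contained sketch of the soft-voting pattern that `yhat_ensemble` and
# `qty_ensembled_models` suggest -- fake probabilities here; the real script
# presumably sums each fold model's predict() output over the test sequences:
import numpy as np

rng = np.random.default_rng(0)
fold_probs = [rng.random((4, 3)) for _ in range(5)]  # 5 folds, 4 rows, 3 classes
yhat_ensemble = None
for probs in fold_probs:
    yhat_ensemble = probs if yhat_ensemble is None else yhat_ensemble + probs
print(yhat_ensemble.argmax(axis=1))  # winning class per row after summing votes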