def validate_features_dataset(output_dataset_path, ds_validation_path):
    """Plot a sample of the extracted features so they can be inspected.

    The dataset at ``output_dataset_path`` is opened read-only. Every entry
    whose key contains ``'indices_'`` is paired with the data stored under
    the last ``'_'``-separated part of that key. ``sampling_iter`` is asked
    for ``k=250`` file names (seeded with ``Config.SUPER_SEED``) from the
    first index map; only names present in every index map are kept. One
    multi-feature figure is drawn per kept file and all figures are written
    to ``ds_validation_path``.
    """
    dataset = F.Dataset(output_dataset_path, read_only=True)
    print(dataset)

    # feature name -> (index map, data array)
    features = {}
    for key, indices in dataset.items():
        if 'indices_' not in key:
            continue
        feat = key.split('_')[-1]
        features[feat] = (indices, dataset[feat])
    index_maps = [pair[0] for pair in features.values()]

    # ====== sampling 250 files ====== #
    candidates = sampling_iter(it=index_maps[0].keys(),
                               k=250,
                               seed=Config.SUPER_SEED)
    all_files = [
        name for name in candidates
        if all(name in index_map for index_map in index_maps)
    ]
    print("#Samples:", ctext(len(all_files), 'cyan'))

    # ====== ignore the 20-figures warning ====== #
    with catch_warnings_ignore(RuntimeWarning):
        for file_name in all_files:
            per_feature = {}
            for feat, (index_map, data) in features.items():
                begin, finish = index_map[file_name]
                per_feature[feat] = data[begin:finish][:].astype('float32')
            fig_title = '[%s]%s' % (dataset['dsname'][file_name], file_name)
            V.plot_multiple_features(features=per_feature,
                                     fig_width=20,
                                     title=fig_title)
        # low dpi: the PDF is only meant for a quick visual check
        V.plot_save(ds_validation_path, dpi=12)
def save_figs(args: Arguments,
              name: str,
              figs: Optional[Sequence[plt.Figure]] = None):
    """Save figures under the results path returned for ``args``.

    Exactly one figure is written as ``<name>.png``; anything else
    (``figs=None`` or several figures) is written as ``<name>.pdf``.
    The resolution comes from ``args.dpi``.
    """
    results_dir = get_results_path(args)
    single_fig = figs is not None and len(as_tuple(figs)) == 1
    if single_fig:
        figs = as_tuple(figs)
    extension = "png" if single_fig else "pdf"
    path = f'{results_dir}/{name}.{extension}'
    vs.plot_save(path, figs, dpi=args.dpi, verbose=True)
def evaluate_latent(fn, feeder, title):
    """Compute latent codes for every file in ``feeder`` and plot them.

    Parameters
    ----------
    fn : callable
        Called as ``fn(*data)`` on the data part of every batch; its outputs
        are concatenated along axis 0.
    feeder : object
        Must provide ``set_batch(batch_mode='file')`` yielding
        ``(name, idx, *data)`` tuples. Each file must fit in one batch
        (``idx == 0`` is asserted).
    title : str
        Progress-bar name, plot title and base name of the saved PDF
        (``<FIG_PATH>/<title>.pdf``).

    Notes
    -----
    File names are expected to look like ``<x>_<gender>_..._<digit>``: the
    last ``'_'`` field indexes ``digit_color_map`` and the second field
    indexes ``gender_marker_map``.
    """
    y_true = []
    Z = []
    for outputs in Progbar(feeder.set_batch(batch_mode='file'),
                           name=title,
                           print_report=True,
                           print_summary=False,
                           count_func=lambda x: x[-1].shape[0]):
        name = str(outputs[0])
        idx = int(outputs[1])
        data = outputs[2:]
        assert idx == 0
        y_true.append(name)
        Z.append(fn(*data))
    Z = np.concatenate(Z, axis=0)
    # ====== visualize spectrogram ====== #
    if Z.ndim >= 3:
        # Sampling without replacement raises when fewer than 3 items are
        # available, so never ask for more examples than exist.
        n_examples = min(3, len(Z))
        sample = np.random.choice(range(len(Z)),
                                  size=n_examples,
                                  replace=False)
        spec = Z[sample.astype('int32')]
        y = [y_true[int(i)] for i in sample]
        plot_figure(nrow=6, ncol=6)
        for i, (s, tit) in enumerate(zip(spec, y)):
            s = s.reshape(len(s), -1)
            plot_spectrogram(s.T, ax=(1, 3, i + 1), title=tit)
    # ====== visualize each point ====== #
    # flatten to 2D
    Z = np.reshape(Z, newshape=(len(Z), -1))
    # tsne if necessary
    if Z.shape[-1] > 3:
        Z = fast_tsne(Z,
                      n_components=3,
                      n_jobs=8,
                      # integer bound: `10e8` is a float literal
                      random_state=K.get_rng().randint(0, int(10e8)))
    # color and marker
    Z_color = [digit_color_map[i.split('_')[-1]] for i in y_true]
    Z_marker = [gender_marker_map[i.split('_')[1]] for i in y_true]
    plot_figure(nrow=6, ncol=20)
    # same 3-D scatter seen from three azimuth angles
    for i, azim in enumerate((15, 60, 120)):
        plot_scatter(x=Z[:, 0],
                     y=Z[:, 1],
                     z=Z[:, 2],
                     ax=(1, 3, i + 1),
                     size=4,
                     color=Z_color,
                     marker=Z_marker,
                     azim=azim,
                     legend=legends if i == 1 else None,
                     legend_ncol=11,
                     fontsize=10,
                     title=title)
    plot_save(os.path.join(FIG_PATH, '%s.pdf' % title))
def plot_images(df: pd.DataFrame, path: str, show_reconstruction: bool = True):
    """Draw one image grid per ``zdim`` value and save all figures to ``path``.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns ``zdim`` and ``gamma`` and have exactly five
        columns, whose row values unpack as ``(beta, _, _, recon, sample)``.
    path : str
        Output path handed to ``vs.plot_save``.
    show_reconstruction : bool
        Plot the ``recon`` image when true, otherwise the ``sample`` image.

    Notes
    -----
    Each ``gamma`` group occupies one grid row (rows in reversed group
    order). NaN pixels are shown as ``1.``. The module-level
    ``args.no_anno`` flag disables the per-image titles.
    """
    for zdim, zdim_group in tqdm(df.groupby('zdim')):
        by_gamma = zdim_group.groupby('gamma')
        n_row = len(by_gamma)
        n_col = max(len(g) for _, g in by_gamma)
        plt.figure(figsize=(n_col * 1.5, n_row * 1.5 + 1.5), dpi=150)
        # reverse the row order
        for row, (gamma, gamma_group) in enumerate(list(by_gamma)[::-1]):
            for col, (beta, _, _, recon, sample) in enumerate(
                    gamma_group.values):
                # Work on a copy: replacing NaNs in place would silently
                # modify the arrays stored in the caller's DataFrame.
                img = np.array(recon if show_reconstruction else sample,
                               copy=True)
                img[np.isnan(img)] = 1.
                # Index by (row, col) so a gamma group with fewer than
                # `n_col` images does not shift the following rows.
                plt.subplot(n_row, n_col, row * n_col + col + 1)
                plt.imshow(img, cmap='Greys_r')
                plt.axis('off')
                if not args.no_anno:
                    plt.title(f'b={beta} g={gamma}', fontsize=10)
        plt.suptitle(f'z={zdim}')
        plt.tight_layout(rect=[0.0, 0.0, 1.0, 1.001])
    vs.plot_save(path, verbose=True)
def on_compare(self, models, save_path):
    """Scatter-plot ten scores for the models of each dataset.

    One figure is created per value of the ``ds`` column of
    ``models.to_dataframe()``, with one subplot per score and one point per
    model. All figures are saved to ``<save_path>/compare.pdf``.
    """
    score_names = [
        'mllk', 'mig', 'beta', 'factor', 'uca', 'nmi', 'sap', 'd', 'c', 'i'
    ]
    scores = {}
    for score_name in score_names:
        scores[score_name] = self.get_scores('score',
                                             [m.hash for m in models],
                                             score_name)
    ncol = 5
    nrow = int(np.ceil(len(scores) / ncol))
    df = models.to_dataframe()
    for dsname, group in df.groupby('ds'):
        n_models = group.shape[0]
        # legend label of every model: "<vae>-<strategy>-<semi>"
        name = (group['vae'] + '-' + group['strategy'] + '-' +
                group['semi'].astype(str))
        colors = sns.color_palette(n_colors=n_models)
        X = np.arange(n_models)
        fig = plt.figure(figsize=(3 * ncol, 3 * nrow))
        for idx, (key, val) in enumerate(scores.items()):
            raw = np.array([val[hash_code] for hash_code in group['hash']])
            vmin = np.min(raw)
            vmax = np.max(raw)
            # plot values shifted to start at 0; tick labels show raw range
            y = raw - vmin
            ax = plt.subplot(nrow, ncol, idx + 1)
            points = []
            for x_, y_, c_ in zip(X, y, colors):
                points.append(ax.scatter(x_, y_, s=32, color=c_, alpha=0.8))
            ax.set_title(key)
            tick_labels = ["%.2f" % i for i in np.linspace(vmin, vmax, 5)]
            plt.yticks(np.linspace(0., np.max(y), 5), tick_labels)
            ax.tick_params(bottom=False, labelbottom=False, labelsize=8)
            # show legend:
            if idx == 0:
                ax.legend(points,
                          list(name),
                          fontsize=6,
                          fancybox=False,
                          framealpha=0.)
        fig.suptitle(dsname)
        fig.tight_layout(rect=[0, 0.03, 1, 0.97])
    vs.plot_save(os.path.join(save_path, 'compare.pdf'), dpi=100)
def evaluate_feeder(feeder, title):
    """Predict every file in ``feeder`` and plot one confusion matrix per gender.

    True digit and gender labels are derived from the file name with
    ``f_digits`` and ``f_genders``; predictions come from ``f_pred``. Genders
    without any sample are skipped. The figure is saved to
    ``<FIG_PATH>/<title>.pdf``.
    """
    digit_labels = []
    gender_labels = []
    batch_probas = []
    batches = Progbar(feeder.set_batch(batch_mode='file'),
                      name=title,
                      print_report=True,
                      print_summary=False,
                      count_func=lambda x: x[-1].shape[0])
    for outputs in batches:
        name = str(outputs[0])
        idx = int(outputs[1])
        data = outputs[2:]
        # every file must fit into a single batch
        assert idx == 0
        digit_labels.append(f_digits(name))
        gender_labels.append(f_genders(name))
        batch_probas.append(f_pred(*data))
    # ====== post processing ====== #
    y_true_digit = np.array(digit_labels, dtype='int32')
    y_true_gender = np.array(gender_labels, dtype='int32')
    y_pred_proba = np.concatenate(batch_probas, axis=0)
    y_pred_all = np.argmax(y_pred_proba, axis=-1).astype('int32')
    # ====== plotting for each gender ====== #
    plot_figure(nrow=6, ncol=25)
    for gen in range(len(genders)):
        selected = [i for i, g in enumerate(y_true_gender) if g == gen]
        y_true = [y_true_digit[i] for i in selected]
        y_pred = [y_pred_all[i] for i in selected]
        if len(y_true) == 0:
            continue
        cm = confusion_matrix(y_true, y_pred, labels=range(len(digits)))
        plot_confusion_matrix(cm,
                              labels=digits,
                              fontsize=8,
                              ax=(1, 4, gen + 1),
                              title='[%s]%s' % (genders[gen], title))
    plot_save(os.path.join(FIG_PATH, '%s.pdf' % title))
# ====== check exit condition ====== # if args.epoch > 0: if epoch >= args.epoch: break elif len(record_valid_loss) >= 2 and record_valid_loss[-1] > record_valid_loss[-2]: print(ctext("Dropped generalization loss `%.4f` -> `%.4f`" % (record_valid_loss[-2], record_valid_loss[-1]), 'yellow')) patience -= 1 if patience == 0: break epoch += 1 # ====== print summary training ====== # text = V.merge_text_graph(V.print_bar(record_train_loss, title="Train Loss"), V.print_bar(record_valid_loss, title="Valid Loss")) print(text) # ====== testing ====== # code_samples, lo = K.eval([Z, loss], feed_dict={X: X_valid}) if args.dim > 2: code_samples = ml.fast_pca(code_samples, n_components=2, random_state=K.get_rng().randint(10e8)) print("[Test set] Loss: %.4f" % lo) # plot test code samples V.plot_figure(nrow=8, ncol=8) ax = plt.subplot(1, 1, 1) ax.scatter(code_samples[:, 0], code_samples[:, 1], s=2, c=y_valid, alpha=0.5) ax.set_title('Test set') ax.set_aspect('equal', 'box') ax.axis('off') V.plot_save('/tmp/tmp_ae.pdf')
# =========================================================================== extractor = get_module_from_path(identifier=str(args.recipe), prefix='feature_recipes', path=get_script_path()) assert len(extractor) > 0, \ "Cannot find any recipe with name: '%s' from path: '%s'" % (args.recipe, get_script_path()) recipe = extractor[0](DEBUG) # ====== debugging ====== # if DEBUG: with np.warnings.catch_warnings(): np.warnings.filterwarnings('ignore') for path, name in SAMPLED_WAV_FILE: feat = recipe.transform(path) assert feat['bnf'].shape[0] == feat['mspec'].shape[0] V.plot_multiple_features(feat, title=feat['name']) V.plot_save(os.path.join(PATH_EXP, 'features_%s.pdf' % args.recipe)) exit() # =========================================================================== # Prepare the processor # =========================================================================== with np.warnings.catch_warnings(): np.warnings.filterwarnings('ignore') jobs = list(WAV_FILES.keys()) processor = pp.FeatureProcessor( jobs=jobs, path=os.path.join(PATH_ACOUSTIC_FEAT, args.recipe), extractor=recipe, n_cache=1200, ncpu=min(18, cpu_count() - 2), override=True,
def prepare_dnn_data(recipe, feat, utt_length, seed=52181208):
    """Create the train/valid feeders and the cached, segmented test data.

    Return
    ------
    train_feeder : Feeder for training
    valid_feeder : Feeder for validating
    test_ids : Test indices
    test_dat : Data array
    all_speakers : list of all speaker in training set
    """
    # Load dataset
    frame_length = int(utt_length / FRAME_SHIFT)
    ds = F.Dataset(os.path.join(PATH_ACOUSTIC_FEAT, recipe), read_only=True)
    X = ds[feat]
    train_indices = {}
    for name in TRAIN_DATA.keys():
        train_indices[name] = ds['indices'][name]
    # everything that is not a training file is used for testing
    test_indices = {}
    for name, start_end in ds['indices'].items():
        if name not in TRAIN_DATA:
            test_indices[name] = start_end
    train_indices, valid_indices = train_valid_test_split(
        x=list(train_indices.items()), train=0.9, inc_test=False, seed=seed)
    all_speakers = sorted(set(TRAIN_DATA.values()))
    n_speakers = max(all_speakers) + 1
    print("#Train files:", ctext(len(train_indices), 'cyan'))
    print("#Valid files:", ctext(len(valid_indices), 'cyan'))
    print("#Test files:", ctext(len(test_indices), 'cyan'))
    print("#Speakers:", ctext(n_speakers, 'cyan'))
    recipes = [
        F.recipes.Sequencing(frame_length=frame_length,
                             step_length=frame_length,
                             end='pad',
                             pad_value=0,
                             pad_mode='post',
                             data_idx=0),
        F.recipes.Name2Label(lambda name: TRAIN_DATA[name], ref_idx=0),
        F.recipes.LabelOneHot(nb_classes=n_speakers, data_idx=1)
    ]
    train_feeder = F.Feeder(
        data_desc=F.IndexedData(data=X, indices=train_indices),
        batch_mode='batch', ncpu=7, buffer_size=12)
    valid_feeder = F.Feeder(
        data_desc=F.IndexedData(data=X, indices=valid_indices),
        batch_mode='batch', ncpu=2, buffer_size=4)
    train_feeder.set_recipes(recipes)
    valid_feeder.set_recipes(recipes)
    print(train_feeder)
    # ====== cache the test data ====== #
    cache_dat = os.path.join(PATH_EXP,
                             'test_%s_%d.dat' % (feat, int(utt_length)))
    cache_ids = os.path.join(PATH_EXP,
                             'test_%s_%d.ids' % (feat, int(utt_length)))
    # validate cache files: the cache is stale when the index file is
    # missing or lists a different number of files than the test set
    if os.path.exists(cache_ids):
        with open(cache_ids, 'rb') as f:
            ids = pickle.load(f)
        stale = len(ids) != len(test_indices)
    else:
        stale = True
    if stale:
        for cache_path in (cache_ids, cache_dat):
            if os.path.exists(cache_path):
                os.remove(cache_path)
    # caching
    if not os.path.exists(cache_dat):
        dat = F.MmapData(cache_dat, dtype='float16',
                         shape=(0, frame_length, X.shape[1]))
        ids = {}
        prog = Progbar(target=len(test_indices))
        offset = 0
        for name, (start, end) in test_indices.items():
            segments = segment_axis(X[start:end], axis=0,
                                    frame_length=frame_length,
                                    step_length=frame_length,
                                    end='pad', pad_value=0, pad_mode='post')
            dat.append(segments)
            # update indices
            n_segments = len(segments)
            ids[name] = (offset, offset + n_segments)
            offset += n_segments
            # update progress
            prog.add(1)
        dat.flush()
        dat.close()
        with open(cache_ids, 'wb') as f:
            pickle.dump(ids, f)
    # ====== re-load ====== #
    dat = F.MmapData(cache_dat, read_only=True)
    with open(cache_ids, 'rb') as f:
        ids = pickle.load(f)
    # ====== save some sample ====== #
    sample_path = os.path.join(PATH_EXP,
                               'test_%s_%d.pdf' % (feat, int(utt_length)))
    V.plot_figure(nrow=9, ncol=6)
    sorted_ids = sorted(ids.items(), key=lambda item: item[0])
    sampled = sampling_iter(it=sorted_ids, k=12, seed=52181208)
    for i, (name, (start, end)) in enumerate(sampled):
        x = dat[start:end][:].astype('float32')
        # show one randomly chosen segment of the utterance
        picked = x[np.random.randint(0, len(x))]
        ax = V.plot_spectrogram(picked.T, ax=(12, 1, i + 1), title='')
        ax.set_title(name)
    V.plot_save(sample_path)
    return (train_feeder, valid_feeder, ids, dat, all_speakers)
def prepare_dnn_data(recipe, feat, utt_length, seed=87654321):
    """Create the train/valid feeders and the cached, segmented test data.

    Return
    ------
    train_feeder : Feeder for training
    valid_feeder : Feeder for validating
    test_ids : Test indices
    test_dat : Data array
    all_speakers : list of all speaker in training set
    """
    # Load dataset
    frame_length = int(utt_length / FRAME_SHIFT)
    ds = F.Dataset(os.path.join(PATH_ACOUSTIC_FEAT, recipe), read_only=True)
    X = ds[feat]
    train_indices = {}
    for name in TRAIN_DATA.keys():
        train_indices[name] = ds['indices'][name]
    # everything that is not a training file is used for testing
    test_indices = {}
    for name, start_end in ds['indices'].items():
        if name not in TRAIN_DATA:
            test_indices[name] = start_end
    train_indices, valid_indices = train_valid_test_split(
        x=list(train_indices.items()), train=0.9, inc_test=False, seed=seed)
    all_speakers = sorted(set(TRAIN_DATA.values()))
    n_speakers = max(all_speakers) + 1
    print("#Train files:", ctext(len(train_indices), 'cyan'))
    print("#Valid files:", ctext(len(valid_indices), 'cyan'))
    print("#Test files:", ctext(len(test_indices), 'cyan'))
    print("#Speakers:", ctext(n_speakers, 'cyan'))
    recipes = [
        F.recipes.Sequencing(frame_length=frame_length,
                             step_length=frame_length,
                             end='pad',
                             pad_value=0,
                             pad_mode='post',
                             data_idx=0),
        F.recipes.Name2Label(lambda name: TRAIN_DATA[name], ref_idx=0),
        F.recipes.LabelOneHot(nb_classes=n_speakers, data_idx=1)
    ]
    train_feeder = F.Feeder(
        data_desc=F.IndexedData(data=X, indices=train_indices),
        batch_mode='batch', ncpu=7, buffer_size=12)
    valid_feeder = F.Feeder(
        data_desc=F.IndexedData(data=X, indices=valid_indices),
        batch_mode='batch', ncpu=2, buffer_size=4)
    train_feeder.set_recipes(recipes)
    valid_feeder.set_recipes(recipes)
    print(train_feeder)
    # ====== cache the test data ====== #
    cache_dat = os.path.join(PATH_EXP,
                             'test_%s_%d.dat' % (feat, int(utt_length)))
    cache_ids = os.path.join(PATH_EXP,
                             'test_%s_%d.ids' % (feat, int(utt_length)))
    # validate cache files: the cache is stale when the index file is
    # missing or lists a different number of files than the test set
    if os.path.exists(cache_ids):
        with open(cache_ids, 'rb') as f:
            ids = pickle.load(f)
        stale = len(ids) != len(test_indices)
    else:
        stale = True
    if stale:
        for cache_path in (cache_ids, cache_dat):
            if os.path.exists(cache_path):
                os.remove(cache_path)
    # caching
    if not os.path.exists(cache_dat):
        dat = F.MmapData(cache_dat, dtype='float16',
                         shape=(0, frame_length, X.shape[1]))
        ids = {}
        prog = Progbar(target=len(test_indices))
        offset = 0
        for name, (start, end) in test_indices.items():
            segments = segment_axis(X[start:end], axis=0,
                                    frame_length=frame_length,
                                    step_length=frame_length,
                                    end='pad', pad_value=0, pad_mode='post')
            dat.append(segments)
            # update indices
            n_segments = len(segments)
            ids[name] = (offset, offset + n_segments)
            offset += n_segments
            # update progress
            prog.add(1)
        dat.flush()
        dat.close()
        with open(cache_ids, 'wb') as f:
            pickle.dump(ids, f)
    # ====== re-load ====== #
    dat = F.MmapData(cache_dat, read_only=True)
    with open(cache_ids, 'rb') as f:
        ids = pickle.load(f)
    # ====== save some sample ====== #
    sample_path = os.path.join(PATH_EXP,
                               'test_%s_%d.pdf' % (feat, int(utt_length)))
    V.plot_figure(nrow=9, ncol=6)
    sorted_ids = sorted(ids.items(), key=lambda item: item[0])
    sampled = sampling_iter(it=sorted_ids, k=12, seed=87654321)
    for i, (name, (start, end)) in enumerate(sampled):
        x = dat[start:end][:].astype('float32')
        # show one randomly chosen segment of the utterance
        picked = x[np.random.randint(0, len(x))]
        ax = V.plot_spectrogram(picked.T, ax=(12, 1, i + 1), title='')
        ax.set_title(name)
    V.plot_save(sample_path)
    return (train_feeder, valid_feeder, ids, dat, all_speakers)
def prepare_dnn_data(save_dir,
                     feat_name=None,
                     utt_length=None,
                     seq_mode=None,
                     min_dur=None,
                     min_utt=None,
                     exclude=None,
                     train_proportion=None,
                     return_dataset=False):
    """Build the training and validation feeders for the speaker DNN.

    The train/valid split and the speaker mapping are cached in ``save_dir``
    (``filtered_files.pkl``, ``train_files.pkl``, ``speaker_info.pkl``). They
    are recomputed only when at least one of those files is missing;
    otherwise the cached split is loaded and ``exclude``, ``min_dur``,
    ``min_utt`` and the down-sampling option have no effect.

    Parameters
    ----------
    save_dir : str
        Existing directory for the cache files.
    feat_name : str, optional
        Feature to load; defaults to ``FEATURE_NAME``. The dataset must also
        contain ``indices_<feat_name>``.
    utt_length : int, optional
        Defaults to ``int(_args.utt)``; forwarded to
        ``prepare_dnn_feeder_recipe``.
    seq_mode : str, optional
        Defaults to ``str(_args.seq).strip().lower()``; forwarded to
        ``prepare_dnn_feeder_recipe``.
    min_dur, min_utt : optional
        Default to ``MINIMUM_UTT_DURATION`` / ``MINIMUM_UTT_PER_SPEAKERS``;
        forwarded to ``filter_utterances``.
    exclude : str, optional
        Comma-separated dataset names to drop; defaults to
        ``str(_args.exclude).strip()``. When it contains ``'noise'``, every
        file whose name contains ``'/'`` is dropped as well.
    train_proportion : float, optional
        If given (``0 < p < 1``), only that fraction of the training files
        (randomly chosen) is fed to the training feeder.
    return_dataset : bool
        Also return the opened dataset.

    Returns
    -------
    ``(train_feeder, valid_feeder, all_speakers)``, followed by the dataset
    when ``return_dataset`` is true.

    Notes
    -----
    When ``IS_DEBUGGING`` is set the function plots sampled validation
    spectrograms to ``/tmp/tmp.pdf`` and terminates the process.
    The cache files are read with ``pickle``; only point ``save_dir`` at
    directories you trust.
    """
    assert os.path.isdir(save_dir), \
        "Path to '%s' is not a directory" % save_dir
    if feat_name is None:
        feat_name = FEATURE_NAME
    if utt_length is None:
        utt_length = int(_args.utt)
    if seq_mode is None:
        seq_mode = str(_args.seq).strip().lower()
    if min_dur is None:
        min_dur = MINIMUM_UTT_DURATION
    if min_utt is None:
        min_utt = MINIMUM_UTT_PER_SPEAKERS
    if exclude is None:
        exclude = str(_args.exclude).strip()
    print("Minimum duration: %s(s)" % ctext(min_dur, 'cyan'))
    print("Minimum utt/spk : %s(utt)" % ctext(min_utt, 'cyan'))
    # ******************** prepare dataset ******************** #
    path = os.path.join(PATH_ACOUSTIC_FEATURES, FEATURE_RECIPE)
    assert os.path.exists(
        path), "Cannot find acoustic dataset at path: %s" % path
    ds = F.Dataset(path=path, read_only=True)
    rand = np.random.RandomState(seed=Config.SUPER_SEED)
    # ====== find the right feature ====== #
    assert feat_name in ds, "Cannot find feature with name: %s" % feat_name
    X = ds[feat_name]
    ids_name = 'indices_%s' % feat_name
    assert ids_name in ds, "Cannot find indices with name: %s" % ids_name
    # ====== basic path ====== #
    path_filtered_data = os.path.join(save_dir, 'filtered_files.pkl')
    path_train_files = os.path.join(save_dir, 'train_files.pkl')
    path_speaker_info = os.path.join(save_dir, 'speaker_info.pkl')
    # ******************** cannot find cached data ******************** #
    if any(not os.path.exists(p) for p in
           [path_filtered_data, path_train_files, path_speaker_info]):
        # ====== exclude some dataset ====== #
        if len(exclude) > 0:
            exclude_dataset = {i: 1 for i in exclude.split(',')}
            print("* Excluded dataset:", ctext(exclude_dataset, 'cyan'))
            indices = {
                name: (start, end)
                for name, (start, end) in ds[ids_name].items()
                if ds['dsname'][name] not in exclude_dataset
            }
            # special case exclude all the noise data
            if 'noise' in exclude_dataset:
                indices = {
                    name: (start, end)
                    for name, (start, end) in indices.items()
                    if '/' not in name
                }
        else:
            indices = dict(ds[ids_name].items())
        # ====== down-sampling if necessary ====== #
        if _args.downsample > 1000:
            dataset2name = defaultdict(list)
            # ordering the indices so we sample the same set every time
            for name in sorted(indices.keys()):
                dataset2name[ds['dsname'][name]].append(name)
            n_total_files = len(indices)
            n_sample_files = int(_args.downsample)
            # get the percentage of each dataset
            dataset2per = {
                i: len(j) / n_total_files for i, j in dataset2name.items()
            }
            # sampling based on percentage
            sampled_indices = {}
            for dsname, flist in dataset2name.items():
                rand.shuffle(flist)
                n_dataset_files = int(dataset2per[dsname] * n_sample_files)
                sampled_indices.update(
                    {i: indices[i] for i in flist[:n_dataset_files]})
            indices = sampled_indices
        # ====== * filter out "bad" sample ====== #
        indices = filter_utterances(X=X,
                                    indices=indices,
                                    spkid=ds['spkid'],
                                    min_utt=min_utt,
                                    min_dur=min_dur,
                                    remove_min_length=True,
                                    remove_min_uttspk=True,
                                    n_speakers=None,
                                    ncpu=None,
                                    save_path=path_filtered_data)
        # ====== all training file name ====== #
        # modify here to train full dataset
        all_name = sorted(indices.keys())
        # shuffled twice on purpose: the draws must stay in this order so
        # the seeded split is reproducible
        rand.shuffle(all_name)
        rand.shuffle(all_name)
        n_files = len(all_name)
        print("#Files:", ctext(n_files, 'cyan'))
        # ====== speaker mapping ====== #
        name2spk = {name: ds['spkid'][name] for name in all_name}
        all_speakers = sorted(set(name2spk.values()))
        spk2label = {spk: i for i, spk in enumerate(all_speakers)}
        name2label = {name: spk2label[spk] for name, spk in name2spk.items()}
        assert len(name2label) == len(all_name)
        print("#Speakers:", ctext(len(all_speakers), 'cyan'))
        # ====== stratify sampling based on speaker ====== #
        valid_name = []
        # create speakers' cluster
        label2name = defaultdict(list)
        for name, label in sorted(name2label.items(), key=lambda x: x[0]):
            label2name[label].append(name)
        # for each speaker with >= 3 utterance
        for label, name_list in sorted(label2name.items(),
                                       key=lambda x: x[0]):
            if len(name_list) < 3:
                continue
            n = max(1, int(0.05 * len(name_list)))  # 5% for validation
            valid_name += rand.choice(a=name_list, size=n,
                                      replace=False).tolist()
        # train list is the rest
        valid_set = set(valid_name)
        train_name = [i for i in all_name if i not in valid_set]
        # ====== split training and validation ====== #
        train_indices = {name: indices[name] for name in train_name}
        valid_indices = {name: indices[name] for name in valid_name}
        # ====== save cached data ====== #
        with open(path_train_files, 'wb') as fout:
            pickle.dump({'train': train_indices, 'valid': valid_indices},
                        fout)
        with open(path_speaker_info, 'wb') as fout:
            pickle.dump(
                {
                    'all_speakers': all_speakers,
                    'name2label': name2label,
                    'spk2label': spk2label
                }, fout)
    # ******************** load cached data ******************** #
    else:
        with open(path_train_files, 'rb') as fin:
            obj = pickle.load(fin)
            train_indices = obj['train']
            valid_indices = obj['valid']
        with open(path_speaker_info, 'rb') as fin:
            obj = pickle.load(fin)
            all_speakers = obj['all_speakers']
            name2label = obj['name2label']
            spk2label = obj['spk2label']

    # ******************** print log ******************** #
    def summary_indices(ids):
        """Print per-dataset utterance/speaker counts and an MD5 of ``ids``."""
        datasets = defaultdict(int)
        speakers = defaultdict(list)
        # collect the pieces and join once instead of repeated `+=`
        parts = []
        for name in sorted(ids.keys()):
            parts.append(name + str(ids[name]))
            dsname = ds['dsname'][name]
            datasets[dsname] += 1
            speakers[dsname].append(ds['spkid'][name])
        text = ''.join(parts)
        for dsname in sorted(datasets.keys()):
            print(' %-18s: %s(utt) %s(spk)' %
                  (dsname, ctext('%6d' % datasets[dsname], 'cyan'),
                   ctext(len(set(speakers[dsname])), 'cyan')))
        print(' MD5 checksum:', ctext(crypto.md5_checksum(text), 'lightcyan'))

    # ====== training files ====== #
    print(
        "#Train files:", ctext('%-8d' % len(train_indices), 'cyan'),
        "#spk:",
        ctext(len(set(name2label[name] for name in train_indices.keys())),
              'cyan'),
        "#noise:",
        ctext(sum(1 for name in train_indices.keys() if '/' in name),
              'cyan'))
    summary_indices(ids=train_indices)
    # ====== valid files ====== #
    print(
        "#Valid files:", ctext('%-8d' % len(valid_indices), 'cyan'),
        "#spk:",
        ctext(len(set(name2label[name] for name in valid_indices.keys())),
              'cyan'),
        "#noise:",
        ctext(sum(1 for name in valid_indices.keys() if '/' in name),
              'cyan'))
    summary_indices(ids=valid_indices)
    # ******************** create the recipe ******************** #
    assert all(name in name2label for name in train_indices.keys())
    assert all(name in name2label for name in valid_indices.keys())
    recipes = prepare_dnn_feeder_recipe(name2label=name2label,
                                        n_speakers=len(all_speakers),
                                        utt_length=utt_length,
                                        seq_mode=seq_mode)
    # ====== downsample training set for analyzing if required ====== #
    if train_proportion is not None:
        assert 0 < train_proportion < 1
        n_training = len(train_indices)
        train_indices = list(train_indices.items())
        rand.shuffle(train_indices)
        rand.shuffle(train_indices)
        train_indices = dict(
            train_indices[:int(n_training * train_proportion)])
    # ====== create feeder ====== #
    train_feeder = F.Feeder(data_desc=F.IndexedData(data=X,
                                                    indices=train_indices),
                            batch_mode='batch',
                            ncpu=NCPU,
                            buffer_size=256)
    valid_feeder = F.Feeder(data_desc=F.IndexedData(data=X,
                                                    indices=valid_indices),
                            batch_mode='batch',
                            ncpu=max(2, NCPU // 4),
                            buffer_size=64)
    train_feeder.set_recipes(recipes)
    valid_feeder.set_recipes(recipes)
    print(train_feeder)
    print(valid_feeder)
    # ====== debugging ====== #
    if IS_DEBUGGING:
        import matplotlib
        matplotlib.use('Agg')
        prog = Progbar(target=len(valid_feeder),
                       print_summary=True,
                       name="Iterating validation set")
        samples = []
        n_visual = 250
        # Loop variables are deliberately not called `X`/`y` so the dataset
        # array `X` above is not rebound.
        for name, idx, batch_x, batch_y in valid_feeder.set_batch(
                batch_size=100000, batch_mode='file', seed=None,
                shuffle_level=0):
            assert idx == 0, "Utterances longer than %.2f(sec)" % (
                100000 * Config.STEP_LENGTH)
            prog['X'] = batch_x.shape
            prog['y'] = batch_y.shape
            prog.add(batch_x.shape[0])
            # random sampling
            if rand.rand(1) < 0.5 and len(samples) < n_visual:
                for i in rand.randint(0, batch_x.shape[0], size=4,
                                      dtype='int32'):
                    samples.append(
                        (name, batch_x[i], np.argmax(batch_y[i], axis=-1)))
        # plot the spectrogram
        n_visual = len(samples)
        V.plot_figure(nrow=n_visual, ncol=8)
        for i, (name, spec, label) in enumerate(samples):
            is_noise = '/' in name
            # `label` is already the arg-max class index stored above
            assert name2label[
                name] == label, "Speaker label mismatch for file: %s" % name
            name = name.split('/')[0]
            dsname = ds['dsname'][name]
            spkid = ds['spkid'][name]
            ax = V.plot_spectrogram(spec.T,
                                    ax=(n_visual, 1, i + 1),
                                    title='#%d' % (i + 1))
            ax.set_title(
                '[%s][%s]%s  %s' %
                ('noise' if is_noise else 'clean', dsname, name, spkid),
                fontsize=6)
        # don't need to be high resolutions
        V.plot_save('/tmp/tmp.pdf', dpi=12)
        exit()
    # ====== return ====== #
    if bool(return_dataset):
        return train_feeder, valid_feeder, all_speakers, ds
    return train_feeder, valid_feeder, all_speakers
# ===========================================================================
def plot(train, score, title, applying_pca=False):
    """Scatter the train and score embeddings side by side in one figure.

    With ``applying_pca`` a ``PCA(n_components=NUM_DIM)`` is fitted on
    ``train`` and applied to both sets first. A third axis is used only when
    ``NUM_DIM >= 3`` and the data has at least three columns.
    """
    if applying_pca:
        pca = PCA(n_components=NUM_DIM)
        pca.fit(train)
        train, score = pca.transform(train), pca.transform(score)
    plot_figure(nrow=6, ncol=12)
    # (title prefix, points, colors, markers, subplot position)
    panels = (
        ('[train]', train, y_train_color, y_train_marker, 1),
        ('[score]', score, y_score_color, y_score_marker, 2),
    )
    for prefix, points, colors, markers, position in panels:
        if NUM_DIM < 3 or points.shape[1] < 3:
            depth = None
        else:
            depth = points[:, 2]
        plot_scatter(x=points[:, 0],
                     y=points[:, 1],
                     z=depth,
                     size=POINT_SIZE,
                     color=colors,
                     marker=markers,
                     fontsize=12,
                     legend=legends,
                     title=prefix + str(title),
                     ax=(1, 2, position))


plot(train=X_train_pca, score=X_score_pca, title='PCA')
plot(train=X_train_tsne, score=X_score_tsne, title='T-SNE')
plot(train=X_train_tsne_pca, score=X_score_tsne_pca, title='T-SNE + PCA')
plot(train=X_train_lda, score=X_score_lda, title='LDA')
plot(train=X_train_plda, score=X_score_plda, title='PLDA')
plot(train=X_train_plda, score=X_score_plda, title='PLDA + PCA',
     applying_pca=True)
plot(train=X_train_gmm, score=X_score_gmm, title='GMM')
plot(train=X_train_rbm, score=X_score_rbm, title='RBM')
plot_save('/tmp/tmp.pdf')
assert FEATURE_NAME in feat # update progress if isinstance(feat, pp.base.ExtractorSignal): error_signal.append(feat) prog.add(1) continue prog['spkid'] = feat['spkid'] prog['name'] = feat['name'] prog['dsname'] = feat['dsname'] prog['duration'] = feat['duration'] prog.add(1) # 30% chance plotting if rand.rand() < 0.5: V.plot_multiple_features(feat, fig_width=20, title='[%s]%s' % (feat['dsname'], feat['name'])) V.plot_save(os.path.join(EXP_DIR, 'debug_%s.pdf' % FEATURE_RECIPE), dpi=30) # ====== save the extractor debugging log ====== # pp.set_extractor_debug(recipe, debug=True) recipe.transform(samples[0]) with open(os.path.join(EXP_DIR, 'debug_%s.log' % FEATURE_RECIPE), 'w') as f: for name, step in recipe.steps: f.write(step.last_debugging_text) # ====== print error signal ====== # for e in error_signal: f.write(str(e) + '\n') print(e) exit() # =========================================================================== # Running the extractor # =========================================================================== # ====== basic path ====== #
# Synthetic non-target ("false") scores: `n_false` normal draws with mean
# `mean_False` and standard deviation `stdv_False`. `n_true`, `n_false`,
# `mean_False` and `true` are defined earlier in the script.
stdv_False = 1.5
false = stdv_False * np.random.randn(n_false) + mean_False
# Labels: the first `n_true` entries are 1 (target), the rest stay 0.
y_true = np.zeros(shape=(n_true + n_false,))
y_true[:n_true] = 1
# Scores are concatenated in the same order as the labels: targets first.
y_score = np.concatenate((true, false))
# DET curve, minDCF and EER computed with the K.metrics implementation.
Pfa, Pmiss = K.metrics.det_curve(y_true=y_true, y_score=y_score)
min_DCF, Pfa_opt, Pmiss_opt = K.metrics.compute_minDCF(Pfa, Pmiss)
print("MinDCF, Pmiss_opt, Pfa_opt:", min_DCF, Pmiss_opt, Pfa_opt)
print("EER1:", K.metrics.compute_EER(Pfa, Pmiss))
# Same quantities from `rocch` for comparison.
# NOTE(review): presumably the Sidekit ROC convex hull, judging by the
# printed "[Sidekit]" tag - confirm against the import.
pmiss, pfa = rocch(tar_scores=true, nontar_scores=false)
min_DCF, Pfa_opt, Pmiss_opt = K.metrics.compute_minDCF(pfa, pmiss)
print("[Sidekit]MinDCF, Pmiss_opt, Pfa_opt:", min_DCF, Pmiss_opt, Pfa_opt)
# NOTE(review): arguments are passed as (pmiss, pfa) here but as
# (Pfa, Pmiss) to K.metrics.compute_EER above - verify the expected order
# of this `compute_EER`.
print("[Sidekit]EER:", compute_EER(pmiss, pfa))
print("[Sidekit]MinDCF, Pmiss_opt, Pfa_opt, ..., EER:",
      fast_minDCF(tar=true, non=false, plo=0))
fpr, tpr, _ = K.metrics.roc_curve(y_true=y_true, y_score=y_score)
# NOTE(review): `auc` is not used in the visible part of the script.
auc = K.metrics.compute_AUC(tpr, fpr)
# ====== specialized plotting ====== #
# One figure per curve: rocch DET, K.metrics DET, then the ROC curve.
plt.figure()
V.plot_detection_curve(x=pfa, y=pmiss, curve='det')
plt.figure()
V.plot_detection_curve(x=Pfa, y=Pmiss, curve='det')
plt.figure()
V.plot_detection_curve(x=fpr, y=tpr, curve='roc')
V.plot_save('/tmp/tmp.pdf')
def plot_monitoring_epoch(X, X_drop, y, Z, Z_drop, W_outputs, W_drop_outputs,
                          pi, pi_drop, row_name, dropout_percentage,
                          curr_epoch, ds_name, labels, save_dir):
    """Save the monitoring figures of one training epoch into `save_dir`.

    Up to four PNG files are written, each named with `curr_epoch`:

    * ``latent_epoch%d.png``: 2-D PCA scatter plots of the latent space
      (plus the original and reconstructed data when `W_outputs` is given).
    * ``countsum_epoch%d.png``: count-sum scatter plots and series; only
      when `W_outputs` is given.
    * ``samples_epoch%d.png``: observed vs. expected series of 8 random
      samples; only when `W_outputs` holds exactly three arrays.
    * ``image_epoch%d.png``: original/reconstructed images reshaped to
      28x28; only when ``'mnist' in ds_name`` and `W_outputs` is given.

    At most 8000 samples are plotted; larger inputs are randomly
    down-sampled with a fixed seed.

    Parameters
    ----------
    X, X_drop : arrays indexed by sample along the first axis
        Test data without and with dropout.
    y : array
        Label indices, or a 2-D array which is reduced by `argmax` over the
        last axis; the indices are then mapped through `labels`.
    Z, Z_drop : arrays
        Latent representations of `X` and `X_drop`.
    W_outputs, W_drop_outputs : None or sequence of arrays
        Reconstruction outputs ordered as
        ``[W, W_stdev_total, W_stdev_explained]``. Only the first entry is
        required. `None` disables every reconstruction plot.
    pi, pi_drop : None or arrays indexed by sample
        Plotted as "Zero-inflated probabilities" when given.
    row_name : sequence
        Name of every row of `X`; used in the titles of the sample plots.
    dropout_percentage : number
        Shown in the titles as ``dropout_percentage * 100`` percent.
    curr_epoch : int
        Epoch number used in the output file names.
    ds_name : str
        Dataset name; enables the image figure when it contains 'mnist'.
    labels : sequence
        Label names indexed by the values of `y`.
    save_dir : str
        Directory that receives the figures.
    """
    # Order of W_outputs: [W, W_stdev_total, W_stdev_explained]
    from matplotlib import pyplot as plt
    has_reconstruction = W_outputs is not None
    if y.ndim == 2:
        y = np.argmax(y, axis=-1)
    y = np.array([labels[i] for i in y])
    dropout_percentage_text = '%g%%' % (dropout_percentage * 100)

    Z_pca = fast_pca(Z, n_components=2, random_state=5218)
    Z_pca_drop = fast_pca(Z_drop, n_components=2, random_state=5218)
    if has_reconstruction:
        X_pca, X_pca_drop, W_pca, W_pca_drop = fast_pca(
            X, X_drop, W_outputs[0], W_drop_outputs[0],
            n_components=2, random_state=5218)
    # ====== downsampling ====== #
    rand = np.random.RandomState(seed=5218)
    n_test_samples = len(y)
    ids = np.arange(n_test_samples, dtype='int32')
    if n_test_samples > 8000:
        ids = rand.choice(ids, size=8000, replace=False)
    # ====== scatter configuration ====== #
    config = dict(size=6, labels=None)

    y = y[ids]
    X = X[ids]
    X_drop = X_drop[ids]
    Z_pca = Z_pca[ids]
    Z_pca_drop = Z_pca_drop[ids]
    # keep the row names aligned with the (possibly down-sampled) rows,
    # otherwise the titles of the sample plots name the wrong rows
    row_name = [row_name[i] for i in ids]
    # the PCA projections and the reconstructions only exist when the model
    # provides outputs; indexing them unconditionally made the
    # "NO reconstruction" branch below unreachable
    if has_reconstruction:
        X_pca = X_pca[ids]
        W_pca = W_pca[ids]
        X_pca_drop = X_pca_drop[ids]
        W_pca_drop = W_pca_drop[ids]
        W_outputs = [w[ids] for w in W_outputs]
        W_drop_outputs = [w[ids] for w in W_drop_outputs]
    if pi is not None:
        pi = pi[ids]
        pi_drop = pi_drop[ids]
    # ====== plotting NO reconstruction ====== #
    if not has_reconstruction:
        plot_figure(nrow=8, ncol=20)
        fast_scatter(x=Z_pca, y=y,
                     title="[PCA] Test data latent space",
                     enable_legend=True, ax=(1, 2, 1), **config)
        fast_scatter(x=Z_pca_drop, y=y,
                     title="[PCA][Dropped:%s] Test data latent space" %
                     dropout_percentage_text,
                     ax=(1, 2, 2), **config)
    # ====== plotting WITH reconstruction ====== #
    else:
        plot_figure(nrow=16, ncol=20)
        # original test data WITHOUT dropout
        fast_scatter(x=X_pca, y=y, title="[PCA][Test Data] Original",
                     ax=(2, 3, 1), **config)
        fast_scatter(x=W_pca, y=y, title="Reconstructed",
                     ax=(2, 3, 2), **config)
        fast_scatter(x=Z_pca, y=y, title="Latent space",
                     ax=(2, 3, 3), **config)
        # original test data WITH dropout
        fast_scatter(x=X_pca_drop, y=y,
                     title="[PCA][Dropped:%s][Test Data] Original" %
                     dropout_percentage_text,
                     ax=(2, 3, 4), **config)
        fast_scatter(x=W_pca_drop, y=y, title="Reconstructed",
                     ax=(2, 3, 5), enable_legend=True, **config)
        fast_scatter(x=Z_pca_drop, y=y, title="Latent space",
                     ax=(2, 3, 6), **config)
    # format the file name only, so a '%' in `save_dir` cannot break it
    plot_save(os.path.join(save_dir, 'latent_epoch%d.png' % curr_epoch),
              dpi=180, clear_all=True, log=True)
    # ====== plot count-sum ====== #
    if has_reconstruction:
        X_countsum = _clip_count_sum(np.sum(X, axis=-1))
        W_countsum = _clip_count_sum(np.sum(W_outputs[0], axis=-1))
        X_drop_countsum = _clip_count_sum(np.sum(X_drop, axis=-1))
        W_drop_countsum = _clip_count_sum(np.sum(W_drop_outputs[0], axis=-1))
        series_config = [
            dict(xscale='linear', yscale='linear', sort_by=None),
            dict(xscale='linear', yscale='linear', sort_by='expected')
        ]
        if pi is not None:
            pi_sum = np.mean(pi, axis=-1)
            pi_drop_sum = np.mean(pi_drop, axis=-1)
        # plot the reconstruction count sum
        plot_figure(nrow=3 * 5 + 8, ncol=18)
        n_scatter_rows = 2 if pi is None else 3
        with plot_gridSpec(nrow=3 * n_scatter_rows + 4 * 3 + 1, ncol=6,
                           wspace=1.0, hspace=0.8) as grid:
            kws = dict(colorbar=True, fontsize=10, size=10, marker=y,
                       n_samples=1200)
            # without dropout
            ax = subplot(grid[:3, 0:3])
            plot_scatter(x=X_pca, val=X_countsum, ax=ax,
                         legend_enable=False,
                         title='Original data (Count-sum)', **kws)
            ax = subplot(grid[:3, 3:6])
            plot_scatter(x=W_pca, val=W_countsum, ax=ax,
                         legend_enable=False,
                         title='Reconstruction (Count-sum)', **kws)
            # with dropout
            ax = subplot(grid[3:6, 0:3])
            plot_scatter(x=X_pca_drop, val=X_drop_countsum, ax=ax,
                         legend_enable=pi is None,
                         legend_ncol=len(labels),
                         title='[Dropped:%s]Original data (Count-sum)' %
                         dropout_percentage_text,
                         **kws)
            ax = subplot(grid[3:6, 3:6])
            plot_scatter(x=W_pca_drop, val=W_drop_countsum, ax=ax,
                         legend_enable=False,
                         title='[Dropped:%s]Reconstruction (Count-sum)' %
                         dropout_percentage_text,
                         **kws)
            row_start = 6
            # zero-inflated pi
            if pi is not None:
                ax = subplot(grid[6:9, 0:3])
                plot_scatter(x=X_pca, val=pi_sum, ax=ax,
                             legend_enable=True, legend_ncol=len(labels),
                             title='Zero-inflated probabilities', **kws)
                ax = subplot(grid[6:9, 3:6])
                plot_scatter(x=X_pca, val=pi_drop_sum, ax=ax,
                             legend_enable=False,
                             title='[Dropped:%s]Zero-inflated probabilities' %
                             dropout_percentage_text,
                             **kws)
                row_start += 3

            # plot the count-sum series
            def plot_count_sum_series(x, w, p, row_start, tit):
                """Draw one row of count-sum series starting at `row_start`."""
                if len(w) != 3:  # no statistics provided
                    return
                expected, stdev_total, stdev_explained = w
                count_sum_observed = np.sum(x, axis=0)
                count_sum_expected = np.sum(expected, axis=0)
                count_sum_stdev_total = np.sum(stdev_total, axis=0)
                count_sum_stdev_explained = np.sum(stdev_explained, axis=0)
                if p is not None:
                    p_sum = np.mean(p, axis=0)
                for i, series_kws in enumerate(series_config):
                    ax = subplot(grid[row_start:row_start + 3,
                                      (i * 3):(i * 3 + 3)])
                    ax, handles, indices = plot_series_statistics(
                        count_sum_observed, count_sum_expected,
                        explained_stdev=count_sum_stdev_explained,
                        total_stdev=count_sum_stdev_total,
                        fontsize=8, ax=ax, legend_enable=False,
                        title=tit if i == 0 else None,
                        despine=p is None,
                        return_handles=True, return_indices=True,
                        **series_kws)
                    if p is not None:
                        _show_zero_inflated_pi(p_sum, ax, handles, indices)
                    plt.legend(handles=handles, loc='best',
                               markerscale=4, fontsize=8)

            # every series block is 3 rows high followed by 1 padding row
            plot_count_sum_series(
                x=X, w=W_outputs, p=pi, row_start=row_start,
                tit="Count-sum X_original - W_original")
            plot_count_sum_series(
                x=X_drop, w=W_drop_outputs, p=pi_drop,
                row_start=row_start + 4,
                tit="[Dropped:%s]Count-sum X_drop - W_dropout" %
                dropout_percentage_text)
            plot_count_sum_series(
                x=X, w=W_drop_outputs, p=pi_drop,
                row_start=row_start + 8,
                tit="[Dropped:%s]Count-sum X_original - W_dropout" %
                dropout_percentage_text)
        plot_save(os.path.join(save_dir, 'countsum_epoch%d.png' % curr_epoch),
                  dpi=180, clear_all=True, log=True)
    # ====== plot series of samples ====== #
    if has_reconstruction and len(W_outputs) == 3:
        # NOTE: turn off pi here
        pi = None
        n_visual_samples = 8
        plot_figure(nrow=3 * n_visual_samples + 8, ncol=25)
        col_width = 5
        # the W_dropout comparison occupies the right half of the grid
        col_start = col_width * 2
        with plot_gridSpec(nrow=3 * n_visual_samples, ncol=4 * col_width,
                           wspace=5.0, hspace=1.0) as grid:
            curr_grid_index = 0
            for i in rand.permutation(len(X))[:n_visual_samples]:
                observed = X[i]
                # same order as documented at the top of the function:
                # [W, W_stdev_total, W_stdev_explained]
                expected, stdev_total, stdev_explained = [
                    w[i] for w in W_outputs
                ]
                expected_drop, stdev_total_drop, stdev_explained_drop = [
                    w[i] for w in W_drop_outputs
                ]
                if pi is not None:
                    p_zi = pi[i]
                    p_zi_drop = pi_drop[i]
                rows = slice(curr_grid_index, curr_grid_index + 3)
                # compare to W_original
                for j, series_kws in enumerate(series_config):
                    ax = subplot(
                        grid[rows, (j * col_width):(j * col_width + col_width)])
                    ax, handles, indices = plot_series_statistics(
                        observed, expected,
                        explained_stdev=stdev_explained,
                        total_stdev=stdev_total,
                        fontsize=8, legend_enable=False,
                        despine=pi is None,
                        title=("'%s' X_original - W_original" % row_name[i])
                        if j == 0 else None,
                        return_handles=True, return_indices=True,
                        **series_kws)
                    if pi is not None:
                        _show_zero_inflated_pi(p_zi, ax, handles, indices)
                    plt.legend(handles=handles, loc='best',
                               markerscale=4, fontsize=8)
                # compare to W_dropout
                for j, series_kws in enumerate(series_config):
                    ax = subplot(
                        grid[rows,
                             (col_start + j * col_width):
                             (col_start + j * col_width + col_width)])
                    ax, handles, indices = plot_series_statistics(
                        observed, expected_drop,
                        explained_stdev=stdev_explained_drop,
                        total_stdev=stdev_total_drop,
                        fontsize=8, legend_enable=False,
                        despine=pi is None,
                        title=("[Dropped:%s]'%s' X_original - W_dropout" %
                               (dropout_percentage_text, row_name[i]))
                        if j == 0 else None,
                        return_handles=True, return_indices=True,
                        **series_kws)
                    if pi is not None:
                        _show_zero_inflated_pi(p_zi_drop, ax, handles, indices)
                    plt.legend(handles=handles, loc='best',
                               markerscale=4, fontsize=8)
                curr_grid_index += 3
        plot_save(os.path.join(save_dir, 'samples_epoch%d.png' % curr_epoch),
                  dpi=180, clear_all=True, log=True)
    # ====== special case for mnist ====== #
    if 'mnist' in ds_name and has_reconstruction:
        plot_figure(nrow=3, ncol=18)
        # cannot draw more images than available without replacement
        n_images = min(32, X.shape[0])
        ids = rand.choice(np.arange(X.shape[0], dtype='int32'),
                          size=n_images, replace=False)
        meta_data = [("Org", X[ids]),
                     ("Rec", W_outputs[0][ids]),
                     ("OrgDropout", X_drop[ids]),
                     ("RecDropout", W_drop_outputs[0][ids])]
        count = 1
        for name, data in meta_data:
            for i in range(n_images):
                x = data[i].reshape(28, 28)
                plt.subplot(4, n_images, count)
                show_image(x)
                if i == 0:
                    plt.ylabel(name, fontsize=8)
                count += 1
        plt.subplots_adjust(wspace=0.05, hspace=0.05)
        plot_save(os.path.join(save_dir, 'image_epoch%d.png' % curr_epoch),
                  dpi=180, clear_all=True, log=True)
# Regressor for name, classifier in [ # ('SVRrbf', MultiOutputRegressor(SVR(kernel='rbf'))), ('Elastic', MultiOutputRegressor(ElasticNetCV(random_state=random_state))), ]: print("Testing Regressor:", ctext(name, 'cyan')) classifier.fit(X=np.concatenate([Z_train, Z_valid], axis=0), y=np.concatenate([y_prot_train, y_prot_valid], axis=0)) y_prot_test_pred = classifier.predict(Z_test) plot_evaluate_regressor(y_pred=y_prot_test_pred, y_true=y_prot_test, labels=y_prot_names, title='[%s]%s-%s' % (transformer_name, 'Regression', name)) # Reconstruction # for name, classifier in [ # ('MLP', MLPRegressor(hidden_layer_sizes=(512, 512, 512), verbose=True, # random_state=random_state)), # ]: # print("Testing Reconstruction:", ctext(name, 'cyan')) # classifier.fit(X=np.concatenate([Z_train, Z_valid], axis=0), # y=np.concatenate([X_train, X_valid], axis=0)) # X_test_pred = classifier.predict(Z_test) # plot_evaluate_reconstruction(X=X_test, X_res=X_test_pred, # gene_name=gene_name, # title='[%s]%s-%s' % (transformer_name, 'Reconstruction', name)) # ====== save all figure ====== # V.plot_save( os.path.join(EXP_DIR, 'baseline_%s.png' % transformer_name.lower()))
nllk = tf.reduce_mean(-y_pred.log_prob(y_true)) return nllk mdn = MixtureDensityNetwork(1, n_components=n_components, covariance_type='none') model = Sequential([mdn]) model.compile(optimizer='adam', loss=fn_loss) model.fit(x=x, y=x, epochs=48, batch_size=32, verbose=True) y = model(x) mdn_llk = tf.reduce_mean(y.log_prob(x)).numpy() mdn_mean = tf.reduce_mean(y.components_distribution.mean(), axis=(0, -1)).numpy() # ====== visualizing ====== # fig = plt.figure() sns.distplot(x, bins=80) plt.title('Data') fig = plt.figure() sns.distplot(gmm.sample(n * n_components)[0], bins=80) plt.title('GMM - llk: %.2f' % gmm_llk) fig = plt.figure() sns.distplot(y.sample().numpy(), bins=80) plt.title('MDN - llk: %.2f' % mdn_llk) vis.plot_save()
pp.base.EqualizeShape0(input_name=('mspec', 'mfcc', 'sdc', 'bnf', 'energy', 'sad')), pp.base.AsType(dtype='float16'), ], debug=args.debug) # ====== enable debug mode ====== # if args.debug: with np.warnings.catch_warnings(): np.warnings.filterwarnings('ignore') for i, name in enumerate(all_files[:12]): tmp = extractors.transform(name) if isinstance(tmp, pp.base.ExtractorSignal): print(tmp) exit() else: V.plot_multiple_features(tmp, title=name) V.plot_save(os.path.join(PATH_EXP, 'feature_debug.pdf')) exit() # =========================================================================== # Processor # =========================================================================== with np.warnings.catch_warnings(): np.warnings.filterwarnings('ignore') processor = pp.FeatureProcessor(jobs=all_files, path=PATH_ACOUSTIC, extractor=extractors, n_cache=0.12, ncpu =min(18, cpu_count() - 2) if args.ncpu <= 0 else int(args.ncpu), override=True, identifier='name', log_path=os.path.join(PATH_EXP, 'processor.log'), stop_on_failure=True # small dataset, enable stop on failure
def evaluate(y_true, y_pred_proba=None, y_pred_log_proba=None,
             labels=None, title='', path=None,
             xlims=None, ylims=None, print_log=True):
    """Score classification predictions, print a report and plot it.

    The predictions are converted with `to_llr`, then log-loss (only when
    `y_pred_proba` is given), accuracy, the confusion matrix, C_norm, EER
    and minDCF are computed. When `path` is given, the confusion matrix,
    the C_norm plot and the ROC/DET curves are drawn and saved with
    `plot_save(path)`.

    Parameters
    ----------
    y_true : Data, tuple, list or array
        Ground truth; a 2-D (one-hot) array is reduced by `argmax` over the
        last axis.
    y_pred_proba, y_pred_log_proba : array or None
        Predictions; at least one must be given. `y_pred_log_proba` takes
        precedence for the `to_llr` conversion.
    labels : sequence of str or None
        Class names; defaults to ``'0' .. str(nb_classes - 1)``.
    title : str
        Printed in the header of the report.
    path : str or None
        Output path of the figures; nothing is plotted when `None`.
    xlims, ylims
        Forwarded to `plot_detection_curve` for the DET curves.
    print_log : bool
        Whether the report is printed.

    Raises
    ------
    ValueError
        If both `y_pred_proba` and `y_pred_log_proba` are `None`.
    """
    from odin.backend import to_llr
    from odin.backend.metrics import (det_curve, compute_EER, roc_curve,
                                      compute_Cnorm, compute_minDCF)

    def format_score(s):
        return ctext('%.4f' % s if is_number(s) else s, 'yellow')

    # prior probabilities shared by the C_norm computation and its plot
    ptrue = [0.1, 0.5]
    # ====== check y_pred ====== #
    if y_pred_proba is None and y_pred_log_proba is None:
        raise ValueError("At least one of `y_pred_proba` or `y_pred_log_proba` "
                         "must not be None")
    if y_pred_log_proba is None:
        y_pred_llr = to_llr(y_pred_proba)
    else:
        y_pred_llr = to_llr(y_pred_log_proba)
    nb_classes = y_pred_llr.shape[1]
    y_pred = np.argmax(y_pred_llr, axis=-1)
    # ====== check y_true ====== #
    if isinstance(y_true, Data):
        y_true = y_true.array
    if isinstance(y_true, (tuple, list)):
        y_true = np.array(y_true)
    if y_true.ndim == 2:  # convert one-hot to labels
        y_true = np.argmax(y_true, axis=-1)
    # ====== check labels ====== #
    if labels is None:
        labels = [str(i) for i in range(nb_classes)]
    # ====== scoring ====== #
    if y_pred_proba is None:
        ll = 'unknown'
    else:
        ll = log_loss(y_true=y_true, y_pred=y_pred_proba)
    acc = accuracy_score(y_true=y_true, y_pred=y_pred)
    cm = confusion_matrix(y_true=y_true, y_pred=y_pred)
    # C_norm
    cnorm, cnorm_arr = compute_Cnorm(y_true=y_true,
                                     y_score=y_pred_llr,
                                     Ptrue=ptrue,
                                     probability_input=False)
    if y_pred_log_proba is not None:
        cnorm_, cnorm_arr_ = compute_Cnorm(y_true=y_true,
                                           y_score=y_pred_log_proba,
                                           Ptrue=ptrue,
                                           probability_input=False)
        if np.mean(cnorm) > np.mean(cnorm_):  # smaller is better
            cnorm, cnorm_arr = cnorm_, cnorm_arr_
    # DET
    Pfa, Pmiss = det_curve(y_true=y_true, y_score=y_pred_llr)
    eer = compute_EER(Pfa=Pfa, Pmiss=Pmiss)
    minDCF = compute_minDCF(Pfa, Pmiss)[0]
    # PRINT LOG
    if print_log:
        print(ctext("--------", 'red'), ctext(title, 'cyan'))
        print("Log loss :", format_score(ll))
        print("Accuracy :", format_score(acc))
        print("C_norm   :", format_score(np.mean(cnorm)))
        print("EER      :", format_score(eer))
        print("minDCF   :", format_score(minDCF))
        print(print_confusion(arr=cm, labels=labels))
    # ====== save report to PDF files if necessary ====== #
    if path is None:
        return
    if y_pred_proba is None:
        # no probabilities available, the ROC curves fall back to the LLR
        y_pred_proba = y_pred_llr
    from matplotlib import pyplot as plt
    plt.figure(figsize=(nb_classes, nb_classes + 1))
    plot_confusion_matrix(cm, labels)
    # Cavg
    plt.figure(figsize=(nb_classes + 1, 3))
    plot_Cnorm(cnorm=cnorm_arr, labels=labels, Ptrue=ptrue,
               fontsize=14)
    # binary classification
    is_single_score = y_pred_proba.ndim == 1 or \
        (y_pred_proba.ndim == 2 and y_pred_proba.shape[1] == 1)
    if nb_classes == 2 and is_single_score:
        # `roc_curve` is unpacked into three values at the other call site
        # of this function, so only take the first two instead of
        # unpacking the whole result into two names
        fpr, tpr = roc_curve(y_true=y_true,
                             y_score=y_pred_proba.ravel())[:2]
        # det curve
        plt.figure()
        plot_detection_curve(Pfa, Pmiss, curve='det',
                             xlims=xlims, ylims=ylims, linewidth=1.2)
        # roc curve
        plt.figure()
        plot_detection_curve(fpr, tpr, curve='roc')
    # multiclasses
    else:
        y_true = one_hot(y_true, nb_classes=nb_classes)
        fpr_micro, tpr_micro, _ = roc_curve(y_true=y_true.ravel(),
                                            y_score=y_pred_proba.ravel())
        # the DET curve computed above is reported as the "micro" curve
        Pfa_micro, Pmiss_micro = Pfa, Pmiss
        fpr, tpr = [], []
        Pfa, Pmiss = [], []
        # one-vs-rest curves, one per class (column of the one-hot matrix)
        for i, yi in enumerate(y_true.T):
            curve = roc_curve(y_true=yi, y_score=y_pred_proba[:, i])
            fpr.append(curve[0])
            tpr.append(curve[1])
            curve = det_curve(y_true=yi, y_score=y_pred_llr[:, i])
            Pfa.append(curve[0])
            Pmiss.append(curve[1])
        plt.figure()
        plot_detection_curve(fpr_micro, tpr_micro, curve='roc',
                             linewidth=1.2, title="ROC Micro")
        plt.figure()
        plot_detection_curve(fpr, tpr, curve='roc',
                             labels=labels, linewidth=1.0,
                             title="ROC for each classes")
        plt.figure()
        plot_detection_curve(Pfa_micro, Pmiss_micro, curve='det',
                             xlims=xlims, ylims=ylims, linewidth=1.2,
                             title="DET Micro")
        plt.figure()
        plot_detection_curve(Pfa, Pmiss, curve='det',
                             xlims=xlims, ylims=ylims,
                             labels=labels, linewidth=1.0,
                             title="DET for each classes")
    plot_save(path)
ncpu=NCPU, batch=1): if np.random.rand() > 0.8: feat = { i: j[:1200] if isinstance(j, np.ndarray) else j for i, j in feat.items() } V.plot_multiple_features(feat, fig_width=20, title=feat['name']) prog['name'] = feat['name'][:48] prog['dsname'] = feat['dsname'] prog['dsnoise'] = feat['dsnoise'] prog.add(1) V.plot_save( os.path.join( EXP_DIR, 'debug_%s_%s.pdf' % (FEATURE_RECIPE, AUGMENTATION_NAME))) # ====== save the extractor debugging log ====== # pp.set_extractor_debug(recipe, debug=True) recipe.transform(AUG_FILES[0]) with open( os.path.join( EXP_DIR, 'debug_%s_%s.log' % (FEATURE_RECIPE, AUGMENTATION_NAME)), 'w') as f: for name, step in recipe.steps: f.write(step.last_debugging_text) exit() # =========================================================================== # Run the processor # ===========================================================================
with catch_warnings_ignore(Warning): n_samples = 120 prog = Progbar(target=n_samples, print_summary=True, name='Debugging Augmentation') for feat in mpi.MPI(jobs=AUG_FILES[:n_samples], func=recipe.transform, ncpu=NCPU, batch=1): if np.random.rand() > 0.8: feat = {i: j[:1200] if isinstance(j, np.ndarray) else j for i, j in feat.items()} V.plot_multiple_features(feat, fig_width=20, title=feat['name']) prog['name'] = feat['name'][:48] prog['dsname'] = feat['dsname'] prog['dsnoise'] = feat['dsnoise'] prog.add(1) V.plot_save(os.path.join(EXP_DIR, 'debug_%s_%s.pdf' % (FEATURE_RECIPE, AUGMENTATION_NAME))) # ====== save the extractor debugging log ====== # pp.set_extractor_debug(recipe, debug=True) recipe.transform(AUG_FILES[0]) with open(os.path.join(EXP_DIR, 'debug_%s_%s.log' % (FEATURE_RECIPE, AUGMENTATION_NAME)), 'w') as f: for name, step in recipe.steps: f.write(step.last_debugging_text) exit() # =========================================================================== # Run the processor # =========================================================================== # ====== basic path ====== # output_dataset_path = os.path.join(PATH_ACOUSTIC_FEATURES, '%s_%s' % (FEATURE_RECIPE, AUGMENTATION_NAME)) processor_log_path = os.path.join(EXP_DIR,
if isinstance(feat, pp.base.ExtractorSignal): error_signal.append(feat) prog.add(1) continue prog['spkid'] = feat['spkid'] prog['name'] = feat['name'] prog['dsname'] = feat['dsname'] prog['duration'] = feat['duration'] prog.add(1) # 30% chance plotting if rand.rand() < 0.5: V.plot_multiple_features(feat, fig_width=20, title='[%s]%s' % (feat['dsname'], feat['name'])) V.plot_save(os.path.join(EXP_DIR, 'debug_%s.pdf' % FEATURE_RECIPE), dpi=30) # ====== save the extractor debugging log ====== # pp.set_extractor_debug(recipe, debug=True) recipe.transform(samples[0]) with open(os.path.join(EXP_DIR, 'debug_%s.log' % FEATURE_RECIPE), 'w') as f: for name, step in recipe.steps: f.write(step.last_debugging_text) # ====== print error signal ====== # for e in error_signal: f.write(str(e) + '\n') print(e) exit() # =========================================================================== # Running the extractor # ===========================================================================
def prepare_dnn_data(save_dir, feat_name=None,
                     utt_length=None, seq_mode=None,
                     min_dur=None, min_utt=None,
                     exclude=None, train_proportion=None,
                     return_dataset=False):
    """Build the training and validation feeders for the DNN.

    The file lists are cached as pickles inside `save_dir`
    ('filtered_files.pkl', 'train_files.pkl', 'speaker_info.pkl'). When any
    of them is missing, the utterances are filtered, optionally
    down-sampled, and split into train/validation per speaker; otherwise the
    cached split is loaded and `exclude`, `min_dur`, `min_utt` and the
    down-sampling option are not applied again.

    Parameters
    ----------
    save_dir : str
        Existing directory holding the cached file lists.
    feat_name : str or None
        Name of the feature in the dataset (default: `FEATURE_NAME`).
    utt_length, seq_mode
        Forwarded to `prepare_dnn_feeder_recipe` (defaults: `_args.utt`
        and `_args.seq`).
    min_dur, min_utt
        Forwarded to `filter_utterances` (defaults:
        `MINIMUM_UTT_DURATION` and `MINIMUM_UTT_PER_SPEAKERS`).
    exclude : str or None
        Comma separated dataset names to leave out (default:
        `_args.exclude`); 'noise' additionally removes every file whose
        name contains '/'.
    train_proportion : float or None
        When given (0 < value < 1), only this fraction of the training
        files is kept.
    return_dataset : bool
        When true, the opened dataset is appended to the returned tuple.

    Returns
    -------
    (train_feeder, valid_feeder, all_speakers) or
    (train_feeder, valid_feeder, all_speakers, ds)
    """
    assert os.path.isdir(save_dir), \
        "Path to '%s' is not a directory" % save_dir
    # ====== fall back to the module level configuration ====== #
    if feat_name is None:
        feat_name = FEATURE_NAME
    if utt_length is None:
        utt_length = int(_args.utt)
    if seq_mode is None:
        seq_mode = str(_args.seq).strip().lower()
    if min_dur is None:
        min_dur = MINIMUM_UTT_DURATION
    if min_utt is None:
        min_utt = MINIMUM_UTT_PER_SPEAKERS
    if exclude is None:
        exclude = str(_args.exclude).strip()
    print("Minimum duration: %s(s)" % ctext(min_dur, 'cyan'))
    print("Minimum utt/spk : %s(utt)" % ctext(min_utt, 'cyan'))
    # ******************** prepare dataset ******************** #
    path = os.path.join(PATH_ACOUSTIC_FEATURES, FEATURE_RECIPE)
    assert os.path.exists(path), "Cannot find acoustic dataset at path: %s" % path
    ds = F.Dataset(path=path, read_only=True)
    # NOTE: every random decision below is drawn from this single seeded
    # generator, so the order of the calls determines the resulting split
    rand = np.random.RandomState(seed=Config.SUPER_SEED)
    # ====== find the right feature ====== #
    assert feat_name in ds, "Cannot find feature with name: %s" % feat_name
    X = ds[feat_name]
    ids_name = 'indices_%s' % feat_name
    assert ids_name in ds, "Cannot find indices with name: %s" % ids_name
    # ====== basic path ====== #
    path_filtered_data = os.path.join(save_dir, 'filtered_files.pkl')
    path_train_files = os.path.join(save_dir, 'train_files.pkl')
    path_speaker_info = os.path.join(save_dir, 'speaker_info.pkl')
    # ******************** cannot find cached data ******************** #
    if any(not os.path.exists(p) for p in [path_filtered_data,
                                           path_train_files,
                                           path_speaker_info]):
        # ====== exclude some dataset ====== #
        if len(exclude) > 0:
            exclude_dataset = {i: 1 for i in exclude.split(',')}
            print("* Excluded dataset:", ctext(exclude_dataset, 'cyan'))
            indices = {name: (start, end)
                       for name, (start, end) in ds[ids_name].items()
                       if ds['dsname'][name] not in exclude_dataset}
            # special case exclude all the noise data
            if 'noise' in exclude_dataset:
                indices = {name: (start, end)
                           for name, (start, end) in indices.items()
                           if '/' not in name}
        else:
            indices = {i: j for i, j in ds[ids_name].items()}
        # ====== down-sampling if necessary ====== #
        if _args.downsample > 1000:
            dataset2name = defaultdict(list)
            # ordering the indices so we sample the same set every time
            for name in sorted(indices.keys()):
                dataset2name[ds['dsname'][name]].append(name)
            n_total_files = len(indices)
            n_sample_files = int(_args.downsample)
            # get the percentage of each dataset
            dataset2per = {i: len(j) / n_total_files
                           for i, j in dataset2name.items()}
            # sampling based on percentage
            _ = {}
            for dsname, flist in dataset2name.items():
                rand.shuffle(flist)
                n_dataset_files = int(dataset2per[dsname] * n_sample_files)
                _.update({i: indices[i]
                          for i in flist[:n_dataset_files]})
            indices = _
        # ====== * filter out "bad" sample ====== #
        # the filtered result is also stored at `path_filtered_data`
        indices = filter_utterances(X=X, indices=indices, spkid=ds['spkid'],
                                    min_utt=min_utt, min_dur=min_dur,
                                    remove_min_length=True,
                                    remove_min_uttspk=True,
                                    n_speakers=None, ncpu=None,
                                    save_path=path_filtered_data)
        # ====== all training file name ====== #
        # modify here to train full dataset
        all_name = sorted(indices.keys())
        # shuffled twice on purpose: removing one call changes the state of
        # `rand` and therefore the split
        rand.shuffle(all_name); rand.shuffle(all_name)
        n_files = len(all_name)
        print("#Files:", ctext(n_files, 'cyan'))
        # ====== speaker mapping ====== #
        name2spk = {name: ds['spkid'][name]
                    for name in all_name}
        all_speakers = sorted(set(name2spk.values()))
        spk2label = {spk: i
                     for i, spk in enumerate(all_speakers)}
        name2label = {name: spk2label[spk]
                      for name, spk in name2spk.items()}
        assert len(name2label) == len(all_name)
        print("#Speakers:", ctext(len(all_speakers), 'cyan'))
        # ====== stratify sampling based on speaker ====== #
        valid_name = []
        # create speakers' cluster
        label2name = defaultdict(list)
        for name, label in sorted(name2label.items(),
                                  key=lambda x: x[0]):
            label2name[label].append(name)
        # for each speaker with >= 3 utterance
        for label, name_list in sorted(label2name.items(),
                                       key=lambda x: x[0]):
            if len(name_list) < 3:
                continue
            n = max(1, int(0.05 * len(name_list)))  # 5% for validation
            valid_name += rand.choice(a=name_list, size=n, replace=False).tolist()
        # train list is the rest
        _ = set(valid_name)
        train_name = [i for i in all_name if i not in _]
        # ====== split training and validation ====== #
        train_indices = {name: indices[name] for name in train_name}
        valid_indices = {name: indices[name] for name in valid_name}
        # ====== save cached data ====== #
        with open(path_train_files, 'wb') as fout:
            pickle.dump({'train': train_indices,
                         'valid': valid_indices},
                        fout)
        with open(path_speaker_info, 'wb') as fout:
            pickle.dump({'all_speakers': all_speakers,
                         'name2label': name2label,
                         'spk2label': spk2label},
                        fout)
    # ******************** load cached data ******************** #
    else:
        # these pickles are the ones written by the branch above
        with open(path_train_files, 'rb') as fin:
            obj = pickle.load(fin)
            train_indices = obj['train']
            valid_indices = obj['valid']
        with open(path_speaker_info, 'rb') as fin:
            obj = pickle.load(fin)
            all_speakers = obj['all_speakers']
            name2label = obj['name2label']
            spk2label = obj['spk2label']

    # ******************** print log ******************** #
    def summary_indices(ids):
        # Print, per dataset, the number of utterances and distinct
        # speakers, then an MD5 checksum over the sorted names and indices.
        datasets = defaultdict(int)
        speakers = defaultdict(list)
        text = ''
        for name in sorted(ids.keys()):
            text += name + str(ids[name])
            dsname = ds['dsname'][name]
            datasets[dsname] += 1
            speakers[dsname].append(ds['spkid'][name])
        for dsname in sorted(datasets.keys()):
            print('  %-18s: %s(utt) %s(spk)' % (
                dsname,
                ctext('%6d' % datasets[dsname], 'cyan'),
                ctext(len(set(speakers[dsname])), 'cyan')))
        print('  MD5 checksum:', ctext(crypto.md5_checksum(text), 'lightcyan'))
    # ====== training files ====== #
    # files whose name contains '/' are counted as noise files
    print("#Train files:", ctext('%-8d' % len(train_indices), 'cyan'),
          "#spk:", ctext(len(set(name2label[name]
                                 for name in train_indices.keys())), 'cyan'),
          "#noise:", ctext(len([name for name in train_indices.keys()
                                if '/' in name]), 'cyan'))
    summary_indices(ids=train_indices)
    # ====== valid files ====== #
    print("#Valid files:", ctext('%-8d' % len(valid_indices), 'cyan'),
          "#spk:", ctext(len(set(name2label[name]
                                 for name in valid_indices.keys())), 'cyan'),
          "#noise:", ctext(len([name for name in valid_indices.keys()
                                if '/' in name]), 'cyan'))
    summary_indices(ids=valid_indices)
    # ******************** create the recipe ******************** #
    assert all(name in name2label for name in train_indices.keys())
    assert all(name in name2label for name in valid_indices.keys())
    recipes = prepare_dnn_feeder_recipe(name2label=name2label,
                                        n_speakers=len(all_speakers),
                                        utt_length=utt_length,
                                        seq_mode=seq_mode)
    # ====== downsample training set for analyzing if required ====== #
    if train_proportion is not None:
        assert 0 < train_proportion < 1
        n_training = len(train_indices)
        train_indices = list(train_indices.items())
        rand.shuffle(train_indices); rand.shuffle(train_indices)
        train_indices = dict(train_indices[:int(n_training * train_proportion)])
    # ====== create feeder ====== #
    train_feeder = F.Feeder(
        data_desc=F.IndexedData(data=X, indices=train_indices),
        batch_mode='batch', ncpu=NCPU, buffer_size=256)
    valid_feeder = F.Feeder(
        data_desc=F.IndexedData(data=X, indices=valid_indices),
        batch_mode='batch', ncpu=max(2, NCPU // 4), buffer_size=64)
    train_feeder.set_recipes(recipes)
    valid_feeder.set_recipes(recipes)
    print(train_feeder)
    print(valid_feeder)
    # ====== debugging ====== #
    # Iterate the validation set once, plot a few random spectrograms into
    # '/tmp/tmp.pdf' and terminate the process.
    if IS_DEBUGGING:
        import matplotlib
        matplotlib.use('Agg')
        prog = Progbar(target=len(valid_feeder), print_summary=True,
                       name="Iterating validation set")
        samples = []
        n_visual = 250
        # NOTE: `X` and `y` are rebound here; harmless only because this
        # branch always ends with `exit()`
        for name, idx, X, y in valid_feeder.set_batch(batch_size=100000,
                                                      batch_mode='file',
                                                      seed=None,
                                                      shuffle_level=0):
            assert idx == 0, "Utterances longer than %.2f(sec)" % (100000 * Config.STEP_LENGTH)
            prog['X'] = X.shape
            prog['y'] = y.shape
            prog.add(X.shape[0])
            # random sampling
            if rand.rand(1) < 0.5 and len(samples) < n_visual:
                for i in rand.randint(0, X.shape[0], size=4, dtype='int32'):
                    samples.append((name, X[i], np.argmax(y[i], axis=-1)))
        # plot the spectrogram
        n_visual = len(samples)
        V.plot_figure(nrow=n_visual, ncol=8)
        for i, (name, X, y) in enumerate(samples):
            is_noise = '/' in name
            assert name2label[name] == y, "Speaker label mismatch for file: %s" % name
            # the part before '/' is used to look up 'dsname' and 'spkid'
            name = name.split('/')[0]
            dsname = ds['dsname'][name]
            spkid = ds['spkid'][name]
            # NOTE(review): `y` was already reduced by argmax when the
            # sample was stored, so this second argmax looks redundant
            y = np.argmax(y, axis=-1)
            ax = V.plot_spectrogram(X.T,
                                    ax=(n_visual, 1, i + 1),
                                    title='#%d' % (i + 1))
            ax.set_title('[%s][%s]%s  %s' %
                         ('noise' if is_noise else 'clean', dsname, name, spkid),
                         fontsize=6)
        # don't need to be high resolutions
        V.plot_save('/tmp/tmp.pdf', dpi=12)
        exit()
    # ====== return ====== #
    if bool(return_dataset):
        return train_feeder, valid_feeder, all_speakers, ds
    return train_feeder, valid_feeder, all_speakers
with catch_warnings_ignore(RuntimeWarning), catch_warnings_ignore(FutureWarning):
    # ====== gather the statistics of every dataset in parallel ====== #
    data_map = {}
    stats_map = {}
    spk_map = {}
    for result in mpi.MPI(jobs=all_dataset, func=dataset_statistics,
                          ncpu=None, batch=1):
        dsname, text, data, stats, spk_stats = result
        data_map[dsname] = data
        stats_map[dsname] = stats
        spk_map[dsname] = spk_stats
        print(text)
    # ====== one row of histograms per dataset ====== #
    # (title, selector) of each panel, in plotting order; the selectors are
    # evaluated lazily, right before the panel is drawn
    histogram_panels = (
        ("Duration", lambda d: d[0]),
        ("Dur/Spk", lambda d: d[1]['sum_per_spk']),
        ("#Utt/Spk", lambda d: d[1]['nutt_per_spk']),
    )
    for dsname in all_dataset:
        print("Plotting ...", ctext(dsname, 'cyan'))
        data = data_map[dsname]
        V.plot_figure(nrow=2, ncol=20)
        for column, (panel_title, select) in enumerate(histogram_panels,
                                                       start=1):
            ax = plt.subplot(1, n_col, column)
            plot_histogram(select(data), ax, title=panel_title)
        plt.suptitle(dsname, fontsize=8)
    # ====== mean and standard deviation across datasets ====== #
    plot_mean_std(_map=stats_map, title='Data')
    plot_mean_std(_map=spk_map, title='Speaker')
    V.plot_save(figure_path, dpi=32)
def evaluate(vae,
             ds,
             expdir: str,
             title: str,
             batch_size: int = 32,
             seed: int = 1):
    """Evaluate a semi-supervised VAE and export reports to `expdir`.

    The train split is encoded and used to fit `LogisticRegression`
    classifiers at several label proportions; the test split provides the
    log-likelihoods, reconstructions, latent scatter plots and the
    latent-factor plots.

    Files written: ``{expdir}/arrays.npz``, ``{expdir}/results.txt`` and
    ``{expdir}/analysis.pdf``.

    Parameters
    ----------
    vae : model providing `encode(x, training=False)`, being callable as
        `vae(x, training=False)` and exposing a `step` variable.
    ds : dataset object providing `name`, `labels` and `create_dataset`.
    expdir : output directory, created when missing.
    title : used for progress bars and figure titles.
    batch_size : batch size of both dataset iterators.
    seed : seed of the local `RandomState` used for shuffling and sampling.

    Raises
    ------
    ValueError
        If `ds.name` is not one of the supported datasets.
    """
    from odin.bay.vi import Correlation
    rand = np.random.RandomState(seed=seed)
    os.makedirs(expdir, exist_ok=True)
    tanh = ds.name.lower() == 'celeba'
    normalize = 'tanh' if tanh else 'probs'
    ## data for training semi-supervised
    # careful don't allow any data leakage!
    train = ds.create_dataset('train',
                              batch_size=batch_size,
                              label_percent=True,
                              shuffle=False,
                              normalize=normalize)
    data = [(vae.encode(x, training=False), y)
            for x, y in tqdm(train, desc=title)]
    x_semi_train = tf.concat(
        [tf.concat([i.mean(), _ymean(j)], axis=1) for (i, j), _ in data],
        axis=0).numpy()
    y_semi_train = tf.concat([i for _, i in data], axis=0).numpy()
    # shuffle
    ids = rand.permutation(x_semi_train.shape[0])
    x_semi_train = x_semi_train[ids]
    y_semi_train = y_semi_train[ids]
    ## data for testing
    test = ds.create_dataset('test',
                             batch_size=batch_size,
                             label_percent=True,
                             shuffle=False,
                             normalize=normalize)
    prog = tqdm(test, desc=title)
    llk_x = []
    llk_y = []
    z = []
    y_true = []
    y_pred = []
    x_org, x_rec = [], []
    for x, y in prog:
        px, (qz, qy) = vae(x, training=False)
        y_true.append(y)
        y_pred.append(_ymean(qy))
        z.append(qz.mean())
        llk_x.append(px.log_prob(x))
        llk_y.append(qy.log_prob(y))
        # keep the first two batches, afterwards a random 0.5% of batches
        if rand.uniform() < 0.005 or len(x_org) < 2:
            x_org.append(x)
            x_rec.append(px.mean())
    ## llk
    llk_x = tf.reduce_mean(tf.concat(llk_x, axis=0)).numpy()
    llk_y = tf.reduce_mean(tf.concat(llk_y, axis=0)).numpy()
    ## the latents
    z = tf.concat(z, axis=0).numpy()
    y_true = tf.concat(y_true, axis=0).numpy()
    y_pred = tf.concat(y_pred, axis=0).numpy()
    x_semi_test = tf.concat([z, y_pred], axis=-1).numpy()
    # shuffle
    ids = rand.permutation(z.shape[0])
    z = z[ids]
    y_true = y_true[ids]
    y_pred = y_pred[ids]
    x_semi_test = x_semi_test[ids]
    ## saving reconstruction images
    x_org = tf.concat(x_org, axis=0).numpy()
    x_rec = tf.concat(x_rec, axis=0).numpy()
    ids = rand.permutation(x_org.shape[0])
    x_org = x_org[ids][:36]
    x_rec = x_rec[ids][:36]
    # fewer than 36 images may have been collected, so use the real count
    n_images = x_rec.shape[0]
    flat_rec = x_rec.reshape((n_images, -1))
    vmin = flat_rec.min(axis=1).reshape((n_images, 1, 1, 1))
    vmax = flat_rec.max(axis=1).reshape((n_images, 1, 1, 1))
    if tanh:
        x_org = (x_org + 1.) / 2.
    # per-image min-max scaling; a constant image keeps a unit divisor
    span = vmax - vmin
    span = np.where(span > 0, span, np.ones_like(span))
    x_rec = (x_rec - vmin) / span
    if x_org.shape[-1] == 1:  # grayscale image
        x_org = np.squeeze(x_org, -1)
        x_rec = np.squeeze(x_rec, -1)
    else:  # color image
        x_org = np.transpose(x_org, (0, 3, 1, 2))
        x_rec = np.transpose(x_rec, (0, 3, 1, 2))
    plt.figure(figsize=(15, 8))
    ax = plt.subplot(1, 2, 1)
    vs.plot_images(x_org, grids=(6, 6), ax=ax, title='Original')
    ax = plt.subplot(1, 2, 2)
    vs.plot_images(x_rec, grids=(6, 6), ax=ax, title='Reconstructed')
    plt.tight_layout()
    ## prepare the labels
    if ds.name in ('mnist', 'fashionmnist', 'celeba'):
        true = np.argmax(y_true, axis=-1)
        pred = np.argmax(y_pred, axis=-1)
        y_semi_train = np.argmax(y_semi_train, axis=-1)
        y_semi_test = true
        labels_name = ds.labels
    else:  # shapes3d dsprites
        if ds.name == 'shapes3d':
            labels_name = ['cube', 'cylinder', 'sphere', 'round']
        elif ds.name == 'dsprites':
            labels_name = ['square', 'ellipse', 'heart']
        else:
            raise ValueError(f'No support for dataset with name: {ds.name}')
        # the column at index 2 is used as the class label
        true = y_true[:, 2].astype(np.int32)
        pred = y_pred[:, 2].astype(np.int32)
        y_semi_train = y_semi_train[:, 2].astype(np.int32)
        y_semi_test = true
    plt.figure(figsize=(8, 8))
    vs.plot_confusion_matrix(cm=confusion_matrix(y_true=true, y_pred=pred),
                             labels=labels_name,
                             cbar=True,
                             fontsize=10,
                             title=title)
    labels = np.array([labels_name[i] for i in true])
    labels_pred = np.array([labels_name[i] for i in pred])
    ## save arrays for later inspection
    np.savez_compressed(f'{expdir}/arrays',
                        x_train=x_semi_train,
                        y_train=y_semi_train,
                        x_test=x_semi_test,
                        y_test=y_semi_test,
                        zdim=z.shape[1],
                        labels=labels_name)
    print(f'Export arrays to "{expdir}/arrays.npz"')
    ## semi-supervised
    with open(f'{expdir}/results.txt', 'w') as f:
        print(f'Export results to "{expdir}/results.txt"')
        f.write(f'Steps: {vae.step.numpy()}\n')
        f.write(f'llk_x: {llk_x}\n')
        f.write(f'llk_y: {llk_y}\n')
        # one classifier per proportion of labelled training examples
        for p in [0.004, 0.06, 0.2, 0.99]:
            x_train, x_test, y_train, y_test = train_test_split(
                x_semi_train,
                y_semi_train,
                train_size=int(np.round(p * x_semi_train.shape[0])),
                random_state=1,
            )
            m = LogisticRegression(max_iter=3000, random_state=1)
            m.fit(x_train, y_train)
            # write the report
            f.write(f'{m.__class__.__name__} Number of labels: '
                    f'{p} {x_train.shape[0]}/{x_test.shape[0]}')
            f.write('\nValidation:\n')
            f.write(
                classification_report(y_true=y_test,
                                      y_pred=m.predict(x_test)))
            f.write('\nTest:\n')
            f.write(
                classification_report(y_true=y_semi_test,
                                      y_pred=m.predict(x_semi_test)))
            f.write('------------\n')
    ## scatter plot
    n_points = 4000
    # tsne plot
    tsne = DimReduce.TSNE(z[:n_points], n_components=2)
    kw = dict(x=tsne[:, 0], y=tsne[:, 1], grid=False, size=12.0, alpha=0.6)
    plt.figure(figsize=(8, 8))
    vs.plot_scatter(color=labels[:n_points],
                    title=f'[True-tSNE]{title}',
                    **kw)
    plt.figure(figsize=(8, 8))
    vs.plot_scatter(color=labels_pred[:n_points],
                    title=f'[Pred-tSNE]{title}',
                    **kw)
    # pca plot
    pca = DimReduce.PCA(z, n_components=2)
    kw = dict(x=pca[:, 0], y=pca[:, 1], grid=False, size=12.0, alpha=0.6)
    plt.figure(figsize=(8, 8))
    vs.plot_scatter(color=labels, title=f'[True-PCA]{title}', **kw)
    plt.figure(figsize=(8, 8))
    vs.plot_scatter(color=labels_pred, title=f'[Pred-PCA]{title}', **kw)
    ## factors plot
    corr = (Correlation.Spearman(z, y_true) +
            Correlation.Pearson(z, y_true)) / 2.
    # for every factor (column) keep the two latents with the largest
    # absolute averaged correlation
    best_z = np.argsort(np.abs(corr), axis=0)[-2:]
    style = dict(size=15.0, alpha=0.6, grid=False)
    for fi, (z1, z2) in enumerate(best_z.T):
        plt.figure(figsize=(8, 4))
        ax = plt.subplot(1, 2, 1)
        vs.plot_scatter(x=z[:n_points, z1],
                        y=z[:n_points, z2],
                        val=y_true[:n_points, fi],
                        ax=ax,
                        title=ds.labels[fi],
                        **style)
        ax = plt.subplot(1, 2, 2)
        vs.plot_scatter(x=z[:n_points, z1],
                        y=z[:n_points, z2],
                        val=y_pred[:n_points, fi],
                        ax=ax,
                        title=ds.labels[fi],
                        **style)
        plt.tight_layout()
    ## save all plot
    vs.plot_save(f'{expdir}/analysis.pdf', dpi=180, verbose=True)
# Each entry: (subplot position or None to reuse the current axes,
#              SemafoVAE values, VAE baseline value, panel title).
metric_panels = (
    (None, llk, -3464.40, 'Test log-likelihood'),
    (2, fid, 74.57, 'FID'),
    (3, dci, 64.82, 'DCI'),
)
for position, values, baseline, panel_title in metric_panels:
    if position is not None:
        plt.subplot(1, 3, position)
    plt.plot(py, values, label='SemafoVAE')
    # horizontal reference line spanning the whole supervision range
    plt.plot([py[0], py[-1]], [baseline, baseline],
             label='VAE baseline',
             color='r')
    plt.gca().set_xscale('log')
    plt.xticks(py, [str(i) for i in py], rotation=-30)
    plt.legend(fontsize=8)
    plt.xlabel('Supervision rate')
    plt.title(panel_title)
plt.tight_layout()
vs.plot_save(verbose=True)
from odin import visual as vs

# TensorFlow settings passed through environment variables
os.environ.update({
    'TF_CPP_MIN_LOG_LEVEL': '3',
    'TF_FORCE_GPU_ALLOW_GROWTH': 'true',
})

tf.random.set_seed(8)
np.random.seed(8)

X, y = load_digits(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# each call returns the embedded train set at [0] and test set at [1]
X_umap = ml.fast_umap(X_train, X_test)
X_tsne = ml.fast_tsne(X_train, X_test)
X_pca = ml.fast_pca(X_train, X_test, n_components=2)

styles = dict(size=12, alpha=0.6, centroids=True)
# one figure per method: train on the left, test on the right
for embedding in (X_pca, X_tsne, X_umap):
    vs.plot_figure(6, 12)
    vs.plot_scatter(x=embedding[0], color=y_train, ax=(1, 2, 1), **styles)
    vs.plot_scatter(x=embedding[1], color=y_test, ax=(1, 2, 2), **styles)
vs.plot_save()
from sklearn.manifold import TSNE
from odin.utils import UnitTimer, TemporaryDirectory

iris = F.load_iris()
print(iris)
pca = MiniBatchPCA()
X = iris['X'][:]
# fit incrementally, 20 rows at a time
for offset in range(0, X.shape[0], 20):
    pca.partial_fit(X[offset:offset + 20])
print("Fitting PCA ...")
# time 8 repetitions of the plain transform ...
with UnitTimer():
    for _ in range(8):
        x = pca.transform(X)
# ... and 8 repetitions of the MPI transform keeping 2 components
with UnitTimer():
    for _ in range(8):
        x = pca.transform_mpi(X, keep_order=True, ncpu=1, n_components=2)
print("Output shape:", x.shape)
# label 0 -> red, label 1 -> blue, anything else -> green
colors = []
for label in iris['y'][:]:
    if label == 0:
        colors.append('r')
    elif label == 1:
        colors.append('b')
    else:
        colors.append('g')
visual.plot_scatter(x[:, 0], x[:, 1], color=colors, size=8)
visual.plot_save('/tmp/tmp.pdf')
def validate_features(ds_or_processor,
                      path,
                      nb_samples=25,
                      override=False,
                      seed=12082518,
                      fig_width=4):
    """Run consistency checks on a feature dataset and export sample figures.

    Checks performed (results are printed through the nested `logger`):
      * every indices table has positive-length, contiguous ranges whose
        final end equals the length of the data it indexes;
      * every `MmapDict` (except indices) only has keys found in the main
        `indices`, and `sr` values are positive;
      * no per-file feature slice or stored statistic contains NaN or is
        entirely close to zero;
      * stored `<feature>_pca` models give valid output on random slices.
    Afterwards `nb_samples` random files are plotted, one PDF per file, and
    the statistics are plotted to `stats.pdf`. While the function runs,
    stdio is redirected to `<path>/log.txt`.

    Parameters
    ----------
    ds_or_processor : FeatureProcessor, str or Dataset
        A dataset opened by this function is closed before returning; a
        `Dataset` passed in by the caller is left open.
    path : str
        Output folder for the log and the figures.
    nb_samples : int
        Number of files to plot; also the number of PCA trials and the
        number of rows used in each trial.
    override : bool
        If True, an existing `path` is removed and recreated.
    seed : int
        Seed given to `np.random.seed` before sampling the plotted files.
    fig_width : int
        Forwarded to `plot_multiple_features`.

    Raises
    ------
    ValueError
        Unsupported `ds_or_processor` type, or `path` exists and `override`
        is False.
    RuntimeError
        The dataset has no `config` entry.
    AssertionError
        An indices or dictionary check failed.
    """

    # TODO: add PCA visualization
    # TODO: update to match new indices style
    def logger(title, tag, check):
        check = bool(check)
        text_color = 'yellow' if check else 'red'
        print(ctext(' *', 'cyan'), ctext(str(title), text_color),
              ctext(str(tag), 'magenta'),
              ctext("✓" if check else "✗", text_color))

    import matplotlib
    matplotlib.use('Agg')
    from odin.visual import plot_save, plot_multiple_features
    # suffixes that mark a key as a stored statistic
    stat_suffixes = ('sum1', 'sum2', 'mean', 'std')
    # ====== check path to dataset ====== #
    should_close_ds = True
    if isinstance(ds_or_processor, FeatureProcessor):
        ds = Dataset(ds_or_processor.path, read_only=True)
    elif is_string(ds_or_processor):
        ds = Dataset(ds_or_processor, read_only=True)
    elif isinstance(ds_or_processor, Dataset):
        ds = ds_or_processor
        should_close_ds = False
    else:
        # `ds` is not bound on this branch, report the argument's type
        raise ValueError("`ds` can be None, string, or Dataset. No "
                         "support for given input type: %s" %
                         str(type(ds_or_processor)))
    prev_stdio = None
    stdio_redirected = False
    try:
        print(ctext('Validating dataset:', 'yellow'), '"%s"' % ds.path)
        # ====== extract the config of the dataset ====== #
        if 'config' not in ds:
            raise RuntimeError(
                "The `Dataset` must be generated by `FeatureProcessor` "
                "which must contain `config` MmapDict of extracted "
                "features configuration.")
        # ====== output path ====== #
        path = str(path)
        if not os.path.exists(path):
            os.mkdir(path)
        elif override:
            if os.path.isfile(path):
                os.remove(path)
            else:
                shutil.rmtree(path)
            os.mkdir(path)
        else:
            raise ValueError("`path`=%s exists, cannot override." % path)
        prev_stdio = get_stdio_path()
        stdio(path=os.path.join(path, 'log.txt'))
        stdio_redirected = True
        nb_samples = int(nb_samples)
        # ====== get all features ====== #
        all_keys = [k for k in ds.keys() if k not in ('config', 'pipeline')]
        # store all features (including the features in external indices)
        all_features = []
        # the external indices can be: indices_mfcc_bnf
        external_indices = flatten_list([
            k.split('_')[1:]
            for k in all_keys
            if 'indices' in k and k != 'indices'
        ])
        # ====== checking indices ====== #
        main_indices = {
            name: (start, end) for name, (start, end) in ds['indices'].items()
        }
        for ids_name in (k for k in all_keys if 'indices' in k):
            ids = sorted([(name, start, end)
                          for name, (start, end) in ds[ids_name].items()],
                         key=lambda x: x[1])
            assert len(ids) > 0, "Empty indices: '%s'" % ids_name
            for _, start, end in ids:
                assert end - start > 0, "Zero length in indices"
            for prev, now in zip(ids, ids[1:]):
                assert prev[2] == now[1], "Indices are not contiguous"
            # taken from the sorted list so single-entry tables work too
            last_end = ids[-1][-1]
            # final length match length of Data
            if ids_name != 'indices':
                for feat_name in ids_name.split('_')[1:]:
                    assert last_end == len(ds[feat_name]), \
                        "Indices and data length mismatch, indices:'%s' feat:'%s'" % \
                        (ids_name, feat_name)
                    all_features.append(feat_name)
            else:
                for feat_name in all_keys:
                    if feat_name not in external_indices and \
                            not feat_name.endswith(stat_suffixes) and \
                            isinstance(ds[feat_name], MmapData):
                        assert last_end == len(ds[feat_name]), \
                            "Length of indices and actual data mismatch, " + \
                            ids_name + ':' + feat_name
                        all_features.append(feat_name)
            # logging
            logger("Checked all:", ids_name, True)
        # ====== check all dictionary types ====== #
        for name in all_keys:
            if isinstance(ds[name], MmapDict) and 'indices' not in name:
                for key, val in ds[name].items():
                    assert key in main_indices, \
                        "Dictionary with name:'%s' has key not found in indices." % name
                    # special case: `sr` values must be positive
                    if name == 'sr':
                        assert val > 0
                logger("Checked dictionary: ", name, True)
        # ====== checking each type of data ====== #
        # get all stats name
        all_stats = defaultdict(list)
        for k in all_keys:
            if k.endswith(stat_suffixes):
                all_stats[k[:-4].split('_')[0]].append(k)
        # get all pca name
        all_pca = {i: i + '_pca' for i in all_features if i + '_pca' in ds}
        # checking one-by-one numpy.ndarray features array
        for feat_name in all_features:
            dtype = str(ds[feat_name].dtype)
            # checking all data
            indices = ds.find_prefix(feat_name, 'indices')
            prog = Progbar(target=len(indices),
                           interval=0.1,
                           print_report=True,
                           name='Checking: %s(%s)' % (feat_name, dtype))
            # start iterating over all data file
            fail_test = False
            for file_name, (start, end) in indices:
                dat = ds[feat_name][start:end]
                # No NaN value
                if np.any(np.isnan(dat)):
                    logger("NaN values", file_name + ':' + feat_name, False)
                    fail_test = True
                # not all values close to zero
                if np.all(np.isclose(dat, 0.)):
                    logger("All-closed-zeros values",
                           file_name + ':' + feat_name, False)
                    fail_test = True
                prog['Name'] = file_name
                prog.add(1)
            if not fail_test:
                logger("Check data incredibility for: ", feat_name, True)
            # checking statistics
            if feat_name in all_stats:
                fail_test = False
                for stat_name in all_stats[feat_name]:
                    X = ds[stat_name]
                    if X.ndim >= 1:
                        X = X[:]
                    if np.any(np.isnan(X)):
                        logger("NaN values", feat_name + ':' + stat_name,
                               False)
                        fail_test = True
                    if np.all(np.isclose(X, 0.)):
                        logger("All-closed-zeros values",
                               feat_name + ':' + stat_name, False)
                        fail_test = True
                if not fail_test:
                    logger("Check statistics for: ", feat_name, True)
            # check PCA
            if feat_name in all_pca:
                pca = ds[all_pca[feat_name]]
                n = ds[feat_name].shape[0]
                nb_feats = ds[feat_name].shape[-1]
                fail_test = False
                # keep the upper bound >= 1 so short data does not make
                # `randint` raise
                max_start = max(n - nb_samples - 1, 1)
                # performing PCA on random samples
                for _ in range(nb_samples):
                    start = np.random.randint(0, max_start)
                    X = pca.transform(
                        ds[feat_name][start:(start + nb_samples)],
                        n_components=max(nb_feats // 2, 1))
                    if np.any(np.isnan(X)):
                        logger("NaN values in PCA", feat_name, False)
                        fail_test = True
                        break
                    if np.all(np.isclose(X, 0.)):
                        logger("All-closed-zeros values in PCA", feat_name,
                               False)
                        fail_test = True
                        break
                if not fail_test:
                    logger("Check PCA for: ", feat_name, True)
        # ====== Do sampling ====== #
        np.random.seed(seed)  # seed for reproducibility
        all_files = list(ds['indices'].keys())
        # never request more files than exist (`replace=False` would raise)
        all_samples = np.random.choice(all_files,
                                       size=min(nb_samples, len(all_files)),
                                       replace=False)
        # plotting all samples
        for file_name in all_samples:
            X = {}
            for feat_name in all_features:
                start, end = ds.find_prefix(feat_name, 'indices')[file_name]
                feat = ds[feat_name][start:end]
                X[feat_name] = feat
                # some special handling; best effort, failures are only
                # reported
                try:
                    _special_cases(X=feat,
                                   feat_name=feat_name,
                                   file_name=file_name,
                                   ds=ds,
                                   path=path)
                except Exception as e:
                    logger("Special case error: %s" % str(e),
                           file_name + ':' + feat_name, False)
            plot_multiple_features(X, title=file_name, fig_width=fig_width)
            figure_path = os.path.join(path,
                                       '%s.pdf' % _escape_file_name(file_name))
            plot_save(figure_path, log=False, clear_all=True)
            logger("Sample figure saved at: ", figure_path, True)
        # plotting the statistic
        figure_path = os.path.join(path, 'stats.pdf')
        for feat_name, stat_name in all_stats.items():
            X = {name: ds[name][:] for name in stat_name if ds[name].ndim >= 1}
            if len(X) > 0:
                plot_multiple_features(X, title=feat_name, fig_width=fig_width)
        plot_save(figure_path, log=False, clear_all=True)
        logger("Stats figure save at: ", figure_path, True)
        logger("All reports at folder: ", os.path.abspath(path), True)
    finally:
        # ====== cleaning ====== #
        # runs on failure as well, so the redirect and the dataset opened
        # here never leak
        if stdio_redirected:
            stdio(path=prev_stdio)
        if should_close_ds:
            ds.close()
# ===========================================================================
# Test-set predictions and latents from the network
test_outputs = make_dnn_prediction(
    functions=[f_pred_proba, f_z1, f_z2, f_z3],
    X=X_test_data,
    title='TEST')
y_pred_proba, Z1_test, Z2_test, Z3_test = test_outputs
print("Test Latent:", Z1_test.shape, Z2_test.shape, Z3_test.shape)
y_pred = np.argmax(y_pred_proba, axis=-1)
evaluate(y_true=X_test_true,
         y_pred_proba=y_pred_proba,
         labels=labels,
         title="Test set (Deep prediction)",
         path=os.path.join(EXP_DIR, 'test_deep.pdf'))
# ====== make a streamline classifier ====== #
# PLDA is fitted on the third latent of the train and valid sets together
Z3_train, y_train = make_dnn_prediction(f_z3, X=train, title="TRAIN")
print("Z3_train:", Z3_train.shape, y_train.shape)
Z3_valid, y_valid = make_dnn_prediction(f_z3, X=valid, title="VALID")
print("Z3_valid:", Z3_valid.shape, y_valid.shape)
plda = PLDA(n_phi=200,
            random_state=K.get_rng().randint(10e8),
            n_iter=12,
            labels=labels,
            verbose=0)
Z3_fit = np.concatenate([Z3_train, Z3_valid], axis=0)
y_fit = np.concatenate([y_train, y_valid], axis=0)
plda.fit(Z3_fit, y_fit)
y_pred_log_proba = plda.predict_log_proba(Z3_test)
evaluate(y_true=X_test_true,
         y_pred_log_proba=y_pred_log_proba,
         labels=labels,
         title="Test set (PLDA - Latent prediction)",
         path=os.path.join(EXP_DIR, 'test_latent.pdf'))
# ====== visualize ====== #
for latent, latent_title in ((Z1_test, "latent1"), (Z2_test, "latent2")):
    visualize_latent_space(X_org=X_test_data,
                           X_latent=latent,
                           name=X_test_name,
                           labels=X_test_true,
                           title=latent_title)
V.plot_save(os.path.join(EXP_DIR, 'latent.pdf'))
df = pickle.load(f) print(df) # plt.figure(figsize=(6, 5), dpi=150) sns.scatterplot(x='beta', y='llk', hue='finetune', data=df, alpha=0.5, s=80) plt.gca().set_xscale('log') plt.xticks(BETA, [f'{b:g}' for b in BETA]) # n_images = len(df) n_col = 10 n_row = int(np.ceil(n_images / 10)) plt.figure(figsize=(1.5 * n_col, 1.5 * n_row), dpi=150) for i, (beta, gamma, zdim, finetune, step, llk, image) in enumerate(df.values): plt.subplot(n_row, n_col, i + 1) plt.imshow(image, cmap='Greys_r') plt.axis('off') plt.title( f'b={beta} g={gamma} z={zdim} t={"T" if finetune else "F"}', fontsize=8) plt.tight_layout() vs.plot_save(os.path.join(PATH, 'figures.pdf'), verbose=True) # === 3. no support else: raise NotImplementedError
device='gpu') gmm.initialize(X) print(gmm) gmm.fit(X) # ====== match each components to closest mean ====== # gmm_mean = [None] * nmix gmm_sigma = [None] * nmix for mean, sigma in zip(gmm.mean.T, gmm.sigma.T): sigma = np.diag(sigma) distance = sorted([(i, np.sqrt(np.sum((m - mean)**2))) for i, m in enumerate(stats_mean)], key=lambda x: x[1]) for i, dist in distance: if gmm_mean[i] is None: gmm_mean[i] = mean gmm_sigma[i] = sigma break # ====== plot everything ====== # plt.figure() colors = V.generate_random_colors(n=nmix) for i in range(nmix): c = colors[i] dat = y[i] sigma = gmm_sigma[i] plt.scatter(dat[:, 0], dat[:, 1], c=c, s=0.5) V.plot_ellipses(gmm_mean[i], gmm_sigma[i], alpha=0.5, color=c) V.plot_ellipses(stats_mean[i], stats_sigma[i], alpha=0.3, color='red') plt.suptitle('#iter:%d stochastic:%s downsample:%d ' % (niter, stochastic, downsample)) V.plot_save(pdf_path)
# =========================================================================== extractor = get_module_from_path(identifier=str(args.recipe), prefix='feature_recipes', path=get_script_path()) assert len(extractor) > 0, \ "Cannot find any recipe with name: '%s' from path: '%s'" % (args.recipe, get_script_path()) recipe = extractor[0](DEBUG) # ====== debugging ====== # if DEBUG: with np.warnings.catch_warnings(): np.warnings.filterwarnings('ignore') for path, name in SAMPLED_WAV_FILE: feat = recipe.transform(path) assert feat['bnf'].shape[0] == feat['mspec'].shape[0] V.plot_multiple_features(feat, title=feat['name']) V.plot_save(os.path.join(PATH_EXP, 'features_%s.pdf' % args.recipe)) exit() # =========================================================================== # Prepare the processor # =========================================================================== with np.warnings.catch_warnings(): np.warnings.filterwarnings('ignore') jobs = list(WAV_FILES.keys()) processor = pp.FeatureProcessor(jobs=jobs, path=os.path.join(PATH_ACOUSTIC_FEAT, args.recipe), extractor=recipe, n_cache=1200, ncpu=min(18, cpu_count() - 2), override=True, identifier='name', log_path=os.path.join(PATH_EXP, 'processor_%s.log' % args.recipe),
def evaluate(vae: VariationalAutoencoder,
             ds: ImageDataset,
             expdir: str,
             title: str,
             batch_size: int = 64,
             take_count: int = -1,
             n_images: int = 36,
             seed: int = 1):
    """Evaluate `vae` on the train and test splits of `ds` and export plots.

    Files written under `expdir`:
      * `latent{idx}_factor.pdf` - latent/factor pair plots, one per latent;
      * `latents_traverse.pdf` - latent traverse images;
      * `arrays` - pickled dict of latents, predictions and labels;
      * `analysis.pdf` - sampled images, reconstructions, confusion matrix
        (semi-supervised models only) and t-SNE scatter plots;
      * `latents.pdf` - mean/stddev scatter plots and KL histograms.

    Parameters
    ----------
    vae : the model; must provide `is_semi_supervised`, `is_hierarchical`,
        `sample_traverse` and `sample_observation`.
    ds : the dataset; `create_dataset`, `labels`, `label_type` and `name`
        are used.
    expdir : output directory, created when missing.
    title : prefix of figure titles.
    batch_size : batch size of the dataset iterators.
    take_count : forwarded to `_call`.
    n_images : number of images per image grid (also forwarded to `_call`);
        the grid side is `int(sqrt(n_images))`.
    seed : seed of the local `RandomState`, of `sample_observation` and of
        the PCA step.

    Raises
    ------
    NotImplementedError
        If `ds.label_type` is neither 'categorical' nor 'factor'.
    """
    n_rows = int(np.sqrt(n_images))
    is_semi = vae.is_semi_supervised()
    # NOTE(review): `is_hierarchical` is not used anywhere below
    is_hierarchical = vae.is_hierarchical()
    ds_kw = dict(batch_size=batch_size, label_percent=1.0, shuffle=False)
    ## prepare
    rand = np.random.RandomState(seed=seed)
    if not os.path.exists(expdir):
        os.makedirs(expdir)
    ## data for training semi-supervised
    train = ds.create_dataset('train', **ds_kw)
    (llkx_train, llky_train, x_org_train, x_rec_train, y_true_train,
     y_pred_train, z_train, pz_train) = _call(vae,
                                              ds=train,
                                              rand=rand,
                                              take_count=take_count,
                                              n_images=n_images,
                                              verbose=True)
    ## data for testing
    test = ds.create_dataset('test', **ds_kw)
    (llkx_test, llky_test, x_org_test, x_rec_test, y_true_test, y_pred_test,
     z_test, pz_test) = _call(vae,
                              ds=test,
                              rand=rand,
                              take_count=take_count,
                              n_images=n_images,
                              verbose=True)
    # === 0. plotting latent-factor pairs
    # one PDF per entry of `z_test`
    for idx, z in enumerate(z_test):
        z = z.mean()
        f = y_true_test
        corr_mat = Correlation.Spearman(z, f)  # [n_latents, n_factors]
        plot_latents_pairs(z, f, corr_mat, ds.labels)
        vs.plot_save(f'{expdir}/latent{idx}_factor.pdf', dpi=100, verbose=True)
    # === 0. latent traverse plot
    x_travs = x_org_test
    if x_travs.ndim == 3:  # grayscale image
        x_travs = np.expand_dims(x_travs, -1)
    else:  # color image
        # presumably converts channel-first back to channel-last - TODO confirm
        x_travs = np.transpose(x_travs, (0, 2, 3, 1))
    x_travs = x_travs[rand.permutation(x_travs.shape[0])]
    n_visual_samples = 5
    n_traverse_points = 21
    n_top_latents = 10
    plt.figure(figsize=(8, 3 * n_visual_samples))
    for i in range(n_visual_samples):
        # NOTE(review): `min_val` is the negated minimum of the first latent's
        # mean; confirm the sign is intended (it is only the lower bound when
        # that minimum is zero)
        images = vae.sample_traverse(x_travs[i:i + 1],
                                     min_val=-np.min(z_test[0].mean()),
                                     max_val=np.max(z_test[0].mean()),
                                     n_best_latents=n_top_latents,
                                     n_traverse_points=n_traverse_points,
                                     mode='linear')
        images = as_tuple(images)[0]
        images = _prepare_images(images.mean().numpy(), normalize=True)
        vs.plot_images(images,
                       grids=(n_top_latents, n_traverse_points),
                       ax=(n_visual_samples, 1, i + 1))
        if i == 0:
            plt.title('Latents traverse')
    plt.tight_layout()
    vs.plot_save(f'{expdir}/latents_traverse.pdf', dpi=180, verbose=True)
    # === 0. prior sampling plot
    images = as_tuple(vae.sample_observation(n=n_images, seed=seed))[0]
    images = _prepare_images(images.mean().numpy(), normalize=True)
    plt.figure(figsize=(5, 5))
    vs.plot_images(images, grids=(n_rows, n_rows), title='Sampled')
    # === 1. reconstruction plot
    plt.figure(figsize=(15, 15))
    vs.plot_images(x_org_train,
                   grids=(n_rows, n_rows),
                   ax=(2, 2, 1),
                   title='[Train]Original')
    vs.plot_images(x_rec_train,
                   grids=(n_rows, n_rows),
                   ax=(2, 2, 2),
                   title='[Train]Reconstructed')
    vs.plot_images(x_org_test,
                   grids=(n_rows, n_rows),
                   ax=(2, 2, 3),
                   title='[Test]Original')
    vs.plot_images(x_rec_test,
                   grids=(n_rows, n_rows),
                   ax=(2, 2, 4),
                   title='[Test]Reconstructed')
    plt.tight_layout()
    ## prepare the labels
    # without a semi-supervised model the "predicted" labels are simply the
    # true labels
    label_type = ds.label_type
    if label_type == 'categorical':
        labels_name = ds.labels
        true = np.argmax(y_true_test, axis=-1)
        labels_true = np.array([labels_name[i] for i in true])
        labels_pred = labels_true
        if is_semi:
            pred = np.argmax(y_pred_test.mean().numpy(), axis=-1)
            labels_pred = np.array([labels_name[i] for i in pred])
    elif label_type == 'factor':  # dsprites, shapes3d
        labels_name = ['cube', 'cylinder', 'sphere', 'round'] \
            if 'shapes3d' in ds.name else ['square', 'ellipse', 'heart']
        # the factor column at index 2 is used as the class label
        true = y_true_test[:, 2].astype('int32')
        labels_true = np.array([labels_name[i] for i in true])
        labels_pred = labels_true
        if is_semi:
            pred = get_ymean(y_pred_test)[:, 2].astype('int32')
            labels_pred = np.array([labels_name[i] for i in pred])
    else:  # CelebA
        raise NotImplementedError
    ## confusion matrix
    # `pred` only exists for semi-supervised models
    if is_semi:
        plt.figure(figsize=(8, 8))
        acc = accuracy_score(y_true=true, y_pred=pred)
        vs.plot_confusion_matrix(cm=confusion_matrix(y_true=true, y_pred=pred),
                                 labels=labels_name,
                                 cbar=True,
                                 fontsize=10,
                                 title=f'{title} Acc:{acc:.2f}')
    ## save arrays for later inspections
    with open(f'{expdir}/arrays', 'wb') as f:
        pickle.dump(
            dict(z_train=z_train,
                 y_pred_train=y_pred_train,
                 y_true_train=y_true_train,
                 z_test=z_test,
                 y_pred_test=y_pred_test,
                 y_true_test=y_true_test,
                 labels=labels_name,
                 ds=ds.name,
                 label_type=label_type), f)
    print(f'Exported arrays to "{expdir}/arrays"')
    ## semi-supervised
    # flatten the mean of every latent per example and join them on the
    # last axis
    # NOTE(review): `z_mean_train` is not used below
    z_mean_train = np.concatenate(
        [z.mean().numpy().reshape(z.batch_shape[0], -1) for z in z_train], -1)
    z_mean_test = np.concatenate(
        [z.mean().numpy().reshape(z.batch_shape[0], -1) for z in z_test], -1)
    # === 2. scatter points latents plot
    n_points = 5000
    ids = rand.permutation(len(labels_true))[:n_points]
    Y_true = labels_true[ids]
    Y_pred = labels_pred[ids]
    # tsne plot
    # with a single latent only the 'all' plot is drawn; otherwise each
    # latent gets its own plot as well
    n_latents = 0 if len(z_train) == 1 else len(z_train)
    for name, X in zip(
        ['all'] + [f'latents{i}' for i in range(n_latents)],
        [z_mean_test[ids]] +
        [z_test[i].mean().numpy()[ids] for i in range(n_latents)]):
        print(f'Plot scatter points for {name}')
        X = X.reshape(X.shape[0], -1)  # flatten to 2D
        # standardize, then PCA to at most 512 dims before t-SNE
        X = Pipeline([('zscore', StandardScaler()),
                      ('pca', PCA(min(X.shape[1], 512),
                                  random_state=seed))]).fit_transform(X)
        tsne = DimReduce.TSNE(X, n_components=2, framework='sklearn')
        kw = dict(x=tsne[:, 0], y=tsne[:, 1], grid=False, size=12.0, alpha=0.8)
        plt.figure(figsize=(12, 6))
        vs.plot_scatter(color=Y_true,
                        title=f'[True]{title}-{name}',
                        ax=(1, 2, 1),
                        **kw)
        vs.plot_scatter(color=Y_pred,
                        title=f'[Pred]{title}-{name}',
                        ax=(1, 2, 2),
                        **kw)
    ## save all plot
    vs.plot_save(f'{expdir}/analysis.pdf', dpi=180, verbose=True)
    # === 3. show the latents statistics
    n_latents = len(z_train)
    # NOTE(review): the palette size is the number of test examples, not the
    # number of classes - confirm this is intended
    colors = sns.color_palette(n_colors=len(labels_true))
    styles = dict(grid=False,
                  ticks_off=False,
                  alpha=0.6,
                  xlabel='mean',
                  ylabel='stddev')

    # scatter between latents and labels (assume categorical distribution)
    def _show_latents_labels(Z, Y, title):
        plt.figure(figsize=(5 * n_latents, 5), dpi=150)
        for idx, z in enumerate(Z):
            if len(z.batch_shape) == 0:
                # distribution without batch dimension: repeat its mean for
                # every example; "stddev" here is the deviation of drawn
                # samples from that mean
                mean = np.repeat(np.expand_dims(z.mean(), 0), Y.shape[0], 0)
                stddev = z.sample(Y.shape[0]) - mean
            else:
                mean = flatten(z.mean())
                stddev = flatten(z.stddev())
            y = np.argmax(Y, axis=-1)
            # data = [class-average means, class-average stddevs, labels]
            data = [[], [], []]
            for y_i, c in zip(np.unique(y), colors):
                mask = (y == y_i)
                data[0].append(np.mean(mean[mask], 0))
                data[1].append(np.mean(stddev[mask], 0))
                # NOTE(review): `labels_true` holds one label per test
                # example, so indexing it with the class id `y_i` yields the
                # label of the y_i-th example; verify whether
                # `labels_name[y_i]` was meant
                data[2].append([labels_true[y_i]] * mean.shape[1])
            vs.plot_scatter(
                x=np.concatenate(data[0], 0),
                y=np.concatenate(data[1], 0),
                color=np.concatenate(data[2], 0),
                ax=(1, n_latents, idx + 1),
                size=15 if mean.shape[1] < 128 else 8,
                title=f'[Test-{title}]#{idx} - {mean.shape[1]} (units)',
                **styles)
        plt.tight_layout()

    # simple scatter mean-stddev each latents
    def _show_latents(Z, title):
        plt.figure(figsize=(3.5 * n_latents, 3.5), dpi=150)
        for idx, z in enumerate(Z):
            mean = flatten(z.mean())
            stddev = flatten(z.stddev())
            # average over examples so there is one point per latent unit
            if mean.ndim == 2:
                mean = np.mean(mean, 0)
                stddev = np.mean(stddev, 0)
            vs.plot_scatter(
                x=mean,
                y=stddev,
                ax=(1, n_latents, idx + 1),
                size=15 if len(mean) < 128 else 8,
                title=f'[Test-{title}]#{idx} - {len(mean)} (units)',
                **styles)

    _show_latents_labels(z_test, y_true_test, 'post')
    _show_latents_labels(pz_test, y_true_test, 'prior')
    _show_latents(z_test, 'post')
    _show_latents(pz_test, 'prior')
    # KL statistics
    vs.plot_figure()
    for idx, (qz, pz) in enumerate(zip(z_test, pz_test)):
        kl = []
        # rebuild both distributions as Normal from their mean and stddev
        qz = Normal(loc=qz.mean(), scale=qz.stddev(), name=f'posterior{idx}')
        pz = Normal(loc=pz.mean(), scale=pz.stddev(), name=f'prior{idx}')
        # draw the posterior samples in chunks of at most 8
        for s, e in minibatch(batch_size=8, n=100):
            z = qz.sample(e - s)
            # don't do this in GPU, it explodes!
            kl.append((qz.log_prob(z) - pz.log_prob(z)).numpy())
        kl = np.concatenate(kl, 0)  # (mcmc, batch, event)
        # per sample
        kl_samples = np.sum(kl, as_tuple(list(range(2, kl.ndim))))
        kl_samples = logsumexp(kl_samples, 0)
        plt.subplot(n_latents, 2, idx * 2 + 1)
        sns.histplot(kl_samples, bins=50)
        plt.title(f'Z#{idx} KL per sample (nats)')
        # per latent
        kl_latents = np.mean(flatten(logsumexp(kl, 0)), 0)
        plt.subplot(n_latents, 2, idx * 2 + 2)
        plt.plot(np.sort(kl_latents))
        plt.title(f'Z#{idx} KL per dim (nats)')
    plt.tight_layout()
    vs.plot_save(f'{expdir}/latents.pdf', dpi=180, verbose=True)
color=y_train_color, marker=y_train_marker, fontsize=12, legend=legends, title='[train]' + str(title), ax=(1, 2, 1)) plot_scatter(x=score[:, 0], y=score[:, 1], z=None if NUM_DIM < 3 or score.shape[1] < 3 else score[:, 2], size=POINT_SIZE, color=y_score_color, marker=y_score_marker, fontsize=12, legend=legends, title='[score]' + str(title), ax=(1, 2, 2)) plot(train=X_train_pca, score=X_score_pca, title='PCA') plot(train=X_train_tsne, score=X_score_tsne, title='T-SNE') plot(train=X_train_tsne_pca, score=X_score_tsne_pca, title='T-SNE + PCA') plot(train=X_train_lda, score=X_score_lda, title='LDA') plot(train=X_train_plda, score=X_score_plda, title='PLDA') plot(train=X_train_plda, score=X_score_plda, title='PLDA + PCA', applying_pca=True) plot(train=X_train_gmm, score=X_score_gmm, title='GMM') plot(train=X_train_rbm, score=X_score_rbm, title='RBM') plot_save('/tmp/tmp.pdf')
# ====== classifier on the latents of each model ====== #
streamline_classifier(Z_train=scvi_z,
                      y_train=y,
                      Z_test=scvi_ztest,
                      y_test=y_test,
                      labels_name=labels,
                      title='scVI')
streamline_classifier(Z_train=sisua_z,
                      y_train=y,
                      Z_test=sisua_ztest,
                      y_test=y_test,
                      labels_name=labels,
                      title='SISUA')
# ====== imputation ====== #
V.plot_figure(nrow=6, ncol=18)
# order the cells by original library size so the curves are comparable
ids = np.argsort(library_size)
plt.plot(library_size[ids], label="Original", linewidth=2.0, linestyle='--')
plt.plot(scvi_outputs[-3].ravel()[ids], label="scVI", linewidth=2.0)
plt.plot(sisua_outputs[-3].ravel()[ids], label="SISUA", linewidth=2.0)
plt.legend()
plt.title("Library Size")
x = to_array(x)
scvi_score = imputation_score(original=x, imputed=scvi_outputs[2])
sisua_score = imputation_score(original=x, imputed=sisua_outputs[2])
# each model reports its own imputation score
print("scVI:", scvi_score)
print("SISUA:", sisua_score)
# ====== save all the figure ====== #
V.plot_save(SAVE_FIGURE_PATH, dpi=48)
def epoch_end(self, task, epoch_results):
    """Record the reduced outputs of `task` and periodically plot them.

    For every name in `self.output_name`, the value `epoch_results[name]` is
    reduced with `self.fn_reduce` and appended to the history kept in
    `self._epoch_results[task.name][name]`. This only happens for tasks whose
    name is listed in `self._task_name`; each such call also decrements
    `self._count`.

    When `self._count` reaches zero it is reset to
    `self._repeat_freq * len(self._task_name)` and, for every output that has
    at least two recorded values for every task:

    * a text plot is printed if `self.print_plot` is set;
    * numeric series are drawn with matplotlib and saved to `self.save_path`
      if that path is not None.

    Parameters
    ----------
    task : object
        Must expose `name`; `curr_epoch` is only read when figures are not
        overridden in place.
    epoch_results : dict
        Mapping from output name to the batch results of the finished epoch.

    Returns
    -------
    None

    Raises
    ------
    AssertionError
        If a tracked task did not provide every name of `self.output_name`.
    """
    output_name = self.output_name
    if len(output_name) == 0:  # nothing to do
        return
    task_name = self._task_name
    if task.name in task_name:
        self._count -= 1
        # ====== processing results ====== #
        assert all(name in epoch_results for name in output_name), \
            "Given outputs with name: %s; but task: '%s' results only contain name: %s" % \
            (', '.join(self.output_name), str(task),
             ', '.join(tuple(epoch_results.keys())))
        for name in output_name:
            batch_results = epoch_results[name]
            if name not in self._epoch_results[task.name]:
                self._epoch_results[task.name][name] = []
            self._epoch_results[task.name][name].append(
                self.fn_reduce(batch_results))
    # ====== start plotting ====== #
    if self._count == 0:
        self._count = self._repeat_freq * len(task_name)
        from odin import visual as V
        n_col = len(task_name)
        n_row = len(output_name)
        if self.save_path is not None:
            from matplotlib import pyplot as plt
        save_figures = False
        # the summary file is always overwritten in place; the per-epoch
        # file name below is only used when this flag is turned off
        override = True
        for o_idx, o_name in enumerate(output_name):
            results = {t: r[o_name] for t, r in self._epoch_results.items()}
            # a series needs at least two points for every task
            if not all(len(i) >= 2 for i in results.values()):
                continue
            # ====== print text plot ====== #
            if self.print_plot:
                text = []
                for t_name in task_name:
                    values = results[t_name]
                    if isinstance(values[0], Number):
                        t = V.print_bar(f=values, height=8,
                                        title=t_name + "/" + o_name)
                    elif isinstance(values[0], np.ndarray) and \
                            values[0].ndim == 2 and \
                            values[0].shape[0] == values[0].shape[1]:
                        # square matrix: only the latest one is shown
                        t = V.print_confusion(arr=values[-1], side_bar=False,
                                              inc_stats=True,
                                              float_precision=2)
                    else:
                        t = ''
                    if len(t) > 0:
                        text.append(t)
                if len(text) > 1:
                    print(V.merge_text_graph(*text, padding=' '))
                elif len(text) == 1:
                    # `text` is empty when no value had a printable type;
                    # indexing it unconditionally raised IndexError
                    print(text[0])
            # ====== matplotlib plot and save pdf ====== #
            if self.save_path is not None:
                for t_idx, t_name in enumerate(task_name):
                    values = results[t_name]
                    # plotting series
                    if not isinstance(values[0], Number):
                        continue
                    if not save_figures:
                        # the figure is created lazily, once, on the first
                        # numeric series
                        V.plot_figure(nrow=int(n_row * 1.8), ncol=20)
                        save_figures = True
                    max_epoch = np.argmax(values)
                    max_val = values[max_epoch]
                    min_epoch = np.argmin(values)
                    min_val = values[min_epoch]
                    plt.subplot(n_row, n_col,
                                o_idx * len(task_name) + t_idx + 1)
                    plt.plot(values)
                    # highlight the best (red) and the worst (green) epochs
                    plt.scatter(max_epoch, max_val, s=180, alpha=0.4, c='r')
                    plt.scatter(min_epoch, min_val, s=180, alpha=0.4, c='g')
                    plt.xlim((0, len(values) - 1))
                    if not np.any(np.isinf(values)):
                        eps = 0.1 * (max_val - min_val)
                        plt.ylim((min_val - eps, max_val + eps))
                    plt.xticks(np.linspace(0, len(values) - 1, num=12,
                                           dtype='int32'))
                    title_text = '[%s]' % o_name if t_idx == 0 else ''
                    title_text += t_name
                    plt.title('%s' % title_text, fontsize=8,
                              fontweight='bold')
        # save figure to pdf or image files
        if save_figures:
            if override:
                save_path = self.save_path
            else:
                path, ext = os.path.splitext(self.save_path)
                save_path = path + ('.%d' % (task.curr_epoch + 1)) + ext
            V.plot_save(save_path, tight_plot=True, clear_all=True,
                        log=False, dpi=180)
            self.send_notification("Saved summary at: %s" % save_path)
    return None
plot(group, x='beta', y='gamma', hue='llk', size='au_std', title=f'zdim={zdim}', ax=ax) plt.tight_layout() # fix zdim, show au and elbo plt.figure(figsize=(n_cols * 6, n_rows * 5), dpi=200) for i, (zdim, group) in tqdm(enumerate(df.groupby('zdim'))): ax = plt.subplot(n_rows, n_cols, i + 1) plot(group, x='beta', y='gamma', hue='elbo', size='au_std', title=f'zdim={zdim}', ax=ax) plt.tight_layout() # save all figures vs.plot_save(os.path.join(save_path, 'rate_distortion.pdf'), verbose=True) # save score file score_path = os.path.join(save_path, 'results.txt') with open(score_path, 'w') as f: df.to_string(f, index=False) print('Saved score file:', score_path) else: raise NotImplementedError(f'No support mode={args.mode}')
# training PLDA Z3_train, y_train = make_dnn_prediction(f_z3, X=train, title="TRAIN") print("Z3_train:", Z3_train.shape, y_train.shape) Z3_valid, y_valid = make_dnn_prediction(f_z3, X=valid, title="VALID") print("Z3_valid:", Z3_valid.shape, y_valid.shape) plda = PLDA(n_phi=200, random_state=K.get_rng().randint(10e8), n_iter=12, labels=labels, verbose=0) plda.fit(np.concatenate([Z3_train, Z3_valid], axis=0), np.concatenate([y_train, y_valid], axis=0)) y_pred_log_proba = plda.predict_log_proba(Z3_test) evaluate(y_true=X_test_true, y_pred_log_proba=y_pred_log_proba, labels=labels, title="Test set (PLDA - Latent prediction)", path=os.path.join(EXP_DIR, 'test_latent.pdf')) # ====== visualize ====== # visualize_latent_space(X_org=X_test_data, X_latent=Z1_test, name=X_test_name, labels=X_test_true, title="latent1") visualize_latent_space(X_org=X_test_data, X_latent=Z2_test, name=X_test_name, labels=X_test_true, title="latent2") V.plot_save(os.path.join(EXP_DIR, 'latent.pdf'))
def plot_epoch(task):
    """Draw the diagnostic figure for the current epoch and save it as PNG.

    The figure is only produced for the first five epochs and for every
    fifth epoch afterwards; `task=None` is treated as epoch 0. The data and
    models are read from the enclosing scope (`X_test`, `y_test`, `f_z`,
    `f_w`, `Z_names`, `n_classes`, `FIGURE_PATH`).

    Layout of the grid, from top to bottom:
      * rows 0-2: scatter plot of each latent (reduced with PCA when it has
        more than two dimensions),
      * rows 3-5: PCA of the original data and of the reconstruction,
      * rows 6-9: the same PCA coloured by count-sum,
      * rows 10-12: count-sum series statistics over all test samples,
      * remaining rows: series statistics of 8 randomly picked samples.

    NOTE(review): the function calls `exit()` right after saving, which ends
    the program after the first plotted epoch -- confirm this is intended.
    """
    curr_epoch = 0 if task is None else task.curr_epoch
    if curr_epoch >= 5 and curr_epoch % 5 != 0:
        return

    # fixed seed so that the same samples are visualized at every epoch
    rng = np.random.RandomState(seed=1234)
    data, labels = X_test, y_test
    n_data = data.shape[0]
    latents = f_z(data)
    recon, recon_std_mcmc, recon_std_analytic = f_w(data)
    # PCA fitted jointly on data and reconstruction, then on the
    # reconstruction alone
    data_pca, recon_pca_joint = fast_pca(data, recon, n_components=2,
                                         random_state=rng.randint(10e8))
    recon_pca_own = fast_pca(recon, n_components=2,
                             random_state=rng.randint(10e8))
    data_total = np.sum(data, axis=tuple(range(1, data.ndim)))
    recon_total = np.sum(recon, axis=-1)

    n_shown = 8
    n_rows = 13 + n_shown * 3
    # the three axis configurations used by every series-statistics row
    scale_configs = (
        dict(xscale='linear', yscale='linear', sort_by=None),
        dict(xscale='linear', yscale='linear', sort_by='expected'),
        dict(xscale='log', yscale='log', sort_by='expected'),
    )

    V.plot_figure(nrow=int(n_rows * 1.8), ncol=18)
    with V.plot_gridSpec(nrow=n_rows + 3, ncol=6, hspace=0.8) as grid:
        # ---- latent space ---- #
        for col, (z, name) in enumerate(zip(latents, Z_names)):
            if z.shape[1] > 2:
                z = fast_pca(z, n_components=2,
                             random_state=rng.randint(10e8))
            left = col * 2
            ax = V.subplot(grid[:3, left:(left + 2)])
            V.plot_scatter(x=z[:, 0], y=z[:, 1],
                           color=labels, marker=labels,
                           n_samples=4000, ax=ax,
                           legend_enable=False, legend_ncol=n_classes)
            ax.set_title(name, fontsize=12)

        # ---- reconstruction ---- #
        recon_panels = zip(
            [data_pca, recon_pca_joint, recon_pca_own],
            ['Original data', 'Reconstruction',
             'Reconstruction (separated PCA)'])
        for col, (points, name) in enumerate(recon_panels):
            left = col * 2
            ax = V.subplot(grid[3:6, left:(left + 2)])
            V.plot_scatter(x=points[:, 0], y=points[:, 1],
                           color=labels, marker=labels,
                           n_samples=4000, ax=ax,
                           legend_enable=col == 1, legend_ncol=n_classes,
                           title=name)

        # ---- reconstruction coloured by count-sum ---- #
        count_panels = zip(
            [data_pca, recon_pca_joint],
            [data_total, recon_total],
            ['Original data (Count-sum)', 'Reconstruction (Count-sum)'])
        for col, (points, totals, name) in enumerate(count_panels):
            left = col * 3
            ax = V.subplot(grid[6:10, left:(left + 3)])
            V.plot_scatter(x=points[:, 0], y=points[:, 1], val=totals,
                           n_samples=2000, marker=labels, ax=ax, size=8,
                           legend_enable=col == 0, legend_ncol=n_classes,
                           title=name, colorbar=True, fontsize=10)

        # ---- count-sum series over the whole test set ---- #
        observed_sum = np.sum(data, axis=0).ravel()
        expected_sum = np.sum(recon, axis=0)
        explained_std_sum = np.sum(recon_std_mcmc, axis=0)
        total_std_sum = np.sum(recon_std_analytic, axis=0)
        for col, kws in enumerate(scale_configs):
            left = col * 2
            # `V.subplot` is called for its effect of selecting the axes
            V.subplot(grid[10:10 + 3, left:(left + 2)])
            V.plot_series_statistics(observed_sum, expected_sum,
                                     explained_stdev=explained_std_sum,
                                     total_stdev=total_std_sum,
                                     fontsize=8,
                                     title="Count-sum" if col == 0 else None,
                                     **kws)

        # ---- mean and variances of individual samples ---- #
        chosen = rng.permutation(n_data)[:n_shown]
        for order, idx in enumerate(chosen):
            top = 13 + 3 * order
            observed = data[idx].ravel()
            expected = recon[idx]
            explained_std = recon_std_mcmc[idx]
            total_std = recon_std_analytic[idx]
            for col, kws in enumerate(scale_configs):
                left = col * 2
                V.subplot(grid[top:top + 3, left:(left + 2)])
                V.plot_series_statistics(
                    observed, expected,
                    explained_stdev=explained_std,
                    total_stdev=total_std,
                    fontsize=8,
                    title="Test Sample #%d" % idx if col == 0 else None,
                    **kws)
    V.plot_save(os.path.join(FIGURE_PATH, 'latent_%d.png' % curr_epoch),
                dpi=200, log=True)
    exit()
# Non-target scores drawn from a normal distribution; the target scores
# (`true`) are generated before this excerpt.
false = stdv_False * np.random.randn(n_false) + mean_False
# binary labels: the first `n_true` entries are targets (1), the rest are 0,
# matching the concatenation order of `y_score`
y_true = np.zeros(shape=(n_true + n_false, ))
y_true[:n_true] = 1
y_score = np.concatenate((true, false))
# ====== metrics computed with K.metrics ====== #
Pfa, Pmiss = K.metrics.det_curve(y_true=y_true, y_score=y_score)
min_DCF, Pfa_opt, Pmiss_opt = K.metrics.compute_minDCF(Pfa, Pmiss)
print("MinDCF, Pmiss_opt, Pfa_opt:", min_DCF, Pmiss_opt, Pfa_opt)
print("EER1:", K.metrics.compute_EER(Pfa, Pmiss))
# ====== same metrics from the `rocch` curve, for comparison ====== #
# note the lower-case names: `pmiss`/`pfa` come from `rocch`, while
# `Pfa`/`Pmiss` above come from `K.metrics.det_curve`
pmiss, pfa = rocch(tar_scores=true, nontar_scores=false)
min_DCF, Pfa_opt, Pmiss_opt = K.metrics.compute_minDCF(pfa, pmiss)
print("[Sidekit]MinDCF, Pmiss_opt, Pfa_opt:", min_DCF, Pmiss_opt, Pfa_opt)
# NOTE(review): the argument order here is (pmiss, pfa) whereas
# `K.metrics.compute_EER` above receives (Pfa, Pmiss) -- confirm the
# signature of this `compute_EER`.
print("[Sidekit]EER:", compute_EER(pmiss, pfa))
print("[Sidekit]MinDCF, Pmiss_opt, Pfa_opt, ..., EER:",
      fast_minDCF(tar=true, non=false, plo=0))
fpr, tpr, _ = K.metrics.roc_curve(y_true=y_true, y_score=y_score)
# NOTE(review): `compute_AUC` is given (tpr, fpr) -- confirm the expected
# argument order. `auc` is not used in the visible part of the script.
auc = K.metrics.compute_AUC(tpr, fpr)
# ====== specialized plotting ====== #
# one figure per curve; all of them are written by the final `plot_save`
plt.figure()
V.plot_detection_curve(x=pfa, y=pmiss, curve='det')
plt.figure()
V.plot_detection_curve(x=Pfa, y=Pmiss, curve='det')
plt.figure()
V.plot_detection_curve(x=fpr, y=tpr, curve='roc')
V.plot_save('/tmp/tmp.pdf')
FutureWarning): data_map = {} stats_map = {} spk_map = {} for dsname, text, data, stats, spk_stats in mpi.MPI( jobs=all_dataset, func=dataset_statistics, ncpu=None, batch=1): data_map[dsname] = data stats_map[dsname] = stats spk_map[dsname] = spk_stats print(text) for dsname in all_dataset: print("Plotting ...", ctext(dsname, 'cyan')) data = data_map[dsname] V.plot_figure(nrow=2, ncol=20) ax = plt.subplot(1, n_col, 1) plot_histogram(data[0], ax, title="Duration") ax = plt.subplot(1, n_col, 2) plot_histogram(data[1]['sum_per_spk'], ax, title="Dur/Spk") ax = plt.subplot(1, n_col, 3) plot_histogram(data[1]['nutt_per_spk'], ax, title="#Utt/Spk") plt.suptitle(dsname, fontsize=8) plot_mean_std(_map=stats_map, title='Data') plot_mean_std(_map=spk_map, title='Speaker') V.plot_save(figure_path, dpi=32)
def validate_features(ds_or_processor, path, nb_samples=25, override=False,
                      seed=12082518, fig_width=4):
    """Sanity-check a features dataset and write a report folder to `path`.

    The following checks are performed, each one logged as passed/failed:

    * every `indices*` table is contiguous, has no zero-length entry and ends
      at the length of the feature arrays it describes;
    * every dictionary entry only uses keys found in the main `indices`
      (and `sr` values are positive);
    * no feature array, statistic or PCA projection contains NaN or is
      entirely close to zero.

    Afterwards `nb_samples` randomly chosen files are plotted (one PDF per
    file) together with one PDF for the statistics. While the validation
    runs, `stdio` is pointed at `<path>/log.txt`; it is restored, and a
    dataset opened by this function is closed, even when a check fails.

    Parameters
    ----------
    ds_or_processor : FeatureProcessor, str or Dataset
        A processor (its `path` is opened read-only), a path to a dataset
        (opened read-only), or an opened `Dataset` (used as is and left open).
    path : str
        Output folder of the report. It is created when missing.
    nb_samples : int
        Number of files to plot; also the number and the size of the random
        slices used for checking the PCA.
    override : bool
        If True, an existing file or folder at `path` is removed first.
    seed : int
        Seed given to `np.random.seed` before sampling the files to plot.
    fig_width : int
        Forwarded to `plot_multiple_features`.

    Raises
    ------
    ValueError
        If `ds_or_processor` has an unsupported type, or `path` exists and
        `override` is False.
    RuntimeError
        If the dataset has no `config` entry.
    AssertionError
        If one of the indices or dictionary checks fails.
    """
    # TODO: add PCA visualization
    # TODO: update to match new indices style
    def logger(title, tag, check):
        check = bool(check)
        text_color = 'yellow' if check else 'red'
        print(ctext(' *', 'cyan'), ctext(str(title), text_color),
              ctext(str(tag), 'magenta'),
              ctext("✓", text_color) if check else ctext("✗", text_color))

    import matplotlib
    matplotlib.use('Agg')
    from odin.visual import plot_save, plot_multiple_features
    # suffixes identifying the statistics stored next to a feature
    stat_suffixes = ('sum1', 'sum2', 'mean', 'std')
    # ====== check path to dataset ====== #
    should_close_ds = True
    if isinstance(ds_or_processor, FeatureProcessor):
        ds = Dataset(ds_or_processor.path, read_only=True)
    elif is_string(ds_or_processor):
        ds = Dataset(ds_or_processor, read_only=True)
    elif isinstance(ds_or_processor, Dataset):
        ds = ds_or_processor
        should_close_ds = False
    else:
        # `ds` is not bound on this path, the type of the argument itself
        # must be reported
        raise ValueError("`ds` can be None, string, or Dataset. No "
                         "support for given input type: %s" %
                         str(type(ds_or_processor)))
    stdio_redirected = False
    prev_stdio = None
    try:
        print(ctext('Validating dataset:', 'yellow'), '"%s"' % ds.path)
        # ====== extract the config of the dataset ====== #
        if 'config' not in ds:
            raise RuntimeError("The `Dataset` must be generated by `FeatureProcessor` "
                               "which must contain `config` MmapDict of extracted "
                               "features configuration.")
        # ====== output path ====== #
        path = str(path)
        if not os.path.exists(path):
            os.mkdir(path)
        elif override:
            if os.path.isfile(path):
                os.remove(path)
            else:
                shutil.rmtree(path)
            os.mkdir(path)
        else:
            raise ValueError("`path`=%s exists, cannot override." % path)
        prev_stdio = get_stdio_path()
        stdio(path=os.path.join(path, 'log.txt'))
        stdio_redirected = True
        nb_samples = int(nb_samples)
        # ====== get all features ====== #
        all_keys = [k for k in ds.keys() if k not in ('config', 'pipeline')]
        # store all features (included the features in external_indices)
        all_features = []
        # the external indices can be: indices_mfcc_bnf
        external_indices = flatten_list(
            [k.split('_')[1:]
             for k in all_keys if 'indices' in k and k != 'indices'])
        # ====== checking indices ====== #
        main_indices = {name: (start, end)
                        for name, (start, end) in ds['indices'].items()}
        for ids_name in (k for k in all_keys if 'indices' in k):
            ids = sorted([(name, start, end)
                          for name, (start, end) in ds[ids_name].items()],
                         key=lambda x: x[1])
            for prev, now in zip(ids, ids[1:]):
                assert prev[2] == now[1], "Indices are not contiguous"
            for _, start, end in ids:
                assert end - start > 0, "Zero length in indices"
            # end position of the last file, computed from the table itself
            # so that tables with fewer than two entries are handled too
            last_end = ids[-1][2] if ids else 0
            # final length match length of Data
            if ids_name != 'indices':
                for feat_name in ids_name.split('_')[1:]:
                    assert last_end == len(ds[feat_name]), \
                        "Indices and data length mismatch, indices:'%s' feat:'%s'" % \
                        (ids_name, feat_name)
                    all_features.append(feat_name)
            else:
                for feat_name in all_keys:
                    if feat_name not in external_indices and \
                            not feat_name.endswith(stat_suffixes) and \
                            isinstance(ds[feat_name], MmapData):
                        assert last_end == len(ds[feat_name]), \
                            "Length of indices and actual data mismatch, " + \
                            ids_name + ':' + feat_name
                        all_features.append(feat_name)
            # logging
            logger("Checked all:", ids_name, True)
        # ====== check all dictionary types ====== #
        for name in all_keys:
            if isinstance(ds[name], MmapDict) and 'indices' not in name:
                data = ds[name]
                # special cases
                if name == 'sr':
                    checking_func = lambda x: x > 0  # for sr
                else:
                    checking_func = lambda x: True
                # check
                for key, val in data.items():
                    assert key in main_indices, \
                        "Dictionary with name:'%s' has key not found in indices." % name
                    assert checking_func(val)
                logger("Checked dictionary: ", name, True)
        # ====== checking each type of data ====== #
        # get all stats name, grouped by the feature they belong to
        all_stats = defaultdict(list)
        for k in all_keys:
            if k.endswith(stat_suffixes):
                all_stats[k[:-4].split('_')[0]].append(k)
        # get all pca name
        all_pca = {i: i + '_pca' for i in all_features if i + '_pca' in ds}
        # checking one-by-one numpy.ndarray features array
        for feat_name in all_features:
            dtype = str(ds[feat_name].dtype)
            # checking all data
            indices = ds.find_prefix(feat_name, 'indices')
            prog = Progbar(target=len(indices), interval=0.1,
                           print_report=True,
                           name='Checking: %s(%s)' % (feat_name, dtype))
            # start iterating over all data file
            fail_test = False
            for file_name, (start, end) in indices:
                dat = ds[feat_name][start:end]
                # No NaN value
                if np.any(np.isnan(dat)):
                    logger("NaN values", file_name + ':' + feat_name, False)
                    fail_test = True
                # not all value closed to zeros
                if np.all(np.isclose(dat, 0.)):
                    logger("All-closed-zeros values",
                           file_name + ':' + feat_name, False)
                    fail_test = True
                prog['Name'] = file_name
                prog.add(1)
            if not fail_test:
                logger("Check data incredibility for: ", feat_name, True)
            # checking statistics
            if feat_name in all_stats:
                fail_test = False
                for stat_name in all_stats[feat_name]:
                    X = ds[stat_name]
                    if X.ndim >= 1:
                        X = X[:]
                    if np.any(np.isnan(X)):
                        logger("NaN values", feat_name + ':' + stat_name, False)
                        fail_test = True
                    if np.all(np.isclose(X, 0.)):
                        logger("All-closed-zeros values",
                               feat_name + ':' + stat_name, False)
                        fail_test = True
                if not fail_test:
                    logger("Check statistics for: ", feat_name, True)
            # check PCA
            if feat_name in all_pca:
                pca = ds[all_pca[feat_name]]
                n = ds[feat_name].shape[0]
                nb_feats = ds[feat_name].shape[-1]
                fail_test = False
                # exclusive upper bound of the slice start; kept >= 1 so that
                # arrays shorter than `nb_samples` do not make randint fail
                max_start = max(n - nb_samples - 1, 1)
                # performing PCA on random samples
                for _ in range(nb_samples):
                    start = np.random.randint(0, max_start)
                    X = pca.transform(
                        ds[feat_name][start:(start + nb_samples)],
                        n_components=max(nb_feats // 2, 1))
                    if np.any(np.isnan(X)):
                        logger("NaN values in PCA", feat_name, False)
                        fail_test = True
                        break
                    if np.all(np.isclose(X, 0.)):
                        logger("All-closed-zeros values in PCA",
                               feat_name, False)
                        fail_test = True
                        break
                if not fail_test:
                    logger("Check PCA for: ", feat_name, True)
        # ====== Do sampling ====== #
        np.random.seed(seed)  # seed for reproducibility
        all_files = list(ds['indices'].keys())
        # never ask for more files than available (sampling is done
        # without replacement)
        all_samples = np.random.choice(all_files,
                                       size=min(nb_samples, len(all_files)),
                                       replace=False)
        # plotting all samples
        for file_name in all_samples:
            X = {}
            for feat_name in all_features:
                start, end = ds.find_prefix(feat_name, 'indices')[file_name]
                feat = ds[feat_name][start:end]
                X[feat_name] = feat
                # some special handling; a failure here is reported in the
                # log but must not stop the validation
                try:
                    _special_cases(X=feat, feat_name=feat_name,
                                   file_name=file_name, ds=ds, path=path)
                except Exception as e:
                    logger("Special case error: %s" % str(e),
                           file_name + ':' + feat_name, False)
            plot_multiple_features(X, title=file_name, fig_width=fig_width)
            figure_path = os.path.join(
                path, '%s.pdf' % _escape_file_name(file_name))
            plot_save(figure_path, log=False, clear_all=True)
            logger("Sample figure saved at: ", figure_path, True)
        # plotting the statistic
        figure_path = os.path.join(path, 'stats.pdf')
        for feat_name, stat_name in all_stats.items():
            X = {name: ds[name][:] for name in stat_name if ds[name].ndim >= 1}
            if len(X) > 0:
                plot_multiple_features(X, title=feat_name,
                                       fig_width=fig_width)
        plot_save(figure_path, log=False, clear_all=True)
        logger("Stats figure save at: ", figure_path, True)
        logger("All reports at folder: ", os.path.abspath(path), True)
    finally:
        # ====== cleaning ====== #
        if stdio_redirected:
            stdio(path=prev_stdio)
        if should_close_ds:
            ds.close()