def evaluate_latent(fn, feeder, title):
  y_true = []
  Z = []
  for outputs in Progbar(feeder.set_batch(batch_mode='file'),
                         name=title,
                         print_report=True,
                         print_summary=False,
                         count_func=lambda x: x[-1].shape[0]):
    name = str(outputs[0])
    idx = int(outputs[1])
    data = outputs[2:]
    assert idx == 0
    y_true.append(name)
    Z.append(fn(*data))
  Z = np.concatenate(Z, axis=0)
  # ====== visualize spectrogram ====== #
  if Z.ndim >= 3:
    sample = np.random.choice(range(len(Z)), size=3, replace=False)
    spec = Z[sample.astype('int32')]
    y = [y_true[int(i)] for i in sample]
    plot_figure(nrow=6, ncol=6)
    for i, (s, tit) in enumerate(zip(spec, y)):
      s = s.reshape(len(s), -1)
      plot_spectrogram(s.T, ax=(1, 3, i + 1), title=tit)
  # ====== visualize each point ====== #
  # flatten to 2D
  Z = np.reshape(Z, newshape=(len(Z), -1))
  # run t-SNE if necessary
  if Z.shape[-1] > 3:
    Z = fast_tsne(Z, n_components=3, n_jobs=8,
                  random_state=K.get_rng().randint(0, int(10e8)))
  # color and marker
  Z_color = [digit_color_map[i.split('_')[-1]] for i in y_true]
  Z_marker = [gender_marker_map[i.split('_')[1]] for i in y_true]
  plot_figure(nrow=6, ncol=20)
  for i, azim in enumerate((15, 60, 120)):
    plot_scatter(x=Z[:, 0], y=Z[:, 1], z=Z[:, 2],
                 ax=(1, 3, i + 1), size=4,
                 color=Z_color, marker=Z_marker, azim=azim,
                 legend=legends if i == 1 else None,
                 legend_ncol=11, fontsize=10, title=title)
  plot_save(os.path.join(FIG_PATH, '%s.pdf' % title))
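# A minimal, self-contained sketch (not part of the original codebase) of the
# "flatten, then embed in 3-D" step used by `evaluate_latent` above.
# Scikit-learn's TSNE stands in for `fast_tsne`; the toy shapes are made up
# purely for illustration.
import numpy as np
from sklearn.manifold import TSNE

Z = np.random.randn(200, 25, 40)   # [n_samples, n_timesteps, n_latents]
Z = Z.reshape(len(Z), -1)          # flatten every sample to a single vector
if Z.shape[-1] > 3:                # reduce to 3-D only when needed
  Z = TSNE(n_components=3, random_state=1234).fit_transform(Z)
print(Z.shape)                     # (200, 3) -> ready for the 3-D scatter plots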
def visualize_latent_space(X_org, X_latent, name, labels, title):
  """
  X_org    : [n_samples, n_timesteps, n_features]
  X_latent : [n_samples, n_timesteps, n_latents]
  """
  assert X_org.shape[0] == X_latent.shape[0] == len(name) == len(labels)
  assert not np.any(np.isnan(X_org))
  assert not np.any(np.isnan(X_latent))
  X_org = X_org.astype('float32')
  X_latent = X_latent.astype('float32')
  # ====== evaluation of the latent space ====== #
  n_channels = 1 if X_latent.ndim == 3 else int(np.prod(X_latent.shape[3:]))
  n_samples = X_org.shape[0]
  # 1 for original, 1 for mean channel, then the rest
  n_row = 1 + 1 + n_channels
  n_col = 3
  V.plot_figure(nrow=n_row + 1, ncol=16)
  # only select 3 random samples
  for i, idx in enumerate(
      sampling_iter(it=range(n_samples), k=n_col, seed=1234)):
    x = X_org[idx]
    # latent tensor can be 3D or 4D
    z = X_latent[idx]
    if z.ndim > 3:
      z = np.reshape(z, newshape=(z.shape[0], z.shape[1], -1))
    elif z.ndim == 2:
      z = np.reshape(z, newshape=(z.shape[0], z.shape[1], 1))
    elif z.ndim == 3:
      pass
    else:
      raise ValueError("No support for z value: %s" % str(z.shape))
    # plot the original acoustic features
    ax = V.plot_spectrogram(x.T, ax=(n_row, n_col, i + 1), title='Org')
    if i == 0:
      ax.set_title("[%s]'%s-%s'" %
                   (str(title), str(name[idx]), str(labels[idx])),
                   fontsize=8)
    else:
      ax.set_title("'%s-%s'" % (str(name[idx]), str(labels[idx])), fontsize=8)
    # plot the mean over channels
    V.plot_spectrogram(np.mean(z, axis=-1).T,
                       ax=(n_row, n_col, i + 4), title='Zmean')
    # plot the first 8 channels
    if n_channels > 1:
      for j in range(min(8, n_channels)):
        V.plot_spectrogram(z[:, :, j].T,
                           ax=(n_row, n_col, j * 3 + 7 + i),
                           title='Z%d' % j)
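# Standalone sketch (illustrative only, not taken from the original code) of
# the shape-normalization rule inside `visualize_latent_space`: every latent
# sample is coerced to [n_timesteps, n_features, n_channels] before plotting.
import numpy as np

def to_3d(z):
  if z.ndim > 3:    # 4D+ : merge all trailing dimensions into channels
    return np.reshape(z, newshape=(z.shape[0], z.shape[1], -1))
  if z.ndim == 2:   # 2D : add a singleton channel axis
    return np.reshape(z, newshape=(z.shape[0], z.shape[1], 1))
  if z.ndim == 3:   # already in the expected layout
    return z
  raise ValueError("No support for z value: %s" % str(z.shape))

print(to_3d(np.zeros((80, 20))).shape)        # (80, 20, 1)
print(to_3d(np.zeros((80, 20, 2, 4))).shape)  # (80, 20, 8)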
def prepare_dnn_data(recipe, feat, utt_length, seed=87654321):
  """
  Return
  ------
  train_feeder : Feeder for training
  valid_feeder : Feeder for validating
  test_ids : Test indices
  test_dat : Data array
  all_speakers : list of all speakers in the training set
  """
  # ====== load dataset ====== #
  frame_length = int(utt_length / FRAME_SHIFT)
  ds = F.Dataset(os.path.join(PATH_ACOUSTIC_FEAT, recipe), read_only=True)
  X = ds[feat]
  train_indices = {name: ds['indices'][name]
                   for name in TRAIN_DATA.keys()}
  test_indices = {name: start_end
                  for name, start_end in ds['indices'].items()
                  if name not in TRAIN_DATA}
  train_indices, valid_indices = train_valid_test_split(
      x=list(train_indices.items()), train=0.9, inc_test=False, seed=seed)
  all_speakers = sorted(set(TRAIN_DATA.values()))
  n_speakers = max(all_speakers) + 1
  print("#Train files:", ctext(len(train_indices), 'cyan'))
  print("#Valid files:", ctext(len(valid_indices), 'cyan'))
  print("#Test files:", ctext(len(test_indices), 'cyan'))
  print("#Speakers:", ctext(n_speakers, 'cyan'))
  recipes = [
      F.recipes.Sequencing(frame_length=frame_length,
                           step_length=frame_length,
                           end='pad', pad_value=0, pad_mode='post',
                           data_idx=0),
      F.recipes.Name2Label(lambda name: TRAIN_DATA[name], ref_idx=0),
      F.recipes.LabelOneHot(nb_classes=n_speakers, data_idx=1)
  ]
  train_feeder = F.Feeder(
      data_desc=F.IndexedData(data=X, indices=train_indices),
      batch_mode='batch', ncpu=7, buffer_size=12)
  valid_feeder = F.Feeder(
      data_desc=F.IndexedData(data=X, indices=valid_indices),
      batch_mode='batch', ncpu=2, buffer_size=4)
  train_feeder.set_recipes(recipes)
  valid_feeder.set_recipes(recipes)
  print(train_feeder)
  # ====== cache the test data ====== #
  cache_dat = os.path.join(PATH_EXP,
                           'test_%s_%d.dat' % (feat, int(utt_length)))
  cache_ids = os.path.join(PATH_EXP,
                           'test_%s_%d.ids' % (feat, int(utt_length)))
  # validate cache files
  if os.path.exists(cache_ids):
    with open(cache_ids, 'rb') as f:
      ids = pickle.load(f)
    if len(ids) != len(test_indices):
      os.remove(cache_ids)
      if os.path.exists(cache_dat):
        os.remove(cache_dat)
  elif os.path.exists(cache_dat):
    os.remove(cache_dat)
  # caching
  if not os.path.exists(cache_dat):
    dat = F.MmapData(cache_dat, dtype='float16',
                     shape=(0, frame_length, X.shape[1]))
    ids = {}
    prog = Progbar(target=len(test_indices))
    s = 0
    for name, (start, end) in test_indices.items():
      y = X[start:end]
      y = segment_axis(y, axis=0,
                       frame_length=frame_length, step_length=frame_length,
                       end='pad', pad_value=0, pad_mode='post')
      dat.append(y)
      # update indices
      ids[name] = (s, s + len(y))
      s += len(y)
      # update progress
      prog.add(1)
    dat.flush()
    dat.close()
    with open(cache_ids, 'wb') as f:
      pickle.dump(ids, f)
  # ====== re-load ====== #
  dat = F.MmapData(cache_dat, read_only=True)
  with open(cache_ids, 'rb') as f:
    ids = pickle.load(f)
  # ====== save some samples ====== #
  sample_path = os.path.join(PATH_EXP,
                             'test_%s_%d.pdf' % (feat, int(utt_length)))
  V.plot_figure(nrow=9, ncol=6)
  for i, (name, (start, end)) in enumerate(
      sampling_iter(it=sorted(ids.items(), key=lambda x: x[0]),
                    k=12, seed=87654321)):
    x = dat[start:end][:].astype('float32')
    ax = V.plot_spectrogram(x[np.random.randint(0, len(x))].T,
                            ax=(12, 1, i + 1), title='')
    ax.set_title(name)
  V.plot_save(sample_path)
  return (train_feeder, valid_feeder, ids, dat, all_speakers)
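# A NumPy-only sketch (an assumption mirroring, not taken from, the
# `segment_axis(...)` call above) of how a variable-length utterance is cut
# into non-overlapping chunks of `frame_length` frames with zero post-padding
# before being appended to the memory-mapped cache.
import numpy as np

def chunk_and_pad(x, frame_length):
  """x: [n_frames, n_features] -> [n_chunks, frame_length, n_features]"""
  n = int(np.ceil(len(x) / float(frame_length))) * frame_length
  x = np.pad(x, [(0, n - len(x)), (0, 0)], mode='constant')  # pad_mode='post'
  return x.reshape(-1, frame_length, x.shape[1])

y = chunk_and_pad(np.ones((250, 40)), frame_length=100)
print(y.shape)  # (3, 100, 40); the last chunk carries 50 zero-padded frames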
def prepare_dnn_data(save_dir, feat_name=None,
                     utt_length=None, seq_mode=None,
                     min_dur=None, min_utt=None,
                     exclude=None, train_proportion=None,
                     return_dataset=False):
  assert os.path.isdir(save_dir), \
      "Path to '%s' is not a directory" % save_dir
  if feat_name is None:
    feat_name = FEATURE_NAME
  if utt_length is None:
    utt_length = int(_args.utt)
  if seq_mode is None:
    seq_mode = str(_args.seq).strip().lower()
  if min_dur is None:
    min_dur = MINIMUM_UTT_DURATION
  if min_utt is None:
    min_utt = MINIMUM_UTT_PER_SPEAKERS
  if exclude is None:
    exclude = str(_args.exclude).strip()
  print("Minimum duration: %s(s)" % ctext(min_dur, 'cyan'))
  print("Minimum utt/spk : %s(utt)" % ctext(min_utt, 'cyan'))
  # ******************** prepare dataset ******************** #
  path = os.path.join(PATH_ACOUSTIC_FEATURES, FEATURE_RECIPE)
  assert os.path.exists(path), \
      "Cannot find acoustic dataset at path: %s" % path
  ds = F.Dataset(path=path, read_only=True)
  rand = np.random.RandomState(seed=Config.SUPER_SEED)
  # ====== find the right feature ====== #
  assert feat_name in ds, "Cannot find feature with name: %s" % feat_name
  X = ds[feat_name]
  ids_name = 'indices_%s' % feat_name
  assert ids_name in ds, "Cannot find indices with name: %s" % ids_name
  # ====== basic paths ====== #
  path_filtered_data = os.path.join(save_dir, 'filtered_files.pkl')
  path_train_files = os.path.join(save_dir, 'train_files.pkl')
  path_speaker_info = os.path.join(save_dir, 'speaker_info.pkl')
  # ******************** cannot find cached data ******************** #
  if any(not os.path.exists(p)
         for p in [path_filtered_data, path_train_files, path_speaker_info]):
    # ====== exclude some datasets ====== #
    if len(exclude) > 0:
      exclude_dataset = {i: 1 for i in exclude.split(',')}
      print("* Excluded dataset:", ctext(exclude_dataset, 'cyan'))
      indices = {name: (start, end)
                 for name, (start, end) in ds[ids_name].items()
                 if ds['dsname'][name] not in exclude_dataset}
      # special case: exclude all the noise data
      if 'noise' in exclude_dataset:
        indices = {name: (start, end)
                   for name, (start, end) in indices.items()
                   if '/' not in name}
    else:
      indices = {i: j for i, j in ds[ids_name].items()}
    # ====== down-sampling if necessary ====== #
    if _args.downsample > 1000:
      dataset2name = defaultdict(list)
      # order the indices so we sample the same set every time
      for name in sorted(indices.keys()):
        dataset2name[ds['dsname'][name]].append(name)
      n_total_files = len(indices)
      n_sample_files = int(_args.downsample)
      # get the proportion of each dataset
      dataset2per = {i: len(j) / n_total_files
                     for i, j in dataset2name.items()}
      # sampling based on the proportions
      _ = {}
      for dsname, flist in dataset2name.items():
        rand.shuffle(flist)
        n_dataset_files = int(dataset2per[dsname] * n_sample_files)
        _.update({i: indices[i] for i in flist[:n_dataset_files]})
      indices = _
    # ====== filter out "bad" samples ====== #
    indices = filter_utterances(X=X, indices=indices, spkid=ds['spkid'],
                                min_utt=min_utt, min_dur=min_dur,
                                remove_min_length=True,
                                remove_min_uttspk=True,
                                n_speakers=None, ncpu=None,
                                save_path=path_filtered_data)
    # ====== all training file names ====== #
    # modify here to train on the full dataset
    all_name = sorted(indices.keys())
    rand.shuffle(all_name)
    rand.shuffle(all_name)
    n_files = len(all_name)
    print("#Files:", ctext(n_files, 'cyan'))
    # ====== speaker mapping ====== #
    name2spk = {name: ds['spkid'][name] for name in all_name}
    all_speakers = sorted(set(name2spk.values()))
    spk2label = {spk: i for i, spk in enumerate(all_speakers)}
    name2label = {name: spk2label[spk] for name, spk in name2spk.items()}
    assert len(name2label) == len(all_name)
    print("#Speakers:", ctext(len(all_speakers), 'cyan'))
    # ====== stratified sampling based on speaker ====== #
    valid_name = []
    # create speakers' clusters
    label2name = defaultdict(list)
    for name, label in sorted(name2label.items(), key=lambda x: x[0]):
      label2name[label].append(name)
    # for each speaker with >= 3 utterances
    for label, name_list in sorted(label2name.items(), key=lambda x: x[0]):
      if len(name_list) < 3:
        continue
      n = max(1, int(0.05 * len(name_list)))  # 5% for validation
      valid_name += rand.choice(a=name_list, size=n, replace=False).tolist()
    # the train list is the rest
    _ = set(valid_name)
    train_name = [i for i in all_name if i not in _]
    # ====== split into training and validation ====== #
    train_indices = {name: indices[name] for name in train_name}
    valid_indices = {name: indices[name] for name in valid_name}
    # ====== save cached data ====== #
    with open(path_train_files, 'wb') as fout:
      pickle.dump({'train': train_indices, 'valid': valid_indices}, fout)
    with open(path_speaker_info, 'wb') as fout:
      pickle.dump({'all_speakers': all_speakers,
                   'name2label': name2label,
                   'spk2label': spk2label}, fout)
  # ******************** load cached data ******************** #
  else:
    with open(path_train_files, 'rb') as fin:
      obj = pickle.load(fin)
      train_indices = obj['train']
      valid_indices = obj['valid']
    with open(path_speaker_info, 'rb') as fin:
      obj = pickle.load(fin)
      all_speakers = obj['all_speakers']
      name2label = obj['name2label']
      spk2label = obj['spk2label']
  # ******************** print log ******************** #
  def summary_indices(ids):
    datasets = defaultdict(int)
    speakers = defaultdict(list)
    text = ''
    for name in sorted(ids.keys()):
      text += name + str(ids[name])
      dsname = ds['dsname'][name]
      datasets[dsname] += 1
      speakers[dsname].append(ds['spkid'][name])
    for dsname in sorted(datasets.keys()):
      print(' %-18s: %s(utt) %s(spk)' % (
          dsname,
          ctext('%6d' % datasets[dsname], 'cyan'),
          ctext(len(set(speakers[dsname])), 'cyan')))
    print(' MD5 checksum:', ctext(crypto.md5_checksum(text), 'lightcyan'))
  # ====== training files ====== #
  print("#Train files:", ctext('%-8d' % len(train_indices), 'cyan'),
        "#spk:", ctext(len(set(name2label[name]
                               for name in train_indices.keys())), 'cyan'),
        "#noise:", ctext(len([name for name in train_indices.keys()
                              if '/' in name]), 'cyan'))
  summary_indices(ids=train_indices)
  # ====== valid files ====== #
  print("#Valid files:", ctext('%-8d' % len(valid_indices), 'cyan'),
        "#spk:", ctext(len(set(name2label[name]
                               for name in valid_indices.keys())), 'cyan'),
        "#noise:", ctext(len([name for name in valid_indices.keys()
                              if '/' in name]), 'cyan'))
  summary_indices(ids=valid_indices)
  # ******************** create the recipe ******************** #
  assert all(name in name2label for name in train_indices.keys())
  assert all(name in name2label for name in valid_indices.keys())
  recipes = prepare_dnn_feeder_recipe(name2label=name2label,
                                      n_speakers=len(all_speakers),
                                      utt_length=utt_length,
                                      seq_mode=seq_mode)
  # ====== downsample the training set for analysis if required ====== #
  if train_proportion is not None:
    assert 0 < train_proportion < 1
    n_training = len(train_indices)
    train_indices = list(train_indices.items())
    rand.shuffle(train_indices)
    rand.shuffle(train_indices)
    train_indices = dict(train_indices[:int(n_training * train_proportion)])
  # ====== create the feeders ====== #
  train_feeder = F.Feeder(
      data_desc=F.IndexedData(data=X, indices=train_indices),
      batch_mode='batch', ncpu=NCPU, buffer_size=256)
  valid_feeder = F.Feeder(
      data_desc=F.IndexedData(data=X, indices=valid_indices),
      batch_mode='batch', ncpu=max(2, NCPU // 4), buffer_size=64)
  train_feeder.set_recipes(recipes)
  valid_feeder.set_recipes(recipes)
  print(train_feeder)
  print(valid_feeder)
  # ====== debugging ====== #
  if IS_DEBUGGING:
    import matplotlib
    matplotlib.use('Agg')
    prog = Progbar(target=len(valid_feeder),
                   print_summary=True,
                   name="Iterating validation set")
    samples = []
    n_visual = 250
    for name, idx, X, y in valid_feeder.set_batch(batch_size=100000,
                                                  batch_mode='file',
                                                  seed=None,
                                                  shuffle_level=0):
      assert idx == 0, \
          "Utterances longer than %.2f(sec)" % (100000 * Config.STEP_LENGTH)
      prog['X'] = X.shape
      prog['y'] = y.shape
      prog.add(X.shape[0])
      # random sampling
      if rand.rand(1) < 0.5 and len(samples) < n_visual:
        for i in rand.randint(0, X.shape[0], size=4, dtype='int32'):
          samples.append((name, X[i], np.argmax(y[i], axis=-1)))
    # plot the spectrograms
    n_visual = len(samples)
    V.plot_figure(nrow=n_visual, ncol=8)
    for i, (name, X, y) in enumerate(samples):
      is_noise = '/' in name
      # `y` is already the integer label (argmax was taken when sampling)
      assert name2label[name] == y, \
          "Speaker label mismatch for file: %s" % name
      name = name.split('/')[0]
      dsname = ds['dsname'][name]
      spkid = ds['spkid'][name]
      ax = V.plot_spectrogram(X.T,
                              ax=(n_visual, 1, i + 1),
                              title='#%d' % (i + 1))
      ax.set_title('[%s][%s]%s %s' %
                   ('noise' if is_noise else 'clean', dsname, name, spkid),
                   fontsize=6)
    # no need for high resolution
    V.plot_save('/tmp/tmp.pdf', dpi=12)
    exit()
  # ====== return ====== #
  if bool(return_dataset):
    return train_feeder, valid_feeder, all_speakers, ds
  return train_feeder, valid_feeder, all_speakers
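# Self-contained sketch (illustrative only; the names and toy data are made
# up) of the stratified validation split used above: every speaker with at
# least 3 utterances contributes 5% of its files (at least one) to validation.
import numpy as np
from collections import defaultdict

def stratified_split(name2label, rand, ratio=0.05, min_utt=3):
  label2name = defaultdict(list)
  for name, label in sorted(name2label.items()):
    label2name[label].append(name)
  valid_name = []
  for label, name_list in sorted(label2name.items()):
    if len(name_list) < min_utt:
      continue
    n = max(1, int(ratio * len(name_list)))
    valid_name += rand.choice(a=name_list, size=n, replace=False).tolist()
  valid_set = set(valid_name)
  train_name = [i for i in name2label if i not in valid_set]
  return train_name, sorted(valid_set)

rand = np.random.RandomState(1234)
name2label = {'utt%02d' % i: i % 4 for i in range(40)}  # 4 speakers, 10 utts each
train, valid = stratified_split(name2label, rand)
print(len(train), len(valid))  # 36 4 -> exactly one validation file per speaker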
# (truncated in the source: the call producing the waveform `y` and the
# feature dictionary `feat` is cut off; only its trailing keyword arguments
# survive below)
              get_vad=True, get_energy=True, get_delta=None,
              pitch_threshold=0.8, fmin=64, fmax=None,
              sr_new=None, preemphasis=0.97)
for i, j in feat.items():  # was `iteritems()`, which is Python 2 only
  print(i, j.shape)
plt.subplot(7, 1, 1)
plt.plot(y)
plt.subplot(7, 1, 2)
visual.plot_spectrogram(feat['spec'].T, vad=feat['vad'])
plt.subplot(7, 1, 3)
visual.plot_spectrogram(feat['mspec'].T, vad=feat['vad'])
plt.subplot(7, 1, 4)
visual.plot_spectrogram(feat['mfcc'].T, vad=feat['vad'])
plt.subplot(7, 1, 5)
visual.plot_spectrogram(feat['qspec'].T, vad=feat['vad'])
plt.subplot(7, 1, 6)
visual.plot_spectrogram(feat['qmspec'].T, vad=feat['vad'])
plt.subplot(7, 1, 7)
visual.plot_spectrogram(feat['qmfcc'].T, vad=feat['vad'])
visual.plot_show(block=True, tight_layout=False)
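# Because the snippet above is truncated and not runnable, here is a minimal
# self-contained stand-in (pure matplotlib + NumPy; the random "features" are
# placeholders, not real acoustic features) showing the same stacked-panel
# layout: waveform on top, one spectrogram-style panel per feature below.
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

y = np.random.randn(16000)                     # fake 1-second waveform
feat = {name: np.abs(np.random.randn(98, 40))  # fake [n_frames, n_bins] features
        for name in ('spec', 'mspec', 'mfcc')}
plt.figure(figsize=(6, 8))
plt.subplot(len(feat) + 1, 1, 1)
plt.plot(y)
for k, (name, f) in enumerate(feat.items()):
  plt.subplot(len(feat) + 1, 1, k + 2)
  plt.imshow(f.T, aspect='auto', origin='lower')  # spectrogram-style panel
  plt.title(name, fontsize=8)
plt.tight_layout()
plt.savefig('/tmp/features.pdf')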