# Script excerpt: compile the train/test/predict functions, carve a validation
# set out of the training data, and start configuring the training loop.
print("#Parameters:", len(parameters))
# `optz` is called with the loss `ce` and the parameters; its result is passed
# below as the `updates` of the training function.
updates = optz(ce, parameters)
K.initialize_all_variables()
# ====== function ====== #
# Training function: applies `updates` and returns [ce, optz.norm, cm].
# NOTE(review): `cm` is presumably a confusion matrix -- confirm.
print('Building training functions ...')
f_train = K.function(inputs, [ce, optz.norm, cm],
                     updates=updates, training=True)
# Testing function: no updates, returns [ce, acc, cm].
print('Building testing functions ...')
f_test = K.function(inputs, [ce, acc, cm],
                    training=False)
# Prediction function: only takes the first input and returns outputs['prob'].
print('Building predicting functions ...')
f_pred = K.function(inputs[0], outputs['prob'], training=False)
# ===========================================================================
# Build trainer
# ===========================================================================
# ====== splitting the data ====== #
# Split the sample indices 80/20 with a fixed seed.
idx = np.arange(len(X_train), dtype='int32')
idx_train, idx_valid = train_valid_test_split(idx, train=0.8,
                                              inc_test=False, seed=1234)
# The validation arrays must be taken BEFORE X_train / y_train are overwritten
# with their training subsets, because the indices refer to the original
# arrays.
X_valid = X_train[idx_valid]
y_valid = y_train[idx_valid]
X_train = X_train[idx_train]
y_train = y_train[idx_train]
print("#Train:", X_train.shape, y_train.shape)
print("#Valid:", X_valid.shape, y_valid.shape)
print("#Test:", X_test.shape, y_test.shape)
# ====== training ====== #
print('Start training ...')
task = training.MainLoop(batch_size=128, seed=1234, shuffle_level=2,
                         allow_rollback=True)
task.set_checkpoint(MODEL_PATH, model)
# NOTE(review): the callback list below continues past the visible excerpt;
# it is left open here exactly as in the original text.
task.set_callbacks([
    training.NaNDetector(),
    training.EarlyStopGeneralizationLoss('valid', ce,
                                         threshold=5, patience=3)
def prepare_data(feat, label, utt_length=0.4, for_ivec=False):
    """Load the acoustic dataset and prepare train/valid/test data.

    File names found in ``ds['indices']`` are split on ``'_'``: field 0
    equal to ``'train'`` marks a training file, field
    ``_support_label[label]`` holds the label, and field 4 is used as the
    speaker ID when splitting training files into train/valid.

    Parameters
    ----------
    feat : str
        name of the feature stored in the dataset at ``PATH_ACOUSTIC``
    label : str
        label to classify (case-insensitive), must be a key of
        ``_support_label``
    utt_length : float
        fraction in (0, 1] of the longest utterance, used as both the frame
        length and the step length when cutting utterances into segments
    for_ivec : bool
        if true, return early with the raw data (no segmenting, no feeders)

    Returns (i-vector, ``for_ivec`` is true)
    ----------------------------------------
    ds[feat]
    train_files : list of (name, (start, end))
    y_train : array of integer labels, one per training file
    test_files : list of (name, (start, end))
    y_test : array of integer labels, one per test file
    labels : list of labels for the classification task

    Returns (x-vector, ``for_ivec`` is false)
    -----------------------------------------
    train : Feeder
        feeder for training data for iterating over pairs of (X, y)
    valid : Feeder
        feeder for validating data for iterating over pairs of (X, y)
    X_test_name : array of segment names
        file names with '.%d' appended for the cut segment ID
    X_test_true : array of integer
        label of each segment
    X_test_data : array (float16)
        test segments, same length as X_test_name
    labels : list of string
        list of labels for the classification task

    Raises
    ------
    AssertionError
        unsupported ``label``, ``utt_length`` outside (0, 1], or ``feat``
        missing from the dataset
    RuntimeError
        ``PATH_ACOUSTIC`` does not exist

    Example
    -------
    (train, valid,
     X_test_name, X_test_true, X_test_data,
     labels) = prepare_data(feat=FEAT, label='gender')
    """
    label = str(label).lower()
    assert label in _support_label, "No support for label: %s" % label
    assert 0 < utt_length <= 1.
    # ====== load dataset ====== #
    if not os.path.exists(PATH_ACOUSTIC):
        raise RuntimeError(
            "Cannot find extracted acoustic features at path: '%s',"
            "run the code speech_features_extraction.py!" % PATH_ACOUSTIC)
    ds = F.Dataset(PATH_ACOUSTIC, read_only=True)
    assert feat in ds, "Cannot find feature with name: %s" % feat
    indices = list(ds['indices'].items())
    K.get_rng().shuffle(indices)

    # ====== helper ====== #
    def is_train(x):
        return x.split('_')[0] == 'train'

    def extract_label(x):
        return x.split('_')[_support_label[label]]

    print("Task:", ctext(label, 'cyan'))
    fn_label, labels = unique_labels([i[0] for i in indices],
                                     key_func=extract_label,
                                     return_labels=True)
    print("Labels:", ctext(labels, 'cyan'))
    # ====== training and test data ====== #
    train_files = []  # (name, (start, end)) ...
    test_files = []
    for name, (start, end) in indices:
        if is_train(name):
            train_files.append((name, (start, end)))
        else:
            test_files.append((name, (start, end)))
    print("#Train:", ctext(len(train_files), 'cyan'))
    print("#Test:", ctext(len(test_files), 'cyan'))
    # ====== for i-vectors ====== #
    y_train = np.array([fn_label(i[0]) for i in train_files])
    y_test = np.array([fn_label(i[0]) for i in test_files])
    if for_ivec:
        return ds[feat], train_files, y_train, test_files, y_test, labels
    # ====== length ====== #
    # Segments are non-overlapping: the step equals the frame length.
    max_length = max(end - start for _, (start, end) in indices)
    frame_length = int(max_length * utt_length)
    step_length = frame_length
    print("Max length  :", ctext(max_length, 'yellow'))
    print("Frame length:", ctext(frame_length, 'yellow'))
    print("Step length :", ctext(step_length, 'yellow'))
    # ====== split dataset ====== #
    # split by speaker ID (field 4 of the file name)
    train_files, valid_files = train_valid_test_split(
        x=train_files, train=0.8,
        cluster_func=None,
        idfunc=lambda x: x[0].split('_')[4],
        inc_test=False)
    print("#File train:", ctext(len(train_files), 'cyan'))
    print("#File valid:", ctext(len(valid_files), 'cyan'))
    print("#File test :", ctext(len(test_files), 'cyan'))
    recipes = [
        F.recipes.Sequencing(frame_length=frame_length,
                             step_length=step_length,
                             end='pad', pad_mode='post', pad_value=0),
        F.recipes.Name2Label(converter_func=fn_label),
        F.recipes.LabelOneHot(nb_classes=len(labels), data_idx=-1)
    ]
    feeder_train = F.Feeder(F.IndexedData(ds[feat], indices=train_files),
                            ncpu=6, batch_mode='batch')
    feeder_valid = F.Feeder(F.IndexedData(ds[feat], indices=valid_files),
                            ncpu=4, batch_mode='batch')
    feeder_test = F.Feeder(F.IndexedData(ds[feat], indices=test_files),
                           ncpu=4, batch_mode='file')
    feeder_train.set_recipes(recipes)
    feeder_valid.set_recipes(recipes)
    feeder_test.set_recipes(recipes)
    print(feeder_train)

    # ====== process X_test, y_test in advance for faster evaluation ====== #
    # NOTE(review): the arguments are not used in the body; presumably they
    # only serve as the cache key for `cache_disk` -- confirm.
    @cache_disk
    def _extract_test_data(feat, label, utt_length):
        prog = Progbar(target=len(feeder_test),
                       print_summary=True,
                       name="Preprocessing test set")
        X_test = defaultdict(list)
        for name, idx, X, y in feeder_test:
            # validate everything as expected
            assert fn_label(name) == np.argmax(y), name  # label is right
            # save to list
            X_test[name].append((idx, X))
            prog.add(X.shape[0])
        # ====== create 1 array for data and dictionary for indices ====== #
        X_test_name = []
        X_test_data = []
        for name, X in X_test.items():
            # restore the original order of the chunks of each file
            X = np.concatenate([x[1] for x in sorted(X, key=lambda i: i[0])],
                               axis=0).astype('float16')
            X_test_name += [name + '.%d' % i for i in range(len(X))]
            X_test_data.append(X)
        X_test_name = np.array(X_test_name)
        X_test_data = np.concatenate(X_test_data, axis=0)
        return X_test_name, X_test_data

    # NOTE(review): the data is stored and returned as float16; nothing here
    # converts it back to float32, callers must cast if they need to.
    X_test_name, X_test_data = _extract_test_data(feat, label, utt_length)
    # Segment names are '<file name>.<segment id>': strip only the LAST
    # '.'-separated field so file names containing dots still map to the
    # right label.
    X_test_true = np.array(
        [fn_label(i.rsplit('.', 1)[0]) for i in X_test_name])
    return feeder_train, feeder_valid, \
        X_test_name, X_test_true, X_test_data, labels
def prepare_dnn_data(recipe, feat, utt_length, seed=87654321):
    """Build the train/valid feeders and the cached, segmented test set.

    Files listed in ``TRAIN_DATA`` are split 90/10 into train/valid (using
    ``seed``); every other file of the dataset is a test file. Test
    utterances are cut into non-overlapping, zero-padded segments of
    ``int(utt_length / FRAME_SHIFT)`` frames and cached as float16 under
    ``PATH_EXP``. A PDF with 12 sampled test spectrograms is also saved.

    NOTE(review): the sampling seed for the PDF is hard-coded rather than
    taken from ``seed``, and the cache file names use ``int(utt_length)``,
    so fractional lengths with the same integer part share one cache whose
    validity is only checked by the number of files -- confirm intent.

    Return
    ------
    train_feeder : Feeder for training
    valid_feeder : Feeder for validating
    test_ids : Test indices, mapping file name -> (start, end) in test_dat
    test_dat : Data array
    all_speakers : list of all speaker in training set
    """
    # ====== load dataset ====== #
    n_frames = int(utt_length / FRAME_SHIFT)
    dataset = F.Dataset(os.path.join(PATH_ACOUSTIC_FEAT, recipe),
                        read_only=True)
    features = dataset[feat]
    file_index = dataset['indices']
    training_files = {name: file_index[name] for name in TRAIN_DATA.keys()}
    test_indices = {
        fname: bounds
        for fname, bounds in file_index.items()
        if fname not in TRAIN_DATA
    }
    train_split, valid_split = train_valid_test_split(
        x=list(training_files.items()),
        train=0.9,
        inc_test=False,
        seed=seed)
    all_speakers = sorted(set(TRAIN_DATA.values()))
    # speaker labels are used as one-hot positions, hence max + 1 classes
    n_speakers = max(all_speakers) + 1
    print("#Train files:", ctext(len(train_split), 'cyan'))
    print("#Valid files:", ctext(len(valid_split), 'cyan'))
    print("#Test files:", ctext(len(test_indices), 'cyan'))
    print("#Speakers:", ctext(n_speakers, 'cyan'))
    # ====== feeders ====== #
    sequencing = F.recipes.Sequencing(frame_length=n_frames,
                                      step_length=n_frames,
                                      end='pad',
                                      pad_value=0,
                                      pad_mode='post',
                                      data_idx=0)
    to_label = F.recipes.Name2Label(lambda name: TRAIN_DATA[name], ref_idx=0)
    to_onehot = F.recipes.LabelOneHot(nb_classes=n_speakers, data_idx=1)
    recipes = [sequencing, to_label, to_onehot]
    train_desc = F.IndexedData(data=features, indices=train_split)
    train_feeder = F.Feeder(data_desc=train_desc,
                            batch_mode='batch',
                            ncpu=7,
                            buffer_size=12)
    valid_desc = F.IndexedData(data=features, indices=valid_split)
    valid_feeder = F.Feeder(data_desc=valid_desc,
                            batch_mode='batch',
                            ncpu=2,
                            buffer_size=4)
    train_feeder.set_recipes(recipes)
    valid_feeder.set_recipes(recipes)
    print(train_feeder)
    # ====== cache the test data ====== #
    cache_key = (feat, int(utt_length))
    cache_dat = os.path.join(PATH_EXP, 'test_%s_%d.dat' % cache_key)
    cache_ids = os.path.join(PATH_EXP, 'test_%s_%d.ids' % cache_key)
    # The cache is trusted only when the ids file exists and lists as many
    # files as the current test set; otherwise both files are discarded.
    cache_is_valid = False
    if os.path.exists(cache_ids):
        with open(cache_ids, 'rb') as f:
            ids = pickle.load(f)
        cache_is_valid = len(ids) == len(test_indices)
        if not cache_is_valid:
            os.remove(cache_ids)
    if not cache_is_valid and os.path.exists(cache_dat):
        os.remove(cache_dat)
    # ====== build the cache when missing ====== #
    if not os.path.exists(cache_dat):
        dat = F.MmapData(cache_dat,
                         dtype='float16',
                         shape=(0, n_frames, features.shape[1]))
        ids = {}
        prog = Progbar(target=len(test_indices))
        offset = 0
        for fname, (first, last) in test_indices.items():
            segments = segment_axis(features[first:last],
                                    axis=0,
                                    frame_length=n_frames,
                                    step_length=n_frames,
                                    end='pad',
                                    pad_value=0,
                                    pad_mode='post')
            dat.append(segments)
            # remember where the segments of this file live in the cache
            n_segments = len(segments)
            ids[fname] = (offset, offset + n_segments)
            offset += n_segments
            prog.add(1)
        dat.flush()
        dat.close()
        with open(cache_ids, 'wb') as f:
            pickle.dump(ids, f)
    # ====== re-load ====== #
    dat = F.MmapData(cache_dat, read_only=True)
    with open(cache_ids, 'rb') as f:
        ids = pickle.load(f)
    # ====== save some sample ====== #
    sample_path = os.path.join(PATH_EXP, 'test_%s_%d.pdf' % cache_key)
    V.plot_figure(nrow=9, ncol=6)
    sampled_files = sampling_iter(it=sorted(ids.items(), key=lambda x: x[0]),
                                  k=12,
                                  seed=87654321)
    for position, (fname, (first, last)) in enumerate(sampled_files, 1):
        segments = dat[first:last][:].astype('float32')
        # plot one randomly chosen segment of the sampled file
        chosen = segments[np.random.randint(0, len(segments))]
        ax = V.plot_spectrogram(chosen.T, ax=(12, 1, position), title='')
        ax.set_title(fname)
    V.plot_save(sample_path)
    return (train_feeder, valid_feeder, ids, dat, all_speakers)
# Script excerpt: build the plot legend table, split the training files by
# speaker, and report what ended up in each subset.

# Map every (digit color, gender marker) pair to a "<gender>-<digit>" label.
legends = OrderedDict()
for g, m in zip(genders, gender_markers):
    for d, c in zip(digits, digit_colors):
        legends[(c, m)] = str(g) + '-' + str(d)
# ===========================================================================
# Split dataset
# ===========================================================================
# Each item `x` is indexed as x[0] to get the file name, whose '_'-separated
# fields are read positionally: 1 -> gender, 2 -> age, 3 -> dialect,
# 4 -> speaker (field meanings taken from the names and report labels below).
split_spkID = lambda x: x[0].split('_')[4]
split_dialID_spkID = lambda x: x[0].split('_')[3] + x[0].split('_')[4]
split_genID_spkID = lambda x: x[0].split('_')[1] + x[0].split('_')[4]
split_genID = lambda x: x[0].split('_')[1]
split_ageID = lambda x: x[0].split('_')[2]
# stratified sampling for each digit, split based on speaker ID
# NOTE: `train` is rebound to its 60% subset here; `10e8` is the float 1e9.
train, valid = train_valid_test_split(x=train, train=0.6,
                                      inc_test=False,
                                      idfunc=split_spkID,
                                      seed=K.get_rng().randint(0, 10e8))
# make sure both train and valid set have all the numbers
# (the last '_'-separated field of the file name is compared with `digits`)
assert set(i[0].split('_')[-1] for i in train) == set(digits)
assert set(i[0].split('_')[-1] for i in valid) == set(digits)
# ====== report ====== #
# Sorted unique values of the `idx`-th name field over a file list.
report_info = lambda idx, flist: sorted(
    list(set(i[0].split('_')[idx] for i in flist)))
print(ctext("#File train:", 'yellow'), len(train), train[:2])
print(' * Genders:', ctext(report_info(1, train), 'cyan'))
print(' * Age:', ctext(report_info(2, train), 'cyan'))
print(' * Dialects:', ctext(report_info(3, train), 'cyan'))
print(' * Speakers:', ctext(report_info(4, train), 'cyan'))
print(ctext("#File valid:", 'yellow'), len(valid), valid[:2])
print(' * Genders:', ctext(report_info(1, valid), 'cyan'))
# Script excerpt: read the protein labels, then split every array into
# train/valid/test subsets with one shared random permutation.

# Protein names are the metadata columns 12..21 with the "protein.count."
# prefix removed; the protein values are the matching metadata columns.
y_prot_names = [
    i.replace("protein.count.", "") for i in ds['meta_cols'][12:22]
]
y_prot = ds['metadata'][:, 12:22]
# ====== log ====== #
assert y_mRNA_names == mRNA_ORDER and y_prot_names == PROTEIN_ORDER
# ====== load thresholded protein values ====== #
# Both arrays are optional: they stay None when the dataset lacks them.
y_bin = ds['y_bin'] if 'y_bin' in ds else None
y_prob = ds['y_prob'] if 'y_prob' in ds else None


def _take_rows(arr, rows):
    """Return ``arr[rows]``, or None when the optional array is missing."""
    return None if arr is None else arr[rows]


def _shape_of(arr):
    """Return ``arr.shape``, or None when the optional array is missing."""
    return None if arr is None else arr.shape


# ===========================================================================
# split the data
# ===========================================================================
num_samples = data.shape[0]
ids = get_rng().permutation(num_samples)
train, valid, test = train_valid_test_split(ids,
                                            train=TRAIN_PERCENTAGE,
                                            inc_test=True,
                                            seed=get_rng().randint(0, 10e8))
# y_bin / y_prob may be None (see above): indexing them unconditionally
# would raise TypeError, so their subsets are None in that case.
X_train, y_mRNA_train, y_prot_train = data[train], y_mRNA[train], y_prot[train]
y_bin_train, y_prob_train = _take_rows(y_bin, train), _take_rows(y_prob, train)
X_valid, y_mRNA_valid, y_prot_valid = data[valid], y_mRNA[valid], y_prot[valid]
y_bin_valid, y_prob_valid = _take_rows(y_bin, valid), _take_rows(y_prob, valid)
X_test, y_mRNA_test, y_prot_test = data[test], y_mRNA[test], y_prot[test]
y_bin_test, y_prob_test = _take_rows(y_bin, test), _take_rows(y_prob, test)
print(ctext("Train:", 'cyan'), X_train.shape, y_mRNA_train.shape,
      y_prot_train.shape, _shape_of(y_bin_train), _shape_of(y_prob_train))
print(ctext("Valid:", 'cyan'), X_valid.shape, y_mRNA_valid.shape,
      y_prot_valid.shape, _shape_of(y_bin_valid), _shape_of(y_prob_valid))
print(ctext("Test:", 'cyan'), X_test.shape, y_mRNA_test.shape,
      y_prot_test.shape, _shape_of(y_bin_test), _shape_of(y_prob_test))
# ===========================================================================
# Evaluation
# Script excerpt: measure the longest utterance, split the files into
# train/valid/test and create one feeder per subset.
print(ds)
nb_classes = 10 # 10 digits (0-9)
# ===========================================================================
# Create feeder
# ===========================================================================
# Flatten every index entry into a (name, start, end) triple.
# NOTE(review): this iterates ds['indices'] directly and unpacks
# (name, (start, end)) -- confirm it yields pairs rather than bare keys.
indices = [(name, start, end) for name, (start, end) in ds['indices']]
# Longest utterance, computed as end - start - 1 over all files.
longest_utterances = max(int(end) - int(start) - 1
                         for i, start, end in indices)
# Longest single (start, end) span over every entry of ds['vadids'].
longest_vad = max(end - start
                  for name, vad in ds['vadids']
                  for (start, end) in vad)
print("Longest Utterance:", longest_utterances)
print("Longest Vad:", longest_vad)
# Shuffle in place (global numpy RNG) before splitting; no seed is passed to
# the split itself.
np.random.shuffle(indices)
train, valid, test = train_valid_test_split(indices, train=0.6, inc_test=True)
# The first character of each file name is converted to int and counted.
# NOTE(review): presumably that character is the spoken digit -- confirm.
print('Nb train:', len(train), freqcount([int(i[0][0]) for i in train]))
print('Nb valid:', len(valid), freqcount([int(i[0][0]) for i in valid]))
print('Nb test:', len(test), freqcount([int(i[0][0]) for i in test]))
train_feeder = F.Feeder(ds['mspec'], train, ncpu=1)
test_feeder = F.Feeder(ds['mspec'], test, ncpu=2)
valid_feeder = F.Feeder(ds['mspec'], valid, ncpu=2)
# NOTE(review): the recipe list below continues past the visible excerpt; it
# is left open here exactly as in the original text.
recipes = [
    F.recipes.Name2Trans(converter_func=lambda x: int(x[0])),
    F.recipes.Normalization(mean=ds['mspec_mean'],
                            std=ds['mspec_std'],
                            local_normalize=False),
    F.recipes.Sequencing(frame_length=longest_utterances, hop_length=1,
def prepare_dnn_data(recipe, feat, utt_length, seed=52181208):
    """Build the train/valid feeders and the cached, segmented test set.

    Files listed in ``TRAIN_DATA`` are split 90/10 into train/valid (using
    ``seed``); every other file of the dataset is a test file. Test
    utterances are cut into non-overlapping, zero-padded segments of
    ``int(utt_length / FRAME_SHIFT)`` frames and cached as float16 under
    ``PATH_EXP``. A PDF with 12 sampled test spectrograms is also saved.

    NOTE(review): the sampling seed for the PDF is hard-coded rather than
    taken from ``seed``, and the cache file names use ``int(utt_length)``,
    so fractional lengths with the same integer part share one cache whose
    validity is only checked by the number of files -- confirm intent.

    Return
    ------
    train_feeder : Feeder for training
    valid_feeder : Feeder for validating
    test_ids : Test indices, mapping file name -> (start, end) in test_dat
    test_dat : Data array
    all_speakers : list of all speaker in training set
    """
    # ====== load dataset ====== #
    n_frames = int(utt_length / FRAME_SHIFT)
    dataset = F.Dataset(os.path.join(PATH_ACOUSTIC_FEAT, recipe),
                        read_only=True)
    features = dataset[feat]
    file_index = dataset['indices']
    training_files = {name: file_index[name] for name in TRAIN_DATA.keys()}
    test_indices = {
        fname: bounds
        for fname, bounds in file_index.items()
        if fname not in TRAIN_DATA
    }
    train_split, valid_split = train_valid_test_split(
        x=list(training_files.items()),
        train=0.9,
        inc_test=False,
        seed=seed)
    all_speakers = sorted(set(TRAIN_DATA.values()))
    # speaker labels are used as one-hot positions, hence max + 1 classes
    n_speakers = max(all_speakers) + 1
    print("#Train files:", ctext(len(train_split), 'cyan'))
    print("#Valid files:", ctext(len(valid_split), 'cyan'))
    print("#Test files:", ctext(len(test_indices), 'cyan'))
    print("#Speakers:", ctext(n_speakers, 'cyan'))
    # ====== feeders ====== #
    sequencing = F.recipes.Sequencing(frame_length=n_frames,
                                      step_length=n_frames,
                                      end='pad',
                                      pad_value=0,
                                      pad_mode='post',
                                      data_idx=0)
    to_label = F.recipes.Name2Label(lambda name:TRAIN_DATA[name], ref_idx=0)
    to_onehot = F.recipes.LabelOneHot(nb_classes=n_speakers, data_idx=1)
    recipes = [sequencing, to_label, to_onehot]
    train_desc = F.IndexedData(data=features, indices=train_split)
    train_feeder = F.Feeder(data_desc=train_desc,
                            batch_mode='batch',
                            ncpu=7,
                            buffer_size=12)
    valid_desc = F.IndexedData(data=features, indices=valid_split)
    valid_feeder = F.Feeder(data_desc=valid_desc,
                            batch_mode='batch',
                            ncpu=2,
                            buffer_size=4)
    train_feeder.set_recipes(recipes)
    valid_feeder.set_recipes(recipes)
    print(train_feeder)
    # ====== cache the test data ====== #
    cache_key = (feat, int(utt_length))
    cache_dat = os.path.join(PATH_EXP, 'test_%s_%d.dat' % cache_key)
    cache_ids = os.path.join(PATH_EXP, 'test_%s_%d.ids' % cache_key)
    # The cache is trusted only when the ids file exists and lists as many
    # files as the current test set; otherwise both files are discarded.
    cache_is_valid = False
    if os.path.exists(cache_ids):
        with open(cache_ids, 'rb') as f:
            ids = pickle.load(f)
        cache_is_valid = len(ids) == len(test_indices)
        if not cache_is_valid:
            os.remove(cache_ids)
    if not cache_is_valid and os.path.exists(cache_dat):
        os.remove(cache_dat)
    # ====== build the cache when missing ====== #
    if not os.path.exists(cache_dat):
        dat = F.MmapData(cache_dat,
                         dtype='float16',
                         shape=(0, n_frames, features.shape[1]))
        ids = {}
        prog = Progbar(target=len(test_indices))
        offset = 0
        for fname, (first, last) in test_indices.items():
            segments = segment_axis(features[first:last],
                                    axis=0,
                                    frame_length=n_frames,
                                    step_length=n_frames,
                                    end='pad',
                                    pad_value=0,
                                    pad_mode='post')
            dat.append(segments)
            # remember where the segments of this file live in the cache
            n_segments = len(segments)
            ids[fname] = (offset, offset + n_segments)
            offset += n_segments
            prog.add(1)
        dat.flush()
        dat.close()
        with open(cache_ids, 'wb') as f:
            pickle.dump(ids, f)
    # ====== re-load ====== #
    dat = F.MmapData(cache_dat, read_only=True)
    with open(cache_ids, 'rb') as f:
        ids = pickle.load(f)
    # ====== save some sample ====== #
    sample_path = os.path.join(PATH_EXP, 'test_%s_%d.pdf' % cache_key)
    V.plot_figure(nrow=9, ncol=6)
    sampled_files = sampling_iter(it=sorted(ids.items(), key=lambda x: x[0]),
                                  k=12,
                                  seed=52181208)
    for position, (fname, (first, last)) in enumerate(sampled_files, 1):
        segments = dat[first:last][:].astype('float32')
        # plot one randomly chosen segment of the sampled file
        chosen = segments[np.random.randint(0, len(segments))]
        ax = V.plot_spectrogram(chosen.T, ax=(12, 1, position), title='')
        ax.set_title(fname)
    V.plot_save(sample_path)
    return (train_feeder, valid_feeder, ids, dat, all_speakers)
def prepare_data(feat, label, utt_length=0.4, for_ivec=False):
    """Load the acoustic dataset and prepare train/valid/test data.

    File names found in ``ds['indices']`` are split on ``'_'``: field 0
    equal to ``'train'`` marks a training file, field
    ``_support_label[label]`` holds the label, and field 4 is used as the
    speaker ID when splitting training files into train/valid.

    Parameters
    ----------
    feat : str
        name of the feature stored in the dataset at ``PATH_ACOUSTIC``
    label : str
        label to classify (case-insensitive), must be a key of
        ``_support_label``
    utt_length : float
        fraction in (0, 1] of the longest utterance, used as both the frame
        length and the step length when cutting utterances into segments
    for_ivec : bool
        if true, return early with the raw data (no segmenting, no feeders)

    Returns (i-vector, ``for_ivec`` is true)
    ----------------------------------------
    ds[feat]
    train_files : list of (name, (start, end))
    y_train : array of integer labels, one per training file
    test_files : list of (name, (start, end))
    y_test : array of integer labels, one per test file
    labels : list of labels for the classification task

    Returns (x-vector, ``for_ivec`` is false)
    -----------------------------------------
    train : Feeder
        feeder for training data for iterating over pairs of (X, y)
    valid : Feeder
        feeder for validating data for iterating over pairs of (X, y)
    X_test_name : array of segment names
        file names with '.%d' appended for the cut segment ID
    X_test_true : array of integer
        label of each segment
    X_test_data : array (float16)
        test segments, same length as X_test_name
    labels : list of string
        list of labels for the classification task

    Raises
    ------
    AssertionError
        unsupported ``label``, ``utt_length`` outside (0, 1], or ``feat``
        missing from the dataset
    RuntimeError
        ``PATH_ACOUSTIC`` does not exist

    Example
    -------
    (train, valid,
     X_test_name, X_test_true, X_test_data,
     labels) = prepare_data(feat=FEAT, label='gender')
    """
    label = str(label).lower()
    assert label in _support_label, "No support for label: %s" % label
    assert 0 < utt_length <= 1.
    # ====== load dataset ====== #
    if not os.path.exists(PATH_ACOUSTIC):
        raise RuntimeError(
            "Cannot find extracted acoustic features at path: '%s',"
            "run the code speech_features_extraction.py!" % PATH_ACOUSTIC)
    ds = F.Dataset(PATH_ACOUSTIC, read_only=True)
    assert feat in ds, "Cannot find feature with name: %s" % feat
    indices = list(ds['indices'].items())
    K.get_rng().shuffle(indices)

    # ====== helper ====== #
    def is_train(x):
        return x.split('_')[0] == 'train'

    def extract_label(x):
        return x.split('_')[_support_label[label]]

    print("Task:", ctext(label, 'cyan'))
    fn_label, labels = unique_labels([i[0] for i in indices],
                                     key_func=extract_label,
                                     return_labels=True)
    print("Labels:", ctext(labels, 'cyan'))
    # ====== training and test data ====== #
    train_files = []  # (name, (start, end)) ...
    test_files = []
    for name, (start, end) in indices:
        if is_train(name):
            train_files.append((name, (start, end)))
        else:
            test_files.append((name, (start, end)))
    print("#Train:", ctext(len(train_files), 'cyan'))
    print("#Test:", ctext(len(test_files), 'cyan'))
    # ====== for i-vectors ====== #
    y_train = np.array([fn_label(i[0]) for i in train_files])
    y_test = np.array([fn_label(i[0]) for i in test_files])
    if for_ivec:
        return ds[feat], train_files, y_train, test_files, y_test, labels
    # ====== length ====== #
    # Segments are non-overlapping: the step equals the frame length.
    max_length = max(end - start for _, (start, end) in indices)
    frame_length = int(max_length * utt_length)
    step_length = frame_length
    print("Max length  :", ctext(max_length, 'yellow'))
    print("Frame length:", ctext(frame_length, 'yellow'))
    print("Step length :", ctext(step_length, 'yellow'))
    # ====== split dataset ====== #
    # split by speaker ID (field 4 of the file name)
    train_files, valid_files = train_valid_test_split(
        x=train_files, train=0.8,
        cluster_func=None,
        idfunc=lambda x: x[0].split('_')[4],
        inc_test=False)
    print("#File train:", ctext(len(train_files), 'cyan'))
    print("#File valid:", ctext(len(valid_files), 'cyan'))
    print("#File test :", ctext(len(test_files), 'cyan'))
    recipes = [
        F.recipes.Sequencing(frame_length=frame_length,
                             step_length=step_length,
                             end='pad', pad_mode='post', pad_value=0),
        F.recipes.Name2Label(converter_func=fn_label),
        F.recipes.LabelOneHot(nb_classes=len(labels), data_idx=-1)
    ]
    feeder_train = F.Feeder(F.IndexedData(ds[feat], indices=train_files),
                            ncpu=6, batch_mode='batch')
    feeder_valid = F.Feeder(F.IndexedData(ds[feat], indices=valid_files),
                            ncpu=4, batch_mode='batch')
    feeder_test = F.Feeder(F.IndexedData(ds[feat], indices=test_files),
                           ncpu=4, batch_mode='file')
    feeder_train.set_recipes(recipes)
    feeder_valid.set_recipes(recipes)
    feeder_test.set_recipes(recipes)
    print(feeder_train)

    # ====== process X_test, y_test in advance for faster evaluation ====== #
    # NOTE(review): the arguments are not used in the body; presumably they
    # only serve as the cache key for `cache_disk` -- confirm.
    @cache_disk
    def _extract_test_data(feat, label, utt_length):
        prog = Progbar(target=len(feeder_test),
                       print_summary=True,
                       name="Preprocessing test set")
        X_test = defaultdict(list)
        for name, idx, X, y in feeder_test:
            # validate everything as expected
            assert fn_label(name) == np.argmax(y), name  # label is right
            # save to list
            X_test[name].append((idx, X))
            prog.add(X.shape[0])
        # ====== create 1 array for data and dictionary for indices ====== #
        X_test_name = []
        X_test_data = []
        for name, X in X_test.items():
            # restore the original order of the chunks of each file
            X = np.concatenate([x[1] for x in sorted(X, key=lambda i: i[0])],
                               axis=0).astype('float16')
            X_test_name += [name + '.%d' % i for i in range(len(X))]
            X_test_data.append(X)
        X_test_name = np.array(X_test_name)
        X_test_data = np.concatenate(X_test_data, axis=0)
        return X_test_name, X_test_data

    # NOTE(review): the data is stored and returned as float16; nothing here
    # converts it back to float32, callers must cast if they need to.
    X_test_name, X_test_data = _extract_test_data(feat, label, utt_length)
    # Segment names are '<file name>.<segment id>': strip only the LAST
    # '.'-separated field so file names containing dots still map to the
    # right label.
    X_test_true = np.array(
        [fn_label(i.rsplit('.', 1)[0]) for i in X_test_name])
    return feeder_train, feeder_valid, \
        X_test_name, X_test_true, X_test_data, labels