def validate_features_dataset(output_dataset_path, ds_validation_path):
    """Plot a sample of utterances from an extracted feature dataset.

    Every dataset entry whose key contains ``'indices_'`` is treated as the
    index table of the feature named by the last ``'_'``-separated token of
    that key. Up to 250 file names are sampled from the first index table,
    restricted to the names present in *every* index table, and all features
    of each sampled file are drawn in one figure.

    Parameters
    ----------
    output_dataset_path : str
        path of the dataset, opened read-only with ``F.Dataset``
    ds_validation_path : str
        destination handed to ``V.plot_save`` for the generated figures
    """
    dataset = F.Dataset(output_dataset_path, read_only=True)
    print(dataset)

    # feature name -> (index table, feature data)
    feature_map = {}
    for key, table in dataset.items():
        if 'indices_' not in key:
            continue
        feat_name = key.split('_')[-1]
        feature_map[feat_name] = (table, dataset[feat_name])

    index_tables = [pair[0] for pair in feature_map.values()]
    # ====== sampling 250 files ====== #
    candidates = sampling_iter(it=index_tables[0].keys(),
                               k=250,
                               seed=Config.SUPER_SEED)
    # keep only the files that every feature has an index entry for
    sampled_files = [
        fname for fname in candidates
        if all(fname in table for table in index_tables)
    ]
    print("#Samples:", ctext(len(sampled_files), 'cyan'))
    # ====== ignore the 20-figures warning ====== #
    with catch_warnings_ignore(RuntimeWarning):
        for file_name in sampled_files:
            per_feature = {}
            for feat_name, (table, data) in feature_map.items():
                start, end = table[file_name]
                per_feature[feat_name] = data[start:end][:].astype('float32')
            figure_title = '[%s]%s' % (dataset['dsname'][file_name], file_name)
            V.plot_multiple_features(features=per_feature,
                                     fig_width=20,
                                     title=figure_title)
    V.plot_save(ds_validation_path, dpi=12)
def test_speech_processor(self):
    """Run ``F.SpeechProcessor`` on the digit corpus and check the outputs.

    The sum of every extracted feature (and of the PCA components) is
    compared, loosely, against the reference values stored in
    ``test_speech_features``. When the corpus cannot be loaded the test is
    reported as *skipped* instead of silently passing.
    """
    try:
        datapath = F.load_digit_wav()
    except Exception as e:
        # Loading the corpus can fail for reasons unrelated to the code
        # under test; skipTest raises, so nothing below is executed.
        self.skipTest('Error (skip this test): %s' % str(e))
    output_path = utils.get_datasetpath(name='digit', override=True)
    feat = F.SpeechProcessor(datapath,
                             output_path,
                             audio_ext='wav',
                             sr_new=8000,
                             win=0.02,
                             shift=0.01,
                             nb_melfilters=40,
                             nb_ceps=13,
                             get_delta=2,
                             get_energy=True,
                             pitch_threshold=0.8,
                             get_spec=True,
                             get_mspec=True,
                             get_mfcc=True,
                             get_pitch=True,
                             get_vad=True,
                             save_stats=True,
                             substitute_nan=None,
                             dtype='float32',
                             datatype='memmap',
                             ncache=0.12,
                             ncpu=4)
    feat.run()
    ds = F.Dataset(output_path)

    def is_equal(x1, x2):
        # Loose comparison: both values are rendered as float32 text and
        # considered equal when at least half of the characters (of the
        # longer rendering) match position by position.
        x1 = repr(np.array(x1, 'float32').tolist())
        x2 = repr(np.array(x2, 'float32').tolist())
        n_match = sum(1 for i, j in zip(x1, x2) if i == j)
        return n_match >= max(len(x1), len(x2)) // 2

    # these numbers are highly numerical instable
    for key in ds.keys():
        if key == 'indices.csv':
            self.assertIsInstance(ds[key], str)
        elif '_' not in key:
            # plain feature: only checked when a PCA model was stored for it
            if key + '_pca' in ds:
                self.assertTrue(
                    is_equal(np.sum(ds[key][:], dtype='float32'),
                             test_speech_features[key]))
        elif '_pca' not in key:
            # derived array whose name contains '_' but is not a PCA model
            self.assertTrue(
                is_equal(np.sum(ds[key][:], dtype='float32'),
                         test_speech_features[key]))
        else:
            # PCA model: compare the sum of its components
            self.assertTrue(
                is_equal(np.sum(ds[key].components_),
                         test_speech_features[key]))
def prepare_ivec_data(recipe, feat):
    """Load one feature and split its utterance indices into train and test.

    Parameters
    ----------
    recipe : str
        sub-directory of ``PATH_ACOUSTIC_FEAT`` holding the dataset
    feat : str
        name of the feature to read from the dataset

    Returns
    -------
    (X, train_indices, test_indices) where ``X`` is ``ds[feat]``,
    ``train_indices`` maps every name in ``TRAIN_DATA`` to its entry in
    ``ds['indices']`` and ``test_indices`` holds all the remaining entries.
    """
    dataset_path = os.path.join(PATH_ACOUSTIC_FEAT, recipe)
    dataset = F.Dataset(dataset_path, read_only=True)
    features = dataset[feat]

    train_indices = {}
    for name in TRAIN_DATA.keys():
        train_indices[name] = dataset['indices'][name]

    # everything that is not a training utterance is used for testing
    test_indices = {}
    for name, start_end in dataset['indices'].items():
        if name not in TRAIN_DATA:
            test_indices[name] = start_end

    print("#Train files:", ctext(len(train_indices), 'cyan'))
    print("#Test files:", ctext(len(test_indices), 'cyan'))
    return features, train_indices, test_indices
get_logpath(name="analyze_data.log", increasing=True, odin_base=False, root=ANALYSIS_DIR)) print(ctext(FEATURE_RECIPE, 'lightyellow')) print(ctext(FEATURE_NAME, 'lightyellow')) assert os.path.isdir(os.path.join(PATH_ACOUSTIC_FEATURES, FEATURE_RECIPE)) # ====== essential path ====== # figure_path = os.path.join( ANALYSIS_DIR, '%s_%s.pdf' % (FEATURE_RECIPE.replace('_', ''), FEATURE_NAME)) print(ctext(figure_path, 'lightyellow')) # =========================================================================== # Load the data # =========================================================================== ds = F.Dataset(os.path.join(PATH_ACOUSTIC_FEATURES, FEATURE_RECIPE), read_only=True) X = ds[FEATURE_NAME] # remove all noise data indices = { name: (start, end) for name, (start, end) in ds['indices_%s' % FEATURE_NAME].items() if '/' not in name } all_dataset = sorted(set(ds['dsname'].values())) print("All dataset:", ctext(all_dataset, 'cyan')) # =========================================================================== # Helpers # =========================================================================== all_percentiles = [.01, .05, .1, .25, .5, .75, .9, .95, .99]
def prepare_dnn_data(recipe, feat, utt_length, seed=87654321):
    """Build training/validation feeders and a cached, segmented test set.

    Parameters
    ----------
    recipe : str
        sub-directory of ``PATH_ACOUSTIC_FEAT`` holding the dataset
    feat : str
        name of the feature to read from the dataset
    utt_length : number
        segment length; divided by ``FRAME_SHIFT`` to get the frame count
    seed : int
        seed forwarded to ``train_valid_test_split``

    Return
    ------
    train_feeder : Feeder for training
    valid_feeder : Feeder for validating
    test_ids : dict mapping test file name -> (start, end) into ``test_dat``
    test_dat : memory-mapped array of the segmented test data
    all_speakers : sorted list of all speakers in the training set
    """
    # ====== load dataset ====== #
    frame_length = int(utt_length / FRAME_SHIFT)
    dataset = F.Dataset(os.path.join(PATH_ACOUSTIC_FEAT, recipe),
                        read_only=True)
    X = dataset[feat]
    train_indices = {
        name: dataset['indices'][name] for name in TRAIN_DATA.keys()
    }
    test_indices = {}
    for name, start_end in dataset['indices'].items():
        if name not in TRAIN_DATA:
            test_indices[name] = start_end
    train_indices, valid_indices = train_valid_test_split(
        x=list(train_indices.items()), train=0.9, inc_test=False, seed=seed)
    all_speakers = sorted(set(TRAIN_DATA.values()))
    # labels are used directly as class indices, hence max + 1 classes
    n_speakers = max(all_speakers) + 1
    print("#Train files:", ctext(len(train_indices), 'cyan'))
    print("#Valid files:", ctext(len(valid_indices), 'cyan'))
    print("#Test files:", ctext(len(test_indices), 'cyan'))
    print("#Speakers:", ctext(n_speakers, 'cyan'))

    # ====== feeders ====== #
    recipes = [
        F.recipes.Sequencing(frame_length=frame_length,
                             step_length=frame_length,
                             end='pad',
                             pad_value=0,
                             pad_mode='post',
                             data_idx=0),
        F.recipes.Name2Label(lambda name: TRAIN_DATA[name], ref_idx=0),
        F.recipes.LabelOneHot(nb_classes=n_speakers, data_idx=1)
    ]
    train_feeder = F.Feeder(
        data_desc=F.IndexedData(data=X, indices=train_indices),
        batch_mode='batch',
        ncpu=7,
        buffer_size=12)
    valid_feeder = F.Feeder(
        data_desc=F.IndexedData(data=X, indices=valid_indices),
        batch_mode='batch',
        ncpu=2,
        buffer_size=4)
    for feeder in (train_feeder, valid_feeder):
        feeder.set_recipes(recipes)
    print(train_feeder)

    # ====== cache the test data ====== #
    cache_stem = 'test_%s_%d' % (feat, int(utt_length))
    cache_dat = os.path.join(PATH_EXP, cache_stem + '.dat')
    cache_ids = os.path.join(PATH_EXP, cache_stem + '.ids')
    # The data file is only trusted when an index file exists next to it and
    # lists exactly as many files as the current test set.
    if os.path.exists(cache_ids):
        with open(cache_ids, 'rb') as f:
            ids = pickle.load(f)
        data_is_stale = len(ids) != len(test_indices)
        if data_is_stale:
            os.remove(cache_ids)
    else:
        data_is_stale = True
    if data_is_stale and os.path.exists(cache_dat):
        os.remove(cache_dat)

    # ====== (re-)create the cache ====== #
    if not os.path.exists(cache_dat):
        dat = F.MmapData(cache_dat,
                         dtype='float16',
                         shape=(0, frame_length, X.shape[1]))
        ids = {}
        prog = Progbar(target=len(test_indices))
        offset = 0
        for name, (start, end) in test_indices.items():
            segments = segment_axis(X[start:end],
                                    axis=0,
                                    frame_length=frame_length,
                                    step_length=frame_length,
                                    end='pad',
                                    pad_value=0,
                                    pad_mode='post')
            dat.append(segments)
            # position of this file's segments inside the cached array
            n_segments = len(segments)
            ids[name] = (offset, offset + n_segments)
            offset += n_segments
            prog.add(1)
        dat.flush()
        dat.close()
        with open(cache_ids, 'wb') as f:
            pickle.dump(ids, f)

    # ====== re-load ====== #
    dat = F.MmapData(cache_dat, read_only=True)
    with open(cache_ids, 'rb') as f:
        ids = pickle.load(f)

    # ====== save some sample ====== #
    sample_path = os.path.join(PATH_EXP, cache_stem + '.pdf')
    V.plot_figure(nrow=9, ncol=6)
    sorted_ids = sorted(ids.items(), key=lambda x: x[0])
    sampled = sampling_iter(it=sorted_ids, k=12, seed=87654321)
    for i, (name, (start, end)) in enumerate(sampled):
        x = dat[start:end][:].astype('float32')
        # one randomly picked segment of the file is shown
        segment = x[np.random.randint(0, len(x))]
        ax = V.plot_spectrogram(segment.T, ax=(12, 1, i + 1), title='')
        ax.set_title(name)
    V.plot_save(sample_path)
    return (train_feeder, valid_feeder, ids, dat, all_speakers)
def prepare_data(feat, label, utt_length=0.4, for_ivec=False):
    """Load the acoustic dataset and prepare it for the given labelling task.

    ``utt_length`` is a fraction in (0, 1] of the longest utterance; it sets
    both the frame and the step length of the segments fed to the network.

    Returns (i-vector)
    ------------------
    ds[feat]
    train_files
    y_train
    test_files
    y_test
    labels

    Returns (x-vector)
    ------------------
    train : Feeder
      feeder for training data for iterating over pair of (X, y)
    valid : Feeder
      feeder for validating data for iterating over pair of (X, y)
    X_test_name : list of file names
      file names are append with '.%d' for cut segment ID
    X_test_true : list of integer
      label of each sample
    X_test_data : array
      list of test data same length as X_test_name
    labels : list of string
      list of labels for classification task

    Example
    -------
    (train, valid,
     X_test_name, X_test_true, X_test_data,
     labels) = prepare_data(feat=FEAT, label='gender')
    """
    label = str(label).lower()
    assert label in _support_label, "No support for label: %s" % label
    assert 0 < utt_length <= 1.
    # ====== load dataset ====== #
    if not os.path.exists(PATH_ACOUSTIC):
        raise RuntimeError(
            "Cannot find extracted acoustic features at path: '%s',"
            "run the code speech_features_extraction.py!" % PATH_ACOUSTIC)
    ds = F.Dataset(PATH_ACOUSTIC, read_only=True)
    assert feat in ds, "Cannot find feature with name: %s" % feat
    indices = list(ds['indices'].items())
    K.get_rng().shuffle(indices)

    # ====== helper ====== #
    def extract_label(x):
        # the label is one of the '_'-separated fields of the file name
        return x.split('_')[_support_label[label]]

    print("Task:", ctext(label, 'cyan'))
    all_names = [entry[0] for entry in indices]
    fn_label, labels = unique_labels(all_names,
                                     key_func=extract_label,
                                     return_labels=True)
    print("Labels:", ctext(labels, 'cyan'))

    # ====== training and test data ====== #
    # entries are (name, (start, end)); a file belongs to the training set
    # when the first '_'-separated field of its name is 'train'
    train_files = []
    test_files = []
    for name, (start, end) in indices:
        bucket = train_files if name.split('_')[0] == 'train' else test_files
        bucket.append((name, (start, end)))
    print("#Train:", ctext(len(train_files), 'cyan'))
    print("#Test:", ctext(len(test_files), 'cyan'))

    # ====== for i-vectors ====== #
    y_train = np.array([fn_label(entry[0]) for entry in train_files])
    y_test = np.array([fn_label(entry[0]) for entry in test_files])
    if bool(for_ivec):
        return ds[feat], train_files, y_train, test_files, y_test, labels

    # ====== length ====== #
    max_length = max([(end - start) for _, (start, end) in indices])
    frame_length = int(max_length * utt_length)
    step_length = frame_length
    print("Max length  :", ctext(max_length, 'yellow'))
    print("Frame length:", ctext(frame_length, 'yellow'))
    print("Step length :", ctext(step_length, 'yellow'))

    # ====== split dataset ====== #
    # split by speaker ID
    train_files, valid_files = train_valid_test_split(
        x=train_files,
        train=0.8,
        cluster_func=None,
        idfunc=lambda x: x[0].split('_')[4],  # splited by speaker
        inc_test=False)
    print("#File train:", ctext(len(train_files), 'cyan'))
    print("#File valid:", ctext(len(valid_files), 'cyan'))
    print("#File test :", ctext(len(test_files), 'cyan'))

    recipes = [
        F.recipes.Sequencing(frame_length=frame_length,
                             step_length=step_length,
                             end='pad',
                             pad_mode='post',
                             pad_value=0),
        F.recipes.Name2Label(converter_func=fn_label),
        F.recipes.LabelOneHot(nb_classes=len(labels), data_idx=-1)
    ]
    feeder_train = F.Feeder(F.IndexedData(ds[feat], indices=train_files),
                            ncpu=6,
                            batch_mode='batch')
    feeder_valid = F.Feeder(F.IndexedData(ds[feat], indices=valid_files),
                            ncpu=4,
                            batch_mode='batch')
    feeder_test = F.Feeder(F.IndexedData(ds[feat], indices=test_files),
                           ncpu=4,
                           batch_mode='file')
    for feeder in (feeder_train, feeder_valid, feeder_test):
        feeder.set_recipes(recipes)
    print(feeder_train)

    # ====== process X_test, y_test in advance for faster evaluation ====== #
    # NOTE(review): the arguments are not used in the body; presumably they
    # only serve as the key for `cache_disk` - confirm.
    @cache_disk
    def _extract_test_data(feat, label, utt_length):
        prog = Progbar(target=len(feeder_test),
                       print_summary=True,
                       name="Preprocessing test set")
        X_test = defaultdict(list)
        for name, idx, X, y in feeder_test:
            # validate everything as expected
            assert fn_label(name) == np.argmax(y), name  # label is right
            # save to list
            X_test[name].append((idx, X))
            prog.add(X.shape[0])
        # ====== create 1 array for data and dictionary for indices ====== #
        X_test_name = []
        X_test_data = []
        for name, X in X_test.items():
            # restore the order of the chunks of one file before merging
            X = np.concatenate([x[1] for x in sorted(X, key=lambda i: i[0])],
                               axis=0).astype('float16')
            X_test_name += [name + '.%d' % i for i in range(len(X))]
            X_test_data.append(X)
        X_test_name = np.array(X_test_name)
        X_test_data = np.concatenate(X_test_data, axis=0)
        return X_test_name, X_test_data

    # the test data is returned as stored by the helper (float16)
    X_test_name, X_test_data = _extract_test_data(feat, label, utt_length)
    # segment names are '<file name>.<segment id>'
    X_test_true = np.array([fn_label(i.split('.')[0]) for i in X_test_name])
    return feeder_train, feeder_valid, \
        X_test_name, X_test_true, X_test_data, labels
# Extract the acoustic features of every file into PATH_ACOUSTIC_FEATURES;
# the run overrides existing output and stops at the first failure.
processor = pp.FeatureProcessor(
    jobs=all_files,
    path=PATH_ACOUSTIC_FEATURES,
    extractor=extractors,
    n_cache=120,
    ncpu=None,
    override=True,
    identifier='name',
    log_path=os.path.join(EXP_DIR, 'processor.log'),
    stop_on_failure=True,
)
processor.run()
# (feature validation is currently disabled)
# pp.validate_features(processor,
#                      nb_samples=12,
#                      path=os.path.join(EXP_DIR, 'feature_validation'),
#                      override=True)
ds = F.Dataset(PATH_ACOUSTIC_FEATURES, read_only=True)
print(ds)
indices = list(ds['indices_%s' % args.feat].items())
# summary statistics of the utterance lengths (end - start)
print("Utterances length:")
print(" ",
      describe([end - start for name, (start, end) in indices],
               shorten=True))
# ===========================================================================
# Basic path for GMM, T-matrix and I-vector
# ===========================================================================
# one experiment directory per (feature, #mixtures, T-matrix dim) setting
EXP_DIR = os.path.join(EXP_DIR, '%s_%d_%d' % (FEAT, NMIX, TV_DIM))
LOG_PATH = get_logpath(name='log.txt',
                       override=False,
                       root=EXP_DIR,
                       odin_base=False)
stdio(LOG_PATH)
print("Exp-dir:", ctext(EXP_DIR, 'cyan'))
"`-aug` option was not provided, choose: 'rirs' or 'musan'") np.random.seed(Config.SUPER_SEED) # percentage of data will be used for augmentation PERCENTAGE_AUGMENTATION = 0.8 # =========================================================================== # Constant # =========================================================================== AUGMENTATION_DATASET = ['swb', 'sre04', 'sre05', 'sre06', 'sre08', 'sre10'] AUGMENTATION_DATASET = [i for i in AUGMENTATION_DATASET if i in ALL_DATASET] print("Augmenting following dataset: %s" % ctext(', '.join(AUGMENTATION_DATASET), 'yellow')) # ====== get the duration ====== # path = os.path.join(PATH_ACOUSTIC_FEATURES, FEATURE_RECIPE) assert os.path.exists(path), \ "Acoustic feature must be extracted first, and stored at path: %s" % path ds = F.Dataset(path, read_only=True) all_duration = dict(ds['duration'].items()) ds.close() # ====== select a new file list ====== # AUG_FILES = [] missing_duration = [] for row in ALL_FILES: if row[4] not in AUGMENTATION_DATASET: continue if row[2] not in all_duration: missing_duration.append(row) continue dur = all_duration[row[2]] AUG_FILES.append([i for i in row] + [dur]) print("#Files missing duration:", ctext(len(missing_duration), 'cyan')) assert len(AUG_FILES), "Cannot find any files for augmentation"
def prepare_dnn_data(save_dir,
                     feat_name=None,
                     utt_length=None,
                     seq_mode=None,
                     min_dur=None,
                     min_utt=None,
                     exclude=None,
                     train_proportion=None,
                     return_dataset=False):
    """Create the training and validation feeders for the speaker network.

    The file selection (excluding datasets, optional down-sampling, removal
    of "bad" utterances) and the speaker-stratified train/valid split are
    computed once and pickled into ``save_dir``; when all three cache files
    exist they are loaded instead of being recomputed.

    Parameters
    ----------
    save_dir : str
        existing directory holding the cached file lists
    feat_name, utt_length, seq_mode, min_dur, min_utt, exclude :
        when None, each falls back to the module-level setting
        (``FEATURE_NAME``, ``_args.utt``, ``_args.seq``,
        ``MINIMUM_UTT_DURATION``, ``MINIMUM_UTT_PER_SPEAKERS``,
        ``_args.exclude``); ``exclude`` is a comma separated list of
        dataset names
    train_proportion : float in (0, 1) or None
        if given, only this share of the training files is kept
    return_dataset : bool
        also return the opened dataset as last element

    Returns
    -------
    (train_feeder, valid_feeder, all_speakers) and, if ``return_dataset``
    is true, the dataset appended as fourth element.
    """
    assert os.path.isdir(save_dir), \
        "Path to '%s' is not a directory" % save_dir
    # ====== fall back to the module-level configuration ====== #
    feat_name = FEATURE_NAME if feat_name is None else feat_name
    utt_length = int(_args.utt) if utt_length is None else utt_length
    seq_mode = str(_args.seq).strip().lower() if seq_mode is None else seq_mode
    min_dur = MINIMUM_UTT_DURATION if min_dur is None else min_dur
    min_utt = MINIMUM_UTT_PER_SPEAKERS if min_utt is None else min_utt
    exclude = str(_args.exclude).strip() if exclude is None else exclude
    print("Minimum duration: %s(s)" % ctext(min_dur, 'cyan'))
    print("Minimum utt/spk : %s(utt)" % ctext(min_utt, 'cyan'))
    # ******************** prepare dataset ******************** #
    path = os.path.join(PATH_ACOUSTIC_FEATURES, FEATURE_RECIPE)
    assert os.path.exists(
        path), "Cannot find acoustic dataset at path: %s" % path
    ds = F.Dataset(path=path, read_only=True)
    # one generator drives every random decision below, so the order of the
    # calls on it determines the resulting split
    rand = np.random.RandomState(seed=Config.SUPER_SEED)
    # ====== find the right feature ====== #
    assert feat_name in ds, "Cannot find feature with name: %s" % feat_name
    X = ds[feat_name]
    ids_name = 'indices_%s' % feat_name
    assert ids_name in ds, "Cannot find indices with name: %s" % ids_name
    # ====== basic path ====== #
    path_filtered_data = os.path.join(save_dir, 'filtered_files.pkl')
    path_train_files = os.path.join(save_dir, 'train_files.pkl')
    path_speaker_info = os.path.join(save_dir, 'speaker_info.pkl')
    cache_files = [path_filtered_data, path_train_files, path_speaker_info]
    # ******************** cannot find cached data ******************** #
    if any(not os.path.exists(p) for p in cache_files):
        # ====== exclude some dataset ====== #
        if len(exclude) > 0:
            exclude_dataset = dict.fromkeys(exclude.split(','), 1)
            print("* Excluded dataset:", ctext(exclude_dataset, 'cyan'))
            indices = {}
            for name, (start, end) in ds[ids_name].items():
                if ds['dsname'][name] not in exclude_dataset:
                    indices[name] = (start, end)
            # special case exclude all the noise data
            # (noise files are the ones with '/' in their name)
            if 'noise' in exclude_dataset:
                indices = {
                    name: (start, end)
                    for name, (start, end) in indices.items()
                    if '/' not in name
                }
        else:
            indices = dict(ds[ids_name].items())
        # ====== down-sampling if necessary ====== #
        if _args.downsample > 1000:
            dataset2name = defaultdict(list)
            # ordering the indices so we sample the same set every time
            for name in sorted(indices.keys()):
                dataset2name[ds['dsname'][name]].append(name)
            n_total_files = len(indices)
            n_sample_files = int(_args.downsample)
            # every dataset keeps its share of the total number of files
            downsampled = {}
            for dsname, flist in dataset2name.items():
                share = len(flist) / n_total_files
                rand.shuffle(flist)
                n_dataset_files = int(share * n_sample_files)
                for name in flist[:n_dataset_files]:
                    downsampled[name] = indices[name]
            indices = downsampled
        # ====== * filter out "bad" sample ====== #
        indices = filter_utterances(X=X,
                                    indices=indices,
                                    spkid=ds['spkid'],
                                    min_utt=min_utt,
                                    min_dur=min_dur,
                                    remove_min_length=True,
                                    remove_min_uttspk=True,
                                    n_speakers=None,
                                    ncpu=None,
                                    save_path=path_filtered_data)
        # ====== all training file name ====== #
        # modify here to train full dataset
        all_name = sorted(indices.keys())
        rand.shuffle(all_name)
        rand.shuffle(all_name)
        n_files = len(all_name)
        print("#Files:", ctext(n_files, 'cyan'))
        # ====== speaker mapping ====== #
        name2spk = {name: ds['spkid'][name] for name in all_name}
        all_speakers = sorted(set(name2spk.values()))
        spk2label = {spk: i for i, spk in enumerate(all_speakers)}
        name2label = {name: spk2label[spk] for name, spk in name2spk.items()}
        assert len(name2label) == len(all_name)
        print("#Speakers:", ctext(len(all_speakers), 'cyan'))
        # ====== stratify sampling based on speaker ====== #
        # create speakers' cluster
        label2name = defaultdict(list)
        for name, spk_label in sorted(name2label.items(), key=lambda x: x[0]):
            label2name[spk_label].append(name)
        valid_name = []
        for spk_label, name_list in sorted(label2name.items(),
                                           key=lambda x: x[0]):
            # only speakers with >= 3 utterances contribute to validation
            if len(name_list) >= 3:
                n_valid = max(1, int(0.05 * len(name_list)))  # 5% for validation
                picked = rand.choice(a=name_list, size=n_valid, replace=False)
                valid_name.extend(picked.tolist())
        # train list is the rest
        valid_set = set(valid_name)
        train_name = [name for name in all_name if name not in valid_set]
        # ====== split training and validation ====== #
        train_indices = {name: indices[name] for name in train_name}
        valid_indices = {name: indices[name] for name in valid_name}
        # ====== save cached data ====== #
        with open(path_train_files, 'wb') as fout:
            pickle.dump({'train': train_indices, 'valid': valid_indices}, fout)
        with open(path_speaker_info, 'wb') as fout:
            pickle.dump(
                {
                    'all_speakers': all_speakers,
                    'name2label': name2label,
                    'spk2label': spk2label
                }, fout)
    # ******************** load cached data ******************** #
    else:
        with open(path_train_files, 'rb') as fin:
            cached = pickle.load(fin)
        train_indices = cached['train']
        valid_indices = cached['valid']
        with open(path_speaker_info, 'rb') as fin:
            cached = pickle.load(fin)
        all_speakers = cached['all_speakers']
        name2label = cached['name2label']
        spk2label = cached['spk2label']

    # ******************** print log ******************** #
    def summary_indices(ids):
        # per-dataset utterance/speaker counts plus a checksum of the
        # (sorted) file names and their indices
        sorted_names = sorted(ids.keys())
        text = ''.join(name + str(ids[name]) for name in sorted_names)
        datasets = defaultdict(int)
        speakers = defaultdict(list)
        for name in sorted_names:
            dsname = ds['dsname'][name]
            datasets[dsname] += 1
            speakers[dsname].append(ds['spkid'][name])
        for dsname in sorted(datasets.keys()):
            print(' %-18s: %s(utt) %s(spk)' %
                  (dsname, ctext('%6d' % datasets[dsname], 'cyan'),
                   ctext(len(set(speakers[dsname])), 'cyan')))
        print(' MD5 checksum:', ctext(crypto.md5_checksum(text), 'lightcyan'))

    def report_split(title, split):
        # number of files, distinct speakers and noise files of one split
        n_spk = len(set(name2label[name] for name in split.keys()))
        n_noise = len([name for name in split.keys() if '/' in name])
        print(title, ctext('%-8d' % len(split), 'cyan'), "#spk:",
              ctext(n_spk, 'cyan'), "#noise:", ctext(n_noise, 'cyan'))
        summary_indices(ids=split)

    # ====== training files ====== #
    report_split("#Train files:", train_indices)
    # ====== valid files ====== #
    report_split("#Valid files:", valid_indices)
    # ******************** create the recipe ******************** #
    assert all(name in name2label for name in train_indices.keys())
    assert all(name in name2label for name in valid_indices.keys())
    recipes = prepare_dnn_feeder_recipe(name2label=name2label,
                                        n_speakers=len(all_speakers),
                                        utt_length=utt_length,
                                        seq_mode=seq_mode)
    # ====== downsample training set for analyzing if required ====== #
    if train_proportion is not None:
        assert 0 < train_proportion < 1
        n_training = len(train_indices)
        train_indices = list(train_indices.items())
        rand.shuffle(train_indices)
        rand.shuffle(train_indices)
        n_kept = int(n_training * train_proportion)
        train_indices = dict(train_indices[:n_kept])
    # ====== create feeder ====== #
    train_feeder = F.Feeder(
        data_desc=F.IndexedData(data=X, indices=train_indices),
        batch_mode='batch',
        ncpu=NCPU,
        buffer_size=256)
    valid_feeder = F.Feeder(
        data_desc=F.IndexedData(data=X, indices=valid_indices),
        batch_mode='batch',
        ncpu=max(2, NCPU // 4),
        buffer_size=64)
    train_feeder.set_recipes(recipes)
    valid_feeder.set_recipes(recipes)
    print(train_feeder)
    print(valid_feeder)
    # ====== debugging ====== #
    # Iterates the whole validation set, plots a few random frames and then
    # terminates the program: nothing after this branch runs when debugging.
    if IS_DEBUGGING:
        import matplotlib
        matplotlib.use('Agg')
        prog = Progbar(target=len(valid_feeder),
                       print_summary=True,
                       name="Iterating validation set")
        samples = []
        n_visual = 250
        file_batches = valid_feeder.set_batch(batch_size=100000,
                                              batch_mode='file',
                                              seed=None,
                                              shuffle_level=0)
        for name, idx, feats, onehot in file_batches:
            assert idx == 0, "Utterances longer than %.2f(sec)" % (
                100000 * Config.STEP_LENGTH)
            prog['X'] = feats.shape
            prog['y'] = onehot.shape
            prog.add(feats.shape[0])
            # random sampling
            if rand.rand(1) < 0.5 and len(samples) < n_visual:
                for i in rand.randint(0, feats.shape[0], size=4,
                                      dtype='int32'):
                    samples.append(
                        (name, feats[i], np.argmax(onehot[i], axis=-1)))
        # plot the spectrogram
        n_visual = len(samples)
        V.plot_figure(nrow=n_visual, ncol=8)
        for i, (name, frame, spk_label) in enumerate(samples):
            is_noise = '/' in name
            assert name2label[
                name] == spk_label, "Speaker label mismatch for file: %s" % name
            # meta data is looked up with the part of the name before '/'
            name = name.split('/')[0]
            dsname = ds['dsname'][name]
            spkid = ds['spkid'][name]
            spk_label = np.argmax(spk_label, axis=-1)
            ax = V.plot_spectrogram(frame.T,
                                    ax=(n_visual, 1, i + 1),
                                    title='#%d' % (i + 1))
            ax.set_title('[%s][%s]%s  %s' %
                         ('noise' if is_noise else 'clean', dsname, name, spkid),
                         fontsize=6)
        # don't need to be high resolutions
        V.plot_save('/tmp/tmp.pdf', dpi=12)
        exit()
    # ====== return ====== #
    if bool(return_dataset):
        return train_feeder, valid_feeder, all_speakers, ds
    return train_feeder, valid_feeder, all_speakers
# python two_way_2_group_data_into_batch.py -m memory_profiler # group: 165 + 10.9 MB and 24.3 (s/iter) # group2: 164 + 43.6 MB (old method) and 15.6 (s/iter) from __future__ import print_function, division, absolute_import import os os.environ['ODIN'] = 'theano,cpu,float32' from six.moves import zip_longest, cPickle import numpy as np from odin import backend as K, nnet as N, fuel as F from odin.utils import UnitTimer from memory_profiler import profile ds = F.Dataset('/home/trung/data/estonia_audio32') indices = np.genfromtxt(ds['indices.csv'], dtype=str, delimiter=' ') name, start, end = indices[0] x0 = ds['mfcc'][int(start):int(end)] x0 = (name, [x0, x0]) name, start, end = indices[1] x1 = ds['mfcc'][int(start):int(end)] x1 = (name, [x1, x1]) name, start, end = indices[2] x2 = ds['mfcc'][int(start):int(end)] x2 = (name, [x2, x2])
vad_minlen=0.1, pca=True, pca_whiten=False, center=True, save_stats=True, substitute_nan=None, dtype='float16', datatype='memmap', ncache=0.12, ncpu=8) with utils.UnitTimer(): feat.run() shutil.copy(os.path.join(datapath, 'README.md'), os.path.join(output_path, 'README.md')) # ====== check the preprocessed dataset ====== # ds = F.Dataset(output_path, read_only=True) print('Output path:', output_path) print(ds) for n in ds.keys(): if '_pca' in n: pca = ds[n] if pca.components_ is None: print(n, 'components is None !') elif np.any(np.isnan(pca.components_)): print(n, 'contains NaN !') else: print( n, ':', ' '.join([ '%.2f' % i + '-' + '%.2f' % j for i, j in zip(pca.explained_variance_ratio_[:8],
EXTRACTOR_NAME = FEATURE_RECIPE.split("_")[0] extractor = get_module_from_path(identifier=EXTRACTOR_NAME, path=get_script_path(), prefix='feature_recipes') assert len(extractor) > 0, \ "Cannot find extractor with name: %s" % EXTRACTOR_NAME extractor = extractor[0]() # ====== initializing ====== # # mapping from # scoring_data_name -> [features 2-D array, # indices {name: (start, end)}, # spkid_or_meta {name: spkid_or_meta}, # path {name: path}] acoustic_features = {} training_ds = F.Dataset(path=os.path.join(PATH_ACOUSTIC_FEATURES, FEATURE_RECIPE), read_only=True) all_training_dataset = set(training_ds['dsname'].values()) print("All training dataset:", ctext(all_training_dataset, 'cyan')) # ====== extract the feature if not exists ====== # for dsname, file_list in sorted(list(SCORING_DATASETS.items()) + list(BACKEND_DATASETS.items()), key=lambda x: x[0]): # acoustic features already extracted in training dataset if dsname in all_training_dataset: assert FEATURE_NAME in training_ds, \ "Cannot find feature with name: %s, from: %s" % (FEATURE_NAME, training_ds.path) X = training_ds[FEATURE_NAME] indices = { name: (start, end) for name, (start, end) in training_ds['indices_%s' %
def test_feeders(self):
    """Check that ``F.Feeder`` yields every sample exactly once.

    A fake dataset of 2000 rows (5 values each), cut into files of 20 rows
    with a random transcription per row, is written to a temporary
    directory. The feeder is then iterated without recipes, with a
    name-to-index recipe and with the transcription loader, with and
    without shuffling.
    """
    with utils.TemporaryDirectory() as temppath:
        np.random.seed(1208251813)
        ds_path = os.path.join(temppath, 'ds')
        # str(row values) -> transcription of that row
        transcription_test = {}
        # ====== create fake dataset ====== #
        ds = F.Dataset(ds_path)
        ds['X'] = np.arange(0, 10000).reshape(-1, 5)
        # generate fake indices
        indices = [['name_%d' % i, j, j + 20]
                   for i, j in enumerate(range(0, ds['X'].shape[0], 20))]
        np.savetxt(os.path.join(ds.path, 'indices.csv'),
                   indices,
                   fmt='%s',
                   delimiter=' ')
        # generate fake transcription
        transcription = F.MmapDict(
            os.path.join(ds.path, 'transcription.dict'))
        for name, start, end in indices:
            trans = np.random.randint(0, 4, size=(20,)).tolist()
            transcription[name] = trans
            for row, symbol in zip(ds['X'][start:end], trans):
                transcription_test[str(row.tolist())] = symbol
        transcription.flush()
        transcription.close()
        ds.flush()
        ds.close()
        # ====== test feeder ====== #
        ds = F.Dataset(ds_path, read_only=True)
        REF = ds['X'][:].ravel().tolist()
        feeder = F.Feeder(ds['X'], ds['indices.csv'], ncpu=2, buffer_size=2)
        # (seed, shuffle_level): no shuffle, shuffle 0, shuffle 2
        batch_configs = ((None, 0), (1203, 0), (1203, 2))

        # ==================== No recipes ==================== #
        def test_iter_no_trans(it):
            values = []
            n = 0
            for batch in it:
                values.extend(batch.ravel().tolist())
                n += batch.shape[0]
            self.assertEqual(np.sort(values).tolist(), REF)
            self.assertEqual(n, ds['X'].shape[0])

        for seed, level in batch_configs:
            test_iter_no_trans(
                feeder.set_batch(12, seed=seed, shuffle_level=level))
        # ==================== Convert name to indices ==================== #
        feeder.set_recipes([
            F.recipes.Name2Trans(
                converter_func=lambda name: int(name.split('_')[-1])),
            F.recipes.CreateBatch()
        ])

        def test_iter_trans(it):
            values = []
            total = 0
            n = 0
            for batch, trans in it:
                values.extend(batch.ravel().tolist())
                n += batch.shape[0]
                total += np.sum(trans)
            self.assertEqual(np.sort(values).tolist(), REF)
            # expected sum of the per-row file indices
            self.assertEqual(total, 99000)
            self.assertEqual(n, ds['X'].shape[0])

        for seed, level in batch_configs:
            test_iter_trans(
                feeder.set_batch(12, seed=seed, shuffle_level=level))
        # ==================== Transcription ==================== #
        del feeder
        ds = F.Dataset(ds_path)
        feeder = F.Feeder(ds['X'],
                          indices=ds['indices.csv'],
                          ncpu=2,
                          buffer_size=2)
        feeder.set_recipes([
            F.recipes.TransLoader(ds['transcription.dict'], dtype='int32'),
            F.recipes.CreateBatch()
        ])
        n = 0
        X = []
        for batch, trans in feeder.set_batch(12,
                                             seed=1208251813,
                                             shuffle_level=2):
            X.extend(batch.ravel().tolist())
            n += batch.shape[0]
            # every row must come with the transcription generated for it
            for x, y in zip(batch, trans):
                self.assertTrue(transcription_test[str(x.tolist())] == y)
        X = np.sort(X).tolist()
        self.assertEqual(X, REF)
        self.assertEqual(n, ds['X'].shape[0])
# Extract the acoustic features of every file into PATH_ACOUSTIC.
# Without an explicit `-ncpu`, two cores are left free (capped at 18
# workers); the count is clamped to at least 1 because `cpu_count() - 2`
# is zero or negative on machines with two cores or fewer.
processor = pp.FeatureProcessor(
    jobs=all_files,
    path=PATH_ACOUSTIC,
    extractor=extractors,
    n_cache=0.12,
    ncpu=(max(1, min(18, cpu_count() - 2))
          if args.ncpu <= 0 else int(args.ncpu)),
    override=True,
    identifier='name',
    log_path=os.path.join(PATH_EXP, 'processor.log'),
    stop_on_failure=True  # small dataset, enable stop on failure
)
with UnitTimer():
    processor.run()
n_error = len(processor.error_log)
print(processor)
# ====== copy readme and check the preprocessed dataset ====== #
if n_error == 0:
    # sorted, so the same file is picked on every run when several match
    readme_candidates = sorted(
        i for i in os.listdir(audio.path) if 'README' in i)
    if readme_candidates:
        readme_path = os.path.join(audio.path, readme_candidates[0])
        shutil.copy(readme_path, os.path.join(PATH_ACOUSTIC, 'README.md'))
    else:
        # a missing README must not abort a run that extracted everything
        print("Cannot find any README at path: %s, skip copying" %
              audio.path)

    ds = F.Dataset(PATH_ACOUSTIC, read_only=True)
    print(ds)
    pp.validate_features(ds,
                         path=os.path.join(PATH_EXP, 'acoustic'),
                         nb_samples=12,
                         override=True)
else:
    print("%s errors happened during processing!" % ctext(n_error, 'red'))