def load(n=0, cut=0, use_delta=False, timesteps=100, shift=10):
    """Load ahocoder features and return windowed data with the V/UV flag appended.

    Offline pre-processing expected to have been run on the data beforehand:
      - downsample data from 44.1KHz to 16KHz: python downsample.py
      - extract audio features: sudo ./ahocoder.sh
      - extract derivatives of audio features: sudo ./extractdelta.sh

    Args:
        n: forwarded to ``sp.load_ahocoder_data`` as its first argument.
        cut: number of rows (axis 0) dropped from both the start and the end
            of every loaded sample.
        use_delta: forwarded to ``sp.load_ahocoder_data``,
            ``sp.get_vuv_flag`` and ``sp.interp_uv``.
        timesteps: forwarded to ``sp.split_samples``.
        shift: forwarded to ``sp.split_samples``.

    Returns:
        The stacked feature windows with the stacked V/UV windows
        concatenated onto their last axis.
        NOTE(review): presumably shaped (windows, timesteps, features + vuv)
        -- confirm against ``sp.split_samples``.
    """
    raw_samples = sp.load_ahocoder_data(n, use_delta=use_delta)

    interpolated = []
    voicing = []
    for sample in raw_samples:
        last_row = sample.shape[0] - cut
        trimmed = sample[cut:last_row, :]
        # The flag is taken before interpolation, as in the original ordering.
        flags = sp.get_vuv_flag(trimmed, use_delta=use_delta)
        filled = sp.interp_uv(trimmed, use_delta=use_delta)
        interpolated.append(filled)
        voicing.append(flags)

    feature_windows = sp.split_samples(interpolated, timesteps, shift)
    vuv_windows = sp.split_samples(voicing, timesteps, shift)
    raw_samples = interpolated = voicing = None  # free some space

    def _stack(windows):
        # Stack along a new third axis, then move that axis to the front.
        return np.dstack(windows).transpose((2, 0, 1))

    stacked_features = _stack(feature_windows)
    stacked_vuv = _stack(vuv_windows)
    return np.concatenate((stacked_features, stacked_vuv), -1)
# Report the run configuration, one setting per line.
for _prefix, _setting in (
    ("Shift: ", shift),
    ("Batch size: ", batch_size),
    ("Epochs: ", nb_epoch),
    ("Prediction length: ", pred_len),
    ("Generation length: ", gen_len),
    ("Cut: ", cut),
    ("Use delta: ", use_delta),
    ("M: ", M),
    ("Patience: ", patience),
):
    print(_prefix + str(_setting))

print("Load data")
if not gen_speech:
    # Spectrogram path: load, cut into windows, stack the windows along a
    # third axis and move that axis to the front.
    data = np.dstack(
        sp.split_samples(
            sp.load_spectrogram_data(nb_rawsamples), timesteps, shift
        )
    ).transpose((2, 0, 1))
    # Disabled: append a constant all-ones V/UV channel to spectrogram data.
    # vuv = np.expand_dims(np.ones(data.shape[0:2]), 2)
    # data = np.concatenate((data, vuv), -1)
else:
    # Speech path: load() already windows the data and appends the V/UV flag.
    data = load(nb_rawsamples, cut, use_delta, timesteps, shift)

(X_train, y_train), (X_test, y_test) = sp.train_test_split(data, pred_len)
# Note: this rebinds `timesteps` to the second dimension of X_train.
nb_samples, timesteps, input_dim = X_train.shape
data = None  # free some space

print("Preprocess data")
# exclude vuv bit from normalization
# NOTE(review): on the spectrogram path the V/UV channel above is disabled, so
# when M is not None the last column skipped here is a regular feature --
# confirm this is intended.
if gen_speech or M is not None:
    # Statistics come from the training inputs and are reused for the rest.
    X_train[:, :, :-1], mu, sigma = preprocess(X_train[:, :, :-1])
    for _split in (y_train, X_test, y_test):
        _split[:, :, :-1], _, _ = preprocess(_split[:, :, :-1], mu, sigma)