def test_minmax():
    """Exercise P.minmax / minmax_scale: stats, validation, and round trips."""
    # Linguistic features serve as the test fixture.
    src, _ = example_file_data_sources_for_acoustic_model()
    dataset = FileSourceDataset(src)
    lengths = [len(seq) for seq in dataset]
    dim = dataset[0].shape[-1]

    X_min, X_max = P.minmax(dataset)
    assert np.isfinite(X_min).all()
    assert np.isfinite(X_max).all()

    x = dataset[0]
    x_scaled = P.minmax_scale(x, X_min, X_max, feature_range=(0, 0.99))
    assert np.max(x_scaled) <= 1
    assert np.min(x_scaled) >= 0
    assert np.isfinite(x_scaled).all()

    # Either (min, max) or (scale_, min_) must be supplied; neither is an error.
    @raises(ValueError)
    def __test_raise1(x, X_min, X_max):
        P.minmax_scale(x)

    @raises(ValueError)
    def __test_raise2(x, X_min, X_max):
        P.inv_minmax_scale(x)

    __test_raise1(x, X_min, X_max)
    __test_raise2(x, X_min, X_max)

    # The same scaling expressed through explicit min_/scale_ parameters
    # must match the (min, max) form exactly.
    min_, scale_ = P.minmax_scale_params(X_min, X_max, feature_range=(0, 0.99))
    assert np.allclose(x_scaled, P.minmax_scale(x, min_=min_, scale_=scale_))

    # A padded view of the same data must yield identical statistics
    # when the true lengths are provided.
    src, _ = example_file_data_sources_for_acoustic_model()
    padded = PaddedFileSourceDataset(src, 1000)
    X_min_hat, X_max_hat = P.minmax(padded, lengths)
    assert np.allclose(X_min, X_min_hat)
    assert np.allclose(X_max, X_max_hat)

    # Inverse transform round-trips, in both calling conventions.
    x = padded[0]
    assert np.allclose(
        x, P.inv_minmax_scale(P.minmax_scale(x, X_min, X_max), X_min, X_max))
    assert np.allclose(
        x, P.inv_minmax_scale(P.minmax_scale(x, scale_=scale_, min_=min_),
                              scale_=scale_, min_=min_))
def __getitem__(self, idx):
    """Return the idx-th (input, target) pair, both normalized."""
    # Min-max normalize the linguistic input into (0.01, 0.99) using
    # precomputed per-dimension statistics.
    features = P.minmax_scale(
        self.X[idx],
        min_=self.X_data_min,
        scale_=self.X_data_scale,
        feature_range=(0.01, 0.99),
    )
    # Mean/std normalize the acoustic target.
    targets = P.scale(self.Y[idx], self.Y_data_mean, self.Y_data_std)
    return features, targets
def gen_duration(device, label_path, binary_dict, continuous_dict, X_min, X_max,
                 Y_mean, Y_scale, duration_model):
    """Predict per-state durations for a label file and write them back.

    Loads HTS labels, extracts phone-level linguistic features, normalizes
    them with the "duration" min/max stats, runs ``duration_model``, then
    denormalizes and rounds the predictions before storing them on the labels.

    NOTE(review): ``device`` is accepted but never used in this body —
    the model is run wherever it already lives; confirm intent.

    Returns:
        The HTS label object with predicted durations set.
    """
    # Linguistic features for duration (phone level: no frame/subphone features)
    hts_labels = hts.load(label_path)
    duration_linguistic_features = fe.linguistic_features(
        hts_labels, binary_dict, continuous_dict,
        add_frame_features=False, subphone_features=None).astype(np.float32)

    # Apply normalization — same (0.01, 0.99) range the model was trained with
    ty = "duration"
    duration_linguistic_features = minmax_scale(
        duration_linguistic_features, X_min[ty], X_max[ty],
        feature_range=(0.01, 0.99))

    # # Apply model
    # # duration_model = duration_model.cpu()
    duration_model.eval()

    # Batch dimension of 1 is added via unsqueeze(0).
    x = torch.FloatTensor(duration_linguistic_features)
    duration_predicted = duration_model(x.unsqueeze(0)).data.numpy()
    print("duration_predicted shape: {}".format(duration_predicted.shape))

    # Apply denormalization (inverse of mean/scale normalization)
    duration_predicted = duration_predicted * Y_scale[ty] + Y_mean[ty]
    duration_predicted = np.round(duration_predicted)

    # Set minimum state duration to 1 (zero/negative durations are invalid)
    duration_predicted[duration_predicted <= 0] = 1
    hts_labels.set_durations(duration_predicted)

    return hts_labels
def _get_x_scaled(self, dataset, seq):
    """Min-max scale ``seq`` with the dataset's input statistics.

    When no dataset is given there are no statistics to apply, so the
    sequence is returned untouched.
    """
    if dataset is None:
        return seq
    stats = dataset.x_stat
    return minmax_scale(seq, stats['min'], stats['max'],
                        feature_range=(0.01, 0.99))
def tts_from_label(models, label_path, X_min, X_max, Y_mean, Y_std,
                   post_filter=False, apply_duration_model=True, coef=1.4,
                   fs=16000):
    """Synthesize a waveform from an HTS label file.

    Pipeline: optional duration prediction -> frame-level linguistic
    features -> silence trimming -> min-max normalization -> acoustic
    model -> waveform generation.

    Args:
        models: dict with "duration" and "acoustic" models.
        label_path: path to the HTS label file.
        X_min, X_max, Y_mean, Y_std: per-stream normalization stats,
            keyed by "duration"/"acoustic".
        post_filter: apply spectral post-filtering in gen_waveform.
        apply_duration_model: if False, use the durations in the label file.
        coef, fs: passed through to gen_waveform.
    """
    duration_model, acoustic_model = models["duration"], models["acoustic"]
    # Relies on module-level `use_cuda`; models are moved to GPU up front.
    if use_cuda:
        duration_model = duration_model.cuda()
        acoustic_model = acoustic_model.cuda()

    # Predict durations (or keep the alignments already in the labels)
    if apply_duration_model:
        duration_modified_hts_labels = gen_duration(
            label_path, duration_model, X_min, X_max, Y_mean, Y_std)
    else:
        duration_modified_hts_labels = hts.load(label_path)

    # Frame-level linguistic features for the acoustic model
    linguistic_features = fe.linguistic_features(
        duration_modified_hts_labels, binary_dict, continuous_dict,
        add_frame_features=hp_acoustic.add_frame_features,
        subphone_features=hp_acoustic.subphone_features)

    # Trim silences so they are not synthesized
    indices = duration_modified_hts_labels.silence_frame_indices()
    linguistic_features = np.delete(linguistic_features, indices, axis=0)

    # Apply normalization
    ty = "acoustic"
    linguistic_features = P.minmax_scale(
        linguistic_features, X_min[ty], X_max[ty], feature_range=(0.01, 0.99))

    # Predict acoustic features (batched sequence of length xl)
    acoustic_model.eval()
    x = Variable(torch.from_numpy(linguistic_features)).float()
    xl = len(x)
    x = x.view(1, -1, x.size(-1))
    # NOTE(review): passes hp_duration here although this is the acoustic
    # path — possibly intended hp_acoustic; confirm against _generator_input.
    x = _generator_input(hp_duration, x)
    x = x.cuda() if use_cuda else x
    acoustic_predicted = acoustic_model(x, [xl]).data.cpu().numpy()
    acoustic_predicted = acoustic_predicted.reshape(
        -1, acoustic_predicted.shape[-1])

    return gen_waveform(acoustic_predicted, Y_mean, Y_std, post_filter,
                        coef=coef, fs=fs)
def __getitem__(self, idx):
    """Return padded, normalized (input, target) plus their true lengths."""
    raw_x = self.xs[idx]
    raw_t = self.ts[idx]
    # Inputs: min-max into (0.01, 0.99); targets: mean/std normalization
    # (std derived from the stored variance).
    norm_x = minmax_scale(raw_x, self.x_stat['min'], self.x_stat['max'],
                          feature_range=(0.01, 0.99))
    norm_t = scale(raw_t, self.t_stat['mean'], np.sqrt(self.t_stat['var']))
    # Pad both to fixed length; the true lengths let callers mask the padding.
    return (self._padding(norm_x), self._padding(norm_t),
            len(raw_x), len(raw_t))
def __getitem__(self, index):
    """Load the index-th utterance pair from disk and normalize it."""
    utt_id = self.metadata[index]
    fname = '{}.npy'.format(utt_id)
    # Features are stored flat; reshape to (frames, dim).
    feats_in = np.load(os.path.join(self.X_path, fname)).reshape(
        -1, self.x_dim)
    feats_out = np.load(os.path.join(self.Y_path, fname)).reshape(
        -1, self.y_dim)
    # Inputs: min-max into (0.01, 0.99); outputs: mean/scale normalization.
    # Stats are keyed by the split flag self.train.
    norm_x = minmax_scale(feats_in, self.X_min[self.train],
                          self.X_max[self.train], feature_range=(0.01, 0.99))
    norm_y = scale(feats_out, self.Y_mean[self.train], self.Y_scale[self.train])
    return norm_x, norm_y
def __getitem__(self, idx):
    """Return the idx-th normalized (linguistic, acoustic) feature pair."""
    inputs = P.minmax_scale(self.X[idx],
                            min_=self.X_data_min,
                            scale_=self.X_data_scale,
                            feature_range=(0.01, 0.99))
    targets = P.scale(self.Y[idx], self.Y_data_mean, self.Y_data_std)
    # Normalization breaks the static/delta feature relationship; recompute
    # the deltas so the MSE + MGE loss stays consistent.
    if hp.recompute_delta_features:
        targets = recompute_delta_features(targets, self.Y_data_mean,
                                           self.Y_data_std, hp.windows,
                                           hp.stream_sizes,
                                           hp.has_dynamic_features)
    return inputs, targets
def lab2wav(args, device, label_path, binary_dict, continuous_dict, X_min,
            X_max, Y_mean, Y_var, Y_scale, duration_model, acoustic_model,
            post_filter=False):
    """Full label-to-waveform synthesis pipeline.

    Predicts durations, extracts frame-level linguistic features, trims
    silence, normalizes, runs the acoustic model, denormalizes, and
    generates the waveform.

    Args:
        args: must provide ``args.label`` ("state_align" selects "full"
            subphone features, otherwise "coarse_coding").
        device: forwarded to gen_duration (see that function).
        Y_var: variance stats forwarded to gen_waveform.
    """
    # Predict durations and write them into the labels
    duration_modified_hts_labels = gen_duration(device, label_path,
                                                binary_dict, continuous_dict,
                                                X_min, X_max, Y_mean, Y_scale,
                                                duration_model)

    # Frame-level linguistic features for the acoustic model
    linguistic_features = fe.linguistic_features(
        duration_modified_hts_labels, binary_dict, continuous_dict,
        add_frame_features=True,
        subphone_features="full" if args.label == 'state_align' else "coarse_coding")

    # Trim silences so they are not synthesized
    indices = duration_modified_hts_labels.silence_frame_indices()
    linguistic_features = np.delete(linguistic_features, indices, axis=0)

    # Apply normalization (same range used at training time)
    ty = "acoustic"
    linguistic_features = minmax_scale(linguistic_features,
                                       X_min[ty], X_max[ty],
                                       feature_range=(0.01, 0.99))

    # Predict acoustic features (batch of one via unsqueeze)
    # acoustic_model = acoustic_model.cpu()
    acoustic_model.eval()
    x = torch.FloatTensor(linguistic_features)
    acoustic_predicted = acoustic_model(x.unsqueeze(0)).data.numpy()
    print("acoustic_predicted shape: {}".format(acoustic_predicted.shape))

    # Apply denormalization (inverse of mean/scale normalization)
    acoustic_predicted = acoustic_predicted * Y_scale[ty] + Y_mean[ty]

    return gen_waveform(acoustic_predicted.squeeze(0), Y_var, post_filter)
def tts_from_label(models, label_path, X_min, X_max, Y_mean, Y_std,
                   post_filter=False, apply_duration_model=True, coef=1.4,
                   fs=16000, mge_training=True):
    """Synthesize a waveform from an HTS label file (MGE-aware variant).

    Same pipeline as the plain version but forwards ``mge_training`` to
    gen_waveform so parameter generation matches the training criterion.
    """
    duration_model, acoustic_model = models["duration"], models["acoustic"]
    # Relies on module-level `use_cuda`; models are moved to GPU up front.
    if use_cuda:
        duration_model = duration_model.cuda()
        acoustic_model = acoustic_model.cuda()

    # Predict durations (or keep the alignments already in the labels)
    if apply_duration_model:
        duration_modified_hts_labels = gen_duration(
            label_path, duration_model, X_min, X_max, Y_mean, Y_std)
    else:
        duration_modified_hts_labels = hts.load(label_path)

    # Frame-level linguistic features for the acoustic model
    linguistic_features = fe.linguistic_features(
        duration_modified_hts_labels, binary_dict, continuous_dict,
        add_frame_features=hp_acoustic.add_frame_features,
        subphone_features=hp_acoustic.subphone_features)

    # Trim silences so they are not synthesized
    indices = duration_modified_hts_labels.silence_frame_indices()
    linguistic_features = np.delete(linguistic_features, indices, axis=0)

    # Apply normalization
    ty = "acoustic"
    linguistic_features = P.minmax_scale(
        linguistic_features, X_min[ty], X_max[ty], feature_range=(0.01, 0.99))

    # Predict acoustic features (batched sequence of length xl)
    acoustic_model.eval()
    x = Variable(torch.from_numpy(linguistic_features)).float()
    xl = len(x)
    x = x.view(1, -1, x.size(-1))
    # NOTE(review): passes hp_duration here although this is the acoustic
    # path — possibly intended hp_acoustic; confirm against _generator_input.
    x = _generator_input(hp_duration, x)
    x = x.cuda() if use_cuda else x
    acoustic_predicted = acoustic_model(x, [xl]).data.cpu().numpy()
    acoustic_predicted = acoustic_predicted.reshape(-1, acoustic_predicted.shape[-1])

    return gen_waveform(acoustic_predicted, Y_mean, Y_std, post_filter,
                        coef=coef, fs=fs, mge_training=mge_training)
def gen_duration(label_path, duration_model, X_min, X_max, Y_mean, Y_std):
    """Predict per-state durations for a label file and write them back.

    Extracts duration-model linguistic features, normalizes them, runs
    ``duration_model`` on CPU, denormalizes and rounds the predictions,
    clamps them to >= 1, and stores them on the returned HTS labels.

    Args:
        label_path: path to the HTS label file.
        duration_model: trained duration model (feed-forward or sequence).
        X_min, X_max, Y_mean, Y_std: normalization stats keyed by stream
            name ("duration").

    Returns:
        The HTS label object with predicted durations set.
    """
    # Linguistic features for duration
    hts_labels = hts.load(label_path)
    duration_linguistic_features = fe.linguistic_features(
        hts_labels, binary_dict, continuous_dict,
        add_frame_features=hp_duration.add_frame_features,
        subphone_features=hp_duration.subphone_features).astype(np.float32)

    # Apply normalization
    ty = "duration"
    duration_linguistic_features = P.minmax_scale(
        duration_linguistic_features, X_min[ty], X_max[ty],
        feature_range=(0.01, 0.99))

    # Inference runs on CPU
    duration_model = duration_model.cpu()
    duration_model.eval()

    # Apply model
    x = Variable(torch.from_numpy(duration_linguistic_features)).float()
    try:
        # Feed-forward models accept a plain (T, D) input.
        duration_predicted = duration_model(x).data.numpy()
    except Exception:
        # BUG FIX: was a bare `except:` which also swallowed
        # KeyboardInterrupt/SystemExit; narrowed to Exception.
        # Sequence models instead expect a (1, T, D) batch plus lengths.
        xl = len(x)
        x = x.view(1, -1, x.size(-1))
        duration_predicted = duration_model(x, [xl]).data.numpy()
    duration_predicted = duration_predicted.reshape(
        -1, duration_predicted.shape[-1])

    # Apply denormalization (inverse of mean/std normalization)
    duration_predicted = duration_predicted * Y_std[ty] + Y_mean[ty]
    duration_predicted = np.round(duration_predicted)

    # Set minimum state duration to 1 (zero/negative durations are invalid)
    duration_predicted[duration_predicted <= 0] = 1
    hts_labels.set_durations(duration_predicted)

    return hts_labels
def gen_duration(label_path, duration_model, X_min, X_max, Y_mean, Y_std):
    """Predict per-state durations for a label file and write them back.

    Sequence-model variant: builds a (1, T, D) batch, optionally augments it
    via ``_generator_input``, runs on GPU when ``use_cuda`` is set, then
    denormalizes, rounds, and clamps the predictions before storing them.

    Returns:
        The HTS label object with predicted durations set.
    """
    # Linguistic features for duration
    hts_labels = hts.load(label_path)
    duration_linguistic_features = fe.linguistic_features(
        hts_labels, binary_dict, continuous_dict,
        add_frame_features=hp_duration.add_frame_features,
        subphone_features=hp_duration.subphone_features).astype(np.float32)

    # Apply normalization (same (0.01, 0.99) range used at training time)
    ty = "duration"
    duration_linguistic_features = P.minmax_scale(
        duration_linguistic_features, X_min[ty], X_max[ty],
        feature_range=(0.01, 0.99))

    # Apply models
    duration_model.eval()

    # Apply model: batch of one sequence with its length
    x = Variable(torch.from_numpy(duration_linguistic_features)).float()
    xl = len(x)
    x = x.view(1, -1, x.size(-1))
    x = _generator_input(hp_duration, x)
    x = x.cuda() if use_cuda else x
    duration_predicted = duration_model(x, [xl]).data.cpu().numpy()
    duration_predicted = duration_predicted.reshape(-1, duration_predicted.shape[-1])

    # Apply denormalization
    duration_predicted = P.inv_scale(duration_predicted, Y_mean[ty], Y_std[ty])
    duration_predicted = np.round(duration_predicted)

    # Set minimum state duration to 1 (zero/negative durations are invalid)
    # print(duration_predicted)
    duration_predicted[duration_predicted <= 0] = 1
    hts_labels.set_durations(duration_predicted)

    return hts_labels
def __test_raise1(x, X_min, X_max):
    # Calling minmax_scale with neither (min, max) nor (min_, scale_) must
    # raise ValueError; the extra arguments are deliberately unused.
    P.minmax_scale(x)
def create_loader(test=False):
    """Build train/test loaders of normalized acoustic features + mora indices.

    Reads binary feature files under ``./data/basic5000``, computes
    normalization statistics on the training split, min-max scales the
    inputs into (0.01, 0.99), and pairs each utterance with its mora index
    list. Utterances are split 1:1:18 (held-out test / validation / train)
    by position modulo 20.

    Args:
        test: when True, additionally return the indices of the held-out
            (non-validation) test utterances.

    Returns:
        (train_loader, test_loader) — lists of [x, y, mora_indices] —
        plus the held-out index list when ``test`` is True.
    """
    DATA_ROOT = "./data/basic5000"
    X = {"acoustic": {}}
    Y = {"acoustic": {}}
    utt_lengths = {"acoustic": {}}
    for ty in ["acoustic"]:
        for phase in ["train", "test"]:
            train = phase == "train"
            x_dim = (duration_linguistic_dim if ty == "duration"
                     else acoustic_linguisic_dim)
            y_dim = duration_dim if ty == "duration" else acoustic_dim
            X[ty][phase] = FileSourceDataset(
                BinaryFileSource(join(DATA_ROOT, "X_{}".format(ty)),
                                 dim=x_dim, train=train))
            Y[ty][phase] = FileSourceDataset(
                BinaryFileSource(join(DATA_ROOT, "Y_{}".format(ty)),
                                 dim=y_dim, train=train))
            # BUG FIX: np.int was deprecated and removed (NumPy >= 1.24);
            # the builtin int is the documented replacement.
            utt_lengths[ty][phase] = np.array(
                [len(x) for x in X[ty][phase]], dtype=int)

    # Normalization statistics, computed on the training split only.
    X_min = {}
    X_max = {}
    Y_mean = {}
    Y_var = {}
    Y_scale = {}
    for typ in ["acoustic"]:
        X_min[typ], X_max[typ] = minmax(X[typ]["train"],
                                        utt_lengths[typ]["train"])
        Y_mean[typ], Y_var[typ] = meanvar(Y[typ]["train"],
                                          utt_lengths[typ]["train"])
        Y_scale[typ] = np.sqrt(Y_var[typ])

    mora_index_lists = sorted(
        glob(join("data/basic5000/mora_index", "squeezed_*.csv")))
    mora_index_lists_for_model = [
        np.loadtxt(path).reshape(-1) for path in mora_index_lists
    ]

    # Split by position: offset-1 multiples of 20 are held-out test,
    # multiples of 20 are validation, everything else is training.
    train_mora_index_lists = []
    test_mora_index_lists = []
    test_not_valid = []
    for i, mora_i in enumerate(mora_index_lists_for_model):
        if (i - 1) % 20 == 0:  # held-out test
            if test:
                test_not_valid.append(i)
        elif i % 20 == 0:  # valid
            test_mora_index_lists.append(mora_i)
        else:
            train_mora_index_lists.append(mora_i)

    # Inputs are min-max scaled; targets are left unnormalized here
    # (scaling is applied downstream, see the commented history in VCS).
    X_acoustic_train = [
        minmax_scale(x, X_min["acoustic"], X_max["acoustic"],
                     feature_range=(0.01, 0.99))
        for x in X["acoustic"]["train"]
    ]
    Y_acoustic_train = [y for y in Y["acoustic"]["train"]]

    X_acoustic_test = [
        minmax_scale(x, X_min["acoustic"], X_max["acoustic"],
                     feature_range=(0.01, 0.99))
        for x in X["acoustic"]["test"]
    ]
    Y_acoustic_test = [y for y in Y["acoustic"]["test"]]

    train_loader = [
        [X_acoustic_train[i], Y_acoustic_train[i], train_mora_index_lists[i]]
        for i in range(len(train_mora_index_lists))
    ]
    test_loader = [
        [X_acoustic_test[i], Y_acoustic_test[i], test_mora_index_lists[i]]
        for i in range(len(test_mora_index_lists))
    ]

    if test:
        # BUG FIX: the original returned an undefined name
        # ``test_not_valid_loader`` (guaranteed NameError). Return the
        # collected held-out utterance indices instead.
        return train_loader, test_loader, test_not_valid
    else:
        return train_loader, test_loader