def inv_scale(mgc, lf0, vuv, bap, Y_mean, Y_std, binalize_vuv=True):
    """Denormalize per-stream acoustic features.

    ``Y_mean``/``Y_std`` are statistics over the concatenated static+dynamic
    feature vector; each stream is denormalized with the slice covering its
    static coefficients only. When ``binalize_vuv`` is True, the V/UV stream
    is thresholded at 0.5 and cast to long.
    """
    mgc_dim, lf0_dim, vuv_dim, bap_dim = hp.stream_sizes
    n_win = len(hp.windows)
    # Offsets of each stream inside the concatenated feature vector.
    lf0_offset = mgc_dim
    vuv_offset = lf0_offset + lf0_dim
    bap_offset = vuv_offset + vuv_dim

    def _denorm(stream, offset, dim):
        # Only the static portion (dim // n_win coefficients) of the
        # statistics applies to the MLPG output.
        sl = slice(offset, offset + dim // n_win)
        return P.inv_scale(stream, Y_mean[sl], Y_std[sl])

    mgc = _denorm(mgc, 0, mgc_dim)
    lf0 = _denorm(lf0, lf0_offset, lf0_dim)
    bap = _denorm(bap, bap_offset, bap_dim)
    # V/UV is a single scalar stream; index, don't slice.
    vuv = P.inv_scale(vuv, Y_mean[vuv_offset], Y_std[vuv_offset])

    if binalize_vuv:
        vuv[vuv > 0.5] = 1.0
        vuv[vuv <= 0.5] = 0
        vuv = vuv.long()

    return mgc, lf0, vuv, bap
def gen_parameters(y_predicted, Y_mean, Y_std):
    """Run MLPG on normalized acoustic predictions, then denormalize.

    ``y_predicted`` holds the concatenated [mgc, lf0, vuv, bap] streams in
    the static+dynamic domain. MLPG is performed with unit variances and the
    static-coefficient means/stds are applied afterwards (MGE-style order).
    """
    mgc_dim, lf0_dim, vuv_dim, bap_dim = hp_acoustic.stream_sizes
    windows = hp_acoustic.windows
    n_win = len(windows)
    lf0_begin = mgc_dim
    vuv_begin = lf0_begin + lf0_dim
    bap_begin = vuv_begin + vuv_dim

    # Carve the prediction into per-stream views.
    mgc = y_predicted[:, :lf0_begin]
    lf0 = y_predicted[:, lf0_begin:vuv_begin]
    vuv = y_predicted[:, vuv_begin]
    bap = y_predicted[:, bap_begin:]

    # MLPG on still-normalized features, so unit variance per dimension.
    mgc = paramgen.mlpg(mgc, np.ones(mgc.shape[-1]), windows)
    lf0 = paramgen.mlpg(lf0, np.ones(lf0.shape[-1]), windows)
    bap = paramgen.mlpg(bap, np.ones(bap.shape[-1]), windows)

    ty = "acoustic"
    # With MGE training, denormalization must happen after MLPG.
    mgc = P.inv_scale(mgc, Y_mean[ty][:mgc_dim // n_win],
                      Y_std[ty][:mgc_dim // n_win])
    lf0 = P.inv_scale(lf0,
                      Y_mean[ty][lf0_begin:lf0_begin + lf0_dim // n_win],
                      Y_std[ty][lf0_begin:lf0_begin + lf0_dim // n_win])
    bap = P.inv_scale(bap,
                      Y_mean[ty][bap_begin:bap_begin + bap_dim // n_win],
                      Y_std[ty][bap_begin:bap_begin + bap_dim // n_win])
    vuv = P.inv_scale(vuv, Y_mean[ty][vuv_begin], Y_std[ty][vuv_begin])

    return mgc, lf0, vuv, bap
def compute_distortions(y_static, y_hat_static, Y_data_mean, Y_data_std,
                        lengths=None):
    """Compute objective distortion measures for the configured model type.

    acoustic: MCD (excluding 0th coeff), BAP distortion, F0 RMSE, V/UV error.
    duration: RMSE of denormalized durations.
    vc: MCD of denormalized mel-cepstra (first ``hp.order`` dims).
    """
    if hp.name == "acoustic":
        mgc, lf0, vuv, bap = split_streams(y_static, Y_data_mean, Y_data_std)
        mgc_hat, lf0_hat, vuv_hat, bap_hat = split_streams(
            y_hat_static, Y_data_mean, Y_data_std)
        # F0 error is undefined when no frames are voiced.
        try:
            f0_mse = metrics.lf0_mean_squared_error(
                lf0, vuv, lf0_hat, vuv_hat,
                lengths=lengths, linear_domain=True)
        except ZeroDivisionError:
            f0_mse = np.nan
        return {
            # Skip the 0th (energy) coefficient for MCD.
            "mcd": metrics.melcd(mgc[:, :, 1:], mgc_hat[:, :, 1:],
                                 lengths=lengths),
            "bap_mcd": metrics.melcd(bap, bap_hat, lengths=lengths) / 10.0,
            "f0_rmse": np.sqrt(f0_mse),
            "vuv_err": metrics.vuv_error(vuv, vuv_hat, lengths=lengths),
        }

    if hp.name == "duration":
        ref = P.inv_scale(y_static, Y_data_mean, Y_data_std)
        hyp = P.inv_scale(y_hat_static, Y_data_mean, Y_data_std)
        return {
            "dur_rmse": math.sqrt(
                metrics.mean_squared_error(ref, hyp, lengths=lengths)),
        }

    if hp.name == "vc":
        static_dim = hp.order
        ref = P.inv_scale(y_static, Y_data_mean[:static_dim],
                          Y_data_std[:static_dim])
        hyp = P.inv_scale(y_hat_static, Y_data_mean[:static_dim],
                          Y_data_std[:static_dim])
        return {"mcd": metrics.melcd(ref, hyp, lengths=lengths)}

    assert False
def gen_parameters(self, y_predicted, mge_training=False):
    """Generate static acoustic parameters (mgc, lf0, vuv, bap) via MLPG.

    Args:
        y_predicted: Predicted features in the static+dynamic domain
            (normalized when ``mge_training`` is True).
        mge_training: If True, MLPG runs on normalized features with unit
            variances and denormalization is done afterwards; otherwise
            features are denormalized first and MLPG uses data variances.

    Returns:
        Tuple ``(mgc, lf0, vuv, bap)`` of static-domain parameters.
    """
    # NOTE(review): ``stream_sizes`` and ``windows`` come from the enclosing
    # module scope — confirm they are consistent with self.mean/self.std.
    mgc_dim, lf0_dim, vuv_dim, bap_dim = stream_sizes
    lf0_start_idx = mgc_dim
    vuv_start_idx = lf0_start_idx + lf0_dim
    bap_start_idx = vuv_start_idx + vuv_dim

    if mge_training:
        # Split acoustic features
        mgc = y_predicted[:, :lf0_start_idx]
        lf0 = y_predicted[:, lf0_start_idx:vuv_start_idx]
        vuv = y_predicted[:, vuv_start_idx]
        bap = y_predicted[:, bap_start_idx:]

        # Perform MLPG on normalized features (unit variances)
        mgc = paramgen.mlpg(mgc, np.ones(mgc.shape[-1]), windows)
        lf0 = paramgen.mlpg(lf0, np.ones(lf0.shape[-1]), windows)
        bap = paramgen.mlpg(bap, np.ones(bap.shape[-1]), windows)

        # When we use MGE training, denormalization should be done after MLPG.
        mgc = P.inv_scale(mgc, self.mean[:mgc_dim // len(windows)],
                          self.std[:mgc_dim // len(windows)])
        lf0 = P.inv_scale(
            lf0,
            self.mean[lf0_start_idx:lf0_start_idx + lf0_dim // len(windows)],
            self.std[lf0_start_idx:lf0_start_idx + lf0_dim // len(windows)])
        bap = P.inv_scale(
            bap,
            self.mean[bap_start_idx:bap_start_idx + bap_dim // len(windows)],
            self.std[bap_start_idx:bap_start_idx + bap_dim // len(windows)])
        vuv = P.inv_scale(vuv, self.mean[vuv_start_idx],
                          self.std[vuv_start_idx])
    else:
        # Denormalization first
        y_predicted = P.inv_scale(y_predicted, self.mean, self.std)

        # Split acoustic features
        mgc = y_predicted[:, :lf0_start_idx]
        lf0 = y_predicted[:, lf0_start_idx:vuv_start_idx]
        vuv = y_predicted[:, vuv_start_idx]
        bap = y_predicted[:, bap_start_idx:]

        # Perform MLPG with the data variances
        Y_var = self.std * self.std
        mgc = paramgen.mlpg(mgc, Y_var[:lf0_start_idx], windows)
        lf0 = paramgen.mlpg(lf0, Y_var[lf0_start_idx:vuv_start_idx], windows)
        bap = paramgen.mlpg(bap, Y_var[bap_start_idx:], windows)

    return mgc, lf0, vuv, bap
def gen_parameters(y_predicted, Y_mean, Y_std, mge_training=True):
    """Generate static acoustic parameters (mgc, lf0, vuv, bap) via MLPG.

    Args:
        y_predicted: Concatenated [mgc, lf0, vuv, bap] predictions in the
            static+dynamic domain (normalized when ``mge_training`` is True).
        Y_mean: Output feature means over the full static+dynamic vector.
        Y_std: Output feature standard deviations, same layout as ``Y_mean``.
        mge_training: If True, MLPG runs on normalized features with unit
            variances and denormalization is done afterwards; otherwise
            features are denormalized first and MLPG uses data variances.

    Returns:
        Tuple ``(mgc, lf0, vuv, bap)`` of static-domain parameters.
    """
    mgc_dim, lf0_dim, vuv_dim, bap_dim = audio_world_config.stream_sizes
    lf0_start_idx = mgc_dim
    vuv_start_idx = lf0_start_idx + lf0_dim
    bap_start_idx = vuv_start_idx + vuv_dim
    windows = audio_world_config.windows

    if mge_training:
        # Split acoustic features
        mgc = y_predicted[:, :lf0_start_idx]
        lf0 = y_predicted[:, lf0_start_idx:vuv_start_idx]
        vuv = y_predicted[:, vuv_start_idx]
        bap = y_predicted[:, bap_start_idx:]

        # Perform MLPG on normalized features (unit variances)
        mgc = paramgen.mlpg(mgc, np.ones(mgc.shape[-1]), windows)
        lf0 = paramgen.mlpg(lf0, np.ones(lf0.shape[-1]), windows)
        bap = paramgen.mlpg(bap, np.ones(bap.shape[-1]), windows)

        # When we use MGE training, denormalization should be done after MLPG.
        mgc = P.inv_scale(mgc, Y_mean[:mgc_dim // len(windows)],
                          Y_std[:mgc_dim // len(windows)])
        lf0 = P.inv_scale(
            lf0,
            Y_mean[lf0_start_idx:lf0_start_idx + lf0_dim // len(windows)],
            Y_std[lf0_start_idx:lf0_start_idx + lf0_dim // len(windows)])
        bap = P.inv_scale(
            bap,
            Y_mean[bap_start_idx:bap_start_idx + bap_dim // len(windows)],
            Y_std[bap_start_idx:bap_start_idx + bap_dim // len(windows)])
        vuv = P.inv_scale(vuv, Y_mean[vuv_start_idx], Y_std[vuv_start_idx])
    else:
        # Denormalization first
        y_predicted = P.inv_scale(y_predicted, Y_mean, Y_std)

        # Split acoustic features
        mgc = y_predicted[:, :lf0_start_idx]
        lf0 = y_predicted[:, lf0_start_idx:vuv_start_idx]
        vuv = y_predicted[:, vuv_start_idx]
        bap = y_predicted[:, bap_start_idx:]

        # Perform MLPG with the data variances
        Y_var = Y_std * Y_std
        mgc = paramgen.mlpg(mgc, Y_var[:lf0_start_idx], windows)
        lf0 = paramgen.mlpg(lf0, Y_var[lf0_start_idx:vuv_start_idx], windows)
        bap = paramgen.mlpg(bap, Y_var[bap_start_idx:], windows)

    return mgc, lf0, vuv, bap
def gen_parameters(y_predicted, Y_mean, Y_std, mge_training=True):
    """Generate static acoustic parameters (mgc, lf0, vuv, bap) via MLPG.

    Args:
        y_predicted: Concatenated [mgc, lf0, vuv, bap] predictions in the
            static+dynamic domain (normalized when ``mge_training`` is True).
        Y_mean: Dict of per-model-type output means, keyed by "acoustic".
        Y_std: Dict of per-model-type output stds, keyed by "acoustic".
        mge_training: If True, MLPG runs on normalized features with unit
            variances and denormalization is done afterwards; otherwise
            features are denormalized first and MLPG uses data variances.

    Returns:
        Tuple ``(mgc, lf0, vuv, bap)`` of static-domain parameters.
    """
    mgc_dim, lf0_dim, vuv_dim, bap_dim = hp_acoustic.stream_sizes
    lf0_start_idx = mgc_dim
    vuv_start_idx = lf0_start_idx + lf0_dim
    bap_start_idx = vuv_start_idx + vuv_dim
    windows = hp_acoustic.windows
    ty = "acoustic"

    if mge_training:
        # Split acoustic features
        mgc = y_predicted[:, :lf0_start_idx]
        lf0 = y_predicted[:, lf0_start_idx:vuv_start_idx]
        vuv = y_predicted[:, vuv_start_idx]
        bap = y_predicted[:, bap_start_idx:]

        # Perform MLPG on normalized features (unit variances)
        mgc = paramgen.mlpg(mgc, np.ones(mgc.shape[-1]), windows)
        lf0 = paramgen.mlpg(lf0, np.ones(lf0.shape[-1]), windows)
        bap = paramgen.mlpg(bap, np.ones(bap.shape[-1]), windows)

        # When we use MGE training, denormalization should be done after MLPG.
        mgc = P.inv_scale(mgc, Y_mean[ty][:mgc_dim // len(windows)],
                          Y_std[ty][:mgc_dim // len(windows)])
        lf0 = P.inv_scale(
            lf0,
            Y_mean[ty][lf0_start_idx:lf0_start_idx + lf0_dim // len(windows)],
            Y_std[ty][lf0_start_idx:lf0_start_idx + lf0_dim // len(windows)])
        bap = P.inv_scale(
            bap,
            Y_mean[ty][bap_start_idx:bap_start_idx + bap_dim // len(windows)],
            Y_std[ty][bap_start_idx:bap_start_idx + bap_dim // len(windows)])
        vuv = P.inv_scale(vuv, Y_mean[ty][vuv_start_idx],
                          Y_std[ty][vuv_start_idx])
    else:
        # Denormalization first.
        # BUGFIX: the original passed the whole Y_mean/Y_std dicts here;
        # every other access in this function (including Y_var below)
        # indexes with `ty`, so the statistics must be indexed here too.
        y_predicted = P.inv_scale(y_predicted, Y_mean[ty], Y_std[ty])

        # Split acoustic features
        mgc = y_predicted[:, :lf0_start_idx]
        lf0 = y_predicted[:, lf0_start_idx:vuv_start_idx]
        vuv = y_predicted[:, vuv_start_idx]
        bap = y_predicted[:, bap_start_idx:]

        # Perform MLPG with the data variances
        Y_var = Y_std[ty] * Y_std[ty]
        mgc = paramgen.mlpg(mgc, Y_var[:lf0_start_idx], windows)
        lf0 = paramgen.mlpg(lf0, Y_var[lf0_start_idx:vuv_start_idx], windows)
        bap = paramgen.mlpg(bap, Y_var[bap_start_idx:], windows)

    return mgc, lf0, vuv, bap
def test_meanvar():
    """Mean/var statistics must agree across plain and padded datasets."""
    # Pick acoustic features for testing.
    _, X = example_file_data_sources_for_acoustic_model()
    X = FileSourceDataset(X)
    lengths = [len(x) for x in X]
    feat_dim = X[0].shape[-1]

    X_mean, X_var = P.meanvar(X)
    X_std = np.sqrt(X_var)

    # Statistics are finite and sized per feature dimension.
    assert np.isfinite(X_mean).all()
    assert np.isfinite(X_var).all()
    assert X_mean.shape[-1] == feat_dim
    assert X_var.shape[-1] == feat_dim

    # meanstd must agree with sqrt of the variance from meanvar.
    _, X_std_hat = P.meanstd(X)
    assert np.allclose(X_std, X_std_hat)

    first = X[0]
    scaled = P.scale(first, X_mean, X_std)
    assert np.isfinite(scaled).all()

    # Padded dataset: same statistics when true lengths are supplied.
    _, X = example_file_data_sources_for_acoustic_model()
    X = PaddedFileSourceDataset(X, 1000)
    X_mean_hat, X_var_hat = P.meanvar(X, lengths)
    assert np.allclose(X_mean, X_mean_hat)
    assert np.allclose(X_var, X_var_hat)

    # scale followed by inv_scale is (numerically) the identity.
    sample = X[0]
    roundtrip = P.inv_scale(P.scale(sample, X_mean, X_std), X_mean, X_std)
    assert np.allclose(sample, roundtrip, atol=1e-5)
def test_vc_from_path(model, path, data_mean, data_std, diffvc=True):
    """Run voice conversion on a wav file and synthesize a waveform.

    Extracts WORLD features, predicts mel-cepstra with ``model``, and
    synthesizes either a differential-MLSA waveform (``diffvc=True``) or a
    full WORLD resynthesis. Returns ``(waveform, inputs, outputs)`` where
    ``inputs``/``outputs`` are the static mel-cepstra before/after conversion.
    """
    model.eval()

    # WORLD analysis: F0, spectral envelope, aperiodicity.
    fs, x = wavfile.read(path)
    x = x.astype(np.float64)
    f0, timeaxis = pyworld.dio(x, fs, frame_period=hp.frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)

    # Mel-cepstrum analysis; keep the 0th (power) coefficient aside.
    alpha = pysptk.util.mcepalpha(fs)
    mc = pysptk.sp2mc(spectrogram, order=hp.order, alpha=alpha)
    c0, mc = mc[:, 0], mc[:, 1:]
    static_dim = mc.shape[-1]
    # NOTE(review): ``hop_length`` is read from module scope here — confirm
    # it matches hp.frame_period for this sample rate.
    mc = P.modspec_smoothing(mc, fs / hop_length, cutoff=50)
    mc = P.delta_features(mc, hp.windows).astype(np.float32)
    T = mc.shape[0]
    inputs = mc[:, :static_dim].copy()

    # Normalization
    mc_scaled = P.scale(mc, data_mean, data_std)

    # Apply model (MLPG with a unit-variance matrix R).
    mc_scaled = Variable(torch.from_numpy(mc_scaled))
    R = unit_variance_mlpg_matrix(hp.windows, T)
    R = torch.from_numpy(R)
    y_hat, y_hat_static = model(mc_scaled, R)
    mc_static_pred = y_hat_static.data.cpu().numpy().reshape(-1, static_dim)

    # Denormalize (static coefficients only).
    mc_static_pred = P.inv_scale(mc_static_pred, data_mean[:static_dim],
                                 data_std[:static_dim])
    outputs = mc_static_pred.copy()

    # Differential VC: filter the source waveform with the mcep difference.
    if diffvc:
        mc_static_pred = mc_static_pred - mc[:, :static_dim]

    mc = np.hstack((c0[:, None], mc_static_pred))
    if diffvc:
        mc[:, 0] = 0  # remove power coefficients
        engine = Synthesizer(MLSADF(order=hp.order, alpha=alpha),
                             hopsize=hop_length)
        b = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
        waveform = engine.synthesis(x, b)
    else:
        # Full resynthesis through WORLD with the converted spectrum.
        fftlen = pyworld.get_cheaptrick_fft_size(fs)
        spectrogram = pysptk.mc2sp(mc.astype(np.float64), alpha=alpha,
                                   fftlen=fftlen)
        waveform = pyworld.synthesize(f0, spectrogram, aperiodicity, fs,
                                      hp.frame_period)

    return waveform, inputs, outputs
def compute_distortions(y_static, y_hat_static, Y_data_mean, Y_data_std,
                        lengths=None):
    """Mel-cepstral distortion between reference and predicted features.

    Only the "vc" model type is supported; anything else is a programming
    error and trips the assertion.
    """
    if hp.name == "vc":
        static_dim = hp.order
        mean_static = Y_data_mean[:static_dim]
        std_static = Y_data_std[:static_dim]
        # Denormalize both sides before measuring distortion.
        ref = P.inv_scale(y_static, mean_static, std_static)
        hyp = P.inv_scale(y_hat_static, mean_static, std_static)
        distortions = {"mcd": metrics.melcd(ref, hyp, lengths=lengths)}
    else:
        assert False
    return distortions
def gen_duration(label_path, duration_model, X_min, X_max, Y_mean, Y_std):
    """Predict state durations for an HTS label file and write them back.

    Extracts linguistic features from the labels, normalizes them, runs the
    duration model, denormalizes and rounds the predictions (minimum 1),
    and returns the labels with durations set.
    """
    # Linguistic features for duration
    hts_labels = hts.load(label_path)
    duration_linguistic_features = fe.linguistic_features(
        hts_labels, binary_dict, continuous_dict,
        add_frame_features=hp_duration.add_frame_features,
        subphone_features=hp_duration.subphone_features).astype(np.float32)

    # Min-max normalize inputs to the range used at training time
    ty = "duration"
    duration_linguistic_features = P.minmax_scale(
        duration_linguistic_features, X_min[ty], X_max[ty],
        feature_range=(0.01, 0.99))

    # Apply model (inference mode)
    duration_model.eval()
    x = Variable(torch.from_numpy(duration_linguistic_features)).float()
    xl = len(x)
    x = x.view(1, -1, x.size(-1))  # add batch axis
    x = _generator_input(hp_duration, x)
    x = x.cuda() if use_cuda else x
    duration_predicted = duration_model(x, [xl]).data.cpu().numpy()
    duration_predicted = duration_predicted.reshape(
        -1, duration_predicted.shape[-1])

    # Apply denormalization and round to integer frame counts
    duration_predicted = P.inv_scale(duration_predicted, Y_mean[ty], Y_std[ty])
    duration_predicted = np.round(duration_predicted)

    # Set minimum state duration to 1
    duration_predicted[duration_predicted <= 0] = 1
    hts_labels.set_durations(duration_predicted)

    return hts_labels
def gen_duration(label_path, duration_model, X_min, X_max, Y_mean, Y_std):
    """Predict state durations for an HTS label file and write them back."""
    labels = hts.load(label_path)

    # Linguistic features for the duration model.
    feats = fe.linguistic_features(
        labels, binary_dict, continuous_dict,
        add_frame_features=hp_duration.add_frame_features,
        subphone_features=hp_duration.subphone_features).astype(np.float32)

    ty = "duration"
    # Min-max normalize to the range used at training time.
    feats = P.minmax_scale(feats, X_min[ty], X_max[ty],
                           feature_range=(0.01, 0.99))

    # Run the model in inference mode.
    duration_model.eval()
    inp = Variable(torch.from_numpy(feats)).float()
    inp_len = len(inp)
    inp = inp.view(1, -1, inp.size(-1))  # add batch axis
    inp = _generator_input(hp_duration, inp)
    if use_cuda:
        inp = inp.cuda()
    predicted = duration_model(inp, [inp_len]).data.cpu().numpy()
    predicted = predicted.reshape(-1, predicted.shape[-1])

    # Denormalize, round to integer frame counts, enforce minimum of 1.
    predicted = np.round(P.inv_scale(predicted, Y_mean[ty], Y_std[ty]))
    predicted[predicted <= 0] = 1

    labels.set_durations(predicted)
    return labels
def test_vc_from_path(model, x, fs, data_mean, data_std, diffvc=True):
    """Run voice conversion on a raw waveform and synthesize the result.

    Extracts WORLD features from ``x``, predicts mel-cepstra with ``model``
    (directly, or via multi-stream MLPG for generic sequence models), and
    synthesizes either a differential-MLSA waveform (``diffvc=True``) or a
    full WORLD resynthesis. Returns ``(waveform, inputs, outputs)`` where
    ``inputs``/``outputs`` are the static mel-cepstra before/after conversion.
    """
    model.eval()

    hop_length = int(fs * (hp.frame_period * 0.001))
    x = x.astype(np.float64)

    # WORLD analysis: F0, spectral envelope, aperiodicity.
    f0, timeaxis = pyworld.dio(x, fs, frame_period=hp.frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)

    # Mel-cepstrum analysis; keep the 0th (power) coefficient aside.
    alpha = pysptk.util.mcepalpha(fs)
    mc = pysptk.sp2mc(spectrogram, order=hp.order, alpha=alpha)
    c0, mc = mc[:, 0], mc[:, 1:]
    static_dim = mc.shape[-1]
    mc = P.modspec_smoothing(mc, fs / hop_length, cutoff=50)
    mc = P.delta_features(mc, hp.windows).astype(np.float32)
    T = mc.shape[0]
    inputs = mc[:, :static_dim].copy()

    # Normalization
    mc_scaled = P.scale(mc, data_mean, data_std)
    mc_scaled = Variable(torch.from_numpy(mc_scaled))
    lengths = [len(mc_scaled)]

    # Add batch axis
    mc_scaled = mc_scaled.view(1, -1, mc_scaled.size(-1))

    # For MLPG (unit-variance MLPG matrix)
    R = unit_variance_mlpg_matrix(hp.windows, T)
    R = torch.from_numpy(R)

    # Apply model
    if model.include_parameter_generation():
        # Case: models include parameter generation in itself
        # Multistream features cannot be used in this case
        y_hat, y_hat_static = model(mc_scaled, R, lengths=lengths)
    else:
        # Case: generic models (can be sequence model)
        assert hp.has_dynamic_features is not None
        y_hat = model(mc_scaled, lengths=lengths)
        y_hat_static = multi_stream_mlpg(
            y_hat, R, hp.stream_sizes, hp.has_dynamic_features)

    mc_static_pred = y_hat_static.data.cpu().numpy().reshape(-1, static_dim)

    # Denormalize (static coefficients only)
    mc_static_pred = P.inv_scale(
        mc_static_pred, data_mean[:static_dim], data_std[:static_dim])
    outputs = mc_static_pred.copy()

    # Differential VC: filter the source waveform with the mcep difference.
    if diffvc:
        mc_static_pred = mc_static_pred - mc[:, :static_dim]

    mc = np.hstack((c0[:, None], mc_static_pred))
    if diffvc:
        mc[:, 0] = 0  # remove power coefficients
        engine = Synthesizer(MLSADF(order=hp.order, alpha=alpha),
                             hopsize=hop_length)
        b = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
        waveform = engine.synthesis(x, b)
    else:
        # Full resynthesis through WORLD with the converted spectrum.
        fftlen = pyworld.get_cheaptrick_fft_size(fs)
        spectrogram = pysptk.mc2sp(
            mc.astype(np.float64), alpha=alpha, fftlen=fftlen)
        waveform = pyworld.synthesize(
            f0, spectrogram, aperiodicity, fs, hp.frame_period)

    return waveform, inputs, outputs
def test_vc_from_path(model, x, fs, data_mean, data_std, diffvc=True):
    """Run voice conversion on a raw waveform and synthesize the result.

    Extracts WORLD features from ``x``, predicts mel-cepstra with ``model``
    (directly, or via multi-stream MLPG for generic sequence models), and
    synthesizes either a differential-MLSA waveform (``diffvc=True``) or a
    full WORLD resynthesis. Returns ``(waveform, inputs, outputs)`` where
    ``inputs``/``outputs`` are the static mel-cepstra before/after conversion.
    """
    model.eval()

    hop_length = int(fs * (hp.frame_period * 0.001))
    x = x.astype(np.float64)

    # WORLD analysis: F0, spectral envelope, aperiodicity.
    f0, timeaxis = pyworld.dio(x, fs, frame_period=hp.frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)

    # Mel-cepstrum analysis; keep the 0th (power) coefficient aside.
    alpha = pysptk.util.mcepalpha(fs)
    mc = pysptk.sp2mc(spectrogram, order=hp.order, alpha=alpha)
    c0, mc = mc[:, 0], mc[:, 1:]
    static_dim = mc.shape[-1]
    mc = P.modspec_smoothing(mc, fs / hop_length, cutoff=50)
    mc = P.delta_features(mc, hp.windows).astype(np.float32)
    T = mc.shape[0]
    inputs = mc[:, :static_dim].copy()

    # Normalization
    mc_scaled = P.scale(mc, data_mean, data_std)
    mc_scaled = Variable(torch.from_numpy(mc_scaled))
    lengths = [len(mc_scaled)]

    # Add batch axis
    mc_scaled = mc_scaled.view(1, -1, mc_scaled.size(-1))

    # For MLPG (unit-variance MLPG matrix)
    R = unit_variance_mlpg_matrix(hp.windows, T)
    R = torch.from_numpy(R)

    # Apply model
    if model.include_parameter_generation():
        # Case: models include parameter generation in itself
        # Multistream features cannot be used in this case
        y_hat, y_hat_static = model(mc_scaled, R, lengths=lengths)
    else:
        # Case: generic models (can be sequence model)
        assert hp.has_dynamic_features is not None
        y_hat = model(mc_scaled, lengths=lengths)
        y_hat_static = multi_stream_mlpg(y_hat, R, hp.stream_sizes,
                                         hp.has_dynamic_features)

    mc_static_pred = y_hat_static.data.cpu().numpy().reshape(-1, static_dim)

    # Denormalize (static coefficients only)
    mc_static_pred = P.inv_scale(mc_static_pred, data_mean[:static_dim],
                                 data_std[:static_dim])
    outputs = mc_static_pred.copy()

    # Differential VC: filter the source waveform with the mcep difference.
    if diffvc:
        mc_static_pred = mc_static_pred - mc[:, :static_dim]

    mc = np.hstack((c0[:, None], mc_static_pred))
    if diffvc:
        mc[:, 0] = 0  # remove power coefficients
        engine = Synthesizer(MLSADF(order=hp.order, alpha=alpha),
                             hopsize=hop_length)
        b = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
        waveform = engine.synthesis(x, b)
    else:
        # Full resynthesis through WORLD with the converted spectrum.
        fftlen = pyworld.get_cheaptrick_fft_size(fs)
        spectrogram = pysptk.mc2sp(mc.astype(np.float64), alpha=alpha,
                                   fftlen=fftlen)
        waveform = pyworld.synthesize(f0, spectrogram, aperiodicity, fs,
                                      hp.frame_period)

    return waveform, inputs, outputs