def test_functional_mlpg():
    """Compare the functional autograd MLPG wrappers with ``G.mlpg``.

    For every window set returned by ``_get_windows_set()`` the test checks
    that ``AF.mlpg`` and ``AF.unit_variance_mlpg`` (with 2-D and 3-D inputs)
    reproduce the NumPy result for unit variances, and that a backward pass
    through an MSE loss runs for each of them.
    """
    static_dim = 2
    T = 5
    criterion = nn.MSELoss()

    for windows in _get_windows_set():
        feature_dim = static_dim * len(windows)

        torch.manual_seed(1234)
        raw_means = torch.rand(T, feature_dim)
        variances = torch.ones(feature_dim)

        # Reference trajectory computed by the NumPy implementation.
        reference = G.mlpg(raw_means.numpy(), variances.numpy(), windows)
        y = Variable(torch.from_numpy(reference), requires_grad=False)

        means = Variable(raw_means, requires_grad=True)

        # 1) General MLPG: forward must match, backward must run.
        generated = AF.mlpg(means, variances, windows)
        assert np.allclose(y.data.numpy(), generated.data.numpy())
        criterion(generated, y).backward()

        # 2) Unit-variance MLPG driven by the precomputed matrix R.
        R = torch.from_numpy(G.unit_variance_mlpg_matrix(windows, T))
        generated = AF.unit_variance_mlpg(R, means)
        assert np.allclose(y.data.numpy(), generated.data.numpy())
        criterion(generated, y).backward()

        # 3) Same as (2) but with a leading batch axis of size one.
        batched_means = means.view(1, -1, means.size(-1))
        generated = AF.unit_variance_mlpg(R, batched_means)
        flattened = generated.data.view(-1, static_dim)
        assert np.allclose(y.data.numpy(), flattened.numpy())
        criterion(generated.view(-1, static_dim), y).backward()
def test_multi_stream_mlpg():
    """Check ``multi_stream_mlpg`` stream by stream.

    The input has four streams of widths 180, 3, 1 and 3; the third one has
    no dynamic features. Each output slice is compared with the result of
    running ``unit_variance_mlpg`` on the matching input columns (or with
    the raw input column for the stream without dynamic features), and the
    output size is compared with that of ``get_static_features``.
    """
    windows = [
        (0, 0, np.array([1.0])),
        (1, 1, np.array([-0.5, 0.0, 0.5])),
        (1, 1, np.array([1.0, -2.0, 1.0])),
    ]
    in_dim = 187
    T = 100
    batch_size = 32
    stream_sizes = [180, 3, 1, 3]
    has_dynamic_features = [True, True, False, True]

    R = torch.from_numpy(unit_variance_mlpg_matrix(windows, T))
    x = Variable(torch.rand(batch_size, T, in_dim))

    y = multi_stream_mlpg(x, R, stream_sizes, has_dynamic_features)
    assert y.size() == (batch_size, T, 60 + 1 + 1 + 1)

    # Output layout: 60 columns for the first stream, then one column each.
    mgc = y[:, :, :60]
    lf0 = y[:, :, 60]
    vuv = y[:, :, 61]
    bap = y[:, :, 62]

    expected_mgc = unit_variance_mlpg(R, x[:, :, :180])
    assert (expected_mgc == mgc).data.all()

    # squeeze(-1) drops the trailing axis so the result lines up with the
    # 2-D slices taken from y above.
    expected_lf0 = unit_variance_mlpg(R, x[:, :, 180:180 + 3]).squeeze(-1)
    assert (expected_lf0 == lf0).data.all()

    # The stream without dynamic features is compared with the input column.
    assert (x[:, :, 183] == vuv).data.all()

    expected_bap = unit_variance_mlpg(R, x[:, :, 184:184 + 3]).squeeze(-1)
    assert (expected_bap == bap).data.all()

    static_features = get_static_features(
        x, len(windows), stream_sizes, has_dynamic_features)
    assert static_features.size() == y.size()
def test_multi_stream_mlpg():
    """Check ``multi_stream_mlpg`` stream by stream.

    The input has four streams of widths 180, 3, 1 and 3; the third one has
    no dynamic features. Each output slice is compared with the result of
    running ``unit_variance_mlpg`` on the matching input columns (or with
    the raw input column for the stream without dynamic features), and the
    output size is compared with that of ``get_static_features``.
    """
    windows = [
        (0, 0, np.array([1.0])),
        (1, 1, np.array([-0.5, 0.0, 0.5])),
        (1, 1, np.array([1.0, -2.0, 1.0])),
    ]
    in_dim = 187
    T = 100
    R = unit_variance_mlpg_matrix(windows, T)
    R = torch.from_numpy(R)

    batch_size = 32
    x = Variable(torch.rand(batch_size, T, in_dim))

    stream_sizes = [180, 3, 1, 3]
    has_dynamic_features = [True, True, False, True]
    y = multi_stream_mlpg(x, R, stream_sizes, has_dynamic_features)
    assert y.size() == (batch_size, T, 60 + 1 + 1 + 1)

    mgc = y[:, :, :60]
    lf0 = y[:, :, 60]
    vuv = y[:, :, 61]
    bap = y[:, :, 62]

    assert (unit_variance_mlpg(R, x[:, :, :180]) == mgc).data.all()
    # lf0 and bap are 2-D slices (integer index on the last axis), while
    # unit_variance_mlpg is given a 3-D input here. Drop the trailing axis
    # of its result so both sides of the comparison have the same rank
    # instead of relying on implicit reshaping/broadcasting.
    assert (unit_variance_mlpg(
        R, x[:, :, 180:180 + 3]).squeeze(-1) == lf0).data.all()
    assert (x[:, :, 183] == vuv).data.all()
    assert (unit_variance_mlpg(
        R, x[:, :, 184:184 + 3]).squeeze(-1) == bap).data.all()

    static_features = get_static_features(
        x, len(windows), stream_sizes, has_dynamic_features)
    assert static_features.size() == y.size()
def test_vc_from_path(model, path, data_mean, data_std, diffvc=True):
    """Run voice conversion on a wav file and return the synthesized audio.

    The file at ``path`` is read with ``wavfile.read``, analysed with
    pyworld/pysptk into mel-cepstrum plus delta features, normalized, passed
    through ``model`` together with the MLPG matrix, denormalized and finally
    synthesized.

    Args:
        model: Called as ``model(features, R)``; must return a pair whose
            second item holds the predicted static features.
        path: Path of the wav file to convert.
        data_mean: Mean used by ``P.scale`` / ``P.inv_scale``.
        data_std: Standard deviation used by ``P.scale`` / ``P.inv_scale``.
        diffvc: If true, synthesize by filtering the input waveform with the
            difference between predicted and input features; otherwise
            synthesize with ``pyworld.synthesize``.

    Returns:
        Tuple ``(waveform, inputs, outputs)`` where ``inputs`` are the static
        input features and ``outputs`` the denormalized predicted ones.

    Note:
        ``hop_length`` is not defined in this function; it is read from the
        enclosing (module) scope.
    """
    model.eval()

    fs, x = wavfile.read(path)
    x = x.astype(np.float64)

    # Analysis: F0, spectral envelope and aperiodicity.
    f0, timeaxis = pyworld.dio(x, fs, frame_period=hp.frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)

    # Mel-cepstrum; the 0-th coefficient is kept aside and re-attached later.
    alpha = pysptk.util.mcepalpha(fs)
    mcep = pysptk.sp2mc(spectrogram, order=hp.order, alpha=alpha)
    c0 = mcep[:, 0]
    mcep = mcep[:, 1:]
    static_dim = mcep.shape[-1]
    mcep = P.modspec_smoothing(mcep, fs / hop_length, cutoff=50)
    feats = P.delta_features(mcep, hp.windows).astype(np.float32)
    T = feats.shape[0]

    inputs = feats[:, :static_dim].copy()

    # Normalize and run the model.
    scaled = Variable(torch.from_numpy(P.scale(feats, data_mean, data_std)))
    R = torch.from_numpy(unit_variance_mlpg_matrix(hp.windows, T))
    y_hat, y_hat_static = model(scaled, R)
    predicted = y_hat_static.data.cpu().numpy().reshape(-1, static_dim)

    # Undo the normalization on the static part only.
    predicted = P.inv_scale(
        predicted, data_mean[:static_dim], data_std[:static_dim])
    outputs = predicted.copy()

    if diffvc:
        # Differential features filtered onto the original waveform.
        differential = predicted - feats[:, :static_dim]
        mc = np.hstack((c0[:, None], differential))
        mc[:, 0] = 0  # remove power coefficients
        engine = Synthesizer(
            MLSADF(order=hp.order, alpha=alpha), hopsize=hop_length)
        b = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
        waveform = engine.synthesis(x, b)
        return waveform, inputs, outputs

    # Vocoder re-synthesis from the predicted spectral envelope.
    mc = np.hstack((c0[:, None], predicted))
    fftlen = pyworld.get_cheaptrick_fft_size(fs)
    spectrogram = pysptk.mc2sp(
        mc.astype(np.float64), alpha=alpha, fftlen=fftlen)
    waveform = pyworld.synthesize(
        f0, spectrogram, aperiodicity, fs, hp.frame_period)
    return waveform, inputs, outputs
def test_unit_variance_mlpg():
    """Check that the unit-variance MLPG matrix reproduces ``G.mlpg``.

    With all variances set to one, multiplying the matrix returned by
    ``G.unit_variance_mlpg_matrix`` with the reshaped means must give the
    same result as ``G.mlpg`` for every window set.
    """
    static_dim = 2
    T = 10

    for windows in _get_windows_set():
        feature_dim = static_dim * len(windows)
        means = np.random.rand(T, feature_dim)
        unit_variances = np.ones(feature_dim)

        expected = G.mlpg(means, unit_variances, windows)

        R = G.unit_variance_mlpg_matrix(windows, T)
        stacked_means = G.reshape_means(means, static_dim)
        actual = R.dot(stacked_means)

        assert np.allclose(actual, expected)
def test_unit_variance_mlpg_gradcheck():
    """Validate ``UnitVarianceMLPG`` outputs and gradients.

    For each window set the function output is compared with ``MLPG`` run
    with unit variances, for both plain and reshaped means, and ``gradcheck``
    is run on ``UnitVarianceMLPG`` for both input layouts.
    """
    static_dim = 2
    T = 10

    for windows in _get_windows_set():
        feature_dim = static_dim * len(windows)
        torch.manual_seed(1234)

        # Means in the layout MLPG expects.
        means = Variable(torch.rand(T, feature_dim), requires_grad=True)

        # The same values in the reshaped layout, as an independent leaf.
        reshaped_np = G.reshape_means(means.data.clone().numpy(), static_dim)
        reshaped_means = Variable(
            torch.from_numpy(reshaped_np), requires_grad=True)

        # MLPG matrix as a float32 tensor.
        R = torch.from_numpy(
            G.unit_variance_mlpg_matrix(windows, T).astype(np.float32))

        # UnitVarianceMLPG accepts either layout.
        from_means = UnitVarianceMLPG(R)(means)
        from_reshaped = UnitVarianceMLPG(R)(reshaped_means)

        # Reference: general MLPG with all variances equal to one.
        variances = torch.ones(feature_dim).expand(T, feature_dim)
        reference = MLPG(variances, windows)(means)

        for candidate in (from_means, from_reshaped):
            assert np.allclose(
                candidate.data.numpy(), reference.data.numpy())

        # Numerical gradient check for both input layouts.
        for leaf in (reshaped_means, means):
            assert gradcheck(
                UnitVarianceMLPG(R), (leaf,), eps=1e-3, atol=1e-3)
def test_model():
    """Smoke-test ``In2OutHighwayNet`` on single and mini-batch inputs.

    Checks that the model reports built-in parameter generation, that the
    static output has ``in_dim // 2`` features, that a masked MSE loss can be
    back-propagated for a mini batch, and (when CUDA is available) that the
    forward pass also runs on the GPU.
    """
    windows = [
        (0, 0, np.array([1.0])),
        (1, 1, np.array([-0.5, 0.0, 0.5])),
    ]
    model = In2OutHighwayNet()
    print(model)
    assert model.include_parameter_generation()

    in_dim = 118
    static_dim = in_dim // 2
    T = 100
    x = Variable(torch.rand(1, T, in_dim))
    R = unit_variance_mlpg_matrix(windows, T)
    R = torch.from_numpy(R)
    _, y = model(x, R)
    print(y.size())
    assert y.size(-1) == static_dim

    # Mini batch
    batch_size = 32
    x = Variable(torch.rand(batch_size, T, in_dim))
    _, y_hat = model(x, R)
    y = Variable(torch.rand(batch_size, T, static_dim), requires_grad=False)
    # Random lengths for all but the last item, which spans the full T.
    lengths = [np.random.randint(50, T - 1)
               for _ in range(batch_size - 1)] + [T]
    lengths = Variable(torch.LongTensor(lengths), requires_grad=False)
    print(x.size(), y.size(), lengths.size())
    MaskedMSELoss()(y_hat, y, lengths).backward()

    # Inspect the model output. The target ``y`` is built with exactly this
    # shape a few lines above, so asserting on it could never fail.
    print(y_hat.size())
    assert y_hat.size(-1) == static_dim
    assert y_hat.size(0) == batch_size

    # cuda
    if torch.cuda.is_available():
        model = model.cuda()
        x = x.cuda()
        R = R.cuda()
        _, y_hat = model(x, R)
def test_model():
    """Smoke-test ``In2OutHighwayNet`` on single and mini-batch inputs.

    Checks that the model reports built-in parameter generation, that the
    static output has ``in_dim // 2`` features, that a masked MSE loss can be
    back-propagated for a mini batch, and (when CUDA is available) that the
    forward pass also runs on the GPU.
    """
    windows = [
        (0, 0, np.array([1.0])),
        (1, 1, np.array([-0.5, 0.0, 0.5])),
    ]
    model = In2OutHighwayNet()
    print(model)
    assert model.include_parameter_generation()

    in_dim = 118
    static_dim = in_dim // 2
    T = 100
    x = Variable(torch.rand(1, T, in_dim))
    R = unit_variance_mlpg_matrix(windows, T)
    R = torch.from_numpy(R)
    _, y = model(x, R)
    print(y.size())
    assert y.size(-1) == static_dim

    # Mini batch
    batch_size = 32
    x = Variable(torch.rand(batch_size, T, in_dim))
    _, y_hat = model(x, R)
    y = Variable(torch.rand(batch_size, T, static_dim), requires_grad=False)
    # Random lengths for all but the last item, which spans the full T.
    lengths = [np.random.randint(50, T - 1)
               for _ in range(batch_size - 1)] + [T]
    lengths = Variable(torch.LongTensor(lengths), requires_grad=False)
    print(x.size(), y.size(), lengths.size())
    MaskedMSELoss()(y_hat, y, lengths).backward()

    # Inspect the model output. The target ``y`` is built with exactly this
    # shape a few lines above, so asserting on it could never fail.
    print(y_hat.size())
    assert y_hat.size(-1) == static_dim
    assert y_hat.size(0) == batch_size

    # cuda
    if torch.cuda.is_available():
        model = model.cuda()
        x = x.cuda()
        R = R.cuda()
        _, y_hat = model(x, R)
def test_minibatch_unit_variance_mlpg_gradcheck():
    """Check 2-D versus 3-D (mini-batch) inputs of ``AF.unit_variance_mlpg``.

    The same means are fed once as a 2-D tensor and once expanded along a
    leading batch axis, in both the reshaped and the plain layout. Outputs of
    every batch item must match the 2-D output, and after back-propagating an
    MSE loss the per-item gradients must match the 2-D gradient.
    """
    static_dim = 2
    T = 5
    mse = nn.MSELoss()

    for windows in _get_windows_set():
        batch_size = 5
        torch.manual_seed(1234)

        # Raw tensors: plain and reshaped means plus their batched views.
        means = torch.rand(T, static_dim * len(windows))
        means_expanded = means.expand(batch_size, *means.shape)
        reshaped_means = torch.from_numpy(
            G.reshape_means(means.numpy(), static_dim))
        reshaped_means_expanded = reshaped_means.expand(
            batch_size, *reshaped_means.shape)

        # Target computed by the NumPy implementation with unit variances.
        target_np = G.mlpg(
            means.numpy(), np.ones(static_dim * len(windows)), windows)
        y = Variable(torch.from_numpy(target_np), requires_grad=False)
        y_expanded = y.expand(batch_size, y.size(0), y.size(1))

        # Wrap everything as leaves that accumulate gradients.
        means = Variable(means, requires_grad=True)
        means_expanded = Variable(means_expanded, requires_grad=True)
        reshaped_means = Variable(reshaped_means, requires_grad=True)
        reshaped_means_expanded = Variable(
            reshaped_means_expanded, requires_grad=True)

        R = torch.from_numpy(G.unit_variance_mlpg_matrix(windows, T))

        # Case 1 (2-D) and case 2 (3-D) with reshaped means.
        y_hat1 = AF.unit_variance_mlpg(R, reshaped_means)
        y_hat2 = AF.unit_variance_mlpg(R, reshaped_means_expanded)
        out1 = y_hat1.data.numpy()
        for item in range(batch_size):
            assert np.allclose(out1, y_hat2[item].data.numpy())

        mse(y_hat1, y).backward()
        mse(y_hat2, y_expanded).backward()

        # Gradient of every batch item must equal the 2-D gradient.
        grad_2d = reshaped_means.grad.data.numpy()
        for item in range(batch_size):
            grad_item = reshaped_means_expanded.grad[item].data.numpy()
            assert np.allclose(grad_2d, grad_item)

        # Case 3 (2-D) and case 4 (3-D) with non-reshaped means.
        y_hat3 = AF.unit_variance_mlpg(R, means)
        y_hat4 = AF.unit_variance_mlpg(R, means_expanded)
        out3 = y_hat3.data.numpy()
        assert np.allclose(y_hat1.data.numpy(), out3)
        for item in range(batch_size):
            assert np.allclose(out3, y_hat4[item].data.numpy())

        mse(y_hat3, y).backward()
        mse(y_hat4, y_expanded).backward()

        # Same gradient consistency check for the non-reshaped layout.
        grad_2d = means.grad.data.numpy()
        for item in range(batch_size):
            grad_item = means_expanded.grad[item].data.numpy()
            assert np.allclose(grad_2d, grad_item)
def train_loop(models, optimizers, dataset_loaders,
               w_d=0.0, mse_w=0.0, mge_w=1.0,
               update_d=True, update_g=True,
               reference_discriminator=None):
    """Train a generator/discriminator pair over the remaining epochs.

    Every epoch runs a "train" and a "test" phase over ``dataset_loaders``.
    Per batch the generator is applied, then (depending on the flags) the
    discriminator and the generator are updated through
    ``update_discriminator`` / ``update_generator``. Averaged losses,
    distortion metrics and classification rates are logged with
    ``log_value`` and checkpoints are written every ``checkpoint_interval``
    epochs.

    Args:
        models: Pair ``(model_g, model_d)``.
        optimizers: Pair ``(optimizer_g, optimizer_d)``.
        dataset_loaders: Mapping with "train" and "test" loaders, each
            yielding ``(x, y, lengths)`` batches. The "train" loader's
            dataset must expose ``Y_data_mean`` and ``Y_data_std``.
        w_d: Scale applied to the adversarial loss weight.
        mse_w: MSE weight, forwarded to ``update_generator`` and used in the
            running loss expectation.
        mge_w: MGE weight, used in the same two places as ``mse_w``.
        update_d: Whether to update (and log/checkpoint) the discriminator.
        update_g: Whether to update (and log/checkpoint) the generator.
        reference_discriminator: Optional model used only to measure how
            often generated features are classified as natural.

    Returns:
        Always ``0``.

    Note:
        Reads the module-level names ``use_cuda``, ``hp``,
        ``checkpoint_interval`` and ``checkpoint_dir``, and advances the
        module-level ``global_epoch`` counter.
    """
    model_g, model_d = models
    optimizer_g, optimizer_d = optimizers
    if use_cuda:
        model_g, model_d = model_g.cuda(), model_d.cuda()
    if reference_discriminator is not None:
        if use_cuda:
            reference_discriminator = reference_discriminator.cuda()
        # The reference model is only queried, never trained, so switch it
        # to eval mode regardless of the device it runs on.
        reference_discriminator.eval()
    model_g.train()
    model_d.train()

    Y_data_mean = dataset_loaders["train"].dataset.Y_data_mean
    Y_data_std = dataset_loaders["train"].dataset.Y_data_std
    Y_data_mean = torch.from_numpy(Y_data_mean)
    Y_data_std = torch.from_numpy(Y_data_std)
    if use_cuda:
        Y_data_mean = Y_data_mean.cuda()
        Y_data_std = Y_data_std.cuda()

    # Running expectations used to balance the adversarial loss weight.
    E_loss_mge = 1
    E_loss_adv = 1
    global global_epoch
    for global_epoch in tqdm(range(global_epoch + 1, hp.nepoch + 1)):
        # LR schedule
        if hp.lr_decay_schedule and update_g:
            optimizer_g = exp_lr_scheduler(
                optimizer_g, global_epoch - 1, hp.nepoch,
                init_lr=hp.optimizer_g_params["lr"],
                lr_decay_epoch=hp.lr_decay_epoch)
        if hp.lr_decay_schedule and update_d:
            optimizer_d = exp_lr_scheduler(
                optimizer_d, global_epoch - 1, hp.nepoch,
                init_lr=hp.optimizer_d_params["lr"],
                lr_decay_epoch=hp.lr_decay_epoch)

        for phase in ["train", "test"]:
            running_loss = {
                "generator": 0.0, "mse": 0.0, "mge": 0.0,
                "loss_real_d": 0.0, "loss_fake_d": 0.0,
                "loss_adv": 0.0, "discriminator": 0.0
            }
            running_metrics = {}
            real_correct_count, fake_correct_count = 0, 0
            regard_fake_as_natural = 0
            N = len(dataset_loaders[phase])
            total_num_frames = 0
            for x, y, lengths in dataset_loaders[phase]:
                # Sort by lengths. This is needed for pytorch's
                # PackedSequence
                sorted_lengths, indices = torch.sort(
                    lengths.view(-1), dim=0, descending=True)
                sorted_lengths = sorted_lengths.long()
                max_len = sorted_lengths[0]

                # Get sorted batch
                x, y = x[indices], y[indices]

                # MLPG paramgen matrix
                # TODO: create this only if it's needed
                R = unit_variance_mlpg_matrix(hp.windows, max_len)
                R = torch.from_numpy(R)

                if use_cuda:
                    x, y, R = x.cuda(), y.cuda(), R.cuda()
                    sorted_lengths = sorted_lengths.cuda()

                # Pack into variables
                x, y = Variable(x), Variable(y)
                sorted_lengths = Variable(sorted_lengths)

                # Static features
                y_static = get_static_features(
                    y, len(hp.windows), hp.stream_sizes,
                    hp.has_dynamic_features)

                # Num frames in batch
                total_num_frames += sorted_lengths.float().sum().data[0]

                # Mask
                mask = sequence_mask(sorted_lengths).unsqueeze(-1)

                # Reset optimizers state
                optimizer_g.zero_grad()
                optimizer_d.zero_grad()

                # Apply model (generator)
                y_hat, y_hat_static = apply_generator(x, R, sorted_lengths)

                # Compute spoofing rate
                if reference_discriminator is not None:
                    if hp.adversarial_streams is not None:
                        y_hat_static_ref = get_selected_static_stream(
                            y_hat_static)
                    else:
                        y_hat_static_ref = y_hat_static
                    target = reference_discriminator(
                        y_hat_static_ref, lengths=sorted_lengths)
                    # Count samples classified as natural, while inputs are
                    # actually generated.
                    regard_fake_as_natural += (
                        (target > 0.5).float() * mask).sum().data[0]

                ### Update discriminator ###
                # Natural: 1, Generated: 0
                if update_d:
                    loss_d, loss_fake_d, loss_real_d, _real_correct_count,\
                        _fake_correct_count = update_discriminator(
                            model_d, optimizer_d, y_static, y_hat_static,
                            sorted_lengths, mask, phase)
                    running_loss["discriminator"] += loss_d
                    running_loss["loss_fake_d"] += loss_fake_d
                    running_loss["loss_real_d"] += loss_real_d
                    real_correct_count += _real_correct_count
                    fake_correct_count += _fake_correct_count

                ### Update generator ###
                if update_g:
                    adv_w = w_d * float(
                        np.clip(E_loss_mge / E_loss_adv, 0, 1e+3))
                    # update generator $step times for adversarial training
                    # TODO: configurable
                    step = 2 if update_d and phase == "train" else 1
                    while True:
                        loss_mse, loss_mge, loss_adv, loss_g = \
                            update_generator(
                                model_g, model_d, optimizer_g, y, y_hat,
                                y_static, y_hat_static, adv_w,
                                sorted_lengths, mask, phase,
                                mse_w=mse_w, mge_w=mge_w)
                        step -= 1
                        if step <= 0:
                            break
                        # Update outputs
                        y_hat, y_hat_static = apply_generator(
                            x, R, sorted_lengths)

                    running_loss["mse"] += loss_mse
                    running_loss["mge"] += loss_mge
                    running_loss["loss_adv"] += loss_adv
                    running_loss["generator"] += loss_g

                # Distortions
                distortions = compute_distortions(
                    y_static.data, y_hat_static.data,
                    Y_data_mean, Y_data_std, sorted_lengths.data)
                for k, v in distortions.items():
                    try:
                        running_metrics[k] += float(v)
                    except KeyError:
                        running_metrics[k] = float(v)

            # Update expectation
            # NOTE: E_loss_mge is not exactly same as E[L_mge(y,y_hat)]
            # in their papers, since we add MSE term in the loss.
            # It will be same if mse_w = 0 and mge_w = 1.
            if update_d and update_g and phase == "train":
                E_loss_mge = (mse_w * running_loss["mse"] +
                              mge_w * running_loss["mge"]) / N
                E_loss_adv = running_loss["loss_adv"] / N
                log_value("E(mge)", E_loss_mge, global_epoch)
                log_value("E(adv)", E_loss_adv, global_epoch)
                log_value("MGE/ADV loss weight",
                          E_loss_mge / E_loss_adv, global_epoch)

            # Log loss
            for ty, enabled in [("mse", update_g),
                                ("mge", update_g),
                                ("discriminator", update_d),
                                ("loss_real_d", update_d),
                                ("loss_fake_d", update_d),
                                ("loss_adv", update_g and update_d),
                                ("generator", update_g)]:
                if enabled:
                    ave_loss = running_loss[ty] / N
                    log_value("{} {} loss".format(phase, ty),
                              ave_loss, global_epoch)

            # Log eval metrics
            for k, v in running_metrics.items():
                log_value("{} {} metric".format(phase, k),
                          v / N, global_epoch)

            # Log discriminator classification accuracy
            if update_d:
                log_value("Real {} acc".format(phase),
                          real_correct_count / total_num_frames,
                          global_epoch)
                log_value("Fake {} acc".format(phase),
                          fake_correct_count / total_num_frames,
                          global_epoch)

            # Log spoofing rate for generated features by reference model
            if reference_discriminator is not None:
                log_value("{} spoofing rate".format(phase),
                          regard_fake_as_natural / total_num_frames,
                          global_epoch)

        # Save checkpoints
        if global_epoch % checkpoint_interval == 0:
            for model, optimizer, enabled, name in [
                    (model_g, optimizer_g, update_g, "Generator"),
                    (model_d, optimizer_d, update_d, "Discriminator")]:
                if enabled:
                    save_checkpoint(
                        model, optimizer, global_epoch, checkpoint_dir, name)

    return 0
def benchmark_mlpg(static_dim=59, T=100, batch_size=10, use_cuda=True):
    """Time ``AF.mlpg`` against ``AF.unit_variance_mlpg`` and print the ratio.

    Both variants run ``batch_size`` forward/backward iterations on the same
    random means with unit variances. The first variant is timed on CPU; the
    second is timed including the construction of the MLPG matrix and, when
    ``use_cuda`` is set, the transfers to the GPU.

    Args:
        static_dim: Number of static feature dimensions.
        T: Number of frames.
        batch_size: Number of timed iterations per variant.
        use_cuda: Run the unit-variance variant on CUDA. The function
            returns immediately when CUDA is requested but unavailable.
    """
    if use_cuda and not torch.cuda.is_available():
        return

    windows = _get_windows_set()[-1]
    feature_dim = static_dim * len(windows)

    np.random.seed(1234)
    torch.manual_seed(1234)
    means_np = np.random.rand(T, feature_dim).astype(np.float32)
    variances = np.ones(feature_dim)
    reshaped_np = G.reshape_means(means_np, static_dim)

    # Pseudo target
    target_np = G.mlpg(means_np, variances, windows).astype(np.float32)

    # Pack into variables
    means = Variable(torch.from_numpy(means_np), requires_grad=True)
    reshaped_means = Variable(
        torch.from_numpy(reshaped_np), requires_grad=True)
    y = Variable(torch.from_numpy(target_np), requires_grad=False)
    criterion = nn.MSELoss()

    # Case 1: MLPG
    started = time.time()
    for _ in range(batch_size):
        y_hat = AF.mlpg(means, torch.from_numpy(variances), windows)
        loss = criterion(y_hat, y)
        assert np.allclose(y_hat.data.numpy(), y.data.numpy())
        loss.backward()  # slow!
    elapsed_mlpg = time.time() - started

    # Case 2: UnitVarianceMLPG
    started = time.time()
    if use_cuda:
        y = y.cuda()
    # Assuming minibatches are zero-padded, we only need to create the MLPG
    # matrix per-minibatch, not per-utterance.
    R = torch.from_numpy(G.unit_variance_mlpg_matrix(windows, T))
    if use_cuda:
        R = R.cuda()
    for _ in range(batch_size):
        if use_cuda:
            # Round trip through the CPU on every iteration.
            means = means.cpu().cuda()

        y_hat = AF.unit_variance_mlpg(R, means)
        loss = criterion(y_hat, y)
        assert np.allclose(
            y_hat.cpu().data.numpy(), y.cpu().data.numpy(), atol=1e-5)
        loss.backward()
    elapsed_unit_variance_mlpg = time.time() - started

    ratio = elapsed_mlpg / elapsed_unit_variance_mlpg

    settings = (static_dim, T, batch_size, use_cuda)
    print(
        "MLPG vs UnitVarianceMLPG (static_dim, T, batch_size, use_cuda) = ({}):"
        .format(settings))

    # Colour the verdict line depending on which variant was faster.
    verdict, colour = ("faster", OKGREEN) if ratio > 1 else ("slower", FAIL)
    sys.stdout.write(colour)
    print(
        "UnitVarianceMLPG, {:4f} times {}. Elapsed times {:4f} / {:4f}".format(
            ratio, verdict, elapsed_mlpg, elapsed_unit_variance_mlpg))

    print(ENDC)
def test_vc_from_path(model, x, fs, data_mean, data_std, diffvc=True):
    """Convert the waveform ``x`` with ``model`` and synthesize the result.

    The waveform is analysed with pyworld/pysptk into mel-cepstrum plus delta
    features, normalized, run through the model as a batch of one, converted
    to static features, denormalized and synthesized.

    Args:
        model: Network to apply. If ``model.include_parameter_generation()``
            is true it is called with the MLPG matrix and returns the static
            features itself; otherwise ``multi_stream_mlpg`` is applied to
            its output.
        x: Input waveform (converted to float64 here).
        fs: Sampling rate; together with ``hp.frame_period`` it determines
            the hop length.
        data_mean: Mean used by ``P.scale`` / ``P.inv_scale``.
        data_std: Standard deviation used by ``P.scale`` / ``P.inv_scale``.
        diffvc: If true, synthesize by filtering ``x`` with the difference
            between predicted and input features; otherwise synthesize with
            ``pyworld.synthesize``.

    Returns:
        Tuple ``(waveform, inputs, outputs)`` where ``inputs`` are the static
        input features and ``outputs`` the denormalized predicted ones.
    """
    model.eval()

    hop_length = int(fs * (hp.frame_period * 0.001))
    x = x.astype(np.float64)

    # Analysis: F0, spectral envelope and aperiodicity.
    f0, timeaxis = pyworld.dio(x, fs, frame_period=hp.frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)

    # Mel-cepstrum; the 0-th coefficient is kept aside and re-attached later.
    alpha = pysptk.util.mcepalpha(fs)
    mcep = pysptk.sp2mc(spectrogram, order=hp.order, alpha=alpha)
    c0 = mcep[:, 0]
    mcep = mcep[:, 1:]
    static_dim = mcep.shape[-1]
    mcep = P.modspec_smoothing(mcep, fs / hop_length, cutoff=50)
    feats = P.delta_features(mcep, hp.windows).astype(np.float32)
    T = feats.shape[0]

    inputs = feats[:, :static_dim].copy()

    # Normalize, then add a leading batch axis of size one.
    scaled = Variable(torch.from_numpy(P.scale(feats, data_mean, data_std)))
    lengths = [len(scaled)]
    scaled = scaled.view(1, -1, scaled.size(-1))

    # Matrix for MLPG
    R = torch.from_numpy(unit_variance_mlpg_matrix(hp.windows, T))

    if model.include_parameter_generation():
        # The model performs parameter generation itself.
        y_hat, y_hat_static = model(scaled, R, lengths=lengths)
    else:
        # Generic model: run MLPG per stream on its output.
        assert hp.has_dynamic_features is not None
        y_hat = model(scaled, lengths=lengths)
        y_hat_static = multi_stream_mlpg(
            y_hat, R, hp.stream_sizes, hp.has_dynamic_features)

    predicted = y_hat_static.data.cpu().numpy().reshape(-1, static_dim)

    # Undo the normalization on the static part only.
    predicted = P.inv_scale(
        predicted, data_mean[:static_dim], data_std[:static_dim])
    outputs = predicted.copy()

    if diffvc:
        # Differential features filtered onto the original waveform.
        differential = predicted - feats[:, :static_dim]
        mc = np.hstack((c0[:, None], differential))
        mc[:, 0] = 0  # remove power coefficients
        engine = Synthesizer(
            MLSADF(order=hp.order, alpha=alpha), hopsize=hop_length)
        b = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
        waveform = engine.synthesis(x, b)
        return waveform, inputs, outputs

    # Vocoder re-synthesis from the predicted spectral envelope.
    mc = np.hstack((c0[:, None], predicted))
    fftlen = pyworld.get_cheaptrick_fft_size(fs)
    spectrogram = pysptk.mc2sp(
        mc.astype(np.float64), alpha=alpha, fftlen=fftlen)
    waveform = pyworld.synthesize(
        f0, spectrogram, aperiodicity, fs, hp.frame_period)
    return waveform, inputs, outputs
def test_vc_from_path(model, x, fs, data_mean, data_std, diffvc=True):
    """Apply a voice-conversion model to a waveform and synthesize audio.

    Steps: pyworld/pysptk analysis, delta features, normalization, model
    inference on a single-item batch, MLPG (inside the model or through
    ``multi_stream_mlpg``), denormalization and waveform synthesis.

    Args:
        model: Network to apply; ``include_parameter_generation()`` selects
            how it is called.
        x: Input waveform (converted to float64 here).
        fs: Sampling rate of ``x``.
        data_mean: Mean passed to ``P.scale`` and ``P.inv_scale``.
        data_std: Standard deviation passed to the same two functions.
        diffvc: Selects differential filtering of ``x`` (true) or
            ``pyworld.synthesize`` (false) for the final synthesis.

    Returns:
        ``(waveform, inputs, outputs)``: the synthesized waveform, the static
        input features and the denormalized predicted static features.
    """
    model.eval()

    frame_seconds = hp.frame_period * 0.001
    hop_length = int(fs * frame_seconds)
    x = x.astype(np.float64)

    # --- Feature extraction -------------------------------------------
    f0, timeaxis = pyworld.dio(x, fs, frame_period=hp.frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)
    alpha = pysptk.util.mcepalpha(fs)
    cepstrum = pysptk.sp2mc(spectrogram, order=hp.order, alpha=alpha)
    c0, cepstrum = cepstrum[:, 0], cepstrum[:, 1:]
    static_dim = cepstrum.shape[-1]
    smoothed = P.modspec_smoothing(cepstrum, fs / hop_length, cutoff=50)
    features = P.delta_features(smoothed, hp.windows).astype(np.float32)
    T = features.shape[0]
    static_inputs = features[:, :static_dim]

    inputs = static_inputs.copy()

    # --- Model inference ----------------------------------------------
    normalized = P.scale(features, data_mean, data_std)
    net_in = Variable(torch.from_numpy(normalized))
    lengths = [len(net_in)]
    # Add batch axis
    net_in = net_in.view(1, -1, net_in.size(-1))

    R = unit_variance_mlpg_matrix(hp.windows, T)
    R = torch.from_numpy(R)

    if not model.include_parameter_generation():
        # Generic model (possibly a sequence model): MLPG applied afterwards.
        assert hp.has_dynamic_features is not None
        y_hat = model(net_in, lengths=lengths)
        y_hat_static = multi_stream_mlpg(
            y_hat, R, hp.stream_sizes, hp.has_dynamic_features)
    else:
        # Model with built-in parameter generation.
        y_hat, y_hat_static = model(net_in, R, lengths=lengths)

    static_pred = y_hat_static.data.cpu().numpy().reshape(-1, static_dim)

    # Denormalize
    static_pred = P.inv_scale(
        static_pred, data_mean[:static_dim], data_std[:static_dim])

    outputs = static_pred.copy()

    # --- Synthesis ----------------------------------------------------
    if diffvc:
        static_pred = static_pred - static_inputs

    mc = np.hstack((c0[:, None], static_pred))
    mc_f64_alpha = dict(alpha=alpha)
    if not diffvc:
        fftlen = pyworld.get_cheaptrick_fft_size(fs)
        spectrogram = pysptk.mc2sp(
            mc.astype(np.float64), fftlen=fftlen, **mc_f64_alpha)
        waveform = pyworld.synthesize(
            f0, spectrogram, aperiodicity, fs, hp.frame_period)
    else:
        mc[:, 0] = 0  # remove power coefficients
        engine = Synthesizer(
            MLSADF(order=hp.order, alpha=alpha), hopsize=hop_length)
        b = pysptk.mc2b(mc.astype(np.float64), **mc_f64_alpha)
        waveform = engine.synthesis(x, b)

    return waveform, inputs, outputs