Ejemplo n.º 1
0
def inv_scale(mgc, lf0, vuv, bap, Y_mean, Y_std, binalize_vuv=True):
    # static + dynamic domain
    mgc_dim, lf0_dim, vuv_dim, bap_dim = hp.stream_sizes
    windows = hp.windows

    mgc_start_idx = 0
    lf0_start_idx = mgc_dim
    vuv_start_idx = lf0_start_idx + lf0_dim
    bap_start_idx = vuv_start_idx + vuv_dim

    mgc = P.inv_scale(mgc, Y_mean[:mgc_dim // len(windows)],
                      Y_std[:mgc_dim // len(windows)])
    lf0 = P.inv_scale(
        lf0, Y_mean[lf0_start_idx:lf0_start_idx + lf0_dim // len(windows)],
        Y_std[lf0_start_idx:lf0_start_idx + lf0_dim // len(windows)])
    bap = P.inv_scale(
        bap, Y_mean[bap_start_idx:bap_start_idx + bap_dim // len(windows)],
        Y_std[bap_start_idx:bap_start_idx + bap_dim // len(windows)])
    vuv = P.inv_scale(vuv, Y_mean[vuv_start_idx], Y_std[vuv_start_idx])
    if binalize_vuv:
        vuv[vuv > 0.5] = 1.0
        vuv[vuv <= 0.5] = 0
        vuv = vuv.long()

    return mgc, lf0, vuv, bap
Ejemplo n.º 2
0
def gen_parameters(y_predicted, Y_mean, Y_std):
    """Generate static acoustic parameters from a network prediction.

    MLPG is applied to the *normalized* features with unit variances,
    so denormalization happens after MLPG (MGE-training convention).

    Args:
        y_predicted: 2D array, frames x (static+dynamic) feature dim.
        Y_mean, Y_std: dicts of output statistics keyed by model type;
            only the ``"acoustic"`` entry is used here.

    Returns:
        Tuple ``(mgc, lf0, vuv, bap)`` of denormalized static features.
    """
    mgc_dim, lf0_dim, vuv_dim, bap_dim = hp_acoustic.stream_sizes

    # Stream start offsets within the concatenated feature vector.
    mgc_start_idx = 0
    lf0_start_idx = mgc_dim
    vuv_start_idx = lf0_start_idx + lf0_dim
    bap_start_idx = vuv_start_idx + vuv_dim

    windows = hp_acoustic.windows

    # Split acoustic features
    mgc = y_predicted[:, :lf0_start_idx]
    lf0 = y_predicted[:, lf0_start_idx:vuv_start_idx]
    vuv = y_predicted[:, vuv_start_idx]
    bap = y_predicted[:, bap_start_idx:]

    # Perform MLPG on normalized features (unit variance per dim)
    mgc = paramgen.mlpg(mgc, np.ones(mgc.shape[-1]), windows)
    lf0 = paramgen.mlpg(lf0, np.ones(lf0.shape[-1]), windows)
    bap = paramgen.mlpg(bap, np.ones(bap.shape[-1]), windows)

    ty = "acoustic"
    # When we use MGE training, denormalization should be done after MLPG.
    # dim // len(windows) == static dimensionality of each stream.
    mgc = P.inv_scale(mgc, Y_mean[ty][:mgc_dim // len(windows)],
                      Y_std[ty][:mgc_dim // len(windows)])
    lf0 = P.inv_scale(
        lf0, Y_mean[ty][lf0_start_idx:lf0_start_idx + lf0_dim // len(windows)],
        Y_std[ty][lf0_start_idx:lf0_start_idx + lf0_dim // len(windows)])
    bap = P.inv_scale(
        bap, Y_mean[ty][bap_start_idx:bap_start_idx + bap_dim // len(windows)],
        Y_std[ty][bap_start_idx:bap_start_idx + bap_dim // len(windows)])
    # vuv is a scalar per frame; index (not slice) its statistics.
    vuv = P.inv_scale(vuv, Y_mean[ty][vuv_start_idx], Y_std[ty][vuv_start_idx])

    return mgc, lf0, vuv, bap
Ejemplo n.º 3
0
def compute_distortions(y_static,
                        y_hat_static,
                        Y_data_mean,
                        Y_data_std,
                        lengths=None):
    """Compute objective distortion metrics between target and prediction.

    The set of metrics depends on the model type (``hp.name``):
    ``"acoustic"`` -> MCD, BAP-MCD, F0 RMSE, V/UV error;
    ``"duration"`` -> duration RMSE; ``"vc"`` -> MCD.

    Args:
        y_static: reference static features (normalized).
        y_hat_static: predicted static features (normalized).
        Y_data_mean, Y_data_std: statistics used for denormalization.
        lengths: optional per-utterance lengths for masked metrics.

    Returns:
        Dict mapping metric name to scalar value.
    """
    if hp.name == "acoustic":
        mgc, lf0, vuv, bap = split_streams(y_static, Y_data_mean, Y_data_std)
        mgc_hat, lf0_hat, vuv_hat, bap_hat = split_streams(
            y_hat_static, Y_data_mean, Y_data_std)
        try:
            f0_mse = metrics.lf0_mean_squared_error(lf0,
                                                    vuv,
                                                    lf0_hat,
                                                    vuv_hat,
                                                    lengths=lengths,
                                                    linear_domain=True)
        except ZeroDivisionError:
            # No voiced frames to compare; report NaN rather than crash.
            f0_mse = np.nan

        distortions = {
            # Drop the 0th (energy) coefficient for MCD.
            "mcd": metrics.melcd(mgc[:, :, 1:],
                                 mgc_hat[:, :, 1:],
                                 lengths=lengths),
            # NOTE(review): /10.0 scaling — presumably a unit convention
            # for band aperiodicity; confirm against the metrics module.
            "bap_mcd": metrics.melcd(bap, bap_hat, lengths=lengths) / 10.0,
            "f0_rmse": np.sqrt(f0_mse),
            "vuv_err": metrics.vuv_error(vuv, vuv_hat, lengths=lengths),
        }
    elif hp.name == "duration":
        y_static_invscale = P.inv_scale(y_static, Y_data_mean, Y_data_std)
        y_hat_static_invscale = P.inv_scale(y_hat_static, Y_data_mean,
                                            Y_data_std)
        distortions = {
            "dur_rmse":
            math.sqrt(
                metrics.mean_squared_error(y_static_invscale,
                                           y_hat_static_invscale,
                                           lengths=lengths))
        }
    elif hp.name == "vc":
        # For VC only the first hp.order static coefficients are scored.
        static_dim = hp.order
        y_static_invscale = P.inv_scale(y_static, Y_data_mean[:static_dim],
                                        Y_data_std[:static_dim])
        y_hat_static_invscale = P.inv_scale(y_hat_static,
                                            Y_data_mean[:static_dim],
                                            Y_data_std[:static_dim])
        distortions = {
            "mcd":
            metrics.melcd(y_static_invscale,
                          y_hat_static_invscale,
                          lengths=lengths)
        }
    else:
        # Unknown model type: fail loudly.
        assert False

    return distortions
Ejemplo n.º 4
0
    def gen_parameters(self, y_predicted, mge_training=False):
        """Generate static acoustic parameters from a network prediction.

        Two modes:
          * ``mge_training=True``: MLPG on normalized features with unit
            variances, then per-stream denormalization of the static
            parts (MGE convention: denormalize after MLPG).
          * ``mge_training=False``: denormalize first, then MLPG with
            the true output variances (``self.std ** 2``).

        Args:
            y_predicted: 2D array, frames x (static+dynamic) dim.
            mge_training: selects the ordering described above.

        Returns:
            Tuple ``(mgc, lf0, vuv, bap)``.
        """
        mgc_dim, lf0_dim, vuv_dim, bap_dim = stream_sizes
        # Stream start offsets within the concatenated feature vector.
        mgc_start_idx = 0
        lf0_start_idx = mgc_dim
        vuv_start_idx = lf0_start_idx + lf0_dim
        bap_start_idx = vuv_start_idx + vuv_dim
        # MGE training
        if mge_training:
            # Split acoustic features
            mgc = y_predicted[:, :lf0_start_idx]
            lf0 = y_predicted[:, lf0_start_idx:vuv_start_idx]
            vuv = y_predicted[:, vuv_start_idx]
            bap = y_predicted[:, bap_start_idx:]

            # Perform MLPG on normalized features (unit variances)
            mgc = paramgen.mlpg(mgc, np.ones(mgc.shape[-1]), windows)
            lf0 = paramgen.mlpg(lf0, np.ones(lf0.shape[-1]), windows)
            bap = paramgen.mlpg(bap, np.ones(bap.shape[-1]), windows)
            # When we use MGE training, denormalization should be done after MLPG.
            # dim // len(windows) == static dimensionality of each stream.
            mgc = P.inv_scale(mgc, self.mean[:mgc_dim // len(windows)],
                              self.std[:mgc_dim // len(windows)])
            lf0 = P.inv_scale(
                lf0, self.mean[lf0_start_idx:lf0_start_idx +
                               lf0_dim // len(windows)],
                self.std[lf0_start_idx:lf0_start_idx +
                         lf0_dim // len(windows)])
            bap = P.inv_scale(
                bap, self.mean[bap_start_idx:bap_start_idx +
                               bap_dim // len(windows)],
                self.std[bap_start_idx:bap_start_idx +
                         bap_dim // len(windows)])
            # vuv is a scalar per frame; index (not slice) its statistics.
            vuv = P.inv_scale(vuv, self.mean[vuv_start_idx],
                              self.std[vuv_start_idx])
        else:
            # Denormalization first
            y_predicted = P.inv_scale(y_predicted, self.mean, self.std)

            # Split acoustic features
            mgc = y_predicted[:, :lf0_start_idx]
            lf0 = y_predicted[:, lf0_start_idx:vuv_start_idx]
            vuv = y_predicted[:, vuv_start_idx]
            bap = y_predicted[:, bap_start_idx:]

            # Perform MLPG with the true per-dimension variances
            Y_var = self.std * self.std
            mgc = paramgen.mlpg(mgc, Y_var[:lf0_start_idx], windows)
            lf0 = paramgen.mlpg(lf0, Y_var[lf0_start_idx:vuv_start_idx],
                                windows)
            bap = paramgen.mlpg(bap, Y_var[bap_start_idx:], windows)

        return mgc, lf0, vuv, bap
Ejemplo n.º 5
0
def gen_parameters(y_predicted, Y_mean, Y_std, mge_training=True):
    """Generate static acoustic parameters from a network prediction.

    Two orderings are supported:
      * ``mge_training=True``: MLPG on the normalized features with
        unit variances, then denormalize each stream's static part.
      * ``mge_training=False``: denormalize first, then MLPG with the
        true output variances (``Y_std ** 2``).

    Args:
        y_predicted: 2D array, frames x (static+dynamic) feature dim.
        Y_mean, Y_std: output statistics in the static+dynamic domain.
        mge_training: selects the ordering described above.

    Returns:
        Tuple ``(mgc, lf0, vuv, bap)``.
    """
    mgc_dim, lf0_dim, vuv_dim, bap_dim = audio_world_config.stream_sizes
    windows = audio_world_config.windows
    num_windows = len(windows)

    # Stream start offsets within the concatenated feature vector.
    lf0_start_idx = mgc_dim
    vuv_start_idx = lf0_start_idx + lf0_dim
    bap_start_idx = vuv_start_idx + vuv_dim

    if mge_training:
        # Split the normalized prediction into its streams.
        mgc = y_predicted[:, :lf0_start_idx]
        lf0 = y_predicted[:, lf0_start_idx:vuv_start_idx]
        vuv = y_predicted[:, vuv_start_idx]
        bap = y_predicted[:, bap_start_idx:]

        # MLPG on normalized features (unit variance per dimension).
        mgc, lf0, bap = (
            paramgen.mlpg(stream, np.ones(stream.shape[-1]), windows)
            for stream in (mgc, lf0, bap))

        # Denormalize afterwards, using only the static slice of each
        # stream's statistics (dim // num_windows entries).
        mgc = P.inv_scale(mgc, Y_mean[:mgc_dim // num_windows],
                          Y_std[:mgc_dim // num_windows])
        lf0_stop = lf0_start_idx + lf0_dim // num_windows
        lf0 = P.inv_scale(lf0, Y_mean[lf0_start_idx:lf0_stop],
                          Y_std[lf0_start_idx:lf0_stop])
        bap_stop = bap_start_idx + bap_dim // num_windows
        bap = P.inv_scale(bap, Y_mean[bap_start_idx:bap_stop],
                          Y_std[bap_start_idx:bap_stop])
        # vuv is a single scalar per frame.
        vuv = P.inv_scale(vuv, Y_mean[vuv_start_idx], Y_std[vuv_start_idx])
    else:
        # Denormalize the whole vector first.
        y_predicted = P.inv_scale(y_predicted, Y_mean, Y_std)

        mgc = y_predicted[:, :lf0_start_idx]
        lf0 = y_predicted[:, lf0_start_idx:vuv_start_idx]
        vuv = y_predicted[:, vuv_start_idx]
        bap = y_predicted[:, bap_start_idx:]

        # MLPG with the true per-dimension variances.
        variances = Y_std * Y_std
        mgc = paramgen.mlpg(mgc, variances[:lf0_start_idx], windows)
        lf0 = paramgen.mlpg(lf0, variances[lf0_start_idx:vuv_start_idx],
                            windows)
        bap = paramgen.mlpg(bap, variances[bap_start_idx:], windows)
    return mgc, lf0, vuv, bap
Ejemplo n.º 6
0
def gen_parameters(y_predicted, Y_mean, Y_std, mge_training=True):
    """Generate static acoustic parameters from a network prediction.

    Two modes:
      * ``mge_training=True``: MLPG on the normalized features with
        unit variances, then denormalize each stream's static part
        (MGE convention: denormalization after MLPG).
      * ``mge_training=False``: denormalize first, then MLPG with the
        true output variances.

    Args:
        y_predicted: 2D array, frames x (static+dynamic) feature dim.
        Y_mean, Y_std: dicts of output statistics keyed by model type;
            the ``"acoustic"`` entry is used.
        mge_training: selects the ordering described above.

    Returns:
        Tuple ``(mgc, lf0, vuv, bap)``.
    """
    mgc_dim, lf0_dim, vuv_dim, bap_dim = hp_acoustic.stream_sizes

    # Stream start offsets within the concatenated feature vector.
    mgc_start_idx = 0
    lf0_start_idx = mgc_dim
    vuv_start_idx = lf0_start_idx + lf0_dim
    bap_start_idx = vuv_start_idx + vuv_dim

    windows = hp_acoustic.windows

    ty = "acoustic"

    # MGE training
    if mge_training:
        # Split acoustic features
        mgc = y_predicted[:, :lf0_start_idx]
        lf0 = y_predicted[:, lf0_start_idx:vuv_start_idx]
        vuv = y_predicted[:, vuv_start_idx]
        bap = y_predicted[:, bap_start_idx:]

        # Perform MLPG on normalized features (unit variance per dim)
        mgc = paramgen.mlpg(mgc, np.ones(mgc.shape[-1]), windows)
        lf0 = paramgen.mlpg(lf0, np.ones(lf0.shape[-1]), windows)
        bap = paramgen.mlpg(bap, np.ones(bap.shape[-1]), windows)

        # When we use MGE training, denormalization should be done after MLPG.
        # dim // len(windows) == static dimensionality of each stream.
        mgc = P.inv_scale(mgc, Y_mean[ty][:mgc_dim // len(windows)],
                          Y_std[ty][:mgc_dim // len(windows)])
        lf0 = P.inv_scale(lf0, Y_mean[ty][lf0_start_idx:lf0_start_idx + lf0_dim // len(windows)],
                          Y_std[ty][lf0_start_idx:lf0_start_idx + lf0_dim // len(windows)])
        bap = P.inv_scale(bap, Y_mean[ty][bap_start_idx:bap_start_idx + bap_dim // len(windows)],
                          Y_std[ty][bap_start_idx:bap_start_idx + bap_dim // len(windows)])
        vuv = P.inv_scale(vuv, Y_mean[ty][vuv_start_idx], Y_std[ty][vuv_start_idx])
    else:
        # Denormalization first.
        # BUG FIX: Y_mean/Y_std are dicts keyed by model type (every
        # other use in this function indexes [ty], including Y_var
        # below); the original passed the whole dicts to P.inv_scale.
        y_predicted = P.inv_scale(y_predicted, Y_mean[ty], Y_std[ty])

        # Split acoustic features
        mgc = y_predicted[:, :lf0_start_idx]
        lf0 = y_predicted[:, lf0_start_idx:vuv_start_idx]
        vuv = y_predicted[:, vuv_start_idx]
        bap = y_predicted[:, bap_start_idx:]

        # Perform MLPG with the true per-dimension variances
        Y_var = Y_std[ty] * Y_std[ty]
        mgc = paramgen.mlpg(mgc, Y_var[:lf0_start_idx], windows)
        lf0 = paramgen.mlpg(lf0, Y_var[lf0_start_idx:vuv_start_idx], windows)
        bap = paramgen.mlpg(bap, Y_var[bap_start_idx:], windows)

    return mgc, lf0, vuv, bap
Ejemplo n.º 7
0
def test_meanvar():
    """Check P.meanvar / P.meanstd and scale round-trips on acoustic data."""
    # Pick acoustic features for testing
    _, X = example_file_data_sources_for_acoustic_model()
    X = FileSourceDataset(X)
    lengths = [len(x) for x in X]
    feat_dim = X[0].shape[-1]

    X_mean, X_var = P.meanvar(X)
    X_std = np.sqrt(X_var)
    for stat in (X_mean, X_var):
        assert np.isfinite(stat).all()
        assert stat.shape[-1] == feat_dim

    # meanstd must agree with the square root of meanvar's variance.
    _, X_std_hat = P.meanstd(X)
    assert np.allclose(X_std, X_std_hat)

    x_scaled = P.scale(X[0], X_mean, X_std)
    assert np.isfinite(x_scaled).all()

    # Padded dataset with explicit lengths should yield the same stats.
    _, X = example_file_data_sources_for_acoustic_model()
    X = PaddedFileSourceDataset(X, 1000)
    X_mean_hat, X_var_hat = P.meanvar(X, lengths)
    assert np.allclose(X_mean, X_mean_hat)
    assert np.allclose(X_var, X_var_hat)

    # scale followed by inv_scale is (approximately) the identity.
    x = X[0]
    roundtrip = P.inv_scale(P.scale(x, X_mean, X_std), X_mean, X_std)
    assert np.allclose(x, roundtrip, atol=1e-5)
Ejemplo n.º 8
0
def test_vc_from_path(model, path, data_mean, data_std, diffvc=True):
    """Run voice conversion on a wav file and synthesize a waveform.

    Pipeline: WORLD analysis -> mel-cepstrum + deltas -> normalize ->
    model -> denormalize -> (optionally differential) synthesis.

    Args:
        model: trained VC model; must accept ``(features, R)``.
        path: path to the input wav file.
        data_mean, data_std: feature statistics for (de)normalization.
        diffvc: if True, synthesize from the mel-cepstrum *difference*
            with an MLSA filter; otherwise resynthesize with WORLD.

    Returns:
        Tuple ``(waveform, inputs, outputs)`` where inputs/outputs are
        the source and converted static mel-cepstra.

    NOTE(review): ``hop_length`` is not defined in this function —
    presumably a module-level global; verify it matches hp.frame_period.
    """
    model.eval()

    # WORLD analysis: F0, spectral envelope, aperiodicity.
    fs, x = wavfile.read(path)
    x = x.astype(np.float64)
    f0, timeaxis = pyworld.dio(x, fs, frame_period=hp.frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)
    alpha = pysptk.util.mcepalpha(fs)
    mc = pysptk.sp2mc(spectrogram, order=hp.order, alpha=alpha)
    # Keep the power coefficient c0 aside; convert only c1..cN.
    c0, mc = mc[:, 0], mc[:, 1:]
    static_dim = mc.shape[-1]
    mc = P.modspec_smoothing(mc, fs / hop_length, cutoff=50)
    mc = P.delta_features(mc, hp.windows).astype(np.float32)

    T = mc.shape[0]

    inputs = mc[:, :static_dim].copy()

    # Normalization
    mc_scaled = P.scale(mc, data_mean, data_std)

    # Apply model (R: unit-variance MLPG matrix for this length)
    mc_scaled = Variable(torch.from_numpy(mc_scaled))
    R = unit_variance_mlpg_matrix(hp.windows, T)
    R = torch.from_numpy(R)
    y_hat, y_hat_static = model(mc_scaled, R)
    mc_static_pred = y_hat_static.data.cpu().numpy().reshape(-1, static_dim)

    # Denormalize (statistics of the static part only)
    mc_static_pred = P.inv_scale(mc_static_pred, data_mean[:static_dim],
                                 data_std[:static_dim])

    outputs = mc_static_pred.copy()

    if diffvc:
        # Differential VC: model the source->target difference.
        mc_static_pred = mc_static_pred - mc[:, :static_dim]

    mc = np.hstack((c0[:, None], mc_static_pred))
    if diffvc:
        mc[:, 0] = 0  # remove power coefficients
        engine = Synthesizer(MLSADF(order=hp.order, alpha=alpha),
                             hopsize=hop_length)
        b = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
        waveform = engine.synthesis(x, b)
    else:
        # Full resynthesis with the WORLD vocoder.
        fftlen = pyworld.get_cheaptrick_fft_size(fs)
        spectrogram = pysptk.mc2sp(mc.astype(np.float64),
                                   alpha=alpha,
                                   fftlen=fftlen)
        waveform = pyworld.synthesize(f0, spectrogram, aperiodicity, fs,
                                      hp.frame_period)

    return waveform, inputs, outputs
Ejemplo n.º 9
0
def compute_distortions(y_static,
                        y_hat_static,
                        Y_data_mean,
                        Y_data_std,
                        lengths=None):
    """Compute mel-cepstral distortion between target and prediction.

    Only the ``"vc"`` model type is supported; anything else fails.

    Args:
        y_static: reference static features (normalized).
        y_hat_static: predicted static features (normalized).
        Y_data_mean, Y_data_std: statistics used for denormalization.
        lengths: optional per-utterance lengths for masked metrics.

    Returns:
        Dict with a single ``"mcd"`` entry.
    """
    if hp.name != "vc":
        assert False

    # Only the first hp.order static coefficients are scored.
    dim = hp.order
    reference = P.inv_scale(y_static, Y_data_mean[:dim], Y_data_std[:dim])
    predicted = P.inv_scale(y_hat_static, Y_data_mean[:dim],
                            Y_data_std[:dim])
    return {"mcd": metrics.melcd(reference, predicted, lengths=lengths)}
Ejemplo n.º 10
0
def gen_duration(label_path, duration_model, X_min, X_max, Y_mean, Y_std):
    """Predict phone/state durations for an HTS label file.

    Args:
        label_path: path to an HTS full-context label file.
        duration_model: trained duration model (callable as
            ``model(x, lengths)``).
        X_min, X_max: dicts of input min/max statistics keyed by type.
        Y_mean, Y_std: dicts of output statistics keyed by type.

    Returns:
        The loaded HTS labels with predicted durations set.
    """
    # Linguistic features for duration
    hts_labels = hts.load(label_path)
    duration_linguistic_features = fe.linguistic_features(
        hts_labels,
        binary_dict,
        continuous_dict,
        add_frame_features=hp_duration.add_frame_features,
        subphone_features=hp_duration.subphone_features).astype(np.float32)

    # Apply min/max normalization to the linguistic features
    ty = "duration"
    duration_linguistic_features = P.minmax_scale(duration_linguistic_features,
                                                  X_min[ty],
                                                  X_max[ty],
                                                  feature_range=(0.01, 0.99))

    # Switch the model to evaluation mode
    duration_model.eval()

    # Apply model (batch of one sequence)
    x = Variable(torch.from_numpy(duration_linguistic_features)).float()
    xl = len(x)
    x = x.view(1, -1, x.size(-1))
    x = _generator_input(hp_duration, x)
    x = x.cuda() if use_cuda else x
    duration_predicted = duration_model(x, [xl]).data.cpu().numpy()
    duration_predicted = duration_predicted.reshape(
        -1, duration_predicted.shape[-1])

    # Apply denormalization and round to integer frame counts
    duration_predicted = P.inv_scale(duration_predicted, Y_mean[ty], Y_std[ty])
    duration_predicted = np.round(duration_predicted)

    # Set minimum state duration to 1
    duration_predicted[duration_predicted <= 0] = 1
    hts_labels.set_durations(duration_predicted)

    return hts_labels
Ejemplo n.º 11
0
def gen_duration(label_path, duration_model, X_min, X_max, Y_mean, Y_std):
    """Predict phone/state durations for an HTS label file.

    Args:
        label_path: path to an HTS full-context label file.
        duration_model: trained duration model (callable as
            ``model(x, lengths)``).
        X_min, X_max: dicts of input min/max statistics keyed by type.
        Y_mean, Y_std: dicts of output statistics keyed by type.

    Returns:
        The loaded HTS labels with predicted durations set.
    """
    ty = "duration"

    # Linguistic features for the duration model.
    hts_labels = hts.load(label_path)
    features = fe.linguistic_features(
        hts_labels,
        binary_dict, continuous_dict,
        add_frame_features=hp_duration.add_frame_features,
        subphone_features=hp_duration.subphone_features).astype(np.float32)

    # Min/max normalization of the input features.
    features = P.minmax_scale(
        features, X_min[ty], X_max[ty], feature_range=(0.01, 0.99))

    duration_model.eval()

    # Run the model on a batch of one sequence.
    x = Variable(torch.from_numpy(features)).float()
    seq_len = len(x)
    x = _generator_input(hp_duration, x.view(1, -1, x.size(-1)))
    if use_cuda:
        x = x.cuda()
    predicted = duration_model(x, [seq_len]).data.cpu().numpy()
    predicted = predicted.reshape(-1, predicted.shape[-1])

    # Denormalize and round to integer frame counts.
    predicted = np.round(P.inv_scale(predicted, Y_mean[ty], Y_std[ty]))

    # Durations must be at least one frame.
    predicted[predicted <= 0] = 1
    hts_labels.set_durations(predicted)

    return hts_labels
Ejemplo n.º 12
0
def test_vc_from_path(model, x, fs, data_mean, data_std, diffvc=True):
    """Run voice conversion on a raw waveform and synthesize the result.

    Pipeline: WORLD analysis -> mel-cepstrum + deltas -> normalize ->
    model (with MLPG) -> denormalize -> (optionally differential)
    synthesis.

    Args:
        model: trained VC model.
        x: input waveform samples.
        fs: sampling rate in Hz.
        data_mean, data_std: feature statistics for (de)normalization.
        diffvc: if True, synthesize from the mel-cepstrum *difference*
            with an MLSA filter; otherwise resynthesize with WORLD.

    Returns:
        Tuple ``(waveform, inputs, outputs)`` where inputs/outputs are
        the source and converted static mel-cepstra.
    """
    model.eval()

    hop_length = int(fs * (hp.frame_period * 0.001))
    x = x.astype(np.float64)

    # WORLD analysis: F0, spectral envelope, aperiodicity.
    f0, timeaxis = pyworld.dio(x, fs, frame_period=hp.frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)

    # Mel-cepstrum; keep the power coefficient c0 aside.
    alpha = pysptk.util.mcepalpha(fs)
    mc = pysptk.sp2mc(spectrogram, order=hp.order, alpha=alpha)
    c0, mc = mc[:, 0], mc[:, 1:]
    static_dim = mc.shape[-1]
    mc = P.modspec_smoothing(mc, fs / hop_length, cutoff=50)
    mc = P.delta_features(mc, hp.windows).astype(np.float32)
    T = mc.shape[0]

    inputs = mc[:, :static_dim].copy()

    # Normalize and add a batch axis.
    mc_scaled = P.scale(mc, data_mean, data_std)
    mc_scaled = Variable(torch.from_numpy(mc_scaled))
    lengths = [len(mc_scaled)]
    mc_scaled = mc_scaled.view(1, -1, mc_scaled.size(-1))

    # Unit-variance MLPG matrix for this sequence length.
    R = torch.from_numpy(unit_variance_mlpg_matrix(hp.windows, T))

    if model.include_parameter_generation():
        # Models that perform parameter generation internally
        # (multistream features cannot be used in this case).
        y_hat, y_hat_static = model(mc_scaled, R, lengths=lengths)
    else:
        # Generic (possibly sequence) models: MLPG applied outside.
        assert hp.has_dynamic_features is not None
        y_hat = model(mc_scaled, lengths=lengths)
        y_hat_static = multi_stream_mlpg(
            y_hat, R, hp.stream_sizes, hp.has_dynamic_features)

    mc_static_pred = y_hat_static.data.cpu().numpy().reshape(-1, static_dim)

    # Denormalize with the statistics of the static part only.
    mc_static_pred = P.inv_scale(
        mc_static_pred, data_mean[:static_dim], data_std[:static_dim])

    outputs = mc_static_pred.copy()

    if diffvc:
        # Differential VC: model the source->target difference.
        mc_static_pred = mc_static_pred - mc[:, :static_dim]

    mc = np.hstack((c0[:, None], mc_static_pred))
    if diffvc:
        mc[:, 0] = 0  # remove power coefficients
        engine = Synthesizer(MLSADF(order=hp.order, alpha=alpha),
                             hopsize=hop_length)
        b = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
        waveform = engine.synthesis(x, b)
    else:
        # Full resynthesis with the WORLD vocoder.
        fftlen = pyworld.get_cheaptrick_fft_size(fs)
        spectrogram = pysptk.mc2sp(
            mc.astype(np.float64), alpha=alpha, fftlen=fftlen)
        waveform = pyworld.synthesize(
            f0, spectrogram, aperiodicity, fs, hp.frame_period)

    return waveform, inputs, outputs
Ejemplo n.º 13
0
def test_vc_from_path(model, x, fs, data_mean, data_std, diffvc=True):
    """Run voice conversion on a raw waveform and synthesize the result.

    Pipeline: WORLD analysis -> mel-cepstrum + deltas -> normalize ->
    model (with MLPG) -> denormalize -> (optionally differential)
    synthesis.

    Args:
        model: trained VC model.
        x: input waveform samples.
        fs: sampling rate in Hz.
        data_mean, data_std: feature statistics for (de)normalization.
        diffvc: if True, synthesize from the mel-cepstrum *difference*
            with an MLSA filter; otherwise resynthesize with WORLD.

    Returns:
        Tuple ``(waveform, inputs, outputs)`` where inputs/outputs are
        the source and converted static mel-cepstra.
    """
    model.eval()

    hop_length = int(fs * (hp.frame_period * 0.001))
    x = x.astype(np.float64)
    # WORLD analysis: F0, spectral envelope, aperiodicity.
    f0, timeaxis = pyworld.dio(x, fs, frame_period=hp.frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)
    alpha = pysptk.util.mcepalpha(fs)
    mc = pysptk.sp2mc(spectrogram, order=hp.order, alpha=alpha)
    # Keep the power coefficient c0 aside; convert only c1..cN.
    c0, mc = mc[:, 0], mc[:, 1:]
    static_dim = mc.shape[-1]
    mc = P.modspec_smoothing(mc, fs / hop_length, cutoff=50)
    mc = P.delta_features(mc, hp.windows).astype(np.float32)

    T = mc.shape[0]

    inputs = mc[:, :static_dim].copy()

    # Normalization
    mc_scaled = P.scale(mc, data_mean, data_std)

    mc_scaled = Variable(torch.from_numpy(mc_scaled))
    lengths = [len(mc_scaled)]

    # Add batch axis
    mc_scaled = mc_scaled.view(1, -1, mc_scaled.size(-1))

    # For MLPG: unit-variance MLPG matrix for this sequence length
    R = unit_variance_mlpg_matrix(hp.windows, T)
    R = torch.from_numpy(R)

    # Apply model
    if model.include_parameter_generation():
        # Case: models include parameter generation in itself
        # Multistream features cannot be used in this case
        y_hat, y_hat_static = model(mc_scaled, R, lengths=lengths)
    else:
        # Case: generic models (can be sequence model); MLPG outside
        assert hp.has_dynamic_features is not None
        y_hat = model(mc_scaled, lengths=lengths)
        y_hat_static = multi_stream_mlpg(y_hat, R, hp.stream_sizes,
                                         hp.has_dynamic_features)

    mc_static_pred = y_hat_static.data.cpu().numpy().reshape(-1, static_dim)

    # Denormalize (statistics of the static part only)
    mc_static_pred = P.inv_scale(mc_static_pred, data_mean[:static_dim],
                                 data_std[:static_dim])

    outputs = mc_static_pred.copy()

    if diffvc:
        # Differential VC: model the source->target difference.
        mc_static_pred = mc_static_pred - mc[:, :static_dim]

    mc = np.hstack((c0[:, None], mc_static_pred))
    if diffvc:
        mc[:, 0] = 0  # remove power coefficients
        engine = Synthesizer(MLSADF(order=hp.order, alpha=alpha),
                             hopsize=hop_length)
        b = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
        waveform = engine.synthesis(x, b)
    else:
        # Full resynthesis with the WORLD vocoder.
        fftlen = pyworld.get_cheaptrick_fft_size(fs)
        spectrogram = pysptk.mc2sp(mc.astype(np.float64),
                                   alpha=alpha,
                                   fftlen=fftlen)
        waveform = pyworld.synthesize(f0, spectrogram, aperiodicity, fs,
                                      hp.frame_period)

    return waveform, inputs, outputs