Example #1
def gen_waveform(labels, acoustic_features, acoustic_out_scaler,
        binary_dict, continuous_dict, stream_sizes, has_dynamic_features,
        subphone_features="coarse_coding", log_f0_conditioning=True, pitch_idx=None,
        num_windows=3, post_filter=True, sample_rate=48000, frame_period=5,
        relative_f0=True):

    windows = get_windows(num_windows)

    # Apply MLPG if necessary
    if np.any(has_dynamic_features):
        acoustic_features = multi_stream_mlpg(
            acoustic_features, acoustic_out_scaler.var_, windows, stream_sizes,
            has_dynamic_features)
        static_stream_sizes = get_static_stream_sizes(
            stream_sizes, has_dynamic_features, len(windows))
    else:
        static_stream_sizes = stream_sizes

    # Split multi-stream features
    mgc, target_f0, vuv, bap = split_streams(acoustic_features, static_stream_sizes)

    # Generate waveform with the WORLD vocoder
    fftlen = pyworld.get_cheaptrick_fft_size(sample_rate)
    alpha = pysptk.util.mcepalpha(sample_rate)

    if post_filter:
        mgc = merlin_post_filter(mgc, alpha)

    spectrogram = pysptk.mc2sp(mgc, fftlen=fftlen, alpha=alpha)
    aperiodicity = pyworld.decode_aperiodicity(bap.astype(np.float64), sample_rate, fftlen)


    ### F0 ###
    if relative_f0:
        diff_lf0 = target_f0
        # need to extract pitch sequence from the musical score
        linguistic_features = fe.linguistic_features(labels,
                                                    binary_dict, continuous_dict,
                                                    add_frame_features=True,
                                                    subphone_features=subphone_features)
        f0_score = _midi_to_hz(linguistic_features, pitch_idx, False)[:, None]
        lf0_score = f0_score.copy()
        nonzero_indices = np.nonzero(lf0_score)
        lf0_score[nonzero_indices] = np.log(f0_score[nonzero_indices])
        lf0_score = interp1d(lf0_score, kind="slinear")

        f0 = diff_lf0 + lf0_score
        f0[vuv < 0.5] = 0
        f0[np.nonzero(f0)] = np.exp(f0[np.nonzero(f0)])
    else:
        f0 = target_f0

    generated_waveform = pyworld.synthesize(f0.flatten().astype(np.float64),
                                            spectrogram.astype(np.float64),
                                            aperiodicity.astype(np.float64),
                                            sample_rate, frame_period)

    return generated_waveform
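The relative-F0 branch above adds the predicted log-F0 residual to the interpolated log-F0 derived from the score, zeroes unvoiced frames, and exponentiates back to Hz. A minimal, illustrative numpy sketch of that arithmetic with made-up values (not code from the project):

import numpy as np

lf0_score = np.log(np.array([220.0, 220.0, 246.9]))  # score-derived pitch in log-Hz
diff_lf0 = np.array([0.02, -0.01, 0.0])              # predicted residual log-F0
vuv = np.array([0.9, 0.8, 0.2])                      # voiced/unvoiced probabilities

f0 = diff_lf0 + lf0_score
f0[vuv < 0.5] = 0                                    # silence unvoiced frames
f0[np.nonzero(f0)] = np.exp(f0[np.nonzero(f0)])      # back to Hz; zeros stay zero
# f0 is now roughly [224.4, 217.8, 0.0]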
Example #2
File: gen.py Project: r9y9/nnsvs
def gen_spsvs_static_features(
    labels,
    acoustic_features,
    binary_dict,
    numeric_dict,
    stream_sizes,
    has_dynamic_features,
    subphone_features="coarse_coding",
    pitch_idx=None,
    num_windows=3,
    frame_period=5,
    relative_f0=True,
    vibrato_scale=1.0,
    vuv_threshold=0.3,
    force_fix_vuv=True,
):
    """Generate static features from predicted acoustic features

    Args:
        labels (HTSLabelFile): HTS labels
        acoustic_features (ndarray): predicted acoustic features
        binary_dict (dict): binary feature dictionary
        numeric_dict (dict): numeric feature dictionary
        stream_sizes (list): stream sizes
        has_dynamic_features (list): whether each stream has dynamic features
        subphone_features (str): subphone feature type
        pitch_idx (int): index of pitch features
        num_windows (int): number of windows
        frame_period (float): frame period
        relative_f0 (bool): whether to use relative f0
        vibrato_scale (float): vibrato scale
        vuv_threshold (float): vuv threshold
        force_fix_vuv (bool): whether to use post-processing to fix VUV.

    Returns:
        tuple: tuple of mgc, lf0, vuv and bap.
    """
    if np.any(has_dynamic_features):
        static_stream_sizes = get_static_stream_sizes(
            stream_sizes, has_dynamic_features, num_windows
        )
    else:
        static_stream_sizes = stream_sizes

    # Copy here to avoid inplace operations on input acoustic features
    acoustic_features = acoustic_features.copy()

    # Split multi-stream features
    streams = split_streams(acoustic_features, static_stream_sizes)

    if len(streams) == 4:
        mgc, target_f0, vuv, bap = streams
        vib, vib_flags = None, None
    elif len(streams) == 5:
        # Assuming diff-based vibrato parameters
        mgc, target_f0, vuv, bap, vib = streams
        vib_flags = None
    elif len(streams) == 6:
        # Assuming sine-based vibrato parameters
        mgc, target_f0, vuv, bap, vib, vib_flags = streams
    else:
        raise RuntimeError("Unsupported number of streams")

    linguistic_features = fe.linguistic_features(
        labels,
        binary_dict,
        numeric_dict,
        add_frame_features=True,
        subphone_features=subphone_features,
    )

    # Correct V/UV based on special phone flags
    if force_fix_vuv:
        vuv = correct_vuv_by_phone(vuv, binary_dict, linguistic_features)

    # F0
    if relative_f0:
        diff_lf0 = target_f0
        f0_score = _midi_to_hz(linguistic_features, pitch_idx, False)[:, None]
        lf0_score = f0_score.copy()
        nonzero_indices = np.nonzero(lf0_score)
        lf0_score[nonzero_indices] = np.log(f0_score[nonzero_indices])
        lf0_score = interp1d(lf0_score, kind="slinear")

        f0 = diff_lf0 + lf0_score
        f0[vuv < vuv_threshold] = 0
        f0[np.nonzero(f0)] = np.exp(f0[np.nonzero(f0)])
    else:
        f0 = target_f0
        f0[vuv < vuv_threshold] = 0
        f0[np.nonzero(f0)] = np.exp(f0[np.nonzero(f0)])

    if vib is not None:
        if vib_flags is not None:
            # Generate sine-based vibrato
            vib_flags = vib_flags.flatten()
            m_a, m_f = vib[:, 0], vib[:, 1]

            # Fill zeros for non-vibrato frames
            m_a[vib_flags < 0.5] = 0
            m_f[vib_flags < 0.5] = 0

            # Gen vibrato
            sr_f0 = int(1 / (frame_period * 0.001))
            f0 = gen_sine_vibrato(f0.flatten(), sr_f0, m_a, m_f, vibrato_scale)
        else:
            # Generate diff-based vibrato
            f0 = f0.flatten() + vibrato_scale * vib.flatten()

    # NOTE: Back to log-domain for convenience
    lf0 = f0.copy()
    lf0[np.nonzero(lf0)] = np.log(f0[np.nonzero(lf0)])
    # NOTE: interpolation is necessary
    lf0 = interp1d(lf0, kind="slinear")

    lf0 = lf0[:, None] if len(lf0.shape) == 1 else lf0
    vuv = vuv[:, None] if len(vuv.shape) == 1 else vuv

    return mgc, lf0, vuv, bap
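gen_spsvs_static_features relies on split_streams to carve the stacked feature matrix into per-stream blocks. A minimal sketch of what that splitting amounts to, with assumed stream sizes (an illustration only, not the library implementation):

import numpy as np

def split_streams_sketch(feats, stream_sizes):
    # Slice a (T, D) matrix into consecutive (T, size) blocks.
    out, start = [], 0
    for size in stream_sizes:
        out.append(feats[:, start:start + size])
        start += size
    return out

feats = np.zeros((100, 67))
mgc, lf0, vuv, bap = split_streams_sketch(feats, [60, 1, 1, 5])  # assumed static sizes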
Example #3
File: svs.py Project: r9y9/nnsvs
    def svs(
        self,
        labels,
        vocoder_type="world",
        post_filter_type="merlin",
        trajectory_smoothing=True,
        trajectory_smoothing_cutoff=50,
        vuv_threshold=0.1,
        vibrato_scale=1.0,
        return_states=False,
        force_fix_vuv=True,
        post_filter=None,
    ):
        """Synthesize waveform given HTS-style labels

        Args:
            labels (nnmnkwii.io.HTSLabelFile): HTS-style labels
            vocoder_type (str): Vocoder type. One of "world" or "pwg".
            post_filter_type (str): Post-filter type. One of "merlin", "nnsvs", "gv" or "none".

        Returns:
            tuple: (synthesized waveform, sampling rate)
        """
        vocoder_type = vocoder_type.lower()
        if vocoder_type not in ["world", "pwg"]:
            raise ValueError(f"Unknown vocoder type: {vocoder_type}")
        if post_filter_type not in ["merlin", "nnsvs", "gv", "none"]:
            raise ValueError(f"Unknown post-filter type: {post_filter_type}")

        if vocoder_type == "pwg" and self.vocoder is None:
            raise ValueError("""Pre-trained vocoder model is not found.
Only WORLD is supported for waveform generation.""")
        if post_filter is not None:
            warn("post_filter is deprecated. Use post_filter_type instead.")
            post_filter_type = "merlin" if post_filter else "none"

        # Time-lag
        lag = predict_timelag(
            self.device,
            labels,
            self.timelag_model,
            self.timelag_config,
            self.timelag_in_scaler,
            self.timelag_out_scaler,
            self.binary_dict,
            self.numeric_dict,
            self.pitch_indices,
            self.config.log_f0_conditioning,
            self.config.timelag.allowed_range,
            self.config.timelag.allowed_range_rest,
            self.config.timelag.force_clip_input_features,
        )
        # Duration predictions
        durations = predict_duration(
            self.device,
            labels,
            self.duration_model,
            self.duration_config,
            self.duration_in_scaler,
            self.duration_out_scaler,
            self.binary_dict,
            self.numeric_dict,
            self.pitch_indices,
            self.config.log_f0_conditioning,
            self.config.duration.force_clip_input_features,
        )

        # Normalize phoneme durations
        duration_modified_labels = postprocess_duration(labels, durations, lag)

        # Predict acoustic features
        acoustic_features = predict_acoustic(
            self.device,
            duration_modified_labels,
            self.acoustic_model,
            self.acoustic_config,
            self.acoustic_in_scaler,
            self.acoustic_out_scaler,
            self.binary_dict,
            self.numeric_dict,
            self.config.acoustic.subphone_features,
            self.pitch_indices,
            self.config.log_f0_conditioning,
            self.config.acoustic.force_clip_input_features,
        )

        # Apply GV post-filtering
        if post_filter_type in ["nnsvs", "gv"]:
            static_stream_sizes = get_static_stream_sizes(
                self.acoustic_config.stream_sizes,
                self.acoustic_config.has_dynamic_features,
                self.acoustic_config.num_windows,
            )
            mgc_end_dim = static_stream_sizes[0]
            acoustic_features[:, :mgc_end_dim] = variance_scaling(
                self.postfilter_out_scaler.var_.reshape(-1)[:mgc_end_dim],
                acoustic_features[:, :mgc_end_dim],
                offset=2,
            )
            # bap
            bap_start_dim = sum(static_stream_sizes[:3])
            bap_end_dim = sum(static_stream_sizes[:4])
            acoustic_features[:, bap_start_dim:bap_end_dim] = variance_scaling(
                self.postfilter_out_scaler.var_.reshape(-1)[bap_start_dim:bap_end_dim],
                acoustic_features[:, bap_start_dim:bap_end_dim],
                offset=0,
            )

        # Learned post-filter using nnsvs
        if post_filter_type == "nnsvs" and self.postfilter_model is not None:
            in_feats = torch.from_numpy(acoustic_features).float().unsqueeze(0)
            in_feats = (
                self.postfilter_out_scaler.transform(in_feats).float().to(
                    self.device))
            out_feats = self.postfilter_model.inference(
                in_feats, [in_feats.shape[1]])
            acoustic_features = (self.postfilter_out_scaler.inverse_transform(
                out_feats.cpu()).squeeze(0).numpy())

        # Generate WORLD parameters
        mgc, lf0, vuv, bap = gen_spsvs_static_features(
            duration_modified_labels,
            acoustic_features,
            self.binary_dict,
            self.numeric_dict,
            self.acoustic_config.stream_sizes,
            self.acoustic_config.has_dynamic_features,
            self.config.acoustic.subphone_features,
            self.pitch_idx,
            self.acoustic_config.num_windows,
            self.config.frame_period,
            self.config.acoustic.relative_f0,
            vibrato_scale=vibrato_scale,
            vuv_threshold=vuv_threshold,
            force_fix_vuv=force_fix_vuv,
        )

        # NOTE: spectral enhancement based on Merlin's post-filter implementation
        if post_filter_type == "merlin":
            alpha = pysptk.util.mcepalpha(self.config.sample_rate)
            mgc = merlin_post_filter(mgc, alpha)

        # Remove high-frequency components of mgc/bap
        # NOTE: This seems effective at suppressing artifacts of GAN-based post-filtering
        if trajectory_smoothing:
            modfs = int(1 / 0.005)
            for d in range(mgc.shape[1]):
                mgc[:, d] = lowpass_filter(mgc[:, d],
                                           modfs,
                                           cutoff=trajectory_smoothing_cutoff)
            for d in range(bap.shape[1]):
                bap[:, d] = lowpass_filter(bap[:, d],
                                           modfs,
                                           cutoff=trajectory_smoothing_cutoff)

        # Waveform generation by (1) WORLD or (2) neural vocoder
        if vocoder_type == "world":
            f0, spectrogram, aperiodicity = gen_world_params(
                mgc,
                lf0,
                vuv,
                bap,
                self.config.sample_rate,
                vuv_threshold=vuv_threshold)

            wav = pyworld.synthesize(
                f0,
                spectrogram,
                aperiodicity,
                self.config.sample_rate,
                self.config.frame_period,
            )
        elif vocoder_type == "pwg":
            # NOTE: So far vocoder models are trained on binary V/UV features
            vuv = (vuv > vuv_threshold).astype(np.float32)
            voc_inp = (torch.from_numpy(
                self.vocoder_in_scaler.transform(
                    np.concatenate([mgc, lf0, vuv, bap],
                                   axis=-1))).float().to(self.device))
            wav = self.vocoder.inference(voc_inp).view(-1).to("cpu").numpy()

        wav = self.post_process(wav)

        if return_states:
            states = {
                "mgc": mgc,
                "lf0": lf0,
                "vuv": vuv,
                "bap": bap,
            }
            if vocoder_type == "world":
                states.update({
                    "f0": f0,
                    "spectrogram": spectrogram,
                    "aperiodicity": aperiodicity,
                })

            return wav, self.config.sample_rate, states

        return wav, self.config.sample_rate
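A hypothetical call site for the svs method above, assuming engine is an already-constructed instance of the surrounding class and labels is an HTS-style label file loaded elsewhere (both names are placeholders):

# wav is the synthesized waveform, sr the sampling rate from the model config
wav, sr = engine.svs(labels, vocoder_type="world", post_filter_type="merlin")

# with return_states=True, intermediate WORLD parameters are returned as well
wav, sr, states = engine.svs(labels, return_states=True)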
Example #4
def my_app(config: DictConfig) -> None:
    # NOTE: set discriminator's in_dim automatically
    if config.model.netD.in_dim is None:
        if config.train.adv_use_static_feats_only:
            stream_sizes = get_static_stream_sizes(
                config.model.stream_sizes,
                config.model.has_dynamic_features,
                config.model.num_windows,
            )
        else:
            stream_sizes = np.asarray(config.model.stream_sizes)
        D_in_dim = int((stream_sizes * np.asarray(config.train.adv_streams)).sum())
        if config.train.mask_nth_mgc_for_adv_loss > 0:
            D_in_dim -= config.train.mask_nth_mgc_for_adv_loss
        config.model.netD.in_dim = D_in_dim

    if "max_time_frames" in config.data and config.data.max_time_frames > 0:
        collate_fn = partial(
            collate_fn_random_segments, max_time_frames=config.data.max_time_frames
        )
    else:
        collate_fn = collate_fn_default

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    (
        (netG, optG, schedulerG),
        (netD, optD, schedulerD),
        grad_scaler,
        data_loaders,
        writer,
        logger,
        in_scaler,
        out_scaler,
    ) = setup_gan(config, device, collate_fn)

    check_resf0_config(logger, netG, config, in_scaler, out_scaler)

    out_scaler = PyTorchStandardScaler(
        torch.from_numpy(out_scaler.mean_), torch.from_numpy(out_scaler.scale_)
    ).to(device)
    use_mlflow = config.mlflow.enabled

    if use_mlflow:
        with mlflow.start_run() as run:
            # NOTE: modify out_dir when running with mlflow
            config.train.out_dir = f"{config.train.out_dir}/{run.info.run_id}"
            save_configs(config)
            log_params_from_omegaconf_dict(config)
            last_dev_loss = train_loop(
                config,
                logger,
                device,
                netG,
                optG,
                schedulerG,
                netD,
                optD,
                schedulerD,
                grad_scaler,
                data_loaders,
                writer,
                in_scaler,
                out_scaler,
                use_mlflow,
            )
    else:
        save_configs(config)
        last_dev_loss = train_loop(
            config,
            logger,
            device,
            netG,
            optG,
            schedulerG,
            netD,
            optD,
            schedulerD,
            grad_scaler,
            data_loaders,
            writer,
            in_scaler,
            out_scaler,
            use_mlflow,
        )

    return last_dev_loss
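The automatic in_dim computation at the top of my_app sums the stream sizes selected for the adversarial loss and subtracts the masked mgc coefficients. A worked example with assumed config values:

import numpy as np

stream_sizes = np.asarray([60, 1, 1, 5])   # e.g. static mgc/lf0/vuv/bap
adv_streams = np.asarray([1, 0, 0, 0])     # use only the mgc stream for the adversarial loss
D_in_dim = int((stream_sizes * adv_streams).sum())  # 60
mask_nth_mgc_for_adv_loss = 2
D_in_dim -= mask_nth_mgc_for_adv_loss      # 58: the first two mgc coefficients are excluded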
Example #5
def gen_waveform(labels,
                 acoustic_features,
                 binary_dict,
                 continuous_dict,
                 stream_sizes,
                 has_dynamic_features,
                 subphone_features="coarse_coding",
                 log_f0_conditioning=True,
                 pitch_idx=None,
                 num_windows=3,
                 post_filter=True,
                 sample_rate=48000,
                 frame_period=5,
                 relative_f0=True):
    windows = get_windows(num_windows)

    # Apply MLPG if necessary
    if np.any(has_dynamic_features):
        static_stream_sizes = get_static_stream_sizes(stream_sizes,
                                                      has_dynamic_features,
                                                      len(windows))
    else:
        static_stream_sizes = stream_sizes

    # Split multi-stream features
    mgc, target_f0, vuv, bap = split_streams(acoustic_features,
                                             static_stream_sizes)

    # Generate waveform with the WORLD vocoder
    fftlen = pyworld.get_cheaptrick_fft_size(sample_rate)
    alpha = pysptk.util.mcepalpha(sample_rate)

    if post_filter:
        mgc = merlin_post_filter(mgc, alpha)

    spectrogram = pysptk.mc2sp(mgc, fftlen=fftlen, alpha=alpha)
    aperiodicity = pyworld.decode_aperiodicity(bap.astype(np.float64),
                                               sample_rate, fftlen)

    # fill aperiodicity with ones for unvoiced regions
    aperiodicity[vuv.reshape(-1) < 0.5, :] = 1.0
    # WORLD fails catastrophically for out of range aperiodicity
    aperiodicity = np.clip(aperiodicity, 0.0, 1.0)

    ### F0 ###
    if relative_f0:
        diff_lf0 = target_f0
        # need to extract pitch sequence from the musical score
        linguistic_features = fe.linguistic_features(
            labels,
            binary_dict,
            continuous_dict,
            add_frame_features=True,
            subphone_features=subphone_features)
        f0_score = _midi_to_hz(linguistic_features, pitch_idx, False)[:, None]
        lf0_score = f0_score.copy()
        nonzero_indices = np.nonzero(lf0_score)
        lf0_score[nonzero_indices] = np.log(f0_score[nonzero_indices])
        lf0_score = interp1d(lf0_score, kind="slinear")

        f0 = diff_lf0 + lf0_score
        f0[vuv < 0.5] = 0
        f0[np.nonzero(f0)] = np.exp(f0[np.nonzero(f0)])
    else:
        f0 = target_f0
        f0[vuv < 0.5] = 0
        f0[np.nonzero(f0)] = np.exp(f0[np.nonzero(f0)])

    generated_waveform = pyworld.synthesize(f0.flatten().astype(np.float64),
                                            spectrogram.astype(np.float64),
                                            aperiodicity.astype(np.float64),
                                            sample_rate, frame_period)

    # Reduce the volume to prevent clipping
    # TODO: pick a better scaling constant here
    spectrogram *= 0.000000001
    sp = pyworld.code_spectral_envelope(spectrogram, sample_rate, 60)

    return f0, sp, bap, generated_waveform
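This variant also sanitizes the decoded aperiodicity before synthesis: unvoiced frames are forced to fully aperiodic and values are clipped into [0, 1]. A tiny numpy-only sketch of that fix with toy shapes (illustration only):

import numpy as np

aperiodicity = np.random.rand(4, 3)             # (T, bands), stand-in for decoded aperiodicity
vuv = np.array([0.9, 0.1, 0.7, 0.2])            # per-frame voiced probability

aperiodicity[vuv.reshape(-1) < 0.5, :] = 1.0    # unvoiced frames -> fully aperiodic
aperiodicity = np.clip(aperiodicity, 0.0, 1.0)  # keep WORLD's expected range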
Example #6
def eval_spss_model(
    step,
    netG,
    in_feats,
    out_feats,
    lengths,
    model_config,
    out_scaler,
    writer,
    sr,
    trajectory_smoothing=True,
    trajectory_smoothing_cutoff=50,
):
    # make sure to be in eval mode
    netG.eval()
    is_autoregressive = (netG.module.is_autoregressive() if isinstance(
        netG, nn.DataParallel) else netG.is_autoregressive())
    prediction_type = (netG.module.prediction_type() if isinstance(
        netG, nn.DataParallel) else netG.prediction_type())
    utt_indices = [-1, -2, -3]
    utt_indices = utt_indices[:min(3, len(in_feats))]

    if np.any(model_config.has_dynamic_features):
        static_stream_sizes = get_static_stream_sizes(
            model_config.stream_sizes,
            model_config.has_dynamic_features,
            model_config.num_windows,
        )
    else:
        static_stream_sizes = model_config.stream_sizes

    for utt_idx in utt_indices:
        out_feats_denorm_ = out_scaler.inverse_transform(
            out_feats[utt_idx, :lengths[utt_idx]].unsqueeze(0))
        mgc, lf0, vuv, bap = get_static_features(
            out_feats_denorm_,
            model_config.num_windows,
            model_config.stream_sizes,
            model_config.has_dynamic_features,
        )[:4]
        mgc = mgc.squeeze(0).cpu().numpy()
        lf0 = lf0.squeeze(0).cpu().numpy()
        vuv = vuv.squeeze(0).cpu().numpy()
        bap = bap.squeeze(0).cpu().numpy()

        f0, spectrogram, aperiodicity = gen_world_params(
            mgc, lf0, vuv, bap, sr)
        wav = pyworld.synthesize(f0, spectrogram, aperiodicity, sr, 5)
        group = f"utt{np.abs(utt_idx)}_reference"
        wav = wav / np.abs(wav).max() if np.abs(wav).max() > 1.0 else wav
        writer.add_audio(group, wav, step, sr)

        # Run forward
        if is_autoregressive:
            outs = netG(
                in_feats[utt_idx, :lengths[utt_idx]].unsqueeze(0),
                [lengths[utt_idx]],
                out_feats[utt_idx, :lengths[utt_idx]].unsqueeze(0),
            )
        else:
            outs = netG(in_feats[utt_idx, :lengths[utt_idx]].unsqueeze(0),
                        [lengths[utt_idx]])

        # ResF0 case
        if isinstance(outs, tuple) and len(outs) == 2:
            outs, _ = outs

        if prediction_type == PredictionType.PROBABILISTIC:
            pi, sigma, mu = outs
            pred_out_feats = mdn_get_most_probable_sigma_and_mu(pi, sigma,
                                                                mu)[1]
        else:
            pred_out_feats = outs
        # NOTE: multiple outputs
        if isinstance(pred_out_feats, list):
            pred_out_feats = pred_out_feats[-1]
        if isinstance(pred_out_feats, tuple):
            pred_out_feats = pred_out_feats[0]

        if not isinstance(pred_out_feats, list):
            pred_out_feats = [pred_out_feats]

        # Run inference
        if prediction_type == PredictionType.PROBABILISTIC:
            inference_out_feats, _ = netG.inference(
                in_feats[utt_idx, :lengths[utt_idx]].unsqueeze(0),
                [lengths[utt_idx]])
        else:
            inference_out_feats = netG.inference(
                in_feats[utt_idx, :lengths[utt_idx]].unsqueeze(0),
                [lengths[utt_idx]])
        pred_out_feats.append(inference_out_feats)

        # Plot normalized input/output
        in_feats_ = in_feats[utt_idx, :lengths[utt_idx]].cpu().numpy()
        out_feats_ = out_feats[utt_idx, :lengths[utt_idx]].cpu().numpy()
        fig, ax = plt.subplots(3, 1, figsize=(8, 8))
        ax[0].set_title("Reference features")
        ax[1].set_title("Input features")
        ax[2].set_title("Predicted features")
        mesh = librosa.display.specshow(out_feats_.T,
                                        x_axis="frames",
                                        y_axis="frames",
                                        ax=ax[0],
                                        cmap="viridis")
        # NOTE: assuming normalized to N(0, 1)
        mesh.set_clim(-4, 4)
        fig.colorbar(mesh, ax=ax[0])
        mesh = librosa.display.specshow(in_feats_.T,
                                        x_axis="frames",
                                        y_axis="frames",
                                        ax=ax[1],
                                        cmap="viridis")
        mesh.set_clim(-4, 4)
        fig.colorbar(mesh, ax=ax[1])
        mesh = librosa.display.specshow(
            inference_out_feats.squeeze(0).cpu().numpy().T,
            x_axis="frames",
            y_axis="frames",
            ax=ax[2],
            cmap="viridis",
        )
        mesh.set_clim(-4, 4)
        fig.colorbar(mesh, ax=ax[2])
        for ax_ in ax:
            ax_.set_ylabel("Feature")
        plt.tight_layout()
        group = f"utt{np.abs(utt_idx)}_inference"
        writer.add_figure(f"{group}/Input-Output", fig, step)
        plt.close()

        assert len(pred_out_feats) == 2
        for idx, pred_out_feats_ in enumerate(pred_out_feats):
            pred_out_feats_ = pred_out_feats_.squeeze(0).cpu().numpy()
            pred_out_feats_denorm = (out_scaler.inverse_transform(
                torch.from_numpy(pred_out_feats_).to(
                    in_feats.device)).cpu().numpy())
            if np.any(model_config.has_dynamic_features):
                # (T, D_out) -> (T, static_dim)
                pred_out_feats_denorm = multi_stream_mlpg(
                    pred_out_feats_denorm,
                    (out_scaler.scale_**2).cpu().numpy(),
                    get_windows(model_config.num_windows),
                    model_config.stream_sizes,
                    model_config.has_dynamic_features,
                )
            pred_mgc, pred_lf0, pred_vuv, pred_bap = split_streams(
                pred_out_feats_denorm, static_stream_sizes)[:4]

            # Remove high-frequency components of mgc/bap
            # NOTE: This seems effective at suppressing artifacts of GAN-based post-filtering
            if trajectory_smoothing:
                modfs = int(1 / 0.005)
                for d in range(pred_mgc.shape[1]):
                    pred_mgc[:, d] = lowpass_filter(
                        pred_mgc[:, d],
                        modfs,
                        cutoff=trajectory_smoothing_cutoff)
                for d in range(pred_bap.shape[1]):
                    pred_bap[:, d] = lowpass_filter(
                        pred_bap[:, d],
                        modfs,
                        cutoff=trajectory_smoothing_cutoff)

            # Generated sample
            f0, spectrogram, aperiodicity = gen_world_params(
                pred_mgc, pred_lf0, pred_vuv, pred_bap, sr)
            wav = pyworld.synthesize(f0, spectrogram, aperiodicity, sr, 5)
            wav = wav / np.abs(wav).max() if np.abs(wav).max() > 1.0 else wav
            if idx == 1:
                group = f"utt{np.abs(utt_idx)}_inference"
            else:
                group = f"utt{np.abs(utt_idx)}_forward"
            writer.add_audio(group, wav, step, sr)
            plot_spsvs_params(
                step,
                writer,
                mgc,
                lf0,
                vuv,
                bap,
                pred_mgc,
                pred_lf0,
                pred_vuv,
                pred_bap,
                group=group,
                sr=sr,
            )
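The is_autoregressive / prediction_type calls at the top of eval_spss_model unwrap nn.DataParallel before touching custom model methods. A small generic helper sketch of that pattern (not part of the project code):

import torch.nn as nn

def unwrap(model):
    # Return the wrapped module if model is a DataParallel container, otherwise the model itself.
    return model.module if isinstance(model, nn.DataParallel) else model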
Example #7
def my_app(config: DictConfig) -> None:
    global logger
    logger = getLogger(config.verbose)
    logger.info(OmegaConf.to_yaml(config))

    device = torch.device("cuda" if use_cuda else "cpu")
    utt_list = to_absolute_path(config.utt_list)
    in_dir = to_absolute_path(config.in_dir)
    out_dir = to_absolute_path(config.out_dir)

    utt_ids = load_utt_list(utt_list)

    os.makedirs(out_dir, exist_ok=True)

    model_config = OmegaConf.load(to_absolute_path(config.model.model_yaml))
    model = hydra.utils.instantiate(model_config.netG).to(device)
    checkpoint = torch.load(
        to_absolute_path(config.model.checkpoint),
        map_location=lambda storage, loc: storage,
    )
    model.load_state_dict(checkpoint["state_dict"])
    model.eval()

    out_scaler = joblib.load(to_absolute_path(config.out_scaler_path))

    mean_ = get_static_features(
        out_scaler.mean_.reshape(1, 1, out_scaler.mean_.shape[-1]),
        model_config.num_windows,
        model_config.stream_sizes,
        model_config.has_dynamic_features,
    )
    mean_ = np.concatenate(mean_, -1).reshape(1, -1)
    var_ = get_static_features(
        out_scaler.var_.reshape(1, 1, out_scaler.var_.shape[-1]),
        model_config.num_windows,
        model_config.stream_sizes,
        model_config.has_dynamic_features,
    )
    var_ = np.concatenate(var_, -1).reshape(1, -1)
    scale_ = get_static_features(
        out_scaler.scale_.reshape(1, 1, out_scaler.scale_.shape[-1]),
        model_config.num_windows,
        model_config.stream_sizes,
        model_config.has_dynamic_features,
    )
    scale_ = np.concatenate(scale_, -1).reshape(1, -1)
    static_scaler = StandardScaler(mean_, var_, scale_)

    static_stream_sizes = get_static_stream_sizes(
        model_config.stream_sizes,
        model_config.has_dynamic_features,
        model_config.num_windows,
    )

    for utt_id in tqdm(utt_ids):
        in_feats = (torch.from_numpy(
            np.load(join(in_dir,
                         utt_id + "-feats.npy"))).unsqueeze(0).to(device))
        static_feats = _gen_static_features(model, model_config, in_feats,
                                            out_scaler)

        mgc_end_dim = static_stream_sizes[0]
        bap_start_dim = sum(static_stream_sizes[:3])
        bap_end_dim = sum(static_stream_sizes[:4])

        if config.gv_postfilter:
            # mgc
            static_feats[:, :mgc_end_dim] = variance_scaling(
                static_scaler.var_.reshape(-1)[:mgc_end_dim],
                static_feats[:, :mgc_end_dim],
                offset=config.mgc_offset,
            )
            # bap
            static_feats[:, bap_start_dim:bap_end_dim] = variance_scaling(
                static_scaler.var_.reshape(-1)[bap_start_dim:bap_end_dim],
                static_feats[:, bap_start_dim:bap_end_dim],
                offset=config.bap_offset,
            )

        if config.normalize:
            static_feats = static_scaler.transform(static_feats)
        out_path = join(out_dir, f"{utt_id}-feats.npy")
        np.save(out_path, static_feats.astype(np.float32), allow_pickle=False)
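Both this script and Example #3 apply variance_scaling as a GV-style post-filter to the mgc and bap blocks. A rough, generic sketch of the idea, rescaling deviations from the per-utterance mean toward a reference variance while skipping the first offset dimensions; this illustrates the concept only and is not the nnsvs variance_scaling implementation:

import numpy as np

def gv_scaling_sketch(ref_var, feats, offset=2):
    # feats: (T, D) static features; ref_var: (D,) reference (e.g. training-data) variances.
    out = feats.copy()
    mu = feats.mean(axis=0, keepdims=True)
    utt_var = np.maximum(feats.var(axis=0), 1e-8)
    scale = np.sqrt(ref_var / utt_var)
    out[:, offset:] = mu[:, offset:] + scale[offset:] * (feats[:, offset:] - mu[:, offset:])
    return out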