Example #1
def gen_waveform(y_predicted,
                 Y_mean,
                 Y_std,
                 post_filter=False,
                 coef=1.4,
                 fs=16000,
                 mge_training=True):
    alpha = pysptk.util.mcepalpha(fs)
    fftlen = pyworld.get_cheaptrick_fft_size(fs)
    frame_period = hp_acoustic.frame_period

    # Generate parameters and split streams
    mgc, lf0, vuv, bap = gen_parameters(y_predicted, Y_mean, Y_std,
                                        mge_training)

    if post_filter:
        mgc = merlin_post_filter(mgc, alpha, coef=coef)

    spectrogram = pysptk.mc2sp(mgc, fftlen=fftlen, alpha=alpha)
    aperiodicity = pyworld.decode_aperiodicity(bap.astype(np.float64), fs,
                                               fftlen)
    f0 = lf0.copy()
    f0[vuv < 0.5] = 0
    f0[np.nonzero(f0)] = np.exp(f0[np.nonzero(f0)])

    generated_waveform = pyworld.synthesize(f0.flatten().astype(np.float64),
                                            spectrogram.astype(np.float64),
                                            aperiodicity.astype(np.float64),
                                            fs, frame_period)
    # Convert range to int16
    generated_waveform = generated_waveform / \
        np.max(np.abs(generated_waveform)) * 32767

    # return features as well, to compare natural and generated speech later
    return generated_waveform, mgc, lf0, vuv, bap
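A minimal usage sketch (not part of the original example): y_pred, Y_mean and Y_std stand in for the outputs and normalization statistics of your own acoustic model, and scipy is used here only to write the result to disk.

# Illustrative only: y_pred, Y_mean and Y_std are placeholders.
from scipy.io import wavfile

waveform, mgc, lf0, vuv, bap = gen_waveform(y_pred, Y_mean, Y_std,
                                            post_filter=True, fs=16000)
# gen_waveform already scales the signal to the int16 range.
wavfile.write("generated.wav", 16000, waveform.astype(np.int16))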
Example #2
    def decode_sp(coded_sp: np.ndarray,
                  sp_type: str = "mcep",
                  fs: int = None,
                  alpha: float = None,
                  mgc_gamma: float = None,
                  n_fft: int = None,
                  post_filtering: bool = False):

        if post_filtering:
            if sp_type in ["mcep", "mgc"]:
                coded_sp = merlin_post_filter(
                    coded_sp, AudioProcessing.fs_to_mgc_alpha(fs))
            else:
                logging.warning(
                    "Post-filtering only implemented for cepstrum features.")

        if sp_type == "mcep":
            return AudioProcessing.mcep_to_amp_sp(coded_sp, fs, alpha)
        elif sp_type == "mgc":
            return AudioProcessing.mgc_to_amp_sp(coded_sp, fs, alpha,
                                                 mgc_gamma, n_fft)
        elif sp_type == "mfbanks":
            return AudioProcessing.mfbanks_to_amp_sp(coded_sp, fs, n_fft)
        elif sp_type == "amp_sp":
            return coded_sp
        else:
            raise NotImplementedError("Unknown feature type {}. No decoding "
                                      "method available.".format(sp_type))
Example #3
def gen_waveform(y_predicted, Y_mean, Y_std, post_filter=False, coef=1.4,
                 fs=16000, mge_training=True):
    alpha = pysptk.util.mcepalpha(fs)
    fftlen = pyworld.get_cheaptrick_fft_size(fs)
    frame_period = hp_acoustic.frame_period

    # Generate parameters and split streams
    mgc, lf0, vuv, bap = gen_parameters(y_predicted, Y_mean, Y_std, mge_training)

    if post_filter:
        mgc = merlin_post_filter(mgc, alpha, coef=coef)

    spectrogram = pysptk.mc2sp(mgc, fftlen=fftlen, alpha=alpha)
    aperiodicity = pyworld.decode_aperiodicity(bap.astype(np.float64), fs, fftlen)
    f0 = lf0.copy()
    f0[vuv < 0.5] = 0
    f0[np.nonzero(f0)] = np.exp(f0[np.nonzero(f0)])

    generated_waveform = pyworld.synthesize(f0.flatten().astype(np.float64),
                                            spectrogram.astype(np.float64),
                                            aperiodicity.astype(np.float64),
                                            fs, frame_period)
    # Convert range to int16
    generated_waveform = generated_waveform / \
        np.max(np.abs(generated_waveform)) * 32767

    # return features as well, to compare natural and generated speech later
    return generated_waveform, mgc, lf0, vuv, bap
Example #4
    def generate(self, parm_var, do_postfilter=True):
        config = self.analysis_config

        for path in self.paths:
            file_id = splitext(basename(path))[0]
            print('Synthesizing %s ... ' % (file_id), end='')
            mgc, lf0, vuv, bap = self._generate_parameters(path, parm_var)

            if do_postfilter:
                mgc = merlin_post_filter(mgc, config.alpha)

            sp = pysptk.mc2sp(mgc,
                              fftlen=config.fft_length,
                              alpha=config.alpha)
            ap = pyworld.decode_aperiodicity(bap.astype(np.float64),
                                             config.sampling_rate,
                                             config.fft_length)
            f0 = self._lf0_to_f0(lf0, vuv)
            generated = pyworld.synthesize(f0.flatten().astype(np.float64),
                                           sp.astype(np.float64),
                                           ap.astype(np.float64),
                                           config.sampling_rate,
                                           config.frame_period)
            with open(join(self.out_dir, file_id + '.wav'), 'wb') as f:
                f.write(Audio(generated, rate=config.sampling_rate).data)
            print('done!')
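The Audio class above looks like IPython.display.Audio (its .data attribute holds WAV-encoded bytes); that is an assumption based on usage, not stated in the snippet. A standalone way to write a float waveform as 16-bit PCM without IPython could look like this, with scipy used purely for illustration:

# Sketch under the assumption that numpy/scipy are acceptable dependencies.
import numpy as np
from scipy.io import wavfile

def save_wav(path, waveform, sampling_rate):
    # Normalize to the int16 range before writing.
    scaled = (waveform / np.max(np.abs(waveform)) * 32767).astype(np.int16)
    wavfile.write(path, sampling_rate, scaled)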
Example #5
def gen_waveform(labels, acoustic_features, acoustic_out_scaler,
        binary_dict, continuous_dict, stream_sizes, has_dynamic_features,
        subphone_features="coarse_coding", log_f0_conditioning=True, pitch_idx=None,
        num_windows=3, post_filter=True, sample_rate=48000, frame_period=5,
        relative_f0=True):

    windows = get_windows(num_windows)

    # Apply MLPG if necessary
    if np.any(has_dynamic_features):
        acoustic_features = multi_stream_mlpg(
            acoustic_features, acoustic_out_scaler.var_, windows, stream_sizes,
            has_dynamic_features)
        static_stream_sizes = get_static_stream_sizes(
            stream_sizes, has_dynamic_features, len(windows))
    else:
        static_stream_sizes = stream_sizes

    # Split multi-stream features
    mgc, target_f0, vuv, bap = split_streams(acoustic_features, static_stream_sizes)

    # Generate the waveform with the WORLD vocoder
    fftlen = pyworld.get_cheaptrick_fft_size(sample_rate)
    alpha = pysptk.util.mcepalpha(sample_rate)

    if post_filter:
        mgc = merlin_post_filter(mgc, alpha)

    spectrogram = pysptk.mc2sp(mgc, fftlen=fftlen, alpha=alpha)
    aperiodicity = pyworld.decode_aperiodicity(bap.astype(np.float64), sample_rate, fftlen)


    ### F0 ###
    if relative_f0:
        diff_lf0 = target_f0
        # need to extract pitch sequence from the musical score
        linguistic_features = fe.linguistic_features(labels,
                                                    binary_dict, continuous_dict,
                                                    add_frame_features=True,
                                                    subphone_features=subphone_features)
        f0_score = _midi_to_hz(linguistic_features, pitch_idx, False)[:, None]
        lf0_score = f0_score.copy()
        nonzero_indices = np.nonzero(lf0_score)
        lf0_score[nonzero_indices] = np.log(f0_score[nonzero_indices])
        lf0_score = interp1d(lf0_score, kind="slinear")

        f0 = diff_lf0 + lf0_score
        f0[vuv < 0.5] = 0
        f0[np.nonzero(f0)] = np.exp(f0[np.nonzero(f0)])
    else:
        f0 = target_f0

    generated_waveform = pyworld.synthesize(f0.flatten().astype(np.float64),
                                            spectrogram.astype(np.float64),
                                            aperiodicity.astype(np.float64),
                                            sample_rate, frame_period)

    return generated_waveform
Example #6
    def run_r9y9wavenet_mulaw_world_feats_synth(synth_output, hparams):

        # If no path is given, use pre-trained model.
        if not hasattr(
                hparams,
                "synth_vocoder_path") or hparams.synth_vocoder_path is None:
            parent_dirs = os.path.realpath(__file__).split(os.sep)
            dir_root = str.join(
                os.sep, parent_dirs[:parent_dirs.index("IdiapTTS") + 1])
            hparams.synth_vocoder_path = os.path.join(
                dir_root, "idiaptts", "misc", "pretrained",
                "r9y9wavenet_quantized_16k_world_feats_English.nn")

        # Default quantization is with mu=255.
        if not hasattr(hparams, "mu") or hparams.mu is None:
            hparams.add_hparam("mu", 255)

        if hasattr(hparams, 'frame_rate_output_Hz'):
            org_frame_rate_output_Hz = hparams.frame_rate_output_Hz
            hparams.frame_rate_output_Hz = 16000
        else:
            org_frame_rate_output_Hz = None
            hparams.add_hparam("frame_rate_output_Hz", 16000)

        synth_output = copy.copy(synth_output)

        if hparams.do_post_filtering:
            for id_name, output in synth_output.items():
                coded_sp, lf0, vuv, bap = WorldFeatLabelGen.convert_to_world_features(
                    output,
                    contains_deltas=False,
                    num_coded_sps=hparams.num_coded_sps)
                coded_sp = merlin_post_filter(
                    coded_sp,
                    WorldFeatLabelGen.fs_to_mgc_alpha(hparams.synth_fs))
                synth_output[
                    id_name] = WorldFeatLabelGen.convert_from_world_features(
                        coded_sp, lf0, vuv, bap)

        if hasattr(hparams, 'bit_depth'):
            org_bit_depth = hparams.bit_depth
            hparams.bit_depth = 16
        else:
            org_bit_depth = None
            hparams.add_hparam("bit_depth", 16)

        Synthesiser.run_wavenet_vocoder(synth_output, hparams)

        # Restore identifier.
        hparams.setattr_no_type_check(
            "bit_depth", org_bit_depth)  # Can be None, thus no type check.
        hparams.setattr_no_type_check("frame_rate_output_Hz",
                                      org_frame_rate_output_Hz)  # Can be None.
Example #7
def test_merlin_post_filter():
    root = join(DATA_DIR, "merlin_post_filter")
    mgc = np.fromfile(join(root, "arctic_b0539.mgc"),
                      dtype=np.float32).reshape(-1, 60)
    weight = np.fromfile(join(root, "weight"), dtype=np.float32)
    alpha = 0.58
    minimum_phase_order = 511
    fftlen = 1024
    coef = 1.4

    # Step 1
    mgc_r0 = np.fromfile(join(root, "arctic_b0539.mgc_r0"), dtype=np.float32)
    mgc_r0_hat = pysptk.c2acr(pysptk.freqt(
        mgc, minimum_phase_order, alpha=-alpha), 0, fftlen).flatten()
    assert np.allclose(mgc_r0, mgc_r0_hat)

    # Step 2
    mgc_p_r0 = np.fromfile(
        join(root, "arctic_b0539.mgc_p_r0"), dtype=np.float32)
    mgc_p_r0_hat = pysptk.c2acr(pysptk.freqt(
        mgc * weight, minimum_phase_order, -alpha), 0, fftlen).flatten()
    assert np.allclose(mgc_p_r0, mgc_p_r0_hat)

    # Step 3
    mgc_b0 = np.fromfile(join(root, "arctic_b0539.mgc_b0"), dtype=np.float32)
    mgc_b0_hat = pysptk.mc2b(weight * mgc, alpha)[:, 0]
    assert np.allclose(mgc_b0, mgc_b0_hat)

    # Step 4
    mgc_p_b0 = np.fromfile(
        join(root, "arctic_b0539.mgc_p_b0"), dtype=np.float32)
    mgc_p_b0_hat = np.log(mgc_r0_hat / mgc_p_r0_hat) / 2 + mgc_b0_hat
    assert np.allclose(mgc_p_b0, mgc_p_b0_hat)

    # Final step
    mgc_p_mgc = np.fromfile(
        join(root, "arctic_b0539.mgc_p_mgc"), dtype=np.float32).reshape(-1, 60)
    mgc_p_mgc_hat = pysptk.b2mc(
        np.hstack((mgc_p_b0_hat[:, None], pysptk.mc2b(mgc * weight, alpha)[:, 1:])), alpha)
    assert np.allclose(mgc_p_mgc, mgc_p_mgc_hat)

    filtered_mgc = merlin_post_filter(mgc, alpha, coef=coef, weight=weight,
                                      minimum_phase_order=minimum_phase_order,
                                      fftlen=fftlen)
    assert np.allclose(filtered_mgc, mgc_p_mgc, atol=1e-6)
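For reference, a typical standalone call of the function under test only needs the mel-cepstrum matrix and the warping constant alpha; the import path is assumed from nnmnkwii's layout and the values mirror the test above.

# mgc is a (num_frames, 60) mel-generalized cepstrum matrix, as in the test.
from nnmnkwii.postfilters import merlin_post_filter
mgc_enhanced = merlin_post_filter(mgc, alpha=0.58, coef=1.4)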
Example #8
def gen_waveform(y_predicted, do_postfilter=False):
    y_predicted = trim_zeros_frames(y_predicted)
    # Generate parameters and split streams
    mgc, lf0, vuv, bap = gen_parameters(y_predicted)
    if do_postfilter:
        mgc = merlin_post_filter(mgc, alpha)
    spectrogram = pysptk.mc2sp(mgc, fftlen=fftlen, alpha=alpha)
    #print(bap.shape)
    aperiodicity = pyworld.decode_aperiodicity(bap.astype(np.float64), fs,
                                               fftlen)
    f0 = lf0.copy()
    f0[vuv < 0.5] = 0
    f0[np.nonzero(f0)] = np.exp(f0[np.nonzero(f0)])
    generated_waveform = pyworld.synthesize(f0.flatten().astype(np.float64),
                                            spectrogram.astype(np.float64),
                                            aperiodicity.astype(np.float64),
                                            fs, frame_period)
    return generated_waveform
Example #9
def world2wav(
        clf0, vuv, cap, fs, fbin,
        mcep=None, sp=None, frame_period=None, mcep_postfilter=False):

    # setup
    frame_period = pyworld.default_frame_period \
        if frame_period is None else frame_period

    clf0 = np.ascontiguousarray(clf0.astype('float64'))
    vuv = np.ascontiguousarray(vuv > 0.5).astype('int')
    cap = np.ascontiguousarray(cap.astype('float64'))
    fft_len = fbin * 2 - 2
    alpha = pysptk.util.mcepalpha(fs)

    # clf0 2 f0
    f0 = np.squeeze(np.exp(clf0)) * np.squeeze(vuv)

    # cap 2 ap
    if cap.ndim != 2:
        cap = np.expand_dims(cap, 1)
    ap = pyworld.decode_aperiodicity(cap, fs, fft_len)

    # mcep 2 sp
    if sp is None:
        if mcep is None:
            raise ValueError

        else:
            mcep = np.ascontiguousarray(mcep.astype('float64'))
            if mcep_postfilter:
                mcep = merlin_post_filter(mcep, alpha)
            sp = pysptk.mgc2sp(mcep, alpha=alpha, fftlen=fft_len)
            sp = np.abs(np.exp(sp)) ** 2
    else:
        sp = np.ascontiguousarray(sp)

    wave = pyworld.synthesize(f0, sp, ap, fs, frame_period=frame_period)

    scale = np.abs(wave).max()
    if scale > 0.99:
        wave = wave / scale * 0.99

    return wave
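An illustrative call of world2wav with all feature arrays treated as placeholders; soundfile is used here only to write the result and is not part of the original snippet.

# clf0/vuv/cap/mcep are per-frame WORLD features from your own model;
# fbin is the number of spectral bins, so fft_len = 2 * fbin - 2 = 1024 here.
import soundfile as sf

wave = world2wav(clf0, vuv, cap, fs=22050, fbin=513,
                 mcep=mcep, mcep_postfilter=True)
sf.write("resynth.wav", wave, 22050)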
Example #10
    def svs(
        self,
        labels,
        vocoder_type="world",
        post_filter_type="merlin",
        trajectory_smoothing=True,
        trajectory_smoothing_cutoff=50,
        vuv_threshold=0.1,
        vibrato_scale=1.0,
        return_states=False,
        force_fix_vuv=True,
        post_filter=None,
    ):
        """Synthesize waveform given HTS-style labels

        Args:
            labels (nnmnkwii.io.HTSLabelFile): HTS-style labels
            vocoder_type (str): Vocoder type. One of "world" or "pwg".
            post_filter_type (str): Post-filter type. One of "merlin",
                "nnsvs", "gv" or "none".

        Returns:
            tuple: (synthesized waveform, sampling rate)
        """
        vocoder_type = vocoder_type.lower()
        if vocoder_type not in ["world", "pwg"]:
            raise ValueError(f"Unknown vocoder type: {vocoder_type}")
        if post_filter_type not in ["merlin", "nnsvs", "gv", "none"]:
            raise ValueError(f"Unknown post-filter type: {post_filter_type}")

        if vocoder_type == "pwg" and self.vocoder is None:
            raise ValueError("""Pre-trained vocodr model is not found.
WORLD is only supported for waveform generation""")
        if post_filter is not None:
            warn("post_filter is deprecated. Use post_filter_type instead.")
            post_filter_type = "merlin" if post_filter else "none"

        # Time-lag
        lag = predict_timelag(
            self.device,
            labels,
            self.timelag_model,
            self.timelag_config,
            self.timelag_in_scaler,
            self.timelag_out_scaler,
            self.binary_dict,
            self.numeric_dict,
            self.pitch_indices,
            self.config.log_f0_conditioning,
            self.config.timelag.allowed_range,
            self.config.timelag.allowed_range_rest,
            self.config.timelag.force_clip_input_features,
        )
        # Duration predictions
        durations = predict_duration(
            self.device,
            labels,
            self.duration_model,
            self.duration_config,
            self.duration_in_scaler,
            self.duration_out_scaler,
            self.binary_dict,
            self.numeric_dict,
            self.pitch_indices,
            self.config.log_f0_conditioning,
            self.config.duration.force_clip_input_features,
        )

        # Normalize phoneme durations
        duration_modified_labels = postprocess_duration(labels, durations, lag)

        # Predict acoustic features
        acoustic_features = predict_acoustic(
            self.device,
            duration_modified_labels,
            self.acoustic_model,
            self.acoustic_config,
            self.acoustic_in_scaler,
            self.acoustic_out_scaler,
            self.binary_dict,
            self.numeric_dict,
            self.config.acoustic.subphone_features,
            self.pitch_indices,
            self.config.log_f0_conditioning,
            self.config.acoustic.force_clip_input_features,
        )

        # Apply GV post-filtering
        if post_filter_type in ["nnsvs", "gv"]:
            static_stream_sizes = get_static_stream_sizes(
                self.acoustic_config.stream_sizes,
                self.acoustic_config.has_dynamic_features,
                self.acoustic_config.num_windows,
            )
            mgc_end_dim = static_stream_sizes[0]
            acoustic_features[:, :mgc_end_dim] = variance_scaling(
                self.postfilter_out_scaler.var_.reshape(-1)[:mgc_end_dim],
                acoustic_features[:, :mgc_end_dim],
                offset=2,
            )
            # bap
            bap_start_dim = sum(static_stream_sizes[:3])
            bap_end_dim = sum(static_stream_sizes[:4])
            acoustic_features[:, bap_start_dim:bap_end_dim] = variance_scaling(
                self.postfilter_out_scaler.var_.reshape(-1)
                [bap_start_dim:bap_end_dim],
                acoustic_features[:, bap_start_dim:bap_end_dim],
                offset=0,
            )

        # Learned post-filter using nnsvs
        if post_filter_type == "nnsvs" and self.postfilter_model is not None:
            in_feats = torch.from_numpy(acoustic_features).float().unsqueeze(0)
            in_feats = (
                self.postfilter_out_scaler.transform(in_feats).float().to(
                    self.device))
            out_feats = self.postfilter_model.inference(
                in_feats, [in_feats.shape[1]])
            acoustic_features = (self.postfilter_out_scaler.inverse_transform(
                out_feats.cpu()).squeeze(0).numpy())

        # Generate WORLD parameters
        mgc, lf0, vuv, bap = gen_spsvs_static_features(
            duration_modified_labels,
            acoustic_features,
            self.binary_dict,
            self.numeric_dict,
            self.acoustic_config.stream_sizes,
            self.acoustic_config.has_dynamic_features,
            self.config.acoustic.subphone_features,
            self.pitch_idx,
            self.acoustic_config.num_windows,
            self.config.frame_period,
            self.config.acoustic.relative_f0,
            vibrato_scale=vibrato_scale,
            vuv_threshold=vuv_threshold,
            force_fix_vuv=force_fix_vuv,
        )

        # NOTE: spectral enhancement based on the Merlin's post-filter implementation
        if post_filter_type == "merlin":
            alpha = pysptk.util.mcepalpha(self.config.sample_rate)
            mgc = merlin_post_filter(mgc, alpha)

        # Remove high-frequency components of mgc/bap
        # NOTE: It seems to be effective to suppress artifacts of GAN-based post-filtering
        if trajectory_smoothing:
            modfs = int(1 / 0.005)
            for d in range(mgc.shape[1]):
                mgc[:, d] = lowpass_filter(mgc[:, d],
                                           modfs,
                                           cutoff=trajectory_smoothing_cutoff)
            for d in range(bap.shape[1]):
                bap[:, d] = lowpass_filter(bap[:, d],
                                           modfs,
                                           cutoff=trajectory_smoothing_cutoff)

        # Waveform generation by (1) WORLD or (2) neural vocoder
        if vocoder_type == "world":
            f0, spectrogram, aperiodicity = gen_world_params(
                mgc,
                lf0,
                vuv,
                bap,
                self.config.sample_rate,
                vuv_threshold=vuv_threshold)

            wav = pyworld.synthesize(
                f0,
                spectrogram,
                aperiodicity,
                self.config.sample_rate,
                self.config.frame_period,
            )
        elif vocoder_type == "pwg":
            # NOTE: So far vocoder models are trained on binary V/UV features
            vuv = (vuv > vuv_threshold).astype(np.float32)
            voc_inp = (torch.from_numpy(
                self.vocoder_in_scaler.transform(
                    np.concatenate([mgc, lf0, vuv, bap],
                                   axis=-1))).float().to(self.device))
            wav = self.vocoder.inference(voc_inp).view(-1).to("cpu").numpy()

        wav = self.post_process(wav)

        if return_states:
            states = {
                "mgc": mgc,
                "lf0": lf0,
                "vuv": vuv,
                "bap": bap,
            }
            if vocoder_type == "world":
                states.update({
                    "f0": f0,
                    "spectrogram": spectrogram,
                    "aperiodicity": aperiodicity,
                })

            return wav, self.config.sample_rate, states

        return wav, self.config.sample_rate
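A hypothetical driver for the svs method above; the engine instance and the label file name are assumptions, not part of the source.

# Load HTS-style full-context labels with nnmnkwii and synthesize with WORLD.
from nnmnkwii.io import hts

labels = hts.load("song.lab")
wav, sr = engine.svs(labels, vocoder_type="world", post_filter_type="merlin")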
Example #11
def gen_waveform(labels,
                 acoustic_features,
                 binary_dict,
                 continuous_dict,
                 stream_sizes,
                 has_dynamic_features,
                 subphone_features="coarse_coding",
                 log_f0_conditioning=True,
                 pitch_idx=None,
                 num_windows=3,
                 post_filter=True,
                 sample_rate=48000,
                 frame_period=5,
                 relative_f0=True):
    windows = get_windows(num_windows)

    # Apply MLPG if necessary
    if np.any(has_dynamic_features):
        static_stream_sizes = get_static_stream_sizes(stream_sizes,
                                                      has_dynamic_features,
                                                      len(windows))
    else:
        static_stream_sizes = stream_sizes

    # Split multi-stream features
    mgc, target_f0, vuv, bap = split_streams(acoustic_features,
                                             static_stream_sizes)

    # Generate the waveform with the WORLD vocoder
    fftlen = pyworld.get_cheaptrick_fft_size(sample_rate)
    alpha = pysptk.util.mcepalpha(sample_rate)

    if post_filter:
        mgc = merlin_post_filter(mgc, alpha)

    spectrogram = pysptk.mc2sp(mgc, fftlen=fftlen, alpha=alpha)
    aperiodicity = pyworld.decode_aperiodicity(bap.astype(np.float64),
                                               sample_rate, fftlen)

    # fill aperiodicity with ones for unvoiced regions
    aperiodicity[vuv.reshape(-1) < 0.5, :] = 1.0
    # WORLD fails catastrophically for out of range aperiodicity
    aperiodicity = np.clip(aperiodicity, 0.0, 1.0)

    ### F0 ###
    if relative_f0:
        diff_lf0 = target_f0
        # need to extract pitch sequence from the musical score
        linguistic_features = fe.linguistic_features(
            labels,
            binary_dict,
            continuous_dict,
            add_frame_features=True,
            subphone_features=subphone_features)
        f0_score = _midi_to_hz(linguistic_features, pitch_idx, False)[:, None]
        lf0_score = f0_score.copy()
        nonzero_indices = np.nonzero(lf0_score)
        lf0_score[nonzero_indices] = np.log(f0_score[nonzero_indices])
        lf0_score = interp1d(lf0_score, kind="slinear")

        f0 = diff_lf0 + lf0_score
        f0[vuv < 0.5] = 0
        f0[np.nonzero(f0)] = np.exp(f0[np.nonzero(f0)])
    else:
        f0 = target_f0
        f0[vuv < 0.5] = 0
        f0[np.nonzero(f0)] = np.exp(f0[np.nonzero(f0)])

    generated_waveform = pyworld.synthesize(f0.flatten().astype(np.float64),
                                            spectrogram.astype(np.float64),
                                            aperiodicity.astype(np.float64),
                                            sample_rate, frame_period)

    # Reduce the volume (to prevent clipping)
    # TODO: choose this scaling constant more sensibly
    spectrogram *= 0.000000001
    sp = pyworld.code_spectral_envelope(spectrogram, sample_rate, 60)

    return f0, sp, bap, generated_waveform
Example #12
def synthesis(
    config,
    device,
    label_path,
    question_path,
    timelag_model,
    timelag_config,
    timelag_in_scaler,
    timelag_out_scaler,
    duration_model,
    duration_config,
    duration_in_scaler,
    duration_out_scaler,
    acoustic_model,
    acoustic_config,
    acoustic_in_scaler,
    acoustic_out_scaler,
):
    # load labels and question
    labels = hts.load(label_path).round_()
    binary_dict, numeric_dict = hts.load_question_set(question_path,
                                                      append_hat_for_LL=False)

    # pitch indices in the input features
    # TODO: make this configurable
    pitch_idx = len(binary_dict) + 1
    pitch_indices = np.arange(len(binary_dict), len(binary_dict) + 3)

    log_f0_conditioning = config.log_f0_conditioning

    # Clipping settings
    # setting True by default for backward compatibility
    timelag_clip_input_features = (config.timelag.force_clip_input_features
                                   if "force_clip_input_features"
                                   in config.timelag else True)
    duration_clip_input_features = (config.duration.force_clip_input_features
                                    if "force_clip_input_features"
                                    in config.duration else True)
    acoustic_clip_input_features = (config.acoustic.force_clip_input_features
                                    if "force_clip_input_features"
                                    in config.acoustic else True)

    if config.ground_truth_duration:
        # Use provided alignment
        duration_modified_labels = labels
    else:
        # Time-lag
        lag = predict_timelag(
            device,
            labels,
            timelag_model,
            timelag_config,
            timelag_in_scaler,
            timelag_out_scaler,
            binary_dict,
            numeric_dict,
            pitch_indices,
            log_f0_conditioning,
            config.timelag.allowed_range,
            config.timelag.allowed_range_rest,
            timelag_clip_input_features,
        )

        # Duration predictions
        durations = predict_duration(
            device,
            labels,
            duration_model,
            duration_config,
            duration_in_scaler,
            duration_out_scaler,
            binary_dict,
            numeric_dict,
            pitch_indices,
            log_f0_conditioning,
            duration_clip_input_features,
        )

        # Normalize phoneme durations
        duration_modified_labels = postprocess_duration(labels, durations, lag)

    # Predict acoustic features
    acoustic_features = predict_acoustic(
        device,
        duration_modified_labels,
        acoustic_model,
        acoustic_config,
        acoustic_in_scaler,
        acoustic_out_scaler,
        binary_dict,
        numeric_dict,
        config.acoustic.subphone_features,
        pitch_indices,
        log_f0_conditioning,
        acoustic_clip_input_features,
    )

    # Generate WORLD parameters
    mgc, lf0, vuv, bap = gen_spsvs_static_features(
        duration_modified_labels,
        acoustic_features,
        binary_dict,
        numeric_dict,
        acoustic_config.stream_sizes,
        acoustic_config.has_dynamic_features,
        config.acoustic.subphone_features,
        pitch_idx,
        acoustic_config.num_windows,
        config.frame_period,
        config.acoustic.relative_f0,
        config.vibrato_scale,
    )

    if config.acoustic.post_filter:
        alpha = pysptk.util.mcepalpha(config.sample_rate)
        mgc = merlin_post_filter(mgc, alpha)

    f0, spectrogram, aperiodicity = gen_world_params(mgc, lf0, vuv, bap,
                                                     config.sample_rate)

    wav = pyworld.synthesize(f0, spectrogram, aperiodicity, config.sample_rate,
                             config.frame_period)

    return wav