Code example #1
def synthesis(config, device, label_path, question_path, timelag_model,
              timelag_config, timelag_in_scaler, timelag_out_scaler,
              duration_model, duration_config, duration_in_scaler,
              duration_out_scaler, acoustic_model, acoustic_config,
              acoustic_in_scaler, acoustic_out_scaler):
    # load labels and question
    labels = hts.load(label_path).round_()
    binary_dict, continuous_dict = hts.load_question_set(
        question_path, append_hat_for_LL=False)

    # pitch indices in the input features
    # TODO: configurable
    pitch_idx = len(binary_dict) + 1
    pitch_indices = np.arange(len(binary_dict), len(binary_dict) + 3)

    log_f0_conditioning = config.log_f0_conditioning

    if config.ground_truth_duration:
        # Use provided alignment
        duration_modified_labels = labels
    else:
        # Time-lag
        lag = predict_timelag(device, labels, timelag_model, timelag_config,
                              timelag_in_scaler, timelag_out_scaler,
                              binary_dict, continuous_dict, pitch_indices,
                              log_f0_conditioning,
                              config.timelag.allowed_range)

        # Duration predictions
        durations = predict_duration(device, labels, duration_model,
                                     duration_config, duration_in_scaler,
                                     duration_out_scaler, lag, binary_dict,
                                     continuous_dict, pitch_indices,
                                     log_f0_conditioning)

        # Normalize phoneme durations
        duration_modified_labels = postprocess_duration(labels, durations, lag)

    # Predict acoustic features
    acoustic_features = predict_acoustic(
        device, duration_modified_labels, acoustic_model, acoustic_config,
        acoustic_in_scaler, acoustic_out_scaler, binary_dict, continuous_dict,
        config.acoustic.subphone_features, pitch_indices, log_f0_conditioning)

    # Waveform generation
    generated_waveform = gen_waveform(
        duration_modified_labels, acoustic_features, binary_dict,
        continuous_dict, acoustic_config.stream_sizes,
        acoustic_config.has_dynamic_features,
        config.acoustic.subphone_features, log_f0_conditioning, pitch_idx,
        acoustic_config.num_windows, config.acoustic.post_filter,
        config.sample_rate, config.frame_period, config.acoustic.relative_f0)

    return generated_waveform
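
Usage note (illustrative, not part of the snippet above): synthesis() returns a raw waveform array at config.sample_rate. A minimal sketch of writing it to disk follows; the output path, the clipping step, and the int16 scale of the samples are assumptions here.

import numpy as np
from scipy.io import wavfile


def save_wav(wav, sample_rate, out_path="output.wav"):
    # Clip to the int16 range before writing; the int16 scale of the
    # generated samples is an assumption, not guaranteed by the snippet above.
    wav = np.clip(wav, -32768, 32767).astype(np.int16)
    wavfile.write(out_path, sample_rate, wav)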
Code example #2
File: acoustic.py Project: oatsu-gh/ENUNU
def timing2acoustic(config: DictConfig, timing_path, acoustic_path):
    """
    フルラベルを読み取って、音響特長量のファイルを出力する。
    """
    # -----------------------------------------------------
    # From here: contents of nnsvs.bin.synthesis.my_app() --------
    # -----------------------------------------------------
    # Logger setup
    global logger  # pylint: disable=global-statement
    logger = getLogger(config.verbose)
    logger.info(OmegaConf.to_yaml(config))

    typ = 'acoustic'
    # Use CUDA if available
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # In place of maybe_set_checkpoints_(config)
    set_checkpoint(config, typ)
    # In place of maybe_set_normalization_stats_(config)
    set_normalization_stat(config, typ)

    # Load the model configuration, model, and checkpoint
    model_config = OmegaConf.load(to_absolute_path(config[typ].model_yaml))
    model = hydra.utils.instantiate(model_config.netG).to(device)
    checkpoint = torch.load(config[typ].checkpoint,
                            map_location=lambda storage, loc: storage)

    model.load_state_dict(checkpoint['state_dict'])
    in_scaler = joblib.load(config[typ].in_scaler_path)
    out_scaler = joblib.load(config[typ].out_scaler_path)
    model.eval()
    # -----------------------------------------------------
    # Up to here: contents of nnsvs.bin.synthesis.my_app() --------
    # -----------------------------------------------------

    # -----------------------------------------------------
    # From here: contents of nnsvs.bin.synthesis.synthesis() -----
    # -----------------------------------------------------
    # Load full_score_lab (the timing labels).
    duration_modified_labels = hts.load(timing_path).round_()

    # Path to the hed (question) file.
    question_path = to_absolute_path(config.question_path)
    # In hts2wav.py this is done as below, -----------------
    # which allows a separate hed file to be applied per model.
    # if config[typ].question_path is None:
    #     config[typ].question_path = config.question_path
    # --------------------------------------
    # Parse the hed file into question dictionaries.
    binary_dict, continuous_dict = hts.load_question_set(
        question_path, append_hat_for_LL=False)
    # pitch indices in the input features
    # pitch_idx = len(binary_dict) + 1
    pitch_indices = np.arange(len(binary_dict), len(binary_dict) + 3)

    # Read the f0 conditioning setting.
    log_f0_conditioning = config.log_f0_conditioning
    acoustic_features = predict_acoustic(device, duration_modified_labels,
                                         model, model_config, in_scaler,
                                         out_scaler, binary_dict,
                                         continuous_dict,
                                         config.acoustic.subphone_features,
                                         pitch_indices, log_f0_conditioning)

    # Write the acoustic feature matrix as a CSV file.
    np.savetxt(acoustic_path, acoustic_features, delimiter=',')
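
Usage note (illustrative): timing2acoustic() only writes the acoustic feature matrix to a CSV file, so a later step that turns the features into a waveform has to read it back. A minimal sketch with a hypothetical file name:

import numpy as np

# Load the matrix written by np.savetxt above; one row per frame,
# one column per dimension of the acoustic feature vector.
acoustic_features = np.loadtxt("song_acoustic.csv", delimiter=",")
print(acoustic_features.shape)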
Code example #3
File: svs.py Project: r9y9/nnsvs
    def svs(
        self,
        labels,
        vocoder_type="world",
        post_filter_type="merlin",
        trajectory_smoothing=True,
        trajectory_smoothing_cutoff=50,
        vuv_threshold=0.1,
        vibrato_scale=1.0,
        return_states=False,
        force_fix_vuv=True,
        post_filter=None,
    ):
        """Synthesize waveform given HTS-style labels

        Args:
            labels (nnmnkwii.io.HTSLabelFile): HTS-style labels
            vocoder_type (str): Vocoder type. world or pwg
            post_filter_type (str): Post-filter type. One of merlin, nnsvs, gv, or none.

        Returns:
            tuple: (synthesized waveform, sampling rate)
        """
        vocoder_type = vocoder_type.lower()
        if vocoder_type not in ["world", "pwg"]:
            raise ValueError(f"Unknown vocoder type: {vocoder_type}")
        if post_filter_type not in ["merlin", "nnsvs", "gv", "none"]:
            raise ValueError(f"Unknown post-filter type: {post_filter_type}")

        if vocoder_type == "pwg" and self.vocoder is None:
            raise ValueError("""Pre-trained vocodr model is not found.
WORLD is only supported for waveform generation""")
        if post_filter is not None:
            warn("post_filter is deprecated. Use post_filter_type instead.")
            post_filter_type = "merlin" if post_filter else "none"

        # Time-lag
        lag = predict_timelag(
            self.device,
            labels,
            self.timelag_model,
            self.timelag_config,
            self.timelag_in_scaler,
            self.timelag_out_scaler,
            self.binary_dict,
            self.numeric_dict,
            self.pitch_indices,
            self.config.log_f0_conditioning,
            self.config.timelag.allowed_range,
            self.config.timelag.allowed_range_rest,
            self.config.timelag.force_clip_input_features,
        )
        # Duration predictions
        durations = predict_duration(
            self.device,
            labels,
            self.duration_model,
            self.duration_config,
            self.duration_in_scaler,
            self.duration_out_scaler,
            self.binary_dict,
            self.numeric_dict,
            self.pitch_indices,
            self.config.log_f0_conditioning,
            self.config.duration.force_clip_input_features,
        )

        # Normalize phoneme durations
        duration_modified_labels = postprocess_duration(labels, durations, lag)

        # Predict acoustic features
        acoustic_features = predict_acoustic(
            self.device,
            duration_modified_labels,
            self.acoustic_model,
            self.acoustic_config,
            self.acoustic_in_scaler,
            self.acoustic_out_scaler,
            self.binary_dict,
            self.numeric_dict,
            self.config.acoustic.subphone_features,
            self.pitch_indices,
            self.config.log_f0_conditioning,
            self.config.acoustic.force_clip_input_features,
        )

        # Apply GV post-filtering
        if post_filter_type in ["nnsvs", "gv"]:
            static_stream_sizes = get_static_stream_sizes(
                self.acoustic_config.stream_sizes,
                self.acoustic_config.has_dynamic_features,
                self.acoustic_config.num_windows,
            )
            mgc_end_dim = static_stream_sizes[0]
            acoustic_features[:, :mgc_end_dim] = variance_scaling(
                self.postfilter_out_scaler.var_.reshape(-1)[:mgc_end_dim],
                acoustic_features[:, :mgc_end_dim],
                offset=2,
            )
            # bap
            bap_start_dim = sum(static_stream_sizes[:3])
            bap_end_dim = sum(static_stream_sizes[:4])
            acoustic_features[:, bap_start_dim:bap_end_dim] = variance_scaling(
                self.postfilter_out_scaler.var_.reshape(-1)
                [bap_start_dim:bap_end_dim],
                acoustic_features[:, bap_start_dim:bap_end_dim],
                offset=0,
            )

        # Learned post-filter using nnsvs
        if post_filter_type == "nnsvs" and self.postfilter_model is not None:
            in_feats = torch.from_numpy(acoustic_features).float().unsqueeze(0)
            in_feats = (
                self.postfilter_out_scaler.transform(in_feats).float().to(
                    self.device))
            out_feats = self.postfilter_model.inference(
                in_feats, [in_feats.shape[1]])
            acoustic_features = (self.postfilter_out_scaler.inverse_transform(
                out_feats.cpu()).squeeze(0).numpy())

        # Generate WORLD parameters
        mgc, lf0, vuv, bap = gen_spsvs_static_features(
            duration_modified_labels,
            acoustic_features,
            self.binary_dict,
            self.numeric_dict,
            self.acoustic_config.stream_sizes,
            self.acoustic_config.has_dynamic_features,
            self.config.acoustic.subphone_features,
            self.pitch_idx,
            self.acoustic_config.num_windows,
            self.config.frame_period,
            self.config.acoustic.relative_f0,
            vibrato_scale=vibrato_scale,
            vuv_threshold=vuv_threshold,
            force_fix_vuv=force_fix_vuv,
        )

        # NOTE: spectral enhancement based on Merlin's post-filter implementation
        if post_filter_type == "merlin":
            alpha = pysptk.util.mcepalpha(self.config.sample_rate)
            mgc = merlin_post_filter(mgc, alpha)

        # Remove high-frequency components of mgc/bap
        # NOTE: this seems effective for suppressing artifacts of GAN-based post-filtering
        if trajectory_smoothing:
            modfs = int(1 / 0.005)
            for d in range(mgc.shape[1]):
                mgc[:, d] = lowpass_filter(mgc[:, d],
                                           modfs,
                                           cutoff=trajectory_smoothing_cutoff)
            for d in range(bap.shape[1]):
                bap[:, d] = lowpass_filter(bap[:, d],
                                           modfs,
                                           cutoff=trajectory_smoothing_cutoff)

        # Waveform generation by (1) WORLD or (2) neural vocoder
        if vocoder_type == "world":
            f0, spectrogram, aperiodicity = gen_world_params(
                mgc,
                lf0,
                vuv,
                bap,
                self.config.sample_rate,
                vuv_threshold=vuv_threshold)

            wav = pyworld.synthesize(
                f0,
                spectrogram,
                aperiodicity,
                self.config.sample_rate,
                self.config.frame_period,
            )
        elif vocoder_type == "pwg":
            # NOTE: So far vocoder models are trained on binary V/UV features
            vuv = (vuv > vuv_threshold).astype(np.float32)
            voc_inp = (torch.from_numpy(
                self.vocoder_in_scaler.transform(
                    np.concatenate([mgc, lf0, vuv, bap],
                                   axis=-1))).float().to(self.device))
            wav = self.vocoder.inference(voc_inp).view(-1).to("cpu").numpy()

        wav = self.post_process(wav)

        if return_states:
            states = {
                "mgc": mgc,
                "lf0": lf0,
                "vuv": vuv,
                "bap": bap,
            }
            if vocoder_type == "world":
                states.update({
                    "f0": f0,
                    "spectrogram": spectrogram,
                    "aperiodicity": aperiodicity,
                })

            return wav, self.config.sample_rate, states

        return wav, self.config.sample_rate
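
Usage note (illustrative): a minimal sketch of calling this method, assuming it belongs to nnsvs's packaged engine class SPSVS constructed from a packed model directory; the import path, constructor arguments, file names, and the int16 output scale are assumptions.

import numpy as np
from nnmnkwii.io import hts
from scipy.io import wavfile
from nnsvs.svs import SPSVS  # assumed import path

engine = SPSVS("packed_model_dir", device="cpu")  # placeholder model directory
labels = hts.load("song.lab")  # HTS-style full-context labels
wav, sr = engine.svs(labels, vocoder_type="world", post_filter_type="merlin")
wavfile.write("song.wav", sr, wav.astype(np.int16))  # int16 scale assumed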
Code example #4
File: hts2wav.py Project: taroushirani/ENUNU
def synthesis(config, device, label_path,
              timelag_model, timelag_config, timelag_in_scaler, timelag_out_scaler,
              duration_model, duration_config, duration_in_scaler, duration_out_scaler,
              acoustic_model, acoustic_config, acoustic_in_scaler, acoustic_out_scaler):
    """
    音声ファイルを合成する。
    """
    # load labels and question
    labels = hts.load(label_path).round_()
    # load questions
    set_each_question_path(config)
    log_f0_conditioning = config.log_f0_conditioning

    if config.ground_truth_duration:
        # Use provided alignment
        duration_modified_labels = labels
    else:
        # Time-lag predictions
        timelag_binary_dict, timelag_continuous_dict, timelag_pitch_indices, _ \
            = load_qst(config.timelag.question_path)
        lag = predict_timelag(
            device, labels,
            timelag_model,
            timelag_config,
            timelag_in_scaler,
            timelag_out_scaler,
            timelag_binary_dict,
            timelag_continuous_dict,
            timelag_pitch_indices,
            log_f0_conditioning,
            config.timelag.allowed_range)

        # Duration predictions
        duration_binary_dict, duration_continuous_dict, duration_pitch_indices, _ \
            = load_qst(config.duration.question_path)
        durations = predict_duration(
            device, labels,
            duration_model,
            duration_config,
            duration_in_scaler,
            duration_out_scaler,
            lag,
            duration_binary_dict,
            duration_continuous_dict,
            duration_pitch_indices,
            log_f0_conditioning)
        # Normalize phoneme durations
        duration_modified_labels = postprocess_duration(labels, durations, lag)

    acoustic_binary_dict, acoustic_continuous_dict, acoustic_pitch_indices, acoustic_pitch_idx \
        = load_qst(config.acoustic.question_path)
    # Predict acoustic features
    acoustic_features = predict_acoustic(
        device, duration_modified_labels,
        acoustic_model,
        acoustic_config,
        acoustic_in_scaler,
        acoustic_out_scaler,
        acoustic_binary_dict,
        acoustic_continuous_dict,
        config.acoustic.subphone_features,
        acoustic_pitch_indices,
        log_f0_conditioning)

    # Generate f0, mgc, bap, waveform
    f0, mgc, bap, generated_waveform = gen_waveform(
        duration_modified_labels,
        acoustic_features,
        acoustic_binary_dict,
        acoustic_continuous_dict,
        acoustic_config.stream_sizes,
        acoustic_config.has_dynamic_features,
        config.acoustic.subphone_features,
        log_f0_conditioning,
        acoustic_pitch_idx,
        acoustic_config.num_windows,
        config.acoustic.post_filter,
        config.sample_rate,
        config.frame_period,
        config.acoustic.relative_f0)

    return duration_modified_labels, f0, mgc, bap, generated_waveform
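
Note (illustrative): set_each_question_path() and load_qst() are ENUNU-side helpers not shown in this snippet. Judging from how the four return values are used above and from the question handling in examples #1 and #5, load_qst() plausibly looks like the sketch below; this is an inference, not the project's actual implementation.

import numpy as np
from nnmnkwii.io import hts


def load_qst(question_path, append_hat_for_LL=False):
    """Hypothetical helper: load a hed question file and derive the
    pitch-related feature indices, mirroring examples #1 and #5."""
    binary_dict, continuous_dict = hts.load_question_set(
        question_path, append_hat_for_LL=append_hat_for_LL)
    pitch_idx = len(binary_dict) + 1
    pitch_indices = np.arange(len(binary_dict), len(binary_dict) + 3)
    return binary_dict, continuous_dict, pitch_indices, pitch_idx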
Code example #5
File: synthesis.py Project: r9y9/nnsvs
def synthesis(
    config,
    device,
    label_path,
    question_path,
    timelag_model,
    timelag_config,
    timelag_in_scaler,
    timelag_out_scaler,
    duration_model,
    duration_config,
    duration_in_scaler,
    duration_out_scaler,
    acoustic_model,
    acoustic_config,
    acoustic_in_scaler,
    acoustic_out_scaler,
):
    # load labels and question
    labels = hts.load(label_path).round_()
    binary_dict, numeric_dict = hts.load_question_set(question_path,
                                                      append_hat_for_LL=False)

    # pitch indices in the input features
    # TODO: configurable
    pitch_idx = len(binary_dict) + 1
    pitch_indices = np.arange(len(binary_dict), len(binary_dict) + 3)

    log_f0_conditioning = config.log_f0_conditioning

    # Clipping settings
    # set to True by default for backward compatibility
    timelag_clip_input_features = (config.timelag.force_clip_input_features
                                   if "force_clip_input_features"
                                   in config.timelag else True)
    duration_clip_input_features = (config.duration.force_clip_input_features
                                    if "force_clip_input_features"
                                    in config.duration else True)
    acoustic_clip_input_features = (config.acoustic.force_clip_input_features
                                    if "force_clip_input_features"
                                    in config.acoustic else True)

    if config.ground_truth_duration:
        # Use provided alignment
        duration_modified_labels = labels
    else:
        # Time-lag
        lag = predict_timelag(
            device,
            labels,
            timelag_model,
            timelag_config,
            timelag_in_scaler,
            timelag_out_scaler,
            binary_dict,
            numeric_dict,
            pitch_indices,
            log_f0_conditioning,
            config.timelag.allowed_range,
            config.timelag.allowed_range_rest,
            timelag_clip_input_features,
        )

        # Duration predictions
        durations = predict_duration(
            device,
            labels,
            duration_model,
            duration_config,
            duration_in_scaler,
            duration_out_scaler,
            binary_dict,
            numeric_dict,
            pitch_indices,
            log_f0_conditioning,
            duration_clip_input_features,
        )

        # Normalize phoneme durations
        duration_modified_labels = postprocess_duration(labels, durations, lag)

    # Predict acoustic features
    acoustic_features = predict_acoustic(
        device,
        duration_modified_labels,
        acoustic_model,
        acoustic_config,
        acoustic_in_scaler,
        acoustic_out_scaler,
        binary_dict,
        numeric_dict,
        config.acoustic.subphone_features,
        pitch_indices,
        log_f0_conditioning,
        acoustic_clip_input_features,
    )

    # Generate WORLD parameters
    mgc, lf0, vuv, bap = gen_spsvs_static_features(
        duration_modified_labels,
        acoustic_features,
        binary_dict,
        numeric_dict,
        acoustic_config.stream_sizes,
        acoustic_config.has_dynamic_features,
        config.acoustic.subphone_features,
        pitch_idx,
        acoustic_config.num_windows,
        config.frame_period,
        config.acoustic.relative_f0,
        config.vibrato_scale,
    )

    if config.acoustic.post_filter:
        alpha = pysptk.util.mcepalpha(config.sample_rate)
        mgc = merlin_post_filter(mgc, alpha)

    f0, spectrogram, aperiodicity = gen_world_params(mgc, lf0, vuv, bap,
                                                     config.sample_rate)

    wav = pyworld.synthesize(f0, spectrogram, aperiodicity, config.sample_rate,
                             config.frame_period)

    return wav
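
Note (illustrative): the force_clip_input_features lookups near the top of this function rely on OmegaConf's DictConfig supporting the `in` operator, so configs written before the option existed still default to True. A small self-contained illustration follows; the config contents are made up.

from omegaconf import OmegaConf

cfg = OmegaConf.create({"timelag": {"allowed_range": [-20, 19]}})
# The key is absent in this older-style config, so the fallback applies.
clip = (cfg.timelag.force_clip_input_features
        if "force_clip_input_features" in cfg.timelag else True)
print(clip)  # True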