def synthesis(config, device, label_path, question_path,
              timelag_model, timelag_config, timelag_in_scaler,
              timelag_out_scaler,
              duration_model, duration_config, duration_in_scaler,
              duration_out_scaler,
              acoustic_model, acoustic_config, acoustic_in_scaler,
              acoustic_out_scaler):
    # load labels and question
    labels = hts.load(label_path).round_()
    binary_dict, continuous_dict = hts.load_question_set(
        question_path, append_hat_for_LL=False)

    # pitch indices in the input features
    # TODO: configurable
    pitch_idx = len(binary_dict) + 1
    pitch_indices = np.arange(len(binary_dict), len(binary_dict) + 3)

    log_f0_conditioning = config.log_f0_conditioning

    if config.ground_truth_duration:
        # Use provided alignment
        duration_modified_labels = labels
    else:
        # Time-lag predictions
        lag = predict_timelag(device, labels, timelag_model, timelag_config,
                              timelag_in_scaler, timelag_out_scaler,
                              binary_dict, continuous_dict, pitch_indices,
                              log_f0_conditioning,
                              config.timelag.allowed_range)
        # Duration predictions
        durations = predict_duration(device, labels, duration_model,
                                     duration_config, duration_in_scaler,
                                     duration_out_scaler, lag, binary_dict,
                                     continuous_dict, pitch_indices,
                                     log_f0_conditioning)
        # Normalize phoneme durations
        duration_modified_labels = postprocess_duration(labels, durations, lag)

    # Predict acoustic features
    acoustic_features = predict_acoustic(
        device, duration_modified_labels, acoustic_model, acoustic_config,
        acoustic_in_scaler, acoustic_out_scaler, binary_dict, continuous_dict,
        config.acoustic.subphone_features, pitch_indices, log_f0_conditioning)

    # Waveform generation
    generated_waveform = gen_waveform(
        duration_modified_labels, acoustic_features, binary_dict,
        continuous_dict, acoustic_config.stream_sizes,
        acoustic_config.has_dynamic_features,
        config.acoustic.subphone_features, log_f0_conditioning, pitch_idx,
        acoustic_config.num_windows, config.acoustic.post_filter,
        config.sample_rate, config.frame_period, config.acoustic.relative_f0)

    return generated_waveform
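# A minimal usage sketch for synthesis() above. Everything here is an
# assumption for illustration: the Hydra-style `config` layout, the checkpoint
# and scaler paths inside it, and the output file name are hypothetical, and
# the real entry point may load models differently.
def _example_synthesis_usage(config, label_path, question_path):
    import hydra
    import joblib
    import numpy as np
    import torch
    from hydra.utils import to_absolute_path
    from omegaconf import OmegaConf
    from scipy.io import wavfile

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Load (model, model_config, in_scaler, out_scaler) per stage, mirroring
    # the loading code in timing2acoustic() below.
    stages = {}
    for typ in ('timelag', 'duration', 'acoustic'):
        model_config = OmegaConf.load(to_absolute_path(config[typ].model_yaml))
        model = hydra.utils.instantiate(model_config.netG).to(device)
        checkpoint = torch.load(config[typ].checkpoint,
                                map_location=lambda storage, loc: storage)
        model.load_state_dict(checkpoint['state_dict'])
        model.eval()
        stages[typ] = (model, model_config,
                       joblib.load(config[typ].in_scaler_path),
                       joblib.load(config[typ].out_scaler_path))

    wav = synthesis(config, device, label_path, question_path,
                    *stages['timelag'], *stages['duration'],
                    *stages['acoustic'])
    # gen_waveform() returns a float array; clip and cast before writing.
    wav = np.clip(wav, -32768, 32767).astype(np.int16)
    wavfile.write('out.wav', config.sample_rate, wav)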
def timing2acoustic(config: DictConfig, timing_path, acoustic_path):
    """
    Read a full-context label file and write acoustic features to a file.
    """
    # -----------------------------------------------------
    # From here: contents of nnsvs.bin.synthesis.my_app() --
    # -----------------------------------------------------
    # logger setup
    global logger  # pylint: disable=global-statement
    logger = getLogger(config.verbose)
    logger.info(OmegaConf.to_yaml(config))

    typ = 'acoustic'

    # check whether CUDA is available
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # instead of maybe_set_checkpoints_(config)
    set_checkpoint(config, typ)
    # instead of maybe_set_normalization_stats_(config)
    set_normalization_stat(config, typ)

    # load the model settings
    model_config = OmegaConf.load(to_absolute_path(config[typ].model_yaml))
    model = hydra.utils.instantiate(model_config.netG).to(device)
    checkpoint = torch.load(config[typ].checkpoint,
                            map_location=lambda storage, loc: storage)
    model.load_state_dict(checkpoint['state_dict'])
    in_scaler = joblib.load(config[typ].in_scaler_path)
    out_scaler = joblib.load(config[typ].out_scaler_path)
    model.eval()
    # -----------------------------------------------------
    # Up to here: contents of nnsvs.bin.synthesis.my_app() -
    # -----------------------------------------------------

    # -----------------------------------------------------
    # From here: contents of nnsvs.bin.synthesis.synthesis()
    # -----------------------------------------------------
    # load the full_score_lab file
    duration_modified_labels = hts.load(timing_path).round_()

    # load the hed file
    question_path = to_absolute_path(config.question_path)
    # hts2wav.py does it this way ↓ -------
    # (this allows a separate hed file per model)
    # if config[typ].question_path is None:
    #     config[typ].question_path = config.question_path
    # --------------------------------------

    # read the hed file as dictionaries
    binary_dict, continuous_dict = hts.load_question_set(
        question_path, append_hat_for_LL=False)

    # pitch indices in the input features
    # pitch_idx = len(binary_dict) + 1
    pitch_indices = np.arange(len(binary_dict), len(binary_dict) + 3)

    # read the f0 settings
    log_f0_conditioning = config.log_f0_conditioning

    acoustic_features = predict_acoustic(device, duration_modified_labels,
                                         model, model_config, in_scaler,
                                         out_scaler, binary_dict,
                                         continuous_dict,
                                         config.acoustic.subphone_features,
                                         pitch_indices, log_f0_conditioning)

    # write the acoustic feature matrix as a csv file
    np.savetxt(acoustic_path, acoustic_features, delimiter=',')
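# A hedged usage sketch for timing2acoustic() above. The config file name,
# label file, and CSV path are hypothetical; the point is the call pattern and
# the shape of the written matrix.
def _example_timing2acoustic_usage():
    import numpy as np
    from omegaconf import OmegaConf

    config = OmegaConf.load('conf/synthesis.yaml')  # hypothetical config
    timing2acoustic(config, 'song_timing.lab', 'song_acoustic.csv')

    # np.savetxt() above writes a 2-D (num_frames x feature_dim) matrix,
    # so it can be read back with the matching delimiter:
    acoustic_features = np.loadtxt('song_acoustic.csv', delimiter=',')
    print(acoustic_features.shape)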
def svs(
    self,
    labels,
    vocoder_type="world",
    post_filter_type="merlin",
    trajectory_smoothing=True,
    trajectory_smoothing_cutoff=50,
    vuv_threshold=0.1,
    vibrato_scale=1.0,
    return_states=False,
    force_fix_vuv=True,
    post_filter=None,
):
    """Synthesize waveform given HTS-style labels

    Args:
        labels (nnmnkwii.io.HTSLabelFile): HTS-style labels
        vocoder_type (str): Vocoder type. world or pwg
        post_filter_type (str): Post-filter type. merlin, nnsvs, gv or none

    Returns:
        tuple: (synthesized waveform, sampling rate)
    """
    vocoder_type = vocoder_type.lower()
    if vocoder_type not in ["world", "pwg"]:
        raise ValueError(f"Unknown vocoder type: {vocoder_type}")
    if post_filter_type not in ["merlin", "nnsvs", "gv", "none"]:
        raise ValueError(f"Unknown post-filter type: {post_filter_type}")
    if vocoder_type == "pwg" and self.vocoder is None:
        raise ValueError("""Pre-trained vocoder model is not found.
WORLD is only supported for waveform generation""")
    if post_filter is not None:
        warn("post_filter is deprecated. Use post_filter_type instead.")
        post_filter_type = "merlin" if post_filter else "none"

    # Time-lag predictions
    lag = predict_timelag(
        self.device,
        labels,
        self.timelag_model,
        self.timelag_config,
        self.timelag_in_scaler,
        self.timelag_out_scaler,
        self.binary_dict,
        self.numeric_dict,
        self.pitch_indices,
        self.config.log_f0_conditioning,
        self.config.timelag.allowed_range,
        self.config.timelag.allowed_range_rest,
        self.config.timelag.force_clip_input_features,
    )
    # Duration predictions
    durations = predict_duration(
        self.device,
        labels,
        self.duration_model,
        self.duration_config,
        self.duration_in_scaler,
        self.duration_out_scaler,
        self.binary_dict,
        self.numeric_dict,
        self.pitch_indices,
        self.config.log_f0_conditioning,
        self.config.duration.force_clip_input_features,
    )
    # Normalize phoneme durations
    duration_modified_labels = postprocess_duration(labels, durations, lag)

    # Predict acoustic features
    acoustic_features = predict_acoustic(
        self.device,
        duration_modified_labels,
        self.acoustic_model,
        self.acoustic_config,
        self.acoustic_in_scaler,
        self.acoustic_out_scaler,
        self.binary_dict,
        self.numeric_dict,
        self.config.acoustic.subphone_features,
        self.pitch_indices,
        self.config.log_f0_conditioning,
        self.config.acoustic.force_clip_input_features,
    )

    # Apply GV post-filtering
    if post_filter_type in ["nnsvs", "gv"]:
        static_stream_sizes = get_static_stream_sizes(
            self.acoustic_config.stream_sizes,
            self.acoustic_config.has_dynamic_features,
            self.acoustic_config.num_windows,
        )
        # mgc
        mgc_end_dim = static_stream_sizes[0]
        acoustic_features[:, :mgc_end_dim] = variance_scaling(
            self.postfilter_out_scaler.var_.reshape(-1)[:mgc_end_dim],
            acoustic_features[:, :mgc_end_dim],
            offset=2,
        )
        # bap
        bap_start_dim = sum(static_stream_sizes[:3])
        bap_end_dim = sum(static_stream_sizes[:4])
        acoustic_features[:, bap_start_dim:bap_end_dim] = variance_scaling(
            self.postfilter_out_scaler.var_.reshape(-1)[
                bap_start_dim:bap_end_dim],
            acoustic_features[:, bap_start_dim:bap_end_dim],
            offset=0,
        )

    # Learned post-filter using nnsvs
    if post_filter_type == "nnsvs" and self.postfilter_model is not None:
        in_feats = torch.from_numpy(acoustic_features).float().unsqueeze(0)
        in_feats = (
            self.postfilter_out_scaler.transform(in_feats).float().to(
                self.device))
        out_feats = self.postfilter_model.inference(
            in_feats, [in_feats.shape[1]])
        acoustic_features = (self.postfilter_out_scaler.inverse_transform(
            out_feats.cpu()).squeeze(0).numpy())

    # Generate WORLD parameters
    mgc, lf0, vuv, bap = gen_spsvs_static_features(
        duration_modified_labels,
        acoustic_features,
        self.binary_dict,
        self.numeric_dict,
        self.acoustic_config.stream_sizes,
        self.acoustic_config.has_dynamic_features,
        self.config.acoustic.subphone_features,
        self.pitch_idx,
        self.acoustic_config.num_windows,
        self.config.frame_period,
        self.config.acoustic.relative_f0,
        vibrato_scale=vibrato_scale,
        vuv_threshold=vuv_threshold,
        force_fix_vuv=force_fix_vuv,
    )

    # NOTE: spectral enhancement based on Merlin's post-filter implementation
    if post_filter_type == "merlin":
        alpha = pysptk.util.mcepalpha(self.config.sample_rate)
        mgc = merlin_post_filter(mgc, alpha)

    # Remove high-frequency components of mgc/bap
    # NOTE: This seems effective at suppressing artifacts of GAN-based
    # post-filtering
    if trajectory_smoothing:
        modfs = int(1 / 0.005)
        for d in range(mgc.shape[1]):
            mgc[:, d] = lowpass_filter(
                mgc[:, d], modfs, cutoff=trajectory_smoothing_cutoff)
        for d in range(bap.shape[1]):
            bap[:, d] = lowpass_filter(
                bap[:, d], modfs, cutoff=trajectory_smoothing_cutoff)

    # Waveform generation by (1) WORLD or (2) neural vocoder
    if vocoder_type == "world":
        f0, spectrogram, aperiodicity = gen_world_params(
            mgc, lf0, vuv, bap, self.config.sample_rate,
            vuv_threshold=vuv_threshold)
        wav = pyworld.synthesize(
            f0,
            spectrogram,
            aperiodicity,
            self.config.sample_rate,
            self.config.frame_period,
        )
    elif vocoder_type == "pwg":
        # NOTE: So far vocoder models are trained on binary V/UV features
        vuv = (vuv > vuv_threshold).astype(np.float32)
        voc_inp = (torch.from_numpy(
            self.vocoder_in_scaler.transform(
                np.concatenate([mgc, lf0, vuv, bap],
                               axis=-1))).float().to(self.device))
        wav = self.vocoder.inference(voc_inp).view(-1).to("cpu").numpy()

    wav = self.post_process(wav)

    if return_states:
        states = {
            "mgc": mgc,
            "lf0": lf0,
            "vuv": vuv,
            "bap": bap,
        }
        if vocoder_type == "world":
            states.update({
                "f0": f0,
                "spectrogram": spectrogram,
                "aperiodicity": aperiodicity,
            })
        return wav, self.config.sample_rate, states

    return wav, self.config.sample_rate
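# A hedged usage sketch for svs() above. The surrounding class is not shown in
# this file; `SPSVS` (nnsvs's packed-model interface) plus the model directory
# and label paths are assumptions for illustration.
def _example_svs_usage():
    import numpy as np
    from nnmnkwii.io import hts
    from scipy.io import wavfile

    engine = SPSVS('packed_model_dir')  # hypothetical packed model dir
    labels = hts.load('song.lab')       # HTS-style full-context labels

    wav, sr = engine.svs(labels, vocoder_type='world',
                         post_filter_type='merlin')
    wavfile.write('song.wav', sr, wav.astype(np.int16))

    # return_states=True additionally yields the intermediate WORLD features.
    wav, sr, states = engine.svs(labels, return_states=True)
    print(states['mgc'].shape, states['lf0'].shape)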
def synthesis(config, device, label_path,
              timelag_model, timelag_config, timelag_in_scaler,
              timelag_out_scaler,
              duration_model, duration_config, duration_in_scaler,
              duration_out_scaler,
              acoustic_model, acoustic_config, acoustic_in_scaler,
              acoustic_out_scaler):
    """
    Synthesize a waveform.
    """
    # load labels
    labels = hts.load(label_path).round_()
    # set a question file path for each model
    set_each_question_path(config)
    log_f0_conditioning = config.log_f0_conditioning

    if config.ground_truth_duration:
        # Use provided alignment
        duration_modified_labels = labels
    else:
        # Time-lag predictions
        timelag_binary_dict, timelag_continuous_dict, timelag_pitch_indices, _ \
            = load_qst(config.timelag.question_path)
        lag = predict_timelag(
            device, labels, timelag_model, timelag_config, timelag_in_scaler,
            timelag_out_scaler, timelag_binary_dict, timelag_continuous_dict,
            timelag_pitch_indices, log_f0_conditioning,
            config.timelag.allowed_range)
        # Duration predictions
        duration_binary_dict, duration_continuous_dict, duration_pitch_indices, _ \
            = load_qst(config.duration.question_path)
        durations = predict_duration(
            device, labels, duration_model, duration_config,
            duration_in_scaler, duration_out_scaler, lag,
            duration_binary_dict, duration_continuous_dict,
            duration_pitch_indices, log_f0_conditioning)
        # Normalize phoneme durations
        duration_modified_labels = postprocess_duration(labels, durations, lag)

    acoustic_binary_dict, acoustic_continuous_dict, acoustic_pitch_indices, \
        acoustic_pitch_idx = load_qst(config.acoustic.question_path)
    # Predict acoustic features
    acoustic_features = predict_acoustic(
        device, duration_modified_labels, acoustic_model, acoustic_config,
        acoustic_in_scaler, acoustic_out_scaler, acoustic_binary_dict,
        acoustic_continuous_dict, config.acoustic.subphone_features,
        acoustic_pitch_indices, log_f0_conditioning)

    # Generate f0, mgc, bap, waveform
    f0, mgc, bap, generated_waveform = gen_waveform(
        duration_modified_labels, acoustic_features, acoustic_binary_dict,
        acoustic_continuous_dict, acoustic_config.stream_sizes,
        acoustic_config.has_dynamic_features,
        config.acoustic.subphone_features, log_f0_conditioning,
        acoustic_pitch_idx, acoustic_config.num_windows,
        config.acoustic.post_filter, config.sample_rate, config.frame_period,
        config.acoustic.relative_f0)

    return duration_modified_labels, f0, mgc, bap, generated_waveform
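# A hedged sketch of consuming the tuple returned by synthesis() above; the
# output file names are hypothetical. The timing labels can be written back
# out as a .lab file and the features as CSVs for inspection or reuse.
def _example_save_synthesis_outputs(config, device, label_path, *models):
    import numpy as np
    from scipy.io import wavfile

    duration_modified_labels, f0, mgc, bap, wav = synthesis(
        config, device, label_path, *models)

    # HTSLabelFile serializes to HTS full-label text via str()
    with open('timing.lab', 'w') as f:
        f.write(str(duration_modified_labels))
    np.savetxt('f0.csv', f0, delimiter=',')
    np.savetxt('mgc.csv', mgc, delimiter=',')
    np.savetxt('bap.csv', bap, delimiter=',')
    wavfile.write('out.wav', config.sample_rate,
                  np.clip(wav, -32768, 32767).astype(np.int16))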
def synthesis(
    config,
    device,
    label_path,
    question_path,
    timelag_model,
    timelag_config,
    timelag_in_scaler,
    timelag_out_scaler,
    duration_model,
    duration_config,
    duration_in_scaler,
    duration_out_scaler,
    acoustic_model,
    acoustic_config,
    acoustic_in_scaler,
    acoustic_out_scaler,
):
    # load labels and question
    labels = hts.load(label_path).round_()
    binary_dict, numeric_dict = hts.load_question_set(question_path,
                                                      append_hat_for_LL=False)

    # pitch indices in the input features
    # TODO: configurable
    pitch_idx = len(binary_dict) + 1
    pitch_indices = np.arange(len(binary_dict), len(binary_dict) + 3)

    log_f0_conditioning = config.log_f0_conditioning

    # Clipping settings
    # set to True by default for backward compatibility
    timelag_clip_input_features = (config.timelag.force_clip_input_features
                                   if "force_clip_input_features"
                                   in config.timelag else True)
    duration_clip_input_features = (config.duration.force_clip_input_features
                                    if "force_clip_input_features"
                                    in config.duration else True)
    acoustic_clip_input_features = (config.acoustic.force_clip_input_features
                                    if "force_clip_input_features"
                                    in config.acoustic else True)

    if config.ground_truth_duration:
        # Use provided alignment
        duration_modified_labels = labels
    else:
        # Time-lag predictions
        lag = predict_timelag(
            device,
            labels,
            timelag_model,
            timelag_config,
            timelag_in_scaler,
            timelag_out_scaler,
            binary_dict,
            numeric_dict,
            pitch_indices,
            log_f0_conditioning,
            config.timelag.allowed_range,
            config.timelag.allowed_range_rest,
            timelag_clip_input_features,
        )
        # Duration predictions
        durations = predict_duration(
            device,
            labels,
            duration_model,
            duration_config,
            duration_in_scaler,
            duration_out_scaler,
            binary_dict,
            numeric_dict,
            pitch_indices,
            log_f0_conditioning,
            duration_clip_input_features,
        )
        # Normalize phoneme durations
        duration_modified_labels = postprocess_duration(labels, durations, lag)

    # Predict acoustic features
    acoustic_features = predict_acoustic(
        device,
        duration_modified_labels,
        acoustic_model,
        acoustic_config,
        acoustic_in_scaler,
        acoustic_out_scaler,
        binary_dict,
        numeric_dict,
        config.acoustic.subphone_features,
        pitch_indices,
        log_f0_conditioning,
        acoustic_clip_input_features,
    )

    # Generate WORLD parameters
    mgc, lf0, vuv, bap = gen_spsvs_static_features(
        duration_modified_labels,
        acoustic_features,
        binary_dict,
        numeric_dict,
        acoustic_config.stream_sizes,
        acoustic_config.has_dynamic_features,
        config.acoustic.subphone_features,
        pitch_idx,
        acoustic_config.num_windows,
        config.frame_period,
        config.acoustic.relative_f0,
        config.vibrato_scale,
    )

    if config.acoustic.post_filter:
        alpha = pysptk.util.mcepalpha(config.sample_rate)
        mgc = merlin_post_filter(mgc, alpha)

    f0, spectrogram, aperiodicity = gen_world_params(mgc, lf0, vuv, bap,
                                                     config.sample_rate)
    wav = pyworld.synthesize(f0, spectrogram, aperiodicity,
                             config.sample_rate, config.frame_period)

    return wav
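# A small self-contained sketch of the backward-compatible clipping defaults
# used in synthesis() above: configs written before force_clip_input_features
# existed silently fall back to True. The YAML fragment is a hypothetical
# example, not an actual config from this repository.
def _example_clip_defaults():
    from omegaconf import OmegaConf

    config = OmegaConf.create("""
    timelag: {allowed_range: [-20, 19]}
    duration: {force_clip_input_features: false}
    """)
    # Same fallback expression as in synthesis() above
    timelag_clip = (config.timelag.force_clip_input_features
                    if "force_clip_input_features" in config.timelag else True)
    duration_clip = (config.duration.force_clip_input_features
                     if "force_clip_input_features" in config.duration
                     else True)
    assert timelag_clip is True    # key absent -> defaults to True
    assert duration_clip is False  # key present -> honored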