def __init__(self, utt_list, wav_root, label_root, question_path,
             use_harvest=True, f0_floor=150, f0_ceil=700, frame_period=5,
             mgc_order=59, num_windows=3, relative_f0=True,
             interp_unvoiced_aperiodicity=True):
    self.utt_list = utt_list
    self.wav_root = wav_root
    self.label_root = label_root
    self.binary_dict, self.continuous_dict = hts.load_question_set(
        question_path, append_hat_for_LL=False)
    self.pitch_idx = len(self.binary_dict) + 1
    self.use_harvest = use_harvest
    self.f0_floor = f0_floor
    self.f0_ceil = f0_ceil
    self.frame_period = frame_period
    self.mgc_order = mgc_order
    self.relative_f0 = relative_f0
    self.interp_unvoiced_aperiodicity = interp_unvoiced_aperiodicity
    self.windows = get_windows(num_windows)

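
# A minimal sketch of the window coefficients that get_windows(3) is
# expected to produce: static, delta, and delta-delta windows in the
# (left context, right context, coefficients) format used by nnmnkwii's
# parameter generation. This is an illustrative assumption for reference,
# not the library's verbatim implementation.
import numpy as np

example_windows = [
    (0, 0, np.array([1.0])),             # static
    (1, 1, np.array([-0.5, 0.0, 0.5])),  # delta
    (1, 1, np.array([1.0, -2.0, 1.0])),  # delta-delta (acceleration)
]
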
def _gen_static_features(model, model_config, in_feats, out_scaler):
    if model.prediction_type() == PredictionType.PROBABILISTIC:
        max_mu, max_sigma = model.inference(in_feats, [in_feats.shape[1]])
        if np.any(model_config.has_dynamic_features):
            # Apply denormalization
            # (B, T, D_out) -> (T, D_out)
            max_sigma_sq = (max_sigma.squeeze(0).cpu().data.numpy()**2
                            * out_scaler.var_)
            max_mu = out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy())
            # Apply MLPG
            # (T, D_out) -> (T, static_dim)
            out_feats = multi_stream_mlpg(
                max_mu,
                max_sigma_sq,
                get_windows(model_config.num_windows),
                model_config.stream_sizes,
                model_config.has_dynamic_features,
            )
        else:
            # (T, D_out)
            out_feats = max_mu.squeeze(0).cpu().data.numpy()
            out_feats = out_scaler.inverse_transform(out_feats)
    else:
        out_feats = (model.inference(
            in_feats, [in_feats.shape[1]]).squeeze(0).cpu().data.numpy())
        out_feats = out_scaler.inverse_transform(out_feats)

        # Apply MLPG if necessary
        if np.any(model_config.has_dynamic_features):
            out_feats = multi_stream_mlpg(
                out_feats,
                out_scaler.var_,
                get_windows(model_config.num_windows),
                model_config.stream_sizes,
                model_config.has_dynamic_features,
            )

    return out_feats.astype(np.float32)

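
# A conceptual sketch of what multi_stream_mlpg does, assuming the
# nnmnkwii MLPG implementation: each stream that carries dynamic features
# is smoothed back to its static dimensionality, while static-only streams
# (e.g. V/UV) pass through unchanged. The helper name and the variance
# tiling are assumptions for illustration, not nnsvs's exact code.
import numpy as np
from nnmnkwii.paramgen import mlpg


def mlpg_per_stream_sketch(feats, variances, windows, stream_sizes,
                           has_dynamic_features):
    # feats: (T, D_out), variances: (D_out,) per-dimension variances
    start, outs = 0, []
    for size, has_dyn in zip(stream_sizes, has_dynamic_features):
        stream = feats[:, start:start + size]
        if has_dyn:
            # Tile the per-dimension variances over time for MLPG
            var_frames = np.tile(variances[start:start + size],
                                 (len(feats), 1))
            outs.append(mlpg(stream, var_frames, windows))
        else:
            outs.append(stream)
        start += size
    return np.concatenate(outs, axis=-1)
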
def my_app(config: DictConfig) -> None:
    global logger
    logger = getLogger(config.verbose)
    logger.info(OmegaConf.to_yaml(config))

    device = torch.device("cuda" if use_cuda else "cpu")
    in_dir = to_absolute_path(config.in_dir)
    out_dir = to_absolute_path(config.out_dir)
    os.makedirs(out_dir, exist_ok=True)

    model_config = OmegaConf.load(to_absolute_path(config.model.model_yaml))
    model = hydra.utils.instantiate(model_config.netG).to(device)
    checkpoint = torch.load(to_absolute_path(config.model.checkpoint),
                            map_location=lambda storage, loc: storage)
    model.load_state_dict(checkpoint["state_dict"])

    scaler = joblib.load(to_absolute_path(config.out_scaler_path))
    in_feats = FileSourceDataset(NpyFileSource(in_dir))

    with torch.no_grad():
        for idx in tqdm(range(len(in_feats))):
            feats = torch.from_numpy(in_feats[idx]).unsqueeze(0).to(device)
            out = model(feats, [feats.shape[1]]).squeeze(0).cpu().data.numpy()
            out = scaler.inverse_transform(out)

            # Apply MLPG if necessary
            if np.any(model_config.has_dynamic_features):
                windows = get_windows(3)
                out = multi_stream_mlpg(
                    out, scaler.var_, windows, model_config.stream_sizes,
                    model_config.has_dynamic_features)

            name = basename(in_feats.collected_files[idx][0])
            out_path = join(out_dir, name)
            np.save(out_path, out, allow_pickle=False)

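
# For context, a minimal stand-in for NpyFileSource, assuming the nnmnkwii
# FileDataSource interface (collect_files/collect_features) that
# FileSourceDataset consumes. The class name and glob pattern here are
# hypothetical.
from glob import glob
from os.path import join

import numpy as np
from nnmnkwii.datasets import FileDataSource


class NpyFileSourceSketch(FileDataSource):
    def __init__(self, data_root):
        self.data_root = data_root

    def collect_files(self):
        # One .npy feature file per utterance
        return sorted(glob(join(self.data_root, "*.npy")))

    def collect_features(self, path):
        return np.load(path)
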
def gen_waveform(labels, acoustic_features, binary_dict, continuous_dict,
                 stream_sizes, has_dynamic_features,
                 subphone_features="coarse_coding", log_f0_conditioning=True,
                 pitch_idx=None, num_windows=3, post_filter=True,
                 sample_rate=48000, frame_period=5, relative_f0=True):
    windows = get_windows(num_windows)

    # Apply MLPG if necessary
    if np.any(has_dynamic_features):
        static_stream_sizes = get_static_stream_sizes(stream_sizes,
                                                      has_dynamic_features,
                                                      len(windows))
    else:
        static_stream_sizes = stream_sizes

    # Split multi-stream features
    mgc, target_f0, vuv, bap = split_streams(acoustic_features,
                                             static_stream_sizes)

    # Generate waveform with the WORLD vocoder
    fftlen = pyworld.get_cheaptrick_fft_size(sample_rate)
    alpha = pysptk.util.mcepalpha(sample_rate)

    if post_filter:
        mgc = merlin_post_filter(mgc, alpha)

    spectrogram = pysptk.mc2sp(mgc, fftlen=fftlen, alpha=alpha)
    aperiodicity = pyworld.decode_aperiodicity(bap.astype(np.float64),
                                               sample_rate, fftlen)
    # Fill aperiodicity with ones for unvoiced regions
    aperiodicity[vuv.reshape(-1) < 0.5, :] = 1.0
    # WORLD fails catastrophically for out-of-range aperiodicity
    aperiodicity = np.clip(aperiodicity, 0.0, 1.0)

    ### F0 ###
    if relative_f0:
        diff_lf0 = target_f0
        # Need to extract the pitch sequence from the musical score
        linguistic_features = fe.linguistic_features(
            labels, binary_dict, continuous_dict,
            add_frame_features=True, subphone_features=subphone_features)
        f0_score = _midi_to_hz(linguistic_features, pitch_idx, False)[:, None]
        lf0_score = f0_score.copy()
        nonzero_indices = np.nonzero(lf0_score)
        lf0_score[nonzero_indices] = np.log(f0_score[nonzero_indices])
        lf0_score = interp1d(lf0_score, kind="slinear")

        f0 = diff_lf0 + lf0_score
        f0[vuv < 0.5] = 0
        f0[np.nonzero(f0)] = np.exp(f0[np.nonzero(f0)])
    else:
        f0 = target_f0
        f0[vuv < 0.5] = 0
        f0[np.nonzero(f0)] = np.exp(f0[np.nonzero(f0)])

    generated_waveform = pyworld.synthesize(f0.flatten().astype(np.float64),
                                            spectrogram.astype(np.float64),
                                            aperiodicity.astype(np.float64),
                                            sample_rate, frame_period)

    # Lower the volume (to prevent clipping)
    # TODO: tune this scaling constant properly
    spectrogram *= 0.000000001
    sp = pyworld.code_spectral_envelope(spectrogram, sample_rate, 60)

    return f0, sp, bap, generated_waveform

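
# A hypothetical usage sketch of gen_waveform: load an HTS full-context
# label file and predicted acoustic features, synthesize, and write the
# result to disk. The paths and the binary_dict/continuous_dict/stream
# configuration are assumed to come from the surrounding recipe; the
# helper name is made up for illustration.
import numpy as np
import soundfile as sf
from nnmnkwii.io import hts


def synthesize_one_sketch(label_path, feats_path, binary_dict,
                          continuous_dict, stream_sizes,
                          has_dynamic_features, pitch_idx,
                          sample_rate=48000):
    labels = hts.load(label_path)
    acoustic_features = np.load(feats_path)
    f0, sp, bap, wav = gen_waveform(
        labels, acoustic_features, binary_dict, continuous_dict,
        stream_sizes, has_dynamic_features, pitch_idx=pitch_idx,
        sample_rate=sample_rate)
    # Normalize to avoid clipping before writing to disk
    sf.write("out.wav", wav / np.abs(wav).max() * 0.9, sample_rate)
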
def my_app(config: DictConfig) -> None:
    global logger
    logger = getLogger(config.verbose)
    logger.info(OmegaConf.to_yaml(config))

    device = torch.device("cuda" if use_cuda else "cpu")
    in_dir = to_absolute_path(config.in_dir)
    out_dir = to_absolute_path(config.out_dir)
    os.makedirs(out_dir, exist_ok=True)

    model_config = OmegaConf.load(to_absolute_path(config.model.model_yaml))
    model = hydra.utils.instantiate(model_config.netG).to(device)
    checkpoint = torch.load(to_absolute_path(config.model.checkpoint),
                            map_location=lambda storage, loc: storage)
    model.load_state_dict(checkpoint["state_dict"])

    scaler = joblib.load(to_absolute_path(config.out_scaler_path))
    in_feats = FileSourceDataset(NpyFileSource(in_dir))

    with torch.no_grad():
        for idx in tqdm(range(len(in_feats))):
            feats = torch.from_numpy(in_feats[idx]).unsqueeze(0).to(device)
            if model.prediction_type() == PredictionType.PROBABILISTIC:
                max_mu, max_sigma = model.inference(feats, [feats.shape[1]])
                if np.any(model_config.has_dynamic_features):
                    # Apply denormalization
                    # (B, T, D_out) -> (T, D_out)
                    max_sigma_sq = (max_sigma.squeeze(0).cpu().data.numpy()**2
                                    * scaler.var_)
                    max_mu = scaler.inverse_transform(
                        max_mu.squeeze(0).cpu().data.numpy())
                    # Apply MLPG
                    # (T, D_out) -> (T, static_dim)
                    out = multi_stream_mlpg(
                        max_mu, max_sigma_sq,
                        get_windows(model_config.num_windows),
                        model_config.stream_sizes,
                        model_config.has_dynamic_features)
                else:
                    # (T, D_out)
                    out = max_mu.squeeze(0).cpu().data.numpy()
                    out = scaler.inverse_transform(out)
            else:
                out = model.inference(
                    feats, [feats.shape[1]]).squeeze(0).cpu().data.numpy()
                out = scaler.inverse_transform(out)

                # Apply MLPG if necessary
                if np.any(model_config.has_dynamic_features):
                    out = multi_stream_mlpg(
                        out, scaler.var_,
                        get_windows(model_config.num_windows),
                        model_config.stream_sizes,
                        model_config.has_dynamic_features)

            name = basename(in_feats.collected_files[idx][0])
            out_path = join(out_dir, name)
            np.save(out_path, out, allow_pickle=False)

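
# When prediction_type() is PROBABILISTIC, model.inference() above is
# assumed to return the mean and scale of the most probable mixture
# component. A minimal sketch of that selection, given MDN outputs
# log_pi (B, T, G), sigma and mu (B, T, G, D); the helper name is
# hypothetical, and argmax over weights equals argmax over log-weights.
import torch


def most_probable_sigma_mu_sketch(log_pi, sigma, mu):
    # Index of the most probable Gaussian component per frame: (B, T)
    _, max_idx = torch.max(log_pi, dim=2)
    # Expand the index to gather a (B, T, 1, D) slice of mu/sigma
    idx = max_idx[:, :, None, None].expand(-1, -1, 1, mu.shape[-1])
    max_mu = torch.gather(mu, 2, idx).squeeze(2)        # (B, T, D)
    max_sigma = torch.gather(sigma, 2, idx).squeeze(2)  # (B, T, D)
    return max_sigma, max_mu
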
def eval_spss_model(
    step,
    netG,
    in_feats,
    out_feats,
    lengths,
    model_config,
    out_scaler,
    writer,
    sr,
    trajectory_smoothing=True,
    trajectory_smoothing_cutoff=50,
):
    # make sure to be in eval mode
    netG.eval()
    is_autoregressive = (netG.module.is_autoregressive() if isinstance(
        netG, nn.DataParallel) else netG.is_autoregressive())
    prediction_type = (netG.module.prediction_type() if isinstance(
        netG, nn.DataParallel) else netG.prediction_type())
    utt_indices = [-1, -2, -3]
    utt_indices = utt_indices[:min(3, len(in_feats))]

    if np.any(model_config.has_dynamic_features):
        static_stream_sizes = get_static_stream_sizes(
            model_config.stream_sizes,
            model_config.has_dynamic_features,
            model_config.num_windows,
        )
    else:
        static_stream_sizes = model_config.stream_sizes

    for utt_idx in utt_indices:
        out_feats_denorm_ = out_scaler.inverse_transform(
            out_feats[utt_idx, :lengths[utt_idx]].unsqueeze(0))
        mgc, lf0, vuv, bap = get_static_features(
            out_feats_denorm_,
            model_config.num_windows,
            model_config.stream_sizes,
            model_config.has_dynamic_features,
        )[:4]
        mgc = mgc.squeeze(0).cpu().numpy()
        lf0 = lf0.squeeze(0).cpu().numpy()
        vuv = vuv.squeeze(0).cpu().numpy()
        bap = bap.squeeze(0).cpu().numpy()

        f0, spectrogram, aperiodicity = gen_world_params(
            mgc, lf0, vuv, bap, sr)
        wav = pyworld.synthesize(f0, spectrogram, aperiodicity, sr, 5)
        group = f"utt{np.abs(utt_idx)}_reference"
        wav = wav / np.abs(wav).max() if np.max(wav) > 1.0 else wav
        writer.add_audio(group, wav, step, sr)

        # Run forward
        if is_autoregressive:
            outs = netG(
                in_feats[utt_idx, :lengths[utt_idx]].unsqueeze(0),
                [lengths[utt_idx]],
                out_feats[utt_idx, :lengths[utt_idx]].unsqueeze(0),
            )
        else:
            outs = netG(in_feats[utt_idx, :lengths[utt_idx]].unsqueeze(0),
                        [lengths[utt_idx]])

        # ResF0 case
        if isinstance(outs, tuple) and len(outs) == 2:
            outs, _ = outs

        if prediction_type == PredictionType.PROBABILISTIC:
            pi, sigma, mu = outs
            pred_out_feats = mdn_get_most_probable_sigma_and_mu(
                pi, sigma, mu)[1]
        else:
            pred_out_feats = outs

        # NOTE: multiple outputs
        if isinstance(pred_out_feats, list):
            pred_out_feats = pred_out_feats[-1]
        if isinstance(pred_out_feats, tuple):
            pred_out_feats = pred_out_feats[0]
        if not isinstance(pred_out_feats, list):
            pred_out_feats = [pred_out_feats]

        # Run inference
        if prediction_type == PredictionType.PROBABILISTIC:
            inference_out_feats, _ = netG.inference(
                in_feats[utt_idx, :lengths[utt_idx]].unsqueeze(0),
                [lengths[utt_idx]])
        else:
            inference_out_feats = netG.inference(
                in_feats[utt_idx, :lengths[utt_idx]].unsqueeze(0),
                [lengths[utt_idx]])
        pred_out_feats.append(inference_out_feats)

        # Plot normalized input/output
        in_feats_ = in_feats[utt_idx, :lengths[utt_idx]].cpu().numpy()
        out_feats_ = out_feats[utt_idx, :lengths[utt_idx]].cpu().numpy()
        fig, ax = plt.subplots(3, 1, figsize=(8, 8))
        ax[0].set_title("Reference features")
        ax[1].set_title("Input features")
        ax[2].set_title("Predicted features")
        mesh = librosa.display.specshow(out_feats_.T, x_axis="frames",
                                        y_axis="frames", ax=ax[0],
                                        cmap="viridis")
        # NOTE: assuming normalized to N(0, 1)
        mesh.set_clim(-4, 4)
        fig.colorbar(mesh, ax=ax[0])
        mesh = librosa.display.specshow(in_feats_.T, x_axis="frames",
                                        y_axis="frames", ax=ax[1],
                                        cmap="viridis")
        mesh.set_clim(-4, 4)
        fig.colorbar(mesh, ax=ax[1])
        mesh = librosa.display.specshow(
            inference_out_feats.squeeze(0).cpu().numpy().T,
            x_axis="frames",
            y_axis="frames",
            ax=ax[2],
            cmap="viridis",
        )
        mesh.set_clim(-4, 4)
        fig.colorbar(mesh, ax=ax[2])
        for ax_ in ax:
            ax_.set_ylabel("Feature")
        plt.tight_layout()
        group = f"utt{np.abs(utt_idx)}_inference"
        writer.add_figure(f"{group}/Input-Output", fig, step)
        plt.close()

        assert len(pred_out_feats) == 2
        for idx, pred_out_feats_ in enumerate(pred_out_feats):
            pred_out_feats_ = pred_out_feats_.squeeze(0).cpu().numpy()
            pred_out_feats_denorm = (out_scaler.inverse_transform(
                torch.from_numpy(pred_out_feats_).to(
                    in_feats.device)).cpu().numpy())
            if np.any(model_config.has_dynamic_features):
                # (T, D_out) -> (T, static_dim)
                pred_out_feats_denorm = multi_stream_mlpg(
                    pred_out_feats_denorm,
                    (out_scaler.scale_**2).cpu().numpy(),
                    get_windows(model_config.num_windows),
                    model_config.stream_sizes,
                    model_config.has_dynamic_features,
                )
            pred_mgc, pred_lf0, pred_vuv, pred_bap = split_streams(
                pred_out_feats_denorm, static_stream_sizes)[:4]

            # Remove high-frequency components of mgc/bap
            # NOTE: It seems to be effective to suppress artifacts of
            # GAN-based post-filtering
            if trajectory_smoothing:
                modfs = int(1 / 0.005)
                for d in range(pred_mgc.shape[1]):
                    pred_mgc[:, d] = lowpass_filter(
                        pred_mgc[:, d], modfs,
                        cutoff=trajectory_smoothing_cutoff)
                for d in range(pred_bap.shape[1]):
                    pred_bap[:, d] = lowpass_filter(
                        pred_bap[:, d], modfs,
                        cutoff=trajectory_smoothing_cutoff)

            # Generated sample
            f0, spectrogram, aperiodicity = gen_world_params(
                pred_mgc, pred_lf0, pred_vuv, pred_bap, sr)
            wav = pyworld.synthesize(f0, spectrogram, aperiodicity, sr, 5)
            wav = wav / np.abs(wav).max() if np.max(wav) > 1.0 else wav
            if idx == 1:
                group = f"utt{np.abs(utt_idx)}_inference"
            else:
                group = f"utt{np.abs(utt_idx)}_forward"
            writer.add_audio(group, wav, step, sr)
            plot_spsvs_params(
                step,
                writer,
                mgc,
                lf0,
                vuv,
                bap,
                pred_mgc,
                pred_lf0,
                pred_vuv,
                pred_bap,
                group=group,
                sr=sr,
            )

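
# A minimal stand-in for the lowpass_filter used in the trajectory
# smoothing above, assuming a zero-phase Butterworth design: the feature
# trajectory is sampled at the frame modulation frequency (200 Hz for a
# 5 ms shift) and components above `cutoff` Hz are removed. The filter
# order and padding behavior here are assumptions, not nnsvs's exact
# implementation.
from scipy.signal import butter, filtfilt


def lowpass_filter_sketch(x, fs, cutoff=50, order=5):
    # Normalized cutoff in (0, 1) relative to the Nyquist frequency
    norm_cutoff = cutoff / (0.5 * fs)
    b, a = butter(order, norm_cutoff, btype="low")
    # filtfilt runs the filter forward and backward -> zero phase lag
    return filtfilt(b, a, x)
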