def _prepare_voc_features(
    in_dir,
    out_dir,
    utt_id,
    num_windows,
    stream_sizes,
    has_dynamic_features,
) -> None:
    """Create vocoder training inputs for a single utterance.

    Loads ``<in_dir>/<utt_id>-feats.npy``, extracts the static features,
    stacks the first four streams and writes them as float32 to
    ``<out_dir>/<utt_id>-feats.npy``. A symlink to the input waveform
    (``<in_dir>/<utt_id>-wave.npy``) is then placed next to the saved features.

    Args:
        in_dir: directory containing ``-feats.npy`` and ``-wave.npy`` files.
        out_dir: directory receiving the features and the waveform symlink.
        utt_id: utterance identifier used as the file-name prefix.
        num_windows: forwarded to ``get_static_features``.
        stream_sizes: forwarded to ``get_static_features``.
        has_dynamic_features: forwarded to ``get_static_features``; at least
            one entry must be truthy.

    Raises:
        AssertionError: if the waveform file is missing, if no stream has
            dynamic features, or if fewer than four streams are returned.
    """
    feats = np.load(join(in_dir, utt_id + "-feats.npy"))
    in_wave_path = join(in_dir, utt_id + "-wave.npy")
    assert exists(in_wave_path)
    assert np.any(has_dynamic_features)

    streams = get_static_features(
        feats.reshape(1, -1, feats.shape[-1]),
        num_windows,
        stream_sizes,
        has_dynamic_features,
    )
    # remove batch-axis
    streams = [stream.squeeze(0) for stream in streams]

    # NOTE: even if the number of streams are larger than 4, we only use the
    # first 4 streams for training neural vocoders
    assert len(streams) >= 4
    mgc, lf0, vuv, bap = streams[0], streams[1], streams[2], streams[3]
    voc_feats = np.hstack((mgc, lf0, vuv, bap)).astype(np.float32)

    voc_feats_path = join(out_dir, utt_id + "-feats.npy")
    np.save(voc_feats_path, voc_feats, allow_pickle=False)

    # NOTE: To train vocoders with https://github.com/kan-bayashi/ParallelWaveGAN
    # target waveform needs to be created in the same directory as the vocoder
    # input features.
    save_wave_path = join(out_dir, utt_id + "-wave.npy")
    # A dangling link (e.g. left by an earlier run that linked a relative
    # path) is invisible to exists() but would make os.symlink() fail.
    if os.path.islink(save_wave_path) and not exists(save_wave_path):
        os.remove(save_wave_path)
    if not exists(save_wave_path):
        # A relative link target is resolved relative to the directory that
        # holds the link, not the current working directory, so always link
        # to the absolute path of the source waveform.
        os.symlink(os.path.abspath(in_wave_path), save_wave_path)
def _extract_static_features(
    in_dir,
    out_dir,
    utt_id,
    num_windows,
    stream_sizes,
    has_dynamic_features,
) -> None:
    """Save the static part of one utterance's features.

    Reads ``<in_dir>/<utt_id>-feats.npy``, runs ``get_static_features`` on it
    (with a temporary leading batch axis), concatenates all returned streams
    along the last axis and stores the float32 result in
    ``<out_dir>/<utt_id>-feats.npy``.

    Raises:
        AssertionError: if no entry of ``has_dynamic_features`` is truthy.
    """
    file_name = utt_id + "-feats.npy"
    feats = np.load(join(in_dir, file_name))
    assert np.any(has_dynamic_features)

    # get_static_features is called on a batched array: add a batch axis here
    batched_feats = feats.reshape(1, -1, feats.shape[-1])
    batched_streams = get_static_features(
        batched_feats,
        num_windows,
        stream_sizes,
        has_dynamic_features,
    )

    # ... and drop the batch axis again on every stream
    unbatched_streams = [stream.squeeze(0) for stream in batched_streams]

    static_feats = np.concatenate(unbatched_streams, axis=-1).astype(np.float32)
    np.save(join(out_dir, file_name), static_feats, allow_pickle=False)
mean = scaler.mean_ scale = scaler.scale_ var = scaler.var_ out_dir = Path(args.out_dir) out_dir.mkdir(parents=True, exist_ok=True) stream_sizes, has_dynamic_features = get_world_stream_info( args.sample_rate, args.mgc_order, args.num_windows, args.vibrato_mode) print(f"Converting {input_file} mean/scale npy files") lf0_params = {} for name, in_feats in [("mean", mean), ("scale", scale), ("var", var)]: streams = get_static_features( in_feats.reshape(1, -1, in_feats.shape[-1]), args.num_windows, stream_sizes, has_dynamic_features, ) # NOTE: use up to 4 streams # [mgc, lf0, bap, vuv] streams = list(map(lambda x: x.reshape(-1), streams))[:4] lf0_params[name] = float(streams[1]) out_feats = np.concatenate(streams) print(f"[{name}] dim: {in_feats.shape} -> {out_feats.shape}") out_path = out_dir / (out_file_name + f"_{name}.npy") np.save(out_path, out_feats, allow_pickle=False) print(f""" If you are going to train NSF-based vocoders, please set the following parameters:
def train_step(
    model_config,
    optim_config,
    netG,
    optG,
    netD,
    optD,
    grad_scaler,
    train,
    in_feats,
    out_feats,
    lengths,
    out_scaler,
    feats_criterion="mse",
    pitch_reg_dyn_ws=1.0,
    pitch_reg_weight=1.0,
    adv_weight=1.0,
    adv_streams=None,
    fm_weight=0.0,
    adv_use_static_feats_only=True,
    mask_nth_mgc_for_adv_loss=0,
    gan_type="lsgan",
):
    """Run one GAN step: discriminator update followed by generator update.

    Args:
        model_config: provides ``num_windows``, ``stream_sizes`` and
            ``has_dynamic_features``.
        optim_config: provides ``netD.clip_norm`` and ``netG.clip_norm``.
        netG: generator; called as ``netG(in_feats, lengths)`` and expected to
            return ``(pred_out_feats, lf0_residual)``.
        optG: optimizer for ``netG``.
        netD: discriminator; must return a list of lists whose last inner
            element is the score used for the adversarial losses and whose
            other elements are used for feature matching.
        optD: optimizer for ``netD``.
        grad_scaler: gradient scaler, or None. When not None, autocast is
            enabled and losses are scaled before backward.
        train (bool): if True, parameters are updated; otherwise both networks
            are put in eval mode and only losses/metrics are computed.
        in_feats: generator (and discriminator conditioning) input.
        out_feats: target features.
        lengths: sequence lengths used to build the non-padding mask.
        out_scaler: forwarded to ``compute_distortions``.
        feats_criterion (str): "l2"/"mse" or "l1"/"mae".
        pitch_reg_dyn_ws: weights multiplied with ``|lf0_residual|``.
        pitch_reg_weight (float): weight of the pitch regularization loss.
        adv_weight (float): weight of the adversarial loss.
        adv_streams: stream selection forwarded to ``get_static_features`` /
            ``select_streams``.
        fm_weight (float): weight of the feature matching loss (0 disables it).
        adv_use_static_feats_only (bool): feed only static features to netD.
        mask_nth_mgc_for_adv_loss (int): number of leading feature dimensions
            dropped from the discriminator input.
        gan_type (str): "lsgan" or "vanilla-gan".

    Returns:
        tuple: ``(loss, log_metrics)`` where ``loss`` is the generator loss
        and ``log_metrics`` is a dict of scalars to log.

    Raises:
        RuntimeError: if ``feats_criterion`` is not supported.
        ValueError: if ``gan_type`` is unknown.
        AssertionError: if the generator is probabilistic or the
            discriminator output is not a list of lists.
    """
    netG.train() if train else netG.eval()
    netD.train() if train else netD.eval()

    log_metrics = {}

    if feats_criterion in ["l2", "mse"]:
        criterion = nn.MSELoss(reduction="none")
    elif feats_criterion in ["l1", "mae"]:
        criterion = nn.L1Loss(reduction="none")
    else:
        raise RuntimeError("not supported criterion")

    prediction_type = (
        netG.module.prediction_type()
        if isinstance(netG, nn.DataParallel)
        else netG.prediction_type()
    )
    # NOTE: it is not trivial to adapt GAN for probabilistic models
    assert prediction_type != PredictionType.PROBABILISTIC

    # Apply preprocess if required (e.g., FIR filter for shallow AR)
    # defaults to no-op
    if isinstance(netG, nn.DataParallel):
        out_feats = netG.module.preprocess_target(out_feats)
    else:
        out_feats = netG.preprocess_target(out_feats)

    # Run forward
    with autocast(enabled=grad_scaler is not None):
        pred_out_feats, lf0_residual = netG(in_feats, lengths)

    # Select streams for computing adversarial loss
    if adv_use_static_feats_only:
        real_netD_in_feats = torch.cat(
            get_static_features(
                out_feats,
                model_config.num_windows,
                model_config.stream_sizes,
                model_config.has_dynamic_features,
                adv_streams,
            ),
            dim=-1,
        )
        fake_netD_in_feats = torch.cat(
            get_static_features(
                pred_out_feats,
                model_config.num_windows,
                model_config.stream_sizes,
                model_config.has_dynamic_features,
                adv_streams,
            ),
            dim=-1,
        )
    else:
        real_netD_in_feats = select_streams(
            out_feats, model_config.stream_sizes, adv_streams
        )
        fake_netD_in_feats = select_streams(
            pred_out_feats,
            model_config.stream_sizes,
            adv_streams,
        )

    # Ref: http://sython.org/papers/ASJ/saito2017asja.pdf
    # 0-th mgc with adversarial training affects speech quality
    # NOTE: assuming that the first stream contains mgc
    if mask_nth_mgc_for_adv_loss > 0:
        real_netD_in_feats = real_netD_in_feats[:, :, mask_nth_mgc_for_adv_loss:]
        fake_netD_in_feats = fake_netD_in_feats[:, :, mask_nth_mgc_for_adv_loss:]

    # Real
    with autocast(enabled=grad_scaler is not None):
        D_real = netD(real_netD_in_feats, in_feats, lengths)
        # NOTE: must be list of list to support multi-scale discriminators
        assert isinstance(D_real, list) and isinstance(D_real[-1], list)
        # Fake (detached: the discriminator update must not reach netG)
        D_fake_det = netD(fake_netD_in_feats.detach(), in_feats, lengths)

    # Mask (B, T, 1)
    mask = make_non_pad_mask(lengths).unsqueeze(-1).to(in_feats.device)

    # Update discriminator
    eps = 1e-14
    loss_real = 0
    loss_fake = 0

    with autocast(enabled=grad_scaler is not None):
        for idx, (D_real_, D_fake_det_) in enumerate(zip(D_real, D_fake_det)):
            if gan_type == "lsgan":
                loss_real_ = (D_real_[-1] - 1) ** 2
                loss_fake_ = D_fake_det_[-1] ** 2
            elif gan_type == "vanilla-gan":
                loss_real_ = -torch.log(D_real_[-1] + eps)
                loss_fake_ = -torch.log(1 - D_fake_det_[-1] + eps)
            else:
                raise ValueError(f"Unknown gan type: {gan_type}")

            # mask for D
            D_mask = _select_discriminator_mask(
                netD, mask, D_real_[-1], out_feats.shape[1]
            )
            if D_mask is not None:
                loss_real_ = loss_real_.masked_select(D_mask).mean()
                loss_fake_ = loss_fake_.masked_select(D_mask).mean()
            else:
                loss_real_ = loss_real_.mean()
                loss_fake_ = loss_fake_.mean()

            log_metrics[f"Loss_Real_Scale{idx}"] = loss_real_.item()
            log_metrics[f"Loss_Fake_Scale{idx}"] = loss_fake_.item()

            loss_real += loss_real_
            loss_fake += loss_fake_

        loss_d = loss_real + loss_fake

    if train:
        optD.zero_grad()
        if grad_scaler is not None:
            grad_scaler.scale(loss_d).backward()
            # unscale before clipping so the norm is computed on real grads
            grad_scaler.unscale_(optD)
            grad_norm_d = torch.nn.utils.clip_grad_norm_(
                netD.parameters(), optim_config.netD.clip_norm
            )
            log_metrics["GradNorm_D"] = grad_norm_d
            grad_scaler.step(optD)
        else:
            loss_d.backward()
            grad_norm_d = torch.nn.utils.clip_grad_norm_(
                netD.parameters(), optim_config.netD.clip_norm
            )
            log_metrics["GradNorm_D"] = grad_norm_d
            optD.step()

    # Update generator
    with autocast(enabled=grad_scaler is not None):
        loss_feats = criterion(
            pred_out_feats.masked_select(mask), out_feats.masked_select(mask)
        ).mean()

        # adversarial loss
        D_fake = netD(fake_netD_in_feats, in_feats, lengths)

        loss_adv = 0
        for idx, D_fake_ in enumerate(D_fake):
            if gan_type == "lsgan":
                loss_adv_ = (1 - D_fake_[-1]) ** 2
            elif gan_type == "vanilla-gan":
                loss_adv_ = -torch.log(D_fake_[-1] + eps)
            else:
                raise ValueError(f"Unknown gan type: {gan_type}")

            # The mask must match the resolution of *this* scale's output;
            # do not rely on a variable left over from the discriminator loop.
            D_mask = _select_discriminator_mask(
                netD, mask, D_fake_[-1], out_feats.shape[1]
            )
            if D_mask is not None:
                loss_adv_ = loss_adv_.masked_select(D_mask).mean()
            else:
                loss_adv_ = loss_adv_.mean()

            log_metrics[f"Loss_Adv_Scale{idx}"] = loss_adv_.item()

            loss_adv += loss_adv_

        # Feature matching loss
        loss_fm = torch.tensor(0.0).to(in_feats.device)
        if fm_weight > 0:
            for D_fake_, D_real_ in zip(D_fake, D_real):
                for fake_fmap, real_fmap in zip(D_fake_[:-1], D_real_[:-1]):
                    loss_fm += F.l1_loss(fake_fmap, real_fmap.detach())

        # Pitch regularization
        # NOTE: l1 loss seems to be better than mse loss in my experiments
        # we could use l2 loss as suggested in the sinsy's paper
        loss_pitch = (pitch_reg_dyn_ws * lf0_residual.abs()).masked_select(mask).mean()

        loss = (
            loss_feats
            + adv_weight * loss_adv
            + pitch_reg_weight * loss_pitch
            + fm_weight * loss_fm
        )

    if train:
        optG.zero_grad()
        if grad_scaler is not None:
            grad_scaler.scale(loss).backward()
            grad_scaler.unscale_(optG)
            grad_norm_g = torch.nn.utils.clip_grad_norm_(
                netG.parameters(), optim_config.netG.clip_norm
            )
            log_metrics["GradNorm_G"] = grad_norm_g
            grad_scaler.step(optG)
        else:
            loss.backward()
            grad_norm_g = torch.nn.utils.clip_grad_norm_(
                netG.parameters(), optim_config.netG.clip_norm
            )
            log_metrics["GradNorm_G"] = grad_norm_g
            optG.step()

    # NOTE: this shouldn't be called multiple times in a training step
    if train and grad_scaler is not None:
        grad_scaler.update()

    # Metrics
    distortions = compute_distortions(
        pred_out_feats, out_feats, lengths, out_scaler, model_config
    )
    log_metrics.update(distortions)
    log_metrics.update(
        {
            "Loss": loss.item(),
            "Loss_Feats": loss_feats.item(),
            "Loss_Adv_Total": loss_adv.item(),
            "Loss_Feature_Matching": loss_fm.item(),
            "Loss_Pitch": loss_pitch.item(),
            "Loss_Real_Total": loss_real.item(),
            "Loss_Fake_Total": loss_fake.item(),
            "Loss_D": loss_d.item(),
        }
    )

    return loss, log_metrics


def _select_discriminator_mask(netD, mask, d_out, num_frames):
    """Return the non-padding mask matching the time axis of ``d_out``, or None.

    The mask is strided by ``netD.downsample_scale`` when the discriminator
    exposes that attribute and the strided length matches ``d_out``; the full
    mask is used when ``d_out`` has ``num_frames`` frames; otherwise None is
    returned and the caller averages over all elements.
    """
    if (
        hasattr(netD, "downsample_scale")
        and mask.shape[1] // netD.downsample_scale == d_out.shape[1]
    ):
        return mask[:, :: netD.downsample_scale, :]
    if d_out.shape[1] == num_frames:
        return mask
    return None
parser.add_argument("input_file", type=str, help="input file") parser.add_argument("model_config", type=str, help="model config") parser.add_argument("output_file", type=str, help="output file") return parser if __name__ == "__main__": args = get_parser().parse_args(sys.argv[1:]) model_config = OmegaConf.load(args.model_config) out_scaler = joblib.load(args.input_file) mean_ = get_static_features( out_scaler.mean_.reshape(1, 1, out_scaler.mean_.shape[-1]), model_config.num_windows, model_config.stream_sizes, model_config.has_dynamic_features, ) mean_ = np.concatenate(mean_, -1).reshape(1, -1) var_ = get_static_features( out_scaler.var_.reshape(1, 1, out_scaler.var_.shape[-1]), model_config.num_windows, model_config.stream_sizes, model_config.has_dynamic_features, ) var_ = np.concatenate(var_, -1).reshape(1, -1) scale_ = get_static_features( out_scaler.scale_.reshape(1, 1, out_scaler.scale_.shape[-1]), model_config.num_windows, model_config.stream_sizes, model_config.has_dynamic_features,
def compute_distortions(pred_out_feats, out_feats, lengths, out_scaler, model_config):
    """Measure objective distortions between prediction and ground truth.

    Both inputs are de-normalized with ``out_scaler`` and reduced to static
    features before the metrics are computed on the first four streams
    (mgc, lf0, vuv, bap).

    Args:
        pred_out_feats (nn.Tensor): predicted acoustic features
        out_feats (nn.Tensor): ground-truth acoustic features
        lengths (nn.Tensor): lengths of the sequences
        out_scaler (nn.Module): scaler to denormalize features
        model_config (dict): model configuration

    Returns:
        dict: MCD for mgc/bap and V/UV error; F0 RMSE is included unless its
        computation raises ``ZeroDivisionError``.
    """

    def _static_streams(denormalized):
        return get_static_features(
            denormalized,
            model_config.num_windows,
            model_config.stream_sizes,
            model_config.has_dynamic_features,
        )

    out_feats = out_scaler.inverse_transform(out_feats)
    pred_out_feats = out_scaler.inverse_transform(pred_out_feats)

    out_streams = _static_streams(out_feats)
    pred_out_streams = _static_streams(pred_out_feats)

    assert len(out_streams) >= 4
    mgc = out_streams[0]
    lf0 = out_streams[1]
    vuv = out_streams[2]
    bap = out_streams[3]

    pred_mgc = pred_out_streams[0]
    pred_lf0 = pred_out_streams[1]
    pred_vuv = pred_out_streams[2]
    pred_bap = pred_out_streams[3]

    # binarize vuv
    vuv = (vuv > 0.5).float()
    pred_vuv = (pred_vuv > 0.5).float()

    dist = {}
    # the 0-th mgc coefficient is left out of the MCD
    dist["ObjEval_MGC_MCD"] = metrics.melcd(
        mgc[:, :, 1:], pred_mgc[:, :, 1:], lengths=lengths
    )
    dist["ObjEval_BAP_MCD"] = metrics.melcd(bap, pred_bap, lengths=lengths) / 10.0
    dist["ObjEval_VUV_ERR"] = metrics.vuv_error(vuv, pred_vuv, lengths=lengths)

    try:
        f0_mse = metrics.lf0_mean_squared_error(
            lf0, vuv, pred_lf0, pred_vuv, lengths=lengths, linear_domain=True
        )
        dist["ObjEval_F0_RMSE"] = np.sqrt(f0_mse)
    except ZeroDivisionError:
        # best effort: the F0 entry is simply omitted in this case
        pass

    return dist
def eval_spss_model(
    step,
    netG,
    in_feats,
    out_feats,
    lengths,
    model_config,
    out_scaler,
    writer,
    sr,
    trajectory_smoothing=True,
    trajectory_smoothing_cutoff=50,
):
    """Log reference/generated audio and feature plots for a few utterances.

    For up to three utterances (the last ones of the batch) this writes to
    ``writer``: audio synthesized from the ground-truth features, audio
    synthesized from the forward pass and from ``inference``, a figure of the
    normalized features, and the plots produced by ``plot_spsvs_params``.

    Args:
        step: global step passed to the writer.
        netG: acoustic model, optionally wrapped in ``nn.DataParallel``.
        in_feats: batch of input features.
        out_feats: batch of normalized target features.
        lengths: per-utterance sequence lengths.
        model_config: provides ``num_windows``, ``stream_sizes`` and
            ``has_dynamic_features``.
        out_scaler: scaler used to de-normalize the output features.
        writer: summary writer offering ``add_audio`` and ``add_figure``.
        sr: sampling rate handed to the synthesizer and the writer.
        trajectory_smoothing (bool): low-pass filter predicted mgc/bap.
        trajectory_smoothing_cutoff: cutoff passed to ``lowpass_filter``.
    """
    # make sure to be in eval mode
    netG.eval()
    # DataParallel only forwards __call__; custom methods live on .module
    bare_netG = netG.module if isinstance(netG, nn.DataParallel) else netG
    is_autoregressive = bare_netG.is_autoregressive()
    prediction_type = bare_netG.prediction_type()

    utt_indices = [-1, -2, -3]
    utt_indices = utt_indices[: min(3, len(in_feats))]

    if np.any(model_config.has_dynamic_features):
        static_stream_sizes = get_static_stream_sizes(
            model_config.stream_sizes,
            model_config.has_dynamic_features,
            model_config.num_windows,
        )
    else:
        static_stream_sizes = model_config.stream_sizes

    for utt_idx in utt_indices:
        out_feats_denorm_ = out_scaler.inverse_transform(
            out_feats[utt_idx, : lengths[utt_idx]].unsqueeze(0)
        )
        mgc, lf0, vuv, bap = get_static_features(
            out_feats_denorm_,
            model_config.num_windows,
            model_config.stream_sizes,
            model_config.has_dynamic_features,
        )[:4]
        mgc = mgc.squeeze(0).cpu().numpy()
        lf0 = lf0.squeeze(0).cpu().numpy()
        vuv = vuv.squeeze(0).cpu().numpy()
        bap = bap.squeeze(0).cpu().numpy()

        f0, spectrogram, aperiodicity = gen_world_params(mgc, lf0, vuv, bap, sr)
        wav = pyworld.synthesize(f0, spectrogram, aperiodicity, sr, 5)
        group = f"utt{np.abs(utt_idx)}_reference"
        wav = _normalize_peak(wav)
        writer.add_audio(group, wav, step, sr)

        # Run forward
        if is_autoregressive:
            outs = netG(
                in_feats[utt_idx, : lengths[utt_idx]].unsqueeze(0),
                [lengths[utt_idx]],
                out_feats[utt_idx, : lengths[utt_idx]].unsqueeze(0),
            )
        else:
            outs = netG(
                in_feats[utt_idx, : lengths[utt_idx]].unsqueeze(0),
                [lengths[utt_idx]],
            )

        # ResF0 case
        if isinstance(outs, tuple) and len(outs) == 2:
            outs, _ = outs

        if prediction_type == PredictionType.PROBABILISTIC:
            pi, sigma, mu = outs
            pred_out_feats = mdn_get_most_probable_sigma_and_mu(pi, sigma, mu)[1]
        else:
            pred_out_feats = outs
        # NOTE: multiple outputs
        if isinstance(pred_out_feats, list):
            pred_out_feats = pred_out_feats[-1]
        if isinstance(pred_out_feats, tuple):
            pred_out_feats = pred_out_feats[0]

        if not isinstance(pred_out_feats, list):
            pred_out_feats = [pred_out_feats]

        # Run inference
        if prediction_type == PredictionType.PROBABILISTIC:
            inference_out_feats, _ = bare_netG.inference(
                in_feats[utt_idx, : lengths[utt_idx]].unsqueeze(0),
                [lengths[utt_idx]],
            )
        else:
            inference_out_feats = bare_netG.inference(
                in_feats[utt_idx, : lengths[utt_idx]].unsqueeze(0),
                [lengths[utt_idx]],
            )
        pred_out_feats.append(inference_out_feats)

        # Plot normalized input/output
        in_feats_ = in_feats[utt_idx, : lengths[utt_idx]].cpu().numpy()
        out_feats_ = out_feats[utt_idx, : lengths[utt_idx]].cpu().numpy()
        fig, ax = plt.subplots(3, 1, figsize=(8, 8))
        ax[0].set_title("Reference features")
        ax[1].set_title("Input features")
        ax[2].set_title("Predicted features")
        mesh = librosa.display.specshow(
            out_feats_.T,
            x_axis="frames",
            y_axis="frames",
            ax=ax[0],
            cmap="viridis",
        )
        # NOTE: assuming normalized to N(0, 1)
        mesh.set_clim(-4, 4)
        fig.colorbar(mesh, ax=ax[0])
        mesh = librosa.display.specshow(
            in_feats_.T,
            x_axis="frames",
            y_axis="frames",
            ax=ax[1],
            cmap="viridis",
        )
        mesh.set_clim(-4, 4)
        fig.colorbar(mesh, ax=ax[1])
        mesh = librosa.display.specshow(
            inference_out_feats.squeeze(0).cpu().numpy().T,
            x_axis="frames",
            y_axis="frames",
            ax=ax[2],
            cmap="viridis",
        )
        mesh.set_clim(-4, 4)
        fig.colorbar(mesh, ax=ax[2])
        for ax_ in ax:
            ax_.set_ylabel("Feature")
        plt.tight_layout()
        group = f"utt{np.abs(utt_idx)}_inference"
        writer.add_figure(f"{group}/Input-Output", fig, step)
        # close exactly the figure created above
        plt.close(fig)

        # index 0: forward pass output, index 1: inference output
        assert len(pred_out_feats) == 2
        for idx, pred_out_feats_ in enumerate(pred_out_feats):
            pred_out_feats_ = pred_out_feats_.squeeze(0).cpu().numpy()
            pred_out_feats_denorm = (
                out_scaler.inverse_transform(
                    torch.from_numpy(pred_out_feats_).to(in_feats.device)
                )
                .cpu()
                .numpy()
            )
            if np.any(model_config.has_dynamic_features):
                # (T, D_out) -> (T, static_dim)
                pred_out_feats_denorm = multi_stream_mlpg(
                    pred_out_feats_denorm,
                    (out_scaler.scale_ ** 2).cpu().numpy(),
                    get_windows(model_config.num_windows),
                    model_config.stream_sizes,
                    model_config.has_dynamic_features,
                )
            pred_mgc, pred_lf0, pred_vuv, pred_bap = split_streams(
                pred_out_feats_denorm, static_stream_sizes
            )[:4]

            # Remove high-frequency components of mgc/bap
            # NOTE: It seems to be effective to suppress artifacts of
            # GAN-based post-filtering
            if trajectory_smoothing:
                # presumably the frame rate for a 5 ms frame shift (the same
                # 5 is passed to pyworld.synthesize) -- TODO confirm
                modfs = int(1 / 0.005)
                for d in range(pred_mgc.shape[1]):
                    pred_mgc[:, d] = lowpass_filter(
                        pred_mgc[:, d], modfs, cutoff=trajectory_smoothing_cutoff
                    )
                for d in range(pred_bap.shape[1]):
                    pred_bap[:, d] = lowpass_filter(
                        pred_bap[:, d], modfs, cutoff=trajectory_smoothing_cutoff
                    )

            # Generated sample
            f0, spectrogram, aperiodicity = gen_world_params(
                pred_mgc, pred_lf0, pred_vuv, pred_bap, sr
            )
            wav = pyworld.synthesize(f0, spectrogram, aperiodicity, sr, 5)
            wav = _normalize_peak(wav)
            if idx == 1:
                group = f"utt{np.abs(utt_idx)}_inference"
            else:
                group = f"utt{np.abs(utt_idx)}_forward"
            writer.add_audio(group, wav, step, sr)
            plot_spsvs_params(
                step,
                writer,
                mgc,
                lf0,
                vuv,
                bap,
                pred_mgc,
                pred_lf0,
                pred_vuv,
                pred_bap,
                group=group,
                sr=sr,
            )


def _normalize_peak(wav):
    """Scale ``wav`` so its absolute peak is 1.0 when it exceeds 1.0.

    The absolute peak is checked (not just the positive maximum) so that a
    waveform whose negative excursion exceeds -1 is normalized as well.
    """
    peak = np.abs(wav).max()
    return wav / peak if peak > 1.0 else wav
def my_app(config: DictConfig) -> None:
    """Generate static features for every listed utterance and save them.

    A model is instantiated from ``config.model.model_yaml`` and restored
    from ``config.model.checkpoint``. For each utterance id in
    ``config.utt_list`` the input features are loaded from ``config.in_dir``,
    converted with ``_gen_static_features``, optionally post-filtered
    (``config.gv_postfilter``) and normalized (``config.normalize``), and
    written as float32 to ``<config.out_dir>/<utt_id>-feats.npy``.
    """
    global logger
    logger = getLogger(config.verbose)
    logger.info(OmegaConf.to_yaml(config))

    device = torch.device("cuda" if use_cuda else "cpu")

    utt_list = to_absolute_path(config.utt_list)
    in_dir = to_absolute_path(config.in_dir)
    out_dir = to_absolute_path(config.out_dir)

    utt_ids = load_utt_list(utt_list)

    os.makedirs(out_dir, exist_ok=True)

    # Restore the generator
    model_config = OmegaConf.load(to_absolute_path(config.model.model_yaml))
    model = hydra.utils.instantiate(model_config.netG).to(device)
    checkpoint = torch.load(
        to_absolute_path(config.model.checkpoint),
        map_location=lambda storage, loc: storage,
    )
    model.load_state_dict(checkpoint["state_dict"])
    model.eval()

    out_scaler = joblib.load(to_absolute_path(config.out_scaler_path))

    def _static_stat(stat):
        # Static part of one scaler statistic, flattened to shape (1, -1)
        streams = get_static_features(
            stat.reshape(1, 1, stat.shape[-1]),
            model_config.num_windows,
            model_config.stream_sizes,
            model_config.has_dynamic_features,
        )
        return np.concatenate(streams, -1).reshape(1, -1)

    mean_ = _static_stat(out_scaler.mean_)
    var_ = _static_stat(out_scaler.var_)
    scale_ = _static_stat(out_scaler.scale_)
    static_scaler = StandardScaler(mean_, var_, scale_)

    static_stream_sizes = get_static_stream_sizes(
        model_config.stream_sizes,
        model_config.has_dynamic_features,
        model_config.num_windows,
    )

    for utt_id in tqdm(utt_ids):
        loaded = np.load(join(in_dir, utt_id + "-feats.npy"))
        in_feats = torch.from_numpy(loaded).unsqueeze(0).to(device)
        static_feats = _gen_static_features(model, model_config, in_feats, out_scaler)

        # Column ranges: the first stream, and the fourth stream
        mgc_end_dim = static_stream_sizes[0]
        bap_start_dim = sum(static_stream_sizes[:3])
        bap_end_dim = sum(static_stream_sizes[:4])
        mgc_cols = slice(None, mgc_end_dim)
        bap_cols = slice(bap_start_dim, bap_end_dim)

        if config.gv_postfilter:
            # mgc
            static_feats[:, mgc_cols] = variance_scaling(
                static_scaler.var_.reshape(-1)[mgc_cols],
                static_feats[:, mgc_cols],
                offset=config.mgc_offset,
            )
            # bap
            static_feats[:, bap_cols] = variance_scaling(
                static_scaler.var_.reshape(-1)[bap_cols],
                static_feats[:, bap_cols],
                offset=config.bap_offset,
            )

        if config.normalize:
            static_feats = static_scaler.transform(static_feats)

        out_path = join(out_dir, f"{utt_id}-feats.npy")
        np.save(out_path, static_feats.astype(np.float32), allow_pickle=False)