Esempio n. 1
0
def _prepare_voc_features(
    in_dir,
    out_dir,
    utt_id,
    num_windows,
    stream_sizes,
    has_dynamic_features,
) -> None:
    """Create vocoder input features and link the target waveform for one utterance.

    Loads ``<in_dir>/<utt_id>-feats.npy``, extracts the static part of every
    stream with ``get_static_features``, stacks the first four streams
    (unpacked here as mgc, lf0, vuv, bap) and saves the result as float32 to
    ``<out_dir>/<utt_id>-feats.npy``. A symlink to
    ``<in_dir>/<utt_id>-wave.npy`` is then created next to the saved features.

    Args:
        in_dir: directory holding ``<utt_id>-feats.npy`` and ``<utt_id>-wave.npy``.
        out_dir: directory that receives the features and the waveform symlink.
        utt_id: utterance identifier used as the file name prefix.
        num_windows: forwarded to ``get_static_features``.
        stream_sizes: forwarded to ``get_static_features``.
        has_dynamic_features: per-stream flags forwarded to
            ``get_static_features``; at least one must be truthy.

    Raises:
        AssertionError: if the input waveform is missing, no stream has dynamic
            features, or fewer than four streams are returned.
    """
    feats = np.load(join(in_dir, utt_id + "-feats.npy"))
    in_wave_path = join(in_dir, utt_id + "-wave.npy")
    assert exists(in_wave_path)

    assert np.any(has_dynamic_features)
    streams = get_static_features(
        feats.reshape(1, -1, feats.shape[-1]),
        num_windows,
        stream_sizes,
        has_dynamic_features,
    )

    # remove batch-axis
    streams = [stream.squeeze(0) for stream in streams]

    # NOTE: even if the number of streams is larger than 4, we only use the first
    # 4 streams for training neural vocoders
    assert len(streams) >= 4
    mgc, lf0, vuv, bap = streams[:4]
    voc_feats = np.hstack((mgc, lf0, vuv, bap)).astype(np.float32)

    voc_feats_path = join(out_dir, utt_id + "-feats.npy")
    np.save(voc_feats_path, voc_feats, allow_pickle=False)

    # NOTE: To train vocoders with https://github.com/kan-bayashi/ParallelWaveGAN
    # target waveform needs to be created in the same directory as the vocoder input features.
    save_wave_path = join(out_dir, utt_id + "-wave.npy")
    # A dangling link left over from an earlier run is invisible to exists()
    # but would still make os.symlink fail, so remove it first.
    if os.path.lexists(save_wave_path) and not exists(save_wave_path):
        os.remove(save_wave_path)
    if not exists(save_wave_path):
        # A relative link target is resolved relative to the link's own
        # directory (out_dir), not the current working directory, so always
        # point the link at an absolute path.
        os.symlink(os.path.abspath(in_wave_path), save_wave_path)
Esempio n. 2
0
def _extract_static_features(
    in_dir,
    out_dir,
    utt_id,
    num_windows,
    stream_sizes,
    has_dynamic_features,
) -> None:
    """Save the static part of one utterance's features.

    Reads ``<in_dir>/<utt_id>-feats.npy``, runs ``get_static_features`` on it
    (with a temporary leading batch axis), concatenates all returned streams
    along the last axis and writes the float32 result to
    ``<out_dir>/<utt_id>-feats.npy``.

    Args:
        in_dir: directory containing the input feature file.
        out_dir: directory that receives the static feature file.
        utt_id: utterance identifier used as the file name prefix.
        num_windows: forwarded to ``get_static_features``.
        stream_sizes: forwarded to ``get_static_features``.
        has_dynamic_features: per-stream flags forwarded to
            ``get_static_features``; at least one must be truthy.
    """
    file_name = utt_id + "-feats.npy"
    all_feats = np.load(join(in_dir, file_name))

    assert np.any(has_dynamic_features)
    # get_static_features is fed a 3-D array, so prepend a batch axis of size 1
    batched = all_feats.reshape(1, -1, all_feats.shape[-1])
    static_streams = [
        # drop the batch axis again
        stream.squeeze(0)
        for stream in get_static_features(
            batched, num_windows, stream_sizes, has_dynamic_features
        )
    ]
    merged = np.concatenate(static_streams, axis=-1)

    np.save(join(out_dir, file_name), merged.astype(np.float32), allow_pickle=False)
Esempio n. 3
0
    mean = scaler.mean_
    scale = scaler.scale_
    var = scaler.var_

    out_dir = Path(args.out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    stream_sizes, has_dynamic_features = get_world_stream_info(
        args.sample_rate, args.mgc_order, args.num_windows, args.vibrato_mode)

    print(f"Converting {input_file} mean/scale npy files")
    lf0_params = {}
    for name, in_feats in [("mean", mean), ("scale", scale), ("var", var)]:
        streams = get_static_features(
            in_feats.reshape(1, -1, in_feats.shape[-1]),
            args.num_windows,
            stream_sizes,
            has_dynamic_features,
        )

        # NOTE: use up to 4 streams
        # [mgc, lf0, vuv, bap]
        streams = list(map(lambda x: x.reshape(-1), streams))[:4]
        lf0_params[name] = float(streams[1])
        out_feats = np.concatenate(streams)

        print(f"[{name}] dim: {in_feats.shape} -> {out_feats.shape}")
        out_path = out_dir / (out_file_name + f"_{name}.npy")
        np.save(out_path, out_feats, allow_pickle=False)

    print(f"""
If you are going to train NSF-based vocoders, please set the following parameters:
Esempio n. 4
0
def train_step(
    model_config,
    optim_config,
    netG,
    optG,
    netD,
    optD,
    grad_scaler,
    train,
    in_feats,
    out_feats,
    lengths,
    out_scaler,
    feats_criterion="mse",
    pitch_reg_dyn_ws=1.0,
    pitch_reg_weight=1.0,
    adv_weight=1.0,
    adv_streams=None,
    fm_weight=0.0,
    adv_use_static_feats_only=True,
    mask_nth_mgc_for_adv_loss=0,
    gan_type="lsgan",
):
    """Run one GAN step (training or evaluation) for a generator/discriminator pair.

    The discriminator loss is computed first and, when ``train`` is true,
    ``optD`` is stepped. The generator loss is then computed as::

        loss_feats + adv_weight * loss_adv
        + pitch_reg_weight * loss_pitch + fm_weight * loss_fm

    and, when ``train`` is true, ``optG`` is stepped.

    Args:
        model_config: config providing ``num_windows``, ``stream_sizes`` and
            ``has_dynamic_features``.
        optim_config: config providing ``netG.clip_norm`` and ``netD.clip_norm``.
        netG: generator (optionally wrapped in ``nn.DataParallel``). Called as
            ``netG(in_feats, lengths)`` and expected to return
            ``(pred_out_feats, lf0_residual)``.
        optG: optimizer for ``netG``.
        netD: discriminator. Called as ``netD(feats, in_feats, lengths)`` and
            must return a list of lists; the last item of each inner list is
            used as the discriminator output and the preceding items as
            feature maps for the feature matching loss.
        optD: optimizer for ``netD``.
        grad_scaler: gradient scaler for mixed precision, or ``None``.
            Autocast is enabled only when this is not ``None``.
        train (bool): if true, put the networks in train mode and update them;
            otherwise put them in eval mode and only compute losses/metrics.
        in_feats: generator inputs (also passed to ``netD`` as conditioning).
        out_feats: target features.
        lengths: sequence lengths used to build the non-padding mask.
        out_scaler: scaler forwarded to ``compute_distortions``.
        feats_criterion (str): ``"l2"``/``"mse"`` or ``"l1"``/``"mae"``.
        pitch_reg_dyn_ws: weights multiplied with ``|lf0_residual|``.
        pitch_reg_weight (float): weight of the pitch regularization term.
        adv_weight (float): weight of the adversarial loss.
        adv_streams: stream selection forwarded to ``get_static_features`` /
            ``select_streams`` to build the discriminator inputs.
        fm_weight (float): weight of the feature matching loss (skipped if 0).
        adv_use_static_feats_only (bool): if true, feed only static features
            to the discriminator.
        mask_nth_mgc_for_adv_loss (int): number of leading feature dimensions
            dropped from the discriminator inputs.
        gan_type (str): ``"lsgan"`` or ``"vanilla-gan"``.

    Returns:
        tuple: ``(loss, log_metrics)`` where ``loss`` is the generator loss and
        ``log_metrics`` is a dict of logged values.

    Raises:
        RuntimeError: if ``feats_criterion`` is not supported.
        ValueError: if ``gan_type`` is unknown.
    """
    netG.train() if train else netG.eval()
    netD.train() if train else netD.eval()
    log_metrics = {}

    if feats_criterion in ["l2", "mse"]:
        criterion = nn.MSELoss(reduction="none")
    elif feats_criterion in ["l1", "mae"]:
        criterion = nn.L1Loss(reduction="none")
    else:
        raise RuntimeError("not supported criterion")

    prediction_type = (
        netG.module.prediction_type()
        if isinstance(netG, nn.DataParallel)
        else netG.prediction_type()
    )
    # NOTE: it is not trivial to adapt GAN for probabilistic models
    assert prediction_type != PredictionType.PROBABILISTIC

    # Apply preprocess if required (e.g., FIR filter for shallow AR)
    # defaults to no-op
    if isinstance(netG, nn.DataParallel):
        out_feats = netG.module.preprocess_target(out_feats)
    else:
        out_feats = netG.preprocess_target(out_feats)

    # Run forward
    with autocast(enabled=grad_scaler is not None):
        pred_out_feats, lf0_residual = netG(in_feats, lengths)

        # Select streams for computing adversarial loss
        if adv_use_static_feats_only:
            real_netD_in_feats = torch.cat(
                get_static_features(
                    out_feats,
                    model_config.num_windows,
                    model_config.stream_sizes,
                    model_config.has_dynamic_features,
                    adv_streams,
                ),
                dim=-1,
            )
            fake_netD_in_feats = torch.cat(
                get_static_features(
                    pred_out_feats,
                    model_config.num_windows,
                    model_config.stream_sizes,
                    model_config.has_dynamic_features,
                    adv_streams,
                ),
                dim=-1,
            )
        else:
            real_netD_in_feats = select_streams(
                out_feats, model_config.stream_sizes, adv_streams
            )
            fake_netD_in_feats = select_streams(
                pred_out_feats,
                model_config.stream_sizes,
                adv_streams,
            )

    # Ref: http://sython.org/papers/ASJ/saito2017asja.pdf
    # 0-th mgc with adversarial training affects speech quality
    # NOTE: assuming that the first stream contains mgc
    if mask_nth_mgc_for_adv_loss > 0:
        real_netD_in_feats = real_netD_in_feats[:, :, mask_nth_mgc_for_adv_loss:]
        fake_netD_in_feats = fake_netD_in_feats[:, :, mask_nth_mgc_for_adv_loss:]

    # Real
    with autocast(enabled=grad_scaler is not None):
        D_real = netD(real_netD_in_feats, in_feats, lengths)
        # NOTE: must be list of list to support multi-scale discriminators
        assert isinstance(D_real, list) and isinstance(D_real[-1], list)
        # Fake
        D_fake_det = netD(fake_netD_in_feats.detach(), in_feats, lengths)

    # Mask (B, T, 1)
    mask = make_non_pad_mask(lengths).unsqueeze(-1).to(in_feats.device)

    def _discriminator_mask(d_out):
        """Pick the padding mask matching the time axis of ``d_out`` (or None)."""
        if (
            hasattr(netD, "downsample_scale")
            and mask.shape[1] // netD.downsample_scale == d_out.shape[1]
        ):
            return mask[:, :: netD.downsample_scale, :]
        if d_out.shape[1] == out_feats.shape[1]:
            return mask
        # No mask with a compatible length: fall back to an unmasked mean
        return None

    # Update discriminator
    eps = 1e-14
    loss_real = 0
    loss_fake = 0

    with autocast(enabled=grad_scaler is not None):
        for idx, (D_real_, D_fake_det_) in enumerate(zip(D_real, D_fake_det)):
            if gan_type == "lsgan":
                loss_real_ = (D_real_[-1] - 1) ** 2
                loss_fake_ = D_fake_det_[-1] ** 2
            elif gan_type == "vanilla-gan":
                loss_real_ = -torch.log(D_real_[-1] + eps)
                loss_fake_ = -torch.log(1 - D_fake_det_[-1] + eps)
            else:
                raise ValueError(f"Unknown gan type: {gan_type}")

            # mask for D
            D_mask = _discriminator_mask(D_real_[-1])

            if D_mask is not None:
                loss_real_ = loss_real_.masked_select(D_mask).mean()
                loss_fake_ = loss_fake_.masked_select(D_mask).mean()
            else:
                loss_real_ = loss_real_.mean()
                loss_fake_ = loss_fake_.mean()

            log_metrics[f"Loss_Real_Scale{idx}"] = loss_real_.item()
            log_metrics[f"Loss_Fake_Scale{idx}"] = loss_fake_.item()

            loss_real += loss_real_
            loss_fake += loss_fake_

        loss_d = loss_real + loss_fake

    if train:
        optD.zero_grad()
        if grad_scaler is not None:
            grad_scaler.scale(loss_d).backward()
            # Unscale first so that clipping operates on the true gradients
            grad_scaler.unscale_(optD)
            grad_norm_d = torch.nn.utils.clip_grad_norm_(
                netD.parameters(), optim_config.netD.clip_norm
            )
            log_metrics["GradNorm_D"] = grad_norm_d
            grad_scaler.step(optD)
        else:
            loss_d.backward()
            grad_norm_d = torch.nn.utils.clip_grad_norm_(
                netD.parameters(), optim_config.netD.clip_norm
            )
            log_metrics["GradNorm_D"] = grad_norm_d
            optD.step()

    # Update generator
    with autocast(enabled=grad_scaler is not None):
        loss_feats = criterion(
            pred_out_feats.masked_select(mask), out_feats.masked_select(mask)
        ).mean()

        # adversarial loss
        D_fake = netD(fake_netD_in_feats, in_feats, lengths)
        loss_adv = 0
        for idx, D_fake_ in enumerate(D_fake):
            if gan_type == "lsgan":
                loss_adv_ = (1 - D_fake_[-1]) ** 2
            elif gan_type == "vanilla-gan":
                loss_adv_ = -torch.log(D_fake_[-1] + eps)
            else:
                raise ValueError(f"Unknown gan type: {gan_type}")

            # The mask must be chosen from the output of *this* scale; the
            # previous code inspected a stale tensor left over from the
            # discriminator loop above.
            D_mask = _discriminator_mask(D_fake_[-1])

            if D_mask is not None:
                loss_adv_ = loss_adv_.masked_select(D_mask).mean()
            else:
                loss_adv_ = loss_adv_.mean()

            log_metrics[f"Loss_Adv_Scale{idx}"] = loss_adv_.item()

            loss_adv += loss_adv_

        # Feature matching loss
        loss_fm = torch.tensor(0.0).to(in_feats.device)
        if fm_weight > 0:
            for D_fake_, D_real_ in zip(D_fake, D_real):
                for fake_fmap, real_fmap in zip(D_fake_[:-1], D_real_[:-1]):
                    loss_fm += F.l1_loss(fake_fmap, real_fmap.detach())

        # Pitch regularization
        # NOTE: l1 loss seems to be better than mse loss in my experiments
        # we could use l2 loss as suggested in the sinsy's paper
        loss_pitch = (pitch_reg_dyn_ws * lf0_residual.abs()).masked_select(mask).mean()

        loss = (
            loss_feats
            + adv_weight * loss_adv
            + pitch_reg_weight * loss_pitch
            + fm_weight * loss_fm
        )

    if train:
        optG.zero_grad()
        if grad_scaler is not None:
            grad_scaler.scale(loss).backward()
            grad_scaler.unscale_(optG)
            grad_norm_g = torch.nn.utils.clip_grad_norm_(
                netG.parameters(), optim_config.netG.clip_norm
            )
            log_metrics["GradNorm_G"] = grad_norm_g
            grad_scaler.step(optG)
        else:
            loss.backward()
            grad_norm_g = torch.nn.utils.clip_grad_norm_(
                netG.parameters(), optim_config.netG.clip_norm
            )
            log_metrics["GradNorm_G"] = grad_norm_g
            optG.step()

    # NOTE: this shouldn't be called multiple times in a training step
    if train and grad_scaler is not None:
        grad_scaler.update()

    # Metrics
    distortions = compute_distortions(
        pred_out_feats, out_feats, lengths, out_scaler, model_config
    )
    log_metrics.update(distortions)
    log_metrics.update(
        {
            "Loss": loss.item(),
            "Loss_Feats": loss_feats.item(),
            "Loss_Adv_Total": loss_adv.item(),
            "Loss_Feature_Matching": loss_fm.item(),
            "Loss_Pitch": loss_pitch.item(),
            "Loss_Real_Total": loss_real.item(),
            "Loss_Fake_Total": loss_fake.item(),
            "Loss_D": loss_d.item(),
        }
    )

    return loss, log_metrics
Esempio n. 5
0
    parser.add_argument("input_file", type=str, help="input file")
    parser.add_argument("model_config", type=str, help="model config")
    parser.add_argument("output_file", type=str, help="output file")

    return parser


if __name__ == "__main__":
    args = get_parser().parse_args(sys.argv[1:])
    model_config = OmegaConf.load(args.model_config)

    out_scaler = joblib.load(args.input_file)

    mean_ = get_static_features(
        out_scaler.mean_.reshape(1, 1, out_scaler.mean_.shape[-1]),
        model_config.num_windows,
        model_config.stream_sizes,
        model_config.has_dynamic_features,
    )
    mean_ = np.concatenate(mean_, -1).reshape(1, -1)
    var_ = get_static_features(
        out_scaler.var_.reshape(1, 1, out_scaler.var_.shape[-1]),
        model_config.num_windows,
        model_config.stream_sizes,
        model_config.has_dynamic_features,
    )
    var_ = np.concatenate(var_, -1).reshape(1, -1)
    scale_ = get_static_features(
        out_scaler.scale_.reshape(1, 1, out_scaler.scale_.shape[-1]),
        model_config.num_windows,
        model_config.stream_sizes,
        model_config.has_dynamic_features,
Esempio n. 6
0
def compute_distortions(pred_out_feats, out_feats, lengths, out_scaler,
                        model_config):
    """Compute objective distortion measures for predicted acoustic features.

    Both feature tensors are de-normalized with ``out_scaler`` and reduced to
    their static streams; the first four streams are taken to be
    mgc, lf0, vuv and bap, in that order.

    Args:
        pred_out_feats (nn.Tensor): predicted acoustic features
        out_feats (nn.Tensor): ground-truth acoustic features
        lengths (nn.Tensor): lengths of the sequences
        out_scaler (nn.Module): scaler used to de-normalize the features
        model_config (dict): model configuration providing ``num_windows``,
            ``stream_sizes`` and ``has_dynamic_features``

    Returns:
        dict: MCD for mgc/bap and V/UV error; F0 RMSE is added unless its
        computation raises ``ZeroDivisionError``
    """
    ref_denorm = out_scaler.inverse_transform(out_feats)
    hyp_denorm = out_scaler.inverse_transform(pred_out_feats)

    static_args = (
        model_config.num_windows,
        model_config.stream_sizes,
        model_config.has_dynamic_features,
    )
    ref_streams = get_static_features(ref_denorm, *static_args)
    hyp_streams = get_static_features(hyp_denorm, *static_args)

    assert len(ref_streams) >= 4
    mgc, lf0, vuv, bap = (ref_streams[i] for i in range(4))
    pred_mgc, pred_lf0, pred_vuv, pred_bap = (hyp_streams[i] for i in range(4))

    # binarize vuv
    vuv = (vuv > 0.5).float()
    pred_vuv = (pred_vuv > 0.5).float()

    dist = {}
    # The first mgc dimension is left out of the MCD computation
    dist["ObjEval_MGC_MCD"] = metrics.melcd(
        mgc[:, :, 1:], pred_mgc[:, :, 1:], lengths=lengths)
    dist["ObjEval_BAP_MCD"] = metrics.melcd(
        bap, pred_bap, lengths=lengths) / 10.0
    dist["ObjEval_VUV_ERR"] = metrics.vuv_error(vuv, pred_vuv, lengths=lengths)

    try:
        f0_mse = metrics.lf0_mean_squared_error(
            lf0, vuv, pred_lf0, pred_vuv, lengths=lengths, linear_domain=True)
        dist["ObjEval_F0_RMSE"] = np.sqrt(f0_mse)
    except ZeroDivisionError:
        # Best effort: the F0 entry is simply omitted in this case
        pass

    return dist
Esempio n. 7
0
def eval_spss_model(
    step,
    netG,
    in_feats,
    out_feats,
    lengths,
    model_config,
    out_scaler,
    writer,
    sr,
    trajectory_smoothing=True,
    trajectory_smoothing_cutoff=50,
):
    """Synthesize and log evaluation samples for up to three utterances of a batch.

    For each of the last (up to) three utterances in the batch this logs to
    ``writer``: the reference waveform re-synthesized from the ground-truth
    features, a figure of the normalized reference/input/predicted features,
    and the waveforms plus parameter plots obtained from the model's forward
    output and from its ``inference`` output.

    Args:
        step: global step used when logging to ``writer``.
        netG: model to evaluate (optionally wrapped in ``nn.DataParallel``).
            Must provide ``is_autoregressive()``, ``prediction_type()`` and
            ``inference()``.
        in_feats: batch of input features.
        out_feats: batch of (normalized) target features.
        lengths: length of each utterance in the batch.
        model_config: config providing ``num_windows``, ``stream_sizes`` and
            ``has_dynamic_features``.
        out_scaler: scaler providing ``inverse_transform`` and ``scale_``.
        writer: logger providing ``add_audio`` and ``add_figure``.
        sr: sampling rate passed to the synthesis and logging calls.
        trajectory_smoothing (bool): if true, low-pass filter every dimension
            of the predicted mgc/bap trajectories before synthesis.
        trajectory_smoothing_cutoff: cutoff passed to ``lowpass_filter``.
    """

    def _peak_normalize(wav):
        """Scale ``wav`` down only when its absolute peak exceeds 1.0."""
        # The absolute peak is what matters: a strong negative peak must
        # trigger the scaling just like a positive one.
        peak = np.abs(wav).max()
        return wav / peak if peak > 1.0 else wav

    # make sure to be in eval mode
    netG.eval()
    # nn.DataParallel does not forward custom methods, so model-specific
    # methods have to be called on the wrapped module.
    netG_core = netG.module if isinstance(netG, nn.DataParallel) else netG
    is_autoregressive = netG_core.is_autoregressive()
    prediction_type = netG_core.prediction_type()
    utt_indices = [-1, -2, -3]
    utt_indices = utt_indices[:min(3, len(in_feats))]

    if np.any(model_config.has_dynamic_features):
        static_stream_sizes = get_static_stream_sizes(
            model_config.stream_sizes,
            model_config.has_dynamic_features,
            model_config.num_windows,
        )
    else:
        static_stream_sizes = model_config.stream_sizes

    for utt_idx in utt_indices:
        out_feats_denorm_ = out_scaler.inverse_transform(
            out_feats[utt_idx, :lengths[utt_idx]].unsqueeze(0))
        mgc, lf0, vuv, bap = get_static_features(
            out_feats_denorm_,
            model_config.num_windows,
            model_config.stream_sizes,
            model_config.has_dynamic_features,
        )[:4]
        mgc = mgc.squeeze(0).cpu().numpy()
        lf0 = lf0.squeeze(0).cpu().numpy()
        vuv = vuv.squeeze(0).cpu().numpy()
        bap = bap.squeeze(0).cpu().numpy()

        # Reference sample re-synthesized from the ground-truth parameters
        f0, spectrogram, aperiodicity = gen_world_params(
            mgc, lf0, vuv, bap, sr)
        wav = pyworld.synthesize(f0, spectrogram, aperiodicity, sr, 5)
        group = f"utt{np.abs(utt_idx)}_reference"
        wav = _peak_normalize(wav)
        writer.add_audio(group, wav, step, sr)

        # Run forward
        if is_autoregressive:
            outs = netG(
                in_feats[utt_idx, :lengths[utt_idx]].unsqueeze(0),
                [lengths[utt_idx]],
                out_feats[utt_idx, :lengths[utt_idx]].unsqueeze(0),
            )
        else:
            outs = netG(in_feats[utt_idx, :lengths[utt_idx]].unsqueeze(0),
                        [lengths[utt_idx]])

        # ResF0 case
        if isinstance(outs, tuple) and len(outs) == 2:
            outs, _ = outs

        if prediction_type == PredictionType.PROBABILISTIC:
            pi, sigma, mu = outs
            pred_out_feats = mdn_get_most_probable_sigma_and_mu(pi, sigma,
                                                                mu)[1]
        else:
            pred_out_feats = outs
        # NOTE: multiple outputs
        if isinstance(pred_out_feats, list):
            pred_out_feats = pred_out_feats[-1]
        if isinstance(pred_out_feats, tuple):
            pred_out_feats = pred_out_feats[0]

        if not isinstance(pred_out_feats, list):
            pred_out_feats = [pred_out_feats]

        # Run inference
        if prediction_type == PredictionType.PROBABILISTIC:
            inference_out_feats, _ = netG_core.inference(
                in_feats[utt_idx, :lengths[utt_idx]].unsqueeze(0),
                [lengths[utt_idx]])
        else:
            inference_out_feats = netG_core.inference(
                in_feats[utt_idx, :lengths[utt_idx]].unsqueeze(0),
                [lengths[utt_idx]])
        pred_out_feats.append(inference_out_feats)

        # Plot normalized input/output
        in_feats_ = in_feats[utt_idx, :lengths[utt_idx]].cpu().numpy()
        out_feats_ = out_feats[utt_idx, :lengths[utt_idx]].cpu().numpy()
        fig, ax = plt.subplots(3, 1, figsize=(8, 8))
        ax[0].set_title("Reference features")
        ax[1].set_title("Input features")
        ax[2].set_title("Predicted features")
        mesh = librosa.display.specshow(out_feats_.T,
                                        x_axis="frames",
                                        y_axis="frames",
                                        ax=ax[0],
                                        cmap="viridis")
        # NOTE: assuming normalized to N(0, 1)
        mesh.set_clim(-4, 4)
        fig.colorbar(mesh, ax=ax[0])
        mesh = librosa.display.specshow(in_feats_.T,
                                        x_axis="frames",
                                        y_axis="frames",
                                        ax=ax[1],
                                        cmap="viridis")
        mesh.set_clim(-4, 4)
        fig.colorbar(mesh, ax=ax[1])
        mesh = librosa.display.specshow(
            inference_out_feats.squeeze(0).cpu().numpy().T,
            x_axis="frames",
            y_axis="frames",
            ax=ax[2],
            cmap="viridis",
        )
        mesh.set_clim(-4, 4)
        fig.colorbar(mesh, ax=ax[2])
        for ax_ in ax:
            ax_.set_ylabel("Feature")
        plt.tight_layout()
        group = f"utt{np.abs(utt_idx)}_inference"
        writer.add_figure(f"{group}/Input-Output", fig, step)
        plt.close()

        # Exactly two predictions are expected: index 0 is the forward output,
        # index 1 the inference output appended above.
        assert len(pred_out_feats) == 2
        for idx, pred_out_feats_ in enumerate(pred_out_feats):
            pred_out_feats_ = pred_out_feats_.squeeze(0).cpu().numpy()
            pred_out_feats_denorm = (out_scaler.inverse_transform(
                torch.from_numpy(pred_out_feats_).to(
                    in_feats.device)).cpu().numpy())
            if np.any(model_config.has_dynamic_features):
                # (T, D_out) -> (T, static_dim)
                pred_out_feats_denorm = multi_stream_mlpg(
                    pred_out_feats_denorm,
                    (out_scaler.scale_**2).cpu().numpy(),
                    get_windows(model_config.num_windows),
                    model_config.stream_sizes,
                    model_config.has_dynamic_features,
                )
            pred_mgc, pred_lf0, pred_vuv, pred_bap = split_streams(
                pred_out_feats_denorm, static_stream_sizes)[:4]

            # Remove high-frequency components of mgc/bap
            # NOTE: It seems to be effective to suppress artifacts of GAN-based post-filtering
            if trajectory_smoothing:
                # presumably the frame rate for a 5 ms frame shift -- TODO confirm
                modfs = int(1 / 0.005)
                for d in range(pred_mgc.shape[1]):
                    pred_mgc[:, d] = lowpass_filter(
                        pred_mgc[:, d],
                        modfs,
                        cutoff=trajectory_smoothing_cutoff)
                for d in range(pred_bap.shape[1]):
                    pred_bap[:, d] = lowpass_filter(
                        pred_bap[:, d],
                        modfs,
                        cutoff=trajectory_smoothing_cutoff)

            # Generated sample
            f0, spectrogram, aperiodicity = gen_world_params(
                pred_mgc, pred_lf0, pred_vuv, pred_bap, sr)
            wav = pyworld.synthesize(f0, spectrogram, aperiodicity, sr, 5)
            wav = _peak_normalize(wav)
            if idx == 1:
                group = f"utt{np.abs(utt_idx)}_inference"
            else:
                group = f"utt{np.abs(utt_idx)}_forward"
            writer.add_audio(group, wav, step, sr)
            plot_spsvs_params(
                step,
                writer,
                mgc,
                lf0,
                vuv,
                bap,
                pred_mgc,
                pred_lf0,
                pred_vuv,
                pred_bap,
                group=group,
                sr=sr,
            )
Esempio n. 8
0
def my_app(config: DictConfig) -> None:
    """Generate static features for every utterance in a list and save them.

    Loads the model described by ``config.model`` and the output scaler at
    ``config.out_scaler_path``, then for each utterance id in
    ``config.utt_list`` reads ``<in_dir>/<utt_id>-feats.npy``, generates
    static features with ``_gen_static_features``, optionally applies
    variance scaling to the mgc and bap dimensions
    (``config.gv_postfilter``), optionally normalizes with the static scaler
    (``config.normalize``) and writes ``<out_dir>/<utt_id>-feats.npy``.

    Args:
        config: run configuration (paths, model files and post-processing
            switches).
    """
    global logger
    logger = getLogger(config.verbose)
    logger.info(OmegaConf.to_yaml(config))

    device = torch.device("cuda" if use_cuda else "cpu")
    utt_list = to_absolute_path(config.utt_list)
    in_dir = to_absolute_path(config.in_dir)
    out_dir = to_absolute_path(config.out_dir)

    utt_ids = load_utt_list(utt_list)

    os.makedirs(out_dir, exist_ok=True)

    model_config = OmegaConf.load(to_absolute_path(config.model.model_yaml))
    model = hydra.utils.instantiate(model_config.netG).to(device)
    checkpoint_path = to_absolute_path(config.model.checkpoint)
    checkpoint = torch.load(
        checkpoint_path,
        map_location=lambda storage, loc: storage,
    )
    model.load_state_dict(checkpoint["state_dict"])
    model.eval()

    out_scaler = joblib.load(to_absolute_path(config.out_scaler_path))

    def _static_stat(stat):
        """Reduce one scaler statistic to its static dims, shaped (1, D)."""
        streams = get_static_features(
            stat.reshape(1, 1, stat.shape[-1]),
            model_config.num_windows,
            model_config.stream_sizes,
            model_config.has_dynamic_features,
        )
        return np.concatenate(streams, -1).reshape(1, -1)

    # Scaler restricted to the static feature dimensions
    mean_ = _static_stat(out_scaler.mean_)
    var_ = _static_stat(out_scaler.var_)
    scale_ = _static_stat(out_scaler.scale_)
    static_scaler = StandardScaler(mean_, var_, scale_)

    static_stream_sizes = get_static_stream_sizes(
        model_config.stream_sizes,
        model_config.has_dynamic_features,
        model_config.num_windows,
    )

    for utt_id in tqdm(utt_ids):
        feats_np = np.load(join(in_dir, utt_id + "-feats.npy"))
        in_feats = torch.from_numpy(feats_np).unsqueeze(0).to(device)
        static_feats = _gen_static_features(model, model_config, in_feats,
                                            out_scaler)

        # Column ranges: mgc is the first stream, bap the fourth
        mgc_cols = slice(None, static_stream_sizes[0])
        bap_cols = slice(sum(static_stream_sizes[:3]),
                         sum(static_stream_sizes[:4]))

        if config.gv_postfilter:
            for cols, offset_name in ((mgc_cols, "mgc_offset"),
                                      (bap_cols, "bap_offset")):
                static_feats[:, cols] = variance_scaling(
                    static_scaler.var_.reshape(-1)[cols],
                    static_feats[:, cols],
                    offset=getattr(config, offset_name),
                )

        if config.normalize:
            static_feats = static_scaler.transform(static_feats)
        out_path = join(out_dir, f"{utt_id}-feats.npy")
        np.save(out_path, static_feats.astype(np.float32), allow_pickle=False)