Example #1
def gen_waveform(labels, acoustic_features, acoustic_out_scaler,
        binary_dict, continuous_dict, stream_sizes, has_dynamic_features,
        subphone_features="coarse_coding", log_f0_conditioning=True, pitch_idx=None,
        num_windows=3, post_filter=True, sample_rate=48000, frame_period=5,
        relative_f0=True):

    windows = get_windows(num_windows)

    # Apply MLPG if necessary
    if np.any(has_dynamic_features):
        acoustic_features = multi_stream_mlpg(
            acoustic_features, acoustic_out_scaler.var_, windows, stream_sizes,
            has_dynamic_features)
        static_stream_sizes = get_static_stream_sizes(
            stream_sizes, has_dynamic_features, len(windows))
    else:
        static_stream_sizes = stream_sizes

    # Split multi-stream features
    mgc, target_f0, vuv, bap = split_streams(acoustic_features, static_stream_sizes)

    # Generate the waveform with the WORLD vocoder
    fftlen = pyworld.get_cheaptrick_fft_size(sample_rate)
    alpha = pysptk.util.mcepalpha(sample_rate)

    if post_filter:
        mgc = merlin_post_filter(mgc, alpha)

    spectrogram = pysptk.mc2sp(mgc, fftlen=fftlen, alpha=alpha)
    aperiodicity = pyworld.decode_aperiodicity(bap.astype(np.float64), sample_rate, fftlen)

    ### F0 ###
    if relative_f0:
        diff_lf0 = target_f0
        # need to extract pitch sequence from the musical score
        linguistic_features = fe.linguistic_features(labels,
                                                    binary_dict, continuous_dict,
                                                    add_frame_features=True,
                                                    subphone_features=subphone_features)
        f0_score = _midi_to_hz(linguistic_features, pitch_idx, False)[:, None]
        lf0_score = f0_score.copy()
        nonzero_indices = np.nonzero(lf0_score)
        lf0_score[nonzero_indices] = np.log(f0_score[nonzero_indices])
        lf0_score = interp1d(lf0_score, kind="slinear")

        f0 = diff_lf0 + lf0_score
        f0[vuv < 0.5] = 0
        f0[np.nonzero(f0)] = np.exp(f0[np.nonzero(f0)])
    else:
        f0 = target_f0

    generated_waveform = pyworld.synthesize(f0.flatten().astype(np.float64),
                                            spectrogram.astype(np.float64),
                                            aperiodicity.astype(np.float64),
                                            sample_rate, frame_period)

    return generated_waveform
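
A hypothetical usage sketch: the labels, predicted acoustic features, scaler, feature dictionaries, and stream metadata are assumed to be prepared elsewhere (e.g. by the predict_acoustic snippets below); only the call pattern and saving of the result are illustrated.

import numpy as np
from scipy.io import wavfile

wav = gen_waveform(
    labels, acoustic_features, acoustic_out_scaler,
    binary_dict, continuous_dict, stream_sizes, has_dynamic_features,
    pitch_idx=pitch_idx, sample_rate=48000, frame_period=5,
)
# Normalize to the int16 range before writing to disk
wavfile.write("out.wav", 48000, (wav / np.abs(wav).max() * 32767).astype(np.int16))
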
Example #2
def _gen_static_features(model, model_config, in_feats, out_scaler):
    if model.prediction_type() == PredictionType.PROBABILISTIC:
        max_mu, max_sigma = model.inference(in_feats, [in_feats.shape[1]])

        if np.any(model_config.has_dynamic_features):
            # Apply denormalization
            # (B, T, D_out) -> (T, D_out)
            max_sigma_sq = (max_sigma.squeeze(0).cpu().data.numpy()**2 *
                            out_scaler.var_)
            max_mu = out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy())
            # Apply MLPG
            # (T, D_out) -> (T, static_dim)
            out_feats = multi_stream_mlpg(
                max_mu,
                max_sigma_sq,
                get_windows(model_config.num_windows),
                model_config.stream_sizes,
                model_config.has_dynamic_features,
            )
        else:
            # (T, D_out)
            out_feats = max_mu.squeeze(0).cpu().data.numpy()
            out_feats = out_scaler.inverse_transform(out_feats)
    else:
        out_feats = (model.inference(
            in_feats, [in_feats.shape[1]]).squeeze(0).cpu().data.numpy())
        out_feats = out_scaler.inverse_transform(out_feats)

        # Apply MLPG if necessary
        if np.any(model_config.has_dynamic_features):
            out_feats = multi_stream_mlpg(
                out_feats,
                out_scaler.var_,
                get_windows(model_config.num_windows),
                model_config.stream_sizes,
                model_config.has_dynamic_features,
            )

    return out_feats.astype(np.float32)
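
The MLPG step used throughout these snippets can be illustrated in isolation. Below is a minimal, self-contained sketch for a single feature stream using nnmnkwii directly; multi_stream_mlpg applies the same operation per stream, with the window set returned by get_windows(3). The shapes and window coefficients are the standard static/delta/delta-delta convention, not values taken from a real model.

import numpy as np
from nnmnkwii.paramgen import mlpg

T, static_dim = 100, 60
windows = [
    (0, 0, np.array([1.0])),             # static
    (1, 1, np.array([-0.5, 0.0, 0.5])),  # delta
    (1, 1, np.array([1.0, -2.0, 1.0])),  # delta-delta
]
# Denormalized means and variances for static + delta + delta-delta
means = np.random.randn(T, 3 * static_dim)
variances = np.ones((T, 3 * static_dim))
static_feats = mlpg(means, variances, windows)  # -> (T, static_dim)
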
Example #3
def my_app(config: DictConfig) -> None:
    global logger
    logger = getLogger(config.verbose)
    logger.info(OmegaConf.to_yaml(config))

    device = torch.device("cuda" if use_cuda else "cpu")
    in_dir = to_absolute_path(config.in_dir)
    out_dir = to_absolute_path(config.out_dir)
    os.makedirs(out_dir, exist_ok=True)

    model_config = OmegaConf.load(to_absolute_path(config.model.model_yaml))
    model = hydra.utils.instantiate(model_config.netG).to(device)
    checkpoint = torch.load(to_absolute_path(config.model.checkpoint),
                            map_location=lambda storage, loc: storage)
    model.load_state_dict(checkpoint["state_dict"])

    scaler = joblib.load(to_absolute_path(config.out_scaler_path))

    in_feats = FileSourceDataset(NpyFileSource(in_dir))

    with torch.no_grad():
        for idx in tqdm(range(len(in_feats))):
            feats = torch.from_numpy(in_feats[idx]).unsqueeze(0).to(device)
            out = model(feats, [feats.shape[1]]).squeeze(0).cpu().data.numpy()

            out = scaler.inverse_transform(out)

            # Apply MLPG if necessary
            if np.any(model_config.has_dynamic_features):
                windows = get_windows(3)
                out = multi_stream_mlpg(
                    out, scaler.var_, windows, model_config.stream_sizes,
                    model_config.has_dynamic_features)

            name = basename(in_feats.collected_files[idx][0])
            out_path = join(out_dir, name)
            np.save(out_path, out, allow_pickle=False)
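
The FileSourceDataset/NpyFileSource pair above follows nnmnkwii's data-source pattern. A minimal sketch of how such a source might look (NpyFileSource itself is assumed to behave roughly like this simplified version):

from glob import glob
from os.path import join

import numpy as np
from nnmnkwii.datasets import FileDataSource, FileSourceDataset

class SimpleNpySource(FileDataSource):
    def __init__(self, data_root):
        self.data_root = data_root

    def collect_files(self):
        return sorted(glob(join(self.data_root, "*.npy")))

    def collect_features(self, path):
        return np.load(path)

dataset = FileSourceDataset(SimpleNpySource("data/in_feats"))  # hypothetical path
first_utt = dataset[0]  # features are loaded lazily on indexing
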
Example #4
def predict_acoustic(
    device,
    labels,
    acoustic_model,
    acoustic_config,
    acoustic_in_scaler,
    acoustic_out_scaler,
    binary_dict,
    numeric_dict,
    subphone_features="coarse_coding",
    pitch_indices=None,
    log_f0_conditioning=True,
    force_clip_input_features=False,
):
    """Predict acoustic features from HTS labels

    MLPG is applied to the predicted features if the output features have
    dynamic features.

    Args:
        device (torch.device): device to use
        labels (HTSLabelFile): HTS labels
        acoustic_model (nn.Module): acoustic model
        acoustic_config (AcousticConfig): acoustic configuration
        acoustic_in_scaler (sklearn.preprocessing.StandardScaler): input scaler
        acoustic_out_scaler (sklearn.preprocessing.StandardScaler): output scaler
        binary_dict (dict): binary feature dictionary
        numeric_dict (dict): numeric feature dictionary
        subphone_features (str): subphone feature type
        pitch_indices (list): indices of pitch features
        log_f0_conditioning (bool): whether to use log f0 conditioning
        force_clip_input_features (bool): whether to force clip input features

    Returns:
        ndarray: predicted acoustic features
    """
    # Musical/linguistic features
    linguistic_features = fe.linguistic_features(
        labels,
        binary_dict,
        numeric_dict,
        add_frame_features=True,
        subphone_features=subphone_features,
    )

    if log_f0_conditioning:
        for idx in pitch_indices:
            linguistic_features[:, idx] = interp1d(
                _midi_to_hz(linguistic_features, idx, log_f0_conditioning),
                kind="slinear",
            )

    # Apply normalization
    linguistic_features = acoustic_in_scaler.transform(linguistic_features)
    if force_clip_input_features and isinstance(acoustic_in_scaler, MinMaxScaler):
        # clip to feature range (except for pitch-related features)
        non_pitch_indices = [
            idx
            for idx in range(linguistic_features.shape[1])
            if idx not in pitch_indices
        ]
        linguistic_features[:, non_pitch_indices] = np.clip(
            linguistic_features[:, non_pitch_indices],
            acoustic_in_scaler.feature_range[0],
            acoustic_in_scaler.feature_range[1],
        )

    # Predict acoustic features
    x = torch.from_numpy(linguistic_features).float().to(device)
    x = x.view(1, -1, x.size(-1))

    if acoustic_model.prediction_type() == PredictionType.PROBABILISTIC:
        # (B, T, D_out)
        max_mu, max_sigma = acoustic_model.inference(x, [x.shape[1]])
        if np.any(acoustic_config.has_dynamic_features):
            # Apply denormalization
            # (B, T, D_out) -> (T, D_out)
            max_sigma_sq = (
                max_sigma.squeeze(0).cpu().data.numpy() ** 2 * acoustic_out_scaler.var_
            )
            max_sigma_sq = np.maximum(max_sigma_sq, 1e-14)
            max_mu = acoustic_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy()
            )

            # (T, D_out) -> (T, static_dim)
            pred_acoustic = multi_stream_mlpg(
                max_mu,
                max_sigma_sq,
                get_windows(acoustic_config.num_windows),
                acoustic_config.stream_sizes,
                acoustic_config.has_dynamic_features,
            )
        else:
            # Apply denormalization
            pred_acoustic = acoustic_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy()
            )
    else:
        # (T, D_out)
        pred_acoustic = (
            acoustic_model.inference(x, [x.shape[1]]).squeeze(0).cpu().data.numpy()
        )
        # Apply denormalization
        pred_acoustic = acoustic_out_scaler.inverse_transform(pred_acoustic)
        if np.any(acoustic_config.has_dynamic_features):
            # (T, D_out) -> (T, static_dim)
            pred_acoustic = multi_stream_mlpg(
                pred_acoustic,
                acoustic_out_scaler.var_,
                get_windows(acoustic_config.num_windows),
                acoustic_config.stream_sizes,
                acoustic_config.has_dynamic_features,
            )

    return pred_acoustic
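
A hypothetical call sketch: all heavyweight objects (trained model, scalers, feature dictionaries, parsed labels) are assumed to be loaded elsewhere; only the calling convention is shown.

import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
acoustic_features = predict_acoustic(
    device,
    labels,               # HTSLabelFile, assumed loaded
    acoustic_model,       # trained acoustic model, assumed loaded
    acoustic_config,
    acoustic_in_scaler,
    acoustic_out_scaler,
    binary_dict,
    numeric_dict,
    subphone_features="coarse_coding",
    pitch_indices=pitch_indices,
    log_f0_conditioning=True,
)
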
Example #5
def predict_timelag(
    device,
    labels,
    timelag_model,
    timelag_config,
    timelag_in_scaler,
    timelag_out_scaler,
    binary_dict,
    numeric_dict,
    pitch_indices=None,
    log_f0_conditioning=True,
    allowed_range=None,
    allowed_range_rest=None,
    force_clip_input_features=False,
):
    """Predict time-lag from HTS labels

    Args:
        device (torch.device): device
        labels (nnmnkwii.io.hts.HTSLabelFile): HTS-style labels
        timelag_model (nn.Module): time-lag model
        timelag_config (dict): time-lag model config
        timelag_in_scaler (sklearn.preprocessing.MinMaxScaler): input scaler
        timelag_out_scaler (sklearn.preprocessing.MinMaxScaler): output scaler
        binary_dict (dict): binary feature dict
        numeric_dict (dict): numeric feature dict
        pitch_indices (list): indices of pitch features
        log_f0_conditioning (bool): whether to condition on log f0
        allowed_range (list): allowed range of time-lag
        allowed_range_rest (list): allowed range of time-lag for rest
        force_clip_input_features (bool): whether to clip input features

    Returns:
        ndarray: time-lag predictions
    """
    if allowed_range is None:
        allowed_range = [-20, 20]
    if allowed_range_rest is None:
        allowed_range_rest = [-40, 40]
    # round start/end times just in case.
    labels.round_()

    # Extract note-level labels
    note_indices = get_note_indices(labels)
    note_labels = labels[note_indices]

    # Extract musical/linguistic context
    timelag_linguistic_features = fe.linguistic_features(
        note_labels,
        binary_dict,
        numeric_dict,
        add_frame_features=False,
        subphone_features=None,
    ).astype(np.float32)

    # Adjust input features if we use log-f0 conditioning
    if log_f0_conditioning:
        if pitch_indices is None:
            raise ValueError("Pitch feature indices must be specified!")
        for idx in pitch_indices:
            timelag_linguistic_features[:, idx] = interp1d(
                _midi_to_hz(timelag_linguistic_features, idx, log_f0_conditioning),
                kind="slinear",
            )

    # Normalization
    timelag_linguistic_features = timelag_in_scaler.transform(
        timelag_linguistic_features
    )
    if force_clip_input_features and isinstance(timelag_in_scaler, MinMaxScaler):
        # clip to feature range (except for pitch-related features)
        non_pitch_indices = [
            idx
            for idx in range(timelag_linguistic_features.shape[1])
            if idx not in pitch_indices
        ]
        timelag_linguistic_features[:, non_pitch_indices] = np.clip(
            timelag_linguistic_features[:, non_pitch_indices],
            timelag_in_scaler.feature_range[0],
            timelag_in_scaler.feature_range[1],
        )

    # Prepare model input
    x = torch.from_numpy(timelag_linguistic_features).unsqueeze(0).to(device)

    # Run model
    if timelag_model.prediction_type() == PredictionType.PROBABILISTIC:
        # (B, T, D_out)
        max_mu, max_sigma = timelag_model.inference(x, [x.shape[1]])
        if np.any(timelag_config.has_dynamic_features):
            # Apply denormalization
            # (B, T, D_out) -> (T, D_out)
            max_sigma_sq = (
                max_sigma.squeeze(0).cpu().data.numpy() ** 2 * timelag_out_scaler.var_
            )
            max_sigma_sq = np.maximum(max_sigma_sq, 1e-14)
            max_mu = timelag_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy()
            )
            # (T, D_out) -> (T, static_dim)
            pred_timelag = multi_stream_mlpg(
                max_mu,
                max_sigma_sq,
                get_windows(timelag_config.num_windows),
                timelag_config.stream_sizes,
                timelag_config.has_dynamic_features,
            )
        else:
            # Apply denormalization
            pred_timelag = timelag_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy()
            )
    else:
        # (T, D_out)
        pred_timelag = (
            timelag_model.inference(x, [x.shape[1]]).squeeze(0).cpu().data.numpy()
        )
        # Apply denormalization
        pred_timelag = timelag_out_scaler.inverse_transform(pred_timelag)
        if np.any(timelag_config.has_dynamic_features):
            # (T, D_out) -> (T, static_dim)
            pred_timelag = multi_stream_mlpg(
                pred_timelag,
                timelag_out_scaler.var_,
                get_windows(timelag_config.num_windows),
                timelag_config.stream_sizes,
                timelag_config.has_dynamic_features,
            )

    # Rounding
    pred_timelag = np.round(pred_timelag)

    # Clip to the allowed range
    for idx in range(len(pred_timelag)):
        if _is_silence(note_labels.contexts[idx]):
            pred_timelag[idx] = np.clip(
                pred_timelag[idx], allowed_range_rest[0], allowed_range_rest[1]
            )
        else:
            pred_timelag[idx] = np.clip(
                pred_timelag[idx], allowed_range[0], allowed_range[1]
            )

    # frames -> 100 ns units (1 frame = 5 ms = 50,000 x 100 ns)
    pred_timelag *= 50000

    return pred_timelag
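
A quick unit check of the final scaling (assuming the 5 ms frame period used elsewhere in these snippets, and HTS label times in units of 100 ns):

frame_period_ms = 5
units_per_frame = round(frame_period_ms * 1e-3 / 1e-7)
assert units_per_frame == 50000  # 1 frame = 5 ms = 50,000 x 100 ns
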
Example #6
def predict_duration(
    device,
    labels,
    duration_model,
    duration_config,
    duration_in_scaler,
    duration_out_scaler,
    binary_dict,
    numeric_dict,
    pitch_indices=None,
    log_f0_conditioning=True,
    force_clip_input_features=False,
):
    """Predict phoneme durations from HTS labels

    Args:
        device (torch.device): device to run the model on
        labels (nnmnkwii.io.hts.HTSLabelFile): labels
        duration_model (nn.Module): duration model
        duration_config (dict): duration config
        duration_in_scaler (sklearn.preprocessing.MinMaxScaler): duration input scaler
        duration_out_scaler (sklearn.preprocessing.MinMaxScaler): duration output scaler
        binary_dict (dict): binary feature dictionary
        numeric_dict (dict): numeric feature dictionary
        pitch_indices (list): indices of pitch features
        log_f0_conditioning (bool): whether to use log-f0 conditioning
        force_clip_input_features (bool): whether to clip input features

    Returns:
        np.ndarray: predicted durations. For probabilistic duration models,
            the denormalized (max_mu, max_sigma_sq) pair is returned instead.
    """
    # Extract musical/linguistic features
    duration_linguistic_features = fe.linguistic_features(
        labels,
        binary_dict,
        numeric_dict,
        add_frame_features=False,
        subphone_features=None,
    ).astype(np.float32)

    if log_f0_conditioning:
        for idx in pitch_indices:
            duration_linguistic_features[:, idx] = interp1d(
                _midi_to_hz(duration_linguistic_features, idx, log_f0_conditioning),
                kind="slinear",
            )

    # Apply normalization
    duration_linguistic_features = duration_in_scaler.transform(
        duration_linguistic_features
    )
    if force_clip_input_features and isinstance(duration_in_scaler, MinMaxScaler):
        # clip to feature range (except for pitch-related features)
        non_pitch_indices = [
            idx
            for idx in range(duration_linguistic_features.shape[1])
            if idx not in pitch_indices
        ]
        duration_linguistic_features[:, non_pitch_indices] = np.clip(
            duration_linguistic_features[:, non_pitch_indices],
            duration_in_scaler.feature_range[0],
            duration_in_scaler.feature_range[1],
        )

    # Apply model
    x = torch.from_numpy(duration_linguistic_features).float().to(device)
    x = x.view(1, -1, x.size(-1))

    if duration_model.prediction_type() == PredictionType.PROBABILISTIC:
        # (B, T, D_out)
        max_mu, max_sigma = duration_model.inference(x, [x.shape[1]])
        if np.any(duration_config.has_dynamic_features):
            raise RuntimeError(
                "Dynamic features are not supported for duration modeling"
            )
        # Apply denormalization
        max_sigma_sq = (
            max_sigma.squeeze(0).cpu().data.numpy() ** 2 * duration_out_scaler.var_
        )
        max_sigma_sq = np.maximum(max_sigma_sq, 1e-14)
        max_mu = duration_out_scaler.inverse_transform(
            max_mu.squeeze(0).cpu().data.numpy()
        )

        return max_mu, max_sigma_sq
    else:
        # (T, D_out)
        pred_durations = (
            duration_model.inference(x, [x.shape[1]]).squeeze(0).cpu().data.numpy()
        )
        # Apply denormalization
        pred_durations = duration_out_scaler.inverse_transform(pred_durations)
        if np.any(duration_config.has_dynamic_features):
            # (T, D_out) -> (T, static_dim)
            pred_durations = multi_stream_mlpg(
                pred_durations,
                duration_out_scaler.var_,
                get_windows(duration_config.num_windows),
                duration_config.stream_sizes,
                duration_config.has_dynamic_features,
            )

    pred_durations = np.round(pred_durations)
    # Enforce a minimum of one frame; rounding first prevents values in
    # (0, 0.5) from collapsing to zero-length durations
    pred_durations[pred_durations <= 0] = 1

    return pred_durations
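
A tiny numeric check of the duration post-processing above (round first, then enforce a one-frame minimum):

import numpy as np

raw = np.array([0.4, -0.3, 2.6])
out = np.round(raw)
out[out <= 0] = 1
print(out)  # [1. 1. 3.]
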
Example #7
def predict_timelag(device,
                    labels,
                    timelag_model,
                    timelag_config,
                    timelag_in_scaler,
                    timelag_out_scaler,
                    binary_dict,
                    continuous_dict,
                    pitch_indices=None,
                    log_f0_conditioning=True,
                    allowed_range=None,
                    allowed_range_rest=None):
    # Use None defaults to avoid mutable default arguments
    if allowed_range is None:
        allowed_range = [-20, 20]
    if allowed_range_rest is None:
        allowed_range_rest = [-40, 40]
    # round start/end times just in case.
    labels.round_()

    # Extract note-level labels
    note_indices = get_note_indices(labels)
    note_labels = labels[note_indices]

    # Extract musical/linguistic context
    timelag_linguistic_features = fe.linguistic_features(
        note_labels,
        binary_dict,
        continuous_dict,
        add_frame_features=False,
        subphone_features=None).astype(np.float32)

    # Adjust input features if we use log-f0 conditioning
    if log_f0_conditioning:
        if pitch_indices is None:
            raise ValueError("Pitch feature indices must be specified!")
        for idx in pitch_indices:
            timelag_linguistic_features[:, idx] = interp1d(_midi_to_hz(
                timelag_linguistic_features, idx, log_f0_conditioning),
                                                           kind="slinear")

    # Normalization
    timelag_linguistic_features = timelag_in_scaler.transform(
        timelag_linguistic_features)
    if isinstance(timelag_in_scaler, MinMaxScaler):
        # clip to feature range
        timelag_linguistic_features = np.clip(
            timelag_linguistic_features, timelag_in_scaler.feature_range[0],
            timelag_in_scaler.feature_range[1])

    # Prepare model input
    x = torch.from_numpy(timelag_linguistic_features).unsqueeze(0).to(device)

    # Run model
    if timelag_model.prediction_type() == PredictionType.PROBABILISTIC:
        # (B, T, D_out)
        log_pi, log_sigma, mu = timelag_model.inference(x, [x.shape[1]])
        if np.any(timelag_config.has_dynamic_features):
            max_sigma, max_mu = mdn_get_most_probable_sigma_and_mu(
                log_pi, log_sigma, mu)
            # Apply denormalization
            # (B, T, D_out) -> (T, D_out)
            max_sigma_sq = max_sigma.squeeze(
                0).cpu().data.numpy()**2 * timelag_out_scaler.var_
            max_mu = timelag_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy())
            # (T, D_out) -> (T, static_dim)
            pred_timelag = multi_stream_mlpg(
                max_mu, max_sigma_sq, get_windows(timelag_config.num_windows),
                timelag_config.stream_sizes,
                timelag_config.has_dynamic_features)
        else:
            _, max_mu = mdn_get_most_probable_sigma_and_mu(
                log_pi, log_sigma, mu)
            # Apply denormalization
            pred_timelag = timelag_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy())
    else:
        # (T, D_out)
        pred_timelag = timelag_model.inference(
            x, [x.shape[1]]).squeeze(0).cpu().data.numpy()
        # Apply denormalization
        pred_timelag = timelag_out_scaler.inverse_transform(pred_timelag)
        if np.any(timelag_config.has_dynamic_features):
            # (T, D_out) -> (T, static_dim)
            pred_timelag = multi_stream_mlpg(
                pred_timelag, timelag_out_scaler.var_,
                get_windows(timelag_config.num_windows),
                timelag_config.stream_sizes,
                timelag_config.has_dynamic_features)

    # Rounding
    pred_timelag = np.round(pred_timelag)

    # Clip to the allowed range
    for idx in range(len(pred_timelag)):
        if _is_silence(note_labels.contexts[idx]):
            pred_timelag[idx] = np.clip(pred_timelag[idx],
                                        allowed_range_rest[0],
                                        allowed_range_rest[1])
        else:
            pred_timelag[idx] = np.clip(pred_timelag[idx], allowed_range[0],
                                        allowed_range[1])

    # frames -> 100 ns units (1 frame = 5 ms = 50,000 x 100 ns)
    pred_timelag *= 50000

    return pred_timelag
Example #8
def predict_acoustic(device,
                     labels,
                     acoustic_model,
                     acoustic_config,
                     acoustic_in_scaler,
                     acoustic_out_scaler,
                     binary_dict,
                     continuous_dict,
                     subphone_features="coarse_coding",
                     pitch_indices=None,
                     log_f0_conditioning=True):

    # Musical/linguistic features
    linguistic_features = fe.linguistic_features(
        labels,
        binary_dict,
        continuous_dict,
        add_frame_features=True,
        subphone_features=subphone_features)

    if log_f0_conditioning:
        for idx in pitch_indices:
            linguistic_features[:, idx] = interp1d(_midi_to_hz(
                linguistic_features, idx, log_f0_conditioning),
                                                   kind="slinear")

    # Apply normalization
    linguistic_features = acoustic_in_scaler.transform(linguistic_features)
    if isinstance(acoustic_in_scaler, MinMaxScaler):
        # clip to feature range
        linguistic_features = np.clip(linguistic_features,
                                      acoustic_in_scaler.feature_range[0],
                                      acoustic_in_scaler.feature_range[1])

    # Predict acoustic features
    x = torch.from_numpy(linguistic_features).float().to(device)
    x = x.view(1, -1, x.size(-1))

    if acoustic_model.prediction_type() == PredictionType.PROBABILISTIC:
        log_pi, log_sigma, mu = acoustic_model.inference(x, [x.shape[1]])
        if np.any(acoustic_config.has_dynamic_features):
            # (B, T, D_out)
            max_sigma, max_mu = mdn_get_most_probable_sigma_and_mu(
                log_pi, log_sigma, mu)

            # Apply denormalization
            # (B, T, D_out) -> (T, D_out)
            max_sigma_sq = max_sigma.squeeze(
                0).cpu().data.numpy()**2 * acoustic_out_scaler.var_
            max_mu = acoustic_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy())

            # (T, D_out) -> (T, static_dim)
            pred_acoustic = multi_stream_mlpg(
                max_mu, max_sigma_sq, get_windows(acoustic_config.num_windows),
                acoustic_config.stream_sizes,
                acoustic_config.has_dynamic_features)
        else:
            _, max_mu = mdn_get_most_probable_sigma_and_mu(
                log_pi, log_sigma, mu)
            # Apply denormalization
            pred_acoustic = acoustic_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy())
    else:
        # (T, D_out)
        pred_acoustic = acoustic_model.inference(
            x, [x.shape[1]]).squeeze(0).cpu().data.numpy()
        # Apply denormalization
        pred_acoustic = acoustic_out_scaler.inverse_transform(pred_acoustic)
        if np.any(acoustic_config.has_dynamic_features):
            # (T, D_out) -> (T, static_dim)
            pred_acoustic = multi_stream_mlpg(
                pred_acoustic, acoustic_out_scaler.var_,
                get_windows(acoustic_config.num_windows),
                acoustic_config.stream_sizes,
                acoustic_config.has_dynamic_features)

    return pred_acoustic
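
mdn_get_most_probable_sigma_and_mu, used in the probabilistic branches above, selects per frame the Gaussian mixture component with the highest mixture weight. A hedged sketch of that selection in torch (shapes follow a (B, T, G, D_out) MDN convention; the real helper may differ in details):

import torch

def most_probable_sigma_mu(log_pi, log_sigma, mu):
    # log_pi: (B, T, G); log_sigma, mu: (B, T, G, D_out)
    best = log_pi.argmax(dim=-1)  # (B, T): index of the dominant component
    idx = best[..., None, None].expand(-1, -1, 1, mu.size(-1))
    max_mu = mu.gather(2, idx).squeeze(2)                  # (B, T, D_out)
    max_sigma = log_sigma.exp().gather(2, idx).squeeze(2)  # (B, T, D_out)
    return max_sigma, max_mu
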
Example #9
def predict_duration(device,
                     labels,
                     duration_model,
                     duration_config,
                     duration_in_scaler,
                     duration_out_scaler,
                     lag,
                     binary_dict,
                     continuous_dict,
                     pitch_indices=None,
                     log_f0_conditioning=True):
    # Extract musical/linguistic features
    duration_linguistic_features = fe.linguistic_features(
        labels,
        binary_dict,
        continuous_dict,
        add_frame_features=False,
        subphone_features=None).astype(np.float32)

    if log_f0_conditioning:
        for idx in pitch_indices:
            duration_linguistic_features[:, idx] = interp1d(_midi_to_hz(
                duration_linguistic_features, idx, log_f0_conditioning),
                                                            kind="slinear")

    # Apply normalization
    duration_linguistic_features = duration_in_scaler.transform(
        duration_linguistic_features)
    if isinstance(duration_in_scaler, MinMaxScaler):
        # clip to feature range
        duration_linguistic_features = np.clip(
            duration_linguistic_features, duration_in_scaler.feature_range[0],
            duration_in_scaler.feature_range[1])

    # Apply model
    x = torch.from_numpy(duration_linguistic_features).float().to(device)
    x = x.view(1, -1, x.size(-1))

    if duration_model.prediction_type() == PredictionType.PROBABILISTIC:
        # (B, T, D_out)
        log_pi, log_sigma, mu = duration_model.inference(x, [x.shape[1]])
        if np.any(duration_config.has_dynamic_features):
            max_sigma, max_mu = mdn_get_most_probable_sigma_and_mu(
                log_pi, log_sigma, mu)
            # Apply denormalization
            # (B, T, D_out) -> (T, D_out)
            max_sigma_sq = max_sigma.squeeze(
                0).cpu().data.numpy()**2 * duration_out_scaler.var_
            max_mu = duration_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy())

            # (T, D_out) -> (T, static_dim)
            pred_durations = multi_stream_mlpg(
                max_mu, max_sigma_sq, get_windows(duration_config.num_windows),
                duration_config.stream_sizes,
                duration_config.has_dynamic_features)
        else:
            _, max_mu = mdn_get_most_probable_sigma_and_mu(
                log_pi, log_sigma, mu)
            # Apply denormalization
            pred_durations = duration_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy())
    else:
        # (T, D_out)
        pred_durations = duration_model.inference(
            x, [x.shape[1]]).squeeze(0).cpu().data.numpy()
        # Apply denormalization
        pred_durations = duration_out_scaler.inverse_transform(pred_durations)
        if np.any(duration_config.has_dynamic_features):
            # (T, D_out) -> (T, static_dim)
            pred_durations = multi_stream_mlpg(
                pred_durations, duration_out_scaler.var_,
                get_windows(duration_config.num_windows),
                duration_config.stream_sizes,
                duration_config.has_dynamic_features)

    pred_durations = np.round(pred_durations)
    # Enforce a minimum of one frame; rounding first prevents values in
    # (0, 0.5) from collapsing to zero-length durations
    pred_durations[pred_durations <= 0] = 1

    return pred_durations
Example #10
def my_app(config: DictConfig) -> None:
    global logger
    logger = getLogger(config.verbose)
    logger.info(OmegaConf.to_yaml(config))

    device = torch.device("cuda" if use_cuda else "cpu")
    in_dir = to_absolute_path(config.in_dir)
    out_dir = to_absolute_path(config.out_dir)
    os.makedirs(out_dir, exist_ok=True)

    model_config = OmegaConf.load(to_absolute_path(config.model.model_yaml))
    model = hydra.utils.instantiate(model_config.netG).to(device)
    checkpoint = torch.load(to_absolute_path(config.model.checkpoint),
                            map_location=lambda storage, loc: storage)
    model.load_state_dict(checkpoint["state_dict"])

    scaler = joblib.load(to_absolute_path(config.out_scaler_path))

    in_feats = FileSourceDataset(NpyFileSource(in_dir))

    with torch.no_grad():
        for idx in tqdm(range(len(in_feats))):
            feats = torch.from_numpy(in_feats[idx]).unsqueeze(0).to(device)

            if model.prediction_type() == PredictionType.PROBABILISTIC:
                max_mu, max_sigma = model.inference(feats, [feats.shape[1]])

                if np.any(model_config.has_dynamic_features):
                    # Apply denormalization
                    # (B, T, D_out) -> (T, D_out)
                    max_sigma_sq = max_sigma.squeeze(
                        0).cpu().data.numpy()**2 * scaler.var_
                    max_mu = scaler.inverse_transform(
                        max_mu.squeeze(0).cpu().data.numpy())
                    # Apply MLPG
                    # (T, D_out) -> (T, static_dim)
                    out = multi_stream_mlpg(
                        max_mu, max_sigma_sq,
                        get_windows(model_config.num_windows),
                        model_config.stream_sizes,
                        model_config.has_dynamic_features)

                else:
                    # (T, D_out)
                    out = max_mu.squeeze(0).cpu().data.numpy()
                    out = scaler.inverse_transform(out)
            else:
                out = model.inference(
                    feats, [feats.shape[1]]).squeeze(0).cpu().data.numpy()
                out = scaler.inverse_transform(out)

                # Apply MLPG if necessary
                if np.any(model_config.has_dynamic_features):
                    out = multi_stream_mlpg(
                        out, scaler.var_,
                        get_windows(model_config.num_windows),
                        model_config.stream_sizes,
                        model_config.has_dynamic_features)

            name = basename(in_feats.collected_files[idx][0])
            out_path = join(out_dir, name)
            np.save(out_path, out, allow_pickle=False)
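
The config keys this entry point reads can be inferred from the code above. A hedged sketch of an equivalent config built programmatically (the real project config may contain more fields; all paths are hypothetical):

from omegaconf import OmegaConf

config = OmegaConf.create({
    "verbose": 100,
    "in_dir": "data/in_feats",
    "out_dir": "outputs/static_feats",
    "out_scaler_path": "checkpoints/out_scaler.joblib",
    "model": {
        "model_yaml": "checkpoints/model.yaml",
        "checkpoint": "checkpoints/latest.pth",
    },
})
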
Example #11
def eval_spss_model(
    step,
    netG,
    in_feats,
    out_feats,
    lengths,
    model_config,
    out_scaler,
    writer,
    sr,
    trajectory_smoothing=True,
    trajectory_smoothing_cutoff=50,
):
    # make sure to be in eval mode
    netG.eval()
    is_autoregressive = (netG.module.is_autoregressive() if isinstance(
        netG, nn.DataParallel) else netG.is_autoregressive())
    prediction_type = (netG.module.prediction_type() if isinstance(
        netG, nn.DataParallel) else netG.prediction_type())
    utt_indices = [-1, -2, -3]
    utt_indices = utt_indices[:min(3, len(in_feats))]

    if np.any(model_config.has_dynamic_features):
        static_stream_sizes = get_static_stream_sizes(
            model_config.stream_sizes,
            model_config.has_dynamic_features,
            model_config.num_windows,
        )
    else:
        static_stream_sizes = model_config.stream_sizes

    for utt_idx in utt_indices:
        out_feats_denorm_ = out_scaler.inverse_transform(
            out_feats[utt_idx, :lengths[utt_idx]].unsqueeze(0))
        mgc, lf0, vuv, bap = get_static_features(
            out_feats_denorm_,
            model_config.num_windows,
            model_config.stream_sizes,
            model_config.has_dynamic_features,
        )[:4]
        mgc = mgc.squeeze(0).cpu().numpy()
        lf0 = lf0.squeeze(0).cpu().numpy()
        vuv = vuv.squeeze(0).cpu().numpy()
        bap = bap.squeeze(0).cpu().numpy()

        f0, spectrogram, aperiodicity = gen_world_params(
            mgc, lf0, vuv, bap, sr)
        wav = pyworld.synthesize(f0, spectrogram, aperiodicity, sr, 5)
        group = f"utt{np.abs(utt_idx)}_reference"
        wav = wav / np.abs(wav).max() if np.abs(wav).max() > 1.0 else wav
        writer.add_audio(group, wav, step, sr)

        # Run forward
        if is_autoregressive:
            outs = netG(
                in_feats[utt_idx, :lengths[utt_idx]].unsqueeze(0),
                [lengths[utt_idx]],
                out_feats[utt_idx, :lengths[utt_idx]].unsqueeze(0),
            )
        else:
            outs = netG(in_feats[utt_idx, :lengths[utt_idx]].unsqueeze(0),
                        [lengths[utt_idx]])

        # ResF0 case
        if isinstance(outs, tuple) and len(outs) == 2:
            outs, _ = outs

        if prediction_type == PredictionType.PROBABILISTIC:
            pi, sigma, mu = outs
            pred_out_feats = mdn_get_most_probable_sigma_and_mu(pi, sigma,
                                                                mu)[1]
        else:
            pred_out_feats = outs
        # NOTE: multiple outputs
        if isinstance(pred_out_feats, list):
            pred_out_feats = pred_out_feats[-1]
        if isinstance(pred_out_feats, tuple):
            pred_out_feats = pred_out_feats[0]

        if not isinstance(pred_out_feats, list):
            pred_out_feats = [pred_out_feats]

        # Run inference
        if prediction_type == PredictionType.PROBABILISTIC:
            inference_out_feats, _ = netG.inference(
                in_feats[utt_idx, :lengths[utt_idx]].unsqueeze(0),
                [lengths[utt_idx]])
        else:
            inference_out_feats = netG.inference(
                in_feats[utt_idx, :lengths[utt_idx]].unsqueeze(0),
                [lengths[utt_idx]])
        pred_out_feats.append(inference_out_feats)

        # Plot normalized input/output
        in_feats_ = in_feats[utt_idx, :lengths[utt_idx]].cpu().numpy()
        out_feats_ = out_feats[utt_idx, :lengths[utt_idx]].cpu().numpy()
        fig, ax = plt.subplots(3, 1, figsize=(8, 8))
        ax[0].set_title("Reference features")
        ax[1].set_title("Input features")
        ax[2].set_title("Predicted features")
        mesh = librosa.display.specshow(out_feats_.T,
                                        x_axis="frames",
                                        y_axis="frames",
                                        ax=ax[0],
                                        cmap="viridis")
        # NOTE: assuming normalized to N(0, 1)
        mesh.set_clim(-4, 4)
        fig.colorbar(mesh, ax=ax[0])
        mesh = librosa.display.specshow(in_feats_.T,
                                        x_axis="frames",
                                        y_axis="frames",
                                        ax=ax[1],
                                        cmap="viridis")
        mesh.set_clim(-4, 4)
        fig.colorbar(mesh, ax=ax[1])
        mesh = librosa.display.specshow(
            inference_out_feats.squeeze(0).cpu().numpy().T,
            x_axis="frames",
            y_axis="frames",
            ax=ax[2],
            cmap="viridis",
        )
        mesh.set_clim(-4, 4)
        fig.colorbar(mesh, ax=ax[2])
        for ax_ in ax:
            ax_.set_ylabel("Feature")
        plt.tight_layout()
        group = f"utt{np.abs(utt_idx)}_inference"
        writer.add_figure(f"{group}/Input-Output", fig, step)
        plt.close()

        assert len(pred_out_feats) == 2
        for idx, pred_out_feats_ in enumerate(pred_out_feats):
            pred_out_feats_ = pred_out_feats_.squeeze(0).cpu().numpy()
            pred_out_feats_denorm = (out_scaler.inverse_transform(
                torch.from_numpy(pred_out_feats_).to(
                    in_feats.device)).cpu().numpy())
            if np.any(model_config.has_dynamic_features):
                # (T, D_out) -> (T, static_dim)
                pred_out_feats_denorm = multi_stream_mlpg(
                    pred_out_feats_denorm,
                    (out_scaler.scale_**2).cpu().numpy(),
                    get_windows(model_config.num_windows),
                    model_config.stream_sizes,
                    model_config.has_dynamic_features,
                )
            pred_mgc, pred_lf0, pred_vuv, pred_bap = split_streams(
                pred_out_feats_denorm, static_stream_sizes)[:4]

            # Remove high-frequency components of mgc/bap
            # NOTE: this seems effective at suppressing artifacts of GAN-based post-filtering
            if trajectory_smoothing:
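                # 1 / frame shift (5 ms) -> 200 Hz modulation-frequency sampling rate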
                modfs = int(1 / 0.005)
                for d in range(pred_mgc.shape[1]):
                    pred_mgc[:, d] = lowpass_filter(
                        pred_mgc[:, d],
                        modfs,
                        cutoff=trajectory_smoothing_cutoff)
                for d in range(pred_bap.shape[1]):
                    pred_bap[:, d] = lowpass_filter(
                        pred_bap[:, d],
                        modfs,
                        cutoff=trajectory_smoothing_cutoff)

            # Generated sample
            f0, spectrogram, aperiodicity = gen_world_params(
                pred_mgc, pred_lf0, pred_vuv, pred_bap, sr)
            wav = pyworld.synthesize(f0, spectrogram, aperiodicity, sr, 5)
            wav = wav / np.abs(wav).max() if np.abs(wav).max() > 1.0 else wav
            if idx == 1:
                group = f"utt{np.abs(utt_idx)}_inference"
            else:
                group = f"utt{np.abs(utt_idx)}_forward"
            writer.add_audio(group, wav, step, sr)
            plot_spsvs_params(
                step,
                writer,
                mgc,
                lf0,
                vuv,
                bap,
                pred_mgc,
                pred_lf0,
                pred_vuv,
                pred_bap,
                group=group,
                sr=sr,
            )
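
The trajectory smoothing above low-pass filters each mgc/bap dimension along time. An illustrative, self-contained version using scipy directly (the project's lowpass_filter helper is assumed to be similar in spirit; its exact implementation may differ):

import numpy as np
from scipy.signal import butter, filtfilt

def simple_lowpass(x, fs, cutoff=50, order=5):
    b, a = butter(order, cutoff / (0.5 * fs), btype="low")
    return filtfilt(b, a, x)

modfs = int(1 / 0.005)  # one parameter frame every 5 ms -> 200 Hz
track = np.random.randn(400).cumsum()  # a fake parameter trajectory
smoothed = simple_lowpass(track, modfs, cutoff=50)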