Example #1
def postprocess_duration(labels, pred_durations, lag):
    note_indices = get_note_indices(labels)
    # append the end index to cover the last note
    note_indices.append(len(labels))

    output_labels = hts.HTSLabelFile()

    for i in range(1, len(note_indices)):
        # Apply time lag
        p = labels[note_indices[i-1]:note_indices[i]]
        p.start_times = np.minimum(
            np.asarray(p.start_times) + lag[i-1].reshape(-1),
            np.asarray(p.end_times) - 50000 * len(p))
        p.start_times = np.maximum(p.start_times, 0)
        if len(output_labels) > 0:
            p.start_times = np.maximum(p.start_times, output_labels.start_times[-1] + 50000)

        # Compute normalized phoneme durations
        d = fe.duration_features(p)
        d_hat = pred_durations[note_indices[i-1]:note_indices[i]]
        d_norm = d[0] * d_hat / d_hat.sum()
        d_norm = np.round(d_norm)
        d_norm[d_norm <= 0] = 1

        # TODO: better way to adjust?
        if d_norm.sum() != d[0]:
            d_norm[-1] += d[0] - d_norm.sum()
        p.set_durations(d_norm)

        if len(output_labels) > 0:
            output_labels.end_times[-1] = p.start_times[0]
        for n in p:
            output_labels.append(n)

    return output_labels
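
Examples #1-#3 fit together as one inference pipeline: predict per-note time-lags, predict per-phoneme durations, then merge both into adjusted labels. A minimal glue-code sketch using the signatures from Examples #2 and #3 below; all model, scaler, and feature-dictionary objects are assumed to be loaded elsewhere:

# Hypothetical glue code; every name here is assumed to be set up
# as in the surrounding examples.
lag = predict_timelag(device, labels, timelag_model,
                      timelag_in_scaler, timelag_out_scaler,
                      binary_dict, continuous_dict,
                      pitch_indices=pitch_indices)
pred_durations = predict_duration(device, labels, duration_model,
                                  duration_in_scaler, duration_out_scaler,
                                  lag, binary_dict, continuous_dict,
                                  pitch_indices=pitch_indices)
# New HTSLabelFile with time-lag- and duration-adjusted timing
duration_modified_labels = postprocess_duration(labels, pred_durations, lag)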
Example #2
def predict_timelag(device,
                    labels,
                    timelag_model,
                    timelag_in_scaler,
                    timelag_out_scaler,
                    binary_dict,
                    continuous_dict,
                    pitch_indices=None,
                    log_f0_conditioning=True,
                    allowed_range=[-30, 30]):
    # round start/end times just in case.
    labels.round_()

    # Extract note-level labels
    note_indices = get_note_indices(labels)
    note_labels = labels[note_indices]

    # Extract musical/linguistic context
    timelag_linguistic_features = fe.linguistic_features(
        note_labels,
        binary_dict,
        continuous_dict,
        add_frame_features=False,
        subphone_features=None).astype(np.float32)

    # Adjust input features if we use log-f0 conditioning
    if log_f0_conditioning:
        if pitch_indices is None:
            raise ValueError("Pitch feature indices must be specified!")
        for idx in pitch_indices:
            timelag_linguistic_features[:, idx] = interp1d(
                _midi_to_hz(timelag_linguistic_features, idx, log_f0_conditioning),
                kind="slinear")

    # Normalization
    timelag_linguistic_features = timelag_in_scaler.transform(
        timelag_linguistic_features)

    # Run model
    x = torch.from_numpy(timelag_linguistic_features).unsqueeze(0).to(device)
    y = timelag_model(x, [x.shape[1]]).squeeze(0).cpu()

    # De-normalization and rounding
    lag = np.round(timelag_out_scaler.inverse_transform(y.data.numpy()))

    # Clip to the allowed range
    lag = np.clip(lag, allowed_range[0], allowed_range[1])

    # frames -> 100 ns
    lag *= 50000

    return lag
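
The final lag *= 50000 converts frames to HTS label time units: labels count time in 100 ns steps and these examples assume a 5 ms frame shift, so one frame spans 0.005 s / 1e-7 s = 50000 units. A quick sanity check (both constants are assumptions read off the code, not taken from a config):

FRAME_SHIFT_SEC = 0.005   # 5 ms frame shift assumed throughout these examples
HTS_UNIT_SEC = 1e-7       # HTS labels use 100 ns time units
assert round(FRAME_SHIFT_SEC / HTS_UNIT_SEC) == 50000
lag_in_frames = 3
lag_in_100ns = lag_in_frames * 50000   # 150000 units == 15 ms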
Example #3
def predict_duration(device,
                     labels,
                     duration_model,
                     duration_in_scaler,
                     duration_out_scaler,
                     lag,
                     binary_dict,
                     continuous_dict,
                     pitch_indices=None,
                     log_f0_conditioning=True):

    # Get note indices
    note_indices = get_note_indices(labels)
    # append the end index to cover the last note
    note_indices.append(len(labels))

    # Extract musical/linguistic features
    duration_linguistic_features = fe.linguistic_features(
        labels,
        binary_dict,
        continuous_dict,
        add_frame_features=False,
        subphone_features=None).astype(np.float32)

    if log_f0_conditioning:
        for idx in pitch_indices:
            duration_linguistic_features[:, idx] = interp1d(
                _midi_to_hz(duration_linguistic_features, idx, log_f0_conditioning),
                kind="slinear")

    # Apply normalization
    duration_linguistic_features = duration_in_scaler.transform(
        duration_linguistic_features)

    # Apply model
    x = torch.from_numpy(duration_linguistic_features).float().to(device)
    x = x.view(1, -1, x.size(-1))
    pred_durations = duration_model(
        x, [x.shape[1]]).squeeze(0).cpu().data.numpy()

    # Apply denormalization
    pred_durations = duration_out_scaler.inverse_transform(pred_durations)
    # Round to integer frames first, then clamp so that every phoneme
    # gets at least one frame (clamping before rounding could still
    # leave zero-length durations for small positive predictions)
    pred_durations = np.round(pred_durations)
    pred_durations[pred_durations <= 0] = 1

    return pred_durations
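
The order of the last two steps matters: rounding first and clamping second guarantees at least one frame per phoneme, whereas clamping first can still round small positive predictions down to zero. A tiny illustration with made-up values:

import numpy as np

d = np.array([0.3, 2.6, -0.1])
# clamp, then round: small positive values still collapse to zero
bad = np.round(np.where(d <= 0, 1, d))   # -> [0., 3., 1.]
# round, then clamp: every phoneme keeps at least one frame
r = np.round(d)
good = np.where(r <= 0, 1, r)            # -> [1., 3., 1.]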
Example #4
print("Prepare data for time-lag models")
full_lab_align_files = sorted(glob(join(full_align_dir, "*.lab")))
full_lab_score_files = sorted(glob(join(full_score_dir, "*.lab")))
for lab_align_path, lab_score_path in zip(full_lab_align_files,
                                          full_lab_score_files):
    name = basename(lab_align_path)

    lab_align = hts.load(lab_align_path)
    lab_score = hts.load(lab_score_path)

    # NOTE: removing sil/pau may affect the offset computation
    lab_align = remove_sil_and_pau(lab_align)
    lab_score = remove_sil_and_pau(lab_score)

    # Extract note onsets and compute the global offset
    note_indices = get_note_indices(lab_score)

    onset_align = np.asarray(lab_align[note_indices].start_times)
    onset_score = np.asarray(lab_score[note_indices].start_times)

    global_offset = (onset_align - onset_score).mean()
    global_offset = int(round(global_offset / 50000) * 50000)

    # Apply offset correction only when there is a big gap
    apply_offset_correction = np.abs(
        global_offset * 1e-7) > offset_correction_threshold
    if apply_offset_correction:
        print(f"{name}: Global offset (in sec): {global_offset * 1e-7}")
        lab_score.start_times = list(
            np.asarray(lab_score.start_times) + global_offset)
        lab_score.end_times = list(
            np.asarray(lab_score.end_times) + global_offset)
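
The threshold comparison works in seconds (the 1e-7 factor converts from 100 ns units) after snapping the offset to whole 5 ms frames. A worked example with a made-up onset difference and an assumed threshold value:

mean_onset_diff = 123456                                     # 100 ns units (~12.3 ms)
global_offset = int(round(mean_onset_diff / 50000) * 50000)  # -> 100000 (10 ms)
offset_correction_threshold = 0.005                          # seconds (assumed value)
apply = abs(global_offset * 1e-7) > offset_correction_threshold  # 0.01 > 0.005 -> True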
Example #5
File: gen.py Project: r9y9/nnsvs
def predict_timelag(
    device,
    labels,
    timelag_model,
    timelag_config,
    timelag_in_scaler,
    timelag_out_scaler,
    binary_dict,
    numeric_dict,
    pitch_indices=None,
    log_f0_conditioning=True,
    allowed_range=None,
    allowed_range_rest=None,
    force_clip_input_features=False,
):
    """Predict time-lag from HTS labels

    Args:
        device (torch.device): device
        labels (nnmnkwii.io.hts.HTSLabelFile): HTS-style labels
        timelag_model (nn.Module): time-lag model
        timelag_config (dict): time-lag model config
        timelag_in_scaler (sklearn.preprocessing.MinMaxScaler): input scaler
        timelag_out_scaler (sklearn.preprocessing.MinMaxScaler): output scaler
        binary_dict (dict): binary feature dict
        numeric_dict (dict): numeric feature dict
        pitch_indices (list): indices of pitch features
        log_f0_conditioning (bool): whether to condition on log f0
        allowed_range (list): allowed range of time-lag
        allowed_range_rest (list): allowed range of time-lag for rest
        force_clip_input_features (bool): whether to clip input features

    Returns:
        ndarray: time-lag predictions
    """
    if allowed_range is None:
        allowed_range = [-20, 20]
    if allowed_range_rest is None:
        allowed_range_rest = [-40, 40]
    # round start/end times just in case.
    labels.round_()

    # Extract note-level labels
    note_indices = get_note_indices(labels)
    note_labels = labels[note_indices]

    # Extract musical/linguistic context
    timelag_linguistic_features = fe.linguistic_features(
        note_labels,
        binary_dict,
        numeric_dict,
        add_frame_features=False,
        subphone_features=None,
    ).astype(np.float32)

    # Adjust input features if we use log-f0 conditioning
    if log_f0_conditioning:
        if pitch_indices is None:
            raise ValueError("Pitch feature indices must be specified!")
        for idx in pitch_indices:
            timelag_linguistic_features[:, idx] = interp1d(
                _midi_to_hz(timelag_linguistic_features, idx, log_f0_conditioning),
                kind="slinear",
            )

    # Normalization
    timelag_linguistic_features = timelag_in_scaler.transform(
        timelag_linguistic_features
    )
    if force_clip_input_features and isinstance(timelag_in_scaler, MinMaxScaler):
        # clip to feature range (except for pitch-related features)
        non_pitch_indices = [
            idx
            for idx in range(timelag_linguistic_features.shape[1])
            if idx not in pitch_indices
        ]
        timelag_linguistic_features[:, non_pitch_indices] = np.clip(
            timelag_linguistic_features[:, non_pitch_indices],
            timelag_in_scaler.feature_range[0],
            timelag_in_scaler.feature_range[1],
        )

    # Prepare model input
    x = torch.from_numpy(timelag_linguistic_features).unsqueeze(0).to(device)

    # Run model
    if timelag_model.prediction_type() == PredictionType.PROBABILISTIC:
        # (B, T, D_out)
        max_mu, max_sigma = timelag_model.inference(x, [x.shape[1]])
        if np.any(timelag_config.has_dynamic_features):
            # Apply denormalization
            # (B, T, D_out) -> (T, D_out)
            max_sigma_sq = (
                max_sigma.squeeze(0).cpu().data.numpy() ** 2 * timelag_out_scaler.var_
            )
            max_sigma_sq = np.maximum(max_sigma_sq, 1e-14)
            max_mu = timelag_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy()
            )
            # (T, D_out) -> (T, static_dim)
            pred_timelag = multi_stream_mlpg(
                max_mu,
                max_sigma_sq,
                get_windows(timelag_config.num_windows),
                timelag_config.stream_sizes,
                timelag_config.has_dynamic_features,
            )
        else:
            # Apply denormalization
            pred_timelag = timelag_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy()
            )
    else:
        # (T, D_out)
        pred_timelag = (
            timelag_model.inference(x, [x.shape[1]]).squeeze(0).cpu().data.numpy()
        )
        # Apply denormalization
        pred_timelag = timelag_out_scaler.inverse_transform(pred_timelag)
        if np.any(timelag_config.has_dynamic_features):
            # (T, D_out) -> (T, static_dim)
            pred_timelag = multi_stream_mlpg(
                pred_timelag,
                timelag_out_scaler.var_,
                get_windows(timelag_config.num_windows),
                timelag_config.stream_sizes,
                timelag_config.has_dynamic_features,
            )

    # Rounding
    pred_timelag = np.round(pred_timelag)

    # Clip to the allowed range
    for idx in range(len(pred_timelag)):
        if _is_silence(note_labels.contexts[idx]):
            pred_timelag[idx] = np.clip(
                pred_timelag[idx], allowed_range_rest[0], allowed_range_rest[1]
            )
        else:
            pred_timelag[idx] = np.clip(
                pred_timelag[idx], allowed_range[0], allowed_range[1]
            )

    # frames -> 100 ns
    pred_timelag *= 50000

    return pred_timelag
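
When has_dynamic_features is set, the model predicts static + delta streams and multi_stream_mlpg runs maximum-likelihood parameter generation (MLPG) per stream to recover smooth static trajectories. The window set follows the usual Merlin/nnmnkwii convention; a sketch of what get_windows(3) is assumed to return:

import numpy as np

# Assumed Merlin-style windows: static, delta, delta-delta
windows = [
    (0, 0, np.array([1.0])),
    (1, 1, np.array([-0.5, 0.0, 0.5])),
    (1, 1, np.array([1.0, -2.0, 1.0])),
]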
Example #6
File: gen.py Project: r9y9/nnsvs
def postprocess_duration(labels, pred_durations, lag):
    """Post-process durations based on predicted time-lag

    Ref: https://arxiv.org/abs/2108.02776

    Args:
        labels (HTSLabelFile): HTS labels
        pred_durations (array or tuple): predicted durations for non-MDN,
            mean and variance for MDN
        lag (array): predicted time-lag

    Returns:
        HTSLabelFile: labels with adjusted durations
    """
    note_indices = get_note_indices(labels)
    # append the end index to cover the last note
    note_indices.append(len(labels))

    is_mdn = isinstance(pred_durations, tuple) and len(pred_durations) == 2

    output_labels = hts.HTSLabelFile()

    for i in range(1, len(note_indices)):
        p = labels[note_indices[i - 1] : note_indices[i]]

        # Compute note duration with time-lag
        # eq (11)
        L = int(fe.duration_features(p)[0])
        if i < len(note_indices) - 1:
            L_hat = L - (lag[i - 1] - lag[i]) / 50000
        else:
            L_hat = L - (lag[i - 1]) / 50000

        # Prevent negative duration
        L_hat = max(L_hat, 1)

        # adjust the start time of the note
        p.start_times = np.minimum(
            np.asarray(p.start_times) + lag[i - 1].reshape(-1),
            np.asarray(p.end_times) - 50000 * len(p),
        )
        p.start_times = np.maximum(p.start_times, 0)
        if len(output_labels) > 0:
            p.start_times = np.maximum(
                p.start_times, output_labels.start_times[-1] + 50000
            )

        # Compute normalized phoneme durations
        if is_mdn:
            mu = pred_durations[0][note_indices[i - 1] : note_indices[i]]
            sigma_sq = pred_durations[1][note_indices[i - 1] : note_indices[i]]
            # eq (17)
            rho = (L_hat - mu.sum()) / sigma_sq.sum()
            # eq (16)
            d_norm = mu + rho * sigma_sq

            if np.any(d_norm <= 0):
                # eq (12) (using mu as d_hat)
                print(
                    f"Negative phoneme durations are predicted at {i}-th note. "
                    "The note duration: ",
                    f"{round(float(L)*0.005,3)} sec -> {round(float(L_hat)*0.005,3)} sec",
                )
                print(
                    "It's likely that the model couldn't predict correct durations "
                    "for short notes."
                )
                print(
                    f"Variance scaling based durations (in frame):\n{(mu + rho * sigma_sq)}"
                )
                print(
                    f"Fallback to uniform scaling (in frame):\n{(L_hat * mu / mu.sum())}"
                )
                d_norm = L_hat * mu / mu.sum()
        else:
            # eq (12)
            d_hat = pred_durations[note_indices[i - 1] : note_indices[i]]
            d_norm = L_hat * d_hat / d_hat.sum()

        d_norm = np.round(d_norm)
        d_norm[d_norm <= 0] = 1

        p.set_durations(d_norm)

        if len(output_labels) > 0:
            output_labels.end_times[-1] = p.start_times[0]
        for n in p:
            output_labels.append(n)

    return output_labels
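
The MDN branch distributes the lag-adjusted note duration L_hat over phonemes in proportion to the predicted variances, i.e. rho = (L_hat - sum(mu)) / sum(sigma^2) (eq (17)) and d_k = mu_k + rho * sigma_k^2 (eq (16)), which preserves the note total by construction. A small numeric check with made-up values:

import numpy as np

L_hat = 20.0                        # target note duration in frames
mu = np.array([6.0, 9.0, 3.0])      # predicted phoneme duration means
sigma_sq = np.array([1.0, 4.0, 1.0])
rho = (L_hat - mu.sum()) / sigma_sq.sum()   # (20 - 18) / 6 = 1/3
d = mu + rho * sigma_sq                     # [6.33, 10.33, 3.33]
assert np.isclose(d.sum(), L_hat)           # allocation preserves the total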
Example #7
def predict_timelag(device,
                    labels,
                    timelag_model,
                    timelag_config,
                    timelag_in_scaler,
                    timelag_out_scaler,
                    binary_dict,
                    continuous_dict,
                    pitch_indices=None,
                    log_f0_conditioning=True,
                    allowed_range=[-20, 20],
                    allowed_range_rest=[-40, 40]):
    # round start/end times just in case.
    labels.round_()

    # Extract note-level labels
    note_indices = get_note_indices(labels)
    note_labels = labels[note_indices]

    # Extract musical/linguistic context
    timelag_linguistic_features = fe.linguistic_features(
        note_labels,
        binary_dict,
        continuous_dict,
        add_frame_features=False,
        subphone_features=None).astype(np.float32)

    # Adjust input features if we use log-f0 conditioning
    if log_f0_conditioning:
        if pitch_indices is None:
            raise ValueError("Pitch feature indices must be specified!")
        for idx in pitch_indices:
            timelag_linguistic_features[:, idx] = interp1d(
                _midi_to_hz(timelag_linguistic_features, idx, log_f0_conditioning),
                kind="slinear")

    # Normalization
    timelag_linguistic_features = timelag_in_scaler.transform(
        timelag_linguistic_features)
    if isinstance(timelag_in_scaler, MinMaxScaler):
        # clip to feature range
        timelag_linguistic_features = np.clip(
            timelag_linguistic_features, timelag_in_scaler.feature_range[0],
            timelag_in_scaler.feature_range[1])

    # Prepare model input
    x = torch.from_numpy(timelag_linguistic_features).unsqueeze(0).to(device)

    # Run model
    if timelag_model.prediction_type() == PredictionType.PROBABILISTIC:
        # (B, T, D_out)
        log_pi, log_sigma, mu = timelag_model.inference(x, [x.shape[1]])
        if np.any(timelag_config.has_dynamic_features):
            max_sigma, max_mu = mdn_get_most_probable_sigma_and_mu(
                log_pi, log_sigma, mu)
            # Apply denormalization
            # (B, T, D_out) -> (T, D_out)
            max_sigma_sq = max_sigma.squeeze(
                0).cpu().data.numpy()**2 * timelag_out_scaler.var_
            max_mu = timelag_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy())
            # (T, D_out) -> (T, static_dim)
            pred_timelag = multi_stream_mlpg(
                max_mu, max_sigma_sq, get_windows(timelag_config.num_windows),
                timelag_config.stream_sizes,
                timelag_config.has_dynamic_features)
        else:
            _, max_mu = mdn_get_most_probable_sigma_and_mu(
                log_pi, log_sigma, mu)
            # Apply denormalization
            pred_timelag = timelag_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy())
    else:
        # (T, D_out)
        pred_timelag = timelag_model.inference(
            x, [x.shape[1]]).squeeze(0).cpu().data.numpy()
        # Apply denormalization
        pred_timelag = timelag_out_scaler.inverse_transform(pred_timelag)
        if np.any(timelag_config.has_dynamic_features):
            # (T, D_out) -> (T, static_dim)
            pred_timelag = multi_stream_mlpg(
                pred_timelag, timelag_out_scaler.var_,
                get_windows(timelag_config.num_windows),
                timelag_config.stream_sizes,
                timelag_config.has_dynamic_features)

    # Rounding
    pred_timelag = np.round(pred_timelag)

    # Clip to the allowed range
    for idx in range(len(pred_timelag)):
        if _is_silence(note_labels.contexts[idx]):
            pred_timelag[idx] = np.clip(pred_timelag[idx],
                                        allowed_range_rest[0],
                                        allowed_range_rest[1])
        else:
            pred_timelag[idx] = np.clip(pred_timelag[idx], allowed_range[0],
                                        allowed_range[1])

    # frames -> 100 ns
    pred_timelag *= 50000

    return pred_timelag
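
Unlike Example #5, this older variant receives raw MDN parameters and picks the highest-weight Gaussian via mdn_get_most_probable_sigma_and_mu. A rough sketch of that selection (an illustration, not the nnsvs implementation), assuming shapes log_pi (B, T, G) and mu/log_sigma (B, T, G, D):

import torch

def most_probable_sigma_and_mu(log_pi, log_sigma, mu):
    # Illustrative only: select the mixture component with the largest
    # weight at each time step and return its sigma and mu.
    k = log_pi.argmax(dim=-1)                                  # (B, T)
    idx = k[:, :, None, None].expand(-1, -1, 1, mu.size(-1))   # (B, T, 1, D)
    max_mu = mu.gather(2, idx).squeeze(2)                      # (B, T, D)
    max_sigma = log_sigma.gather(2, idx).squeeze(2).exp()      # (B, T, D)
    return max_sigma, max_mu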