Example #1
0
def postprocess_duration(labels, pred_durations, lag):
    """Shift note start times by ``lag`` and rescale phoneme durations per note.

    For every note (as delimited by ``get_note_indices``), the start times are
    moved by the note's lag, the predicted phoneme durations are rescaled so
    that they sum to the first duration feature of the note, and the resulting
    segments are appended to a fresh label file.

    Args:
        labels: Sliceable label container exposing ``start_times`` and
            ``end_times`` (presumably an ``HTSLabelFile`` -- confirm with caller).
        pred_durations: Per-phoneme predicted durations, indexable by the same
            positions as ``labels``.
        lag: Per-note time-lag; ``lag[k]`` is applied to the k-th note.

    Returns:
        A new ``hts.HTSLabelFile`` holding the adjusted segments.
    """
    boundaries = get_note_indices(labels)
    # Sentinel so the final note also has a closing boundary
    boundaries.append(len(labels))

    output_labels = hts.HTSLabelFile()

    for note_idx in range(len(boundaries) - 1):
        begin, end = boundaries[note_idx], boundaries[note_idx + 1]
        note = labels[begin:end]

        # Shift by the lag, but never past (end - 50000 * number of phonemes)
        # NOTE(review): 50000 looks like one frame in label time units -- confirm
        shifted = np.asarray(note.start_times) + lag[note_idx].reshape(-1)
        upper_bound = np.asarray(note.end_times) - 50000 * len(note)
        note.start_times = np.minimum(shifted, upper_bound)
        note.start_times = np.maximum(note.start_times, 0)
        if len(output_labels) > 0:
            # Keep at least 50000 units after the previous segment's start
            earliest = output_labels.start_times[-1] + 50000
            note.start_times = np.maximum(note.start_times, earliest)

        # Rescale predictions so they sum to the note's target duration
        target = fe.duration_features(note)[0]
        predicted = pred_durations[begin:end]
        scaled = np.round(target * predicted / predicted.sum())
        scaled[scaled <= 0] = 1

        # TODO: better way to adjust?
        # Rounding/clamping residual is dumped onto the last phoneme.
        # NOTE(review): this can drive the last duration to <= 0.
        if scaled.sum() != target:
            scaled[-1] += target - scaled.sum()
        note.set_durations(scaled)

        # Close the previously emitted segment where this note begins
        if len(output_labels) > 0:
            output_labels.end_times[-1] = note.start_times[0]
        for segment in note:
            output_labels.append(segment)

    return output_labels
Example #2
0
def test_linguistic_and_duration_features_for_duration_model():
    """Compare linguistic and duration features against stored reference binaries."""
    binary_dict, continuous_dict = hts.load_question_set(
        join(DATA_DIR, "questions-radio_dnn_416.hed"))
    input_state_label = join(DATA_DIR, "label_state_align", "arctic_a0001.lab")

    # Linguistic features, without frame-level or sub-phone features
    labels = hts.load(input_state_label)
    assert labels.is_state_alignment_label()
    linguistic = fe.linguistic_features(
        labels,
        binary_dict,
        continuous_dict,
        add_frame_features=False,
        subphone_features=None,
    )
    linguistic_ref_path = join(DATA_DIR, "binary_label_416", "arctic_a0001.lab")
    linguistic_ref = np.fromfile(linguistic_ref_path, dtype=np.float32)
    linguistic_ref = linguistic_ref.reshape(-1, linguistic.shape[-1])
    assert np.allclose(linguistic, linguistic_ref)

    # Duration features, computed from a freshly loaded label file
    labels = hts.load(input_state_label)
    durations = fe.duration_features(
        labels,
        feature_type="numerical",
        unit_size="state",
        feature_size="phoneme",
    )
    duration_ref_path = join(DATA_DIR, "duration_untrimmed", "arctic_a0001.dur")
    duration_ref = np.fromfile(duration_ref_path, dtype=np.float32)
    duration_ref = duration_ref.reshape(-1, durations.shape[-1])

    assert np.allclose(durations, duration_ref)
Example #3
0
 def collect_features(self, path):
     """Return float32 duration features for ``path`` with silence rows removed.

     The label file is loaded with ``hts.load``; rows at the indices reported
     by ``silence_phone_indices()`` are deleted along axis 0.
     """
     label_file = hts.load(path)
     durations = fe.duration_features(label_file)
     silence_rows = label_file.silence_phone_indices()
     trimmed = np.delete(durations, silence_rows, axis=0)
     return trimmed.astype(np.float32)
def _process_feature(out_dir, index, label_path):
    """Compute silence-trimmed duration features for one label file and save them.

    Args:
        out_dir: Directory the ``.npy`` file is written to.
        index: Integer used to build the output file name.
        label_path: Path of the label file to load.

    Returns:
        Tuple ``(file name, row count before trimming, row count after trimming)``.
    """
    label_file = hts.load(label_path)
    all_durations = fe.duration_features(label_file)
    n_frames = len(all_durations)

    # Drop the rows that belong to silence phones
    silence_rows = label_file.silence_phone_indices()
    kept = np.delete(all_durations, silence_rows, axis=0)
    voiced_frames = kept.shape[0]

    # Persist the trimmed features as float32
    duration_filename = 'arctic_%05d.npy' % index
    target_path = os.path.join(out_dir, duration_filename)
    np.save(target_path, kept.astype(np.float32), allow_pickle=False)

    # Summary describing this training example
    return (duration_filename, n_frames, voiced_frames)
Example #5
0
def test_state_alignment_label_file():
    """Check round-tripping of a state-alignment label file and its durations."""
    input_state_label = join(DATA_DIR, "label_state_align", "arctic_a0001.lab")
    labels = hts.load(input_state_label)

    # The string form of the loaded labels must reproduce the file verbatim
    with open(input_state_label) as f:
        raw_text = f.read()
        assert raw_text == str(labels)

    n_states = labels.num_states()
    print(n_states)
    assert n_states == 5

    # Writing the extracted durations back onto a copy must change nothing
    restored = copy.deepcopy(labels)
    restored.set_durations(fe.duration_features(labels))

    assert str(labels) == str(restored)
Example #6
0
def test_phone_alignment_label():
    """Check that features from a phone-alignment label file are all finite."""
    binary_dict, continuous_dict = hts.load_question_set(
        join(DATA_DIR, "questions-radio_dnn_416.hed"))

    labels = hts.load(join(DATA_DIR, "label_phone_align", "arctic_a0001.lab"))

    # Linguistic features without frame-level features
    features = fe.linguistic_features(
        labels,
        binary_dict,
        continuous_dict,
        add_frame_features=False,
        subphone_features=None,
    )
    assert not labels.is_state_alignment_label()
    assert np.all(np.isfinite(features))

    # Frame-level features for each sub-phone feature type exercised here
    for subphone_kind in ("coarse_coding", "minimal_phoneme"):
        features = fe.linguistic_features(
            labels,
            binary_dict,
            continuous_dict,
            add_frame_features=True,
            subphone_features=subphone_kind,
        )
        assert np.all(np.isfinite(features))

    # Duration features with default options
    features = fe.duration_features(labels)
    assert np.all(np.isfinite(features))
Example #7
0
 def collect_features(self, path):
     """Load the label file at ``path`` and return its duration features as float32."""
     durations = fe.duration_features(hts.load(path))
     return durations.astype(np.float32)
Example #8
0
File: gen.py Project: r9y9/nnsvs
def postprocess_duration(labels, pred_durations, lag):
    """Adjust phoneme durations note by note using the predicted time-lag.

    Equation numbers in the comments refer to https://arxiv.org/abs/2108.02776

    Args:
        labels (HTSLabelFile): HTS labels
        pred_durations (array or tuple): predicted durations for non-MDN;
            a ``(mean, variance)`` 2-tuple for MDN
        lag (array): predicted time-lag, indexed per note

    Returns:
        HTSLabelFile: labels with adjusted durations
    """
    note_indices = get_note_indices(labels)
    # Sentinel so the final note also has a closing boundary
    note_indices.append(len(labels))
    last_note = len(note_indices) - 1

    # A 2-tuple is interpreted as (mean, variance) from an MDN
    is_mdn = isinstance(pred_durations, tuple) and len(pred_durations) == 2

    output_labels = hts.HTSLabelFile()

    for i in range(1, last_note + 1):
        begin, end = note_indices[i - 1], note_indices[i]
        note = labels[begin:end]
        note_lag = lag[i - 1]

        # eq (11): note duration corrected by the time-lag; the last note has
        # no successor, so only its own lag is used
        # NOTE(review): lag is divided by 50000, presumably label units per frame
        L = int(fe.duration_features(note)[0])
        lag_shift = note_lag - lag[i] if i < last_note else note_lag
        L_hat = L - lag_shift / 50000

        # Prevent negative duration
        L_hat = max(L_hat, 1)

        # Shift the note start by its lag, bounded above by
        # (end - 50000 * number of phonemes) and below by 0
        lagged_starts = np.asarray(note.start_times) + note_lag.reshape(-1)
        latest_starts = np.asarray(note.end_times) - 50000 * len(note)
        note.start_times = np.minimum(lagged_starts, latest_starts)
        note.start_times = np.maximum(note.start_times, 0)
        if len(output_labels) > 0:
            # Stay at least 50000 units after the previous segment's start
            note.start_times = np.maximum(
                note.start_times, output_labels.start_times[-1] + 50000
            )

        # Normalize phoneme durations so they add up to L_hat
        if not is_mdn:
            # eq (12)
            d_hat = pred_durations[begin:end]
            d_norm = L_hat * d_hat / d_hat.sum()
        else:
            mu = pred_durations[0][begin:end]
            sigma_sq = pred_durations[1][begin:end]
            # eq (17)
            rho = (L_hat - mu.sum()) / sigma_sq.sum()
            # eq (16)
            d_norm = mu + rho * sigma_sq

            if np.any(d_norm <= 0):
                # Variance scaling produced a non-positive duration:
                # report it and fall back to eq (12) with mu as d_hat
                print(
                    f"Negative phoneme durations are predicted at {i}-th note. "
                    "The note duration: ",
                    f"{round(float(L)*0.005,3)} sec -> {round(float(L_hat)*0.005,3)} sec",
                )
                print(
                    "It's likely that the model couldn't predict correct durations "
                    "for short notes."
                )
                print(
                    f"Variance scaling based durations (in frame):\n{(mu + rho * sigma_sq)}"
                )
                print(
                    f"Fallback to uniform scaling (in frame):\n{(L_hat * mu / mu.sum())}"
                )
                d_norm = L_hat * mu / mu.sum()

        # Integer durations, each at least 1
        d_norm = np.round(d_norm)
        d_norm[d_norm <= 0] = 1

        note.set_durations(d_norm)

        # Close the previously emitted segment where this note begins
        if len(output_labels) > 0:
            output_labels.end_times[-1] = note.start_times[0]
        for segment in note:
            output_labels.append(segment)

    return output_labels
Example #9
0
def get_duration(lab_path):
    """Return float32 duration features of ``lab_path`` with silence rows removed.

    Rows at the indices reported by ``silence_phone_indices()`` are deleted
    along axis 0 before the cast.
    """
    label_file = hts.load(lab_path)
    durations = fe.duration_features(label_file)
    silence_rows = label_file.silence_phone_indices()
    return np.delete(durations, silence_rows, axis=0).astype(np.float32)
Example #10
0
 def collect_features(self, path):
     """Compute duration features for ``path``, drop silence rows, cast to float32."""
     label_file = hts.load(path)
     durations = fe.duration_features(label_file)
     # Rows at the silence phone indices are removed along axis 0
     without_silence = np.delete(
         durations, label_file.silence_phone_indices(), axis=0
     )
     return without_silence.astype(np.float32)
Example #11
0
 def __test(labels, unit_size, feature_size):
     """Call ``fe.duration_features`` with the given sizes; the result is discarded."""
     size_options = {"unit_size": unit_size, "feature_size": feature_size}
     fe.duration_features(labels, **size_options)