Beispiel #1
0
def _get_small_datasets(padded=False, duration=False, padded_length=1000):
    """Return an ``(X, Y)`` pair of datasets built from the example sources.

    Args:
        padded: When true, wrap both sources in ``PaddedFileSourceDataset``
            using ``padded_length``; otherwise use ``FileSourceDataset``.
        duration: When true, take the sources from
            ``example_file_data_sources_for_duration_model``; otherwise from
            ``example_file_data_sources_for_acoustic_model``.
        padded_length: Forwarded as ``padded_length`` to
            ``PaddedFileSourceDataset``; only used when ``padded`` is true.

    Returns:
        Tuple ``(X, Y)`` of the two wrapped datasets.
    """
    make_sources = (example_file_data_sources_for_duration_model
                    if duration
                    else example_file_data_sources_for_acoustic_model)
    x_source, y_source = make_sources()

    if not padded:
        return FileSourceDataset(x_source), FileSourceDataset(y_source)

    return (PaddedFileSourceDataset(x_source, padded_length=padded_length),
            PaddedFileSourceDataset(y_source, padded_length=padded_length))
Beispiel #2
0
def test_meanvar():
    """Exercise ``P.meanvar``, ``P.meanstd``, ``P.scale`` and ``P.inv_scale``.

    Statistics are first computed on the unpadded example dataset, then
    compared with those computed on a padded copy of the same sources when
    the original utterance lengths are supplied.
    """
    # Pick acoustic features for testing
    _, source = example_file_data_sources_for_acoustic_model()
    dataset = FileSourceDataset(source)
    lengths = [len(item) for item in dataset]
    feature_dim = dataset[0].shape[-1]

    mean, var = P.meanvar(dataset)
    std = np.sqrt(var)
    assert np.isfinite(mean).all()
    assert np.isfinite(var).all()
    assert mean.shape[-1] == feature_dim
    assert var.shape[-1] == feature_dim

    # The std reported by meanstd must match sqrt of the meanvar variance.
    _, std_hat = P.meanstd(dataset)
    assert np.allclose(std, std_hat)

    scaled = P.scale(dataset[0], mean, std)
    assert np.isfinite(scaled).all()

    # For padded dataset
    _, source = example_file_data_sources_for_acoustic_model()
    padded_dataset = PaddedFileSourceDataset(source, 1000)
    # Should get same results with padded features
    mean_hat, var_hat = P.meanvar(padded_dataset, lengths)
    assert np.allclose(mean, mean_hat)
    assert np.allclose(var, var_hat)

    # Inverse transform: scaling followed by inv_scale must round-trip.
    first = padded_dataset[0]
    restored = P.inv_scale(P.scale(first, mean, std), mean, std)
    assert np.allclose(first, restored, atol=1e-5)
Beispiel #3
0
def test_minmax():
    """Exercise ``P.minmax``, ``P.minmax_scale`` and ``P.inv_minmax_scale``.

    Covers scaling with explicit ``(min, max)``, scaling with explicit
    ``(min_, scale_)`` parameters, the ``ValueError`` raised when neither is
    given, agreement between padded and unpadded statistics, and the
    scale / inverse-scale round trip.
    """
    feature_range = (0, 0.99)

    # Pick linguistic features for testing
    source, _ = example_file_data_sources_for_acoustic_model()
    dataset = FileSourceDataset(source)
    lengths = [len(item) for item in dataset]
    # Not asserted on below; kept so the first item is still accessed here.
    feature_dim = dataset[0].shape[-1]
    data_min, data_max = P.minmax(dataset)
    assert np.isfinite(data_min).all()
    assert np.isfinite(data_max).all()

    x = dataset[0]
    x_scaled = P.minmax_scale(x, data_min, data_max,
                              feature_range=feature_range)
    assert np.max(x_scaled) <= 1
    assert np.min(x_scaled) >= 0
    assert np.isfinite(x_scaled).all()

    # Need to specify (min, max) or (scale_, min_)
    @raises(ValueError)
    def __test_raise1(features):
        P.minmax_scale(features)

    @raises(ValueError)
    def __test_raise2(features):
        P.inv_minmax_scale(features)

    __test_raise1(x)
    __test_raise2(x)

    # Explicit scale_ and min_
    min_, scale_ = P.minmax_scale_params(data_min, data_max,
                                         feature_range=feature_range)
    x_scaled_hat = P.minmax_scale(x, min_=min_, scale_=scale_)
    assert np.allclose(x_scaled, x_scaled_hat)

    # For padded dataset
    source, _ = example_file_data_sources_for_acoustic_model()
    padded_dataset = PaddedFileSourceDataset(source, 1000)
    # Should get same results with padded features
    data_min_hat, data_max_hat = P.minmax(padded_dataset, lengths)
    assert np.allclose(data_min, data_min_hat)
    assert np.allclose(data_max, data_max_hat)

    # Inverse transform, parameterized by (min, max)
    x = padded_dataset[0]
    forward = P.minmax_scale(x, data_min, data_max)
    x_hat = P.inv_minmax_scale(forward, data_min, data_max)
    assert np.allclose(x, x_hat)

    # Inverse transform, parameterized by (scale_, min_)
    forward = P.minmax_scale(x, scale_=scale_, min_=min_)
    x_hat = P.inv_minmax_scale(forward, scale_=scale_, min_=min_)
    assert np.allclose(x, x_hat)
Beispiel #4
0
        fs, x = wavfile.read(path)
        x = x.astype(np.float64)
        f0, timeaxis = pyworld.dio(x, fs, frame_period=frame_period)
        f0 = pyworld.stonemask(x, f0, timeaxis, fs)
        spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
        spectrogram = trim_zeros_frames(spectrogram)
        mc = pysptk.sp2mc(spectrogram, order=order, alpha=alpha)
        return mc


# One file data source per speaker, built in the order "bdl" then "slt".
bdl_source, slt_source = (
    MyFileDataSource(data_root=DATA_ROOT, speakers=[speaker],
                     max_files=max_files)
    for speaker in ("bdl", "slt")
)

# Load each source as a single array, padding every utterance to 1200.
X, Y = (
    PaddedFileSourceDataset(source, 1200).asarray()
    for source in (bdl_source, slt_source)
)

for features in (X, Y):
    print(features.shape)

# # Plotting util
# def plot_parallel(x,y):
#     figure(figsize=(16,7))
#     subplot(2,1,1)
#     librosa.display.specshow(trim_zeros_frames(x).T, sr=fs, hop_length=hop_length, x_axis="time")
#     colorbar()
#     subplot(2,1,2)
#     librosa.display.specshow(trim_zeros_frames(y).T, sr=fs, hop_length=hop_length, x_axis="time")
#     colorbar()
#
# idx = 22 # any
Beispiel #5
0
def make_expected_dataset(data_root, use_delta):
    """Build the joint source/target feature matrix for speakers clb and slt.

    Steps, in order:
      1. Collect up to 100 files per speaker, keeping only the training part
         of a ``train_test_split`` (``test_size=0.03``, ``random_state=1234``).
      2. Extract mel-cepstrum features via ``kwiiyatta.analyze_wav`` and load
         them through ``PaddedFileSourceDataset`` with a padded length of 1200.
      3. Align both speakers with ``DTWAligner`` using ``melcd`` as distance.
      4. Drop the first feature dimension, optionally append delta features,
         concatenate both speakers along the last axis, flatten to 2-D and
         remove zero frames.

    Args:
        data_root: Passed as ``data_root`` to the CMU ARCTIC data source.
        use_delta: When true, apply ``delta_features`` with ``DELTA_WINDOWS``
            to both aligned arrays before joining them.

    Returns:
        The result of ``remove_zeros_frames`` applied to the joint 2-D array.
    """
    from pathlib import Path
    from nnmnkwii.datasets import PaddedFileSourceDataset
    from nnmnkwii.datasets.cmu_arctic import CMUArcticWavFileDataSource
    from nnmnkwii.metrics import melcd
    from nnmnkwii.preprocessing import (delta_features, remove_zeros_frames,
                                        trim_zeros_frames)
    from nnmnkwii.preprocessing.alignment import DTWAligner
    from nnmnkwii.util import apply_each2d_trim

    max_files = 100  # number of utterances to be used.
    test_size = 0.03

    windows = DELTA_WINDOWS

    class MyFileDataSource(CMUArcticWavFileDataSource):
        """CMU ARCTIC source that holds out test paths and yields mel-cepstra."""

        def __init__(self, *args, **kwargs):
            super(MyFileDataSource, self).__init__(*args, **kwargs)
            # Filled in by collect_files() with the held-out paths.
            self.test_paths = None

        def collect_files(self):
            """Return the training paths; remember the held-out ones."""
            collected = super(MyFileDataSource, self).collect_files()
            all_paths = [Path(entry) for entry in collected]
            train_paths, held_out_paths = train_test_split(
                all_paths, test_size=test_size, random_state=1234)

            # keep paths for later testing
            self.test_paths = held_out_paths
            return train_paths

        def collect_features(self, path):
            """Return the mel-cepstrum cut to the trimmed envelope length."""
            analyzed = kwiiyatta.analyze_wav(path)
            n_frames = len(trim_zeros_frames(analyzed.spectrum_envelope))
            # NOTE(review): the original author questioned whether the
            # trimmed frames end up shifted toward the front here -- confirm.
            return analyzed.mel_cepstrum.data[:n_frames]

    clb_source = MyFileDataSource(
        data_root=data_root, speakers=["clb"], max_files=max_files)
    slt_source = MyFileDataSource(
        data_root=data_root, speakers=["slt"], max_files=max_files)

    X = PaddedFileSourceDataset(clb_source, 1200).asarray()
    Y = PaddedFileSourceDataset(slt_source, 1200).asarray()

    # Alignment
    aligner = DTWAligner(verbose=0, dist=melcd)
    X_aligned, Y_aligned = aligner.transform((X, Y))

    # Drop 1st (power) dimension
    X_aligned = X_aligned[:, :, 1:]
    Y_aligned = Y_aligned[:, :, 1:]

    if use_delta:
        X_aligned = apply_each2d_trim(delta_features, X_aligned, windows)
        Y_aligned = apply_each2d_trim(delta_features, Y_aligned, windows)

    # Join source and target features frame by frame, then flatten all
    # utterances into one 2-D matrix of joint vectors.
    joint_dim = X_aligned.shape[-1] * 2
    joint = np.concatenate((X_aligned, Y_aligned), axis=-1)
    joint = joint.reshape(-1, joint_dim)

    return remove_zeros_frames(joint)