def _get_small_datasets(padded=False, duration=False, padded_length=1000):
    """Return a small (X, Y) dataset pair for tests.

    Args:
        padded: If True, wrap the sources in ``PaddedFileSourceDataset``
            (fixed-length frames); otherwise use ``FileSourceDataset``.
        duration: If True, use the duration-model example sources;
            otherwise the acoustic-model ones.
        padded_length: Number of frames to pad to when ``padded`` is True.

    Returns:
        Tuple of (input dataset, output dataset).
    """
    source_factory = (example_file_data_sources_for_duration_model
                      if duration else
                      example_file_data_sources_for_acoustic_model)
    x_source, y_source = source_factory()

    if padded:
        return (PaddedFileSourceDataset(x_source, padded_length=padded_length),
                PaddedFileSourceDataset(y_source, padded_length=padded_length))
    return FileSourceDataset(x_source), FileSourceDataset(y_source)
def test_meanvar():
    """Mean/variance statistics: shapes, finiteness, padded equivalence,
    and scale/inv_scale round-trip (acoustic features)."""
    # Use the acoustic-feature stream of the example data sources.
    _, acoustic = example_file_data_sources_for_acoustic_model()
    dataset = FileSourceDataset(acoustic)
    frame_lengths = [len(utt) for utt in dataset]
    dim = dataset[0].shape[-1]

    mean, var = P.meanvar(dataset)
    std = np.sqrt(var)
    assert np.isfinite(mean).all()
    assert np.isfinite(var).all()
    assert mean.shape[-1] == dim
    assert var.shape[-1] == dim

    # meanstd must agree with sqrt of meanvar's variance.
    _, std_hat = P.meanstd(dataset)
    assert np.allclose(std, std_hat)

    utt = dataset[0]
    assert np.isfinite(P.scale(utt, mean, std)).all()

    # Padded dataset with true lengths must reproduce the statistics.
    _, acoustic = example_file_data_sources_for_acoustic_model()
    padded = PaddedFileSourceDataset(acoustic, 1000)
    mean_hat, var_hat = P.meanvar(padded, frame_lengths)
    assert np.allclose(mean, mean_hat)
    assert np.allclose(var, var_hat)

    # scale followed by inv_scale is (approximately) the identity.
    utt = padded[0]
    roundtrip = P.inv_scale(P.scale(utt, mean, std), mean, std)
    assert np.allclose(utt, roundtrip, atol=1e-5)
def test_minmax():
    """Min/max statistics and (inverse) minmax scaling.

    Checks finiteness of the statistics, the scaled value range,
    the required-argument errors, explicit (scale_, min_) parameters,
    padded-dataset equivalence, and inverse-transform round-trips.
    """
    # Pick linguistic features for testing (first stream of the
    # acoustic-model example sources).
    X, _ = example_file_data_sources_for_acoustic_model()
    X = FileSourceDataset(X)
    lengths = [len(x) for x in X]
    # NOTE: the original computed D = X[0].shape[-1] here but never used it;
    # the unused local has been removed.

    X_min, X_max = P.minmax(X)
    assert np.isfinite(X_min).all()
    assert np.isfinite(X_max).all()

    x = X[0]
    x_scaled = P.minmax_scale(x, X_min, X_max, feature_range=(0, 0.99))
    assert np.max(x_scaled) <= 1
    assert np.min(x_scaled) >= 0
    assert np.isfinite(x_scaled).all()

    # Omitting both (min, max) and (scale_, min_) must raise ValueError.
    @raises(ValueError)
    def __test_raise1(x, X_min, X_max):
        P.minmax_scale(x)

    @raises(ValueError)
    def __test_raise2(x, X_min, X_max):
        P.inv_minmax_scale(x)

    __test_raise1(x, X_min, X_max)
    __test_raise2(x, X_min, X_max)

    # Explicit scale_ and min_ must give the same result as (min, max).
    min_, scale_ = P.minmax_scale_params(X_min, X_max, feature_range=(0, 0.99))
    x_scaled_hat = P.minmax_scale(x, min_=min_, scale_=scale_)
    assert np.allclose(x_scaled, x_scaled_hat)

    # For a padded dataset with true lengths, statistics must match.
    X, _ = example_file_data_sources_for_acoustic_model()
    X = PaddedFileSourceDataset(X, 1000)
    X_min_hat, X_max_hat = P.minmax(X, lengths)
    assert np.allclose(X_min, X_min_hat)
    assert np.allclose(X_max, X_max_hat)

    # Inverse transform round-trips (both parameterizations).
    x = X[0]
    x_hat = P.inv_minmax_scale(P.minmax_scale(x, X_min, X_max), X_min, X_max)
    assert np.allclose(x, x_hat)
    x_hat = P.inv_minmax_scale(P.minmax_scale(x, scale_=scale_, min_=min_),
                               scale_=scale_, min_=min_)
    assert np.allclose(x, x_hat)
fs, x = wavfile.read(path) x = x.astype(np.float64) f0, timeaxis = pyworld.dio(x, fs, frame_period=frame_period) f0 = pyworld.stonemask(x, f0, timeaxis, fs) spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs) spectrogram = trim_zeros_frames(spectrogram) mc = pysptk.sp2mc(spectrogram, order=order, alpha=alpha) return mc bdl_source = MyFileDataSource(data_root=DATA_ROOT, speakers=["bdl"], max_files=max_files) slt_source = MyFileDataSource(data_root=DATA_ROOT, speakers=["slt"], max_files=max_files) X = PaddedFileSourceDataset(bdl_source, 1200).asarray() Y = PaddedFileSourceDataset(slt_source, 1200).asarray() print(X.shape) print(Y.shape) # # Plotting util # def plot_parallel(x,y): # figure(figsize=(16,7)) # subplot(2,1,1) # librosa.display.specshow(trim_zeros_frames(x).T, sr=fs, hop_length=hop_length, x_axis="time") # colorbar() # subplot(2,1,2) # librosa.display.specshow(trim_zeros_frames(y).T, sr=fs, hop_length=hop_length, x_axis="time") # colorbar() # # idx = 22 # any
def make_expected_dataset(data_root, use_delta):
    """Build the expected parallel training matrix for clb -> slt conversion.

    Extracts mel-cepstra from two CMU ARCTIC speakers, DTW-aligns them,
    drops the power dimension, optionally appends delta features, and
    returns the concatenated source/target frames with all-zero frames
    removed.

    Args:
        data_root: Root directory of the CMU ARCTIC data.
        use_delta: If True, append delta features to both streams.

    Returns:
        2-D array of shape (frames, 2 * feature_dim).
    """
    from pathlib import Path
    from nnmnkwii.datasets import PaddedFileSourceDataset
    from nnmnkwii.datasets.cmu_arctic import CMUArcticWavFileDataSource
    from nnmnkwii.metrics import melcd
    from nnmnkwii.preprocessing import (delta_features, remove_zeros_frames,
                                        trim_zeros_frames)
    from nnmnkwii.preprocessing.alignment import DTWAligner
    from nnmnkwii.util import apply_each2d_trim

    max_files = 100  # number of utterances to be used.
    test_size = 0.03
    windows = DELTA_WINDOWS

    class MyFileDataSource(CMUArcticWavFileDataSource):
        def __init__(self, *args, **kwargs):
            super(MyFileDataSource, self).__init__(*args, **kwargs)
            self.test_paths = None

        def collect_files(self):
            all_paths = [
                Path(p)
                for p in super(MyFileDataSource, self).collect_files()
            ]
            train_paths, test_paths = train_test_split(
                all_paths, test_size=test_size, random_state=1234)
            # Keep the held-out paths so callers can test on them later.
            self.test_paths = test_paths
            return train_paths

        def collect_features(self, path):
            feature = kwiiyatta.analyze_wav(path)
            trimmed = trim_zeros_frames(feature.spectrum_envelope)
            # NOTE(review, translated): could the trimmed frames be
            # shifted earlier than expected? -- TODO confirm
            return feature.mel_cepstrum.data[:len(trimmed)]

    clb_source = MyFileDataSource(data_root=data_root,
                                  speakers=["clb"], max_files=max_files)
    slt_source = MyFileDataSource(data_root=data_root,
                                  speakers=["slt"], max_files=max_files)

    X = PaddedFileSourceDataset(clb_source, 1200).asarray()
    Y = PaddedFileSourceDataset(slt_source, 1200).asarray()

    # Time-align the two speakers with DTW on mel-cepstral distortion.
    X_aligned, Y_aligned = DTWAligner(verbose=0, dist=melcd).transform((X, Y))

    # Drop the 1st (power) dimension of both streams.
    X_aligned = X_aligned[:, :, 1:]
    Y_aligned = Y_aligned[:, :, 1:]

    if use_delta:
        X_aligned = apply_each2d_trim(delta_features, X_aligned, windows)
        Y_aligned = apply_each2d_trim(delta_features, Y_aligned, windows)

    # Stack source/target per frame, flatten over utterances, and
    # discard padding (all-zero) frames.
    joint = np.concatenate((X_aligned, Y_aligned), axis=-1)
    joint = joint.reshape(-1, X_aligned.shape[-1] * 2)
    return remove_zeros_frames(joint)