def test_trim_remove_zeros_frames():
    """Check that zero-frame trimming/removal keeps axis 1 of WORLD features.

    The spectral envelope and aperiodicity computed by pyworld from the
    example audio file are passed through ``trim_zeros_frames`` and then
    ``remove_zeros_frames``; in both cases the size of axis 1 must be
    unchanged.
    """
    sample_rate, waveform = wavfile.read(example_audio_file())
    frame_period = 5
    waveform = waveform.astype(np.float64)

    f0, timeaxis = pyworld.dio(waveform, sample_rate, frame_period=frame_period)
    spectrogram = pyworld.cheaptrick(waveform, f0, timeaxis, sample_rate)
    aperiodicity = pyworld.d4c(waveform, f0, timeaxis, sample_rate)

    # Same order as two separate loops: trim both matrices, then remove on both.
    for strip_zeros in (trim_zeros_frames, remove_zeros_frames):
        for features in (spectrogram, aperiodicity):
            stripped = strip_zeros(features)
            assert stripped.shape[1] == features.shape[1]
def make_dataset_to_array(dataset, keys=None):
    """Stack the zero-frame-free entries of ``dataset`` into a single array.

    Args:
        dataset: Mapping from key to a feature array, or to a tuple of arrays
            that is first joined with ``np.hstack``.
        keys: Keys to use, in stacking order. Defaults to every key of
            ``dataset`` in sorted order.

    Returns:
        The entries, each passed through ``remove_zeros_frames``, concatenated
        along the first axis and cast to the dtype of the first entry, or
        ``None`` when there are no keys.

    Raises:
        ValueError: If the entries do not agree on their trailing dimensions.
    """
    if keys is None:
        keys = sorted(dataset.keys())

    chunks = []
    for key in keys:
        d = dataset[key]
        if isinstance(d, tuple):
            d = np.hstack(d)
        chunks.append(remove_zeros_frames(d))

    if not chunks:
        return None

    # Concatenate once instead of growing a buffer with ndarray.resize():
    # resize() silently re-flows the rows already stored when an entry has a
    # different width, and it refuses to run while the array is referenced
    # elsewhere. concatenate() raises on a width mismatch instead.
    data = np.concatenate(chunks, axis=0)
    # The resize-based version kept the dtype of the first entry; keep that.
    return data.astype(chunks[0].dtype, copy=False)
# Discard the first (power) coefficient along the last axis.
X_aligned = X_aligned[:, :, 1:]
Y_aligned = Y_aligned[:, :, 1:]

# Optionally append delta features to the static ones.
static_dim = X_aligned.shape[-1]
if use_delta:
    X_aligned = apply_each2d_trim(delta_features, X_aligned, windows)
    Y_aligned = apply_each2d_trim(delta_features, Y_aligned, windows)
# plot_parallel(X_aligned[idx],Y_aligned[idx])

# Joint feature matrix: X and Y placed side by side on the last axis,
# then flattened to 2-D.
XY = np.concatenate((X_aligned, Y_aligned), axis=-1)
XY = XY.reshape(-1, 2 * X_aligned.shape[-1])
print(XY.shape)
XY = remove_zeros_frames(XY)
print(XY.shape)

# Fit a full-covariance Gaussian mixture on the joint features.
gmm = GaussianMixture(
    n_components=64,
    covariance_type="full",
    max_iter=100,
    verbose=1,
)
gmm.fit(XY)

# Visualize model
# Means
# for k in range(3):
#     plot(gmm.means_[k], linewidth=1.5, label="Mean of mixture {}".format(k+1))
# legend(prop={"size": 16})
def make_expected_dataset(data_root, use_delta):
    """Build the joint clb/slt feature matrix with nnmnkwii.

    Mel-cepstra of the training split of speakers "clb" and "slt" are
    collected, padded, aligned with DTW, stripped of their first coefficient,
    optionally extended with delta features, concatenated on the last axis
    and flattened to 2-D.

    Args:
        data_root: Forwarded as ``data_root`` to the CMU Arctic data source.
        use_delta: When truthy, ``delta_features`` is applied to both aligned
            arrays using ``DELTA_WINDOWS``.

    Returns:
        The flattened joint feature matrix after ``remove_zeros_frames``.
    """
    from pathlib import Path

    from nnmnkwii.datasets import PaddedFileSourceDataset
    from nnmnkwii.datasets.cmu_arctic import CMUArcticWavFileDataSource
    from nnmnkwii.metrics import melcd
    from nnmnkwii.preprocessing import (delta_features, remove_zeros_frames,
                                        trim_zeros_frames)
    from nnmnkwii.preprocessing.alignment import DTWAligner
    from nnmnkwii.util import apply_each2d_trim

    max_files = 100  # number of utterances to be used.
    test_size = 0.03
    windows = DELTA_WINDOWS

    class MyFileDataSource(CMUArcticWavFileDataSource):
        """CMU Arctic source that holds out a test split and yields mel-cepstra."""

        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            self.test_paths = None

        def collect_files(self):
            all_paths = list(map(Path, super().collect_files()))
            train_paths, held_out_paths = train_test_split(
                all_paths, test_size=test_size, random_state=1234)
            # Remember the held-out files so they can be used for testing later.
            self.test_paths = held_out_paths
            return train_paths

        def collect_features(self, path):
            analyzed = kwiiyatta.analyze_wav(path)
            trimmed_envelope = trim_zeros_frames(analyzed.spectrum_envelope)
            # NOTE(review): original author's open question -- aren't the
            # frames being trimmed shifted toward the front?
            return analyzed.mel_cepstrum.data[:len(trimmed_envelope)]

    # Create both sources first, then materialise the padded arrays in order.
    sources = [
        MyFileDataSource(data_root=data_root, speakers=[speaker],
                         max_files=max_files)
        for speaker in ("clb", "slt")
    ]
    X, Y = (PaddedFileSourceDataset(source, 1200).asarray()
            for source in sources)

    # Alignment
    aligner = DTWAligner(verbose=0, dist=melcd)
    X_aligned, Y_aligned = aligner.transform((X, Y))

    # Drop the first coefficient along the last axis.
    X_aligned = X_aligned[:, :, 1:]
    Y_aligned = Y_aligned[:, :, 1:]

    if use_delta:
        X_aligned = apply_each2d_trim(delta_features, X_aligned, windows)
        Y_aligned = apply_each2d_trim(delta_features, Y_aligned, windows)

    joint_dim = 2 * X_aligned.shape[-1]
    XY = np.concatenate((X_aligned, Y_aligned), axis=-1).reshape(-1, joint_dim)
    return remove_zeros_frames(XY)