def test_dtw_aligner():
    """Check that the DTW aligners reduce the distance between paired features.

    The example audio is loaded at its native sampling rate, and a
    time-stretched copy (rate 2.0) is made from it. Features are computed
    for both with ``_get_mcep``, their frame counts are equalized with
    ``adjast_frame_lengths`` and a leading utterance axis is added. For
    ``DTWAligner``, ``IterativeDTWAligner`` and ``DTWAligner`` with the
    ``melcd`` distance function, the aligned pair must be closer (smaller
    norm of the difference) than the unaligned pair.
    """
    x, fs = librosa.load(example_audio_file(), sr=None)
    assert fs == 16000

    # ``rate`` is passed by keyword: it is keyword-only in librosa >= 0.10,
    # and older releases accept the keyword form as well.
    x_fast = librosa.effects.time_stretch(x, rate=2.0)

    X = _get_mcep(x, fs)
    Y = _get_mcep(x_fast, fs)
    D = X.shape[-1]

    # Create padded pair
    X, Y = adjast_frame_lengths(X, Y, divisible_by=2)

    # Add utterance axis
    X = X.reshape(1, -1, D)
    Y = Y.reshape(1, -1, D)

    # Plain DTW alignment
    X_aligned, Y_aligned = DTWAligner().transform((X, Y))
    assert X_aligned.shape == Y_aligned.shape
    assert np.linalg.norm(X_aligned - Y_aligned) < np.linalg.norm(X - Y)

    # Iterative DTW alignment
    X_aligned, Y_aligned = IterativeDTWAligner(
        n_iter=2, max_iter_gmm=10, n_components_gmm=2).transform((X, Y))
    assert X_aligned.shape == Y_aligned.shape
    assert np.linalg.norm(X_aligned - Y_aligned) < np.linalg.norm(X - Y)

    # Custom dist function
    from nnmnkwii.metrics import melcd
    X_aligned, Y_aligned = DTWAligner(dist=melcd).transform((X, Y))
    assert np.linalg.norm(X_aligned - Y_aligned) < np.linalg.norm(X - Y)
def test_adjast_frame_lengths():
    """Check the common frame count returned by ``adjast_frame_lengths``.

    Two random feature matrices with 10 and 11 frames (5 dimensions each)
    are adjusted under several ``pad`` / ``divisible_by`` combinations.
    Each time both outputs must share the same shape and have the expected
    number of frames.
    """
    feature_dim = 5
    short_len, long_len = 10, 11
    x = np.random.rand(short_len, feature_dim)
    y = np.random.rand(long_len, feature_dim)

    # Each entry: (keyword arguments for the call, expected frame count).
    cases = (
        ({"pad": True}, 11),
        ({"pad": False}, 10),
        ({"pad": True, "divisible_by": 2}, 12),
        ({"pad": False, "divisible_by": 2}, 10),
        # Divisible
        ({"pad": False, "divisible_by": 3}, 9),
        ({"pad": True, "divisible_by": 3}, 12),
    )

    for kwargs, expected_frames in cases:
        x_hat, y_hat = adjast_frame_lengths(x, y, **kwargs)
        assert x_hat.shape == y_hat.shape
        assert x_hat.shape[0] == expected_frames
print("Destination dir for {}: {}".format(speaker, d)) if not exists(d): os.makedirs(d) # Convert to arrays print("Convert datasets to arrays") X, Y = X_dataset.asarray(verbose=1), Y_dataset.asarray(verbose=1) # Alignment print("Perform alignment") X, Y = DTWAligner().transform((X, Y)) print("Save features to disk") for idx, (x, y) in tqdm(enumerate(zip(X, Y))): # paths src_name = splitext(basename(X_dataset.collected_files[idx][0]))[0] tgt_name = splitext(basename(Y_dataset.collected_files[idx][0]))[0] src_path = join(dst_dir, "X", src_name) tgt_path = join(dst_dir, "Y", tgt_name) # Trim and ajast frames x = P.trim_zeros_frames(x) y = P.trim_zeros_frames(y) x, y = P.adjast_frame_lengths(x, y, pad=True, divisible_by=2) # Save np.save(src_path, x) np.save(tgt_path, y) sys.exit(0)