def __init__(self, n_fft=1024, n_mels=80):
    """Build a mel-to-waveform pipeline: inverse mel scaling, then Griffin-Lim.

    Args:
        n_fft: FFT size used to derive the number of linear frequency bins.
        n_mels: number of mel bands in the input spectrogram.
    """
    super().__init__()
    # Number of linear-frequency (STFT) bins corresponding to n_fft.
    freq_bins = n_fft // 2 + 1
    # NOTE(review): `sample_rate` is a free variable from an enclosing scope —
    # confirm it is defined where this class is declared.
    self.mel_to_lin = transforms.InverseMelScale(
        n_stft=freq_bins,
        n_mels=n_mels,
        sample_rate=sample_rate,
        max_iter=2048,
    )
    # Phase reconstruction from the estimated linear spectrogram.
    self.griffin_lim = transforms.GriffinLim(n_fft=n_fft, hop_length=256)
def test_batch_InverseMelScale(self):
    """Batched input to InverseMelScale should match single-item-then-repeat output.

    Exact equality cannot be asserted because InverseMelScale runs SGD from a
    random initialization, so only shapes and a very relaxed tolerance are checked.
    """
    # Fix: removed unused local `n_fft` (was assigned 8 and never read).
    n_mels = 32
    n_stft = 5

    # Mel spectrograms are non-negative, hence the squaring of random values.
    mel_spec = torch.randn(2, n_mels, 32) ** 2

    # Single item transformed first, then repeated into a batch.
    expected = transforms.InverseMelScale(n_stft, n_mels)(mel_spec).repeat(3, 1, 1, 1)

    # Batch built first, then transformed; shape = (3, 2, n_mels, 32).
    computed = transforms.InverseMelScale(n_stft, n_mels)(mel_spec.repeat(3, 1, 1, 1))

    self.assertTrue(computed.shape == expected.shape, (computed.shape, expected.shape))
    # Because InverseMelScale runs SGD on randomly initialized values, the two
    # runs do not yield exactly the same result. For this reason, tolerance is
    # very relaxed here.
    self.assertTrue(torch.allclose(computed, expected, atol=1.0))
def test_InverseMelScale(self):
    """InverseMelScale transform is comparable to that of librosa"""
    n_fft = 2048
    n_mels = 256
    n_stft = n_fft // 2 + 1
    hop_length = n_fft // 4

    # Prepare the mel spectrogram input, computed with torchaudio itself.
    sound, sample_rate = self._get_sample_data(
        'steam-train-whistle-daniel_simon.wav', offset=2**10, num_frames=2**14)
    spec_orig = F.spectrogram(
        sound,
        pad=0,
        window=torch.hann_window(n_fft),
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=n_fft,
        power=2,
        normalized=False,
    )
    melspec_ta = transforms.MelScale(n_mels=n_mels, sample_rate=sample_rate)(spec_orig)
    melspec_lr = melspec_ta.cpu().numpy().squeeze()

    # Invert the mel scaling with torchaudio and with librosa.
    spec_ta = transforms.InverseMelScale(
        n_stft, n_mels=n_mels, sample_rate=sample_rate)(melspec_ta)
    spec_lr = librosa.feature.inverse.mel_to_stft(
        melspec_lr, sr=sample_rate, n_fft=n_fft, power=2.0, htk=True, norm=None)
    # Align dimensions with the torchaudio result (add a channel axis).
    spec_lr = torch.from_numpy(spec_lr[None, ...])

    # librosa does not return a power spectrogram while torchaudio does, so
    # take the square root of the torchaudio-side spectrograms.
    spec_orig = spec_orig.sqrt()
    spec_ta = spec_ta.sqrt()

    # This threshold was chosen empirically, based on the following observation:
    #
    #   torch.dist(spec_lr, spec_ta, p=float('inf'))
    #   >>> tensor(1.9666)
    #
    # The spectrograms reconstructed by librosa and torchaudio are not very
    # comparable elementwise: they use different approximation algorithms, and
    # resulting values can live in different magnitude (although most of them
    # are very close).
    # See https://github.com/pytorch/audio/pull/366 for the discussion of the
    # choice of algorithm, and
    # https://github.com/pytorch/audio/pull/448/files#r385747021 for the
    # distribution of P-inf distance over frequencies.
    threshold = 2.0
    assert torch.allclose(spec_ta, spec_lr, atol=threshold)

    # This threshold was chosen empirically, based on the following observations:
    #
    #   torch.dist(spec_orig, spec_ta, p=1)
    #   >>> tensor(1644.3516)
    #   torch.dist(spec_orig, spec_lr, p=1)
    #   >>> tensor(1420.7103)
    #   torch.dist(spec_lr, spec_ta, p=1)
    #   >>> tensor(943.2759)
    threshold = 1700.0
    assert torch.dist(spec_orig, spec_ta, p=1) < threshold
def test_InverseMelScale(self):
    """Gauge the quality of the InverseMelScale transform.

    InverseMelScale is currently implemented with random initialization plus
    iterative optimization, so it is not practically possible to assert the
    difference between the estimated and original spectrograms as a whole:
    locally, the estimate has very large discrepancy. Instead, this test
    gauges what percentage of elements fall below certain tolerances.

    At the moment, the quality of the estimated spectrogram is not good.
    If an implementation change makes the quality even worse, this test
    will fail.
    """
    n_fft = 400
    power = 1
    n_mels = 64
    sample_rate = 8000
    n_stft = n_fft // 2 + 1

    # Reference spectrogram, and its mel-scaled version used as the input.
    expected = get_spectrogram(
        get_whitenoise(sample_rate=sample_rate, duration=1, n_channels=2),
        n_fft=n_fft, power=power).to(self.device, self.dtype)
    mel_input = T.MelScale(
        n_mels=n_mels, sample_rate=sample_rate).to(self.device, self.dtype)(expected)

    # Run the transform with a fixed seed so the SGD estimate is reproducible.
    transform = T.InverseMelScale(
        n_stft, n_mels=n_mels, sample_rate=sample_rate).to(self.device, self.dtype)
    torch.random.manual_seed(0)
    result = transform(mel_input)

    # Element-wise relative difference; epsilon guards against division by zero.
    epsilon = 1e-60
    relative_diff = torch.abs((result - expected) / (expected + epsilon))

    for tol in [1e-1, 1e-3, 1e-5, 1e-10]:
        print(f"Ratio of relative diff smaller than {tol:e} is "
              f"{_get_ratio(relative_diff < tol)}")

    assert _get_ratio(relative_diff < 1e-1) > 0.2
    assert _get_ratio(relative_diff < 1e-3) > 5e-3
    assert _get_ratio(relative_diff < 1e-5) > 1e-5
def _mel_to_wav(mel, sr=sample_rate, engine='librosa'):
    ''' using Griffin-Lim algorithm '''
    # NOTE(review): `stft_params`, `mel_params`, `n_mels`, `power`, and the
    # default `sample_rate` are free variables from an enclosing scope —
    # confirm they are defined where this function is used.
    if engine == 'librosa':
        return librosa.feature.inverse.mel_to_audio(
            mel, sr=sr, **stft_params, power=power)
    if engine == 'torch':
        # Undo the mel scaling first, then invert the linear spectrogram.
        inverse_mel = tf.InverseMelScale(
            n_stft=stft_params['n_fft'] // 2 + 1,
            sample_rate=sr,
            n_mels=n_mels,
            f_max=mel_params['fmax'],
            f_min=mel_params['fmin'],
            max_iter=1000,
        )
        return _spec_to_wav(inverse_mel(mel), sr=sr, engine=engine)
    # Any other engine name is rejected.
    raise ValueError(engine)