def test_griffinlim(self):
    """Check that ``F.griffinlim`` matches ``librosa.griffinlim`` within ``atol=5e-5``.

    A magnitude spectrogram is built from a seeded random signal and then
    reconstructed by both implementations with identical settings.
    """
    # NOTE: This test is flaky without a fixed random seed
    # See https://github.com/pytorch/audio/issues/382
    torch.random.manual_seed(42)
    waveform = torch.rand((1, 1000))

    # Shared analysis / reconstruction settings.
    n_fft = 400
    win_length = 400
    hop_length = 100
    normalize = False
    momentum = 0.99
    n_iter = 8
    length = 1000
    rand_init = False

    # librosa expects a string (or None) where F.griffinlim takes a flag.
    if rand_init:
        init = 'random'
    else:
        init = None

    window = torch.hann_window(win_length)

    # Power spectrogram (power=2) turned into a magnitude spectrogram.
    power_spec = F.spectrogram(
        waveform, 0, window, n_fft, hop_length, win_length, 2, normalize)
    magnitude = power_spec.sqrt()

    ta_out = F.griffinlim(
        magnitude, window, n_fft, hop_length, win_length, 1,
        normalize, n_iter, momentum, length, rand_init)

    lr_raw = librosa.griffinlim(
        magnitude.squeeze(0).numpy(),
        n_iter=n_iter,
        hop_length=hop_length,
        momentum=momentum,
        init=init,
        length=length)
    lr_out = torch.from_numpy(lr_raw).unsqueeze(0)

    outputs_match = torch.allclose(ta_out, lr_out, atol=5e-5)
    self.assertTrue(outputs_match)
def wav_to_stft(
        wav_p: str,
        nperseg: int = constant.N_FFT,
        stride: int = constant.STFT_STRIDE,
) -> th.Tensor:
    """Load an audio file and return its complex spectrogram.

    The file is loaded with ``th_audio.load``, averaged over its first
    dimension, and passed to ``th_audio_f.spectrogram`` with a Hann window.

    Args:
        wav_p: Path of the audio file to load.
        nperseg: Used as FFT size, window length and Hann window size.
        stride: Hop length between successive frames.

    Returns:
        The complex-valued spectrogram with the last row of its first
        dimension removed.

    Raises:
        AssertionError: If the file's sample rate differs from
            ``constant.SAMPLE_RATE``.
    """
    raw_audio, sr = th_audio.load(wav_p)

    assert sr == constant.SAMPLE_RATE, \
        f"Audio sample rate must be {constant.SAMPLE_RATE}Hz, " \
        f"file \"{wav_p}\" is {sr}Hz"

    # Collapse dim 0 (presumably the channel axis) into a single signal.
    mono = raw_audio.mean(0)
    window = th.hann_window(nperseg)

    stft = th_audio_f.spectrogram(
        mono,
        pad=0,
        window=window,
        n_fft=nperseg,
        hop_length=stride,
        win_length=nperseg,
        power=None,
        normalized=True,
        return_complex=True,
    )

    # remove Nyquist frequency (the last row along the first dimension)
    trimmed = stft[:-1, :]
    return trimmed
def test_MelScale(self):
    """MelScale transform is comparable to that of librosa"""
    n_fft = 2048
    n_mels = 256
    hop_length = n_fft // 4

    # The input power spectrogram is computed with torchaudio itself so that
    # both implementations start from identical data.
    waveform, sample_rate = self._get_sample_data('whitenoise_1min.mp3')
    hann = torch.hann_window(n_fft)
    spec_ta = F.spectrogram(
        waveform,
        pad=0,
        window=hann,
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=n_fft,
        power=2,
        normalized=False)
    spec_lr = spec_ta.cpu().numpy().squeeze()

    # Apply the mel filter bank with torchaudio ...
    mel_scale = transforms.MelScale(n_mels=n_mels, sample_rate=sample_rate)
    melspec_ta = mel_scale(spec_ta)

    # ... and with librosa, starting from the same spectrogram.
    melspec_lr = librosa.feature.melspectrogram(
        S=spec_lr,
        sr=sample_rate,
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=n_fft,
        center=True,
        window='hann',
        n_mels=n_mels,
        htk=True,
        norm=None)

    # Note: Using relaxed rtol instead of atol
    expected = torch.from_numpy(melspec_lr[None, ...])
    assert torch.allclose(melspec_ta, expected, rtol=1e-3)
def func(tensor):
    """Return ``F.spectrogram`` of ``tensor`` using fixed settings.

    Settings: FFT size 400, window length 400, hop 200, no padding,
    power 2 and no normalization. The Hann window is created on the same
    device and with the same dtype as ``tensor``.
    """
    n_fft, win_length, hop_length = 400, 400, 200
    hann = torch.hann_window(
        win_length, device=tensor.device, dtype=tensor.dtype)
    return F.spectrogram(
        tensor, 0, hann, n_fft, hop_length, win_length, 2., False)
def file_log_spectrogram(sound, segment_time=20, overlap_time=10):
    r"""Generates a log-scaled spectrogram of a given sound file.

    Args:
        sound: Passed to ``torchaudio.load`` to obtain the waveform and its
            sample rate.
        segment_time: Window length; converted to samples as
            ``segment_time * fs / 1000`` (so presumably milliseconds).
        overlap_time: Overlap between consecutive windows, converted to
            samples the same way.

    Returns:
        The natural log of the power spectrogram (offset by ``1e-10`` so the
        log stays finite), with dimensions 1 and 2 swapped and all size-1
        dimensions squeezed out.
    """
    waveform, fs = torchaudio.load(sound)

    # TODO: do not hardcode these
    nperseg = int(segment_time * fs / 1000)
    noverlap = int(overlap_time * fs / 1000)
    hop = nperseg - noverlap

    window = torch.hann_window(nperseg)
    power_spec = F.spectrogram(
        waveform, 0, window, nperseg, hop, nperseg, 2, 0)
    log_spec = torch.log(power_spec + 1e-10)

    swapped = torch.transpose(log_spec, 1, 2)
    return torch.squeeze(swapped)
def forward(self, waveform: Tensor) -> Tensor:
    r"""Compute the spectrogram of ``waveform`` with this module's settings.

    Args:
        waveform (Tensor): Tensor of audio of dimension (..., time).

    Returns:
        Tensor: Dimension (..., freq, time), where freq is
        ``n_fft // 2 + 1`` where ``n_fft`` is the number of
        Fourier bins, and time is the number of window hops (n_frame).
    """
    spec = F.spectrogram(
        waveform,
        self.pad,
        self.window,
        self.n_fft,
        self.hop_length,
        self.win_length,
        self.power,
        self.normalized,
    )
    return spec
def test_InverseMelScale(self):
    """InverseMelScale transform is comparable to that of librosa"""
    n_fft = 2048
    n_mels = 256
    n_stft = n_fft // 2 + 1
    hop_length = n_fft // 4

    # Build the mel spectrogram input with torchaudio itself.
    waveform, sample_rate = _load_audio_asset(
        'steam-train-whistle-daniel_simon.wav', offset=2**10, num_frames=2**14)
    waveform = waveform.mean(dim=0, keepdim=True)
    spec_orig = F.spectrogram(
        waveform,
        pad=0,
        window=torch.hann_window(n_fft),
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=n_fft,
        power=2,
        normalized=False)
    to_mel = torchaudio.transforms.MelScale(n_mels=n_mels, sample_rate=sample_rate)
    melspec_ta = to_mel(spec_orig)
    melspec_lr = melspec_ta.cpu().numpy().squeeze()

    # Invert the mel spectrogram with torchaudio and with librosa.
    from_mel = torchaudio.transforms.InverseMelScale(
        n_stft, n_mels=n_mels, sample_rate=sample_rate)
    spec_ta = from_mel(melspec_ta)
    spec_lr = librosa.feature.inverse.mel_to_stft(
        melspec_lr,
        sr=sample_rate,
        n_fft=n_fft,
        power=2.0,
        htk=True,
        norm=None)
    # Add a leading dimension so the librosa result lines up with torchaudio's.
    spec_lr = torch.from_numpy(spec_lr[None, ...])

    # Align dimensions
    # librosa does not return power spectrogram while torchaudio returns power spectrogram
    spec_orig = spec_orig.sqrt()
    spec_ta = spec_ta.sqrt()

    # This threshold was chosen empirically, based on the following observation
    #
    # torch.dist(spec_lr, spec_ta, p=float('inf'))
    # >>> tensor(1.9666)
    #
    # The spectrograms reconstructed by librosa and torchaudio are not comparable elementwise.
    # This is because they use different approximation algorithms and the resulting values can
    # live in different magnitudes (although most of them are very close).
    # See
    # https://github.com/pytorch/audio/pull/366 for the discussion of the choice of algorithm
    # https://github.com/pytorch/audio/pull/448/files#r385747021 for the distribution of P-inf
    # distance over frequencies.
    elementwise_tol = 2.0
    assert torch.allclose(spec_ta, spec_lr, atol=elementwise_tol)

    # This threshold was chosen empirically, based on the following observations
    #
    # torch.dist(spec_orig, spec_ta, p=1)
    # >>> tensor(1644.3516)
    # torch.dist(spec_orig, spec_lr, p=1)
    # >>> tensor(1420.7103)
    # torch.dist(spec_lr, spec_ta, p=1)
    # >>> tensor(943.2759)
    l1_limit = 1700.0
    assert torch.dist(spec_orig, spec_ta, p=1) < l1_limit
def forward(self, waveform):
    r"""Compute the spectrogram of ``waveform`` with this module's settings.

    Args:
        waveform (torch.Tensor): Tensor of audio of dimension (channel, time)

    Returns:
        torch.Tensor: Dimension (channel, freq, time), where channel
        is unchanged, freq is ``n_fft // 2 + 1`` where ``n_fft`` is the number of
        Fourier bins, and time is the number of window hops (n_frames).
    """
    spec = F.spectrogram(
        waveform,
        self.pad,
        self.window,
        self.n_fft,
        self.hop_length,
        self.win_length,
        self.power,
        self.normalized,
    )
    return spec
def test_grad_at_zero(self, power):
    """The gradient of power spectrogram should not be nan but zero near x=0

    https://github.com/pytorch/audio/issues/993
    """
    # All-zero input that tracks gradients.
    x = torch.zeros(1, 22050, requires_grad=True)

    spec = F.spectrogram(
        x,
        pad=0,
        window=None,
        n_fft=2048,
        hop_length=None,
        win_length=None,
        power=power,
        normalized=False,
    )

    # Reduce to a scalar so backward() can be called without arguments.
    total = spec.sum()
    total.backward()

    nan_count = x.grad.isnan().sum()
    assert not nan_count
def test_torchscript_spectrogram(self):
    """Check that a scripted ``F.spectrogram`` call matches the eager call."""

    @torch.jit.script
    def jit_method(sig, pad, window, n_fft, hop, ws, power, normalize):
        # type: (Tensor, int, Tensor, int, int, int, int, bool) -> Tensor
        return F.spectrogram(sig, pad, window, n_fft, hop, ws, power, normalize)

    # Shared settings for both calls.
    n_fft = 400
    win_length = 400
    hop_length = 200
    pad = 0
    power = 2
    normalize = False

    waveform = torch.rand((1, 1000))
    window = torch.hann_window(win_length)

    call_args = (waveform, pad, window, n_fft, hop_length, win_length, power, normalize)
    jit_out = jit_method(*call_args)
    py_out = F.spectrogram(*call_args)

    outputs_match = torch.allclose(jit_out, py_out)
    self.assertTrue(outputs_match)
def test_MelScale(self):
    """MelScale transform is comparable to that of librosa"""
    n_fft = 2048
    n_mels = 256
    hop_length = n_fft // 4

    # The input power spectrogram is computed with torchaudio itself so that
    # both implementations start from identical data.
    common_utils.set_audio_backend('default')
    waveform, sample_rate = _load_audio_asset('whitenoise_1min.mp3')
    waveform = waveform.mean(dim=0, keepdim=True)
    hann = torch.hann_window(n_fft)
    spec_ta = F.spectrogram(
        waveform,
        pad=0,
        window=hann,
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=n_fft,
        power=2,
        normalized=False)
    spec_lr = spec_ta.cpu().numpy().squeeze()

    # Apply the mel filter bank with torchaudio ...
    mel_scale = torchaudio.transforms.MelScale(
        n_mels=n_mels, sample_rate=sample_rate)
    melspec_ta = mel_scale(spec_ta)

    # ... and with librosa, starting from the same spectrogram.
    melspec_lr = librosa.feature.melspectrogram(
        S=spec_lr,
        sr=sample_rate,
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=n_fft,
        center=True,
        window='hann',
        n_mels=n_mels,
        htk=True,
        norm=None)

    # Note: Using relaxed rtol instead of atol
    expected = torch.from_numpy(melspec_lr[None, ...])
    self.assertEqual(melspec_ta, expected, atol=1e-8, rtol=1e-3)
def jit_method(sig, pad, window, n_fft, hop, ws, power, normalize):
    # type: (Tensor, int, Tensor, int, int, int, int, bool) -> Tensor
    # Thin pass-through to F.spectrogram; the type comment above supplies the
    # argument and return types for this signature.
    spec = F.spectrogram(sig, pad, window, n_fft, hop, ws, power, normalize)
    return spec