def test_pitch_feats(self, kwargs):
    """Check `F.compute_kaldi_pitch` against the `compute-kaldi-pitch-feats` command.

    Args:
        kwargs: Keyword arguments forwarded to `F.compute_kaldi_pitch` and,
            through `convert_args`, to the Kaldi command line. Must contain
            the key ``'sample_rate'``.
    """
    rate = kwargs['sample_rate']

    # Result under test: computed on the first channel of a float32 sinusoid.
    float_wave = get_sinusoid(dtype='float32', sample_rate=rate)
    result = F.compute_kaldi_pitch(float_wave[0], **kwargs)

    # Reference: an int16 sinusoid saved as a wav file and fed to the Kaldi tool.
    int_wave = get_sinusoid(dtype='int16', sample_rate=rate)
    wav_path = self.get_temp_path('test.wav')
    save_wav(wav_path, int_wave, rate)
    kaldi_command = [
        'compute-kaldi-pitch-feats',
        *convert_args(**kwargs),
        'scp:-',
        'ark:-',
    ]
    reference = run_kaldi(kaldi_command, 'scp', wav_path)

    self.assert_equal(result, expected=reference)
def test_mel_spectrogram(self, n_fft, hop_length, n_mels, norm):
    """Compare `torchaudio.transforms.MelSpectrogram` with `librosa.feature.melspectrogram`.

    Both implementations receive the same single-channel sinusoid at 16 kHz
    and the same `n_fft`, `hop_length`, `n_mels` and `norm` settings; librosa
    is called with ``htk=True``.
    """
    sample_rate = 16000
    # Settings that both implementations take under the same keyword names.
    common = dict(n_fft=n_fft, hop_length=hop_length, n_mels=n_mels, norm=norm)

    sound = common_utils.get_sinusoid(n_channels=1, sample_rate=sample_rate)
    sound_np = sound.cpu().numpy().squeeze()

    transform = torchaudio.transforms.MelSpectrogram(
        sample_rate=sample_rate,
        window_fn=torch.hann_window,
        **common,
    )
    reference = torch.from_numpy(
        librosa.feature.melspectrogram(
            y=sound_np, sr=sample_rate, htk=True, **common))

    actual = transform(sound).squeeze().cpu()
    # Cast to the dtype of the librosa output before comparing.
    self.assertEqual(
        actual.type(reference.dtype), reference, atol=5e-3, rtol=1e-5)
def test_apply_effects(self, args):
    """`apply_effects_tensor` should return identical data as sox command

    Args:
        args: Mapping with a required ``'effects'`` entry and optional
            ``'num_channels'`` (default 2), ``'input_sample_rate'``
            (default 8000) and ``'output_sample_rate'`` (default ``None``).
    """
    effects = args['effects']
    num_channels = args.get("num_channels", 2)
    input_sr = args.get("input_sample_rate", 8000)
    output_sr = args.get("output_sample_rate")

    input_path = self.get_temp_path('input.wav')
    reference_path = self.get_temp_path('reference.wav')

    original = get_sinusoid(
        frequency=800,
        sample_rate=input_sr,
        n_channels=num_channels,
        dtype='float32',
    )
    save_wav(input_path, original, input_sr)

    # Reference: run the sox effect helper on the saved file and load its output.
    sox_utils.run_sox_effect(
        input_path, reference_path, effects, output_sample_rate=output_sr)
    expected, expected_sr = load_wav(reference_path)

    found, sr = sox_effects.apply_effects_tensor(original, input_sr, effects)

    # Use the TestCase assertion rather than a bare `assert`, which is
    # removed when Python runs with -O and reports no values on failure.
    self.assertEqual(sr, expected_sr)
    self.assertEqual(expected, found)
def test_apply_effects_file(self, args):
    """A scripted, saved and reloaded `SoxEffectFileTransform` should match `apply_effects_file`.

    Args:
        args: Mapping with a required ``'effects'`` entry and optional
            ``'num_channels'`` (default 2) and ``'input_sample_rate'``
            (default 8000).
    """
    effects = args['effects']
    channels_first = True
    num_channels = args.get("num_channels", 2)
    input_sr = args.get("input_sample_rate", 8000)

    # Round-trip the transform through TorchScript save/load.
    trans = SoxEffectFileTransform(effects, channels_first)
    script_path = self.get_temp_path('sox_effect.zip')
    torch.jit.script(trans).save(script_path)
    trans = torch.jit.load(script_path)

    # Write the input audio both code paths will read.
    wav_path = self.get_temp_path('input.wav')
    wav = get_sinusoid(
        frequency=800,
        sample_rate=input_sr,
        n_channels=num_channels,
        dtype='float32',
        channels_first=channels_first,
    )
    save_wav(wav_path, wav, sample_rate=input_sr, channels_first=channels_first)

    found, sr_found = trans(wav_path)
    expected, sr_expected = sox_effects.apply_effects_file(
        wav_path, effects, channels_first)

    # Use the TestCase assertion rather than a bare `assert`, which is
    # removed when Python runs with -O and reports no values on failure.
    self.assertEqual(sr_found, sr_expected)
    self.assertEqual(expected, found)
def test_detect_pitch_frequency(self, frequency, sample_rate, n_channels):
    """Run the batch-consistency check on `F.detect_pitch_frequency`.

    The input is a 5-second sinusoid built from the given `frequency`,
    `sample_rate` and `n_channels`.
    """
    sinusoid = common_utils.get_sinusoid(
        frequency=frequency,
        sample_rate=sample_rate,
        n_channels=n_channels,
        duration=5,
    )
    self.assert_batch_consistencies(
        F.detect_pitch_frequency, sinusoid, sample_rate)
def test_MelSpectrogram(self, n_fft, hop_length, n_mels, norm, mel_scale):
    """Compare `T.MelSpectrogram` with `librosa.feature.melspectrogram`.

    The waveform and the transform are moved to ``self.device`` /
    ``self.dtype``; the librosa reference is computed from the first channel
    on the CPU, with ``htk`` enabled only when `mel_scale` is ``"htk"``.
    """
    sample_rate = 16000
    # Settings that both implementations take under the same keyword names.
    common = dict(n_fft=n_fft, hop_length=hop_length, n_mels=n_mels, norm=norm)

    waveform = get_sinusoid(sample_rate=sample_rate, n_channels=1)
    waveform = waveform.to(self.device, self.dtype)

    expected = librosa.feature.melspectrogram(
        y=waveform[0].cpu().numpy(),
        sr=sample_rate,
        htk=mel_scale == "htk",
        **common,
    )

    transform = T.MelSpectrogram(
        sample_rate=sample_rate,
        window_fn=torch.hann_window,
        mel_scale=mel_scale,
        **common,
    ).to(self.device, self.dtype)
    result = transform(waveform)[0]

    self.assertEqual(result, torch.from_numpy(expected), atol=5e-4, rtol=1e-5)
def test_detect_pitch_frequency(self):
    """Run the consistency check on a wrapper around `F.detect_pitch_frequency`."""

    def func(tensor):
        # The rate is a local constant rather than a captured variable.
        # NOTE(review): presumably because `_assert_consistency` compiles
        # `func` with TorchScript -- confirm against the helper.
        sample_rate = 44100
        return F.detect_pitch_frequency(tensor, sample_rate)

    # Generated at the same rate that `func` hard-codes.
    sinusoid = common_utils.get_sinusoid(sample_rate=44100)
    self._assert_consistency(func, sinusoid)
def test_apply_effects_tensor(self, args):
    """Smoke test: `apply_effects_tensor` should run without raising.

    Args:
        args: Mapping with a required ``'effects'`` entry and optional
            ``'num_channels'`` (default 2) and ``'input_sample_rate'``
            (default 8000).
    """
    effects = args['effects']
    n_channels = args.get("num_channels", 2)
    rate = args.get("input_sample_rate", 8000)

    sinusoid = get_sinusoid(
        frequency=800, sample_rate=rate, n_channels=n_channels, dtype='float32')

    # The outputs are deliberately not inspected; only completion matters.
    sox_effects.apply_effects_tensor(sinusoid, rate, effects)
def test_detect_pitch_frequency_pitch(self, frequency):
    """`F.detect_pitch_frequency` should recover the frequency of a pure sinusoid.

    Every detected value must lie within 1 of `frequency`.
    """
    sample_rate = 44100
    tolerance = 1

    sinusoid = get_sinusoid(
        frequency=frequency, sample_rate=sample_rate, duration=5)
    detected = F.detect_pitch_frequency(sinusoid, sample_rate)

    # Count the estimates that deviate from the target by more than the tolerance.
    outliers = (detected - frequency).abs() > tolerance
    self.assertFalse(outliers.sum())
def test_s2db(self, n_fft, hop_length, power, n_mels, norm, skip_ci=False):
    """Compare `torchaudio.transforms.AmplitudeToDB` with librosa's dB conversions.

    Three comparisons are made on a single-channel 16 kHz sinusoid:
    power-to-dB of a spectrogram, magnitude-to-dB of the raw waveform, and
    power-to-dB of a mel spectrogram.

    Args:
        skip_ci: When true and the ``CI`` environment variable is set, the
            test is skipped.
    """
    if skip_ci and 'CI' in os.environ:
        self.skipTest('Test is known to fail on CI')

    sample_rate = 16000
    tolerances = dict(atol=5e-3, rtol=1e-5)
    stft_args = dict(n_fft=n_fft, hop_length=hop_length, power=power)
    mel_args = dict(n_fft=n_fft, hop_length=hop_length, n_mels=n_mels, norm=norm)

    sound = common_utils.get_sinusoid(n_channels=1, sample_rate=sample_rate)
    sound_np = sound.cpu().numpy().squeeze()

    # Spectrogram from both libraries.
    to_spectrogram = torchaudio.transforms.Spectrogram(**stft_args)
    spec_np, _ = librosa.core.spectrum._spectrogram(y=sound_np, **stft_args)

    # Mel spectrogram from both libraries.
    to_mel = torchaudio.transforms.MelSpectrogram(
        sample_rate=sample_rate, window_fn=torch.hann_window, **mel_args)
    mel_np = librosa.feature.melspectrogram(
        y=sound_np, sr=sample_rate, htk=True, **mel_args)

    # 1) power -> dB on the spectrogram.
    power_to_db = torchaudio.transforms.AmplitudeToDB('power', 80.)
    spec_db = power_to_db(to_spectrogram(sound)).squeeze().cpu()
    spec_db_np = librosa.core.spectrum.power_to_db(spec_np)
    self.assertEqual(spec_db, torch.from_numpy(spec_db_np), **tolerances)

    # 2) magnitude -> dB on the absolute value of the waveform itself.
    magnitude_to_db = torchaudio.transforms.AmplitudeToDB('magnitude', 80.)
    wave_db = magnitude_to_db(torch.abs(sound)).squeeze().cpu()
    wave_db_np = librosa.core.spectrum.amplitude_to_db(sound_np)
    self.assertEqual(wave_db, torch.from_numpy(wave_db_np), **tolerances)

    # 3) power -> dB on the mel spectrogram, cast to librosa's dtype.
    mel_db = power_to_db(to_mel(sound)).squeeze().cpu()
    mel_db_reference = torch.from_numpy(
        librosa.core.spectrum.power_to_db(mel_np))
    self.assertEqual(
        mel_db.type(mel_db_reference.dtype), mel_db_reference, **tolerances)
def test_pitch(self, frequency):
    """`detect_pitch_frequency` should recover the frequency of a pure sinusoid.

    Every detected value must lie within 1 of `frequency`.
    """
    sample_rate = 44100
    tolerance = 1

    sinusoid = common_utils.get_sinusoid(
        frequency=frequency, sample_rate=sample_rate, duration=5)
    detected = torchaudio.functional.detect_pitch_frequency(
        sinusoid, sample_rate)

    # Count the estimates that deviate from the target by more than the tolerance.
    deviation = (detected - frequency).abs()
    outlier_count = (deviation > tolerance).sum()
    self.assertFalse(outlier_count)
def test_detect_pitch_frequency(self, sample_rate, n_channels):
    """Run the batch-consistency check on `F.detect_pitch_frequency`.

    The batch holds ``self.batch_size`` 5-second sinusoids whose frequencies
    are drawn with a fixed seed.
    """

    def make_sinusoid(frequency):
        return common_utils.get_sinusoid(
            frequency=frequency,
            sample_rate=sample_rate,
            n_channels=n_channels,
            duration=5,
        )

    # Use different frequencies to ensure each item in the batch returns a
    # different answer.
    torch.manual_seed(0)
    frequencies = torch.randint(100, 1000, [self.batch_size])
    batch = torch.stack([make_sinusoid(frequency) for frequency in frequencies])

    self.assert_batch_consistency(F.detect_pitch_frequency, batch, sample_rate)
def test_spectrogram(self, n_fft, hop_length, power):
    """Compare `torchaudio.transforms.Spectrogram` with librosa's `_spectrogram`.

    Both are run on the same single-channel 16 kHz sinusoid with identical
    `n_fft`, `hop_length` and `power`.
    """
    sample_rate = 16000
    # Settings that both implementations take under the same keyword names.
    stft_args = dict(n_fft=n_fft, hop_length=hop_length, power=power)

    sound = common_utils.get_sinusoid(n_channels=1, sample_rate=sample_rate)
    sound_np = sound.cpu().numpy().squeeze()

    transform = torchaudio.transforms.Spectrogram(**stft_args)
    reference, _ = librosa.core.spectrum._spectrogram(y=sound_np, **stft_args)

    actual = transform(sound).squeeze().cpu()
    self.assertEqual(
        actual, torch.from_numpy(reference), atol=1e-5, rtol=1e-5)
def test_spectral_centroid(self, n_fft, hop_length):
    """Compare `torchaudio.transforms.SpectralCentroid` with `librosa.feature.spectral_centroid`.

    Both are run on the same single-channel 16 kHz sinusoid with identical
    `n_fft` and `hop_length`.
    """
    sample_rate = 16000
    # Settings that both implementations take under the same keyword names.
    frame_args = dict(n_fft=n_fft, hop_length=hop_length)

    sound = common_utils.get_sinusoid(n_channels=1, sample_rate=sample_rate)
    sound_np = sound.cpu().numpy().squeeze()

    transform = torchaudio.transforms.SpectralCentroid(
        sample_rate=sample_rate, **frame_args)
    actual = transform(sound).squeeze().cpu()

    reference_np = librosa.feature.spectral_centroid(
        y=sound_np, sr=sample_rate, **frame_args)
    # Take the first entry along the leading axis of the librosa output.
    reference = torch.from_numpy(reference_np)[0]

    self.assertEqual(
        actual.type(reference.dtype), reference, atol=1e-5, rtol=1e-5)
def test_mfcc(self, n_fft, hop_length, n_mels, n_mfcc):
    """Compare `torchaudio.transforms.MFCC` with a librosa/scipy reference.

    The reference is an orthonormal type-2 DCT over librosa's dB-scaled mel
    spectrogram, truncated to the first `n_mfcc` coefficients.

    Args:
        n_fft: FFT size used by both implementations.
        hop_length: Hop length used by both implementations.
        n_mels: Number of mel bins used by both implementations.
        n_mfcc: Number of MFCC coefficients to keep.
    """
    sample_rate = 16000
    sound = common_utils.get_sinusoid(n_channels=1, sample_rate=sample_rate)
    sound_librosa = sound.cpu().numpy().squeeze()

    librosa_mel = librosa.feature.melspectrogram(
        y=sound_librosa,
        sr=sample_rate,
        n_fft=n_fft,
        hop_length=hop_length,
        n_mels=n_mels,
        htk=True,
        norm=None,
    )
    db_librosa = librosa.core.spectrum.power_to_db(librosa_mel)

    # librosa.feature.mfcc doesn't pass kwargs properly since some of the
    # kwargs for melspectrogram and mfcc are the same. We just follow the
    # function body in
    # https://librosa.github.io/librosa/_modules/librosa/feature/spectral.html#melspectrogram
    # to mirror this function call with correct args:
    #
    # librosa_mfcc = librosa.feature.mfcc(
    #     y=sound_librosa, sr=sample_rate, n_mfcc = n_mfcc,
    #     hop_length=hop_length, n_fft=n_fft, htk=True, norm=None, n_mels=n_mels)
    librosa_mfcc = scipy.fftpack.dct(
        db_librosa, axis=0, type=2, norm='ortho')[:n_mfcc]
    librosa_mfcc_tensor = torch.from_numpy(librosa_mfcc)

    # Forward n_mels as well, so the torchaudio mel filterbank has the same
    # number of bins as the librosa reference instead of silently using the
    # transform's own default.
    melkwargs = {'hop_length': hop_length, 'n_fft': n_fft, 'n_mels': n_mels}
    mfcc_transform = torchaudio.transforms.MFCC(
        sample_rate=sample_rate,
        n_mfcc=n_mfcc,
        norm='ortho',
        melkwargs=melkwargs,
    )
    torch_mfcc = mfcc_transform(sound).squeeze().cpu()

    self.assertEqual(
        torch_mfcc.type(librosa_mfcc_tensor.dtype),
        librosa_mfcc_tensor,
        atol=5e-3,
        rtol=1e-5,
    )
def test_apply_effects_tensor(self, args):
    """A `SoxEffectTensorTransform` passed through `torch_script` should match `apply_effects_tensor`.

    Args:
        args: Mapping with a required ``'effects'`` entry and optional
            ``'num_channels'`` (default 2) and ``'input_sample_rate'``
            (default 8000).
    """
    effects = args['effects']
    channels_first = True
    num_channels = args.get("num_channels", 2)
    input_sr = args.get("input_sample_rate", 8000)

    trans = SoxEffectTensorTransform(effects, input_sr, channels_first)
    trans = torch_script(trans)

    wav = get_sinusoid(
        frequency=800,
        sample_rate=input_sr,
        n_channels=num_channels,
        dtype='float32',
        channels_first=channels_first,
    )

    found, sr_found = trans(wav)
    expected, sr_expected = sox_effects.apply_effects_tensor(
        wav, input_sr, effects, channels_first)

    # Use the TestCase assertion rather than a bare `assert`, which is
    # removed when Python runs with -O and reports no values on failure.
    self.assertEqual(sr_found, sr_expected)
    self.assertEqual(expected, found)
def test_filtfilt_filter_sinusoid(self):
    """
    For a signal that is the sum of a low-frequency and a high-frequency
    sinusoid, `F.filtfilt` with suitable low-pass coefficients should
    remove the high-frequency component and introduce no time delay, so
    the output closely matches the low-frequency sinusoid alone.
    """
    period = 1.0
    num_samples = 1000
    sample_rate = num_samples // period
    tensor_args = dict(dtype=self.dtype, device=self.device)

    # Low (5) and high (200) frequency components of the test signal.
    low_wave, high_wave = (
        get_sinusoid(
            frequency=frequency, sample_rate=sample_rate, **tensor_args,
        ).squeeze(0)
        for frequency in (5, 200)
    )
    mixed = low_wave + high_wave

    # Transfer function numerator and denominator polynomial coefficients
    # corresponding to 8th-order Butterworth filter with 100-cycle/T cutoff.
    # Generated with
    # >>> from scipy import signal
    # >>> b_coeffs, a_coeffs = signal.butter(8, 0.2)
    numerator = torch.tensor(
        [
            2.39596441e-05,
            1.91677153e-04,
            6.70870035e-04,
            1.34174007e-03,
            1.67717509e-03,
            1.34174007e-03,
            6.70870035e-04,
            1.91677153e-04,
            2.39596441e-05,
        ],
        **tensor_args,
    )
    denominator = torch.tensor(
        [
            1.0,
            -4.78451489,
            10.44504107,
            -13.45771989,
            11.12933104,
            -6.0252604,
            2.0792738,
            -0.41721716,
            0.0372001,
        ],
        **tensor_args,
    )

    # Extend the waveform in each direction, preserving periodicity.
    padded = torch.cat((mixed[:-1], mixed, mixed[1:]))
    filtered = F.filtfilt(padded, denominator, numerator)

    # Drop the padding (num_samples - 1 samples on each side) and confirm
    # that what remains closely matches the low-frequency component.
    start = num_samples - 1
    self.assertEqual(
        filtered[start:start + num_samples],
        low_wave,
        atol=1e-3,
        rtol=1e-3,
    )