    def test_pitch_feats(self, kwargs):
        """compute_kaldi_pitch should produce results numerically compatible with compute-kaldi-pitch-feats"""
        sample_rate = kwargs['sample_rate']
        waveform = get_sinusoid(dtype='float32', sample_rate=sample_rate)
        result = F.compute_kaldi_pitch(waveform[0], **kwargs)

        waveform = get_sinusoid(dtype='int16', sample_rate=sample_rate)
        wave_file = self.get_temp_path('test.wav')
        save_wav(wave_file, waveform, sample_rate)

        command = ['compute-kaldi-pitch-feats'] + convert_args(**kwargs) + ['scp:-', 'ark:-']
        kaldi_result = run_kaldi(command, 'scp', wave_file)
        self.assert_equal(result, expected=kaldi_result)
 def test_mel_spectrogram(self, n_fft, hop_length, n_mels, norm):
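     """MelSpectrogram transform should match librosa.feature.melspectrogram on a sinusoid."""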
     sample_rate = 16000
     sound = common_utils.get_sinusoid(n_channels=1,
                                       sample_rate=sample_rate)
     sound_librosa = sound.cpu().numpy().squeeze()
     melspect_transform = torchaudio.transforms.MelSpectrogram(
         sample_rate=sample_rate,
         window_fn=torch.hann_window,
         hop_length=hop_length,
         n_mels=n_mels,
         n_fft=n_fft,
         norm=norm)
     librosa_mel = librosa.feature.melspectrogram(y=sound_librosa,
                                                  sr=sample_rate,
                                                  n_fft=n_fft,
                                                  hop_length=hop_length,
                                                  n_mels=n_mels,
                                                  htk=True,
                                                  norm=norm)
     librosa_mel_tensor = torch.from_numpy(librosa_mel)
     torch_mel = melspect_transform(sound).squeeze().cpu()
     self.assertEqual(torch_mel.type(librosa_mel_tensor.dtype),
                      librosa_mel_tensor,
                      atol=5e-3,
                      rtol=1e-5)
# Example 3
    def test_apply_effects(self, args):
        """`apply_effects_tensor` should return the same data as the corresponding sox command"""
        effects = args['effects']
        num_channels = args.get("num_channels", 2)
        input_sr = args.get("input_sample_rate", 8000)
        output_sr = args.get("output_sample_rate")

        input_path = self.get_temp_path('input.wav')
        reference_path = self.get_temp_path('reference.wav')

        original = get_sinusoid(frequency=800,
                                sample_rate=input_sr,
                                n_channels=num_channels,
                                dtype='float32')
        save_wav(input_path, original, input_sr)
        sox_utils.run_sox_effect(input_path,
                                 reference_path,
                                 effects,
                                 output_sample_rate=output_sr)

        expected, expected_sr = load_wav(reference_path)
        found, sr = sox_effects.apply_effects_tensor(original, input_sr,
                                                     effects)

        assert sr == expected_sr
        self.assertEqual(expected, found)
# Example 4
    def test_apply_effects_file(self, args):
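        """Scripted SoxEffectFileTransform should match sox_effects.apply_effects_file."""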
        effects = args['effects']
        channels_first = True
        num_channels = args.get("num_channels", 2)
        input_sr = args.get("input_sample_rate", 8000)

        trans = SoxEffectFileTransform(effects, channels_first)

        path = self.get_temp_path('sox_effect.zip')
        torch.jit.script(trans).save(path)
        trans = torch.jit.load(path)

        path = self.get_temp_path('input.wav')
        wav = get_sinusoid(frequency=800,
                           sample_rate=input_sr,
                           n_channels=num_channels,
                           dtype='float32',
                           channels_first=channels_first)
        save_wav(path,
                 wav,
                 sample_rate=input_sr,
                 channels_first=channels_first)

        found, sr_found = trans(path)
        expected, sr_expected = sox_effects.apply_effects_file(
            path, effects, channels_first)

        assert sr_found == sr_expected
        self.assertEqual(expected, found)
 def test_detect_pitch_frequency(self, frequency, sample_rate, n_channels):
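     """detect_pitch_frequency should produce the same result for batched and per-item inputs."""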
     waveform = common_utils.get_sinusoid(frequency=frequency,
                                          sample_rate=sample_rate,
                                          n_channels=n_channels,
                                          duration=5)
     self.assert_batch_consistencies(F.detect_pitch_frequency, waveform,
                                     sample_rate)
# Example 6
    def test_MelSpectrogram(self, n_fft, hop_length, n_mels, norm, mel_scale):
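        """MelSpectrogram should be numerically compatible with librosa.feature.melspectrogram."""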
        sample_rate = 16000
        waveform = get_sinusoid(
            sample_rate=sample_rate,
            n_channels=1,
        ).to(self.device, self.dtype)

        expected = librosa.feature.melspectrogram(y=waveform[0].cpu().numpy(),
                                                  sr=sample_rate,
                                                  n_fft=n_fft,
                                                  hop_length=hop_length,
                                                  n_mels=n_mels,
                                                  norm=norm,
                                                  htk=mel_scale == "htk")
        result = T.MelSpectrogram(
            sample_rate=sample_rate,
            window_fn=torch.hann_window,
            hop_length=hop_length,
            n_mels=n_mels,
            n_fft=n_fft,
            norm=norm,
            mel_scale=mel_scale,
        ).to(self.device, self.dtype)(waveform)[0]
        self.assertEqual(result,
                         torch.from_numpy(expected),
                         atol=5e-4,
                         rtol=1e-5)
    def test_detect_pitch_frequency(self):
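        """detect_pitch_frequency should remain consistent under self._assert_consistency (presumably TorchScript vs. eager execution)."""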
        waveform = common_utils.get_sinusoid(sample_rate=44100)

        def func(tensor):
            sample_rate = 44100
            return F.detect_pitch_frequency(tensor, sample_rate)

        self._assert_consistency(func, waveform)
# Example 8
 def test_apply_effects_tensor(self, args):
     """`apply_effects_tensor` should not crash"""
     effects = args['effects']
     num_channels = args.get("num_channels", 2)
     input_sr = args.get("input_sample_rate", 8000)
     original = get_sinusoid(
         frequency=800, sample_rate=input_sr,
         n_channels=num_channels, dtype='float32')
     _found, _sr = sox_effects.apply_effects_tensor(original, input_sr, effects)
# Example 9
    def test_detect_pitch_frequency_pitch(self, frequency):
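        """detect_pitch_frequency should estimate a pure sine's frequency to within 1 Hz."""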
        sample_rate = 44100
        test_sine_waveform = get_sinusoid(frequency=frequency,
                                          sample_rate=sample_rate,
                                          duration=5)

        freq = F.detect_pitch_frequency(test_sine_waveform, sample_rate)

        threshold = 1
        s = ((freq - frequency).abs() > threshold).sum()
        self.assertFalse(s)
    def test_s2db(self, n_fft, hop_length, power, n_mels, norm, skip_ci=False):
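        """AmplitudeToDB should match librosa's power_to_db / amplitude_to_db on spectrogram and mel spectrogram inputs."""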
        if skip_ci and 'CI' in os.environ:
            self.skipTest('Test is known to fail on CI')
        sample_rate = 16000
        sound = common_utils.get_sinusoid(n_channels=1,
                                          sample_rate=sample_rate)
        sound_librosa = sound.cpu().numpy().squeeze()
        spect_transform = torchaudio.transforms.Spectrogram(
            n_fft=n_fft, hop_length=hop_length, power=power)
        out_librosa, _ = librosa.core.spectrum._spectrogram(
            y=sound_librosa, n_fft=n_fft, hop_length=hop_length, power=power)
        melspect_transform = torchaudio.transforms.MelSpectrogram(
            sample_rate=sample_rate,
            window_fn=torch.hann_window,
            hop_length=hop_length,
            n_mels=n_mels,
            n_fft=n_fft,
            norm=norm)
        librosa_mel = librosa.feature.melspectrogram(y=sound_librosa,
                                                     sr=sample_rate,
                                                     n_fft=n_fft,
                                                     hop_length=hop_length,
                                                     n_mels=n_mels,
                                                     htk=True,
                                                     norm=norm)

        power_to_db_transform = torchaudio.transforms.AmplitudeToDB(
            'power', 80.)
        power_to_db_torch = power_to_db_transform(
            spect_transform(sound)).squeeze().cpu()
        power_to_db_librosa = librosa.core.spectrum.power_to_db(out_librosa)
        self.assertEqual(power_to_db_torch,
                         torch.from_numpy(power_to_db_librosa),
                         atol=5e-3,
                         rtol=1e-5)

        mag_to_db_transform = torchaudio.transforms.AmplitudeToDB(
            'magnitude', 80.)
        mag_to_db_torch = mag_to_db_transform(torch.abs(sound)).squeeze().cpu()
        mag_to_db_librosa = librosa.core.spectrum.amplitude_to_db(
            sound_librosa)
        self.assertEqual(mag_to_db_torch,
                         torch.from_numpy(mag_to_db_librosa),
                         atol=5e-3,
                         rtol=1e-5)

        power_to_db_torch = power_to_db_transform(
            melspect_transform(sound)).squeeze().cpu()
        db_librosa = librosa.core.spectrum.power_to_db(librosa_mel)
        db_librosa_tensor = torch.from_numpy(db_librosa)
        self.assertEqual(power_to_db_torch.type(db_librosa_tensor.dtype),
                         db_librosa_tensor,
                         atol=5e-3,
                         rtol=1e-5)
# Example 11
    def test_pitch(self, frequency):
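        """detect_pitch_frequency should estimate the frequency of a pure sine to within 1 Hz."""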
        sample_rate = 44100
        test_sine_waveform = common_utils.get_sinusoid(
            frequency=frequency, sample_rate=sample_rate, duration=5,
        )

        freq = torchaudio.functional.detect_pitch_frequency(test_sine_waveform, sample_rate)

        threshold = 1
        s = ((freq - frequency).abs() > threshold).sum()
        self.assertFalse(s)
# Example 12
 def test_detect_pitch_frequency(self, sample_rate, n_channels):
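     """Batched detect_pitch_frequency should match per-item application."""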
     # Use different frequencies to ensure each item in the batch returns a
     # different answer.
     torch.manual_seed(0)
     frequencies = torch.randint(100, 1000, [self.batch_size])
     waveforms = torch.stack([
         common_utils.get_sinusoid(frequency=frequency,
                                   sample_rate=sample_rate,
                                   n_channels=n_channels,
                                   duration=5) for frequency in frequencies
     ])
     self.assert_batch_consistency(F.detect_pitch_frequency, waveforms,
                                   sample_rate)
    def test_spectrogram(self, n_fft, hop_length, power):
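        """Spectrogram transform should match librosa's spectrogram computation."""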
        sample_rate = 16000
        sound = common_utils.get_sinusoid(n_channels=1,
                                          sample_rate=sample_rate)
        sound_librosa = sound.cpu().numpy().squeeze()
        spect_transform = torchaudio.transforms.Spectrogram(
            n_fft=n_fft, hop_length=hop_length, power=power)
        out_librosa, _ = librosa.core.spectrum._spectrogram(
            y=sound_librosa, n_fft=n_fft, hop_length=hop_length, power=power)

        out_torch = spect_transform(sound).squeeze().cpu()
        self.assertEqual(out_torch,
                         torch.from_numpy(out_librosa),
                         atol=1e-5,
                         rtol=1e-5)
    def test_spectral_centroid(self, n_fft, hop_length):
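        """SpectralCentroid transform should match librosa.feature.spectral_centroid."""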
        sample_rate = 16000
        sound = common_utils.get_sinusoid(n_channels=1,
                                          sample_rate=sample_rate)
        sound_librosa = sound.cpu().numpy().squeeze()
        spect_centroid = torchaudio.transforms.SpectralCentroid(
            sample_rate=sample_rate, n_fft=n_fft, hop_length=hop_length)
        out_torch = spect_centroid(sound).squeeze().cpu()

        out_librosa = librosa.feature.spectral_centroid(y=sound_librosa,
                                                        sr=sample_rate,
                                                        n_fft=n_fft,
                                                        hop_length=hop_length)
        out_librosa = torch.from_numpy(out_librosa)[0]

        self.assertEqual(out_torch.type(out_librosa.dtype),
                         out_librosa,
                         atol=1e-5,
                         rtol=1e-5)
    def test_mfcc(self, n_fft, hop_length, n_mels, n_mfcc):
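        """MFCC transform should match a librosa/scipy-based reference (DCT of the log-mel spectrogram)."""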
        sample_rate = 16000
        sound = common_utils.get_sinusoid(n_channels=1,
                                          sample_rate=sample_rate)
        sound_librosa = sound.cpu().numpy().squeeze()
        librosa_mel = librosa.feature.melspectrogram(y=sound_librosa,
                                                     sr=sample_rate,
                                                     n_fft=n_fft,
                                                     hop_length=hop_length,
                                                     n_mels=n_mels,
                                                     htk=True,
                                                     norm=None)
        db_librosa = librosa.core.spectrum.power_to_db(librosa_mel)

        # librosa.feature.mfcc doesn't pass kwargs properly since some of the
        # kwargs for melspectrogram and mfcc are the same. We just follow the
        # function body in
        # https://librosa.github.io/librosa/_modules/librosa/feature/spectral.html#melspectrogram
        # to mirror this function call with correct args:
        #
        # librosa_mfcc = librosa.feature.mfcc(
        #     y=sound_librosa, sr=sample_rate, n_mfcc = n_mfcc,
        #     hop_length=hop_length, n_fft=n_fft, htk=True, norm=None, n_mels=n_mels)

        librosa_mfcc = scipy.fftpack.dct(db_librosa,
                                         axis=0,
                                         type=2,
                                         norm='ortho')[:n_mfcc]
        librosa_mfcc_tensor = torch.from_numpy(librosa_mfcc)

        melkwargs = {'hop_length': hop_length, 'n_fft': n_fft}
        mfcc_transform = torchaudio.transforms.MFCC(sample_rate=sample_rate,
                                                    n_mfcc=n_mfcc,
                                                    norm='ortho',
                                                    melkwargs=melkwargs)
        torch_mfcc = mfcc_transform(sound).squeeze().cpu()

        self.assertEqual(torch_mfcc.type(librosa_mfcc_tensor.dtype),
                         librosa_mfcc_tensor,
                         atol=5e-3,
                         rtol=1e-5)
# Example 16
    def test_apply_effects_tensor(self, args):
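        """Scripted SoxEffectTensorTransform should match sox_effects.apply_effects_tensor."""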
        effects = args['effects']
        channels_first = True
        num_channels = args.get("num_channels", 2)
        input_sr = args.get("input_sample_rate", 8000)

        trans = SoxEffectTensorTransform(effects, input_sr, channels_first)

        trans = torch_script(trans)

        wav = get_sinusoid(frequency=800,
                           sample_rate=input_sr,
                           n_channels=num_channels,
                           dtype='float32',
                           channels_first=channels_first)
        found, sr_found = trans(wav)
        expected, sr_expected = sox_effects.apply_effects_tensor(
            wav, input_sr, effects, channels_first)

        assert sr_found == sr_expected
        self.assertEqual(expected, found)
# Example 17
    def test_filtfilt_filter_sinusoid(self):
        """
        Check that, for a signal comprising two sinusoids, applying filtfilt
        with appropriate filter coefficients correctly removes the higher-frequency
        sinusoid while imparting no time delay.
        """
        T = 1.0
        samples = 1000

        waveform_k0 = get_sinusoid(frequency=5,
                                   sample_rate=samples // T,
                                   dtype=self.dtype,
                                   device=self.device).squeeze(0)
        waveform_k1 = get_sinusoid(
            frequency=200,
            sample_rate=samples // T,
            dtype=self.dtype,
            device=self.device,
        ).squeeze(0)
        waveform = waveform_k0 + waveform_k1

        # Transfer function numerator and denominator polynomial coefficients
        # corresponding to 8th-order Butterworth filter with 100-cycle/T cutoff.
        # Generated with
        # >>> from scipy import signal
        # >>> b_coeffs, a_coeffs = signal.butter(8, 0.2)
        b_coeffs = torch.tensor(
            [
                2.39596441e-05,
                1.91677153e-04,
                6.70870035e-04,
                1.34174007e-03,
                1.67717509e-03,
                1.34174007e-03,
                6.70870035e-04,
                1.91677153e-04,
                2.39596441e-05,
            ],
            dtype=self.dtype,
            device=self.device,
        )
        a_coeffs = torch.tensor(
            [
                1.0,
                -4.78451489,
                10.44504107,
                -13.45771989,
                11.12933104,
                -6.0252604,
                2.0792738,
                -0.41721716,
                0.0372001,
            ],
            dtype=self.dtype,
            device=self.device,
        )

        # Extend waveform in each direction, preserving periodicity.
        padded_waveform = torch.cat((waveform[:-1], waveform, waveform[1:]))

        output_waveform = F.filtfilt(padded_waveform, a_coeffs, b_coeffs)

        # Remove padding from output waveform; confirm that result
        # closely matches waveform_k0.
        self.assertEqual(
            output_waveform[samples - 1:2 * samples - 1],
            waveform_k0,
            atol=1e-3,
            rtol=1e-3,
        )