Example #1
0
 def __init__(self, n_fft=1024, n_mels=80):
     """Set up a mel-to-waveform inversion pipeline.

     Args:
         n_fft (int): FFT size; the linear spectrogram has n_fft // 2 + 1 bins.
         n_mels (int): number of mel bins in the input spectrogram.
     """
     super().__init__()
     # Map the mel-scaled spectrogram back to a linear-frequency spectrogram.
     # NOTE(review): `sample_rate` is read from the enclosing module scope —
     # confirm it is defined before this class is instantiated.
     self.mel_to_lin = transforms.InverseMelScale(
         n_stft=n_fft // 2 + 1,
         n_mels=n_mels,
         sample_rate=sample_rate,
         max_iter=2048)
     # Recover phase / waveform from the linear spectrogram.
     self.griffin_lim = transforms.GriffinLim(n_fft=n_fft, hop_length=256)
Example #2
0
    def test_batch_InverseMelScale(self):
        """Batched input through InverseMelScale matches unbatched-then-repeated output.

        InverseMelScale optimizes randomly initialized values with SGD, so two
        runs never agree elementwise; the meaningful assertion here is the
        shape, and the value check uses a very relaxed tolerance.
        """
        n_fft = 8
        n_mels = 32
        # Derive n_stft from n_fft instead of hard-coding 5, so the two
        # constants cannot silently drift apart (8 // 2 + 1 == 5, unchanged).
        n_stft = n_fft // 2 + 1
        mel_spec = torch.randn(2, n_mels, 32) ** 2

        # Single then transform then batch
        expected = transforms.InverseMelScale(n_stft, n_mels)(mel_spec).repeat(3, 1, 1, 1)

        # Batch then transform
        computed = transforms.InverseMelScale(n_stft, n_mels)(mel_spec.repeat(3, 1, 1, 1))

        # shape = (3, 2, n_mels, 32)
        self.assertTrue(computed.shape == expected.shape, (computed.shape, expected.shape))

        # Because InverseMelScale runs SGD on randomly initialized values, the
        # two results do not match exactly; hence the very relaxed tolerance.
        self.assertTrue(torch.allclose(computed, expected, atol=1.0))
Example #3
0
    def test_InverseMelScale(self):
        """InverseMelScale transform is comparable to that of librosa"""
        n_fft = 2048
        n_mels = 256
        n_stft = n_fft // 2 + 1
        hop_length = n_fft // 4

        # Build the mel spectrogram input using torchaudio itself.
        waveform, sample_rate = self._get_sample_data(
            'steam-train-whistle-daniel_simon.wav', offset=2**10, num_frames=2**14)
        reference = F.spectrogram(
            waveform, pad=0, window=torch.hann_window(n_fft), n_fft=n_fft,
            hop_length=hop_length, win_length=n_fft, power=2, normalized=False)
        mel_ta = transforms.MelScale(n_mels=n_mels, sample_rate=sample_rate)(reference)
        mel_lr = mel_ta.cpu().numpy().squeeze()

        # Invert the mel spectrogram with both torchaudio and librosa.
        spec_ta = transforms.InverseMelScale(
            n_stft, n_mels=n_mels, sample_rate=sample_rate)(mel_ta)
        spec_lr = torch.from_numpy(
            librosa.feature.inverse.mel_to_stft(
                mel_lr, sr=sample_rate, n_fft=n_fft, power=2.0, htk=True,
                norm=None)[None, ...])

        # Align dimensions: torchaudio returned power spectrograms, librosa did
        # not, so take square roots of the torchaudio results.
        reference = reference.sqrt()
        spec_ta = spec_ta.sqrt()

        threshold = 2.0
        # This threshold was chosen empirically, based on the following observation
        #
        # torch.dist(spec_lr, spec_ta, p=float('inf'))
        # >>> tensor(1.9666)
        #
        # The spectrograms reconstructed by librosa and torchaudio are not very
        # comparable elementwise: they use different approximation algorithms
        # and the resulting values can live in different magnitudes (although
        # most of them are very close).
        # See https://github.com/pytorch/audio/pull/366 for the discussion of the choice of algorithm
        # See https://github.com/pytorch/audio/pull/448/files#r385747021 for the distribution of P-inf
        # distance over frequencies.
        assert torch.allclose(spec_ta, spec_lr, atol=threshold)

        threshold = 1700.0
        # This threshold was chosen empirically, based on the following
        # observations (reference was named spec_orig at the time):
        #
        # torch.dist(spec_orig, spec_ta, p=1)
        # >>> tensor(1644.3516)
        # torch.dist(spec_orig, spec_lr, p=1)
        # >>> tensor(1420.7103)
        # torch.dist(spec_lr, spec_ta, p=1)
        # >>> tensor(943.2759)
        assert torch.dist(reference, spec_ta, p=1) < threshold
Example #4
0
    def test_InverseMelScale(self):
        """Gauge the quality of the InverseMelScale transform.

        InverseMelScale is currently implemented with random initialization
        followed by iterative optimization, so it is not practical to assert
        on the difference between the estimated and the original spectrogram
        as a whole — locally the discrepancy can be huge. Instead this test
        measures what fraction of elements fall below a given relative
        tolerance. The estimate is currently of modest quality; a change that
        makes it even worse will fail this test.
        """
        n_fft = 400
        power = 1
        n_mels = 64
        sample_rate = 8000
        n_stft = n_fft // 2 + 1

        # Reference spectrogram, and its mel-scaled version used as the input.
        noise = get_whitenoise(sample_rate=sample_rate, duration=1, n_channels=2)
        expected = get_spectrogram(noise, n_fft=n_fft, power=power).to(
            self.device, self.dtype)
        mel_scale = T.MelScale(n_mels=n_mels, sample_rate=sample_rate)
        mel_input = mel_scale.to(self.device, self.dtype)(expected)

        # Run the inverse transform; the seed pins the random initialization.
        inverse = T.InverseMelScale(
            n_stft, n_mels=n_mels, sample_rate=sample_rate).to(
                self.device, self.dtype)
        torch.random.manual_seed(0)
        result = inverse(mel_input)

        # Elementwise relative error; epsilon guards against division by zero.
        epsilon = 1e-60
        relative_diff = torch.abs((result - expected) / (expected + epsilon))

        for tol in [1e-1, 1e-3, 1e-5, 1e-10]:
            print(f"Ratio of relative diff smaller than {tol:e} is "
                  f"{_get_ratio(relative_diff < tol)}")
        assert _get_ratio(relative_diff < 1e-1) > 0.2
        assert _get_ratio(relative_diff < 1e-3) > 5e-3
        assert _get_ratio(relative_diff < 1e-5) > 1e-5
Example #5
0
def _mel_to_wav(mel, sr=sample_rate, engine='librosa'):
    """Invert a mel spectrogram to a waveform using the Griffin-Lim algorithm.

    Args:
        mel: mel spectrogram to invert.
        sr: target sampling rate (defaults to the module-level ``sample_rate``).
        engine: either ``'librosa'`` or ``'torch'``.

    Raises:
        ValueError: if ``engine`` is not one of the supported backends.
    """
    if engine == 'librosa':
        # librosa performs mel inversion and Griffin-Lim in a single call.
        return librosa.feature.inverse.mel_to_audio(mel,
                                                    sr=sr,
                                                    **stft_params,
                                                    power=power)

    if engine == 'torch':
        # torchaudio: first undo the mel scaling, then Griffin-Lim the
        # resulting linear spectrogram via _spec_to_wav.
        inverse_mel = tf.InverseMelScale(
            n_stft=stft_params['n_fft'] // 2 + 1,
            sample_rate=sr,
            n_mels=n_mels,
            f_max=mel_params['fmax'],
            f_min=mel_params['fmin'],
            max_iter=1000)
        return _spec_to_wav(inverse_mel(mel), sr=sr, engine=engine)

    raise ValueError(engine)