Ejemplo n.º 1
0
    def test_batch_MelScale(self):
        """MelScale must commute with batching.

        Transforming a spectrogram and then tiling it into a batch must
        equal tiling first and transforming the whole batch.
        """
        specgram = torch.randn(2, 31, 2786)

        # Transform a single item, then tile the result into a batch of 3.
        expected = transforms.MelScale()(specgram).repeat(3, 1, 1, 1)

        # Tile into a batch of 3 first, then transform the batch.
        computed = transforms.MelScale()(specgram.repeat(3, 1, 1, 1))

        # shape = (3, 2, 201, 1394)
        self.assertTrue(computed.shape == expected.shape, (computed.shape, expected.shape))
        self.assertTrue(torch.allclose(computed, expected))
Ejemplo n.º 2
0
    def test_melscale_load_save(self):
        """Restoring a saved state dict must reproduce the filter bank exactly."""
        specgram = torch.ones(1, 1000, 100)

        # Lazily-initialized transform: the fb buffer is materialized on
        # the first forward call.
        original = transforms.MelScale()
        original(specgram)

        # A transform built with an explicit n_stft can load that state.
        restored = transforms.MelScale(n_stft=1000)
        restored.load_state_dict(original.state_dict())

        self.assertEqual(restored.fb.size(), (1000, 128))
        self.assertTrue(torch.allclose(original.fb, restored.fb))
Ejemplo n.º 3
0
    def test_melscale_unset_weight_warning(self):
        """Issue a warning if MelScale initialized without a weight

        As part of the deprecation of lazy intialization behavior (#1510),
        constructing MelScale without `n_stft` must emit exactly one
        warning, and supplying `n_stft` must emit none.
        """
        with warnings.catch_warnings(record=True) as caught:
            warnings.simplefilter("always")
            T.MelScale(n_mels=64, sample_rate=8000)
        assert len(caught) == 1

        with warnings.catch_warnings(record=True) as caught:
            warnings.simplefilter("always")
            T.MelScale(n_mels=64, sample_rate=8000, n_stft=201)
        assert len(caught) == 0
Ejemplo n.º 4
0
    def test_MelScale(self):
        """MelScale transform is comparable to that of librosa"""
        n_fft = 2048
        n_mels = 256
        hop_length = n_fft // 4

        # Prepare spectrogram input. We use torchaudio to compute one.
        sound, sample_rate = self._get_sample_data('whitenoise_1min.mp3')
        spec_ta = F.spectrogram(sound,
                                pad=0,
                                window=torch.hann_window(n_fft),
                                n_fft=n_fft,
                                hop_length=hop_length,
                                win_length=n_fft,
                                power=2,
                                normalized=False)
        # librosa consumes a numpy array; squeeze drops the channel dimension.
        spec_lr = spec_ta.cpu().numpy().squeeze()
        # Perform MelScale with torchaudio and librosa
        melspec_ta = transforms.MelScale(n_mels=n_mels,
                                         sample_rate=sample_rate)(spec_ta)
        # htk=True / norm=None presumably mirror torchaudio's mel filter
        # bank defaults so the two results are directly comparable — confirm.
        melspec_lr = librosa.feature.melspectrogram(S=spec_lr,
                                                    sr=sample_rate,
                                                    n_fft=n_fft,
                                                    hop_length=hop_length,
                                                    win_length=n_fft,
                                                    center=True,
                                                    window='hann',
                                                    n_mels=n_mels,
                                                    htk=True,
                                                    norm=None)
        # Note: Using relaxed rtol instead of atol
        # [None, ...] re-adds the channel dimension dropped by squeeze above.
        assert torch.allclose(melspec_ta,
                              torch.from_numpy(melspec_lr[None, ...]),
                              rtol=1e-3)
Ejemplo n.º 5
0
def preprocess(file_path='../DATASETS/LJSpeech-1.1/metadata.csv',
               root_dir='../DATASETS/LJSpeech-1.1'):
    """Precompute mel spectrograms for every clip listed in the metadata file.

    Reads the pipe-separated metadata file, loads each referenced wav,
    resamples it to 8 kHz, converts it to an 80-bin mel spectrogram, and
    stores all results in two tensors saved under ``root_dir``:
    ``mel_data.pt`` (zero-padded to 316 frames) and ``mel_len.pt``
    (the true frame count of each clip).

    Args:
        file_path: path to the metadata CSV (``id|text|...`` per line).
        root_dir: dataset root containing the ``wavs/`` directory; also
            the destination of the saved tensors.
    """
    with open(file_path, encoding='utf8') as file:
        data_ = [line.strip().split('|') for line in file]

    # Build the transforms once, outside the loop.
    # NOTE(review): resampling assumes every source wav is 22050 Hz — confirm.
    sample_rate = 8000
    resample = transforms.Resample(orig_freq=22050, new_freq=sample_rate)
    spectrogram = transforms.Spectrogram(n_fft=1024, hop_length=256)
    to_mel = transforms.MelScale(n_mels=80,
                                 sample_rate=sample_rate,
                                 n_stft=1024 // 2 + 1)

    # Fixed-size buffers: up to 316 frames of 80 mel bins per clip.
    mel_data = torch.zeros(len(data_), 316, 80)
    mel_len = torch.empty(len(data_), dtype=torch.int)

    for idx, entry in enumerate(tqdm(data_)):
        clip_id = entry[0]
        path = f'{root_dir}/wavs/{clip_id}.wav'

        # Discard the native rate; do not clobber the 8 kHz `sample_rate`
        # that the transforms above were configured with.
        waveform, _ = torchaudio.load(path)
        waveform = resample(waveform)
        spec = spectrogram(waveform)
        mel = to_mel(spec)
        # (1, n_mels, time) -> (time, n_mels)
        mel = mel.transpose(1, 2).squeeze(0)
        mel_data[idx, :mel.size(0)] = mel
        mel_len[idx] = mel.size(0)

    torch.save(mel_data, f'{root_dir}/mel_data.pt')
    torch.save(mel_len, f'{root_dir}/mel_len.pt')
Ejemplo n.º 6
0
 def test_melscale(self):
     """Autograd check: MelScale must be differentiable w.r.t. its input."""
     sample_rate = 8000
     n_fft = 400
     # n_mels chosen to equal the number of STFT bins (n_fft // 2 + 1).
     n_mels = n_fft // 2 + 1
     transform = T.MelScale(sample_rate=sample_rate, n_mels=n_mels)
     # power=1 yields a magnitude (not power) spectrogram as the input.
     spec = get_spectrogram(
         get_whitenoise(sample_rate=sample_rate, duration=0.05, n_channels=2),
         n_fft=n_fft, power=1)
     self.assert_grad(transform, [spec])
 def __init__(self, sample_rate, n_fft, win_length, hop_length, n_mels):
     """Build the complex-spectrogram + mel-scale pipeline.

     Args:
         sample_rate: audio sample rate used to build the mel filter bank.
         n_fft: FFT size; the filter bank spans n_fft // 2 + 1 STFT bins.
         win_length: STFT window length.
         hop_length: STFT hop length.
         n_mels: number of mel bins.
     """
     super(PhaseFbankCal, self).__init__()
     # power=None keeps the spectrogram complex-valued (phase preserved).
     self.complexSpec = transforms.Spectrogram(n_fft=n_fft,
                                               win_length=win_length,
                                               hop_length=hop_length,
                                               power=None)
     self.mel_scale = transforms.MelScale(n_mels=n_mels,
                                          sample_rate=sample_rate,
                                          n_stft=n_fft // 2 + 1)
Ejemplo n.º 8
0
    def test_mel2(self):
        """End-to-end checks of MelSpectrogram + AmplitudeToDB.

        Covers default parameters, custom parameters, multi-channel input,
        and direct construction of the mel filter bank matrix.
        """
        top_db = 80.
        s2db = transforms.AmplitudeToDB('power', top_db)

        waveform = self.waveform.clone()  # (1, 16000)
        waveform_scaled = self.scale(waveform)  # (1, 16000)
        mel_transform = transforms.MelSpectrogram()
        # check defaults
        spectrogram_torch = s2db(
            mel_transform(waveform_scaled))  # (1, 128, 321)
        self.assertTrue(spectrogram_torch.dim() == 3)
        # AmplitudeToDB clamps everything to within top_db of the max.
        self.assertTrue(
            spectrogram_torch.ge(spectrogram_torch.max() - top_db).all())
        self.assertEqual(spectrogram_torch.size(1), mel_transform.n_mels)
        # check correctness of filterbank conversion matrix
        self.assertTrue(mel_transform.mel_scale.fb.sum(1).le(1.).all())
        self.assertTrue(mel_transform.mel_scale.fb.sum(1).ge(0.).all())
        # check options
        kwargs = {
            'window_fn': torch.hamming_window,
            'pad': 10,
            'win_length': 500,
            'hop_length': 125,
            'n_fft': 800,
            'n_mels': 50
        }
        mel_transform2 = transforms.MelSpectrogram(**kwargs)
        spectrogram2_torch = s2db(
            mel_transform2(waveform_scaled))  # (1, 50, 513)
        self.assertTrue(spectrogram2_torch.dim() == 3)
        # BUGFIX: assert on spectrogram2_torch (was a copy-paste of the
        # default-parameter assertion on spectrogram_torch).
        self.assertTrue(
            spectrogram2_torch.ge(spectrogram2_torch.max() - top_db).all())
        self.assertEqual(spectrogram2_torch.size(1), mel_transform2.n_mels)
        self.assertTrue(mel_transform2.mel_scale.fb.sum(1).le(1.).all())
        self.assertTrue(mel_transform2.mel_scale.fb.sum(1).ge(0.).all())
        # check on multi-channel audio
        filepath = common_utils.get_asset_path(
            'steam-train-whistle-daniel_simon.wav')
        x_stereo = common_utils.load_wav(filepath)[0]  # (2, 278756), 44100
        spectrogram_stereo = s2db(mel_transform(x_stereo))  # (2, 128, 1394)
        self.assertTrue(spectrogram_stereo.dim() == 3)
        self.assertTrue(spectrogram_stereo.size(0) == 2)
        # BUGFIX: assert on spectrogram_stereo (was a copy-paste of the
        # default-parameter assertion on spectrogram_torch).
        self.assertTrue(
            spectrogram_stereo.ge(spectrogram_stereo.max() - top_db).all())
        self.assertEqual(spectrogram_stereo.size(1), mel_transform.n_mels)
        # check filterbank matrix creation
        fb_matrix_transform = transforms.MelScale(n_mels=100,
                                                  sample_rate=16000,
                                                  f_min=0.,
                                                  f_max=None,
                                                  n_stft=400)
        self.assertTrue(fb_matrix_transform.fb.sum(1).le(1.).all())
        self.assertTrue(fb_matrix_transform.fb.sum(1).ge(0.).all())
        self.assertEqual(fb_matrix_transform.fb.size(), (400, 100))
Ejemplo n.º 9
0
    def test_InverseMelScale(self):
        """InverseMelScale transform is comparable to that of librosa"""
        n_fft = 2048
        n_mels = 256
        n_stft = n_fft // 2 + 1
        hop_length = n_fft // 4

        # Prepare mel spectrogram input. We use torchaudio to compute one.
        sound, sample_rate = self._get_sample_data(
            'steam-train-whistle-daniel_simon.wav', offset=2**10, num_frames=2**14)
        spec_orig = F.spectrogram(
            sound, pad=0, window=torch.hann_window(n_fft), n_fft=n_fft,
            hop_length=hop_length, win_length=n_fft, power=2, normalized=False)
        melspec_ta = transforms.MelScale(n_mels=n_mels, sample_rate=sample_rate)(spec_orig)
        # librosa consumes a numpy array; squeeze drops the channel dimension.
        melspec_lr = melspec_ta.cpu().numpy().squeeze()
        # Perform InverseMelScale with torch audio and librosa
        spec_ta = transforms.InverseMelScale(
            n_stft, n_mels=n_mels, sample_rate=sample_rate)(melspec_ta)
        spec_lr = librosa.feature.inverse.mel_to_stft(
            melspec_lr, sr=sample_rate, n_fft=n_fft, power=2.0, htk=True, norm=None)
        # [None, ...] re-adds the channel dimension for comparison.
        spec_lr = torch.from_numpy(spec_lr[None, ...])

        # Align dimensions
        # librosa does not return power spectrogram while torchaudio returns power spectrogram
        spec_orig = spec_orig.sqrt()
        spec_ta = spec_ta.sqrt()

        threshold = 2.0
        # This threshold was choosen empirically, based on the following observation
        #
        # torch.dist(spec_lr, spec_ta, p=float('inf'))
        # >>> tensor(1.9666)
        #
        # The spectrograms reconstructed by librosa and torchaudio are not very comparable elementwise.
        # This is because they use different approximation algorithms and resulting values can live
        # in different magnitude. (although most of them are very close)
        # See https://github.com/pytorch/audio/pull/366 for the discussion of the choice of algorithm
        # See https://github.com/pytorch/audio/pull/448/files#r385747021 for the distribution of P-inf
        # distance over frequencies.
        assert torch.allclose(spec_ta, spec_lr, atol=threshold)

        threshold = 1700.0
        # This threshold was choosen empirically, based on the following observations
        #
        # torch.dist(spec_orig, spec_ta, p=1)
        # >>> tensor(1644.3516)
        # torch.dist(spec_orig, spec_lr, p=1)
        # >>> tensor(1420.7103)
        # torch.dist(spec_lr, spec_ta, p=1)
        # >>> tensor(943.2759)
        assert torch.dist(spec_orig, spec_ta, p=1) < threshold
Ejemplo n.º 10
0
    def test_mel2(self):
        """End-to-end checks of MelSpectrogram + SpectrogramToDB (legacy API).

        Covers default parameters, custom parameters, multi-channel input,
        and direct construction of the mel filter bank matrix.
        """
        top_db = 80.
        s2db = transforms.SpectrogramToDB("power", top_db)

        audio_orig = self.sig.clone()  # (16000, 1)
        audio_scaled = transforms.Scale()(audio_orig)  # (16000, 1)
        audio_scaled = transforms.LC2CL()(audio_scaled)  # (1, 16000)
        mel_transform = transforms.MelSpectrogram()
        # check defaults
        spectrogram_torch = s2db(mel_transform(audio_scaled))  # (1, 319, 40)
        self.assertTrue(spectrogram_torch.dim() == 3)
        # SpectrogramToDB clamps everything to within top_db of the max.
        self.assertTrue(
            spectrogram_torch.ge(spectrogram_torch.max() - top_db).all())
        self.assertEqual(spectrogram_torch.size(-1), mel_transform.n_mels)
        # check correctness of filterbank conversion matrix
        self.assertTrue(mel_transform.fm.fb.sum(1).le(1.).all())
        self.assertTrue(mel_transform.fm.fb.sum(1).ge(0.).all())
        # check options
        kwargs = {
            "window": torch.hamming_window,
            "pad": 10,
            "ws": 500,
            "hop": 125,
            "n_fft": 800,
            "n_mels": 50
        }
        mel_transform2 = transforms.MelSpectrogram(**kwargs)
        spectrogram2_torch = s2db(mel_transform2(audio_scaled))  # (1, 506, 50)
        self.assertTrue(spectrogram2_torch.dim() == 3)
        # BUGFIX: assert on spectrogram2_torch (was a copy-paste of the
        # default-parameter assertion on spectrogram_torch).
        self.assertTrue(
            spectrogram2_torch.ge(spectrogram2_torch.max() - top_db).all())
        self.assertEqual(spectrogram2_torch.size(-1), mel_transform2.n_mels)
        self.assertTrue(mel_transform2.fm.fb.sum(1).le(1.).all())
        self.assertTrue(mel_transform2.fm.fb.sum(1).ge(0.).all())
        # check on multi-channel audio
        x_stereo, sr_stereo = torchaudio.load(self.test_filepath)
        spectrogram_stereo = s2db(mel_transform(x_stereo))
        self.assertTrue(spectrogram_stereo.dim() == 3)
        self.assertTrue(spectrogram_stereo.size(0) == 2)
        # BUGFIX: assert on spectrogram_stereo (was a copy-paste of the
        # default-parameter assertion on spectrogram_torch).
        self.assertTrue(
            spectrogram_stereo.ge(spectrogram_stereo.max() - top_db).all())
        self.assertEqual(spectrogram_stereo.size(-1), mel_transform.n_mels)
        # check filterbank matrix creation
        fb_matrix_transform = transforms.MelScale(n_mels=100,
                                                  sr=16000,
                                                  f_max=None,
                                                  f_min=0.,
                                                  n_stft=400)
        self.assertTrue(fb_matrix_transform.fb.sum(1).le(1.).all())
        self.assertTrue(fb_matrix_transform.fb.sum(1).ge(0.).all())
        self.assertEqual(fb_matrix_transform.fb.size(), (400, 100))
Ejemplo n.º 11
0
    def test_InverseMelScale(self):
        """Gauge the quality of InverseMelScale transform.

        As InverseMelScale is currently implemented with
        random initialization + iterative optimization,
        it is not practically possible to assert the difference between
        the estimated spectrogram and the original spectrogram as a whole.
        Estimated spectrogram has very huge descrepency locally.
        Thus in this test we gauge what percentage of elements are bellow
        certain tolerance.
        At the moment, the quality of estimated spectrogram is not good.
        When implementation is changed in a way it makes the quality even worse,
        this test will fail.
        """
        n_fft = 400
        power = 1
        n_mels = 64
        sample_rate = 8000

        n_stft = n_fft // 2 + 1

        # Generate reference spectrogram and input mel-scaled spectrogram
        expected = get_spectrogram(get_whitenoise(sample_rate=sample_rate,
                                                  duration=1,
                                                  n_channels=2),
                                   n_fft=n_fft,
                                   power=power).to(self.device, self.dtype)
        input = T.MelScale(n_mels=n_mels,
                           sample_rate=sample_rate).to(self.device,
                                                       self.dtype)(expected)

        # Run transform
        transform = T.InverseMelScale(n_stft,
                                      n_mels=n_mels,
                                      sample_rate=sample_rate).to(
                                          self.device, self.dtype)
        # Fixed seed: the transform's random initialization must be
        # deterministic for the ratio thresholds below to be meaningful.
        torch.random.manual_seed(0)
        result = transform(input)

        # Compare
        # epsilon guards against division by zero where expected == 0.
        epsilon = 1e-60
        relative_diff = torch.abs((result - expected) / (expected + epsilon))

        for tol in [1e-1, 1e-3, 1e-5, 1e-10]:
            print(f"Ratio of relative diff smaller than {tol:e} is "
                  f"{_get_ratio(relative_diff < tol)}")
        assert _get_ratio(relative_diff < 1e-1) > 0.2
        assert _get_ratio(relative_diff < 1e-3) > 5e-3
        assert _get_ratio(relative_diff < 1e-5) > 1e-5
Ejemplo n.º 12
0
    def __init__(self, hparams: Hyperparams):
        """Build the spectrogram and mel-scale transforms from hyperparameters.

        Args:
            hparams: hyperparameter bundle; must provide ``n_fft``,
                ``win_length``, ``hop_length``, ``n_mels`` and ``sr``.
        """
        super().__init__()

        self.hparams = hparams

        # NOTE(review): Spectrogram is presumably torchaudio's transform,
        # taking (n_fft, win_length, hop_length) positionally — confirm import.
        self.spectrogram = Spectrogram(
            self.hparams.n_fft,
            self.hparams.win_length,
            self.hparams.hop_length
        )

        # n_stft must equal the number of frequency bins the spectrogram
        # produces, i.e. n_fft // 2 + 1.
        self.mel_scale = transforms.MelScale(
            self.hparams.n_mels,
            self.hparams.sr,
            n_stft=self.hparams.n_fft // 2 + 1
        )
Ejemplo n.º 13
0
 def test_mel2(self):
     """End-to-end checks of MelSpectrogram (oldest API, built-in dB scaling).

     Covers default parameters, custom parameters, multi-channel input,
     and direct construction of the mel filter bank matrix.
     """
     audio_orig = self.sig.clone()  # (16000, 1)
     audio_scaled = transforms.Scale()(audio_orig)  # (16000, 1)
     audio_scaled = transforms.LC2CL()(audio_scaled)  # (1, 16000)
     mel_transform = transforms.MelSpectrogram()
     # check defaults
     spectrogram_torch = mel_transform(audio_scaled)  # (1, 319, 40)
     self.assertTrue(spectrogram_torch.dim() == 3)
     # Output is dB-scaled: non-positive, bounded below by top_db.
     self.assertTrue(spectrogram_torch.le(0.).all())
     self.assertTrue(spectrogram_torch.ge(mel_transform.top_db).all())
     self.assertEqual(spectrogram_torch.size(-1), mel_transform.n_mels)
     # check correctness of filterbank conversion matrix
     self.assertTrue(mel_transform.fm.fb.sum(1).le(1.).all())
     self.assertTrue(mel_transform.fm.fb.sum(1).ge(0.).all())
     # check options
     mel_transform2 = transforms.MelSpectrogram(window=torch.hamming_window,
                                                pad=10,
                                                ws=500,
                                                hop=125,
                                                n_fft=800,
                                                n_mels=50)
     spectrogram2_torch = mel_transform2(audio_scaled)  # (1, 506, 50)
     self.assertTrue(spectrogram2_torch.dim() == 3)
     self.assertTrue(spectrogram2_torch.le(0.).all())
     # BUGFIX: bound-check against mel_transform2's own top_db
     # (was copy-pasted mel_transform.top_db; same default value,
     # but the assertion should reference the transform under test).
     self.assertTrue(spectrogram2_torch.ge(mel_transform2.top_db).all())
     self.assertEqual(spectrogram2_torch.size(-1), mel_transform2.n_mels)
     self.assertTrue(mel_transform2.fm.fb.sum(1).le(1.).all())
     self.assertTrue(mel_transform2.fm.fb.sum(1).ge(0.).all())
     # check on multi-channel audio
     x_stereo, sr_stereo = torchaudio.load(self.test_filepath)
     spectrogram_stereo = mel_transform(x_stereo)
     self.assertTrue(spectrogram_stereo.dim() == 3)
     self.assertTrue(spectrogram_stereo.size(0) == 2)
     self.assertTrue(spectrogram_stereo.le(0.).all())
     self.assertTrue(spectrogram_stereo.ge(mel_transform.top_db).all())
     self.assertEqual(spectrogram_stereo.size(-1), mel_transform.n_mels)
     # check filterbank matrix creation
     fb_matrix_transform = transforms.MelScale(n_mels=100,
                                               sr=16000,
                                               f_max=None,
                                               f_min=0.,
                                               n_stft=400)
     self.assertTrue(fb_matrix_transform.fb.sum(1).le(1.).all())
     self.assertTrue(fb_matrix_transform.fb.sum(1).ge(0.).all())
     self.assertEqual(fb_matrix_transform.fb.size(), (400, 100))
 def tfm_spectro(ad, sr, to_db_scale=False):
     """Convert an audio tensor to a mel-scaled spectrogram.

     Args:
         ad: audio tensor accepted by transforms.Spectrogram
             (presumably (channels, time) — confirm against callers).
         sr: sample rate; unused, kept for interface compatibility.
         to_db_scale: if True, convert magnitudes to decibels.
             Defaults to False, matching the original hard-coded behavior.

     Returns:
         Mel-scaled spectrogram tensor (dB-scaled when requested).
     """
     # Default Spectrogram/MelScale parameters are used, as before; the
     # previously dead local constants (ws, hop, n_fft, ...) were removed.
     spec = transforms.Spectrogram()(ad)
     mel = transforms.MelScale()(spec)
     if to_db_scale:
         mel = 20 * torch.log10(mel)
     return mel
Ejemplo n.º 15
0
 def test_MelScale(self):
     """TorchScript consistency: scripted MelScale must match eager mode."""
     spectrogram = torch.rand((1, 6, 201))
     self._assert_consistency(T.MelScale(), spectrogram)
Ejemplo n.º 16
0
 def test_MelScale_invalid(self):
     """Scripting a lazily-initialized MelScale must raise ValueError."""
     transform = T.MelScale()
     with self.assertRaises(ValueError):
         torch.jit.script(transform)