Example no. 1
    def test_random_stft_sizes(self):
        for _ in range(5):
            nfft = random.randint(128, 2048)
            window_size = random.randint(128, nfft)
            hop_size = random.randint(64, window_size)
            fb_module = FilterbankFeatures(
                exact_pad=False, pad_to=1, n_fft=nfft, n_window_size=window_size, n_window_stride=hop_size
            )
            audio_length = random.randint(nfft, 2 ** 16)
            test_1 = torch.randn(1, audio_length)
            test_1_len = torch.tensor([audio_length])
            result, result_len = fb_module(test_1, test_1_len)
            assert (
                result.shape[2] == result_len[0]
            ), f"{result.shape} != {result_len}: {nfft}, {window_size}, {hop_size}, {audio_length}"

            spec = librosa.stft(
                test_1.cpu().detach().numpy().squeeze(), n_fft=nfft, hop_length=hop_size, win_length=window_size
            )

            assert (
                spec.shape[1] == result.shape[2]
            ), f"{result.shape} != {spec.shape}: {nfft}, {window_size}, {hop_size}, {audio_length}"

        for _ in range(5):
            nfft = random.randint(128, 2048)
            window_size = random.randint(128, nfft)
            hop_size = random.randint(64, window_size)
            fb_module = FilterbankFeatures(
                exact_pad=True, pad_to=1, n_fft=nfft, n_window_size=window_size, n_window_stride=hop_size
            )
            audio_length = random.randint(nfft, 2 ** 16)
            test_1 = torch.randn(1, audio_length)
            test_1_len = torch.tensor([audio_length])
            result, result_len = fb_module(test_1, test_1_len)
            assert (
                result.shape[2] == result_len[0]
            ), f"{result.shape} != {result_len}: {nfft}, {window_size}, {hop_size}, {audio_length}"

            test_2 = test_1.cpu().detach().numpy().squeeze()
            test_2 = np.pad(test_2, int((window_size - hop_size) // 2), mode="reflect")
            spec = librosa.stft(test_2, n_fft=nfft, hop_length=hop_size, win_length=window_size, center=False)

            assert (
                spec.shape[1] == result.shape[2]
            ), f"{result.shape} != {spec.shape}: {nfft}, {window_size}, {hop_size}, {audio_length}"
Example no. 2
    def test_seq_len(self):
        fb_module = FilterbankFeatures(exact_pad=False, pad_to=1)
        test_1 = torch.randn(1, 800)
        test_1_len = torch.tensor([800])
        result, result_len = fb_module(test_1, test_1_len)
        assert result.shape[2] == result_len[0], f"{result.shape} != {result_len}"
        spec = librosa.stft(test_1.cpu().detach().numpy().squeeze(), n_fft=512, hop_length=160, win_length=320)

        assert spec.shape[1] == result.shape[2], f"{result.shape} != {spec.shape}"
Example no. 3
def make_preprocessor_trainable(stt):
    # Snapshot the featurizer's public attributes so they can be copied onto
    # the replacement module below.
    big_dict = {
        k: v
        for k, v in stt.preprocessor.featurizer.__dict__.items()
        if not k.startswith('_') and k != 'forward'
    }
    st = stt.preprocessor.featurizer.state_dict()
    # Swap in a gradient-enabled featurizer, then restore its weights and the
    # saved attributes.
    stt.preprocessor.featurizer = FilterbankFeatures(use_grads=True)
    stt.preprocessor.featurizer.load_state_dict(st)
    for k, v in big_dict.items():
        setattr(stt.preprocessor.featurizer, k, v)
    #     stt = stt.cuda()
    return stt
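A hypothetical usage sketch; the model class and checkpoint name below are assumptions, not part of the original snippet. The point is that after the swap, gradients can flow through the mel-spectrogram computation in stt.preprocessor.featurizer.

import nemo.collections.asr as nemo_asr

# Assumed checkpoint; any NeMo ASR model whose preprocessor wraps
# FilterbankFeatures should work the same way.
stt = nemo_asr.models.EncDecCTCModel.from_pretrained(model_name="QuartzNet15x5Base-En")
stt = make_preprocessor_trainable(stt)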
Example no. 4
    def __init__(
        self,
        sample_rate=16000,
        window_size=0.02,
        window_stride=0.01,
        n_window_size=None,
        n_window_stride=None,
        window="hann",
        normalize="per_feature",
        n_fft=None,
        preemph=0.97,
        features=64,
        lowfreq=0,
        highfreq=None,
        log=True,
        log_zero_guard_type="add",
        log_zero_guard_value=2 ** -24,
        dither=1e-5,
        pad_to=16,
        frame_splicing=1,
        stft_exact_pad=False,
        stft_conv=False,
        pad_value=0,
        mag_power=2.0,
    ):
        super().__init__(n_window_size, n_window_stride)

        self._sample_rate = sample_rate
        if window_size and n_window_size:
            raise ValueError(
                f"{self} received both window_size and n_window_size. Only one should be specified."
            )
        if window_stride and n_window_stride:
            raise ValueError(
                f"{self} received both window_stride and n_window_stride. Only one should be specified."
            )
        if window_size:
            n_window_size = int(window_size * self._sample_rate)
        if window_stride:
            n_window_stride = int(window_stride * self._sample_rate)

        self.featurizer = FilterbankFeatures(
            sample_rate=self._sample_rate,
            n_window_size=n_window_size,
            n_window_stride=n_window_stride,
            window=window,
            normalize=normalize,
            n_fft=n_fft,
            preemph=preemph,
            nfilt=features,
            lowfreq=lowfreq,
            highfreq=highfreq,
            log=log,
            log_zero_guard_type=log_zero_guard_type,
            log_zero_guard_value=log_zero_guard_value,
            dither=dither,
            pad_to=pad_to,
            frame_splicing=frame_splicing,
            stft_exact_pad=stft_exact_pad,
            stft_conv=stft_conv,
            pad_value=pad_value,
            mag_power=mag_power,
        )
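Not in the original source: a small worked example of the seconds-to-samples conversion this constructor performs with its defaults. It is also why the librosa call in Example no. 2 uses win_length=320 and hop_length=160 (and n_fft=512, presumably the next power of two above the window length when n_fft is left as None).

sample_rate = 16000
n_window_size = int(0.02 * sample_rate)    # window_size=0.02 s   -> 320 samples
n_window_stride = int(0.01 * sample_rate)  # window_stride=0.01 s -> 160 samples
assert (n_window_size, n_window_stride) == (320, 160)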