Beispiel #1
0
    def forward(self, signal, threshs_file):
        """Apply the psychoacoustic mask to ``signal`` in the STFT domain.

        The waveform is analysed with a Hamming-windowed STFT, multiplied
        element-wise by the mask returned from ``self.get_psycho_mask`` and
        re-synthesised with the inverse STFT.

        Args:
            signal: waveform tensor accepted by ``torch.stft``.
            threshs_file: forwarded unchanged to ``self.get_psycho_mask``.

        Returns:
            ``signal`` itself when ``self.phi`` is ``None``; otherwise the
            masked, re-synthesised waveform.
        """
        if self.phi is None:
            return signal

        # Build the window once, on the signal's device: torch.stft rejects a
        # window that lives on a different device than its input.
        window = torch.hamming_window(self.win_length, device=signal.device)

        # fft -- current PyTorch requires return_complex for real inputs.
        # view_as_real exposes the result in the trailing (real, imag) layout
        # produced by the legacy call, so the masking code sees the same shape.
        complex_spectrum = torch.view_as_real(
            torch.stft(signal,
                       n_fft=self.win_length,
                       hop_length=self.hop_length,
                       win_length=self.win_length,
                       window=window,
                       pad_mode='constant',
                       onesided=True,
                       return_complex=True))

        # mask signal with psychoacoustic thresholds
        mask = self.get_psycho_mask(complex_spectrum, threshs_file)
        complex_spectrum_masked = complex_spectrum * mask

        # ifft -- torch.istft only accepts complex input on current PyTorch
        signal_out = torch.istft(
            torch.view_as_complex(complex_spectrum_masked.contiguous()),
            n_fft=self.win_length,
            hop_length=self.hop_length,
            win_length=self.win_length,
            window=window,
            onesided=True)

        return signal_out
def test_pwelch(random_state):
    """Check ``pwelch`` against ``scipy.signal.welch`` on a noisy cosine.

    Both the "density" and the "spectrum" scalings are compared, each with
    its own overlap, on a two-row complex signal.
    """
    from cplxmodule.utils.spectrum import pwelch
    from scipy.signal import welch

    # https://www.mathworks.com/help/signal/ref/pwelch.html#btulskp-6
    fs = 1000.
    tt = np.r_[:5 * fs - 1] / fs
    shape = 2, len(tt)

    # 100 Hz cosine plus low-level complex Gaussian noise
    noise = random_state.randn(*shape) + 1j * random_state.randn(*shape)
    np_x = np.cos(2 * np.pi * 100 * tt)[np.newaxis] + noise * 0.01

    # the torch side receives real and imaginary parts in a trailing axis
    tr_x = torch.tensor(np.stack([np_x.real, np_x.imag], axis=-1))
    tr_x.requires_grad = False

    tr_window = torch.hamming_window(500, periodic=False, dtype=tr_x.dtype)

    for scaling, overlap in (("density", 300), ("spectrum", 499)):
        tr_ff, tr_px = pwelch(tr_x, 1, tr_window, fs=fs,
                              scaling=scaling, n_overlap=overlap)
        np_ff, np_px = welch(np_x, fs=fs, axis=-1,
                             window=tr_window.numpy(),
                             nfft=None, nperseg=None, scaling=scaling,
                             noverlap=overlap, detrend=False,
                             return_onesided=False)

        assert torch.allclose(tr_px, torch.from_numpy(np_px))
        assert torch.allclose(tr_ff, torch.from_numpy(np_ff))
Beispiel #3
0
    def get_spectrogram_feature(self, signal):
        """Compute a log-magnitude spectrogram of ``signal``.

        Args:
            signal: 1-D sequence of samples accepted by ``torch.FloatTensor``.

        Returns:
            Float tensor of shape (frames, n_fft // 2 + 1) holding
            ``log1p(|STFT|)``. The frame axis is reversed when
            ``self.input_reverse`` is set, and the global mean is subtracted
            when ``self.normalize`` is set.
        """
        # return_complex=True is required for real inputs on current PyTorch;
        # abs() then gives the same magnitude as sqrt(re^2 + im^2).
        spectrum = torch.stft(
            torch.FloatTensor(signal),
            self.n_fft,
            hop_length=self.hop_length,
            win_length=self.n_fft,
            window=torch.hamming_window(self.n_fft),
            center=False,
            normalized=False,
            onesided=True,
            return_complex=True,
        )
        spectrogram = torch.log1p(spectrum.abs())  # (freq, frames)

        # Refer to "Sequence to Sequence Learning with Neural Network" paper
        if self.input_reverse:
            spectrogram = torch.flip(spectrogram, dims=[1])

        spectrogram = spectrogram.transpose(0, 1).contiguous()

        if self.normalize:
            spectrogram -= spectrogram.mean()

        return spectrogram
Beispiel #4
0
def get_window(window_type: str,
               window_length_in_samp: int,
               device: Optional[torch.device] = None) -> torch.Tensor:
    """Return a symmetric (non-periodic) float32 window.

    Args:
        window_type: one of "bartlett", "blackman", "hamming" or "hann".
        window_length_in_samp: number of samples in the window.
        device: device on which the window is created.

    Raises:
        ValueError: if ``window_type`` is not one of the supported names.
    """
    builders = {
        "bartlett": torch.bartlett_window,
        "blackman": torch.blackman_window,
        "hamming": torch.hamming_window,
        "hann": torch.hann_window,
    }
    if window_type not in builders:
        raise ValueError(f"Unknown window type: {window_type}")

    # Increase precision in order to achieve parity with
    # scipy.signal.windows.get_window implementation
    window = builders[window_type](window_length_in_samp,
                                   periodic=False,
                                   dtype=torch.float64,
                                   device=device)
    return window.to(torch.float32)
def test_hamming_window(random_state=None):
    """Check that torch and scipy Hamming windows agree in float64.

    scipy's second positional flag (``sym``) is the opposite of torch's
    ``periodic`` flag, so each scipy window is compared with the torch window
    built with the negated flag.
    """
    from scipy.signal.windows import hamming

    n_window = 1024

    for sym in (False, True):
        expected = hamming(n_window, sym).astype(np.float64)
        actual = torch.hamming_window(n_window, periodic=not sym,
                                      dtype=torch.float64)
        assert torch.allclose(actual, torch.from_numpy(expected))
Beispiel #6
0
def _feature_window_function(window_type: str, window_size: int,
                             blackman_coeff: float, device: str) -> Tensor:
    r"""Build the analysis window of the requested type on ``device``.

    Args:
        window_type: one of the module's window-type constants.
        window_size: number of samples in the window.
        blackman_coeff: leading coefficient of the Blackman window; only
            used for the Blackman type.
        device: device the finished window is moved to.

    Raises:
        Exception: if ``window_type`` is not a known window type.
    """
    if window_type == HANNING:
        return torch.hann_window(window_size, periodic=False).to(device)

    if window_type == HAMMING:
        hamming = torch.hamming_window(window_size,
                                       periodic=False,
                                       alpha=0.54,
                                       beta=0.46)
        return hamming.to(device)

    if window_type == POVEY:
        # like hanning but goes to zero at edges
        hann = torch.hann_window(window_size, periodic=False)
        return hann.pow(0.85).to(device)

    if window_type == RECTANGULAR:
        return torch.ones(window_size).to(device)

    if window_type == BLACKMAN:
        # can't use torch.blackman_window as they use different coefficients
        a = 2 * math.pi / (window_size - 1)
        positions = torch.arange(window_size)
        first_harmonic = 0.5 * torch.cos(a * positions)
        second_harmonic = (0.5 - blackman_coeff) * torch.cos(2 * a * positions)
        return (blackman_coeff - first_harmonic + second_harmonic).to(device)

    raise Exception('Invalid window type ' + window_type)
Beispiel #7
0
def overlap_add(mel_frames, nmels):
    """Overlap-add 160-row frames with a hop of 80 rows.

    Interior frames are weighted by a symmetric Hamming window. The first
    frame keeps its leading half unweighted and the last frame keeps its
    trailing half unweighted.

    Args:
        mel_frames: sequence of 2-D frames, each (160, nmels) or (nmels, 160);
            frames whose second axis is not ``nmels`` are transposed.
        nmels: number of columns in the output.

    Returns:
        Tensor of shape ((len(mel_frames) + 1) * 80, nmels).
    """
    frame_len, hop = 160, 80

    # Hamming windows used for overlap add
    full_win = torch.hamming_window(frame_len, periodic=False)
    edge_win = torch.cat((torch.ones(hop), full_win[hop:]), dim=0).unsqueeze(1)
    full_win = full_win.unsqueeze(1)

    n_frames = len(mel_frames)
    recon = torch.zeros(((1 + n_frames) * hop, nmels))

    for i, frame in enumerate(mel_frames):
        if frame.shape[1] != nmels:
            frame = frame.permute(1, 0)

        if i == 0:
            weight = edge_win
        elif i == n_frames - 1:
            weight = torch.flip(edge_win, dims=(0, ))
        else:
            weight = full_win

        start = i * hop
        recon[start:start + frame_len, :] += frame * weight

    return recon
Beispiel #8
0
 def torch_spec2wav(self, spectrogram, phase=None):
     """Invert a normalised dB magnitude spectrogram to a waveform.

     Args:
         spectrogram: normalised magnitudes, clamped to [0, 1] before use;
             axes 1 and 2 are swapped before inversion.
         phase: phase angles in radians with the same layout as
             ``spectrogram``. Required, despite the ``None`` default.

     Returns:
         Waveform produced by ``torch.istft``.

     Raises:
         ValueError: if ``phase`` is ``None``.
     """
     if phase is None:
         raise ValueError(
             "torch_spec2wav needs the phase tensor to invert the STFT")
     spectrogram = spectrogram.transpose(2, 1)
     phase = phase.transpose(2, 1)
     # denormalise spectrogram
     S = (torch.clamp(spectrogram, 0.0, 1.0) - 1.0) * -self.min_level_db
     S = S + self.ref_level_db
     # db_to_amp
     magnitude = torch.pow(10.0, S * 0.05)
     # Combine magnitude and phase as magnitude * (cos(phase) + i*sin(phase)).
     # The previous code applied exp() to the stacked (cos, sin) pair, which
     # is not the complex exponential and corrupted the spectrum.
     phase = phase.to(dtype=magnitude.dtype, device=magnitude.device)
     stft_matrix = torch.polar(magnitude, phase)
     # alpha = beta = 0.5 turns the Hamming formula into a Hann window
     window = torch.hamming_window(
         self.win_length, periodic=False, alpha=0.5,
         beta=0.5).to(device=magnitude.device)
     # torch.istft replaces the removed torchaudio.functional.istft
     return torch.istft(
         stft_matrix,
         self.n_fft,
         hop_length=self.hop_length,
         win_length=self.win_length,
         window=window,
         center=True,
         normalized=False,
         onesided=True,
         length=None)
def get_spectrogram_feature(filepath):
    """Load a WAV file and return its magnitude spectrogram.

    The samples are flattened, cut to the span between the first and the last
    sample above ``MIN_SIGNAL_VALUE``, truncated to ``MAX_SIGNAL_LENGTH`` and
    analysed with a 30 ms Hamming window and a 10 ms hop.

    Args:
        filepath: path of the WAV file, read with ``wavio.readwav``.

    Returns:
        Float tensor of shape (frames, N_FFT // 2 + 1) with STFT magnitudes.
    """
    (_rate, _width, sig) = wavio.readwav(filepath)
    sig = sig.ravel()

    # sig, _ = librosa.effects.trim(sig.astype(np.float32))
    # NOTE(review): the slice end is exclusive, so the last sample above the
    # threshold is dropped, and a file with no sample above the threshold
    # raises IndexError -- kept as in the original, confirm this is intended.
    valid_pos = np.where(sig > MIN_SIGNAL_VALUE)[0]
    sig = sig[valid_pos[0]:valid_pos[-1]]

    if len(sig) > MAX_SIGNAL_LENGTH:
        sig = sig[:MAX_SIGNAL_LENGTH]

    win_length = int(0.030 * SAMPLE_RATE)
    # return_complex=True is required for real inputs on current PyTorch
    stft = torch.stft(torch.FloatTensor(sig),
                      N_FFT,
                      hop_length=int(0.01 * SAMPLE_RATE),
                      win_length=win_length,
                      window=torch.hamming_window(win_length),
                      center=False,
                      normalized=False,
                      onesided=True,
                      return_complex=True)

    # |STFT| is (freq, frames); callers receive (frames, freq)
    return stft.abs().transpose(0, 1)
Beispiel #10
0
def get_window(name, window_length, squared=False):
    """
    Builds a window of the requested type.

    Arguments:
    ----------
        name (str)                  : name of the window: 'hann', 'hamming' or 'blackman'
        window_length (int)         : length of the window
        squared (bool)              : if true, the window is multiplied by itself

    Returns:
    ----------
        torch.FloatTensor           : window of size `window_length`

    Raises:
    ----------
        ValueError                  : if `name` is not a supported window
    """
    factories = {
        "hann": torch.hann_window,
        "hamming": torch.hamming_window,
        "blackman": torch.blackman_window,
    }
    factory = factories.get(name)
    if factory is None:
        raise ValueError("Invalid window name {}".format(name))

    window = factory(window_length)
    return window * window if squared else window
Beispiel #11
0
def sinc_impulse_response(cutoff_frequency, window_size=512, sample_rate=None):
    """Get a sinc impulse response for a set of low-pass cutoff frequencies.

    Args:
        cutoff_frequency: Frequency cutoff for low-pass sinc filter. If the
            sample_rate is given, cutoff_frequency is in Hertz. If sample_rate
            is None, cutoff_frequency is normalized ratio (frequency/nyquist)
            in the range [0, 1.0]. Shape [batch_size, n_time, 1]. The argument
            is never modified.
        window_size: Size of the Hamming window to apply to the impulse.
        sample_rate: Optionally provide the sample rate.

    Returns:
        impulse_response: A series of impulse responses, each normalised to
            unit sum. Shape [batch_size, n_time, (window_size // 2) * 2 + 1].
    """
    if sample_rate is not None:
        # Out-of-place scaling: ``*=`` would overwrite the caller's tensor.
        cutoff_frequency = cutoff_frequency * (2 / sample_rate)
    half_size = window_size // 2
    full_size = half_size * 2 + 1

    # Create the helper tensors next to the input so a GPU input works too.
    device = getattr(cutoff_frequency, "device", None)
    idx = th.arange(-half_size, half_size + 1,
                    dtype=th.float, device=device)[None, None, :]

    impulse_response = sinc(cutoff_frequency * idx)
    window = th.hamming_window(full_size,
                               device=device).expand_as(impulse_response)
    impulse_response = window * th.real(impulse_response)
    return impulse_response / impulse_response.sum(-1, keepdim=True)
Beispiel #12
0
    def __getitem__(self, idx):
        """Load one noisy/clean pair and derive spectral features from it.

        Returns:
            When ``self.test`` is falsy:
                ``(x_lps, x_ms, y_ms, noise_ms, VAD)``.
            When ``self.test`` is truthy:
                ``(noisy waveform, clean waveform, x_stft, y_stft, x_lps,
                x_ms, y_ms, VAD)`` with both waveforms flattened.

        NOTE(review): the code sums ``pow(2)`` over the last axis of the
        ``torch.stft`` output, so it presumably relies on the real-valued
        (real, imag) layout of older PyTorch releases -- confirm against the
        PyTorch/torchaudio versions this project pins.
        """
        noisy_path = self.noisy_WAVs[idx]
        clean_path = self.clean_dir.joinpath(noisy_path.name.split('+')[0] + '.wav')  # get the filename of the clean WAV from the filename of the noisy WAV
        # Retry until both files load. There is no retry limit, so a file
        # that keeps raising RuntimeError/OSError makes this loop spin forever.
        while True:
            try:
                clean_waveform, _ = torchaudio.load(clean_path, normalization=2**15)
                noisy_waveform, _ = torchaudio.load(noisy_path, normalization=2**15)
            except (RuntimeError, OSError):
                continue
            break

        assert clean_waveform.shape[0] == 1 and noisy_waveform.shape[0] == 1, 'WAV file is not single channel!'

        # Same Hamming window and a hop of n_fft // 4 for both signals
        window = torch.hamming_window(self.n_fft)
        x_stft = torch.stft(noisy_waveform.view(-1), n_fft=self.n_fft, hop_length=self.n_fft // 4, win_length=self.n_fft, window=window)
        y_stft = torch.stft(clean_waveform.view(-1), n_fft=self.n_fft, hop_length=self.n_fft // 4, win_length=self.n_fft, window=window)

        # Noisy power spectrum (sum of squares over the last axis) and its
        # LogTransform
        x_ps = x_stft.pow(2).sum(-1)
        x_lps = LogTransform()(x_ps)

        # Magnitude spectra of the noisy and the clean signal
        x_ms = x_ps.sqrt()
        y_ms = y_stft.pow(2).sum(-1).sqrt()

        # Magnitude of the difference between noisy and clean STFT
        noise_ms = (x_stft - y_stft).pow(2).sum(-1).sqrt()

        # VAD: a frame is active when its smoothed mean energy over the
        # rows selected by self.VAD_frequencies exceeds 1/1000 of the peak
        y_ms_filtered = y_ms[self.VAD_frequencies]
        y_energy_filtered = y_ms_filtered.pow(2).mean(dim=0)
        y_energy_filtered_averaged = self.__moving_average(y_energy_filtered)
        y_peak_energy = y_energy_filtered_averaged.max()
        VAD = torch.where(y_energy_filtered_averaged > y_peak_energy / 1000, torch.ones_like(y_energy_filtered), torch.zeros_like(y_energy_filtered))
        VAD = VAD.bool()

        # mean normalization: each frame is standardised with exponentially
        # weighted running estimates of the mean and of the second moment
        frames = []
        x_lps = x_lps.transpose(0, 1)  # (time, frequency)
        n_init_frames = self.n_init_frames
        alpha_feat_init = self.alpha_feat_init
        alpha_feat = self.alpha_feat
        for frame_counter, frame_feature in enumerate(x_lps):
            # the first n_init_frames frames use their own smoothing factor
            if frame_counter < n_init_frames:
                alpha = alpha_feat_init
            else:
                alpha = alpha_feat
            # the running statistics start from the first frame itself
            if frame_counter == 0:
                mu = frame_feature
                sigmasquare = frame_feature.pow(2)
            mu = alpha * mu + (1 - alpha) * frame_feature
            sigmasquare = alpha * sigmasquare + (1 - alpha) * frame_feature.pow(2)
            sigma = torch.sqrt(torch.clamp(sigmasquare - mu.pow(2), min=1e-12))  # limit for sqrt
            norm_feature = (frame_feature - mu) / sigma
            frames.append(norm_feature)

        x_lps = torch.stack(frames, dim=0).transpose(0, 1)   # (frequency, time)

        if not self.test:
            return x_lps, x_ms, y_ms, noise_ms, VAD
        if self.test:
            return noisy_waveform.view(-1), clean_waveform.view(-1), x_stft, y_stft, x_lps, x_ms, y_ms, VAD
Beispiel #13
0
 def __init__(self, win_length, hop_length=None, n_fft=None):
     """Store the STFT settings and pre-compute the Hamming window.

     Args:
         win_length: window length in samples.
         hop_length: hop between frames; defaults to ``win_length // 4``.
         n_fft: FFT size; defaults to ``win_length``.
     """
     super().__init__()
     self.window = torch.hamming_window(win_length)
     self.hop_length = win_length // 4 if hop_length is None else hop_length
     self.n_fft = win_length if n_fft is None else n_fft
Beispiel #14
0
 def _get_feature(self, signal: np.ndarray) -> np.ndarray:
     spectrogram = torch.stft(
         Tensor(signal), self.n_fft, hop_length=self.hop_length,
         win_length=self.n_fft, window=torch.hamming_window(self.n_fft),
         center=False, normalized=False, onesided=True
     )
     spectrogram = (spectrogram[:, :, 0].pow(2) + spectrogram[:, :, 1].pow(2)).pow(0.5)
     spectrogram = np.log1p(spectrogram.numpy())
     return spectrogram
Beispiel #15
0
def get_torch_spectrogram(filepath, sr=16000, window_size=20, stride=10):
    r"""
    Compute a mean-normalised log-magnitude spectrogram with torch.

    Args:
        filepath (str): path of a ``.pcm`` (raw 16-bit samples) or ``.wav`` file
        sr (int): sample rate
        window_size (int): window size (ms)
        stride (int): forwarding size (ms)

    Returns: spectrogram
        - **spectrogram** (torch.Tensor): ``log1p(|STFT|)`` minus its global
          mean, shape (frames, n_fft // 2 + 1); ``None`` when reading a
          ``.pcm`` file raises ``RuntimeError``

    Raises:
        ValueError: if the file extension is neither ``.pcm`` nor ``.wav``

    Examples::
        >>> spectrogram = get_torch_spectrogram(filepath)
    """
    if filepath.endswith('.pcm'):
        try:
            pcm = np.memmap(filepath, dtype='h', mode='r')
        except RuntimeError:
            logger.info('RuntimeError in {0}'.format(filepath))
            return None

        # One vectorised copy out of the memory map instead of a Python loop
        # over every sample; int16 values are exactly representable in float32.
        signal = np.array(pcm, dtype=np.float32)

    elif filepath.endswith('.wav'):
        signal, _ = librosa.core.load(filepath, sr=sr)

    else:
        raise ValueError("Unsupported format: {0}".format(
            filepath.split('.')[-1]))

    n_fft = int(sr * 0.001 * window_size)
    hop_length = int(sr * 0.001 * stride)

    # return_complex=True is required for real inputs on current PyTorch
    spectrum = torch.stft(torch.FloatTensor(signal),
                          n_fft,
                          hop_length=hop_length,
                          win_length=n_fft,
                          window=torch.hamming_window(n_fft),
                          center=False,
                          normalized=False,
                          onesided=True,
                          return_complex=True)

    # (n_fft // 2 + 1, frames) -> (frames, n_fft // 2 + 1)
    spectrogram = torch.log1p(spectrum.abs()).transpose(0, 1)
    return spectrogram - spectrogram.mean()
def get_window(window_size, window_type, square_root_window=True):
    """Return a 'hamming' or 'hanning' window of ``window_size`` samples.

    The element-wise square root of the window is returned when
    ``square_root_window`` is true. An unknown ``window_type`` raises
    ``KeyError``.
    """
    factory = {
        'hamming': torch.hamming_window,
        'hanning': torch.hann_window,
    }[window_type]
    window = factory(window_size)
    return torch.sqrt(window) if square_root_window else window
def build_window(fft_size, window_fn='hann'):
    """Return a periodic 'hann' or 'hamming' window of ``fft_size`` samples.

    Raises:
        ValueError: if ``window_fn`` is neither 'hann' nor 'hamming'.
    """
    if window_fn not in ('hann', 'hamming'):
        raise ValueError("Not support {} window.".format(window_fn))

    make_window = (torch.hann_window
                   if window_fn == 'hann' else torch.hamming_window)
    return make_window(fft_size, periodic=True)
    def __call__(self, signal):
        """Enhance ``signal`` by applying a model-predicted spectral mask.

        The STFT magnitude of the input is turned into normalised log-power
        features, ``self.model`` predicts a mask from them, and the masked
        magnitude is recombined with the unmodified noisy phase and inverted
        with ``torch.istft``. Everything runs under ``torch.no_grad()``.

        Args:
            signal: samples accepted by ``torch.Tensor(...)``; a leading
                batch axis of size 1 is added here.

        Returns:
            The tensor returned by ``torch.istft``.

        NOTE(review): ``torchaudio.functional.magphase`` and the stft/istft
        calls without complex tensors presumably target an older
        PyTorch/torchaudio release -- confirm against the pinned versions.
        """
        with torch.no_grad():
            torch_signal = torch.Tensor(signal).unsqueeze(0)
            wave = torch_signal.to(self.device)
            # magnitude and phase of the noisy STFT
            stft_noisy_mag, stft_noisy_phase = torchaudio.functional.magphase(
                torch.stft(wave,
                           n_fft=self.nfft,
                           hop_length=self.hop_len,
                           win_length=self.window_len,
                           window=torch.hamming_window(self.window_len).to(
                               self.device)))
            # axes 1 and 2 are swapped and the copy is moved to numpy
            stft_input_mag = torch.transpose(stft_noisy_mag.clone(), 2,
                                             1).cpu().numpy()

            # log-power of the first batch item; the boolean mask is computed
            # before either write, so values below eps become log(eps) and
            # the remaining values become their own log
            stft_input_power = stft_input_mag[0]**2
            smallpower = stft_input_power < self.eps
            stft_input_power[smallpower] = np.log(self.eps)
            stft_input_power[~smallpower] = np.log(
                stft_input_power[~smallpower])
            stft_input_power = self.norm_function(stft_input_power)

            stft_input_power = torch.FloatTensor(stft_input_power).to(
                self.device)
            # the model output is transposed back before it scales the
            # noisy magnitude
            mask = self.model(stft_input_power.unsqueeze(0))
            enhanced_mag = torch.transpose(mask, 2, 1) * stft_noisy_mag

            # Rebuild real/imaginary parts from the enhanced magnitude and
            # the noisy phase. torch.zeros is called without a device
            # argument, as is the istft window below.
            complex_enhanced = torch.zeros(
                (enhanced_mag.shape[0], enhanced_mag.shape[1],
                 enhanced_mag.shape[2], 2))
            complex_enhanced[:, :, :,
                             0] = enhanced_mag * torch.cos(stft_noisy_phase)
            complex_enhanced[:, :, :,
                             1] = enhanced_mag * torch.sin(stft_noisy_phase)

            enhanced_sig_recon = torch.istft(complex_enhanced,
                                             n_fft=self.nfft,
                                             hop_length=self.hop_len,
                                             win_length=self.window_len,
                                             window=torch.hamming_window(
                                                 self.window_len))

        return enhanced_sig_recon
Beispiel #19
0
def get_spectrogram_feature(cfg_data, filepath, train_mode=False):
    """Load a WAV file and return spectrogram features.

    Args:
        cfg_data: mapping with the keys ``"use_mel_scale"``,
            ``"spec_augment"`` (a mapping with ``"use"``) and
            ``"trim_silence"`` (a mapping with ``"use"``; the whole mapping
            is passed to ``trim.trim``).
        filepath: path of the WAV file, read with ``wavio.readwav``.
        train_mode: spec-augment is only applied when this is true.

    Returns:
        Tensor whose first axis is the second axis of the intermediate
        spectrogram: (frames, N_FFT // 2 + 1) for the baseline feature.
    """
    use_mel_scale = cfg_data["use_mel_scale"]
    cfg_spec_augment = cfg_data["spec_augment"]
    use_specaug = cfg_spec_augment["use"]
    cfg_trim = cfg_data["trim_silence"]
    use_trim = cfg_trim["use"]

    (_rate, _width, sig) = wavio.readwav(filepath)
    sig = sig.ravel()
    if use_trim:
        sig = trim.trim(sig, cfg_trim)

    win_length = int(0.030 * SAMPLE_RATE)
    # return_complex=True is required for real inputs on current PyTorch;
    # abs() gives the same magnitude as sqrt(re^2 + im^2).
    stft = torch.stft(torch.FloatTensor(sig),
                      N_FFT,
                      hop_length=int(0.01 * SAMPLE_RATE),
                      win_length=win_length,
                      window=torch.hamming_window(win_length),
                      center=False,
                      normalized=False,
                      onesided=True,
                      return_complex=True).abs()  # (freq, frames)

    if not use_mel_scale:
        # use baseline feature
        return stft.transpose(0, 1)

    # Add a leading axis of size 1: (1, freq, frames).
    # NOTE(review): the original comment called this [batch, time,
    # frequency]; the layout mel_scale expects is not visible here -- verify.
    amag = stft.clone().detach()
    amag = amag.view(-1, amag.shape[0], amag.shape[1])
    mel = melscale_pytorch.mel_scale(amag,
                                     sample_rate=SAMPLE_RATE,
                                     n_mels=N_FFT // 2 +
                                     1)  # melspec with same shape
    if use_specaug and train_mode:
        specaug_prob = 1  # augment probability
        if numpy.random.uniform(0, 1) < specaug_prob:
            # apply augment
            mel = spec_augment_pytorch.spec_augment(
                mel,
                time_warping_para=80,
                frequency_masking_para=54,
                time_masking_para=40,
                frequency_mask_num=1,
                time_mask_num=1)

    # drop the leading axis again, then swap the two remaining axes
    feat = mel.view(mel.shape[1], mel.shape[2])
    return feat.transpose(0, 1).clone().detach()
Beispiel #20
0
def periodogram(signal,
                window_fn=torch.hann_window,
                is_train=False):  # not used
    """Return the squared magnitude of the one-sided DFT over the last axis.

    Args:
        signal: real tensor; the transform runs over its last axis.
        window_fn: when not ``None`` the signal is multiplied by a symmetric
            Hamming window first.
            NOTE(review): the callable itself is never invoked -- a Hamming
            window is always used, even though the default names Hann.
            Kept as-is to preserve existing results; confirm the intent.
        is_train: when false, the DFT is computed under ``torch.no_grad()``.

    Returns:
        Tensor of shape ``signal.shape[:-1] + (signal.size(-1) // 2 + 1,)``.
    """
    if window_fn is not None:
        signal = signal * torch.hamming_window(window_length=signal.size(-1),
                                               periodic=False,
                                               device=signal.device)
    # Previously ``dft`` was only assigned in the no-grad branch, so
    # is_train=True failed with an unbound local. torch.fft.rfft replaces
    # the removed torch.rfft.
    if is_train:
        dft = torch.fft.rfft(signal, dim=-1)
    else:
        with torch.no_grad():
            dft = torch.fft.rfft(signal, dim=-1)
    return dft.real.pow(2) + dft.imag.pow(2)
    def pre_process(self, data, data_length):
        """Turn raw waveforms into mean-normalised log filter-bank features.

        Args:
            data: waveform batch, indexed as ``data[:, t]`` (batch, samples).
            data_length: per-utterance sample counts, or ``None`` to report
                the full number of frames for every utterance.

        Returns:
            ``(filter_banks, ilens)``: the features with axes
            (batch, frames, filters) and a long tensor of frame counts.
        """
        # ToDo - write the code for generating pitch features

        fbank = self.fbank[data.get_device()]

        pre_emphasis = config.fbank['pre_emphasis']
        frame_size = config.fbank['frame_size']
        frame_stride = config.fbank['frame_stride']
        n_fft = config.fbank['n_fft']
        rate = config.fbank['rate']

        emphasized_data = torch.zeros_like(data).float()

        if config.use_cuda:
            emphasized_data = emphasized_data.to(data.device)

        # first-order pre-emphasis; the first sample is copied unchanged
        emphasized_data[:, 1:] = data[:, 1:] - pre_emphasis * data[:, :-1]
        emphasized_data[:, 0] = data[:, 0]

        frame_length, frame_step = frame_size * rate, frame_stride * rate  # Convert from seconds to samples
        frame_length = int(frame_length)
        frame_step = int(frame_step)

        # return_complex=True is required for real inputs on current PyTorch;
        # abs() equals the norm over the old trailing (real, imag) axis.
        mag_frames = torch.stft(
            emphasized_data,
            n_fft=n_fft,
            hop_length=frame_step,
            win_length=frame_length,
            window=torch.hamming_window(frame_length).to(
                emphasized_data.device),
            pad_mode='constant',
            return_complex=True).abs().transpose(2, 1)

        pow_frames = ((1.0 / n_fft) * (mag_frames**2))  # Power Spectrum

        filter_banks = torch.matmul(pow_frames, fbank.transpose(1, 0))
        # avoid log10(0) by replacing exact zeros with a tiny positive value
        filter_banks[filter_banks == 0] = 2.220446049250313e-16
        filter_banks = 20 * torch.log10(filter_banks)  # dB
        filter_banks -= (torch.mean(filter_banks, dim=(0, 1), keepdim=True) +
                         1e-8)

        if data_length is None:
            ilens = (torch.ones([filter_banks.shape[0]]) *
                     filter_banks.shape[1]).long()
        else:
            ilens = torch.FloatTensor([
                data_length_i // frame_step + 1
                for data_length_i in data_length
            ]).long()

        # for filter_banks.shape[0]

        return filter_banks, ilens
Beispiel #22
0
 def test_linearity_of_istft4(self):
     # hamming_window, not centered, not normalized, onesided
     kwargs4 = {
         'n_fft': 12,
         'window': torch.hamming_window(12),
         'center': False,
         'pad_mode': 'constant',
         'normalized': False,
         'onesided': True,
     }
     data_size = (2, 7, 3, 2)
     self._test_linearity_of_istft(data_size, kwargs4, atol=1e-5, rtol=1e-8)
Beispiel #23
0
 def test_linearity_of_istft3(self):
     # hamming_window, centered, normalized, not onesided
     kwargs3 = {
         'n_fft': 12,
         'window': torch.hamming_window(12),
         'center': True,
         'pad_mode': 'constant',
         'normalized': True,
         'onesided': False,
     }
     data_size = (2, 12, 7, 2)
     self._test_linearity_of_istft(data_size, kwargs3)
Beispiel #24
0
 def spectral_ops(self):
     """Call the spectral operators and window constructors once each.

     Returns:
         7-tuple: stft result, istft result, then the bartlett, blackman,
         hamming, hann and kaiser windows.
     """
     signal = torch.randn(10)
     spectrum = torch.randn(10, 8, 4, 2)
     stft_out = torch.stft(signal, 8)
     istft_out = torch.istft(spectrum, 8)
     bartlett = torch.bartlett_window(2, dtype=torch.float)
     blackman = torch.blackman_window(2, dtype=torch.float)
     hamming = torch.hamming_window(4, dtype=torch.float)
     hann = torch.hann_window(4, dtype=torch.float)
     kaiser = torch.kaiser_window(4, dtype=torch.float)
     return (stft_out, istft_out, bartlett, blackman, hamming, hann, kaiser)
Beispiel #25
0
 def test_istft_is_inverse_of_stft3(self):
     """Round-trip stft/istft check; window shorter than the FFT size."""
     # hamming_window, centered, normalized, not onesided
     win_length = 11
     stft_kwargs = dict(
         n_fft=15,
         hop_length=3,
         win_length=win_length,
         window=torch.hamming_window(win_length),
         center=True,
         pad_mode='constant',
         normalized=True,
         onesided=False,
     )
     _test_istft_is_inverse_of_stft(stft_kwargs)
Beispiel #26
0
def smooth_upsample2(x, size):
    """Upsample per-frame amplitudes by overlap-adding scaled Hamming windows.

    Args:
        x: tensor of shape (batch, channels, frames).
        size: target length; the hop is ``size // frames``.

    Returns:
        The ``overlap_add`` result with its last ``hop`` samples removed.
    """
    batch, channels, frames = x.shape
    hop_size = size // frames
    window_size = 2 * hop_size

    window = torch.hamming_window(window_size, periodic=True).to(x.device)

    # one copy of the window per (channel, frame), scaled by that amplitude
    flat_amps = x.view(batch, channels * frames).unsqueeze(-1)
    scaled_windows = flat_amps * window.view(1, 1, -1)
    scaled_windows = scaled_windows.view(batch, channels, frames, window_size)

    summed = overlap_add(scaled_windows, apply_window=False)
    return summed[:, :, :-hop_size]
Beispiel #27
0
 def __init__(self, cutoffs: list, width: int = None):
     """Pre-compute one Hamming-windowed sinc filter per cutoff.

     Args:
         cutoffs: cutoff values; each produces one filter.
         width: half-width of the filters in samples; defaults to
             ``int(2 / min(cutoffs))``.

     The filters are registered as the buffer ``"filters"`` with shape
     (len(cutoffs), 1, 2 * width + 1).
     """
     super().__init__()
     self.cutoffs = cutoffs
     self.width = int(2 / min(cutoffs)) if width is None else width
     width = self.width

     taps = np.arange(-width, width + 1, dtype=np.float32)
     window = torch.hamming_window(2 * width + 1, periodic=False)
     filters = [
         2 * cutoff * torch.from_numpy(np.sinc(2 * cutoff * taps)) * window
         for cutoff in cutoffs
     ]
     self.register_buffer("filters", torch.stack(filters).unsqueeze(1))
Beispiel #28
0
def get_spectrogram_feature(filepath, train_mode=False):
    """Load a WAV file, build mel features and plot them before/after augment.

    Side effects: re-writes the loaded audio to ``test.wav`` and opens a
    matplotlib figure with two panels (blocking on ``plt.show()``).

    Args:
        filepath: path of the WAV file, read with ``wavio.readwav``.
        train_mode: spec-augment is only applied when this is true.

    Returns:
        The mel output with its leading axis removed and the two remaining
        axes swapped.
    """
    (rate, _width, sig) = wavio.readwav(filepath)
    wavio.writewav24("test.wav", rate=rate, data=sig)
    sig = sig.ravel()
    sig = trim(sig)

    win_length = int(0.030 * SAMPLE_RATE)
    # return_complex=True is required for real inputs on current PyTorch;
    # abs() gives the same magnitude as sqrt(re^2 + im^2).
    stft = torch.stft(torch.FloatTensor(sig),
                      N_FFT,
                      hop_length=int(0.01 * SAMPLE_RATE),
                      win_length=win_length,
                      window=torch.hamming_window(win_length),
                      center=False,
                      normalized=False,
                      onesided=True,
                      return_complex=True).abs()  # (freq, frames)

    # Add a leading axis of size 1: (1, freq, frames).
    # NOTE(review): the original comment called this [batch, time,
    # frequency]; the layout mel_scale expects is not visible here -- verify.
    amag = stft.clone().detach()
    amag = amag.view(-1, amag.shape[0], amag.shape[1])
    mel = melscale_pytorch.mel_scale(amag,
                                     sample_rate=SAMPLE_RATE,
                                     n_mels=N_FFT // 2 +
                                     1)  # melspec with same shape

    plt.subplot(1, 2, 1)
    plt.imshow(mel.transpose(1, 2).squeeze(), cmap='jet')

    p = 1  # always augment
    randp = np.random.uniform(0, 1)
    do_aug = p > randp
    if do_aug and train_mode:  # apply augment
        print("augment image")
        mel = spec_augment_pytorch.spec_augment(mel,
                                                time_warping_para=80,
                                                frequency_masking_para=54,
                                                time_masking_para=50,
                                                frequency_mask_num=1,
                                                time_mask_num=1)

    # drop the leading axis again, then swap the two remaining axes
    feat = mel.view(mel.shape[1], mel.shape[2])
    feat = feat.transpose(0, 1).clone().detach()

    plt.subplot(1, 2, 2)
    plt.imshow(feat, cmap='jet')
    plt.show()  # display it

    return feat
Beispiel #29
0
    def forward(self, x):
        """Compute LFCC (or linear filter-bank) features from waveforms.

        input:
        ------
         x: tensor(batch, length), where length is waveform length.
            The tensor is not modified.

        output:
        -------
         lfcc_output: tensor(batch, frame_num, dim_num)
        """
        # pre-emphasis on a copy: assigning into ``x`` directly would
        # overwrite the caller's waveform every time forward() is called
        if self.with_emphasis:
            x = x.clone()
            x[:, 1:] = x[:, 1:] - 0.97 * x[:, 0:-1]

        # STFT -- return_complex=True is required on current PyTorch
        x_stft = torch.stft(x,
                            self.fn,
                            self.fs,
                            self.fl,
                            window=torch.hamming_window(self.fl).to(x.device),
                            onesided=True,
                            pad_mode="constant",
                            return_complex=True)
        # squared amplitude, arranged as (batch, frame_num, freq_bins)
        sp_amp = x_stft.abs().pow(2).permute(0, 2, 1).contiguous()

        # filter bank
        fb_feature = torch.log10(
            torch.matmul(sp_amp, self.lfcc_fb) +
            torch.finfo(torch.float32).eps)

        # DCT (if necessary, remove DCT)
        lfcc = self.l_dct(fb_feature) if not self.flag_for_LFB else fb_feature

        # Add energy: the first coefficient is replaced by the log energy
        if self.with_energy:
            power_spec = sp_amp / self.fn
            energy = torch.log10(
                power_spec.sum(axis=2) + torch.finfo(torch.float32).eps)
            lfcc[:, :, 0] = energy

        # Add delta coefficients
        if self.with_delta:
            lfcc_delta = delta(lfcc)
            lfcc_delta_delta = delta(lfcc_delta)
            lfcc_output = torch.cat((lfcc, lfcc_delta, lfcc_delta_delta), 2)
        else:
            lfcc_output = lfcc

        # done
        return lfcc_output
Beispiel #30
0
 def test_istft_is_inverse_of_stft5(self):
     """Round-trip stft/istft check; window as long as the FFT size."""
     # hamming_window, not centered, not normalized, not onesided
     # window same size as n_fft
     n_fft = 3
     stft_kwargs = dict(
         n_fft=n_fft,
         hop_length=2,
         win_length=n_fft,
         window=torch.hamming_window(n_fft),
         center=False,
         pad_mode='reflect',
         normalized=False,
         onesided=False,
     )
     _test_istft_is_inverse_of_stft(stft_kwargs)
Beispiel #31
0
    def __init__(self, scale, taps, samplerate):
        """Set up the fixed tensors and the trainable band parameters.

        Args:
            scale: sized iterable of bands exposing ``start_hz`` and
                ``stop_hz``.
            taps: number of filter taps.
            samplerate: sample rate; stored as an int.
        """
        super(SincLayer, self).__init__()
        self.samplerate = int(samplerate)
        self.taps = taps
        self.scale = scale

        # fixed tensors, stored as parameters with gradients disabled
        self.linear = nn.Parameter(
            torch.linspace(-math.pi, math.pi, steps=taps), requires_grad=False)
        self.window = nn.Parameter(
            torch.hamming_window(self.taps), requires_grad=False)

        # each filter requires two parameters to define the filter bandwidth:
        # samplerate / start_hz and samplerate / stop_hz
        band_edges = torch.FloatTensor(len(scale), 2)
        for row, band in zip(band_edges, scale):
            row[0] = self.samplerate / band.start_hz
            row[1] = self.samplerate / band.stop_hz

        self.filter_parameters = nn.Parameter(band_edges)