Example #1
    def __init__(self,
                 filter_length: int = 800,
                 hop_length: int = 200,
                 win_length: int = 800,
                 window: str = "hann"):
        super(STFT, self).__init__()
        self.filter_length = filter_length
        self.hop_length = hop_length
        self.win_length = win_length
        self.window = window
        # placeholder for a cached forward transform
        self.forward_transform = None
        scale = self.filter_length / self.hop_length
        fourier_basis = np.fft.fft(np.eye(self.filter_length))

        cutoff = int((self.filter_length / 2 + 1))
        fourier_basis = np.vstack([
            np.real(fourier_basis[:cutoff, :]),
            np.imag(fourier_basis[:cutoff, :])
        ])
        forward_basis = torch.FloatTensor(fourier_basis[:, None, :])
        inverse_basis = torch.FloatTensor(np.linalg.pinv(scale * fourier_basis).T[:, None, :])

        if window is not None:
            assert (filter_length >= win_length)
            fft_window = get_window(window, win_length, fftbins=True)
            fft_window = pad_center(fft_window, filter_length)
            fft_window = torch.from_numpy(fft_window).to(torch.float32)

            forward_basis *= fft_window
            inverse_basis *= fft_window

        self.register_buffer("forward_basis", forward_basis)
        self.register_buffer("inverse_basis", inverse_basis)
Example #2
    def __init__(self, filter_length, hop_length, win_length, window='hann'):
        super(STFT, self).__init__()
        self.filter_length = filter_length
        self.hop_length = hop_length
        self.win_length = win_length
        self.window = window
        self.forward_transform = None
        scale = self.filter_length / self.hop_length
        fourier_basis = np.fft.fft(np.eye(self.filter_length))

        cutoff = int((self.filter_length / 2 + 1))
        fourier_basis = np.vstack([
            np.real(fourier_basis[:cutoff, :]),
            np.imag(fourier_basis[:cutoff, :])
        ])

        forward_basis = torch.FloatTensor(fourier_basis[:, None, :])
        inverse_basis = torch.FloatTensor(
            np.linalg.pinv(scale * fourier_basis).T[:, None, :])

        if window is not None:
            assert (filter_length >= win_length)
            # get window and zero center pad it to filter_length
            fft_window = get_window(window, win_length, fftbins=True)
            fft_window = pad_center(fft_window, filter_length)
            fft_window = torch.from_numpy(fft_window).float()

            # window the bases
            forward_basis *= fft_window
            inverse_basis *= fft_window

        self.register_buffer('forward_basis', forward_basis.float())
        self.register_buffer('inverse_basis', inverse_basis.float())
Example #3
def process_wav_length(wav_data, filenames, df, conversion="zero",
                       seconds=0.25, sr=44100, resample_size=0.125, 
                       testing=False):
    """Process all audios to have the same length.

    conversion:
        zero - We ignore all audios with a bigger size and 0-pad the ones that have less
        repeat - we repeat the audio as many times as necessary to fill the vector
        resample - we resample the audio to a given size
    """
    amount_samples = int(seconds*sr)
    sizes = np.vectorize(len)(wav_data)

    idx = sizes <= amount_samples
    processed_wav_data = wav_data[idx]

    if conversion == "zero":
        new_wavs = np.asarray([pad_center(a, amount_samples)
                              for a in processed_wav_data])
    elif conversion == "repeat":
        new_wavs = np.asarray([np.resize(a, amount_samples)
                               for a in processed_wav_data])
    elif conversion == "rescale":
        raise ValueError("TO-DO")

    new_df = pd.DataFrame(new_wavs)
    new_df["file"] = filenames.loc[idx, "file"].values
    new_df["original_name"] = filenames.loc[idx, "original_name"].values

    full_df = pd.merge(df, new_df, left_on="file", right_on="file",
                       validate="1:1", how="right")

    return full_df
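A hedged usage sketch with toy inputs; every name and value below is made up for illustration, and it assumes the pad_center import used above accepts a positional size, as in librosa < 0.10.

import numpy as np
import pandas as pd

sr = 100                                    # tiny sample rate so the target is 25 samples
clips = np.empty(3, dtype=object)
clips[0] = np.random.randn(10)
clips[1] = np.random.randn(25)
clips[2] = np.random.randn(40)              # longer than 0.25 s * 100 Hz, so it is dropped

filenames = pd.DataFrame({"file": ["a.wav", "b.wav", "c.wav"],
                          "original_name": ["a", "b", "c"]})
df = pd.DataFrame({"file": ["a.wav", "b.wav", "c.wav"], "label": [0, 1, 0]})

full_df = process_wav_length(clips, filenames, df,
                             conversion="zero", seconds=0.25, sr=sr)
print(full_df.shape)                        # two rows: the long clip was filtered out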
Example #4
    def __init__(self,
                 filter_length=800,
                 hop_length=200,
                 win_length=800,
                 window='hann'):
        super(STFT, self).__init__()
        self.filter_length = filter_length
        self.hop_length = hop_length
        self.win_length = win_length
        self.window = window
        self.forward_transform = None
        scale = self.filter_length / self.hop_length
        fourier_basis = np.fft.fft(np.eye(self.filter_length))

        cutoff = int((self.filter_length / 2 + 1))
        fourier_basis = np.vstack([
            np.real(fourier_basis[:cutoff, :]),
            np.imag(fourier_basis[:cutoff, :])
        ])

        self.forward_basis = nd.array(fourier_basis[:, np.newaxis, :])
        self.inverse_basis = nd.array(
            np.linalg.pinv(scale * fourier_basis).T[:, np.newaxis, :])

        if window is not None:
            assert (filter_length >= win_length)
            # get window and zero center pad it to filter_length
            fft_window = get_window(window, win_length, fftbins=True)
            fft_window = pad_center(fft_window, filter_length)
            fft_window = nd.array(fft_window)

            # window the bases
            self.forward_basis *= fft_window
            self.inverse_basis *= fft_window
Example #5
def window_sumsquare(window,
                     n_frames,
                     hop_length=200,
                     win_length=800,
                     n_fft=800,
                     dtype=np.float32,
                     norm=None):

    if win_length is None:
        win_length = n_fft

    n = n_fft + hop_length * (n_frames - 1)
    x = np.zeros(n, dtype=dtype)

    # Compute the squared window at the desired length
    win_sq = get_window(window, win_length, fftbins=True)
    win_sq = librosa_util.normalize(win_sq, norm=norm)**2
    win_sq = librosa_util.pad_center(win_sq, n_fft)

    # Fill the envelope
    for i in range(n_frames):
        sample = i * hop_length
        x[sample:min(n, sample +
                     n_fft)] += win_sq[:max(0, min(n_fft, n - sample))]
    return x
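A quick check of the envelope this produces (this usage example is mine, not part of the source project, and assumes the module-level imports used by window_sumsquare above are available). With a periodic hann window and hop_length = win_length / 4, the interior of the envelope is a constant 1.5, which is what makes the later division in an ISTFT well behaved.

env = window_sumsquare('hann', n_frames=40, hop_length=200,
                       win_length=800, n_fft=800)
print(env.shape)                                   # (800 + 200 * 39,) == (8600,)
print(env[800:-800].min(), env[800:-800].max())    # both approximately 1.5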
Example #6
    def __init__(self, filter_length: int = 1024, hop_length: int = 512, win_length: int = None, window: str = 'hann'):
        super().__init__()
        self.filter_length = filter_length
        self.hop_length = hop_length
        self.win_length = win_length if win_length else filter_length
        self.window = window
        self.pad_amount = self.filter_length // 2

        # make fft window
        assert (filter_length >= self.win_length)
        # get window and zero center pad it to filter_length
        fft_window = get_window(window, self.win_length, fftbins=True)
        fft_window = pad_center(fft_window, filter_length)
        fft_window = torch.from_numpy(fft_window).float()

        # calculate the Fourier basis
        cut_off = int((self.filter_length / 2 + 1))
        fourier_basis = np.fft.fft(np.eye(self.filter_length))
        fourier_basis = np.vstack([
            np.real(fourier_basis[:cut_off, :]),
            np.imag(fourier_basis[:cut_off, :])
        ])

        # make forward & inverse basis
        self.register_buffer('square_window', fft_window ** 2)

        forward_basis = torch.FloatTensor(fourier_basis[:, np.newaxis, :]) * fft_window
        inverse_basis = torch.FloatTensor(
            np.linalg.pinv(self.filter_length / self.hop_length * fourier_basis).T[:, np.newaxis, :]
        ) * fft_window
        # torch.pinverse has a bug, so for now the pseudo-inverse is computed with np.linalg.pinv instead.
        self.register_buffer('forward_basis', forward_basis)
        self.register_buffer('inverse_basis', inverse_basis)
Example #7
def window_sumsquare(window,
                     n_frames,
                     hop_length=200,
                     win_length=800,
                     n_fft=800,
                     dtype=np.float32,
                     norm=None):
    # The window spans 800 samples in total; n_frames is the number of analysis frames.
    """
    # from librosa 0.6
    Compute the sum-square envelope of a window function at a given hop length.

    This is used to estimate modulation effects induced by windowing
    observations in short-time fourier transforms.

    Parameters
    ----------
    window : string, tuple, number, callable, or list-like
        Window specification, as in `get_window`

    n_frames : int > 0
        The number of analysis frames

    hop_length : int > 0
        The number of samples to advance between frames

    win_length : [optional]
        The length of the window function.  By default, this matches `n_fft`.

    n_fft : int > 0
        The length of each analysis frame.

    dtype : np.dtype
        The data type of the output

    Returns
    -------
    wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))`
        The sum-squared envelope of the window function
    """
    if win_length is None:
        win_length = n_fft

    n = n_fft + hop_length * (n_frames - 1)  # total output length
    x = np.zeros(n, dtype=dtype)

    # Compute the squared window at the desired length
    win_sq = get_window(window, win_length, fftbins=True)  # sample the window function
    win_sq = librosa_util.normalize(win_sq, norm=norm)**2  # square it
    win_sq = librosa_util.pad_center(win_sq,
                                     n_fft)  # zero-pad to length n_fft; this only
    # has an effect when win_length is smaller than n_fft.

    # Fill the envelope: each squared window is shifted right by hop_length and accumulated, hence "sum-square".
    for i in range(n_frames):  # hop_length is the number of samples to advance between frames
        sample = i * hop_length
        x[sample:min(n, sample +
                     n_fft)] += win_sq[:max(0, min(n_fft, n - sample))]
    return x
Example #8
    def __getitem__(self, index):
        'Generates one sample of data'
        x = self.x[index]
        x = pad_center(x, self.samples)
        x = self.scaler.transform(x.reshape(1, -1))
        y = self.y[index]

        return x.reshape(1, -1), y
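The per-item pipeline above (pad a clip to a fixed length, then apply an already-fitted scaler) can be reproduced standalone; everything below is illustrative and not from the original Dataset.

import numpy as np
from librosa.util import pad_center
from sklearn.preprocessing import StandardScaler

samples = 16
scaler = StandardScaler().fit(np.random.randn(8, samples))   # stand-in for self.scaler

clip = np.random.randn(11)                      # shorter than the target length
x = pad_center(clip, size=samples)              # keyword form works on old and new librosa
x = scaler.transform(x.reshape(1, -1))          # shape (1, samples), as returned above
print(x.shape)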
Example #9
def window_sumsquare(
    window,
    n_frames,
    hop_length,
    win_length,
    n_fft,
    dtype=np.float32,
    norm=None,
):
    """
    # from librosa 0.6
    Compute the sum-square envelope of a window function at a given hop length.

    This is used to estimate modulation effects induced by windowing
    observations in short-time fourier transforms.

    Parameters
    ----------
    window : string, tuple, number, callable, or list-like
        Window specification, as in `get_window`

    n_frames : int > 0
        The number of analysis frames

    hop_length : int > 0
        The number of samples to advance between frames

    win_length : [optional]
        The length of the window function.  By default, this matches `n_fft`.

    n_fft : int > 0
        The length of each analysis frame.

    dtype : np.dtype
        The data type of the output

    Returns
    -------
    wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))`
        The sum-squared envelope of the window function
    """
    if win_length is None:
        win_length = n_fft

    n = n_fft + hop_length * (n_frames - 1)
    x = np.zeros(n, dtype=dtype)

    # Compute the squared window at the desired length
    win_sq = get_window(window, win_length, fftbins=True)
    win_sq = librosa_util.normalize(win_sq, norm=norm) ** 2
    win_sq = librosa_util.pad_center(win_sq, n_fft)

    # Fill the envelope
    for i in range(n_frames):
        sample = i * hop_length
        x[sample : min(n, sample + n_fft)] += win_sq[: max(0, min(n_fft, n - sample))]
    return x
Example #10
    def get_mfcc(self, sig_frm):
        sig_frm = sig_frm / 32768.0
        window = 'hamming'
        win_length = sig_frm.shape[0]
        hop_length = win_length
        center = True
        n_fft = win_length
        fft_window = get_window(window, win_length, fftbins=True)
        fft_window = util.pad_center(fft_window, n_fft)
        fft_window = fft_window.reshape((-1, 1))
        util.valid_audio(sig_frm)
        sig_frm = sig_frm[:, None]
        stft_matrix = np.empty((int(1 + n_fft // 2), 1),
                               dtype=np.complex64,
                               order='F')
        stft = fft.fft(fft_window * sig_frm,
                       axis=0)[:stft_matrix.shape[0]].conj()
        powspec = np.abs(stft)**2
        melspec = librosa.feature.melspectrogram(S=powspec,
                                                 hop_length=hop_length,
                                                 n_fft=n_fft,
                                                 n_mels=40)
        # librosa.logamplitude was removed in librosa 0.6; power_to_db is its replacement
        mfcc = librosa.feature.mfcc(S=librosa.power_to_db(melspec), n_mfcc=13)

        n_fft = 512
        fft_window = get_window(window, win_length, fftbins=True)
        fft_window = util.pad_center(fft_window, n_fft)
        fft_window = fft_window.reshape((-1, 1))
        y = np.pad(sig_frm[:, 0], int(n_fft // 2), mode='reflect')
        pad_frame = librosa.util.frame(y,
                                       frame_length=n_fft,
                                       hop_length=win_length * 2)[:, 0][:,
                                                                        None]
        stft_matrix = np.empty((int(1 + n_fft // 2), 1),
                               dtype=np.complex64,
                               order='F')
        stft = fft.fft(fft_window * pad_frame,
                       axis=0)[:stft_matrix.shape[0]].conj()
        powspec = np.abs(stft)**2
        power_to_db = getattr(librosa, 'power_to_db')
        spec = power_to_db(powspec)
        self.spec_tape_add(spec)
        return mfcc
Example #11
def gen_win_sq(denoiser):
    window = denoiser.stft.window
    win_length = denoiser.stft.win_length
    n_fft = denoiser.stft.filter_length

    # Compute the squared window at the desired length
    win_sq = get_window(window, win_length, fftbins=True)
    win_sq = librosa_util.normalize(win_sq, norm=None)**2
    win_sq = librosa_util.pad_center(win_sq, n_fft)

    return win_sq
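gen_win_sq only reads three attributes from denoiser.stft, so a stand-in object is enough to try it; SimpleNamespace here is purely illustrative, and the call assumes the module-level imports used above are available.

from types import SimpleNamespace

fake_denoiser = SimpleNamespace(stft=SimpleNamespace(window='hann',
                                                     win_length=800,
                                                     filter_length=800))
win_sq = gen_win_sq(fake_denoiser)
print(win_sq.shape)     # (800,)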
Example #12
    def __init__(self,
                 filter_length=1024,
                 hop_length=512,
                 win_length=None,
                 length=None,
                 window='hann'):
        """
        This module implements an STFT using 1D convolution and 1D transposed convolution.
        Matching the sizes before and after the transform in every overlap-add setup is
        tricky, so some configurations probably won't work. Right now, this code should
        work with hop lengths that are half the filter length (50% overlap between frames).
        
        Keyword Arguments:
            filter_length {int} -- Length of filters used (default: {1024})
            hop_length {int} -- Hop length of STFT (restrict to 50% overlap between frames) (default: {512})
            win_length {[type]} -- Length of the window function applied to each frame (if not specified, it
                equals the filter length). (default: {None})
            window {str} -- Type of window to use (options are bartlett, hann, hamming, blackman, blackmanharris) 
                (default: {'hann'})
        """
        super(STFT, self).__init__()
        self.filter_length = filter_length
        self.hop_length = hop_length
        self.win_length = win_length if win_length else filter_length
        self.window = window
        self.num_samples = length
        self.forward_transform = None
        self.pad_amount = int(self.filter_length / 2)
        scale = self.filter_length / self.hop_length
        fourier_basis = np.fft.fft(np.eye(self.filter_length))

        cutoff = int((self.filter_length / 2 + 1))
        fourier_basis = np.vstack([
            np.real(fourier_basis[:cutoff, :]),
            np.imag(fourier_basis[:cutoff, :])
        ])
        forward_basis = torch.FloatTensor(fourier_basis[:, None, :])
        inverse_basis = torch.FloatTensor(
            np.linalg.pinv(scale * fourier_basis).T[:, None, :])

        assert (filter_length >= self.win_length)
        # get window and zero center pad it to filter_length
        fft_window = get_window(window, self.win_length, fftbins=True)
        fft_window = pad_center(fft_window, filter_length)
        fft_window = torch.from_numpy(fft_window).float()

        # window the bases
        forward_basis *= fft_window
        inverse_basis *= fft_window

        self.register_buffer('forward_basis', forward_basis.float())
        self.register_buffer('inverse_basis', inverse_basis.float())
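The 50% overlap restriction in the docstring can be sanity-checked with SciPy's constant overlap-add test for the periodic hann window; this check is an aside, not part of the module.

from scipy.signal import check_COLA, get_window

win = get_window('hann', 1024, fftbins=True)   # periodic hann, as used above
print(check_COLA(win, 1024, 512))              # True: hann satisfies COLA at 50% overlap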
Example #13
    def __init__(self,
                 filter_length=800,
                 hop_length=200,
                 win_length=800,
                 window='hann',
                 feat_stat=None):
        super(STFT, self).__init__()
        self.filter_length = filter_length
        self.hop_length = hop_length
        self.win_length = win_length
        self.window = window
        self.forward_transform = None
        scale = self.filter_length / self.hop_length
        fourier_basis = np.fft.fft(np.eye(self.filter_length))

        cutoff = int((self.filter_length / 2 + 1))
        fourier_basis = np.vstack([
            np.real(fourier_basis[:cutoff, :]),
            np.imag(fourier_basis[:cutoff, :])
        ])

        forward_basis = torch.FloatTensor(fourier_basis[:, None, :])
        inverse_basis = torch.FloatTensor(
            np.linalg.pinv(scale * fourier_basis).T[:, None, :])

        if window is not None:
            assert (filter_length >= win_length)
            # get window and zero center pad it to filter_length
            fft_window = get_window(window, win_length, fftbins=True)
            fft_window = pad_center(fft_window, filter_length)
            fft_window = torch.from_numpy(fft_window).float()

            # window the bases
            forward_basis *= fft_window
            inverse_basis *= fft_window

        self.register_buffer('forward_basis', forward_basis.float())
        self.register_buffer('inverse_basis', inverse_basis.float())

        if feat_stat is not None:
            assert 'spec_min' in feat_stat.keys()
            assert 'spec_scale' in feat_stat.keys()
            spec_min = feat_stat['spec_min'].view(1, -1, 1)
            spec_scale = feat_stat['spec_scale'].view(1, -1, 1)
            assert spec_min.size(1) == filter_length // 2 + 1
            assert spec_scale.size(1) == filter_length // 2 + 1
            self.register_buffer('spec_min', spec_min)
            self.register_buffer('spec_scale', spec_scale)
            self.feat_stat = True
        else:
            self.feat_stat = False
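How spec_min and spec_scale are applied is not shown in this snippet; a sketch of the min/scale normalization they suggest might look like the following. The formula and helper name are assumptions, not the author's forward pass.

import torch

def normalize_spec(stft_module, magnitude):
    # magnitude: (batch, filter_length // 2 + 1, frames) magnitude spectrogram.
    # Hypothetical use of the registered buffers; the real forward() is not shown above.
    if not stft_module.feat_stat:
        return magnitude
    return (magnitude - stft_module.spec_min) / stft_module.spec_scale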
Example #14
    def __init__(self,
                 filter_length=800,
                 hop_length=200,
                 win_length=800,
                 window='hann',
                 device="cpu"):
        super(STFT, self).__init__()
        self.device = device
        self.filter_length = filter_length
        self.hop_length = hop_length
        self.win_length = win_length
        self.window = window
        self.forward_transform = None
        scale = self.filter_length / self.hop_length
        fourier_basis = np.fft.fft(np.eye(self.filter_length))

        cutoff = int((self.filter_length / 2 + 1))
        fourier_basis = np.vstack([
            np.real(fourier_basis[:cutoff, :]),
            np.imag(fourier_basis[:cutoff, :])
        ])

        forward_basis = torch.FloatTensor(fourier_basis[:, None, :])
        inverse_basis = torch.FloatTensor(
            np.linalg.pinv(scale * fourier_basis).T[:, None, :].copy())

        if window is not None:
            assert (filter_length >= win_length)
            # get window and zero center pad it to filter_length
            fft_window = get_window(window, win_length, fftbins=True)
            fft_window = pad_center(fft_window, filter_length)
            fft_window = torch.from_numpy(fft_window).float()

            # window the bases
            forward_basis *= fft_window
            inverse_basis *= fft_window

        self.register_buffer('forward_basis', forward_basis.float())
        self.register_buffer('inverse_basis', inverse_basis.float())

        inv_t_weight = self.inverse_basis.unsqueeze(-1)
        self.inv_t = torch.nn.ConvTranspose2d(
            in_channels=inv_t_weight.shape[1],
            out_channels=inv_t_weight.shape[3],
            kernel_size=inv_t_weight.shape[3],
            stride=self.hop_length,
            padding=0,
            bias=None)
        self.inv_t.weight.data = inv_t_weight
Example #15
def libstft(y, fs, n_fft=2048, hop_length=None, win_length=None, window='hann',
            center=None, dtype=np.complex64, pad_mode='reflect'):
    # By default, use the entire frame
    if win_length is None:
        win_length = n_fft

    # Set the default hop, if it's not already specified
    if hop_length is None:
        hop_length = int(win_length // 4)

    fft_window = get_window(window, win_length, fftbins=True)

    # Pad the window out to n_fft size
    fft_window = util.pad_center(fft_window, n_fft)

    # Reshape so that the window can be broadcast
    fft_window = fft_window.reshape((-1, 1))

    # Check audio is valid
    util.valid_audio(y)

    # Pad the time series so that frames are centered
    if center:
        y = np.pad(y, int(n_fft // 2), mode=pad_mode)

    # Window the time series (frames must be n_fft long so the padded window broadcasts).
    y_frames = util.frame(y, frame_length=n_fft, hop_length=hop_length)

    # Pre-allocate the STFT matrix
    stft_matrix = np.empty((int(1 + n_fft // 2), y_frames.shape[1]),
                           dtype=dtype,
                           order='F')

    # how many columns can we fit within MAX_MEM_BLOCK?
    n_columns = int(util.MAX_MEM_BLOCK / (stft_matrix.shape[0] *
                                          stft_matrix.itemsize))

    for bl_s in range(0, stft_matrix.shape[1], n_columns):
        bl_t = min(bl_s + n_columns, stft_matrix.shape[1])

        stft_matrix[:, bl_s:bl_t] = fft.fft(fft_window *
                                            y_frames[:, bl_s:bl_t],
                                            axis=0)[:stft_matrix.shape[0]]
    f = np.linspace(0, np.pi, stft_matrix.shape[0], endpoint=True) * fs / np.pi / 2
    return stft_matrix, f
Example #16
def window_sumsquare(window,
                     n_frames,
                     hop_length=120,
                     win_length=800,
                     n_fft=800,
                     dtype=float,
                     norm=None):
    if win_length is None:
        win_length = n_fft
    n = n_fft + hop_length * (n_frames - 1)
    x = np.zeros(n, dtype=dtype)

    win_sq = get_window(window, win_length, fftbins=True)
    win_sq = librosa_util.normalize(win_sq, norm=norm)**2
    win_sq = librosa_util.pad_center(win_sq, n_fft)

    for i in range(n_frames):
        sample = i * hop_length
        x[sample:min(n, sample +
                     n_fft)] += win_sq[:max(0, min(n_fft, n - sample))]
    return x
Example #17
def frames_stft(y_frames,
                n_fft=2048,
                win_length=None,
                window='hann',
                dtype=np.complex64):
    """
    Adapted from librosa for frame input. NOTE: not centered anymore.
    """
    # By default, use the entire frame
    if win_length is None:
        win_length = n_fft

    fft_window = get_window(window, win_length, fftbins=True)

    # Pad the window out to n_fft size
    fft_window = util.pad_center(fft_window, n_fft)

    # Reshape so that the window can be broadcast
    fft_window = fft_window.reshape((-1, 1))

    # Pre-allocate the STFT matrix
    stft_matrix = np.empty((int(1 + n_fft // 2), y_frames.shape[1]),
                           dtype=dtype,
                           order='F')

    # how many columns can we fit within MAX_MEM_BLOCK?
    n_columns = int(util.MAX_MEM_BLOCK /
                    float(stft_matrix.shape[0] * stft_matrix.itemsize))

    for bl_s in range(0, stft_matrix.shape[1], n_columns):
        bl_t = min(bl_s + n_columns, stft_matrix.shape[1])

        stft_matrix[:,
                    bl_s:bl_t] = fft.fft(fft_window * y_frames[:, bl_s:bl_t],
                                         axis=0)[:stft_matrix.shape[0]]

    return stft_matrix
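Because frames_stft expects pre-framed, non-centered input, a usage sketch could frame the signal first with librosa.util.frame. The signal and lengths below are illustrative, and the call assumes the module-level imports used above (get_window, util, fft) are available.

import numpy as np
from librosa import util

sr = 22050
y = np.sin(2 * np.pi * 440 * np.arange(sr) / sr).astype(np.float32)
# frame_length must equal n_fft, since frames_stft pads only the window, not the frames.
y_frames = util.frame(y, frame_length=2048, hop_length=512)
D = frames_stft(y_frames, n_fft=2048)
print(D.shape)          # (1025, number of frames)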
Example #18
    def __init__(self, n_fft, hop_len, win_len):
        """
        Parameters
        ----------
        n_fft : int > 0 [scalar]
            Number of components in the fast Fourier transform (FFT).
        hop_len : int > 0 [scalar]
            Number audio of frames between STFT columns.
        win_len : int > 0 [scalar]
            Each frame of audio is windowed by `hann`.
            The window will be of length `win_length` and then padded
            with zeros to match `n_fft`. `n_fft` >= `win_len`.

        """
        super(PytorchSTFT, self).__init__()

        self.n_fft = int(n_fft)
        self.hop_len = int(hop_len)
        self.win_len = int(win_len)

        assert (self.n_fft >= self.win_len)

        self.cutoff_freq = self.n_fft // 2 + 1

        fft_basis = fft(np.eye(self.n_fft))
        fft_basis = np.vstack([
            np.real(fft_basis[:self.cutoff_freq, :]),
            np.imag(fft_basis[:self.cutoff_freq, :])
        ])
        fft_basis = torch.Tensor(fft_basis[:, np.newaxis, :])

        fft_win = torch.Tensor(
            pad_center(data=get_window(window='hann', Nx=self.win_len),
                       size=self.n_fft).astype(np.float32))

        self.basis = fft_basis * fft_win
Example #19
            '''
            Zero-crossing rates and fundamental frequencies must be computed before normalizing
            the data, otherwise we are not calculating what we actually want: after min-max
            normalization no value crosses 0, and the fundamentals no longer correspond to the
            actual frequencies in hertz.
            '''
            zero_crossing_rates = zero_crossing_rate(
                time_series,
                frame_length=int(frame_ms * sr_ms),
                hop_length=int(sliding_ms * sr_ms),
                center=True)
            frames = frame(time_series,
                           frame_length=int(sr_ms * frame_ms),
                           hop_length=int(sr_ms * sliding_ms))
            frames = pad_center(frames,
                                size=zero_crossing_rates.shape[1],
                                axis=1)
            fundamentals = fundamental(frames, sr)
            '''
            We normalize with respect to the maximum and minimum found across the corpus.
            '''
            time_series = (time_series - min_max[meta_file][0]) / (
                min_max[meta_file][1] - min_max[meta_file][0])
            mfccs = mfcc(time_series,
                         sr=sr,
                         n_mfcc=12,
                         n_fft=int(frame_ms * sr_ms),
                         hop_length=int(sliding_ms * sr_ms))
            d_mfccs = delta(mfccs, width=3, order=1)

            frames = frame(time_series,
Example #20
def istft_noDiv(stft_matrix, hop_length=None, win_length=None, window=None,
          center=True, dtype=np.float32):
    """

    # Copied from librosa's spectrum.py, with the division by the squared
    # window removed; that normalization shouldn't be necessary and can cause
    # problems in reconstruction.
    Inverse short-time Fourier transform (ISTFT).
    Converts a complex-valued spectrogram `stft_matrix` to time-series `y`
    by minimizing the mean squared error between `stft_matrix` and STFT of
    `y` as described in [1]_.
    In general, window function, hop length and other parameters should be same
    as in stft, which mostly leads to perfect reconstruction of a signal from
    unmodified `stft_matrix`.
    Parameters
    ----------
    stft_matrix : np.ndarray [shape=(1 + n_fft/2, t)]
        STFT matrix from `stft`
    hop_length  : int > 0 [scalar]
        Number of frames between STFT columns.
        If unspecified, defaults to `win_length / 4`.
    win_length  : int <= n_fft = 2 * (stft_matrix.shape[0] - 1)
        When reconstructing the time series, each frame is windowed
        and each sample is normalized by the sum of squared window
        according to the `window` function (see below).
        If unspecified, defaults to `n_fft`.
    window      : None, function, np.ndarray [shape=(n_fft,)]
        - None (default): use an asymmetric Hann window
        - a window function, such as `scipy.signal.hanning`
        - a user-specified window vector of length `n_fft`
    center      : boolean
        - If `True`, `D` is assumed to have centered frames.
        - If `False`, `D` is assumed to have left-aligned frames.
    dtype       : numeric type
        Real numeric type for `y`.  Default is 32-bit float.
    Returns
    -------
    y : np.ndarray [shape=(n,)]
        time domain signal reconstructed from `stft_matrix`
    Raises
    ------
    ParameterError
        If `window` is supplied as a vector of length `n_fft`
    See Also
    --------
    stft : Short-time Fourier Transform
    Examples
    --------
    >>> y, sr = librosa.load(librosa.util.example_audio_file())
    >>> D = librosa.stft(y)
    >>> y_hat = librosa.istft(D)
    >>> y_hat
    array([ -4.812e-06,  -4.267e-06, ...,   6.271e-06,   2.827e-07], dtype=float32)
    Exactly preserving length of the input signal requires explicit padding.
    Otherwise, a partial frame at the end of `y` will not be represented.
    >>> n = len(y)
    >>> n_fft = 2048
    >>> y_pad = librosa.util.fix_length(y, n + n_fft // 2)
    >>> D = librosa.stft(y_pad, n_fft=n_fft)
    >>> y_out = librosa.util.fix_length(librosa.istft(D), n)
    >>> np.max(np.abs(y - y_out))
    1.4901161e-07
    """

    n_fft = 2 * (stft_matrix.shape[0] - 1)

    # By default, use the entire frame
    if win_length is None:
        win_length = n_fft

    # Set the default hop, if it's not already specified
    if hop_length is None:
        hop_length = int(win_length / 4)

    if window is None:
        # Default is an asymmetric Hann window.
        ifft_window = scipy.signal.hann(win_length, sym=False)

    elif six.callable(window):
        # User supplied a windowing function
        ifft_window = window(win_length)

    else:
        # User supplied a window vector.
        # Make it into an array
        ifft_window = np.asarray(window)

        # Verify that the shape matches
        if ifft_window.size != n_fft:
            raise ParameterError('Size mismatch between n_fft and window size')

    # Pad out to match n_fft
    ifft_window = util.pad_center(ifft_window, n_fft)

    # scale the window
    ifft_window = ifft_window*(2.0/(win_length/hop_length))

    n_frames = stft_matrix.shape[1]
    expected_signal_len = n_fft + hop_length * (n_frames - 1)
    y = np.zeros(expected_signal_len, dtype=dtype)
    ifft_window_sum = np.zeros(expected_signal_len, dtype=dtype)
    ifft_window_square = ifft_window * ifft_window

    for i in range(n_frames):
        sample = i * hop_length
        spec = stft_matrix[:, i].flatten()
        spec = np.concatenate((spec.conj(), spec[-2:0:-1]), 0)
        ytmp = ifft_window * fft.ifft(spec).real

        y[sample:(sample + n_fft)] = y[sample:(sample + n_fft)] + ytmp
        # shouldn't need to do this sum of the squared window:
        #ifft_window_sum[sample:(sample + n_fft)] += ifft_window_square

    # don't do this:
    ## Normalize by sum of squared window
    #approx_nonzero_indices = ifft_window_sum > util.SMALL_FLOAT
    #y[approx_nonzero_indices] /= ifft_window_sum[approx_nonzero_indices]

    if center:
        y = y[int(n_fft // 2):-int(n_fft // 2)]

    return y
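A round-trip sketch (mine, not from the source): forward STFT with librosa, inverse with istft_noDiv. An explicit window vector is passed so the call does not depend on the deprecated scipy.signal.hann alias in the default branch, and it assumes the module-level imports this function relies on (scipy, six, util, fft) are present.

import numpy as np
import scipy.signal
import librosa

sr = 22050
y = np.sin(2 * np.pi * 220 * np.arange(sr) / sr).astype(np.float32)
win = scipy.signal.get_window('hann', 2048, fftbins=True)
D = librosa.stft(y, n_fft=2048, hop_length=512, window=win)
y_hat = istft_noDiv(D, hop_length=512, window=win)
# Lengths match up to edge effects; amplitudes differ from librosa.istft because
# the sum-of-squared-window normalization is intentionally skipped here.
print(y.shape, y_hat.shape)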
Example #21
def stft(y,
         n_fft=2048,
         hop_length=None,
         win_length=None,
         window='hann',
         center=True,
         dtype=np.complex64,
         pad_mode='reflect'):
    """Short-time Fourier transform (STFT)

    Returns a complex-valued matrix D such that
        `np.abs(D[f, t])` is the magnitude of frequency bin `f`
        at frame `t`

        `np.angle(D[f, t])` is the phase of frequency bin `f`
        at frame `t`

    Parameters
    ----------
    y : np.ndarray [shape=(n,)], real-valued
        the input signal (audio time series)

    n_fft : int > 0 [scalar]
        FFT window size

    hop_length : int > 0 [scalar]
        number audio of frames between STFT columns.
        If unspecified, defaults `win_length / 4`.

    win_length  : int <= n_fft [scalar]
        Each frame of audio is windowed by `window()`.
        The window will be of length `win_length` and then padded
        with zeros to match `n_fft`.

        If unspecified, defaults to ``win_length = n_fft``.

    window : string, tuple, number, function, or np.ndarray [shape=(n_fft,)]
        - a window specification (string, tuple, or number);
          see `scipy.signal.get_window`
        - a window function, such as `scipy.signal.hanning`
        - a vector or array of length `n_fft`

        .. see also:: `filters.get_window`

    center      : boolean
        - If `True`, the signal `y` is padded so that frame
          `D[:, t]` is centered at `y[t * hop_length]`.
        - If `False`, then `D[:, t]` begins at `y[t * hop_length]`

    dtype       : numeric type
        Complex numeric type for `D`.  Default is 64-bit complex.

    pad_mode : string
        If `center=True`, the padding mode to use at the edges of the signal.
        By default, STFT uses reflection padding.


    Returns
    -------
    D : np.ndarray [shape=(1 + n_fft/2, t), dtype=dtype]
        STFT matrix


    See Also
    --------
    istft : Inverse STFT

    ifgram : Instantaneous frequency spectrogram

    np.pad : array padding

    Notes
    -----
    This function caches at level 20.


    Examples
    --------

    >>> y, sr = librosa.load(librosa.util.example_audio_file())
    >>> D = np.abs(librosa.stft(y))
    >>> D
    array([[2.58028018e-03, 4.32422794e-02, 6.61255598e-01, ...,
            6.82710262e-04, 2.51654536e-04, 7.23036574e-05],
           [2.49403086e-03, 5.15930466e-02, 6.00107312e-01, ...,
            3.48026224e-04, 2.35853557e-04, 7.54836728e-05],
           [7.82410789e-04, 1.05394892e-01, 4.37517226e-01, ...,
            6.29352580e-04, 3.38571583e-04, 8.38094638e-05],
           ...,
           [9.48568513e-08, 4.74725084e-07, 1.50052492e-05, ...,
            1.85637656e-08, 2.89708542e-08, 5.74304337e-09],
           [1.25165826e-07, 8.58259284e-07, 1.11157215e-05, ...,
            3.49099771e-08, 3.11740926e-08, 5.29926236e-09],
           [1.70630571e-07, 8.92518756e-07, 1.23656537e-05, ...,
            5.33256745e-08, 3.33264900e-08, 5.13272980e-09]], dtype=float32)


    Use left-aligned frames, instead of centered frames

    >>> D_left = np.abs(librosa.stft(y, center=False))


    Use a shorter hop length

    >>> D_short = np.abs(librosa.stft(y, hop_length=64))


    Display a spectrogram

    >>> import matplotlib.pyplot as plt
    >>> librosa.display.specshow(librosa.amplitude_to_db(D,
    ...                                                  ref=np.max),
    ...                          y_axis='log', x_axis='time')
    >>> plt.title('Power spectrogram')
    >>> plt.colorbar(format='%+2.0f dB')
    >>> plt.tight_layout()

    """

    # By default, use the entire frame
    if win_length is None:
        win_length = n_fft

    # Set the default hop, if it's not already specified
    if hop_length is None:
        hop_length = int(win_length // 4)

    #fft_window = get_window(window, win_length, fftbins=True)
    fft_window = vorbis(win_length)

    # Pad the window out to n_fft size
    fft_window = util.pad_center(fft_window, n_fft)

    # Reshape so that the window can be broadcast
    fft_window = fft_window.reshape((-1, 1))

    # Check audio is valid
    util.valid_audio(y)

    # Pad the time series so that frames are centered
    if center:
        y = np.pad(y, int(n_fft // 2), mode=pad_mode)

    # Window the time series.
    y_frames = util.frame(y, frame_length=n_fft, hop_length=hop_length)

    # Pre-allocate the STFT matrix
    stft_matrix = np.empty((int(1 + n_fft // 2), y_frames.shape[1]),
                           dtype=dtype,
                           order='F')

    # how many columns can we fit within MAX_MEM_BLOCK?
    n_columns = int(util.MAX_MEM_BLOCK /
                    (stft_matrix.shape[0] * stft_matrix.itemsize))

    for bl_s in range(0, stft_matrix.shape[1], n_columns):
        bl_t = min(bl_s + n_columns, stft_matrix.shape[1])

        stft_matrix[:,
                    bl_s:bl_t] = fft.fft(fft_window * y_frames[:, bl_s:bl_t],
                                         axis=0)[:stft_matrix.shape[0]]

    return stft_matrix
Example #22
import librosa
import numpy as np
import librosa.util as util
from librosa.filters import get_window

audio_path = "../AudioData/audio/D4_750.wav"
noise_path = "../AudioData/noise/Pink Noise.wav"
# load the audio file
y, sr = librosa.load(audio_path)

# split the audio into frames
win_len = n_fft = 200
hop_length = 80
# Pad the time series so that frames are centered
y = np.pad(y, int(n_fft // 2), mode='reflect')
# Window the time series.
y_frames = util.frame(y, frame_length=n_fft, hop_length=hop_length, axis=0)

# get the window coefficients
fft_window = get_window('hamm', 10, fftbins=False)
# fft_window = fft_window[1:-1]
print(fft_window)
fft_window = get_window('hamm', 10, fftbins=True)
print(fft_window)
# Pad the window out to n_fft size
fft_window = util.pad_center(fft_window, n_fft)
# Reshape so that the window can be broadcast
fft_window = fft_window.reshape((-1, 1))

#
Example #23
    def pad_center(self, audio_data):
        return util.pad_center(audio_data, self.n_fft)
Example #24
    def hht(self,
            y,
            hop_length=None,
            win_length=None,
            center=True,
            dtype=np.complex64,
            pad_mode='reflect'):
        """Hilbert-Huang transform (HHT)

        Parameters
        ----------
        y : np.ndarray [shape=(n,)], real-valued
            the input signal (audio time series)

        hop_length : int > 0 [scalar]
            number audio of frames between STFT columns.
            If unspecified, defaults `win_length / 4`.

        win_length  : int <= n_fft [scalar]
            Each frame of audio is windowed by `window()`.
            The window will be of length `win_length` and then padded
            with zeros to match `n_fft`.

            If unspecified, defaults to ``win_length = n_fft``.

        center      : boolean
            - If `True`, the signal `y` is padded so that frame
              `D[:, t]` is centered at `y[t * hop_length]`.
            - If `False`, then `D[:, t]` begins at `y[t * hop_length]`

        dtype       : numeric type
            Complex numeric type for `D`.  Default is 64-bit complex.

        pad_mode : string
            If `center=True`, the padding mode to use at the edges of the signal.
            By default, HHT uses reflection padding.

        Returns
        -------
        hht_matrix : np.ndarray [shape=(30, t), dtype=dtype]
        bjp_matrix : np.ndarray [shape=(n_hht-1, t), dtype=dtype]

        """

        # By default, use the entire frame
        if win_length is None:
            win_length = self.n_hht

        # Set the default hop, if it's not already specified
        if hop_length is None:
            hop_length = int(win_length / 2)

        hht_window = self.window

        # Pad the window out to n_hht size
        hht_window = util.pad_center(hht_window, self.n_hht)

        # Reshape so that the window can be broadcast
        hht_window = hht_window.reshape((-1, 1))

        # Check audio is valid
        util.valid_audio(y)

        # Pad the time series so that frames are centered
        if center:
            y = np.pad(y, self.n_hht - 1, mode=pad_mode)

        # Window the time series.
        y_frames = util.frame(y,
                              frame_length=self.n_hht,
                              hop_length=hop_length).T

        # Pre-allocate the HHT matrix
        hht_matrix = np.empty((27, y_frames.shape[0]), dtype=dtype, order='F')

        bjp_matrix = np.empty((self.n_hht - 1, y_frames.shape[0]),
                              dtype=dtype,
                              order='F')

        for bl_s in range(hht_matrix.shape[1]):
            frame_signal = hht_window[:, 0] * y_frames[bl_s, :]
            A, f, bjp = get_hht(frame_signal, self.fs)
            hht_matrix[:, bl_s] = self.hht_based_feature(A, f * self.fs, bjp)
            bjp_matrix[:, bl_s] = bjp

        return hht_matrix, bjp_matrix
Example #25
frame_step = config['frame_step']
n_fft = config['n_fft']
n_mels = config['mfcc_bank_cnt']
fmin = config['fmin']
fmax = config['fmax']
dtype = config.get('dtype', "int")
high_prec = config.get('use_high_prec', False) or dtype == "fix32_scal"
use_power = False
rad4 = round(math.log(n_fft // 2, 4)) == math.log(n_fft // 2, 4)
ndct = config.get('n_dct', False)

from librosa.filters import get_window
from librosa import util
librosa_fft_window = get_window("hann", frame_size, fftbins=True)
# Pad the window out to n_fft size
librosa_fft_window = util.pad_center(librosa_fft_window, n_fft)

stft = librosa.core.spectrum.stft(data,
                                  n_fft,
                                  frame_step,
                                  frame_size,
                                  center=False,
                                  pad_mode="constant")
spect = np.abs(stft)**(1 if not use_power else 2)
mel_basis = librosa.filters.mel(samplerate, n_fft, n_mels, fmin, fmax)
mel_spect = np.dot(mel_basis, spect)
logmel = power_to_db(mel_spect, top_db=None)
mfcc = scipy.fftpack.dct(logmel, axis=0, type=2, norm=None)
with open("ground_truth.h", "w") as f:
    f.write(f"float ground_truth[] = {{\n")
    for elem in mfcc.T.flatten():
Example #26
def test(y,
         n_fft=2048,
         hop_length=None,
         win_length=None,
         window=None,
         center=True,
         dtype=np.complex64):
    import scipy
    import six
    from librosa import util
    # By default, use the entire frame
    if win_length is None:
        win_length = n_fft
        # win_length = tf.constant(n_fft)

    # Set the default hop, if it's not already specified
    if hop_length is None:
        hop_length = int(win_length / 4)
        # hop_length = win_length/4
        # hop_length.to_int64()

    if window is None:
        # Default is an asymmetric Hann window
        fft_window = scipy.signal.hann(win_length, sym=False)
        # fft_window = tf.constant(scipy.signal.hann(convertTFtoNP(win_length), sym=False))

    elif six.callable(window):
        # User supplied a window function

        fft_window = window(win_length)

    else:
        # User supplied a window vector.
        # Make sure it's an array:
        fft_window = np.asarray(window)

        # validate length compatibility
        #        if fft_window.size != n_fft:
        #           raise ParameterError('Size mismatch between n_fft and len(window)')

    # Pad the window out to n_fft size
    fft_window = util.pad_center(fft_window, n_fft)
    # fft_window.assign(util.pad_center(convertTFtoNP(fft_window), n_fft))

    # Reshape so that the window can be broadcast
    fft_window = fft_window.reshape((-1, 1))
    # tf.reshape(fft_window, (-1,1))

    if center:
        #util.valid_audio(y)
        #y_ = np.pad(convertTFtoNP(y), int(n_fft // 2), mode='reflect')
        padding = int(n_fft // 2)
        y_frames = tf.Variable(tf.pad(y, [[padding, padding]], mode='REFLECT'))

    # Window the time series.
    #y_frames = util.frame(y_, frame_length=n_fft, hop_length=hop_length)
    #y_frames.assign(librosa.util.frame(convertTFtoNP(y_frames), frame_length=n_fft, hop_length=1))

    y_frames = frame(y_frames, n_fft, hop_length)

    # Pre-allocate the STFT matrix
    #stft_matrix = np.empty((int(1 + n_fft // 2), y_frames.shape[1]),
    #                       dtype=dtype,
    #                      order='F')
    stft_matrix = tf.Variable(
        tf.zeros(y_frames.get_shape()[1]._value, (int(1 + n_fft // 2)),
                 dtype='float32'))

    # how many columns can we fit within MAX_MEM_BLOCK?
    #n_columns = int(util.MAX_MEM_BLOCK / (stft_matrix.shape[0] *
    #                                      stft_matrix.itemsize))

    n_columns = int(librosa.util.MAX_MEM_BLOCK /
                    (stft_matrix.get_shape()[1]._value *
                     convertTFtoNP(stft_matrix).itemsize))

    #for bl_s in range(0, stft_matrix.shape[1], n_columns):
    for bl_s in range(0, stft_matrix.get_shape()[0]._value, n_columns):
        #bl_t = min(bl_s + n_columns, stft_matrix.shape[1])
        bl_t = min(bl_s + n_columns, stft_matrix.get_shape()[0]._value)
        # RFFT and Conjugate here to match phase from DPWE code
        #stft_matrix[:, bl_s:bl_t] = scipy.fftpack.fft(fft_window *
        #                                    y_frames[:, bl_s:bl_t],
        #                                    axis=0)[:stft_matrix.shape[0]].conj()

        stft_matrix = tf.scatter_update(
            stft_matrix, tf.constant(list(range(bl_s, bl_t, 1))),
            tf.conj(
                tf.slice(
                    tf.fft(fft_window * tf.slice(
                        y_frames, [0, bl_s],
                        [y_frames.get_shape()[0]._value, bl_t - bl_s])), [0],
                    [stft_matrix.get_shape()[0]._value])))

    return stft_matrix