def __init__(self, filter_length: int = 800, hop_length: int = 200, win_length: int = 800, window: str = "hann"):
    """Precompute the windowed forward and inverse Fourier bases.

    Args:
        filter_length: Size of the DFT matrix the bases are derived from.
        hop_length: Hop size; used here only to scale the inverse basis by
            ``filter_length / hop_length``.
        win_length: Length of the window before it is centre-padded to
            ``filter_length``; must not exceed ``filter_length``.
        window: Window specification handed to ``get_window``. ``None``
            leaves both bases unwindowed.

    Raises:
        AssertionError: If a window is requested and
            ``filter_length < win_length``.
    """
    super(STFT, self).__init__()
    self.filter_length = filter_length
    self.hop_length = hop_length
    self.win_length = win_length
    self.window = window  # TODO
    self.forward_transform = None

    scale = self.filter_length / self.hop_length
    fourier_basis = np.fft.fft(np.eye(self.filter_length))

    # Keep the non-redundant bins and stack real parts over imaginary parts.
    cutoff = int((self.filter_length / 2 + 1))
    fourier_basis = np.vstack([
        np.real(fourier_basis[:cutoff, :]),
        np.imag(fourier_basis[:cutoff, :]),
    ])

    forward_basis = torch.FloatTensor(fourier_basis[:, None, :])
    inverse_basis = torch.FloatTensor(np.linalg.pinv(scale * fourier_basis).T[:, None, :])

    if window is not None:
        assert (filter_length >= win_length)
        fft_window = get_window(window, win_length, fftbins=True)
        # `size` is passed by keyword: newer librosa releases make it
        # keyword-only, and older releases accept the keyword as well.
        fft_window = pad_center(fft_window, size=filter_length)
        fft_window = torch.from_numpy(fft_window).to(torch.float32)

        forward_basis *= fft_window
        inverse_basis *= fft_window

    self.register_buffer("forward_basis", forward_basis)
    self.register_buffer("inverse_basis", inverse_basis)
def __init__(self, filter_length, hop_length, win_length, window='hann'):
    """Build the forward and inverse STFT bases and register them as buffers.

    Args:
        filter_length: Size of the DFT matrix the bases are derived from.
        hop_length: Hop size; the inverse basis is the pseudo-inverse of
            the Fourier basis scaled by ``filter_length / hop_length``.
        win_length: Window length before centre-padding to
            ``filter_length``; must not exceed ``filter_length``.
        window: Window specification for ``get_window``; ``None`` skips
            windowing of the bases.

    Raises:
        AssertionError: If a window is requested and
            ``filter_length < win_length``.
    """
    super(STFT, self).__init__()
    self.filter_length = filter_length
    self.hop_length = hop_length
    self.win_length = win_length
    self.window = window
    self.forward_transform = None

    scale = self.filter_length / self.hop_length
    fourier_basis = np.fft.fft(np.eye(self.filter_length))

    # Real parts of the first `cutoff` rows stacked over their imaginary parts.
    cutoff = int((self.filter_length / 2 + 1))
    fourier_basis = np.vstack([
        np.real(fourier_basis[:cutoff, :]),
        np.imag(fourier_basis[:cutoff, :]),
    ])

    forward_basis = torch.FloatTensor(fourier_basis[:, None, :])
    inverse_basis = torch.FloatTensor(
        np.linalg.pinv(scale * fourier_basis).T[:, None, :])

    if window is not None:
        assert (filter_length >= win_length)
        # Get the window and zero centre-pad it to filter_length.
        fft_window = get_window(window, win_length, fftbins=True)
        # Keyword `size` works on both old and new librosa; positional
        # use is rejected by releases where it is keyword-only.
        fft_window = pad_center(fft_window, size=filter_length)
        fft_window = torch.from_numpy(fft_window).float()

        # Window the bases.
        forward_basis *= fft_window
        inverse_basis *= fft_window

    self.register_buffer('forward_basis', forward_basis.float())
    self.register_buffer('inverse_basis', inverse_basis.float())
def process_wav_length(wav_data, filenames, df, conversion="zero", seconds=0.25, sr=44100, resample_size=0.125, testing=False):
    """Bring all audio clips to the same length and merge them with ``df``.

    Clips longer than ``int(seconds * sr)`` samples are dropped; the
    remaining ones are lengthened to exactly that many samples.

    Args:
        wav_data: Array of clips; it is indexed with a boolean mask, so it
            must support NumPy-style boolean indexing.
        filenames: DataFrame with ``"file"`` and ``"original_name"``
            columns, row-aligned with ``wav_data``.
        df: DataFrame with a ``"file"`` column to merge the samples into.
        conversion: ``"zero"`` centre-pads short clips via ``pad_center``;
            ``"repeat"`` tiles each clip with ``np.resize`` until the target
            length is filled. ``"rescale"`` is not implemented.
        seconds: Target duration.
        sr: Sample rate used to convert ``seconds`` into samples.
        resample_size: Currently unused.
        testing: Currently unused.

    Returns:
        ``df`` right-merged (1:1 on ``"file"``) with one row per kept clip;
        the sample values are stored in integer-named columns.

    Raises:
        ValueError: For ``conversion="rescale"`` (not implemented) and for
            any other unsupported ``conversion`` value.
    """
    if conversion == "rescale":
        raise ValueError("TO-DO")
    if conversion not in ("zero", "repeat"):
        # Previously an unknown mode surfaced as an UnboundLocalError on
        # `new_wavs`; fail with an explicit message instead.
        raise ValueError(
            f"Unsupported conversion {conversion!r}; expected 'zero' or 'repeat'")

    amount_samples = int(seconds * sr)
    # A plain comprehension (unlike np.vectorize) also copes with empty input.
    sizes = np.array([len(a) for a in wav_data], dtype=int)
    idx = sizes <= amount_samples
    processed_wav_data = wav_data[idx]

    if conversion == "zero":
        new_wavs = np.asarray([pad_center(a, amount_samples) for a in processed_wav_data])
    else:  # conversion == "repeat"
        new_wavs = np.asarray([np.resize(a, amount_samples) for a in processed_wav_data])

    new_df = pd.DataFrame(new_wavs)
    new_df["file"] = filenames.loc[idx, "file"].values
    new_df["original_name"] = filenames.loc[idx, "original_name"].values
    full_df = pd.merge(df, new_df, left_on="file", right_on="file", validate="1:1", how="right")
    return full_df
def __init__(self, filter_length=800, hop_length=200, win_length=800, window='hann'):
    """Build the forward and inverse STFT bases as ``nd`` arrays.

    Args:
        filter_length: Size of the DFT matrix the bases are derived from.
        hop_length: Hop size; the inverse basis is the pseudo-inverse of
            the Fourier basis scaled by ``filter_length / hop_length``.
        win_length: Window length before centre-padding to
            ``filter_length``; must not exceed ``filter_length``.
        window: Window specification for ``get_window``; ``None`` skips
            windowing of the bases.

    Raises:
        AssertionError: If a window is requested and
            ``filter_length < win_length``.
    """
    super(STFT, self).__init__()
    self.filter_length = filter_length
    self.hop_length = hop_length
    self.win_length = win_length
    self.window = window
    self.forward_transform = None

    scale = self.filter_length / self.hop_length
    fourier_basis = np.fft.fft(np.eye(self.filter_length))

    # Real parts of the first `cutoff` rows stacked over their imaginary parts.
    cutoff = int((self.filter_length / 2 + 1))
    fourier_basis = np.vstack([
        np.real(fourier_basis[:cutoff, :]),
        np.imag(fourier_basis[:cutoff, :]),
    ])

    self.forward_basis = nd.array(fourier_basis[:, np.newaxis, :])
    self.inverse_basis = nd.array(
        np.linalg.pinv(scale * fourier_basis).T[:, np.newaxis, :])

    if window is not None:
        # The window is padded *up* to filter_length, so it may not be
        # longer than the filter. The comparison used to be inverted, which
        # rejected every valid win_length < filter_length configuration.
        assert (filter_length >= win_length)
        # Get the window and zero centre-pad it to filter_length.
        fft_window = get_window(window, win_length, fftbins=True)
        # Keyword `size` is accepted by old and new librosa releases alike.
        fft_window = pad_center(fft_window, size=filter_length)
        fft_window = nd.array(fft_window)

        # Window the bases.
        self.forward_basis *= fft_window
        self.inverse_basis *= fft_window
def window_sumsquare(window, n_frames, hop_length=200, win_length=800, n_fft=800, dtype=np.float32, norm=None):
    """Compute the sum-square envelope of a window at a given hop length.

    Args:
        window: Window specification, as accepted by ``get_window``.
        n_frames: Number of analysis frames.
        hop_length: Number of samples to advance between frames.
        win_length: Length of the window; ``None`` means ``n_fft``.
        n_fft: Length of each analysis frame.
        dtype: Data type of the returned envelope.
        norm: Normalisation mode forwarded to ``librosa_util.normalize``.

    Returns:
        Array of length ``n_fft + hop_length * (n_frames - 1)`` holding the
        overlap-added squared window.
    """
    if win_length is None:
        win_length = n_fft

    n = n_fft + hop_length * (n_frames - 1)
    x = np.zeros(n, dtype=dtype)

    # Compute the squared window at the desired length.
    win_sq = get_window(window, win_length, fftbins=True)
    win_sq = librosa_util.normalize(win_sq, norm=norm) ** 2
    # `size` by keyword: it is keyword-only in newer librosa releases.
    win_sq = librosa_util.pad_center(win_sq, size=n_fft)

    # Fill the envelope by overlap-adding one squared window per frame.
    for i in range(n_frames):
        sample = i * hop_length
        x[sample:min(n, sample + n_fft)] += win_sq[:max(0, min(n_fft, n - sample))]
    return x
def __init__(self, filter_length: int = 1024, hop_length: int = 512, win_length: int = None,
             window: str = 'hann'):
    """Precompute the squared window and the windowed STFT bases.

    Args:
        filter_length: Size of the DFT matrix the bases are derived from.
        hop_length: Hop size; the inverse basis is the pseudo-inverse of
            the Fourier basis scaled by ``filter_length / hop_length``.
        win_length: Window length before centre-padding; a falsy value
            falls back to ``filter_length``.
        window: Window specification for ``get_window``.

    Raises:
        AssertionError: If ``filter_length`` is smaller than the effective
            window length.
    """
    super().__init__()
    self.filter_length = filter_length
    self.hop_length = hop_length
    self.win_length = win_length if win_length else filter_length
    self.window = window

    self.pad_amount = self.filter_length // 2

    # Make the FFT window.
    assert (filter_length >= self.win_length)
    # Get the window and zero centre-pad it to filter_length.
    fft_window = get_window(window, self.win_length, fftbins=True)
    # Keyword `size` is accepted by old and new librosa releases alike.
    fft_window = pad_center(fft_window, size=filter_length)
    fft_window = torch.from_numpy(fft_window).float()

    # Calculate the Fourier basis: real rows stacked over imaginary rows.
    cut_off = int((self.filter_length / 2 + 1))
    fourier_basis = np.fft.fft(np.eye(self.filter_length))
    fourier_basis = np.vstack([
        np.real(fourier_basis[:cut_off, :]),
        np.imag(fourier_basis[:cut_off, :]),
    ])

    # Make the forward and inverse bases.
    self.register_buffer('square_window', fft_window ** 2)

    forward_basis = torch.FloatTensor(fourier_basis[:, np.newaxis, :]) * fft_window
    inverse_basis = torch.FloatTensor(
        np.linalg.pinv(self.filter_length / self.hop_length * fourier_basis).T[:, np.newaxis, :]
    ) * fft_window
    # torch.pinverse has a bug, so at this time, it is separated into two parts.
    self.register_buffer('forward_basis', forward_basis)
    self.register_buffer('inverse_basis', inverse_basis)
def window_sumsquare(window, n_frames, hop_length=200, win_length=800, n_fft=800, dtype=np.float32, norm=None):
    """
    # from librosa 0.6
    Compute the sum-square envelope of a window function at a given hop length.

    This is used to estimate modulation effects induced by windowing
    observations in short-time fourier transforms.

    Parameters
    ----------
    window : string, tuple, number, callable, or list-like
        Window specification, as in `get_window`
    n_frames : int > 0
        The number of analysis frames
    hop_length : int > 0
        The number of samples to advance between frames
    win_length : [optional]
        The length of the window function. By default, this matches `n_fft`.
    n_fft : int > 0
        The length of each analysis frame.
    dtype : np.dtype
        The data type of the output
    norm : [optional]
        Normalisation mode forwarded to `librosa_util.normalize`.

    Returns
    -------
    wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))`
        The sum-squared envelope of the window function
    """
    if win_length is None:
        win_length = n_fft

    # Total length covered by n_frames frames spaced hop_length apart.
    n = n_fft + hop_length * (n_frames - 1)
    x = np.zeros(n, dtype=dtype)

    # Compute the squared window at the desired length.
    win_sq = get_window(window, win_length, fftbins=True)
    win_sq = librosa_util.normalize(win_sq, norm=norm) ** 2
    # Zero-pad to n_fft. `size` is passed by keyword because newer librosa
    # releases make it keyword-only; older ones accept the keyword too.
    win_sq = librosa_util.pad_center(win_sq, size=n_fft)

    # Fill the envelope: each frame adds the squared window shifted by
    # hop_length samples relative to the previous one.
    for i in range(n_frames):
        sample = i * hop_length
        x[sample:min(n, sample + n_fft)] += win_sq[:max(0, min(n_fft, n - sample))]
    return x
def __getitem__(self, index):
    """Return the padded, scaled sample at ``index`` together with its target.

    The raw sample is passed through ``pad_center`` with ``self.samples``,
    transformed by ``self.scaler`` as a single row, and returned reshaped
    to one row alongside ``self.y[index]``.
    """
    padded = pad_center(self.x[index], self.samples)
    scaled = self.scaler.transform(padded.reshape(1, -1))
    target = self.y[index]
    return scaled.reshape(1, -1), target
def window_sumsquare(
    window,
    n_frames,
    hop_length,
    win_length,
    n_fft,
    dtype=np.float32,
    norm=None,
):
    """
    # from librosa 0.6
    Compute the sum-square envelope of a window function at a given hop length.

    This is used to estimate modulation effects induced by windowing
    observations in short-time fourier transforms.

    Parameters
    ----------
    window : string, tuple, number, callable, or list-like
        Window specification, as in `get_window`
    n_frames : int > 0
        The number of analysis frames
    hop_length : int > 0
        The number of samples to advance between frames
    win_length : [optional]
        The length of the window function. `None` means `n_fft`.
    n_fft : int > 0
        The length of each analysis frame.
    dtype : np.dtype
        The data type of the output
    norm : [optional]
        Normalisation mode forwarded to `librosa_util.normalize`.

    Returns
    -------
    wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))`
        The sum-squared envelope of the window function
    """
    if win_length is None:
        win_length = n_fft

    n = n_fft + hop_length * (n_frames - 1)
    x = np.zeros(n, dtype=dtype)

    # Compute the squared window at the desired length.
    win_sq = get_window(window, win_length, fftbins=True)
    win_sq = librosa_util.normalize(win_sq, norm=norm) ** 2
    # `size` by keyword: it is keyword-only in newer librosa releases.
    win_sq = librosa_util.pad_center(win_sq, size=n_fft)

    # Fill the envelope.
    for i in range(n_frames):
        sample = i * hop_length
        x[sample : min(n, sample + n_fft)] += win_sq[: max(0, min(n_fft, n - sample))]
    return x
def get_mfcc(self, sig_frm):
    """Compute 13 MFCCs for one frame and record a 512-point dB spectrum.

    The frame is divided by 32768.0 (presumably to scale 16-bit PCM into
    [-1, 1) -- confirm against the caller), Hamming-windowed and
    transformed with an FFT of the frame's own length to obtain a
    40-band mel spectrum and the MFCCs. A second, reflect-padded 512-point
    spectrum of the same frame is converted to dB and handed to
    ``self.spec_tape_add``.

    Args:
        sig_frm: One-dimensional frame of samples; its length must not
            exceed 512, because the window is centre-padded to 512 for
            the second spectrum.

    Returns:
        The MFCC matrix produced by ``librosa.feature.mfcc``.
    """
    sig_frm = sig_frm / 32768.0
    window = 'hamming'
    win_length = sig_frm.shape[0]
    hop_length = win_length

    # --- Pass 1: FFT of exactly one frame length -> MFCCs -----------------
    n_fft = win_length
    fft_window = get_window(window, win_length, fftbins=True)
    # `size` by keyword: it is keyword-only in newer librosa releases.
    fft_window = util.pad_center(fft_window, size=n_fft)
    fft_window = fft_window.reshape((-1, 1))
    util.valid_audio(sig_frm)
    sig_frm = sig_frm[:, None]
    n_bins = int(1 + n_fft // 2)
    stft = fft.fft(fft_window * sig_frm, axis=0)[:n_bins].conj()
    powspec = np.abs(stft) ** 2
    melspec = librosa.feature.melspectrogram(S=powspec, hop_length=hop_length, n_fft=n_fft, n_mels=40)
    # `librosa.logamplitude` was removed in librosa 0.6; `power_to_db` is
    # its replacement and is already relied on below.
    mfcc = librosa.feature.mfcc(S=librosa.power_to_db(melspec), n_mfcc=13)

    # --- Pass 2: 512-point spectrum of the reflect-padded frame -----------
    n_fft = 512
    fft_window = get_window(window, win_length, fftbins=True)
    fft_window = util.pad_center(fft_window, size=n_fft)
    fft_window = fft_window.reshape((-1, 1))
    y = np.pad(sig_frm[:, 0], int(n_fft // 2), mode='reflect')
    # Only the first frame of the padded signal is used.
    pad_frame = librosa.util.frame(y, frame_length=n_fft, hop_length=win_length * 2)[:, 0][:, None]
    n_bins = int(1 + n_fft // 2)
    stft = fft.fft(fft_window * pad_frame, axis=0)[:n_bins].conj()
    powspec = np.abs(stft) ** 2
    spec = librosa.power_to_db(powspec)
    self.spec_tape_add(spec)
    return mfcc
def gen_win_sq(denoiser):
    """Return the squared analysis window of ``denoiser.stft``.

    The window type, window length and filter length are read from
    ``denoiser.stft``; the squared window is centre-padded with zeros to
    the filter length.
    """
    window = denoiser.stft.window
    win_length = denoiser.stft.win_length
    n_fft = denoiser.stft.filter_length

    # Compute the squared window at the desired length.
    win_sq = get_window(window, win_length, fftbins=True)
    win_sq = librosa_util.normalize(win_sq, norm=None) ** 2
    # `size` by keyword: it is keyword-only in newer librosa releases.
    win_sq = librosa_util.pad_center(win_sq, size=n_fft)
    return win_sq
def __init__(self, filter_length=1024, hop_length=512, win_length=None, length=None,
             window='hann'):
    """
    This module implements an STFT using 1D convolution and 1D transpose
    convolutions. This is a bit tricky so there are some cases that probably
    won't work as working out the same sizes before and after in all overlap
    add setups is tough. Right now, this code should work with hop lengths
    that are half the filter length (50% overlap between frames).

    Keyword Arguments:
        filter_length {int} -- Length of filters used (default: {1024})
        hop_length {int} -- Hop length of STFT (restrict to 50% overlap
            between frames) (default: {512})
        win_length {[type]} -- Length of the window function applied to each
            frame (if not specified, it equals the filter length).
            (default: {None})
        length {[type]} -- Stored unchanged as ``self.num_samples``.
            (default: {None})
        window {str} -- Type of window to use (options are bartlett, hann,
            hamming, blackman, blackmanharris) (default: {'hann'})
    """
    super(STFT, self).__init__()
    self.filter_length = filter_length
    self.hop_length = hop_length
    self.win_length = win_length if win_length else filter_length
    self.window = window
    self.num_samples = length
    self.forward_transform = None
    self.pad_amount = int(self.filter_length / 2)

    scale = self.filter_length / self.hop_length
    fourier_basis = np.fft.fft(np.eye(self.filter_length))

    # Real parts of the first `cutoff` rows stacked over their imaginary parts.
    cutoff = int((self.filter_length / 2 + 1))
    fourier_basis = np.vstack([
        np.real(fourier_basis[:cutoff, :]),
        np.imag(fourier_basis[:cutoff, :]),
    ])
    forward_basis = torch.FloatTensor(fourier_basis[:, None, :])
    inverse_basis = torch.FloatTensor(
        np.linalg.pinv(scale * fourier_basis).T[:, None, :])

    assert (filter_length >= self.win_length)
    # Get the window and zero centre-pad it to filter_length.
    fft_window = get_window(window, self.win_length, fftbins=True)
    # Keyword `size` is accepted by old and new librosa releases alike.
    fft_window = pad_center(fft_window, size=filter_length)
    fft_window = torch.from_numpy(fft_window).float()

    # Window the bases.
    forward_basis *= fft_window
    inverse_basis *= fft_window

    self.register_buffer('forward_basis', forward_basis.float())
    self.register_buffer('inverse_basis', inverse_basis.float())
def __init__(self, filter_length=800, hop_length=200, win_length=800, window='hann', feat_stat=None):
    """Build the STFT bases and optionally register feature statistics.

    Args:
        filter_length: Size of the DFT matrix the bases are derived from.
        hop_length: Hop size; the inverse basis is the pseudo-inverse of
            the Fourier basis scaled by ``filter_length / hop_length``.
        win_length: Window length before centre-padding to
            ``filter_length``; must not exceed ``filter_length``.
        window: Window specification for ``get_window``; ``None`` skips
            windowing of the bases.
        feat_stat: Optional mapping with tensors under ``'spec_min'`` and
            ``'spec_scale'``. Each is viewed as ``(1, -1, 1)`` and must
            then have ``filter_length // 2 + 1`` entries along axis 1.

    Raises:
        AssertionError: If the window is longer than the filter, or if
            ``feat_stat`` lacks a key or has the wrong size.
    """
    super(STFT, self).__init__()
    self.filter_length = filter_length
    self.hop_length = hop_length
    self.win_length = win_length
    self.window = window
    self.forward_transform = None

    scale = self.filter_length / self.hop_length
    fourier_basis = np.fft.fft(np.eye(self.filter_length))

    # Real parts of the first `cutoff` rows stacked over their imaginary parts.
    cutoff = int((self.filter_length / 2 + 1))
    fourier_basis = np.vstack([
        np.real(fourier_basis[:cutoff, :]),
        np.imag(fourier_basis[:cutoff, :]),
    ])

    forward_basis = torch.FloatTensor(fourier_basis[:, None, :])
    inverse_basis = torch.FloatTensor(
        np.linalg.pinv(scale * fourier_basis).T[:, None, :])

    if window is not None:
        assert (filter_length >= win_length)
        # Get the window and zero centre-pad it to filter_length.
        fft_window = get_window(window, win_length, fftbins=True)
        # Keyword `size` is accepted by old and new librosa releases alike.
        fft_window = pad_center(fft_window, size=filter_length)
        fft_window = torch.from_numpy(fft_window).float()

        # Window the bases.
        forward_basis *= fft_window
        inverse_basis *= fft_window

    self.register_buffer('forward_basis', forward_basis.float())
    self.register_buffer('inverse_basis', inverse_basis.float())

    if feat_stat is not None:
        assert 'spec_min' in feat_stat.keys()
        assert 'spec_scale' in feat_stat.keys()
        spec_min = feat_stat['spec_min'].view(1, -1, 1)
        spec_scale = feat_stat['spec_scale'].view(1, -1, 1)
        # One statistic per retained frequency bin.
        assert spec_min.size(1) == filter_length // 2 + 1
        assert spec_scale.size(1) == filter_length // 2 + 1
        self.register_buffer('spec_min', spec_min)
        self.register_buffer('spec_scale', spec_scale)
        self.feat_stat = True
    else:
        self.feat_stat = False
def __init__(self, filter_length=800, hop_length=200, win_length=800, window='hann', device="cpu"):
    """Build the STFT bases and a transposed convolution for the inverse.

    Args:
        filter_length: Size of the DFT matrix the bases are derived from.
        hop_length: Hop size; scales the inverse basis and is used as the
            stride of the transposed convolution.
        win_length: Window length before centre-padding to
            ``filter_length``; must not exceed ``filter_length``.
        window: Window specification for ``get_window``; ``None`` skips
            windowing of the bases.
        device: Stored on the instance as ``self.device``; not otherwise
            used in this constructor.

    Raises:
        AssertionError: If a window is requested and
            ``filter_length < win_length``.
    """
    super(STFT, self).__init__()
    self.device = device
    self.filter_length = filter_length
    self.hop_length = hop_length
    self.win_length = win_length
    self.window = window
    self.forward_transform = None

    scale = self.filter_length / self.hop_length
    fourier_basis = np.fft.fft(np.eye(self.filter_length))

    # Real parts of the first `cutoff` rows stacked over their imaginary parts.
    cutoff = int((self.filter_length / 2 + 1))
    fourier_basis = np.vstack([
        np.real(fourier_basis[:cutoff, :]),
        np.imag(fourier_basis[:cutoff, :]),
    ])

    forward_basis = torch.FloatTensor(fourier_basis[:, None, :])
    # .copy() materialises the transposed view before handing it to torch.
    inverse_basis = torch.FloatTensor(
        np.linalg.pinv(scale * fourier_basis).T[:, None, :].copy())

    if window is not None:
        assert (filter_length >= win_length)
        # Get the window and zero centre-pad it to filter_length.
        fft_window = get_window(window, win_length, fftbins=True)
        # Keyword `size` is accepted by old and new librosa releases alike.
        fft_window = pad_center(fft_window, size=filter_length)
        fft_window = torch.from_numpy(fft_window).float()

        # Window the bases.
        forward_basis *= fft_window
        inverse_basis *= fft_window

    self.register_buffer('forward_basis', forward_basis.float())
    self.register_buffer('inverse_basis', inverse_basis.float())

    # The inverse basis gains a trailing axis and replaces the weight of a
    # transposed 2-D convolution.
    # NOTE(review): the constructor arguments are taken from axes 1 and 3 of
    # the weight (both of size 1), so they do not describe the tensor that
    # is assigned afterwards -- confirm this is intentional.
    inv_t_weight = self.inverse_basis.unsqueeze(-1)
    self.inv_t = torch.nn.ConvTranspose2d(
        in_channels=inv_t_weight.shape[1],
        out_channels=inv_t_weight.shape[3],
        kernel_size=inv_t_weight.shape[3],
        stride=self.hop_length,
        padding=0,
        bias=None)
    self.inv_t.weight.data = inv_t_weight
def libstft(y, fs, n_fft=2048, hop_length=None, win_length=None, window='hann', center=None,
            dtype=np.complex64, pad_mode='reflect'):
    """Short-time Fourier transform that also returns the frequency axis.

    Args:
        y: Input signal (audio time series).
        fs: Sample rate, used only to build the frequency axis.
        n_fft: FFT size; every frame has this many samples.
        hop_length: Samples between frames; defaults to ``win_length // 4``.
        win_length: Window length before centre-padding to ``n_fft``;
            defaults to ``n_fft``.
        window: Window specification for ``get_window``.
        center: If truthy, ``y`` is padded by ``n_fft // 2`` on both sides
            so that frames are centred. The default (``None``) does not pad.
        dtype: Complex dtype of the returned matrix.
        pad_mode: ``np.pad`` mode used when ``center`` is truthy.

    Returns:
        Tuple ``(stft_matrix, f)``: the ``(1 + n_fft // 2, n_frames)`` STFT
        matrix and the matching frequencies, evenly spaced from 0 to
        ``fs / 2``.
    """
    # By default, use the entire frame.
    if win_length is None:
        win_length = n_fft

    # Set the default hop, if it's not already specified.
    if hop_length is None:
        hop_length = int(win_length // 4)

    fft_window = get_window(window, win_length, fftbins=True)

    # Pad the window out to n_fft size (`size` is keyword-only in newer
    # librosa releases; the keyword also works in older ones).
    fft_window = util.pad_center(fft_window, size=n_fft)

    # Reshape so that the window can be broadcast.
    fft_window = fft_window.reshape((-1, 1))

    # Check audio is valid.
    util.valid_audio(y)

    # Pad the time series so that frames are centered.
    if center:
        y = np.pad(y, int(n_fft // 2), mode=pad_mode)

    # Window the time series. Frames must be n_fft long to match the padded
    # window; framing with win_length broke the multiplication below
    # whenever win_length < n_fft.
    y_frames = util.frame(y, frame_length=n_fft, hop_length=hop_length)

    # Pre-allocate the STFT matrix.
    stft_matrix = np.empty((int(1 + n_fft // 2), y_frames.shape[1]),
                           dtype=dtype,
                           order='F')

    # How many columns can we fit within MAX_MEM_BLOCK?
    n_columns = int(util.MAX_MEM_BLOCK / (stft_matrix.shape[0] *
                                          stft_matrix.itemsize))

    for bl_s in range(0, stft_matrix.shape[1], n_columns):
        bl_t = min(bl_s + n_columns, stft_matrix.shape[1])
        stft_matrix[:, bl_s:bl_t] = fft.fft(fft_window *
                                            y_frames[:, bl_s:bl_t],
                                            axis=0)[:stft_matrix.shape[0]]

    f = np.linspace(0, np.pi, stft_matrix.shape[0], endpoint=True) * fs / np.pi / 2
    return stft_matrix, f
def window_sumsquare(window, n_frames, hop_length=120, win_length=800, n_fft=800, dtype=float, norm=None):
    """Compute the sum-square envelope of a window at a given hop length.

    Args:
        window: Window specification, as accepted by ``get_window``.
        n_frames: Number of analysis frames.
        hop_length: Number of samples to advance between frames.
        win_length: Length of the window; ``None`` means ``n_fft``.
        n_fft: Length of each analysis frame.
        dtype: Data type of the returned envelope.
        norm: Normalisation mode forwarded to ``librosa_util.normalize``.

    Returns:
        Array of length ``n_fft + hop_length * (n_frames - 1)`` holding the
        overlap-added squared window.
    """
    if win_length is None:
        win_length = n_fft

    n = n_fft + hop_length * (n_frames - 1)
    x = np.zeros(n, dtype=dtype)

    # Squared window, zero-padded to the frame length.
    win_sq = get_window(window, win_length, fftbins=True)
    win_sq = librosa_util.normalize(win_sq, norm=norm) ** 2
    # `size` by keyword: it is keyword-only in newer librosa releases.
    win_sq = librosa_util.pad_center(win_sq, size=n_fft)

    # Overlap-add one squared window per frame.
    for i in range(n_frames):
        sample = i * hop_length
        x[sample:min(n, sample + n_fft)] += win_sq[:max(0, min(n_fft, n - sample))]
    return x
def frames_stft(y_frames, n_fft=2048, win_length=None, window='hann', dtype=np.complex64):
    """
    Adapted from librosa for frame input. NOTE: not centered anymore.

    Args:
        y_frames: Already-framed signal; frames are stored as columns and
            each column is multiplied by the ``n_fft``-long window.
        n_fft: FFT size.
        win_length: Window length before centre-padding to ``n_fft``;
            defaults to ``n_fft``.
        window: Window specification for ``get_window``.
        dtype: Complex dtype of the returned matrix.

    Returns:
        STFT matrix of shape ``(1 + n_fft // 2, y_frames.shape[1])``.
    """
    # By default, use the entire frame.
    if win_length is None:
        win_length = n_fft

    fft_window = get_window(window, win_length, fftbins=True)

    # Pad the window out to n_fft size (`size` is keyword-only in newer
    # librosa releases; the keyword also works in older ones).
    fft_window = util.pad_center(fft_window, size=n_fft)

    # Reshape so that the window can be broadcast.
    fft_window = fft_window.reshape((-1, 1))

    # Pre-allocate the STFT matrix.
    stft_matrix = np.empty((int(1 + n_fft // 2), y_frames.shape[1]),
                           dtype=dtype,
                           order='F')

    # How many columns can we fit within MAX_MEM_BLOCK?
    n_columns = int(util.MAX_MEM_BLOCK / float(stft_matrix.shape[0] *
                                                stft_matrix.itemsize))

    for bl_s in range(0, stft_matrix.shape[1], n_columns):
        bl_t = min(bl_s + n_columns, stft_matrix.shape[1])
        stft_matrix[:, bl_s:bl_t] = fft.fft(fft_window *
                                            y_frames[:, bl_s:bl_t],
                                            axis=0)[:stft_matrix.shape[0]]
    return stft_matrix
def __init__(self, n_fft, hop_len, win_len):
    """Precompute the Hann-windowed Fourier basis.

    Parameters
    ----------
    n_fft : int > 0 [scalar]
        Number of components in the fast Fourier transform (FFT).
    hop_len : int > 0 [scalar]
        Hop between successive STFT columns; only stored here.
    win_len : int > 0 [scalar]
        Length of the `hann` window, which is zero-padded to `n_fft`.
        `n_fft` >= `win_len` is required.
    """
    super(PytorchSTFT, self).__init__()

    self.n_fft = int(n_fft)
    self.hop_len = int(hop_len)
    self.win_len = int(win_len)
    assert self.n_fft >= self.win_len
    self.cutoff_freq = self.n_fft // 2 + 1

    # Real rows of the truncated DFT matrix stacked on its imaginary rows.
    dft = fft(np.eye(self.n_fft))
    kept = dft[:self.cutoff_freq, :]
    stacked = np.vstack([np.real(kept), np.imag(kept)])
    basis = torch.Tensor(stacked[:, np.newaxis, :])

    # Hann window, centre-padded up to the FFT size.
    hann = get_window(window='hann', Nx=self.win_len)
    padded = pad_center(data=hann, size=self.n_fft)
    window_tensor = torch.Tensor(padded.astype(np.float32))

    self.basis = basis * window_tensor
''' Zero crossing rates and fundamental frequencies must be computed before normalizing the data, otherwise we are not calculating what we actually want. For ZCR no value crosses 0 after normalizing and the fundamentals won't correspond to the actual frequencies in hertz. ''' zero_crossing_rates = zero_crossing_rate( time_series, frame_length=int(frame_ms * sr_ms), hop_length=int(sliding_ms * sr_ms), center=True) frames = frame(time_series, frame_length=int(sr_ms * frame_ms), hop_length=int(sr_ms * sliding_ms)) frames = pad_center(frames, size=zero_crossing_rates.shape[1], axis=1) fundamentals = fundamental(frames, sr) ''' We normalize with respect to the maximum and minimum found across the corpus. ''' time_series = (time_series - min_max[meta_file][0]) / ( min_max[meta_file][1] - min_max[meta_file][0]) mfccs = mfcc(time_series, sr=sr, n_mfcc=12, n_fft=int(frame_ms * sr_ms), hop_length=int(sliding_ms * sr_ms)) d_mfccs = delta(mfccs, width=3, order=1) frames = frame(time_series,
def istft_noDiv(stft_matrix, hop_length=None, win_length=None, window=None,
                center=True, dtype=np.float32):
    """Inverse short-time Fourier transform without window-sum normalisation.

    Copied from librosa's spectrum.py, removing the division by the summed
    squared window, which shouldn't be necessary and can cause problems in
    reconstruction. Each frame is inverse-transformed, multiplied by the
    scaled synthesis window and overlap-added.

    Parameters
    ----------
    stft_matrix : np.ndarray [shape=(1 + n_fft/2, t)]
        STFT matrix from `stft`
    hop_length : int > 0 [scalar]
        Number of samples between STFT columns.
        If unspecified, defaults to `win_length / 4`.
    win_length : int <= n_fft = 2 * (stft_matrix.shape[0] - 1)
        Length of the synthesis window. If unspecified, defaults to `n_fft`.
    window : None, function, np.ndarray [shape=(n_fft,)]
        - None (default): use an asymmetric Hann window
        - a window function, called as `window(win_length)`
        - a user-specified window vector of length `n_fft`
    center : boolean
        - If `True`, `n_fft // 2` samples are trimmed from both ends of
          the result (frames are assumed centered).
        - If `False`, nothing is trimmed.
    dtype : numeric type
        Real numeric type for `y`. Default is 32-bit float.

    Returns
    -------
    y : np.ndarray [shape=(n,)]
        time domain signal reconstructed from `stft_matrix`

    Raises
    ------
    ParameterError
        If `window` is supplied as a vector whose length is not `n_fft`
    """
    n_fft = 2 * (stft_matrix.shape[0] - 1)

    # By default, use the entire frame.
    if win_length is None:
        win_length = n_fft

    # Set the default hop, if it's not already specified.
    if hop_length is None:
        hop_length = int(win_length / 4)

    if window is None:
        # Default is an asymmetric Hann window. `scipy.signal.hann` was
        # removed in SciPy 1.13; the `windows` namespace is the supported
        # location.
        ifft_window = scipy.signal.windows.hann(win_length, sym=False)

    elif six.callable(window):
        # User supplied a windowing function.
        ifft_window = window(win_length)

    else:
        # User supplied a window vector. Make it into an array.
        ifft_window = np.asarray(window)

        # Verify that the shape matches.
        if ifft_window.size != n_fft:
            raise ParameterError('Size mismatch between n_fft and window size')

    # Pad out to match n_fft (`size` is keyword-only in newer librosa
    # releases; the keyword also works in older ones).
    ifft_window = util.pad_center(ifft_window, size=n_fft)

    # Scale the window.
    ifft_window = ifft_window * (2.0 / (win_length / hop_length))

    n_frames = stft_matrix.shape[1]
    expected_signal_len = n_fft + hop_length * (n_frames - 1)
    y = np.zeros(expected_signal_len, dtype=dtype)

    for i in range(n_frames):
        sample = i * hop_length
        spec = stft_matrix[:, i].flatten()
        # Rebuild the full-length spectrum from the one-sided bins.
        spec = np.concatenate((spec.conj(), spec[-2:0:-1]), 0)
        ytmp = ifft_window * fft.ifft(spec).real

        y[sample:(sample + n_fft)] = y[sample:(sample + n_fft)] + ytmp

    # Deliberately no normalisation by the summed squared window here.

    if center:
        y = y[int(n_fft // 2):-int(n_fft // 2)]

    return y
def stft(y, n_fft=2048, hop_length=None, win_length=None, window='hann',
         center=True, dtype=np.complex64, pad_mode='reflect'):
    """Short-time Fourier transform (STFT) using a ``vorbis`` window.

    Returns a complex-valued matrix D such that
        `np.abs(D[f, t])` is the magnitude of frequency bin `f`
        at frame `t`

        `np.angle(D[f, t])` is the phase of frequency bin `f`
        at frame `t`

    Parameters
    ----------
    y : np.ndarray [shape=(n,)], real-valued
        the input signal (audio time series)
    n_fft : int > 0 [scalar]
        FFT window size
    hop_length : int > 0 [scalar]
        number of samples between STFT columns.
        If unspecified, defaults to `win_length // 4`.
    win_length : int <= n_fft [scalar]
        Length of the window, which is then padded with zeros to match
        `n_fft`. If unspecified, defaults to ``win_length = n_fft``.
    window : ignored
        Kept for signature compatibility. The window is always
        `vorbis(win_length)`; the `get_window` call is disabled.
    center : boolean
        - If `True`, the signal `y` is padded by `n_fft // 2` on both
          sides so that frames are centered.
        - If `False`, `y` is framed as given.
    dtype : numeric type
        Complex numeric type for `D`. Default is 64-bit complex.
    pad_mode : string
        If `center=True`, the padding mode to use at the edges of the
        signal. By default, STFT uses reflection padding.

    Returns
    -------
    D : np.ndarray [shape=(1 + n_fft/2, t), dtype=dtype]
        STFT matrix
    """
    # By default, use the entire frame.
    if win_length is None:
        win_length = n_fft

    # Set the default hop, if it's not already specified.
    if hop_length is None:
        hop_length = int(win_length // 4)

    #fft_window = get_window(window, win_length, fftbins=True)
    fft_window = vorbis(win_length)

    # Pad the window out to n_fft size (`size` is keyword-only in newer
    # librosa releases; the keyword also works in older ones).
    fft_window = util.pad_center(fft_window, size=n_fft)

    # Reshape so that the window can be broadcast.
    fft_window = fft_window.reshape((-1, 1))

    # Check audio is valid.
    util.valid_audio(y)

    # Pad the time series so that frames are centered.
    if center:
        y = np.pad(y, int(n_fft // 2), mode=pad_mode)

    # Window the time series.
    y_frames = util.frame(y, frame_length=n_fft, hop_length=hop_length)

    # Pre-allocate the STFT matrix.
    stft_matrix = np.empty((int(1 + n_fft // 2), y_frames.shape[1]),
                           dtype=dtype,
                           order='F')

    # How many columns can we fit within MAX_MEM_BLOCK?
    n_columns = int(util.MAX_MEM_BLOCK / (stft_matrix.shape[0] *
                                          stft_matrix.itemsize))

    for bl_s in range(0, stft_matrix.shape[1], n_columns):
        bl_t = min(bl_s + n_columns, stft_matrix.shape[1])

        stft_matrix[:, bl_s:bl_t] = fft.fft(fft_window *
                                            y_frames[:, bl_s:bl_t],
                                            axis=0)[:stft_matrix.shape[0]]

    return stft_matrix
import librosa
import numpy as np
import librosa.util as util
from librosa.filters import get_window

audio_path = "../AudioData/audio/D4_750.wav"
noise_path = "../AudioData/noise/Pink Noise.wav"

# Read the audio file.
y, sr = librosa.load(audio_path)

# Split the audio into frames.
win_len = n_fft = 200
hop_length = 80

# Pad the time series so that frames are centered.
y = np.pad(y, int(n_fft // 2), mode='reflect')

# Window the time series.
y_frames = util.frame(y, frame_length=n_fft, hop_length=hop_length, axis=0)

# Get the window coefficients: print the symmetric (fftbins=False) and the
# periodic (fftbins=True) variant of a 10-point window for comparison.
fft_window = get_window('hamm', 10, fftbins=False)
# fft_window = fft_window[1:-1]
print(fft_window)
fft_window = get_window('hamm', 10, fftbins=True)
print(fft_window)

# Pad the window out to n_fft size (`size` is keyword-only in newer
# librosa releases; the keyword also works in older ones).
fft_window = util.pad_center(fft_window, size=n_fft)

# Reshape so that the window can be broadcast.
fft_window = fft_window.reshape((-1, 1))
def pad_center(self, audio_data):
    """Pad ``audio_data`` via ``util.pad_center`` using ``self.n_fft`` as the target size."""
    target_size = self.n_fft
    padded = util.pad_center(audio_data, target_size)
    return padded
def hht(self, y, hop_length=None, win_length=None, center=True,
        dtype=np.complex64, pad_mode='reflect'):
    """Hilbert-Huang transform (HHT)

    Parameters
    ----------
    y : np.ndarray [shape=(n,)], real-valued
        the input signal (audio time series)
    hop_length : int > 0 [scalar]
        number of samples between successive frames.
        If unspecified, defaults to `win_length / 2`.
    win_length : int [scalar]
        Only used to derive the default `hop_length`; if unspecified it
        equals `self.n_hht`. Frames themselves are always `self.n_hht`
        samples long.
    center : boolean
        - If `True`, the signal `y` is padded by `self.n_hht - 1` samples
          on both sides before framing.
        - If `False`, `y` is framed as given.
    dtype : numeric type
        Numeric type of both returned matrices.
    pad_mode : string
        If `center=True`, the padding mode to use at the edges of the
        signal. By default, HHT uses reflection padding.

    Returns
    -------
    hht_matrix : np.ndarray [shape=(27, t), dtype=dtype]
        Per-frame output of `self.hht_based_feature`.
    bjp_matrix : np.ndarray [shape=(n_hht-1, t), dtype=dtype]
        Per-frame `bjp` value returned by `get_hht`.
    """
    # By default, use the entire frame.
    if win_length is None:
        win_length = self.n_hht

    # Set the default hop, if it's not already specified.
    if hop_length is None:
        hop_length = int(win_length / 2)

    hht_window = self.window

    # Pad the window out to n_hht size (`size` is keyword-only in newer
    # librosa releases; the keyword also works in older ones).
    hht_window = util.pad_center(hht_window, size=self.n_hht)

    # Reshape so that the window can be broadcast.
    hht_window = hht_window.reshape((-1, 1))

    # Check audio is valid.
    util.valid_audio(y)

    # Pad the time series so that frames are centered.
    if center:
        y = np.pad(y, self.n_hht - 1, mode=pad_mode)

    # Window the time series; after the transpose each row is one frame.
    y_frames = util.frame(y, frame_length=self.n_hht, hop_length=hop_length).T

    # Pre-allocate the HHT matrices, one column per frame.
    hht_matrix = np.empty((27, y_frames.shape[0]),
                          dtype=dtype,
                          order='F')
    bjp_matrix = np.empty((self.n_hht - 1, y_frames.shape[0]),
                          dtype=dtype,
                          order='F')

    for bl_s in range(hht_matrix.shape[1]):
        frame_signal = hht_window[:, 0] * y_frames[bl_s, :]
        A, f, bjp = get_hht(frame_signal, self.fs)
        hht_matrix[:, bl_s] = self.hht_based_feature(A, f * self.fs, bjp)
        bjp_matrix[:, bl_s] = bjp

    return hht_matrix, bjp_matrix
frame_step = config['frame_step'] n_fft = config['n_fft'] n_mels = config['mfcc_bank_cnt'] fmin = config['fmin'] fmax = config['fmax'] dtype = config.get('dtype', "int") high_prec = config.get('use_high_prec', False) or dtype == "fix32_scal" use_power = False rad4 = round(math.log(n_fft // 2, 4)) == math.log(n_fft // 2, 4) ndct = config.get('n_dct', False) from librosa.filters import get_window from librosa import util librosa_fft_window = get_window("hann", frame_size, fftbins=True) # Pad the window out to n_fft size librosa_fft_window = util.pad_center(librosa_fft_window, n_fft) stft = librosa.core.spectrum.stft(data, n_fft, frame_step, frame_size, center=False, pad_mode="constant") spect = np.abs(stft)**(1 if not use_power else 2) mel_basis = librosa.filters.mel(samplerate, n_fft, n_mels, fmin, fmax) mel_spect = np.dot(mel_basis, spect) logmel = power_to_db(mel_spect, top_db=None) mfcc = scipy.fftpack.dct(logmel, axis=0, type=2, norm=None) with open("ground_truth.h", "w") as f: f.write(f"float ground_truth[] = {{\n") for elem in mfcc.T.flatten():
def test(y, n_fft=2048, hop_length=None, win_length=None, window=None, center=True, dtype=np.complex64):
    """Experimental TensorFlow port of librosa's STFT.

    The window is built with NumPy/SciPy, while padding, framing and the
    FFT go through ``tf`` calls and the helpers ``frame`` and
    ``convertTFtoNP``, which are defined outside this block. The
    commented-out lines are the NumPy statements being ported.

    NOTE(review): ``dtype`` is accepted but never used in the body.
    NOTE(review): ``y_frames`` is only assigned when ``center`` is truthy,
    yet it is used unconditionally afterwards -- confirm the intended
    nesting of the framing step.
    """
    import scipy
    import six
    from librosa import util

    # By default, use the entire frame
    if win_length is None:
        win_length = n_fft
        # win_length = tf.constant(n_fft)

    # Set the default hop, if it's not already specified
    if hop_length is None:
        hop_length = int(win_length / 4)
        # hop_length = win_length/4
        # hop_length.to_int64()

    if window is None:
        # Default is an asymmetric Hann window
        fft_window = scipy.signal.hann(win_length, sym=False)
        # fft_window = tf.constant(scipy.signal.hann(convertTFtoNP(win_length), sym=False))

    elif six.callable(window):
        # User supplied a window function
        fft_window = window(win_length)

    else:
        # User supplied a window vector.
        # Make sure it's an array:
        fft_window = np.asarray(window)

        # validate length compatibility
        # if fft_window.size != n_fft:
        #     raise ParameterError('Size mismatch between n_fft and len(window)')

    # Pad the window out to n_fft size
    fft_window = util.pad_center(fft_window, n_fft)
    # fft_window.assign(util.pad_center(convertTFtoNP(fft_window), n_fft))

    # Reshape so that the window can be broadcast
    fft_window = fft_window.reshape((-1, 1))
    # tf.reshape(fft_window, (-1,1))

    if center:
        #util.valid_audio(y)
        #y_ = np.pad(convertTFtoNP(y), int(n_fft // 2), mode='reflect')
        padding = int(n_fft // 2)
        y_frames = tf.Variable(tf.pad(y, [[padding, padding]], mode='REFLECT'))

    # Window the time series.
    #y_frames = util.frame(y_, frame_length=n_fft, hop_length=hop_length)
    #y_frames.assign(librosa.util.frame(convertTFtoNP(y_frames), frame_length=n_fft, hop_length=1))
    y_frames = frame(y_frames, n_fft, hop_length)

    # Pre-allocate the STFT matrix
    #stft_matrix = np.empty((int(1 + n_fft // 2), y_frames.shape[1]),
    #                       dtype=dtype,
    #                       order='F')
    # NOTE(review): the two sizes are passed as separate positional arguments
    # rather than as one shape tuple -- confirm against the tf.zeros signature.
    stft_matrix = tf.Variable(
        tf.zeros(y_frames.get_shape()[1]._value, (int(1 + n_fft // 2)), dtype='float32'))

    # how many columns can we fit within MAX_MEM_BLOCK?
    #n_columns = int(util.MAX_MEM_BLOCK / (stft_matrix.shape[0] *
    #                                      stft_matrix.itemsize))
    n_columns = int(librosa.util.MAX_MEM_BLOCK / (stft_matrix.get_shape()[1]._value * convertTFtoNP(stft_matrix).itemsize))

    #for bl_s in range(0, stft_matrix.shape[1], n_columns):
    for bl_s in range(0, stft_matrix.get_shape()[0]._value, n_columns):
        #bl_t = min(bl_s + n_columns, stft_matrix.shape[1])
        bl_t = min(bl_s + n_columns, stft_matrix.get_shape()[0]._value)

        # RFFT and Conjugate here to match phase from DPWE code
        #stft_matrix[:, bl_s:bl_t] = scipy.fftpack.fft(fft_window *
        #                                              y_frames[:, bl_s:bl_t],
        #                                              axis=0)[:stft_matrix.shape[0]].conj()
        # Rows bl_s..bl_t-1 of the variable are overwritten with the
        # conjugated, truncated FFT of the windowed slice.
        stft_matrix = tf.scatter_update(
            stft_matrix, tf.constant(list(range(bl_s, bl_t, 1))), tf.conj(
                tf.slice(
                    tf.fft(fft_window * tf.slice(
                        y_frames, [0, bl_s], [y_frames.get_shape()[0]._value, bl_t - bl_s])), [0], [stft_matrix.get_shape()[0]._value])))

    return stft_matrix