def extract(file_list, train_scaler=False):
    """Extract features for every file named in ``file_list``.

    Parameters
    ----------
    file_list : str or file-like
        Text file read with ``np.loadtxt(..., dtype='str')``.
    train_scaler : bool, optional
        When true, a ``StandardScaler`` is fitted incrementally on the
        ``'data'`` dataset of each produced feature file and dumped to
        ``scaler.pkl``.

    Returns
    -------
    The result of ``mp_with_pbar`` (iterated as feature file names when
    ``train_scaler`` is set).
    """
    # ndmin=1 keeps a single-line list iterable (loadtxt would otherwise
    # return a 0-d array).
    fns = np.loadtxt(file_list, dtype='str', ndmin=1)

    print("Extracting features")
    mp_func = partial(log_filterbank_energy,
                      output_dir="features",
                      sampling_rate=sampling_rate,
                      nfft=nfft,
                      n_freq_bins=n_freq_bins,
                      spectral_frame_length_s=spectral_frame_length_s,
                      frame_length_s=frame_length_s,
                      hop_length_s=hop_length_s,
                      force=False,
                      mel_scale=True)
    feature_fns = mp_with_pbar(mp_func, fns, mp.cpu_count())

    if train_scaler:
        print("Training scaler")
        scaler = StandardScaler()
        for fn in tqdm(feature_fns, total=len(feature_fns)):
            with h5py.File(fn, 'r') as f:
                # Channel 0 of the stored data, transposed before fitting.
                scaler.partial_fit(f['data'][:, :, 0].T)
        joblib.dump(scaler, "scaler.pkl")

    return feature_fns
def fft_and_melscale(song, nhop=512, nffts=(1024, 2048, 4096), mel_nband=80,
                     mel_freqlo=27.5, mel_freqhi=16000.0,
                     include_zero_cross=False):
    """Compute log mel-band power features at several FFT sizes.

    fft: for every size in ``nffts`` the samples are cut into frames of that
    length (hop ``nhop``), windowed and passed through a fast Fourier
    transform.
    melscale: the frequency dimension is reduced with a mel filterbank and
    the log10 of the result is taken (``20 * log10(x + 0.1)``).

    ``song`` must expose ``samplerate`` and ``data``. When
    ``include_zero_cross`` is true, the zero-crossing indices of
    ``song.data`` are stored on ``song.zero_crossing`` and printed.

    Returns ``np.array`` of the per-FFT-size feature matrices.
    """
    feat_channels = []
    for nfft in nffts:
        window = signal.blackmanharris(nfft)
        # Keyword arguments: librosa >= 0.10 makes these keyword-only.
        filt = mel(sr=song.samplerate, n_fft=nfft, n_mels=mel_nband,
                   fmin=mel_freqlo, fmax=mel_freqhi)
        # get normal frame
        frame = make_frame(song.data, nhop, nfft)
        # melscaling: keep the non-negative frequency bins only
        spectrum = fft(window * frame)[:, :nfft // 2 + 1]
        mel_power = np.dot(filt, np.transpose(np.abs(spectrum) ** 2))
        feat_channels.append(20 * np.log10(mel_power + 0.1))

    # Independent of the FFT size, so computed once after the loop.
    if include_zero_cross:
        song.zero_crossing = np.where(np.diff(np.sign(song.data)))[0]
        print(song.zero_crossing)

    return np.array(feat_channels)
def extract_f0_func_audiofile(audio_file, gender='M'):
    """Return a normalised log-mel spectrogram and normalised F0 of a file.

    Parameters
    ----------
    audio_file : path or file-like
        Passed to ``sf.read``; only the first channel of multi-channel
        audio is used.
    gender : {'M', 'F'}
        Selects the F0 search range given to ``sptk.rapt``
        (50-250 for 'M', 100-600 for 'F').

    Returns
    -------
    S, f0_norm
        ``S`` is ``(D_db + 100) / 100`` of the mel magnitudes; ``f0_norm`` is
        the output of ``speaker_normalization``.

    Raises
    ------
    ValueError
        If ``gender`` is neither 'M' nor 'F'.

    NOTE(review): the mel basis and the high-pass filter are designed for
    16000 Hz while the file is read at its own rate ``fs`` - confirm inputs
    are always 16 kHz.
    """
    if gender == 'M':
        lo, hi = 50, 250
    elif gender == 'F':
        lo, hi = 100, 600
    else:
        raise ValueError("gender must be 'M' or 'F', got {!r}".format(gender))

    # Keyword arguments: librosa >= 0.10 makes these keyword-only.
    mel_basis = mel(sr=16000, n_fft=1024, fmin=90, fmax=7600, n_mels=80).T
    min_level = np.exp(-100 / 20 * np.log(10))
    b, a = butter_highpass(30, 16000, order=5)
    prng = RandomState(0)

    x, fs = sf.read(audio_file)
    if len(x.shape) >= 2:
        x = x[:, 0]
    # Append one tiny sample when the length is a multiple of the hop (256).
    if x.shape[0] % 256 == 0:
        x = np.concatenate((x, np.array([1e-06])), axis=0)

    y = signal.filtfilt(b, a, x)
    # Scale and add a very small amount of seeded noise.
    wav = y * 0.95 + (prng.rand(y.shape[0]) - 0.5) * 1e-06

    D = pySTFT(wav).T
    D_mel = np.dot(D, mel_basis)
    D_db = 20 * np.log10(np.maximum(min_level, D_mel)) - 16
    S = (D_db + 100) / 100

    f0_rapt = sptk.rapt(wav.astype(np.float32) * 32768, fs, 256,
                        min=lo, max=hi, otype=2)
    # Frames equal to -1e10 are excluded from the statistics.
    index_nonzero = (f0_rapt != -1e10)
    voiced = f0_rapt[index_nonzero]
    mean_f0, std_f0 = np.mean(voiced), np.std(voiced)
    f0_norm = speaker_normalization(f0_rapt, index_nonzero, mean_f0, std_f0)
    return S, f0_norm
def log_melsp_01(x, sr=16000, n_fft=1024, hop_length=256, n_mels=80,
                 fmin=80, fmax=8000):
    """Return a log-mel spectrogram of ``x`` scaled and clipped to [0, 1].

    Parameters
    ----------
    x : array
        Waveform; filtered with ``signal.filtfilt`` and indexed by
        ``shape[0]`` for the noise term.
    sr : int
        Sampling rate used for both the mel basis and the high-pass filter.
    n_fft, hop_length : int
        Forwarded to ``pySTFT`` as ``fft_length`` / ``hop_length``.
    n_mels, fmin, fmax
        Mel filterbank parameters.

    Returns
    -------
    np.ndarray (float32)
        ``clip((20*log10(max(min_level, mel)) - 16 + 100) / 100, 0, 1)``.

    The added noise comes from an unseeded ``RandomState``, so results are
    not bit-for-bit reproducible between calls.
    """
    # Keyword arguments: librosa >= 0.10 makes these keyword-only.
    mel_basis = mel(sr=sr, n_fft=n_fft, fmin=fmin, fmax=fmax, n_mels=n_mels).T
    min_level = np.exp(-100 / 20 * np.log(10))
    # Design the 30 Hz high-pass for the actual sampling rate rather than a
    # fixed 16 kHz (identical for the default sr=16000).
    b, a = butter_highpass(30, sr, order=5)

    # Remove drifting noise
    y = signal.filtfilt(b, a, x)

    # Add a little random noise for model robustness
    prng = RandomState()
    wav = y * 0.96 + (prng.rand(y.shape[0]) - 0.5) * 1e-06

    # Compute spectrogram
    D = pySTFT(wav, fft_length=n_fft, hop_length=hop_length).T

    # Convert to mel and normalize
    D_mel = np.dot(D, mel_basis)
    D_db = 20 * np.log10(np.maximum(min_level, D_mel)) - 16
    S = np.clip((D_db + 100) / 100, 0, 1)
    return S.astype(np.float32)
def pncc(audio_wave, n_fft=1024, sr=16000, window="hamming", n_mels=40,
         n_pncc=13, weight_N=4, power=2, dct=True):
    """Run the PNCC processing chain on ``audio_wave``.

    The signal is pre-emphasised (``[1.0, -0.97]``), transformed with
    ``stft``, projected on a mel filterbank and then passed through the
    project's medium-time power, asymmetric filtering, rectification,
    temporal masking, weight smoothing, normalisation and power-law helpers.

    Parameters
    ----------
    dct : bool
        When true, return the transposed product of ``filters.dct(n_pncc, ...)``
        with the non-linearity output; otherwise return the transposed
        non-linearity output itself.
    """
    pre_emphasis_signal = scipy.signal.lfilter([1.0, -0.97], 1, audio_wave)
    stft_pre_emphasis_signal = np.abs(
        stft(pre_emphasis_signal, n_fft=n_fft, window=window)) ** power
    # ``sr`` passed by keyword: librosa >= 0.10 makes it keyword-only.
    mel_filter = np.abs(filters.mel(sr=sr, n_fft=n_fft, n_mels=n_mels)) ** power
    power_stft_pre_signal = np.dot(stft_pre_emphasis_signal.T, mel_filter.T)

    q_ = medium_time_power_calculation(power_stft_pre_signal)
    q_le = asymmetric_lawpass_filtering(q_, 0.999, 0.5)
    pre_q_0 = q_ - q_le
    q_0 = halfwave_rectification(pre_q_0)
    q_f = asymmetric_lawpass_filtering(q_0)
    q_th = temporal_masking(q_0)
    r_sp = after_temporal_masking(q_th, q_f)
    r_ = switch_excitation_or_non_excitation(r_sp=r_sp, q_f=q_f, q_le=q_le,
                                             q_power_stft_pre_signal=q_)
    s_ = weight_smoothing(r_=r_, q_=q_, N=weight_N)
    t_ = time_frequency_normalization(p_=power_stft_pre_signal, s_=s_)
    u_ = mean_power_normalization(t_, r_)
    v_ = power_function_nonlinearity(u_)

    if not dct:
        return v_.T
    # Only build the DCT projection when it is actually requested.
    dct_v = np.dot(filters.dct(n_pncc, v_.shape[1]), v_.T)
    return dct_v.T
def __init__(
        self,
        n_fft: int = 512,
        n_mels: int = 80,
        sample_rate: int = 16000,
        hop_length: int = 200,
        f_max=8000,
        f_min=0,
        power=2.0,
        win_length=None,
        window='hann',
        center=True,
        pad_mode='reflect',
        norm=None,
        htk=True
):
    """Store the STFT settings and precompute the mel basis and FFT window.

    ``win_length`` and ``center`` are accepted but not used by this
    constructor: the window length is always ``n_fft``.
    """
    # Plain configuration values.
    self.n_fft, self.sample_rate = n_fft, sample_rate
    self.pad_mode, self.hop_length = pad_mode, hop_length
    self.power = power
    self.win_length = n_fft

    # Mel filterbank options collected in one place.
    mel_options = dict(n_mels=n_mels, fmin=f_min, fmax=f_max,
                       norm=norm, htk=htk)
    self.mel_basis = filters.mel(sr=sample_rate, n_fft=n_fft, **mel_options)

    # Window reshaped to a column vector.
    window_1d = get_window(window, self.win_length, fftbins=True)
    self.fft_window = window_1d.reshape((-1, 1))
def extract(audio_fn):
    """Load one audio file and return its log (mel) spectrogram.

    The audio is loaded at ``sampling_rate``, normalised, cut or
    zero-padded to exactly ``duration`` samples and transformed with
    ``sp.signal.spectrogram``. When ``mel_scale`` is set the spectrogram is
    projected on a mel filterbank. The natural log of ``1e-8 + Sxx`` is
    returned with a trailing singleton axis.

    Returns ``None`` (after logging the exception) when loading or
    normalising fails.
    """
    # Read and resample the audio
    try:
        data, _ = librosa.core.load(audio_fn, sr=sampling_rate)
        data = normalize(data)
    except Exception as e:
        logging.exception(e)
        return None

    # Ensure length
    n_missing = duration - len(data)
    if n_missing < 0:
        data = data[:duration]
    elif n_missing > 0:
        # Pad at the end only. A one-element pad width pads BOTH ends by
        # that amount, which would overshoot ``duration``.
        data = np.pad(data, (0, n_missing), mode='constant',
                      constant_values=0)

    # Spectrogram
    _freqs, _times, Sxx = sp.signal.spectrogram(data,
                                                fs=sampling_rate,
                                                window=window,
                                                nperseg=frame_length,
                                                noverlap=overlap_length,
                                                nfft=nfft)

    if mel_scale:
        # spectrogram -> mel filterbank energies
        f_to_mel = filters.mel(sr=sampling_rate, n_fft=nfft,
                               n_mels=n_freq_bins)
        Sxx = f_to_mel.dot(Sxx)

    Sxx = np.expand_dims(np.log(1e-8 + Sxx), axis=-1)
    return Sxx
def frft_MFCC(S, fs, n_mfcc=13, n_mels=128, dct_type=2, norm='ortho',
              power=2, pic=None):
    """Derive cepstral coefficients from the spectrogram ``S``.

    The FFT size is inferred from the first axis of ``S``
    (``2 * (rows - 1)``). ``|S| ** power`` is projected on a mel filterbank,
    converted with ``power_to_db`` and passed through a DCT along axis 0;
    the first ``n_mfcc`` rows are returned. When ``pic`` is given the result
    is also handed to ``visual.specgram`` with ``pic + '_frft_mfcc'``.
    """
    fft_size = 2 * (S.shape[0] - 1)

    # Build a Mel filter and apply it to the powered magnitudes
    filterbank = filters.mel(sr=fs, n_fft=fft_size, n_mels=n_mels,
                             fmin=0.0, fmax=None, htk=False, norm=1)
    mel_power = np.dot(filterbank, np.abs(S) ** power)

    log_mel = lib.core.power_to_db(mel_power)
    cepstra = fftpack.dct(log_mel, axis=0, type=dct_type, norm=norm)
    feature = cepstra[:n_mfcc]

    if pic is None:
        return feature

    visual.specgram(X=feature, title='frft_mfcc', xlabel='Time',
                    ylabel='frft_mfccs', pic=pic + '_frft_mfcc')
    return feature
def apply_melfb(spec, fs, n_mels=128, amin=1e-10):
    """Project ``spec`` on a mel filterbank, flooring values at ``amin``.

    The FFT size is inferred from the last axis of ``spec``
    (``bins * 2 - 2``). Both the input and the projected result are floored
    with ``np.maximum(..., amin)``.
    """
    fbin = spec.shape[-1]
    n_fft = fbin * 2 - 2
    # Keyword arguments: librosa >= 0.10 makes these keyword-only.
    mfb = mel(sr=fs, n_fft=n_fft, n_mels=n_mels)
    spec = np.maximum(spec, amin)
    return np.maximum(spec @ mfb.T, amin)
def fftandmelscaleikkatsu(song, nhop=512, nffts=(1024, 2048, 4096),
                          mel_nband=80, mel_freqlo=27.5, mel_freqhi=16000.0,
                          include_zero_cross=False):
    """Compute log mel-band power features at several FFT sizes.

    For every size in ``nffts`` the data is framed with ``Frame`` (hop
    ``nhop``), windowed, transformed, projected on a mel filterbank and
    converted with ``20 * log10(x + 0.1)``. Frame and feature shapes are
    printed for each size.

    ``song`` must expose ``samplerate`` and ``data``. When
    ``include_zero_cross`` is true, the zero-crossing indices of
    ``song.data`` are stored on ``song.zero_crossing`` and printed.

    Returns ``np.array`` of the per-FFT-size feature matrices.
    """
    feat_channels = []
    for nfft in nffts:
        window = signal.blackmanharris(nfft)
        # Keyword arguments: librosa >= 0.10 makes these keyword-only.
        filt = mel(sr=song.samplerate, n_fft=nfft, n_mels=mel_nband,
                   fmin=mel_freqlo, fmax=mel_freqhi)
        frame = Frame(song.data, nhop, nfft)
        print(frame.shape)
        # Keep the non-negative frequency bins only.
        spectrum = fft(window * frame)[:, :nfft // 2 + 1]
        mel_power = np.dot(filt, np.transpose(np.abs(spectrum) ** 2))
        processedframe = 20 * np.log10(mel_power + 0.1)
        print(processedframe.shape)
        feat_channels.append(processedframe)

    # Independent of the FFT size, so computed once after the loop.
    if include_zero_cross:
        song.zero_crossing = np.where(np.diff(np.sign(song.data)))[0]
        print(song.zero_crossing)

    return np.array(feat_channels)
def test_mel_filterbank(N=15):
    """Compare ``mel_filterbank`` with the reference ``mel`` on random setups.

    Runs ``N`` trials with a fixed seed; each trial draws a sampling rate,
    filter count, window length and normalisation flag and asserts that both
    implementations agree (``assert_almost_equal``).
    """
    np.random.seed(12345)

    for _ in range(N):
        # The draw order is kept so the seeded sequence is unchanged.
        fs = np.random.randint(50, 10000)
        n_filters = np.random.randint(2, 20)
        window_len = np.random.randint(10, 100)
        norm = np.random.randint(2)

        mine = mel_filterbank(window_len, n_filters, fs, min_freq=0,
                              max_freq=None, normalize=bool(norm))
        # ``sr`` passed by keyword: librosa >= 0.10 makes it keyword-only.
        theirs = mel(
            sr=fs,
            n_fft=window_len,
            n_mels=n_filters,
            htk=True,
            norm=norm if norm == 1 else None,
        )

        np.testing.assert_almost_equal(mine, theirs)
        print("PASSED")
def __init__(self, filter_length=1024, hop_length=256, win_length=1024,
             n_mel_channels=80, sampling_rate=22050, mel_fmin=0.0,
             mel_fmax=8000.0):
    """Create the STFT front-end and the mel basis.

    ``filter_length``, ``hop_length`` and ``win_length`` are forwarded to
    ``STFT``; the mel basis is built for ``sampling_rate`` /
    ``filter_length`` with ``n_mel_channels`` bands between ``mel_fmin`` and
    ``mel_fmax`` and stored as ``nd.array``.
    """
    super(TacotronSTFT, self).__init__()
    self.n_mel_channels = n_mel_channels
    self.sampling_rate = sampling_rate
    self.stft_fn = STFT(filter_length, hop_length, win_length)
    # Keyword arguments: librosa >= 0.10 makes these keyword-only.
    mel_basis = mel(sr=sampling_rate, n_fft=filter_length,
                    n_mels=n_mel_channels, fmin=mel_fmin, fmax=mel_fmax)
    self.mel_basis = nd.array(mel_basis)
def _wav_to_spec(self, wav, sample_rate, wav_path=None, introduce_noise=False):
    """Convert wav data to a mel spectrogram.

    Args:
        wav (numpy array): audio data.
            NOTE(review): the noise step indexes ``wav.shape[0]``, so a
            1-d (mono) signal appears to be expected - confirm before
            passing 2-d (stereo) data.
        sample_rate (int): the sampling rate of the wav (``sf.read(...)[1]``).
        wav_path (str): path of the original wav file; when given and a
            resample was necessary, the resampled audio is written back to
            this path.
        introduce_noise (bool): when true, scale the signal by 0.96 and add
            a very small random perturbation drawn from ``self._prng``.

    ``wav`` and ``sample_rate`` can be obtained with
        wavfile, sample_rate = sf.read(os.path.join(input_dir, speaker, fileName))

    Returns:
        np.array: mel spectrogram clipped to the range [0, 1].
    """
    # Keyword arguments: librosa >= 0.10 makes these keyword-only.
    mel_basis = mel(sr=Config.audio_sr,
                    n_fft=Config.n_fft,
                    fmin=Config.fmin,
                    fmax=Config.fmax,
                    n_mels=Config.n_mels).T
    min_level = np.exp(Config.min_level_db / 20 * np.log(10))
    b, a = self._butter_highpass(30, Config.audio_sr, order=5)

    # Resample wav if needed
    if sample_rate != Config.audio_sr:
        wav = librosa.resample(wav, orig_sr=sample_rate,
                               target_sr=Config.audio_sr)
        print(
            f"Wav file with sr {sample_rate} != {Config.audio_sr}, Now resampling to {Config.audio_sr}, then try to write to {wav_path}"
        )
        if wav_path:
            sf.write(wav_path, wav, Config.audio_sr)  # Write resampled file

    # Remove drifting noise
    wav = signal.filtfilt(b, a, wav)

    # Add a little random noise for model robustness
    if introduce_noise:
        log.info("Introducing random noise into wav.file")
        wav = wav * 0.96 + (self._prng.rand(wav.shape[0]) - 0.5) * 1e-06

    # Compute spectrogram
    D = self._pySTFT(wav, fft_length=Config.n_fft,
                     hop_length=Config.hop_length).T

    # Convert to mel and normalize
    D_mel = np.dot(D, mel_basis)
    D_db = 20 * np.log10(np.maximum(min_level, D_mel)) - Config.ref_level_db  # amp to db
    S = np.clip((D_db - Config.min_level_db) / -Config.min_level_db, 0, 1)  # clip between 0-1
    return S
def pncc(audio_wave, n_fft=512, sr=16000, winlen=0.020, winstep=0.010,
         n_mels=128, n_pncc=13, weight_N=4, power=2):
    """Run the PNCC processing chain on ``audio_wave`` and apply a DCT.

    The signal is pre-emphasised (``[1.0, -0.97]``), mixed down with
    ``to_mono``, transformed with a rectangular-window ``stft``
    (``winlen`` / ``winstep`` seconds, ``center=False``), projected on a mel
    filterbank and passed through the project's PNCC helper stages. The
    result is multiplied with the transposed ``filters.dct(n_pncc, ...)``
    basis.

    ``weight_N`` is accepted but not used; the smoothing and normalisation
    helpers receive ``L=n_mels`` instead.
    """
    pre_emphasis_signal = scipy.signal.lfilter([1.0, -0.97], 1, audio_wave)
    mono_wave = to_mono(pre_emphasis_signal.T)

    win_samples = int(sr * winlen)
    stft_pre_emphasis_signal = np.abs(
        stft(mono_wave,
             n_fft=n_fft,
             hop_length=int(sr * winstep),
             win_length=win_samples,
             window=np.ones(win_samples),
             center=False)) ** power

    # ``sr`` passed by keyword: librosa >= 0.10 makes it keyword-only.
    mel_filter = np.abs(filters.mel(sr=sr, n_fft=n_fft, n_mels=n_mels)) ** power
    power_stft_signal = np.dot(stft_pre_emphasis_signal.T, mel_filter.T)

    medium_time_power = medium_time_power_calculation(power_stft_signal)
    lower_envelope = asymmetric_lawpass_filtering(medium_time_power, 0.999, 0.5)
    subtracted_lower_envelope = medium_time_power - lower_envelope
    rectified_signal = halfwave_rectification(subtracted_lower_envelope)
    floor_level = asymmetric_lawpass_filtering(rectified_signal)
    temporal_masked_signal = temporal_masking(rectified_signal)
    final_output = switch_excitation_or_non_excitation(
        temporal_masked_signal, floor_level, lower_envelope, medium_time_power)
    spectral_weight_smoothing = weight_smoothing(
        final_output, medium_time_power, L=n_mels)
    transfer_function = time_frequency_normalization(
        power_stft_signal, spectral_weight_smoothing)
    normalized_power = mean_power_normalization(
        transfer_function, final_output, L=n_mels)
    power_law_nonlinearity = power_function_nonlinearity(normalized_power)

    dct_basis = filters.dct(n_pncc, power_law_nonlinearity.shape[1])
    return np.dot(power_law_nonlinearity, dct_basis.T)
def __init__(self, n_mels, sample_rate, filter_length, hop_length,
             win_length=None, mel_fmin=0.0, mel_fmax=None):
    """Create the STFT module and register the HTK-style mel basis.

    ``filter_length``, ``hop_length`` and ``win_length`` are forwarded to
    ``STFT``. The mel basis (``htk=True``) is converted to a float tensor
    and registered as the buffer ``mel_basis``.
    """
    super(MelSpectrogram, self).__init__()
    self.stft = STFT(filter_length, hop_length, win_length)

    # Keyword arguments: librosa >= 0.10 makes these keyword-only.
    mel_basis = mel(sr=sample_rate, n_fft=filter_length, n_mels=n_mels,
                    fmin=mel_fmin, fmax=mel_fmax, htk=True)
    mel_basis = torch.from_numpy(mel_basis).float()
    self.register_buffer('mel_basis', mel_basis)
def get_filters(config: Dict):
    """Build the mel basis, dB floor and high-pass coefficients.

    Reads ``config["rate"]``, ``config["window"]``, ``config["fmax"]`` and
    ``config["mels"]``.

    Returns:
        tuple: ``(mel_basis, min_level, b, a)`` where ``mel_basis`` is the
        transposed mel filterbank (``fmin=90``), ``min_level`` is the linear
        amplitude equal to -100 dB and ``b, a`` come from
        ``butter_highpass(30, rate, order=5)``.
    """
    # Keyword arguments: librosa >= 0.10 makes these keyword-only.
    mel_basis = mel(sr=config["rate"],
                    n_fft=config["window"],
                    fmin=90,
                    fmax=config["fmax"],
                    n_mels=config["mels"]).T
    min_level = np.exp(-100 / 20 * np.log(10))
    b, a = butter_highpass(30, config["rate"], order=5)
    return mel_basis, min_level, b, a
def __init__(self, frame_stream, specfmt="dB", mels_N=12):
    '''
    DFTStream(frame_stream, specfmt, mels_N)
    Create a stream of discrete Fourier transform (DFT) frames using the
    specified sample frame stream. Only bins up to the Nyquist rate are
    returned in the stream.

    Optional arguments:
    specfmt - DFT output:
        "complex" - return complex DFT results
        "dB" [default] - return power spectrum 20log10(magnitude)
        "mag^2" - magnitude squared spectrum
        "Mel" - melodic scale
    mels_N - Number of Mel filters to use. Only applicable when
        specfmt == "Mel".

    Raises ValueError when specfmt is not one of the supported formats.
    '''
    self.format_types = {"complex": 0, "mag^2": 1, "dB": 2, "Mel": 3}

    self.framer = frame_stream
    self.frame_len = frame_stream.get_framelen_samples()

    try:
        self.format = self.format_types[specfmt]
    except KeyError:
        raise ValueError("Unknown specfmt {}. Use one of [{}]".format(
            specfmt, ", ".join(self.format_types.keys())))

    # Number of frequency bins is the same as the number of bins in the
    # frame
    self.dft_bins = self.frame_len

    # Only bins up to the Nyquist rate are usable. The DFT routine that
    # we are using will return up to and including the Nyquist (half bins
    # plus 1 if even)
    self.Nyquist_Hz = self.framer.get_Fs() / 2.0
    # We add 1.1 instead of 1, see numpy.around for details which
    # np.round uses.
    # Built-in int: the np.int alias was removed in NumPy 1.24.
    self.bins_Nyquist = int(np.round((self.frame_len + 1.1) / 2.0))

    self.window = signal.get_window("hamming", self.frame_len)

    if self.format == self.format_types["Mel"]:
        # Construct Mel filters (keyword arguments: librosa >= 0.10 makes
        # these keyword-only).
        self.mel_filters = mel(sr=self.framer.get_Fs(), n_fft=self.dft_bins,
                               n_mels=mels_N)
        # Center frequencies of the Mel filters in Hz
        # Returns two more than are actually used (0 Hz and Nyquist)
        self.bins_Hz = mel_frequencies(mels_N + 2, fmin=0,
                                       fmax=self.Nyquist_Hz)
        self.bins_Hz = self.bins_Hz[1:-1]  # Remove ends
        self.bins_N = len(self.bins_Hz)
    else:
        self.bins_Hz = (np.arange(self.bins_Nyquist) / self.bins_Nyquist
                        * self.Nyquist_Hz)
        self.bins_N = self.bins_Hz.shape[0]
def get_coeffs(self, A, num_ceps=13, num_filters=16, f_bins=400, fs=100,
               normalize=True, corr=False):
    """Return cepstral or filterbank coefficients computed from ``A``.

    ``A`` is projected on an un-normalised mel filterbank
    (``num_filters`` bands, ``f_bins`` FFT size, sampling rate ``fs``) and
    transposed. A type-2 orthonormal DCT yields the cepstra, of which
    columns ``1 .. num_ceps`` are kept.

    Parameters
    ----------
    normalize : bool
        Subtract the per-column mean (plus 1e-8) from both the cepstra and
        the filterbank coefficients.
    corr : bool
        When false (default) return the cepstra, otherwise return the
        filterbank coefficients.
    """
    # Keyword arguments: librosa >= 0.10 makes these keyword-only.
    fbank = filters.mel(sr=fs, n_fft=f_bins, n_mels=num_filters, norm=None)
    fbank_coeffs = np.dot(fbank, A).T
    cc = fftpack.dct(fbank_coeffs, type=2, norm='ortho')[:, 1:(num_ceps + 1)]

    if normalize:
        cc -= (np.mean(cc, axis=0) + 1e-8)
        fbank_coeffs -= (np.mean(fbank_coeffs, axis=0) + 1e-8)

    if not corr:
        return cc
    return fbank_coeffs
def __init__(self,
             sampling_rate: int = 22050,
             n_fft: int = 1024,
             window_size: int = 1024,
             hop_size: int = 256,
             num_mels: int = 80,
             fmin: float = 0.,
             fmax: float = 8000.):
    """Store the STFT geometry and register the mel filter and Hann window.

    ``pad_size`` is ``(n_fft - hop_size) // 2``. The mel filterbank is
    registered as the buffer ``mel_filter`` and a Hann window of
    ``window_size`` samples as the buffer ``window``.
    """
    super().__init__()
    self.n_fft = n_fft
    self.hop_size = hop_size
    self.window_size = window_size
    self.pad_size = (self.n_fft - self.hop_size) // 2

    # Keyword arguments: librosa >= 0.10 makes these keyword-only.
    mel_filter = mel(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels,
                     fmin=fmin, fmax=fmax)
    mel_filter_tensor = torch.FloatTensor(mel_filter)
    self.register_buffer('mel_filter', mel_filter_tensor)
    self.register_buffer('window', torch.hann_window(window_size))
def mel_scaled_spectrogram(spectrogram: ndarray,
                           sr: int,
                           n_mels: Optional[int] = 128,
                           fmin: Optional[float] = 0.0,
                           fmax: Optional[Union[float, None]] = None,
                           htk: Optional[bool] = False):
    """Calculates the mel scaled version of the spectrogram.

    :param spectrogram: Spectrogram to be used. The frequency axis is the \
                        second-to-last one; the FFT size is derived from \
                        its length.
    :type spectrogram: numpy.ndarray
    :param sr: Sampling frequency of the original signal.
    :type sr: int
    :param n_mels: Amount of mel filters to use, defaults to 128.
    :type n_mels: int, optional
    :param fmin: Minimum frequency for mel filters, defaults to 0.0.
    :type fmin: float, optional
    :param fmax: Maximum frequency for mel filters. If `None`, \
                 sr/2.0 is used. Defaults to None
    :type fmax: float|None, optional
    :param htk: Use HTK formula, instead of Slaney, defaults to False.
    :type htk: bool, optional
    :raises AttributeError: If the input does not have 2 or 3 dimensions.
    :return: Mel scaled version of the input spectrogram, with shape \
             (channels, nb_mels, values) for channels >= 2, else \
             (nb_mels, values).
    :rtype: numpy.ndarray
    """
    ndim = spectrogram.ndim

    if ndim not in [2, 3]:
        raise AttributeError('Input spectrogram must be of shape '
                             '(channels, nb_frames, frames). '
                             f'Current input has {ndim} dimensions. '
                             f'Allowed are either 2 or 3.')

    # Use the LENGTH of the frequency axis (axis ndim - 2), not the data
    # stored at that index.
    n_fft = 2 * (spectrogram.shape[ndim - 2] - 1)

    mel_filters = mel(
        sr=sr, n_fft=n_fft, n_mels=n_mels,
        fmin=fmin, fmax=fmax, htk=htk)

    if ndim == 2:
        mel_spectrogram = np_dot(mel_filters, spectrogram)
    else:
        # Apply the filterbank channel by channel and restack.
        mel_spectrogram = np_cat(
            [expand_dims(np_dot(mel_filters, channel), 0)
             for channel in spectrogram],
            axis=0)

    return mel_spectrogram
def melspectrogram(y=None, sr=16000, n_fft=400, hop_length=160, power=2.0,
                   **kwargs):
    """Compute a mel-scaled spectrogram.

    The magnitude spectrogram `S` of the time-series `y` is computed with
    `stft(..., center=False)` and mapped onto the mel scale by
    `mel_f.dot(S**power)`. By default, `power=2` operates on a power
    spectrum.

    Parameters
    ----------
    y : np.ndarray [shape=(n,)]
        audio time-series. The default of `None` is not usable: it is
        forwarded to `stft` as is.
    sr : number > 0 [scalar]
        sampling rate of `y`
    n_fft : int > 0 [scalar]
        length of the FFT window
    hop_length : int > 0 [scalar]
        number of samples between successive frames.
        See `librosa.core.stft`
    power : float > 0 [scalar]
        Exponent for the magnitude melspectrogram.
        e.g., 1 for energy, 2 for power, etc.
    kwargs : additional keyword arguments
        Mel filter bank parameters.
        See `librosa.filters.mel` for details.

    Returns
    -------
    S : np.ndarray [shape=(n_mels, t)]
        Mel spectrogram
    """
    # Compute a magnitude spectrogram from input
    S = np.abs(stft(y, n_fft=n_fft, hop_length=hop_length,
                    center=False)) ** power

    # Build a Mel filter (keyword arguments: librosa >= 0.10 makes these
    # keyword-only).
    mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)

    return np.dot(mel_basis, S)
def analyze(audio, time):
    """Return the L2-normalised mel projection of one half-second window.

    A window of ``int(44100 * 0.5)`` samples starting at
    ``int(44100 * time)`` is taken from ``audio``; the log10 power spectrum
    of its non-negative frequency bins is projected on a 256-band mel
    filterbank.

    Returns:
        tuple: ``(mel_spec / norm, norm)`` where ``norm`` is the L2 norm of
        the projected vector.

    NOTE(review): the window must contain the full ``n_fft`` samples,
    otherwise the spectrum length will not match the filterbank.
    """
    sample_rate = 44100
    n_fft = int(sample_rate * 0.5)
    # Keyword arguments: librosa >= 0.10 makes these keyword-only.
    mel_basis = filters.mel(sr=sample_rate, n_fft=n_fft, n_mels=256)

    start_idx = int(sample_rate * time)
    segment = audio[start_idx:start_idx + n_fft]

    spec = np.log10(np.abs(np.fft.fft(segment)) ** 2)
    # Keep bins 0 .. N/2 (inclusive).
    spec = spec[:len(spec) // 2 + 1]

    mel_spec = np.dot(mel_basis, spec)
    norm = np.linalg.norm(mel_spec, ord=2)
    return mel_spec / norm, norm
def pncc(audio_wave, n_fft=1024, sr=16000, window="hamming", n_mels=40,
         n_pncc=13, weight_N=4, power=2, dct=True):
    """Run the PNCC processing chain and return the power-law output.

    The signal is pre-emphasised (``[1.0, -0.97]``), transformed with
    ``stft``, projected on a mel filterbank and passed through the project's
    PNCC helper stages up to ``power_function_nonlinearity``, whose result
    is returned.

    NOTE(review): ``dct`` and ``n_pncc`` do not influence the return value;
    the function has always returned the pre-DCT features. The unused DCT
    projection that used to be computed here has been dropped.
    """
    pre_emphasis_signal = scipy.signal.lfilter([1.0, -0.97], 1, audio_wave)
    stft_pre_emphasis_signal = np.abs(
        stft(pre_emphasis_signal, n_fft=n_fft, window=window)) ** power
    # ``sr`` passed by keyword: librosa >= 0.10 makes it keyword-only.
    mel_filter = np.abs(filters.mel(sr=sr, n_fft=n_fft, n_mels=n_mels)) ** power
    power_stft_signal = np.dot(stft_pre_emphasis_signal.T, mel_filter.T)

    medium_time_power = medium_time_power_calculation(power_stft_signal)
    lower_envelope = asymmetric_lawpass_filtering(medium_time_power, 0.999, 0.5)
    subtracted_lower_envelope = medium_time_power - lower_envelope
    rectified_signal = halfwave_rectification(subtracted_lower_envelope)
    floor_level = asymmetric_lawpass_filtering(rectified_signal)
    temporal_masked_signal = temporal_masking(rectified_signal)
    temporal_masked_signal = after_temporal_masking(temporal_masked_signal,
                                                    floor_level)
    final_output = switch_excitation_or_non_excitation(
        temporal_masked_signal, floor_level, lower_envelope, medium_time_power)
    spectral_weight_smoothing = weight_smoothing(
        final_output, medium_time_power, weight_N)
    transfer_function = time_frequency_normalization(
        power_stft_signal=power_stft_signal,
        spectral_weight_smoothing=spectral_weight_smoothing)
    normalized_power = mean_power_normalization(transfer_function,
                                                final_output)
    power_law_nonlinearity = power_function_nonlinearity(normalized_power)
    return power_law_nonlinearity
def __init__(self, sample_rate, preemphasis, frequency, frame_length,
             frame_shift, min_dbs, ref_dbs, mels_size, griff_lim_iters,
             power):
    """Store the audio-processing settings and build the mel basis.

    ``frequency`` is the number of frequency bins; the FFT size is derived
    from it as ``(frequency - 1) * 2``. ``frame_length`` and ``frame_shift``
    are divided by 1e3 and multiplied by ``sample_rate`` to obtain the
    window and hop lengths in samples.
    """

    def _to_samples(duration):
        # duration / 1e3 * sample_rate, truncated to an integer
        return int(duration / 1e3 * sample_rate)

    # Values stored unchanged.
    self.preemphasis = preemphasis
    self.min_dbs, self.ref_dbs = min_dbs, ref_dbs
    self.griff_lim_iters = griff_lim_iters
    self.power = power

    # Derived STFT geometry.
    self.n_fft = (frequency - 1) * 2
    self.win_length = _to_samples(frame_length)
    self.hop_length = _to_samples(frame_shift)

    # Filterbank matrix that combines FFT bins into mel-frequency bins.
    self.mel_basis = filters.mel(sr=sample_rate, n_fft=self.n_fft,
                                 n_mels=mels_size)
def transform2mel(spectrogram, samplerate, fft_window_size, n_mel_bands=80,
                  freq_min=0, freq_max=None):
    '''Transform to Mel

    Convert a spectrogram to a Mel scale spectrogram by grouping original
    frequency bins to Mel frequency bands (using the Mel filter from Librosa).

    Parameters
    spectrogram: input spectrogram (frequency bins on the first axis)
    samplerate: samplerate of audio signal
    fft_window_size: FFT size used in the analysis
    n_mel_bands: number of desired Mel bands, typically 20, 40, 80
    freq_min: minimum frequency (Mel filters will be applied >= this
        frequency, but still return n_mel_bands number of bands)
    freq_max: cut-off frequency (Mel filters will be applied <= this
        frequency, but still return n_mel_bands number of bands)

    Returns:
    mel_spectrogram: Mel spectrogram: np.array of shape (n_mel_bands, frames)
        maintaining the number of frames in the original spectrogram
    '''
    from librosa.filters import mel

    # Keyword arguments: librosa >= 0.10 makes every parameter of
    # librosa.filters.mel keyword-only.
    mel_basis = mel(sr=samplerate, n_fft=fft_window_size, n_mels=n_mel_bands,
                    fmin=freq_min, fmax=freq_max)
    freq_bin_max = mel_basis.shape[1]  # will be fft_window_size / 2 + 1

    # One dot product applies every Mel filter to every frame; spectrogram
    # rows beyond the filterbank width are ignored.
    mel_spectrogram = np.dot(mel_basis, spectrogram[0:freq_bin_max, :])
    return mel_spectrogram
def __init__(self, dlnet_config: dict, ds_config: str):
    """
    Init wrapper object. Reads DL Network config and dataset config.

    Parameters
    ----------
    dlnet_config : dict
        Config for DL Network and preprocessing. It is stored on the
        instance and extended in place with the keys ``'classes'`` and
        ``'input_shape'``.
    ds_config : str
        Path to Dataset config to extract classes
    """
    # Set config:
    self.config = dlnet_config

    # Seed the random generator. The function must be CALLED; assigning to
    # ``random.seed`` would replace it and leave the generator unseeded.
    random.seed(self.config['random_seed'])

    # Classes
    if self.config['binary']:
        self.config['classes'] = ['compressed_wav', 'uncompr_wav']
    else:
        self.config['classes'] = self.get_classes_from_dataset(ds_config)

    # Input shape and filter settings:
    if self.config['calculate_mel']:
        # Mel filter init (keyword arguments: librosa >= 0.10 makes these
        # keyword-only):
        self._mel_filter = filters.mel(sr=self.config['sr'],
                                       n_fft=self.config['n_fft'],
                                       n_mels=dlnet_config['n_mels'],
                                       norm='slaney')
        self.config['input_shape'] = (self.config['n_mels'],
                                      self.config['n_frames'], 1)
    elif self.config['filter_signal']:
        # Crop spectrogram
        # frequency array
        self._freqs = np.fft.rfftfreq(self.config['n_fft'],
                                      d=1 / self.config['sr'])
        # bin closest to the cutoff frequency (filter_config[1])
        self._cutoff_bin = int(
            np.argmin(np.abs(self._freqs - self.config['filter_config'][1])))
        self.config['input_shape'] = (
            int(len(self._freqs) - self._cutoff_bin),
            self.config['n_frames'], 1)
    else:
        self.config['input_shape'] = (int(self.config['n_fft'] / 2 + 1),
                                      self.config['n_frames'], 1)
def __init__(self, flows, n_group, sr, window_size, n_mels, hp,
             use_conv1x1=False):
    """Build the upsampler, the flow layers and the sparse mel filter.

    ``flows`` WN2D blocks are created (each optionally paired with an
    ``InvertibleConv1x1`` when ``use_conv1x1`` is set). The hop size is read
    from ``hp.audio.hop_length`` and the WN2D channel sizes from
    ``hp.model``. The mel filterbank (``fmax=8000``) is stored sparsely: the
    indices of its non-zero entries in ``filter_idx``, their values in the
    buffer ``filter_value`` and the dense shape in ``filter_size``.
    """
    super().__init__()
    self.flows = flows
    self.n_group = n_group
    self.win_size = window_size
    self.hop_size = hp.audio.hop_length
    self.n_mels = n_mels
    self.sr = sr
    self.sub_sr = self.hop_size // n_group

    self.upsampler = nn.Sequential(
        nn.ConvTranspose1d(n_mels,
                           n_mels,
                           self.sub_sr * 2 + 1,
                           self.sub_sr,
                           padding=self.sub_sr),
        nn.LeakyReLU(0.4, True))
    self.upsampler.apply(add_weight_norms)

    self.WNs = nn.ModuleList()
    if use_conv1x1:
        self.invconv1x1 = nn.ModuleList()

    # One WN2D block (and optional 1x1 convolution) per flow.
    for k in range(flows):
        self.WNs.append(
            WN2D(n_group, n_mels, hp.model.dilation_channels,
                 hp.model.residual_channels, hp.model.skip_channels))
        if use_conv1x1:
            self.invconv1x1.append(
                InvertibleConv1x1(n_group, memory_efficient=False))

    # Keyword arguments: librosa >= 0.10 makes these keyword-only.
    mel_filters = mel(sr=sr, n_fft=window_size, n_mels=n_mels, fmax=8000)
    self.filter_idx = np.nonzero(mel_filters)
    self.register_buffer('filter_value',
                         torch.Tensor(mel_filters[self.filter_idx]))
    self.filter_size = torch.Size(mel_filters.shape)
    self.register_buffer('window', torch.hann_window(window_size))
def __init__(self, flows, n_group, n_early_every, n_early_size, sr,
             window_size, hop_size, n_mels, memory_efficient, **kwargs):
    """Build the flow layers with early outputs and the sparse mel filter.

    Every ``n_early_every`` flows (except the first) ``n_early_size``
    channels are split off; the split sizes are collected in
    ``z_split_sizes``, ending with the channels that remain after the last
    flow. Extra ``kwargs`` are forwarded to each ``AffineCouplingBlock``.
    The mel filterbank (``fmax=8000``) is stored sparsely: the indices of
    its non-zero entries in ``filter_idx``, their values in the buffer
    ``filter_value`` and the dense shape in ``filter_size``.
    """
    super().__init__()
    self.flows = flows
    self.n_group = n_group
    self.n_early_every = n_early_every
    self.n_early_size = n_early_size
    self.win_size = window_size
    self.hop_size = hop_size
    self.n_mels = n_mels
    self.sr = sr

    self.upsample_factor = hop_size // n_group

    self.invconv1x1 = nn.ModuleList()
    self.WNs = nn.ModuleList()

    # Set up layers with the right sizes based on how many dimensions
    # have been output already
    n_remaining_channels = n_group
    self.z_split_sizes = []
    for k in range(flows):
        if k % self.n_early_every == 0 and k:
            n_remaining_channels -= n_early_size
            self.z_split_sizes.append(n_early_size)
        self.invconv1x1.append(
            InvertibleConv1x1(n_remaining_channels,
                              memory_efficient=memory_efficient))
        self.WNs.append(
            AffineCouplingBlock(WN,
                                memory_efficient=memory_efficient,
                                in_channels=n_remaining_channels // 2,
                                aux_channels=n_mels,
                                **kwargs))
    self.z_split_sizes.append(n_remaining_channels)

    # Keyword arguments: librosa >= 0.10 makes these keyword-only.
    mel_filters = mel(sr=sr, n_fft=window_size, n_mels=n_mels, fmax=8000)
    self.filter_idx = np.nonzero(mel_filters)
    self.register_buffer('filter_value',
                         torch.Tensor(mel_filters[self.filter_idx]))
    self.filter_size = torch.Size(mel_filters.shape)
    self.register_buffer('window', torch.hann_window(window_size))
def __init__(self, sr=22050, n_fft=2048, n_mels=128, hop_length=512,
             window='hann', center=True, pad_mode='reflect', htk=False,
             fmin=0.0, fmax=None, norm=1, trainable_mel=False,
             trainable_STFT=False):
    """Create the Fourier kernels and the mel basis as tensors.

    The sine and cosine kernels come from ``create_fourier_kernels`` and the
    mel basis from ``mel``; the time spent on each is printed. With
    ``trainable_mel`` the mel basis is wrapped in ``torch.nn.Parameter``;
    with ``trainable_STFT`` the two Fourier kernels are.
    """
    super(MelSpectrogram, self).__init__()
    self.stride = hop_length
    self.center = center
    self.pad_mode = pad_mode
    self.n_fft = n_fft

    # Create filter windows for stft
    start = time()
    wsin, wcos, self.bins2freq, _ = create_fourier_kernels(
        n_fft, freq_bins=None, window=window, freq_scale='no', sr=sr)
    self.wsin = torch.tensor(wsin, dtype=torch.float)
    self.wcos = torch.tensor(wcos, dtype=torch.float)
    print("STFT filter created, time used = {:.4f} seconds".format(
        time() - start))

    # Create the kernel for the mel spectrogram (keyword arguments:
    # librosa >= 0.10 makes these keyword-only).
    start = time()
    mel_basis = mel(sr=sr, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax,
                    htk=htk, norm=norm)
    self.mel_basis = torch.tensor(mel_basis)
    print("Mel filter created, time used = {:.4f} seconds".format(
        time() - start))

    if trainable_mel:
        self.mel_basis = torch.nn.Parameter(self.mel_basis)
    if trainable_STFT:
        self.wsin = torch.nn.Parameter(self.wsin)
        self.wcos = torch.nn.Parameter(self.wcos)
class AudioProcessor:
    """Process audio data."""

    sample_rate = 16000
    top_db = 15
    ref_db = 20
    max_db = 100
    fft_len = 1024
    hop_len = 256

    # Transposed mel filterbank, built once at class creation. Keyword
    # arguments: librosa >= 0.10 makes these keyword-only.
    mel_basis = mel(sr=sample_rate, n_fft=fft_len, fmin=90, fmax=7600,
                    n_mels=80).T
    # Linear amplitude equal to -100 dB; used as the floor before the log.
    min_level = np.exp(-100 / 20 * np.log(10))

    @classmethod
    def butter_highpass(cls, cutoff=30, order=5):
        """Create butter highpass filter."""
        # Cutoff expressed as a fraction of the Nyquist frequency.
        normal_cutoff = cutoff / (0.5 * cls.sample_rate)
        return butter(order, normal_cutoff, btype='high', analog=False)

    @classmethod
    def short_time_fourier_transform(cls, wav):
        """Apply short time Fourier transform and return its magnitude."""
        d_matrix = stft(wav, n_fft=cls.fft_len, hop_length=cls.hop_len)
        return np.abs(d_matrix)

    @classmethod
    def file2spectrogram(cls, file_path):
        """Load audio file and create spectrogram.

        The audio is loaded at ``sample_rate``, trimmed with ``top_db``,
        high-pass filtered and scaled by 0.96. The mel magnitudes are
        converted to dB, shifted by ``ref_db``, mapped with
        ``(x + max_db) / max_db`` and clipped to [0, 1]; the result is
        returned as float32.
        """
        wav = load(file_path, sr=cls.sample_rate)[0]
        wav = trim(wav, top_db=cls.top_db)[0]
        wav = filtfilt(*cls.butter_highpass(), wav)
        wav = wav * 0.96

        d_mag = cls.short_time_fourier_transform(wav)
        d_mel = np.dot(d_mag.T, cls.mel_basis)

        db_val = 20 * np.log10(np.maximum(cls.min_level, d_mel))
        db_scaled = db_val - cls.ref_db
        db_normalized = (db_scaled + cls.max_db) / cls.max_db

        return np.clip(db_normalized, 0, 1).astype(np.float32)
def transform2mel(spectrogram, samplerate, fft_window_size, n_mel_bands=80,
                  freq_min=0, freq_max=None):
    '''Transform to Mel

    Convert a spectrogram to a Mel scale spectrogram by grouping original
    frequency bins to Mel frequency bands (using the Mel filter from Librosa).

    Parameters
    spectrogram: input spectrogram (frequency bins on the first axis)
    samplerate: samplerate of audio signal
    fft_window_size: FFT size used in the analysis
    n_mel_bands: number of desired Mel bands, typically 20, 40, 80
    freq_min: minimum frequency (Mel filters will be applied >= this
        frequency, but still return n_mel_bands number of bands)
    freq_max: cut-off frequency (Mel filters will be applied <= this
        frequency, but still return n_mel_bands number of bands)

    Returns:
    mel_spectrogram: Mel spectrogram: np.array of shape (n_mel_bands, frames)
        maintaining the number of frames in the original spectrogram
    '''
    from librosa.filters import mel

    # Keyword arguments: librosa >= 0.10 makes every parameter of
    # librosa.filters.mel keyword-only.
    mel_basis = mel(sr=samplerate, n_fft=fft_window_size, n_mels=n_mel_bands,
                    fmin=freq_min, fmax=freq_max)
    freq_bin_max = mel_basis.shape[1]  # will be fft_window_size / 2 + 1

    # One dot product applies every Mel filter to every frame; spectrogram
    # rows beyond the filterbank width are ignored.
    mel_spectrogram = np.dot(mel_basis, spectrogram[0:freq_bin_max, :])
    return mel_spectrogram