def mel(sr, n_dft, n_mels=128, fmin=0.0, fmax=None):
    '''[np] Build a filterbank matrix that folds STFT bins into mel bands.

    Triangular, Slaney-normalised filters. Adapted from
    ``librosa.filters.mel``.

    sr     : sampling rate; used for the default `fmax` and forwarded to
             `_dft_frequencies`
    n_dft  : DFT length; the matrix has ``1 + n_dft // 2`` columns
    n_mels : number of mel bands (rows of the matrix)
    fmin   : lowest frequency [Hz]
    fmax   : highest frequency [Hz]. If `None`, use `sr / 2.0`

    Returns an array of shape ``(n_mels, 1 + n_dft // 2)`` cast to the
    dtype named by ``K.floatx()``.
    '''
    if fmax is None:
        fmax = float(sr) / 2

    n_mels = int(n_mels)
    n_bins = int(1 + n_dft // 2)
    filterbank = np.zeros((n_mels, n_bins))

    # Frequency of every DFT bin, and the n_mels + 2 band edges; band k
    # uses edges k (left foot), k + 1 (peak) and k + 2 (right foot).
    bin_hz = _dft_frequencies(sr=sr, n_dft=n_dft)
    edges_hz = mel_frequencies(n_mels + 2, fmin=fmin, fmax=fmax)

    # Slaney-style scaling: roughly constant energy per channel.
    band_gain = 2.0 / (edges_hz[2:n_mels + 2] - edges_hz[:n_mels])

    for band in range(n_mels):
        left = edges_hz[band]
        centre = edges_hz[band + 1]
        right = edges_hz[band + 2]

        # Rising and falling ramps evaluated at every bin ...
        rising = (bin_hz - left) / (centre - left)
        falling = (right - bin_hz) / (right - centre)

        # ... their minimum is the triangle; clip negative parts to zero.
        triangle = np.minimum(rising, falling)
        filterbank[band] = np.maximum(0, triangle) * band_gain[band]

    return filterbank.astype(K.floatx())
def __init__(self, frame_stream, specfmt="dB", mels_N=12):
    '''
    DFTStream(frame_stream, specfmt, mels_N)
    Create a stream of discrete Fourier transform (DFT) frames using the
    specified sample frame stream. Only bins up to the Nyquist rate are
    returned in the stream.

    Arguments:
    frame_stream - object providing get_framelen_samples() and get_Fs()
    specfmt - DFT output:
        "complex" - return complex DFT results
        "dB" [default] - return power spectrum 20log10(magnitude)
        "mag^2" - magnitude squared spectrum
        "Mel" - Mel scale
    mels_N - Number of Mel filters to use. Only applicable when
        specfmt == "Mel".

    Raises ValueError if specfmt is not one of the names above.
    '''
    self.format_types = {"complex": 0, "mag^2": 1, "dB": 2, "Mel": 3}
    self.framer = frame_stream
    self.frame_len = frame_stream.get_framelen_samples()

    try:
        self.format = self.format_types[specfmt]
    except KeyError:
        raise ValueError("Unknown specfmt {}. Use one of [{}]".format(
            specfmt, ", ".join(self.format_types.keys())))

    # Number of frequency bins is the same as the number of samples in
    # the frame.
    self.dft_bins = self.frame_len

    # Only bins up to the Nyquist rate are usable. The DFT routine in use
    # returns up to and including the Nyquist bin (half the bins plus 1
    # when the frame length is even).
    sample_rate = self.framer.get_Fs()
    self.Nyquist_Hz = sample_rate / 2.0

    # We add 1.1 instead of 1 so the value never lands exactly on .5;
    # np.round rounds halves to the nearest even integer (see
    # numpy.around). The builtin int replaces the np.int alias, which
    # was removed in NumPy 1.24.
    self.bins_Nyquist = int(np.round((self.frame_len + 1.1) / 2.0))

    self.window = signal.get_window("hamming", self.frame_len)

    if self.format == self.format_types["Mel"]:
        # Construct Mel filters
        self.mel_filters = mel(sample_rate, self.dft_bins, mels_N)
        # Center frequencies of the Mel filters in Hz.
        # mel_frequencies returns two more than are actually used
        # (0 Hz and Nyquist), so drop both ends.
        self.bins_Hz = mel_frequencies(mels_N + 2, fmin=0,
                                       fmax=self.Nyquist_Hz)
        self.bins_Hz = self.bins_Hz[1:-1]
        self.bins_N = len(self.bins_Hz)
    else:
        # Bin k of an N-point DFT sits at k * Fs / N. Dividing by the
        # bin count instead (as was done previously) compresses the axis
        # so the last bin never reaches the Nyquist frequency.
        bin_spacing_Hz = sample_rate / float(self.dft_bins)
        self.bins_Hz = np.arange(self.bins_Nyquist) * bin_spacing_Hz
        self.bins_N = self.bins_Hz.shape[0]
def get_filterbank(n_filters=60, NFFT=512, fs=16000, fmin=0.0, fmax=None,
                   htk=False, normalize=False):
    """Build a triangular mel filterbank matrix.

    Parameters
    ----------
    n_filters : number of mel bands (rows of the result).
    NFFT : FFT length; the result has ``1 + NFFT // 2`` columns.
    fs : sampling rate, forwarded to ``fft_frequencies`` and used for the
        default ``fmax``.
    fmin : lowest band edge passed to ``mel_frequencies``.
    fmax : highest band edge; ``fs / 2`` when ``None``.
    htk : forwarded unchanged to ``mel_frequencies``.
    normalize : when truthy, scale each filter by
        ``2 / (upper_edge - lower_edge)`` (Slaney-style area scaling).

    Returns
    -------
    ``numpy.ndarray`` of shape ``(n_filters, 1 + NFFT // 2)``.
    """
    if fmax is None:
        fmax = float(fs) / 2

    n_mels = int(n_filters)
    weights = np.zeros((n_mels, int(1 + NFFT // 2)))

    # Center freqs of each FFT bin
    fftfreqs = fft_frequencies(sr=fs, n_fft=NFFT)

    # Band edges of the mel filters: n_mels + 2 points between the limits.
    # Computed once; the original evaluated this twice and discarded the
    # first result.
    # (Original author's note: to make an evenly spaced filterbank, use
    # fft_frequencies here instead.)
    mel_f = mel_frequencies(n_mels + 2, fmin=fmin, fmax=fmax, htk=htk)

    # Width of each edge-to-edge interval, and the signed distance of every
    # band edge from every FFT bin.
    fdiff = np.diff(mel_f)
    ramps = np.subtract.outer(mel_f, fftfreqs)

    for i in range(n_mels):
        # lower and upper slopes for all bins
        lower = -ramps[i] / fdiff[i]
        upper = ramps[i + 2] / fdiff[i + 1]

        # .. then intersect them with each other and zero
        weights[i] = np.maximum(0, np.minimum(lower, upper))

    if normalize:
        enorm = 2.0 / (mel_f[2:n_mels + 2] - mel_f[:n_mels])
        weights *= enorm[:, np.newaxis]

    return weights
def prepare_mel_matrix(hparams, rate, return_numpy=True, GPU_backend=False):
    """Create a mel filter matrix with TensorFlow.

    Parameters
    ----------
    hparams : object exposing ``num_mel_bins``, ``n_fft``,
        ``mel_lower_edge_hertz`` and ``mel_upper_edge_hertz``.
    rate : sample rate passed to
        ``tf.signal.linear_to_mel_weight_matrix``.
    return_numpy : return ``mel_matrix.numpy()`` when true, otherwise the
        TensorFlow tensor itself.
    GPU_backend : when false, ``CUDA_DEVICE_ORDER`` and
        ``CUDA_VISIBLE_DEVICES`` are overwritten in ``os.environ`` so that
        no CUDA device is visible.

    Returns
    -------
    The weight matrix produced by ``linear_to_mel_weight_matrix``, scaled
    per mel band and then divided by its sum over axis 0.

    Notes
    -----
    The original guarded the import with ``"tf" not in sys.modules``.
    TensorFlow registers itself as ``"tensorflow"``, so that test was
    effectively always true; had it ever been false, ``tf`` would have been
    an unbound local. The import is therefore done unconditionally (a
    repeated import is a cheap cache lookup), which keeps the effective
    behaviour and removes the latent ``UnboundLocalError``.
    """
    if not GPU_backend:
        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"  # see issue #152
        os.environ["CUDA_VISIBLE_DEVICES"] = ""

    import tensorflow as tf

    # TF 1.x needs eager mode switched on explicitly. On TF 2.x it is
    # already on and the top-level ``enable_eager_execution`` alias does not
    # exist, so only call it when it is actually needed.
    if not tf.executing_eagerly():
        tf.enable_eager_execution()
    assert tf.executing_eagerly()

    # create a filter to convolve with the spectrogram
    mel_matrix = tf.signal.linear_to_mel_weight_matrix(
        num_mel_bins=hparams.num_mel_bins,
        num_spectrogram_bins=int(hparams.n_fft / 2) + 1,
        sample_rate=rate,
        lower_edge_hertz=hparams.mel_lower_edge_hertz,
        upper_edge_hertz=hparams.mel_upper_edge_hertz,
        dtype=tf.dtypes.float32,
        name=None,
    )

    # gets the center frequencies of mel bands
    mel_f = mel_frequencies(
        n_mels=hparams.num_mel_bins + 2,
        fmin=hparams.mel_lower_edge_hertz,
        fmax=hparams.mel_upper_edge_hertz,
    )

    # Slaney-style mel is scaled to be approx constant energy per channel
    # (from librosa). Expanded to a leading axis of size 1 so it broadcasts
    # across the rows of mel_matrix.
    band_scale = 2.0 / (
        mel_f[2 : hparams.num_mel_bins + 2] - mel_f[: hparams.num_mel_bins]
    )
    enorm = tf.dtypes.cast(
        tf.expand_dims(tf.constant(band_scale), 0),
        tf.float32,
    )

    mel_matrix = tf.multiply(mel_matrix, enorm)
    # NOTE(review): dividing by the axis-0 sum rescales every mel band
    # independently, which cancels the per-band enorm factor applied just
    # above (up to rounding). A band whose weights sum to zero yields NaN
    # here. Confirm both are intended.
    mel_matrix = tf.divide(mel_matrix, tf.reduce_sum(mel_matrix, axis=0))

    if return_numpy:
        return mel_matrix.numpy()
    return mel_matrix
def mel(sr, n_fft, n_mels=128, fmin=0.0, fmax=None, htk=False, norm=1):
    """Create a filterbank matrix that combines FFT bins into mel bands.

    Parameters
    ----------
    sr : sampling rate, forwarded to ``fft_frequencies`` and used for the
        default ``fmax``.
    n_fft : FFT length; the result has ``1 + n_fft // 2`` columns.
    n_mels : number of mel bands (rows of the result).
    fmin : lowest band edge passed to ``mel_frequencies``.
    fmax : highest band edge; ``sr / 2`` when ``None``.
    htk : forwarded unchanged to ``mel_frequencies``.
    norm : ``1`` applies Slaney-style scaling (approximately constant
        energy per channel); ``None`` and ``np.inf`` leave the triangles
        unscaled.

    Returns
    -------
    ``numpy.ndarray`` of shape ``(n_mels, 1 + n_fft // 2)``.

    Raises
    ------
    ParameterError
        If ``norm`` is not ``None``, ``1`` or ``np.inf``.

    Warns
    -----
    Emits a warning when some band has no positive weight, unless that
    band's lower edge is exactly 0.
    """
    if fmax is None:
        fmax = float(sr) / 2

    norm_supported = norm is None or norm == 1 or norm == np.inf
    if not norm_supported:
        raise ParameterError('Unsupported norm: {}'.format(repr(norm)))

    n_mels = int(n_mels)
    basis = np.zeros((n_mels, int(1 + n_fft // 2)))

    # Frequency of each FFT bin, and the n_mels + 2 mel band edges.
    bin_hz = fft_frequencies(sr=sr, n_fft=n_fft)
    band_hz = mel_frequencies(n_mels + 2, fmin=fmin, fmax=fmax, htk=htk)

    edge_gaps = np.diff(band_hz)
    offsets = np.subtract.outer(band_hz, bin_hz)

    # All rising and falling ramps at once: band k rises from edge k and
    # falls towards edge k + 2, each normalised by the adjacent edge gap.
    rising = -offsets[:n_mels] / edge_gaps[:n_mels, np.newaxis]
    falling = offsets[2:n_mels + 2] / edge_gaps[1:n_mels + 1, np.newaxis]

    # Intersect the two ramps and clip at zero to obtain the triangles.
    basis[...] = np.maximum(0, np.minimum(rising, falling))

    if norm == 1:
        # Slaney-style mel is scaled to be approx constant energy per channel
        band_span = band_hz[2:n_mels + 2] - band_hz[:n_mels]
        basis *= (2.0 / band_span)[:, np.newaxis]

    # A band is acceptable if it has some positive weight or if its lower
    # edge is exactly zero; anything else is an empty channel.
    lower_edge_is_zero = band_hz[:-2] == 0
    has_response = basis.max(axis=1) > 0
    if not np.all(lower_edge_is_zero | has_response):
        warnings.warn('Empty filters detected in mel frequency basis. '
                      'Some channels will produce empty responses. '
                      'Try increasing your sampling rate (and fmax) or '
                      'reducing n_mels.')

    return basis