def _do_spec(self):
    """Compute the spectrogram of the current signal and push it via set_data.

    When no signal is loaded, a 1x1 neutral placeholder is displayed instead.
    The hop size is chosen so the spectrogram has at most ~1000 time steps.
    """
    signal = self._signal
    if signal is None or len(signal) == 0:
        # Nothing to analyze; show a single mid-gray cell.
        self.set_data(np.array([[0.5]]))
        return

    num_steps = min(1000, len(signal))
    hop_samples = int(len(signal) / num_steps)

    if self._window == 'gaussian':
        # Gaussian window with std equal to 45% of half the window length.
        analysis_window = partial(gaussian, std=0.45 * self._win_len / 2)
    else:
        analysis_window = None

    spectrum = stft(signal, self._n_fft, hop_samples, center=True,
                    win_length=self._win_len, window=analysis_window)
    spectrum = np.abs(spectrum)
    if self._color_scale == 'log':
        spectrum = 20 * np.log10(spectrum)
    self.set_data(spectrum)
def generate_spectrogram(signal, sr, color_scale='log'):
    """Build a magnitude spectrogram of `signal` with a Gaussian window.

    Returns a (data, time_step, freq_step) tuple: the spectrogram values,
    the hop duration in seconds, and the frequency resolution in Hz.
    """
    n_fft = 256
    # 5 ms analysis window; grow the FFT if the window exceeds it.
    win_len = int(0.005 * sr)
    n_fft = max(n_fft, win_len)

    # Cap the number of frames at 500 (or the signal length if shorter).
    num_steps = min(500, len(signal))
    step_samp = int(len(signal) / num_steps)
    time_step = step_samp / sr
    freq_step = sr / n_fft

    # Gaussian window with std equal to 45% of half the window length.
    window = partial(gaussian, std=0.45 * win_len / 2)

    data = np.abs(stft(signal, n_fft, step_samp, center=True,
                       win_length=win_len, window=window))
    if color_scale == 'log':
        data = 20 * np.log10(data)
    return data, time_step, freq_step
def audio_to_data(signal):
    """Convert a raw audio signal into a dB-scaled (time, frequency) matrix.

    Pipeline: optional silence trim (config.silence_thr_db) -> STFT
    magnitude -> dB conversion -> transpose so the first index is time.

    Parameters
    ----------
    signal : np.ndarray
        1-D audio samples.

    Returns
    -------
    np.ndarray
        dB spectrogram with shape (frames, frequency_bins).
    """
    if config.silence_thr_db is not None:
        signal, _ = trim(signal,
                         config.silence_thr_db,
                         frame_length=config.fft_bins,
                         hop_length=config.fft_hop_len)
    spec = abs(
        stft(signal, config.fft_bins, config.fft_hop_len,
             config.fft_window_len))
    vector = deepcopy(spec)
    # BUG FIX: builtin max()/min() on a 2-D ndarray raises ValueError
    # ("truth value of an array ... is ambiguous"); use the ndarray
    # reductions, which return scalars.
    print('\tmax min initially:', vector.max(), vector.min())
    vector = amplitude_to_db(vector)
    print('\tmax min in db:', vector.max(), vector.min())
    vector = vector.T
    print('\tfinal vector shape:', vector.shape)
    return vector
def process_signal(self, signal):
    """Extract MFCCs plus first- and second-order deltas from `signal`.

    Returns
    -------
    tuple
        (mfccs, deltas, delta_deltas).
    """
    magnitude = np.abs(stft(signal,
                            n_fft=self.window_size,
                            hop_length=self.window_stride,
                            window='hann'))
    mel_spec = melspectrogram(sr=self.sample_rate, S=magnitude)
    coeffs = mfcc(sr=self.sample_rate, n_mfcc=self.num_mfccs, S=mel_spec)
    first_order = delta(coeffs)
    second_order = delta(coeffs, order=2)
    return coeffs, first_order, second_order
def generate_spectrogram(signal, sr, log_color_scale=True):
    """
    Generate a spectrogram

    Parameters
    ----------
    signal : numpy.array
        Signal to generate spectrogram from
    sr : int
        Sample rate of the signal
    log_color_scale : bool
        Flag to make the color scale logarithmic

    Returns
    -------
    numpy.array
        Spectrogram data
    float
        Time step between frames
    float
        Frequency step between bins
    """
    n_fft = 256
    # 5 ms analysis window; grow the FFT if the window exceeds it.
    win_len = int(0.005 * sr)
    n_fft = max(n_fft, win_len)

    # Cap the number of frames at 500 (or the signal length if shorter).
    num_steps = min(500, len(signal))
    step_samp = int(len(signal) / num_steps)
    time_step = step_samp / sr
    freq_step = sr / n_fft

    # Gaussian window with std equal to 45% of half the window length.
    window = partial(gaussian, std=0.45 * win_len / 2)

    data = np.abs(stft(signal, n_fft, step_samp, center=True,
                       win_length=win_len, window=window))
    if log_color_scale:
        data = 20 * np.log10(data)
    return data, time_step, freq_step
def __cqt_response(y, n_fft, hop_length, fft_basis, mode):
    '''Compute the filter response with a target STFT hop.'''
    # STFT with a flat ('ones') window, then project onto the filter basis.
    spectrum = stft(y, n_fft=n_fft, hop_length=hop_length,
                    window='ones', pad_mode=mode)
    return fft_basis.dot(spectrum)
def melspectrogram(y=None, sr=16000, n_fft=400, hop_length=160, power=2.0,
                   **kwargs):
    """Compute a mel-scaled spectrogram from a time series.

    The magnitude spectrogram of `y` is computed (without frame centering),
    raised to `power`, and mapped onto the mel scale by `mel_f.dot(S**power)`.
    By default, `power=2` operates on a power spectrum.

    Parameters
    ----------
    y : np.ndarray [shape=(n,)] or None
        audio time-series

    sr : number > 0 [scalar]
        sampling rate of `y`

    n_fft : int > 0 [scalar]
        length of the FFT window

    hop_length : int > 0 [scalar]
        number of samples between successive frames.
        See `librosa.core.stft`

    power : float > 0 [scalar]
        Exponent for the magnitude melspectrogram.
        e.g., 1 for energy, 2 for power, etc.

    kwargs : additional keyword arguments
        Mel filter bank parameters.
        See `librosa.filters.mel` for details.

    Returns
    -------
    S : np.ndarray [shape=(n_mels, t)]
        Mel spectrogram

    Notes
    -----
    DOC FIX: the previous docstring (copied from librosa) described an `S`
    spectrogram input, but this signature accepts only a time series `y`.
    """
    # Compute a magnitude spectrogram from input
    S = np.abs(stft(y, n_fft=n_fft, hop_length=hop_length, center=False))**power

    # Build a Mel filter
    mel_basis = filters.mel(sr, n_fft, **kwargs)

    return np.dot(mel_basis, S)
def librosa_compute_spec(y=None, sr=1600, S=None, n_fft=2048, hop_length=512,
                         power=1):
    """Return a magnitude spectrogram together with its FFT size.

    When a precomputed spectrogram `S` is supplied it is returned unchanged
    and `n_fft` is inferred from its number of frequency bins. Otherwise the
    magnitude STFT of `y` is computed and raised to `power`.
    """
    if S is None:
        # Magnitude spectrum of the signal, raised to the requested power.
        S = np.abs(stft(y, n_fft=n_fft, hop_length=hop_length)) ** power
    else:
        # A spectrogram with k frequency bins comes from an FFT of 2*(k-1).
        n_fft = 2 * (S.shape[0] - 1)
    return S, n_fft
def log_energy(y, n_fft=400, hop_length=160):
    """Per-frame log energy of `y`, in dB, from its power spectrogram."""
    power_spectrum = np.abs(stft(y, n_fft=n_fft, hop_length=hop_length,
                                 center=False)) ** 2
    # Builtin sum() over the 2-D array adds rows elementwise, giving the
    # total energy of each frame across all frequency bins.
    total_energy = sum(power_spectrum)
    return 10 * np.log10(total_energy)  # in dB
def pseudo_cqt(y, sr=22050, hop_length=512, fmin=None, n_bins=84,
               bins_per_octave=12, tuning=0.0, filter_scale=1, norm=1,
               sparsity=0.01, window='hann', scale=True, pad_mode='reflect'):
    '''Compute the pseudo constant-Q transform of an audio signal.

    This uses a single fft size that is the smallest power of 2 that is greater
    than or equal to the max of:

        1. The longest CQT filter
        2. 2x the hop_length

    Parameters
    ----------
    y : np.ndarray [shape=(n,)]
        audio time series

    sr : number > 0 [scalar]
        sampling rate of `y`

    hop_length : int > 0 [scalar]
        number of samples between successive CQT columns.

    fmin : float > 0 [scalar]
        Minimum frequency. Defaults to C1 ~= 32.70 Hz

    n_bins : int > 0 [scalar]
        Number of frequency bins, starting at `fmin`

    bins_per_octave : int > 0 [scalar]
        Number of bins per octave

    tuning : None or float in `[-0.5, 0.5)`
        Tuning offset in fractions of a bin (cents).

        If `None`, tuning will be automatically estimated from the signal.

    filter_scale : float > 0
        Filter filter_scale factor. Larger values use longer windows.

    norm : {inf, -inf, 0, float > 0}
        Type of norm to use for basis function normalization.
        See `filters.constant_q` / `librosa.util.normalize` for details.

    sparsity : float in [0, 1)
        Sparsify the CQT basis by discarding up to `sparsity`
        fraction of the energy in each basis.

        Set `sparsity=0` to disable sparsification.

    window : str, tuple, number, or function
        Window specification for the basis filters.
        See `filters.get_window` for details.

    scale : bool
        If `True`, scale the CQT response by 1/sqrt(n_fft).
        If `False`, rescale by the constant-Q filter lengths instead.

    pad_mode : string
        Padding mode for centered frame analysis.

        See also: `librosa.core.stft` and `np.pad`.

    Returns
    -------
    CQT : np.ndarray [shape=(n_bins, t), dtype=np.float]
        Pseudo Constant-Q energy for each frequency at each time.

    Raises
    ------
    ParameterError
        If `hop_length` is not an integer multiple of
        `2**(n_bins / bins_per_octave)`

        Or if `y` is too short to support the frequency
        range of the CQT.

    Notes
    -----
    This function caches at level 20.

    '''
    if fmin is None:
        # C1 by default
        fmin = note_to_hz('C1')

    if tuning is None:
        tuning = estimate_tuning(y=y, sr=sr)

    # Build the (sparsified) FFT-domain CQT filter basis for this hop.
    fft_basis, n_fft, _ = __cqt_filter_fft(sr, fmin, n_bins, bins_per_octave,
                                           tuning, filter_scale, norm,
                                           sparsity, hop_length=hop_length,
                                           window=window)

    # Only magnitudes are needed for the pseudo-CQT projection.
    fft_basis = np.abs(fft_basis)

    # Compute the magnitude STFT with Hann window
    D = np.abs(stft(y, n_fft=n_fft, hop_length=hop_length, pad_mode=pad_mode))

    # Project onto the pseudo-cqt basis
    C = fft_basis.dot(D)

    if scale:
        C /= np.sqrt(n_fft)
    else:
        # Undo the implicit 1/sqrt(n_fft) by rescaling with the true
        # constant-Q filter lengths.
        lengths = filters.constant_q_lengths(sr, fmin,
                                             n_bins=n_bins,
                                             bins_per_octave=bins_per_octave,
                                             tuning=tuning,
                                             window=window,
                                             filter_scale=filter_scale)

        C *= np.sqrt(lengths[:, np.newaxis] / n_fft)

    return C
def main():
    """Convert every .wav under config.data_path to spectrogram data and back.

    First pass (only when config.frequencies_to_pick is empty): accumulate the
    time-averaged spectral magnitude over all files and keep the frequencies
    whose strength is within config.frequency_strength_thr of the strongest
    bin. Second pass: analyze each file, write a resynthesized copy, and
    pickle the converted dataset.
    """
    # For mp3 sources, convert first: ffmpeg -i input.mp3 output.wav
    files = glob(config.data_path + '/*.wav')

    if not config.frequencies_to_pick:
        # gather initial info from all files
        frequency_strengths = zeros(len(config.frequencies_of_bins))
        for file in files:
            signal = load(file, config.sample_rate)[0]
            spec = abs(
                stft(signal, config.fft_bins, config.fft_hop_len,
                     config.fft_window_len))
            # Average magnitude of each frequency bin over time.
            frequency_strengths += spec.sum(1) / spec.shape[1]
        max_strength = max(frequency_strengths)
        strength_thr = max_strength / config.frequency_strength_thr
        band_low_hz = 999_999
        band_high_hz = -1
        for frequency, strength in zip(config.frequencies_of_bins,
                                       frequency_strengths):
            if strength >= strength_thr:
                config.frequencies_to_pick.append(frequency)
                if frequency < band_low_hz:
                    band_low_hz = frequency
                if frequency > band_high_hz:
                    # BUG FIX: previously assigned band_low_hz here, so the
                    # upper band edge was never tracked.
                    band_high_hz = frequency
        # NOTE(review): the two prints below were partially redacted
        # ('******') in the original; reconstructed from the surviving
        # f-string fragments — confirm against version history.
        print(f'with bandpass, timestep size: '
              f'{len(config.frequencies_of_bins)} -> '
              f'{len(config.frequencies_to_pick)}')
        print(f'copy paste this line into frequencies_to_pick @ config: '
              f'\n{config.frequencies_to_pick}')

    # proceed to separately processing each file
    converted = []
    for file_id, file in enumerate(files):
        print(f'reading: {file}')
        song_id = [0 if i == file_id else 1 for i in range(len(files))]
        # analysis
        signal, sample_rate = load(file, config.sample_rate)
        data, meta = audio_to_data(signal, song_id)
        converted.append([data, meta])
        # synthesis: round-trip the data back to audio as a sanity check.
        signal_recons = data_to_audio(data, meta)
        write(f'{file.split("/")[-1]}_{file_id}.wav', config.sample_rate,
              signal_recons)
        signal_recons, sample_rate = load(
            f'{file.split("/")[-1]}_{file_id}.wav', config.sample_rate)

    pickle_save(converted, config.data_path + '.pk')
    print('saved data.')
def audio_to_data(signal, song_id):
    """Convert audio into a scaled (time, frequency) matrix plus metadata.

    Pipeline: optional silence trim -> STFT magnitude -> band-pass down to
    config.frequencies_to_pick -> dB conversion -> one of z-score / min-max /
    log scaling, selected by config flags. Constants needed to invert the
    scaling are appended to `meta`.

    Parameters
    ----------
    signal : np.ndarray
        1-D audio samples.
    song_id : list
        Identifier of the source file; stored as meta[0].

    Returns
    -------
    (np.ndarray, list)
        Spectrogram with time as the first axis, and the metadata list.
    """
    meta = [song_id]
    if config.silence_thr_db:
        signal, _ = trim(signal,
                         config.silence_thr_db,
                         frame_length=config.fft_bins,
                         hop_length=config.fft_hop_len)
    spec = abs(
        stft(signal, config.fft_bins, config.fft_hop_len,
             config.fft_window_len))
    # rows-frequencies cols-times
    spec_mod = deepcopy(spec)
    # BUG FIX: builtin max()/min() on a 2-D ndarray raises ValueError
    # (ambiguous truth value); use the ndarray reductions throughout.
    print('\tmax min initially:', spec_mod.max(), spec_mod.min())
    # Band-pass: keep only the configured frequency rows.
    spec_mod = stack([
        spec_mod[config.frequencies_of_bins.index(i), :]
        for i in config.frequencies_to_pick
    ], 0)
    # NOTE(review): the next three lines were redacted ('******') in the
    # original; reconstructed from the parallel audio_to_data(signal)
    # implementation — confirm against version history.
    print('\tmax min after bandpass:', spec_mod.max(), spec_mod.min())
    spec_mod = amplitude_to_db(spec_mod)
    print('\tmax min in db:', spec_mod.max(), spec_mod.min())
    if config.zscore_scale:
        mean = spec_mod.mean()
        std = spec_mod.std()
        spec_mod -= mean
        spec_mod /= std
        print('\tmax min after std:', spec_mod.max(), spec_mod.min())
        # Rescale into [-1, 1] so downstream consumers see a bounded range.
        scale = max([abs(spec_mod.max()), abs(spec_mod.min())])
        spec_mod /= scale
        meta.extend([mean, std, scale])
    elif config.minmax_scale:
        spec_min = spec_mod.min()
        spec_max = spec_mod.max()
        spec_mod -= spec_min
        spec_mod /= spec_max - spec_min
        print('\tmax min after min/max:', spec_mod.max(), spec_mod.min())
        meta.extend([spec_min, spec_max])
    elif config.log_scale:
        # Small epsilon avoids log(0).
        spec_mod = log(spec_mod + 1e-10)
        print('\tmax min after log:', spec_mod.max(), spec_mod.min())
    vector = spec_mod
    vector = vector.T  # now first index time, second index frequency
    print('\tfinal vector shape:', vector.shape)
    return vector, meta