def __test(y, top_db, ref, trim_duration):
    """Check ``librosa.effects.trim`` against an independent loudness test.

    Args:
        y: audio signal handed to ``trim``.
        top_db: threshold (in dB below ``ref``) passed to ``trim``.
        ref: reference power passed to ``trim`` and ``power_to_db``.
        trim_duration: expected duration of the trimmed signal, compared
            with an absolute tolerance of 0.1.
    """
    trimmed, interval = librosa.effects.trim(y, top_db=top_db, ref=ref)

    # The reported interval, applied on the last axis, must reproduce the
    # trimmed signal.
    selector = [slice(None)] * y.ndim
    selector[-1] = slice(*interval.tolist())
    assert np.allclose(trimmed, y[tuple(selector)])

    # Every frame of the trimmed signal must be above the threshold.
    kept_rms = librosa.feature.rmse(y=librosa.to_mono(trimmed), center=False)
    kept_db = librosa.power_to_db(kept_rms**2, ref=ref, top_db=None)
    assert np.all(kept_db > - top_db)

    # Every frame before/after the interval must be at or below it.
    full_rms = librosa.feature.rmse(y=librosa.to_mono(y)).squeeze()
    full_db = librosa.power_to_db(full_rms**2, ref=ref, top_db=None)
    first_frame = int(librosa.samples_to_frames(interval[0]))
    last_frame = int(librosa.samples_to_frames(interval[1]))
    assert np.all(full_db[:first_frame] <= - top_db)
    assert np.all(full_db[last_frame:] <= - top_db)

    # Finally, the trimmed length must match the expectation.
    duration = librosa.get_duration(trimmed)
    assert np.allclose(duration, trim_duration, atol=1e-1), duration
def test_mfcc():
    """Yield MFCC checks for every (n_mfcc, dct_type, norm) combination.

    The DCT type 1 / 'ortho' combination is yielded wrapped in an
    ``xfail`` marker that expects ``NotImplementedError``.
    """

    def __test(dct_type, norm, n_mfcc, S):
        frame_energy = np.sum(S, axis=0)

        coeffs = librosa.feature.mfcc(S=S, dct_type=dct_type, norm=norm,
                                      n_mfcc=n_mfcc)

        # One row per coefficient, one column per input frame.
        assert coeffs.shape[0] == n_mfcc
        assert coeffs.shape[1] == S.shape[1]

        # In type-2 mode, DC component should be constant over all frames
        if dct_type == 2:
            assert np.var(coeffs[0] / frame_energy) <= 1e-30

    S = librosa.power_to_db(np.random.randn(128, 100)**2, ref=np.max)

    for n_mfcc in [13, 20]:
        for dct_type in [1, 2, 3]:
            for norm in [None, 'ortho']:
                unsupported = dct_type == 1 and norm == 'ortho'
                if unsupported:
                    check = pytest.mark.xfail(__test,
                                              raises=NotImplementedError)
                else:
                    check = __test
                yield check, dct_type, norm, n_mfcc, S
def test_previews(meta):
    """Regenerate the preview GIF and compare it with the stored one.

    One recording per ``target`` group is sampled (fixed seed), rendered as
    a dB-scaled mel spectrogram frame, and the frames are assembled by the
    external ``convert`` command into ``_esc50.gif``, which must be
    byte-identical to ``esc50.gif``.

    Args:
        meta: DataFrame providing the ``target``, ``filename`` and
            ``category`` columns read below.
    """
    np.random.seed(20171207)  # fixed seed keeps the sampled files stable

    recordings = meta.groupby('target')['filename'].apply(
        lambda cat: cat.sample(1)).reset_index()['filename']

    f, ax = plt.subplots(1, 1, sharey=False, sharex=False, figsize=(8, 2))

    try:
        with tempfile.TemporaryDirectory() as tmpdir:
            for index, recording in enumerate(recordings):
                signal = librosa.load('audio/' + recording, sr=44100)[0]
                # Audio is passed as ``y=``: the positional form is rejected
                # by librosa >= 0.10, the keyword form works everywhere.
                spec = librosa.feature.melspectrogram(
                    y=signal, sr=44100, n_fft=2205, hop_length=441)
                spec = librosa.power_to_db(spec)

                category = meta[meta.filename == recording].category.values[0]

                ax.imshow(spec, origin='lower', interpolation=None,
                          cmap='viridis', aspect=1.1)
                ax.set_title(f'{category} - {recording}', fontsize=11)
                ax.get_yaxis().set_visible(False)
                ax.get_xaxis().set_visible(False)

                f.tight_layout()
                plt.savefig(f'{tmpdir}/{index:02d}.png',
                            bbox_inches='tight', dpi=72)

            # Must run while the temporary directory still exists.
            subprocess.call(['convert', '-delay', '100', '-loop', '0',
                             f'{tmpdir}/*.png', '_esc50.gif'])
    finally:
        # Release the figure even if rendering fails half-way.
        plt.close(f)

    assert filecmp.cmp('esc50.gif', '_esc50.gif')
def static_spectrogram(
        data, filename, block_nb=0, mel_bands=128, fmax=22050,
        x_axis='time', y_axis='mel', display=False, sr=22050):
    """Compute and plot the static spectrogram of a time series of samples.

    The static spectrogram is computed by taking the power of the signal in
    the frequency domain and decomposing it into mel bands up to a maximum
    frequency.

    Args:
        data (array): 1D array of audio data.
        filename (str): source file name; its base name (without extension)
            is used to build the output figure name.
        block_nb (int): block index forwarded to ``utils.create_filename``.
        mel_bands (int): number of mel bands for the decomposition.
        fmax (int): maximum frequency (in Hertz).
        x_axis (str): ``specshow`` x-axis type.
        y_axis (str): ``specshow`` y-axis type. With ``'mel'`` the mel-band
            decomposition is shown; with any other value the plain power
            spectrogram is shown, as before.
        display (boolean): show the figure when True, otherwise save it.
        sr (int): sampling rate of ``data``. Defaults to 22050, the value
            librosa assumed implicitly in the previous implementation.
            NOTE(review): ``fmax`` above ``sr / 2`` leaves the upper mel
            bands empty; pass the real sampling rate.

    Returns:
        None

    Todo:
        - remove the padding/margin around the plot
        - Add a path and a name where to save the plots
    """
    data_freq_power = np.abs(librosa.stft(data))**2

    if y_axis == 'mel':
        # The mel decomposition used to be computed and thrown away, so the
        # linear-frequency STFT was drawn on a mel-labelled axis.
        shown_power = librosa.feature.melspectrogram(
            S=data_freq_power, sr=sr, power=2.0, n_mels=mel_bands, fmax=fmax)
    else:
        shown_power = data_freq_power

    librosa.display.specshow(
        librosa.power_to_db(shown_power, ref=np.max),
        sr=sr, y_axis=y_axis, x_axis=x_axis, fmax=fmax)

    if display:
        plt.ylabel('Mel')
        plt.xlabel('Time [samples]')
        plt.show()
    else:
        spec_path = utils.read_config('path', 'spectrograms')
        fname = os.path.splitext(os.path.basename(filename))
        fig_path = utils.create_filename(
            spec_path, 'png', fname[0], 'static', block_nb)
        plt.savefig(fig_path)
        # Close the figure so successive blocks are not drawn on top of
        # each other and figures do not accumulate in memory.
        plt.close()
def __call__(self, data):
    """Add a dB-scaled mel spectrogram to ``data``.

    Reads ``data['stft']``, ``data['sample_rate']`` and ``data['n_fft']``,
    projects the power STFT onto ``self.n_mels`` mel bands and stores the
    result, in dB relative to its maximum, under ``data['mel_spectrogram']``.

    Args:
        data: dict holding the three keys listed above.

    Returns:
        The same dict, updated in place.
    """
    stft = data['stft']
    sample_rate = data['sample_rate']
    n_fft = data['n_fft']
    # Keyword arguments: librosa >= 0.10 rejects the positional form, and
    # the keyword form is accepted by older releases too.
    mel_basis = librosa.filters.mel(sr=sample_rate, n_fft=n_fft,
                                    n_mels=self.n_mels)
    s = np.dot(mel_basis, np.abs(stft)**2.0)
    data['mel_spectrogram'] = librosa.power_to_db(s, ref=np.max)
    return data
def __test(x, ref, amin, top_db):
    """Check basic invariants of ``librosa.power_to_db``.

    The output must be real-valued, keep the input shape and, when
    ``top_db`` is given, span at most ``top_db`` below its maximum.
    """
    y = librosa.power_to_db(x, ref=ref, amin=amin, top_db=top_db)

    assert np.isrealobj(y)
    eq_(y.shape, x.shape)

    # Without clipping there is no dynamic-range bound to verify.
    if top_db is None:
        return
    assert y.min() >= y.max()-top_db
def test_power_to_db_logamp():
    """Check that ``power_to_db`` agrees with ``logamplitude``."""
    srand()

    # Offset added to the noise magnitudes below.
    NOISE_FLOOR = 1e-6

    # Make some noise
    noise = np.abs(np.random.randn(1000)) + NOISE_FLOOR
    power = noise**2

    from_power_to_db = librosa.power_to_db(power, top_db=None)
    from_logamplitude = librosa.logamplitude(power, top_db=None)

    assert np.allclose(from_power_to_db, from_logamplitude)
def __call__(self, path):
    """Load an audio file and build one (one-hot, spectrogram, target) example.

    The clip is loaded at ``self.sr``, trimmed with ``self.top_db``,
    peak-normalised, mu-law quantised, padded or randomly cropped to
    ``self.length`` samples (when set), and paired with its dB-scaled mel
    spectrogram.

    Args:
        path: audio file path handed to ``librosa.load``.

    Returns:
        tuple: ``(one_hot[:, :-1], spectrogram, quantized[1:])``.
    """
    # load data with trimming and normalizing
    # (sr / top_db are keyword-only in librosa >= 0.10; keywords also work
    # on older releases)
    raw, _ = librosa.load(path, sr=self.sr, res_type='kaiser_fast')
    raw, _ = librosa.effects.trim(raw, top_db=self.top_db)
    peak = numpy.abs(raw).max()
    if peak > 0:
        # A silent clip would otherwise turn into NaNs (0 / 0).
        raw /= peak
    raw = raw.astype(numpy.float32)

    # mu-law transform
    quantized = self.mu_law.transform(raw)

    # padding/trimming
    if self.length is not None:
        if len(raw) <= self.length:
            # padding
            pad = self.length - len(raw)
            raw = numpy.concatenate(
                (raw, numpy.zeros(pad, dtype=numpy.float32)))
            quantized = numpy.concatenate(
                (quantized, self.quantize // 2 * numpy.ones(pad)))
            quantized = quantized.astype(numpy.int32)
        else:
            # trimming: randint is inclusive on both ends, so the last valid
            # start is len(raw) - self.length (previously never drawn).
            start = random.randint(0, len(raw) - self.length)
            raw = raw[start:start + self.length]
            quantized = quantized[start:start + self.length]

    # calculate mel-spectrogram
    spectrogram = librosa.feature.melspectrogram(
        y=raw, sr=self.sr, n_fft=self.n_fft, hop_length=self.hop_length,
        n_mels=self.n_mels)
    spectrogram = librosa.power_to_db(
        spectrogram, ref=numpy.max)

    # normalize mel spectrogram into [-1, 1]
    spectrogram += 40
    spectrogram /= 40
    if self.length is not None:
        spectrogram = spectrogram[:, :self.length // self.hop_length]
    spectrogram = spectrogram.astype(numpy.float32)

    # expand dimensions
    one_hot = numpy.identity(
        self.quantize, dtype=numpy.float32)[quantized]
    one_hot = numpy.expand_dims(one_hot.T, 2)
    spectrogram = numpy.expand_dims(spectrogram, 2)
    quantized = numpy.expand_dims(quantized, 1)

    return one_hot[:, :-1], spectrogram, quantized[1:]
def wav_to_spec(wav_audio, hparams):
    """Transforms the contents of a wav file into a series of spectrograms.

    Args:
        wav_audio: wav data forwarded to the selected transform.
        hparams: object exposing ``spec_type`` ('raw', 'cqt' or 'mel') and
            ``spec_log_amplitude``.

    Returns:
        The spectrogram produced by the selected transform.

    Raises:
        ValueError: if ``hparams.spec_type`` is not one of the known types.
    """
    kind = hparams.spec_type

    if kind == 'raw':
        # Framed raw samples are returned as-is.
        # NOTE(review): log scaling is applied to cqt/mel only, following
        # the nested ``else`` of the original; confirm.
        return _wav_to_framed_samples(wav_audio, hparams)

    if kind == 'cqt':
        spec = _wav_to_cqt(wav_audio, hparams)
    elif kind == 'mel':
        spec = _wav_to_mel(wav_audio, hparams)
    else:
        raise ValueError('Invalid spec_type: {}'.format(hparams.spec_type))

    if hparams.spec_log_amplitude:
        spec = librosa.power_to_db(spec)

    return spec
def melspectrogram(filename):
    """Load an audio file and display its mel spectrogram in dB.

    Args:
        filename: path of the audio file handed to ``librosa.load``.

    Returns:
        None. A matplotlib figure is shown.
    """
    import librosa
    # Imported explicitly: ``import librosa`` alone does not expose the
    # ``display`` submodule on every librosa release.
    import librosa.display
    import matplotlib.pyplot as plt

    y, sr = librosa.load(filename)

    # The mel spectrogram is built once from the power STFT; an earlier
    # computation whose result was discarded has been dropped.
    D = np.abs(librosa.stft(y)) ** 2
    S = librosa.feature.melspectrogram(S=D, sr=sr)

    # Passing through arguments to the Mel filters
    #S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, fmax=8000)

    plt.figure(figsize=(10, 4))
    librosa.display.specshow(librosa.power_to_db(S, ref=np.max),
                             sr=sr, y_axis='mel', fmax=8000, x_axis='time')
    plt.colorbar(format='%+2.0f dB')
    plt.title('Mel spectrogram')
    plt.tight_layout()
    plt.show()
def __call__(self, data):
    """Add a dB-scaled mel spectrogram computed from raw samples.

    Reads ``data['samples']`` and ``data['sample_rate']`` and stores the
    mel spectrogram (``self.n_mels`` bands, in dB relative to its maximum)
    under ``data['mel_spectrogram']``.

    Args:
        data: dict holding the two keys listed above.

    Returns:
        The same dict, updated in place.
    """
    samples = data['samples']
    sample_rate = data['sample_rate']
    # Audio passed as ``y=``: the positional form is rejected by
    # librosa >= 0.10, the keyword form works on every release.
    s = librosa.feature.melspectrogram(y=samples, sr=sample_rate,
                                       n_mels=self.n_mels)
    data['mel_spectrogram'] = librosa.power_to_db(s, ref=np.max)
    return data
# Mel-spectrogram panel: slot 7 of the 4x2 subplot grid.
fig.add_subplot(4,2,7)
# The bare string below holds an earlier plt.specgram rendering, kept disabled.
'''
Pxx, freqs, bins, im = plt.specgram(audData, Fs=rate, NFFT=1024, cmap=plt.get_cmap('autumn_r'))
cbar=plt.colorbar(im)
plt.xlabel('Time (s)')
plt.ylabel('Frequency (Hz)')
cbar.set_label('Intensity dB')
'''
y, sr = librosa.load(temp_folder)
# NOTE(review): the result of this call is discarded.
librosa.feature.melspectrogram(y=y, sr=sr)
D = np.abs(librosa.stft(y))**2
# NOTE(review): this S is overwritten by the next statement before any use
# within this excerpt.
S = librosa.feature.melspectrogram(S=D, sr=sr)
# Mel spectrogram actually displayed: 128 bands up to the Nyquist frequency.
S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, fmax=sr/2)
# Convert to dB relative to the loudest bin.
S_dB = librosa.power_to_db(S, ref=np.max)
librosa.display.specshow(S_dB, x_axis='time',y_axis='mel', sr=sr, fmax=sr/2)
#plt.yticks(np.arange(0, 10000, step=1000))
plt.colorbar(format='%+2.0f dB')
plt.title('Spectrogram')
plt.show()
plt.close()
# Start a fresh figure for whatever is plotted next.
fig = plt.figure(figsize=(20, 20))
plt.subplots_adjust(hspace = 0.2, wspace = 0.2)
def specgram(audio, n_fft=512, hop_length=None, mask=True, log_mag=True,
             re_im=False, dphase=True, mag_only=False):
    """Spectrogram using librosa.

    Args:
      audio: 1-D array of float32 sound samples.
      n_fft: Size of the FFT.
      hop_length: Stride of FFT. Defaults to n_fft/2.
      mask: Mask the phase derivative by the magnitude.
      log_mag: Use the logamplitude.
      re_im: Output Real and Imag. instead of logMag and dPhase.
      dphase: Use derivative of phase instead of phase.
      mag_only: Don't return phase.

    Returns:
      specgram: [n_fft/2 + 1, audio.size / hop_length, 2]. The first channel
        is the logamplitude and the second channel is the derivative of phase.
        NOTE(review): with ``mag_only`` a second trailing axis is appended to
        the magnitude, so the result has four dimensions - confirm intended.
    """
    stride = hop_length if hop_length else int(n_fft / 2.)
    stft = librosa.stft(audio, n_fft=n_fft, win_length=n_fft,
                        hop_length=stride, center=True)

    if re_im:
        # Real / imaginary parts stacked along a new last axis.
        real_part = stft.real[:, :, np.newaxis]
        imag_part = stft.imag[:, :, np.newaxis]
        return np.concatenate((real_part, imag_part), axis=2)

    mag, phase = librosa.core.magphase(stft)
    angle = np.angle(phase)

    # Magnitudes, scaled 0-1
    if log_mag:
        log_power = librosa.power_to_db(
            mag**2, amin=1e-13, top_db=120., ref=np.max)
        mag = (log_power / 120.) + 1
    else:
        mag /= mag.max()

    if dphase:
        # Derivative of phase: first frame kept, then frame-to-frame deltas.
        unwrapped = np.unwrap(angle)
        deltas = unwrapped[:, 1:] - unwrapped[:, :-1]
        p = np.concatenate([unwrapped[:, 0:1], deltas], axis=1) / np.pi
    else:
        # Normal phase
        p = angle / np.pi

    # Mask the phase
    if log_mag and mask:
        p = mag * p

    # Return Mag and Phase
    p = p.astype(np.float32)[:, :, np.newaxis]
    mag = mag.astype(np.float32)[:, :, np.newaxis]
    if mag_only:
        return mag[:, :, np.newaxis]
    return np.concatenate((mag, p), axis=2)
# Accuracy of the classifier on the PCA-projected test set: fraction of
# predictions equal to the labels.
test_preds = classifier.predict(test_pca)
test_acc = np.sum(test_preds == y_te)
test_acc = test_acc / len(y_te)
# Prepare a single test song for prediction.
scale_file = "debussy2ms.wav"
# NOTE(review): the rate returned here (``sr``) is not used below; the
# constant ``SR`` is passed instead - confirm they agree.
scale, sr = lb.load(scale_file)
#S_scale = librosa.stft(scale, n_fft=FRAME_SIZE, hop_length=HOP_SIZE)
mel_spectrogram_TestSong = lb.feature.melspectrogram(scale, sr=SR, n_fft=N_FFT, hop_length=HOP_LENGTH, n_mels=N_MELS)
#trained = scaler.transform(mel_spectrogram_TestSong)
# NOTE(review): the mel spectrogram is squared before power_to_db; if it is
# already a power spectrogram this doubles every dB value - confirm.
melspectrogram_TestSong = lb.power_to_db(mel_spectrogram_TestSong**2)
# Flatten to a single row: (1, n_rows * n_columns).
melspectrogram_TestSong = melspectrogram_TestSong.reshape(
    1, melspectrogram_TestSong.shape[0] * melspectrogram_TestSong.shape[1])
print("Shape that i pass to predicter", melspectrogram_TestSong.shape)
# NOTE(review): a new scaler is fitted on this one-row matrix rather than
# reusing a scaler fitted on training data - verify this is intended.
scaler1 = StandardScaler()
scaler1.fit(melspectrogram_TestSong)
melspectrogram_TestSong = scaler1.transform(melspectrogram_TestSong)
#pca2 = PCA(n_components = 1)
#pca2.fit(melspectrogram_TestSong)
#mel_spectrogram_TestSong = pca2.transform(melspectrogram_TestSong)
# The PCA step above is commented out, so this prints the scaled shape.
print("after pca", melspectrogram_TestSong.shape)
melspectrogram_TestSong.shape
def detectionOnsets(y):
    """Detect note onsets in ``y`` and return their times.

    A dB-scaled constant-Q spectrogram is computed, the frame-to-frame
    absolute spectral difference is summed per frame, and frames exceeding
    an adaptive median threshold are kept as onsets. Onsets closer than
    ``T`` to each other are then thinned out, keeping the stronger one.

    Relies on names defined outside this function: ``Notemin``, ``Notemax``,
    ``BINS_PER_OCTAVE``, ``sr``, ``STEP``, ``H``, ``ALPHA``, ``BETA``, ``T``,
    ``T_att``, ``norm_spectre``, ``plot_onsets`` and ``stat``.

    Args:
        y: audio signal passed to ``librosa.cqt``.

    Returns:
        Onset times computed by ``librosa.frames_to_time`` from the fixed
        onset frames.
    """
    fmin = librosa.note_to_hz(Notemin)
    fmax = librosa.note_to_hz(Notemax)
    #Nmin = int((sr/(fmax*(2**(1/BINS_PER_OCTAVE)-1))))
    #Nmax = int((sr/(fmin*(2**(1/BINS_PER_OCTAVE)-1))))
    # Number of CQT bins covering the Notemin..Notemax range.
    n_bins = int(
        (librosa.note_to_midi(Notemax) - librosa.note_to_midi(Notemin)) *
        BINS_PER_OCTAVE / 12)
    # Despite its name, Chrom holds the dB-scaled CQT magnitude.
    Chrom = librosa.amplitude_to_db(np.abs(
        librosa.cqt(y=y,
                    sr=sr,
                    hop_length=STEP,
                    fmin=fmin,
                    bins_per_octave=BINS_PER_OCTAVE,
                    n_bins=n_bins)),
                                    ref=np.max)
    Nf = len(Chrom)
    N = len(Chrom[0])
    Diff = np.zeros((Nf, N))
    Dev = np.zeros(N)
    # Dev[j]: sum over bins of the absolute change between frames j-1 and j.
    for j in range(1, N):
        for i in range(Nf):
            Diff[i, j] = np.abs(Chrom[i, j] - Chrom[i, j - 1])
        Dev[j] = sum(Diff[:, j])

    # THRESHOLD FUNCTION
    # Zero-pad the head and the tail
    l = []
    Seuil = []
    Onsets = []
    for k in range(int(H / 2)):
        l.append(0)
    for val in Dev:
        l.append(val)
    for k in range(int(H / 2)):
        l.append(0)
    # Median computation: sliding window of length H over the padded curve
    for i in range(N):
        Seuil.append(ALPHA + BETA * stat.median(l[i:i + H]))
        if Dev[i] > Seuil[i]:
            Onsets.append(i)

    times = librosa.frames_to_time(np.arange(N), sr=sr, hop_length=STEP)

    # ONSET FILTERING
    # While two consecutive onsets are less than T apart, delete the one
    # with the smaller deviation (the later one is kept on ties).
    i = 0
    while i < (len(Onsets) - 1):
        while (i < (len(Onsets) - 1)) and (times[Onsets[i + 1]] <
                                           times[Onsets[i]] + T):
            if Dev[Onsets[i + 1]] < Dev[Onsets[i]]:
                del Onsets[i + 1]
            else:
                del Onsets[i]
        i = i + 1

    onset_frames = librosa.util.fix_frames(Onsets,
                                           x_min=0,
                                           x_max=Chrom.shape[1] - 1)
    onset_times = librosa.frames_to_time(onset_frames,
                                         sr=sr,
                                         hop_length=STEP)

    # Synchronise on the onsets, dropping the start and the end of the
    # long frames
    ChromSync = np.zeros((Nf, len(onset_frames) - 1))
    n_att = int(librosa.time_to_frames(T_att, sr=sr, hop_length=STEP))
    for j in range(len(onset_frames) - 1):
        for i in range(Nf):
            # NOTE(review): the slice is empty when two onsets are at most
            # 2 * n_att frames apart - confirm how that case should behave.
            ChromSync[i, j] = np.mean(Chrom[i][(onset_frames[j] +
                                                n_att):(onset_frames[j + 1] -
                                                        n_att)])

    # Spectrum normalisation
    # ChromSync[:,1] = librosa.power_to_db(librosa.db_to_power(ChromSync[:,1]) / np.sum(librosa.db_to_power(ChromSync[:,1])))
    if norm_spectre:
        # Each column is rescaled, in the power domain, by its total power.
        for j in range(ChromSync.shape[1]):
            ChromSync[:, j] = librosa.power_to_db(
                librosa.db_to_power(ChromSync[:, j]) /
                np.sum(librosa.db_to_power(ChromSync[:, j])))

    # Display
    if plot_onsets:
        plt.figure(figsize=(13, 7))
        ax1 = plt.subplot(3, 1, 1)
        librosa.display.specshow(Chrom,
                                 bins_per_octave=BINS_PER_OCTAVE,
                                 fmin=fmin,
                                 y_axis='cqt_note',
                                 x_axis='time',
                                 x_coords=times)
        plt.title('CQT spectrogram')

        plt.subplot(3, 1, 2, sharex=ax1)
        plt.plot(times, Dev, label='Deviation')
        plt.plot(times, Seuil, color='g', label='Seuil')
        plt.vlines(times[Onsets],
                   0,
                   Dev.max(),
                   color='r',
                   alpha=0.9,
                   linestyle='--',
                   label='Onsets')
        plt.axis('tight')
        plt.legend(frameon=True, framealpha=0.75)

        ax1 = plt.subplot(3, 1, 3, sharex=ax1)
        librosa.display.specshow(ChromSync,
                                 bins_per_octave=BINS_PER_OCTAVE,
                                 fmin=fmin,
                                 y_axis='cqt_note',
                                 x_axis='time',
                                 x_coords=onset_times)
        plt.show()

    return onset_times
def get_log_spectrum(x):
    """Return the dB-scaled power STFT of ``x``.

    The STFT uses a 2048-sample FFT and window with a hop of 512 samples.
    """
    stft = librosa.core.stft(x, n_fft=2048, win_length=2048, hop_length=512)
    power = np.abs(stft)**2
    return librosa.power_to_db(power)
def test_melspectrogram_correctness(
    n_fft, sr, hop_length, n_ch, data_format, amin, dynamic_range, n_mels, mel_f_min, mel_f_max
):
    """Test the correctness of melspectrogram.

    The layer output is compared with ``librosa.feature.melspectrogram``,
    first without and then with decibel scaling.

    Note that mel filterbank is tested separated
    """

    def _get_melgram_model(return_decibel, amin, dynamic_range, input_shape=None):
        # compute with kapre
        melgram_model = get_melspectrogram_layer(
            n_fft=n_fft,
            sample_rate=sr,
            n_mels=n_mels,
            mel_f_min=mel_f_min,
            mel_f_max=mel_f_max,
            win_length=win_length,
            hop_length=hop_length,
            input_data_format=data_format,
            output_data_format=data_format,
            return_decibel=return_decibel,
            input_shape=input_shape,
            db_amin=amin,
            db_dynamic_range=dynamic_range,
        )
        return melgram_model

    src_mono, batch_src, input_shape = get_audio(data_format=data_format, n_ch=n_ch)

    win_length = n_fft  # test with x2
    # compute with librosa
    # (audio passed as ``y=``: librosa >= 0.10 rejects the positional form,
    # and the keyword form works on older releases too)
    S_ref = librosa.feature.melspectrogram(
        y=src_mono,
        sr=sr,
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=win_length,
        center=False,
        power=1.0,
        n_mels=n_mels,
        fmin=mel_f_min,
        fmax=mel_f_max,
    ).T

    S_ref = np.expand_dims(S_ref, axis=2)  # time, freq, ch=1
    S_ref = np.tile(S_ref, [1, 1, n_ch])  # time, freq, ch=n_ch

    if data_format == 'channels_first':
        S_ref = np.transpose(S_ref, (2, 0, 1))  # ch, time, freq

    # melgram
    melgram_model = _get_melgram_model(
        return_decibel=False, input_shape=input_shape, amin=None, dynamic_range=120.0
    )

    S = melgram_model.predict(batch_src)[0]  # 3d representation
    np.testing.assert_allclose(S_ref, S, atol=1e-4)

    # log melgram
    melgram_model = _get_melgram_model(
        return_decibel=True, input_shape=input_shape, amin=amin, dynamic_range=dynamic_range
    )

    S = melgram_model.predict(batch_src)[0]  # 3d representation
    S_ref_db = librosa.power_to_db(S_ref, ref=1.0, amin=amin, top_db=dynamic_range)

    np.testing.assert_allclose(
        S_ref_db, S, rtol=3e-3
    )  # decibel is evaluated with relative tolerance
sr=sr, n_mels=128, ) fig, ax = plt.subplots(figsize=(30, 10)) D = librosa.amplitude_to_db(np.abs(librosa.stft(y)), ref=np.max) img = librosa.display.specshow(D, y_axis='linear', x_axis='time', sr=sr, ax=ax, cmap='viridis') plt.figure(dpi=1200) plt.show() fig, ax = plt.subplots() S_dB = librosa.power_to_db(S, ref=np.max) img = librosa.display.specshow(S_dB, x_axis='time', y_axis='mel', sr=sr, ax=ax, cmap='viridis') fig.colorbar(img, ax=ax, format='%+2.0f dB') ax.set_title('Mel-Frequency Spectrogram', size=20) ax.set_ylabel('log Hz', size=15) ax.set_xlabel('Time', size=15) plt.show() plt.show()
def getAudioSamples(fn, min_dur=50, max_dur=-1, fft=2048, hop_length=512,
                    backtrack=True, superFlux=True, y=None, sr=None,
                    delta=0.07):
    """Split an audio file into onset-delimited samples.

    Args:
        fn: audio file name; its base name is stored in every sample.
        min_dur: minimum sample duration in milliseconds; shorter ones are
            dropped.
        max_dur: when positive, longer samples are clipped to this many
            milliseconds.
        fft: FFT size for the superflux mel spectrogram.
        hop_length: hop length in samples for all onset computations.
        backtrack: forwarded to ``librosa.onset.onset_detect``.
        superFlux: use the superflux onset envelope instead of the librosa
            default.
        y: already-loaded audio data; loaded from ``fn`` when ``y`` or
            ``sr`` is None.
        sr: sampling rate of ``y``.
        delta: forwarded to ``librosa.onset.onset_detect``.

    Returns:
        tuple: ``(samples, y, sr)`` where ``samples`` is a list of dicts
        with ``filename``, ``start`` and ``dur`` (both in milliseconds).
        ``samples`` is empty when the audio cannot be decoded or has no
        duration.
    """
    basename = os.path.basename(fn)
    fn = getAudioFile(fn)
    duration = 0

    # load audio
    if y is None or sr is None:
        try:
            y, sr = loadAudioData(fn)
            duration = int(getDurationFromAudioData(y, sr) * 1000)
        except audioop.error:
            duration = 0
            y = None
            sr = None
    else:
        # Caller supplied the audio: the duration still has to be measured,
        # otherwise it stays 0 and no sample is ever returned.
        duration = int(getDurationFromAudioData(y, sr) * 1000)

    if duration <= 0:
        return ([], y, sr)

    # retrieve onsets using superflux method
    # https://librosa.github.io/librosa/auto_examples/plot_superflux.html#sphx-glr-auto-examples-plot-superflux-py
    # http://dafx13.nuim.ie/papers/09.dafx2013_submission_12.pdf
    if superFlux:
        lag = 2
        n_mels = 138
        fmin = 27.5
        fmax = 16000.0
        max_size = 3
        # Audio passed as ``y=``: the positional form is rejected by
        # librosa >= 0.10.
        S = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=fft,
                                           hop_length=hop_length, fmin=fmin,
                                           fmax=fmax, n_mels=n_mels)
        odf = librosa.onset.onset_strength(
            S=librosa.power_to_db(S, ref=np.max), sr=sr,
            hop_length=hop_length, lag=lag, max_size=max_size)
        onsets = librosa.onset.onset_detect(onset_envelope=odf, sr=sr,
                                            hop_length=hop_length,
                                            backtrack=backtrack, delta=delta)
    # retrieve onsets using default method
    else:
        onsets = librosa.onset.onset_detect(y=y, sr=sr, hop_length=hop_length,
                                            backtrack=backtrack, delta=delta)

    # Onset frames -> milliseconds.
    times = [
        int(round(1.0 * hop_length * onset / sr * 1000)) for onset in onsets
    ]
    # add the end of the audio
    times.append(duration - 1)

    # Each sample spans two consecutive boundaries.
    samples = []
    for prev, t in zip(times, times[1:]):
        dur = t - prev
        if max_dur > 0 and dur > max_dur:
            dur = max_dur
        if dur >= min_dur:
            samples.append({
                "filename": basename,
                "start": prev,
                "dur": dur
            })

    return (samples, y, sr)
temp_signal = signal[:3 * Config.sr] # take 3s chunks mask[:int(3 * Config.sr - 1)] = False # go forward mask out the first 3s chunk signal = signal[mask] #compute mel-spectrogram mel_spec = librosa.feature.melspectrogram( temp_signal, sr=Config.sr, n_fft=Config.n_fft, hop_length=Config.hop_length, n_mels=Config.n_mels, fmin=Config.fmin, fmax=Config.fmax) # compute log mel spectrogram log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max) # normalization norm_log_mel_spec = librosa.util.normalize(log_mel_spec) #plot and save plot plt.figure(figsize=Config.figsize, dpi=Config.dpi) librosa.display.specshow(norm_log_mel_spec, fmin=Config.fmin, fmax=Config.fmax, sr=Config.sr, hop_length=Config.hop_length, cmap=Config.color) fig = plt.gcf()
def feature_extraction(y, fs=44100, statistics=True, include_mfcc0=True,
                       include_delta=True, include_acceleration=True,
                       mfcc_params=None, delta_params=None,
                       acceleration_params=None):
    """Extract MFCC-based features (static, delta, acceleration) from audio.

    Args:
        y: audio signal.
        fs: sampling rate used to build the mel filterbank.
        statistics: also return summary statistics of the feature matrix.
        include_mfcc0: keep the 0th coefficient row when True.
        include_delta: append first-order delta coefficients.
        include_acceleration: append second-order delta coefficients.
        mfcc_params: dict with ``n_fft``, ``hop_length``, ``n_mels``,
            ``fmin``, ``fmax``, ``htk`` and ``n_mfcc``. Required.
        delta_params: extra keyword arguments for the delta computation;
            ``None`` means none.
        acceleration_params: extra keyword arguments for the acceleration
            computation; ``None`` means none.

    Returns:
        dict: ``{'feat': matrix}`` with one row per frame, plus a ``'stat'``
        dict (mean, std, N, S1, S2) when ``statistics`` is True.

    Raises:
        TypeError: if ``mfcc_params`` is not provided.
    """
    if mfcc_params is None:
        # Fail with a clear message instead of "'NoneType' object is not
        # subscriptable" a few lines further down.
        raise TypeError("feature_extraction() requires 'mfcc_params'")
    delta_params = {} if delta_params is None else delta_params
    acceleration_params = ({} if acceleration_params is None
                           else acceleration_params)

    eps = numpy.spacing(1)

    # Window function. ``scipy.signal.windows`` is the supported location;
    # the ``scipy.signal.hamming`` alias has been removed from recent SciPy.
    window = scipy.signal.windows.hamming(mfcc_params['n_fft'], sym=False)

    # Compute the static coefficients
    # librosa.stft -> short-time Fourier transform
    # librosa.filters.mel -> builds the mel filterbank
    power_spectrogram = numpy.abs(
        librosa.stft(
            y + eps,
            n_fft=mfcc_params['n_fft'],
            #win_length=mfcc_params['win_length'],
            hop_length=mfcc_params['hop_length'],
            center=True,
            window=window))**2
    mel_basis = librosa.filters.mel(sr=fs,
                                    n_fft=mfcc_params['n_fft'],
                                    n_mels=mfcc_params['n_mels'],
                                    fmin=mfcc_params['fmin'],
                                    fmax=mfcc_params['fmax'],
                                    htk=mfcc_params['htk'])
    mel_spectrum = numpy.dot(mel_basis, power_spectrogram)
    mfcc = librosa.feature.mfcc(S=librosa.power_to_db(mel_spectrum),
                                n_mfcc=mfcc_params['n_mfcc'])

    # Collect the feature matrix
    feature_matrix = mfcc
    if include_delta:
        # Delta coefficients (first derivative)
        mfcc_delta = librosa.feature.delta(mfcc, **delta_params)

        # Append the delta coefficients to the feature matrix
        feature_matrix = numpy.vstack((feature_matrix, mfcc_delta))

    if include_acceleration:
        # Acceleration coefficients (second derivative)
        mfcc_delta2 = librosa.feature.delta(mfcc,
                                            order=2,
                                            **acceleration_params)

        # Append the acceleration coefficients to the feature matrix
        feature_matrix = numpy.vstack((feature_matrix, mfcc_delta2))

    if not include_mfcc0:
        # Omit mfcc0
        feature_matrix = feature_matrix[1:, :]

    # Frames become rows.
    feature_matrix = feature_matrix.T

    # Collect into data structure
    if statistics:
        return {
            'feat': feature_matrix,
            'stat': {
                'mean': numpy.mean(feature_matrix, axis=0),
                'std': numpy.std(feature_matrix, axis=0),
                'N': feature_matrix.shape[0],
                'S1': numpy.sum(feature_matrix, axis=0),
                'S2': numpy.sum(feature_matrix**2, axis=0),
            }
        }
    else:
        return {'feat': feature_matrix}
desc=label): f = row['path'] try: data, sr = lr.load(os.path.join(root, label, 'clips', f), sr=SR, mono=True, dtype=np.float32, res_type='kaiser_fast') data, _ = lr.effects.trim( data) # trim leading and trailing silence mel_specgram = lr.feature.melspectrogram( data, n_mels=64, hop_length=hop_in_samples, n_fft=n_fft) mfcc = lr.feature.mfcc(S=lr.power_to_db(mel_specgram), sr=SR, n_mfcc=n_channels, n_dim=1) # plt.imshow(mfcc.T, cmap='viridis', aspect='auto') # plt.savefig('MFCC_test_{}.png'.format(label)) if METHOD is 'h5': writer.create_dataset(str(idx), data=mfcc.reshape(-1)) elif METHOD is 'tfrecord': mfcc_feature = tf.train.Feature( float_list=tf.train.FloatList( value=mfcc.reshape(-1).tolist())) tf_label = tf.train.Feature( bytes_list=tf.train.BytesList( value=[label.encode('utf-8')])) age = '' if type(row['age']) == float else row[
def power_to_db(self, S, ref=1.0, amin=1e-10, top_db=80.0):
    """Convert ``S`` to decibels by delegating to ``librosa.power_to_db``.

    ``ref``, ``amin`` and ``top_db`` are forwarded unchanged.
    """
    options = dict(ref=ref, amin=amin, top_db=top_db)
    return librosa.power_to_db(S, **options)
spec_augment_tensorflow.visualization_spectrogram( warped_masked_spectrogram, 'after') shape = warped_masked_spectrogram.shape[ 0] * warped_masked_spectrogram.shape[1] if shape > max_shape: max_shape = shape x_train[count] = [0] * shape shape = 0 for i in range(warped_masked_spectrogram.shape[0]): for j in range(warped_masked_spectrogram.shape[1]): x_train[count][shape] = warped_masked_spectrogram[i][j] shape - shape + 1 count = count + 1 librosa.display.specshow(librosa.power_to_db(melspec, ref=np.max)) pylab.savefig(save_path, bbox_inches=None, pad_inches=0) pylab.close() x_train = x_train.reshape(1, num_of_files, max_shape) labels = np.zeros((1, num_of_files, 1)) for i in range(num_of_files): labels[0][i][0] = random.randint(0, 9) continue from AudioDataGenerator import AudioDataGenerator datagen = AudioDataGenerator(featurewise_center=True, featurewise_std_normalization=True, shift=.2,
def _normalize(audio):
    """Convert ``audio`` to dB relative to its maximum and rescale it.

    The dB values are mapped through ``(db + 80) / 80``.
    NOTE(review): this presumably targets [0, 1], assuming power_to_db
    clips at 80 dB below the peak - confirm.
    """
    decibels = librosa.power_to_db(audio, ref=np.max)
    return (decibels + 80) / 80
# ```
# librosa.load(audio_path, sr=None)
# ```
# to disable resampling.

# # Mel spectrogram
# This first step will show how to compute a [Mel](http://en.wikipedia.org/wiki/Mel_scale) spectrogram from an audio waveform.

# In[4]:

# Let's make and display a mel-scaled power (energy-squared) spectrogram
# (audio is passed as ``y=``: the positional form is rejected by
# librosa >= 0.10, the keyword form works on every release)
S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)

# Convert to log scale (dB). We'll use the peak power (max) as reference.
log_S = librosa.power_to_db(S, ref=np.max)

# Make a new figure
plt.figure(figsize=(12,4))

# Display the spectrogram on a mel scale
# sample rate and hop length parameters are used to render the time axis
librosa.display.specshow(log_S, sr=sr, x_axis='time', y_axis='mel')

# Put a descriptive title on the plot
plt.title('mel power spectrogram')

# draw a color bar
plt.colorbar(format='%+02.0f dB')

# Make the figure layout compact
# Top panel: dB-scaled STFT magnitude on a log-frequency axis.
plt.subplot(211)
plt.title('Spectrogram')
D = librosa.amplitude_to_db(np.abs(librosa.stft(y)), ref=np.max)
librosa.display.specshow(D, x_axis='time', y_axis='log')
# Bottom panel: waveform.
plt.subplot(212)
plt.title('Audioform')
librosa.display.waveplot(y, sr=sr)
# NOTE(review): from here on the code reads ``X`` / ``sample_rate`` rather
# than ``y`` / ``sr`` - verify both pairs refer to the intended signal.
# NOTE(review): the result of this call is discarded.
librosa.feature.melspectrogram(y=X, sr=sample_rate)
D = np.abs(librosa.stft(X))**2
# NOTE(review): this S is overwritten by the next statement before any use.
S = librosa.feature.melspectrogram(S=D)
S = librosa.feature.melspectrogram(y=X, sr=sample_rate, n_mels=128,fmax=8000)
# Mel spectrogram figure, in dB relative to the loudest bin.
plt.figure(figsize=(10, 4))
librosa.display.specshow(librosa.power_to_db(S,ref=np.max),y_axis='mel', fmax=8000,x_axis='time')
plt.colorbar(format='%+2.0f dB')
plt.title('Mel spectrogram')
plt.tight_layout()
plt.show()
# Time-stretch with a factor of 2.0 and plot against time in seconds.
y_fast = librosa.effects.time_stretch(X, 2.0)
time = np.arange(0,len(y_fast))/sample_rate
fig, ax = plt.subplots()
ax.plot(time,y_fast)
ax.set(xlabel='Time(s)',ylabel='sound amplitude')
plt.show()#compress to be twice as fast
# Time-stretch with a factor of 0.5.
y_slow = librosa.effects.time_stretch(X, 0.5)
# NOTE(review): divides by ``sr`` here but by ``sample_rate`` above - confirm.
time = np.arange(0,len(y_slow))/sr
fig, ax = plt.subplots()
def save_mel_spectrogram(dataset_path,
                         json_path,
                         num_segments,
                         n_fft=2048,
                         hop_length=512):
    """Extracts MELs from music dataset and saves them into a json file along with genre labels.

    Relies on the module-level constants ``SAMPLES_PER_TRACK``,
    ``SAMPLE_RATE`` and ``NON_GUITAR_LABEL_PATH``.

    :param dataset_path (str): Path to dataset
    :param json_path (str): Path to json file used to save MELs
    :param num_segments (int): Number of segments we want to divide sample tracks into
    :param n_fft (int): Interval we consider to apply FFT. Measured in # of samples
    :param hop_length (int): Sliding window for FFT. Measured in # of samples
    :return: None
    """

    # dictionary to store mapping, labels, and MELs
    data = {"mapping": [], "labels": [], "MEL": []}

    samples_per_segment = int(SAMPLES_PER_TRACK / num_segments)
    num_mel_vectors_per_segment = math.ceil(samples_per_segment / hop_length)

    # generating label list (third whitespace-separated column of the file)
    label_list = pd.read_csv(NON_GUITAR_LABEL_PATH,
                             delimiter=r'\s+',
                             index_col=False,
                             header=None)
    label_list = label_list[2].tolist()

    # generating corresponding mapping: chord -> index in order of first
    # appearance
    voicing = {}
    for chord in label_list:
        voicing.setdefault(chord, len(voicing))
    data["mapping"].append([*voicing])

    # loop through instrument samples
    for instrument in os.listdir(dataset_path):

        # handling the audio files
        if instrument.endswith(".wav"):
            sample_path = os.path.join(dataset_path, instrument)
            signal, sample_rate = librosa.load(sample_path, sr=SAMPLE_RATE)

            # segmenting sample into its constituent 2 second chord voicing
            for s in range(num_segments):

                # calculating start and end sample for each chord voicing
                start = s * samples_per_segment
                end = start + samples_per_segment

                # extract log spaced frequency, log amplitude mel spectrogram
                segment = signal[start:end]
                # ``sr`` was an undefined name here; the rate returned by
                # ``librosa.load`` is ``sample_rate``. ``n_fft`` now honours
                # the parameter (same value by default).
                spectrogram = librosa.feature.melspectrogram(
                    y=segment,
                    hop_length=hop_length,
                    n_fft=n_fft,
                    sr=sample_rate,
                    n_mels=133,
                    window="hann")
                mel_spectrogram = librosa.power_to_db(spectrogram)
                mel_spectrogram = mel_spectrogram.T

                # store only spectrogram with expected number of vectors & append corresponding label
                if len(mel_spectrogram) == num_mel_vectors_per_segment:
                    data["MEL"].append(mel_spectrogram.tolist())
                    print("{}, chord:{}".format(sample_path, s + 1))
                    data["labels"].append(voicing[label_list[s]])

    with open(json_path, "w") as fp:
        json.dump(data, fp, indent=4)
# Walk the two-level directory tree under ``loadpath`` and write one CSV of
# dB-scaled mel features (one row per frame) for every audio file.
for indexA in os.listdir(loadpath):
    for indexB in os.listdir(loadpath + indexA):
        os.makedirs(savepath + indexA + '\\' + indexB)
        for indexC in os.listdir(loadpath + indexA + '\\' + indexB):
            print(indexA, indexB, indexC)
            y, sr = librosa.load(loadpath + indexA + '\\' + indexB + '\\' +
                                 indexC,
                                 sr=16000)
            # Power spectrogram with a Hamming window, frames not centred.
            D = numpy.abs(
                librosa.stft(y,
                             n_fft=n_fft,
                             win_length=win_length,
                             hop_length=hop_length,
                             window=signal.hamming,
                             center=False))**2
            S = librosa.feature.melspectrogram(S=D, n_mels=m_bands)
            gram = librosa.power_to_db(S, ref=numpy.max)
            # Transpose so that each row is one frame.
            gram = numpy.transpose(gram, (1, 0))
            # print(numpy.shape(gram))

            # ``with`` closes the file even if a write fails; each row is
            # joined once instead of written value by value.
            with open(
                    savepath + indexA + '\\' + indexB + '\\' + indexC +
                    '.csv', 'w') as file:
                for row in gram:
                    file.write(','.join(str(value) for value in row))
                    file.write('\n')
i.label = 0 # Split data up into sets that will feed into the network data = [] labels = [] # Getting Input Mel-Spectrograms from music to use in Deep Learning for i in training: ## 256 Mels ## 20s = 862 Frames y, sr = librosa.core.load(library_path + "/" + i.name) segment = y[60 * sr:80 * sr] spectrogram = librosa.feature.melspectrogram(y=segment, sr=sr, n_fft=2048, n_mels=256) log_spectro = librosa.power_to_db(spectrogram**2, ref=1.0) i.spectrogram = log_spectro x = 0 z = 21 for j in range(41): temp = log_spectro[0:256, x:z] data.append(temp) label = i.label labels.append(label) x = z z += 21 for i in testing: ## 256 Mels ## 20s = 862 Frames y, sr = librosa.core.load(library_path + "/" + i.name) segment = y[60 * sr:80 * sr]
print('Number of Training Files: ', len(training_files)) # Loop over files and apply SpecAugment for file in training_files: # Load the audio file audio, sr = librosa.load(file) # Extract Mel Spectrogram Features from the audio file mel_spectrogram = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=256, hop_length=128, fmax=8000) plt.figure(figsize=(14, 6)) librosa.display.specshow(librosa.power_to_db(mel_spectrogram, ref=np.max), x_axis='time', y_axis='mel', fmax=8000) # Base # Apply SpecAugment apply = SpecAugment(mel_spectrogram, args.policy) time_warped = apply.time_warp( ) # Applies Time Warping to the mel spectrogram #plt.figure(figsize=(14, 6)) #librosa.display.specshow(librosa.power_to_db(time_warped[0, :, :, 0].numpy(), ref=np.max), x_axis='time', y_axis='mel', fmax=8000) # Time Warped freq_masked = apply.freq_mask( ) # Applies Frequency Masking to the mel spectrogram
# Load waveforms y, _ = librosa.load(filename, mono=True, sr=sr) # Get spectrogram D = librosa.stft(y=y, n_fft=n_fft, hop_length=hop_length, win_length=win_length) mag = np.abs(D) scaled_mag = mag * 2 # Get mel-spectrogram mel_basis = librosa.filters.mel(sr, n_fft, n_mels) # (n_mels, 1+n_fft//2) mel = np.dot(mel_basis, mag ** 1) # (n_mels, t) # mel spectrogram scaled_mel = np.dot(mel_basis, scaled_mag ** 1) # Get mfccs db = librosa.power_to_db(mel) scaled_db = librosa.power_to_db(scaled_mel) mfccs = np.dot(librosa.filters.dct(n_mfcc, db.shape[0]), mel) scaled_mfccs = np.dot(librosa.filters.dct(n_mfcc, db.shape[0]), scaled_mel) mfccs = mfccs.T # (t, n_mfccs) scaled_mfccs = scaled_mfccs.T assert(np.all(mfccs * 2 == scaled_mfccs)) print(mfccs) print(scaled_mfccs)
# Key estimation from the chromagram of the harmonic component.
harmonic = librosa.effects.harmonic(y, margin=8)
chromagram = librosa.feature.chroma_cqt(y=harmonic, sr=sr)
note, mode = key(chromagram)
##Beat_srength & Tempo
onset_env = librosa.onset.onset_strength(y, sr=sr)
tempo = librosa.beat.tempo(onset_envelope=onset_env, aggregate=None)
##Power & Loudness
S = librosa.stft(y, center=False)
power = np.abs(S)**2
# Despite its name, p_mean is the per-frame sum of power over all bins.
p_mean = np.sum(power, axis=0, keepdims=True)
p_ref = np.max(
    power)  # or whatever other reference power you want to use
loudness = librosa.power_to_db(p_mean, ref=p_ref)
tonnetz = np.mean(librosa.feature.tonnetz(y=harmonic, sr=sr))
#Artist, Title, Album extraction
# The file name is split on '-' into artist, song and optional album.
filename = filename.split('-')
artist = filename[0]
song = filename[1]
if len(filename) > 2:
    album = filename[2]
else:
    album = song
artist = artist.split(',')
# NOTE(review): the last four characters (presumably the file extension)
# are removed from ``album`` only; when the name has just two parts,
# ``song`` keeps them - confirm.
album = album[:-4]
song, album = song.strip(), album.strip()
album, song = album.lower(), song.lower()
def get_melspec(spec, n_mels):
    """Return the dB-scaled mel spectrogram of an STFT.

    Args:
        spec: STFT matrix (its squared magnitude is used).
        n_mels: number of mel bands.

    Returns:
        Mel power spectrogram in dB relative to its maximum.
    """
    # Power spectrum
    powerspec = np.abs(spec)**2
    melspec = librosa.feature.melspectrogram(S=powerspec, n_mels=n_mels)
    # ``ref`` is passed by keyword: the positional form is rejected by
    # librosa >= 0.10, the keyword form works on every release.
    return librosa.power_to_db(melspec, ref=np.max)
sr=RATE, n_mels=128, fmax=8000) plt.subplot(211) ax1.set_ylim(yrange) plt.plot(full) if args.vis == 'mfcc': plt.subplot(212) librosa.display.specshow(vis, x_axis='time') plt.colorbar() elif args.vis == 'spec': plt.subplot(212) librosa.display.specshow(librosa.power_to_db(vis, ref=np.max), y_axis='mel', fmax=8000, x_axis='time') plt.colorbar(format='%+2.0f dB') plt.pause(0.01) previous = data_int end = time.time() tot.append(end - start) print("Time taken =", end - start) print("finished recording") print("Total time =", sum(tot))
def __test(y_true, x, rp):
    """Assert that ``power_to_db(x, ref=rp)`` without clipping is close to ``y_true``."""
    converted = librosa.power_to_db(x, ref=rp, top_db=None)
    assert np.isclose(converted, y_true)
fmax = 16000.
max_size = 3

########################################################
# The paper uses a log-frequency representation, but for
# simplicity, we'll use a Mel spectrogram instead.
# (audio is passed as ``y=``: the positional form is rejected by
# librosa >= 0.10, the keyword form works on every release)
S = librosa.feature.melspectrogram(y=y,
                                   sr=sr,
                                   n_fft=n_fft,
                                   hop_length=hop_length,
                                   fmin=fmin,
                                   fmax=fmax,
                                   n_mels=n_mels)

plt.figure(figsize=(6, 4))
librosa.display.specshow(librosa.power_to_db(S, ref=np.max),
                         y_axis='mel',
                         x_axis='time',
                         sr=sr,
                         hop_length=hop_length,
                         fmin=fmin,
                         fmax=fmax)
plt.tight_layout()

################################################################
# Now we'll compute the onset strength envelope and onset events
# using the librosa defaults.
odf_default = librosa.onset.onset_strength(y=y, sr=sr, hop_length=hop_length)
onset_default = librosa.onset.onset_detect(y=y,
                                           sr=sr,
                                           hop_length=hop_length,
                                           units='time')

#########################################
# And similarly with the superflux method
def load_mel_spectrogram_db(path, config):
    """Load a mel spectrogram and return it on a decibel scale.

    As a side effect, the maximum of the loaded spectrogram is stored in
    ``config['ref_power']``.

    Args:
        path: forwarded unchanged to ``load_mel_spectrogram``.
        config: mutable mapping forwarded to ``load_mel_spectrogram`` and
            updated with the ``'ref_power'`` entry.

    Returns:
        A ``(spectrogram_db, mel_filters)`` tuple; the dB values are relative
        to the spectrogram's own maximum.
    """
    power_spec, filterbank = load_mel_spectrogram(path, config)

    # Record the peak so callers can recover the reference level later.
    peak = np.max(power_spec)
    config['ref_power'] = peak

    spec_db = lr.power_to_db(power_spec, ref=np.max)
    return spec_db, filterbank
import numpy as np
import matplotlib.pyplot as plt
from glob import glob
import librosa as lr
import librosa.display

# Base name (without extension) of the WAV file in the current directory.
audio = 'arabic6'
# Upper frequency bound shared by the mel filterbank and the plot axis, so
# the two cannot drift apart.
FMAX = 8000

y, sr = lr.load('./{}.wav'.format(audio))

# Mel power spectrogram straight from the waveform.  Earlier revisions also
# computed an STFT and two further mel spectrograms whose results were
# discarded or immediately overwritten; that wasted work is gone.
S = lr.feature.melspectrogram(y=y, sr=sr, n_mels=128, fmax=FMAX)

plt.figure(figsize=(10, 4))
# Show the spectrogram in dB relative to its peak value.
lr.display.specshow(lr.power_to_db(S, ref=np.max),
                    y_axis='mel', fmax=FMAX, x_axis='time')
plt.colorbar(format='%+2.0f dB')
plt.title('Mel spectrogram')
plt.tight_layout()
plt.show()
def build_mfcc_fingerprint(data, n_mels):
    """Return the log-scaled (dB) mel spectrogram of an audio signal.

    Despite the name, no cepstral (MFCC) transform is applied here; the
    result is a mel power spectrogram converted to decibels.

    Args:
        data: audio time series, analysed at ``SAMPLE_RATE``.
        n_mels: number of mel bands.  This argument used to be ignored in
            favour of a hard-coded 40; it is now honoured.

    Returns:
        The mel spectrogram in dB relative to its own maximum.
    """
    # The audio is passed by keyword: it is keyword-only in recent librosa.
    mel = librosa.feature.melspectrogram(y=data, sr=SAMPLE_RATE, n_mels=n_mels)
    return librosa.power_to_db(mel, ref=np.max)
def _draw_spec(data, cmap, verbose, axis_labels, colorbar_format=None,
               **specshow_kwargs):
    '''
    Draw one spectrogram-like matrix with librosa.display.specshow.

    In verbose mode the matrix is drawn on the current axes with the given
    axis decorations and a colorbar. Otherwise a new figure is created whose
    axes fill the whole canvas, and axes are switched off.

    Args:
        data (np.ndarray): matrix to display
        cmap: colormap handed to specshow
        verbose (bool): whether to add axis labels and a colorbar
        axis_labels (dict): x_axis / y_axis keyword arguments, used only in
                            verbose mode
        colorbar_format (str): tick format of the colorbar (None keeps the
                               matplotlib default)
        **specshow_kwargs: extra keyword arguments used in both modes
                           (e.g. sr)
    '''
    if verbose:
        librosa.display.specshow(data, cmap=cmap, **specshow_kwargs,
                                 **axis_labels)
        plt.colorbar(format=colorbar_format)
    else:
        fig, ax = plt.subplots(1)
        fig.subplots_adjust(left=0, right=1, bottom=0, top=1)
        ax.axis('off')
        librosa.display.specshow(data, cmap=cmap, **specshow_kwargs)
        plt.axis('off')


def graph_audio(f, opt, y=None, sr=None, show=True, shape=None, dest=None,
                ext=None, verbose=True):
    '''
    This function generates various audio representation graphs for
    specified .wav files (or given audio time series and sampling rate
    values). It also accepts an optional parameter to save the generated
    graphs to categorized directories based on the corresponding emotion
    conveyed in the audio sample.

    Args:
        f (str): the absolute path to the input .wav file
        opt (str): the type of audio graph representation to be generated
                   ("spect" => spectrogram, "mp_spect" => mel-power
                   spectrogram, "cqt" => constant-Q transform, "chrom" =>
                   chromagram, "mfcc" => MFCC intensity values)
        y (np.ndarray): supplied audio time series; optional
        sr (int): supplied sampling rate of audio time series y; optional.
                  If either y or sr is missing, both are loaded from f.
        show (bool): specifies whether or not to show the resulting graph
                     (default is True, which always depicts the resulting
                     graph)
        shape (tuple(int, int)): the dimensions (in inches) of the image
                                 to display
        dest (str): if a value is given, this will serve as the path of the
                    root directory to write to (default value is None,
                    which does not save the resulting graph)
        ext (int): if supplied, adds "..._<ext>.png" to saved audio file
                   (a falsy value such as 0 adds no suffix)
        verbose (bool): specifies whether or not to add axis labels, ticks,
                        and colorbars to resulting plots (default value is
                        True, which adds the aforementioned details)

    Returns:
        None (function may display a graph and / or save resulting graph
        file to a specified directory)
    '''
    # Identity checks: `None in [y, sr]` compares an ndarray against None
    # element-wise and raises "truth value of an array is ambiguous".
    if y is None or sr is None:
        y, sr = librosa.load(f)
    cmap = cm.get_cmap('viridis')
    db_format = '%+2.0f dB'

    # Spectrogram
    if opt == 'spect':
        # NOTE(review): np.log is the natural logarithm although the
        # colorbar is labelled in dB -- confirm the intended scale.
        log_spect = np.log(get_spectrogram(y))
        _draw_spec(log_spect, cmap, verbose,
                   {'x_axis': 'time', 'y_axis': 'linear'}, db_format, sr=sr)

    # Mel Power Spectrogram
    elif opt == 'mp_spect':
        # Audio by keyword: keyword-only in recent librosa releases.
        S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)
        log_S = librosa.power_to_db(S, ref=np.max)
        _draw_spec(log_S, cmap, verbose,
                   {'x_axis': 'time', 'y_axis': 'mel'}, db_format, sr=sr)

    # Constant-Q Transform
    elif opt == 'cqt':
        # `sr` by keyword: keyword-only in recent librosa releases.
        C = librosa.cqt(y, sr=sr)
        # NOTE(review): amplitude_to_db is applied to the squared (complex)
        # CQT, as before -- confirm this scaling is what is wanted.
        _draw_spec(librosa.amplitude_to_db(C**2), cmap, verbose,
                   {'x_axis': 'time', 'y_axis': 'cqt_note'}, db_format)

    # Chromagram
    elif opt == 'chrom':
        C = np.abs(librosa.cqt(y, sr=sr))
        chroma = librosa.feature.chroma_cqt(C=C, sr=sr)
        _draw_spec(chroma, cmap, verbose,
                   {'x_axis': 'time', 'y_axis': 'chroma'})

    # MFCC Intensity
    elif opt == 'mfcc':
        raw_mfcc = librosa.feature.mfcc(y=y, sr=sr)
        scaled = scale(raw_mfcc, axis=1)
        _draw_spec(scaled, cmap, verbose, {'x_axis': 'time'}, sr=sr)

    # Always grab the figure now: previously `fig` was unbound when
    # verbose=True and shape was not given, so saving raised NameError.
    fig = plt.gcf()
    if shape:
        fig.set_size_inches(*shape)

    if show:
        plt.show()

    if dest:
        basename = os.path.basename(f)
        # Size is re-applied right before saving, as in the original flow.
        if shape:
            fig.set_size_inches(*shape)
        suffix = '_{0:02d}'.format(ext) if ext else ''
        # savefig's `frameon` argument was removed from Matplotlib; a fully
        # transparent facecolor is the documented replacement for
        # frameon=False.
        fig.savefig(dest + get_category(basename) + '/' + basename[:-4]
                    + suffix + '.png', dpi=256, facecolor='none')

    plt.close(fig)
def aug_get_spectrogram_feature(filepath):
    """Load an audio file and return an augmented, normalized log-mel feature.

    Steps: load the audio at 16 kHz, compute a 128-band mel power
    spectrogram (n_fft=512, hop length 128), run it through
    ``spec_augment``, convert to dB relative to the maximum, apply
    ``_normalize`` and swap the two axes.

    Args:
        filepath: path of the audio file to load.

    Returns:
        torch.FloatTensor: the feature matrix with axes 0 and 1 transposed
        (presumably (frames, n_mels) -- TODO confirm against the model).
    """
    sample_rate = 16000
    hop_length = 128

    # `sr` is passed by keyword: it is keyword-only in recent librosa
    # releases, where the positional form raises TypeError.
    sig, sample_rate = librosa.core.load(filepath, sr=sample_rate)
    mel_spectrogram = librosa.feature.melspectrogram(
        y=sig, n_mels=128, sr=sample_rate, n_fft=512, hop_length=hop_length)

    # spec_augment is handed a tensor with an extra leading axis; only
    # element 0 of its result is used afterwards.
    n_bands, n_frames = mel_spectrogram.shape
    mel_spectrogram = np.reshape(mel_spectrogram, (-1, n_bands, n_frames))
    mel_spectrogram = torch.from_numpy(mel_spectrogram)
    mel_spectrogram = spec_augment(mel_spectrogram)

    mel_spectrogram = librosa.power_to_db(mel_spectrogram[0, :, :], ref=np.max)
    mel_spectrogram = _normalize(mel_spectrogram)
    return torch.FloatTensor(mel_spectrogram).transpose(0, 1)
def calculates_log_mel(data):
    """Return the 128-band log-mel (dB) spectrogram of an audio signal.

    Args:
        data: audio time series, analysed at ``SAMPLE_RATE``.

    Returns:
        The mel power spectrogram in dB relative to its own maximum.
    """
    # The audio is passed by keyword: it is a keyword-only argument of
    # melspectrogram in recent librosa releases.
    mel_power = librosa.feature.melspectrogram(y=data, sr=SAMPLE_RATE,
                                               n_mels=128)
    return librosa.power_to_db(mel_power, ref=np.max)
sample_rate, samples = wavfile.read(trainset[index][2]) for idx in range(1500): print('Creating Silence') start_point = np.int((900000) * (np.random.rand(1))) end_point = start_point + 16000 cur_samples = samples[start_point:end_point - 1] power_factor = 0.5 + np.random.rand(1) cur_samples = cur_samples * power_factor S = librosa.feature.melspectrogram(cur_samples.astype(float), sr=sample_rate, n_mels=64, hop_length=250, n_fft=480, fmin=20, fmax=4000) log_S = librosa.power_to_db(S, ref=np.max) mfcc = librosa.feature.mfcc(S=log_S, n_mfcc=64) delta2_mfcc = librosa.feature.delta(mfcc, order=2) # choose if 'silence' is going to train or validation if np.random.binomial(1, 1400 / 1500): # trainSetMEL_DB.append(log_S) trainSetMFCC.append(delta2_mfcc) trainSetClasses.append(trainset[index][0]) else: # valSetMEL_DB.append(log_S) valSetMFCC.append(delta2_mfcc) valSetClasses.append(10) else: if (trainset[index][0] == 11) & (np.random.binomial(1, 0.80)): continue
def __test(ref):
    """Round-trip check: power -> dB -> power must reproduce ``xp``.

    ``xp`` is taken from the enclosing scope; ``ref`` is the reference power
    used for both conversion directions.
    """
    decibels = librosa.power_to_db(xp, ref=ref, top_db=None)
    recovered = librosa.db_to_power(decibels, ref=ref)

    assert np.allclose(xp, recovered)
def compute_MFCC(y, parameter):
    """Compute MFCCs from the mel spectrum of ``y``.

    Args:
        y: input forwarded to ``compute_Mel_Spectrum``.
        parameter: settings object; it is forwarded to
            ``compute_Mel_Spectrum`` and its ``mfccs`` attribute gives the
            number of coefficients.

    Returns:
        The result of ``librosa.feature.mfcc`` on the dB-scaled mel spectrum.
    """
    mel_power = compute_Mel_Spectrum(y, parameter)
    # dB relative to a reference power of 1.0.
    mel_db = librosa.power_to_db(mel_power, ref=1.0)
    return librosa.feature.mfcc(S=mel_db, n_mfcc=parameter.mfccs)
# A full list of the supported parameters is provided in the
# `librosa.display.specshow` documentation.

# %%
# Other types of spectral data
# ----------------------------
# The examples above illustrate how to plot linear spectrograms,
# but librosa provides many kinds of spectral representations:
# Mel-scaled, constant-Q, variable-Q, chromagrams, tempograms, etc.
#
# specshow can plot these just as well.  For example, a Mel spectrogram
# can be displayed as follows:

M = librosa.feature.melspectrogram(y=y, sr=sr)
M_db = librosa.power_to_db(M, ref=np.max)

fig, ax = plt.subplots()
img = librosa.display.specshow(M_db, x_axis='time', y_axis='mel', ax=ax)
fig.colorbar(img, ax=ax, format="%+2.f dB")
ax.set(title='Mel spectrogram display')

# %%
# Constant-Q plots, and other logarithmically scaled frequency representations
# such as Variable-Q or `iirt` can be decorated using either the frequencies (Hz)
# or their note names in scientific pitch notation:

fig, ax = plt.subplots()

C = librosa.cqt(y=y, sr=sr)
C_db = librosa.amplitude_to_db(np.abs(C), ref=np.max)

librosa.display.specshow(C_db, x_axis='time', y_axis='cqt_hz', ax=ax)
ax.set(title='Frequency (Hz) axis decoration')
def compute_Normalized_Log_Spectrogram(y, parameter):
    """Return the power spectrogram of ``y`` in dB, referenced to ``n_fft``.

    Args:
        y: input forwarded to ``compute_Power_Spectrogram``.
        parameter: settings object; it is forwarded to
            ``compute_Power_Spectrogram`` and its ``n_fft`` attribute is used
            as the dB reference value.

    Returns:
        The dB-scaled power spectrogram.
    """
    power = compute_Power_Spectrogram(y, parameter)
    return librosa.power_to_db(power, ref=parameter.n_fft)