def extract(args):
    audio_directory, output_directory, af, overwrite = args
    subdir, output_file = os.path.split(af.split(audio_directory)[1])
    output_file = os.path.splitext(output_file)[0]
    output_file = os.path.join(output_directory, output_file)
    if os.path.exists(output_file) and not overwrite:
        print('Skipping {}. Already exists.'.format(output_file))
        return

    output = dict()
    try:
        # Read with soundfile, downmix to mono, and resample to 22050 Hz
        y, _sr = soundfile.read(af)
        y = to_mono(y)
        sr = 22050
        y = resample(y, _sr, sr)
    except Exception:
        # Fall back to librosa-style load() if soundfile cannot decode the file
        y, sr = load(af)

    # low-level spectral features
    output['linspec_mag'], output['linspec_phase'] = linspec(y)
    output['melspec'] = melspec(y, sr=sr)
    output['logspec'] = logspec(y, sr=sr)
    output['hcqt_mag'], output['hcqt_phase'] = hcqt(y, sr=sr)
    output['vggish_melspec'] = vggish_melspec(y, sr=sr)

    # high-level
    output['percussive_ratio'], output['percussive_rms'], output['total_rms'] = \
        percussive_ratio(y, margin=3.0)
    output['onset_strength'] = onset_strength(y, detrend=True)
    output['tempogram'] = tempogram(y)
    output['onset_patterns'] = onset_patterns(y, sr=sr)

    np.savez_compressed(output_file, **output)
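# A minimal driver sketch for extract() above: walk an audio tree, build one
# args tuple per file, and fan the work out over a process pool. The helper
# name run_extraction and the .wav filter are assumptions for illustration,
# not part of the original code.
import os
from multiprocessing import Pool

def run_extraction(audio_directory, output_directory, overwrite=False):
    audio_files = [os.path.join(root, name)
                   for root, _, names in os.walk(audio_directory)
                   for name in names if name.endswith('.wav')]
    args = [(audio_directory, output_directory, af, overwrite)
            for af in audio_files]
    with Pool() as pool:
        pool.map(extract, args)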
def findRhythmic(wave):  # 3 dimensions
    rhythm_feature = {}
    env = onset_strength(wave)
    tempogram = feature.tempogram(onset_envelope=env, hop_length=hop_size)
    rhythm_feature['tempo_sum'] = np.sum(tempogram)
    return rhythm_feature
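# A quick call sketch for findRhythmic(); it assumes hop_size is defined at
# module scope and that onset_strength and feature come from librosa.
# 'example.wav' is a hypothetical file.
import librosa
from librosa import feature
from librosa.onset import onset_strength

hop_size = 512  # assumed value
y, sr = librosa.load('example.wav')
print(findRhythmic(y)['tempo_sum'])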
def tempo(y=None, sr=22050, onset_envelope=None, hop_length=512,
          start_bpm=120, std_bpm=1.0, ac_size=8.0, max_tempo=320.0,
          aggregate=np.mean):
    if start_bpm <= 0:
        raise ParameterError('start_bpm must be strictly positive')

    # np.asscalar was removed in recent NumPy; int() is equivalent here
    win_length = int(core.time_to_frames(ac_size, sr=sr, hop_length=hop_length))

    tg = tempogram(y=y, sr=sr, onset_envelope=onset_envelope,
                   hop_length=hop_length, win_length=win_length)

    # Eventually, we want this to work for time-varying tempo
    if aggregate is not None:
        tg = aggregate(tg, axis=1, keepdims=True)

    # Get the BPM values for each bin, skipping the 0-lag bin
    bpms = core.tempo_frequencies(tg.shape[0], hop_length=hop_length, sr=sr)

    # Weight the autocorrelation by a log-normal distribution
    prior = np.exp(-0.5 * ((np.log2(bpms) - np.log2(start_bpm)) / std_bpm)**2)

    # Kill everything above the max tempo
    if max_tempo is not None:
        max_idx = np.argmax(bpms < max_tempo)
        prior[:max_idx] = 0

    # Really, instead of multiplying by the prior, we should set up a
    # probabilistic model for tempo and add log-probabilities.
    # This would give us a chance to recover from null signals and
    # rely on the prior. It would also make time aggregation much
    # more natural.

    # Get the best and second-best lag bins, weighted by the prior
    period = tg * prior[:, np.newaxis]
    best_period = np.argmax(period, axis=0)
    second_period = np.argsort(period, axis=0)[-2]

    tempi = bpms[best_period]
    tempi2 = bpms[second_period]

    # Wherever the best tempo is index 0, return start_bpm
    tempi[best_period == 0] = start_bpm
    tempi2[second_period == 0] = start_bpm

    # Return (second-best, best) tempo estimates in BPM
    return (tempi2.astype(float)[0].item(), tempi.astype(float)[0].item())
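# Usage sketch for the two-candidate tempo() variant above; it returns
# (second-best, best) BPM estimates. librosa and 'example.wav' are
# assumptions for illustration.
import librosa

y, sr = librosa.load('example.wav')
second_bpm, best_bpm = tempo(y=y, sr=sr)
print('best: {:.1f} BPM, runner-up: {:.1f} BPM'.format(best_bpm, second_bpm))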
def _compute_tempo(self, audio_buffer):
    """Compute a tempogram over the raw audio buffer at a fixed 8 kHz rate."""
    sample_rate = 8000
    tempo = tempogram(y=audio_buffer.astype(float), sr=sample_rate, norm=None)
    return tempo
def plot_tempograms(filepaths):
    """Accepts a list of filepaths, and plots a tempogram for each associated
    audio file."""
    # One panel per file
    fig, axes = plt.subplots(len(filepaths), 1)
    for k in range(len(filepaths)):
        data, rate = librosa.load(filepaths[k])
        gram = tempogram(data, rate)
        temp = tempo(data, rate)
        print('Tempogram dimensions:', gram.shape)
        display.specshow(gram, sr=rate, x_axis='time', y_axis='tempo',
                         cmap='magma', ax=axes[k])
        # Mark the estimated tempo with a dashed white line
        axes[k].axhline(temp, color='w', linestyle='--', alpha=1)
        axes[k].set_title(str(filepaths[k][15:]))
    plt.tight_layout()
    plt.show()
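# Example call for plot_tempograms(); the three file paths are hypothetical
# placeholders.
plot_tempograms(['samples/funk_groove.wav',
                 'samples/waltz.wav',
                 'samples/techno_loop.wav'])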
def transform_audio(self, y):
    '''Compute the tempogram

    Parameters
    ----------
    y : np.ndarray
        Audio buffer

    Returns
    -------
    data : dict
        data['tempogram'] : np.ndarray, shape=(n_frames, win_length)
            The tempogram
    '''
    n_frames = self.n_frames(get_duration(y=y, sr=self.sr))

    tgram = tempogram(y=y, sr=self.sr,
                      hop_length=self.hop_length,
                      win_length=self.win_length).astype(np.float32)

    tgram = fix_length(tgram, n_frames)
    return {'tempogram': tgram.T[self.idx]}
def ac_peaks(data, rate, plot=False):
    """Return the three highest peaks in the autocorrelation (tempo) array.
    Plot if needed."""
    # Get the onset strength envelope; its periodicity carries the tempo
    # information we want to extract
    oenv = librosa.onset.onset_strength(y=data, sr=rate)

    # Compute the tempogram and truncate at frame index 1000
    gram = tempogram(data, rate)
    gram = gram[:, :1000]

    # Get the global autocorrelation and the corresponding frequencies
    # (here, frequencies are BPM estimates)
    ac_global = librosa.autocorrelate(oenv, max_size=gram.shape[0])
    freqs = librosa.tempo_frequencies(gram.shape[0], sr=rate)

    # Find the peaks of the autocorrelation curve, sort them, and keep
    # only the three highest
    peaks, _ = find_peaks(ac_global)
    sorting = np.argsort(ac_global[peaks])
    peaks = peaks[sorting][-3:]

    # Plot if requested
    if plot:
        plt.semilogx(freqs, ac_global, ':', base=2)
        plt.semilogx(freqs[peaks], ac_global[peaks],
                     marker='o', linestyle='', base=2)
        plt.xlabel('BPM')
        plt.ylabel('Autocorrelation')
        plt.legend(['Global Autocorrelation', 'Three Highest Peaks'])
        plt.show()

    # Return the BPM values with the three highest autocorrelation values,
    # strongest first; NaNs if fewer than three peaks were found
    if len(freqs[peaks]) == 3:
        return np.array(freqs[peaks])[::-1]
    else:
        return np.full(3, np.nan)
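# Sketch of how ac_peaks() might feed a small tempo-candidate table; pandas
# and the file list are assumptions for illustration, not part of the
# original code.
import pandas as pd

files = ['audio/track_a.wav', 'audio/track_b.wav']  # hypothetical paths
rows = []
for path in files:
    data, rate = librosa.load(path)
    top3 = ac_peaks(data, rate)  # BPM candidates, strongest first
    rows.append([path] + list(top3))
candidates = pd.DataFrame(rows, columns=['file', 'bpm_1', 'bpm_2', 'bpm_3'])
print(candidates)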
    # (fragment: inside the per-track loop that builds one feature row)
    row = np.concatenate((row, spcent))
    flatness = np.mean(lf.spectral_flatness(thing1[:-1]).T, axis=0)
    row = np.concatenate((row, flatness))
    rolloff = np.mean(lf.spectral_rolloff(thing1[:-1]).T, axis=0)
    row = np.concatenate((row, rolloff))
    mspec = np.mean(lf.melspectrogram(thing1[:-1]).T, axis=0)
    row = np.concatenate((row, mspec))
    mfcc = np.mean(lf.mfcc(thing1[:-1], n_mfcc=30).T, axis=0)
    row = np.concatenate((row, mfcc))
    tonnetz = np.mean(lf.tonnetz(thing1[:-1]).T, axis=0)
    row = np.concatenate((row, tonnetz))
    rmse = np.mean(lf.rmse(thing1[:-1]).T, axis=0)  # lf.rmse was renamed lf.rms in librosa >= 0.7
    row = np.concatenate((row, rmse))
    contrast = np.mean(lf.spectral_contrast(thing1[:-1]).T, axis=0)
    row = np.concatenate((row, contrast))
    tempo = np.mean(lf.tempogram(thing1[:-1], win_length=88).T, axis=0)
    row = np.concatenate((row, tempo))
    # thing1[-1] holds the class label; append it after the 299 features
    row = np.append(row, thing1[-1])
    train_data = np.append(train_data, row)
    counter += 1

# Rebuild a DataFrame from the flat train_data buffer: 300 values per
# track (299 features + 1 class label)
columns = ["feat_" + str(i) for i in range(299)]
columns.append("class")
df_train2 = pd.DataFrame(columns=columns)
for i in range(6325):
    print(float(i) / 6325. * 100)
    row = train_data[300 * i:300 * (i + 1)]