def analyse(filename, resample_to=2756,
            bt_hop_length=128, chroma_hop_length=512, chroma_n_fft=1024):
    samples, sampleRate = librosa.load(filename)
    length = float(len(samples)) / sampleRate
    # Optionally resample to a lower rate to speed up the analysis
    if resample_to:
        samples = librosa.resample(samples, sampleRate, resample_to)
        sampleRate = resample_to
    tempo, beats = librosa.beat.beat_track(samples, sampleRate,
                                           hop_length=bt_hop_length)
    beat_times = librosa.frames_to_time(beats, sampleRate,
                                        hop_length=bt_hop_length)
    chromagram = librosa.feature.chromagram(samples, sampleRate,
                                            hop_length=chroma_hop_length,
                                            n_fft=chroma_n_fft)
    chromagram = numpy.transpose(chromagram)
    # Match each chroma frame to its nearest chord template
    distances = scipy.spatial.distance.cdist(chromagram, CHORDS, "cosine")
    chords = distances.argmin(axis=1)
    # Median-filter to suppress spurious single-frame chord changes
    chords = scipy.signal.medfilt(chords, 11)
    # Keep only the frames where the detected chord changes
    chord_frames = numpy.array(numpy.where(numpy.diff(chords) != 0))
    chords = chords[chord_frames][0].astype(int)
    chord_times = librosa.frames_to_time(chord_frames, sampleRate,
                                         hop_length=chroma_hop_length,
                                         n_fft=chroma_n_fft)[0]
    chord_names = CHORD_NAMES[chords]
    return {"beats": list(beat_times),
            "chords": [{"chord": chord_name, "time": chord_time}
                       for chord_name, chord_time in zip(chord_names, chord_times)],
            "tempo": tempo}
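# A hedged usage sketch for analyse(); the audio path is hypothetical, and
# the CHORDS / CHORD_NAMES lookup tables are assumed to be module-level
# globals defined elsewhere in this project.
result = analyse('example.wav')
print('tempo: {:.1f} BPM, {} beats, {} chord changes'.format(
    result['tempo'], len(result['beats']), len(result['chords'])))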
def features(filename):
    # print '\t[1/5] loading audio'
    y, sr = librosa.load(filename, sr=SR)

    # print '\t[2/5] Separating harmonic and percussive signals'
    y_perc, y_harm = hp_sep(y)

    # print '\t[3/5] detecting beats'
    bpm, beats = get_beats(y=y_perc, sr=sr, hop_length=HOP_LENGTH)

    # print '\t[4/5] generating CQT'
    M1 = np.abs(librosa.cqt(y=y_harm, sr=sr, hop_length=HOP_LENGTH,
                            bins_per_octave=12, fmin=librosa.midi_to_hz(24),
                            n_bins=72))
    M1 = librosa.logamplitude(M1 ** 2.0, ref_power=np.max)

    # print '\t[5/5] generating MFCC'
    S = librosa.feature.melspectrogram(y=y, sr=sr, hop_length=HOP_LENGTH,
                                       n_mels=N_MELS)
    M2 = librosa.feature.mfcc(S=librosa.logamplitude(S), n_mfcc=N_MFCC)

    # Trim beats to the shorter feature matrix and ensure frame 0 is included
    n = min(M1.shape[1], M2.shape[1])
    beats = beats[beats < n]
    beats = np.unique(np.concatenate([[0], beats]))
    times = librosa.frames_to_time(beats, sr=sr, hop_length=HOP_LENGTH)
    times = np.concatenate([times, [float(len(y)) / sr]])

    # Beat-synchronize both feature matrices
    M1 = librosa.feature.sync(M1, beats, aggregate=np.median)
    M2 = librosa.feature.sync(M2, beats, aggregate=np.mean)
    return (M1, M2), times
def logcqt_onsets(x, fs, pre_max=0, post_max=1, pre_avg=0, post_avg=1,
                  delta=0.05, wait=50):
    """
    Parameters
    ----------
    x : np.ndarray
        Audio signal
    fs : scalar
        Samplerate of the audio signal.
    pre_max, post_max, pre_avg, post_avg, delta, wait
        See `librosa.util.peak_pick` for details.

    Returns
    -------
    onsets : np.ndarray, ndim=1
        Times in seconds for splitting.
    """
    hop_length = 1024
    # Dither with low-level noise to stabilize the log
    x_noise = x + np.random.normal(scale=10.**-3, size=x.shape)
    cqt = librosa.cqt(x_noise.flatten(), sr=fs, hop_length=hop_length,
                      fmin=27.5, n_bins=24 * 8, bins_per_octave=24,
                      tuning=0, sparsity=0, real=False, norm=1)
    cqt = np.abs(cqt)
    lcqt = np.log1p(5000 * cqt)
    # Edge-detect along time with a Canny-style kernel, then average over bins
    c_n = utils.canny(51, 3.5, 1)
    onset_strength = sig.lfilter(c_n, np.ones(1), lcqt, axis=1).mean(axis=0)
    peak_idx = librosa.onset.onset_detect(onset_envelope=onset_strength,
                                          delta=delta, wait=wait)
    return librosa.frames_to_time(peak_idx, hop_length=hop_length)
def get_beat(y, PARAMETERS):
    '''Estimate beat times and tempo'''
    # Compute a log-power mel spectrogram on the percussive component
    S_p = librosa.feature.melspectrogram(y=y, sr=PARAMETERS['load']['sr'],
                                         n_fft=PARAMETERS['stft']['n_fft'],
                                         hop_length=PARAMETERS['beat']['hop_length'],
                                         n_mels=PARAMETERS['mel']['n_mels'],
                                         fmax=PARAMETERS['mel']['fmax'])
    S_p = librosa.logamplitude(S_p, ref_power=S_p.max())

    # Compute the median onset aggregation
    odf = librosa.onset.onset_strength(S=S_p, aggregate=np.median)

    # Get beats
    tempo, beats = librosa.beat.beat_track(onset_envelope=odf,
                                           sr=PARAMETERS['load']['sr'],
                                           hop_length=PARAMETERS['beat']['hop_length'])
    beat_times = librosa.frames_to_time(beats, sr=PARAMETERS['load']['sr'],
                                        hop_length=PARAMETERS['beat']['hop_length'])
    return tempo, beat_times, odf
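# A hedged sketch of the PARAMETERS layout that get_beat() expects; the keys
# are taken from the lookups above, but the values here are illustrative
# assumptions, not the project's actual configuration.
PARAMETERS = {'load': {'sr': 22050},
              'stft': {'n_fft': 2048},
              'beat': {'hop_length': 512},
              'mel': {'n_mels': 128, 'fmax': 8000}}
y, _ = librosa.load('example.wav', sr=PARAMETERS['load']['sr'])  # hypothetical file
tempo, beat_times, odf = get_beat(y, PARAMETERS)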
def compute_beats(y_percussive, sr=22050):
    """Computes the beats using librosa.

    Parameters
    ----------
    y_percussive: np.array
        Percussive part of the audio signal in samples.
    sr: int
        Sample rate.

    Returns
    -------
    beats_idx: np.array
        Indices (in frames) of the estimated beats.
    beats_times: np.array
        Times of the estimated beats.
    """
    logging.info("Estimating Beats...")
    tempo, beats_idx = librosa.beat.beat_track(y=y_percussive, sr=sr,
                                               hop_length=msaf.Anal.hop_size)

    # Add first and last beat
    beats_idx = np.concatenate(
        ([0], beats_idx, [len(y_percussive) / msaf.Anal.hop_size])).astype(np.int)

    # To times
    times = librosa.frames_to_time(beats_idx, sr=sr,
                                   hop_length=msaf.Anal.hop_size)

    return beats_idx, times
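# A minimal usage sketch, assuming the percussive component comes from
# librosa's HPSS; the input path is hypothetical, and msaf.Anal.hop_size is
# the project-wide hop length used inside compute_beats().
y, sr = librosa.load('example.wav', sr=22050)
_, y_percussive = librosa.effects.hpss(y)
beats_idx, beat_times = compute_beats(y_percussive, sr=sr)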
def analyse_bpm(self, y, sr):
    """
    Determine the BPM of a piece of music.

    Example:
        analyse1 = analyse("/home/bettini/Musique/Deorro.wav", "fichier_csv")
        y, sr = analyse1.extrairedatamusic()
        analyse1.analyse_bpm(y, sr)

    :param pathtofile: absolute path of the audio file whose BPM is analysed
    :param fichier_csv: CSV file in which the BPM values of the track are
        saved (name of the current playlist)
    :Comment: writes to the CSV file at the end
    """
    # Build the list that will be exported to the CSV
    ElemCsv = []
    # Run the default beat tracker
    tempo, beat_frames = librosa.beat.beat_track(y=y, sr=sr)
    # Convert the beat frame indices to timestamps of the energy pulses
    # over the course of the track
    beat_times = librosa.frames_to_time(beat_frames, sr=sr)
    # Compute the BPM at the start and at the end of the track, in case
    # the tempo changes over the course of the piece
    bpm_d = 0
    bpm_f = 0
    for i in range(100):
        bpm_d = bpm_d + (beat_times[i + 1] - beat_times[i])
        bpm_f = bpm_f + (beat_times[len(beat_times) - i - 1]
                         - beat_times[len(beat_times) - i - 2])
    # Fill the list that will be written to the database
    ElemCsv.append(tempo)
    ElemCsv.append(60 / (bpm_d / 100))
    ElemCsv.append(60 / (bpm_f / 100))
    return ElemCsv  # average tempo, start BPM, end BPM
def envelope_onsets(x, fs, wait=100):
    """
    Parameters
    ----------
    x : np.ndarray
        Audio signal to split.
    fs : scalar
        Samplerate of the audio signal.

    Returns
    -------
    onsets : np.ndarray, ndim=1
        Times in seconds for splitting.
    """
    # Log-power envelope with a small floor to avoid log(0)
    log_env = 10 * np.log10(10. ** -4.5 + np.power(x.flatten()[:], 2.0))
    # Zero-phase low-pass smoothing of the envelope
    w_n = np.hanning(100)
    w_n /= w_n.sum()
    log_env_lpf = sig.filtfilt(w_n, np.ones(1), log_env)
    n_hop = 100
    kernel = utils.canny(100, 3.5, 1)
    kernel /= np.abs(kernel).sum()
    onsets_forward = sig.lfilter(kernel, np.ones(1),
                                 log_env_lpf[::n_hop] - log_env_lpf.min(),
                                 axis=0)
    # Half-wave rectify, then pick peaks
    onsets_pos = onsets_forward * (onsets_forward > 0)
    peak_idx = librosa.util.peak_pick(onsets_pos, pre_max=500, post_max=500,
                                      pre_avg=10, post_avg=10, delta=0.025,
                                      wait=wait)
    return librosa.frames_to_time(peak_idx, hop_length=n_hop)
def libroRMS(filepath, kRatio):
    y, sr = librosa.load(filepath)  # load the waveform as y; sr is the sample rate
    clipLength = librosa.get_duration(y=y, sr=sr)
    kValue = int(clipLength / kRatio + 1)  # number of segments relative to clip length

    ### get the RMS of the audio sample ###
    data = librosa.feature.rmse(y=y, hop_length=2048)
    boundaries = librosa.segment.agglomerative(data, k=kValue)  # agglomerative segmentation
    boundary_times = librosa.frames_to_time(boundaries, hop_length=2048)  # ~.1s
    intervals = np.hstack([boundary_times[:-1, np.newaxis],
                           boundary_times[1:, np.newaxis]])
    get_rms = librosa.feature.sync(data, boundaries, aggregate=np.max)

    # The intervals above have one fewer row than the synced RMS values,
    # so drop the last aggregate before joining them
    nkValue = kValue - 1
    fixedN = np.delete(get_rms, nkValue, axis=1)
    npsTurn = np.concatenate((intervals, fixedN.T), axis=1)

    # Transform from np array to a regular list
    flatnps = npsTurn.tolist()
    slice_value = int(kValue // 3)
    # Keep the loudest third of the segments, sorted back into time order
    rmsOut1 = sorted(flatnps, key=lambda x: int(x[2]), reverse=True)
    #rmsOut2 = slice(rmsOut1[0: slice_value])
    rmsOut2 = rmsOut1[0:slice_value]
    rmsOut3 = sorted(rmsOut2, key=lambda x: int(x[0]))
    return rmsOut3
def logcqt_onsets(x, fs, pre_max=0, post_max=1, pre_avg=0, post_avg=1,
                  delta=0.05, wait=50, hop_length=1024):
    """
    Parameters
    ----------
    x : np.ndarray
        Audio signal
    fs : scalar
        Samplerate of the audio signal.
    pre_max, post_max, pre_avg, post_avg, delta, wait
        See `librosa.util.peak_pick` for details.

    Returns
    -------
    onsets : np.ndarray, ndim=1
        Times in seconds for splitting.
    """
    lcqt = logcqt(x, fs, hop_length)
    c_n = utils.canny(51, 3.5, 1)
    onset_strength = sig.lfilter(c_n, np.ones(1), lcqt, axis=1).mean(axis=0)
    peak_idx = librosa.onset.onset_detect(onset_envelope=onset_strength,
                                          delta=delta, wait=wait)
    return librosa.frames_to_time(peak_idx, hop_length=hop_length)
def estimate_beats(self):
    """Estimates the beats using librosa.

    Returns
    -------
    times: np.array
        Times of estimated beats in seconds.
    frames: np.array
        Frame indices of estimated beats.
    """
    # Compute harmonic-percussive source separation if needed
    if self._audio_percussive is None:
        self._audio_harmonic, self._audio_percussive = self.compute_HPSS()

    # Compute beats
    tempo, frames = librosa.beat.beat_track(
        y=self._audio_percussive, sr=self.sr, hop_length=self.hop_length)

    # To times
    times = librosa.frames_to_time(frames, sr=self.sr,
                                   hop_length=self.hop_length)

    # TODO: Is this really necessary?
    if len(times) > 0 and times[0] == 0:
        times = times[1:]
        frames = frames[1:]

    return times, frames
def filter_out(self, nob, song2):
    song2.change_temp(self.tempo)
    song2.cut_song(self.length_of_songs)
    # Low-pass FIR filter; the high-pass h is its spectral inverse
    l = scipy.signal.firwin(numtaps=10, cutoff=300, nyq=self.sr / 2)
    h = -l
    h[10 // 2] = h[10 // 2] + 1
    # Linearly fade out and low-pass the current song's tail
    fader_l = self.audio_left[int(self.bars[-nob - 1][1] * self.sr):]
    fader_r = self.audio_right[int(self.bars[-nob - 1][1] * self.sr):]
    fader = np.arange(float(len(fader_l))) / float(len(fader_l))
    fader_l = scipy.signal.lfilter(l, 1.0, fader_l * fader[::-1])
    fader_r = scipy.signal.lfilter(l, 1.0, fader_r * fader[::-1])
    # Fade in the high-passed intro of song2 over the tail, then append the rest
    haha = scipy.signal.lfilter(h, 1.0, (song2.audio_left[
        int(song2.beat_times[0] * self.sr):
        int(song2.beat_times[0] * self.sr)
        + len(self.audio_left[int(self.bars[-nob - 1][1] * self.sr):])] * fader))
    self.audio_left[int(self.bars[-nob - 1][1] * self.sr):] = fader_l + haha
    self.audio_left = np.concatenate((self.audio_left, song2.audio_left[len(haha):]))
    haha = scipy.signal.lfilter(h, 1.0, (song2.audio_right[
        int(song2.beat_times[0] * self.sr):
        int(song2.beat_times[0] * self.sr)
        + len(self.audio_right[int(self.bars[-nob - 1][1] * self.sr):])] * fader))
    self.audio_right[int(self.bars[-nob - 1][1] * self.sr):] = fader_r + haha
    self.audio_right = np.concatenate((self.audio_right, song2.audio_right[len(haha):]))
    # Re-track beats on the mixed result and rebuild the list of 4-beat bars
    tempo, beats = librosa.beat.beat_track(y=self.audio_left, sr=self.sr)
    self.beat_times = librosa.frames_to_time(beats, sr=self.sr)
    bars = []
    for i in range(len(self.beat_times) // 4 - 1):
        bars.append([self.beat_times[i * 4], self.beat_times[(i + 1) * 4]])
    self.bars = np.array(bars)
def beat_track(input_file, output_csv):
    '''Beat tracking function

    :parameters:
      - input_file : str
          Path to input audio file (wav, mp3, m4a, flac, etc.)
      - output_csv : str
          Path to save beat event timestamps as a CSV file
    '''
    print 'Loading ', input_file
    y, sr = librosa.load(input_file, sr=22050)

    # Use a hop size of 64 samples @ 22KHz ~= 2.9ms
    HOP_LENGTH = 64

    # This is the window length used by default in stft
    N_FFT = 2048

    print 'Tracking beats'
    tempo, beats = librosa.beat.beat_track(y=y, sr=sr, hop_length=HOP_LENGTH)

    print 'Estimated tempo: %0.2f beats per minute' % tempo

    # 3. save output
    # 'beats' will contain the frame numbers of beat events.
    beat_times = librosa.frames_to_time(beats, sr=sr, hop_length=HOP_LENGTH,
                                        n_fft=N_FFT)

    print 'Saving output to ', output_csv
    librosa.output.times_csv(output_csv, beat_times)
    print 'done!'
def hpss_beats(input_file, output_csv):
    '''HPSS beat tracking

    :parameters:
      - input_file : str
          Path to input audio file (wav, mp3, m4a, flac, etc.)
      - output_csv : str
          Path to save beat event timestamps as a CSV file
    '''
    # Load the file
    print 'Loading ', input_file
    y, sr = librosa.load(input_file)

    # Do HPSS
    print 'Harmonic-percussive separation ... '
    y = percussive(y)

    # Construct onset envelope from percussive component
    print 'Tracking beats on percussive component'
    onsets = librosa.onset.onset_strength(y=y, sr=sr,
                                          hop_length=HOP_LENGTH,
                                          n_fft=N_FFT,
                                          aggregate=np.median)

    # Track the beats
    tempo, beats = librosa.beat.beat_track(onset_envelope=onsets, sr=sr,
                                           hop_length=HOP_LENGTH)

    beat_times = librosa.frames_to_time(beats, sr=sr, hop_length=HOP_LENGTH,
                                        n_fft=N_FFT)

    # Save the output
    print 'Saving beats to ', output_csv
    librosa.output.times_csv(output_csv, beat_times)
def extract_cqt(audio_data):
    '''
    CQT routine with default parameters filled in, and some post-processing.

    Parameters
    ----------
    audio_data : np.ndarray
        Audio data to compute CQT of

    Returns
    -------
    cqt : np.ndarray
        CQT of the supplied audio data.
    frame_times : np.ndarray
        Times, in seconds, of each frame in the CQT
    '''
    # Compute CQT
    cqt = librosa.cqt(audio_data, sr=FS, fmin=librosa.midi_to_hz(NOTE_START),
                      n_bins=N_NOTES, hop_length=HOP_LENGTH, tuning=0.)
    # Compute the time of each frame
    times = librosa.frames_to_time(np.arange(cqt.shape[1]), sr=FS,
                                   hop_length=HOP_LENGTH)
    # Use float32 for the cqt to save space/memory
    cqt = cqt.astype(np.float32)
    return cqt, times
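# Usage sketch for extract_cqt(); FS, NOTE_START, N_NOTES and HOP_LENGTH are
# module-level constants assumed by the function, and the input path is
# hypothetical.
audio_data, _ = librosa.load('example.wav', sr=FS)
cqt, frame_times = extract_cqt(audio_data)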
def process_file(input_file, **kwargs):
    # Build the output file name: <basename>.log.<med|sum>.<spectrogram>.csv
    output_file = os.path.basename(input_file)
    output_file = os.path.splitext(output_file)[0]
    output_file = os.path.extsep.join([output_file, "log"])
    if kwargs["median"]:
        output_file = os.path.extsep.join([output_file, "med"])
    else:
        output_file = os.path.extsep.join([output_file, "sum"])
    output_file = os.path.extsep.join([output_file, kwargs["spectrogram"]])
    output_file = os.path.extsep.join([output_file, "csv"])
    output_file = os.path.join(kwargs["destination"], output_file)

    with open(input_file, "r") as f:
        S = pickle.load(f)[SPECMAP[kwargs["spectrogram"]]].astype(np.float32)

    if kwargs["median"]:
        odf = librosa.onset.onset_strength(S=S, sr=SR, hop_length=HOP,
                                           n_fft=N_FFT, aggregate=np.median)
    else:
        odf = librosa.onset.onset_strength(S=S, sr=SR, hop_length=HOP,
                                           n_fft=N_FFT, aggregate=np.mean)

    tempo, beats = librosa.beat.beat_track(onset_envelope=odf, sr=SR,
                                           hop_length=HOP,
                                           tightness=TIGHTNESS)
    times = librosa.frames_to_time(beats, sr=SR, hop_length=HOP, n_fft=N_FFT)
    librosa.output.times_csv(output_file, times)
def segment_audio_timeit(signal, sr):
    start_time = timeit.default_timer()
    silence_threshold = get_silence_threshold(signal, sr)
    print("getsilencethreshold: ")
    print(timeit.default_timer() - start_time)

    start_time = timeit.default_timer()
    o_env = librosa.onset.onset_strength(y=signal, sr=sr, centering=False,
                                         hop_length=HOP_LENGTH)
    onset_frames = librosa.onset.onset_detect(onset_envelope=o_env, sr=sr,
                                              hop_length=HOP_LENGTH)
    onset_times = librosa.frames_to_time(onset_frames, sr=sr,
                                         hop_length=HOP_LENGTH)
    print("librosa.onset_detect: ")
    print(timeit.default_timer() - start_time)

    segments = []
    overalltime = timeit.default_timer()
    for i in range(len(onset_times)):
        segment_start = int(onset_times[i] * sr)
        if i != len(onset_times) - 1:
            segment_end = int(onset_times[i + 1] * sr) - HOP_LENGTH
        else:
            segment_end = len(signal) - 1
        segment_end = find_segment_end(segment_start, segment_end, signal,
                                       silence_threshold)
        # Keep segments that are long enough and fall inside the
        # [START_TIME, duration - END_TIME] window
        if (segment_end - segment_start >= MIN_SOUND_LEN * sr) \
                and (onset_times[i] > START_TIME) \
                and (onset_times[i] < (len(signal) / sr - END_TIME)):
            segments.append((signal[segment_start:segment_end], onset_times[i]))
    print('all segments')
    print(timeit.default_timer() - overalltime)
    return segments
def compute_beats(y_percussive, sr=22050):
    """Computes the beats using librosa.

    Parameters
    ----------
    y_percussive: np.array
        Percussive part of the audio signal in samples.
    sr: int
        Sample rate.

    Returns
    -------
    beats_idx: np.array
        Indices (in frames) of the estimated beats.
    beats_times: np.array
        Times of the estimated beats.
    """
    logging.info("Estimating Beats...")
    tempo, beats_idx = librosa.beat.beat_track(y=y_percussive, sr=sr,
                                               hop_length=msaf.Anal.hop_size)
    times = librosa.frames_to_time(beats_idx, sr=sr,
                                   hop_length=msaf.Anal.hop_size)

    # Remove first beat time if 0
    if times[0] == 0:
        times = times[1:]
        beats_idx = beats_idx[1:]

    return beats_idx, times
def beat_track(input_file, output_csv):
    '''Beat tracking function

    :parameters:
      - input_file : str
          Path to input audio file (wav, mp3, m4a, flac, etc.)
      - output_csv : str
          Path to save beat event timestamps as a CSV file
    '''
    print('Loading ', input_file)
    y, sr = librosa.load(input_file, sr=22050)

    # Use a default hop size of 512 samples @ 22KHz ~= 23ms
    hop_length = 512

    print('Tracking beats')
    tempo, beats = librosa.beat.beat_track(y=y, sr=sr, hop_length=hop_length)

    print('Estimated tempo: {:0.2f} beats per minute'.format(tempo))

    # save output
    # 'beats' will contain the frame numbers of beat events.
    beat_times = librosa.frames_to_time(beats, sr=sr, hop_length=hop_length)

    print('Saving output to ', output_csv)
    librosa.output.times_csv(output_csv, beat_times)
    print('done!')
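# A minimal sketch of calling beat_track() directly; both file names are
# hypothetical. Note that librosa.output.times_csv only exists in older
# librosa releases (it was removed in 0.8), so this snippet assumes one of
# those versions.
beat_track('example.wav', 'example_beats.csv')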
def analyze(self):
    audio_path = self.path
    y, sr = librosa.load(audio_path, sr=None)
    tempo, beats = librosa.beat.beat_track(y=y, sr=sr)
    self.tempo = tempo
    self.beats = list(beats)
    self.times = list(librosa.frames_to_time(beats, sr=sr))
def fade_out(self, nob, song2):
    song2.change_temp(self.tempo)
    song2.cut_song(self.length_of_songs)
    # Linear fade-out of the current song's tail
    fader_l = self.audio_left[int(self.bars[-nob - 1][1] * self.sr):]
    fader_r = self.audio_right[int(self.bars[-nob - 1][1] * self.sr):]
    fader = np.arange(float(len(fader_l))) / float(len(fader_l))
    fader_l = fader_l * fader[::-1]
    fader_r = fader_r * fader[::-1]
    # Fade in the start of song2 over the faded tail, then append the rest
    haha = song2.audio_left[
        int(song2.beat_times[0] * self.sr):
        int(song2.beat_times[0] * self.sr)
        + len(self.audio_left[int(self.bars[-nob - 1][1] * self.sr):])] * fader
    self.audio_left[int(self.bars[-nob - 1][1] * self.sr):] = fader_l + haha
    self.audio_left = np.concatenate((self.audio_left, song2.audio_left[len(haha):]))
    haha = song2.audio_right[
        int(song2.beat_times[0] * self.sr):
        int(song2.beat_times[0] * self.sr)
        + len(self.audio_right[int(self.bars[-nob - 1][1] * self.sr):])] * fader
    self.audio_right[int(self.bars[-nob - 1][1] * self.sr):] = fader_r + haha
    self.audio_right = np.concatenate((self.audio_right, song2.audio_right[len(haha):]))
    # Re-track beats on the mixed result and rebuild the list of 4-beat bars
    tempo, beats = librosa.beat.beat_track(y=self.audio_left, sr=self.sr)
    self.beat_times = librosa.frames_to_time(beats, sr=self.sr)
    bars = []
    for i in range(len(self.beat_times) // 4 - 1):
        bars.append([self.beat_times[i * 4], self.beat_times[(i + 1) * 4]])
    self.bars = np.array(bars)
def ellis_bpm(fname, start_bpm, hpss=True, hop_length=512, tightness=100.0,
              plot=False, sound=False):
    y, sr = librosa.load(fname, sr=None)
    log.debug(u'Estimating tempo: {}'.format(TERM.cyan(fname)))

    if hpss:
        log.debug(TERM.magenta("Getting percussive elements"))
        y_harmonic, y_percussive = librosa.effects.hpss(y)
        chunks = np.array_split(y_percussive, PLOT_SPLIT)
        log.debug(TERM.magenta("Estimating beats per minute"))
        bpm, beat_frames = librosa.beat.beat_track(y=y_percussive, sr=sr,
                                                   start_bpm=start_bpm,
                                                   hop_length=hop_length,
                                                   tightness=tightness)
    else:
        log.debug(TERM.magenta("Estimating beats per minute"))
        bpm, beat_frames = librosa.beat.beat_track(y=y, sr=sr,
                                                   start_bpm=start_bpm,
                                                   hop_length=hop_length,
                                                   tightness=tightness)
        chunks = np.array_split(y, PLOT_SPLIT)

    log.debug(u'Tempo: {:6.2f} bpm'.format(bpm))

    if plot:
        plt.figure(figsize=(16, 10))
        curr_frame = 0
        for i in range(PLOT_SPLIT):
            plt.subplot(PLOT_SPLIT * 100 + 11 + i)
            plt.plot(curr_frame + np.arange(len(chunks[i])), chunks[i], 'g')
            for b in beat_frames:
                plt.axvline(x=b * hop_length, color='k')
            plt.xlim([curr_frame, len(chunks[i]) + curr_frame])
            curr_frame += len(chunks[i])
        plt.show(block=False)

    if sound:
        beat_times = librosa.frames_to_time(beat_frames, sr=sr,
                                            hop_length=hop_length)
        clicks = mir_eval.sonify.clicks(beat_times, sr, length=len(y))
        sd.play(y + clicks, sr)
        input('Press Return key to stop sound')
        sd.stop()

    return bpm
def main():
    """
    main() - Main function for feature extraction

    Inputs: None
    Outputs: Pickle file with feature data
    """
    vocalData = loadmat('../../Data/firstVerseTimes.mat')
    audioPath = '../../Audio/Vocals/'
    assert isdir(audioPath), "Audio path does not exist"  # Make sure directory of audio exists

    fileList = [join(audioPath, 'Vocals_' + str(vocalData['firstVerseTimes'][i][3][0]))
                for i in range(len(vocalData['firstVerseTimes']))]
    numFiles = len(fileList)
    vocalFeatures = np.zeros((numFiles, 8))

    for i in range(numFiles):
        print 'Working on file {} of {}'.format(i, numFiles)

        # Read in audio
        audio, sr = librosa.load(fileList[i], sr=44100)
        S = librosa.stft(audio, n_fft=1024, hop_length=512)
        spec = np.abs(S)

        # Extract features
        centroids = centroid(spec, sr)        # Spectral centroid
        contrasts = contrast(spec, sr, 0.05)  # Spectral contrast
        onset_frames = librosa.onset.onset_detect(y=audio, sr=sr, hop_length=64)  # Frames of onsets
        onset_times = librosa.frames_to_time(onset_frames, sr, hop_length=64)     # Times of onsets

        # Extract feature statistics
        vocalFeatures[i, 0] = np.mean(np.diff(onset_times))  # Mean of onset durations
        vocalFeatures[i, 1] = np.var(np.diff(onset_times))   # Variance of onset durations
        vocalFeatures[i, 2], beats = librosa.beat.beat_track(audio, sr)  # Get beats and tempo
        vocalFeatures[i, 3] = np.mean(centroids)   # Mean of centroids
        vocalFeatures[i, 4] = np.var(centroids)    # Variance of centroids
        vocalFeatures[i, 5] = np.mean(contrasts)   # Mean of spectral contrast
        vocalFeatures[i, 6] = np.var(contrasts)    # Variance of spectral contrast
        vocalFeatures[i, 7] = onset_times.shape[0] / (audio.shape[0] / float(sr))  # Onset density

    # Create dictionary for features
    dataDict = {'ID': np.array([vocalData['firstVerseTimes'][i][0][0][0]
                                for i in range(len(vocalData['firstVerseTimes']))]),
                'onsetMean': vocalFeatures[:, 0],
                'onsetVar': vocalFeatures[:, 1],
                'tempo': vocalFeatures[:, 2],
                'centroidMean': vocalFeatures[:, 3],
                'centroidVar': vocalFeatures[:, 4],
                'contrastMean': vocalFeatures[:, 5],
                'contrastVar': vocalFeatures[:, 6],
                'onsetDensity': vocalFeatures[:, 7],
                'artist': [vocalData['firstVerseTimes'][i][1][0]
                           for i in range(len(vocalData['firstVerseTimes']))],
                'song': np.array([vocalData['firstVerseTimes'][i][2][0]
                                  for i in range(len(vocalData['firstVerseTimes']))])}
    dump(dataDict, open('vocalFeatureData.p', 'w'))
    print('Done')
def extract_timing_data(filename, samplerate=22050, channels=1, hop_length=64):
    x_n, fs = marl.audio.read(filename, samplerate, channels)
    onset_env = librosa.onset.onset_strength(x_n.squeeze(), fs,
                                             hop_length=hop_length,
                                             aggregate=np.median)
    tempo, beat_frames = librosa.beat.beat_track(onset_envelope=onset_env,
                                                 sr=fs, hop_length=hop_length)
    beat_times = librosa.frames_to_time(beat_frames, sr=fs,
                                        hop_length=hop_length)
    onset_frames = librosa.onset.onset_detect(onset_envelope=onset_env,
                                              sr=fs, hop_length=hop_length)
    onset_times = librosa.frames_to_time(onset_frames, sr=fs,
                                         hop_length=hop_length)
    duration = len(x_n) / fs
    return dict(onset_times=onset_times.tolist(),
                beat_times=beat_times.tolist(),
                tempo=tempo,
                duration=duration)
def analyze_frames(y, sr, debug=False):
    A = {}
    hop_length = 128

    # First, get the track duration
    A['duration'] = float(len(y)) / sr

    # Then, get the beats
    if debug: print "> beat tracking"
    tempo, beats = librosa.beat.beat_track(y, sr, hop_length=hop_length)

    # Push the last frame as a phantom beat
    A['tempo'] = tempo
    A['beats'] = librosa.frames_to_time(beats, sr, hop_length=hop_length).tolist()

    if debug: print "beats count: ", len(A['beats'])

    if debug: print "> spectrogram"
    S = librosa.feature.melspectrogram(y, sr,
                                       n_fft=2048,
                                       hop_length=hop_length,
                                       n_mels=80,
                                       fmax=8000)
    S = S / S.max()
    # A['spectrogram'] = librosa.logamplitude(librosa.feature.sync(S, beats)**2).T.tolist()

    # Let's make some beat-synchronous mfccs
    if debug: print "> mfcc"
    S = librosa.feature.mfcc(librosa.logamplitude(S), n_mfcc=40)
    A['timbres'] = librosa.feature.sync(S, beats).T.tolist()

    if debug: print "timbres count: ", len(A['timbres'])

    # And some chroma
    if debug: print "> chroma"
    S = N.abs(librosa.stft(y, hop_length=hop_length))

    # Grab the harmonic component
    H = librosa.decompose.hpss(S)[0]
    # H = librosa.hpss.hpss_median(S, win_P=31, win_H=31, p=1.0)[0]
    A['chroma'] = librosa.feature.sync(librosa.feature.chromagram(S=H, sr=sr),
                                       beats,
                                       aggregate=N.median).T.tolist()

    # Relative loudness
    S = S / S.max()
    S = S**2

    if debug: print "> dists"
    dists = structure(N.vstack([N.array(A['timbres']).T,
                                N.array(A['chroma']).T]))
    A['dense_dist'] = dists

    edge_lens = [A["beats"][i] - A["beats"][i - 1]
                 for i in xrange(1, len(A["beats"]))]
    A["avg_beat_duration"] = N.mean(edge_lens)

    return A
def __test(units, hop_length, y, sr):
    tempo, b1 = librosa.beat.beat_track(y=y, sr=sr, hop_length=hop_length)
    _, b2 = librosa.beat.beat_track(y=y, sr=sr, hop_length=hop_length,
                                    units=units)
    t1 = librosa.frames_to_time(b1, sr=sr, hop_length=hop_length)
    if units == 'time':
        t2 = b2
    elif units == 'samples':
        t2 = librosa.samples_to_time(b2, sr=sr)
    elif units == 'frames':
        t2 = librosa.frames_to_time(b2, sr=sr, hop_length=hop_length)
    assert np.allclose(t1, t2)
def __test(units, hop_length, y, sr):
    b1 = librosa.onset.onset_detect(y=y, sr=sr, hop_length=hop_length)
    b2 = librosa.onset.onset_detect(y=y, sr=sr, hop_length=hop_length,
                                    units=units)
    t1 = librosa.frames_to_time(b1, sr=sr, hop_length=hop_length)
    if units == 'time':
        t2 = b2
    elif units == 'samples':
        t2 = librosa.samples_to_time(b2, sr=sr)
    elif units == 'frames':
        t2 = librosa.frames_to_time(b2, sr=sr, hop_length=hop_length)
    assert np.allclose(t1, t2)
def __test(infile):
    DATA = load(infile)
    (bpm, beats) = librosa.beat.beat_track(y=None, sr=8000, hop_length=32,
                                           onsets=DATA['onsetenv'][0],
                                           n_fft=None)
    print beats
    print DATA['beats']
    assert numpy.allclose(librosa.frames_to_time(beats, sr=8000,
                                                 hop_length=32),
                          DATA['beats'])
def beat_analysis(self):
    """Runs the analysis on the song to determine where the beats are,
    and adds a beat channel."""
    self.tempo, self.beat_frames = librosa.beat.beat_track(self.waveform,
                                                           self.sample_rate)
    self.beat_times = librosa.frames_to_time(self.beat_frames,
                                             self.sample_rate)
    self.beat_channel = Channel('Beat', False)
    for second in self.beat_times:
        # Round time to 1/10 of a second
        second = round(second, 1)
        time = datetime.timedelta(0, second)
        # Save beat in channel
        self.beat_channel.update(time, True)
def __test(infile):
    DATA = load(infile)
    (bpm, beats) = librosa.beat.beat_track(y=None, sr=8000, hop_length=32,
                                           onset_envelope=DATA['onsetenv'][0])
    beat_times = librosa.frames_to_time(beats, sr=8000, hop_length=32)
    assert np.allclose(beat_times, DATA['beats'])
def direct(self, song2):
    song2.change_temp(self.tempo)
    song2.cut_song(self.length_of_songs)
    # Splice song2 in at the last beat of the current song
    self.audio_left = np.concatenate(
        (self.audio_left[:int(self.beat_times[-1] * self.sr)],
         song2.audio_left[int(song2.beat_times[0] * self.sr):]))
    self.audio_right = np.concatenate(
        (self.audio_right[:int(self.beat_times[-1] * self.sr)],
         song2.audio_right[int(song2.beat_times[0] * self.sr):]))
    # Re-track beats on the result and rebuild the list of 4-beat bars
    tempo, beats = librosa.beat.beat_track(y=self.audio_left, sr=self.sr)
    self.beat_times = librosa.frames_to_time(beats, sr=self.sr)
    bars = []
    for i in range(len(self.beat_times) // 4 - 1):
        bars.append([self.beat_times[i * 4], self.beat_times[(i + 1) * 4]])
    self.bars = np.array(bars)
def midi_to_chroma(midi):
    # One chroma frame per 512-sample hop at 22050 Hz, up to the MIDI end time
    return midi.get_chroma(
        times=librosa.frames_to_time(np.arange(midi.get_end_time() * 22050 / 512)))
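# Usage sketch, assuming a pretty_midi.PrettyMIDI object; the MIDI path is
# hypothetical. The 22050/512 factor converts the MIDI end time into a frame
# count at the sample rate and hop length this project appears to use.
import pretty_midi
midi = pretty_midi.PrettyMIDI('example.mid')
chroma = midi_to_chroma(midi)  # shape: (12, n_frames)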
def test_feature():
    file = "/mnt/hgfs/vmfiles/genres/pop/pop.00003.wav"
    fp = FeaturePlan(sample_rate=22050)
    fp.addFeature('mfcc: MFCC blockSize=512 stepSize=256')  # 13
    fp.addFeature('sr: SpectralRolloff blockSize=512 stepSize=256')  # 1
    fp.addFeature('sf: SpectralFlux blockSize=512 stepSize=256')  # 1
    fp.addFeature('scfp: SpectralCrestFactorPerBand FFTLength=0 FFTWindow=Hanning blockSize=512 stepSize=256')  # 19
    fp.addFeature('sf1: SpectralFlatness FFTLength=0 FFTWindow=Hanning blockSize=512 stepSize=256')  # 1
    fp.addFeature('sc: SpectralShapeStatistics FFTLength=0 FFTWindow=Hanning blockSize=512 stepSize=256')  # 4
    fp.addFeature('sfp: SpectralFlatnessPerBand FFTLength=0 FFTWindow=Hanning blockSize=512 stepSize=256')  # 19
    fp.addFeature('energy: Energy blockSize=512 stepSize=256')  # 1
    fp.addFeature('loudness: Loudness FFTLength=0 FFTWindow=Hanning LMode=Relative blockSize=512 stepSize=256')  # 24
    fp.addFeature('ms: MagnitudeSpectrum FFTLength=0 FFTWindow=Hanning blockSize=512 stepSize=256')  # 257
    fp.addFeature('ps: PerceptualSharpness FFTLength=0 FFTWindow=Hanning blockSize=512 stepSize=256')  # 1
    fp.addFeature('zcr: ZCR blockSize=512 stepSize=256')  # 1

    engine = Engine()
    engine.load(fp.getDataFlow())
    afp = AudioFileProcessor()
    afp.processFile(engine, file)
    feats = engine.readAllOutputs()
    ceps = feats['scfp']
    print 'scfp', ceps.shape
    print 'loudness', feats['loudness'].shape
    print 'sfp', feats['sfp'].shape
    #num_ceps = len(ceps)
    c = calc_statistical_features(ceps.transpose())
    print 'c', c.shape

    y, sr = librosa.load(file)
    print y.shape
    print sr
    hop_length = 256
    oenv = librosa.onset.onset_strength(y=y, sr=sr, hop_length=hop_length)
    tempogram = librosa.feature.tempogram(onset_envelope=oenv, sr=sr,
                                          hop_length=hop_length)
    # Global autocorrelation of the onset envelope
    ac_global = librosa.autocorrelate(oenv, max_size=tempogram.shape[0])  # 384
    ac_global = librosa.util.normalize(ac_global)
    print ac_global.shape
    tempo = librosa.beat.estimate_tempo(oenv, sr=sr, hop_length=hop_length)  # 1
    print "tempo", tempo
    print "tempogram", tempogram.shape  # 384

    tempo, beat_frames = librosa.beat.beat_track(y=y, sr=sr)
    print "tempo", tempo
    print "beat_frames", beat_frames.shape
    beat_times = librosa.frames_to_time(beat_frames, sr=sr)
    print "beat_times", beat_times.shape
    print beat_times

    y_harmonic, y_percussive = librosa.effects.hpss(y)

    # Compute MFCC features from the raw signal
    mfcc = librosa.feature.mfcc(y=y, sr=sr, hop_length=hop_length, n_mfcc=13)
    print "mfcc", mfcc.shape
    # And the first-order differences (delta features)
    mfcc_delta = librosa.feature.delta(mfcc)
    print "mfcc_delta", mfcc_delta.shape

    # Stack and synchronize between beat events
    # This time, we'll use the mean value (default) instead of median
    beat_mfcc_delta = librosa.feature.sync(np.vstack([mfcc, mfcc_delta]),
                                           beat_frames)
    print "beat_mfcc_delta", beat_mfcc_delta.shape

    chromagram = librosa.feature.chroma_cqt(y=y_harmonic, sr=sr)  # 12
    c = np.mean(chromagram, axis=1)
    print "c", c.shape
    print "chromagram", chromagram.shape
    r = calc_statistical_features(chromagram)
    print r.shape
    beat_chroma = librosa.feature.sync(chromagram, beat_frames,
                                       aggregate=np.median)
    print "beat_chroma", beat_chroma.shape
    #print beat_chroma

    # Finally, stack all beat-synchronous features together
    beat_features = np.vstack([beat_chroma, beat_mfcc_delta])
    print "beat_features", beat_features.shape
    beat_feature_set = np.mean(beat_features, axis=1)
    print beat_feature_set.shape
    print beat_feature_set
    #print np.mean(ceps, axis=0)
    #return np.mean(ceps[int(num_ceps*1/10):int(num_ceps*9/10)], axis=0)

    a1 = calc_statistical_features(feats['scfp'].transpose())  # 19*7 = 133
    a1 = a1.reshape(a1.shape[0] * a1.shape[1])
    print a1.shape
    a2 = calc_statistical_features(feats['sfp'].transpose())  # 19*7 = 133
    a2 = a2.reshape(a2.shape[0] * a2.shape[1])
    print a2.shape
    a3 = calc_statistical_features(feats['loudness'].transpose())  # 24*7 = 168
    a3 = a3.reshape(a3.shape[0] * a3.shape[1])
    print a3.shape
    a4 = calc_statistical_features(tempogram)
    a4 = a4.reshape(a4.shape[0] * a4.shape[1])
    print a4.shape
    a5 = calc_statistical_features(chromagram)  # 12*7 = 84
    a5 = a5.reshape(a5.shape[0] * a5.shape[1])
    print a5.shape
    feature5_set = np.hstack((a1, a2, a3, a4, a5))  # 384*7 = 2688
    print "feature5_set", feature5_set.shape
recognize_y = onsets_frames.copy()
onsets_frames_strength = librosa.onset.onset_strength(y=y, sr=sr)
#onsets_frames = get_onsets_by_all_v2(y, sr, len(codes[type_index]) + 2)
if len(onsets_frames) < 3:
    continue
#print("onsets_frames is {}".format(onsets_frames))

# Reference beat positions (frames)
base_frames = onsets_base_frames(codes[type_index], total_frames_number)
#print("base_frames is {}".format(base_frames))
min_d, best_y, onsets_frames = get_dtw_min(onsets_frames, base_frames, 65)
base_onsets = librosa.frames_to_time(best_y, sr=sr)
#print("base_onsets is {}".format(base_onsets))

# Detected beat times
onstm = librosa.frames_to_time(onsets_frames, sr=sr)
#print("onstm is {}".format(onstm))
duration = librosa.get_duration(y, sr=sr)  # total duration of the audio
#print("duration is {}".format(duration))

# Difference between detected and reference beat counts
diff_real_base = len(onsets_frames) - len(base_frames)

#librosa.display.waveplot(y, sr=sr)
# plt.show()
plt.vlines(onstm,
def test_model_on_folk():
    # X_polovnicek = wav2cqt_spec('polovnicek.wav')
    # times = librosa.frames_to_time(np.arange(X_polovnicek.shape[0]), sr=SAMPLE_RATE, hop_length=HOP_LENGTH)
    # y_polovnicek = midi2labels('polovnicek.MID', times)
    #
    # X_jedna = wav2cqt_spec('jedna.mp3')
    # times = librosa.frames_to_time(np.arange(X_jedna.shape[0]), sr=SAMPLE_RATE, hop_length=HOP_LENGTH)
    # y_jedna = midi2labels('jedna.MID', times)
    #
    # X_kohutik = wav2cqt_spec('kohutik.wav')
    # times = librosa.frames_to_time(np.arange(X_kohutik.shape[0]), sr=SAMPLE_RATE, hop_length=HOP_LENGTH)
    # y_kohutik = midi2labels('kohutik.MID', times)
    #
    # X_marienka = wav2cqt_spec('marienka.mp3')
    # times = librosa.frames_to_time(np.arange(X_marienka.shape[0]), sr=SAMPLE_RATE, hop_length=HOP_LENGTH)
    # y_marienka = midi2labels('marienka.mid', times)
    #
    # X_hora = wav2cqt_spec('hora.mp3')
    # times = librosa.frames_to_time(np.arange(X_hora.shape[0]), sr=SAMPLE_RATE, hop_length=HOP_LENGTH)
    # y_hore = midi2labels('hora.mid', times)
    #
    # X_onvo = wav2cqt_spec('onvo.mp3')
    # times = librosa.frames_to_time(np.arange(X_marienka.shape[0]), sr=SAMPLE_RATE, hop_length=HOP_LENGTH)
    # y_onvo = midi2labels('onvo.mid', times)

    # model = load_model('DNN_mp3_piano.hdf5')
    # dnn = DNN(3, 256)
    # dnn.set_model(model)
    # dnn.summary()
    #
    # dnn.predict(X_kohutik, y_kohutik)
    # dnn.predict(X_polovnicek, y_polovnicek)
    # X = wav2cqt_spec('MAPS_MUS-alb_esp2_AkPnCGdD.flac')
    # dnn.predict(X)
    # X = wav2cqt_spec('alb_esp{0}.wav'.format(1))
    # times = librosa.frames_to_time(np.arange(X.shape[0]), sr=SAMPLE_RATE, hop_length=HOP_LENGTH)
    # y = midi2labels('alb_esp{0}.mid'.format(1), times)
    # dnn.predict(X, y)
    # exit(0)

    # Build the dataset: CQT frames as inputs, per-frame MIDI labels as targets
    X_all, y_all = None, None
    for i in range(1, 7):
        X = wav2cqt_spec('alb_esp{0}.mp3'.format(i))
        times = librosa.frames_to_time(np.arange(X.shape[0]), sr=SAMPLE_RATE,
                                       hop_length=HOP_LENGTH)
        y = midi2labels('alb_esp{0}.mid'.format(i), times)
        print(X.shape, y.shape)
        if i == 1:
            X_all, y_all = X, y
        else:
            X_all, y_all = np.concatenate((X_all, X)), np.concatenate((y_all, y))

    # wavs = [x for x in listdir(WAV_DIR) if x.endswith('.mp3') and 'format0' not in x]
    # np.random.seed()
    # np.random.shuffle(wavs)
    #
    # i, length = 1, len(wavs)
    # X_all, y_all = None, None
    # for wav in wavs:
    #     try:
    #         X = wav2cqt_spec(wav)
    #         times = librosa.frames_to_time(np.arange(X.shape[0]), sr=SAMPLE_RATE, hop_length=HOP_LENGTH)
    #         y = midi2labels('{0}.mid'.format(wav.split('.')[0]), times)
    #
    #         if X_all is None or y_all is None:
    #             X_all, y_all = X, y
    #         elif X.shape[0] == y.shape[0]:
    #             X_all, y_all = np.concatenate((X_all, X)), np.concatenate((y_all, y))
    #
    #         print('{0}/{1} {2} {3}.mid'.format(i, length, wav, wav.split('.')[0]),
    #               X.shape, '/', X_all.shape, y.shape, '/', y_all.shape)
    #         i += 1
    #
    #         if i >= 20:
    #             break
    #
    #     except FileNotFoundError as err:
    #         print(err)
    #     except Exception as err:
    #         print(err)

    # Min-max normalize each feature dimension
    min_all, max_all = X_all.min(axis=0), X_all.max(axis=0)
    X_all = (X_all - min_all) / (max_all - min_all)

    # 50% train / 25% validation / 25% test split
    size = X_all.shape[0]
    half_size, third_size = size // 2, size // 2 + size // 4
    X_train, y_train = X_all[:half_size], y_all[:half_size]
    X_val, y_val = X_all[half_size:third_size], y_all[half_size:third_size]
    X_test, y_test = X_all[third_size:], y_all[third_size:]

    # dnn = DNN(256, 3)
    # dnn.create()
    # dnn.train(X_train, y_train, X_val, y_val)
    # dnn.predict(X_test, y_test)

    # Chop each split into fixed-length sequences for the LSTM
    X_train = np.array([X_train[i:i + LSTM_SAMPLE_SIZE, :]
                        for i in range(0, len(X_train) - LSTM_SAMPLE_SIZE + 1, LSTM_SAMPLE_SIZE)])
    y_train = np.array([y_train[i:i + LSTM_SAMPLE_SIZE, :]
                        for i in range(0, len(y_train) - LSTM_SAMPLE_SIZE + 1, LSTM_SAMPLE_SIZE)])
    X_val = np.array([X_val[i:i + LSTM_SAMPLE_SIZE, :]
                      for i in range(0, len(X_val) - LSTM_SAMPLE_SIZE + 1, LSTM_SAMPLE_SIZE)])
    y_val = np.array([y_val[i:i + LSTM_SAMPLE_SIZE, :]
                      for i in range(0, len(y_val) - LSTM_SAMPLE_SIZE + 1, LSTM_SAMPLE_SIZE)])
    X_test = np.array([X_test[i:i + LSTM_SAMPLE_SIZE, :]
                       for i in range(0, len(X_test) - LSTM_SAMPLE_SIZE + 1, LSTM_SAMPLE_SIZE)])
    y_test = np.array([y_test[i:i + LSTM_SAMPLE_SIZE, :]
                       for i in range(0, len(y_test) - LSTM_SAMPLE_SIZE + 1, LSTM_SAMPLE_SIZE)])

    try:
        lstm = LSTM(256, 3)
        lstm.create()
        lstm.summary()
        lstm.train(X_train, y_train, X_val, y_val)
        lstm.predict(X_test, y_test)
    except Exception as ex:
        print(ex)
praatEXE = 'C:/Users/user/Desktop/Praat.exe'
all_song = 'C:/Users/user/Desktop/mir_final/lemon.wav'
file = 'C:/Users/user/Desktop/mir_final/lemon.wav'
data, fs = librosa.load(file, sr=None, dtype='double')
all_data, fs = librosa.load(all_song, sr=None, dtype='double')

''' Param setting '''
win_len = 2048  # n of fft
hop_len = 512   # samples
rmse = np.log(librosa.feature.rmse(y=data, frame_length=win_len,
                                   hop_length=hop_len))

''' Frame step to time step '''
time_step = librosa.frames_to_time(range(rmse.shape[-1]), sr=fs,
                                   hop_length=hop_len, n_fft=win_len)

''' ZCR, pitch and energy to find candidates for beat '''
zcr = librosa.feature.zero_crossing_rate(data, frame_length=win_len,
                                         hop_length=hop_len)
energy = extractIntensity(file, 'C:/Users/user/Desktop/mir_final/energy.txt',
                          praatEXE, minPitch=65,
                          sampleStep=librosa.samples_to_time(hop_len, fs),
                          forceRegenerate=True, undefinedValue=0)
pitch = extractPitch(file, 'C:/Users/user/Desktop/mir_final/pitch.txt',
                     praatEXE,
M = np.vstack([mfcc, delta_mfcc, delta2_mfcc])

#######################
# Beat tracking
#######################
# Now, let's run the beat tracker.
# We'll use the percussive component for this part
plt.figure(figsize=(12, 6))
tempo, beats = librosa.beat.beat_track(y=y_percussive, sr=sr)

# Let's re-draw the spectrogram, but this time, overlay the detected beats
plt.figure(figsize=(12, 4))
librosa.display.specshow(log_S, sr=sr, x_axis='time', y_axis='mel')

# Let's draw transparent lines over the beat frames
plt.vlines(librosa.frames_to_time(beats), 1, 0.5 * sr,
           colors='w', linestyles='-', linewidth=2, alpha=0.5)
plt.axis('tight')
plt.colorbar(format='%+02.0f dB')
plt.tight_layout()

print('Estimated tempo: %.2f BPM' % tempo)
def analyzeSound(filename, beatsPerMeasure):
    y, sr = librosa.load(filename)
    tempo, beat_frames = librosa.beat.beat_track(y=y, sr=sr)
    print('Estimated tempo: {:.2f} beats per minute'.format(tempo))
    beat_times = librosa.frames_to_time(beat_frames, sr=sr)
    return tempo  # returns the estimated tempo in BPM
# Beats-per-minute estimation goes here
#from __future__ import print_function
import librosa

# Add default location here
filename = 'train/edmtest.mp3'
y, s = librosa.load(filename)
temp, frames = librosa.beat.beat_track(y=y, sr=s)
print('Estimated tempo: {:.2f} beats per minute'.format(temp))
beat_times = librosa.frames_to_time(frames, sr=s)  # beat event times in seconds, not BPM values
print('Saving output to bpm.csv')
#librosa.output.times_csv('bpm.csv', beat_times)
librosa.display.specshow(librosa.amplitude_to_db(S_full, ref=np.max),
                         y_axis='log', x_axis='time', sr=sr)
plt.colorbar()
plt.tight_layout()

###########################################################
# As you can see, there are periods of silence and
# non-silence throughout this recording.
#
# As a first step, we can plot the root-mean-square (RMS) curve
rms = librosa.feature.rms(y=y)[0]

times = librosa.frames_to_time(np.arange(len(rms)))

plt.figure(figsize=(12, 4))
plt.plot(times, rms)
plt.axhline(0.02, color='r', alpha=0.5)
plt.xlabel('Time')
plt.ylabel('RMS')
plt.axis('tight')
plt.tight_layout()

# The red line at 0.02 indicates a reasonable threshold for silence
# detection. However, the RMS curve occasionally dips below the threshold
# momentarily, and we would prefer the detector to not count these brief
# dips as silence. This is where the Viterbi algorithm comes in handy!
#####################################################
plt.figure(figsize=(14, 5))
plt.plot(x[n0:n1])
plt.grid()
plt.title(title)
plt.show()

# Zero crossings
zero_crossings = librosa.zero_crossings(x[n0:n1], pad=False)
print(f'Zero Crossings: {sum(zero_crossings)}')

# Spectral centroid
spectral_centroids = librosa.feature.spectral_centroid(x, sr=sr)[0]

# Computing the time variable for visualization
frames = range(len(spectral_centroids))
t = librosa.frames_to_time(frames)

# Normalising the spectral centroid for visualisation
def normalize(x, axis=0):
    return sklearn.preprocessing.minmax_scale(x, axis=axis)

# Plotting the spectral centroid along the waveform
dsp.waveplot(x, sr=sr, alpha=0.4)
plt.plot(t, normalize(spectral_centroids), color='r')
plt.title(title)
plt.show()

# Spectral rolloff: the frequency below which a specified percentage of the
# total spectral energy, e.g. 85%, lies
spectral_rolloff = librosa.feature.spectral_rolloff(x, sr=sr)[0]
def start_frames(self, value):
    self.__start_frames = value
    self.__start_time = librosa.frames_to_time(value, sr=utils.SAMPLE_RATE)
    self.__start_beat = round_to_sixteenth(
        0.25 * (self.__start_time / self.quarter_note_time))
def analyze(self, inputFile, count=2):
    if not self.loaded:
        raise UnloadedException()
    timeS = time.time()
    try:
        (signal, samplerate) = sf.read(inputFile)
    except:
        print("Error with chunk file. Unable to perform features extraction on the file.")
        raise Exception()

    # The number of columns in the dataset (except for index)
    dataset_shape = int(self.PARAM_FRAME_LENGTH / 10) * self.PARAM_NUMBER_MELS
    X_test_vectors = [np.repeat(0, dataset_shape)]

    signal = librosa.to_mono(np.transpose(signal))
    signal, _ = librosa.effects.trim(signal, top_db=50)
    #spectrogram = librosa.feature.melspectrogram(signal, sr=samplerate, n_fft=1024, hop_length=160, fmin=240, fmax=3000)
    spectrogram = librosa.feature.melspectrogram(signal, sr=samplerate,
                                                 n_fft=1024, hop_length=160)
    logSpectrogram = self.refFun(spectrogram)

    # Slide a PARAM_FRAME_LENGTH-ms window over the log spectrogram
    signalLength = float(len(signal) / samplerate) * 1000
    indexPosition = 0
    while indexPosition < signalLength - self.PARAM_FRAME_LENGTH:
        row = np.asarray(logSpectrogram[:, int(indexPosition / 10):
                                        int((indexPosition + self.PARAM_FRAME_LENGTH) / 10)]).ravel()
        X_test_vectors.append(row)
        indexPosition += self.PARAM_FRAME_LENGTH
    X_test_vectors = X_test_vectors[1:]  # We remove first row which is only 0

    X_test = []
    for i in range(len(X_test_vectors)):
        matrix = np.zeros((self.PARAM_NUMBER_MELS, int(self.PARAM_FRAME_LENGTH / 10)))
        for l in range(self.PARAM_NUMBER_MELS):
            for m in range(int(self.PARAM_FRAME_LENGTH / 10)):
                matrix[l, m] = X_test_vectors[i][l * int(self.PARAM_FRAME_LENGTH / 10) + m]
        X_test.append([matrix])

    # Project each window into the clustering space
    cluster_space_layer = K.function([self.model.layers[0].input],
                                     [self.model.layers[7].output])
    layer_output = cluster_space_layer([X_test])[0]

    cosinus_dist = 1. - sklearn.metrics.pairwise.cosine_similarity(layer_output)
    cosinus_dist[cosinus_dist < 0] = 0
    cosine_tsne = manifold.TSNE(n_components=2,
                                metric='precomputed').fit_transform(cosinus_dist)

    # Estimate the number of clusters from the largest gap in linkage distances
    Z = linkage(layer_output, metric='cosine', method='complete')
    minDist = max([row[2] for row in Z])
    nb_clusters = len(Z)
    for i in range(len(Z) - 1):
        if minDist > Z[i + 1][2] - Z[i][2]:
            minDist = Z[i + 1][2] - Z[i][2]
            nb_clusters = i
    if count is None:
        count = 2
    count = int(count)
    clustering = AgglomerativeClustering(affinity='cosine', linkage="complete",
                                         n_clusters=count).fit_predict(layer_output)

    # Now we need to find indexes where the current speaker changes
    flags = []
    currentSpeaker = clustering[0]
    for i in range(1, len(clustering)):
        if clustering[i] != currentSpeaker:
            currentSpeaker = clustering[i]
            flags.append(i)

    # Refine each change point with an agglomerative segmentation of chroma
    finalClustering = []
    for flag in flags:
        fragment = signal[(flag - 1) * samplerate:(flag + 1) * samplerate]
        chroma = librosa.feature.chroma_cens(y=fragment, sr=samplerate)
        #librosa.output.write_wav("output/test_fragment.wav", test_fragment, samplerate)
        bounds = librosa.segment.agglomerative(chroma, 3)
        speakerStartPos = (flag - 1) + librosa.frames_to_time(bounds, sr=samplerate)[1]
        finalClustering.append(float("{0:.3f}".format(speakerStartPos)))

    flags.insert(0, 0)
    finalClustering.insert(0, 0)
    result = [[] for i in range(count)]
    for i in range(1, len(flags)):
        print(flags[i] - 1)
        n = clustering[flags[i] - 1]
        result[n].append((finalClustering[i - 1], finalClustering[i] - 0.001))
    result[clustering[-1]].append((finalClustering[-1], "EOF"))
    #clustering = KMeans(n_clusters=4).fit_predict(layer_output)
    return {'res': result, 'exec_time': time.time() - timeS}
def worker(self):
    audio = pyaudio.PyAudio()
    print('\n*******************************************')
    print('RHAPSODY MODULE-I INPUT')
    print('*******************************************\n')
    print('\n===========================================')
    print('STARTED RECORDING')
    print('===========================================\n')
    for i in range(1, 4):
        print('\n===========================================')
        print(str(i) + '...')
        print('===========================================\n')
        sleep(1)

    stream = audio.open(format=self.FORMAT, channels=self.CHANNELS,
                        rate=self.RATE, input=True,
                        frames_per_buffer=self.CHUNK)
    f = []
    for i in range(0, int(self.RATE / self.CHUNK * self.RECORD_SECONDS)):
        data = stream.read(self.CHUNK)
        f.append(data)

    print('\n===========================================')
    print('DONE RECORDING')
    print('===========================================\n')

    stream.stop_stream()
    stream.close()
    audio.terminate()

    wf = wave.open(self.WAVE_OUTPUT_FILENAME, 'wb')
    wf.setnchannels(self.CHANNELS)
    wf.setsampwidth(audio.get_sample_size(self.FORMAT))
    wf.setframerate(self.RATE)
    wf.writeframes(b''.join(f))
    wf.close()

    """ 1 - Loading File """
    filename = self.WAVE_OUTPUT_FILENAME
    y, sr = librosa.load(filename)

    """ 2 - Get Tempo == bpm """
    tempo, beat_frames = librosa.beat.beat_track(y=y, sr=sr)
    print('\n===========================================')
    print('Estimated tempo: {:.2f} beats per minute'.format(tempo))
    print('===========================================\n')

    # Generate CSV file with beat times
    #CSV_FILENAME = self.WAVE_OUTPUT_FILENAME_NO_EXTENSION + ".csv"
    beat_times = librosa.frames_to_time(beat_frames, sr=sr)
    CSV_FILENAME = os.path.abspath(
        os.path.join(os.path.dirname(__file__), "Recordings", self.final + ".csv"))
    librosa.output.times_csv(CSV_FILENAME, beat_times)

    # Write a file with the tempo
    #TEXT_FILENAME = self.WAVE_OUTPUT_FILENAME_NO_EXTENSION + ".txt"
    TEXT_FILENAME = os.path.abspath(
        os.path.join(os.path.dirname(__file__), "Recordings", self.final + ".txt"))
    bpm_value = open(TEXT_FILENAME, 'w')
    tempo_text = str(tempo) + '\n'
    bpm_value.write(tempo_text)

    """ 3 - Get Notes """
    hz = librosa.feature.chroma_cqt(y=y, sr=sr)

    ## GET STRONGEST OCTAVE
    strongestOctave = 0
    strongestOctave_sum = 0
    for octave in range(len(hz)):
        total = 0
        for frame in hz[octave]:
            total = total + frame
        if total > strongestOctave_sum:
            strongestOctave_sum = total
            strongestOctave = octave

    ## GET HIGHEST HZ FOR EACH TIME FRAME
    strongestHz = []
    for i in range(len(hz[0])):
        strongestHz.append(0)
    notes = []
    for i in range(len(hz[0])):
        notes.append(0)
    for frame_i in range(len(hz[0])):
        strongest_temp = 0
        for octave_i in range(len(hz)):
            if hz[octave_i][frame_i] > strongest_temp:
                strongest_temp = hz[octave_i][frame_i]
                strongestHz[frame_i] = octave_i + 1
                notes[frame_i] = librosa.hz_to_note(hz[octave_i][frame_i])

    # C C# D D# E F F# G G# A  A#  B
    # 1 2  3 4  5 6 7  8 9  10 11  12
    strongestHz_sum = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    for note in strongestHz:
        strongestHz_sum[note - 1] = strongestHz_sum[note - 1] + 1
    for i in range(len(strongestHz_sum)):
        strongestHz_sum[i] = float(strongestHz_sum[i]) / len(strongestHz)

    noteSorted = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    for num in range(len(noteSorted)):
        biggest = strongestHz_sum.index(max(strongestHz_sum))
        noteSorted[num] = biggest + 1
        strongestHz_sum[biggest] = strongestHz_sum[biggest] - 0.25

    for note in noteSorted:
        noteString = str(note) + '\n'
        bpm_value.write(noteString)
    bpm_value.close()

    print('\n===========================================')
    print('RECORDING ANALYSIS COMPLETED SUCCESSFULLY!!!')
    print('===========================================\n')
    self.finished.emit()
                                     hop_length=hop_length, units='time')

######################################################################
# If you look carefully, the default onset detector (top sub-plot) has
# several false positives in high-vibrato regions, e.g. around 0.62s or
# 1.80s.
#
# The superflux method (middle plot) is less susceptible to vibrato, and
# does not detect onset events at those points.

# sphinx_gallery_thumbnail_number = 2
plt.figure(figsize=(6, 6))
frame_time = librosa.frames_to_time(np.arange(len(odf_default)),
                                    sr=sr, hop_length=hop_length)

ax = plt.subplot(2, 1, 2)
librosa.display.specshow(librosa.power_to_db(S, ref=np.max),
                         y_axis='mel', x_axis='time', sr=sr,
                         hop_length=hop_length, fmin=fmin, fmax=fmax)
plt.xlim([0, 5.0])
plt.axis('tight')

plt.subplot(4, 1, 1, sharex=ax)
plt.plot(frame_time, odf_default, label='Spectral flux')
def preprocess(args):
    # params
    path = os.path.join('models', args['model_name'])
    config = load_config(os.path.join(path, 'config.json'))

    bin_multiple = int(args['bin_multiple'])
    spec_type = args['spec_type']

    framecnt = 0

    # hack to deal with high PPQ from MAPS
    # https://github.com/craffel/pretty-midi/issues/112
    pretty_midi.pretty_midi.MAX_TICK = 1e10

    for s in os.listdir(data_dir):
        subdir = os.path.join(data_dir, s)
        if not os.path.isdir(subdir):
            continue
        # recursively search in subdir
        print subdir
        inputs, outputs = [], []
        addCnt, errCnt = 0, 0
        for dp, dn, filenames in os.walk(subdir):
            # in each level of the directory, look at filenames ending with .mid
            for f in filenames:
                # if there exists a .wav file and .midi file with the same name
                if f.endswith('.wav'):
                    audio_fn = f
                    fprefix = audio_fn.split('.wav')[0]
                    mid_fn = fprefix + '.mid'
                    txt_fn = fprefix + '.txt'
                    if mid_fn in filenames:
                        # wav2inputnp
                        audio_fn = os.path.join(dp, audio_fn)
                        # mid2outputnp
                        mid_fn = os.path.join(dp, mid_fn)

                        pm_mid = pretty_midi.PrettyMIDI(mid_fn)

                        inputnp = wav2inputnp(audio_fn, spec_type=spec_type,
                                              bin_multiple=bin_multiple)
                        times = librosa.frames_to_time(np.arange(inputnp.shape[0]),
                                                       sr=sr, hop_length=hop_length)
                        outputnp = mid2outputnp(pm_mid, times)

                        # check that the number of frames is equal
                        if inputnp.shape[0] == outputnp.shape[0]:
                            print("adding to dataset fprefix {}".format(fprefix))
                            addCnt += 1
                            framecnt += inputnp.shape[0]
                            print("framecnt is {}".format(framecnt))
                            inputs.append(inputnp)
                            outputs.append(outputnp)
                        else:
                            print("error for fprefix {}".format(fprefix))
                            errCnt += 1
                            print(inputnp.shape)
                            print(outputnp.shape)

        print("{} examples in dataset".format(addCnt))
        print("{} examples couldnt be processed".format(errCnt))

        if addCnt:
            inputs = np.concatenate(inputs)
            outputs = np.concatenate(outputs)

            fn = subdir.split('/')[-1]
            if not fn:
                fn = subdir.split('/')[-2]
            datapath = joinAndCreate(path, 'data')
            fnpath = joinAndCreate(datapath, fn)

            mmi = np.memmap(filename=os.path.join(fnpath, 'input.dat'),
                            mode='w+', shape=inputs.shape)
            mmi[:] = inputs[:]
            mmo = np.memmap(filename=os.path.join(fnpath, 'output.dat'),
                            mode='w+', shape=outputs.shape)
            mmo[:] = outputs[:]
            del mmi
            del mmo
# filename, onset_code = 'F:/项目/花城音乐项目/样式数据/7.18MP3/旋律/小学8题20190718-9728-3.wav', '[2000;250,250,250,250,1000;2000;500,500,1000]'  # 100
# filename, onset_code = 'F:/项目/花城音乐项目/样式数据/7.18MP3/旋律/小学8题20190718-9728-4.wav', '[1000,250,250,250,250;2000;1000,500,500;2000]'  # 100

# rhythm_code = '[1000,1000;500,500,1000;500,250,250,500,500;2000]'
# melody_code = '[5,5,3,2,1,2,2,3,2,6-,5-]'
print("rhythm_code is {}".format(rhythm_code))
print("pitch_code is {}".format(pitch_code))
# plt, total_score, onset_score, note_scroe, detail_content = draw_plt(filename, rhythm_code, pitch_code)
# plt.show()
# plt.clf()
y, sr = librosa.load(filename)
CQT = librosa.amplitude_to_db(librosa.cqt(y, sr=16000), ref=np.max)
CQT = np.where(CQT > -22, np.max(CQT), np.min(CQT))
plt.subplot(2, 1, 1)
rms, sig_ff, max_indexs = get_cqt_diff(filename)
times = librosa.frames_to_time(np.arange(len(rms)))
librosa.display.specshow(CQT, x_axis='time')
#plt.plot(times, rms)
#plt.plot(times, sig_ff)
plt.xlim(0, np.max(times))
max_index_times = librosa.frames_to_time(max_indexs)
#plt.vlines(max_index_times, 0, np.max(rms), color='r', linestyle='dashed')
start, end, length = get_onset_frame_length(filename, onset_code)
base_frames = onsets_base_frames(onset_code, length)
base_frames_diff = np.diff(base_frames)
start_indexs = get_cqt_start_indexs(filename)
print("start_indexs is {}, size {}".format(start_indexs, len(start_indexs)))
# best_start_indexs = get_best_cqt_start_indexs_by_diff_level(filename, start, end, base_frames)
# start_indexs = best_start_indexs
parser = TestOptions()
args = parser.parse()
args.train = False
thr = args.thr

# Process music and get feature
infile = args.aud_path
outfile = 'style.npy'
p.preprocess(infile, outfile)

y, sr = librosa.load(infile)
onset_env = librosa.onset.onset_strength(y, sr=sr, aggregate=np.median)
times = librosa.frames_to_time(np.arange(len(onset_env)), sr=sr,
                               hop_length=512)
tempo, beats = librosa.beat.beat_track(onset_envelope=onset_env, sr=sr)
np.save('beats.npy', times[beats])
beats = np.round(librosa.frames_to_time(beats, sr=sr) * 15)
beats = np.load('beats.npy')
aud = np.load('style.npy')
os.remove('beats.npy')
os.remove('style.npy')
#shutil.rmtree('normalized')

#### Pretrain network from Decomp
initp_enc, stdp_dec, movement_enc = loadDecompModel(args)

#### Comp network
def cqt_one(input_file, output_file, cqt_params=None, audio_params=None,
            harmonic_params=None, skip_existing=True):
    """Compute the CQT for an input/output file pair.

    Parameters
    ----------
    input_file : str
        Audio file to apply the CQT

    output_file : str
        Path to write the output.

    cqt_params : dict, default=None
        Parameters for the CQT function. See `librosa.cqt`.

    audio_params : dict, default=None
        Parameters for reading the audio file. See `claudio.read`.

    harmonic_params : dict, default=None
        Parameters for the `harmonic_cqt` function, which will update
        those in cqt_params.

    skip_existing : bool, default=True
        Skip outputs that exist.

    Returns
    -------
    success : bool
        True if the output file was successfully created.
    """
    input_exists, output_exists = [os.path.exists(f)
                                   for f in (input_file, output_file)]
    if not input_exists:
        logger.warning("[{0}] Input file doesn't exist, skipping: {1}"
                       "".format(time.asctime(), input_file))
        return input_exists

    if skip_existing and output_exists:
        logger.info("[{0}] Output file exists, skipping: {1}"
                    "".format(time.asctime(), output_file))
        return output_exists

    logger.debug("[{0}] Starting {1}".format(time.asctime(), input_file))
    if not cqt_params:
        cqt_params = CQT_PARAMS.copy()

    if not audio_params:
        audio_params = AUDIO_PARAMS.copy()

    if not harmonic_params:
        harmonic_params = HARMONIC_PARAMS.copy()

    logger.debug("[{0}] Audio conversion {1}".format(
        time.asctime(), input_file))
    try:
        x, fs = claudio.read(input_file, **audio_params)
        if len(x) <= 0:
            logger.error("Bad Input signal length={} for audio {}".format(
                len(x), input_file))
            return False

        logger.debug("[{0}] Computing features {1}".format(
            time.asctime(), input_file))
        cqt_spectra = np.array([np.abs(librosa.cqt(x_c, sr=fs, **cqt_params).T)
                                for x_c in x.T])

        cqt_params.update(**harmonic_params)
        harm_spectra = harmonic_cqt(x, fs, **cqt_params)

        frame_idx = np.arange(cqt_spectra.shape[1])
        time_points = librosa.frames_to_time(
            frame_idx, sr=fs, hop_length=cqt_params['hop_length'])

        logger.debug("[{0}] Saving: {1}".format(time.asctime(), output_file))
        np.savez(
            output_file,
            time_points=time_points,
            cqt=np.abs(cqt_spectra).astype(np.float32),
            harmonic_cqt=np.abs(harm_spectra).astype(np.float32))
    except AssertionError as e:
        logger.error("Failed to load audio file: {} with error:\n{}".format(
            input_file, e))

    logger.debug("[{0}] Finished: {1}".format(time.asctime(), output_file))
    return os.path.exists(output_file)
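# Usage sketch for cqt_one(); the paths are hypothetical, and the
# CQT_PARAMS / AUDIO_PARAMS / HARMONIC_PARAMS defaults are assumed to be
# module-level dictionaries, as the function body implies.
success = cqt_one('input.wav', 'features.npz', skip_existing=False)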
def extract_feature(path):
    id = 1                        # Song ID
    feature_set = pd.DataFrame()  # Feature matrix

    # Individual feature vectors
    songname_vector = pd.Series()
    tempo_vector = pd.Series()
    total_beats = pd.Series()
    average_beats = pd.Series()
    chroma_stft_mean = pd.Series()
    chroma_stft_std = pd.Series()
    chroma_stft_var = pd.Series()
    chroma_cq_mean = pd.Series()
    chroma_cq_std = pd.Series()
    chroma_cq_var = pd.Series()
    chroma_cens_mean = pd.Series()
    chroma_cens_std = pd.Series()
    chroma_cens_var = pd.Series()
    mel_mean = pd.Series()
    mel_std = pd.Series()
    mel_var = pd.Series()
    mfcc_mean = pd.Series()
    mfcc_std = pd.Series()
    mfcc_var = pd.Series()
    mfcc_delta_mean = pd.Series()
    mfcc_delta_std = pd.Series()
    mfcc_delta_var = pd.Series()
    rmse_mean = pd.Series()
    rmse_std = pd.Series()
    rmse_var = pd.Series()
    cent_mean = pd.Series()
    cent_std = pd.Series()
    cent_var = pd.Series()
    spec_bw_mean = pd.Series()
    spec_bw_std = pd.Series()
    spec_bw_var = pd.Series()
    contrast_mean = pd.Series()
    contrast_std = pd.Series()
    contrast_var = pd.Series()
    rolloff_mean = pd.Series()
    rolloff_std = pd.Series()
    rolloff_var = pd.Series()
    poly_mean = pd.Series()
    poly_std = pd.Series()
    poly_var = pd.Series()
    tonnetz_mean = pd.Series()
    tonnetz_std = pd.Series()
    tonnetz_var = pd.Series()
    zcr_mean = pd.Series()
    zcr_std = pd.Series()
    zcr_var = pd.Series()
    harm_mean = pd.Series()
    harm_std = pd.Series()
    harm_var = pd.Series()
    perc_mean = pd.Series()
    perc_std = pd.Series()
    perc_var = pd.Series()
    frame_mean = pd.Series()
    frame_std = pd.Series()
    frame_var = pd.Series()

    # Traverse each file in path
    file_data = [f for f in listdir(path) if isfile(join(path, f))]
    for line in file_data:
        if line[-1:] == '\n':
            line = line[:-1]

        # Read the song
        songname = path + line
        y, sr = librosa.load(songname, duration=60)
        S = np.abs(librosa.stft(y))

        # Extract features
        tempo, beats = librosa.beat.beat_track(y=y, sr=sr)
        chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr)
        chroma_cq = librosa.feature.chroma_cqt(y=y, sr=sr)
        chroma_cens = librosa.feature.chroma_cens(y=y, sr=sr)
        melspectrogram = librosa.feature.melspectrogram(y=y, sr=sr)
        rmse = librosa.feature.rmse(y=y)
        cent = librosa.feature.spectral_centroid(y=y, sr=sr)
        spec_bw = librosa.feature.spectral_bandwidth(y=y, sr=sr)
        contrast = librosa.feature.spectral_contrast(S=S, sr=sr)
        rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
        poly_features = librosa.feature.poly_features(S=S, sr=sr)
        tonnetz = librosa.feature.tonnetz(y=y, sr=sr)
        zcr = librosa.feature.zero_crossing_rate(y)
        harmonic = librosa.effects.harmonic(y)
        percussive = librosa.effects.percussive(y)
        mfcc = librosa.feature.mfcc(y=y, sr=sr)
        mfcc_delta = librosa.feature.delta(mfcc)
        onset_frames = librosa.onset.onset_detect(y=y, sr=sr)
        frames_to_time = librosa.frames_to_time(onset_frames[:20], sr=sr)

        # Store the features
        # NOTE: pd.Series.set_value is deprecated (removed in pandas 1.0);
        # on newer pandas, use `series.at[id] = value` instead.
        songname_vector.set_value(id, line)                    # song name
        tempo_vector.set_value(id, tempo)                      # tempo
        total_beats.set_value(id, sum(beats))                  # beats
        average_beats.set_value(id, np.average(beats))
        chroma_stft_mean.set_value(id, np.mean(chroma_stft))   # chroma stft
        chroma_stft_std.set_value(id, np.std(chroma_stft))
        chroma_stft_var.set_value(id, np.var(chroma_stft))
        chroma_cq_mean.set_value(id, np.mean(chroma_cq))       # chroma cq
        chroma_cq_std.set_value(id, np.std(chroma_cq))
        chroma_cq_var.set_value(id, np.var(chroma_cq))
        chroma_cens_mean.set_value(id, np.mean(chroma_cens))   # chroma cens
        chroma_cens_std.set_value(id, np.std(chroma_cens))
        chroma_cens_var.set_value(id, np.var(chroma_cens))
        mel_mean.set_value(id, np.mean(melspectrogram))        # melspectrogram
        mel_std.set_value(id, np.std(melspectrogram))
        mel_var.set_value(id, np.var(melspectrogram))
        mfcc_mean.set_value(id, np.mean(mfcc))                 # mfcc
        mfcc_std.set_value(id, np.std(mfcc))
        mfcc_var.set_value(id, np.var(mfcc))
        mfcc_delta_mean.set_value(id, np.mean(mfcc_delta))     # mfcc delta
        mfcc_delta_std.set_value(id, np.std(mfcc_delta))
        mfcc_delta_var.set_value(id, np.var(mfcc_delta))
        rmse_mean.set_value(id, np.mean(rmse))                 # rmse
        rmse_std.set_value(id, np.std(rmse))
        rmse_var.set_value(id, np.var(rmse))
        cent_mean.set_value(id, np.mean(cent))                 # centroid
        cent_std.set_value(id, np.std(cent))
        cent_var.set_value(id, np.var(cent))
        spec_bw_mean.set_value(id, np.mean(spec_bw))           # spectral bandwidth
        spec_bw_std.set_value(id, np.std(spec_bw))
        spec_bw_var.set_value(id, np.var(spec_bw))
        contrast_mean.set_value(id, np.mean(contrast))         # contrast
        contrast_std.set_value(id, np.std(contrast))
        contrast_var.set_value(id, np.var(contrast))
        rolloff_mean.set_value(id, np.mean(rolloff))           # rolloff
        rolloff_std.set_value(id, np.std(rolloff))
        rolloff_var.set_value(id, np.var(rolloff))
        poly_mean.set_value(id, np.mean(poly_features))        # poly features
        poly_std.set_value(id, np.std(poly_features))
        poly_var.set_value(id, np.var(poly_features))
        tonnetz_mean.set_value(id, np.mean(tonnetz))           # tonnetz
        tonnetz_std.set_value(id, np.std(tonnetz))
        tonnetz_var.set_value(id, np.var(tonnetz))
        zcr_mean.set_value(id, np.mean(zcr))                   # zero crossing rate
        zcr_std.set_value(id, np.std(zcr))
        zcr_var.set_value(id, np.var(zcr))
        harm_mean.set_value(id, np.mean(harmonic))             # harmonic
        harm_std.set_value(id, np.std(harmonic))
        harm_var.set_value(id, np.var(harmonic))
        perc_mean.set_value(id, np.mean(percussive))           # percussive
        perc_std.set_value(id, np.std(percussive))
        perc_var.set_value(id, np.var(percussive))
        frame_mean.set_value(id, np.mean(frames_to_time))      # onset frames
        frame_std.set_value(id, np.std(frames_to_time))
        frame_var.set_value(id, np.var(frames_to_time))

        print(songname)
        id = id + 1

    # Assemble the features into one DataFrame
    feature_set['song_name'] = songname_vector                 # song name
    feature_set['tempo'] = tempo_vector                        # tempo
    feature_set['total_beats'] = total_beats                   # beats
    feature_set['average_beats'] = average_beats
    feature_set['chroma_stft_mean'] = chroma_stft_mean         # chroma stft
    feature_set['chroma_stft_std'] = chroma_stft_std
    feature_set['chroma_stft_var'] = chroma_stft_var
    feature_set['chroma_cq_mean'] = chroma_cq_mean             # chroma cq
    feature_set['chroma_cq_std'] = chroma_cq_std
    feature_set['chroma_cq_var'] = chroma_cq_var
    feature_set['chroma_cens_mean'] = chroma_cens_mean         # chroma cens
    feature_set['chroma_cens_std'] = chroma_cens_std
    feature_set['chroma_cens_var'] = chroma_cens_var
    feature_set['melspectrogram_mean'] = mel_mean              # melspectrogram
    feature_set['melspectrogram_std'] = mel_std
    feature_set['melspectrogram_var'] = mel_var
    feature_set['mfcc_mean'] = mfcc_mean                       # mfcc
    feature_set['mfcc_std'] = mfcc_std
    feature_set['mfcc_var'] = mfcc_var
    feature_set['mfcc_delta_mean'] = mfcc_delta_mean           # mfcc delta
    feature_set['mfcc_delta_std'] = mfcc_delta_std
    feature_set['mfcc_delta_var'] = mfcc_delta_var
    feature_set['rmse_mean'] = rmse_mean                       # rmse
    feature_set['rmse_std'] = rmse_std
    feature_set['rmse_var'] = rmse_var
    feature_set['cent_mean'] = cent_mean                       # centroid
    feature_set['cent_std'] = cent_std
    feature_set['cent_var'] = cent_var
    feature_set['spec_bw_mean'] = spec_bw_mean                 # spectral bandwidth
    feature_set['spec_bw_std'] = spec_bw_std
    feature_set['spec_bw_var'] = spec_bw_var
    feature_set['contrast_mean'] = contrast_mean               # contrast
    feature_set['contrast_std'] = contrast_std
    feature_set['contrast_var'] = contrast_var
    feature_set['rolloff_mean'] = rolloff_mean                 # rolloff
    feature_set['rolloff_std'] = rolloff_std
    feature_set['rolloff_var'] = rolloff_var
    feature_set['poly_mean'] = poly_mean                       # poly features
    feature_set['poly_std'] = poly_std
    feature_set['poly_var'] = poly_var
    feature_set['tonnetz_mean'] = tonnetz_mean                 # tonnetz
    feature_set['tonnetz_std'] = tonnetz_std
    feature_set['tonnetz_var'] = tonnetz_var
    feature_set['zcr_mean'] = zcr_mean                         # zero crossing rate
    feature_set['zcr_std'] = zcr_std
    feature_set['zcr_var'] = zcr_var
    feature_set['harm_mean'] = harm_mean                       # harmonic
    feature_set['harm_std'] = harm_std
    feature_set['harm_var'] = harm_var
    feature_set['perc_mean'] = perc_mean                       # percussive
    feature_set['perc_std'] = perc_std
    feature_set['perc_var'] = perc_var
    feature_set['frame_mean'] = frame_mean                     # onset frames
    feature_set['frame_std'] = frame_std
    feature_set['frame_var'] = frame_var

    # Write the DataFrame to CSV and JSON files
    feature_set.to_csv('Emotion_features.csv')
    feature_set.to_json('Emotion_features.json')
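# Usage sketch for `extract_feature`: the directory path is a hypothetical
# placeholder and must end with a separator, since `songname` is built by
# plain string concatenation; results land in Emotion_features.csv/.json.
extract_feature('./audio_clips/')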
""" Segmentation using silence detection with spectral flatness of chroma features. WiMIR workshop topic: Verse and chorus detection in vocal cover versions. Author - Shreyan Chowdhury """ import librosa from librosa import display import numpy as np from matplotlib import pyplot as plt y, sr = librosa.core.load('/home/shreyan/PROJECTS/_data/structure_workshop/hero_vocals.wav') chroma = librosa.feature.chroma_stft(y) chroma_flatness = librosa.feature.spectral_flatness(S=chroma) smoothed_chroma_flatness = np.convolve(chroma_flatness.squeeze(), np.ones(100)) bounds = librosa.segment.agglomerative(smoothed_chroma_flatness, 20) xtimes = librosa.frames_to_time(range(len(smoothed_chroma_flatness)), sr=sr) fig, (ax1, ax2) = plt.subplots(2, 1) librosa.display.specshow(chroma, ax=ax1) ax2.plot(xtimes, smoothed_chroma_flatness) ax2.vlines(librosa.frames_to_time(bounds, sr=sr), 0, max(smoothed_chroma_flatness), color='black', linestyle='--',linewidth=2, alpha=0.9, label='Segment boundaries') plt.show()
# Estimate harmonic energy via harmonic salience
S = np.abs(librosa.stft(y))
freqs = librosa.core.fft_frequencies(sr)
harms = [1, 2, 3, 4]
weights = [1.0, 0.5, 0.33, 0.25]
S_sal = librosa.salience(S, freqs, harms, weights, fill_value=0)

# Estimate candidate beats (to be filtered later), plus their timing and magnitude
onset_env = librosa.onset.onset_strength(y=y, sr=sr, aggregate=np.median)
tempo, beat_frames = librosa.beat.beat_track(onset_envelope=onset_env,
                                             hop_length=hop_length,
                                             y=y_harmonic, sr=sr,
                                             tightness=0.1)
timing = librosa.frames_to_time(beat_frames)
pitches, magnitudes = librosa.core.piptrack(y=y_harmonic, sr=sr,
                                            n_fft=(hop_length * 4),
                                            hop_length=hop_length,
                                            threshold=0.1)

# Notes correspond to the timing variable; they get plotted and sent to pickle
notes, pick, mags, freq, harm = [], [], [], [], []
for x in range(0, len(beat_frames)):
    try:
        freq.append(detect_pitch(y_harmonic, sr, beat_frames[x]))
        harm.append(get_energy(y_harmonic, sr, beat_frames[x]))
        note = librosa.hz_to_note(
            detect_pitch(y_harmonic, sr, beat_frames[x]))
        notes.append(note)
    except Exception:
        # The original snippet is truncated here; this bare guard is an
        # assumption that keeps the loop going when pitch detection fails.
        continue
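# Hypothetical sketch of the undefined `detect_pitch` helper (not the
# original code): one common pattern is to reuse the module-level piptrack
# output and pick the strongest pitch candidate at the given frame.
def detect_pitch(y, sr, frame):
    # Strongest piptrack bin in this frame; relies on the `pitches` and
    # `magnitudes` arrays computed above with the same hop_length.
    index = magnitudes[:, frame].argmax()
    return pitches[index, frame]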
def get_detail_cqt_rms_secondary_optimised(filename):
    onset_frames_cqt, best_y, best_threshold, _ = get_detail_cqt_rms(filename)
    y, sr = librosa.load(filename)
    loss_frames = []
    for i in range(len(onset_frames_cqt) - 1):
        start = onset_frames_cqt[i]
        end = onset_frames_cqt[i + 1]
        if end - start > 30:
            start_end_time = librosa.frames_to_time([start, end], sr=sr)
            y1, sr1 = librosa.load(filename, offset=start_end_time[0],
                                   duration=start_end_time[1] - start_end_time[0])
            # Find missed onsets against the RMS threshold line
            if len(onset_frames_cqt) > 0:
                threshold = 0.6
                tmp = get_missing_by_best_threshod(y1, [start, end], threshold)
                if len(tmp) >= 3:
                    for j in range(1, len(tmp) - 1):
                        loss_frames.append(tmp[j])

    if len(loss_frames) > 0:
        for x in loss_frames:
            onset_frames_cqt.append(x)
        onset_frames_cqt.sort()

    CQT = librosa.amplitude_to_db(librosa.cqt(y, sr=16000), ref=np.max)
    onset_frames_cqt = get_miss_onsets_by_cqt(y, onset_frames_cqt)
    onset_frames_cqt = find_false_onsets_rms_secondary_optimised(
        y, onset_frames_cqt, 0.1, 0.1)
    if onset_frames_cqt:
        min_width = 5
        onset_frames_cqt = del_overcrowding(onset_frames_cqt, min_width)

    onset_frames_cqt_time = librosa.frames_to_time(onset_frames_cqt, sr=sr)
    type_index = get_onsets_index_by_filename(filename)
    total_frames_number = get_total_frames_number(filename)
    best_y = []
    # Reference (standard) beat time points
    if len(onset_frames_cqt) > 0:
        base_frames = onsets_base_frames_for_note(filename)
        base_frames = [x + onset_frames_cqt[0] - base_frames[0]
                       for x in base_frames]
        min_d, best_y, onsets_frames = get_dtw_min(onset_frames_cqt,
                                                   base_frames, 65)
    else:
        base_frames = onsets_base_frames_for_note(filename)
    base_onsets = librosa.frames_to_time(base_frames, sr=sr)

    plt.close()  # close the figure handle from the first pass
    plt.figure(figsize=(10, 6))
    plt.subplot(4, 1, 1)  # plt.subplot(rows, cols, index): first panel
    librosa.display.specshow(CQT, y_axis='cqt_note', x_axis='time')
    plt.vlines(onset_frames_cqt_time, 0, sr, color='y', linestyle='solid')

    plt.subplot(4, 1, 2)
    librosa.display.waveplot(y, sr=sr)
    plt.vlines(onset_frames_cqt_time, -1 * np.max(y), np.max(y),
               color='y', linestyle='solid')

    plt.subplot(4, 1, 3)
    rms = librosa.feature.rmse(y=y)[0]
    rms = [x / np.std(rms) for x in rms]
    max_rms = np.max(rms)
    times = librosa.frames_to_time(np.arange(len(rms)))
    rms_on_onset_frames_cqt = [rms[x] for x in onset_frames_cqt]
    min_rms_on_onset_frames_cqt = np.min(rms_on_onset_frames_cqt)
    rms = [1 if x >= min_rms_on_onset_frames_cqt else 0 for x in rms]
    plt.plot(times, rms)
    plt.axhline(max_rms * best_threshold)
    plt.vlines(onset_frames_cqt_time, 0, np.max(rms), color='y',
               linestyle='solid')
    plt.xlim(0, np.max(times))

    plt.subplot(4, 1, 4)
    plt.vlines(base_onsets, 0, np.max(rms), color='r', linestyle='dashed')
    plt.xlim(0, np.max(times))
    plt.axhline(max_rms * best_threshold)
    return onset_frames_cqt, best_y, best_threshold, plt
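# Usage sketch (hypothetical file path): run the secondary-optimised pass
# and render the four diagnostic panels it assembles.
onset_frames, best_y, best_threshold, plt_handle = \
    get_detail_cqt_rms_secondary_optimised('recordings/sight_singing_01.wav')
plt_handle.show()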
# Merge all the features into one matrix of training features
def merge(onset_strength, is_onset, beat, mfcc):
    feature = []
    for i in range(len(onset_strength)):
        feature.append(onset_strength[i] + is_onset[i] + beat[i] + mfcc[i])
    return feature


if __name__ == "__main__":
    if len(sys.argv) != 3:
        # the unreachable exit(-1) after this raise has been dropped
        raise argparse.ArgumentTypeError(
            'exactly 3 arguments are required: script, audio file, CSV file')

    y, sr = librosa.load(sys.argv[1], sr=None)
    o_env = librosa.onset.onset_strength(y, sr=sr)
    times = librosa.frames_to_time(np.arange(len(o_env)), sr=sr)
    onset = getOnset(y, sr)

    with open(sys.argv[2], 'r') as my_file:
        csvreader = csv.reader(my_file)
        mis = list(csvreader)
    mis = [[int(ele[0]), float(ele[1])] for ele in mis]

    mfcc = getmfcc(y, sr, mis)
    onset_strength = [[o_env[librosa.core.time_to_frames(e[1] / 1000, sr=sr)[0]]]
                      for e in mis]
    is_onset = isonset(onset, mis)
    beat = isbeat(mis)
    tr_f = merge(onset_strength, is_onset, beat, mfcc)
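# Invocation sketch (hypothetical file names): the CSV is expected to hold
# (index, time-in-milliseconds) rows, matching the parsing above.
#   python extract_training_features.py take01.wav take01_notes.csv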
def get_detail_cqt_rms(filename):
    y, sr = librosa.load(filename)
    CQT = librosa.amplitude_to_db(librosa.cqt(y, sr=16000), ref=np.max)
    onset_frames_cqt, best_threshold = get_onsets_by_cqt_rms_optimised(filename)
    onset_frames_cqt = get_miss_onsets_by_cqt(y, onset_frames_cqt)
    onset_frames_cqt_time = librosa.frames_to_time(onset_frames_cqt, sr=sr)
    type_index = get_onsets_index_by_filename(filename)
    total_frames_number = get_total_frames_number(filename)
    best_y = []
    # Reference (standard) beat time points
    if len(onset_frames_cqt) > 0:
        base_frames = onsets_base_frames(
            codes[type_index], total_frames_number - onset_frames_cqt[0])
        base_frames = [x + (onset_frames_cqt[0] - base_frames[0])
                       for x in base_frames]
        min_d, best_y, onsets_frames = get_dtw_min(onset_frames_cqt,
                                                   base_frames, 65)
    else:
        base_frames = onsets_base_frames(codes[type_index], total_frames_number)
    base_onsets = librosa.frames_to_time(base_frames, sr=sr)

    plt.figure(figsize=(10, 6))
    plt.subplot(4, 1, 1)  # plt.subplot(rows, cols, index): first panel
    librosa.display.specshow(CQT, y_axis='cqt_note', x_axis='time')
    plt.vlines(onset_frames_cqt_time, 0, sr, color='y', linestyle='solid')

    plt.subplot(4, 1, 2)
    librosa.display.waveplot(y, sr=sr)
    plt.vlines(onset_frames_cqt_time, -1 * np.max(y), np.max(y),
               color='y', linestyle='solid')

    plt.subplot(4, 1, 3)
    rms = librosa.feature.rmse(y=y)[0]
    rms = [x / np.std(rms) for x in rms]
    max_rms = np.max(rms)
    times = librosa.frames_to_time(np.arange(len(rms)))
    plt.plot(times, rms)
    plt.axhline(max_rms * best_threshold)
    plt.vlines(onset_frames_cqt_time, 0, np.max(rms), color='y',
               linestyle='solid')
    plt.xlim(0, np.max(times))

    plt.subplot(4, 1, 4)
    plt.vlines(base_onsets, 0, np.max(rms), color='r', linestyle='dashed')
    plt.xlim(0, np.max(times))
    plt.axhline(max_rms * best_threshold)
    return onset_frames_cqt, best_y, best_threshold, plt
def analyze_audio(original, recording, tempo):
    o_y, o_sr = librosa.load(original)
    r_y, r_sr = librosa.load(recording)

    # Tempo estimates and tempo evaluation
    # TODO: use the given tempo here? np.repeat is a crutch to satisfy
    # mir_eval's two-tempo format; replace with a custom comparison.
    o_tempo, o_beat_frames = librosa.beat.beat_track(y=o_y, sr=o_sr,
                                                     start_bpm=tempo)
    r_tempo, r_beat_frames = librosa.beat.beat_track(y=r_y, sr=r_sr,
                                                     start_bpm=tempo)
    o_beats = librosa.frames_to_time(o_beat_frames, sr=o_sr)
    r_beats = librosa.frames_to_time(r_beat_frames, sr=r_sr)
    ref_weight = 0.5
    # mir_eval.tempo.validate(np.repeat(o_tempo, 2), ref_weight, np.repeat(r_tempo, 2))
    tempo_score = mir_eval.tempo.evaluate(np.repeat(o_tempo, 2), ref_weight,
                                          np.repeat(r_tempo, 2))['P-score']

    # Beat calculations using dynamic programming
    o_beats, r_beats = (mir_eval.beat.trim_beats(o_beats),
                        mir_eval.beat.trim_beats(r_beats))
    beat_metrics = mir_eval.beat.evaluate(o_beats, r_beats)
    beat_p, beat_kl = beat_metrics['P-score'], beat_metrics['Information gain']

    # Onset calculation (in [0, 1])
    o_onsets = librosa.onset.onset_detect(y=o_y, sr=o_sr, units='time')
    r_onsets = librosa.onset.onset_detect(y=r_y, sr=r_sr, units='time')
    onset_precision = mir_eval.onset.evaluate(o_onsets, r_onsets)['Precision']

    # Cosine similarity between spectral centroids
    o_centroids = librosa.feature.spectral_centroid(y=o_y, sr=o_sr)
    r_centroids = librosa.feature.spectral_centroid(y=r_y, sr=r_sr)
    o_len, r_len = o_centroids.shape[1], r_centroids.shape[1]
    if o_len < r_len:
        r_centroids = r_centroids[:, (r_len - o_len):]
    else:
        o_centroids = o_centroids[:, (o_len - r_len):]
    centroid_sim = np.sum(o_centroids * r_centroids) / (
        np.linalg.norm(o_centroids) * np.linalg.norm(r_centroids))

    # Chroma frequencies (12 pitch classes per frame; take the max class
    # per frame of the short-time Fourier transform chroma)
    o_chroma, r_chroma = (librosa.feature.chroma_stft(y=o_y, sr=o_sr),
                          librosa.feature.chroma_stft(y=r_y, sr=r_sr))
    o_mchroma, r_mchroma = np.argmax(o_chroma, axis=0), np.argmax(r_chroma, axis=0)
    o_counts, r_counts = (collections.Counter(o_mchroma),
                          collections.Counter(r_mchroma))
    oc_len, rc_len = len(o_mchroma), len(r_mchroma)
    if oc_len < rc_len:
        r_mchroma = r_mchroma[(rc_len - oc_len):]
    elif oc_len > rc_len:
        o_mchroma = o_mchroma[(oc_len - rc_len):]
    nmse = 0
    for i in range(len(o_mchroma)):
        if abs(o_mchroma[i] - r_mchroma[i]) > 2:
            nmse += np.sign(o_mchroma[i] - r_mchroma[i])
    nmse /= oc_len
    nmse = 1 - abs(nmse)

    # Probabilistic YIN (HMM over pitch classes to estimate fundamental
    # frequencies); disabled for now, so reuse the centroid similarity.
    """
    o_f0, ovf, ovp = librosa.pyin(o_y, fmin=librosa.note_to_hz('C2'),
                                  fmax=librosa.note_to_hz('C7'))
    r_f0, rvf, rvp = librosa.pyin(r_y, fmin=librosa.note_to_hz('C2'),
                                  fmax=librosa.note_to_hz('C7'))
    o_f0 = np.array([el for el in o_f0 if not math.isnan(el)])
    r_f0 = np.array([el for el in r_f0 if not math.isnan(el)])
    of_len, rf_len = len(o_f0), len(r_f0)
    if of_len < rf_len:
        r_f0 = r_f0[(rf_len - of_len):]
    else:
        o_f0 = o_f0[(of_len - rf_len):]
    f0_sim = np.sum(o_f0 * r_f0) / (np.linalg.norm(o_f0) * np.linalg.norm(r_f0))
    """
    f0_sim = centroid_sim

    # MFCC (mel-frequency cepstral coefficients for ML) -- not computed here
    return (tempo_score, beat_p, beat_kl, onset_precision, nmse,
            centroid_sim, f0_sim)
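# Usage sketch (hypothetical paths): compare a reference track against a
# cover recording, seeding the beat trackers with a nominal 90 BPM.
scores = analyze_audio('reference.wav', 'cover_take.wav', tempo=90)
print('tempo P-score: {:.3f}, onset precision: {:.3f}'.format(
    scores[0], scores[3]))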
import librosa

filename = 'humble.mp3'

# Load the track
y, sr = librosa.load(filename)

# Track the beats
tempo, beat_frames = librosa.beat.beat_track(y=y, sr=sr)
beat_times = librosa.frames_to_time(beat_frames, sr=sr)

print(beat_times)
librosa.output.times_csv('beat_times.csv', beat_times)
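# `librosa.output.times_csv` exists only in older librosa releases; the
# whole `librosa.output` module was removed in librosa 0.8. On newer
# versions, an equivalent sketch writes the CSV with numpy instead:
import numpy as np
np.savetxt('beat_times.csv', beat_times, fmt='%.6f')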
    return audio, sr


savepath = 'e:/test_image/'
filename = 'F:/项目/花城音乐项目/样式数据/2.27MP3/旋律/视唱1-02(90).wav'
# y, sr = librosa.load(filename)
y, sr = load_and_trim(filename)
chromagram = librosa.feature.chroma_cqt(y, sr=sr)
librosa.display.specshow(chromagram, x_axis='time', y_axis='chroma',
                         cmap='coolwarm')
onset_frames = librosa.onset.onset_detect(y=y, sr=sr)
onset_times = librosa.frames_to_time(onset_frames, sr=sr)
plt.vlines(onset_times, 0, y.max(), color='r', linestyle='--')
onset_samples = librosa.time_to_samples(onset_times)
print(onset_samples)
# plt.subplot(len(onset_times), 1, 1)
plt.show()

plt.figure(figsize=(5, 80))
for i in range(0, len(onset_times)):
    start = onset_samples[i] - sr // 2
    if start < 0:
        start = 0
    end = onset_samples[i] + sr // 2
    # y2 = [x if i > start and i < end else 0 for i, x in enumerate(y)]
    y2 = [x for i, x in enumerate(y) if i > start and i < end]
    # Pin the midpoint to the global max so each excerpt is displayed on a
    # normalized scale
    y2[int(len(y2) / 2)] = np.max(y)
def midi_to_piano_cqt(midi):
    piano_roll = midi.get_piano_roll(
        times=librosa.frames_to_time(
            np.arange(midi.get_end_time() * 22050 / 512)))
    piano_subset = piano_roll[36:96] + 1e-10  # want just C3 to C8 of piano roll
    return piano_subset
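# Usage sketch, assuming `midi` is a pretty_midi.PrettyMIDI object (its
# get_piano_roll/get_end_time methods are what the function relies on);
# the MIDI path is a hypothetical placeholder.
import pretty_midi
midi = pretty_midi.PrettyMIDI('melody.mid')
piano_cqt = midi_to_piano_cqt(midi)
print(piano_cqt.shape)  # (60, n_frames): one row per semitone in the subset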
img.itemset((c_max[x], x), 1)

# Strongest-pitch image
# librosa.display.specshow(img, x_axis='time', cmap='coolwarm')

# Audio duration
time = librosa.get_duration(y)
print("time is {}".format(time))

# Onset frames
onsets_frames = librosa.onset.onset_detect(y)
print(onsets_frames)

# Onset times
onstm = librosa.frames_to_time(onsets_frames, sr=sr)
print(onstm)

# plt.rcParams['figure.figsize'] = (2.0, 2.0)  # figure size
# plt.rcParams['savefig.dpi'] = 28             # image pixels
# plt.rcParams['figure.dpi'] = 28              # resolution
# librosa.display.specshow(librosa.amplitude_to_db(D))
# plt.vlines(onstm, 0, sr, color='r', linestyle='dashed')
# plt.colorbar()

code = '[500,500,1000;500,500,1000;500,500,750,250;2000]'
pitch_code = '[3,3,3,3,3,3,3,5,1,2,3]'
pitch_v = get_chroma_pitch(pitch_code)
# Renamed from `onsets_base_frames` so the helper function is not shadowed
# by its own result
base_frames = onsets_base_frames(code, h)
base_frames[-1] = base_frames[-1] - 1
print(base_frames)
print(np.diff(base_frames))
librosa.display.specshow(C, y_axis='cqt_hz', sr=sr,
                         bins_per_octave=BINS_PER_OCTAVE,
                         x_axis='time')
plt.tight_layout()

##########################################################
# To reduce dimensionality, we'll beat-synchronize the CQT
tempo, beats = librosa.beat.beat_track(y=y, sr=sr, trim=False)
Csync = librosa.util.sync(C, beats, aggregate=np.median)

# For plotting purposes, we'll need the timing of the beats.
# We fix_frames to include non-beat frames 0 and C.shape[1] (final frame).
beat_times = librosa.frames_to_time(librosa.util.fix_frames(beats,
                                                            x_min=0,
                                                            x_max=C.shape[1]),
                                    sr=sr)

plt.figure(figsize=(12, 4))
librosa.display.specshow(Csync, bins_per_octave=12 * 3, y_axis='cqt_hz',
                         x_axis='time', x_coords=beat_times)
plt.tight_layout()

#####################################################################
# Let's build a weighted recurrence matrix using beat-synchronous CQT
# (Equation 1)
# width=3 prevents links within the same bar
# mode='affinity' here implements S_rep (after Eq. 8)
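# A sketch of the call those comments describe, using
# librosa.segment.recurrence_matrix on the beat-synchronous CQT;
# sym=True (an assumption) keeps the affinity matrix symmetric.
R = librosa.segment.recurrence_matrix(Csync, width=3, mode='affinity',
                                      sym=True)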