def __test_cqt(pad_mode):
    """Check that the padding mode changes CQT values but never the shape.

    The signal ``y`` (taken from the enclosing scope) is transformed once
    with ``pad_mode='reflect'`` and once with the requested ``pad_mode``.
    """
    baseline = librosa.cqt(y, pad_mode='reflect')
    candidate = librosa.cqt(y, pad_mode=pad_mode)

    assert baseline.shape == candidate.shape

    # Only the reference mode itself is allowed to reproduce the baseline.
    if pad_mode == 'reflect':
        assert np.allclose(baseline, candidate)
    else:
        assert not np.allclose(baseline, candidate)
def do_cqt(src, track_id):
    """Compute a two-channel log-power CQT for one track and save it to disk.

    Parameters
    ----------
    src
        Audio indexed as ``src[0, :]`` (stored first, named "L" in the
        original code) and ``src[1, :]`` (stored second, named "R").
    track_id
        Identifier; ``str(track_id)`` is appended to ``PATH_CQT`` to build
        the name of the ``.npy`` output file.

    Returns
    -------
    None
        The result is written with ``np.save`` as the depth-wise stack
        (``np.dstack``) of the two per-channel spectrograms.
    """
    def _channel_log_cqt(channel):
        # Both channels share exactly the same transform settings.
        spectrum = librosa.cqt(channel,
                               sr=CQT_CONST["sr"],
                               hop_length=CQT_CONST["hop_len"],
                               bins_per_octave=CQT_CONST["bins_per_octave"],
                               n_bins=CQT_CONST["n_bins"])
        return librosa.logamplitude(spectrum ** 2, ref_power=1.0)

    left = _channel_log_cqt(src[0, :])
    right = _channel_log_cqt(src[1, :])

    np.save(PATH_CQT + str(track_id) + '.npy', np.dstack((left, right)))
    # Parenthesised call prints the same text on Python 2 and Python 3;
    # the bare print statement is a syntax error on Python 3.
    print("Done: %s" % str(track_id))
def compute_features(self):
    """Compute log-amplitude constant-Q features for ``self._audio``.

    Returns
    -------
    cqt: np.array(N, F)
        Transposed log-amplitude of the squared CQT magnitude; each row is
        the feature vector of one time frame/beat.
    """
    spectrum = librosa.cqt(
        self._audio,
        sr=self.sr,
        hop_length=self.hop_length,
        n_bins=self.n_bins,
        norm=self.norm,
        filter_scale=self.filter_scale,
        real=False,
    )
    # Squared magnitude, i.e. a power spectrogram.
    power = np.abs(spectrum) ** 2
    return librosa.logamplitude(power, ref_power=self.ref_power).T
def compute_cqt(filename):
    """Return the CQT magnitude of the harmonic component of an audio file.

    The file is loaded at the module-level rate ``SR``; the transform uses
    the module-level hop length ``HOP``.
    """
    signal, rate = librosa.load(filename, sr=SR)

    # Separate in the STFT domain and keep only the harmonic part.
    harmonic_stft, _ = librosa.decompose.hpss(librosa.stft(signal))
    harmonic_audio = librosa.istft(harmonic_stft)

    return np.abs(librosa.cqt(harmonic_audio, sr=rate, hop_length=HOP, real=False))
def __test(hop_length, fmin, n_bins, bins_per_octave, tuning, resolution, norm, sparsity):
    """Check that ``hybrid_cqt`` agrees with ``cqt`` for identical settings.

    ``y`` and ``sr`` come from the enclosing scope.  Both transforms must
    have the same shape and a mean absolute difference below 1e-3.
    """
    shared = dict(
        sr=sr,
        hop_length=hop_length,
        fmin=fmin,
        n_bins=n_bins,
        bins_per_octave=bins_per_octave,
        tuning=tuning,
        resolution=resolution,
        norm=norm,
        sparsity=sparsity,
    )

    hybrid = librosa.hybrid_cqt(y, **shared)
    full = librosa.cqt(y, **shared)

    eq_(full.shape, hybrid.shape)

    # Check for numerical comparability
    mean_abs_error = np.mean(np.abs(full - hybrid))
    assert mean_abs_error < 1e-3
def makeSpectragrams(filename):
    """Display mel, constant-Q and STFT spectrograms of an audio file.

    Parameters
    ----------
    filename : str
        Path passed to ``librosa.load``.  Plot titles use ``filename[26:]``,
        i.e. the path with its first 26 characters removed.

    Returns
    -------
    bool
        Always ``True``.
    """
    f, sr = librosa.load(filename)
    # Parenthesised prints behave identically on Python 2 and Python 3;
    # the bare print statements were a syntax error on Python 3.
    print("first")
    melSpectra = librosa.feature.melspectrogram(f)
    cqtSpectra = librosa.cqt(f)
    stftSpectra = librosa.stft(f)
    print("stuff")

    # NOTE(review): the hard-coded offset presumably strips a fixed
    # directory prefix from the path; confirm against the callers.
    shortName = filename[26:]

    librosa.display.specshow(melSpectra)
    plt.title("Mel Spectrogram \nof " + shortName)
    # Interactive mode is switched on before the first figure is shown.
    plt.ion()
    plt.show()

    librosa.display.specshow(cqtSpectra)
    plt.title("Constant Q Spectrogram \nof " + shortName)
    plt.show()

    librosa.display.specshow(stftSpectra)
    plt.title("STFT Spectrogram \nof " + shortName)
    plt.show()

    return True
def __test(sr, scale, hop_length, over_sample, y):
    """Round-trip test: ``icqt(cqt(y))`` must reconstruct ``y`` closely.

    Seven octaves are analysed with ``12 * over_sample`` bins per octave.
    Half a second is discarded at each end before the RMS error is checked.
    """
    bins_per_octave = over_sample * 12
    n_bins = 7 * bins_per_octave

    spectrum = librosa.cqt(y, sr=sr, n_bins=n_bins,
                           bins_per_octave=bins_per_octave,
                           scale=scale, hop_length=hop_length)
    reconstructed = librosa.icqt(spectrum, sr=sr, scale=scale,
                                 hop_length=hop_length,
                                 bins_per_octave=bins_per_octave)
    reconstructed = librosa.util.fix_length(reconstructed, len(y))

    # Only test on the middle section
    middle = slice(sr // 2, -sr // 2)
    residual = np.abs(y[middle] - reconstructed[middle])

    # We'll tolerate 11% RMSE
    # error is lower on more recent numpy/scipy builds
    resnorm = np.sqrt(np.mean(residual ** 2))
    assert resnorm <= 1.1e-1, resnorm
def extract_cqt(audio_data):
    '''
    Compute a CQT using the module-level defaults, plus frame times.

    Parameters
    ----------
    audio_data : np.ndarray
        Audio data to compute the CQT of

    Returns
    -------
    cqt : np.ndarray
        CQT of the supplied audio data, cast to float32.
    frame_times : np.ndarray
        Time, in seconds, of each frame (column) of the CQT
    '''
    gram = librosa.cqt(audio_data,
                       sr=FS,
                       fmin=librosa.midi_to_hz(NOTE_START),
                       n_bins=N_NOTES,
                       hop_length=HOP_LENGTH,
                       tuning=0.)

    # One timestamp per column of the transform.
    frame_indices = np.arange(gram.shape[1])
    frame_times = librosa.frames_to_time(frame_indices, sr=FS,
                                         hop_length=HOP_LENGTH)

    # float32 halves the storage compared to the default precision.
    return gram.astype(np.float32), frame_times
def audio_to_cqt_and_onset_strength(audio, fs=22050, hop=512):
    '''
    Feature extraction for audio data.
    Gets a power CQT of the harmonic component and the onset strength
    signal of the percussive component.

    Input:
        audio - audio samples, passed directly to librosa.stft
        fs - sampling rate of the audio, default 22050
        hop - hop length for the CQT, default 512; the onset strength hop
              is 1/4 of this
    Output:
        audio_gram - power CQT (squared magnitude) of the harmonic part
        audio_onset_strength - onset strength signal of the percussive part
    '''
    # Use harmonic part for gram, percussive part for onsets
    H, P = librosa.decompose.hpss(librosa.stft(audio))
    audio_harmonic = librosa.istft(H)
    audio_percussive = librosa.istft(P)

    # Compute log-frequency spectrogram of the harmonic audio
    audio_gram = np.abs(librosa.cqt(y=audio_harmonic,
                                    sr=fs,
                                    hop_length=hop,
                                    fmin=librosa.midi_to_hz(36),
                                    n_bins=60))**2

    # Onset strength at 4x the CQT frame rate.  Floor division keeps the
    # hop length an integer on Python 3, where ``hop/4`` would be a float.
    audio_onset_strength = librosa.onset.onset_strength(audio_percussive,
                                                        hop_length=hop // 4,
                                                        sr=fs)
    return audio_gram, audio_onset_strength
def midi_to_cqt(midi, sf2_path=None, fs=22050, hop=512):
    '''
    Synthesize a MIDI object and return the power CQT of its harmonic part.

    Input:
        midi - object providing fluidsynth(fs=..., sf2_path=...)
               (a pretty_midi.PrettyMIDI object per the original docs)
        sf2_path - path to the .sf2 file handed to fluidsynth
        fs - sampling rate to synthesize audio at, default 22050
        hop - hop length for the CQT, default 512
    Output:
        midi_gram - squared CQT magnitude of the synthesized harmonic audio
    '''
    # Render the MIDI to audio with the requested soundfont.
    synthesized = midi.fluidsynth(fs=fs, sf2_path=sf2_path)

    # Keep only the harmonic component; the percussive one is discarded.
    harmonic_stft, _percussive_stft = librosa.decompose.hpss(
        librosa.stft(synthesized))
    harmonic_audio = librosa.istft(harmonic_stft)

    # 60 bins starting at MIDI note 36, with tuning estimation disabled.
    spectrum = librosa.cqt(y=harmonic_audio,
                           sr=fs,
                           hop_length=hop,
                           fmin=librosa.midi_to_hz(36),
                           n_bins=60,
                           tuning=0.0)
    return np.abs(spectrum)**2
def ExtractCQSpectraSparcityFeatures(f, epsilon=defaultEpsilon):
    """Return three sparsity measures of the constant-Q transform of ``f``.

    "Sparsity" is the fraction of entries that are NOT greater than
    ``epsilon``.

    Returns
    -------
    tuple of float
        ``(overall, band_max, band_mean)``: the fraction over every entry of
        the transform, over the per-row maxima (``axis=1``), and over the
        per-row means (``axis=1``), respectively.
    """
    def _fraction_not_above(mask, total):
        # mask marks entries above epsilon; everything else counts as sparse.
        return float(total - np.count_nonzero(mask)) / total

    spectra = librosa.cqt(f)

    # Sparsity over the whole transform.
    above_everywhere = spectra > epsilon
    overall = _fraction_not_above(above_everywhere, above_everywhere.size)

    # A row is sparse only if it never exceeds epsilon.
    row_peak = np.amax(spectra, axis=1)
    band_max = _fraction_not_above(row_peak > epsilon, len(row_peak))

    # A row is sparse if its average does not exceed epsilon.
    row_mean = np.mean(spectra, axis=1)
    band_mean = _fraction_not_above(row_mean > epsilon, len(row_mean))

    return overall, band_max, band_mean
def features(filename):
    """Extract beat-synchronous log-power CQT and MFCC features from a file.

    Returns
    -------
    (M1, M2) : tuple
        ``M1`` is the log-power CQT aggregated per beat interval with the
        median; ``M2`` is the MFCC matrix aggregated with the mean.
    times : np.ndarray
        Times of the beat frames used for synchronisation (frame 0 is
        always included), followed by the track duration in seconds.
    """
    # [1/5] load audio
    signal, rate = librosa.load(filename, sr=SR)

    # [2/5] separate harmonic and percussive signals
    percussive, harmonic = hp_sep(signal)

    # [3/5] detect beats on the percussive part
    _tempo, beat_frames = get_beats(y=percussive, sr=rate, hop_length=HOP_LENGTH)

    # [4/5] log-power CQT of the harmonic part: 72 bins from MIDI note 24
    cqt_mag = np.abs(librosa.cqt(y=harmonic,
                                 sr=rate,
                                 hop_length=HOP_LENGTH,
                                 bins_per_octave=12,
                                 fmin=librosa.midi_to_hz(24),
                                 n_bins=72))
    log_cqt = librosa.logamplitude(cqt_mag ** 2.0, ref_power=np.max)

    # [5/5] MFCCs of the full mix
    mel = librosa.feature.melspectrogram(y=signal, sr=rate,
                                         hop_length=HOP_LENGTH,
                                         n_mels=N_MELS)
    mfcc = librosa.feature.mfcc(S=librosa.logamplitude(mel), n_mfcc=N_MFCC)

    # Discard beats beyond the shorter of the two feature matrices and make
    # sure frame 0 opens the first segment.
    n_frames = min(log_cqt.shape[1], mfcc.shape[1])
    beat_frames = beat_frames[beat_frames < n_frames]
    beat_frames = np.unique(np.concatenate([[0], beat_frames]))

    times = librosa.frames_to_time(beat_frames, sr=rate, hop_length=HOP_LENGTH)
    duration = float(len(signal)) / rate
    times = np.concatenate([times, [duration]])

    synced_cqt = librosa.feature.sync(log_cqt, beat_frames, aggregate=np.median)
    synced_mfcc = librosa.feature.sync(mfcc, beat_frames, aggregate=np.mean)

    return (synced_cqt, synced_mfcc), times
def logcqt_onsets(x, fs, pre_max=0, post_max=1, pre_avg=0, post_avg=1,
                  delta=0.05, wait=50):
    """Detect onsets from a filtered log-compressed constant-Q transform.

    Parameters
    ----------
    x : np.ndarray
        Audio signal; it is flattened before the transform.

    fs : scalar
        Samplerate of the audio signal.

    pre_max, post_max, pre_avg, post_avg, delta, wait
        See `librosa.util.peak_pick` for details.
        NOTE(review): only ``delta`` and ``wait`` are forwarded to
        ``librosa.onset.onset_detect``; the other four are accepted but
        currently unused.  Forwarding them would change the detected peaks,
        so confirm the intent before wiring them through.

    Returns
    -------
    onsets : np.ndarray, ndim=1
        Times in seconds for splitting.
    """
    hop_length = 1024

    # A little Gaussian noise is added before the transform; this makes the
    # result non-deterministic between calls.
    x_noise = x + np.random.normal(scale=10.**-3, size=x.shape)
    cqt = librosa.cqt(x_noise.flatten(),
                      sr=fs, hop_length=hop_length, fmin=27.5,
                      n_bins=24*8, bins_per_octave=24, tuning=0,
                      sparsity=0, real=False, norm=1)
    cqt = np.abs(cqt)
    lcqt = np.log1p(5000*cqt)

    # Filter each frequency band along time, then average over bands to get
    # a single onset-strength envelope.
    c_n = utils.canny(51, 3.5, 1)
    onset_strength = sig.lfilter(c_n, np.ones(1), lcqt, axis=1).mean(axis=0)

    peak_idx = librosa.onset.onset_detect(
        onset_envelope=onset_strength, delta=delta, wait=wait)

    # sr must be given explicitly: without it frames_to_time assumes
    # librosa's default rate and the times are wrong for any other ``fs``.
    return librosa.frames_to_time(peak_idx, sr=fs, hop_length=hop_length)
def process_one_file(midi_filename, skip=True):
    '''
    Load a MIDI file, compute beat-synchronous CQT features and write them
    to a compressed ``.npz`` file.

    :parameters:
        - midi_filename : str
            Full path to the MIDI file
        - skip : bool
            Whether to skip creating the file when the npz already exists

    Any exception raised while processing is caught and reported on stdout,
    so a single bad file does not abort a batch run.
    '''
    # npz files go in the 'npz' dir instead of 'mid'
    output_filename = mid_to_npz_path(midi_filename)
    # Skip files already created
    if skip and os.path.exists(output_filename):
        return
    try:
        m = pretty_midi.PrettyMIDI(midi_filename)
        midi_audio = alignment_utils.fast_fluidsynth(m, MIDI_FS)
        midi_gram = librosa.cqt(
            midi_audio, sr=MIDI_FS, hop_length=MIDI_HOP,
            fmin=librosa.midi_to_hz(NOTE_START), n_bins=N_NOTES)
        midi_beats, midi_tempo = alignment_utils.midi_beat_track(m)
        # Beat times are converted to frame indices for post-processing.
        midi_sync_gram = alignment_utils.post_process_cqt(
            midi_gram, librosa.time_to_frames(
                midi_beats, sr=MIDI_FS, hop_length=MIDI_HOP))
        np.savez_compressed(
            output_filename, sync_gram=midi_sync_gram,
            beats=midi_beats, bpm=midi_tempo)
    except Exception as e:
        # Deliberate best-effort: report and move on.  The parenthesised
        # call is valid on both Python 2 and Python 3.
        print("Error processing {}: {}".format(midi_filename, e))
def __test(hop_length, fmin, n_bins, bins_per_octave, tuning, resolution, norm, sparsity):
    """Compare ``hybrid_cqt`` with ``cqt`` on their significant entries.

    ``y`` and ``sr`` come from the enclosing scope.  Entries exceeding
    1e-4 of the maximum in either transform are compared; a percentile of
    their absolute difference must stay below 1e-3 of the larger maximum.
    """
    settings = dict(sr=sr, hop_length=hop_length, fmin=fmin, n_bins=n_bins,
                    bins_per_octave=bins_per_octave, tuning=tuning,
                    resolution=resolution, norm=norm, sparsity=sparsity)

    hybrid = librosa.hybrid_cqt(y, **settings)
    full = librosa.cqt(y, **settings)

    eq_(full.shape, hybrid.shape)

    # Check for numerical comparability
    significant = (full > 1e-4 * full.max()) | (hybrid > 1e-4 * hybrid.max())

    # NOTE(review): np.percentile expects q in [0, 100], so 0.99 selects the
    # 0.99th percentile rather than the 99th.  Kept as in the original.
    perc = 0.99
    thresh = 1e-3

    deviation = np.abs(full[significant] - hybrid[significant])
    limit = thresh * max(full.max(), hybrid.max())
    assert np.percentile(deviation, perc) < limit
def get_cqt(y, PARAMETERS):
    '''Return the magnitude of the constant-Q transform of ``y``.

    The sampling rate is read from ``PARAMETERS['load']['sr']``, the hop
    length from ``PARAMETERS['stft']['hop_length']``, and every entry of
    ``PARAMETERS['cqt']`` is forwarded as a keyword argument.
    '''
    sample_rate = PARAMETERS['load']['sr']
    hop = PARAMETERS['stft']['hop_length']
    spectrum = librosa.cqt(y, sr=sample_rate, hop_length=hop,
                           **PARAMETERS['cqt'])
    return np.abs(spectrum)
def CQT(filename, fmin=None, n_bins=84, hop_length=512):
    """Build a frame-wise feature matrix from the CQT of an audio file.

    Parameters
    ----------
    filename : str
        Path passed to ``librosa.load``.
    fmin, n_bins, hop_length
        Forwarded to ``librosa.cqt``.

    Returns
    -------
    np.ndarray
        Transposed stack (one row per frame) of: the CQT, the first- and
        second-order deltas of CQT rows 24 and above, and the RMS energy.
    """
    data, fs = librosa.load(filename)
    cqt = librosa.cqt(data, sr=fs, fmin=fmin, n_bins=n_bins,
                      hop_length=hop_length)

    # Deltas are computed on the upper rows only (index 24 onwards).
    upper = cqt[24:, :]
    delta1 = librosa.feature.delta(upper, order=1)
    delta2 = librosa.feature.delta(upper, order=2)

    # The energy must use the same hop as the CQT; otherwise the frame
    # counts differ for any non-default hop_length and vstack fails.
    energy = librosa.feature.rmse(y=data, hop_length=hop_length)

    features = np.vstack((cqt, delta1, delta2, energy))
    return features.T
def get_spec():
    """Load the bundled example file and return ``(STFT, CQT, sr)``."""
    example_path = 'data/test1_22050.wav'

    signal, rate = librosa.load(example_path)
    constant_q = librosa.cqt(signal, sr=rate)
    short_time = librosa.stft(signal)

    return short_time, constant_q, rate
def compute_features(audio, y_harmonic):
    """Compute MFCC, HPCP, Tonnetz, constant-Q and tempogram features.

    All analysis settings are read from ``msaf.Anal``.  Every returned
    matrix is transposed so that rows index frames.

    Parameters
    ----------
    audio: np.array(N)
        Audio samples of the given input.
    y_harmonic: np.array(N)
        Harmonic part of the audio signal, in samples.

    Returns
    -------
    mfcc: np.array(N, msaf.Anal.mfcc_coeff)
        Mel-frequency Cepstral Coefficients.
    hpcp: np.array(N, 12)
        Pitch Class Profiles, computed from ``y_harmonic``.
    tonnetz: np.array(N, 6)
        Tonal Centroid features derived from ``hpcp``.
    cqt: np.array(N, msaf.Anal.cqt_bins)
        Constant-Q log-scale features.
    tempogram: np.array(N, 192)
        Tempogram features.
    """
    rate = msaf.Anal.sample_rate
    hop = msaf.Anal.hop_size

    logging.info("Computing Spectrogram...")
    mel_spec = librosa.feature.melspectrogram(audio,
                                              sr=rate,
                                              n_fft=msaf.Anal.frame_size,
                                              hop_length=hop,
                                              n_mels=msaf.Anal.n_mels)

    logging.info("Computing Constant-Q...")
    cqt_power = np.abs(librosa.cqt(audio,
                                   sr=rate,
                                   hop_length=hop,
                                   n_bins=msaf.Anal.cqt_bins,
                                   real=False)) ** 2
    cqt = librosa.logamplitude(cqt_power, ref_power=np.max).T

    logging.info("Computing MFCCs...")
    log_mel = librosa.logamplitude(mel_spec, ref_power=np.max)
    mfcc = librosa.feature.mfcc(S=log_mel, n_mfcc=msaf.Anal.mfcc_coeff).T

    logging.info("Computing HPCPs...")
    hpcp = librosa.feature.chroma_cqt(y=y_harmonic,
                                      sr=rate,
                                      hop_length=hop,
                                      n_octaves=msaf.Anal.n_octaves,
                                      fmin=msaf.Anal.f_min).T

    logging.info("Computing Tonnetz...")
    tonnetz = utils.chroma_to_tonnetz(hpcp)

    logging.info("Computing Tempogram...")
    tempogram = librosa.feature.tempogram(audio,
                                          sr=rate,
                                          hop_length=hop,
                                          win_length=192).T

    return mfcc, hpcp, tonnetz, cqt, tempogram
def extract(self, infile):
    '''Load ``infile`` and return its constant-Q spectrum, one row per frame.

    The audio is loaded at ``self.sr``.  The transform has
    ``12 * self.over_sample`` bins per octave over ``self.n_octaves``
    octaves starting at ``self.fmin``; the result is transposed and cast
    to ``self.dtype``.
    '''
    y, sr = librosa.load(infile, sr=self.sr)

    spectrum = librosa.cqt(
        y,
        sr=sr,
        hop_length=self.hop_length,
        n_bins=12 * self.n_octaves * self.over_sample,
        bins_per_octave=12 * self.over_sample,
        fmin=self.fmin,
    )
    return spectrum.T.astype(self.dtype)
def compute_features(audio_file, intervals, level):
    """Compute (or load from cache) beat-synchronous log-power CQT features.

    A pickle in ``features_dir`` is used as cache: if it exists, whatever
    ``read_features`` returns for it is returned directly.

    Parameters
    ----------
    audio_file : str
        Path to the audio file.
    intervals : np.array or None
        Times of the estimated boundaries; ``None`` skips the matching.
    level : str
        Level in the hierarchy; ``"small_scale"`` selects a different
        cache file name.

    Returns
    -------
    cqgram : np.array
        Log-power CQT aggregated between beats with the median.
    intframes : np.array or None
        ``intervals`` converted to frames and matched to the beats.
    """
    # Work out where the cached features live.
    base_name = os.path.basename(audio_file)
    if level == "small_scale":
        cache_name = base_name.split('.')[0] + "_small_scale.mp3.pk"
    else:
        cache_name = base_name + ".pk"
    features_file = os.path.join(features_dir, cache_name)

    if os.path.isfile(features_file):
        return read_features(features_file)

    signal, rate = librosa.load(audio_file, sr=11025)

    # Default hopsize is 512
    hopsize = 512
    power = librosa.cqt(signal, sr=rate, hop_length=hopsize) ** 2
    cqgram = librosa.logamplitude(power, ref_power=np.max)

    # Track beats on the percussive component
    _harmonic, percussive = librosa.effects.hpss(signal)
    _tempo, beats = librosa.beat.beat_track(y=percussive, sr=rate,
                                            hop_length=hopsize)

    # Synchronize
    cqgram = librosa.feature.sync(cqgram, beats, aggregate=np.median)

    if intervals is None:
        intframes = None
    else:
        # Convert intervals to frames, then snap them to the beat frames.
        as_frames = librosa.time_to_frames(intervals, sr=rate,
                                           hop_length=hopsize)
        intframes = librosa.util.match_events(as_frames, beats)

    # Save the features
    save_features(cqgram, intframes, beats, features_file)

    return cqgram, intframes
def __test(sr, hop_length, y):
    """Check that the mean CQT power varies smoothly across adjacent bins."""
    magnitude = np.abs(librosa.cqt(y=y, sr=sr, hop_length=hop_length))

    # Mean power of every frequency bin over time.
    response = np.mean(magnitude ** 2, axis=1)
    continuity = np.abs(np.diff(response))

    # Test that integrated energy is approximately constant
    assert np.max(continuity) < 5e-4, continuity
def CQT(filename, fmin=None, n_bins=84, hop_length=512,nfreqs=None):
    """Build a frame-wise feature matrix from the CQT of a sound file.

    Parameters
    ----------
    filename : str
        Path opened with ``Sndfile`` in read mode.
    fmin, n_bins, hop_length
        Forwarded to ``librosa.cqt``.
    nfreqs : int or None
        If given, only the first ``nfreqs`` CQT rows are kept.

    Returns
    -------
    np.ndarray
        Transposed stack (one row per frame) of: the (possibly truncated)
        CQT, its first- and second-order deltas, and the RMS energy.
    """
    f = Sndfile(filename, 'r')
    try:
        data = f.read_frames(f.nframes)
        samplerate = f.samplerate
    finally:
        # Release the file handle even if reading fails.
        f.close()

    cqt = librosa.cqt(data, sr=samplerate, fmin=fmin, n_bins=n_bins,
                      hop_length=hop_length)
    if nfreqs is not None:
        cqt = cqt[:nfreqs, :]

    delta1 = librosa.feature.delta(cqt, order=1)
    delta2 = librosa.feature.delta(cqt, order=2)

    # The energy must use the same hop as the CQT; otherwise the frame
    # counts differ for any non-default hop_length and vstack fails.
    energy = librosa.feature.rmse(y=data, hop_length=hop_length)

    features = np.vstack((cqt, delta1, delta2, energy))
    return features.T
def chroma(y):
    """Return a normalized, log-amplitude chromagram computed via the CQT.

    The transform settings come from the module-level constants ``SR``,
    ``NOTE_RES``, ``HOP_LENGTH``, ``NOTE_MIN`` and ``NOTE_NUM``; the number
    of chroma bins is ``N_CHROMA``.
    """
    magnitude = np.abs(librosa.cqt(y,
                                   sr=SR,
                                   resolution=NOTE_RES,
                                   hop_length=HOP_LENGTH,
                                   fmin=NOTE_MIN,
                                   n_bins=NOTE_NUM))

    # Linear map folding the CQT bins onto chroma bins.
    fold = librosa.filters.cq_to_chroma(magnitude.shape[0], n_chroma=N_CHROMA)
    chromagram = fold.dot(magnitude)

    return librosa.logamplitude(librosa.util.normalize(chromagram))
def logcqt(x, fs, hop_length=1024):
    """Return a log-compressed constant-Q magnitude spectrogram of ``x``.

    Gaussian noise (scale 1e-3) is added to the signal before the
    transform, so repeated calls differ slightly.  The transform covers
    ``24 * 8`` bins at 24 bins per octave starting from 27.5 Hz, and the
    magnitude ``m`` is compressed as ``log(1 + 5000 * m)``.
    """
    noisy = x + np.random.normal(scale=10.**-3, size=x.shape)

    spectrum = librosa.cqt(noisy.flatten(),
                           sr=fs,
                           hop_length=hop_length,
                           fmin=27.5,
                           n_bins=24 * 8,
                           bins_per_octave=24,
                           tuning=0,
                           sparsity=0,
                           real=False,
                           norm=1)
    magnitude = np.abs(spectrum)
    return np.log1p(5000 * magnitude)
def do_cqt(src, clip_id, seg_idx):
    '''Compute the log-power CQT of one clip segment and save it as .npy.

    The output path is built from PATH_CQT, clip_id and seg_idx.  Nothing
    is computed when check_if_done reports that path as finished.
    '''
    out_path = '%s%d_%d.npy' % (PATH_CQT, clip_id, seg_idx)
    if check_if_done(out_path):
        return

    spectrum = librosa.cqt(y=src,
                           sr=SR,
                           hop_length=HOP_LEN,
                           bins_per_octave=BINS_PER_OCTAVE,
                           n_bins=N_CQT_BINS)
    np.save(out_path, librosa.logamplitude(spectrum ** 2, ref_power=1.0))
def __test(real):
    """Check that passing ``real`` to ``cqt`` warns and controls the dtype.

    ``y`` and ``sr`` come from the enclosing scope.  The first recorded
    warning must be a ``DeprecationWarning``; the output must be real when
    ``real`` is truthy and complex otherwise.
    """
    # Make sure no earlier filter hides the warning.
    warnings.resetwarnings()
    warnings.simplefilter('always')

    with warnings.catch_warnings(record=True) as caught:
        spectrum = librosa.cqt(y=y, sr=sr, real=real)

        assert len(caught) > 0
        assert caught[0].category is DeprecationWarning

        dtype_check = np.isrealobj if real else np.iscomplexobj
        assert dtype_check(spectrum)
def transform_audio(self, y):
    """Compute the CQT of ``y`` and split it into magnitude and phase.

    Returns
    -------
    dict
        ``'mag'``: transposed CQT magnitude as float32;
        ``'phase'``: transposed phase angle as float32.
    """
    spectrum = librosa.cqt(y=y,
                           sr=self.sr,
                           hop_length=self.hop_length,
                           fmin=self.fmin,
                           n_bins=self.n_octaves * self.over_sample * 12,
                           bins_per_octave=self.over_sample * 12,
                           real=False)
    magnitude, phase = librosa.magphase(spectrum)

    # Both outputs are stored with frames along the first axis.
    return {
        'mag': magnitude.T.astype(np.float32),
        'phase': np.angle(phase).T.astype(np.float32),
    }
def __test(sr, hop_length, y):
    """Check that the peak CQT response is continuous and bounded over bins."""
    magnitude = np.abs(librosa.cqt(y=y, sr=sr, hop_length=hop_length,
                                   real=False))

    # Peak magnitude of every frequency bin, and the overall peak.
    peak_per_bin = np.max(magnitude, axis=1)
    overall_peak = np.max(peak_per_bin)

    # Test that continuity is never violated by more than 15% point-wise energy
    largest_jump = np.max(np.abs(np.diff(peak_per_bin)))
    assert largest_jump < 1.5e-1 * overall_peak, largest_jump / overall_peak

    # Test that peak-energy deviation is bounded
    spread = np.std(peak_per_bin)
    assert spread < 0.5 * overall_peak, spread / overall_peak
def CQT_stacked(filename, fmin=None, n_bins=84, hop_length=512,nfreqs=None):
    """Return the CQT of a sound file stacked with its first two deltas.

    Parameters
    ----------
    filename : str
        Path opened with ``Sndfile`` in read mode.
    fmin, n_bins, hop_length
        Forwarded to ``librosa.cqt``.
    nfreqs : int or None
        If given, only the first ``nfreqs`` CQT rows are kept.

    Returns
    -------
    np.ndarray
        Array of shape ``(3, L, d)`` where ``d, L = cqt.shape``: the
        transposed CQT, first-order delta and second-order delta, in that
        order along the first axis.
    """
    f = Sndfile(filename, 'r')
    try:
        data = f.read_frames(f.nframes)
        samplerate = f.samplerate
    finally:
        # Release the file handle even if reading fails.
        f.close()

    cqt = librosa.cqt(data, sr=samplerate, fmin=fmin, n_bins=n_bins,
                      hop_length=hop_length)
    if nfreqs is not None:
        cqt = cqt[:nfreqs, :]

    delta1 = librosa.feature.delta(cqt, order=1)
    delta2 = librosa.feature.delta(cqt, order=2)

    # Give every matrix a leading singleton axis so vstack stacks them as
    # three separate planes.
    d, L = cqt.shape
    planes = [m.T.reshape(1, L, d) for m in (cqt, delta1, delta2)]
    return np.vstack(planes)
def compute_features(audio, y_harmonic):
    """Compute MFCC, HPCP, Tonnetz, constant-Q and gammatone features.

    All analysis settings are read from ``msaf.Anal``.  Every returned
    matrix is transposed so that rows index frames.

    Parameters
    ----------
    audio: np.array(N)
        Audio samples of the given input.
    y_harmonic: np.array(N)
        Harmonic part of the audio signal, in samples.

    Returns
    -------
    mfcc: np.array(N, msaf.Anal.mfcc_coeff)
        Mel-frequency Cepstral Coefficients.
    hpcp: np.array(N, 12)
        Pitch Class Profiles, computed from ``y_harmonic``.
    tonnetz: np.array(N, 6)
        Tonal Centroid features derived from ``hpcp``.
    cqt: np.array(N, msaf.Anal.cqt_bins)
        Constant-Q log-scale features.
    gmt: np.array(N, msaf.Anal.mfcc_coeff+6)
        Gammatone cepstral coefficients followed by gammatone contrast.
    """
    rate = msaf.Anal.sample_rate
    hop = msaf.Anal.hop_size

    logging.info("Computing Spectrogram...")
    mel_spec = librosa.feature.melspectrogram(audio,
                                              sr=rate,
                                              n_fft=msaf.Anal.frame_size,
                                              hop_length=hop,
                                              n_mels=msaf.Anal.n_mels)

    logging.info("Computing Constant-Q...")
    # The CQT output is squared directly here (no explicit magnitude).
    cqt_squared = librosa.cqt(audio,
                              sr=rate,
                              hop_length=hop,
                              n_bins=msaf.Anal.cqt_bins)**2
    cqt = librosa.logamplitude(cqt_squared, ref_power=np.max).T

    logging.info("Computing MFCCs...")
    log_mel = librosa.logamplitude(mel_spec, ref_power=np.max)
    mfcc = librosa.feature.mfcc(S=log_mel, n_mfcc=msaf.Anal.mfcc_coeff).T

    logging.info("Computing HPCPs...")
    hpcp = librosa.feature.chroma_cqt(y=y_harmonic,
                                      sr=rate,
                                      hop_length=hop,
                                      n_octaves=msaf.Anal.n_octaves,
                                      n_chroma=12,
                                      fmin=msaf.Anal.f_min).T

    logging.info("Computing Tonnetz...")
    tonnetz = utils.chroma_to_tonnetz(hpcp)

    # Mi: Extracting Gammatone features
    logging.info("Computing gammatone features...")
    gammatone_common = dict(sr=rate,
                            nfft=msaf.Anal.frame_size*2,
                            hop_length=hop,
                            nfilters=64,
                            f_min=50,
                            f_max=msaf.Anal.sample_rate/2,
                            log=False)
    gcc = librosa.feature.gammatone_cepstral_coeffecients(
        audio, nCoeff=msaf.Anal.mfcc_coeff, **gammatone_common).T
    gc = librosa.feature.gammatone_contrast(
        audio, n_bands=6, quantile=0.02, **gammatone_common).T

    # Cepstral coefficients first, contrast bands after.
    gmt = np.hstack((gcc, gc))

    return mfcc, hpcp, tonnetz, cqt, gmt
def _tf_stft_magnitude(audio_samples):
    """Return the magnitude STFT (2048-sample frames, 512-sample step)."""
    stfts = tf.signal.stft(audio_samples,
                           frame_length=2048,
                           frame_step=512,
                           fft_length=2048,
                           pad_end=True)
    return tf.abs(stfts)


def _tf_mel_spectrogram(audio_samples, sr, upper_edge_hertz,
                        fix_static_shape=True):
    """Project the magnitude STFT onto 80 mel bands from 80 Hz upwards."""
    spectrograms = _tf_stft_magnitude(audio_samples)
    num_spectrogram_bins = spectrograms.shape[-1]
    lower_edge_hertz, num_mel_bins = 80.0, 80
    linear_to_mel_weight_matrix = tf.signal.linear_to_mel_weight_matrix(
        num_mel_bins, num_spectrogram_bins, sr, lower_edge_hertz,
        upper_edge_hertz)
    mel_spectrograms = tf.tensordot(spectrograms,
                                    linear_to_mel_weight_matrix, 1)
    if fix_static_shape:
        # tensordot loses the static shape; restore it for downstream ops.
        mel_spectrograms.set_shape(spectrograms.shape[:-1].concatenate(
            linear_to_mel_weight_matrix.shape[-1:]))
    return mel_spectrograms


def _tf_swap_last_axes(features, first_axis_is_batch):
    """Swap the last two axes, leaving a leading batch axis in place."""
    perm = [0, 2, 1] if first_axis_is_batch else [1, 0]
    return tf.transpose(features, perm=perm)


def _np_per_clip(audio_samples, first_axis_is_batch, extract):
    """Run a NumPy/librosa extractor on one clip or on every clip of a batch."""
    audio = audio_samples.numpy()
    if first_axis_is_batch:
        f = [extract(clip) for clip in audio]
    else:
        f = extract(audio)
    return tf.convert_to_tensor(f, dtype=tf.float32)


def Signal_Process(audio_samples,
                   first_axis_is_batch=False,
                   sr=22050,
                   method='stft'):
    """
    Convert raw audio into the feature representation named by ``method``.

    The ``tf_*`` methods (and ``'your_own_way'``) are built from TensorFlow
    ops; the remaining methods call ``audio_samples.numpy()`` and use
    librosa, returning a float32 tensor.

    :param audio_samples: sampled raw audio input (tf.Tensor)
    :param first_axis_is_batch: first axis means batch, default = False
    :param sr: sampling rate
    :param method: signal process method; one of 'your_own_way',
        'raw_audio', 'tf_stft', 'tf_mel_spectrogram',
        'tf_log_mel_spectrogram', 'tf_mfcc', 'stft', 'cqt', 'chroma_cqt',
        'chroma_cens', 'chroma_stft', 'rms', 'mel_spectrogram', 'mfcc'
    :return: signal_processed output [feature_size, sequence_length]
    :raises NotImplementedError: if ``method`` is not one of the above
    """
    # TODO: define your signal process method with various functions and hyper parameters
    if method == 'raw_audio':
        return audio_samples

    # ---- TensorFlow pipelines --------------------------------------
    if method == 'tf_stft':
        return _tf_swap_last_axes(_tf_stft_magnitude(audio_samples),
                                  first_axis_is_batch)

    if method == 'tf_mel_spectrogram':
        # This variant never set the static shape; keep it that way.
        mel = _tf_mel_spectrogram(audio_samples, sr, 7600.0,
                                  fix_static_shape=False)
        return _tf_swap_last_axes(mel, first_axis_is_batch)

    if method in ('your_own_way', 'tf_log_mel_spectrogram', 'tf_mfcc'):
        # 'your_own_way' is the editable template; it only differs from
        # the log-mel pipeline in its upper band edge.
        upper_edge_hertz = 2048.0 if method == 'your_own_way' else 7600.0
        mel = _tf_mel_spectrogram(audio_samples, sr, upper_edge_hertz)
        features = tf.math.log(mel + 1e-6)
        if method == 'tf_mfcc':
            # Keep the first 20 cepstral coefficients.
            features = tf.signal.mfccs_from_log_mel_spectrograms(
                features)[..., :20]
        return _tf_swap_last_axes(features, first_axis_is_batch)

    # ---- NumPy / librosa pipelines ---------------------------------
    def stft_magnitude(clip):
        return np.abs(librosa.stft(clip, n_fft=2048, hop_length=512))

    def cqt_magnitude(clip, **extra):
        return np.abs(
            librosa.cqt(clip,
                        sr=float(sr),
                        hop_length=512,
                        bins_per_octave=12,
                        n_bins=7 * 12,
                        **extra))

    def mel_power(clip):
        return librosa.feature.melspectrogram(S=stft_magnitude(clip)**2,
                                              sr=sr)

    extractors = {
        'stft': stft_magnitude,
        'cqt': cqt_magnitude,
        'chroma_cqt': lambda clip: librosa.feature.chroma_cqt(
            C=cqt_magnitude(clip, tuning=None), n_chroma=12, n_octaves=7),
        'chroma_cens': lambda clip: librosa.feature.chroma_cens(
            C=cqt_magnitude(clip, tuning=None), n_chroma=12, n_octaves=7),
        'chroma_stft': lambda clip: librosa.feature.chroma_stft(
            S=stft_magnitude(clip)**2, n_chroma=12),
        'rms': lambda clip: librosa.feature.rms(S=stft_magnitude(clip)),
        'mel_spectrogram': mel_power,
        'mfcc': lambda clip: librosa.feature.mfcc(
            S=librosa.power_to_db(mel_power(clip)), n_mfcc=20),
    }

    extract = extractors.get(method)
    if extract is None:
        raise NotImplementedError(
            "Unknown signal process method: %r" % (method,))
    return _np_per_clip(audio_samples, first_axis_is_batch, extract)
hop_length=frame_hop) # hanning window to smooth the spectrum out han_win = signal.hanning(frame_length) # let's extract CQT_frames = [] for frame in range(y_frames.shape[1]): if not use_han: sig = y_frames[:, frame] else: sig = han_win * y_frames[:, frame] CQTf = np.abs( librosa.cqt(sig, sr=sr, n_bins=n_bins, bins_per_octave=bins_per_octave, fmin=fmin, hop_length=cqt_hop, real=False)) CQT_frames.append(CQTf[:, 1:-1]) # concatenate everything together CQT = np.hstack(CQT_frames) # Take the log amplitude CQTlog = librosa.logamplitude(CQT**2, ref_power=np.max) # save the extracted CQT inface.upload_raw_array(stim_out.format(cqt_range, stimuli[i]), CQTlog)
def draw_cqt(audio, samplerate):
    """Plot a constant-Q spectrogram of ``audio`` in dB relative to its peak.

    The transform uses 3 bins per semitone over 5 octaves (``60 * 3`` bins)
    with a hop of 16 samples.  The figure is drawn with
    ``librosa.display.specshow``; nothing is returned.
    """
    s = 3  # bins per semitone
    hop_length = 16
    bins_per_octave = 12 * s

    C = librosa.cqt(audio,
                    sr=samplerate,
                    n_bins=60 * s,
                    bins_per_octave=bins_per_octave,
                    hop_length=hop_length)

    # Take the magnitude explicitly: the dB conversion is defined on
    # amplitudes, not on complex coefficients.
    C_db = librosa.amplitude_to_db(np.abs(C), ref=np.max)

    # specshow must be told the hop and bin resolution actually used,
    # otherwise it labels both axes with its own defaults.
    librosa.display.specshow(C_db,
                             sr=samplerate,
                             hop_length=hop_length,
                             bins_per_octave=bins_per_octave,
                             x_axis='time',
                             y_axis='cqt_note')
def laplacian_segmentation(y, sr, k=5):
    """Segment a track with the Laplacian method of McFee and Ellis (2014).

    Adapted from the example code in the librosa documentation.

    Parameters
    ----------
    y : np.ndarray
        Audio samples.
    sr : number
        Sampling rate of ``y``.
    k : int
        Number of clusters, and of leading eigenvectors used for them.

    Returns
    -------
    segments : list
        One ``((start, end), (start_beat, end_beat), segment_id)`` tuple
        per segment.  ``start``/``end`` are the boundary times in seconds
        divided by 60 and multiplied by the estimated tempo.
    beat_times : np.ndarray
        Times in seconds of the beat frames, extended to include frame 0
        and the final frame.
    tempo
        Tempo estimate returned by ``librosa.beat.beat_track``.
    """
    BINS_PER_OCTAVE = 12 * 3
    N_OCTAVES = 7
    C = librosa.amplitude_to_db(np.abs(
        librosa.cqt(y=y,
                    sr=sr,
                    bins_per_octave=BINS_PER_OCTAVE,
                    n_bins=N_OCTAVES * BINS_PER_OCTAVE)),
                                ref=np.max)

    tempo, beats = librosa.beat.beat_track(y=y, sr=sr, trim=False)
    Csync = librosa.util.sync(C, beats, aggregate=np.median)

    # For plotting purposes, we'll need the timing of the beats
    # we fix_frames to include non-beat frames 0 and C.shape[1] (final frame)
    beat_times = librosa.frames_to_time(librosa.util.fix_frames(
        beats, x_min=0, x_max=C.shape[1]), sr=sr)

    R = librosa.segment.recurrence_matrix(Csync,
                                          width=3,
                                          mode='affinity',
                                          sym=True)

    # Enhance diagonals with a median filter (Equation 2)
    df = librosa.segment.timelag_filter(scipy.ndimage.median_filter)
    Rf = df(R, size=(1, 7))

    # Sequence graph: similarity of each beat to its direct neighbours.
    mfcc = librosa.feature.mfcc(y=y, sr=sr)
    Msync = librosa.util.sync(mfcc, beats)

    path_distance = np.sum(np.diff(Msync, axis=1)**2, axis=0)
    sigma = np.median(path_distance)
    path_sim = np.exp(-path_distance / sigma)

    R_path = np.diag(path_sim, k=1) + np.diag(path_sim, k=-1)

    # Balance the recurrence and sequence graphs by their degrees.
    deg_path = np.sum(R_path, axis=1)
    deg_rec = np.sum(Rf, axis=1)

    mu = deg_path.dot(deg_path + deg_rec) / np.sum((deg_path + deg_rec)**2)

    A = mu * Rf + (1 - mu) * R_path

    L = scipy.sparse.csgraph.laplacian(A, normed=True)

    # and its spectral decomposition
    evals, evecs = scipy.linalg.eigh(L)

    # We can clean this up further with a median filter.
    # This can help smooth over small discontinuities
    evecs = scipy.ndimage.median_filter(evecs, size=(9, 1))

    # cumulative normalization is needed for symmetric normalize laplacian eigenvectors
    Cnorm = np.cumsum(evecs**2, axis=1)**0.5

    # If we want k clusters, use the first k normalized eigenvectors.
    # Fun exercise: see how the segmentation changes as you vary k
    X = evecs[:, :k] / Cnorm[:, k - 1:k]

    KM = sklearn.cluster.KMeans(n_clusters=k)
    seg_ids = KM.fit_predict(X)

    # A boundary sits wherever consecutive beats change cluster.
    bound_beats = 1 + np.flatnonzero(seg_ids[:-1] != seg_ids[1:])

    # Count beat 0 as a boundary
    bound_beats = librosa.util.fix_frames(bound_beats, x_min=0)

    # Compute the segment label for each boundary
    bound_segs = list(seg_ids[bound_beats])

    # Convert beat indices to frames
    bound_frames = beats[bound_beats]

    # Make sure we cover to the end of the track
    bound_frames = librosa.util.fix_frames(bound_frames,
                                           x_min=None,
                                           x_max=C.shape[1] - 1)

    # sr is passed explicitly so the conversion matches beat_times above;
    # the library default is only correct for audio at that default rate.
    bound_times = librosa.frames_to_time(bound_frames, sr=sr)
    # Seconds -> minutes -> multiplied by the tempo estimate.
    bound_times = [(x / 60) * tempo for x in bound_times]

    # Close the last segment at the index of the final beat.
    bound_beats = np.append(bound_beats, len(beats) - 1)

    segments = list(
        zip(zip(bound_times, bound_times[1:]),
            zip(bound_beats, bound_beats[1:]), bound_segs))

    return segments, beat_times, tempo
def getFusedSimilarity(filename, sr, hop_length, win_fac, wins_per_block, K,
                       reg_diag, reg_neighbs, niters, do_animation,
                       plot_result, do_crema=True):
    """
    Load in filename, compute features, average/stack delay, and do
    similarity network fusion (SNF) on all feature types

    Parameters
    ----------
    filename: string
        Path to music file
    sr: int
        Sample rate at which to sample file
    hop_length: int
        Hop size between frames in chroma and mfcc
    win_fac: int
        Number of frames to average (i.e. factor by which to downsample)
        If negative, then do beat tracking, and subdivide by |win_fac|
        times within each beat
    wins_per_block: int
        Number of aggregated windows per sliding window block
    K: int
        Number of nearest neighbors in SNF.  If -1, then autotuned to
        round(2*log2(N)) for an NxN similarity matrix
    reg_diag: float
        Regularization for self-similarity promotion
    reg_neighbs: float
        Regularization for direct neighbor similarity promotion
    niters: int
        Number of iterations in SNF
    do_animation: boolean
        Whether to plot and save images of the evolution of SNF
    plot_result: boolean
        Whether to plot the result of the fusion
    do_crema: boolean
        Whether to include precomputed crema in the fusion

    Returns
    -------
    {'Ws': A dictionary of weighted adjacency matrices for the individual
           features ('MFCCs', 'Chromas', 'Tempogram' and, if available,
           'Crema'), the fused adjacency matrix ('Fused') and, when crema
           is used, the partial fusions 'Fused Tgram_Crema',
           'Fused MFCC_Crema' and 'Fused MFCC_Chroma_Crema',
     'times': Time in seconds of each row in the similarity matrices,
     'K': The number of nearest neighbors actually used}
    """
    ## Step 1: Load audio
    print("Loading %s..." % filename)
    if MANUAL_AUDIO_LOAD:
        wav_path = "%s.wav" % filename
        subprocess.call([
            FFMPEG_BINARY, "-i", filename, "-ar", "%i" % sr, "-ac", "1",
            wav_path
        ])
        try:
            sr, y = sio.wavfile.read(wav_path)
        finally:
            # Do not leave the temporary wav behind if reading fails.
            if os.path.exists(wav_path):
                os.remove(wav_path)
        y = y / 2.0**15
    else:
        y, sr = librosa.load(filename, sr=sr)

    ## Step 2: Figure out intervals to which to sync features
    if win_fac > 0:
        # Compute features in intervals evenly spaced by the hop size
        # but average within "win_fac" intervals of hop_length
        nHops = int(
            (y.size - hop_length * win_fac * wins_per_block) / hop_length)
        intervals = np.arange(0, nHops, win_fac)
    else:
        # Compute features in intervals which are subdivided beats
        # by a factor of |win_fac|
        C = np.abs(librosa.cqt(y=y, sr=sr))
        _, beats = librosa.beat.beat_track(y=y, sr=sr, trim=False,
                                           start_bpm=240)
        intervals = librosa.util.fix_frames(beats, x_max=C.shape[1])
        intervals = librosa.segment.subsegment(C, intervals,
                                               n_segments=abs(win_fac))

    ## Step 3: Compute features
    # 1) CQT chroma with 3x oversampling in pitch
    chroma = librosa.feature.chroma_cqt(y=y, sr=sr, hop_length=hop_length,
                                        bins_per_octave=12 * 3)

    # 2) Exponentially liftered MFCCs
    S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128,
                                       hop_length=hop_length)
    log_S = librosa.power_to_db(S, ref=np.max)
    mfcc = librosa.feature.mfcc(S=log_S, n_mfcc=20)
    lifterexp = 0.6
    coeffs = np.arange(mfcc.shape[0])**lifterexp
    coeffs[0] = 1
    mfcc = coeffs[:, None] * mfcc

    # 3) Tempograms
    # Use a super-flux max smoothing of 5 frequency bands in the oenv
    # calculation
    SUPERFLUX_SIZE = 5
    oenv = librosa.onset.onset_strength(y=y, sr=sr, hop_length=hop_length,
                                        max_size=SUPERFLUX_SIZE)
    tempogram = librosa.feature.tempogram(onset_envelope=oenv, sr=sr,
                                          hop_length=hop_length)

    # 4) Crema
    if do_crema:
        matfilename = "%s_crema.mat" % filename
        if not os.path.exists(matfilename):
            print("****WARNING: PRECOMPUTED CREMA DOES NOT EXIST****")
            do_crema = False
        else:
            data = sio.loadmat(matfilename)
            # Resample the crema frames onto the MFCC frame grid.
            fac = (float(sr) / 44100.0) * 4096.0 / hop_length
            times_orig = fac * np.arange(len(data['chord_bass']))
            times_new = np.arange(mfcc.shape[1])
            interp = scipy.interpolate.interp1d(times_orig,
                                                data['chord_pitch'].T,
                                                kind='nearest',
                                                fill_value='extrapolate')
            chord_pitch = interp(times_new)

    ## Step 4: Synchronize features to intervals
    n_frames = np.min([chroma.shape[1], mfcc.shape[1], tempogram.shape[1]])
    if do_crema:
        n_frames = min(n_frames, chord_pitch.shape[1])
    intervals = librosa.util.fix_frames(intervals, x_min=0, x_max=n_frames)
    times = intervals * float(hop_length) / float(sr)

    # median-aggregate chroma to suppress transients and passing tones
    chroma = librosa.util.sync(chroma, intervals, aggregate=np.median)
    chroma = chroma[:, :n_frames]
    mfcc = librosa.util.sync(mfcc, intervals)
    mfcc = mfcc[:, :n_frames]
    tempogram = librosa.util.sync(tempogram, intervals)
    tempogram = tempogram[:, :n_frames]
    if do_crema:
        chord_pitch = librosa.util.sync(chord_pitch, intervals)
        chord_pitch = chord_pitch[:, :n_frames]

    ## Step 5: Do a delay embedding and compute SSMs
    XChroma = librosa.feature.stack_memory(chroma, n_steps=wins_per_block,
                                           mode='edge').T
    DChroma = getCSMCosine(XChroma, XChroma)  # Cosine distance
    XMFCC = librosa.feature.stack_memory(mfcc, n_steps=wins_per_block,
                                         mode='edge').T
    DMFCC = getCSM(XMFCC, XMFCC)  # Euclidean distance
    XTempogram = librosa.feature.stack_memory(tempogram,
                                              n_steps=wins_per_block,
                                              mode='edge').T
    DTempogram = getCSM(XTempogram, XTempogram)
    if do_crema:
        XChordPitch = librosa.feature.stack_memory(chord_pitch,
                                                   n_steps=wins_per_block,
                                                   mode='edge').T
        DChordPitch = getCSMCosine(XChordPitch, XChordPitch)

    ## Step 6: Run similarity network fusion
    # FeatureNames must stay aligned one-to-one with Ds: the names are
    # zipped with the matrices below and used as plot labels.
    FeatureNames = ['MFCCs', 'Chromas', 'Tempogram']
    Ds = [DMFCC, DChroma, DTempogram]
    if do_crema:
        FeatureNames.append('Crema')
        Ds.append(DChordPitch)

    # Edge case: If it's too small, zeropad SSMs
    for i, Di in enumerate(Ds):
        if Di.shape[0] < 2 * K:
            D = np.zeros((2 * K, 2 * K))
            D[0:Di.shape[0], 0:Di.shape[1]] = Di
            Ds[i] = D

    pK = K
    if K == -1:
        pK = int(np.round(2 * np.log(Ds[0].shape[0]) / np.log(2)))
        print("Autotuned K = %i" % pK)

    # Do fusion on all features
    Ws = [getW(D, pK) for D in Ds]
    if REC_SMOOTH > 0:
        from scipy.ndimage import median_filter
        df = librosa.segment.timelag_filter(median_filter)
        Ws = [df(W, size=(1, REC_SMOOTH)) for W in Ws]

    WFused = doSimilarityFusionWs(Ws, K=pK, niters=niters,
                                  reg_diag=reg_diag, reg_neighbs=reg_neighbs,
                                  do_animation=do_animation,
                                  PlotNames=FeatureNames,
                                  PlotExtents=[times[0], times[-1]])
    WsDict = dict(zip(FeatureNames, Ws))
    WsDict['Fused'] = WFused

    if do_crema:
        # Do fusion with tempograms and Crema if Crema is available
        WsDict['Fused Tgram_Crema'] = doSimilarityFusionWs(
            Ws[2::], K=pK, niters=niters,
            reg_diag=reg_diag, reg_neighbs=reg_neighbs)
        # Do fusion with MFCC and Crema
        WsDict['Fused MFCC_Crema'] = doSimilarityFusionWs(
            [Ws[0], Ws[-1]], K=pK, niters=niters,
            reg_diag=reg_diag, reg_neighbs=reg_neighbs)
        # Do fusion with MFCC, Chroma, and Crema
        WsDict['Fused MFCC_Chroma_Crema'] = doSimilarityFusionWs(
            [Ws[0], Ws[1], Ws[-1]], K=pK, niters=niters,
            reg_diag=reg_diag, reg_neighbs=reg_neighbs)

    if plot_result:
        plotFusionResults(WsDict, {}, {}, times, win_fac)
        plt.savefig("%s_Plot.png" % filename, bbox_inches='tight')

    return {'Ws': WsDict, 'times': times, 'K': pK}
def preprocess_wav_file(file_path_or_bytes, Y_numSlice):
    """Turn one audio input into normalized, windowed CQT frames.

    The audio is loaded with ``auto_load``, resampled to ``DOWNSAMPLED_SR``,
    converted to a CQT magnitude, standardized with the means and standard
    deviations stored in ``sl_data/std/means_stds-nm.h5``, zero-padded in
    time, and cut into one ``WINDOW_SIZE``-frame window per CQT frame.

    Parameters
    ----------
    file_path_or_bytes
        Audio source, passed straight to ``auto_load``.
    Y_numSlice : int or None
        Upper bound on the number of windows returned; ``None`` keeps all.

    Returns
    -------
    frame_windows_list : list of np.ndarray
        Single-element list holding an array of shape
        ``(num_slices, n_bins, WINDOW_SIZE, 1)``.
    numSlices_list : list of int
        Single-element list with the number of windows kept.
    """
    y, sr = auto_load(file_path_or_bytes, sr=None)
    y_downsample = librosa.resample(y, orig_sr=sr, target_sr=DOWNSAMPLED_SR)
    CQT_result = librosa.cqt(y_downsample, sr=DOWNSAMPLED_SR,
                             hop_length=HOP_LENGTH, n_bins=NUM_BINS,
                             bins_per_octave=BINS_PER_OCTAVE)
    CQT_result = np.absolute(CQT_result)

    # Standardize with the precomputed statistics.
    with h5py.File('sl_data/std/means_stds-nm.h5', 'r') as h5f:
        mean = h5f['means'][:]
        std = h5f['stds'][:]
    CQT_result = np.divide(np.subtract(CQT_result, mean), std)

    # Zero-pad in time so every frame has a full window.  The destination
    # slice is given by explicit start/stop: the previous
    # ``[pad_amount:-pad_amount]`` form was only the right width for odd
    # WINDOW_SIZE > 1 (empty for 1, one column short for even sizes).
    n_frames = CQT_result.shape[1]
    pad_amount = WINDOW_SIZE // 2
    paddedX = np.zeros((CQT_result.shape[0], n_frames + WINDOW_SIZE - 1),
                       dtype=float)
    paddedX[:, pad_amount:pad_amount + n_frames] = CQT_result

    frame_windows = np.array(
        [paddedX[:, j:j + WINDOW_SIZE] for j in range(n_frames)])
    frame_windows = np.expand_dims(frame_windows, axis=3)

    if Y_numSlice is not None:
        numSlices = min(frame_windows.shape[0], Y_numSlice)
    else:
        numSlices = frame_windows.shape[0]

    # Lists are kept for compatibility with callers that expect one entry
    # per processed example.
    return [frame_windows[:numSlices]], [numSlices]
type=int, action='store', help='Program of the instrument in the output MIDI') parser.add_argument('--min_midi', default=24, type=int, action='store', help='Minimum MIDI note to transcribe') parser.add_argument('--max_midi', default=107, type=int, action='store', help='Maximum MIDI note to transcribe') parser.add_argument('--threshold', default=64, type=int, action='store', help='Threshold to activate note on event, 0-127') parameters = vars(parser.parse_args(sys.argv[1:])) y, sr = librosa.load(parameters['input_audio']) min_midi, max_midi = parameters['min_midi'], parameters['max_midi'] cqt = librosa.cqt(y, sr=sr, fmin=min_midi, n_bins=max_midi - min_midi) pr = cqt_to_piano_roll(cqt, min_midi, max_midi, parameters['threshold']) # get audio time audio_time = len(y) / sr # get sampling frequency of cqt spectrogram fs = pr.shape[1] / audio_time pm = piano_roll_to_pretty_midi(pr, fs=fs, program=parameters['program']) pm.write(parameters['output_midi'])
def test_cqt_precision(y_cqt, sr_cqt, dtype):
    """The CQT output must carry exactly the dtype that was requested."""
    transform = librosa.cqt(y=y_cqt, sr=sr_cqt, dtype=dtype)
    expected = np.dtype(dtype)
    assert np.dtype(transform.dtype) == expected
def test_cqt_fail_short_late():
    """Run the CQT on a 16-sample all-zero signal at sr=22050.

    NOTE(review): the name suggests the call is expected to raise; whatever
    asserts that (e.g. a decorator) is not visible here.
    """
    silence = np.zeros(16)
    librosa.cqt(silence, sr=22050)
def __process_audio(self):
    """ The main audio processing routine for the thread.

    This routine uses Laplacian Segmentation to find and group similar beats
    in the song, then builds a long randomized playback order over them.

    This code has been adapted from the sample created by Brian McFee at
    https://librosa.github.io/librosa_gallery/auto_examples/plot_segmentation.html#sphx-glr-auto-examples-plot-segmentation-py
    and is based on his 2014 paper published at
    http://bmcfee.github.io/papers/ismir2014_spectral.pdf

    I have made some performance improvements, but the basic parts remain
    (mostly) unchanged.

    Reads ``self.__filename``, ``self.clusters``, ``self._use_v1``,
    ``self.__start_beat`` and ``self.play_ready``.

    Sets ``self.duration``, ``self.raw_audio``, ``self.sample_rate``,
    ``self.tempo``, ``self.clusters`` (only when it was 0),
    ``self.max_amplitude``, ``self.segments``, ``self.outro``,
    ``self.beats`` and ``self.play_vector``; reports progress through
    ``self.__report_progress`` and finally sets ``self.play_ready`` if it
    is truthy.  Returns nothing.
    """

    self.__report_progress(.1, "loading file and extracting raw audio")

    #
    # load the file as stereo with a high sample rate and
    # trim the silences from each end
    #

    y, sr = librosa.core.load(self.__filename, mono=False, sr=None)
    y, _ = librosa.effects.trim(y)

    self.duration = librosa.core.get_duration(y, sr)
    # scale to the int16 range and transpose into a C-contiguous array
    self.raw_audio = (y * np.iinfo(np.int16).max).astype(
        np.int16).T.copy(order='C')
    self.sample_rate = sr

    # after the raw audio bytes are saved, convert the samples to mono
    # because the beat detection algorithm in librosa requires it.

    y = librosa.core.to_mono(y)

    self.__report_progress(.2, "computing pitch data...")

    # Compute the constant-q chromagram for the samples.

    BINS_PER_OCTAVE = 12 * 3
    N_OCTAVES = 7

    cqt = librosa.cqt(y=y, sr=sr, bins_per_octave=BINS_PER_OCTAVE,
                      n_bins=N_OCTAVES * BINS_PER_OCTAVE)
    C = librosa.amplitude_to_db(np.abs(cqt), ref=np.max)

    self.__report_progress(.3, "Finding beats...")

    ##########################################################
    # To reduce dimensionality, we'll beat-synchronous the CQT
    tempo, btz = librosa.beat.beat_track(y=y, sr=sr, trim=False)
    # tempo, btz = librosa.beat.beat_track(y=y, sr=sr)
    Csync = librosa.util.sync(C, btz, aggregate=np.median)

    self.tempo = tempo

    # For alignment purposes, we'll need the timing of the beats
    # we fix_frames to include non-beat frames 0 and C.shape[1] (final frame)
    beat_times = librosa.frames_to_time(librosa.util.fix_frames(
        btz, x_min=0, x_max=C.shape[1]), sr=sr)

    self.__report_progress(.4, "building recurrence matrix...")
    #####################################################################
    # Let's build a weighted recurrence matrix using beat-synchronous CQT
    # (Equation 1)
    # width=3 prevents links within the same bar
    # mode='affinity' here implements S_rep (after Eq. 8)
    R = librosa.segment.recurrence_matrix(Csync, width=3, mode='affinity',
                                          sym=True)

    # Enhance diagonals with a median filter (Equation 2)
    df = librosa.segment.timelag_filter(scipy.ndimage.median_filter)
    Rf = df(R, size=(1, 7))

    ###################################################################
    # Now let's build the sequence matrix (S_loc) using mfcc-similarity
    #
    #   :math:`R_\text{path}[i, i\pm 1] = \exp(-\|C_i - C_{i\pm 1}\|^2 / \sigma^2)`
    #
    # Here, we take :math:`\sigma` to be the median distance between successive beats.
    #
    mfcc = librosa.feature.mfcc(y=y, sr=sr)
    Msync = librosa.util.sync(mfcc, btz)

    path_distance = np.sum(np.diff(Msync, axis=1)**2, axis=0)
    sigma = np.median(path_distance)
    path_sim = np.exp(-path_distance / sigma)

    R_path = np.diag(path_sim, k=1) + np.diag(path_sim, k=-1)

    ##########################################################
    # And compute the balanced combination (Equations 6, 7, 9)

    deg_path = np.sum(R_path, axis=1)
    deg_rec = np.sum(Rf, axis=1)

    mu = deg_path.dot(deg_path + deg_rec) / np.sum((deg_path + deg_rec)**2)

    A = mu * Rf + (1 - mu) * R_path

    #####################################################
    # Now let's compute the normalized Laplacian (Eq. 10)
    L = scipy.sparse.csgraph.laplacian(A, normed=True)

    # and its spectral decomposition
    _, evecs = scipy.linalg.eigh(L)

    # We can clean this up further with a median filter.
    # This can help smooth over small discontinuities
    evecs = scipy.ndimage.median_filter(evecs, size=(9, 1))

    # cumulative normalization is needed for symmetric normalize laplacian eigenvectors
    Cnorm = np.cumsum(evecs**2, axis=1)**0.5

    # If we want k clusters, use the first k normalized eigenvectors.
    # Fun exercise: see how the segmentation changes as you vary k

    self.__report_progress(.5, "clustering...")

    # if a value for clusters wasn't passed in, then we need to auto-cluster

    if self.clusters == 0:

        # if we've been asked to use the original auto clustering algorithm, otherwise
        # use the new and improved one that accounts for silhouette scores.

        if self._use_v1:
            self.clusters, seg_ids = self.__compute_best_cluster(
                evecs, Cnorm)
        else:
            self.clusters, seg_ids = self.__compute_best_cluster_with_sil(
                evecs, Cnorm)

    else:  # otherwise, just use the cluster value passed in
        k = self.clusters

        self.__report_progress(.51, "using %d clusters" % self.clusters)

        X = evecs[:, :k] / Cnorm[:, k - 1:k]
        seg_ids = sklearn.cluster.KMeans(n_clusters=k, max_iter=1000,
                                         random_state=0,
                                         n_init=1000).fit_predict(X)

    # Get the amplitudes and beat-align them
    self.__report_progress(.6, "getting amplitudes")

    # newer versions of librosa have renamed the rmse function

    if hasattr(librosa.feature, 'rms'):
        amplitudes = librosa.feature.rms(y=y)
    else:
        amplitudes = librosa.feature.rmse(y=y)

    ampSync = librosa.util.sync(amplitudes, btz)

    # create a list of tuples that include the ordinal position, the start time of the beat,
    # the cluster to which the beat belongs and the mean amplitude of the beat

    zbeat_tuples = zip(range(0, len(btz)), beat_times, seg_ids,
                       ampSync[0].tolist())
    beat_tuples = tuple(zbeat_tuples)

    info = []

    # NOTE(review): computed as len(self.raw_audio) / duration, i.e. rows of
    # raw_audio per second; the name says bytes -- confirm the intended unit.
    bytes_per_second = int(round(len(self.raw_audio) / self.duration))

    last_cluster = -1
    current_segment = -1
    segment_beat = 0

    for i in range(0, len(beat_tuples)):
        final_beat = {}
        final_beat['start'] = float(beat_tuples[i][1])
        final_beat['cluster'] = int(beat_tuples[i][2])
        final_beat['amplitude'] = float(beat_tuples[i][3])

        # a change of cluster starts a new segment; 'is' is the beat's
        # position inside its segment
        if final_beat['cluster'] != last_cluster:
            current_segment += 1
            segment_beat = 0
        else:
            segment_beat += 1

        final_beat['segment'] = current_segment
        final_beat['is'] = segment_beat

        last_cluster = final_beat['cluster']

        # the last beat lasts until the end of the audio
        if i == len(beat_tuples) - 1:
            final_beat['duration'] = self.duration - final_beat['start']
        else:
            final_beat['duration'] = beat_tuples[i + 1][1] - beat_tuples[i][1]

        if ((final_beat['start'] * bytes_per_second) % 2 > 1.5):
            final_beat['start_index'] = int(
                math.ceil(final_beat['start'] * bytes_per_second))
        else:
            final_beat['start_index'] = int(final_beat['start'] *
                                            bytes_per_second)

        final_beat['stop_index'] = int(
            math.ceil((final_beat['start'] + final_beat['duration']) *
                      bytes_per_second))

        # save pointers to the raw bytes for each beat with each beat.
        final_beat['buffer'] = self.raw_audio[
            final_beat['start_index']:final_beat['stop_index']]

        info.append(final_beat)

    self.__report_progress(.7, "truncating to fade point...")

    # NOTE: despite the name, this is the MEAN beat amplitude; the true max
    # is the commented-out alternative below.
    # max_amplitude = max([float(b['amplitude']) for b in info])
    max_amplitude = sum([float(b['amplitude']) for b in info]) / len(info)

    # assume that the fade point of the song is the last beat of the song that is >= 75% of
    # the reference amplitude computed above.

    self.max_amplitude = max_amplitude

    fade = len(info) - 1

    for b in reversed(info):
        if b['amplitude'] >= (.75 * max_amplitude):
            fade = info.index(b)
            break

    # truncate the beats to [start:fade + 1]
    beats = info[self.__start_beat:fade + 1]

    loop_bounds_begin = self.__start_beat

    self.__report_progress(.8, "computing final beat array...")

    # assign final beat ids
    for beat in beats:
        beat['id'] = beats.index(beat)
        beat['quartile'] = beat['id'] // (len(beats) / 4.0)

    # compute a coherent 'next' beat to play. This is always just the next ordinal beat
    # unless we're at the end of the song. Then it gets a little trickier.

    for beat in beats:
        if beat == beats[-1]:

            # if we're at the last beat, then we want to find a reasonable 'next' beat to play. It should (a) share the
            # same cluster, (b) be in a logical place in its measure, (c) be after the computed loop_bounds_begin, and
            # is in the first half of the song. If we can't find such an animal, then just return the beat
            # at loop_bounds_begin

            beat['next'] = next(
                (b['id'] for b in beats
                 if b['cluster'] == beat['cluster'] and b['id'] % 4 ==
                 (beat['id'] + 1) % 4 and b['id'] <=
                 (.5 * len(beats)) and b['id'] >= loop_bounds_begin),
                loop_bounds_begin)
        else:
            beat['next'] = beat['id'] + 1

        # find all the beats that (a) are in the same cluster as the NEXT ordinal beat, (b) are of the same
        # cluster position as the next ordinal beat, (c) are in the same place in the measure as the NEXT beat,
        # (d) but AREN'T the next beat, and (e) AREN'T in the same segment as the current beat.
        #
        # THAT collection of beats contains our jump candidates

        jump_candidates = [
            bx['id'] for bx in beats[loop_bounds_begin:]
            if (bx['cluster'] == beats[beat['next']]['cluster']) and (
                bx['is'] == beats[beat['next']]['is']) and (
                    bx['id'] % 4 == beats[beat['next']]['id'] % 4) and
            (bx['segment'] != beat['segment']) and (
                bx['id'] != beat['next'])
        ]

        if jump_candidates:
            beat['jump_candidates'] = jump_candidates
        else:
            beat['jump_candidates'] = []

    # save off the segment count

    self.segments = max([b['segment'] for b in beats]) + 1

    # we don't want to ever play past the point where it's impossible to loop,
    # so let's find the latest point in the song where there are still jump
    # candidates and make sure that we can't play past it.

    last_chance = len(beats) - 1

    for b in reversed(beats):
        if len(b['jump_candidates']) > 0:
            last_chance = beats.index(b)
            break

    # if we play our way to the last beat that has jump candidates, then just skip
    # to the earliest jump candidate rather than enter a section from which no
    # jumping is possible.
    # NOTE(review): if no beat has any jump candidates, this list is empty
    # and min() raises ValueError.

    beats[last_chance]['next'] = min(beats[last_chance]['jump_candidates'])

    # store the beats that start after the last jumpable point. That's
    # the outro to the song. We can use these
    # beats to create a sane ending for a fixed-length remix

    outro_start = last_chance + 1 + self.__start_beat

    if outro_start >= len(info):
        self.outro = []
    else:
        self.outro = info[outro_start:]

    #
    # This section of the code computes the play_vector -- a 1024*1024 beat length
    # remix of the current song.
    #

    random.seed()

    # how long should our longest contiguous playback blocks be? One way to
    # consider it is that higher bpm songs need longer blocks because
    # each beat takes less time. A simple way to estimate a good value
    # is to scale it by its distance from 120bpm -- the canonical bpm
    # for popular music. Find that value and round down to the nearest
    # multiple of 4. (There almost always are 4 beats per measure in Western music).
    # NOTE(review): randrange(16, max_sequence_len, 4) raises ValueError when
    # max_sequence_len <= 16, i.e. for sufficiently low tempo values.

    max_sequence_len = int(round((self.tempo / 120.0) * 48.0))
    max_sequence_len = max_sequence_len - (max_sequence_len % 4)

    min_sequence = max(random.randrange(16, max_sequence_len, 4),
                       loop_bounds_begin)

    current_sequence = 0
    beat = beats[0]

    self.__report_progress(.9, "creating play vector")

    play_vector = []

    play_vector.append({
        'beat': 0,
        'seq_len': min_sequence,
        'seq_pos': current_sequence
    })

    # we want to keep a list of recently played segments so we don't accidentally wind up in a local loop
    #
    # the number of segments in a song will vary so we want to set the number of recents to keep
    # at 25% of the total number of segments. Eg: if there are 34 segments, then the depth will
    # be set at round(8.5) == 9.
    #
    # On the off chance that the (# of segments) *.25 < 1 we set a floor queue depth of 1

    recent_depth = int(round(self.segments * .25))
    recent_depth = max(recent_depth, 1)

    recent = collections.deque(maxlen=recent_depth)

    # keep track of the time since the last successful jump. If we go more than
    # 10% of the song length since our last jump, then we will prioritize an
    # immediate jump to a not recently played segment. Otherwise playback will
    # be boring for the listener. This also has the advantage of busting out of
    # local loops.

    max_beats_between_jumps = int(round(len(beats) * .1))
    beats_since_jump = 0
    failed_jumps = 0

    for i in range(0, 1024 * 1024):

        if beat['segment'] not in recent:
            recent.append(beat['segment'])

        current_sequence += 1

        # it's time to attempt a jump if we've played all the beats we wanted in the
        # current sequence. Also, if we've gone more than 10% of the length of the song
        # without jumping we need to immediately prioritize jumping to a non-recent segment.

        will_jump = (current_sequence == min_sequence) or (
            beats_since_jump >= max_beats_between_jumps)

        # since it's time to jump, let's find the most musically pleasing place
        # to go

        if (will_jump):

            # find the jump candidates that haven't been recently played
            non_recent_candidates = [
                c for c in beat['jump_candidates']
                if beats[c]['segment'] not in recent
            ]

            # if there aren't any good jump candidates, then we need to fall back
            # to another selection scheme.

            if len(non_recent_candidates) == 0:

                beats_since_jump += 1
                failed_jumps += 1

                # suppose we've been trying to jump but couldn't find a good non-recent candidate. If
                # the length of time we've been trying (and failing) is >= 10% of the song length
                # then it's time to relax our criteria. Let's find the jump candidate that's furthest
                # from the current beat (irrespective if it's been played recently) and go there. Ideally
                # we'd like to jump to a beat that is not in the same quartile of the song as the currently
                # playing section. That way we maximize our chances of avoiding a long local loop -- such as
                # might be found in the section preceding the outro of a song.

                non_quartile_candidates = [
                    c for c in beat['jump_candidates']
                    if beats[c]['quartile'] != beat['quartile']
                ]

                if (failed_jumps >=
                        (.1 * len(beats))) and (len(non_quartile_candidates)
                                                > 0):

                    furthest_distance = max([
                        abs(beat['id'] - c) for c in non_quartile_candidates
                    ])

                    jump_to = next(c for c in non_quartile_candidates
                                   if abs(beat['id'] -
                                          c) == furthest_distance)

                    beat = beats[jump_to]
                    beats_since_jump = 0
                    failed_jumps = 0

                # uh oh! That fallback hasn't worked for yet ANOTHER 10%
                # of the song length. Something is seriously broken. Time
                # to punt and just start again from the first beat.

                elif failed_jumps >= (.2 * len(beats)):
                    beats_since_jump = 0
                    failed_jumps = 0
                    beat = beats[loop_bounds_begin]

                # assuming we're not in one of the failure modes but haven't found a good
                # candidate that hasn't been recently played, just play the next beat in the
                # sequence

                else:
                    beat = beats[beat['next']]

            else:

                # if it's time to jump and we have at least one good non-recent
                # candidate, let's just pick randomly from the list and go there

                beats_since_jump = 0
                failed_jumps = 0
                beat = beats[random.choice(non_recent_candidates)]

            # reset our sequence position counter and pick a new target length
            # between 16 and max_sequence_len, making sure it's evenly divisible by
            # 4 beats

            current_sequence = 0
            min_sequence = random.randrange(16, max_sequence_len, 4)

            # if we're in the place where we want to jump but can't because
            # we haven't found any good candidates, then set current_sequence equal to
            # min_sequence. During playback this will show up as having 00 beats remaining
            # until we next jump. That's the signal that we'll jump as soon as we possibly can.
            #
            # Code that reads play_vector and sees this value can choose to visualize this in some
            # interesting way.

            if beats_since_jump >= max_beats_between_jumps:
                current_sequence = min_sequence

            # add an entry to the play_vector
            play_vector.append({
                'beat': beat['id'],
                'seq_len': min_sequence,
                'seq_pos': current_sequence
            })

        else:

            # if we're not trying to jump then just add the next item to the play_vector
            play_vector.append({
                'beat': beat['next'],
                'seq_len': min_sequence,
                'seq_pos': current_sequence
            })
            beat = beats[beat['next']]
            beats_since_jump += 1

    # save off the beats array and play_vector. Signal
    # the play_ready event (if it's been set)

    self.beats = beats
    self.play_vector = play_vector

    self.__report_progress(1.0, "finished processing")

    if self.play_ready:
        self.play_ready.set()
for i in range(1, 5): COOKED_DIR = 'F:/项目/花城音乐项目/样式数据/ALL/旋律/' + path_index[i - 1] + '/' #savepath = 'F:\\mfcc_pic\\'+ str(i) +'\\' for root, dirs, files in os.walk(COOKED_DIR): print("Root = ", root, "dirs = ", dirs, "files = ", files) index = 0 for filename in files: print(filename) if filename.find('wav') <= 0: continue else: index = index + 1 path_one = COOKED_DIR + filename y, sr = load_and_trim(path_one) CQT = librosa.amplitude_to_db(librosa.cqt(y, sr=16000), ref=np.max) librosa.display.specshow(CQT) #plt.ylabel('Frequency') #plt.xlabel('Time(s)') #plt.show() fig = matplotlib.pyplot.gcf() fig.set_size_inches(4, 4) if "." in filename: Filename = filename.split(".")[0] plt.axis('off') plt.axes().get_xaxis().set_visible(False) plt.axes().get_yaxis().set_visible(False) #plt.rcParams['savefig.dpi'] = 300 # 图片像素 #plt.figure(figsize=(10, 10)) #plt.rcParams['figure.dpi'] = 300 # 分辨率 if filename.find('标准') > 0:
num = int(song_t.split('.')[0]) if year in [1960, 1965, 1970, 1975, 1980, 1985, 2000, 2005]: num = int(song[5:8]) ''' if num in arr_2015: print num,song,'already processed' continue ''' y, sr = librosa.load(year_dir+song) C = librosa.amplitude_to_db(librosa.cqt(y=y, sr=sr, bins_per_octave=BINS_PER_OCTAVE, n_bins=N_OCTAVES * BINS_PER_OCTAVE), ref=np.max) tempo, beats = librosa.beat.beat_track(y=y, sr=sr, trim=False) Csync = librosa.util.sync(C, beats, aggregate=np.median) beat_times = librosa.frames_to_time(librosa.util.fix_frames(beats, x_min=0, x_max=C.shape[1]), sr=sr) R = librosa.segment.recurrence_matrix(Csync, width=1, mode='affinity', sym=True)
def get_spec(y, sr):
    """Return ``(stft, cqt_magnitude, sr)`` for the signal ``y``.

    The CQT magnitude is computed at sampling rate ``sr``; the STFT uses
    librosa's defaults; ``sr`` is passed through unchanged.
    """
    cqt_magnitude = np.abs(librosa.cqt(y, sr=sr))
    stft_matrix = librosa.stft(y)
    return stft_matrix, cqt_magnitude, sr
def C(y, sr):
    """Return the magnitude of the constant-Q transform of ``y`` at ``sr``."""
    transform = librosa.cqt(y, sr=sr)
    return np.abs(transform)
def feature_examples(filepath):
    """Compute a tour of librosa features for one audio file.

    Example of various librosa features; see
    https://librosa.github.io/librosa/feature.html

    The first 29 seconds of ``filepath`` are loaded at the file's native
    rate, truncated to exactly 630000 samples, and each feature is computed
    in turn into the throwaway variable ``f``.  Nothing is returned.  Any
    exception, including the ``ValueError`` raised for clips shorter than
    the threshold, is reported on stdout instead of propagating.

    Parameters
    ----------
    filepath : str
        Path of the audio file to analyse.
    """
    threshold = 630000
    try:
        x, sr = librosa.load(filepath, sr=None, mono=True, duration=29.0)
        if len(x) < threshold:
            raise ValueError('song length is shorter than threshold')
        # Truncate and promote to float64 in one step; this matches the
        # earlier list round-trip without building a 630k-element list.
        x = x[:threshold].astype(np.float64)

        # zero_crossing_rate
        # returns (1,t)
        f = librosa.feature.zero_crossing_rate(x, frame_length=2048,
                                               hop_length=512)

        cqt = np.abs(
            librosa.cqt(x, sr=sr, hop_length=512, bins_per_octave=12,
                        n_bins=7 * 12, tuning=None))
        assert cqt.shape[0] == 7 * 12
        assert np.ceil(
            len(x) / 512) <= cqt.shape[1] <= np.ceil(len(x) / 512) + 1

        # chroma_cqt
        # returns (n_chroma, t)
        f = librosa.feature.chroma_cqt(C=cqt, n_chroma=12, n_octaves=7)

        # chroma_cens
        # returns (n_chroma, t)
        f = librosa.feature.chroma_cens(C=cqt, n_chroma=12, n_octaves=7)

        del cqt
        stft = np.abs(librosa.stft(x, n_fft=2048, hop_length=512))
        assert stft.shape[0] == 1 + 2048 // 2
        assert np.ceil(
            len(x) / 512) <= stft.shape[1] <= np.ceil(len(x) / 512) + 1
        del x

        # chroma_stft
        # returns (n_chroma, t)
        f = librosa.feature.chroma_stft(S=stft**2, n_chroma=12)

        # rms (named rmse in older librosa releases)
        # returns (1,t)
        # Without this guard a missing `rmse` raises AttributeError, which
        # the except below swallows, silently skipping every later feature.
        if hasattr(librosa.feature, 'rms'):
            f = librosa.feature.rms(S=stft)
        else:
            f = librosa.feature.rmse(S=stft)

        # spectral_centroid
        # returns (1,t)
        f = librosa.feature.spectral_centroid(S=stft)

        # spectral_bandwidth
        # returns (1,t)
        f = librosa.feature.spectral_bandwidth(S=stft)

        # spectral_contrast
        # returns (n_bands+1, t)
        f = librosa.feature.spectral_contrast(S=stft, n_bands=6)

        # spectral_rolloff
        # returns (1,t)
        f = librosa.feature.spectral_rolloff(S=stft)

        # mfcc
        # returns (n_mfcc, t)
        mel = librosa.feature.melspectrogram(sr=sr, S=stft**2)
        del stft
        f = librosa.feature.mfcc(S=librosa.power_to_db(mel), n_mfcc=20)

    except Exception as e:
        # Deliberate best-effort: report the failing file and carry on.
        print('{}: {}'.format(filepath, repr(e)))
# Draw several views of the signal `y` into a 4x2 subplot grid.
# `y` and `sr` are defined earlier in the script, outside this excerpt.

# dB-scaled STFT magnitude, referenced to its own maximum
D = librosa.amplitude_to_db(np.abs(librosa.stft(y)), ref=np.max)
plt.subplot(4, 2, 1)
librosa.display.specshow(D, y_axis='linear')
plt.colorbar(format='%+2.0f dB')
plt.title('Linear-frequency power spectrogram')

# Or on a logarithmic scale
plt.subplot(4, 2, 2)
librosa.display.specshow(D, y_axis='log')
plt.colorbar(format='%+2.0f dB')
plt.title('Log-frequency power spectrogram')

# Or use a CQT scale
CQT = librosa.amplitude_to_db(np.abs(librosa.cqt(y, sr=sr)), ref=np.max)
plt.subplot(4, 2, 3)
librosa.display.specshow(CQT, y_axis='cqt_note')
plt.colorbar(format='%+2.0f dB')
plt.title('Constant-Q power spectrogram (note)')

# Same CQT data, with the frequency axis labelled in Hz
plt.subplot(4, 2, 4)
librosa.display.specshow(CQT, y_axis='cqt_hz')
plt.colorbar(format='%+2.0f dB')
plt.title('Constant-Q power spectrogram (Hz)')

# Draw a chromagram with pitch classes
C = librosa.feature.chroma_cqt(y=y, sr=sr)
plt.subplot(4, 2, 5)
librosa.display.specshow(C, y_axis='chroma')
spt = [] ins = [] n = 0 for instrument, note in itertools.product(range(128), range(50)): y, sr = librosa.load('./Audio_Classification/output.wav', sr=None, offset=n, duration=2.0) # from n to n+2 n += 2 # adding white noise for data argumentation (0,1e-4,1e-3). for r in (0, 1e-4, 1e-3): ret = librosa.cqt(y + ((np.random.rand(*y.shape) - 0.5) * r if r else 0), sr, hop_length=1024, n_bins=24 * 7, bins_per_octave=24) ret = np.abs(ret) spt.append(ret) # save spectrogram as a numpy list ins.append((instrument, 38 + note)) # save instrument's number and note (labeling) for note in range(46): y, sr = librosa.load('./Audio_Classification/output.wav', sr=None, offset=n, duration=2.0) n += 2
def calc_cqt(x, fs=fs, hop_length=hop_length, n_bins=n_bins, mag_exp=mag_exp):
    """Return the dB-scaled CQT magnitude of ``x``.

    The magnitude is raised to the power ``mag_exp`` before conversion to
    dB, and the result is referenced to its own maximum.  The parameter
    defaults are taken from module-level values of the same names.
    """
    transform = librosa.cqt(x, sr=fs, hop_length=hop_length, fmin=None,
                            n_bins=n_bins)
    magnitude, _phase = librosa.magphase(transform)
    scaled = magnitude**mag_exp
    return librosa.core.amplitude_to_db(scaled, ref=np.max)
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np

# Round-trip demo: load 'test.wav', plot its constant-Q spectrogram,
# resynthesize the audio from the CQT and write it to 'testOut.wav'.
x, sr = librosa.load('test.wav')

# Alternative CQT settings tried earlier:
#   fmin=30, n_bins=16, bins_per_octave=2, hop_length=2**8
C = librosa.cqt(x, sr=sr)

# Earlier experiments, kept for reference:
#   print (C[:,5])
#   C[0:3,:] = 0
#   C[4:8,:] = 0

C_db = librosa.amplitude_to_db(np.abs(C), ref=np.max)
librosa.display.specshow(C_db, sr=sr, x_axis='time', y_axis='cqt_note')
plt.colorbar(format='%+2.0f dB')
plt.title('Constant-Q power spectrum')
plt.tight_layout()

# Invert the transform (alternative settings as above) and save the audio.
y = librosa.icqt(C, sr)
librosa.output.write_wav('testOut.wav', y, sr, norm=True)

plt.show()
def test_cqt_fail_short_early():
    """Run a 36-bin CQT at 44.1 kHz on a 16-sample all-zero signal."""
    # sampling rate is sufficiently above the top octave to trigger early downsampling
    too_short = np.zeros(16)
    librosa.cqt(too_short, sr=44100, n_bins=36)
def get_note_with_cqt_rms(filename):
    """Pick candidate onset frames from a binarised CQT and the RMS curve, then plot them.

    The audio in `filename` is loaded, its CQT is thresholded to a two-level
    image, and frames are collected in `result` when the CQT pattern test
    and/or the RMS rise test below succeed. Three stacked subplots are then
    drawn: the CQT (after the base-note helper has modified it), the
    normalised RMS curve, and the waveform.

    Parameters
    ----------
    filename : path of the audio file to analyse

    Returns
    -------
    The `plt` object used for drawing.

    NOTE(review): block nesting was reconstructed from a whitespace-collapsed
    source; the placement of `last_j = j` and of the inner `elif` is a best
    guess — confirm against the original file.
    NOTE(review): `result[0]` and `onstm[0]` are indexed unconditionally, so
    an audio file that yields no frames raises IndexError.
    """
    y, sr = librosa.load(filename)
    rms = librosa.feature.rmse(y=y)[0]
    # Scale the RMS curve by its own standard deviation.
    rms = [x / np.std(rms) for x in rms]
    time = librosa.get_duration(filename=filename)
    print("time is {}".format(time))
    # NOTE(review): the CQT is computed with sr=16000 while `y` was loaded
    # without an explicit rate, and the complex CQT is passed to
    # amplitude_to_db without np.abs — confirm both are intended.
    CQT = librosa.amplitude_to_db(librosa.cqt(y, sr=16000), ref=np.max)
    w, h = CQT.shape  # w: number of CQT rows (bins), h: number of frames
    print("w.h is {},{}".format(w, h))
    # NOTE(review): `onsets_frames` is not used again in this function.
    onsets_frames = get_real_onsets_frames_rhythm(y)
    # Binarise: cells above -20 dB take the maximum value, all others the minimum.
    CQT = np.where(CQT > -20, np.max(CQT), np.min(CQT))
    result = []  # accepted frame indices
    last = 0  # most recently accepted frame
    is_ok = 0
    #print("max is {}".format(np.max(CQT)))
    for i in range(15, h - 10):
        is_ok = 0  # per-frame evidence counter filled by the row scan below
        last_j = 100
        # Scan rows from the top bin down to row 16.
        for j in range(w - 1, 15, -1):
            # Cell switches from "off" in the previous frame to "on" in this one.
            if CQT[j, i] == np.max(CQT) and CQT[j, i - 1] == np.min(CQT):
                # "On" for the next 5 frames, "off" for frames i-4..i-2, and
                # more than 5 frames after the last accepted frame.
                if np.min(CQT[j, i:i + 5]) == np.max(CQT) and np.max(
                        CQT[j, i - 4:i - 1]) == np.min(CQT) and i - last > 5:
                    # Stricter variant: "on" for 10 frames, "off" for i-5..i-2.
                    if np.min(CQT[j, i:i + 10]) == np.max(CQT) and np.mean(
                            CQT[j, i - 5:i - 1]) == np.min(CQT):
                        #print("3... is {},{},{}".format(CQT[j, i - 4:i - 3],CQT[j, i - 3:i-2],i))
                        is_ok += 2
                        break
                    if last_j - j > 10:
                        is_ok += 1
                    last_j = j
                elif np.min(CQT[j, i:i + 5]) == np.max(CQT) and is_ok == 1:
                    is_ok += 1
                # elif np.min(CQT[j, i+1:i + 2]) == np.max(CQT):
                #     result.append(i)
        # Accept the frame when RMS is rising and the CQT scan scored above 1;
        # accepted frames must be more than 10 frames apart.
        if rms[i + 1] > rms[i] and is_ok > 1:
            if len(result) == 0:
                result.append(i)
                last = i
            elif i - result[-1] > 10:
                result.append(i)
                last = i
        # Otherwise accept on a large RMS jump alone (away from both ends),
        # with a minimum spacing of more than 8 frames.
        elif rms[i + 1] - rms[i - 1] > 0.75 and i > 50 and i < len(rms) - 45:
            if len(result) == 0:
                result.append(i)
                last = i
            elif i - result[-1] > 8:
                result.append(i)
                last = i
    rms_on_frames = [rms[x] for x in result]
    mean_rms_on_frames = np.mean(rms_on_frames)
    onstm = librosa.frames_to_time(result, sr=sr)
    #print("result is {}".format(result))
    longest_note = []
    for i in range(len(result)):
        x = result[i]
        # The segment ends at the next accepted frame; for the last one it
        # ends 20 frames later, capped at the number of CQT frames.
        if i < len(result) - 1:
            next_frame = result[i + 1]
        else:
            next_frame = result[-1] + 20 if result[-1] + 20 < CQT.shape[
                1] else CQT.shape[1]
        # NOTE(review): `note_line` is not used again in this function.
        note_line = get_note_line_by_block_for_frames(x, CQT)
        #print("x,note_line is {},{}".format(x,note_line))
        longest_note_line = find_the_longest_note_line(x, next_frame, CQT)
        longest_note.append(longest_note_line)
        #print("x,longest_note_line is {},{}".format(x, longest_note_line))
    #print("longest_note is {}".format(longest_note))
    # CQT[:,onsets_frames[1]:h] = -100
    plt.subplot(3, 1, 1)
    # NOTE(review): `total_frames_number` is not used again in this function.
    total_frames_number = get_total_frames_number(filename)
    #print("total_frames_number is {}".format(total_frames_number))
    # librosa.display.specshow(CQT)
    CQT, base_notes = add_base_note_to_cqt_for_filename_by_base_notes(
        filename, result, result[0], CQT, longest_note)
    # Shift base_notes so that their mean matches the mean of longest_note.
    base_notes = [
        x + int(np.mean(longest_note) - np.mean(base_notes)) for x in base_notes
    ]
    #print("base_notes is {}".format(base_notes))
    librosa.display.specshow(CQT, y_axis='cqt_note', x_axis='time')
    print(np.max(y))
    # onstm = librosa.frames_to_time(onsets_frames, sr=sr)
    plt.vlines(onstm, 0, sr, color='y', linestyle='solid')
    plt.subplot(3, 1, 2)
    plt.text(onstm[0], 1, result[0])
    # NOTE(review): `max_rms` is not used again in this function.
    max_rms = np.max(rms)
    # rms = np.diff(rms)
    times = librosa.frames_to_time(np.arange(len(rms)))
    # rms_on_onset_frames_cqt = [rms[x] for x in onset_frames_cqt]
    # min_rms_on_onset_frames_cqt = np.min(rms_on_onset_frames_cqt)
    # rms = [1 if x >=min_rms_on_onset_frames_cqt else 0 for x in rms]
    plt.plot(times, rms)
    plt.axhline(mean_rms_on_frames, color='r')
    # plt.axhline(min_rms_on_onset_frames_cqt)
    # plt.vlines(onsets_frames_rms_best_time, 0,np.max(rms), color='y', linestyle='solid')
    plt.vlines(onstm, 0, np.max(rms), color='y', linestyle='solid')
    # plt.vlines(base_onsets, 0, np.max(rms), color='r', linestyle='solid')
    plt.xlim(0, np.max(times))
    plt.subplot(3, 1, 3)
    librosa.display.waveplot(y, sr=sr)
    return plt
def test_griffinlim_cqt(
    y_chirp,
    hop_length,
    window,
    use_length,
    over_sample,
    fmin,
    res_type,
    pad_mode,
    scale,
    momentum,
    init,
    random_state,
    dtype,
):
    """Check that Griffin-Lim CQT inversion yields a finite signal of the requested dtype.

    A CQT of `y_chirp` is computed, its magnitude is inverted with three
    Griffin-Lim iterations and, separately, with a plain inverse CQT. When
    `use_length` is true the reconstruction must have the input's length.
    """
    length = len(y_chirp) if use_length else None

    sr = 22050
    bins_per_octave = 12 * over_sample
    n_bins = 6 * bins_per_octave

    # Keyword arguments given identically to the forward transform and to
    # both inversions.
    shared = dict(
        sr=sr,
        hop_length=hop_length,
        window=window,
        fmin=fmin,
        bins_per_octave=bins_per_octave,
        scale=scale,
        res_type=res_type,
    )

    C = librosa.cqt(y_chirp, n_bins=n_bins, pad_mode=pad_mode, **shared)
    Cmag = np.abs(C)

    y_rec = librosa.griffinlim_cqt(
        Cmag,
        pad_mode=pad_mode,
        n_iter=3,
        momentum=momentum,
        random_state=random_state,
        length=length,
        init=init,
        dtype=dtype,
        **shared
    )

    # The magnitude-only inverse is computed but not inspected below.
    y_inv = librosa.icqt(Cmag, length=length, **shared)

    # First check for length
    if use_length:
        assert len(y_rec) == length

    assert y_rec.dtype == dtype

    # Check that the data is okay
    assert np.all(np.isfinite(y_rec))
def compute_features(tid):
    """Compute summary statistics of several librosa features for track `tid`.

    Returns a float32 ``pd.Series`` named `tid`, indexed by ``columns()``.
    For every feature, seven statistics are taken along axis 1 of the feature
    matrix and stored under ``(feature_name, statistic_name)``. Any exception
    raised while loading or analysing the audio is printed and the series is
    returned with whatever had been filled in up to that point.
    """
    features = pd.Series(index=columns(), dtype=np.float32, name=tid)

    # Catch warnings as exceptions (audioread leaks file descriptors).
    warnings.filterwarnings('error', module='librosa')

    def feature_stats(name, values):
        """Store the per-row statistics of `values` under feature `name`."""
        reducers = (
            ('mean', np.mean),
            ('std', np.std),
            ('skew', stats.skew),
            ('kurtosis', stats.kurtosis),
            ('median', np.median),
            ('min', np.min),
            ('max', np.max),
        )
        for label, reducer in reducers:
            features[name, label] = reducer(values, axis=1)

    try:
        filepath = utils.get_audio_path(os.environ.get('AUDIO_DIR'), tid)
        x, sr = librosa.load(filepath, sr=None, mono=True)  # kaiser_fast

        # Lower bound on the frame count at hop 512; the transforms below may
        # produce one frame more.
        min_frames = np.ceil(len(x) / 512)

        f = librosa.feature.zero_crossing_rate(x, frame_length=2048, hop_length=512)
        feature_stats('zcr', f)

        # --- constant-Q based features ---
        cqt = np.abs(librosa.cqt(x, sr=sr, hop_length=512, bins_per_octave=12,
                                 n_bins=7 * 12, tuning=None))
        assert cqt.shape[0] == 7 * 12
        assert min_frames <= cqt.shape[1] <= min_frames + 1

        f = librosa.feature.chroma_cqt(C=cqt, n_chroma=12, n_octaves=7)
        feature_stats('chroma_cqt', f)
        f = librosa.feature.chroma_cens(C=cqt, n_chroma=12, n_octaves=7)
        feature_stats('chroma_cens', f)
        # Tonnetz is derived from the CENS chroma computed just above.
        f = librosa.feature.tonnetz(chroma=f)
        feature_stats('tonnetz', f)

        del cqt

        # --- STFT based features ---
        stft = np.abs(librosa.stft(x, n_fft=2048, hop_length=512))
        assert stft.shape[0] == 1 + 2048 // 2
        assert min_frames <= stft.shape[1] <= min_frames + 1
        del x

        f = librosa.feature.chroma_stft(S=stft ** 2, n_chroma=12)
        feature_stats('chroma_stft', f)

        f = librosa.feature.rmse(S=stft)
        feature_stats('rmse', f)

        f = librosa.feature.spectral_centroid(S=stft)
        feature_stats('spectral_centroid', f)
        f = librosa.feature.spectral_bandwidth(S=stft)
        feature_stats('spectral_bandwidth', f)
        f = librosa.feature.spectral_contrast(S=stft, n_bands=6)
        feature_stats('spectral_contrast', f)
        f = librosa.feature.spectral_rolloff(S=stft)
        feature_stats('spectral_rolloff', f)

        # --- mel / MFCC ---
        mel = librosa.feature.melspectrogram(sr=sr, S=stft ** 2)
        del stft
        f = librosa.feature.mfcc(S=librosa.power_to_db(mel), n_mfcc=20)
        feature_stats('mfcc', f)

    except Exception as e:
        # Best effort: report the failing track and return what was computed.
        print('{}: {}'.format(tid, repr(e)))

    return features
# Resample the original signal to the working rate `sr`. The rates are passed
# by keyword: they are keyword-only in current librosa, and these parameter
# names are also accepted by older releases.
y = librosa.resample(y_orig, orig_sr=sr_orig, target_sr=sr)
print("Resampled y and sr: ", len(y), sr)
IPython.display.Audio(data=y, rate=sr)
print(librosa.samples_to_time(len(y), sr=sr), "sec")

# Spectral Representations
D = librosa.stft(y)
print(D.shape, D.dtype)

# Split the STFT into magnitude and phase and check that their product
# reproduces the original.
S, phase = librosa.magphase(D)
print(S.dtype, phase.dtype, np.allclose(D, S * phase))

# Constant-Q Transform
C = librosa.cqt(y, sr=sr)
print(C.shape, C.dtype)

#
# librosa.feature
#
# Mel spectrogram computed from the waveform and from the squared STFT
# magnitude; the printout reports whether the two agree.
melspec = librosa.feature.melspectrogram(y=y, sr=sr)
melspec_stft = librosa.feature.melspectrogram(S=S**2, sr=sr)
print(np.allclose(melspec, melspec_stft))

chroma = librosa.feature.chroma_stft(y=y, sr=sr)

#
# librosa.display
#
offset=5, duration=25) #y, sr = librosa.load('/home/bmcfee/working/Battles - Tonto-it1CCNCHPc0.mp3', # offset=300, duration=30) #y, sr = librosa.load('/home/bmcfee/working/Conlon Nancarrow, Study for Player Piano No. 21 (Canon X)-f2gVhBxwRqg.m4a', # duration=30, offset=60) # In[787]: over_sample = 3 res_factor = 1 C = librosa.cqt(y, sr=sr, hop_length=librosa['hop_length'], bins_per_octave=int(12*over_sample), n_bins=int(8 * 12 * over_sample), real=False, filter_scale=res_factor, fmin=librosa.note_to_hz('C1'), scale=True) # # TODO # # - Implement `scale` for icqt # In[789]: y2 = icqt(C, sr=sr, hop_length=librosa['hop_length'], bins_per_octave=int(12 * over_sample),
import matplotlib.pyplot as plt import IPython.display as ipd import numpy as np import librosa.display sr = 22050 # sample rate T = 2.0 # seconds t = np.linspace(0, T, int(T * sr), endpoint=False) # time variable x = 0.5 * np.sin(2 * np.pi * 440 * t) # pure sine wave at 440 Hz x, sr = librosa.load("audio/piano.wav") fmin = librosa.midi_to_hz(36) hop_length = 512 C = librosa.cqt(x, sr=sr, fmin=fmin, n_bins=72, hop_length=hop_length) logC = librosa.amplitude_to_db(np.abs(C)) plt.figure(figsize=(15, 5)) librosa.display.specshow(logC, sr=sr, x_axis='time', y_axis='cqt_note', fmin=fmin, cmap='coolwarm') chromagram = librosa.feature.chroma_stft(x, sr=sr, hop_length=hop_length) plt.figure(figsize=(15, 5)) librosa.display.specshow(chromagram, x_axis='time', y_axis='chroma',
audio = audio[indices[0]:indices[-1]] if indices.size else audio[0:0] return audio, sr #y, sr = librosa.load(filename) y, sr = load_and_trim(filename) chroma_orig = librosa.feature.chroma_cqt(y=y, sr=sr) # For display purposes, let's zoom in on a 15-second chunk from the middle of the song #idx = tuple([slice(None), slice(*list(librosa.time_to_frames([45, 60])))]) # And for comparison, we'll show the CQT matrix as well. #C = np.abs(librosa.cqt(y=y, sr=sr, bins_per_octave=12*3, n_bins=7*12*3)) C = np.abs(librosa.cqt(y=y, sr=sr)) plt.figure(figsize=(12, 4)) plt.subplot(2, 1, 1) librosa.display.specshow(librosa.amplitude_to_db(C, ref=np.max), y_axis='cqt_note') plt.colorbar() plt.subplot(2, 1, 2) librosa.display.specshow(chroma_orig, y_axis='chroma') plt.colorbar() plt.ylabel('Original') plt.tight_layout() chroma_os = librosa.feature.chroma_cqt(y=y, sr=sr) plt.figure(figsize=(12, 4))
life_on_mars_fname = "/Users/bgeelen/Music/iTunes/iTunes Media/Music/Compilations/Life on Mars/02 Life on Mars_.mp3" # life_on_mars_fname = '/Users/bgeelen/Downloads/Tones and I - Dance Monkey (Lyrics).mp3' life_on_mars_wav, sr = librosa.load(life_on_mars_fname) my_funny_valentine_fname = "/Users/bgeelen/Music/iTunes/iTunes Media/Music/Chet Baker/Chet Baker Sings/10 My Funny Valentine.mp3" #%% chromae = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"] n_octaves = 5 n_bins = 12 * n_octaves + 1 hop_length = 128 sparsity = 0.9 fmin = librosa.note_to_hz("C2") C = librosa.cqt( life_on_mars_wav, sr=sr, hop_length=hop_length, fmin=fmin, sparsity=sparsity, n_bins=n_bins, ) abs_C = np.abs(C) log_C = np.log(abs_C + 0.01) #%% h, w = C.shape plt.figure(figsize=(7.5, 3), dpi=300) plt.imshow(log_C[:, :10 * sr // hop_length], interpolation="nearest", aspect="auto") plt.gca().invert_yaxis()
# # plz = cutter * sr # whatthe = y.shape # # if y.size > cutter * sr: # y = y[0:sr] # Normalize librosa.util.normalize(y, norm=1) # Let's make a spectrogram (freq, power) Spec = librosa.amplitude_to_db(abs(librosa.stft(y, n_fft=2048)), ref=np.max) # Let's make a CQT C = librosa.amplitude_to_db(abs(librosa.cqt(y, sr=sr))) # Let's make and display a mel-scaled power (energy-squared) spectrogram S = librosa.feature.melspectrogram(y, sr=sr, n_mels=128) # Convert to log scale (dB). We'll use the peak power (max) as reference. log_S = librosa.power_to_db(S, ref=np.max) # Next, we'll extract the top 13 Mel-frequency cepstral coefficients (MFCCs) mfcc = librosa.feature.mfcc(S=log_S, n_mfcc=13) # Let's pad on the first and second deltas while we're at it delta_mfcc = librosa.feature.delta(mfcc) delta2_mfcc = librosa.feature.delta(mfcc, order=2) if len(Spec[0]) == len(C[0]) == len(S[0]) == len(log_S[0]) == len(