def transform_audio(audio, n_fft=2048, n_mels=40, sr=22050, hop_length=512,
                    fmin=None, fmax=None):
    # MIDI values of 24 (C2) and 120 (C10) are chosen, since humans typically
    # can't hear much beyond this range.
    if not fmin:
        fmin = librosa.midi_to_hz(24)
    if not fmax:
        fmax = librosa.midi_to_hz(120)
    # First stage is a mel-frequency spectrogram of bounded range.
    mel = librosa.feature.melspectrogram(audio, sr=sr, n_fft=n_fft,
                                         hop_length=hop_length, n_mels=n_mels,
                                         fmax=fmax, fmin=fmin)
    # Second stage is log-amplitude; power is relative to peak in the signal.
    log_amplitude = librosa.logamplitude(mel, ref_power=np.max)
    # Third stage transposes the data so that frames become samples.
    # Its shape is:
    #   (length of audio / frame duration, number of mel bands)
    transpose = np.transpose(log_amplitude)
    return (transpose, {'n_fft': n_fft, 'n_mels': n_mels, 'sr': sr,
                        'hop_length': hop_length, 'fmin': fmin, 'fmax': fmax})
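A minimal usage sketch for transform_audio above, assuming librosa and numpy are imported as in the snippet and that 'example.wav' is a placeholder path (the snippet relies on the older librosa.logamplitude API, so it also assumes a correspondingly old librosa version):

# Hypothetical usage: load audio, compute the feature matrix and its settings
y, sr = librosa.load('example.wav', sr=22050)
features, params = transform_audio(y, sr=sr)
# features has shape (n_frames, n_mels); params records the analysis settings
print(features.shape, params['fmin'], params['fmax'])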
def plot_gram(gram):
    '''
    Plots a *gram (cqt-gram, piano roll-gram).

    :parameters:
        - gram : np.ndarray
            A 2-d representation of time/frequency, with frequencies being the
            notes between MIDI note 36 and 96.
    '''
    librosa.display.specshow(gram, x_axis='frames', y_axis='cqt_note',
                             fmin=librosa.midi_to_hz(36),
                             fmax=librosa.midi_to_hz(96))
def getChromagram(filename):
    x, sr = librosa.load(filename)
    fmin = librosa.midi_to_hz(36)
    hop_length = 512
    C = librosa.cqt(x, sr=sr, fmin=fmin, n_bins=72, hop_length=hop_length)
    chromagram = librosa.feature.chroma_stft(x, sr=sr, hop_length=hop_length)
    return chromagram
def log_filter_bank_fn():
    """ generate a logarithmic filterbank """
    log_filter_bank_basis = madmom.audio.filters.LogarithmicFilterbank(
        bin_frequencies=librosa.fft_frequencies(sr=16000, n_fft=2048),
        num_bands=48,
        fmin=librosa.midi_to_hz([27])[0],
        fmax=librosa.midi_to_hz([114])[0] * 2. ** (1. / 48)
    )
    log_filter_bank_basis = np.array(log_filter_bank_basis)
    assert log_filter_bank_basis.shape[1] == 229
    assert np.abs(np.sum(log_filter_bank_basis[:, 0]) - 1.) < 1e-3
    assert np.abs(np.sum(log_filter_bank_basis[:, -1]) - 1.) < 1e-3
    return log_filter_bank_basis
def stft_module(y, plot=False):
    stft_spectrum = lb.stft(y, n_fft=1024, hop_length=512, center=True,
                            dtype=np.complex64)
    stft = np.abs(stft_spectrum)  # compute the amplitude
    if plot:  # For testing
        plt.figure()
        print(stft.shape)  # = (1 + n_fft/2, t), t=431 if hop_length=512
        plt.subplot(211)
        # plt.imshow(stft)
        # plt.colorbar(format='%+2.0f dB')
        plt.plot(stft[:, 100])  # Plot a single frame
        plt.title('100th frame')
        plt.subplot(212)
        # Note: lb.display must be put just after the other plots
        lb.display.specshow(lb.amplitude_to_db(stft), sr=sr,
                            fmin=lb.midi_to_hz(RangeMIDInotes[0]),
                            x_axis='time', y_axis='linear')
        plt.colorbar(format='%+2.0f dB')
        plt.title('STFT_Linear-frequency power spectrogram')
    return stft
def extract_cqt(audio_data):
    '''
    CQT routine with default parameters filled in, and some post-processing.

    Parameters
    ----------
    audio_data : np.ndarray
        Audio data to compute CQT of

    Returns
    -------
    cqt : np.ndarray
        CQT of the supplied audio data.
    frame_times : np.ndarray
        Times, in seconds, of each frame in the CQT
    '''
    # Compute CQT
    cqt = librosa.cqt(audio_data, sr=FS, fmin=librosa.midi_to_hz(NOTE_START),
                      n_bins=N_NOTES, hop_length=HOP_LENGTH, tuning=0.)
    # Compute the time of each frame
    times = librosa.frames_to_time(
        np.arange(cqt.shape[1]), sr=FS, hop_length=HOP_LENGTH)
    # Use float32 for the cqt to save space/memory
    cqt = cqt.astype(np.float32)
    return cqt, times
def audio_cqt(audio_data, fs=AUDIO_FS):
    '''
    Compute some audio data's constant-Q spectrogram, normalize, and
    log-scale it

    Parameters
    ----------
    audio_data : np.ndarray
        Some audio signal.
    fs : int
        Sampling rate the audio data is sampled at, should be ``AUDIO_FS``.

    Returns
    -------
    audio_gram : np.ndarray
        Log-magnitude, L2-normalized constant-Q spectrogram of the audio data.
    '''
    # Compute CQT of the audio data
    audio_gram = librosa.cqt(audio_data, sr=fs, hop_length=AUDIO_HOP,
                             fmin=librosa.midi_to_hz(NOTE_START),
                             n_bins=N_NOTES)
    # L2-normalize and log-magnitude it
    return audio_gram
def test_cqt_position():
    # synthesize a two second sine wave at midi note 60
    sr = 22050
    freq = librosa.midi_to_hz(60)
    y = np.sin(2 * np.pi * freq * np.linspace(0, 2.0, 2 * sr))

    def __test(note_min):
        C = librosa.cqt(y, sr=sr, fmin=librosa.midi_to_hz(note_min))

        # Average over time
        Cbar = np.median(C, axis=1)

        # Find the peak
        idx = np.argmax(Cbar)

        eq_(idx, 60 - note_min)

        # Make sure that the max outside the peak is sufficiently small
        Cscale = Cbar / Cbar[idx]
        Cscale[idx] = np.nan
        assert np.nanmax(Cscale) < 1e-1

        Cscale[idx-1:idx+2] = np.nan
        assert np.nanmax(Cscale) < 1e-2

    for note_min in [12, 18, 24, 30, 36]:
        yield __test, note_min
def test_midi_to_hz_is_accurate(self):
    """Tests converting between MIDI values and their frequencies in hertz."""
    midi = np.arange(128)
    librosa_hz = librosa.midi_to_hz(midi)
    with self.cached_session() as sess:
        tf_hz = sess.run(core.midi_to_hz(midi))
    self.assertAllClose(librosa_hz, tf_hz)
def shift_f0(audio_features, pitch_shift=0.0):
    """Shift f0 by a number of octaves."""
    audio_features['f0_hz'] *= 2.0 ** (pitch_shift)
    audio_features['f0_hz'] = np.clip(audio_features['f0_hz'],
                                      0.0,
                                      librosa.midi_to_hz(110.0))
    return audio_features
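A small illustration of the clipping behaviour, assuming shift_f0 and numpy/librosa are available; the input values below are made up, and the frequencies are just librosa's equal-temperament values:

# Hypothetical input: shifting up one octave doubles f0, but values are
# clipped at librosa.midi_to_hz(110) (about 4698.6 Hz)
feats = {'f0_hz': np.array([220.0, 3000.0])}
feats = shift_f0(feats, pitch_shift=1.0)
# -> approximately [440.0, 4698.6]; the second value hits the clip ceiling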
def __init__(self, parent, controller):
    tk.Frame.__init__(self, parent)
    label = tk.Label(self, text='Constant-Q Plot')
    label.pack(side=tk.TOP)
    prev_plot = tk.Button(self, text='<--Prev Plot',
                          command=lambda: controller.show_frame('MelView'))
    prev_plot.pack(side=tk.BOTTOM)
    next_plot = tk.Button(self, text='Next Plot-->',
                          command=lambda: controller.show_frame('OnsetView'))
    next_plot.pack(side=tk.TOP)

    ## call fetch data for plots
    plt.style.use('ggplot')
    c = controller.fetch_data('const_q')
    fig = plt.figure(figsize=(10, 10), dpi=100)  # make figure
    fig.add_subplot(111)
    fmin = lsa.midi_to_hz(48)
    lsa.display.specshow(c, x_axis='time', y_axis='cqt_note', fmin=fmin,
                         cmap='coolwarm')
    plt.tight_layout()
    canvas = FigureCanvasTkAgg(fig, self)
    canvas.get_tk_widget().pack(side=tk.TOP, expand=True)
def audio_to_cqt_and_onset_strength(audio, fs=22050, hop=512):
    '''
    Feature extraction for audio data.
    Gets a power CQT of the harmonic component and an onset strength signal
    of the percussive component.

    Input:
        audio - audio data
        fs - sampling rate of the audio data, default 22050
        hop - hop length for cqt, default 512, onset strength hop will be 1/4
              of this
    Output:
        audio_gram - CQT of audio data
        audio_onset_strength - onset strength signal
    '''
    # Use harmonic part for gram, percussive part for onsets
    H, P = librosa.decompose.hpss(librosa.stft(audio))
    audio_harmonic = librosa.istft(H)
    audio_percussive = librosa.istft(P)
    # Compute log-frequency spectrogram of original audio
    audio_gram = np.abs(librosa.cqt(y=audio_harmonic, sr=fs, hop_length=hop,
                                    fmin=librosa.midi_to_hz(36),
                                    n_bins=60))**2
    # Beat track the audio file at 4x the hop rate
    audio_onset_strength = librosa.onset.onset_strength(audio_percussive,
                                                        hop_length=hop/4,
                                                        sr=fs)
    return audio_gram, audio_onset_strength
def midi_cqt(midi_object):
    '''
    Synthesize MIDI data, compute its constant-Q spectrogram, normalize, and
    log-scale it

    Parameters
    ----------
    midi_object : pretty_midi.PrettyMIDI
        MIDI data to create constant-Q spectrogram of.

    Returns
    -------
    midi_gram : np.ndarray
        Log-magnitude, L2-normalized constant-Q spectrogram of synthesized
        MIDI data.
    '''
    # Synthesize MIDI object as audio data
    midi_audio = fast_fluidsynth(midi_object, MIDI_FS)
    # Compute CQT of the synthesized audio data
    midi_gram = librosa.cqt(midi_audio, sr=MIDI_FS, hop_length=MIDI_HOP,
                            fmin=librosa.midi_to_hz(NOTE_START),
                            n_bins=N_NOTES)
    # L2-normalize and log-magnitude it
    return midi_gram
def post_process_features(gram, beats):
    '''
    Apply processing to a feature matrix given the supplied param values

    Parameters
    ----------
    gram : np.ndarray
        Feature matrix, shape (n_features, n_samples)
    beats : np.ndarray
        Indices of beat locations in gram

    Returns
    -------
    gram : np.ndarray
        Feature matrix, shape (n_samples, n_features), post-processed
        according to the values in `params`
    '''
    # Convert to chroma
    if params['feature'] == 'chroma':
        gram = librosa.feature.chroma_cqt(
            C=gram, fmin=librosa.midi_to_hz(create_data.NOTE_START))
    # Beat-synchronize the feature matrix
    if params['beat_sync']:
        gram = librosa.feature.sync(gram, beats, pad=False)
    # Compute log magnitude
    gram = librosa.logamplitude(gram, ref_power=gram.max())
    # Normalize the feature vectors
    gram = librosa.util.normalize(gram, norm=params['norm'])
    # Standardize the feature vectors
    if params['standardize']:
        gram = scipy.stats.mstats.zscore(gram, axis=1)
    # Transpose it to (n_samples, n_features) and return it
    return gram.T
def midi_to_cqt(midi, sf2_path=None, fs=22050, hop=512):
    '''
    Feature extraction routine for midi data, converts to a drum-free,
    percussion-suppressed CQT.

    Input:
        midi - pretty_midi.PrettyMIDI object
        sf2_path - path to .sf2 file to pass to pretty_midi.fluidsynth
        fs - sampling rate to synthesize audio at, default 22050
        hop - hop length for cqt, default 512
    Output:
        midi_gram - Simulated CQT of the midi data
    '''
    # Synthesize the MIDI using the supplied sf2 path
    midi_audio = midi.fluidsynth(fs=fs, sf2_path=sf2_path)
    # Use the harmonic part of the signal
    H, P = librosa.decompose.hpss(librosa.stft(midi_audio))
    midi_audio_harmonic = librosa.istft(H)
    # Compute log frequency spectrogram of audio synthesized from MIDI
    midi_gram = np.abs(librosa.cqt(y=midi_audio_harmonic, sr=fs,
                                   hop_length=hop,
                                   fmin=librosa.midi_to_hz(36),
                                   n_bins=60, tuning=0.0))**2
    return midi_gram
def process_one_file(midi_filename, skip=True):
    '''
    Load in midi data, compute features, and write out file

    :parameters:
        - midi_filename : str
            Full path to midi file
        - skip : bool
            Whether to skip creating the file when the npz already exists
    '''
    # npz files go in the 'npz' dir instead of 'mid'
    output_filename = mid_to_npz_path(midi_filename)
    # Skip files already created
    if skip and os.path.exists(output_filename):
        return
    try:
        m = pretty_midi.PrettyMIDI(midi_filename)
        midi_audio = alignment_utils.fast_fluidsynth(m, MIDI_FS)
        midi_gram = librosa.cqt(
            midi_audio, sr=MIDI_FS, hop_length=MIDI_HOP,
            fmin=librosa.midi_to_hz(NOTE_START), n_bins=N_NOTES)
        midi_beats, midi_tempo = alignment_utils.midi_beat_track(m)
        midi_sync_gram = alignment_utils.post_process_cqt(
            midi_gram, librosa.time_to_frames(
                midi_beats, sr=MIDI_FS, hop_length=MIDI_HOP))
        np.savez_compressed(
            output_filename, sync_gram=midi_sync_gram,
            beats=midi_beats, bpm=midi_tempo)
    except Exception as e:
        print "Error processing {}: {}".format(midi_filename, e)
def _midi_to_hz(x, idx, log_f0=False):
    z = np.zeros(len(x))
    indices = x[:, idx] > 0
    z[indices] = librosa.midi_to_hz(x[indices, idx])
    if log_f0:
        z[indices] = np.log(z[indices])
    return z
def test_midi_to_hz_is_accurate(self):
    """Tests converting between MIDI values and their frequencies in hertz."""
    midi = np.arange(128)
    librosa_hz = librosa.midi_to_hz(midi)
    th_hz = core.midi_to_hz(midi)
    assert np.allclose(librosa_hz, th_hz)
def midi_to_cqt(midi, sf2_path=None, fs=22050, hop=512):
    '''
    Feature extraction routine for midi data, converts to a drum-free,
    percussion-suppressed CQT.

    Input:
        midi - pretty_midi.PrettyMIDI object
        sf2_path - path to .sf2 file to pass to pretty_midi.fluidsynth
        fs - sampling rate to synthesize audio at, default 22050
        hop - hop length for cqt, default 512
    Output:
        midi_gram - Simulated CQT of the midi data
    '''
    # Create a copy of the midi object
    midi_no_drums = copy.deepcopy(midi)
    # Remove the drums (filter instead of deleting while iterating, which can
    # skip adjacent drum instruments)
    midi_no_drums.instruments = [instrument for instrument
                                 in midi_no_drums.instruments
                                 if not instrument.is_drum]
    # Synthesize the MIDI using the supplied sf2 path
    midi_audio = midi_no_drums.fluidsynth(fs=fs, sf2_path=sf2_path)
    # midi_audio = midi_no_drums.synthesize(fs=fs)
    # Use the harmonic part of the signal
    H, P = librosa.decompose.hpss(librosa.stft(midi_audio))
    midi_audio_harmonic = librosa.istft(H)
    # Compute log frequency spectrogram of audio synthesized from MIDI
    midi_gram = np.abs(librosa.cqt(y=midi_audio_harmonic, sr=fs,
                                   hop_length=hop,
                                   fmin=librosa.midi_to_hz(36),
                                   n_bins=60, tuning=0.0))**2
    return midi_gram
def test_estimate_tuning():

    def __test(target_hz, resolution, bins_per_octave, tuning):
        y = np.sin(2 * np.pi * target_hz * t)
        tuning_est = librosa.estimate_tuning(resolution=resolution,
                                             bins_per_octave=bins_per_octave,
                                             y=y, sr=sr, n_fft=2048,
                                             fmin=librosa.note_to_hz('C4'),
                                             fmax=librosa.note_to_hz('G#9'))

        # Round to the proper number of decimals
        deviation = np.around(np.abs(tuning - tuning_est),
                              int(-np.log10(resolution)))

        # We'll accept an answer within three bins of the resolution
        assert deviation <= 3 * resolution

    for sr in [11025, 22050]:
        duration = 5.0
        t = np.linspace(0, duration, int(duration * sr))

        for resolution in [1e-2]:
            for bins_per_octave in [12]:
                # test a null-signal tuning estimate
                yield (__test, 0.0, resolution, bins_per_octave, 0.0)

                for center_note in [69, 84, 108]:
                    for tuning in np.linspace(-0.5, 0.5, 8, endpoint=False):
                        target_hz = librosa.midi_to_hz(center_note + tuning)
                        yield (__test, np.asscalar(target_hz),
                               resolution, bins_per_octave, tuning)
def test_FreqToPitchClass(self, low='A1', high='A6', res=120):

    def check(freqs, labels, ifreqs):
        self.assertIsInstance(labels, np.ndarray)
        self.assertEqual(labels.dtype, np.int)
        for freq, label, ifreq in zip(freqs, labels, ifreqs):
            if freq >= fLow and freq < fHigh:
                self.assertGreaterEqual(label, 0)
                self.assertLess(label, res)
                self.assertGreater(freq, ifreq)
                self.assertLess(freq, ifreq * rDelta)
            elif freq >= fHigh:
                self.assertEqual(label, res - 1)
            elif freq < fLow and freq > 0:
                self.assertEqual(label, 0)
            else:
                self.assertEqual(label, -1)

    fLow = librosa.note_to_hz(low)
    fHigh = librosa.note_to_hz(high)
    rDelta = np.power(fHigh / fLow, 1. / res)
    self.assertGreater(rDelta, 1.)
    transform = FreqToPitchClass(low=low, high=high, resolution=res)
    labels = np.arange(-10, 128 + 10)
    rfreqs = np.append(librosa.midi_to_hz(labels), 0)
    rfreqs = np.repeat(rfreqs, 30)
    freqs = rfreqs * np.random.uniform(
        low=1. / rDelta, high=rDelta, size=rfreqs.shape)  # add some deviation
    sample = {'freqs': freqs}
    t_sample = transform(sample)
    it_sample = transform.inv(copy(t_sample))
    check(freqs, t_sample['labels'], it_sample['freqs'])
def gen_onsets_info(data, t_unit=0.02):
    #logging.debug("Data shape: %s", data.shape)
    pitches = []
    intervals = []
    lowest_pitch = librosa.note_to_midi("A0")
    for i in range(data.shape[1]):
        notes = find_occur(data[:, i], t_unit=t_unit)
        it = []
        for nn in notes:
            it.append([nn["onset"]*t_unit, (nn["onset"]+2)*t_unit])
        if len(intervals) == 0 and len(it) > 0:
            intervals = np.array(it)
        elif len(it) > 0:
            intervals = np.concatenate((intervals, np.array(it)), axis=0)
        # hz = CentralFrequency[i]
        hz = librosa.midi_to_hz(lowest_pitch + i)
        for _ in range(len(it)):
            pitches.append(hz)
    if type(intervals) == list:
        intervals = np.array([]).reshape((0, 2))
    pitches = np.array(pitches)
    return intervals, pitches
def features(filename):
    # print '\t[1/5] loading audio'
    y, sr = librosa.load(filename, sr=SR)

    # print '\t[2/5] Separating harmonic and percussive signals'
    y_perc, y_harm = hp_sep(y)

    # print '\t[3/5] detecting beats'
    bpm, beats = get_beats(y=y_perc, sr=sr, hop_length=HOP_LENGTH)

    # print '\t[4/5] generating CQT'
    M1 = np.abs(librosa.cqt(y=y_harm, sr=sr, hop_length=HOP_LENGTH,
                            bins_per_octave=12, fmin=librosa.midi_to_hz(24),
                            n_bins=72))
    M1 = librosa.logamplitude(M1 ** 2.0, ref_power=np.max)

    # print '\t[5/5] generating MFCC'
    S = librosa.feature.melspectrogram(y=y, sr=sr, hop_length=HOP_LENGTH,
                                       n_mels=N_MELS)
    M2 = librosa.feature.mfcc(S=librosa.logamplitude(S), n_mfcc=N_MFCC)

    n = min(M1.shape[1], M2.shape[1])

    beats = beats[beats < n]
    beats = np.unique(np.concatenate([[0], beats]))
    times = librosa.frames_to_time(beats, sr=sr, hop_length=HOP_LENGTH)
    times = np.concatenate([times, [float(len(y)) / sr]])

    M1 = librosa.feature.sync(M1, beats, aggregate=np.median)
    M2 = librosa.feature.sync(M2, beats, aggregate=np.mean)
    return (M1, M2), times
def test_cqt_position():
    # synthesize a two second sine wave at midi note 60
    sr = 22050
    freq = librosa.midi_to_hz(60)
    y = np.sin(2 * np.pi * freq * np.linspace(0, 2.0, 2 * sr))

    def __test(note_min):
        C = np.abs(librosa.cqt(y, sr=sr,
                               fmin=librosa.midi_to_hz(note_min)))**2

        # Average over time
        Cbar = np.median(C, axis=1)

        # Find the peak
        idx = np.argmax(Cbar)

        eq_(idx, 60 - note_min)

        # Make sure that the max outside the peak is sufficiently small
        Cscale = Cbar / Cbar[idx]
        Cscale[idx] = np.nan
        assert np.nanmax(Cscale) < 6e-1, Cscale

        Cscale[idx - 1:idx + 2] = np.nan
        assert np.nanmax(Cscale) < 5e-2, Cscale

    for note_min in [12, 18, 24, 30, 36]:
        yield __test, note_min
def gen_onsets_info_from_notes(midi_notes, t_unit=0.02):
    intervals = []
    pitches = []
    for note in midi_notes:
        intervals.append([note.start, note.end])
        pitches.append(librosa.midi_to_hz(note.pitch))
    return np.array(intervals), np.array(pitches)
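A hypothetical usage sketch, assuming the note list comes from pretty_midi (any object exposing .start, .end, and .pitch attributes would work the same way):

import pretty_midi

notes = [pretty_midi.Note(velocity=100, pitch=60, start=0.0, end=0.5),
         pretty_midi.Note(velocity=100, pitch=64, start=0.5, end=1.0)]
intervals, pitches = gen_onsets_info_from_notes(notes)
# intervals.shape == (2, 2); pitches are roughly [261.63, 329.63] Hz (C4, E4)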
def save_spectrogram_plot(audio: Any,
                          sample_rate: int = 16000,
                          filename: Optional[str] = None,
                          output_dir: str = "output") -> None:
    """
    Saves the spectrogram plot of the given audio to the given filename in the
    given output_dir. The resulting plot is a Constant-Q transform (CQT)
    spectrogram with the vertical axis being the amplitude converted to
    dB-scale.

    :param audio: the audio content, as a floating point time series
    :param sample_rate: the sampling rate of the file
    :param filename: the optional filename, set to "%Y-%m-%d_%H%M%S".png if None
    :param output_dir: the output dir
    """
    os.makedirs(output_dir, exist_ok=True)

    # Pitch min and max corresponds to the pitch min and max
    # of the wavenet training checkpoint
    pitch_min = np.min(36)
    pitch_max = np.max(84)
    frequency_min = librosa.midi_to_hz(pitch_min)
    frequency_max = 2 * librosa.midi_to_hz(pitch_max)
    octaves = int(np.ceil(np.log2(frequency_max) - np.log2(frequency_min)))
    bins_per_octave = 32
    num_bins = int(bins_per_octave * octaves)
    hop_length = 2048
    constant_q_transform = librosa.cqt(audio,
                                       sr=sample_rate,
                                       hop_length=hop_length,
                                       fmin=frequency_min,
                                       n_bins=num_bins,
                                       bins_per_octave=bins_per_octave)
    plt.figure()
    plt.axis("off")
    librosa.display.specshow(librosa.amplitude_to_db(constant_q_transform,
                                                     ref=np.max),
                             sr=sample_rate)

    if not filename:
        date_and_time = time.strftime("%Y-%m-%d_%H%M%S")
        filename = f"{date_and_time}.png"
    path = os.path.join(output_dir, filename)
    plt.savefig(fname=path, dpi=600)
    plt.close()
def get_cqt(y, filter_scale=1):
    return np.abs(
        librosa.cqt(y, sr=44100, hop_length=1024,
                    fmin=librosa.midi_to_hz(36),
                    n_bins=84 * 2, bins_per_octave=12 * 2,
                    filter_scale=filter_scale)).T
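A small sanity-check sketch for get_cqt above (assumes numpy and librosa are imported): a pure 440 Hz tone should peak in the bin corresponding to MIDI note 69.

sr = 44100
t = np.arange(sr) / sr                      # one second of audio
y = np.sin(2 * np.pi * 440.0 * t)
C = get_cqt(y)                              # shape (n_frames, 168)
peak_bin = np.argmax(np.median(C, axis=0))
# With fmin at MIDI 36 and 24 bins per octave, the peak should land near
# bin (69 - 36) * 2 = 66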
def _load_f0(f0_path):
    with open(f0_path) as fhandle:
        lines = fhandle.readlines()
    f0_midi = np.array([float(line) for line in lines])
    f0_hz = librosa.midi_to_hz(f0_midi) * (f0_midi > 0)
    confidence = (f0_hz > 0).astype(int)
    times = np.arange(len(f0_midi)) * IKALA_TIME_STEP
    f0_data = F0Data(times, f0_hz, confidence)
    return f0_data
def play(self, note: Note, smpRt: int) -> np.ndarray:
    event, c_pitch = self.get_event(note.pitch)
    ratio = (len(event.data) / smpRt) / note.duration
    ratio = np.clip(ratio, 0.5, 100)
    shift = note.pitch - c_pitch
    wave = lr.effects.harmonic(event.data)
    wave = (Fx().pitch(shift * 100).tempo(ratio)
            .highpass(lr.midi_to_hz(note.pitch)))(wave)
    wave = envl.adsr(len(wave))(wave)
    return wave
def show_cqt(y, sr, bins_per_octave):
    C = np.abs(
        librosa.cqt(y, sr=sr, fmin=librosa.midi_to_hz(0),
                    n_bins=5 * bins_per_octave,
                    bins_per_octave=bins_per_octave))
    import librosa.display as display
    display.specshow(librosa.amplitude_to_db(C, ref=np.max), sr=sr,
                     x_axis='time', y_axis='cqt_note',
                     bins_per_octave=bins_per_octave,
                     fmin=librosa.midi_to_hz(0),
                     fmax=librosa.midi_to_hz(127))
    plt.colorbar(format='%+2.0f dB')
    plt.title('Constant-Q power spectrum')
    plt.tight_layout()
    plt.show()
def freq(tone):
    # convert midi/note to hz.
    # see https://github.com/gpiozero/gpiozero/blob/master/gpiozero/tones.py#L114
    # 0 is treated as None, not midi 8.1hz
    if isinstance(tone, str):
        return librosa.note_to_hz(tone)
    elif isinstance(tone, int) and 0 < tone < 128:
        return librosa.midi_to_hz(tone)
    else:
        return tone
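For reference, the three branches of freq behave roughly as follows (the values come from librosa's A440 equal-temperament mapping; this is just an illustrative sketch):

freq('A4')    # 440.0  -- note name via librosa.note_to_hz
freq(69)      # 440.0  -- MIDI number via librosa.midi_to_hz
freq(0)       # 0      -- falls through; 0 is not treated as MIDI 8.18 Hz
freq(261.6)   # 261.6  -- already in Hz, returned unchanged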
def initialize_components_unlimited_partials(signal, pitches, phi=1):
    n_features, _n_samples = signal.S.shape
    n_components = len(pitches)
    W_init = numpy.zeros((n_features, n_components))
    fft_freqs = librosa.fft_frequencies(signal.sr, signal.n_fft)
    for i, pitch in enumerate(pitches):
        #print(i, pitch)
        # freq = librosa.midi_to_hz(pitch)
        partial = 1
        while True:
            min_freq = librosa.midi_to_hz(pitch - phi) * partial
            max_freq = librosa.midi_to_hz(pitch + phi) * partial
            max_freq = min(fft_freqs[-1], max_freq)
            intensity = 1 / (partial**2)
            #print('\t%s-%s (%s-%s): %s' % (freq_to_bin(min_freq), freq_to_bin(max_freq), min_freq, max_freq, intensity))
            W_init[freq_to_bin(min_freq, fft_freqs):freq_to_bin(max_freq, fft_freqs), i] = intensity
            if max_freq >= fft_freqs[-1]:
                break
            partial += 1
    return W_init
def wav2inputnp(audio_fn, spec_type='cqt', bin_multiple=3):
    print("wav2inputnp")
    bins_per_octave = 12 * bin_multiple  # should be a multiple of 12
    n_bins = (max_midi - min_midi + 1) * bin_multiple

    # down-sample, mono-channel
    y, _ = librosa.load(audio_fn, sr)
    S = librosa.cqt(y, fmin=librosa.midi_to_hz(min_midi), sr=sr,
                    hop_length=hop_length, bins_per_octave=bins_per_octave,
                    n_bins=n_bins)
    S = S.T

    # TODO: LogScaleSpectrogram?
    '''
    if spec_type == 'cqt':
        # down-sample, mono-channel
        y, _ = librosa.load(audio_fn, sr)
        S = librosa.cqt(y, fmin=librosa.midi_to_hz(min_midi), sr=sr,
                        hop_length=hop_length,
                        bins_per_octave=bins_per_octave, n_bins=n_bins)
        S = S.T
    else:
        # down-sample, mono-channel
        y = madmom.audio.signal.Signal(audio_fn, sample_rate=sr,
                                       num_channels=1)
        S = madmom.audio.spectrogram.LogarithmicFilteredSpectrogram(
            y, fmin=librosa.midi_to_hz(min_midi), hop_size=hop_length,
            num_bands=bins_per_octave, fft_size=4096)
    '''

    # S = librosa.amplitude_to_db(S)
    S = np.abs(S)
    minDB = np.min(S)
    print(np.min(S), np.max(S), np.mean(S))
    S = np.pad(S, ((window_size // 2, window_size // 2), (0, 0)),
               'constant', constant_values=minDB)

    windows = []
    # IMPORTANT NOTE:
    # Since we pad the spectrogram frames,
    # the onset frames are actually `offset` frames.
    # To obtain a window of the center frame at each true index, we take a
    # slice from i to i+window_size, starting at frame 0 of the padded
    # spectrogram
    for i in range(S.shape[0] - window_size + 1):
        w = S[i:i + window_size, :]
        windows.append(w)

    # print inputs
    x = np.array(windows)
    return x
def test_cq_to_chroma():

    def __test(n_bins, bins_per_octave, n_chroma, fmin, base_c, window):
        # Fake up a cqt matrix with the corresponding midi notes

        if fmin is None:
            midi_base = 24  # C2
        else:
            midi_base = librosa.hz_to_midi(fmin)

        midi_notes = np.linspace(midi_base,
                                 midi_base + n_bins * 12.0 / bins_per_octave,
                                 endpoint=False,
                                 num=n_bins)

        # We don't care past 2 decimals here.
        # the log2 inside hz_to_midi can cause problems though.
        midi_notes = np.around(midi_notes, decimals=2)

        C = np.diag(midi_notes)

        cq2chr = librosa.filters.cq_to_chroma(n_input=C.shape[0],
                                              bins_per_octave=bins_per_octave,
                                              n_chroma=n_chroma,
                                              fmin=fmin,
                                              base_c=base_c,
                                              window=window)

        chroma = cq2chr.dot(C)
        for i in range(n_chroma):
            v = chroma[i][chroma[i] != 0]
            v = np.around(v, decimals=2)

            if base_c:
                resid = np.mod(v, 12)
            else:
                resid = np.mod(v - 9, 12)

            resid = np.round(resid * n_chroma / 12.0)
            assert np.allclose(np.mod(i - resid, 12), 0.0), i - resid

    for n_octaves in [2, 3, 4]:
        for semitones in [1, 3]:
            for n_chroma in 12 * np.arange(1, 1 + semitones):
                for fmin in [None] + list(librosa.midi_to_hz(range(48, 61))):
                    for base_c in [False, True]:
                        for window in [None, [1]]:
                            bins_per_octave = 12 * semitones
                            n_bins = n_octaves * bins_per_octave

                            if np.mod(bins_per_octave, n_chroma) != 0:
                                tf = raises(librosa.ParameterError)(__test)
                            else:
                                tf = __test

                            yield (tf, n_bins, bins_per_octave, n_chroma,
                                   fmin, base_c, window)
def synthesize(self, sample_rate=44100):
    samples = np.zeros(np.round(np.ceil(sample_rate * self.duration)).astype(int))
    for pitch, start, end in self.notes:
        i = np.round(self.tick_to_time(start) * sample_rate).astype(int)
        j = np.round(self.tick_to_time(end) * sample_rate).astype(int)
        buffer = np.sin(librosa.midi_to_hz(pitch) * 2 * np.pi *
                        np.arange(j - i) / sample_rate)
        buffer *= 1 - np.linspace(0, 1, len(buffer)) ** 2
        samples[i:j] += buffer
    return display.Audio(samples, rate=sample_rate)
def read_midi(path, samplers, polyphonic=False):
    mid = mido.MidiFile(path)
    out = []
    for i, track in enumerate(mid.tracks):
        sampler = samplers[i]
        notes = {}
        last_note = (None, 0, 0)
        t_ptr = 0
        tempo = 500000  # default MIDI tempo; updated by set_tempo messages
        for msg in track:
            t_ptr += msg.time
            if msg.type == "note_on":
                notes[msg.note] = (t_ptr, msg.velocity / 255)
                if last_note[0] != None and not polyphonic:
                    note, start, vel = last_note
                    dur = t_ptr - start
                    dur = mido.tick2second(dur, mid.ticks_per_beat, tempo)
                    start = mido.tick2second(start, mid.ticks_per_beat, tempo)
                    if dur > 0.1:
                        note = NoteObject(
                            Note(lr.midi_to_hz(msg.note), vel, dur),
                            sampler, start)
                        out.append(note)
                last_note = (msg.note, t_ptr, msg.velocity / 255)
            if msg.type == "note_off":
                try:
                    start, vel = notes[msg.note]
                    dur = t_ptr - start
                    dur = mido.tick2second(dur, mid.ticks_per_beat, tempo)
                    start = mido.tick2second(start, mid.ticks_per_beat, tempo)
                    note = NoteObject(Note(lr.midi_to_hz(msg.note), vel, dur),
                                      sampler, start)
                    out.append(note)
                except:
                    print("Warning: Problems reading MIDI.")
            if msg.type == "set_tempo":
                tempo = msg.tempo
                print(f"Read MIDI tempo: {tempo}")
    return Structure(out, 0)
def file_to_chromagram(file_name):
    sr = 44100
    x, sr = librosa.load(file_name, sr=sr)  # .wav file and its sampling rate
    fmin = librosa.midi_to_hz(22)  # minimal key on our chromagram will be A0
    hop_length = 256  # needed for Constant-Q Transform
    amplitude = librosa.cqt(x[:120 * 44100], sr=sr, fmin=fmin, n_bins=108,
                            hop_length=hop_length)
    chromagram = librosa.amplitude_to_db(np.abs(amplitude))
    return chromagram
def gram_to_beat_chroma(gram):
    '''
    Converts a pre-computed CQT to a beat-synchronous chromagram, transposed
    so that the first dimension is features and the second is time frames.
    This implements all that is needed to convert pre-computed CQTs to the
    format used in the 2DFTM experiments.

    Parameters
    ----------
    gram : np.ndarray
        Constant-Q spectrogram, shape=(n_frames, n_frequency_bins)

    Returns
    -------
    chroma : np.ndarray
        Beat-synchronous chroma matrix, shape (n_frequency_bins, n_beats)
    '''
    # Transpose to match librosa's format
    gram = np.array(gram.T)
    # Because CQTs have spectra which are pre-L2-normalized, their range is
    # [-some number, 0]; this causes issues for the max-normalization which
    # happens below.  This rescales to [0, some_number]
    gram -= gram.min()
    # Compute beats
    tempo, beats = librosa.beat.beat_track(
        onset_envelope=librosa.onset.onset_strength(S=gram),
        sr=feature_extraction.AUDIO_FS,
        hop_length=feature_extraction.AUDIO_HOP)
    # Make sure librosa didn't report 0 or 1 beats
    if beats.shape[0] < 2:
        # In this degenerate case, just put a beat at the beginning and the
        # end.  This, combined with the following interpolation, will result
        # in an even segmentation of the CQT into integrated frames
        beats = np.array([0, gram.shape[1]])
    # 2DFTM requires there to be at least 75 beats, so double the tempo until
    # there are 75 beats
    while beats.shape[0] < 75:
        # Linearly interpolate beats between all the existing beats
        interped_beats = np.empty(2 * beats.shape[0] - 1)
        interped_beats[::2] = beats
        interped_beats[1::2] = beats[:-1] + np.diff(beats) / 2.
        beats = interped_beats
    # Compute chroma from the CQT, without any built-in normalization or
    # threshold
    chroma = librosa.feature.chroma_cqt(
        C=gram, norm=None, threshold=None,
        fmin=librosa.midi_to_hz(feature_extraction.NOTE_START))
    # Compute beat-synchronous chroma
    beat_chroma = librosa.feature.sync(chroma, beats)
    # Max-normalize the result - this is done in Thierry/DAn's msd_beatchroma
    beat_chroma = librosa.util.normalize(beat_chroma)
    return beat_chroma
def _load_f0(f0_path):
    if not os.path.exists(f0_path):
        return None

    with open(f0_path) as fhandle:
        lines = fhandle.readlines()
    f0_midi = np.array([float(line) for line in lines])
    f0_hz = librosa.midi_to_hz(f0_midi) * (f0_midi > 0)
    confidence = (f0_hz > 0).astype(float)
    times = (np.arange(len(f0_midi)) * TIME_STEP) + (TIME_STEP / 2.0)
    f0_data = utils.F0Data(times, f0_hz, confidence)
    return f0_data
def create_timbre_spectrogram(audio, hparams):
    """Create either a CQT or mel spectrogram"""
    if tf.is_tensor(audio):
        audio = audio.numpy()
    if isinstance(audio, bytes):
        # Get samples from wav data.
        samples = audio_io.wav_data_to_samples(audio, hparams.sample_rate)
    else:
        samples = audio

    if hparams.timbre_spec_type == 'mel':
        spec = np.abs(
            librosa.feature.melspectrogram(
                samples,
                hparams.sample_rate,
                hop_length=hparams.timbre_hop_length,
                fmin=librosa.midi_to_hz(constants.MIN_TIMBRE_PITCH),
                fmax=librosa.midi_to_hz(constants.MAX_TIMBRE_PITCH),
                n_mels=constants.TIMBRE_SPEC_BANDS,
                pad_mode='symmetric',
                htk=hparams.spec_mel_htk,
                power=2)).T
    else:
        spec = np.abs(
            librosa.core.cqt(
                samples,
                hparams.sample_rate,
                hop_length=hparams.timbre_hop_length,
                fmin=librosa.midi_to_hz(constants.MIN_TIMBRE_PITCH),
                n_bins=constants.TIMBRE_SPEC_BANDS,
                bins_per_octave=constants.BINS_PER_OCTAVE,
                pad_mode='symmetric')).T

    # convert amplitude to power
    if hparams.timbre_spec_log_amplitude:
        spec = librosa.power_to_db(spec) - librosa.power_to_db(np.array([1e-9]))[0]
        spec = spec / np.max(spec)
    return spec
def test_pitch_tuning():

    def __test(hz, resolution, bins_per_octave, tuning):
        est_tuning = librosa.pitch_tuning(hz,
                                          resolution=resolution,
                                          bins_per_octave=bins_per_octave)

        assert np.abs(tuning - est_tuning) <= resolution

    for resolution in [1e-2, 1e-3]:
        for bins_per_octave in [12]:
            # Make up some frequencies
            for tuning in [-0.5, -0.375, -0.25, 0.0, 0.25, 0.375]:
                note_hz = librosa.midi_to_hz(tuning + np.arange(128))
                yield __test, note_hz, resolution, bins_per_octave, tuning
def __test(note_min):
    C = np.abs(librosa.cqt(y, sr=sr, fmin=librosa.midi_to_hz(note_min)))**2

    # Average over time
    Cbar = np.median(C, axis=1)

    # Find the peak
    idx = np.argmax(Cbar)

    eq_(idx, 60 - note_min)

    # Make sure that the max outside the peak is sufficiently small
    Cscale = Cbar / Cbar[idx]
    Cscale[idx] = np.nan
    assert np.nanmax(Cscale) < 6e-1, Cscale

    Cscale[idx-1:idx+2] = np.nan
    assert np.nanmax(Cscale) < 5e-2, Cscale
def get_drum_wav(percussion, width=5, n=None):
    # Compute volume shaper
    percussion = librosa.util.normalize(percussion.ravel())
    v = scipy.ndimage.median_filter(percussion, width, mode='mirror')
    v = np.atleast_2d(v)

    wav = synthesize(librosa.frames_to_samples(np.arange(v.shape[-1]),
                                               hop_length=hop_length),
                     v,
                     fmin=librosa.midi_to_hz(0),
                     bins_per_octave=12,
                     wave=noise,
                     n=n)[0]
    return wav
def test_estimate_tuning():

    def __test(target_hz, resolution, bins_per_octave, tuning):
        y = np.sin(2 * np.pi * target_hz * t)
        tuning_est = librosa.estimate_tuning(resolution=resolution,
                                             bins_per_octave=bins_per_octave,
                                             y=y, sr=sr, n_fft=2048,
                                             fmin=librosa.note_to_hz('C4'),
                                             fmax=librosa.note_to_hz('G#9'))

        print('target_hz={:.3f}'.format(target_hz))
        print('tuning={:.3f}, estimated={:.3f}'.format(tuning, tuning_est))
        print('resolution={:.2e}'.format(resolution))

        # Round to the proper number of decimals
        deviation = np.around(np.abs(tuning - tuning_est),
                              int(-np.log10(resolution)))

        # We'll accept an answer within three bins of the resolution
        assert deviation <= 3 * resolution

    for sr in [11025, 22050]:
        duration = 5.0
        t = np.linspace(0, duration, int(duration * sr))

        for resolution in [1e-2]:
            for bins_per_octave in [12]:
                # test a null-signal tuning estimate
                yield (__test, 0.0, resolution, bins_per_octave, 0.0)

                for center_note in [69, 84, 108]:
                    for tuning in np.linspace(-0.5, 0.5, 8, endpoint=False):
                        target_hz = librosa.midi_to_hz(center_note + tuning)
                        yield (__test, np.asscalar(target_hz),
                               resolution, bins_per_octave, tuning)
def extract_cqt(audio_data, fs, hop, note_start, n_notes):
    '''
    Compute a log-magnitude L2-normalized constant-Q-gram of some audio data.

    Parameters
    ----------
    audio_data : np.ndarray
        Audio data to compute CQT of
    fs : int
        Sampling rate of audio
    hop : int
        Hop length for CQT
    note_start : int
        Lowest MIDI note number for CQT
    n_notes : int
        Number of notes to include in the CQT

    Returns
    -------
    cqt : np.ndarray
        Log-magnitude L2-normalized CQT of the supplied audio data.
    frame_times : np.ndarray
        Times, in seconds, of each frame in the CQT
    '''
    # Compute CQT
    cqt = librosa.cqt(
        audio_data, sr=fs, hop_length=hop,
        fmin=librosa.midi_to_hz(note_start), n_bins=n_notes)
    # Transpose so that rows are spectra
    cqt = cqt.T
    # Compute log-amplitude
    cqt = librosa.logamplitude(cqt, ref_power=cqt.max())
    # L2 normalize the columns
    cqt = librosa.util.normalize(cqt, norm=2., axis=1)
    # Compute the time of each frame
    times = librosa.frames_to_time(np.arange(cqt.shape[0]), fs, hop)
    return cqt, times
def audio_cqt(audio_data, fs=AUDIO_FS):
    '''
    Compute some audio data's constant-Q spectrogram, normalize, and
    log-scale it

    Parameters
    ----------
    audio_data : np.ndarray
        Some audio signal.
    fs : int
        Sampling rate the audio data is sampled at, should be ``AUDIO_FS``.

    Returns
    -------
    audio_gram : np.ndarray
        Log-magnitude, L2-normalized constant-Q spectrogram of the audio data.
    '''
    # Compute CQT of the audio data
    audio_gram = librosa.cqt(
        audio_data, sr=fs, hop_length=AUDIO_HOP,
        fmin=librosa.midi_to_hz(NOTE_START), n_bins=N_NOTES)
    # L2-normalize and log-magnitude it
    return post_process_cqt(audio_gram)
def midi_cqt(midi_object):
    '''
    Synthesize MIDI data, compute its constant-Q spectrogram, normalize, and
    log-scale it

    Parameters
    ----------
    midi_object : pretty_midi.PrettyMIDI
        MIDI data to create constant-Q spectrogram of.

    Returns
    -------
    midi_gram : np.ndarray
        Log-magnitude, L2-normalized constant-Q spectrogram of synthesized
        MIDI data.
    '''
    # Synthesize MIDI object as audio data
    midi_audio = fast_fluidsynth(midi_object, MIDI_FS)
    # Compute CQT of the synthesized audio data
    midi_gram = librosa.cqt(
        midi_audio, sr=MIDI_FS, hop_length=MIDI_HOP,
        fmin=librosa.midi_to_hz(NOTE_START), n_bins=N_NOTES)
    # L2-normalize and log-magnitude it
    return post_process_cqt(midi_gram)
def get_wav(cq, nmin=60, nmax=120, width=5, max_peaks=1, wave=None, n=None):
    # Slice down to the bass range
    cq = cq[nmin:nmax]

    # Pick peaks at each time
    mask = peakgram(librosa.logamplitude(cq**2, top_db=60, ref_power=np.max),
                    max_peaks=max_peaks)

    # Smooth in time
    mask = scipy.ndimage.median_filter(mask, size=(1, width), mode='mirror')

    # resynthesize with some magnitude compression
    wav = synthesize(librosa.frames_to_samples(np.arange(cq.shape[-1]),
                                               hop_length=hop_length),
                     mask * cq**(1./3),
                     fmin=librosa.midi_to_hz(nmin + MIDI_MIN),
                     bins_per_octave=12,
                     wave=wave,
                     n=n)[0]
    return wav
def extract_features(audio_data):
    '''
    Feature extraction routine - gets beat-synchronous CQT, beats, and bpm

    :parameters:
        - audio_data : np.ndarray
            Audio samples at 22 kHz

    :returns:
        - cqt : np.ndarray
            Beat-synchronous CQT, four octaves, starting from note 36
        - beats : np.ndarray
            Beat locations, in seconds.  Beat tracking is done using CQT
        - bpm : float
            BPM.  If the estimated BPM is less than 160, it is doubled.
    '''
    gram = np.abs(librosa.cqt(
        audio_data, fmin=librosa.midi_to_hz(36), n_bins=48))
    # Compute onset envelope from CQT (for speed)
    onset_envelope = librosa.onset.onset_strength(S=gram, aggregate=np.median)
    bpm, beats = librosa.beat.beat_track(onset_envelope=onset_envelope)
    # Double the BPM and interpolate beat locations if BPM < 160
    while bpm < 240:
        beat_interp = scipy.interpolate.interp1d(
            np.arange(0, 2*beats.shape[0], 2), beats)
        beats = beat_interp(np.arange(2*beats.shape[0] - 1)).astype(int)
        bpm *= 2
    # Synchronize the CQT to the beats
    sync_gram = librosa.feature.sync(gram, beats)
    # Also compute log amplitude
    sync_gram = librosa.logamplitude(sync_gram, ref_power=sync_gram.max())
    # Transpose so that rows are samples
    sync_gram = sync_gram.T
    # and L2 normalize
    sync_gram = librosa.util.normalize(sync_gram, norm=2., axis=1)
    return sync_gram, librosa.frames_to_time(beats), bpm
def align_one_file(mp3_filename, midi_filename, output_midi_filename,
                   output_diagnostics=True, interval=0):
    """
    Helper function for aligning a MIDI file to an audio file.

    :parameters:
        - mp3_filename : str
            Full path to a .mp3 file.
        - midi_filename : str
            Full path to a .mid file.
        - output_midi_filename : str
            Full path to where the aligned .mid file should be written.
            If None, don't output.
        - output_diagnostics : bool
            If True, also output a .pdf of figures, a .mat of the alignment
            results, and a .mp3 of audio and synthesized aligned audio
    """
    # Load in the corresponding midi file in the midi directory, and return
    # if there is a problem loading it
    try:
        m = pretty_midi.PrettyMIDI(midi.read_midifile(midi_filename))
    except:
        print "Error loading {}".format(midi_filename)
        return

    print "Aligning {}".format(os.path.split(midi_filename)[1])

    # Cache audio CQT and onset strength
    audio, fs = librosa.load(mp3_filename)
    if use_mp3_data:
        if os.path.exists(to_cqt_npy(mp3_filename)) and os.path.exists(to_onset_strength_npy(mp3_filename)):
            print "Using pre-existing CQT and onset strength data for {}".format(os.path.split(mp3_filename)[1])
            # Create audio CQT, which is just frame-wise power, and onset strength
            audio_gram = np.load(to_cqt_npy(mp3_filename))
            audio_onset_strength = np.load(to_onset_strength_npy(mp3_filename))
        else:
            print "Creating CQT and onset strength signal for {}".format(os.path.split(mp3_filename)[1])
            audio_gram, audio_onset_strength = align_midi.audio_to_cqt_and_onset_strength(audio, fs=fs)
            np.save(to_cqt_npy(mp3_filename), audio_gram)
            np.save(to_onset_strength_npy(mp3_filename), audio_onset_strength)
    else:
        print "Creating CQT and onset strength signal for {}".format(os.path.split(mp3_filename)[1])
        audio_gram, audio_onset_strength = align_midi.audio_to_cqt_and_onset_strength(audio, fs=fs)
        np.save(to_cqt_npy(mp3_filename), audio_gram)
        np.save(to_onset_strength_npy(mp3_filename), audio_onset_strength)

    print "Creating CQT for {}".format(os.path.split(midi_filename)[1])
    # Generate synthetic MIDI CQT
    if piano:
        midi_gram = align_midi.midi_to_piano_cqt(m)
        # log_gram = librosa.logamplitude(midi_gram, ref_power=midi_gram.max())
        # Normalize columns and return
        # midi_gram = librosa.util.normalize(log_gram, axis=0)
        midi_beats, bpm = align_midi.midi_beat_track(m)
        midi_gram = align_midi.post_process_cqt(midi_gram, midi_beats)
    else:
        midi_gram = align_midi.midi_to_cqt(m, SF2_PATH)
        # Get beats
        midi_beats, bpm = align_midi.midi_beat_track(m)
        # Beat synchronize and normalize
        midi_gram = align_midi.post_process_cqt(midi_gram, midi_beats)

    if interval != 0:
        midi_gram = shift_cqt(midi_gram, interval)

    # Compute beats
    midi_beats, bpm = align_midi.midi_beat_track(m)
    audio_beats = librosa.beat.beat_track(onsets=audio_onset_strength,
                                          hop_length=512 / 4, bpm=bpm)[1] / 4
    # Beat-align and log/normalize the audio CQT
    audio_gram = align_midi.post_process_cqt(audio_gram, audio_beats)

    similarity_matrix = scipy.spatial.distance.cdist(midi_gram.T, audio_gram.T,
                                                     metric="cosine")
    p, q, score = align_midi.dpmod(similarity_matrix)

    # Plot log-fs grams
    plt.figure(figsize=(36, 24))
    ax = plt.subplot2grid((4, 3), (0, 0), colspan=3)
    plt.title("MIDI Synthesized")
    librosa.display.specshow(midi_gram, x_axis="frames", y_axis="cqt_note",
                             fmin=librosa.midi_to_hz(36),
                             fmax=librosa.midi_to_hz(96))
    ax = plt.subplot2grid((4, 3), (1, 0), colspan=3)
    plt.title("Audio data")
    librosa.display.specshow(audio_gram, x_axis="frames", y_axis="cqt_note",
                             fmin=librosa.midi_to_hz(36),
                             fmax=librosa.midi_to_hz(96))

    # Get similarity matrix
    similarity_matrix = scipy.spatial.distance.cdist(midi_gram.T, audio_gram.T,
                                                     metric="cosine")
    # Get best path through matrix
    p, q, score = align_midi.dpmod(similarity_matrix)

    # Plot distance at each point of the lowest-cost path
    ax = plt.subplot2grid((4, 3), (2, 0), rowspan=2)
    plt.plot([similarity_matrix[p_v, q_v] for p_v, q_v in zip(p, q)])
    plt.title("Distance at each point on lowest-cost path")

    # Plot similarity matrix and best path through it
    ax = plt.subplot2grid((4, 3), (2, 1), rowspan=2)
    plt.imshow(similarity_matrix.T, aspect="auto", interpolation="nearest",
               cmap=plt.cm.gray)
    tight = plt.axis()
    plt.plot(p, q, "r.", ms=0.2)
    plt.axis(tight)
    plt.title("Similarity matrix and lowest-cost path, cost={}".format(score))

    # Adjust MIDI timing
    m_aligned = align_midi.adjust_midi(m,
                                       librosa.frames_to_time(midi_beats)[p],
                                       librosa.frames_to_time(audio_beats)[q])

    # Plot alignment
    ax = plt.subplot2grid((4, 3), (2, 2), rowspan=2)
    note_ons = np.array([note.start for instrument in m.instruments
                         for note in instrument.notes])
    aligned_note_ons = np.array([note.start for instrument in m_aligned.instruments
                                 for note in instrument.notes])
    plt.plot(note_ons, aligned_note_ons - note_ons, ".")
    plt.xlabel("Original note location (s)")
    plt.ylabel("Shift (s)")
    plt.title("Corrected offset")

    # Write out the aligned file
    if output_midi_filename is not None:
        m_aligned.write(output_midi_filename)

    if output_diagnostics:
        # Save the figures
        plt.savefig(output_midi_filename.replace(".mid", ".pdf"))
        if write_mp3:
            # Load in the audio data (needed for writing out)
            audio, fs = librosa.load(mp3_filename, sr=None)
            # Synthesize the aligned midi
            # midi_audio_aligned = m_aligned.fluidsynth()
            midi_audio_aligned = m_aligned.fluidsynth(fs=fs, sf2_path=SF2_PATH)
            # Trim to the same size as audio
            if midi_audio_aligned.shape[0] > audio.shape[0]:
                midi_audio_aligned = midi_audio_aligned[:audio.shape[0]]
            else:
                midi_audio_aligned = np.append(
                    midi_audio_aligned,
                    np.zeros(audio.shape[0] - midi_audio_aligned.shape[0]))
            # Write out to temporary .wav file
            librosa.output.write_wav(
                output_midi_filename.replace(".mid", ".wav"),
                np.vstack([midi_audio_aligned, audio]).T, fs)
            # Convert to mp3
            subprocess.check_output(
                ["ffmpeg",
                 "-i", output_midi_filename.replace(".mid", ".wav"),
                 "-ab", "128k",
                 "-y", output_midi_filename.replace(".mid", ".mp3")])
            # Remove temporary .wav file
            os.remove(output_midi_filename.replace(".mid", ".wav"))
        # Save a .mat of the results
        scipy.io.savemat(output_midi_filename.replace(".mid", ".mat"),
                         {"similarity_matrix": similarity_matrix,
                          "p": p, "q": q, "score": score})
    # If we aren't outputting a .pdf, show the plot
    else:
        plt.show()
    plt.close()
def features(filename):
    '''Extract feature data for spectral clustering segmentation.

    :parameters:
        - filename : str
            Path on disk to an audio file

    :returns:
        - X_cqt : np.ndarray [shape=(d1, n)]
            A beat-synchronous log-power CQT matrix
        - X_timbre : np.ndarray [shape=(d2, n)]
            A beat-synchronous MFCC matrix
        - beat_times : np.ndarray [shape=(n, 2)]
            Timing of beat intervals
    '''
    print('\t[1/5] loading audio')
    y, sr = librosa.load(filename, sr=None)
    y = librosa.resample(y, sr, SR, res_type='sinc_fastest')
    sr = SR

    print('\t[2/5] Separating harmonic and percussive signals')
    y_harm, y_perc = librosa.effects.hpss(y)

    print('\t[3/5] detecting beats')
    bpm, beats = get_beats(y=y_perc, sr=sr, hop_length=HOP_LENGTH)

    print('\t[4/5] generating CQT')
    X_cqt = librosa.cqt(y=y_harm, sr=sr, hop_length=HOP_LENGTH,
                        bins_per_octave=12, fmin=librosa.midi_to_hz(24),
                        n_bins=72)
    # Compute log CQT power
    X_cqt = librosa.logamplitude(X_cqt**2.0, ref_power=np.max)

    # Compute MFCCs
    print('\t[5/5] generating MFCC')
    X_melspec = librosa.feature.melspectrogram(y=y, sr=sr,
                                               hop_length=HOP_LENGTH,
                                               n_mels=N_MELS)
    X_timbre = librosa.feature.mfcc(S=librosa.logamplitude(X_melspec),
                                    n_mfcc=N_MFCC)

    # Resolve any timing discrepancies due to CQT downsampling
    n = min(X_cqt.shape[1], X_timbre.shape[1])

    # Trim the beat detections to fit within the shape of X*
    beats = beats[beats < n]

    # Pad on a frame=0 beat for synchronization purposes
    beats = np.unique(np.concatenate([[0], beats]))

    # Convert beat frames to beat times
    beat_times = librosa.frames_to_time(beats, sr=sr, hop_length=HOP_LENGTH)

    # Tack on an end-of-track marker.  This is necessary if we want
    # the output intervals to span the entire track.
    beat_times = np.concatenate([beat_times, [float(len(y)) / sr]])
    beat_intervals = np.c_[beat_times[:-1], beat_times[1:]]

    # Synchronize the feature matrices
    X_cqt = librosa.feature.sync(X_cqt, beats, aggregate=np.median)
    X_timbre = librosa.feature.sync(X_timbre, beats, aggregate=np.mean)

    return X_cqt, X_timbre, beat_intervals
def logfrequency(sr, n_fft, n_bins=84, bins_per_octave=12, tuning=0.0,
                 fmin=None, spread=0.125):
    '''Approximate a constant-Q filterbank for a fixed-window STFT.

    Each filter is a log-normal window centered at the corresponding
    frequency.

    :usage:
        >>> # Simple log frequency filters
        >>> logfs_fb = librosa.filters.logfrequency(22050, 4096)

        >>> # Use a narrower frequency range
        >>> logfs_fb = librosa.filters.logfrequency(22050, 4096, n_bins=48, fmin=110)

        >>> # Use narrower filters for sparser response: 5% of a semitone
        >>> logfs_fb = librosa.filters.logfrequency(22050, 4096, spread=0.05)

        >>> # Or wider: 50% of a semitone
        >>> logfs_fb = librosa.filters.logfrequency(22050, 4096, spread=0.5)

    :parameters:
        - sr : int > 0
            audio sampling rate
        - n_fft : int > 0
            FFT window size
        - n_bins : int > 0
            Number of bins.  Defaults to 84 (7 octaves).
        - bins_per_octave : int > 0
            Number of bins per octave.  Defaults to 12 (semitones).
        - tuning : None or float in [-0.5, +0.5]
            Tuning correction parameter, in fractions of a bin.
        - fmin : float > 0
            Minimum frequency bin.  Defaults to ``C2 ~= 32.70``
        - spread : float > 0
            Spread of each filter, as a fraction of a bin.

    :returns:
        - C : np.ndarray, shape=(n_bins, 1 + n_fft/2)
            log-frequency filter bank.
    '''
    if fmin is None:
        fmin = librosa.midi_to_hz(librosa.note_to_midi('C2'))

    # Apply tuning correction
    correction = 2.0**(float(tuning) / bins_per_octave)

    # What's the shape parameter for our log-normal filters?
    sigma = float(spread) / bins_per_octave

    # Construct the output matrix
    basis = np.zeros((n_bins, 1 + n_fft/2))

    # Get log frequencies of bins
    log_freqs = np.log2(librosa.fft_frequencies(sr, n_fft)[1:])

    for i in range(n_bins):
        # What's the center (median) frequency of this filter?
        c_freq = correction * fmin * (2.0**(float(i)/bins_per_octave))

        # Place a log-normal window around c_freq
        basis[i, 1:] = np.exp(-0.5 * ((log_freqs - np.log2(c_freq)) / sigma)**2
                              - np.log2(sigma) - log_freqs)

        # Normalize each filter
        c_norm = np.sqrt(np.sum(basis[i]**2))
        if c_norm > 0:
            basis[i] = basis[i] / c_norm

    return basis
def align_one_file(mp3_filename, midi_filename, output_midi_filename,
                   output_diagnostics=True):
    '''
    Helper function for aligning a MIDI file to an audio file.

    :parameters:
        - mp3_filename : str
            Full path to a .mp3 file.
        - midi_filename : str
            Full path to a .mid file.
        - output_midi_filename : str
            Full path to where the aligned .mid file should be written.
            If None, don't output.
        - output_diagnostics : bool
            If True, also output a .pdf of figures, a .mat of the alignment
            results, and a .mp3 of audio and synthesized aligned audio
    '''
    # Load in the corresponding midi file in the midi directory, and return
    # if there is a problem loading it
    try:
        m = pretty_midi.PrettyMIDI(midi_filename)
    except:
        print "Error loading {}".format(midi_filename)
        return

    print "Aligning {}".format(os.path.split(midi_filename)[1])

    # check if output path exists, and create it if necessary
    if not os.path.exists(os.path.split(output_midi_filename)[0]):
        os.makedirs(os.path.split(output_midi_filename)[0])

    audio, fs = librosa.load(mp3_filename)
    if use_prev_data:
        if chroma:
            if os.path.exists(to_chroma_npy(mp3_filename)) and os.path.exists(to_onset_strength_npy(mp3_filename)):
                audio_gram = np.load(to_chroma_npy(mp3_filename))
                audio_onset_strength = np.load(to_onset_strength_npy(mp3_filename))
            else:
                print "Generating chroma features for {}".format(mp3_filename)
                audio_gram, audio_onset_strength = align_midi.audio_to_chroma_and_onset_strength(audio, fs=fs)
                np.save(to_chroma_npy(mp3_filename), audio_gram)
                np.save(to_onset_strength_npy(mp3_filename), audio_onset_strength)
        else:
            if os.path.exists(to_cqt_npy(mp3_filename)) and os.path.exists(to_onset_strength_npy(mp3_filename)):
                print "Using pre-existing CQT and onset strength data for {}".format(os.path.split(mp3_filename)[1])
                # Create audio CQT, which is just frame-wise power, and onset strength
                audio_gram = np.load(to_cqt_npy(mp3_filename))
                audio_onset_strength = np.load(to_onset_strength_npy(mp3_filename))
            else:
                print "Creating CQT and onset strength signal for {}".format(os.path.split(mp3_filename)[1])
                audio_gram, audio_onset_strength = align_midi.audio_to_cqt_and_onset_strength(audio, fs=fs)
                np.save(to_cqt_npy(mp3_filename), audio_gram)
                np.save(to_onset_strength_npy(mp3_filename), audio_onset_strength)
    else:
        print "Creating CQT and onset strength signal for {}".format(os.path.split(mp3_filename)[1])
        if chroma:
            audio_gram, audio_onset_strength = align_midi.audio_to_chroma_and_onset_strength(audio, fs=fs)
            np.save(to_chroma_npy(mp3_filename), audio_gram)
            np.save(to_onset_strength_npy(mp3_filename), audio_onset_strength)
        else:
            audio_gram, audio_onset_strength = align_midi.audio_to_cqt_and_onset_strength(audio, fs=fs)
            np.save(to_cqt_npy(mp3_filename), audio_gram)
            np.save(to_onset_strength_npy(mp3_filename), audio_onset_strength)

    if use_prev_data and not make_midi_info:
        if piano:
            if os.path.exists(to_piano_cqt_npy(midi_filename)):
                midi_gram = np.load(to_piano_cqt_npy(midi_filename))
            else:
                print "Creating CQT for {}".format(os.path.split(midi_filename)[1])
                midi_gram = make_midi_cqt(midi_filename, piano, chroma, m)
        elif chroma:
            if os.path.exists(to_chroma_npy(midi_filename)):
                midi_gram = np.load(to_chroma_npy(midi_filename))
            else:
                print "Creating CQT for {}".format(os.path.split(midi_filename)[1])
                midi_gram = make_midi_cqt(midi_filename, piano, chroma, m)
        else:
            if os.path.exists(to_cqt_npy(midi_filename)):
                midi_gram = np.load(to_cqt_npy(midi_filename))
            else:
                print "Creating CQT for {}".format(os.path.split(midi_filename)[1])
                midi_gram = make_midi_cqt(midi_filename, piano, chroma, m)
    else:
        print "Creating CQT for {}".format(os.path.split(midi_filename)[1])
        # Generate synthetic MIDI CQT
        midi_gram = make_midi_cqt(midi_filename, piano, chroma, m)

    if piano:
        # midi_gram = align_midi.accentuate_onsets(midi_gram)
        midi_gram = align_midi.piano_roll_fuzz(midi_gram)
        # midi_gram = align_midi.clean_audio_gram(midi_gram, threshold=np.percentile(midi_gram, 40))
        midi_gram = librosa.util.normalize(midi_gram, axis=0)

    # Compute beats
    midi_beats, bpm = align_midi.midi_beat_track(m)
    audio_beats = librosa.beat.beat_track(onset_envelope=audio_onset_strength,
                                          hop_length=512/4, bpm=bpm)[1]/4
    # Beat-align and log/normalize the audio CQT
    audio_gram = align_midi.post_process_cqt(audio_gram, audio_beats)

    # Plot log-fs grams
    # audio_gram = align_midi.clean_audio_gram(audio_gram, threshold=np.percentile(audio_gram, 80))
    plt.figure(figsize=(36, 24))
    ax = plt.subplot2grid((4, 3), (0, 0), colspan=3)
    plt.title('MIDI Synthesized')
    librosa.display.specshow(midi_gram, x_axis='frames', y_axis='cqt_note',
                             fmin=librosa.midi_to_hz(36),
                             fmax=librosa.midi_to_hz(96))
    ax = plt.subplot2grid((4, 3), (1, 0), colspan=3)
    plt.title('Audio data')
    librosa.display.specshow(audio_gram, x_axis='frames', y_axis='cqt_note',
                             fmin=librosa.midi_to_hz(36),
                             fmax=librosa.midi_to_hz(96))

    # Get similarity matrix
    similarity_matrix = scipy.spatial.distance.cdist(midi_gram.T, audio_gram.T,
                                                     metric='cosine')
    # Get best path through matrix
    p, q, score = align_midi.dpmod(similarity_matrix, experimental=False,
                                   forceH=False)

    # Plot distance at each point of the lowest-cost path
    ax = plt.subplot2grid((4, 3), (2, 0), rowspan=2)
    plt.plot([similarity_matrix[p_v, q_v] for p_v, q_v in zip(p, q)])
    plt.title('Distance at each point on lowest-cost path')

    # Plot similarity matrix and best path through it
    ax = plt.subplot2grid((4, 3), (2, 1), rowspan=2)
    plt.imshow(similarity_matrix.T, aspect='auto', interpolation='nearest',
               cmap=plt.cm.gray)
    tight = plt.axis()
    plt.plot(p, q, 'r.', ms=.2)
    plt.axis(tight)
    plt.title('Similarity matrix and lowest-cost path, cost={}'.format(score))

    # Adjust MIDI timing
    m_aligned = align_midi.adjust_midi(m,
                                       librosa.frames_to_time(midi_beats)[p],
                                       librosa.frames_to_time(audio_beats)[q])

    # Plot alignment
    ax = plt.subplot2grid((4, 3), (2, 2), rowspan=2)
    note_ons = np.array([note.start for instrument in m.instruments
                         for note in instrument.events])
    aligned_note_ons = np.array([note.start for instrument in m_aligned.instruments
                                 for note in instrument.events])
    plt.plot(note_ons, aligned_note_ons - note_ons, '.')
    plt.xlabel('Original note location (s)')
    plt.ylabel('Shift (s)')
    plt.title('Corrected offset')

    # Write out the aligned file
    if output_midi_filename is not None:
        m_aligned.write(output_midi_filename)

    if output_diagnostics:
        # Save the figures
        plt.savefig(output_midi_filename.replace('.mid', '.pdf'))
        if write_mp3:
            # Load in the audio data (needed for writing out)
            audio, fs = librosa.load(mp3_filename, sr=None)
            # Synthesize the aligned midi
            midi_audio_aligned = m_aligned.fluidsynth(fs=fs, sf2_path=SF2_PATH)
            # Trim to the same size as audio
            if midi_audio_aligned.shape[0] > audio.shape[0]:
                midi_audio_aligned = midi_audio_aligned[:audio.shape[0]]
            else:
                midi_audio_aligned = np.append(
                    midi_audio_aligned,
                    np.zeros(audio.shape[0] - midi_audio_aligned.shape[0]))
            # Write out to temporary .wav file
            librosa.output.write_wav(
                output_midi_filename.replace('.mid', '.wav'),
                np.vstack([midi_audio_aligned, audio]).T, fs)
            # Convert to mp3
            subprocess.check_output(
                ['ffmpeg',
                 '-i', output_midi_filename.replace('.mid', '.wav'),
                 '-ab', '128k',
                 '-y', output_midi_filename.replace('.mid', '.mp3')])
            # Remove temporary .wav file
            os.remove(output_midi_filename.replace('.mid', '.wav'))
        # Save a .mat of the results
        scipy.io.savemat(output_midi_filename.replace('.mid', '.mat'),
                         {'similarity_matrix': similarity_matrix,
                          'p': p, 'q': q, 'score': score})
    # If we aren't outputting a .pdf, show the plot
    else:
        plt.show()
    plt.close()
import sklearn
import sklearn.cluster
import sklearn.pipeline

import matplotlib.pyplot as plt
import seaborn
seaborn.set(style='ticks')

# We'll build the feature pipeline object here

# First stage is a mel-frequency spectrogram of bounded range
MelSpec = librosa.util.FeatureExtractor(librosa.feature.melspectrogram,
                                        n_fft=2048,
                                        n_mels=128,
                                        fmax=librosa.midi_to_hz(116),
                                        fmin=librosa.midi_to_hz(24))

# Second stage is log-amplitude; power is relative to peak in the signal
LogAmp = librosa.util.FeatureExtractor(librosa.logamplitude,
                                       ref_power=np.max)

# Third stage transposes the data so that frames become samples
Transpose = librosa.util.FeatureExtractor(np.transpose)

# Last stage stacks all samples together into one matrix for training
Stack = librosa.util.FeatureExtractor(np.vstack, iterate=False)

# Now, build a learning object. We'll use mini-batch k-means with default
# parameters.
C = sklearn.cluster.MiniBatchKMeans()
SR = 22050
N_FFT = 2048
HOP_LENGTH = 512
HOP_BEATS = 64
N_MELS = 128
FMAX = 8000
REP_WIDTH = 3
REP_FILTER = 7
N_MFCC = 32
N_CHROMA = 12
N_REP = 32
NOTE_MIN = librosa.midi_to_hz(24)  # 32Hz
NOTE_NUM = 84
NOTE_RES = 2  # CQT filter resolution

# mfcc, chroma, repetitions for each, and 4 time features
__DIMENSION = N_MFCC + N_CHROMA + 2 * N_REP + 4


def features(filename):
    '''Feature-extraction for audio segmentation

    Arguments:
        filename -- str
            path to the input song

    Returns:
        - X -- ndarray
def test_midi_to_hz():
    assert np.allclose(librosa.midi_to_hz([33, 45, 57, 69]),
                       [55, 110, 220, 440])
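The values being tested follow from the standard equal-temperament mapping with A4 = MIDI 69 = 440 Hz; below is a plain-numpy sketch of the same formula (a reference re-implementation for illustration, not librosa's code):

def midi_to_hz_reference(midi):
    # f = 440 * 2 ** ((m - 69) / 12): each semitone is a factor of 2**(1/12)
    midi = np.asarray(midi, dtype=float)
    return 440.0 * (2.0 ** ((midi - 69.0) / 12.0))

assert np.allclose(midi_to_hz_reference([33, 45, 57, 69]), [55, 110, 220, 440])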
def constant_q(sr, fmin=None, n_bins=84, bins_per_octave=12, tuning=0.0,
               window=None, resolution=2, pad=False, **kwargs):
    r'''Construct a constant-Q basis.

    :usage:
        >>> # Change the windowing function to Hamming instead of Hann
        >>> basis = librosa.filters.constant_q(22050, window=np.hamming)

        >>> # Use a longer window for each filter
        >>> basis = librosa.filters.constant_q(22050, resolution=3)

        >>> # Pad the basis to fixed length
        >>> basis = librosa.filters.constant_q(22050, pad=True)

    :parameters:
        - sr : int > 0
            Audio sampling rate

        - fmin : float > 0
            Minimum frequency bin.  Defaults to ``C2 ~= 32.70``

        - n_bins : int > 0
            Number of frequencies.  Defaults to 7 octaves (84 bins).

        - bins_per_octave : int > 0
            Number of bins per octave

        - tuning : float in [-0.5, +0.5)
            Tuning deviation from A440 in fractions of a bin

        - window : function or ``None``
            Windowing function to apply to filters.
            If ``None``, no window is applied.
            Default: scipy.signal.hann

        - resolution : float > 0
            Resolution of filter windows. Larger values use longer windows.

        - pad : boolean
            Pad all filters to have a constant width (equal to the longest
            filter).  By default, padding is done with zeros, but this can be
            overridden by setting the ``mode=`` field in *kwargs*.

        - *kwargs*
            Additional keyword arguments to ``np.pad()`` when ``pad==True``.

    .. note::
        - McVicar, Matthew. "A machine learning approach to automatic chord
          extraction." Dissertation, University of Bristol. 2013.

    :returns:
        - filters : list of np.ndarray, ``len(filters) == n_bins``
            ``filters[i]`` is ``i``\ th CQT basis filter (in the time-domain)
    '''
    if fmin is None:
        fmin = librosa.midi_to_hz(librosa.note_to_midi('C2'))

    if window is None:
        window = scipy.signal.hann

    correction = 2.0**(float(tuning) / bins_per_octave)

    fmin = correction * fmin

    # Q should be capitalized here, so we suppress the name warning
    # pylint: disable=invalid-name
    Q = float(resolution) / (2.0**(1. / bins_per_octave) - 1)

    filters = []
    for i in np.arange(n_bins, dtype=float):
        # Length of this filter
        ilen = np.ceil(Q * sr / (fmin * 2.0**(i / bins_per_octave)))

        # Build the filter
        win = np.exp(Q * 1j * np.linspace(0, 2 * np.pi, ilen, endpoint=False))

        # Apply the windowing function
        if window is not None:
            win = win * window(ilen)

        # Normalize
        win = librosa.util.normalize(win, norm=2)

        filters.append(win)

    if pad:
        max_len = max(map(len, filters))

        # Use reflection padding, unless otherwise specified
        for i in range(len(filters)):
            filters[i] = librosa.util.pad_center(filters[i], max_len, **kwargs)

    return filters