def transform_audio(audio,
                    n_fft=2048,
                    n_mels=40,
                    sr=22050,
                    hop_length=512,
                    fmin=None,
                    fmax=None):
    # Midi values of 24 (C2) and 120 (C10) are chosen, since humans typically
    # can't hear much beyond this range.
    if not fmin:
        fmin = librosa.midi_to_hz(24)
    if not fmax:
        fmax = librosa.midi_to_hz(120)
    # First stage is a mel-frequency spectrogram of bounded range.
    mel = librosa.feature.melspectrogram(audio,
                                         sr=sr,
                                         n_fft=n_fft,
                                         hop_length=hop_length,
                                         n_mels=n_mels,
                                         fmax=fmax,
                                         fmin=fmin)
    # Second stage is log-amplitude; power is relative to peak in the signal.
    log_amplitude = librosa.logamplitude(mel, ref_power=np.max)
    # Third stage transposes the data so that frames become samples.
    # Its shape is:
    # (length of audio / frame duration, number of mel bands)
    transpose = np.transpose(log_amplitude)
    return (transpose,
            {'n_fft': n_fft, 'n_mels': n_mels, 'sr': sr,
            'hop_length': hop_length, 'fmin': fmin, 'fmax': fmax})
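A minimal usage sketch for transform_audio, assuming an older librosa release (< 0.6, where librosa.logamplitude still exists) and a hypothetical input file:

import librosa
import numpy as np

y, sr = librosa.load('example.wav', sr=22050)  # hypothetical input path
features, params = transform_audio(y, sr=sr)
print(features.shape)  # (n_frames, n_mels)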
Example 2
def plot_gram(gram):
    '''
    Plots a *gram (cqt-gram, piano roll-gram).

    :parameters:
        - gram : np.ndarray
            A 2-d representation of time/frequency, with frequencies being the
            notes between MIDI note 36 and 96.
    '''
    librosa.display.specshow(gram,
                             x_axis='frames',
                             y_axis='cqt_note',
                             fmin=librosa.midi_to_hz(36),
                             fmax=librosa.midi_to_hz(96))
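A sketch of producing a suitable input for plot_gram, assuming librosa and numpy are imported as in the surrounding snippets and using a 60-bin CQT spanning roughly MIDI notes 36-96 as the docstring describes (librosa.amplitude_to_db is the newer name for the log-scaling step; older snippets in this collection use logamplitude):

y, sr = librosa.load('example.wav')  # hypothetical input path
gram = np.abs(librosa.cqt(y, sr=sr, fmin=librosa.midi_to_hz(36), n_bins=60))
plot_gram(librosa.amplitude_to_db(gram, ref=np.max))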
Example 3
def getChromagram(filename):
    x, sr = librosa.load(filename)
    fmin = librosa.midi_to_hz(36)
    hop_length = 512
    C = librosa.cqt(x, sr=sr, fmin=fmin, n_bins=72, hop_length=hop_length)
    chromagram = librosa.feature.chroma_stft(x, sr=sr, hop_length=hop_length)
    return chromagram
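Note that the CQT C above is computed but never used; if a CQT-based chroma were intended instead of the STFT-based one, the chroma_stft call inside the function could be replaced with something like the following sketch, reusing the same fmin and hop_length:

chromagram = librosa.feature.chroma_cqt(C=np.abs(C), sr=sr,
                                        fmin=fmin, hop_length=hop_length)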
Example 4
    def log_filter_bank_fn():
        """
        generate a logarithmic filterbank
        """
        log_filter_bank_basis = madmom.audio.filters.LogarithmicFilterbank(
            bin_frequencies=librosa.fft_frequencies(sr=16000, n_fft=2048),
            num_bands=48,
            fmin=librosa.midi_to_hz([27])[0],
            fmax=librosa.midi_to_hz([114])[0] * 2. ** (1. / 48)
        )
        log_filter_bank_basis = np.array(log_filter_bank_basis)
        assert log_filter_bank_basis.shape[1] == 229
        assert np.abs(np.sum(log_filter_bank_basis[:, 0]) - 1.) < 1e-3
        assert np.abs(np.sum(log_filter_bank_basis[:, -1]) - 1.) < 1e-3

        return log_filter_bank_basis
Example 5
def stft_module(y, plot=False):
    stft_spectrum = lb.stft(y,
                            n_fft=1024,
                            hop_length=512,
                            center=True,
                            dtype=np.complex64)
    stft = np.abs(stft_spectrum)  #compute the amplitude
    if plot:  #For testing
        plt.figure()
        print stft.shape  # =(1 + n_fft/2, t), t=431 if hop_length=512
        plt.subplot(211)
        # plt.imshow(stft)
        # plt.colorbar(format='%+2.0f dB')
        plt.plot(stft[:, 100])  # Plot a single frame
        plt.title('100th frame')
        plt.subplot(212)
        # Note: lb.display.specshow must be called after the other plots
        lb.display.specshow(lb.amplitude_to_db(stft),
                            sr=sr,
                            fmin=lb.midi_to_hz(RangeMIDInotes[0]),
                            x_axis='time',
                            y_axis='linear')
        plt.colorbar(format='%+2.0f dB')
        plt.title('STFT_Linear-frequency power spectrogram')
    return stft
Example 6
def extract_cqt(audio_data):
    '''
    CQT routine with default parameters filled in, and some post-processing.

    Parameters
    ----------
    audio_data : np.ndarray
        Audio data to compute CQT of

    Returns
    -------
    cqt : np.ndarray
        CQT of the supplied audio data.
    frame_times : np.ndarray
        Times, in seconds, of each frame in the CQT
    '''
    # Compute CQT
    cqt = librosa.cqt(audio_data, sr=FS, fmin=librosa.midi_to_hz(NOTE_START),
                      n_bins=N_NOTES, hop_length=HOP_LENGTH, tuning=0.)
    # Compute the time of each frame
    times = librosa.frames_to_time(
        np.arange(cqt.shape[1]), sr=FS, hop_length=HOP_LENGTH)
    # Use float32 for the cqt to save space/memory
    cqt = cqt.astype(np.float32)
    return cqt, times
def audio_cqt(audio_data, fs=AUDIO_FS):
    '''
    Compute some audio data's constant-Q spectrogram, normalize, and log-scale
    it

    Parameters
    ----------
    audio_data : np.ndarray
        Some audio signal.
    fs : int
        Sampling rate the audio data is sampled at, should be ``AUDIO_FS``.

    Returns
    -------
    audio_gram : np.ndarray
        Log-magnitude, L2-normalized constant-Q spectrogram of the supplied
        audio data.
    '''
    # Compute CQT of the synthesized audio data
    audio_gram = librosa.cqt(audio_data,
                             sr=fs,
                             hop_length=AUDIO_HOP,
                             fmin=librosa.midi_to_hz(NOTE_START),
                             n_bins=N_NOTES)
    # L2-normalize and log-magnitude it
    return audio_gram
Example 8
def test_cqt_position():

    # synthesize a two second sine wave at midi note 60

    sr = 22050
    freq = librosa.midi_to_hz(60)

    y = np.sin(2 * np.pi * freq * np.linspace(0, 2.0, 2 * sr))

    def __test(note_min):

        C = librosa.cqt(y, sr=sr, fmin=librosa.midi_to_hz(note_min))

        # Average over time
        Cbar = np.median(C, axis=1)

        # Find the peak
        idx = np.argmax(Cbar)

        eq_(idx, 60 - note_min)

        # Make sure that the max outside the peak is sufficiently small
        Cscale = Cbar / Cbar[idx]
        Cscale[idx] = np.nan

        assert np.nanmax(Cscale) < 1e-1

        Cscale[idx-1:idx+2] = np.nan
        assert np.nanmax(Cscale) < 1e-2

    for note_min in [12, 18, 24, 30, 36]:
        yield __test, note_min
Example 9
 def test_midi_to_hz_is_accurate(self):
   """Tests converting between MIDI values and their frequencies in hertz."""
   midi = np.arange(128)
   librosa_hz = librosa.midi_to_hz(midi)
   with self.cached_session() as sess:
     tf_hz = sess.run(core.midi_to_hz(midi))
   self.assertAllClose(librosa_hz, tf_hz)
Example 10
	def shift_f0(audio_features, pitch_shift=0.0):
		"""Shift f0 by a number of ocatves."""
		audio_features['f0_hz'] *= 2.0 ** (pitch_shift)
		audio_features['f0_hz'] = np.clip(audio_features['f0_hz'], 
										0.0, 
										librosa.midi_to_hz(110.0))
		return audio_features
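A minimal usage sketch for shift_f0 with hypothetical feature values (pitch_shift is in octaves, and the result is clipped to at most the frequency of MIDI note 110):

features = {'f0_hz': np.array([220.0, 440.0])}
features = shift_f0(features, pitch_shift=1.0)  # shift up one octave
print(features['f0_hz'])  # [440. 880.]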
Example 11
    def __init__(self, parent, controller):
        tk.Frame.__init__(self, parent)

        label = tk.Label(self, text='Constant-Q Plot')
        label.pack(side=tk.TOP)

        prev_plot = tk.Button(self, text='<--Prev Plot', command=lambda: controller.show_frame('MelView'))
        prev_plot.pack(side=tk.BOTTOM)

        next_plot = tk.Button(self, text='Next Plot-->', command=lambda: controller.show_frame('OnsetView'))
        next_plot.pack(side=tk.TOP)

        ## call fetch data for plots
        plt.style.use('ggplot')

        c = controller.fetch_data('const_q')

        fig = plt.figure(figsize=(10,10), dpi=100)   # make figure
        fig.add_subplot(111)

        fmin = lsa.midi_to_hz(48)
        lsa.display.specshow(c, x_axis='time', y_axis='cqt_note', fmin=fmin, cmap='coolwarm')

        plt.tight_layout()

        canvas = FigureCanvasTkAgg(fig, self)
        canvas.get_tk_widget().pack(side=tk.TOP, expand=True)
Example 12
def audio_to_cqt_and_onset_strength(audio, fs=22050, hop=512):
    '''
    Feature extraction for audio data.
    Gets a power CQT of harmonic component and onset strength signal of percussive.

    Input:
        audio - np.ndarray of audio samples
        fs - sampling rate of the audio data, default 22050
        hop - hop length for cqt, default 512, onset strength hop will be 1/4 of this
    Output:
        audio_gram - CQT of audio data
        audio_onset_strength - onset strength signal
    '''
    # Use harmonic part for gram, percussive part for onsets
    H, P = librosa.decompose.hpss(librosa.stft(audio))
    audio_harmonic = librosa.istft(H)
    audio_percussive = librosa.istft(P)
    # Compute log-frequency spectrogram of original audio
    audio_gram = np.abs(librosa.cqt(y=audio_harmonic,
                                    sr=fs,
                                    hop_length=hop,
                                    fmin=librosa.midi_to_hz(36),
                                    n_bins = 60))**2

    # Beat track the audio file at 4x the hop rate
    audio_onset_strength = librosa.onset.onset_strength(audio_percussive , hop_length=hop/4, sr=fs)
    return audio_gram, audio_onset_strength
def midi_cqt(midi_object):
    '''
    Synthesize MIDI data, compute its constant-Q spectrogram, normalize, and
    log-scale it

    Parameters
    ----------
    midi_object : pretty_midi.PrettyMIDI
        MIDI data to create constant-Q spectrogram of.

    Returns
    -------
    midi_gram : np.ndarray
        Log-magnitude, L2-normalized constant-Q spectrogram of synthesized MIDI
        data.
    '''
    # Synthesize MIDI object as audio data
    midi_audio = fast_fluidsynth(midi_object, MIDI_FS)
    # Compute CQT of the synthesized audio data
    midi_gram = librosa.cqt(midi_audio,
                            sr=MIDI_FS,
                            hop_length=MIDI_HOP,
                            fmin=librosa.midi_to_hz(NOTE_START),
                            n_bins=N_NOTES)
    # L2-normalize and log-magnitude it
    return midi_gram
Example 14
    def post_process_features(gram, beats):
        '''
        Apply processing to a feature matrix given the supplied param values

        Parameters
        ----------
        gram : np.ndarray
            Feature matrix, shape (n_features, n_samples)
        beats : np.ndarray
            Indices of beat locations in gram

        Returns
        -------
        gram : np.ndarray
            Feature matrix, shape (n_samples, n_features), post-processed
            according to the values in `params`
        '''
        # Convert to chroma
        if params['feature'] == 'chroma':
            gram = librosa.feature.chroma_cqt(
                C=gram, fmin=librosa.midi_to_hz(create_data.NOTE_START))
        # Beat-synchronize the feature matrix
        if params['beat_sync']:
            gram = librosa.feature.sync(gram, beats, pad=False)
        # Compute log magnitude
        gram = librosa.logamplitude(gram, ref_power=gram.max())
        # Normalize the feature vectors
        gram = librosa.util.normalize(gram, norm=params['norm'])
        # Standardize the feature vectors
        if params['standardize']:
            gram = scipy.stats.mstats.zscore(gram, axis=1)
        # Transpose it to (n_samples, n_features) and return it
        return gram.T
Example 15
def midi_to_cqt(midi, sf2_path=None, fs=22050, hop=512):
    '''
    Feature extraction routine for midi data, converts to a drum-free, percussion-suppressed CQT.
    
    Input:
        midi - pretty_midi.PrettyMIDI object
        sf2_path - path to .sf2 file to pass to pretty_midi.fluidsynth
        fs - sampling rate to synthesize audio at, default 22050
        hop - hop length for cqt, default 512
    Output:
        midi_gram - Simulated CQT of the midi data
    '''
    # Synthesize the MIDI using the supplied sf2 path
    midi_audio = midi.fluidsynth(fs=fs, sf2_path=sf2_path)
    # Use the harmonic part of the signal
    H, P = librosa.decompose.hpss(librosa.stft(midi_audio))
    midi_audio_harmonic = librosa.istft(H)
    # Compute log frequency spectrogram of audio synthesized from MIDI
    midi_gram = np.abs(librosa.cqt(y=midi_audio_harmonic,
                                   sr=fs,
                                   hop_length=hop,
                                   fmin=librosa.midi_to_hz(36),
                                   n_bins=60,
                                   tuning=0.0))**2
    return midi_gram
def process_one_file(midi_filename, skip=True):
    '''
    Load in midi data, compute features, and write out file

    :parameters:
        - midi_filename : str
            Full path to midi file
        - skip : bool
            Whether to skip creating the file when the npz already exists
    '''
    # npz files go in the 'npz' dir instead of 'mid'
    output_filename = mid_to_npz_path(midi_filename)
    # Skip files already created
    if skip and os.path.exists(output_filename):
        return
    try:
        m = pretty_midi.PrettyMIDI(midi_filename)
        midi_audio = alignment_utils.fast_fluidsynth(m, MIDI_FS)
        midi_gram = librosa.cqt(
            midi_audio, sr=MIDI_FS, hop_length=MIDI_HOP,
            fmin=librosa.midi_to_hz(NOTE_START), n_bins=N_NOTES)
        midi_beats, midi_tempo = alignment_utils.midi_beat_track(m)
        midi_sync_gram = alignment_utils.post_process_cqt(
            midi_gram, librosa.time_to_frames(
                midi_beats, sr=MIDI_FS, hop_length=MIDI_HOP))
        np.savez_compressed(
            output_filename, sync_gram=midi_sync_gram,
            beats=midi_beats, bpm=midi_tempo)
    except Exception as e:
        print "Error processing {}: {}".format(midi_filename, e)
Example 17
def _midi_to_hz(x, idx, log_f0=False):
    z = np.zeros(len(x))
    indices = x[:, idx] > 0
    z[indices] = librosa.midi_to_hz(x[indices, idx])
    if log_f0:
        z[indices] = np.log(z[indices])
    return z
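A small worked example, assuming a two-column array whose column idx holds MIDI pitches and where 0 marks unvoiced frames:

x = np.array([[0.1, 0.0],
              [0.2, 69.0]])
print(_midi_to_hz(x, idx=1))  # [0. 440.], since MIDI 69 is A4 (440 Hz)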
Example 18
 def test_midi_to_hz_is_accurate(self):
     """Tests converting between MIDI values and their frequencies in hertz
     """
     midi = np.arange(128)
     librosa_hz = librosa.midi_to_hz(midi)
     th_hz = core.midi_to_hz(midi)
     assert np.allclose(librosa_hz, th_hz)
Example 19
def midi_to_cqt(midi, sf2_path=None, fs=22050, hop=512):
    '''
    Feature extraction routine for midi data, converts to a drum-free, percussion-suppressed CQT.

    Input:
        midi - pretty_midi.PrettyMIDI object
        sf2_path - path to .sf2 file to pass to pretty_midi.fluidsynth
        fs - sampling rate to synthesize audio at, default 22050
        hop - hop length for cqt, default 512
    Output:
        midi_gram - Simulated CQT of the midi data
    '''
    # Create a copy of the midi object
    midi_no_drums = copy.deepcopy(midi)
    # Remove the drums
    for n, instrument in enumerate(midi_no_drums.instruments):
        if instrument.is_drum:
            del midi_no_drums.instruments[n]
    # Synthesize the MIDI using the supplied sf2 path
    midi_audio = midi_no_drums.fluidsynth(fs=fs, sf2_path=sf2_path)
    # midi_audio = midi_no_drums.synthesize(fs = fs)
    # Use the harmonic part of the signal
    H, P = librosa.decompose.hpss(librosa.stft(midi_audio))
    midi_audio_harmonic = librosa.istft(H)
    # Compute log frequency spectrogram of audio synthesized from MIDI
    midi_gram = np.abs(librosa.cqt(y=midi_audio_harmonic,
                                   sr=fs,
                                   hop_length=hop,
                                   fmin=librosa.midi_to_hz(36),
                                   n_bins = 60,
                                   tuning=0.0))**2
    return midi_gram
Example 20
def audio_to_cqt_and_onset_strength(audio, fs=22050, hop=512):
    '''
    Feature extraction for audio data.
    Gets a power CQT of harmonic component and onset strength signal of percussive.
    
    Input:
        audio - np.ndarray of audio samples
        fs - sampling rate of the audio data, default 22050
        hop - hop length for cqt, default 512, onset strength hop will be 1/4 of this
    Output:
        audio_gram - CQT of audio data
        audio_onset_strength - onset strength signal
    '''
    # Use harmonic part for gram, percussive part for onsets
    H, P = librosa.decompose.hpss(librosa.stft(audio))
    audio_harmonic = librosa.istft(H)
    audio_percussive = librosa.istft(P)
    # Compute log-frequency spectrogram of original audio
    audio_gram = np.abs(librosa.cqt(y=audio_harmonic,
                                    sr=fs,
                                    hop_length=hop,
                                    fmin=librosa.midi_to_hz(36),
                                    n_bins=60))**2
    # Beat track the audio file at 4x the hop rate
    audio_onset_strength = librosa.onset.onset_strength(audio_percussive, hop_length=hop/4, sr=fs)
    return audio_gram, audio_onset_strength
Example 21
def test_estimate_tuning():
    def __test(target_hz, resolution, bins_per_octave, tuning):

        y = np.sin(2 * np.pi * target_hz * t)
        tuning_est = librosa.estimate_tuning(resolution=resolution,
                                             bins_per_octave=bins_per_octave,
                                             y=y,
                                             sr=sr,
                                             n_fft=2048,
                                             fmin=librosa.note_to_hz('C4'),
                                             fmax=librosa.note_to_hz('G#9'))

        # Round to the proper number of decimals
        deviation = np.around(np.abs(tuning - tuning_est),
                              int(-np.log10(resolution)))

        # We'll accept an answer within three bins of the resolution
        assert deviation <= 3 * resolution

    for sr in [11025, 22050]:
        duration = 5.0

        t = np.linspace(0, duration, int(duration * sr))

        for resolution in [1e-2]:
            for bins_per_octave in [12]:
                # test a null-signal tuning estimate
                yield (__test, 0.0, resolution, bins_per_octave, 0.0)

                for center_note in [69, 84, 108]:
                    for tuning in np.linspace(-0.5, 0.5, 8, endpoint=False):
                        target_hz = librosa.midi_to_hz(center_note + tuning)

                        yield (__test, np.asscalar(target_hz), resolution,
                               bins_per_octave, tuning)
Example 22
    def test_FreqToPitchClass(self, low='A1', high='A6', res=120):
        def check(freqs, labels, ifreqs):
            self.assertIsInstance(labels, np.ndarray)
            self.assertEqual(labels.dtype, np.int)
            for freq, label, ifreq in zip(freqs, labels, ifreqs):
                if freq >= fLow and freq < fHigh:
                    self.assertGreaterEqual(label, 0)
                    self.assertLess(label, res)
                    self.assertGreater(freq, ifreq)
                    self.assertLess(freq, ifreq * rDelta)
                elif freq >= fHigh:
                    self.assertEqual(label, res - 1)
                elif freq < fLow and freq > 0:
                    self.assertEqual(label, 0)
                else:
                    self.assertEqual(label, -1)

        fLow = librosa.note_to_hz(low)
        fHigh = librosa.note_to_hz(high)
        rDelta = np.power(fHigh / fLow, 1. / res)
        self.assertGreater(rDelta, 1.)
        transform = FreqToPitchClass(low=low, high=high, resolution=res)
        labels = np.arange(-10, 128 + 10)
        rfreqs = np.append(librosa.midi_to_hz(labels), 0)
        rfreqs = np.repeat(rfreqs, 30)
        freqs = rfreqs * np.random.uniform(
            low=1. / rDelta, high=rDelta,
            size=rfreqs.shape)  # add some deviation

        sample = {'freqs': freqs}
        t_sample = transform(sample)
        it_sample = transform.inv(copy(t_sample))
        check(freqs, t_sample['labels'], it_sample['freqs'])
Example 23
def gen_onsets_info(data, t_unit=0.02):
    #logging.debug("Data shape: %s", data.shape)
    pitches   = []
    intervals = []
    lowest_pitch = librosa.note_to_midi("A0")

    for i in range(data.shape[1]):
        notes = find_occur(data[:, i], t_unit=t_unit)
        it = []
        for nn in notes:
            it.append([nn["onset"]*t_unit, (nn["onset"]+2)*t_unit])
        
        if len(intervals)==0 and len(it) > 0:
            intervals = np.array(it)
        elif len(it) > 0:
            intervals = np.concatenate((intervals, np.array(it)), axis=0)
            
        # hz = CentralFrequency[i]
        hz = librosa.midi_to_hz(lowest_pitch+i)
        for _ in range(len(it)):  # avoid shadowing the outer loop variable
            pitches.append(hz)
    
    if type(intervals) == list:
        intervals = np.array([]).reshape((0, 2))
    pitches = np.array(pitches)
    
    return intervals, pitches
Example 24
File: main.py Project: beckgom/msaf
def features(filename):
    # print '\t[1/5] loading audio'
    y, sr = librosa.load(filename, sr=SR)

    # print '\t[2/5] Separating harmonic and percussive signals'
    y_perc, y_harm = hp_sep(y)

    # print '\t[3/5] detecting beats'
    bpm, beats = get_beats(y=y_perc, sr=sr, hop_length=HOP_LENGTH)

    # print '\t[4/5] generating CQT'
    M1 = np.abs(
        librosa.cqt(y=y_harm, sr=sr, hop_length=HOP_LENGTH, bins_per_octave=12, fmin=librosa.midi_to_hz(24), n_bins=72)
    )

    M1 = librosa.logamplitude(M1 ** 2.0, ref_power=np.max)

    # print '\t[5/5] generating MFCC'
    S = librosa.feature.melspectrogram(y=y, sr=sr, hop_length=HOP_LENGTH, n_mels=N_MELS)
    M2 = librosa.feature.mfcc(S=librosa.logamplitude(S), n_mfcc=N_MFCC)

    n = min(M1.shape[1], M2.shape[1])

    beats = beats[beats < n]

    beats = np.unique(np.concatenate([[0], beats]))

    times = librosa.frames_to_time(beats, sr=sr, hop_length=HOP_LENGTH)

    times = np.concatenate([times, [float(len(y)) / sr]])
    M1 = librosa.feature.sync(M1, beats, aggregate=np.median)
    M2 = librosa.feature.sync(M2, beats, aggregate=np.mean)
    return (M1, M2), times
Example 25
def test_cqt_position():

    # synthesize a two second sine wave at midi note 60

    sr = 22050
    freq = librosa.midi_to_hz(60)

    y = np.sin(2 * np.pi * freq * np.linspace(0, 2.0, 2 * sr))

    def __test(note_min):

        C = np.abs(librosa.cqt(y, sr=sr, fmin=librosa.midi_to_hz(note_min)))**2

        # Average over time
        Cbar = np.median(C, axis=1)

        # Find the peak
        idx = np.argmax(Cbar)

        eq_(idx, 60 - note_min)

        # Make sure that the max outside the peak is sufficiently small
        Cscale = Cbar / Cbar[idx]
        Cscale[idx] = np.nan
        assert np.nanmax(Cscale) < 6e-1, Cscale

        Cscale[idx - 1:idx + 2] = np.nan
        assert np.nanmax(Cscale) < 5e-2, Cscale

    for note_min in [12, 18, 24, 30, 36]:
        yield __test, note_min
Example 26
def gen_onsets_info_from_notes(midi_notes, t_unit=0.02):
    intervals = []
    pitches = []
    for note in midi_notes:
        intervals.append([note.start, note.end])
        pitches.append(librosa.midi_to_hz(note.pitch))

    return np.array(intervals), np.array(pitches)
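These (intervals, pitches) arrays match the format used by mir_eval's transcription metrics; a hedged usage sketch, assuming mir_eval is available and that ref_notes and est_notes are lists of note objects with start, end, and pitch attributes:

import mir_eval

ref_intervals, ref_pitches = gen_onsets_info_from_notes(ref_notes)
est_intervals, est_pitches = gen_onsets_info_from_notes(est_notes)
precision, recall, f1, avg_overlap = mir_eval.transcription.precision_recall_f1_overlap(
    ref_intervals, ref_pitches, est_intervals, est_pitches)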
Example 27
def save_spectrogram_plot(audio: Any,
                          sample_rate: int = 16000,
                          filename: Optional[str] = None,
                          output_dir: str = "output") -> None:
    """
  Saves the spectrogram plot of the given audio to the given filename in
  the given output_dir. The resulting plot is a Constant-Q transform (CQT)
  spectrogram with the vertical axis being the amplitude converted to
  dB-scale.

  :param audio: the audio content, as a floating point time series
  :param sample_rate: the sampling rate of the file
  :param filename: the optional filename, set to "%Y-%m-%d_%H%M%S".png if None
  :param output_dir: the output dir
  """
    os.makedirs(output_dir, exist_ok=True)

    # The pitch min and max correspond to the pitch range
    # of the wavenet training checkpoint
    pitch_min = 36
    pitch_max = 84
    frequency_min = librosa.midi_to_hz(pitch_min)
    frequency_max = 2 * librosa.midi_to_hz(pitch_max)
    octaves = int(np.ceil(np.log2(frequency_max) - np.log2(frequency_min)))
    bins_per_octave = 32
    num_bins = int(bins_per_octave * octaves)
    hop_length = 2048
    constant_q_transform = librosa.cqt(audio,
                                       sr=sample_rate,
                                       hop_length=hop_length,
                                       fmin=frequency_min,
                                       n_bins=num_bins,
                                       bins_per_octave=bins_per_octave)
    plt.figure()
    plt.axis("off")
    librosa.display.specshow(librosa.amplitude_to_db(constant_q_transform,
                                                     ref=np.max),
                             sr=sample_rate)

    if not filename:
        date_and_time = time.strftime("%Y-%m-%d_%H%M%S")
        filename = f"{date_and_time}.png"
    path = os.path.join(output_dir, filename)
    plt.savefig(fname=path, dpi=600)
    plt.close()
def get_cqt(y, filter_scale=1):
    return np.abs(
        librosa.cqt(y,
                    sr=44100,
                    hop_length=1024,
                    fmin=librosa.midi_to_hz(36),
                    n_bins=84 * 2,
                    bins_per_octave=12 * 2,
                    filter_scale=filter_scale)).T
Example 29
def _load_f0(f0_path):
    with open(f0_path) as fhandle:
        lines = fhandle.readlines()
    f0_midi = np.array([float(line) for line in lines])
    f0_hz = librosa.midi_to_hz(f0_midi) * (f0_midi > 0)
    confidence = (f0_hz > 0).astype(int)
    times = np.arange(len(f0_midi)) * IKALA_TIME_STEP
    f0_data = F0Data(times, f0_hz, confidence)
    return f0_data
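The voicing-mask idiom above (multiplying by (f0_midi > 0)) zeroes out unvoiced frames, since librosa.midi_to_hz(0) would otherwise yield a small nonzero frequency; a tiny worked example:

f0_midi = np.array([0.0, 69.0, 0.0])
print(librosa.midi_to_hz(f0_midi) * (f0_midi > 0))  # [  0. 440.   0.]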
Example 30
 def play(self, note: Note, smpRt: int) -> np.ndarray:
     event, c_pitch = self.get_event(note.pitch)
     ratio = (len(event.data) / smpRt) / note.duration
     ratio = np.clip(ratio, 0.5, 100)
     shift = note.pitch - c_pitch
     wave = lr.effects.harmonic(event.data)
     wave = (Fx().pitch(shift * 100).tempo(ratio).highpass(
         lr.midi_to_hz(note.pitch)))(wave)
     wave = envl.adsr(len(wave))(wave)
     return wave
Example 31
def show_cqt(y, sr, bins_per_octave):
    C = np.abs(
        librosa.cqt(y,
                    sr=sr,
                    fmin=librosa.midi_to_hz(0),
                    n_bins=5 * bins_per_octave,
                    bins_per_octave=bins_per_octave))
    import librosa.display as display
    display.specshow(librosa.amplitude_to_db(C, ref=np.max),
                     sr=sr,
                     x_axis='time',
                     y_axis='cqt_note',
                     bins_per_octave=bins_per_octave,
                     fmin=librosa.midi_to_hz(0),
                     fmax=librosa.midi_to_hz(127))
    plt.colorbar(format='%+2.0f dB')
    plt.title('Constant-Q power spectrum')
    plt.tight_layout()
    plt.show()
Example 32
def freq(tone):
    # convert midi/note to hz.
    # see https://github.com/gpiozero/gpiozero/blob/master/gpiozero/tones.py#L114
    # 0 is treated as None, not midi 8.1hz
    if isinstance(tone, str):
        return librosa.note_to_hz(tone)
    elif isinstance(tone, int) and 0 < tone < 128:
        return librosa.midi_to_hz(tone)
    else:
        return tone
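Usage examples (note names and MIDI numbers map to the same frequencies; 0 falls through unchanged):

print(freq('A4'))  # 440.0
print(freq(69))    # 440.0, since MIDI 69 is A4
print(freq(0))     # 0, returned as-is rather than converted to ~8.18 Hz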
Example 33
def initialize_components_unlimited_partials(signal, pitches, phi=1):
    n_features, _n_samples = signal.S.shape
    n_components = len(pitches)
    W_init = numpy.zeros((n_features, n_components))
    fft_freqs = librosa.fft_frequencies(signal.sr, signal.n_fft)
    for i, pitch in enumerate(pitches):
        #print(i, pitch)
    #    freq = librosa.midi_to_hz(pitch)
        partial = 1
        while True:
            min_freq = librosa.midi_to_hz(pitch - phi) * partial
            max_freq = librosa.midi_to_hz(pitch + phi)  * partial
            max_freq = min(fft_freqs[-1], max_freq)
            intensity = 1 / (partial**2)
            #print('\t%s-%s (%s-%s): %s' % (freq_to_bin(min_freq),freq_to_bin(max_freq), min_freq, max_freq, intensity))
            W_init[freq_to_bin(min_freq, fft_freqs):freq_to_bin(max_freq, fft_freqs),i] = intensity
            if max_freq >= fft_freqs[-1]:
                break
            partial += 1
    return W_init
def wav2inputnp(audio_fn, spec_type='cqt', bin_multiple=3):
    print("wav2inputnp")
    bins_per_octave = 12 * bin_multiple  #should be a multiple of 12
    n_bins = (max_midi - min_midi + 1) * bin_multiple

    #down-sample,mono-channel
    y, _ = librosa.load(audio_fn, sr)
    S = librosa.cqt(y,
                    fmin=librosa.midi_to_hz(min_midi),
                    sr=sr,
                    hop_length=hop_length,
                    bins_per_octave=bins_per_octave,
                    n_bins=n_bins)
    S = S.T

    #TODO: LogScaleSpectrogram?
    '''
    if spec_type == 'cqt':
        #down-sample,mono-channel
        y,_ = librosa.load(audio_fn,sr)
        S = librosa.cqt(y,fmin=librosa.midi_to_hz(min_midi), sr=sr, hop_length=hop_length,
                          bins_per_octave=bins_per_octave, n_bins=n_bins)
        S = S.T
    else:
        #down-sample,mono-channel
        y = madmom.audio.signal.Signal(audio_fn, sample_rate=sr, num_channels=1)
        S = madmom.audio.spectrogram.LogarithmicFilteredSpectrogram(y,fmin=librosa.midi_to_hz(min_midi),
                                            hop_size=hop_length, num_bands=bins_per_octave, fft_size=4096)'''

    #S = librosa.amplitude_to_db(S)
    S = np.abs(S)

    minDB = np.min(S)

    print(np.min(S), np.max(S), np.mean(S))

    S = np.pad(S, ((window_size // 2, window_size // 2), (0, 0)),
               'constant',
               constant_values=minDB)

    windows = []

    # IMPORTANT NOTE:
    # Since we pad the spectrogram, the onset frames are actually `offset`
    # frames. To obtain a window centered on each true index, we take a slice
    # from i to i+window_size starting at frame 0 of the padded spectrogram
    for i in range(S.shape[0] - window_size + 1):
        w = S[i:i + window_size, :]
        windows.append(w)

    #print inputs
    x = np.array(windows)
    return x
Example 35
def test_cq_to_chroma():

    def __test(n_bins, bins_per_octave, n_chroma, fmin, base_c, window):
        # Fake up a cqt matrix with the corresponding midi notes

        if fmin is None:
            midi_base = 24  # C2
        else:
            midi_base = librosa.hz_to_midi(fmin)

        midi_notes = np.linspace(midi_base,
                                 midi_base + n_bins * 12.0 / bins_per_octave,
                                 endpoint=False,
                                 num=n_bins)
        #  We don't care past 2 decimals here.
        # the log2 inside hz_to_midi can cause problems though.
        midi_notes = np.around(midi_notes, decimals=2)
        C = np.diag(midi_notes)

        cq2chr = librosa.filters.cq_to_chroma(n_input=C.shape[0],
                                              bins_per_octave=bins_per_octave,
                                              n_chroma=n_chroma,
                                              fmin=fmin,
                                              base_c=base_c,
                                              window=window)

        chroma = cq2chr.dot(C)
        for i in range(n_chroma):
            v = chroma[i][chroma[i] != 0]
            v = np.around(v, decimals=2)

            if base_c:
                resid = np.mod(v, 12)
            else:
                resid = np.mod(v - 9, 12)

            resid = np.round(resid * n_chroma / 12.0)
            assert np.allclose(np.mod(i - resid, 12), 0.0), i-resid

    for n_octaves in [2, 3, 4]:
        for semitones in [1, 3]:
            for n_chroma in 12 * np.arange(1, 1 + semitones):
                for fmin in [None] + list(librosa.midi_to_hz(range(48, 61))):
                    for base_c in [False, True]:
                        for window in [None, [1]]:
                            bins_per_octave = 12 * semitones
                            n_bins = n_octaves * bins_per_octave

                            if np.mod(bins_per_octave, n_chroma) != 0:
                                tf = raises(librosa.ParameterError)(__test)
                            else:
                                tf = __test
                            yield (tf, n_bins, bins_per_octave,
                                   n_chroma, fmin, base_c, window)
Example 36
 def synthesize(self, sample_rate=44100):
     samples = np.zeros(np.round(np.ceil(sample_rate * self.duration)).astype(int))
     
     for pitch, start, end in self.notes:
         i = np.round(self.tick_to_time(start) * sample_rate).astype(int)
         j = np.round(self.tick_to_time(end) * sample_rate).astype(int)
         buffer = np.sin(librosa.midi_to_hz(pitch) * 2 * np.pi * np.arange(j - i) / sample_rate)
         buffer *= 1 - np.linspace(0, 1, len(buffer)) ** 2
         samples[i:j] += buffer
     
     return display.Audio(samples, rate=sample_rate)
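The core of the synthesis above, reduced to a standalone sketch for a single one-second note (MIDI note 60, middle C, at an assumed 44.1 kHz sample rate):

sr = 44100
t = np.arange(sr) / sr
tone = np.sin(2 * np.pi * librosa.midi_to_hz(60) * t)  # ~261.6 Hz sine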
Example 37
def read_midi(path, samplers, polyphonic=False):
    mid = mido.MidiFile(path)
    out = []
    for i, track in enumerate(mid.tracks):
        sampler = samplers[i]
        notes = {}
        last_note = (None, 0, 0)
        t_ptr = 0

        for msg in track:
            t_ptr += msg.time
            tempo = 500000
            if msg.type == "note_on":
                notes[msg.note] = (t_ptr, msg.velocity / 255)
                if last_note[0] != None and not polyphonic:
                    note, start, vel = last_note
                    dur = t_ptr - start
                    dur = mido.tick2second(dur, mid.ticks_per_beat, tempo)
                    start = mido.tick2second(start, mid.ticks_per_beat, tempo)
                    if dur > 0.1:
                        note = NoteObject(
                            Note(lr.midi_to_hz(msg.note), vel, dur), sampler,
                            start)
                        out.append(note)
                last_note = (msg.note, t_ptr, msg.velocity / 255)
            if msg.type == "note_off":
                try:
                    start, vel = notes[msg.note]
                    dur = t_ptr - start
                    dur = mido.tick2second(dur, mid.ticks_per_beat, tempo)
                    start = mido.tick2second(start, mid.ticks_per_beat, tempo)
                    note = NoteObject(Note(lr.midi_to_hz(msg.note), vel, dur),
                                      sampler, start)
                    out.append(note)
                except:
                    print("Warning: Problems reading MIDI.")
            if msg.type == "set_tempo":
                tempo = msg.tempo
                print(f"Read MIDI tempo: {tempo}")

    return Structure(out, 0)
Example 38
def file_to_chromagram(file_name):
    sr = 44100
    x, sr = librosa.load(file_name, sr=sr)  # .wav file and its sampling rate
    fmin = librosa.midi_to_hz(22)  # lowest CQT bin is MIDI 22 (A#0); A0, the lowest piano key, is MIDI 21
    hop_length = 256  # needed for Constant-Q Transform
    amplitude = librosa.cqt(x[:120 * 44100],
                            sr=sr,
                            fmin=fmin,
                            n_bins=108,
                            hop_length=hop_length)
    chromagram = librosa.amplitude_to_db(np.abs(amplitude))
    return chromagram
Example 39
def gram_to_beat_chroma(gram):
    '''
    Converts a pre-computed CQT to a beat-synchronous chromagram, transposed so
    that the first dimension is features and the second is time frames.
    This implements all that is needed to convert pre-computed CQTs to the
    format used in the 2DFTM experiments.

    Parameters
    ----------
    gram : np.ndarray
        Constant-Q spectrogram, shape=(n_frames, n_frequency_bins)

    Returns
    -------
    chroma : np.ndarray
        Beat-synchronous chroma matrix, shape (n_frequency_bins, n_beats)
    '''
    # Transpose to match librosa's format
    gram = np.array(gram.T)
    # Because CQTs have spectra which are pre-L2-normalized, their range is
    # [-some number, 0]; this causes issues for the max-normalization which
    # happens below.  This rescales to [0, some_number]
    gram -= gram.min()
    # Compute beats
    tempo, beats = librosa.beat.beat_track(
        onset_envelope=librosa.onset.onset_strength(S=gram),
        sr=feature_extraction.AUDIO_FS,
        hop_length=feature_extraction.AUDIO_HOP)
    # Make sure librosa didn't report 0 or 1 beats
    if beats.shape[0] < 2:
        # In this degenerate case, just put a beat at the beginning and the end
        # This, combined with the following interpolation, will result in an
        # even segmentation of the CQT into integrated frames
        beats = np.array([0, gram.shape[1]])
    # 2DFTM requires there to be at least 75 beats, so double the tempo until
    # there are 75 beats
    while beats.shape[0] < 75:
        # Linearly interpolate beats between all the existing beats
        interped_beats = np.empty(2 * beats.shape[0] - 1)
        interped_beats[::2] = beats
        interped_beats[1::2] = beats[:-1] + np.diff(beats) / 2.
        beats = interped_beats
    # Compute chroma from the CQT, without any built-in normalization or threshold
    chroma = librosa.feature.chroma_cqt(C=gram,
                                        norm=None,
                                        threshold=None,
                                        fmin=librosa.midi_to_hz(
                                            feature_extraction.NOTE_START))
    # Compute beat-synchronous chroma
    beat_chroma = librosa.feature.sync(chroma, beats)
    # Max-normalize the result - this is done in Thierry/DAn's msd_beatchroma
    beat_chroma = librosa.util.normalize(beat_chroma)
    return beat_chroma
Example 40
def _load_f0(f0_path):
    if not os.path.exists(f0_path):
        return None

    with open(f0_path) as fhandle:
        lines = fhandle.readlines()
    f0_midi = np.array([float(line) for line in lines])
    f0_hz = librosa.midi_to_hz(f0_midi) * (f0_midi > 0)
    confidence = (f0_hz > 0).astype(float)
    times = (np.arange(len(f0_midi)) * TIME_STEP) + (TIME_STEP / 2.0)
    f0_data = utils.F0Data(times, f0_hz, confidence)
    return f0_data
Example 41
def create_timbre_spectrogram(audio, hparams):
    """Create either a CQT or mel spectrogram"""
    if tf.is_tensor(audio):
        audio = audio.numpy()
    if isinstance(audio, bytes):
        # Get samples from wav data.
        samples = audio_io.wav_data_to_samples(audio, hparams.sample_rate)
    else:
        samples = audio

    if hparams.timbre_spec_type == 'mel':
        spec = np.abs(
            librosa.feature.melspectrogram(
                samples,
                hparams.sample_rate,
                hop_length=hparams.timbre_hop_length,
                fmin=librosa.midi_to_hz(constants.MIN_TIMBRE_PITCH),
                fmax=librosa.midi_to_hz(constants.MAX_TIMBRE_PITCH),
                n_mels=constants.TIMBRE_SPEC_BANDS,
                pad_mode='symmetric',
                htk=hparams.spec_mel_htk,
                power=2)).T

    else:
        spec = np.abs(
            librosa.core.cqt(samples,
                             hparams.sample_rate,
                             hop_length=hparams.timbre_hop_length,
                             fmin=librosa.midi_to_hz(
                                 constants.MIN_TIMBRE_PITCH),
                             n_bins=constants.TIMBRE_SPEC_BANDS,
                             bins_per_octave=constants.BINS_PER_OCTAVE,
                             pad_mode='symmetric')).T

    # convert to a log-amplitude (dB) scale
    if hparams.timbre_spec_log_amplitude:
        spec = librosa.power_to_db(spec) - librosa.power_to_db(np.array([1e-9
                                                                         ]))[0]
        spec = spec / np.max(spec)
    return spec
Example 42
def gram_to_beat_chroma(gram):
    '''
    Converts a pre-computed CQT to a beat-synchronous chromagram, transposed so
    that the first dimension is features and the second is time frames.
    This implements all that is needed to convert pre-computed CQTs to the
    format used in the 2DFTM experiments.

    Parameters
    ----------
    gram : np.ndarray
        Constant-Q spectrogram, shape=(n_frames, n_frequency_bins)

    Returns
    -------
    chroma : np.ndarray
        Beat-synchronous chroma matrix, shape (n_frequency_bins, n_beats)
    '''
    # Transpose to match librosa's format
    gram = np.array(gram.T)
    # Because CQTs have spectra which are pre-L2-normalized, their range is
    # [-some number, 0]; this causes issues for the max-normalization which
    # happens below.  This rescales to [0, some_number]
    gram -= gram.min()
    # Compute beats
    tempo, beats = librosa.beat.beat_track(
        onset_envelope=librosa.onset.onset_strength(S=gram),
        sr=feature_extraction.AUDIO_FS,
        hop_length=feature_extraction.AUDIO_HOP)
    # Make sure librosa didn't report 0 or 1 beats
    if beats.shape[0] < 2:
        # In this degenerate case, just put a beat at the beginning and the end
        # This, combined with the following interpolation, will result in an
        # even segmentation of the CQT into integrated frames
        beats = np.array([0, gram.shape[1]])
    # 2DFTM requires there to be at least 75 beats, so double the tempo until
    # there are 75 beats
    while beats.shape[0] < 75:
        # Linearly interpolate beats between all the existing beats
        interped_beats = np.empty(2 * beats.shape[0] - 1)
        interped_beats[::2] = beats
        interped_beats[1::2] = beats[:-1] + np.diff(beats) / 2.
        beats = interped_beats
    # Compute chroma from the CQT, without any built-in normalization or threshold
    chroma = librosa.feature.chroma_cqt(
        C=gram, norm=None, threshold=None,
        fmin=librosa.midi_to_hz(feature_extraction.NOTE_START))
    # Compute beat-synchronous chroma
    beat_chroma = librosa.feature.sync(chroma, beats)
    # Max-normalize the result - this is done in Thierry/DAn's msd_beatchroma
    beat_chroma = librosa.util.normalize(beat_chroma)
    return beat_chroma
Example 43
def test_pitch_tuning():

    def __test(hz, resolution, bins_per_octave, tuning):

        est_tuning = librosa.pitch_tuning(hz,
                                          resolution=resolution,
                                          bins_per_octave=bins_per_octave)

        assert np.abs(tuning - est_tuning) <= resolution

    for resolution in [1e-2, 1e-3]:
        for bins_per_octave in [12]:
            # Make up some frequencies
            for tuning in [-0.5, -0.375, -0.25, 0.0, 0.25, 0.375]:

                note_hz = librosa.midi_to_hz(tuning + np.arange(128))

                yield __test, note_hz, resolution, bins_per_octave, tuning
Example 44
    def __test(note_min):

        C = np.abs(librosa.cqt(y, sr=sr, fmin=librosa.midi_to_hz(note_min)))**2

        # Average over time
        Cbar = np.median(C, axis=1)

        # Find the peak
        idx = np.argmax(Cbar)

        eq_(idx, 60 - note_min)

        # Make sure that the max outside the peak is sufficiently small
        Cscale = Cbar / Cbar[idx]
        Cscale[idx] = np.nan
        assert np.nanmax(Cscale) < 6e-1, Cscale

        Cscale[idx-1:idx+2] = np.nan
        assert np.nanmax(Cscale) < 5e-2, Cscale
Example 45
def get_drum_wav(percussion, width=5, n=None):

    # Compute volume shaper
    percussion = librosa.util.normalize(percussion.ravel())

    v = scipy.ndimage.median_filter(percussion,
                                    width,
                                    mode='mirror')
    v = np.atleast_2d(v)

    wav = synthesize(librosa.frames_to_samples(np.arange(v.shape[-1]),
                                               hop_length=hop_length),
                     v,
                     fmin=librosa.midi_to_hz(0),
                     bins_per_octave=12,
                     wave=noise,
                     n=n)[0]

    return wav
Example 46
def test_estimate_tuning():

    def __test(target_hz, resolution, bins_per_octave, tuning):

        y = np.sin(2 * np.pi * target_hz * t)
        tuning_est = librosa.estimate_tuning(resolution=resolution,
                                             bins_per_octave=bins_per_octave,
                                             y=y,
                                             sr=sr,
                                             n_fft=2048,
                                             fmin=librosa.note_to_hz('C4'),
                                             fmax=librosa.note_to_hz('G#9'))

        print('target_hz={:.3f}'.format(target_hz))
        print('tuning={:.3f}, estimated={:.3f}'.format(tuning, tuning_est))
        print('resolution={:.2e}'.format(resolution))

        # Round to the proper number of decimals
        deviation = np.around(np.abs(tuning - tuning_est),
                              int(-np.log10(resolution)))

        # We'll accept an answer within three bins of the resolution
        assert deviation <= 3 * resolution

    for sr in [11025, 22050]:
        duration = 5.0

        t = np.linspace(0, duration, int(duration * sr))

        for resolution in [1e-2]:
            for bins_per_octave in [12]:
                # test a null-signal tuning estimate
                yield (__test, 0.0, resolution, bins_per_octave, 0.0)

                for center_note in [69, 84, 108]:
                    for tuning in np.linspace(-0.5, 0.5, 8, endpoint=False):
                        target_hz = librosa.midi_to_hz(center_note + tuning)

                        yield (__test, np.asscalar(target_hz), resolution,
                               bins_per_octave, tuning)
Example 47
def extract_cqt(audio_data, fs, hop, note_start, n_notes):
    '''
    Compute a log-magnitude L2-normalized constant-Q-gram of some audio data.

    Parameters
    ----------
    audio_data : np.ndarray
        Audio data to compute CQT of
    fs : int
        Sampling rate of audio
    hop : int
        Hop length for CQT
    note_start : int
        Lowest MIDI note number for CQT
    n_notes : int
        Number of notes to include in the CQT

    Returns
    -------
    cqt : np.ndarray
        Log-magnitude L2-normalized CQT of the supplied audio data.
    frame_times : np.ndarray
        Times, in seconds, of each frame in the CQT
    '''
    # Compute CQT
    cqt = librosa.cqt(
        audio_data, sr=fs, hop_length=hop,
        fmin=librosa.midi_to_hz(note_start), n_bins=n_notes)
    # Transpose so that rows are spectra
    cqt = cqt.T
    # Compute log-amplitude
    cqt = librosa.logamplitude(cqt, ref_power=cqt.max())
    # L2 normalize the columns
    cqt = librosa.util.normalize(cqt, norm=2., axis=1)
    # Compute the time of each frame
    times = librosa.frames_to_time(np.arange(cqt.shape[0]), fs, hop)
    return cqt, times
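A usage sketch for this parametric extract_cqt, with illustrative parameter values (again assuming an older librosa where logamplitude is available):

y, fs = librosa.load('example.wav', sr=22050)  # hypothetical input path
cqt, times = extract_cqt(y, fs=fs, hop=512, note_start=36, n_notes=60)
print(cqt.shape, times.shape)  # (n_frames, 60), (n_frames,)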
Example 48
def audio_cqt(audio_data, fs=AUDIO_FS):
    '''
    Compute some audio data's constant-Q spectrogram, normalize, and log-scale
    it

    Parameters
    ----------
    audio_data : np.ndarray
        Some audio signal.
    fs : int
        Sampling rate the audio data is sampled at, should be ``AUDIO_FS``.

    Returns
    -------
    audio_gram : np.ndarray
        Log-magnitude, L2-normalized constant-Q spectrogram of the supplied
        audio data.
    '''
    # Compute CQT of the synthesized audio data
    audio_gram = librosa.cqt(
        audio_data, sr=fs, hop_length=AUDIO_HOP,
        fmin=librosa.midi_to_hz(NOTE_START), n_bins=N_NOTES)
    # L2-normalize and log-magnitude it
    return post_process_cqt(audio_gram)
Example 49
def midi_cqt(midi_object):
    '''
    Synthesize MIDI data, compute its constant-Q spectrogram, normalize, and
    log-scale it

    Parameters
    ----------
    midi_object : pretty_midi.PrettyMIDI
        MIDI data to create constant-Q spectrogram of.

    Returns
    -------
    midi_gram : np.ndarray
        Log-magnitude, L2-normalized constant-Q spectrogram of synthesized MIDI
        data.
    '''
    # Synthesize MIDI object as audio data
    midi_audio = fast_fluidsynth(midi_object, MIDI_FS)
    # Compute CQT of the synthesized audio data
    midi_gram = librosa.cqt(
        midi_audio, sr=MIDI_FS, hop_length=MIDI_HOP,
        fmin=librosa.midi_to_hz(NOTE_START), n_bins=N_NOTES)
    # L2-normalize and log-magnitude it
    return post_process_cqt(midi_gram)
Example 50
def get_wav(cq, nmin=60, nmax=120, width=5, max_peaks=1, wave=None, n=None):

    # Slice down to the bass range
    cq = cq[nmin:nmax]

    # Pick peaks at each time
    mask = peakgram(librosa.logamplitude(cq**2, top_db=60, ref_power=np.max),
                    max_peaks=max_peaks)

    # Smooth in time
    mask = scipy.ndimage.median_filter(mask,
                                       size=(1, width),
                                       mode='mirror')

    # resynthesize with some magnitude compression
    wav = synthesize(librosa.frames_to_samples(np.arange(cq.shape[-1]),
                                               hop_length=hop_length),
                     mask * cq**(1./3),
                     fmin=librosa.midi_to_hz(nmin + MIDI_MIN),
                     bins_per_octave=12,
                     wave=wave,
                     n=n)[0]

    return wav
Example 51
def extract_features(audio_data):
    '''
    Feature extraction routine - gets beat-synchronous CQT, beats, and bpm

    :parameters:
        - audio_data : np.ndarray
            Audio samples at 22 kHz

    :returns:
        - cqt : np.ndarray
            Beat-synchronous CQT, four octaves, starting from note 36
        - beats : np.ndarray
            Beat locations, in seconds.  Beat tracking is done using CQT
        - bpm : float
            BPM.  The estimated BPM is doubled until it is at least 240.
    '''
    gram = np.abs(librosa.cqt(
        audio_data, fmin=librosa.midi_to_hz(36), n_bins=48))
    # Compute onset envelope from CQT (for speed)
    onset_envelope = librosa.onset.onset_strength(S=gram, aggregate=np.median)
    bpm, beats = librosa.beat.beat_track(onset_envelope=onset_envelope)
    # Double the BPM and interpolate beat locations while the BPM is below 240
    while bpm < 240:
        beat_interp = scipy.interpolate.interp1d(
            np.arange(0, 2*beats.shape[0], 2), beats)
        beats = beat_interp(np.arange(2*beats.shape[0] - 1)).astype(int)
        bpm *= 2
    # Synchronize the CQT to the beats
    sync_gram = librosa.feature.sync(gram, beats)
    # Also compute log amplitude
    sync_gram = librosa.logamplitude(sync_gram, ref_power=sync_gram.max())
    # Transpose so that rows are samples
    sync_gram = sync_gram.T
    # and L2 normalize
    sync_gram = librosa.util.normalize(sync_gram, norm=2., axis=1)
    return sync_gram, librosa.frames_to_time(beats), bpm
Example 52
def align_one_file(mp3_filename, midi_filename, output_midi_filename, output_diagnostics=True, interval=0):
    """
    Helper function for aligning a MIDI file to an audio file.

    :parameters:
        - mp3_filename : str
            Full path to a .mp3 file.
        - midi_filename : str
            Full path to a .mid file.
        - output_midi_filename : str
            Full path to where the aligned .mid file should be written.  If None, don't output.
        - output_diagnostics : bool
            If True, also output a .pdf of figures, a .mat of the alignment results,
            and a .mp3 of audio and synthesized aligned audio
    """
    # Load in the corresponding midi file in the midi directory, and return if there is a problem loading it
    try:
        m = pretty_midi.PrettyMIDI(midi.read_midifile(midi_filename))
    except:
        print "Error loading {}".format(midi_filename)
        return

    print "Aligning {}".format(os.path.split(midi_filename)[1])

    # Cache audio CQT and onset strength

    audio, fs = librosa.load(mp3_filename)
    if use_mp3_data:
        if os.path.exists(to_cqt_npy(mp3_filename)) and os.path.exists(to_onset_strength_npy(mp3_filename)):
            print "Using pre-existing CQT and onset strength data for {}".format(os.path.split(mp3_filename)[1])
            # Create audio CQT, which is just frame-wise power, and onset strength
            audio_gram = np.load(to_cqt_npy(mp3_filename))
            audio_onset_strength = np.load(to_onset_strength_npy(mp3_filename))
        else:
            print "Creating CQT and onset strength signal for {}".format(os.path.split(mp3_filename)[1])
            audio_gram, audio_onset_strength = align_midi.audio_to_cqt_and_onset_strength(audio, fs=fs)
            np.save(to_cqt_npy(mp3_filename), audio_gram)
            np.save(to_onset_strength_npy(mp3_filename), audio_onset_strength)
    else:
        print "Creating CQT and onset strength signal for {}".format(os.path.split(mp3_filename)[1])
        audio_gram, audio_onset_strength = align_midi.audio_to_cqt_and_onset_strength(audio, fs=fs)
        np.save(to_cqt_npy(mp3_filename), audio_gram)
        np.save(to_onset_strength_npy(mp3_filename), audio_onset_strength)

    print "Creating CQT for {}".format(os.path.split(midi_filename)[1])
    # Generate synthetic MIDI CQT
    if piano:
        midi_gram = align_midi.midi_to_piano_cqt(m)
        # log_gram = librosa.logamplitude(midi_gram, ref_power=midi_gram.max())
        # Normalize columns and return
        # midi_gram= librosa.util.normalize(log_gram, axis=0)
        midi_beats, bpm = align_midi.midi_beat_track(m)
        midi_gram = align_midi.post_process_cqt(midi_gram, midi_beats)
    else:
        midi_gram = align_midi.midi_to_cqt(m, SF2_PATH)
        # Get beats
        midi_beats, bpm = align_midi.midi_beat_track(m)
        # Beat synchronize and normalize
        midi_gram = align_midi.post_process_cqt(midi_gram, midi_beats)
    if interval != 0:
        midi_gram = shift_cqt(midi_gram, interval)

    # Compute beats
    midi_beats, bpm = align_midi.midi_beat_track(m)
    audio_beats = librosa.beat.beat_track(onsets=audio_onset_strength, hop_length=512 / 4, bpm=bpm)[1] / 4
    # Beat-align and log/normalize the audio CQT
    audio_gram = align_midi.post_process_cqt(audio_gram, audio_beats)

    similarity_matrix = scipy.spatial.distance.cdist(midi_gram.T, audio_gram.T, metric="cosine")
    p, q, score = align_midi.dpmod(similarity_matrix)

    # Plot log-fs grams
    plt.figure(figsize=(36, 24))
    ax = plt.subplot2grid((4, 3), (0, 0), colspan=3)
    plt.title("MIDI Synthesized")
    librosa.display.specshow(
        midi_gram, x_axis="frames", y_axis="cqt_note", fmin=librosa.midi_to_hz(36), fmax=librosa.midi_to_hz(96)
    )
    ax = plt.subplot2grid((4, 3), (1, 0), colspan=3)
    plt.title("Audio data")
    librosa.display.specshow(
        audio_gram, x_axis="frames", y_axis="cqt_note", fmin=librosa.midi_to_hz(36), fmax=librosa.midi_to_hz(96)
    )

    # Get similarity matrix
    similarity_matrix = scipy.spatial.distance.cdist(midi_gram.T, audio_gram.T, metric="cosine")
    # Get best path through matrix
    p, q, score = align_midi.dpmod(similarity_matrix)

    # Plot distance at each point of the lowest-cost path
    ax = plt.subplot2grid((4, 3), (2, 0), rowspan=2)
    plt.plot([similarity_matrix[p_v, q_v] for p_v, q_v in zip(p, q)])
    plt.title("Distance at each point on lowest-cost path")

    # Plot similarity matrix and best path through it
    ax = plt.subplot2grid((4, 3), (2, 1), rowspan=2)
    plt.imshow(similarity_matrix.T, aspect="auto", interpolation="nearest", cmap=plt.cm.gray)
    tight = plt.axis()
    plt.plot(p, q, "r.", ms=0.2)
    plt.axis(tight)
    plt.title("Similarity matrix and lowest-cost path, cost={}".format(score))

    # Adjust MIDI timing
    m_aligned = align_midi.adjust_midi(m, librosa.frames_to_time(midi_beats)[p], librosa.frames_to_time(audio_beats)[q])

    # Plot alignment
    ax = plt.subplot2grid((4, 3), (2, 2), rowspan=2)
    note_ons = np.array([note.start for instrument in m.instruments for note in instrument.notes])
    aligned_note_ons = np.array([note.start for instrument in m_aligned.instruments for note in instrument.notes])
    plt.plot(note_ons, aligned_note_ons - note_ons, ".")
    plt.xlabel("Original note location (s)")
    plt.ylabel("Shift (s)")
    plt.title("Corrected offset")

    # Write out the aligned file
    if output_midi_filename is not None:
        m_aligned.write(output_midi_filename)

    if output_diagnostics:
        # Save the figures
        plt.savefig(output_midi_filename.replace(".mid", ".pdf"))
        if write_mp3:
            # Load in the audio data (needed for writing out)
            audio, fs = librosa.load(mp3_filename, sr=None)
            # Synthesize the aligned midi
            # midi_audio_aligned = m_aligned.fluidsynth()
            midi_audio_aligned = m_aligned.fluidsynth(fs=fs, sf2_path=SF2_PATH)

            # Trim to the same size as audio
            if midi_audio_aligned.shape[0] > audio.shape[0]:
                midi_audio_aligned = midi_audio_aligned[: audio.shape[0]]
            else:
                midi_audio_aligned = np.append(
                    midi_audio_aligned, np.zeros(audio.shape[0] - midi_audio_aligned.shape[0])
                )
            # Write out to temporary .wav file
            librosa.output.write_wav(
                output_midi_filename.replace(".mid", ".wav"), np.vstack([midi_audio_aligned, audio]).T, fs
            )
            # Convert to mp3
            subprocess.check_output(
                [
                    "ffmpeg",
                    "-i",
                    output_midi_filename.replace(".mid", ".wav"),
                    "-ab",
                    "128k",
                    "-y",
                    output_midi_filename.replace(".mid", ".mp3"),
                ]
            )
            # Remove temporary .wav file
            os.remove(output_midi_filename.replace(".mid", ".wav"))
            # Save a .mat of the results
            scipy.io.savemat(
                output_midi_filename.replace(".mid", ".mat"),
                {"similarity_matrix": similarity_matrix, "p": p, "q": q, "score": score},
            )
    # If we aren't outputting a .pdf, show the plot
    else:
        plt.show()
    plt.close()
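
# align_midi.dpmod() is used above to find the lowest-cost path through the
# cosine-distance matrix, but its implementation is not shown in these
# examples.  The sketch below only illustrates that idea as a plain DTW-style
# dynamic program; it is NOT the actual align_midi.dpmod code, and the
# function name and the score normalization are assumptions.  numpy is
# assumed to be imported as np, as elsewhere in these examples.
def lowest_cost_path_sketch(distance_matrix):
    '''Illustrative DTW: returns index arrays (p, q) and a path cost.'''
    n, m = distance_matrix.shape
    # Accumulated cost with the usual three DTW step types.
    D = np.empty((n, m))
    D.fill(np.inf)
    D[0, 0] = distance_matrix[0, 0]
    for i in range(n):
        for j in range(m):
            if i == 0 and j == 0:
                continue
            candidates = []
            if i > 0:
                candidates.append(D[i - 1, j])
            if j > 0:
                candidates.append(D[i, j - 1])
            if i > 0 and j > 0:
                candidates.append(D[i - 1, j - 1])
            D[i, j] = distance_matrix[i, j] + min(candidates)
    # Backtrack from the end of the matrix to recover the path.
    i, j = n - 1, m - 1
    p, q = [i], [j]
    while i > 0 or j > 0:
        steps = []
        if i > 0 and j > 0:
            steps.append((D[i - 1, j - 1], i - 1, j - 1))
        if i > 0:
            steps.append((D[i - 1, j], i - 1, j))
        if j > 0:
            steps.append((D[i, j - 1], i, j - 1))
        _, i, j = min(steps)
        p.append(i)
        q.append(j)
    p.reverse()
    q.reverse()
    # Normalize the total cost by the path length (an assumption).
    return np.array(p), np.array(q), D[-1, -1] / len(p)
# Usage mirrors the call above:
#     p, q, score = lowest_cost_path_sketch(similarity_matrix)
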
def features(filename):
    '''Extract feature data for spectral clustering segmentation.

    :parameters:
        - filename : str
            Path on disk to an audio file

    :returns:
        - X_cqt : np.ndarray [shape=(d1, n)]
          A beat-synchronous log-power CQT matrix

        - X_timbre : np.ndarray [shape=(d2, n)]
          A beat-synchronous MFCC matrix

        - beat_times : np.ndarray [shape=(n, 2)]
          Timing of beat intervals
    '''
    print('\t[1/5] loading audio')
    y, sr = librosa.load(filename, sr=None)
    y = librosa.resample(y, sr, SR, res_type='sinc_fastest')
    sr = SR

    print('\t[2/5] Separating harmonic and percussive signals')
    y_harm, y_perc = librosa.effects.hpss(y)

    print('\t[3/5] detecting beats')
    bpm, beats = get_beats(y=y_perc, sr=sr, hop_length=HOP_LENGTH)

    print('\t[4/5] generating CQT')
    X_cqt = librosa.cqt(y=y_harm,
                        sr=sr,
                        hop_length=HOP_LENGTH,
                        bins_per_octave=12,
                        fmin=librosa.midi_to_hz(24),
                        n_bins=72)

    # Compute log CQT power
    X_cqt = librosa.logamplitude(X_cqt**2.0, ref_power=np.max)

    # Compute MFCCs
    print('\t[5/5] generating MFCC')
    X_melspec = librosa.feature.melspectrogram(y=y,
                                               sr=sr,
                                               hop_length=HOP_LENGTH,
                                               n_mels=N_MELS)

    X_timbre = librosa.feature.mfcc(S=librosa.logamplitude(X_melspec),
                                    n_mfcc=N_MFCC)

    # Resolve any timing discrepancies due to CQT downsampling
    n = min(X_cqt.shape[1], X_timbre.shape[1])

    # Trim the beat detections to fit within the shape of X*
    beats = beats[beats < n]

    # Pad on a frame=0 beat for synchronization purposes
    beats = np.unique(np.concatenate([[0], beats]))

    # Convert beat frames to beat times
    beat_times = librosa.frames_to_time(beats, sr=sr, hop_length=HOP_LENGTH)

    # Tack on an end-of-track marker.  This is necessary if we want
    # the output intervals to span the entire track.
    beat_times = np.concatenate([beat_times, [float(len(y)) / sr]])

    beat_intervals = np.c_[beat_times[:-1], beat_times[1:]]

    # Synchronize the feature matrices
    X_cqt = librosa.feature.sync(X_cqt, beats, aggregate=np.median)
    X_timbre = librosa.feature.sync(X_timbre, beats, aggregate=np.mean)

    return X_cqt, X_timbre, beat_intervals
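
# A minimal sketch (not from the original code) of consuming features() for a
# rough segmentation: beats are clustered on the stacked CQT/timbre features,
# and label changes are taken as candidate segment boundaries.  The choice of
# k-means with 4 clusters is an illustrative assumption; the module constants
# (SR, HOP_LENGTH, N_MELS, N_MFCC) and get_beats() must be defined as in the
# rest of this example, and numpy is assumed to be imported as np.
def segment_sketch(filename, n_types=4):
    import sklearn.cluster

    X_cqt, X_timbre, beat_intervals = features(filename)

    # One observation per beat: stack the CQT and timbre features.
    X = np.vstack([X_cqt, X_timbre]).T
    n = min(len(X), len(beat_intervals))

    # Cluster beats; changes in the label sequence are candidate boundaries.
    labels = sklearn.cluster.KMeans(n_clusters=n_types).fit_predict(X[:n])
    changes = np.flatnonzero(np.diff(labels)) + 1

    # Return boundary times (start of the beat where the label changes).
    return beat_intervals[:n][changes, 0]
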
Example no. 54
def logfrequency(sr, n_fft, n_bins=84, bins_per_octave=12, tuning=0.0,
                 fmin=None, spread=0.125):
    '''Approximate a constant-Q filterbank for a fixed-window STFT.

    Each filter is a log-normal window centered at the corresponding frequency.

    :usage:
        >>> # Simple log frequency filters
        >>> logfs_fb = librosa.filters.logfrequency(22050, 4096)

        >>> # Use a narrower frequency range
        >>> logfs_fb = librosa.filters.logfrequency(22050, 4096,
                                                    n_bins=48, fmin=110)

        >>> # Use narrower filters for sparser response: 5% of a semitone
        >>> logfs_fb = librosa.filters.logfrequency(22050, 4096, spread=0.05)
        >>> # Or wider: 50% of a semitone
        >>> logfs_fb = librosa.filters.logfrequency(22050, 4096, spread=0.5)

    :parameters:
      - sr : int > 0
          audio sampling rate

      - n_fft : int > 0
          FFT window size

      - n_bins : int > 0
          Number of bins.  Defaults to 84 (7 octaves).

      - bins_per_octave : int > 0
          Number of bins per octave. Defaults to 12 (semitones).

      - tuning : None or float in [-0.5, +0.5]
          Tuning correction parameter, in fractions of a bin.

      - fmin : float > 0
          Minimum frequency bin. Defaults to ``C2 ~= 32.70``

      - spread : float > 0
          Spread of each filter, as a fraction of a bin.

    :returns:
      - C : np.ndarray, shape=(n_bins, 1 + n_fft/2)
          log-frequency filter bank.
    '''

    if fmin is None:
        fmin = librosa.midi_to_hz(librosa.note_to_midi('C2'))

    # Apply tuning correction
    correction = 2.0**(float(tuning) / bins_per_octave)

    # What's the shape parameter for our log-normal filters?
    sigma = float(spread) / bins_per_octave

    # Construct the output matrix
    basis = np.zeros((n_bins, 1 + n_fft/2))

    # Get log frequencies of bins
    log_freqs = np.log2(librosa.fft_frequencies(sr, n_fft)[1:])

    for i in range(n_bins):
        # What's the center (median) frequency of this filter?
        c_freq = correction * fmin * (2.0**(float(i)/bins_per_octave))

        # Place a log-normal window around c_freq
        basis[i, 1:] = np.exp(-0.5 * ((log_freqs - np.log2(c_freq)) / sigma)**2
                              - np.log2(sigma) - log_freqs)

        # Normalize each filter
        c_norm = np.sqrt(np.sum(basis[i]**2))
        if c_norm > 0:
            basis[i] = basis[i] / c_norm

    return basis
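
# A short sketch of how the basis returned by logfrequency() might be applied:
# project an STFT magnitude spectrogram onto the filterbank to obtain a
# log-frequency spectrogram.  The parameter values below are illustrative
# assumptions, librosa.logamplitude is the older API used throughout these
# examples, and numpy is assumed to be imported as np.
def logfrequency_spectrogram_sketch(filename, sr=22050, n_fft=4096):
    y, sr = librosa.load(filename, sr=sr)

    # Magnitude STFT, shape (1 + n_fft/2, n_frames)
    S = np.abs(librosa.stft(y, n_fft=n_fft))

    # Filterbank of shape (n_bins, 1 + n_fft/2); project the STFT onto it.
    basis = logfrequency(sr, n_fft)
    logfs = basis.dot(S)

    # Log-power, relative to the peak, as in the other examples here.
    return librosa.logamplitude(logfs**2, ref_power=np.max)
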
def align_one_file(mp3_filename, midi_filename, output_midi_filename, output_diagnostics=True):
    '''
    Helper function for aligning a MIDI file to an audio file.

    :parameters:
        - mp3_filename : str
            Full path to a .mp3 file.
        - midi_filename : str
            Full path to a .mid file.
        - output_midi_filename : str
            Full path to where the aligned .mid file should be written.  If None, don't output.
        - output_diagnostics : bool
            If True, also output a .pdf of figures, a .mat of the alignment results,
            and a .mp3 of audio and synthesized aligned audio
    '''
    # Load in the corresponding midi file in the midi directory, and return if there is a problem loading it
    try:
        m = pretty_midi.PrettyMIDI(midi_filename)
    except:
        print "Error loading {}".format(midi_filename)
        return

    print "Aligning {}".format(os.path.split(midi_filename)[1])

    #check if output path exists, and create it if necessary
    if not os.path.exists(os.path.split(output_midi_filename)[0]):
      os.makedirs(os.path.split(output_midi_filename)[0])

    audio, fs = librosa.load(mp3_filename)
    if use_prev_data:
      if chroma:
        if os.path.exists(to_chroma_npy(mp3_filename)) and os.path.exists(to_onset_strength_npy(mp3_filename)):
          audio_gram = np.load(to_chroma_npy(mp3_filename))
          audio_onset_strength = np.load(to_onset_strength_npy(mp3_filename))
        else:
          print "Generating chroma features for {}".format(mp3_filename)
          audio_gram, audio_onset_strength = align_midi.audio_to_chroma_and_onset_strength(audio, fs = fs)
          np.save(to_chroma_npy(mp3_filename), audio_gram)
          np.save(to_onset_strength_npy(mp3_filename), audio_onset_strength)
      else:
        if os.path.exists(to_cqt_npy(mp3_filename)) and os.path.exists(to_onset_strength_npy(mp3_filename)):
          print "Using pre-existing CQT and onset strength data for {}".format(os.path.split(mp3_filename)[1])
          # Create audio CQT, which is just frame-wise power, and onset strength
          audio_gram = np.load(to_cqt_npy(mp3_filename))
          audio_onset_strength = np.load(to_onset_strength_npy(mp3_filename))
        else:
          print "Creating CQT and onset strength signal for {}".format(os.path.split(mp3_filename)[1])
          audio_gram, audio_onset_strength = align_midi.audio_to_cqt_and_onset_strength(audio, fs=fs)
          np.save(to_cqt_npy(mp3_filename), audio_gram)
          np.save(to_onset_strength_npy(mp3_filename), audio_onset_strength)
    else:
      print "Creating CQT and onset strength signal for {}".format(os.path.split(mp3_filename)[1])
      if chroma:
        audio_gram, audio_onset_strength = align_midi.audio_to_chroma_and_onset_strength(audio, fs = fs)
        np.save(to_chroma_npy(mp3_filename), audio_gram)
        np.save(to_onset_strength_npy(mp3_filename), audio_onset_strength)
      else:
        audio_gram, audio_onset_strength = align_midi.audio_to_cqt_and_onset_strength(audio, fs=fs)
        np.save(to_cqt_npy(mp3_filename), audio_gram)
        np.save(to_onset_strength_npy(mp3_filename), audio_onset_strength)

    if use_prev_data and not make_midi_info:
      if piano:
        if os.path.exists(to_piano_cqt_npy(midi_filename)):
          midi_gram = np.load(to_piano_cqt_npy(midi_filename))
        else:
          print "Creating CQT for {}".format(os.path.split(midi_filename)[1])
          midi_gram = make_midi_cqt(midi_filename, piano,chroma, m)
      elif chroma:
        if os.path.exists(to_chroma_npy(midi_filename)):
          midi_gram = np.load(to_chroma_npy(midi_filename))
        else:
          print "Creating CQT for {}".format(os.path.split(midi_filename)[1])
          midi_gram = make_midi_cqt(midi_filename, piano,chroma, m)
      else:
        if os.path.exists(to_cqt_npy(midi_filename)):
          midi_gram = np.load(to_cqt_npy(midi_filename))
        else:
          print "Creating CQT for {}".format(os.path.split(midi_filename)[1])
          midi_gram = make_midi_cqt(midi_filename, piano,chroma, m)
    else:
      print "Creating CQT for {}".format(os.path.split(midi_filename)[1])
      # Generate synthetic MIDI CQT
      midi_gram = make_midi_cqt(midi_filename, piano,chroma, m)
    if piano:
      # midi_gram = align_midi.accentuate_onsets(midi_gram)
      midi_gram = align_midi.piano_roll_fuzz(midi_gram)
      # midi_gram = align_midi.clean_audio_gram(midi_gram, threshold = np.percentile(midi_gram,40))
      midi_gram = librosa.util.normalize(midi_gram, axis = 0)

    # Compute beats
    midi_beats, bpm = align_midi.midi_beat_track(m)
    audio_beats = librosa.beat.beat_track(onset_envelope=audio_onset_strength, hop_length=512/4, bpm=bpm)[1]/4
    # Beat-align and log/normalize the audio CQT
    audio_gram = align_midi.post_process_cqt(audio_gram, audio_beats)

    # Plot log-fs grams
    # audio_gram = align_midi.clean_audio_gram(audio_gram, threshold = np.percentile(audio_gram, 80))
    plt.figure(figsize=(36, 24))
    ax = plt.subplot2grid((4, 3), (0, 0), colspan=3)
    plt.title('MIDI Synthesized')
    librosa.display.specshow(midi_gram,
                             x_axis='frames',
                             y_axis='cqt_note',
                             fmin=librosa.midi_to_hz(36),
                             fmax=librosa.midi_to_hz(96))
    ax = plt.subplot2grid((4, 3), (1, 0), colspan=3)
    plt.title('Audio data')
    librosa.display.specshow(audio_gram,
                             x_axis='frames',
                             y_axis='cqt_note',
                             fmin=librosa.midi_to_hz(36),
                             fmax=librosa.midi_to_hz(96))


    # Get similarity matrix
    similarity_matrix = scipy.spatial.distance.cdist(midi_gram.T, audio_gram.T, metric='cosine')
    # Get best path through matrix
    p, q, score = align_midi.dpmod(similarity_matrix,experimental = False, forceH = False)

    # Plot distance at each point of the lowest-cost path
    ax = plt.subplot2grid((4, 3), (2, 0), rowspan=2)
    plt.plot([similarity_matrix[p_v, q_v] for p_v, q_v in zip(p, q)])
    plt.title('Distance at each point on lowest-cost path')


    # Plot similarity matrix and best path through it
    ax = plt.subplot2grid((4, 3), (2, 1), rowspan=2)
    plt.imshow(similarity_matrix.T,
               aspect='auto',
               interpolation='nearest',
               cmap=plt.cm.gray)
    tight = plt.axis()
    plt.plot(p, q, 'r.', ms=.2)
    plt.axis(tight)
    plt.title('Similarity matrix and lowest-cost path, cost={}'.format(score))

    # Adjust MIDI timing
    m_aligned = align_midi.adjust_midi(m, librosa.frames_to_time(midi_beats)[p], librosa.frames_to_time(audio_beats)[q])

    # Plot alignment
    ax = plt.subplot2grid((4, 3), (2, 2), rowspan=2)
    note_ons = np.array([note.start for instrument in m.instruments for note in instrument.notes])
    aligned_note_ons = np.array([note.start for instrument in m_aligned.instruments for note in instrument.notes])
    plt.plot(note_ons, aligned_note_ons - note_ons, '.')
    plt.xlabel('Original note location (s)')
    plt.ylabel('Shift (s)')
    plt.title('Corrected offset')

    # Write out the aligned file
    if output_midi_filename is not None:
        m_aligned.write(output_midi_filename)

    if output_diagnostics:
        # Save the figures
        plt.savefig(output_midi_filename.replace('.mid', '.pdf'))
        if write_mp3:
          # Load in the audio data (needed for writing out)
          audio, fs = librosa.load(mp3_filename, sr=None)
          # Synthesize the aligned midi
          midi_audio_aligned = m_aligned.fluidsynth(fs=fs, sf2_path=SF2_PATH)

          # Trim to the same size as audio
          if midi_audio_aligned.shape[0] > audio.shape[0]:
              midi_audio_aligned = midi_audio_aligned[:audio.shape[0]]
          else:
              midi_audio_aligned = np.append(midi_audio_aligned, np.zeros(audio.shape[0] - midi_audio_aligned.shape[0]))
          # Write out to temporary .wav file
          librosa.output.write_wav(output_midi_filename.replace('.mid', '.wav'),
                                   np.vstack([midi_audio_aligned, audio]).T, fs)
          # Convert to mp3
          subprocess.check_output(['ffmpeg',
                           '-i',
                           output_midi_filename.replace('.mid', '.wav'),
                           '-ab',
                           '128k',
                           '-y',
                           output_midi_filename.replace('.mid', '.mp3')])
          # Remove temporary .wav file
          os.remove(output_midi_filename.replace('.mid', '.wav'))
          # Save a .mat of the results
          scipy.io.savemat(output_midi_filename.replace('.mid', '.mat'),
                           {'similarity_matrix': similarity_matrix,
                            'p': p, 'q': q, 'score': score})
    # If we aren't outputting a .pdf, show the plot
    else:
        plt.show()
    plt.close()
Example no. 56
import sklearn
import sklearn.cluster
import sklearn.pipeline

import matplotlib.pyplot as plt
import seaborn
seaborn.set(style='ticks')


# We'll build the feature pipeline object here

# First stage is a mel-frequency spectrogram of bounded range
MelSpec = librosa.util.FeatureExtractor(librosa.feature.melspectrogram, 
                                        n_fft=2048,
                                        n_mels=128,
                                        fmax=librosa.midi_to_hz(116), 
                                        fmin=librosa.midi_to_hz(24))

# Second stage is log-amplitude; power is relative to peak in the signal
LogAmp = librosa.util.FeatureExtractor(librosa.logamplitude, 
                                       ref_power=np.max)


# Third stage transposes the data so that frames become samples
Transpose = librosa.util.FeatureExtractor(np.transpose)

# Last stage stacks all samples together into one matrix for training
Stack = librosa.util.FeatureExtractor(np.vstack, iterate=False)

# Now, build a learning object.  We'll use mini-batch k-means with default parameters.
C = sklearn.cluster.MiniBatchKMeans()
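
# One plausible way (an assumption, not necessarily the original notebook) to
# chain the stages above into a single sklearn pipeline: each FeatureExtractor
# maps over a list of audio signals, Stack collapses the results into one
# matrix, and the k-means object is the final estimator.
ClusterPipe = sklearn.pipeline.Pipeline([('Mel spectrogram', MelSpec),
                                         ('Log amplitude', LogAmp),
                                         ('Transpose', Transpose),
                                         ('Stack', Stack),
                                         ('Cluster', C)])

# Fit the whole pipeline on a few training signals (paths are illustrative)...
# training_audio = [librosa.load(path)[0] for path in ['a.mp3', 'b.mp3']]
# ClusterPipe.fit(training_audio)
#
# ...and predict frame-level cluster assignments for a new signal.
# y, sr = librosa.load('test.mp3')
# frame_labels = ClusterPipe.predict([y])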
Example no. 57
SR          = 22050
N_FFT       = 2048
HOP_LENGTH  = 512
HOP_BEATS   = 64
N_MELS      = 128
FMAX        = 8000

REP_WIDTH   = 3
REP_FILTER  = 7

N_MFCC      = 32
N_CHROMA    = 12
N_REP       = 32

NOTE_MIN    = librosa.midi_to_hz(24) # 32Hz
NOTE_NUM    = 84
NOTE_RES    = 2                     # CQT filter resolution

# mfcc, chroma, repetitions for each, and 4 time features
__DIMENSION = N_MFCC + N_CHROMA + 2 * N_REP + 4

def features(filename):
    '''Feature-extraction for audio segmentation
    Arguments:
        filename -- str
        path to the input song

    Returns:
        - X -- ndarray
            
def test_midi_to_hz():

    assert np.allclose(librosa.midi_to_hz([33, 45, 57, 69]),
                       [55, 110, 220, 440])
Example no. 59
def constant_q(sr, fmin=None, n_bins=84, bins_per_octave=12, tuning=0.0,
               window=None, resolution=2, pad=False, **kwargs):
    r'''Construct a constant-Q basis.

    :usage:
        >>> # Change the windowing function to Hamming instead of Hann
        >>> basis   = librosa.filters.constant_q(22050, window=np.hamming)

        >>> # Use a longer window for each filter
        >>> basis   = librosa.filters.constant_q(22050, resolution=3)

        >>> # Pad the basis to fixed length
        >>> basis   = librosa.filters.constant_q(22050, pad=True)

    :parameters:
      - sr : int > 0
          Audio sampling rate

      - fmin : float > 0
          Minimum frequency bin. Defaults to ``C2 ~= 32.70``

      - n_bins : int > 0
          Number of frequencies.  Defaults to 7 octaves (84 bins).

      - bins_per_octave : int > 0
          Number of bins per octave

      - tuning : float in [-0.5, +0.5)
          Tuning deviation from A440 in fractions of a bin

      - window : function or ``None``
          Windowing function to apply to filters.
          If ``None``, a Hann window (``scipy.signal.hann``) is used.

      - resolution : float > 0
          Resolution of filter windows. Larger values use longer windows.

      - pad : boolean
          Pad all filters to have a constant width (equal to the longest filter).
          By default, padding is done with zeros, but this can be overridden
          by setting the ``mode=`` field in *kwargs*.

      - *kwargs*
          Additional keyword arguments to ``np.pad()`` when ``pad==True``.

      .. note::
        - McVicar, Matthew. "A machine learning approach to automatic chord
          extraction." Dissertation, University of Bristol. 2013.

    :returns:
      - filters : list of np.ndarray, ``len(filters) == n_bins``
          ``filters[i]`` is ``i``\ th CQT basis filter (in the time-domain)
    '''

    if fmin is None:
        fmin = librosa.midi_to_hz(librosa.note_to_midi('C2'))

    if window is None:
        window = scipy.signal.hann

    correction = 2.0**(float(tuning) / bins_per_octave)

    fmin = correction * fmin

    # Q should be capitalized here, so we suppress the name warning
    # pylint: disable=invalid-name
    Q = float(resolution) / (2.0**(1. / bins_per_octave) - 1)

    filters = []
    for i in np.arange(n_bins, dtype=float):
        # Length of this filter
        ilen = np.ceil(Q * sr / (fmin * 2.0**(i / bins_per_octave)))

        # Build the filter
        win = np.exp(Q * 1j * np.linspace(0, 2 * np.pi, ilen, endpoint=False))

        # Apply the windowing function
        if window is not None:
            win = win * window(ilen)

        # Normalize
        win = librosa.util.normalize(win, norm=2)

        filters.append(win)

    if pad:
        max_len = max(map(len, filters))

        # Pad with zeros, unless another mode is specified via **kwargs
        for i in range(len(filters)):
            filters[i] = librosa.util.pad_center(filters[i], max_len, **kwargs)

    return filters
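
# Illustrative use of the time-domain filters returned by constant_q(): the
# response of a single CQT bin can be approximated by convolving the signal
# with the conjugated, time-reversed filter and sampling every hop_length
# samples.  This is only a sketch of the idea -- not how librosa.cqt is
# implemented -- and the bin index and hop length are assumptions.  numpy and
# scipy.signal are assumed to be imported as in the function above.
def single_bin_cqt_sketch(filename, bin_index=40, sr=22050, hop_length=512):
    y, sr = librosa.load(filename, sr=sr)

    filters = constant_q(sr)
    win = filters[bin_index]

    # Correlate the signal with this basis filter and take frame-wise magnitude.
    response = scipy.signal.fftconvolve(y, np.conj(win[::-1]), mode='same')
    return np.abs(response[::hop_length])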