def make_midi_cqt(midi_filename, piano, chroma, midi_info=None):
    """Compute a beat-synchronized spectral representation of a MIDI file
    (piano-roll CQT, chromagram, or synthesized-audio CQT), cache it as a
    .npy file, and return it."""
    if midi_info is None:
        midi_info = pretty_midi.PrettyMIDI(midi_filename)
    if piano:
        print "Generating CQT with piano roll"
        midi_gram = align_midi.midi_to_piano_cqt(midi_info)
        midi_beats, bpm = align_midi.midi_beat_track(midi_info)
        midi_gram = align_midi.post_process_cqt(midi_gram, midi_beats)
        np.save(to_piano_cqt_npy(midi_filename), midi_gram)
        return midi_gram
    elif chroma:
        chroma_gram = align_midi.midi_to_chroma(midi_info)
        midi_beats, bpm = align_midi.midi_beat_track(midi_info)
        chroma_gram = align_midi.post_process_cqt(chroma_gram, midi_beats)
        np.save(to_chroma_npy(midi_filename), chroma_gram)
        return chroma_gram
    else:
        midi_gram = align_midi.midi_to_cqt(midi_info, SF2_PATH)
        # Get beats
        midi_beats, bpm = align_midi.midi_beat_track(midi_info)
        # Beat synchronize and normalize
        midi_gram = align_midi.post_process_cqt(midi_gram, midi_beats)
        np.save(to_cqt_npy(midi_filename), midi_gram)
        return midi_gram
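For context, a minimal sketch of how this helper might be driven. The path helpers (to_cqt_npy and friends), align_midi, and SF2_PATH are module-level assumptions carried over from the code above, and 'song.mid' is a hypothetical input path:

# Usage sketch (assumptions: align_midi, SF2_PATH, and the to_*_npy path
# helpers are defined at module level; 'song.mid' is a placeholder path).
import pretty_midi

midi_file = 'song.mid'
# Parse once and reuse, so repeated calls share one PrettyMIDI object
info = pretty_midi.PrettyMIDI(midi_file)
gram = make_midi_cqt(midi_file, piano=False, chroma=False, midi_info=info)
print gram.shape  # (n_bins, n_beats) after beat-synchronization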
Example #3
def align_one_file(mp3_filename, midi_filename, output_midi_filename, output_diagnostics=True, interval=0):
    """
    Helper function for aligning a MIDI file to an audio file.

    :parameters:
        - mp3_filename : str
            Full path to a .mp3 file.
        - midi_filename : str
            Full path to a .mid file.
        - output_midi_filename : str
            Full path to where the aligned .mid file should be written.  If None, don't output.
        - output_diagnostics : bool
            If True, also output a .pdf of figures, a .mat of the alignment results,
            and (when the module-level write_mp3 flag is set) a .mp3 of the audio
            mixed with the synthesized aligned audio
        - interval : int
            Number of CQT bins (semitones) to shift the synthesized MIDI CQT by
            before aligning; 0 means no shift
    """
    # Load in the corresponding midi file in the midi directory, and return if there is a problem loading it
    try:
        m = pretty_midi.PrettyMIDI(midi.read_midifile(midi_filename))
    except Exception:
        print "Error loading {}".format(midi_filename)
        return

    print "Aligning {}".format(os.path.split(midi_filename)[1])

    # Cache audio CQT and onset strength
    audio, fs = librosa.load(mp3_filename)
    if use_mp3_data and os.path.exists(to_cqt_npy(mp3_filename)) and os.path.exists(to_onset_strength_npy(mp3_filename)):
        print "Using pre-existing CQT and onset strength data for {}".format(os.path.split(mp3_filename)[1])
        # Load the cached audio CQT (frame-wise power) and onset strength
        audio_gram = np.load(to_cqt_npy(mp3_filename))
        audio_onset_strength = np.load(to_onset_strength_npy(mp3_filename))
    else:
        print "Creating CQT and onset strength signal for {}".format(os.path.split(mp3_filename)[1])
        audio_gram, audio_onset_strength = align_midi.audio_to_cqt_and_onset_strength(audio, fs=fs)
        np.save(to_cqt_npy(mp3_filename), audio_gram)
        np.save(to_onset_strength_npy(mp3_filename), audio_onset_strength)

    print "Creating CQT for {}".format(os.path.split(midi_filename)[1])
    # Generate synthetic MIDI CQT
    if piano:
        midi_gram = align_midi.midi_to_piano_cqt(m)
        # log_gram = librosa.logamplitude(midi_gram, ref_power=midi_gram.max())
        # Normalize columns and return
        # midi_gram= librosa.util.normalize(log_gram, axis=0)
        midi_beats, bpm = align_midi.midi_beat_track(m)
        midi_gram = align_midi.post_process_cqt(midi_gram, midi_beats)
    else:
        midi_gram = align_midi.midi_to_cqt(m, SF2_PATH)
        # Get beats
        midi_beats, bpm = align_midi.midi_beat_track(m)
        # Beat synchronize and normalize
        midi_gram = align_midi.post_process_cqt(midi_gram, midi_beats)
    if interval != 0:
        midi_gram = shift_cqt(midi_gram, interval)

    # Compute audio beats (midi_beats and bpm were already set above).
    # Beat-track at 4x frame resolution (hop_length 512 / 4 = 128), then
    # divide the frame indices by 4 to land on the CQT's hop-512 frame grid.
    audio_beats = librosa.beat.beat_track(onsets=audio_onset_strength, hop_length=512 / 4, bpm=bpm)[1] / 4
    # Beat-align and log/normalize the audio CQT
    audio_gram = align_midi.post_process_cqt(audio_gram, audio_beats)

    # Plot log-fs grams
    plt.figure(figsize=(36, 24))
    ax = plt.subplot2grid((4, 3), (0, 0), colspan=3)
    plt.title("MIDI Synthesized")
    librosa.display.specshow(
        midi_gram, x_axis="frames", y_axis="cqt_note", fmin=librosa.midi_to_hz(36), fmax=librosa.midi_to_hz(96)
    )
    ax = plt.subplot2grid((4, 3), (1, 0), colspan=3)
    plt.title("Audio data")
    librosa.display.specshow(
        audio_gram, x_axis="frames", y_axis="cqt_note", fmin=librosa.midi_to_hz(36), fmax=librosa.midi_to_hz(96)
    )

    # Get similarity matrix
    similarity_matrix = scipy.spatial.distance.cdist(midi_gram.T, audio_gram.T, metric="cosine")
    # Get best path through matrix
    p, q, score = align_midi.dpmod(similarity_matrix)

    # Plot distance at each point of the lowest-cost path
    ax = plt.subplot2grid((4, 3), (2, 0), rowspan=2)
    plt.plot([similarity_matrix[p_v, q_v] for p_v, q_v in zip(p, q)])
    plt.title("Distance at each point on lowest-cost path")

    # Plot similarity matrix and best path through it
    ax = plt.subplot2grid((4, 3), (2, 1), rowspan=2)
    plt.imshow(similarity_matrix.T, aspect="auto", interpolation="nearest", cmap=plt.cm.gray)
    tight = plt.axis()
    plt.plot(p, q, "r.", ms=0.2)
    plt.axis(tight)
    plt.title("Similarity matrix and lowest-cost path, cost={}".format(score))

    # Adjust MIDI timing
    m_aligned = align_midi.adjust_midi(m, librosa.frames_to_time(midi_beats)[p], librosa.frames_to_time(audio_beats)[q])

    # Plot alignment
    ax = plt.subplot2grid((4, 3), (2, 2), rowspan=2)
    note_ons = np.array([note.start for instrument in m.instruments for note in instrument.notes])
    aligned_note_ons = np.array([note.start for instrument in m_aligned.instruments for note in instrument.notes])
    plt.plot(note_ons, aligned_note_ons - note_ons, ".")
    plt.xlabel("Original note location (s)")
    plt.ylabel("Shift (s)")
    plt.title("Corrected offset")

    # Write out the aligned file
    if output_midi_filename is not None:
        m_aligned.write(output_midi_filename)

    if output_diagnostics:
        # Save the figures
        plt.savefig(output_midi_filename.replace(".mid", ".pdf"))
        if write_mp3:
            # Load in the audio data (needed for writing out)
            audio, fs = librosa.load(mp3_filename, sr=None)
            # Synthesize the aligned midi
            # midi_audio_aligned = m_aligned.fluidsynth()
            midi_audio_aligned = m_aligned.fluidsynth(fs=fs, sf2_path=SF2_PATH)

            # Trim to the same size as audio
            if midi_audio_aligned.shape[0] > audio.shape[0]:
                midi_audio_aligned = midi_audio_aligned[: audio.shape[0]]
            else:
                midi_audio_aligned = np.append(
                    midi_audio_aligned, np.zeros(audio.shape[0] - midi_audio_aligned.shape[0])
                )
            # Write out to temporary .wav file
            librosa.output.write_wav(
                output_midi_filename.replace(".mid", ".wav"), np.vstack([midi_audio_aligned, audio]).T, fs
            )
            # Convert to mp3
            subprocess.check_output(
                [
                    "ffmpeg",
                    "-i",
                    output_midi_filename.replace(".mid", ".wav"),
                    "-ab",
                    "128k",
                    "-y",
                    output_midi_filename.replace(".mid", ".mp3"),
                ]
            )
            # Remove temporary .wav file
            os.remove(output_midi_filename.replace(".mid", ".wav"))
        # Save a .mat of the results (whenever diagnostics are requested,
        # not only when an .mp3 is written, matching the docstring)
        scipy.io.savemat(
            output_midi_filename.replace(".mid", ".mat"),
            {"similarity_matrix": similarity_matrix, "p": p, "q": q, "score": score},
        )
    # If we aren't outputting a .pdf, show the plot
    else:
        plt.show()
    plt.close()
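Example #3 calls shift_cqt, which is not defined on this page. A minimal reconstruction under stated assumptions: interval counts CQT bins (one bin per semitone), positive values shift up in pitch, and the bins vacated by the shift are zero-filled rather than wrapped.

import numpy as np

def shift_cqt(cqt, interval):
    # Hypothetical reconstruction: shift a (bins x frames) CQT matrix by
    # `interval` bins along the frequency axis, zero-padding the rows
    # that the shift vacates.
    shifted = np.zeros_like(cqt)
    if interval > 0:
        shifted[interval:, :] = cqt[:-interval, :]
    elif interval < 0:
        shifted[:interval, :] = cqt[-interval:, :]
    else:
        shifted = cqt.copy()
    return shifted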
Example #5
def align_one_file(mp3_filename, midi_filename, output_midi_filename, output_diagnostics=True):
    '''
    Helper function for aligning a MIDI file to an audio file.
    
    :parameters:
        - mp3_filename : str
            Full path to a .mp3 file.
        - midi_filename : str
            Full path to a .mid file.
        - output_midi_filename : str
            Full path to where the aligned .mid file should be written.  If None, don't output.
        - output_diagnostics : bool
            If True, also output a .pdf of figures, a .mat of the alignment results,
            and a stereo .ogg of the audio mixed with the synthesized aligned audio
    '''
    # Load in the corresponding midi file in the midi directory, and return if there is a problem loading it
    try:
        m = pretty_midi.PrettyMIDI(midi.read_midifile(midi_filename))
    except Exception:
        print "Error loading {}".format(midi_filename)
        return
        
    print "Aligning {}".format(os.path.split(midi_filename)[1])
    
    # Cache audio CQT and onset strength
    if not os.path.exists(to_onset_strength_npy(mp3_filename)) or not os.path.exists(to_cqt_npy(mp3_filename)):        
        print "Creating CQT and onset strength signal for {}".format(os.path.split(mp3_filename)[1])
        # Don't need to load in audio multiple times
        audio, fs = librosa.load(mp3_filename)
        # Create audio CQT, which is just frame-wise power, and onset strength
        audio_gram, audio_onset_strength = align_midi.audio_to_cqt_and_onset_strength(audio, fs=fs)
        # Write out
        np.save(to_onset_strength_npy(mp3_filename), audio_onset_strength)
        np.save(to_cqt_npy(mp3_filename), audio_gram)  

    # Cache MIDI CQT
    if not os.path.exists(to_cqt_npy(midi_filename)):      
        print "Creating CQT for {}".format(os.path.split(midi_filename)[1])
        # Generate synthetic MIDI CQT
        midi_gram = align_midi.midi_to_cqt(m, SF2_PATH)
        # Get beats
        midi_beats, bpm = align_midi.midi_beat_track(m)
        # Beat synchronize and normalize
        midi_gram = align_midi.post_process_cqt(midi_gram, midi_beats)
        # Write out
        np.save(to_cqt_npy(midi_filename), midi_gram)
            
    # Load in CQTs
    audio_gram = np.load(to_cqt_npy(mp3_filename))
    midi_gram = np.load(to_cqt_npy(midi_filename))
    # and audio onset strength signal
    audio_onset_strength = np.load(to_onset_strength_npy(mp3_filename))
    
    # Compute beats
    midi_beats, bpm = align_midi.midi_beat_track(m)
    # Beat-track at 4x frame resolution (hop_length 512/4 = 128), then divide
    # the frame indices by 4 to land on the CQT's hop-512 frame grid
    audio_beats = librosa.beat.beat_track(onsets=audio_onset_strength, hop_length=512/4, bpm=bpm)[1]/4
    # Beat-align and log/normalize the audio CQT
    audio_gram = align_midi.post_process_cqt(audio_gram, audio_beats)
    
    # Plot log-fs grams
    plt.figure(figsize=(36, 24))
    ax = plt.subplot2grid((4, 3), (0, 0), colspan=3)
    plt.title('MIDI Synthesized')
    librosa.display.specshow(midi_gram,
                             x_axis='frames',
                             y_axis='cqt_note',
                             fmin=librosa.midi_to_hz(36),
                             fmax=librosa.midi_to_hz(96))
    ax = plt.subplot2grid((4, 3), (1, 0), colspan=3)
    plt.title('Audio data')
    librosa.display.specshow(audio_gram,
                             x_axis='frames',
                             y_axis='cqt_note',
                             fmin=librosa.midi_to_hz(36),
                             fmax=librosa.midi_to_hz(96))
    
    # Get similarity matrix
    similarity_matrix = scipy.spatial.distance.cdist(midi_gram.T, audio_gram.T, metric='cosine')
    # Get best path through matrix
    p, q, score = align_midi.dpmod(similarity_matrix)
    
    # Plot distance at each point of the lowest-cost path
    ax = plt.subplot2grid((4, 3), (2, 0), rowspan=2)
    plt.plot([similarity_matrix[p_v, q_v] for p_v, q_v in zip(p, q)])
    plt.title('Distance at each point on lowest-cost path')

    # Plot similarity matrix and best path through it
    ax = plt.subplot2grid((4, 3), (2, 1), rowspan=2)
    plt.imshow(similarity_matrix.T,
               aspect='auto',
               interpolation='nearest',
               cmap=plt.cm.gray)
    tight = plt.axis()
    plt.plot(p, q, 'r.', ms=.2)
    plt.axis(tight)
    plt.title('Similarity matrix and lowest-cost path, cost={}'.format(score))
    
    # Adjust MIDI timing
    # Debug output: sanity-check that the lowest-cost path indices fit
    # within the similarity matrix and the beat arrays
    print np.shape(similarity_matrix)
    print len(p), np.max(p)
    print len(q), np.max(q)
    print len(midi_beats)
    print len(audio_beats)

    m_aligned = align_midi.adjust_midi(m, librosa.frames_to_time(midi_beats)[p], librosa.frames_to_time(audio_beats)[q])
    
    # Plot alignment
    ax = plt.subplot2grid((4, 3), (2, 2), rowspan=2)
    note_ons = np.array([note.start for instrument in m.instruments for note in instrument.events])
    aligned_note_ons = np.array([note.start for instrument in m_aligned.instruments for note in instrument.events])
    plt.plot(note_ons, aligned_note_ons - note_ons, '.')
    plt.xlabel('Original note location (s)')
    plt.ylabel('Shift (s)')
    plt.title('Corrected offset')

    # Write out the aligned file
    if output_midi_filename is not None:
        m_aligned.write(output_midi_filename)
    
    if output_diagnostics:
        # Save the figures
        plt.savefig(output_midi_filename.replace('.mid', '.pdf'))
        # Load in the audio data (needed for writing out)
        audio, fs = librosa.load(mp3_filename, sr=None)
        # Synthesize the aligned midi
        midi_audio_aligned = m_aligned.synthesize(fs=fs, method=SF2_PATH)
        # Trim to the same size as audio
        if midi_audio_aligned.shape[0] > audio.shape[0]:
            midi_audio_aligned = midi_audio_aligned[:audio.shape[0]]
        else:
            midi_audio_aligned = np.append(midi_audio_aligned, np.zeros(audio.shape[0] - midi_audio_aligned.shape[0]))
        # Write out to temporary .wav file
        librosa.output.write_wav(output_midi_filename.replace('.mid', '.wav'),
                                 np.vstack([midi_audio_aligned, audio]).T, fs)
        # Convert to mp3
        subprocess.check_output(['ffmpeg',
                         '-i',
                         output_midi_filename.replace('.mid', '.wav'),
                         '-acodec',
                         'libvorbis',
                         '-aq',
                         '0', 
                         output_midi_filename.replace('.mid', '.ogg')])
        # Remove temporary .wav file
        os.remove(output_midi_filename.replace('.mid', '.wav'))
        # Save a .mat of the results
        scipy.io.savemat(output_midi_filename.replace('.mid', '.mat'),
                         {'similarity_matrix': similarity_matrix,
                          'p' : p, 'q': q, 'score': score})
    # If we aren't outputting a .pdf, show the plot
    else:
        plt.show()
    plt.close()
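All of these examples rely on to_cqt_npy, to_onset_strength_npy, to_piano_cqt_npy, and to_chroma_npy to map an input file to its cached .npy path, but none of them is defined on this page. A plausible sketch, assuming the helpers simply swap the file extension for a descriptive suffix (the actual repo may use a different naming scheme or a separate cache directory):

import os

# Hypothetical reconstructions of the cache-path helpers assumed above.
def to_cqt_npy(filename):
    return os.path.splitext(filename)[0] + '-cqt.npy'

def to_onset_strength_npy(filename):
    return os.path.splitext(filename)[0] + '-onset-strength.npy'

def to_piano_cqt_npy(filename):
    return os.path.splitext(filename)[0] + '-piano-cqt.npy'

def to_chroma_npy(filename):
    return os.path.splitext(filename)[0] + '-chroma.npy'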