# Imports needed by the functions below.  feature_extraction,
# check_subdirectories, interpolate_times, SCORE_THRESHOLD and MSD_LIST are
# project-local names (from craffel/midi-dataset) assumed to be available
# in this module's namespace.
import glob
import os
import shutil
import subprocess
import traceback

import deepdish
import djitw
import jams
import librosa
import numpy as np
import pretty_midi


def process_one_file(diagnostics_file, output_filename,
                     output_filename_unaligned):
    """
    Given the diagnostics file from an alignment, write out unaligned and
    aligned MIDI-CQT/audio-CQT feature-matrix pairs if the alignment score
    was high enough.
    """
    # If the alignment failed and there was no diagnostics file, return
    if not os.path.exists(diagnostics_file):
        return
    diagnostics = deepdish.io.load(diagnostics_file)
    score = diagnostics['score']
    # Skip bad alignments
    if score < SCORE_THRESHOLD:
        return
    try:
        # Load in MIDI data
        pm_unaligned = pretty_midi.PrettyMIDI(
            str(diagnostics['midi_filename']))
        # Synthesize MIDI data and extract CQT
        midi_gram_unaligned = feature_extraction.midi_cqt(pm_unaligned)
        # Get audio CQT
        audio_features = deepdish.io.load(
            str(diagnostics['audio_features_filename']))
        audio_gram = audio_features['gram']
        audio_frame_times = feature_extraction.frame_times(audio_gram)
        # Write out unaligned MIDI CQT
        deepdish.io.save(output_filename_unaligned,
                         {'X': midi_gram_unaligned[np.newaxis],
                          'Y': audio_gram[np.newaxis]})
        # Load in MIDI data
        pm_aligned = pretty_midi.PrettyMIDI(
            str(diagnostics['output_midi_filename']))
        # Synthesize MIDI data and extract CQT
        midi_gram_aligned = feature_extraction.midi_cqt(pm_aligned)
        midi_frame_times = feature_extraction.frame_times(midi_gram_aligned)
        # Get indices which fall within the range of correct alignment
        start_time = min(
            n.start for i in pm_aligned.instruments for n in i.notes)
        end_time = min(pm_aligned.get_end_time(), midi_frame_times.max(),
                       audio_frame_times.max())
        if end_time <= start_time:
            return
        # Keep only the frames within the aligned region
        audio_gram = audio_gram[np.logical_and(audio_frame_times >= start_time,
                                               audio_frame_times <= end_time)]
        midi_gram = midi_gram_aligned[
            np.logical_and(midi_frame_times >= start_time,
                           midi_frame_times <= end_time)]
        # Write out matrices with a newaxis at front (for # of channels)
        deepdish.io.save(
            output_filename, {'X': midi_gram[np.newaxis],
                              'Y': audio_gram[np.newaxis]})
    except Exception:
        print("Error for {}: {}".format(
            diagnostics_file, traceback.format_exc()))
        return
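

# A minimal usage sketch for process_one_file (not part of the original
# module).  The glob pattern and directory layout are assumptions for
# illustration; the diagnostics .h5 files are produced by align_one_file,
# defined later in this file.
def process_all_files_example(diagnostics_glob, output_path,
                              unaligned_output_path):
    for diagnostics_file in glob.glob(diagnostics_glob):
        base = os.path.splitext(os.path.basename(diagnostics_file))[0]
        process_one_file(
            diagnostics_file,
            os.path.join(output_path, base + '.h5'),
            os.path.join(unaligned_output_path, base + '.h5'))

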
def process_one_pair(midi_filename, mp3_filename, h5_filename,
                     unaligned_output_filename, aligned_output_filename,
                     mp3_output_filename, h5_output_filename):
    """
    Given a candidate MIDI-audio match, align the MIDI to the audio, then copy
    the unaligned and aligned MIDI if the score is high enough.

    Parameters
    ----------
    midi_filename : str
        Path to the MIDI file to align.
    mp3_filename : str
        Path to the audio file to align to.
    h5_filename : str
        Path to the corresponding .h5 metadata file to copy.
    unaligned_output_filename : str
        Where to copy the unaligned MIDI file if the match was successful.
    aligned_output_filename : str
        Where to write the aligned MIDI file if the match was successful.
    mp3_output_filename : str
        Where to copy the mp3 file if the match was successful.
    h5_output_filename : str
        Where to copy the h5 file if the match was successful.
    """
    try:
        m = pretty_midi.PrettyMIDI(midi_filename)
    except Exception:
        print('Could not parse {}: {}'.format(
            os.path.split(midi_filename)[1], traceback.format_exc()))
        return
    try:
        midi_gram = feature_extraction.midi_cqt(m)
    except Exception:
        print("Error creating CQT for {}: {}".format(
            os.path.split(midi_filename)[1], traceback.format_exc()))
        return
    # Construct the path to the pre-computed audio CQT
    audio_features_filename = mp3_filename.replace('mp3', 'h5')
    try:
        audio_features = deepdish.io.load(audio_features_filename)
    except Exception:
        print("Error loading CQT for {}: {}".format(
            os.path.split(audio_features_filename)[1],
            traceback.format_exc()))
        return
    # Check that the distance matrix will not be too big before computing
    size = midi_gram.shape[0] * audio_features['gram'].shape[0]
    # If the matrix would be larger than 2 GB, skip
    if size * 64 / 8e9 > 2:
        print("Distance matrix would be {} GB because the "
              "CQTs have shape {} and {}".format(
                  size * 64 / 8e9, audio_features['gram'].shape[0],
                  midi_gram.shape[0]))
        return
    # Get distance matrix
    distance_matrix = 1 - np.dot(midi_gram, audio_features['gram'].T)
    # Non-diagonal additive path penalty is the mean of the distance matrix
    # Note that we typically use a median here, but a mean is faster and
    # produces close enough results
    add_pen = np.mean(distance_matrix)
    # Get best path through matrix
    aligned_midi_indices, aligned_audio_indices, score = djitw.dtw(
        distance_matrix, gully=.96, additive_penalty=add_pen,
        inplace=False)
    # Normalize score by path length
    score /= float(len(aligned_midi_indices))
    # Normalize score by the mean distance matrix value within the region
    # spanned by the path
    score /= distance_matrix[
        aligned_midi_indices.min():aligned_midi_indices.max(),
        aligned_audio_indices.min():aligned_audio_indices.max()].mean()
    # If the match was successful
    if score > SCORE_THRESHOLD:
        # Try adjusting MIDI timing and writing out
        try:
            # Retrieve timing of frames in CQTs
            midi_frame_times = feature_extraction.frame_times(midi_gram)
            audio_frame_times = feature_extraction.frame_times(
                audio_features['gram'])
            # Adjust MIDI file timing
            m.adjust_times(midi_frame_times[aligned_midi_indices],
                           audio_frame_times[aligned_audio_indices])
            # Make sure all output paths exist and write out
            check_subdirectories(aligned_output_filename)
            m.write(aligned_output_filename)
        except Exception:
            print("Error adjusting and writing {}: {}".format(
                os.path.split(midi_filename)[1],
                traceback.format_exc()))
            return
        # Assuming the above worked, all we have to do now is copy
        # Check/create all necessary subdirectories
        check_subdirectories(unaligned_output_filename)
        check_subdirectories(mp3_output_filename)
        check_subdirectories(h5_output_filename)
        # Copy all files
        shutil.copy(midi_filename, unaligned_output_filename)
        shutil.copy(mp3_filename, mp3_output_filename)
        try:
            shutil.copy(h5_filename, h5_output_filename)
        except Exception:
            print("Could not copy {}: {}".format(
                os.path.split(h5_filename)[1],
                traceback.format_exc()))
            return
        # Return [msd_id, midi_md5, score]
        prefix, midi_filename = os.path.split(aligned_output_filename)
        msd_id = os.path.split(prefix)[1]
        midi_md5 = os.path.splitext(midi_filename)[0]
        return [msd_id, midi_md5, score]
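

# A minimal usage sketch for process_one_pair (not part of the original
# module); every path here is hypothetical.  Note that the MSD track ID and
# MIDI MD5 are recovered from the aligned output path, so the layout
# 'output/<msd_id>/<midi_md5>.mid' matters.
def process_one_pair_example():
    result = process_one_pair(
        'data/mid/deadbeef.mid',
        'data/msd/mp3/TR12345.mp3',
        'data/msd/h5/TR12345.h5',
        'output/TR12345/deadbeef_unaligned.mid',
        'output/TR12345/deadbeef.mid',
        'output/TR12345.mp3',
        'output/TR12345.h5')
    if result is not None:
        msd_id, midi_md5, score = result
        print("Matched {} to {} with confidence {}".format(
            midi_md5, msd_id, score))

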
def align_one_file(audio_filename, midi_filename, audio_features_filename=None,
                   midi_features_filename=None, output_midi_filename=None,
                   output_diagnostics_filename=None,
                   additional_diagnostics=None):
    '''
    Helper function for aligning a MIDI file to an audio file.

    Parameters
    ----------
    audio_filename : str
        Full path to an audio file.
    midi_filename : str
        Full path to a midi file.
    audio_features_filename : str or None
        Full path to pre-computed features for the audio file.
        If the file doesn't exist, features will be computed and saved.
        If None, force re-computation of the features and don't save.
    midi_features_filename : str or None
        Full path to pre-computed features for the midi file.
        If the file doesn't exist, features will be computed and saved.
        If None, force re-computation of the features and don't save.
    output_midi_filename : str or None
        Full path to where the aligned .mid file should be written.
        If None, don't output.
    output_diagnostics_filename : str or None
        Full path to a file to write out diagnostic information (alignment
        score, best path, paths to files, etc) in a .h5 file.  If None, don't
        output.
    additional_diagnostics : dict or None
        Optional dictionary of additional diagnostic information to include
        in the diagnostics file.  If None, don't include.

    Returns
    -------
    aligned_midi_indices, aligned_audio_indices : np.ndarray
        Indices of the lowest-cost alignment between the MIDI and audio
        feature sequences.
    score : float
        Alignment confidence score in [0., 1.] derived from the normalized
        DTW path distance (1. means a very good alignment).  Returns None
        instead when the alignment is skipped or fails.
    '''
    # Skip when already processed
    if (output_diagnostics_filename is not None
            and os.path.exists(output_diagnostics_filename)):
        return

    try:
        m = pretty_midi.PrettyMIDI(midi_filename)
    except Exception:
        print('Could not parse {}: {}'.format(
            os.path.split(midi_filename)[1], traceback.format_exc()))
        return

    midi_features = {}

    # If a feature file was provided and the file exists, try to read it in
    if (midi_features_filename is not None and
            os.path.exists(midi_features_filename)):
        try:
            # If a feature file was provided and exists, read it in
            midi_features = deepdish.io.load(midi_features_filename)
        # If there was a problem reading, force re-creation
        except Exception:
            print("Error reading {}: {}".format(
                midi_features_filename, traceback.format_exc()))
            midi_features = {}

    if not midi_features:
        # Generate synthetic MIDI CQT
        try:
            midi_features['gram'] = feature_extraction.midi_cqt(m)
        except Exception:
            print("Error creating CQT for {}: {}".format(
                os.path.split(midi_filename)[1], traceback.format_exc()))
            return
        if midi_features_filename is not None:
            try:
                # Write out
                check_subdirectories(midi_features_filename)
                deepdish.io.save(
                    midi_features_filename, midi_features)
            except Exception:
                print("Error writing {}: {}".format(
                    os.path.split(midi_features_filename)[1],
                    traceback.format_exc()))
                return

    audio_features = {}

    # If a feature file was provided and the file exists, try to read it in
    if (audio_features_filename is not None and
            os.path.exists(audio_features_filename)):
        # If a feature file was provided and exists, read it in
        try:
            audio_features = deepdish.io.load(audio_features_filename)
        # If there was a problem reading, force re-creation
        except Exception:
            print("Error reading {}: {}".format(
                audio_features_filename, traceback.format_exc()))
            audio_features = {}

    # Cache audio CQT
    if not audio_features:
        try:
            # Read in audio data
            audio, fs = librosa.load(
                audio_filename, sr=feature_extraction.AUDIO_FS)
            # Compute audio cqt
            audio_features['gram'] = feature_extraction.audio_cqt(audio)
        except Exception:
            print("Error creating CQT for {}: {}".format(
                os.path.split(audio_filename)[1], traceback.format_exc()))
            return
        if audio_features_filename is not None:
            try:
                # Write out
                check_subdirectories(audio_features_filename)
                deepdish.io.save(audio_features_filename, audio_features)
            except Exception:
                print("Error writing {}: {}".format(
                    os.path.split(audio_features_filename)[1],
                    traceback.format_exc()))
                return

    try:
        # Check that the distance matrix will not be too big before computing
        size = (midi_features['gram'].shape[0] *
                audio_features['gram'].shape[0])
        # If the matrix would be larger than 1 GB, skip
        if size * 64 / 8e9 > 1:
            print("Distance matrix for {} and {} would be {} GB because the "
                  "CQTs have shape {} and {}".format(
                      os.path.split(audio_filename)[1],
                      os.path.split(midi_filename)[1],
                      size * 64 / 8e9, audio_features['gram'].shape[0],
                      midi_features['gram'].shape[0]))
            return

        # Get distance matrix
        distance_matrix = 1 - np.dot(
            midi_features['gram'], audio_features['gram'].T)
        # Non-diagonal additive path penalty is the median of the sim mtx
        add_pen = np.median(distance_matrix)
        # Get best path through matrix
        aligned_midi_indices, aligned_audio_indices, score = djitw.dtw(
            distance_matrix, gully=.96, additive_penalty=add_pen,
            inplace=False)
        # Normalize score by path length
        score /= float(len(aligned_midi_indices))
        # Normalize score by the mean distance matrix value within the
        # region spanned by the path
        score /= distance_matrix[
            aligned_midi_indices.min():aligned_midi_indices.max(),
            aligned_audio_indices.min():aligned_audio_indices.max()].mean()
        # The raw score is a normalized DTW distance which approximately
        # falls in the range [.5, 1.], with .5 meaning a very good
        # alignment.  This maps the score to [0., 1.], where 1. means a
        # very good alignment.
        score = np.clip(2 * (1 - score), 0, 1)
    except Exception:
        print("Error performing DTW for {} and {}: {}".format(
            os.path.split(audio_filename)[1],
            os.path.split(midi_filename)[1],
            traceback.format_exc()))
        return

    # Write out the aligned file
    if output_midi_filename is not None:
        try:
            # Adjust MIDI timing
            midi_frame_times = feature_extraction.frame_times(
                midi_features['gram'])
            audio_frame_times = feature_extraction.frame_times(
                audio_features['gram'])
            m.adjust_times(midi_frame_times[aligned_midi_indices],
                           audio_frame_times[aligned_audio_indices])
            check_subdirectories(output_midi_filename)
            m.write(output_midi_filename)
        except Exception:
            print("Error writing aligned .mid for {}: {}".format(
                os.path.split(midi_filename)[1], traceback.format_exc()))
            return

    if output_diagnostics_filename is not None:
        try:
            check_subdirectories(output_diagnostics_filename)
            # Construct empty additional diagnostics dict when None was given
            if additional_diagnostics is None:
                additional_diagnostics = {}
            diagnostics = dict(
                aligned_midi_indices=aligned_midi_indices,
                aligned_audio_indices=aligned_audio_indices, score=score,
                audio_filename=os.path.abspath(audio_filename),
                midi_filename=os.path.abspath(midi_filename),
                audio_features_filename=os.path.abspath(
                    audio_features_filename),
                midi_features_filename=os.path.abspath(midi_features_filename),
                output_midi_filename=os.path.abspath(output_midi_filename),
                output_diagnostics_filename=os.path.abspath(
                    output_diagnostics_filename),
                **additional_diagnostics)
            deepdish.io.save(output_diagnostics_filename, diagnostics)
        except Exception:
            print("Error writing diagnostics for {} and {}: {}".format(
                os.path.split(audio_filename)[1],
                os.path.split(midi_filename)[1], traceback.format_exc()))
            return
    return aligned_midi_indices, aligned_audio_indices, score
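

# A minimal usage sketch for align_one_file (not part of the original
# module).  joblib is an assumption here for parallelism, and all paths are
# hypothetical; pre-computed features will be created and cached on the
# first run.
def align_all_example():
    import joblib
    pairs = [('data/mp3/a.mp3', 'data/mid/a.mid'),
             ('data/mp3/b.mp3', 'data/mid/b.mid')]
    joblib.Parallel(n_jobs=-1)(
        joblib.delayed(align_one_file)(
            mp3, mid,
            audio_features_filename=mp3.replace('mp3', 'h5'),
            midi_features_filename=mid.replace('mid', 'h5'),
            output_midi_filename=os.path.join(
                'aligned', os.path.basename(mid)),
            output_diagnostics_filename=os.path.join(
                'diagnostics', os.path.basename(mid) + '.h5'))
        for mp3, mid in pairs)

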
def extract_ground_truth(diagnostics_group):
    """
    Extract ground-truth information about a single audio file from one or
    more MIDI files, based on the results in one or more diagnostics files,
    and return a JAMS object with all of the annotations compiled.

    Parameters
    ----------
    diagnostics_group : list of dict
        List of dicts of diagnostics, each about a successful alignment of a
        different MIDI file to a single audio file.

    Returns
    -------
    jam : jams.JAMS
        JAMS object containing all of the compiled annotations.
    """
    # Construct the JAMS object
    jam = jams.JAMS()
    # Load in the first diagnostics dict (doesn't matter which, as they
    # should all correspond to the same audio file)
    diagnostics = diagnostics_group[0]
    # Load in the audio file to get its duration for the JAMS file
    audio, fs = librosa.load(diagnostics['audio_filename'],
                             sr=feature_extraction.AUDIO_FS)
    jam.file_metadata.duration = librosa.get_duration(y=audio, sr=fs)
    # Also store metadata about the audio file, retrieved from the MSD
    jam.file_metadata.identifiers = {'track_id': diagnostics['audio_id']}
    jam.file_metadata.artist = MSD_LIST[diagnostics['audio_id']]['artist']
    jam.file_metadata.title = MSD_LIST[diagnostics['audio_id']]['title']

    # Iterate over the diagnostics files supplied
    for diagnostics in diagnostics_group:

        # Create annotation metadata object, shared across annotations
        commit = subprocess.check_output(['git', 'rev-parse', 'HEAD']).strip()
        commit_url = "http://github.com/craffel/midi-dataset/tree/" + commit
        annotator = {
            'midi_md5': diagnostics['midi_md5'],
            'commit_url': commit_url,
            'confidence': diagnostics['score']
        }
        annotation_metadata = jams.AnnotationMetadata(
            curator=jams.Curator('Colin Raffel', '*****@*****.**'),
            version='0.0.1b',
            corpus='Million Song Dataset MIDI Matches',
            annotator=annotator,
            annotation_tools=(
                'MIDI files were matched and aligned to audio files using the '
                'code at http://github.com/craffel/midi-dataset.  Information '
                'was extracted from MIDI files using pretty_midi '
                'https://github.com/craffel/pretty-midi.'),
            annotation_rules=(
                'Beat locations and key change times were linearly '
                'interpolated according to an audio-to-MIDI alignment.'),
            validation=(
                'Only MIDI files with alignment confidence scores >= .5 were '
                'considered "correct".  The confidence score can be used as a '
                'rough guide to the potential correctness of the annotation.'),
            data_source='Inferred from a MIDI file.')

        # Load the extracted features
        midi_features = deepdish.io.load(diagnostics['midi_features_filename'])
        audio_features = deepdish.io.load(
            diagnostics['audio_features_filename'])
        # Load in the original MIDI file
        midi_object = pretty_midi.PrettyMIDI(diagnostics['midi_filename'])
        # Compute the times of the frames (will be used for interpolation)
        midi_frame_times = feature_extraction.frame_times(
            midi_features['gram'])[diagnostics['aligned_midi_indices']]
        audio_frame_times = feature_extraction.frame_times(
            audio_features['gram'])[diagnostics['aligned_audio_indices']]

        # Get the interpolated beat locations and add them to the JAM
        adjusted_beats = interpolate_times(midi_object.get_beats(),
                                           midi_frame_times, audio_frame_times)
        # Create annotation record for the beats
        beat_a = jams.Annotation(namespace='beat')
        beat_a.annotation_metadata = annotation_metadata
        # Add beat timings to the annotation record
        for t in adjusted_beats:
            beat_a.append(time=t, duration=0.0)
        # Add beat annotation record to the JAMS file
        jam.annotations.append(beat_a)

        # Get key signature times and their string names
        key_change_times = [c.time for c in midi_object.key_signature_changes]
        key_names = [
            pretty_midi.key_number_to_key_name(c.key_number)
            for c in midi_object.key_signature_changes
        ]
        # JAMS requires that the key name be supplied in the form e.g.
        # "C:major" but pretty_midi returns things in the format "C Major",
        # so the following code converts to JAMS format
        key_names = [
            name.replace(' ', ':').replace('M', 'm') for name in key_names
        ]
        # Compute interpolated event times
        adjusted_key_change_times, adjusted_key_names = interpolate_times(
            key_change_times, midi_frame_times, audio_frame_times, key_names,
            True)
        # Create JAMS annotation for the key changes
        if len(adjusted_key_change_times) > 0:
            key_a = jams.Annotation(namespace='key_mode')
            key_a.annotation_metadata = annotation_metadata
            # We only have key start times from the MIDI file, but JAMS wants
            # durations too, so create a list of "end times"
            end_times = np.append(adjusted_key_change_times[1:],
                                  jam.file_metadata.duration)
            # Add key labels into the JAMS file
            for start, end, key in zip(adjusted_key_change_times, end_times,
                                       adjusted_key_names):
                key_a.append(time=start, duration=end - start, value=key)
            jam.annotations.append(key_a)

    return jam
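

# A minimal usage sketch for extract_ground_truth (not part of the original
# module): load the diagnostics dicts for every MIDI file successfully
# aligned to one audio file, compile the JAMS object and save it.  This
# assumes each diagnostics dict also carries the 'audio_id' and 'midi_md5'
# fields referenced above.
def extract_ground_truth_example(diagnostics_files, jams_output_filename):
    diagnostics_group = [deepdish.io.load(f) for f in diagnostics_files]
    jam = extract_ground_truth(diagnostics_group)
    jam.save(jams_output_filename)

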
def process_one_file(diagnostics_file, output_filename,
                     output_filename_unaligned, output_filename_piano_roll):
    """
    Variant of process_one_file above that additionally writes out an
    aligned piano-roll/audio-CQT pair to output_filename_piano_roll.
    """
    # If the alignment failed and there was no diagnostics file, return
    if not os.path.exists(diagnostics_file):
        return
    diagnostics = deepdish.io.load(diagnostics_file)
    score = diagnostics['score']
    # Skip bad alignments
    if score < SCORE_THRESHOLD:
        return
    try:
        # Load in MIDI data
        pm_unaligned = pretty_midi.PrettyMIDI(str(
            diagnostics['midi_filename']))
        # Synthesize MIDI data and extract CQT
        midi_gram_unaligned = feature_extraction.midi_cqt(pm_unaligned)
        # Get audio CQT
        audio_features = deepdish.io.load(
            str(diagnostics['audio_features_filename']))
        audio_gram = audio_features['gram']
        audio_frame_times = feature_extraction.frame_times(audio_gram)
        # Write out unaligned MIDI CQT
        deepdish.io.save(output_filename_unaligned, {
            'X': midi_gram_unaligned[np.newaxis],
            'Y': audio_gram[np.newaxis]
        })
        # Load in MIDI data
        pm_aligned = pretty_midi.PrettyMIDI(
            str(diagnostics['output_midi_filename']))
        # Synthesize MIDI data and extract CQT
        midi_gram_aligned = feature_extraction.midi_cqt(pm_aligned)
        midi_frame_times = feature_extraction.frame_times(midi_gram_aligned)
        # Get indices which fall within the range of correct alignment
        start_time = min(n.start for i in pm_aligned.instruments
                         for n in i.notes)
        end_time = min(pm_aligned.get_end_time(), midi_frame_times.max(),
                       audio_frame_times.max())
        if end_time <= start_time:
            return
        # Keep only the frames within the aligned region
        audio_gram = audio_gram[np.logical_and(audio_frame_times >= start_time,
                                               audio_frame_times <= end_time)]
        midi_gram = midi_gram_aligned[np.logical_and(
            midi_frame_times >= start_time, midi_frame_times <= end_time)]
        # Write out matrices with a newaxis at front (for # of channels)
        deepdish.io.save(output_filename, {
            'X': midi_gram[np.newaxis],
            'Y': audio_gram[np.newaxis]
        })

        piano_roll = pm_aligned.get_piano_roll(times=midi_frame_times)
        # Keep only the note range used in the CQT
        piano_roll = piano_roll[
            feature_extraction.NOTE_START:
            feature_extraction.NOTE_START + feature_extraction.N_NOTES]
        # Transpose so that the first dimension is time
        piano_roll = piano_roll.T
        # L2 normalize columns
        piano_roll = librosa.util.normalize(piano_roll, norm=2, axis=1)
        # Keep only the frames within the aligned region
        piano_roll = piano_roll[np.logical_and(midi_frame_times >= start_time,
                                               midi_frame_times <= end_time)]
        # Use float32 for Theano
        piano_roll = piano_roll.astype(np.float32)
        deepdish.io.save(output_filename_piano_roll, {
            'X': piano_roll[np.newaxis],
            'Y': audio_gram[np.newaxis]
        })
    except Exception:
        print("Error for {}: {}".format(diagnostics_file,
                                        traceback.format_exc()))
        return
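

# A minimal sketch (not part of the original module) of reading back one of
# the feature-pair files written above; the leading length-1 axis is the
# channel dimension added via np.newaxis before saving.
def load_pair_example(filename):
    data = deepdish.io.load(filename)
    X, Y = data['X'], data['Y']  # MIDI-side and audio-side feature matrices
    print("X shape: {}, Y shape: {}".format(X.shape, Y.shape))
    return X, Y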