def process_one_file(diagnostics_file, output_filename,
                     output_filename_unaligned):
    # If the alignment failed and there was no diagnostics file, return
    if not os.path.exists(diagnostics_file):
        return
    diagnostics = deepdish.io.load(diagnostics_file)
    score = diagnostics['score']
    # Skip bad alignments
    if score < SCORE_THRESHOLD:
        return
    try:
        # Load in MIDI data
        pm_unaligned = pretty_midi.PrettyMIDI(
            str(diagnostics['midi_filename']))
        # Synthesize MIDI data and extract CQT
        midi_gram_unaligned = feature_extraction.midi_cqt(pm_unaligned)
        # Get audio CQT
        audio_features = deepdish.io.load(
            str(diagnostics['audio_features_filename']))
        audio_gram = audio_features['gram']
        audio_frame_times = feature_extraction.frame_times(audio_gram)
        # Write out unaligned MIDI CQT
        deepdish.io.save(output_filename_unaligned,
                         {'X': midi_gram_unaligned[np.newaxis],
                          'Y': audio_gram[np.newaxis]})
        # Load in MIDI data
        pm_aligned = pretty_midi.PrettyMIDI(
            str(diagnostics['output_midi_filename']))
        # Synthesize MIDI data and extract CQT
        midi_gram_aligned = feature_extraction.midi_cqt(pm_aligned)
        midi_frame_times = feature_extraction.frame_times(midi_gram_aligned)
        # Get the time span over which the alignment is trustworthy
        start_time = min(n.start for i in pm_aligned.instruments
                         for n in i.notes)
        end_time = min(pm_aligned.get_end_time(), midi_frame_times.max(),
                       audio_frame_times.max())
        if end_time <= start_time:
            return
        # Keep only the frames which fall within the aligned region
        audio_gram = audio_gram[np.logical_and(
            audio_frame_times >= start_time, audio_frame_times <= end_time)]
        midi_gram = midi_gram_aligned[np.logical_and(
            midi_frame_times >= start_time, midi_frame_times <= end_time)]
        # Write out matrices with a new axis at front (for # of channels)
        deepdish.io.save(output_filename,
                         {'X': midi_gram[np.newaxis],
                          'Y': audio_gram[np.newaxis]})
    except Exception:
        print "Error for {}: {}".format(
            diagnostics_file, traceback.format_exc())
        return

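# A hedged usage sketch for process_one_file, not part of the original
# pipeline: it assumes the alignment step wrote one diagnostics .h5 per
# MIDI-audio pair and that SCORE_THRESHOLD and the module-level imports
# (os, deepdish, etc.) are defined elsewhere in this script.  The directory
# names are hypothetical placeholders, and joblib is just one convenient way
# to fan the work out across processes.
import glob
import os

import joblib


def process_all_files(diagnostics_glob='diagnostics/*.h5',
                      output_path='train'):
    diagnostics_files = glob.glob(diagnostics_glob)
    joblib.Parallel(n_jobs=10, verbose=51)(
        joblib.delayed(process_one_file)(
            d,
            os.path.join(output_path, 'aligned', os.path.basename(d)),
            os.path.join(output_path, 'unaligned', os.path.basename(d)))
        for d in diagnostics_files)
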
def process_one_pair(midi_filename, mp3_filename, h5_filename,
                     unaligned_output_filename, aligned_output_filename,
                     mp3_output_filename, h5_output_filename):
    """
    Given a candidate MIDI-audio match, align the MIDI to the audio, then
    copy the unaligned and aligned MIDI if the score is high enough.

    Parameters
    ----------
    midi_filename : str
        Path to the MIDI file to align.
    mp3_filename : str
        Path to the audio file to align to.
    h5_filename : str
        Path to the .h5 file to copy if the match was successful.
    unaligned_output_filename : str
        Where to copy the unaligned MIDI file if the match was successful.
    aligned_output_filename : str
        Where to write the aligned MIDI file if the match was successful.
    mp3_output_filename : str
        Where to copy the mp3 file if the match was successful.
    h5_output_filename : str
        Where to copy the h5 file if the match was successful.

    Returns
    -------
    result : list or None
        [msd_id, midi_md5, score] when the match was successful, otherwise
        None.
    """
    try:
        m = pretty_midi.PrettyMIDI(midi_filename)
    except Exception:
        print 'Could not parse {}: {}'.format(
            os.path.split(midi_filename)[1], traceback.format_exc())
        return
    try:
        midi_gram = feature_extraction.midi_cqt(m)
    except Exception:
        print "Error creating CQT for {}: {}".format(
            os.path.split(midi_filename)[1], traceback.format_exc())
        return
    # Construct path to the pre-computed audio CQT
    audio_features_filename = mp3_filename.replace('mp3', 'h5')
    try:
        audio_features = deepdish.io.load(audio_features_filename)
    except Exception:
        print "Error loading CQT for {}: {}".format(
            os.path.split(audio_features_filename)[1],
            traceback.format_exc())
        return
    # Check that the distance matrix will not be too big before computing
    size = midi_gram.shape[0] * audio_features['gram'].shape[0]
    # If > 2 GB, skip
    if size * 64 / 8e9 > 2:
        print(
            "Distance matrix would be {} GB because the "
            "CQTs have shape {} and {}".format(
                size * 64 / 8e9, audio_features['gram'].shape[0],
                midi_gram.shape[0]))
        return
    # Get distance matrix
    distance_matrix = 1 - np.dot(midi_gram, audio_features['gram'].T)
    # Non-diagonal additive path penalty is the mean of the distance matrix.
    # Note that we typically use a median here, but a mean is faster and
    # produces close enough results
    add_pen = np.mean(distance_matrix)
    # Get best path through matrix
    aligned_midi_indices, aligned_audio_indices, score = djitw.dtw(
        distance_matrix, gully=.96, additive_penalty=add_pen, inplace=False)
    # Normalize score by path length
    score /= float(len(aligned_midi_indices))
    # Normalize score by the mean distance matrix value within the path chunk
    score /= distance_matrix[
        aligned_midi_indices.min():aligned_midi_indices.max(),
        aligned_audio_indices.min():aligned_audio_indices.max()].mean()
    # If the match was successful
    if score > SCORE_THRESHOLD:
        # Try adjusting MIDI timing and writing out
        try:
            # Retrieve timing of frames in CQTs
            midi_frame_times = feature_extraction.frame_times(midi_gram)
            audio_frame_times = feature_extraction.frame_times(
                audio_features['gram'])
            # Adjust MIDI file timing
            m.adjust_times(midi_frame_times[aligned_midi_indices],
                           audio_frame_times[aligned_audio_indices])
            # Make sure all output paths exist and write out
            check_subdirectories(aligned_output_filename)
            m.write(aligned_output_filename)
        except Exception:
            print "Error adjusting and writing {}: {}".format(
                os.path.split(midi_filename)[1], traceback.format_exc())
            return
        # Assuming the above worked, all we have to do now is copy
        # Check/create all necessary subdirectories
        check_subdirectories(unaligned_output_filename)
        check_subdirectories(mp3_output_filename)
        check_subdirectories(h5_output_filename)
        # Copy all files
        shutil.copy(midi_filename, unaligned_output_filename)
        shutil.copy(mp3_filename, mp3_output_filename)
        try:
            shutil.copy(h5_filename, h5_output_filename)
        except Exception:
            print "Could not copy {}: {}".format(
                os.path.split(h5_filename)[1], traceback.format_exc())
            return
        # Return a list of [msd_id, midi_md5, score]
        prefix, midi_filename = os.path.split(aligned_output_filename)
        msd_id = os.path.split(prefix)[1]
        midi_md5 = os.path.splitext(midi_filename)[0]
        return [msd_id, midi_md5, score]

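# Hedged example of driving process_one_pair over a list of candidate
# matches.  The pair list would normally come from the matching step of the
# pipeline; the output layout below mirrors the MSD-id/MIDI-md5 naming the
# function's return value implies, but is otherwise a hypothetical
# placeholder.  Successful matches yield [msd_id, midi_md5, score] entries;
# failures return None and are skipped.
def collect_successful_pairs(pairs, output_path='clean_midi'):
    results = []
    for midi_filename, mp3_filename, h5_filename in pairs:
        msd_id = os.path.splitext(os.path.basename(mp3_filename))[0]
        midi_md5 = os.path.splitext(os.path.basename(midi_filename))[0]
        result = process_one_pair(
            midi_filename, mp3_filename, h5_filename,
            os.path.join(output_path, 'unaligned', msd_id,
                         midi_md5 + '.mid'),
            os.path.join(output_path, 'aligned', msd_id, midi_md5 + '.mid'),
            os.path.join(output_path, 'mp3', msd_id + '.mp3'),
            os.path.join(output_path, 'h5', msd_id + '.h5'))
        if result is not None:
            results.append(result)
    return results
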
def align_one_file(audio_filename, midi_filename,
                   audio_features_filename=None, midi_features_filename=None,
                   output_midi_filename=None,
                   output_diagnostics_filename=None,
                   additional_diagnostics=None):
    '''
    Helper function for aligning a MIDI file to an audio file.

    Parameters
    ----------
    audio_filename : str
        Full path to an audio file.
    midi_filename : str
        Full path to a midi file.
    audio_features_filename : str or None
        Full path to pre-computed features for the audio file.  If the file
        doesn't exist, features will be computed and saved.  If None, force
        re-computation of the features and don't save.
    midi_features_filename : str or None
        Full path to pre-computed features for the midi file.  If the file
        doesn't exist, features will be computed and saved.  If None, force
        re-computation of the features and don't save.
    output_midi_filename : str or None
        Full path to where the aligned .mid file should be written.  If
        None, don't output.
    output_diagnostics_filename : str or None
        Full path to a file to write out diagnostic information (alignment
        score, best path, paths to files, etc.) in a .h5 file.  If None,
        don't output.
    additional_diagnostics : dict or None
        Optional dictionary of additional diagnostic information to include
        in the diagnostics file.  If None, don't include.

    Returns
    -------
    p, q : np.ndarray
        Indices of the lowest-cost alignment between the audio and MIDI.
    score : float
        Normalized DTW path distance.
    '''
    # Skip when already processed
    if (output_diagnostics_filename is not None
            and os.path.exists(output_diagnostics_filename)):
        return
    try:
        m = pretty_midi.PrettyMIDI(midi_filename)
    except Exception:
        print 'Could not parse {}: {}'.format(
            os.path.split(midi_filename)[1], traceback.format_exc())
        return

    midi_features = {}
    # If a feature file was provided and the file exists, try to read it in
    if (midi_features_filename is not None
            and os.path.exists(midi_features_filename)):
        try:
            midi_features = deepdish.io.load(midi_features_filename)
        # If there was a problem reading, force re-creation
        except Exception:
            print "Error reading {}: {}".format(
                midi_features_filename, traceback.format_exc())
            midi_features = {}
    if not midi_features:
        # Generate synthetic MIDI CQT
        try:
            midi_features['gram'] = feature_extraction.midi_cqt(m)
        except Exception:
            print "Error creating CQT for {}: {}".format(
                os.path.split(midi_filename)[1], traceback.format_exc())
            return
        if midi_features_filename is not None:
            try:
                # Write out
                check_subdirectories(midi_features_filename)
                deepdish.io.save(midi_features_filename, midi_features)
            except Exception:
                print "Error writing {}: {}".format(
                    os.path.split(midi_filename)[1], traceback.format_exc())
                return

    audio_features = {}
    # If a feature file was provided and the file exists, try to read it in
    if (audio_features_filename is not None
            and os.path.exists(audio_features_filename)):
        try:
            audio_features = deepdish.io.load(audio_features_filename)
        # If there was a problem reading, force re-creation
        except Exception:
            print "Error reading {}: {}".format(
                audio_features_filename, traceback.format_exc())
            audio_features = {}
    # Cache audio CQT
    if not audio_features:
        try:
            # Read in audio data
            audio, fs = librosa.load(
                audio_filename, sr=feature_extraction.AUDIO_FS)
            # Compute audio CQT
            audio_features['gram'] = feature_extraction.audio_cqt(audio)
        except Exception:
            print "Error creating CQT for {}: {}".format(
                os.path.split(audio_filename)[1], traceback.format_exc())
            return
        if audio_features_filename is not None:
            try:
                # Write out
                check_subdirectories(audio_features_filename)
                deepdish.io.save(audio_features_filename, audio_features)
            except Exception:
                print "Error writing {}: {}".format(
                    os.path.split(audio_filename)[1], traceback.format_exc())
                return

    try:
        # Check that the distance matrix will not be too big before computing
        size = (midi_features['gram'].shape[0] *
                audio_features['gram'].shape[0])
        # If > 1 GB, skip
        if size * 64 / 8e9 > 1:
            print(
                "Distance matrix for {} and {} would be {} GB because the "
                "CQTs have shape {} and {}".format(
                    os.path.split(audio_filename)[1],
                    os.path.split(midi_filename)[1], size * 64 / 8e9,
                    audio_features['gram'].shape[0],
                    midi_features['gram'].shape[0]))
            return
        # Get distance matrix
        distance_matrix = 1 - np.dot(
            midi_features['gram'], audio_features['gram'].T)
        # Non-diagonal additive path penalty is the median of the distance
        # matrix
        add_pen = np.median(distance_matrix)
        # Get best path through matrix
        aligned_midi_indices, aligned_audio_indices, score = djitw.dtw(
            distance_matrix, gully=.96, additive_penalty=add_pen,
            inplace=False)
        # Normalize score by path length
        score /= float(len(aligned_midi_indices))
        # Normalize score by the mean distance matrix value within the path
        # chunk
        score /= distance_matrix[
            aligned_midi_indices.min():aligned_midi_indices.max(),
            aligned_audio_indices.min():aligned_audio_indices.max()].mean()
        # The raw score is a normalized DTW distance, which approximately
        # falls in the range [.5, 1.] with .5 meaning a very good alignment.
        # This maps it to a confidence in [0., 1.] where 1. means a very
        # good alignment.
        score = np.clip(2 * (1 - score), 0, 1)
    except Exception:
        print "Error performing DTW for {} and {}: {}".format(
            os.path.split(audio_filename)[1],
            os.path.split(midi_filename)[1], traceback.format_exc())
        return

    # Write out the aligned file
    if output_midi_filename is not None:
        try:
            # Adjust MIDI timing
            midi_frame_times = feature_extraction.frame_times(
                midi_features['gram'])
            audio_frame_times = feature_extraction.frame_times(
                audio_features['gram'])
            m.adjust_times(midi_frame_times[aligned_midi_indices],
                           audio_frame_times[aligned_audio_indices])
            check_subdirectories(output_midi_filename)
            m.write(output_midi_filename)
        except Exception:
            print "Error writing aligned .mid for {}: {}".format(
                os.path.split(midi_filename)[1], traceback.format_exc())
            return

    if output_diagnostics_filename is not None:
        try:
            check_subdirectories(output_diagnostics_filename)
            # Construct empty additional diagnostics dict when None was given
            if additional_diagnostics is None:
                additional_diagnostics = {}
            diagnostics = dict(
                aligned_midi_indices=aligned_midi_indices,
                aligned_audio_indices=aligned_audio_indices, score=score,
                audio_filename=os.path.abspath(audio_filename),
                midi_filename=os.path.abspath(midi_filename),
                audio_features_filename=os.path.abspath(
                    audio_features_filename),
                midi_features_filename=os.path.abspath(
                    midi_features_filename),
                output_midi_filename=os.path.abspath(output_midi_filename),
                output_diagnostics_filename=os.path.abspath(
                    output_diagnostics_filename),
                **additional_diagnostics)
            deepdish.io.save(output_diagnostics_filename, diagnostics)
        except Exception:
            print "Error writing diagnostics for {} and {}: {}".format(
                os.path.split(audio_filename)[1],
                os.path.split(midi_filename)[1], traceback.format_exc())
            return
    return aligned_midi_indices, aligned_audio_indices, score

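# Minimal hedged example of a single align_one_file call.  Passing feature
# filenames lets repeated runs reuse cached CQTs instead of recomputing
# them; all paths here are hypothetical placeholders.  A return value of
# None means the pair was skipped (already processed) or some step failed;
# otherwise the confidence score lies in [0, 1], with 1 meaning a very good
# alignment.
def align_example():
    result = align_one_file(
        'data/mp3/TRXXXXX.mp3', 'data/mid/deadbeef.mid',
        audio_features_filename='data/h5/TRXXXXX.h5',
        midi_features_filename='data/h5/deadbeef.h5',
        output_midi_filename='results/deadbeef.mid',
        output_diagnostics_filename='results/deadbeef_diagnostics.h5')
    if result is not None:
        aligned_midi_indices, aligned_audio_indices, score = result
        print "Alignment confidence: {}".format(score)
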
def extract_ground_truth(diagnostics_group):
    """
    Extract ground-truth information from one or more MIDI files about a
    single audio file based on the results in one or more diagnostics files
    and return a JAMS object with all of the annotations compiled.

    Parameters
    ----------
    diagnostics_group : list of dict
        List of dicts of diagnostics, each about a successful alignment of a
        different MIDI file to a single audio file.
    """
    # Construct the JAMS object
    jam = jams.JAMS()
    # Load in the first diagnostics (doesn't matter which, as they should
    # all correspond to the same audio file)
    diagnostics = diagnostics_group[0]
    # Load in the audio file to get its duration for the JAMS file
    audio, fs = librosa.load(
        diagnostics['audio_filename'], sr=feature_extraction.AUDIO_FS)
    jam.file_metadata.duration = librosa.get_duration(y=audio, sr=fs)
    # Also store metadata about the audio file, retrieved from the MSD
    jam.file_metadata.identifiers = {'track_id': diagnostics['audio_id']}
    jam.file_metadata.artist = MSD_LIST[diagnostics['audio_id']]['artist']
    jam.file_metadata.title = MSD_LIST[diagnostics['audio_id']]['title']
    # Iterate over the diagnostics files supplied
    for diagnostics in diagnostics_group:
        # Create annotation metadata object, shared across annotations
        commit = subprocess.check_output(
            ['git', 'rev-parse', 'HEAD']).strip()
        commit_url = "http://github.com/craffel/midi-dataset/tree/" + commit
        annotator = {'midi_md5': diagnostics['midi_md5'],
                     'commit_url': commit_url,
                     'confidence': diagnostics['score']}
        annotation_metadata = jams.AnnotationMetadata(
            curator=jams.Curator('Colin Raffel', '*****@*****.**'),
            version='0.0.1b', corpus='Million Song Dataset MIDI Matches',
            annotator=annotator,
            annotation_tools=(
                'MIDI files were matched and aligned to audio files using '
                'the code at http://github.com/craffel/midi-dataset.  '
                'Information was extracted from MIDI files using '
                'pretty_midi https://github.com/craffel/pretty-midi.'),
            annotation_rules=(
                'Beat locations and key change times were linearly '
                'interpolated according to an audio-to-MIDI alignment.'),
            validation=(
                'Only MIDI files with alignment confidence scores >= .5 '
                'were considered "correct".  The confidence score can be '
                'used as a rough guide to the potential correctness of the '
                'annotation.'),
            data_source='Inferred from a MIDI file.')
        # Load the extracted features
        midi_features = deepdish.io.load(
            diagnostics['midi_features_filename'])
        audio_features = deepdish.io.load(
            diagnostics['audio_features_filename'])
        # Load in the original MIDI file
        midi_object = pretty_midi.PrettyMIDI(diagnostics['midi_filename'])
        # Compute the times of the frames (will be used for interpolation)
        midi_frame_times = feature_extraction.frame_times(
            midi_features['gram'])[diagnostics['aligned_midi_indices']]
        audio_frame_times = feature_extraction.frame_times(
            audio_features['gram'])[diagnostics['aligned_audio_indices']]
        # Get the interpolated beat locations and add them to the JAM
        adjusted_beats = interpolate_times(
            midi_object.get_beats(), midi_frame_times, audio_frame_times)
        # Create annotation record for the beats
        beat_a = jams.Annotation(namespace='beat')
        beat_a.annotation_metadata = annotation_metadata
        # Add beat timings to the annotation record
        for t in adjusted_beats:
            beat_a.append(time=t, duration=0.0)
        # Add beat annotation record to the JAMS file
        jam.annotations.append(beat_a)
        # Get key signature times and their string names
        key_change_times = [
            c.time for c in midi_object.key_signature_changes]
        key_names = [pretty_midi.key_number_to_key_name(c.key_number)
                     for c in midi_object.key_signature_changes]
        # JAMS requires that the key name be supplied in the form e.g.
        # "C:major", but pretty_midi returns names in the format "C Major",
        # so the following converts to the JAMS format
        key_names = [name.replace(' ', ':').replace('M', 'm')
                     for name in key_names]
        # Compute interpolated event times
        adjusted_key_change_times, adjusted_key_names = interpolate_times(
            key_change_times, midi_frame_times, audio_frame_times,
            key_names, True)
        # Create JAMS annotation for the key changes
        if len(adjusted_key_change_times) > 0:
            key_a = jams.Annotation(namespace='key_mode')
            key_a.annotation_metadata = annotation_metadata
            # We only have key start times from the MIDI file, but JAMS
            # wants durations too, so create a list of "end times"
            end_times = np.append(adjusted_key_change_times[1:],
                                  jam.file_metadata.duration)
            # Add key labels into the JAMS file
            for start, end, key in zip(adjusted_key_change_times, end_times,
                                       adjusted_key_names):
                key_a.append(time=start, duration=end - start, value=key)
            jam.annotations.append(key_a)
    return jam

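# Hedged sketch of how extract_ground_truth might be driven: group the saved
# diagnostics dicts by the audio file they describe, then build and save one
# JAMS file per audio file.  This assumes each diagnostics file carries an
# 'audio_id' key (e.g. supplied via align_one_file's additional_diagnostics)
# and that glob, os, and deepdish are imported at module level; the
# directory layout is a placeholder.
import collections


def write_all_jams(diagnostics_glob='diagnostics/*.h5', jams_path='jams'):
    groups = collections.defaultdict(list)
    for diagnostics_file in glob.glob(diagnostics_glob):
        diagnostics = deepdish.io.load(diagnostics_file)
        groups[diagnostics['audio_id']].append(diagnostics)
    for audio_id, diagnostics_group in groups.items():
        jam = extract_ground_truth(diagnostics_group)
        jam.save(os.path.join(jams_path, '{}.jams'.format(audio_id)))
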
def process_one_file(diagnostics_file, output_filename,
                     output_filename_unaligned,
                     output_filename_piano_roll):
    # If the alignment failed and there was no diagnostics file, return
    if not os.path.exists(diagnostics_file):
        return
    diagnostics = deepdish.io.load(diagnostics_file)
    score = diagnostics['score']
    # Skip bad alignments
    if score < SCORE_THRESHOLD:
        return
    try:
        # Load in MIDI data
        pm_unaligned = pretty_midi.PrettyMIDI(
            str(diagnostics['midi_filename']))
        # Synthesize MIDI data and extract CQT
        midi_gram_unaligned = feature_extraction.midi_cqt(pm_unaligned)
        # Get audio CQT
        audio_features = deepdish.io.load(
            str(diagnostics['audio_features_filename']))
        audio_gram = audio_features['gram']
        audio_frame_times = feature_extraction.frame_times(audio_gram)
        # Write out unaligned MIDI CQT
        deepdish.io.save(output_filename_unaligned,
                         {'X': midi_gram_unaligned[np.newaxis],
                          'Y': audio_gram[np.newaxis]})
        # Load in MIDI data
        pm_aligned = pretty_midi.PrettyMIDI(
            str(diagnostics['output_midi_filename']))
        # Synthesize MIDI data and extract CQT
        midi_gram_aligned = feature_extraction.midi_cqt(pm_aligned)
        midi_frame_times = feature_extraction.frame_times(midi_gram_aligned)
        # Get the time span over which the alignment is trustworthy
        start_time = min(n.start for i in pm_aligned.instruments
                         for n in i.notes)
        end_time = min(pm_aligned.get_end_time(), midi_frame_times.max(),
                       audio_frame_times.max())
        if end_time <= start_time:
            return
        # Keep only the frames which fall within the aligned region
        audio_gram = audio_gram[np.logical_and(
            audio_frame_times >= start_time, audio_frame_times <= end_time)]
        midi_gram = midi_gram_aligned[np.logical_and(
            midi_frame_times >= start_time, midi_frame_times <= end_time)]
        # Write out matrices with a new axis at front (for # of channels)
        deepdish.io.save(output_filename,
                         {'X': midi_gram[np.newaxis],
                          'Y': audio_gram[np.newaxis]})
        piano_roll = pm_aligned.get_piano_roll(times=midi_frame_times)
        # Only keep the same note range which is used in the CQT
        piano_roll = piano_roll[
            feature_extraction.NOTE_START:
            feature_extraction.NOTE_START + feature_extraction.N_NOTES]
        # Transpose so that the first dimension is time
        piano_roll = piano_roll.T
        # L2-normalize each frame
        piano_roll = librosa.util.normalize(piano_roll, norm=2, axis=1)
        # Keep only the frames which fall within the aligned region
        piano_roll = piano_roll[np.logical_and(
            midi_frame_times >= start_time, midi_frame_times <= end_time)]
        # Use float32 for Theano
        piano_roll = piano_roll.astype(np.float32)
        deepdish.io.save(output_filename_piano_roll,
                         {'X': piano_roll[np.newaxis],
                          'Y': audio_gram[np.newaxis]})
    except Exception:
        print "Error for {}: {}".format(
            diagnostics_file, traceback.format_exc())
        return

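# Hedged sanity-check sketch for the files the piano-roll variant of
# process_one_file writes out.  The filenames are hypothetical placeholders;
# the keys, the leading channel axis, and the float32 piano roll follow
# directly from the deepdish.io.save calls above.
def check_output_pair(output_filename, output_filename_piano_roll):
    data = deepdish.io.load(output_filename)
    # 'X' is the aligned MIDI CQT and 'Y' the audio CQT, each with shape
    # (1, n_frames, n_bins)
    assert data['X'].ndim == 3 and data['X'].shape[0] == 1
    assert data['Y'].ndim == 3 and data['Y'].shape[0] == 1
    roll = deepdish.io.load(output_filename_piano_roll)
    # The piano roll variant stores an L2-normalized float32 roll in 'X'
    assert roll['X'].dtype == np.float32
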