def process_one_file(audio_file, midi_file, output_midi_file, pair_file,
                     diagnostics_file):
    """
    Wrapper routine for loading in audio/MIDI data, aligning, and writing
    out the result.

    Parameters
    ----------
    audio_file, midi_file, output_midi_file, pair_file, diagnostics_file : str
        Paths to the audio file to align, MIDI file to align, and paths where
        to write the aligned MIDI, the synthesized pair file, and the DTW
        diagnostics file.

    Notes
    -----
    The diagnostics file is JSON with the keys ``'p'`` and ``'q'`` (DTW path
    indices into the MIDI and audio CQT frames, respectively) and ``'score'``
    (the DTW path cost divided by the path length and by the mean distance
    inside the rectangle spanned by the path).
    """
    # Load in the audio data
    audio_data, _ = librosa.load(audio_file, sr=create_data.FS)
    # Compute the log-magnitude CQT of the data
    audio_cqt, audio_times = create_data.extract_cqt(audio_data)
    audio_cqt = librosa.logamplitude(
        audio_cqt, ref_power=audio_cqt.max()).T
    # Load and synthesize MIDI data
    midi_object = pretty_midi.PrettyMIDI(midi_file)
    midi_audio = midi_object.fluidsynth(fs=create_data.FS)
    # Compute log-magnitude CQT
    midi_cqt, midi_times = create_data.extract_cqt(midi_audio)
    midi_cqt = librosa.logamplitude(
        midi_cqt, ref_power=midi_cqt.max()).T
    # Compute cosine distance matrix (rows: MIDI frames, columns: audio)
    distance_matrix = scipy.spatial.distance.cdist(
        midi_cqt, audio_cqt, 'cosine')
    # Get lowest cost path
    p, q, score = djitw.dtw(
        distance_matrix, GULLY, np.median(distance_matrix), inplace=False)
    # Normalize by path length
    score = score/len(p)
    # Normalize by distance matrix submatrix within path.  The path indices
    # are inclusive, so the slice must extend one past the maximum index;
    # otherwise the last row/column is dropped and a path confined to a
    # single row or column yields an empty slice (NaN mean).
    score = score/distance_matrix[
        p.min():p.max() + 1, q.min():q.max() + 1].mean()
    # Adjust the MIDI file
    midi_object.adjust_times(midi_times[p], audio_times[q])
    # Write the result
    midi_object.write(output_midi_file)
    # Synthesize aligned MIDI
    midi_audio_aligned = midi_object.fluidsynth(fs=create_data.FS)
    # Adjust to the same size as audio: truncate when longer, zero-pad
    # when shorter
    n_audio = audio_data.shape[0]
    n_midi = midi_audio_aligned.shape[0]
    if n_midi > n_audio:
        midi_audio_aligned = midi_audio_aligned[:n_audio]
    else:
        midi_audio_aligned = np.append(
            midi_audio_aligned, np.zeros(n_audio - n_midi))
    # Stack one in each channel
    librosa.output.write_wav(
        pair_file, np.array([midi_audio_aligned, audio_data]),
        create_data.FS)
    # Write out diagnostics.  JSON is text, so open in text mode, and convert
    # the index arrays to plain Python ints (numpy integers are not JSON
    # serializable on Python 3).
    with open(diagnostics_file, 'w') as f:
        json.dump(
            {'p': p.tolist(), 'q': q.tolist(), 'score': float(score)}, f)
def align(mid_file, Y_pred):
    '''
    Align the piano roll of a MIDI file to an estimated piano roll via DTW.

    Parameters
    ----------
    mid_file : str
        Path to the MIDI file, passed to ``pretty_midi.PrettyMIDI``.
    Y_pred : np.ndarray
        Estimated piano roll.  It is combined with the 88 rows (MIDI
        pitches 21-108) kept from the MIDI piano roll through a dot product,
        so its first axis must have length 88; its second axis indexes the
        frames the path ``q`` refers to.

    Returns
    -------
    p, q : np.ndarray
        Indices of the lowest-cost DTW path; ``p`` indexes frames of the MIDI
        piano roll and ``q`` indexes columns of ``Y_pred``.
    '''
    import djitw
    import numpy as np
    import pretty_midi

    midi_object = pretty_midi.PrettyMIDI(midi_file=mid_file)
    # Sample the piano roll at 44100/512 frames per second
    # NOTE(review): presumably this matches the frame rate of Y_pred
    # (44.1 kHz audio with a hop of 512 samples) - confirm with the caller.
    piano_roll = midi_object.get_piano_roll(fs=44100 / 512.0)
    # Keep only MIDI pitches 21 through 108
    piano_roll = piano_roll[21:109, :]
    # Binarize: every entry above 0.5 becomes 1, others are left untouched
    piano_roll[piano_roll > 0.5] = 1
    # Distance is one minus the dot product between frames.
    # NOTE(review): this is only a cosine distance when the columns of both
    # piano rolls are L2-normalized, which is not done here - confirm that
    # the unnormalized dot product is intended.
    distance_matrix = 1 - np.dot(piano_roll.T, Y_pred)
    # Non-diagonal moves are penalized by the mean distance
    penalty = distance_matrix.mean()
    # Compute lowest-cost path through distance matrix; the path score is
    # not used
    p, q, _ = djitw.dtw(distance_matrix, gully=.98, additive_penalty=penalty)
    return p, q
def compute_dtw_distance(x, y, metric):
    """
    Compute a normalized DTW cost between two feature sequences.

    Parameters
    ----------
    x, y : np.ndarray
        Feature sequences, passed to ``scipy.spatial.distance.cdist`` as its
        two collections of observations (one observation per row).
    metric : str or callable
        Distance metric, passed through to ``cdist``.

    Returns
    -------
    cost : float
        DTW path cost divided by the path length and by the mean of the
        distance matrix over the rectangle spanned by the path.  The value
        has no upper bound.  It is NaN when the normalization is degenerate
        (for example, when that mean distance is zero); a ``RuntimeWarning``
        is issued in that case.
    """
    # Local import: only needed for the NaN diagnostic below
    import warnings

    distance_matrix = sp.spatial.distance.cdist(x, y, metric=metric)
    # Non-diagonal additive path penalty is the median of the distance matrix
    add_pen = np.median(distance_matrix)
    x_indices, y_indices, cost = djitw.dtw(
        distance_matrix, gully=.96, additive_penalty=add_pen, inplace=False)
    # Normalize score by path length
    cost /= float(len(x_indices))
    # Normalize score by mean distance matrix value within path chunk
    # (path indices are inclusive, hence the + 1)
    cost /= distance_matrix[
        x_indices.min():x_indices.max() + 1,
        y_indices.min():y_indices.max() + 1].mean()
    if np.isnan(cost):
        # Report instead of dropping into an interactive debugger, which
        # would stall any non-interactive run
        warnings.warn(
            'compute_dtw_distance: normalized DTW cost is NaN',
            RuntimeWarning)
    # cost is unbounded
    return cost
def compute_dtw_distance(x, y, metric):
    """
    Return the DTW cost between ``x`` and ``y``, normalized for path length
    and for the overall scale of the distance matrix.

    Parameters
    ----------
    x, y : np.ndarray
        Sequences to compare; handed to ``scipy.spatial.distance.cdist``,
        which treats each row as one observation.
    metric : str or callable
        Metric forwarded to ``cdist``.

    Returns
    -------
    cost : float
        Raw DTW cost, divided first by the number of path steps and then by
        the mean distance inside the rectangle covered by the path.
        Unbounded above.  NaN (with a ``RuntimeWarning``) if the
        normalization breaks down, e.g. a zero mean distance.
    """
    # Local import: only needed for the NaN diagnostic below
    import warnings

    distance_matrix = sp.spatial.distance.cdist(x, y, metric=metric)
    # The additive penalty for non-diagonal moves is the median distance
    x_indices, y_indices, cost = djitw.dtw(
        distance_matrix, gully=.96,
        additive_penalty=np.median(distance_matrix), inplace=False)
    # Rectangle covered by the path; both ends are inclusive
    rows = slice(x_indices.min(), x_indices.max() + 1)
    cols = slice(y_indices.min(), y_indices.max() + 1)
    # Normalize score by path length
    cost /= float(len(x_indices))
    # Normalize score by mean distance matrix value within path chunk
    cost /= distance_matrix[rows, cols].mean()
    if np.isnan(cost):
        # A debugger breakpoint here would hang batch jobs; warn instead
        warnings.warn(
            'compute_dtw_distance: normalized DTW cost is NaN',
            RuntimeWarning)
    # cost is unbounded
    return cost
def align(midi_object, audio_data, fs, hop, note_start, n_notes, penalty):
    '''
    Warp the timing of a MIDI object, in place, so it matches some audio.

    Parameters
    ----------
    midi_object : pretty_midi.PrettyMIDI
        MIDI content to align; its timing is modified in place
    audio_data : np.ndarray
        Audio samples to align to
    fs : int
        Sampling rate of audio_data, also used to synthesize the MIDI
    hop : int
        CQT hop length
    note_start : int
        Lowest MIDI note number covered by the CQT
    n_notes : int
        Number of notes covered by the CQT
    penalty : float or None
        DTW penalty for non-diagonal moves; when None, the mean of the
        distance matrix is used
    '''
    # The same CQT settings are applied to both signals
    cqt_settings = (fs, hop, note_start, n_notes)
    # Render the MIDI to audio and extract features from it
    synthesized = midi_object.fluidsynth(fs=fs)
    midi_gram, midi_times = extract_cqt(synthesized, *cqt_settings)
    # Extract features from the target audio
    audio_gram, audio_times = extract_cqt(audio_data, *cqt_settings)
    # One minus the dot product between frames; this is a cosine distance
    # provided extract_cqt returns L2-normalized feature vectors
    similarity = np.dot(midi_gram, audio_gram.T)
    distance_matrix = 1 - similarity
    if penalty is None:
        penalty = distance_matrix.mean()
    # Find the lowest-cost path; the path score is not needed here
    midi_frames, audio_frames, _ = djitw.dtw(
        distance_matrix, gully=.98, additive_penalty=penalty)
    # Map the MIDI times along the path onto the matching audio times
    midi_object.adjust_times(
        midi_times[midi_frames], audio_times[audio_frames])
def align(midi_object, audio_data, fs, hop, note_start, n_notes, penalty):
    '''
    Align a MIDI object in-place to some audio data.

    Parameters
    ----------
    midi_object : pretty_midi.PrettyMIDI
        A pretty_midi.PrettyMIDI class instance describing some MIDI content
    audio_data : np.ndarray
        Samples of some audio data
    fs : int
        audio_data's sampling rate, and the sampling rate to use when
        synthesizing MIDI
    hop : int
        Hop length for CQT
    note_start : int
        Lowest MIDI note number for CQT
    n_notes : int
        Number of notes to include in the CQT
    penalty : float or None
        DTW non-diagonal move penalty.  If None, the mean of the distance
        matrix is used.
    '''
    # Get synthesized MIDI audio
    midi_audio = midi_object.fluidsynth(fs=fs)
    # Compute CQ-grams for MIDI and audio
    midi_gram, midi_times = extract_cqt(
        midi_audio, fs, hop, note_start, n_notes)
    audio_gram, audio_times = extract_cqt(
        audio_data, fs, hop, note_start, n_notes)
    # Compute distance matrix; because the columns of the CQ-grams are
    # L2-normalized we can compute a cosine distance matrix via a dot product
    distance_matrix = 1 - np.dot(midi_gram, audio_gram.T)
    if penalty is None:
        penalty = distance_matrix.mean()
    # Compute lowest-cost path through distance matrix.  The non-diagonal
    # move penalty is passed as ``additive_penalty``, the keyword that
    # djitw.dtw accepts; the path score is not used.
    p, q, _ = djitw.dtw(
        distance_matrix, gully=.98, additive_penalty=penalty)
    # Adjust the timing of the MIDI object according to the alignment
    midi_object.adjust_times(midi_times[p], audio_times[q])
def process_one_pair(midi_filename, mp3_filename, h5_filename,
                     unaligned_output_filename, aligned_output_filename,
                     mp3_output_filename, h5_output_filename):
    """
    Given a candidate MIDI-audio match, align the MIDI to the audio, then
    copy the unaligned and aligned MIDI if the score is high enough.

    Parameters
    ----------
    midi_filename : str
        Path to the MIDI file to align.
    mp3_filename : str
        Path to the audio file to align to.
    h5_filename : str
        Path to the h5 file to copy if the match was successful.
    unaligned_output_filename : str
        Where to copy the unaligned MIDI file if the match was successful.
    aligned_output_filename : str
        Where to write the aligned MIDI file if the match was successful.
    mp3_output_filename : str
        Where to copy the mp3 file if the match was successful.
    h5_output_filename : str
        Where to copy the h5 file if the match was successful.

    Returns
    -------
    result : list or None
        ``[msd_id, midi_md5, score]``, where ``msd_id`` is the name of the
        directory containing ``aligned_output_filename`` and ``midi_md5`` is
        that file's name without extension.  None if the MIDI file could not
        be parsed, features could not be computed or loaded, the distance
        matrix would be too large, or writing/copying the outputs failed.
    """
    try:
        m = pretty_midi.PrettyMIDI(midi_filename)
    except Exception:
        # format_exc() takes no exception argument; it formats the exception
        # currently being handled
        print('Could not parse {}: {}'.format(
            os.path.split(midi_filename)[1], traceback.format_exc()))
        return
    try:
        midi_gram = feature_extraction.midi_cqt(m)
    except Exception:
        print("Error creating CQT for {}: {}".format(
            os.path.split(midi_filename)[1], traceback.format_exc()))
        return
    # Construct path to pre-computed audio CQT.  Note this replaces every
    # 'mp3' substring of the path, not only the extension.
    audio_features_filename = mp3_filename.replace('mp3', 'h5')
    try:
        audio_features = deepdish.io.load(audio_features_filename)
    except Exception:
        print("Error loading CQT for {}: {}".format(
            os.path.split(audio_features_filename)[1],
            traceback.format_exc()))
        return
    # Check that the distance matrix will not be too big before computing
    size = midi_gram.shape[0] * audio_features['gram'].shape[0]
    # At 64 bits per entry, skip when the matrix would exceed 2 GB
    if (size * 64 / 8e9 > 2):
        print(
            "Distance matrix would be {} GB because the "
            "CQTs have shape {} and {}".format(
                size * 64 / 8e9, audio_features['gram'].shape[0],
                midi_gram.shape[0]))
        return
    # Get distance matrix
    distance_matrix = 1 - np.dot(midi_gram, audio_features['gram'].T)
    # Non-diagonal additive path penalty is the mean of the sim mtx
    # Note that we typically use a median here, but a mean is faster and
    # produces close enough results
    add_pen = np.mean(distance_matrix)
    # Get best path through matrix
    aligned_midi_indices, aligned_audio_indices, score = djitw.dtw(
        distance_matrix, gully=.96, additive_penalty=add_pen, inplace=False)
    # Normalize score by path length
    score /= float(len(aligned_midi_indices))
    # Normalize score by mean sim matrix value within path chunk.  Path
    # indices are inclusive, so the slices extend one past the maximum.
    score /= distance_matrix[
        aligned_midi_indices.min():aligned_midi_indices.max() + 1,
        aligned_audio_indices.min():aligned_audio_indices.max() + 1].mean()
    # If the match was successful
    # NOTE(review): score is a normalized DTW path cost at this point;
    # confirm that "greater than SCORE_THRESHOLD" is the intended test.
    if score > SCORE_THRESHOLD:
        # Try adjusting MIDI timing and writing out
        try:
            # Retrieve timing of frames in CQTs
            midi_frame_times = feature_extraction.frame_times(midi_gram)
            audio_frame_times = feature_extraction.frame_times(
                audio_features['gram'])
            # Adjust MIDI file timing
            m.adjust_times(midi_frame_times[aligned_midi_indices],
                           audio_frame_times[aligned_audio_indices])
            # Make sure all output paths exist and write out
            check_subdirectories(aligned_output_filename)
            m.write(aligned_output_filename)
        except Exception:
            print("Error adjusting and writing {}: {}".format(
                os.path.split(midi_filename)[1], traceback.format_exc()))
            return
        # Assuming the above worked, all we have to do now is copy
        # Check/create all necessary subdirectories
        check_subdirectories(unaligned_output_filename)
        check_subdirectories(mp3_output_filename)
        check_subdirectories(h5_output_filename)
        # Copy all files
        shutil.copy(midi_filename, unaligned_output_filename)
        shutil.copy(mp3_filename, mp3_output_filename)
        try:
            shutil.copy(h5_filename, h5_output_filename)
        except Exception:
            print("Could not copy {}: {}".format(
                os.path.split(h5_filename)[1], traceback.format_exc()))
            return
    # Return list of [msd_id, midi_md5, score]
    prefix, aligned_basename = os.path.split(aligned_output_filename)
    msd_id = os.path.split(prefix)[1]
    midi_md5 = os.path.splitext(aligned_basename)[0]
    return [msd_id, midi_md5, score]
def align_one_file(audio_filename, midi_filename, audio_features_filename=None, midi_features_filename=None, output_midi_filename=None, output_diagnostics_filename=None, additional_diagnostics=None): ''' Helper function for aligning a MIDI file to an audio file. Parameters ---------- audio_filename : str Full path to an audio file. midi_filename : str Full path to a midi file. audio_features_filename : str or None Full path to pre-computed features for the audio file. If the file doesn't exist, features will be computed and saved. If None, force re-computation of the features and don't save. midi_features_filename : str or None Full path to pre-computed features for the midi file. If the file doesn't exist, features will be computed and saved. If None, force re-computation of the features and don't save. output_midi_filename : str or None Full path to where the aligned .mid file should be written. If None, don't output. output_diagnostics_filename : str or None Full path to a file to write out diagnostic information (alignment score, best path, paths to files, etc) in a .h5 file. If None, don't output. additional_diagnostics : dict or None Optional dictionary of additional diagnostic information to include in the diagnostics file. If None, don't include. 
Returns ------- p, q : np.ndarray Indices of the lowest-cost alignment between the audio and MIDI score : float Normalized DTW path distance ''' # Skip when already processed if (output_diagnostics_filename is not None and os.path.exists(output_diagnostics_filename)): return try: m = pretty_midi.PrettyMIDI(midi_filename) except Exception as e: print 'Could not parse {}: {}'.format( os.path.split(midi_filename)[1], traceback.format_exc(e)) return midi_features = {} # If a feature file was provided and the file exists, try to read it in if (midi_features_filename is not None and os.path.exists(midi_features_filename)): try: # If a feature file was provided and exists, read it in midi_features = deepdish.io.load(midi_features_filename) # If there was a problem reading, force re-cration except Exception as e: print "Error reading {}: {}".format( midi_features_filename, traceback.format_exc(e)) midi_features = {} if not midi_features: # Generate synthetic MIDI CQT try: midi_features['gram'] = feature_extraction.midi_cqt(m) except Exception as e: print "Error creating CQT for {}: {}".format( os.path.split(midi_filename)[1], traceback.format_exc(e)) return if midi_features_filename is not None: try: # Write out check_subdirectories(midi_features_filename) deepdish.io.save( midi_features_filename, midi_features) except Exception as e: print "Error writing {}: {}".format( os.path.split(midi_filename)[1], traceback.format_exc(e)) return audio_features = {} # If a feature file was provided and the file exists, try to read it in if (audio_features_filename is not None and os.path.exists(audio_features_filename)): # If a feature file was provided and exists, read it in try: audio_features = deepdish.io.load(audio_features_filename) # If there was a problem reading, force re-cration except Exception as e: print "Error reading {}: {}".format( audio_features_filename, traceback.format_exc(e)) audio_features = {} # Cache audio CQT if not audio_features: try: # Read in audio data 
audio, fs = librosa.load( audio_filename, sr=feature_extraction.AUDIO_FS) # Compute audio cqt audio_features['gram'] = feature_extraction.audio_cqt(audio) except Exception as e: print "Error creating CQT for {}: {}".format( os.path.split(audio_filename)[1], traceback.format_exc(e)) return if audio_features_filename is not None: try: # Write out check_subdirectories(audio_features_filename) deepdish.io.save(audio_features_filename, audio_features) except Exception as e: print "Error writing {}: {}".format( os.path.split(audio_filename)[1], traceback.format_exc(e)) return try: # Check that the distance matrix will not be too big before computing size = midi_features['gram'].shape[0]*audio_features['gram'].shape[0] # If > 1 GB, skip if (size*64/8e9 > 1): print ( "Distance matrix for {} and {} would be {} GB because the " "CQTs have shape {} and {}".format( os.path.split(audio_filename)[1], os.path.split(midi_filename)[1], size*64/8e9, audio_features['gram'].shape[0], midi_features['gram'].shape[0])) return # Get distance matrix distance_matrix = 1 - np.dot( midi_features['gram'], audio_features['gram'].T) # Non-diagonal additive path penalty is the median of the sim mtx add_pen = np.median(distance_matrix) # Get best path through matrix aligned_midi_indices, aligned_audio_indices, score = djitw.dtw( distance_matrix, gully=.96, additive_penalty=add_pen, inplace=False) # Normalize score by path length score /= float(len(aligned_midi_indices)) # Normalize score by score by mean sim matrix value within path chunk score /= distance_matrix[ aligned_midi_indices.min():aligned_midi_indices.max(), aligned_audio_indices.min():aligned_audio_indices.max()].mean() # The confidence score is a normalized DTW distance, which # approximately follows in the range [.5, 1.] with .5 meaning a very # good alignment. This maps the scores from [0., 1.] where 1. means a # very good alignment. 
score = np.clip(2*(1 - score), 0, 1) except Exception as e: print "Error performing DTW for {} and {}: {}".format( os.path.split(audio_filename)[1], os.path.split(midi_filename)[1], traceback.format_exc(e)) return # Write out the aligned file if output_midi_filename is not None: try: # Adjust MIDI timing midi_frame_times = feature_extraction.frame_times( midi_features['gram']) audio_frame_times = feature_extraction.frame_times( audio_features['gram']) m.adjust_times(midi_frame_times[aligned_midi_indices], audio_frame_times[aligned_audio_indices]) check_subdirectories(output_midi_filename) m.write(output_midi_filename) except Exception as e: print "Error writing aligned .mid for {}: {}".format( os.path.split(midi_filename)[1], traceback.format_exc(e)) return if output_diagnostics_filename is not None: try: check_subdirectories(output_diagnostics_filename) # Construct empty additional diagnostics dict when None was given if additional_diagnostics is None: additional_diagnostics = {} diagnostics = dict( aligned_midi_indices=aligned_midi_indices, aligned_audio_indices=aligned_audio_indices, score=score, audio_filename=os.path.abspath(audio_filename), midi_filename=os.path.abspath(midi_filename), audio_features_filename=os.path.abspath( audio_features_filename), midi_features_filename=os.path.abspath(midi_features_filename), output_midi_filename=os.path.abspath(output_midi_filename), output_diagnostics_filename=os.path.abspath( output_diagnostics_filename), **additional_diagnostics) deepdish.io.save(output_diagnostics_filename, diagnostics) except Exception as e: print "Error writing diagnostics for {} and {}: {}".format( os.path.split(audio_filename)[1], os.path.split(midi_filename)[1], traceback.format_exc(e)) return return aligned_midi_indices, aligned_audio_indices, score
def align_dataset(params, data):
    '''
    Perform alignment of all corrupted MIDIs in the database given the
    supplied parameters and compute the mean alignment error across all
    examples

    Parameters
    ----------
    params : dict
        Dictionary of alignment parameters.  The keys read here are
        ``'feature'``, ``'beat_sync'``, ``'norm'``, ``'standardize'``,
        ``'metric'``, ``'band_mask'``, ``'gully'`` and ``'add_pen'``.
    data : list of dict
        Collection of things to align, loaded via load_dataset.

    Returns
    -------
    results : collections.defaultdict of list
        Maps each of ``'mean_errors'``, ``'raw_scores'``,
        ``'raw_scores_no_penalty'``, ``'path_lengths'``,
        ``'distance_matrix_means'`` and ``'feature_files'`` to a list with
        one entry per example that was aligned.  Examples that are skipped
        (empty beat frames when beat-syncing, or a distance matrix with no
        finite entry) contribute nothing.
    '''
    def post_process_features(gram, beats):
        '''
        Apply processing to a feature matrix given the supplied param values

        Parameters
        ----------
        gram : np.ndarray
            Feature matrix, shape (n_features, n_samples)
        beats : np.ndarray
            Indices of beat locations in gram

        Returns
        -------
        gram : np.ndarray
            Feature matrix, shape (n_samples, n_features), post-processed
            according to the values in `params`
        '''
        # Convert to chroma
        if params['feature'] == 'chroma':
            gram = librosa.feature.chroma_cqt(
                C=gram, fmin=librosa.midi_to_hz(create_data.NOTE_START))
        # Beat-synchronize the feature matrix
        if params['beat_sync']:
            gram = librosa.feature.sync(gram, beats, pad=False)
        # Compute log magnitude
        gram = librosa.logamplitude(gram, ref_power=gram.max())
        # Normalize the feature vectors
        gram = librosa.util.normalize(gram, norm=params['norm'])
        # Standardize the feature vectors
        if params['standardize']:
            gram = scipy.stats.mstats.zscore(gram, axis=1)
        # Transpose it to (n_samples, n_features) and return it
        return gram.T

    # Lists of per-example results, keyed by the name of the quantity
    results = collections.defaultdict(list)
    for d in data:
        # If we are beat syncing and either of the beat frames are empty, we
        # can't really align, so just skip this file.
        if params['beat_sync'] and (d['orig_beat_frames'].size == 0 or
                                    d['corrupted_beat_frames'].size == 0):
            continue
        # Post process the chosen feature matrices
        orig_gram = post_process_features(
            d['orig_gram'], d['orig_beat_frames'])
        corrupted_gram = post_process_features(
            d['corrupted_gram'], d['corrupted_beat_frames'])
        # Compute a distance matrix according to the supplied metric
        distance_matrix = scipy.spatial.distance.cdist(
            orig_gram, corrupted_gram, params['metric'])
        finite = np.isfinite(distance_matrix)
        # If the entire distance matrix is non-finite, we can't align, skip
        if not finite.any():
            continue
        # Set any NaN/inf values to the largest finite distance
        distance_matrix[~finite] = distance_matrix[finite].max()
        # Compute a band mask or set to None for no mask
        if params['band_mask']:
            # The builtin bool is used because the np.bool alias was removed
            # from NumPy
            mask = np.zeros(distance_matrix.shape, dtype=bool)
            djitw.band_mask(1 - params['gully'], mask)
        else:
            mask = None
        # Get DTW path and score
        add_pen = params['add_pen']*np.median(distance_matrix)
        p, q, score = djitw.dtw(
            distance_matrix, params['gully'], add_pen, mask=mask,
            inplace=False)
        if params['beat_sync']:
            # If we are beat syncing, we have to compare against beat times
            # so we index adjusted_times by the beat indices
            adjusted_times = d['adjusted_times'][d['orig_beat_frames']]
            corrupted_times = d['corrupted_beat_times']
        else:
            corrupted_times = d['corrupted_times']
            adjusted_times = d['adjusted_times']
        # Compute the error, clipped to within .5 seconds
        error = np.clip(
            corrupted_times[q] - adjusted_times[p], -.5, .5)
        # Compute the mean error for this MIDI
        mean_error = np.mean(np.abs(error))
        # If the mean error is NaN or inf for some reason, set it to max (.5)
        if not np.isfinite(mean_error):
            mean_error = .5
        results['mean_errors'].append(mean_error)
        results['raw_scores'].append(score)
        results['raw_scores_no_penalty'].append(distance_matrix[p, q].sum())
        results['path_lengths'].append(p.shape[0])
        results['distance_matrix_means'].append(np.mean(
            distance_matrix[p.min():p.max() + 1, q.min():q.max() + 1]))
        results['feature_files'].append(os.path.basename(d['feature_file']))
    return results
def process_one_pair(midi_filename, mp3_filename, h5_filename,
                     unaligned_output_filename, aligned_output_filename,
                     mp3_output_filename, h5_output_filename):
    """
    Given a candidate MIDI-audio match, align the MIDI to the audio, then
    copy the unaligned and aligned MIDI if the score is high enough.

    Parameters
    ----------
    midi_filename : str
        Path to the MIDI file to align.
    mp3_filename : str
        Path to the audio file to align to.
    h5_filename : str
        Path to the h5 file to copy if the match was successful.
    unaligned_output_filename : str
        Where to copy the unaligned MIDI file if the match was successful.
    aligned_output_filename : str
        Where to write the aligned MIDI file if the match was successful.
    mp3_output_filename : str
        Where to copy the mp3 file if the match was successful.
    h5_output_filename : str
        Where to copy the h5 file if the match was successful.

    Returns
    -------
    result : list or None
        ``[msd_id, midi_md5, score]``; ``msd_id`` is the last directory
        component of ``aligned_output_filename`` and ``midi_md5`` its file
        name with the extension removed.  None whenever a step failed
        (unparseable MIDI, feature errors, oversized distance matrix, or
        errors writing/copying outputs); the error is printed.
    """
    midi_basename = os.path.split(midi_filename)[1]
    try:
        m = pretty_midi.PrettyMIDI(midi_filename)
    except Exception:
        # traceback.format_exc() formats the exception being handled; it
        # does not accept the exception object as an argument
        print('Could not parse {}: {}'.format(
            midi_basename, traceback.format_exc()))
        return
    try:
        midi_gram = feature_extraction.midi_cqt(m)
    except Exception:
        print("Error creating CQT for {}: {}".format(
            midi_basename, traceback.format_exc()))
        return
    # Construct path to pre-computed audio CQT; every 'mp3' substring of
    # the path is replaced, not only the extension
    audio_features_filename = mp3_filename.replace('mp3', 'h5')
    try:
        audio_features = deepdish.io.load(audio_features_filename)
    except Exception:
        print("Error loading CQT for {}: {}".format(
            os.path.split(audio_features_filename)[1],
            traceback.format_exc()))
        return
    audio_gram = audio_features['gram']
    # Check that the distance matrix will not be too big before computing
    size = midi_gram.shape[0] * audio_gram.shape[0]
    size_gb = size * 64 / 8e9
    # Skip when a matrix of 64-bit entries would exceed 2 GB
    if size_gb > 2:
        print(
            "Distance matrix would be {} GB because the "
            "CQTs have shape {} and {}".format(
                size_gb, audio_gram.shape[0], midi_gram.shape[0]))
        return
    # Get distance matrix
    distance_matrix = 1 - np.dot(midi_gram, audio_gram.T)
    # Non-diagonal additive path penalty is the mean of the sim mtx
    # Note that we typically use a median here, but a mean is faster and
    # produces close enough results
    add_pen = np.mean(distance_matrix)
    # Get best path through matrix
    aligned_midi_indices, aligned_audio_indices, score = djitw.dtw(
        distance_matrix, gully=.96, additive_penalty=add_pen, inplace=False)
    # Normalize score by path length
    score /= float(len(aligned_midi_indices))
    # Normalize score by mean sim matrix value within path chunk; the path
    # indices are inclusive, so include the row/column of the maximum index
    score /= distance_matrix[
        aligned_midi_indices.min():aligned_midi_indices.max() + 1,
        aligned_audio_indices.min():aligned_audio_indices.max() + 1].mean()
    # If the match was successful
    # NOTE(review): score is a normalized DTW path cost here; verify the
    # direction of this comparison against SCORE_THRESHOLD's definition.
    if score > SCORE_THRESHOLD:
        # Try adjusting MIDI timing and writing out
        try:
            # Retrieve timing of frames in CQTs
            midi_frame_times = feature_extraction.frame_times(midi_gram)
            audio_frame_times = feature_extraction.frame_times(audio_gram)
            # Adjust MIDI file timing
            m.adjust_times(midi_frame_times[aligned_midi_indices],
                           audio_frame_times[aligned_audio_indices])
            # Make sure all output paths exist and write out
            check_subdirectories(aligned_output_filename)
            m.write(aligned_output_filename)
        except Exception:
            print("Error adjusting and writing {}: {}".format(
                midi_basename, traceback.format_exc()))
            return
        # Assuming the above worked, all we have to do now is copy
        # Check/create all necessary subdirectories
        for output_filename in (unaligned_output_filename,
                                mp3_output_filename,
                                h5_output_filename):
            check_subdirectories(output_filename)
        # Copy all files
        shutil.copy(midi_filename, unaligned_output_filename)
        shutil.copy(mp3_filename, mp3_output_filename)
        try:
            shutil.copy(h5_filename, h5_output_filename)
        except Exception:
            print("Could not copy {}: {}".format(
                os.path.split(h5_filename)[1], traceback.format_exc()))
            return
    # Return list of [msd_id, midi_md5, score]
    prefix, aligned_basename = os.path.split(aligned_output_filename)
    msd_id = os.path.split(prefix)[1]
    midi_md5 = os.path.splitext(aligned_basename)[0]
    return [msd_id, midi_md5, score]
def match_one_midi(midi_gram, midi_embedding, midi_hash_sequence,
                   msd_embeddings, msd_sequences, msd_feature_paths,
                   msd_ids):
    """
    Match one MIDI file to the million song dataset by computing its CQT,
    pruning by matching its embedding, re-pruning by matching its downsampled
    hash sequence, and finally doing DTW on CQTs on the remaining entries.

    Parameters
    ----------
    midi_gram : np.ndarray
        Synthesized MIDI CQT
    midi_embedding : np.ndarray
        Embedding of the synthesized MIDI CQT
    midi_hash_sequence : np.ndarray
        Downsampled hash sequence of the MIDI CQT
    msd_embeddings : np.ndarray
        (# MSD entries x embedding dimension) matrix of all embeddings for
        all entries from the MSD
    msd_sequences : list of np.ndarray
        List of binary vector sequences (represented as ints) for all MSD
        entries
    msd_feature_paths : list of str
        Path to feature files (containing CQT) for each MSD entry
    msd_ids : list of str
        MSD ID of each corresponding entry in the above lists

    Returns
    -------
    dtw_matches : list of list
        List of [msd_id, score] for all non-pruned MSD entries.  The score
        is clipped to [0, 1], where 1 means a very good alignment.  Entries
        whose features cannot be loaded, or whose distance matrix would be
        too large, are left out.
    """
    # Get the distance between the MIDI embedding and all MSD entries
    embedding_distances = np.sum((msd_embeddings - midi_embedding)**2, axis=1)
    # Get the indices of MSD entries sorted by their embedded distance to the
    # query MIDI embedding.
    embedding_matches = np.argsort(embedding_distances)
    # Get the top N matches
    embedding_matches = embedding_matches[:TOP_EMBEDDINGS]
    # Match this hash sequence to MSD sequences
    hash_matches, _, _ = dhs.match_one_sequence(
        midi_hash_sequence, msd_sequences, GULLY, PENALTY, True,
        embedding_matches)
    # Get the top N matches
    hash_matches = hash_matches[:TOP_SEQUENCES]
    # List for storing final match information
    matches = []
    # Perform DTW matching for each non-pruned MSD entry
    for match in hash_matches:
        # Path to pre-computed audio CQT
        audio_features_filename = msd_feature_paths[match]
        try:
            audio_features = deepdish.io.load(audio_features_filename)
        except Exception:
            # format_exc() takes no exception argument; it formats the
            # exception currently being handled
            print("Error loading CQT for {}: {}".format(
                os.path.split(audio_features_filename)[1],
                traceback.format_exc()))
            continue
        # Check that the distance matrix will not be too big before computing
        size = midi_gram.shape[0] * audio_features['gram'].shape[0]
        # At 64 bits per entry, skip when the matrix would exceed 2 GB
        if (size * 64 / 8e9 > 2):
            print(
                "Distance matrix would be {} GB because the "
                "CQTs have shape {} and {}".format(
                    size * 64 / 8e9, audio_features['gram'].shape[0],
                    midi_gram.shape[0]))
            continue
        # Get distance matrix
        distance_matrix = 1 - np.dot(midi_gram, audio_features['gram'].T)
        # Non-diagonal additive path penalty is the mean of the sim mtx
        # Note that we typically use a median here, but a mean is faster and
        # produces close enough results
        add_pen = np.mean(distance_matrix)
        # Get best path through matrix
        aligned_midi_indices, aligned_audio_indices, score = djitw.dtw(
            distance_matrix, gully=.96, additive_penalty=add_pen,
            inplace=False)
        # Normalize score by path length
        score /= float(len(aligned_midi_indices))
        # Normalize score by mean sim matrix value within path chunk.  Path
        # indices are inclusive, so the slices extend one past the maximum.
        score /= distance_matrix[
            aligned_midi_indices.min():aligned_midi_indices.max() + 1,
            aligned_audio_indices.min():aligned_audio_indices.max() + 1
        ].mean()
        # The confidence score is a normalized DTW distance, which
        # approximately follows in the range [.5, 1.] with .5 meaning a very
        # good alignment.  This maps the scores from [0., 1.] where 1. means
        # a very good alignment.
        score = np.clip(2 * (1 - score), 0, 1)
        matches.append([msd_ids[match], score])
    return matches
def _abspath_or_none(filename):
    '''Return the absolute version of ``filename``, passing ``None`` through.'''
    if filename is None:
        return None
    return os.path.abspath(filename)


def _load_or_create_features(features_filename, source_filename,
                             compute_gram):
    '''
    Load cached features from disk or compute (and optionally cache) them.

    Parameters
    ----------
    features_filename : str or None
        Path of the feature cache file.  When it exists it is read; when it
        doesn't, freshly computed features are written there.  If None,
        features are always computed and never saved.
    source_filename : str
        Path of the audio/MIDI file the features describe; only used in
        error messages.
    compute_gram : callable
        Zero-argument callable returning the feature matrix to store under
        the ``'gram'`` key.

    Returns
    -------
    features : dict or None
        Feature dictionary, or None when computing or caching the features
        failed (the error has already been printed).
    '''
    features = {}
    if (features_filename is not None and
            os.path.exists(features_filename)):
        try:
            features = deepdish.io.load(features_filename)
        except Exception:
            # A broken cache file is not fatal; fall back to re-creation
            print("Error reading {}: {}".format(
                features_filename, traceback.format_exc()))
            features = {}
    if features:
        return features

    try:
        features['gram'] = compute_gram()
    except Exception:
        print("Error creating CQT for {}: {}".format(
            os.path.split(source_filename)[1], traceback.format_exc()))
        return None

    if features_filename is not None:
        try:
            check_subdirectories(features_filename)
            deepdish.io.save(features_filename, features)
        except Exception:
            print("Error writing {}: {}".format(
                os.path.split(source_filename)[1], traceback.format_exc()))
            return None
    return features


def align_one_file(audio_filename, midi_filename, audio_features_filename=None,
                   midi_features_filename=None, output_midi_filename=None,
                   output_diagnostics_filename=None,
                   additional_diagnostics=None):
    '''
    Helper function for aligning a MIDI file to an audio file.

    Every failure (unparseable MIDI, feature extraction error, DTW error,
    write error) is reported with ``print`` and makes the function return
    None instead of raising.

    Parameters
    ----------
    audio_filename : str
        Full path to an audio file.
    midi_filename : str
        Full path to a midi file.
    audio_features_filename : str or None
        Full path to pre-computed features for the audio file.  If the file
        doesn't exist, features will be computed and saved.  If None, force
        re-computation of the features and don't save.
    midi_features_filename : str or None
        Full path to pre-computed features for the midi file.  If the file
        doesn't exist, features will be computed and saved.  If None, force
        re-computation of the features and don't save.
    output_midi_filename : str or None
        Full path to where the aligned .mid file should be written.  If None,
        don't output.
    output_diagnostics_filename : str or None
        Full path to a file to write out diagnostic information (alignment
        score, best path, paths to files, etc) in a .h5 file.  If None, don't
        output.  If this file already exists, the pair is considered
        processed and nothing is done.
    additional_diagnostics : dict or None
        Optional dictionary of additional diagnostic information to include
        in the diagnostics file.  If None, don't include.

    Returns
    -------
    aligned_midi_indices, aligned_audio_indices : np.ndarray
        Indices of the lowest-cost alignment between the MIDI and audio
        feature frames.
    score : float
        Confidence score clipped to [0., 1.], where 1. means a very good
        alignment.

    None is returned instead of the tuple when the diagnostics file already
    exists or when any step fails.
    '''
    # Skip when already processed
    if (output_diagnostics_filename is not None and
            os.path.exists(output_diagnostics_filename)):
        return

    try:
        m = pretty_midi.PrettyMIDI(midi_filename)
    except Exception:
        print('Could not parse {}: {}'.format(
            os.path.split(midi_filename)[1], traceback.format_exc()))
        return

    # Synthetic MIDI CQT, read from the cache when possible
    midi_features = _load_or_create_features(
        midi_features_filename, midi_filename,
        lambda: feature_extraction.midi_cqt(m))
    if midi_features is None:
        return

    def _compute_audio_gram():
        # Read in audio data and compute its CQT
        audio, _ = librosa.load(
            audio_filename, sr=feature_extraction.AUDIO_FS)
        return feature_extraction.audio_cqt(audio)

    # Audio CQT, read from the cache when possible
    audio_features = _load_or_create_features(
        audio_features_filename, audio_filename, _compute_audio_gram)
    if audio_features is None:
        return

    try:
        midi_gram = midi_features['gram']
        audio_gram = audio_features['gram']
        # Check that the distance matrix will not be too big before computing
        size = midi_gram.shape[0] * audio_gram.shape[0]
        # 64 bits per entry, 8e9 bits per GB; if > 2 GB, skip
        if (size * 64 / 8e9 > 2):
            print(
                "Distance matrix for {} and {} would be {} GB because the "
                "CQTs have shape {} and {}".format(
                    os.path.split(audio_filename)[1],
                    os.path.split(midi_filename)[1],
                    size * 64 / 8e9, audio_gram.shape[0],
                    midi_gram.shape[0]))
            return
        # Get distance matrix
        distance_matrix = 1 - np.dot(midi_gram, audio_gram.T)
        # Non-diagonal additive path penalty is the median of the sim mtx
        add_pen = np.median(distance_matrix)
        # Get best path through matrix
        aligned_midi_indices, aligned_audio_indices, score = djitw.dtw(
            distance_matrix, gully=.96, additive_penalty=add_pen,
            inplace=False)
        # Normalize score by path length
        score /= float(len(aligned_midi_indices))
        # Normalize score by mean sim matrix value within path chunk
        score /= distance_matrix[
            aligned_midi_indices.min():aligned_midi_indices.max(),
            aligned_audio_indices.min():aligned_audio_indices.max()].mean()
        # The confidence score is a normalized DTW distance, which
        # approximately falls in the range [.5, 1.] with .5 meaning a very
        # good alignment.  This maps the scores to [0., 1.] where 1. means a
        # very good alignment.
        score = np.clip(2 * (1 - score), 0, 1)
    except Exception:
        print("Error performing DTW for {} and {}: {}".format(
            os.path.split(audio_filename)[1],
            os.path.split(midi_filename)[1], traceback.format_exc()))
        return

    # Write out the aligned file
    if output_midi_filename is not None:
        try:
            # Adjust MIDI timing
            midi_frame_times = feature_extraction.frame_times(
                midi_features['gram'])
            audio_frame_times = feature_extraction.frame_times(
                audio_features['gram'])
            m.adjust_times(midi_frame_times[aligned_midi_indices],
                           audio_frame_times[aligned_audio_indices])
            check_subdirectories(output_midi_filename)
            m.write(output_midi_filename)
        except Exception:
            print("Error writing aligned .mid for {}: {}".format(
                os.path.split(midi_filename)[1], traceback.format_exc()))
            return

    if output_diagnostics_filename is not None:
        try:
            check_subdirectories(output_diagnostics_filename)
            # Construct empty additional diagnostics dict when None was given
            if additional_diagnostics is None:
                additional_diagnostics = {}
            # The feature and output MIDI paths are optional, so they are
            # only made absolute when given; os.path.abspath(None) raises.
            diagnostics = dict(
                aligned_midi_indices=aligned_midi_indices,
                aligned_audio_indices=aligned_audio_indices,
                score=score,
                audio_filename=os.path.abspath(audio_filename),
                midi_filename=os.path.abspath(midi_filename),
                audio_features_filename=_abspath_or_none(
                    audio_features_filename),
                midi_features_filename=_abspath_or_none(
                    midi_features_filename),
                output_midi_filename=_abspath_or_none(output_midi_filename),
                output_diagnostics_filename=os.path.abspath(
                    output_diagnostics_filename),
                **additional_diagnostics)
            deepdish.io.save(output_diagnostics_filename, diagnostics)
        except Exception:
            print("Error writing diagnostics for {} and {}: {}".format(
                os.path.split(audio_filename)[1],
                os.path.split(midi_filename)[1], traceback.format_exc()))
            return
    return aligned_midi_indices, aligned_audio_indices, score