コード例 #1
0
def process_one_file(audio_file, midi_file, output_midi_file, pair_file,
                     diagnostics_file):
    """
    Wrapper routine for loading in audio/MIDI data, aligning, and writing
    out the result.

    The MIDI file is synthesized, log-magnitude CQTs of both signals are
    compared with a cosine distance matrix, and the DTW path through that
    matrix is used to warp the MIDI timing onto the audio.

    Parameters
    ----------
    audio_file, midi_file, output_midi_file, pair_file, diagnostics_file : str
        Paths to the audio file to align, MIDI file to align, and paths where
        to write the aligned MIDI, the synthesized pair file, and the DTW
        diagnostics file.

    Returns
    -------
    None
        All results are written to ``output_midi_file``, ``pair_file`` (a
        two-channel wav: aligned MIDI synthesis and original audio) and
        ``diagnostics_file`` (JSON with keys ``p``, ``q`` and ``score``).
    """
    # Load in the audio data
    audio_data, _ = librosa.load(audio_file, sr=create_data.FS)
    # Compute the log-magnitude CQT of the data
    audio_cqt, audio_times = create_data.extract_cqt(audio_data)
    audio_cqt = librosa.logamplitude(audio_cqt, ref_power=audio_cqt.max()).T
    # Load and synthesize MIDI data
    midi_object = pretty_midi.PrettyMIDI(midi_file)
    midi_audio = midi_object.fluidsynth(fs=create_data.FS)
    # Compute log-magnitude CQT
    midi_cqt, midi_times = create_data.extract_cqt(midi_audio)
    midi_cqt = librosa.logamplitude(midi_cqt, ref_power=midi_cqt.max()).T
    # Compute cosine distance matrix (rows: MIDI frames, columns: audio frames)
    distance_matrix = scipy.spatial.distance.cdist(
        midi_cqt, audio_cqt, 'cosine')
    # Get lowest cost path; the median distance is the additive penalty
    p, q, score = djitw.dtw(
        distance_matrix, GULLY, np.median(distance_matrix), inplace=False)
    # Normalize by path length
    score = score/len(p)
    # Normalize by the mean of the distance submatrix spanned by the path.
    # The slice end is exclusive, so "+ 1" is needed to include the last
    # row/column of the path (and to avoid an empty slice, hence NaN, when
    # the path covers a single row or column).
    path_region = distance_matrix[p.min():p.max() + 1, q.min():q.max() + 1]
    score = score/path_region.mean()
    # Adjust the MIDI file
    midi_object.adjust_times(midi_times[p], audio_times[q])
    # Write the result
    midi_object.write(output_midi_file)
    # Synthesize aligned MIDI
    midi_audio_aligned = midi_object.fluidsynth(fs=create_data.FS)
    # Trim or zero-pad the synthesis so it has as many samples as the audio
    n_audio = audio_data.shape[0]
    n_midi = midi_audio_aligned.shape[0]
    if n_midi > n_audio:
        midi_audio_aligned = midi_audio_aligned[:n_audio]
    else:
        midi_audio_aligned = np.append(midi_audio_aligned,
                                       np.zeros(n_audio - n_midi))
    # Stack one in each channel
    librosa.output.write_wav(
        pair_file, np.array([midi_audio_aligned, audio_data]), create_data.FS)
    # Write out diagnostics.  json.dump emits text, so the file is opened in
    # text mode, and the numpy index arrays/scalar are converted to plain
    # Python types because numpy integers are not JSON serializable.
    with open(diagnostics_file, 'w') as f:
        json.dump({'p': np.asarray(p).tolist(),
                   'q': np.asarray(q).tolist(),
                   'score': float(score)}, f)
コード例 #2
0
ファイル: align.py プロジェクト: Franciscocartas5/TFG_repo
def align(mid_file, Y_pred):
    '''
    Align the piano roll of a MIDI file to an estimated piano roll with DTW.

    Parameters
    ----------
    mid_file : str
        MIDI file to load; passed to ``pretty_midi.PrettyMIDI``.
    Y_pred : np.ndarray
        Estimated piano roll.  It is multiplied against the transposed
        88-row reference roll (MIDI pitches 21-108), so its first axis must
        have 88 entries; the second axis is presumably time frames at the
        same frame rate as the reference (44100 / 512 Hz) - TODO confirm.

    Returns
    -------
    p, q : np.ndarray
        Row and column indices of the lowest-cost DTW path through the
        distance matrix; ``p`` indexes frames of the MIDI piano roll and
        ``q`` indexes frames (columns) of ``Y_pred``.
    '''
    import os
    import djitw
    import numpy as np
    import pretty_midi

    # Debug output kept from the original implementation
    print(os.getcwd())

    midi_data = pretty_midi.PrettyMIDI(midi_file=mid_file)
    # One piano-roll column per 512 samples at 44.1 kHz
    piano_roll = midi_data.get_piano_roll(fs=44100 / 512.0)
    # Keep only MIDI pitches 21..108 (88 rows)
    piano_roll = piano_roll[21:109, :]

    piano_roll_est = Y_pred

    # Binarize the reference roll: every cell above 0.5 becomes 1
    # (vectorized replacement of the element-by-element double loop)
    piano_roll[piano_roll > 0.5] = 1

    # Distance is 1 minus the frame-wise dot product.
    # NOTE(review): neither roll is L2-normalized in this function, so this
    # is not a true cosine distance and may be negative - confirm intended.
    distance_matrix = 1 - np.dot(piano_roll.T, piano_roll_est)
    # The mean distance is used as the penalty for non-diagonal moves
    penalty = distance_matrix.mean()
    # Compute lowest-cost path through distance matrix
    p, q, score = djitw.dtw(distance_matrix,
                            gully=.98,
                            additive_penalty=penalty)

    return p, q
コード例 #3
0
def compute_dtw_distance(x, y, metric):
    """
    Return a normalized DTW alignment cost between two feature sequences.

    Parameters
    ----------
    x, y : array-like
        2-D feature matrices handed to ``scipy.spatial.distance.cdist``.
    metric : str or callable
        Distance metric forwarded to ``cdist``.

    Returns
    -------
    cost : float
        DTW path cost divided by the path length and by the mean pairwise
        distance inside the path's bounding box.  The value is unbounded.
    """
    pairwise = sp.spatial.distance.cdist(x, y, metric=metric)
    # The median pairwise distance is the additive penalty charged for
    # non-diagonal moves
    median_distance = np.median(pairwise)
    path_x, path_y, cost = djitw.dtw(
        pairwise, gully=.96, additive_penalty=median_distance, inplace=False)
    # Bounding box (inclusive of the last path index) covered by the path
    row_lo, row_hi = path_x.min(), path_x.max() + 1
    col_lo, col_hi = path_y.min(), path_y.max() + 1
    # First normalize by the number of path steps ...
    cost = cost / float(len(path_x))
    # ... then by the average distance inside the bounding box
    cost = cost / pairwise[row_lo:row_hi, col_lo:col_hi].mean()
    # Drop into the debugger when the normalized cost is not a number
    if np.isnan(cost):
        pdb.set_trace()
    return cost
コード例 #4
0
def compute_dtw_distance(x, y, metric):
    """
    Compute the DTW distance between ``x`` and ``y`` under ``metric``.

    A pairwise distance matrix is built with ``cdist``, the lowest-cost DTW
    path is found (gully 0.96, additive penalty equal to the median
    distance), and the path cost is normalized twice: by the path length and
    by the mean distance within the rectangle the path spans.

    Parameters
    ----------
    x, y : array-like
        2-D feature matrices accepted by ``scipy.spatial.distance.cdist``.
    metric : str or callable
        Metric argument for ``cdist``.

    Returns
    -------
    cost : float
        The normalized (unbounded) DTW cost.
    """
    dists = sp.spatial.distance.cdist(x, y, metric=metric)
    # Penalty for leaving the diagonal: median of all pairwise distances
    off_diagonal_penalty = np.median(dists)
    dtw_result = djitw.dtw(dists,
                           gully=.96,
                           additive_penalty=off_diagonal_penalty,
                           inplace=False)
    rows, cols, cost = dtw_result
    # Sub-matrix spanned by the path, last indices included
    window = dists[rows.min():rows.max() + 1, cols.min():cols.max() + 1]
    # Path-length normalization
    cost /= float(len(rows))
    # Normalization by the mean distance inside the spanned window
    cost /= window.mean()
    # A NaN cost triggers an interactive debugging session
    if np.isnan(cost):
        pdb.set_trace()
    return cost
コード例 #5
0
def align(midi_object, audio_data, fs, hop, note_start, n_notes, penalty):
    '''
    Warp the timing of a MIDI object, in place, so it matches some audio.

    Parameters
    ----------
    midi_object : pretty_midi.PrettyMIDI
        MIDI content to align; its timing is modified by this call
    audio_data : np.ndarray
        Audio samples to align to
    fs : int
        Sampling rate of ``audio_data``; also used to synthesize the MIDI
    hop : int
        CQT hop length
    note_start : int
        Lowest MIDI note number covered by the CQT
    n_notes : int
        Number of notes covered by the CQT
    penalty : float or None
        DTW penalty for non-diagonal moves; when None, the mean of the
        distance matrix is used
    '''
    # Feature-extraction settings shared by both signals
    cqt_args = (fs, hop, note_start, n_notes)
    # Synthesize the MIDI and compute its CQ-gram and frame times
    synthesized = midi_object.fluidsynth(fs=fs)
    midi_gram, midi_times = extract_cqt(synthesized, *cqt_args)
    # Same features for the recorded audio
    audio_gram, audio_times = extract_cqt(audio_data, *cqt_args)
    # extract_cqt is expected to return L2-normalized frames, in which case
    # one minus the dot product of two frames is their cosine distance
    similarity = np.dot(midi_gram, audio_gram.T)
    distance_matrix = 1 - similarity
    # Fall back to the mean distance when no penalty was supplied
    if penalty is None:
        penalty = distance_matrix.mean()
    # Lowest-cost path through the distance matrix (the score is not used)
    midi_frames, audio_frames, _ = djitw.dtw(
        distance_matrix, gully=.98, additive_penalty=penalty)
    # Map the MIDI frame times on the path onto the matching audio times
    midi_object.adjust_times(midi_times[midi_frames],
                             audio_times[audio_frames])
コード例 #6
0
ファイル: align_midi.py プロジェクト: beckgom/pretty-midi
def align(midi_object, audio_data, fs, hop, note_start, n_notes, penalty):
    '''
    Align a MIDI object in-place to some audio data.

    Parameters
    ----------
    midi_object : pretty_midi.PrettyMIDI
        A pretty_midi.PrettyMIDI class instance describing some MIDI content
    audio_data : np.ndarray
        Samples of some audio data
    fs : int
        audio_data's sampling rate, and the sampling rate to use when
        synthesizing MIDI
    hop : int
        Hop length for CQT
    note_start : int
        Lowest MIDI note number for CQT
    n_notes : int
        Number of notes to include in the CQT
    penalty : float or None
        DTW non-diagonal move penalty.  If None, the mean of the distance
        matrix is used.
    '''
    # Get synthesized MIDI audio
    midi_audio = midi_object.fluidsynth(fs=fs)
    # Compute CQ-grams for MIDI and audio
    midi_gram, midi_times = extract_cqt(
        midi_audio, fs, hop, note_start, n_notes)
    audio_gram, audio_times = extract_cqt(
        audio_data, fs, hop, note_start, n_notes)
    # Compute distance matrix; extract_cqt is expected to return
    # L2-normalized frames, so a cosine distance matrix can be computed via
    # a dot product
    distance_matrix = 1 - np.dot(midi_gram, audio_gram.T)
    if penalty is None:
        penalty = distance_matrix.mean()
    # Compute lowest-cost path through distance matrix.  The gully and the
    # (additive) penalty are passed positionally: current djitw names the
    # third parameter ``additive_penalty``, so the former ``penalty=`` keyword
    # raises a TypeError there.
    p, q, score = djitw.dtw(distance_matrix, .98, penalty)
    # Adjust the timing of the MIDI object according to the alignment
    midi_object.adjust_times(midi_times[p], audio_times[q])
コード例 #7
0
def process_one_pair(midi_filename, mp3_filename, h5_filename,
                     unaligned_output_filename, aligned_output_filename,
                     mp3_output_filename, h5_output_filename):
    """
    Given a candidate MIDI-audio match, align the MIDI to the audio, then copy
    the unaligned and aligned MIDI if the score is high enough.

    Parameters
    ----------
    midi_filename : str
        Path to the MIDI file to align.
    mp3_filename : str
        Path to the audio file to align to.  The pre-computed audio CQT is
        loaded from this path with every 'mp3' replaced by 'h5'.
    h5_filename : str
        Path to the h5 file which is copied to ``h5_output_filename`` if the
        match was successful.
    unaligned_output_filename : str
        Where to copy the unaligned MIDI file if the match was successful.
    aligned_output_filename : str
        Where to write the aligned MIDI file if the match was successful.
    mp3_output_filename : str
        Where to copy the mp3 file if the match was successful.
    h5_output_filename : str
        Where to copy the h5 file if the match was successful.

    Returns
    -------
    result : list or None
        ``[msd_id, midi_md5, score]`` when the pair was aligned and all files
        were written, where ``msd_id`` is the name of the directory containing
        ``aligned_output_filename`` and ``midi_md5`` is that file's name
        without extension.  None when any step failed (a message is printed)
        or the score did not exceed ``SCORE_THRESHOLD``.
    """
    # NOTE: traceback.format_exc() takes an optional ``limit``, not the
    # exception; it always formats the exception currently being handled.
    try:
        m = pretty_midi.PrettyMIDI(midi_filename)
    except Exception:
        print('Could not parse {}: {}'.format(
            os.path.split(midi_filename)[1], traceback.format_exc()))
        return None
    try:
        midi_gram = feature_extraction.midi_cqt(m)
    except Exception:
        print("Error creating CQT for {}: {}".format(
            os.path.split(midi_filename)[1], traceback.format_exc()))
        return None
    # Construct path to pre-computed audio CQT path
    audio_features_filename = mp3_filename.replace('mp3', 'h5')
    try:
        audio_features = deepdish.io.load(audio_features_filename)
    except Exception:
        print("Error loading CQT for {}: {}".format(
            os.path.split(audio_features_filename)[1],
            traceback.format_exc()))
        return None
    audio_gram = audio_features['gram']
    # Check that the distance matrix will not be too big before computing
    size = midi_gram.shape[0] * audio_gram.shape[0]
    # Size in GB of a float64 (64 bits per entry) matrix; skip if > 2 GB
    size_gb = size * 64 / 8e9
    if size_gb > 2:
        print(
            "Distance matrix would be {} GB because the "
            "CQTs have shape {} and {}".format(
                size_gb, audio_gram.shape[0], midi_gram.shape[0]))
        return None
    # Get distance matrix
    distance_matrix = 1 - np.dot(midi_gram, audio_gram.T)
    # Non-diagonal additive path penalty is the mean of the sim mtx
    # Note that we typically use a median here, but a mean is faster and
    # produces close enough results
    add_pen = np.mean(distance_matrix)
    # Get best path through matrix
    aligned_midi_indices, aligned_audio_indices, score = djitw.dtw(
        distance_matrix, gully=.96, additive_penalty=add_pen,
        inplace=False)
    # Normalize score by path length
    score /= float(len(aligned_midi_indices))
    # Normalize score by mean sim matrix value within path chunk.  Slice ends
    # are exclusive, so "+ 1" is required to include the final row/column of
    # the path (and to avoid an empty slice for a single-row/column path).
    score /= distance_matrix[
        aligned_midi_indices.min():aligned_midi_indices.max() + 1,
        aligned_audio_indices.min():aligned_audio_indices.max() + 1].mean()
    # If the match was successful
    # NOTE(review): score is a normalized DTW *distance* here, where smaller
    # usually means a better alignment - confirm the direction of this test.
    if score > SCORE_THRESHOLD:
        # Try adjusting MIDI timing and writing out
        try:
            # Retrieve timing of frames in CQTs
            midi_frame_times = feature_extraction.frame_times(midi_gram)
            audio_frame_times = feature_extraction.frame_times(audio_gram)
            # Adjust MIDI file timing
            m.adjust_times(midi_frame_times[aligned_midi_indices],
                           audio_frame_times[aligned_audio_indices])
            # Make sure all output paths exist and write out
            check_subdirectories(aligned_output_filename)
            m.write(aligned_output_filename)
        except Exception:
            print("Error adjusting and writing {}: {}".format(
                os.path.split(midi_filename)[1],
                traceback.format_exc()))
            return None
        # Assuming the above worked, all we have to do now is copy
        # Check/create all necessary subdirectories
        check_subdirectories(unaligned_output_filename)
        check_subdirectories(mp3_output_filename)
        check_subdirectories(h5_output_filename)
        # Copy all files
        shutil.copy(midi_filename, unaligned_output_filename)
        shutil.copy(mp3_filename, mp3_output_filename)
        try:
            shutil.copy(h5_filename, h5_output_filename)
        except Exception:
            print("Could not copy {}: {}".format(
                os.path.split(h5_filename)[1],
                traceback.format_exc()))
            return None
        # Return list of [msd_id, midi_md5, score]
        prefix, aligned_basename = os.path.split(aligned_output_filename)
        msd_id = os.path.split(prefix)[1]
        midi_md5 = os.path.splitext(aligned_basename)[0]
        return [msd_id, midi_md5, score]
    return None
コード例 #8
0
def align_one_file(audio_filename, midi_filename, audio_features_filename=None,
                   midi_features_filename=None, output_midi_filename=None,
                   output_diagnostics_filename=None,
                   additional_diagnostics=None):
    '''
    Helper function for aligning a MIDI file to an audio file.

    Parameters
    ----------
    audio_filename : str
        Full path to an audio file.
    midi_filename : str
        Full path to a midi file.
    audio_features_filename : str or None
        Full path to pre-computed features for the audio file.
        If the file doesn't exist, features will be computed and saved.
        If None, force re-computation of the features and don't save.
    midi_features_filename : str or None
        Full path to pre-computed features for the midi file.
        If the file doesn't exist, features will be computed and saved.
        If None, force re-computation of the features and don't save.
    output_midi_filename : str or None
        Full path to where the aligned .mid file should be written.
        If None, don't output.
    output_diagnostics_filename : str or None
        Full path to a file to write out diagnostic information (alignment
        score, best path, paths to files, etc) in a .h5 file.  If None, don't
        output.  If this file already exists the pair is considered
        processed and nothing is done.
    additional_diagnostics : dict or None
        Optional dictionary of additional diagnostic information to include
        in the diagnostics file.  If None, don't include.

    Returns
    -------
    result : tuple or None
        ``(p, q, score)`` on success, where ``p`` and ``q`` are np.ndarray
        indices of the lowest-cost alignment path (MIDI frames and audio
        frames respectively) and ``score`` is the normalized DTW distance
        mapped to a confidence in [0, 1], 1 meaning a very good alignment.
        None when the diagnostics file already exists or when any step fails
        (an error message is printed in the latter case).
    '''
    def _abspath_or_none(path):
        # Optional filenames may be None; os.path.abspath(None) would raise
        return None if path is None else os.path.abspath(path)

    # Skip when already processed
    if (output_diagnostics_filename is not None
            and os.path.exists(output_diagnostics_filename)):
        return None

    # NOTE: traceback.format_exc() takes an optional ``limit``, not the
    # exception; it always formats the exception currently being handled.
    try:
        m = pretty_midi.PrettyMIDI(midi_filename)
    except Exception:
        print('Could not parse {}: {}'.format(
            os.path.split(midi_filename)[1], traceback.format_exc()))
        return None

    midi_features = {}

    # If a feature file was provided and the file exists, try to read it in
    if (midi_features_filename is not None and
            os.path.exists(midi_features_filename)):
        try:
            midi_features = deepdish.io.load(midi_features_filename)
        # If there was a problem reading, force re-creation
        except Exception:
            print("Error reading {}: {}".format(
                midi_features_filename, traceback.format_exc()))
            midi_features = {}

    if not midi_features:
        # Generate synthetic MIDI CQT
        try:
            midi_features['gram'] = feature_extraction.midi_cqt(m)
        except Exception:
            print("Error creating CQT for {}: {}".format(
                os.path.split(midi_filename)[1], traceback.format_exc()))
            return None
        if midi_features_filename is not None:
            try:
                # Write out
                check_subdirectories(midi_features_filename)
                deepdish.io.save(
                    midi_features_filename, midi_features)
            except Exception:
                print("Error writing {}: {}".format(
                    os.path.split(midi_filename)[1], traceback.format_exc()))
                return None

    audio_features = {}

    # If a feature file was provided and the file exists, try to read it in
    if (audio_features_filename is not None and
            os.path.exists(audio_features_filename)):
        try:
            audio_features = deepdish.io.load(audio_features_filename)
        # If there was a problem reading, force re-creation
        except Exception:
            print("Error reading {}: {}".format(
                audio_features_filename, traceback.format_exc()))
            audio_features = {}

    # Cache audio CQT
    if not audio_features:
        try:
            # Read in audio data (the returned sampling rate is not needed)
            audio, _ = librosa.load(
                audio_filename, sr=feature_extraction.AUDIO_FS)
            # Compute audio cqt
            audio_features['gram'] = feature_extraction.audio_cqt(audio)
        except Exception:
            print("Error creating CQT for {}: {}".format(
                os.path.split(audio_filename)[1], traceback.format_exc()))
            return None
        if audio_features_filename is not None:
            try:
                # Write out
                check_subdirectories(audio_features_filename)
                deepdish.io.save(audio_features_filename, audio_features)
            except Exception:
                print("Error writing {}: {}".format(
                    os.path.split(audio_filename)[1], traceback.format_exc()))
                return None

    try:
        # Check that the distance matrix will not be too big before computing
        size = midi_features['gram'].shape[0]*audio_features['gram'].shape[0]
        # Size in GB of a float64 (64 bits per entry) matrix; skip if > 1 GB
        if (size*64/8e9 > 1):
            print(
                "Distance matrix for {} and {} would be {} GB because the "
                "CQTs have shape {} and {}".format(
                    os.path.split(audio_filename)[1],
                    os.path.split(midi_filename)[1],
                    size*64/8e9, audio_features['gram'].shape[0],
                    midi_features['gram'].shape[0]))
            return None

        # Get distance matrix
        distance_matrix = 1 - np.dot(
            midi_features['gram'], audio_features['gram'].T)
        # Non-diagonal additive path penalty is the median of the sim mtx
        add_pen = np.median(distance_matrix)
        # Get best path through matrix
        aligned_midi_indices, aligned_audio_indices, score = djitw.dtw(
            distance_matrix, gully=.96, additive_penalty=add_pen,
            inplace=False)
        # Normalize score by path length
        score /= float(len(aligned_midi_indices))
        # Normalize score by mean sim matrix value within path chunk.  Slice
        # ends are exclusive, so "+ 1" is required to include the final
        # row/column of the path (and to avoid an empty slice for a
        # single-row/column path).
        score /= distance_matrix[
            aligned_midi_indices.min():aligned_midi_indices.max() + 1,
            aligned_audio_indices.min():aligned_audio_indices.max() + 1].mean()
        # The confidence score is a normalized DTW distance, which
        # approximately follows in the range [.5, 1.] with .5 meaning a very
        # good alignment.  This maps the scores from [0., 1.] where 1. means a
        # very good alignment.
        score = np.clip(2*(1 - score), 0, 1)
    except Exception:
        print("Error performing DTW for {} and {}: {}".format(
            os.path.split(audio_filename)[1],
            os.path.split(midi_filename)[1],
            traceback.format_exc()))
        return None

    # Write out the aligned file
    if output_midi_filename is not None:
        try:
            # Adjust MIDI timing
            midi_frame_times = feature_extraction.frame_times(
                midi_features['gram'])
            audio_frame_times = feature_extraction.frame_times(
                audio_features['gram'])
            m.adjust_times(midi_frame_times[aligned_midi_indices],
                           audio_frame_times[aligned_audio_indices])
            check_subdirectories(output_midi_filename)
            m.write(output_midi_filename)
        except Exception:
            print("Error writing aligned .mid for {}: {}".format(
                os.path.split(midi_filename)[1], traceback.format_exc()))
            return None

    if output_diagnostics_filename is not None:
        try:
            check_subdirectories(output_diagnostics_filename)
            # Construct empty additional diagnostics dict when None was given
            if additional_diagnostics is None:
                additional_diagnostics = {}
            # The feature/output filenames are optional, so None is recorded
            # as-is instead of being passed to os.path.abspath
            diagnostics = dict(
                aligned_midi_indices=aligned_midi_indices,
                aligned_audio_indices=aligned_audio_indices, score=score,
                audio_filename=os.path.abspath(audio_filename),
                midi_filename=os.path.abspath(midi_filename),
                audio_features_filename=_abspath_or_none(
                    audio_features_filename),
                midi_features_filename=_abspath_or_none(
                    midi_features_filename),
                output_midi_filename=_abspath_or_none(output_midi_filename),
                output_diagnostics_filename=os.path.abspath(
                    output_diagnostics_filename),
                **additional_diagnostics)
            deepdish.io.save(output_diagnostics_filename, diagnostics)
        except Exception:
            print("Error writing diagnostics for {} and {}: {}".format(
                os.path.split(audio_filename)[1],
                os.path.split(midi_filename)[1], traceback.format_exc()))
            return None
    return aligned_midi_indices, aligned_audio_indices, score
コード例 #9
0
def align_dataset(params, data):
    '''
    Perform alignment of all corrupted MIDIs in the database given the supplied
    parameters and collect per-example alignment results.

    Parameters
    ----------
    params : dict
        Dictionary of alignment parameters.  The keys read here are
        'feature', 'beat_sync', 'norm', 'standardize', 'metric', 'band_mask',
        'gully' and 'add_pen'.

    data : list of dict
        Collection of things to align, loaded via load_dataset.

    Returns
    -------
    results : collections.defaultdict of list
        Maps each of 'mean_errors', 'raw_scores', 'raw_scores_no_penalty',
        'path_lengths', 'distance_matrix_means' and 'feature_files' to a list
        with one entry per aligned example.  Examples that had to be skipped
        contribute no entries.
    '''
    def post_process_features(gram, beats):
        '''
        Apply processing to a feature matrix given the supplied param values

        Parameters
        ----------
        gram : np.ndarray
            Feature matrix, shape (n_features, n_samples)
        beats : np.ndarray
            Indices of beat locations in gram

        Returns
        -------
        gram : np.ndarray
            Feature matrix, shape (n_samples, n_features), post-processed
            according to the values in `params`
        '''
        # Convert to chroma
        if params['feature'] == 'chroma':
            gram = librosa.feature.chroma_cqt(
                C=gram, fmin=librosa.midi_to_hz(create_data.NOTE_START))
        # Beat-synchronize the feature matrix
        if params['beat_sync']:
            gram = librosa.feature.sync(gram, beats, pad=False)
        # Compute log magnitude
        gram = librosa.logamplitude(gram, ref_power=gram.max())
        # Normalize the feature vectors
        gram = librosa.util.normalize(gram, norm=params['norm'])
        # Standardize the feature vectors
        if params['standardize']:
            gram = scipy.stats.mstats.zscore(gram, axis=1)
        # Transpose it to (n_samples, n_features) and return it
        return gram.T
    # Per-quantity lists storing the results of each alignment
    results = collections.defaultdict(list)
    for d in data:
        # If we are beat syncing and either of the beat frames are empty, we
        # can't really align, so just skip this file.
        if params['beat_sync'] and (d['orig_beat_frames'].size == 0 or
                                    d['corrupted_beat_frames'].size == 0):
            continue
        # Post process the chosen feature matrices
        orig_gram = post_process_features(
            d['orig_gram'], d['orig_beat_frames'])
        corrupted_gram = post_process_features(
            d['corrupted_gram'], d['corrupted_beat_frames'])
        # Compute a distance matrix according to the supplied metric
        distance_matrix = scipy.spatial.distance.cdist(
            orig_gram, corrupted_gram, params['metric'])
        # If the entire distance matrix is non-finite, we can't align, skip
        finite = np.isfinite(distance_matrix)
        if not np.any(finite):
            continue
        # Set any NaN/inf values to the largest finite distance
        distance_matrix[np.logical_not(finite)] = np.max(
            distance_matrix[finite])
        # Compute a band mask or set to None for no mask
        if params['band_mask']:
            # The builtin bool is used because the np.bool alias was removed
            # from NumPy (1.24); the resulting dtype is identical
            mask = np.zeros(distance_matrix.shape, dtype=bool)
            djitw.band_mask(1 - params['gully'], mask)
        else:
            mask = None
        # Get DTW path and score
        add_pen = params['add_pen']*np.median(distance_matrix)
        p, q, score = djitw.dtw(
            distance_matrix, params['gully'], add_pen, mask=mask,
            inplace=False)
        if params['beat_sync']:
            # If we are beat syncing, we have to compare against beat times
            # so we index adjusted_times by the beat indices
            adjusted_times = d['adjusted_times'][d['orig_beat_frames']]
            corrupted_times = d['corrupted_beat_times']
        else:
            corrupted_times = d['corrupted_times']
            adjusted_times = d['adjusted_times']
        # Compute the error, clipped to within .5 seconds
        error = np.clip(
            corrupted_times[q] - adjusted_times[p], -.5, .5)
        # Compute the mean error for this MIDI
        mean_error = np.mean(np.abs(error))
        # If the mean error is NaN or inf for some reason, set it to max (.5)
        if not np.isfinite(mean_error):
            mean_error = .5
        results['mean_errors'].append(mean_error)
        results['raw_scores'].append(score)
        # Path cost without penalties: sum of distances along the path
        results['raw_scores_no_penalty'].append(distance_matrix[p, q].sum())
        results['path_lengths'].append(p.shape[0])
        # Mean distance within the rectangle spanned by the path
        results['distance_matrix_means'].append(np.mean(
            distance_matrix[p.min():p.max() + 1, q.min():q.max() + 1]))
        results['feature_files'].append(os.path.basename(d['feature_file']))
    return results
コード例 #10
0
def process_one_pair(midi_filename, mp3_filename, h5_filename,
                     unaligned_output_filename, aligned_output_filename,
                     mp3_output_filename, h5_output_filename):
    """
    Given a candidate MIDI-audio match, align the MIDI to the audio, then copy
    the unaligned and aligned MIDI if the score is high enough.

    Parameters
    ----------
    midi_filename : str
        Path to the MIDI file to align.
    mp3_filename : str
        Path to the audio file to align to.  The pre-computed audio CQT is
        loaded from this path with every 'mp3' replaced by 'h5'.
    h5_filename : str
        Path to the h5 file which is copied to ``h5_output_filename`` if the
        match was successful.
    unaligned_output_filename : str
        Where to copy the unaligned MIDI file if the match was successful.
    aligned_output_filename : str
        Where to write the aligned MIDI file if the match was successful.
    mp3_output_filename : str
        Where to copy the mp3 file if the match was successful.
    h5_output_filename : str
        Where to copy the h5 file if the match was successful.

    Returns
    -------
    result : list or None
        ``[msd_id, midi_md5, score]`` when the pair was aligned and all files
        were written, where ``msd_id`` is the name of the directory containing
        ``aligned_output_filename`` and ``midi_md5`` is that file's name
        without extension.  None when any step failed (a message is printed)
        or the score did not exceed ``SCORE_THRESHOLD``.
    """
    def _basename(path):
        # Final path component, used to keep error messages short
        return os.path.split(path)[1]

    # NOTE: traceback.format_exc() takes an optional ``limit``, not the
    # exception; it always formats the exception currently being handled.
    try:
        m = pretty_midi.PrettyMIDI(midi_filename)
    except Exception:
        print('Could not parse {}: {}'.format(
            _basename(midi_filename), traceback.format_exc()))
        return None
    try:
        midi_gram = feature_extraction.midi_cqt(m)
    except Exception:
        print("Error creating CQT for {}: {}".format(
            _basename(midi_filename), traceback.format_exc()))
        return None
    # Construct path to pre-computed audio CQT path
    audio_features_filename = mp3_filename.replace('mp3', 'h5')
    try:
        audio_features = deepdish.io.load(audio_features_filename)
    except Exception:
        print("Error loading CQT for {}: {}".format(
            _basename(audio_features_filename), traceback.format_exc()))
        return None
    audio_gram = audio_features['gram']
    # Check that the distance matrix will not be too big before computing
    size = midi_gram.shape[0] * audio_gram.shape[0]
    # Size in GB of a float64 (64 bits per entry) matrix; skip if > 2 GB
    size_gb = size * 64 / 8e9
    if size_gb > 2:
        print(
            "Distance matrix would be {} GB because the "
            "CQTs have shape {} and {}".format(size_gb,
                                               audio_gram.shape[0],
                                               midi_gram.shape[0]))
        return None
    # Get distance matrix
    distance_matrix = 1 - np.dot(midi_gram, audio_gram.T)
    # Non-diagonal additive path penalty is the mean of the sim mtx
    # Note that we typically use a median here, but a mean is faster and
    # produces close enough results
    add_pen = np.mean(distance_matrix)
    # Get best path through matrix
    aligned_midi_indices, aligned_audio_indices, score = djitw.dtw(
        distance_matrix, gully=.96, additive_penalty=add_pen, inplace=False)
    # Normalize score by path length
    score /= float(len(aligned_midi_indices))
    # Normalize score by mean sim matrix value within path chunk.  Slice ends
    # are exclusive, so "+ 1" is required to include the final row/column of
    # the path (and to avoid an empty slice for a single-row/column path).
    score /= distance_matrix[
        aligned_midi_indices.min():aligned_midi_indices.max() + 1,
        aligned_audio_indices.min():aligned_audio_indices.max() + 1].mean()
    # Nothing to write or copy unless the match was successful
    # NOTE(review): score is a normalized DTW *distance* here, where smaller
    # usually means a better alignment - confirm the direction of this test.
    if not score > SCORE_THRESHOLD:
        return None
    # Try adjusting MIDI timing and writing out
    try:
        # Retrieve timing of frames in CQTs
        midi_frame_times = feature_extraction.frame_times(midi_gram)
        audio_frame_times = feature_extraction.frame_times(audio_gram)
        # Adjust MIDI file timing
        m.adjust_times(midi_frame_times[aligned_midi_indices],
                       audio_frame_times[aligned_audio_indices])
        # Make sure all output paths exist and write out
        check_subdirectories(aligned_output_filename)
        m.write(aligned_output_filename)
    except Exception:
        print("Error adjusting and writing {}: {}".format(
            _basename(midi_filename), traceback.format_exc()))
        return None
    # Assuming the above worked, all we have to do now is copy
    # Check/create all necessary subdirectories
    check_subdirectories(unaligned_output_filename)
    check_subdirectories(mp3_output_filename)
    check_subdirectories(h5_output_filename)
    # Copy all files
    shutil.copy(midi_filename, unaligned_output_filename)
    shutil.copy(mp3_filename, mp3_output_filename)
    try:
        shutil.copy(h5_filename, h5_output_filename)
    except Exception:
        print("Could not copy {}: {}".format(
            _basename(h5_filename), traceback.format_exc()))
        return None
    # Return list of [msd_id, midi_md5, score]
    prefix, aligned_basename = os.path.split(aligned_output_filename)
    msd_id = os.path.split(prefix)[1]
    midi_md5 = os.path.splitext(aligned_basename)[0]
    return [msd_id, midi_md5, score]
コード例 #11
0
ファイル: match.py プロジェクト: wonderwrj/midi-dataset
def match_one_midi(midi_gram, midi_embedding, midi_hash_sequence,
                   msd_embeddings, msd_sequences, msd_feature_paths, msd_ids):
    """
    Match one MIDI file to the million song dataset by pruning on embedding
    distance, re-pruning by matching its downsampled hash sequence, and
    finally doing DTW on CQTs on the remaining entries.

    Parameters
    ----------
    midi_gram : np.ndarray
        Synthesized MIDI CQT
    midi_embedding : np.ndarray
        Embedding of the synthesized MIDI CQT
    midi_hash_sequence : np.ndarray
        Downsampled hash sequence of the MIDI CQT
    msd_embeddings : np.ndarray
        (# MSD entries x embedding dimension) matrix of all embeddings for all
        entries from the MSD
    msd_sequences : list of np.ndarray
        List of binary vector sequences (represented as ints) for all MSD
        entries
    msd_feature_paths : list of str
        Path to feature files (containing CQT) for each MSD entry
    msd_ids : list of str
        MSD ID of each corresponding entry in the above lists

    Returns
    -------
    dtw_matches : list of list
        List of [msd_id, score] for all non-pruned MSD entries.  Entries whose
        feature file could not be loaded, or whose distance matrix would be
        too large, are reported on stdout and left out of the list.
    """
    # Squared Euclidean distance between the MIDI embedding and every MSD
    # embedding (one value per MSD entry)
    embedding_distances = np.sum((msd_embeddings - midi_embedding)**2, axis=1)
    # Indices of the TOP_EMBEDDINGS MSD entries closest to the query embedding
    embedding_matches = np.argsort(embedding_distances)[:TOP_EMBEDDINGS]
    # Match this hash sequence to the MSD sequences that survived pruning
    hash_matches, _, _ = dhs.match_one_sequence(
        midi_hash_sequence, msd_sequences, GULLY, PENALTY, True,
        embedding_matches)
    # Keep only the top N sequence matches
    hash_matches = hash_matches[:TOP_SEQUENCES]
    # List for storing final match information
    matches = []
    # Perform DTW matching for each non-pruned MSD entry
    for match in hash_matches:
        # Path to the pre-computed audio features (containing the CQT)
        audio_features_filename = msd_feature_paths[match]
        try:
            audio_features = deepdish.io.load(audio_features_filename)
        except Exception:
            # format_exc() reads the active exception itself; its only
            # positional argument is a traceback depth limit, so the
            # exception object must not be passed to it.
            print("Error loading CQT for {}: {}".format(
                os.path.split(audio_features_filename)[1],
                traceback.format_exc()))
            continue
        audio_gram = audio_features['gram']
        # Check that the distance matrix will not be too big before computing
        size = midi_gram.shape[0] * audio_gram.shape[0]
        # Size in GB, at 64 bits per matrix entry
        size_gb = size * 64 / 8e9
        # If > 2 GB, skip
        if size_gb > 2:
            print(
                "Distance matrix would be {} GB because the "
                "CQTs have shape {} and {}".format(
                    size_gb, audio_gram.shape[0], midi_gram.shape[0]))
            continue
        # Get distance matrix
        distance_matrix = 1 - np.dot(midi_gram, audio_gram.T)
        # Non-diagonal additive path penalty is the mean of the sim mtx
        # Note that we typically use a median here, but a mean is faster and
        # produces close enough results
        add_pen = np.mean(distance_matrix)
        # Get best path through matrix
        aligned_midi_indices, aligned_audio_indices, score = djitw.dtw(
            distance_matrix, gully=.96, additive_penalty=add_pen,
            inplace=False)
        # Normalize score by path length
        score /= float(len(aligned_midi_indices))
        # Normalize score by mean sim matrix value within path chunk
        # NOTE(review): the slice end is exclusive, so the last aligned
        # row/column is left out and a path confined to a single row or
        # column yields an empty slice - confirm this is intended.
        score /= distance_matrix[
            aligned_midi_indices.min():aligned_midi_indices.max(),
            aligned_audio_indices.min():aligned_audio_indices.max()].mean()
        # The confidence score is a normalized DTW distance, which
        # approximately follows in the range [.5, 1.] with .5 meaning a very
        # good alignment.  This maps the scores from [0., 1.] where 1. means a
        # very good alignment.
        score = np.clip(2 * (1 - score), 0, 1)
        matches.append([msd_ids[match], score])
    return matches
コード例 #12
0
def align_one_file(audio_filename,
                   midi_filename,
                   audio_features_filename=None,
                   midi_features_filename=None,
                   output_midi_filename=None,
                   output_diagnostics_filename=None,
                   additional_diagnostics=None):
    '''
    Helper function for aligning a MIDI file to an audio file.

    Parameters
    ----------
    audio_filename : str
        Full path to an audio file.
    midi_filename : str
        Full path to a midi file.
    audio_features_filename : str or None
        Full path to pre-computed features for the audio file.
        If the file doesn't exist, features will be computed and saved.
        If None, force re-computation of the features and don't save.
    midi_features_filename : str or None
        Full path to pre-computed features for the midi file.
        If the file doesn't exist, features will be computed and saved.
        If None, force re-computation of the features and don't save.
    output_midi_filename : str or None
        Full path to where the aligned .mid file should be written.
        If None, don't output.
    output_diagnostics_filename : str or None
        Full path to a file to write out diagnostic information (alignment
        score, best path, paths to files, etc) in a .h5 file.  If None, don't
        output.  If this file already exists, the pair is treated as already
        processed and nothing is done.
    additional_diagnostics : dict or None
        Optional dictionary of additional diagnostic information to include
        in the diagnostics file.  If None, don't include.

    Returns
    -------
    p, q : np.ndarray
        Indices of the lowest-cost alignment between the audio and MIDI
    score : float
        Confidence score in [0, 1], where 1 means a very good alignment

    Notes
    -----
    ``None`` is returned instead of the tuple above when the pair was already
    processed or when any step fails; failures are reported on stdout rather
    than raised.
    '''
    def _abspath_or_none(path):
        # Optional paths may be None; os.path.abspath(None) would raise
        return None if path is None else os.path.abspath(path)

    # Skip when already processed
    if (output_diagnostics_filename is not None
            and os.path.exists(output_diagnostics_filename)):
        return

    # NOTE: traceback.format_exc() picks up the active exception on its own.
    # Its only positional argument is a depth limit, so the exception object
    # must not be passed to it.
    try:
        m = pretty_midi.PrettyMIDI(midi_filename)
    except Exception:
        print('Could not parse {}: {}'.format(
            os.path.split(midi_filename)[1], traceback.format_exc()))
        return

    midi_features = {}

    # If a feature file was provided and the file exists, try to read it in
    if (midi_features_filename is not None
            and os.path.exists(midi_features_filename)):
        try:
            midi_features = deepdish.io.load(midi_features_filename)
        # If there was a problem reading, force re-creation
        except Exception:
            print("Error reading {}: {}".format(midi_features_filename,
                                                traceback.format_exc()))
            midi_features = {}

    if not midi_features:
        # Generate synthetic MIDI CQT
        try:
            midi_features['gram'] = feature_extraction.midi_cqt(m)
        except Exception:
            print("Error creating CQT for {}: {}".format(
                os.path.split(midi_filename)[1], traceback.format_exc()))
            return
        # Cache the freshly computed features when a path was given
        if midi_features_filename is not None:
            try:
                check_subdirectories(midi_features_filename)
                deepdish.io.save(midi_features_filename, midi_features)
            except Exception:
                print("Error writing {}: {}".format(
                    os.path.split(midi_filename)[1], traceback.format_exc()))
                return

    audio_features = {}

    # If a feature file was provided and the file exists, try to read it in
    if (audio_features_filename is not None
            and os.path.exists(audio_features_filename)):
        try:
            audio_features = deepdish.io.load(audio_features_filename)
        # If there was a problem reading, force re-creation
        except Exception:
            print("Error reading {}: {}".format(audio_features_filename,
                                                traceback.format_exc()))
            audio_features = {}

    if not audio_features:
        try:
            # Read in audio data; the returned sampling rate is not needed
            audio, _ = librosa.load(audio_filename,
                                    sr=feature_extraction.AUDIO_FS)
            # Compute audio cqt
            audio_features['gram'] = feature_extraction.audio_cqt(audio)
        except Exception:
            print("Error creating CQT for {}: {}".format(
                os.path.split(audio_filename)[1], traceback.format_exc()))
            return
        # Cache the freshly computed features when a path was given
        if audio_features_filename is not None:
            try:
                check_subdirectories(audio_features_filename)
                deepdish.io.save(audio_features_filename, audio_features)
            except Exception:
                print("Error writing {}: {}".format(
                    os.path.split(audio_filename)[1], traceback.format_exc()))
                return

    try:
        # Check that the distance matrix will not be too big before computing
        size = midi_features['gram'].shape[0] * audio_features['gram'].shape[0]
        # Size in GB, at 64 bits per matrix entry
        size_gb = size * 64 / 8e9
        # If > 2 GB, skip
        if size_gb > 2:
            print(
                "Distance matrix for {} and {} would be {} GB because the "
                "CQTs have shape {} and {}".format(
                    os.path.split(audio_filename)[1],
                    os.path.split(midi_filename)[1], size_gb,
                    audio_features['gram'].shape[0],
                    midi_features['gram'].shape[0]))
            return

        # Get distance matrix
        distance_matrix = 1 - np.dot(midi_features['gram'],
                                     audio_features['gram'].T)
        # Non-diagonal additive path penalty is the median of the sim mtx
        add_pen = np.median(distance_matrix)
        # Get best path through matrix
        aligned_midi_indices, aligned_audio_indices, score = djitw.dtw(
            distance_matrix,
            gully=.96,
            additive_penalty=add_pen,
            inplace=False)
        # Normalize score by path length
        score /= float(len(aligned_midi_indices))
        # Normalize score by mean sim matrix value within path chunk
        score /= distance_matrix[
            aligned_midi_indices.min():aligned_midi_indices.max(),
            aligned_audio_indices.min():aligned_audio_indices.max()].mean()
        # The confidence score is a normalized DTW distance, which
        # approximately follows in the range [.5, 1.] with .5 meaning a very
        # good alignment.  This maps the scores from [0., 1.] where 1. means a
        # very good alignment.
        score = np.clip(2 * (1 - score), 0, 1)
    except Exception:
        print("Error performing DTW for {} and {}: {}".format(
            os.path.split(audio_filename)[1],
            os.path.split(midi_filename)[1], traceback.format_exc()))
        return

    # Write out the aligned file
    if output_midi_filename is not None:
        try:
            # Adjust MIDI timing
            midi_frame_times = feature_extraction.frame_times(
                midi_features['gram'])
            audio_frame_times = feature_extraction.frame_times(
                audio_features['gram'])
            m.adjust_times(midi_frame_times[aligned_midi_indices],
                           audio_frame_times[aligned_audio_indices])
            check_subdirectories(output_midi_filename)
            m.write(output_midi_filename)
        except Exception:
            print("Error writing aligned .mid for {}: {}".format(
                os.path.split(midi_filename)[1], traceback.format_exc()))
            return

    if output_diagnostics_filename is not None:
        try:
            check_subdirectories(output_diagnostics_filename)
            # Construct empty additional diagnostics dict when None was given
            if additional_diagnostics is None:
                additional_diagnostics = {}
            # The feature and output-MIDI paths are optional, so they are
            # recorded as None when not supplied instead of making the whole
            # diagnostics write fail.
            diagnostics = dict(
                aligned_midi_indices=aligned_midi_indices,
                aligned_audio_indices=aligned_audio_indices,
                score=score,
                audio_filename=os.path.abspath(audio_filename),
                midi_filename=os.path.abspath(midi_filename),
                audio_features_filename=_abspath_or_none(
                    audio_features_filename),
                midi_features_filename=_abspath_or_none(
                    midi_features_filename),
                output_midi_filename=_abspath_or_none(output_midi_filename),
                output_diagnostics_filename=os.path.abspath(
                    output_diagnostics_filename),
                **additional_diagnostics)
            deepdish.io.save(output_diagnostics_filename, diagnostics)
        except Exception:
            print("Error writing diagnostics for {} and {}: {}".format(
                os.path.split(audio_filename)[1],
                os.path.split(midi_filename)[1], traceback.format_exc()))
            return
    return aligned_midi_indices, aligned_audio_indices, score