Example #1
def _rescale_notes(intervals, pitches, frames):
    """
    Rescale notes from raw prediction results for writing MIDI.

    Parameters
    ----------
    intervals: Intervals from extract_notes.
    pitches: Pitches from extract_notes.
    frames: Frame prediction result.

    Returns
    -------
    re_intervals: Rescaled intervals.
    re_pitches: Rescaled pitches.
    re_times: Rescaled times.
    re_freqs: Rescaled frequencies.
    """
    scaling = HOP_LENGTH / SAMPLE_RATE

    re_intervals = (intervals * scaling).reshape(-1, 2)
    re_pitches = np.array([midi_to_hz(MIN_MIDI + midi) for midi in pitches])

    times, freqs = notes_to_frames(pitches, intervals, frames)
    re_times = times.astype(np.float64) * scaling
    re_freqs = [
        np.array([midi_to_hz(MIN_MIDI + midi) for midi in freq])
        for freq in freqs
    ]

    return re_intervals, re_pitches, re_times, re_freqs
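For reference, a minimal self-contained sketch of the two conversions this helper performs, assuming HOP_LENGTH = 512, SAMPLE_RATE = 16000, and MIN_MIDI = 21 (the values used in other examples on this page; adjust to your own model):

import numpy as np

# Assumed constants (match the other snippets here, not necessarily your setup).
HOP_LENGTH, SAMPLE_RATE, MIN_MIDI = 512, 16000, 21

def midi_to_hz(midi):
    # Standard 12-TET conversion with A4 = 440 Hz (what librosa.midi_to_hz computes).
    return 440.0 * 2.0 ** ((np.asarray(midi, dtype=float) - 69) / 12)

# extract_notes returns intervals as (start_frame, end_frame) pairs and pitches as
# 0-based piano-roll bins; the scaling factor turns frames into seconds and
# MIN_MIDI turns bins into MIDI note numbers.
intervals = np.array([[0, 32], [16, 64]])    # frame indices
pitches = np.array([39, 51])                 # bins -> MIDI 60 (C4) and 72 (C5)

scaling = HOP_LENGTH / SAMPLE_RATE           # 0.032 s per frame here
print((intervals * scaling).reshape(-1, 2))  # [[0.    1.024] [0.512 2.048]]
print(midi_to_hz(MIN_MIDI + pitches))        # [261.63 523.25] (Hz, rounded)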
Example #2
def eval_one_data(answer_true, answer_pred, onset_tolerance=0.05):
    ref_pitches = []
    est_pitches = []
    ref_intervals = []
    est_intervals = []
    for i in range(len(answer_true)):
        if answer_true[i] is not None and float(answer_true[i][1]) - float(
                answer_true[i][0]) > 0:
            ref_intervals.append([answer_true[i][0], answer_true[i][1]])
            ref_pitches.append(answer_true[i][2])

    for i in range(len(answer_pred)):
        if answer_pred[i] is not None and float(answer_pred[i][1]) - float(
                answer_pred[i][0]) > 0:
            est_intervals.append([answer_pred[i][0], answer_pred[i][1]])
            est_pitches.append(answer_pred[i][2])

    ref_intervals = np.array(ref_intervals)
    est_intervals = np.array(est_intervals)

    ref_pitches = np.array(
        [float(ref_pitches[i]) for i in range(len(ref_pitches))])
    est_pitches = np.array(
        [float(est_pitches[i]) for i in range(len(est_pitches))])

    ref_pitches = util.midi_to_hz(ref_pitches)
    est_pitches = util.midi_to_hz(est_pitches)

    if len(est_intervals) == 0:
        ret = np.zeros(14)
        ret[9] = len(ref_pitches)
        return ret

    raw_data = transcription.evaluate(ref_intervals,
                                      ref_pitches,
                                      est_intervals,
                                      est_pitches,
                                      onset_tolerance=onset_tolerance,
                                      pitch_tolerance=50)

    ret = np.zeros(14)
    ret[0] = raw_data['Precision']
    ret[1] = raw_data['Recall']
    ret[2] = raw_data['F-measure']
    ret[3] = raw_data['Precision_no_offset']
    ret[4] = raw_data['Recall_no_offset']
    ret[5] = raw_data['F-measure_no_offset']
    ret[6] = raw_data['Onset_Precision']
    ret[7] = raw_data['Onset_Recall']
    ret[8] = raw_data['Onset_F-measure']
    ret[9] = len(ref_pitches)
    ret[10] = len(est_pitches)
    ret[11] = int(round(ret[1] * ret[9]))
    ret[12] = int(round(ret[4] * ret[9]))
    ret[13] = int(round(ret[7] * ret[9]))

    return ret
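The 14-slot return vector packs both rates (indices 0-8) and counts (indices 9-13), which makes micro-averaging over many songs straightforward. An illustrative aggregation sketch (not part of the original code; it relies only on the slot layout shown above):

import numpy as np

def aggregate(rets):
    # rets: list of 14-element vectors returned by eval_one_data, one per song.
    total = np.sum(np.stack(rets), axis=0)
    n_ref = total[9]                    # total number of reference notes
    return {
        'recall': total[11] / max(n_ref, 1),            # with offsets
        'recall_no_offset': total[12] / max(n_ref, 1),
        'onset_recall': total[13] / max(n_ref, 1),
    }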
Example #3
def evaluate(model, batch, device):
    metrics = defaultdict(list)
    batch = allocate_batch(batch, device)

    frame_logit, onset_logit = model(batch['audio'])

    criterion = nn.BCEWithLogitsLoss()
    frame_loss = criterion(frame_logit, batch['frame'])
    onset_loss = criterion(onset_logit, batch['onset'])
    metrics['metric/loss/frame_loss'].append(frame_loss.cpu().numpy())
    metrics['metric/loss/onset_loss'].append(onset_loss.cpu().numpy())

    for n in range(batch['audio'].shape[0]):
        frame_pred = th.sigmoid(frame_logit[n])
        onset_pred = th.sigmoid(onset_logit[n])

        pr, re, f1 = framewise_eval(frame_pred, batch['frame'][n])
        metrics['metric/frame/frame_precision'].append(pr)
        metrics['metric/frame/frame_recall'].append(re)
        metrics['metric/frame/frame_f1'].append(f1)

        pr, re, f1 = framewise_eval(onset_pred, batch['onset'][n])
        metrics['metric/frame/onset_precision'].append(pr)
        metrics['metric/frame/onset_recall'].append(re)
        metrics['metric/frame/onset_f1'].append(f1)

        p_est, i_est = extract_notes(onset_pred, frame_pred)
        p_ref, i_ref = extract_notes(batch['onset'][n], batch['frame'][n])

        scaling = HOP_SIZE / SAMPLE_RATE

        i_ref = (i_ref * scaling).reshape(-1, 2)
        p_ref = np.array([midi_to_hz(MIN_MIDI + pitch) for pitch in p_ref])
        i_est = (i_est * scaling).reshape(-1, 2)
        p_est = np.array([midi_to_hz(MIN_MIDI + pitch) for pitch in p_est])

        p, r, f, o = evaluate_notes(i_ref,
                                    p_ref,
                                    i_est,
                                    p_est,
                                    offset_ratio=None)
        metrics['metric/note/precision'].append(p)
        metrics['metric/note/recall'].append(r)
        metrics['metric/note/f1'].append(f)
        metrics['metric/note/overlap'].append(o)

        p, r, f, o = evaluate_notes(i_ref, p_ref, i_est, p_est)
        metrics['metric/note-with-offsets/precision'].append(p)
        metrics['metric/note-with-offsets/recall'].append(r)
        metrics['metric/note-with-offsets/f1'].append(f)
        metrics['metric/note-with-offsets/overlap'].append(o)

    return metrics
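framewise_eval is not shown in these snippets; a plausible minimal stand-in (an assumption, not the authors' implementation) binarizes the sigmoid outputs at 0.5 and computes precision, recall, and F1 against the label roll:

import torch as th

def framewise_eval(pred, label, threshold=0.5, eps=1e-8):
    # Hypothetical replacement for the framewise_eval used above.
    pred_bin = (pred > threshold).float()
    label_bin = (label > threshold).float()
    tp = (pred_bin * label_bin).sum()
    precision = tp / (pred_bin.sum() + eps)
    recall = tp / (label_bin.sum() + eps)
    f1 = 2 * precision * recall / (precision + recall + eps)
    return precision.item(), recall.item(), f1.item()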
Example #4
def eval_one_data(answer_true,
                  answer_pred,
                  onset_tolerance=0.05,
                  shifting=0,
                  gt_pitch_shift=0):

    ref_intervals, est_intervals, ref_pitches, est_pitches = prepare_data(
        answer_true, answer_pred, time_shift=shifting)

    ref_pitches = np.array([
        float(ref_pitches[i]) + gt_pitch_shift for i in range(len(ref_pitches))
    ])
    est_pitches = np.array(
        [float(est_pitches[i]) for i in range(len(est_pitches))])

    ref_pitches = util.midi_to_hz(ref_pitches)
    est_pitches = util.midi_to_hz(est_pitches)

    if len(est_intervals) == 0:
        ret = np.zeros(14)
        ret[9] = len(ref_pitches)
        return ret

    raw_data = transcription.evaluate(ref_intervals,
                                      ref_pitches,
                                      est_intervals,
                                      est_pitches,
                                      onset_tolerance=onset_tolerance,
                                      pitch_tolerance=50)

    ret = np.zeros(14)
    ret[0] = raw_data['Precision']
    ret[1] = raw_data['Recall']
    ret[2] = raw_data['F-measure']
    ret[3] = raw_data['Precision_no_offset']
    ret[4] = raw_data['Recall_no_offset']
    ret[5] = raw_data['F-measure_no_offset']
    ret[6] = raw_data['Onset_Precision']
    ret[7] = raw_data['Onset_Recall']
    ret[8] = raw_data['Onset_F-measure']
    ret[9] = len(ref_pitches)
    ret[10] = len(est_pitches)
    ret[11] = int(round(ret[1] * ret[9]))
    ret[12] = int(round(ret[4] * ret[9]))
    ret[13] = int(round(ret[7] * ret[9]))

    # print (ret[13], ret[8])
    return ret
Example #5
def transcribe_file(model_file, flac_paths, save_path, sequence_length,
                    onset_threshold, frame_threshold):

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = torch.load(model_file, map_location=device).eval()
    summary(model)

    for flac_path in flac_paths:
        #print(f'Processing {flac_path}...', file=sys.stderr)
        audio = load_and_process_audio(flac_path, sequence_length, device)
        predictions = transcribe(model, audio)

        p_est, i_est, v_est = extract_notes(predictions['onset'],
                                            predictions['frame'],
                                            predictions['velocity'],
                                            onset_threshold, frame_threshold)

        scaling = HOP_LENGTH / SAMPLE_RATE

        i_est = (i_est * scaling).reshape(-1, 2)
        p_est = np.array([midi_to_hz(MIN_MIDI + midi) for midi in p_est])

        os.makedirs(save_path, exist_ok=True)
        pred_path = os.path.join(
            save_path,
            os.path.basename(flac_path) + "." + model_file + '.pred.png')
        #print(f'saved prediction to path: {pred_path}', file=sys.stderr)
        save_pianoroll(pred_path, predictions['onset'], predictions['frame'])
        midi_path = os.path.join(
            save_path,
            os.path.basename(flac_path) + "." + model_file + '.pred.mid')
        #print(f'saved mid to path: {midi_path}', file=sys.stderr)
        save_midi(midi_path, p_est, i_est, v_est)
Example #6
def midi_preparation(midifile):

    midi_data = dict()
    midi_data['onsets'] = dict()
    midi_data['offsets'] = dict()
    midi_data['midipitches'] = dict()  # midi notes?
    midi_data['hz'] = dict()

    patt = pretty_midi.PrettyMIDI(midifile)
    midi_data['downbeats'] = patt.get_downbeats()

    for instrument in patt.instruments:
        midi_data['onsets'][instrument.name] = []
        midi_data['offsets'][instrument.name] = []
        midi_data['midipitches'][instrument.name] = []

        for note in instrument.notes:
            midi_data['onsets'][instrument.name].append(note.start)
            midi_data['offsets'][instrument.name].append(note.end)
            midi_data['midipitches'][instrument.name].append(note.pitch)

        p = midi_data['midipitches'][instrument.name]
        midi_data['hz'][instrument.name] = midi_to_hz(np.array(p))

    return midi_data
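A hypothetical usage sketch for midi_preparation, assuming the function above is in scope, midi_to_hz comes from librosa, and pretty_midi is installed; the one-note file is written only for illustration:

import pretty_midi

# Build a throwaway single-note MIDI file to feed into midi_preparation.
pm = pretty_midi.PrettyMIDI()
piano = pretty_midi.Instrument(program=0, name='piano')
piano.notes.append(pretty_midi.Note(velocity=80, pitch=60, start=0.0, end=0.5))
pm.instruments.append(piano)
pm.write('example.mid')

midi_data = midi_preparation('example.mid')
print(midi_data['onsets']['piano'])       # [0.0]
print(midi_data['midipitches']['piano'])  # [60]
print(midi_data['hz']['piano'])           # [261.625...]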
Example #7
def transcribe_file(model_file, audio_paths, save_path, sequence_length,
                    onset_threshold, frame_threshold, device):

    model = torch.load(model_file, map_location=device).eval()
    summary(model)

    for i, audio_path in enumerate(audio_paths):
        print(f'{i+1}/{len(audio_paths)}: Processing {audio_path}...',
              file=sys.stderr)
        audio = load_and_process_audio(audio_path, sequence_length, device)
        predictions = transcribe(model, audio)

        p_est, i_est, v_est = extract_notes(predictions['onset'],
                                            predictions['frame'],
                                            predictions['velocity'],
                                            onset_threshold, frame_threshold)

        scaling = HOP_LENGTH / SAMPLE_RATE

        i_est = (i_est * scaling).reshape(-1, 2)
        p_est = np.array([midi_to_hz(MIN_MIDI + midi) for midi in p_est])

        os.makedirs(save_path, exist_ok=True)
        pred_path = os.path.join(save_path,
                                 os.path.basename(audio_path) + '.pred.png')
        save_pianoroll(pred_path, predictions['onset'], predictions['frame'])
        midi_path = os.path.join(save_path,
                                 os.path.basename(audio_path) + '.pred.mid')
        save_midi(midi_path, p_est, i_est, v_est)
Example #8
def transcribe(audio, model, args, save_name, max_len):
    print(f'save_path: {save_name}')
    audio = audio[:max_len*SAMPLE_RATE]
    t_audio = th.tensor(audio).to(th.float).cuda()
    pad_len = math.ceil(len(t_audio) / HOP_SIZE) * HOP_SIZE - len(t_audio)
    t_audio = th.unsqueeze(F.pad(t_audio, (0, pad_len)), 0)

    frame_logit, onset_logit = model(t_audio)
    onset = th.sigmoid(onset_logit[0])
    frame = th.sigmoid(frame_logit[0])

    p_est, i_est = extract_notes(onset, frame)

    scaling = HOP_SIZE / SAMPLE_RATE

    i_est = (i_est * scaling).reshape(-1, 2)
    p_est = np.array([midi_to_hz(MIN_MIDI + pitch) for pitch in p_est])

    numpy_filename = Path(save_name).parent / (Path(save_name).stem + '.npz')
    np.savez(numpy_filename, onset=onset.cpu().numpy(), frame=frame.cpu().numpy())


    midi_filename = Path(save_name).parent / (Path(save_name).stem + '.midi')
    save_midi(midi_filename, p_est, i_est, [64] * len(p_est))

    wav_filename = Path(save_name).parent / (Path(save_name).stem + '.wav')
    midi_file = pretty_midi.PrettyMIDI(str(midi_filename))
    synth_audio = midi_file.fluidsynth(fs=16000)
    soundfile.write(wav_filename, synth_audio, 16000)
Example #9
def transcribe(audio, model, args, save_name, max_len):
    print(f'save_path: {save_name}')
    audio = audio[:max_len * SAMPLE_RATE]
    t_audio = th.tensor(audio).to(th.float).cuda()
    pad_len = math.ceil(len(t_audio) / HOP_SIZE) * HOP_SIZE - len(
        t_audio
    )  # Make sure the total audio length is a multiple of the hop size
    t_audio = th.unsqueeze(F.pad(t_audio, (0, pad_len)), 0)

    frame_logit, onset_logit = model(t_audio)
    # Why sigmoid rather than softmax? It allows multiple simultaneous notes (polyphonic output).
    onset = th.sigmoid(onset_logit[0])
    frame = th.sigmoid(frame_logit[0])

    # Get pitch and interval values (note onset/offset times, not harmonic intervals)
    p_est, i_est = extract_notes(onset, frame)

    scaling = HOP_SIZE / SAMPLE_RATE

    i_est = (i_est * scaling).reshape(-1, 2)
    p_est = np.array([midi_to_hz(MIN_MIDI + pitch) for pitch in p_est])

    # Save onset and frame activations into a numpy zip archive
    numpy_filename = Path(save_name).parent / (Path(save_name).stem + '.npz')
    np.savez(numpy_filename, onset=onset.cpu().numpy(), frame=frame.cpu().numpy())

    # Save MIDI
    midi_filename = Path(save_name).parent / (Path(save_name).stem + '.midi')
    save_midi(midi_filename, p_est, i_est, [64] * len(p_est))

    # Save Wav using fluidsynth
    wav_filename = Path(save_name).parent / (Path(save_name).stem + '.wav')
    midi_file = pretty_midi.PrettyMIDI(str(midi_filename))
    synth_audio = midi_file.fluidsynth(fs=16000)
    soundfile.write(wav_filename, synth_audio, 16000)
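To make the padding step concrete, a standalone sketch of the same arithmetic (HOP_SIZE is assumed to be 512 here, as elsewhere on this page):

import math
import torch as th
import torch.nn.functional as F

HOP_SIZE = 512                       # assumed hop size
audio = th.randn(16000 * 2 + 100)    # 2 s of audio plus 100 stray samples

# Round the length up to the next multiple of HOP_SIZE and zero-pad the tail,
# so the model sees an integer number of hops.
pad_len = math.ceil(len(audio) / HOP_SIZE) * HOP_SIZE - len(audio)
padded = F.pad(audio, (0, pad_len))
assert padded.shape[0] % HOP_SIZE == 0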
Example #10
def seq_to_mireval_form(seq):
    i_est = []
    p_est = []
    for note in seq.notes:
        i_est.append([note.start_time, note.end_time])
        p_est.append(midi_to_hz(note.pitch))
    i_est = np.asarray(i_est)
    p_est = np.asarray(p_est)

    return p_est, i_est
Example #11
def simple_decoding_wrapper(onset_probs, frame_probs):
    th_onset_probs = torch.from_numpy(onset_probs)
    th_frame_probs = torch.from_numpy(frame_probs)
    p_ref, i_ref, v_ref = extract_notes(th_onset_probs, th_frame_probs)

    scaling = 512 / 16000

    i_ref = (i_ref * scaling).reshape(-1, 2)
    p_ref = np.array([midi_to_hz(21 + midi) for midi in p_ref])
    return p_ref, i_ref
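Both helpers above produce the (pitches in Hz, intervals in seconds) pair that mir_eval expects. A minimal sketch of passing such arrays to mir_eval.transcription.evaluate, using toy data instead of real model output:

import numpy as np
import mir_eval

# Toy reference and estimate: (onset, offset) in seconds, pitches in Hz.
ref_intervals = np.array([[0.00, 0.50], [0.50, 1.00]])
ref_pitches = mir_eval.util.midi_to_hz(np.array([60, 64]))
est_intervals = np.array([[0.01, 0.48], [0.52, 1.02]])
est_pitches = mir_eval.util.midi_to_hz(np.array([60, 64]))

scores = mir_eval.transcription.evaluate(ref_intervals, ref_pitches,
                                         est_intervals, est_pitches,
                                         onset_tolerance=0.05,
                                         pitch_tolerance=50)
print(scores['Precision'], scores['Recall'], scores['F-measure'])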
Example #12
def evaluate(data,
             model,
             onset_threshold=0.5,
             frame_threshold=0.5,
             save_path=None):
    metrics = defaultdict(list)

    for label in data:
        pred, losses = model.run_on_batch(label)

        for key, value in pred.items():
            value.squeeze_(0).relu_()

        p_est, i_est, v_est = extract_notes(pred['onset'], pred['frame'],
                                            pred['velocity'], onset_threshold,
                                            frame_threshold)

        t_est, f_est = notes_to_frames(p_est, i_est, pred['frame'].shape)

        scaling = HOP_LENGTH / SAMPLE_RATE

        i_est = (i_est * scaling).reshape(-1, 2)
        p_est = np.array([midi_to_hz(MIN_MIDI + midi) for midi in p_est])

        t_est = t_est.astype(np.float64) * scaling
        f_est = [
            np.array([midi_to_hz(MIN_MIDI + midi) for midi in freqs])
            for freqs in f_est
        ]

        if save_path is not None:
            os.makedirs(save_path, exist_ok=True)
            midi_path = os.path.join(
                save_path,
                os.path.basename(label['path']) + '.pred.mid')
            save_midi(midi_path, p_est, i_est, v_est)

    return metrics
Example #13
def extract_label(label_path, label_loader, mapping, cenf, t_unit):
    """Label extraction function of PatchCNN module.

    Extracts the label representation required by the PatchCNN module.
    The output dimensions are patch_length x 2. The second dimension indicates
    whether the patch contains an active vocal pitch.

    Small probabilities are assigned to patches whose pitch is slightly shifted
    from the ground truth, to augment the sparse labels. The probabilities are
    computed from the distance of the pitch index to the ground-truth index:
    1 / (dist + 1).

    Parameters
    ----------
    label_path: Path
        Path to the ground-truth file.
    label_loader:
        Label loader that contains ``load_label`` function for parsing the ground-truth
        file into list :class:`Label` representation.
    mapping: 2D numpy array
        The original frequency and time index of patches.
        See ``omnizart.feature.cfp.extract_patch_cfp`` for more details.
    cenf: list[float]
        Center frequencies in Hz of each frequency index.
    t_unit: float
        Time unit of each frame in seconds.

    Returns
    -------
    gt_roll: 2D numpy array
        A sequence of binary classes representing whether each patch contains a
        vocal pitch.
    """
    labels = label_loader.load_label(label_path)
    total_len = len(mapping)
    cenf = np.array(cenf)
    gt_roll = np.zeros((total_len, 2))
    for label in labels:
        start_tidx = int(round(label.start_time / t_unit))
        end_tidx = int(round(label.end_time / t_unit))
        frm_start = np.argmin(np.abs(mapping[:, 1] - start_tidx))
        frm_end = total_len - np.argmin(np.abs(mapping[::-1, 1] - end_tidx))
        cur_hz = midi_to_hz(label.note)
        pitch_idx = np.argmin(np.abs(cenf - cur_hz))
        for idx in range(frm_start, frm_end):
            dist = abs(mapping[idx, 0] - pitch_idx)
            prob = 1 / (1 + dist)
            gt_roll[idx, 1] = prob

    gt_roll[:, 0] = 1 - gt_roll[:, 1]
    return gt_roll
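The 1 / (dist + 1) soft-label rule in the docstring falls off quickly with distance from the ground-truth pitch index; a tiny numpy illustration:

import numpy as np

# Distance of a patch's pitch index from the ground-truth index -> assigned probability.
dists = np.arange(5)
probs = 1.0 / (dists + 1)
print(dict(zip(dists.tolist(), probs.round(3).tolist())))
# {0: 1.0, 1: 0.5, 2: 0.333, 3: 0.25, 4: 0.2}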
Example #14
def transcribe_file(checkpoint_dir, flac_paths, save_path, sequence_length,
                    onset_threshold, frame_threshold):

    # Create default model and optimizer even though they'll be replaced with the checkpoint.
    model = OnsetsAndFrames(MAX_MIDI - MIN_MIDI + 1)
    optimizer = keras.optimizers.Adam(.0001)

    ckpt = tf.train.Checkpoint(step=tf.Variable(1),
                               optimizer=optimizer,
                               net=model)
    manager = tf.train.CheckpointManager(ckpt, checkpoint_dir, max_to_keep=3)

    ckpt.restore(manager.latest_checkpoint).expect_partial()
    if manager.latest_checkpoint:
        tf.print("Restored from {}".format(manager.latest_checkpoint))

    globbed_paths = glob.glob(flac_paths)

    # do a transcription just to be able to call model.summary()
    # audio = load_and_process_audio(globbed_paths[0], sequence_length)
    # audio = tf.expand_dims(audio, 0)
    # predictions = transcribe(model, audio)
    # model.summary()

    for flac_path in globbed_paths:
        print(f'Processing FLAC: {flac_path}', file=sys.stderr)
        audio = load_and_process_audio(flac_path, sequence_length)

        audio = tf.expand_dims(audio, 0)

        predictions = transcribe(model, audio)

        p_est, i_est, v_est = extract_notes(predictions['onset'],
                                            predictions['frame'],
                                            predictions['velocity'],
                                            onset_threshold, frame_threshold)

        scaling = HOP_LENGTH / SAMPLE_RATE

        i_est = (i_est * scaling).reshape((-1, 2))
        p_est = np.array([midi_to_hz(MIN_MIDI + midi) for midi in p_est])

        os.makedirs(save_path, exist_ok=True)

        midi_path = os.path.join(save_path,
                                 os.path.basename(flac_path) + '.pred.mid')
        save_midi(midi_path, p_est, i_est, v_est)
        pred_path = os.path.join(save_path,
                                 os.path.basename(flac_path) + '.pred.png')
        save_pianoroll(pred_path, predictions['onset'], predictions['frame'])
Example #15
def note_transcribe_per_min(input_file_name, output_file_name, model_path,
                            ensemble, hop_size, sr, onset_threshold,
                            frame_threshold, export_midi, device):
    """
    Transcribe piano notes with deep learning model and write in lines with
    (onset  offset  F0) form or MIDI file.
    """
    audio, _ = librosa.load(input_file_name, sr=sr)

    # Protection code for audio > 1
    if np.max(np.abs(audio)) > 1:
        audio = audio / np.max(np.abs(audio))

    audio_tensor = torch.from_numpy(audio).to(device).unsqueeze(0)
    melspec = MelSpectrogram(N_MELS,
                             SAMPLE_RATE,
                             WINDOW_LENGTH,
                             HOP_LENGTH,
                             mel_fmin=MEL_FMIN,
                             mel_fmax=MEL_FMAX).to(device)
    mel = (melspec(audio_tensor.reshape(
        -1, audio_tensor.shape[-1])[:, :-1]).transpose(-1, -2))

    model = load_transcriber(model_path).to(device).eval()
    pred = model(mel)

    print('onset_pred:{}, frame_pred:{}'.format(onset_pred, frame_pred))
    p_est, i_est, v_est = extract_notes(onset_pred, frame_pred, vel_pred,
                                        onset_threshold, frame_threshold)

    if export_midi:
        scaling = HOP_LENGTH / SAMPLE_RATE
        i_est = (i_est * scaling).reshape(-1, 2)
        p_est = np.array([midi_to_hz(MIN_MIDI + midi) for midi in p_est])
        # Unlike the per-segment variant below, this version writes a single MIDI file.
        print('output_file_name:{}, scaling:{}, len(i_est):{}, len(p_est):{}'
              .format(output_file_name, scaling, len(i_est), len(p_est)))
        save_midi(output_file_name, p_est, i_est, v_est)
    else:
        _write_tsv(p_est, i_est, output_file_name, hop_size, sr)
Example #16
def evaluate(data, model, onset_threshold=0.5, frame_threshold=0.5, save_path=None):
    metrics = defaultdict(list)

    for label in data:
        pred, losses = model.run_on_batch(label)

        for key, loss in losses.items():
            metrics[key].append(loss.item())

        for key, value in pred.items():
            value.squeeze_(0).relu_()

        p_ref, i_ref, v_ref = extract_notes(label['onset'], label['frame'], label['velocity'])
        p_est, i_est, v_est = extract_notes(pred['onset'], pred['frame'], pred['velocity'], onset_threshold, frame_threshold)

        t_ref, f_ref = notes_to_frames(p_ref, i_ref, label['frame'].shape)
        t_est, f_est = notes_to_frames(p_est, i_est, pred['frame'].shape)

        scaling = HOP_LENGTH / SAMPLE_RATE

        i_ref = (i_ref * scaling).reshape(-1, 2)
        p_ref = np.array([midi_to_hz(MIN_MIDI + midi) for midi in p_ref])
        i_est = (i_est * scaling).reshape(-1, 2)
        p_est = np.array([midi_to_hz(MIN_MIDI + midi) for midi in p_est])

        t_ref = t_ref.astype(np.float64) * scaling
        f_ref = [np.array([midi_to_hz(MIN_MIDI + midi) for midi in freqs]) for freqs in f_ref]
        t_est = t_est.astype(np.float64) * scaling
        f_est = [np.array([midi_to_hz(MIN_MIDI + midi) for midi in freqs]) for freqs in f_est]

        p, r, f, o = evaluate_notes(i_ref, p_ref, i_est, p_est, offset_ratio=None)
        metrics['metric/note/precision'].append(p)
        metrics['metric/note/recall'].append(r)
        metrics['metric/note/f1'].append(f)
        metrics['metric/note/overlap'].append(o)

        p, r, f, o = evaluate_notes(i_ref, p_ref, i_est, p_est)
        metrics['metric/note-with-offsets/precision'].append(p)
        metrics['metric/note-with-offsets/recall'].append(r)
        metrics['metric/note-with-offsets/f1'].append(f)
        metrics['metric/note-with-offsets/overlap'].append(o)

        p, r, f, o = evaluate_notes_with_velocity(i_ref, p_ref, v_ref, i_est, p_est, v_est,
                                                  offset_ratio=None, velocity_tolerance=0.1)
        metrics['metric/note-with-velocity/precision'].append(p)
        metrics['metric/note-with-velocity/recall'].append(r)
        metrics['metric/note-with-velocity/f1'].append(f)
        metrics['metric/note-with-velocity/overlap'].append(o)

        p, r, f, o = evaluate_notes_with_velocity(i_ref, p_ref, v_ref, i_est, p_est, v_est, velocity_tolerance=0.1)
        metrics['metric/note-with-offsets-and-velocity/precision'].append(p)
        metrics['metric/note-with-offsets-and-velocity/recall'].append(r)
        metrics['metric/note-with-offsets-and-velocity/f1'].append(f)
        metrics['metric/note-with-offsets-and-velocity/overlap'].append(o)

        frame_metrics = evaluate_frames(t_ref, f_ref, t_est, f_est)
        metrics['metric/frame/f1'].append(hmean([frame_metrics['Precision'] + eps, frame_metrics['Recall'] + eps]) - eps)

        for key, loss in frame_metrics.items():
            metrics['metric/frame/' + key.lower().replace(' ', '_')].append(loss)

        if save_path is not None:
            os.makedirs(save_path, exist_ok=True)
            label_path = os.path.join(save_path, os.path.basename(label['path']) + '.label.png')
            save_pianoroll(label_path, label['onset'], label['frame'])
            pred_path = os.path.join(save_path, os.path.basename(label['path']) + '.pred.png')
            save_pianoroll(pred_path, pred['onset'], pred['frame'])
            midi_path = os.path.join(save_path, os.path.basename(label['path']) + '.pred.mid')
            save_midi(midi_path, p_est, i_est, v_est)

    return metrics
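The frame-level F1 above is the eps-stabilized harmonic mean of precision and recall; a standalone sketch of just that computation (eps is never defined in these snippets, so a tiny constant is assumed):

import numpy as np
from scipy.stats import hmean

eps = np.finfo(float).eps   # assumed value; any tiny positive constant works

def frame_f1(precision, recall):
    # hmean needs strictly positive inputs, hence the eps shift and unshift.
    return hmean([precision + eps, recall + eps]) - eps

print(frame_f1(0.8, 0.6))   # ~0.686
print(frame_f1(0.0, 0.9))   # ~0.0; the eps keeps hmean's inputs strictly positive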
Example #17
def note_transcribe_per_min(input_file_name, output_file_name, model_path, ensemble,
                    hop_size, sr, onset_threshold, frame_threshold,
                    export_midi, device):
    """
    Transcribe piano notes with deep learning model and write in lines with
    (onset  offset  F0) form or MIDI file.
    """
    audio, _ = librosa.load(input_file_name, sr=sr)

    # Protection code for audio > 1
    if np.max(np.abs(audio)) > 1:
        audio = audio / np.max(np.abs(audio))
    
    dur_music_sec = math.floor(len(audio)/sr)
    
    for i in range(round(dur_music_sec)):
        
        print('i:{}, dur_music_sec:{}'.format(i, dur_music_sec))
        print('audio.size():{}, type(audio):{}, sr:{}'.format( audio.shape, type(audio), sr))
        onesec_audio = audio[i*sr:(i+19)*sr]
        print('type(onesec_audio):{}, len(onesec_audio):{}'.format(type(onesec_audio), onesec_audio.shape))
        audio_tensor = torch.from_numpy(onesec_audio).to(device).unsqueeze(0)
        melspec = MelSpectrogram(N_MELS, SAMPLE_RATE, WINDOW_LENGTH,
                                HOP_LENGTH, mel_fmin=MEL_FMIN,
                                mel_fmax=MEL_FMAX).to(device)
        mel = (melspec(audio_tensor
                    .reshape(-1, audio_tensor.shape[-1])[:, :-1])
            .transpose(-1, -2))

        if ensemble is not None:
            print('mel.shape:{}'.format(mel.shape))
            model_paths = list(Path(model_path).glob('*.trm'))
            onset_pred = torch.zeros(mel.shape[0], mel.shape[1],
                                    MAX_MIDI - MIN_MIDI + 1).to(device)
            frame_pred = torch.zeros(mel.shape[0], mel.shape[1],
                                    MAX_MIDI - MIN_MIDI + 1).to(device)
            vel_pred = torch.zeros(mel.shape[0], mel.shape[1],
                                MAX_MIDI - MIN_MIDI + 1).to(device)

            for model_path in model_paths:
                model = load_transcriber(model_path).to(device).eval()
                onset_pred_part, _, _, frame_pred_part, vel_pred_part = model(mel)

                if ensemble == 'mean':
                    onset_pred += onset_pred_part / len(model_paths)
                    frame_pred += frame_pred_part / len(model_paths)
                    vel_pred += vel_pred_part / len(model_paths)
                elif ensemble == 'vote':
                    # extract_notes does not use offset. -> mean
                    onset_pred += ((onset_pred_part > onset_threshold)
                                .type(torch.float))
                    frame_pred += ((frame_pred_part > frame_threshold)
                                .type(torch.float))
                    vel_pred += vel_pred_part

                del model

        else:
            model = load_transcriber(model_path).to(device).eval()

            onset_pred, offset_pred, _, frame_pred, vel_pred = model(mel)

        onset_pred = onset_pred.squeeze()
        frame_pred = frame_pred.squeeze()
        vel_pred = vel_pred.squeeze()

        print('onset_pred:{}, frame_pred:{}'.format(onset_pred, frame_pred))
        p_est, i_est, v_est = extract_notes(onset_pred, frame_pred,
                                            vel_pred, onset_threshold,
                                            frame_threshold)

        if export_midi:
            scaling = HOP_LENGTH / SAMPLE_RATE
            i_est = (i_est * scaling).reshape(-1, 2)
            p_est = np.array([midi_to_hz(MIN_MIDI + midi) for midi in p_est])
            filename, file_extension = os.path.splitext(output_file_name)
            onesec_output_file_name = filename + '_' + str(i) + 'sec' + file_extension
            print('onesec_output_file_name:{}, scaling:{}\n len(i_est):{}, i_est:{}, len(p_est):{}, p_est:{}'.format(onesec_output_file_name, scaling, len(i_est), i_est, len(p_est), p_est))
            save_midi(onesec_output_file_name, p_est, i_est, v_est)
        else:
            _write_tsv(p_est, i_est, output_file_name, hop_size, sr)
Example #18
def evaluate(metrics,
             model,
             inputs,
             targets,
             onset_threshold=0.5,
             frame_threshold=0.5,
             save_path=None):

    # NB: this can't be decorated with tf.function because extract_notes and the related helpers are not pure TF code.

    mel = audio_to_mel(inputs)

    onset_pred, offset_pred, frame_pred, velocity_pred = model(mel,
                                                               training=False)
    onset_labels, offset_labels, frame_labels, velocity_labels, path_labels = targets

    # for key, loss in losses.items():
    #     metrics[key].append(loss.item()) # todo: add loss metrics

    # We're working with batch size of 1, so remove the first index for everything.
    onset_pred = tf.squeeze(onset_pred)
    offset_pred = tf.squeeze(offset_pred)
    frame_pred = tf.squeeze(frame_pred)
    velocity_pred = tf.squeeze(velocity_pred)

    onset_labels = tf.squeeze(onset_labels)
    offset_labels = tf.squeeze(offset_labels)
    frame_labels = tf.squeeze(frame_labels)
    velocity_labels = tf.squeeze(velocity_labels)
    path_labels = tf.squeeze(path_labels).numpy().decode("utf-8")

    p_ref, i_ref, v_ref = extract_notes(onset_labels, frame_labels,
                                        velocity_labels)
    p_est, i_est, v_est = extract_notes(onset_pred, frame_pred, velocity_pred,
                                        onset_threshold, frame_threshold)

    t_ref, f_ref = notes_to_frames(p_ref, i_ref, frame_labels.shape)
    t_est, f_est = notes_to_frames(p_est, i_est, frame_pred.shape)

    scaling = HOP_LENGTH / SAMPLE_RATE

    i_ref = (i_ref * scaling).reshape(-1, 2)
    p_ref = np.array([midi_to_hz(MIN_MIDI + midi) for midi in p_ref])
    i_est = (i_est * scaling).reshape(-1, 2)
    p_est = np.array([midi_to_hz(MIN_MIDI + midi) for midi in p_est])

    t_ref = t_ref.astype(np.float64) * scaling
    f_ref = [
        np.array([midi_to_hz(MIN_MIDI + midi) for midi in freqs])
        for freqs in f_ref
    ]
    t_est = t_est.astype(np.float64) * scaling
    f_est = [
        np.array([midi_to_hz(MIN_MIDI + midi) for midi in freqs])
        for freqs in f_est
    ]

    p, r, f, o = evaluate_notes(i_ref, p_ref, i_est, p_est, offset_ratio=None)
    metrics['metric/note/precision'].append(p)
    metrics['metric/note/recall'].append(r)
    metrics['metric/note/f1'].append(f)
    metrics['metric/note/overlap'].append(o)

    p, r, f, o = evaluate_notes(i_ref, p_ref, i_est, p_est)
    metrics['metric/note-with-offsets/precision'].append(p)
    metrics['metric/note-with-offsets/recall'].append(r)
    metrics['metric/note-with-offsets/f1'].append(f)
    metrics['metric/note-with-offsets/overlap'].append(o)

    p, r, f, o = evaluate_notes_with_velocity(i_ref,
                                              p_ref,
                                              v_ref,
                                              i_est,
                                              p_est,
                                              v_est,
                                              offset_ratio=None,
                                              velocity_tolerance=0.1)
    metrics['metric/note-with-velocity/precision'].append(p)
    metrics['metric/note-with-velocity/recall'].append(r)
    metrics['metric/note-with-velocity/f1'].append(f)
    metrics['metric/note-with-velocity/overlap'].append(o)

    p, r, f, o = evaluate_notes_with_velocity(i_ref,
                                              p_ref,
                                              v_ref,
                                              i_est,
                                              p_est,
                                              v_est,
                                              velocity_tolerance=0.1)
    metrics['metric/note-with-offsets-and-velocity/precision'].append(p)
    metrics['metric/note-with-offsets-and-velocity/recall'].append(r)
    metrics['metric/note-with-offsets-and-velocity/f1'].append(f)
    metrics['metric/note-with-offsets-and-velocity/overlap'].append(o)

    frame_metrics = evaluate_frames(t_ref, f_ref, t_est, f_est)
    metrics['metric/frame/f1'].append(
        hmean(
            [frame_metrics['Precision'] + eps, frame_metrics['Recall'] +
             eps]) - eps)

    for key, loss in frame_metrics.items():
        metrics['metric/frame/' + key.lower().replace(' ', '_')].append(loss)

    if save_path is not None:
        os.makedirs(save_path, exist_ok=True)
        label_path = os.path.join(save_path,
                                  os.path.basename(path_labels) + '.label.png')
        save_pianoroll(label_path, onset_labels, frame_labels)
        pred_path = os.path.join(save_path,
                                 os.path.basename(path_labels) + '.pred.png')
        save_pianoroll(pred_path, onset_pred, frame_pred)
        midi_path = os.path.join(save_path,
                                 os.path.basename(path_labels) + '.pred.mid')
        save_midi(midi_path, p_est, i_est, v_est)

    return metrics
Example #19
def evaluate(model, batch, device, save=False, save_path=None):
    metrics = defaultdict(list)
    batch = allocate_batch(batch, device)

    frame_logit, onset_logit = model(batch['audio'])

    criterion = nn.BCEWithLogitsLoss()
    frame_loss = criterion(frame_logit, batch['frame'])
    onset_loss = criterion(onset_logit, batch['onset'])
    metrics['metric/loss/frame_loss'].append(frame_loss.cpu().numpy())
    metrics['metric/loss/onset_loss'].append(onset_loss.cpu().numpy())

    for n in range(batch['audio'].shape[0]):
        frame_pred = th.sigmoid(frame_logit[n])
        onset_pred = th.sigmoid(onset_logit[n])

        pr, re, f1 = framewise_eval(frame_pred, batch['frame'][n])
        metrics['metric/frame/frame_precision'].append(pr)
        metrics['metric/frame/frame_recall'].append(re)
        metrics['metric/frame/frame_f1'].append(f1)

        pr, re, f1 = framewise_eval(onset_pred, batch['onset'][n])
        metrics['metric/frame/onset_precision'].append(pr)
        metrics['metric/frame/onset_recall'].append(re)
        metrics['metric/frame/onset_f1'].append(f1)

        p_est, i_est = extract_notes(onset_pred, frame_pred)
        p_ref, i_ref = extract_notes(batch['onset'][n], batch['frame'][n])

        scaling = HOP_SIZE / SAMPLE_RATE

        i_ref = (i_ref * scaling).reshape(-1, 2)
        p_ref = np.array([midi_to_hz(MIN_MIDI + pitch) for pitch in p_ref])
        i_est = (i_est * scaling).reshape(-1, 2)
        p_est = np.array([midi_to_hz(MIN_MIDI + pitch) for pitch in p_est])

        p, r, f, o = evaluate_notes(i_ref,
                                    p_ref,
                                    i_est,
                                    p_est,
                                    offset_ratio=None)
        metrics['metric/note/precision'].append(p)
        metrics['metric/note/recall'].append(r)
        metrics['metric/note/f1'].append(f)
        metrics['metric/note/overlap'].append(o)

        p, r, f, o = evaluate_notes(i_ref, p_ref, i_est, p_est)
        metrics['metric/note-with-offsets/precision'].append(p)
        metrics['metric/note-with-offsets/recall'].append(r)
        metrics['metric/note-with-offsets/f1'].append(f)
        metrics['metric/note-with-offsets/overlap'].append(o)

        if save:
            if len(p_est) == 0:
                print(
                    f'no onset detected. skip: {Path(batch["path"][n]).stem}')
                continue
            midi_filename = Path(save_path) / (Path(batch['path'][n]).stem +
                                               '.midi')
            save_midi(midi_filename, p_est, i_est, [64] * len(p_est))

            wav_filename = Path(save_path) / (Path(batch['path'][n]).stem +
                                              '.wav')
            midi_file = pretty_midi.PrettyMIDI(str(midi_filename))
            synth_audio = midi_file.fluidsynth(fs=16000)
            soundfile.write(wav_filename, synth_audio, 16000)

    return metrics
Example #20
def evaluate_onf(batch,
                 model,
                 device,
                 save_path=None,
                 criterion=None,
                 sampling_method='argmax',
                 rep_type='base',
                 plot_example=False,
                 recursive=True,
                 detail_eval=False,
                 delay=1):
    metrics = defaultdict(list)
    with th.no_grad():
        preds, losses = models.run_on_batch_onf(model, batch, device[0])
    losses = losses.cpu().numpy()
    metrics['loss'].extend([losses])

    for n in range(preds['frame'].shape[0]):
        label = dict()
        for key in batch:
            label[key] = batch[key][n]

        onset_ref, offset_ref, frame_ref = representation.base2onsets_and_frames(
            label['shifted_label'][delay:])
        onsets = preds['onset'][n] > 0.5
        offsets = preds['offset'][n] > 0.5
        frames = preds['frame'][n] > 0.5

        p_ref, i_ref, v_ref = extract_notes(onset_ref, frame_ref)
        p_est, i_est, v_est = extract_notes(onsets, frames)

        t_ref, f_ref = notes_to_frames(p_ref, i_ref, frame_ref.shape)
        t_est, f_est = notes_to_frames(p_est, i_est, frames.shape)

        scaling = HOP_LENGTH / SAMPLE_RATE

        i_ref = (i_ref * scaling).reshape(-1, 2)
        p_ref = np.array([midi_to_hz(MIN_MIDI + midi) for midi in p_ref])
        i_est = (i_est * scaling).reshape(-1, 2)
        p_est = np.array([midi_to_hz(MIN_MIDI + midi) for midi in p_est])

        t_ref = t_ref.astype(np.float64) * scaling
        f_ref = [
            np.array([midi_to_hz(MIN_MIDI + midi) for midi in freqs])
            for freqs in f_ref
        ]
        t_est = t_est.astype(np.float64) * scaling
        f_est = [
            np.array([midi_to_hz(MIN_MIDI + midi) for midi in freqs])
            for freqs in f_est
        ]

        p, r, f, o = evaluate_notes(i_ref,
                                    p_ref,
                                    i_est,
                                    p_est,
                                    offset_ratio=None)
        metrics['metric/note/precision'].append(p)
        metrics['metric/note/recall'].append(r)
        metrics['metric/note/f1'].append(f)
        metrics['metric/note/overlap'].append(o)

        p, r, f, o = evaluate_notes(i_ref, p_ref, i_est, p_est)
        metrics['metric/note-with-offsets/precision'].append(p)
        metrics['metric/note-with-offsets/recall'].append(r)
        metrics['metric/note-with-offsets/f1'].append(f)
        metrics['metric/note-with-offsets/overlap'].append(o)

        frame_metrics = evaluate_frames(t_ref, f_ref, t_est, f_est)
        metrics['metric/frame/f1'].append(
            hmean([
                frame_metrics['Precision'] + eps, frame_metrics['Recall'] + eps
            ]) - eps)

        for key, value in frame_metrics.items():
            metrics['metric/frame/' +
                    key.lower().replace(' ', '_')].append(value)

    return metrics, None
Example #21
def evaluate(batch,
             model,
             device,
             save_path=None,
             criterion=None,
             sampling_method='argmax',
             rep_type='base',
             plot_example=False,
             recursive=True,
             detail_eval=False,
             delay=1):
    # TODO: input: prediction & label. output: metric
    metrics = defaultdict(list)
    acc_conf = []
    if sampling_method == 'argmax':
        gt_ratio = 0.0
    elif sampling_method == 'gt':
        gt_ratio = 1.0
    else:
        gt_ratio = 0.0
    with th.no_grad():
        preds, losses = models.run_on_batch(model,
                                            batch,
                                            device[0],
                                            sampling_method=sampling_method,
                                            gt_ratio=gt_ratio,
                                            criterion=criterion,
                                            rep_type=rep_type,
                                            recursive=recursive,
                                            delay=delay)
    losses = losses.cpu().numpy()
    metrics['loss'].extend(list(np.atleast_1d(losses)))

    for n in range(preds.shape[0]):
        label = dict()
        pred = preds[n]
        argmax_pred = pred.argmax(dim=0)
        for key in batch:
            label[key] = batch[key][n]

        if detail_eval:
            acc_conf.append(
                calculate_acc_conf(
                    pred.cpu().numpy().transpose((1, 2, 0)),
                    label['shifted_label'][delay:].cpu().numpy()))
        else:
            acc_conf.append(None)

        onset_ref, offset_ref, frame_ref = representation.base2onsets_and_frames(
            label['shifted_label'][delay:])
        onsets, offsets, frames = representation.convert2onsets_and_frames(
            argmax_pred, rep_type)

        p_ref, i_ref, v_ref = extract_notes(onset_ref, frame_ref)
        p_est, i_est, v_est = extract_notes(onsets, frames)

        t_ref, f_ref = notes_to_frames(p_ref, i_ref, frame_ref.shape)
        t_est, f_est = notes_to_frames(p_est, i_est, frames.shape)

        scaling = HOP_LENGTH / SAMPLE_RATE

        i_ref = (i_ref * scaling).reshape(-1, 2)
        p_ref = np.array([midi_to_hz(MIN_MIDI + midi) for midi in p_ref])
        i_est = (i_est * scaling).reshape(-1, 2)
        p_est = np.array([midi_to_hz(MIN_MIDI + midi) for midi in p_est])

        t_ref = t_ref.astype(np.float64) * scaling
        f_ref = [
            np.array([midi_to_hz(MIN_MIDI + midi) for midi in freqs])
            for freqs in f_ref
        ]
        t_est = t_est.astype(np.float64) * scaling
        f_est = [
            np.array([midi_to_hz(MIN_MIDI + midi) for midi in freqs])
            for freqs in f_est
        ]

        p, r, f, o = evaluate_notes(i_ref,
                                    p_ref,
                                    i_est,
                                    p_est,
                                    offset_ratio=None)
        metrics['metric/note/precision'].append(p)
        metrics['metric/note/recall'].append(r)
        metrics['metric/note/f1'].append(f)
        metrics['metric/note/overlap'].append(o)

        p, r, f, o = evaluate_notes(i_ref, p_ref, i_est, p_est)
        metrics['metric/note-with-offsets/precision'].append(p)
        metrics['metric/note-with-offsets/recall'].append(r)
        metrics['metric/note-with-offsets/f1'].append(f)
        metrics['metric/note-with-offsets/overlap'].append(o)

        frame_metrics = evaluate_frames(t_ref, f_ref, t_est, f_est)
        metrics['metric/frame/f1'].append(
            hmean([
                frame_metrics['Precision'] + eps, frame_metrics['Recall'] + eps
            ]) - eps)

        for key, value in frame_metrics.items():
            metrics['metric/frame/' +
                    key.lower().replace(' ', '_')].append(value)

        if plot_example:
            pred = pred.cpu().numpy().transpose(1, 2, 0)
            label = label['shifted_label'][delay:].cpu().numpy()
            os.makedirs(save_path, exist_ok=True)
            basename = Path(save_path) / Path(batch['path'][n]).stem

            np.save(str(basename) + f'_label.npy', label)
            np.save(str(basename) + f'_pred_{sampling_method}.npy', pred)

            draw_predictions_with_label(
                str(basename) + f'_pred.png', pred, label)
            # midi_path = str(basename) + f'_pred_{global_step}.mid'
            # save_midi(midi_path, p_est, i_est, v_est)

    return metrics, acc_conf
Example #22
def note_transcribe(input_file_name, output_file_name, model_path, ensemble,
                    hop_size, sr, onset_threshold, frame_threshold,
                    export_midi, device):
    """
    Transcribe piano notes with deep learning model and write in lines with
    (onset  offset  F0) form or MIDI file.
    """
    audio, _ = librosa.load(input_file_name, sr=sr)

    # Protection code for audio > 1
    if np.max(np.abs(audio)) > 1:
        audio = audio / np.max(np.abs(audio))

    audio_tensor = torch.from_numpy(audio).to(device).unsqueeze(0)
    melspec = MelSpectrogram(N_MELS,
                             SAMPLE_RATE,
                             WINDOW_LENGTH,
                             HOP_LENGTH,
                             mel_fmin=MEL_FMIN,
                             mel_fmax=MEL_FMAX).to(device)
    mel = (melspec(audio_tensor.reshape(
        -1, audio_tensor.shape[-1])[:, :-1]).transpose(-1, -2))

    if ensemble is not None:
        model_paths = list(Path(model_path).glob('*.trm'))
        onset_pred = torch.zeros(mel.shape[0], mel.shape[1],
                                 MAX_MIDI - MIN_MIDI + 1).to(device)
        frame_pred = torch.zeros(mel.shape[0], mel.shape[1],
                                 MAX_MIDI - MIN_MIDI + 1).to(device)
        vel_pred = torch.zeros(mel.shape[0], mel.shape[1],
                               MAX_MIDI - MIN_MIDI + 1).to(device)

        for model_path in model_paths:
            model = load_transcriber(model_path).to(device).eval()
            onset_pred_part, _, _, frame_pred_part, vel_pred_part = model(mel)

            if ensemble == 'mean':
                onset_pred += onset_pred_part / len(model_paths)
                frame_pred += frame_pred_part / len(model_paths)
                vel_pred += vel_pred_part / len(model_paths)
            elif ensemble == 'vote':
                # extract_notes does not use offset. -> mean
                onset_pred += ((onset_pred_part > onset_threshold).type(
                    torch.float))
                frame_pred += ((frame_pred_part > frame_threshold).type(
                    torch.float))
                vel_pred += vel_pred_part

            del model

    else:
        model = load_transcriber(model_path).to(device).eval()

        onset_pred, offset_pred, _, frame_pred, vel_pred = model(mel)

    onset_pred = onset_pred.squeeze()
    frame_pred = frame_pred.squeeze()
    vel_pred = vel_pred.squeeze()

    p_est, i_est, v_est = extract_notes(onset_pred, frame_pred, vel_pred,
                                        onset_threshold, frame_threshold)

    if export_midi:
        scaling = HOP_LENGTH / SAMPLE_RATE

        i_est = (i_est * scaling).reshape(-1, 2)
        p_est = np.array([midi_to_hz(MIN_MIDI + midi) for midi in p_est])

        save_midi(output_file_name, p_est, i_est, v_est)
    else:
        _write_tsv(p_est, i_est, output_file_name, hop_size, sr)
Example #23
def evaluate(data, model, logging_info, onset_threshold=0.5, frame_threshold=0.5, save_path=None):
    metrics = defaultdict(list)
    
    
    song_names = list()
    for label in data:
        song_names.append((label['path']).split("/")[-1:])
        pred, losses = model.run_on_batch(label)

        for key, loss in losses.items():
            metrics[key].append(loss.item())

        for key, value in pred.items():
            value.squeeze_(0).relu_()

        p_ref, i_ref, v_ref = extract_notes(label['onset'], label['frame'], label['velocity'])
        p_est, i_est, v_est = extract_notes(pred['onset'], pred['frame'], pred['velocity'], onset_threshold, frame_threshold)

        t_ref, f_ref = notes_to_frames(p_ref, i_ref, label['frame'].shape)
        t_est, f_est = notes_to_frames(p_est, i_est, pred['frame'].shape)

        scaling = HOP_LENGTH / SAMPLE_RATE

        i_ref = (i_ref * scaling).reshape(-1, 2)
        p_ref = np.array([midi_to_hz(MIN_MIDI + midi) for midi in p_ref])
        i_est = (i_est * scaling).reshape(-1, 2)
        p_est = np.array([midi_to_hz(MIN_MIDI + midi) for midi in p_est])

        t_ref = t_ref.astype(np.float64) * scaling
        f_ref = [np.array([midi_to_hz(MIN_MIDI + midi) for midi in freqs]) for freqs in f_ref]
        t_est = t_est.astype(np.float64) * scaling
        f_est = [np.array([midi_to_hz(MIN_MIDI + midi) for midi in freqs]) for freqs in f_est]

        p, r, f, o = evaluate_notes(i_ref, p_ref, i_est, p_est, offset_ratio=None)
        metrics['metric/note/precision'].append(p)
        metrics['metric/note/recall'].append(r)
        metrics['metric/note/f1'].append(f)
        metrics['metric/note/overlap'].append(o)

        p, r, f, o = evaluate_notes(i_ref, p_ref, i_est, p_est)
        metrics['metric/note-with-offsets/precision'].append(p)
        metrics['metric/note-with-offsets/recall'].append(r)
        metrics['metric/note-with-offsets/f1'].append(f)
        metrics['metric/note-with-offsets/overlap'].append(o)

        p, r, f, o = evaluate_notes_with_velocity(i_ref, p_ref, v_ref, i_est, p_est, v_est,
                                                  offset_ratio=None, velocity_tolerance=0.1)
        metrics['metric/note-with-velocity/precision'].append(p)
        metrics['metric/note-with-velocity/recall'].append(r)
        metrics['metric/note-with-velocity/f1'].append(f)
        metrics['metric/note-with-velocity/overlap'].append(o)

        p, r, f, o = evaluate_notes_with_velocity(i_ref, p_ref, v_ref, i_est, p_est, v_est, velocity_tolerance=0.1)
        metrics['metric/note-with-offsets-and-velocity/precision'].append(p)
        metrics['metric/note-with-offsets-and-velocity/recall'].append(r)
        metrics['metric/note-with-offsets-and-velocity/f1'].append(f)
        metrics['metric/note-with-offsets-and-velocity/overlap'].append(o)

        frame_metrics = evaluate_frames(t_ref, f_ref, t_est, f_est)
        metrics['metric/frame/f1'].append(hmean([frame_metrics['Precision'] + eps, frame_metrics['Recall'] + eps]) - eps)

        for key, loss in frame_metrics.items():
            metrics['metric/frame/' + key.lower().replace(' ', '_')].append(loss)

        if save_path is not None:
            os.makedirs(save_path, exist_ok=True)
            label_path = os.path.join(save_path, os.path.basename(label['path']) + '.label.png')
            save_pianoroll(label_path, label['onset'], label['frame'])
            pred_path = os.path.join(save_path, os.path.basename(label['path']) + '.pred.png')
            save_pianoroll(pred_path, pred['onset'], pred['frame'])
            midi_path = os.path.join(save_path, os.path.basename(label['path']) + '.pred.mid')
            save_midi(midi_path, p_est, i_est, v_est)

    # Creating a table of results for each song, and sorting by note_with_offset F1 score
    
    rename_dict = dict()
    for key, _ in metrics.items():
        if key.startswith('loss/'):
            _, category = key.split('/')
            rename_dict[key] = category + " loss"
        if key.startswith('metric/'):
            _, category, name = key.split('/')
            rename_dict[key] = category + " " + name
    
    
    model_file, dataset_name = logging_info
    log_str = (model_file + "_" + dataset_name).replace("/", "-")
    evaluation_by_song_df = pd.DataFrame.from_dict(metrics, orient='index').transpose()
    evaluation_by_song_df.insert(0, "song_name", song_names)
    evaluation_by_song_df.rename(columns=rename_dict, inplace=True)
    evaluation_by_song_df.sort_values("note-with-offsets f1", ascending=True, inplace=True)
    evaluation_by_song_df.to_csv("./evaluations/new_evals" + log_str + "_by_song.csv", index=False)

    model_df = evaluation_by_song_df.mean(numeric_only=True)
    model_df.to_csv("./evaluations/new_evals/new_evals" + log_str + "_model.csv")
    return metrics