/
align_real_world.py
85 lines (80 loc) · 3.37 KB
/
align_real_world.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
""" Align all of the real-world data. """
import djitw
import numpy as np
import pretty_midi
import librosa
import create_data
import json
import scipy.spatial
import os
import joblib
GULLY = .96
REAL_WORLD_PATH = 'data/real_world'
def process_one_file(audio_file, midi_file, output_midi_file, pair_file,
diagnostics_file):
"""
Wrapper routine for loading in audio/MIDI data, aligning, and writing
out the result.
Parameters
----------
audio_file, midi_file, output_midi_file, pair_file, diagnostics_file : str
Paths to the audio file to align, MIDI file to align, and paths where
to write the aligned MIDI, the synthesized pair file, and the DTW
diagnostics file.
"""
# Load in the audio data
audio_data, _ = librosa.load(audio_file, sr=create_data.FS)
# Compute the log-magnitude CQT of the data
audio_cqt, audio_times = create_data.extract_cqt(audio_data)
audio_cqt = librosa.logamplitude(audio_cqt, ref_power=audio_cqt.max()).T
# Load and synthesize MIDI data
midi_object = pretty_midi.PrettyMIDI(midi_file)
midi_audio = midi_object.fluidsynth(fs=create_data.FS)
# Compute log-magnitude CQT
midi_cqt, midi_times = create_data.extract_cqt(midi_audio)
midi_cqt = librosa.logamplitude(midi_cqt, ref_power=midi_cqt.max()).T
# Compute cosine distance matrix
distance_matrix = scipy.spatial.distance.cdist(
midi_cqt, audio_cqt, 'cosine')
# Get lowest cost path
p, q, score = djitw.dtw(
distance_matrix, GULLY, np.median(distance_matrix), inplace=False)
# Normalize by path length
score = score/len(p)
# Normalize by distance matrix submatrix within path
score = score/distance_matrix[p.min():p.max(), q.min():q.max()].mean()
# Adjust the MIDI file
midi_object.adjust_times(midi_times[p], audio_times[q])
# Write the result
midi_object.write(output_midi_file)
# Synthesize aligned MIDI
midi_audio_aligned = midi_object.fluidsynth(fs=create_data.FS)
# Adjust to the same size as audio
if midi_audio_aligned.shape[0] > audio_data.shape[0]:
midi_audio_aligned = midi_audio_aligned[:audio_data.shape[0]]
else:
trim_amount = audio_data.shape[0] - midi_audio_aligned.shape[0]
midi_audio_aligned = np.append(midi_audio_aligned,
np.zeros(trim_amount))
# Stack one in each channel
librosa.output.write_wav(
pair_file, np.array([midi_audio_aligned, audio_data]), create_data.FS)
# Write out diagnostics
with open(diagnostics_file, 'wb') as f:
json.dump({'p': list(p), 'q': list(q), 'score': score}, f)
if __name__ == '__main__':
# Utility function for getting lists of all files of a certain type
def get_file_list(extension):
return [os.path.join(REAL_WORLD_PATH, '{}{}'.format(n, extension))
for n in range(1000)]
# Construct lists of each type of files
audios = get_file_list('.mp3')
mids = get_file_list('.mid')
out_mids = get_file_list('-aligned.mid')
pairs = get_file_list('-pair.wav')
diags = get_file_list('-diagnostics.js')
# Process each file from each list in parallel
joblib.Parallel(n_jobs=20, verbose=51)(
joblib.delayed(process_one_file)(audio, mid, output_mid, pair, diag)
for (audio, mid, output_mid, pair, diag)
in zip(audios, mids, out_mids, pairs, diags))