-
Notifications
You must be signed in to change notification settings - Fork 0
/
music-tagger-test.py
316 lines (275 loc) · 16.8 KB
/
music-tagger-test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
#coding: utf-8
#---------------------------------------------------------------------------------------------
# Description: a remake of Music Genre Classification with Deep Learning for Chainer
# Date: 2018.6
#---------------------------------------------------------------------------------------------
# This is based on
# audio_processor.py, example_tagging.py music_tagger_cnn.py and music_tagger_crnn.py
# of music-auto_tagging-keras <https://github.com/keunwoochoi/music-auto_tagging-keras>
# Copyright (c) 2016 Keunwoo Choi.
# Pls see LICENSE-music-auto_tagging-keras.md in the 'docs' directory
# and
# quick_test.py, tagger_net.py, train_tagger_net.py and utils.py
# of Music Genre Classification with Deep Learning <https://github.com/jsalbert/Music-Genre-Classification-with-Deep-Learning>
#----------------------------------------------------------------------------------------------
import argparse
import numpy as np
import chainer
from chainer import Chain, serializers, optimizers, cuda
import chainer.links as L
import chainer.functions as F
from chainer import Variable
import librosa
from librosa import feature, core, filters
from gru2 import *
from batch_normalization2 import *
# check version
# python 3.6.4 (64bit) on win32
# windows 10 (64bit)
# chainer (3.2.0)
# numpy (1.14.0)
# librosa (0.6.0)
# CRNN
class MusicTaggerCRNN(Chain):
    """Convolutional-recurrent music tagger: 4 conv/BN/pool blocks feeding
    two stacked stateful GRUs, then a sigmoid 10-way tag output.

    When ``net`` is given, every layer is initialized from the pre-trained
    weights held on that object (attributes such as ``conv1_W``, ``norm1_g``,
    ``gru1_W`` — presumably produced by the h5_load module; confirm there).
    With ``net=None`` Chainer's default initializers are used.
    """
    def __init__(self, net=None):
        super(MusicTaggerCRNN, self).__init__()
        with self.init_scope():
            # input_shape = (1, 96, 1440) (1, mel-bands, time frames)
            # norm0 normalizes over the (padded) 1440 time frames, not mel bands.
            self.norm0 = BatchNormalization2(1440, initial_gamma=net.norm0_g if net else None ,initial_beta=net.norm0_b if net else None,
                                             initial_avg_mean=net.norm0_m if net else None, initial_avg_var=net.norm0_v if net else None)
            # conv1 1->64
            self.conv1 = L.Convolution2D(1, 64, 3, pad=1, initialW=net.conv1_W if net else None ,initial_bias=net.conv1_b if net else None)
            self.norm1 = BatchNormalization2(64, initial_gamma=net.norm1_g if net else None ,initial_beta=net.norm1_b if net else None,
                                             initial_avg_mean=net.norm1_m if net else None, initial_avg_var=net.norm1_v if net else None)
            # conv2 64->128
            self.conv2 = L.Convolution2D(64, 128, 3, pad=1, initialW=net.conv2_W if net else None ,initial_bias=net.conv2_b if net else None)
            self.norm2 = BatchNormalization2(128, initial_gamma=net.norm2_g if net else None ,initial_beta=net.norm2_b if net else None,
                                             initial_avg_mean=net.norm2_m if net else None, initial_avg_var=net.norm2_v if net else None)
            # conv3 128->128
            self.conv3 = L.Convolution2D(128, 128, 3, pad=1, initialW=net.conv3_W if net else None ,initial_bias=net.conv3_b if net else None)
            self.norm3 = BatchNormalization2(128, initial_gamma=net.norm3_g if net else None ,initial_beta=net.norm3_b if net else None,
                                             initial_avg_mean=net.norm3_m if net else None, initial_avg_var=net.norm3_v if net else None)
            # conv4 128->128
            self.conv4 = L.Convolution2D(128, 128, 3, pad=1, initialW=net.conv4_W if net else None ,initial_bias=net.conv4_b if net else None)
            self.norm4 = BatchNormalization2(128, initial_gamma=net.norm4_g if net else None ,initial_beta=net.norm4_b if net else None,
                                             initial_avg_mean=net.norm4_m if net else None, initial_avg_var=net.norm4_v if net else None)
            # GRU1 (statefull) 128->32
            self.gru1 = StatefulGRU2(128, 32, init=net.gru1_W if net else None, inner_init=net.gru1_U if net else None, bias_init=net.gru1_b if net else None,
                                     init_r=net.gru1_W_r if net else None, inner_init_r=net.gru1_U_r if net else None, bias_init_r=net.gru1_b_r if net else None,
                                     init_z=net.gru1_W_z if net else None, inner_init_z=net.gru1_U_z if net else None, bias_init_z=net.gru1_b_z if net else None )
            # GRU2 (statefull) 32->32
            self.gru2 = StatefulGRU2(32, 32, init=net.gru2_W if net else None, inner_init=net.gru2_U if net else None, bias_init=net.gru2_b if net else None,
                                     init_r=net.gru2_W_r if net else None, inner_init_r=net.gru2_U_r if net else None, bias_init_r=net.gru2_b_r if net else None,
                                     init_z=net.gru2_W_z if net else None, inner_init_z=net.gru2_U_z if net else None, bias_init_z=net.gru2_b_z if net else None )
            # full connection 32->10 (32->50)
            self.fc1 = L.Linear(32, 10, initialW=net.fc1_W if net else None ,initial_bias=net.fc1_b if net else None)
    def __call__(self, X):
        """Forward pass. X is a (batch, 1, 96, 1366) log-mel spectrogram;
        returns per-tag sigmoid scores of shape (batch, 10)."""
        h0 = F.pad(X, ((0,0),(0,0),(0,0),(37,37)), 'constant') # (1, 96, 1366) -> (1, 96, 1440)
        # transpose so norm0 runs over the time axis, then transpose back
        h1 = F.transpose( self.norm0( F.transpose(h0,axes=(0, 3 , 1, 2)) ), axes=(0, 2 , 3, 1) ) # normalize along time axis is OK?
        h1 = F.max_pooling_2d( F.elu(self.norm1(self.conv1(h1))), (2,2), stride=(2,2) )
        h1 = F.dropout(h1, ratio=0.1)
        h2 = F.max_pooling_2d( F.elu(self.norm2(self.conv2(h1))), (3,3), stride=(3,3) )
        h2 = F.dropout(h2, ratio=0.1)
        h3 = F.max_pooling_2d( F.elu(self.norm3(self.conv3(h2))), (4,4), stride=(4,4) )
        h3 = F.dropout(h3, ratio=0.1)
        h4 = F.max_pooling_2d( F.elu(self.norm4(self.conv4(h3))), (4,4), stride=(4,4) )
        h4 = F.dropout(h4, ratio=0.1)
        # (batch, 128, 1, 15) -> (batch, 15, 128): a 15-step sequence of
        # 128-dim feature vectors for the recurrent part
        h4 = F.transpose( h4, axes=(0, 3 , 1, 2) )
        h4 = F.reshape( h4, (h4.shape[0], 15,128) )
        self.gru1.reset_state() # reset hidden states per. track Is this OK?
        self.gru2.reset_state() # reset hidden states per. track Is this OK?
        # run the 15 time steps; only the last GRU output is used below
        for i in range (h4.shape[1]):
            h5 = self.gru1(h4[:,i,:])
            h6 = self.gru2(h5)
        h6 = F.dropout(h6, ratio=0.3)
        h7 = F.sigmoid(self.fc1(h6))
        return h7
    def load(self, fname="data/crnn.model"):
        # Restore the weights from a Chainer npz snapshot.
        serializers.load_npz(fname, self)
    def save(self, fname="data/crnn.model"):
        # Save the weights as a Chainer npz snapshot.
        serializers.save_npz(fname, self)
# CNN
# CNN model in keras 1.0.6, of which BatchNormalization is something special.
# initial_avg_var=(net.normx_v)**2
#
# audio_convnet.py with cnn_weights_theano.h5:
# BatchNormalization mode
# norm0 mode=0
# norm1-5 mode=2 : using per-batch statistics to normalize the data during both training and testing
#
class MusicTaggerCNN(Chain):
    """Fully-convolutional music tagger: 5 conv/BN/pool blocks and a
    sigmoid 50-way tag output.

    When ``net`` is given, layers are initialized from pre-trained keras
    weights. Note the stored values ``net.normx_v`` are squared here —
    presumably the h5 file stores stddevs while BatchNormalization2 expects
    variances (see the comment block above this class; confirm in h5_load).
    ``mode2=True`` sets ``enforce_compute`` on norm1..norm5, emulating keras
    BatchNormalization mode=2 (per-batch statistics).
    """
    def __init__(self, net=None, mode2=False):
        super(MusicTaggerCNN, self).__init__()
        with self.init_scope():
            # input_shape = (1, 96, 1366) (1, mel-bands, time frames)
            self.norm0 = BatchNormalization2(1366, initial_gamma=net.norm0_g if net else None ,initial_beta=net.norm0_b if net else None,
                                             initial_avg_mean=net.norm0_m if net else None, initial_avg_var=(net.norm0_v)**2 if net else None, eps=1e-5)
            # conv1 1->32
            self.conv1 = L.Convolution2D(1, 32, 3, pad=1, initialW=net.conv1_W if net else None ,initial_bias=net.conv1_b if net else None)
            self.norm1 = BatchNormalization2(32, initial_gamma=net.norm1_g if net else None ,initial_beta=net.norm1_b if net else None,
                                             initial_avg_mean=net.norm1_m if net else None, initial_avg_var=(net.norm1_v)**2 if net else None,
                                             enforce_compute=mode2)
            # conv2 32->128
            self.conv2 = L.Convolution2D(32, 128, 3, pad=1, initialW=net.conv2_W if net else None ,initial_bias=net.conv2_b if net else None)
            self.norm2 = BatchNormalization2(128, initial_gamma=net.norm2_g if net else None ,initial_beta=net.norm2_b if net else None,
                                             initial_avg_mean=net.norm2_m if net else None, initial_avg_var=(net.norm2_v)**2 if net else None,
                                             enforce_compute=mode2)
            # conv3 128->128
            self.conv3 = L.Convolution2D(128, 128, 3, pad=1, initialW=net.conv3_W if net else None ,initial_bias=net.conv3_b if net else None)
            self.norm3 = BatchNormalization2(128, initial_gamma=net.norm3_g if net else None ,initial_beta=net.norm3_b if net else None,
                                             initial_avg_mean=net.norm3_m if net else None, initial_avg_var=(net.norm3_v)**2 if net else None,
                                             enforce_compute=mode2)
            # conv4 128->192
            self.conv4 = L.Convolution2D(128, 192, 3, pad=1, initialW=net.conv4_W if net else None ,initial_bias=net.conv4_b if net else None)
            self.norm4 = BatchNormalization2(192, initial_gamma=net.norm4_g if net else None ,initial_beta=net.norm4_b if net else None,
                                             initial_avg_mean=net.norm4_m if net else None, initial_avg_var=(net.norm4_v)**2 if net else None,
                                             enforce_compute=mode2)
            # conv5 128->256
            self.conv5 = L.Convolution2D(192, 256 , 3, pad=1, initialW=net.conv5_W if net else None ,initial_bias=net.conv5_b if net else None)
            self.norm5 = BatchNormalization2(256, initial_gamma=net.norm5_g if net else None ,initial_beta=net.norm5_b if net else None,
                                             initial_avg_mean=net.norm5_m if net else None, initial_avg_var=(net.norm5_v)**2 if net else None,
                                             enforce_compute=mode2)
            # full connection 256->50
            self.fc1 = L.Linear(256, 50, initialW=net.fc1_W if net else None ,initial_bias=net.fc1_b if net else None)
        # remember the mode for reference (not read again inside this class)
        self.mode2=mode2
    def __call__(self, X):
        """Forward pass. X is a (batch, 1, 96, 1366) log-mel spectrogram;
        returns per-tag sigmoid scores of shape (batch, 50)."""
        # transpose so norm0 runs over the time axis, then transpose back
        h0 = F.transpose( self.norm0( F.transpose(X,axes=(0, 3 , 1, 2)) ), axes=(0, 2 , 3, 1) ) # swap axis
        h1 = F.max_pooling_2d( F.elu(self.norm1(self.conv1(h0))), (2,4) , cover_all=False)
        h1 = F.dropout(h1, ratio=0.5)
        h2 = F.max_pooling_2d( F.elu(self.norm2(self.conv2(h1))), (2,4) , cover_all=False)
        h2 = F.dropout(h2, ratio=0.5)
        h3 = F.max_pooling_2d( F.elu(self.norm3(self.conv3(h2))), (2,4) ,cover_all=False)
        h3 = F.dropout(h3, ratio=0.5)
        h4 = F.max_pooling_2d( F.elu(self.norm4(self.conv4(h3))), (3,5) ,cover_all=False)
        h4 = F.dropout(h4, ratio=0.5)
        h5 = F.max_pooling_2d( F.elu(self.norm5(self.conv5(h4))), (4,4) ) #,cover_all=False)
        h5 = F.dropout(h5, ratio=0.5)
        # spatial dims are pooled down to 1x1 -> flatten to (batch, 256)
        h5 = F.reshape( h5, (h5.shape[0], 256))
        h6 = F.sigmoid(self.fc1(h5))
        return h6
    def load(self, fname='data/cnn.model'):
        # Restore the weights from a Chainer npz snapshot.
        serializers.load_npz(fname, self)
    def save(self, fname="data/cnn.model"):
        # Save the weights as a Chainer npz snapshot.
        serializers.save_npz(fname, self)
def compute_melgram(audio_path, SR=12000, N_FFT=512, N_MELS=96, HOP_LEN=256, DURA=29.12): # compute only center portion of the track
    """Load an audio file and return its log-scaled mel-spectrogram.

    Only DURA seconds around the center of the track are kept; shorter
    tracks are zero-padded on the right. With the default parameters the
    result has shape (1, 1, 96, 1366) — (batch, channel, mel-bands, frames).
    """
    print ('loading...', audio_path)
    signal, _sr = librosa.load(audio_path, sr=SR)  # resampled mono signal
    target_len = int(DURA*SR)
    current_len = signal.shape[0]
    if current_len < target_len:
        # too short: append zeros up to the target length
        signal = np.hstack((signal, np.zeros((target_len - current_len,))))
    elif current_len > target_len:
        # too long: keep the centered slice of target_len samples
        start = int((current_len - target_len) / 2)
        signal = signal[start:start + target_len]
    # melspectrogram outputs power; converted to dB with amplitude_to_db
    # NOTE(review): power_to_db may be the more faithful conversion — kept as-is
    spec = feature.melspectrogram(y=signal, sr=SR, hop_length=HOP_LEN,
                                  n_fft=N_FFT, n_mels=N_MELS)
    log_spec = core.amplitude_to_db(spec, ref=1.0)
    # prepend batch and channel axes: (mel, time) -> (1, 1, mel, time)
    return log_spec[np.newaxis, np.newaxis, :]
def load_wav_and_get_melgrams(audio_paths):
    """Compute the mel-spectrogram for every file in *audio_paths*.

    Returns an array of shape (len(audio_paths), 1, 96, 1366); an empty
    path list yields an empty (0, 1, 96, 1366) array, matching the old
    behavior. Replaces the old per-file np.concatenate (quadratic copying)
    with a single concatenation.
    """
    melgrams = [compute_melgram(audio_path) for audio_path in audio_paths]
    if not melgrams:
        # preserve the historical empty-result shape
        return np.zeros((0, 1, 96, 1366))
    return np.concatenate(melgrams, axis=0)
def sort_result(tags, preds):
    """Pair each tag name with its score and return the pairs sorted by
    descending score, with scores formatted as '%5.3f' strings."""
    ranked = sorted(zip(tags, preds), key=lambda pair: pair[1], reverse=True)
    return [(tag, '%5.3f' % score) for tag, score in ranked]
def print_result(tags, pred_tags, percent=True, paths=None):
    """Print the ranked tag predictions for each song.

    tags      : list of tag names, one per model output unit.
    pred_tags : array-like of shape (n_songs, n_tags) with raw scores.
    percent   : when True, scale each song's scores to sum to 100.
    paths     : per-row labels (file paths). Defaults to the module-level
                ``audio_paths`` for backward compatibility — the old code
                read that global directly, which raised NameError when this
                function was used outside the __main__ script.
    """
    if paths is None:
        paths = audio_paths  # historical fallback to the script's global
    print('* top-10 tags: genre prediction result percentage *')
    for song_idx, audio_path in enumerate(paths):
        # work on a float copy — the old code normalized the caller's
        # array in place (and had a no-op else branch)
        scores = np.array(pred_tags[song_idx, :], dtype=float)
        if percent:
            scores = scores / scores.sum() * 100.
        sorted_result = sort_result(tags, scores.tolist())
        print(audio_path)
        print(sorted_result[:5])
        print(sorted_result[5:10])
        print(' ')
# music-auto_tagging-keras Tags 50
# The 50 last.fm tags predicted by the CNN model, in output-unit order.
tag50 = ['rock', 'pop', 'alternative', 'indie', 'electronic',
         'female vocalists', 'dance', '00s', 'alternative rock', 'jazz',
         'beautiful', 'metal', 'chillout', 'male vocalists',
         'classic rock', 'soul', 'indie rock', 'Mellow', 'electronica',
         '80s', 'folk', '90s', 'chill', 'instrumental', 'punk',
         'oldies', 'blues', 'hard rock', 'ambient', 'acoustic',
         'experimental', 'female vocalist', 'guitar', 'Hip-Hop',
         '70s', 'party', 'country', 'easy listening',
         'sexy', 'catchy', 'funk', 'electro', 'heavy metal',
         'Progressive rock', '60s', 'rnb', 'indie pop',
         'sad', 'House', 'happy']
# GTZAN Dataset: Tags10
# The 10 GTZAN genres predicted by the CRNN model, in output-unit order.
tag10 = ['blues', 'classical', 'country', 'disco', 'hiphop', 'jazz', 'metal', 'pop', 'reggae', 'rock']
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Music Tagger for Chainer')
    parser.add_argument('--modelSel', '-m', default='CRNN', help='model select CRNN Or CNN, CNN0')
    # NOTE(review): store_false means args.en is True by default and becomes
    # False when --en is given, i.e. --en selects the keras-h5 branch below.
    parser.add_argument('--en', action='store_false', help='add --en if use keras h5 weight data')
    args = parser.parse_args()
    # Set audio_path to genre prediction.
    audio_paths = ['data/bensound-thejazzpiano.wav', # jazz, duration around 32sec, 44100Hz Mono
                   'data/bensound-actionable.wav'] # rock, duration around 32sec, 44100Hz Mono
    # load wav file and compute melgram
    melgrams2=np.array(load_wav_and_get_melgrams(audio_paths),dtype=np.float32)
    if args.en:
        # default path: load previously converted Chainer model files
        if args.modelSel == 'CRNN':
            model_cnn = MusicTaggerCRNN()
            model_cnn.load()
            tags=10 # default
        else:
            if args.modelSel == 'CNN0':
                print ('set to use mode 2')
                model_cnn = MusicTaggerCNN(mode2=True)
                model_cnn.load(fname='data/cnn0.model')
            else: # 'CNN'
                model_cnn = MusicTaggerCNN()
                model_cnn.load()
            tags=50
    else:
        # --en path: build the model from keras h5 weights, then save it
        # as a Chainer model file for later runs
        if args.modelSel == 'CRNN':
            from h5_load import Class_net_from_h5_CRNN
            net0= Class_net_from_h5_CRNN()
            model_cnn = MusicTaggerCRNN(net=net0)
            # save the weights as model file
            model_cnn.save()
        elif args.modelSel == 'CNN0' :
            from h5_load import Class_net_from_h5_CNN
            net0= Class_net_from_h5_CNN(IN_FILE = 'data/cnn_weights_theano.h5')
            model_cnn = MusicTaggerCNN(net=net0, mode2=True)
            # save the weights as model file
            model_cnn.save(fname='data/cnn0.model')
        else: # 'CNN'
            from h5_load import Class_net_from_h5_CNN
            net0= Class_net_from_h5_CNN()
            model_cnn = MusicTaggerCNN(net=net0)
            # save the weights as model file
            model_cnn.save()
        # set tag number
        tags=net0.tags
    # enter chainer into test mode (no train mode)
    with chainer.using_config('train', False):
        pred_tags=model_cnn(melgrams2)
    # show genre prediction result
    if tags == 10:
        print_result(tag10,pred_tags.data)
    else:
        print_result(tag50,pred_tags.data) #, percent=False )