/
utils.py
253 lines (202 loc) · 8.71 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
import numpy as np
#NOTE: this script requires librosa version 0.6.3 (newer versions changed the feature API)
import librosa
import os
from keras.utils import to_categorical
import matplotlib.pyplot as plt
import wave
import pylab
import librosa.display
from sklearn.model_selection import train_test_split
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D, BatchNormalization, GlobalAveragePooling2D
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras.utils import plot_model
from sklearn.metrics import classification_report
#Function that converts a wav audio file into a padded mfcc
def wav_to_mfcc(file_path, max_pad_len=20):
    """Load a wav file and return a fixed-width MFCC matrix.

    The signal is loaded at its native rate, decimated by keeping every
    3rd sample, and converted to MFCCs; the time axis is then zero-padded
    (or truncated) to exactly `max_pad_len` frames so every file yields
    the same shape.

    Args:
        file_path: path to the .wav file.
        max_pad_len: number of MFCC time frames in the returned matrix.

    Returns:
        np.ndarray of shape (n_mfcc, max_pad_len).
    """
    # `signal` rather than `wave` — the original name shadowed the
    # imported `wave` module.
    signal, sr = librosa.load(file_path, mono=True, sr=None)
    signal = signal[::3]
    # NOTE(review): sr=8000 is hard-coded while the loaded rate is
    # discarded; this assumes the recordings are 24 kHz originals
    # (8000 * 3 after decimation) — confirm against the dataset.
    # `y=` keyword keeps this working on librosa >= 0.7, where the
    # positional audio argument was removed.
    mfcc = librosa.feature.mfcc(y=signal, sr=8000)
    pad_width = max_pad_len - mfcc.shape[1]
    if pad_width > 0:
        # Zero-pad the time axis up to max_pad_len columns.
        mfcc = np.pad(mfcc, pad_width=((0, 0), (0, pad_width)), mode='constant')
    else:
        # Clip longer than max_pad_len: truncate instead of crashing
        # inside np.pad with a negative pad width.
        mfcc = mfcc[:, :max_pad_len]
    return mfcc
#Function looks through every wav file in a certain folder, converts them to MFCCs then return the features and labels
def get_data(folder):
    """Collect MFCC features and one-hot labels for every .wav in `folder`.

    File names are expected to begin with the integer class label followed
    by an underscore, e.g. '3_jackson_12.wav'.

    Returns:
        (mfccs, labels): array of shape (num_files, n_mfcc, max_pad_len)
        and the matching one-hot label matrix.
    """
    print("Fetching wav data from " + folder)  # space added after 'from'
    labels = []
    mfccs = []
    # sorted() makes the sample order deterministic across platforms;
    # os.path.join avoids depending on a trailing slash in `folder`.
    for f in sorted(os.listdir(folder)):
        if f.endswith('.wav'):
            # MFCC features
            mfccs.append(wav_to_mfcc(os.path.join(folder, f)))
            # Leading token of the file name is the spoken-digit label.
            labels.append(f.split('_')[0])
    return np.asarray(mfccs), to_categorical(labels)
#Splits mfcc data into training, validation, and testing sets
def prepare_data(folder):
    """Build train/validation/test splits from the wav files in `folder`.

    Returns:
        X_train, X_val, X_test, y_train, y_val, y_test and `input_output`,
        a [(height, width, channels), num_classes] pair consumed by the
        model builders.
    """
    print("Preparing Data")
    mfccs, labels = get_data(folder)
    dim_1 = mfccs.shape[1]
    dim_2 = mfccs.shape[2]
    channels = 1
    # Derive the class count from the one-hot labels instead of
    # hard-coding 10, so the pipeline works for any label set.
    classes = labels.shape[1]
    # Add a trailing channel axis so Conv2D accepts the input.
    X = mfccs.reshape((mfccs.shape[0], dim_1, dim_2, channels))
    y = labels
    input_output = [(dim_1, dim_2, channels), classes]
    # 80/20 train+val vs test, then 75/25 train vs val -> 60/20/20 overall.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1)
    return X_train, X_val, X_test, y_train, y_val, y_test, input_output
#Based on the parameters returns a compiled model with the desired characteristics
def prepare_model(input_output, modeltype = 'CNN', dropout = True, batch_n = True, maxpooling = True, optimizer = None):
    """Build, summarize and diagram a compiled model.

    Args:
        input_output: [(height, width, channels), num_classes] as produced
            by prepare_data().
        modeltype: 'CNN' or 'FF'.
        dropout, batch_n, maxpooling: layer toggles forwarded to the builder.
        optimizer: a keras optimizer; a fresh Adam() is created when None.

    Raises:
        ValueError: for an unrecognized modeltype.
    """
    # A keras.optimizers.Adam() default in the signature would be built
    # once at import time and shared (including its state) by every call;
    # create a fresh optimizer per call instead.
    if optimizer is None:
        optimizer = keras.optimizers.Adam()
    print("Creating Model")
    if modeltype == 'CNN':
        model = get_cnn_model(input_output[0], input_output[1], dropout, batch_n, maxpooling, optimizer)
        print(model.summary())
        plot_model(model, to_file='images/CNN_model.png')
    elif modeltype == 'FF':
        model = get_feedforward_model(input_output[0], input_output[1], dropout, batch_n, optimizer)
        print(model.summary())
        plot_model(model, to_file='images/FF_model.png')
    else:
        raise ValueError('Not An Acceptable Model Type Sorry!')
    return model
#Generates wave graph based on an audio file
def generate_wave_graph(filename):
    """Plot the raw waveform of ./recordings/<filename> and save it under images/."""
    file = './recordings/' + filename
    with wave.open(file, 'r') as wav_file:
        # Extract raw audio from the wav file.
        frames = wav_file.readframes(-1)
        # np.int16 replaces the legacy 'Int16' alias string, which was
        # removed from numpy's dtype aliases.
        signal = np.frombuffer(frames, dtype=np.int16)
        # De-interleave the samples into one list per channel.
        channels = [[] for channel in range(wav_file.getnchannels())]
        for index, datum in enumerate(signal):
            channels[index % len(channels)].append(datum)
        # Convert sample indices to seconds on the x axis.
        fs = wav_file.getframerate()
        Time = np.linspace(0, len(signal) / len(channels) / fs, num=int(len(signal) / len(channels)))
        # Plot every channel against the shared time axis.
        plt.figure(1)
        plt.title('Signal Wave')
        for channel in channels:
            plt.plot(Time, channel)
        plt.savefig('images/waveform' + filename + '.png')
        plt.show()
#Generates a spectogram based on an audio file
def generate_spectogram(filename):
    """Render a spectrogram of ./recordings/<filename> and save it under images/."""
    samples, rate = get_wav_info('./recordings/' + filename)
    pylab.figure(num=None, figsize=(19, 12))
    pylab.subplot(111)
    pylab.title('Spectrogram of %r' % filename)
    # matplotlib computes the STFT internally from the raw samples.
    pylab.specgram(samples, Fs=rate)
    pylab.savefig('images/spectrogram' + filename + '.png')
    pylab.show()
#Gets the info from a wave file
def get_wav_info(filename):
    """Return (sound_info, frame_rate) for a wav file.

    sound_info is a 1-D np.int16 array of the interleaved PCM samples;
    frame_rate is the file's sampling rate in Hz.
    """
    with wave.open(filename, 'r') as wav_file:
        frames = wav_file.readframes(-1)
        # np.frombuffer replaces pylab.fromstring: np.fromstring on binary
        # data is deprecated and the pylab alias has been removed.
        sound_info = np.frombuffer(frames, dtype=np.int16)
        frame_rate = wav_file.getframerate()
    return sound_info, frame_rate
#Generates an MFCC graph for a given wave file
def generate_mfcc_graph(filename):
    """Compute a 4-coefficient MFCC of ./recordings/<filename>, plot and save it."""
    audio, sample_rate = librosa.load('./recordings/' + filename)
    coefficients = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=4)
    # Heat-map of coefficients over time.
    librosa.display.specshow(coefficients, x_axis='time')
    plt.colorbar()
    plt.tight_layout()
    plt.title('mfcc')
    plt.savefig('images/mfcc' + filename + '.png')
    plt.show()
#Generates all three graphs for one file
def generate_graphs(filename):
    """Produce the waveform, MFCC and spectrogram figures for one recording."""
    # Same call order as before: waveform, MFCC, then spectrogram.
    for plot_fn in (generate_wave_graph, generate_mfcc_graph, generate_spectogram):
        plot_fn(filename)
#Creates and compiles a CNN based on desired hyperparameters
def get_cnn_model(input_shape, num_classes, dropout = True, batch_n = True, maxpooling = True, optimizer = None):
    """Build and compile a small 2-conv CNN classifier.

    Args:
        input_shape: (height, width, channels) of one MFCC sample.
        num_classes: size of the softmax output layer.
        dropout: include Dropout layers after the dense blocks.
        batch_n: include BatchNormalization after each learned layer.
        maxpooling: include a 2x2 MaxPooling2D after the conv stack.
        optimizer: keras optimizer; a fresh Adam() is created when None.

    Returns:
        A compiled keras Sequential model (categorical crossentropy, accuracy).
    """
    # A default of keras.optimizers.Adam() in the signature is evaluated
    # once at import time, so its internal state would be shared by every
    # model built with the default — create a fresh instance per call.
    if optimizer is None:
        optimizer = keras.optimizers.Adam()
    model = Sequential()
    model.add(Conv2D(32, kernel_size=(2, 2), activation='relu', input_shape=input_shape))
    if batch_n:
        model.add(BatchNormalization())
    model.add(Conv2D(48, kernel_size=(2, 2), activation='relu'))
    if batch_n:
        model.add(BatchNormalization())
    if maxpooling:
        model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    if batch_n:
        model.add(BatchNormalization())
    if dropout:
        model.add(Dropout(0.25))
    model.add(Dense(64, activation='relu'))
    if batch_n:
        model.add(BatchNormalization())
    if dropout:
        model.add(Dropout(0.4))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss=keras.losses.categorical_crossentropy, optimizer=optimizer, metrics=['accuracy'])
    return model
#Creates and compiles a Feedforward Network based on desired hyperparameters
def get_feedforward_model(input_shape, num_classes, dropout = True, batch_n = True, optimizer = None):
    """Build and compile a dense feed-forward classifier.

    Args:
        input_shape: (height, width, channels) of one MFCC sample.
        num_classes: size of the softmax output layer.
        dropout: include Dropout layers between dense blocks.
        batch_n: include BatchNormalization after each dense layer.
        optimizer: keras optimizer; a fresh Adam() is created when None.

    Returns:
        A compiled keras Sequential model (categorical crossentropy, accuracy).
    """
    # Same fix as get_cnn_model: avoid a shared Adam() instance created
    # once at import time as the default argument.
    if optimizer is None:
        optimizer = keras.optimizers.Adam()
    model = Sequential()
    model.add(Dense(128, activation='relu', input_shape=input_shape))
    model.add(Flatten())
    if batch_n:
        model.add(BatchNormalization())
    if dropout:
        model.add(Dropout(0.25))
    model.add(Dense(64, activation='relu'))
    if batch_n:
        model.add(BatchNormalization())
    if dropout:
        model.add(Dropout(0.4))
    model.add(Dense(64, activation='relu'))
    if batch_n:
        model.add(BatchNormalization())
    if dropout:
        model.add(Dropout(0.4))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss=keras.losses.categorical_crossentropy, optimizer=optimizer,
                  metrics=['accuracy'])
    return model
#Returns a test score for a given model and test set and prints a classification report
def evaluate_model(model, test_X, test_y):
    """Load the saved model at path `model`, print its test accuracy and a per-class report.

    Args:
        model: filesystem path to a saved keras model (.h5).
        test_X: test features shaped for the model's input.
        test_y: one-hot encoded test labels.
    """
    model = keras.models.load_model(model)
    # argmax over the softmax output is equivalent to the removed
    # Sequential.predict_classes and works on newer keras versions too.
    predictions = np.argmax(model.predict(test_X), axis=1)
    # `scores` rather than `eval` — the original shadowed the builtin.
    scores = model.evaluate(test_X, test_y)
    print('Test Score: ' + str(scores[1]))
    # num_classes pins the one-hot width so the report does not break
    # when some class is never predicted.
    print(classification_report(test_y, to_categorical(predictions, num_classes=test_y.shape[1])))
#Plots the loss history
def plot_losses(history):
    """Plot training vs. validation loss per epoch and save the figure.

    Args:
        history: the keras History object returned by model.fit().
    """
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    # These curves are losses, not accuracy — title/labels fixed accordingly.
    plt.title('Model loss')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='upper left')
    plt.savefig('images/loss_curve.png')
    plt.show()
#main method that runs through a default run of the neural network
if __name__ == '__main__':
    # Visualize one sample recording before training.
    print("Generating Graphs")
    generate_wave_graph('0_jackson_0.wav')
    generate_spectogram('0_jackson_0.wav')
    generate_mfcc_graph('0_jackson_0.wav')
    print("Done")
    X_train, X_val, X_test, y_train, y_val, y_test, input_output = prepare_data('./recordings/')
    model = prepare_model(input_output)
    # Checkpoint the best model by validation loss and log to TensorBoard.
    # (The previously-built unused `keras_callback` local was removed.)
    callbacks = [ModelCheckpoint(filepath='models/model.h5', monitor='val_loss', save_best_only=True),
                 TensorBoard(log_dir='./Graph', histogram_freq=1,
                             write_graph=False, write_images=False)]
    # validation_data must be a tuple (not a list) on newer keras versions.
    history = model.fit(X_train, y_train, batch_size=32, epochs=50, verbose=2,
                        validation_data=(X_val, y_val), callbacks=callbacks)
    # Plot training & validation loss values.
    plot_losses(history)
    evaluate_model('models/model.h5', X_test, y_test)