forked from bmknecht/gruschen
/
mfcc.py
264 lines (208 loc) · 9.63 KB
/
mfcc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
import math
import unittest
import numpy as np
from signal_type import Signal
import utility
# unit test
class MFCCTest(unittest.TestCase):
    """Unit tests for the DTW metric and the individual MFCC stages."""

    maxDiff = None
    commonSampleRates = [8000, 16000, 32000, 44100]
    testSize = 100

    def sine(self):
        """Return sin(0), sin(1), ..., sin(testSize - 1) as an array."""
        return np.array([math.sin(x) for x in range(self.testSize)])

    def test_dynamic_time_warping_metric(self):
        # Equal inputs are close, larger offsets/scales are further away,
        # and swapping the arguments does not change the distance.
        dtwm = dynamic_time_warping_metric
        self.assertLess(dtwm(self.sine(), self.sine()), 1)
        self.assertLess(dtwm(self.sine(), self.sine()+0.9),
                        dtwm(self.sine(), self.sine()+1))
        self.assertLess(dtwm(np.zeros(self.testSize), self.sine()),
                        dtwm(np.zeros(self.testSize), self.sine()*1.1))
        self.assertLess(dtwm(np.zeros(self.testSize),
                             np.zeros(2*self.testSize)+1),
                        dtwm(np.zeros(self.testSize),
                             np.zeros(2*self.testSize)+1.1))
        self.assertAlmostEqual(dtwm(self.sine(), self.sine()*1.1),
                               dtwm(self.sine()*1.1, self.sine()))

    def test_pre_emphasize(self):
        # On a linear ramp the relative change introduced by the filter
        # should be roughly the same at every position.
        test_signal = _pre_emphasize(np.array([_ for _ in
                                               range(self.testSize)]))
        predictor_index = self.testSize - 1
        predictor = ((predictor_index - test_signal[predictor_index-1]) /
                     predictor_index)
        for i in range(2, len(test_signal)):
            self.assertLess(abs(predictor - (i+1 - test_signal[i]) / i), 0.1)

    def test_split_into_frame(self):
        # A ramp signal must be cut into equally long, contiguous frames.
        frames = _split_into_frame(Signal([_ for _ in
                                           range(self.commonSampleRates[1])],
                                          self.commonSampleRates[0]))
        for n, frame in enumerate(frames):
            self.assertEqual(len(frame), len(frames[0]))
            for i, v in enumerate(frame):
                self.assertEqual(v, n * len(frame) + i)

    def is_power_of_two(self, n):
        """Return True when ``n`` equals 2 raised to an integer power."""
        return n == 2 ** int(math.log2(n))

    def test_frame_characteristics(self):
        # Frame size and overlap are powers of two, overlap is half a frame.
        for sample_rate in self.commonSampleRates:
            frame_size, frame_overlap = _frame_characteristics(sample_rate)
            self.assertEqual(frame_size, frame_overlap * 2)
            self.assertTrue(self.is_power_of_two(frame_size))
            self.assertTrue(self.is_power_of_two(frame_overlap))

    def test_window(self):
        # A windowed constant frame rises towards the middle and is symmetric.
        impactful_number = 100
        not_just_one = 10
        windows = _window([np.array([impactful_number for _ in
                                     range(self.testSize)])
                           for _ in range(not_just_one)])
        for window in windows:
            for i in range(len(window) // 2 - 1):
                self.assertLess(window[i], window[i+1])
                self.assertAlmostEqual(window[i], window[len(window) - 1 - i])

    def test_fft(self):
        # Build signals from a known spectrum and check that _fft recovers
        # the squared magnitude (|2j| ** 2 == 4) at exactly those bins.
        a_lot = 12
        signals = [Signal([0j for _ in range(utility.window_size())], 2)
                   for _ in range(a_lot)]
        some_frequencies = [1, 15, 23]
        for i in range(len(signals)):
            for f in some_frequencies:
                signals[i][f] = 2j
        for i in range(len(signals)):
            signals[i] = Signal(np.fft.ifft(signals[i]), 1)
        signals = _fft(signals)
        for signal in signals:
            for i in range(len(signal)):
                if i in some_frequencies:
                    self.assertAlmostEqual(signal[i], 4)
                else:
                    self.assertAlmostEqual(signal[i], 0)

    def test_mel_filtering(self):
        # For a flat spectrum the filter outputs must not drop by more than
        # one between neighbouring bins, and equal frames give equal output.
        for sample_rate in self.commonSampleRates:
            not_just_one = 2
            windows = [Signal([1 for _ in range(utility.window_size())],
                              sample_rate)
                       for _ in range(not_just_one)]
            signal = _mel_filtering(windows)
            for i in range(len(signal[0])-1):
                self.assertLessEqual(signal[0][i]-1, signal[0][i+1])
                self.assertEqual(signal[0][i], signal[1][i])

    def test_non_linear_transform(self):
        not_just_one = 2
        signal = [self.sine() for _ in range(not_just_one)]
        transformed = _non_linear_transform(signal)
        # Iterate over the samples of a frame; len(signal) is only the
        # number of frames, which would check just the first two samples.
        for i in range(len(signal[0])):
            if signal[0][i] < logarithm_smallest_argument:
                self.assertEqual(transformed[0][i], logarithm_bottom_line)
            else:
                self.assertGreater(signal[0][i], transformed[0][i])
            self.assertEqual(transformed[0][i], transformed[1][i])
# Values that are not strictly greater than this threshold are not passed to
# math.log by _non_linear_transform.
logarithm_smallest_argument = 1e-22
# Replacement result _non_linear_transform uses for such values.
logarithm_bottom_line = -50
# Number of Mel centre points produced by _mel_bin_center; the cepstral
# transform divides by this count minus two.
mel_bins_count = 25
# dynamic time warping algorithm
def dynamic_time_warping_metric(s, t):
    """Return the dynamic-time-warping distance between two sequences.

    The local cost of aligning ``s[i]`` with ``t[j]`` is the Euclidean norm
    of their difference, so elements may be scalars or equally shaped
    vectors.

    The accumulation table carries one extra leading row and column acting
    as the "nothing consumed yet" border. This ensures the first element of
    each sequence takes part in the alignment; an n-by-m table whose loops
    start at index 1 would skip ``s[0]`` and ``t[0]`` entirely.

    Args:
        s: first sequence (indexable, supports ``len``).
        t: second sequence (indexable, supports ``len``).

    Returns:
        The accumulated cost of the cheapest warping path. Two empty
        sequences give 0.0; exactly one empty sequence gives infinity.
    """
    n = len(s)
    m = len(t)
    dtw = np.full((n + 1, m + 1), np.inf)
    dtw[0, 0] = 0.0
    for i in range(1, n + 1):
        for j in range(1, m + 1):
            # Table index k corresponds to sequence element k - 1.
            cost = np.linalg.norm(s[i - 1] - t[j - 1])
            dtw[i, j] = cost + min(dtw[i - 1, j],
                                   dtw[i, j - 1],
                                   dtw[i - 1, j - 1])
    return dtw[n, m]
def mfcc_diagnostics(signal):
    """Print the framing and Mel-bin parameters derived for ``signal``.

    Reports the sample rate, the frame size and overlap chosen by
    _frame_characteristics, the frame length in whole milliseconds and the
    spectrum indices returned by _mel_bin_center. Nothing is returned.
    """
    rate = signal.sampleRate
    size, overlap = _frame_characteristics(rate)
    bin_centres = _mel_bin_center(rate, size * 2)
    print("file MFCC diagnostics: ")
    print("\tsample rate: {} Hz".format(rate))
    print("\tframe size: {}".format(size))
    print("\tframe overlap: {}".format(overlap))
    # Samples per frame divided by samples per second, scaled to ms.
    length_ms = int(size / rate * 1000)
    print("\tframe length: {} ms".format(length_ms))
    print("\tcenter of Mel bins: {}".format(bin_centres))
# MFCC
def mfcc(signal):
    """Compute the feature frames of ``signal``.

    The processing chain is run twice, once on the signal as given and once
    on its pre-emphasised version, and the two results are joined with
    ``+`` (raw frames first).
    """
    raw_features = _mfcc_besides_pre_emphasizing(signal)
    emphasized_features = _mfcc_besides_pre_emphasizing(_pre_emphasize(signal))
    return raw_features + emphasized_features
def _mfcc_besides_pre_emphasizing(signal):
    """Run every MFCC stage except pre-emphasis on ``signal``.

    Stages, in order: framing, windowing, power spectrum, Mel filtering,
    logarithm, cepstral transform, removal of the highest frequency bin.
    """
    frames = _split_into_frame(signal)
    windowed = _window(frames)
    spectra = _fft(windowed)
    mel_energies = _mel_filtering(spectra)
    log_energies = _non_linear_transform(mel_energies)
    cepstra = _cepstral_coefficients(log_energies)
    return _zero_highest_frequency(cepstra)
def _pre_emphasize(signal):
    """Subtract 0.97 times the previous sample from every sample.

    The first sample has no predecessor, so zero is used in its place.
    """
    previous = signal[0:len(signal) - 1]
    # Shift the signal right by one sample, padding the front with zero.
    delayed = np.append(np.array([0]), previous)
    return signal - 0.97 * delayed
def _split_into_frame(signal):
    """Cut ``signal`` into frames of the size chosen for its sample rate.

    Frame centres start half a frame into the signal and advance by one
    whole frame; the last centre lies more than half a frame before the end.
    """
    frame_size, half_frame = _frame_characteristics(signal.sampleRate)
    centres = range(half_frame, len(signal) - half_frame, int(frame_size))
    return [_frame(signal, frame_size, centre) for centre in centres]
def _frame_characteristics(sample_rate):
    """Choose the frame size and half-frame size for ``sample_rate``.

    Starting from 128 samples, the size is doubled until
    utility.sample_count_to_time reports at least 15 for it (presumably
    milliseconds -- confirm against the utility module).

    Returns:
        A tuple ``(frame_size, frame_size // 2)``.
    """
    minimum_duration = 15
    size = 128
    while utility.sample_count_to_time(size, sample_rate) < minimum_duration:
        size *= 2
    half = size // 2
    return size, half
def _frame(signal, frame_size, i):
    """Return the ``frame_size`` samples of ``signal`` centred on index ``i``.

    ``frame_size`` must be even so the frame splits into two equal halves.
    """
    assert frame_size % 2 == 0
    half = frame_size // 2
    return signal[slice(i - half, i + half)]
def _window(signal):
    """Multiply every frame by a Hamming window.

    The window length is taken from the first frame, so all frames are
    expected to have that length.
    """
    length = len(signal[0])
    phase = 2*math.pi*np.arange(length) / (length-1)
    weights = 0.54 - 0.46 * np.cos(phase)
    windowed = []
    for frame in signal:
        windowed.append(frame * weights)
    return windowed
def _fft(signal):
    """Return the power spectrum (squared FFT magnitude) of every frame.

    Each result is wrapped in a Signal carrying the first frame's sample
    rate.
    """
    spectra = []
    for frame in signal:
        power = abs(np.fft.fft(frame)) ** 2
        spectra.append(Signal(power, signal[0].sampleRate))
    return spectra
def _mel_filtering(signal):
    """Apply the Mel filter bank to every frame.

    Each filter is described by three consecutive centre indices
    (left edge, peak, right edge), so the first and last centre only serve
    as edges and each frame yields ``len(centres) - 2`` values.
    """
    bin_centres = _mel_bin_center(signal[0].sampleRate, len(signal[0]) * 2)
    filter_indices = range(1, len(bin_centres) - 1)
    filtered = []
    for frame in signal:
        energies = [_mel_filter_window_sum(frame, k, bin_centres[k - 1:k + 2])
                    for k in filter_indices]
        filtered.append(np.array(energies))
    return filtered
def _mel_bin_center(samplerate, fft_size):
    """Return the spectrum indices of the Mel bin centres.

    ``mel_bins_count`` points are spaced evenly on the Mel scale between
    200 Hz and ``samplerate / 2 - 1`` Hz, converted back to Hz, scaled to
    spectrum indices and rounded to integers.
    """
    lowest_useful_frequency = 200
    mel_low = _mel_transform(lowest_useful_frequency)
    mel_high = _mel_transform(samplerate / 2 - 1)
    mel_centres = np.linspace(mel_low, mel_high, mel_bins_count)
    hz_centres = _mel_transform_invert(mel_centres)
    indices = (fft_size / 2 + 1) / samplerate * hz_centres
    return np.round(indices).astype(int)
def _mel_transform(frequency):
    """Convert a single frequency to the Mel scale.

    Computes ``2595 * log10(1 + frequency / 700)``.
    """
    ratio = frequency / 700
    return 2595 * math.log10(1 + ratio)
def _mel_transform_invert(mel_frequencies):
    """Convert an iterable of Mel values back to frequencies.

    Applies ``(10 ** (mel / 2595) - 1) * 700`` to each element and returns
    the results as an array.
    """
    frequencies = []
    for mel in mel_frequencies:
        frequencies.append((10**(mel / 2595)-1) * 700)
    return np.array(frequencies)
def _mel_filter_window_sum(signal, bin_index, center_of_bins):
    """Return the weighted sum of ``signal`` under one Mel filter.

    ``center_of_bins`` holds three indices (left edge, peak, right edge).
    Weights rise linearly to 1 from the left edge to the peak and fall
    linearly from 1 from the peak to the right edge; both ramps include the
    peak sample. ``bin_index`` is not used.
    """
    left, peak, right = center_of_bins
    rise_length = peak - left + 1
    fall_length = right - peak + 1
    # 1/rise_length, 2/rise_length, ..., 1
    rising = np.arange(1, rise_length + 1) / rise_length
    # 1, 1 - 1/fall_length, ..., 1/fall_length
    falling = 1 - np.arange(0, fall_length) / fall_length
    rising_part = sum(rising * signal[left:peak + 1])
    falling_part = sum(falling * signal[peak:right + 1])
    return rising_part + falling_part
def _non_linear_transform(signal):
    """Take the natural logarithm of every value in every frame.

    Values that are not greater than ``logarithm_smallest_argument`` are
    mapped to ``logarithm_bottom_line`` instead of being passed to the
    logarithm.
    """
    def clamped_log(value):
        if value > logarithm_smallest_argument:
            return math.log(value)
        return logarithm_bottom_line

    transformed = []
    for frame in signal:
        transformed.append(np.array([clamped_log(value) for value in frame]))
    return transformed
def _cepstral_coefficients(signal):
    """Compute the first 13 cepstral coefficients of every frame."""
    useful_mel_bins_count = 13
    orders = range(useful_mel_bins_count)
    coefficients = []
    for frame in signal:
        values = [_cepstral_single_coefficient(frame, order)
                  for order in orders]
        coefficients.append(np.array(values))
    return coefficients
def _cepstral_single_coefficient(signal, index, filter_count=None):
    """Return one DCT-II coefficient of a frame of log Mel energies.

    Computes ``sum_j signal_j * cos(pi * index / N * (j - 0.5))`` with the
    position ``j`` counted from 1. Counting from 0 with the same ``- 0.5``
    phase would give the first two values the identical basis weight
    (``cos`` is even), so the enumeration starts at 1.

    Args:
        signal: iterable of filter-bank values for one frame.
        index: order of the coefficient (0 yields the plain sum).
        filter_count: ``N`` in the formula. Defaults to
            ``mel_bins_count - 2``, the number of values the Mel filter
            bank produces per frame.

    Returns:
        The coefficient as a float (0 for an empty ``signal``).
    """
    if filter_count is None:
        filter_count = mel_bins_count - 2
    angular_step = math.pi * index / filter_count
    cosine_sum = 0
    for position, value in enumerate(signal, start=1):
        cosine_sum += value * math.cos(angular_step * (position - 0.5))
    return cosine_sum
def _zero_highest_frequency(signal):
    """Remove the highest frequency component from every frame.

    Each frame is transformed with a real FFT, its last bin is set to zero
    and the frame is transformed back. The inverse transform is given the
    original frame length explicitly: without it ``irfft`` assumes an even
    length, so an odd-length frame (e.g. 13 coefficients) would come back
    one sample short.

    Args:
        signal: list of real-valued frames.

    Returns:
        A list of arrays, each with the same length as its input frame.
    """
    filtered = []
    for frame in signal:
        spectrum = np.fft.rfft(frame)
        spectrum[-1] = 0
        filtered.append(np.fft.irfft(spectrum, n=len(frame)))
    return filtered