forked from LittleLaa/SpeechRecognitionSystem
/
kmeans.py
303 lines (284 loc) · 17.5 KB
/
kmeans.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
# -*- coding: utf-8 -*-
__author__ = 'lufo'
import numpy
import math
import DTW
import SR
def get_covariance(data_array):
    """
    get the diagonal covariance matrix and the mean of the input data
    only the per-feature variances are computed,all off-diagonal terms of the returned matrix are 0
    the sum of squared deviations of every feature is floored at 0.00001 before it is divided by the
    number of data,so no variance on the diagonal is exactly 0
    :param data_array: array,every element of it is a data which has some features
    (in this project the number of features is 39)
    :return: (covariance_matrix, mean),covariance_matrix is a square array with one row per feature,
    mean is a 1-D array with one entry per feature;for empty input a 0x0 matrix and an empty mean are returned
    """
    number_of_data = len(data_array)
    if number_of_data == 0:
        # nothing to average: hand back empty results instead of dividing by zero
        return numpy.zeros([0, 0]), numpy.zeros(0)
    data = numpy.asarray(data_array, dtype=float)
    mean = data.sum(axis=0) / number_of_data
    deviation = data - mean
    sum_of_squares = (deviation * deviation).sum(axis=0)
    # floor first,then normalise,so the smallest possible variance is 0.00001 / number_of_data
    variance = numpy.maximum(sum_of_squares, 0.00001) / number_of_data
    return numpy.diag(variance), mean
def get_mahalanobis_distance(covariance_matrix, mean, segment):
    """
    get mahalanobis distance between a state and a segment
    the distance is the quadratic form (segment - mean)^T * inverse(covariance_matrix) * (segment - mean),
    no square root is taken
    the node cost is half of (that distance + the sum over the diagonal of log(2 * pi * variance))
    :param covariance_matrix: array,the covariance matrix of a state
    :param mean: the mean of a state
    :param segment: array,a segment that has some features(in this project the number of features is 39)
    :return: mahalanobis distance,node cost for DTW
    a list [inf, inf] is returned when covariance_matrix is empty,otherwise a tuple
    """
    if not len(covariance_matrix):  # this state has no frame
        return [float('inf'), float('inf')]
    inv_covariance_matrix = numpy.linalg.inv(covariance_matrix)
    difference_between_segment_and_mean = numpy.subtract(segment, mean)
    mahalanobis_distance = numpy.dot(
        numpy.dot(difference_between_segment_and_mean.transpose(), inv_covariance_matrix),
        difference_between_segment_and_mean)
    # only the diagonal of the covariance matrix enters the log term
    log_term = 0
    for variance in numpy.diagonal(covariance_matrix):
        log_term += math.log(2 * math.pi * variance)
    node_cost = (log_term + mahalanobis_distance) / 2
    return mahalanobis_distance, node_cost
def initialize_states(templates, number_of_templates, number_of_states=5):
    """
    initialize states for each template by splitting its frames as evenly as possible,
    if this template has 12 frames,then the number of frames in each state is 2,2,2,3,3
    :param templates: list,each element of it is a template,each element of a template is a frame,which has 39 features
    :param number_of_templates: only the first number_of_templates templates are processed
    :param number_of_states: number of states in each template
    :return: number_of_frames_in_state_for_each_template[i][j] represent for the number of frames in ith template jth state
    """
    number_of_frames_in_each_state_for_each_template = []
    for i in range(number_of_templates):
        # if length is 12 and there are 5 states,then 3 states have 2 frames and 2 states have 3 frames,
        # we call 2 the small number and 3 the big number;divmod keeps both values integers
        small_number, number_of_big_number = divmod(len(templates[i]), number_of_states)
        number_of_frames_in_each_state = [small_number] * (number_of_states - number_of_big_number)
        # the states that get one extra frame are put at the end
        number_of_frames_in_each_state.extend([small_number + 1] * number_of_big_number)
        number_of_frames_in_each_state_for_each_template.append(number_of_frames_in_each_state)
    return number_of_frames_in_each_state_for_each_template
def get_edge_cost(number_of_frames_in_each_state_for_each_template, number_of_templates, number_of_states=5):
    """
    get edge cost from one state to other state
    the cost of a transition is -log(number of times it is taken / number of frames in the source state),
    a transition that is never taken,or that leaves a state without any frame,costs inf
    the last frame of a template has no successor and is counted as staying in its own state
    :param number_of_frames_in_each_state_for_each_template: number_of_frames_in_state_for_each_template[i][j] represent for the number of frames in ith template jth state
    :param number_of_templates: only the first number_of_templates templates are counted
    :param number_of_states: number of states in each template
    :return: a edge cost list and state for each template,
    edge_cost[i][j] represent for the cost transform from state i to j,begin from 1,0 represents for dummy state
    state_for_each_template[i][j] represent for the state of ith template's jth frame,begin from 0,no dummy state
    """
    infinity = float('inf')
    number_of_frames_before_this_state_for_each_template = []
    # state_for_each_template[i][j] represent for the state of ith template's jth frame,begin from 0
    state_for_each_template = []
    for number_of_frames_in_each_state in number_of_frames_in_each_state_for_each_template:
        # running total: how many frames of this template come before each state
        number_of_frames_before_this_state = [0] * number_of_states
        for j in range(1, number_of_states):
            number_of_frames_before_this_state[j] = number_of_frames_before_this_state[j - 1] \
                                                    + number_of_frames_in_each_state[j - 1]
        number_of_frames_before_this_state_for_each_template.append(number_of_frames_before_this_state)
        # expand the counts into one state label per frame
        state = []
        for j in range(number_of_states):
            state.extend([j] * number_of_frames_in_each_state[j])
        state_for_each_template.append(state)

    last_state = number_of_states - 1
    edge_cost = []  # edge_cost[i][j] represent for the cost transform from i to j
    for i in range(-1, number_of_states):
        edge_cost_i = [0 for j in range(number_of_states + 1)]
        number_of_frames_in_ith_state = 0
        number_of_frames_that_next_frame_in_state = [0] * number_of_states
        if i == -1:  # begin state: every template leaves it exactly once,into the state of its first frame
            number_of_frames_in_ith_state = number_of_templates
            for j in range(number_of_templates):
                number_of_frames_that_next_frame_in_state[state_for_each_template[j][0]] += 1
        else:
            for j in range(number_of_templates):
                frames_in_this_state = number_of_frames_in_each_state_for_each_template[j][i]
                number_of_frames_in_ith_state += frames_in_this_state
                # a template that never visits state i cannot leave it,
                # and the last state can't translate to other state
                if frames_in_this_state == 0 or i == last_state:
                    continue
                index_of_next_frame = number_of_frames_before_this_state_for_each_template[j][i + 1]
                # the template may end inside state i,then there is no next frame to move to
                if index_of_next_frame < len(state_for_each_template[j]):
                    number_of_frames_that_next_frame_in_state[state_for_each_template[j][index_of_next_frame]] += 1
            # every frame that does not leave the state stays in it
            number_of_frames_that_next_frame_in_state[i] = number_of_frames_in_ith_state - sum(
                number_of_frames_that_next_frame_in_state)
        for j in range(number_of_states):
            # if ith state has no frame,then we think it's impossible for any frame transform to ith state
            if number_of_frames_in_ith_state == 0 or number_of_frames_that_next_frame_in_state[j] == 0:
                edge_cost_i[j + 1] = infinity
            else:
                edge_cost_i[j + 1] = -math.log(
                    number_of_frames_that_next_frame_in_state[j] / float(number_of_frames_in_ith_state))
        edge_cost_i[0] = infinity  # nothing ever moves back to the dummy state
        edge_cost.append(edge_cost_i)
    return edge_cost, state_for_each_template
def get_covariance_and_mean_for_each_state(templates, state_for_each_template, number_of_states=5):
    """
    get covariance and mean for each state
    the frames of all templates are pooled by the state they are assigned to,
    then get_covariance is applied to every pool
    :param templates: list,each element of it is a template,each element of a template is a frame,which has 39 features
    :param state_for_each_template: state_for_each_template[i][j] represent for the state of ith template's jth frame,begin from 0
    :param number_of_states: number of states in each template
    :return: covariance_matrix, a list,covariance_matrix[i] represent for ith state's covariance matrix
    :return: mean,a list,mean[i] represent for ith state's mean
    """
    # frames_in_each_state[i] represent for ith state's frames
    frames_in_each_state = [[] for _ in range(number_of_states)]
    for template_index, states_of_one_template in enumerate(state_for_each_template):
        for frame_index, state in enumerate(states_of_one_template):
            frames_in_each_state[state].append(templates[template_index][frame_index])
    covariance_matrix = []  # covariance_matrix[i] represent for ith state's covariance matrix
    mean = []  # mean[i] represent for ith state's mean
    for frames_in_one_state in frames_in_each_state:
        temp_covariance_matrix, temp_mean = get_covariance(numpy.array(frames_in_one_state))
        covariance_matrix.append(temp_covariance_matrix)
        mean.append(temp_mean)
    return covariance_matrix, mean
def get_number_of_frames_in_each_state_for_each_template_by_state_for_each_template(state_for_each_template,
                                                                                    number_of_states):
    """
    count,for every template,how many of its frames are assigned to each state
    :param state_for_each_template: state_for_each_template[i][j] represent for the state of ith template's jth frame,begin from 0,no dummy state
    :param number_of_states: number of states in each template
    :return: number_of_frames_in_state_for_each_template[i][j] represent for the number of frames in ith template jth state
    """
    number_of_frames_in_each_state_for_each_template = []
    for states_of_one_template in state_for_each_template:
        number_of_frames_in_each_state = [0] * number_of_states
        for state in states_of_one_template:
            number_of_frames_in_each_state[state] += 1
        number_of_frames_in_each_state_for_each_template.append(number_of_frames_in_each_state)
    return number_of_frames_in_each_state_for_each_template
def k_means(templates, number_of_states=5):
    """
    run segmental k-means on the templates:split every template evenly into states with initialize_states,
    then refine that segmentation with train_model
    frames in each state are assumed to be continuous
    :param templates: list,each element of it is a template,each element of a template is a frame,which has 39 features
    :param number_of_states: number of states in each template
    :return: the trained model,exactly what train_model returns
    """
    # initial_segmentation[i][j] represent for the number of frames in ith template jth state
    initial_segmentation = initialize_states(templates, len(templates), number_of_states)
    return train_model(templates, number_of_states, initial_segmentation)
def train_model(templates, number_of_states, number_of_frames_in_each_state_for_each_template):
    """
    refine a segmentation of the templates into states until it stops changing
    each round aligns every template against the current state means with DTW.DTW,
    reads the new state of every frame from the returned path,
    then re-estimates the edge costs,covariance matrices and means from the new segmentation
    NOTE: there is no cap on the number of rounds,the loop only ends when no template changes its segmentation
    :param templates: list,each element of it is a template,each element of a template is a frame,which has 39 features
    :param number_of_states: number of states in each template
    :param number_of_frames_in_each_state_for_each_template: the initial segmentation,
    element [i][j] represent for the number of frames in ith template jth state
    :return: a tuple of 6 elements:
    a one-element list holding the state means as plain lists,
    edge_cost,covariance_matrix and mean of each state,
    state_for_each_template,number_of_frames_in_each_state_for_each_template
    """
    number_of_templates = len(templates)
    edge_cost, state_for_each_template = get_edge_cost(number_of_frames_in_each_state_for_each_template,
                                                       number_of_templates, number_of_states)
    covariance_matrix, mean = get_covariance_and_mean_for_each_state(templates, state_for_each_template,
                                                                     number_of_states)
    cluster_changed = True
    while cluster_changed:
        cluster_changed = False
        viterbi_search_object = DTW.DTW([mean])
        for i in range(len(templates)):
            number_of_frames = len(templates[i])
            # only the path of the alignment is needed here
            _, _, path = viterbi_search_object.DTW(templates[i][:], strategy=0, cost_function=1,
                                                   covariance_matrix=covariance_matrix,
                                                   edge_cost=edge_cost,
                                                   number_of_templates=number_of_templates)
            temp_state_for_one_template = [0] * number_of_frames
            # the path is read backwards and its states begin from 1,so 1 is subtracted
            # NOTE(review): this assumes the path format produced by DTW.DTW - confirm against that module
            for j in range(number_of_frames - 1):
                temp_state_for_one_template[j] = path[number_of_frames - 2 - j][1] - 1
            # the last frame is always put in the last state
            temp_state_for_one_template[number_of_frames - 1] = number_of_states - 1
            if temp_state_for_one_template != state_for_each_template[i]:
                cluster_changed = True
                state_for_each_template[i] = temp_state_for_one_template[:]
        # re-estimate the model from the new segmentation
        number_of_frames_in_each_state_for_each_template = \
            get_number_of_frames_in_each_state_for_each_template_by_state_for_each_template(
                state_for_each_template, number_of_states)
        edge_cost = get_edge_cost(number_of_frames_in_each_state_for_each_template,
                                  number_of_templates, number_of_states)[0]
        covariance_matrix, mean = get_covariance_and_mean_for_each_state(templates, state_for_each_template,
                                                                         number_of_states)
    # build a real list of lists,map() alone would be a lazy iterator on Python 3
    mean_as_lists = [list(state_mean) for state_mean in mean]
    return ([mean_as_lists], edge_cost, covariance_matrix, mean, state_for_each_template,
            number_of_frames_in_each_state_for_each_template)
def train_continuous_model(input_list, continuous_words, number_of_states_for_each_word=5):
    """
    train a model for a sequence of words surrounded by silence
    an isolated model is trained with k_means for every word and the states of all words are concatenated,
    every input is then aligned to this chain of states (a frame may stay in its state or move to the next one)
    to get an initial segmentation,which is finally refined by train_model
    the size of the state chain and of every input,every alignment path and the final segmentation are printed
    :param input_list: list,each element of it is a recording of the whole word sequence,
    each element of a recording is a frame
    :param continuous_words: the words in the sequence,each of them is converted with int()
    and passed to SR.get_isolated_templates
    :param number_of_states_for_each_word: number of states used for every word and for each silence
    :return: the trained model,exactly what train_model returns
    """
    number_of_frames_in_each_state_for_each_template = []
    number_of_words = len(continuous_words) + 2  # 2 accounts for the silence at the beginning and at the end
    number_of_states = number_of_states_for_each_word * number_of_words
    number_of_frames_in_each_state = []  # element i is the number of frames in state i
    template = []  # the means of all states,word after word
    covariance_matrix_in_each_state = []
    mean_in_each_state = []
    for word_index in range(number_of_words):
        if word_index == 0 or word_index == number_of_words - 1:
            word = 10  # silence
        else:
            word = int(continuous_words[word_index - 1])
        mfcc_list = SR.get_isolated_templates(word)
        # the isolated model must have as many states as this function reserves for a word
        model = k_means(mfcc_list, number_of_states_for_each_word)
        # model[5][t][s] is the number of frames of template t in state s,sum it over the templates
        frames_in_each_state_of_this_word = [0] * number_of_states_for_each_word
        for frames_in_each_state_of_one_template in model[5]:
            for state in range(number_of_states_for_each_word):
                frames_in_each_state_of_this_word[state] += frames_in_each_state_of_one_template[state]
        number_of_frames_in_each_state.extend(frames_in_each_state_of_this_word)
        template.extend(model[0][0])
        covariance_matrix_in_each_state.extend(model[2])
        mean_in_each_state.extend(model[3])

    number_of_model_states = len(template)
    # the alignment has to end in one of the states of the trailing silence
    first_final_state = number_of_model_states - number_of_states_for_each_word
    for input_feature in input_list:
        number_of_input_frames = len(input_feature)
        print('%d %d' % (number_of_model_states, number_of_input_frames))
        # previous_state[i][j] is the state of frame i - 1 on the best path that puts frame i in state j
        previous_state = [[0] * number_of_model_states for _ in range(number_of_input_frames)]
        cost_matrix = [[float('inf')] * number_of_model_states for _ in range(number_of_input_frames)]
        # every alignment starts in the first state
        cost_matrix[0][0] = get_mahalanobis_distance(covariance_matrix_in_each_state[0],
                                                     mean_in_each_state[0], input_feature[0])[1]
        for i in range(1, number_of_input_frames):
            for j in range(number_of_model_states):
                transform_list = [j] if j == 0 else [j - 1, j]
                # stays j when no predecessor is reachable,the cost of this cell is inf in that case
                best_previous_state = j
                for last_state in transform_list:
                    # NOTE(review): these look like transition probabilities used directly as costs,
                    # get_edge_cost uses -log(probability) instead - confirm which one is intended
                    # NOTE(review): 10.0 is presumably the number of isolated templates per word - TODO confirm
                    # a state without any frame raises ZeroDivisionError here
                    if j == last_state:
                        edge_cost = (number_of_frames_in_each_state[j] - 10.0) / number_of_frames_in_each_state[j]
                    else:
                        edge_cost = 10.0 / number_of_frames_in_each_state[last_state]
                    new_cost = cost_matrix[i - 1][last_state] + edge_cost
                    if cost_matrix[i][j] > new_cost:
                        best_previous_state = last_state
                        cost_matrix[i][j] = new_cost
                cost_matrix[i][j] += get_mahalanobis_distance(covariance_matrix_in_each_state[j],
                                                              mean_in_each_state[j], input_feature[i])[1]
                previous_state[i][j] = best_previous_state
        # pick the cheapest final state,searching only the states of the trailing silence
        final_costs = cost_matrix[number_of_input_frames - 1][first_final_state:number_of_model_states]
        cur_index = first_final_state + final_costs.index(min(final_costs))
        # walk back from the last frame,path holds the states from the last frame to the first one
        path = []
        for i in reversed(range(number_of_input_frames)):
            path.append(cur_index)
            cur_index = previous_state[i][cur_index]
        print(path)
        number_of_frames_in_each_state_in_one_template = [0] * number_of_states
        for state in path:
            number_of_frames_in_each_state_in_one_template[state] += 1
        number_of_frames_in_each_state_for_each_template.append(number_of_frames_in_each_state_in_one_template)
    print(number_of_frames_in_each_state_for_each_template)
    return train_model(input_list, number_of_states, number_of_frames_in_each_state_for_each_template)