-
Notifications
You must be signed in to change notification settings - Fork 1
/
Segmentation.py
239 lines (176 loc) · 7.69 KB
/
Segmentation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
from pyAudioAnalysis import audioBasicIO
from pyAudioAnalysis import audioFeatureExtraction
import numpy as np
import math
import matplotlib.pyplot as plt
# main process
# Load the example recording. The two values returned by readAudioFile are
# unpacked as Fs and x (presumably the sampling rate in Hz and the sample
# array -- TODO confirm against the pyAudioAnalysis documentation).
[Fs, x] = audioBasicIO.readAudioFile("data/diarizationExample.wav")
TIME_OF_WINDOW = 0.050 # analysis window length: 0.050 s
TIME_OF_STEP = 0.025 # hop between consecutive windows: 0.025 s
SIZE_OF_WINDOW = int(TIME_OF_WINDOW * Fs) # window length scaled by Fs (samples, assuming Fs is in Hz)
SIZE_OF_STEP = int(TIME_OF_STEP * Fs) # step length scaled by Fs (samples, assuming Fs is in Hz)
BLOCK_SIZE = 4 # block length in feature frames: the loop's first block spans indices 0..BLOCK_SIZE inclusive, the second spans BLOCK_SIZE frames
BLOCK_STEP = 2 # number of feature frames both blocks advance per loop iteration
# variables
END_OF_FILE = 0 # the segmentation loop sets this to 1 to stop iterating
FIRST_PAIR = 1 # stays 1 until the first pair of blocks has been built
INDEX_BOUCLE = 1 # 1-based loop counter ("boucle" = loop); the loop writes row INDEX_BOUCLE-1 of relation
def getMFCCs(block_start, block_end):
    """Return rows 8..19 of the global ``attribute`` matrix for a frame range.

    Args:
        block_start: index of the first feature frame (column) to include.
        block_end: index of the last feature frame to include (inclusive).

    Returns:
        The sub-matrix ``attribute[8:20, block_start:block_end + 1]``. Basic
        slicing of a NumPy array yields a view, so writing into the result
        writes into ``attribute``.

    NOTE(review): the function name implies rows 8:20 are the MFCC rows of
    the pyAudioAnalysis feature matrix; that layout is not visible in this
    file. If the library emits 13 MFCCs starting at row 8, this slice keeps
    only the first 12 -- confirm that is intended.
    """
    frame_span = slice(block_start, block_end + 1)  # +1 makes block_end inclusive
    return attribute[8:20, frame_span]
def getMFCCsFromTime(moment_start, moment_end):
    """Return the MFCC rows for the frames between two timestamps.

    Each timestamp is converted to a frame index with
    ``int(t / BLOCK_STEP / TIME_OF_STEP - 1)`` and the inclusive range is
    handed to ``getMFCCs``.

    Args:
        moment_start: start of the span (same time unit as TIME_OF_STEP).
        moment_end: end of the span (same time unit as TIME_OF_STEP).

    Returns:
        Whatever ``getMFCCs(block_start, block_end)`` returns for the
        converted indices.

    NOTE(review): the segmentation loop computes timestamps as
    ``index * TIME_OF_STEP`` without dividing by BLOCK_STEP -- confirm that
    the extra ``/ BLOCK_STEP`` here is the intended inverse.
    """
    def _to_frame_index(moment):
        # The "- 1" in the conversion maps times shorter than one block step
        # to -1, which NumPy would read as "last column" and turn the request
        # into an empty or wrong slice. Clamp to the first frame instead.
        return max(int(moment / BLOCK_STEP / TIME_OF_STEP - 1), 0)

    return getMFCCs(_to_frame_index(moment_start), _to_frame_index(moment_end))
def gauss(x, mean, cov):
    """Score a block of feature columns against a mean and covariance.

    Args:
        x: 2-D array of shape (n, d). The first axis (n) is the one matched
            against ``cov``; the second axis (d) is the one summed over.
        mean: n values; reshaped internally to a (1, n) row.
        cov: 2-D array that must have shape (n, n).

    Returns:
        1-D array of length n equal to
        ``exp(-0.5 * fact) / ((2 * pi) ** n * std(cov))`` where
        ``fact = sum((cov.T . xc) * xc, axis=1)`` and ``xc`` is ``x`` with
        the mean subtracted from every column.

    Raises:
        Exception: if ``cov`` is not (n, n).

    NOTE(review): this resembles a multivariate normal density but differs
    from the textbook form: it uses ``cov.T`` where the inverse covariance
    would normally appear, normalises by ``std(cov)`` rather than
    ``sqrt(det(cov))``, and sums over axis 1. The behaviour is preserved
    here unchanged; confirm with the author before relying on the values
    as likelihoods.
    """
    n, d = x.shape
    cov_rows, cov_cols = cov.shape
    if cov_rows != n or cov_cols != n:
        raise Exception("Dimension of the covariance matrix and data should match")

    # Replicate the mean across all d columns so it can be subtracted from x.
    mean_row = np.reshape(mean, (1, n))
    centred = x - np.dot(np.ones((d, 1)), mean_row).T

    weights = cov.T
    fact = (np.dot(weights, centred) * centred).sum(axis=1)

    scale = math.pow(2 * math.pi, n) * np.std(cov)
    return np.exp(-0.5 * fact) / scale
# feature extraction from the library pyAudioAnalysis
attribute = audioFeatureExtraction.stFeatureExtraction(x, Fs, SIZE_OF_WINDOW, SIZE_OF_STEP)


def _block_log_likelihood(block):
    """Return log(gauss(...)) of ``block`` scored against its own mean and covariance."""
    return np.log(gauss(block, mean=np.mean(block, axis=1), cov=np.cov(block)))


# relation[k] = [similarity score, timestamp] for the k-th pair of adjacent
# blocks. Every row is pre-filled with [1, 1]; rows the loop never reaches
# keep that placeholder, still pass the "> 0" filter below, and are trimmed
# later by the clustering script.
# "//" keeps the Python 2 integer row count and also works on Python 3.
relation = [[1 for col in range(2)] for row in range(attribute.shape[1] // BLOCK_STEP)]

while END_OF_FILE == 0:
    if FIRST_PAIR == 1:
        # First pair: block i covers frames 0..BLOCK_SIZE (getMFCCs treats
        # its end index as inclusive), block j covers the next BLOCK_SIZE
        # frames.
        block_i_index_start = 0
        block_i_index_end = BLOCK_SIZE
        block_i_attribute = getMFCCs(block_i_index_start, block_i_index_end)

        block_j_index_start = block_i_index_end + 1
        block_j_index_end = block_j_index_start + BLOCK_SIZE - 1
        block_j_attribute = getMFCCs(block_j_index_start, block_j_index_end)
        FIRST_PAIR = 0
    else:
        # Every later pair: slide both blocks forward by BLOCK_STEP frames.
        block_j_index_start += BLOCK_STEP
        block_j_index_end += BLOCK_STEP
        new_attribute = getMFCCs(block_j_index_end - BLOCK_STEP + 1, block_j_index_end)
        # Normally BLOCK_STEP; can be smaller when the slice is clipped at
        # the end of the recording.
        shift = new_attribute.shape[1]

        block_i_index_start += BLOCK_STEP
        block_i_index_end += BLOCK_STEP
        # Reuse the overlap instead of re-slicing. The blocks are rebuilt
        # with np.concatenate (fresh arrays) rather than shifted in place:
        # getMFCCs returns views, so in-place writes would overwrite
        # `attribute` itself and rely on overlapping-copy semantics.
        # Block i must be updated first: it takes block j's *old* leading
        # frames.
        block_i_attribute = np.concatenate(
            (block_i_attribute[:, shift:], block_j_attribute[:, :shift]), axis=1)
        block_j_attribute = np.concatenate(
            (block_j_attribute[:, shift:], new_attribute), axis=1)

    block_i_log_like = _block_log_likelihood(block_i_attribute)
    block_j_log_like = _block_log_likelihood(block_j_attribute)
    block_union_attribute = np.concatenate((block_i_attribute, block_j_attribute), axis=1)
    block_union_log_like = _block_log_likelihood(block_union_attribute)

    # Score: how much better two separate models fit than one joint model.
    relation[INDEX_BOUCLE - 1][0] = (np.sum(block_i_log_like) + np.sum(block_j_log_like)
                                     - np.sum(block_union_log_like))
    # Timestamp of the boundary between the two blocks. "//" reproduces the
    # Python 2 integer division of the original expression.
    relation[INDEX_BOUCLE - 1][1] = (block_i_index_end + block_j_index_start) // 2 * TIME_OF_STEP
    INDEX_BOUCLE += 1

    if block_j_index_end + BLOCK_STEP > attribute.shape[1]:
        END_OF_FILE = 1

# cut the audio: keep only the rows whose score is positive
relation_cut = [t for t in relation if t[0] > 0]
print("=========The data after cutting===========")
for item in relation_cut:
    print(item)
# print getMFCCsFromTime(0.4, 4.0).shape
#===========Kmeans Definition=====================
#calculate Euclidean distance
def euclDistance(vector1, vector2):
    """Return the Euclidean (L2) distance between two vectors.

    Args:
        vector1: first vector (list, ndarray, or a 1-row matrix).
        vector2: second vector, broadcast-compatible with ``vector1``.

    Returns:
        The square root of the summed squared differences, as a float scalar.
    """
    # sqrt/power were never imported into this file; use the np namespace.
    # asarray lets plain sequences and np.matrix rows be mixed freely.
    difference = np.asarray(vector2, dtype=float) - np.asarray(vector1, dtype=float)
    return np.sqrt(np.sum(np.power(difference, 2)))
# init centroids with random samples
def initCentroids(dataSet, k):
    """Pick ``k`` starting centroids by sampling rows of ``dataSet`` at random.

    Args:
        dataSet: 2-D array or matrix of shape (numSamples, dim).
        k: number of centroids to draw.

    Returns:
        ndarray of shape (k, dim). Rows are drawn independently, so the same
        sample may be chosen more than once.
    """
    numSamples, dim = dataSet.shape
    # zeros/random were never imported into this file; use the np namespace.
    centroids = np.zeros((k, dim))
    for i in range(k):
        # uniform() draws from [0, numSamples), so int() is a valid row index.
        index = int(np.random.uniform(0, numSamples))
        centroids[i, :] = dataSet[index, :]
    return centroids
# k-means cluster
def kmeans(dataSet, k):
    """Cluster the rows of ``dataSet`` into ``k`` groups with k-means.

    Args:
        dataSet: 2-D np.matrix / ndarray of shape (numSamples, dim).
        k: number of clusters.

    Returns:
        (centroids, clusterAssment) where ``centroids`` is a (k, dim) ndarray
        and ``clusterAssment`` is a (numSamples, 2) matrix: column 0 holds the
        cluster index of each sample, column 1 the squared distance to its
        centroid.
    """
    numSamples = dataSet.shape[0]
    # first column stores which cluster this sample belongs to,
    # second column stores the error between this sample and its centroid
    # (np.asmatrix: the bare mat/zeros names were never imported here)
    clusterAssment = np.asmatrix(np.zeros((numSamples, 2)))
    clusterChanged = True

    ## step 1: init centroids
    centroids = initCentroids(dataSet, k)

    while clusterChanged:
        clusterChanged = False
        ## for each sample
        for i in range(numSamples):
            # inf instead of a fixed 100000.0 cap, so samples farther than
            # that from every centroid are still assigned to the nearest one.
            minDist = float('inf')
            minIndex = 0
            ## step 2: find the centroid who is closest
            for j in range(k):
                distance = euclDistance(centroids[j, :], dataSet[i, :])
                if distance < minDist:
                    minDist = distance
                    minIndex = j

            ## step 3: update its cluster
            if clusterAssment[i, 0] != minIndex:
                clusterChanged = True
            # Refresh the error on every pass: centroids move even when the
            # sample's assignment does not.
            clusterAssment[i, :] = minIndex, minDist ** 2

        ## step 4: update centroids
        for j in range(k):
            pointsInCluster = dataSet[np.nonzero(clusterAssment[:, 0].A == j)[0]]
            # An empty cluster would average to NaN and poison every later
            # distance comparison; keep its previous centroid instead.
            if pointsInCluster.shape[0] > 0:
                centroids[j, :] = np.mean(pointsInCluster, axis=0)

    print('cluster complete!')
    return centroids, clusterAssment
def abs(input):
    """Return the magnitude of ``input``: its negation when below zero, else itself.

    Note: this definition shadows Python's built-in ``abs`` for the rest of
    the module.
    """
    return -input if input < 0 else input
print('=======')
# Build the 2-column data set for clustering: [score, 1] per row. The
# constant second column only exists to make the data two-dimensional.
# The original author notes the last 3 rows of relation_cut are meaningless
# (they appear to be the untouched [1, 1] placeholder rows of `relation`),
# so they are left out. max(..., 0) keeps the slice empty, as range() was,
# when fewer than 3 rows exist.
usable_rows = max(len(relation_cut) - 3, 0)
temprelation = [[row[0], 1] for row in relation_cut[:usable_rows]]

## step 1: load data
print("step 1: load data...")
dataSet = temprelation

# step 2: clustering...
print("step 2: clustering...")
# np.asmatrix: the bare name `mat` was never imported into this file.
dataSet = np.asmatrix(dataSet)
k = 4
centroids, clusterAssment = kmeans(dataSet, k)
# Score (column 0) of each centroid.
classification = [centroids[indexi][0] for indexi in range(k)]

## step 3: Mark speaker to each block
relation_cut_regonize = []  # [row of relation_cut, speaker label] pairs
centre_scores = np.asarray(classification, dtype=float)
for row in relation_cut[:len(dataSet)]:
    gaps = np.abs(centre_scores - row[0])
    if np.all(np.isnan(gaps)):
        # No centroid is comparable to this score; leave the block unlabelled.
        continue
    # Nearest centroid wins and ties go to the lowest index, so a block is
    # no longer dropped when two centroids are equally close. The label is
    # the 1-based centroid index.
    nearest = int(np.nanargmin(gaps))
    relation_cut_regonize.append([row, "Speaker%d" % (nearest + 1)])

#step4: shows the result
print("=========The data after regonizing as 4 speakers===========")
for item in relation_cut_regonize:
    print(item)