/
elbow.py
230 lines (186 loc) · 7.88 KB
/
elbow.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
# coding:utf-8
# 支持中文显示
from pylab import *
# 指定默认字体
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['font.family'] = 'sans-serif'
# 用来正常显示负号
plt.rcParams['axes.unicode_minus'] = False
import matplotlib.pyplot as plt
import readAndWriteDataSet
import multiprocessing
import numpy as np
import tools
import kmeans
import pca
# 确定维度,改变聚类中心数量
# 确定聚类中心数量,改变维度
def elbow(channelDataAll, low, high, step, a, iRateOrK, schedule, type):
# 检查参数合理性
if low <= 0:
print(u'下限太低:下限小于等于0!')
return
if type == 0 and high > len(channelDataAll):
print(u'上限太高:聚类中心个数大于数据的个数!')
return
if type == 1 and high >= (shape(channelDataAll[0])[1] / (1 << a)):
print(u'上限太高:降维后维度数大于原数据维度!')
return
# 计算要聚类的总次数
schedule[0] = ((int)((high - low) / step + 1)) * (1 << a)
# 利用SSE选择k
SSE_S = [] # 存放每次结果
SSE_C = [] # 存放每次结果
SSE_U = [] # 存放每次结果
ps = multiprocessing.Pool(4)
# 维度固定iRateOrK == iRate
if type == 0:
for i in range(low, high + 1, step):
# SSE = ps.apply_async(elbowCore, args=(channelDataAll, a, i, iRateOrK, schedule)).get()
SSE = elbowCore(channelDataAll, a, i, iRateOrK, schedule)
SSE_S.append(SSE[0])
SSE_C.append(SSE[1])
SSE_U.append(SSE[2])
plt.xlabel(u'聚类中心数量K')
# 聚类中心数量固定iRateOrK == k
if type == 1:
for i in range(low, high + 1, step):
# SSE = ps.apply_async(elbowCore, args=(channelDataAll, a, iRateOrK, i, schedule)).get()
SSE = elbowCore(channelDataAll, a, iRateOrK, i, schedule)
SSE_S.append(SSE[0])
SSE_C.append(SSE[1])
SSE_U.append(SSE[2])
plt.xlabel(u'保留维度k')
# 修饰一下曲线
for i in range(1, np.shape(SSE_C)[0]):
if SSE_C[i] < SSE_C[i - 1]:
next = i + 1
while next < np.shape(SSE_C)[0] and SSE_C[next] < SSE_C[i - 1]:
next += 1
if (next < np.shape(SSE_C)[0]):
SSE_C[i] = ((SSE_C[next] - SSE_C[i - 1]) / (next - i + 1)) + SSE_C[i - 1]
else:
SSE_C[i] = 2 * SSE_C[i - 1] - SSE_C[i]
for i in range(1, np.shape(SSE_U)[0]):
if SSE_U[i] < SSE_U[i - 1]:
next = i + 1
while next < np.shape(SSE_U)[0] and SSE_U[next] < SSE_U[i - 1]:
next += 1
if (next < np.shape(SSE_U)[0]):
SSE_U[i] = ((SSE_U[next] - SSE_U[i - 1]) / (next - i + 1)) + SSE_U[i - 1]
else:
SSE_U[i] = 2 * SSE_U[i - 1] - SSE_U[i]
ps.close()
ps.join()
X = range(low, high + 1, step)
plt.ylabel(u'特征值保留')
plt.plot(X, SSE_S, 'k-s', label=u'标准PCA')
plt.plot(X, SSE_C, 'r-v', label=u'聚类协方差矩阵')
plt.plot(X, SSE_U, 'b-o', label=u'聚类变换矩阵')
plt.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=0,
ncol=3, mode="expand", borderaxespad=0.)
plt.show()
print(u'主进程结束!')
# k:聚类中心数量 iRate:维度
def elbowCore(channelDataAll, a, k, iRate, schedule):
n = np.shape(channelDataAll[0])[1] # 列数
p = len(channelDataAll) # 页数
sub = n >> a
rates_C = []
rates_U = []
rates_S = []
for g in range(1 << a):
# 显示进度
schedule[1] += 1
tmpSchedule = schedule[1]
print(u'共' + str(schedule[0]) + u'部分,' + u'第' + str(tmpSchedule) + u'部分开始!')
channelData = []
for h in range(p):
channelDataPage = channelDataAll[h]
channelData.append(channelDataPage[:, g * sub:(g + 1) * sub])
covMatrixList = tools.getCovMatrixList(channelData)
allCovMatrix = tools.matrixListToMatrix(covMatrixList)
# 对协方差进行聚类
centroids, clusterAssment = kmeans.KMeansOushi(allCovMatrix, k)
centroidList = tools.matrixToMatrixList(centroids)
# 计算原信道信息量、协方差矩阵特征值、变换矩阵
informations, SigmaList, UList = tools.getInformations(covMatrixList)
# 分析PCA效果,计算信息量保留程度
tmpRates = pca.pca(channelData, informations, centroidList, clusterAssment, iRate)[3][0][:, 1]
rates_C.append(np.mean(tmpRates))
# 对变换矩阵进行聚类
allU = tools.matrixListToMatrix_U(UList)
weights = tools.matrixListToMatrix_U(SigmaList)
centroids, clusterAssment = kmeans.KMeansOushi_U(allU, k, weights, iRate)
centroidList = tools.matrixToMatrixList_U(centroids)
# 分析PCA效果,计算信息量保留程度
tmpRates = pca.pca_U(channelData, informations, centroidList, clusterAssment, iRate)[3][0][:, 1]
rates_U.append(np.mean(tmpRates))
# 不聚类,直接PCA
tmpRates = pca.pca_S(SigmaList, iRate)[0][:, 1]
rates_S.append(np.mean(tmpRates))
# 显示进度
print(u'共' + str(schedule[0]) + u'部分,' + u'第' + str(tmpSchedule) + u'部分完成,' + u'已完成' + str(schedule[1]) + u'部分,' + u'完成度:' + '%.2f%%' % (schedule[1] / schedule[0] * 100) + u'!')
rate_C = np.mean(rates_C)
rate_U = np.mean(rates_U)
rate_S = np.mean(rates_S)
return rate_S.real, rate_C.real, rate_U.real
# 用标准PCA选择k
def elbow2(channelDataAll, low, high, step, a, schedule):
# 检查参数合理性
if low <= 0:
print(u'下限太低:下限小于等于0!')
return
if high >= (shape(channelDataAll[0])[1] / (1 << a)):
print(u'上限太高:降维后维度数大于原数据维度!')
return
# 计算PCA的总次数
time1 = ((int)((high - low) / step + 1))
time2 = 1 << a
schedule[0] = time2
# 利用SSE选择k
SSE_S = [] # 存放所有结果
rates_S = np.array(np.zeros((time2, time1))) # 存放单次结果
n = np.shape(channelDataAll[0])[1] # 列数
p = len(channelDataAll) # 页数
sub = n >> a
for g in range(time2):
channelData = []
for h in range(p):
channelDataPage = channelDataAll[h]
channelData.append(channelDataPage[:, g * sub:(g + 1) * sub])
covMatrixList = tools.getCovMatrixList(channelData)
# 计算原信道信息量、协方差矩阵特征值、变换矩阵
informations, SigmaList, UList = tools.getInformations(covMatrixList)
for h in range(time1):
tmpRates = pca.pca_S(SigmaList, h * step + low)[0][:, 1]
rates_S[g, h] = np.mean(tmpRates).real
# 显示进度
schedule[1] += 1
print(u'共' + str(schedule[0]) + u'轮,' + u'已完成' + str(schedule[1]) + u'轮,' + u'完成度:' + '%.2f%%' % (
schedule[1] / schedule[0] * 100) + u'!')
for h in range(time1):
SSE_S.append(np.mean(rates_S[:, h]))
plt.xlabel(u'保留维度数k')
X = range(low, high + 1, step)
plt.ylabel(u'特征值保留')
plt.plot(X, SSE_S, 'k-s')
plt.show()
print(u'主进程结束!')
if __name__ == '__main__':
manager = multiprocessing.Manager()
schedule = manager.Array('i', [1, 0])
path = u'/Users/jinruimeng/Downloads/keyan/'
# path = u'E:\\workspace\\keyan\\'
a = 0
# 读取数据
channelDataPath = path + "channelDataP.xlsx"
channelDataAll = readAndWriteDataSet.excelToMatrixList(channelDataPath)
# 确定维度,改变聚类中心数量
iRate = 5
elbow(channelDataAll, 1, 16, 1, a, iRate, schedule, 0)
# 确定聚类中心数量,改变维度
# k = 5
# elbow(channelDataAll, 3, 5, 1, a, k, schedule, 1)
# 用标准PCA选择k
# elbow2(channelDataAll, 1, 11, 2, a, schedule)