def condition_HXD(d, a, fenmu):
    """Compute the conditional entropy H(X|D) via Gaussian kernel density estimates.

    :param d: list / 1-D array -- class labels (same length as a)
    :param a: list / 1-D array -- continuous attribute values
    :param fenmu: 1-D ndarray -- denominator terms (density * n) for p(d|x)
    :return: the conditional entropy (docstring of the original said H(D|X);
             the name says H(X|D) -- NOTE(review): confirm which is intended)
    """
    d1 = np.unique(d)  # distinct class labels
    dD = {}                    # label -> attribute values belonging to that label
    lD = dict.fromkeys(d1, 0)  # label -> sample count
    dexD = {}                  # label -> original indices of those samples
    # Partition a (and its indices) by class label.
    for i in range(0, len(a)):
        dD.setdefault(d[i], []).append(a[i])
        dexD.setdefault(d[i], []).append(i)
        lD[d[i]] += 1
    atr = np.array(a)
    aa2 = atr.reshape(len(a), 1)  # 2-D column vector, required by KernelDensity
    # For each label, estimate the conditional density p(x|d) over all points
    # and collect the numerator n_d * p(x|d) of p(d|x).
    l = list()
    print("lenlD: ", len(lD))
    for j in lD:
        at = dD[j]
        aa = np.array(at)
        aa2t = aa.reshape(len(aa), 1)
        h = h1decision(np.array(at))  # bandwidth chosen from this label's data
        print("at: ", at)
        print("h: ", h)
        pad = kde.KernelDensity(kernel='gaussian', bandwidth=h).fit(aa2t).score_samples(aa2)
        pad = np.exp(pad)      # score_samples returns log-density
        tmp = len(at) * pad    # numerator of p(d|x)
        l.append(tmp)
        print(j, " done")
        print("sum tmp: ", sum(tmp))
    pdx = p_d_a(l, fenmu)  # p(d|x) for every label/sample pair
    # FIX: removed the original dangling statement `lpdx =` -- it was an
    # incomplete assignment (syntax error) and its value was never used.
    # Accumulate the entropy sum.
    # NOTE(review): `pad` here is the density from the LAST label iterated
    # above only, and it is not a log -- confirm this matches the intended
    # entropy formula before relying on the result.
    ree = 0
    for k in range(0, len(pad)):
        nlog_pad = np.array(pad[k])
        npda = np.array(pdx[k])
        ree += npda.dot(nlog_pad)
    hxd = (-1 / len(a)) * ree
    return hxd
def kde2D(x, y, bandwidth, xbins=100j, ybins=100j, **kwargs):
    """Fit a 2-D Gaussian KDE on the points (x, y) and evaluate it on a grid.

    :param x, y: 1-D coordinate arrays of the training points
    :param bandwidth: Gaussian kernel bandwidth
    :param xbins, ybins: grid resolution (imaginary step => number of points
                         for np.mgrid)
    :return: (xx, yy, z) -- the grid coordinate arrays and the density on them
    """
    xx, yy = np.mgrid[x.min():x.max():xbins, y.min():y.max():ybins]
    # Evaluation points and training points both use (y, x) column order.
    grid_points = np.vstack([yy.ravel(), xx.ravel()]).T
    training = np.vstack([y, x]).T
    estimator = kde.KernelDensity(bandwidth=bandwidth, kernel="gaussian")
    estimator.fit(training)
    log_density = estimator.score_samples(grid_points)
    density = np.exp(log_density)
    return xx, yy, density.reshape(xx.shape)
def kernel_gaussian(x_train, y_train):
    """Fit one Gaussian KDE per class label (1, 2, 3).

    :param x_train: 2-D sample matrix
    :param y_train: label vector aligned with the rows of x_train
    :return: the three fitted KernelDensity models, one per label
    """
    # Collect the row indices of each of the three classes.
    index1, index2, index3 = [], [], []
    for row, label in enumerate(y_train):
        if label == 1:
            index1.append(row)
        if label == 2:
            index2.append(row)
        if label == 3:
            index3.append(row)
    data1_train = x_train[index1, :]
    data2_train = x_train[index2, :]
    data3_train = x_train[index3, :]
    # One density model per class, shared kernel and bandwidth.
    clf1 = kde.KernelDensity(kernel='gaussian', bandwidth=0.4).fit(data1_train)
    clf2 = kde.KernelDensity(kernel='gaussian', bandwidth=0.4).fit(data2_train)
    clf3 = kde.KernelDensity(kernel='gaussian', bandwidth=0.4).fit(data3_train)
    return clf1, clf2, clf3
def ClusterOne(path_In_Source, path_In_Text, path_In_Product, path_In_AllPicture, path_Out_Product, path_Out_Text):
    """Cluster product position/size and text position/size with Gaussian KDEs.

    For each group of annotated pictures: find the densest product center on a
    50x50 grid, the densest product width/height on a 100x100 grid, filter
    pictures whose product box overlaps the cluster box enough, then repeat the
    density search for text position (relative to the product) and text size.

    :param path_In_*: inputs forwarded to ReadData
    :param path_Out_Product / path_Out_Text: output text files, one line per group
    :return: [outputP, outputT] -- product and text cluster rows
             [y, x, length, width] per group
    """
    TotalData = ReadData(path_In_Source, path_In_Text, path_In_Product, path_In_AllPicture)
    data = TotalData[0]
    output = []
    outputP = []
    outputT = []

    # --- Product position clustering ---
    ProductPosition = []
    for ccData in data:
        # Box centers: each[2] holds [y1, y2, x1, x2].
        ZPData = []
        for each in ccData:
            ZPData.append([(each[2][0] + each[2][1]) / 2, (each[2][2] + each[2][3]) / 2])
        pp = kde.KernelDensity(kernel='gaussian', bandwidth=0.3).fit(ZPData)
        # Evaluate the density on a 50x50 grid over [0, 1] x [0, 1].
        inputData = []
        iData = []
        for a1 in np.linspace(0, 1, 50):
            temp = []
            for a2 in np.linspace(0, 1, 50):
                inputData.append([a1, a2])
                temp.append(a2)
            iData.append(temp)
        iData = np.array(iData)
        outputData = pp.score_samples(inputData)
        # Reshape the flat log-density list into 50 rows of densities.
        count = 0
        tenData = []
        TTData = []
        for each in outputData:
            tenData.append(pow(np.e, each))
            count += 1
            if count == 50:
                TTData.append(tenData)
                tenData = []
                count = 0
        TTData = np.array(TTData)
        yData = iData.T
        xData = iData
        position = np.argmax(TTData)
        # FIX: decompose the flat argmax into (row, col). The original used
        # `y = position % 50 - 1`, which is off by one (and wraps to -1 when
        # the argmax lies in column 0); the matching row formula and the three
        # other argmax decompositions below show the intended %-// pattern.
        y = position % 50
        x = position // 50
        ProductPosition.append([yData[x][y], xData[x][y]])

    # --- Product width/height clustering ---
    ProductAll = []
    kP = 0
    with open(path_Out_Product, 'w') as f2:
        for ccData in data:
            ppData = []
            for each in ccData:
                ppData.append([each[2][1] - each[2][0], each[2][3] - each[2][2]])
            pp = kde.KernelDensity(kernel='gaussian', bandwidth=0.5).fit(ppData)
            inputData = []
            for a1 in np.linspace(0, 1, 100):
                for a2 in np.linspace(0, 1, 100):
                    inputData.append([a1, a2])
            outputData = pp.score_samples(inputData)
            density = []
            for each in outputData:
                density.append(pow(np.e, each))
            index = density.index(max(density))
            # index = 100 * (a1 row) + (a2 column)
            pw = np.linspace(0, 1, 100)[index % 100]
            # Simplified from int((index - pw) / 100), which computed the same
            # row since 0 <= pw <= 1 never crosses an integer boundary here.
            pl = np.linspace(0, 1, 100)[index // 100]
            ProductAll.append(
                [ProductPosition[kP][0], ProductPosition[kP][1], pl, pw])
            f2.write('\n' + str(ProductPosition[kP][0]) + ' ' + str(ProductPosition[kP][1]) + ' ' + str(pl) + ' ' + str(pw))
            outputP.append(
                [ProductPosition[kP][0], ProductPosition[kP][1], pl, pw])
            kP += 1

    # --- Picture filtering: keep pictures whose product box overlaps the
    # clustered box by at least `th` in both directions ---
    NewData = []
    th = 0.8
    cNum = 0
    for ccData in data:
        cSave = ccData[0]  # best fallback picture if nothing passes the filter
        yS1 = ProductAll[cNum][0] - ProductAll[cNum][2] * 0.5
        yS2 = ProductAll[cNum][0] + ProductAll[cNum][2] * 0.5
        xS1 = ProductAll[cNum][1] - ProductAll[cNum][3] * 0.5
        xS2 = ProductAll[cNum][1] + ProductAll[cNum][3] * 0.5
        NewCCData = []
        for each in ccData:
            pData = each[2]
            OArea = OverlapArea([yS1, yS2, xS1, xS2], pData)
            SArea = (yS2 - yS1) * (xS2 - xS1)
            PArea = (pData[1] - pData[0]) * (pData[3] - pData[2])
            if OArea / SArea >= th and OArea / PArea >= th:
                NewCCData.append(each)
            # Track the single best-overlapping picture as the fallback.
            pData1 = cSave[2]
            OArea1 = OverlapArea([yS1, yS2, xS1, xS2], pData1)
            SArea1 = (yS2 - yS1) * (xS2 - xS1)
            PArea1 = (pData1[1] - pData1[0]) * (pData1[3] - pData1[2])
            if OArea / SArea + OArea / PArea > OArea1 / SArea1 + OArea1 / PArea1:
                cSave = each
        if len(NewCCData) == 0:
            NewCCData.append(cSave)
        NewData.append(NewCCData)
        cNum += 1

    # --- Text position clustering (relative to the product position) ---
    TextPosition = []
    k = 0
    for ttData in NewData:
        TRPData = []
        for each in ttData:
            TRPData.append([
                (each[1][0] + each[1][1]) / 2 - ProductPosition[k][0],
                (each[1][2] + each[1][3]) / 2 - ProductPosition[k][1]
            ])
        k += 1
        pp = kde.KernelDensity(kernel='gaussian', bandwidth=0.5).fit(TRPData)
        inputData = []
        for a1 in np.linspace(-1, 1, 100):
            for a2 in np.linspace(-1, 1, 100):
                inputData.append([a1, a2])
        outputData = pp.score_samples(inputData)
        density = []
        for each in outputData:
            density.append(pow(np.e, each))
        index = density.index(max(density))
        xR = np.linspace(-1, 1, 100)[index % 100]
        # Simplified from int((index - xR) / 100); equivalent row index.
        yR = np.linspace(-1, 1, 100)[index // 100]
        TextPosition.append([yR, xR])

    # --- Text size clustering ---
    kT = 0
    with open(path_Out_Text, 'w') as f3:
        for ttData in NewData:
            tpData = []
            for each in ttData:
                tpData.append([each[1][1] - each[1][0], each[1][3] - each[1][2]])
            pp = kde.KernelDensity(kernel='gaussian', bandwidth=0.5).fit(tpData)
            inputData = []
            for a1 in np.linspace(0, 1, 100):
                for a2 in np.linspace(0, 1, 100):
                    inputData.append([a1, a2])
            outputData = pp.score_samples(inputData)
            density = []
            for each in outputData:
                density.append(pow(np.e, each))
            index = density.index(max(density))
            tw = np.linspace(0, 1, 100)[index % 100]
            # Simplified from int((index - tw) / 100); equivalent row index.
            tl = np.linspace(0, 1, 100)[index // 100]
            f3.write('\n' + str(TextPosition[kT][0]) + ' ' + str(TextPosition[kT][1]) + ' ' + str(tl) + ' ' + str(tw))
            outputT.append([TextPosition[kT][0], TextPosition[kT][1], tl, tw])
            kT += 1

    output.append(outputP)
    output.append(outputT)
    return output
# NOTE(review): stray fragment -- this `return` belongs to a function whose
# `def` line is not visible in this chunk; at module level it is a syntax
# error, so this file will not import as-is.
return [hr, wr]
# Top-level script: product position clustering (same grid-density pattern
# as ClusterOne, but with a hard-coded output path and no-arg ReadData).
TotalData = ReadData()
data = TotalData[0]
path = "G:/浙大实习/text_product聚类/One2TwoProduct.txt"
# Product position clustering
ProductPosition = []
for ccData in data:
    # Box centers from each[2] = [y1, y2, x1, x2].
    ZPData = []
    for each in ccData:
        ZPData.append([(each[2][0] + each[2][1]) / 2, (each[2][2] + each[2][3]) / 2])
    pp = kde.KernelDensity(kernel='gaussian', bandwidth=0.3).fit(ZPData)
    # Build a 50x50 evaluation grid over [0, 1] x [0, 1].
    inputData = []
    iData = []
    for a1 in np.linspace(0, 1, 50):
        temp = []
        for a2 in np.linspace(0, 1, 50):
            inputData.append([a1, a2])
            temp.append(a2)
        iData.append(temp)
    iData = np.array(iData)
    outputData = pp.score_samples(inputData)  # log-densities on the grid
    count = 0
    tenData = []
    TTData = []
    # Convert log-densities back to densities; the rest of this loop body
    # lies outside this chunk (fragment is cut off here).
    for each in outputData:
        tenData.append(pow(np.e, each))
def estimate_pdf_kde(samples):
    """Fit and return a Gaussian kernel density estimator on `samples`."""
    return kde.KernelDensity(kernel='gaussian', bandwidth=0.2).fit(samples)
def cal_prob_smooth(vec, sample):
    """Return the KDE-smoothed probability density of `vec` under `sample`.

    :param vec: 1-D query vector (reshaped to a single row for scoring)
    :param sample: 2-D training samples for the density estimate
    :return: length-1 ndarray with the density at `vec`
    """
    estimator = kde.KernelDensity(kernel='gaussian', bandwidth=0.2)
    estimator.fit(sample)
    log_prob = estimator.score_samples(vec.reshape(1, -1))
    return np.exp(log_prob)
# NOTE(review): fragment -- the opening of the first np.delete(...) call and
# the surrounding cross-validation loop over fold index t lie outside this
# chunk. Each np.delete removes the 10 rows of the current fold from the
# per-class training matrices X1/X2/X3 (leave-one-fold-out).
    10 * t + 6, 10 * t + 7, 10 * t + 8, 10 * t + 9
], 0)
X2 = np.delete(X2, [
    10 * t, 10 * t + 1, 10 * t + 2, 10 * t + 3, 10 * t + 4, 10 * t + 5,
    10 * t + 6, 10 * t + 7, 10 * t + 8, 10 * t + 9
], 0)
X3 = np.delete(X3, [
    10 * t, 10 * t + 1, 10 * t + 2, 10 * t + 3, 10 * t + 4, 10 * t + 5,
    10 * t + 6, 10 * t + 7, 10 * t + 8, 10 * t + 9
], 0)
# Held-out fold: 10 rows per class (classes stored in blocks of 50 in A).
test1 = A[t * 10:t * 10 + 10, 0:4].copy()
test2 = A[50 + t * 10:60 + t * 10, 0:4].copy()
test3 = A[100 + t * 10:110 + t * 10, 0:4].copy()
# One Gaussian KDE per class on the remaining training rows.
pattern1 = kde.KernelDensity(kernel="gaussian", bandwidth=h).fit(X1)
pattern2 = kde.KernelDensity(kernel="gaussian", bandwidth=h).fit(X2)
pattern3 = kde.KernelDensity(kernel="gaussian", bandwidth=h).fit(X3)
# Log-density of every test fold under every class model (dens<i><j> =
# test fold i scored by class model j).
log_dens11 = pattern1.score_samples(test1)
log_dens12 = pattern2.score_samples(test1)
log_dens13 = pattern3.score_samples(test1)
log_dens21 = pattern1.score_samples(test2)
log_dens22 = pattern2.score_samples(test2)
log_dens23 = pattern3.score_samples(test2)
log_dens31 = pattern1.score_samples(test3)
log_dens32 = pattern2.score_samples(test3)
log_dens33 = pattern3.score_samples(test3)
from sklearn.neighbors import kde
import numpy as np
import matplotlib.pyplot as plt

# Demo: 1-D Gaussian kernel density estimation and plotting.
# The first two assignments to X are leftover alternative demo inputs;
# only the final bimodal sample (30 points at mean 0, 70 at mean 5) is used.
X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
X = np.random.normal(0, 3, 30)[:, np.newaxis]
X = np.concatenate((np.random.normal(0, 1, 30),
                    np.random.normal(5, 1, 70)))[:, np.newaxis]

# FIX: bind the fitted estimator to its own name instead of rebinding the
# imported module `kde` (the original `kde = kde.KernelDensity(...)`
# shadowed the module it was just imported from).
density_model = kde.KernelDensity(kernel='gaussian', bandwidth=0.2).fit(X)
log_dens = density_model.score_samples(X)  # per-sample log-density
print(log_dens)
print(np.exp(density_model.score_samples(X)))
score = density_model.score([[5]])  # total log-likelihood of the point 5
print(score)
print(np.exp(score))

fig, ax = plt.subplots()
ax.plot(X[:, 0], np.exp(log_dens), '*',
        label="kernel = '{0}'".format('gaussian'))
ax.set_xlim(-4, 10)
ax.set_ylim(-0.02, 1)
plt.show()