def test_myTFIDF():
    """Compare the dict-based and array-based TF-IDF implementations.

    Prints the accumulated elementwise difference between the two results,
    which should be on the order of 1e-15 * n (floating-point rounding only).
    """
    counts = getWordCount('/Users/brobear/OneDrive/data-whitepaper/data/%s' % 'afterProccess_test')  # ~0.6s
    count_mat, txtName, wordName = dict2Array(counts, dtype=int)
    from_array = myTFIDF_array(count_mat, itc=True)
    from_dict = myTFIDF(counts, itc=True)  # the dict version (myTFIDF) is faster than myTFIDF_array
    delta = dict2Array(from_dict)[0] - from_array
    abs_delta = numpy.abs(delta)
    print(sum(sum(abs_delta)))  # error ~1e-15*n: float-arithmetic precision loss
def test_myTC():
    """Compare dict-based and array-based term-contribution (TC) results."""
    word_counts = getWordCount('/Users/brobear/OneDrive/data-whitepaper/data/%s' % 'afterProccess')  # ~0.6s
    weights = myTFIDF(word_counts, itc=True)
    weight_mat, txtName, wordName = dict2Array(weights)
    tc_from_dict = myTC_dict(weights, rtype=list)
    tc_from_array = myTC_array(weight_mat)  # myTC_array is faster than myTC_dict
    # print(sum(tc_from_dict - tc_from_array))
    # helpers from dataInfo can be used to inspect the TC distribution:
    print(fiveNumber(tc_from_array))
    showDistplot(tc_from_array)
def test_pca_sklearn():
    """Smoke-test myPCA_R on a column slice of the raw count matrix."""
    word_counts = getWordCount('/Users/brobear/OneDrive/data-whitepaper/data/%s' % 'afterProccess_test')  # ~0.6s
    count_mat, txtName, wordName = dict2Array(word_counts, dtype=int)
    sample = count_mat[:, 1:500]
    # alternative input: TC-filtered data
    # newData_dict = doTC_dict(word_counts, topN=1000)
    # sample, txtName, wordName = dict2Array(newData_dict, dtype=int)
    sample_mat = numpy.mat(sample)
    print(sample_mat.shape)
    keep = 50
    lowDDataMat, redEigVects = myPCA_R(sample_mat, topN=keep)
def test_doTC():
    """Check that doTC_array and doTC_dict select the same features and data."""
    word_counts = getWordCount('/Users/brobear/OneDrive/data-whitepaper/data/%s' % 'afterProccess')
    min_tc, top_n = 0, 10000
    arr_data, arr_words = doTC_array(word_counts, min_tc, top_n)
    dict_data = doTC_dict(word_counts, min_tc, top_n)
    dict_as_array, txtName, dict_words = dict2Array(dict_data, dtype=int)
    # doTC_dict is faster than doTC_array
    for idx in range(len(arr_words)):
        if arr_words[idx] != dict_words[idx]:
            print("%d tcWordName[i]!=tcWordName_dict_array" % idx)
            break
    print(sum(sum(dict_as_array - arr_data)))
def test_pca():
    """Compare three PCA implementations by their reconstruction error."""
    word_counts = getWordCount('/Users/brobear/OneDrive/data-whitepaper/data/%s' % 'afterProccess_test')  # ~0.6s
    count_mat, txtName, wordName = dict2Array(word_counts, dtype=int)
    sample_mat = numpy.mat(count_mat[:, :1000], dtype=numpy.float64)
    keep = 100
    newData, U, rdata = myPCA(sample_mat, topN=keep, onlyNewData=False)
    newData2, U2, rdata2 = myPCA_R(sample_mat, topN=keep, onlyNewData=False)
    newData3, U3, rdata3 = pca_sklearn(sample_mat, topN=keep, onlyNewData=False)
    # showDiff(newData, newData2)
    # reconstruction is rdata = newData * U.T + numpy.mean(sample_mat, axis=0)
    print(getDiff(sample_mat, rdata))
    print(getDiff(sample_mat, rdata2))
    print(getDiff(sample_mat, rdata3))
def test_selectData():
    """Check that dict- and array-based column selection produce equal data."""
    word_counts = getWordCount('/Users/brobear/OneDrive/data-whitepaper/data/%s' % 'afterProccess')  # ~0.6s
    count_mat, txtName, wordName = dict2Array(word_counts)
    chosen_ids = random.sample(range(0, len(wordName)), 300)
    chosen_ids.sort()
    chosen_words = [wordName[i] for i in chosen_ids]
    picked_dict = selectData_dict(word_counts, chosen_words)
    picked_arr = selectData_array(count_mat, chosen_words, oldWordName=wordName)
    # orderchange=True (the default) is for when chosen_words reorders columns
    picked_arr2 = selectData_array(count_mat, chosen_words, oldWordName=wordName, orderchange=False)
    # selectData_dict is faster than the array version
    picked_dict_as_arr = dict2Array(picked_dict)[0]
    print(sum(sum(picked_dict_as_arr - picked_arr)))
def test_selectFeature():
    """Verify two ways of picking the topN words by TC yield the same list."""
    word_counts = getWordCount('/Users/brobear/OneDrive/data-whitepaper/data/%s' % 'afterProccess_test')
    weights = myTFIDF(word_counts, itc=True)
    weight_mat, txtName, wordName = dict2Array(weights)
    tc = myTC_array(weight_mat)
    minTC, topN = 0, 100
    # way 1: sort (word, tc) pairs by tc, descending, keep the first topN
    pairs = list(zip(wordName, tc))
    pairs.sort(key=lambda x: x[1], reverse=True)
    best_words = [pairs[i][0] for i in range(topN)]
    best_words.sort()
    # way 2: argsort the tc vector and take the topN highest indices
    order = tc.argsort()
    order = order[:-topN - 1:-1]
    order.sort()
    best_words2 = [wordName[i] for i in order]
    for j in range(len(best_words2)):
        if best_words2[j] != best_words[j]:
            print("%d tcWordName[i]!=tcWordName_dict_array" % j)
            break
def do_treecluster_images():
    """Plot how feature dimensionality affects hierarchical clustering quality.

    For each feature count topN, reduce the corpus with TC, cluster with both
    treecluster (complete linkage) and sklearn AgglomerativeClustering (ward),
    then plot silhouette score and largest-cluster size against k, saving each
    figure under ./treecluster_images/.
    """
    outDir = '/Users/brobear/PycharmProjects/TextClusteringAnalysis/txt2'
    txt_dict = getWordCount(outDir)
    xx = range(100, 1000, 100)
    xx = [300, 600]  # overrides the range above; only these sizes are run
    for topN in xx:
        data, textNames = TC(txt_dict, topN=topN)[:2]
        # alternative, no dimensionality reduction:
        # tfidf_dict = myTFIDF(txt_dict, itc=False)
        # data, textNames, wordName = dict2Array(tfidf_dict)
        # method: 's' single, 'm' complete, 'c' centroid, 'a' average linkage
        # dist: 'e' Euclidean, 'u' cosine
        tree = treecluster(data=data, method='m', dist='e')
        args = range(2, 50)
        d = [[], [], [], [], []]  # silhouette score per clusterer
        ksize = [[], [], [], [], []]  # size of the largest cluster per clusterer
        for k in args:
            clusterid = tree.cut(nclusters=k)
            d[0].append(silhouette_score(data, clusterid, metric='euclidean'))
            ksize[0].append(max(size_of_cluster(clusterid)))
            clustering = AgglomerativeClustering(linkage='ward', n_clusters=k)  # ['ward','complete','average']
            clustering.fit(data)
            d[1].append(silhouette_score(data, clustering.labels_, metric='euclidean'))
            ksize[1].append(max(size_of_cluster(clustering.labels_)))
        # two stacked subplots: silhouette on top, max cluster size below
        plt.figure(figsize=(6, 6))
        plt.figure(1)
        ax1 = plt.subplot(211)
        realN = 0  # number of non-empty series actually plotted
        for di in d:
            if len(di) > 1:
                plt.plot(args, di, marker='o')
                realN += 1
        plt.legend(range(realN))
        # BUG FIX: the original did `plt.xlabel = 'k'` / `plt.ylabel = ...`,
        # which rebinds the pyplot functions to strings and never labels the
        # axes; they must be called instead.
        plt.xlabel('k')
        plt.ylabel('silhouette')
        # plt.ylim(-1, 1)
        ax2 = plt.subplot(212)
        for di in ksize:
            if len(di) > 1:
                plt.plot(args, di, marker='o')
        plt.legend(range(realN))
        plt.xlabel('k')
        plt.ylabel('MAXcluster')
        # plt.ylim(0, 2000)
        ax1.set_title('feature number=%d by TC' % topN)
        ax2.set_title("max size of clusters")
        plt.savefig('./treecluster_images/feature number=%d by TC 1<k<50' % topN)
        plt.show()
# NOTE(review): this module-level section repeats the body of test_doTC and
# therefore runs on import — presumably scratch benchmarking code; consider
# moving it under the __main__ guard.
txt_dict = getWordCount('/Users/brobear/OneDrive/data-whitepaper/data/%s' % 'afterProccess')
minTC, topN = 0, 10000
tcData_array, tcWordName = doTC_array(txt_dict, minTC, topN)
tcData_dict = doTC_dict(txt_dict, minTC, topN)
tcData_dict_array, txtName, tcWordName_dict_array = dict2Array(tcData_dict, dtype=int)
# doTC_dict is faster than doTC_array
for j in range(len(tcWordName)):
    if tcWordName[j] != tcWordName_dict_array[j]:
        print("%d tcWordName[i]!=tcWordName_dict_array" % j)
        break
print(sum(sum(tcData_dict_array - tcData_array)))


if __name__ == '__main__':
    txt_dict = getWordCount(
        '/Users/brobear/PycharmProjects/TextClusteringAnalysis/txt2')
    tfidf_dict = myTFIDF(txt_dict, itc=True)
    tfidf_array, txtName, wordName = dict2Array(tfidf_dict)
    # compute the per-word term weights (TC) and look at their distribution
    tc_array = myTC_array(tfidf_array)
    showDistplot(tc_array)
    tc_array.sort()
    tc_array = tc_array[::-1]  # descending order
    from matplotlib import pyplot as plt
    plt.plot(range(len(tc_array)), tc_array)
    plt.ylim(0, 200)
    plt.show()
clusterLabel[i] = minIndex if minIndex in clusterLabel_map: clusterLabel_map[minIndex].append(i) else: clusterLabel_map[minIndex] = [i] # 更新中心 for i in range(k): Cent[i, :] = numpy.mean(data[clusterLabel_map[i], :], axis=0) print(iter) return clusterLabel if __name__ == '__main__': outDir = '/Users/brobear/PycharmProjects/TextClusteringAnalysis/txt2' txt_dict = getWordCount(outDir) tfidf_dict = myTFIDF(txt_dict, itc=False) data, textNames, wordName = dict2Array(tfidf_dict) # 降维 topN = 1200 data, textNames = PCA(txt_dict, topN=topN, itc=False)[:2] # 确定特征维数 for x in [i * 0.1 for i in range(1, 10)]: data, textNames = PCA(txt_dict, topN=x, itc=False)[:2] print(x, data.shape) # 结果:0.1 74 0.2 204 0.3 357 0.4 519 0.5 684 0.6 851 0.7 1022 0.8 1198 0.9 1387 # [74, 204, 357, 519, 684, 851, 1022, 1198, 1387] # # # # 肘方法看k值 # kList = range(5, 40, 1)
@log("Feature_useTime")
def PCA(txt_dict, topN=None, itc=False):  # ~137s
    """TF-IDF weight the corpus, then reduce dimensionality with sklearn PCA.

    Returns a (reduced data matrix, document-name list) pair.
    """
    weights = myTFIDF(txt_dict, itc=itc)
    weight_mat, txtName, wordName = dict2Array(weights)
    reduced = pca_sklearn(weight_mat, topN=topN)
    return reduced, txtName


@log("Feature_useTime")
def TC_PCA(txt_dict, minTC=0, topN=None, itc=False):  # ~45s
    """Filter words by term contribution, TF-IDF weight, then PCA-reduce.

    Returns a (reduced data matrix, document-name list) pair.
    """
    filtered = doTC_dict(txt_dict, minTC=minTC)
    weights = myTFIDF(filtered, itc=itc)
    weight_mat, txtName, wordName = dict2Array(weights)
    reduced = pca_sklearn(weight_mat, topN=topN)
    return reduced, txtName


if __name__ == '__main__':
    txt_dict = getWordCount('/Users/brobear/OneDrive/data-whitepaper/data/%s' % 'afterProccess')  # ~0.6s
    topN = 1800
    # newData_mat, txtName, wordName = TC(txt_dict, topN)
    # newData_mat2, txtName2 = PCA(txt_dict, topN=topN)
    newData_mat3, txtName3 = TC_PCA(txt_dict, minTC=0, topN=topN)
    # numpy.savetxt('data_TC_1800', newData_mat, delimiter=",")
    # numpy.savetxt('data_PCA_1800', newData_mat2, delimiter=",")
    numpy.savetxt('data_TC_PCA_1800', newData_mat3, delimiter=",")
    # TEST
    # txt_dict_test = getWordCount('/Users/brobear/OneDrive/data-whitepaper/data/%s_test' % 'afterProccess')
    # newData_mat3, txtName3, wordName3 = TC(txt_dict_test, topN)
    # numpy.savetxt('data_test', newData_mat3, delimiter=",")