def doTC_array(txt_dict, minTC=0, topN=None):
    """Reduce feature dimensionality by TC (term contribution) weight, array flavour.

    Scores every word by TC computed from itc-weighted tf-idf, keeps the words
    passing the minTC / topN filter, and returns the compressed raw-count
    matrix together with the surviving word list.
    """
    # Raw count matrix plus the document/word orderings it was built with.
    counts, doc_names, vocab = dict2Array(txt_dict, dtype=int)
    # Per-document tf-idf weights (itc variant) as a plain array.
    weight_matrix = dict2Array(myTFIDF(txt_dict, itc=True))[0]
    # One TC score per word.
    # NOTE(review): assumes dict2Array yields the same word order for
    # txt_dict and its tf-idf dict -- confirm in dict2Array.
    scores = myTC_array(weight_matrix)
    # Words that survive the minTC / topN filter.
    kept_words = selectFeature(scores, vocab, minTC=minTC, topN=topN)
    # Project the count matrix onto the kept words, preserving word order.
    reduced = selectData(counts, kept_words, oldWordName=vocab, orderchange=False)
    return reduced, kept_words
def test_selectData():
    """Check that selectData_dict and selectData_array produce the same matrix
    for a random 300-word subset (their element-wise difference should sum to 0).
    """
    corpus = getWordCount('/Users/brobear/OneDrive/data-whitepaper/data/%s' % 'afterProccess')  # ~0.6s
    counts, doc_names, vocab = dict2Array(corpus)
    # Random 300-word subset, kept in vocabulary order.
    chosen_ids = sorted(random.sample(range(0, len(vocab)), 300))
    chosen_words = [vocab[i] for i in chosen_ids]
    picked_dict = selectData_dict(corpus, chosen_words)
    picked_arr = selectData_array(counts, chosen_words, oldWordName=vocab)
    picked_arr_keep = selectData_array(counts, chosen_words, oldWordName=vocab, orderchange=False)
    # Use orderchange=True when the new word list has been reordered.
    # selectData_dict is faster than selectFeature_array.
    picked_dict_as_arr = dict2Array(picked_dict)[0]
    # 0 means both implementations agree element-wise.
    print(sum(sum(picked_dict_as_arr - picked_arr)))
def test_myTC():
    """Exercise both TC implementations and inspect the TC value distribution."""
    corpus = getWordCount('/Users/brobear/OneDrive/data-whitepaper/data/%s' % 'afterProccess')  # ~0.6s
    weights_dict = myTFIDF(corpus, itc=True)
    weight_matrix, doc_names, vocab = dict2Array(weights_dict)
    tc_from_dict = myTC_dict(weights_dict, rtype=list)
    # myTC_array is faster than myTC_dict.
    tc_from_array = myTC_array(weight_matrix)
    # print(sum(tc_from_dict - tc_from_array))
    # The helpers in dataInfo visualise the TC distribution.
    print(fiveNumber(tc_from_array))
    showDistplot(tc_from_array)
def doTC_dict(txt_dict, minTC=0, topN=None):  # fast
    """Reduce feature dimensionality by TC (term contribution) weight, dict flavour.

    Scores every word by TC computed from itc-weighted tf-idf, then returns
    txt_dict compressed to the words passing the minTC / topN filter.
    """
    # itc-weighted tf-idf of the corpus, plus the word ordering used below.
    weights_dict = myTFIDF(txt_dict, itc=True)
    weight_matrix, doc_names, vocab = dict2Array(weights_dict)
    # One TC score per word.
    scores = myTC_array(weight_matrix)
    # Words that survive the minTC / topN filter.
    kept_words = selectFeature(scores, vocab, minTC=minTC, topN=topN)
    # Compress the original dict down to the kept words.
    return selectData(txt_dict, kept_words)
def test_pca_sklearn():
    """Smoke-test myPCA_R on a column slice of the raw count matrix."""
    corpus = getWordCount('/Users/brobear/OneDrive/data-whitepaper/data/%s' % 'afterProccess_test')  # ~0.6s
    counts, doc_names, vocab = dict2Array(corpus, dtype=int)
    # Work on a small column slice to keep the run quick.
    sample = counts[:, 1:500]
    data_mat = numpy.mat(sample)
    print(data_mat.shape)
    n_components = 50
    low_d_mat, eig_vects = myPCA_R(data_mat, topN=n_components)
def test_doTC():
    """Check that doTC_array and doTC_dict agree on both the selected words
    and the compressed data (their difference should sum to 0).
    """
    corpus = getWordCount('/Users/brobear/OneDrive/data-whitepaper/data/%s' % 'afterProccess')
    minTC, topN = 0, 10000
    arr_data, arr_words = doTC_array(corpus, minTC, topN)
    dict_data = doTC_dict(corpus, minTC, topN)
    dict_data_arr, doc_names, dict_words = dict2Array(dict_data, dtype=int)
    # doTC_dict is faster than doTC_array.
    # Report the first position where the two word lists disagree.
    for j, word in enumerate(arr_words):
        if word != dict_words[j]:
            print("%d tcWordName[i]!=tcWordName_dict_array" % j)
            break
    # 0 means the two compressed matrices are identical.
    print(sum(sum(dict_data_arr - arr_data)))
def test_pca():
    """Cross-check the three PCA implementations via their reconstruction error."""
    corpus = getWordCount('/Users/brobear/OneDrive/data-whitepaper/data/%s' % 'afterProccess_test')  # ~0.6s
    counts, doc_names, vocab = dict2Array(corpus, dtype=int)
    data_mat = numpy.mat(counts[:, :1000], dtype=numpy.float64)
    n_components = 100
    proj1, basis1, recon1 = myPCA(data_mat, topN=n_components, onlyNewData=False)
    proj2, basis2, recon2 = myPCA_R(data_mat, topN=n_components, onlyNewData=False)
    proj3, basis3, recon3 = pca_sklearn(data_mat, topN=n_components, onlyNewData=False)
    # Print each variant's reconstruction error against the input.
    print(getDiff(data_mat, recon1))
    print(getDiff(data_mat, recon2))
    print(getDiff(data_mat, recon3))
def test_selectFeature():
    """Check that two top-N selection strategies (sort pairs vs argsort)
    pick exactly the same words.
    """
    corpus = getWordCount('/Users/brobear/OneDrive/data-whitepaper/data/%s' % 'afterProccess_test')
    weights_dict = myTFIDF(corpus, itc=True)
    weight_matrix, doc_names, vocab = dict2Array(weights_dict)
    tc = myTC_array(weight_matrix)
    minTC, topN = 0, 100
    # Variant 1: pair words with their TC and sort the pairs by TC descending.
    ranked = list(zip(vocab, tc))
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    picked = [ranked[i][0] for i in range(topN)]
    picked.sort()
    # Variant 2: argsort, take the top-N indices, restore index order.
    order = tc.argsort()
    order = order[:-topN - 1:-1]
    order.sort()
    picked2 = [vocab[i] for i in order]
    # Report the first position where the two selections disagree.
    for j, word in enumerate(picked2):
        if word != picked[j]:
            print("%d tcWordName[i]!=tcWordName_dict_array" % j)
            break
# NOTE(review): the statements below run at import time and duplicate the body
# of test_doTC() above -- presumably a leftover; confirm before relying on it.
txt_dict = getWordCount('/Users/brobear/OneDrive/data-whitepaper/data/%s' % 'afterProccess')
minTC, topN = 0, 10000
# Run both TC reduction flavours on the same corpus.
tcData_array, tcWordName = doTC_array(txt_dict, minTC, topN)
tcData_dict = doTC_dict(txt_dict, minTC, topN)
tcData_dict_array, txtName, tcWordName_dict_array = dict2Array(tcData_dict, dtype=int)
# doTC_dict is faster than doTC_array
# Report the first position where the two word lists disagree.
for j in range(len(tcWordName)):
    if tcWordName[j] != tcWordName_dict_array[j]:
        print("%d tcWordName[i]!=tcWordName_dict_array" % j)
        break
# 0 means both flavours produced identical matrices.
print(sum(sum(tcData_dict_array - tcData_array)))


if __name__ == '__main__':
    txt_dict = getWordCount(
        '/Users/brobear/PycharmProjects/TextClusteringAnalysis/txt2')
    tfidf_dict = myTFIDF(txt_dict, itc=True)
    tfidf_array, txtName, wordName = dict2Array(tfidf_dict)
    # Compute each word's TC weight
    tc_array = myTC_array(tfidf_array)
    showDistplot(tc_array)
    # Plot TC values in descending order to eyeball an elbow / threshold.
    tc_array.sort()
    tc_array = tc_array[::-1]
    from matplotlib import pyplot as plt
    plt.plot(range(len(tc_array)), tc_array)
    plt.ylim(0, 200)
    plt.show()
clusterLabel_map[minIndex].append(i) else: clusterLabel_map[minIndex] = [i] # 更新中心 for i in range(k): Cent[i, :] = numpy.mean(data[clusterLabel_map[i], :], axis=0) print(iter) return clusterLabel if __name__ == '__main__': outDir = '/Users/brobear/PycharmProjects/TextClusteringAnalysis/txt2' txt_dict = getWordCount(outDir) tfidf_dict = myTFIDF(txt_dict, itc=False) data, textNames, wordName = dict2Array(tfidf_dict) # 降维 topN = 1200 data, textNames = PCA(txt_dict, topN=topN, itc=False)[:2] # 确定特征维数 for x in [i * 0.1 for i in range(1, 10)]: data, textNames = PCA(txt_dict, topN=x, itc=False)[:2] print(x, data.shape) # 结果:0.1 74 0.2 204 0.3 357 0.4 519 0.5 684 0.6 851 0.7 1022 0.8 1198 0.9 1387 # [74, 204, 357, 519, 684, 851, 1022, 1198, 1387] # # # # 肘方法看k值 # kList = range(5, 40, 1) # d = [] # for k in kList:
def TC_PCA(txt_dict, minTC=0, topN=None, itc=False):  # ~45s
    """TC feature selection followed by PCA.

    First trims the vocabulary with doTC_dict (minTC filter), then tf-idf
    weights the trimmed corpus and projects it with sklearn PCA.
    Returns (reduced matrix, document names).
    """
    trimmed = doTC_dict(txt_dict, minTC=minTC)
    weight_matrix, doc_names, vocab = dict2Array(myTFIDF(trimmed, itc=itc))
    reduced = pca_sklearn(weight_matrix, topN=topN)
    return reduced, doc_names
def PCA(txt_dict, topN=None, itc=False):  # ~137s
    """tf-idf weighting followed by sklearn PCA.

    Returns (reduced matrix, document names).
    """
    weight_matrix, doc_names, vocab = dict2Array(myTFIDF(txt_dict, itc=itc))
    reduced = pca_sklearn(weight_matrix, topN=topN)
    return reduced, doc_names
def TC(txt_dict, topN):  # ~7.6s
    """TC feature selection, then tf-idf weighting (no PCA).

    Returns (tf-idf matrix of the kept words, document names, word names).
    """
    trimmed = doTC_dict(txt_dict, topN=topN)
    weight_matrix, doc_names, vocab = dict2Array(myTFIDF(trimmed, itc=False))
    return numpy.mat(weight_matrix), doc_names, vocab
def show_wbpj(y_pre, textNames):
    """Display the external evaluation: for each predicted cluster, plot the
    fractional composition of ground-truth tags as bar charts (one overlaid,
    one stacked).

    y_pre      -- predicted cluster label per document, indexed like textNames
    textNames  -- document names, used as keys into getTags()
    """
    # Fetch the ground-truth tag lists per document.
    y_true_dict = getTags(textNames)
    # Tags that are not real categories -- strip them from every tag list.
    deleteTag = [
        'Other', 'Platform', 'Cryptocurrency', 'Business services',
        'Investment', 'Smart Contract', 'Software', 'Internet',
        'Infrastructure', 'Entertainment'
    ]
    for v in y_true_dict.values():
        for i in deleteTag:
            if i in v:
                v.remove(i)
    # Group the (possibly empty) true-tag lists by predicted cluster.
    y_pre2true = {}
    for i in range(len(textNames)):
        if y_pre[i] not in y_pre2true:
            y_pre2true[y_pre[i]] = []
        y_pre2true[y_pre[i]].append(y_true_dict[textNames[i]])
    # For each cluster, compute per-tag fractions: each document contributes
    # 1/len(its tags) per tag, normalised by the number of documents that
    # still have at least one tag (vv).
    y_pre2true_dict = {}
    for k, v in y_pre2true.items():
        y_pre2true_dict[k] = {}
        vv = [i for i in v if i != []]
        for i in v:
            for j in i:
                y_pre2true_dict[k][j] = y_pre2true_dict[k].get(
                    j, 0.0) + 1 / len(i) / len(vv)
    # Rows = clusters (cid), columns = tags (tagList).
    y_pre2true_array, cid, tagList = dict2Array(y_pre2true_dict)
    colorListAll = [
        'yellowgreen', 'antiquewhite', 'aqua', 'aquamarine', 'azure',
        'beige', 'bisque', 'black', 'blanchedalmond', 'blue', 'blueviolet',
        'brown', 'burlywood', 'cadetblue', 'chartreuse', 'chocolate',
        'coral', 'cornflowerblue', 'cornsilk', 'crimson', 'cyan',
        'darkblue', 'darkcyan', 'darkgoldenrod', 'darkgray', 'darkgreen',
        'darkkhaki', 'darkmagenta', 'darkolivegreen', 'darkorange',
        'darkorchid', 'darkred', 'darksalmon', 'darkseagreen',
        'darkslateblue', 'darkslategray', 'darkturquoise', 'darkviolet',
        'deeppink', 'deepskyblue', 'dimgray', 'dodgerblue', 'firebrick',
        'floralwhite', 'forestgreen', 'fuchsia', 'gainsboro', 'ghostwhite',
        'gold', 'goldenrod', 'gray', 'green', 'greenyellow', 'honeydew',
        'hotpink', 'indianred', 'indigo', 'ivory', 'khaki', 'lavender',
        'lavenderblush', 'lawngreen', 'lemonchiffon', 'lightblue',
        'lightcoral', 'lightcyan', 'lightgoldenrodyellow', 'lightgreen',
        'lightgray', 'lightpink', 'lightsalmon', 'lightseagreen',
        'lightskyblue', 'lightslategray', 'lightsteelblue', 'lightyellow',
        'lime', 'limegreen', 'linen', 'magenta', 'maroon',
        'mediumaquamarine', 'mediumblue', 'mediumorchid', 'mediumpurple',
        'mediumseagreen', 'mediumslateblue', 'mediumspringgreen',
        'mediumturquoise', 'mediumvioletred', 'midnightblue', 'mintcream',
        'mistyrose', 'moccasin', 'navajowhite', 'navy', 'oldlace', 'olive',
        'olivedrab', 'orange', 'orangered', 'orchid', 'palegoldenrod',
        'palegreen', 'paleturquoise', 'palevioletred', 'papayawhip',
        'peachpuff', 'peru', 'pink', 'plum', 'powderblue', 'purple', 'red',
        'rosybrown', 'royalblue', 'saddlebrown', 'salmon', 'sandybrown',
        'seagreen', 'seashell', 'sienna', 'silver', 'skyblue', 'slateblue',
        'slategray', 'snow', 'springgreen', 'steelblue', 'tan', 'teal',
        'thistle', 'tomato', 'turquoise', 'violet', 'wheat', 'white',
        'whitesmoke', 'yellow', 'aliceblue'
    ]
    # Deterministic color pick: every 4th color, one per tag column.
    # colorList = random.sample(colorListAll, y_pre2true_array.shape[1])
    colorList = [colorListAll[i * 4] for i in range(y_pre2true_array.shape[1])]
    # First chart: raw per-tag fractions drawn back-to-front (bars overlay).
    for i in range(y_pre2true_array.shape[1] - 1, -1, -1):
        plt.bar(cid,
                +y_pre2true_array[:, i],
                color=colorList[i],
                edgecolor='black',
                alpha=0.9)
    plt.legend(tagList[::-1],
               bbox_to_anchor=(0, 1),
               loc=3,
               ncol=3,
               borderaxespad=0)
    plt.show()
    # Second chart: cumulative sums across columns give a stacked bar chart;
    # drawing back-to-front makes each segment visible under the next.
    y_pre2true_array2 = y_pre2true_array.copy()
    for i in range(1, y_pre2true_array2.shape[1]):
        y_pre2true_array2[:, i] += y_pre2true_array2[:, i - 1]
    for i in range(y_pre2true_array.shape[1] - 1, -1, -1):
        plt.bar(cid,
                +y_pre2true_array2[:, i],
                color=colorList[i],
                edgecolor='black',
                alpha=1)
    plt.legend(tagList[::-1],
               bbox_to_anchor=(0, 1),
               loc=3,
               ncol=3,
               borderaxespad=0)
    plt.show()