def clusterResult(doc={}, num=5, sim_num=3):
    # doc is the raw text here (already pre-processed and tokenized)
    # compute tf-idf over the text, then expand the keywords with w2v:
    # take the top `num` keywords (default 5) and add `sim_num` (default 3)
    # similar words for each
    # num: number of keywords
    # sim_num: number of expansion words per keyword
    model = w2v.load_model_binary(r"")
    doc = tfidf.expend_word(model, doc, num, sim_num)
    # build the document-topic matrix and return the clustering result
    matrix = []
    ldaa.doc = doc
    # build the LDA model and the bag-of-words corpus
    # (k1 and k2 are module-level topic/cluster counts defined elsewhere)
    print("Building topic model")
    lda, corpus = ldaa.ldaAdapter(k1)
    for index, values in enumerate(lda.inference(corpus)[0]):
        topic_row = []  # topic distribution of this document
        for topic, value in enumerate(values):
            topic_row.append(value)
        matrix.append(topic_row)
    # feed the matrix into kMeans for clustering
    print("Starting kMeans clustering")
    estimator = kmn.kMeansByFeature(k2, matrix)
    labels = list(estimator.labels_)
    return labels
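# `kmn.kMeansByFeature` is called throughout these snippets but its body is
# not shown. A minimal sketch of what such a helper could look like,
# assuming it simply wraps scikit-learn's KMeans (the wrapper itself and
# its exact signature are assumptions, not the repo's actual code):
from sklearn.cluster import KMeans

def kMeansByFeature(n_clusters, features):
    # features: one row per document (a topic distribution or doc vector)
    estimator = KMeans(n_clusters=n_clusters, n_init=10)
    estimator.fit(features)  # callers then read estimator.labels_
    return estimator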
def clusterResult(doc={}, num=3, save_path=r"", result_save=False):
    # process the documents and return their cluster labels
    word_list, weight = tfidf.cal_tf_idf(doc)
    model = w2v.load_model_binary(save_path)
    print("Building vec_list")
    vec_list = w2v.wordlist_to_vec(model, word_list)  # 2-D matrix
    print("vec_list done, computing doc_vec")
    # build one vector per document
    doc_vec = []
    for t, doc_weight in enumerate(weight):
        if t % 10 == 0:
            print("turn", t)
        doc_vec_list = []
        for i in range(len(doc_weight)):
            mul = doc_weight[i]
            doc_vec_list.append([x * mul for x in vec_list[i]])
        simple_vec = np.array([0.0] * len(doc_vec_list[0]))
        for vec in doc_vec_list:
            # accumulate with numpy vector addition
            simple_vec += np.array(vec)
        doc_vec.append(list(simple_vec))
    if result_save:
        save_dv(doc_vec)
    # cluster the document vectors
    estimator = kmn.kMeansByFeature(num, doc_vec)
    labels = list(estimator.labels_)
    return labels
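# The per-document loop above computes a tf-idf-weighted sum of word
# vectors, which collapses to a single matrix product. A minimal numpy
# sketch, assuming `weight` is an (n_docs, n_words) matrix and `vec_list`
# is the (n_words, dim) matrix produced above (names are illustrative):
import numpy as np

def doc_vectors(weight, vec_list):
    # doc_vec[d] = sum_i weight[d][i] * vec_list[i]
    W = np.asarray(weight)    # (n_docs, n_words)
    V = np.asarray(vec_list)  # (n_words, dim)
    return (W @ V).tolist()   # (n_docs, dim), same result as the loop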
def clusterResultByFile(num=3):
    # read the saved distribution from disk and cluster it directly
    doc_vec = load_dv()
    estimator = kmn.kMeansByFeature(num, doc_vec)
    labels = list(estimator.labels_)
    return labels
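# `save_dv` / `load_dv` are referenced but not shown. A plausible pair,
# assuming the document vectors are persisted as a plain-text numpy dump
# (the file name and format are assumptions):
import numpy as np

DV_PATH = "doc_vec.txt"  # hypothetical location

def save_dv(doc_vec):
    np.savetxt(DV_PATH, np.asarray(doc_vec))

def load_dv():
    return np.loadtxt(DV_PATH).tolist()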
def lda_kmn_result(doc, iterator=1000):
    # return the corresponding clustering result
    # build the LDA model and the bag of words
    print("Building topic model")
    word_list, r_model = ldaa.lda_model(doc, k1, iterator)
    # document-topic distribution
    doc_topic = r_model.doc_topic_
    # convert to a plain list for clustering
    doc_topic_list = np.array(doc_topic).tolist()
    estimator = kmn.kMeansByFeature(k2, doc_topic_list)
    labels = estimator.labels_
    return list(labels)
def lda_kmn_result(k, topic, doc, former, iterator=1000):
    # variant that repeats the clustering 30 times and returns the
    # averaged evaluation metrics
    # build the LDA model and the bag of words
    print("Building topic model")
    word_list, r_model = ldaa.lda_model(doc, topic, iterator)
    # document-topic distribution
    doc_topic = r_model.doc_topic_
    # convert to a plain list for clustering
    doc_topic_list = np.array(doc_topic).tolist()
    result = np.zeros(6)
    for i in range(30):
        estimator = kmn.kMeansByFeature(topic, doc_topic_list)
        labels = estimator.labels_
        # spectral-clustering alternative, kept for reference:
        # labels = SC(assign_labels="discretize", gamma=1e-7,
        #             n_clusters=k).fit_predict(doc_topic)
        result += cr.printResult(k, labels, former)
    return result / 30
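# The commented-out `SC` call above matches scikit-learn's
# SpectralClustering (same keyword arguments). If that variant were
# re-enabled, the import and call would look like this sketch, with the
# gamma value taken from the comment:
from sklearn.cluster import SpectralClustering as SC

def spectral_labels(k, doc_topic):
    # drop-in replacement for the kMeans step inside the averaging loop
    return SC(assign_labels="discretize", gamma=1e-7,
              n_clusters=k).fit_predict(doc_topic)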
def clusterResult_gibbs(k, topic, model, doc, num=5, sim_num=3, iterator=500):
    # doc is the raw text here (already pre-processed and tokenized)
    # compute tf-idf over the text, then expand the keywords with w2v:
    # take the top `num` keywords (default 5) and add `sim_num` (default 3)
    # similar words for each
    # model = w2v.load_model_binary(r"E:\学校\快乐推荐\word2vec\saveVec")
    print("Expanding the document corpus")
    doc = tfidf.expend_word(model, doc, num, sim_num)
    # build the LDA model and the bag of words, then cluster
    print("Building topic model")
    word_list, r_model = ldaa.lda_model(doc, k, iterator)
    # document-topic distribution
    doc_topic = r_model.doc_topic_
    writeResult(doc_topic, "12_LDA+wiki.txt")
    # convert to a plain list for clustering
    doc_topic_list = np.array(doc_topic).tolist()
    estimator = kmn.kMeansByFeature(topic, doc_topic_list)
    labels = estimator.labels_
    return list(labels)
def clusterResult_gibbs(model, doc={}, num=5, sim_num=3, iterator=500):
    # doc is the raw text here (already pre-processed and tokenized)
    # compute tf-idf over the text, then expand the keywords with w2v:
    # take the top `num` keywords (default 5) and add `sim_num` (default 3)
    # similar words for each
    # model = w2v.load_model_binary(r"")
    print("Expanding the document corpus")
    doc = tfidf.expend_word(model, doc, num, sim_num)
    # build the Gibbs-sampling LDA model and the bag of words, then cluster
    print("Building topic model")
    word_list, r_model = ldag.lda_model(doc, k1, iterator)
    # document-topic distribution
    doc_topic = r_model.doc_topic_
    # convert to a plain list for clustering
    doc_topic_list = np.array(doc_topic).tolist()
    estimator = kmn.kMeansByFeature(k2, doc_topic_list)
    labels = estimator.labels_
    return list(labels)
import cluster.cluster_result as cr
import data.data_util as du
import model.TF_IDFAdapter as tfidf
import numpy as np
from cluster import kmeans as kmn

if __name__ == "__main__":
    k = 4
    filename = "data4.csv"  # 4 diping 3 8,12 diping 5
    doc = du.getDocAsWordArray(filename, 3)
    former = du.getFormerCategory(filename)
    word_dic, weight = tfidf.cal_tf_idf(doc)
    result = np.zeros(6)
    for i in range(30):
        estimator = kmn.kMeansByFeature(k, weight)
        labels = estimator.labels_
        result += cr.printResult(k, labels, former)
    result = result / 30
    print(result)
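# `cr.printResult` evidently returns a 6-element metric vector comparing
# predicted labels against the former categories, but its body is not
# shown. A purely hypothetical version using six standard external
# clustering metrics from scikit-learn (the repo's actual metrics and
# their order may differ):
import numpy as np
from sklearn import metrics

def printResult(k, labels, former):
    # k is kept to match the call sites above, though unused here
    return np.array([
        metrics.adjusted_rand_score(former, labels),
        metrics.normalized_mutual_info_score(former, labels),
        metrics.homogeneity_score(former, labels),
        metrics.completeness_score(former, labels),
        metrics.v_measure_score(former, labels),
        metrics.fowlkes_mallows_score(former, labels),
    ])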
print("开始模型训练") b_model = BtmVnModel(t_k, t_doc_word_id, t_voc_list, t_word_sim_matrix, iterate=t_iterate, threshold=t_threshold, miu=t_miu) b_model.buildModel() print("训练完成") dis = b_model.getDoc_Topic() print("完成文档主题文档获取") cluster_result = kmn.kMeansByFeature(t_k, dis).labels_ former_type = du.getFormerCategory("C10.csv") result = wce.printResult(t_k, cluster_result, former_type) print(result) # t_word = b_model.getTopic_word(10) suffix = voc_filename[str(voc_filename).rindex("_") + 1:str(voc_filename).rindex(".")] dis_save = "{}_iterate{}_threshold{}_miu{}_result.txt".format( suffix, t_iterate, t_threshold, t_miu) tp_save = "{}_iterate{}_threshold{}_miu{}_tp.txt".format( suffix, t_iterate, t_threshold, t_miu) # write_Doc_Topic_Matrix(dis, dis_save) # write_Topic_word(b_model.topic_word, tp_save)