def visualize(out_dir, dataset_id):
    # prefix_list = ['*', '*/information_retrieval', '*/information_retrieval/web_search']
    params = load_init_params(dataset_id)
    top_name = params['dataset_top_name']
    name = ""
    result_file = os.path.join(out_dir, 'result' + name + '.txt')
    # out_dir[-14:-6] presumably picks the date portion out of the timestamped
    # folder name; render the tree at three increasing depth cut-offs.
    stem = out_dir + "/SpanningTree" + name + "-" + out_dir[-14:-6]
    main(top_name, result_file, stem + '-our-overall-3', min_level=0, max_level=2)
    main(top_name, result_file, stem + '-our-overall-4', min_level=0, max_level=3)
    main(top_name, result_file, stem + '-our-overall-5', min_level=0, max_level=10)
def generate_tree(spanning_tree, dataset_id):
    """
    Generate a hierarchical tree structure, assuming the root node has
    already been determined.
    :param spanning_tree: undirected acyclic graph
    :param dataset_id: domain name within the dataset
    :return: list of hierarchy paths
    """
    params = load_init_params(dataset_id)
    top = params['dataset_top_name']
    used = []
    top_tree = ["*/top"]
    temp = ["*/" + top]
    print("Start building the tree")
    while len(temp) != 0:
        print(temp)
        print(top_tree)
        print("==========")
        new_temp = []
        for item in temp:
            item_list = item.split("/")
            leaf = item_list[-1]
            if top == leaf:
                item_list.remove(top)  # the root name is dropped from path prefixes
            for edge in spanning_tree:
                item_1, item_2, _, _, _ = edge
                if item_1 in used or item_2 in used:
                    continue
                if leaf == item_1:
                    new_edges = "/".join(item_list) + "/" + item_2
                    new_temp.append(new_edges)
                    top_tree.append(new_edges)
                    print(new_edges)
                elif leaf == item_2:
                    new_edges = "/".join(item_list) + "/" + item_1
                    new_temp.append(new_edges)
                    top_tree.append(new_edges)
                    print(new_edges)
            used.append(leaf)
        temp = new_temp
    return top_tree
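# --- Hedged illustration (not part of the pipeline) ----------------------
# A minimal toy run of the breadth-first path expansion implemented by
# generate_tree above. The edge tuples follow the same 5-field shape
# (node_a, node_b, weight, _, _) that generate_tree unpacks; the node
# names and weights here are made up.
toy_tree = [
    ("beijing", "gugong", 0.9, None, None),
    ("beijing", "changcheng", 0.8, None, None),
    ("gugong", "taihedian", 0.7, None, None),
]
paths, frontier, visited = ["*/top"], ["*/beijing"], []
while frontier:
    next_frontier = []
    for path in frontier:
        leaf = path.split("/")[-1]
        prefix = "*" if leaf == "beijing" else path  # root name dropped from prefixes
        for a, b, *_ in toy_tree:
            if a in visited or b in visited:
                continue
            if leaf in (a, b):
                child = b if leaf == a else a
                next_frontier.append(prefix + "/" + child)
                paths.append(prefix + "/" + child)
        visited.append(leaf)
    frontier = next_frontier
print(paths)  # ['*/top', '*/gugong', '*/changcheng', '*/gugong/taihedian']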
def generateTree(dataset, dataset_id):
    """
    :param dataset: dataset name
    :param dataset_id: domain name within the dataset
    :return: no return value; results are written to files
    """
    params = load_init_params(dataset_id)
    min_kl = params['min_kl']
    data_path = os.path.join("./data/raw_data", dataset, dataset_id + "_0.csv")
    word_index_path = os.path.join("./data/processed_data", dataset, dataset_id + "_word_index.txt")
    index_dict_path = os.path.join("./data/processed_data", dataset, dataset_id + "_index_dict.txt")
    mi_matrix_path = os.path.join('./data/processed_data', dataset, dataset_id + '_mi_matrix.csv')
    kl_matrix_path = os.path.join('./data/processed_data', dataset, dataset_id + '_kl_matrix.csv')
    folder = create_dir()
    # Generate the edge and node information
    nodes, edges = generate_nodes_edges(word_index_path, index_dict_path, mi_matrix_path, kl_matrix_path)
    # Build the maximum spanning tree from the nodes and edges
    spanning_tree = Kruskal(nodes, edges, data_path, min_kl)
    print(spanning_tree)
    # With the root fixed, generate the root-to-leaf path information
    top_tree = generate_tree(spanning_tree, dataset_id)
    # Write the path information into the output folder
    write_file(top_tree, folder)
    # Visualize the spanning tree
    visualize(folder, dataset_id)
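# --- Hedged illustration (not part of the pipeline) ----------------------
# The Kruskal(...) called in generateTree is defined elsewhere in the repo;
# this is only a minimal sketch of Kruskal for a *maximum* spanning tree,
# assuming plain (node_a, node_b, weight) edges. The repo's version also
# takes data_path and min_kl, which are omitted here.
def kruskal_max_spanning_tree(nodes, edges):
    parent = {n: n for n in nodes}

    def find(x):  # union-find root lookup with path compression
        while parent[x] != x:
            parent[x] = parent[parent[x]]
            x = parent[x]
        return x

    tree = []
    # Sort by weight descending: greedily keep the strongest MI edges.
    for a, b, w in sorted(edges, key=lambda e: e[2], reverse=True):
        root_a, root_b = find(a), find(b)
        if root_a != root_b:  # adding this edge creates no cycle
            parent[root_a] = root_b
            tree.append((a, b, w))
    return tree

print(kruskal_max_spanning_tree(
    ["a", "b", "c"], [("a", "b", 0.9), ("b", "c", 0.5), ("a", "c", 0.7)]))
# -> [('a', 'b', 0.9), ('a', 'c', 0.7)]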
    generateTree(dataset, dataset_domain)


if __name__ == '__main__':
    # params = load_init_params()
    # main(params)
    #
    # params = load_init_params("Beijing")
    # main(params)
    #
    # params = load_init_params("Guiyang")
    # main(params)
    #
    # params = load_init_params("Kunming")
    # main(params)
    #
    # params = load_init_params("Hangzhou")
    # main(params)
    #
    # params = load_init_params("nlp")
    # main(params)
    #
    # params = load_init_params("nlpcn")
    # main(params)
    #
    # params = load_init_params("ZhongGuoJinDaiShi")
    # main(params)
    params = load_init_params("g60763")
    main(params)
        for word in self.word_index:
            self.word_set.add(word)
        self.PR = np.ones([len(self.word_set), 1])
        for i in range(self.iternum):
            self.PR = (1 - self.alpha) + self.alpha * np.dot(self.matrix, self.PR)

    # Output the words with their corresponding weights
    def printResult(self):
        word_pr = {}
        for i in range(len(self.PR)):
            word_pr[self.index_dict[str(i)]] = self.PR[i][0]
        res = sorted(word_pr.items(), key=lambda x: x[1], reverse=True)
        return res


if __name__ == '__main__':
    dataset_domain = "g60763"
    params = load_init_params(dataset_domain)
    word_index_path = f"./data/processed_data/{params['dataset']}/{dataset_domain}_word_index.txt"
    index_dict_path = f"./data/processed_data/{params['dataset']}/{dataset_domain}_index_dict.txt"
    matrix_path = f"./data/processed_data/{params['dataset']}/{dataset_domain}_mi_matrix_norm.csv"
    # Create the object; 0.85 is the damping factor alpha, 700 the iteration count
    tr = TextRank(word_index_path, index_dict_path, matrix_path, 3, 0.85, 700)
    tr.createMatrix()
    tr.calPR()
    results = tr.printResult()
    print(results)
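# --- Hedged illustration (not part of the pipeline) ----------------------
# A self-contained sketch of the PageRank-style update used in calPR above:
# PR <- (1 - alpha) + alpha * M @ PR, iterated to a fixed point. The 3x3
# transition matrix below is made up; in the real script M comes from the
# normalized MI matrix and the loop runs 700 times.
import numpy as np

alpha = 0.85
M = np.array([[0.0, 0.5, 1.0],
              [0.5, 0.0, 0.0],
              [0.5, 0.5, 0.0]])  # columns sum to 1
pr = np.ones((3, 1))
for _ in range(100):
    pr = (1 - alpha) + alpha * np.dot(M, pr)
print(pr.ravel())  # converged scores; printResult() would sort these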
def calcu_mi_kl(dataset, dataset_id):
    """
    Compute the mutual information between entities and save it for the
    next step, the spanning-tree construction.
    :param dataset: dataset name; currently Mafengwo (mafengwo) and Zhihu (zhihu)
    :param dataset_id: domain name within the dataset, currently:
                       Mafengwo: Beijing, Guiyang, Hangzhou, Kunming
                       Zhihu: Chinese NLP (nlpcn), Chinese/English NLP (nlp),
                              modern Chinese history (ZhongGuoJinDaiShi)
    :return: no return value; writes four files
    """
    # Initialize the minimum tf thresholds.
    params = load_init_params(dataset_id)
    min_enti_tf = params['min_enti_tf']
    min_feat_tf = params['min_feat_tf']

    data_path = os.path.join("./data/raw_data", dataset, dataset_id + "_0.csv")
    keywords_path = os.path.join("./data/raw_data", dataset, dataset_id + "_geo_noun.txt")
    feature_path = os.path.join("./data/raw_data", dataset, dataset_id + "_non_geo_noun.txt")

    # Read the dataset
    print("Reading the dataset")
    sentences = createUsers(data_path)
    # Read the entity set
    print("Reading the entity set")
    keywords_set = createWordList(keywords_path)
    # Read the feature (context word) set
    print("Reading the feature word set")
    feature_set = createWordList(feature_path)
    print(feature_set)

    # Count keyword frequencies, drop low-frequency keywords, update the keyword set
    print("[keywords] counting frequencies, dropping low-frequency keywords, updating the set")
    word_frequency, keywords_set = calcu_wordFrenq(sentences, keywords_set, min_enti_tf)
    # Count feature-word frequencies, drop low-frequency feature words, update the feature set
    print("[features] counting frequencies, dropping low-frequency feature words, updating the set")
    _, feature_set = calcu_wordFrenq(sentences, feature_set, min_feat_tf)
    feature_set.update(keywords_set)

    # Build the co-occurrence count matrix
    print("Building the co-occurrence count matrix")
    coocurrence_matrix, word_index, index_dict = create_coocurrence_matrix(
        sentences, keywords_set)
    # Compute the entity mutual-information matrix
    print("Computing the entity mutual-information matrix")
    mi_matrix, mi_matrix_norm = create_mi_matrix(keywords_set, word_frequency,
                                                 coocurrence_matrix, index_dict)
    # Build the entity-feature matrix
    print("Building the entity-feature matrix")
    enti_feat_matrix = create_enti_feat_matrix(sentences, keywords_set,
                                               word_index, feature_set)
    # Compute the entity KL-divergence matrix
    kl_matrix, _, _ = create_kl_matrix(keywords_set, index_dict, enti_feat_matrix)

    # Save the output files
    print("Saving the MI files")
    mi_matrix_path = os.path.join('./data/processed_data', dataset, dataset_id + '_mi_matrix.csv')
    mi_pd = pd.DataFrame(mi_matrix)
    mi_pd.to_csv(mi_matrix_path)
    mi_matrix_norm_path = os.path.join('./data/processed_data', dataset, dataset_id + '_mi_matrix_norm.csv')
    mi_pd = pd.DataFrame(mi_matrix_norm)
    mi_pd.to_csv(mi_matrix_norm_path)

    print("Saving the entity-feature file")
    enti_feat_matrix_path = os.path.join(
        './data/processed_data', dataset, dataset_id + '_entity_feature_matrix.csv')
    enti_feat_pd = pd.DataFrame(enti_feat_matrix)
    enti_feat_pd.to_csv(enti_feat_matrix_path)

    print("Saving the KL-divergence matrix file")
    kl_matrix_path = os.path.join('./data/processed_data', dataset, dataset_id + '_kl_matrix.csv')
    kl_pd = pd.DataFrame(kl_matrix)
    kl_pd.to_csv(kl_matrix_path)

    print("Saving the index files")
    word_index_path = os.path.join("./data/processed_data", dataset, dataset_id + "_word_index.txt")
    with open(word_index_path, "w", encoding="UTF-8") as file:
        file.write(json.dumps(word_index, ensure_ascii=False) + "\n")
    index_dict_path = os.path.join("./data/processed_data", dataset, dataset_id + "_index_dict.txt")
    with open(index_dict_path, "w", encoding="UTF-8") as file:
        file.write(json.dumps(index_dict, ensure_ascii=False) + "\n")
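# --- Hedged illustration (not part of the pipeline) ----------------------
# Illustrative stand-ins for the two quantities this step materializes.
# The actual formulas live in create_mi_matrix / create_kl_matrix, which
# are not shown here, so these are sketches of the standard definitions,
# not the repo's exact code.
import numpy as np

def pmi(cooc, freq_a, freq_b, total):
    """Pointwise mutual information of two entities from raw counts."""
    p_ab, p_a, p_b = cooc / total, freq_a / total, freq_b / total
    return np.log(p_ab / (p_a * p_b)) if p_ab > 0 else 0.0

def kl_divergence(p, q, eps=1e-12):
    """KL(p || q) between two feature rows, each normalized to sum to 1."""
    p, q = p / p.sum(), q / q.sum()
    return float(np.sum(p * np.log((p + eps) / (q + eps))))

print(pmi(cooc=30, freq_a=100, freq_b=60, total=1000))  # ~1.61
print(kl_divergence(np.array([5., 3., 2.]), np.array([4., 4., 2.])))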
    for i in range(n):
        if i == alter_id:
            continue
        if mi_matrix[topic_id][i] > 0:
            neighbor.add(i)
            # print(index_dict[str(i)], end=",")
            topic_pd = topic_pd + enti_feat_matrix[i]
    # print()
    return topic_pd


dataset = "mafengwo"
dataset_id = "Beijing"
params = load_init_params(dataset_id)
min_kl = params['min_kl']

word_index_path = os.path.join("./data", dataset, dataset_id + "_word_index.txt")
index_dict_path = os.path.join("./data", dataset, dataset_id + "_index_dict.txt")
mi_matrix_path = os.path.join('./data', dataset, dataset_id + '_mi_matrix.csv')
kl_matrix_path = os.path.join('./data', dataset, dataset_id + '_kl_matrix.csv')
entity_feature_matrix_path = os.path.join(
    './data', dataset, dataset_id + '_entity_feature_matrix.csv')

# Generate the edge and node information
nodes, edges = generate_nodes_edges(word_index_path, index_dict_path,
                                    mi_matrix_path, kl_matrix_path)
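# --- Hedged illustration (not part of the pipeline) ----------------------
# Toy run of the neighbor pooling in the loop above: an entity's topic
# distribution is the sum of the feature rows of its MI-connected
# neighbors, skipping alter_id. Both matrices here are made up.
import numpy as np

mi_matrix = np.array([[0.0, 0.4, 0.0],
                      [0.4, 0.0, 0.2],
                      [0.0, 0.2, 0.0]])
enti_feat_matrix = np.array([[1., 0., 2.],
                             [0., 3., 1.],
                             [2., 1., 0.]])
topic_id, alter_id = 1, 2  # pool around entity 1, ignoring entity 2
topic_pd = np.zeros(enti_feat_matrix.shape[1])
for i in range(len(mi_matrix)):
    if i == alter_id:
        continue
    if mi_matrix[topic_id][i] > 0:
        topic_pd += enti_feat_matrix[i]
print(topic_pd)  # [1. 0. 2.] -- only entity 0 qualifies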