def group_all(self, best_relatives_num=12):
    """
    Combine the cleaned clustering results of every relatives-group size.

    For each size from max_relatives_nums down to min_relatives_nums the
    pickled result "data/group<size>.csv.pkl" is passed through group_clean
    (per the original notes this drops groups contained in a larger group),
    and all cleaned results are written to group_all_path. The distinct
    groups are stored in self.group_all_2, and the cleaned result of the
    manually chosen best size is written to group_best_path and stored in
    self.group_best.

    :param best_relatives_num: relatives-group size whose result is treated
        as the best one. It is chosen by manual review; the default 12 is
        the value that used to be hard-coded here.
    :return: None
    """
    # Walk the sizes from the largest down to the smallest (inclusive).
    relatives_nums = range(max_relatives_nums, min_relatives_nums - 1, -1)
    group_name = ["group" + str(num) for num in relatives_nums]
    group_all = [
        group_clean(os.path.join("data", "group" + str(num) + ".csv.pkl"))
        for num in relatives_nums
    ]
    # Output the clustering results of every relatives-group size.
    write_csv(group_name, group_all_path, group_all)

    # Count how many times each group occurs across all sizes. Groups are
    # converted to tuples so they can be used as dictionary keys.
    group_all_dict = {}
    for group_list in group_all:
        for group in group_list:
            key = tuple(group)
            group_all_dict[key] = group_all_dict.get(key, 0) + 1
    print("group_all_dict:", group_all_dict)

    # The dictionary keys are unique, so no group appears twice here.
    self.group_all_2 = list(group_all_dict)
    print("self.group_all_2:", self.group_all_2)

    # The size with the highest information utilisation is found by manual
    # review; the result for that size is taken as the best result.
    best_name = "group" + str(best_relatives_num)
    best_path = os.path.join("data", best_name + ".csv.pkl")
    group_best = [group_clean(best_path)]
    write_csv([best_name], group_best_path, group_best)
    self.group_best = group_best[0]
    print(self.group_best)
# Example no. 2
# 0
def merge_loop(double_set, root_name, file=None):
    """
    Merge groups repeatedly until no further merge is possible.

    :param double_set: strongly correlated pairwise combinations; used as the
        starting groups and passed to merge_group on every round
    :param root_name: list of word roots, used to turn indices back into
        words before the result is dumped
    :param file: target path for dumping the clustering result; nothing is
        written when it is None
    :return: tuple (member count, final groups). The member count is the size
        of the first group yielded by the final collection. When double_set
        is empty the result is (0, set()) instead of an IndexError.
    """
    best_set = set()
    old_set = double_set
    num_list = []
    count_list = []
    group_list = []
    while len(old_set) > 0:
        # old_set holds the groups that still need to be merged further.
        # next(iter(...)) reads one group without copying the collection.
        member_num = len(next(iter(old_set)))
        group_count = len(old_set)
        print('成员数:', member_num)  # number of members per group in old_set
        print('个数:', group_count)   # number of groups in old_set
        num_list.append(member_num)
        count_list.append(group_count)
        group_list.append(old_set)
        best_set = old_set
        # merge_group returns the newly formed groups; keep merging those.
        old_set = merge_group(old_set, double_set)
    # An empty old_set means the merge converged: the largest groups are
    # reached and nothing more can be merged.
    if file is not None:
        group_list = index_2_word(root_name, group_list)
        write_csv(['成员数', '个数', '团'], file, [num_list, count_list, group_list])
        save_pickle(file + '.pkl', group_list)
    if len(best_set) == 0:
        # Empty input: the loop never ran, so there is no group to measure.
        return 0, best_set
    return len(next(iter(best_set))), best_set
 def search_relatives(self):
     """
     Score every pairwise combination, then build each root's relatives group.

     The scores come from calculate_correlation (the original notes describe
     them as mutual information). The combinations and their scores are
     written to correlation_path, and the value returned by write_csv is
     handed to create_relatives together with max_relatives_nums.

     :return: None; sets self.combine_name and self.relatives_list
     """
     scores = calculate_correlation(self.combine_index, self.combine_fre, self.root_fre)
     # Turn the indices of single roots and combinations into words.
     self.combine_name = index_2_word(self.root_name, self.combine_index)
     # The original note says the scores are sorted in descending order
     # before being written to csv.
     # NOTE(review): no sort is visible here; confirm write_csv does it.
     header = ['组合', '关联度系数']
     columns = [self.combine_name, scores]
     data = write_csv(header, correlation_path, columns)
     # Collect the relatives group of every symptom.
     self.relatives_list = create_relatives(self.root_name, data, max_relatives_nums)
     # The relatives groups are stored as nested lists; a dict mapping each
     # variable to its relatives list might be a better fit.
     print("relatives_list", self.relatives_list)