def relatives(list_name, data, relatives_num):
    """Collect, for every item, the '药物' entries that contain it.

    For each element of ``list_name`` a list is built holding every value of
    the ``data['药物']`` column for which ``item in value`` is true, in column
    order. The per-item lists are then handed to ``utils.cut_by_num``.

    :param list_name: iterable of items to look for
    :param data: table exposing a ``'药物'`` column (a pandas DataFrame in the
        original usage); each value must support the ``in`` operator
    :param relatives_num: forwarded unchanged to ``utils.cut_by_num``
        (presumably the maximum number of relatives kept per item -- confirm
        against ``utils``)
    :return: the result of ``utils.cut_by_num`` applied to the per-item lists
    """
    # Read the column once and walk it by position. The previous
    # ``data['药物'][i]`` lookup was label-based, so it only worked when the
    # frame carried a default 0..n-1 index, and it repeated a slow per-cell
    # lookup for every (item, row) combination.
    entries = list(data['药物'])

    # "Relatives group" of each item: all entries that mention it.
    relatives_list = [
        [words for words in entries if item in words]
        for item in list_name
    ]
    return utils.cut_by_num(relatives_list, relatives_num)
def cluster_main2(relatives_list, list_name):
    """Run the grouping pipeline once for each group size from 3 to 8.

    The relatives lists are passed through ``duplicate_removal`` and then
    converted so that numbers stand in for the list items
    (``utils.word_2_num``). For every group size the numeric lists are cut
    with ``utils.cut_by_num``, filtered with ``del_by_correlation``, turned
    into pairs with ``create_double_set`` and given to ``merge_loop`` together
    with the path ``data/group<size>.csv``. The ratio of ``merge_loop``'s
    first result to the group size is printed for each size.

    :param relatives_list: relatives lists, forwarded to ``duplicate_removal``
    :param list_name: list of all item names, used by the word/number helpers
        and by ``merge_loop``
    :return: None; results are only printed (and whatever ``merge_loop`` does
        with the CSV path -- not visible from here)
    """
    deduplicated = duplicate_removal(relatives_list, list_name)
    # Replace the items in the lists with numbers.
    encoded = utils.word_2_num(list_name, deduplicated)

    for group_num in range(3, 9):
        trimmed = utils.cut_by_num(encoded, group_num)
        filtered = del_by_correlation(trimmed)
        # NOTE(review): the decoded words are not used later in this
        # function; the call is kept in case num_2_word has side effects.
        decoded_words = utils.num_2_word(list_name, filtered)

        # Build the two-element sets (pairs).
        pair_sets = create_double_set(filtered)
        csv_path = 'data/group' + str(group_num) + '.csv'
        max_num, best_set = merge_loop(pair_sets, list_name, csv_path)

        # Information utilization ratio.
        utilization = max_num / group_num
        print(max_num, '/', group_num, '=', utilization)
def relatives_2(list_name, data, relatives_num):
    """Build the relatives group of every item from the '组合' column.

    Per the original author's note, the groups are derived from
    mutual-information results. Every value of ``data['组合']`` is indexed
    with ``[0]`` and ``[1]``; the whole value is appended to the group of each
    of those two members, in column order.

    :param list_name: list of all words; group ``k`` of the result belongs to
        ``list_name[k]`` (names must be hashable)
    :param data: DataFrame described by the original note as holding the
        columns {组合 (combination), 关联度系数 (correlation coefficient)};
        only ``'组合'`` is read here
    :param relatives_num: limit on the number of relatives, forwarded
        unchanged to ``utils.cut_by_num``
    :return: the result of ``utils.cut_by_num`` applied to the list of
        per-item groups (one list per entry of ``list_name``)
    :raises ValueError: if a member of a combination is not in ``list_name``
    """
    # Map each name to its first position once, instead of scanning
    # list_name with .index() twice per row. setdefault keeps the first
    # occurrence, matching list.index() for duplicated names.
    first_position = {}
    for position, name in enumerate(list_name):
        first_position.setdefault(name, position)

    relatives_list = [[] for _ in list_name]

    # Iterate the column by position; the previous ``data['组合'][i]`` lookup
    # was label-based and required a default 0..n-1 index.
    for words in data['组合']:
        for member in (words[0], words[1]):
            slot = first_position.get(member)
            if slot is None:
                # Same exception type list.index() raised for unknown names.
                raise ValueError('{!r} is not in list'.format(member))
            relatives_list[slot].append(words)

    return utils.cut_by_num(relatives_list, relatives_num)