def growth(self, tree, postNodes):
    if tree.isUniquePath():
        # a single-path tree: every combination of its nodes is a candidate,
        # with support equal to the smallest count along the combination
        nodeCombinations = []
        tree.getCombinationFromPath(nodeCombinations)
        for combination in nodeCombinations:
            support = self._getMinSupport(combination)
            if support is None or support < self.minsup:
                continue
            # generate pattern
            pattern = ([], support)
            for node in combination:
                pattern[0].append(node["name"])
            for node in postNodes:
                pattern[0].append(node)
            if len(pattern[0]) > 1:
                self.fp.append(pattern)
                # self._printPattern(pattern)
    else:
        for item in tree.itemTable:
            # generate pattern
            pattern = ([], tree.itemTable[item][0])
            pattern[0].append(item)
            for node in postNodes:
                pattern[0].append(node)
            # use >= so the threshold matches the unique-path branch above
            if len(pattern[0]) > 1 and pattern[1] >= self.minsup:
                self.fp.append(pattern)
                # self._printPattern(pattern)
            # construct the conditional pattern base and recurse on it
            baseSet = []
            tree.getConditionalPatternBase(item, baseSet)
            tmpTree = FPTree.FPTree(baseSet, minsup=self.minsup)
            tmpTree.build()
            if not tmpTree.isEmpty():
                self.growth(tmpTree, pattern[0])
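# A self-contained sketch of the unique-path step above: on a single-path
# tree, every non-empty combination of path nodes is a candidate pattern
# whose support is the smallest count along the combination (what
# getCombinationFromPath plus _getMinSupport compute). The "count" key and
# the demo function are illustrative assumptions, not the FPTree node API.
from itertools import combinations

def _demo_unique_path_combinations():
    path = [{"name": "a", "count": 8}, {"name": "b", "count": 5}, {"name": "c", "count": 2}]
    for r in range(1, len(path) + 1):
        for combo in combinations(path, r):
            support = min(node["count"] for node in combo)  # weakest link sets the support
            print([node["name"] for node in combo], support)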
def get_rules(keywordfile):
    print("getting rules...")
    testcase = load_data(keywordfile)
    tree = FPTree.FPTree(testcase, minsup=2)
    tree.build()
    algorithm = FPGrowth1(minsup=2)
    algorithm.growth(tree, [])
    res = sorted(algorithm.fp, key=lambda d: d[1], reverse=True)
    # rule_type is expected to be defined at module level
    with open("rule_" + rule_type + "_TF_IDF.txt", 'w', encoding='utf-8') as f:
        for rule in res:
            f.write(str(rule) + "\n")
    print("Association rules written to file successfully")
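# A hypothetical invocation of get_rules(); the keyword file name is a
# placeholder and assumes the module-level rule_type variable has been set.
def _demo_get_rules():
    get_rules("keywords_tfidf.txt")  # placeholder path, not from the source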
def test():
    # testcase = [[["i2","i1","i5"],1],[["i2","i4"],1],[["i2","i3"],1],[["i2","i1","i4"],1],[["i1","i3"],1],[["i2","i3"],1],[["i1","i3"],1],[["i2","i1","i3","i5"],1],[["i2","i1","i3"],1]]
    testcase = [[["a", "b"], 1], [["b", "c", "d"], 1], [["a", "c", "d", "e"], 1],
                [["a", "d", "e"], 1], [["a", "b", "c"], 1], [["a", "b", "c", "d"], 1],
                [["a"], 1], [["a", "b", "c"], 1], [["a", "b", "d"], 1], [["b", "c", "e"], 1]]
    # testcase = [(["i1","i2"],1),(["i3"],1)]
    tree = FPTree.FPTree(testcase, minsup=2)
    tree.build()
    algorithm = FPGrowth(minsup=2)
    algorithm.growth(tree, [])
    res = sorted(algorithm.fp, key=lambda d: d[1], reverse=True)
    for rule in res:
        print(rule)
def tjms_tree(paths, rect1, rect_dict):
    # build the conditional FP-tree from the prefix paths
    treep = FPTree.FPTree()
    for path in paths:
        condition_node = path.pop(0)
        path = list(filter(lambda v: v in rect1, path))
        path.sort(key=lambda x: rect_dict[x])
        point = treep.root
        for item in path:
            next_point = point.search(item)
            if next_point:
                next_point.count += condition_node[1]
            else:
                next_point = FPNode.FPNode(treep, item, condition_node[1])
                point.add(next_point)
                treep._update_route(next_point)
            point = next_point
    return treep
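# Illustrative sketch of the prefix-path shape tjms_tree appears to assume:
# the leading element carries the accumulated count at index 1, and the
# remaining elements are plain item labels. The tuple form below is an
# assumption for illustration, not the real FPTree.prefix_paths output.
def _demo_prefix_path_shape():
    example_path = [("e", 3), "a", "c", "d"]  # hypothetical prefix path
    condition_node = example_path.pop(0)
    count = condition_node[1]   # 3 transactions flow through this path
    items = example_path        # ["a", "c", "d"]
    print(count, items)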
def find_frequent_items(data_set, n, include_support=False):
    # n = 20000

    # get the 1-itemsets and their support counts
    def get_item1(data_set):
        rect = {}
        for line in data_set:
            for item in line:
                rect[item] = rect.get(item, 0) + 1
        return rect

    # rank the items in descending order of support
    def descending(rect):
        tp_list = []
        tp_list.extend(rect.keys())
        tp_list.sort(key=lambda x: rect[x], reverse=True)
        tp_dict = {}
        i = 1
        for item in tp_list:
            tp_dict[item] = i
            i += 1
        return tp_dict

    # attach counts to the ordered items
    def count_order(rect, tp_dict):
        for elem in tp_dict:
            if elem in rect:
                tp_dict[elem] = rect[elem]
        return tp_dict

    # 2. get the 1-itemset counts
    ys_item1 = get_item1(data_set)

    # 3. sort the items by count in descending order
    tp_list = []
    tp_list.extend(ys_item1.keys())
    tp_list.sort(key=lambda x: ys_item1[x], reverse=True)
    tp_dict = {}
    i = 1
    for item in tp_list:
        tp_dict[item] = i
        i += 1
    ys_item1_order = count_order(rect=ys_item1, tp_dict=tp_dict)

    # 4. split the items into a frequent (big) table and an infrequent (small) table
    min_sup = 0.01
    big_table = {}
    small_table = {}
    for item in tp_list:
        if ys_item1[item] >= min_sup * n:
            big_table[item] = ys_item1[item]
        else:
            small_table[item] = ys_item1[item]

    # 5. build the FP-tree from big_table
    tree = FPTree.FPTree()
    for line in data_set:
        line = list(filter(lambda v: v in big_table, line))
        line.sort(key=lambda x: big_table[x], reverse=True)
        tree.add(line)

    # ===== mine the frequent itemsets =====
    # print('start mining:')
    def pf_item(paths, min_sup):
        rect = {}
        for path in paths:
            condition_node = path.pop(0)
            for item in path:
                rect[item] = rect.get(item, 0) + condition_node[1]
        rect1 = {}
        for item in rect:
            if rect[item] >= min_sup:
                rect1[item] = rect[item]
        rect_dict = descending(rect1)
        return rect1, rect_dict

    # build the conditional FP-tree
    def tjms_tree(paths, rect1, rect_dict):
        treep = FPTree.FPTree()
        for path in paths:
            condition_node = path.pop(0)
            path = list(filter(lambda v: v in rect1, path))
            path.sort(key=lambda x: rect_dict[x])
            point = treep.root
            for item in path:
                next_point = point.search(item)
                if next_point:
                    next_point.count += condition_node[1]
                else:
                    next_point = FPNode.FPNode(treep, item, condition_node[1])
                    point.add(next_point)
                    treep._update_route(next_point)
                point = next_point
        return treep

    # recursively grow itemsets that end with the given suffix
    def find_with_suffix(tree, suffix):
        for item, node in tree.items():
            support = sum(n.count for n in node)
            # compare against the absolute count threshold (min_sup is a fraction)
            if support >= min_sup * n and item not in suffix:  # item is not already in suffix
                found_set = [item] + suffix  # prepend item to suffix
                yield (found_set, support) if include_support else found_set
                # build the conditional tree and recursively mine its frequent itemsets
                rect1, rect_dict = pf_item(tree.prefix_paths(item), min_sup * n)
                cond_tree = tjms_tree(tree.prefix_paths(item), rect1, rect_dict)
                for s in find_with_suffix(cond_tree, found_set):
                    yield s

    for itemset in find_with_suffix(tree, []):
        yield itemset
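# Minimal usage sketch for find_frequent_items on toy data; assumes the
# FPTree/FPNode modules referenced above are importable. The transactions
# and n=len(data) are illustrative only.
def _demo_find_frequent_items():
    data = [["a", "b"], ["b", "c", "d"], ["a", "c", "d", "e"],
            ["a", "d", "e"], ["a", "b", "c"], ["a", "b", "c"]]
    for itemset, support in find_frequent_items(data, n=len(data), include_support=True):
        print(itemset, support)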
def get_recommend_based_on_enemies(self, enemy, min_support):
    """
    :param enemy: (str) name of one enemy in current enemies
    :param min_support: (float) minimum support as a fraction of matches;
        converted to an absolute count below
    :return: (list) candidate AssociationRules of heroes who win the enemy
    """
    df_lose = self.df_lose
    df_win = self.df_win
    df_match = self.df_match

    # keep only the matches in which the enemy appears on the losing side
    indexes = \
        df_lose.index[df_lose.isin([enemy]).any(axis=1)].tolist()
    # print("indexes {}".format(indexes))
    df_lose = df_lose.iloc[indexes, :]
    df_win = df_win.iloc[indexes, :]
    heroes_win = df_win.reset_index(drop=True).values.tolist()
    print("current heroes_win len {}".format(len(heroes_win)))

    min_support = int(round(len(heroes_win) * min_support))
    fp_tree = FPTree(heroes_win, min_support)
    freq_heroes = fp_tree.gen_freq_itemsets()
    print("win_heroes len: {}".format(len(freq_heroes)))

    # only single heroes are candidates for the rule's right-hand side
    candidates = \
        list(filter(lambda hero_set: len(hero_set) == 1, freq_heroes))
    print("enemies freq hero num: {}".format(len(candidates)))

    all_candidate_rules = list()
    for hero in candidates:
        # print("hero: {}, enemy: {}".format(hero, enemy))
        # radiant_win_indexes = \
        #     (df_match["winner"] == 1) & \
        #     (df_match.loc[:, "radiant_hero_1":"radiant_hero_5"].isin(hero).any(axis=1)) & \
        #     (df_match.loc[:, "dire_hero_1":"dire_hero_5"].isin([enemy]).any(axis=1))
        #
        # dire_win_indexes = \
        #     (df_match["winner"] == -1) & \
        #     (df_match.loc[:, "radiant_hero_1":"radiant_hero_5"].isin([enemy]).any(axis=1)) & \
        #     (df_match.loc[:, "dire_hero_1":"dire_hero_5"].isin(hero).any(axis=1))
        #
        # print("radiant_win_indexes sum : {}".format(radiant_win_indexes.sum()))
        # print("dire_win_indexes sum {}".format(dire_win_indexes.sum()))
        # df_match = df_match.loc[(radiant_win_indexes | dire_win_indexes)]
        # print("df_match==== {}".format(df_match))

        hero = hero[0].strip(" ")
        if hero in (self.allies + self.enemies):
            continue
        rule = AssociationRule(lhs=[enemy], rhs=[hero], rule_type="enemies")
        rule.compute_metrics(df_win=df_win, df_lose=df_lose, df_match=df_match)
        all_candidate_rules.append(rule)

    return all_candidate_rules
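# Hypothetical call site; the recommender instance, the hero name, and the
# min_support value are placeholders, not part of the source.
def _demo_recommend_based_on_enemies(recommender):
    rules = recommender.get_recommend_based_on_enemies(enemy="Axe", min_support=0.01)
    for rule in rules:
        print(rule)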
df_radiant_win_dire_heroes = pd.read_csv("../data/processed_data/radiant_win_dire_heros.csv")

df_lose_heroes = pd.concat([df_dire_win_radiant_heroes, df_radiant_win_dire_heroes],
                           axis=0, sort=False)
df_lose_heroes = df_lose_heroes.reset_index(drop=True)
# df_lose_heroes.to_csv("./inter_data/df_lose_heroes.csv")

df_all_heroes = pd.concat([df_win_heroes, df_lose_heroes], axis=0)
# df_all_heroes.to_csv("./inter_data/df_all_heroes.csv")
all_heroes = df_all_heroes.reset_index(drop=True).values.tolist()

fp_tree = FPTree(transactions=all_heroes,
                 min_support_count=int(round(min_support_allies * len(all_heroes))))
freq_allies = fp_tree.gen_freq_itemsets()

df_radiant_win_match = pd.read_csv("../data/processed_data/radiant_win_match.csv")
df_dire_win_match = pd.read_csv("../data/processed_data/dire_win_match.csv")
df_match = pd.concat([df_radiant_win_match, df_dire_win_match], axis=0, sort=False)
df_match = df_match.reset_index(drop=True)
# df_match.to_csv("./inter_data/df_match.csv")
print("df_match : {}".format(df_match))
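# Illustrative follow-up (not in the source): inspect the mined ally
# itemsets; the pair-size filter is an assumption about which rules are wanted.
ally_pairs = [itemset for itemset in freq_allies if len(itemset) == 2]
print("frequent ally pairs: {}".format(len(ally_pairs)))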