def test():
    """Run the FP-growth pipeline once on fpGrowth's built-in sample data.

    Loads the sample transactions, converts them with ``createInitSet``,
    builds a tree with a minimum support of 3, displays the tree, and then
    calls ``mineTree`` to fill a local result list.  Nothing is returned;
    the collected list is discarded when the function exits.
    """
    min_support = 3
    transactions = fpGrowth.loadSimpDat()
    tree, header_table = fpGrowth.createTree(
        fpGrowth.createInitSet(transactions), min_support
    )
    tree.disp()

    # mineTree receives this list as its last argument; it starts empty.
    frequent_itemsets = []
    fpGrowth.mineTree(tree, header_table, min_support, set(), frequent_itemsets)
# Walk-through of the fpGrowth module, written in the style of an interactive
# session: bare expressions such as ``simpDat`` echo their value in a REPL and
# have no effect when the file is executed as a script.
from importlib import reload

import fpGrowth

# Hand-build a small tree to exercise treeNode and its disp() method.
rootNode = fpGrowth.treeNode('pyramid', 9, None)
rootNode.children['eye'] = fpGrowth.treeNode('eye', 13, None)
rootNode.disp()
rootNode.children['phoenix'] = fpGrowth.treeNode('phoenix', 3, None)
rootNode.disp()

# Re-import the module (useful after editing fpGrowth.py during a session).
reload(fpGrowth)
simpDat = fpGrowth.loadSimpDat()
simpDat
initSet = fpGrowth.createInitSet(simpDat)
initSet

# Build the FP-tree with a minimum support of 3.
myFPtree, myHeaderTab = fpGrowth.createTree(initSet, 3)
myFPtree.disp()

reload(fpGrowth)
# Look up prefix paths for a few items; the second element of each header
# table entry is what findPrefixPath takes as its starting node argument.
fpGrowth.findPrefixPath('x', myHeaderTab['x'][1])
fpGrowth.findPrefixPath('z', myHeaderTab['z'][1])
fpGrowth.findPrefixPath('r', myHeaderTab['r'][1])

reload(fpGrowth)
# mineTree receives freqItems as its last argument; it starts empty.
freqItems = []
fpGrowth.mineTree(myFPtree, myHeaderTab, 3, set(), freqItems)
freqItems

# Example: mining a news-site clickstream.
# Each line of kosarak.dat is split on whitespace into one transaction.
# The file is read inside a ``with`` block so the handle is closed promptly
# instead of being left to the garbage collector.
with open('kosarak.dat') as kosarak_file:
    parsedDat = [line.split() for line in kosarak_file]
initSet = fpGrowth.createInitSet(parsedDat)
myFPtree, myHeaderTab = fpGrowth.createTree(initSet, 100000)
import fpGrowth

# Step 1: load the module's sample transactions and convert them into the
# form createTree takes.  (Print simpDat / initSet here to inspect them.)
simpDat = fpGrowth.loadSimpDat()
initSet = fpGrowth.createInitSet(simpDat)

# Step 2: build the tree and its header table with a minimum support of 3.
# Call myFpTree.disp() or print myHeaderTab to inspect the result.
myFpTree, myHeaderTab = fpGrowth.createTree(initSet, 3)

# Step 3: prefix paths for item 'x', starting from the second element of its
# header-table entry.
myCondPat = fpGrowth.findPrefixPath('x', myHeaderTab['x'][1])

# Step 4: run mineTree; freqItems is handed in empty as the last argument.
# The call's return value is kept in myfpGrowth.
freqItems = []
myfpGrowth = fpGrowth.mineTree(myFpTree, myHeaderTab, 3, set(), freqItems)
# -*- coding:utf-8 -*-
import fpGrowth

# Basic steps of the algorithm:
# 1. Create the FP-tree data structure.
# 2. A first pass over the dataset counts how often each item occurs; items
#    that do not meet the minimum support are dropped.
# 3. Sort the items inside each transaction (i.e. each record) by their
#    absolute frequency.
# 4. Build the FP-tree.  Start from the empty set and keep adding frequent
#    itemsets: read the itemset of each transaction and add it to an
#    existing path.
#    - If the element already exists in the tree, increase its count.
#    - If the path does not exist, create a new path.

# Test of the FP-tree data structure (disabled):
# rootNode = fpGrowth.treeNode('pyramid', 9, None)
# rootNode.children['eye'] = fpGrowth.treeNode('eye', 13, None)
# rootNode.children['phoenix'] = fpGrowth.treeNode('phoenix', 3, None)
# rootNode.disp()

# Build the tree from the sample data with a minimum support of 3.
simData = fpGrowth.loadSimpDat()
initSet = fpGrowth.createInitSet(simData)
myFpTree, myHeaderTab = fpGrowth.createTree(initSet, 3)
myFpTree.disp()

# Prefix paths for item 'r'.
myCondPats = fpGrowth.findPrefixPath('r', myHeaderTab['r'][1])
print('myCondPats is', myCondPats)

# Run mineTree with an initially empty result list, then print the list.
freqItems = []
fpGrowth.mineTree(myFpTree, myHeaderTab, 3, set(), freqItems)
print('频繁项集 is', freqItems)
# -*- coding: utf-8 -*-
"""
Created on 2016-06-22

@author: xiaoyuan
"""
import fpGrowth

# Disabled walk-through: build a tree data structure by hand, then build one
# from the sample data and look up the prefix paths for item 'x'.
#
# rootNode = fpGrowth.treeNode('pyramid', 9, None)
# rootNode.children['eye'] = fpGrowth.treeNode('eye', 13, None)
# rootNode.children['phoenix'] = fpGrowth.treeNode('phoenix', 3, None)
# rootNode.disp()
#
# simDat = fpGrowth.loadSimpDat()
# initSet = fpGrowth.createInitSet(simDat)
# myFPtree, myHeaderTab = fpGrowth.createTree(initSet, 3)
# myFPtree.disp()
# condPattBases = fpGrowth.findPrefixPath('x', myHeaderTab['x'][1])

# Load the sample data and pass it to the module's fpGrowth() entry point,
# keeping whatever it returns in freqItems.
dateSet = fpGrowth.loadSimpDat()
freqItems = fpGrowth.fpGrowth(dateSet)
# Script entry point: prepares the word data for one COG category, builds an
# FP-tree from it with fpGrowth, runs mineTree, and computes the entropy of
# the two class proportions.  The helpers called here (bacteria_names,
# build_category_list, write_words, entropy) and the names `time`,
# `start_time` and `fpGrowth` are defined outside this block.
if __name__ == '__main__':
    # Collect the names of all bacteria in bactTaxa_Habitant that are animal
    # or marine.
    bacteria_names()
    category = "Lipid transport and metabolism"
    # List of all COGs that belong to the category.
    function_cogs_list = build_category_list(category)
    # Per the original author's note: dict = {bac_name, {words_list, label}}.
    # NOTE(review): this name shadows the builtin `dict` for the rest of the
    # module.
    dict, animal_counter, marine_counter = write_words(function_cogs_list)
    # Minimum support threshold passed to both createTree and mineTree.
    min_sup = 100
    # NOTE(review): the file is opened for writing but not closed anywhere in
    # the visible part of this block -- confirm it is closed further down.
    output = open("output.txt", "w")
    simpDat = fpGrowth.loadSimpDat(dict)
    initSet = fpGrowth.createInitSet(simpDat)
    myFPtree, myHeaderTab = fpGrowth.createTree(initSet, min_sup)
    # With an empty/falsy header table: report it, print the elapsed time,
    # and stop with exit status 1.
    if not myHeaderTab:
        print("header table empty")
        # presumably start_time was recorded with time.time() at start-up --
        # TODO confirm
        print("--- %s seconds ---" % (time.time() - start_time))
        exit(1)
    # mineTree receives freqItems as its last argument; it starts empty.
    freqItems = []
    fpGrowth.mineTree(myFPtree, myHeaderTab, min_sup, set([]), freqItems)
    # Share of marine and of animal entries among all counted entries.
    # NOTE(review): both divisions fail if the two counters sum to zero.
    class0 = marine_counter / (marine_counter + animal_counter)
    class1 = animal_counter / (marine_counter + animal_counter)
    Hc = entropy(class0, class1)
    dict_IG_freq = {}
    # Calculate the IG of all the frequent itemsets; this also returns how
    # many times each frequent itemset appears under label 1 and label 0.
@author: xiaoyuan ''' import fpGrowth # #testing create a tree datastructor # rootNode = fpGrowth.treeNode('pyramid',9,None) # rootNode.children['eye'] = fpGrowth.treeNode('eye',13,None) # rootNode.children['phoenix'] = fpGrowth.treeNode('phoenix',3,None) # rootNode.disp() # # # simDat = fpGrowth.loadSimpDat() # initSet = fpGrowth.createInitSet(simDat) # myFPtree,myHeaderTab = fpGrowth.createTree(initSet,3) # myFPtree.disp() # condPattBases = fpGrowth.findPrefixPath('x',myHeaderTab['x'][1]) dateSet = fpGrowth.loadSimpDat() freqItems = fpGrowth.fpGrowth(dateSet)
# -*- coding:utf-8 -*-
import fpGrowth

# Basic steps of the algorithm:
# 1. Create the FP-tree data structure.
# 2. A first pass over the dataset counts how often each item occurs; items
#    that do not meet the minimum support are dropped.
# 3. Sort the items inside each transaction (i.e. each record) by their
#    absolute frequency.
# 4. Build the FP-tree.  Start from the empty set and keep adding frequent
#    itemsets: read the itemset of each transaction and add it to an
#    existing path.
#    - If the element already exists in the tree, increase its count.
#    - If the path does not exist, create a new path.

# Test of the FP-tree data structure (disabled):
# rootNode = fpGrowth.treeNode('pyramid', 9, None)
# rootNode.children['eye'] = fpGrowth.treeNode('eye', 13, None)
# rootNode.children['phoenix'] = fpGrowth.treeNode('phoenix', 3, None)
# rootNode.disp()

# Build the tree from the sample data with a minimum support of 3.
simData = fpGrowth.loadSimpDat()
initSet = fpGrowth.createInitSet(simData)
myFpTree, myHeaderTab = fpGrowth.createTree(initSet, 3)
myFpTree.disp()

# Prefix paths for item 'r'.
myCondPats = fpGrowth.findPrefixPath('r', myHeaderTab['r'][1])
print('myCondPats is', myCondPats)

# Run mineTree with an initially empty result list, then print the list.
freqItems = []
fpGrowth.mineTree(myFpTree, myHeaderTab, 3, set(), freqItems)
print('频繁项集 is', freqItems)