def mineTweets(inputfile, minSup=5):
    """Mine frequent itemsets from a text file with FP-growth.

    Every line of ``inputfile`` is stripped, tokenised with ``textParse`` and
    treated as one transaction.

    Args:
        inputfile: Path of the text file to read, one transaction per line.
        minSup: Threshold passed unchanged to ``fpgrowth.createTree`` and
            ``fpgrowth.mineTree`` (presumably the minimum support count --
            confirm against the fpgrowth module).

    Returns:
        The list that ``fpgrowth.mineTree`` populates with frequent itemsets.
    """
    # A context manager guarantees the handle is closed; the previous
    # ``open(inputfile).readlines()`` left that to the garbage collector.
    with open(inputfile) as tweetFile:
        parsedList = [textParse(line.strip()) for line in tweetFile]

    initSet = fpgrowth.createInitSet(parsedList)
    myFPTree, myHeaderTab = fpgrowth.createTree(initSet, minSup)

    # mineTree reports its results by appending to the list it is given.
    myFreqList = []
    fpgrowth.mineTree(myFPTree, myHeaderTab, minSup, set(), myFreqList)
    return myFreqList
def gen_rules(filepath, args):
    """Generate single-consequent association rules from a transaction file.

    Args:
        filepath: Path to a UTF-8 text file; each line is split on whitespace
            into one transaction.
        args: Object exposing ``support`` (multiplied by the number of
            transactions to obtain the threshold handed to fpgrowth) and
            ``confidence`` (forwarded as ``minConf``).

    Returns:
        An empty list when the file does not exist or holds 100 or fewer
        transactions. Otherwise the rules returned by
        ``fpgrowth.generateRules`` whose element at index 1 has length 1,
        sorted in descending order of the element at index 2 (presumably
        rules are ``(antecedent, consequent, confidence)`` tuples -- confirm
        against the fpgrowth module).
    """
    if not os.path.exists(filepath):
        print(filepath, 'not exists,please set the filepath')
        # Stop here: previously execution fell through to open() and died
        # with FileNotFoundError right after printing the message above.
        return []

    print('\n\n\n')
    print(
        '------------------------------处理文件%s-----------------------------------'
        % (os.path.basename(filepath)))

    with open(filepath, encoding='utf-8') as f:
        dataSet = [line.split() for line in f]

    # With 100 or fewer transactions no rules are generated.
    if len(dataSet) <= 100:
        print(
            '-----------------------交易数据小于100条,不生成rules-------------------------'
        )
        return []

    # Turn the relative support into the threshold used for this data set.
    n = args.support * len(dataSet)

    initSet = fpgrowth.createInitSet(dataSet)
    myFPtree, myHeaderTab = fpgrowth.createFPtree(initSet, n)
    freqItems = []
    fpgrowth.mineFPtree(myFPtree, myHeaderTab, n, set(), freqItems)

    # Compute support values of the frequent itemsets; the empty itemset is
    # given support 1.0 explicitly before rule generation.
    suppData = fpgrowth.calSuppData(myHeaderTab, freqItems, len(dataSet))
    suppData[frozenset()] = 1.0

    rules = fpgrowth.generateRules(freqItems, suppData,
                                   minConf=args.confidence)

    # Keep only rules whose second element contains exactly one item, then
    # order them from highest to lowest third element.
    single_consequent = [rule for rule in rules if len(rule[1]) == 1]
    filter_rules = sorted(single_consequent, key=lambda p: p[2], reverse=True)
    print('number of association rules:\n', len(filter_rules))
    return filter_rules
# Earlier experiment on the small sample data, kept disabled:
# myFPtree.disp()
# print fpgrowth.findPrefixPath('z', myHeaderTab)
# print fpgrowth.findPrefixPath('r', myHeaderTab)
# print fpgrowth.findPrefixPath('t', myHeaderTab)
# freqItems = []
# fpgrowth.mineFPtree(myFPtree, myHeaderTab, 3, set([]), freqItems)
# for x in freqItems:
#     print x

# NOTE(review): the label below says "kosarak" but the path that is opened
# points at webdocs.dat -- confirm which data set is intended.
'''kosarak data'''
start = time.time()
n = 20000  # threshold passed to createFPtree / mineFPtree

# Raw string: the original literal relied on the unrecognised escapes "\d"
# and "\w", which emit a SyntaxWarning on Python 3.12+. The resulting path
# value is unchanged.
with open(r"E:\dvancedos\database\webdocs.dat", "rb") as f:
    # Opened in binary mode, so every item is a bytes object.
    parsedDat = [line.split() for line in f]

initSet = fpgrowth.createInitSet(parsedDat)
myFPtree, myHeaderTab = fpgrowth.createFPtree(initSet, n)
freqItems = []
fpgrowth.mineFPtree(myFPtree, myHeaderTab, n, set(), freqItems)
for x in freqItems:
    print(x)
# Elapsed wall-clock time covers loading, tree building, mining and printing.
print(time.time() - start, 'sec')

# Compute support values of freqItems; the empty itemset gets support 1.0.
suppData = fpgrowth.calSuppData(myHeaderTab, freqItems, len(parsedDat))
suppData[frozenset()] = 1.0
for x, v in suppData.items():
    print(x, v)

# generateRules receives the itemsets as frozensets; its return value is
# discarded here.
freqItems = [frozenset(x) for x in freqItems]
fpgrowth.generateRules(freqItems, suppData)
# -*- coding: utf-8 -*-
"""
Demo: mine frequent itemsets with FP-growth and derive association rules.

Created on Wed Apr 11 20:29:01 2018

@author: Admin
"""
import fpgrowth
import generaterules

# Parameters.
minSup = 0.1     # minimum support
itemnumber = 3   # number of items in a frequent itemset
minConf = 0.8    # minimum confidence
minLift = 1.3    # minimum lift

# Load the data and convert it into the form the miner expects.
testdata = fpgrowth.loadSimpData()
print(testdata)
itemset = fpgrowth.createInitSet(testdata)

# Build the tree and the header table.
retTree, headerTable = fpgrowth.createTree(
    itemset,
    sum(itemset.values()),
    minSup=minSup,
)
print(itemset)

# Generate the frequent itemsets; mineTree fills the dict it is handed.
freqItems = {}
fpgrowth.mineTree(
    inTree=retTree,
    headerTable=headerTable,
    dataSetLen=sum(itemset.values()),
    itemnumber=itemnumber,
    minSup=minSup,
    freqItemList=freqItems,
)

# Mine association rules from the frequent itemsets.
ruleset = generaterules.generateRules(
    freqItems=freqItems,
    dataSetLen=sum(itemset.values()),
    minConf=minConf,
    minLift=minLift,
)
import fpgrowth
import time

# Simple data: build an FP-tree from the sample transactions using the
# value 3 as the threshold argument of createFPtree.
simDat = fpgrowth.loadSimpDat()
initSet = fpgrowth.createInitSet(simDat)
myFPtree, myHeaderTab = fpgrowth.createFPtree(initSet, 3)
myFPtree.disp()  # FP tree

# Print the result of findPrefixPath for the items 'z', 'r' and 'x'.
print(fpgrowth.findPrefixPath('z', myHeaderTab))
print(fpgrowth.findPrefixPath('r', myHeaderTab))
print(fpgrowth.findPrefixPath('x', myHeaderTab))

# Mine the tree with the same threshold; results are appended to freqItems.
freqItems = []
fpgrowth.mineFPtree(myFPtree, myHeaderTab, 3, set([]), freqItems)
for x in freqItems:
    print(x)

# kosarak data
# NOTE(review): the triple-quoted block opened below (disabled code) is not
# closed within the visible part of the file -- confirm where it ends.
'''
start = time.time()
n = 100000
with open("H:/data/kosarak.dat/kosarak.dat", "rb") as f:
    parsedDat = [line.split() for line in f.readlines()]
initSet = fpgrowth.createInitSet(parsedDat)
# Parameters for mining and rule generation.
minSup = 0.1
itemnumber = 3
minConf = 0.7
minLift = 0.9

# Cut `content` into consecutive groups. A group is closed after record i
# whenever the numeric last comma-separated field of record i is greater
# than that of record i + 1.
# NOTE(review): the bound 5422 is hard-coded, and whatever is still in `temp`
# when the loop ends is never appended to `ruleset` -- confirm both are
# intended.
ruleset = []
temp = []
for i in range(5422):
    current_key = float(content[i].split(',')[-1])
    next_key = float(content[i + 1].split(',')[-1])
    temp.append(content[i])
    if current_key > next_key:
        ruleset.append(temp)
        temp = []

# fitdata contains the data for every week: each record is split on commas
# and its last field is dropped.
fitdata = [[record.split(',')[:-1] for record in week] for week in ruleset]

# Mine the group at index 2 only.
itemset = fpgrowth.createInitSet(fitdata[2])
retTree, headerTable = fpgrowth.createTree(
    itemset,
    sum(itemset.values()),
    minSup=minSup,
)

freqItems = {}
fpgrowth.mineTree(
    inTree=retTree,
    headerTable=headerTable,
    dataSetLen=sum(itemset.values()),
    itemnumber=itemnumber,
    minSup=minSup,
    freqItemList=freqItems,
)

# `ruleset` is reused from here on: it now holds the generated rules rather
# than the weekly groups.
ruleset = generaterules.generateRules(
    freqItems=freqItems,
    dataSetLen=sum(itemset.values()),
    minConf=minConf,
    minLift=minLift,
)
print(ruleset)