def test3():
    """Mine ``kosarak.dat`` with fpGrowth and print the resulting list.

    Every line of the file is split on whitespace into one transaction.
    The transactions go through ``fpGrowth.createInitSet``,
    ``fpGrowth.createTree`` and ``fpGrowth.mineTree`` with a minimum
    support of 10000, and the list filled in by ``mineTree`` is printed.
    """
    min_support = 10000
    # A context manager closes the file deterministically; the original
    # ``open(...).readlines()`` left the handle to the garbage collector.
    with open('kosarak.dat') as data_file:
        parsedDat = [line.split() for line in data_file]
    initSet = fpGrowth.createInitSet(parsedDat)
    myFPtree, myHeaderTab = fpGrowth.createTree(initSet, min_support)
    myFreqList = []
    fpGrowth.mineTree(myFPtree, myHeaderTab, min_support, set(), myFreqList)
    # Single-argument call form prints identically on Python 2 and 3.
    print(myFreqList)
def test():
    """Run the fpGrowth pipeline once on the module's sample data.

    Loads the data with ``fpGrowth.loadSimpDat``, builds a tree with a
    minimum support of 3, displays it, then calls ``fpGrowth.mineTree``.
    The list that ``mineTree`` fills in is discarded; nothing is returned.
    """
    support = 3
    transactions = fpGrowth.createInitSet(fpGrowth.loadSimpDat())
    tree, header = fpGrowth.createTree(transactions, support)
    tree.disp()
    found = []
    fpGrowth.mineTree(tree, header, support, set(), found)
def mineTweets(tweetArr, minSup=5):
    """Mine frequent word sets from a nested collection of tweets.

    Args:
        tweetArr: iterable of iterables of status objects. Each status must
            expose a ``text`` attribute, which is tokenised by ``textParse``.
        minSup: minimum support forwarded to ``fpGrowth.createTree`` and
            ``fpGrowth.mineTree``.

    Returns:
        The list filled in by ``fpGrowth.mineTree``; an empty list when
        ``createTree`` hands back no header table.
    """
    # Walk whatever was actually fetched. The previous fixed 14 x 100 grid
    # raised IndexError on shorter results and ignored anything beyond it.
    parsedList = [
        textParse(status.text)
        for page in tweetArr
        for status in page
    ]
    initSet = fpGrowth.createInitSet(parsedList)
    myFPtree, myHeaderTab = fpGrowth.createTree(initSet, minSup)
    myFreqList = []
    # NOTE(review): other callers in this file treat a None header table as
    # "tree is empty"; mirror that check here rather than pass None on.
    if myHeaderTab is None:
        return myFreqList
    fpGrowth.mineTree(myFPtree, myHeaderTab, minSup, set(), myFreqList)
    return myFreqList
def mineTweets(tweetArr, minSup=5):
    """Return the frequent word sets mined from every tweet in ``tweetArr``.

    ``tweetArr`` is a sequence of sequences of status objects; ``minSup`` is
    the minimum support forwarded to fpGrowth.
    """
    # Items scraped from Twitter are status objects, so the text we need is
    # reached through ``status.text``.
    tokenised = [
        textParse(status.text)
        for page in tweetArr
        for status in page
    ]
    transactions = fpGrowth.createInitSet(tokenised)
    tree, header = fpGrowth.createTree(transactions, minSup)
    frequent = []
    fpGrowth.mineTree(tree, header, minSup, set(), frequent)
    return frequent
def test2():
    """Walk through the fpGrowth pipeline on the sample data, printing each stage.

    Prints the data from ``fpGrowth.loadSimpleData``, the result of
    ``createInitSet``, displays the tree built with a minimum support of 3,
    prints the result of ``findPrefixPath`` for item ``'x'`` and finally the
    list filled in by ``mineTree``.

    The ``print`` statements were Python 2-only syntax; they are now function
    calls, which on Python 3 emit the same text the statements produced.
    """
    simplDat = fpGrowth.loadSimpleData()
    print("Data: ", simplDat)
    initSet = fpGrowth.createInitSet(simplDat)
    print("initSet: ", initSet)
    myFPtree, myHeaderTab = fpGrowth.createTree(initSet, 3)
    myFPtree.disp()
    # Index 1 of a header-table entry is what findPrefixPath starts from.
    condPat = fpGrowth.findPrefixPath('x', myHeaderTab['x'][1])
    print("condPat: ", condPat)
    freqItems = []
    fpGrowth.mineTree(myFPtree, myHeaderTab, 3, set(), freqItems)
    print("freqItems: ", freqItems)
def calcFrequentSets(region, minSup=10):
    """Mine frequent word sets from the lyrics stored for ``region``.

    Args:
        region: value matched against the ``"region"`` field in
            ``db.lyrics.find``.
        minSup: minimum support forwarded to fpGrowth. Defaults to 10, the
            value that used to be hard-coded.

    Returns:
        The list filled in by ``fpGrowth.mineTree``; an empty list when
        ``createTree`` hands back no header table.
    """
    songs = db.lyrics.find({"region": region})
    # One transaction per verse: its distinct words minus the stop words.
    all_verses = [
        [word for word in set(verse) if word not in stop_words]
        for song in songs
        for verse in song['lyrics']
    ]
    initSet = fpGrowth.createInitSet(all_verses)
    myFPtree, myHeaderTab = fpGrowth.createTree(initSet, minSup)
    myFreqList = []
    # NOTE(review): other callers in this file treat a None header table as
    # "tree is empty"; mirror that check here rather than pass None on.
    if myHeaderTab is None:
        return myFreqList
    fpGrowth.mineTree(myFPtree, myHeaderTab, minSup, set(), myFreqList)
    return myFreqList
def find_distinguishing_itemsets(transactions, max_items_list, minSup, cog_dict):
    """Repeatedly pick the frequent itemset with the highest IG score.

    Each round mines the frequent itemsets of the remaining transactions,
    records the one with the largest ``calculate_IG_value`` score in
    ``max_items_list`` and deletes every transaction that contains it.
    Rounds continue until no transactions remain or no further progress
    can be made.

    Args:
        transactions: dict keyed by transaction id. Each value is indexable:
            element 0 is the item collection tested with ``issubset``,
            element 1 is the label. Mutated in place (matched entries are
            deleted).
        max_items_list: mapping updated in place; the key is
            ``itemToStr(itemset)`` and the value is
            ``[IG, [id, label], [id, label], ...]``.
        minSup: minimum support forwarded to fpGrowth.
        cog_dict: forwarded unchanged to ``fpGrowth.mineTree``.

    Returns:
        ``max_items_list``. The earlier recursive form dropped the result
        of the recursive call and so returned None whenever more than one
        round ran.
    """
    while True:
        # 1. arrange the transactions as proper input for createInitSet
        dataSet = creatDataSet(transactions)
        # 2. arrange the data as proper input for FP-tree creation
        initSet = fpGrowth.createInitSet(dataSet)
        # 3. create the FP-tree
        myFPtree, myHeaderTab = fpGrowth.createTree(initSet, minSup)
        if myHeaderTab is None:  # tree is empty
            return max_items_list

        # 4. mine the frequent itemsets; mineTree fills freqItems in place
        freqItems = []
        fpGrowth.mineTree(myFPtree, myHeaderTab, minSup, set(), freqItems,
                          cog_dict)
        if not freqItems:
            # Nothing to score. Previously this fell through with
            # max_item = "" and crashed on "".issubset(...).
            return max_items_list

        # 5. entropy values for the whole remaining data
        s_entropy, label_0_num, label_1_num = calculate_all_DB_entropy(
            transactions)

        # 6. keep the itemset with the highest IG score (ties: last wins)
        max_IG = 0
        max_item = None
        for item in freqItems:
            Ig = calculate_IG_value(item, transactions, s_entropy,
                                    label_0_num, label_1_num)
            if Ig >= max_IG:
                max_item = item
                max_IG = Ig
        if max_item is None:
            # No itemset reached the initial score of 0.
            return max_items_list

        str_max_item = itemToStr(max_item)
        max_items_list[str_max_item] = [max_IG]

        # 7. record and remove every transaction containing max_item.
        # Collect the ids first so the dict is not mutated while iterating.
        matched = [bact for bact in transactions
                   if max_item.issubset(transactions[bact][0])]
        for bact in matched:
            label = transactions[bact][1]
            max_items_list[str_max_item].append([bact, label])
            del transactions[bact]

        # 8. repeat until no transactions are left. Also stop when nothing
        # was removed, otherwise the next round would be identical and the
        # loop would never end.
        if not matched or not transactions:
            return max_items_list
def subjectFreq(teamList,authorList,subjectList,minFreq):
    """Mine, for every team, the frequent subject terms of its joint papers.

    ``authorList`` and ``subjectList`` are walked in lockstep; an entry
    counts for a team when every team member appears in its author entry.
    Returns one list of mined itemsets per team, in ``teamList`` order.
    """
    per_team = []
    for team in teamList:
        # Seed with minFreq copies of ['a'] so the FP-tree is never empty.
        records = [['a'] for _ in range(minFreq)]
        records.extend(
            subject
            for author, subject in zip(authorList, subjectList)
            if all(member in author for member in team)
        )
        # Build the fpGrowth tree
        tree, header = fpGrowth.createTree(
            fpGrowth.createInitSet(records), minFreq)
        # Mine the subjects
        mined = []
        fpGrowth.mineTree(tree, header, minFreq, set(), mined)
        # Collect this team's subject terms
        per_team.append(mined)
    return per_team
# Walkthrough of the fpGrowth helpers on the module's simple sample data.
# NOTE(review): the two triple-quoted strings below are bare string
# expressions wrapping disabled code; they are kept exactly as found.
''' # FP-Tree node create test rootNode = fpGrowth.treeNode('pyramid', 9, None) rootNode.children['eye'] = fpGrowth.treeNode('eye', 13, None) rootNode.children['phoenix'] = fpGrowth.treeNode('phoenix', 3, None) rootNode.disp() '''
# Load the sample transactions and show them.
simData = fpGrowth.loadSimpleData()
print('simData : ' , simData)
# Pass them through createInitSet and show the result.
initSet = fpGrowth.createInitSet(simData)
print('initSet : ', initSet)
# Build the tree with a minimum support of 3 and display it.
simFPTree, simHeaderTable = fpGrowth.createTree(initSet, 3)
simFPTree.disp()
''' print('========= prefix path : ') print(fpGrowth.findPrefixPath('x', simHeaderTable['x'][1])) print(fpGrowth.findPrefixPath('z', simHeaderTable['z'][1])) print(fpGrowth.findPrefixPath('r', simHeaderTable['r'][1])) '''
# print('sorted(simHeaderTable.items()) : ', sorted(simHeaderTable.items()))
# bigL = [v[0] for v in sorted(simHeaderTable.items(), key = lambda p : p[0])]
# Turn every row of the 0/1 matrix into a transaction: the indices of the
# columns whose cell equals 1.
data = [
    [col for col, cell in enumerate(row) if cell == 1]
    for row in values
]

# For every column, the number of cells equal to 1 divided by 10000.
counts = []
for index in columns:
    line = df[index]
    count = sum(1 for cell in line if cell == 1)
    counts.append(float(count) / 10000)
counts.sort()
# Floor division keeps the index an int; ``len(counts) * 1 / 5`` is a float
# on Python 3 and raises TypeError when used as a list index.
minSupport = counts[len(counts) // 5]

#use apriori
L, supportData = ap.apriori(data, minSupport)
rules = ap.generateRules(L, supportData, minConf=0.4)

#use fpGrowth
# Scale the ratio back by the same 10000 factor used above.
minSup = minSupport * 10000
simpDat = data
initSet = fp.createInitSet(simpDat)
myFPtree, myHeaderTab = fp.createTree(initSet, minSup)
myFreqList = []
fp.mineTree(myFPtree, myHeaderTab, minSup, set(), myFreqList)
# Single-argument call form prints identically on Python 2 and 3.
print(myFreqList)
# -*- coding:utf-8 -*-
import fpGrowth

# Read kosarak.dat: one whitespace-separated transaction per line.
# The context manager closes the file as soon as it has been read; the
# original ``open(...).readlines()`` never closed the handle.
with open('kosarak.dat') as data_file:
    parseData = [line.split() for line in data_file]
#print(parseData)
initSet = fpGrowth.createInitSet(parseData)
# Build the tree and mine it with a minimum support of 100000.
myFPTree,myHeaderTab = fpGrowth.createTree(initSet, 100000)
myFreqList = []
fpGrowth.mineTree(myFPTree, myHeaderTab, 100000, set(), myFreqList)
print ('myFreqList is ' , myFreqList)
# print(len(row_by_user[u])) row_by_user[u] = sorted(row_by_user[u], key=lambda p: p[1])[:req_num] user[u] = set() for row in row_by_user[u]: # if row[0] not in user[u]: # user[u][row[0]] = set() user[u].add(int(row[NOPOSITION]) if row[NOPOSITION] != "" else -1) if row[NOPOSITION] != "": namedict[int(row[NOPOSITION])] = row[NAMEPOSITON] # user[row[4]][row[0]][row[7]] = user[row[4]][row[0]].get(row[7], 0) + 1 for MINSUP in [2, 4, 6, 8, 10]: # MINSUP = 2# print(cnt) # for vip, uidset in user.items(): # print(uidset) data = createInitSet(user.values()) # print(data) myFpTree, myHeaderTab = createTree(data, MINSUP) if myFpTree is not None: # print("vipno #", vip, "的支持度大于", MINSUP, "的频繁项集如下") freqItems = [] mineTree(myFpTree, myHeaderTab, MINSUP, set([]), freqItems) # print(freqItems) for freq in freqItems: for x in freq: print(namedict[x], end=",") print("\n**************") else: pass # print("vipno #", vip, "并没有支持度大于", MINSUP, "的频繁项集")
row_by_user[u] = sorted(row_by_user[u], key=lambda p: p[1])[:req_num] user[u] = {} for row in row_by_user[u]: if row[0] not in user[u]: user[u][row[0]] = set() user[u][row[0]].add( int(row[NOPOSITION]) if row[NOPOSITION] != "" else -1) if row[NOPOSITION] != "": namedict[int(row[NOPOSITION])] = row[NAMEPOSITON] # user[row[4]][row[0]][row[7]] = user[row[4]][row[0]].get(row[7], 0) + 1 for MINSUP in [2, 4, 8, 16, 32, 64]: # MINSUP = 2# print(cnt) for vip, uidset in user.items(): # print(uidset) data = createInitSet(uidset.values()) # print(data) myFpTree, myHeaderTab = createTree(data, MINSUP) if myFpTree is not None: print("vipno #", vip, "的支持度大于", MINSUP, "的频繁项集如下") freqItems = [] mineTree(myFpTree, myHeaderTab, MINSUP, set([]), freqItems) # print(freqItems) for freq in freqItems: for x in freq: print(namedict[x], end=",") print("\n**************") else: # pass print("vipno #", vip, "并没有支持度大于", MINSUP, "的频繁项集")
# -*- coding:utf-8 -*- import fpGrowth #算法基本过程: #1.创建FP树的数据结构 #2.第一次遍历数据集会获得每个元素项的出现频率。 去掉不满足支持度的元素项 #3.对每个事务(即每个记录)中的集合进行排序。排序基于元素项的绝对出现频率来进行 #4.构建FP树。从空集开始,向其中不断添加频繁项集。即在构建时,读入每个事务中的项集,并将其添加到已存在的路径中。 # 如果树中已经存在现有元素,则增加现有元素的值 # 如果该路径不存在,则创建一条新路径。 ###测试FP数的数据结构 #rootNode = fpGrowth.treeNode('pyramid',9,None) #rootNode.children['eye'] = fpGrowth.treeNode('eye',13,None) #rootNode.children['phoenix'] = fpGrowth.treeNode('phoenix',3,None) #rootNode.disp() simData = fpGrowth.loadSimpDat() initSet = fpGrowth.createInitSet(simData) myFpTree, myHeaderTab = fpGrowth.createTree(initSet, 3) myFpTree.disp() myCondPats = fpGrowth.findPrefixPath('r', myHeaderTab['r'][1]) print('myCondPats is', myCondPats) freqItems = [] fpGrowth.mineTree(myFpTree, myHeaderTab, 3, set([]), freqItems) print('频繁项集 is', freqItems)
freqItemList.append(newFreqSet) #调用findPrefixPath()函数来创建条件模式基 condPattBases = findPrefixPath(basePat, headerTable[basePat][1]) #从条件模式基来构建FP树 myCondTree, myHead = fpGrowth.createTree(condPattBases, minSup) #如果树种有元素项的话,递归调用mineTree()函数 if myHead != None: print('conditional tree for:', newFreqSet) myCondTree.disp(1) mineTree(myCondTree, myHead, minSup, newFreqSet, freqItemList) if __name__ == '__main__': # 测试给定元素项返回的条件模式基 myDat = fpGrowth.loadSimpDat() myDic = fpGrowth.createInitSet(myDat) myFPtree, myHeaderTab = fpGrowth.createTree(myDic, 3) print(myHeaderTab) # condPatsX= findPrefixPath('x',myHeaderTab['x'][1]) # print(condPatsX) # condPatsZ = findPrefixPath('z', myHeaderTab['z'][1]) # print(condPatsZ) # condPatsR = findPrefixPath('r', myHeaderTab['r'][1]) # print(condPatsR) # 测试利用条件模式基递归查找频繁项集 freqItems = [] myMinTree = mineTree(myFPtree, myHeaderTab, 3, set([]), freqItems) print(freqItems) """ 正如我们所期望的那样,返回项集与条件FP树相匹配,到现在为止,完整的FP-growth算法以及可以运行。
import fpGrowth

# Build a tiny tree by hand, displaying it after each child is attached.
rootNode = fpGrowth.treeNode('pyramid', 9, None)
rootNode.children['eye'] = fpGrowth.treeNode('eye', 13, None)
rootNode.disp()
rootNode.children['phoenix'] = fpGrowth.treeNode('phoenix', 3, None)
rootNode.disp()

from importlib import reload
reload(fpGrowth)
simpDat = fpGrowth.loadSimpDat()
# Bare names echo their value in an interactive session and do nothing
# when this runs as a script.
simpDat
initSet = fpGrowth.createInitSet(simpDat)
initSet

# Build the FP-tree
myFPtree, myHeaderTab = fpGrowth.createTree(initSet, 3)
myFPtree.disp()

reload(fpGrowth)
fpGrowth.findPrefixPath('x', myHeaderTab['x'][1])
fpGrowth.findPrefixPath('z', myHeaderTab['z'][1])
fpGrowth.findPrefixPath('r', myHeaderTab['r'][1])

reload(fpGrowth)
freqItems = []
fpGrowth.mineTree(myFPtree, myHeaderTab, 3, set([]), freqItems)
freqItems

# Example: mining the click stream of a news site
# The context manager closes the file once it has been read; the original
# ``open(...).readlines()`` never closed the handle.
with open('kosarak.dat') as clickstream:
    parsedDat = [line.split() for line in clickstream]
initSet = fpGrowth.createInitSet(parsedDat)
myFPtree, myHeaderTab = fpGrowth.createTree(initSet, 100000)
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
import fpGrowth

# This goes through the class's __init__ to create the first node.
rootNode = fpGrowth.treeNode("pyramid", 9, None)
# rootNode's "eye" child is likewise a newly created treeNode.
rootNode.children["eye"] = fpGrowth.treeNode("eye", 13, None)
rootNode.display()
rootNode.children["phoenix"] = fpGrowth.treeNode("phoenix", 3, rootNode.children["eye"])
rootNode.display()

# Sample data -> init set -> tree with a minimum support of 3.
simpDat = fpGrowth.loadSimpDat()
print(simpDat)
initSet = fpGrowth.createInitSet(simpDat)
print("createTree with this initSet:", initSet)
myFPtree, myHeaderTab = fpGrowth.createTree(initSet, 3)
myFPtree.display()

# Prefix paths for three items, each starting from index 1 of its header
# entry.
Result_x = fpGrowth.findPrefixPath("x", myHeaderTab["x"][1])
Result_z = fpGrowth.findPrefixPath("z", myHeaderTab["z"][1])
Result_r = fpGrowth.findPrefixPath("r", myHeaderTab["r"][1])
print(Result_x)
print(Result_z)
print(Result_r)

freqItems = []
Result = fpGrowth.mineTree(myFPtree, myHeaderTab, 3, set([]), freqItems)
print(Result)

print("100万记录用FP-growth来处理:\n")
# The context manager closes the file once it has been read; the original
# ``open(...).readlines()`` never closed the handle.
with open("kosarak.dat") as data_file:
    parsedDat = [line.split() for line in data_file]
# freqItems = [] # #@ myFPtree:FP树的根节点 # #@ myHeaderTab:头指针表(字典) # #@ # #递归查找频繁项集(每一个频繁项集都要创建一颗FP树) # fpGrowth.mineTree(myFPtree, myHeaderTab, 3, set([]), freqItems) #发现频繁项集并显示前缀路径 # print "\nfreqItems:\n", freqItems #显示频繁项集 #### 例子:在Twitter源中发现一些共现词 # lotsOtweets = fpGrowth.getLotsOfTweets('RIMM') # print lotsOtweets parsedDat = [line.split() for line in open('kosarak.dat').readlines()] #加载数据存放在列表中 initSet = fpGrowth.createInitSet(parsedDat) #数据存放形式从列表转化成字典 myFPtree, myHeaderTab = fpGrowth.createTree(initSet, 100000) #创建FP树和头指针表 myFreqList = [] fpGrowth.mineTree(myFPtree, myHeaderTab, 100000, set([]), myFreqList) #生成频繁项集 print "\nlength of 'myFreqList' is:", len(myFreqList) print "myFreqList is:", myFreqList end = time.clock() print "The run time of the program is:",end-start, "seconds"
# NOTE(review): yearRd, fileName, confName and authorData are created
# before the visible part of this script.
yearRd.openFile()
yearData = yearRd.readFile()
# Read the title (subject) data
titleRd = readData(fileName,confName,"title")
titleRd.openFile()
titleData = titleRd.readFile()
# Find the active researchers
lastYear = "2015"
activeAuthorList = activeAuthor(authorData,yearData,lastYear)
# Find teams through frequent patterns, using the fpGrowth method
# Data preprocessing
minMem = 3
authorSet = fpGrowth.createInitSet(authorData)
# Build the fpGrowth tree
authorFPtree,authoryHeaderTab=fpGrowth.createTree(authorSet,minMem)
# Mine the teams
teamList = []
fpGrowth.mineTree(authorFPtree,authoryHeaderTab,minMem,set([]),teamList)
teamList = fpGrowth.teamSelect(minMem,teamList)
# Build the subject list
removeList = ['a','an','the','for','of','with','and','in','to']
subjectList = titleInit(titleData,removeList)
# Team subjects
minFreq = 2
subjectFreqList = []
import fpGrowth
# Run the fpGrowth helpers end to end on the module's sample data.
simpDat = fpGrowth.loadSimpDat()
# print simpDat
initSet = fpGrowth.createInitSet(simpDat)
# print initSet
# Build the tree with a minimum support of 3.
myFpTree, myHeaderTab = fpGrowth.createTree(initSet, 3)
# print myFpTree.disp()
# print myHeaderTab
# Prefix paths for item 'x', starting from index 1 of its header entry.
myCondPat = fpGrowth.findPrefixPath('x', myHeaderTab['x'][1])
# print myCondPat
# mineTree is handed freqItems to fill; its return value is kept as well.
freqItems = []
myfpGrowth = fpGrowth.mineTree(myFpTree, myHeaderTab, 3, set([]), freqItems)
# print myFpTree.disp()
# -*- coding:utf-8 -*- import fpGrowth #算法基本过程: #1.创建FP树的数据结构 #2.第一次遍历数据集会获得每个元素项的出现频率。 去掉不满足支持度的元素项 #3.对每个事务(即每个记录)中的集合进行排序。排序基于元素项的绝对出现频率来进行 #4.构建FP树。从空集开始,向其中不断添加频繁项集。即在构建时,读入每个事务中的项集,并将其添加到已存在的路径中。 # 如果树中已经存在现有元素,则增加现有元素的值 # 如果该路径不存在,则创建一条新路径。 ###测试FP数的数据结构 #rootNode = fpGrowth.treeNode('pyramid',9,None) #rootNode.children['eye'] = fpGrowth.treeNode('eye',13,None) #rootNode.children['phoenix'] = fpGrowth.treeNode('phoenix',3,None) #rootNode.disp() simData = fpGrowth.loadSimpDat() initSet = fpGrowth.createInitSet(simData) myFpTree,myHeaderTab = fpGrowth.createTree(initSet, 3) myFpTree.disp() myCondPats = fpGrowth.findPrefixPath('r', myHeaderTab['r'][1]) print ('myCondPats is' , myCondPats) freqItems = [] fpGrowth.mineTree(myFpTree, myHeaderTab, 3, set ([]), freqItems) print('频繁项集 is' , freqItems)
import fpGrowth as fp
import pandas as pd
import numpy as np

# simpDat = fp.loadSimpDat()
# initset = fp.createInitSet(simpDat)
# myfptree,myheaderTab = fp.createTree(initset,3)
# freqItems = []
# fp.mineTree(myfptree,myheaderTab,3,set[()],freqItems)
# print freqItems

df = pd.read_excel("Transactions.xls")
values = df.values

# Turn every row of the 0/1 matrix into a transaction: the indices of the
# columns whose cell equals 1.
data = [
    [col for col, cell in enumerate(row) if cell == 1]
    for row in values
]

minSup = 313.0
#simpDat = fp.loadSimpDat()
simpDat = data
initSet = fp.createInitSet(simpDat)
myFPtree, myHeaderTab = fp.createTree(initSet, minSup)
myFreqList = []
fp.mineTree(myFPtree, myHeaderTab, minSup, set(), myFreqList)
# Single-argument call form prints identically on Python 2 and 3; the
# former ``print myFreqList`` statement is a syntax error on Python 3.
print(myFreqList)
# ``imp`` was deprecated in Python 3.4 and removed in 3.12;
# ``importlib.reload`` is the supported equivalent.
from importlib import reload
import fpGrowth
reload(fpGrowth)

rootNode = fpGrowth.treeNode('pyramid' , 9 , None)  # create a single node
rootNode.children['eye'] = fpGrowth.treeNode('eye',13,None)  # add a child node
rootNode.disp()  # display
rootNode.children['phoenix'] = fpGrowth.treeNode('phoenix',3,None)  # add a child node
rootNode.disp()  # display

reload(fpGrowth)
simpDat = fpGrowth.loadSimpDat()  # load the data set
# Bare names echo their value in an interactive session and do nothing
# when this runs as a script.
simpDat
initSet = fpGrowth.createInitSet(simpDat)  # format the data set
initSet
myFPtree,myHeaderTab = fpGrowth.createTree(initSet,3)  # build the tree
myFPtree.disp()  # display

reload(fpGrowth)
fpGrowth.findPrefixPath('x',myHeaderTab['x'][1])  # for a given item, generate the conditional pattern base
fpGrowth.findPrefixPath('z',myHeaderTab['z'][1])
fpGrowth.findPrefixPath('r',myHeaderTab['r'][1])

reload(fpGrowth)
freqItems = []  # empty list that will hold the frequent itemsets
fpGrowth.mineTree(myFPtree,myHeaderTab,3,set([]),freqItems)
freqItems