Example #1
0
def test3():
    """Mine frequent itemsets from the ``kosarak.dat`` file and print them.

    Each line of the file is split on whitespace into one transaction.  The
    transactions go through ``fpGrowth.createInitSet`` and
    ``fpGrowth.createTree`` with a minimum support of 10000; the tree is then
    mined with ``fpGrowth.mineTree`` and the collected list is printed.
    """
    minSup = 10000
    # A context manager closes the file as soon as it has been read instead
    # of leaving the handle to the garbage collector.
    with open('kosarak.dat') as datFile:
        parsedDat = [line.split() for line in datFile]
    initSet = fpGrowth.createInitSet(parsedDat)
    myFPtree, myHeaderTab = fpGrowth.createTree(initSet, minSup)
    myFreqList = []
    fpGrowth.mineTree(myFPtree, myHeaderTab, minSup, set([]), myFreqList)
    # The parenthesised single-argument form prints the same text whether
    # print is a statement (Python 2) or a function (Python 3).
    print(myFreqList)
Example #2
0
def mineTree(inTree, headerTable, minSup, preFix, freqItemList):
    """
    Recursively collect frequent itemsets using conditional pattern bases.

    :param inTree:          the FP-tree built beforehand (not read by this function)
    :param headerTable:     header table; ``headerTable[item][0]`` is the sort key
                            and ``headerTable[item][1]`` is handed to ``findPrefixPath``
    :param minSup:          minimum support passed on to ``fpGrowth.createTree``
    :param preFix:          set of items accumulated so far (empty on the first
                            call); it is copied, never mutated
    :param freqItemList:    output list; every itemset found is appended in place
    :return:                None
    """
    # Visit the items in ascending order of headerTable[item][0]
    # (presumably the item's support count).  sorted() is stable, so ties
    # keep the header table's own iteration order.
    bigL = sorted(headerTable, key=lambda item: headerTable[item][0])
    for basePat in bigL:
        # Extend a copy of the prefix so sibling iterations are unaffected.
        newFreqSet = preFix.copy()
        newFreqSet.add(basePat)
        freqItemList.append(newFreqSet)
        # Build the conditional pattern base for this item ...
        condPattBases = findPrefixPath(basePat, headerTable[basePat][1])
        # ... and the conditional FP-tree from it.
        myCondTree, myHead = fpGrowth.createTree(condPattBases, minSup)
        # A header of None means the conditional tree holds no items, which
        # ends the recursion on this branch.
        if myHead is not None:
            print('conditional tree for:', newFreqSet)
            myCondTree.disp(1)
            mineTree(myCondTree, myHead, minSup, newFreqSet, freqItemList)
def test():
    """Build the FP-tree for the sample dataset, display it and mine it.

    Runs loadSimpDat -> createInitSet -> createTree (minimum support 3),
    shows the tree with ``disp()`` and then calls ``mineTree``.  The list the
    itemsets are collected into is neither returned nor printed.
    """
    transactions = fpGrowth.loadSimpDat()
    fpTree, headerTab = fpGrowth.createTree(
        fpGrowth.createInitSet(transactions), 3)
    fpTree.disp()
    collected = []
    fpGrowth.mineTree(fpTree, headerTab, 3, set([]), collected)
Example #4
0
def mineTweets(tweetArr, minSup=5):
    """Mine frequent word sets from a collection of tweets.

    :param tweetArr: sequence of pages, each a sequence of status objects
        exposing a ``text`` attribute.
    :param minSup: minimum support passed to ``fpGrowth.createTree`` and
        ``fpGrowth.mineTree``.
    :return: the list of itemsets collected by ``fpGrowth.mineTree``.
    """
    # Walk whatever was actually fetched.  The previous fixed 14 x 100 index
    # loops raised IndexError as soon as a page came back short, and ignored
    # anything beyond those bounds.
    parsedList = [
        textParse(status.text)
        for page in tweetArr
        for status in page
    ]
    initSet = fpGrowth.createInitSet(parsedList)
    myFPtree, myHeaderTab = fpGrowth.createTree(initSet, minSup)
    myFreqList = []
    fpGrowth.mineTree(myFPtree, myHeaderTab, minSup, set([]), myFreqList)
    return myFreqList
Example #5
0
def mineTweets(tweetArr, minSup=5):
    """Return the itemsets mined from the text of every tweet in ``tweetArr``.

    ``tweetArr`` is indexed as ``tweetArr[i][j]``; each element is a status
    object, so the text to parse is read from its ``.text`` attribute.
    ``minSup`` is forwarded to ``fpGrowth.createTree`` and
    ``fpGrowth.mineTree``.
    """
    parsedList = []
    for page in tweetArr:
        for status in page:
            parsedList.append(textParse(status.text))
    fpTree, headerTab = fpGrowth.createTree(
        fpGrowth.createInitSet(parsedList), minSup)
    mined = []
    fpGrowth.mineTree(fpTree, headerTab, minSup, set([]), mined)
    return mined
Example #6
0
def test2():
    """Walk through the FP-growth pipeline on the sample data, printing each stage.

    Prints the raw data, the initial set, the displayed tree, the prefix
    paths found for item ``'x'`` and finally the list filled by ``mineTree``
    (minimum support 3 throughout).
    """
    # Each print uses a single pre-formatted string so the output is the same
    # whether print is a statement (Python 2) or a function (Python 3).  The
    # double space reproduces the separator the old comma-form inserted after
    # the label's own trailing space.
    simplDat = fpGrowth.loadSimpleData()
    print("Data:  %s" % (simplDat,))
    initSet = fpGrowth.createInitSet(simplDat)
    print("initSet:  %s" % (initSet,))
    myFPtree, myHeaderTab = fpGrowth.createTree(initSet, 3)
    myFPtree.disp()
    condPat = fpGrowth.findPrefixPath('x', myHeaderTab['x'][1])
    print("condPat:  %s" % (condPat,))

    freqItems = []
    fpGrowth.mineTree(myFPtree, myHeaderTab, 3, set([]), freqItems)
    print("freqItems:  %s" % (freqItems,))
Example #7
0
def calcFrequentSets(region):
    """Return the itemsets mined from the verses of one region's songs.

    Songs are fetched with ``db.lyrics.find({"region": region})``.  Every
    verse of every song is de-duplicated via ``set``, stripped of
    ``stop_words`` and used as one transaction.  The minimum support is
    fixed at 10.
    """
    minSup = 10
    all_verses = [
        [word for word in set(verse) if word not in stop_words]
        for song in db.lyrics.find({"region": region})
        for verse in song['lyrics']
    ]

    fpTree, headerTab = fpGrowth.createTree(
        fpGrowth.createInitSet(all_verses), minSup)
    mined = []
    fpGrowth.mineTree(fpTree, headerTab, minSup, set([]), mined)
    return mined
Example #8
0
def find_distinguishing_itemsets(transactions, max_items_list, minSup,
                                 cog_dict):
    """Repeatedly pick the frequent itemset with the highest IG score.

    Each round mines the remaining transactions, records the best-scoring
    itemset together with the transactions that contain it, removes those
    transactions and recurses on what is left.

    :param transactions: mapping of transaction id -> sequence whose element 0
        is the item collection and element 1 is the label.  MUTATED: every
        transaction covered by a selected itemset is deleted.
    :param max_items_list: mapping (despite the name) filled in place; the key
        is ``itemToStr(itemset)`` and the value is
        ``[IG, [id, label], [id, label], ...]``.
    :param minSup: minimum support for ``createTree`` / ``mineTree``.
    :param cog_dict: forwarded unchanged to ``fpGrowth.mineTree``.
    :return: ``max_items_list``.
    """
    # 1. arrange the transactions as proper input for createInitSet function
    dataSet = creatDataSet(transactions)
    # 2. arrange the data as proper input for fptree creation
    initSet = fpGrowth.createInitSet(dataSet)
    # 3. create fptree
    myFPtree, myHeaderTab = fpGrowth.createTree(initSet, minSup)
    if myHeaderTab is None:  # tree is empty
        return max_items_list
    # 4. apply the fpgrowth algorithm; freqItems ends up holding the itemsets
    freqItems = []
    fpGrowth.mineTree(myFPtree, myHeaderTab, minSup, set([]), freqItems,
                      cog_dict)
    # 5. calculate entropy values for whole data
    s_entropy, label_0_num, label_1_num = calculate_all_DB_entropy(
        transactions)
    # 6. score every itemset, keeping the one with the highest IG
    #    (ties go to the later itemset, as before)
    max_IG = 0
    max_item = None
    for item in freqItems:
        Ig = calculate_IG_value(item, transactions, s_entropy, label_0_num,
                                label_1_num)
        if Ig >= max_IG:
            max_item = item
            max_IG = Ig

    # Nothing was selected (no itemsets mined, or none scored >= 0): stop
    # here instead of failing on the issubset() call below.
    if max_item is None:
        return max_items_list

    str_max_item = itemToStr(max_item)
    max_items_list[str_max_item] = [max_IG]

    # 7. find all transactions that contain max_item and remove them.
    #    Iterate over a snapshot of the keys because entries are deleted.
    for bact in list(transactions):
        if max_item.issubset(transactions[bact][0]):
            label = transactions[bact][1]
            max_items_list[str_max_item].append([bact, label])
            del transactions[bact]

    # 8. repeat until no transactions are left.  The recursive result is
    #    returned so every call path yields max_items_list rather than None.
    if len(transactions) > 0:
        return find_distinguishing_itemsets(transactions, max_items_list,
                                            minSup, cog_dict)
    return max_items_list
Example #9
0
def subjectFreq(teamList,authorList,subjectList,minFreq):
    """Mine the frequent subject itemsets for every team.

    For each team, the subjects of all entries whose ``author`` collection
    contains every team member are gathered, mined with fpGrowth at minimum
    support ``minFreq``, and the resulting list is appended to the output.

    Returns a list with one entry (a list of itemsets) per team, in
    ``teamList`` order.
    """
    perTeam = []
    for team in teamList:
        # Seed with minFreq copies of ['a'] so the FP-tree is never empty.
        transactions = [['a'] for _ in range(minFreq)]
        # Keep a subject only when every member of the team is among its authors.
        transactions.extend(
            subject
            for author, subject in zip(authorList, subjectList)
            if all(member in author for member in team)
        )
        initSet = fpGrowth.createInitSet(transactions)
        # Build the FP-tree.
        fpTree, headerTab = fpGrowth.createTree(initSet, minFreq)
        # Mine the subjects and record them for this team.
        mined = []
        fpGrowth.mineTree(fpTree, headerTab, minFreq, set([]), mined)
        perTeam.append(mined)
    return perTeam
Example #10
0
    row_by_user[u] = sorted(row_by_user[u], key=lambda p: p[1])[:req_num]
    user[u] = {}
    for row in row_by_user[u]:
        if row[0] not in user[u]:
            user[u][row[0]] = set()
        user[u][row[0]].add(
            int(row[NOPOSITION]) if row[NOPOSITION] != "" else -1)
        if row[NOPOSITION] != "":
            namedict[int(row[NOPOSITION])] = row[NAMEPOSITON]
    # user[row[4]][row[0]][row[7]] = user[row[4]][row[0]].get(row[7], 0) + 1

# For each support threshold, mine every user's item sets and print the
# names of the items in each frequent itemset found.
for MINSUP in (2, 4, 8, 16, 32, 64):
    for vip, uidset in user.items():
        data = createInitSet(uidset.values())
        myFpTree, myHeaderTab = createTree(data, MINSUP)
        if myFpTree is None:
            # No tree was built: report that this user has no frequent
            # itemsets at this support level.
            print("vipno #", vip, "并没有支持度大于", MINSUP, "的频繁项集")
            continue
        # Header line announcing the itemsets that follow.
        print("vipno #", vip, "的支持度大于", MINSUP, "的频繁项集如下")
        freqItems = []
        mineTree(myFpTree, myHeaderTab, MINSUP, set([]), freqItems)
        for freq in freqItems:
            # One comma-terminated name per item, then a separator line.
            for x in freq:
                print(namedict[x], end=",")
            print("\n**************")
Example #11
0
#!/usr/bin/python
# coding:utf-8

import fpGrowth

# Load the sample transactions and convert them with createInitSet into the
# form createTree takes.
simpDat = fpGrowth.loadSimpDat()
dataSet = fpGrowth.createInitSet(simpDat)

# Build the FP-tree with a minimum support of 3 and display it.
retTree, headTab = fpGrowth.createTree(dataSet, 3)
retTree.disp()
Example #12
0
        #调用findPrefixPath()函数来创建条件模式基
        condPattBases = findPrefixPath(basePat, headerTable[basePat][1])
        #从条件模式基来构建FP树
        myCondTree, myHead = fpGrowth.createTree(condPattBases, minSup)
        #如果树种有元素项的话,递归调用mineTree()函数
        if myHead != None:
            print('conditional tree for:', newFreqSet)
            myCondTree.disp(1)
            mineTree(myCondTree, myHead, minSup, newFreqSet, freqItemList)


if __name__ == '__main__':
    # Build the FP-tree for the sample data (minimum support 3) and print its
    # header table.  The commented-out lines below test the conditional
    # pattern bases returned for a given item.
    myDat = fpGrowth.loadSimpDat()
    myDic = fpGrowth.createInitSet(myDat)
    myFPtree, myHeaderTab = fpGrowth.createTree(myDic, 3)
    print(myHeaderTab)
    # condPatsX= findPrefixPath('x',myHeaderTab['x'][1])
    # print(condPatsX)
    # condPatsZ = findPrefixPath('z', myHeaderTab['z'][1])
    # print(condPatsZ)
    # condPatsR = findPrefixPath('r', myHeaderTab['r'][1])
    # print(condPatsR)

    # Test the recursive search for frequent itemsets via conditional pattern bases.
    freqItems = []
    # NOTE(review): the visible tail of mineTree ends without a return
    # statement, so myMinTree is presumably None; the mined itemsets are
    # the ones appended to freqItems.
    myMinTree = mineTree(myFPtree, myHeaderTab, 3, set([]), freqItems)
    print(freqItems)
    # The bare string below is a no-op note.  It says: as expected, the
    # returned itemsets match the conditional FP-trees, and the complete
    # FP-growth algorithm now runs.
    """
    正如我们所期望的那样,返回项集与条件FP树相匹配,到现在为止,完整的FP-growth算法以及可以运行。
    """
import fpGrowth
# Build a small tree by hand and display it after each child is attached.
rootNode = fpGrowth.treeNode('pyramid', 9, None)
rootNode.children['eye'] = fpGrowth.treeNode('eye', 13, None)
rootNode.disp()
rootNode.children['phoenix'] = fpGrowth.treeNode('phoenix', 3, None)
rootNode.disp()

from importlib import reload
# Interactive-session transcript: the bare names (simpDat, initSet,
# freqItems) echo a value at a REPL and do nothing when run as a script.
# Each reload() re-imports fpGrowth after the module was edited.
reload(fpGrowth)
simpDat = fpGrowth.loadSimpDat()
simpDat
initSet = fpGrowth.createInitSet(simpDat)
initSet
# Build the FP-tree (minimum support 3) and display it
myFPtree, myHeaderTab = fpGrowth.createTree(initSet, 3)
myFPtree.disp()

# Prefix paths for the items 'x', 'z' and 'r'; the results are not stored.
reload(fpGrowth)
fpGrowth.findPrefixPath('x', myHeaderTab['x'][1])
fpGrowth.findPrefixPath('z', myHeaderTab['z'][1])
fpGrowth.findPrefixPath('r', myHeaderTab['r'][1])

# Mine the tree; the itemsets are collected into freqItems.
reload(fpGrowth)
freqItems = []
fpGrowth.mineTree(myFPtree, myHeaderTab, 3, set([]), freqItems)
freqItems

# Example: mining a news-site click stream
# Each whitespace-separated line of kosarak.dat is one transaction.  The
# with-block closes the file as soon as it has been read.
with open('kosarak.dat') as datFile:
    parsedDat = [line.split() for line in datFile]
initSet = fpGrowth.createInitSet(parsedDat)
# Build the FP-tree with a minimum support of 100000.
myFPtree, myHeaderTab = fpGrowth.createTree(initSet, 100000)
Example #14
0
File: 12.py Project: niumeng07/ML
# -*- coding: UTF-8 -*-

import fpGrowth

# Build a small tree by hand and display it after each child is attached.
rootNode = fpGrowth.treeNode("pyramid", 9, None)  #   this goes through the class's __init__ to create the first node
rootNode.children["eye"] = fpGrowth.treeNode("eye", 13, None)  #   rootNode's ['eye'] child is likewise a newly created treeNode
rootNode.display()
# NOTE(review): the third argument here is the 'eye' node rather than None -
# presumably the parent link; confirm against treeNode.__init__.
rootNode.children["phoenix"] = fpGrowth.treeNode("phoenix", 3, rootNode.children["eye"])
rootNode.display()


# Load the sample data, build the FP-tree (minimum support 3) and display it.
simpDat = fpGrowth.loadSimpDat()
print(simpDat)
initSet = fpGrowth.createInitSet(simpDat)
print("createTree with this initSet:", initSet)
myFPtree, myHeaderTab = fpGrowth.createTree(initSet, 3)
myFPtree.display()

# Prefix paths for the items 'x', 'z' and 'r'.
Result_x = fpGrowth.findPrefixPath("x", myHeaderTab["x"][1])
Result_z = fpGrowth.findPrefixPath("z", myHeaderTab["z"][1])
Result_r = fpGrowth.findPrefixPath("r", myHeaderTab["r"][1])
print(Result_x)
print(Result_z)
print(Result_r)
# Mine the tree; itemsets are collected into freqItems.
freqItems = []
# NOTE(review): this prints mineTree's return value, not freqItems.  If
# mineTree returns nothing this prints None - confirm which was intended.
Result = fpGrowth.mineTree(myFPtree, myHeaderTab, 3, set([]), freqItems)
print(Result)

# The banner printed below reads "processing one million records with FP-growth".
print("100万记录用FP-growth来处理:\n")
# Each whitespace-separated line of kosarak.dat is one transaction.  The
# with-block closes the file as soon as it has been read.
with open("kosarak.dat") as datFile:
    parsedDat = [line.split() for line in datFile]
initSet = fpGrowth.createInitSet(parsedDat)
Example #15
0
# One transaction per row of `values`: the list of column positions whose
# cell equals 1.  The column count is taken from the first row.
data = [
    [col for col in range(len(values[0])) if values[row][col] == 1]
    for row in range(len(values))
]
# For every column, the number of cells equal to 1 divided by 10000
# (presumably the row count - TODO confirm).
counts = []
for index in columns:
    line = df[index]
    count = sum(1 for i in range(len(line)) if line[i] == 1)
    counts.append(float(count) / 10000)
counts.sort()
# Take the value one fifth of the way up the sorted list as the minimum
# support.  Floor division keeps the index an int on Python 3 as well;
# plain "/" yields a float there and list indexing raises TypeError.
minSupport = counts[len(counts) // 5]

# use apriori
L, supportData = ap.apriori(data, minSupport)
rules = ap.generateRules(L, supportData, minConf=0.4)

# use fpGrowth

# minSupport is a fraction of 10000; scale it back to the absolute value
# passed to createTree/mineTree.
minSup = minSupport * 10000
simpDat = data
initSet = fp.createInitSet(simpDat)
myFPtree, myHeaderTab = fp.createTree(initSet, minSup)
myFreqList = []
fp.mineTree(myFPtree, myHeaderTab, minSup, set([]), myFreqList)
# The parenthesised single-argument form prints identically on Python 2 and 3.
print(myFreqList)
Example #16
0
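# #@ myFPtree: root node of the FP-tree
# #@ myHeaderTab: header pointer table (a dict)
# #@
# # Recursively find the frequent itemsets (an FP-tree has to be built for every frequent itemset)
# fpGrowth.mineTree(myFPtree, myHeaderTab, 3, set([]), freqItems) # find the frequent itemsets and show the prefix paths
# print "\nfreqItems:\n", freqItems # show the frequent itemsets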

####   Example: finding co-occurring words in a Twitter feed
# lotsOtweets = fpGrowth.getLotsOfTweets('RIMM')
# print lotsOtweets



# Load the transactions into a list (one whitespace-split line each).  The
# with-block closes the file as soon as it has been read.
with open('kosarak.dat') as datFile:
    parsedDat = [line.split() for line in datFile]
initSet = fpGrowth.createInitSet(parsedDat)  # convert the list into the dict form createTree takes
myFPtree, myHeaderTab = fpGrowth.createTree(initSet, 100000)  # build the FP-tree and the header table
myFreqList = []
fpGrowth.mineTree(myFPtree, myHeaderTab, 100000, set([]), myFreqList)  # collect the frequent itemsets
# Single pre-formatted strings print the same text on Python 2 and 3.
print("\nlength of 'myFreqList' is: %d" % len(myFreqList))
print("myFreqList is: %s" % (myFreqList,))






# Report the elapsed time since `start`, which is set outside this excerpt.
# NOTE(review): time.clock() was removed in Python 3.8.  Both readings
# (start and end) need to switch together to time.perf_counter() or
# time.process_time(); changing only this one would mix two clocks.
end = time.clock()
print "The run time of the program is:",end-start, "seconds"


Example #17
0
# Read the subject (title) data
titleRd = readData(fileName,confName,"title")
titleRd.openFile()
titleData = titleRd.readFile()

# Find the active researchers
lastYear = "2015"
activeAuthorList = activeAuthor(authorData,yearData,lastYear)

# Find teams by frequent-pattern mining, using the fpGrowth method
# Data pre-processing
minMem = 3
authorSet = fpGrowth.createInitSet(authorData)
# Build the FP-tree
authorFPtree,authoryHeaderTab=fpGrowth.createTree(authorSet,minMem)
# Mine the teams; mineTree fills teamList, which teamSelect then replaces
teamList = []
fpGrowth.mineTree(authorFPtree,authoryHeaderTab,minMem,set([]),teamList)
teamList = fpGrowth.teamSelect(minMem,teamList)


# Build the subject list from the titles, passing removeList to titleInit
removeList = ['a','an','the','for','of','with','and','in','to']
subjectList = titleInit(titleData,removeList)

# Subjects per team
minFreq = 2
# NOTE: this empty list is overwritten by the call on the next line.
subjectFreqList = []
subjectFreqList = subjectFreq(teamList,authorData,subjectList,minFreq)
Example #18
0
rootNode = fpGrowth.treeNode('pyramid', 9, None)

rootNode.children['eye'] = fpGrowth.treeNode('eye', 13, None)

rootNode.children['phoenix'] = fpGrowth.treeNode('phoenix', 3, None)

rootNode.disp()
'''

simData = fpGrowth.loadSimpleData()
print('simData : ' , simData)

initSet = fpGrowth.createInitSet(simData)
print('initSet : ', initSet)

simFPTree, simHeaderTable = fpGrowth.createTree(initSet, 3)
simFPTree.disp()

'''
print('========= prefix path : ')
print(fpGrowth.findPrefixPath('x', simHeaderTable['x'][1]))
print(fpGrowth.findPrefixPath('z', simHeaderTable['z'][1]))
print(fpGrowth.findPrefixPath('r', simHeaderTable['r'][1]))
'''

# print('sorted(simHeaderTable.items()) : ', sorted(simHeaderTable.items()))

# bigL = [v[0] for v in sorted(simHeaderTable.items(), key = lambda p : p[0])]
# print(' test bigL = ', bigL)

freqItems = []
Example #19
0
if __name__ == "__main__":
    # Load the transactions and print them.
    path = "./../item/"
    dataSet = arr = getData(path, 3)
    print(dataSet)

    # Key every distinct transaction (as a frozenset) with a count of 1.
    inputData = {frozenset(line): 1 for line in dataSet}
    print(len(inputData))

    fp_items = []
    fp_items_temp = []
    # Minimum support is 10% of the number of distinct transactions.
    Fptree, headerTable = fpg.createTree(inputData, 0.1 * len(inputData))
    Fptree.display()
    fpg.mineTree(Fptree, headerTable, 0.1 * len(inputData), set([]),
                 fp_items_temp)
    print(headerTable)
#    for item in fp_items_temp:
#        item2=sorted(item)
#        fp_items.append(set(item2))    
#        
#    
#    supportdata={}
#    for fp in fp_items:
#        for transaction in inputData:
#            if fp.issubset(transaction):
#                if frozenset(fp) in supportdata:
#                    supportdata[frozenset(fp)] += 1
#                else:supportdata[frozenset(fp)] = 1
Example #20
0
import fpGrowth

# Load the sample transactions.
simpDat = fpGrowth.loadSimpDat()
# print simpDat

# Convert them with createInitSet into the form createTree takes.
initSet = fpGrowth.createInitSet(simpDat)
# print initSet

# Build the FP-tree with a minimum support of 3.
myFpTree, myHeaderTab = fpGrowth.createTree(initSet, 3)
# print myFpTree.disp()
# print myHeaderTab

# Prefix paths for the item 'x'.
myCondPat = fpGrowth.findPrefixPath('x', myHeaderTab['x'][1])
# print myCondPat

freqItems = []

# Mine the tree; itemsets are collected into freqItems.  myfpGrowth only
# holds whatever mineTree returns.
myfpGrowth = fpGrowth.mineTree(myFpTree, myHeaderTab, 3, set([]), freqItems)
# print myFpTree.disp()
# #12.5 示例:从新闻网站点击流中挖掘
# parsedDat=[line.split() for line in open(homedir+'kosarak.dat').readlines()]
# # print "parsedDat:",parsedDat
# initSet=fpGrowth.createInitSet(parsedDat)
# # print "initSet:",initSet
# # print ":",
# myFPtree,myHeaderTab=fpGrowth.createTree(initSet,100000)
# myFPtree.disp()
# print "myHeaderTab:",myHeaderTab
# myFreqList=[]
# fpGrowth.mineTree(myFPtree,myHeaderTab,100000,set([]),myFreqList)
# # print "len(myFreqList):",len(myFreqList)
# print "myFreqList:",myFreqList

# #12.6 示例:从图灵书籍中挖掘
import sys
# print sys.getdefaultencoding()
# print sys.stdout.encoding
# NOTE(review): `homedir` is not defined in the visible part of this script;
# it has to be set before this point.
# Each whitespace-separated line of anjuke.dat is one transaction.  The
# with-block closes the file as soon as it has been read.
with open(homedir + 'anjuke.dat') as datFile:
    parsedDat = [line.split() for line in datFile]
initSet = fpGrowth.createInitSet(parsedDat)
# Build the FP-tree and mine it with a minimum support of 20.
myFPtree, myHeaderTab = fpGrowth.createTree(initSet, 20)
myFreqList = []
fpGrowth.mineTree(myFPtree, myHeaderTab, 20, set([]), myFreqList)
# print "len(myFreqList):",len(myFreqList)
#print "myFreqList:",myFreqList
Example #22
0
import fpGrowth as fp
import pandas as pd
import numpy as np

# simpDat = fp.loadSimpDat()
# initset = fp.createInitSet(simpDat)
# myfptree,myheaderTab = fp.createTree(initset,3)
# freqItems = []
# fp.mineTree(myfptree,myheaderTab,3,set([]),freqItems)
# print freqItems
# Load the spreadsheet.  Each row becomes one transaction: the list of
# column positions whose cell equals 1.  The column count is taken from
# the first row.
df = pd.read_excel("Transactions.xls")
values = df.values
data = [
    [col for col in range(len(values[0])) if values[row][col] == 1]
    for row in range(len(values))
]

# Mine the transactions built above with a minimum support of 313.
minSup = 313.0
# simpDat = fp.loadSimpDat()
simpDat = data
initSet = fp.createInitSet(simpDat)
myFPtree, myHeaderTab = fp.createTree(initSet, minSup)
myFreqList = []
fp.mineTree(myFPtree, myHeaderTab, minSup, set([]), myFreqList)
# The parenthesised single-argument form prints identically on Python 2 and 3.
print(myFreqList)
Example #23
0
import fpGrowth

# NOTE(review): reload is a builtin only on Python 2; on Python 3 it has to
# be imported from importlib (no such import is visible in this excerpt).
reload(fpGrowth)
rootNode = fpGrowth.treeNode('pyramid' , 9 , None)           # create a single node
rootNode.children['eye'] = fpGrowth.treeNode('eye',13,None)  # add a child node
rootNode.disp()                 # display
rootNode.children['phoenix'] = fpGrowth.treeNode('phoenix',3,None)  # add another child node
rootNode.disp()                 # display


# Interactive-session transcript: the bare names (simpDat, initSet) echo a
# value at a REPL and do nothing when run as a script.
reload(fpGrowth)
simpDat = fpGrowth.loadSimpDat()    # fetch the dataset
simpDat
initSet = fpGrowth.createInitSet(simpDat)   # put the dataset into the form createTree takes
initSet
myFPtree,myHeaderTab = fpGrowth.createTree(initSet,3)   # build the tree (minimum support 3)
myFPtree.disp()         # display


# Prefix paths for the items 'x', 'z' and 'r'; the results are not stored.
reload(fpGrowth)
fpGrowth.findPrefixPath('x',myHeaderTab['x'][1])    # build the conditional pattern base for the given item
fpGrowth.findPrefixPath('z',myHeaderTab['z'][1])
fpGrowth.findPrefixPath('r',myHeaderTab['r'][1])


# Mine the tree.  The trailing bare name only echoes the list at a REPL.
reload(fpGrowth)
freqItems = []      # empty list that collects the frequent itemsets
fpGrowth.mineTree(myFPtree,myHeaderTab,3,set([]),freqItems)
freqItems