Ejemplo n.º 1
0
def test3():
    """Mine the ``kosarak.dat`` file for frequent itemsets and print them.

    Every line of the file is split on whitespace into one transaction.
    The transactions are fed to ``fpGrowth`` with a minimum support of
    10000, and the list that was handed to ``fpGrowth.mineTree`` is printed.
    """
    minSup = 10000
    # A context manager closes the file deterministically; the previous
    # ``open(...).readlines()`` left the handle to the garbage collector.
    with open('kosarak.dat') as datFile:
        parsedDat = [line.split() for line in datFile]
    initSet = fpGrowth.createInitSet(parsedDat)
    myFPtree, myHeaderTab = fpGrowth.createTree(initSet, minSup)
    myFreqList = []
    fpGrowth.mineTree(myFPtree, myHeaderTab, minSup, set([]), myFreqList)
    # Single-argument parenthesised form is valid on Python 2 and Python 3.
    print(myFreqList)
Ejemplo n.º 2
0
def test():
    """Build the FP-tree for the simple demo data, display it and mine it.

    Nothing is returned; the mined itemsets are collected in a local list.
    """
    min_support = 3
    transactions = fpGrowth.createInitSet(fpGrowth.loadSimpDat())
    tree, header_table = fpGrowth.createTree(transactions, min_support)
    tree.disp()
    mined = []
    fpGrowth.mineTree(tree, header_table, min_support, set(), mined)
Ejemplo n.º 3
0
def mineTweets(tweetArr, minSup=5):
    """Mine frequent word sets from a nested collection of tweets.

    Args:
        tweetArr: iterable of pages, each page an iterable of objects
            exposing a ``text`` attribute.
        minSup: minimum support handed to ``fpGrowth``.

    Returns:
        The list that ``fpGrowth.mineTree`` was asked to fill.
    """
    # Walk whatever is actually present instead of assuming exactly
    # 14 pages of 100 tweets each: the fixed ranges raised IndexError on
    # shorter input and silently ignored anything beyond them.
    parsedList = [
        textParse(status.text)
        for page in tweetArr
        for status in page
    ]
    initSet = fpGrowth.createInitSet(parsedList)
    myFPtree, myHeaderTab = fpGrowth.createTree(initSet, minSup)
    myFreqList = []
    fpGrowth.mineTree(myFPtree, myHeaderTab, minSup, set([]), myFreqList)
    return myFreqList
Ejemplo n.º 4
0
def mineTweets(tweetArr, minSup=5):
    """Return the itemsets mined from every tweet in ``tweetArr``.

    ``tweetArr`` is walked page by page; the ``text`` attribute of each
    entry is tokenised with ``textParse`` and the token lists are mined
    with ``fpGrowth`` at minimum support ``minSup``.
    """
    token_lists = []
    for page in tweetArr:
        # The data crawled from Twitter are status objects, so the text
        # that is actually needed is reached through ``status.text``.
        token_lists.extend(textParse(status.text) for status in page)
    transactions = fpGrowth.createInitSet(token_lists)
    tree, header_table = fpGrowth.createTree(transactions, minSup)
    frequent = []
    fpGrowth.mineTree(tree, header_table, minSup, set(), frequent)
    return frequent
Ejemplo n.º 5
0
def test2():
    """Walk through the FP-growth steps on the simple data set, printing each.

    Prints the raw data, the initial set, the tree (via ``disp``), the
    result of ``findPrefixPath`` for item ``'x'`` and finally the list
    handed to ``mineTree``.

    The Python 2 ``print`` statements were replaced by ``print()`` calls,
    which are required on Python 3 and produce the same text there.
    """
    simplDat = fpGrowth.loadSimpleData()
    print("Data: ", simplDat)
    initSet = fpGrowth.createInitSet(simplDat)
    print("initSet: ", initSet)
    myFPtree, myHeaderTab = fpGrowth.createTree(initSet, 3)
    myFPtree.disp()
    # Element 1 of the header-table entry is what findPrefixPath starts from.
    condPat = fpGrowth.findPrefixPath('x', myHeaderTab['x'][1])
    print("condPat: ", condPat)

    freqItems = []
    fpGrowth.mineTree(myFPtree, myHeaderTab, 3, set([]), freqItems)
    print("freqItems: ", freqItems)
Ejemplo n.º 6
0
def calcFrequentSets(region, minSup=10):
    """Mine frequent word sets from the lyrics stored for ``region``.

    Args:
        region: value matched against the ``"region"`` field of
            ``db.lyrics``.
        minSup: minimum support handed to ``fpGrowth``. Defaults to 10,
            the value that used to be hard-coded.

    Returns:
        The list that ``fpGrowth.mineTree`` was asked to fill.
    """
    songs = db.lyrics.find({"region": region})
    all_verses = []
    for song in songs:
        for verse in song['lyrics']:
            # De-duplicate the words of the verse and drop stop words.
            all_verses.append(
                [v for v in set(verse) if v not in stop_words])

    initSet = fpGrowth.createInitSet(all_verses)
    myFPtree, myHeaderTab = fpGrowth.createTree(initSet, minSup)
    myFreqList = []
    fpGrowth.mineTree(myFPtree, myHeaderTab, minSup, set([]), myFreqList)
    return myFreqList
Ejemplo n.º 7
0
def find_distinguishing_itemsets(transactions, max_items_list, minSup,
                                 cog_dict):
    """Repeatedly pick the frequent itemset with the highest IG score.

    Each round mines the frequent itemsets of the remaining transactions,
    selects the one with the largest ``calculate_IG_value`` result,
    records it in ``max_items_list`` and deletes every transaction that
    contains it. Rounds repeat until no transaction is left or nothing
    more can be selected.

    Args:
        transactions: mapping from a transaction key to a sequence whose
            element 0 is the item collection and element 1 is the label.
            Modified in place: covered transactions are deleted.
        max_items_list: mapping filled in place. The key is
            ``itemToStr(itemset)`` and the value is the list
            ``[IG score, [key, label], ...]``.
        minSup: minimum support handed to ``fpGrowth``.
        cog_dict: forwarded unchanged to ``fpGrowth.mineTree``.

    Returns:
        ``max_items_list``. The previous recursive version returned None
        whenever more than one round was needed, because the result of
        the recursive call was discarded.
    """
    # A loop replaces the self-recursion so that long runs cannot hit the
    # interpreter's recursion limit.
    while True:
        # 1. arrange the transactions as input for createInitSet
        dataSet = creatDataSet(transactions)
        # 2. arrange the data as input for the FP-tree creation
        initSet = fpGrowth.createInitSet(dataSet)
        # 3. create the FP-tree
        myFPtree, myHeaderTab = fpGrowth.createTree(initSet, minSup)
        if myHeaderTab is None:  # tree is empty
            return max_items_list

        # 4. mine the frequent itemsets; they end up in freqItems
        freqItems = []
        fpGrowth.mineTree(myFPtree, myHeaderTab, minSup, set([]), freqItems,
                          cog_dict)
        if not freqItems:
            # Nothing to choose from; previously this fell through with
            # max_item == "" and crashed on "".issubset(...).
            return max_items_list

        # 5. entropy values for the whole remaining data
        s_entropy, label_0_num, label_1_num = calculate_all_DB_entropy(
            transactions)

        # 6. score every itemset, keeping the one with the highest IG
        max_IG = 0
        max_item = None
        for item in freqItems:
            Ig = calculate_IG_value(item, transactions, s_entropy,
                                    label_0_num, label_1_num)
            if Ig >= max_IG:
                max_item = item
                max_IG = Ig
        if max_item is None:
            # No itemset reached the initial threshold of 0.
            return max_items_list

        str_max_item = itemToStr(max_item)
        max_items_list[str_max_item] = [max_IG]

        # 7. record and remove every transaction containing max_item;
        #    iterate over a snapshot of the keys because entries are deleted
        removed_any = False
        for bact in list(transactions):
            if max_item.issubset(transactions[bact][0]):
                label = transactions[bact][1]
                max_items_list[str_max_item].append([bact, label])
                del transactions[bact]
                removed_any = True

        # 8. repeat until nothing is left; also stop if a round removed
        #    nothing, since the next round would be identical
        if not transactions or not removed_any:
            return max_items_list
Ejemplo n.º 8
0
def subjectFreq(teamList,authorList,subjectList,minFreq):
    """Mine, for every team, the frequent subject terms of its papers.

    ``authorList`` and ``subjectList`` are walked in lockstep; a subject
    entry counts for a team when every member of the team occurs in the
    matching author entry. One result list per team is returned, in the
    order of ``teamList``.
    """
    results = []
    for team in teamList:
        # Seed with minFreq copies of ['a'] so that the FP-tree is never empty.
        transactions = [['a'] for _ in range(minFreq)]
        transactions.extend(
            subject
            for author, subject in zip(authorList, subjectList)
            if all(member in author for member in team)
        )
        init_set = fpGrowth.createInitSet(transactions)
        # Build the FP-tree.
        tree, header_table = fpGrowth.createTree(init_set, minFreq)
        # Mine the subjects.
        team_items = []
        fpGrowth.mineTree(tree, header_table, minFreq, set(), team_items)
        # Collect this team's subject terms.
        results.append(team_items)
    return results
Ejemplo n.º 9
0
# Demo script: builds an FP-tree from fpGrowth's sample data and displays it.
# The triple-quoted strings below are bare string expressions used to
# disable code; they are evaluated and discarded when the script runs.
'''
# FP-Tree node create test

rootNode = fpGrowth.treeNode('pyramid', 9, None)

rootNode.children['eye'] = fpGrowth.treeNode('eye', 13, None)

rootNode.children['phoenix'] = fpGrowth.treeNode('phoenix', 3, None)

rootNode.disp()
'''

# Load the sample transactions and show them.
simData = fpGrowth.loadSimpleData()
print('simData : ' , simData)

# Convert the transactions into the form createTree is given, and show it.
initSet = fpGrowth.createInitSet(simData)
print('initSet : ', initSet)

# Build the tree with a minimum support of 3 and display it.
simFPTree, simHeaderTable = fpGrowth.createTree(initSet, 3)
simFPTree.disp()

# Disabled: prefix-path lookups for items 'x', 'z' and 'r'.
'''
print('========= prefix path : ')
print(fpGrowth.findPrefixPath('x', simHeaderTable['x'][1]))
print(fpGrowth.findPrefixPath('z', simHeaderTable['z'][1]))
print(fpGrowth.findPrefixPath('r', simHeaderTable['r'][1]))
'''

# print('sorted(simHeaderTable.items()) : ', sorted(simHeaderTable.items()))

# bigL = [v[0] for v in sorted(simHeaderTable.items(), key = lambda p : p[0])]
Ejemplo n.º 10
0
# Turn every row of the 0/1 matrix ``values`` into one transaction: the
# list of column positions whose cell equals 1.
data = []
for row in values:
    data.append([j for j, cell in enumerate(row) if cell == 1])

# Relative frequency of the value 1 in every column of ``df``.
# NOTE(review): 10000 is presumably the number of rows; confirm.
counts = []
for index in columns:
    line = df[index]
    count = 0
    for i in range(len(line)):
        if line[i] == 1:
            count += 1
    counts.append(float(count) / 10000)
counts.sort()
# Take the value one fifth of the way through the sorted frequencies.
# Floor division keeps the index an int: ``len(counts) * 1 / 5`` is a
# float on Python 3 and cannot be used as a list index.
minSupport = counts[len(counts) // 5]

# use apriori
L, supportData = ap.apriori(data, minSupport)
rules = ap.generateRules(L, supportData, minConf=0.4)

# use fpGrowth

# fpGrowth is given an absolute count, so scale the relative support back.
minSup = minSupport * 10000
simpDat = data
initSet = fp.createInitSet(simpDat)
myFPtree, myHeaderTab = fp.createTree(initSet, minSup)
myFreqList = []
fp.mineTree(myFPtree, myHeaderTab, minSup, set([]), myFreqList)
# Single-argument parenthesised form is valid on Python 2 and Python 3.
print(myFreqList)
Ejemplo n.º 11
0
# -*- coding:utf-8 -*-
import fpGrowth

# Read one whitespace-split transaction per line. The context manager
# closes the file as soon as the list is built; the previous
# ``open(...).readlines()`` never closed the handle explicitly.
with open('kosarak.dat') as datFile:
    parseData = [line.split() for line in datFile]
#print(parseData)
initSet = fpGrowth.createInitSet(parseData)
myFPTree,myHeaderTab = fpGrowth.createTree(initSet, 100000)
myFreqList = []
fpGrowth.mineTree(myFPTree, myHeaderTab, 100000, set([]), myFreqList)
print ('myFreqList is ' , myFreqList)
Ejemplo n.º 12
0
    # print(len(row_by_user[u]))
    row_by_user[u] = sorted(row_by_user[u], key=lambda p: p[1])[:req_num]
    user[u] = set()
    for row in row_by_user[u]:
        # if row[0] not in user[u]:
        #     user[u][row[0]] = set()
        user[u].add(int(row[NOPOSITION]) if row[NOPOSITION] != "" else -1)
        if row[NOPOSITION] != "":
            namedict[int(row[NOPOSITION])] = row[NAMEPOSITON]
    # user[row[4]][row[0]][row[7]] = user[row[4]][row[0]].get(row[7], 0) + 1

# Mine the per-user item sets at several support thresholds and print the
# names of the items in every frequent itemset.
for MINSUP in [2, 4, 6, 8, 10]:
    data = createInitSet(user.values())
    myFpTree, myHeaderTab = createTree(data, MINSUP)
    if myFpTree is None:
        # No tree was returned for this threshold; nothing to report.
        continue
    freqItems = []
    mineTree(myFpTree, myHeaderTab, MINSUP, set(), freqItems)
    for freq in freqItems:
        for x in freq:
            # The loading loop stores -1 for rows without an item number
            # but never adds a name for it, so a plain ``namedict[x]``
            # raised KeyError; fall back to printing the raw id.
            print(namedict.get(x, x), end=",")
        print("\n**************")
Ejemplo n.º 13
0
    row_by_user[u] = sorted(row_by_user[u], key=lambda p: p[1])[:req_num]
    user[u] = {}
    for row in row_by_user[u]:
        if row[0] not in user[u]:
            user[u][row[0]] = set()
        user[u][row[0]].add(
            int(row[NOPOSITION]) if row[NOPOSITION] != "" else -1)
        if row[NOPOSITION] != "":
            namedict[int(row[NOPOSITION])] = row[NAMEPOSITON]
    # user[row[4]][row[0]][row[7]] = user[row[4]][row[0]].get(row[7], 0) + 1

# For every support threshold and every vip, mine that vip's item sets and
# print the names of the items in each frequent itemset.
for MINSUP in [2, 4, 8, 16, 32, 64]:
    for vip, uidset in user.items():
        data = createInitSet(uidset.values())
        myFpTree, myHeaderTab = createTree(data, MINSUP)
        if myFpTree is None:
            print("vipno #", vip, "并没有支持度大于", MINSUP, "的频繁项集")
            continue
        print("vipno #", vip, "的支持度大于", MINSUP, "的频繁项集如下")
        freqItems = []
        mineTree(myFpTree, myHeaderTab, MINSUP, set(), freqItems)
        for freq in freqItems:
            for x in freq:
                # The loading loop stores -1 for rows without an item
                # number but never adds a name for it, so a plain
                # ``namedict[x]`` raised KeyError; fall back to the raw id.
                print(namedict.get(x, x), end=",")
            print("\n**************")
Ejemplo n.º 14
0
# -*- coding:utf-8 -*-
import fpGrowth

# Basic steps of the algorithm:
# 1. Create the FP-tree data structure.
# 2. The first pass over the data set counts how often each item occurs;
#    items that do not meet the minimum support are dropped.
# 3. Sort the items inside every transaction (record) by their absolute
#    frequency.
# 4. Build the FP-tree. Starting from the empty set, keep adding frequent
#    itemsets: read the itemset of each transaction and add it to an
#    existing path.
#    If the element already exists in the tree, increase its count.
#    If the path does not exist, create a new path.

### Test of the FP-tree data structure
#rootNode = fpGrowth.treeNode('pyramid',9,None)
#rootNode.children['eye'] = fpGrowth.treeNode('eye',13,None)
#rootNode.children['phoenix'] = fpGrowth.treeNode('phoenix',3,None)
#rootNode.disp()

# Build the tree from the sample data with a minimum support of 3 and
# display it.
simData = fpGrowth.loadSimpDat()
initSet = fpGrowth.createInitSet(simData)
myFpTree, myHeaderTab = fpGrowth.createTree(initSet, 3)
myFpTree.disp()

# Look up the prefix paths for item 'r', starting from element 1 of its
# header-table entry.
myCondPats = fpGrowth.findPrefixPath('r', myHeaderTab['r'][1])
print('myCondPats is', myCondPats)

# mineTree is handed this list to collect the frequent itemsets.
freqItems = []
fpGrowth.mineTree(myFpTree, myHeaderTab, 3, set([]), freqItems)
print('频繁项集 is', freqItems)
Ejemplo n.º 15
0
        freqItemList.append(newFreqSet)
        #调用findPrefixPath()函数来创建条件模式基
        condPattBases = findPrefixPath(basePat, headerTable[basePat][1])
        #从条件模式基来构建FP树
        myCondTree, myHead = fpGrowth.createTree(condPattBases, minSup)
        #如果树种有元素项的话,递归调用mineTree()函数
        if myHead != None:
            print('conditional tree for:', newFreqSet)
            myCondTree.disp(1)
            mineTree(myCondTree, myHead, minSup, newFreqSet, freqItemList)


if __name__ == '__main__':
    # 测试给定元素项返回的条件模式基
    myDat = fpGrowth.loadSimpDat()
    myDic = fpGrowth.createInitSet(myDat)
    myFPtree, myHeaderTab = fpGrowth.createTree(myDic, 3)
    print(myHeaderTab)
    # condPatsX= findPrefixPath('x',myHeaderTab['x'][1])
    # print(condPatsX)
    # condPatsZ = findPrefixPath('z', myHeaderTab['z'][1])
    # print(condPatsZ)
    # condPatsR = findPrefixPath('r', myHeaderTab['r'][1])
    # print(condPatsR)

    # 测试利用条件模式基递归查找频繁项集
    freqItems = []
    myMinTree = mineTree(myFPtree, myHeaderTab, 3, set([]), freqItems)
    print(freqItems)
    """
    正如我们所期望的那样,返回项集与条件FP树相匹配,到现在为止,完整的FP-growth算法以及可以运行。
import fpGrowth
rootNode = fpGrowth.treeNode('pyramid', 9, None)
rootNode.children['eye'] = fpGrowth.treeNode('eye', 13, None)
rootNode.disp()
rootNode.children['phoenix'] = fpGrowth.treeNode('phoenix', 3, None)
rootNode.disp()

from importlib import reload
reload(fpGrowth)
simpDat = fpGrowth.loadSimpDat()
simpDat
initSet = fpGrowth.createInitSet(simpDat)
initSet
# 创建FP树
myFPtree, myHeaderTab = fpGrowth.createTree(initSet, 3)
myFPtree.disp()

reload(fpGrowth)
fpGrowth.findPrefixPath('x', myHeaderTab['x'][1])
fpGrowth.findPrefixPath('z', myHeaderTab['z'][1])
fpGrowth.findPrefixPath('r', myHeaderTab['r'][1])

reload(fpGrowth)
freqItems = []
fpGrowth.mineTree(myFPtree, myHeaderTab, 3, set([]), freqItems)
freqItems

# 示例:从新闻网站点击流中挖掘
parsedDat = [line.split() for line in open('kosarak.dat').readlines()]
initSet = fpGrowth.createInitSet(parsedDat)
myFPtree, myHeaderTab = fpGrowth.createTree(initSet, 100000)
Ejemplo n.º 17
0
Archivo: 12.py Proyecto: niumeng07/ML
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-

import fpGrowth

# Build a small tree by hand and display it after each change.
rootNode = fpGrowth.treeNode("pyramid", 9, None)  # calls the class's __init__ to create the first node
rootNode.children["eye"] = fpGrowth.treeNode("eye", 13, None)  # the "eye" child of rootNode is likewise a newly created treeNode
rootNode.display()
rootNode.children["phoenix"] = fpGrowth.treeNode("phoenix", 3, rootNode.children["eye"])
rootNode.display()


# Build the FP-tree for the sample data with a minimum support of 3.
simpDat = fpGrowth.loadSimpDat()
print(simpDat)
initSet = fpGrowth.createInitSet(simpDat)
print("createTree with this initSet:", initSet)
myFPtree, myHeaderTab = fpGrowth.createTree(initSet, 3)
myFPtree.display()

# Prefix paths for three items, each starting from element 1 of the
# item's header-table entry.
Result_x = fpGrowth.findPrefixPath("x", myHeaderTab["x"][1])
Result_z = fpGrowth.findPrefixPath("z", myHeaderTab["z"][1])
Result_r = fpGrowth.findPrefixPath("r", myHeaderTab["r"][1])
print(Result_x)
print(Result_z)
print(Result_r)
freqItems = []
# NOTE(review): the mined itemsets are presumably collected in freqItems;
# confirm that mineTree also returns something worth printing.
Result = fpGrowth.mineTree(myFPtree, myHeaderTab, 3, set([]), freqItems)
print(Result)

print("100万记录用FP-growth来处理:\n")
# The context manager closes the file as soon as it has been read; the
# previous ``open(...).readlines()`` never closed the handle explicitly.
with open("kosarak.dat") as datFile:
    parsedDat = [line.split() for line in datFile]
Ejemplo n.º 18
0
# freqItems = []
# #@ myFPtree: root node of the FP-tree
# #@ myHeaderTab: header pointer table (a dict)
# #@
# # Recursively look for frequent itemsets (an FP-tree is built for every frequent itemset)
# fpGrowth.mineTree(myFPtree, myHeaderTab, 3, set([]), freqItems) # find the frequent itemsets and show the prefix paths
# print "\nfreqItems:\n", freqItems # show the frequent itemsets

####   Example: find co-occurring words in a Twitter feed
# lotsOtweets = fpGrowth.getLotsOfTweets('RIMM')
# print lotsOtweets


# Load the data into a list, one whitespace-split transaction per line.
# The context manager closes the file as soon as it has been read.
with open('kosarak.dat') as datFile:
    parsedDat = [line.split() for line in datFile]
initSet = fpGrowth.createInitSet(parsedDat)  # original note: converts the data from list form to dict form
myFPtree, myHeaderTab = fpGrowth.createTree(initSet, 100000)  # create the FP-tree and the header table
myFreqList = []
fpGrowth.mineTree(myFPtree, myHeaderTab, 100000, set([]), myFreqList)  # generate the frequent itemsets
# The prints use a single pre-formatted argument so that the same line is
# valid, and prints the same text, on Python 2 and Python 3.
print("\nlength of 'myFreqList' is: %d" % len(myFreqList))
print("myFreqList is: %s" % (myFreqList,))


# NOTE(review): time.clock() was removed in Python 3.8. ``start`` is taken
# outside this section, so both calls must be switched to
# time.perf_counter() together.
end = time.clock()
print("The run time of the program is: %s seconds" % (end - start))

Ejemplo n.º 19
0
# Finish reading the year data (yearRd is created before this section).
yearRd.openFile()
yearData = yearRd.readFile()

# Read the title data.
titleRd = readData(fileName,confName,"title")
titleRd.openFile()
titleData = titleRd.readFile()

# Find the active researchers.
lastYear = "2015"
activeAuthorList = activeAuthor(authorData,yearData,lastYear)

# Find teams by frequent-pattern mining, using the fpGrowth module.
# Data preprocessing.
minMem = 3
authorSet = fpGrowth.createInitSet(authorData)
# Build the FP-tree; minMem is passed as the minimum support.
authorFPtree,authoryHeaderTab=fpGrowth.createTree(authorSet,minMem)
# Mine the teams.
teamList = []
fpGrowth.mineTree(authorFPtree,authoryHeaderTab,minMem,set([]),teamList)
# teamList is replaced by the result of teamSelect.
teamList = fpGrowth.teamSelect(minMem,teamList)


# Build the subject list from the titles; removeList is handed to titleInit.
removeList = ['a','an','the','for','of','with','and','in','to']
subjectList = titleInit(titleData,removeList)

# Team subjects.
minFreq = 2
subjectFreqList = []
Ejemplo n.º 20
0
import fpGrowth

# Demo script: runs the FP-growth steps on the sample data without
# printing anything (all output lines are commented out).
simpDat = fpGrowth.loadSimpDat()
# print simpDat

initSet = fpGrowth.createInitSet(simpDat)
# print initSet

# Build the tree with a minimum support of 3.
myFpTree, myHeaderTab = fpGrowth.createTree(initSet, 3)
# print myFpTree.disp()
# print myHeaderTab

# Prefix paths for item 'x', starting from element 1 of its header-table
# entry.
myCondPat = fpGrowth.findPrefixPath('x', myHeaderTab['x'][1])
# print myCondPat

# mineTree is handed this list to collect the frequent itemsets.
freqItems = []

# NOTE(review): the return value of mineTree is kept in myfpGrowth but is
# not used in this section; confirm that mineTree returns anything.
myfpGrowth = fpGrowth.mineTree(myFpTree, myHeaderTab, 3, set([]), freqItems)
# print myFpTree.disp()
Ejemplo n.º 21
0
# -*- coding:utf-8 -*-
import fpGrowth

# Basic steps of the algorithm:
# 1. Create the FP-tree data structure.
# 2. The first pass over the data set counts how often each item occurs;
#    items that do not meet the minimum support are dropped.
# 3. Sort the items inside every transaction (record) by their absolute
#    frequency.
# 4. Build the FP-tree. Starting from the empty set, keep adding frequent
#    itemsets: read the itemset of each transaction and add it to an
#    existing path.
#    If the element already exists in the tree, increase its count.
#    If the path does not exist, create a new path.

### Test of the FP-tree data structure
#rootNode = fpGrowth.treeNode('pyramid',9,None)
#rootNode.children['eye'] = fpGrowth.treeNode('eye',13,None)
#rootNode.children['phoenix'] = fpGrowth.treeNode('phoenix',3,None)
#rootNode.disp()


# Build the tree from the sample data with a minimum support of 3 and
# display it.
simData = fpGrowth.loadSimpDat()
initSet = fpGrowth.createInitSet(simData)
myFpTree,myHeaderTab = fpGrowth.createTree(initSet, 3)
myFpTree.disp()

# Look up the prefix paths for item 'r', starting from element 1 of its
# header-table entry.
myCondPats = fpGrowth.findPrefixPath('r', myHeaderTab['r'][1])
print ('myCondPats is' , myCondPats)

# mineTree is handed this list to collect the frequent itemsets.
freqItems = []
fpGrowth.mineTree(myFpTree, myHeaderTab, 3, set ([]), freqItems)
print('频繁项集 is' , freqItems)

Ejemplo n.º 22
0
import fpGrowth as fp
import pandas as pd
import numpy as np

# simpDat = fp.loadSimpDat()
# initset = fp.createInitSet(simpDat)
# myfptree,myheaderTab = fp.createTree(initset,3)
# freqItems = []
# fp.mineTree(myfptree,myheaderTab,3,set[()],freqItems)
# print freqItems
# Load the 0/1 transaction matrix from the spreadsheet.
df = pd.read_excel("Transactions.xls")
values = df.values

# Turn every row into one transaction: the list of column positions whose
# cell equals 1.
data = []
for row in values:
    data.append([j for j, cell in enumerate(row) if cell == 1])

# Absolute minimum support handed to fpGrowth.
minSup = 313.0
#simpDat = fp.loadSimpDat()
simpDat = data
initSet = fp.createInitSet(simpDat)
myFPtree, myHeaderTab = fp.createTree(initSet, minSup)
myFreqList = []
fp.mineTree(myFPtree, myHeaderTab, minSup, set([]), myFreqList)
# Single-argument parenthesised form is valid on Python 2 and Python 3.
print(myFreqList)
Ejemplo n.º 23
0
# ``imp`` was deprecated in Python 3.4 and removed in 3.12;
# ``importlib.reload`` is the supported replacement.
from importlib import reload
import fpGrowth

# This script is a transcript of an interactive session: the bare
# expressions (``simpDat``, ``initSet``, ``freqItems`` and the
# ``findPrefixPath`` calls) only echo a value when typed at the prompt and
# print nothing when the file is run as a script.

reload(fpGrowth)
rootNode = fpGrowth.treeNode('pyramid' , 9 , None)           # create a single node
rootNode.children['eye'] = fpGrowth.treeNode('eye',13,None)  # add a child node
rootNode.disp()                 # display
rootNode.children['phoenix'] = fpGrowth.treeNode('phoenix',3,None)  # add a child node
rootNode.disp()                 # display


reload(fpGrowth)
simpDat = fpGrowth.loadSimpDat()    # get the data set
simpDat
initSet = fpGrowth.createInitSet(simpDat)   # put the data set into the expected format
initSet
myFPtree,myHeaderTab = fpGrowth.createTree(initSet,3)   # build the tree
myFPtree.disp()         # display


reload(fpGrowth)
fpGrowth.findPrefixPath('x',myHeaderTab['x'][1])    # conditional pattern base for the given item
fpGrowth.findPrefixPath('z',myHeaderTab['z'][1])
fpGrowth.findPrefixPath('r',myHeaderTab['r'][1])


reload(fpGrowth)
freqItems = []      # empty list that collects the frequent itemsets
fpGrowth.mineTree(myFPtree,myHeaderTab,3,set([]),freqItems)
freqItems