Example no. 1
	def rulesthread(self):
		start = time.time()
		if (self.suanfa=='Apriori'):
			if(self.minconfig!=0):
				self.rules=apriori.generateRules(self.L,self.support,self.minconfig)
			else:
				self.rules=apriori.generateRules(self.L,self.support)
		else:
			self.rules = []
			if (self.minconfig!=0):
				Rules.rulesGenerator(self.L, self.rules, self.minconfig)
			else:
				Rules.rulesGenerator(self.L, self.rules)
		self.result_data_Text.insert(INSERT,'Association rules\n')
		for i in self.rules:
			self.result_data_Text.insert(INSERT,list(i))
			self.result_data_Text.insert(INSERT,'\n')
		end = time.time()
		self.result_data_Text.insert(INSERT,str(len(self.rules))+'\n')
		self.log_data_Text.insert(INSERT,'Rule generation finished! '+str(len(self.rules))+' rules\ttime: '+str(round(end-start,2))+'s\n')
Example no. 2
 def test_generate_associations(self):
     L, supp_data = apriori.apriori(self.dataset, min_support=0.5)
     print('L:', L)
     print('-' * 20)
     print('supp_data: ', supp_data)
     print('-' * 20)
     rules = apriori.generateRules(L, supp_data, min_confidence=0.95)
     print('-' * 20)
     print('rules: ', rules)
     print('-' * 20)
     assert False  # deliberate failure so the test runner shows the printed output
Example no. 3
def test1():
    dataSet = apriori.loadDataSet()
    print(dataSet)  #[[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]

    #C1=apriori.createC1(dataSet)
    #print(set(C1)) #{frozenset({4}), frozenset({5}), frozenset({2}), frozenset({3}), frozenset({1})}
    #print(list(C1)) #[frozenset({1}), frozenset({2}), frozenset({3}), frozenset({4}), frozenset({5})]

    #D=map(set,dataSet)
    #print(list(D)) #[{1, 3, 4}, {2, 3, 5}, {1, 2, 3, 5}, {2, 5}]  # Note: in Python 3, list(map_obj) exhausts the map iterator -- a second list() or set() on it yields nothing

    #L1,suppData0 = apriori.scanD(D, C1, 0.5)  # no longer works directly; D and C1 must be converted to lists first
    #print(L1)   #[frozenset({1}), frozenset({3}), frozenset({2}), frozenset({5})]
    #print(suppData0) #{frozenset({4}): 0.25, frozenset({5}): 0.75, frozenset({2}): 0.75, frozenset({3}): 0.75, frozenset({1}): 0.5}

    L, suppData = apriori.apriori(dataSet, 0.5)
    print(L)
    print(suppData)
    rules = apriori.generateRules(L, suppData, minConf=0.5)
    print(rules)
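
The "map gets exhausted" note in the comments above is worth a standalone demonstration; a minimal Python 3 sketch, independent of the apriori module:

# map() returns a one-shot iterator in Python 3: consuming it once empties it.
m = map(set, [[1, 3, 4], [2, 3, 5]])
print(list(m))  # [{1, 3, 4}, {2, 3, 5}]
print(list(m))  # [] -- the iterator is already exhausted
# Hence D = list(map(set, dataSet)) before anything that iterates D twice.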
Example no. 4
    #print 'rules:\n', rules
    with open("xss-train.txt") as f:
        for line in f:
            #/discuz?q1=0&q3=0&q2=0%3Ciframe%20src=http://xxooxxoo.js%3E
            index=line.find("?")
            if index>0:
                line=line[index+1:len(line)]
                #print line
                tokens=re.split('\=|&|\?|\%3e|\%3c|\%3E|\%3C|\%20|\%22|<|>|\\n|\(|\)|\'|\"|;|:|,|\%28|\%29',line)
                #print "token:"
                #print tokens
                myDat.append(tokens)
        f.close()

    L, suppData = apriori(myDat, 0.15)
    rules = generateRules(L, suppData, minConf=0.6)
    #print 'rules:\n', rules

# -*- coding:utf-8 -*-

import sys
import urllib
import urlparse
import re
from hmmlearn import hmm
import numpy as np
from sklearn.externals import joblib  # deprecated in modern scikit-learn; plain "import joblib" is the replacement
import HTMLParser
import nltk


# minimum length of a parameter value to process
MIN_LEN=6
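
To make the tokenization above concrete, here is a minimal standalone sketch that applies the same split pattern to the sample request from the comment; the expected output is my own reading of the pattern, not taken from the source:

import re

# split pattern copied from the snippet above, as a raw string
pattern = r'\=|&|\?|\%3e|\%3c|\%3E|\%3C|\%20|\%22|<|>|\n|\(|\)|\'|\"|;|:|,|\%28|\%29'
# sample query string from the comment (the part after the "?")
line = "q1=0&q3=0&q2=0%3Ciframe%20src=http://xxooxxoo.js%3E"
print(re.split(pattern, line))
# ['q1', '0', 'q3', '0', 'q2', '0', 'iframe', 'src', 'http', '//xxooxxoo.js', '']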
Example no. 5
import sys

sys.path.append('../chapter4')
import pandas as pd
from graphviz import Digraph
import apriori

# data file
fileName = 'association.txt'

# run the association analysis with the custom apriori module
minS = 0.1  # minimum support threshold
minC = 0.38  # minimum confidence threshold
dataSet = apriori.createData(fileName)  # load the formatted data set
L, suppData = apriori.apriori(dataSet, minSupport=minS)  # frequent itemsets meeting the minimum support
rules = apriori.generateRules(fileName, L, suppData,
                              minConf=minC)  # rules meeting the minimum confidence

# evaluate the association results
model_summary = 'data record: {1} \nassociation rules count: {0}'  # number of records and of rules meeting the thresholds
print(model_summary.format(len(rules), len(dataSet)))  # formatted output via str.format
df = pd.DataFrame(
    rules,
    columns=['item1', 'item2', 'instance', 'support', 'confidence',
             'lift'])  # data frame of frequent rules
df_lift = df[df['lift'] > 1.0]  # keep only rules with lift > 1
print(df_lift.sort_values('instance', ascending=False))  # print the sorted data frame (DataFrame.sort was removed from pandas)

# visualize the association results
dot = Digraph()  # directed graph
graph_data = df_lift[['item1', 'item2', 'instance']]  # antecedent, consequent and instance count for plotting
for each_data in graph_data.values:  # iterate over the rules (the snippet is cut off here)
Example no. 6
File: svm.py Project: HgLeL/rub
import apriori
dataSet = apriori.loadDataSet()
L, suppData = apriori.apriori(dataSet, minSupport=0.5)
rules = apriori.generateRules(L, suppData, minConf=0.5)

from sklearn import svm

x = [[2, 0], [1, 1], [2, 3]]
y = [0, 0, 1]
clf = svm.SVC(kernel='linear')
clf.fit(x, y)
print(clf)
print(clf.predict([[2, 0]]))  # which class does [2, 0] belong to?

import numpy as np
import pylab as pl
from sklearn import svm
np.random.seed(0)
x = np.r_[np.random.randn(20, 2) - [2, 2], np.random.randn(20, 2) + [2, 2]]
y = [0] * 20 + [1] * 20  #[0,0...1,1]
clf = svm.SVC(kernel='linear')
clf.fit(x, y)
w = clf.coef_[0]  # public sklearn API; _get_coef()/_intercept_ are private
a = -w[0] / w[1]
xx = np.linspace(-5, 5)
yy = a * xx - (clf.intercept_[0]) / w[1]
b = clf.support_vectors_[0]
yy_down = a * xx + (b[1] - a * b[0])
b = clf.support_vectors_[-1]
yy_up = a * xx + (b[1] - a * b[0])
Example no. 7
#dataSet = apriori.loadDataSet()

#C1 = apriori.createC1(dataSet)

#D = map(set, dataSet)
#print('D', D)
#L1, suppData0 = apriori.scanD(D, C1, 0.5)

#frequent itemsets L
#support info for all candidate itemsets: suppData
L, suppData = apriori.apriori(dataSet, 0.3)

print('L:', L)
#print('suppData:', suppData)
rules = apriori.generateRules(L, suppData, minConf = 0.3)
#print('rules', len(rules))

rules_remove_redundancy = []
def remove_redundancy():

	redundancy_indices = []

	for i in range(0, len(rules) - 1):
		for j in range(i + 1, len(rules)):
			if rules[i][0] < rules[j][0] and rules[i][1] < rules[j][1] and rules[i][4] <= rules[j][4]:
				if i not in redundancy_indices:
					redundancy_indices.append(i)

			if rules[j][0] < rules[i][0] and rules[j][1] < rules[i][1] and rules[j][4] <= rules[i][4]:
				if j not in redundancy_indices:
					redundancy_indices.append(j)

	# the scraped snippet breaks off here; filling rules_remove_redundancy
	# (declared above) with the surviving rules is the evident intent:
	for idx in range(len(rules)):
		if idx not in redundancy_indices:
			rules_remove_redundancy.append(rules[idx])
Example no. 8
import apriori

dataMat = apriori.loadDataSet()
print(dataMat)

C1 = apriori.createC1(dataMat)  # candidate 1-itemsets, not a data set
print(C1)

L, supportData = apriori.apriori(dataMat)
print(L)
print(supportData)

apriori.generateRules(L, supportData, 0.5)
Example no. 9
#print dataset
C1 = apriori.createC1(dataset)
#print 'C1', C1
D = list(map(set, dataset))  # list() so the Python 3 iterator is not exhausted
#print 'D', D
L1, support_data = apriori.scanD(D,C1,support)
#print 'L1', L1
#print 'support_data', support_data
k_length = 2
transactions = apriori.aprioriGen(L1, k_length)
#print 'transactions', transactions
#print '\n*** *** ***'
L,support_data = apriori.apriori(dataset, support)
#print 'L', L
#print 'support_data', support_data
rules = apriori.generateRules(L, support_data, min_confidence=0.7)
#print 'rules', rules

ruleDict = apriori.generateRuleDict(rules)

'''
print 'ruleDict', ruleDict
print '*** *** ***'
'''
print('keys', ruleDict.keys())
print('*** *** ***')


## testing
if __name__ == '__main__':
    pass  # the scraped snippet ends here
    #print '\n\n***\n'
Example no. 10
import ex1
import apriori

data = ex1.loadDataSet()
c1 = ex1.createc1(data)
print(c1)
print(data)
# d = map(set,data)
l1, supportdata = ex1.scand(data, c1, 0.5)
print(l1)
print(supportdata)
k = 2
l = [l1]

# ck = ex1.apriorigen(l,k)
# print(ck)
while (len(l[k - 2]) > 0):
    ck = ex1.apriorigen(l[k - 2], k)
    lk, supk = ex1.scand(data, ck, 0.5)
    supportdata.update(supk)
    l.append(lk)
    k += 1

print(l, k)
rules = apriori.generateRules(l, supportdata, minConf=0.7)
print(rules)
Example no. 11
    D = list(map(set, datSet))  # list() so it can be iterated more than once (Python 3)
    #D[set([1, 3, 4]), set([2, 3, 5]), set([1, 2, 3, 5]), set([2, 5])]
    L1, suppData0 = apriori.scanD(D, C1, 0.5)
    #retList, supportData
    print("retList-L1")
    print(L1)
    print("supportData-suppData0")
    print(suppData0)
    # apriori.aprioriGen()
    L2, suppData0 = apriori.apriori(datSet)
    print("L2")
    print(L2)
    print("suppData0")
    print(suppData0)
    #
    rules = apriori.generateRules(L2, suppData0, minConf=0.6)
    print("rules")
    print(rules)

    # actionIdList, billTitleList=recentAprioriTest.getActionIds()
    # print("actionIdList")
    # print(actionIdList)
    # print("billTitleList")
    # print(billTitleList)
    #
    # transDict, itemMeaning=recentAprioriTest.getTransList(actionIdList[:2],billTitleList[:2])
    # print("transDict")
    # print(transDict)
    # print("itemMeaning")
    # print(itemMeaning)
Example no. 12
# The current data set isn't in transactional format. To convert it into a transactional data set, we use the following snippet of code:
basket_str = ""

for rowNum, row in accident_data.iterrows():
    
    #Break lines
    if (rowNum != 0):
        basket_str = basket_str + "\n"
    #Add the rowid as the first column
    basket_str = basket_str + str(rowNum) 
    #Add columns
    for colName, col in row.items():  # iteritems() was removed from pandas
        if colName != 'Accident_Index':
            basket_str = basket_str + "," + colName + "=" + str(col)
#print basket_str
basket_file=open("accidents_basket.csv","w")
basket_file.write(basket_str)
basket_file.close()

import csv
with open("accidents_basket.csv","rb") as f:
    reader=csv.reader(f)
    my_list=list(reader)

#my_list
L,supportData=apriori.apriori(my_list,0.6)
f_rules= apriori.generateRules(L,supportData,0.6)

for row in f_rules:
    print(list(row[0]), " => ", list(row[1]), row[2])
Example no. 13
File: 12-2.py Project: yuyuqi/1book
        for line in f:
            #/discuz?q1=0&q3=0&q2=0%3Ciframe%20src=http://xxooxxoo.js%3E
            index = line.find("?")
            if index > 0:
                line = line[index + 1:len(line)]
                #print line
                tokens = re.split(
                    '\=|&|\?|\%3e|\%3c|\%3E|\%3C|\%20|\%22|<|>|\\n|\(|\)|\'|\"|;|:|,|\%28|\%29',
                    line)
                #print "token:"
                #print tokens
                myDat.append(tokens)
        f.close()

    L, suppData = apriori(myDat, 0.15)
    rules = generateRules(L, suppData, minConf=0.6)
    #print 'rules:\n', rules

# -*- coding:utf-8 -*-

import sys
import urllib
import urlparse
import re
from hmmlearn import hmm
import numpy as np
from sklearn.externals import joblib
import HTMLParser
import nltk

# minimum length of a parameter value to process
MIN_LEN = 6
Example no. 14
    arrAll.append(arr4.strip().split(','))
arr5 = arr[5]
if arr5:
    arrAll.append(arr5.strip().split(','))
dataSet = arrAll
print('dataSet: ', dataSet)

# Apriori: generate the frequent itemsets and their supports
L1, supportData1 = apriori.apriori(dataSet, minSupport=0.2)
print('L(0.2): ', L1)
print('supportData(0.2): ', supportData1)

# generate the association rules
dic = dict()
rules = apriori.generateRules(L1, supportData1, minConf=0.8)
print('rules: ', rules[:10])
print(type(rules))

#for i in range(0,len(rules)):
#   if rules[i][0]==frozenset(['58691ed3d87f49b489feb40de28a92f9']):
#      print map(str,rules[i][1])
#     dic[map(str,rules[i][0])].append(map(str,rules[i][1]))
#    print dic
"""
for line in sys.stdin:
    arr =line.strip().split()
    userID = arr[0]
    downloadRes =arr[1]
    if downloadRes:
        arrAll.append(downloadRes.strip().split(','))
Example no. 15
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
"""
#@author:Benny.Chen
#@file: main.py
#@time: 2020/6/10 11:06
#@email:[email protected]
"""
from pprint import pprint

import apriori

if __name__ == '__main__':
    dataSet = apriori.loadDataSet()
    L, supportData = apriori.apriori(dataSet, minSupport=0.5)
    rules = apriori.generateRules(L, supportData)
    pprint(rules)
Example no. 16
# coding:utf-8
import apriori
import time
import numpy as np

# read the training set
with open("./data/agaricus_train.csv") as f:  # text mode so split(',') works on str in Python 3
    dataSet = [line[:-1].split(',') for line in f.readlines()]

# every element of L appears in at least 25% of the samples
L, suppData = apriori.apriori(dataSet, 0.25) # the lower the threshold, the slower this gets

# generate rules; each rule has confidence of at least 0.6
bigRuleList = apriori.generateRules(L, suppData, 0.6)

# rules P -> H, sorted by the size of the antecedent set P
bigRuleList = sorted(bigRuleList, key=lambda x:len(x[0]), reverse=True)

# read the test set
with open("./data/agaricus_test.csv") as f:  # text mode, as above
    dataSet = [line[:-1].split(',') for line in f.readlines()]
labels = np.array([int(x[0]) for x in dataSet])

scores = []
for line in dataSet:
    tmp = []
    for item in bigRuleList:
        if item[0].issubset(set(line)):
            if "1" in item[1]:
                tmp.append(float(item[2]))
            # since we predict "the probability of being 1", subtract the confidence from 1
Example no. 17
        # every prefix path reached in this loop is frequent: infrequent items
        # were already filtered out while building the conditional tree
        frequentItems.append(prefixPath)
        # collect the conditional pattern bases
        conditionPatterns = findPrefixPath(basePattern, myTableHeader[basePattern][1])
        # build the conditional FP-tree from the pattern bases
        conditionTree, headerTable = createTree(conditionPatterns, minSupport)
        if conditionTree is not None:
            print('condition: ', prefixPath)
            conditionTree.display(1)
            mineTree(conditionTree, headerTable, minSupport, prefixPath, frequentItems)
    return frequentItems


# simpleDat = loadSimpDat()
# initSet = createInitSet(simpleDat)
# myFPTree, myHeaderTab = createTree(initSet, 3)
# frequentItems = []
# mineTree(myFPTree, myHeaderTab, 3, set([]), frequentItems)
parsedData = [line.split() for line in open('./machinelearninginaction/Ch12/kosarak.dat').readlines()]
initSet = createInitSet(parsedData)
timeStart = time.time()
myFPTree, myHeaderTab = createTree(initSet, 100000)
frequentItems = []
mineTree(myFPTree, myHeaderTab, 100000, set([]), frequentItems)
print(frequentItems)
print('timeEnd:', time.time() - timeStart)

timeStart = time.time()
L, supportData = apriori.apriori(parsedData, 0.1)
apriori.generateRules(L, supportData, 0.7)
print('timeEnd:', time.time() - timeStart)
Example no. 18
# Extract transactions and meanings
transactions = map_transactions(RAW_DATA[0])
meaning = map_meaning(RAW_MEANING[0])

for threshold in np.arange(0.5, 0.25, -0.05):
    itemsets, support = apriori.apriori(transactions.values(), minSupport=threshold)
    print "THRESHOLD: ", threshold
    print len(itemsets), "itemsets of length:"
    print [len(i) for i in itemsets]
    print "\n"

itemset, support = apriori.apriori(transactions.values(), minSupport=0.3)
for threshold in np.arange(0.7, 0.99, 0.05):
    print "THRESHOLD: ", threshold
    rules = apriori.generateRules(itemset, support, minConf=threshold)
    print "\n"

def get_meaning(rule, meaning):
    condition, result = [], []
    for c in rule[0]:
        condition.append(meaning[c])
    for r in rule[1]:
        result.append(meaning[r])
    
    print "IF:", " AND ".join(condition)
    print "THEN:", " AND ".join(result)
    print "CONFIDENCE: ", rule[2], "\n\n"

for i in range(6):
    get_meaning(random.choice(rules), meaning)
Example no. 19
D = list(map(set, dataset))  # list() so the Python 3 iterator is not exhausted
#print 'D', D
L1, support_data = apriori.scanD(D, C1, support)
#print 'L1', L1
#print 'support_data', support_data
print('support_data')
for k, v in support_data.items():  # iteritems() is Python 2 only
    print(k, v)
k_length = 2
transactions = apriori.aprioriGen(L1, k_length)
#print 'transactions', transactions
#print '\n*** *** ***'
L, support_data = apriori.apriori(dataset, support)
#print 'L', L
#print 'support_data', support_data
rules = apriori.generateRules(L, support_data, min_confidence=0.7)
#print 'rules', rules

ruleDict = apriori.generateRuleDict(rules)

print('ruleDict')
for k, v in ruleDict.items():
    # crude column alignment: pad with spaces based on the rendered width of k
    print('\t', k, "".join(
        ' '
        for i in range(30 - len(''.join(item
                                        for item in list(k))) - len(k) * 4)), v)
print('*** *** ***')

#print 'keys', ruleDict.keys()
#print '*** *** ***'
Example no. 20
import data
import apriori
import matplotlib.pyplot as plt
import numpy

dataSet = data.loadDataSet()
print('size of dataSet: ', len(dataSet))

# 1)
plt.figure()
minSupports = [0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4]
cntOfRules = []
for minSupport in minSupports:
    frequentItemSet, supportData = apriori.apriori(dataSet, minSupport)
    rules = apriori.generateRules(frequentItemSet, supportData, 0.1)
    cntOfRules.append(len(rules))
x = numpy.array(minSupports)
y = numpy.array(cntOfRules)
plt.plot(x, y)
plt.xlabel('minimal support with 0.1 confidence')
plt.ylabel('cntOfRules')

# 2)
plt.figure()
minConfidences = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
cntOfRules = []
frequentItemSet, supportData = apriori.apriori(dataSet, 0.1)
for minConfidence in minConfidences:
    rules = apriori.generateRules(frequentItemSet, supportData, minConfidence)
    cntOfRules.append(len(rules))
x = numpy.array(minConfidences)
Example no. 21
# this apriori module only supports Python 2.7
from apriori import apriori
from apriori import generateRules
import re

if __name__ == "__main__":
    myDat = []
    with open("./data/ibook/data/xss-2000.txt") as f:
        for line in f:
            index = line.find("?")
            tokens = re.split(
                '\=|&|\?|\%3e|\%3c|\%3E|\%3C|\%20|\%22|<|>|\\n|\(|\)|\'|\"|;|:|,|\%28|\%29',
                line)
            myDat.append(tokens)

    L, suppData = apriori(myDat, 0.15)
    rules = generateRules(L, suppData, minConf=0.5)
    print(rules)
Example no. 22
D
L1, supportData0 = apriori.scanD(
    D, C1, 0.5)  # compute each candidate's support and return the set L1 that meets the 0.5 minimum
L1
supportData0

# generate the frequent itemsets from the support threshold
reload(apriori)  # Python 3 needs: from importlib import reload
L, supportData = apriori.apriori(dataSet)
L  # frequent itemsets with support >= 0.5
L[0]  # itemsets with one element
L[1]  # itemsets with two elements
L[2]  # itemsets with three elements
L[3]
apriori.aprioriGen(L[0], 2)  # see how the candidate 'L[1]' looks before the support filter
L, supportData = apriori.apriori(dataSet, minSupport=0.7)  # a higher support threshold yields fewer results

# generate association rules from the confidence threshold
reload(apriori)
L, supportData = apriori.apriori(dataSet, minSupport=0.5)
rules = apriori.generateRules(L, supportData, minConf=0.7)  # rules at 0.7 confidence
rules = apriori.generateRules(L, supportData, minConf=0.5)  # rules at 0.5 confidence

# test it on the poisonous-mushroom data set
mushDataSet = [line.split() for line in open('mushroom.dat').readlines()]
# the first column of this data set is the poisonous/edible label
L, supportData = apriori.apriori(mushDataSet, minSupport=0.3)
for item in L[3]:
    if item.intersection('2'):  # frequent itemsets containing feature '2' (poisonous)
        print(item)
Example no. 23
## replace the raw columns with their discretized versions
def numerizeData(data):
    data_birth_year = dataDiscretizeBirthyear(data)
    data_weight = dataDiscretizeWeight(data)
    data_height = dataDiscretizeHeight(data)
    data.drop("birth_year",inplace = True ,axis = 1)
    data.drop("weight",inplace = True ,axis = 1)
    data.drop("height",inplace = True ,axis = 1)
    data = pd.concat([data,data_birth_year,data_weight,data_height],axis=1)
    return data

if __name__ == '__main__':

    myDat = numerizeData(data)
    #%%
    print(myDat['weight'])
    print(myDat['birth_year'])
    print(myDat['height'])
    #%%
    # frequent itemsets and their supports
    sets,sp = apriori.apriori(myDat.values,4,0.6)
    rules = apriori.generateRules(sets,sp,0.8)
    print ("层级:频繁项集:支持度")
    for Lk in sets:
        for freq_set in Lk:
            print(str(len(list(Lk)[0])),':',freq_set,' : ',sp[freq_set])
    print("强关联规则:置信度")
    for item in rules:
        print(item[0], "-->>", item[1], " : ", item[2])
#%%
Example no. 24
def test():
    dataSet = apriori.loadDataSet()
    print("DataSet:", dataSet)
    L, suppData = apriori.apriori(dataSet)
    rules = apriori.generateRules(L, suppData, minConf=0.5)
    print(rules)
Example no. 25

import apriori as ap

dataSet = ap.loadDataSet()
#print dataSet
C1 = ap.createC1(dataSet)
#print C1
D = list(map(set, dataSet))  # list() so the Python 3 iterator is not exhausted
#print D
L1, suppData0 = ap.scanD(D, C1, 0.5)
#print suppData0
L, S = ap.apriori(D, 0.5)
#print L

print(L)

List = ap.generateRules(L, S, minConf=0.4)
print(List)


Example no. 26
def test1():
    dataSet = apriori.loadDataSet()
    C1 = apriori.createC1(dataSet)
    L, supportData = apriori.apriori(dataSet, minSupport=0.5)
    # print(L)
    rules = apriori.generateRules(L, supportData, minConf=0.5)
Example no. 27
data = []
for i in range(len(values)):
    temp = []
    for j in range(len(values[0])):
        if values[i][j] == 1:
            temp.append(j)
    data.append(temp)
counts = []
for index in columns:
    line = df[index]
    count = 0
    for i in range(len(line)):
        if line[i] == 1:
            count += 1
    counts.append(count / 10000.0)
counts.sort()
minSupport = counts[len(counts) // 5]  # integer index; "/" would give a float in Python 3

#use apriori
L, supportData = ap.apriori(data, minSupport)
rules = ap.generateRules(L, supportData, minConf=0.4)

#use fpGrowth

minSup = minSupport * 10000
simpDat = data
initSet = fp.createInitSet(simpDat)
myFPtree, myHeaderTab = fp.createTree(initSet, minSup)
myFreqList = []
fp.mineTree(myFPtree, myHeaderTab, minSup, set([]), myFreqList)
print(myFreqList)
Example no. 28
df = pd.DataFrame(et_tours, columns=tour_names)
#print(df.shape)
#print(df.head)

for index in range(len(tour_names)):
    #        tour_to_name = {'0' : np.nan, '1' : tour_names[index],'2' : tour_names[index],'1' : tour_names[index],'3' : tour_names[index],'4' : tour_names[index],'5' : tour_names[index],'6' : tour_names[index],'7' : tour_names[index]}
    tour_to_name = {0: np.nan, 1: tour_names[index]}
    df[tour_names[index]] = df[tour_names[index]].map(tour_to_name)

print(df.head())

C1 = createC1(df)  #new stuff
D = map(set, df)  #new stuff (note: iterating a DataFrame yields its column labels, not its rows)
L1, support_data = scanD(D, C1, 0.0000005)  #new stuff

my_data = list()

for index in range(df.shape[0]):
    basket = list(df.iloc[index].dropna())  # .ix was removed from pandas
    my_data.append(basket)

L, suppData = apriori(my_data)

print('Identified rules with support = ', alpha, 'and confidence= ', beta)
rules = generateRules(L, suppData, minConf=beta)
n_other_items = 1
while n_other_items <= max_other_items:
    print('\nRules with ', n_other_items, 'other item(s)')
    for item in L[n_other_items]:
        if item.intersection(item_of_interest): print(item)
    n_other_items = n_other_items + 1
Example no. 29
# The Apriori principle reduces the number of itemsets we may need to examine:
# if an itemset is frequent, then all of its subsets are frequent; conversely,
# if an itemset is infrequent, then all of its supersets are infrequent.


# Question 1: in rule generation, when an itemset has three elements, why are
# only 1-element -> 2-element rules computed, and not 2-element -> 1-element?
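# A likely answer (assuming the apriori module follows the well-known
# "Machine Learning in Action" implementation -- an assumption, since this
# snippet does not show the module): for itemsets with more than two items,
# generateRules only calls rulesFromConseq, which starts from 1-item
# consequents and merges them upward, so rules with a 1-item consequent
# (e.g. {2,3} -> {5}) are never scored. A minimal sketch of the usual fix,
# kept commented out so this script's behaviour is unchanged:
#
#   for freqSet in L[i]:
#       H1 = [frozenset([item]) for item in freqSet]
#       if i > 1:
#           calcConf(freqSet, H1, supportData, bigRuleList, minConf)  # also score 1-item consequents
#           rulesFromConseq(freqSet, H1, supportData, bigRuleList, minConf)
#       else:
#           calcConf(freqSet, H1, supportData, bigRuleList, minConf)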

import apriori
from votesmart import votesmart

dataSet = apriori.loadDataSet()
#C1 = apriori.createC1(dataSet)
#print ("C1 is %s"  % C1)
#D = map(set,dataSet)
#print ( "%r"  % D)
#L1,suppData0 = apriori.scanD(list(D), list(C1), 0.5)
#print (L1)
#print (suppData0)

L, suppData = apriori.apriori(dataSet, 0.5)
print("L is", L)
print("suppData is", suppData)
#L is [[frozenset({1}), frozenset({3}), frozenset({2}), frozenset({5})], [frozenset({3, 5}), frozenset({1, 3}), frozenset({2, 5}), frozenset({2, 3})], [frozenset({2, 3, 5})], []]
#suppData is {frozenset({5}): 0.75, frozenset({3}): 0.75, frozenset({2, 3, 5}): 0.5, frozenset({3, 5}): 0.5, frozenset({2, 3}): 0.5, frozenset({2, 5}): 0.75, frozenset({1}): 0.5, frozenset({1, 3}): 0.5, frozenset({2}): 0.75}

# mine association rules
rules = apriori.generateRules(L, suppData, 0.7)

print("rules is ", rules)
Example no. 30
# The association rule applies to 3/4 of the records.

# The Apriori principle reduces the number of itemsets we may need to examine:
# if an itemset is frequent, then all of its subsets are frequent; conversely,
# if an itemset is infrequent, then all of its supersets are infrequent.

# Question 1: same as in Example no. 29 -- why are only 1-element -> 2-element
# rules computed for a 3-element itemset, and not 2-element -> 1-element?

import apriori
from votesmart import votesmart

dataSet = apriori.loadDataSet()
#C1 = apriori.createC1(dataSet)
#print ("C1 is %s"  % C1)
#D = map(set,dataSet)
#print ( "%r"  % D)
#L1,suppData0 = apriori.scanD(list(D), list(C1), 0.5)
#print (L1)
#print (suppData0)

L, suppData = apriori.apriori(dataSet, 0.5)
print("L is", L)
print("suppData is", suppData)
#L is [[frozenset({1}), frozenset({3}), frozenset({2}), frozenset({5})], [frozenset({3, 5}), frozenset({1, 3}), frozenset({2, 5}), frozenset({2, 3})], [frozenset({2, 3, 5})], []]
#suppData is {frozenset({5}): 0.75, frozenset({3}): 0.75, frozenset({2, 3, 5}): 0.5, frozenset({3, 5}): 0.5, frozenset({2, 3}): 0.5, frozenset({2, 5}): 0.75, frozenset({1}): 0.5, frozenset({1, 3}): 0.5, frozenset({2}): 0.75}

# mine association rules
rules = apriori.generateRules(L, suppData, 0.7)

print("rules is ", rules)
Example no. 31
data = []
for i in range(len(values)):
	temp = []
	for j in range(len(values[0])):
		if values[i][j] == 1:
			temp.append(j)
	data.append(temp)
counts = []
for index in columns:
	line = df[index]
	count = 0
	for i in range(len(line)):
		if line[i]==1:
			count += 1
	counts.append(count / 10000.0)
counts.sort()
minSupport = counts[len(counts) // 5]  # integer index; "/" would give a float in Python 3

#use apriori 
L,supportData = ap.apriori(data,minSupport)
rules = ap.generateRules(L,supportData,minConf=0.4)

#use fpGrowth

minSup = minSupport*10000
simpDat = data
initSet = fp.createInitSet(simpDat)
myFPtree, myHeaderTab = fp.createTree(initSet, minSup)
myFreqList = []
fp.mineTree(myFPtree, myHeaderTab, minSup, set([]), myFreqList)
print(myFreqList)
basket_str = ""
for rowNum, row in accident_data.iterrows():

    #Break lines
    if (rowNum != 0):
        basket_str = basket_str + "\n"
    #Add the rowid as the first column
    basket_str = basket_str + str(rowNum)
    #Add columns
    for colName, col in row.items():  # iteritems() was removed from pandas
        if (colName != 'Accident_Index'):
            basket_str = basket_str + "," + colName + "=" + str(col)

print(basket_str)
basket_file = open("accident_basket.csv", "w")
basket_file.write(basket_str)
basket_file.close()
"""
Read the basket file now and compute rules
"""
import csv
with open('accident_basket.csv', newline='') as f:  # text mode for csv in Python 3
    reader = csv.reader(f)
    your_list = list(reader)

L, supportData = apriori.apriori(your_list, 0.6)
brl = apriori.generateRules(L, supportData, 0.6)

for row in brl:
    print(list(row[0]), " => ", list(row[1]), row[2])
Example no. 33
import apriori

dataSet = apriori.loadDataSet()
L, supportData = apriori.apriori(dataSet, minSupport=0.1)

print "[result]-----------------------------------------"
rules = apriori.generateRules(L, supportData, minConf=1.0)
Example no. 34
import apriori
mushDatSet = [line.split() for line in open('mushroom.dat').readlines()]

L, supportData = apriori.apriori(mushDatSet, 0.4)

rule = apriori.generateRules(L, supportData, minConf=0.5)
for item in L[3]:
    if item.intersection('2'):  # frequent itemsets containing feature '2' (the poisonous label)
        print(item)