def rulesthread(self):
    """Generate association rules from the frequent itemsets in self.L and
    report them in the GUI text widgets, timing the whole run.

    Dispatches on self.suanfa (algorithm name): 'Aprioi' uses the apriori
    module's generateRules; anything else uses Rules.rulesGenerator, which
    fills self.rules in place.  A minconfig of 0 means "use the generator's
    default minimum confidence".
    """
    start = time.time()
    if (self.suanfa=='Aprioi'):  # NOTE(review): 'Aprioi' spelling is presumably the key set by the algorithm selector — confirm it matches
        if(self.minconfig!=0):
            self.rules=apriori.generateRules(self.L,self.support,self.minconfig)
        else:
            self.rules=apriori.generateRules(self.L,self.support)
    else:
        # Rules.rulesGenerator appends into self.rules rather than returning.
        self.rules = []
        if (self.minconfig!=0):
            Rules.rulesGenerator(self.L, self.rules, self.minconfig)
        else:
            Rules.rulesGenerator(self.L, self.rules)
    # self.rules = []
    # if (self.minconfig!=0):
    #     Rules.rulesGenerator(self.L, self.rules, self.minconfig)
    # else:
    #     Rules.rulesGenerator(self.L, self.rules)
    self.result_data_Text.insert(INSERT,'关联规则\n')
    # Dump each rule, one per line, into the result pane.
    for i in self.rules:
        self.result_data_Text.insert(INSERT,list(i))
        self.result_data_Text.insert(INSERT,'\n')
    end = time.time()
    self.result_data_Text.insert(INSERT,str(len(self.rules))+'\n')
    # Log: "rule generation finished, N rules, elapsed seconds".
    self.log_data_Text.insert(INSERT,'关联规则生成完毕! 共'+str(len(self.rules))+'项 耗时:'+str(round(end-start,2))+'s\n')
def test_generate_associations(self):
    """Smoke-test apriori + generateRules on self.dataset (Python 2 syntax).

    Mines frequent itemsets at 50% support, generates rules at 95%
    confidence, and ends with `assert False` so the test runner always
    shows the captured prints.
    """
    L, supp_data = apriori.apriori(self.dataset, min_support=0.5)
    print 'L:', L
    print '-'*20
    print 'supp_data: ', supp_data
    print '-'*20
    rules = apriori.generateRules(L, supp_data, min_confidence=0.95)
    print '-'*20
    print 'rules: ', rules
    print '-'*20
    assert False  # deliberate failure: forces the framework to display output
def test1():
    """Walk the apriori pipeline on the toy dataset and print each stage."""
    dataSet = apriori.loadDataSet()
    print(dataSet)  # [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]
    # Intermediate steps, kept for reference:
    #C1=apriori.createC1(dataSet)
    #print(set(C1))   # {frozenset({4}), frozenset({5}), frozenset({2}), frozenset({3}), frozenset({1})}
    #print(list(C1))  # [frozenset({1}), frozenset({2}), frozenset({3}), frozenset({4}), frozenset({5})]
    #D=map(set,dataSet)
    #print(list(D))   # NOTE: in Python 3, list(...) exhausts the map iterator — it is empty afterwards (set(...) too)
    #L1,suppData0 = apriori.scanD(D, C1, 0.5)  # D and C1 must be materialized as lists first
    #print(L1)        # [frozenset({1}), frozenset({3}), frozenset({2}), frozenset({5})]
    #print(suppData0) # {frozenset({4}): 0.25, frozenset({5}): 0.75, ...}
    L, suppData = apriori.apriori(dataSet, 0.5)
    print(L)
    print(suppData)
    rules = apriori.generateRules(L, suppData, minConf=0.5)
    print(rules)
#print 'rules:\n', rules with open("xss-train.txt") as f: for line in f: #/discuz?q1=0&q3=0&q2=0%3Ciframe%20src=http://xxooxxoo.js%3E index=line.find("?") if index>0: line=line[index+1:len(line)] #print line tokens=re.split('\=|&|\?|\%3e|\%3c|\%3E|\%3C|\%20|\%22|<|>|\\n|\(|\)|\'|\"|;|:|,|\%28|\%29',line) #print "token:" #print tokens myDat.append(tokens) f.close() L, suppData = apriori(myDat, 0.15) rules = generateRules(L, suppData, minConf=0.6) #print 'rules:\n', rules# -*- coding:utf-8 -*- import sys import urllib import urlparse import re from hmmlearn import hmm import numpy as np from sklearn.externals import joblib import HTMLParser import nltk #处理参数值的最小长度 MIN_LEN=6
import sys
sys.path.append('../chapter4')
import pandas as pd
from graphviz import Digraph
import apriori

# Data file for the association analysis.
fileName = 'association.txt'

# Run the custom chapter-4 apriori implementation.
minS = 0.1   # minimum support threshold
minC = 0.38  # minimum confidence threshold
dataSet = apriori.createData(fileName)  # formatted transaction data set
L, suppData = apriori.apriori(dataSet, minSupport=minS)  # itemsets meeting min support
# NOTE(review): this generateRules takes fileName as its first argument — a
# custom signature; confirm against the chapter4 apriori module.
rules = apriori.generateRules(fileName, L, suppData, minConf=minC)  # rules meeting min confidence

# Report: record count and number of rules passing both thresholds.
model_summary = 'data record: {1} \nassociation rules count: {0}'
print(model_summary.format(len(rules), len(dataSet)))
df = pd.DataFrame(
    rules,
    columns=['item1', 'itme2', 'instance', 'support', 'confidence',
             'lift'])  # frequent-rule frame; 'itme2' typo kept: it is the live column key
df_lift = df[df['lift'] > 1.0]  # keep only rules with lift > 1
# NOTE(review): DataFrame.sort was removed in pandas 0.20+; modern pandas
# requires sort_values('instance', ascending=False).
print(df_lift.sort('instance', ascending=False))

# Graph the rules.  NOTE(review): the loop body is outside this chunk.
dot = Digraph()  # directed graph
graph_data = df_lift[['item1', 'itme2', 'instance']]  # antecedent, consequent, instance count
for each_data in graph_data.values:
import apriori

# Association-rule demo: frequent itemsets at 50% support, rules at 50% confidence.
dataSet = apriori.loadDataSet()
L, suppData = apriori.apriori(dataSet, minSupport=0.5)
rules = apriori.generateRules(L, suppData, minConf=0.5)

from sklearn import svm

# Tiny linearly separable example: fit a linear SVM and classify one point.
x = [[2, 0], [1, 1], [2, 3]]
y = [0, 0, 1]
clf = svm.SVC(kernel='linear')
clf.fit(x, y)
print(clf)
print(clf.predict([[2, 0]]))  # which class does [2, 0] belong to?

import numpy as np
import pylab as pl
from sklearn import svm

np.random.seed(0)
# Two Gaussian clusters shifted to (-2,-2) and (+2,+2), 20 points each.
x = np.r_[np.random.randn(20, 2) - [2, 2], np.random.randn(20, 2) + [2, 2]]
y = [0] * 20 + [1] * 20  # [0,0,...,1,1]
clf = svm.SVC(kernel='linear')
clf.fit(x, y)

# Separating line w0*x + w1*y + intercept = 0, rewritten as y = a*x - intercept/w1.
# BUG FIX: use the public, stable sklearn attributes coef_ and intercept_.
# The original called clf._get_coef() and clf._intercept_, which are private
# internals and raise AttributeError on current scikit-learn.
w = clf.coef_[0]
a = -w[0] / w[1]
xx = np.linspace(-5, 5)
yy = a * xx - (clf.intercept_[0]) / w[1]

# Margin boundaries: lines parallel to the separator through support vectors.
b = clf.support_vectors_[0]
yy_down = a * xx + (b[1] - a * b[0])
b = clf.support_vectors_[-1]
yy_up = a * xx + (b[1] - a * b[0])
#dataSet = apriori.loadDataSet() #C1 = apriori.createC1(dataSet) #D = map(set, dataSet) #print('D', D) #L1, suppData0 = apriori.scanD(D, C1, 0.5) #频繁项集L #所有候选项集的支持度信息suppData L, suppData = apriori.apriori(dataSet, 0.3) print('L:', L) #print('suppData:', suppData) rules = apriori.generateRules(L, suppData, minConf = 0.3) #print('rules', len(rules)) rules_remove_redundancy = [] def remove_redundancy(): redundancy_indices = [] for i in range(0, len(rules) - 1): for j in range(i + 1, len(rules)): if rules[i][0] < rules[j][0] and rules[i][1] < rules[j][1] and rules[i][4] <= rules[j][4]: if i not in redundancy_indices: redundancy_indices.append(i) if rules[j][0] < rules[i][0] and rules[j][1] < rules[i][1] and rules[j][4] <= rules[i][4]: if j not in redundancy_indices:
import apriori

# Demo: load the sample transactions, show the size-1 candidate itemsets,
# mine frequent itemsets at the default support, then derive association
# rules at 0.5 confidence.  Prints each intermediate result.
transactions = apriori.loadDataSet()
print(transactions)

candidates = apriori.createC1(transactions)
print(candidates)

frequent_sets, support_map = apriori.apriori(transactions)
print(frequent_sets)
print(support_map)

apriori.generateRules(frequent_sets, support_map, 0.5)
#print dataset C1 = apriori.createC1(dataset) #print 'C1', C1 D = map(set,dataset) #print 'D', D L1, support_data = apriori.scanD(D,C1,support) #print 'L1', L1 #print 'support_data', support_data k_length = 2 transactions = apriori.aprioriGen(L1, k_length) #print 'transactions', transactions #print '\n*** *** ***' L,support_data = apriori.apriori(dataset, support) #print 'L', L #print 'support_data', support_data rules = apriori.generateRules(L, support_data, min_confidence=0.7) #print 'rules', rules ruleDict = apriori.generateRuleDict(rules) ''' print 'ruleDict', ruleDict print '*** *** ***' ''' print 'keys', ruleDict.keys() print '*** *** ***' ## testing if __name__ == '__main__': #print '\n\n***\n'
# Manual expansion of the apriori main loop using the ex1 helpers (Python 3).
import ex1
import apriori

data = ex1.loadDataSet()
c1 = ex1.createcl(data)  # size-1 candidate itemsets
print(c1)
print(data)
# d = map(set,data)
l1, supportdata = ex1.scand(data, c1, 0.5)  # frequent 1-itemsets at 50% support
print(l1)
print(supportdata)
k = 2
l = [l1]
# ck = ex1.apriorigen(l,k)
# print(ck)
# Grow L_k from L_{k-1} until no further frequent itemsets appear;
# supportdata accumulates supports across all sizes.
while (len(l[k - 2]) > 0):
    ck = ex1.apriorigen(l[k - 2], k)
    lk, supk = ex1.scand(data, ck, 0.5)
    supportdata.update(supk)
    l.append(lk)
    k += 1
print(l, k)
rules = apriori.generateRules(l, supportdata, minConf=0.7)
print(rules)
D = map(set, datSet) #D[set([1, 3, 4]), set([2, 3, 5]), set([1, 2, 3, 5]), set([2, 5])] L1, suppData0 = apriori.scanD(D, Cl, 0.5) #retList, supportData print("retList-L1") print(L1) print("supportData-suppData0") print(suppData0) # apriori.aprioriGen() L2, suppData0 = apriori.apriori(datSet) print("L2") print(L2) print("suppData0") print(suppData0) # rules = apriori.generateRules(L2, suppData0, minConf=0.6) print("rules") print(rules) # actionIdList, billTitleList=recentAprioriTest.getActionIds() # print("actionIdList") # print(actionIdList) # print("billTitleList") # print(billTitleList) # # transDict, itemMeaning=recentAprioriTest.getTransList(actionIdList[:2],billTitleList[:2]) # print("transDict") # print(transDict) # print("itemMeaning") # print(itemMeaning)
# the current data-set isn't in transactional format. To convert it into a transactional data-set, we use the following snippet of code: basket_str="" for rowNum, row in accident_data.iterrows(): #Break lines if (rowNum != 0): basket_str = basket_str + "\n" #Add the rowid as the first column basket_str = basket_str + str(rowNum) #Add columns for colName, col in row.iteritems(): if ( colName != 'Accident_Index'): basket_str = basket_str + "," + colName + "=" + str(col) #print basket_str basket_file=open("accidents_basket.csv","w") basket_file.write(basket_str) basket_file.close() import csv with open("accidents_basket.csv","rb") as f: reader=csv.reader(f) my_list=list(reader) #my_list L,supportData=apriori.apriori(my_list,0.6) f_rules= apriori.generateRules(L,supportData,0.6) for row in f_rules: print list(row[0]), " => ", list(row[1]), row[2]
# XSS-log tokenizer + apriori mining (Python 2).  NOTE(review): this chunk
# fuses two files — the second begins at the coding declaration below; also
# `f` and `myDat` are opened/initialized before this chunk starts.
for line in f:
    #/discuz?q1=0&q3=0&q2=0%3Ciframe%20src=http://xxooxxoo.js%3E
    index = line.find("?")
    if index > 0:
        # Keep only the query string after '?'.
        line = line[index + 1:len(line)]
        #print line
        # Tokenize on URL metacharacters and their %-encodings.
        tokens = re.split(
            '\=|&|\?|\%3e|\%3c|\%3E|\%3C|\%20|\%22|<|>|\\n|\(|\)|\'|\"|;|:|,|\%28|\%29',
            line)
        #print "token:"
        #print tokens
        myDat.append(tokens)
f.close()
# Frequent token sets at 15% support, rules at 60% confidence.
L, suppData = apriori(myDat, 0.15)
rules = generateRules(L, suppData, minConf=0.6)
#print 'rules:\n', rules
# -*- coding:utf-8 -*-
import sys
import urllib
import urlparse
import re
from hmmlearn import hmm
import numpy as np
from sklearn.externals import joblib
import HTMLParser
import nltk
# Minimum length of a parameter value worth processing.
MIN_LEN = 6
# NOTE(review): chunk starts mid-script — arr and arrAll are built earlier.
# Python 2 syntax throughout.
arrAll.append(arr4.strip().split(','))
arr5 = arr[5]
if arr5:
    arrAll.append(arr5.strip().split(','))
dataSet = arrAll
print 'dataSet: ', dataSet
# Apriori: frequent itemsets and their supports at 20% minimum support.
# (The labels below still say 0.7 from an earlier experiment.)
L1, supportData1 = apriori.apriori(dataSet, minSupport=0.2)
print 'L(0.7): ', L1
print 'supportData(0.7): ', supportData1
# Generate association rules at 80% confidence.
dic = dict()
rules = apriori.generateRules(L1, supportData1, minConf=0.8)
print 'rules: ', rules[:10]
print type(rules)
#for i in range(0,len(rules)):
#    if rules[i][0]==frozenset(['58691ed3d87f49b489feb40de28a92f9']):
#        print map(str,rules[i][1])
#        dic[map(str,rules[i][0])].append(map(str,rules[i][1]))
#        print dic
# NOTE(review): the triple-quoted block below is unterminated in this chunk.
"""
for line in sys.stdin:
    arr =line.strip().split()
    userID = arr[0]
    downloadRes =arr[1]
    if downloadRes:
        arrAll.append(downloadRes.strip().split(','))
#!usr/bin/env python3
# -*- coding:utf-8 -*-
"""
#@author:Benny.Chen
#@file: main.py
#@time: 2020/6/10 11:06
#@email:[email protected]
"""
from pprint import pprint

import apriori

if __name__ == '__main__':
    # Mine frequent itemsets from the bundled sample data at 50% minimum
    # support, then pretty-print the association rules derived from them
    # at the module's default confidence threshold.
    transactions = apriori.loadDataSet()
    frequent, support_map = apriori.apriori(transactions, minSupport=0.5)
    derived_rules = apriori.generateRules(frequent, support_map)
    pprint(derived_rules)
# coding:utf-8 import apriori import time import numpy as np # 读取训练集 with open("./data/agaricus_train.csv", "rb") as f: dataSet = [line[:-1].split(',') for line in f.readlines()] # L中的每一个元素都至少在25%的样本中出现过 L, suppData = apriori.apriori(dataSet, 0.25) # 阈值越小,越慢 # 生成规则,每个规则的置信度至少是0.6 bigRuleList = apriori.generateRules(L, suppData, 0.6) # P→H,根据P集合的大小排序 bigRuleList = sorted(bigRuleList, key=lambda x:len(x[0]), reverse=True) # 读取测试集 with open("./data/agaricus_test.csv", "rb") as f: dataSet = [line[:-1].split(',') for line in f.readlines()] labels = np.array([int(x[0]) for x in dataSet]) scores = [] for line in dataSet: tmp = [] for item in bigRuleList: if item[0].issubset(set(line)): if "1" in item[1]: tmp.append(float(item[2])) # 因为是预测“为1的概率”,所以要用1减
# 在循环中出现的前置路径,均为频繁项,因为在构建条件树的过程中已经过滤了频繁项 frequentItems.append(prefixPath) # 获取条件基 conditionPatterns = findPrefixPath(basePattern, myTableHeader[basePattern][1]) # 根据条件基,得到条件树 conditionTree, headerTable = createTree(conditionPatterns, minSupport) if conditionTree != None: print('condition: ', prefixPath) conditionTree.display(1) mineTree(conditionTree, headerTable, minSupport, prefixPath, frequentItems) return frequentItems # simpleDat = loadSimpDat() # initSet = createInitSet(simpleDat) # myFPTree, myHeaderTab = createTree(initSet, 3) # frequentItems = [] # mineTree(myFPTree, myHeaderTab, 3, set([]), frequentItems) parsedData = [line.split() for line in open('./machinelearninginaction/Ch12/kosarak.dat').readlines()] initSet = createInitSet(parsedData) timeStart = time.time() myFPTree, myHeaderTab = createTree(initSet, 100000) frequentItems = [] mineTree(myFPTree, myHeaderTab, 100000, set([]), frequentItems) print(frequentItems) print('timeEnd:', time.time() - timeStart) timeStart = time.time() L, supportData = apriori.apriori(parsedData, 0.1) apriori.generateRules(L, supportData, 0.7) print('timeEnd:', time.time() - timeStart)
# Extract transactions and meanings transactions = map_transactions(RAW_DATA[0]) meaning = map_meaning(RAW_MEANING[0]) for threshold in np.arange(0.5, 0.25, -0.05): itemsets, support = apriori.apriori(transactions.values(), minSupport=threshold) print "THRESHOLD: ", threshold print len(itemsets), "itemsets of length:" print [len(i) for i in itemsets] print "\n" itemset, support = apriori.apriori(transactions.values(), minSupport=0.3) for threshold in np.arange(0.7, 0.99, 0.05): print "THRESHOLD: ", threshold rules = apriori.generateRules(itemset, support, minConf=threshold) print "\n" def get_meaning (rule, meaning): condition, result = [], [] for c in rule[0]: condition.append(meaning[c]) for r in rule[1]: result.append(meaning[r]) print "IF:", " AND ".join(condition) print "THEN:", " AND ".join(result) print "CONFIDENCE: ", rule[2], "\n\n" for i in range(6): get_meaning(random.choice(rules), meaning)
D = map(set, dataset) #print 'D', D L1, support_data = apriori.scanD(D, C1, support) #print 'L1', L1 #print 'support_data', support_data print 'support_data' for k, v in support_data.iteritems(): print k, v k_length = 2 transactions = apriori.aprioriGen(L1, k_length) #print 'transactions', transactions #print '\n*** *** ***' L, support_data = apriori.apriori(dataset, support) #print 'L', L #print 'support_data', support_data rules = apriori.generateRules(L, support_data, min_confidence=0.7) #print 'rules', rules ruleDict = apriori.generateRuleDict(rules) print 'ruleDict' for k, v in ruleDict.iteritems(): print '\t', k, "".join( ' ' for i in range(30 - len(''.join(item for item in list(k))) - len(k) * 4)), v print '*** *** ***' #print 'keys', ruleDict.keys() #print '*** *** ***'
# Plot rule counts as a function of minimum support (fig 1) and minimum
# confidence (fig 2).  NOTE(review): chunk is truncated after the last line.
import data
import apriori
import matplotlib.pyplot as plt
import numpy

dataSet = data.loadDataSet()
print ('size of dataSet: ', len(dataSet))

# 1) vary support at a fixed 0.1 confidence
plt.figure()
minSupports = [0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4]
cntOfRules = []
for minSupport in minSupports:
    frequentItemSet, supportData = apriori.apriori(dataSet, minSupport)
    rules = apriori.generateRules(frequentItemSet, supportData, 0.1 )
    cntOfRules.append(len(rules))
x = numpy.array(minSupports)
y = numpy.array(cntOfRules)
plt.plot(x, y)
plt.xlabel('minimal support with 0.3 confidence')
plt.ylabel('cntOfRules')

# 2) vary confidence at a fixed 0.1 support (itemsets mined once, reused)
plt.figure()
minConfidences = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
cntOfRules = []
frequentItemSet, supportData = apriori.apriori(dataSet, 0.1)
for minConfidence in minConfidences:
    rules = apriori.generateRules(frequentItemSet, supportData, minConfidence)
    cntOfRules.append(len(rules))
x = numpy.array(minConfidences)
# NOTE: the bundled apriori module targets Python 2.7 only.
from apriori import apriori
from apriori import generateRules
import re

if __name__ == "__main__":
    myDat = []
    # Tokenize each logged request on URL metacharacters (and their
    # %-encodings) so that every line becomes one "transaction" of tokens.
    with open("./data/ibook/data/xss-2000.txt") as f:
        for line in f:
            index = line.find("?")
            tokens = re.split(
                '\=|&|\?|\%3e|\%3c|\%3E|\%3C|\%20|\%22|<|>|\\n|\(|\)|\'|\"|;|:|,|\%28|\%29',
                line)
            myDat.append(tokens)
    # Frequent token sets at 15% support.
    L, suppData = apriori(myDat, 0.15)
    # BUG FIX: the keyword is minConf (as used by every other caller of
    # generateRules in this code base); `miniConf` raised
    # TypeError: unexpected keyword argument.
    rules = generateRules(L, suppData, minConf=0.5)
    print(rules)
# Interactive transcript exercising apriori step by step.  Bare expressions
# like `D` and `L1` echo values in a REPL; `reload` is the Python 2 builtin.
D
L1, supportData0 = apriori.scanD(
    D, C1, 0.5)  # per-candidate support at the 0.5 threshold; keep qualifying sets in L1
L1
supportData0

# Frequent itemsets from the support threshold.
reload(apriori)
L, supportData = apriori.apriori(dataSet)
L  # all frequent itemsets with support above 0.5
L[0]  # 1-item sets
L[1]  # 2-item sets
L[2]  # 3-item sets
L[3]
apriori.aprioriGen(L[0], 2)  # candidate 'L[1]' before the support filter
L, supportData = apriori.apriori(dataSet, minSupport=0.7)  # higher support => fewer results

# Association rules from the confidence threshold.
reload(apriori)
L, supportData = apriori.apriori(dataSet, minSupport=0.5)
rules = apriori.generateRules(L, supportData, minConf=0.7)  # rules at 0.7 confidence
rules = apriori.generateRules(L, supportData, minConf=0.5)  # rules at 0.5 confidence

# Try it on the poisonous-mushroom dataset (first column is the toxicity label).
mushDataSet = [line.split() for line in open('mushroom.dat').readlines()]
L, supportData = apriori.apriori(mushDataSet, minSupport=0.3)
for item in L[3]:
    if item.intersection('2'):  # frequent itemsets containing the "poisonous" token 2
        print(item)
##replace the processed data into the raw data def mumerizeData(data): data_birth_year = dataDiscretizeBirthyear(data) data_weight = dataDiscretizeWeight(data) data_height = dataDiscretizeHeight(data) data.drop("birth_year",inplace = True ,axis = 1) data.drop("weight",inplace = True ,axis = 1) data.drop("height",inplace = True ,axis = 1) data = pd.concat([data,data_birth_year,data_weight,data_height],axis=1) return data if __name__ == '__main__': myDat = mumerizeData(data) #%% print myDat['weight'] print myDat['birth_year'] print myDat['height'] #%% #频繁项集与支持度 sets,sp = apriori.apriori(myDat.values,4,0.6) rules = apriori.generateRules(sets,sp,0.8) print ("层级:频繁项集:支持度") for Lk in sets: for freq_set in Lk: print(str(len(list(Lk)[0])),':',freq_set,' : ',sp[freq_set]) print("强关联规则:置信度") for item in rules: print (item[0], "-->>", item[1], " : ", item[2]) #%%
def test():
    """Smoke-test loadDataSet -> apriori -> generateRules (Python 2 prints)."""
    dataSet = apriori.loadDataSet()
    print "DataSet:", dataSet
    L,suppData = apriori.apriori(dataSet)  # default minimum support
    rules = apriori.generateRules(L, suppData, minConf=0.5)
    print rules
# Apriori demo (Python 2 — map returns a list here, so D is reusable).
import apriori as ap

dataSet = ap.loadDataSet()
#print dataSet
C1 = ap.createC1(dataSet)  # size-1 candidate itemsets
#print C1
D = map(set, dataSet)  # transactions as sets
#print D
L1, suppData0 = ap.scanD(D, C1, 0.5)  # frequent 1-itemsets at 50% support
#print suppData0
L, S = ap.apriori(D, 0.5)  # full mining on the set-transactions
#print L
print L
List = ap.generateRules(L, S, minConf=0.4)
print List
def test1():
    """Run the full apriori pipeline on the sample data as a smoke test.

    Builds the size-1 candidates, mines frequent itemsets at 50% support,
    then derives rules at 50% confidence; results are discarded.
    """
    transactions = apriori.loadDataSet()
    candidates = apriori.createC1(transactions)
    frequent, support_map = apriori.apriori(transactions, minSupport=0.5)
    # print(frequent)
    derived_rules = apriori.generateRules(frequent, support_map, minConf=0.5)
# Build transactions from a 0/1 matrix: each row becomes the list of column
# indices that are 1.  Python 2 — integer division is relied on below.
data = []
for i in range(len(values)):
    temp = []
    for j in range(len(values[0])):
        if values[i][j] == 1:
            temp.append(j)
    data.append(temp)

# Per-column frequency (out of 10000 rows), sorted ascending; the minimum
# support is taken at the 20th-percentile column frequency.
counts = []
for index in columns:
    line = df[index]
    count = 0
    for i in range(len(line)):
        if line[i] == 1:
            count += 1
    counts.append((float)(count) / 10000)
counts.sort()
minSupport = counts[len(counts) * 1 / 5]  # integer index only under Python 2 division

#use apriori
L, supportData = ap.apriori(data, minSupport)
rules = ap.generateRules(L, supportData, minConf=0.4)

#use fpGrowth — it wants an absolute count threshold, hence *10000
minSup = minSupport * 10000
simpDat = data
initSet = fp.createInitSet(simpDat)
myFPtree, myHeaderTab = fp.createTree(initSet, minSup)
myFreqList = []
fp.mineTree(myFPtree, myHeaderTab, minSup, set([]), myFreqList)
print myFreqList
# Map each tour's 0/1 membership column to NaN / the tour name, then mine
# association rules over the per-row "baskets" of tour names.
# NOTE(review): et_tours, tour_names, createC1, scanD, apriori, generateRules,
# alpha, beta, max_other_items and item_of_interest come from earlier in the file.
df = pd.DataFrame(et_tours, columns=tour_names)
#print(df.shape)
#print(df.head)
for index in range(len(tour_names)):
    # 1 -> tour name, 0 -> NaN (dropped later by dropna()).
    tour_to_name = {0: np.nan, 1: tour_names[index]}
    df[tour_names[index]] = df[tour_names[index]].map(tour_to_name)
print(df.head)

C1 = createC1(df)  #new stuff
D = map(set, df)  #new stuff
L1, support_data = scanD(D, C1, 0.0000005)  #new stuff

# One basket per row: the names of the tours that row participates in.
my_data = list()
for index in range(df.shape[0]):
    # BUG FIX: DataFrame.ix was deprecated and removed from pandas; use
    # iloc for positional row access (index comes from range(df.shape[0])).
    basket = list(df.iloc[index].dropna())
    my_data.append(basket)

L, suppData = apriori(my_data)
print('Identified rules with support = ', alpha, 'and confidence= ', beta)
rules = generateRules(L, suppData, minConf=beta)

# Report frequent itemsets containing the item of interest, grouped by the
# number of *other* items in the set.
n_other_items = 1
while n_other_items <= max_other_items:
    print('\nRules with ', n_other_items, 'other item(s)')
    for item in L[n_other_items]:
        if item.intersection(item_of_interest):
            print(item)
    n_other_items = n_other_items + 1
#apriori原理:可以减少可能感兴趣的项集。apriori原理是说,如果某个项集是频繁的,那么它的所有子集也是频繁的。 #反过来说,如果一个项集是非频繁集,那么它的所有超集也是非频繁集。 #问题1:为什么关联规则中,如果项集中有三个元素,为什么只计算1个 -> 2个 #而不计算2个 --->1个??????????????? import apriori from votesmart import votesmart dataSet = apriori.loadDataSet() #C1 = apriori.createC1(dataSet) #print ("C1 is %s" % C1) #D = map(set,dataSet) #print ( "%r" % D) #L1,suppData0 = apriori.scanD(list(D), list(C1), 0.5) #print (L1) #print (suppData0) L,suppData = apriori.apriori(dataSet, 0.5) print ("L is" , L) print ("suppData is" , suppData) #L is [[frozenset({1}), frozenset({3}), frozenset({2}), frozenset({5})], [frozenset({3, 5}), frozenset({1, 3}), frozenset({2, 5}), frozenset({2, 3})], [frozenset({2, 3, 5})], []] #suppData is {frozenset({5}): 0.75, frozenset({3}): 0.75, frozenset({2, 3, 5}): 0.5, frozenset({3, 5}): 0.5, frozenset({2, 3}): 0.5, frozenset({2, 5}): 0.75, frozenset({1}): 0.5, frozenset({1, 3}): 0.5, frozenset({2}): 0.75} #关联规则挖掘 rules = apriori.generateRules(L, suppData, 0.7) print ("rules is " ,rules)
#关联规则对其中的3/4的记录都是适用的。 #apriori原理:可以减少可能感兴趣的项集。apriori原理是说,如果某个项集是频繁的,那么它的所有子集也是频繁的。 #反过来说,如果一个项集是非频繁集,那么它的所有超集也是非频繁集。 #问题1:为什么关联规则中,如果项集中有三个元素,为什么只计算1个 -> 2个 #而不计算2个 --->1个??????????????? import apriori from votesmart import votesmart dataSet = apriori.loadDataSet() #C1 = apriori.createC1(dataSet) #print ("C1 is %s" % C1) #D = map(set,dataSet) #print ( "%r" % D) #L1,suppData0 = apriori.scanD(list(D), list(C1), 0.5) #print (L1) #print (suppData0) L, suppData = apriori.apriori(dataSet, 0.5) print("L is", L) print("suppData is", suppData) #L is [[frozenset({1}), frozenset({3}), frozenset({2}), frozenset({5})], [frozenset({3, 5}), frozenset({1, 3}), frozenset({2, 5}), frozenset({2, 3})], [frozenset({2, 3, 5})], []] #suppData is {frozenset({5}): 0.75, frozenset({3}): 0.75, frozenset({2, 3, 5}): 0.5, frozenset({3, 5}): 0.5, frozenset({2, 3}): 0.5, frozenset({2, 5}): 0.75, frozenset({1}): 0.5, frozenset({1, 3}): 0.5, frozenset({2}): 0.75} #关联规则挖掘 rules = apriori.generateRules(L, suppData, 0.7) print("rules is ", rules)
# Build transactions from a 0/1 matrix: each row becomes the list of column
# indices that are 1.  Python 2 — integer division is relied on below.
data = []
for i in range(len(values)):
    temp = []
    for j in range(len(values[0])):
        if values[i][j] == 1:
            temp.append(j)
    data.append(temp)

# Per-column frequency (out of 10000 rows), sorted ascending; the minimum
# support is taken at the 20th-percentile column frequency.
counts = []
for index in columns:
    line = df[index]
    count = 0
    for i in range(len(line)):
        if line[i]==1:
            count += 1
    counts.append((float)(count)/10000)
counts.sort()
minSupport = counts[len(counts)*1/5]  # integer index only under Python 2 division

#use apriori
L,supportData = ap.apriori(data,minSupport)
rules = ap.generateRules(L,supportData,minConf=0.4)

#use fpGrowth — it wants an absolute count threshold, hence *10000
minSup = minSupport*10000
simpDat = data
initSet = fp.createInitSet(simpDat)
myFPtree, myHeaderTab = fp.createTree(initSet, minSup)
myFreqList = []
fp.mineTree(myFPtree, myHeaderTab, minSup, set([]), myFreqList)
print myFreqList
# Convert the accident data into transactional "basket" format (one
# "rowid,name=value,..." line per record), write it out, then mine rules
# from the basket file.  Python 2 syntax throughout.
basket_str = ""
for rowNum, row in accident_data.iterrows():
    #Break lines
    if (rowNum != 0):
        basket_str = basket_str + "\n"
    #Add the rowid as the first column
    basket_str = basket_str + str(rowNum)
    #Add columns as name=value tokens, skipping the index column
    for colName, col in row.iteritems():
        if (colName != 'Accident_Index'):
            basket_str = basket_str + "," + colName + "=" + str(col)
print basket_str

basket_file = open("accident_basket.csv", "w")
basket_file.write(basket_str)
basket_file.close()
"""
Read the basket file now and compute rules
"""
import csv
with open('accident_basket.csv', 'rb') as f:
    reader = csv.reader(f)
    your_list = list(reader)

# Frequent itemsets and rules, both at the 0.6 threshold.
L, supportData = apriori.apriori(your_list, 0.6)
brl = apriori.generateRules(L, supportData, 0.6)
for row in brl:
    print list(row[0]), " => ", list(row[1]), row[2]
# Mine rules on the sample data at a low support (0.1) but require perfect
# confidence (minConf=1.0).  Python 2 print statement.
import apriori

dataSet = apriori.loadDataSet()
L, supportData = apriori.apriori(dataSet, minSupport=0.1)
print "[result]-----------------------------------------"
rules = apriori.generateRules(L, supportData, minConf=1.0)
import apriori

# Mine the UCI mushroom dataset: one transaction per line, space-separated
# feature tokens; the token '2' marks "poisonous".
# FIX: open the data file with a context manager so the handle is closed —
# the original `open('mushroom.dat').readlines()` leaked the file object.
with open('mushroom.dat') as datFile:
    mushDatSet = [line.split() for line in datFile]

# Frequent itemsets at 40% support, rules at 50% confidence.
L, supportData = apriori.apriori(mushDatSet, 0.4)
rule = apriori.generateRules(L, supportData, minConf=0.5)

# Print every frequent 4-itemset that contains the "poisonous" token.
for item in L[3]:
    if item.intersection('2'):
        print(item)