def test_generate_rules():
    data_set = Apriori.load_data_set()
    l, support_data = Apriori.apriori(data_set, 0.5)
    print '=' * 100
    print l
    rules = Apriori.generate_rules(l, support_data, 0.5)
    print rules
def test_create():
    data_set = Apriori.load_data_set()
    c1 = Apriori.create_c1(data_set)
    all_set = map(set, data_set)
    list_set, support_data = Apriori.scan_d(all_set, c1, 0.5)
    set2 = Apriori.generate_ck(list_set, 2)
    print set2
def pcy(baskets: list, minSupport: float, minConfidence: float, maxK: int,
        bucketSize: int, hashFunc1, hashFunc2) -> (list, int, defaultdict):
    C1 = ap.genC1(baskets)
    L1, sup1 = ap.genFreqSet(baskets, C1, minSupport)
    # Apply the PCY optimization (two independent hash passes) when
    # generating the 2-item candidate sets.
    bitmap1 = hashPairs(baskets, L1, bucketSize, minSupport, hashFunc1)
    Ck1 = genCkByBitMap(L1, bitmap1, bucketSize, 2, hashFunc1)
    bitmap2 = hashPairs(baskets, L1, bucketSize, minSupport, hashFunc2)
    Ck2 = genCkByBitMap(L1, bitmap2, bucketSize, 2, hashFunc2)
    Ck = Ck1 & Ck2
    L = [set(), L1]
    sup = [defaultdict(float), sup1]
    k = 2
    while True:
        Lk, supk = ap.genFreqSet(baskets, Ck, minSupport)
        L.append(Lk)
        sup.append(supk)
        if k == maxK:
            break
        k += 1
        Ck = ap.genCk(L[k - 1], k)
    return sup, bitmap1, ap.genRules(L, sup, minConfidence)
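# The helpers hashPairs and genCkByBitMap are referenced above but not shown
# in this snippet. Below is a minimal sketch of what they might look like;
# the names, signatures, and bitmap layout are assumptions for illustration,
# not the project's definitive implementation.
from itertools import combinations

def hashPairs(baskets, L1, bucketSize, minSupport, hashFunc):
    # Count every pair of frequent singletons per basket into hash buckets,
    # then keep a bitmap marking buckets whose count reaches the support
    # threshold (support fraction times the number of baskets).
    counts = [0] * bucketSize
    freq_singles = {item for s in L1 for item in s}
    for basket in baskets:
        items = [i for i in basket if i in freq_singles]
        for pair in combinations(items, 2):
            counts[hashFunc(set(pair)) % bucketSize] += 1
    threshold = minSupport * len(baskets)
    return [c >= threshold for c in counts]

def genCkByBitMap(L1, bitmap, bucketSize, k, hashFunc):
    # Handles the k == 2 case used above: candidate pairs are built only
    # from frequent singletons whose pair hashes to a frequent bucket
    # (the PCY pruning step).
    singles = sorted({item for s in L1 for item in s})
    return {frozenset(pair) for pair in combinations(singles, 2)
            if bitmap[hashFunc(set(pair)) % bucketSize]}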
def apriori_classID(lastestTime, appointMap, minSupport=0.1, minConf=0.6):
    dataSet = ReadData.get_all_recent_class(lastestTime, appointMap)
    L, suppData = Apriori.apriori(dataSet, minSupport)
    rules = Apriori.generateRules(L, suppData, minConf)
    return rules
def main():
    # Connect to database
    ppsd_data = connect_db('train')
    fuzzified_data = Fuzzification.fuzzify(ppsd_data)
    # print(fuzzified_data)

    # Insert fuzzified data
    insert_db(fuzzified_data, ppsd_data)

    # Apriori algorithm
    fuzzy_csv = pd.read_csv('fuzzified.csv')

    # FP-Growth
    start_time = time.time()
    rules, confi = FPGrowth.mine('fuzzified.csv')
    print("FP: --- %s seconds ---" % (time.time() - start_time))
    insert_fprules(rules, confi)
    # insert_fprulesCSV(rules, confi)

    start_time = time.time()
    ant, con, conf, lift = Apriori.mine('fuzzified.csv')
    print("Apriori: --- %s seconds ---" % (time.time() - start_time))
    insert_arules(ant, con, conf, lift)
def aiSub(ingredients, unwantedIng):
    # First get the frequent itemsets mined earlier.
    L = ap.readFromFile()
    if unwantedIng not in ingredients:
        return "error"
    ingredients.remove(unwantedIng)
    wantedIngredients = set(ingredients)
    a = 0
    bestFit = None
    for thing in L:
        if thing <= wantedIngredients:
            continue
        counter = 0
        for ing in thing:
            if ing in wantedIngredients:
                counter += 1
        if counter > a and 'butter' not in thing and 'bread' not in thing:
            a = counter
            bestFit = thing
        elif counter == a:
            pass  # TODO: do something clever here (tie-breaking)
    return bestFit - set(ingredients)
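# A hypothetical call (the ingredient names below are made up for
# illustration): aiSub drops the unwanted ingredient, finds the frequent
# itemset that best overlaps the remaining ones, and returns its extra
# items as substitution suggestions. Note that if no itemset qualifies,
# bestFit stays None and the final subtraction raises a TypeError.
suggested = aiSub(['ham', 'cheese', 'mustard'], 'mustard')
print(suggested)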
def main():
    print 'begin:'
    t = time.time()
    print 'fetching data:'
    data = getdata()
    # printdata(data)
    print 'data fetched'
    print 'total shops:', len(data)
    aprioriResult = {}
    for shop in data:
        print 'shop:', shop, ' customers:', len(data[shop])
        assortment = []
        for customer in data[shop]:
            assortment.append(list(data[shop][customer]))
        # print 'get assortment done', assortment
        adata = Apriori.apriori(assortment, 0.5)
        # if adata:
        #     print adata
        aprioriResult[shop] = adata
    print 'saving apriori results to the database'
    saveaprioriresult(aprioriResult)
    print str(time.time() - t)
    print 'done'
def arpriori_trade(full_table_name):
    query = \
        """SELECT T.* FROM(
        SELECT shop_id, GROUP_CONCAT(DISTINCT(item_id)) AS list
        FROM topdata.top_item_trade_his_20121219_1
        GROUP BY shop_id, nick)T
        WHERE LOCATE(',', T.list)<>0 """
    db_conn = ibbdlib.get_db_conn(**db_server)
    metadata = dict()
    try:
        for row in db_conn.iter(query):  # was iter(cmd_str); cmd_str is undefined
            if not metadata.get(row.shop_id):
                metadata[row.shop_id] = [row.list.split(',')]
            else:
                metadata[row.shop_id].append(row.list.split(','))
        for (shop_id, t) in metadata.items():
            print shop_id
            t = Apriori.apriori(t, 0.1)
            print json.dumps(t, indent=4)
            query = 'INSERT IGNORE INTO ibbd2.ststc_shop_apriori values(%%s, CURDATE(), %s, NOW())' % ','.join(['%s'] * 4)
            db_conn.executemany(query, [[shop_id, pair.split(',')[0], pair.split(',')[1], data['sup'], data['num']]
                                        for (pair, data) in t.items() if len(pair.split(',')) == 2])
    except Exception as e:
        print e
def arpriori_trade():
    while 1:
        full_table_name = tasks_queue.get()
        log.info(full_table_name)
        query = \
            """SELECT T.* FROM(
            SELECT shop_id, GROUP_CONCAT(DISTINCT(item_id)) AS list
            FROM %s
            GROUP BY shop_id, nick)T
            WHERE LOCATE(',', T.list)<>0""" \
            % full_table_name
        db_conn = ibbdlib.get_db_conn(**db_server)
        metadata = dict()
        try:
            log.info('%s query data...', full_table_name)
            for row in db_conn.iter(query):
                if not metadata.get(row.shop_id):
                    metadata[row.shop_id] = [row.list.split(',')]
                else:
                    metadata[row.shop_id].append(row.list.split(','))
            log.info('%s run apriori...', full_table_name)
            for (shop_id, t) in metadata.items():
                t = Apriori.apriori(t, 0.1)
                query = 'INSERT IGNORE INTO ibbd2.ststc_shop_apriori values(%%s, CURDATE(), %s, NOW())' % ','.join(['%s'] * 4)
                db_conn.executemany(query, [[shop_id, pair.split(',')[0], pair.split(',')[1], data['sup'], data['num']]
                                            for (pair, data) in t.items() if len(pair.split(',')) == 2])
            log.info('%s finished', full_table_name)
        except:
            log.error(traceback.format_exc())
        finally:
            db_conn.close()
            tasks_queue.task_done()
def getSupport(minsupport, processbiclusters, winedata):
    print('...', end='')
    tidata = processbiclusters
    apriori = Apriori.Apriori()
    itemlist = apriori.getSupportOnlyAssociationRules(minsupport, tidata)
    # formattedlist = apriori.getDataFormattedItemList(winedata, itemlist)
    # Known issue: the formatted list is broken because winedata does not
    # use the same format as the test data, so it is left empty for now.
    formattedlist = []
    return apriori, itemlist, formattedlist
def apriori(train, test, return_pred, num_cluster):
    train_orders_i = train.set_index('order_id')['product_id'].rename('item_id')
    test_orders_i = test.set_index('order_id')['product_id'].rename('item_id')
    #item_name = train['product_id', 'product_name', 'aisle_id', 'department_id'].rename(columns={'product_id': 'item_id', 'product_name': 'item_name'})
    rules_i = Apriori.association_rules(train_orders_i, 0.01)
    #rules_final_i = Apriori.merge_item_name(rules_i, item_name).sort_values('lift', ascending=False)
    #display(rules_final_i)

    # Train set pairs
    train_pairs_gen_i = Apriori.get_item_pairs(train_orders_i)
    train_pairs_i = Apriori.freq(train_pairs_gen_i).to_frame("freqAB")
    train_pairs_i = train_pairs_i.reset_index().rename(columns={'level_0': 'item_A', 'level_1': 'item_B'})
    train_pairs_i['pair'] = train_pairs_i.item_A.astype(str).str.cat(train_pairs_i.item_B.astype(str), sep='-')

    # Test set pairs
    test_pairs_gen_i = Apriori.get_item_pairs(test_orders_i)
    test_pairs_i = Apriori.freq(test_pairs_gen_i).to_frame("freqAB")
    test_pairs_i = test_pairs_i.reset_index().rename(columns={'level_0': 'item_A', 'level_1': 'item_B'})
    test_pairs_i['pair'] = test_pairs_i.item_A.astype(str).str.cat(test_pairs_i.item_B.astype(str), sep='-')

    # Rules set pairs
    rules_i['pair'] = rules_i.item_A.astype(str).str.cat(rules_i.item_B.astype(str), sep='-')
    test_pair_set_i = set(np.unique(test_pairs_i.pair))
    train_pair_set_i = set(np.unique(train_pairs_i.pair))
    rules_pair_set_i = set(np.unique(rules_i.pair))

    # TP: pairs predicted by apriori that also occur in the test set
    tp = len(test_pair_set_i & rules_pair_set_i)
    # TN: pairs in the train set that do not occur in the test set
    tn = len(train_pair_set_i - test_pair_set_i)
    # FN: pairs in the test set that apriori did not predict
    fn = len(test_pair_set_i - rules_pair_set_i)
    # FP: pairs predicted by apriori that do not occur in the test set
    fp = len(rules_pair_set_i - test_pair_set_i)

    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f1 = 2 * (recall * precision) / (recall + precision)
    print('APRIORI')
    return recall, precision, f1
def aprioriWorker(data_fname, sup, out_fname, conn):
    try:
        conn.send(Apriori.Apriori_main(data_fname, sup, out_fname)[1])
        conn.close()
        return
    except MemoryError:
        conn.send(-1)
        conn.close()
        return
    except Exception as e:
        conn.send((-1, e.message,))
        conn.close()
        return
def aprioriWorker(data_fname, sup, conn, flag=True):
    try:
        if flag:
            Apriori.Apriori_main(data_fname, sup)
            conn.send(0)
            conn.close()
            return
        else:
            conn.send(Apriori.Apriori_main(data_fname, sup)[1])
            conn.close()
            return
    except MemoryError:
        conn.send(-1)
        conn.close()
        return
    except Exception as e:
        conn.send((-1, e.message,))
        conn.close()
        return
def main():
    # Data preprocessing
    filename = "stars_data.csv"
    data = a.read_data(filename)
    data.pop(0)
    random.shuffle(data)
    words = a.frequency_word(data)
    features = a.create_binary_feature(data, words, 6)
    words.append("isPositive")
    words.append("isNegative")
    minsupport = 0.03
    minconf = 0.25
    L, support_count = ampriori.frequentItemsetGeneration(features, words, minsupport)
    print len(L[0]) + len(L[1]) + len(L[2])
    rules, r = ampriori.ruleGeneration(L, support_count, minconf)
    print len(rules)
    # Keep the 30 rules with the highest confidence.
    rules = sorted(rules.items(), key=operator.itemgetter(1), reverse=True)
    rules = [rules[i] for i in range(30)]
    for index, rule in enumerate(rules):
        print rule
def metricWorker(fname, sanitized, sens, sup, conn):
    Apriori_results_init = readLargeData(fname)
    S = minSet(readSensitiveSet(sens))
    SS = supersets(S, Apriori_results_init.keys())
    r_fd = list(set(Apriori_results_init) - SS)
    Apriori_results = Apriori.Apriori_main(sanitized, sup)[0]
    side_effects = len(r_fd) - len(Apriori_results)
    if side_effects < 0:
        conn.send((side_effects, 0,))
        conn.close()
        return
    else:
##        a1 = 0.
##        a2 = 0.
##        for itemset in convert2frozen_m(apriori(r_fd, target='m', supp = float(0.0))):
##            a1 += 1.0
##        for itemset2 in convert2frozen_m(apriori(Apriori_results.keys(), target='m', supp = float(0.0))):
##            if itemset == itemset2:
##                a2 += 1.0
##
##        Bd_rate = abs(round(float((a1-a2)/a1),2))
        SumAll = 0
        AbsDif = 0.0
        for itemset in r_fd:
            SumAll += Apriori_results_init[itemset]
            if itemset in Apriori_results:
                AbsDif += float(abs(Apriori_results_init[itemset] - Apriori_results[itemset]))
            else:
                AbsDif += float(Apriori_results_init[itemset])
        if SumAll == 0:
            inls = round(float(AbsDif), 3)
        else:
            inls = round(float(AbsDif / SumAll), 3)
        conn.send((side_effects, inls,))
        conn.close()
        return
def retrievePatterns(ingredient):
    # For generating local patterns instead of global patterns:
    # db = connect(host="localhost", db="sandwiches", user="******", passwd="root")
    # cursor = db.cursor()
    # cursor1 = db.cursor()
    # cursor2 = db.cursor()
    # cursor.execute("""SELECT recipes.recipeId, recipeName
    #                   FROM ingredients, recipes, ingredientsForRecipe
    #                   WHERE ingredients.ingredientId = ingredientsForRecipe.ingredientId
    #                   and recipes.recipeId = ingredientsForRecipe.recipeId
    #                   and ingredientName = %s """, ingredient)
    # l = []
    # for row in cursor:
    #     cursor1.execute("""SELECT ingredientId FROM ingredientsForRecipe WHERE recipeId = %s""", row[0])
    #     temp = []
    #     for r in cursor1:
    #         # temp.append(int(r[0]))
    #         cursor2.execute("""SELECT ingredientName FROM ingredients WHERE ingredientId = %s""", r[0])
    #         t = cursor2.fetchone()
    #         temp.append(t)
    #     l.append(temp)
    # result, supp = ap.apriori(l, minsupport=0.02)
    return ap.readFromFile()
def rulesFromConseq(freqSet, H, supportData, brl, minConf=0.7):
    # Generate candidate rules: derive further association rules from an
    # initial frequent itemset.
    # freqSet: a frequent itemset, e.g. [2, 3, 5]
    # H: list of items that may appear on the right-hand side of a rule,
    #    e.g. [set([2]), set([3]), set([5])]
    # m: size of the itemsets in H, e.g. 1
    m = len(H[0])
    print('m:')
    print m
    print('freqSet:')
    print freqSet
    # Check whether freqSet is large enough to remove a subset of size m.
    if (len(freqSet) > (m + 1)):
        # Generate all (m+1)-combinations of H without duplicates,
        # e.g. [set([2, 3]), set([2, 5]), set([3, 5])]
        Hmp1 = Apriori.aprioriGen(H, m + 1)
        print('Hmp1:')
        print Hmp1
        # Keep only the combinations that can serve as a rule's right-hand
        # side, i.e. those meeting the minimum confidence requirement.
        Hmp1 = calcConf(freqSet, Hmp1, supportData, brl, minConf)
        # If more than one rule qualifies, recurse with Hmp1.
        if (len(Hmp1) > 1):
            rulesFromConseq(freqSet, Hmp1, supportData, brl, minConf)
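# calcConf is used above but not shown in this snippet. A minimal sketch of
# the usual companion function in this Machine-Learning-in-Action style of
# code follows; treat it as an assumption, not the author's exact version.
def calcConf(freqSet, H, supportData, brl, minConf=0.7):
    # For each candidate consequent: confidence(P -> conseq) =
    # support(freqSet) / support(freqSet - conseq); keep rules over minConf.
    prunedH = []
    for conseq in H:
        conf = supportData[freqSet] / supportData[freqSet - conseq]
        if conf >= minConf:
            print freqSet - conseq, '-->', conseq, 'conf:', conf
            brl.append((freqSet - conseq, conseq, conf))
            prunedH.append(conseq)
    return prunedH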
def metricWorker(fname, sanitized, sens, sup, conn):
    Apriori_results_init = readLargeData(fname)
    S = minSet(readSensitiveSet(sens))
    SS = supersets(S, Apriori_results_init.keys())
    r_fd = list(set(Apriori_results_init) - SS)
    Apriori_results = Apriori.Apriori_main(sanitized, sup)[0]
    side_effects = len(r_fd) - len(Apriori_results)
    if side_effects < 0:
        conn.send((side_effects, 0,))
        conn.close()
        return
    else:
        SumAll = 0
        AbsDif = 0.0
        for itemset in r_fd:
            SumAll += Apriori_results_init[itemset]
            if itemset in Apriori_results:
                AbsDif += float(abs(Apriori_results_init[itemset] - Apriori_results[itemset]))
            else:
                AbsDif += float(Apriori_results_init[itemset])
        if SumAll == 0:
            inls = round(float(AbsDif), 3)
        else:
            inls = round(float(AbsDif / SumAll), 3)
        conn.send((side_effects, inls,))
        conn.close()
        return
import Apriori

dataSet = Apriori.loadDataSet()
# C1 = Apriori.createC1(dataSet)
# D = map(set, dataSet)
# L1, suppData = Apriori.scanD(D, C1, 0.5)
L, suppData = Apriori.apriori(dataSet, minSupport=0.5)
rules = Apriori.generateRules(L, suppData, minConf=0.7)
'''
Apriori.mushTest("E:/TestDatas/MachineLearningInAction/Ch11/mushroom.dat")
'''
# buckets = LSH.MyLSHashing(signature_ls, bandNum, s, r)
# buckets = LSH.MylocalitySensitiveHashing(signature_ls, bandNum, s, r)
buckets = LSH.localitySensitiveHashing(signature_ls, bandNum, s, r)
I_set_list = LSH.buckets_sort(buckets)
for ii in I_set_list:
    print(ii)
buckets_dict = LSH.hash_buckets(I_set_list)
for i in buckets_dict:
    print(i, buckets_dict[i])

# Apriori method
dataset = Apriori.loadDataSet(buckets_dict)
C1 = Apriori.createC1(dataset)
print("C1:")
print(C1)
D = list(map(set, dataset))
print("D:")
print(D)
# L1, supportData0 = Apriori.scanD(D, C1, minSupport=0.5)
L, supportData = Apriori.apriori(dataset, minSupport=0.5)
rules = Apriori.generateRules(L, supportData)
print("*" * 100)
print(L)
print("*" * 100)
print(supportData)
print("*" * 100)
print(rules)
import Apriori

transSet = {}
transSet['T100'] = (1, 2, 5)
transSet['T200'] = (2, 4)
transSet['T300'] = (2, 3)
transSet['T400'] = (1, 2, 4)
transSet['T500'] = (1, 3)
transSet['T600'] = (2, 3)
transSet['T700'] = (1, 3)
transSet['T800'] = (1, 2, 3, 5)
transSet['T900'] = (1, 2, 3)
frequentSets = Apriori.apriori(transSet, 2)
rules = Apriori.generateAssociationRules(frequentSets, 0.4, transSet)
for rule in rules:
    prioriSet, inferredSet, confidence = rule
    print prioriSet, '=>', inferredSet, ' confidence=', confidence
import Apriori
import AuxiliaryFunctions

dataSet = AuxiliaryFunctions.loadDataSet()
print dataSet
C1 = AuxiliaryFunctions.createC1(dataSet)
print('C1:')
print C1
D = map(set, dataSet)
print('D:')
print D
L1, supportData1 = AuxiliaryFunctions.scanD(D, C1, 0.5)
print('L1:')
print L1
print('supportData1:')
print supportData1
L, supportData = Apriori.apriori(dataSet, 0.5)
print('Frequent itemsets:')
print('k=1:')
print L[0]
print('k=2:')
print L[1]
print('k=3:')
print L[2]
print('Support:')
print(supportData)
print('Candidate sets:')
print('C1:')
print C1
print('C2:')
print Apriori.aprioriGen(L[0], 2)
print('C3:')
                    default=False)
args = parser.parse_args()
print("Parameter values:")
print("Number of k-tuples: ", args.k_tuples)
print("Support: ", args.support)
print("Confidence threshold: ", args.confidence)
print("Plotting: ", args.plot)

k_tuples = args.k_tuples
support = args.support
confidence = args.confidence
filename = "../data/T10I4D100K.dat"

apriori = Apriori(filename, support)
apriori.load_dataset()
baskets = apriori.baskets
times = []
start = time.time()
print("\nStart Apriori Pipeline\n")
for i in range(k_tuples):
    if (i == 0):
        start_stage = time.time()
        apriori.generate_candidates_ck(baskets, None, 1)
        apriori.filter_candidates(apriori.candidates[i], support)
        end_stage = time.time()
# -*- coding: utf8 -*-
import pandas as pd
from apriori import *
import sys
import Apriori

reload(sys)
sys.setdefaultencoding('utf-8')

inputfile = 'G://PyCharm//data//menu_orders.xls'
outputfile = 'apriori_rules.csv'  # results file
data = pd.read_excel(inputfile)

print(u'\nConverting raw data to a 0-1 matrix...')
ct = lambda x: pd.Series(1, index=x[pd.notnull(x)])
b = map(ct, data.as_matrix())
data = pd.DataFrame(list(b)).fillna(0)
print(u'\nConversion done.')
del b

support = 0.2  # minimum support
confidence = 0.5  # minimum confidence
ms = '---'
Apriori.find_rule(data, support, confidence, ms).to_csv(outputfile)  # save results
support = 50
threshold = 70
createCombiner = (lambda line: [line])
mergeValue = (lambda exist, new: exist + [new])
mergeCombiner = (lambda exist1, exist2: exist1 + exist2)
userRDD = userRDD.filter(lambda line: line != header) \
    .map(lambda line: (line.split(',')[0], line.split(',')[1])) \
    .combineByKey(createCombiner, mergeValue, mergeCombiner) \
    .filter(lambda line: len(line[1]) >= threshold)
# userRDD.foreach(print)
businessRDD = userRDD.flatMap(lambda line: A.convert(line)) \
    .groupByKey() \
    .mapValues(set)
business = businessRDD.collect()
businessdict = {item[0]: item[1] for item in business}
numOfPar = userRDD.getNumPartitions()
candidates = userRDD.mapPartitions(lambda partition: A.Apriori(partition, support / numOfPar)) \
    .reduceByKey(lambda a, b: a | b) \
    .collect()
candidates = sorted([(key, sorted([list(sets) for sets in value])) for key, value in candidates])
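# A.convert is not defined in this snippet. Given how it is used above
# (flatMap over (user, [business, ...]) records, then groupByKey into
# business -> set of users), it presumably inverts each record into
# (business, user) pairs. A minimal sketch under that assumption:
def convert(line):
    user, businesses = line
    return [(business, user) for business in businesses]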
binary_os = operating_sys_cleaning_binary(cleaned_os)  # for Kmeans and KNN

# Make arrays of the selected features, one person per array, for Apriori.
apriori_features = []
for i in xrange(len(snow_letter)):
    temp = []
    temp.append(string_cleaned_prog_skills[i])
    temp.append(snow_letter[i])
    temp.append(cleaned_os[i])
    apriori_features.append(temp)

# Calling Apriori
prunedlen3 = Apriori.apriori(apriori_features)
print "*" * 45
print "Apriori"
print "*" * 45
print "Longest frequent pattern:", prunedlen3
allpossible = Apriori.possible_comb(prunedlen3)
permutated = Apriori.permutations(allpossible)
Rules = Apriori.confidence(permutated, apriori_features)
print "Rules from longest frequent pattern and their confidence:", Rules

# Make arrays of the selected features, one person per array, for Kmeans and KNN.
num_features = []
# coding:utf8
'''
Created on Feb 22, 2018
@author: XuXianda
'''
import CreateRules
import Apriori
import AuxiliaryFunctions

dataSet = AuxiliaryFunctions.loadDataSet()
L, supportData = Apriori.apriori(dataSet, minSupport=0.5)
print('L:')
print L
print('supportData:')
print supportData
rules = CreateRules.generateRules(L, supportData, minConf=0.5)
print('rule1:')
print rules
# rules2 = CreateRules.generateRules(L, supportData, minConf=0.5)
# print('rule2:')
# print rules2
import json, sys
import Parser
import Apriori

if __name__ == '__main__':
    if len(sys.argv) < 4:
        print "incorrect input format"
        print "Correct Format: <CSV FILE> <min_sup> <min_conf>"
    else:
        csvFile = sys.argv[1]
        min_sup = float(sys.argv[2])   # argv values are strings; thresholds must be numeric
        min_conf = float(sys.argv[3])
        print "Finding Associated Pair..."
        # data_set is a list of lists; each item is a row in the CSV file.
        # Attributes: DBA, BORO, CUISINE DESCRIPTION, INSPECTION DATE, ACTION, VIOLATION CODE,
        # VIOLATION DESCRIPTION, CRITICAL FLAG, SCORE, GRADE, GRADE DATE, RECORD DATE, INSPECTION TYPE
        data_set = Parser.parse(csvFile)
        frequentSets = Apriori.getFrequentSets(data_set, min_sup)
        associatedRules = Apriori.getAssociatedRulesWith(data_set, frequentSets, min_sup, min_conf)
        number_of_rows = len(data_set)
        Apriori.exportWith(frequentSets, associatedRules, min_sup, min_conf, number_of_rows)
def rule_generate(request):
    operationtype = str(request.POST.get("OperationType"))
    global datagridpagesize
    global TotalFileList
    global mylist
    global patterns
    global lengthofmylist, pages, nextpage, previouspage, pagelist, ruleslist, TopList, interests
    global protocol  # was "protocl" (typo); the name assigned below is "protocol"
    global label_list
    lengthofmylist = len(TotalFileList)
    pages = int(lengthofmylist / datagridpagesize) + (lengthofmylist % datagridpagesize > 0)
    nextpage = pages
    previouspage = 0
    pagelist = []
    ruleslist = []
    TopList = []
    interests = []
    templist = []
    try:
        for tab in range(datagridpagesize):
            TopList.append(TotalFileList[tab + (gloffset - 1) * datagridpagesize])
    except:
        pass
    for tab in range(pages):
        pagelist.append(str(tab + 1))
    protocol = str(request.POST.get("strProtocol")).strip()
    label_list = str(request.POST.get("strAtributes")).strip().split(',')
    # print(label_list)
    rulesfolder = "inputrules"
    # starttime = time.time()
    # timestart = time.clock()
    if not os.path.isdir(ProjectPath + '/' + rulesfolder):
        os.makedirs(ProjectPath + '/' + rulesfolder)
    if not os.path.isdir(str(ProjectPath + '/results')):
        os.makedirs(str(ProjectPath + '/results'))
    for each in interests:
        # InputForRules.GenerateInputForRules(eachgroup)
        with open(ProjectPath + '/' + rulesfolder + "/input_" + each, "w") as fout:
            pass
        with open(ProjectPath + "/results/RulesFor_" + each, "w") as fout:
            pass
    for each in TotalFileList:
        templist.append(each.filename)
    if operationtype == "Association Rule":
        para = Parameter(float(request.POST.get("paraMinSupp")), float(request.POST.get("paraMinCond")),
                         float(request.POST.get("paraMinLift")), float(request.POST.get("paraMinKulc")),
                         float(request.POST.get("paraThreshIR")))
        patterns = str(request.POST.get("strPattern"))
        interests.append(patterns)
        for each in interests:
            # print(each + " is processing......")
            InPutForRulesVersion2.MainFunc(templist, os.path.join(ProjectPath, FilesStoreFolder),
                                           protocol, each.strip(), rulesfolder, label_list)
            a = Apriori(para.min_supp, ProjectPath + '/' + rulesfolder + "/input_" + each.strip())
            ls = a.do()
            rules = a.ralationRules(ls.get(ls.size()).items, para.min_cond, para.min_lift,
                                    para.min_kulc, para.thresh_ir)
            rule_count = 0
            for rule in rules:
                rule_count += 1
                ruleslist.append(str(rule_count) + 'th' + str(rule))
            with open(ProjectPath + "/results/RulesFor_" + each, "a") as fout:
                fout.write("min_support is " + str(para.min_supp) + ". min_confidence is " + str(para.min_cond) + "\n")
                for rule in rules:
                    fout.write(str(rule) + '\n')
                fout.write("-" * 100 + "\n")
        return rule_show(request)
    elif operationtype == "Bayes":
        O = []
        Result, TotalIdf, E, D1, D2 = BayesEntropy2.Main(templist, os.path.join(ProjectPath, FilesStoreFolder),
                                                         protocol, label_list)
        for eachk, eachv in D1.items():
            O.append("***********************************\nThe commander is *************************: " + eachk + '\n')
            O.append("The Entropy is : " + str(E[eachk]) + '\n')
            O.append("The Total appear times is : " + str(sum(D2[eachk])) + '.\n')
            for e2, k2 in eachv.items():
                O.append("The \"" + e2 + "\" appears " + str(k2) + " times and appears in " + str(TotalIdf[e2]) + " files. \n")
                O.append("The result for \"" + e2 + "\" is : " + str(Result[e2]) + '!\n')
        ruleslist = O
        return rule_show(request)
# -*- coding: utf-8 -*-
#-------------------------------------------------------------------------------
# Name    :
# Author  : Xueshijun
# MailTo  : [email protected] / [email protected]
# QQ      : 324858038
# Blog    : http://blog.csdn.net/xueshijun666
# Created on Tue Feb 23 11:08:20 2016
# Version : 1.0
#-------------------------------------------------------------------------------
'''mushroom.dat'''
import Apriori as ap

mushDataSet = [line.split() for line in open('mushroom.dat').readlines()]
L, supportData = ap.apriori(mushDataSet, minSupport=0.3)
for item in L[1]:
    if item.intersection('2'):
        print item
for item in L[3]:
    if item.intersection('2'):
        print item
if __name__ == '__main__':
    args = main(sys.argv[1:])
    min_sup = args[1][0]
    min_conf = args[1][1]
    input_filename = args[1][2]
    output_filename = args[1][3]
    L = [[]]
    C = [[], []]
    C_support = [[]]
    data_set = load_data(input_filename)
    data_count = float(len(data_set))
    C_support.append(support_count_C1(data_set))
    L.append(Apriori_L1(C_support[1], data_count, min_sup))
    # print(L[1])
    k = 1
    while L[k]:
        k += 1
        C.append(Apriori_gen(L[k - 1], k))
        C_support.append(support_count(data_set, C[k]))
        L.append(Apriori(C_support[k], data_count, min_sup))
    rule = []
    rule = generate_Rule(L, C_support, min_sup=min_sup, min_conf=min_conf)
    association_rule_output(rule, output_filename)
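# Apriori_gen is called above but not defined in this snippet. A minimal
# sketch of the classic join-and-prune candidate generation it presumably
# performs (the frozenset representation is an assumption for illustration):
from itertools import combinations

def Apriori_gen(L_prev, k):
    prev = {frozenset(s) for s in L_prev}
    prev_list = list(prev)
    candidates = set()
    for i in range(len(prev_list)):
        for j in range(i + 1, len(prev_list)):
            union = prev_list[i] | prev_list[j]
            # Join step: keep unions of exactly k items; prune step: every
            # (k-1)-subset must itself be frequent.
            if len(union) == k and all(frozenset(sub) in prev
                                       for sub in combinations(union, k - 1)):
                candidates.add(union)
    return candidates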
def GET(self, cnpj):
    cnpjs = Banco().searchCNPJS(cnpj)
    regras = Apriori().extractRules(cnpjs)
    cnpjs = Banco().extractCNPJs(regras)
    return Banco().formatCNPJS(regras, cnpj)
# The raw data is in inverted-list (transaction-per-row) format.

# In[ ]:

inverted = pd.read_csv(
    r'D:\Python_Training\script_Python\15Association\bank.csv',
    encoding='gbk')
inverted.head()

# ## Data conversion
# Convert the inverted-list data into the corresponding 2-D list of baskets.

# In[ ]:

idataset = apri.dataconvert(inverted,
                            tidvar='CSR_ID',
                            itemvar='PROD',
                            data_type='inverted')
idataset[:5]

# ## Association rules
# Parameters:
#
# + minSupport: minimum support threshold
# + minConf: minimum confidence threshold
# + minlen: minimum rule length
# + maxlen: maximum rule length
# The lower minSupport or minConf is set, the more rules are produced and
# the heavier the computation.
#
# Parameters used here: minSupport=0.05, minConf=0.5, minlen=1, maxlen=10;
# a sketch of the call follows.
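# In[ ]:

# The cell that issues the call is not shown above; based on the arules
# signature used elsewhere in this project, it would presumably look like
# this (an assumed sketch, not the notebook's exact cell):
res = apri.arules(idataset, minSupport=0.05, minConf=0.5, minlen=1, maxlen=10)
res.head()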
if v.find("BROWN") != -1: return "brown" return v df.FavSQLServ = df.FavSQLServ.map(sql_standardize) df.FavColor = df.FavColor.map(color_standardize) df.OS = df.OS.map(os_standardize) # Fill missing. (Not for Apriori) # #df = df.fillna(df.mean()) #Pre Apriori: #Combine attributes with their values, so we mine on key/values that are equal, not just same values. r = [] for i in range(len(df)): f = set([]) for j in range(len(df.iloc[i])): a = str(df.iloc[i].index[j]) b = str(df.iloc[i][j]) c = a + ": " + b f.add(c) r += [f] print "The output reads: 20 of 66 students has these values alike:" a = Apriori.Apriori() for i in range(20): print str(20 - i) print a.find(r, 20 - i)
if __name__ == '__main__':
    minConf = 0.5
    minSup = 0.005
    maxk = 4
    bucketSize = 4999
    idMap, dataSet = ap.loadData("./src/Groceries.csv")
    itemBaskets = list(map(frozenset, dataSet))

    def hashFuncForPair1(pair: set) -> int:
        # sha1 = hashlib.sha1()
        hashVal = 1
        for item in pair:
            # sha1.update(idMap[item].encode("utf-8"))
            hashVal *= hash(idMap[item])
            # hashVal += hash(idMap[item])
        # return int(sha1.hexdigest(), 16)
        return hashVal

    def hashFuncForPair2(pair: set) -> int:
        sha1 = hashlib.sha1()
        # hashVal = 0
'''
# Mine frequent itemsets and frequent rules
itemsets, rules = apriori(data, min_support=0.5, min_confidence=0.1)
print(itemsets)
print()
print(rules)
'''

'''
The advantage of efficient-apriori is that it is simple to use, and the
package puts the items of each transaction in the data set into a set for
the computation, without considering their order. This matches reality:
items in the same shopping basket do not need to be considered in purchase
order. Other Apriori implementations may take order into account and can
therefore compute incorrect frequent itemsets. That is why the
efficient-apriori package is used here.
'''

# In[]:
# 2. Custom library:
ress = apri.arules(data, minSupport=0.5, minConf=0.1, minlen=1, maxlen=4)  # returns a DataFrame
print(type(ress))

# In[]:
# *****************************************************************************
# In[]:
# Part 2 example: a transaction-database-format data set:
inverted = pd.read_csv("Transactions.csv")
inverted.head()

# In[]:
def encode_unit(x):
    if x <= 0:
        return False
    if x >= 1:  # completion assumed from the standard one-hot pattern
        return True
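# In[]:
# How encode_unit is typically applied (an assumed sketch, not necessarily
# the notebook's exact next cell): pivot the inverted table into a
# transaction-by-item count matrix, then map counts to booleans. The column
# names OrderNumber and Model are taken from the Transactions.csv usage
# elsewhere in this project.
basket = (inverted.groupby(['OrderNumber', 'Model'])
          .size().unstack(fill_value=0))
basket_sets = basket.applymap(encode_unit)
basket_sets.head()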
#!usr/bin/python
# coding:utf-8
import Apriori

dataSet = Apriori.loadDataSet()
print dataSet
# c1 = Apriori.createC1(dataSet)
# print c1
# D = map(set, dataSet)
# L1, supportData0 = Apriori.scanD(D, c1, 0.5)
# print L1
# print supportData0
L, supportData = Apriori.apriori(dataSet)
print L
print supportData
print '\n'
rules = Apriori.generateRules(L, supportData, 0.7)
print rules
print dataSet[0][:1]
def test_apriori():
    data_set = Apriori.load_data_set()
    l, support_data = Apriori.apriori(data_set, 0.5)
    print l
    print support_data
__author__ = 'bigship'
import Apriori

def split(line, chars):
    # Return the characters of line that are not in chars
    # (here: strip commas and newlines). Renamed from "str", which
    # shadowed the builtin.
    retList = []
    for x in line:
        if x != chars[0] and x != chars[1]:
            retList.append(x)
    return retList

mushDataSet = []
data = open('mushroom.txt')
cha = [',', '\n']
for line in data.readlines():
    mushDataSet.append(split(line, cha))
data.close()
smallDataSet = mushDataSet[:10]
print smallDataSet
L, supportData = Apriori.apriori(smallDataSet, 0.7)
for item in L[4]:
    if item.intersection('e'):
        print item
result = Apriori.generateRules(L, supportData, 0.85)
# The raw data is in inverted-list (transaction-per-row) format.

# In[ ]:

# Transactions: sales data for bicycles and related items.
inverted = pd.read_csv(r'D:\Python_book\15Association\Transactions.csv')
inverted.head()

# ## Data conversion
# Convert the inverted-list data into the corresponding 2-D list of baskets.

# In[ ]:

idataset = apri.dataconvert(inverted,
                            tidvar='OrderNumber',
                            itemvar='Model',
                            data_type='inverted')
idataset[:5]

# ## Association rules
# Parameters:
#
# + minSupport: minimum support threshold
# + minConf: minimum confidence threshold
# + minlen: minimum rule length
# + maxlen: maximum rule length
# The lower minSupport or minConf is set, the more rules are produced and
# the heavier the computation.
#
# Parameters used here: minSupport=0.05, minConf=0.5, minlen=1, maxlen=10;
# a sketch of the call follows.
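# In[ ]:

# The invoking cell is not shown; based on the arules signature used
# elsewhere in this project, the call would presumably be (an assumed
# sketch):
res = apri.arules(idataset, minSupport=0.05, minConf=0.5, minlen=1, maxlen=10)
res.head()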
        # print date[0]
        # print start_date
        # print bigRise[i]
        if len(bigRise) == 23:
            if bigRise[day] == '1':
                calcu_list.append(code[day])
        # else:
        #     print code[0] + ' does not have enough days'
    # print calcu_list
    sum_list.append(calcu_list)

print sum_list
print len(sum_list)

minSupport = 0.1
minConf = 0.1
L, SupportData = Apriori.apriori(sum_list, minSupport)
rules = Apriori.generateRules(L, SupportData, minConf)
print rules

time_end = time.time()
print u"Total running time:"
print time_end - time_start