def freqAlgo():
    print(' ------------------------------------- ')
    print('|Please select your desired algorithm |')
    print('|1. Apriori |')
    print('|2. FP-Growth |')
    print(' ------------------------------------- ')
    freqChoice = input('Enter the number of your choice: ')
    if freqChoice.isalpha() or any(c in specialChars for c in freqChoice):
        print('\nPlease enter an integer.\n')
        freqFlag = 1
    else:
        freqChoice = int(freqChoice)
        if freqChoice == 1:
            print('*** You selected Apriori algorithm. ***\n')
            freqFlag = 0
            supportCount()
            apriori.apriori(transRecord, minSupp)
        elif freqChoice == 2:
            print('*** You selected FP-Growth algorithm. ***\n')
            freqFlag = 0
            supportCount()
            fpgrowth.process(transRecord, minSupp)
        else:
            print('\n*** Please choose between 1 or 2 and try again. ***\n')
            freqFlag = 1
def aprithread(self):
    self.result_data_Text.insert(INSERT, 'Frequent itemsets:\n')
    start = time.time()
    n = 0
    if self.suanfa == 'Aprioi':  # self.suanfa holds the algorithm name selected in the UI
        if self.minsupport != 0:
            self.L, self.support = apriori.apriori(self.dataset, self.minsupport)
        else:
            self.L, self.support = apriori.apriori(self.dataset)
        for x in self.L:
            for i in x:
                self.result_data_Text.insert(INSERT, i)
                self.result_data_Text.insert(INSERT, '\n')
                n += 1
    else:
        self.frozenDataSet = fpgrowth.transfer2FrozenDataSet(self.dataset)
        self.L = {}
        self.prefix = set([])
        if self.minconfig != 0:
            self.fptree, self.headPointTable = fpgrowth.createFPTree(self.frozenDataSet, self.minsupport)
            fpgrowth.mineFPTree(self.headPointTable, self.prefix, self.L, self.minsupport)
        else:
            self.fptree, self.headPointTable = fpgrowth.createFPTree(self.frozenDataSet)
            fpgrowth.mineFPTree(self.headPointTable, self.prefix, self.L)
        for i in self.L:
            # print(i)
            self.result_data_Text.insert(INSERT, i)
            self.result_data_Text.insert(INSERT, '\n')
            n += 1
    self.result_data_Text.insert(INSERT, str(n) + '\n')
    end = time.time()
    self.log_data_Text.insert(INSERT, 'Frequent itemsets generated! ' + str(n) + ' items, time: ' + str(round(end - start, 2)) + 's\n')
def run(file, s, categorical=False):
    a_t = 0
    e_t = 0
    f_t = 0
    t1 = time.time()
    apriori(file, s, categorical)
    a_t = time.time() - t1
    t1 = time.time()
    eclat(file, s, categorical)
    e_t = time.time() - t1
    t1 = time.time()
    fp(file, s, categorical)
    f_t = time.time() - t1
    return a_t, e_t, f_t
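# A minimal usage sketch for the timing harness above (a sketch only: the file
# name 'retail.csv' and the 0.05 support threshold are illustrative, and the
# apriori/eclat/fp callables are assumed to be the same ones used by run()).
if __name__ == '__main__':
    a_t, e_t, f_t = run('retail.csv', 0.05)
    print('apriori: {:.2f}s  eclat: {:.2f}s  fp-growth: {:.2f}s'.format(a_t, e_t, f_t))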
def product_frequency(filename):
    # Import data
    df = read_cheerio(filename)
    df = df[['Business/Ent Support', 'CloudFront', 'CloudSearch', 'DynamoDB', 'EC2', 'EMR',
             'ElastiCache', 'Glacier', 'RDS Service', 'Route 53', 'SES',
             'SNS', 'SQS', 'S3', 'SWS', 'SimpleDB', 'VPC', 'Red Shift',
             'OpsWorks', 'Transcode', 'EC2 P-IOPS', 'EC2 EBS Optimized',
             'EC2 Load Balancer', 'EC2 Spot Usage', ' EBS:Snapshot',
             'Invalidations', 'Multi-AZ', ' RDS-PIOPS', 'LBR Queries',
             'TimedStorage Glacier', 'TimedStorage RRS', 'Data Transfer Region']]
    df = df.fillna(0)
    df = df.applymap(f)
    # print df
    counts = pd.DataFrame()
    for column in df.columns.values:
        counts[column] = df[column].value_counts()
    counts = counts.T
    counts = counts.sort(1, ascending=False)
    minsupport = 80
    valid = set(k for k, v in counts[1].iteritems() if (v >= minsupport))
    itemsets = [frozenset([v]) for v in valid]
    freqsets, support = apriori(itemsets, minsupport, 100)
    pprint(freqsets)
    print support
def run_apriori_and_generate_rules(transactions, items, min_support, min_confidence,
                                   output_filename, output_rules=True):
    """
    Take the necessary parameters after the main function parses the CLI arguments
    to start the apriori algorithm and generate all association rules.

    @Input:
        transactions: list of frozensets of transactions
        items: list of 1-itemsets
        min_support: minimum support to use when calculating candidate itemsets
            and validating itemsets
        min_confidence: minimum confidence to use when generating rules
        output_filename: filename to which the program will write association rules
        output_rules: if set to True, the program will serialize the rules to a file,
            as specified on the command line; if set to False, the function will simply
            return the list of rules for use in stress testing.
    @Return: None or association_rules (depends on output_rules)
    """
    N = len(transactions)
    global_itemset_dict, frequency_set = apriori.apriori(transactions, items, min_support)
    association_rules, output_header = apriori.derive_association_rules(
        global_itemset_dict, frequency_set, integer_to_data, min_support, min_confidence, N)
    if output_rules:
        if len(association_rules) == 0:
            print("No association rules to serialize")
        else:
            serialize_rules(global_itemset_dict, association_rules, output_header, output_filename)
    else:
        return association_rules
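# Hypothetical call of the helper above, assuming the surrounding CLI code has
# already built `transactions` and `items`; the thresholds and file name here
# are illustrative only, not taken from the original program.
rules = run_apriori_and_generate_rules(
    transactions, items,
    min_support=0.05, min_confidence=0.6,
    output_filename='rules.txt',
    output_rules=False)  # return the rules instead of serializing them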
def main():
    min_sup = 0.3
    min_conf = 0.8
    items_frecuentes = apriori(carrito, min_sup)
    print("Database used")
    for i, t in enumerate(carrito):
        print("t", i, ">>", t)
    print('*' * 10, "GENERATED RULES", '*' * 10)
    generador_reglas(carrito, items_frecuentes, min_conf, min_sup)
def test_apriori():
    data = ("a,b,c,d,e,f\n"
            "g,h,i,j,k,l\n"
            "z,x\n"
            "z,x\n"
            "z,x,y\n"
            "z,x,y,i\n")
    expectedItemSets = {
        ItemSet("i"): 2 / 6,
        ItemSet("z"): 4 / 6,
        ItemSet("x"): 4 / 6,
        ItemSet("y"): 2 / 6,
        ItemSet("xz"): 4 / 6,
        ItemSet("yz"): 2 / 6,
        ItemSet("xy"): 2 / 6,
        ItemSet("xyz"): 2 / 6
    }

    index = InvertedIndex()
    index.load(data)
    itemsets = apriori(index, 2 / 6)
    assert set(expectedItemSets.keys()) == set(itemsets)
    for itemset in itemsets:
        assert expectedItemSets[itemset] == index.support(itemset)
    print("Itemsets={}".format([i for i in itemsets if len(i) > 1]))

    # (antecedent, consequent, confidence, lift, support)
    expectedRules = {
        (frozenset({Item("x"), Item("y")}), frozenset({Item("z")}), 1, 1.5, 1 / 3),
        (frozenset({Item("x")}), frozenset({Item("y")}), 0.5, 1.5, 1 / 3),
        (frozenset({Item("x")}), frozenset({Item("z"), Item("y")}), 0.5, 1.5, 1 / 3),
        (frozenset({Item("x")}), frozenset({Item("z")}), 1, 1.5, 2 / 3),
        (frozenset({Item("y")}), frozenset({Item("x")}), 1, 1.5, 1 / 3),
        (frozenset({Item("y")}), frozenset({Item("z"), Item("x")}), 1, 1.5, 1 / 3),
        (frozenset({Item("y")}), frozenset({Item("z")}), 1, 1.5, 1 / 3),
        (frozenset({Item("z"), Item("x")}), frozenset({Item("y")}), 0.5, 1.5, 1 / 3),
        (frozenset({Item("z"), Item("y")}), frozenset({Item("x")}), 1, 1.5, 1 / 3),
        (frozenset({Item("z")}), frozenset({Item("x"), Item("y")}), 0.5, 1.5, 1 / 3),
        (frozenset({Item("z")}), frozenset({Item("x")}), 1, 1.5, 2 / 3),
        (frozenset({Item("z")}), frozenset({Item("y")}), 0.5, 1.5, 1 / 3),
    }

    rules = set(generate_rules(itemsets, 0, 0, index))
    for (antecedent, consequent, confidence, lift, support) in rules:
        print("{}, {} conf={:.4f}, {:.4f}, {:.4f}".format(
            antecedent, consequent, confidence, lift, support))
    assert rules == expectedRules
def test_rules(filename, minconf):
    transactions, attr_info = pp.read_transaction_data(filename)
    large_itemsets = ap.apriori(filename, 3)
    # lhs = large_itemsets[-1][:-1]
    # rhs = [large_itemsets[-1][-1]]
    lhs = ['A', 'B', 'C']
    rhs = ['Iris-setosa']
    print(lhs, rhs)
    print(gr.process_one_rule(lhs, rhs, transactions, minconf))
def test_apriori():
    expected_result = {
        ('johnson81;4081;Craig;Johnson', ): 0.2,
        ('Username; Identifier;First name;Last name', ): 0.2,
        ('grey07;2070;Laura;Grey', ): 0.2,
        ('booker12;9012;Rachel;Booker', ): 0.2,
        ('jenkins46;9346;Mary;Jenkins', ): 0.2
    }
    actual_result = apriori.apriori(data_path, 0.2)
    assert expected_result.items() <= actual_result.items()
def test_generate_associations(self):
    L, supp_data = apriori.apriori(self.dataset, min_support=0.5)
    print 'L:', L
    print '-' * 20
    print 'supp_data: ', supp_data
    print '-' * 20
    rules = apriori.generateRules(L, supp_data, min_confidence=0.95)
    print '-' * 20
    print 'rules: ', rules
    print '-' * 20
    assert False
def run_algorithm(data, mode, support, iterative, use_CUDA, block, thread):
    if mode == 'apriori':
        print('Running Apriori algorithm with %f support and data shape: ' % (support), np.shape(data))
        result = apriori(data, support)
        return result
    elif mode == 'eclat':
        print('Running eclat algorithm with %f support and data shape: ' % (support), np.shape(data))
        result = eclat(data, support, iterative, use_CUDA, block, thread)
        return result
    else:
        raise NotImplementedError('Invalid algorithm mode.')
def run_main():
    # Preprocess the data
    # changeData()
    # handleData()

    # Test frequent itemset mining
    dataSet = apriori.loadDataSet()
    print(dataSet)
    print(len(dataSet))
    # C1 = apriori.createC1(dataSet)
    # D = list(map(set, dataSet))
    L, suppData = apriori.apriori(dataSet, 0.2)
    print(L)
    print("========")
    print(L[0])
def createLs1(dataSet, min_support):  # 'Ls' for Large Sequence
    n = len(dataSet)
    flattenSet = list(itertools.chain(*dataSet))
    flatten_n = len(flattenSet)
    # Transform the min_support to litemset_support
    min_support_new = min_support * n / flatten_n
    litemsets = apriori(flattenSet, min_support=min_support_new)
    mapping = {v: k for k, v in enumerate(litemsets)}
    # Transform the litemset_support to sequence_support
    supportLs1 = {(mapping[k],): v * flatten_n / n for k, v in litemsets.items()}
    return mapping, supportLs1
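# Worked example of the support rescaling above (numbers are illustrative only):
# 4 customer sequences flattening into 10 itemsets turn a sequence-level
# min_support of 0.5 into an itemset-level threshold of 0.5 * 4 / 10 = 0.2,
# and an itemset support of 0.3 maps back to a sequence support of 0.3 * 10 / 4 = 0.75.
n, flatten_n = 4, 10
assert abs(0.5 * n / flatten_n - 0.2) < 1e-9
assert abs(0.3 * flatten_n / n - 0.75) < 1e-9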
def explain(self):
    print('start')
    retData = []
    with progressbar.ProgressBar(max_value=self.num) as bar:
        for m in range(self.num):
            tmpData = {}
            code = self.tokenedCodes[m]
            # print(self.sbts[m])
            com, c = self.translateStrs(
                code, self.checkMode(self.mode, 'withSbt'), self.sbts[m])
            tmpData['code'] = code
            tmpData['comment'] = com
            self.r.extract_keywords_from_text(com)
            comKeys = self.r.get_ranked_phrases()
            tmpData['commentKeywords'] = comKeys
            codeWordList = self.tokenizer.toDoubleList(code)
            codeKeys, codeKeyIndex = self.extractCodeKeys(code)
            # codeKeys, codeKeyIndex = self.extractCodeKeysn(code)
            tmpData['codeKeywords'] = codeKeys
            tmpData['codeKeyIndex'] = codeKeyIndex
            tmpList = []
            for key in comKeys:
                tmpResults = {
                    'commentKeyword': key,
                }
                keyNums = np.zeros(len(codeKeyIndex))
                i, ni, p = self.explainKey(codeWordList, self.sbts[m],
                                           codeKeyIndex, self.numSamples, key, 0.6)
                tmpResults['numberHaveKey'] = len(i)
                tmpResults['numberNoKey'] = len(ni)
                tmpResults['probability'] = p
                for keyIds in ni:
                    tmp = list(set(keyIds))
                    for id in tmp:
                        keyNums[id] += 1
                L, support = apriori(ni, 0.3)
                L = [[[int(j) for j in i] for i in l] for l in L]
                support = [[[int(i) for i in s[0]], s[1]] for s in support.items()]
                tmpResults['anchors'] = L
                tmpResults['supports'] = support
                tmpList.append(tmpResults)
            tmpData['explanations'] = tmpList
            retData.append(tmpData)
            bar.update(m)
    return retData
def test_stress():
    datasets = [
        ("datasets/UCI-zoo.csv", 0.3),
        ("datasets/mushroom.csv", 0.4),
        # ("datasets/BMS-POS.csv", 0.05),
        # ("datasets/kosarak.csv", 0.05),
    ]
    for (csvFilePath, min_support) in datasets:
        # Run Apriori and FP-Growth and assert both have the same results.
        print("Running Apriori for {}".format(csvFilePath))
        start = time.time()
        index = InvertedIndex()
        index.load_csv(csvFilePath)
        apriori_itemsets = apriori(index, min_support)
        apriori_duration = time.time() - start
        print(
            "Apriori complete. Generated {} itemsets in {:.2f} seconds".format(
                len(apriori_itemsets), apriori_duration))

        print("Running FPTree for {}".format(csvFilePath))
        start = time.time()
        with open(csvFilePath, newline='') as csvfile:
            test_transactions = list(csv.reader(csvfile))
        fptree_itemsets = mine_fp_tree(test_transactions, min_support)
        fptree_duration = time.time() - start
        print(
            "fp_growth complete. Generated {} itemsets in {:.2f} seconds".format(
                len(fptree_itemsets), fptree_duration))

        if set(fptree_itemsets) == set(apriori_itemsets):
            print("SUCCESS({}): Apriori and fptree results match".format(csvFilePath))
        else:
            print("FAIL({}): Apriori and fptree results differ!".format(csvFilePath))
        assert set(fptree_itemsets) == set(apriori_itemsets)

        if apriori_duration > fptree_duration:
            print(
                "FPTree was faster by {:.2f} seconds".format(
                    apriori_duration - fptree_duration))
        else:
            print(
                "Apriori was faster by {:.2f} seconds".format(
                    fptree_duration - apriori_duration))
        print("")
def frequentPattern():
    # Count language frequencies
    langfreq = dict()
    for val in columns["progLangs"]:
        for lang in val.split(','):
            langl = lang.lower().lstrip()
            if langl in langfreq.keys():
                langfreq[langl] = langfreq[langl] + 1
            else:
                langfreq[langl] = 1

    # List language frequencies of languages appearing more than once
    langfreqc = dict()
    langfreqkey = []
    langfreqindex = dict()
    index = 0
    for lang in langfreq.keys():
        freq = langfreq[lang]
        if freq > 1 and lang != "":
            langfreqc[lang] = freq
            langfreqkey.append(lang)
            langfreqindex[lang] = index
            index = index + 1

    # Create list of sets for languages
    setLang = []
    for i, val in enumerate(columns["progLangs"]):
        row = []
        # Add languages
        for lang in val.split(','):
            langl = lang.lower().lstrip()
            if langl in langfreqkey:
                row.append(langl)
        # Add OS
        # if not oss[i] == "-":
        #     row.append(oss[i])
        # Add if all data good
        if len(row) > 0:
            setLang.append(row)

    result = apriori.apriori(setLang, 0.5)
    print("\nLanguage frequencies: {0}".format(langfreqc))
    print("Frequent Patterns: {0}".format(result))

    # Verification (compute lift)
    for (pattern, freq) in result.iteritems():
        lift = apriori.lift(setLang, pattern)
        print("Lift{0}: {1}".format(pattern, lift))
def main(argv):
    # Load the data
    dataset = load_large_data()
    print dataset
    # Find the frequent itemsets
    timestart = time.clock()
    L, support = ap.apriori(dataset, 0.2)
    time_elapsed = (time.clock() - timestart)
    print 'Frequent itemsets:'
    print L
    # Generate the association rules
    print 'Association rules:'
    ap.printRules(L, support, 0.6)
    print 'Elapsed time:', time_elapsed, 's'
    print 'Memory:', resource.getrusage(resource.RUSAGE_SELF).ru_maxrss, 'byte'
def execute(self):
    if self.somethingWrong:
        # We do not execute the program
        print("An error was found, the data-set has missing values")
        print("Please remove those values before the execution")
        print("Aborting the program")
        # We should not use the statement: System.exit(-1);
    else:
        # We do here the algorithm's operations
        print("No errors, Execute in FARCHD execute :")
        self.dataBase = DataBase(self.nLabels, self.train_myDataSet)
        self.ruleBase = RuleBase(self.dataBase, self.train_myDataSet, self.k, self.typeInference)
        print("dataBase, ruleBase initialized , Execute in FARCHD execute :")
        self.apriori_instance = apriori()
        self.apriori_instance.init_with_more_parameters(self.ruleBase, self.dataBase, self.train_myDataSet, self.minsup, self.maxconf, self.depth)
        self.apriori_instance.generate_RB()
        print("dataBase, ruleBase initialized , Execute in FARCHD execute :")
        self.rules_stage1 = self.apriori_instance.get_rules_stage1()
        print("FARC_HD,rules_stage1,is :" + str(self.rules_stage1))
        self.rules_stage2 = self.ruleBase.size()
        print("FARC_HD,rules_stage2,is :" + str(self.rules_stage2))
        print("self.ruleBase in FARC_HD execute, pass into population :" + str(self.ruleBase))
        pop = population(self.train_myDataSet, self.dataBase, self.ruleBase, self.population_size, self.BITS_GEN, self.maxTrials, self.alpha)
        pop.generation()
        print("Building classifier ......")
        self.ruleBase = pop.rulebase_get_bestRB()
        print("FARC_HD,rule stage3, FARC_HD ruleBase.size() is :" + str(self.ruleBase.size()))
        self.rules_stage3 = self.ruleBase.size()
        self.dataBase.save_file(self.fileDB)
        self.ruleBase.save_file(self.fileRB)
        self.doOutput(self.val_myDataSet, self.outputTr)
        self.doOutput(self.test_myDataSet, self.outputTst)
        self.total_time = time.time() - self.startTime
        self.write_time()
        self.write_rules()
        print(" FARC_HD algorithm is finished . ")
def test1():
    dataSet = apriori.loadDataSet()
    print(dataSet)
    # [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]

    # C1 = apriori.createC1(dataSet)
    # print(set(C1))
    # {frozenset({4}), frozenset({5}), frozenset({2}), frozenset({3}), frozenset({1})}
    # print(list(C1))
    # [frozenset({1}), frozenset({2}), frozenset({3}), frozenset({4}), frozenset({5})]

    # D = map(set, dataSet)
    # print(list(D))
    # [{1, 3, 4}, {2, 3, 5}, {1, 2, 3, 5}, {2, 5}]
    # Note: after list(map1), the map object map1 is exhausted; set(.) seems to consume it as well.

    # L1, suppData0 = apriori.scanD(D, C1, 0.5)
    # scanD cannot take the iterators directly any more; convert D and C1 to lists first.
    # print(L1)
    # [frozenset({1}), frozenset({3}), frozenset({2}), frozenset({5})]
    # print(suppData0)
    # {frozenset({4}): 0.25, frozenset({5}): 0.75, frozenset({2}): 0.75, frozenset({3}): 0.75, frozenset({1}): 0.5}

    L, suppData = apriori.apriori(dataSet, 0.5)
    print(L)
    print(suppData)
    rules = apriori.generateRules(L, suppData, minConf=0.5)
    print(rules)
def explain_n(self):
    print('start')
    retData = []
    with progressbar.ProgressBar(max_value=self.num) as bar:
        for m in range(self.num):
            tmpData = {}
            code = self.tokenedCodes[m]
            # print(self.sbts[m])
            com, c = self.translateStrs(
                code, self.checkMode(self.mode, 'withSbt'), self.sbts[m])
            tmpData['code'] = code
            tmpData['comment'] = com
            self.r.extract_keywords_from_text(com)
            comKeys = self.r.get_ranked_phrases()
            tmpData['commentKeywords'] = comKeys
            codeWordList = self.tokenizer.toDoubleList(code)
            codeKeys, codeKeyIndex = self.extractCodeKeys(code)
            tmpData['codeKeywords'] = codeKeys
            tmpData['codeKeyIndex'] = codeKeyIndex
            tmpList = []
            retSamples = self.explainMultiKey(codeWordList, self.sbts[m],
                                              codeKeyIndex, self.numSamples, comKeys, 0.6)
            for index, sample in retSamples.items():
                tmpResults = {
                    'commentKeyword': comKeys[index],
                }
                L, support = apriori(sample, 0.3)
                L = [[[int(j) for j in i] for i in l] for l in L]
                support = [[[int(i) for i in s[0]], s[1]] for s in support.items()]
                tmpResults['anchors'] = L
                tmpResults['supports'] = support
                tmpList.append(tmpResults)
            tmpData['explanations'] = tmpList
            retData.append(tmpData)
            bar.update(m)
    return retData
def run(dataset=None, filename=None, path="./Data", sep=",", minsupport=0.1, min_factor=0.5):
    # Read the binarized file (turn the binarized file into transactions)
    filepath = path + "/" + dataset + "/" + filename
    load.load(dataset=dataset, filename=filename)

    # Read the binarized transactions
    transpath = path + "/" + dataset + "/" + "trans.json"
    with open(transpath, "r") as fp:
        d = json.load(fp)

    # Run apriori with minsupport to get the frequent itemsets
    l, support_data, c, f = apriori.apriori(d, minsupport=minsupport)
    # print("l is", l)
    # print("support is", support_data)

    # Print the frequent itemsets with their support
    """
    filepath = path + "/" + dataset + "/support_" + str(minsupport) + ".csv"
    writer = csv.writer(open(filepath, 'wb'))
    for key, value in support_data.items():
        writer.writerow([list(key)[:], value])
    """

    # Generate maximal and closed itemsets
    print("# of candidate itemsets is", c)
    print("# of frequent itemsets is", f)
    s, sc = maximal_itemset.maximal(l)
    print("# of maximal frequent itemsets is", sc)
    c, cc = closed_itemset.closed(l, support_data)
    print("# of closed frequent itemsets is", cc)
    # print("support data is", support_data)
    # mining.generateRules(l, support_data)

    # Generate rules
    # min_lift = min_factor
    # rules, noofrules = mining_lift.generateRules(l, support_data, min_factor=0.85)
    """
def main(args: Namespace):
    print(f"Dataset: {args.dataset}")
    print(f"Support: {args.support}")
    print(f"Confidence: {args.confidence}")
    print("-" * 20, "\n")

    transactions = get_transactions(args.dataset)
    f_item_sets = apriori(transactions, args.support, args.k)

    lengths = {}
    for k, v in f_item_sets.items():
        if len(k) in lengths:
            lengths[len(k)] += 1
        else:
            lengths[len(k)] = 1

    if len(f_item_sets) <= 20:
        print("The frequent itemsets are:\n")
    else:
        print("The first 20 frequent itemsets are:\n")
    for i, (f_item_set, support) in enumerate(f_item_sets.items()):
        if i < 20:
            print(f_item_set, support)
    print(f"\nA total of {len(f_item_sets)} frequent itemsets was found.")
    print(f"The distribution of frequent itemsets is: {lengths}.\n")

    a_rules = association_rules(f_item_sets, args.confidence)
    if len(a_rules) <= 20:
        print("The association rules are:\n")
    else:
        print("The first 20 association rules are:\n")
    for i, rule in enumerate(a_rules.items()):
        if i < 20:
            print(rule)
    print(f"\nA total of {len(a_rules)} association rules was found.")
def guestlike(guestid):
    guestid = int(guestid)
    idlist, goodslist, actionlist = reader.search()
    dataSet = reader.data_handle(idlist, goodslist)
    L, supportData = apriori.apriori(dataSet, minSupport=0.2)
    rule = apriori.gen_rule(L, supportData, minConf=0.7)
    glike = search.search(search.PowerSetsBinary(search.get(guestid, idlist, goodslist, actionlist)), rule)
    print(glike)
    guestlike = []
    conn = pymysql.connect(host='wxs.chinaeast.cloudapp.chinacloudapi.cn', user='******',
                           password='******', port=3306, db='demo')
    cur = conn.cursor()
    for i in glike:
        j = str(i) + '%'
        sql3 = " SELECT `goods_id`,`goods_name`,`goods_price` FROM goods_information " \
               "WHERE `goods_id` like '%s'" % (j)
        cur.execute(sql3)
        u3 = cur.fetchone()
        guestlike.append(u3)
    conn.close()
    return guestlike
#dataSet = [line.split() for line in f.readlines()]
#f.close()
dataSet = [line.split() for line in open('C:\\Users\\GYN\\Desktop\\lxx\\yibin_data\\SourceDrugNum20160125_asc.txt').readlines()]
# code by Adu
#dataSet = reduce(lambda x, y: x | y, [set(i.strip().split(',')) for i in open('C:\\Users\\GYN\\Desktop\\lxx\\yibin_data\\drug_allClass.txt')])
print dataSet

# test printing the utf-8 data as Chinese characters
#f2 = codecs.open('C:\\Users\\GYN\\Desktop\\lxx\\yibin_data\\apriori_result\\testwrite.txt', 'a', 'utf-8')
#for line in dataSet:
#    f2.writelines(str(line).encode('gbk') + '\n')

L, suppData = apriori.apriori(dataSet, minSupport=0.0005)  # 1/2124 = 0.00047
#file_object = open('F:\\test_result.txt', mode='w')
#strresult = str(suppData)
#file_object.write(strresult)
#file_object.close()
print "ok"
#print 'suppData=', suppData
#print 'L=', L
rules = apriori.generateRules(L, suppData, minConf=0.0005)
#decodedRules = rules.decode("unicode-escape")
print 'rules='
# This code is supporting material for the book
# Building Machine Learning Systems with Python
# by Willi Richert and Luis Pedro Coelho
# published by PACKT Publishing
#
# It is made available under the MIT License

from apriori import apriori, association_rules

from gzip import GzipFile

dataset = [[int(tok) for tok in line.strip().split()]
           for line in GzipFile('retail.dat.gz')]

freqsets, baskets = apriori(dataset, 80, maxsize=5)
nr_transactions = float(len(dataset))
for ant, con, base, pyx, lift in association_rules(dataset, freqsets, baskets, 30):
    print('{} | {} | {} ({:%}) | {} | {} | {}'
          .format(ant, con, len(baskets[con]), len(baskets[con]) / nr_transactions,
                  len(baskets[ant]), len(baskets[con | ant]), int(lift)))
import preprocess
import apriori

DATA_FILE = '../data/diagnosis.data'
OUTPUT_FILE = '../data/diagnosis.csv'
MIN_SUPPORT = 0.2
MIN_CONFIDENT = 0.6
MIN_LIFT = 3.0

# preprocess the original data file
preprocess.preprocess(DATA_FILE, OUTPUT_FILE)
# get the data_set from the .csv file
data_set = apriori.load_dataset(OUTPUT_FILE)
# get the frequent itemsets and their support values
f, f_support = apriori.apriori(data_set, MIN_SUPPORT)
# generate the rules
rules = apriori.gen_rules(f, f_support, MIN_CONFIDENT, MIN_LIFT)

# discard duplicate rules:
# if rule A's lhs and rhs are subsets of rule B's
# and rule A's lift is no greater than B's, rule A is discarded
discard_rules = []
for i in range(len(rules)):
    rule_a = rules[i]
    for j in range(len(rules)):
        if i == j or i in discard_rules or j in discard_rules:
            continue
        rule_b = rules[j]
        if (rule_a['lhs'].issubset(rule_b['lhs'])
                and rule_a['rhs'].issubset(rule_b['rhs'])
                and rule_a['lift'] <= rule_b['lift']):
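# Illustrative check of the pruning criterion above (the two rules are made up,
# not taken from the diagnosis data):
rule_a = {'lhs': frozenset(['a']), 'rhs': frozenset(['c']), 'lift': 3.2}
rule_b = {'lhs': frozenset(['a', 'b']), 'rhs': frozenset(['c']), 'lift': 3.8}
# rule_a's lhs and rhs are subsets of rule_b's and its lift is not higher,
# so rule_a is the one that would be discarded.
assert (rule_a['lhs'].issubset(rule_b['lhs'])
        and rule_a['rhs'].issubset(rule_b['rhs'])
        and rule_a['lift'] <= rule_b['lift'])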
data = []
for i in range(len(values)):
    temp = []
    for j in range(len(values[0])):
        if values[i][j] == 1:
            temp.append(j)
    data.append(temp)

counts = []
for index in columns:
    line = df[index]
    count = 0
    for i in range(len(line)):
        if line[i] == 1:
            count += 1
    counts.append((float)(count) / 10000)
counts.sort()
minSupport = counts[len(counts) * 1 / 5]

# use apriori
L, supportData = ap.apriori(data, minSupport)
rules = ap.generateRules(L, supportData, minConf=0.4)

# use fpGrowth
minSup = minSupport * 10000
simpDat = data
initSet = fp.createInitSet(simpDat)
myFPtree, myHeaderTab = fp.createTree(initSet, minSup)
myFreqList = []
fp.mineTree(myFPtree, myHeaderTab, minSup, set([]), myFreqList)
print myFreqList
def fp_growth(self, transactions, support):
    return pyfpgrowth.find_frequent_patterns(transactions, support)


if __name__ == '__main__':
    sys.setrecursionlimit(80000)
    train_set = pd.read_csv('GSM/new2gtrain.csv')
    a_time = []
    f_time = []
    for i in range(1, 10):
        train = train_set.head(i * 100).groupby('IMSI')['GridID'].apply(list)
        train = map(lambda a: list(set(a)), train)

        a_start = time.time()
        apriori(train, minSupport=1.0)
        a_end = time.time()
        print a_end - a_start
        a_time.append((a_end - a_start) * 1000)

        f = FPGrowthProcessor()
        f_start = time.time()
        f.fp_growth(train, 1)
        f_end = time.time()
        print f_end - f_start
        f_time.append((f_end - f_start) * 1000)

    x = [100, 200, 300, 400, 500, 600, 700, 800, 900]
    plt.figure(figsize=(8, 4))
    plt.plot(x, a_time, label="apriori", color="red", linewidth=2)
    plt.plot(x, f_time, color='blue', label="fpgrowth")
reload(apriori)
dataSet = apriori.loadDataSet()  # load the data
dataSet
C1 = apriori.createC1(dataSet)  # build C1, the candidate 1-itemsets of the data set
C1
D = list(map(set, dataSet))  # store the transactions as sets in a list
D
L1, supportData0 = apriori.scanD(
    D, C1, 0.5)  # with min support 0.5, compute each candidate's support and return the itemsets above the threshold as L1
L1
supportData0

# Generate the frequent itemsets from the support threshold
reload(apriori)
L, supportData = apriori.apriori(dataSet)
L  # frequent itemsets with support greater than 0.5
L[0]  # itemsets with one element
L[1]  # itemsets with two elements
L[2]  # itemsets with three elements
L[3]
apriori.aprioriGen(L[0], 2)  # see how the candidate 'L[1]' is generated before it is checked against the support threshold
L, supportData = apriori.apriori(dataSet, minSupport=0.7)  # a higher support threshold yields fewer results

# Generate association rules from the confidence threshold
reload(apriori)
L, supportData = apriori.apriori(dataSet, minSupport=0.5)
rules = apriori.generateRules(L, supportData, minConf=0.7)  # rules generated at 0.7 confidence
rules = apriori.generateRules(L, supportData, minConf=0.5)  # rules generated at 0.5 confidence

# Test how it performs on the poisonous-mushroom dataset
from apriori import generate_one_item_set
from apriori import apriori


def loadDataSet():
    return [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]


if __name__ == '__main__':
    dataSet = loadDataSet()
    one = generate_one_item_set(dataSet)
    freq = apriori(dataSet)
    print freq
print L1
print suppData1
'''
'''
L, suppData = apriori.apriori(dataSet, minSupport=0.5)
rules = apriori.generateRules(L, suppData, minConf=0.7)
print rules
rules = apriori.generateRules(L, suppData, minConf=0.5)
print rules
'''
'''
#--------------Congressional voting records------------#
actionIdList, billTitles = apriori.getActionIds()
'''
mushDatSet = [line.split() for line in open("C:\Users\YAN\Desktop\Apriori/mushroom.dat").readlines()]
L, suppData = apriori.apriori(mushDatSet, minSupport=0.3)
for item in L[1]:
    # intersection means set intersection
    if item.intersection('2'):
        print item
import apriori

dataMat = apriori.loadDataSet()
print(dataMat)
dataSet = apriori.createC1(dataMat)
print(dataSet)
L, supportData = apriori.apriori(dataMat)
print(L)
print(supportData)
apriori.generateRules(L, supportData, 0.5)
import apriori

apriori.apriori("75000-out1.csv", 0.01)
import apriori as ap

dataSet = ap.loadDataSet()
#print dataSet
C1 = ap.createC1(dataSet)
#print C1
D = map(set, dataSet)
#print D
L1, suppData0 = ap.scanD(D, C1, 0.5)
#print suppData0
L, S = ap.apriori(D, 0.5)
#print L
print L
List = ap.generateRules(L, S, minConf=0.4)
print List
''' Analyse species itemsets '''
import argparse
import joblib
import pandas as pd

import apriori
import apriori_sequential as asq
import helpers

parser = argparse.ArgumentParser(description='Convert Halias RDF dataset for data mining')
parser.add_argument('minsup', help='Minimum support', nargs='?', type=float, default=0.8)
#parser.add_argument('minconf', help='Minimum confidence', nargs='?', type=float, default=0.8)
args = parser.parse_args()

itemsets = helpers.get_species_itemsets()
all_items = list(set([item for itemset in itemsets for item in itemset]))

print(len(itemsets))
print(len(all_items))

freq_items = apriori.apriori(itemsets, all_items, args.minsup, verbose=True)

print('\nSupport {:.3f} frequent itemsets:\n'.format(args.minsup))
print(len(freq_items))
print(freq_items[-1])

joblib.dump(freq_items, helpers.DATA_DIR + 'freq_species_itemsets_{:.3f}_NEW.pkl'.format(args.minsup))
def map_meaning(raw_meaning):
    strip = re.compile("\'.*\'")
    parsed = []
    for i in raw_meaning:
        parsed.append(strip.search(i).group())
    return dict(zip(range(0, len(raw_meaning)), parsed))


def get_meaning(i, meaning):
    print meaning[i]


# Extract transactions and meanings
transactions = map_transactions(RAW_DATA[0])
meaning = map_meaning(RAW_MEANING[0])

for threshold in np.arange(0.5, 0.25, -0.05):
    itemsets, support = apriori.apriori(transactions.values(), minSupport=threshold)
    print "THRESHOLD: ", threshold
    print len(itemsets), "itemsets of length:"
    print [len(i) for i in itemsets]
    print "\n"

itemset, support = apriori.apriori(transactions.values(), minSupport=0.3)
for threshold in np.arange(0.7, 0.99, 0.05):
    print "THRESHOLD: ", threshold
    rules = apriori.generateRules(itemset, support, minConf=threshold)
    print "\n"


def get_meaning(rule, meaning):
    condition, result = [], []
    for c in rule[0]:
        condition.append(meaning[c])
def test():
    dataSet = apriori.loadDataSet()
    print "DataSet:", dataSet
    L, suppData = apriori.apriori(dataSet)
    rules = apriori.generateRules(L, suppData, minConf=0.5)
    print rules
# coding:utf-8
import apriori

# Find the frequent itemsets and the association rules
dataSet = apriori.loadDataSet()
print(dataSet)
C1 = apriori.createC1(dataSet)
print(C1)
D = map(set, dataSet)
print(D)
L1, suppData0 = apriori.scanD(D, C1, 0.5)
print(L1)
L, suppData = apriori.apriori(dataSet)
print(L)
L, suppData = apriori.apriori(dataSet, minSupport=0.5)
rules = apriori.generateRules(L, suppData, minConf=0.7)
print rules
rules = apriori.generateRules(L, suppData, minConf=0.5)
print rules
def test2():
    mushDataSet = [line.split() for line in open('mushroom.dat').readlines()]
    L, suppData = apriori.apriori(mushDataSet, minSupport=0.3)
    for item in L[1]:
        if item.intersection('2'):
            print item
# -*- coding: utf-8 -*-
"""
Module implementing Dialog.
"""
import sys
from PyQt4 import QtGui
from PyQt4 import QtCore
from PyQt4.Qt import *
from Ui_mainWindow import Ui_MainWindow
import jobDB
import apriori
import re
from Login import login

apr = apriori.apriori()
db = jobDB.jobDB()


# Displayed in the list content area
class CenterDelegate(QtGui.QItemDelegate):
    def __init__(self, parent=None):
        QtGui.QItemDelegate.__init__(self, parent)

    def paint(self, painter, option, index):
        painter.save()
        #painter.drawText(option.rect, Qt.AlignCenter, index.data(Qt.DisplayRole).toString())
        painter.drawText(option.rect, Qt.TextWordWrap | Qt.AlignHCenter,
                         index.data(Qt.DisplayRole).toString())
        painter.restore()
# This code is supporting material for the book
# Building Machine Learning Systems with Python
# by Willi Richert and Luis Pedro Coelho
# published by PACKT Publishing
#
# It is made available under the MIT License

from apriori import apriori, association_rules

from gzip import GzipFile

# Load the data as a two-dimensional list, e.g. [[1, 3], [22, 1]]:
# the first dimension is the basket, the second the items it contains.
dataset = [[int(tok) for tok in line.strip().split()]
           for line in GzipFile('retail.dat.gz')]

freqsets, support = apriori(dataset, 80, maxsize=16)
rules = list(association_rules(dataset, freqsets, support, minlift=30.0))
rules.sort(key=(lambda ar: ar.lift), reverse=True)
for ar in rules:
    print('{} -> {} (lift = {:.4})'
          .format(set(ar.antecendent), set(ar.consequent), ar.lift))
#rules = generateRules(L, suppData, minConf=0.7)
#print 'rules:\n', rules
with open("xss-train.txt") as f:
    for line in f:
        # /discuz?q1=0&q3=0&q2=0%3Ciframe%20src=http://xxooxxoo.js%3E
        index = line.find("?")
        if index > 0:
            line = line[index + 1:len(line)]
            #print line
            tokens = re.split('\=|&|\?|\%3e|\%3c|\%3E|\%3C|\%20|\%22|<|>|\\n|\(|\)|\'|\"|;|:|,|\%28|\%29', line)
            #print "token:"
            #print tokens
            myDat.append(tokens)
f.close()
L, suppData = apriori(myDat, 0.15)
rules = generateRules(L, suppData, minConf=0.6)
#print 'rules:\n', rules

# -*- coding:utf-8 -*-
import sys
import urllib
import urlparse
import re
from hmmlearn import hmm
import numpy as np
from sklearn.externals import joblib
import HTMLParser
import nltk

# minimum length of a parameter value to be processed
# This code is supporting material for the book
# Building Machine Learning Systems with Python
# by Willi Richert and Luis Pedro Coelho
# published by PACKT Publishing
#
# It is made available under the MIT License

from apriori import apriori, association_rules

from gzip import GzipFile

# Load dataset
dataset = [[int(tok) for tok in line.strip().split()]
           for line in GzipFile('retail.dat.gz')]

freqsets, support = apriori(dataset, 80, maxsize=16)
rules = list(association_rules(dataset, freqsets, support, minlift=30.0))
rules.sort(key=(lambda ar: -ar.lift))
for ar in rules:
    print('{} -> {} (lift = {:.4})'
          .format(set(ar.antecendent), set(ar.consequent), ar.lift))
# Import libraries
import sys

sys.path.append('../chapter4')
import pandas as pd
from graphviz import Digraph
import apriori

# Define the data file
fileName = 'association.txt'

# Run the association analysis with the custom apriori module
minS = 0.1  # minimum support threshold
minC = 0.38  # minimum confidence threshold
dataSet = apriori.createData(fileName)  # get the formatted data set
L, suppData = apriori.apriori(dataSet, minSupport=minS)  # rules that satisfy the minimum support
rules = apriori.generateRules(fileName, L, suppData, minConf=minC)  # rules that satisfy the minimum confidence

# Evaluate the association results
model_summary = 'data record: {1} \nassociation rules count: {0}'  # number of records and of rules above the thresholds
print(model_summary.format(len(rules), len(dataSet)))  # formatted output via str.format
df = pd.DataFrame(
    rules,
    columns=['item1', 'item2', 'instance', 'support', 'confidence', 'lift'])  # data frame of frequent rules
df_lift = df[df['lift'] > 1.0]  # keep only rules with lift > 1
print(df_lift.sort('instance', ascending=False))  # print the sorted data frame

# Visualize the association results
dot = Digraph()  # create a directed graph
import apriori

transactions = [('elma', 'muz', 'dondurma', 'simit'),
                ('elma', 'muz', 'simit'),
                ('yumurta', 'simit'),
                ('yumurta', 'erik'),
                ('elma', 'muz'),
                ('elma', 'muz', 'yumurta')]

print(apriori.apriori(transactions))
import os
import django
import sys

pro_dir = os.getcwd()
sys.path.append(pro_dir)
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "BioDesigner.settings")

import fpTree
import igemRecomdData
import apriori
from design.models import team_parts


def getPartData():
    result = list()
    tList = team_parts.objects.all().distinct().values_list('team_id', flat=True)[:100]
    for t in tList:
        pList = team_parts.objects.filter(team_id=t).values_list('part_id', flat=True)
        result.append(pList)
    #for i in result:
    #    print i
    return result


if __name__ == '__main__':
    django.setup()
    l, m = apriori.apriori(getPartData())
    print m
    print l
print dataSet
C1 = apriori.createC1(dataSet)
print "C1"
print C1
D = map(set, dataSet)
print "D"
print D
L1, suppData0 = apriori.scanD(D, C1, 0.5)
print "L1"
print L1
print "suppData0"
print suppData0
L, suppData = apriori.apriori(dataSet, minSupport=0.5)
print "L"
print L
print "suppData"
print suppData
rules = apriori.generateRules(L, suppData, minConf=0.7)
print "rules"
print rules
rules = apriori.generateRules(L, suppData, minConf=0.5)
print "rules"
print rules
apriori.NUM_CORES = 1
MINSUP = args.minsup

itemsets = helpers.read_observation_basket(helpers.DATA_DIR + 'observation.basket')
all_items = list(set([item for itemset in itemsets for item in itemset]))

print(len(itemsets))
print(len(all_items))
#print(itemsets[:1])

print('\nSupport {:.3f} frequent itemsets:\n'.format(MINSUP))

freq_items = apriori.apriori(itemsets, all_items, MINSUP, verbose=True)
print(freq_items[-1])
print(len(freq_items))

joblib.dump(freq_items, helpers.DATA_DIR + 'freq_items_{:.3f}.pkl'.format(MINSUP))

ruler = RuleGenerator(itemsets, freq_items)
rules = ruler.rule_generation(0.5)  # , fixed_consequents=[('varis',)])
print(len(rules))

joblib.dump(rules, helpers.DATA_DIR + 'freq_rules_{:.3f}.pkl'.format(MINSUP))

#for (rule, conf) in rules:
# The apriori principle reduces the number of candidate itemsets we may care about:
# if an itemset is frequent, then all of its subsets are frequent as well.
# Conversely, if an itemset is infrequent, then all of its supersets are infrequent too.

# Question 1: when an itemset has three elements, why does the rule generation only
# compute rules of the form 1 item -> 2 items and not 2 items -> 1 item?

import apriori
from votesmart import votesmart

dataSet = apriori.loadDataSet()
#C1 = apriori.createC1(dataSet)
#print ("C1 is %s" % C1)
#D = map(set, dataSet)
#print ("%r" % D)
#L1, suppData0 = apriori.scanD(list(D), list(C1), 0.5)
#print (L1)
#print (suppData0)

L, suppData = apriori.apriori(dataSet, 0.5)
print ("L is", L)
print ("suppData is", suppData)
#L is [[frozenset({1}), frozenset({3}), frozenset({2}), frozenset({5})], [frozenset({3, 5}), frozenset({1, 3}), frozenset({2, 5}), frozenset({2, 3})], [frozenset({2, 3, 5})], []]
#suppData is {frozenset({5}): 0.75, frozenset({3}): 0.75, frozenset({2, 3, 5}): 0.5, frozenset({3, 5}): 0.5, frozenset({2, 3}): 0.5, frozenset({2, 5}): 0.75, frozenset({1}): 0.5, frozenset({1, 3}): 0.5, frozenset({2}): 0.75}

# Association rule mining
rules = apriori.generateRules(L, suppData, 0.7)
print ("rules is ", rules)
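# A small sketch of the pruning idea described above (minimal example, not part
# of the original script): if an itemset such as {2, 3} were infrequent, every
# superset like {2, 3, 5} could be skipped without ever counting its support.
def can_skip(candidate, infrequent_itemsets):
    """Return True if any known infrequent itemset is a subset of `candidate`."""
    return any(bad.issubset(candidate) for bad in infrequent_itemsets)

infrequent = [frozenset({2, 3})]
assert can_skip(frozenset({2, 3, 5}), infrequent)      # superset of an infrequent set
assert not can_skip(frozenset({1, 3, 5}), infrequent)  # no infrequent subset known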
#!/usr/bin/env python
# encoding: utf-8
import apriori
import codecs

f = codecs.open('/home/will/data/search_result_cn_phone.txt', 'r', 'utf-8')
id_word = {}
for x in f:
    temp = x.split('\t')
    if len(temp) == 4:
        temp1 = temp[1].split(',')
        for y in temp1:
            if y not in id_word:
                id_word[y] = [temp[0]]
            else:
                id_word[y].append(temp[0])
'''
with codecs.open('/home/will/data/temp.txt', 'w', 'utf-8') as wf:
    for key, value in id_word.iteritems():
        wf.write(str(key) + '\t' + '\t'.join(s for s in value) + '\n')
'''
l, support_data = apriori.apriori(id_word.values(), minSupport=0.001)
print '#########################################################'
print l
def test_apriori(self):
    result_dict = apriori.apriori(small_trans, min_sup)
    self.assert_small_trans_result(result_dict)
support = 0.4

loadText.importFromFile('spanish_db.txt')
dataset = loadText.rawPriori
#print dataset

C1 = apriori.createC1(dataset)
#print 'C1', C1
D = map(set, dataset)
#print 'D', D
L1, support_data = apriori.scanD(D, C1, support)
#print 'L1', L1
#print 'support_data', support_data

k_length = 2
transactions = apriori.aprioriGen(L1, k_length)
#print 'transactions', transactions
#print '\n*** *** ***'

L, support_data = apriori.apriori(dataset, support)
#print 'L', L
#print 'support_data', support_data

rules = apriori.generateRules(L, support_data, min_confidence=0.7)
#print 'rules', rules
ruleDict = apriori.generateRuleDict(rules)
'''
print 'ruleDict', ruleDict
print '*** *** ***'
'''
print 'keys', ruleDict.keys()
print '*** *** ***'
    Read a document given its identifier.
    """
    s = ""
    with codecs.open("documentos_keywords/" + str(id) + ".txt", "r", "utf-8-sig") as f:
        s = f.read()
    return s.split()


def cargar_datos(N):
    """
    Load N documents for the experiments.
    """
    data = []
    for i in range(1, N + 1):
        data.append(leer_documento(i))
    return data


if __name__ == "__main__":
    # Read the input arguments
    N = int(sys.argv[1])
    support = float(sys.argv[2])

    # Load the data
    dataset = cargar_datos(N)

    # Apply the apriori algorithm
    L, support_data = apriori.apriori(dataset, minsupport=support)

    # Generate the rules
    apriori.generateRules(L, support_data, min_confidence=0.0)
import apriori

dataSet = apriori.loadDataSet()
L, supportData = apriori.apriori(dataSet, minSupport=0.1)
print "[result]-----------------------------------------"
rules = apriori.generateRules(L, supportData, minConf=1.0)
# The current data-set isn't in transactional format. To convert it into a
# transactional data-set, we use the following snippet of code:
basket_str = ""
for rowNum, row in accident_data.iterrows():
    # Break lines
    if rowNum != 0:
        basket_str = basket_str + "\n"
    # Add the rowid as the first column
    basket_str = basket_str + str(rowNum)
    # Add columns
    for colName, col in row.iteritems():
        if colName != 'Accident_Index':
            basket_str = basket_str + "," + colName + "=" + str(col)
#print basket_str

basket_file = open("accidents_basket.csv", "w")
basket_file.write(basket_str)
basket_file.close()

import csv

with open("accidents_basket.csv", "rb") as f:
    reader = csv.reader(f)
    my_list = list(reader)
#my_list

L, supportData = apriori.apriori(my_list, 0.6)
f_rules = apriori.generateRules(L, supportData, 0.6)
for row in f_rules:
    print list(row[0]), " => ", list(row[1]), row[2]
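# An alternative sketch that builds the same accidents_basket.csv with csv.writer
# instead of string concatenation (assumes the same accident_data frame as above;
# quoting may differ slightly for values that contain commas).
import csv

with open("accidents_basket.csv", "wb") as out:
    writer = csv.writer(out)
    for rowNum, row in accident_data.iterrows():
        items = [str(rowNum)] + ["%s=%s" % (colName, col)
                                 for colName, col in row.iteritems()
                                 if colName != 'Accident_Index']
        writer.writerow(items)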