def main(argv):
    # parse input arguments: data file, minimum support, minimum confidence
    filename = argv[1]
    minsupp = float(argv[2])
    minconf = float(argv[3])
    data = apriori(filename, minsupp, minconf)
    data.run()
def mapper(record):
    # run apriori locally on this chunk and emit every locally frequent itemset
    return_apriori = apriori(record, False)
    for item in return_apriori:
        if item:
            for subitem in item:
                if subitem:
                    map_reduce_obj.emit_intermediate(tuple(subitem), 1)
def main():
    dataset = loadDataSet()
    print dataset
    L, suppData = apriori(dataset)
    print L
    bigRuleList = generateRules(L, suppData, 0.5)
def processApriori(data, minSupport=0.5, minConf=0.5):
    '''
    data: the dataset, as a list of transactions
    minSupport: minimum support for an item combination
    minConf: minimum confidence for a rule, 0.5 by default
    '''
    L, suppData = apriori(data, minSupport)
    rules = generateRules(L, suppData, minConf)
    return L, suppData, rules
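# A minimal call sketch for processApriori, assuming the MLiA-style
# apriori/generateRules helpers used by the other snippets in this section;
# the transaction list below is hypothetical.
transactions = [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]
L, suppData, rules = processApriori(transactions, minSupport=0.5, minConf=0.7)
print(rules)  # each rule is an (antecedent, consequent, confidence) tuple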
def mapper(record):
    # call apriori()
    candidates = apriori(record, False)
    # send local frequent itemsets to the intermediate layer
    for i in candidates:
        for c in i:
            mr.emit_intermediate(tuple(c), 1)
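# Both mappers above emit (itemset, 1) pairs, one per data chunk in which the
# itemset was locally frequent (the SON/partition pattern). A minimal matching
# reducer sketch, assuming the same toy MapReduce framework (mr.emit), sums
# those votes so a driver can keep only globally frequent candidates:
def reducer(key, list_of_values):
    # key: a candidate itemset as a tuple; list_of_values: one 1 per chunk
    mr.emit((key, sum(list_of_values)))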
def entity_recognition(data: pd.DataFrame):
    paragraphes = []
    for index, row in data.iterrows():
        para = str(row['content'])
        paragraphes.append(para)
    term_data = extract_entity(paragraphes)
    m, n = apriori(term_data, minSupport=0.001)
    rules = generateRules(m, n, minConf=0.01)
    print(rules)
def runapriori(file, delim, support):
    x = dataprocess(file, delim).tuplemaker()
    y = apriori(x, support)
    y.frequent1items()
    for n in range(1, len(x[0]) - 1):
        y.merge(n)
        y.prune(n + 1)
    print(y.frequents)
def test_apriori(self):
    transactions = [
        ['A', 'B', 'D', 'E'],
        ['C', 'A', 'D', 'B', 'F'],
        ['D', 'A', 'H', 'B', 'C'],
        ['E', 'F', 'B', 'C']
    ]
    rules = apriori(transactions)
    self.assertIn((set(['A']), set(['B']), 1), rules)
    self.assertIn((set(['B']), set(['A', 'D']), 0.75), rules)
    self.assertIn(
        (set(['B']), set(['E']), 0.5),
        apriori(transactions, min_supp=2, confidence=0.5)
    )
def categoryExp(category):
    # businesses tagged with `category` that carry more than 2 categories in total
    ids = cur.execute(
        "SELECT business_id FROM business_category "
        "WHERE business_id IN (SELECT business_id FROM business_category "
        "WHERE category_name = ?) "
        "GROUP BY business_id HAVING count(business_id) > 2", (category,)).fetchall()
    ids = [i[0] for i in ids]
    categories_list = []
    for i in ids:
        categories = cur.execute(
            "SELECT category_name FROM business_category "
            "WHERE business_id = ? AND category_name <> ?", (i, category)).fetchall()
        categories = [c[0] for c in categories]
        categories_list.append(categories)
    # chain = itertools.chain(*categories_list)
    # d = Counter(list(chain))
    # for k, v in d.iteritems():
    #     print k, v
    L, suppData = apriori(categories_list, minSupport=0.01)
    rules = generateRules(L, suppData, minConf=0.5)
    for rule in rules:
        print rule
def get(self):
    user_id = self.request.cookies.get('user_id')
    if user_id:
        result = check_secure_val(user_id)
        if result:
            user = Customer.get_by_id(int(result))
            cursor = db.GqlQuery("SELECT * FROM Cart")
            total = 0
            count = 0
            for c in cursor:
                if c.email == user.email:
                    total += c.price
                    count += 1
            greet = '<a href="/profile">Hello, ' + user.name + '</a>'
            logout = '<a href="/logout">LOGOUT</a>'
            email = user.email
            cursor = db.GqlQuery("SELECT * FROM Cart WHERE email='%s'" % email)
            inputitem = []
            for c in cursor:
                pid = c.product_id.key().id()
                inputitem.append(pid)
            dataset = load_dataset()
            minsupport = 0.5
            min_confidence = 0.5
            l, support_data = apriori(dataset, minsupport)
            rules = generateRules(l, support_data, min_confidence)
            recommend = []
            ch = frozenset(inputitem)
            for result in rules:
                # recommend consequents of rules whose antecedent matches the whole cart
                if result[0] == ch:
                    for temp in list(result[1]):
                        if temp not in recommend:
                            recommend.append(temp)
            self.render("cart.html", error="Empty Cart", count=count, greet=greet,
                        logout=logout, cursor=cursor, user=user, total=total,
                        recommend=recommend)
        else:
            self.redirect("/login")
    else:
        self.redirect("/login")
def detect_reverse_ctl(stream):
    fp.write("\n")
    fp.write("stream_total_time " + str(stream[-1][0] - stream[0][0]))
    fp.write("\n")
    stream_no_ack = del_ack(stream)
    if not stream_no_ack:
        return
    # printstream(stream_no_ack)
    up_down_no_ack = obj2bit(stream_no_ack, 1)
    # packetlen = pktlen2bit(stream)
    timeline_no_ack = obj2time(stream_no_ack, 0)
    instant_res_intra = 0
    pattern_index_intra = []
    instant_res_out = 0
    pattern_index_out = []
    # instant response (disabled)
    # pattern_intra = [[up_down, packetlen], ["011", "101"]]
    # pattern_intra = [[up_down_no_ack], ["0111"]]
    # pattern_index_intra = pattern_intersection(pattern_intra)
    # instant_res_intra, not_instant_intra = instant_response(timeline_no_ack, pattern_index_intra, "intra")
    # if not_instant_intra:
    #     fp.write("human inside!\n")
    # pattern_out = [[up_down_no_ack], ["1000"]]
    # pattern_index_out = pattern_intersection(pattern_out)
    # instant_res_out, not_instant_out = instant_response(timeline_no_ack, pattern_index_out, "out")
    # if not_instant_out:
    #     fp.write("human outside!\n")
    # pattern_out = [[up_down, packetlen], ["100", "101"]]

    # server first
    up_down = obj2bit(stream, 1)
    flag_s = obj2bit(stream, 6)
    flag_f = obj2bit(stream, 7)
    flag_p = obj2bit(stream, 5)
    pattern_server = [[up_down, flag_s, flag_f, flag_p], ["010", "100", "000", "001"]]
    pattern_index_server = pattern_intersection(pattern_server)
    server_first = 0
    if pattern_index_server:
        # print "Server first push packets!"
        fp.write("port:" + str(stream[0][2]) + " Server first push packets!\n")
        server_first = 1

    #### fixed time & length heartbeat
    count_packet = {}
    packetlist_no_ack = obj2time(stream_no_ack, 4)
    # print packetlist_no_ack
    up_down_no_ack_int = obj2time(stream_no_ack, 1)
    # simple pure-0 / pure-1 runs
    fp.write("updown_no_ack\n")
    fp.write(up_down_no_ack + "\n")
    serial_long = detect_long_constant(up_down_no_ack, 2)
    # count_packet = packet_count_with_up_down(packetlist_no_ack, up_down_no_ack_int)
    # big_packets = detect_big_packets(up_down_no_ack_int, packetlist_no_ack, 5)
    # fp.write("".join(serial_long) + "\n")
    packetlist_no_ack_signed = signed_packets(up_down_no_ack_int, packetlist_no_ack)
    count_packet_signed = packet_count(packetlist_no_ack_signed)
    if len(set(packetlist_no_ack)) > 20:
        fp.write("too many packet lengths!\n")
    up_down_slice, packetlist_slice, timeline_slice = time_slice(
        timeline_no_ack, up_down_no_ack, packetlist_no_ack_signed)
    up_down_slice_array = []
    for x in up_down_slice:
        up_down_slice_array.append(list(x))
    print "up_down_frequent itemset", apriori(up_down_slice_array, 0.7)
    print "\n"
    print "packet_length_list itemset", apriori(packetlist_slice, 0.7)
    print "\n"
    fp.write("up_down_frequent itemset" + str(apriori(up_down_slice_array, 0.7)) + "\n")
    fp.write("packet_length_list itemset" + str(apriori(packetlist_slice, 0.7)) + "\n")
import sys, time
from document import *
from apriori import *

if __name__ == '__main__':
    start = time.time()
    f = open(sys.argv[1], 'r')
    doc = document(f)
    apr = apriori(doc.txns)
    apr.train()
    end = time.time()
    print 'Time taken for Training: {}'.format(end - start)
    start = time.time()
    apr.evaluate(doc.txns)
    end = time.time()
    print 'Time taken for Testing: {}'.format(end - start)
#coding=utf-8
from apriori import *

dataSet = loadDataSet()
print('test data:', dataSet)
L, suppData = apriori(dataSet, minSupport=0.5)
print('frequent itemsets:', L)
print('support table:', suppData)
print('association rules:')
rules = generateRules(L, suppData, minConf=0.7)
print(rules)

# Example: find common features of poisonous mushrooms. Samples whose features
# include the value 2 are poisonous, so we only need frequent itemsets containing 2.
# mushDataSet = [line.split() for line in open('data/mushroom.dat').readlines()]
# L, suppData = apriori(mushDataSet, minSupport=0.3)
# for item in L[3]:
#     if item.intersection('2'):
#         print(item, 'conf', suppData[item])
from apriori import *

mushDataSet = [line.split() for line in open('mushroom.dat').readlines()]
L, suppData = apriori(mushDataSet, 0.3)
for item in L[3]:
    if item.intersection('2'):
        print item
def apriori(self, dataList, min_sup, min_conf):
    ap = apriori(dataList, min_sup, min_conf)
    ap.start()
    freq = ap.getOutCome()  # frequent itemsets
    rules = rule_gen(freq, len(dataList), min_conf)
    return freq, rules
#!/usr/bin/env python
# encoding: utf-8
import sys
from apriori import *

suporte = 0.2
confianca = 0.8

if __name__ == '__main__':
    path = sys.argv[1:]
    if len(path) == 0:
        print 'Please provide an input file!'
    else:
        dataset = lerDados(path[0])
        itemsets, dadosSuport = apriori(dataset, suporte)
        # avoid rebinding the regras() function name with its own result
        lista_regras = regras(itemsets, dadosSuport, confianca)
        print len(lista_regras)
        escreverResultados(lista_regras)
# Test Apriori algorithm.
# Author: Justin Nie
# Date: 2018/2/10

from numpy import *
from apriori import *

dataset = load_simdata()
frequent, support_data = apriori(dataset)
# print(frequent)
# print(support_data)
rules = get_rules(frequent, support_data)
print(rules)
from apriori import *
import csv

# Note: despite the .xlsx extension, the file is read as CSV text here;
# csv.reader cannot parse a real Excel workbook.
file_name = 'D:\\各种数据集\\专利数据\\INVT:CITY.xlsx'
lists = csv.reader(open(file_name, 'r', encoding='utf-8-sig'))

# load the data
data = []
for names in lists:
    name_new = []
    for name in names:
        # strip whitespace from each name
        name_new.append(name.strip())
    data.append(name_new[1:])

# mine frequent itemsets and association rules
# itemsets, rules = apriori(data, min_support=0.05, min_confidence=1)
itemsets, rules = apriori(data, min_support=0.5, min_confidence=1)
print('frequent itemsets:', itemsets)
print('association rules:', rules)
data_list = line.split(',')
deck_ids = []
for entry in data_list:
    if ':' in entry:
        deck_ids.append((entry.split(":"))[3])
deck_dict = {}
for row in individuals_log_file:
    deck_dict[row["Individual"]] = row["Deck"].split('*')
list_of_set_of_elite_decks = []
for id in deck_ids:
    list_of_set_of_elite_decks.append(set(deck_dict[id]))
# convert the fractional support threshold into an absolute transaction count
epsilon = len(list_of_set_of_elite_decks) * epsilon_factor
result = apriori(list_of_set_of_elite_decks, epsilon)
percent_result = {}
for item in result:
    percent_result[item[0]] = float(item[1]) / float(len(list_of_set_of_elite_decks))
output_file = open(output_file, 'w')
for item in percent_result:
    output_file.write(str(item) + ";" + str(percent_result[item]) + "\n")
output_file.close()
print(path, " done!")
from apriori import *
import time
import numpy as np

# load the training set
with open("./data/agaricus_train.csv", "r") as f:
    dataSet = [line[:-1].split(',') for line in f.readlines()]

# every element of L appears in at least 25% of the samples
L, suppData = apriori(dataSet, 0.25)  # the lower the threshold, the slower
# generate rules; each rule has confidence of at least 0.6
bigRuleList = generateRules(L, suppData, 0.6)
# rules are P -> H; sort by the size of the antecedent set P
bigRuleList = sorted(bigRuleList, key=lambda x: len(x[0]), reverse=True)

# load the test set
with open("./data/agaricus_test.csv", "r") as f:
    dataSet = [line[:-1].split(',') for line in f.readlines()]
labels = np.array([int(x[0]) for x in dataSet])
scores = []
for line in dataSet:
    tmp = []
    for item in bigRuleList:
        if item[0].issubset(set(line)):
            if "1" in item[1]:
                tmp.append(float(item[2]))
            # since we predict the probability of "1", subtract from 1
            if "0" in item[1]:
                tmp.append(1.0 - float(item[2]))
# Identify poisonous mushrooms.
# In mushroom.dat the first feature marks poisonous vs edible: the value is 2
# if the sample is poisonous and 1 if edible. The next feature is the cap
# shape, with six possible values encoded as the integers 3-8.
from time import sleep
from apriori import *

mushDatset = [line.split() for line in open('mushroom.dat').readlines()]
# print(mushDatset)
L, supportData = apriori(mushDatset, 0.3)
# for item in L[1]:
#     # if item.intersection('2'):
#     if set('2').issubset(item):
#         print(item)
for item in L[3]:
    # if item.intersection('2'):
    if set(['2']).issubset(item):
        print(item)
from apriori import *

def get_only_dataset_from_file(filename, split_format="\t", type_func=float):
    with open(filename) as datafile:
        words = [line.strip().split(split_format) for line in datafile]
    dataset = [[type_func(cell) for cell in row] for row in words]
    return dataset

if __name__ == '__main__':
    from minitest import *

    with test_case("poisonous_mushrooms"):
        dataset = get_only_dataset_from_file("mushroom.dataset",
                                             split_format=" ", type_func=str)
        dataset[0].p()
        with test("some"):
            frequents, support_dict = apriori(dataset, support_ratio_threshold=0.5)
            # frequents.size().pp()
            # frequents.pp()
            # support_dict.size().pp()
            [item for item in frequents[1] if item.intersection('2')].p()
            [item for item in frequents[3] if item.intersection('2')].p()
            # filter(lambda item: item.intersection(2), frequents[1]).pp()
            pass