Example #1
def main(argv):
    # parse input arguments from the argv list passed by the caller
    filename = argv[1]
    minsupp = float(argv[2])
    minconf = float(argv[3])
    data = apriori(filename, minsupp, minconf)
    data.run()
Example #2
def mapper(record):
    # run Apriori locally on this record, then emit each
    # non-empty frequent itemset with a count of 1
    local_itemsets = apriori(record, False)
    for itemsets in local_itemsets:
        if itemsets:
            for itemset in itemsets:
                if itemset:
                    map_reduce_obj.emit_intermediate(tuple(itemset), 1)
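The mapper emits (itemset, 1) pairs, so the matching reducer just sums the counts per itemset. A minimal sketch, assuming the same framework interface as above (emit_intermediate on the map side, emit on the reduce side; map_reduce_obj and the min_count threshold are assumptions, not part of the original):

def reducer(key, list_of_values):
    # key is a tuple(itemset); sum the 1s emitted by the mappers
    total = sum(list_of_values)
    min_count = 2  # assumed global support threshold
    if total >= min_count:
        map_reduce_obj.emit((key, total))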
Example #4
from apriori import *

def main():
    dataset = loadDataSet()
    print(dataset)

    L, suppData = apriori(dataset)
    print(L)

    bigRuleList = generateRules(L, suppData, 0.5)
Example #5
def processApriori(data, minSupport=0.5, minConf=0.5):
    '''
    data: the dataset, as a list of transactions
    minSupport: minimum frequency for a word combination
    minConf: minimum rule confidence, defaults to 0.5
    '''
    L, suppData = apriori(data, minSupport)
    rules = generateRules(L, suppData, minConf)
    return L, suppData, rules
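A hypothetical call, assuming apriori and generateRules follow the usual (L, suppData) / rule-triple signatures used throughout these examples (the toy transactions are made up for illustration):

docs = [['bread', 'milk'], ['bread', 'beer'], ['bread', 'milk', 'beer']]
L, suppData, rules = processApriori(docs, minSupport=0.5, minConf=0.6)
print(rules)  # list of (antecedent, consequent, confidence) triples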
Example #6
def mapper(record):

    # call apriori()
    candidates = apriori(record, False)

    # send the local frequent itemsets to the intermediate layer
    for i in candidates:
        for c in i:
            mr.emit_intermediate(tuple(c), 1)
Example #7
def entity_recognition(data: pd.DataFrame):
    paragraphs = []
    for index, row in data.iterrows():
        para = str(row['content'])
        paragraphs.append(para)
    term_data = extract_entity(paragraphs)
    m, n = apriori(term_data, minSupport=0.001)
    rules = generateRules(m, n, minConf=0.01)
    print(rules)
Example #8
def runapriori(file, delim, support):
    x = dataprocess(file, delim).tuplemaker()

    y = apriori(x, support)
    y.frequent1items()
    for n in range(1, len(x[0]) - 1):
        y.merge(n)
        y.prune(n + 1)
    print(y.frequents)
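A hypothetical invocation, assuming a comma-delimited transaction file (the filename and threshold here are made up):

runapriori('transactions.csv', ',', 0.4)  # prints all frequent itemsets at support >= 0.4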
Example #9
def test_apriori(self):
    transactions = [
        ['A', 'B', 'D', 'E'],
        ['C', 'A', 'D', 'B', 'F'],
        ['D', 'A', 'H', 'B', 'C'],
        ['E', 'F', 'B', 'C']
    ]
    rules = apriori(transactions)
    self.assertIn(
        (set(['A']), set(['B']), 1),
        rules
    )
    self.assertIn(
        (set(['B']), set(['A', 'D']), 0.75),
        rules
    )
    self.assertIn(
        (set(['B']), set(['E']), 0.5),
        apriori(transactions, min_supp=2, confidence=0.5)
    )
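The expected confidences can be checked by hand from the four transactions above, using conf(X -> Y) = count(X u Y) / count(X). A quick standalone verification in plain Python, independent of the apriori implementation under test:

transactions = [set('ABDE'), set('CADBF'), set('DAHBC'), set('EFBC')]

def confidence(antecedent, consequent):
    # count transactions containing a given itemset
    count = lambda s: sum(1 for t in transactions if s <= t)
    return count(antecedent | consequent) / count(antecedent)

print(confidence({'A'}, {'B'}))       # 3/3 = 1.0
print(confidence({'B'}, {'A', 'D'}))  # 3/4 = 0.75
print(confidence({'B'}, {'E'}))       # 2/4 = 0.5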
Example #10
def categoryExp(category):
    ids = cur.execute("Select business_id, count(business_id) from business_category group by business_id having count(business_id) > 2 \
        and category_name = ?",(category,)).fetchall()
    ids = [i[0] for i in ids]
    categories_list = []
    for i in ids:
        categories = cur.execute("SELECT category_name from \
                business_category where business_id = ? and category_name <> ?",(i,category)).fetchall()
        categories = [c[0] for c in categories]
        categories_list.append(categories)
    # chain = itertools.chain(*categories_list)
    # d = Counter(list(chain))
    # for k, v in d.iteritems():
        # print k, v
    L, suppData = apriori(categories_list, minSupport=0.01)
    rules = generateRules(L, suppData, minConf=0.5)
    for rule in rules:
        print(rule)
Example #11
def get(self):
    user_id = self.request.cookies.get('user_id')
    if user_id:
        result = check_secure_val(user_id)
        if result:
            user = Customer.get_by_id(int(result))
            cursor = db.GqlQuery("SELECT * FROM Cart")
            total = 0
            count = 0
            for c in cursor:
                if c.email == user.email:
                    total += c.price
                    count += 1
            greet = '<a href="/profile">Hello, ' + user.name + '</a>'
            logout = '<a href="/logout">LOGOUT</a>'
            email = user.email
            cursor = db.GqlQuery("SELECT * FROM Cart WHERE email='%s'" % email)
            inputitem = []
            for c in cursor:
                pid = c.product_id.key().id()
                inputitem.append(pid)
            dataset = load_dataset()
            minsupport = 0.5
            min_confidence = 0.5
            l, support_data = apriori(dataset, minsupport)
            rules = generateRules(l, support_data, min_confidence)
            recommend = []
            ch = frozenset(inputitem)
            for result in rules:
                if result[0] == ch:
                    for temp in list(result[1]):
                        if temp not in recommend:
                            recommend.append(temp)
            self.render("cart.html", error="Empty Cart", count=count, greet=greet, logout=logout, cursor=cursor, user=user, total=total, recommend=recommend)
        else:
            self.redirect("/login")
    else:
        self.redirect("/login")
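Note that result[0] == ch only recommends when the cart matches a rule antecedent exactly. A looser variant (a sketch, assuming the (antecedent, consequent, confidence) rule triples used above) would recommend from any rule whose antecedent is contained in the cart:

for antecedent, consequent, conf in rules:
    if antecedent.issubset(ch):
        for temp in consequent:
            if temp not in recommend:
                recommend.append(temp)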
Example #12
def detect_reverse_ctl(stream):
    fp.write("\n")
    fp.write("stream_total_time " + str(stream[-1][0] - stream[0][0]))
    fp.write("\n")
    stream_no_ack = del_ack(stream)
    if not stream_no_ack:
        return
    # printstream(stream_no_ack)
    up_down_no_ack = obj2bit(stream_no_ack, 1)
    # packetlen = pktlen2bit(stream)
    timeline_no_ack = obj2time(stream_no_ack, 0)

    instant_res_intra = 0
    pattern_index_intra = []
    instant_res_out = 0
    pattern_index_out = []

    # instant response
    # not_instant_res_intra,instant_res_intra,
    # pattern_intra = [[up_down,packetlen],["011","101"]]

    # pattern_intra = [[up_down_no_ack],["0111"]]
    # pattern_index_intra = pattern_intersection(pattern_intra)
    # instant_res_intra,not_instant_intra =  instant_response(timeline_no_ack,pattern_index_intra,"intra")
    # if not_instant_intra:
    # 	fp.write("human inside!\n")
    # pattern_out = [[up_down_no_ack],["1000"]]
    # pattern_index_out = pattern_intersection(pattern_out)
    # instant_res_out,not_instant_out =  instant_response(timeline_no_ack,pattern_index_out,"out")
    # if not_instant_out:
    # 	fp.write("human outside!\n")

    # pattern_out = [[up_down,packetlen],["100","101"]]

    # server first
    up_down = obj2bit(stream, 1)
    flag_s = obj2bit(stream, 6)
    flag_f = obj2bit(stream, 7)
    flag_p = obj2bit(stream, 5)
    pattern_server = [[up_down, flag_s, flag_f, flag_p],
                      ["010", "100", "000", "001"]]
    pattern_index_server = pattern_intersection(pattern_server)
    server_first = 0
    if pattern_index_server:
        # print "Server first push packets!"
        fp.write("port:" + str(stream[0][2]) + "Server first push packets!\n")
        server_first = 1

    # fixed time & length heartbeat
    count_packet = {}
    packetlist_no_ack = obj2time(stream_no_ack, 4)
    # print packetlist_no_ack
    up_down_no_ack_int = obj2time(stream_no_ack, 1)

    # simple pure 0 pure 1
    fp.write("updown_no_ack\n")
    fp.write(up_down_no_ack + "\n")
    serial_long = detect_long_constant(up_down_no_ack, 2)
    # count_packet = packet_count_with_up_down(packetlist_no_ack,up_down_no_ack_int)
    # big_packets = detect_big_packets(up_down_no_ack_int,packetlist_no_ack,5)

    # fp.write("".join(serial_long)+"\n")
    packetlist_no_ack_signed = signed_packets(up_down_no_ack_int,
                                              packetlist_no_ack)
    count_packet_signed = packet_count(packetlist_no_ack_signed)
    if len(set(packetlist_no_ack)) > 20:
        fp.write("too many packets length!\n")
    up_down_slice, packetlist_slice, timeline_slice = time_slice(
        timeline_no_ack, up_down_no_ack, packetlist_no_ack_signed)
    up_down_slice_array = []
    for x in up_down_slice:
        up_down_slice_array.append(list(x))
    print "up_down_frequent itemset", apriori(up_down_slice_array, 0.7)
    print "\n"
    print "packet_length_list itemset", apriori(packetlist_slice, 0.7)
    print "\n"
    fp.write("up_down_frequent itemset" +
             str(apriori(up_down_slice_array, 0.7)) + "\n")
    fp.write("packet_length_list itemset" +
             str(apriori(packetlist_slice, 0.7)) + "\n")
Example #13
import sys, time
from document import *
from apriori import *

if __name__ == '__main__':
    start = time.time()
    f = open(sys.argv[1], 'r')
    doc = document(f)
    apr = apriori(doc.txns)
    apr.train()
    end = time.time()
    print('Time taken for Training: {}'.format(end - start))

    start = time.time()
    apr.evaluate(doc.txns)
    end = time.time()
    print('Time taken for Testing: {}'.format(end - start))
Example #14
#coding=utf-8

from apriori import *

dataSet = loadDataSet()
print('Test data:', dataSet)
L, suppData = apriori(dataSet, minSupport=0.5)
print('Candidate itemsets:', L)
print('Support table:', suppData)
print('Association rules:')
rules = generateRules(L, suppData, minConf=0.7)
print(rules)


# Example: find the common features of poisonous mushrooms. Samples containing
# feature value 2 are poisonous, so we only need the frequent itemsets containing 2.
# mushDataSet = [line.split() for line in open('data/mushroom.dat').readlines()]
# L, suppData = apriori(mushDataSet, minSupport=0.3)
# for item in L[3]:
#     if item.intersection('2'):
#         print(item, 'conf', suppData[item])
Example #15
from apriori import *

mushDataSet = [line.split() for line in open('mushroom.dat').readlines()]
L, suppData = apriori(mushDataSet, 0.3)

for item in L[3]:
    if item.intersection('2'):
        print(item)
Example #17
def apriori(self, dataList, min_sup, min_conf):
    ap = apriori(dataList, min_sup, min_conf)
    ap.start()
    freq = ap.getOutCome()  # frequent itemsets
    rules = rule_gen(freq, len(dataList), min_conf)
    return freq, rules
Example #18
#!/usr/bin/env python

# encoding: utf-8
import sys
from apriori import *

suporte = 0.2
confianca = 0.8 

if __name__ == '__main__':

    path = sys.argv[1:]
    if len(path) == 0:
        print('Please provide the input file!')
    else:
        dataset = lerDados(path[0])
        itemsets, dadosSuport = apriori(dataset, suporte)
        regras = regras(itemsets, dadosSuport, confianca)
        print(len(regras))
        escreverResultados(regras)
Example #19
#Test Apriori algorithm.
#Author: Justin Nie
#Date: 2018/2/10

from numpy import *
from apriori import *

dataset = load_simdata()
frequent, support_data = apriori(dataset)
#print(frequent)
#print(support_data)
rules = get_rules(frequent, support_data)
print(rules)


Example #20
from apriori import *
import csv

file_name = 'D:\\各种数据集\\专利数据\\INVT:CITY.xlsx'
lists = csv.reader(open(file_name, 'r', encoding='utf-8-sig'))
# load the data
data = []
for names in lists:
    name_new = []
    for name in names:
        # strip whitespace from the actor data
        name_new.append(name.strip())
    data.append(name_new[1:])

# mine frequent itemsets and association rules
#itemsets, rules = apriori(data, min_support=0.05,  min_confidence=1)
itemsets, rules = apriori(data, min_support=0.5, min_confidence=1)
print('Frequent itemsets:', itemsets)
print('Association rules:', rules)
Example #21
    data_list = line.split(',')
    deck_ids = []

    for entry in data_list:
        if ':' in entry:
            deck_ids.append((entry.split(":"))[3])

    deck_dict = {}

    for row in individuals_log_file:
        deck_dict[row["Individual"]] = row["Deck"].split('*')

    list_of_set_of_elite_decks = []

    for deck_id in deck_ids:
        list_of_set_of_elite_decks.append(set(deck_dict[deck_id]))

    epsilon = len(list_of_set_of_elite_decks) * epsilon_factor
    result = apriori(list_of_set_of_elite_decks, epsilon)

    percent_result = {}
    for item in result:
        percent_result[item[0]] = float(item[1]) / float(
            len(list_of_set_of_elite_decks))

    output_file = open(output_file, 'w')
    for item in percent_result:
        output_file.write(str(item) + ";" + str(percent_result[item]) + "\n")
    output_file.close()

    print(path, " done!")
Example #22
from apriori import *
import time
import numpy as np

# read the training set
with open("./data/agaricus_train.csv", "r") as f:
    dataSet = [line[:-1].split(',') for line in f.readlines()]

# every itemset in L appears in at least 25% of the samples
L, suppData = apriori(dataSet, 0.25)  # the smaller the threshold, the slower

# generate rules, each with confidence of at least 0.6
bigRuleList = generateRules(L, suppData, 0.6)

# rules P -> H, sorted by the size of the antecedent set P
bigRuleList = sorted(bigRuleList, key=lambda x: len(x[0]), reverse=True)

# read the test set
with open("./data/agaricus_test.csv", "r") as f:
    dataSet = [line[:-1].split(',') for line in f.readlines()]
labels = np.array([int(x[0]) for x in dataSet])

scores = []
for line in dataSet:
    tmp = []
    for item in bigRuleList:
        if item[0].issubset(set(line)):
            if "1" in item[1]:
                tmp.append(float(item[2]))
            # since we predict "the probability of being 1", subtract from 1
            if "0" in item[1]:
                tmp.append(1 - float(item[2]))
Example #23
# identify poisonous mushrooms
# In mushroom.dat the first feature marks poisonous vs. edible: the value is 2 if the
# sample is poisonous and 1 if it is edible. The next feature is the cap shape, with
# six possible values encoded as the integers 3-8.
from time import sleep
from apriori import *

mushDataSet = [line.split() for line in open('mushroom.dat').readlines()]
#print(mushDataSet)
L, supportData = apriori(mushDataSet, 0.3)
#
# for item in L[1]:
#     #if item.intersection('2'):
#     if set('2').issubset(item):
#         print(item)
for item in L[3]:
    #if item.intersection('2'):
    if set(['2']).issubset(item):
        print(item)
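L[3] only covers the frequent 4-itemsets; the poisonous label '2' can co-occur in itemsets of every size. A small sketch that scans all levels and also reports support (supportData maps each frequent itemset to its support, as returned above):

for level in L:
    for item in level:
        if '2' in item:
            print(item, 'support:', supportData[item])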
Example #24
from apriori import *

def get_only_dataset_from_file(filename, split_format="\t", type_func=float):
    with open(filename) as datafile:
        words = [line.strip().split(split_format) for line in datafile]
    dataset = [ [type_func(cell) for cell in row] for row in words]
    return dataset


if __name__ == '__main__':
    from minitest import *

    with test_case("poisonous_mushrooms"):
        dataset = get_only_dataset_from_file("mushroom.dataset",
            split_format=" ", type_func=str)
        dataset[0].p()
        with test("some"):
            frequents, support_dict = apriori(dataset, support_ratio_threshold=0.5)
            # frequents.size().pp()
            # frequents.pp()
            # support_dict.size().pp()
            [item for item in frequents[1] if item.intersection('2')].p()
            [item for item in frequents[3] if item.intersection('2')].p()
            # filter(lambda item: item.intersection(2), frequents[1]).pp()
            pass