/
run.py
178 lines (150 loc) · 6.97 KB
/
run.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
import sqlite3, csv
import collections
import itertools
class AprioriAlgorithm:
    """Mine frequent itemsets and association rules from table APRIORITEST4.

    Each row of APRIORITEST4 holds one id per column (space_id, type_id,
    category_id).  An itemset is a set of such ids, at most one per column.
    Support is the fraction of rows matching every id in the itemset;
    confidence of a rule ``A => b`` is ``support(A | {b}) / support(A)``.

    NOTE: ``cols`` and ``legend`` are keyed by the bare id, so the code
    assumes an id value never appears in more than one column.
    """

    # Column order of APRIORITEST4; also the maximum itemset size.
    _COLUMNS = ("space_id", "type_id", "category_id")

    def __init__(self, minSupport, minConfidence, conn):
        """Load the legend and the distinct ids found in the table.

        minSupport    -- minimum support (fraction) for a frequent itemset.
        minConfidence -- minimum confidence (fraction) for a reported rule.
        conn          -- open sqlite3 connection containing APRIORITEST4.
        """
        self.conn = conn
        self.minSupport = minSupport
        self.minConfidence = minConfidence
        self.uniqueSets = collections.defaultdict(set)  # column -> ids seen
        self.cols = dict()                              # id -> column name
        self.totalCount = 1
        self.output = ''
        self.legend = self.makeLegend()
        self.init()

    @staticmethod
    def _openCsv(path):
        """Open `path` the way the csv module expects on this interpreter."""
        import sys
        if sys.version_info[0] < 3:
            return open(path, 'rb')
        return open(path, 'r', newline='')

    def makeLegend(self):
        """Return a dict mapping each id in Full-Legend.csv to its lowercased name."""
        legend = dict()
        with self._openCsv('Full-Legend.csv') as fin:
            for row in csv.DictReader(fin):
                # Same assignment order as the columns, so a later column
                # wins if an id is repeated across columns.
                for prefix in ('space', 'type', 'category'):
                    raw_id = row[prefix + '_id']
                    if raw_id:
                        legend[int(raw_id)] = row[prefix].lower()
        print("Legend Created\n")
        return legend

    def init(self):
        """Collect the distinct ids per column and the total row count."""
        # Use the connection handed to the constructor, not a module global.
        cursor = self.conn.execute(
            "SELECT " + ", ".join(self._COLUMNS) + " FROM APRIORITEST4")
        for row in cursor:
            for position, column in enumerate(self._COLUMNS):
                self.uniqueSets[column].add(row[position])
                self.cols[row[position]] = column
        cursor = self.conn.execute("SELECT COUNT(*) FROM APRIORITEST4")
        self.totalCount = float(cursor.fetchone()[0])
        print("Initialization Complete...\n")
        print("Total number of entries in Dataset: " + repr(self.totalCount) + "\n")

    def getSupport(self, itemSet):
        """Return the fraction of rows that contain every id in `itemSet`.

        An empty `itemSet` matches every row.  Returns 0.0 when the table
        is empty.  Raises KeyError for an id that was not seen by `init`.
        """
        total = float(self.totalCount)
        if not total:
            return 0.0
        # One id per column; a later id for the same column replaces the
        # earlier one.
        chosen = {}
        for item in itemSet:
            chosen[self.cols[item]] = item
        columns = sorted(chosen)
        query = "SELECT COUNT(*) FROM APRIORITEST4"
        if columns:
            # Column names come from _COLUMNS; values are bound parameters.
            query += " WHERE " + " AND ".join(column + "=?" for column in columns)
        cursor = self.conn.execute(query, [chosen[column] for column in columns])
        return cursor.fetchone()[0] / total

    def nextIteration(self, Lp):
        """Return the frequent itemsets one item larger than those in `Lp`.

        `Lp` is the list of frequent itemsets (sets of ids) of the previous
        size.  Returns a list of sets, empty when `Lp` is empty or no
        candidate reaches the minimum support.
        """
        if not Lp:
            return []
        size = len(Lp[0]) + 1
        previous = set(frozenset(itemset) for itemset in Lp)

        # Join step: union every pair, keep unions of the right size that
        # were not seen before and use each column at most once.
        seen = set()
        candidates = []
        for first, second in itertools.combinations(Lp, 2):
            merged = frozenset(first) | frozenset(second)
            if len(merged) != size or merged in seen:
                continue
            seen.add(merged)
            if len(set(self.cols[item] for item in merged)) != size:
                continue
            candidates.append(set(merged))

        # Prune step: every (size-1)-subset of a candidate must itself be
        # frequent.  Built as a new list so nothing is removed while iterating.
        candidates = [
            candidate for candidate in candidates
            if all(frozenset(subset) in previous
                   for subset in itertools.combinations(candidate, size - 1))
        ]

        L = [candidate for candidate in candidates
             if self.getSupport(candidate) >= self.minSupport]
        print("Generated " + repr(len(L)) + " " + repr(size) + "-sized frequent itemsets")
        return L

    def _describe(self, items):
        """Format ids as ``[column: name, ...]`` using the legend."""
        return '[' + ', '.join(
            cleaner(self.cols[item]) + ": " + self.legend[item] for item in items
        ) + ']'

    def apriori(self):
        """Find all frequent itemsets and rules; write the report to output.txt.

        The report text is also appended to ``self.output``.
        """
        print("---------------Generating Large Itemsets--------------------")
        singles = set().union(*self.uniqueSets.values())
        L = [set([item]) for item in singles
             if self.getSupport([item]) >= self.minSupport]
        print("Generated " + repr(len(L)) + " 1-sized frequent itemsets")
        large = list(L)
        # An itemset holds at most one id per column, so stop at that size.
        while L and len(L[0]) < len(self._COLUMNS):
            L = self.nextIteration(L)
            large.extend(L)
        print("\nGenerated a total of " + repr(len(large)) + " frequent itemsets...\n")

        m = dict()
        for itemset in large:
            m[tuple(itemset)] = self.getSupport(itemset)

        lines = ["Frequent Itemsets (Minimum Support: " + repr(self.minSupport) + ")\n\n"]
        for key in sorted(m, key=m.get, reverse=True):
            lines.append(self._describe(key) + ': Support -> ' + repr(m[key] * 100) + '%\n')
        lines.append('\n')
        self.output += ''.join(lines)

        self.getAssociations(m)
        with open('output.txt', 'w') as f:
            f.write(self.output)

    def getAssociations(self, m):
        """Append every rule meeting the minimum confidence to ``self.output``.

        `m` maps each frequent itemset (as a tuple of ids) to its support.
        For each itemset with more than one id, each id in turn is tried as
        the right-hand side of a rule.
        """
        print("---------------Generating Association Rules-----------------")
        confDict = dict()
        for s in m:
            if len(s) <= 1:
                continue
            itemSet = set(s)
            whole = m[s]
            for i in s:
                item = set([i])
                diff = itemSet.difference(item)
                antecedent = self.getSupport(diff)
                if not antecedent:
                    # Left-hand side never occurs: confidence is undefined.
                    continue
                conf = whole / antecedent
                if conf >= self.minConfidence:
                    rule = self._describe(diff) + " => " + self._describe(item)
                    confDict[rule] = conf
        # Report the threshold this instance was built with, not a global.
        self.output += "Association Rules (Minimum Confidence: " + repr(self.minConfidence) + ")\n\n"
        for key in sorted(confDict, key=confDict.get, reverse=True):
            self.output += key + ": Confidence - " + repr(confDict[key] * 100) + '%\n'
        print("Generated " + repr(len(confDict)) + " association rules\n")
def cleaner(s):
    """Return the part of `s` before its first underscore.

    Raises ValueError when `s` contains no underscore.
    """
    cut = s.index('_')
    return s[:cut]
def create_table(conn):
    """Create table APRIORITEST4 with three NOT NULL INT columns.

    conn -- open sqlite3 connection.  The statement has no IF NOT EXISTS
    clause, so sqlite3 raises an error if the table is already present.
    """
    conn.execute('''CREATE TABLE APRIORITEST4(space_id INT NOT NULL,type_id INT NOT NULL,category_id INT NOT NULL);''')
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("Table created successfully\n")
def insert_from_csv(conn, fileName):
    """Load the space_id, type_id and category_id columns of a CSV into APRIORITEST4.

    conn     -- open sqlite3 connection that already contains the table.
    fileName -- path of a CSV file whose header row names those three columns.

    Commits the inserts.  Raises KeyError if a required column is missing.
    """
    import sys

    # The csv module needs a binary file on Python 2 and a text file opened
    # with newline='' on Python 3.
    if sys.version_info[0] < 3:
        fin = open(fileName, 'rb')
    else:
        fin = open(fileName, 'r', newline='')
    with fin:
        reader = csv.DictReader(fin)
        to_db = [(row['space_id'], row['type_id'], row['category_id']) for row in reader]
    conn.executemany("INSERT INTO APRIORITEST4 (space_id,type_id,category_id) VALUES (?, ?, ?);", to_db)
    conn.commit()
    print("Loaded data in Table from CSV file\n")
def delete_table(conn):
    """Drop table APRIORITEST4 if it exists; a missing table is not an error.

    conn -- open sqlite3 connection.
    """
    conn.execute("DROP TABLE IF EXISTS APRIORITEST4;")
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("\nTable deleted successfully if it ever existed..\n")
if __name__ == "__main__":
    # Interactive entry point: ask for the dataset and thresholds, load the
    # CSV into a scratch table in test.db, run the algorithm, then clean up.
    # The names below stay at module scope so that any code reading them as
    # globals keeps working.

    # raw_input exists only on Python 2; input is its Python 3 equivalent.
    try:
        prompt = raw_input
    except NameError:
        prompt = input

    # An empty answer falls back to the default shown in the prompt.
    fileName = prompt('Enter File Name (Integrated-Dataset.csv by default) : ') or "Integrated-Dataset.csv"
    minSupport = float(prompt('Enter Minimum Support (0.05 by default) : ') or '0.05')
    minConfidence = float(prompt('Enter Minimum Confidence (0.5 by default) : ') or '0.5')

    conn = sqlite3.connect('test.db')
    try:
        delete_table(conn)
        create_table(conn)
        insert_from_csv(conn, fileName)
        apriori = AprioriAlgorithm(minSupport, minConfidence, conn)
        apriori.apriori()
        print("Apriori Algorithm Completed!!")
        delete_table(conn)
    finally:
        # Close the connection even when a step above raises.
        conn.close()
    print("Result stored in output.txt")
# cleaner("category_id")