forked from VinothRajasekar/NaiveBayes
-
Notifications
You must be signed in to change notification settings - Fork 0
/
amlnaivebayes.py
executable file
·184 lines (153 loc) · 7.6 KB
/
amlnaivebayes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
# This file was created starting the cookie2.py from Think Bayes
# That code implements a simple Naive Bayes model where there is only one feature, namely
# the flavor of the cookie that was selected from a bowl.
# Refer to that file for the original version of this code.
# Here I will document how I have modified it for our assignment
# Note that you will make changes to two functions
# You will modify the condprob function that is used to compute conditional probabilities
# You will also modify the liklihood function
# Specific instructions about how to do the modifications are given in the code and comments
# for those functions.
# You will turn in your modified version of this file as well as a text file with the output
# that was printed when you ran this.
from thinkbayes import Pmf
import csv
from collections import defaultdict
# here are two global variables you can play with once you get the code working
# classval is the name of the feature that you will predict
# smoothing is a flag that indicates whether you are using smoothing or not
# (0 = no smoothing, 1 = smoothing; main() toggles it between test runs)
classval = 'play'
smoothing = 0
# These global variables keep track of the counts for each feature value in connection with
# each classvalue (in countdict) as well as the full list of feature values for each feature
# in the order in which they were encountered in the data file (in featuredict)
# featlist is the full list of features in the order in which the corresponding columns
# appear in the data file
# classpos is the position in the feature list where the class value is found
# countdict maps class values to instance counts and 'feat+val+class' keys
# (built by conc) to co-occurrence counts; both are filled in by read_data()
featuredict = defaultdict(list)
countdict = defaultdict(int)
classpos = 0
featlist = list()
# Build the countdict lookup key by gluing the three tags together with '+'.
def conc (tag1, tag2, tag3):
    """Return the countdict key '<tag1>+<tag2>+<tag3>'."""
    return tag1 + '+' + tag2 + '+' + tag3
# this function reads weather.csv and creates featuredict, countdict, and featlist
# note that this is one of the functions you will need to edit in order to get smoothing to work
# properly when you add one new unobserved feature to the dataset
def read_data ():
    """Read weather.csv and build the global count tables.

    Populates:
      featlist    -- feature names in data-file column order (header row)
      classpos    -- index of the class column (the one named classval)
      featuredict -- feature name -> list of observed values, first-seen order
      countdict   -- class value -> instance count, plus
                     'feat+val+class' keys -> co-occurrence counts
    """
    global classpos
    global countdict
    global featuredict
    global featlist
    pos = 0
    # csv.reader needs a text-mode file opened with newline='' on Python 3;
    # the original 'rb' binary mode was the Python 2 idiom and raises a
    # TypeError ("iterator should return strings, not bytes") under Python 3.
    with open('weather.csv', 'r', newline='') as csvfile:
        datareader = csv.reader(csvfile, delimiter=',', quotechar='|')
        for row in datareader:
            pos += 1
            if pos == 1:
                # Header row: record the feature names and find the class column.
                featlist = row
                for x in range(len(row)):
                    if row[x] == classval:
                        classpos = x
                    featuredict[row[x]] = list()
            else:
                # Data row: count the class value and each feature/value pair.
                localclassval = row[classpos]
                countdict[localclassval] += 1
                for x in range(len(featlist)):
                    if row[x] in featuredict[featlist[x]]:
                        countdict[conc(featlist[x], row[x], localclassval)] += 1
                    else:
                        # First sighting of this value for this feature.
                        featuredict[featlist[x]].append(row[x])
                        countdict[conc(featlist[x], row[x], localclassval)] = 1
# this function computes the conditional probability of the feature called feat having
# the value featval given that the class value is classval
def condprob (classval, feat, featval):
    """Return P(feat == featval | class == classval).

    When the global `smoothing` flag is set, applies Laplace (add-one)
    smoothing: each value's count is incremented by one and the denominator
    grows by the number of known values for the feature, so a feature value
    never seen in the training data gets a small nonzero probability
    instead of zero.
    """
    total = 0  # number of training instances with class value == classval
    val = 0    # of those, how many had feat == featval
    for fval in featuredict[feat]:
        count = countdict[conc(feat, fval, classval)]
        total += count
        if fval == featval:
            val = count
    if smoothing:
        # Laplace smoothing: +1 for this value, +1 per known value overall.
        val += 1
        total += len(featuredict[feat])
    # Guard against dividing by zero when there are no counts at all (e.g.
    # an unknown feature with smoothing off); a feature value that never
    # occurred in the dataset then simply has probability 0.
    if total == 0:
        return 0.0
    return float(val) / total
# this is a modification on the Cookie class that is set up for the play tennis data
class Weather(Pmf):
    """A map from whether you play tennis or not to a probability."""

    def __init__(self, hypos):
        """Initialize the prior distribution over class values.

        hypos: the possible class values (whether you play tennis or not)
        """
        Pmf.__init__(self)
        for hypo in hypos:
            # Prior probability is proportional to the class frequency
            # observed in the training data.
            self.Set(hypo, countdict[hypo])
        self.Normalize()

    def Update(self, data):
        """Update the PMF with new data.

        data: feature values for outlook, temperature, humidity, and windy
        """
        total = 0
        for hypo in self.Values():
            like = self.Likelihood(data, hypo)
            total += like
            self.Mult(hypo, like)
        # Only renormalize when at least one hypothesis got nonzero
        # likelihood; otherwise Normalize would divide by zero.
        if total > 0:
            self.Normalize()

    def Likelihood(self, data, hypo):
        """The likelihood of the data under the hypothesis.

        data: feature values for outlook, temperature, humidity, and windy
        hypo: whether you play tennis or not

        Under the Naive Bayes independence assumption the likelihood is the
        product of the conditional probabilities of every feature value
        given the class. The class column itself is skipped, since its slot
        in data holds the '?' placeholder for the value being predicted.
        """
        like = 1
        for i in range(len(featlist)):
            if i == classpos:
                continue
            like *= condprob(hypo, featlist[i], data[i])
        return like
# This function computes the probability for each class value for the data point given
def test_instance (hypos, data):
    """Build a Weather PMF over hypos, update it with the given instance,
    and print the resulting probability of each class value."""
    pmf = Weather(hypos)
    pmf.Update(data)
    print ("--------------------------")
    print (data)
    # Report which mode produced these numbers.
    print ("Smoothing" if smoothing else "No Smoothing")
    for hypo, prob in pmf.Items():
        print (hypo, prob)
# This function implements the instructions for the assignment.
# It reads the play tennis data, computes all the counts, and then
# evaluates the probability for each possible class value with and
# without smoothing for each of the 4 test instances given in the
# assignment
def main():
    """Read the play-tennis data, then score the four assignment test
    instances, first without smoothing and then with it."""
    global smoothing
    read_data()
    hypos = featuredict[classval]
    instances = [
        ['overcast','hot','normal','TRUE','?'],
        ['rainy','hot','high','FALSE','?'],
        ['overcast','cool','normal','TRUE','?'],
        ['rainy','mild','low','FALSE','?'],
    ]
    # First pass without smoothing, second pass with smoothing.
    for flag in (0, 1):
        smoothing = flag
        for instance in instances:
            test_instance(hypos, instance)
if __name__ == '__main__':
    main()