-
Notifications
You must be signed in to change notification settings - Fork 0
/
NaiveBayesClassifier.py
114 lines (89 loc) · 3.22 KB
/
NaiveBayesClassifier.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
# Naive Bayes Classifier
import nltk
import math
from nltk.corpus import movie_reviews, stopwords
# Build the corpus: each document is a (raw_review_text, category) tuple.
# get our movie reviews from nltk.corpus (reviews stored as tuples (review, class))
documents = [(movie_reviews.raw(fileid), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
classes = movie_reviews.categories()  # ['pos', 'neg']
# Train/dev/test split taken per-category by index.
# NOTE(review): the slices below assume exactly 1000 documents per category
# and that `documents` is ordered category-by-category — confirm against the
# movie_reviews corpus (2000 reviews, 1000 pos / 1000 neg).
trainingSet = documents[100:900] + documents[1100:1900]
devSet = documents[900:1000] + documents[1900:]
testSet = documents[:100] + documents[1000:1100]
def train(trainingSet, classes):
    """Train a multinomial Naive Bayes classifier with add-one smoothing.

    trainingSet: list of (document_text, class_label) tuples.
    classes: list of the class labels appearing in trainingSet.

    Returns a tuple (log_prior, log_likelihood, vocab):
      log_prior[c]         -- log P(c), estimated from document counts
      log_likelihood[c][w] -- log P(w|c) with Laplace (add-one) smoothing,
                              defined for every w in vocab
      vocab                -- set of all word types seen in training
    """
    n = len(trainingSet)  # total number of training documents
    log_prior = {}
    log_likelihood = {}
    bigdoc_dict = {c: "" for c in classes}  # concatenated text per class
    num_docs = {c: 0 for c in classes}      # document count per class

    # Concatenate all documents of each class into one "big document".
    for text, c in trainingSet:
        num_docs[c] += 1
        bigdoc_dict[c] += text + " "

    # Tokenize each big document (lowercased, punctuation trimmed from the
    # edges of each token) and count word frequencies per class; the vocab
    # is shared across classes so smoothing uses one denominator size.
    word_counts = {}
    vocab = set()
    for c in classes:
        counts = {}
        for w in bigdoc_dict[c].lower().split():
            w = w.strip(".,;!?:")
            if w:
                counts[w] = counts.get(w, 0) + 1
        word_counts[c] = counts
        vocab.update(counts)

    # Log priors and smoothed log likelihoods.
    # BUG FIX: the original computed the priors but never computed the
    # likelihoods, never built the vocab, and had no return statement,
    # so callers unpacking its result crashed on None.
    for c in classes:
        log_prior[c] = math.log(num_docs[c] / n)
        denom = sum(word_counts[c].values()) + len(vocab)  # add-one smoothing
        log_likelihood[c] = {
            w: math.log((word_counts[c].get(w, 0) + 1) / denom)
            for w in vocab
        }

    return log_prior, log_likelihood, vocab
def test(testDoc, logPrior, logLikelihood, classes, vocab):
    """Classify a single document with the trained Naive Bayes model.

    testDoc: raw document text.
    logPrior / logLikelihood: as returned by train().
    classes: list of candidate class labels.
    vocab: unused here (OOV words are filtered via logLikelihood keys),
           kept for interface compatibility.

    Returns the label c maximizing logPrior[c] + sum of logLikelihood[c][w].
    """
    sums = {}
    words = nltk.word_tokenize(testDoc)
    stop_words = set(stopwords.words('english'))
    for c in classes:
        sums[c] = logPrior[c]
        for w in words:
            # ignores words not in vocab from training & stop words
            if (w in logLikelihood[c]) and (w not in stop_words):
                sums[c] += logLikelihood[c][w]
    # BUG FIX: the original compared abs(sums[c]) against an initial 0.
    # Log probabilities are negative, so the largest absolute value is the
    # LEAST probable class — the original returned the worst class.
    # Take the true argmax instead.
    return max(classes, key=lambda c: sums[c])
def testCorpus(testSet, logPrior, logLikelihood, classes, vocab):
    """Evaluate the classifier over testSet and return (precision, recall).

    testSet: list of (document_text, gold_label) tuples.
    Precision and recall are computed treating "pos" as the positive class
    (the original hard-coded the two movie_reviews labels the same way).
    """
    tp = tn = fp = fn = 0
    for doc, gold in testSet:
        predicted = test(doc, logPrior, logLikelihood, classes, vocab)
        if predicted == "pos":
            if gold == "pos":
                tp += 1
            else:
                fp += 1
        else:
            if gold == "pos":
                fn += 1
            else:
                tn += 1
    # BUG FIX: guard against ZeroDivisionError when "pos" is never
    # predicted (precision) or never appears in the gold labels (recall).
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    return (precision, recall)
# Train on the training split, then report precision/recall on the test split.
# NOTE(review): this unpacking requires train() to return the 3-tuple
# (log_prior, log_likelihood, vocab); as written above, train() has no
# return statement, so this crashes on None — verify train's return value.
results = train(trainingSet, classes)
logPrior = results[0]
logLikelihood = results[1]
vocab = results[2]
# Evaluate on the held-out test set.
results = testCorpus(testSet, logPrior, logLikelihood, classes, vocab)
print("Precision: " + str(results[0]) + ", Recall: " + str(results[1]))