-
Notifications
You must be signed in to change notification settings - Fork 0
/
TextNaiveBayes.py
158 lines (139 loc) · 7.01 KB
/
TextNaiveBayes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
import argparse
import logging
import numpy as np
import math
import heapq
import matplotlib.pyplot as plt
from evaluation import calc_accuracy, confusion_matrix
# Module-level setup: emit INFO-level training/evaluation logs to stderr.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)  # per-module logger, standard convention
DATA_DIR = './data'  # root directory holding the train/test text files
class TextNaiveBayes:
    """Naive Bayes classifier for text documents.

    Supports three datasets (spam vs. normal email, movie-review
    sentiment, and 8 newsgroups) and two event models:
    'multinomial' (word counts) and 'bernoulli' (word presence).
    """

    def __init__(self, type, runmode, k):
        """Configure the model for a dataset.

        Args:
            type: one of 'spam_detection', 'movie_reviews', '8_newsgroups'.
            runmode: 'multinomial' or 'bernoulli'.
            k: Laplace smoothing factor; None falls back to 1.

        Raises:
            ValueError: if `type` is not a recognized dataset name.
        """
        # Model of different texts: spam/normal email vs negative/positive
        # review vs 8 newsgroups.
        if type == 'spam_detection':
            self.type = 0
            self.classes = {'0': 0, '1': 1}
            self.class_names = ['normal email', 'spam']
            self.num_classes = 2
            self.train_file = '/train_email.txt'
            self.test_file = '/test_email.txt'
        elif type == 'movie_reviews':
            self.type = 1
            self.classes = {'-1': 0, '1': 1}
            self.class_names = ['negative review', 'positive review']
            self.num_classes = 2
            self.train_file = '/rt-train.txt'
            self.test_file = '/rt-test.txt'
        elif type == '8_newsgroups':
            self.type = 2
            self.classes = {'0': 0, '1': 1, '2': 2, '3': 3, '4': 4, '5': 5, '6': 6, '7': 7}
            self.class_names = ['sci.space', 'comp.sys.ibm.pc.hardware', 'rec.sport.baseball',
                                'comp.windows.x', 'talk.politics.misc', 'misc.forsale',
                                'rec.sport.hockey', 'comp.graphics']
            self.num_classes = 8
            self.train_file = '/8category.training.txt'
            self.test_file = '/8category.testing.txt'
        else:
            # Fail fast: the original fell through silently and crashed with
            # an AttributeError on the next line.
            raise ValueError('Unknown document type: {0}'.format(type))
        self.runmode = runmode
        # Guard against argparse passing None when -k is omitted.
        self.k = 1 if k is None else k
        self.model = {}  # word -> per-class counts; likelihoods after train()
        self.word_counts = [0] * self.num_classes  # total word tokens per class
        self.doc_counts = np.zeros(self.num_classes)  # training docs per class

    def train(self):
        """Estimate Laplace-smoothed per-class likelihoods.

        Each training line is '<label> word:count word:count ...'.
        After this call, self.model[word][c] holds P(word|class c).
        """
        with open(DATA_DIR + self.train_file) as f:
            for line in f:
                doc = line.split()
                model_class = self.classes[doc[0]]
                for token in doc[1:]:
                    word, count = token.split(':')
                    count = int(count)
                    # Lazily add new words to the vocabulary.
                    if word not in self.model:
                        self.model[word] = [0] * self.num_classes
                    if self.runmode == 'multinomial':
                        self.model[word][model_class] += count
                    elif self.runmode == 'bernoulli':
                        # Presence counts once per document regardless of count.
                        self.model[word][model_class] += 1
                    self.word_counts[model_class] += count
                self.doc_counts[model_class] += 1
        # Convert counts into Laplace-smoothed likelihoods.
        for word in self.model:
            for x in range(self.num_classes):
                self.model[word][x] += self.k
                if self.runmode == 'multinomial':
                    self.model[word][x] /= (self.word_counts[x] + len(self.model) * self.k)
                elif self.runmode == 'bernoulli':
                    self.model[word][x] /= (self.doc_counts[x] + 2 * self.k)
        # Log the 20 highest-likelihood words for each class.
        for model_class in range(self.num_classes):
            top_20 = heapq.nlargest(20, self.model.items(), key=lambda item: item[1][model_class])
            logger.info('Top 20 words for class {0} are {1}'.
                        format(self.class_names[model_class], [(word[0], word[1][model_class]) for word in top_20]))

    def predict(self):
        """Classify the test file via MAP estimation and report accuracy.

        Logs overall and per-class accuracy, then shows a confusion-matrix
        plot.
        """
        correct_labels = []
        predicted_labels = []
        # Perform MAP classification.
        with open(DATA_DIR + self.test_file) as f:
            for line in f:
                doc = line.split()
                correct_labels.append(self.classes[doc[0]])
                map_classifier = np.zeros(self.num_classes)
                # Calculate the decision function for each class.
                for model_class in range(self.num_classes):
                    # Log prior as a plain scalar (the original wrapped it in a
                    # 1-element np.array, which newer NumPy rejects on assignment).
                    map_classifier[model_class] = math.log(
                        float(self.doc_counts[model_class] / np.sum(self.doc_counts)))
                    if self.runmode == 'multinomial':
                        for token in doc[1:]:
                            word, count = token.split(':')
                            if word in self.model:
                                # Weight by token count, matching the multinomial
                                # event model used in train() (the original
                                # counted each word only once here).
                                map_classifier[model_class] += int(count) * math.log(self.model[word][model_class])
                    elif self.runmode == 'bernoulli':
                        # Set membership: O(1) per vocabulary word vs O(n) on a list.
                        words = {token.split(':')[0] for token in doc[1:]}
                        for word in self.model:
                            if word in words:
                                map_classifier[model_class] += math.log(self.model[word][model_class])
                            else:
                                map_classifier[model_class] += math.log(1. - self.model[word][model_class])
                # Pick the highest-scoring class.
                predicted_labels.append(np.argmax(map_classifier))
        # Compute total accuracy.
        correct_labels = np.array(correct_labels)
        predicted_labels = np.array(predicted_labels)
        accuracy = calc_accuracy(correct_labels, predicted_labels)
        logger.info('NB model is {0:.2f}% accurate on the {1} data with k = {2}.'
                    .format(accuracy, self.runmode, self.k))
        # Confusion matrix with per-class accuracies on its diagonal.
        cm = confusion_matrix(correct_labels, predicted_labels, self.num_classes)
        class_accuracies = [cm[n][n] for n in range(self.num_classes)]
        for n, x in enumerate(class_accuracies):
            logger.info('Class {0} has an accuracy of {1:.2f}%'.format(self.class_names[n], 100 * x))
        # Plot the confusion matrix.
        plt.figure(figsize=(30, 30))
        plt.imshow(cm, cmap=plt.get_cmap('Greens'), interpolation='nearest')
        plt.title('Confusion Matrix')
        plt.xticks(np.arange(self.num_classes), self.class_names, fontsize=8)
        plt.yticks(np.arange(self.num_classes), self.class_names, fontsize=10)
        plt.xlabel('Predictions')
        plt.ylabel('Truths')
        plt.colorbar()
        plt.show()
def main():
    """Parse command-line arguments, then train and evaluate the model."""
    parser = argparse.ArgumentParser(description='''Text Document Naive Bayes Classification
                                                 for CS 440 by
                                                 Shibo Yao, Mike Chen,
                                                 and Jeff Zhu''')
    parser.add_argument('document_type', help='''Choose a type: spam_detection, movie_reviews, 8_newsgroups''')
    parser.add_argument('runmode', help='''Choose a runmode: multinomial, bernoulli''')
    # default=1: the original left the default as None, which crashed the
    # model's smoothing arithmetic whenever -k was omitted.
    parser.add_argument('-k', type=int, default=1, help='''Smoothing factor''')
    args = parser.parse_args()
    tnb = TextNaiveBayes(args.document_type, args.runmode, args.k)
    tnb.train()
    tnb.predict()


if __name__ == '__main__':
    main()