/
naivebayes2.py
122 lines (89 loc) · 3.26 KB
/
naivebayes2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import numpy as np
from scipy.sparse import issparse
from sklearn.preprocessing import LabelBinarizer
from scipy.sparse import coo_matrix
from pprint import pprint
class NaiveBayes(object):
    """Bernoulli naive Bayes classifier with add-k (Laplace) smoothing.

    Features are treated as binary present/absent indicators; the decision
    rule in ``predict`` is the Bernoulli NB joint log-likelihood with a
    uniform class prior.
    """

    def __init__(self, k=1.0):
        # k: additive smoothing strength (k=1.0 is classic Laplace smoothing).
        self.k = k
        self.class_prior = []

    def train(self, X, y):
        """Fit the model on (X, y) and return self.

        X: (n_samples, n_features) binary feature matrix, dense or sparse.
        y: (n_samples,) array of class labels.
        """
        n_features = X.shape[1]
        # One-hot encode y against the sorted unique labels. This is
        # equivalent to sklearn's LabelBinarizer plus the old special-case
        # concat for binary problems, without the external dependency.
        y = np.asarray(y)
        self.classes = np.unique(y)
        Y = (y[:, None] == self.classes[None, :]).astype(np.float64)
        print("Start counting...")
        # Documents per class, then per-class feature occurrence counts.
        self.class_count = Y.sum(axis=0)
        print("Finished class counting!")
        print("Start feature counting...")
        if issparse(X):
            # np.dot cannot multiply dense-by-sparse; use the sparse
            # matrix's own dot and transpose back to (n_classes, n_features).
            self.feature_count = np.asarray(X.T.dot(Y)).T
        else:
            self.feature_count = np.dot(Y.T, X)
        print("Finished feature counting!")
        # Apply add-k smoothing. Each feature is a binary event in Bernoulli
        # NB, so the denominator is smoothed by 2*k (present/absent), not by
        # the number of classes; the two only coincide for binary
        # classification, which is how the old `k * len(classes)` bug hid.
        print("Start smoothing...")
        self.class_count_smooth = self.class_count + 2.0 * self.k
        self.feature_count_smooth = self.feature_count + self.k
        print("Finished smoothing!")
        # log P(feature=1 | class); smoothing keeps every probability
        # strictly inside (0, 1), so the logs are always finite.
        self.feature_log_prob = (np.log(self.feature_count_smooth)
                                 - np.log(self.class_count_smooth.reshape(-1, 1)))
        # Uniform prior over classes (empirical prior deliberately unused).
        self.class_log_prior = np.zeros(len(self.classes)) - np.log(len(self.classes))
        return self

    def predict(self, X):
        """Return the predicted class label for each row of X."""
        # log P(feature=0 | class) from the complement of the smoothed
        # positive probabilities.
        neg_prob = np.log(1 - np.exp(self.feature_log_prob))
        if not issparse(X):
            X = np.asarray(X)
        # Bernoulli joint log-likelihood: start from "every feature absent"
        # (the row-sum of neg_prob) and add the positive/negative difference
        # for each feature actually present. X.dot works for both dense and
        # sparse inputs, unlike the former np.dot(X, ...).
        jll = X.dot((self.feature_log_prob - neg_prob).T)
        jll += self.class_log_prior + neg_prob.sum(axis=1)
        return self.classes[np.argmax(jll, axis=1)]

    def evaluate(self, X, y):
        """Evaluate on (X, y); returns (CM, precision, recall, F1).

        CM is the confusion matrix over the labels present in y. Precision,
        recall and F1 are micro-averaged, excluding the 'None' label when
        one exists so the scores reflect only the "real" classes.
        """
        y_true = y
        y_pred = self.predict(X)
        labels = np.unique(y_true)
        n_labels = labels.size
        label_to_ind = dict((lab, i) for i, lab in enumerate(labels))
        pprint(label_to_ind)
        none_ind = label_to_ind.get('None')
        # Map labels to indices; anything not among the true labels gets an
        # out-of-range index and is dropped just below.
        y_pred = np.array([label_to_ind.get(x, n_labels + 1) for x in y_pred])
        y_true = np.array([label_to_ind.get(x, n_labels + 1) for x in y_true])
        keep = np.logical_and(y_pred < n_labels, y_true < n_labels)
        y_pred = y_pred[keep]
        y_true = y_true[keep]
        CM = coo_matrix((np.ones(y_true.shape[0]), (y_true, y_pred)),
                        shape=(n_labels, n_labels)).toarray()
        # Mask out the 'None' row/column only when it exists. The old code
        # ran `mask[none_ind] = False` unconditionally; with none_ind=None
        # numpy treats that as new-axis assignment and wipes the ENTIRE
        # mask, forcing a 0/0 division whenever no 'None' label is present.
        mask = np.ones(n_labels, dtype=bool)
        if none_ind is not None:
            mask[none_ind] = False
        true_pos = np.sum(CM.diagonal()[mask])
        recall_denom = np.sum(CM.sum(axis=1)[mask])
        prec_denom = np.sum(CM.sum(axis=0)[mask])
        # Guard empty denominators instead of dividing by zero.
        recall = true_pos / recall_denom if recall_denom else 0.0
        precision = true_pos / prec_denom if prec_denom else 0.0
        if precision + recall > 0:
            F1 = 2 * precision * recall / (precision + recall)
        else:
            F1 = 0.0
        return CM, precision, recall, F1