-
Notifications
You must be signed in to change notification settings - Fork 0
/
model.py
278 lines (208 loc) · 8.41 KB
/
model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
# ML Final Project
# Controversial Reddit Comments
#
# Authors: Mai Ho and Maury Quijada
import numpy as np
import preprocess as pp
from sklearn import metrics
from sklearn.cross_validation import StratifiedKFold
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
def run_models():
""" Run all classification models. """
# TODO: Vary max_features.
# feature_range = range(1, 31)
# scores = {}
# for i in feature_range:
# print "Beginning preprocessing..."
# train_test_sets = pp.preprocess(max_features=i, force_load=True)
# print "...finished preprocessing"
# print "Depth-limited Decision Tree Classifier..."
# scores[i] = test_decision_tree_classifier(
# train_test_sets, depth_limited=True)
# max_features = max(feature_range, key=lambda x: scores[x])
# print "Best max_features: ", max_features
train_test_sets = pp.preprocess(max_features=10)
# print "Beginning training..."
# TODO: Change all metrics to F1-score once we have imbalanced data set.
# Also change number of folds if necessary.
# print "Baseline Classifier..."
# # test_subreddit_baseline_classifier()
# print "Decision Tree Classifier..."
# # test_decision_tree_classifier(train_test_sets)
print "Depth-limited Decision Tree Classifier..."
test_decision_tree_classifier(train_test_sets, depth_limited=True)
# print "Logistic Regression Classifier..."
# test_logistic_regression_classifier(train_test_sets)
# print "Adaboosting with Decision Tree Stumps..."
# test_adaboost_classifier(train_test_sets)
def performance(y_true, y_pred, metric="accuracy"):
    """
    Calculates the performance metric based on the agreement between the
    true labels and the predicted labels.
    Args:
        y_true: true labels
        y_pred: predicted labels (may be continuous-valued scores)
        metric: one of "accuracy", "f1_score", "precision", "recall"
    Returns:
        score from the corresponding sklearn.metrics function
    Raises:
        KeyError: if metric is not one of the supported names
    """
    # Map continuous-valued predictions to binary labels; a score of
    # exactly 0 is assigned to the positive class.
    y_label = np.sign(y_pred)
    y_label[y_label == 0] = 1
    handlers = {
        "accuracy": metrics.accuracy_score,
        "f1_score": metrics.f1_score,
        "precision": metrics.precision_score,
        "recall": metrics.recall_score}
    # BUG FIX: previously scored the raw y_pred, silently ignoring the
    # binarized y_label computed above.
    return handlers[metric](y_true, y_label)
def cv_performance(clf, X, y, kf, metric="accuracy"):
    """
    Does k-fold cross validation.
    Args:
        clf: classifier supporting fit/predict
        X: feature vectors
        y: labels
        kf: cross_validation.StratifiedKFold
        metric: performance measure
    Returns:
        Average cross-validation performance across all k folds.
    """
    fold_scores = []
    for train_idx, test_idx in kf:
        # Refit on this fold's training split, score on its held-out split.
        clf.fit(X[train_idx], y[train_idx])
        predictions = clf.predict(X[test_idx])
        fold_scores.append(performance(y[test_idx], predictions, metric=metric))
    return np.average(fold_scores)
def select_dt_depth(X, y, kf, metric="accuracy"):
    """
    Finds the best depth for the Decision Tree Classifier.
    Args:
        X: feature vectors
        y: labels
        kf: cross_validation.StratifiedKFold
        metric: performance measure
    Returns:
        Depth that maximizes performance on k-fold cross validation.
    """
    # Cross-validate each candidate depth, then keep the best performer.
    depth_scores = {
        depth: cv_performance(
            DecisionTreeClassifier(criterion="entropy", max_depth=depth),
            X, y, kf, metric=metric)
        for depth in range(5, 25, 3)}
    return max(depth_scores, key=depth_scores.get)
def select_regularization(X, y, kf, classifier="logreg", metric="accuracy"):
    """
    Finds the best regularization constant C for a linear classifier
    (logistic regression or linear-kernel SVM).
    Args:
        X: feature vectors
        y: labels
        kf: cross_validation.StratifiedKFold
        classifier: type of classifier, can be "logreg" or "svm"
        metric: performance measure
    Returns:
        C value that maximizes performance on k-fold cross validation.
    """
    # Sweep C over six orders of magnitude: 0.001 .. 100.
    C = 10.0 ** np.arange(-3, 3)
    C_scores = {}
    for c in C:
        if classifier == "logreg":
            clf = LogisticRegression(C=c)
        else:
            clf = SVC(kernel='linear', C=c)
        C_scores[c] = cv_performance(clf, X, y, kf, metric=metric)
    return max(C_scores, key=lambda c: C_scores[c])
def test_decision_tree_classifier(train_test_sets, criterion="entropy", depth_limited=False):
""" Decision Tree Classifier with optional depth-limit.
Args:
train_test_sets: array of training and testing sets
criterion: parameter for Decision Tree
depth_limited: whether or not to prune to best depth
"""
X_train, X_test, y_train, y_test = train_test_sets
if depth_limited:
# TODO: Change number of folds?
kf = StratifiedKFold(y_train, n_folds=5, shuffle=True, random_state=42)
# depth = select_dt_depth(X_train, y_train, kf, metric="accuracy")
depth = 23
# print "Best depth...", depth
clf = DecisionTreeClassifier(criterion="entropy", max_depth=depth)
else:
clf = DecisionTreeClassifier(criterion="entropy")
clf.fit(X_train, y_train)
y_pred = clf.predict(X_train)
print "DECISION TREE CLASSIFIER RESULTS"
print "\tTraining accuracy is ", metrics.accuracy_score(y_train, y_pred, normalize=True)
y_pred = clf.predict(X_test)
print_metrics(y_test, y_pred)
return metrics.f1_score(y_test, y_pred)
def test_logistic_regression_classifier(train_test_sets):
""" Logistic Regression Classifier.
Does k-fold cross validation to determine the regularization term.
"""
X_train, X_test, y_train, y_test = train_test_sets
# TODO: Change number of folds?
kf = StratifiedKFold(y_train, n_folds=5, shuffle=True, random_state=42)
best_C = select_regularization(
X_train, y_train, kf, classifier="logreg", metric="accuracy")
clf = LogisticRegression(C=best_C)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_train)
print "LOGISTIC REGRESSION CLASSIFIER RESULTS"
print "\tTraining accuracy is ", metrics.accuracy_score(y_train, y_pred, normalize=True)
y_pred = clf.predict(X_test)
print_metrics(y_test, y_pred)
def test_svm_classifier(train_test_sets):
""" Support Vector Machine Classifier.
"""
X_train, X_test, y_train, y_test = train_test_sets
kf = StratifiedKFold(y_train, n_folds=5, shuffle=True, random_state=42)
best_C = select_regularization(
X_train, y_train, kf, classifier="svm", metric="accuracy")
clf = SVC(C=best_C)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_train)
print "SVM CLASSIFIER RESULTS"
print "\tTraining accuracy is ", metrics.accuracy_score(y_train, y_pred, normalize=True)
y_pred = clf.predict(X_test)
print_metrics(y_test, y_pred)
def test_adaboost_classifier(train_test_sets):
""" Adaboost Classifier with Decision Tree Stumps. """
X_train, X_test, y_train, y_test = train_test_sets
clf = AdaBoostClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_train)
print "ADABOOST CLASSIFIER RESULTS"
print "\tTraining accuracy is ", metrics.accuracy_score(y_train, y_pred, normalize=True)
y_pred = clf.predict(X_test)
print_metrics(y_test, y_pred)
def test_subreddit_baseline_classifier():
""" Runs baseline classifier.
Baseline Classifier is just a decision tree with one node: subreddit.
"""
X_train, X_test, y_train, y_test = pp.preprocess_subreddit_baseline()
clf = DecisionTreeClassifier(criterion="entropy")
clf.fit(X_train, y_train)
y_pred = clf.predict(X_train)
print "SUBREDDIT BASELINE CLASSIFIER RESULTS"
print "\tTraining accuracy is ", metrics.accuracy_score(y_train, y_pred, normalize=True)
y_pred = clf.predict(X_test)
print_metrics(y_test, y_pred)
def print_metrics(y_test, y_pred):
print
print "\tMetrics"
print "\t\tTesting accuracy is ", metrics.accuracy_score(y_test, y_pred, normalize=True)
print "\t\tPrecision score is ", metrics.precision_score(y_test, y_pred)
print "\t\tRecall score is ", metrics.recall_score(y_test, y_pred)
print "\t\tF1 score is ", metrics.f1_score(y_test, y_pred)
# Script entry point: run the enabled classification experiments.
if __name__ == "__main__":
    run_models()