-
Notifications
You must be signed in to change notification settings - Fork 0
/
classifiers.py
360 lines (311 loc) · 17.2 KB
/
classifiers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
import pandas as pd
import sys
import time
import numpy as np
import matplotlib
matplotlib.use('TkAgg') #for mac support
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn import tree
from sklearn import metrics
from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
# Module metadata: author, acknowledgements and contact address.
__author__ = "Tobias Marzell"
__credits__ = "Thanks to the creators of pandas, numpy, matplotlib and scikit-learn, aswell as to the UCI machine learning repository and to the creators of the data set."
__email__ = "tobias.marzell@gmail.com"
class Plotting:
    """Plotting helpers for the classifier experiment.

    Class attributes:
        color1: Hex colour used for bar charts and the ROC diagonal.
        color2: Hex colour used for the ROC curves.
    """

    color1 = "#f02c46"  # rocketloop red
    color2 = "#00b9ee"  # rocketloop blue

    def __init__(self, data):
        """Store the data object the plots refer to."""
        self.data = data

    def get_categorical_chart(self, data):
        """Return a 2x2 figure with one bar chart per categorical column.

        Arguments:
            data: Indexable by column name (e.g. a pandas DataFrame) and
                containing the columns "workclass", "education", "race"
                and "sex".

        Returns:
            The matplotlib figure holding the four bar charts.
        """
        features = ["workclass", "education", "race", "sex"]
        fig, ax = plt.subplots(nrows=2, ncols=2)  # structure of the plot
        # Iterate the column names directly: indexing the frame with a bare
        # tuple of names raises KeyError on an ordinary DataFrame.
        for i, categorical_feature in enumerate(features):
            counts = data[categorical_feature].value_counts()
            # `kind` and `ax` are passed by keyword; recent pandas versions
            # reject positional arguments to Series.plot().
            axis = counts.plot(kind="bar", ax=ax[i // 2, i % 2], color=self.color1)
            axis.set_title(categorical_feature)
        fig.tight_layout()
        return fig

    def get_pie_chart(self, data):
        """Return a figure with a pie chart of the target value counts.

        Arguments:
            data: Object supporting both ``data.target`` and
                ``data[data.target]``.
                NOTE(review): which concrete type callers pass is not
                visible in this file -- confirm against the caller.

        Returns:
            The matplotlib figure holding the pie chart.
        """
        fig, ax = plt.subplots(1, 1)
        data[data.target].value_counts().plot(
            kind="pie", ax=ax, colors=[self.color2, self.color1])
        return fig

    def plot_roc_curve(self, classification_reports):
        """Plot the ROC curve of every given classification report.

        One report is drawn on a single axes, two reports side by side, and
        more than two in a two-column grid (the unused last cell is removed
        when the number of reports is odd). The figure is shown with
        ``plt.show()``.

        Arguments:
            classification_reports: Sequence of objects exposing ``fpr``,
                ``tpr``, ``label`` and ``roc_auc``. An empty sequence is a
                no-op.
        """
        count = len(classification_reports)
        if count == 0:
            # Nothing to draw; avoid opening an empty figure window.
            return
        ncols = 1 if count == 1 else 2
        nrows = (count + 1) // 2
        # squeeze=False always yields a 2-D axes array, so a single code
        # path serves the 1-, 2- and n-report layouts.
        fig, axes = plt.subplots(ncols=ncols, nrows=nrows, squeeze=False)
        if ncols == 2 and count % 2 == 1:
            # Remove the empty bottom-right cell of an uneven grid.
            fig.delaxes(axes[nrows - 1, 1])
        for i, report in enumerate(classification_reports):
            axis = axes[i // ncols, i % ncols]
            axis.plot(report.fpr, report.tpr,
                      label='%s (area = %0.2f)' % (report.label, report.roc_auc),
                      color=Plotting.color2)
            # plots the f(x) = x line:
            axis.plot([0, 1], [0, 1], color=Plotting.color1)
            axis.set_xlim([0.0, 1.0])
            axis.set_ylim([0.0, 1.0])
            axis.set_title(report.label)
            axis.set_ylabel('True Positive Rate')
            axis.set_xlabel('False Positive Rate')
            axis.legend(loc="lower right")
        plt.suptitle('Receiver operating characteristic')
        plt.tight_layout()
        plt.show()
        return
class Data:
    """Load, clean and encode a train/test data set and train classifiers on it.

    Instance attributes:
        names: list of strings containing the labels of the columns.
        traindata: DataFrame read from the first csv file (training data).
        testdata: DataFrame read from the second csv file (test data).
        categorical_features: list of strings containing the labels of all
            categorical features.
        target: label of the column that contains the target variable.
        x_train: DataFrame with the encoded training features.
        y_train: Series with the integer-encoded training targets.
        x_test: DataFrame with the encoded test features.
        y_test: Series with the integer-encoded test targets.

    Every ``get_<classifier>`` method creates an estimator and returns the
    result of fitting it on ``x_train`` / ``y_train``:
        get_log_regr, get_gradient_boosted_trees, get_decision_tree,
        get_neural_network, get_svm, get_kneighbors, get_naive_bayes,
        get_gaussian_process_regressor, get_dummy_classifier.
    """

    def __init__(self, files, names, categorical_features, target, sep=","):
        """Read the csv files and prepare the train/test split.

        Arguments:
            files: Sequence of paths; item 0 is the training data, item 1 the
                test data. Further items are not read by this class.
            names: Column names applied to both files.
            categorical_features: Names of the columns to dummy-encode.
            target: Name of the target column (must be included in names).
            sep: Field separator handed to ``pd.read_csv`` (default ",").
        """
        self.names = names
        self.traindata = pd.read_csv(files[0], names=self.names, sep=sep)
        self.testdata = pd.read_csv(files[1], names=self.names, sep=sep)
        self.categorical_features = categorical_features
        self.target = target
        self.setup_data()

    def setup_data(self):
        """Clean the raw data and fill x_train, y_train, x_test and y_test."""
        traindata = self.remove_incomplete_data(self.traindata)
        testdata = self.remove_incomplete_data(self.testdata)
        self.y_train = self.set_target(traindata)
        self.y_test = self.set_target(testdata)
        # Encode train and test together so both end up with the same dummy
        # columns. pd.concat replaces DataFrame.append, which no longer
        # exists in pandas 2.
        combined = pd.concat([traindata, testdata], ignore_index=True)
        features = combined.drop(columns=[self.target])
        # "fnlwgt" is specific to the adult data set; drop it only if present
        # so other data sets do not fail here.
        features = features.drop(columns=["fnlwgt"], errors="ignore")
        fulldata = self.get_dummies(features, self.categorical_features)
        # The first len(traindata) rows are the training rows again.
        self.x_train = fulldata.iloc[:len(traindata)]
        self.x_test = fulldata.iloc[len(traindata):]

    def get_categorical_features(self):
        """Return the categorical features of the data."""
        return self.categorical_features

    def remove_incomplete_data(self, data):
        """Return a copy of data without the rows containing "?" or NaN.

        Only cells that are exactly "?" are treated as missing.
        """
        # Keyword arguments: positional axis/how are rejected by pandas 2.
        return data.replace("?", np.nan).dropna(axis=0, how="any")

    def get_dummies(self, data, categorical_features):
        """Return data with the given categorical columns dummy-encoded.

        Each listed column is replaced by indicator columns named
        ``<column>_<value>``; the first level of every column is dropped.
        The indicator columns are appended after the untouched columns, in
        the order of categorical_features.
        """
        return pd.get_dummies(data, columns=list(categorical_features), drop_first=True)

    def set_target(self, data):
        """Return the target column of data encoded as integers.

        Values are numbered 0, 1, ... in order of first appearance. The
        passed data is not modified.

        NOTE(review): the numbering is derived per call, so train and test
        data only share an encoding if their classes first appear in the same
        order -- confirm this holds for the files in use.
        """
        codes, _ = pd.factorize(data[self.target])
        return pd.Series(codes, index=data.index, name=self.target).astype("int")

    def get_log_regr(self):
        """Return a fitted logistic regression classifier."""
        clf = LogisticRegression(solver="lbfgs")
        return clf.fit(self.x_train, self.y_train)

    def get_gradient_boosted_trees(self):
        """Return a fitted gradient boosted trees classifier."""
        clf = GradientBoostingClassifier(n_estimators=4000, learning_rate=0.04, max_depth=2, random_state=0)
        return clf.fit(self.x_train, self.y_train)

    def get_decision_tree(self):
        """Return a fitted decision tree classifier."""
        clf = tree.DecisionTreeClassifier()
        return clf.fit(self.x_train, self.y_train)

    def get_neural_network(self, hidden_layer_size=(100,), max_iter=200):
        """Return a fitted multi-layer perceptron classifier.

        Arguments:
            hidden_layer_size: Tuple of ints, one entry per hidden layer giving
                its size (default (100,)).
            max_iter: Maximum number of training iterations (default 200).
        """
        clf = MLPClassifier(hidden_layer_sizes=hidden_layer_size,
                            max_iter=max_iter, tol=0.000001, n_iter_no_change=250,
                            early_stopping=False, verbose=False,
                            learning_rate="adaptive", validation_fraction=0)
        return clf.fit(self.x_train, self.y_train)

    def get_svm(self):
        """Return a fitted support vector machine classifier."""
        clf = SVC(gamma="auto")
        return clf.fit(self.x_train, self.y_train)

    def get_kneighbors(self):
        """Return a fitted k-nearest neighbors classifier (k = 5)."""
        clf = KNeighborsClassifier(n_neighbors=5)
        return clf.fit(self.x_train, self.y_train)

    def get_naive_bayes(self):
        """Return a fitted gaussian naive bayes classifier."""
        clf = GaussianNB()
        return clf.fit(self.x_train, self.y_train)

    def get_gaussian_process_regressor(self):
        """Return a fitted gaussian process regressor.

        This is a regressor: it offers neither predict_proba nor
        decision_function, so get_classification_report cannot score it.
        """
        clf = GaussianProcessRegressor()
        return clf.fit(self.x_train, self.y_train)

    def get_dummy_classifier(self):
        """Return a fitted dummy classifier."""
        clf = DummyClassifier()
        return clf.fit(self.x_train, self.y_train)

    def get_classification_report(self, clf, x, y, label=""):
        """Return a ClassificationReport for clf evaluated on x and y.

        Arguments:
            clf: The fitted classifier to be reported.
            x: The feature values.
            y: The target values to validate the predictions.
            label: (optional) label stored on the report.

        The ROC data is computed from ``predict_proba(x)[:, 1]``; classifiers
        without predict_proba are scored through ``decision_function(x)``.
        """
        if hasattr(clf, "predict_proba"):
            scores = clf.predict_proba(x)[:, 1]
        else:
            scores = clf.decision_function(x)
        # Predict once and reuse the result for both text reports.
        predictions = clf.predict(x)
        roc_auc = roc_auc_score(y, scores)
        fpr, tpr, thresholds = roc_curve(y, scores)
        return ClassificationReport(metrics.confusion_matrix(y, predictions),
                                    metrics.classification_report(y, predictions),
                                    roc_auc, fpr, tpr, thresholds, label)
class ClassificationReport:
    """Hold the evaluation results of one classifier and render them as text.

    Instance attributes:
        matrix: 2x2 confusion matrix, indexable as matrix[row][column].
        report: Text report that is appended below the matrix.
        roc_auc: Area under the ROC curve.
        fpr, tpr, thresholds: ROC curve data.
        label: Name of the classifier, or None when no label was given.
    """

    def __init__(self, matrix, report, roc_auc, fpr, tpr, thresholds, label):
        """Store the given values; an empty label is stored as None."""
        self.matrix = matrix
        self.report = report
        self.roc_auc = roc_auc
        self.fpr = fpr
        self.tpr = tpr
        self.thresholds = thresholds
        self.label = None if label == "" else label

    def __repr__(self):
        """Return the confusion matrix as a small table followed by the report.

        The row "n" prints matrix[0][0] and matrix[0][1]; the row "p" prints
        matrix[1][1] first and matrix[1][0] second.
        NOTE(review): presumably rows are the actual class and the columns
        "t"/"f" mean correctly/incorrectly classified -- confirm.
        """
        lines = [
            "\tt\tf",
            "n\t" + str(self.matrix[0][0]) + "\t" + str(self.matrix[0][1]),
            "p\t" + str(self.matrix[1][1]) + "\t" + str(self.matrix[1][0]),
            self.report,
        ]
        return "\n".join(lines)

    def write_to_file(self, file, additional=None):
        """Append the textual representation of this report to a file.

        Arguments:
            file: Path of the file to append to.
            additional: (optional) extra information written on its own line
                before the report. None and an empty list write nothing extra.

        No trailing newline is written after the report.
        """
        text = repr(self)
        if additional is not None and additional != []:
            text = str(additional) + "\n" + text
        # A separate name for the handle keeps the path parameter intact.
        with open(file, "a") as handle:
            handle.write(text)
def main():
    """Train the classifiers on the adult data set, plot and print the results.

    Every classifier is trained and timed in turn, then evaluated on the
    test split. The ROC curves are shown first; afterwards the score, the
    training time and the classification report of each classifier are
    printed.
    """
    column_names = [
        "age", "workclass", "fnlwgt", "education", "education-num",
        "marital-status", "occupation", "relationship", "race", "sex",
        "capital-gain", "capital-loss", "hours-per-week", "native-country",
        "income",
    ]
    categorical_columns = [
        "workclass", "education", "race", "sex", "marital-status",
        "occupation", "relationship", "native-country",
    ]
    data = Data(
        ["./data_files/adult.data.txt", "./data_files/adult.test.txt"],
        column_names,
        categorical_columns,
        "income",
    )

    debug = False
    if debug:
        # If you want to debug something do it here.
        return

    # One entry per classifier, in training order:
    # (console heading, ROC plot label, callable returning the fitted model).
    # The svm and the dummy classifier are deliberately not part of the run.
    experiments = [
        ("log \n", "Logistische Regression", data.get_log_regr),
        ("decision tree\n", "Decision Tree", data.get_decision_tree),
        ("gradient boosted tree\n", "Gradient Boosted Trees", data.get_gradient_boosted_trees),
        ("knn\n", "k-nearest Neighbors", data.get_kneighbors),
        ("naive bayes\n", "Naive Bayes", data.get_naive_bayes),
        ("nn\n", "Neuronales Netz",
         lambda: data.get_neural_network((20, 20, 20, 20, 20, 20, 20, 20, 20,), 5000)),
    ]

    # Train everything first and remember how long each fit took.
    fitted = []
    for heading, plot_label, train in experiments:
        started = time.time()
        model = train()
        elapsed = time.time() - started
        fitted.append((heading, plot_label, model, elapsed))

    classification_reports = [
        data.get_classification_report(model, data.x_test, data.y_test, label=plot_label)
        for _, plot_label, model, _ in fitted
    ]

    Plotting.plot_roc_curve(Plotting, classification_reports)

    for (heading, _, model, elapsed), report in zip(fitted, classification_reports):
        print(heading, model.score(data.x_test, data.y_test), elapsed, "\n", report)


if __name__ == "__main__":
    main()