/
FinalMLClassifiers.py
232 lines (160 loc) · 8.33 KB
/
FinalMLClassifiers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
#Preprocessing
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc
from sklearn import preprocessing
#Classifiers
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import Perceptron
#import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
#for data import and visualization
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from pydotplus import graph_from_dot_data
'''
Graphviz is not a python tool. The python packages at pypi provide a convenient way of using Graphviz in python code.
You still have to install the Graphviz executables, which are not pythonic, thus not shipped with these packages.
You can install those e.g. with a general-purpose package manager such as homebrew
brew install graphviz
pydot.InvocationException: GraphViz's executables not found
'''
from sklearn.tree import export_graphviz
from pydotplus import graph_from_dot_data
# Plot styling: dark seaborn theme with the project's custom palette.
sns.set_style("dark")
colors = ["#800000", "#45ada8", "#2a363b", "#fecea8", "#99b898", "#e5fcc2"]
sns.set_palette(sns.color_palette(colors))

# Load the breast-cancer dataset (Wisconsin diagnostic CSV layout assumed:
# an 'id' column, a 'diagnosis' label column, and 30 numeric features).
breast_data = pd.read_csv('./data/data.csv')
#breast_data = breast_data.drop(['ID','Unnamed: 32'],axis=1)

# Separate the target from the features: drop 'diagnosis' (label) and
# 'id' (row identifier, not predictive).
y = breast_data['diagnosis']
x_ = breast_data.drop('diagnosis', axis=1)
x = x_.drop('id', axis=1)

# Encode the label: Malignant -> 1, Benign -> 0.
y = y.replace(['M', 'B'], [1, 0])

# Treat literal zeros as missing values and impute each column with its mean.
columns = x.columns
x = x.replace(0, np.nan)
for col in x.columns:
    x[col].fillna(x[col].mean(), inplace=True)

# Standardize every feature to zero mean / unit variance so distance- and
# gradient-based classifiers are not dominated by large-scale columns.
scaler = StandardScaler()
standardized_data = x.copy()
# Assign the ndarray directly (the original wrapped it in a fresh DataFrame,
# which only worked because both sides happened to share a RangeIndex).
standardized_data[columns] = scaler.fit_transform(standardized_data[columns])

# BUG FIX: the split previously used the raw, unstandardized `x`, so the
# standardization above was computed but never used downstream. Split the
# standardized features instead. 70% training / 30% test (the original
# comment claimed a 70/15/15 split that the code never performed).
X_train, X_test, y_train, y_test = train_test_split(
    standardized_data, y, test_size=0.3, random_state=0)
#Model Training and Evaluation, Daniella Pombo----------------------------------------------------------
def model_eval(clfr_var):
    """Score a fitted classifier on the held-out test set.

    Returns a tuple (AUC, f1, accuracy, classification report text,
    classifier) so callers can rank models by area under the
    precision-recall curve.
    """
    # Predicted class labels for every test sample.
    y_pred = clfr_var.predict(X_test)
    # Confusion matrix (computed for inspection; not part of the return).
    cm = confusion_matrix(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    # Precision/recall pairs with malignant (1) as the positive class,
    # then the area under that curve.
    precision, recall, _ = precision_recall_curve(y_test, y_pred, pos_label=1)
    area = auc(recall, precision)
    f1 = f1_score(y_test, y_pred)
    # Text summary of per-class precision/recall/F1.
    report_txt = classification_report(y_test, y_pred)
    return (area, f1, accuracy, report_txt, clfr_var)
def mod_select_train(clfr_var, hypprm):
    """Tune `clfr_var` over the parameter grid `hypprm`.

    Runs a 10-fold grid search optimizing F1 and returns the refit best
    estimator.
    """
    # BUG FIX: the `iid` parameter was deprecated in scikit-learn 0.22 and
    # removed in 0.24, so passing `iid=False` raises TypeError on any modern
    # version. Dropping it preserves the old iid=False averaging behaviour,
    # which became the only behaviour from 0.22 on.
    gs = GridSearchCV(clfr_var, param_grid=hypprm, cv=10, scoring='f1',
                      refit=True)
    gs.fit(X_train, y_train)
    best = gs.best_estimator_
    # NOTE(review): this cross-validation result was discarded in the
    # original as well; the call is kept for parity but its scores are
    # still unused.
    cross_val_score(estimator=best, X=X_train, y=y_train, cv=10, scoring='f1')
    return best
#Run classifiers -------------------------------------------------------------------------
def SupVM():
    """Support Vector Machine with a linear kernel, tuned over C."""
    param_grid = {'C': [0.1, 1, 5, 10, 50], 'kernel': ['linear']}
    svm = SVC()
    svm.fit(X_train, y_train)
    print("SVM")
    tuned = mod_select_train(svm, param_grid)
    return model_eval(tuned)
def lr():
    """Logistic Regression tuned over inverse-regularization strength C.

    Uses the liblinear solver: the default lbfgs solver hits its iteration
    limit on this data without converging, while the simpler liblinear
    solver converges.
    """
    param_grid = {'C': [0.00001, 0.001, 1, 3, 5, 10, 50, 100, 1000]}
    model = LogisticRegression(solver='liblinear')
    model.fit(X_train, y_train)
    print("LR")
    tuned = mod_select_train(model, param_grid)
    return model_eval(tuned)
def nnP():
    """Multi-layer perceptron tuned over layer shape and iteration budget.

    NOTE: with max_iter in {25, 50, 100} the stochastic optimizer emits
    ConvergenceWarnings, and F-score may be reported as 0.0 on folds that
    predict no positive samples.
    """
    param_grid = {
        'hidden_layer_sizes': [(100, 3), (5, 2)],
        'max_iter': [25, 50, 100],
        'solver': ['adam'],
        'activation': ['relu'],
    }
    net = MLPClassifier()
    net.fit(X_train, y_train)
    print("NNP")
    tuned = mod_select_train(net, param_grid)
    return model_eval(tuned)
def DT():
    """Decision tree tuned over split criterion and depth.

    Also renders the fitted tree to DecisionTreeGraph.png via Graphviz
    (requires the Graphviz executables to be installed system-wide).
    """
    grid_param = {'criterion': ['gini', 'entropy'],
                  'max_depth': [2, 4, 7, 10, 15],
                  'random_state': [0]}
    tree = DecisionTreeClassifier()
    tree.fit(X_train, y_train)
    print("DT")
    tree = mod_select_train(tree, grid_param)
    # Human-readable names for the 30 dataset features, in column order.
    # BUG FIX: corrected chart-label typos in the original ('Convacve
    # points mean' -> 'Concave points mean', 'Begnign' -> 'Benign').
    feature_names = [
        'Radius mean', 'Texture mean', 'Perimeter mean', 'Area mean',
        'Smoothness mean', 'Compactness mean', 'Concavity mean',
        'Concave points mean', 'Symmetry mean', 'Fractal dimension',
        'Radius se', 'Texture se', 'Perimeter se', 'Area se',
        'Smoothness se', 'Compactness se', 'Concavity se',
        'Concave points se', 'Symmetry se', 'Fractal dimension se',
        'Radius worst', 'Texture worst', 'Perimeter worst', 'Area worst',
        'Smoothness worst', 'Compactness worst', 'Concavity worst',
        'Concave points worst', 'Symmetry worst', 'Fractal dimension worst']
    # Class order follows sorted label values: 0 = Benign, 1 = Malignant
    # (matches the M/B -> 1/0 encoding above).
    dot_data = export_graphviz(tree, out_file=None, impurity=True,
                               filled=True, rounded=True, max_depth=5,
                               feature_names=feature_names,
                               class_names=['Benign', 'Malignant'])
    graph = graph_from_dot_data(dot_data)
    graph.write_png('DecisionTreeGraph.png')
    return model_eval(tree)
def LP():
    """Linear Perceptron tuned over tolerance and iteration budget."""
    param_grid = {'tol': [1e-3], 'random_state': [0],
                  'max_iter': [10, 20, 30]}
    model = Perceptron()
    model.fit(X_train, y_train)
    print("Perceptron")
    tuned = mod_select_train(model, param_grid)
    return model_eval(tuned)
def KNN():
    """k-nearest neighbours tuned over search algorithm and k."""
    param_grid = {'algorithm': ['brute', 'ball_tree', 'kd_tree'],
                  'n_neighbors': [1, 3, 5, 10, 15, 20]}
    model = KNeighborsClassifier()
    model.fit(X_train, y_train)
    print("KNN")
    tuned = mod_select_train(model, param_grid)
    return model_eval(tuned)
def SGD():
    """Adaline-style stochastic gradient descent classifier, tuned over
    the penalty type and iteration budget."""
    param_grid = {'penalty': ['l1', 'l2'], 'max_iter': [10, 25, 50, 100]}
    model = SGDClassifier()
    model.fit(X_train, y_train)
    print('SGD')
    tuned = mod_select_train(model, param_grid)
    return model_eval(tuned)
def classifiers():
    """Run every classifier, rank results by PR-AUC, and write a report.

    Each run returns (AUC, f1, accuracy, classification report, estimator);
    results are written best-first to FinalProjectModelEvaluationReport.txt.
    """
    results = [lr(), DT(), nnP(), SupVM(), KNN(), LP(), SGD()]
    # BUG FIX: the original sorted the raw tuples, which on tied metrics
    # falls through to comparing report strings and ultimately estimator
    # objects (a TypeError risk). Sort explicitly on the AUC element,
    # best first.
    results.sort(key=lambda r: r[0], reverse=True)
    # BUG FIX: use a context manager so the report file is closed even if
    # a write fails (the original left the handle open on error).
    with open('FinalProjectModelEvaluationReport.txt', 'w') as out_file:
        out_file.write('From most significant classifier to least\n')
        for area, f1, accuracy, report_txt, estimator in results:
            # BUG FIX: output typo 'hyperameters' -> 'hyperparameters',
            # and the classification-report text was mislabeled as a
            # confusion matrix.
            out_file.write('Classifier w/ optimal hyperparameters: \n')
            out_file.write(str(estimator) + '\n')
            out_file.write('Classification report\n')
            out_file.write(str(report_txt) + '\n')
            out_file.write('Area under Precision and Recall Curve: '
                           + str(area) + '\n')
            out_file.write('F1s score: ' + str(f1) + '\n')
            out_file.write('Accuracy: ' + str(accuracy) + '\n')
            out_file.write('\n')
classifiers()