#%% modules
import Preprocess_Train, Preprocess_Test, Draw
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from tpot import TPOTClassifier
from sklearn.feature_selection import RFE
from sklearn.metrics import f1_score
from imblearn.over_sampling import SMOTE
#%% preprocess
train_location = '/TrainingSet.csv'
test_location = '/TestingSet.csv'
df_preprocessed = Preprocess_Train.preprocess(train_location)
df_test = Preprocess_Test.preprocess(test_location)
# For testing
data = df_preprocessed.drop(['Final_Y'], axis=1)
label = df_preprocessed.Final_Y
# For training
x_train, x_test, y_train, y_test = train_test_split(data, label, test_size=0.2)  # random_state=34; stratify=label would keep class ratios stable across splits
best_features = [True, True, True, True, False, True, True, True, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, True, False, False, False, False, False, False, False, False, True]  # cached RFE support mask from an earlier run; recomputed by feature_selection() below
#%% Resampling
smote = SMOTE()
x_train, y_train = smote.fit_resample(x_train, y_train)
x_train = pd.DataFrame(x_train, columns=data.columns)  # keep original column names after resampling
y_train = pd.Series(y_train, name='Final_Y')  # a Series avoids DataConversionWarning when fitting
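#%% (added sketch) sanity check: after SMOTE both classes should appear in equal
# counts; assumes Final_Y is a binary label
print(y_train.value_counts())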
#%% Feature selection
def feature_selection(classifier):
    """Sweep RFE over 1..35 retained features; return the best support mask by F1."""
    start_num = 1
    end_num = 36
    temp_list = []
    for n_features in range(start_num, end_num):
        selector = RFE(classifier, n_features_to_select=n_features, step=1)
        y_pred = selector.fit(x_train, y_train).predict(x_test)
        score = f1_score(y_test, y_pred, average='binary')
        temp_list.append((score, n_features, selector.support_))
    best = max(temp_list, key=lambda x: x[0])
    print(best)  # the best F1, its feature count, and the support mask
    return best[2]
best_features = feature_selection(DecisionTreeClassifier()) # DC: 9 features
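print(list(data.columns[best_features]))  # (added sketch) names of the selected features, assuming `data` keeps the original columns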
#%% Parameter tuning - Decision Tree
parameters = [{
    'max_depth': range(1, 30),
    'min_samples_split': range(2, 30),
    'criterion': ('entropy', 'gini'),
    'splitter': ('random', 'best'),
    'min_samples_leaf': range(1, 30),
    'max_leaf_nodes': range(2, 30)
}]
clf = GridSearchCV(DecisionTreeClassifier(), parameters, cv=5, scoring='f1', verbose=0, n_jobs=-1)
y_pred = clf.fit(x_train.loc[:, best_features], y_train).predict(x_test.loc[:, best_features])
print(clf.cv_results_['mean_test_score'])
print(clf.best_params_)
Draw.plot_confusion_matrix(y_test, y_pred, title='Decision Tree')
plt.show()
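print('Decision Tree test F1:', f1_score(y_test, y_pred, average='binary'))  # (added) held-out F1 for the tuned tree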
#%% Parameter tuning - K-Nearest Neighbour
def KNN():
    """Sweep the number of neighbours from 1 to 99 and print the best (F1, k)."""
    start_num = 1
    end_num = 100
    temp_list = []
    for k in range(start_num, end_num):
        knn_model = KNeighborsClassifier(n_neighbors=k)
        y_pred = knn_model.fit(x_train, y_train).predict(x_test)
        score = f1_score(y_test, y_pred, average='binary')
        temp_list.append((score, k))
    print(max(temp_list, key=lambda x: x[0]))
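KNN()  # run the k sweep (added invocation; assumes the 1-99 range is adequate)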
#%% Parameter tuning - Gradient Boosting
parameters = [{
    'n_estimators': range(1, 100),
    'learning_rate': np.arange(0.1, 0.5, 0.01),
    'max_depth': range(1, 20),
    'min_samples_split': range(2, 10),
    'max_leaf_nodes': range(2, 20),  # must be at least 2
    'min_samples_leaf': range(1, 20)
}]
clf = GridSearchCV(GradientBoostingClassifier(), parameters, cv=5, scoring='f1', verbose=0, n_jobs=-1)
y_pred = clf.fit(x_train.loc[:, best_features], y_train).predict(x_test.loc[:, best_features])
print(clf.cv_results_['mean_test_score'])
print(clf.best_params_)
Draw.plot_confusion_matrix(y_test, y_pred, title='Gradient Boosting')
plt.show()
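#%% (added sketch) the exhaustive grid above spans millions of combinations; a
# randomized search over the same space is far cheaper. n_iter=50 is an assumed budget.
from sklearn.model_selection import RandomizedSearchCV
rnd_search = RandomizedSearchCV(GradientBoostingClassifier(), parameters[0], n_iter=50,
                                cv=5, scoring='f1', n_jobs=-1)
rnd_search.fit(x_train.loc[:, best_features], y_train)
print(rnd_search.best_params_, rnd_search.best_score_)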
#%% Parameter tuning - MLP
def MLP():
    """Exhaustively enumerate small three-hidden-layer architectures."""
    start_num = 1
    end_num = 20
    temp_list = []
    for first in range(start_num, end_num):
        for second in range(start_num, end_num):
            for third in range(start_num, end_num):
                mlp = MLPClassifier((first, second, third), max_iter=500)
                y_pred = mlp.fit(x_train.loc[:, best_features], y_train).predict(x_test.loc[:, best_features])
                score = f1_score(y_test, y_pred, average='binary')  # F1, consistent with the rest of the script
                temp_list.append((score, first, second, third))  # record all three layer sizes
    best = max(temp_list, key=lambda x: x[0])
    print(best)
MLP()
#%% GradientBoostingClassifier
clf = GradientBoostingClassifier(n_estimators=113, learning_rate=0.2, max_depth=3)
clf.fit(x_train.loc[:, best_features], y_train)
y_pred = clf.predict(x_test.loc[:, best_features])
print(f1_score(y_test, y_pred, average='binary'))
Draw.plot_confusion_matrix(y_test, y_pred, title='Gradient Boosting')
plt.show()
print(clf.score(x_test.loc[:, best_features], y_test))  # accuracy, for reference
#%% Random Forest
clf = RandomForestClassifier(n_estimators=70, criterion='entropy', max_depth=9, min_samples_split=6, min_samples_leaf=1, max_features=4, max_leaf_nodes=None)
clf.fit(x_train.loc[:, best_features], y_train)
y_pred = clf.predict(x_test.loc[:, best_features])
print(f1_score(y_test, y_pred, average='binary'))
Draw.plot_confusion_matrix(y_test, y_pred, title='Random forest')
plt.show()
print(clf.score(x_test.loc[:, best_features], y_test))  # accuracy, for reference
Draw.PR_curve(y_test, y_pred)  # note: PR curves are usually drawn from predict_proba scores rather than hard labels
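#%% (added sketch) stability check with 5-fold cross-validation on the full training
# data; cross_val_score is standard sklearn and clones the estimator per fold
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(clf, data.loc[:, best_features], label, cv=5, scoring='f1')
print('CV F1: %.3f +/- %.3f' % (cv_scores.mean(), cv_scores.std()))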
#%% TPOT automated pipeline search
def tpot():
    """Run TPOT's automated pipeline search, scored by F1, and return the fitted object."""
    df = Preprocess_Train.preprocess(train_location)
    data = df.drop(['Final_Y'], axis=1)
    label = df.Final_Y
    x_train, x_test, y_train, y_test = train_test_split(data, label, test_size=0.25, random_state=34)
    tpot = TPOTClassifier(scoring='f1',
                          max_time_mins=300,
                          n_jobs=-1,
                          verbosity=2,
                          cv=5)
    tpot.fit(x_train, y_train)  # fit on the training split only, so the held-out score is honest
    print(tpot.score(x_test, y_test))
    return tpot
# Best pipeline found: LinearSVC(C=25.0, dual=True, loss="squared_hinge", penalty="l2", tol=0.1)
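#%% (added sketch) export the winning pipeline; export() is TPOT's standard API for
# writing the best pipeline out as a standalone script. The filename is an assumption,
# and calling tpot() here reruns the full search (up to max_time_mins).
fitted_tpot = tpot()
fitted_tpot.export('tpot_best_pipeline.py')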
#%% Submission output
def Output():
    rf_model = RandomForestClassifier(n_estimators=70)
    rf_model.fit(data, label)  # train on the full preprocessed training set
    result = rf_model.predict(df_test)
    result = pd.Series(result)
    result_csv = 'submit.csv'
    df_submit = pd.DataFrame()
    df_submit['Final_Y'] = result
    df_submit.to_csv(result_csv)  # note: the row index is written as the first CSV column
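Output()  # write submit.csv (added invocation; the function above is otherwise never called)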