# credit.py: imbalanced classification on the german credit dataset.
# spot-check standard models with an F2 metric, then try undersampling.
from pandas import read_csv
from collections import Counter
from numpy import mean
from numpy import std
from matplotlib import pyplot
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import fbeta_score
from sklearn.metrics import make_scorer
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from imblearn.under_sampling import TomekLinks
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.under_sampling import RepeatedEditedNearestNeighbours
from imblearn.under_sampling import NeighbourhoodCleaningRule
from imblearn.under_sampling import OneSidedSelection
# imblearn's Pipeline supports sampler steps and is a drop-in replacement
# for sklearn's, so it is the only Pipeline imported
from imblearn.pipeline import Pipeline
# load the dataset (german.csv: the UCI German Credit data, no header row)
filename = 'german.csv'
df = read_csv(filename, header=None)
print(df.shape)
# summarize the class distribution (roughly 70% good, 30% bad credit)
target = df.values[:, -1]
counter = Counter(target)
for k, v in counter.items():
    per = v / len(target) * 100
    print('Class = %d, Count = %d, Percentage = %.3f%%' % (k, v, per))
# split the dataset into input and target columns (histograms of the
# numeric inputs are drawn just below, once the numeric columns are known)
last_ix = len(df.columns) - 1
X, y = df.drop(last_ix, axis=1), df[last_ix]
# identify categorical and numerical input columns by dtype
cat_ix = X.select_dtypes(include=['object', 'bool']).columns
num_ix = X.select_dtypes(include=['int64', 'float64']).columns
# label encode the target so good credit (1) -> 0 and bad credit (2) -> 1
y = LabelEncoder().fit_transform(y)
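# a minimal sketch of the histograms the original comment promised; the
# bin count and figure size are arbitrary choices
X[num_ix].hist(bins=25, figsize=(12, 10))
pyplot.show()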
# F2 measure: an F-beta score with beta=2, weighting recall of the
# bad-credit class higher than precision
def f2_measure(y_true, y_pred):
    return fbeta_score(y_true, y_pred, beta=2)

# evaluate a model with repeated stratified 10-fold cross-validation
# (10 splits x 3 repeats = 30 F2 scores per model)
def evaluate_model(X, y, model):
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    metric = make_scorer(f2_measure)
    scores = cross_val_score(model, X, y, scoring=metric, cv=cv, n_jobs=-1)
    return scores
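# note: cross_val_score clones the pipeline for each fold, so encoding,
# scaling and (later) undersampling are re-fit on each training split only,
# which avoids leaking test-fold information into the transforms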
# define the standard models to spot-check
def get_models():
    models, names = list(), list()
    models.append(LogisticRegression(solver='liblinear'))
    names.append('LR')
    models.append(LinearDiscriminantAnalysis())
    names.append('LDA')
    models.append(GaussianNB())
    names.append('NB')
    models.append(SVC(gamma='scale'))
    names.append('SVM')
    return models, names
# define the undersampling methods to test
def get_under_sample_models():
    models, names = list(), list()
    models.append(TomekLinks())
    names.append('TomekLinks')
    models.append(EditedNearestNeighbours())
    names.append('EditedNearestNeighbours')
    models.append(RepeatedEditedNearestNeighbours())
    names.append('RENN')
    models.append(OneSidedSelection())
    names.append('OneSidedSelection')
    models.append(NeighbourhoodCleaningRule())
    names.append('NCR')
    return models, names
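# note: these are all "cleaning" undersamplers: rather than resampling to a
# fixed class ratio, they remove ambiguous or borderline majority-class
# examples, so the resulting class balance varies by method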
# baseline: a dummy model that always predicts the minority (bad) class,
# which maximises recall and so sets a floor any real model must beat
print(X.shape, y.shape, Counter(y))
model = DummyClassifier(strategy='constant', constant=1)
scores = evaluate_model(X, y, model)
print('Dummy Classifier :')
print('Mean F2: %.3f (%.3f)' % (mean(scores), std(scores)))
# evaluate each standard model and report mean and std of the F2 scores
models, names = get_models()
results = list()
for i in range(len(models)):
    # one hot encode categorical inputs, normalize numerical inputs
    ct = ColumnTransformer([('c', OneHotEncoder(), cat_ix), ('n', MinMaxScaler(), num_ix)])
    # wrap the model in a pipeline with the transforms
    pipeline = Pipeline(steps=[('t', ct), ('m', models[i])])
    # evaluate the pipeline and store the results
    scores = evaluate_model(X, y, pipeline)
    results.append(scores)
    # summarize progress
    print('>%s %.3f (%.3f)' % (names[i], mean(scores), std(scores)))
# compare the score distributions with a boxplot
pyplot.boxplot(results, labels=names, showmeans=True)
pyplot.show()
# evaluate each undersampling method with a balanced logistic regression
results = list()
models, names = get_under_sample_models()
for i in range(len(models)):
    # define the model to evaluate
    model = LogisticRegression(solver='liblinear', class_weight='balanced')
    # one hot encode categorical inputs, normalize numerical inputs
    ct = ColumnTransformer([('c', OneHotEncoder(), cat_ix), ('n', MinMaxScaler(), num_ix)])
    # transform, then undersample (training folds only), then fit the model
    pipeline = Pipeline(steps=[('t', ct), ('s', models[i]), ('m', model)])
    # evaluate the pipeline and store the results
    scores = evaluate_model(X, y, pipeline)
    results.append(scores)
    # summarize progress
    print('>%s %.3f (%.3f)' % (names[i], mean(scores), std(scores)))
# compare the score distributions with a boxplot
pyplot.boxplot(results, labels=names, showmeans=True)
pyplot.show()
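# a possible final step, not part of the original script: fit one pipeline
# on all of the data and score a row. NCR plus balanced logistic regression
# is an assumption here (pick whichever combination scored best above), and
# X.iloc[[0]] merely stands in for a new applicant's feature row
final_model = LogisticRegression(solver='liblinear', class_weight='balanced')
final_ct = ColumnTransformer([('c', OneHotEncoder(), cat_ix), ('n', MinMaxScaler(), num_ix)])
final = Pipeline(steps=[('t', final_ct), ('s', NeighbourhoodCleaningRule()), ('m', final_model)])
final.fit(X, y)
print('Predicted class for first row (1 = bad credit):', final.predict(X.iloc[[0]]))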