/
main.py
173 lines (126 loc) · 6.99 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
from dataloader import WayneLoanApprovalLoader
from experiment import StratifiedExperiment, BinaryBalancedExperiment
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble.forest import RandomForestClassifier as RF
from sklearn.ensemble.bagging import BaggingClassifier
from sklearn.ensemble.weight_boosting import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB as NB
from custom_models import BinaryDummyModel
from matplotlib import pyplot as plt
from custom_models import LoanPytorchModel
#Pulling in all data from 2007-2014
wayne_all = WayneLoanApprovalLoader(savename='wayneall_indicator', csvfile='wayne_county_2007_2014.tsv')
# We have some data, now lets choose a model and some metrics, before putting them into experiment objects.
lr1 = LogisticRegression()
lr2 = LogisticRegression()
lrb1 = LogisticRegression(class_weight='balanced')
lrb2 = LogisticRegression(class_weight='balanced')
ada1 = AdaBoostClassifier()
ada2 = AdaBoostClassifier()
timemodels = [lr1, lr2]
criterion = accuracy_score # Thankfully this task has a pretty easy evaluation... you either get it right or wrong
# Getting temporally contiguous cuts of data, putting them into different experiments
data_time1 = wayne_all.get_dates([2007, 2008, 2009, 2010])
expmt_time1 = StratifiedExperiment(timemodels[0], criterion, data_time1[:, :-1], data_time1[:, -1], test_size=0.8)
data_time2 = wayne_all.get_dates([2011, 2012, 2013, 2014])
expmt_time2 = StratifiedExperiment(timemodels[1], criterion, data_time2[:, :-1], data_time2[:, -1], test_size=0.8)
def do_runs(input_experiment, runs):
trainvals = []
testvals = []
for run in range(runs):
trainval, testval = input_experiment.run()
trainvals.append(trainval)
testvals.append(testval)
print("\nTraining Validation: %6.2f, Testing Validation: %6.2f" % (np.mean(trainvals), np.mean(testvals)))
def get_conmat(input_model, features, labels, runs):
conmat = np.zeros([2, 2])
for run in range(runs):
conmat += confusion_matrix(labels, input_model.predict(features))
conmat = conmat / np.sum(conmat)
print("\n")
print(conmat)
# do_runs(expmt_time1, 10)
# do_runs(expmt_time2, 10)
#
#
# # Plotting the results of the time experiment
#
# plt.figure(0)
# tickpos = np.arange(len(wayne_all.vector_headers))*2
# one = plt.barh(tickpos, timemodels[0].coef_[0], align='center', alpha=0.5, height=2)
# plt.yticks(tickpos, wayne_all.vector_headers)
#
# two = plt.barh(tickpos, timemodels[1].coef_[0], align='center', alpha=0.5, height=2)
#
# plt.legend((one, two), ('2007-2010', '2011-2014'))
# plt.xlabel("Model Weight")
# plt.tick_params(axis='both', which='major', labelsize=8, rotation=45)
#
# plt.show()
# Just doing an experiment over all time now that we've established the distributions the observations are drawn from
# are fairly similar
# To investigate failures let's look at the confusion matrix from multiple runs as well.
#I'm going to make a meta-split in the data used to train and test models, so we can compare experiment types.
## Trying stratified
strat_model = AdaBoostClassifier()
stratified_experiment = StratifiedExperiment(strat_model, criterion, wayne_all.data[:, :-1], wayne_all.data[:, -1],
test_size=0.2)
train_idx, test_idx = stratified_experiment.partition(wayne_all.data[:, :-1], wayne_all.data[:, -1])
stratified_experiment = StratifiedExperiment(strat_model,
criterion, wayne_all.data[train_idx, :-1],
wayne_all.data[train_idx, -1], test_size=0.2)
# Neural net experiment
nn_model = LoanPytorchModel(wayne_all.data[:, :-1].shape[1], 1, batch_size=10, epochs=10, layers=4)
nn_experiment = StratifiedExperiment(nn_model, criterion, wayne_all.data[train_idx, :-1], wayne_all.data[train_idx, -1],
test_size=0.2)
print("\nSTAT EXPERIMENT NEURAL NETWORK PERFORMANCE:")
do_runs(nn_experiment, 1)
get_conmat(nn_model, wayne_all.data[test_idx, :-1], wayne_all.data[test_idx, -1], 10)
print("\nBINARY BALANCED NEURAL NETWORK PERFORMANCE:")
nn_experiment = BinaryBalancedExperiment(nn_model, criterion, wayne_all.data[train_idx, :-1], wayne_all.data[train_idx, -1],
test_size=0.2)
do_runs(nn_experiment, 1)
get_conmat(nn_model, wayne_all.data[test_idx, :-1], wayne_all.data[test_idx, -1], 10)
print("\nSTRATIFIED:")
do_runs(stratified_experiment, 10)
get_conmat(strat_model, wayne_all.data[test_idx, :-1], wayne_all.data[test_idx, -1], 10)
## Trying balanced partitioning
balanced_model = AdaBoostClassifier()
balanced_experiment = BinaryBalancedExperiment(balanced_model, criterion,
wayne_all.data[train_idx, :-1], wayne_all.data[train_idx, -1],
test_size=0.2)
print("\nBALANCED:")
do_runs(balanced_experiment, 10)
get_conmat(balanced_model, wayne_all.data[test_idx, :-1], wayne_all.data[test_idx, -1], 10)
## Trying weight balancing
weight_balanced_model = LogisticRegression(class_weight='balanced')
weight_balanced_experiment = StratifiedExperiment(weight_balanced_model, criterion,
wayne_all.data[train_idx, :-1], wayne_all.data[train_idx, -1],
test_size=0.8)
print("\nWEIGHT BALANCED:")
do_runs(weight_balanced_experiment, 10)
get_conmat(weight_balanced_model, wayne_all.data[test_idx, :-1], wayne_all.data[test_idx, -1], 10)
## Going back and checking confusion matrices for balanced LRs in different times
timemodels = [LogisticRegression(class_weight='balanced'), LogisticRegression(class_weight='balanced')]
expmt_time1 = StratifiedExperiment(timemodels[0], criterion, data_time1[:, :-1], data_time1[:, -1], test_size=0.8)
expmt_time2 = StratifiedExperiment(timemodels[1], criterion, data_time2[:, :-1], data_time2[:, -1], test_size=0.8)
# Quick model training
do_runs(expmt_time1, 1)
do_runs(expmt_time2, 1)
# Predictions of time models on first timeblock
time1_test_features = data_time1[expmt_time1.last_test_idx, :-1]
time1_test_labels = data_time1[expmt_time1.last_test_idx, -1]
print("\n2007-2010-trained classification behavior on 2007-2010 data")
get_conmat(timemodels[0], time1_test_features, time1_test_labels, 10)
print("\n2011-2014-trained classification behavior on 2007-2010 data")
get_conmat(timemodels[1], time1_test_features, time1_test_labels, 10)
# Predictions of time models on second timeblock
time2_test_features = data_time2[expmt_time2.last_test_idx, :-1]
time2_test_labels = data_time2[expmt_time2.last_test_idx, -1]
print("\n2007-2010-trained classification behavior on 2011-2014 data")
get_conmat(timemodels[0], time2_test_features, time2_test_labels, 10)
print("\n2011-2014-trained classification behavior on 2007-2010 data")
get_conmat(timemodels[1], time2_test_features, time2_test_labels, 10)