forked from leopiney/deep-forest
-
Notifications
You must be signed in to change notification settings - Fork 0
/
deep_forest.py
320 lines (266 loc) · 10.7 KB
/
deep_forest.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
#
# Inspired by https://arxiv.org/abs/1702.08835 and https://github.com/STO-OTZ/my_gcForest/
#
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_predict
from utils import create_logger, rolling_window
class MGCForest():
    """
    Multi-Grained Cascade Forest.

    Wires together a set of MultiGrainedScanner instances (one per stride
    ratio) and a CascadeForest: `fit`/`predict` first scan the raw input
    with every scanner, horizontally stack the resulting class-probability
    features, and feed them to the cascade.

    @param estimators_config A dictionary containing the configurations for the estimators of
    the estimators of the MultiGrainedScanners and the CascadeForest.
    @param stride_ratios A sequence of stride ratios, one per MultiGrainedScanner instance.
    @param folds The number of k-folds to use (applied to both the scanners and the cascade).
    @param verbose Adds verbosity.
    Example:
    estimators_config={
        'mgs': [{
            'estimator_class': ExtraTreesClassifier,
            'estimator_params': {
                'n_estimators': 30,
                'min_samples_split': 21,
                'n_jobs': -1,
            }
        }],
        'cascade': [{
            'estimator_class': ExtraTreesClassifier,
            'estimator_params': {
                'n_estimators': 1000,
                'min_samples_split': 11,
                'max_features': 1,
                'n_jobs': -1,
            }
        }]
    },
    """
    def __init__(
        self,
        estimators_config,
        stride_ratios=(1.0 / 4, 1.0 / 9, 1.0 / 16),  # tuple: avoid mutable default
        folds=3,
        verbose=False
    ):
        self.mgs_instances = [
            MultiGrainedScanner(
                estimators_config['mgs'],
                stride_ratio=stride_ratio,
                folds=folds,
                verbose=verbose,
            )
            for stride_ratio in stride_ratios
        ]
        # Keep a list so __repr__ output matches the historical format.
        self.stride_ratios = list(stride_ratios)

        # Forward `folds` so the cascade honors the configured k-fold count
        # instead of silently falling back to its own default.
        self.c_forest = CascadeForest(
            estimators_config['cascade'], folds=folds, verbose=verbose
        )

    def fit(self, X, y):
        """Fit every scanner on (X, y), then fit the cascade on the stacked scan features."""
        scanned_X = np.hstack([
            mgs.fit(X, y)
            for mgs in self.mgs_instances
        ])

        self.c_forest.fit(scanned_X, y)

    def predict(self, X):
        """Scan X with every fitted scanner and return the cascade's class predictions."""
        scan_pred = np.hstack([
            mgs.predict(X)
            for mgs in self.mgs_instances
        ])

        return self.c_forest.predict(scan_pred)

    def __repr__(self):
        return '<MGCForest {}>'.format(self.stride_ratios)
class MultiGrainedScanner():
    """
    Multi-Grained Scanner

    Slides windows over each sample, trains one estimator set per window
    position, and emits the concatenated class probabilities as features.

    @param estimators_config A list containing the class and parameters of the estimators for
        the MultiGrainedScanner.
    @param stride_ratio The stride ratio to use for slicing the input.
    @param folds The number of k-folds to use.
    @param verbose Adds verbosity.
    """
    def __init__(
        self, estimators_config, stride_ratio=0.25, folds=3, verbose=False
    ):
        self.estimators_config = estimators_config
        self.stride_ratio = stride_ratio
        self.folds = folds

        # One list of estimators per window position; filled in by fit().
        self.windows_estimators = []

        self.logger = create_logger(self, verbose)

    def slices(self, X):
        """
        Given an input X with dimention N, return a ndarray of dimention 3 with all the instances
        values for each window.

        For example, if the input has shape (10, 400), and the stride_ratio is 0.25, then this
        will generate 301 windows with shape (10, 100). The final result would have a shape of
        (301, 10, 100).
        """
        self.logger.debug('Slicing X with shape {}'.format(X.shape))

        sample_shape = list(X[0].shape)
        # Window size is stride_ratio of each sample dimension, but at least 1.
        # Cast to plain Python int: np.int16 would silently overflow for
        # sample dimensions larger than 32767.
        window_shape = np.maximum(
            np.array([s * self.stride_ratio for s in sample_shape]), 1
        ).astype(int)
        # Log the window shape values themselves (previously this logged
        # `window_shape.shape`, which is always just the dimensionality).
        self.logger.debug('Got window shape: {}'.format(window_shape))

        #
        # Calculate the windows that are going to be used and the total
        # number of new generated samples.
        #
        windows_count = [sample_shape[i] - window_shape[i] + 1 for i in range(len(sample_shape))]
        new_instances_total = np.prod(windows_count)
        self.logger.debug('Slicing {} windows.'.format(windows_count))

        #
        # For each sample, get all the windows with their values
        #
        sliced_X = np.array([
            rolling_window(x, window_shape)
            for x in X
        ])

        #
        # Swap the 0 and 1 axis so as to get for each window, the value of each sample.
        #
        sliced_X = np.swapaxes(sliced_X, 0, 1)

        # Flatten windows with more than one data axis so each row is a 1-D
        # feature vector the estimators can consume.
        if len(sliced_X.shape) > 3:
            shape = list(sliced_X.shape)
            sliced_X = sliced_X.reshape(shape[:2] + [np.prod(shape[2:])])

        self.logger.info(
            'Scanning turned X ({}) into sliced_X ({}). {} new instances were added '
            'per sample'.format(X.shape, sliced_X.shape, new_instances_total)
        )

        return sliced_X

    def fit(self, X, y):
        """
        Slice the input and for each window creates the estimators and save the estimators in
        self.window_estimators. Then for each window, fit the estimators with the data of all
        the samples values on that window and perform a cross_val_predict and get the predictions.
        """
        self.logger.info('Scanning and fitting for X ({}) and y ({}) started'.format(
            X.shape, y.shape
        ))
        self.n_classes = np.unique(y).size
        sliced_X = self.slices(X)

        #
        # Create an estimator for each generated window
        #
        self.windows_estimators = []
        predictions = []
        for window_index, window_X in enumerate(sliced_X):
            estimators = [
                estimator_config['estimator_class'](**estimator_config['estimator_params'])
                for estimator_config in self.estimators_config
            ]
            self.windows_estimators.append(estimators)

            self.logger.debug(
                'Window #{}:: Training estimators for window with shape {}'.format(
                    window_index, window_X.shape
                )
            )
            for estimator_index, estimator in enumerate(estimators):
                self.logger.debug(
                    'Window #{}:: Fitting estimator #{} ({})'
                    .format(window_index, estimator_index, estimator.__class__)
                )
                # Fit on all data: this fitted estimator is the one used at
                # predict() time.
                estimator.fit(window_X, y)

                #
                # Gets a prediction of sliced_X with shape (len(newX), n_classes).
                # The method `predict_proba` returns a vector of size n_classes.
                # cross_val_predict clones the estimator, so the out-of-fold
                # probabilities used as features are not overfit.
                #
                self.logger.debug('Window #{}:: Cross-validation with estimator #{} ({})'.format(
                    window_index, estimator_index, estimator.__class__
                ))
                prediction = cross_val_predict(
                    estimator,
                    window_X,
                    y,
                    cv=self.folds,
                    method='predict_proba',
                    n_jobs=-1,
                )

                predictions.append(prediction)

        self.logger.info('Finished fitting X ({}) and got predictions with shape {}'.format(
            X.shape, np.array(predictions).shape
        ))
        # Features for the cascade: (n_samples, n_windows * n_estimators * n_classes).
        return np.hstack(predictions)

    def predict(self, X):
        """Slice X and return the stacked class probabilities from every fitted estimator."""
        self.logger.info('Predicting X ({})'.format(X.shape))
        sliced_X = self.slices(X)

        return np.hstack([
            estimator
            .predict_proba(window_X)
            for window_X, window_estimators in zip(sliced_X, self.windows_estimators)
            for estimator in window_estimators
        ])

    def __repr__(self):
        return '<MultiGrainedScanner stride_ratio={}>'.format(self.stride_ratio)
class CascadeForest():
    """
    CascadeForest

    Grows a stack of forest "levels": each level is trained on the input
    features augmented with the class probabilities produced by the previous
    level, and growing stops as soon as the cross-validated accuracy stops
    improving.

    @param estimators_config A list containing the class and parameters of the estimators for
        the CascadeForest.
    @param folds The number of k-folds to use.
    @param verbose Adds verbosity.
    """
    def __init__(self, estimators_config, folds=3, verbose=False):
        self.estimators_config = estimators_config
        self.folds = folds
        self.logger = create_logger(self, verbose)

    def fit(self, X, y):
        """Grow levels on (X, y) until accuracy no longer improves."""
        self.logger.info('Cascade fitting for X ({}) and y ({}) started'.format(X.shape, y.shape))
        self.classes = np.unique(y)
        self.level = 0
        self.levels = []
        self.max_score = None
        while True:
            self.logger.info('Level #{}:: X with shape: {}'.format(self.level + 1, X.shape))

            # A fresh set of estimators for this level.
            level_estimators = [
                config['estimator_class'](**config['estimator_params'])
                for config in self.estimators_config
            ]

            level_probas = []
            for forest in level_estimators:
                self.logger.debug('Fitting X ({}) and y ({}) with estimator {}'.format(
                    X.shape, y.shape, forest
                ))
                # Fit on all data; this is the estimator kept for predict().
                forest.fit(X, y)

                #
                # Gets a prediction of X with shape (len(X), n_classes)
                # via out-of-fold probabilities (cross_val_predict clones the
                # estimator, so these are not overfit).
                #
                level_probas.append(cross_val_predict(
                    forest,
                    X,
                    y,
                    cv=self.folds,
                    method='predict_proba',
                    n_jobs=-1,
                ))

            self.logger.info('Level {}:: got all predictions'.format(self.level + 1))

            #
            # Stacks horizontally the predictions to each of the samples in X
            #
            X = np.hstack([X] + level_probas)

            #
            # For each sample, compute the average of predictions of all the estimators, and take
            # the class with maximum score for each of them.
            #
            y_prediction = self.classes.take(
                np.array(level_probas).mean(axis=0).argmax(axis=1)
            )
            score = accuracy_score(y, y_prediction)
            self.logger.info('Level {}:: got accuracy {}'.format(self.level + 1, score))

            # No improvement: drop this level's estimators and stop growing.
            if self.max_score is not None and score <= self.max_score:
                break

            self.level += 1
            self.max_score = score
            self.levels.append(level_estimators)

    def predict(self, X):
        """Run X through every level and return the last level's class predictions."""
        for level_estimators in self.levels:
            probas = [forest.predict_proba(X) for forest in level_estimators]

            self.logger.info('Shape of predictions: {} shape of X: {}'.format(
                np.array(probas).shape, X.shape
            ))

            # Augment the features with this level's probabilities for the next one.
            X = np.hstack([X] + probas)

        # Average the final level's probabilities and pick the winning class.
        return self.classes.take(
            np.array(probas).mean(axis=0).argmax(axis=1)
        )

    def __repr__(self):
        return '<CascadeForest forests={}>'.format(len(self.estimators_config))