forked from Yangqing/iceberk
-
Notifications
You must be signed in to change notification settings - Fork 0
/
classifier.py
528 lines (479 loc) · 18.7 KB
/
classifier.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
'''
mpiclassify
====
Provides an MPI interface that trains linear classifiers that can be represented
by
\min_w 1/N * sum_n L(y_n,w'x_n+b) + gamma * Reg(w)
This algorithm only deals with the primal case (no dual), assuming that there
are more data points than feature dimensions (if not, you might
want to look for dual solvers for your problem). We use L-BFGS as the default
solver, and if the loss function or regularizer is not differentiable everywhere
(like the v-style L1 regularizer), we will use subgradients instead.
'''
from iceberk import mpi, mathutil
import logging
import numpy as np
from scipy import optimize
from sklearn import metrics
_FMIN = optimize.fmin_l_bfgs_b
def to_one_of_k_coding(Y):
    '''Convert the label vector Y into a one-of-K coding matrix.

    Input:
        Y: the local label vector (one-dimensional). Labels are cast to int
            and used as column indices.
    Output:
        Yout: a matrix of shape (len(Y), K) whose entries are 1 at the column
            given by each label and -1 everywhere else. K is the largest
            label over all MPI nodes plus one.
    Raises:
        ValueError: if Y has more than one dimension.
    '''
    if Y.ndim > 1:
        raise ValueError("The input Y should be a vector.")
    # The number of classes has to agree on every node, so take the global
    # maximum label. Cast to int so that float-typed labels still produce a
    # valid array shape.
    K = int(mpi.COMM.allreduce(Y.max(), op=max)) + 1
    Yout = -np.ones((len(Y), K))
    Yout[np.arange(len(Y)), Y.astype(int)] = 1
    return Yout
def feature_meanstd(mat):
    '''
    Compute the per-dimension mean and standard deviation over all MPI nodes.

    The matrix is centered in place while the second moment is accumulated
    and the mean is added back before returning, so mat is only temporarily
    modified.
    Input:
        mat: the local data matrix, each row is a feature vector and each
            column is a feature dim
    Output:
        m: the mean for each dimension
        std: the standard deviation for each dimension (with machine epsilon
            added so that it is never exactly zero)
    '''
    total_rows = mpi.COMM.allreduce(mat.shape[0])
    # global mean
    mean = np.empty_like(mat[0])
    mpi.COMM.Allreduce(mat.sum(axis=0), mean)
    mean /= total_rows
    # center in place to avoid holding a second copy of the data
    mat -= mean
    second_moment = np.empty_like(mat[0])
    mpi.COMM.Allreduce((mat ** 2).sum(axis=0), second_moment)
    second_moment /= total_rows
    # undo the centering so the caller's matrix is restored
    mat += mean
    # the epsilon acts as a small regularization term
    stddev = np.sqrt(second_moment) + np.finfo(np.float64).eps
    return mean, stddev
class Solver(object):
    '''
    Solver is the general solver to deal with bookkeeping stuff
    '''

    def __init__(self, gamma, loss, reg,
                 lossargs=None, regargs=None, fminargs=None):
        '''
        Initializes the solver.
        Input:
            gamma: the regularization parameter
            loss: the loss function. It is called as
                loss(Y, pred, weight, **lossargs) and should return the loss
                value and the gradient with respect to pred.
            reg: the regularization func. It is called as
                reg(w, **regargs) and should return the regularization term
                value and the gradient with respect to w.
            lossargs: the arguments that should be passed to the loss function
            regargs: the arguments that should be passed to the regularizer
            fminargs: additional arguments that you may want to pass to fmin.
                you can check the fmin function to see what arguments can be
                passed (like display options: {'disp':1}).
        The three argument dictionaries are copied, so the solver never
        modifies the dictionaries owned by the caller.
        '''
        self._gamma = gamma
        self.loss = loss
        self.reg = reg
        # Copy instead of sharing: _add_default_fminargs writes into
        # self._fminargs, which must not leak into other solvers or into the
        # caller's dictionary.
        self._lossargs = {} if lossargs is None else dict(lossargs)
        self._regargs = {} if regargs is None else dict(regargs)
        self._fminargs = {} if fminargs is None else dict(fminargs)
        self._add_default_fminargs()

    def _add_default_fminargs(self):
        '''
        This function adds some default args to fmin, if we have not explicitly
        specified them.
        '''
        self._fminargs.setdefault('maxfun', 1000)
        self._fminargs.setdefault('disp', 1)
        # even when fmin displays outputs, we set non-root display to none
        if not mpi.is_root():
            self._fminargs['disp'] = 0

    @staticmethod
    def obj(wb, solver):
        """The objective function to be used by fmin. Subclasses should
        return the function value and the gradient with respect to wb.
        """
        raise NotImplementedError

    def presolve(self, X, Y, weight, param_init):
        """This function is called before we call lbfgs. It should return a
        vector that is the initialization of the lbfgs.
        """
        raise NotImplementedError

    def postsolve(self, lbfgs_result):
        """This function deals with the post-processing of the lbfgs result. It
        should return the optimal parameter for the classifier.
        """
        raise NotImplementedError

    def solve(self, X, Y, weight=None, param_init=None):
        """Runs presolve, the lbfgs optimization and postsolve.
        Input:
            X, Y: the local data and labels, forwarded to presolve
            weight: (optional) per-datum weights, forwarded to presolve
            param_init: (optional) initial parameters, forwarded to presolve
        Output:
            whatever postsolve returns for the lbfgs result.
        """
        param_init = self.presolve(X, Y, weight, param_init)
        logging.info('Solver: running lbfgs...')
        # the solver itself is passed as the extra argument of obj
        result = _FMIN(self.__class__.obj, param_init,
                       args=(self,), **self._fminargs)
        return self.postsolve(result)
class SolverSC(Solver):
    """The solver for a single output (one weight vector plus one bias).
    Output:
        w, b : the learned weights and bias
    """

    def presolve(self, X, Y, weight, param_init):
        """Caches the local data on the solver and returns the starting point.
        """
        num_local = X.shape[0]
        # flatten every datum into a single feature row
        self._X = X.reshape((num_local, np.prod(X.shape[1:])))
        self._Y = Y
        self._weight = weight
        # the effective number of data: a plain count, or the total weight
        local_mass = num_local if weight is None else weight.sum()
        self._num_data = mpi.COMM.allreduce(local_mass)
        self._dim = self._X.shape[1]
        if param_init is None:
            return np.zeros(self._dim + 1)
        # just to make sure every node starts from the same parameters
        mpi.COMM.Bcast(param_init)
        return param_init

    def postsolve(self, lbfgs_result):
        """Splits the optimal parameter vector into (w, b).
        """
        solution = lbfgs_result[0]
        return solution[:-1], solution[-1]

    @staticmethod
    def obj(param, solver):
        '''The objective function used by fmin: returns the function value
        and the gradient, both reduced over all nodes.
        '''
        weights, bias = param[:-1], param[-1]
        # the prediction is a vector with one score per local datum
        scores = np.dot(solver._X, weights) + bias
        local_f, grad_scores = solver.loss(solver._Y, scores, solver._weight,
                                           **solver._lossargs)
        # chain rule: gradient for the weights first, the bias last
        local_g = np.empty(param.shape)
        local_g[:-1] = np.dot(grad_scores, solver._X)
        local_g[-1] = grad_scores.sum()
        # every node adds 1/SIZE of the regularization term, so that the
        # reduction below counts it exactly once
        reg_f, reg_g = solver.reg(weights, **solver._regargs)
        reg_scale = solver._num_data * solver._gamma / mpi.SIZE
        local_f += reg_scale * reg_f
        local_g[:-1] += reg_scale * reg_g
        # mpi reduction
        mpi.barrier()
        total_f = mpi.COMM.allreduce(local_f)
        total_g = np.empty(local_g.shape)
        mpi.COMM.Allreduce(local_g, total_g)
        return total_f, total_g
class SolverMC(Solver):
    '''SolverMC is the multi-output counterpart of the single-class solver.
    The input Y can be either a vector of labels (starting from 0), or a
    matrix whose values are -1 or 1. It is the caller's job to make sure
    that the format of Y matches what the loss function expects.
    '''

    def presolve(self, X, Y, weight, param_init):
        """Caches the local data on the solver and returns the starting point.
        """
        self._X = X.reshape((X.shape[0], np.prod(X.shape[1:])))
        if Y.ndim == 1:
            # label vector: the number of outputs is the global max label + 1
            self._K = mpi.COMM.allreduce(Y.max(), op=max) + 1
        else:
            # anything else is flattened to a two-dimensional matrix
            Y = Y.reshape((Y.shape[0], np.prod(Y.shape[1:])))
            self._K = Y.shape[1]
        self._Y = Y
        self._weight = weight
        # the effective number of data: a plain count, or the total weight
        if weight is not None:
            self._num_data = mpi.COMM.allreduce(weight.sum())
        else:
            self._num_data = mpi.COMM.allreduce(X.shape[0])
        self._dim = self._X.shape[1]
        if param_init is None:
            return np.zeros(self._K * (self._dim + 1))
        # just to make sure every node starts from the same parameters
        mpi.COMM.Bcast(param_init)
        return param_init

    def postsolve(self, lbfgs_result):
        """Splits the optimal parameter vector into the weight matrix w of
        shape (dim, K) and the bias vector b.
        """
        solution = lbfgs_result[0]
        num_weights = self._K * self._dim
        weights = solution[:num_weights].reshape(self._dim, self._K).copy()
        bias = solution[num_weights:].copy()
        return weights, bias

    @staticmethod
    def obj(wb, solver):
        '''
        The objective function used by fmin: returns the function value and
        the gradient, both reduced over all nodes.
        '''
        K = solver._K
        dim = solver._dim
        num_weights = K * dim
        # the flat parameter holds the weight matrix first, the bias last
        w = wb[:num_weights].reshape((dim, K))
        b = wb[num_weights:]
        # scores form a matrix of size [num_datalocal, K]
        scores = mathutil.dot(solver._X, w)
        scores += b
        local_f, grad_scores = solver.loss(solver._Y, scores, solver._weight,
                                           **solver._lossargs)
        local_g = np.empty(wb.shape)
        local_g[:num_weights] = mathutil.dot(solver._X.T, grad_scores).flat
        local_g[num_weights:] = grad_scores.sum(axis=0)
        # every node adds 1/SIZE of the regularization term, so that the
        # reduction below counts it exactly once
        reg_f, reg_g = solver.reg(w, **solver._regargs)
        local_f += solver._num_data * solver._gamma * reg_f / mpi.SIZE
        local_g[:num_weights] += (solver._num_data * solver._gamma / mpi.SIZE
                                  * reg_g.ravel())
        # mpi reduction
        mpi.barrier()
        total_f = mpi.COMM.allreduce(local_f)
        total_g = np.empty(local_g.shape, dtype=local_g.dtype)
        mpi.COMM.Allreduce(local_g, total_g)
        return total_f, total_g
class Loss(object):
    """LOSS defines commonly used loss functions
    For all loss functions:
    Input:
        Y: a vector or matrix of true labels
        pred: prediction, has the same shape as Y (for the rank losses, Y is
            a label vector and pred is a score matrix).
        weight: None, or a vector holding one weight per datum (per row).
    Return:
        f: the loss function value
        g: the gradient w.r.t. pred, has the same shape as pred.
    """

    def __init__(self):
        """All functions in Loss should be static
        """
        raise NotImplementedError("Loss should not be instantiated!")

    @staticmethod
    def _align_weight(weight, ref):
        """Reshape per-datum weights so they broadcast over the rows of ref.
        """
        weight = np.asarray(weight)
        return weight.reshape(weight.shape + (1,) * (ref.ndim - weight.ndim))

    @staticmethod
    def loss_l2(Y, pred, weight, **kwargs):
        '''
        The l2 loss: f = ||Y - pred||_{fro}^2
        With weights, each datum's squared error is scaled by its weight.
        Works for both vector and matrix predictions.
        '''
        diff = pred - Y
        if weight is None:
            return np.dot(diff.flat, diff.flat), 2. * diff
        w = Loss._align_weight(weight, diff)
        return (w * diff ** 2).sum(), 2. * diff * w

    @staticmethod
    def loss_hinge(Y, pred, weight, **kwargs):
        '''The SVM hinge loss. Input Y should have values 1 or -1
        '''
        margin = np.maximum(0., 1. - Y * pred)
        # only entries that violate the margin contribute to the gradient
        active = margin > 0
        if weight is None:
            return margin.sum(), -Y * active
        # weights are per datum, so they have to broadcast along the rows
        # when Y and pred are matrices
        w = Loss._align_weight(weight, margin)
        return (w * margin).sum(), -Y * w * active

    @staticmethod
    def loss_squared_hinge(Y, pred, weight, **kwargs):
        ''' The squared hinge loss. Input Y should have values 1 or -1
        '''
        margin = np.maximum(0., 1. - Y * pred)
        if weight is None:
            return np.dot(margin.flat, margin.flat), -2. * Y * margin
        w = Loss._align_weight(weight, margin)
        return (w * margin ** 2).sum(), -2. * Y * w * margin

    @staticmethod
    def loss_bnll(Y, pred, weight, **kwargs):
        '''
        the BNLL loss: f = log(1 + exp(-y * pred))
        '''
        # expnyp is exp(-y * pred)
        expnyp = mathutil.exp(-Y * pred)
        expnyp_plus = 1. + expnyp
        if weight is None:
            return np.sum(np.log(expnyp_plus)), -Y * expnyp / expnyp_plus
        w = Loss._align_weight(weight, expnyp)
        return (w * np.log(expnyp_plus)).sum(), \
            -Y * w * expnyp / expnyp_plus

    @staticmethod
    def loss_rank_hinge(Y, pred, weight, **kwargs):
        """The rank loss: the score of the true label should be higher
        than the other scores by a margin, and hinge loss is used to compute
        the loss.
        Input:
            Y: a vector indicating the true labels (used as column indices)
            pred: a matrix indicating the scores for each label
        """
        N = len(Y)
        rows = np.arange(N)
        score_gt = pred[rows, Y]
        diff = pred - (score_gt - 1.)[:, np.newaxis]
        # diff_hinge will be the hinge loss for each class, except for the
        # ground truth where it is 1 (instead of 0)
        diff_hinge = np.maximum(diff, 0.)
        # for the gradient of non-ground truth predictions, it's simply
        # a boolean value. For the ground truth prediction, it's the negated
        # number of violations
        g = (diff > 0).astype(np.float64)
        g[rows, Y] = 1. - g.sum(axis=1)
        if weight is None:
            # subtract N to remove the ground truth offset
            f = diff_hinge.sum() - N
        else:
            weight = np.asarray(weight)
            # the ground truth offset of each datum is scaled by its weight,
            # so the total weight has to be subtracted here
            f = np.dot(weight, diff_hinge).sum() - weight.sum()
            g *= weight[:, np.newaxis]
        return f, g

    @staticmethod
    def loss_rank_squared_hinge(Y, pred, weight, **kwargs):
        """The rank-based squared hinge loss

        Not available: this loss was never finished, so calling it always
        raises NotImplementedError.
        """
        raise NotImplementedError("Yangqing still needs to debug this")
class Reg(object):
    '''
    REG collects the regularization functions used by the solvers.
    Every regularizer follows the same convention:
    Input:
        w: the weight vector, or the weight matrix in the case of multiple classes
    Return:
        f: the value of the regularization term
        g: the gradient w.r.t. w, with the same shape as w.
    '''

    @staticmethod
    def reg_l2(w, **kwargs):
        '''
        Squared l2 norm: f = ||w||_2^2, g = 2 w
        '''
        value = np.dot(w.flat, w.flat)
        gradient = 2. * w
        return value, gradient

    @staticmethod
    def reg_l1(w, **kwargs):
        '''
        l1 norm: f = ||w||_1, with a subgradient for g
        '''
        subgrad = np.sign(w)
        # at exactly zero the l1 norm is not differentiable; 0.5 is the
        # subgradient value this module uses there
        at_zero = (subgrad == 0)
        subgrad[at_zero] = 0.5
        value = np.abs(w).sum()
        return value, subgrad
class Evaluator(object):
    """Evaluator implements some commonly-used criteria for evaluation
    """

    @staticmethod
    def mse(Y, pred, axis=None):
        """Return the mean squared error of the true value and the prediction
        Input:
            Y, pred: the true value and the prediction
            axis: (optional) if Y and pred are matrices, you can specify the
                axis along which the mean is carried out.
        Note that only the local data is used; no MPI reduction is done.
        """
        return ((Y - pred) ** 2).mean(axis=axis)

    @staticmethod
    def accuracy(Y, pred):
        """Computes the accuracy over the data of all MPI nodes
        Input:
            Y, pred: two vectors containing discrete labels
            If pred is a matrix instead of a vector, then argmax is used to get
            the discrete label.
        """
        if pred.ndim == 2:
            pred = pred.argmax(axis=1)
        correct = mpi.COMM.allreduce((Y == pred).sum())
        num_data = mpi.COMM.allreduce(len(Y))
        return float(correct) / num_data

    @staticmethod
    def accuracy_class_averaged(Y, pred):
        """Computes the accuracy, but averaged over classes instead of averaged
        over data points.
        Input:
            Y: the ground truth vector
            pred: a vector containing the predicted labels. If pred is a matrix
                instead of a vector, then argmax is used to get the discrete label.
        Note that only the local data is used (no MPI reduction is done), and
        that a class in range(Y.max() + 1) without any datum in Y makes the
        result nan.
        """
        if pred.ndim == 2:
            pred = pred.argmax(axis=1)
        # int() so that float-typed label vectors can still drive range()
        num_classes = int(Y.max()) + 1
        accuracy = 0.0
        # np.float64 instead of the np.float alias, which newer NumPy
        # releases no longer provide
        correct = (Y == pred).astype(np.float64)
        for i in range(num_classes):
            accuracy += correct[Y == i].mean()
        accuracy /= num_classes
        return accuracy

    @staticmethod
    def _count_top_k_hits(Y, pred, k):
        """Counts the local data whose true label is among the k best scores.
        """
        # argsort is ascending, so the last k columns of every row hold the
        # ids of the k highest-scoring classes
        top_k_id = np.argsort(pred, axis=1)[:, -k:]
        return (top_k_id == Y[:, np.newaxis]).sum()

    @staticmethod
    def top_k_accuracy(Y, pred, k):
        """Computes the top k accuracy over the data of all MPI nodes
        Input:
            Y: a vector containing the discrete labels of each datum
            pred: a matrix of size len(Y) * num_classes, each row containing the
                real value scores for the corresponding label. The classes with
                the highest k scores will be considered.
            k: the number of top-scoring classes to consider
        """
        if k > pred.shape[1]:
            logging.warning("Warning: k is larger than the number of classes, "
                            "so the accuracy would always be one.")
        correct = mpi.COMM.allreduce(Evaluator._count_top_k_hits(Y, pred, k))
        num_data = mpi.COMM.allreduce(len(Y))
        return float(correct) / num_data

    @staticmethod
    def average_precision(Y, pred):
        """Average Precision for binary classification
        Input:
            Y: the ground truth vector; entries equal to 1 are the positives
            pred: the vector of scores
        The value is computed on the root node and broadcast to all nodes.
        """
        # since we need to compute the precision recall curve, we have to
        # compute this on the root node.
        Y = mpi.COMM.gather(Y)
        pred = mpi.COMM.gather(pred)
        if mpi.is_root():
            Y = np.hstack(Y)
            pred = np.hstack(pred)
            precision, recall, _ = metrics.precision_recall_curve(
                Y == 1, pred)
            ap = metrics.auc(recall, precision)
        else:
            ap = None
        mpi.barrier()
        return mpi.COMM.bcast(ap)

    @staticmethod
    def average_precision_multiclass(Y, pred):
        """Average Precision for multiple class classification: the mean of
        the binary average precisions of each class against the rest.
        Input:
            Y: the ground truth label vector
            pred: a matrix with one column of scores per class
        """
        K = pred.shape[1]
        aps = [Evaluator.average_precision(Y == k, pred[:, k])
               for k in range(K)]
        return np.asarray(aps).mean()
'''
Utility functions that wrap often-used solver configurations
'''
def svm_binary(X, Y, gamma, weight=None, **kwargs):
    """Trains a binary linear SVM: hinge loss with l2 regularization.
    Input:
        X, Y: the local data and labels (Y should have values 1 or -1)
        gamma: the regularization parameter
        weight: (optional) per-datum weights
        kwargs: forwarded to the SolverSC constructor
    Output:
        w, b: the learned weights and bias
    """
    return SolverSC(gamma, Loss.loss_hinge, Reg.reg_l2,
                    **kwargs).solve(X, Y, weight)
def l2svm_binary(X, Y, gamma, weight=None, **kwargs):
    """Trains a binary linear SVM with the squared hinge loss and l2
    regularization.
    Input:
        X, Y: the local data and labels (Y should have values 1 or -1)
        gamma: the regularization parameter
        weight: (optional) per-datum weights
        kwargs: forwarded to the SolverSC constructor
    Output:
        w, b: the learned weights and bias
    """
    return SolverSC(gamma, Loss.loss_squared_hinge, Reg.reg_l2,
                    **kwargs).solve(X, Y, weight)
def svm_onevsall(X, Y, gamma, weight=None, **kwargs):
    """Trains one-vs-all linear SVMs: hinge loss with l2 regularization.
    Input:
        X: the local data
        Y: either a label vector, which is converted to the -1/1 one-of-K
            coding, or a matrix that is used as it is
        gamma: the regularization parameter
        weight: (optional) per-datum weights
        kwargs: forwarded to the SolverMC constructor
    Output:
        w, b: the learned weight matrix and bias vector
    """
    targets = to_one_of_k_coding(Y) if Y.ndim == 1 else Y
    return SolverMC(gamma, Loss.loss_hinge, Reg.reg_l2,
                    **kwargs).solve(X, targets, weight)
def l2svm_onevsall(X, Y, gamma, weight=None, **kwargs):
    """Trains one-vs-all linear SVMs with the squared hinge loss and l2
    regularization.
    Input:
        X: the local data
        Y: either a label vector, which is converted to the -1/1 one-of-K
            coding, or a matrix that is used as it is
        gamma: the regularization parameter
        weight: (optional) per-datum weights
        kwargs: forwarded to the SolverMC constructor
    Output:
        w, b: the learned weight matrix and bias vector
    """
    targets = to_one_of_k_coding(Y) if Y.ndim == 1 else Y
    return SolverMC(gamma, Loss.loss_squared_hinge, Reg.reg_l2,
                    **kwargs).solve(X, targets, weight)
def svm_multiclass(X, Y, gamma, weight=None, **kwargs):
    """Trains a multiclass linear SVM: rank hinge loss with l2
    regularization.
    Input:
        X: the local data
        Y: passed to the solver as it is; the rank hinge loss expects a
            vector of true labels
        gamma: the regularization parameter
        weight: (optional) per-datum weights
        kwargs: forwarded to the SolverMC constructor
    Output:
        w, b: the learned weight matrix and bias vector
    """
    return SolverMC(gamma, Loss.loss_rank_hinge, Reg.reg_l2,
                    **kwargs).solve(X, Y, weight)
def l2svm_multiclass(X, Y, gamma, weight=None, **kwargs):
    """Trains a multiclass linear SVM with the rank-based squared hinge loss
    and l2 regularization.
    Input:
        X: the local data
        Y: passed to the solver as it is
        gamma: the regularization parameter
        weight: (optional) per-datum weights
        kwargs: forwarded to the SolverMC constructor
    NOTE: Loss.loss_rank_squared_hinge raises NotImplementedError when it is
    called, so this wrapper cannot complete a solve for now.
    """
    return SolverMC(gamma, Loss.loss_rank_squared_hinge, Reg.reg_l2,
                    **kwargs).solve(X, Y, weight)