def main():
    # -- top-level parameters of this script
    dtype = 'float32'  # XXX
    n_examples = 50000
    online_batch_size = 1
    online_epochs = 2
    batch_epochs = 30
    lbfgs_m = 20

    # -- load and prepare the data set
    data_view = mnist.views.OfficialVectorClassification(x_dtype=dtype)
    n_classes = 10
    x = data_view.train.x[:n_examples]
    y = data_view.train.y[:n_examples]
    y1 = -1 * ones((len(y), n_classes)).astype(dtype)
    y1[arange(len(y)), y] = 1

    # -- initialize the SVM model
    w = zeros((x.shape[1], n_classes), dtype=dtype)
    b = zeros(n_classes, dtype=dtype)

    def svm(ww, bb, xx=x, yy=y1):
        # -- one vs. all linear SVM loss
        margin = yy * (dot(xx, ww) + bb)
        hinge = maximum(0, 1 - margin)
        cost = hinge.mean(axis=0).sum()
        return cost

    # -- stage-1 optimization by stochastic gradient descent
    print 'Starting SGD'
    n_batches = n_examples / online_batch_size
    w, b = fmin_sgd(svm, (w, b),
            streams={
                'xx': x.reshape((n_batches, online_batch_size, x.shape[1])),
                'yy': y1.reshape((n_batches, online_batch_size, y1.shape[1]))},
            loops=online_epochs,
            stepsize=0.001,
            print_interval=10000,
            )
    print 'SGD complete, about to start L-BFGS'
    show_filters(w.T, (28, 28), (2, 5))

    # -- stage-2 optimization by L-BFGS
    print 'Starting L-BFGS'
    w, b = fmin_l_bfgs_b(svm, (w, b),
            maxfun=batch_epochs, iprint=1, m=lbfgs_m)
    print 'L-BFGS complete'
    show_filters(w.T, (28, 28), (2, 5))
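A minimal sketch (plain numpy, not part of the original script) of the two conventions svm() relies on: the one-vs-all label matrix y1 holds +1 in the true-class column and -1 elsewhere, and the cost averages the hinge terms over examples before summing them over the 10 one-vs-rest classifiers. The tiny shapes below are made up for illustration.

import numpy as np

n_examples, n_features, n_classes = 4, 3, 10
rng = np.random.RandomState(0)

# -- one-vs-all encoding: +1 for the true class, -1 everywhere else
y = rng.randint(0, n_classes, size=n_examples)
y1 = -1 * np.ones((n_examples, n_classes), dtype='float32')
y1[np.arange(n_examples), y] = 1

# -- hinge cost written exactly as in svm() above, with toy parameters
x = rng.rand(n_examples, n_features).astype('float32')
w = np.zeros((n_features, n_classes), dtype='float32')
b = np.zeros(n_classes, dtype='float32')

margin = y1 * (np.dot(x, w) + b)
hinge = np.maximum(0, 1 - margin)
cost = hinge.mean(axis=0).sum()
print cost  # -- with zero weights every margin is 0, so the cost is n_classes = 10.0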
def fit(self, x, y, xw=None):
    """
    x - n_examples x n_features design matrix.
    y - vector of +1 / -1 integer labels
    xw - matrix of real-valued incoming biases obtained by multiplying
         the existing weight vectors by x
    """
    assert set(y) <= set([-1, 1])
    if x.shape[0] != y.shape[0]:
        raise ValueError('length mismatch between x and y')
    n_examples, n_features = x.shape
    if n_features != self.n_features:
        raise ValueError('n_feature mismatch', (n_features, self.n_features))

    weights = self.weights
    bias = self.bias
    alpha = self.alpha

    x = x.astype(self.dtype)
    y = y.astype(self.dtype)
    xw = self.as_xw(x, xw)

    print 'WARNING: IncrementalSVM should use alpha0, n_sgd_iters'

    # -- warm up with some sgd
    weights, bias, alpha = autodiff.fmin_sgd(
        lambda w, b, a, xi, yi, xwi: binary_svm_hinge_loss(
            xi, yi, w, b, a, None, None, self.l2_regularization),
        (weights, bias, alpha),
        streams={
            'xi': x.reshape((n_examples, 1, x.shape[1])),
            'yi': y.reshape((n_examples, 1)),
        },
        stepsize=0.01,
        loops=max(1, 100000 // len(x)),
        )

    # -- fine-tune without alpha by L-BFGS
    weights, bias, alpha = autodiff.fmin_l_bfgs_b(
        lambda w, b, a: binary_svm_hinge_loss(
            x, y, w, b, a, None, None, self.l2_regularization),
        (weights, bias, alpha),
        # -- the graph is tiny, time spent optimizing it is wasted.
        theano_mode=theano.Mode(linker='cvm', optimizer='fast_run'),
        **self.bfgs_kwargs)

    self.weights = weights
    self.bias = bias
    self.alpha = alpha
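A small sketch, with assumed shapes, of the two minibatch conventions in the SGD warm-up above: each stream entry is reshaped to (n_batches, batch_size, ...) with a batch size of one, and loops is chosen so the total number of SGD steps stays close to 100000 whatever the number of examples.

import numpy as np

n_examples, n_features = 2500, 20  # -- hypothetical sizes
x = np.zeros((n_examples, n_features), dtype='float32')
y = np.ones(n_examples, dtype='float32')

# -- one example per SGD step: the leading axis indexes the minibatches
xi_stream = x.reshape((n_examples, 1, x.shape[1]))
yi_stream = y.reshape((n_examples, 1))
print xi_stream.shape, yi_stream.shape  # (2500, 1, 20) (2500, 1)

# -- enough passes over the data for roughly 100000 SGD steps in total
loops = max(1, 100000 // len(x))
print loops, loops * n_examples  # 40 100000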
def main():
    # -- top-level parameters of this script
    dtype = 'float32'  # XXX
    n_examples = 50000
    online_batch_size = 1
    online_epochs = 2
    batch_epochs = 30
    lbfgs_m = 20
    n_mlp_hiddens = [200]  # -- one entry per hidden layer

    # -- load and prepare the data set
    data_view = mnist.views.OfficialVectorClassification(x_dtype=dtype)
    n_classes = 10
    x = data_view.train.x[:n_examples]
    y = data_view.train.y[:n_examples]
    y1 = -1 * ones((len(y), n_classes)).astype(dtype)
    y1[arange(len(y)), y] = 1

    # -- allocate the model by running one example through it
    init_params = {}
    mlp_svm(init_params, x[:1], y1[:1], n_mlp_hiddens, n_classes)

    if online_epochs:
        # -- stage-1 optimization by stochastic gradient descent
        print 'Starting SGD'
        n_batches = n_examples / online_batch_size
        stage1_params, = fmin_sgd(mlp_svm, (init_params,),
                streams={
                    'x': x.reshape((n_batches, online_batch_size, x.shape[1])),
                    'y1': y1.reshape((n_batches, online_batch_size, y1.shape[1]))},
                loops=online_epochs,
                stepsize=0.001,
                print_interval=10000,
                )
        print 'SGD complete, about to start L-BFGS'
        show_filters(stage1_params['mlp']['weights'][0].T, (28, 28), (8, 25))
    else:
        print 'Skipping stage-1 SGD'
        stage1_params = init_params

    # -- stage-2 optimization by L-BFGS
    if batch_epochs:
        def batch_mlp_svm(p):
            return mlp_svm(p, x, y1)

        print 'Starting L-BFGS'
        stage2_params, = fmin_l_bfgs_b(batch_mlp_svm,
                args=(stage1_params,),
                maxfun=batch_epochs,
                iprint=1,
                m=lbfgs_m)
        print 'L-BFGS complete'
        show_filters(stage2_params['mlp']['weights'][0].T, (28, 28), (8, 25))
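The show_filters calls above render the first hidden layer's weights as image tiles: with n_mlp_hiddens = [200] there are 200 input-to-hidden filters of 28 x 28 pixels, which exactly fills the requested 8-by-25 grid. A trivial check; the nested layout of the params dict is inferred from the indexing used above.

n_mlp_hiddens = [200]
grid = (8, 25)
assert grid[0] * grid[1] == n_mlp_hiddens[0]  # -- 200 filter images in an 8 x 25 grid
# -- stage1_params['mlp']['weights'][0] is expected to have shape (784, 200),
#    so its transpose yields one 784-dimensional (28 x 28) image per hidden unit.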
def fit_l_bfgs_b(weights, bias, alpha, x, y, l2reg,
                 pxw, pw_l2_sqr, pl2reg,
                 bfgs_kwargs,
                 return_after_one_fit=False):
    """
    Refine `weights, bias, alpha` by l_bfgs_b
    """
    n_features, n_classes = weights.shape
    n_prev, n_classes = alpha.shape

    alpha_orig = alpha

    # -- the inplace alpha2 scaling modifies not-yet-fit weights
    #    as the while loop below works its way across
    weights = weights.copy()

    low = 0
    high = n_features

    # -- keep trying to train on less and less of the data until it works
    while True:
        x0 = x[:, low:high]
        x2 = x[:, high:]
        pxw2 = append_xw(pxw, x2, weights[high:])
        pl2reg2 = append_l2_regularization(pl2reg, l2reg)
        alpha2 = append_alpha(alpha)
        pw_l2_sqr2 = append_w_l2_sqr(pw_l2_sqr, weights[high:])

        def fn(w, b, a):
            return multi_svm_hinge_loss(x0, y, w, b, a,
                                        pxw2, pw_l2_sqr2, l2reg, pl2reg2)

        try:
            if l_bfgs_b_debug_feature_limit is not None:
                # -- this mechanism is used by unit tests
                if (high - low) > l_bfgs_b_debug_feature_limit:
                    raise MemoryError()
            (weights_, bias, alpha2), info = autodiff.fmin_l_bfgs_b(fn,
                    (weights[low:high], bias, alpha2),
                    return_info=True,
                    borrowable=[x0],
                    floatX=x.dtype,
                    **bfgs_kwargs)
            info['feature_high'] = high
            info['feature_low'] = low
            gc.collect()
            logger.info('fitting successful for %i features' % high)
            break
        except (MemoryError, RuntimeError):
            high /= 2
            if low == high:
                raise
            gc.collect()
            logger.info('fitting required too much memory, falling back to %i'
                        % high)
            continue
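The while loop above is a memory fallback: fit the widest remaining window of feature columns, and if compilation or allocation fails, halve the window and retry until a fit succeeds or the window cannot shrink any further. A stripped-down sketch of just that control flow; attempt_fit is a hypothetical stand-in for the fmin_l_bfgs_b call.

import logging

logger = logging.getLogger(__name__)

def fit_widest_window(attempt_fit, n_features):
    """Run attempt_fit(low, high) on the widest feature window that fits in memory."""
    low, high = 0, n_features
    while True:
        try:
            result = attempt_fit(low, high)
            logger.info('fitting successful for %i features' % high)
            return result, high
        except (MemoryError, RuntimeError):
            high //= 2
            if low == high:
                raise  # -- even the narrowest window failed, give up
            logger.info('fitting required too much memory, falling back to %i' % high)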
def test_svm():
    """
    This test case should match examples/linear_svm.py
    """
    rng = np.random.RandomState(1)

    # -- create some fake data
    x = rng.rand(10, 5)
    y = 2 * (rng.rand(10) > 0.5) - 1
    l2_regularization = 1e-4

    def loss_fn(weights, bias):
        margin = y * (np.dot(x, weights) + bias)
        loss = np.maximum(0, 1 - margin) ** 2
        l2_cost = 0.5 * l2_regularization * np.dot(weights, weights)
        loss = np.mean(loss) + l2_cost
        print 'ran loss_fn(), returning', loss
        return loss

    w, b = fmin_l_bfgs_b(loss_fn, (np.zeros(5), np.zeros(())))
    final_loss = loss_fn(w, b)
    assert np.allclose(final_loss, 0.7229)
#        streams={
#            'x': x.reshape((n_batches, online_batch_size, x.shape[1])),
#            'y1': y1.reshape((n_batches, online_batch_size, y1.shape[1]))},
#        loops=n_online_epochs,
#        step_size=0.01,
#        print_interval=n_examples,
#        )
# print 'SGD took %.2f seconds' % (time.time() - t0)
# show_filters(W.T, img_shape, (2, 5))

# -- L-BFGS optimization of our SVM cost.
def batch_criterion(W, b):
    return ova_svm_cost(W, b, x, y1)

W, b = autodiff.fmin_l_bfgs_b(batch_criterion, (W, b), maxfun=20, m=20, iprint=1)
print 'final_cost', batch_criterion(W, b)
# -- N.B. the output from this command comes from Fortran, so iPython does not see it.
#    To monitor progress, look at the terminal from which you launched ipython.

# show_filters(W.T, img_shape, (2, 5))

train_predictions = ova_svm_prediction(W, b, x)
train_errors = y != train_predictions
print 'Current train set error rate', np.mean(train_errors)

test_predictions = ova_svm_prediction(W, b, iris.data[:, :2])
test_errors = iris.target != test_predictions
print 'Current test set error rate', np.mean(test_errors)
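ova_svm_cost and ova_svm_prediction are defined elsewhere in this tutorial. The usual one-vs-all decision rule, assumed here purely for illustration, picks the class with the highest linear score; the error rates printed above are then just the fraction of mismatched labels.

import numpy as np

def ova_prediction_sketch(W, b, x):
    # -- hypothetical stand-in: argmax over the per-class scores x . W + b
    return np.argmax(np.dot(x, W) + b, axis=1)

# -- usage with the W, b, x, y from the block above:
# print np.mean(y != ova_prediction_sketch(W, b, x))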
def main():
    # -- top-level parameters of this script
    n_hidden1 = n_hidden2 = 25
    dtype = 'float32'
    n_examples = 10000
    online_batch_size = 1
    online_epochs = 3
    # -- TIP: partial creates a new function with some parameters filled in
    # algo = partial(denoising_autoencoder_binary_x, noise_level=0.3)
    algo = logistic_autoencoder_binary_x
    batch_epochs = 10
    lbfgs_m = 20

    n_hidden = n_hidden1 * n_hidden2
    rng = np.random.RandomState(123)

    data_view = mnist.views.OfficialVectorClassification(x_dtype=dtype)
    x = data_view.train.x[:n_examples]
    n_examples, n_visible = x.shape
    x_img_res = 28, 28

    # -- uncomment this line to see sample images from the data set
    # show_filters(x[:100], x_img_res, (10, 10))

    # -- create a new model (w, visbias, hidbias)
    w = rng.uniform(low=-4 * np.sqrt(6.0 / (n_hidden + n_visible)),
                    high=4 * np.sqrt(6.0 / (n_hidden + n_visible)),
                    size=(n_visible, n_hidden)).astype(dtype)
    visbias = np.zeros(n_visible).astype(dtype)
    hidbias = np.zeros(n_hidden).astype(dtype)

    # show_filters(w.T, x_img_res, (n_hidden1, n_hidden2))

    x_stream = x.reshape(
        (n_examples / online_batch_size, online_batch_size, x.shape[1]))

    def train_criterion(ww, hbias, vbias, x_i=x):
        cost, hid = algo(x_i, ww, hbias, vbias)
        l1_cost = abs(ww).sum() * 0.0    # -- increase this 0.0 to add an l1 penalty
        l2_cost = (ww ** 2).sum() * 0.0  # -- increase this 0.0 to add an l2 penalty
        return cost.mean() + l1_cost + l2_cost

    # -- ONLINE TRAINING
    for epoch in range(online_epochs):
        t0 = time.time()
        w, hidbias, visbias = autodiff.fmin_sgd(train_criterion,
                args=(w, hidbias, visbias),
                stream=x_stream,     # -- fmin_sgd will loop through this once
                stepsize=0.005,      # -- QQ: you should always tune this
                print_interval=1000,
                )
        print 'Online training epoch %i took %f seconds' % (
                epoch, time.time() - t0)
        show_filters(w.T, x_img_res, (n_hidden1, n_hidden2))

    # -- BATCH TRAINING
    w, hidbias, visbias = autodiff.fmin_l_bfgs_b(train_criterion,
            args=(w, hidbias, visbias),
            # -- scipy.fmin_l_bfgs_b kwargs follow
            maxfun=batch_epochs,
            iprint=1,    # -- 1 for verbose, 0 for normal, -1 for quiet
            m=lbfgs_m,   # -- how well to approximate the Hessian
            )
    show_filters(w.T, x_img_res, (n_hidden1, n_hidden2))
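A quick check (plain numpy, values taken from the parameters above) of the initial weight scale: with n_hidden1 = n_hidden2 = 25 the hidden layer has 625 units, flattened MNIST images have 784 visible units, and the uniform initialization bound 4 * sqrt(6 / (n_hidden + n_visible)) comes out near 0.26.

import numpy as np

n_hidden = 25 * 25   # -- n_hidden1 * n_hidden2
n_visible = 28 * 28  # -- MNIST images flattened to vectors

bound = 4 * np.sqrt(6.0 / (n_hidden + n_visible))
print bound  # -- about 0.261

rng = np.random.RandomState(123)
w = rng.uniform(low=-bound, high=bound, size=(n_visible, n_hidden)).astype('float32')
print w.shape, w.min(), w.max()  # -- (784, 625), values inside (-0.261, 0.261)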
# -- create some fake data
x = np.random.rand(10, 5)
y = 2 * (np.random.rand(10) > 0.5) - 1
l2_regularization = 1e-4

def loss_fn(weights, bias):
    margin = y * (np.dot(x, weights) + bias)
    loss = np.maximum(0, 1 - margin) ** 2
    l2_cost = 0.5 * l2_regularization * np.dot(weights, weights)
    loss = np.mean(loss) + l2_cost
    print 'ran loss_fn(), returning', loss
    return loss

# -- Run loss_fn once to trace computations.
w, b = fmin_l_bfgs_b(loss_fn, [np.zeros(5), np.zeros(())])

# -- run loss_fn as usual
final_loss = loss_fn(w, b)

print 'Best-fit SVM:'
print ' -> cost:', final_loss
print ' -> weights:', w
print ' -> bias:', b

# Program output:
#
# ran loss_fn(), returning 1.0
# ran loss_fn(), returning 0.722904977725
# Best-fit SVM:
# -> cost: 0.722904977725
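Why the first traced call returns exactly 1.0: the optimization starts from the zero parameters passed in, so every margin is 0, every squared hinge term is (1 - 0)**2 = 1, their mean over the 10 examples is 1, and the L2 term vanishes. A short numpy check of that arithmetic, using the same shapes as above.

import numpy as np

x = np.random.rand(10, 5)
y = 2 * (np.random.rand(10) > 0.5) - 1
w0, b0 = np.zeros(5), np.zeros(())

initial = np.mean(np.maximum(0, 1 - y * (np.dot(x, w0) + b0)) ** 2)
print initial  # -- always 1.0, independent of the random data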
pl2reg1 = append_l2_regularization(pl2reg, l2reg)
alpha = append_alpha(alpha)
pw_l2_sqr1 = append_w_l2_sqr(pw_l2_sqr, weights_)

x2 = x[:, high:]
pxw2 = append_xw(pxw1, x2, weights[high:])
pl2reg2 = append_l2_regularization(pl2reg1, l2reg)
alpha2 = append_alpha(alpha)
pw_l2_sqr2 = append_w_l2_sqr(pw_l2_sqr1, weights[high:])

def fn(w, b, a):
    return multi_svm_hinge_loss(x1, y, w, b, a,
                                pxw2, pw_l2_sqr2, l2reg, pl2reg2)

(weights_, bias, alpha2), info = autodiff.fmin_l_bfgs_b(fn,
        (weights[low:high], bias, alpha2),
        return_info=True,
        borrowable=[x1],
        floatX=x.dtype,
        **bfgs_kwargs)
info['feature_high'] = high
info['feature_low'] = low

# -- pop off the alpha we just added
weights[high:] *= alpha2[-1]
alpha = alpha2[:-1].copy()

w0s.append(weights_)
costs.append(info['fopt'])
infos.append(info)

x0 = x1
pxw = pxw1