def test_zero_optimal(self):
    """Minimize kl_divergence(q, p) with batch gradient descent and check
    that the result is (numerically) zero.

    ``p`` and ``q`` are initialized with different random ``mu`` and ``beta``
    values drawn from a fixed seed.  All four parameter vectors are handed
    to the optimizer, so it is free to make the two distributions coincide.

    Raises
    ------
    AssertionError
        If the minimized divergence is negative by more than the float32
        tolerance, or if it is larger than the convergence tolerance.
    """
    rng = np.random.RandomState([1, 2, 3])
    dim = self.dim
    p = self.p
    q = self.q

    # The draw order (p.mu, p.beta, q.mu, q.beta) determines the seeded
    # starting point, so it is kept fixed.
    p_mu = rng.randn(dim).astype(floatX)
    p_beta = rng.uniform(.1, 10., (dim,)).astype(floatX)
    q_mu = rng.randn(dim).astype(floatX)
    q_beta = rng.uniform(.1, 10., (dim,)).astype(floatX)

    p.mu.set_value(p_mu)
    q.mu.set_value(q_mu)
    p.beta.set_value(p_beta)
    q.beta.set_value(q_beta)

    kl_expr = kl_divergence(q, p)

    optimizer = BatchGradientDescent(
        max_iter=100,
        line_search_mode='exhaustive',
        verbose=True,
        objective=kl_expr,
        conjugate=True,
        params=[p.mu, p.beta, q.mu, q.beta],
        param_constrainers=[p.censor_updates, q.censor_updates])

    # minimize() returns the final objective value
    kl = optimizer.minimize()

    if kl < 0.:
        # In float32 a slightly negative value is tolerated (with a warning);
        # in any other precision every negative value is an error.
        neg_tol = 4.8e-7 if config.floatX == 'float32' else 0.
        if kl < -neg_tol:
            raise AssertionError("KL divergence should "
                                 "be non-negative but is " + str(kl))
        warnings.warn("KL divergence is not very numerically stable, evidently")

    tol = 6e-5
    # `kl <= tol` also fails for NaN, so no separate `not (kl > tol)` check
    # is needed.
    assert kl <= tol, "kl: %s tol: %s" % (kl, tol)
def test_zero_optimal(self):
    """Minimize kl_divergence(q, p) with batch gradient descent and check
    that the result is (numerically) zero.

    The parameters ``mu`` and ``beta`` of both ``self.p`` and ``self.q`` are
    set to seeded random values and then optimized jointly.

    Raises
    ------
    AssertionError
        If the minimized divergence is negative by more than the float32
        tolerance, or if it exceeds the convergence tolerance of 5.4e-5.
    """
    rng = np.random.RandomState([1, 2, 3])
    dim = self.dim
    p, q = self.p, self.q

    # Keep the sampling order p.mu, p.beta, q.mu, q.beta: it fixes the
    # initial point produced by the seeded generator.
    init_p_mu = rng.randn(dim).astype(floatX)
    init_p_beta = rng.uniform(.1, 10., (dim,)).astype(floatX)
    init_q_mu = rng.randn(dim).astype(floatX)
    init_q_beta = rng.uniform(.1, 10., (dim,)).astype(floatX)

    p.mu.set_value(init_p_mu)
    q.mu.set_value(init_q_mu)
    p.beta.set_value(init_p_beta)
    q.beta.set_value(init_q_beta)

    symbolic_kl = kl_divergence(q, p)

    optimizer = BatchGradientDescent(
        max_iter=100,
        line_search_mode='exhaustive',
        verbose=True,
        objective=symbolic_kl,
        conjugate=True,
        params=[p.mu, p.beta, q.mu, q.beta],
        param_constrainers=[p.censor_updates, q.censor_updates])

    # Numeric value of the objective after optimization
    kl = optimizer.minimize()

    if kl < 0.:
        # Only float32 runs are allowed a small negative result.
        if config.floatX == 'float32':
            neg_tol = 4.8e-7
        else:
            neg_tol = 0.
        if kl < -neg_tol:
            raise AssertionError("KL divergence should "
                                 "be non-negative but is " + str(kl))
        warnings.warn("KL divergence is not very numerically stable, evidently")

    tol = 5.4e-5
    # A single comparison suffices: it is False for NaN as well as for
    # values above the tolerance.
    assert kl <= tol, "kl: %s tol: %s" % (kl, tol)
def fit(self, params=None, l1=.0, l2=.0):
    """
    Fit the model by minimizing the Leave One Out (LOO) loss, plus optional
    L1 / L2 penalties on the optimized parameters, with batch gradient
    descent.

    Parameters
    ----------
    params : list, optional
        Variables to optimize.  Defaults to ``[self.eta]``.
    l1 : float
        Weight of the sum of absolute parameter values.
    l2 : float
        Weight of the sum of squared parameter values.
    """
    objective = self.loss_symbolic(self.L, self.y, self.mu, self.R,
                                   self.eta, self.eps)

    if params is None:
        params = [self.eta]

    # Both penalties are accumulated starting from a float zero, so an empty
    # `params` list leaves them as plain numbers.
    abs_penalty = sum((T.sum(abs(p)) for p in params), .0)
    sqr_penalty = sum((T.sum(p ** 2) for p in params), .0)

    objective = objective + l1 * abs_penalty + l2 * sqr_penalty

    BatchGradientDescent(objective=objective,
                         params=params,
                         inputs=[],
                         verbose=1).minimize()
def fit(self, params=None, l1=.0, l2=.0):
    """
    Minimize the symbolic loss returned by ``self.loss_symbolic`` (referred
    to as the NLL in this method), plus L1 / L2 penalties on the optimized
    parameters, using batch gradient descent.

    Parameters
    ----------
    params : list, optional
        Variables to optimize.  Defaults to ``[self.eta]``.
    l1 : float
        Coefficient of the L1 penalty (sum of absolute values).
    l2 : float
        Coefficient of the L2 penalty (sum of squares).
    """
    nll = self.loss_symbolic(self.L, self.y, self.mu, self.R, self.eta,
                             self.eps)

    if params is None:
        params = [self.eta]

    # Regularization terms, built up one parameter at a time
    l1_term = .0
    l2_term = .0
    for variable in params:
        l1_term = l1_term + T.sum(abs(variable))
        l2_term = l2_term + T.sum(variable ** 2)

    penalized_nll = nll + l1 * l1_term + l2 * l2_term

    optimizer = BatchGradientDescent(objective=penalized_nll,
                                     params=params,
                                     inputs=[],
                                     verbose=1)
    optimizer.minimize()
def fit(self, params=None, l1=.0, l2=.0):
    """
    Fit the model by gradient-based minimization of the Leave One Out (LOO)
    loss, optionally regularized.

    Parameters
    ----------
    params : list, optional
        Variables handed to the optimizer; ``[self.eta]`` when omitted.
    l1 : float
        Multiplier for the L1 term (sum of absolute parameter values).
    l2 : float
        Multiplier for the L2 term (sum of squared parameter values).
    """
    base_loss = self.loss_symbolic(self.L, self.y, self.mu, self.R,
                                   self.eta, self.eps)

    if params is None:
        params = [self.eta]

    # Per-parameter penalty contributions; summed from 0.0 so the totals are
    # well defined even for an empty parameter list.
    abs_terms = [T.sum(abs(param)) for param in params]
    square_terms = [T.sum(param ** 2) for param in params]
    total_l1 = sum(abs_terms, .0)
    total_l2 = sum(square_terms, .0)

    full_loss = base_loss + l1 * total_l1 + l2 * total_l2

    descent = BatchGradientDescent(objective=full_loss,
                                   params=params,
                                   inputs=[],
                                   verbose=1)
    descent.minimize()
def test_zero_optimal(self):
    """Minimize kl_divergence(q, p) with batch gradient descent and check
    that the result is zero (up to a small tolerance).

    ``self.p`` and ``self.q`` receive different seeded random ``mu`` and
    ``beta`` values; all four are then optimized jointly.

    Raises
    ------
    AssertionError
        If the minimized divergence is negative or exceeds 5.4e-5.
    """
    rng = np.random.RandomState([1, 2, 3])
    dim = self.dim
    p = self.p
    q = self.q

    # Sampling order (p.mu, p.beta, q.mu, q.beta) is part of the seeded
    # setup and must not change.
    p_mu = rng.randn(dim).astype(floatX)
    p_beta = rng.uniform(.1, 10., (dim,)).astype(floatX)
    q_mu = rng.randn(dim).astype(floatX)
    q_beta = rng.uniform(.1, 10., (dim,)).astype(floatX)

    p.mu.set_value(p_mu)
    q.mu.set_value(q_mu)
    p.beta.set_value(p_beta)
    q.beta.set_value(q_beta)

    optimizer = BatchGradientDescent(
        objective=kl_divergence(q, p),
        params=[p.mu, p.beta, q.mu, q.beta],
        param_constrainers=[p.censor_updates, q.censor_updates])

    # minimize() returns the objective value reached
    kl = optimizer.minimize()

    if kl < 0.:
        raise AssertionError("KL divergence should "
                             "be non-negative but is " + str(kl))

    tol = 5.4e-5
    # One comparison is enough: it fails for NaN as well as for kl > tol.
    assert kl <= tol, "kl: %s tol: %s" % (kl, tol)
def test_zero_optimal(self):
    """Minimize kl_divergence(q, p) with batch gradient descent and check
    that the result is zero (up to a small tolerance).

    Both distributions start from seeded random ``mu`` / ``beta`` values
    and every one of those parameters is optimized.

    Raises
    ------
    AssertionError
        If the minimized divergence is negative or exceeds 5.4e-5.
    """
    rng = np.random.RandomState([1, 2, 3])
    dim = self.dim
    p, q = self.p, self.q

    # Do not reorder these draws: the initial point depends on the order in
    # which the seeded generator is consumed.
    start_p_mu = rng.randn(dim).astype(floatX)
    start_p_beta = rng.uniform(.1, 10., (dim, )).astype(floatX)
    start_q_mu = rng.randn(dim).astype(floatX)
    start_q_beta = rng.uniform(.1, 10., (dim, )).astype(floatX)

    p.mu.set_value(start_p_mu)
    q.mu.set_value(start_q_mu)
    p.beta.set_value(start_p_beta)
    q.beta.set_value(start_q_beta)

    divergence = kl_divergence(q, p)

    optimizer = BatchGradientDescent(
        objective=divergence,
        params=[p.mu, p.beta, q.mu, q.beta],
        param_constrainers=[p.censor_updates, q.censor_updates])

    # Final objective value after optimization
    kl = optimizer.minimize()

    if kl < 0.:
        raise AssertionError("KL divergence should "
                             "be non-negative but is " + str(kl))

    tol = 5.4e-5
    # `kl <= tol` is False for NaN too, so it subsumes `not (kl > tol)`.
    assert kl <= tol, "kl: %s tol: %s" % (kl, tol)
# Script fragment: optimize X so that `neuron` is as large as possible, then
# display the result.  `neuron`, `X` and `normed` are expected to be defined
# before this point -- they are not created here.
from pylearn2.optimization.batch_gradient_descent import BatchGradientDescent

# The objective is the negated `neuron` expression, so minimizing it
# maximizes `neuron`.  X is the only parameter being optimized.
bgd = BatchGradientDescent(objective=-neuron,
                           params=[X],
                           inputs=None,
                           max_iter=100,
                           lr_scalers=None,
                           verbose=3,
                           tol=None,
                           init_alpha=None,
                           min_init_alpha=1e-3,
                           reset_alpha=True,
                           conjugate=True,
                           gradients=None,
                           gradient_updates=None,
                           accumulate=False,
                           theano_function_mode=None,
                           param_constrainers=None)
bgd.minimize()

# From here on X is rebound to a plain array: evaluate `normed`, take index 0
# on the last axis and move the first remaining axis to the end.
# NOTE(review): presumably this converts one example to a
# (rows, cols, channels) image -- confirm the axis layout of `normed`.
X = normed.eval()[:,:,:,0].transpose(1,2,0)
import numpy as np
# Rescale so the largest absolute value becomes 1
X /= np.abs(X).max()
print (X.min(), X.max())
from pylearn2.utils.image import show
show(X)
class BGD(TrainingAlgorithm):
    """Batch Gradient Descent training algorithm class"""

    def __init__(self, cost, batch_size=None, batches_per_iter=None,
                 updates_per_batch=10, monitoring_batches=None,
                 monitoring_dataset=None, termination_criterion=None,
                 set_batch_size=False, reset_alpha=True, conjugate=False,
                 min_init_alpha=None, reset_conjugate=True,
                 line_search_mode=None):
        """
        Parameters
        ----------
        cost: a pylearn2 Cost
        batch_size: Like the SGD TrainingAlgorithm, this TrainingAlgorithm
            still iterates over minibatches of data. The difference is
            that this class uses partial line searches to choose the
            step size along each gradient direction, and can do repeated
            updates on the same batch. The assumption is that you use
            big enough minibatches with this algorithm that a large step
            size will generalize reasonably well to other minibatches.
            To implement true Batch Gradient Descent, set the batch_size
            to the total number of examples available.
            If batch_size is None, it will revert to the model's
            force_batch_size attribute.
        batches_per_iter: passed to the training dataset iterator as
            num_batches
        updates_per_batch: Passed through to the
            optimization.BatchGradientDescent's max_iter parameter
        monitoring_batches: passed to the monitor as num_batches for each
            monitoring dataset. Must be None when monitoring_dataset is
            None.
        monitoring_dataset: A Dataset or a dictionary mapping string
            dataset names to Datasets
        termination_criterion: if not None, a callable that
            continue_learning invokes with the model and whose result it
            returns
        set_batch_size: If True, BGD will attempt to override the model's
            force_batch_size attribute by calling set_batch_size on it.
        reset_alpha, conjugate, min_init_alpha, reset_conjugate,
        line_search_mode: passed through to the
            optimization.BatchGradientDescent parameters of the same names
        """
        # Store every constructor argument as an attribute of the same name.
        # This must stay the first statement: locals() has to contain only
        # the arguments.
        self.__dict__.update(locals())
        del self.self
        if monitoring_dataset is None:
            assert monitoring_batches is None
        self._set_monitoring_dataset(monitoring_dataset)
        self.bSetup = False
        self.termination_criterion = termination_criterion
        self.rng = np.random.RandomState([2012, 10, 16])

    def setup(self, model, dataset):
        """
        Allows the training algorithm to do some preliminary configuration
        *before* we actually start training the model. The dataset is provided
        in case other derived training algorithms need to modify model based on
        the dataset.

        Parameters
        ----------
        model: a Python object representing the model to train loosely
        implementing the interface of models.model.Model.

        dataset: a pylearn2.datasets.dataset.Dataset object used to draw
        training data

        Raises
        ------
        ValueError
            If batch_size conflicts with model.force_batch_size, or if the
            cost returns None (BGD needs the cost value for line searches).
        TypeError
            If model.get_monitoring_channels does not return a dictionary.
        """
        self.model = model

        if self.batch_size is None:
            self.batch_size = model.force_batch_size
        else:
            batch_size = self.batch_size
            if self.set_batch_size:
                model.set_batch_size(batch_size)
            elif hasattr(model, 'force_batch_size'):
                if not (model.force_batch_size <= 0 or
                        batch_size == model.force_batch_size):
                    raise ValueError(
                        "batch_size is %d but model.force_batch_size is %d" %
                        (batch_size, model.force_batch_size))

        self.monitor = Monitor.get_monitor(model)

        X = self.model.get_input_space().make_theano_batch()
        # Anything other than a 2-D batch is requested from the dataset
        # iterator with topo=True
        self.topo = X.ndim != 2
        Y = T.matrix()

        if self.cost.supervised:
            obj = self.cost(model, X, Y)
            grads, grad_updates = self.cost.get_gradients(model, X, Y)
            ipt = (X, Y)
        else:
            obj = self.cost(model, X)
            grads, grad_updates = self.cost.get_gradients(model, X)
            ipt = X

        if obj is None:
            raise ValueError(
                "BGD is incompatible with " + str(self.cost) + " because "
                " it is intractable, but BGD uses the cost function value to do "
                " line searches.")

        if self.monitoring_dataset is not None:
            # No monitoring dataset provides targets: build the channels
            # without Y
            if not any([monitoring_set.has_targets() for monitoring_set
                        in self.monitoring_dataset.values()]):
                Y = None

            channels = model.get_monitoring_channels(X, Y)
            if not isinstance(channels, dict):
                raise TypeError("model.get_monitoring_channels must return a "
                                "dictionary, but it returned " + str(channels))
            channels.update(self.cost.get_monitoring_channels(model, X, Y))

            # The inputs of the extra channels depend only on whether Y is
            # still available.  They are kept in their own variable so that
            # `ipt`, which describes the inputs of the objective and is
            # reused for the optimizer below, is never overwritten.
            if Y is not None:
                channel_ipt = (X, Y)
            else:
                channel_ipt = X

            for dataset_name in self.monitoring_dataset:
                if dataset_name == '':
                    prefix = ''
                else:
                    prefix = dataset_name + '_'
                monitoring_dataset = self.monitoring_dataset[dataset_name]
                self.monitor.add_dataset(dataset=monitoring_dataset,
                                         mode="sequential",
                                         batch_size=self.batch_size,
                                         num_batches=self.monitoring_batches)

                self.monitor.add_channel(prefix + 'objective',
                                         ipt=ipt,
                                         val=obj,
                                         dataset=monitoring_dataset)

                for name in channels:
                    J = channels[name]
                    # A channel may be given as (expression, prereqs)
                    if isinstance(J, tuple):
                        assert len(J) == 2
                        J, prereqs = J
                    else:
                        prereqs = None

                    self.monitor.add_channel(name=prefix + name,
                                             ipt=channel_ipt,
                                             val=J,
                                             dataset=monitoring_dataset,
                                             prereqs=prereqs)

        # A lone X is wrapped in a list; the supervised (X, Y) tuple is
        # passed along unchanged
        if ipt is X:
            ipts = [X]
        else:
            ipts = ipt

        self.optimizer = BatchGradientDescent(
            objective=obj,
            gradients=grads,
            gradient_updates=grad_updates,
            params=model.get_params(),
            param_constrainers=[model.censor_updates],
            lr_scalers=model.get_lr_scalers(),
            inputs=ipts,
            verbose=True,
            max_iter=self.updates_per_batch,
            reset_alpha=self.reset_alpha,
            conjugate=self.conjugate,
            reset_conjugate=self.reset_conjugate,
            min_init_alpha=self.min_init_alpha,
            line_search_mode=self.line_search_mode)

        self.first = True
        self.bSetup = True

    def train(self, dataset):
        """
        Run the optimizer on every batch yielded by one pass of the dataset
        iterator and report each batch size to the model's monitor.

        `setup` must have been called first.
        """
        assert self.bSetup
        model = self.model

        rng = self.rng
        train_iteration_mode = 'shuffled_sequential'
        # Only stochastic iteration modes are given a random number generator
        if not is_stochastic(train_iteration_mode):
            rng = None

        iterator = dataset.iterator(mode=train_iteration_mode,
                                    batch_size=self.batch_size,
                                    targets=self.cost.supervised,
                                    num_batches=self.batches_per_iter,
                                    topo=self.topo,
                                    rng=rng)

        for data in iterator:
            if self.cost.supervised:
                # data is an (X, Y) pair, passed on as two arguments
                args = data
                X, Y = data
            else:
                args = [data]
                X = data
            self.optimizer.minimize(*args)
            model.monitor.report_batch(X.shape[0])

    def continue_learning(self, model):
        """
        Return True when no termination criterion was given; otherwise
        return the result of calling the criterion on the model that was
        passed to `setup` (the `model` argument is not used).
        """
        if self.termination_criterion is None:
            return True
        else:
            return self.termination_criterion(self.model)
# Script fragment: minimize `obj` with respect to the shared variable X from
# ten random starting points while a constrainer rescales X, then show all
# ten results.  `obj` and `X` are expected to be defined before this point.
from pylearn2.optimization.batch_gradient_descent import BatchGradientDescent as BGD
import theano.tensor as T


def norm_constraint(updates):
    """Replace the proposed update for X, in place, by that update divided
    by (1e-7 + the L2 norm of the current X).

    `updates` must contain an entry for X.
    """
    assert X in updates
    # 1e-7 keeps the division finite when X is all zeros.
    # NOTE(review): the norm is taken from the current X rather than from
    # updates[X] -- confirm this is intended.
    updates[X] = updates[X] / (1e-7 + T.sqrt(T.sqr(X).sum()))


opt = BGD(objective=obj,
          params=[X],
          param_constrainers=[norm_constraint],
          conjugate=True,
          reset_conjugate=False,
          reset_alpha=False,
          line_search_mode='exhaustive',
          verbose=3,
          max_iter=20)
results = []
import numpy as np
rng = np.random.RandomState([1, 2, 3])
for i in xrange(10):
    # Fresh seeded Gaussian start (scaled down by 10) with X's shape and dtype
    X.set_value(rng.randn(*X.get_value().shape).astype(X.dtype) / 10.)
    opt.minimize()
    # Reorder the axes of the optimized X (old axis 3 first, old axis 0 last)
    # and evaluate it to an array
    Xv = X.dimshuffle(3, 1, 2, 0).eval()
    results.append(Xv)
# From here on X names the stacked results, no longer the shared variable
X = np.concatenate(results, axis=0)
from pylearn2.gui.patch_viewer import make_viewer
v = make_viewer(X)
v.show()
def test_batch_gradient_descent(): """ Verify that batch gradient descent works by checking that it minimizes a quadratic function f(x) = x^T A x + b^T x + c correctly for several sampled values of A, b, and c. The ground truth minimizer is x = np.linalg.solve(A,-b)""" n = 3 A = T.matrix(name = 'A') b = T.vector(name = 'b') c = T.scalar(name = 'c') x = sharedX( np.zeros((n,)) , name = 'x') half = np.cast[config.floatX](0.5) obj = half * T.dot(T.dot(x,A),x)+T.dot(b,x)+c minimizer = BatchGradientDescent( objective = obj, params = [ x], inputs = [ A, b, c]) num_samples = 3 rng = np.random.RandomState([1,2,3]) for i in xrange(num_samples): A = np.cast[config.floatX](rng.randn(1.5*n,n)) A = np.cast[config.floatX](np.dot(A.T,A)) A += np.cast[config.floatX](np.identity(n) * .02) b = np.cast[config.floatX](rng.randn(n)) c = np.cast[config.floatX](rng.randn()) x.set_value(np.cast[config.floatX](rng.randn(n))) analytical_x = np.linalg.solve(A,-b) actual_obj = minimizer.minimize(A,b,c) actual_x = x.get_value() #Check that the value returned by the minimize method #is the objective function value at the parameters #chosen by the minimize method cur_obj = minimizer.obj(A,b,c) assert np.allclose(actual_obj, cur_obj) x.set_value(analytical_x) analytical_obj = minimizer.obj(A,b,c) #make sure the objective function is accurate to first 4 digits condition1 = not np.allclose(analytical_obj, actual_obj) condition2 = np.abs(analytical_obj-actual_obj) >= 1e-4 * np.abs(analytical_obj) if (config.floatX == 'float64' and condition1) \ or (config.floatX == 'float32' and condition2): print 'objective function value came out wrong on sample ',i print 'analytical obj', analytical_obj print 'actual obj',actual_obj """ The following section of code was used to verify that numerical error can make the objective function look non-convex print 'Checking for numerically induced non-convex behavior' def f(x): return 0.5 * np.dot(x,np.dot(A,x)) + np.dot(b,x) + c x.set_value(actual_x) 
minimizer._compute_grad(A,b,c) minimizer._normalize_grad() d = minimizer.param_to_grad_shared[x].get_value() x = actual_x.copy() prev = f(x) print prev step_size = 1e-4 x += step_size * d cur = f(x) print cur cur_sgn = np.sign(cur-prev) flip_cnt = 0 for i in xrange(10000): x += step_size * d prev = cur cur = f(x) print cur prev_sgn = cur_sgn cur_sgn = np.sign(cur-prev) if cur_sgn != prev_sgn: print 'flip' flip_cnt += 1 if flip_cnt > 1: print "Non-convex!" from matplotlib import pyplot as plt y = [] x = actual_x.copy() for j in xrange(10000): y.append(f(x)) x += step_size * d plt.plot(y) plt.show() assert False print 'None found' """ #print 'actual x',actual_x #print 'A:' #print A #print 'b:' #print b #print 'c:' #print c x.set_value(actual_x) minimizer._compute_grad(A,b,c) x_grad = minimizer.param_to_grad_shared[x] actual_grad = x_grad.get_value() correct_grad = 0.5 * np.dot(A,x.get_value())+ 0.5 * np.dot(A.T, x.get_value()) +b if not np.allclose(actual_grad, correct_grad): print 'gradient was wrong at convergence point' print 'actual grad: ' print actual_grad print 'correct grad: ' print correct_grad print 'max difference: ',np.abs(actual_grad-correct_grad).max() assert False minimizer._normalize_grad() d = minimizer.param_to_grad_shared[x].get_value() step_len = ( np.dot(b,d) + 0.5 * np.dot(d,np.dot(A,actual_x)) \ + 0.5 * np.dot(actual_x,np.dot(A,d)) ) / np.dot(d, np.dot(A,d)) g = np.dot(A,actual_x)+b deriv = np.dot(g,d) print 'directional deriv at actual', deriv print 'optimal step_len', step_len optimal_x = actual_x - d * step_len g = np.dot(A,optimal_x) + b deriv = np.dot(g,d) print 'directional deriv at optimal: ',deriv x.set_value(optimal_x) print 'obj at optimal: ',minimizer.obj(A,b,c) print 'eigenvalue range:' val, vec = np.linalg.eig(A) print (val.min(),val.max()) print 'condition number: ',(val.max()/val.min()) assert False
# NOTE(review): this is a fragment of a larger routine; `X`, `Y`, `accs`,
# `model`, `J`, `true`, `kls`, `trial` and `idx1` are defined outside the
# visible code.

# For every element Y_i of Y, compute the symbolic quantity
# sigmoid(free_energy(Y_i) - free_energy(X)) and record the fraction of
# entries for which it exceeds 0.5.
for Y_i in Y:
    pos_prob = 1./(1.+T.exp(model.free_energy(X)-model.free_energy(Y_i)))
    acc = (pos_prob > .5).mean()
    accs.append(acc)
# Average of the per-Y_i accuracies (still a symbolic expression; it is
# compiled and evaluated where it is printed)
acc = sum(accs) / float(len(accs))

print '\tinit accuracy ',function([],acc)()

#Minimize the objective function with batch gradient descent
minimizer = BatchGradientDescent( objective = J,
                                    params = model.get_params(),
                                    param_constrainers = [ model.censor_updates ])

print '\tinit obj:',minimizer.obj()
#minimizer.verbose = True
minimizer.minimize()
print '\tfinal obj:',minimizer.obj()

recovered_beta = model.beta.get_value()
recovered_mu = model.mu.get_value()

print '\trecovered beta:',recovered_beta
print '\trecovered mu:',recovered_mu

# Evaluate kl_divergence(true, model) numerically and store it for this
# (trial, idx1) cell
kl = kl_divergence(true, model)
kl = function([],kl)()
assert kl >= 0.0

print '\tkl was ',kl
# Re-evaluating the same accuracy expression now reflects the optimized
# parameters
print '\tfinal accuracy ',function([],acc)()
kls[trial,idx1] = kl
def test_batch_gradient_descent(): """ Verify that batch gradient descent works by checking that it minimizes a quadratic function f(x) = x^T A x + b^T x + c correctly for several sampled values of A, b, and c. The ground truth minimizer is x = np.linalg.solve(A,-b)""" n = 3 A = T.matrix(name='A') b = T.vector(name='b') c = T.scalar(name='c') x = sharedX(np.zeros((n, )), name='x') half = np.cast[config.floatX](0.5) obj = half * T.dot(T.dot(x, A), x) + T.dot(b, x) + c minimizer = BatchGradientDescent(objective=obj, params=[x], inputs=[A, b, c]) num_samples = 3 rng = np.random.RandomState([1, 2, 3]) for i in xrange(num_samples): A = np.cast[config.floatX](rng.randn(1.5 * n, n)) A = np.cast[config.floatX](np.dot(A.T, A)) A += np.cast[config.floatX](np.identity(n) * .02) b = np.cast[config.floatX](rng.randn(n)) c = np.cast[config.floatX](rng.randn()) x.set_value(np.cast[config.floatX](rng.randn(n))) analytical_x = np.linalg.solve(A, -b) actual_obj = minimizer.minimize(A, b, c) actual_x = x.get_value() #Check that the value returned by the minimize method #is the objective function value at the parameters #chosen by the minimize method cur_obj = minimizer.obj(A, b, c) assert np.allclose(actual_obj, cur_obj) x.set_value(analytical_x) analytical_obj = minimizer.obj(A, b, c) #make sure the objective function is accurate to first 4 digits condition1 = not np.allclose(analytical_obj, actual_obj) condition2 = np.abs(analytical_obj - actual_obj) >= 1e-4 * np.abs(analytical_obj) if (config.floatX == 'float64' and condition1) \ or (config.floatX == 'float32' and condition2): print 'objective function value came out wrong on sample ', i print 'analytical obj', analytical_obj print 'actual obj', actual_obj """ The following section of code was used to verify that numerical error can make the objective function look non-convex print 'Checking for numerically induced non-convex behavior' def f(x): return 0.5 * np.dot(x,np.dot(A,x)) + np.dot(b,x) + c x.set_value(actual_x) 
minimizer._compute_grad(A,b,c) minimizer._normalize_grad() d = minimizer.param_to_grad_shared[x].get_value() x = actual_x.copy() prev = f(x) print prev step_size = 1e-4 x += step_size * d cur = f(x) print cur cur_sgn = np.sign(cur-prev) flip_cnt = 0 for i in xrange(10000): x += step_size * d prev = cur cur = f(x) print cur prev_sgn = cur_sgn cur_sgn = np.sign(cur-prev) if cur_sgn != prev_sgn: print 'flip' flip_cnt += 1 if flip_cnt > 1: print "Non-convex!" from matplotlib import pyplot as plt y = [] x = actual_x.copy() for j in xrange(10000): y.append(f(x)) x += step_size * d plt.plot(y) plt.show() assert False print 'None found' """ #print 'actual x',actual_x #print 'A:' #print A #print 'b:' #print b #print 'c:' #print c x.set_value(actual_x) minimizer._compute_grad(A, b, c) x_grad = minimizer.param_to_grad_shared[x] actual_grad = x_grad.get_value() correct_grad = 0.5 * np.dot(A, x.get_value()) + 0.5 * np.dot( A.T, x.get_value()) + b if not np.allclose(actual_grad, correct_grad): print 'gradient was wrong at convergence point' print 'actual grad: ' print actual_grad print 'correct grad: ' print correct_grad print 'max difference: ', np.abs(actual_grad - correct_grad).max() assert False minimizer._normalize_grad() d = minimizer.param_to_grad_shared[x].get_value() step_len = ( np.dot(b,d) + 0.5 * np.dot(d,np.dot(A,actual_x)) \ + 0.5 * np.dot(actual_x,np.dot(A,d)) ) / np.dot(d, np.dot(A,d)) g = np.dot(A, actual_x) + b deriv = np.dot(g, d) print 'directional deriv at actual', deriv print 'optimal step_len', step_len optimal_x = actual_x - d * step_len g = np.dot(A, optimal_x) + b deriv = np.dot(g, d) print 'directional deriv at optimal: ', deriv x.set_value(optimal_x) print 'obj at optimal: ', minimizer.obj(A, b, c) print 'eigenvalue range:' val, vec = np.linalg.eig(A) print(val.min(), val.max()) print 'condition number: ', (val.max() / val.min()) assert False
class InpaintAlgorithm(object):
    """Deprecated batch-gradient-descent training algorithm that applies
    drop masks produced by a mask generator to the inputs (and, for
    supervised costs, to the targets).

    The constructor starts with ``assert False``, so instances cannot be
    created while assertions are enabled.
    """

    def __init__(self,
                 mask_gen,
                 cost,
                 batch_size=None,
                 batches_per_iter=None,
                 monitoring_batches=None,
                 monitoring_dataset=None,
                 max_iter=5,
                 suicide=False,
                 init_alpha=None,
                 reset_alpha=True,
                 conjugate=False,
                 reset_conjugate=True,
                 termination_criterion=None,
                 set_batch_size=False,
                 line_search_mode=None,
                 min_init_alpha=1e-3,
                 duplicate=1,
                 combine_batches=1,
                 scale_step=1.,
                 theano_function_mode=None):
        """Store all constructor arguments as attributes of the same name.

        Deprecated: the first statement is ``assert False``.

        mask_gen is called with X (and Y for supervised costs) in `setup`
        to produce the drop masks.  max_iter, init_alpha, reset_alpha,
        conjugate, reset_conjugate, min_init_alpha, line_search_mode and
        theano_function_mode are forwarded to BatchGradientDescent in
        `setup`.  `duplicate` is the number of times each training batch is
        repeated along axis 0, `combine_batches` the number of batches
        accumulated per optimizer call, and `scale_step` the interpolation
        factor applied in `after_step`.
        """
        assert False # deprecated
        """
        if batch_size is None, reverts to the force_batch_size field of the
        model
        """
        # Without an explicit line search mode or init_alpha, fall back to
        # this fixed tuple of candidate values
        if line_search_mode is None and init_alpha is None:
            init_alpha = (.001, .005, .01, .05, .1)
        # Copies every local (the arguments, including the possibly updated
        # init_alpha) onto the instance
        self.__dict__.update(locals())
        del self.self
        if monitoring_dataset is None:
            assert monitoring_batches == None
        # A single Dataset is wrapped into a dict under the empty name
        if isinstance(monitoring_dataset, Dataset):
            self.monitoring_dataset = {'': monitoring_dataset}
        self.bSetup = False
        self.rng = np.random.RandomState([2012, 10, 17])

    def setup_batch(self, X, Y=None):
        """Load a batch into the shared variables created by `setup` and
        refresh the drop masks.

        Y must be given exactly when the cost is supervised.
        """
        assert not isinstance(X, tuple)
        self.X.set_value(X)
        assert self.cost.supervised == (Y is not None)
        if Y is not None:
            assert Y.ndim == 2
            assert self.Y.ndim == 2
            self.Y.set_value(Y)
        self.update_mask()

    def get_setup_batch_object(self):
        """Return a SetupBatch built from this algorithm; `setup` installs
        it as a prerequisite of the monitoring channels."""
        return SetupBatch(self)

    def setup(self, model, dataset):
        """
        Allows the training algorithm to do some preliminary configuration
        *before* we actually start training the model. The dataset is provided
        in case other derived training algorithms need to modify model based on
        the dataset.

        Parameters
        ----------
        model: a Python object representing the model to train loosely
        implementing the interface of models.model.Model.

        dataset: a pylearn2.datasets.dataset.Dataset object used to draw
        training data
        """
        self.model = model

        if self.set_batch_size:
            model.set_batch_size(self.batch_size)

        if self.batch_size is None:
            self.batch_size = model.force_batch_size

        # The cost and the mask generator are attached to the model itself
        model.cost = self.cost
        model.mask_gen = self.mask_gen

        self.monitor = Monitor.get_monitor(model)
        self.monitor.set_theano_function_mode(self.theano_function_mode)
        prereq = self.get_setup_batch_object()
        #We want to use big batches. We need to make several theano calls on each
        #batch. To avoid paying the GPU latency every time, we use a shared variable
        #but the shared variable needs to stay allocated during the time that the
        #monitor is working, and we don't want the monitor to increase the memory
        #overhead. So we make the monitor work off of the same shared variable
        space = model.get_input_space()
        X = sharedX(space.get_origin_batch(model.batch_size), 'BGD_X')
        self.space = space
        rng = np.random.RandomState([2012, 7, 20])
        # Placeholder 0/1 mask with the shape of an input batch; it only
        # initializes the shared drop mask, which `update_mask` overwrites
        test_mask = space.get_origin_batch(model.batch_size)
        test_mask = rng.randint(0, 2, test_mask.shape)
        if hasattr(self.mask_gen,
                   'sync_channels') and self.mask_gen.sync_channels:
            # With sync_channels the mask drops its last axis (4-D batches
            # only)
            if test_mask.ndim != 4:
                raise NotImplementedError()
            test_mask = test_mask[:, :, :, 0]
            assert test_mask.ndim == 3
        drop_mask = sharedX(np.cast[X.dtype](test_mask), name='drop_mask')
        self.drop_mask = drop_mask
        assert drop_mask.ndim == test_mask.ndim

        Y = None
        drop_mask_Y = None
        if self.cost.supervised:
            Y = sharedX(
                model.get_output_space().get_origin_batch(model.batch_size),
                'BGD_Y')
            self.Y = Y
            test_mask_Y = rng.randint(0, 2, (model.batch_size, ))
            drop_mask_Y = sharedX(np.cast[Y.dtype](test_mask_Y),
                                  name='drop_mask_Y')
            self.drop_mask_Y = drop_mask_Y
            dmx, dmy = self.mask_gen(X, Y)
            updates = OrderedDict([ (drop_mask, dmx),\
                    (drop_mask_Y, dmy)] )
        else:
            updates = OrderedDict([(drop_mask, self.mask_gen(X))])

        obj = self.cost(model,
                        X,
                        Y,
                        drop_mask=drop_mask,
                        drop_mask_Y=drop_mask_Y)
        gradients, gradient_updates = self.cost.get_gradients(
            model, X, Y, drop_mask=drop_mask, drop_mask_Y=drop_mask_Y)

        # If the inference procedure has dropout variables, resample them in
        # the same update as the drop masks: binomial(n=1, p=include_prob)
        # samples divided by include_prob
        if hasattr(model.inference_procedure, 'V_dropout'):
            include_prob = model.inference_procedure.include_prob
            theano_rng = MRG_RandomStreams(2012 + 11 + 20)
            for elem in flatten([
                    model.inference_procedure.V_dropout,
                    model.inference_procedure.H_dropout
            ]):
                updates[elem] = theano_rng.binomial(
                    p=include_prob, size=elem.shape, dtype=elem.dtype,
                    n=1) / include_prob
        # Input-free compiled function that applies all mask updates
        self.update_mask = function([], updates=updates)

        if self.monitoring_dataset is not None:
            # No monitoring dataset provides targets: build the channels
            # without Y
            if not any([
                    dataset.has_targets()
                    for dataset in self.monitoring_dataset.values()
            ]):
                Y = None
            assert X.name is not None
            channels = model.get_monitoring_channels(X, Y)
            if not isinstance(channels, dict):
                raise TypeError("model.get_monitoring_channels must return a "
                                "dictionary, but it returned " + str(channels))
            assert X.name is not None
            # Channels supplied by the cost are merged in; on a name clash
            # the cost's channel replaces the model's
            wtf = self.cost.get_monitoring_channels(model,
                                                    X=X,
                                                    Y=Y,
                                                    drop_mask=drop_mask,
                                                    drop_mask_Y=drop_mask_Y)
            for key in wtf:
                channels[key] = wtf[key]

            for dataset_name in self.monitoring_dataset:

                if dataset_name == '':
                    prefix = ''
                else:
                    prefix = dataset_name + '_'
                monitoring_dataset = self.monitoring_dataset[dataset_name]
                self.monitor.add_dataset(dataset=monitoring_dataset,
                                         mode="sequential",
                                         batch_size=self.batch_size,
                                         num_batches=self.monitoring_batches)
                #we only need to put the prereq in once to make sure it gets run
                #adding it more times shouldn't hurt, but be careful
                #each time you say "self.setup_batch" you get a new object with a
                #different id, and if you install n of those the prereq will run n
                #times. It won't cause any wrong results, just a big slowdown
                warnings.warn(
                    "This is weird-- ipt=(X,Y)=tell the monitor to replace X, Y with the givens dict, "
                    " but you don't actually want them to be replaced.")
                ipt = X
                if Y is not None:
                    ipt = [X, Y]
                self.monitor.add_channel(prefix + 'objective',
                                         ipt=ipt,
                                         val=obj,
                                         dataset=monitoring_dataset,
                                         prereqs=[prereq])

                for name in channels:
                    J = channels[name]
                    # A channel may be given as (expression, prereqs)
                    if isinstance(J, tuple):
                        assert len(J) == 2
                        J, prereqs = J
                    else:
                        prereqs = []

                    # Copy before appending so the channel's own prereq
                    # list is not modified
                    prereqs = list(prereqs)
                    prereqs.append(prereq)

                    if Y is not None:
                        ipt = (X, Y)
                    else:
                        ipt = X

                    self.monitor.add_channel(name=prefix + name,
                                             ipt=ipt,
                                             val=J,
                                             dataset=monitoring_dataset,
                                             prereqs=prereqs)

        # When several batches are combined per step, the optimizer takes the
        # shared variables as explicit inputs; otherwise it has no inputs
        self.accumulate = self.combine_batches > 1
        if self.accumulate:
            self.inputs = [
                elem for elem in [X, Y, drop_mask, drop_mask_Y]
                if elem is not None
            ]
        else:
            self.inputs = None

        self.optimizer = BatchGradientDescent(
            objective=obj,
            inputs=self.inputs,
            verbose=1,
            gradients=gradients,
            gradient_updates=gradient_updates,
            params=model.get_params(),
            lr_scalers=model.get_lr_scalers(),
            param_constrainers=[model.censor_updates],
            max_iter=self.max_iter,
            tol=3e-7,
            init_alpha=self.init_alpha,
            reset_alpha=self.reset_alpha,
            conjugate=self.conjugate,
            reset_conjugate=self.reset_conjugate,
            min_init_alpha=self.min_init_alpha,
            line_search_mode=self.line_search_mode,
            accumulate=self.accumulate,
            theano_function_mode=self.theano_function_mode)
        self.X = X

        if self.monitoring_dataset is not None:
            # Optimizer statistics are monitored on the first monitoring
            # dataset only; `ipt` still holds the value assigned last in the
            # channel loop above
            self.monitor.add_channel(
                name='ave_step_size',
                ipt=ipt,
                val=self.optimizer.ave_step_size,
                dataset=self.monitoring_dataset.values()[0])
            self.monitor.add_channel(
                name='ave_grad_size',
                ipt=ipt,
                val=self.optimizer.ave_grad_size,
                dataset=self.monitoring_dataset.values()[0])
            self.monitor.add_channel(
                name='ave_grad_mult',
                ipt=ipt,
                val=self.optimizer.ave_grad_mult,
                dataset=self.monitoring_dataset.values()[0])

        self.first = True
        self.bSetup = True

    def before_step(self, model):
        """When scale_step is not 1, remember the model's parameters and
        their current values so `after_step` can interpolate."""
        if self.scale_step != 1.:
            self.params = list(model.get_params())
            self.value = [param.get_value() for param in self.params]

    def after_step(self, model):
        """When scale_step is not 1, set each parameter to
        (1 - scale_step) * value saved by `before_step`
        + scale_step * current value."""
        if self.scale_step != 1:
            for param, value in safe_zip(self.params, self.value):
                value = (1. - self.scale_step
                         ) * value + self.scale_step * param.get_value()
                param.set_value(value)

    def train(self, dataset):
        """Iterate once over `dataset` in shuffled batches, load each batch
        into the shared variables, resample the masks and run the optimizer
        (once per batch, or once per `combine_batches` batches when
        accumulating).

        `setup` must have been called first.
        """
        assert self.bSetup
        model = self.model
        if self.batch_size is None:
            batch_size = model.force_batch_size
        else:
            batch_size = self.batch_size
            if hasattr(model, 'force_batch_size'):
                assert (model.force_batch_size <= 0
                        or batch_size == model.force_batch_size)

        # Each iterator batch is repeated `duplicate` times below, so the
        # iterator only yields batch_size // duplicate examples
        assert self.batch_size % self.duplicate == 0

        rng = self.rng
        train_iteration_mode = 'shuffled_sequential'
        # Only stochastic iteration modes are given a random number generator
        if not is_stochastic(train_iteration_mode):
            rng = None
        iterator = dataset.iterator(mode=train_iteration_mode,
                                    batch_size=self.batch_size //
                                    self.duplicate,
                                    num_batches=self.batches_per_iter,
                                    targets=self.cost.supervised,
                                    topo=self.X.ndim != 2,
                                    rng=rng)

        accum_batches = []

        if self.accumulate:
            warnings.warn(
                "InpaintAlg.train wastes time setting shared variables only to pull their value back out."
            )

        for data in iterator:
            if self.cost.supervised:
                X, Y = data
                mode = self.theano_function_mode
                # If the function mode has a `record` attribute, log the
                # targets to it on a single line
                if mode is not None and hasattr(mode, 'record'):
                    stry = str(Y).replace('\n', ' ')
                    mode.record.handle_line('data Y ' + stry + '\n')
                if self.duplicate > 1:
                    Y = np.concatenate([Y] * self.duplicate, axis=0)
                self.Y.set_value(Y)
            else:
                X = data

            if self.duplicate > 1:
                X = np.concatenate([X] * self.duplicate, axis=0)
            self.X.set_value(X)

            self.update_mask()

            if self.accumulate:
                # Snapshot the shared inputs; the optimizer runs once
                # `combine_batches` snapshots have been collected
                accum_batches.append(
                    [elem.get_value() for elem in self.inputs])
                if len(accum_batches) == self.combine_batches:
                    self.before_step(model)
                    self.optimizer.minimize(*accum_batches)
                    self.after_step(model)
                    actual_batch_size = sum(
                        [batch[0].shape[0] for batch in accum_batches])
                    model.monitor.report_batch(actual_batch_size)
                    accum_batches = []
            else:
                self.before_step(model)
                self.optimizer.minimize()
                self.after_step(model)
                actual_batch_size = X.shape[0]
                model.monitor.report_batch(actual_batch_size)
        # Fails when the number of batches is not a multiple of
        # combine_batches, i.e. some collected batches were never used
        assert len(accum_batches) == 0

    def continue_learning(self, model):
        """Return the termination criterion's result for the model passed
        to `setup` (the `model` argument is not used), or True when no
        criterion was given."""
        if self.termination_criterion is not None:
            return self.termination_criterion(self.model)
        return True
class BGD(TrainingAlgorithm):
    """
    Batch Gradient Descent training algorithm class

    Parameters
    ----------
    cost : pylearn2.costs.Cost
        A pylearn2 Cost, or None, in which case model.get_default_cost()
        will be used
    batch_size : int
        Like the SGD TrainingAlgorithm, this TrainingAlgorithm still
        iterates over minibatches of data. The difference is that this
        class uses partial line searches to choose the step size along
        each gradient direction, and can do repeated updates on the same
        batch. The assumption is that you use big enough minibatches with
        this algorithm that a large step size will generalize reasonably
        well to other minibatches. To implement true Batch Gradient
        Descent, set the batch_size to the total number of examples
        available. If batch_size is None, it will revert to the model's
        force_batch_size attribute.
    batches_per_iter : int
        Passed to the training set iterator as `num_batches`.
    updates_per_batch : int
        Passed through to the optimization.BatchGradientDescent's
        `max_iter` parameter
    monitoring_batch_size : int
        Size of monitoring batches.
    monitoring_batches : int
        Passed to the monitor as `num_batches`.
    monitoring_dataset: Dataset or dict
        A Dataset or a dictionary mapping string dataset names to Datasets
    termination_criterion : object
        None, or an object whose `continue_learning(model)` method decides
        whether training goes on.
    set_batch_size : bool
        If True, BGD will attempt to override the model's
        `force_batch_size` attribute by calling set_batch_size on it.
    reset_alpha : bool
        Passed through to the optimization.BatchGradientDescent's
        parameter of the same name
    conjugate : bool
        Passed through to the optimization.BatchGradientDescent's
        parameter of the same name
    min_init_alpha : float
        Passed through to the optimization.BatchGradientDescent's
        parameter of the same name
    reset_conjugate : bool
        Passed through to the optimization.BatchGradientDescent's
        parameter of the same name
    line_search_mode : object
        Passed through to the optimization.BatchGradientDescent's
        parameter of the same name
    verbose_optimization : bool
        Passed through to the optimization.BatchGradientDescent's
        `verbose` parameter
    scale_step : float
        After each optimizer call every parameter is set to
        ``(1 - scale_step) * old_value + scale_step * new_value``.
        The default of 1 leaves the optimizer's result untouched.
    theano_function_mode : object
        Passed to the monitor and to optimization.BatchGradientDescent.
    init_alpha : object
        Passed through to the optimization.BatchGradientDescent's
        parameter of the same name
    seed : object
        Passed to make_np_rng together with [2012, 10, 16] (presumably the
        fallback seed -- confirm against make_np_rng).
    """

    def __init__(self, cost=None, batch_size=None, batches_per_iter=None,
                 updates_per_batch=10, monitoring_batch_size=None,
                 monitoring_batches=None, monitoring_dataset=None,
                 termination_criterion=None, set_batch_size=False,
                 reset_alpha=True, conjugate=False, min_init_alpha=.001,
                 reset_conjugate=True, line_search_mode=None,
                 verbose_optimization=False, scale_step=1.,
                 theano_function_mode=None, init_alpha=None, seed=None):
        # Must stay the first statement: it copies every constructor
        # argument onto the instance, and would also pick up any local
        # variable defined before it.
        self.__dict__.update(locals())
        del self.self

        if monitoring_dataset is None:
            assert monitoring_batches is None
            assert monitoring_batch_size is None

        self._set_monitoring_dataset(monitoring_dataset)

        self.bSetup = False
        self.termination_criterion = termination_criterion
        self.rng = make_np_rng(seed, [2012, 10, 16],
                               which_method=["randn", "randint"])

    def setup(self, model, dataset):
        """
        Allows the training algorithm to do some preliminary configuration
        *before* we actually start training the model. The dataset is provided
        in case other derived training algorithms need to modify model based on
        the dataset.

        Parameters
        ----------
        model : object
            A Python object representing the model to train loosely
            implementing the interface of models.model.Model.
        dataset : pylearn2.datasets.dataset.Dataset
            Dataset object used to draw training data

        Raises
        ------
        ValueError
            If `batch_size` conflicts with a positive
            `model.force_batch_size`, or if the cost has no tractable
            expression.
        """
        self.model = model

        if self.cost is None:
            self.cost = model.get_default_cost()

        if self.batch_size is None:
            self.batch_size = model.force_batch_size
        else:
            batch_size = self.batch_size
            if self.set_batch_size:
                model.set_batch_size(batch_size)
            elif hasattr(model, 'force_batch_size'):
                if not (model.force_batch_size <= 0 or
                        batch_size == model.force_batch_size):
                    # Adjacent literals are joined before `%` is applied.
                    # The previous `"..." + "..." % (a, b)` formatted only
                    # the second literal and raised TypeError instead.
                    raise ValueError("batch_size is %d but "
                                     "model.force_batch_size is %d" %
                                     (batch_size, model.force_batch_size))

        self.monitor = Monitor.get_monitor(model)
        self.monitor.set_theano_function_mode(self.theano_function_mode)

        data_specs = self.cost.get_data_specs(model)
        mapping = DataSpecsMapping(data_specs)
        space_tuple = mapping.flatten(data_specs[0], return_tuple=True)
        source_tuple = mapping.flatten(data_specs[1], return_tuple=True)

        # Build a flat tuple of Theano Variables, one for each space,
        # named according to the sources.
        theano_args = []
        for space, source in safe_zip(space_tuple, source_tuple):
            name = 'BGD_[%s]' % source
            arg = space.make_theano_batch(name=name)
            theano_args.append(arg)
        theano_args = tuple(theano_args)

        # Methods of `self.cost` need args to be passed in a format compatible
        # with their data_specs
        nested_args = mapping.nest(theano_args)
        fixed_var_descr = self.cost.get_fixed_var_descr(model, nested_args)
        self.on_load_batch = fixed_var_descr.on_load_batch

        cost_value = self.cost.expr(model, nested_args,
                                    **fixed_var_descr.fixed_vars)
        grads, grad_updates = self.cost.get_gradients(
            model, nested_args, **fixed_var_descr.fixed_vars)

        assert isinstance(grads, OrderedDict)
        assert isinstance(grad_updates, OrderedDict)

        if cost_value is None:
            raise ValueError("BGD is incompatible with " + str(self.cost) +
                             " because it is intractable, but BGD uses the " +
                             "cost function value to do line searches.")

        # obj_prereqs has to be a list of function f called with f(*data),
        # where data is a data tuple coming from the iterator.
        # this function enables capturing "mapping" and "f", while
        # enabling the "*data" syntax
        def capture(f, mapping=mapping):
            def new_f(*args):
                return f(mapping.flatten(args, return_tuple=True))
            return new_f

        obj_prereqs = [capture(f) for f in fixed_var_descr.on_load_batch]

        if self.monitoring_dataset is not None:
            # Fall back to the training batch settings only when neither
            # monitoring setting was given.
            if (self.monitoring_batch_size is None and
                    self.monitoring_batches is None):
                self.monitoring_batch_size = self.batch_size
                self.monitoring_batches = self.batches_per_iter
            self.monitor.setup(
                dataset=self.monitoring_dataset,
                cost=self.cost,
                batch_size=self.monitoring_batch_size,
                num_batches=self.monitoring_batches,
                obj_prereqs=obj_prereqs,
                cost_monitoring_args=fixed_var_descr.fixed_vars)

        params = model.get_params()

        self.optimizer = BatchGradientDescent(
            objective=cost_value,
            gradients=grads,
            gradient_updates=grad_updates,
            params=params,
            param_constrainers=[model.censor_updates],
            lr_scalers=model.get_lr_scalers(),
            inputs=theano_args,
            verbose=self.verbose_optimization,
            max_iter=self.updates_per_batch,
            reset_alpha=self.reset_alpha,
            conjugate=self.conjugate,
            reset_conjugate=self.reset_conjugate,
            min_init_alpha=self.min_init_alpha,
            line_search_mode=self.line_search_mode,
            theano_function_mode=self.theano_function_mode,
            init_alpha=self.init_alpha)

        # These monitoring channels keep track of shared variables,
        # which do not need inputs nor data.
        if self.monitoring_dataset is not None:
            # The channels are attached to the first monitoring dataset.
            first_dataset = list(self.monitoring_dataset.values())[0]
            for channel in ('ave_step_size', 'ave_grad_size',
                            'ave_grad_mult'):
                self.monitor.add_channel(
                    name=channel,
                    ipt=None,
                    val=getattr(self.optimizer, channel),
                    data_specs=(NullSpace(), ''),
                    dataset=first_dataset)

        self.first = True
        self.bSetup = True

    def train(self, dataset):
        """
        Iterate once over `dataset`, running the optimizer on every batch.

        Parameters
        ----------
        dataset : pylearn2.datasets.dataset.Dataset
            Dataset providing the training batches.

        Raises
        ------
        NotImplementedError
            If the cost's data specs contain no space at all.
        """
        assert self.bSetup
        model = self.model

        rng = self.rng
        train_iteration_mode = 'shuffled_sequential'
        if not is_stochastic(train_iteration_mode):
            rng = None

        data_specs = self.cost.get_data_specs(self.model)
        # The iterator should be built from flat data specs, so it returns
        # flat, non-redundant tuples of data.
        mapping = DataSpecsMapping(data_specs)
        space_tuple = mapping.flatten(data_specs[0], return_tuple=True)
        source_tuple = mapping.flatten(data_specs[1], return_tuple=True)
        if len(space_tuple) == 0:
            # No data will be returned by the iterator, and it is impossible
            # to know the size of the actual batch.
            # It is not decided yet what the right thing to do should be.
            raise NotImplementedError(
                "Unable to train with BGD, because "
                "the cost does not actually use data from the data set. "
                "data_specs: %s" % str(data_specs))
        flat_data_specs = (CompositeSpace(space_tuple), source_tuple)

        iterator = dataset.iterator(mode=train_iteration_mode,
                                    batch_size=self.batch_size,
                                    num_batches=self.batches_per_iter,
                                    data_specs=flat_data_specs,
                                    return_tuple=True,
                                    rng=rng)

        mode = self.theano_function_mode
        for data in iterator:
            # Log the targets when the function mode exposes a `record`.
            if ('targets' in source_tuple and mode is not None and
                    hasattr(mode, 'record')):
                Y = data[source_tuple.index('targets')]
                stry = str(Y).replace('\n', ' ')
                mode.record.handle_line('data Y ' + stry + '\n')

            for on_load_batch in self.on_load_batch:
                on_load_batch(mapping.nest(data))

            self.before_step(model)
            self.optimizer.minimize(*data)
            self.after_step(model)
            actual_batch_size = flat_data_specs[0].np_batch_size(data)
            model.monitor.report_batch(actual_batch_size)

    def continue_learning(self, model):
        """
        Return whether training should go on.

        Returns True when no termination criterion is set; otherwise returns
        the result of the criterion's `continue_learning(self.model)`.
        """
        if self.termination_criterion is None:
            return True
        rval = self.termination_criterion.continue_learning(self.model)
        assert rval in [True, False, 0, 1]
        return rval

    def before_step(self, model):
        """
        Save the current parameter values so `after_step` can blend them.

        Does nothing when `self.scale_step` equals 1.
        """
        if self.scale_step != 1.:
            self.params = list(model.get_params())
            self.value = [param.get_value() for param in self.params]

    def after_step(self, model):
        """
        Set every saved parameter to
        ``(1 - scale_step) * saved_value + scale_step * current_value``.

        Does nothing when `self.scale_step` equals 1.
        """
        if self.scale_step != 1:
            for param, value in safe_zip(self.params, self.value):
                value = ((1. - self.scale_step) * value +
                         self.scale_step * param.get_value())
                param.set_value(value)
class BGD(TrainingAlgorithm):
    """Batch Gradient Descent training algorithm class"""

    def __init__(self, cost=None, batch_size=None, batches_per_iter=None,
                 updates_per_batch=10, monitoring_batches=None,
                 monitoring_dataset=None, termination_criterion=None,
                 set_batch_size=False, reset_alpha=True, conjugate=False,
                 min_init_alpha=.001, reset_conjugate=True,
                 line_search_mode=None, verbose_optimization=False,
                 scale_step=1., theano_function_mode=None, init_alpha=None,
                 seed=None):
        """
        Parameters
        ----------
        cost : pylearn2.costs.Cost
            A pylearn2 Cost, or None, in which case model.get_default_cost()
            will be used
        batch_size : int
            Like the SGD TrainingAlgorithm, this TrainingAlgorithm still
            iterates over minibatches of data. The difference is that this
            class uses partial line searches to choose the step size along
            each gradient direction, and can do repeated updates on the same
            batch. The assumption is that you use big enough minibatches with
            this algorithm that a large step size will generalize reasonably
            well to other minibatches. To implement true Batch Gradient
            Descent, set the batch_size to the total number of examples
            available. If batch_size is None, it will revert to the model's
            force_batch_size attribute.
        batches_per_iter : int
            Passed to the training set iterator as `num_batches`.
        updates_per_batch : int
            Passed through to the optimization.BatchGradientDescent's
            `max_iter` parameter
        monitoring_batches : int
            Passed to the monitor as `num_batches`.
        monitoring_dataset: Dataset or dict
            A Dataset or a dictionary mapping string dataset names to Datasets
        termination_criterion : object
            None, or an object whose `continue_learning(model)` method
            decides whether training goes on.
        set_batch_size : bool
            If True, BGD will attempt to override the model's
            `force_batch_size` attribute by calling set_batch_size on it.
        reset_alpha : bool
            Passed through to the optimization.BatchGradientDescent's
            parameter of the same name
        conjugate : bool
            Passed through to the optimization.BatchGradientDescent's
            parameter of the same name
        min_init_alpha : float
            Passed through to the optimization.BatchGradientDescent's
            parameter of the same name
        reset_conjugate : bool
            Passed through to the optimization.BatchGradientDescent's
            parameter of the same name
        line_search_mode : object
            Passed through to the optimization.BatchGradientDescent's
            parameter of the same name
        verbose_optimization : bool
            Passed through to the optimization.BatchGradientDescent's
            `verbose` parameter
        scale_step : float
            After each optimizer call every parameter is set to
            ``(1 - scale_step) * old_value + scale_step * new_value``.
        theano_function_mode : object
            Passed to the monitor and to optimization.BatchGradientDescent.
        init_alpha : object
            Passed through to the optimization.BatchGradientDescent's
            parameter of the same name
        seed : object
            Seed for the numpy RandomState; [2012, 10, 16] when None.
        """
        # Must stay the first statement: it copies every constructor
        # argument onto the instance, and would also pick up any local
        # variable defined before it.
        self.__dict__.update(locals())
        del self.self

        if monitoring_dataset is None:
            assert monitoring_batches is None

        self._set_monitoring_dataset(monitoring_dataset)

        self.bSetup = False
        self.termination_criterion = termination_criterion
        # Note: self.seed keeps the value passed in (possibly None).
        if seed is None:
            seed = [2012, 10, 16]
        self.rng = np.random.RandomState(seed)

    def setup(self, model, dataset):
        """
        Allows the training algorithm to do some preliminary configuration
        *before* we actually start training the model. The dataset is provided
        in case other derived training algorithms need to modify model based on
        the dataset.

        Parameters
        ----------
        model : object
            A Python object representing the model to train loosely
            implementing the interface of models.model.Model.
        dataset : pylearn2.datasets.dataset.Dataset
            Dataset object used to draw training data

        Raises
        ------
        ValueError
            If `batch_size` conflicts with a positive
            `model.force_batch_size`, or if the cost has no tractable
            expression.
        """
        self.model = model

        if self.cost is None:
            self.cost = model.get_default_cost()

        if self.batch_size is None:
            self.batch_size = model.force_batch_size
        else:
            batch_size = self.batch_size
            if self.set_batch_size:
                model.set_batch_size(batch_size)
            elif hasattr(model, 'force_batch_size'):
                if not (model.force_batch_size <= 0 or
                        batch_size == model.force_batch_size):
                    # Adjacent literals are joined before `%` is applied.
                    # The previous `"..." + "..." % (a, b)` formatted only
                    # the second literal and raised TypeError instead.
                    raise ValueError("batch_size is %d but "
                                     "model.force_batch_size is %d" %
                                     (batch_size, model.force_batch_size))

        self.monitor = Monitor.get_monitor(model)
        self.monitor.set_theano_function_mode(self.theano_function_mode)

        data_specs = self.cost.get_data_specs(model)
        mapping = DataSpecsMapping(data_specs)
        space_tuple = mapping.flatten(data_specs[0], return_tuple=True)
        source_tuple = mapping.flatten(data_specs[1], return_tuple=True)

        # Build a flat tuple of Theano Variables, one for each space,
        # named according to the sources.
        theano_args = []
        for space, source in safe_zip(space_tuple, source_tuple):
            name = 'BGD_[%s]' % source
            arg = space.make_theano_batch(name=name)
            theano_args.append(arg)
        theano_args = tuple(theano_args)

        # Methods of `self.cost` need args to be passed in a format compatible
        # with their data_specs
        nested_args = mapping.nest(theano_args)
        fixed_var_descr = self.cost.get_fixed_var_descr(model, nested_args)
        self.on_load_batch = fixed_var_descr.on_load_batch

        cost_value = self.cost.expr(model, nested_args,
                                    **fixed_var_descr.fixed_vars)
        grads, grad_updates = self.cost.get_gradients(
            model, nested_args, **fixed_var_descr.fixed_vars)

        assert isinstance(grads, OrderedDict)
        assert isinstance(grad_updates, OrderedDict)

        if cost_value is None:
            raise ValueError("BGD is incompatible with " + str(self.cost) +
                             " because it is intractable, but BGD uses the " +
                             "cost function value to do line searches.")

        # obj_prereqs has to be a list of function f called with f(*data),
        # where data is a data tuple coming from the iterator.
        # this function enables capturing "mapping" and "f", while
        # enabling the "*data" syntax
        def capture(f, mapping=mapping):
            def new_f(*args):
                return f(mapping.flatten(args, return_tuple=True))
            return new_f

        obj_prereqs = [capture(f) for f in fixed_var_descr.on_load_batch]

        if self.monitoring_dataset is not None:
            self.monitor.setup(
                dataset=self.monitoring_dataset,
                cost=self.cost,
                batch_size=self.batch_size,
                num_batches=self.monitoring_batches,
                obj_prereqs=obj_prereqs,
                cost_monitoring_args=fixed_var_descr.fixed_vars)

        # NOTE(review): a disabled, triple-quoted block used to sit here. It
        # registered an 'objective' channel plus the model's and the cost's
        # monitoring channels for every monitoring dataset by hand. As a bare
        # string expression it never executed, so it has been dropped; the
        # Monitor.setup call above is presumably its replacement -- confirm.

        params = model.get_params()

        self.optimizer = BatchGradientDescent(
            objective=cost_value,
            gradients=grads,
            gradient_updates=grad_updates,
            params=params,
            param_constrainers=[model.censor_updates],
            lr_scalers=model.get_lr_scalers(),
            inputs=theano_args,
            verbose=self.verbose_optimization,
            max_iter=self.updates_per_batch,
            reset_alpha=self.reset_alpha,
            conjugate=self.conjugate,
            reset_conjugate=self.reset_conjugate,
            min_init_alpha=self.min_init_alpha,
            line_search_mode=self.line_search_mode,
            theano_function_mode=self.theano_function_mode,
            init_alpha=self.init_alpha)

        # These monitoring channels keep track of shared variables,
        # which do not need inputs nor data.
        if self.monitoring_dataset is not None:
            # The channels are attached to the first monitoring dataset.
            first_dataset = list(self.monitoring_dataset.values())[0]
            for channel in ('ave_step_size', 'ave_grad_size',
                            'ave_grad_mult'):
                self.monitor.add_channel(
                    name=channel,
                    ipt=None,
                    val=getattr(self.optimizer, channel),
                    data_specs=(NullSpace(), ''),
                    dataset=first_dataset)

        self.first = True
        self.bSetup = True

    def train(self, dataset):
        """
        Iterate once over `dataset`, running the optimizer on every batch.

        Parameters
        ----------
        dataset : pylearn2.datasets.dataset.Dataset
            Dataset providing the training batches.

        Raises
        ------
        NotImplementedError
            If the cost's data specs contain no space at all.
        """
        assert self.bSetup
        model = self.model

        rng = self.rng
        train_iteration_mode = 'shuffled_sequential'
        if not is_stochastic(train_iteration_mode):
            rng = None

        data_specs = self.cost.get_data_specs(self.model)
        # The iterator should be built from flat data specs, so it returns
        # flat, non-redundant tuples of data.
        mapping = DataSpecsMapping(data_specs)
        space_tuple = mapping.flatten(data_specs[0], return_tuple=True)
        source_tuple = mapping.flatten(data_specs[1], return_tuple=True)
        if len(space_tuple) == 0:
            # No data will be returned by the iterator, and it is impossible
            # to know the size of the actual batch.
            # It is not decided yet what the right thing to do should be.
            raise NotImplementedError(
                "Unable to train with BGD, because "
                "the cost does not actually use data from the data set. "
                "data_specs: %s" % str(data_specs))
        flat_data_specs = (CompositeSpace(space_tuple), source_tuple)

        iterator = dataset.iterator(mode=train_iteration_mode,
                                    batch_size=self.batch_size,
                                    num_batches=self.batches_per_iter,
                                    data_specs=flat_data_specs,
                                    return_tuple=True,
                                    rng=rng)

        mode = self.theano_function_mode
        for data in iterator:
            # Log the targets when the function mode exposes a `record`.
            if ('targets' in source_tuple and mode is not None and
                    hasattr(mode, 'record')):
                Y = data[source_tuple.index('targets')]
                stry = str(Y).replace('\n', ' ')
                mode.record.handle_line('data Y ' + stry + '\n')

            for on_load_batch in self.on_load_batch:
                on_load_batch(mapping.nest(data))

            self.before_step(model)
            self.optimizer.minimize(*data)
            self.after_step(model)
            actual_batch_size = flat_data_specs[0].np_batch_size(data)
            model.monitor.report_batch(actual_batch_size)

    def continue_learning(self, model):
        """
        Return whether training should go on.

        Returns True when no termination criterion is set; otherwise returns
        the result of the criterion's `continue_learning(self.model)`.
        """
        if self.termination_criterion is None:
            return True
        rval = self.termination_criterion.continue_learning(self.model)
        assert rval in [True, False, 0, 1]
        return rval

    def before_step(self, model):
        """
        Save the current parameter values so `after_step` can blend them.

        Does nothing when `self.scale_step` equals 1.
        """
        if self.scale_step != 1.:
            self.params = list(model.get_params())
            self.value = [param.get_value() for param in self.params]

    def after_step(self, model):
        """
        Set every saved parameter to
        ``(1 - scale_step) * saved_value + scale_step * current_value``.

        Does nothing when `self.scale_step` equals 1.
        """
        if self.scale_step != 1:
            for param, value in safe_zip(self.params, self.value):
                value = ((1. - self.scale_step) * value +
                         self.scale_step * param.get_value())
                param.set_value(value)
# Objective to minimise: the negated entry p[0, filter_idx, i, j] plus
# norm_penalty times the sum of squares of X. It must be a scalar.
act = p[0,filter_idx,i,j]
obj = - act + norm_penalty * T.square(X).sum()
assert obj.ndim == 0

# Optimise X itself: it is the only parameter and no inputs are fed in.
optimizer = BatchGradientDescent(objective = obj,
        params = [X],
        inputs = None,
        param_constrainers = None,
        max_iter = 1000,
        verbose = True,
        tol = None,
        init_alpha = (.001, .005, .01, .05, .1))
optimizer.minimize()

# Take the first entry of the optimised X along its leading axis.
img = X.get_value()[0,:,:,:]

# Report a few statistics of the result.
print 'max mag: ',np.abs(img).max()
print 'norm: ',np.square(img).sum()
print 'min: ',img.min()
print 'max: ',img.max()

# Rescale in place: divide by the largest magnitude (values in [-1, 1]),
# halve (values in [-0.5, 0.5]), then shift by 1 (values in [0.5, 1.5]).
# NOTE(review): if `show` expects values in [0, 1] the shift should be .5,
# and an all-zero img makes the division 0/0 -- confirm both.
img /= np.abs(img).max()
img *= .5
img += 1
show(img)
class BGD(TrainingAlgorithm):
    """Batch Gradient Descent training algorithm class"""

    def __init__(self, cost, batch_size=None, batches_per_iter=None,
                 updates_per_batch=10, monitoring_batches=None,
                 monitoring_dataset=None, termination_criterion=None,
                 set_batch_size=False, reset_alpha=True, conjugate=False,
                 min_init_alpha=None, reset_conjugate=True,
                 line_search_mode=None):
        """
        Parameters
        ----------
        cost : pylearn2 Cost
            The cost to minimise.
        batch_size : int
            Like the SGD TrainingAlgorithm, this TrainingAlgorithm still
            iterates over minibatches of data. The difference is that this
            class uses partial line searches to choose the step size along
            each gradient direction, and can do repeated updates on the same
            batch. The assumption is that you use big enough minibatches
            with this algorithm that a large step size will generalize
            reasonably well to other minibatches. To implement true Batch
            Gradient Descent, set the batch_size to the total number of
            examples available. If batch_size is None, it will revert to the
            model's force_batch_size attribute.
        batches_per_iter : int
            Passed to the training set iterator as `num_batches`.
        updates_per_batch : int
            Passed through to the optimization.BatchGradientDescent's
            `max_iter` parameter
        monitoring_batches : int
            Passed to the monitor as `num_batches` for every monitoring
            dataset.
        monitoring_dataset : Dataset or dict
            A Dataset or a dictionary mapping string dataset names to
            Datasets
        termination_criterion : callable
            None, or a callable taking the model and returning whether
            training should continue.
        set_batch_size : bool
            If True, BGD will attempt to override the model's
            force_batch_size attribute by calling set_batch_size on it.
        reset_alpha, conjugate, reset_conjugate, min_init_alpha, \
line_search_mode :
            Passed through to the optimization.BatchGradientDescent
            parameters of the same names
        """
        # Must stay the first statement: it copies every constructor
        # argument onto the instance, and would also pick up any local
        # variable defined before it.
        self.__dict__.update(locals())
        del self.self

        if monitoring_dataset is None:
            assert monitoring_batches is None

        self._set_monitoring_dataset(monitoring_dataset)

        self.bSetup = False
        self.termination_criterion = termination_criterion
        self.rng = np.random.RandomState([2012, 10, 16])

    def setup(self, model, dataset):
        """
        Allows the training algorithm to do some preliminary configuration
        *before* we actually start training the model. The dataset is provided
        in case other derived training algorithms need to modify model based on
        the dataset.

        Parameters
        ----------
        model : object
            A Python object representing the model to train loosely
            implementing the interface of models.model.Model.
        dataset : pylearn2.datasets.dataset.Dataset
            Dataset object used to draw training data

        Raises
        ------
        ValueError
            If `batch_size` conflicts with a positive
            `model.force_batch_size`, or if the cost returns None.
        TypeError
            If `model.get_monitoring_channels` does not return a dict.
        """
        self.model = model

        if self.batch_size is None:
            self.batch_size = model.force_batch_size
        else:
            batch_size = self.batch_size
            if self.set_batch_size:
                model.set_batch_size(batch_size)
            elif hasattr(model, "force_batch_size"):
                if not (model.force_batch_size <= 0 or
                        batch_size == model.force_batch_size):
                    raise ValueError(
                        "batch_size is %d but model.force_batch_size is %d"
                        % (batch_size, model.force_batch_size)
                    )

        self.monitor = Monitor.get_monitor(model)

        X = self.model.get_input_space().make_theano_batch()
        self.topo = X.ndim != 2
        Y = T.matrix()

        # `ipt` is exactly what the cost expression depends on. It must not
        # be modified below, because the optimizer inputs are derived from it.
        if self.cost.supervised:
            obj = self.cost(model, X, Y)
            grads, grad_updates = self.cost.get_gradients(model, X, Y)
            ipt = (X, Y)
        else:
            obj = self.cost(model, X)
            grads, grad_updates = self.cost.get_gradients(model, X)
            ipt = X

        if obj is None:
            raise ValueError(
                "BGD is incompatible with " + str(self.cost) + " because "
                " it is intractable, but BGD uses the cost function value to do "
                " line searches."
            )

        if self.monitoring_dataset is not None:
            # Without targets in any monitoring dataset, the channels are
            # requested with Y=None.
            if not any([monitored.has_targets()
                        for monitored in self.monitoring_dataset.values()]):
                Y = None

            channels = model.get_monitoring_channels(X, Y)
            if not isinstance(channels, dict):
                raise TypeError(
                    "model.get_monitoring_channels must return a "
                    "dictionary, but it returned " + str(channels)
                )
            channels.update(self.cost.get_monitoring_channels(model, X, Y))

            # Inputs of the model/cost channels. Kept in a variable of its
            # own: the previous code reassigned `ipt` here, which changed
            # the inputs later handed to the optimizer (and to the
            # 'objective' channel of every dataset after the first).
            if Y is not None:
                channel_ipt = (X, Y)
            else:
                channel_ipt = X

            for dataset_name in self.monitoring_dataset:
                if dataset_name == "":
                    prefix = ""
                else:
                    prefix = dataset_name + "_"
                monitoring_dataset = self.monitoring_dataset[dataset_name]
                self.monitor.add_dataset(
                    dataset=monitoring_dataset,
                    mode="sequential",
                    batch_size=self.batch_size,
                    num_batches=self.monitoring_batches,
                )
                self.monitor.add_channel(prefix + "objective", ipt=ipt,
                                         val=obj, dataset=monitoring_dataset)

                for name in channels:
                    J = channels[name]
                    # A channel may come as a (value, prereqs) pair.
                    if isinstance(J, tuple):
                        assert len(J) == 2
                        J, prereqs = J
                    else:
                        prereqs = None

                    self.monitor.add_channel(
                        name=prefix + name, ipt=channel_ipt, val=J,
                        dataset=monitoring_dataset, prereqs=prereqs
                    )

        # The optimizer wants a sequence of inputs.
        if ipt is X:
            ipts = [X]
        else:
            ipts = ipt

        self.optimizer = BatchGradientDescent(
            objective=obj,
            gradients=grads,
            gradient_updates=grad_updates,
            params=model.get_params(),
            param_constrainers=[model.censor_updates],
            lr_scalers=model.get_lr_scalers(),
            inputs=ipts,
            verbose=True,
            max_iter=self.updates_per_batch,
            reset_alpha=self.reset_alpha,
            conjugate=self.conjugate,
            reset_conjugate=self.reset_conjugate,
            min_init_alpha=self.min_init_alpha,
            line_search_mode=self.line_search_mode,
        )

        self.first = True
        self.bSetup = True

    def train(self, dataset):
        """
        Iterate once over `dataset`, running the optimizer on every batch.

        Parameters
        ----------
        dataset : pylearn2.datasets.dataset.Dataset
            Dataset providing the training batches.
        """
        assert self.bSetup
        model = self.model

        rng = self.rng
        train_iteration_mode = "shuffled_sequential"
        if not is_stochastic(train_iteration_mode):
            rng = None

        iterator = dataset.iterator(
            mode=train_iteration_mode,
            batch_size=self.batch_size,
            targets=self.cost.supervised,
            num_batches=self.batches_per_iter,
            topo=self.topo,
            rng=rng,
        )

        for data in iterator:
            # Supervised iterators yield (X, Y) pairs, others yield X alone.
            if self.cost.supervised:
                args = data
                X, Y = data
            else:
                args = [data]
                X = data
            self.optimizer.minimize(*args)
            model.monitor.report_batch(X.shape[0])

    def continue_learning(self, model):
        """
        Return whether training should go on.

        Returns True when no termination criterion is set; otherwise returns
        whatever the criterion returns when called with `self.model`.
        """
        if self.termination_criterion is None:
            return True
        return self.termination_criterion(self.model)
class DNCE_Algorithm(object):
    """
    Training algorithm that minimises a DNCE objective with batch gradient
    descent, drawing fresh noise samples for every batch.
    """

    def __init__(self, noise, batch_size=1000, batches_per_iter=10,
                 noise_per_clean=30,
                 monitoring_batches=-1, monitoring_dataset=None):
        """
        Parameters
        ----------
        noise : object
            Noise source; handed to DNCE and asked for
            `random_design_matrix(X)` in `setup`.
        batch_size : int
            Number of examples fetched per batch. If None, reverts to the
            force_batch_size field of the model.
        batches_per_iter : int
            Number of batches processed by one call to `train`.
        noise_per_clean : int
            Number of noise batches created for the clean batch.
        monitoring_batches : int
            Passed to the monitor as `num_batches`; must be -1 when no
            monitoring dataset is given.
        monitoring_dataset : Dataset
            Optional dataset to monitor on.
        """
        self.batch_size, self.batches_per_iter = batch_size, batches_per_iter
        if monitoring_dataset is None:
            assert monitoring_batches == -1
        self.monitoring_dataset = monitoring_dataset
        self.monitoring_batches = monitoring_batches
        self.bSetup = False
        self.noise = noise
        self.noise_per_clean = noise_per_clean

    def setup(self, model, dataset):
        """
        Allows the training algorithm to do some preliminary configuration
        *before* we actually start training the model. The dataset is provided
        in case other derived training algorithms need to modify model based on
        the dataset.

        Parameters
        ----------
        model : object
            A Python object representing the model to train loosely
            implementing the interface of models.model.Model.
        dataset : pylearn2.datasets.dataset.Dataset
            Dataset object used to draw training data

        Raises
        ------
        TypeError
            If `model.get_monitoring_channels` does not return a dict.
        """
        self.model = model

        self.monitor = Monitor.get_monitor(model)

        X = T.matrix()
        Y = T.matrix()
        dnce = DNCE(self.noise)

        if self.monitoring_dataset is not None:
            if not self.monitoring_dataset.has_targets():
                Y = None

            self.monitor.set_dataset(dataset=self.monitoring_dataset,
                                     mode="sequential",
                                     batch_size=self.batch_size,
                                     num_batches=self.monitoring_batches)

            X.tag.test_value = self.monitoring_dataset.get_batch_design(2)
            channels = model.get_monitoring_channels(X, Y)

            if not isinstance(channels, dict):
                raise TypeError("model.get_monitoring_channels must return a "
                                "dictionary, but it returned " + str(channels))

            # noise_per_clean is set on the cost only while the monitoring
            # expression is built, then cleared again.
            dnce.noise_per_clean = self.noise_per_clean
            obj = dnce(model, X)
            dnce.noise_per_clean = None
            self.monitor.add_channel('DNCE', ipt=X, val=obj)

            for name in channels:
                J = channels[name]
                # A channel may come as a (value, prereqs) pair.
                if isinstance(J, tuple):
                    assert len(J) == 2
                    J, prereqs = J
                else:
                    prereqs = None

                if Y is not None:
                    ipt = (X, Y)
                else:
                    ipt = X

                self.monitor.add_channel(name=name,
                                         ipt=ipt,
                                         val=J,
                                         prereqs=prereqs)

        # From here on X is a shared variable holding the current clean
        # batch, and Y a list of shared variables holding noise batches.
        X = sharedX(dataset.get_batch_design(1), 'X')
        Y = []
        updates = {}
        for i in range(self.noise_per_clean):
            Y_i = sharedX(X.get_value().copy())
            updates[Y_i] = self.noise.random_design_matrix(X)
            Y.append(Y_i)
        self.update_noise = function([], updates=updates)

        obj = dnce(model, X, Y)

        self.optimizer = BatchGradientDescent(
            objective=obj,
            params=model.get_params(),
            param_constrainers=[model.censor_updates],
            max_iter=5)
        self.X = X
        self.Y = Y

        self.first = True
        self.bSetup = True

    def train(self, dataset):
        """
        Process `batches_per_iter` batches: load a clean batch, resample the
        noise, run the optimizer, and report the batch to the monitor.

        Parameters
        ----------
        dataset : object
            Must provide `get_batch_design(batch_size)`.

        Returns
        -------
        bool
            Always True.
        """
        assert self.bSetup
        model = self.model

        if self.batch_size is None:
            batch_size = model.force_batch_size
        else:
            batch_size = self.batch_size
            if hasattr(model, 'force_batch_size'):
                assert (model.force_batch_size <= 0 or
                        batch_size == model.force_batch_size)

        for i in range(self.batches_per_iter):
            # Fetch with the resolved batch size: self.batch_size may be
            # None, and the monitor is told `batch_size` examples were seen.
            self.X.set_value(dataset.get_batch_design(batch_size))
            self.update_noise()
            self.optimizer.minimize()
            model.monitor.report_batch(batch_size)
        return True
class BGD(object):
    """Batch Gradient Descent training algorithm class"""

    def __init__(self, cost, batch_size=None, batches_per_iter=10,
                 updates_per_batch = 10,
                 monitoring_batches=-1, monitoring_dataset=None,
                 termination_criterion = None):
        """
        Store every constructor argument as an attribute of the same name.

        If batch_size is None, `setup` reverts to the force_batch_size field
        of the model. `monitoring_batches` must be -1 when no monitoring
        dataset is given.
        """
        # Must stay the first statement so that only the arguments (and
        # `self`, removed right after) are copied onto the instance.
        self.__dict__.update(locals())
        del self.self

        if monitoring_dataset is None:
            assert monitoring_batches == -1

        self.bSetup = False
        self.termination_criterion = termination_criterion

    def setup(self, model, dataset):
        """
        Prepare the monitor and the optimizer before training starts.

        Parameters
        ----------
        model : object
            The model to train, loosely implementing the interface of
            models.model.Model.
        dataset : pylearn2.datasets.dataset.Dataset
            Dataset used to draw training data (not used by this method).

        Raises
        ------
        ValueError
            If `batch_size` conflicts with a positive
            `model.force_batch_size`.
        TypeError
            If `model.get_monitoring_channels` does not return a dict.
        """
        self.model = model

        if self.batch_size is None:
            self.batch_size = model.force_batch_size
        else:
            requested = self.batch_size
            if hasattr(model, 'force_batch_size') and not (
                    model.force_batch_size <= 0 or
                    requested == model.force_batch_size):
                raise ValueError("batch_size is %d but model.force_batch_size is %d" %
                        (requested, model.force_batch_size))

        self.monitor = Monitor.get_monitor(model)

        X = self.model.get_input_space().make_theano_batch()
        self.topo = X.ndim != 2
        Y = T.matrix()

        if self.monitoring_dataset is not None:
            if not self.monitoring_dataset.has_targets():
                Y = None

            self.monitor.add_dataset(dataset=self.monitoring_dataset,
                                     mode="sequential",
                                     batch_size=self.batch_size,
                                     num_batches=self.monitoring_batches)

            channels = model.get_monitoring_channels(X,Y)
            if not isinstance(channels, dict):
                raise TypeError("model.get_monitoring_channels must return a "
                                "dictionary, but it returned " + str(channels))

            #TODO: currently only supports unsupervised costs, support supervised too
            obj = self.cost(model,X)
            self.monitor.add_channel('batch_gd_objective',ipt=X,val=obj)

            for channel_name in channels:
                channel_val = channels[channel_name]
                channel_prereqs = None
                # A channel may come as a (value, prereqs) pair.
                if isinstance(channel_val, tuple):
                    assert len(channel_val) == 2
                    channel_val, channel_prereqs = channel_val

                channel_ipt = X if Y is None else (X,Y)

                self.monitor.add_channel(name=channel_name,
                                         ipt=channel_ipt,
                                         val=channel_val,
                                         prereqs=channel_prereqs)

        # The objective is built again for the optimizer, as before.
        obj = self.cost(model,X)

        self.optimizer = BatchGradientDescent(
                            objective = obj,
                            params = model.get_params(),
                            param_constrainers = [ model.censor_updates ],
                            lr_scalers = model.get_lr_scalers(),
                            inputs = [ X ],
                            verbose = True,
                            max_iter = self.updates_per_batch)

        self.first = True
        self.bSetup = True

    def train(self, dataset):
        """
        Run the optimizer on `batches_per_iter` batches drawn from `dataset`.

        Returns True when no termination criterion is set; otherwise returns
        whatever the criterion returns when called with the model.
        """
        assert self.bSetup
        model = self.model
        reported_size = self.batch_size

        # Topological batches for non-matrix inputs, design matrices otherwise.
        fetch_batch = (dataset.get_batch_topo if self.topo
                       else dataset.get_batch_design)

        for _ in xrange(self.batches_per_iter):
            batch = fetch_batch(self.batch_size)
            self.optimizer.minimize(batch)
            model.monitor.report_batch( reported_size )

        if self.termination_criterion is not None:
            return self.termination_criterion(self.model)
        return True
# Usage: <script> <model_path>
# Loads a pickled model, keeps only its discriminator, and searches for
# inputs that maximise the summed discriminator output.
_, model_path = sys.argv
from pylearn2.utils import serial
model = serial.load(model_path)
d = model.discriminator

# Only the discriminator is needed; drop the rest of the model.
import gc
del model
gc.collect()

from pylearn2.utils import sharedX
X = sharedX(d.get_input_space().get_origin_batch(1))
# Minimising the negated sum maximises the discriminator output.
obj = -d.fprop(X).sum()
from pylearn2.optimization.batch_gradient_descent import BatchGradientDescent as BGD
import theano.tensor as T


def norm_constraint(updates):
    """
    Rescale the updated value of X by its own Euclidean norm.

    Parameters
    ----------
    updates : dict
        Maps shared variables to their new symbolic values; modified in
        place. Must contain X.
    """
    assert X in updates
    updated_X = updates[X]
    # Normalise by the norm of the *updated* value. The previous code
    # divided by the norm of the old X, so the result was not rescaled by
    # its own norm.
    updates[X] = updated_X / (1e-7 + T.sqrt(T.sqr(updated_X).sum()))


opt = BGD(objective=obj,
          params=[X],
          param_constrainers=[norm_constraint],
          conjugate=True,
          reset_conjugate=False,
          reset_alpha=False,
          line_search_mode='exhaustive',
          verbose=3,
          max_iter=20)

results = []
import numpy as np
rng = np.random.RandomState([1, 2, 3])
# Ten restarts from small random starting points.
for i in range(10):
    X.set_value(rng.randn(*X.get_value().shape).astype(X.dtype) / 10.)
    opt.minimize()
    # Reorder the axes of the result before collecting it.
    Xv = X.dimshuffle(3, 1, 2, 0).eval()
    results.append(Xv)

# From here on X is the numpy array of all collected results.
X = np.concatenate(results, axis=0)
from pylearn2.gui.patch_viewer import make_viewer
v = make_viewer(X)
v.show()
1. + T.exp(model.free_energy(X) - model.free_energy(Y_i))) acc = (pos_prob > .5).mean() accs.append(acc) acc = sum(accs) / float(len(accs)) print '\tinit accuracy ', function([], acc)() #Minimize the objective function with batch gradient descent minimizer = BatchGradientDescent( objective=J, params=model.get_params(), param_constrainers=[model.censor_updates]) print '\tinit obj:', minimizer.obj() #minimizer.verbose = True minimizer.minimize() print '\tfinal obj:', minimizer.obj() recovered_beta = model.beta.get_value() recovered_mu = model.mu.get_value() print '\trecovered beta:', recovered_beta print '\trecovered mu:', recovered_mu kl = kl_divergence(true, model) kl = function([], kl)() assert kl >= 0.0 print '\tkl was ', kl print '\tfinal accuracy ', function([], acc)() kls[trial, idx1] = kl
class WarmStart(TrainExtension): def __init__(self, num_basis_vectors, num_points, scale, max_jump_norm = 1., method = 'gradient', fitting_cost = 'mse', include_root = False, num_applications = -1, psd = False, use_solver = False, reps=1): self.__dict__.update(locals()) del self.self self.batch_size = 1000 self.rng = np.random.RandomState([2014, 5, 8, 2]) def setup(self, model, dataset, algorithm): """ Train calls this immediately upon instantiation, before any monitoring is done. This subclass uses it to warm-start the parameters. Parameters ---------- model : pylearn2.models.Model The model object being trained. dataset : pylearn2.datasets.Dataset The dataset object being trained. algorithm : pylearn2.training_algorithms.TrainingAlgorithm The object representing the training algorithm being used to train the model. *This must be a TrainingAlgorithm that has a `cost` attribute that is a pylearn2 `Cost`, such as `SGD` or `BGD`.* """ if self.num_applications == 0: return self.num_applications -= 1 for i in xrange(self.reps): self.setup_impl(model, dataset, algorithm) def setup_impl(self, model, dataset, algorithm): cost = algorithm.cost root = model.get_param_vector() dim = root.size rng = self.rng points = rng.randn(self.num_points, self.num_basis_vectors) points = points.astype(root.dtype) points *= self.scale if self.include_root: points[0, :] = 0. if not hasattr(self, 'cost_fn'): # Cargo cult all the Pascal bullshit needed to evaluate the f*****g cost function now # ======================================= data_specs = cost.get_data_specs(model) mapping = DataSpecsMapping(data_specs) space_tuple = mapping.flatten(data_specs[0], return_tuple=True) source_tuple = mapping.flatten(data_specs[1], return_tuple=True) # Build a flat tuple of Theano Variables, one for each space. 
# We want that so that if the same space/source is specified # more than once in data_specs, only one Theano Variable # is generated for it, and the corresponding value is passed # only once to the compiled Theano function. theano_args = [] for space, source in safe_zip(space_tuple, source_tuple): name = '%s[%s]' % (self.__class__.__name__, source) arg = space.make_theano_batch(name=name, batch_size=self.batch_size) theano_args.append(arg) theano_args = tuple(theano_args) # Methods of `cost` need args to be passed in a format compatible # with data_specs nested_args = mapping.nest(theano_args) fixed_var_descr = cost.get_fixed_var_descr(model, nested_args) self.on_load_batch = fixed_var_descr.on_load_batch cost_value = cost.expr(model, nested_args, ** fixed_var_descr.fixed_vars) # End cargo culting # ====================== print "Compiling cost function..." cost_fn = function(theano_args, cost_value) self.cost_fn = cost_fn else: cost_fn = self.cost_fn cost_values = np.zeros(self.num_points) data = list(dataset.get_batch_design(self.batch_size, include_labels=True)) from pylearn2.utils.one_hot import one_hot data[1] = one_hot(data[1]) if self.method == 'gaussian': basis = rng.normal(dim, self.num_basis_vectors).astype(root.dtype) elif self.method == 'element': basis = np.zeros((dim, self.num_basis_vectors)).astype(root.dtype) for i in xrange(self.num_basis_vectors): basis[rng.randint(dim), i] = 1. 
elif self.method == 'gradient': if not hasattr(self, 'grad_fn'): self.grad_fn = function(theano_args, grad(cost_value, model.get_params())) grad_fn = self.grad_fn basis = np.zeros((dim, self.num_basis_vectors)).astype(root.dtype) for i in xrange(self.num_basis_vectors): ipt = list(dataset.get_batch_design(1, include_labels=True)) label = ipt[1] assert label.size == 1 label = label[0] one_hot = np.zeros((1, 10,),dtype='float32') one_hot[0, label] = 1 ipt[1] = one_hot g = grad_fn(*ipt) basis[:,i] = np.concatenate([e.reshape(e.size) for e in g], axis=0) else: assert False basis /= np.sqrt(np.square(basis).sum(axis=0)) # Orthogonalize basis for i in xrange(self.num_basis_vectors): v = basis[:,i ].copy() for j in xrange(i - 1): u = basis[:, j].copy() v -= np.dot(u, v) * u norm = np.sqrt(np.square(v).sum()) assert norm > 1e-4 v /= norm basis[:,i] = v for i in xrange(self.num_points): print "Evaluating cost at point ", i point = points[i, :] full_point = root + np.dot(basis, point) model.set_param_vector(full_point) cost_values[i] = cost_fn(*data) print cost_values[i] from pylearn2.utils import sharedX import theano.tensor as T print "!!!!!!!! FITTING THE QUADRATIC FUNCTION !!!!!!!!!!!!!!!!!!!" if not hasattr(self, 'fit_quad'): points = sharedX(points) #from theano import config #config.compute_test_value = 'raise' cost_values = sharedX(cost_values) A = sharedX(np.zeros((self.num_basis_vectors, self.num_basis_vectors))) if self.psd: mat = T.dot(A.T, A) else: mat = A b = sharedX(np.zeros(self.num_basis_vectors)) c = sharedX(0.) 
half_quad = T.dot(points, mat) quad = (points * half_quad).sum(axis=1) lin = T.dot(points, b) pred = quad + lin + c from pylearn2.optimization.batch_gradient_descent import BatchGradientDescent mse = T.square(pred - cost_values).mean() mae = abs(pred - cost_values).mean() obj = locals()[self.fitting_cost] fit_quad = BatchGradientDescent(obj, params = [A, b, c], max_iter = self.num_basis_vectors ** 2, verbose = 3, tol = None, init_alpha = None, min_init_alpha = 1e-7, reset_alpha = False, conjugate = True, reset_conjugate = False, line_search_mode = 'exhaustive') self.fit_quad = fit_quad self.A = A self.b = b self.c = c self.points = points self.cost_values = cost_values else: self.A.set_value(.001 * np.identity(self.A.get_value().shape[0], dtype=self.A.dtype)) self.b.set_value(self.b.get_value() * 0.) self.c.set_value(self.c.get_value() * 0.) self.points.set_value(points) self.cost_values.set_value(cost_values.astype(self.cost_values.dtype)) self.fit_quad.minimize() print "!!!!!!!!!!!!! FINDING ITS MINIMUM !!!!!!!!!!!!!!!!!!!!!!!!!!!" 
if self.use_solver: if self.psd: Av = self.A.get_value() mat_v = np.dot(Av.T, Av) else: mat_v = self.A.get_value() bv = self.b.get_value() # minimize for x^T A x + b^T x + c # -> solve 2 A x + b = 0 # Ax = - b / 2 print "********** mat_v", mat_v.min(), mat_v.max() x, ignored_residuals, ignored_rank, ignored_singular_values = np.linalg.lstsq(mat_v, - 0.5 * bv) print "********** soln: ", x.min(), x.mean(), x.max() print "********** SVs: ", ignored_singular_values.min(), ignored_singular_values.max() assert x.ndim == 1, x.shape prod = np.dot(basis, x) norm = np.sqrt(np.square(prod).sum()) print "*************** Moving params by ",norm vector = root + prod model.set_param_vector(vector) else: # use minimizer if not hasattr(self, 'fit_params'): self.vector = sharedX(points.get_value().mean(axis=0)) vector = self.vector obj = T.dot(T.dot(mat, vector), vector) + T.dot(b, vector) def constrain(d): assert vector in d n = d[vector] norm = T.sqrt(T.square(n).sum()) desired_norm = T.clip(norm, 0., self.max_jump_norm) d[vector] = n * desired_norm / norm self.fit_params = BatchGradientDescent(obj, params=[vector], max_iter = self.num_basis_vectors, verbose = 3, tol=None, param_constrainers = [constrain], init_alpha = None, min_init_alpha = 1e-3, reset_alpha=False, conjugate=True, reset_conjugate=False, line_search_mode='exhaustive') else: self.vector.set_value(points.mean(axis=0).astype(self.vector.dtype)) self.fit_params.minimize() model.set_param_vector(root + np.dot(basis , self.vector.get_value()))
class BGD(TrainingAlgorithm):
    """Batch Gradient Descent training algorithm class"""

    def __init__(self, cost, batch_size=None, batches_per_iter=None,
                 updates_per_batch = 10, monitoring_batches=None,
                 monitoring_dataset=None, termination_criterion = None,
                 set_batch_size = False, reset_alpha = True,
                 conjugate = False, min_init_alpha = .001,
                 reset_conjugate = True, line_search_mode = None,
                 verbose_optimization=False, scale_step=1.,
                 theano_function_mode=None, init_alpha=None, seed=None):
        """
        Parameters
        ----------
        cost : a pylearn2 Cost
        batch_size : int or None
            Like the SGD TrainingAlgorithm, this TrainingAlgorithm
            still iterates over minibatches of data. The difference
            is that this class uses partial line searches to choose
            the step size along each gradient direction, and can do
            repeated updates on the same batch. The assumption is
            that you use big enough minibatches with this algorithm
            that a large step size will generalize reasonably well
            to other minibatches.
            To implement true Batch Gradient Descent, set the
            batch_size to the total number of examples available.
            If batch_size is None, it will revert to the model's
            force_batch_size attribute.
        batches_per_iter : int or None
            Passed as num_batches to the training dataset iterator.
        updates_per_batch : int
            Passed through to the optimization.BatchGradientDescent's
            max_iter parameter
        monitoring_batches : int or None
            Passed as num_batches when monitoring datasets are registered.
            Must be None when monitoring_dataset is None.
        monitoring_dataset : Dataset, dict or None
            A Dataset or a dictionary mapping string dataset names to
            Datasets
        termination_criterion : callable or None
            Called with the model by continue_learning. If None, learning
            always continues.
        set_batch_size : bool
            If True, BGD will attempt to override the model's
            force_batch_size attribute by calling set_batch_size on it.
        reset_alpha, conjugate, reset_conjugate, min_init_alpha,
        line_search_mode, init_alpha, theano_function_mode :
            Passed through to the optimization.BatchGradientDescent
            parameters of the same names
        verbose_optimization : bool
            Passed through as BatchGradientDescent's verbose parameter.
        scale_step : float
            If not 1, after each step every parameter is set to
            (1 - scale_step) * old_value + scale_step * new_value.
        seed : valid argument to np.random.RandomState, or None
            Seed of the rng handed to the dataset iterator. Defaults to
            [2012, 10, 16].
        """
        # Must run before any other local is created: it stores every
        # constructor argument as an attribute of the same name.
        self.__dict__.update(locals())
        del self.self

        if monitoring_dataset is None:
            assert monitoring_batches is None

        self._set_monitoring_dataset(monitoring_dataset)

        self.bSetup = False
        self.termination_criterion = termination_criterion
        if seed is None:
            seed = [2012, 10, 16]
        self.rng = np.random.RandomState(seed)

    def setup(self, model, dataset):
        """
        Allows the training algorithm to do some preliminary configuration
        *before* we actually start training the model. The dataset is provided
        in case other derived training algorithms need to modify model based on
        the dataset.

        Parameters
        ----------
        model : a Python object representing the model to train loosely
            implementing the interface of models.model.Model.
        dataset : a pylearn2.datasets.dataset.Dataset object used to draw
            training data
        """
        self.model = model

        if self.batch_size is None:
            self.batch_size = model.force_batch_size
        else:
            batch_size = self.batch_size
            if self.set_batch_size:
                model.set_batch_size(batch_size)
            elif hasattr(model, 'force_batch_size'):
                if not (model.force_batch_size <= 0 or
                        batch_size == model.force_batch_size):
                    raise ValueError("batch_size is %d but model.force_batch_size is %d" %
                                     (batch_size, model.force_batch_size))

        self.monitor = Monitor.get_monitor(model)
        self.monitor.set_theano_function_mode(self.theano_function_mode)

        X = self.model.get_input_space().make_theano_batch()
        X.name = 'BGD_X'
        # Anything that is not a 2-D design matrix is treated as topological.
        self.topo = X.ndim != 2
        Y = T.matrix()
        Y.name = 'BGD_Y'

        fixed_var_descr = self.cost.get_fixed_var_descr(model, X, Y)
        self.on_load_batch = fixed_var_descr.on_load_batch

        if not self.cost.supervised:
            Y = None

        if self.cost.supervised:
            obj = self.cost(model, X, Y, ** fixed_var_descr.fixed_vars)
            grads, grad_updates = self.cost.get_gradients(
                model, X, Y, ** fixed_var_descr.fixed_vars)
            ipt = (X,Y)
        else:
            obj = self.cost(model, X, ** fixed_var_descr.fixed_vars)
            grads, grad_updates = self.cost.get_gradients(
                model, X, ** fixed_var_descr.fixed_vars)
            ipt = X
            Y = None

        assert isinstance(grads, OrderedDict)
        assert isinstance(grad_updates, OrderedDict)

        if obj is None:
            raise ValueError("BGD is incompatible with "+str(self.cost)+" because "
                    " it is intractable, but BGD uses the cost function value to do "
                    " line searches.")

        if self.monitoring_dataset is not None:
            # A generator is used so the loop variable cannot leak out and
            # rebind the `dataset` argument (Python 2 list comprehensions
            # leak their variable).
            if not any(monitored.has_targets()
                       for monitored in self.monitoring_dataset.values()):
                Y = None

            channels = model.get_monitoring_channels(X,Y)
            if not isinstance(channels, dict):
                raise TypeError("model.get_monitoring_channels must return a "
                                "dictionary, but it returned " + str(channels))
            channels.update(self.cost.get_monitoring_channels(
                model, X, Y, ** fixed_var_descr.fixed_vars))

            for dataset_name in self.monitoring_dataset:
                if dataset_name == '':
                    prefix = ''
                else:
                    prefix = dataset_name + '_'
                monitoring_dataset = self.monitoring_dataset[dataset_name]
                self.monitor.add_dataset(dataset=monitoring_dataset,
                                         mode="sequential",
                                         batch_size=self.batch_size,
                                         num_batches=self.monitoring_batches)

                # The monitor compiles all channels for the same dataset into one function, and
                # runs all prereqs before calling the function. So we only need to register the
                # on_load_batch prereq once per monitoring dataset.
                self.monitor.add_channel(prefix + 'objective',ipt=ipt,val=obj,
                        dataset = monitoring_dataset,
                        prereqs = fixed_var_descr.on_load_batch)

                for name in channels:
                    J = channels[name]
                    # A channel is either a value or a (value, prereqs) pair.
                    if isinstance(J, tuple):
                        assert len(J) == 2
                        J, prereqs = J
                    else:
                        prereqs = None

                    # NOTE: `ipt` is deliberately rebound here; the channels
                    # registered after this loop use the rebound value.
                    if Y is not None:
                        ipt = (X,Y)
                    else:
                        ipt = X

                    self.monitor.add_channel(name= prefix + name,
                                             ipt=ipt,
                                             val=J,
                                             dataset = monitoring_dataset,
                                             prereqs=prereqs)

        if self.cost.supervised:
            ipts = [X, Y]
        else:
            ipts = [X]

        params = model.get_params()

        self.optimizer = BatchGradientDescent(
                            objective = obj,
                            gradients = grads,
                            gradient_updates = grad_updates,
                            params = params,
                            param_constrainers = [ model.censor_updates ],
                            lr_scalers = model.get_lr_scalers(),
                            inputs = ipts,
                            verbose = self.verbose_optimization,
                            max_iter = self.updates_per_batch,
                            reset_alpha = self.reset_alpha,
                            conjugate = self.conjugate,
                            reset_conjugate = self.reset_conjugate,
                            min_init_alpha = self.min_init_alpha,
                            line_search_mode = self.line_search_mode,
                            theano_function_mode=self.theano_function_mode,
                            init_alpha=self.init_alpha)

        if self.monitoring_dataset is not None:
            # Optimizer statistics are reported on the first monitoring
            # dataset only.
            self.monitor.add_channel(name='ave_step_size',
                    ipt=ipt, val = self.optimizer.ave_step_size,
                    dataset=self.monitoring_dataset.values()[0])
            self.monitor.add_channel(name='ave_grad_size',
                    ipt=ipt, val = self.optimizer.ave_grad_size,
                    dataset=self.monitoring_dataset.values()[0])
            self.monitor.add_channel(name='ave_grad_mult',
                    ipt=ipt, val = self.optimizer.ave_grad_mult,
                    dataset=self.monitoring_dataset.values()[0])

        self.first = True
        self.bSetup = True

    def train(self, dataset):
        """
        Iterate once over `dataset` in minibatches, running the optimizer
        on every batch. `setup` must have been called first.

        Parameters
        ----------
        dataset : a Dataset providing an `iterator` method
        """
        assert self.bSetup
        model = self.model

        rng = self.rng
        train_iteration_mode = 'shuffled_sequential'
        # Only stochastic iteration modes are given an rng.
        if not is_stochastic(train_iteration_mode):
            rng = None
        iterator = dataset.iterator(mode=train_iteration_mode,
                                    batch_size=self.batch_size,
                                    targets=self.cost.supervised,
                                    num_batches=self.batches_per_iter,
                                    topo=self.topo,
                                    rng = rng)

        for data in iterator:
            if self.cost.supervised:
                args = data
                X, Y = data
                mode = self.theano_function_mode
                if mode is not None and hasattr(mode, 'record'):
                    # Write the targets to the mode's record, on one line.
                    stry = str(Y).replace('\n',' ')
                    mode.record.handle_line('data Y '+stry+'\n')
                for on_load_batch in self.on_load_batch:
                    on_load_batch(X, Y)
            else:
                args = [ data ]
                X = data
                for on_load_batch in self.on_load_batch:
                    on_load_batch(X, None)
            self.before_step(model)
            self.optimizer.minimize(*args)
            self.after_step(model)
            model.monitor.report_batch( X.shape[0] )

    def continue_learning(self, model):
        """
        Return True if training should continue: always when no termination
        criterion was given, otherwise whatever the criterion returns when
        called with the model passed to `setup`.
        """
        if self.termination_criterion is None:
            return True
        else:
            return self.termination_criterion(self.model)

    def before_step(self, model):
        """Save the current parameter values if steps are to be rescaled."""
        if self.scale_step != 1.:
            self.params = list(model.get_params())
            self.value = [ param.get_value() for param in self.params ]

    def after_step(self, model):
        """
        If scale_step is not 1, replace every parameter by an interpolation
        between the value saved by `before_step` and its value after the step.
        """
        if self.scale_step != 1:
            for param, value in safe_zip(self.params, self.value):
                value = (1.-self.scale_step) * value + self.scale_step * param.get_value()
                param.set_value(value)
class BGD(TrainingAlgorithm):
    """Batch Gradient Descent training algorithm class"""

    def __init__(self, cost=None, batch_size=None, batches_per_iter=None,
                 updates_per_batch = 10, monitoring_batches=None,
                 monitoring_dataset=None, termination_criterion = None,
                 set_batch_size = False, reset_alpha = True,
                 conjugate = False, min_init_alpha = .001,
                 reset_conjugate = True, line_search_mode = None,
                 verbose_optimization=False, scale_step=1.,
                 theano_function_mode=None, init_alpha=None, seed=None):
        """
        Parameters
        ----------
        cost : a pylearn2 Cost, or None
            If None, model.get_default_cost() will be used
        batch_size : int or None
            Like the SGD TrainingAlgorithm, this TrainingAlgorithm
            still iterates over minibatches of data. The difference
            is that this class uses partial line searches to choose
            the step size along each gradient direction, and can do
            repeated updates on the same batch. The assumption is
            that you use big enough minibatches with this algorithm
            that a large step size will generalize reasonably well
            to other minibatches.
            To implement true Batch Gradient Descent, set the
            batch_size to the total number of examples available.
            If batch_size is None, it will revert to the model's
            force_batch_size attribute.
        batches_per_iter : int or None
            Passed as num_batches to the training dataset iterator.
        updates_per_batch : int
            Passed through to the optimization.BatchGradientDescent's
            max_iter parameter
        monitoring_batches : int or None
            Passed as num_batches when monitoring datasets are registered.
            Must be None when monitoring_dataset is None.
        monitoring_dataset : Dataset, dict or None
            A Dataset or a dictionary mapping string dataset names to
            Datasets
        termination_criterion : object or None
            Its continue_learning method is called with the model. If
            None, learning always continues.
        set_batch_size : bool
            If True, BGD will attempt to override the model's
            force_batch_size attribute by calling set_batch_size on it.
        reset_alpha, conjugate, reset_conjugate, min_init_alpha,
        line_search_mode, init_alpha, theano_function_mode :
            Passed through to the optimization.BatchGradientDescent
            parameters of the same names
        verbose_optimization : bool
            Passed through as BatchGradientDescent's verbose parameter.
        scale_step : float
            If not 1, after each step every parameter is set to
            (1 - scale_step) * old_value + scale_step * new_value.
        seed : valid argument to np.random.RandomState, or None
            Seed of the rng handed to the dataset iterator. Defaults to
            [2012, 10, 16].
        """
        # Must run before any other local is created: it stores every
        # constructor argument as an attribute of the same name.
        self.__dict__.update(locals())
        del self.self

        if monitoring_dataset is None:
            assert monitoring_batches is None

        self._set_monitoring_dataset(monitoring_dataset)

        self.bSetup = False
        self.termination_criterion = termination_criterion
        if seed is None:
            seed = [2012, 10, 16]
        self.rng = np.random.RandomState(seed)

    def setup(self, model, dataset):
        """
        Allows the training algorithm to do some preliminary configuration
        *before* we actually start training the model. The dataset is provided
        in case other derived training algorithms need to modify model based on
        the dataset.

        Parameters
        ----------
        model : a Python object representing the model to train loosely
            implementing the interface of models.model.Model.
        dataset : a pylearn2.datasets.dataset.Dataset object used to draw
            training data
        """
        self.model = model

        if self.cost is None:
            self.cost = model.get_default_cost()

        if self.batch_size is None:
            self.batch_size = model.force_batch_size
        else:
            batch_size = self.batch_size
            if self.set_batch_size:
                model.set_batch_size(batch_size)
            elif hasattr(model, 'force_batch_size'):
                if not (model.force_batch_size <= 0 or
                        batch_size == model.force_batch_size):
                    raise ValueError("batch_size is %d but model.force_batch_size is %d" %
                                     (batch_size, model.force_batch_size))

        self.monitor = Monitor.get_monitor(model)
        self.monitor.set_theano_function_mode(self.theano_function_mode)

        X = self.model.get_input_space().make_theano_batch()
        X.name = 'BGD_X'
        # Anything that is not a 2-D design matrix is treated as topological.
        self.topo = X.ndim != 2
        if self.topo:
            assert self.model.get_input_space().axes == ('b', 0, 1, 'c')
        Y = T.matrix()
        Y.name = 'BGD_Y'

        if config.compute_test_value != 'off':
            # Attach test values so Theano can check shapes while the graph
            # is being built.
            X.tag.test_value = self.model.get_input_space().get_origin_batch(
                self.batch_size).astype(X.dtype)
            Y_batch = self.model.get_output_space().get_origin_batch(
                self.batch_size).astype(Y.dtype)
            assert Y_batch.ndim == 2
            # Put a single 1 in every row, cycling through the columns.
            for i in xrange(Y_batch.shape[0]):
                Y_batch[i, i % Y_batch.shape[1]] = 1
            Y.tag.test_value = Y_batch

        fixed_var_descr = self.cost.get_fixed_var_descr(model, X, Y)
        self.on_load_batch = fixed_var_descr.on_load_batch

        if not self.cost.supervised:
            Y = None

        if self.cost.supervised:
            obj = self.cost(model, X, Y, ** fixed_var_descr.fixed_vars)
            grads, grad_updates = self.cost.get_gradients(
                model, X, Y, ** fixed_var_descr.fixed_vars)
            ipt = (X,Y)
        else:
            obj = self.cost(model, X, ** fixed_var_descr.fixed_vars)
            grads, grad_updates = self.cost.get_gradients(
                model, X, ** fixed_var_descr.fixed_vars)
            ipt = X
            Y = None

        assert isinstance(grads, OrderedDict)
        assert isinstance(grad_updates, OrderedDict)

        if obj is None:
            raise ValueError("BGD is incompatible with "+str(self.cost)+" because "
                    " it is intractable, but BGD uses the cost function value to do "
                    " line searches.")

        # TODO: replace the following if block with a call to monitor.setup (it does the same thing;
        # this will reduce code duplication)
        # may need to still manually add some BGD-specific channels like ave_step_size here
        if self.monitoring_dataset is not None:
            # A generator is used so the loop variable cannot leak out and
            # rebind the `dataset` argument (Python 2 list comprehensions
            # leak their variable).
            if not any(monitored.has_targets()
                       for monitored in self.monitoring_dataset.values()):
                Y = None

            channels = model.get_monitoring_channels(X,Y)
            if not isinstance(channels, dict):
                raise TypeError("model.get_monitoring_channels must return a "
                                "dictionary, but it returned " + str(channels))
            channels.update(self.cost.get_monitoring_channels(
                model, X, Y, ** fixed_var_descr.fixed_vars))

            for dataset_name in self.monitoring_dataset:
                if dataset_name == '':
                    prefix = ''
                else:
                    prefix = dataset_name + '_'
                monitoring_dataset = self.monitoring_dataset[dataset_name]
                self.monitor.add_dataset(dataset=monitoring_dataset,
                                         mode="sequential",
                                         batch_size=self.batch_size,
                                         num_batches=self.monitoring_batches)

                # The monitor compiles all channels for the same dataset into one function, and
                # runs all prereqs before calling the function. So we only need to register the
                # on_load_batch prereq once per monitoring dataset.
                self.monitor.add_channel(prefix + 'objective',ipt=ipt,val=obj,
                        dataset = monitoring_dataset,
                        prereqs = fixed_var_descr.on_load_batch)

                for name in channels:
                    J = channels[name]
                    # A channel is either a value or a (value, prereqs) pair.
                    if isinstance(J, tuple):
                        assert len(J) == 2
                        J, prereqs = J
                    else:
                        prereqs = None

                    # NOTE: `ipt` is deliberately rebound here; the channels
                    # registered after this loop use the rebound value.
                    if Y is not None:
                        ipt = (X,Y)
                    else:
                        ipt = X

                    self.monitor.add_channel(name= prefix + name,
                                             ipt=ipt,
                                             val=J,
                                             dataset = monitoring_dataset,
                                             prereqs=prereqs)

        if self.cost.supervised:
            ipts = [X, Y]
        else:
            ipts = [X]

        params = model.get_params()

        self.optimizer = BatchGradientDescent(
                            objective = obj,
                            gradients = grads,
                            gradient_updates = grad_updates,
                            params = params,
                            param_constrainers = [ model.censor_updates ],
                            lr_scalers = model.get_lr_scalers(),
                            inputs = ipts,
                            verbose = self.verbose_optimization,
                            max_iter = self.updates_per_batch,
                            reset_alpha = self.reset_alpha,
                            conjugate = self.conjugate,
                            reset_conjugate = self.reset_conjugate,
                            min_init_alpha = self.min_init_alpha,
                            line_search_mode = self.line_search_mode,
                            theano_function_mode=self.theano_function_mode,
                            init_alpha=self.init_alpha)

        if self.monitoring_dataset is not None:
            # Optimizer statistics are reported on the first monitoring
            # dataset only.
            self.monitor.add_channel(name='ave_step_size',
                    ipt=ipt, val = self.optimizer.ave_step_size,
                    dataset=self.monitoring_dataset.values()[0])
            self.monitor.add_channel(name='ave_grad_size',
                    ipt=ipt, val = self.optimizer.ave_grad_size,
                    dataset=self.monitoring_dataset.values()[0])
            self.monitor.add_channel(name='ave_grad_mult',
                    ipt=ipt, val = self.optimizer.ave_grad_mult,
                    dataset=self.monitoring_dataset.values()[0])

        self.first = True
        self.bSetup = True

    def train(self, dataset):
        """
        Iterate once over `dataset` in minibatches, running the optimizer
        on every batch. `setup` must have been called first.

        Parameters
        ----------
        dataset : a Dataset providing an `iterator` method
        """
        assert self.bSetup
        model = self.model

        rng = self.rng
        train_iteration_mode = 'shuffled_sequential'
        # Only stochastic iteration modes are given an rng.
        if not is_stochastic(train_iteration_mode):
            rng = None
        iterator = dataset.iterator(mode=train_iteration_mode,
                                    batch_size=self.batch_size,
                                    targets=self.cost.supervised,
                                    num_batches=self.batches_per_iter,
                                    topo=self.topo,
                                    rng = rng)

        for data in iterator:
            if self.cost.supervised:
                args = data
                X, Y = data
                mode = self.theano_function_mode
                if mode is not None and hasattr(mode, 'record'):
                    # Write the targets to the mode's record, on one line.
                    stry = str(Y).replace('\n',' ')
                    mode.record.handle_line('data Y '+stry+'\n')
                for on_load_batch in self.on_load_batch:
                    on_load_batch(X, Y)
            else:
                args = [ data ]
                X = data
                for on_load_batch in self.on_load_batch:
                    on_load_batch(X, None)
            self.before_step(model)
            self.optimizer.minimize(*args)
            self.after_step(model)
            model.monitor.report_batch( X.shape[0] )

    def continue_learning(self, model):
        """
        Return whether training should continue: always True when no
        termination criterion was given, otherwise the result of the
        criterion's continue_learning method for the model passed to `setup`.
        """
        if self.termination_criterion is None:
            return True
        else:
            rval = self.termination_criterion.continue_learning(self.model)
            # Guard against criteria that return a non-boolean value.
            assert rval in [True, False, 0, 1]
            return rval

    def before_step(self, model):
        """Save the current parameter values if steps are to be rescaled."""
        if self.scale_step != 1.:
            self.params = list(model.get_params())
            self.value = [ param.get_value() for param in self.params ]

    def after_step(self, model):
        """
        If scale_step is not 1, replace every parameter by an interpolation
        between the value saved by `before_step` and its value after the step.
        """
        if self.scale_step != 1:
            for param, value in safe_zip(self.params, self.value):
                value = (1.-self.scale_step) * value + self.scale_step * param.get_value()
                param.set_value(value)