def test_arangedataset():
    """
    Verify that ArangeDataset can be used with preprocessors.
    """
    preprocessor = RemoveMean()
    dataset = ArangeDataset(1000, preprocessor=preprocessor,
                            fit_preprocessor=True)
    dataset_no_preprocessing = ArangeDataset(1000)
    assert (dataset.get_data() != dataset_no_preprocessing.get_data()).any()
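# The tests below lean heavily on ArangeDataset. As a sketch of the behaviour
# they assume (not the library source): it acts like a DenseDesignMatrix whose
# single feature column is np.arange(num_examples), so example i is simply the
# number i. That makes it easy to assert exactly which examples an iterator
# visited, and in what order.
#
#     class ArangeDataset(DenseDesignMatrix):
#         def __init__(self, num_examples, *args, **kwargs):
#             X = np.zeros((num_examples, 1))
#             X[:, 0] = np.arange(num_examples)
#             super(ArangeDataset, self).__init__(X, *args, **kwargs)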
def test_adadelta():
    """
    Make sure that learning_rule.AdaDelta obtains the same parameter values
    as with a hand-crafted AdaDelta implementation, given a dummy model and
    learning rate scaler for each parameter.

    Reference: "AdaDelta: An Adaptive Learning Rate Method",
    Matthew D. Zeiler.
    """
    # We include a cost other than SumOfParams so that data is actually
    # queried from the training set, and the expected number of updates
    # are applied.
    cost = SumOfCosts([SumOfOneHalfParamsSquared(), (0., DummyCost())])

    model = DummyModel(shapes, lr_scalers=scales)
    dataset = ArangeDataset(1)
    decay = 0.95

    sgd = SGD(cost=cost,
              learning_rate=learning_rate,
              learning_rule=AdaDelta(decay),
              batch_size=1)

    sgd.setup(model=model, dataset=dataset)

    state = {}
    for param in model.get_params():
        param_shape = param.get_value().shape
        state[param] = {}
        state[param]['g2'] = np.zeros(param_shape)
        state[param]['dx2'] = np.zeros(param_shape)

    def adadelta_manual(model, state):
        rval = []
        for scale, param in izip(scales, model.get_params()):
            pstate = state[param]
            param_val = param.get_value()
            # With the active cost term 0.5 * sum(p ** 2), the gradient
            # with respect to each parameter is the parameter itself.
            # begin adadelta
            pstate['g2'] = decay * pstate['g2'] + (1 - decay) * param_val ** 2
            rms_g_t = np.sqrt(pstate['g2'] + scale * learning_rate)
            rms_dx_tm1 = np.sqrt(pstate['dx2'] + scale * learning_rate)
            dx_t = -rms_dx_tm1 / rms_g_t * param_val
            pstate['dx2'] = decay * pstate['dx2'] + (1 - decay) * dx_t ** 2
            rval += [param_val + dx_t]
        return rval

    manual = adadelta_manual(model, state)
    sgd.train(dataset=dataset)
    assert all(np.allclose(manual_param, sgd_param.get_value())
               for manual_param, sgd_param
               in izip(manual, model.get_params()))

    manual = adadelta_manual(model, state)
    sgd.train(dataset=dataset)
    assert all(np.allclose(manual_param, sgd_param.get_value())
               for manual_param, sgd_param
               in izip(manual, model.get_params()))
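# For reference, the update adadelta_manual mirrors is Zeiler's AdaDelta
# (Algorithm 1 in the paper), with rho = decay and, matching the learning
# rule under test, the per-parameter epsilon taken to be
# scale * learning_rate:
#
#     E[g^2]_t  = rho * E[g^2]_{t-1}  + (1 - rho) * g_t ** 2
#     dx_t      = - sqrt(E[dx^2]_{t-1} + eps) / sqrt(E[g^2]_t + eps) * g_t
#     E[dx^2]_t = rho * E[dx^2]_{t-1} + (1 - rho) * dx_t ** 2
#
# Here g_t == param_val because the gradient of 0.5 * sum(p ** 2) is p.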
def test_lr_scalers():
    """
    Tests that SGD respects Model.get_lr_scalers.
    """
    # We include a cost other than SumOfParams so that data is actually
    # queried from the training set, and the expected number of updates
    # are applied.
    cost = SumOfCosts([SumOfParams(), (0., DummyCost())])

    scales = [.01, .02, .05, 1., 5.]
    shapes = [(1,), (9,), (8, 7), (6, 5, 4), (3, 2, 2, 2)]

    learning_rate = .001

    class ModelWithScalers(Model):
        def __init__(self):
            super(ModelWithScalers, self).__init__()
            self._params = [sharedX(np.zeros(shape)) for shape in shapes]
            self.input_space = VectorSpace(1)

        def __call__(self, X):
            # Implemented only so that DummyCost would work
            return X

        def get_lr_scalers(self):
            return dict(zip(self._params, scales))

    model = ModelWithScalers()

    dataset = ArangeDataset(1)

    sgd = SGD(cost=cost,
              learning_rate=learning_rate,
              learning_rule=Momentum(.0),
              batch_size=1)

    sgd.setup(model=model, dataset=dataset)

    manual = [param.get_value() for param in model.get_params()]
    manual = [param - learning_rate * scale
              for param, scale in zip(manual, scales)]

    sgd.train(dataset=dataset)

    assert all(np.allclose(manual_param, sgd_param.get_value())
               for manual_param, sgd_param
               in zip(manual, model.get_params()))

    manual = [param - learning_rate * scale
              for param, scale in zip(manual, scales)]

    sgd.train(dataset=dataset)

    assert all(np.allclose(manual_param, sgd_param.get_value())
               for manual_param, sgd_param
               in zip(manual, model.get_params()))
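# Sanity check on the expected step in the test above: the gradient of
# SumOfParams (the sum of all parameter elements) with respect to each
# element is 1, and DummyCost is weighted by 0., so with zero momentum each
# element should move by exactly -learning_rate * scale per call to train(),
# which is what `manual` tracks.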
def test_lr_scalers_momentum():
    """
    Tests that SGD respects Model.get_lr_scalers when using momentum.
    """
    cost = SumOfParams()

    scales = [.01, .02, .05, 1., 5.]
    shapes = [(1,), (9,), (8, 7), (6, 5, 4), (3, 2, 2, 2)]

    learning_rate = .001

    class ModelWithScalers(Model):
        def __init__(self):
            super(ModelWithScalers, self).__init__()
            self._params = [sharedX(np.zeros(shape)) for shape in shapes]
            self.input_space = VectorSpace(1)

        def get_lr_scalers(self):
            return dict(zip(self._params, scales))

    model = ModelWithScalers()

    dataset = ArangeDataset(1)

    momentum = 0.5

    sgd = SGD(cost=cost,
              learning_rate=learning_rate,
              init_momentum=momentum,
              batch_size=1)

    sgd.setup(model=model, dataset=dataset)

    manual = [param.get_value() for param in model.get_params()]
    inc = [-learning_rate * scale for scale in scales]
    manual = [param + i for param, i in zip(manual, inc)]

    sgd.train(dataset=dataset)

    assert all(np.allclose(manual_param, sgd_param.get_value())
               for manual_param, sgd_param
               in zip(manual, model.get_params()))

    manual = [param - learning_rate * scale + i * momentum
              for param, scale, i in zip(manual, scales, inc)]

    sgd.train(dataset=dataset)

    assert all(np.allclose(manual_param, sgd_param.get_value())
               for manual_param, sgd_param
               in zip(manual, model.get_params()))
def test_adagrad():
    """
    Make sure that learning_rule.AdaGrad obtains the same parameter values
    as with a hand-crafted AdaGrad implementation, given a dummy model and
    learning rate scaler for each parameter.

    Reference: "Adaptive subgradient methods for online learning and
    stochastic optimization", Duchi J, Hazan E, Singer Y.
    """
    # We include a cost other than SumOfParams so that data is actually
    # queried from the training set, and the expected number of updates
    # are applied.
    cost = SumOfCosts([SumOfOneHalfParamsSquared(), (0., DummyCost())])

    model = DummyModel(shapes, lr_scalers=scales)
    dataset = ArangeDataset(1)

    sgd = SGD(cost=cost,
              learning_rate=learning_rate,
              learning_rule=AdaGrad(),
              batch_size=1)

    sgd.setup(model=model, dataset=dataset)

    state = {}
    for param in model.get_params():
        param_shape = param.get_value().shape
        state[param] = {}
        state[param]['sg2'] = np.zeros(param_shape)

    def adagrad_manual(model, state):
        rval = []
        for scale, param in izip(scales, model.get_params()):
            pstate = state[param]
            param_val = param.get_value()
            # With the active cost term 0.5 * sum(p ** 2), the gradient
            # with respect to each parameter is the parameter itself.
            # begin adagrad
            pstate['sg2'] += param_val ** 2
            dx_t = -(scale * learning_rate
                     / np.sqrt(pstate['sg2'])
                     * param_val)
            rval += [param_val + dx_t]
        return rval

    manual = adagrad_manual(model, state)
    sgd.train(dataset=dataset)
    assert all(np.allclose(manual_param, sgd_param.get_value())
               for manual_param, sgd_param
               in izip(manual, model.get_params()))

    manual = adagrad_manual(model, state)
    sgd.train(dataset=dataset)
    assert all(np.allclose(manual_param, sgd_param.get_value())
               for manual_param, sgd_param
               in izip(manual, model.get_params()))
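# For reference, adagrad_manual follows the AdaGrad rule of Duchi et al.,
# which divides each step by the root of the accumulated squared gradients:
#
#     sg2_t = sg2_{t-1} + g_t ** 2
#     dx_t  = - (scale * learning_rate / sqrt(sg2_t)) * g_t
#
# with g_t == param_val, again because the active cost is 0.5 * sum(p ** 2).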
def test_rmsprop():
    """
    Make sure that learning_rule.RMSProp obtains the same parameter values
    as with a hand-crafted RMSProp implementation, given a dummy model and
    learning rate scaler for each parameter.
    """
    # We include a cost other than SumOfParams so that data is actually
    # queried from the training set, and the expected number of updates
    # are applied.
    cost = SumOfCosts([SumOfOneHalfParamsSquared(), (0., DummyCost())])

    scales = [.01, .02, .05, 1., 5.]
    shapes = [(1,), (9,), (8, 7), (6, 5, 4), (3, 2, 2, 2)]
    model = DummyModel(shapes, lr_scalers=scales)

    dataset = ArangeDataset(1)

    learning_rate = .001
    decay = 0.90
    max_scaling = 1e5

    sgd = SGD(cost=cost,
              learning_rate=learning_rate,
              learning_rule=RMSProp(decay),
              batch_size=1)

    sgd.setup(model=model, dataset=dataset)

    state = {}
    for param in model.get_params():
        param_shape = param.get_value().shape
        state[param] = {}
        state[param]['g2'] = np.zeros(param_shape)

    def rmsprop_manual(model, state):
        rval = []
        epsilon = 1. / max_scaling
        for scale, param in izip(scales, model.get_params()):
            pstate = state[param]
            param_val = param.get_value()
            # With the active cost term 0.5 * sum(p ** 2), the gradient
            # with respect to each parameter is the parameter itself.
            # begin rmsprop
            pstate['g2'] = decay * pstate['g2'] + (1 - decay) * param_val ** 2
            rms_g_t = np.maximum(np.sqrt(pstate['g2']), epsilon)
            dx_t = -scale * learning_rate / rms_g_t * param_val
            rval += [param_val + dx_t]
        return rval

    manual = rmsprop_manual(model, state)
    sgd.train(dataset=dataset)

    assert all(np.allclose(manual_param, sgd_param.get_value())
               for manual_param, sgd_param
               in izip(manual, model.get_params()))
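# For reference, rmsprop_manual implements RMSProp's exponential moving
# average of squared gradients (AdaGrad's running sum replaced by a decayed
# mean), with the denominator clipped from below at 1 / max_scaling:
#
#     E[g^2]_t = decay * E[g^2]_{t-1} + (1 - decay) * g_t ** 2
#     dx_t     = - scale * learning_rate * g_t
#                / max(sqrt(E[g^2]_t), 1 / max_scaling)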
def test_nesterov_momentum():
    """
    Make sure that learning_rule.Momentum with nesterov_momentum=True
    obtains the same parameter values as with a hand-crafted SGD-with-
    Nesterov-momentum implementation, given a dummy model and learning rate
    scaler for each parameter.
    """
    # We include a cost other than SumOfParams so that data is actually
    # queried from the training set, and the expected number of updates
    # are applied.
    cost = SumOfCosts([SumOfParams(), (0., DummyCost())])

    model = DummyModel(shapes, lr_scalers=scales)
    dataset = ArangeDataset(1)
    momentum = 0.5

    sgd = SGD(cost=cost,
              learning_rate=learning_rate,
              learning_rule=Momentum(momentum, nesterov_momentum=True),
              batch_size=1)

    sgd.setup(model=model, dataset=dataset)

    manual = [param.get_value() for param in model.get_params()]
    vel = [-learning_rate * scale for scale in scales]
    updates = [-learning_rate * scale + v * momentum
               for scale, v in izip(scales, vel)]
    manual = [param + update for param, update in izip(manual, updates)]

    sgd.train(dataset=dataset)

    assert all(np.allclose(manual_param, sgd_param.get_value())
               for manual_param, sgd_param
               in izip(manual, model.get_params()))

    vel = [-learning_rate * scale + i * momentum
           for scale, i in izip(scales, vel)]
    updates = [-learning_rate * scale + v * momentum
               for scale, v in izip(scales, vel)]
    manual = [param + update for param, update in izip(manual, updates)]

    sgd.train(dataset=dataset)

    assert all(np.allclose(manual_param, sgd_param.get_value())
               for manual_param, sgd_param
               in izip(manual, model.get_params()))
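# The hand-crafted updates above use a common reformulation of Nesterov
# momentum in terms of the current velocity (cf. Sutskever et al., "On the
# importance of initialization and momentum in deep learning"):
#
#     v_t = momentum * v_{t-1} - learning_rate * scale * g
#     p_t = p_{t-1} + momentum * v_t - learning_rate * scale * g
#
# i.e. the parameter moves by the momentum-scaled *updated* velocity plus
# one more gradient step, rather than by v_t alone as in classical momentum.
# Here g == 1 since the active cost is SumOfParams.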
def test_sgd_sequential():
    # Tests that requesting train_iteration_mode = 'sequential' works.

    dim = 1
    batch_size = 3
    m = 5 * batch_size

    dataset = ArangeDataset(m)

    model = SoftmaxModel(dim)

    learning_rate = 1e-3
    # Note: batch_size is rebound here, so the m = 15 examples are visited
    # in three sequential batches of five per epoch.
    batch_size = 5

    visited = [False] * m

    def visit(X):
        assert X.shape[1] == 1
        assert np.all(X[1:] == X[0:-1] + 1)
        start = int(X[0, 0])
        if start > 0:
            assert visited[start - 1]
        for i in xrange(batch_size):
            assert not visited[start + i]
            visited[start + i] = True

    data_specs = (model.get_input_space(), model.get_input_source())
    cost = CallbackCost(visit, data_specs)

    # We need to include this so the test actually stops running at some
    # point.
    termination_criterion = EpochCounter(5)

    algorithm = SGD(learning_rate,
                    cost,
                    batch_size=5,
                    train_iteration_mode='sequential',
                    monitoring_dataset=None,
                    termination_criterion=termination_criterion,
                    update_callbacks=None,
                    init_momentum=None,
                    set_batch_size=False)

    algorithm.setup(dataset=dataset, model=model)

    algorithm.train(dataset)

    assert all(visited)
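# For context: CallbackCost comes from pylearn2's testing utilities. The
# behaviour assumed here is that its cost expression is trivial but it
# invokes the supplied callback on every batch of data it is fed, which is
# how visit() above gets to observe (and check the ordering of) each batch.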
def test_momentum():
    """
    Make sure that learning_rule.Momentum obtains the same parameter values
    as with a hand-crafted SGD-with-momentum implementation, given a dummy
    model and learning rate scaler for each parameter.
    """
    # We include a cost other than SumOfParams so that data is actually
    # queried from the training set, and the expected number of updates
    # are applied.
    cost = SumOfCosts([SumOfParams(), (0., DummyCost())])

    scales = [.01, .02, .05, 1., 5.]
    shapes = [(1,), (9,), (8, 7), (6, 5, 4), (3, 2, 2, 2)]
    model = DummyModel(shapes, lr_scalers=scales)

    dataset = ArangeDataset(1)

    learning_rate = .001
    momentum = 0.5

    sgd = SGD(cost=cost,
              learning_rate=learning_rate,
              learning_rule=Momentum(momentum),
              batch_size=1)

    sgd.setup(model=model, dataset=dataset)

    manual = [param.get_value() for param in model.get_params()]
    inc = [-learning_rate * scale for scale in scales]
    manual = [param + i for param, i in zip(manual, inc)]

    sgd.train(dataset=dataset)

    assert all(np.allclose(manual_param, sgd_param.get_value())
               for manual_param, sgd_param
               in zip(manual, model.get_params()))

    manual = [param - learning_rate * scale + i * momentum
              for param, scale, i in zip(manual, scales, inc)]

    sgd.train(dataset=dataset)

    assert all(np.allclose(manual_param, sgd_param.get_value())
               for manual_param, sgd_param
               in zip(manual, model.get_params()))
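# For reference, `inc` above plays the role of the velocity in classical
# momentum, which these updates track in closed form (g == 1 since the
# active cost is SumOfParams):
#
#     v_t = momentum * v_{t-1} - learning_rate * scale * g
#     p_t = p_{t-1} + v_t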
def test_lr_scalers_momentum():
    """
    Tests that SGD respects Model.get_lr_scalers when using momentum.
    """
    # We include a cost other than SumOfParams so that data is actually
    # queried from the training set, and the expected number of updates
    # are applied.
    cost = SumOfCosts([SumOfParams(), (0., DummyCost())])

    scales = [.01, .02, .05, 1., 5.]
    shapes = [(1,), (9,), (8, 7), (6, 5, 4), (3, 2, 2, 2)]
    model = DummyModel(shapes, lr_scalers=scales)

    dataset = ArangeDataset(1)

    learning_rate = .001
    momentum = 0.5

    sgd = SGD(cost=cost,
              learning_rate=learning_rate,
              init_momentum=momentum,
              batch_size=1)

    sgd.setup(model=model, dataset=dataset)

    manual = [param.get_value() for param in model.get_params()]
    inc = [-learning_rate * scale for scale in scales]
    manual = [param + i for param, i in zip(manual, inc)]

    sgd.train(dataset=dataset)

    assert all(np.allclose(manual_param, sgd_param.get_value())
               for manual_param, sgd_param
               in zip(manual, model.get_params()))

    manual = [param - learning_rate * scale + i * momentum
              for param, scale, i in zip(manual, scales, inc)]

    sgd.train(dataset=dataset)

    assert all(np.allclose(manual_param, sgd_param.get_value())
               for manual_param, sgd_param
               in zip(manual, model.get_params()))
def prepare_adagrad_test(dataset_type='arange', model_type='random'):
    """
    Factor out common code for AdaGrad tests.

    Parameters
    ----------
    dataset_type : string, optional
        Can use either `arange` to use an ArangeDataset instance or
        `zeros` to create an all-zeros DenseDesignMatrix.
    model_type : string, optional
        How to initialize the model; `random` will initialize parameters
        to random values, `zeros` to zero.
    """
    # We include a cost other than SumOfParams so that data is actually
    # queried from the training set, and the expected number of updates
    # are applied.
    cost = SumOfCosts([SumOfOneHalfParamsSquared(), (0., DummyCost())])

    model = DummyModel(shapes, lr_scalers=scales, init_type=model_type)

    if dataset_type == 'arange':
        dataset = ArangeDataset(1)
    elif dataset_type == 'zeros':
        X = np.zeros((1, 1))
        X[:, 0] = np.arange(1)
        dataset = DenseDesignMatrix(X)
    else:
        raise ValueError('Unknown value for dataset_type: %s' % dataset_type)

    sgd = SGD(cost=cost,
              learning_rate=learning_rate,
              learning_rule=AdaGrad(),
              batch_size=1)

    sgd.setup(model=model, dataset=dataset)

    state = {}
    for param in model.get_params():
        param_shape = param.get_value().shape
        state[param] = {}
        state[param]['sg2'] = np.zeros(param_shape)

    return (cost, model, dataset, sgd, state)
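# A hypothetical caller, to show the intended shape of the helper's return
# value (this test does not exist in this file; the name is illustrative):
#
#     def test_adagrad_zeros():
#         cost, model, dataset, sgd, state = prepare_adagrad_test(
#             dataset_type='zeros', model_type='zeros')
#         sgd.train(dataset=dataset)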
def test_determinism():
    # Verifies that running SGD twice results in the same examples getting
    # visited in the same order.

    for mode in _iteration_schemes:
        dim = 1
        batch_size = 3
        num_batches = 5
        m = num_batches * batch_size

        dataset = ArangeDataset(m)

        model = SoftmaxModel(dim)

        learning_rate = 1e-3
        batch_size = 5

        visited = [[-1] * m]

        def visit(X):
            mx = max(visited[0])
            counter = mx + 1
            for i in X[:, 0]:
                i = int(i)
                assert visited[0][i] == -1
                visited[0][i] = counter
                counter += 1

        data_specs = (model.get_input_space(), model.get_input_source())
        cost = CallbackCost(visit, data_specs)

        # We need to include this so the test actually stops running at
        # some point.
        termination_criterion = EpochCounter(5)

        def run_algorithm():
            unsupported_modes = ['random_slice', 'random_uniform']
            algorithm = SGD(learning_rate,
                            cost,
                            batch_size=5,
                            train_iteration_mode=mode,
                            monitoring_dataset=None,
                            termination_criterion=termination_criterion,
                            update_callbacks=None,
                            init_momentum=None,
                            set_batch_size=False)

            algorithm.setup(dataset=dataset, model=model)

            raised = False
            try:
                algorithm.train(dataset)
            except ValueError:
                print mode
                assert mode in unsupported_modes
                raised = True

            if mode in unsupported_modes:
                assert raised
                return True
            return False

        if run_algorithm():
            continue

        visited.insert(0, [-1] * m)
        del model.monitor

        run_algorithm()

        for v in visited:
            assert len(v) == m
            for elem in range(m):
                assert elem in v

        assert len(visited) == 2
        print visited[0]
        print visited[1]

        assert np.all(np.asarray(visited[0]) == np.asarray(visited[1]))
def test_revisit():
    # Test that each call to monitor revisits exactly the same data.

    BATCH_SIZE = 3
    MAX_BATCH_SIZE = 12
    BATCH_SIZE_STRIDE = 3
    NUM_BATCHES = 10
    num_examples = NUM_BATCHES * BATCH_SIZE

    monitoring_dataset = ArangeDataset(num_examples)

    for mon_batch_size in xrange(BATCH_SIZE, MAX_BATCH_SIZE + 1,
                                 BATCH_SIZE_STRIDE):
        for num_mon_batches in [1, 3, num_examples / mon_batch_size, None]:
            for mode in sorted(_iteration_schemes):

                if num_mon_batches is None and mode in ['random_uniform',
                                                        'random_slice']:
                    continue

                model = DummyModel(1)
                monitor = Monitor.get_monitor(model)

                try:
                    monitor.add_dataset(monitoring_dataset, mode,
                                        batch_size=mon_batch_size,
                                        num_batches=num_mon_batches)
                except TypeError:
                    monitor.add_dataset(monitoring_dataset, mode,
                                        batch_size=mon_batch_size,
                                        num_batches=num_mon_batches,
                                        seed=0)

                if num_mon_batches is None:
                    num_mon_batches = int(np.ceil(float(num_examples) /
                                                  float(mon_batch_size)))

                batches = [None] * num_mon_batches
                visited = [False] * num_mon_batches

                batch_idx = shared(0)

                class RecorderAndValidator(object):

                    def __init__(self):
                        self.validate = False

                    def __call__(self, *data):
                        """ Initially, records the batches the monitor
                            shows it. When set to validate mode, makes sure
                            the batches shown on the second monitor call
                            match those from the first."""
                        X, = data

                        idx = batch_idx.get_value()
                        batch_idx.set_value(idx + 1)

                        # Note: if the monitor starts supporting variable
                        # batch sizes, take this out. Maybe move it to a
                        # new test that the iterator's uneven property is
                        # set accurately.
                        warnings.warn("TODO: add unit test that iterators "
                                      "uneven property is set correctly.")
                        # assert X.shape[0] == mon_batch_size

                        if self.validate:
                            previous_batch = batches[idx]
                            assert not visited[idx]
                            visited[idx] = True
                            if not np.allclose(previous_batch, X):
                                print 'Visited different data in batch', idx
                                print previous_batch
                                print X
                                print 'Iteration mode', mode
                                assert False
                        else:
                            batches[idx] = X
                        # end if
                    # end __call__
                # end class

                prereq = RecorderAndValidator()

                monitor.add_channel(name='dummy',
                                    ipt=model.input_space.make_theano_batch(),
                                    val=0.,
                                    prereqs=[prereq],
                                    data_specs=(model.get_input_space(),
                                                model.get_input_source()))

                try:
                    monitor()
                except RuntimeError:
                    print 'monitor raised RuntimeError for iteration mode', \
                        mode
                    raise

                assert None not in batches

                batch_idx.set_value(0)
                prereq.validate = True

                monitor()

                assert all(visited)
def test_revisit():
    # Test that each call to monitor revisits exactly the same data.

    BATCH_SIZE = 3
    MAX_BATCH_SIZE = 12
    BATCH_SIZE_STRIDE = 3
    NUM_BATCHES = 10
    num_examples = NUM_BATCHES * BATCH_SIZE

    monitoring_dataset = ArangeDataset(num_examples)

    for mon_batch_size in xrange(BATCH_SIZE, MAX_BATCH_SIZE + 1,
                                 BATCH_SIZE_STRIDE):
        for num_mon_batches in [1, 3, num_examples / mon_batch_size, None]:
            for mode in sorted(_iteration_schemes):

                if num_mon_batches is None and mode in ['random_uniform',
                                                        'random_slice']:
                    continue

                model = DummyModel(1)
                monitor = Monitor.get_monitor(model)

                try:
                    try:
                        monitor.add_dataset(monitoring_dataset, mode,
                                            batch_size=mon_batch_size,
                                            num_batches=num_mon_batches)
                    except TypeError:
                        monitor.add_dataset(monitoring_dataset, mode,
                                            batch_size=mon_batch_size,
                                            num_batches=num_mon_batches,
                                            seed=0)
                except NotImplementedError:
                    # Monitor does not currently support uneven iterators,
                    # so skip uneven iteration modes.
                    # Check that this is what caused the error.
                    if num_mon_batches is not None and \
                            mon_batch_size * num_mon_batches > num_examples:
                        continue
                    if num_mon_batches is None and \
                            num_examples % mon_batch_size != 0:
                        continue
                    print num_mon_batches, mon_batch_size, \
                        num_examples, mode
                    raise

                if num_mon_batches is None:
                    num_mon_batches = num_examples / mon_batch_size

                batches = [None] * num_mon_batches
                visited = [False] * num_mon_batches

                batch_idx = shared(0)

                class RecorderAndValidator(object):

                    def __init__(self):
                        self.validate = False

                    def __call__(self, X, y):
                        """ Initially, records the batches the monitor
                            shows it. When set to validate mode, makes sure
                            the batches shown on the second monitor call
                            match those from the first."""
                        assert y is None

                        idx = batch_idx.get_value()
                        batch_idx.set_value(idx + 1)

                        # Note: if the monitor starts supporting variable
                        # batch sizes, take this out. Maybe move it to a
                        # new test that the iterator's uneven property is
                        # set accurately.
                        assert X.shape[0] == mon_batch_size

                        if self.validate:
                            previous_batch = batches[idx]
                            assert not visited[idx]
                            visited[idx] = True
                            if not np.allclose(previous_batch, X):
                                print 'Visited different data in batch', idx
                                print previous_batch
                                print X
                                print 'Iteration mode', mode
                                assert False
                        else:
                            batches[idx] = X
                        # end if
                    # end __call__
                # end class

                prereq = RecorderAndValidator()

                monitor.add_channel(name='dummy',
                                    ipt=model.input_space.make_theano_batch(),
                                    val=0.,
                                    prereqs=[prereq])

                monitor()

                assert None not in batches

                batch_idx.set_value(0)
                prereq.validate = True

                monitor()

                assert all(visited)