def test_lr_scalers():
    """
    Tests that SGD respects Model.get_lr_scalers
    """
    # We include a cost other than SumOfParams so that data is actually
    # queried from the training set, and the expected number of updates
    # are applied.
    cost = SumOfCosts([SumOfParams(), (0., DummyCost())])

    scales = [.01, .02, .05, 1., 5.]
    shapes = [(1,), (9,), (8, 7), (6, 5, 4), (3, 2, 2, 2)]

    learning_rate = .001

    class ModelWithScalers(Model):
        def __init__(self):
            super(ModelWithScalers, self).__init__()
            self._params = [sharedX(np.zeros(shape)) for shape in shapes]
            self.input_space = VectorSpace(1)

        def __call__(self, X):
            # Implemented only so that DummyCost would work
            return X

        def get_lr_scalers(self):
            return dict(zip(self._params, scales))

    model = ModelWithScalers()

    dataset = ArangeDataset(1)

    sgd = SGD(cost=cost,
              learning_rate=learning_rate,
              learning_rule=Momentum(.0),
              batch_size=1)

    sgd.setup(model=model, dataset=dataset)

    manual = [param.get_value() for param in model.get_params()]
    manual = [param - learning_rate * scale
              for param, scale in zip(manual, scales)]

    sgd.train(dataset=dataset)

    assert all(np.allclose(manual_param, sgd_param.get_value())
               for manual_param, sgd_param
               in zip(manual, model.get_params()))

    manual = [param - learning_rate * scale
              for param, scale in zip(manual, scales)]

    sgd.train(dataset=dataset)

    assert all(np.allclose(manual_param, sgd_param.get_value())
               for manual_param, sgd_param
               in zip(manual, model.get_params()))
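# For reference: SumOfParams has gradient 1 w.r.t. every parameter element,
# so with a learning-rate scaler `scale` each SGD step above should move a
# parameter by exactly -learning_rate * scale, i.e. after k steps
#
#     param_k = param_0 - k * learning_rate * scale
#
# which is what the manually computed values verify (here for k = 1, 2).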
def test_adadelta():
    """
    Make sure that learning_rule.AdaDelta obtains the same parameter values
    as with a hand-crafted AdaDelta implementation, given a dummy model and
    learning rate scaler for each parameter.

    Reference: "AdaDelta: An Adaptive Learning Rate Method",
    Matthew D. Zeiler.
    """
    # We include a cost other than SumOfParams so that data is actually
    # queried from the training set, and the expected number of updates
    # are applied.
    cost = SumOfCosts([SumOfOneHalfParamsSquared(), (0., DummyCost())])
    model = DummyModel(shapes, lr_scalers=scales)
    dataset = ArangeDataset(1)
    decay = 0.95

    sgd = SGD(cost=cost,
              learning_rate=learning_rate,
              learning_rule=AdaDelta(decay),
              batch_size=1)

    sgd.setup(model=model, dataset=dataset)

    state = {}
    for param in model.get_params():
        param_shape = param.get_value().shape
        state[param] = {}
        state[param]['g2'] = np.zeros(param_shape)
        state[param]['dx2'] = np.zeros(param_shape)

    def adadelta_manual(model, state):
        inc = []
        rval = []
        for scale, param in izip(scales, model.get_params()):
            pstate = state[param]
            param_val = param.get_value()
            # begin adadelta
            pstate['g2'] = decay * pstate['g2'] + (1 - decay) * param_val ** 2
            rms_g_t = np.sqrt(pstate['g2'] + scale * learning_rate)
            rms_dx_tm1 = np.sqrt(pstate['dx2'] + scale * learning_rate)
            dx_t = -rms_dx_tm1 / rms_g_t * param_val
            pstate['dx2'] = decay * pstate['dx2'] + (1 - decay) * dx_t ** 2
            rval += [param_val + dx_t]
        return rval

    manual = adadelta_manual(model, state)
    sgd.train(dataset=dataset)
    assert all(np.allclose(manual_param, sgd_param.get_value())
               for manual_param, sgd_param
               in izip(manual, model.get_params()))

    manual = adadelta_manual(model, state)
    sgd.train(dataset=dataset)
    assert all(np.allclose(manual_param, sgd_param.get_value())
               for manual_param, sgd_param
               in izip(manual, model.get_params()))
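# For reference, adadelta_manual above mirrors the ADADELTA update from
# Zeiler's paper. A minimal standalone NumPy sketch of one step follows;
# `g` stands for the gradient (in the test it equals the parameter value,
# since the gradient of SumOfOneHalfParamsSquared w.r.t. each parameter is
# the parameter itself) and `eps` plays the role of scale * learning_rate
# in the test.
import numpy as np

def adadelta_step(param, g, g2, dx2, rho=0.95, eps=1e-6):
    g2 = rho * g2 + (1 - rho) * g ** 2             # E[g^2]_t
    dx = -np.sqrt(dx2 + eps) / np.sqrt(g2 + eps) * g
    dx2 = rho * dx2 + (1 - rho) * dx ** 2          # E[dx^2]_t
    return param + dx, g2, dx2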
def test_adagrad():
    """
    Make sure that learning_rule.AdaGrad obtains the same parameter values as
    with a hand-crafted AdaGrad implementation, given a dummy model and
    learning rate scaler for each parameter.

    Reference: "Adaptive subgradient methods for online learning and
    stochastic optimization", Duchi J, Hazan E, Singer Y.
    """
    # We include a cost other than SumOfParams so that data is actually
    # queried from the training set, and the expected number of updates
    # are applied.
    cost = SumOfCosts([SumOfOneHalfParamsSquared(), (0., DummyCost())])
    model = DummyModel(shapes, lr_scalers=scales)
    dataset = ArangeDataset(1)

    sgd = SGD(cost=cost,
              learning_rate=learning_rate,
              learning_rule=AdaGrad(),
              batch_size=1)

    sgd.setup(model=model, dataset=dataset)

    state = {}
    for param in model.get_params():
        param_shape = param.get_value().shape
        state[param] = {}
        state[param]['sg2'] = np.zeros(param_shape)

    def adagrad_manual(model, state):
        rval = []
        for scale, param in izip(scales, model.get_params()):
            pstate = state[param]
            param_val = param.get_value()
            # begin adagrad
            pstate['sg2'] += param_val ** 2
            dx_t = -(scale * learning_rate / np.sqrt(pstate['sg2'])
                     * param_val)
            rval += [param_val + dx_t]
        return rval

    manual = adagrad_manual(model, state)
    sgd.train(dataset=dataset)
    assert all(np.allclose(manual_param, sgd_param.get_value())
               for manual_param, sgd_param
               in izip(manual, model.get_params()))

    manual = adagrad_manual(model, state)
    sgd.train(dataset=dataset)
    assert all(np.allclose(manual_param, sgd_param.get_value())
               for manual_param, sgd_param
               in izip(manual, model.get_params()))
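# For reference, adagrad_manual above applies the standard AdaGrad step: the
# squared gradients are accumulated and the step is divided, element-wise, by
# the square root of that running sum. A minimal standalone NumPy sketch,
# assuming a gradient `g` and an effective learning rate `lr` (which plays
# the role of scale * learning_rate in the test):
import numpy as np

def adagrad_step(param, g, sg2, lr):
    sg2 = sg2 + g ** 2                 # running sum of squared gradients
    dx = -lr / np.sqrt(sg2) * g        # per-element scaled step
    return param + dx, sg2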
def get_costs(self, cost_array):
    costs = []
    for cost_id in cost_array:
        costs.extend(self.get_cost(cost_id))

    if len(costs) > 1:
        cost = SumOfCosts(costs)
    else:
        cost = costs[0]

    return cost
def test_rmsprop():
    """
    Make sure that learning_rule.RMSProp obtains the same parameter values as
    with a hand-crafted RMSProp implementation, given a dummy model and
    learning rate scaler for each parameter.
    """
    # We include a cost other than SumOfParams so that data is actually
    # queried from the training set, and the expected number of updates
    # are applied.
    cost = SumOfCosts([SumOfOneHalfParamsSquared(), (0., DummyCost())])

    scales = [.01, .02, .05, 1., 5.]
    shapes = [(1,), (9,), (8, 7), (6, 5, 4), (3, 2, 2, 2)]
    model = DummyModel(shapes, lr_scalers=scales)
    dataset = ArangeDataset(1)
    learning_rate = .001
    decay = 0.90
    max_scaling = 1e5

    sgd = SGD(cost=cost,
              learning_rate=learning_rate,
              learning_rule=RMSProp(decay),
              batch_size=1)

    sgd.setup(model=model, dataset=dataset)

    state = {}
    for param in model.get_params():
        param_shape = param.get_value().shape
        state[param] = {}
        state[param]['g2'] = np.zeros(param_shape)

    def rmsprop_manual(model, state):
        inc = []
        rval = []
        epsilon = 1. / max_scaling
        for scale, param in izip(scales, model.get_params()):
            pstate = state[param]
            param_val = param.get_value()
            # begin rmsprop
            pstate['g2'] = decay * pstate['g2'] + (1 - decay) * param_val ** 2
            rms_g_t = np.maximum(np.sqrt(pstate['g2']), epsilon)
            dx_t = -scale * learning_rate / rms_g_t * param_val
            rval += [param_val + dx_t]
        return rval

    manual = rmsprop_manual(model, state)
    sgd.train(dataset=dataset)
    assert all(np.allclose(manual_param, sgd_param.get_value())
               for manual_param, sgd_param
               in izip(manual, model.get_params()))
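# For reference, rmsprop_manual above applies the usual RMSProp step: keep an
# exponential moving average of squared gradients and divide the step by its
# root, clipped from below by 1 / max_scaling. A minimal standalone NumPy
# sketch, assuming a gradient `g` and `lr` standing for scale * learning_rate
# in the test:
import numpy as np

def rmsprop_step(param, g, g2, lr, rho=0.90, max_scaling=1e5):
    g2 = rho * g2 + (1 - rho) * g ** 2
    rms_g = np.maximum(np.sqrt(g2), 1. / max_scaling)
    return param - lr / rms_g * g, g2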
def model2():
    #pdb.set_trace()
    # train set X has dim (60,000, 784), y has dim (60,000, 10)
    train_set = MNIST(which_set='train', one_hot=True)
    # test set X has dim (10,000, 784), y has dim (10,000, 10)
    test_set = MNIST(which_set='test', one_hot=True)

    # =====<Create the MLP Model>=====

    h1_layer = RectifiedLinear(layer_name='h1', dim=1000, irange=0.5)
    #print h1_layer.get_params()
    h2_layer = RectifiedLinear(layer_name='h2', dim=1000, sparse_init=15,
                               max_col_norm=1)
    y_layer = Softmax(layer_name='y', n_classes=train_set.y.shape[1],
                      irange=0.5)

    mlp = MLP(batch_size=100,
              input_space=VectorSpace(dim=train_set.X.shape[1]),
              layers=[h1_layer, h2_layer, y_layer])

    # =====<Create the SGD algorithm>=====
    sgd = SGD(batch_size=100,
              init_momentum=0.1,
              learning_rate=0.01,
              monitoring_dataset={'valid': train_set, 'test': test_set},
              cost=SumOfCosts(costs=[
                  MethodCost('cost_from_X'),
                  WeightDecay(coeffs=[0.00005, 0.00005, 0.00005])
              ]),
              termination_criterion=MonitorBased(
                  channel_name='valid_y_misclass',
                  prop_decrease=0.0001,
                  N=5))
    #sgd.setup(model=mlp, dataset=train_set)

    # =====<Extensions>=====
    ext = [MomentumAdjustor(start=1, saturate=10, final_momentum=0.99)]

    # =====<Create Training Object>=====
    save_path = './mlp_model2.pkl'
    train_obj = Train(dataset=train_set,
                      model=mlp,
                      algorithm=sgd,
                      extensions=ext,
                      save_path=save_path,
                      save_freq=0)
    #train_obj.setup_extensions()

    train_obj.main_loop()
def test_nesterov_momentum():
    """
    Make sure that learning_rule.Momentum with nesterov_momentum=True obtains
    the same parameter values as with a hand-crafted Nesterov momentum
    implementation, given a dummy model and learning rate scaler for each
    parameter.
    """
    # We include a cost other than SumOfParams so that data is actually
    # queried from the training set, and the expected number of updates
    # are applied.
    cost = SumOfCosts([SumOfParams(), (0., DummyCost())])
    model = DummyModel(shapes, lr_scalers=scales)
    dataset = ArangeDataset(1)
    momentum = 0.5

    sgd = SGD(cost=cost,
              learning_rate=learning_rate,
              learning_rule=Momentum(momentum, nesterov_momentum=True),
              batch_size=1)

    sgd.setup(model=model, dataset=dataset)

    manual = [param.get_value() for param in model.get_params()]
    vel = [-learning_rate * scale for scale in scales]
    updates = [-learning_rate * scale + v * momentum
               for scale, v in izip(scales, vel)]
    manual = [param + update for param, update in izip(manual, updates)]

    sgd.train(dataset=dataset)

    assert all(np.allclose(manual_param, sgd_param.get_value())
               for manual_param, sgd_param
               in izip(manual, model.get_params()))

    vel = [-learning_rate * scale + i * momentum
           for scale, i in izip(scales, vel)]
    updates = [-learning_rate * scale + v * momentum
               for scale, v in izip(scales, vel)]
    manual = [param + update for param, update in izip(manual, updates)]

    sgd.train(dataset=dataset)

    assert all(np.allclose(manual_param, sgd_param.get_value())
               for manual_param, sgd_param
               in izip(manual, model.get_params()))
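# For reference, the manual computation above uses the common reformulation
# of Nesterov momentum: the velocity is updated as v_t = mu * v_{t-1} - lr * g
# and the parameter change applied at step t is mu * v_t - lr * g (rather
# than just v_t, as in classical momentum). A minimal sketch of one step,
# assuming a gradient `g`:
def nesterov_momentum_step(param, v, g, lr, mu):
    v = mu * v - lr * g                    # updated velocity
    return param + mu * v - lr * g, v      # look-ahead style update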
def test_momentum():
    """
    Make sure that learning_rule.Momentum obtains the same parameter values
    as with a hand-crafted sgd w/ momentum implementation, given a dummy
    model and learning rate scaler for each parameter.
    """
    # We include a cost other than SumOfParams so that data is actually
    # queried from the training set, and the expected number of updates
    # are applied.
    cost = SumOfCosts([SumOfParams(), (0., DummyCost())])

    scales = [.01, .02, .05, 1., 5.]
    shapes = [(1,), (9,), (8, 7), (6, 5, 4), (3, 2, 2, 2)]
    model = DummyModel(shapes, lr_scalers=scales)
    dataset = ArangeDataset(1)
    learning_rate = .001
    momentum = 0.5

    sgd = SGD(cost=cost,
              learning_rate=learning_rate,
              learning_rule=Momentum(momentum),
              batch_size=1)

    sgd.setup(model=model, dataset=dataset)

    manual = [param.get_value() for param in model.get_params()]
    inc = [-learning_rate * scale for param, scale in zip(manual, scales)]
    manual = [param + i for param, i in zip(manual, inc)]

    sgd.train(dataset=dataset)

    assert all(np.allclose(manual_param, sgd_param.get_value())
               for manual_param, sgd_param
               in zip(manual, model.get_params()))

    manual = [param - learning_rate * scale + i * momentum
              for param, scale, i in zip(manual, scales, inc)]

    sgd.train(dataset=dataset)

    assert all(np.allclose(manual_param, sgd_param.get_value())
               for manual_param, sgd_param
               in zip(manual, model.get_params()))
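# For reference, the expected values above follow classical momentum:
# v_t = mu * v_{t-1} - lr * g and param_t = param_{t-1} + v_t. Since the
# gradient of SumOfParams w.r.t. each parameter is 1, the first step is
# simply -learning_rate * scale. A minimal sketch of one step, assuming a
# gradient `g`:
def momentum_step(param, v, g, lr, mu):
    v = mu * v - lr * g
    return param + v, v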
def test_lr_scalers_momentum():
    """
    Tests that SGD respects Model.get_lr_scalers when using momentum.
    """
    # We include a cost other than SumOfParams so that data is actually
    # queried from the training set, and the expected number of updates
    # are applied.
    cost = SumOfCosts([SumOfParams(), (0., DummyCost())])

    scales = [.01, .02, .05, 1., 5.]
    shapes = [(1,), (9,), (8, 7), (6, 5, 4), (3, 2, 2, 2)]
    model = DummyModel(shapes, lr_scalers=scales)
    dataset = ArangeDataset(1)
    learning_rate = .001
    momentum = 0.5

    sgd = SGD(cost=cost,
              learning_rate=learning_rate,
              init_momentum=momentum,
              batch_size=1)

    sgd.setup(model=model, dataset=dataset)

    manual = [param.get_value() for param in model.get_params()]
    inc = [-learning_rate * scale for param, scale in zip(manual, scales)]
    manual = [param + i for param, i in zip(manual, inc)]

    sgd.train(dataset=dataset)

    assert all(np.allclose(manual_param, sgd_param.get_value())
               for manual_param, sgd_param
               in zip(manual, model.get_params()))

    manual = [param - learning_rate * scale + i * momentum
              for param, scale, i in zip(manual, scales, inc)]

    sgd.train(dataset=dataset)

    assert all(np.allclose(manual_param, sgd_param.get_value())
               for manual_param, sgd_param
               in zip(manual, model.get_params()))
def prepare_adagrad_test(dataset_type='arange', model_type='random'):
    """
    Factor out common code for AdaGrad tests.

    Parameters
    ----------
    dataset_type : string, optional
        Can use either `arange` to use an ArangeDataset instance or
        `zeros` to create an all-zeros DenseDesignMatrix.
    model_type : string, optional
        How to initialize the model; `random` will initialize parameters
        to random values, `zeros` to zero.
    """
    # We include a cost other than SumOfParams so that data is actually
    # queried from the training set, and the expected number of updates
    # are applied.
    cost = SumOfCosts([SumOfOneHalfParamsSquared(), (0., DummyCost())])
    model = DummyModel(shapes, lr_scalers=scales, init_type=model_type)

    if dataset_type == 'arange':
        dataset = ArangeDataset(1)
    elif dataset_type == 'zeros':
        X = np.zeros((1, 1))
        X[:, 0] = np.arange(1)
        dataset = DenseDesignMatrix(X)
    else:
        raise ValueError('Unknown value for dataset_type: %s' % dataset_type)

    sgd = SGD(cost=cost,
              learning_rate=learning_rate,
              learning_rule=AdaGrad(),
              batch_size=1)

    sgd.setup(model=model, dataset=dataset)

    state = {}
    for param in model.get_params():
        param_shape = param.get_value().shape
        state[param] = {}
        state[param]['sg2'] = np.zeros(param_shape)

    return (cost, model, dataset, sgd, state)
def test_correctness():
    """
    Test that the cost function works with float64
    """
    x_train, y_train, x_valid, y_valid = create_dataset()

    trainset = DenseDesignMatrix(X=np.array(x_train), y=y_train)
    validset = DenseDesignMatrix(X=np.array(x_valid), y=y_valid)

    n_inputs = trainset.X.shape[1]
    n_outputs = 1
    n_hidden = 10

    hidden_istdev = 4 * (6 / float(n_inputs + n_hidden)) ** 0.5
    output_istdev = 4 * (6 / float(n_hidden + n_outputs)) ** 0.5

    model = MLP(layers=[Sigmoid(dim=n_hidden, layer_name='hidden',
                                istdev=hidden_istdev),
                        Sigmoid(dim=n_outputs, layer_name='output',
                                istdev=output_istdev)],
                nvis=n_inputs, seed=[2013, 9, 16])

    termination_criterion = And([EpochCounter(max_epochs=1),
                                 MonitorBased(prop_decrease=1e-7, N=2)])

    cost = SumOfCosts([(0.99, Default()),
                       (0.01, L1WeightDecay({}))])

    algo = SGD(1e-1,
               update_callbacks=[ExponentialDecay(decay_factor=1.00001,
                                                  min_lr=1e-10)],
               cost=cost,
               monitoring_dataset=validset,
               termination_criterion=termination_criterion,
               monitor_iteration_mode='even_shuffled_sequential',
               batch_size=2)

    train = Train(model=model, dataset=trainset, algorithm=algo)
    train.main_loop()
def test_fixed_vars():
    """
    A very basic test of the fixed vars interface.
    Checks that the costs' expr and get_gradients methods are called with
    the right parameters and that the updates functions are called the
    right number of times.
    """

    rng = np.random.RandomState([2012, 11, 27, 9])

    batch_size = 5
    updates_per_batch = 4
    train_batches = 3
    num_features = 2

    # Synthesize dataset with a linear decision boundary
    w = rng.randn(num_features)

    def make_dataset(num_batches):
        m = num_batches * batch_size
        X = rng.randn(m, num_features)
        y = rng.randn(m, num_features)

        rval = DenseDesignMatrix(X=X, y=y)

        rval.yaml_src = ""  # suppress no yaml_src warning

        return rval

    train = make_dataset(train_batches)

    model = SoftmaxModel(num_features)

    unsup_counter = shared(0)
    grad_counter = shared(0)

    called = [False, False, False, False]

    class UnsupervisedCostWithFixedVars(Cost):

        def expr(self, model, data, unsup_aux_var=None, **kwargs):
            self.get_data_specs(model)[0].validate(data)
            X = data
            assert unsup_aux_var is unsup_counter
            called[0] = True
            return (model.P * X).sum()

        def get_gradients(self, model, data, unsup_aux_var=None, **kwargs):
            self.get_data_specs(model)[0].validate(data)
            assert unsup_aux_var is unsup_counter
            called[1] = True
            gradients, updates = Cost.get_gradients(
                self, model, data, unsup_aux_var=unsup_aux_var)
            updates[grad_counter] = grad_counter + 1
            return gradients, updates

        def get_fixed_var_descr(self, model, data, **kwargs):
            data_specs = self.get_data_specs(model)
            data_specs[0].validate(data)
            rval = FixedVarDescr()
            rval.fixed_vars = {'unsup_aux_var': unsup_counter}
            rval.data_specs = data_specs

            # The input to function should be a flat, non-redundant tuple
            mapping = DataSpecsMapping(data_specs)
            data_tuple = mapping.flatten(data, return_tuple=True)
            theano_func = function(data_tuple,
                                   updates=[(unsup_counter,
                                             unsup_counter + 1)])

            # the on_load_batch function will take numerical data formatted
            # as rval.data_specs, so we have to flatten it inside the
            # returned function too.
            # Using a default argument binds the variables used in the lambda
            # function to the value they have when the lambda is defined.
            on_load = (lambda batch, mapping=mapping, theano_func=theano_func:
                       theano_func(*mapping.flatten(batch,
                                                    return_tuple=True)))
            rval.on_load_batch = [on_load]

            return rval

        def get_data_specs(self, model):
            return (model.get_input_space(), model.get_input_source())

    sup_counter = shared(0)

    class SupervisedCostWithFixedVars(Cost):

        supervised = True

        def expr(self, model, data, sup_aux_var=None, **kwargs):
            self.get_data_specs(model)[0].validate(data)
            X, Y = data
            assert sup_aux_var is sup_counter
            called[2] = True
            return (model.P * X * Y).sum()

        def get_gradients(self, model, data, sup_aux_var=None, **kwargs):
            self.get_data_specs(model)[0].validate(data)
            assert sup_aux_var is sup_counter
            called[3] = True
            return super(SupervisedCostWithFixedVars,
                         self).get_gradients(model=model, data=data,
                                             sup_aux_var=sup_aux_var)

        def get_fixed_var_descr(self, model, data):
            data_specs = self.get_data_specs(model)
            data_specs[0].validate(data)
            rval = FixedVarDescr()
            rval.fixed_vars = {'sup_aux_var': sup_counter}
            rval.data_specs = data_specs

            # data has to be flattened into a tuple before being passed
            # to `function`.
            mapping = DataSpecsMapping(data_specs)
            flat_data = mapping.flatten(data, return_tuple=True)
            theano_func = function(flat_data,
                                   updates=[(sup_counter, sup_counter + 1)])

            # the on_load_batch function will take numerical data formatted
            # as rval.data_specs, so we have to flatten it inside the
            # returned function too.
            # Using a default argument binds the variables used in the lambda
            # function to the value they have when the lambda is defined.
            on_load = (lambda batch, mapping=mapping, theano_func=theano_func:
                       theano_func(*mapping.flatten(batch,
                                                    return_tuple=True)))
            rval.on_load_batch = [on_load]

            return rval

        def get_data_specs(self, model):
            space = CompositeSpace((model.get_input_space(),
                                    model.get_output_space()))
            source = (model.get_input_source(), model.get_target_source())
            return (space, source)

    cost = SumOfCosts(costs=[UnsupervisedCostWithFixedVars(),
                             SupervisedCostWithFixedVars()])

    algorithm = BGD(cost=cost, batch_size=batch_size,
                    conjugate=1, line_search_mode='exhaustive',
                    updates_per_batch=updates_per_batch)
    algorithm.setup(model=model, dataset=train)

    # Make sure all the right methods were used to compute the updates
    assert all(called)

    algorithm.train(dataset=train)

    # Make sure the load_batch callbacks were called the right amount of times
    assert unsup_counter.get_value() == train_batches
    assert sup_counter.get_value() == train_batches

    # Make sure the gradient updates were run the right amount of times
    assert grad_counter.get_value() == train_batches * updates_per_batch
def test_fixed_vars():
    """
    A very basic test of the fixed vars interface.
    Checks that the costs' expr and get_gradients methods are called with
    the right parameters and that the updates functions are called the
    right number of times.
    """

    """
    Notes: this test is fairly messy. PL made some change to how
    FixedVarDescr worked. FixedVarDescr got an added data_specs field.
    But BGD itself was never changed to obey this data_specs. Somehow these
    tests passed regardless. It looks like PL just built a lot of machinery
    into the test itself to make the individual callbacks reformat data
    internally. This mechanism required the data_specs field to be present.
    Weirdly, the theano functions never actually used any of the data, so
    their data_specs should have just been NullSpace anyway. IG deleted a
    lot of this useless code from these tests, but there is still a lot of
    weird stuff here that he has not attempted to clean up.
    """

    rng = np.random.RandomState([2012, 11, 27, 9])

    batch_size = 5
    updates_per_batch = 4
    train_batches = 3
    num_features = 2

    # Synthesize dataset with a linear decision boundary
    w = rng.randn(num_features)

    def make_dataset(num_batches):
        m = num_batches * batch_size
        X = rng.randn(m, num_features)
        y = rng.randn(m, num_features)

        rval = DenseDesignMatrix(X=X, y=y)

        rval.yaml_src = ""  # suppress no yaml_src warning

        return rval

    train = make_dataset(train_batches)

    model = SoftmaxModel(num_features)

    unsup_counter = shared(0)
    grad_counter = shared(0)

    called = [False, False, False, False]

    class UnsupervisedCostWithFixedVars(Cost):

        def expr(self, model, data, unsup_aux_var=None, **kwargs):
            self.get_data_specs(model)[0].validate(data)
            X = data
            assert unsup_aux_var is unsup_counter
            called[0] = True
            return (model.P * X).sum()

        def get_gradients(self, model, data, unsup_aux_var=None, **kwargs):
            self.get_data_specs(model)[0].validate(data)
            assert unsup_aux_var is unsup_counter
            called[1] = True
            gradients, updates = Cost.get_gradients(
                self, model, data, unsup_aux_var=unsup_aux_var)
            updates[grad_counter] = grad_counter + 1
            return gradients, updates

        def get_fixed_var_descr(self, model, data, **kwargs):
            data_specs = self.get_data_specs(model)
            data_specs[0].validate(data)
            rval = FixedVarDescr()
            rval.fixed_vars = {'unsup_aux_var': unsup_counter}

            # The input to function should be a flat, non-redundant tuple
            mapping = DataSpecsMapping(data_specs)
            data_tuple = mapping.flatten(data, return_tuple=True)
            theano_func = function([],
                                   updates=[(unsup_counter,
                                             unsup_counter + 1)])

            def on_load(batch, mapping=mapping, theano_func=theano_func):
                return theano_func()

            rval.on_load_batch = [on_load]

            return rval

        def get_data_specs(self, model):
            return (model.get_input_space(), model.get_input_source())

    sup_counter = shared(0)

    class SupervisedCostWithFixedVars(Cost):

        supervised = True

        def expr(self, model, data, sup_aux_var=None, **kwargs):
            self.get_data_specs(model)[0].validate(data)
            X, Y = data
            assert sup_aux_var is sup_counter
            called[2] = True
            return (model.P * X * Y).sum()

        def get_gradients(self, model, data, sup_aux_var=None, **kwargs):
            self.get_data_specs(model)[0].validate(data)
            assert sup_aux_var is sup_counter
            called[3] = True
            return super(SupervisedCostWithFixedVars,
                         self).get_gradients(model=model, data=data,
                                             sup_aux_var=sup_aux_var)

        def get_fixed_var_descr(self, model, data):
            data_specs = self.get_data_specs(model)
            data_specs[0].validate(data)
            rval = FixedVarDescr()
            rval.fixed_vars = {'sup_aux_var': sup_counter}

            theano_func = function([],
                                   updates=[(sup_counter, sup_counter + 1)])

            def on_load(data):
                theano_func()

            rval.on_load_batch = [on_load]

            return rval

        def get_data_specs(self, model):
            space = CompositeSpace((model.get_input_space(),
                                    model.get_output_space()))
            source = (model.get_input_source(), model.get_target_source())
            return (space, source)

    cost = SumOfCosts(costs=[UnsupervisedCostWithFixedVars(),
                             SupervisedCostWithFixedVars()])

    algorithm = BGD(cost=cost, batch_size=batch_size,
                    conjugate=1, line_search_mode='exhaustive',
                    updates_per_batch=updates_per_batch)
    algorithm.setup(model=model, dataset=train)

    # Make sure all the right methods were used to compute the updates
    assert all(called)

    algorithm.train(dataset=train)

    # Make sure the load_batch callbacks were called the right amount of times
    assert unsup_counter.get_value() == train_batches
    assert sup_counter.get_value() == train_batches

    # Make sure the gradient updates were run the right amount of times
    assert grad_counter.get_value() == train_batches * updates_per_batch
def test_fixed_vars():
    """
    A very basic test of the fixed vars interface.
    Checks that the costs' __call__ and get_gradients methods are called
    with the right parameters and that the updates functions are called
    the right number of times.
    """

    rng = np.random.RandomState([2012, 11, 27, 9])

    batch_size = 5
    updates_per_batch = 4
    train_batches = 3
    num_features = 2

    # Synthesize dataset with a linear decision boundary
    w = rng.randn(num_features)

    def make_dataset(num_batches):
        m = num_batches * batch_size
        X = rng.randn(m, num_features)
        y = rng.randn(m, num_features)

        rval = DenseDesignMatrix(X=X, y=y)

        rval.yaml_src = ""  # suppress no yaml_src warning

        return rval

    train = make_dataset(train_batches)

    model = SoftmaxModel(num_features)

    unsup_counter = shared(0)
    grad_counter = shared(0)

    called = [False, False, False, False]

    class UnsupervisedCostWithFixedVars(Cost):

        def __call__(self, model, X, Y=None, unsup_aux_var=None, **kwargs):
            assert unsup_aux_var is unsup_counter
            called[0] = True
            return (model.P * X).sum()

        def get_gradients(self, model, X, Y=None, unsup_aux_var=None,
                          **kwargs):
            assert unsup_aux_var is unsup_counter
            called[1] = True
            gradients, updates = Cost.get_gradients(
                self, model, X, Y, unsup_aux_var=unsup_aux_var)
            updates[grad_counter] = grad_counter + 1
            return gradients, updates

        def get_fixed_var_descr(self, model, X, Y, **kwargs):
            rval = FixedVarDescr()
            rval.fixed_vars = {'unsup_aux_var': unsup_counter}
            Y = T.matrix()
            theano_func = function([X, Y],
                                   updates=[(unsup_counter,
                                             unsup_counter + 1)])
            rval.on_load_batch = [theano_func]
            return rval

    sup_counter = shared(0)

    class SupervisedCostWithFixedVars(Cost):

        supervised = True

        def __call__(self, model, X, Y=None, sup_aux_var=None, **kwargs):
            assert sup_aux_var is sup_counter
            called[2] = True
            return (model.P * X * Y).sum()

        def get_gradients(self, model, X, Y=None, sup_aux_var=None,
                          **kwargs):
            assert sup_aux_var is sup_counter
            called[3] = True
            return super(SupervisedCostWithFixedVars,
                         self).get_gradients(model=model, X=X, Y=Y,
                                             sup_aux_var=sup_aux_var)

        def get_fixed_var_descr(self, model, X, Y=None):
            rval = FixedVarDescr()
            rval.fixed_vars = {'sup_aux_var': sup_counter}
            rval.on_load_batch = [function([X, Y],
                                           updates=[(sup_counter,
                                                     sup_counter + 1)])]
            return rval

    cost = SumOfCosts(costs=[UnsupervisedCostWithFixedVars(),
                             SupervisedCostWithFixedVars()])

    algorithm = BGD(cost=cost, batch_size=batch_size,
                    conjugate=1, line_search_mode='exhaustive',
                    updates_per_batch=updates_per_batch)
    algorithm.setup(model=model, dataset=train)

    # Make sure all the right methods were used to compute the updates
    assert all(called)

    algorithm.train(dataset=train)

    # Make sure the load_batch callbacks were called the right amount of times
    assert unsup_counter.get_value() == train_batches
    assert sup_counter.get_value() == train_batches

    # Make sure the gradient updates were run the right amount of times
    assert grad_counter.get_value() == train_batches * updates_per_batch
def supervisedLayerwisePRL(trainset, testset):
    '''
    The supervised layerwise training as used in the PRL Paper.

    Input
    ------
    trainset : A path to an hdf5 file created through h5py.
    testset : A path to an hdf5 file created through h5py.
    '''
    batch_size = 100

    # Both train and test h5py files are expected to have a 'topo_view' and
    # a 'y' dataset inside them, corresponding to the 'b01c' data format as
    # used in pylearn2, with 'y' equivalent to the one-hot encoded labels.
    trn = HDF5Dataset(filename=trainset, topo_view='topo_view', y='y',
                      load_all=False)
    tst = HDF5Dataset(filename=testset, topo_view='topo_view', y='y',
                      load_all=False)

    '''
    The 1st Convolution and Pooling Layers are added below.
    '''
    h1 = mlp.ConvRectifiedLinear(layer_name='h1',
                                 output_channels=64,
                                 irange=0.05,
                                 kernel_shape=[4, 4],
                                 pool_shape=[4, 4],
                                 pool_stride=[2, 2],
                                 max_kernel_norm=1.9365)

    fc = mlp.RectifiedLinear(layer_name='fc', dim=1500, irange=0.05)
    output = mlp.Softmax(layer_name='y',
                         n_classes=171,
                         irange=.005,
                         max_col_norm=1.9365)

    layers = [h1, fc, output]

    mdl = mlp.MLP(layers,
                  input_space=Conv2DSpace(shape=(70, 70), num_channels=1))

    trainer = sgd.SGD(
        learning_rate=0.002,
        batch_size=batch_size,
        learning_rule=learning_rule.RMSProp(),
        cost=SumOfCosts(costs=[
            Default(),
            WeightDecay(coeffs=[0.0005, 0.0005, 0.0005])
        ]),
        train_iteration_mode='shuffled_sequential',
        monitor_iteration_mode='sequential',
        termination_criterion=EpochCounter(max_epochs=15),
        monitoring_dataset={'test': tst, 'valid': vld})

    watcher = best_params.MonitorBasedSaveBest(
        channel_name='valid_y_misclass',
        save_path='./Saved Models/conv_supervised_layerwise_best1.pkl')

    decay = sgd.LinearDecayOverEpoch(start=8, saturate=15, decay_factor=0.1)

    experiment = Train(dataset=trn,
                       model=mdl,
                       algorithm=trainer,
                       extensions=[watcher, decay])

    experiment.main_loop()

    del mdl
    mdl = serial.load('./Saved Models/conv_supervised_layerwise_best1.pkl')
    mdl = push_monitor(mdl, 'k')

    '''
    The 2nd Convolution and Pooling Layers are added below.
    '''
    h2 = mlp.ConvRectifiedLinear(layer_name='h2',
                                 output_channels=64,
                                 irange=0.05,
                                 kernel_shape=[4, 4],
                                 pool_shape=[4, 4],
                                 pool_stride=[2, 2],
                                 max_kernel_norm=1.9365)

    fc = mlp.RectifiedLinear(layer_name='fc', dim=1500, irange=0.05)
    output = mlp.Softmax(layer_name='y',
                         n_classes=171,
                         irange=.005,
                         max_col_norm=1.9365)

    del mdl.layers[-1]
    mdl.layer_names.remove('y')
    del mdl.layers[-1]
    mdl.layer_names.remove('fc')
    mdl.add_layers([h2, fc, output])

    trainer = sgd.SGD(
        learning_rate=0.002,
        batch_size=batch_size,
        learning_rule=learning_rule.RMSProp(),
        cost=SumOfCosts(costs=[
            Default(),
            WeightDecay(coeffs=[0.0005, 0.0005, 0.0005, 0.0005])
        ]),
        train_iteration_mode='shuffled_sequential',
        monitor_iteration_mode='sequential',
        termination_criterion=EpochCounter(max_epochs=15),
        monitoring_dataset={'test': tst, 'valid': vld})

    watcher = best_params.MonitorBasedSaveBest(
        channel_name='valid_y_misclass',
        save_path='./Saved Models/conv_supervised_layerwise_best2.pkl')

    decay = sgd.LinearDecayOverEpoch(start=8, saturate=15, decay_factor=0.1)

    experiment = Train(dataset=trn,
                       model=mdl,
                       algorithm=trainer,
                       extensions=[watcher, decay])

    experiment.main_loop()

    del mdl
    mdl = serial.load('./Saved Models/conv_supervised_layerwise_best2.pkl')
    mdl = push_monitor(mdl, 'l')

    '''
    The 3rd Convolution and Pooling Layers are added below.
    '''
    h3 = mlp.ConvRectifiedLinear(layer_name='h2',
                                 output_channels=64,
                                 irange=0.05,
                                 kernel_shape=[4, 4],
                                 pool_shape=[4, 4],
                                 pool_stride=[2, 2],
                                 max_kernel_norm=1.9365)

    fc = mlp.RectifiedLinear(layer_name='h3', dim=1500, irange=0.05)
    output = mlp.Softmax(layer_name='y',
                         n_classes=10,
                         irange=.005,
                         max_col_norm=1.9365)

    del mdl.layers[-1]
    mdl.layer_names.remove('y')
    del mdl.layers[-1]
    mdl.layer_names.remove('fc')
    mdl.add_layers([h3, output])

    trainer = sgd.SGD(
        learning_rate=.002,
        batch_size=batch_size,
        learning_rule=learning_rule.RMSProp(),
        cost=SumOfCosts(costs=[
            Default(),
            WeightDecay(coeffs=[0.0005, 0.0005, 0.0005, 0.0005, 0.0005])
        ]),
        train_iteration_mode='shuffled_sequential',
        monitor_iteration_mode='sequential',
        termination_criterion=EpochCounter(max_epochs=15),
        monitoring_dataset={'test': tst, 'valid': vld})

    watcher = best_params.MonitorBasedSaveBest(
        channel_name='valid_y_misclass',
        save_path='./Saved Models/conv_supervised_layerwise_best3.pkl')

    decay = sgd.LinearDecayOverEpoch(start=8, saturate=15, decay_factor=0.1)

    experiment = Train(dataset=trn,
                       model=mdl,
                       algorithm=trainer,
                       extensions=[watcher, decay])

    experiment.main_loop()
y = Softmax(n_classes=2, layer_name="y", irange=0.1)

inputSpace = Conv2DSpace(shape=[cropSize, cropSize], num_channels=3)

model = MLP(layers=[h0, h1, y],
            batch_size=batchSize,
            input_space=inputSpace)

algorithm = SGD(learning_rate=1E-3,
                cost=SumOfCosts([
                    MethodCost("cost_from_X"),
                    Dropout(default_input_include_prob=0.25,
                            default_input_scale=1.3333)
                ]),
                batch_size=batchSize,
                monitoring_batch_size=batchSize,
                monitoring_dataset={'train': train, 'valid': valid},
                monitor_iteration_mode="even_batchwise_shuffled_sequential",
                termination_criterion=EpochCounter(max_epochs=200),
                learning_rule=Momentum(init_momentum=0.0),
                train_iteration_mode="even_batchwise_shuffled_sequential")

train = Train(dataset=train,
              model=model,
              algorithm=algorithm,
              save_path="ConvNet8.pkl",