def run(discriminative_regularization=True): streams = create_celeba_streams(training_batch_size=100, monitoring_batch_size=500, include_targets=False) main_loop_stream, train_monitor_stream, valid_monitor_stream = streams[:3] # Compute parameter updates for the batch normalization population # statistics. They are updated following an exponential moving average. rval = create_training_computation_graphs(discriminative_regularization) cg, bn_cg, variance_parameters = rval pop_updates = list( set(get_batch_normalization_updates(bn_cg, allow_duplicates=True))) decay_rate = 0.05 extra_updates = [(p, m * decay_rate + p * (1 - decay_rate)) for p, m in pop_updates] model = Model(bn_cg.outputs[0]) selector = Selector( find_bricks( model.top_bricks, lambda brick: brick.name in ('encoder_convnet', 'encoder_mlp', 'decoder_convnet', 'decoder_mlp'))) parameters = list(selector.get_parameters().values()) + variance_parameters # Prepare algorithm step_rule = Adam() algorithm = GradientDescent(cost=bn_cg.outputs[0], parameters=parameters, step_rule=step_rule) algorithm.add_updates(extra_updates) # Prepare monitoring monitored_quantities_list = [] for graph in [bn_cg, cg]: cost, kl_term, reconstruction_term = graph.outputs cost.name = 'nll_upper_bound' avg_kl_term = kl_term.mean(axis=0) avg_kl_term.name = 'avg_kl_term' avg_reconstruction_term = -reconstruction_term.mean(axis=0) avg_reconstruction_term.name = 'avg_reconstruction_term' monitored_quantities_list.append( [cost, avg_kl_term, avg_reconstruction_term]) train_monitoring = DataStreamMonitoring( monitored_quantities_list[0], train_monitor_stream, prefix="train", updates=extra_updates, after_epoch=False, before_first_epoch=False, every_n_epochs=5) valid_monitoring = DataStreamMonitoring( monitored_quantities_list[1], valid_monitor_stream, prefix="valid", after_epoch=False, before_first_epoch=False, every_n_epochs=5) # Prepare checkpoint save_path = 'celeba_vae_{}regularization.zip'.format( '' if discriminative_regularization else 'no_') checkpoint = Checkpoint(save_path, every_n_epochs=5, use_cpickle=True) extensions = [Timing(), FinishAfter(after_n_epochs=75), train_monitoring, valid_monitoring, checkpoint, Printing(), ProgressBar()] main_loop = MainLoop(data_stream=main_loop_stream, algorithm=algorithm, extensions=extensions) main_loop.run()
def train_rnnrbm(train, rnnrbm, epochs=1000, test=None, bokeh=True, load_path=None): cdk = theano.shared(10) lr = theano.shared(float32(0.004)) cost, v_sample = rnnrbm.cost(examples=x, mask=x_mask, k=cdk) error_rate = MismulitclassificationRate().apply(x, v_sample[-1], x_mask) error_rate.name = "error on note as a whole" mistake_rate = MismulitmistakeRate().apply(x, v_sample[-1], x_mask) mistake_rate.name = "single error within note" cost.name = 'rbm_cost' model = Model(cost) cg = ComputationGraph([cost]) step_rule = CompositeRule( [RemoveNotFinite(), StepClipping(30.0), Adam(learning_rate=lr), StepClipping(6.0), RemoveNotFinite()]) # Scale(0.01) gradients = dict(equizip(cg.parameters, T.grad(cost, cg.parameters, consider_constant=[v_sample]))) algorithm = GradientDescent(step_rule=step_rule, gradients=gradients, cost=cost, params=cg.parameters) algorithm.add_updates(cg.updates) extensions = [ SharedVariableModifier(parameter=cdk, function=lambda n, v: rnnrbm_cdk[n] if rnnrbm_cdk.get(n) else v), SharedVariableModifier(parameter=lr, function=lambda n, v: float32(0.78 * v) if n % (200 * 5) == 0 else v), FinishAfter(after_n_epochs=epochs), TrainingDataMonitoring( [cost, error_rate, mistake_rate, ], # hidden_states, debug_val, param_nans, # aggregation.mean(algorithm.total_gradient_norm)], #+ params, prefix="train", after_epoch=False, every_n_batches=40), Timing(), Printing(), ProgressBar()] if test is not None: extensions.append(DataStreamMonitoring( [cost, error_rate, mistake_rate], data_stream=test, updates=cg.updates, prefix="test", after_epoch=False, every_n_batches=40)) if bokeh: extensions.append(Plot( 'Training RNN-RBM', channels=[ ['train_error on note as a whole', 'train_single error within note', 'test_error on note as a whole', 'test_single error within note'], ['train_final_cost'], # ['train_total_gradient_norm'], ])) main_loop = MainLoop(algorithm=algorithm, data_stream=train, model=model, extensions=extensions ) return main_loop
def test_gradient_descent_finds_inputs_additional_updates():
    W = shared_floatx(numpy.array([[1, 2], [3, 4]]))
    n = shared_floatx(1)
    m = tensor.scalar('m')
    algorithm = GradientDescent(gradients=OrderedDict([(W, W + 1)]))
    algorithm.add_updates([(n, n + m)])
    algorithm.initialize()
    assert m in algorithm.inputs
def test_gradient_descent():
    W = shared_floatx(numpy.array([[1, 2], [3, 4]]))
    W_start_value = W.get_value()
    cost = tensor.sum(W ** 2)

    algorithm = GradientDescent(cost=cost, parameters=[W])
    algorithm.step_rule.learning_rate.set_value(0.75)
    algorithm.initialize()
    algorithm.process_batch(dict())
    assert_allclose(W.get_value(), -0.5 * W_start_value)
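# Why the assertion expects -0.5 * W_start_value: with cost = sum(W ** 2) the
# gradient is 2 * W, so a single plain-SGD step with learning rate 0.75 gives
# W - 0.75 * 2 * W = -0.5 * W. A minimal standalone sketch of the same update,
# using only NumPy:
import numpy

W_value = numpy.array([[1., 2.], [3., 4.]])
learning_rate = 0.75
gradient = 2 * W_value              # d/dW of sum(W ** 2)
updated = W_value - learning_rate * gradient
assert numpy.allclose(updated, -0.5 * W_value)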
def create_main_loop(dataset, nvis, nhid, num_epochs, debug_level=0, lrate=1e-3): seed = 188229 n_inference_steps = 6 num_examples = dataset.num_examples batch_size = num_examples train_loop_stream = Flatten( DataStream.default_stream( dataset=dataset, iteration_scheme=SequentialScheme(dataset.num_examples, batch_size) # Repeat( # , n_inference_steps) # ShuffledScheme(dataset.num_examples, batch_size), n_inference_steps)) ), which_sources=("features",), ) model_brick = FivEM( nvis=nvis, nhid=nhid, epsilon=0.01, batch_size=batch_size, weights_init=IsotropicGaussian(0.1), biases_init=Constant(0), noise_scaling=1, debug=debug_level, lateral_x=False, lateral_h=False, n_inference_steps=n_inference_steps, ) model_brick.initialize() x = tensor.matrix("features") cost = model_brick.cost(x) computation_graph = ComputationGraph([cost]) model = Model(cost) # step_rule = Adam(learning_rate=2e-5, beta1=0.1, beta2=0.001, epsilon=1e-8, # decay_factor=(1 - 1e-8)) step_rule = Momentum(learning_rate=lrate, momentum=0.95) # step_rule = AdaDelta() # step_rule = RMSProp(learning_rate=0.01) # step_rule = AdaGrad(learning_rate=1e-4) algorithm = GradientDescent(cost=cost, params=computation_graph.parameters, step_rule=step_rule) algorithm.add_updates(computation_graph.updates) extensions = [ Timing(), FinishAfter(after_n_epochs=num_epochs), TrainingDataMonitoring([cost] + computation_graph.auxiliary_variables, after_batch=False, after_epoch=True), # every_n_epochs=1), Printing(after_epoch=True, after_batch=False), # every_n_epochs=1, # Checkpoint(path="./fivem.zip",every_n_epochs=10,after_training=True) ] main_loop = MainLoop(model=model, data_stream=train_loop_stream, algorithm=algorithm, extensions=extensions) return main_loop
def _test(f):
    W = shared_floatx(numpy.array([[1, 2], [3, 4]]))
    W_start_value = W.get_value()
    cost = tensor.sum(W ** 2)

    gradients = OrderedDict()
    gradients[W] = tensor.grad(cost, W)

    algorithm = GradientDescent(gradients=f(gradients))
    algorithm.step_rule.learning_rate.set_value(0.75)
    algorithm.initialize()
    algorithm.process_batch(dict())
    assert_allclose(W.get_value(), -0.5 * W_start_value)
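# A possible driver for the helper above, assuming the Blocks version in use
# accepts `gradients` either as a mapping or as a sequence of (parameter,
# gradient) pairs; the test name and the exact set of containers are assumptions.
def test_gradient_descent_gradient_containers():
    for wrap in (lambda g: g,                   # OrderedDict unchanged
                 lambda g: list(g.items())):    # list of (param, grad) pairs
        _test(wrap)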
def test_theano_profile_for_sgd_function():
    W = shared_floatx(numpy.array([[1, 2], [3, 4]]))
    W_start_value = W.get_value()
    cost = tensor.sum(W ** 2)

    algorithm = GradientDescent(
        cost=cost, parameters=[W],
        theano_func_kwargs={'profile': True})
    algorithm.step_rule.learning_rate.set_value(0.75)
    algorithm.initialize()
    algorithm.process_batch(dict())
    assert_allclose(W.get_value(), -0.5 * W_start_value)
    assert isinstance(algorithm._function.profile, ProfileStats)
def run(): streams = create_celeba_streams(training_batch_size=100, monitoring_batch_size=500, include_targets=True) main_loop_stream = streams[0] train_monitor_stream = streams[1] valid_monitor_stream = streams[2] cg, bn_dropout_cg = create_training_computation_graphs() # Compute parameter updates for the batch normalization population # statistics. They are updated following an exponential moving average. pop_updates = get_batch_normalization_updates(bn_dropout_cg) decay_rate = 0.05 extra_updates = [(p, m * decay_rate + p * (1 - decay_rate)) for p, m in pop_updates] # Prepare algorithm step_rule = Adam() algorithm = GradientDescent(cost=bn_dropout_cg.outputs[0], parameters=bn_dropout_cg.parameters, step_rule=step_rule) algorithm.add_updates(extra_updates) # Prepare monitoring cost = bn_dropout_cg.outputs[0] cost.name = 'cost' train_monitoring = DataStreamMonitoring( [cost], train_monitor_stream, prefix="train", before_first_epoch=False, after_epoch=False, after_training=True, updates=extra_updates) cost, accuracy = cg.outputs cost.name = 'cost' accuracy.name = 'accuracy' monitored_quantities = [cost, accuracy] valid_monitoring = DataStreamMonitoring( monitored_quantities, valid_monitor_stream, prefix="valid", before_first_epoch=False, after_epoch=False, every_n_epochs=5) # Prepare checkpoint checkpoint = Checkpoint( 'celeba_classifier.zip', every_n_epochs=5, use_cpickle=True) extensions = [Timing(), FinishAfter(after_n_epochs=50), train_monitoring, valid_monitoring, checkpoint, Printing(), ProgressBar()] main_loop = MainLoop(data_stream=main_loop_stream, algorithm=algorithm, extensions=extensions) main_loop.run()
def algorithm(self):
    if self._algorithm is None:
        self._algorithm = GradientDescent(
            cost=self.cost, parameters=self.parameters,
            step_rule=CompositeRule(self.step_rules))
    return self._algorithm
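# A minimal sketch of the state this lazy property relies on, assuming the owning
# object exposes `cost`, `parameters` and a list of Blocks step rules; the class
# name and the particular rules chosen below are hypothetical.
from blocks.algorithms import Adam, StepClipping

class LazyTrainer(object):
    def __init__(self, cost, parameters):
        self.cost = cost
        self.parameters = parameters
        # CompositeRule applies the rules in the order given: clip, then Adam.
        self.step_rules = [StepClipping(10.0), Adam(learning_rate=1e-3)]
        self._algorithm = None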
def setup_mainloop(extensions):
    """Create a MainLoop, register the given extension, supply it with a
    DataStream and a minimal model/cost to optimize.
    """
    features = [numpy.array(f, dtype=floatX)
                for f in [[1, 2], [3, 4], [5, 6]]]
    dataset = IterableDataset(dict(features=features))
    datastream = DataStream(dataset)

    W = shared_floatx([0, 0], name='W')
    add_role(W, PARAMETER)
    x = tensor.vector('features')
    cost = tensor.sum((x - W) ** 2)
    cost.name = "cost"
    algorithm = GradientDescent(cost=cost, parameters=[W],
                                step_rule=Scale(1e-3))

    main_loop = MainLoop(
        model=Model(cost), data_stream=datastream,
        algorithm=algorithm,
        extensions=[FinishAfter(after_n_epochs=1)] + extensions)
    return main_loop
def build_bprop_graph(self): optimizer = self.get_optimizer() costs = self.link_here('costs').values() # there are either costs assigned to specific params isinstance_check = [isinstance(c, ParametersLink) for c in costs] if any(isinstance_check): assert all(isinstance_check), "Some costs have parameters associated "+\ "to them and others don't. None or all costs need to be bound." grads = OrderedDict() for paramlink in costs: cost = paramlink.raw_var assert len(cost) == 1 params = flatten([self.architecture[arch].parameters for arch in \ paramlink.architectures] + paramlink.parameters) grads.update(zip(params, theano.grad(cost[0], params))) cost = None # OR let blocks do the gradient else: assert len(costs) >= 1, "No cost variables?" cost = costs[0] for c in costs[1:]: cost += c grads = None algorithm = GradientDescent(cost=cost, gradients=grads, parameters=self.parameters, step_rule=optimizer, on_unused_sources='warn') self.algorithm = algorithm
def prepare_opti(cost, test, *args): model = Model(cost) logger.info("Model created") algorithm = GradientDescent(cost=cost, parameters=model.parameters, step_rule=Adam(learning_rate=0.0015), on_unused_sources='ignore') to_monitor = [algorithm.cost] if args: to_monitor.extend(args) extensions = [ FinishAfter(after_n_epochs=nb_epoch), FinishIfNoImprovementAfter(notification_name='loglikelihood_nat', epochs=patience), TrainingDataMonitoring(to_monitor, prefix="train", after_epoch=True), DataStreamMonitoring(to_monitor, test_stream, prefix="test"), Printing(), ProgressBar(), ApplyMask(before_first_epoch=True, after_batch=True), Checkpoint(check, every_n_epochs=save_every), SaveModel(name=path + '/' + 'pixelcnn_{}'.format(dataset), every_n_epochs=save_every), GenerateSamples(every_n_epochs=save_every), #Checkpoint(path+'/'+'exp.log', save_separately=['log'],every_n_epochs=save_every), ] if resume: logger.info("Restoring from previous checkpoint") extensions = [Load(path + '/' + check)] return model, algorithm, extensions
def setup_mainloop(extension):
    """Set up a simple main loop for progress bar tests.

    Create a MainLoop, register the given extension, supply it with a
    DataStream and a minimal model/cost to optimize.
    """
    # Since progressbar2 3.6.0, the `maxval` kwarg has been replaced by
    # `max_value`, which has a default value of 100. If we're still using
    # `maxval` by accident, this test should fail complaining that
    # the progress bar has received a value out of range.
    features = [numpy.array(f, dtype=theano.config.floatX)
                for f in [[1, 2]] * 101]
    dataset = IterableDataset(dict(features=features))

    W = shared_floatx([0, 0], name='W')
    x = tensor.vector('features')
    cost = tensor.sum((x - W) ** 2)
    cost.name = "cost"
    algorithm = GradientDescent(cost=cost, parameters=[W],
                                step_rule=Scale(1e-3))

    main_loop = MainLoop(
        model=None, data_stream=dataset.get_example_stream(),
        algorithm=algorithm,
        extensions=[FinishAfter(after_n_epochs=1), extension])
    return main_loop
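# A hypothetical use of the helper above in a test: exercise one extension end to
# end on the tiny 101-batch stream (the test name below is an assumption).
def test_progress_bar_runs_one_epoch():
    main_loop = setup_mainloop(ProgressBar())
    main_loop.run()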
def test_shared_variable_modifier():
    weights = numpy.array([-1, 1], dtype=theano.config.floatX)
    features = [numpy.array(f, dtype=theano.config.floatX)
                for f in [[1, 2], [3, 4], [5, 6]]]
    targets = [(weights * f).sum() for f in features]
    n_batches = 3
    dataset = IterableDataset(dict(features=features, targets=targets))

    x = tensor.vector('features')
    y = tensor.scalar('targets')
    W = shared_floatx([0, 0], name='W')
    cost = ((x * W).sum() - y) ** 2
    cost.name = 'cost'

    step_rule = Scale(0.001)
    sgd = GradientDescent(cost=cost, parameters=[W], step_rule=step_rule)
    main_loop = MainLoop(
        model=None, data_stream=dataset.get_example_stream(),
        algorithm=sgd,
        extensions=[
            FinishAfter(after_n_epochs=1),
            SharedVariableModifier(
                step_rule.learning_rate,
                lambda n: numpy.cast[theano.config.floatX](10. / n))])
    main_loop.run()

    assert_allclose(step_rule.learning_rate.get_value(),
                    numpy.cast[theano.config.floatX](10. / n_batches))
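# The callback given to SharedVariableModifier receives the number of batches
# processed so far and returns the new value for the shared variable, so after the
# single epoch above (three one-example batches) the learning rate ends at 10 / 3.
# A minimal standalone sketch of the same schedule, without Theano or Blocks:
def learning_rate_schedule(iterations_done):
    return 10. / iterations_done

assert abs(learning_rate_schedule(3) - 10. / 3) < 1e-12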
def main(): x = tensor.matrix("features") input_to_hidden1 = get_typical_layer(x, 784, 500) #hidden1_to_hidden2 = get_typical_layer(input_to_hidden1, 500, 300) hidden1_to_latent = get_typical_layer(input_to_hidden1, 500, 20) latent_to_hidden2 = get_typical_layer(hidden1_to_latent, 20, 500) #hidden3_to_hidden4 = get_typical_layer(latent_to_hidden3, 300, 500) hidden2_to_output = get_typical_layer(latent_to_hidden2, 500, 784, Logistic()) hidden2_to_output.name = "last_before_output" from blocks.bricks.cost import SquaredError, AbsoluteError, BinaryCrossEntropy from blocks.graph import ComputationGraph from blocks.algorithms import Adam, GradientDescent, Scale from blocks.roles import WEIGHT cost = BinaryCrossEntropy(name="error").apply(x, hidden2_to_output) cg = ComputationGraph(cost) weights = VariableFilter(roles=[WEIGHT]) (cg.variables) # cost += 0.0001 * tensor.sum(map(lambda x: (x**2).sum(), weights)) # cost.name = "regularized error" gd = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=Adam()) from blocks.main_loop import MainLoop from blocks.extensions import FinishAfter, Printing, ProgressBar from blocks.extensions.monitoring import DataStreamMonitoring, TrainingDataMonitoring monitor = TrainingDataMonitoring([cost], after_epoch=True) main_loop = MainLoop(data_stream=get_data_stream(), algorithm=gd, extensions=[monitor, FinishAfter(after_n_epochs=5), ProgressBar(), Printing()]) main_loop.run() showcase(cg, "last_before_output")
def test_load(): # Create a main loop and checkpoint it mlp = MLP(activations=[None], dims=[10, 10], weights_init=Constant(1.), use_bias=False) mlp.initialize() W = mlp.linear_transformations[0].W x = tensor.vector('data') cost = mlp.apply(x).mean() data = numpy.random.rand(10, 10).astype(theano.config.floatX) data_stream = IterableDataset(data).get_example_stream() main_loop = MainLoop(data_stream=data_stream, algorithm=GradientDescent(cost=cost, parameters=[W]), extensions=[ FinishAfter(after_n_batches=5), Checkpoint('myweirdmodel.picklebarrel') ]) main_loop.run() # Load the parameters, log and iteration state old_value = W.get_value() W.set_value(old_value * 2) main_loop = MainLoop(model=Model(cost), data_stream=data_stream, algorithm=GradientDescent(cost=cost, parameters=[W]), extensions=[ Load('myweirdmodel.picklebarrel', load_iteration_state=True, load_log=True) ]) main_loop.extensions[0].main_loop = main_loop main_loop._run_extensions('before_training') assert_allclose(W.get_value(), old_value) # Make sure things work too if the model was never saved before main_loop = MainLoop(model=Model(cost), data_stream=data_stream, algorithm=GradientDescent(cost=cost, parameters=[W]), extensions=[ Load('mynonexisting.picklebarrel', load_iteration_state=True, load_log=True) ]) main_loop.extensions[0].main_loop = main_loop main_loop._run_extensions('before_training')
def test_gradient_descent_spurious_sources():
    W = shared_floatx(numpy.array([[1, 2], [3, 4]]))
    W_start_value = W.get_value()
    cost = tensor.sum(W ** 2)

    algorithm = GradientDescent(cost=cost, parameters=[W])
    algorithm.step_rule.learning_rate.set_value(0.75)
    algorithm.initialize()
    with assert_raises(ValueError):
        algorithm.process_batch(dict(example_id='test'))

    algorithm = GradientDescent(cost=cost, parameters=[W],
                                on_unused_sources='ignore')
    algorithm.step_rule.learning_rate.set_value(0.75)
    algorithm.initialize()
    algorithm.process_batch(dict(example_id='test'))
    assert_allclose(W.get_value(), -0.5 * W_start_value)
def train(cost, error_rate, batch_size=100, num_epochs=150):
    # Setting Logger
    timestr = time.strftime("%Y_%m_%d_at_%H_%M")
    save_path = 'results/memory_' + timestr
    log_path = os.path.join(save_path, 'log.txt')
    os.makedirs(save_path)
    fh = logging.FileHandler(filename=log_path)
    fh.setLevel(logging.DEBUG)
    logger.addHandler(fh)

    # Training
    blocks_model = Model(cost)
    all_params = blocks_model.parameters
    print "Number of found parameters:" + str(len(all_params))
    print all_params

    training_algorithm = GradientDescent(
        cost=cost, parameters=all_params,
        step_rule=Adam(learning_rate=0.001))

    # training_algorithm = GradientDescent(
    #     cost=cost, params=all_params,
    #     step_rule=Scale(learning_rate=model.default_lr))

    monitored_variables = [cost, error_rate]

    # the rest is for validation
    # train_data_stream, valid_data_stream = get_mnist_streams(
    #     50000, batch_size)
    train_data_stream, valid_data_stream = get_mnist_video_streams(batch_size)

    train_monitoring = TrainingDataMonitoring(variables=monitored_variables,
                                              prefix="train",
                                              after_epoch=True)
    valid_monitoring = DataStreamMonitoring(variables=monitored_variables,
                                            data_stream=valid_data_stream,
                                            prefix="valid",
                                            after_epoch=True)

    main_loop = MainLoop(
        algorithm=training_algorithm, data_stream=train_data_stream,
        model=blocks_model,
        extensions=[
            train_monitoring, valid_monitoring,
            FinishAfter(after_n_epochs=num_epochs),
            SaveParams('valid_misclassificationrate_apply_error_rate',
                       blocks_model, save_path),
            SaveLog(save_path, after_epoch=True),
            ProgressBar(),
            Printing()])
    main_loop.run()
def train_model(cost, train_stream, valid_stream, valid_freq, valid_rare, load_location=None, save_location=None): cost.name = 'nll' perplexity = 2**(cost / tensor.log(2)) perplexity.name = 'ppl' # Define the model model = Model(cost) # Load the parameters from a dumped model if load_location is not None: logger.info('Loading parameters...') model.set_param_values(load_parameter_values(load_location)) cg = ComputationGraph(cost) algorithm = GradientDescent(cost=cost, step_rule=Scale(learning_rate=0.01), params=cg.parameters) main_loop = MainLoop( model=model, data_stream=train_stream, algorithm=algorithm, extensions=[ DataStreamMonitoring([cost, perplexity], valid_stream, prefix='valid_all', every_n_batches=5000), # Overfitting of rare words occurs between 3000 and 4000 iterations DataStreamMonitoring([cost, perplexity], valid_rare, prefix='valid_rare', every_n_batches=500), DataStreamMonitoring([cost, perplexity], valid_freq, prefix='valid_frequent', every_n_batches=5000), Printing(every_n_batches=500) ]) main_loop.run() # Save the main loop if save_location is not None: logger.info('Saving the main loop...') dump_manager = MainLoopDumpManager(save_location) dump_manager.dump(main_loop) logger.info('Saved')
def main(save_to, num_epochs):
    mlp = MLP([Tanh(), Softmax()], [784, 100, 10],
              weights_init=IsotropicGaussian(0.01),
              biases_init=Constant(0))
    mlp.initialize()

    x = tensor.matrix('features')
    y = tensor.lmatrix('targets')
    probs = mlp.apply(x)
    cost = CategoricalCrossEntropy().apply(y.flatten(), probs)
    error_rate = MisclassificationRate().apply(y.flatten(), probs)

    cg = ComputationGraph([cost])
    W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables)
    cost = cost + .00005 * (W1 ** 2).sum() + .00005 * (W2 ** 2).sum()
    cost.name = 'final_cost'

    mnist_train = MNIST(("train",))
    mnist_test = MNIST(("test",))

    algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                                step_rule=Scale(learning_rate=0.1))
    main_loop = MainLoop(
        algorithm,
        DataStream(mnist_train,
                   iteration_scheme=SequentialScheme(
                       mnist_train.num_examples, 50)),
        model=Model(cost),
        extensions=[
            Timing(),
            FinishAfter(after_n_epochs=num_epochs),
            DataStreamMonitoring(
                [cost, error_rate],
                DataStream(mnist_test,
                           iteration_scheme=SequentialScheme(
                               mnist_test.num_examples, 500)),
                prefix="test"),
            TrainingDataMonitoring(
                [cost, error_rate,
                 aggregation.mean(algorithm.total_gradient_norm)],
                prefix="train",
                after_epoch=True),
            Checkpoint(save_to),
            Plot('MNIST example',
                 channels=[
                     ['test_final_cost',
                      'test_misclassificationrate_apply_error_rate'],
                     ['train_total_gradient_norm']]),
            Printing()])
    main_loop.run()
def main(save_to, num_epochs, batch_size): mlp = MLP([Tanh(), Tanh(), Tanh(), Softmax()], [3072, 4096, 1024, 512, 10], weights_init=IsotropicGaussian(0.01), biases_init=Constant(0)) mlp.initialize() x = tt.tensor4('features', dtype='float32') y = tt.vector('label', dtype='int32') probs = mlp.apply(x.reshape((-1, 3072))) cost = CategoricalCrossEntropy().apply(y, probs) error_rate = MisclassificationRate().apply(y, probs) cg = ComputationGraph([cost]) ws = VariableFilter(roles=[WEIGHT])(cg.variables) cost = cost + .00005 * sum(([(w**2).sum() for w in ws])) cost.name = 'final_cost' train_dataset = Cifar10Dataset(data_dir='/home/belohlavek/data/cifar10', is_train=True) valid_dataset = Cifar10Dataset(data_dir='/home/belohlavek/data/cifar10', is_train=False) train_stream = train_dataset.get_stream(batch_size) valid_stream = valid_dataset.get_stream(batch_size) algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=Adam(learning_rate=0.001)) extensions = [ Timing(), LogExtension('/home/belohlavek/ALI/mlp.log'), FinishAfter(after_n_epochs=num_epochs), DataStreamMonitoring([cost, error_rate], valid_stream, prefix="test"), TrainingDataMonitoring([ cost, error_rate, aggregation.mean(algorithm.total_gradient_norm) ], prefix="train", after_epoch=True), Checkpoint(save_to), Printing() ] main_loop = MainLoop(algorithm, train_stream, model=Model(cost), extensions=extensions) main_loop.run()
def prepare_opti(cost, test):
    model = Model(cost)
    algorithm = GradientDescent(cost=cost, parameters=model.parameters,
                                step_rule=Adam(),
                                on_unused_sources='ignore')

    extensions = [
        FinishAfter(after_n_epochs=nb_epoch),
        FinishIfNoImprovementAfter(notification_name='test_vae_cost',
                                   epochs=patience),
        TrainingDataMonitoring([algorithm.cost], after_epoch=True),
        DataStreamMonitoring([algorithm.cost], test, prefix="test"),
        Printing(),
        ProgressBar(),
        # SaveModel(name='vae', after_n_epochs=save_every)
    ]

    return model, algorithm, extensions
s.set_value(sqrt(init_var).astype(floatX))

cost = CategoricalCrossEntropy().apply(y.flatten(), probs)
cost.name = 'cost'
error_rate = MisclassificationRate().apply(y.flatten(), probs)
error_rate.name = 'error_rate'

cg = ComputationGraph([cost])
parameters = cg.parameters
# add gradient descent to M, S
if normalization == 'bn2':
    for m, s, var in statistics_list:
        parameters.extend([m, s])

algorithm = GradientDescent(
    cost=cost, parameters=parameters, step_rule=Adam(0.01))

# update the M and S with batch statistics
alpha = 0.1
updates = []
if normalization == 'bn2':
    for m, s, var in statistics_list:
        updates.append((m, cast(alpha * m + (1 - alpha) * var.mean(axis=0),
                                floatX)))
        updates.append((s, cast(alpha * s + (1 - alpha) * var.std(axis=0),
                                floatX)))
algorithm.add_updates(updates)

# Since this line won't work with the extension to include parameters
# in the gradient descent, here's an extension that will do the job.
from blocks.extensions import SimpleExtension
from theano import function
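# The statistics updates above are an exponential moving average with alpha = 0.1:
# each batch keeps 10% of the previous estimate and mixes in 90% of the current
# batch statistic. A minimal NumPy illustration of the same recursion (the shapes
# and the synthetic data below are arbitrary):
import numpy as np

alpha = 0.1
running_mean = np.zeros(4, dtype='float32')
for _ in range(100):
    batch = np.random.randn(32, 4).astype('float32')
    running_mean = alpha * running_mean + (1 - alpha) * batch.mean(axis=0)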
for param in discriminator_cg.parameters: param.name += '_d' both = list(set(dsamples_cg.parameters) & set(generator_cg.parameters)) indices = [] for (i, par) in enumerate(dsamples_cg.parameters): if par in generator_cg.parameters: indices.append(i) good_params = [dsamples_cg.parameters[i] for i in indices] print 'tests' for param in dsamples_cg.parameters: print param.name discriminator_descent = GradientDescent(cost=cost_discriminator, parameters=discriminator_cg.parameters, step_rule=RMSProp(learning_rate=0.01, decay_rate=0.97)) print filter(lambda x: x.name[-2:] == '_g', dsamples_cg.parameters) generator_descent = GradientDescent(cost=cost_generator, parameters=filter(lambda x: x.name[-2:] == '_g', dsamples_cg.parameters), # parameters=good_params, # parameters=dsamples_cg.parameters, step_rule=RMSProp(learning_rate=1., decay_rate=0.97)) generator_descent.total_step_norm.name = 'generator_total_step_norm' generator_descent.total_gradient_norm.name = 'generator_total_gradient_norm' discriminator_descent.total_step_norm.name = 'discriminator_total_step_norm' discriminator_descent.total_gradient_norm.name = 'discriminator_total_gradient_norm' from fuel.datasets import MNIST mnist = MNIST(("train",))
def main(): nclasses = 27 import argparse parser = argparse.ArgumentParser() parser.add_argument("--seed", type=int, default=1) parser.add_argument("--length", type=int, default=180) parser.add_argument("--num-epochs", type=int, default=100) parser.add_argument("--batch-size", type=int, default=64) parser.add_argument("--learning-rate", type=float, default=1e-3) parser.add_argument("--epsilon", type=float, default=1e-5) parser.add_argument("--num-hidden", type=int, default=1000) parser.add_argument("--baseline", action="store_true") parser.add_argument("--initialization", choices="identity glorot orthogonal uniform".split(), default="identity") parser.add_argument("--initial-gamma", type=float, default=1e-1) parser.add_argument("--initial-beta", type=float, default=0) parser.add_argument("--cluster", action="store_true") parser.add_argument("--activation", choices=list(activations.keys()), default="tanh") parser.add_argument("--optimizer", choices="sgdmomentum adam rmsprop", default="rmsprop") parser.add_argument("--continue-from") parser.add_argument("--evaluate") parser.add_argument("--dump-hiddens") args = parser.parse_args() np.random.seed(args.seed) blocks.config.config.default_seed = args.seed if args.continue_from: from blocks.serialization import load main_loop = load(args.continue_from) main_loop.run() sys.exit(0) graphs, extensions, updates = construct_graphs(args, nclasses) ### optimization algorithm definition if args.optimizer == "adam": optimizer = Adam(learning_rate=args.learning_rate) elif args.optimizer == "rmsprop": optimizer = RMSProp(learning_rate=args.learning_rate, decay_rate=0.9) elif args.optimizer == "sgdmomentum": optimizer = Momentum(learning_rate=args.learning_rate, momentum=0.99) step_rule = CompositeRule([StepClipping(1.0), optimizer]) algorithm = GradientDescent( cost=graphs["training"].outputs[0], parameters=graphs["training"].parameters, step_rule=step_rule ) algorithm.add_updates(updates["training"]) model = Model(graphs["training"].outputs[0]) extensions = extensions["training"] + extensions["inference"] # step monitor step_channels = [] step_channels.extend( [ algorithm.steps[param].norm(2).copy(name="step_norm:%s" % name) for name, param in model.get_parameter_dict().items() ] ) step_channels.append(algorithm.total_step_norm.copy(name="total_step_norm")) step_channels.append(algorithm.total_gradient_norm.copy(name="total_gradient_norm")) step_channels.extend(graphs["training"].outputs) logger.warning("constructing training data monitor") extensions.append(TrainingDataMonitoring(step_channels, prefix="iteration", after_batch=True)) # parameter monitor extensions.append( DataStreamMonitoring( [param.norm(2).copy(name="parameter.norm:%s" % name) for name, param in model.get_parameter_dict().items()], data_stream=None, after_epoch=True, ) ) validation_interval = 500 # performance monitor for situation in "training inference".split(): if situation == "inference" and not args.evaluate: # save time when we don't need the inference graph continue for which_set in "train valid test".split(): logger.warning("constructing %s %s monitor" % (which_set, situation)) channels = list(graphs[situation].outputs) extensions.append( DataStreamMonitoring( channels, prefix="%s_%s" % (which_set, situation), every_n_batches=validation_interval, data_stream=get_stream( which_set=which_set, batch_size=args.batch_size, num_examples=10000, length=args.length ), ) ) extensions.extend( [ TrackTheBest("valid_training_error_rate", "best_valid_training_error_rate"), 
DumpBest("best_valid_training_error_rate", "best.zip"), FinishAfter(after_n_epochs=args.num_epochs), # FinishIfNoImprovementAfter("best_valid_error_rate", epochs=50), Checkpoint("checkpoint.zip", on_interrupt=False, every_n_epochs=1, use_cpickle=True), DumpLog("log.pkl", after_epoch=True), ] ) if not args.cluster: extensions.append(ProgressBar()) extensions.extend([Timing(), Printing(every_n_batches=validation_interval), PrintingTo("log")]) main_loop = MainLoop( data_stream=get_stream(which_set="train", batch_size=args.batch_size, length=args.length, augment=True), algorithm=algorithm, extensions=extensions, model=model, ) if args.dump_hiddens: dump_hiddens(args, main_loop) return if args.evaluate: evaluate(args, main_loop) return main_loop.run()
def train_model(cost, cross_entropy, updates, train_stream, valid_stream, args, gate_values=None): step_rule = learning_algorithm(args) cg = ComputationGraph(cost) # ADD REGULARIZATION # WEIGHT NOISE weight_noise = args.weight_noise if weight_noise > 0: weights = VariableFilter(roles=[WEIGHT])(cg.variables) cg_train = apply_noise(cg, weights, weight_noise) cost = cg_train.outputs[0] cost.name = "cost_with_weight_noise" cg = ComputationGraph(cost) logger.info(cg.parameters) algorithm = GradientDescent(cost=cost, step_rule=step_rule, params=cg.parameters) algorithm.add_updates(updates) # extensions to be added extensions = [] if args.load_path is not None: extensions.append(Load(args.load_path)) outputs = [ variable for variable in cg.variables if variable.name == "presoft"] if args.generate: extensions.append(TextGenerationExtension( outputs=outputs, generation_length=args.generated_text_lenght, initial_text_length=args.initial_text_length, every_n_batches=args.monitoring_freq, ploting_path=os.path.join(args.save_path, 'prob_plot.png'), softmax_sampling=args.softmax_sampling, dataset=args.dataset, updates=updates, interactive_mode=args.interactive_mode)) extensions.extend([ TrainingDataMonitoring([cost], prefix='train', every_n_batches=args.monitoring_freq, after_epoch=True), DataStreamMonitoring([cost, cross_entropy], valid_stream, args.mini_batch_size_valid, state_updates=updates, prefix='valid', before_first_epoch=not(args.visualize_gates), every_n_batches=args.monitoring_freq), ResetStates([v for v, _ in updates], every_n_batches=100), ProgressBar()]) # Creating directory for saving model. if not args.interactive_mode: if not os.path.exists(args.save_path): os.makedirs(args.save_path) else: raise Exception('Directory already exists') early_stopping = EarlyStopping('valid_cross_entropy', args.patience, args.save_path, every_n_batches=args.monitoring_freq) # Visualizing extensions if args.interactive_mode: extensions.append(InteractiveMode()) if args.visualize_gates and (gate_values is not None): if args.rnn_type == "lstm": extensions.append(VisualizeGateLSTM(gate_values, updates, args.dataset, ploting_path=None)) elif args.rnn_type == "soft": extensions.append(VisualizeGateSoft(gate_values, updates, args.dataset, ploting_path=None)) else: assert(False) extensions.append(early_stopping) extensions.append(Printing(every_n_batches=args.monitoring_freq)) main_loop = MainLoop( model=Model(cost), data_stream=train_stream, algorithm=algorithm, extensions=extensions ) main_loop.run()
def train(cli_params): cli_params['save_dir'] = prepare_dir(cli_params['save_to']) logfile = os.path.join(cli_params['save_dir'], 'log.txt') # Log also DEBUG to a file fh = logging.FileHandler(filename=logfile) fh.setLevel(logging.DEBUG) logger.addHandler(fh) logger.info('Logging into %s' % logfile) p, loaded = load_and_log_params(cli_params) in_dim, data, whiten, cnorm = setup_data(p, test_set=False) if not loaded: # Set the zero layer to match input dimensions p.encoder_layers = (in_dim,) + p.encoder_layers ladder = setup_model(p) # Training all_params = ComputationGraph([ladder.costs.total]).parameters logger.info('Found the following parameters: %s' % str(all_params)) # Fetch all batch normalization updates. They are in the clean path. bn_updates = ComputationGraph([ladder.costs.class_clean]).updates assert 'counter' in [u.name for u in bn_updates.keys()], \ 'No batch norm params in graph - the graph has been cut?' training_algorithm = GradientDescent( cost=ladder.costs.total, parameters=all_params, step_rule=Adam(learning_rate=ladder.lr.get_value())) # In addition to actual training, also do BN variable approximations training_algorithm.add_updates(bn_updates) short_prints = { "train": { 'T_C_class': ladder.costs.class_corr, 'T_C_de': ladder.costs.denois.values(), }, "valid_approx": OrderedDict([ ('V_C_class', ladder.costs.class_clean), ('V_E', ladder.error.clean), ('V_C_de', ladder.costs.denois.values()), ]), "valid_final": OrderedDict([ ('VF_C_class', ladder.costs.class_clean), ('VF_E', ladder.error.clean), ('VF_C_de', ladder.costs.denois.values()), ]), } main_loop = MainLoop( training_algorithm, # Datastream used for training make_datastream(data.train, data.train_ind, p.batch_size, n_labeled=p.labeled_samples, n_unlabeled=p.unlabeled_samples, whiten=whiten, cnorm=cnorm), model=Model(ladder.costs.total), extensions=[ FinishAfter(after_n_epochs=p.num_epochs), # This will estimate the validation error using # running average estimates of the batch normalization # parameters, mean and variance ApproxTestMonitoring( [ladder.costs.class_clean, ladder.error.clean] + ladder.costs.denois.values(), make_datastream(data.valid, data.valid_ind, p.valid_batch_size, whiten=whiten, cnorm=cnorm, scheme=ShuffledScheme), prefix="valid_approx"), # This Monitor is slower, but more accurate since it will first # estimate batch normalization parameters from training data and # then do another pass to calculate the validation error. 
FinalTestMonitoring( [ladder.costs.class_clean, ladder.error.clean] + ladder.costs.denois.values(), make_datastream(data.train, data.train_ind, p.batch_size, n_labeled=p.labeled_samples, whiten=whiten, cnorm=cnorm, scheme=ShuffledScheme), make_datastream(data.valid, data.valid_ind, p.valid_batch_size, n_labeled=len(data.valid_ind), whiten=whiten, cnorm=cnorm, scheme=ShuffledScheme), prefix="valid_final", after_n_epochs=p.num_epochs), TrainingDataMonitoring( [ladder.costs.total, ladder.costs.class_corr, training_algorithm.total_gradient_norm] + ladder.costs.denois.values(), prefix="train", after_epoch=True), SaveParams(None, all_params, p.save_dir, after_epoch=True), SaveExpParams(p, p.save_dir, before_training=True), SaveLog(p.save_dir, after_training=True), ShortPrinting(short_prints), LRDecay(ladder.lr, p.num_epochs * p.lrate_decay, p.num_epochs, after_epoch=True), ]) main_loop.run() # Get results df = DataFrame.from_dict(main_loop.log, orient='index') col = 'valid_final_error_rate_clean' logger.info('%s %g' % (col, df[col].iloc[-1])) if main_loop.log.status['epoch_interrupt_received']: return None return df
def train(ladder, batch_size=100, num_train_examples=60000, num_epochs=150, lrate_decay=0.67): # Setting Logger timestr = time.strftime("%Y_%m_%d_at_%H_%M") save_path = 'results/mnist_100_standard_' + timestr log_path = os.path.join(save_path, 'log.txt') os.makedirs(save_path) fh = logging.FileHandler(filename=log_path) fh.setLevel(logging.DEBUG) logger.addHandler(fh) # Training model = Model(ladder.costs.total) all_params = model.parameters print len(all_params) print all_params training_algorithm = GradientDescent( cost=ladder.costs.total, parameters=all_params, step_rule=Adam(learning_rate=ladder.lr)) # Fetch all batch normalization updates. They are in the clean path. # In addition to actual training, also do BN variable approximations bn_updates = ComputationGraph([ladder.costs.class_clean]).updates training_algorithm.add_updates(bn_updates) monitored_variables = [ ladder.costs.class_corr, ladder.costs.class_clean, ladder.error, training_algorithm.total_gradient_norm, ladder.costs.total] + ladder.costs.denois.values() train_data_stream, test_data_stream = get_mixed_streams( batch_size) train_monitoring = TrainingDataMonitoring( variables=monitored_variables, prefix="train", after_epoch=True) valid_monitoring = DataStreamMonitoring( variables=monitored_variables, data_stream=test_data_stream, prefix="test", after_epoch=True) main_loop = MainLoop( algorithm=training_algorithm, data_stream=train_data_stream, model=model, extensions=[ train_monitoring, valid_monitoring, FinishAfter(after_n_epochs=num_epochs), SaveParams('test_CE_corr', model, save_path), SaveLog(save_path, after_epoch=True), LRDecay(lr=ladder.lr, decay_first=num_epochs * lrate_decay, decay_last=num_epochs, after_epoch=True), Printing()]) main_loop.run()
rnnrbm.initialize() cost, v_sample = rnnrbm.cost(examples=x, mask=x_mask) error_rate = MismulitclassificationRate().apply(x, v_sample[-1], x_mask) error_rate.name = "error on note as a whole" mistake_rate = MismulitmistakeRate().apply(x, v_sample[-1], x_mask) mistake_rate.name = "single error within note" model = Model(cost) cg = ComputationGraph([cost]) step_rule = CompositeRule([RemoveNotFinite(), StepClipping(20.0), Adam(learning_rate=.001), StepClipping(3.0), RemoveNotFinite()]) # Scale(0.01) gradients = dict(equizip(cg.parameters, T.grad(cost, cg.parameters, consider_constant=[v_sample]))) algorithm = GradientDescent(step_rule=step_rule, gradients=gradients, cost=cost, params=cg.parameters) # # algorithm = GradientDescent(step_rule=step_rule, cost=cost, params=cg.parameters) ## l2/l1 regularization # reg = 0.000005 # params = VariableFilter(roles=[WEIGHT, BIAS])(cg.variables) # param_nans = 0 # for i, p in enumerate(params): # # cost += reg * abs(p).sum() # cost += reg * (p ** 2).sum() # param_nans += T.isnan(p).sum() # name = params[i].name # params[i] = params[i].mean()
def train(cli_params): cli_params["save_dir"] = prepare_dir(cli_params["save_to"]) logfile = os.path.join(cli_params["save_dir"], "log.txt") # Log also DEBUG to a file fh = logging.FileHandler(filename=logfile) fh.setLevel(logging.DEBUG) logger.addHandler(fh) logger.info("Logging into %s" % logfile) p, loaded = load_and_log_params(cli_params) in_dim, data, whiten, cnorm = setup_data(p, test_set=False) if not loaded: # Set the zero layer to match input dimensions p.encoder_layers = (in_dim,) + p.encoder_layers ladder = setup_model(p) # Training all_params = ComputationGraph([ladder.costs.total]).parameters logger.info("Found the following parameters: %s" % str(all_params)) # Fetch all batch normalization updates. They are in the clean path. bn_updates = ComputationGraph([ladder.costs.class_clean]).updates assert "counter" in [u.name for u in bn_updates.keys()], "No batch norm params in graph - the graph has been cut?" training_algorithm = GradientDescent( cost=ladder.costs.total, params=all_params, step_rule=Adam(learning_rate=ladder.lr) ) # In addition to actual training, also do BN variable approximations training_algorithm.add_updates(bn_updates) model = Model(ladder.costs.total) monitored_variables = [ ladder.costs.class_corr, ladder.costs.class_clean, ladder.error, # training_algorithm.total_gradient_norm, ladder.costs.total, ] # + ladder.costs.denois.values() # Make a global history recorder so that we can get summary at end of # training when we write to Sentinel # global_history records all relevant monitoring vars # updated by SaveLog every time global_history = {} main_loop = MainLoop( training_algorithm, # Datastream used for training make_datastream( data.train, data.train_ind, p.batch_size, n_labeled=p.labeled_samples, n_unlabeled=p.unlabeled_samples, whiten=whiten, cnorm=cnorm, ), model=model, extensions=[ FinishAfter(after_n_epochs=p.num_epochs), # write out to sentinel file for experiment automator to work SentinelWhenFinish(save_dir=p.save_dir, global_history=global_history), # This will estimate the validation error using # running average estimates of the batch normalization # parameters, mean and variance ApproxTestMonitoring( monitored_variables, make_datastream( data.valid, data.valid_ind, p.valid_batch_size, whiten=whiten, cnorm=cnorm, scheme=ShuffledScheme ), prefix="valid_approx", ), # This Monitor is slower, but more accurate since it will first # estimate batch normalization parameters from training data and # then do another pass to calculate the validation error. 
FinalTestMonitoring( monitored_variables, make_datastream( data.train, data.train_ind, p.batch_size, n_labeled=p.labeled_samples, whiten=whiten, cnorm=cnorm, scheme=ShuffledScheme, ), make_datastream( data.valid, data.valid_ind, p.valid_batch_size, n_labeled=len(data.valid_ind), whiten=whiten, cnorm=cnorm, scheme=ShuffledScheme, ), prefix="valid_final", after_n_epochs=p.num_epochs, ), TrainingDataMonitoring(variables=monitored_variables, prefix="train", after_epoch=True), SaveParams("valid_approx_cost_class_corr", model, p.save_dir), # SaveParams(None, all_params, p.save_dir, after_epoch=True), SaveExpParams(p, p.save_dir, before_training=True), SaveLog(save_dir=p.save_dir, after_epoch=True, global_history=global_history), Printing(), # ShortPrinting(short_prints), LRDecay(ladder.lr, p.num_epochs * p.lrate_decay, p.num_epochs, after_epoch=True), ], ) main_loop.run() # Get results df = main_loop.log.to_dataframe() col = "valid_final_error_rate" logger.info("%s %g" % (col, df[col].iloc[-1])) if main_loop.log.status["epoch_interrupt_received"]: return None return df
def train(ladder, batch_size=100, labeled_samples=100, unlabeled_samples=50000, valid_set_size=10000, num_epochs=150, valid_batch_size=100, lrate_decay=0.67, save_path='results/mnist_100_full0'): # Setting Logger log_path = os.path.join(save_path, 'log.txt') fh = logging.FileHandler(filename=log_path) fh.setLevel(logging.DEBUG) logger.addHandler(fh) logger.info('Logging into %s' % log_path) # Training all_params = ComputationGraph([ladder.costs.total]).parameters logger.info('Found the following parameters: %s' % str(all_params)) training_algorithm = GradientDescent( cost=ladder.costs.total, params=all_params, step_rule=Adam(learning_rate=ladder.lr)) # Fetch all batch normalization updates. They are in the clean path. # In addition to actual training, also do BN variable approximations bn_updates = ComputationGraph([ladder.costs.class_clean]).updates training_algorithm.add_updates(bn_updates) monitored_variables = [ ladder.costs.class_corr, ladder.costs.class_clean, ladder.error, training_algorithm.total_gradient_norm, ladder.costs.total] + ladder.costs.denois.values() data = get_mnist_data_dict(unlabeled_samples=unlabeled_samples, valid_set_size=valid_set_size) train_data_stream = make_datastream( data.train, data.train_ind, batch_size, n_labeled=labeled_samples, n_unlabeled=unlabeled_samples) valid_data_stream = make_datastream( data.valid, data.valid_ind, valid_batch_size, n_labeled=len(data.valid_ind), n_unlabeled=len(data.valid_ind)) train_monitoring = TrainingDataMonitoring( variables=monitored_variables, prefix="train", after_epoch=True) valid_monitoring = DataStreamMonitoring( variables=monitored_variables, data_stream=valid_data_stream, prefix="valid", after_epoch=True) main_loop = MainLoop( algorithm=training_algorithm, data_stream=train_data_stream, model=Model(ladder.costs.total), extensions=[ train_monitoring, valid_monitoring, FinishAfter(after_n_epochs=num_epochs), SaveParams(None, all_params, save_path, after_epoch=True), SaveLog(save_path, after_training=True), LRDecay(lr=ladder.lr, decay_first=num_epochs * lrate_decay, decay_last=num_epochs, after_epoch=True), Printing()]) main_loop.run()
def run(model_name, port_train, port_valid): running_on_laptop = socket.gethostname() == 'yop' X = tensor.tensor4('image_features', dtype='float32') T = tensor.matrix('targets', dtype='float32') image_border_size = (100, 100) if running_on_laptop: host_plot = 'http://*****:*****@ %s' % (model_name, datetime.datetime.now(), socket.gethostname()), channels=[['loss'], ['error', 'valid_error']], after_epoch=True, server_url=host_plot), Printing(), Checkpoint('/tmp/train_bn2') ] main_loop = MainLoop(data_stream=train_stream, algorithm=algorithm, extensions=extensions, model=model) main_loop.run()
model_name = sys.argv[-1]
config = importlib.import_module('.%s' % model_name, 'config')

# Build datastream
train_stream = datastream.setup_datastream(config.dataset,
                                           config.num_seqs,
                                           config.seq_len,
                                           config.seq_div_size)

# Build model
m = config.Model(config)

# Train the model
cg = Model(m.sgd_cost)
algorithm = GradientDescent(cost=m.sgd_cost,
                            step_rule=config.step_rule,
                            parameters=cg.parameters)
algorithm.add_updates(m.states)

monitor_vars = list(set(v for p in m.monitor_vars for v in p))
extensions = [
    ProgressBar(),
    TrainingDataMonitoring(
        monitor_vars, prefix='train', every_n_batches=config.monitor_freq),
    Printing(every_n_batches=config.monitor_freq, after_epoch=False),
    ResetStates([v for v, _ in m.states], after_epoch=True)
]

if plot_avail:
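# A hedged sketch of the config module the script above imports: the attribute
# names come from the usages (`dataset`, `num_seqs`, `seq_len`, `seq_div_size`,
# `step_rule`, `monitor_freq`, `Model`), but the concrete values and the Model
# stub are purely hypothetical.
from blocks.algorithms import Adam, CompositeRule, StepClipping

dataset = 'data/input.txt'      # hypothetical corpus path
num_seqs = 50                   # parallel sequences per batch
seq_len = 2000                  # length of each full sequence
seq_div_size = 100              # chunk size fed to the network per step
monitor_freq = 500              # batches between monitoring rounds
step_rule = CompositeRule([StepClipping(10.0), Adam(learning_rate=1e-3)])

class Model(object):
    """Expected to expose `sgd_cost`, `states` and `monitor_vars`."""
    def __init__(self, config):
        raise NotImplementedError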
def main(nvis, nhid, encoding_lstm_dim, decoding_lstm_dim, T=1): x = tensor.matrix('features') # Construct and initialize model encoding_mlp = MLP([Tanh()], [None, None]) decoding_mlp = MLP([Tanh()], [None, None]) encoding_lstm = LSTM(dim=encoding_lstm_dim) decoding_lstm = LSTM(dim=decoding_lstm_dim) draw = DRAW(nvis=nvis, nhid=nhid, T=T, encoding_mlp=encoding_mlp, decoding_mlp=decoding_mlp, encoding_lstm=encoding_lstm, decoding_lstm=decoding_lstm, biases_init=Constant(0), weights_init=Orthogonal()) draw.push_initialization_config() encoding_lstm.weights_init = IsotropicGaussian(std=0.001) decoding_lstm.weights_init = IsotropicGaussian(std=0.001) draw.initialize() # Compute cost cost = -draw.log_likelihood_lower_bound(x).mean() cost.name = 'nll_upper_bound' model = Model(cost) # Datasets and data streams mnist_train = BinarizedMNIST('train') train_loop_stream = ForceFloatX(DataStream( dataset=mnist_train, iteration_scheme=SequentialScheme(mnist_train.num_examples, 100))) train_monitor_stream = ForceFloatX(DataStream( dataset=mnist_train, iteration_scheme=SequentialScheme(mnist_train.num_examples, 500))) mnist_valid = BinarizedMNIST('valid') valid_monitor_stream = ForceFloatX(DataStream( dataset=mnist_valid, iteration_scheme=SequentialScheme(mnist_valid.num_examples, 500))) mnist_test = BinarizedMNIST('test') test_monitor_stream = ForceFloatX(DataStream( dataset=mnist_test, iteration_scheme=SequentialScheme(mnist_test.num_examples, 500))) # Get parameters and monitoring channels computation_graph = ComputationGraph([cost]) params = VariableFilter(roles=[PARAMETER])(computation_graph.variables) monitoring_channels = dict([ ('avg_' + channel.tag.name, channel.mean()) for channel in VariableFilter(name='.*term$')(computation_graph.auxiliary_variables)]) for name, channel in monitoring_channels.items(): channel.name = name monitored_quantities = monitoring_channels.values() + [cost] # Training loop step_rule = RMSProp(learning_rate=1e-3, decay_rate=0.95) algorithm = GradientDescent(cost=cost, params=params, step_rule=step_rule) algorithm.add_updates(computation_graph.updates) main_loop = MainLoop( model=model, data_stream=train_loop_stream, algorithm=algorithm, extensions=[ Timing(), SerializeMainLoop('vae.pkl', save_separately=['model']), FinishAfter(after_n_epochs=200), DataStreamMonitoring( monitored_quantities, train_monitor_stream, prefix="train", updates=computation_graph.updates), DataStreamMonitoring( monitored_quantities, valid_monitor_stream, prefix="valid", updates=computation_graph.updates), DataStreamMonitoring( monitored_quantities, test_monitor_stream, prefix="test", updates=computation_graph.updates), ProgressBar(), Printing()]) main_loop.run()
def main(save_to, num_epochs, weight_decay=0.0001, noise_pressure=0, subset=None, num_batches=None, batch_size=None, histogram=None, resume=False): output_size = 10 prior_noise_level = -10 noise_step_rule = Scale(1e-6) noise_rate = theano.shared(numpy.asarray(1e-5, dtype=theano.config.floatX)) convnet = create_res_net(out_noise=True, tied_noise=True, tied_sigma=True, noise_rate=noise_rate, prior_noise_level=prior_noise_level) x = tensor.tensor4('features') y = tensor.lmatrix('targets') # Normalize input and apply the convnet test_probs = convnet.apply(x) test_cost = (CategoricalCrossEntropy().apply(y.flatten(), test_probs) .copy(name='cost')) test_error_rate = (MisclassificationRate().apply(y.flatten(), test_probs) .copy(name='error_rate')) test_confusion = (ConfusionMatrix().apply(y.flatten(), test_probs) .copy(name='confusion')) test_confusion.tag.aggregation_scheme = Sum(test_confusion) test_cg = ComputationGraph([test_cost, test_error_rate]) # Apply dropout to all layer outputs except final softmax # dropout_vars = VariableFilter( # roles=[OUTPUT], bricks=[Convolutional], # theano_name_regex="^conv_[25]_apply_output$")(test_cg.variables) # drop_cg = apply_dropout(test_cg, dropout_vars, 0.5) # Apply 0.2 dropout to the pre-averaging layer # dropout_vars_2 = VariableFilter( # roles=[OUTPUT], bricks=[Convolutional], # theano_name_regex="^conv_8_apply_output$")(test_cg.variables) # train_cg = apply_dropout(test_cg, dropout_vars_2, 0.2) # Apply 0.2 dropout to the input, as in the paper # train_cg = apply_dropout(test_cg, [x], 0.2) # train_cg = drop_cg # train_cg = apply_batch_normalization(test_cg) # train_cost, train_error_rate, train_components = train_cg.outputs with batch_normalization(convnet): with training_noise(convnet): train_probs = convnet.apply(x) train_cost = (CategoricalCrossEntropy().apply(y.flatten(), train_probs) .copy(name='cost')) train_components = (ComponentwiseCrossEntropy().apply(y.flatten(), train_probs).copy(name='components')) train_error_rate = (MisclassificationRate().apply(y.flatten(), train_probs).copy(name='error_rate')) train_cg = ComputationGraph([train_cost, train_error_rate, train_components]) population_updates = get_batch_normalization_updates(train_cg) bn_alpha = 0.9 extra_updates = [(p, p * bn_alpha + m * (1 - bn_alpha)) for p, m in population_updates] # for annealing nit_penalty = theano.shared(numpy.asarray(noise_pressure, dtype=theano.config.floatX)) nit_penalty.name = 'nit_penalty' # Compute noise rates for training graph train_logsigma = VariableFilter(roles=[LOG_SIGMA])(train_cg.variables) train_mean_log_sigma = tensor.concatenate([n.flatten() for n in train_logsigma]).mean() train_mean_log_sigma.name = 'mean_log_sigma' train_nits = VariableFilter(roles=[NITS])(train_cg.auxiliary_variables) train_nit_rate = tensor.concatenate([n.flatten() for n in train_nits]).mean() train_nit_rate.name = 'nit_rate' train_nit_regularization = nit_penalty * train_nit_rate train_nit_regularization.name = 'nit_regularization' # Apply regularization to the cost trainable_parameters = VariableFilter(roles=[WEIGHT, BIAS])( train_cg.parameters) mask_parameters = [p for p in trainable_parameters if get_brick(p).name == 'mask'] noise_parameters = VariableFilter(roles=[NOISE])(train_cg.parameters) biases = VariableFilter(roles=[BIAS])(train_cg.parameters) weights = VariableFilter(roles=[WEIGHT])(train_cg.variables) nonmask_weights = [p for p in weights if get_brick(p).name != 'mask'] l2_norm = sum([(W ** 2).sum() for W in nonmask_weights]) l2_norm.name = 'l2_norm' 
l2_regularization = weight_decay * l2_norm l2_regularization.name = 'l2_regularization' # testversion test_cost = test_cost + l2_regularization test_cost.name = 'cost_with_regularization' # Training version of cost train_cost_without_regularization = train_cost train_cost_without_regularization.name = 'cost_without_regularization' train_cost = train_cost + l2_regularization + train_nit_regularization train_cost.name = 'cost_with_regularization' cifar10_train = CIFAR10(("train",)) cifar10_train_stream = RandomPadCropFlip( NormalizeBatchLevels(DataStream.default_stream( cifar10_train, iteration_scheme=ShuffledScheme( cifar10_train.num_examples, batch_size)), which_sources=('features',)), (32, 32), pad=4, which_sources=('features',)) test_batch_size = 128 cifar10_test = CIFAR10(("test",)) cifar10_test_stream = NormalizeBatchLevels(DataStream.default_stream( cifar10_test, iteration_scheme=ShuffledScheme( cifar10_test.num_examples, test_batch_size)), which_sources=('features',)) momentum = Momentum(0.01, 0.9) # Create a step rule that doubles the learning rate of biases, like Caffe. # scale_bias = Restrict(Scale(2), biases) # step_rule = CompositeRule([scale_bias, momentum]) # Create a step rule that reduces the learning rate of noise scale_mask = Restrict(noise_step_rule, mask_parameters) step_rule = CompositeRule([scale_mask, momentum]) # from theano.compile.nanguardmode import NanGuardMode # Train with simple SGD algorithm = GradientDescent( cost=train_cost, parameters=trainable_parameters, step_rule=step_rule) algorithm.add_updates(extra_updates) #, # theano_func_kwargs={ # 'mode': NanGuardMode( # nan_is_error=True, inf_is_error=True, big_is_error=True)}) exp_name = save_to.replace('.%d', '') # `Timing` extension reports time for reading data, aggregating a batch # and monitoring; # `ProgressBar` displays a nice progress bar during training. 
extensions = [Timing(), FinishAfter(after_n_epochs=num_epochs, after_n_batches=num_batches), EpochSchedule(momentum.learning_rate, [ (0, 0.01), # Warm up with 0.01 learning rate (50, 0.1), # Then go back to 0.1 (100, 0.01), (150, 0.001) # (83, 0.01), # Follow the schedule in the paper # (125, 0.001) ]), EpochSchedule(noise_step_rule.learning_rate, [ (0, 1e-2), (2, 1e-1), (4, 1) # (0, 1e-6), # (2, 1e-5), # (4, 1e-4) ]), EpochSchedule(noise_rate, [ (0, 1e-2), (2, 1e-1), (4, 1) # (0, 1e-6), # (2, 1e-5), # (4, 1e-4), # (6, 3e-4), # (8, 1e-3), # Causes nit rate to jump # (10, 3e-3), # (12, 1e-2), # (15, 3e-2), # (19, 1e-1), # (24, 3e-1), # (30, 1) ]), NoiseExtension( noise_parameters=noise_parameters), NoisyDataStreamMonitoring( [test_cost, test_error_rate, test_confusion], cifar10_test_stream, noise_parameters=noise_parameters, prefix="test"), TrainingDataMonitoring( [train_cost, train_error_rate, train_nit_rate, train_cost_without_regularization, l2_regularization, train_nit_regularization, momentum.learning_rate, train_mean_log_sigma, aggregation.mean(algorithm.total_gradient_norm)], prefix="train", every_n_batches=17), # after_epoch=True), Plot('Training performance for ' + exp_name, channels=[ ['train_cost_with_regularization', 'train_cost_without_regularization', 'train_nit_regularization', 'train_l2_regularization'], ['train_error_rate'], ['train_total_gradient_norm'], ['train_mean_log_sigma'], ], every_n_batches=17), Plot('Test performance for ' + exp_name, channels=[[ 'train_error_rate', 'test_error_rate', ]], after_epoch=True), EpochCheckpoint(save_to, use_cpickle=True, after_epoch=True), ProgressBar(), Printing()] if histogram: attribution = AttributionExtension( components=train_components, parameters=cg.parameters, components_size=output_size, after_batch=True) extensions.insert(0, attribution) if resume: extensions.append(Load(exp_name, True, True)) model = Model(train_cost) main_loop = MainLoop( algorithm, cifar10_train_stream, model=model, extensions=extensions) main_loop.run() if histogram: save_attributions(attribution, filename=histogram) with open('execution-log.json', 'w') as outfile: json.dump(main_loop.log, outfile, cls=NumpyEncoder)
def train(algorithm, learning_rate, clipping, momentum, layer_size, epochs, test_cost, experiment_path, initialization, init_width, weight_noise, z_prob, z_prob_states, z_prob_cells, drop_prob_igates, ogates_zoneout, batch_size, stoch_depth, share_mask, gaussian_drop, rnn_type, num_layers, norm_cost_coeff, penalty, testing, seq_len, decrease_lr_after_epoch, lr_decay, **kwargs): print '.. PTB experiment' print '.. arguments:', ' '.join(sys.argv) t0 = time.time() ########################################### # # LOAD DATA # ########################################### def onehot(x, numclasses=None): """ Convert integer encoding for class-labels (starting with 0 !) to one-hot encoding. The output is an array whose shape is the shape of the input array plus an extra dimension, containing the 'one-hot'-encoded labels. """ if x.shape == (): x = x[None] if numclasses is None: numclasses = x.max() + 1 result = numpy.zeros(list(x.shape) + [numclasses], dtype="int") z = numpy.zeros(x.shape, dtype="int") for c in range(numclasses): z *= 0 z[numpy.where(x == c)] = 1 result[..., c] += z return result.astype(theano.config.floatX) alphabetsize = 10000 data = np.load('penntree_char_and_word.npz') trainset = data['train_words'] validset = data['valid_words'] testset = data['test_words'] if testing: trainset = trainset[:3000] validset = validset[:3000] if share_mask: if not z_prob: raise ValueError('z_prob must be provided when using share_mask') if z_prob_cells or z_prob_states: raise ValueError('z_prob_states and z_prob_cells must not be provided when using share_mask (use z_prob instead)') z_prob_cells = z_prob # we don't want to actually use these masks, so this is to debug z_prob_states = None else: if z_prob: raise ValueError('z_prob is only used with share_mask') z_prob_cells = z_prob_cells or '1' z_prob_states = z_prob_states or '1' # rng = np.random.RandomState(seed) ########################################### # # MAKE STREAMS # ########################################### def prep_dataset(dataset): dataset = dataset[:(len(dataset) - (len(dataset) % (seq_len * batch_size)))] dataset = dataset.reshape(batch_size, -1, seq_len).transpose((1, 0, 2)) stream = DataStream(IndexableDataset(indexables=OrderedDict([ ('data', dataset)])), iteration_scheme=SequentialExampleScheme(dataset.shape[0])) stream = Transpose(stream, [(1, 0)]) stream = SampleDropsNPWord( stream, z_prob_states, z_prob_cells, drop_prob_igates, layer_size, num_layers, False, stoch_depth, share_mask, gaussian_drop, alphabetsize) stream.sources = ('data',) * 3 + stream.sources + ('zoneouts_states', 'zoneouts_cells', 'zoneouts_igates') return (stream,) train_stream, = prep_dataset(trainset) valid_stream, = prep_dataset(validset) test_stream, = prep_dataset(testset) #################### data = train_stream.get_epoch_iterator(as_dict=True).next() #################### ########################################### # # BUILD MODEL # ########################################### print '.. 
building model' x = T.tensor3('data') y = x zoneouts_states = T.tensor3('zoneouts_states') zoneouts_cells = T.tensor3('zoneouts_cells') zoneouts_igates = T.tensor3('zoneouts_igates') x.tag.test_value = data['data'] zoneouts_states.tag.test_value = data['zoneouts_states'] zoneouts_cells.tag.test_value = data['zoneouts_cells'] zoneouts_igates.tag.test_value = data['zoneouts_igates'] if init_width and not initialization == 'uniform': raise ValueError('Width is only for uniform init, whassup?') if initialization == 'glorot': weights_init = NormalizedInitialization() elif initialization == 'uniform': weights_init = Uniform(width=init_width) elif initialization == 'ortho': weights_init = OrthogonalInitialization() else: raise ValueError('No such initialization') if rnn_type.lower() == 'lstm': in_to_hids = [Linear(layer_size if l > 0 else alphabetsize, layer_size*4, name='in_to_hid%d'%l, weights_init=weights_init, biases_init=Constant(0.0)) for l in range(num_layers)] recurrent_layers = [DropLSTM(dim=layer_size, weights_init=weights_init, activation=Tanh(), model_type=6, name='rnn%d'%l, ogates_zoneout=ogates_zoneout) for l in range(num_layers)] elif rnn_type.lower() == 'gru': in_to_hids = [Linear(layer_size if l > 0 else alphabetsize, layer_size*3, name='in_to_hid%d'%l, weights_init=weights_init, biases_init=Constant(0.0)) for l in range(num_layers)] recurrent_layers = [DropGRU(dim=layer_size, weights_init=weights_init, activation=Tanh(), name='rnn%d'%l) for l in range(num_layers)] elif rnn_type.lower() == 'srnn': # FIXME!!! make ReLU in_to_hids = [Linear(layer_size if l > 0 else alphabetsize, layer_size, name='in_to_hid%d'%l, weights_init=weights_init, biases_init=Constant(0.0)) for l in range(num_layers)] recurrent_layers = [DropSimpleRecurrent(dim=layer_size, weights_init=weights_init, activation=Rectifier(), name='rnn%d'%l) for l in range(num_layers)] else: raise NotImplementedError hid_to_out = Linear(layer_size, alphabetsize, name='hid_to_out', weights_init=weights_init, biases_init=Constant(0.0)) for layer in in_to_hids: layer.initialize() for layer in recurrent_layers: layer.initialize() hid_to_out.initialize() layer_input = x #in_to_hid.apply(x) init_updates = OrderedDict() for l, (in_to_hid, layer) in enumerate(zip(in_to_hids, recurrent_layers)): rnn_embedding = in_to_hid.apply(layer_input) if rnn_type.lower() == 'lstm': states_init = theano.shared(np.zeros((batch_size, layer_size), dtype=floatX)) cells_init = theano.shared(np.zeros((batch_size, layer_size), dtype=floatX)) states_init.name, cells_init.name = "states_init", "cells_init" states, cells = layer.apply(rnn_embedding, zoneouts_states[:, :, l * layer_size : (l + 1) * layer_size], zoneouts_cells[:, :, l * layer_size : (l + 1) * layer_size], zoneouts_igates[:, :, l * layer_size : (l + 1) * layer_size], states_init, cells_init) init_updates.update([(states_init, states[-1]), (cells_init, cells[-1])]) elif rnn_type.lower() in ['gru', 'srnn']: # untested! 
states_init = theano.shared(np.zeros((batch_size, layer_size), dtype=floatX)) states_init.name = "states_init" states = layer.apply(rnn_embedding, zoneouts_states, zoneouts_igates, states_init) init_updates.update([(states_init, states[-1])]) else: raise NotImplementedError layer_input = states y_hat_pre_softmax = hid_to_out.apply(T.join(0, [states_init], states[:-1])) shape_ = y_hat_pre_softmax.shape y_hat = Softmax().apply( y_hat_pre_softmax.reshape((-1, alphabetsize))) #################### ########################################### # # SET UP COSTS AND MONITORS # ########################################### cost = CategoricalCrossEntropy().apply(y.reshape((-1, alphabetsize)), y_hat).copy('cost') bpc = (cost/np.log(2.0)).copy(name='bpr') perp = T.exp(cost).copy(name='perp') cost_train = cost.copy(name='train_cost') cg_train = ComputationGraph([cost_train]) ########################################### # # NORM STABILIZER # ########################################### norm_cost = 0. def _magnitude(x, axis=-1): return T.sqrt(T.maximum(T.sqr(x).sum(axis=axis), numpy.finfo(x.dtype).tiny)) if penalty == 'cells': assert VariableFilter(roles=[MEMORY_CELL])(cg_train.variables) for cell in VariableFilter(roles=[MEMORY_CELL])(cg_train.variables): norms = _magnitude(cell) norm_cost += T.mean(T.sum((norms[1:] - norms[:-1])**2, axis=0) / (seq_len - 1)) elif penalty == 'hids': for l in range(num_layers): assert 'rnn%d_apply_states'%l in [o.name for o in VariableFilter(roles=[OUTPUT])(cg_train.variables)] for output in VariableFilter(roles=[OUTPUT])(cg_train.variables): for l in range(num_layers): if output.name == 'rnn%d_apply_states'%l: norms = _magnitude(output) norm_cost += T.mean(T.sum((norms[1:] - norms[:-1])**2, axis=0) / (seq_len - 1)) norm_cost.name = 'norm_cost' #cost_valid = cost_train cost_train += norm_cost_coeff * norm_cost cost_train = cost_train.copy('cost_train') #should this be cost_train.outputs[0]? no. 
cg_train = ComputationGraph([cost_train]) ########################################### # # WEIGHT NOISE # ########################################### if weight_noise > 0: weights = VariableFilter(roles=[WEIGHT])(cg_train.variables) cg_train = apply_noise(cg_train, weights, weight_noise) cost_train = cg_train.outputs[0].copy(name='cost_train') model = Model(cost_train) learning_rate = float(learning_rate) clipping = StepClipping(threshold=np.cast[floatX](clipping)) if algorithm == 'adam': adam = Adam(learning_rate=learning_rate) learning_rate = adam.learning_rate step_rule = CompositeRule([adam, clipping]) elif algorithm == 'rms_prop': rms_prop = RMSProp(learning_rate=learning_rate) learning_rate = rms_prop.learning_rate step_rule = CompositeRule([clipping, rms_prop]) elif algorithm == 'momentum': sgd_momentum = Momentum(learning_rate=learning_rate, momentum=momentum) learning_rate = sgd_momentum.learning_rate step_rule = CompositeRule([clipping, sgd_momentum]) elif algorithm == 'sgd': sgd = Scale(learning_rate=learning_rate) learning_rate = sgd.learning_rate step_rule = CompositeRule([clipping, sgd]) else: raise NotImplementedError algorithm = GradientDescent(step_rule=step_rule, cost=cost_train, parameters=cg_train.parameters) # theano_func_kwargs={"mode": theano.compile.MonitorMode(post_func=detect_nan)}) algorithm.add_updates(init_updates) def cond_number(x): _, _, sing_vals = T.nlinalg.svd(x, True, True) sing_mags = abs(sing_vals) return T.max(sing_mags) / T.min(sing_mags) def rms(x): return (x*x).mean().sqrt() whysplode_cond = [] whysplode_rms = [] for i, p in enumerate(init_updates): v = p.get_value() if p.get_value().shape == 2: whysplode_cond.append(cond_number(p).copy('ini%d:%s_cond(%s)'%(i, p.name, "x".join(map(str, p.get_value().shape))))) whysplode_rms.append(rms(p).copy('ini%d:%s_rms(%s)'%(i, p.name, "x".join(map(str, p.get_value().shape))))) for i, p in enumerate(cg_train.parameters): v = p.get_value() if p.get_value().shape == 2: whysplode_cond.append(cond_number(p).copy('ini%d:%s_cond(%s)'%(i, p.name, "x".join(map(str, p.get_value().shape))))) whysplode_rms.append(rms(p).copy('ini%d:%s_rms(%s)'%(i, p.name, "x".join(map(str, p.get_value().shape))))) observed_vars = [cost_train, cost, bpc, perp, learning_rate, aggregation.mean(algorithm.total_gradient_norm).copy("gradient_norm_mean")] # + whysplode_rms parameters = model.get_parameter_dict() for name, param in parameters.iteritems(): observed_vars.append(param.norm(2).copy(name=name + "_norm")) observed_vars.append( algorithm.gradients[param].norm(2).copy(name=name + "_grad_norm")) train_monitor = TrainingDataMonitoring( variables=observed_vars, prefix="train", after_epoch=True ) dev_inits = [p.clone() for p in init_updates] cg_dev = ComputationGraph([cost, bpc, perp] + init_updates.values()).replace(zip(init_updates.keys(), dev_inits)) dev_cost, dev_bpc, dev_perp = cg_dev.outputs[:3] dev_init_updates = OrderedDict(zip(dev_inits, cg_dev.outputs[3:])) dev_monitor = DataStreamMonitoring( variables=[dev_cost, dev_bpc, dev_perp], data_stream=valid_stream, prefix="dev", updates=dev_init_updates ) # noone does this if 'load_path' in kwargs: with open(kwargs['load_path']) as f: loaded = np.load(f) model = Model(cost_train) params_dicts = model.get_parameter_dict() params_names = params_dicts.keys() for param_name in params_names: param = params_dicts[param_name] # '/f_6_.W' --> 'f_6_.W' slash_index = param_name.find('/') param_name = param_name[slash_index + 1:] if param.get_value().shape == loaded[param_name].shape: print 'Found: ' 
+ param_name param.set_value(loaded[param_name]) else: print 'Not found: ' + param_name extensions = [] extensions.extend([FinishAfter(after_n_epochs=epochs), train_monitor, dev_monitor]) if test_cost: test_inits = [p.clone() for p in init_updates] cg_test = ComputationGraph([cost, bpc, perp] + init_updates.values()).replace(zip(init_updates.keys(), test_inits)) test_cost, test_bpc, test_perp = cg_test.outputs[:3] test_init_updates = OrderedDict(zip(test_inits, cg_test.outputs[3:])) test_monitor = DataStreamMonitoring( variables=[test_cost, test_bpc, test_perp], data_stream=test_stream, prefix="test", updates=test_init_updates ) extensions.extend([test_monitor]) if not os.path.exists(experiment_path): os.makedirs(experiment_path) log_path = os.path.join(experiment_path, 'log.txt') fh = logging.FileHandler(filename=log_path) fh.setLevel(logging.DEBUG) logger.addHandler(fh) extensions.append(SaveParams('dev_cost', model, experiment_path, every_n_epochs=1)) extensions.append(SaveLog(every_n_epochs=1)) extensions.append(ProgressBar()) extensions.append(Printing()) class RollsExtension(TrainingExtension): """ rolls the cell and state activations between epochs so that first batch gets correct initial activations """ def __init__(self, shvars): self.shvars = shvars def before_epoch(self): for v in self.shvars: v.set_value(np.roll(v.get_value(), 1, 0)) extensions.append(RollsExtension(init_updates.keys() + dev_init_updates.keys() + (test_init_updates.keys() if test_cost else []))) class LearningRateSchedule(TrainingExtension): """ Lets you set a number to divide learning rate by each epoch + when to start doing that """ def __init__(self): self.epoch_number = 0 def after_epoch(self): self.epoch_number += 1 if self.epoch_number > decrease_lr_after_epoch: learning_rate.set_value(learning_rate.get_value()/lr_decay) if bool(lr_decay) != bool(decrease_lr_after_epoch): raise ValueError('Need to define both lr_decay and decrease_lr_after_epoch') if lr_decay and decrease_lr_after_epoch: extensions.append(LearningRateSchedule()) main_loop = MainLoop(model=model, data_stream=train_stream, algorithm=algorithm, extensions=extensions) t1 = time.time() print "Building time: %f" % (t1 - t0) main_loop.run() print "Execution time: %f" % (time.time() - t1)
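# --- Hedged numpy sketch of the norm stabilizer penalty built symbolically in
# the PTB train() above: for a (seq_len, batch, dim) stack of hidden states or
# memory cells, penalise squared differences between the L2 norms of
# consecutive time steps, averaged over the batch. The random `example_states`
# array is illustrative data only, not model output.
import numpy

def norm_stabilizer_penalty(states, eps=1e-12):
    """states: float array of shape (seq_len, batch, dim)."""
    norms = numpy.sqrt(numpy.maximum((states ** 2).sum(axis=-1), eps))  # (seq_len, batch)
    diffs = (norms[1:] - norms[:-1]) ** 2                               # (seq_len - 1, batch)
    return diffs.sum(axis=0).mean() / (states.shape[0] - 1)

example_states = numpy.random.RandomState(0).randn(35, 32, 100).astype('float32')
print(norm_stabilizer_penalty(example_states))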
def train(cli_params): cli_params['save_dir'] = prepare_dir(cli_params['save_to']) logfile = os.path.join(cli_params['save_dir'], 'log.txt') # Log also DEBUG to a file fh = logging.FileHandler(filename=logfile) fh.setLevel(logging.DEBUG) logger.addHandler(fh) logger.info('Logging into %s' % logfile) p, loaded = load_and_log_params(cli_params) in_dim, data, whiten, cnorm = setup_data(p, test_set=True) if not loaded: # Set the zero layer to match input dimensions p.encoder_layers = (in_dim,) + p.encoder_layers ladder = setup_model(p) # Training all_params = ComputationGraph([ladder.costs.total]).parameters logger.info('Found the following parameters: %s' % str(all_params)) # Fetch all batch normalization updates. They are in the clean path. bn_updates = ComputationGraph([ladder.costs.class_clean]).updates assert 'counter' in [u.name for u in bn_updates.keys()], \ 'No batch norm params in graph - the graph has been cut?' training_algorithm = GradientDescent( cost=ladder.costs.total, params=all_params, step_rule=Adam(learning_rate=ladder.lr)) # In addition to actual training, also do BN variable approximations training_algorithm.add_updates(bn_updates) model=Model(ladder.costs.total) monitored_variables = [ ladder.costs.class_corr, ladder.costs.class_clean, ladder.error, # training_algorithm.total_gradient_norm, ladder.costs.total] \ # + ladder.costs.denois.values() # Make a global history recorder so that we can get summary at end of # training when we write to Sentinel # global_history records all relevant monitoring vars # updated by SaveLog every time global_history = {} main_loop = MainLoop( training_algorithm, # Datastream used for training make_datastream(data.train, data.train_ind, p.batch_size, n_labeled=p.labeled_samples, n_unlabeled=p.unlabeled_samples, whiten=whiten, cnorm=cnorm), model=model, extensions=[ FinishAfter(after_n_epochs=p.num_epochs), # This will estimate the validation error using # running average estimates of the batch normalization # parameters, mean and variance ApproxTestMonitoring( monitored_variables, make_datastream(data.valid, data.valid_ind, p.valid_batch_size, whiten=whiten, cnorm=cnorm, scheme=ShuffledScheme), prefix="valid_approx"), # This Monitor is slower, but more accurate since it will first # estimate batch normalization parameters from training data and # then do another pass to calculate the validation error. 
FinalTestMonitoring( monitored_variables, make_datastream(data.train, data.train_ind, p.batch_size, n_labeled=p.labeled_samples, whiten=whiten, cnorm=cnorm, scheme=ShuffledScheme), # DEPREC: we directly test now # make_datastream(data.valid, data.valid_ind, # p.valid_batch_size, # n_labeled=len(data.valid_ind), # whiten=whiten, cnorm=cnorm, # scheme=ShuffledScheme), # prefix="valid_final", make_datastream(data.test, data.test_ind, p.batch_size, n_labeled=len(data.test_ind), whiten=whiten, cnorm=cnorm, scheme=ShuffledScheme), prefix="final_test", after_n_epochs=p.num_epochs), TrainingDataMonitoring( variables=monitored_variables, prefix="train", after_epoch=True), # write out to sentinel file for experiment automator to work # REMOVE THIS if you're running test mode with early stopping immediately after SentinelWhenFinish(save_dir=p.save_dir, global_history=global_history), # originally use 'valid_approx_cost_class_clean' # turns out should use ER as early stopping # use CE as a fallback (secondary early stopvar) if ER is the same # SaveParams(('valid_approx_error_rate', 'valid_approx_cost_class_clean'), # model, p.save_dir), # doesn't do early stopping now SaveParams(None, model, p.save_dir, after_epoch=True), SaveExpParams(p, p.save_dir, before_training=True), SaveLog(save_dir=p.save_dir, after_epoch=True, global_history=global_history), Printing(), # ShortPrinting(short_prints), LRDecay(ladder.lr, p.num_epochs * p.lrate_decay, p.num_epochs, after_epoch=True), ]) main_loop.run() # ================= Add testing at end of training ================= # DEPREC don't do early stopping anymore if False: p.load_from = p.save_dir ladder = setup_model(p) logger.info('Start testing on trained_params_best') main_loop = DummyLoop( extensions=[ # write to global history SaveLog(save_dir=p.save_dir, after_training=True, global_history=global_history), # write out to sentinel file for experiment automator to work SentinelWhenFinish(save_dir=p.save_dir, global_history=global_history), FinalTestMonitoring( [ladder.costs.class_clean, ladder.error] + ladder.costs.denois.values(), make_datastream(data.train, data.train_ind, # These need to match with the training p.batch_size, n_labeled=p.labeled_samples, n_unlabeled=len(data.train_ind), cnorm=cnorm, whiten=whiten, scheme=ShuffledScheme), make_datastream(data.test, data.test_ind, p.batch_size, n_labeled=len(data.test_ind), n_unlabeled=len(data.test_ind), cnorm=cnorm, whiten=whiten, scheme=ShuffledScheme), prefix="test", before_training=True) ]) main_loop.run() # Get results df = main_loop.log.to_dataframe() # col = 'valid_final_error_rate' # logger.info('%s %g' % (col, df[col].iloc[-1])) if main_loop.log.status['epoch_interrupt_received']: return None return df
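# --- Hedged sketch of a linear learning-rate decay of the kind the LRDecay
# extension above is assumed to apply: hold ladder.lr constant until epoch
# p.num_epochs * p.lrate_decay, then decay it linearly towards zero by the
# final epoch. All names and numbers below are illustrative assumptions, not
# the extension's actual code.
def linear_decay(base_lr, epoch, decay_start, num_epochs):
    if epoch <= decay_start:
        return base_lr
    remaining = max(num_epochs - epoch, 0)
    return base_lr * remaining / float(num_epochs - decay_start)

for epoch in (0, 80, 90, 100):
    print(linear_decay(0.002, epoch, decay_start=80, num_epochs=100))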
def main(name, epochs, batch_size, learning_rate, window_size, conv_sizes, num_filters, fc_dim, enc_dim, dec_dim, step, num_digits, num_classes, oldmodel, live_plotting): channels, img_height, img_width = 1, 100, 100 rnninits = { 'weights_init': Uniform(width=0.02), 'biases_init': Constant(0.), } inits = { 'weights_init': IsotropicGaussian(0.001), 'biases_init': Constant(0.), } rec_inits = { 'weights_init': IsotropicGaussian(0.001), 'biases_init': Constant(0.), } convinits = { 'weights_init': Uniform(width=.2), 'biases_init': Constant(0.), } n_iter = step * num_digits filter_size1, filter_size2 = zip(conv_sizes, conv_sizes)[:] w_height, w_width = window_size.split(',') w_height = int(w_height) w_width = int(w_width) subdir = time.strftime("%Y-%m-%d") + "-" + name if not os.path.exists(subdir): os.makedirs(subdir) lines = ["\n Running experiment", " subdirectory: %s" % subdir, " learning rate: %g" % learning_rate, " attention size: %s" % window_size, " n_iterations: %d" % n_iter, " encoder dimension: %d" % enc_dim, " decoder dimension: %d" % dec_dim, " batch size: %d" % batch_size, " epochs: %d" % epochs, ] for line in lines: print(line) print() rectifier = Rectifier() conv1 = Convolutional(filter_size=filter_size2, num_filters=int(num_filters / 2), num_channels=channels, image_size=(w_height, w_width), border_mode='half', name='conv1', **convinits) conv1_bn = SpatialBatchNormalization(input_dim=(64, 26, 26), conserve_memory=False, n_iter=n_iter, name='conv1_bn') conv2 = Convolutional(filter_size=filter_size2, num_channels=int(num_filters / 2), num_filters=int(num_filters / 2), image_size=(26, 26), name='conv2', **convinits) conv2_bn = SpatialBatchNormalization(input_dim=(64, 24, 24), conserve_memory=False, n_iter=n_iter, name='conv2_bn') max_pooling = MaxPooling(pooling_size=(2, 2), step=(2, 2)) conv3 = Convolutional(filter_size=filter_size2, num_filters=num_filters, num_channels=int(num_filters / 2), image_size=(12, 12), border_mode='half', name='conv3', **convinits) conv3_bn = SpatialBatchNormalization(input_dim=(128, 12, 12), conserve_memory=False, n_iter=n_iter, name='conv3_bn') conv4 = Convolutional(filter_size=filter_size2, num_filters=num_filters, num_channels=num_filters, image_size=(12, 12), border_mode='half', name='conv4', **convinits) conv4_bn = SpatialBatchNormalization(input_dim=(128, 12, 12), conserve_memory=False, n_iter=n_iter, name='conv4_bn') # Max Pooling conv5 = Convolutional(filter_size=filter_size2, num_filters=160, num_channels=num_filters, image_size=(6, 6), border_mode='half', name='conv5', **convinits) conv5_bn = SpatialBatchNormalization(input_dim=(160, 6, 6), conserve_memory=False, n_iter=n_iter, name='conv5_bn') conv6 = Convolutional(filter_size=filter_size2, num_filters=192, num_channels=160, image_size=(6, 6), name='conv6', **convinits) conv6_bn = SpatialBatchNormalization(input_dim=(192, 4, 4), conserve_memory=False, n_iter=n_iter, name='conv6_bn') conv_mlp = MLP(activations=[Identity()], dims=[3072, fc_dim], name="MLP_conv", **inits) conv_mlp_bn = BatchNormalization(input_dim=fc_dim, conserve_memory=False, n_iter=n_iter, name='conv_mlp_bn') loc_mlp = MLP(activations=[Identity()], dims=[6, fc_dim], name="MLP_loc", **inits) loc_mlp_bn = BatchNormalization(input_dim=fc_dim, conserve_memory=False, n_iter=n_iter, name='loc_mlp_bn') encoder_mlp = MLP([Identity()], [fc_dim, 4 * enc_dim], name="MLP_enc", **rec_inits) decoder_mlp = MLP([Identity()], [enc_dim, 4 * dec_dim], name="MLP_dec", **rec_inits) encoder_rnn = LSTM(activation=Tanh(), dim=enc_dim, 
name="RNN_enc", **rnninits) conv_init = ConvolutionalSequence( [Convolutional(filter_size=filter_size1, num_filters=int(num_filters / 8), name='conv1_init'), SpatialBatchNormalization(conserve_memory=False, name='conv1_bn_init'), Convolutional(filter_size=filter_size2, num_filters=int(num_filters / 8), name='conv2_init'), SpatialBatchNormalization(conserve_memory=False, name='conv2_bn_init'), Convolutional(filter_size=filter_size2, num_filters=int(num_filters / 4), name='conv3_init'), SpatialBatchNormalization(conserve_memory=False, name='conv3_bn_init'), ], image_size=(12, 12), num_channels=channels, name='conv_seq_init', **convinits) decoder_rnn = LSTM(activation=Tanh(), dim=dec_dim, name="RNN_dec", **rnninits) emit_mlp = MLP(activations=[Tanh()], dims=[dec_dim, 6], name='emit_mlp', weights_init=Constant(0.), biases_init=Constant((1., 0., 0., 0., 1., 0.))) classification_mlp1 = MLP(activations=[Identity()], dims=[enc_dim, fc_dim], name='MPL_class1', **inits) classification_mlp1_bn = BatchNormalization(input_dim=fc_dim, conserve_memory=False, n_iter=n_iter, name='classification_mlp1_bn') classification_mlp2 = MLP(activations=[Identity()], dims=[fc_dim, fc_dim], name='MPL_class2', **inits) classification_mlp2_bn = BatchNormalization(input_dim=fc_dim, conserve_memory=False, n_iter=n_iter, name='classification_mlp2_bn') classification_mlp3 = MLP(activations=[Softmax()], dims=[fc_dim, num_classes], name='MPL_class3', **inits) edram = EDRAM(channels=channels, out_height=w_height, out_width=w_width, n_iter=n_iter, num_classes=num_classes, rectifier=rectifier, conv1=conv1, conv1_bn=conv1_bn, conv2=conv2, conv2_bn=conv2_bn, max_pooling=max_pooling, conv3=conv3, conv3_bn=conv3_bn, conv4=conv4, conv4_bn=conv4_bn, conv5=conv5, conv5_bn=conv5_bn, conv6=conv6, conv6_bn=conv6_bn, conv_mlp=conv_mlp, conv_mlp_bn=conv_mlp_bn, loc_mlp=loc_mlp, loc_mlp_bn=loc_mlp_bn, conv_init=conv_init, encoder_mlp=encoder_mlp, encoder_rnn=encoder_rnn, decoder_mlp=decoder_mlp, decoder_rnn=decoder_rnn, classification_mlp1=classification_mlp1, classification_mlp1_bn=classification_mlp1_bn, classification_mlp2=classification_mlp2, classification_mlp2_bn=classification_mlp2_bn, classification_mlp3=classification_mlp3, emit_mlp=emit_mlp) edram.initialize() # ------------------------------------------------------------------------ x = T.ftensor4('features') x_coarse = T.ftensor4('features_coarse') y = T.ivector('labels') wr = T.fmatrix('locations') with batch_normalization(edram): bn_p, bn_l, m_c1_bn, s_c1_bn, m_c2_bn, s_c2_bn, m_c3_bn, s_c3_bn, m_c4_bn, s_c4_bn, m_c5_bn, s_c5_bn, m_c6_bn, s_c6_bn, \ m_c_bn, s_c_bn, m_l_bn, s_l_bn, m_cl1_bn, s_cl1_bn, m_cl2_bn, s_cl2_bn = edram.calculate_train(x, x_coarse) def compute_cost(p, wr, y, l): cost_where = T.dot(T.sqr(wr - l), [1, 0.5, 1, 0.5, 1, 1]) cost_y = T.stack([T.nnet.categorical_crossentropy(T.maximum(p[i, :], 1e-7), y) for i in range(0, n_iter)]) return cost_where, cost_y cost_where, cost_y = compute_cost(bn_p, wr, y, bn_l) bn_cost = cost_y + cost_where bn_cost = bn_cost.sum(axis=0) bn_cost = bn_cost.mean() bn_cost.name = 'cost' bn_error_rate = MisclassificationRate().apply(y, bn_p[-1]) bn_error_rate.name = 'error_rate' # ------------------------------------------------------------ bn_cg = ComputationGraph([bn_cost, bn_error_rate]) # Prepare algorithm algorithm = GradientDescent( cost=bn_cg.outputs[0], on_unused_sources='ignore', parameters=bn_cg.parameters, step_rule=CompositeRule([ RemoveNotFinite(), StepClipping(10.), Adam(learning_rate) ]) ) pop_updates = 
get_batch_normalization_updates(bn_cg) update_params = [conv1_bn.population_mean, conv1_bn.population_stdev, conv2_bn.population_mean, conv2_bn.population_stdev, conv3_bn.population_mean, conv3_bn.population_stdev, conv4_bn.population_mean, conv4_bn.population_stdev, conv5_bn.population_mean, conv5_bn.population_stdev, conv6_bn.population_mean, conv6_bn.population_stdev, conv_mlp_bn.population_mean, conv_mlp_bn.population_stdev, loc_mlp_bn.population_mean, loc_mlp_bn.population_stdev, classification_mlp1_bn.population_mean, classification_mlp1_bn.population_stdev, classification_mlp2_bn.population_mean, classification_mlp2_bn.population_stdev] update_values = [m_c1_bn, s_c1_bn, m_c2_bn, s_c2_bn, m_c3_bn, s_c3_bn, m_c4_bn, s_c4_bn, m_c5_bn, s_c5_bn, m_c6_bn, s_c6_bn, m_c_bn, s_c_bn, m_l_bn, s_l_bn, m_cl1_bn, s_cl1_bn, m_cl2_bn, s_cl2_bn] pop_updates.extend([(p, m) for p, m in zip(update_params, update_values)]) decay_rate = 0.05 extra_updates = [(p, m * decay_rate + p * (1 - decay_rate)) for p, m in pop_updates] algorithm.add_updates(extra_updates) # ------------------------------------------------------------------------ # Setup monitors p, l = edram.calculate_test(x, x_coarse) cost_where, cost_y = compute_cost(p, wr, y, l) cost = cost_y + cost_where cost = cost.sum(axis=0) cost = cost.mean() cost.name = 'cost' error_rate = MisclassificationRate().apply(y, p[-1]) error_rate.name = 'error_rate' monitors = [cost, error_rate] plotting_extensions = [] # Live plotting... if live_plotting: plot_channels = [ ['train_cost', 'test_cost'], ['train_error_rate', 'test_error_rate'], ] plotting_extensions = [ Plot(subdir, channels=plot_channels, server_url='http://155.69.150.60:80/') ] # ------------------------------------------------------------ mnist_cluttered_train = MNISTCluttered(which_sets=['train'], sources=('features', 'locations', 'labels')) mnist_cluttered_test = MNISTCluttered(which_sets=['test'], sources=('features', 'locations', 'labels')) main_loop = MainLoop( model=Model([bn_cost]), data_stream=DataStream.default_stream(mnist_cluttered_train, iteration_scheme=ShuffledScheme(mnist_cluttered_train.num_examples, batch_size)), algorithm=algorithm, extensions=[Timing(), FinishAfter(after_n_epochs=epochs), DataStreamMonitoring(monitors, DataStream.default_stream(mnist_cluttered_train, iteration_scheme=SequentialScheme(mnist_cluttered_train.num_examples, batch_size)), prefix='train'), DataStreamMonitoring(monitors, DataStream.default_stream(mnist_cluttered_test, iteration_scheme=SequentialScheme(mnist_cluttered_test.num_examples, batch_size)), prefix="test"), PartsOnlyCheckpoint("{}/{}".format(subdir, name), before_training=False, after_epoch=True, save_separately=['log', ]), TrackTheBest('test_error_rate', 'best_test_error_rate'), BestCheckpount("{}/{}".format(subdir, name), 'best_test_error_rate', save_separately=['model', ]), Printing(), ProgressBar(), PrintingTo("\n".join(lines), "{}/{}_log.txt".format(subdir, name)), ] + plotting_extensions) if oldmodel is not None: print("Initializing parameters with old model %s" % oldmodel) with open(oldmodel, "rb") as f: oldmodel = pickle.load(f) main_loop.model.set_parameter_values(oldmodel.get_parameter_values()) main_loop.model.get_top_bricks()[0].conv1_bn.population_mean.set_value(oldmodel.get_top_bricks()[0].conv1_bn.population_mean.get_value()) main_loop.model.get_top_bricks()[0].conv1_bn.population_stdev.set_value(oldmodel.get_top_bricks()[0].conv1_bn.population_stdev.get_value()) 
main_loop.model.get_top_bricks()[0].conv2_bn.population_mean.set_value(oldmodel.get_top_bricks()[0].conv2_bn.population_mean.get_value()) main_loop.model.get_top_bricks()[0].conv2_bn.population_stdev.set_value(oldmodel.get_top_bricks()[0].conv2_bn.population_stdev.get_value()) main_loop.model.get_top_bricks()[0].conv3_bn.population_mean.set_value(oldmodel.get_top_bricks()[0].conv3_bn.population_mean.get_value()) main_loop.model.get_top_bricks()[0].conv3_bn.population_stdev.set_value(oldmodel.get_top_bricks()[0].conv3_bn.population_stdev.get_value()) main_loop.model.get_top_bricks()[0].conv4_bn.population_mean.set_value(oldmodel.get_top_bricks()[0].conv4_bn.population_mean.get_value()) main_loop.model.get_top_bricks()[0].conv4_bn.population_stdev.set_value(oldmodel.get_top_bricks()[0].conv4_bn.population_stdev.get_value()) main_loop.model.get_top_bricks()[0].conv5_bn.population_mean.set_value(oldmodel.get_top_bricks()[0].conv5_bn.population_mean.get_value()) main_loop.model.get_top_bricks()[0].conv5_bn.population_stdev.set_value(oldmodel.get_top_bricks()[0].conv5_bn.population_stdev.get_value()) main_loop.model.get_top_bricks()[0].conv6_bn.population_mean.set_value(oldmodel.get_top_bricks()[0].conv6_bn.population_mean.get_value()) main_loop.model.get_top_bricks()[0].conv6_bn.population_stdev.set_value(oldmodel.get_top_bricks()[0].conv6_bn.population_stdev.get_value()) main_loop.model.get_top_bricks()[0].loc_mlp_bn.population_mean.set_value(oldmodel.get_top_bricks()[0].loc_mlp_bn.population_mean.get_value()) main_loop.model.get_top_bricks()[0].loc_mlp_bn.population_stdev.set_value(oldmodel.get_top_bricks()[0].loc_mlp_bn.population_stdev.get_value()) main_loop.model.get_top_bricks()[0].conv_mlp_bn.population_mean.set_value(oldmodel.get_top_bricks()[0].conv_mlp_bn.population_mean.get_value()) main_loop.model.get_top_bricks()[0].conv_mlp_bn.population_stdev.set_value(oldmodel.get_top_bricks()[0].conv_mlp_bn.population_stdev.get_value()) main_loop.model.get_top_bricks()[0].classification_mlp1_bn.population_mean.set_value( oldmodel.get_top_bricks()[0].classification_mlp1_bn.population_mean.get_value()) main_loop.model.get_top_bricks()[0].classification_mlp1_bn.population_stdev.set_value( oldmodel.get_top_bricks()[0].classification_mlp1_bn.population_stdev.get_value()) main_loop.model.get_top_bricks()[0].classification_mlp2_bn.population_mean.set_value( oldmodel.get_top_bricks()[0].classification_mlp2_bn.population_mean.get_value()) main_loop.model.get_top_bricks()[0].classification_mlp2_bn.population_stdev.set_value( oldmodel.get_top_bricks()[0].classification_mlp2_bn.population_stdev.get_value()) main_loop.model.get_top_bricks()[0].conv_init.layers[1].population_mean.set_value( oldmodel.get_top_bricks()[0].conv_init.layers[1].population_mean.get_value()) main_loop.model.get_top_bricks()[0].conv_init.layers[1].population_stdev.set_value( oldmodel.get_top_bricks()[0].conv_init.layers[1].population_stdev.get_value()) main_loop.model.get_top_bricks()[0].conv_init.layers[3].population_mean.set_value( oldmodel.get_top_bricks()[0].conv_init.layers[3].population_mean.get_value()) main_loop.model.get_top_bricks()[0].conv_init.layers[3].population_stdev.set_value( oldmodel.get_top_bricks()[0].conv_init.layers[3].population_stdev.get_value()) main_loop.model.get_top_bricks()[0].conv_init.layers[5].population_mean.set_value( oldmodel.get_top_bricks()[0].conv_init.layers[5].population_mean.get_value()) main_loop.model.get_top_bricks()[0].conv_init.layers[5].population_stdev.set_value( 
oldmodel.get_top_bricks()[0].conv_init.layers[5].population_stdev.get_value()) del oldmodel main_loop.run()
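# --- Hedged sketch: the long run of set_value calls above copies batch-norm
# population statistics from the old EDRAM model brick by brick. Assuming the
# brick and attribute names used above, the same copy can be expressed as a
# loop; `new_top` and `old_top` stand in for main_loop.model.get_top_bricks()[0]
# and oldmodel.get_top_bricks()[0]. This helper is a sketch and is not wired
# into the script.
def copy_population_statistics(new_top, old_top):
    bn_names = ['conv%d_bn' % i for i in range(1, 7)] + \
        ['loc_mlp_bn', 'conv_mlp_bn',
         'classification_mlp1_bn', 'classification_mlp2_bn']
    for name in bn_names:
        new_bn, old_bn = getattr(new_top, name), getattr(old_top, name)
        new_bn.population_mean.set_value(old_bn.population_mean.get_value())
        new_bn.population_stdev.set_value(old_bn.population_stdev.get_value())
    for index in (1, 3, 5):  # SpatialBatchNormalization layers inside conv_init
        new_bn, old_bn = new_top.conv_init.layers[index], old_top.conv_init.layers[index]
        new_bn.population_mean.set_value(old_bn.population_mean.get_value())
        new_bn.population_stdev.set_value(old_bn.population_stdev.get_value())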
def main(mode, save_to, num_epochs, load_params=None, feature_maps=None, mlp_hiddens=None, conv_sizes=None, pool_sizes=None, stride=None, repeat_times=None, batch_size=None, num_batches=None, algo=None, test_set=None, valid_examples=None, dropout=None, max_norm=None, weight_decay=None, batch_norm=None): if feature_maps is None: feature_maps = [20, 50, 50] if mlp_hiddens is None: mlp_hiddens = [500] if conv_sizes is None: conv_sizes = [5, 5, 5] if pool_sizes is None: pool_sizes = [2, 2, 2] if repeat_times is None: repeat_times = [1, 1, 1] if batch_size is None: batch_size = 500 if valid_examples is None: valid_examples = 2500 if stride is None: stride = 1 if test_set is None: test_set = 'test' if algo is None: algo = 'rmsprop' if batch_norm is None: batch_norm = False image_size = (128, 128) output_size = 2 if (len(feature_maps) != len(conv_sizes) or len(feature_maps) != len(pool_sizes) or len(feature_maps) != len(repeat_times)): raise ValueError("OMG, inconsistent arguments") # Use ReLUs everywhere and softmax for the final prediction conv_activations = [Rectifier() for _ in feature_maps] mlp_activations = [Rectifier() for _ in mlp_hiddens] + [Softmax()] convnet = LeNet(conv_activations, 3, image_size, stride=stride, filter_sizes=zip(conv_sizes, conv_sizes), feature_maps=feature_maps, pooling_sizes=zip(pool_sizes, pool_sizes), repeat_times=repeat_times, top_mlp_activations=mlp_activations, top_mlp_dims=mlp_hiddens + [output_size], border_mode='full', batch_norm=batch_norm, weights_init=Glorot(), biases_init=Constant(0)) # We push initialization config to set different initialization schemes # for convolutional layers. convnet.initialize() logging.info("Input dim: {} {} {}".format( *convnet.children[0].get_dim('input_'))) for i, layer in enumerate(convnet.layers): if isinstance(layer, Activation): logging.info("Layer {} ({})".format( i, layer.__class__.__name__)) else: logging.info("Layer {} ({}) dim: {} {} {}".format( i, layer.__class__.__name__, *layer.get_dim('output'))) single_x = tensor.tensor3('image_features') x = tensor.tensor4('image_features') single_y = tensor.lvector('targets') y = tensor.lmatrix('targets') # Training with batch_normalization(convnet): probs = convnet.apply(x) cost = (CategoricalCrossEntropy().apply(y.flatten(), probs) .copy(name='cost')) error_rate = (MisclassificationRate().apply(y.flatten(), probs) .copy(name='error_rate')) cg = ComputationGraph([cost, error_rate]) extra_updates = [] if batch_norm: # batch norm: logger.debug("Apply batch norm") pop_updates = get_batch_normalization_updates(cg) # p stands for population mean # m stands for minibatch alpha = 0.005 extra_updates = [(p, m * alpha + p * (1 - alpha)) for p, m in pop_updates] population_statistics = [p for p, m in extra_updates] if dropout: relu_outputs = VariableFilter(bricks=[Rectifier], roles=[OUTPUT])(cg) cg = apply_dropout(cg, relu_outputs, dropout) cost, error_rate = cg.outputs if weight_decay: logger.debug("Apply weight decay {}".format(weight_decay)) cost += weight_decay * l2_norm(cg.parameters) cost.name = 'cost' # Validation valid_probs = convnet.apply_5windows(single_x) valid_cost = (CategoricalCrossEntropy().apply(single_y, valid_probs) .copy(name='cost')) valid_error_rate = (MisclassificationRate().apply( single_y, valid_probs).copy(name='error_rate')) model = Model([cost, error_rate]) if load_params: logger.info("Loaded params from {}".format(load_params)) with open(load_params, 'r') as src: model.set_parameter_values(load_parameters(src)) # Training stream with random cropping train = 
DogsVsCats(("train",), subset=slice(None, 25000 - valid_examples, None)) train_str = DataStream( train, iteration_scheme=ShuffledScheme(train.num_examples, batch_size)) train_str = add_transformers(train_str, random_crop=True) # Validation stream without cropping valid = DogsVsCats(("train",), subset=slice(25000 - valid_examples, None, None)) valid_str = DataStream( valid, iteration_scheme=SequentialExampleScheme(valid.num_examples)) valid_str = add_transformers(valid_str) if mode == 'train': directory, _ = os.path.split(sys.argv[0]) env = dict(os.environ) env['THEANO_FLAGS'] = 'floatX=float32' port = numpy.random.randint(1025, 10000) server = subprocess.Popen( [directory + '/server.py', str(25000 - valid_examples), str(batch_size), str(port)], env=env, stderr=subprocess.STDOUT) train_str = ServerDataStream( ('image_features', 'targets'), produces_examples=False, port=port) save_to_base, save_to_extension = os.path.splitext(save_to) # Train with simple SGD if algo == 'rmsprop': step_rule = RMSProp(decay_rate=0.999, learning_rate=0.0003) elif algo == 'adam': step_rule = Adam() else: assert False if max_norm: conv_params = VariableFilter(bricks=[Convolutional], roles=[WEIGHT])(cg) linear_params = VariableFilter(bricks=[Linear], roles=[WEIGHT])(cg) step_rule = CompositeRule( [step_rule, Restrict(VariableClipping(max_norm, axis=0), linear_params), Restrict(VariableClipping(max_norm, axis=(1, 2, 3)), conv_params)]) algorithm = GradientDescent( cost=cost, parameters=model.parameters, step_rule=step_rule) algorithm.add_updates(extra_updates) # `Timing` extension reports time for reading data, aggregating a batch # and monitoring; # `ProgressBar` displays a nice progress bar during training. extensions = [Timing(every_n_batches=100), FinishAfter(after_n_epochs=num_epochs, after_n_batches=num_batches), DataStreamMonitoring( [valid_cost, valid_error_rate], valid_str, prefix="valid"), TrainingDataMonitoring( [cost, error_rate, aggregation.mean(algorithm.total_gradient_norm)], prefix="train", after_epoch=True), TrackTheBest("valid_error_rate"), Checkpoint(save_to, save_separately=['log'], parameters=cg.parameters + (population_statistics if batch_norm else []), before_training=True, after_epoch=True) .add_condition( ['after_epoch'], OnLogRecord("valid_error_rate_best_so_far"), (save_to_base + '_best' + save_to_extension,)), Printing(every_n_batches=100)] model = Model(cost) main_loop = MainLoop( algorithm, train_str, model=model, extensions=extensions) try: main_loop.run() finally: server.terminate() elif mode == 'test': classify = theano.function([single_x], valid_probs.argmax()) test = DogsVsCats((test_set,)) test_str = DataStream( test, iteration_scheme=SequentialExampleScheme(test.num_examples)) test_str = add_transformers(test_str) correct = 0 with open(save_to, 'w') as dst: print("id", "label", sep=',', file=dst) for index, example in enumerate(test_str.get_epoch_iterator()): image = example[0] prediction = classify(image) print(index + 1, classify(image), sep=',', file=dst) if len(example) > 1 and prediction == example[1]: correct += 1 print(correct / float(test.num_examples)) else: assert False
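# --- Hedged numpy sketch of the max-norm constraint imposed above through
# Restrict(VariableClipping(max_norm, axis=...)): the weight tensor is assumed
# to be rescaled so that its L2 norm along the given axes does not exceed the
# threshold (axis=0 for Linear weights, axis=(1, 2, 3) for convolution
# filters). Arrays and numbers below are illustrative only.
import numpy

def clip_norms(weights, threshold, axes):
    norms = numpy.sqrt((weights ** 2).sum(axis=axes, keepdims=True))
    scale = numpy.minimum(1.0, threshold / numpy.maximum(norms, 1e-7))
    return weights * scale

linear_w = numpy.random.RandomState(1).randn(512, 2).astype('float32')
clipped_w = clip_norms(linear_w, threshold=3.0, axes=0)   # one norm per output unit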
max_mu = mu.max().copy(name="mu_max") min_binary = binary.min().copy(name="binary_min") mean_binary = binary.mean().copy(name="binary_mean") max_binary = binary.max().copy(name="binary_max") data_monitoring += [mean_sigma, min_sigma, min_mu, max_mu, mean_mu, max_sigma, mean_binary, min_binary, max_binary] ################# # Algorithm ################# n_batches = 200 algorithm = GradientDescent( cost=cost, parameters=cg.parameters, step_rule=CompositeRule([StepClipping(10.0), Adam(lr)]) ) algorithm.add_updates(extra_updates) train_monitor = TrainingDataMonitoring( variables=monitoring_variables + data_monitoring, every_n_batches=n_batches, prefix="train" ) valid_monitor = DataStreamMonitoring( monitoring_variables + data_monitoring, valid_stream, every_n_batches=n_batches, prefix="valid" ) extensions = [ ProgressBar(), Timing(every_n_batches=n_batches), train_monitor,
def run(batch_size, save_path, z_dim, oldmodel, discriminative_regularization, classifier, vintage, monitor_every, monitor_before, checkpoint_every, dataset, color_convert, image_size, net_depth, subdir, reconstruction_factor, kl_factor, discriminative_factor, disc_weights, num_epochs): if dataset: streams = create_custom_streams(filename=dataset, training_batch_size=batch_size, monitoring_batch_size=batch_size, include_targets=False, color_convert=color_convert) else: streams = create_celeba_streams(training_batch_size=batch_size, monitoring_batch_size=batch_size, include_targets=False) main_loop_stream, train_monitor_stream, valid_monitor_stream = streams[:3] # Compute parameter updates for the batch normalization population # statistics. They are updated following an exponential moving average. rval = create_training_computation_graphs( z_dim, image_size, net_depth, discriminative_regularization, classifier, vintage, reconstruction_factor, kl_factor, discriminative_factor, disc_weights) cg, bn_cg, variance_parameters = rval pop_updates = list( set(get_batch_normalization_updates(bn_cg, allow_duplicates=True))) decay_rate = 0.05 extra_updates = [(p, m * decay_rate + p * (1 - decay_rate)) for p, m in pop_updates] model = Model(bn_cg.outputs[0]) selector = Selector( find_bricks( model.top_bricks, lambda brick: brick.name in ('encoder_convnet', 'encoder_mlp', 'decoder_convnet', 'decoder_mlp'))) parameters = list(selector.get_parameters().values()) + variance_parameters # Prepare algorithm step_rule = Adam() algorithm = GradientDescent(cost=bn_cg.outputs[0], parameters=parameters, step_rule=step_rule) algorithm.add_updates(extra_updates) # Prepare monitoring sys.setrecursionlimit(1000000) monitored_quantities_list = [] for graph in [bn_cg, cg]: # cost, kl_term, reconstruction_term, discriminative_term = graph.outputs cost, kl_term, reconstruction_term, discriminative_term = graph.outputs[:4] discriminative_layer_terms = graph.outputs[4:] cost.name = 'nll_upper_bound' avg_kl_term = kl_term.mean(axis=0) avg_kl_term.name = 'avg_kl_term' avg_reconstruction_term = -reconstruction_term.mean(axis=0) avg_reconstruction_term.name = 'avg_reconstruction_term' avg_discriminative_term = discriminative_term.mean(axis=0) avg_discriminative_term.name = 'avg_discriminative_term' num_layer_terms = len(discriminative_layer_terms) avg_discriminative_layer_terms = [None] * num_layer_terms for i, term in enumerate(discriminative_layer_terms): avg_discriminative_layer_terms[i] = discriminative_layer_terms[i].mean(axis=0) avg_discriminative_layer_terms[i].name = "avg_discriminative_term_layer_{:02d}".format(i) monitored_quantities_list.append( [cost, avg_kl_term, avg_reconstruction_term, avg_discriminative_term] + avg_discriminative_layer_terms) train_monitoring = DataStreamMonitoring( monitored_quantities_list[0], train_monitor_stream, prefix="train", updates=extra_updates, after_epoch=False, before_first_epoch=monitor_before, every_n_epochs=monitor_every) valid_monitoring = DataStreamMonitoring( monitored_quantities_list[1], valid_monitor_stream, prefix="valid", after_epoch=False, before_first_epoch=monitor_before, every_n_epochs=monitor_every) # Prepare checkpoint checkpoint = Checkpoint(save_path, every_n_epochs=checkpoint_every, before_training=True, use_cpickle=True) sample_checkpoint = SampleCheckpoint(interface=DiscGenModel, z_dim=z_dim/2, image_size=(image_size, image_size), channels=3, dataset=dataset, split="valid", save_subdir=subdir, before_training=True, after_epoch=True) # TODO: why does z_dim=foo 
become foo/2? extensions = [Timing(), FinishAfter(after_n_epochs=num_epochs), checkpoint, sample_checkpoint, train_monitoring, valid_monitoring, Printing(), ProgressBar()] main_loop = MainLoop(model=model, data_stream=main_loop_stream, algorithm=algorithm, extensions=extensions) if oldmodel is not None: print("Initializing parameters with old model {}".format(oldmodel)) try: saved_model = load(oldmodel) except AttributeError: # newer version of blocks with open(oldmodel, 'rb') as src: saved_model = load(src) main_loop.model.set_parameter_values( saved_model.model.get_parameter_values()) del saved_model main_loop.run()
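# --- Hedged numpy sketch of the update assembled in `extra_updates` above:
# each batch-norm population statistic p tracks its minibatch estimate m
# through an exponential moving average, p <- m * decay_rate + p * (1 - decay_rate),
# with decay_rate = 0.05. Purely illustrative values.
import numpy

def ema_update(population, minibatch, decay_rate=0.05):
    return minibatch * decay_rate + population * (1 - decay_rate)

population = numpy.zeros(4, dtype='float32')
for minibatch_stat in numpy.random.RandomState(2).randn(100, 4).astype('float32'):
    population = ema_update(population, minibatch_stat)
print(population)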
from blocks.serialization import load main_loop = load(args.continue_from) main_loop.run() sys.exit(0) graphs, extensions, updates = construct_graphs(args, nclasses, sequence_length) ### optimization algorithm definition step_rule = CompositeRule([ StepClipping(1.), #Momentum(learning_rate=args.learning_rate, momentum=0.9), RMSProp(learning_rate=args.learning_rate, decay_rate=0.5), ]) algorithm = GradientDescent(cost=graphs["training"].outputs[0], parameters=graphs["training"].parameters, step_rule=step_rule) algorithm.add_updates(updates["training"]) model = Model(graphs["training"].outputs[0]) extensions = extensions["training"] + extensions["inference"] # step monitor (after epoch to limit the log size) step_channels = [] step_channels.extend([ algorithm.steps[param].norm(2).copy(name="step_norm:%s" % name) for name, param in model.get_parameter_dict().items()]) step_channels.append(algorithm.total_step_norm.copy(name="total_step_norm")) step_channels.append(algorithm.total_gradient_norm.copy(name="total_gradient_norm")) step_channels.extend(graphs["training"].outputs) logger.warning("constructing training data monitor")
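# --- Hedged numpy sketch of the global step clipping used in the CompositeRule
# above (StepClipping(1.) ahead of RMSProp): if the combined L2 norm of the
# steps it receives exceeds the threshold, every step is rescaled by
# threshold / norm. Stand-alone illustration, not the library code.
import numpy

def clip_steps(steps, threshold=1.0):
    total_norm = numpy.sqrt(sum((s ** 2).sum() for s in steps))
    if total_norm > threshold:
        steps = [s * (threshold / total_norm) for s in steps]
    return steps

example_steps = [numpy.ones((3, 3), dtype='float32'), numpy.ones(3, dtype='float32')]
clipped_steps = clip_steps(example_steps, threshold=1.0)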
def train_model(cost, unregularized_cost, updates, train_stream, valid_stream, args, gate_values=None): step_rule = learning_algorithm(args) cg = ComputationGraph(cost) # ADD REGULARIZATION # WEIGHT NOISE weight_noise = args.weight_noise if weight_noise > 0: weights = VariableFilter(roles=[WEIGHT])(cg.variables) cg_train = apply_noise(cg, weights, weight_noise) cost = cg_train.outputs[0] cost.name = "cost_with_weight_noise" cg = ComputationGraph(cost) logger.info(cg.parameters) # Define algorithm algorithm = GradientDescent(cost=cost, step_rule=step_rule, parameters=cg.parameters) # Add the updates to carry the hidden state algorithm.add_updates(updates) # Extensions to be added extensions = [] # Load from a dumped model if args.load_path is not None: extensions.append(Load(args.load_path)) # Generation extension if args.generate: extensions.append(TextGenerationExtension( cost=cost, generation_length=args.generated_text_lenght, initial_text_length=args.initial_text_length, every_n_batches=1, ploting_path=os.path.join(args.save_path, 'prob_plot.png'), softmax_sampling=args.softmax_sampling, dataset=args.dataset, updates=updates, interactive_mode=args.interactive_mode)) # Training and Validation score monitoring extensions.extend([ TrainingDataMonitoring([cost], prefix='train', every_n_batches=args.monitoring_freq), DataStreamMonitoring([cost, unregularized_cost], valid_stream, args.mini_batch_size_valid, args.dataset, state_updates=updates, prefix='valid', before_first_epoch=(args.visualize == "nothing"), every_n_batches=args.monitoring_freq)]) # Creating directory for saving model. if not args.interactive_mode: if not os.path.exists(args.save_path): os.makedirs(args.save_path) elif 'test' in args.save_path: print "Rewriting in " + args.save_path else: raise Exception('Directory already exists') # Early stopping extensions.append(EarlyStopping('valid_' + unregularized_cost.name, args.patience, args.save_path, every_n_batches=args.monitoring_freq)) # Printing extensions.append(ProgressBar()) extensions.append(Printing(every_n_batches=args.monitoring_freq)) # Reset the initial states if args.dataset == "sine": reset_frequency = 1 else: reset_frequency = 100 extensions.append(ResetStates([v for v, _ in updates], every_n_batches=reset_frequency)) # Visualizing extensions if args.interactive_mode: extensions.append(InteractiveMode()) main_loop = MainLoop( model=Model(cost), data_stream=train_stream, algorithm=algorithm, extensions=extensions ) main_loop.run()
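# --- Hedged numpy sketch of the weight-noise regularisation applied above via
# apply_noise(cg, weights, weight_noise): during training each weight matrix is
# assumed to be perturbed with zero-mean Gaussian noise whose standard
# deviation is `weight_noise`. Illustrative stand-in, not the Blocks graph
# transformation itself.
import numpy

def with_weight_noise(weights, level, rng):
    return weights + rng.normal(0.0, level, size=weights.shape).astype(weights.dtype)

rng = numpy.random.RandomState(3)
w = numpy.zeros((4, 4), dtype='float32')
w_noisy = with_weight_noise(w, level=0.075, rng=rng)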