def test_get_batch_normalization_updates(self):
    """Test that get_batch_normalization_updates works as expected."""
    with batch_normalization(self.mlp):
        y_bn = self.mlp.apply(self.x)
    graph = ComputationGraph([y_bn])
    updates = get_batch_normalization_updates(graph)
    self.simple_assertions(updates)
def test_batch_normalized_mlp_transformed():
    """Smoke test that a graph involving a BatchNormalizedMLP transforms."""
    x = tensor.matrix('x')
    mlp = BatchNormalizedMLP([Tanh(), Tanh()], [5, 7, 9])
    with batch_normalization(mlp):
        y = mlp.apply(x)
    assert len(get_batch_normalization_updates(ComputationGraph([y]))) == 4
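# The four updates asserted above come from two BatchNormalization bricks, each
# pairing a population statistic with its minibatch estimate: (population mean,
# batch mean) and (population stdev, batch stdev). A minimal, hedged sketch of how
# such pairs are typically folded into a training step as an exponential moving
# average; the function name and the alpha value are illustrative, not taken from
# the examples above:

from theano import tensor
from blocks.algorithms import GradientDescent, Scale
from blocks.bricks import BatchNormalizedMLP, Tanh
from blocks.graph import (ComputationGraph, batch_normalization,
                          get_batch_normalization_updates)
from blocks.initialization import Constant, IsotropicGaussian


def ema_updates_sketch(alpha=0.1):
    x = tensor.matrix('x')
    mlp = BatchNormalizedMLP([Tanh(), Tanh()], [5, 7, 9],
                             weights_init=IsotropicGaussian(0.1),
                             biases_init=Constant(0))
    mlp.initialize()
    with batch_normalization(mlp):
        cost = mlp.apply(x).sum()
    cg = ComputationGraph([cost])
    pop_updates = get_batch_normalization_updates(cg)
    # New population value = (1 - alpha) * old value + alpha * minibatch estimate.
    extra_updates = [(pop, pop * (1 - alpha) + batch * alpha)
                     for pop, batch in pop_updates]
    algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                                step_rule=Scale(0.01))
    algorithm.add_updates(extra_updates)
    return algorithm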
def run(discriminative_regularization=True):
    streams = create_celeba_streams(training_batch_size=100,
                                    monitoring_batch_size=500,
                                    include_targets=False)
    main_loop_stream, train_monitor_stream, valid_monitor_stream = streams[:3]

    # Compute parameter updates for the batch normalization population
    # statistics. They are updated following an exponential moving average.
    rval = create_training_computation_graphs(discriminative_regularization)
    cg, bn_cg, variance_parameters = rval
    pop_updates = list(
        set(get_batch_normalization_updates(bn_cg, allow_duplicates=True)))
    decay_rate = 0.05
    extra_updates = [(p, m * decay_rate + p * (1 - decay_rate))
                     for p, m in pop_updates]

    model = Model(bn_cg.outputs[0])
    selector = Selector(
        find_bricks(
            model.top_bricks,
            lambda brick: brick.name in ('encoder_convnet', 'encoder_mlp',
                                         'decoder_convnet', 'decoder_mlp')))
    parameters = list(selector.get_parameters().values()) + variance_parameters

    # Prepare algorithm
    step_rule = Adam()
    algorithm = GradientDescent(cost=bn_cg.outputs[0],
                                parameters=parameters,
                                step_rule=step_rule)
    algorithm.add_updates(extra_updates)

    # Prepare monitoring
    monitored_quantities_list = []
    for graph in [bn_cg, cg]:
        cost, kl_term, reconstruction_term = graph.outputs
        cost.name = 'nll_upper_bound'
        avg_kl_term = kl_term.mean(axis=0)
        avg_kl_term.name = 'avg_kl_term'
        avg_reconstruction_term = -reconstruction_term.mean(axis=0)
        avg_reconstruction_term.name = 'avg_reconstruction_term'
        monitored_quantities_list.append(
            [cost, avg_kl_term, avg_reconstruction_term])
    train_monitoring = DataStreamMonitoring(
        monitored_quantities_list[0], train_monitor_stream, prefix="train",
        updates=extra_updates, after_epoch=False, before_first_epoch=False,
        every_n_epochs=5)
    valid_monitoring = DataStreamMonitoring(
        monitored_quantities_list[1], valid_monitor_stream, prefix="valid",
        after_epoch=False, before_first_epoch=False, every_n_epochs=5)

    # Prepare checkpoint
    save_path = 'celeba_vae_{}regularization.zip'.format(
        '' if discriminative_regularization else 'no_')
    checkpoint = Checkpoint(save_path, every_n_epochs=5, use_cpickle=True)

    extensions = [Timing(), FinishAfter(after_n_epochs=75), train_monitoring,
                  valid_monitoring, checkpoint, Printing(), ProgressBar()]
    main_loop = MainLoop(data_stream=main_loop_stream, algorithm=algorithm,
                         extensions=extensions)
    main_loop.run()
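# In the function above only the parameters of the named encoder/decoder bricks
# (plus the learned output-variance parameters) are optimized. find_bricks is
# defined elsewhere in that project; a hedged sketch of a similar recursive
# selection using blocks.select alone, with the brick names taken from the snippet
# above and the helper name purely illustrative:

from blocks.select import Selector


def select_named_brick_parameters(model, names=('encoder_convnet', 'encoder_mlp',
                                                'decoder_convnet', 'decoder_mlp')):
    found, frontier = [], list(model.top_bricks)
    while frontier:
        brick = frontier.pop()
        if brick.name in names:
            found.append(brick)
        frontier.extend(brick.children)
    return list(Selector(found).get_parameters().values())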
def test_get_batch_normalization_updates_allow_duplicates(self):
    """Test get_batch_normalization_updates(allow_duplicates=True)."""
    with batch_normalization(self.mlp):
        y = self.mlp.apply(self.x)
        y2 = self.mlp.apply(self.x)
    graph = ComputationGraph([y, y2])
    updates = get_batch_normalization_updates(graph, allow_duplicates=True)
    self.simple_assertions(updates, num_bricks=2, num_updates=8)
def test_get_batch_normalization_updates_mean_only(self):
    """Test get_batch_normalization_updates with mean_only bricks."""
    mlp = BatchNormalizedMLP([Tanh(), Tanh()], [5, 7, 9], mean_only=True)
    with batch_normalization(mlp):
        y_bn = mlp.apply(self.x)
    graph = ComputationGraph([y_bn])
    updates = get_batch_normalization_updates(graph)
    self.simple_assertions(updates, num_updates=2, mean_only=True)
def test_get_batch_normalization_updates_non_training_applications(self):
    """Test updates extraction in a graph with a non-training apply."""
    y = self.mlp.apply(self.x)
    with batch_normalization(self.mlp):
        y_bn = self.mlp.apply(self.x)
    graph = ComputationGraph([y_bn, y])
    updates = get_batch_normalization_updates(graph)
    self.simple_assertions(updates)
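# Only applications made inside the batch_normalization context are annotated with
# minibatch estimates, so the plain inference-mode application above contributes no
# additional update pairs. A hedged, self-contained sketch of the same check
# outside a test class; the names are illustrative and the expected count of four
# mirrors the smoke test assertion earlier:

from theano import tensor
from blocks.bricks import BatchNormalizedMLP, Tanh
from blocks.graph import (ComputationGraph, batch_normalization,
                          get_batch_normalization_updates)


def check_non_training_application():
    x = tensor.matrix('x')
    mlp = BatchNormalizedMLP([Tanh(), Tanh()], [5, 7, 9])
    y_inference = mlp.apply(x)      # inference mode, uses population statistics
    with batch_normalization(mlp):
        y_training = mlp.apply(x)   # training mode, annotated with estimates
    updates = get_batch_normalization_updates(
        ComputationGraph([y_training, y_inference]))
    assert len(updates) == 4        # two layers times (mean, stdev)
    return updates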
def run():
    streams = create_celeba_streams(training_batch_size=100,
                                    monitoring_batch_size=500,
                                    include_targets=True)
    main_loop_stream = streams[0]
    train_monitor_stream = streams[1]
    valid_monitor_stream = streams[2]

    cg, bn_dropout_cg = create_training_computation_graphs()

    # Compute parameter updates for the batch normalization population
    # statistics. They are updated following an exponential moving average.
    pop_updates = get_batch_normalization_updates(bn_dropout_cg)
    decay_rate = 0.05
    extra_updates = [(p, m * decay_rate + p * (1 - decay_rate))
                     for p, m in pop_updates]

    # Prepare algorithm
    step_rule = Adam()
    algorithm = GradientDescent(cost=bn_dropout_cg.outputs[0],
                                parameters=bn_dropout_cg.parameters,
                                step_rule=step_rule)
    algorithm.add_updates(extra_updates)

    # Prepare monitoring
    cost = bn_dropout_cg.outputs[0]
    cost.name = 'cost'
    train_monitoring = DataStreamMonitoring(
        [cost], train_monitor_stream, prefix="train",
        before_first_epoch=False, after_epoch=False, after_training=True,
        updates=extra_updates)

    cost, accuracy = cg.outputs
    cost.name = 'cost'
    accuracy.name = 'accuracy'
    monitored_quantities = [cost, accuracy]
    valid_monitoring = DataStreamMonitoring(
        monitored_quantities, valid_monitor_stream, prefix="valid",
        before_first_epoch=False, after_epoch=False, every_n_epochs=5)

    # Prepare checkpoint
    checkpoint = Checkpoint(
        'celeba_classifier.zip', every_n_epochs=5, use_cpickle=True)

    extensions = [Timing(), FinishAfter(after_n_epochs=50), train_monitoring,
                  valid_monitoring, checkpoint, Printing(), ProgressBar()]
    main_loop = MainLoop(data_stream=main_loop_stream, algorithm=algorithm,
                         extensions=extensions)
    main_loop.run()
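# Note on the graphs above: bn_dropout_cg is the training graph (minibatch
# statistics plus dropout) while cg is the clean graph whose BatchNormalization
# bricks read the population statistics maintained by extra_updates; passing
# updates=extra_updates to the training monitor keeps those statistics fresh while
# it runs. A hedged sketch of compiling the clean graph for evaluation afterwards;
# the helper name is illustrative and cg is a blocks ComputationGraph as above:

import theano


def compile_inference(cg):
    # cg.inputs are the symbolic inputs of the graph; cg.outputs are
    # [cost, accuracy] in the classifier example above.
    return theano.function(cg.inputs, cg.outputs, on_unused_input='warn')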
def run(model_name, port_train, port_valid): running_on_laptop = socket.gethostname() == 'yop' X = tensor.tensor4('image_features', dtype='float32') T = tensor.matrix('targets', dtype='float32') image_border_size = (100, 100) if running_on_laptop: host_plot = 'http://*****:*****@ %s' % (model_name, datetime.datetime.now(), socket.gethostname()), channels=[['loss'], ['error', 'valid_error']], after_epoch=True, server_url=host_plot), Printing(), Checkpoint('/tmp/train_bn2') ] main_loop = MainLoop(data_stream=train_stream, algorithm=algorithm, extensions=extensions, model=model) main_loop.run()
def run(batch_size, save_path, z_dim, oldmodel, discriminative_regularization,
        classifier, vintage, monitor_every, monitor_before, checkpoint_every,
        dataset, color_convert, image_size, net_depth, subdir,
        reconstruction_factor, kl_factor, discriminative_factor, disc_weights,
        num_epochs):
    if dataset:
        streams = create_custom_streams(filename=dataset,
                                        training_batch_size=batch_size,
                                        monitoring_batch_size=batch_size,
                                        include_targets=False,
                                        color_convert=color_convert)
    else:
        streams = create_celeba_streams(training_batch_size=batch_size,
                                        monitoring_batch_size=batch_size,
                                        include_targets=False)
    main_loop_stream, train_monitor_stream, valid_monitor_stream = streams[:3]

    # Compute parameter updates for the batch normalization population
    # statistics. They are updated following an exponential moving average.
    rval = create_training_computation_graphs(
        z_dim, image_size, net_depth, discriminative_regularization,
        classifier, vintage, reconstruction_factor, kl_factor,
        discriminative_factor, disc_weights)
    cg, bn_cg, variance_parameters = rval
    pop_updates = list(
        set(get_batch_normalization_updates(bn_cg, allow_duplicates=True)))
    decay_rate = 0.05
    extra_updates = [(p, m * decay_rate + p * (1 - decay_rate))
                     for p, m in pop_updates]

    model = Model(bn_cg.outputs[0])
    selector = Selector(
        find_bricks(
            model.top_bricks,
            lambda brick: brick.name in ('encoder_convnet', 'encoder_mlp',
                                         'decoder_convnet', 'decoder_mlp')))
    parameters = list(selector.get_parameters().values()) + variance_parameters

    # Prepare algorithm
    step_rule = Adam()
    algorithm = GradientDescent(cost=bn_cg.outputs[0],
                                parameters=parameters,
                                step_rule=step_rule)
    algorithm.add_updates(extra_updates)

    # Prepare monitoring
    sys.setrecursionlimit(1000000)
    monitored_quantities_list = []
    for graph in [bn_cg, cg]:
        # cost, kl_term, reconstruction_term, discriminative_term = graph.outputs
        cost, kl_term, reconstruction_term, discriminative_term = \
            graph.outputs[:4]
        discriminative_layer_terms = graph.outputs[4:]
        cost.name = 'nll_upper_bound'
        avg_kl_term = kl_term.mean(axis=0)
        avg_kl_term.name = 'avg_kl_term'
        avg_reconstruction_term = -reconstruction_term.mean(axis=0)
        avg_reconstruction_term.name = 'avg_reconstruction_term'
        avg_discriminative_term = discriminative_term.mean(axis=0)
        avg_discriminative_term.name = 'avg_discriminative_term'
        num_layer_terms = len(discriminative_layer_terms)
        avg_discriminative_layer_terms = [None] * num_layer_terms
        for i, term in enumerate(discriminative_layer_terms):
            avg_discriminative_layer_terms[i] = \
                discriminative_layer_terms[i].mean(axis=0)
            avg_discriminative_layer_terms[i].name = \
                "avg_discriminative_term_layer_{:02d}".format(i)
        monitored_quantities_list.append(
            [cost, avg_kl_term, avg_reconstruction_term,
             avg_discriminative_term] + avg_discriminative_layer_terms)
    train_monitoring = DataStreamMonitoring(
        monitored_quantities_list[0], train_monitor_stream, prefix="train",
        updates=extra_updates, after_epoch=False,
        before_first_epoch=monitor_before, every_n_epochs=monitor_every)
    valid_monitoring = DataStreamMonitoring(
        monitored_quantities_list[1], valid_monitor_stream, prefix="valid",
        after_epoch=False, before_first_epoch=monitor_before,
        every_n_epochs=monitor_every)

    # Prepare checkpoint
    checkpoint = Checkpoint(save_path, every_n_epochs=checkpoint_every,
                            before_training=True, use_cpickle=True)
    sample_checkpoint = SampleCheckpoint(interface=DiscGenModel,
                                         z_dim=z_dim / 2,
                                         image_size=(image_size, image_size),
                                         channels=3, dataset=dataset,
                                         split="valid", save_subdir=subdir,
                                         before_training=True,
                                         after_epoch=True)
    # TODO: why does z_dim=foo become foo/2?

    extensions = [Timing(), FinishAfter(after_n_epochs=num_epochs), checkpoint,
                  sample_checkpoint, train_monitoring, valid_monitoring,
                  Printing(), ProgressBar()]
    main_loop = MainLoop(model=model, data_stream=main_loop_stream,
                         algorithm=algorithm, extensions=extensions)

    if oldmodel is not None:
        print("Initializing parameters with old model {}".format(oldmodel))
        try:
            saved_model = load(oldmodel)
        except AttributeError:
            # newer version of blocks
            with open(oldmodel, 'rb') as src:
                saved_model = load(src)
        main_loop.model.set_parameter_values(
            saved_model.model.get_parameter_values())
        del saved_model
    main_loop.run()
def main(port_data): mlp_hiddens = [500] filter_sizes = [(3,3),(3,3)] feature_maps = [20, 20] pooling_sizes = [(3,3),(2,2)] save_to="DvC.pkl" image_size = (128, 128) output_size = 2 learningRate=0.1 num_epochs=300 num_batches=None if socket.gethostname()=='tim-X550JX':host_plot = 'http://*****:*****@ %s' % ('CNN ', datetime.datetime.now(), socket.gethostname()), channels=[['train_error_rate', 'valid_error_rate'], ['train_total_gradient_norm']], after_epoch=True, server_url=host_plot)) model = Model(cost) main_loop = MainLoop( algorithm, stream_data_train, model=model, extensions=extensions) main_loop.run()
def main(num_epochs=50, batch_normalized=True, alpha=0.1):
    """Run the example.

    Parameters
    ----------
    num_epochs : int, optional
        Number of epochs for which to train.
    batch_normalized : bool, optional
        Batch-normalize the training graph. Defaults to `True`.
    alpha : float, optional
        Weight to apply to a new sample when calculating running averages
        for population statistics (1 - alpha weight is given to the
        existing average).

    """
    if batch_normalized:
        # Add an extra keyword argument that only BatchNormalizedMLP takes,
        # in order to speed things up at the cost of a bit of extra memory.
        mlp_class = BatchNormalizedMLP
        extra_kwargs = {'conserve_memory': False}
    else:
        mlp_class = MLP
        extra_kwargs = {}
    mlp = mlp_class([Logistic(), Logistic(), Logistic(), Softmax()],
                    [2, 5, 5, 5, 3],
                    weights_init=IsotropicGaussian(0.2),
                    biases_init=Constant(0.), **extra_kwargs)
    mlp.initialize()

    # Generate a dataset with 3 spiral arms, using 8000 examples for
    # training and 2000 for testing.
    dataset = Spiral(num_examples=10000, classes=3,
                     sources=['features', 'label'], noise=0.05)
    train_stream = DataStream(dataset,
                              iteration_scheme=ShuffledScheme(
                                  examples=8000, batch_size=20))
    test_stream = DataStream(dataset,
                             iteration_scheme=SequentialScheme(
                                 examples=list(range(8000, 10000)),
                                 batch_size=2000))

    # Build a cost graph; this contains BatchNormalization bricks that will
    # by default run in inference mode.
    features = tensor.matrix('features')
    label = tensor.lvector('label')
    prediction = mlp.apply(features)
    cost = CategoricalCrossEntropy().apply(label, prediction)
    misclass = MisclassificationRate().apply(label, prediction)
    misclass.name = 'misclass'  # The default name for this is annoyingly long
    original_cg = ComputationGraph([cost, misclass])

    if batch_normalized:
        cg = apply_batch_normalization(original_cg)
        # Add updates for population parameters
        pop_updates = get_batch_normalization_updates(cg)
        extra_updates = [(p, m * alpha + p * (1 - alpha))
                         for p, m in pop_updates]
    else:
        cg = original_cg
        extra_updates = []

    algorithm = GradientDescent(step_rule=Adam(0.001),
                                cost=cg.outputs[0],
                                parameters=cg.parameters)
    algorithm.add_updates(extra_updates)

    main_loop = MainLoop(
        algorithm=algorithm,
        data_stream=train_stream,
        # Use the original cost and misclass variables so
        # that we monitor the (original) inference-mode graph.
        extensions=[DataStreamMonitoring([cost, misclass], train_stream,
                                         prefix='train'),
                    DataStreamMonitoring([cost, misclass], test_stream,
                                         prefix='test'),
                    Printing(),
                    FinishAfter(after_n_epochs=num_epochs)])
    main_loop.run()
    return main_loop
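# The example above batch-normalizes an already-built ComputationGraph with
# apply_batch_normalization; the tests and VAE scripts earlier instead use the
# batch_normalization context manager on the brick while constructing the graph.
# Both routes expose (population statistic, minibatch estimate) pairs to
# get_batch_normalization_updates. A hedged sketch of the context-manager variant
# for the same model; it assumes mlp is the BatchNormalizedMLP built above and
# that features/label are the same Theano variables:

from blocks.bricks.cost import CategoricalCrossEntropy
from blocks.graph import (ComputationGraph, batch_normalization,
                          get_batch_normalization_updates)


def training_graph_via_context_manager(mlp, features, label):
    with batch_normalization(mlp):
        prediction = mlp.apply(features)
        cost = CategoricalCrossEntropy().apply(label, prediction)
    cg = ComputationGraph([cost])
    return cg, get_batch_normalization_updates(cg)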
import theano
from theano import tensor

# print texture_image_nn_input
print(texture_image_nn_input.shape)

f_features_gram = theano.function(
    inputs=[X],
    outputs=[gram_matrix(f) for f in texture_features(X)]
)

target_image_features = f_features_gram(texture_image_nn_input)
# print target_image_features
print([t.shape for t in target_image_features])

from blocks.graph import (ComputationGraph, apply_batch_normalization,
                          get_batch_normalization_updates)

cg = ComputationGraph(generated_image_graph)
cg_bn = apply_batch_normalization(cg)
pop_updates = get_batch_normalization_updates(cg_bn)

text_generated = texture_features(cg.outputs[0])
gram_generated = [gram_matrix(f) for f in text_generated]

loss = 0
for i in range(len(target_image_features)):
    N = text_generated[i].shape[1]
    M = text_generated[i].shape[2] * text_generated[i].shape[3]
    loss += 1. / (4 * 16 * N ** 2 * M ** 2) * (
        (gram_generated[i]
         - tensor.addbroadcast(theano.shared(target_image_features[i]), 0)) ** 2
    ).sum()

alpha = 0.1
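# The snippet above relies on texture_features and gram_matrix helpers that are
# defined elsewhere in that script. A hedged sketch of a typical Theano Gram-matrix
# helper consistent with the loss normalization used there (N = channels,
# M = height * width); this is not necessarily the exact implementation used above:

from theano import tensor


def gram_matrix(features):
    """Gram matrix of a (batch, channels, height, width) feature map."""
    flat = features.flatten(ndim=3)                  # (batch, channels, h * w)
    return tensor.batched_dot(flat, flat.dimshuffle(0, 2, 1))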
def train_snli_model(new_training_job, config, save_path, params, fast_start, fuel_server, seed, model='simple'): if config['exclude_top_k'] > config['num_input_words'] and config[ 'num_input_words'] > 0: raise Exception("Some words have neither word nor def embedding") c = config logger = configure_logger(name="snli_baseline_training", log_file=os.path.join(save_path, "log.txt")) if not os.path.exists(save_path): logger.info("Start a new job") os.mkdir(save_path) else: logger.info("Continue an existing job") with open(os.path.join(save_path, "cmd.txt"), "w") as f: f.write(" ".join(sys.argv)) # Make data paths nice for path in [ 'dict_path', 'embedding_def_path', 'embedding_path', 'vocab', 'vocab_def', 'vocab_text' ]: if c.get(path, ''): if not os.path.isabs(c[path]): c[path] = os.path.join(fuel.config.data_path[0], c[path]) main_loop_path = os.path.join(save_path, 'main_loop.tar') main_loop_best_val_path = os.path.join(save_path, 'main_loop_best_val.tar') stream_path = os.path.join(save_path, 'stream.pkl') # Save config to save_path json.dump(config, open(os.path.join(save_path, "config.json"), "w")) if model == 'simple': nli_model, data, used_dict, used_retrieval, _ = _initialize_simple_model_and_data( c) elif model == 'esim': nli_model, data, used_dict, used_retrieval, _ = _initialize_esim_model_and_data( c) else: raise NotImplementedError() # Compute cost s1, s2 = T.lmatrix('sentence1'), T.lmatrix('sentence2') if c['dict_path']: assert os.path.exists(c['dict_path']) s1_def_map, s2_def_map = T.lmatrix('sentence1_def_map'), T.lmatrix( 'sentence2_def_map') def_mask = T.fmatrix("def_mask") defs = T.lmatrix("defs") else: s1_def_map, s2_def_map = None, None def_mask = None defs = None s1_mask, s2_mask = T.fmatrix('sentence1_mask'), T.fmatrix('sentence2_mask') y = T.ivector('label') cg = {} for train_phase in [True, False]: # NOTE: Please don't change outputs of cg if train_phase: with batch_normalization(nli_model): pred = nli_model.apply(s1, s1_mask, s2, s2_mask, def_mask=def_mask, defs=defs, s1_def_map=s1_def_map, s2_def_map=s2_def_map, train_phase=train_phase) else: pred = nli_model.apply(s1, s1_mask, s2, s2_mask, def_mask=def_mask, defs=defs, s1_def_map=s1_def_map, s2_def_map=s2_def_map, train_phase=train_phase) cost = CategoricalCrossEntropy().apply(y.flatten(), pred) error_rate = MisclassificationRate().apply(y.flatten(), pred) cg[train_phase] = ComputationGraph([cost, error_rate]) # Weight decay (TODO: Make it less bug prone) if model == 'simple': weights_to_decay = VariableFilter( bricks=[dense for dense, relu, bn in nli_model._mlp], roles=[WEIGHT])(cg[True].variables) weight_decay = np.float32(c['l2']) * sum( (w**2).sum() for w in weights_to_decay) elif model == 'esim': weight_decay = 0.0 else: raise NotImplementedError() final_cost = cg[True].outputs[0] + weight_decay final_cost.name = 'final_cost' # Add updates for population parameters if c.get("bn", True): pop_updates = get_batch_normalization_updates(cg[True]) extra_updates = [(p, m * 0.1 + p * (1 - 0.1)) for p, m in pop_updates] else: pop_updates = [] extra_updates = [] if params: logger.debug("Load parameters from {}".format(params)) with open(params) as src: loaded_params = load_parameters(src) cg[True].set_parameter_values(loaded_params) for param, m in pop_updates: param.set_value(loaded_params[get_brick( param).get_hierarchical_name(param)]) if os.path.exists(os.path.join(save_path, "main_loop.tar")): logger.warning("Manually loading BN stats :(") with open(os.path.join(save_path, "main_loop.tar")) as src: loaded_params = 
load_parameters(src) for param, m in pop_updates: param.set_value( loaded_params[get_brick(param).get_hierarchical_name(param)]) if theano.config.compute_test_value != 'off': test_value_data = next( data.get_stream('train', batch_size=4).get_epoch_iterator()) s1.tag.test_value = test_value_data[0] s1_mask.tag.test_value = test_value_data[1] s2.tag.test_value = test_value_data[2] s2_mask.tag.test_value = test_value_data[3] y.tag.test_value = test_value_data[4] # Freeze embeddings if not c['train_emb']: frozen_params = [ p for E in nli_model.get_embeddings_lookups() for p in E.parameters ] train_params = [p for p in cg[True].parameters] assert len(set(frozen_params) & set(train_params)) > 0 else: frozen_params = [] if not c.get('train_def_emb', 1): frozen_params_def = [ p for E in nli_model.get_def_embeddings_lookups() for p in E.parameters ] train_params = [p for p in cg[True].parameters] assert len(set(frozen_params_def) & set(train_params)) > 0 frozen_params += frozen_params_def train_params = [p for p in cg[True].parameters if p not in frozen_params] train_params_keys = [ get_brick(p).get_hierarchical_name(p) for p in train_params ] # Optimizer algorithm = GradientDescent(cost=final_cost, on_unused_sources='ignore', parameters=train_params, step_rule=Adam(learning_rate=c['lr'])) algorithm.add_updates(extra_updates) m = Model(final_cost) parameters = m.get_parameter_dict() # Blocks version mismatch logger.info("Trainable parameters" + "\n" + pprint.pformat([(key, parameters[key].get_value().shape) for key in sorted(train_params_keys)], width=120)) logger.info("# of parameters {}".format( sum([ np.prod(parameters[key].get_value().shape) for key in sorted(train_params_keys) ]))) ### Monitored args ### train_monitored_vars = [final_cost] + cg[True].outputs monitored_vars = cg[False].outputs val_acc = monitored_vars[1] to_monitor_names = [ 'def_unk_ratio', 's1_merged_input_rootmean2', 's1_def_mean_rootmean2', 's1_gate_rootmean2', 's1_compose_gate_rootmean2' ] for k in to_monitor_names: train_v, valid_v = VariableFilter(name=k)( cg[True]), VariableFilter(name=k)(cg[False]) if len(train_v): logger.info("Adding {} tracking".format(k)) train_monitored_vars.append(train_v[0]) monitored_vars.append(valid_v[0]) else: logger.warning("Didnt find {} in cg".format(k)) if c['monitor_parameters']: for name in train_params_keys: param = parameters[name] num_elements = numpy.product(param.get_value().shape) norm = param.norm(2) / num_elements grad_norm = algorithm.gradients[param].norm(2) / num_elements step_norm = algorithm.steps[param].norm(2) / num_elements stats = tensor.stack(norm, grad_norm, step_norm, step_norm / grad_norm) stats.name = name + '_stats' train_monitored_vars.append(stats) regular_training_stream = data.get_stream('train', batch_size=c['batch_size'], seed=seed) if fuel_server: # the port will be configured by the StartFuelServer extension training_stream = ServerDataStream( sources=regular_training_stream.sources, hwm=100, produces_examples=regular_training_stream.produces_examples) else: training_stream = regular_training_stream ### Build extensions ### extensions = [ # Load(main_loop_path, load_iteration_state=True, load_log=True) # .set_conditions(before_training=not new_training_job), StartFuelServer(regular_training_stream, stream_path, hwm=100, script_path=os.path.join( os.path.dirname(__file__), "../bin/start_fuel_server.py"), before_training=fuel_server), Timing(every_n_batches=c['mon_freq']), ProgressBar(), RetrievalPrintStats(retrieval=used_retrieval, 
every_n_batches=c['mon_freq_valid'], before_training=not fast_start), Timestamp(), TrainingDataMonitoring(train_monitored_vars, prefix="train", every_n_batches=c['mon_freq']), ] if c['layout'] == 'snli': validation = DataStreamMonitoring(monitored_vars, data.get_stream('valid', batch_size=14, seed=seed), before_training=not fast_start, on_resumption=True, after_training=True, every_n_batches=c['mon_freq_valid'], prefix='valid') extensions.append(validation) elif c['layout'] == 'mnli': validation = DataStreamMonitoring(monitored_vars, data.get_stream('valid_matched', batch_size=14, seed=seed), every_n_batches=c['mon_freq_valid'], on_resumption=True, after_training=True, prefix='valid_matched') validation_mismatched = DataStreamMonitoring( monitored_vars, data.get_stream('valid_mismatched', batch_size=14, seed=seed), every_n_batches=c['mon_freq_valid'], before_training=not fast_start, on_resumption=True, after_training=True, prefix='valid_mismatched') extensions.extend([validation, validation_mismatched]) else: raise NotImplementedError() # Similarity trackers for embeddings if len(c.get('vocab_def', '')): retrieval_vocab = Vocabulary(c['vocab_def']) else: retrieval_vocab = data.vocab retrieval_all = Retrieval(vocab_text=retrieval_vocab, dictionary=used_dict, max_def_length=c['max_def_length'], exclude_top_k=0, max_def_per_word=c['max_def_per_word']) for name in [ 's1_word_embeddings', 's1_dict_word_embeddings', 's1_translated_word_embeddings' ]: variables = VariableFilter(name=name)(cg[False]) if len(variables): s1_emb = variables[0] logger.info("Adding similarity tracking for " + name) # A bit sloppy about downcast if "dict" in name: embedder = construct_dict_embedder(theano.function( [s1, defs, def_mask, s1_def_map], s1_emb, allow_input_downcast=True), vocab=data.vocab, retrieval=retrieval_all) extensions.append( SimilarityWordEmbeddingEval( embedder=embedder, prefix=name, every_n_batches=c['mon_freq_valid'], before_training=not fast_start)) else: embedder = construct_embedder(theano.function( [s1], s1_emb, allow_input_downcast=True), vocab=data.vocab) extensions.append( SimilarityWordEmbeddingEval( embedder=embedder, prefix=name, every_n_batches=c['mon_freq_valid'], before_training=not fast_start)) track_the_best = TrackTheBest(validation.record_name(val_acc), before_training=not fast_start, every_n_epochs=c['save_freq_epochs'], after_training=not fast_start, every_n_batches=c['mon_freq_valid'], choose_best=min) extensions.append(track_the_best) # Special care for serializing embeddings if len(c.get('embedding_path', '')) or len(c.get('embedding_def_path', '')): extensions.insert( 0, LoadNoUnpickling(main_loop_path, load_iteration_state=True, load_log=True).set_conditions( before_training=not new_training_job)) extensions.append( Checkpoint(main_loop_path, parameters=train_params + [p for p, m in pop_updates], save_main_loop=False, save_separately=['log', 'iteration_state'], before_training=not fast_start, every_n_epochs=c['save_freq_epochs'], after_training=not fast_start).add_condition( ['after_batch', 'after_epoch'], OnLogRecord(track_the_best.notification_name), (main_loop_best_val_path, ))) else: extensions.insert( 0, Load(main_loop_path, load_iteration_state=True, load_log=True).set_conditions( before_training=not new_training_job)) extensions.append( Checkpoint(main_loop_path, parameters=cg[True].parameters + [p for p, m in pop_updates], before_training=not fast_start, every_n_epochs=c['save_freq_epochs'], after_training=not fast_start).add_condition( ['after_batch', 
'after_epoch'], OnLogRecord(track_the_best.notification_name), (main_loop_best_val_path, ))) extensions.extend([ DumpCSVSummaries(save_path, every_n_batches=c['mon_freq_valid'], after_training=True), DumpTensorflowSummaries(save_path, after_epoch=True, every_n_batches=c['mon_freq_valid'], after_training=True), Printing(every_n_batches=c['mon_freq_valid']), PrintMessage(msg="save_path={}".format(save_path), every_n_batches=c['mon_freq']), FinishAfter(after_n_batches=c['n_batches']).add_condition( ['after_batch'], OnLogStatusExceed('iterations_done', c['n_batches'])) ]) logger.info(extensions) ### Run training ### if "VISDOM_SERVER" in os.environ: print("Running visdom server") ret = subprocess.Popen([ os.path.join(os.path.dirname(__file__), "../visdom_plotter.py"), "--visdom-server={}".format(os.environ['VISDOM_SERVER']), "--folder={}".format(save_path) ]) time.sleep(0.1) if ret.returncode is not None: raise Exception() atexit.register(lambda: os.kill(ret.pid, signal.SIGINT)) model = Model(cost) for p, m in pop_updates: model._parameter_dict[get_brick(p).get_hierarchical_name(p)] = p main_loop = MainLoop(algorithm, training_stream, model=model, extensions=extensions) assert os.path.exists(save_path) main_loop.run()
print('Optimizing parameters :')
print(all_parameters)

for parameters in all_parameters:
    algorithm = GradientDescent(
        cost=cost_dropout,
        parameters=parameters,
        step_rule=Adam(),
        on_unused_sources='ignore'
    )

    # Add updates for population parameters
    pop_updates = get_batch_normalization_updates(cg_bn)
    extra_updates = [(p, m * alpha + p * (1 - alpha))
                     for p, m in pop_updates]
    algorithm.add_updates(extra_updates)


# In[6]:

from blocks.extensions import Printing, Timing, FinishAfter
from blocks.extensions.training import TrackTheBest
from blocks.extensions.monitoring import (TrainingDataMonitoring,
                                          DataStreamMonitoring)
from blocks.extensions.stopping import FinishIfNoImprovementAfter
from blocks.extensions.saveload import Checkpoint
from blocks_extras.extensions.plot import Plot
import socket
def run(batch_size, classifier, oldmodel, monitor_every, checkpoint_every,
        final_epoch, dataset, color_convert, image_size, net_depth, allowed,
        stretch):
    streams = create_custom_streams(filename=dataset,
                                    training_batch_size=batch_size,
                                    monitoring_batch_size=batch_size,
                                    include_targets=True,
                                    color_convert=color_convert,
                                    allowed=allowed,
                                    stretch=stretch)
    main_loop_stream = streams[0]
    train_monitor_stream = streams[1]
    valid_monitor_stream = streams[2]

    cg, bn_dropout_cg = create_training_computation_graphs(image_size,
                                                           net_depth)

    model = Model(bn_dropout_cg.outputs[0])

    # Compute parameter updates for the batch normalization population
    # statistics. They are updated following an exponential moving average.
    pop_updates = get_batch_normalization_updates(bn_dropout_cg)
    decay_rate = 0.05
    extra_updates = [(p, m * decay_rate + p * (1 - decay_rate))
                     for p, m in pop_updates]

    # Prepare algorithm
    step_rule = Adam()
    algorithm = GradientDescent(cost=bn_dropout_cg.outputs[0],
                                parameters=bn_dropout_cg.parameters,
                                step_rule=step_rule)
    algorithm.add_updates(extra_updates)

    # Prepare monitoring
    cost = bn_dropout_cg.outputs[0]
    cost.name = 'cost'
    train_monitoring = DataStreamMonitoring(
        [cost], train_monitor_stream, prefix="train",
        before_first_epoch=False, after_epoch=False, after_training=True,
        updates=extra_updates)

    cost, accuracy = cg.outputs
    cost.name = 'cost'
    accuracy.name = 'accuracy'
    monitored_quantities = [cost, accuracy]
    valid_monitoring = DataStreamMonitoring(
        monitored_quantities, valid_monitor_stream, prefix="valid",
        before_first_epoch=True, after_epoch=False,
        every_n_epochs=monitor_every)

    # Prepare checkpoint
    checkpoint = Checkpoint(classifier, every_n_epochs=checkpoint_every,
                            use_cpickle=True)

    extensions = [Timing(), FinishAfter(after_n_epochs=final_epoch),
                  train_monitoring, valid_monitoring, checkpoint, Printing(),
                  ProgressBar()]
    main_loop = MainLoop(model=model, data_stream=main_loop_stream,
                         algorithm=algorithm, extensions=extensions)

    if oldmodel is not None:
        print("Initializing parameters with old model {}".format(oldmodel))
        with open(oldmodel, 'rb') as src:
            saved_model = load(src)
        main_loop.model.set_parameter_values(
            saved_model.model.get_parameter_values())
        del saved_model

    main_loop.run()
def main(save_to, num_epochs, regularization=0.0001, subset=None, num_batches=None, batch_size=None, histogram=None, resume=False): output_size = 10 convnet = create_res_net() x = tensor.tensor4('features') y = tensor.lmatrix('targets') # Normalize input and apply the convnet test_probs = convnet.apply(x) test_cost = (CategoricalCrossEntropy().apply(y.flatten(), test_probs) .copy(name='cost')) test_error_rate = (MisclassificationRate().apply(y.flatten(), test_probs) .copy(name='error_rate')) test_confusion = (ConfusionMatrix().apply(y.flatten(), test_probs) .copy(name='confusion')) test_confusion.tag.aggregation_scheme = Sum(test_confusion) test_cg = ComputationGraph([test_cost, test_error_rate]) # Apply dropout to all layer outputs except final softmax # dropout_vars = VariableFilter( # roles=[OUTPUT], bricks=[Convolutional], # theano_name_regex="^conv_[25]_apply_output$")(test_cg.variables) # drop_cg = apply_dropout(test_cg, dropout_vars, 0.5) # Apply 0.2 dropout to the pre-averaging layer # dropout_vars_2 = VariableFilter( # roles=[OUTPUT], bricks=[Convolutional], # theano_name_regex="^conv_8_apply_output$")(test_cg.variables) # train_cg = apply_dropout(test_cg, dropout_vars_2, 0.2) # Apply 0.2 dropout to the input, as in the paper # train_cg = apply_dropout(test_cg, [x], 0.2) # train_cg = drop_cg # train_cg = apply_batch_normalization(test_cg) # train_cost, train_error_rate, train_components = train_cg.outputs with batch_normalization(convnet): train_probs = convnet.apply(x) train_cost = (CategoricalCrossEntropy().apply(y.flatten(), train_probs) .copy(name='cost')) train_components = (ComponentwiseCrossEntropy().apply(y.flatten(), train_probs).copy(name='components')) train_error_rate = (MisclassificationRate().apply(y.flatten(), train_probs).copy(name='error_rate')) train_cg = ComputationGraph([train_cost, train_error_rate, train_components]) population_updates = get_batch_normalization_updates(train_cg) bn_alpha = 0.9 extra_updates = [(p, p * bn_alpha + m * (1 - bn_alpha)) for p, m in population_updates] # Apply regularization to the cost biases = VariableFilter(roles=[BIAS])(train_cg.parameters) weights = VariableFilter(roles=[WEIGHT])(train_cg.variables) l2_norm = sum([(W ** 2).sum() for W in weights]) l2_norm.name = 'l2_norm' l2_regularization = regularization * l2_norm l2_regularization.name = 'l2_regularization' test_cost = test_cost + l2_regularization test_cost.name = 'cost_with_regularization' # Training version of cost train_cost_without_regularization = train_cost train_cost_without_regularization.name = 'cost_without_regularization' train_cost = train_cost + regularization * l2_norm train_cost.name = 'cost_with_regularization' cifar10_train = CIFAR10(("train",)) cifar10_train_stream = RandomPadCropFlip( NormalizeBatchLevels(DataStream.default_stream( cifar10_train, iteration_scheme=ShuffledScheme( cifar10_train.num_examples, batch_size)), which_sources=('features',)), (32, 32), pad=4, which_sources=('features',)) test_batch_size = 500 cifar10_test = CIFAR10(("test",)) cifar10_test_stream = NormalizeBatchLevels(DataStream.default_stream( cifar10_test, iteration_scheme=ShuffledScheme( cifar10_test.num_examples, test_batch_size)), which_sources=('features',)) momentum = Momentum(0.01, 0.9) # Create a step rule that doubles the learning rate of biases, like Caffe. 
# scale_bias = Restrict(Scale(2), biases) # step_rule = CompositeRule([scale_bias, momentum]) # from theano.compile.nanguardmode import NanGuardMode # Train with simple SGD algorithm = GradientDescent( cost=train_cost, parameters=train_cg.parameters, step_rule=momentum) algorithm.add_updates(extra_updates) #, # theano_func_kwargs={ # 'mode': NanGuardMode( # nan_is_error=True, inf_is_error=True, big_is_error=True)}) # `Timing` extension reports time for reading data, aggregating a batch # and monitoring; # `ProgressBar` displays a nice progress bar during training. extensions = [Timing(), FinishAfter(after_n_epochs=num_epochs, after_n_batches=num_batches), EpochSchedule(momentum.learning_rate, [ (0, 0.01), # Warm up with 0.01 learning rate (1, 0.1), # Then go back to 0.1 (100, 0.01), (150, 0.001) # (83, 0.01), # Follow the schedule in the paper # (125, 0.001) ]), DataStreamMonitoring( [test_cost, test_error_rate, test_confusion], cifar10_test_stream, prefix="test"), TrainingDataMonitoring( [train_cost, train_error_rate, train_cost_without_regularization, l2_regularization, momentum.learning_rate, aggregation.mean(algorithm.total_gradient_norm)], prefix="train", every_n_batches=17), # after_epoch=True), Plot('Training performance for ' + save_to, channels=[ ['train_cost_with_regularization', 'train_cost_without_regularization', 'train_l2_regularization'], ['train_error_rate'], ['train_total_gradient_norm'], ], every_n_batches=17), Plot('Test performance for ' + save_to, channels=[[ 'train_error_rate', 'test_error_rate', ]], after_epoch=True), Checkpoint(save_to, use_cpickle=True), ProgressBar(), Printing()] if histogram: attribution = AttributionExtension( components=train_components, parameters=cg.parameters, components_size=output_size, after_batch=True) extensions.insert(0, attribution) if resume: extensions.append(Load(save_to, True, True)) model = Model(train_cost) main_loop = MainLoop( algorithm, cifar10_train_stream, model=model, extensions=extensions) main_loop.run() if histogram: save_attributions(attribution, filename=histogram) with open('execution-log.json', 'w') as outfile: json.dump(main_loop.log, outfile, cls=NumpyEncoder)
def main(save_to, num_epochs, weight_decay=0.0001, noise_pressure=0, subset=None, num_batches=None, batch_size=None, histogram=None, resume=False): output_size = 10 prior_noise_level = -10 noise_step_rule = Scale(1e-6) noise_rate = theano.shared(numpy.asarray(1e-5, dtype=theano.config.floatX)) convnet = create_res_net(out_noise=True, tied_noise=True, tied_sigma=True, noise_rate=noise_rate, prior_noise_level=prior_noise_level) x = tensor.tensor4('features') y = tensor.lmatrix('targets') # Normalize input and apply the convnet test_probs = convnet.apply(x) test_cost = (CategoricalCrossEntropy().apply(y.flatten(), test_probs) .copy(name='cost')) test_error_rate = (MisclassificationRate().apply(y.flatten(), test_probs) .copy(name='error_rate')) test_confusion = (ConfusionMatrix().apply(y.flatten(), test_probs) .copy(name='confusion')) test_confusion.tag.aggregation_scheme = Sum(test_confusion) test_cg = ComputationGraph([test_cost, test_error_rate]) # Apply dropout to all layer outputs except final softmax # dropout_vars = VariableFilter( # roles=[OUTPUT], bricks=[Convolutional], # theano_name_regex="^conv_[25]_apply_output$")(test_cg.variables) # drop_cg = apply_dropout(test_cg, dropout_vars, 0.5) # Apply 0.2 dropout to the pre-averaging layer # dropout_vars_2 = VariableFilter( # roles=[OUTPUT], bricks=[Convolutional], # theano_name_regex="^conv_8_apply_output$")(test_cg.variables) # train_cg = apply_dropout(test_cg, dropout_vars_2, 0.2) # Apply 0.2 dropout to the input, as in the paper # train_cg = apply_dropout(test_cg, [x], 0.2) # train_cg = drop_cg # train_cg = apply_batch_normalization(test_cg) # train_cost, train_error_rate, train_components = train_cg.outputs with batch_normalization(convnet): with training_noise(convnet): train_probs = convnet.apply(x) train_cost = (CategoricalCrossEntropy().apply(y.flatten(), train_probs) .copy(name='cost')) train_components = (ComponentwiseCrossEntropy().apply(y.flatten(), train_probs).copy(name='components')) train_error_rate = (MisclassificationRate().apply(y.flatten(), train_probs).copy(name='error_rate')) train_cg = ComputationGraph([train_cost, train_error_rate, train_components]) population_updates = get_batch_normalization_updates(train_cg) bn_alpha = 0.9 extra_updates = [(p, p * bn_alpha + m * (1 - bn_alpha)) for p, m in population_updates] # for annealing nit_penalty = theano.shared(numpy.asarray(noise_pressure, dtype=theano.config.floatX)) nit_penalty.name = 'nit_penalty' # Compute noise rates for training graph train_logsigma = VariableFilter(roles=[LOG_SIGMA])(train_cg.variables) train_mean_log_sigma = tensor.concatenate([n.flatten() for n in train_logsigma]).mean() train_mean_log_sigma.name = 'mean_log_sigma' train_nits = VariableFilter(roles=[NITS])(train_cg.auxiliary_variables) train_nit_rate = tensor.concatenate([n.flatten() for n in train_nits]).mean() train_nit_rate.name = 'nit_rate' train_nit_regularization = nit_penalty * train_nit_rate train_nit_regularization.name = 'nit_regularization' # Apply regularization to the cost trainable_parameters = VariableFilter(roles=[WEIGHT, BIAS])( train_cg.parameters) mask_parameters = [p for p in trainable_parameters if get_brick(p).name == 'mask'] noise_parameters = VariableFilter(roles=[NOISE])(train_cg.parameters) biases = VariableFilter(roles=[BIAS])(train_cg.parameters) weights = VariableFilter(roles=[WEIGHT])(train_cg.variables) nonmask_weights = [p for p in weights if get_brick(p).name != 'mask'] l2_norm = sum([(W ** 2).sum() for W in nonmask_weights]) l2_norm.name = 'l2_norm' 
l2_regularization = weight_decay * l2_norm l2_regularization.name = 'l2_regularization' # testversion test_cost = test_cost + l2_regularization test_cost.name = 'cost_with_regularization' # Training version of cost train_cost_without_regularization = train_cost train_cost_without_regularization.name = 'cost_without_regularization' train_cost = train_cost + l2_regularization + train_nit_regularization train_cost.name = 'cost_with_regularization' cifar10_train = CIFAR10(("train",)) cifar10_train_stream = RandomPadCropFlip( NormalizeBatchLevels(DataStream.default_stream( cifar10_train, iteration_scheme=ShuffledScheme( cifar10_train.num_examples, batch_size)), which_sources=('features',)), (32, 32), pad=4, which_sources=('features',)) test_batch_size = 128 cifar10_test = CIFAR10(("test",)) cifar10_test_stream = NormalizeBatchLevels(DataStream.default_stream( cifar10_test, iteration_scheme=ShuffledScheme( cifar10_test.num_examples, test_batch_size)), which_sources=('features',)) momentum = Momentum(0.01, 0.9) # Create a step rule that doubles the learning rate of biases, like Caffe. # scale_bias = Restrict(Scale(2), biases) # step_rule = CompositeRule([scale_bias, momentum]) # Create a step rule that reduces the learning rate of noise scale_mask = Restrict(noise_step_rule, mask_parameters) step_rule = CompositeRule([scale_mask, momentum]) # from theano.compile.nanguardmode import NanGuardMode # Train with simple SGD algorithm = GradientDescent( cost=train_cost, parameters=trainable_parameters, step_rule=step_rule) algorithm.add_updates(extra_updates) #, # theano_func_kwargs={ # 'mode': NanGuardMode( # nan_is_error=True, inf_is_error=True, big_is_error=True)}) exp_name = save_to.replace('.%d', '') # `Timing` extension reports time for reading data, aggregating a batch # and monitoring; # `ProgressBar` displays a nice progress bar during training. 
extensions = [Timing(), FinishAfter(after_n_epochs=num_epochs, after_n_batches=num_batches), EpochSchedule(momentum.learning_rate, [ (0, 0.01), # Warm up with 0.01 learning rate (50, 0.1), # Then go back to 0.1 (100, 0.01), (150, 0.001) # (83, 0.01), # Follow the schedule in the paper # (125, 0.001) ]), EpochSchedule(noise_step_rule.learning_rate, [ (0, 1e-2), (2, 1e-1), (4, 1) # (0, 1e-6), # (2, 1e-5), # (4, 1e-4) ]), EpochSchedule(noise_rate, [ (0, 1e-2), (2, 1e-1), (4, 1) # (0, 1e-6), # (2, 1e-5), # (4, 1e-4), # (6, 3e-4), # (8, 1e-3), # Causes nit rate to jump # (10, 3e-3), # (12, 1e-2), # (15, 3e-2), # (19, 1e-1), # (24, 3e-1), # (30, 1) ]), NoiseExtension( noise_parameters=noise_parameters), NoisyDataStreamMonitoring( [test_cost, test_error_rate, test_confusion], cifar10_test_stream, noise_parameters=noise_parameters, prefix="test"), TrainingDataMonitoring( [train_cost, train_error_rate, train_nit_rate, train_cost_without_regularization, l2_regularization, train_nit_regularization, momentum.learning_rate, train_mean_log_sigma, aggregation.mean(algorithm.total_gradient_norm)], prefix="train", every_n_batches=17), # after_epoch=True), Plot('Training performance for ' + exp_name, channels=[ ['train_cost_with_regularization', 'train_cost_without_regularization', 'train_nit_regularization', 'train_l2_regularization'], ['train_error_rate'], ['train_total_gradient_norm'], ['train_mean_log_sigma'], ], every_n_batches=17), Plot('Test performance for ' + exp_name, channels=[[ 'train_error_rate', 'test_error_rate', ]], after_epoch=True), EpochCheckpoint(save_to, use_cpickle=True, after_epoch=True), ProgressBar(), Printing()] if histogram: attribution = AttributionExtension( components=train_components, parameters=cg.parameters, components_size=output_size, after_batch=True) extensions.insert(0, attribution) if resume: extensions.append(Load(exp_name, True, True)) model = Model(train_cost) main_loop = MainLoop( algorithm, cifar10_train_stream, model=model, extensions=extensions) main_loop.run() if histogram: save_attributions(attribution, filename=histogram) with open('execution-log.json', 'w') as outfile: json.dump(main_loop.log, outfile, cls=NumpyEncoder)
def main(config, use_bokeh=False): tr_stream = get_tr_stream(**config) # dev_stream = get_dev_stream(**config) # Create Theano variables logger.info('Creating theano variables') source_image = tensor.ftensor4('image') target_sentence = tensor.lmatrix('target') target_sentence_mask = tensor.matrix('target_mask') sampling_input = tensor.ftensor4('input') sampling_output = tensor.lmatrix('output') # Construct model logger.info('Building RNN encoder-decoder') cnn_encoder = CNNEncoder(config['batch_norm']) image_embedding = cnn_encoder.conv_sequence.apply(source_image) if config['use_rnn']: encoder = BidirectionalEncoder(config['enc_embed'], config['enc_nhids']) encoder_inputs = image_embedding.dimshuffle(2, 3, 0, 1) encoded_images, _ = theano.map(encoder.apply, sequences=encoder_inputs, name='parallel_encoders') else: encoded_images = image_embedding.dimshuffle(2, 3, 0, 1) encoded_shape = encoded_images.shape annotation_vector = encoded_images.reshape( (-1, encoded_shape[2], encoded_shape[3])) annotation_vector_mask = tensor.ones(annotation_vector.shape[:2]) decoder = Decoder(config['trg_vocab_size'], config['dec_embed'], config['dec_nhids'], config['enc_nhids'] * 2) cost = decoder.cost(annotation_vector, annotation_vector_mask, target_sentence, target_sentence_mask) logger.info('Creating computational graph') cg = ComputationGraph(cost) # Initialize model logger.info('Initializing model') cnn_encoder.conv_sequence.weights_init = IsotropicGaussian( config['weight_scale']) cnn_encoder.conv_sequence.biases_init = Constant(0) if config['use_rnn']: encoder.weights_init = IsotropicGaussian(config['weight_scale']) encoder.biases_init = Constant(0) encoder.push_initialization_config() encoder.bidir.prototype.weights_init = Orthogonal() encoder.initialize() decoder.weights_init = IsotropicGaussian(config['weight_scale']) decoder.biases_init = Constant(0) decoder.push_initialization_config() decoder.transition.weights_init = Orthogonal() decoder.initialize() cnn_encoder.conv_sequence.push_initialization_config() cnn_encoder.conv_sequence.initialize() # apply dropout for regularization if config['dropout'] < 1.0: # dropout is applied to the output of maxout in ghog logger.info('Applying dropout') dropout_inputs = [ x for x in cg.intermediary_variables if x.name == 'maxout_apply_output' ] cg = apply_dropout(cg, dropout_inputs, config['dropout']) # Apply weight noise for regularization if config['weight_noise_ff'] > 0.0: logger.info('Applying weight noise to ff layers') cnn_params = Selector( cnn_encoder.conv_sequence).get_parameters().values() enc_params = [] if config['use_rnn']: enc_params += Selector(encoder.fwd_fork).get_parameters().values() enc_params += Selector(encoder.back_fork).get_parameters().values() dec_params = Selector( decoder.sequence_generator.readout).get_parameters().values() dec_params += Selector( decoder.sequence_generator.fork).get_parameters().values() dec_params += Selector( decoder.transition.initial_transformer).get_parameters().values() cg = apply_noise(cg, cnn_params + enc_params + dec_params, config['weight_noise_ff']) # Apply batch normalization if config['batch_norm']: logger.info('Applying batch normalization') cg = apply_batch_normalization(cg) pop_updates = get_batch_normalization_updates(cg) extra_updates = [(p, m * 0.05 + p * (1 - 0.05)) for p, m in pop_updates] else: extra_updates = [] # Print shapes shapes = [param.get_value().shape for param in cg.parameters] logger.info("Parameter shapes: ") for shape, count in Counter(shapes).most_common(): logger.info(' {:15}: 
{}'.format(shape, count)) logger.info("Total number of parameters: {}".format(len(shapes))) # Print parameter names if config['use_rnn']: enc_dec_param_dict = merge( Selector(cnn_encoder.conv_sequence).get_parameters(), Selector(encoder).get_parameters(), Selector(decoder).get_parameters()) else: enc_dec_param_dict = merge( Selector(cnn_encoder.conv_sequence).get_parameters(), Selector(decoder).get_parameters()) logger.info("Parameter names: ") for name, value in enc_dec_param_dict.items(): logger.info(' {:15}: {}'.format(value.get_value().shape, name)) logger.info("Total number of parameters: {}".format( len(enc_dec_param_dict))) # Set up training model logger.info("Building model") training_model = Model(cost) # Set extensions logger.info("Initializing extensions") extensions = [ FinishAfter(after_n_batches=config['finish_after']), TrainingDataMonitoring([cost], after_batch=True), Printing(after_batch=True), CheckpointNMT(config['saveto'], every_n_batches=config['save_freq']) ] # Set up beam search and sampling computation graphs if necessary if config['hook_samples'] >= 1 or config['bleu_script'] is not None: logger.info("Building sampling model") sampling_image_embedding = cnn_encoder.conv_sequence.apply( sampling_input) if config['use_rnn']: sampling_encoder_inputs = sampling_image_embedding.dimshuffle( 2, 3, 0, 1) sampling_encoded_images, _ = theano.map( encoder.apply, sequences=sampling_encoder_inputs, name='parallel_encoders_inf') else: sampling_encoded_images = sampling_image_embedding.dimshuffle( 2, 3, 0, 1) sampling_encoded_shape = sampling_encoded_images.shape sampling_annotation_vector = sampling_encoded_images.reshape( (-1, sampling_encoded_shape[2], sampling_encoded_shape[3])) sampling_annotation_vector_mask = tensor.ones( sampling_annotation_vector.shape[:2]) generated = decoder.generate(sampling_annotation_vector) search_model = Model(generated) _, samples = VariableFilter( bricks=[decoder.sequence_generator], name="outputs")( ComputationGraph(generated[1])) # generated[1] is next_outputs # Add sampling if config['hook_samples'] >= 1: logger.info("Building sampler") extensions.append( Sampler(model=search_model, data_stream=tr_stream, hook_samples=config['hook_samples'], every_n_batches=config['sampling_freq'], trg_vocab=config['trg_vocab'])) # Add early stopping based on bleu if 'bleu_script' in config: logger.info("Building bleu validator") extensions.append( BleuValidator(sampling_input, samples=samples, config=config, model=search_model, data_stream=dev_stream, normalize=config['normalized_bleu'], every_n_batches=config['bleu_val_freq'])) # Reload model if necessary if config['reload']: extensions.append(LoadNMT(config['saveto'])) # Plot cost in bokeh if necessary if use_bokeh and BOKEH_AVAILABLE: extensions.append( Plot('Cs-En', channels=[['decoder_cost_cost']], after_batch=True)) # Set up training algorithm logger.info("Initializing training algorithm") algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=CompositeRule([ StepClipping(config['step_clipping']), eval(config['step_rule'])() ])) # algorithm.add_updates(extra_updates) # Initialize main loop logger.info("Initializing main loop") main_loop = MainLoop(model=training_model, algorithm=algorithm, data_stream=tr_stream, extensions=extensions) # Train! main_loop.run()