def test_apply_batch_normalization_nested():
    x = tensor.matrix()
    eps = 1e-8
    batch_dims = (3, 9)
    bn = BatchNormalization(input_dim=5, epsilon=eps)
    mlp = MLP([Sequence([bn.apply, Tanh().apply])], [9, 5],
              weights_init=Constant(0.4), biases_init=Constant(1))
    mlp.initialize()
    y = mlp.apply(x)
    cg = apply_batch_normalization(ComputationGraph([y]))
    y_bn = cg.outputs[0]
    rng = numpy.random.RandomState((2016, 1, 18))
    x_ = rng.uniform(size=batch_dims).astype(theano.config.floatX)
    y_ = y_bn.eval({x: x_})
    W_, b_ = map(lambda s: (getattr(mlp.linear_transformations[0], s)
                            .get_value(borrow=True)), ['W', 'b'])
    z_ = numpy.dot(x_, W_) + b_
    y_expected = numpy.tanh((z_ - z_.mean(axis=0)) /
                            numpy.sqrt(z_.var(axis=0) + eps))
    assert_allclose(y_, y_expected, rtol=1e-3)

def create_training_computation_graphs():
    x = tensor.tensor4('features')
    y = tensor.imatrix('targets')

    convnet, mlp = create_model_bricks()
    y_hat = mlp.apply(convnet.apply(x).flatten(ndim=2))
    cost = BinaryCrossEntropy().apply(y, y_hat)
    accuracy = 1 - tensor.neq(y > 0.5, y_hat > 0.5).mean()
    cg = ComputationGraph([cost, accuracy])

    # Create a graph which uses batch statistics for batch normalization
    # as well as dropout on selected variables
    bn_cg = apply_batch_normalization(cg)
    bricks_to_drop = ([convnet.layers[i] for i in (5, 11, 17)] +
                      [mlp.application_methods[1].brick])
    variables_to_drop = VariableFilter(
        roles=[OUTPUT], bricks=bricks_to_drop)(bn_cg.variables)
    bn_dropout_cg = apply_dropout(bn_cg, variables_to_drop, 0.5)

    return cg, bn_dropout_cg

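# Usage sketch (not part of the snippet above): the two graphs returned by
# `create_training_computation_graphs` would typically be consumed roughly as
# follows -- gradients are taken on the batch-norm/dropout graph, the
# untouched graph stays in inference mode for monitoring, and the population
# statistics are refreshed with an exponential moving average.  The `Scale`
# step rule, the 0.1 learning rate and the 0.05 decay rate are illustrative
# assumptions, not values from the original code.
def sketch_training_setup():
    cg, bn_dropout_cg = create_training_computation_graphs()
    # Pair each population statistic with its minibatch estimate and blend
    # them with an exponential moving average.
    pop_updates = get_batch_normalization_updates(bn_dropout_cg)
    decay_rate = 0.05
    extra_updates = [(p, m * decay_rate + p * (1 - decay_rate))
                     for p, m in pop_updates]
    # Train on the batch-norm/dropout cost.
    algorithm = GradientDescent(cost=bn_dropout_cg.outputs[0],
                                parameters=bn_dropout_cg.parameters,
                                step_rule=Scale(learning_rate=0.1))
    algorithm.add_updates(extra_updates)
    # `cg.outputs` (inference-mode cost and accuracy) can then be monitored
    # separately, e.g. with DataStreamMonitoring.
    return algorithm
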
def main(num_epochs, feature_maps=None, mlp_hiddens=None, conv_sizes=None,
         pool_sizes=None, batch_size=500, num_batches=None):

    ############# Architecture #############
    if feature_maps is None:
        feature_maps = [20, 50]
    if mlp_hiddens is None:
        mlp_hiddens = [500]
    if conv_sizes is None:
        conv_sizes = [5, 5]
    if pool_sizes is None:
        pool_sizes = [2, 2]
    image_size = (32, 32)
    batch_size = 50
    output_size = 2
    learningRate = 0.1
    num_epochs = 10
    num_batches = None
    delta = 0.01
    drop_prob = 0.5
    weight_noise = 0.75

    # Use ReLUs everywhere and softmax for the final prediction
    conv_activations = [Rectifier() for _ in feature_maps]
    mlp_activations = [Rectifier() for _ in mlp_hiddens] + [Softmax()]
    convnet = LeNet(conv_activations, 3, image_size,
                    filter_sizes=zip(conv_sizes, conv_sizes),
                    feature_maps=feature_maps,
                    pooling_sizes=zip(pool_sizes, pool_sizes),
                    top_mlp_activations=mlp_activations,
                    top_mlp_dims=mlp_hiddens + [output_size],
                    border_mode='full',
                    weights_init=Uniform(width=.2),
                    biases_init=Constant(0))
    # We push initialization config to set different initialization schemes
    # for convolutional layers.
    convnet.push_initialization_config()
    convnet.layers[0].weights_init = Uniform(width=.2)
    convnet.layers[1].weights_init = Uniform(width=.09)
    convnet.top_mlp.linear_transformations[0].weights_init = Uniform(width=.08)
    convnet.top_mlp.linear_transformations[1].weights_init = Uniform(width=.11)
    convnet.initialize()
    logging.info(
        "Input dim: {} {} {}".format(*convnet.children[0].get_dim('input_')))
    for i, layer in enumerate(convnet.layers):
        if isinstance(layer, Activation):
            logging.info("Layer {} ({})".format(i, layer.__class__.__name__))
        else:
            logging.info("Layer {} ({}) dim: {} {} {}".format(
                i, layer.__class__.__name__, *layer.get_dim('output')))

    x = tensor.tensor4('image_features')
    y = tensor.lmatrix('targets')

    probs = (convnet.apply(x)).copy(name='probs')

    # Computation graph just for the cost, used for dropout and noise
    # application
    cg_probs = ComputationGraph([probs])
    inputs = VariableFilter(roles=[INPUT])(cg_probs.variables)
    weights = VariableFilter(roles=[FILTER, WEIGHT])(cg_probs.variables)

    ############# Regularization #############
    # regularization = 0
    logger.info('Applying regularization')
    regularization = delta * sum([(W**2).mean() for W in weights])
    probs.name = "reg_probs"

    ############# Gaussian noise #############
    logger.info('Applying Gaussian noise')
    cg_train = apply_noise(cg_probs, weights, weight_noise)

    ############# Dropout #############
    logger.info('Applying dropout')
    cg_probs = apply_dropout(cg_probs, inputs, drop_prob)
    dropped_out = VariableFilter(roles=[DROPOUT])(cg_probs.variables)
    inputs_referenced = [var.tag.replacement_of for var in dropped_out]
    # Sanity check: every input should have been replaced by a dropped-out
    # version of itself.
    assert set(inputs) == set(inputs_referenced)

    ############# Batch normalization #############
    # Recompute probs after dropout, noise and regularization:
    probs = cg_probs.outputs[0] + regularization
    cost = (CategoricalCrossEntropy().apply(y.flatten(), probs)
            .copy(name='cost'))
    error_rate = (MisclassificationRate().apply(y.flatten(), probs)
                  .copy(name='error_rate'))
    cg = ComputationGraph([probs, cost, error_rate])
    cg = apply_batch_normalization(cg)

    ########### Loading images #####################
    from fuel.datasets.dogs_vs_cats import DogsVsCats
    from fuel.streams import DataStream, ServerDataStream
    from fuel.schemes import ShuffledScheme
    from fuel.transformers.image import (RandomFixedSizeCrop,
                                         MinimumImageDimensions,
                                         Random2DRotation)
    from fuel.transformers import Flatten, Cast, ScaleAndShift

    def create_data(data):
        stream = DataStream(data,
                            iteration_scheme=ShuffledScheme(
                                data.num_examples, batch_size))
        stream_downscale = MinimumImageDimensions(
            stream, image_size, which_sources=('image_features',))
        stream_rotate = Random2DRotation(stream_downscale,
                                         which_sources=('image_features',))
        stream_max = ScikitResize(stream_rotate, image_size,
                                  which_sources=('image_features',))
        stream_scale = ScaleAndShift(stream_max, 1. / 255, 0,
                                     which_sources=('image_features',))
        stream_cast = Cast(stream_scale, dtype='float32',
                           which_sources=('image_features',))
        # stream_flat = Flatten(stream_scale, which_sources=('image_features',))
        return stream_cast

    stream_data_train = create_data(
        DogsVsCats(('train',), subset=slice(0, 20)))
    stream_data_test = create_data(
        DogsVsCats(('train',), subset=slice(20, 30)))

    # Train with simple SGD
    algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                                step_rule=Scale(learning_rate=learningRate))
    # algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
    #                             step_rule=Adam(0.001))
    # algorithm.add_updates(extra_updates)

    # `Timing` extension reports time for reading data, aggregating a batch
    # and monitoring; `ProgressBar` displays a nice progress bar during
    # training.
    extensions = []
    extensions.append(Timing())
    extensions.append(
        FinishAfter(after_n_epochs=num_epochs, after_n_batches=num_batches))
    extensions.append(
        DataStreamMonitoring([cost, error_rate], stream_data_test,
                             prefix="valid"))
    extensions.append(
        TrainingDataMonitoring(
            [cost, error_rate,
             aggregation.mean(algorithm.total_gradient_norm)],
            prefix="train", after_epoch=True))
    # extensions.append(Checkpoint(save_to))
    extensions.append(ProgressBar())
    extensions.append(Printing())

    logger.info("Building the model")
    model = Model(cost)

    main_loop = MainLoop(algorithm, stream_data_train, model=model,
                         extensions=extensions)
    main_loop.run()

def run(model_name, port_train, port_valid):
    running_on_laptop = socket.gethostname() == 'yop'

    X = tensor.tensor4('image_features', dtype='float32')
    T = tensor.matrix('targets', dtype='float32')

    image_border_size = (100, 100)

    if running_on_laptop:
        host_plot = 'http://*****:*****@ %s' % (model_name, datetime.datetime.now(), socket.gethostname()),
             channels=[['loss'], ['error', 'valid_error']],
             after_epoch=True, server_url=host_plot),
        Printing(),
        Checkpoint('/tmp/train_bn2')
    ]

    main_loop = MainLoop(data_stream=train_stream, algorithm=algorithm,
                         extensions=extensions, model=model)
    main_loop.run()

def main(port_data):
    mlp_hiddens = [500]
    filter_sizes = [(3, 3), (3, 3)]
    feature_maps = [20, 20]
    pooling_sizes = [(3, 3), (2, 2)]
    save_to = "DvC.pkl"
    image_size = (128, 128)
    output_size = 2
    learningRate = 0.1
    num_epochs = 300
    num_batches = None

    if socket.gethostname() == 'tim-X550JX':
        host_plot = 'http://*****:*****@ %s' % ('CNN ', datetime.datetime.now(), socket.gethostname()),
            channels=[['train_error_rate', 'valid_error_rate'],
                      ['train_total_gradient_norm']],
            after_epoch=True, server_url=host_plot))

    model = Model(cost)
    main_loop = MainLoop(algorithm, stream_data_train, model=model,
                         extensions=extensions)
    main_loop.run()

cost, all_parameters = build_model(images, labels)

# LEARN WEIGHTS
# In[3]:
train_stream = ServerDataStream(('driver_id', 'images', 'labels'), False,
                                hwm=10)
valid_stream = ServerDataStream(('driver_id', 'images', 'labels'), False,
                                hwm=10, port=5558)

# In[5]:
alpha = 0.1
cg = ComputationGraph(cost)
cg_bn = apply_batch_normalization(cg)
inputs = VariableFilter(roles=[INPUT])(cg_bn.variables)
print inputs
cg_dropout = apply_dropout(cg_bn, [inputs[11], inputs[0]], .5)
cost_bn = cg_bn.outputs[0]
cost_dropout = cg_dropout.outputs[0]
model = Model(cost)
print 'Optimizing parameters :'
print all_parameters
for parameters in all_parameters:

def main(num_epochs=50, batch_normalized=True, alpha=0.1):
    """Run the example.

    Parameters
    ----------
    num_epochs : int, optional
        Number of epochs for which to train.
    batch_normalized : bool, optional
        Batch-normalize the training graph. Defaults to `True`.
    alpha : float, optional
        Weight to apply to a new sample when calculating running averages
        for population statistics (1 - alpha weight is given to the
        existing average).

    """
    if batch_normalized:
        # Add an extra keyword argument that only BatchNormalizedMLP takes,
        # in order to speed things up at the cost of a bit of extra memory.
        mlp_class = BatchNormalizedMLP
        extra_kwargs = {'conserve_memory': False}
    else:
        mlp_class = MLP
        extra_kwargs = {}
    mlp = mlp_class([Logistic(), Logistic(), Logistic(), Softmax()],
                    [2, 5, 5, 5, 3],
                    weights_init=IsotropicGaussian(0.2),
                    biases_init=Constant(0.), **extra_kwargs)
    mlp.initialize()

    # Generate a dataset with 3 spiral arms, using 8000 examples for
    # training and 2000 for testing.
    dataset = Spiral(num_examples=10000, classes=3,
                     sources=['features', 'label'], noise=0.05)
    train_stream = DataStream(dataset,
                              iteration_scheme=ShuffledScheme(
                                  examples=8000, batch_size=20))
    test_stream = DataStream(dataset,
                             iteration_scheme=SequentialScheme(
                                 examples=list(range(8000, 10000)),
                                 batch_size=2000))

    # Build a cost graph; this contains BatchNormalization bricks that will
    # by default run in inference mode.
    features = tensor.matrix('features')
    label = tensor.lvector('label')
    prediction = mlp.apply(features)
    cost = CategoricalCrossEntropy().apply(label, prediction)
    misclass = MisclassificationRate().apply(label, prediction)
    misclass.name = 'misclass'  # The default name for this is annoyingly long
    original_cg = ComputationGraph([cost, misclass])

    if batch_normalized:
        cg = apply_batch_normalization(original_cg)
        # Add updates for population parameters
        pop_updates = get_batch_normalization_updates(cg)
        extra_updates = [(p, m * alpha + p * (1 - alpha))
                         for p, m in pop_updates]
    else:
        cg = original_cg
        extra_updates = []

    algorithm = GradientDescent(step_rule=Adam(0.001),
                                cost=cg.outputs[0],
                                parameters=cg.parameters)
    algorithm.add_updates(extra_updates)

    main_loop = MainLoop(
        algorithm=algorithm,
        data_stream=train_stream,
        # Use the original cost and misclass variables so
        # that we monitor the (original) inference-mode graph.
        extensions=[DataStreamMonitoring([cost, misclass], train_stream,
                                         prefix='train'),
                    DataStreamMonitoring([cost, misclass], test_stream,
                                         prefix='test'),
                    Printing(),
                    FinishAfter(after_n_epochs=num_epochs)])
    main_loop.run()
    return main_loop

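# A small numeric illustration (not part of the example above) of the running
# average controlled by `alpha`: each minibatch statistic m is blended into
# the population value p as p <- alpha * m + (1 - alpha) * p, which is
# exactly what the `extra_updates` pairs above compute.
def sketch_running_average(minibatch_stats, alpha=0.1, initial=0.0):
    p = initial
    for m in minibatch_stats:
        p = alpha * m + (1 - alpha) * p
    return p

# For example, three minibatches whose statistic is 1.0 move p from 0.0 to
# 0.1, then 0.19, then 0.271, slowly approaching the true value of 1.0.
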
# print texture_image_nn_input
print texture_image_nn_input.shape

f_features_gram = theano.function(
    inputs=[X],
    outputs=[gram_matrix(f) for f in texture_features(X)]
)
target_image_features = f_features_gram(texture_image_nn_input)
# print target_image_features
print [t.shape for t in target_image_features]

from blocks.graph import (ComputationGraph, apply_batch_normalization,
                          get_batch_normalization_updates)

cg = ComputationGraph(generated_image_graph)
cg_bn = apply_batch_normalization(cg)
pop_updates = get_batch_normalization_updates(cg_bn)

text_generated = texture_features(cg.outputs[0])
gram_generated = [gram_matrix(f) for f in text_generated]

loss = 0
for i in range(len(target_image_features)):
    N = text_generated[i].shape[1]
    M = text_generated[i].shape[2] * text_generated[i].shape[3]
    loss += 1. / (4 * 16 * N ** 2 * M ** 2) * (
        (gram_generated[i] -
         tensor.addbroadcast(theano.shared(target_image_features[i]), 0)) ** 2
    ).sum()

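# `gram_matrix` and `texture_features` are used above but defined elsewhere.
# A minimal sketch of a Gram-matrix helper in the same spirit, assuming each
# feature map is a 4D Theano tensor laid out as (batch, channels, height,
# width), so the result has shape (batch, channels, channels) and broadcasts
# against the stored targets in the loss above:
from theano import tensor

def gram_matrix_sketch(features):
    # Collapse the spatial dimensions: (batch, channels, height * width).
    flat = features.flatten(ndim=3)
    # Channel-by-channel inner products for every example in the batch.
    return tensor.batched_dot(flat, flat.dimshuffle(0, 2, 1))
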
def main(config, use_bokeh=False):
    tr_stream = get_tr_stream(**config)
    # dev_stream = get_dev_stream(**config)

    # Create Theano variables
    logger.info('Creating theano variables')
    source_image = tensor.ftensor4('image')
    target_sentence = tensor.lmatrix('target')
    target_sentence_mask = tensor.matrix('target_mask')
    sampling_input = tensor.ftensor4('input')
    sampling_output = tensor.lmatrix('output')

    # Construct model
    logger.info('Building RNN encoder-decoder')
    cnn_encoder = CNNEncoder(config['batch_norm'])
    image_embedding = cnn_encoder.conv_sequence.apply(source_image)

    if config['use_rnn']:
        encoder = BidirectionalEncoder(config['enc_embed'],
                                       config['enc_nhids'])
        encoder_inputs = image_embedding.dimshuffle(2, 3, 0, 1)
        encoded_images, _ = theano.map(encoder.apply,
                                       sequences=encoder_inputs,
                                       name='parallel_encoders')
    else:
        encoded_images = image_embedding.dimshuffle(2, 3, 0, 1)

    encoded_shape = encoded_images.shape
    annotation_vector = encoded_images.reshape(
        (-1, encoded_shape[2], encoded_shape[3]))
    annotation_vector_mask = tensor.ones(annotation_vector.shape[:2])

    decoder = Decoder(config['trg_vocab_size'], config['dec_embed'],
                      config['dec_nhids'], config['enc_nhids'] * 2)
    cost = decoder.cost(annotation_vector, annotation_vector_mask,
                        target_sentence, target_sentence_mask)

    logger.info('Creating computational graph')
    cg = ComputationGraph(cost)

    # Initialize model
    logger.info('Initializing model')
    cnn_encoder.conv_sequence.weights_init = IsotropicGaussian(
        config['weight_scale'])
    cnn_encoder.conv_sequence.biases_init = Constant(0)
    if config['use_rnn']:
        encoder.weights_init = IsotropicGaussian(config['weight_scale'])
        encoder.biases_init = Constant(0)
        encoder.push_initialization_config()
        encoder.bidir.prototype.weights_init = Orthogonal()
        encoder.initialize()
    decoder.weights_init = IsotropicGaussian(config['weight_scale'])
    decoder.biases_init = Constant(0)
    decoder.push_initialization_config()
    decoder.transition.weights_init = Orthogonal()
    decoder.initialize()
    cnn_encoder.conv_sequence.push_initialization_config()
    cnn_encoder.conv_sequence.initialize()

    # Apply dropout for regularization
    if config['dropout'] < 1.0:
        # dropout is applied to the output of maxout in ghog
        logger.info('Applying dropout')
        dropout_inputs = [x for x in cg.intermediary_variables
                          if x.name == 'maxout_apply_output']
        cg = apply_dropout(cg, dropout_inputs, config['dropout'])

    # Apply weight noise for regularization
    if config['weight_noise_ff'] > 0.0:
        logger.info('Applying weight noise to ff layers')
        cnn_params = Selector(
            cnn_encoder.conv_sequence).get_parameters().values()
        enc_params = []
        if config['use_rnn']:
            enc_params += Selector(encoder.fwd_fork).get_parameters().values()
            enc_params += Selector(encoder.back_fork).get_parameters().values()
        dec_params = Selector(
            decoder.sequence_generator.readout).get_parameters().values()
        dec_params += Selector(
            decoder.sequence_generator.fork).get_parameters().values()
        dec_params += Selector(
            decoder.transition.initial_transformer).get_parameters().values()
        cg = apply_noise(cg, cnn_params + enc_params + dec_params,
                         config['weight_noise_ff'])

    # Apply batch normalization
    if config['batch_norm']:
        logger.info('Applying batch normalization')
        cg = apply_batch_normalization(cg)
        pop_updates = get_batch_normalization_updates(cg)
        extra_updates = [(p, m * 0.05 + p * (1 - 0.05))
                         for p, m in pop_updates]
    else:
        extra_updates = []

    # Print shapes
    shapes = [param.get_value().shape for param in cg.parameters]
    logger.info("Parameter shapes: ")
    for shape, count in Counter(shapes).most_common():
        logger.info('    {:15}: {}'.format(shape, count))
    logger.info("Total number of parameters: {}".format(len(shapes)))

    # Print parameter names
    if config['use_rnn']:
        enc_dec_param_dict = merge(
            Selector(cnn_encoder.conv_sequence).get_parameters(),
            Selector(encoder).get_parameters(),
            Selector(decoder).get_parameters())
    else:
        enc_dec_param_dict = merge(
            Selector(cnn_encoder.conv_sequence).get_parameters(),
            Selector(decoder).get_parameters())
    logger.info("Parameter names: ")
    for name, value in enc_dec_param_dict.items():
        logger.info('    {:15}: {}'.format(value.get_value().shape, name))
    logger.info("Total number of parameters: {}".format(
        len(enc_dec_param_dict)))

    # Set up training model
    logger.info("Building model")
    training_model = Model(cost)

    # Set extensions
    logger.info("Initializing extensions")
    extensions = [
        FinishAfter(after_n_batches=config['finish_after']),
        TrainingDataMonitoring([cost], after_batch=True),
        Printing(after_batch=True),
        CheckpointNMT(config['saveto'], every_n_batches=config['save_freq'])
    ]

    # Set up beam search and sampling computation graphs if necessary
    if config['hook_samples'] >= 1 or config['bleu_script'] is not None:
        logger.info("Building sampling model")
        sampling_image_embedding = cnn_encoder.conv_sequence.apply(
            sampling_input)
        if config['use_rnn']:
            sampling_encoder_inputs = sampling_image_embedding.dimshuffle(
                2, 3, 0, 1)
            sampling_encoded_images, _ = theano.map(
                encoder.apply,
                sequences=sampling_encoder_inputs,
                name='parallel_encoders_inf')
        else:
            sampling_encoded_images = sampling_image_embedding.dimshuffle(
                2, 3, 0, 1)
        sampling_encoded_shape = sampling_encoded_images.shape
        sampling_annotation_vector = sampling_encoded_images.reshape(
            (-1, sampling_encoded_shape[2], sampling_encoded_shape[3]))
        sampling_annotation_vector_mask = tensor.ones(
            sampling_annotation_vector.shape[:2])
        generated = decoder.generate(sampling_annotation_vector)
        search_model = Model(generated)
        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))  # generated[1] is next_outputs

        # Add sampling
        if config['hook_samples'] >= 1:
            logger.info("Building sampler")
            extensions.append(
                Sampler(model=search_model, data_stream=tr_stream,
                        hook_samples=config['hook_samples'],
                        every_n_batches=config['sampling_freq'],
                        trg_vocab=config['trg_vocab']))

        # Add early stopping based on bleu
        if 'bleu_script' in config:
            logger.info("Building bleu validator")
            extensions.append(
                BleuValidator(sampling_input, samples=samples, config=config,
                              model=search_model, data_stream=dev_stream,
                              normalize=config['normalized_bleu'],
                              every_n_batches=config['bleu_val_freq']))

    # Reload model if necessary
    if config['reload']:
        extensions.append(LoadNMT(config['saveto']))

    # Plot cost in bokeh if necessary
    if use_bokeh and BOKEH_AVAILABLE:
        extensions.append(
            Plot('Cs-En', channels=[['decoder_cost_cost']],
                 after_batch=True))

    # Set up training algorithm
    logger.info("Initializing training algorithm")
    algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                                step_rule=CompositeRule([
                                    StepClipping(config['step_clipping']),
                                    eval(config['step_rule'])()]))
    # algorithm.add_updates(extra_updates)

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(model=training_model,
                         algorithm=algorithm,
                         data_stream=tr_stream,
                         extensions=extensions)

    # Train!
    main_loop.run()