def test_get_batch_normalization_updates(self):
    """Test that get_batch_normalization_updates works as expected."""
    with batch_normalization(self.mlp):
        y_bn = self.mlp.apply(self.x)
    graph = ComputationGraph([y_bn])
    updates = get_batch_normalization_updates(graph)
    self.simple_assertions(updates)
def test_batch_normalized_mlp_transformed():
    """Smoke test that a graph involving a BatchNormalizedMLP transforms."""
    x = tensor.matrix('x')
    mlp = BatchNormalizedMLP([Tanh(), Tanh()], [5, 7, 9])
    with batch_normalization(mlp):
        y = mlp.apply(x)
    assert len(get_batch_normalization_updates(ComputationGraph([y]))) == 4
def test_get_batch_normalization_updates_non_training_applications(self):
    """Test updates extraction in a graph with a non-training apply."""
    y = self.mlp.apply(self.x)
    with batch_normalization(self.mlp):
        y_bn = self.mlp.apply(self.x)
    graph = ComputationGraph([y_bn, y])
    updates = get_batch_normalization_updates(graph)
    self.simple_assertions(updates)
def test_get_batch_normalization_updates_duplicates_error(self):
    """Test that we get an error by default on multiple apply."""
    with batch_normalization(self.mlp):
        y = self.mlp.apply(self.x)
        y2 = self.mlp.apply(self.x)
    graph = ComputationGraph([y, y2])
    numpy.testing.assert_raises(ValueError,
                                get_batch_normalization_updates, graph)
def test_get_batch_normalization_updates_allow_duplicates(self):
    """Test get_batch_normalization_updates(allow_duplicates=True)."""
    with batch_normalization(self.mlp):
        y = self.mlp.apply(self.x)
        y2 = self.mlp.apply(self.x)
    graph = ComputationGraph([y, y2])
    updates = get_batch_normalization_updates(graph, allow_duplicates=True)
    self.simple_assertions(updates, num_bricks=2, num_updates=8)
def test_get_batch_normalization_updates_mean_only(self):
    """Test get_batch_normalization_updates with mean_only bricks."""
    mlp = BatchNormalizedMLP([Tanh(), Tanh()], [5, 7, 9], mean_only=True)
    with batch_normalization(mlp):
        y_bn = mlp.apply(self.x)
    graph = ComputationGraph([y_bn])
    updates = get_batch_normalization_updates(graph)
    self.simple_assertions(updates, num_updates=2, mean_only=True)
def __init__(self, params, feature_source, input_dim):
    super(MaxoutMLP, self).__init__(params)
    self.x = tensor.matrix(feature_source, dtype='float32')
    self.y = tensor.matrix('genres', dtype='int32')
    mlp = MLPGenreClassifier(input_dim, self.params['n_classes'],
                             self.params['hidden_size'],
                             self.params['init_ranges'])
    mlp.initialize()
    with batch_normalization(mlp):
        self.y_hat = mlp.apply(self.x)
    self.cost = BinaryCrossEntropy().apply(self.y, self.y_hat)
def __init__(self, params):
    super(MoETrainer, self).__init__(params)
    self.x_v = tensor.matrix('vgg_features', dtype='float32')
    self.x_t = tensor.matrix('features', dtype='float32')
    self.y = tensor.matrix('genres', dtype='int32')
    model = MoEClassifier(params['visual_dim'], params['textual_dim'],
                          params['n_classes'], params['hidden_size'],
                          params['init_ranges'])
    model.initialize()
    with batch_normalization(model):
        self.y_hat = model.apply(self.x_v, self.x_t)
    self.cost = BinaryCrossEntropy().apply(self.y, self.y_hat)
def test_batch_normalization_simple():
    x = tensor.matrix()
    eps = 1e-4
    bn = BatchNormalization(input_dim=4, epsilon=eps)
    bn.initialize()
    with batch_normalization(bn):
        y = bn.apply(x)
    rng = numpy.random.RandomState((2016, 1, 18))
    x_ = rng.uniform(size=(5, 4)).astype(theano.config.floatX)
    y_ = y.eval({x: x_})
    y_expected = (x_ - x_.mean(axis=0)) / numpy.sqrt(x_.var(axis=0) + eps)
    assert_allclose(y_, y_expected, rtol=1e-4)
def __init__(self, params):
    super(ConcatenateTrainer, self).__init__(params)
    x_v = tensor.matrix('vgg_features', dtype='float32')
    x_t = tensor.matrix('features', dtype='float32')
    self.x = tensor.concatenate([x_v, x_t], axis=1)
    self.y = tensor.matrix('genres', dtype='int32')
    input_dim = params['visual_dim'] + params['textual_dim']
    mlp = MLPGenreClassifier(input_dim, self.params['n_classes'],
                             self.params['hidden_size'],
                             self.params['init_ranges'])
    mlp.initialize()
    with batch_normalization(mlp):
        self.y_hat = mlp.apply(self.x)
    self.cost = BinaryCrossEntropy().apply(self.y, self.y_hat)
def test_batch_normalization_nested():
    x = tensor.tensor4()
    eps = 1e-4
    r_dims = (0, 2, 3)
    batch_dims = (5, 4, 3, 2)
    bn = BatchNormalization(input_dim=batch_dims[1:],
                            broadcastable=(False, True, True),
                            epsilon=eps)
    seq = Sequence([bn.apply, Tanh().apply])
    seq.initialize()
    with batch_normalization(seq):
        y = seq.apply(x)
    rng = numpy.random.RandomState((2016, 1, 18))
    x_ = rng.uniform(size=batch_dims).astype(theano.config.floatX)
    y_ = y.eval({x: x_})
    y_expected = numpy.tanh(
        (x_ - x_.mean(axis=r_dims, keepdims=True)) /
        numpy.sqrt(x_.var(axis=r_dims, keepdims=True) + eps))
    assert_allclose(y_, y_expected, rtol=1e-4)
def create_training_computation_graphs(discriminative_regularization):
    x = tensor.tensor4('features')
    pi = numpy.cast[theano.config.floatX](numpy.pi)

    bricks = create_model_bricks()
    encoder_convnet, encoder_mlp, decoder_convnet, decoder_mlp = bricks
    if discriminative_regularization:
        classifier_model = Model(load('celeba_classifier.zip').algorithm.cost)
        selector = Selector(classifier_model.top_bricks)
        classifier_convnet, = selector.select('/convnet').bricks
    random_brick = Random()

    # Initialize conditional variances
    log_sigma_theta = shared_floatx(numpy.zeros((3, 64, 64)),
                                    name='log_sigma_theta')
    add_role(log_sigma_theta, PARAMETER)
    variance_parameters = [log_sigma_theta]
    if discriminative_regularization:
        # We add discriminative regularization for the batch-normalized output
        # of the strided layers of the classifier.
        for layer in classifier_convnet.layers[4::6]:
            log_sigma = shared_floatx(numpy.zeros(layer.get_dim('output')),
                                      name='{}_log_sigma'.format(layer.name))
            add_role(log_sigma, PARAMETER)
            variance_parameters.append(log_sigma)

    # Computation graph creation is encapsulated within this function in order
    # to allow selecting which parts of the graph will use batch statistics
    # for batch normalization and which parts will use population statistics.
    # Specifically, we'd like to use population statistics for the classifier
    # even in the training graph.
    def create_computation_graph():
        # Encode
        phi = encoder_mlp.apply(encoder_convnet.apply(x).flatten(ndim=2))
        nlat = encoder_mlp.output_dim // 2
        mu_phi = phi[:, :nlat]
        log_sigma_phi = phi[:, nlat:]
        # Sample from the approximate posterior
        epsilon = random_brick.theano_rng.normal(size=mu_phi.shape,
                                                 dtype=mu_phi.dtype)
        z = mu_phi + epsilon * tensor.exp(log_sigma_phi)
        # Decode
        mu_theta = decoder_convnet.apply(
            decoder_mlp.apply(z).reshape(
                (-1,) + decoder_convnet.get_dim('input_')))
        log_sigma = log_sigma_theta.dimshuffle('x', 0, 1, 2)

        # Compute KL and reconstruction terms
        kl_term = 0.5 * (
            tensor.exp(2 * log_sigma_phi) + mu_phi ** 2 -
            2 * log_sigma_phi - 1).sum(axis=1)
        reconstruction_term = -0.5 * (
            tensor.log(2 * pi) + 2 * log_sigma +
            (x - mu_theta) ** 2 / tensor.exp(2 * log_sigma)
        ).sum(axis=[1, 2, 3])
        total_reconstruction_term = reconstruction_term

        if discriminative_regularization:
            # Propagate both the input and the reconstruction through the
            # classifier
            acts_cg = ComputationGraph([classifier_convnet.apply(x)])
            acts_hat_cg = ComputationGraph(
                [classifier_convnet.apply(mu_theta)])

            # Retrieve activations of interest and compute discriminative
            # regularization reconstruction terms
            for layer, log_sigma in zip(classifier_convnet.layers[4::6],
                                        variance_parameters[1:]):
                variable_filter = VariableFilter(roles=[OUTPUT],
                                                 bricks=[layer])
                d, = variable_filter(acts_cg)
                d_hat, = variable_filter(acts_hat_cg)
                log_sigma = log_sigma.dimshuffle('x', 0, 1, 2)

                total_reconstruction_term += -0.5 * (
                    tensor.log(2 * pi) + 2 * log_sigma +
                    (d - d_hat) ** 2 / tensor.exp(2 * log_sigma)
                ).sum(axis=[1, 2, 3])

        cost = (kl_term - total_reconstruction_term).mean()

        return ComputationGraph([cost, kl_term, reconstruction_term])

    cg = create_computation_graph()
    with batch_normalization(encoder_convnet, encoder_mlp,
                             decoder_convnet, decoder_mlp):
        bn_cg = create_computation_graph()

    return cg, bn_cg, variance_parameters
def create_training_computation_graphs(z_dim, image_size, net_depth,
                                       discriminative_regularization,
                                       classifer, vintage,
                                       reconstruction_factor, kl_factor,
                                       discriminative_factor, disc_weights):
    x = tensor.tensor4('features')
    pi = numpy.cast[theano.config.floatX](numpy.pi)

    bricks = create_model_bricks(z_dim=z_dim, image_size=image_size,
                                 depth=net_depth)
    encoder_convnet, encoder_mlp, decoder_convnet, decoder_mlp = bricks
    if discriminative_regularization:
        if vintage:
            classifier_model = Model(load(classifer).algorithm.cost)
        else:
            with open(classifer, 'rb') as src:
                classifier_model = Model(load(src).algorithm.cost)
        selector = Selector(classifier_model.top_bricks)
        classifier_convnet, = selector.select('/convnet').bricks
        classifier_mlp, = selector.select('/mlp').bricks
    random_brick = Random()

    # Initialize conditional variances
    log_sigma_theta = shared_floatx(numpy.zeros((3, image_size, image_size)),
                                    name='log_sigma_theta')
    add_role(log_sigma_theta, PARAMETER)
    variance_parameters = [log_sigma_theta]

    num_disc_layers = 0
    if discriminative_regularization:
        # We add discriminative regularization for the batch-normalized output
        # of the strided layers of the classifier.
        for layer in classifier_convnet.layers[1::3]:
            log_sigma = shared_floatx(numpy.zeros(layer.get_dim('output')),
                                      name='{}_log_sigma'.format(layer.name))
            add_role(log_sigma, PARAMETER)
            variance_parameters.append(log_sigma)

        # include mlp
        # DISABLED
        # log_sigma = shared_floatx(
        #     numpy.zeros([classifier_mlp.output_dim]),
        #     name='{}_log_sigma'.format("MLP"))
        # add_role(log_sigma, PARAMETER)
        # variance_parameters.append(log_sigma)

        # diagnostic
        num_disc_layers = len(variance_parameters) - 1
        print("Applying discriminative regularization on {} layers".format(
            num_disc_layers))

    # Computation graph creation is encapsulated within this function in order
    # to allow selecting which parts of the graph will use batch statistics
    # for batch normalization and which parts will use population statistics.
    # Specifically, we'd like to use population statistics for the classifier
    # even in the training graph.
    def create_computation_graph():
        # Encode
        phi = encoder_mlp.apply(encoder_convnet.apply(x).flatten(ndim=2))
        nlat = encoder_mlp.output_dim // 2
        mu_phi = phi[:, :nlat]
        log_sigma_phi = phi[:, nlat:]
        # Sample from the approximate posterior
        epsilon = random_brick.theano_rng.normal(size=mu_phi.shape,
                                                 dtype=mu_phi.dtype)
        z = mu_phi + epsilon * tensor.exp(log_sigma_phi)
        # Decode
        mu_theta = decoder_convnet.apply(
            decoder_mlp.apply(z).reshape(
                (-1,) + decoder_convnet.get_dim('input_')))
        log_sigma = log_sigma_theta.dimshuffle('x', 0, 1, 2)

        # Compute KL and reconstruction terms
        kl_term = 0.5 * (
            tensor.exp(2 * log_sigma_phi) + mu_phi ** 2 -
            2 * log_sigma_phi - 1).sum(axis=1)
        reconstruction_term = -0.5 * (
            tensor.log(2 * pi) + 2 * log_sigma +
            (x - mu_theta) ** 2 / tensor.exp(2 * log_sigma)
        ).sum(axis=[1, 2, 3])

        discriminative_layer_terms = [None] * num_disc_layers
        for i in range(num_disc_layers):
            discriminative_layer_terms[i] = tensor.zeros_like(kl_term)
        discriminative_term = tensor.zeros_like(kl_term)

        if discriminative_regularization:
            # Propagate both the input and the reconstruction through the
            # classifier
            acts_cg = ComputationGraph([
                classifier_mlp.apply(
                    classifier_convnet.apply(x).flatten(ndim=2))])
            acts_hat_cg = ComputationGraph([
                classifier_mlp.apply(
                    classifier_convnet.apply(mu_theta).flatten(ndim=2))])

            # Retrieve activations of interest and compute discriminative
            # regularization reconstruction terms
            cur_layer = 0
            # CLASSIFIER MLP DISABLED
            # for i, zip_pair in enumerate(
            #         zip(classifier_convnet.layers[1::3] + [classifier_mlp],
            for i, zip_pair in enumerate(
                    zip(classifier_convnet.layers[1::3],
                        variance_parameters[1:])):
                layer, log_sigma = zip_pair
                variable_filter = VariableFilter(roles=[OUTPUT],
                                                 bricks=[layer])
                d, = variable_filter(acts_cg)
                d_hat, = variable_filter(acts_hat_cg)

                # TODO: this conditional could be less brittle
                if "mlp" in layer.name.lower():
                    log_sigma = log_sigma.dimshuffle('x', 0)
                    sumaxis = [1]
                else:
                    log_sigma = log_sigma.dimshuffle('x', 0, 1, 2)
                    sumaxis = [1, 2, 3]

                discriminative_layer_term_unweighted = -0.5 * (
                    tensor.log(2 * pi) + 2 * log_sigma +
                    (d - d_hat) ** 2 / tensor.exp(2 * log_sigma)
                ).sum(axis=sumaxis)
                discriminative_layer_terms[i] = (
                    discriminative_factor * disc_weights[cur_layer] *
                    discriminative_layer_term_unweighted)
                discriminative_term = (discriminative_term +
                                       discriminative_layer_terms[i])
                cur_layer = cur_layer + 1

        # scale terms (disc is prescaled by layer)
        reconstruction_term = reconstruction_factor * reconstruction_term
        kl_term = kl_factor * kl_term

        # total_reconstruction_term is reconstruction + discriminative
        total_reconstruction_term = reconstruction_term + discriminative_term

        # cost is mean(kl - total reconstruction)
        cost = (kl_term - total_reconstruction_term).mean()

        return ComputationGraph(
            [cost, kl_term, reconstruction_term, discriminative_term] +
            discriminative_layer_terms)

    cg = create_computation_graph()
    with batch_normalization(encoder_convnet, encoder_mlp,
                             decoder_convnet, decoder_mlp):
        bn_cg = create_computation_graph()

    return cg, bn_cg, variance_parameters
def main(save_to, num_epochs,
         weight_decay=0.0001, noise_pressure=0, subset=None,
         num_batches=None, batch_size=None, histogram=None, resume=False):
    output_size = 10

    prior_noise_level = -10
    noise_step_rule = Scale(1e-6)
    noise_rate = theano.shared(
        numpy.asarray(1e-5, dtype=theano.config.floatX))
    convnet = create_res_net(out_noise=True, tied_noise=True, tied_sigma=True,
                             noise_rate=noise_rate,
                             prior_noise_level=prior_noise_level)

    x = tensor.tensor4('features')
    y = tensor.lmatrix('targets')

    # Normalize input and apply the convnet
    test_probs = convnet.apply(x)
    test_cost = (CategoricalCrossEntropy().apply(y.flatten(), test_probs)
                 .copy(name='cost'))
    test_error_rate = (MisclassificationRate().apply(y.flatten(), test_probs)
                       .copy(name='error_rate'))
    test_confusion = (ConfusionMatrix().apply(y.flatten(), test_probs)
                      .copy(name='confusion'))
    test_confusion.tag.aggregation_scheme = Sum(test_confusion)

    test_cg = ComputationGraph([test_cost, test_error_rate])

    # Apply dropout to all layer outputs except final softmax
    # dropout_vars = VariableFilter(
    #     roles=[OUTPUT], bricks=[Convolutional],
    #     theano_name_regex="^conv_[25]_apply_output$")(test_cg.variables)
    # drop_cg = apply_dropout(test_cg, dropout_vars, 0.5)

    # Apply 0.2 dropout to the pre-averaging layer
    # dropout_vars_2 = VariableFilter(
    #     roles=[OUTPUT], bricks=[Convolutional],
    #     theano_name_regex="^conv_8_apply_output$")(test_cg.variables)
    # train_cg = apply_dropout(test_cg, dropout_vars_2, 0.2)
    # Apply 0.2 dropout to the input, as in the paper
    # train_cg = apply_dropout(test_cg, [x], 0.2)
    # train_cg = drop_cg
    # train_cg = apply_batch_normalization(test_cg)

    # train_cost, train_error_rate, train_components = train_cg.outputs
    with batch_normalization(convnet):
        with training_noise(convnet):
            train_probs = convnet.apply(x)
    train_cost = (CategoricalCrossEntropy().apply(y.flatten(), train_probs)
                  .copy(name='cost'))
    train_components = (ComponentwiseCrossEntropy().apply(
        y.flatten(), train_probs).copy(name='components'))
    train_error_rate = (MisclassificationRate().apply(
        y.flatten(), train_probs).copy(name='error_rate'))
    train_cg = ComputationGraph([train_cost,
                                 train_error_rate, train_components])
    population_updates = get_batch_normalization_updates(train_cg)
    bn_alpha = 0.9
    extra_updates = [(p, p * bn_alpha + m * (1 - bn_alpha))
                     for p, m in population_updates]

    # for annealing
    nit_penalty = theano.shared(
        numpy.asarray(noise_pressure, dtype=theano.config.floatX))
    nit_penalty.name = 'nit_penalty'

    # Compute noise rates for training graph
    train_logsigma = VariableFilter(roles=[LOG_SIGMA])(train_cg.variables)
    train_mean_log_sigma = tensor.concatenate(
        [n.flatten() for n in train_logsigma]).mean()
    train_mean_log_sigma.name = 'mean_log_sigma'
    train_nits = VariableFilter(roles=[NITS])(train_cg.auxiliary_variables)
    train_nit_rate = tensor.concatenate(
        [n.flatten() for n in train_nits]).mean()
    train_nit_rate.name = 'nit_rate'
    train_nit_regularization = nit_penalty * train_nit_rate
    train_nit_regularization.name = 'nit_regularization'

    # Apply regularization to the cost
    trainable_parameters = VariableFilter(roles=[WEIGHT, BIAS])(
        train_cg.parameters)
    mask_parameters = [p for p in trainable_parameters
                       if get_brick(p).name == 'mask']
    noise_parameters = VariableFilter(roles=[NOISE])(train_cg.parameters)
    biases = VariableFilter(roles=[BIAS])(train_cg.parameters)
    weights = VariableFilter(roles=[WEIGHT])(train_cg.variables)
    nonmask_weights = [p for p in weights if get_brick(p).name != 'mask']
    l2_norm = sum([(W ** 2).sum() for W in nonmask_weights])
    l2_norm.name = 'l2_norm'
    l2_regularization = weight_decay * l2_norm
    l2_regularization.name = 'l2_regularization'

    # test version
    test_cost = test_cost + l2_regularization
    test_cost.name = 'cost_with_regularization'

    # Training version of cost
    train_cost_without_regularization = train_cost
    train_cost_without_regularization.name = 'cost_without_regularization'
    train_cost = train_cost + l2_regularization + train_nit_regularization
    train_cost.name = 'cost_with_regularization'

    cifar10_train = CIFAR10(("train",))
    cifar10_train_stream = RandomPadCropFlip(
        NormalizeBatchLevels(DataStream.default_stream(
            cifar10_train, iteration_scheme=ShuffledScheme(
                cifar10_train.num_examples, batch_size)),
            which_sources=('features',)),
        (32, 32), pad=4, which_sources=('features',))

    test_batch_size = 128
    cifar10_test = CIFAR10(("test",))
    cifar10_test_stream = NormalizeBatchLevels(DataStream.default_stream(
        cifar10_test, iteration_scheme=ShuffledScheme(
            cifar10_test.num_examples, test_batch_size)),
        which_sources=('features',))

    momentum = Momentum(0.01, 0.9)

    # Create a step rule that doubles the learning rate of biases, like Caffe.
    # scale_bias = Restrict(Scale(2), biases)
    # step_rule = CompositeRule([scale_bias, momentum])

    # Create a step rule that reduces the learning rate of noise
    scale_mask = Restrict(noise_step_rule, mask_parameters)
    step_rule = CompositeRule([scale_mask, momentum])

    # from theano.compile.nanguardmode import NanGuardMode

    # Train with simple SGD
    algorithm = GradientDescent(
        cost=train_cost, parameters=trainable_parameters,
        step_rule=step_rule)
    algorithm.add_updates(extra_updates)
    # theano_func_kwargs={
    #     'mode': NanGuardMode(
    #         nan_is_error=True, inf_is_error=True, big_is_error=True)})

    exp_name = save_to.replace('.%d', '')

    # `Timing` extension reports time for reading data, aggregating a batch
    # and monitoring;
    # `ProgressBar` displays a nice progress bar during training.
    extensions = [Timing(),
                  FinishAfter(after_n_epochs=num_epochs,
                              after_n_batches=num_batches),
                  EpochSchedule(momentum.learning_rate, [
                      (0, 0.01),     # Warm up with 0.01 learning rate
                      (50, 0.1),     # Then go back to 0.1
                      (100, 0.01),
                      (150, 0.001)
                      # (83, 0.01),  # Follow the schedule in the paper
                      # (125, 0.001)
                  ]),
                  EpochSchedule(noise_step_rule.learning_rate, [
                      (0, 1e-2),
                      (2, 1e-1),
                      (4, 1)
                      # (0, 1e-6),
                      # (2, 1e-5),
                      # (4, 1e-4)
                  ]),
                  EpochSchedule(noise_rate, [
                      (0, 1e-2),
                      (2, 1e-1),
                      (4, 1)
                      # (0, 1e-6),
                      # (2, 1e-5),
                      # (4, 1e-4),
                      # (6, 3e-4),
                      # (8, 1e-3),  # Causes nit rate to jump
                      # (10, 3e-3),
                      # (12, 1e-2),
                      # (15, 3e-2),
                      # (19, 1e-1),
                      # (24, 3e-1),
                      # (30, 1)
                  ]),
                  NoiseExtension(
                      noise_parameters=noise_parameters),
                  NoisyDataStreamMonitoring(
                      [test_cost, test_error_rate, test_confusion],
                      cifar10_test_stream,
                      noise_parameters=noise_parameters,
                      prefix="test"),
                  TrainingDataMonitoring(
                      [train_cost, train_error_rate, train_nit_rate,
                       train_cost_without_regularization,
                       l2_regularization,
                       train_nit_regularization,
                       momentum.learning_rate,
                       train_mean_log_sigma,
                       aggregation.mean(algorithm.total_gradient_norm)],
                      prefix="train",
                      every_n_batches=17),
                      # after_epoch=True),
                  Plot('Training performance for ' + exp_name,
                       channels=[
                           ['train_cost_with_regularization',
                            'train_cost_without_regularization',
                            'train_nit_regularization',
                            'train_l2_regularization'],
                           ['train_error_rate'],
                           ['train_total_gradient_norm'],
                           ['train_mean_log_sigma'],
                       ],
                       every_n_batches=17),
                  Plot('Test performance for ' + exp_name,
                       channels=[[
                           'train_error_rate',
                           'test_error_rate',
                       ]],
                       after_epoch=True),
                  EpochCheckpoint(save_to, use_cpickle=True, after_epoch=True),
                  ProgressBar(),
                  Printing()]

    if histogram:
        attribution = AttributionExtension(
            components=train_components,
            parameters=train_cg.parameters,
            components_size=output_size,
            after_batch=True)
        extensions.insert(0, attribution)

    if resume:
        extensions.append(Load(exp_name, True, True))

    model = Model(train_cost)

    main_loop = MainLoop(
        algorithm,
        cifar10_train_stream,
        model=model,
        extensions=extensions)

    main_loop.run()

    if histogram:
        save_attributions(attribution, filename=histogram)

    with open('execution-log.json', 'w') as outfile:
        json.dump(main_loop.log, outfile, cls=NumpyEncoder)
def main(save_to, num_epochs,
         regularization=0.0001, subset=None, num_batches=None,
         batch_size=None, histogram=None, resume=False):
    output_size = 10
    convnet = create_res_net()

    x = tensor.tensor4('features')
    y = tensor.lmatrix('targets')

    # Normalize input and apply the convnet
    test_probs = convnet.apply(x)
    test_cost = (CategoricalCrossEntropy().apply(y.flatten(), test_probs)
                 .copy(name='cost'))
    test_error_rate = (MisclassificationRate().apply(y.flatten(), test_probs)
                       .copy(name='error_rate'))
    test_confusion = (ConfusionMatrix().apply(y.flatten(), test_probs)
                      .copy(name='confusion'))
    test_confusion.tag.aggregation_scheme = Sum(test_confusion)

    test_cg = ComputationGraph([test_cost, test_error_rate])

    # Apply dropout to all layer outputs except final softmax
    # dropout_vars = VariableFilter(
    #     roles=[OUTPUT], bricks=[Convolutional],
    #     theano_name_regex="^conv_[25]_apply_output$")(test_cg.variables)
    # drop_cg = apply_dropout(test_cg, dropout_vars, 0.5)

    # Apply 0.2 dropout to the pre-averaging layer
    # dropout_vars_2 = VariableFilter(
    #     roles=[OUTPUT], bricks=[Convolutional],
    #     theano_name_regex="^conv_8_apply_output$")(test_cg.variables)
    # train_cg = apply_dropout(test_cg, dropout_vars_2, 0.2)
    # Apply 0.2 dropout to the input, as in the paper
    # train_cg = apply_dropout(test_cg, [x], 0.2)
    # train_cg = drop_cg
    # train_cg = apply_batch_normalization(test_cg)

    # train_cost, train_error_rate, train_components = train_cg.outputs
    with batch_normalization(convnet):
        train_probs = convnet.apply(x)
    train_cost = (CategoricalCrossEntropy().apply(y.flatten(), train_probs)
                  .copy(name='cost'))
    train_components = (ComponentwiseCrossEntropy().apply(
        y.flatten(), train_probs).copy(name='components'))
    train_error_rate = (MisclassificationRate().apply(
        y.flatten(), train_probs).copy(name='error_rate'))
    train_cg = ComputationGraph([train_cost,
                                 train_error_rate, train_components])
    population_updates = get_batch_normalization_updates(train_cg)
    bn_alpha = 0.9
    extra_updates = [(p, p * bn_alpha + m * (1 - bn_alpha))
                     for p, m in population_updates]

    # Apply regularization to the cost
    biases = VariableFilter(roles=[BIAS])(train_cg.parameters)
    weights = VariableFilter(roles=[WEIGHT])(train_cg.variables)
    l2_norm = sum([(W ** 2).sum() for W in weights])
    l2_norm.name = 'l2_norm'
    l2_regularization = regularization * l2_norm
    l2_regularization.name = 'l2_regularization'
    test_cost = test_cost + l2_regularization
    test_cost.name = 'cost_with_regularization'

    # Training version of cost
    train_cost_without_regularization = train_cost
    train_cost_without_regularization.name = 'cost_without_regularization'
    train_cost = train_cost + regularization * l2_norm
    train_cost.name = 'cost_with_regularization'

    cifar10_train = CIFAR10(("train",))
    cifar10_train_stream = RandomPadCropFlip(
        NormalizeBatchLevels(DataStream.default_stream(
            cifar10_train, iteration_scheme=ShuffledScheme(
                cifar10_train.num_examples, batch_size)),
            which_sources=('features',)),
        (32, 32), pad=4, which_sources=('features',))

    test_batch_size = 500
    cifar10_test = CIFAR10(("test",))
    cifar10_test_stream = NormalizeBatchLevels(DataStream.default_stream(
        cifar10_test, iteration_scheme=ShuffledScheme(
            cifar10_test.num_examples, test_batch_size)),
        which_sources=('features',))

    momentum = Momentum(0.01, 0.9)

    # Create a step rule that doubles the learning rate of biases, like Caffe.
    # scale_bias = Restrict(Scale(2), biases)
    # step_rule = CompositeRule([scale_bias, momentum])

    # from theano.compile.nanguardmode import NanGuardMode

    # Train with simple SGD
    algorithm = GradientDescent(
        cost=train_cost, parameters=train_cg.parameters, step_rule=momentum)
    algorithm.add_updates(extra_updates)
    # theano_func_kwargs={
    #     'mode': NanGuardMode(
    #         nan_is_error=True, inf_is_error=True, big_is_error=True)})

    # `Timing` extension reports time for reading data, aggregating a batch
    # and monitoring;
    # `ProgressBar` displays a nice progress bar during training.
    extensions = [Timing(),
                  FinishAfter(after_n_epochs=num_epochs,
                              after_n_batches=num_batches),
                  EpochSchedule(momentum.learning_rate, [
                      (0, 0.01),   # Warm up with 0.01 learning rate
                      (1, 0.1),    # Then go back to 0.1
                      (100, 0.01),
                      (150, 0.001)
                      # (83, 0.01),  # Follow the schedule in the paper
                      # (125, 0.001)
                  ]),
                  DataStreamMonitoring(
                      [test_cost, test_error_rate, test_confusion],
                      cifar10_test_stream,
                      prefix="test"),
                  TrainingDataMonitoring(
                      [train_cost, train_error_rate,
                       train_cost_without_regularization,
                       l2_regularization,
                       momentum.learning_rate,
                       aggregation.mean(algorithm.total_gradient_norm)],
                      prefix="train",
                      every_n_batches=17),
                      # after_epoch=True),
                  Plot('Training performance for ' + save_to,
                       channels=[
                           ['train_cost_with_regularization',
                            'train_cost_without_regularization',
                            'train_l2_regularization'],
                           ['train_error_rate'],
                           ['train_total_gradient_norm'],
                       ],
                       every_n_batches=17),
                  Plot('Test performance for ' + save_to,
                       channels=[[
                           'train_error_rate',
                           'test_error_rate',
                       ]],
                       after_epoch=True),
                  Checkpoint(save_to, use_cpickle=True),
                  ProgressBar(),
                  Printing()]

    if histogram:
        attribution = AttributionExtension(
            components=train_components,
            parameters=train_cg.parameters,
            components_size=output_size,
            after_batch=True)
        extensions.insert(0, attribution)

    if resume:
        extensions.append(Load(save_to, True, True))

    model = Model(train_cost)

    main_loop = MainLoop(
        algorithm,
        cifar10_train_stream,
        model=model,
        extensions=extensions)

    main_loop.run()

    if histogram:
        save_attributions(attribution, filename=histogram)

    with open('execution-log.json', 'w') as outfile:
        json.dump(main_loop.log, outfile, cls=NumpyEncoder)