Example #1
 def test_get_batch_normalization_updates(self):
     """Test that get_batch_normalization_updates works as expected."""
     with batch_normalization(self.mlp):
         y_bn = self.mlp.apply(self.x)
     graph = ComputationGraph([y_bn])
     updates = get_batch_normalization_updates(graph)
     self.simple_assertions(updates)
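The pairs returned by `get_batch_normalization_updates` are `(population statistic, minibatch statistic)` tuples. As a minimal sketch of how they are usually consumed during training, here is the exponential-moving-average pattern also used in Examples #14 and #15 below (the decay value and the `algorithm` object are illustrative assumptions, not part of the API):

# Sketch only: `algorithm` is assumed to be a blocks GradientDescent
# instance built elsewhere; alpha is an illustrative EMA decay.
alpha = 0.9
extra_updates = [(pop, pop * alpha + batch * (1 - alpha))
                 for pop, batch in updates]
algorithm.add_updates(extra_updates)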
Example #2
def test_batch_normalized_mlp_transformed():
    """Smoke test that a graph involving a BatchNormalizedMLP transforms."""
    x = tensor.matrix('x')
    mlp = BatchNormalizedMLP([Tanh(), Tanh()], [5, 7, 9])
    with batch_normalization(mlp):
        y = mlp.apply(x)
    assert len(get_batch_normalization_updates(ComputationGraph([y]))) == 4
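    # 4 = 2 BatchNormalization bricks x (population mean, population stdev)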
Example #3
 def test_get_batch_normalization_updates_non_training_applications(self):
     """Test updates extracton in graph with non-training apply."""
     y = self.mlp.apply(self.x)
     with batch_normalization(self.mlp):
         y_bn = self.mlp.apply(self.x)
     graph = ComputationGraph([y_bn, y])
     updates = get_batch_normalization_updates(graph)
     self.simple_assertions(updates)
Example #4
 def test_get_batch_normalization_updates_duplicates_error(self):
     """Test that we get an error by default on multiple apply."""
     with batch_normalization(self.mlp):
         y = self.mlp.apply(self.x)
         y2 = self.mlp.apply(self.x)
     graph = ComputationGraph([y, y2])
     numpy.testing.assert_raises(ValueError,
                                 get_batch_normalization_updates, graph)
Example #5
 def test_get_batch_normalization_updates_allow_duplicates(self):
     """Test get_batch_normalization_updates(allow_duplicates=True)."""
     with batch_normalization(self.mlp):
         y = self.mlp.apply(self.x)
         y2 = self.mlp.apply(self.x)
     graph = ComputationGraph([y, y2])
     updates = get_batch_normalization_updates(graph, allow_duplicates=True)
     self.simple_assertions(updates, num_bricks=2, num_updates=8)
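     # 8 = 2 applications x 2 BatchNormalization bricks x (mean, stdev)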
Example #6
 def test_get_batch_normalization_updates_mean_only(self):
     """Test get_batch_normalization_updates with mean_only bricks."""
     mlp = BatchNormalizedMLP([Tanh(), Tanh()], [5, 7, 9], mean_only=True)
     with batch_normalization(mlp):
         y_bn = mlp.apply(self.x)
     graph = ComputationGraph([y_bn])
     updates = get_batch_normalization_updates(graph)
     self.simple_assertions(updates, num_updates=2, mean_only=True)
Example #7
 def __init__(self, params, feature_source, input_dim):
     super(MaxoutMLP, self).__init__(params)
     self.x = tensor.matrix(feature_source, dtype='float32')
     self.y = tensor.matrix('genres', dtype='int32')
     mlp = MLPGenreClassifier(input_dim, self.params['n_classes'],
                              self.params['hidden_size'],
                              self.params['init_ranges'])
     mlp.initialize()
     with batch_normalization(mlp):
         self.y_hat = mlp.apply(self.x)
     self.cost = BinaryCrossEntropy().apply(self.y, self.y_hat)
Example #8
 def __init__(self, params):
     super(MoETrainer, self).__init__(params)
     self.x_v = tensor.matrix('vgg_features', dtype='float32')
     self.x_t = tensor.matrix('features', dtype='float32')
     self.y = tensor.matrix('genres', dtype='int32')
     model = MoEClassifier(params['visual_dim'], params['textual_dim'],
                           params['n_classes'], params['hidden_size'],
                           params['init_ranges'])
     model.initialize()
     with batch_normalization(model):
         self.y_hat = model.apply(self.x_v, self.x_t)
     self.cost = BinaryCrossEntropy().apply(self.y, self.y_hat)
Example #9
def test_batch_normalization_simple():
    x = tensor.matrix()
    eps = 1e-4
    bn = BatchNormalization(input_dim=4, epsilon=eps)
    bn.initialize()
    with batch_normalization(bn):
        y = bn.apply(x)
    rng = numpy.random.RandomState((2016, 1, 18))
    x_ = rng.uniform(size=(5, 4)).astype(theano.config.floatX)
    y_ = y.eval({x: x_})
    y_expected = (x_ - x_.mean(axis=0)) / numpy.sqrt(x_.var(axis=0) + eps)
    assert_allclose(y_, y_expected, rtol=1e-4)
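Outside the `batch_normalization` context manager the same `apply` call uses the brick's population statistics rather than the minibatch ones, which is the behaviour wanted in an inference graph. A hedged sketch under that assumption:

# Inference-mode sketch: with no context manager, `apply` is assumed to
# fall back to the population statistics stored on the brick as shared
# variables.
y_inference = bn.apply(x)
# During training those statistics would be refreshed via
# get_batch_normalization_updates, as in Example #1.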
Example #10
 def __init__(self, params):
     super(ConcatenateTrainer, self).__init__(params)
     x_v = tensor.matrix('vgg_features', dtype='float32')
     x_t = tensor.matrix('features', dtype='float32')
     self.x = tensor.concatenate([x_v, x_t], axis=1)
     self.y = tensor.matrix('genres', dtype='int32')
     input_dim = params['visual_dim'] + params['textual_dim']
     mlp = MLPGenreClassifier(input_dim, self.params['n_classes'],
                              self.params['hidden_size'],
                              self.params['init_ranges'])
     mlp.initialize()
     with batch_normalization(mlp):
         self.y_hat = mlp.apply(self.x)
     self.cost = BinaryCrossEntropy().apply(self.y, self.y_hat)
Example #11
def test_batch_normalization_nested():
    x = tensor.tensor4()
    eps = 1e-4
    r_dims = (0, 2, 3)
    batch_dims = (5, 4, 3, 2)
    bn = BatchNormalization(input_dim=batch_dims[1:],
                            broadcastable=(False, True, True),
                            epsilon=eps)
    seq = Sequence([bn.apply, Tanh().apply])
    seq.initialize()
    with batch_normalization(seq):
        y = seq.apply(x)
    rng = numpy.random.RandomState((2016, 1, 18))
    x_ = rng.uniform(size=batch_dims).astype(theano.config.floatX)
    y_ = y.eval({x: x_})
    y_expected = numpy.tanh(
        (x_ - x_.mean(axis=r_dims, keepdims=True)) /
        numpy.sqrt(x_.var(axis=r_dims, keepdims=True) + eps))
    assert_allclose(y_, y_expected, rtol=1e-4)
Example #12
def create_training_computation_graphs(discriminative_regularization):
    x = tensor.tensor4('features')
    pi = numpy.cast[theano.config.floatX](numpy.pi)

    bricks = create_model_bricks()
    encoder_convnet, encoder_mlp, decoder_convnet, decoder_mlp = bricks
    if discriminative_regularization:
        classifier_model = Model(load('celeba_classifier.zip').algorithm.cost)
        selector = Selector(classifier_model.top_bricks)
        classifier_convnet, = selector.select('/convnet').bricks
    random_brick = Random()

    # Initialize conditional variances
    log_sigma_theta = shared_floatx(numpy.zeros((3, 64, 64)),
                                    name='log_sigma_theta')
    add_role(log_sigma_theta, PARAMETER)
    variance_parameters = [log_sigma_theta]
    if discriminative_regularization:
        # We add discriminative regularization for the batch-normalized output
        # of the strided layers of the classifier.
        for layer in classifier_convnet.layers[4::6]:
            log_sigma = shared_floatx(numpy.zeros(layer.get_dim('output')),
                                      name='{}_log_sigma'.format(layer.name))
            add_role(log_sigma, PARAMETER)
            variance_parameters.append(log_sigma)

    # Computation graph creation is encapsulated within this function in order
    # to allow selecting which parts of the graph will use batch statistics for
    # batch normalization and which parts will use population statistics.
    # Specifically, we'd like to use population statistics for the classifier
    # even in the training graph.
    def create_computation_graph():
        # Encode
        phi = encoder_mlp.apply(encoder_convnet.apply(x).flatten(ndim=2))
        nlat = encoder_mlp.output_dim // 2
        mu_phi = phi[:, :nlat]
        log_sigma_phi = phi[:, nlat:]
        # Sample from the approximate posterior
        epsilon = random_brick.theano_rng.normal(size=mu_phi.shape,
                                                 dtype=mu_phi.dtype)
        z = mu_phi + epsilon * tensor.exp(log_sigma_phi)
        # Decode
        mu_theta = decoder_convnet.apply(
            decoder_mlp.apply(z).reshape((-1, ) +
                                         decoder_convnet.get_dim('input_')))
        log_sigma = log_sigma_theta.dimshuffle('x', 0, 1, 2)

        # Compute KL and reconstruction terms
        kl_term = 0.5 * (tensor.exp(2 * log_sigma_phi) + mu_phi**2 -
                         2 * log_sigma_phi - 1).sum(axis=1)
        reconstruction_term = -0.5 * (
            tensor.log(2 * pi) + 2 * log_sigma +
            (x - mu_theta)**2 / tensor.exp(2 * log_sigma)).sum(axis=[1, 2, 3])
        total_reconstruction_term = reconstruction_term

        if discriminative_regularization:
            # Propagate both the input and the reconstruction through the
            # classifier
            acts_cg = ComputationGraph([classifier_convnet.apply(x)])
            acts_hat_cg = ComputationGraph(
                [classifier_convnet.apply(mu_theta)])

            # Retrieve activations of interest and compute discriminative
            # regularization reconstruction terms
            for layer, log_sigma in zip(classifier_convnet.layers[4::6],
                                        variance_parameters[1:]):
                variable_filter = VariableFilter(roles=[OUTPUT],
                                                 bricks=[layer])
                d, = variable_filter(acts_cg)
                d_hat, = variable_filter(acts_hat_cg)
                log_sigma = log_sigma.dimshuffle('x', 0, 1, 2)

                total_reconstruction_term += -0.5 * (
                    tensor.log(2 * pi) + 2 * log_sigma +
                    (d - d_hat)**2 / tensor.exp(2 * log_sigma)).sum(
                        axis=[1, 2, 3])

        cost = (kl_term - total_reconstruction_term).mean()

        return ComputationGraph([cost, kl_term, reconstruction_term])

    cg = create_computation_graph()
    with batch_normalization(encoder_convnet, encoder_mlp, decoder_convnet,
                             decoder_mlp):
        bn_cg = create_computation_graph()

    return cg, bn_cg, variance_parameters
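Since only `bn_cg` contains the minibatch statistics, the population-statistics updates are collected from it rather than from `cg`. A hedged sketch of the caller's side (the decay and wiring are assumptions, mirroring the pattern in Examples #14 and #15):

# Sketch only: collect the (population, minibatch) pairs from the
# batch-normalized graph and turn them into EMA updates.
cg, bn_cg, variance_parameters = create_training_computation_graphs(False)
pop_updates = get_batch_normalization_updates(bn_cg)
extra_updates = [(p, 0.9 * p + 0.1 * m) for p, m in pop_updates]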
Example #13
def create_training_computation_graphs(z_dim, image_size, net_depth,
                                       discriminative_regularization,
                                       classifer, vintage,
                                       reconstruction_factor, kl_factor,
                                       discriminative_factor, disc_weights):
    x = tensor.tensor4('features')
    pi = numpy.cast[theano.config.floatX](numpy.pi)

    bricks = create_model_bricks(z_dim=z_dim,
                                 image_size=image_size,
                                 depth=net_depth)
    encoder_convnet, encoder_mlp, decoder_convnet, decoder_mlp = bricks
    if discriminative_regularization:
        if vintage:
            classifier_model = Model(load(classifer).algorithm.cost)
        else:
            with open(classifer, 'rb') as src:
                classifier_model = Model(load(src).algorithm.cost)
        selector = Selector(classifier_model.top_bricks)
        classifier_convnet, = selector.select('/convnet').bricks
        classifier_mlp, = selector.select('/mlp').bricks

    random_brick = Random()

    # Initialize conditional variances
    log_sigma_theta = shared_floatx(numpy.zeros((3, image_size, image_size)),
                                    name='log_sigma_theta')
    add_role(log_sigma_theta, PARAMETER)
    variance_parameters = [log_sigma_theta]
    num_disc_layers = 0
    if discriminative_regularization:
        # We add discriminative regularization for the batch-normalized output
        # of the strided layers of the classifier.
        for layer in classifier_convnet.layers[1::3]:
            log_sigma = shared_floatx(numpy.zeros(layer.get_dim('output')),
                                      name='{}_log_sigma'.format(layer.name))
            add_role(log_sigma, PARAMETER)
            variance_parameters.append(log_sigma)
        # include mlp
        # DISABLED
        # log_sigma = shared_floatx(
        #     numpy.zeros([classifier_mlp.output_dim]),
        #     name='{}_log_sigma'.format("MLP"))
        # add_role(log_sigma, PARAMETER)
        # variance_parameters.append(log_sigma)
        # diagnostic
        num_disc_layers = len(variance_parameters) - 1
        print("Applying discriminative regularization on {} layers".format(
            num_disc_layers))

    # Computation graph creation is encapsulated within this function in order
    # to allow selecting which parts of the graph will use batch statistics for
    # batch normalization and which parts will use population statistics.
    # Specifically, we'd like to use population statistics for the classifier
    # even in the training graph.
    def create_computation_graph():
        # Encode
        phi = encoder_mlp.apply(encoder_convnet.apply(x).flatten(ndim=2))
        nlat = encoder_mlp.output_dim // 2
        mu_phi = phi[:, :nlat]
        log_sigma_phi = phi[:, nlat:]
        # Sample from the approximate posterior
        epsilon = random_brick.theano_rng.normal(size=mu_phi.shape,
                                                 dtype=mu_phi.dtype)
        z = mu_phi + epsilon * tensor.exp(log_sigma_phi)
        # Decode
        mu_theta = decoder_convnet.apply(
            decoder_mlp.apply(z).reshape((-1, ) +
                                         decoder_convnet.get_dim('input_')))
        log_sigma = log_sigma_theta.dimshuffle('x', 0, 1, 2)

        # Compute KL and reconstruction terms
        kl_term = 0.5 * (tensor.exp(2 * log_sigma_phi) + mu_phi**2 -
                         2 * log_sigma_phi - 1).sum(axis=1)

        reconstruction_term = -0.5 * (
            tensor.log(2 * pi) + 2 * log_sigma +
            (x - mu_theta)**2 / tensor.exp(2 * log_sigma)).sum(axis=[1, 2, 3])

        discriminative_layer_terms = [None] * num_disc_layers
        for i in range(num_disc_layers):
            discriminative_layer_terms[i] = tensor.zeros_like(kl_term)
        discriminative_term = tensor.zeros_like(kl_term)
        if discriminative_regularization:
            # Propagate both the input and the reconstruction through the classifier
            acts_cg = ComputationGraph([
                classifier_mlp.apply(
                    classifier_convnet.apply(x).flatten(ndim=2))
            ])
            acts_hat_cg = ComputationGraph([
                classifier_mlp.apply(
                    classifier_convnet.apply(mu_theta).flatten(ndim=2))
            ])

            # Retrieve activations of interest and compute discriminative
            # regularization reconstruction terms
            cur_layer = 0
            # CLASSIFIER MLP DISABLED
            # for i, zip_pair in enumerate(zip(classifier_convnet.layers[1::3] + [classifier_mlp],
            for i, zip_pair in enumerate(
                    zip(classifier_convnet.layers[1::3],
                        variance_parameters[1:])):

                layer, log_sigma = zip_pair
                variable_filter = VariableFilter(roles=[OUTPUT],
                                                 bricks=[layer])

                d, = variable_filter(acts_cg)
                d_hat, = variable_filter(acts_hat_cg)

                # TODO: this conditional could be less brittle
                if "mlp" in layer.name.lower():
                    log_sigma = log_sigma.dimshuffle('x', 0)
                    sumaxis = [1]
                else:
                    log_sigma = log_sigma.dimshuffle('x', 0, 1, 2)
                    sumaxis = [1, 2, 3]

                discriminative_layer_term_unweighted = -0.5 * (
                    tensor.log(2 * pi) + 2 * log_sigma +
                    (d - d_hat)**2 / tensor.exp(2 * log_sigma)).sum(
                        axis=sumaxis)

                discriminative_layer_terms[i] = (
                    discriminative_factor * disc_weights[cur_layer] *
                    discriminative_layer_term_unweighted)
                discriminative_term = (discriminative_term +
                                       discriminative_layer_terms[i])

                cur_layer = cur_layer + 1

        # scale terms (disc is prescaled by layer)
        reconstruction_term = reconstruction_factor * reconstruction_term
        kl_term = kl_factor * kl_term

        # total_reconstruction_term is reconstruction + discriminative
        total_reconstruction_term = reconstruction_term + discriminative_term

        # cost is mean(kl - total reconstruction)
        cost = (kl_term - total_reconstruction_term).mean()

        return ComputationGraph(
            [cost, kl_term, reconstruction_term, discriminative_term] +
            discriminative_layer_terms)

    cg = create_computation_graph()
    with batch_normalization(encoder_convnet, encoder_mlp, decoder_convnet,
                             decoder_mlp):
        bn_cg = create_computation_graph()

    return cg, bn_cg, variance_parameters
Example #14
def main(save_to, num_epochs,
         weight_decay=0.0001, noise_pressure=0, subset=None, num_batches=None,
         batch_size=None, histogram=None, resume=False):
    output_size = 10

    prior_noise_level = -10
    noise_step_rule = Scale(1e-6)
    noise_rate = theano.shared(numpy.asarray(1e-5, dtype=theano.config.floatX))
    convnet = create_res_net(out_noise=True, tied_noise=True, tied_sigma=True,
            noise_rate=noise_rate,
            prior_noise_level=prior_noise_level)

    x = tensor.tensor4('features')
    y = tensor.lmatrix('targets')

    # Normalize input and apply the convnet
    test_probs = convnet.apply(x)
    test_cost = (CategoricalCrossEntropy().apply(y.flatten(), test_probs)
            .copy(name='cost'))
    test_error_rate = (MisclassificationRate().apply(y.flatten(), test_probs)
                  .copy(name='error_rate'))
    test_confusion = (ConfusionMatrix().apply(y.flatten(), test_probs)
                  .copy(name='confusion'))
    test_confusion.tag.aggregation_scheme = Sum(test_confusion)

    test_cg = ComputationGraph([test_cost, test_error_rate])

    # Apply dropout to all layer outputs except final softmax
    # dropout_vars = VariableFilter(
    #         roles=[OUTPUT], bricks=[Convolutional],
    #         theano_name_regex="^conv_[25]_apply_output$")(test_cg.variables)
    # drop_cg = apply_dropout(test_cg, dropout_vars, 0.5)

    # Apply 0.2 dropout to the pre-averaging layer
    # dropout_vars_2 = VariableFilter(
    #         roles=[OUTPUT], bricks=[Convolutional],
    #         theano_name_regex="^conv_8_apply_output$")(test_cg.variables)
    # train_cg = apply_dropout(test_cg, dropout_vars_2, 0.2)

    # Apply 0.2 dropout to the input, as in the paper
    # train_cg = apply_dropout(test_cg, [x], 0.2)
    # train_cg = drop_cg
    # train_cg = apply_batch_normalization(test_cg)

    # train_cost, train_error_rate, train_components = train_cg.outputs

    with batch_normalization(convnet):
        with training_noise(convnet):
            train_probs = convnet.apply(x)
    train_cost = (CategoricalCrossEntropy().apply(y.flatten(), train_probs)
                .copy(name='cost'))
    train_components = (ComponentwiseCrossEntropy().apply(y.flatten(),
                train_probs).copy(name='components'))
    train_error_rate = (MisclassificationRate().apply(y.flatten(),
                train_probs).copy(name='error_rate'))
    train_cg = ComputationGraph([train_cost,
                train_error_rate, train_components])
    population_updates = get_batch_normalization_updates(train_cg)
    bn_alpha = 0.9
    extra_updates = [(p, p * bn_alpha + m * (1 - bn_alpha))
                for p, m in population_updates]

    # for annealing
    nit_penalty = theano.shared(numpy.asarray(noise_pressure, dtype=theano.config.floatX))
    nit_penalty.name = 'nit_penalty'

    # Compute noise rates for training graph
    train_logsigma = VariableFilter(roles=[LOG_SIGMA])(train_cg.variables)
    train_mean_log_sigma = tensor.concatenate([n.flatten() for n in train_logsigma]).mean()
    train_mean_log_sigma.name = 'mean_log_sigma'
    train_nits = VariableFilter(roles=[NITS])(train_cg.auxiliary_variables)
    train_nit_rate = tensor.concatenate([n.flatten() for n in train_nits]).mean()
    train_nit_rate.name = 'nit_rate'
    train_nit_regularization = nit_penalty * train_nit_rate
    train_nit_regularization.name = 'nit_regularization'

    # Apply regularization to the cost
    trainable_parameters = VariableFilter(roles=[WEIGHT, BIAS])(
            train_cg.parameters)
    mask_parameters = [p for p in trainable_parameters
            if get_brick(p).name == 'mask']
    noise_parameters = VariableFilter(roles=[NOISE])(train_cg.parameters)
    biases = VariableFilter(roles=[BIAS])(train_cg.parameters)
    weights = VariableFilter(roles=[WEIGHT])(train_cg.variables)
    nonmask_weights = [p for p in weights if get_brick(p).name != 'mask']
    l2_norm = sum([(W ** 2).sum() for W in nonmask_weights])
    l2_norm.name = 'l2_norm'
    l2_regularization = weight_decay * l2_norm
    l2_regularization.name = 'l2_regularization'

    # testversion
    test_cost = test_cost + l2_regularization
    test_cost.name = 'cost_with_regularization'

    # Training version of cost
    train_cost_without_regularization = train_cost
    train_cost_without_regularization.name = 'cost_without_regularization'
    train_cost = train_cost + l2_regularization + train_nit_regularization
    train_cost.name = 'cost_with_regularization'

    cifar10_train = CIFAR10(("train",))
    cifar10_train_stream = RandomPadCropFlip(
        NormalizeBatchLevels(DataStream.default_stream(
            cifar10_train, iteration_scheme=ShuffledScheme(
                cifar10_train.num_examples, batch_size)),
        which_sources=('features',)),
        (32, 32), pad=4, which_sources=('features',))

    test_batch_size = 128
    cifar10_test = CIFAR10(("test",))
    cifar10_test_stream = NormalizeBatchLevels(DataStream.default_stream(
        cifar10_test,
        iteration_scheme=ShuffledScheme(
            cifar10_test.num_examples, test_batch_size)),
        which_sources=('features',))

    momentum = Momentum(0.01, 0.9)

    # Create a step rule that doubles the learning rate of biases, like Caffe.
    # scale_bias = Restrict(Scale(2), biases)
    # step_rule = CompositeRule([scale_bias, momentum])

    # Create a step rule that reduces the learning rate of noise
    scale_mask = Restrict(noise_step_rule, mask_parameters)
    step_rule = CompositeRule([scale_mask, momentum])

    # from theano.compile.nanguardmode import NanGuardMode

    # Train with simple SGD
    algorithm = GradientDescent(
        cost=train_cost, parameters=trainable_parameters,
        step_rule=step_rule)
    algorithm.add_updates(extra_updates)

    #,
    #    theano_func_kwargs={
    #        'mode': NanGuardMode(
    #            nan_is_error=True, inf_is_error=True, big_is_error=True)})

    exp_name = save_to.replace('.%d', '')

    # `Timing` extension reports time for reading data, aggregating a batch
    # and monitoring;
    # `ProgressBar` displays a nice progress bar during training.
    extensions = [Timing(),
                  FinishAfter(after_n_epochs=num_epochs,
                              after_n_batches=num_batches),
                  EpochSchedule(momentum.learning_rate, [
                      (0, 0.01),     # Warm up with 0.01 learning rate
                      (50, 0.1),     # Then go back to 0.1
                      (100, 0.01),
                      (150, 0.001)
                      # (83, 0.01),  # Follow the schedule in the paper
                      # (125, 0.001)
                  ]),
                  EpochSchedule(noise_step_rule.learning_rate, [
                      (0, 1e-2),
                      (2, 1e-1),
                      (4, 1)
                      # (0, 1e-6),
                      # (2, 1e-5),
                      # (4, 1e-4)
                  ]),
                  EpochSchedule(noise_rate, [
                      (0, 1e-2),
                      (2, 1e-1),
                      (4, 1)
                      # (0, 1e-6),
                      # (2, 1e-5),
                      # (4, 1e-4),
                      # (6, 3e-4),
                      # (8, 1e-3), # Causes nit rate to jump
                      # (10, 3e-3),
                      # (12, 1e-2),
                      # (15, 3e-2),
                      # (19, 1e-1),
                      # (24, 3e-1),
                      # (30, 1)
                  ]),
                  NoiseExtension(
                      noise_parameters=noise_parameters),
                  NoisyDataStreamMonitoring(
                      [test_cost, test_error_rate, test_confusion],
                      cifar10_test_stream,
                      noise_parameters=noise_parameters,
                      prefix="test"),
                  TrainingDataMonitoring(
                      [train_cost, train_error_rate, train_nit_rate,
                       train_cost_without_regularization,
                       l2_regularization,
                       train_nit_regularization,
                       momentum.learning_rate,
                       train_mean_log_sigma,
                       aggregation.mean(algorithm.total_gradient_norm)],
                      prefix="train",
                      every_n_batches=17),
                      # after_epoch=True),
                  Plot('Training performance for ' + exp_name,
                      channels=[
                          ['train_cost_with_regularization',
                           'train_cost_without_regularization',
                           'train_nit_regularization',
                           'train_l2_regularization'],
                          ['train_error_rate'],
                          ['train_total_gradient_norm'],
                          ['train_mean_log_sigma'],
                      ],
                      every_n_batches=17),
                  Plot('Test performance for ' + exp_name,
                      channels=[[
                          'train_error_rate',
                          'test_error_rate',
                          ]],
                      after_epoch=True),
                  EpochCheckpoint(save_to, use_cpickle=True, after_epoch=True),
                  ProgressBar(),
                  Printing()]

    if histogram:
        attribution = AttributionExtension(
            components=train_components,
            parameters=train_cg.parameters,
            components_size=output_size,
            after_batch=True)
        extensions.insert(0, attribution)

    if resume:
        extensions.append(Load(exp_name, True, True))

    model = Model(train_cost)

    main_loop = MainLoop(
        algorithm,
        cifar10_train_stream,
        model=model,
        extensions=extensions)

    main_loop.run()

    if histogram:
        save_attributions(attribution, filename=histogram)

    with open('execution-log.json', 'w') as outfile:
        json.dump(main_loop.log, outfile, cls=NumpyEncoder)
Example #15
def main(save_to, num_epochs,
         regularization=0.0001, subset=None, num_batches=None,
         batch_size=None, histogram=None, resume=False):
    output_size = 10
    convnet = create_res_net()

    x = tensor.tensor4('features')
    y = tensor.lmatrix('targets')

    # Normalize input and apply the convnet
    test_probs = convnet.apply(x)
    test_cost = (CategoricalCrossEntropy().apply(y.flatten(), test_probs)
            .copy(name='cost'))
    test_error_rate = (MisclassificationRate().apply(y.flatten(), test_probs)
                  .copy(name='error_rate'))
    test_confusion = (ConfusionMatrix().apply(y.flatten(), test_probs)
                  .copy(name='confusion'))
    test_confusion.tag.aggregation_scheme = Sum(test_confusion)

    test_cg = ComputationGraph([test_cost, test_error_rate])

    # Apply dropout to all layer outputs except final softmax
    # dropout_vars = VariableFilter(
    #         roles=[OUTPUT], bricks=[Convolutional],
    #         theano_name_regex="^conv_[25]_apply_output$")(test_cg.variables)
    # drop_cg = apply_dropout(test_cg, dropout_vars, 0.5)

    # Apply 0.2 dropout to the pre-averaging layer
    # dropout_vars_2 = VariableFilter(
    #         roles=[OUTPUT], bricks=[Convolutional],
    #         theano_name_regex="^conv_8_apply_output$")(test_cg.variables)
    # train_cg = apply_dropout(test_cg, dropout_vars_2, 0.2)

    # Apply 0.2 dropout to the input, as in the paper
    # train_cg = apply_dropout(test_cg, [x], 0.2)
    # train_cg = drop_cg
    # train_cg = apply_batch_normalization(test_cg)

    # train_cost, train_error_rate, train_components = train_cg.outputs

    with batch_normalization(convnet):
        train_probs = convnet.apply(x)
    train_cost = (CategoricalCrossEntropy().apply(y.flatten(), train_probs)
                .copy(name='cost'))
    train_components = (ComponentwiseCrossEntropy().apply(y.flatten(),
                train_probs).copy(name='components'))
    train_error_rate = (MisclassificationRate().apply(y.flatten(),
                train_probs).copy(name='error_rate'))
    train_cg = ComputationGraph([train_cost,
                train_error_rate, train_components])
    population_updates = get_batch_normalization_updates(train_cg)
    bn_alpha = 0.9
    extra_updates = [(p, p * bn_alpha + m * (1 - bn_alpha))
                for p, m in population_updates]

    # Apply regularization to the cost
    biases = VariableFilter(roles=[BIAS])(train_cg.parameters)
    weights = VariableFilter(roles=[WEIGHT])(train_cg.variables)
    l2_norm = sum([(W ** 2).sum() for W in weights])
    l2_norm.name = 'l2_norm'
    l2_regularization = regularization * l2_norm
    l2_regularization.name = 'l2_regularization'
    test_cost = test_cost + l2_regularization
    test_cost.name = 'cost_with_regularization'

    # Training version of cost
    train_cost_without_regularization = train_cost
    train_cost_without_regularization.name = 'cost_without_regularization'
    train_cost = train_cost + regularization * l2_norm
    train_cost.name = 'cost_with_regularization'

    cifar10_train = CIFAR10(("train",))
    cifar10_train_stream = RandomPadCropFlip(
        NormalizeBatchLevels(DataStream.default_stream(
            cifar10_train, iteration_scheme=ShuffledScheme(
                cifar10_train.num_examples, batch_size)),
        which_sources=('features',)),
        (32, 32), pad=4, which_sources=('features',))

    test_batch_size = 500
    cifar10_test = CIFAR10(("test",))
    cifar10_test_stream = NormalizeBatchLevels(DataStream.default_stream(
        cifar10_test,
        iteration_scheme=ShuffledScheme(
            cifar10_test.num_examples, test_batch_size)),
        which_sources=('features',))

    momentum = Momentum(0.01, 0.9)

    # Create a step rule that doubles the learning rate of biases, like Caffe.
    # scale_bias = Restrict(Scale(2), biases)
    # step_rule = CompositeRule([scale_bias, momentum])

    # from theano.compile.nanguardmode import NanGuardMode

    # Train with simple SGD
    algorithm = GradientDescent(
        cost=train_cost, parameters=train_cg.parameters,
        step_rule=momentum)
    algorithm.add_updates(extra_updates)

    #,
    #    theano_func_kwargs={
    #        'mode': NanGuardMode(
    #            nan_is_error=True, inf_is_error=True, big_is_error=True)})

    # `Timing` extension reports time for reading data, aggregating a batch
    # and monitoring;
    # `ProgressBar` displays a nice progress bar during training.
    extensions = [Timing(),
                  FinishAfter(after_n_epochs=num_epochs,
                              after_n_batches=num_batches),
                  EpochSchedule(momentum.learning_rate, [
                      (0, 0.01),   # Warm up with 0.01 learning rate
                      (1, 0.1),    # Then go back to 0.1
                      (100, 0.01),
                      (150, 0.001)
                      # (83, 0.01),  # Follow the schedule in the paper
                      # (125, 0.001)
                  ]),
                  DataStreamMonitoring(
                      [test_cost, test_error_rate, test_confusion],
                      cifar10_test_stream,
                      prefix="test"),
                  TrainingDataMonitoring(
                      [train_cost, train_error_rate,
                       train_cost_without_regularization,
                       l2_regularization,
                       momentum.learning_rate,
                       aggregation.mean(algorithm.total_gradient_norm)],
                      prefix="train",
                      every_n_batches=17),
                      # after_epoch=True),
                  Plot('Training performance for ' + save_to,
                      channels=[
                          ['train_cost_with_regularization',
                           'train_cost_without_regularization',
                           'train_l2_regularization'],
                          ['train_error_rate'],
                          ['train_total_gradient_norm'],
                      ],
                      every_n_batches=17),
                  Plot('Test performance for ' + save_to,
                      channels=[[
                          'train_error_rate',
                          'test_error_rate',
                          ]],
                      after_epoch=True),
                  Checkpoint(save_to, use_cpickle=True),
                  ProgressBar(),
                  Printing()]

    if histogram:
        attribution = AttributionExtension(
            components=train_components,
            parameters=train_cg.parameters,
            components_size=output_size,
            after_batch=True)
        extensions.insert(0, attribution)

    if resume:
        extensions.append(Load(save_to, True, True))

    model = Model(train_cost)

    main_loop = MainLoop(
        algorithm,
        cifar10_train_stream,
        model=model,
        extensions=extensions)

    main_loop.run()

    if histogram:
        save_attributions(attribution, filename=histogram)

    with open('execution-log.json', 'w') as outfile:
        json.dump(main_loop.log, outfile, cls=NumpyEncoder)