def testRegularizers(self):
    reg = lambda x: 0.1 * math_ops.reduce_sum(x)
    bn = normalization_layers.BatchNormalization(axis=1, beta_regularizer=reg)
    inputs = random_ops.random_uniform((5, 4, 3), seed=1)
    training = array_ops.placeholder(dtype='bool')
    _ = bn.apply(inputs, training=training)
    self.assertEqual(len(bn.losses), 1)

    bn = normalization_layers.BatchNormalization(axis=1, gamma_regularizer=reg)
    inputs = random_ops.random_uniform((5, 4, 3), seed=1)
    training = array_ops.placeholder(dtype='bool')
    _ = bn.apply(inputs, training=training)
    self.assertEqual(len(bn.losses), 1)
Example #2
def batchnorm_example(optimizer_fn,
                      batch_per_epoch=1,
                      momentum=0.9,
                      renorm=False,
                      update_ops_in_tower_mode=False):
    """Example of non-distribution-aware legacy code with batch normalization."""
    def dataset_fn():
        # input shape is [16, 8], input values are increasing in both dimensions.
        return dataset_ops.Dataset.from_tensor_slices(
            [[[float(x * 8 + y + z * 100) for y in range(8)]
              for x in range(16)] for z in range(batch_per_epoch)]).repeat()

    optimizer = optimizer_fn()
    batchnorm = normalization.BatchNormalization(renorm=renorm,
                                                 momentum=momentum,
                                                 fused=False)
    layer = core.Dense(1, use_bias=False)

    def model_fn(x):
        """A model that uses batchnorm."""
        def loss_fn():
            y = batchnorm(x, training=True)
            with ops.control_dependencies(
                    ops.get_collection(ops.GraphKeys.UPDATE_OPS
                                       ) if update_ops_in_tower_mode else []):
                loss = math_ops.reduce_mean(
                    math_ops.reduce_sum(layer(y)) - constant_op.constant(1.))
            # `x` and `y` will be fetched by the gradient computation, but not `loss`.
            return loss

        # Callable loss.
        return optimizer.minimize(loss_fn)

    return model_fn, dataset_fn, batchnorm
  def __init__(self,
               batchnorm_layer=None,
               training=True,
               validate_args=False,
               name="batch_normalization"):
    """Instantiates the `BatchNorm` bijector.

    Args:
      batchnorm_layer: `tf.layers.BatchNormalization` layer object. If `None`,
        defaults to
        `tf.layers.BatchNormalization(gamma_constraint=lambda x: nn.relu(x) + 1e-6)`,
        which ensures positivity of the scale variable.
      training: If True, updates running-average statistics during call to
        `inverse()`.
      validate_args: Python `bool` indicating whether arguments should be
        checked for correctness.
      name: Python `str` name given to ops managed by this object.
    Raises:
      ValueError: If `batchnorm_layer` is not an instance of
        `tf.layers.BatchNormalization`, or if it is specified with `renorm=True`
        or a virtual batch size.
    """
    # Scale must be positive.
    g_constraint = lambda x: nn.relu(x) + 1e-6
    self.batchnorm = batchnorm_layer or normalization.BatchNormalization(
        gamma_constraint=g_constraint)
    self._validate_bn_layer(self.batchnorm)
    self._training = training
    super(BatchNormalization, self).__init__(
        validate_args=validate_args, name=name)
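
For orientation, a sketch (my notation, not from the source) of the transformation this bijector applies, where μ, σ² are the layer's batch or moving statistics and γ, β its scale and offset:

$$
\text{inverse}(x) = \gamma\,\frac{x - \mu}{\sqrt{\sigma^2 + \epsilon}} + \beta,
\qquad
\text{forward}(y) = \sqrt{\sigma^2 + \epsilon}\;\frac{y - \beta}{\gamma} + \mu,
$$
$$
\text{inverse\_log\_det\_jacobian}(x) = \sum_i\Big(\log\gamma_i - \tfrac{1}{2}\log(\sigma_i^2 + \epsilon)\Big).
$$

The default `gamma_constraint` above keeps γ strictly positive so the log terms are defined. With a freshly initialized layer, `training=False`, and `epsilon=0.` as in the tests in this file, the map reduces to the identity with zero log-det, which is why the log-prob tests compare the transformed distribution directly against its base.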
 def testLogProb(self, event_shape, event_dims, training):
   training = tf.placeholder_with_default(training, (), "training")
   layer = normalization.BatchNormalization(axis=event_dims, epsilon=0.)
   batch_norm = tfb.BatchNormalization(batchnorm_layer=layer,
                                       training=training)
   base_dist = distributions.MultivariateNormalDiag(
       loc=np.zeros(np.prod(event_shape), dtype=np.float32))
   # Reshape the events.
   if isinstance(event_shape, int):
     event_shape = [event_shape]
   base_dist = distributions.TransformedDistribution(
       distribution=base_dist,
       bijector=tfb.Reshape(event_shape_out=event_shape))
   dist = distributions.TransformedDistribution(
       distribution=base_dist,
       bijector=batch_norm,
       validate_args=True)
   samples = dist.sample(int(1e5))
   # No volume distortion since training=False, bijector is initialized
   # to the identity transformation.
   base_log_prob = base_dist.log_prob(samples)
   dist_log_prob = dist.log_prob(samples)
   self.evaluate(tf.global_variables_initializer())
   base_log_prob_, dist_log_prob_ = self.evaluate(
       [base_log_prob, dist_log_prob])
   self.assertAllClose(base_log_prob_, dist_log_prob_)
  def testCreateBN(self):
    # Call layer.
    bn = normalization_layers.BatchNormalization(axis=1)
    inputs = random_ops.random_uniform((5, 4, 3), seed=1)
    training = array_ops.placeholder(dtype='bool')
    outputs = bn.apply(inputs, training=training)

    # Verify shape.
    self.assertListEqual(outputs.get_shape().as_list(), [5, 4, 3])

    # Verify layer attributes.
    self.assertEqual(len(bn.updates), 2)
    self.assertEqual(len(bn.variables), 4)
    self.assertEqual(len(bn.trainable_variables), 2)
    self.assertEqual(len(bn.non_trainable_variables), 2)

    # Test that updates were created and added to UPDATE_OPS.
    self.assertEqual(len(bn.updates), 2)
    self.assertListEqual(
        ops.get_collection(ops.GraphKeys.UPDATE_OPS), bn.updates)

    # Test that weights were created and added to TRAINABLE_VARIABLES.
    self.assertListEqual(
        ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES),
        bn.trainable_variables)
Example #6
def batchnorm_example(optimizer_fn,
                      batch_per_epoch=1,
                      momentum=0.9,
                      renorm=False):
  """Example of non-distribution-aware legacy code with batch normalization."""

  def dataset_fn():
    # input shape is [16, 8], input values are increasing in both dimensions.
    return dataset_ops.Dataset.from_tensor_slices(
        [[[float(x * 8 + y + z * 100)
           for y in range(8)]
          for x in range(16)]
         for z in range(batch_per_epoch)]).repeat()

  optimizer = optimizer_fn()
  batchnorm = normalization.BatchNormalization(
      renorm=renorm, momentum=momentum, fused=False)

  def model_fn(x):

    def loss_fn():
      y = math_ops.reduce_sum(batchnorm(x, training=True), axis=1)
      loss = math_ops.reduce_mean(y - constant_op.constant(1.))
      return loss

    # Callable loss.
    return optimizer.minimize(loss_fn)

  return model_fn, dataset_fn, batchnorm
Example #7
def output_logits_from_dnn(fields_embeddings, params, is_training):
    dropout_rate = params['dropout_rate']
    do_batch_norm = params['batch_norm']

    X = tf.concat(fields_embeddings, axis=1)
    tf.logging.info("initial input to DNN, shape={}".format(X.shape))

    for idx, n_units in enumerate(params['hidden_units'], start=1):
        X = tf.layers.dense(X, units=n_units, activation=tf.nn.relu)
        tf.logging.info("layer[{}] output shape={}".format(idx, X.shape))

        X = tf.layers.dropout(inputs=X,
                              rate=dropout_rate,
                              training=is_training)
        if is_training:
            tf.logging.info("layer[{}] dropout {}".format(idx, dropout_rate))

        if do_batch_norm:
            # The BatchNormalization call and its parameters were copied from
            # the DNNLinearCombinedClassifier source code.
            batch_norm_layer = normalization.BatchNormalization(
                momentum=0.999,
                trainable=True,
                name='batchnorm_{}'.format(idx))
            X = batch_norm_layer(X, training=is_training)

            if is_training:
                tf.logging.info("layer[{}] batch-normalize".format(idx))

    # connect to final logits, [batch_size,1]
    return tf.layers.dense(X, units=1, use_bias=True, activation=None)
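
A minimal sketch of the `params` dictionary this function expects; the keys are exactly the ones read above, the values are illustrative only:

```python
# Illustrative values; the keys mirror what output_logits_from_dnn reads.
params = {
    'hidden_units': [128, 64],   # one ReLU dense layer per entry
    'dropout_rate': 0.3,         # rate passed to tf.layers.dropout
    'batch_norm': True,          # whether to append a BatchNormalization layer
}
# logits = output_logits_from_dnn(fields_embeddings, params, is_training=True)
```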
Example #8
 def __init__(self, name="batch_norm"):
     super(BatchNorm, self).__init__(name=name)
     with self._enter_variable_scope():
         self._bn = normalization.BatchNormalization(axis=1,
                                                     epsilon=np.finfo(
                                                         np.float32).eps,
                                                     momentum=0.9)
  def testRenorm(self):
    shape = (4, 3)
    xt = array_ops.placeholder(dtypes.float32, shape)
    momentum = 0.99
    renorm_momentum = 0.8
    rmax = 1.1
    rmin = 0.9
    dmax = 0.1
    gamma = 2.
    beta = 3.
    epsilon = 0.001
    bn = normalization_layers.BatchNormalization(
        axis=1,
        gamma_initializer=init_ops.constant_initializer(gamma),
        beta_initializer=init_ops.constant_initializer(beta),
        epsilon=epsilon,
        momentum=momentum,
        renorm=True,
        renorm_clipping={'rmax': rmax, 'rmin': rmin, 'dmax': dmax},
        renorm_momentum=renorm_momentum)
    training = array_ops.placeholder(dtypes.bool)
    yt = bn.apply(xt, training=training)

    moving_mean = 0.
    moving_variance = 1.
    renorm_mean = renorm_stddev = 0.
    renorm_weight = 0.
    with self.test_session(use_gpu=True) as sess:
      sess.run(variables.global_variables_initializer())
      for _ in range(5):
        x = np.random.random(shape)

        mean = x.mean(0)
        stddev = np.sqrt(x.var(0) + epsilon)
        adj_mean = renorm_mean + (1. - renorm_weight) * mean
        adj_stddev = renorm_stddev + (1. - renorm_weight) * stddev
        r = (stddev / adj_stddev).clip(rmin, rmax)
        d = ((mean - adj_mean) / adj_stddev).clip(-dmax, dmax)
        y_train = ((x - mean) / stddev * r + d) * gamma + beta
        renorm_mean += (mean - renorm_mean) * (1. - renorm_momentum)
        renorm_stddev += (stddev - renorm_stddev) * (1. - renorm_momentum)
        renorm_weight += (1. - renorm_weight) * (1. - renorm_momentum)
        moving_mean += (renorm_mean / renorm_weight -
                        moving_mean) * (1. - momentum)
        moving_variance += ((renorm_stddev / renorm_weight) ** 2 - epsilon -
                            moving_variance) * (1. - momentum)

        y_test = ((x - moving_mean) / (moving_variance + epsilon) ** 0.5 *
                  gamma) + beta

        yt_val_train, _, _ = sess.run([yt] + bn.updates,
                                      feed_dict={xt: x, training: True})
        yt_val_test, _, _ = sess.run([yt] + bn.updates,
                                     feed_dict={xt: x, training: False})

        self.assertAllClose(y_train, yt_val_train, atol=1e-5)
        self.assertAllClose(y_test, yt_val_test, atol=1e-5)
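
For reference, the NumPy loop above mirrors the batch-renormalization correction (notation mine; μ_B, σ_B are the minibatch statistics and μ, σ the running estimates, which the test keeps in zero-debiased form via `renorm_weight`):

$$
r = \operatorname{clip}\!\Big(\tfrac{\sigma_B}{\sigma},\,[r_{\min}, r_{\max}]\Big),
\qquad
d = \operatorname{clip}\!\Big(\tfrac{\mu_B - \mu}{\sigma},\,[-d_{\max}, d_{\max}]\Big),
$$
$$
y = \gamma\Big(\tfrac{x - \mu_B}{\sigma_B}\,r + d\Big) + \beta,
$$

matching the `corrected_value = normalized_value * r + d` convention described in the `renorm_clipping` documentation later in this file.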
 def testConstraints(self):
   g_constraint = lambda x: x / math_ops.reduce_sum(x)
   b_constraint = lambda x: x / math_ops.reduce_max(x)
   bn = normalization_layers.BatchNormalization(axis=1,
                                                gamma_constraint=g_constraint,
                                                beta_constraint=b_constraint)
   inputs = random_ops.random_uniform((5, 4, 3), seed=1)
   bn(inputs)
   self.assertEqual(bn.gamma_constraint, g_constraint)
   self.assertEqual(bn.beta_constraint, b_constraint)
  def testNoScale(self):
    bn = normalization_layers.BatchNormalization(axis=1, scale=False)
    inputs = random_ops.random_uniform((5, 4, 3), seed=1)
    training = array_ops.placeholder(dtype='bool')
    outputs = bn.apply(inputs, training=training)

    # Verify shape.
    self.assertListEqual(outputs.get_shape().as_list(), [5, 4, 3])

    # Verify layer attributes.
    self.assertEqual(len(bn.updates), 2)
    self.assertEqual(len(bn.variables), 3)
    self.assertEqual(len(bn.trainable_variables), 1)
    self.assertEqual(len(bn.non_trainable_variables), 2)
  def testGhostBN4DimsAxis1(self):
    shape = [6, 3, 10, 10]
    num_virtual_batches = 3
    beta = 2.
    gamma = 3.
    momentum = 0.8
    epsilon = 1e-3
    moving_means = np.zeros([1, 3, 3, 1, 1], dtype=np.float32)
    moving_vars = np.ones([1, 3, 3, 1, 1], dtype=np.float32)

    inp = array_ops.placeholder(dtypes.float32, shape)
    is_training = array_ops.placeholder(dtypes.bool)
    bn = normalization_layers.BatchNormalization(
        axis=1,
        momentum=momentum,
        epsilon=epsilon,
        beta_initializer=init_ops.constant_initializer(beta),
        gamma_initializer=init_ops.constant_initializer(gamma),
        num_virtual_batches=num_virtual_batches,
        fused=False)      # NCHW is unsupported by CPU fused batch norm
    out = bn.apply(inp, training=is_training)
    ghost_shape = ([shape[0] // num_virtual_batches, num_virtual_batches] +
                   shape[1:])

    with self.test_session(use_gpu=True) as sess:
      sess.run(variables.global_variables_initializer())
      for _ in range(5):
        x = np.random.random(shape)

        sub_batched = np.reshape(x, ghost_shape)
        means = np.mean(sub_batched, axis=(0, 3, 4), keepdims=True)
        variances = np.var(sub_batched, axis=(0, 3, 4), keepdims=True)
        moving_means = moving_means * momentum + means * (1. - momentum)
        moving_vars = moving_vars * momentum + variances * (1. - momentum)

        y_train = ((sub_batched - means) /
                   (variances + epsilon) ** 0.5 * gamma) + beta
        y_test = ((sub_batched - moving_means) /
                  (moving_vars + epsilon) ** 0.5 * gamma) + beta

        y_train = np.reshape(y_train, shape)
        y_test = np.reshape(y_test, shape)

        y_val_train, _, _ = sess.run([out] + bn.updates,
                                     feed_dict={inp: x, is_training: True})
        y_val_test = sess.run(out, feed_dict={inp: x, is_training: False})

        self.assertAllClose(y_train, y_val_train, atol=1e-2)
        self.assertAllClose(y_test, y_val_test, atol=1e-2)
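
In other words (notation mine), with `num_virtual_batches=V` the NCHW input is split into V virtual batches and each is normalized with its own per-channel statistics, exactly as the NumPy reference above computes:

$$
x \in \mathbb{R}^{N\times C\times H\times W}
\;\longrightarrow\;
\tilde{x} \in \mathbb{R}^{(N/V)\times V\times C\times H\times W},
\qquad
\mu_{v,c} = \operatorname*{mean}_{n,h,w}\tilde{x}_{n,v,c,h,w},
\quad
\sigma^2_{v,c} = \operatorname*{var}_{n,h,w}\tilde{x}_{n,v,c,h,w},
$$

so the moving statistics carry one entry per (virtual batch, channel) pair, which is why they are initialized with shape `[1, 3, 3, 1, 1]` above.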
Example #13
            def step_fn(is_training, inputs, targets=None):
                bn = normalization.BatchNormalization(axis=3,
                                                      epsilon=1e-3,
                                                      momentum=0.9,
                                                      fused=fused)
                bn_list.append(bn)
                outputs = bn.apply(inputs, training=is_training)
                if not is_training:
                    return outputs

                loss = losses.mean_squared_error(targets, outputs)
                optimizer = gradient_descent.GradientDescentOptimizer(0.01)
                train_op = optimizer.minimize(loss)
                with ops.control_dependencies([train_op]):
                    return array_ops.identity(loss)
Example #14
    def test4DInputAxis1Fused(self):
        if test.is_gpu_available(cuda_only=True):
            epsilon = 1e-3
            bn = normalization_layers.BatchNormalization(axis=1,
                                                         epsilon=epsilon,
                                                         momentum=0.9,
                                                         fused=True)
            inputs = variables.Variable(np.random.random((5, 4, 3, 6)) + 100,
                                        dtype=dtypes.float32)
            training = array_ops.placeholder(dtype='bool')
            outputs = bn.apply(inputs, training=training)

            with self.test_session() as sess:
                # Test training with placeholder learning phase.
                sess.run(variables.global_variables_initializer())
                np_gamma, np_beta = sess.run([bn.gamma, bn.beta])
                np_gamma = np.reshape(np_gamma, (1, 4, 1, 1))
                np_beta = np.reshape(np_beta, (1, 4, 1, 1))
                for _ in range(100):
                    np_output, _, _ = sess.run([outputs] + bn.updates,
                                               feed_dict={training: True})
                    # Verify that the axis is normalized during training.
                    normed_np_output = (
                        (np_output - epsilon) * np_gamma) + np_beta
                    self.assertAlmostEqual(np.mean(normed_np_output),
                                           0.,
                                           places=1)
                    self.assertAlmostEqual(np.std(normed_np_output),
                                           1.,
                                           places=1)

                # Verify that the statistics are updated during training.
                moving_mean, moving_var = sess.run(
                    [bn.moving_mean, bn.moving_variance])
                np_inputs = sess.run(inputs)
                mean = np.mean(np_inputs, axis=(0, 2, 3))
                std = np.std(np_inputs, axis=(0, 2, 3))
                variance = np.square(std)
                self.assertAllClose(mean, moving_mean, atol=1e-2)
                self.assertAllClose(variance, moving_var, atol=1e-2)

                # Test inference with placeholder learning phase.
                np_output = sess.run(outputs, feed_dict={training: False})

                # Verify that the axis is normalized during inference.
                normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta
                self.assertAlmostEqual(np.mean(normed_np_output), 0., places=1)
                self.assertAlmostEqual(np.std(normed_np_output), 1., places=1)
Example #15
    def testBNWithZeroBatchInput(self, distribution, fused):
        distribution.extended.experimental_enable_get_next_as_optional = True
        with distribution.scope():
            inputs = np.random.random((0, 4, 4, 3)).astype(np.float32) + 100
            targets = np.random.random((0, 4, 4, 3)).astype(np.float32)
            bn = normalization.BatchNormalization(axis=3,
                                                  epsilon=1e-3,
                                                  momentum=0.9,
                                                  fused=fused)
            optimizer = gradient_descent.GradientDescentOptimizer(0.01)

            @def_function.function
            def train_step():
                def step_fn(inputs, targets):
                    with backprop.GradientTape() as tape:
                        outputs = bn.apply(inputs, training=True)
                        loss = losses.mean_squared_error(targets, outputs)
                    grads = tape.gradient(loss, bn.variables)
                    optimizer.apply_gradients(zip(grads, bn.variables))
                    return loss

                return distribution.experimental_run_v2(step_fn,
                                                        args=(inputs, targets))

            for _ in range(100):
                np_output = train_step().numpy()
                self.assertEqual(0.0, np_output)

            # Verify that the statistics and weights are not changed after training.
            self.assertAllEqual([0, 0, 0], bn.moving_mean.numpy())
            self.assertAllEqual([1, 1, 1], bn.moving_variance.numpy())
            self.assertAllEqual([1, 1, 1], bn.gamma.numpy())
            self.assertAllEqual([0, 0, 0], bn.beta.numpy())

            @def_function.function
            def test_step():
                def step_fn(inputs):
                    outputs = bn.apply(inputs, training=False)
                    return outputs

                return distribution.experimental_run_v2(step_fn,
                                                        args=(inputs, ))

            # Test inference.
            self.assertAllEqual(np.zeros(shape=(0, 4, 4, 3), dtype=np.float32),
                                test_step().numpy())
Example #16
 def testLogProb(self):
   with self.test_session() as sess:
     layer = normalization.BatchNormalization(epsilon=0.)
     batch_norm = BatchNormalization(batchnorm_layer=layer, training=False)
     base_dist = distributions.MultivariateNormalDiag(loc=[0., 0.])
     dist = transformed_distribution_lib.TransformedDistribution(
         distribution=base_dist,
         bijector=batch_norm,
         validate_args=True)
     samples = dist.sample(int(1e5))
     # No volume distortion since training=False, bijector is initialized
     # to the identity transformation.
     base_log_prob = base_dist.log_prob(samples)
     dist_log_prob = dist.log_prob(samples)
     variables.global_variables_initializer().run()
     base_log_prob_, dist_log_prob_ = sess.run([base_log_prob, dist_log_prob])
     self.assertAllClose(base_log_prob_, dist_log_prob_)
 def testInvertMutuallyConsistent(self):
     # BatchNorm bijector is only mutually consistent when training=False.
     dims = 4
     with self.cached_session() as sess:
         layer = normalization.BatchNormalization(epsilon=0.)
         batch_norm = Invert(
             BatchNormalization(batchnorm_layer=layer, training=False))
         dist = transformed_distribution_lib.TransformedDistribution(
             distribution=normal_lib.Normal(loc=0., scale=1.),
             bijector=batch_norm,
             event_shape=[dims],
             validate_args=True)
         self.run_test_sample_consistent_log_prob(sess_run_fn=sess.run,
                                                  dist=dist,
                                                  num_samples=int(1e5),
                                                  radius=2.,
                                                  center=0.,
                                                  rtol=0.02)
Example #18
    def batch_norm(self,
                   input_layer=None,
                   decay=0.999,
                   scale=False,
                   epsilon=0.001):
        """Adds a Batch Normalization layer."""
        if input_layer is None:
            input_layer = self.top_layer
        else:
            self.top_size = None
        name = 'batchnorm' + str(self.counts['batchnorm'])
        self.counts['batchnorm'] += 1

        center = True
        with tf.variable_scope(name) as scope:
            if self.use_tf_layers:
                layer_obj = normalization_layers.BatchNormalization(
                    momentum=decay,
                    scale=scale,
                    epsilon=epsilon,
                    fused=True,
                    axis=_data_format_to_channel_axis[self.data_format],
                    # We pass this 'scope' argument for compatibility with checkpoints
                    # created with the contrib version of batch norm. tf_cnn_benchmarks
                    # used to use the contrib version.
                    _scope=scope,
                    center=center,
                    name=scope.name)
                bn = layer_obj.apply(input_layer, training=self.phase_train)
            else:
                bn = self._batch_norm_without_layers(input_layer, decay, scale,
                                                     epsilon)
        self.top_layer = bn
        self.top_size = bn.shape[
            3] if self.data_format == 'NHWC' else bn.shape[1]
        self.top_size = int(self.top_size)
        mlperf.logger.log_batch_norm(input_tensor=input_layer,
                                     output_tensor=bn,
                                     momentum=decay,
                                     epsilon=epsilon,
                                     center=center,
                                     scale=scale,
                                     training=self.phase_train)
        return bn
 def testInvertMutuallyConsistent(self):
   # BatchNorm bijector is only mutually consistent when training=False.
   dims = 4
   training = tf.placeholder_with_default(False, (), "training")
   layer = normalization.BatchNormalization(epsilon=0.)
   batch_norm = tfb.Invert(
       tfb.BatchNormalization(batchnorm_layer=layer, training=training))
   dist = distributions.TransformedDistribution(
       distribution=distributions.Normal(loc=0., scale=1.),
       bijector=batch_norm,
       event_shape=[dims],
       validate_args=True)
   self.run_test_sample_consistent_log_prob(
       sess_run_fn=self.evaluate,
       dist=dist,
       num_samples=int(1e5),
       radius=2.,
       center=0.,
       rtol=0.02)
Example #20
    def testBNWithDynamicBatchInputEager(self, distribution, fused):
        distribution.extended.experimental_enable_get_next_as_optional = True
        with distribution.scope():
            # Explicitly create dataset with drop_remainder=False.
            # This would make batch size unknown.
            inputs = np.random.random((11, 4, 4, 3)).astype(np.float32) + 100
            targets = np.random.random((11, 4, 4, 3)).astype(np.float32)
            dataset = dataset_ops.Dataset.from_tensor_slices(
                (inputs, targets)).batch(10, drop_remainder=False).repeat()
            dataset_iterator = iter(
                distribution.experimental_distribute_dataset(dataset))

            bn = normalization.BatchNormalization(axis=-1,
                                                  epsilon=1e-3,
                                                  momentum=0.9,
                                                  fused=fused)
            optimizer = gradient_descent.GradientDescentOptimizer(0.01)

            @def_function.function
            def train_step(iterator):
                def step_fn(inputs):
                    features, targets = inputs
                    with backprop.GradientTape() as tape:
                        outputs = bn(features, training=True)
                        loss = losses.mean_squared_error(targets, outputs)

                    grads = tape.gradient(loss, bn.variables)
                    optimizer.apply_gradients(zip(grads, bn.variables))
                    return loss

                return distribution.run(step_fn, args=(next(iterator), ))

            for _ in range(100):
                train_step(dataset_iterator).numpy()

            # Verify that the statistics and weights are updated.
            self.assertNotAllEqual(np.array([0., 0., 0.]),
                                   bn.moving_mean.numpy())
            self.assertNotAllEqual(np.array([1., 1., 1.]),
                                   bn.moving_variance.numpy())
            self.assertNotAllEqual(np.array([1., 1., 1.]), bn.gamma.numpy())
            self.assertNotAllEqual(np.array([0., 0., 0.]), bn.beta.numpy())
Example #21
    def testBooleanLearningPhase(self):
        epsilon = 1e-3
        bn = normalization_layers.BatchNormalization(axis=-1,
                                                     epsilon=epsilon,
                                                     momentum=0.9)
        inputs = variables.Variable(np.random.random((5, 4, 3, 6)) + 100,
                                    dtype=dtypes.float32)
        outputs_training = bn.apply(inputs, training=True)
        outputs_infer = bn.apply(inputs, training=False)

        with self.test_session() as sess:
            # Test training with placeholder learning phase.
            sess.run(variables.global_variables_initializer())
            np_gamma, np_beta = sess.run([bn.gamma, bn.beta])
            np_gamma = np.reshape(np_gamma, (1, 1, 1, 6))
            np_beta = np.reshape(np_beta, (1, 1, 1, 6))
            for _ in range(100):
                np_output, _, _ = sess.run([outputs_training] + bn.updates)
                # Verify that the axis is normalized during training.
                normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta
                self.assertAlmostEqual(np.mean(normed_np_output), 0., places=2)
                self.assertAlmostEqual(np.std(normed_np_output), 1., places=1)

            # Verify that the statistics are updated during training.
            moving_mean, moving_var = sess.run(
                [bn.moving_mean, bn.moving_variance])
            np_inputs = sess.run(inputs)
            mean = np.mean(np_inputs, axis=(0, 1, 2))
            std = np.std(np_inputs, axis=(0, 1, 2))
            variance = np.square(std)
            self.assertAllClose(mean, moving_mean, atol=1e-2)
            self.assertAllClose(variance, moving_var, atol=1e-2)

            # Test inference with placeholder learning phase.
            np_output = sess.run(outputs_infer)

            # Verify that the axis is normalized during inference.
            normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta
            self.assertAlmostEqual(np.mean(normed_np_output), 0., places=1)
            self.assertAlmostEqual(np.std(normed_np_output), 1., places=1)
Example #22
 def _simple_model(self, image, fused, freeze_mode):
     output_channels, kernel_size = 2, 3
     conv = conv_layers.conv2d(
         image,
         output_channels,
         kernel_size,
         use_bias=False,
         kernel_initializer=init_ops.ones_initializer())
     bn_layer = normalization_layers.BatchNormalization(fused=fused)
     bn_layer._bessels_correction_test_only = False
     training = not freeze_mode
     bn = bn_layer.apply(conv, training=training)
     loss = math_ops.reduce_sum(math_ops.abs(bn))
     optimizer = gradient_descent.GradientDescentOptimizer(0.01)
     if not freeze_mode:
         update_ops = ops.get_collection(ops.GraphKeys.UPDATE_OPS)
         with ops.control_dependencies(update_ops):
             train_op = optimizer.minimize(loss)
     else:
         train_op = optimizer.minimize(loss)
     saver = saver_lib.Saver(write_version=saver_pb2.SaverDef.V2)
     return loss, train_op, saver
Example #23
def custom_batch_norm(inputs,
                      decay=0.999,
                      center=True,
                      scale=False,
                      epsilon=0.001,
                      activation_fn=None,
                      param_initializers=None,
                      param_regularizers=None,
                      updates_collections=ops.GraphKeys.UPDATE_OPS,
                      is_training=True,
                      reuse=None,
                      variables_collections=None,
                      outputs_collections=None,
                      trainable=True,
                      batch_weights=None,
                      data_format='NHWC',
                      zero_debias_moving_mean=False,
                      scope=None,
                      renorm=False,
                      renorm_clipping=None,
                      renorm_decay=0.99,
                      noise_std=None):
    """Adds a Batch Normalization layer from http://arxiv.org/abs/1502.03167.

      "Batch Normalization: Accelerating Deep Network Training by Reducing
      Internal Covariate Shift"

      Sergey Ioffe, Christian Szegedy

    Can be used as a normalizer function for conv2d and fully_connected.

    Note: when training, the moving_mean and moving_variance need to be updated.
    By default the update ops are placed in `tf.GraphKeys.UPDATE_OPS`, so they
    need to be added as a dependency to the `train_op`. For example:

    ```python
      update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
      with tf.control_dependencies(update_ops):
        train_op = optimizer.minimize(loss)
    ```

    One can set updates_collections=None to force the updates in place, but that
    can have a speed penalty, especially in distributed settings.

    Args:
      inputs: A tensor with 2 or more dimensions, where the first dimension has
        `batch_size`. The normalization is over all but the last dimension if
        `data_format` is `NHWC` and the second dimension if `data_format` is
        `NCHW`.
      decay: Decay for the moving average. Reasonable values for `decay` are close
        to 1.0, typically in the multiple-nines range: 0.999, 0.99, 0.9, etc.
        Lower `decay` value (recommend trying `decay`=0.9) if model experiences
        reasonably good training performance but poor validation and/or test
        performance. Try zero_debias_moving_mean=True for improved stability.
      center: If True, add offset of `beta` to normalized tensor. If False, `beta`
        is ignored.
      scale: If True, multiply by `gamma`. If False, `gamma` is
        not used. When the next layer is linear (also e.g. `nn.relu`), this can be
        disabled since the scaling can be done by the next layer.
      epsilon: Small float added to variance to avoid dividing by zero.
      activation_fn: Activation function, default set to None to skip it and
        maintain a linear activation.
      param_initializers: Optional initializers for beta, gamma, moving mean and
        moving variance.
      param_regularizers: Optional regularizer for beta and gamma.
      updates_collections: Collections to collect the update ops for computation.
        The updates_ops need to be executed with the train_op.
        If None, a control dependency would be added to make sure the updates are
        computed in place.
      is_training: Whether or not the layer is in training mode. In training mode
        it would accumulate the statistics of the moments into `moving_mean` and
        `moving_variance` using an exponential moving average with the given
        `decay`. When it is not in training mode then it would use the values of
        the `moving_mean` and the `moving_variance`.
      reuse: Whether or not the layer and its variables should be reused. To be
        able to reuse the layer scope must be given.
      variables_collections: Optional collections for the variables.
      outputs_collections: Collections to add the outputs.
      trainable: If `True` also add variables to the graph collection
        `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
      batch_weights: An optional tensor of shape `[batch_size]`,
        containing a frequency weight for each batch item. If present,
        then the batch normalization uses weighted mean and
        variance. (This can be used to correct for bias in training
        example selection.)
      noise_std: Optional noise parameter forwarded to the custom
        `batch_normalization` op.
      data_format: A string. `NHWC` (default) and `NCHW` are supported.
      zero_debias_moving_mean: Use zero_debias for moving_mean. It creates a new
        pair of variables 'moving_mean/biased' and 'moving_mean/local_step'.
      scope: Optional scope for `variable_scope`.
      renorm: Whether to use Batch Renormalization
        (https://arxiv.org/abs/1702.03275). This adds extra variables during
        training. The inference is the same for either value of this parameter.
      renorm_clipping: A dictionary that may map keys 'rmax', 'rmin', 'dmax' to
        scalar `Tensors` used to clip the renorm correction. The correction
        `(r, d)` is used as `corrected_value = normalized_value * r + d`, with
        `r` clipped to [rmin, rmax], and `d` to [-dmax, dmax]. Missing rmax, rmin,
        dmax are set to inf, 0, inf, respectively.
      renorm_decay: Momentum used to update the moving means and standard
        deviations with renorm. Unlike `momentum`, this affects training
        and should be neither too small (which would add noise) nor too large
        (which would give stale estimates). Note that `decay` is still applied
        to get the means and variances for inference.

    Returns:
      A `Tensor` representing the output of the operation.

    Raises:
      ValueError: If `batch_weights` is not None and `fused` is True.
      ValueError: If `param_regularizers` is not None and `fused` is True.
      ValueError: If `data_format` is neither `NHWC` nor `NCHW`.
      ValueError: If the rank of `inputs` is undefined.
      ValueError: If rank or channels dimension of `inputs` is undefined.
    """

    layer_variable_getter = slim.layers._build_variable_getter()
    with variable_scope.variable_scope(
            scope,
            'BatchNorm', [inputs],
            reuse=reuse,
            custom_getter=layer_variable_getter) as sc:
        inputs = ops.convert_to_tensor(inputs)

        # Determine whether we can use the core layer class.
        if (batch_weights is None
                and updates_collections is ops.GraphKeys.UPDATE_OPS
                and not zero_debias_moving_mean):
            # Use the core layer class.
            axis = 1 if data_format == 'NCHW' else -1
            if not param_initializers:
                param_initializers = {}
            beta_initializer = param_initializers.get(
                'beta', init_ops.zeros_initializer())
            gamma_initializer = param_initializers.get(
                'gamma', init_ops.ones_initializer())
            moving_mean_initializer = param_initializers.get(
                'moving_mean', init_ops.zeros_initializer())
            moving_variance_initializer = param_initializers.get(
                'moving_variance', init_ops.ones_initializer())
            if not param_regularizers:
                param_regularizers = {}
            beta_regularizer = param_regularizers.get('beta')
            gamma_regularizer = param_regularizers.get('gamma')
            layer = normalization_layers.BatchNormalization(
                axis=axis,
                momentum=decay,
                epsilon=epsilon,
                center=center,
                scale=scale,
                beta_initializer=beta_initializer,
                gamma_initializer=gamma_initializer,
                moving_mean_initializer=moving_mean_initializer,
                moving_variance_initializer=moving_variance_initializer,
                beta_regularizer=beta_regularizer,
                gamma_regularizer=gamma_regularizer,
                trainable=trainable,
                renorm=renorm,
                renorm_clipping=renorm_clipping,
                renorm_momentum=renorm_decay,
                name=sc.name,
                _scope=sc,
                _reuse=reuse)
            outputs = layer.apply(inputs, training=is_training)

            # Add variables to collections.
            slim.layers._add_variable_to_collections(layer.moving_mean,
                                                     variables_collections,
                                                     'moving_mean')
            slim.layers._add_variable_to_collections(layer.moving_variance,
                                                     variables_collections,
                                                     'moving_variance')
            if layer.beta:
                slim.layers._add_variable_to_collections(
                    layer.beta, variables_collections, 'beta')
            if layer.gamma:
                slim.layers._add_variable_to_collections(
                    layer.gamma, variables_collections, 'gamma')

            if activation_fn is not None:
                outputs = activation_fn(outputs)
            return utils.collect_named_outputs(outputs_collections,
                                               sc.original_name_scope, outputs)

        # Not supported by layer class: batch_weights argument,
        # and custom updates_collections. In that case, use the legacy BN
        # implementation.
        # Custom updates collections are not supported because the update logic
        # is different in this case, in particular w.r.t. "forced updates" and
        # update op reuse.
        if renorm:
            raise ValueError('renorm is not supported with batch_weights, '
                             'updates_collections or zero_debias_moving_mean')
        inputs_shape = inputs.get_shape()
        inputs_rank = inputs_shape.ndims
        if inputs_rank is None:
            raise ValueError('Inputs %s has undefined rank.' % inputs.name)
        dtype = inputs.dtype.base_dtype
        if batch_weights is not None:
            batch_weights = ops.convert_to_tensor(batch_weights)
            inputs_shape[0:1].assert_is_compatible_with(
                batch_weights.get_shape())
            # Reshape batch weight values so they broadcast across inputs.
            nshape = [-1] + [1 for _ in range(inputs_rank - 1)]
            batch_weights = array_ops.reshape(batch_weights, nshape)

        if data_format == 'NCHW':
            moments_axes = [0] + list(range(2, inputs_rank))
            params_shape = inputs_shape[1:2]
            # For NCHW format, rather than relying on implicit broadcasting, we
            # explicitly reshape the params to params_shape_broadcast when computing
            # the moments and the batch normalization.
            params_shape_broadcast = list([1, inputs_shape[1].value] +
                                          [1 for _ in range(2, inputs_rank)])
        else:
            moments_axes = list(range(inputs_rank - 1))
            params_shape = inputs_shape[-1:]
            params_shape_broadcast = None
        if not params_shape.is_fully_defined():
            raise ValueError('Inputs %s has undefined channels dimension %s.' %
                             (inputs.name, params_shape))

        # Allocate parameters for the beta and gamma of the normalization.
        beta, gamma = None, None
        if not param_initializers:
            param_initializers = {}
        if center:
            beta_collections = utils.get_variable_collections(
                variables_collections, 'beta')
            beta_initializer = param_initializers.get(
                'beta', init_ops.zeros_initializer())
            beta = variables.model_variable('beta',
                                            shape=params_shape,
                                            dtype=dtype,
                                            initializer=beta_initializer,
                                            collections=beta_collections,
                                            trainable=trainable)
        if scale:
            gamma_collections = utils.get_variable_collections(
                variables_collections, 'gamma')
            gamma_initializer = param_initializers.get(
                'gamma', init_ops.ones_initializer())
            gamma = variables.model_variable('gamma',
                                             shape=params_shape,
                                             dtype=dtype,
                                             initializer=gamma_initializer,
                                             collections=gamma_collections,
                                             trainable=trainable)

        # Create moving_mean and moving_variance variables and add them to the
        # appropriate collections. We disable variable partitioning while creating
        # them, because assign_moving_average is not yet supported for partitioned
        # variables.
        partitioner = variable_scope.get_variable_scope().partitioner
        try:
            variable_scope.get_variable_scope().set_partitioner(None)
            moving_mean_collections = utils.get_variable_collections(
                variables_collections, 'moving_mean')
            moving_mean_initializer = param_initializers.get(
                'moving_mean', init_ops.zeros_initializer())
            moving_mean = variables.model_variable(
                'moving_mean',
                shape=params_shape,
                dtype=dtype,
                initializer=moving_mean_initializer,
                trainable=False,
                collections=moving_mean_collections)
            moving_variance_collections = utils.get_variable_collections(
                variables_collections, 'moving_variance')
            moving_variance_initializer = param_initializers.get(
                'moving_variance', init_ops.ones_initializer())
            moving_variance = variables.model_variable(
                'moving_variance',
                shape=params_shape,
                dtype=dtype,
                initializer=moving_variance_initializer,
                trainable=False,
                collections=moving_variance_collections)
        finally:
            variable_scope.get_variable_scope().set_partitioner(partitioner)

        # If `is_training` doesn't have a constant value, because it is a `Tensor`,
        # a `Variable` or `Placeholder` then is_training_value will be None and
        # `needs_moments` will be true.
        is_training_value = utils.constant_value(is_training)
        need_moments = is_training_value is None or is_training_value
        if need_moments:
            # Calculate the moments based on the individual batch.
            if batch_weights is None:
                if data_format == 'NCHW':
                    mean, variance = nn.moments(inputs,
                                                moments_axes,
                                                keep_dims=True)
                    mean = array_ops.reshape(mean, [-1])
                    variance = array_ops.reshape(variance, [-1])
                else:
                    mean, variance = nn.moments(inputs, moments_axes)
            else:
                if data_format == 'NCHW':
                    mean, variance = nn.weighted_moments(inputs,
                                                         moments_axes,
                                                         batch_weights,
                                                         keep_dims=True)
                    mean = array_ops.reshape(mean, [-1])
                    variance = array_ops.reshape(variance, [-1])
                else:
                    mean, variance = nn.weighted_moments(
                        inputs, moments_axes, batch_weights)

            moving_vars_fn = lambda: (moving_mean, moving_variance)
            if updates_collections is None:

                def _force_updates():
                    """Internal function forces updates moving_vars if is_training."""
                    update_moving_mean = moving_averages.assign_moving_average(
                        moving_mean,
                        mean,
                        decay,
                        zero_debias=zero_debias_moving_mean)
                    update_moving_variance = moving_averages.assign_moving_average(
                        moving_variance, variance, decay, zero_debias=False)
                    with ops.control_dependencies(
                        [update_moving_mean, update_moving_variance]):
                        return array_ops.identity(mean), array_ops.identity(
                            variance)

                mean, variance = utils.smart_cond(is_training, _force_updates,
                                                  moving_vars_fn)
            else:

                def _delay_updates():
                    """Internal function that delay updates moving_vars if is_training."""
                    update_moving_mean = moving_averages.assign_moving_average(
                        moving_mean,
                        mean,
                        decay,
                        zero_debias=zero_debias_moving_mean)
                    update_moving_variance = moving_averages.assign_moving_average(
                        moving_variance, variance, decay, zero_debias=False)
                    return update_moving_mean, update_moving_variance

                update_mean, update_variance = utils.smart_cond(
                    is_training, _delay_updates, moving_vars_fn)
                ops.add_to_collections(updates_collections, update_mean)
                ops.add_to_collections(updates_collections, update_variance)
                # Use computed moments during training and moving_vars otherwise.
                vars_fn = lambda: (mean, variance)
                mean, variance = utils.smart_cond(is_training, vars_fn,
                                                  moving_vars_fn)
        else:
            mean, variance = moving_mean, moving_variance
        if data_format == 'NCHW':
            mean = array_ops.reshape(mean, params_shape_broadcast)
            variance = array_ops.reshape(variance, params_shape_broadcast)
            beta = array_ops.reshape(beta, params_shape_broadcast)
            if gamma is not None:
                gamma = array_ops.reshape(gamma, params_shape_broadcast)

        # Compute batch_normalization.
        outputs = batch_normalization(inputs,
                                      mean,
                                      variance,
                                      beta,
                                      gamma,
                                      epsilon,
                                      noise_std=noise_std)
        outputs.set_shape(inputs_shape)
        if activation_fn is not None:
            outputs = activation_fn(outputs)
        return utils.collect_named_outputs(outputs_collections,
                                           sc.original_name_scope, outputs)
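
A minimal, hypothetical usage sketch (the tensor names are mine); the function follows the `tf.contrib.layers.batch_norm` calling convention described in its docstring, including the `UPDATE_OPS` handling:

```python
# Hypothetical usage; assumes custom_batch_norm is importable from this module.
import tensorflow as tf

x = tf.placeholder(tf.float32, [None, 32, 32, 16])   # NHWC activations
is_training = tf.placeholder(tf.bool, [])

y = custom_batch_norm(x, decay=0.99, scale=True, is_training=is_training)

# Moving-average updates land in tf.GraphKeys.UPDATE_OPS by default and must
# be run together with the train op, as the docstring example shows.
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
```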
Example #24
def residual(x,
             filters,
             kernel_size=3,
             stride=1,
             train=True,
             wd=0.0,
             bn_momentum=0.99,
             bn_epsilon=0.001,
             name='res'):
    """ Residual layer

    Uses the _residual_core function to create F(x), then adds x to it.

    Parameters
    ----------
    x : tf tensor
        Input to be modified
    filters : int
        Number of output filters (will be used for all convolutions in the
        resnet core).
    stride : int
        Conv stride
    train : bool or tf boolean tensor
        Whether we are in the train phase or not. Can set to a tensorflow tensor
        so that it can be modified on the fly.
    wd : float
        Weight decay term for the convolutional weights
    bn_momentum : float
        The momentum for the batch normalization layers in the resnet
    bn_epsilon : float
        The epsilon for the batch normalization layers in the resnet

    Notes
    -----
    When training, the moving_mean and moving_variance need to be updated. By
    default the update ops are placed in tf.GraphKeys.UPDATE_OPS, so they need
    to be added as a dependency to the train_op. For example::

        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_op = optimizer.minimize(loss)
    """
    bn_class = lambda name: normalization.BatchNormalization(
        name=name, momentum=bn_momentum, epsilon=bn_epsilon)

    orig_x = x
    with tf.variable_scope(name):
        bn = bn_class('init_bn')
        x = bn.apply(x, training=train)
        x = tf.nn.relu(x)

        # The projection shortcut should come after the first batch norm and
        # ReLU since it performs a 1x1 convolution.
        if stride > 1:
            orig_x = tf.layers.conv2d(
                orig_x,
                filters=filters,
                strides=stride,
                kernel_size=1,
                padding='VALID',
                use_bias=False,
                kernel_initializer=tf.variance_scaling_initializer(),
                data_format='channels_last')

        x = _residual_core(x, filters, kernel_size, stride, train, wd,
                           bn_momentum, bn_epsilon)

        y = tf.add(x, orig_x)

    return y
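
In equation form (notation mine), the block above is a pre-activation residual unit:

$$
y = F\big(\mathrm{ReLU}(\mathrm{BN}(x))\big) + \mathrm{shortcut}(x),
\qquad
\mathrm{shortcut}(x) =
\begin{cases}
x, & \text{stride} = 1,\\
\mathrm{Conv}_{1\times 1,\ \text{strided}}(x), & \text{stride} > 1,
\end{cases}
$$

where F is the conv → BN → ReLU → conv stack implemented by `_residual_core` below.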
Example #25
def _residual_core(x,
                   filters,
                   kernel_size=3,
                   stride=1,
                   train=True,
                   wd=0.0,
                   bn_momentum=0.99,
                   bn_epsilon=0.001):
    """ Core function of a residual unit.

    In -> conv -> bn -> relu -> conv

    Note that the normal residual layer has a batch norm and relu before the
    first conv. This is in the residual function which calls this.

    Parameters
    ----------
    x : tf tensor
        Input to be modified
    filters : int
        Number of output filters (will be used for all convolutions in the
        resnet core).
    kernel_size : int
        Size of the filter kernels
    stride : int
        Conv stride
    train : bool or tf boolean tensor
        Whether we are in the train phase or not. Can set to a tensorflow tensor
        so that it can be modified on the fly.
    wd : float
        Weight decay term for the convolutional weights
    bn_momentum : float
        The momentum for the batch normalization layers in the resnet
    bn_epsilon : float
        The epsilon for the batch normalization layers in the resnet
    """

    init = init_ops.VarianceScaling(scale=1.0, mode='fan_out')
    reg = lambda w: real_reg(w, wd, norm=2)
    bn_class = lambda name: normalization.BatchNormalization(
        name=name, momentum=bn_momentum, epsilon=bn_epsilon)
    conv_class = lambda name, stride: convolutional.Conv2D(
        filters,
        3, (stride, stride),
        use_bias=False,
        padding=('SAME' if stride == 1 else 'VALID'),
        kernel_initializer=init,
        kernel_regularizer=reg,
        name=name)

    with tf.variable_scope('sub1'):
        # As we will do downsampling with strides, need to make sure the output
        # size is the correct format.
        if stride > 1:
            x = fixed_padding(x, kernel_size, 'channels_last')

        conv = conv_class('conv1', stride)
        x = conv.apply(x)

    with tf.variable_scope('sub2'):
        bn = bn_class('between_bn')
        x = bn.apply(x, training=train)
        x = tf.nn.relu(x)
        conv = conv_class('conv2', 1)
        x = conv.apply(x)

    return x
Example #26
    def __init__(self,
                 units,
                 hidden_units,
                 feature_columns,
                 activation_fn,
                 dropout,
                 batch_norm,
                 name=None,
                 **kwargs):
        super(_DNNModelV2, self).__init__(name=name, **kwargs)

        # Add this name_scope for backward compatibility, as previously it's used
        # in variable_scope
        with ops.name_scope(
                'input_from_feature_columns') as input_feature_column_scope:
            layer_name = input_feature_column_scope + 'input_layer'
            if feature_column_lib.is_feature_column_v2(feature_columns):
                self._input_layer = feature_column_lib.DenseFeatures(
                    feature_columns=feature_columns, name=layer_name)
            else:
                self._input_layer = feature_column.InputLayer(
                    feature_columns=feature_columns,
                    name=layer_name,
                    create_scope_now=False)

        self._add_layer(self._input_layer, self._input_layer.name)

        self._dropout = dropout
        self._batch_norm = batch_norm

        self._hidden_layers = []
        self._dropout_layers = []
        self._batch_norm_layers = []
        self._hidden_layer_scope_names = []
        for layer_id, num_hidden_units in enumerate(hidden_units):
            with ops.name_scope('hiddenlayer_%d' %
                                layer_id) as hidden_layer_scope:
                # Get scope name without the trailing slash.
                hidden_shared_name = _name_from_scope_name(hidden_layer_scope)
                hidden_layer = core_layers.Dense(
                    units=num_hidden_units,
                    activation=activation_fn,
                    kernel_initializer=init_ops.glorot_uniform_initializer(),
                    name=hidden_shared_name)
                self._add_layer(hidden_layer, hidden_shared_name)
                self._hidden_layer_scope_names.append(hidden_shared_name)
                self._hidden_layers.append(hidden_layer)
                if self._dropout is not None:
                    dropout_layer = core_layers.Dropout(rate=self._dropout)
                    self._add_layer(dropout_layer, dropout_layer.name)
                    self._dropout_layers.append(dropout_layer)
                if self._batch_norm:
                    batch_norm_name = hidden_shared_name + '/batchnorm_%d' % layer_id
                    batch_norm_layer = normalization.BatchNormalization(
                        # The default momentum 0.99 actually crashes on certain
                        # problem, so here we use 0.999, which is the default of
                        # tf.contrib.layers.batch_norm.
                        momentum=0.999,
                        trainable=True,
                        name=batch_norm_name)
                    self._add_layer(batch_norm_layer, batch_norm_name)
                    self._batch_norm_layers.append(batch_norm_layer)

        with ops.name_scope('logits') as logits_scope:
            logits_shared_name = _name_from_scope_name(logits_scope)
            self._logits_layer = core_layers.Dense(
                units=units,
                activation=None,
                kernel_initializer=init_ops.glorot_uniform_initializer(),
                name=logits_shared_name)
            self._add_layer(self._logits_layer, logits_shared_name)
            self._logits_scope_name = logits_shared_name
    def testForwardInverse(self):
        """Tests forward and backward passes with different event shapes.

        Each entry of `params` below provides:
          input_shape: Tuple of shapes for input tensor.
          event_dims: Tuple of dimension indices that will be normalized.
          training: Boolean of whether bijector runs in training or inference mode.
        """
        params = [((5 * 2, 4), [-1], False), ((5, 2, 4), [-1], False),
                  ((5, 2, 4), [1, 2], False), ((5, 2, 4), [0, 1], False),
                  ((5 * 2, 4), [-1], True), ((5, 2, 4), [-1], True),
                  ((5, 2, 4), [1, 2], True), ((5, 2, 4), [0, 1], True)]
        for input_shape, event_dims, training in params:
            x_ = np.arange(5 * 4 * 2).astype(np.float32).reshape(input_shape)
            with self.cached_session() as sess:
                x = constant_op.constant(x_)
                # With momentum=0., the layer memorizes the exact mean and variance
                # of the last minibatch it normalized, instead of keeping a moving
                # average.
                layer = normalization.BatchNormalization(axis=event_dims,
                                                         momentum=0.,
                                                         epsilon=0.)
                batch_norm = BatchNormalization(batchnorm_layer=layer,
                                                training=training)
                # Minibatch statistics are saved only after norm_x has been computed.
                norm_x = batch_norm.inverse(x)
                with ops.control_dependencies(batch_norm.batchnorm.updates):
                    moving_mean = array_ops.identity(
                        batch_norm.batchnorm.moving_mean)
                    moving_var = array_ops.identity(
                        batch_norm.batchnorm.moving_variance)
                    denorm_x = batch_norm.forward(array_ops.identity(norm_x))
                    fldj = batch_norm.forward_log_det_jacobian(
                        x, event_ndims=len(event_dims))
                    # Use identity to invalidate cache.
                    ildj = batch_norm.inverse_log_det_jacobian(
                        array_ops.identity(denorm_x),
                        event_ndims=len(event_dims))
                variables.global_variables_initializer().run()
                # Update variables.
                norm_x_ = sess.run(norm_x)
                [
                    norm_x_,
                    moving_mean_,
                    moving_var_,
                    denorm_x_,
                    ildj_,
                    fldj_,
                ] = sess.run([
                    norm_x,
                    moving_mean,
                    moving_var,
                    denorm_x,
                    ildj,
                    fldj,
                ])
                self.assertEqual("batch_normalization", batch_norm.name)

                reduction_axes = self._reduction_axes(input_shape, event_dims)
                keepdims = len(event_dims) > 1

                expected_batch_mean = np.mean(x_,
                                              axis=reduction_axes,
                                              keepdims=keepdims)
                expected_batch_var = np.var(x_,
                                            axis=reduction_axes,
                                            keepdims=keepdims)

                if training:
                    # When training=True, values become normalized across batch dim and
                    # original values are recovered after de-normalizing.
                    zeros = np.zeros_like(norm_x_)
                    self.assertAllClose(np.mean(zeros, axis=reduction_axes),
                                        np.mean(norm_x_, axis=reduction_axes))

                    self.assertAllClose(expected_batch_mean, moving_mean_)
                    self.assertAllClose(expected_batch_var, moving_var_)
                    self.assertAllClose(x_, denorm_x_, atol=1e-5)
                    # Since moving statistics are set to batch statistics after
                    # normalization, ildj and -fldj should match.
                    self.assertAllClose(ildj_, -fldj_)
                    # ildj is computed with minibatch statistics.
                    expected_ildj = np.sum(
                        np.log(1.) - .5 * np.log(expected_batch_var +
                                                 batch_norm.batchnorm.epsilon))
                    self.assertAllClose(expected_ildj, ildj_)
                else:
                    # When training=False, moving_mean, moving_var remain at their
                    # initialized values (0., 1.), resulting in no scale/shift (a small
                    # shift occurs if epsilon > 0.)
                    self.assertAllClose(x_, norm_x_)
                    self.assertAllClose(x_, denorm_x_, atol=1e-5)
                    # ildj is computed with saved statistics.
                    expected_ildj = np.sum(
                        np.log(1.) -
                        .5 * np.log(1. + batch_norm.batchnorm.epsilon))
                    self.assertAllClose(expected_ildj, ildj_)
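
# As a quick sanity check of the identities the assertions above rely on, here is
# a NumPy-only sketch, independent of the test and of TensorFlow, of the
# normalize/denormalize round trip and of
# ildj = sum(log(gamma) - 0.5 * log(var + eps)) = -fldj for gamma = 1, beta = 0.
import numpy as np

eps = 0.
x = np.arange(5 * 2 * 4, dtype=np.float32).reshape(5 * 2, 4)
mean, var = x.mean(axis=0), x.var(axis=0)

norm_x = (x - mean) / np.sqrt(var + eps)        # what bijector.inverse(x) computes
denorm_x = norm_x * np.sqrt(var + eps) + mean   # what bijector.forward(norm_x) computes

# gamma = 1, so log(gamma) = log(1.) below; fldj is just the negation.
ildj = np.sum(np.log(1.) - 0.5 * np.log(var + eps))
fldj = -ildj

assert np.allclose(norm_x.mean(axis=0), 0., atol=1e-4)  # normalized across the batch
assert np.allclose(x, denorm_x, atol=1e-4)              # round trip recovers x
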
Example #28
0
  def __init__(self,
               units,
               hidden_units,
               feature_columns,
               activation_fn,
               dropout,
               input_layer_partitioner,
               batch_norm,
               name=None,
               **kwargs):
    super(_DNNModel, self).__init__(name=name, **kwargs)
    if feature_column_lib.is_feature_column_v2(feature_columns):
      self._input_layer = feature_column_lib.DenseFeatures(
          feature_columns=feature_columns, name='input_layer')
    else:
      self._input_layer = feature_column.InputLayer(
          feature_columns=feature_columns,
          name='input_layer',
          create_scope_now=False)

    self._add_layer(self._input_layer, 'input_layer')

    self._dropout = dropout
    self._batch_norm = batch_norm

    self._hidden_layers = []
    self._dropout_layers = []
    self._batch_norm_layers = []
    self._hidden_layer_scope_names = []
    for layer_id, num_hidden_units in enumerate(hidden_units):
      with variable_scope.variable_scope(
          'hiddenlayer_%d' % layer_id) as hidden_layer_scope:
        hidden_layer = core_layers.Dense(
            units=num_hidden_units,
            activation=activation_fn,
            kernel_initializer=init_ops.glorot_uniform_initializer(),
            name=hidden_layer_scope,
            _scope=hidden_layer_scope)
        self._add_layer(hidden_layer, hidden_layer_scope.name)
        self._hidden_layer_scope_names.append(hidden_layer_scope.name)
        self._hidden_layers.append(hidden_layer)
        if self._dropout is not None:
          dropout_layer = core_layers.Dropout(rate=self._dropout)
          self._add_layer(dropout_layer, dropout_layer.name)
          self._dropout_layers.append(dropout_layer)
        if self._batch_norm:
          batch_norm_layer = normalization.BatchNormalization(
              # The default momentum 0.99 actually crashes on certain
              # problems, so we use 0.999 here, which is the default of
              # tf.contrib.layers.batch_norm.
              momentum=0.999,
              trainable=True,
              name='batchnorm_%d' % layer_id,
              _scope='batchnorm_%d' % layer_id)
          self._add_layer(batch_norm_layer, batch_norm_layer.name)
          self._batch_norm_layers.append(batch_norm_layer)

    with variable_scope.variable_scope('logits') as logits_scope:
      self._logits_layer = core_layers.Dense(
          units=units,
          activation=None,
          kernel_initializer=init_ops.glorot_uniform_initializer(),
          name=logits_scope,
          _scope=logits_scope)
      self._add_layer(self._logits_layer, logits_scope.name)
      self._logits_scope_name = logits_scope.name
    self._input_layer_partitioner = input_layer_partitioner
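
# One practical detail both constructors leave to the surrounding estimator
# plumbing: in TF 1.x graph mode, the moving-mean/variance assignments created
# by BatchNormalization are typically collected in tf.GraphKeys.UPDATE_OPS and
# have to run alongside the train op. Below is a minimal sketch of that wiring;
# the helper name make_train_op and the optimizer choice are illustrative, not
# taken from the code above.
import tensorflow as tf

def make_train_op(loss, learning_rate=0.01):
  # Hypothetical helper: make the train op depend on UPDATE_OPS so the
  # batch-norm moving statistics are updated every training step.
  update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
  with tf.control_dependencies(update_ops):
    return tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)
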