def testRegularizers(self):
  reg = lambda x: 0.1 * math_ops.reduce_sum(x)
  bn = normalization_layers.BatchNormalization(axis=1, beta_regularizer=reg)
  inputs = random_ops.random_uniform((5, 4, 3), seed=1)
  training = array_ops.placeholder(dtype='bool')
  _ = bn.apply(inputs, training=training)
  self.assertEqual(len(bn.losses), 1)

  bn = normalization_layers.BatchNormalization(axis=1, gamma_regularizer=reg)
  inputs = random_ops.random_uniform((5, 4, 3), seed=1)
  training = array_ops.placeholder(dtype='bool')
  _ = bn.apply(inputs, training=training)
  self.assertEqual(len(bn.losses), 1)

def batchnorm_example(optimizer_fn,
                      batch_per_epoch=1,
                      momentum=0.9,
                      renorm=False,
                      update_ops_in_tower_mode=False):
  """Example of non-distribution-aware legacy code with batch normalization."""

  def dataset_fn():
    # input shape is [16, 8], input values are increasing in both dimensions.
    return dataset_ops.Dataset.from_tensor_slices(
        [[[float(x * 8 + y + z * 100) for y in range(8)] for x in range(16)]
         for z in range(batch_per_epoch)]).repeat()

  optimizer = optimizer_fn()
  batchnorm = normalization.BatchNormalization(
      renorm=renorm, momentum=momentum, fused=False)
  layer = core.Dense(1, use_bias=False)

  def model_fn(x):
    """A model that uses batchnorm."""

    def loss_fn():
      y = batchnorm(x, training=True)
      with ops.control_dependencies(
          ops.get_collection(ops.GraphKeys.UPDATE_OPS)
          if update_ops_in_tower_mode else []):
        loss = math_ops.reduce_mean(
            math_ops.reduce_sum(layer(y)) - constant_op.constant(1.))
      # `x` and `y` will be fetched by the gradient computation, but not `loss`.
      return loss

    # Callable loss.
    return optimizer.minimize(loss_fn)

  return model_fn, dataset_fn, batchnorm

def __init__(self,
             batchnorm_layer=None,
             training=True,
             validate_args=False,
             name="batch_normalization"):
  """Instantiates the `BatchNorm` bijector.

  Args:
    batchnorm_layer: `tf.layers.BatchNormalization` layer object. If `None`,
      defaults to
      `tf.layers.BatchNormalization(gamma_constraint=nn_ops.relu(x) + 1e-6)`.
      This ensures positivity of the scale variable.
    training: If True, updates running-average statistics during call to
      `inverse()`.
    validate_args: Python `bool` indicating whether arguments should be
      checked for correctness.
    name: Python `str` name given to ops managed by this object.

  Raises:
    ValueError: If `batchnorm_layer` is not an instance of
      `tf.layers.BatchNormalization`, or if it is specified with
      `renorm=True` or a virtual batch size.
  """
  # Scale must be positive.
  g_constraint = lambda x: nn.relu(x) + 1e-6
  self.batchnorm = batchnorm_layer or normalization.BatchNormalization(
      gamma_constraint=g_constraint)
  self._validate_bn_layer(self.batchnorm)
  self._training = training
  super(BatchNormalization, self).__init__(
      validate_args=validate_args, name=name)

def testLogProb(self, event_shape, event_dims, training):
  training = tf.placeholder_with_default(training, (), "training")
  layer = normalization.BatchNormalization(axis=event_dims, epsilon=0.)
  batch_norm = tfb.BatchNormalization(batchnorm_layer=layer, training=training)
  base_dist = distributions.MultivariateNormalDiag(
      loc=np.zeros(np.prod(event_shape), dtype=np.float32))
  # Reshape the events.
  if isinstance(event_shape, int):
    event_shape = [event_shape]
  base_dist = distributions.TransformedDistribution(
      distribution=base_dist,
      bijector=tfb.Reshape(event_shape_out=event_shape))
  dist = distributions.TransformedDistribution(
      distribution=base_dist, bijector=batch_norm, validate_args=True)
  samples = dist.sample(int(1e5))
  # No volume distortion since training=False, bijector is initialized
  # to the identity transformation.
  base_log_prob = base_dist.log_prob(samples)
  dist_log_prob = dist.log_prob(samples)
  self.evaluate(tf.global_variables_initializer())
  base_log_prob_, dist_log_prob_ = self.evaluate(
      [base_log_prob, dist_log_prob])
  self.assertAllClose(base_log_prob_, dist_log_prob_)

def testCreateBN(self):
  # Call layer.
  bn = normalization_layers.BatchNormalization(axis=1)
  inputs = random_ops.random_uniform((5, 4, 3), seed=1)
  training = array_ops.placeholder(dtype='bool')
  outputs = bn.apply(inputs, training=training)

  # Verify shape.
  self.assertListEqual(outputs.get_shape().as_list(), [5, 4, 3])

  # Verify layer attributes.
  self.assertEqual(len(bn.updates), 2)
  self.assertEqual(len(bn.variables), 4)
  self.assertEqual(len(bn.trainable_variables), 2)
  self.assertEqual(len(bn.non_trainable_variables), 2)

  # Test that updates were created and added to UPDATE_OPS.
  self.assertEqual(len(bn.updates), 2)
  self.assertListEqual(
      ops.get_collection(ops.GraphKeys.UPDATE_OPS), bn.updates)

  # Test that weights were created and added to TRAINABLE_VARIABLES.
  self.assertListEqual(
      ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES),
      bn.trainable_variables)

def batchnorm_example(optimizer_fn, batch_per_epoch=1, momentum=0.9,
                      renorm=False):
  """Example of non-distribution-aware legacy code with batch normalization."""

  def dataset_fn():
    # input shape is [16, 8], input values are increasing in both dimensions.
    return dataset_ops.Dataset.from_tensor_slices(
        [[[float(x * 8 + y + z * 100) for y in range(8)] for x in range(16)]
         for z in range(batch_per_epoch)]).repeat()

  optimizer = optimizer_fn()
  batchnorm = normalization.BatchNormalization(
      renorm=renorm, momentum=momentum, fused=False)

  def model_fn(x):

    def loss_fn():
      y = math_ops.reduce_sum(batchnorm(x, training=True), axis=1)
      loss = math_ops.reduce_mean(y - constant_op.constant(1.))
      return loss

    # Callable loss.
    return optimizer.minimize(loss_fn)

  return model_fn, dataset_fn, batchnorm

def output_logits_from_dnn(fields_embeddings, params, is_training):
  dropout_rate = params['dropout_rate']
  do_batch_norm = params['batch_norm']

  X = tf.concat(fields_embeddings, axis=1)
  tf.logging.info("initial input to DNN, shape={}".format(X.shape))

  for idx, n_units in enumerate(params['hidden_units'], start=1):
    X = tf.layers.dense(X, units=n_units, activation=tf.nn.relu)
    tf.logging.info("layer[{}] output shape={}".format(idx, X.shape))

    X = tf.layers.dropout(inputs=X, rate=dropout_rate, training=is_training)
    if is_training:
      tf.logging.info("layer[{}] dropout {}".format(idx, dropout_rate))

    if do_batch_norm:
      # The BatchNormalization call and its parameters are copied from the
      # DNNLinearCombinedClassifier source code.
      batch_norm_layer = normalization.BatchNormalization(
          momentum=0.999, trainable=True, name='batchnorm_{}'.format(idx))
      X = batch_norm_layer(X, training=is_training)

      if is_training:
        tf.logging.info("layer[{}] batch-normalize".format(idx))

  # connect to final logits, [batch_size, 1]
  return tf.layers.dense(X, units=1, use_bias=True, activation=None)

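# A hypothetical invocation of the builder above (not from the original
# source). The `example_params` keys mirror the ones read inside
# `output_logits_from_dnn`; the two placeholders stand in for real per-field
# embedding tensors of shape [batch_size, embedding_dim].
def example_dnn_logits():
  example_params = {
      'hidden_units': [128, 64],
      'dropout_rate': 0.3,
      'batch_norm': True,
  }
  field_emb_a = tf.placeholder(tf.float32, shape=[None, 16])
  field_emb_b = tf.placeholder(tf.float32, shape=[None, 16])
  return output_logits_from_dnn([field_emb_a, field_emb_b], example_params,
                                is_training=True)
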
def __init__(self, name="batch_norm"):
  super(BatchNorm, self).__init__(name=name)
  with self._enter_variable_scope():
    self._bn = normalization.BatchNormalization(
        axis=1, epsilon=np.finfo(np.float32).eps, momentum=0.9)

def testRenorm(self):
  shape = (4, 3)
  xt = array_ops.placeholder(dtypes.float32, shape)
  momentum = 0.99
  renorm_momentum = 0.8
  rmax = 1.1
  rmin = 0.9
  dmax = 0.1
  gamma = 2.
  beta = 3.
  epsilon = 0.001
  bn = normalization_layers.BatchNormalization(
      axis=1,
      gamma_initializer=init_ops.constant_initializer(gamma),
      beta_initializer=init_ops.constant_initializer(beta),
      epsilon=epsilon,
      momentum=momentum,
      renorm=True,
      renorm_clipping={'rmax': rmax, 'rmin': rmin, 'dmax': dmax},
      renorm_momentum=renorm_momentum)
  training = array_ops.placeholder(dtypes.bool)
  yt = bn.apply(xt, training=training)

  moving_mean = 0.
  moving_variance = 1.
  renorm_mean = renorm_stddev = 0.
  renorm_weight = 0.
  with self.test_session(use_gpu=True) as sess:
    sess.run(variables.global_variables_initializer())
    for _ in range(5):
      x = np.random.random(shape)

      mean = x.mean(0)
      stddev = np.sqrt(x.var(0) + epsilon)
      adj_mean = renorm_mean + (1. - renorm_weight) * mean
      adj_stddev = renorm_stddev + (1. - renorm_weight) * stddev
      r = (stddev / adj_stddev).clip(rmin, rmax)
      d = ((mean - adj_mean) / adj_stddev).clip(-dmax, dmax)
      y_train = ((x - mean) / stddev * r + d) * gamma + beta
      renorm_mean += (mean - renorm_mean) * (1. - renorm_momentum)
      renorm_stddev += (stddev - renorm_stddev) * (1. - renorm_momentum)
      renorm_weight += (1. - renorm_weight) * (1. - renorm_momentum)
      moving_mean += (renorm_mean / renorm_weight -
                      moving_mean) * (1. - momentum)
      moving_variance += ((renorm_stddev / renorm_weight) ** 2 - epsilon -
                          moving_variance) * (1. - momentum)

      y_test = ((x - moving_mean) /
                (moving_variance + epsilon) ** 0.5 * gamma) + beta

      yt_val_train, _, _ = sess.run([yt] + bn.updates,
                                    feed_dict={xt: x, training: True})
      yt_val_test, _, _ = sess.run([yt] + bn.updates,
                                   feed_dict={xt: x, training: False})

      self.assertAllClose(y_train, yt_val_train, atol=1e-5)
      self.assertAllClose(y_test, yt_val_test, atol=1e-5)

def testConstraints(self):
  g_constraint = lambda x: x / math_ops.reduce_sum(x)
  b_constraint = lambda x: x / math_ops.reduce_max(x)
  bn = normalization_layers.BatchNormalization(
      axis=1, gamma_constraint=g_constraint, beta_constraint=b_constraint)
  inputs = random_ops.random_uniform((5, 4, 3), seed=1)
  bn(inputs)
  self.assertEqual(bn.gamma_constraint, g_constraint)
  self.assertEqual(bn.beta_constraint, b_constraint)

def testNoScale(self):
  bn = normalization_layers.BatchNormalization(axis=1, scale=False)
  inputs = random_ops.random_uniform((5, 4, 3), seed=1)
  training = array_ops.placeholder(dtype='bool')
  outputs = bn.apply(inputs, training=training)

  # Verify shape.
  self.assertListEqual(outputs.get_shape().as_list(), [5, 4, 3])

  # Verify layer attributes.
  self.assertEqual(len(bn.updates), 2)
  self.assertEqual(len(bn.variables), 3)
  self.assertEqual(len(bn.trainable_variables), 1)
  self.assertEqual(len(bn.non_trainable_variables), 2)

def testGhostBN4DimsAxis1(self):
  shape = [6, 3, 10, 10]
  num_virtual_batches = 3
  beta = 2.
  gamma = 3.
  momentum = 0.8
  epsilon = 1e-3
  moving_means = np.zeros([1, 3, 3, 1, 1], dtype=np.float32)
  moving_vars = np.ones([1, 3, 3, 1, 1], dtype=np.float32)

  inp = array_ops.placeholder(dtypes.float32, shape)
  is_training = array_ops.placeholder(dtypes.bool)
  bn = normalization_layers.BatchNormalization(
      axis=1,
      momentum=momentum,
      epsilon=epsilon,
      beta_initializer=init_ops.constant_initializer(beta),
      gamma_initializer=init_ops.constant_initializer(gamma),
      num_virtual_batches=num_virtual_batches,
      fused=False)  # NCHW is unsupported by CPU fused batch norm
  out = bn.apply(inp, training=is_training)

  ghost_shape = ([shape[0] // num_virtual_batches, num_virtual_batches] +
                 shape[1:])

  with self.test_session(use_gpu=True) as sess:
    sess.run(variables.global_variables_initializer())
    for _ in range(5):
      x = np.random.random(shape)

      sub_batched = np.reshape(x, ghost_shape)
      means = np.mean(sub_batched, axis=(0, 3, 4), keepdims=True)
      variances = np.var(sub_batched, axis=(0, 3, 4), keepdims=True)

      moving_means = moving_means * momentum + means * (1. - momentum)
      moving_vars = moving_vars * momentum + variances * (1. - momentum)

      y_train = ((sub_batched - means) /
                 (variances + epsilon) ** 0.5 * gamma) + beta
      y_test = ((sub_batched - moving_means) /
                (moving_vars + epsilon) ** 0.5 * gamma) + beta

      y_train = np.reshape(y_train, shape)
      y_test = np.reshape(y_test, shape)

      y_val_train, _, _ = sess.run([out] + bn.updates,
                                   feed_dict={inp: x, is_training: True})
      y_val_test = sess.run(out, feed_dict={inp: x, is_training: False})

      self.assertAllClose(y_train, y_val_train, atol=1e-2)
      self.assertAllClose(y_test, y_val_test, atol=1e-2)

def step_fn(is_training, inputs, targets=None):
  bn = normalization.BatchNormalization(
      axis=3, epsilon=1e-3, momentum=0.9, fused=fused)
  bn_list.append(bn)
  outputs = bn.apply(inputs, training=is_training)
  if not is_training:
    return outputs
  loss = losses.mean_squared_error(targets, outputs)
  optimizer = gradient_descent.GradientDescentOptimizer(0.01)
  train_op = optimizer.minimize(loss)
  with ops.control_dependencies([train_op]):
    return array_ops.identity(loss)

def test4DInputAxis1Fused(self):
  if test.is_gpu_available(cuda_only=True):
    epsilon = 1e-3
    bn = normalization_layers.BatchNormalization(
        axis=1, epsilon=epsilon, momentum=0.9, fused=True)
    inputs = variables.Variable(
        np.random.random((5, 4, 3, 6)) + 100, dtype=dtypes.float32)
    training = array_ops.placeholder(dtype='bool')
    outputs = bn.apply(inputs, training=training)

    with self.test_session() as sess:
      # Test training with placeholder learning phase.
      sess.run(variables.global_variables_initializer())
      np_gamma, np_beta = sess.run([bn.gamma, bn.beta])
      np_gamma = np.reshape(np_gamma, (1, 4, 1, 1))
      np_beta = np.reshape(np_beta, (1, 4, 1, 1))
      for _ in range(100):
        np_output, _, _ = sess.run([outputs] + bn.updates,
                                   feed_dict={training: True})
        # Verify that the axis is normalized during training by undoing the
        # learned affine transform.
        normed_np_output = (np_output - np_beta) / np_gamma
        self.assertAlmostEqual(np.mean(normed_np_output), 0., places=1)
        self.assertAlmostEqual(np.std(normed_np_output), 1., places=1)

      # Verify that the statistics are updated during training.
      moving_mean, moving_var = sess.run(
          [bn.moving_mean, bn.moving_variance])
      np_inputs = sess.run(inputs)
      mean = np.mean(np_inputs, axis=(0, 2, 3))
      std = np.std(np_inputs, axis=(0, 2, 3))
      variance = np.square(std)
      self.assertAllClose(mean, moving_mean, atol=1e-2)
      self.assertAllClose(variance, moving_var, atol=1e-2)

      # Test inference with placeholder learning phase.
      np_output = sess.run(outputs, feed_dict={training: False})

      # Verify that the axis is normalized during inference.
      normed_np_output = (np_output - np_beta) / np_gamma
      self.assertAlmostEqual(np.mean(normed_np_output), 0., places=1)
      self.assertAlmostEqual(np.std(normed_np_output), 1., places=1)

def testBNWithZeroBatchInput(self, distribution, fused):
  distribution.extended.experimental_enable_get_next_as_optional = True
  with distribution.scope():
    inputs = np.random.random((0, 4, 4, 3)).astype(np.float32) + 100
    targets = np.random.random((0, 4, 4, 3)).astype(np.float32)
    bn = normalization.BatchNormalization(
        axis=3, epsilon=1e-3, momentum=0.9, fused=fused)
    optimizer = gradient_descent.GradientDescentOptimizer(0.01)

    @def_function.function
    def train_step():
      def step_fn(inputs, targets):
        with backprop.GradientTape() as tape:
          outputs = bn.apply(inputs, training=True)
          loss = losses.mean_squared_error(targets, outputs)
        grads = tape.gradient(loss, bn.variables)
        optimizer.apply_gradients(zip(grads, bn.variables))
        return loss

      return distribution.experimental_run_v2(step_fn, args=(inputs, targets))

    for _ in range(100):
      np_output = train_step().numpy()
      self.assertEqual(0.0, np_output)

    # Verify that the statistics and weights are not changed after training.
    self.assertAllEqual([0, 0, 0], bn.moving_mean.numpy())
    self.assertAllEqual([1, 1, 1], bn.moving_variance.numpy())
    self.assertAllEqual([1, 1, 1], bn.gamma.numpy())
    self.assertAllEqual([0, 0, 0], bn.beta.numpy())

    @def_function.function
    def test_step():
      def step_fn(inputs):
        outputs = bn.apply(inputs, training=False)
        return outputs

      return distribution.experimental_run_v2(step_fn, args=(inputs,))

    # Test inference.
    self.assertAllEqual(
        np.zeros(shape=(0, 4, 4, 3), dtype=np.float32), test_step().numpy())

def testLogProb(self):
  with self.test_session() as sess:
    layer = normalization.BatchNormalization(epsilon=0.)
    batch_norm = BatchNormalization(batchnorm_layer=layer, training=False)
    base_dist = distributions.MultivariateNormalDiag(loc=[0., 0.])
    dist = transformed_distribution_lib.TransformedDistribution(
        distribution=base_dist, bijector=batch_norm, validate_args=True)
    samples = dist.sample(int(1e5))
    # No volume distortion since training=False, bijector is initialized
    # to the identity transformation.
    base_log_prob = base_dist.log_prob(samples)
    dist_log_prob = dist.log_prob(samples)
    variables.global_variables_initializer().run()
    base_log_prob_, dist_log_prob_ = sess.run([base_log_prob, dist_log_prob])
    self.assertAllClose(base_log_prob_, dist_log_prob_)

def testInvertMutuallyConsistent(self):
  # BatchNorm bijector is only mutually consistent when training=False.
  dims = 4
  with self.cached_session() as sess:
    layer = normalization.BatchNormalization(epsilon=0.)
    batch_norm = Invert(
        BatchNormalization(batchnorm_layer=layer, training=False))
    dist = transformed_distribution_lib.TransformedDistribution(
        distribution=normal_lib.Normal(loc=0., scale=1.),
        bijector=batch_norm,
        event_shape=[dims],
        validate_args=True)
    self.run_test_sample_consistent_log_prob(
        sess_run_fn=sess.run,
        dist=dist,
        num_samples=int(1e5),
        radius=2.,
        center=0.,
        rtol=0.02)

def batch_norm(self, input_layer=None, decay=0.999, scale=False,
               epsilon=0.001):
  """Adds a Batch Normalization layer."""
  if input_layer is None:
    input_layer = self.top_layer
  else:
    self.top_size = None
  name = 'batchnorm' + str(self.counts['batchnorm'])
  self.counts['batchnorm'] += 1

  center = True
  with tf.variable_scope(name) as scope:
    if self.use_tf_layers:
      layer_obj = normalization_layers.BatchNormalization(
          momentum=decay,
          scale=scale,
          epsilon=epsilon,
          fused=True,
          axis=_data_format_to_channel_axis[self.data_format],
          # We pass this 'scope' argument for compatibility with checkpoints
          # created with the contrib version of batch norm. tf_cnn_benchmarks
          # used to use the contrib version.
          _scope=scope,
          center=center,
          name=scope.name)
      bn = layer_obj.apply(input_layer, training=self.phase_train)
    else:
      bn = self._batch_norm_without_layers(input_layer, decay, scale, epsilon)
  self.top_layer = bn
  self.top_size = bn.shape[3] if self.data_format == 'NHWC' else bn.shape[1]
  self.top_size = int(self.top_size)
  mlperf.logger.log_batch_norm(
      input_tensor=input_layer, output_tensor=bn, momentum=decay,
      epsilon=epsilon, center=center, scale=scale, training=self.phase_train)
  return bn

def testInvertMutuallyConsistent(self):
  # BatchNorm bijector is only mutually consistent when training=False.
  dims = 4
  training = tf.placeholder_with_default(False, (), "training")
  layer = normalization.BatchNormalization(epsilon=0.)
  batch_norm = tfb.Invert(
      tfb.BatchNormalization(batchnorm_layer=layer, training=training))
  dist = distributions.TransformedDistribution(
      distribution=distributions.Normal(loc=0., scale=1.),
      bijector=batch_norm,
      event_shape=[dims],
      validate_args=True)
  self.run_test_sample_consistent_log_prob(
      sess_run_fn=self.evaluate,
      dist=dist,
      num_samples=int(1e5),
      radius=2.,
      center=0.,
      rtol=0.02)

def testBNWithDynamicBatchInputEager(self, distribution, fused):
  distribution.extended.experimental_enable_get_next_as_optional = True
  with distribution.scope():
    # Explicitly create dataset with drop_remainder=False.
    # This would make batch size unknown.
    inputs = np.random.random((11, 4, 4, 3)).astype(np.float32) + 100
    targets = np.random.random((11, 4, 4, 3)).astype(np.float32)
    dataset = dataset_ops.Dataset.from_tensor_slices(
        (inputs, targets)).batch(10, drop_remainder=False).repeat()
    dataset_iterator = iter(
        distribution.experimental_distribute_dataset(dataset))

    bn = normalization.BatchNormalization(
        axis=-1, epsilon=1e-3, momentum=0.9, fused=fused)
    optimizer = gradient_descent.GradientDescentOptimizer(0.01)

    @def_function.function
    def train_step(iterator):
      def step_fn(inputs):
        features, targets = inputs
        with backprop.GradientTape() as tape:
          outputs = bn(features, training=True)
          loss = losses.mean_squared_error(targets, outputs)
        grads = tape.gradient(loss, bn.variables)
        optimizer.apply_gradients(zip(grads, bn.variables))
        return loss

      return distribution.run(step_fn, args=(next(iterator),))

    for _ in range(100):
      train_step(dataset_iterator).numpy()

    # Verify that the statistics and weights are updated.
    self.assertNotAllEqual(np.ndarray([0, 0, 0]), bn.moving_mean.numpy())
    self.assertNotAllEqual(np.ndarray([1, 1, 1]), bn.moving_variance.numpy())
    self.assertNotAllEqual(np.ndarray([1, 1, 1]), bn.gamma.numpy())
    self.assertNotAllEqual(np.ndarray([0, 0, 0]), bn.beta.numpy())

def testBooleanLearningPhase(self):
  epsilon = 1e-3
  bn = normalization_layers.BatchNormalization(
      axis=-1, epsilon=epsilon, momentum=0.9)
  inputs = variables.Variable(
      np.random.random((5, 4, 3, 6)) + 100, dtype=dtypes.float32)
  outputs_training = bn.apply(inputs, training=True)
  outputs_infer = bn.apply(inputs, training=False)

  with self.test_session() as sess:
    # Test training with boolean learning phase.
    sess.run(variables.global_variables_initializer())
    np_gamma, np_beta = sess.run([bn.gamma, bn.beta])
    np_gamma = np.reshape(np_gamma, (1, 1, 1, 6))
    np_beta = np.reshape(np_beta, (1, 1, 1, 6))
    for _ in range(100):
      np_output, _, _ = sess.run([outputs_training] + bn.updates)
      # Verify that the axis is normalized during training by undoing the
      # learned affine transform.
      normed_np_output = (np_output - np_beta) / np_gamma
      self.assertAlmostEqual(np.mean(normed_np_output), 0., places=2)
      self.assertAlmostEqual(np.std(normed_np_output), 1., places=1)

    # Verify that the statistics are updated during training.
    moving_mean, moving_var = sess.run([bn.moving_mean, bn.moving_variance])
    np_inputs = sess.run(inputs)
    mean = np.mean(np_inputs, axis=(0, 1, 2))
    std = np.std(np_inputs, axis=(0, 1, 2))
    variance = np.square(std)
    self.assertAllClose(mean, moving_mean, atol=1e-2)
    self.assertAllClose(variance, moving_var, atol=1e-2)

    # Test inference with boolean learning phase.
    np_output = sess.run(outputs_infer)

    # Verify that the axis is normalized during inference.
    normed_np_output = (np_output - np_beta) / np_gamma
    self.assertAlmostEqual(np.mean(normed_np_output), 0., places=1)
    self.assertAlmostEqual(np.std(normed_np_output), 1., places=1)

def _simple_model(self, image, fused, freeze_mode):
  output_channels, kernel_size = 2, 3
  conv = conv_layers.conv2d(
      image,
      output_channels,
      kernel_size,
      use_bias=False,
      kernel_initializer=init_ops.ones_initializer())
  bn_layer = normalization_layers.BatchNormalization(fused=fused)
  bn_layer._bessels_correction_test_only = False
  training = not freeze_mode
  bn = bn_layer.apply(conv, training=training)
  loss = math_ops.reduce_sum(math_ops.abs(bn))
  optimizer = gradient_descent.GradientDescentOptimizer(0.01)
  if not freeze_mode:
    update_ops = ops.get_collection(ops.GraphKeys.UPDATE_OPS)
    with ops.control_dependencies(update_ops):
      train_op = optimizer.minimize(loss)
  else:
    train_op = optimizer.minimize(loss)
  saver = saver_lib.Saver(write_version=saver_pb2.SaverDef.V2)
  return loss, train_op, saver

def custom_batch_norm(inputs,
                      decay=0.999,
                      center=True,
                      scale=False,
                      epsilon=0.001,
                      activation_fn=None,
                      param_initializers=None,
                      param_regularizers=None,
                      updates_collections=ops.GraphKeys.UPDATE_OPS,
                      is_training=True,
                      reuse=None,
                      variables_collections=None,
                      outputs_collections=None,
                      trainable=True,
                      batch_weights=None,
                      data_format='NHWC',
                      zero_debias_moving_mean=False,
                      scope=None,
                      renorm=False,
                      renorm_clipping=None,
                      renorm_decay=0.99,
                      noise_std=None):
  """Adds a Batch Normalization layer from http://arxiv.org/abs/1502.03167.

    "Batch Normalization: Accelerating Deep Network Training by Reducing
    Internal Covariate Shift"

    Sergey Ioffe, Christian Szegedy

  Can be used as a normalizer function for conv2d and fully_connected.

  Note: when training, the moving_mean and moving_variance need to be updated.
  By default the update ops are placed in `tf.GraphKeys.UPDATE_OPS`, so they
  need to be added as a dependency to the `train_op`. For example:

  ```python
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
      train_op = optimizer.minimize(loss)
  ```

  One can set updates_collections=None to force the updates in place, but that
  can have a speed penalty, especially in distributed settings.

  Args:
    inputs: A tensor with 2 or more dimensions, where the first dimension has
      `batch_size`. The normalization is over all but the last dimension if
      `data_format` is `NHWC` and the second dimension if `data_format` is
      `NCHW`.
    decay: Decay for the moving average. Reasonable values for `decay` are
      close to 1.0, typically in the multiple-nines range: 0.999, 0.99, 0.9,
      etc. Lower `decay` value (recommend trying `decay`=0.9) if model
      experiences reasonably good training performance but poor validation
      and/or test performance. Try zero_debias_moving_mean=True for improved
      stability.
    center: If True, add offset of `beta` to normalized tensor. If False,
      `beta` is ignored.
    scale: If True, multiply by `gamma`. If False, `gamma` is not used. When
      the next layer is linear (also e.g. `nn.relu`), this can be disabled
      since the scaling can be done by the next layer.
    epsilon: Small float added to variance to avoid dividing by zero.
    activation_fn: Activation function, default set to None to skip it and
      maintain a linear activation.
    param_initializers: Optional initializers for beta, gamma, moving mean and
      moving variance.
    param_regularizers: Optional regularizer for beta and gamma.
    updates_collections: Collections to collect the update ops for computation.
      The updates_ops need to be executed with the train_op. If None, a control
      dependency would be added to make sure the updates are computed in place.
    is_training: Whether or not the layer is in training mode. In training mode
      it would accumulate the statistics of the moments into `moving_mean` and
      `moving_variance` using an exponential moving average with the given
      `decay`. When it is not in training mode then it would use the values of
      the `moving_mean` and the `moving_variance`.
    reuse: Whether or not the layer and its variables should be reused. To be
      able to reuse the layer scope must be given.
    variables_collections: Optional collections for the variables.
    outputs_collections: Collections to add the outputs.
    trainable: If `True` also add variables to the graph collection
      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
    batch_weights: An optional tensor of shape `[batch_size]`, containing a
      frequency weight for each batch item. If present, then the batch
      normalization uses weighted mean and variance. (This can be used to
      correct for bias in training example selection.)
    fused: Use nn.fused_batch_norm if True, nn.batch_normalization otherwise.
    data_format: A string. `NHWC` (default) and `NCHW` are supported.
    zero_debias_moving_mean: Use zero_debias for moving_mean. It creates a new
      pair of variables 'moving_mean/biased' and 'moving_mean/local_step'.
    scope: Optional scope for `variable_scope`.
    renorm: Whether to use Batch Renormalization
      (https://arxiv.org/abs/1702.03275). This adds extra variables during
      training. The inference is the same for either value of this parameter.
    renorm_clipping: A dictionary that may map keys 'rmax', 'rmin', 'dmax' to
      scalar `Tensors` used to clip the renorm correction. The correction
      `(r, d)` is used as `corrected_value = normalized_value * r + d`, with
      `r` clipped to [rmin, rmax], and `d` to [-dmax, dmax]. Missing rmax,
      rmin, dmax are set to inf, 0, inf, respectively.
    renorm_decay: Momentum used to update the moving means and standard
      deviations with renorm. Unlike `momentum`, this affects training and
      should be neither too small (which would add noise) nor too large (which
      would give stale estimates). Note that `decay` is still applied to get
      the means and variances for inference.

  Returns:
    A `Tensor` representing the output of the operation.

  Raises:
    ValueError: If `batch_weights` is not None and `fused` is True.
    ValueError: If `param_regularizers` is not None and `fused` is True.
    ValueError: If `data_format` is neither `NHWC` nor `NCHW`.
    ValueError: If the rank of `inputs` is undefined.
    ValueError: If rank or channels dimension of `inputs` is undefined.
  """
  layer_variable_getter = slim.layers._build_variable_getter()
  with variable_scope.variable_scope(
      scope, 'BatchNorm', [inputs], reuse=reuse,
      custom_getter=layer_variable_getter) as sc:
    inputs = ops.convert_to_tensor(inputs)

    # Determine whether we can use the core layer class.
    if (batch_weights is None and
        updates_collections is ops.GraphKeys.UPDATE_OPS and
        not zero_debias_moving_mean):
      # Use the core layer class.
      axis = 1 if data_format == 'NCHW' else -1
      if not param_initializers:
        param_initializers = {}
      beta_initializer = param_initializers.get(
          'beta', init_ops.zeros_initializer())
      gamma_initializer = param_initializers.get(
          'gamma', init_ops.ones_initializer())
      moving_mean_initializer = param_initializers.get(
          'moving_mean', init_ops.zeros_initializer())
      moving_variance_initializer = param_initializers.get(
          'moving_variance', init_ops.ones_initializer())
      if not param_regularizers:
        param_regularizers = {}
      beta_regularizer = param_regularizers.get('beta')
      gamma_regularizer = param_regularizers.get('gamma')
      layer = normalization_layers.BatchNormalization(
          axis=axis,
          momentum=decay,
          epsilon=epsilon,
          center=center,
          scale=scale,
          beta_initializer=beta_initializer,
          gamma_initializer=gamma_initializer,
          moving_mean_initializer=moving_mean_initializer,
          moving_variance_initializer=moving_variance_initializer,
          beta_regularizer=beta_regularizer,
          gamma_regularizer=gamma_regularizer,
          trainable=trainable,
          renorm=renorm,
          renorm_clipping=renorm_clipping,
          renorm_momentum=renorm_decay,
          name=sc.name,
          _scope=sc,
          _reuse=reuse)
      outputs = layer.apply(inputs, training=is_training)

      # Add variables to collections.
      slim.layers._add_variable_to_collections(
          layer.moving_mean, variables_collections, 'moving_mean')
      slim.layers._add_variable_to_collections(
          layer.moving_variance, variables_collections, 'moving_variance')
      if layer.beta:
        slim.layers._add_variable_to_collections(
            layer.beta, variables_collections, 'beta')
      if layer.gamma:
        slim.layers._add_variable_to_collections(
            layer.gamma, variables_collections, 'gamma')

      if activation_fn is not None:
        outputs = activation_fn(outputs)
      return utils.collect_named_outputs(outputs_collections,
                                         sc.original_name_scope, outputs)

    # Not supported by layer class: batch_weights argument,
    # and custom updates_collections. In that case, use the legacy BN
    # implementation.
    # Custom updates collections are not supported because the update logic
    # is different in this case, in particular w.r.t. "forced updates" and
    # update op reuse.
    if renorm:
      raise ValueError('renorm is not supported with batch_weights, '
                       'updates_collections or zero_debias_moving_mean')
    inputs_shape = inputs.get_shape()
    inputs_rank = inputs_shape.ndims
    if inputs_rank is None:
      raise ValueError('Inputs %s has undefined rank.' % inputs.name)
    dtype = inputs.dtype.base_dtype
    if batch_weights is not None:
      batch_weights = ops.convert_to_tensor(batch_weights)
      inputs_shape[0:1].assert_is_compatible_with(batch_weights.get_shape())
      # Reshape batch weight values so they broadcast across inputs.
      nshape = [-1] + [1 for _ in range(inputs_rank - 1)]
      batch_weights = array_ops.reshape(batch_weights, nshape)

    if data_format == 'NCHW':
      moments_axes = [0] + list(range(2, inputs_rank))
      params_shape = inputs_shape[1:2]
      # For NCHW format, rather than relying on implicit broadcasting, we
      # explicitly reshape the params to params_shape_broadcast when computing
      # the moments and the batch normalization.
      params_shape_broadcast = list(
          [1, inputs_shape[1].value] + [1 for _ in range(2, inputs_rank)])
    else:
      moments_axes = list(range(inputs_rank - 1))
      params_shape = inputs_shape[-1:]
      params_shape_broadcast = None
    if not params_shape.is_fully_defined():
      raise ValueError('Inputs %s has undefined channels dimension %s.' %
                       (inputs.name, params_shape))

    # Allocate parameters for the beta and gamma of the normalization.
    beta, gamma = None, None
    if not param_initializers:
      param_initializers = {}
    if center:
      beta_collections = utils.get_variable_collections(
          variables_collections, 'beta')
      beta_initializer = param_initializers.get(
          'beta', init_ops.zeros_initializer())
      beta = variables.model_variable(
          'beta',
          shape=params_shape,
          dtype=dtype,
          initializer=beta_initializer,
          collections=beta_collections,
          trainable=trainable)
    if scale:
      gamma_collections = utils.get_variable_collections(
          variables_collections, 'gamma')
      gamma_initializer = param_initializers.get(
          'gamma', init_ops.ones_initializer())
      gamma = variables.model_variable(
          'gamma',
          shape=params_shape,
          dtype=dtype,
          initializer=gamma_initializer,
          collections=gamma_collections,
          trainable=trainable)

    # Create moving_mean and moving_variance variables and add them to the
    # appropriate collections. We disable variable partitioning while creating
    # them, because assign_moving_average is not yet supported for partitioned
    # variables.
    partitioner = variable_scope.get_variable_scope().partitioner
    try:
      variable_scope.get_variable_scope().set_partitioner(None)
      moving_mean_collections = utils.get_variable_collections(
          variables_collections, 'moving_mean')
      moving_mean_initializer = param_initializers.get(
          'moving_mean', init_ops.zeros_initializer())
      moving_mean = variables.model_variable(
          'moving_mean',
          shape=params_shape,
          dtype=dtype,
          initializer=moving_mean_initializer,
          trainable=False,
          collections=moving_mean_collections)
      moving_variance_collections = utils.get_variable_collections(
          variables_collections, 'moving_variance')
      moving_variance_initializer = param_initializers.get(
          'moving_variance', init_ops.ones_initializer())
      moving_variance = variables.model_variable(
          'moving_variance',
          shape=params_shape,
          dtype=dtype,
          initializer=moving_variance_initializer,
          trainable=False,
          collections=moving_variance_collections)
    finally:
      variable_scope.get_variable_scope().set_partitioner(partitioner)

    # If `is_training` doesn't have a constant value, because it is a `Tensor`,
    # a `Variable` or `Placeholder` then is_training_value will be None and
    # `needs_moments` will be true.
    is_training_value = utils.constant_value(is_training)
    need_moments = is_training_value is None or is_training_value
    if need_moments:
      # Calculate the moments based on the individual batch.
      if batch_weights is None:
        if data_format == 'NCHW':
          mean, variance = nn.moments(inputs, moments_axes, keep_dims=True)
          mean = array_ops.reshape(mean, [-1])
          variance = array_ops.reshape(variance, [-1])
        else:
          mean, variance = nn.moments(inputs, moments_axes)
      else:
        if data_format == 'NCHW':
          mean, variance = nn.weighted_moments(
              inputs, moments_axes, batch_weights, keep_dims=True)
          mean = array_ops.reshape(mean, [-1])
          variance = array_ops.reshape(variance, [-1])
        else:
          mean, variance = nn.weighted_moments(
              inputs, moments_axes, batch_weights)

      moving_vars_fn = lambda: (moving_mean, moving_variance)
      if updates_collections is None:

        def _force_updates():
          """Internal function forces updates moving_vars if is_training."""
          update_moving_mean = moving_averages.assign_moving_average(
              moving_mean, mean, decay, zero_debias=zero_debias_moving_mean)
          update_moving_variance = moving_averages.assign_moving_average(
              moving_variance, variance, decay, zero_debias=False)
          with ops.control_dependencies(
              [update_moving_mean, update_moving_variance]):
            return array_ops.identity(mean), array_ops.identity(variance)

        mean, variance = utils.smart_cond(is_training, _force_updates,
                                          moving_vars_fn)
      else:

        def _delay_updates():
          """Internal function that delay updates moving_vars if is_training."""
          update_moving_mean = moving_averages.assign_moving_average(
              moving_mean, mean, decay, zero_debias=zero_debias_moving_mean)
          update_moving_variance = moving_averages.assign_moving_average(
              moving_variance, variance, decay, zero_debias=False)
          return update_moving_mean, update_moving_variance

        update_mean, update_variance = utils.smart_cond(
            is_training, _delay_updates, moving_vars_fn)
        ops.add_to_collections(updates_collections, update_mean)
        ops.add_to_collections(updates_collections, update_variance)
        # Use computed moments during training and moving_vars otherwise.
        vars_fn = lambda: (mean, variance)
        mean, variance = utils.smart_cond(is_training, vars_fn, moving_vars_fn)
    else:
      mean, variance = moving_mean, moving_variance
    if data_format == 'NCHW':
      mean = array_ops.reshape(mean, params_shape_broadcast)
      variance = array_ops.reshape(variance, params_shape_broadcast)
      beta = array_ops.reshape(beta, params_shape_broadcast)
      if gamma is not None:
        gamma = array_ops.reshape(gamma, params_shape_broadcast)

    # Compute batch_normalization.
    outputs = batch_normalization(
        inputs, mean, variance, beta, gamma, epsilon, noise_std=noise_std)
    outputs.set_shape(inputs_shape)
    if activation_fn is not None:
      outputs = activation_fn(outputs)
    return utils.collect_named_outputs(outputs_collections,
                                       sc.original_name_scope, outputs)

def residual(x, filters, kernel_size=3, stride=1, train=True, wd=0.0,
             bn_momentum=0.99, bn_epsilon=0.001, name='res'):
  """ Residual layer

  Uses the _residual_core function to create F(x), then adds x to it.

  Parameters
  ----------
  x : tf tensor
      Input to be modified
  filters : int
      Number of output filters (will be used for all convolutions in the
      resnet core).
  stride : int
      Conv stride
  train : bool or tf boolean tensor
      Whether we are in the train phase or not. Can set to a tensorflow tensor
      so that it can be modified on the fly.
  wd : float
      Weight decay term for the convolutional weights
  bn_momentum : float
      The momentum for the batch normalization layers in the resnet
  bn_epsilon : float
      The epsilon for the batch normalization layers in the resnet

  Notes
  -----
  When training, the moving_mean and moving_variance need to be updated. By
  default the update ops are placed in tf.GraphKeys.UPDATE_OPS, so they need
  to be added as a dependency to the train_op. For example::

      update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
      with tf.control_dependencies(update_ops):
          train_op = optimizer.minimize(loss)
  """
  bn_class = lambda name: normalization.BatchNormalization(
      name=name, momentum=bn_momentum, epsilon=bn_epsilon)

  orig_x = x
  with tf.variable_scope(name):
    bn = bn_class('init_bn')
    x = bn.apply(x, training=train)
    x = tf.nn.relu(x)

    # The projection shortcut should come after the first batch norm and
    # ReLU since it performs a 1x1 convolution.
    if stride > 1:
      orig_x = tf.layers.conv2d(
          orig_x, filters=filters, strides=stride, kernel_size=1,
          padding='VALID', use_bias=False,
          kernel_initializer=tf.variance_scaling_initializer(),
          data_format='channels_last')

    x = _residual_core(x, filters, kernel_size, stride, train, wd,
                       bn_momentum, bn_epsilon)

    y = tf.add(x, orig_x)

  return y

def _residual_core(x, filters, kernel_size=3, stride=1, train=True, wd=0.0,
                   bn_momentum=0.99, bn_epsilon=0.001):
  """ Core function of a residual unit.

  In -> conv -> bn -> relu -> conv

  Note that the normal residual layer has a batch norm and relu before the
  first conv. This is in the residual function which calls this.

  Parameters
  ----------
  x : tf tensor
      Input to be modified
  filters : int
      Number of output filters (will be used for all convolutions in the
      resnet core).
  kernel_size : int
      Size of the filter kernels
  stride : int
      Conv stride
  train : bool or tf boolean tensor
      Whether we are in the train phase or not. Can set to a tensorflow tensor
      so that it can be modified on the fly.
  wd : float
      Weight decay term for the convolutional weights
  bn_momentum : float
      The momentum for the batch normalization layers in the resnet
  bn_epsilon : float
      The epsilon for the batch normalization layers in the resnet
  """
  init = init_ops.VarianceScaling(scale=1.0, mode='fan_out')
  reg = lambda w: real_reg(w, wd, norm=2)
  bn_class = lambda name: normalization.BatchNormalization(
      name=name, momentum=bn_momentum, epsilon=bn_epsilon)
  conv_class = lambda name, stride: convolutional.Conv2D(
      filters, 3, (stride, stride), use_bias=False,
      padding=('SAME' if stride == 1 else 'VALID'),
      kernel_initializer=init, kernel_regularizer=reg, name=name)

  with tf.variable_scope('sub1'):
    # As we will do downsampling with strides, need to make sure the output
    # size is the correct format.
    if stride > 1:
      x = fixed_padding(x, kernel_size, 'channels_last')

    conv = conv_class('conv1', stride)
    x = conv.apply(x)

  with tf.variable_scope('sub2'):
    bn = bn_class('between_bn')
    x = bn.apply(x, training=train)
    x = tf.nn.relu(x)
    conv = conv_class('conv2', 1)
    x = conv.apply(x)

  return x

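# A minimal sketch (hypothetical, not from the source module) of stacking the
# two helpers above and honoring the UPDATE_OPS note from the `residual`
# docstring when building the train op. Assumes TF 1.x graph-mode APIs;
# layer sizes and the learning rate are arbitrary.
def build_small_resnet(images, labels, train_flag):
  net = tf.layers.conv2d(images, filters=16, kernel_size=3, padding='SAME')
  net = residual(net, filters=16, train=train_flag, name='res1')
  net = residual(net, filters=32, stride=2, train=train_flag, name='res2')
  net = tf.reduce_mean(net, axis=[1, 2])  # global average pooling
  logits = tf.layers.dense(net, units=10)
  loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
  # Batch-norm moving statistics are updated via UPDATE_OPS, as noted above.
  update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
  with tf.control_dependencies(update_ops):
    train_op = tf.train.GradientDescentOptimizer(0.1).minimize(loss)
  return train_op
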
def __init__(self,
             units,
             hidden_units,
             feature_columns,
             activation_fn,
             dropout,
             batch_norm,
             name=None,
             **kwargs):
  super(_DNNModelV2, self).__init__(name=name, **kwargs)
  # Add this name_scope for backward compatibility, as previously it's used
  # in variable_scope
  with ops.name_scope(
      'input_from_feature_columns') as input_feature_column_scope:
    layer_name = input_feature_column_scope + 'input_layer'
    if feature_column_lib.is_feature_column_v2(feature_columns):
      self._input_layer = feature_column_lib.DenseFeatures(
          feature_columns=feature_columns, name=layer_name)
    else:
      self._input_layer = feature_column.InputLayer(
          feature_columns=feature_columns,
          name=layer_name,
          create_scope_now=False)

  self._add_layer(self._input_layer, self._input_layer.name)

  self._dropout = dropout
  self._batch_norm = batch_norm

  self._hidden_layers = []
  self._dropout_layers = []
  self._batch_norm_layers = []
  self._hidden_layer_scope_names = []
  for layer_id, num_hidden_units in enumerate(hidden_units):
    with ops.name_scope('hiddenlayer_%d' % layer_id) as hidden_layer_scope:
      # Get scope name without the trailing slash.
      hidden_shared_name = _name_from_scope_name(hidden_layer_scope)
      hidden_layer = core_layers.Dense(
          units=num_hidden_units,
          activation=activation_fn,
          kernel_initializer=init_ops.glorot_uniform_initializer(),
          name=hidden_shared_name)
      self._add_layer(hidden_layer, hidden_shared_name)
      self._hidden_layer_scope_names.append(hidden_shared_name)
      self._hidden_layers.append(hidden_layer)
      if self._dropout is not None:
        dropout_layer = core_layers.Dropout(rate=self._dropout)
        self._add_layer(dropout_layer, dropout_layer.name)
        self._dropout_layers.append(dropout_layer)
      if self._batch_norm:
        batch_norm_name = hidden_shared_name + '/batchnorm_%d' % layer_id
        batch_norm_layer = normalization.BatchNormalization(
            # The default momentum 0.99 actually crashes on certain
            # problem, so here we use 0.999, which is the default of
            # tf.contrib.layers.batch_norm.
            momentum=0.999,
            trainable=True,
            name=batch_norm_name)
        self._add_layer(batch_norm_layer, batch_norm_name)
        self._batch_norm_layers.append(batch_norm_layer)

  with ops.name_scope('logits') as logits_scope:
    logits_shared_name = _name_from_scope_name(logits_scope)
    self._logits_layer = core_layers.Dense(
        units=units,
        activation=None,
        kernel_initializer=init_ops.glorot_uniform_initializer(),
        name=logits_shared_name)
    self._add_layer(self._logits_layer, logits_shared_name)
    self._logits_scope_name = logits_shared_name

def testForwardInverse(self):
  """Tests forward and backward passes with different event shapes.

  input_shape: Tuple of shapes for input tensor.
  event_dims: Tuple of dimension indices that will be normalized.
  training: Boolean of whether bijector runs in training or inference mode.
  """
  params = [
      ((5 * 2, 4), [-1], False),
      ((5, 2, 4), [-1], False),
      ((5, 2, 4), [1, 2], False),
      ((5, 2, 4), [0, 1], False),
      ((5 * 2, 4), [-1], True),
      ((5, 2, 4), [-1], True),
      ((5, 2, 4), [1, 2], True),
      ((5, 2, 4), [0, 1], True),
  ]
  for input_shape, event_dims, training in params:
    x_ = np.arange(5 * 4 * 2).astype(np.float32).reshape(input_shape)
    with self.cached_session() as sess:
      x = constant_op.constant(x_)
      # When training, memorize the exact mean of the last
      # minibatch that it normalized (instead of moving average assignment).
      layer = normalization.BatchNormalization(
          axis=event_dims, momentum=0., epsilon=0.)
      batch_norm = BatchNormalization(
          batchnorm_layer=layer, training=training)
      # Minibatch statistics are saved only after norm_x has been computed.
      norm_x = batch_norm.inverse(x)
      with ops.control_dependencies(batch_norm.batchnorm.updates):
        moving_mean = array_ops.identity(batch_norm.batchnorm.moving_mean)
        moving_var = array_ops.identity(batch_norm.batchnorm.moving_variance)
        denorm_x = batch_norm.forward(array_ops.identity(norm_x))
        fldj = batch_norm.forward_log_det_jacobian(
            x, event_ndims=len(event_dims))
        # Use identity to invalidate cache.
        ildj = batch_norm.inverse_log_det_jacobian(
            array_ops.identity(denorm_x), event_ndims=len(event_dims))
      variables.global_variables_initializer().run()
      # Update variables.
      norm_x_ = sess.run(norm_x)
      [
          norm_x_,
          moving_mean_,
          moving_var_,
          denorm_x_,
          ildj_,
          fldj_,
      ] = sess.run([
          norm_x,
          moving_mean,
          moving_var,
          denorm_x,
          ildj,
          fldj,
      ])
      self.assertEqual("batch_normalization", batch_norm.name)

      reduction_axes = self._reduction_axes(input_shape, event_dims)
      keepdims = len(event_dims) > 1

      expected_batch_mean = np.mean(
          x_, axis=reduction_axes, keepdims=keepdims)
      expected_batch_var = np.var(x_, axis=reduction_axes, keepdims=keepdims)

      if training:
        # When training=True, values become normalized across batch dim and
        # original values are recovered after de-normalizing.
        zeros = np.zeros_like(norm_x_)
        self.assertAllClose(np.mean(zeros, axis=reduction_axes),
                            np.mean(norm_x_, axis=reduction_axes))
        self.assertAllClose(expected_batch_mean, moving_mean_)
        self.assertAllClose(expected_batch_var, moving_var_)
        self.assertAllClose(x_, denorm_x_, atol=1e-5)

        # Since moving statistics are set to batch statistics after
        # normalization, ildj and -fldj should match.
        self.assertAllClose(ildj_, -fldj_)
        # ildj is computed with minibatch statistics.
        expected_ildj = np.sum(
            np.log(1.) - .5 * np.log(
                expected_batch_var + batch_norm.batchnorm.epsilon))
        self.assertAllClose(expected_ildj, ildj_)
      else:
        # When training=False, moving_mean, moving_var remain at their
        # initialized values (0., 1.), resulting in no scale/shift (a small
        # shift occurs if epsilon > 0.)
        self.assertAllClose(x_, norm_x_)
        self.assertAllClose(x_, denorm_x_, atol=1e-5)
        # ildj is computed with saved statistics.
        expected_ildj = np.sum(
            np.log(1.) - .5 * np.log(1. + batch_norm.batchnorm.epsilon))
        self.assertAllClose(expected_ildj, ildj_)

def __init__(self,
             units,
             hidden_units,
             feature_columns,
             activation_fn,
             dropout,
             input_layer_partitioner,
             batch_norm,
             name=None,
             **kwargs):
  super(_DNNModel, self).__init__(name=name, **kwargs)
  if feature_column_lib.is_feature_column_v2(feature_columns):
    self._input_layer = feature_column_lib.DenseFeatures(
        feature_columns=feature_columns, name='input_layer')
  else:
    self._input_layer = feature_column.InputLayer(
        feature_columns=feature_columns,
        name='input_layer',
        create_scope_now=False)

  self._add_layer(self._input_layer, 'input_layer')

  self._dropout = dropout
  self._batch_norm = batch_norm

  self._hidden_layers = []
  self._dropout_layers = []
  self._batch_norm_layers = []
  self._hidden_layer_scope_names = []
  for layer_id, num_hidden_units in enumerate(hidden_units):
    with variable_scope.variable_scope(
        'hiddenlayer_%d' % layer_id) as hidden_layer_scope:
      hidden_layer = core_layers.Dense(
          units=num_hidden_units,
          activation=activation_fn,
          kernel_initializer=init_ops.glorot_uniform_initializer(),
          name=hidden_layer_scope,
          _scope=hidden_layer_scope)
      self._add_layer(hidden_layer, hidden_layer_scope.name)
      self._hidden_layer_scope_names.append(hidden_layer_scope.name)
      self._hidden_layers.append(hidden_layer)
      if self._dropout is not None:
        dropout_layer = core_layers.Dropout(rate=self._dropout)
        self._add_layer(dropout_layer, dropout_layer.name)
        self._dropout_layers.append(dropout_layer)
      if self._batch_norm:
        batch_norm_layer = normalization.BatchNormalization(
            # The default momentum 0.99 actually crashes on certain
            # problem, so here we use 0.999, which is the default of
            # tf.contrib.layers.batch_norm.
            momentum=0.999,
            trainable=True,
            name='batchnorm_%d' % layer_id,
            _scope='batchnorm_%d' % layer_id)
        self._add_layer(batch_norm_layer, batch_norm_layer.name)
        self._batch_norm_layers.append(batch_norm_layer)

  with variable_scope.variable_scope('logits') as logits_scope:
    self._logits_layer = core_layers.Dense(
        units=units,
        activation=None,
        kernel_initializer=init_ops.glorot_uniform_initializer(),
        name=logits_scope,
        _scope=logits_scope)
    self._add_layer(self._logits_layer, logits_scope.name)
    self._logits_scope_name = logits_scope.name
  self._input_layer_partitioner = input_layer_partitioner