    def benchmark_layers_normalization_layer_normalization_overhead(self):

        layer = normalization.LayerNormalization()
        x = array_ops.ones((1, 1))

        def fn():
            layer(x, training=True)

        self._run(fn, 10000)
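
The `_run` helper used above belongs to the Keras benchmark harness and is not shown in this snippet. A minimal, hypothetical stand-in that only times repeated eager calls could look like the following (the name `run_and_time` and its behavior are assumptions, not the real harness):

import time

def run_and_time(fn, num_iters):
    # Hypothetical stand-in for `_run`: warm up once so one-time setup cost is
    # not measured, then report the mean wall-clock time per call.
    fn()
    start = time.time()
    for _ in range(num_iters):
        fn()
    elapsed = time.time() - start
    print('mean time per call: %.6f s' % (elapsed / num_iters))

With such a helper, `run_and_time(fn, 10000)` would play the role of `self._run(fn, 10000)` above.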
Example #2
    def _test_forward_pass(self,
                           batch_input_shape,
                           axis,
                           fp64_tol=1e-14,
                           fp32_tol=1e-6,
                           fp16_tol=1e-2):
        """Tests the forward pass of layer normalization.

    Args:
      batch_input_shape: The input shape that will be used to test, including
        the batch dimension.
      axis: A list of axises to normalize. Will be passed to the `axis` argument
        of LayerNormalization.
      fp64_tol: The relative and absolute tolerance for float64.
      fp32_tol: The relative and absolute tolerance for float32.
      fp16_tol: The relative and absolute tolerance for float16.
    """
        param_shape = [batch_input_shape[i] for i in axis]
        param_elems = 1
        for dim in param_shape:
            param_elems *= dim
        beta = np.arange(param_elems, dtype='float64').reshape(param_shape)
        gamma = np.arange(1, param_elems + 1,
                          dtype='float64').reshape(param_shape)
        x = np.random.normal(size=batch_input_shape)

        for epsilon in 1e-12, 1e-3:
            expected = self._expected_layer_norm(x, beta, gamma,
                                                 batch_input_shape, axis,
                                                 epsilon)
            for dtype in 'float64', 'float32', 'float16':
                norm = normalization.LayerNormalization(
                    axis=axis,
                    dtype=dtype,
                    batch_input_shape=batch_input_shape,
                    epsilon=epsilon,
                    beta_initializer=keras.initializers.constant(beta),
                    gamma_initializer=keras.initializers.constant(gamma))
                y = norm(keras.backend.cast(x, dtype))
                actual = keras.backend.eval(y)

                if dtype == 'float64':
                    tol = fp64_tol
                elif dtype == 'float32':
                    tol = fp32_tol
                else:
                    assert dtype == 'float16'
                    tol = fp16_tol

                # We use absolute tolerances in addition to relative tolerances, because
                # some of the values are very close to zero.
                self.assertAllClose(expected, actual, rtol=tol, atol=tol)
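
The `_expected_layer_norm` helper referenced above is defined elsewhere in the test file. A plausible NumPy reference, assuming it follows the standard layer-normalization formula (normalize over `axis`, then scale by `gamma` and shift by `beta`), is sketched below:

import numpy as np

def expected_layer_norm(x, beta, gamma, batch_input_shape, axis, epsilon):
    # A sketch of a NumPy reference for layer normalization; this is an
    # assumption about what `_expected_layer_norm` computes, not the actual
    # helper from the test file.
    ndims = len(batch_input_shape)
    axis = [a % ndims for a in axis]  # Normalize negative axis indices.
    broadcast_shape = [batch_input_shape[i] if i in axis else 1
                       for i in range(ndims)]
    mean = np.mean(x, axis=tuple(axis), keepdims=True)
    var = np.var(x, axis=tuple(axis), keepdims=True)
    expected = (x - mean) / np.sqrt(var + epsilon)
    expected = expected * np.reshape(gamma, broadcast_shape)
    expected = expected + np.reshape(beta, broadcast_shape)
    return expected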
Example #3
class LayerCorrectnessTest(keras_parameterized.TestCase):

  def setUp(self):
    super(LayerCorrectnessTest, self).setUp()
    # Set two virtual CPUs to test MirroredStrategy with multiple devices
    cpus = config_module.list_physical_devices('CPU')
    config_module.set_logical_device_configuration(cpus[0], [
        context.LogicalDeviceConfiguration(),
        context.LogicalDeviceConfiguration(),
    ])

  def _create_model_from_layer(self, layer, input_shapes):
    inputs = [layers.Input(batch_input_shape=s) for s in input_shapes]
    if len(inputs) == 1:
      inputs = inputs[0]
    y = layer(inputs)
    model = models.Model(inputs, y)
    model.compile('sgd', 'mse')
    return model

  @parameterized.named_parameters(
      ('LeakyReLU', advanced_activations.LeakyReLU, (2, 2)),
      ('PReLU', advanced_activations.PReLU, (2, 2)),
      ('ELU', advanced_activations.ELU, (2, 2)),
      ('ThresholdedReLU', advanced_activations.ThresholdedReLU, (2, 2)),
      ('Softmax', advanced_activations.Softmax, (2, 2)),
      ('ReLU', advanced_activations.ReLU, (2, 2)),
      ('Conv1D', lambda: convolutional.Conv1D(2, 2), (2, 2, 1)),
      ('Conv2D', lambda: convolutional.Conv2D(2, 2), (2, 2, 2, 1)),
      ('Conv3D', lambda: convolutional.Conv3D(2, 2), (2, 2, 2, 2, 1)),
      ('Conv2DTranspose', lambda: convolutional.Conv2DTranspose(2, 2),
       (2, 2, 2, 2)),
      ('SeparableConv2D', lambda: convolutional.SeparableConv2D(2, 2),
       (2, 2, 2, 1)),
      ('DepthwiseConv2D', lambda: convolutional.DepthwiseConv2D(2, 2),
       (2, 2, 2, 1)),
      ('UpSampling2D', convolutional.UpSampling2D, (2, 2, 2, 1)),
      ('ZeroPadding2D', convolutional.ZeroPadding2D, (2, 2, 2, 1)),
      ('Cropping2D', convolutional.Cropping2D, (2, 3, 3, 1)),
      ('ConvLSTM2D',
       lambda: convolutional_recurrent.ConvLSTM2D(4, kernel_size=(2, 2)),
       (4, 4, 4, 4, 4)),
      ('Dense', lambda: core.Dense(2), (2, 2)),
      ('Dropout', lambda: core.Dropout(0.5), (2, 2)),
      ('SpatialDropout2D', lambda: core.SpatialDropout2D(0.5), (2, 2, 2, 2)),
      ('Activation', lambda: core.Activation('sigmoid'), (2, 2)),
      ('Reshape', lambda: core.Reshape((1, 4, 1)), (2, 2, 2)),
      ('Permute', lambda: core.Permute((2, 1)), (2, 2, 2)),
      ('Attention', dense_attention.Attention, [(2, 2, 3), (2, 3, 3),
                                                (2, 3, 3)]),
      ('AdditiveAttention', dense_attention.AdditiveAttention, [(2, 2, 3),
                                                                (2, 3, 3),
                                                                (2, 3, 3)]),
      ('Embedding', lambda: embeddings.Embedding(4, 4),
       (2, 4), 2e-3, 2e-3, np.random.randint(4, size=(2, 4))),
      ('LocallyConnected1D', lambda: local.LocallyConnected1D(2, 2), (2, 2, 1)),
      ('LocallyConnected2D', lambda: local.LocallyConnected2D(2, 2),
       (2, 2, 2, 1)),
      ('Add', merge.Add, [(2, 2), (2, 2)]),
      ('Subtract', merge.Subtract, [(2, 2), (2, 2)]),
      ('Multiply', merge.Multiply, [(2, 2), (2, 2)]),
      ('Average', merge.Average, [(2, 2), (2, 2)]),
      ('Maximum', merge.Maximum, [(2, 2), (2, 2)]),
      ('Minimum', merge.Minimum, [(2, 2), (2, 2)]),
      ('Concatenate', merge.Concatenate, [(2, 2), (2, 2)]),
      ('Dot', lambda: merge.Dot(1), [(2, 2), (2, 2)]),
      ('GaussianNoise', lambda: noise.GaussianNoise(0.5), (2, 2)),
      ('GaussianDropout', lambda: noise.GaussianDropout(0.5), (2, 2)),
      ('AlphaDropout', lambda: noise.AlphaDropout(0.5), (2, 2)),
      ('BatchNormalization', normalization_v2.BatchNormalization,
       (2, 2), 1e-2, 1e-2),
      ('LayerNormalization', normalization.LayerNormalization, (2, 2)),
      ('LayerNormalizationUnfused',
       lambda: normalization.LayerNormalization(axis=1), (2, 2, 2)),
      ('MaxPooling2D', pooling.MaxPooling2D, (2, 2, 2, 1)),
      ('AveragePooling2D', pooling.AveragePooling2D, (2, 2, 2, 1)),
      ('GlobalMaxPooling2D', pooling.GlobalMaxPooling2D, (2, 2, 2, 1)),
      ('GlobalAveragePooling2D', pooling.GlobalAveragePooling2D, (2, 2, 2, 1)),
      ('SimpleRNN', lambda: recurrent.SimpleRNN(units=4),
       (4, 4, 4), 1e-2, 1e-2),
      ('GRU', lambda: recurrent.GRU(units=4), (4, 4, 4)),
      ('LSTM', lambda: recurrent.LSTM(units=4), (4, 4, 4)),
      ('GRUV2', lambda: recurrent_v2.GRU(units=4), (4, 4, 4)),
      ('LSTMV2', lambda: recurrent_v2.LSTM(units=4), (4, 4, 4)),
      ('TimeDistributed', lambda: wrappers.TimeDistributed(core.Dense(2)),
       (2, 2, 2)),
      ('Bidirectional',
       lambda: wrappers.Bidirectional(recurrent.SimpleRNN(units=4)), (2, 2, 2)),
      ('AttentionLayerCausal', lambda: dense_attention.Attention(causal=True), [
          (2, 2, 3), (2, 3, 3), (2, 3, 3)
      ]),
      ('AdditiveAttentionLayerCausal',
       lambda: dense_attention.AdditiveAttention(causal=True), [(2, 3, 4),
                                                                (2, 3, 4),
                                                                (2, 3, 4)]),
  )
  def test_layer(self, f32_layer_fn, input_shape, rtol=2e-3, atol=2e-3,
                 input_data=None):
    """Tests a layer by comparing the float32 and mixed precision weights.

    A float32 layer, a mixed precision layer, and a distributed mixed precision
    layer are run. The three layers are identical other than their dtypes and
    distribution strategies. The outputs after predict() and weights after fit()
    are asserted to be close.

    Args:
      f32_layer_fn: A function returning a float32 layer. The other two layers
        will automatically be created from this.
      input_shape: The shape of the input to the layer, including the batch
        dimension. Or a list of shapes if the layer takes multiple inputs.
      rtol: The relative tolerance to be asserted.
      atol: The absolute tolerance to be asserted.
      input_data: A Numpy array with the data of the input. If None, input data
        will be randomly generated.
    """

    if f32_layer_fn == convolutional.ZeroPadding2D and \
       test.is_built_with_rocm():
      return
    if isinstance(input_shape[0], int):
      input_shapes = [input_shape]
    else:
      input_shapes = input_shape
    strategy = create_mirrored_strategy()
    f32_layer = f32_layer_fn()

    # Create the layers
    assert f32_layer.dtype == f32_layer._compute_dtype == 'float32'
    config = f32_layer.get_config()
    config['dtype'] = policy.Policy('mixed_float16')
    mp_layer = f32_layer.__class__.from_config(config)
    distributed_mp_layer = f32_layer.__class__.from_config(config)

    # Compute per_replica_input_shapes for the distributed model
    global_batch_size = input_shapes[0][0]
    assert global_batch_size % strategy.num_replicas_in_sync == 0, (
        'The number of replicas, %d, does not divide the global batch size of '
        '%d' % (strategy.num_replicas_in_sync, global_batch_size))
    per_replica_batch_size = (
        global_batch_size // strategy.num_replicas_in_sync)
    per_replica_input_shapes = [(per_replica_batch_size,) + s[1:]
                                for s in input_shapes]

    # Create the models
    f32_model = self._create_model_from_layer(f32_layer, input_shapes)
    mp_model = self._create_model_from_layer(mp_layer, input_shapes)
    with strategy.scope():
      distributed_mp_model = self._create_model_from_layer(
          distributed_mp_layer, per_replica_input_shapes)

    # Set all model weights to the same values
    f32_weights = f32_model.get_weights()
    mp_model.set_weights(f32_weights)
    distributed_mp_model.set_weights(f32_weights)

    # Generate input data
    if input_data is None:
      # Cast inputs to float16 up front, so we do not also measure the error
      # introduced when the float16 layers cast float32 inputs down to float16.
      input_data = [np.random.normal(size=s).astype('float16')
                    for s in input_shapes]
      if len(input_data) == 1:
        input_data = input_data[0]

    # Assert all models have close outputs.
    f32_output = f32_model.predict(input_data)
    mp_output = mp_model.predict(input_data)
    self.assertAllClose(
        mp_output, f32_output, rtol=rtol, atol=atol)
    self.assertAllClose(
        distributed_mp_model.predict(input_data), f32_output, rtol=rtol,
        atol=atol)

    # Run fit() on models
    output = np.random.normal(size=f32_model.outputs[0].shape).astype('float16')
    for model in f32_model, mp_model, distributed_mp_model:
      model.fit(input_data, output, batch_size=global_batch_size)

    # Assert all models have close weights
    f32_weights = f32_model.get_weights()
    self.assertAllClose(
        mp_model.get_weights(), f32_weights, rtol=rtol, atol=atol)
    self.assertAllClose(
        distributed_mp_model.get_weights(), f32_weights, rtol=rtol, atol=atol)
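
The `create_mirrored_strategy()` function called in `test_layer` is a module-level helper that is not part of this snippet. Given the two logical CPU devices configured in `setUp`, a reasonable sketch of such a helper (an assumption, not the exact implementation used by the test) is:

import tensorflow as tf

def create_mirrored_strategy():
    # Assumed helper: mirror across the two logical CPU devices configured in
    # setUp, so num_replicas_in_sync == 2 for the distributed model.
    return tf.distribute.MirroredStrategy(devices=['/cpu:0', '/cpu:1'])

Depending on the TensorFlow version, a CPU-friendly `cross_device_ops` (for example `tf.distribute.ReductionToOneDevice()`) may also need to be passed so the default all-reduce does not attempt to use NCCL on CPU-only replicas.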
Example #4
    def _test_backward_pass(self,
                            batch_input_shape,
                            axis,
                            fp64_tol=1e-5,
                            fp32_tol=1e-5,
                            fp16_tol=2e-2):
        """Tests the backwards pass of layer normalization.

    Args:
      batch_input_shape: The input shape that will be used to test, including
        the batch dimension.
      axis: A list of axises to normalize. Will be passed to the `axis` argument
        of LayerNormalization.
      fp64_tol: The relative and absolute tolerance for float64.
      fp32_tol: The relative and absolute tolerance for float32.
      fp16_tol: The relative and absolute tolerance for float16.
    """
        param_shape = [batch_input_shape[i] for i in axis]
        param_elems = 1
        for dim in param_shape:
            param_elems *= dim
        beta = np.arange(param_elems, dtype='float64').reshape(param_shape)
        gamma = np.arange(1, param_elems + 1,
                          dtype='float64').reshape(param_shape)
        x = np.random.normal(size=batch_input_shape)

        for epsilon in 1e-12, 1e-3:
            # Float64 must come first in this list, as we use the float64 numerical
            # gradients to compare to the float32 and float16 symbolic gradients as
            # well. Computing float32/float16 numerical gradients is too numerically
            # unstable.
            for dtype in 'float64', 'float32', 'float16':
                norm = normalization.LayerNormalization(
                    axis=axis,
                    dtype=dtype,
                    batch_input_shape=batch_input_shape,
                    epsilon=epsilon,
                    beta_initializer=keras.initializers.constant(beta),
                    gamma_initializer=keras.initializers.constant(gamma))
                norm.build(x.shape)

                # pylint: disable=cell-var-from-loop
                def forward_fn(x, beta, gamma):
                    # We must monkey-patch the attributes of `norm` with the function
                    # arguments, so that the gradient checker will properly compute their
                    # gradients. The gradient checker computes gradients with respect to
                    # the input arguments of `forward_fn`.
                    with test.mock.patch.object(norm, 'beta', beta):
                        with test.mock.patch.object(norm, 'gamma', gamma):
                            return norm(x)

                # pylint: enable=cell-var-from-loop
                results = gradient_checker_v2.compute_gradient(
                    forward_fn,
                    [keras.backend.cast(x, dtype), norm.beta, norm.gamma])
                ([x_grad_t, beta_grad_t, gamma_grad_t],
                 [x_grad_n, beta_grad_n, gamma_grad_n]) = results

                if dtype == 'float64':
                    # We use the float64 numeric gradients as the reference, to compare
                    # against the symbolic gradients for all dtypes.
                    x_grad_ref = x_grad_n
                    beta_grad_ref = beta_grad_n
                    gamma_grad_ref = gamma_grad_n
                    tol = fp64_tol
                elif dtype == 'float32':
                    tol = fp32_tol
                else:
                    assert dtype == 'float16'
                    tol = fp16_tol

                # We use absolute tolerances in addition to relative tolerances, because
                # some of the values are very close to zero.
                self.assertAllClose(x_grad_t, x_grad_ref, rtol=tol, atol=tol)
                self.assertAllClose(beta_grad_t,
                                    beta_grad_ref,
                                    rtol=tol,
                                    atol=tol)
                self.assertAllClose(gamma_grad_t,
                                    gamma_grad_ref,
                                    rtol=tol,
                                    atol=tol)
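
For reference, `gradient_checker_v2.compute_gradient` (also exposed as `tf.test.compute_gradient`) returns a pair of Jacobian lists, theoretical and numeric, with one entry per input argument. A minimal standalone usage sketch, independent of the test above:

import numpy as np
import tensorflow as tf

def f(x):
    return tf.square(x)

x = tf.constant([1.0, 2.0, 3.0])
theoretical, numeric = tf.test.compute_gradient(f, [x])
# Each list holds one Jacobian per input argument; for a correct gradient the
# theoretical and numeric Jacobians should agree to within numerical error.
print(np.max(np.abs(theoretical[0] - numeric[0])))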
Example #5
    def testFusedAttr(self):
        layer_norm = normalization.LayerNormalization(axis=[-2, -1])
        layer_norm.build(input_shape=(2, 2, 2))
        self.assertEqual(layer_norm._fused, True)
Example #6
    def testDuplicateAxis(self):
        with self.assertRaisesRegex(ValueError, r'Duplicate axis:'):
            layer_norm = normalization.LayerNormalization(axis=[-1, -1])
            layer_norm.build(input_shape=(2, 2, 2))
Example #7
    def testInvalidAxis(self):
        with self.assertRaisesRegex(ValueError, r'Invalid axis: 3'):
            layer_norm = normalization.LayerNormalization(axis=3)
            layer_norm.build(input_shape=(2, 2, 2))
Example #8
    def testIncorrectAxisType(self):
        with self.assertRaisesRegex(
                TypeError, r'Expected an int or a list/tuple of ints'):
            _ = normalization.LayerNormalization(axis={'axis': -1})
Example #9
    def doOutputTest(self,
                     input_shape,
                     tol=1e-5,
                     norm_axis=None,
                     params_axis=-1,
                     dtype=None):
        ndim = len(input_shape)
        if norm_axis is None:
            moments_axis = range(1, ndim)
        elif isinstance(norm_axis, int):
            if norm_axis < 0:
                moments_axis = [norm_axis + ndim]
            else:
                moments_axis = [norm_axis]
        else:
            moments_axis = []
            for dim in norm_axis:
                if dim < 0:
                    dim = dim + ndim
                moments_axis.append(dim)

        moments_axis = tuple(moments_axis)
        expected_shape = []
        for i in range(ndim):
            if i not in moments_axis:
                expected_shape.append(input_shape[i])

        expected_mean = np.zeros(expected_shape)
        expected_var = np.ones(expected_shape)
        for mu in [0.0, 1e2]:
            for sigma in [1.0, 0.1]:
                inputs = np.random.randn(*input_shape) * sigma + mu
                inputs_t = constant_op.constant(inputs, shape=input_shape)
                layer = normalization.LayerNormalization(
                    norm_axis=norm_axis, params_axis=params_axis, dtype=dtype)
                outputs = layer(inputs_t)
                beta = layer.beta
                gamma = layer.gamma
                for weight in layer.weights:
                    self.evaluate(weight.initializer)
                outputs = self.evaluate(outputs)
                beta = self.evaluate(beta)
                gamma = self.evaluate(gamma)

                # The mean and variance of the output should be close to 0 and 1
                # respectively.

                # Make sure that there are no NaNs
                self.assertFalse(np.isnan(outputs).any())
                mean = np.mean(outputs, axis=moments_axis)
                var = np.var(outputs, axis=moments_axis)
                # Layer-norm implemented in numpy
                eps = 1e-12
                expected_out = (
                    (gamma *
                     (inputs -
                      np.mean(inputs, axis=moments_axis, keepdims=True)) /
                     np.sqrt(eps +
                             np.var(inputs, axis=moments_axis, keepdims=True)))
                    + beta)
                self.assertAllClose(expected_mean, mean, atol=tol, rtol=tol)
                self.assertAllClose(expected_var, var, atol=tol)
                # The full computation gets a bigger tolerance
                self.assertAllClose(expected_out, outputs, atol=5 * tol)
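
A typical caller of this helper would be a test method that picks an input shape and normalization axes, for example (hypothetical shapes, axes, and test name, shown only to illustrate how `doOutputTest` is parameterized):

    def testOutput4DInputNormOnInnerAxes(self):
        # Hypothetical invocation: normalize a 4-D input over its last two axes,
        # with a slightly looser tolerance for the larger reduction.
        self.doOutputTest((10, 10, 10, 3), norm_axis=[-2, -1], tol=1e-4)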