def layer_norm(x,
               begin_norm_axis=1,
               epsilon=1e-12,
               param_attr=None,
               bias_attr=None):
    """
    Layer normalization composed from elementary ops, meant to replace the
    built-in layer_norm op.

    Args:
        x: Input variable to normalize.
        begin_norm_axis: Axis passed as ``dim`` to the mean/variance
            reductions. The sizes of all axes from this one onward are also
            multiplied together to get the length of the scale and bias
            parameters.
        epsilon: Constant added to the variance before taking the reciprocal
            square root.
        param_attr: Attribute forwarded to ``create_parameter`` for the
            scale parameter (default initializer: constant 1).
        bias_attr: Attribute forwarded to ``create_parameter`` for the bias
            parameter (default initializer: constant 0).

    Returns:
        The normalized input after the learned scale and bias are applied.
    """
    # Must remain the first statement: locals() is forwarded as keyword
    # arguments, so it may only contain the call arguments at this point.
    helper = LayerHelper('layer_norm', **locals())

    # `reduce` is not a builtin on Python 3. Imported here (after the helper
    # is built) so the name never ends up in the locals() forwarded above.
    from functools import reduce
    from operator import mul

    # NOTE(review): the reductions below run over the single axis
    # `begin_norm_axis`, while the parameter length further down uses every
    # axis from `begin_norm_axis` onward. The two agree when
    # `begin_norm_axis` is the last axis of x -- confirm that callers only
    # use it that way.
    mean = layers.reduce_mean(x, dim=begin_norm_axis, keep_dim=True)
    shift_x = layers.elementwise_sub(x=x, y=mean, axis=0)
    variance = layers.reduce_mean(
        layers.square(shift_x), dim=begin_norm_axis, keep_dim=True)
    r_stdev = layers.rsqrt(variance + epsilon)
    norm_x = layers.elementwise_mul(x=shift_x, y=r_stdev, axis=0)

    # One flat parameter vector whose length is the product of the
    # normalized dimensions.
    param_shape = [reduce(mul, norm_x.shape[begin_norm_axis:])]
    param_dtype = norm_x.dtype
    scale = helper.create_parameter(
        attr=param_attr,
        shape=param_shape,
        dtype=param_dtype,
        default_initializer=fluid.initializer.Constant(1.))
    bias = helper.create_parameter(
        attr=bias_attr,
        shape=param_shape,
        dtype=param_dtype,
        is_bias=True,
        default_initializer=fluid.initializer.Constant(0.))

    out = layers.elementwise_mul(x=norm_x, y=scale, axis=-1)
    out = layers.elementwise_add(x=out, y=bias, axis=-1)

    return out
Example #2
0
 def forward(self, x):
     """
     Normalize ``x`` over the axes from ``self._begin_norm_axis`` up to its
     last axis, then apply ``self._scale_w`` and ``self._bias_w``.
     """

     def _mean_over_norm_axes(tensor):
         # Builds a fresh axis list on every call, exactly as two separate
         # inline reduce_mean calls would.
         return layers.reduce_mean(
             tensor,
             dim=list(range(self._begin_norm_axis, len(x.shape))),
             keep_dim=True)

     # Center the input around its mean.
     centered = layers.elementwise_sub(
         x=x, y=_mean_over_norm_axes(x), axis=0)

     # Mean of the squared deviations, then its reciprocal square root
     # (with epsilon added first).
     var = _mean_over_norm_axes(layers.square(centered))
     inv_std = layers.rsqrt(var + self._epsilon)
     normalized = layers.elementwise_mul(x=centered, y=inv_std, axis=0)

     # Scale and shift.
     scaled = layers.elementwise_mul(
         x=normalized, y=self._scale_w, axis=-1)
     return layers.elementwise_add(x=scaled, y=self._bias_w, axis=-1)
Example #3
0
    def func(self, place):
        """
        Build a small graph that applies ``layers.rsqrt`` to a data variable
        and run both double-gradient checkers on it for ``place``.
        """
        dims = [2, 3, 7, 9]
        step = 0.0001
        float_type = np.float64

        inp = layers.data('x', dims, False, float_type)
        inp.persistable = True

        out = layers.rsqrt(inp)
        # Samples are drawn from [0.1, 1), so every input value is positive.
        init_value = np.random.uniform(0.1, 1, dims).astype(float_type)

        gradient_checker.double_grad_check(
            [inp], out, x_init=init_value, place=place, eps=step)
        gradient_checker.double_grad_check_for_dygraph(
            self.rsqrt_wrapper, [inp], out, x_init=init_value, place=place)