Example #1
def learning_rate_factor(name, step_num, hparams):
    """Compute the designated learning rate factor from hparams."""
    if name == "constant":
        tf.logging.info("Base learning rate: %f",
                        hparams.learning_rate_constant)
        return hparams.learning_rate_constant
    elif name == "linear_warmup":
        return tf.minimum(1.0, step_num / hparams.learning_rate_warmup_steps)
    elif name == "linear_decay":
        ret = (hparams.train_steps -
               step_num) / hparams.learning_rate_decay_steps
        return tf.minimum(1.0, tf.maximum(0.0, ret))
    elif name == "cosdecay":  # openai gpt
        in_warmup = tf.cast(step_num <= hparams.learning_rate_warmup_steps,
                            dtype=tf.float32)
        ret = 0.5 * (
            1 + tf.cos(np.pi * step_num / hparams.learning_rate_decay_steps))
        # if in warmup stage return 1 else return the decayed value
        return in_warmup * 1 + (1 - in_warmup) * ret
    elif name == "single_cycle_cos_decay":
        # Cosine decay to zero with a single cycle. This is different from
        # "cosdecay" because it starts at 1 when the warmup steps end.
        x = tf.maximum(step_num, hparams.learning_rate_warmup_steps)
        step = x - hparams.learning_rate_warmup_steps
        if hparams.train_steps <= hparams.learning_rate_warmup_steps:
            raise ValueError("single_cycle_cos_decay cannot be used unless "
                             "hparams.train_steps > "
                             "hparams.learning_rate_warmup_steps")
        return tf.math.cos(step * np.pi /
                           (hparams.train_steps -
                            hparams.learning_rate_warmup_steps)) / 2.0 + 0.5
    elif name == "multi_cycle_cos_decay":
        # Cosine decay with a variable number of cycles. This is different from
        # "cosdecay" because it starts at 1 when the warmup steps end. Use
        # hparams.learning_rate_decay_steps to determine the number of cycles.
        x = tf.maximum(step_num, hparams.learning_rate_warmup_steps)
        step = x - hparams.learning_rate_warmup_steps
        return tf.math.cos(
            step * np.pi / hparams.learning_rate_decay_steps) / 2.0 + 0.5
    elif name == "rsqrt_decay":
        return tf.rsqrt(
            tf.maximum(step_num, hparams.learning_rate_warmup_steps))
    elif name == "rsqrt_normalized_decay":
        scale = tf.sqrt(tf.to_float(hparams.learning_rate_warmup_steps))
        return scale * tf.rsqrt(
            tf.maximum(step_num, hparams.learning_rate_warmup_steps))
    elif name == "exp_decay":
        decay_steps = hparams.learning_rate_decay_steps
        warmup_steps = hparams.learning_rate_warmup_steps
        p = (step_num - warmup_steps) / decay_steps
        p = tf.maximum(p, 0.)
        if hparams.learning_rate_decay_staircase:
            p = tf.floor(p)
        return tf.pow(hparams.learning_rate_decay_rate, p)
    elif name == "rsqrt_hidden_size":
        return hparams.hidden_size**-0.5
    elif name == "legacy":
        return legacy_learning_rate_schedule(hparams)
    else:
        raise ValueError("unknown learning rate factor %s" % name)
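A minimal usage sketch for the factor function above (not part of the original snippet): learning-rate schedules of this kind are commonly written as a "*"-separated product of factor names, so the overall rate is the product of the named factors at the current step. The wrapper name, the schedule string, and the assumption that step_num is a float are illustrative only.

import tensorflow as tf

def learning_rate_schedule_sketch(schedule_string, step_num, hparams):
    """Hypothetical wrapper: multiply named factors, e.g. "constant*linear_warmup*rsqrt_decay"."""
    rate = tf.constant(1.0)
    for factor_name in schedule_string.split("*"):
        rate *= learning_rate_factor(factor_name.strip(), step_num, hparams)
    return rate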
Example #2
    def multihead_attn(q, k, v):
        # q, k, v have shape [batch, heads, sequence, features]
        w = tf.matmul(q, k, transpose_b=True)
        if int(tf.__version__[0]) > 1:
            w = w * tf.rsqrt(tf.cast(v.shape[-1], w.dtype))
        else:
            w = w * tf.rsqrt(tf.cast(v.shape[-1].value, w.dtype))

        w = mask_attn_weights(w)
        w = softmax(w)
        a = tf.matmul(w, v)
        return a
Example #3
def ProcessGradients(grads_and_vars,
                     global_gradient_clip=0.0,
                     sanitize_gradients=False,
                     normalize_gradients=False):
    tf.logging.info("Prcessing gradients")
    grads, vars_ = list(zip(*grads_and_vars))
    if sanitize_gradients:
        new_grads = []
        for g in grads:
            if g is not None:
                g = tf.where(tf.is_finite(g), g, tf.zeros_like(g))
            new_grads.append(g)
        grads = new_grads
    if normalize_gradients:
        new_grads = []
        for g in grads:
            if g is not None:
                g *= tf.rsqrt(tf.maximum(1e-12, tf.reduce_sum(tf.square(g))))
            new_grads.append(g)
        grads = new_grads
    if global_gradient_clip > 0:
        grads, grad_norm = tf.clip_by_global_norm(grads, global_gradient_clip)
        grads_and_vars = list(zip(grads, vars_))
    else:
        grad_norm = tf.global_norm(grads)
    tf.summary.scalar("global_grad_norm", grad_norm)
    return grads_and_vars
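A hedged sketch of where ProcessGradients fits in a TF1-style training loop; the toy variable, loss, optimizer, and clip value below are assumptions, not part of the original code.

import tensorflow as tf

w = tf.get_variable("w", shape=[10], initializer=tf.zeros_initializer())
loss = tf.reduce_mean(tf.square(w - 1.0))          # toy objective
optimizer = tf.train.AdamOptimizer(learning_rate=1e-3)
grads_and_vars = optimizer.compute_gradients(loss)
grads_and_vars = ProcessGradients(grads_and_vars,
                                  global_gradient_clip=1.0,
                                  sanitize_gradients=True)
train_op = optimizer.apply_gradients(
    grads_and_vars, global_step=tf.train.get_or_create_global_step())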
Example #4
def layer_norm_all(h,
                   batch_size,
                   base,
                   num_units,
                   scope='layer_norm',
                   reuse=False,
                   gamma_start=1.0,
                   epsilon=1e-3,
                   use_bias=True):
    """Layer Norm (faster version, but not using defun)."""
    # Performs layer norm over multiple bases at once (i.e. the i, g, j, o gates of an LSTM).
    # Reshape h so that layer norm can be applied to each base in parallel.
    h_reshape = tf.reshape(h, [batch_size, base, num_units])
    mean = tf.reduce_mean(h_reshape, [2], keep_dims=True)
    var = tf.reduce_mean(tf.square(h_reshape - mean), [2], keep_dims=True)
    epsilon = tf.constant(epsilon)
    rstd = tf.rsqrt(var + epsilon)
    h_reshape = (h_reshape - mean) * rstd
    # reshape back to original
    h = tf.reshape(h_reshape, [batch_size, base * num_units])
    with tf.variable_scope(scope):
        if reuse:
            tf.get_variable_scope().reuse_variables()
        gamma = tf.get_variable(
            'ln_gamma', [4 * num_units],
            initializer=tf.constant_initializer(gamma_start))
        if use_bias:
            beta = tf.get_variable(
                'ln_beta', [4 * num_units], initializer=tf.constant_initializer(0.0))
    if use_bias:
        return gamma * h + beta
    return gamma * h
Example #5
def _norm(x, g=None, b=None, e=1e-5, axis=[1]):
    u = tf.reduce_mean(x, axis=axis, keep_dims=True)
    s = tf.reduce_mean(tf.square(x-u), axis=axis, keep_dims=True)
    x = (x - u) * tf.rsqrt(s + e)
    if g is not None and b is not None:
        x = x*g + b
    return x
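A short usage sketch for _norm (assumed TF1 graph mode; the shapes and variable names are illustrative): create a learned gain and bias and normalize each row over its last axis.

import tensorflow as tf

x = tf.random_normal([8, 512])
g = tf.get_variable("g", [512], initializer=tf.constant_initializer(1.0))
b = tf.get_variable("b", [512], initializer=tf.constant_initializer(0.0))
y = _norm(x, g=g, b=b, axis=[-1])   # zero mean, unit variance per row, then scale and shift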
Example #6
def apply_norm(x, epsilon=1e-6):
    """Applies layer normalization to x.

  Based on "Layer Normalization":
  https://arxiv.org/abs/1607.06450

  Args:
    x: <float>[..., input_size]
    epsilon: Used to avoid division by 0.

  Returns:
    <float>[..., input_size]
  """
    input_size = x.get_shape()[-1]
    with tf.variable_scope("layer_norm", values=[x]):
        scale = tf.get_variable("layer_norm_scale", [input_size],
                                initializer=tf.ones_initializer())
        bias = tf.get_variable("layer_norm_bias", [input_size],
                               initializer=tf.zeros_initializer())
        mean = tf.reduce_mean(x, axis=[-1], keepdims=True)
        variance = tf.reduce_mean(tf.square(x - mean),
                                  axis=[-1],
                                  keepdims=True)
        norm_x = (x - mean) * tf.rsqrt(variance + epsilon)
        result = norm_x * scale + bias
        return result
Example #7
            def do_update(g, flat_v, m, rms):
                """Do a single tensor's update."""
                flat_g = tf.reshape(g, [-1, 1])

                rsqrt = tf.rsqrt(rms + 1e-6)
                norm_g = m * rsqrt

                inp = tf.concat([flat_g, norm_g, flat_v, m, rms, rsqrt], 1)

                inp = normalizer(inp, is_training=True)

                step = utils.tanh_embedding(training_step)
                stack_step = tf.tile(tf.reshape(step, [1, -1]),
                                     tf.stack([tf.shape(flat_g)[0], 1]))

                inp = tf.concat([inp, stack_step], axis=1)

                output = mod(inp)

                direction = output[:, 0:1]
                magnitude = output[:, 1:2]

                step = direction * tf.exp(
                    magnitude * self.magnitude_rate) * self.step_multiplier

                new_flat_v = flat_v - step
                return new_flat_v,
Example #8
def signal_to_noise_ratio_gain_invariant(estimate, target, epsilon=1.0e-5):
    """Computes the signal to noise ratio in a gain invariant manner.

  This computes SNR in a scale-free manner by projecting the estimate onto the
  target for the signal, and the projection onto the orthogonal subspace for the
  noise.

  Args:
    estimate: An estimate of the target of size [..., samples].
    target: A ground truth tensor, matching estimate above.
    epsilon: An optional float introduced for numerical stability in the
      projections only.

  Returns:
    A tensor of size [...] with SNR computed between matching slices of the
    input signal and noise tensors.
  """
    scaling_factors = tf.rsqrt(
        tf.reduce_sum(
            tf.square(target), keep_dims=True, reduction_indices=[-1]) +
        epsilon**2.0)
    scaled_target = tf.multiply(target, scaling_factors)
    signal = tf.reduce_sum(tf.multiply(estimate, scaled_target),
                           keep_dims=True,
                           reduction_indices=[-1]) * scaled_target
    noise = estimate - signal

    return calculate_signal_to_noise_ratio(signal, noise)
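The helper calculate_signal_to_noise_ratio is not shown above. A plausible sketch of what such a helper computes (SNR in decibels as ten times the log10 power ratio over the last axis) follows; this is an assumption about the helper, not its actual implementation.

import tensorflow as tf

def calculate_signal_to_noise_ratio_sketch(signal, noise, epsilon=1e-8):
    """Hypothetical: SNR in dB, 10 * log10(||signal||^2 / ||noise||^2) over the last axis."""
    signal_power = tf.reduce_sum(tf.square(signal), axis=-1)
    noise_power = tf.reduce_sum(tf.square(noise), axis=-1)
    return 10.0 * (tf.log(signal_power + epsilon) -
                   tf.log(noise_power + epsilon)) / tf.log(10.0)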
Example #9
def layer_norm(x,
               num_units,
               scope='layer_norm',
               reuse=False,
               gamma_start=1.0,
               epsilon=1e-3,
               use_bias=True):
    """Calculate layer norm."""
    axes = [1]
    mean = tf.reduce_mean(x, axes, keep_dims=True)
    x_shifted = x - mean
    var = tf.reduce_mean(tf.square(x_shifted), axes, keep_dims=True)
    inv_std = tf.rsqrt(var + epsilon)
    with tf.variable_scope(scope):
        if reuse:
            tf.get_variable_scope().reuse_variables()
        gamma = tf.get_variable(
            'ln_gamma', [num_units],
            initializer=tf.constant_initializer(gamma_start))
        if use_bias:
            beta = tf.get_variable(
                'ln_beta', [num_units], initializer=tf.constant_initializer(0.0))
    output = gamma * (x_shifted) * inv_std
    if use_bias:
        output += beta
    return output
Example #10
def ae_latent_softmax(latents_pred, latents_discrete_hot, vocab_size, hparams):
    """Latent prediction and loss.

  Args:
    latents_pred: Tensor of shape [..., depth].
    latents_discrete_hot: Tensor of shape [..., vocab_size].
    vocab_size: an int representing the vocab size.
    hparams: HParams.

  Returns:
    sample: Tensor of shape [...], a sample from a multinomial distribution.
    loss: Tensor of shape [...], the softmax cross-entropy.
  """
    with tf.variable_scope("latent_logits"):
        latents_logits = tf.layers.dense(latents_pred,
                                         vocab_size,
                                         name="logits_dense")
        if hparams.logit_normalization:
            latents_logits *= tf.rsqrt(
                1e-8 + tf.reduce_mean(tf.square(latents_logits)))
        loss = tf.nn.softmax_cross_entropy_with_logits_v2(
            labels=latents_discrete_hot, logits=latents_logits)

        # TODO(trandustin): tease this out from ae_latent_softmax.
        # we use just the loss portion to anchor prior / encoder on text.
        sample = multinomial_sample(latents_logits, vocab_size,
                                    hparams.sampling_method,
                                    hparams.sampling_temp)
        return sample, loss
Example #11
def l2_normalize(incoming, dim, epsilon=1e-12, name="l2_normalize"):
    """ L2 Normalization.

    Normalizes along dimension `dim` using an L2 norm.

    For a 1-D tensor with `dim = 0`, computes
    ```
    output = x / sqrt(max(sum(x**2), epsilon))
    ```

    For `x` with more dimensions, independently normalizes each 1-D slice along
    dimension `dim`.

    Arguments:
        incoming: `Tensor`. Incoming Tensor.
        dim: `int`. Dimension along which to normalize.
        epsilon: `float`. A lower bound value for the norm. Will use
            `sqrt(epsilon)` as the divisor if `norm < sqrt(epsilon)`.
        name: `str`. A name for this layer (optional).

    Returns:
      A `Tensor` with the same shape as `x`.
    """
    with tf.name_scope(name) as name:
        x = tf.convert_to_tensor(incoming, name="x")
        square_sum = tf.reduce_sum(tf.square(x), [dim], keep_dims=True)
        x_inv_norm = tf.rsqrt(tf.maximum(square_sum, epsilon))

    return tf.multiply(x, x_inv_norm, name=name)
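Usage sketch with illustrative values: l2_normalize rescales each slice along dim to (near-)unit L2 norm, with epsilon acting as a lower bound on the squared norm.

import numpy as np
import tensorflow as tf

vectors = tf.constant(np.random.randn(4, 8).astype(np.float32))
unit_vectors = l2_normalize(vectors, dim=1)   # each row now has (near-)unit L2 norm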
Example #12
 def call(self, x, epsilon=1e-6):
     mean = tf.reduce_mean(x, axis=[-1], keepdims=True)
     variance = tf.reduce_mean(tf.square(x - mean),
                               axis=[-1],
                               keepdims=True)
     norm_x = (x - mean) * tf.rsqrt(variance + epsilon)
     return norm_x * self.scale + self.bias
Example #13
 def diet_expert_internal(x):
     dim = x.get_shape().as_list()[-1]
     h = tf.layers.dense(x,
                         hidden_size,
                         activation=tf.nn.relu,
                         use_bias=False)
     y = tf.layers.dense(h, dim, use_bias=False)
     y *= tf.rsqrt(tf.to_float(dim * hidden_size))
     return y
Example #14
def layer_norm_compute(x, epsilon, scale, bias):
    """Layer norm raw computation."""
    epsilon, scale, bias = [cast_like(t, x) for t in [epsilon, scale, bias]]
    counts, means_ss, variance_ss, _ = tf.nn.sufficient_statistics(
        x, axes=[-1], keep_dims=True)
    mean, variance = tf.nn.normalize_moments(counts, means_ss, variance_ss,
                                             None)
    norm_x = (x - mean) * tf.rsqrt(variance + epsilon)
    return norm_x * scale + bias
Example #15
    def multihead_attn(q, k, v):
        # q, k, v have shape [batch, heads, sequence, features]
        w = tf.matmul(q, k, transpose_b=True)
        w = w * tf.rsqrt(tf.cast(shape_list(v)[-1], w.dtype))

        w = mask_attn_weights(w)
        w = softmax(w)
        a = tf.matmul(w, v)
        return a
Example #16
def instance_norm(x, epsilon=1e-8):
    assert len(x.shape) == 4 # NCHW
    with tf.variable_scope('InstanceNorm'):
        orig_dtype = x.dtype
        x = tf.cast(x, tf.float32)
        x -= tf.reduce_mean(x, axis=[2,3], keepdims=True)
        epsilon = tf.constant(epsilon, dtype=x.dtype, name='epsilon')
        x *= tf.rsqrt(tf.reduce_mean(tf.square(x), axis=[2,3], keepdims=True) + epsilon)
        x = tf.cast(x, orig_dtype)
        return x
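Usage sketch (the input shape is illustrative): the assert requires NCHW data, and the result has zero mean and unit variance over each sample's spatial dimensions, per channel.

import tensorflow as tf

x_nchw = tf.random_normal([4, 3, 64, 64])   # NCHW, as the assert expects
y = instance_norm(x_nchw)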
Example #17
def instance_norm(input, name="instance_norm", ):
    with tf.variable_scope(name):
        depth = input.get_shape()[3]
        scale = tf.get_variable("scale", [depth], initializer=tf.random_normal_initializer(1.0, 0.02, dtype=tf.float32))
        offset = tf.get_variable("offset", [depth], initializer=tf.constant_initializer(0.0))
        mean, variance = tf.nn.moments(input, axes=[1,2], keep_dims=True)
        epsilon = 1e-5
        inv = tf.rsqrt(variance + epsilon)
        normalized = (input-mean)*inv
        return scale*normalized + offset
Example #18
def norm(x, scope, *, axis=-1, epsilon=1e-5):
    """Normalize to mean = 0, std = 1, then do a diagonal affine transform."""
    with tf.variable_scope(scope):
        n_state = x.shape[-1]
        g = tf.get_variable('g', [n_state], initializer=tf.constant_initializer(1))
        b = tf.get_variable('b', [n_state], initializer=tf.constant_initializer(0))
        u = tf.reduce_mean(x, axis=axis, keepdims=True)
        s = tf.reduce_mean(tf.square(x-u), axis=axis, keepdims=True)
        x = (x - u) * tf.rsqrt(s + epsilon)
        x = x*g + b
        return x
Example #19
def transformer_pointer_prediction_layer(feature_name,
                                         encoder_output,
                                         x,
                                         encoder_decoder_attention_bias,
                                         hparams,
                                         features,
                                         loss_mask,
                                         layer_collection=None):
  """Layer that predicts the start or end token position.

  Args:
    feature_name: 'targets_start_token' or 'targets_end_token'
    encoder_output: [batch_size, input_length, hidden_size] tensor with encoder
      outputs
    x: [batch_size, target_length, 1, hidden_size] tensor with decoder outputs
    encoder_decoder_attention_bias: [batch_size, input_length, target_length]
      attention mask
    hparams: Hyper parameters
    features: Feature dictionary
    loss_mask: [batch_size, target_length] mask for loss computation.
    layer_collection: Layer collection

  Returns:
    (x, logits, loss)
  """
  if isinstance(encoder_output, list):
    pointer_encoder_output = encoder_output[1]
    encoder_output = sum(encoder_output)
  else:
    pointer_encoder_output = encoder_output
  with tf.variable_scope("%s_prediction" % feature_name):
    x = maybe_flatten4d3d(x)
    encoder_decoder_attention_bias = common_layers.flatten4d3d(
        encoder_decoder_attention_bias)
    q = common_attention.compute_attention_component(x, hparams.hidden_size)
    k = common_attention.compute_attention_component(encoder_output,
                                                     hparams.hidden_size)
    # Scaled dot-product attention
    scalar = tf.rsqrt(tf.to_float(common_layers.shape_list(q)[2]))
    logits = tf.matmul(q * scalar, k, transpose_b=True)

    logits += encoder_decoder_attention_bias

    labels = features["%s_raw" % feature_name]
    xent = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=labels)
    loss = tf.reduce_sum(xent * loss_mask)

    pointer_out = gather_2d(pointer_encoder_output, labels)
    y = common_layers.layer_preprocess(
        pointer_out, hparams, layer_collection=layer_collection)
    x = common_layers.layer_postprocess(x, y, hparams)
    return x, logits, loss
Example #20
def _instance_norm(input):
    """ Instance Normalization
  """
    with tf.variable_scope("instance_norm"):
        depth = input.get_shape()[3]
        scale = _weights("scale", [depth], mean=1.0)
        offset = _biases("offset", [depth])
        mean, variance = tf.nn.moments(input, axes=[1, 2], keep_dims=True)
        epsilon = 1e-5
        inv = tf.rsqrt(variance + epsilon)
        normalized = (input - mean) * inv
        return scale * normalized + offset
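The helpers _weights and _biases are not shown in this snippet. A minimal sketch of what they might look like (assumptions, not the original implementations) so the example is self-contained:

import tensorflow as tf

def _weights(name, shape, mean=0.0, stddev=0.02):
    # Hypothetical helper: a normally initialized variable.
    return tf.get_variable(
        name, shape,
        initializer=tf.random_normal_initializer(mean=mean, stddev=stddev))

def _biases(name, shape, constant=0.0):
    # Hypothetical helper: a constant-initialized variable.
    return tf.get_variable(
        name, shape, initializer=tf.constant_initializer(constant))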
Example #21
def batch_norm(input_, name="batch_norm"):
    with tf.variable_scope(name):
        input_dim = input_.get_shape()[-1]
        scale = tf.get_variable("scale", [input_dim],
                                initializer=tf.random_normal_initializer(1.0, 0.02, dtype=tf.float32))
        offset = tf.get_variable("offset", [input_dim], initializer=tf.constant_initializer(0.0))
        mean, variance = tf.nn.moments(input_, axes=[1, 2], keep_dims=True)
        epsilon = 1e-5
        inv = tf.rsqrt(variance + epsilon)
        normalized = (input_ - mean) * inv
        output = scale * normalized + offset
        return output
Example #22
 def normalize(kernel, g, axis, epsilon):
     # This differs slightly from standard weight normalization in its choice of
     # normalization axis, so the axis is passed in explicitly; the epsilon term
     # keeps the rsqrt numerically stable.
     # kernel = tf.math.l2_normalize(kernel, axis=-1)
     kernel = kernel * tf.rsqrt(
         tf.reduce_sum(tf.square(kernel), axis=axis, keepdims=True) +
         epsilon)
     if g is not None:
         kernel = kernel * g
     return kernel
Example #23
def layer_norm(input_tensor, name=None, epsilon=1e-5):
    """Run layer normalization on the last dimension of the tensor."""
    name2use = f'LayerNorm_{name}' if name is not None else name
    with tf.variable_scope(name2use, default_name='LayerNorm'):
        dim = input_tensor.shape[-1].value
        gamma = tf.get_variable('gamma', [dim], initializer=tf.constant_initializer(1))
        beta = tf.get_variable('beta', [dim], initializer=tf.constant_initializer(0))
        mean = tf.reduce_mean(input_tensor, axis=-1, keepdims=True)
        variance = tf.reduce_mean(tf.square(input_tensor - mean), axis=-1, keepdims=True)
        input_tensor = (input_tensor - mean) * tf.rsqrt(variance + epsilon)
        input_tensor = input_tensor * gamma + beta
    return input_tensor
Example #24
def ae_latent_softmax(latents_pred, latents_discrete, hparams):
    """Latent prediction and loss."""
    vocab_size = 2**hparams.z_size
    if hparams.num_decode_blocks < 2:
        latents_logits = tf.layers.dense(latents_pred,
                                         vocab_size,
                                         name="extra_logits")
        if hparams.logit_normalization:
            latents_logits *= tf.rsqrt(
                1e-8 + tf.reduce_mean(tf.square(latents_logits)))

        loss = None
        if latents_discrete is not None:
            if hparams.soft_em:
                # latents_discrete is actually one-hot of multinomial samples
                assert hparams.num_decode_blocks == 1
                loss = tf.nn.softmax_cross_entropy_with_logits_v2(
                    labels=latents_discrete, logits=latents_logits)
            else:
                loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=latents_discrete, logits=latents_logits)
        sample = multinomial_sample(latents_logits, vocab_size,
                                    hparams.sampling_temp)
        return sample, loss

    # Multi-block case.
    vocab_bits = int(math.log(vocab_size, 2))
    assert vocab_size == 2**vocab_bits
    assert vocab_bits % hparams.num_decode_blocks == 0
    block_vocab_size = 2**(vocab_bits // hparams.num_decode_blocks)
    latents_logits = [
        tf.layers.dense(latents_pred,
                        block_vocab_size,
                        name="extra_logits_%d" % i)
        for i in range(hparams.num_decode_blocks)
    ]
    loss = None
    if latents_discrete is not None:
        losses = []
        for i in range(hparams.num_decode_blocks):
            d = tf.floormod(tf.floordiv(latents_discrete, block_vocab_size**i),
                            block_vocab_size)
            losses.append(
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=d, logits=latents_logits[i]))
        loss = sum(losses)
    samples = [
        multinomial_sample(l, block_vocab_size, hparams.sampling_temp)
        for l in latents_logits
    ]
    sample = sum([s * block_vocab_size**i for i, s in enumerate(samples)])
    return sample, loss
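A worked example of the multi-block decomposition with concrete, purely illustrative numbers: each discrete latent index is split into base-block_vocab_size digits, one per decode block, and the per-block samples are recombined as a positional sum.

vocab_size = 2 ** 6                                  # e.g. hparams.z_size = 6
num_decode_blocks = 2
block_vocab_size = 2 ** (6 // num_decode_blocks)     # 8
index = 45
digits = [(index // block_vocab_size ** i) % block_vocab_size
          for i in range(num_decode_blocks)]          # [5, 5]
recombined = sum(d * block_vocab_size ** i for i, d in enumerate(digits))
assert recombined == index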
Example #25
def decode_batchnorm(batchnorm_module):
    """Calculates the neuron-wise multipliers and biases of the batch norm layer.

  Note that, in the case of a convolution, the returned bias will have
  spatial dimensions.

  Args:
    batchnorm_module: `snt.BatchNorm` module.

  Returns:
    w: 1D tensor of shape (output_size) or 3D tensor of shape
      (output_height, output_width, output_channels) containing
      neuron-wise multipliers for the batch norm layer.
    b: 1D tensor of shape (output_size) or 3D tensor of shape
      (output_height, output_width, output_channels) containing
      neuron-wise biases for the batch norm layer.
  """
    if isinstance(batchnorm_module, layers.BatchNorm):
        mean = batchnorm_module.mean
        variance = batchnorm_module.variance
        variance_epsilon = batchnorm_module.epsilon
        scale = batchnorm_module.scale
        offset = batchnorm_module.bias

    else:
        assert isinstance(batchnorm_module, snt.BatchNorm)
        mean = batchnorm_module.moving_mean
        variance = batchnorm_module.moving_variance
        variance_epsilon = batchnorm_module._eps  # pylint: disable=protected-access
        try:
            scale = batchnorm_module.gamma
        except snt.Error:
            scale = None
        try:
            offset = batchnorm_module.beta
        except snt.Error:
            offset = None

    w = tf.rsqrt(variance + variance_epsilon)
    if scale is not None:
        w *= scale

    b = -w * mean
    if offset is not None:
        b += offset

    # Batchnorm vars have a redundant leading dim.
    w = tf.squeeze(w, axis=0)
    b = tf.squeeze(b, axis=0)
    return w, b
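A quick numeric sanity check of the folding above (made-up values): batch norm's affine form gamma * (x - mean) / sqrt(var + eps) + beta collapses to w * x + b with w = gamma / sqrt(var + eps) and b = beta - w * mean, which is what decode_batchnorm returns.

import numpy as np

x = np.random.randn(5, 3).astype(np.float32)
mean, var = x.mean(axis=0), x.var(axis=0)
gamma, beta, eps = 1.5, 0.2, 1e-5
bn = gamma * (x - mean) / np.sqrt(var + eps) + beta
w = gamma / np.sqrt(var + eps)
b = beta - w * mean
assert np.allclose(bn, w * x + b, atol=1e-5)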
Example #26
def _attn(q, k, v, train=False, scale=False):
    w = tf.matmul(q, k)

    if scale:
        n_state = shape_list(v)[-1]
        w = w*tf.rsqrt(tf.cast(n_state, tf.float32))

    w = mask_attn_weights(w)
    w = tf.nn.softmax(w)

    w = dropout(w, attn_pdrop, train)

    a = tf.matmul(w, v)
    return a
Example #27
def instance_norm(inputs, is_training, name='', data_format='channels_first', epsilon=1e-5, beta_initializer=tf.constant_initializer(0.0), gamma_initializer=tf.constant_initializer(1.0)):
    with tf.variable_scope(name):
        channel_index = get_channel_index(inputs, data_format)
        image_axes = get_image_axes(inputs, data_format=data_format)
        depth = inputs.get_shape()[channel_index]
        mean, variance = tf.nn.moments(inputs, axes=image_axes, keep_dims=True)
        inv = tf.rsqrt(variance + epsilon)
        normalized = (inputs - mean) * inv
        offset = tf.get_variable('offset', [depth], trainable=is_training, initializer=beta_initializer)
        scale = tf.get_variable('scale', [depth], trainable=is_training, initializer=gamma_initializer)
        offset_scale_shape = [1] * inputs.shape.ndims
        offset_scale_shape[channel_index] = depth
        offset = tf.reshape(offset, offset_scale_shape)
        scale = tf.reshape(scale, offset_scale_shape)
        return tf.identity(scale * normalized + offset, name='output')
Example #28
def layer_norm(inp, scale, bias, eps=1e-6):
    """Applies layer normalization over the h, w, c axes, assuming NHWC format."""
    n, h, w, c = inp.shape

    mean, var = tf.nn.moments(inp, [1, 2, 3], keep_dims=True)
    gain = tf.rsqrt(var + eps)
    output = gain * (inp - mean)

    if scale is not None:
        output = output * scale

    if bias is not None:
        output = output + bias

    return output
Example #29
def l2_batch_normalize(x, epsilon=1e-12, scope=None):
  """
  Helper function to normalize a batch of vectors.
  :param x: the input placeholder
  :param epsilon: stabilizes division
  :return: the batch of l2 normalized vector
  """
  with tf.name_scope(scope, "l2_batch_normalize") as scope:
    x_shape = tf.shape(x)
    x = tf.contrib.layers.flatten(x)
    x /= (epsilon + tf.reduce_max(tf.abs(x), 1, keepdims=True))
    square_sum = tf.reduce_sum(tf.square(x), 1, keepdims=True)
    x_inv_norm = tf.rsqrt(np.sqrt(epsilon) + square_sum)
    x_norm = tf.multiply(x, x_inv_norm)
    return tf.reshape(x_norm, x_shape, scope)
Example #30
def pixel_norm(images, epsilon=1.0e-8):
    """Pixel normalization.

  For each pixel a[i,j,k] of image in HWC format, normalize its value to
  b[i,j,k] = a[i,j,k] / SQRT(SUM_k(a[i,j,k]^2) / C + eps).

  Args:
    images: A 4D `Tensor` of NHWC format.
    epsilon: A small positive number to avoid division by zero.

  Returns:
    A 4D `Tensor` with pixel-wise normalized channels.
  """
    return images * tf.rsqrt(
        tf.reduce_mean(tf.square(images), axis=3, keepdims=True) + epsilon)
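Usage sketch (assumed TF1 graph-mode execution): after pixel_norm, each pixel's channel vector has approximately unit root-mean-square value.

import tensorflow as tf

images = tf.random_normal([2, 32, 32, 3])
normalized = pixel_norm(images)
rms = tf.sqrt(tf.reduce_mean(tf.square(normalized), axis=3))   # ~1.0 everywhere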