def learning_rate_factor(name, step_num, hparams):
    """Return the multiplicative learning-rate factor selected by `name`.

    Args:
        name: Identifier of the factor. Recognised values are "constant",
            "linear_warmup", "linear_decay", "cosdecay",
            "single_cycle_cos_decay", "multi_cycle_cos_decay", "rsqrt_decay",
            "rsqrt_normalized_decay", "exp_decay", "rsqrt_hidden_size" and
            "legacy".
        step_num: Current training step; it is combined arithmetically with
            hparams values (presumably a float scalar tensor -- confirm
            against the caller).
        hparams: Hyperparameter object. Each branch reads only the
            attributes it needs (`learning_rate_*`, `train_steps`,
            `hidden_size`).

    Returns:
        The factor: a plain Python number for "constant" and
        "rsqrt_hidden_size", otherwise the result of the TensorFlow ops
        (or of `legacy_learning_rate_schedule`) for that branch.

    Raises:
        ValueError: If `name` is unknown, or if "single_cycle_cos_decay" is
            requested while
            `hparams.train_steps <= hparams.learning_rate_warmup_steps`.
    """
    if name == "constant":
        tf.logging.info("Base learning rate: %f",
                        hparams.learning_rate_constant)
        return hparams.learning_rate_constant

    if name == "linear_warmup":
        return tf.minimum(1.0, step_num / hparams.learning_rate_warmup_steps)

    if name == "linear_decay":
        remaining = ((hparams.train_steps - step_num) /
                     hparams.learning_rate_decay_steps)
        return tf.minimum(1.0, tf.maximum(0.0, remaining))

    if name == "cosdecay":  # openai gpt
        warming_up = tf.cast(
            step_num <= hparams.learning_rate_warmup_steps, dtype=tf.float32)
        decayed = 0.5 * (
            1 + tf.cos(np.pi * step_num / hparams.learning_rate_decay_steps))
        # 1 while still warming up, the cosine value afterwards.
        return warming_up * 1 + (1 - warming_up) * decayed

    if name == "single_cycle_cos_decay":
        # One cosine half-period from 1 (at the end of warmup) down to 0 (at
        # train_steps). Unlike "cosdecay", the curve starts at 1 exactly
        # when warmup ends.
        clamped = tf.maximum(step_num, hparams.learning_rate_warmup_steps)
        after_warmup = clamped - hparams.learning_rate_warmup_steps
        if hparams.train_steps <= hparams.learning_rate_warmup_steps:
            raise ValueError("single_cycle_cos_decay cannot be used unless "
                             "hparams.train_steps > "
                             "hparams.learning_rate_warmup_steps")
        decay_span = hparams.train_steps - hparams.learning_rate_warmup_steps
        return tf.math.cos(after_warmup * np.pi / decay_span) / 2.0 + 0.5

    if name == "multi_cycle_cos_decay":
        # Same curve as above, but the period is set by
        # hparams.learning_rate_decay_steps, so several cycles can fit into
        # one training run.
        clamped = tf.maximum(step_num, hparams.learning_rate_warmup_steps)
        after_warmup = clamped - hparams.learning_rate_warmup_steps
        return tf.math.cos(
            after_warmup * np.pi / hparams.learning_rate_decay_steps
        ) / 2.0 + 0.5

    if name == "rsqrt_decay":
        return tf.rsqrt(
            tf.maximum(step_num, hparams.learning_rate_warmup_steps))

    if name == "rsqrt_normalized_decay":
        # Scaling by sqrt(warmup_steps) makes the factor 1 while
        # step_num <= warmup_steps.
        normalizer = tf.sqrt(tf.to_float(hparams.learning_rate_warmup_steps))
        return normalizer * tf.rsqrt(
            tf.maximum(step_num, hparams.learning_rate_warmup_steps))

    if name == "exp_decay":
        period = hparams.learning_rate_decay_steps
        offset = hparams.learning_rate_warmup_steps
        exponent = (step_num - offset) / period
        exponent = tf.maximum(exponent, 0.)
        if hparams.learning_rate_decay_staircase:
            exponent = tf.floor(exponent)
        return tf.pow(hparams.learning_rate_decay_rate, exponent)

    if name == "rsqrt_hidden_size":
        return hparams.hidden_size**-0.5

    if name == "legacy":
        return legacy_learning_rate_schedule(hparams)

    raise ValueError("unknown learning rate factor %s" % name)
def multihead_attn(q, k, v):
    """Masked, scaled dot-product attention.

    q, k, v have shape [batch, heads, sequence, features].

    The raw scores `q @ k^T` are multiplied by `rsqrt` of the last static
    dimension of `v`, passed through `mask_attn_weights` and `softmax`, and
    finally used to mix `v`.
    """
    scores = tf.matmul(q, k, transpose_b=True)

    # The code reads `.value` from the static dimension when the first
    # character of tf.__version__ is <= 1, and uses the dimension directly
    # otherwise.
    feature_dim = v.shape[-1]
    if not int(tf.__version__[0]) > 1:
        feature_dim = feature_dim.value

    scores = scores * tf.rsqrt(tf.cast(feature_dim, scores.dtype))
    scores = mask_attn_weights(scores)
    weights = softmax(scores)
    return tf.matmul(weights, v)
def ProcessGradients(grads_and_vars,
                     global_gradient_clip=0.0,
                     sanitize_gradients=False,
                     normalize_gradients=False):
    """Optionally sanitize, normalize and clip a list of gradients.

    Fix over the previous version: the processed gradients are now always
    re-paired with their variables. Previously the re-zip only happened in
    the clipping branch, so with `global_gradient_clip <= 0` the results of
    `sanitize_gradients` / `normalize_gradients` were silently discarded and
    the untouched input was returned.

    Args:
        grads_and_vars: Iterable of `(gradient, variable)` pairs. A gradient
            may be `None`; such entries are passed through unchanged.
        global_gradient_clip: If > 0, gradients are clipped with
            `tf.clip_by_global_norm` to this norm.
        sanitize_gradients: If true, non-finite gradient entries are
            replaced by zeros.
        normalize_gradients: If true, each gradient tensor is individually
            scaled by `rsqrt(max(1e-12, sum(g**2)))`.

    Returns:
        A list of `(gradient, variable)` tuples in the input order.

    Side effects:
        Logs a message and adds a "global_grad_norm" scalar summary.
    """
    tf.logging.info("Prcessing gradients")
    grads, vars_ = list(zip(*grads_and_vars))

    if sanitize_gradients:
        # Keep None placeholders so gradients stay aligned with vars_.
        grads = [
            g if g is None
            else tf.where(tf.is_finite(g), g, tf.zeros_like(g))
            for g in grads
        ]

    if normalize_gradients:
        grads = [
            g if g is None
            else g * tf.rsqrt(tf.maximum(1e-12, tf.reduce_sum(tf.square(g))))
            for g in grads
        ]

    if global_gradient_clip > 0:
        grads, grad_norm = tf.clip_by_global_norm(grads, global_gradient_clip)
    else:
        grad_norm = tf.global_norm(grads)
    tf.summary.scalar("global_grad_norm", grad_norm)

    # Always rebuild the pairs so every processing step above takes effect.
    return list(zip(grads, vars_))
def layer_norm_all(h,
                   batch_size,
                   base,
                   num_units,
                   scope='layer_norm',
                   reuse=False,
                   gamma_start=1.0,
                   epsilon=1e-3,
                   use_bias=True):
    """Layer Norm (faster version, but not using defun).

    Performs layer norm on multiple bases at once (i.e. i, g, j, o for an
    LSTM) by reshaping `h` to `[batch_size, base, num_units]` and
    normalizing over the last axis in parallel.

    Fix over the previous version: `ln_gamma` / `ln_beta` were created with
    the hard-coded shape `[4 * num_units]`, which only matches the
    normalized tensor when `base == 4`. They are now sized
    `base * num_units`; for `base == 4` the variables are unchanged.

    Args:
        h: Tensor that can be reshaped to `[batch_size, base, num_units]`.
        batch_size: Leading dimension used for the reshapes.
        base: Number of groups normalized independently. Must be a Python
            int, because it sizes the variables.
        num_units: Size of each group.
        scope: Variable scope holding `ln_gamma` / `ln_beta`.
        reuse: If true, reuse existing variables in `scope`.
        gamma_start: Constant initial value of the gain.
        epsilon: Added to the variance before `rsqrt`.
        use_bias: If true, add the learned offset `ln_beta`.

    Returns:
        Tensor of shape `[batch_size, base * num_units]`.
    """
    # Reshape h so every base is normalized in parallel.
    h_reshape = tf.reshape(h, [batch_size, base, num_units])
    mean = tf.reduce_mean(h_reshape, [2], keep_dims=True)
    var = tf.reduce_mean(tf.square(h_reshape - mean), [2], keep_dims=True)
    epsilon = tf.constant(epsilon)
    rstd = tf.rsqrt(var + epsilon)
    h_reshape = (h_reshape - mean) * rstd
    # Reshape back to the original layout.
    h = tf.reshape(h_reshape, [batch_size, base * num_units])

    param_size = base * num_units
    with tf.variable_scope(scope):
        if reuse:
            tf.get_variable_scope().reuse_variables()
        gamma = tf.get_variable(
            'ln_gamma', [param_size],
            initializer=tf.constant_initializer(gamma_start))
        if use_bias:
            beta = tf.get_variable(
                'ln_beta', [param_size],
                initializer=tf.constant_initializer(0.0))
    if use_bias:
        return gamma * h + beta
    return gamma * h
def _norm(x, g=None, b=None, e=1e-5, axis=[1]):
    """Standardize `x` along `axis`, optionally applying gain and bias.

    Args:
        x: Input tensor.
        g: Optional gain. Only used when `b` is also given.
        b: Optional bias. Only used when `g` is also given.
        e: Added to the variance before `rsqrt`.
        axis: Axes to compute mean and variance over. The list default is
            never mutated here.

    Returns:
        `(x - mean) * rsqrt(var + e)`, followed by `* g + b` when both `g`
        and `b` are supplied.
    """
    mean = tf.reduce_mean(x, axis=axis, keep_dims=True)
    variance = tf.reduce_mean(tf.square(x - mean), axis=axis, keep_dims=True)
    normalized = (x - mean) * tf.rsqrt(variance + e)
    if g is None or b is None:
        return normalized
    return normalized * g + b
def apply_norm(x, epsilon=1e-6):
    """Layer-normalize `x` over its last dimension.

    Based on "Layer Normalization": https://arxiv.org/abs/1607.06450

    Creates (or reuses, depending on the surrounding scope) the variables
    "layer_norm_scale" (ones) and "layer_norm_bias" (zeros) inside the
    "layer_norm" variable scope.

    Args:
        x: <float>[..., input_size]
        epsilon: Added to the variance to avoid division by 0.

    Returns:
        <float>[..., input_size]
    """
    last_dim = x.get_shape()[-1]
    with tf.variable_scope("layer_norm", values=[x]):
        gain = tf.get_variable(
            "layer_norm_scale", [last_dim],
            initializer=tf.ones_initializer())
        offset = tf.get_variable(
            "layer_norm_bias", [last_dim],
            initializer=tf.zeros_initializer())
        mu = tf.reduce_mean(x, axis=[-1], keepdims=True)
        var = tf.reduce_mean(tf.square(x - mu), axis=[-1], keepdims=True)
        standardized = (x - mu) * tf.rsqrt(var + epsilon)
        return standardized * gain + offset
def do_update(g, flat_v, m, rms):
    """Do a single tensor's update.

    Builds a per-element feature matrix from the flattened gradient, the
    accumulators `m` / `rms` and the current values, appends an embedding of
    the enclosing scope's `training_step`, and feeds it to `mod`. Column 0
    of the output is the step direction and column 1 its log-magnitude.

    Relies on `normalizer`, `utils`, `training_step`, `mod` and `self` from
    the enclosing scope.

    Returns:
        A 1-tuple containing the updated flat values.
    """
    grad_column = tf.reshape(g, [-1, 1])
    inv_rms = tf.rsqrt(rms + 1e-6)
    scaled_m = m * inv_rms

    features = tf.concat(
        [grad_column, scaled_m, flat_v, m, rms, inv_rms], 1)
    features = normalizer(features, is_training=True)

    # Repeat the step embedding once per row of the feature matrix.
    step_embedding = utils.tanh_embedding(training_step)
    tiled_embedding = tf.tile(
        tf.reshape(step_embedding, [1, -1]),
        tf.stack([tf.shape(grad_column)[0], 1]))
    features = tf.concat([features, tiled_embedding], axis=1)

    prediction = mod(features)
    direction = prediction[:, 0:1]
    magnitude = prediction[:, 1:2]
    delta = direction * tf.exp(
        magnitude * self.magnitude_rate) * self.step_multiplier
    return (flat_v - delta,)
def signal_to_noise_ratio_gain_invariant(estimate, target, epsilon=1.0e-5):
    """Computes the signal to noise ratio in a gain invariant manner.

    The estimate is projected onto the (unit-scaled) target to obtain the
    signal; whatever is left of the estimate is treated as noise. This makes
    the measure independent of the overall gain of the estimate.

    Args:
        estimate: An estimate of the target of size [..., samples].
        target: A ground truth tensor, matching estimate above.
        epsilon: An optional float introduced for numerical stability in the
            projections only.

    Returns:
        The result of `calculate_signal_to_noise_ratio(signal, noise)` for
        the projected signal and the residual noise.
    """
    target_energy = tf.reduce_sum(
        tf.square(target), keep_dims=True, reduction_indices=[-1])
    inverse_norm = tf.rsqrt(target_energy + epsilon**2.0)
    unit_target = tf.multiply(target, inverse_norm)

    # Component of the estimate that lies along the target direction.
    projection = tf.reduce_sum(
        tf.multiply(estimate, unit_target),
        keep_dims=True,
        reduction_indices=[-1])
    signal = projection * unit_target
    noise = estimate - signal
    return calculate_signal_to_noise_ratio(signal, noise)
def layer_norm(x,
               num_units,
               scope='layer_norm',
               reuse=False,
               gamma_start=1.0,
               epsilon=1e-3,
               use_bias=True):
    """Calculate layer norm over axis 1 of `x`.

    Args:
        x: Input tensor; statistics are taken over axis 1.
        num_units: Size of the learned gain / offset vectors.
        scope: Variable scope holding `ln_gamma` / `ln_beta`.
        reuse: If true, reuse existing variables in `scope`.
        gamma_start: Constant initial value of the gain.
        epsilon: Added to the variance before `rsqrt`.
        use_bias: If true, add the learned offset `ln_beta`.

    Returns:
        The normalized tensor, scaled by gamma and optionally shifted by
        beta.
    """
    reduce_axes = [1]
    mu = tf.reduce_mean(x, reduce_axes, keep_dims=True)
    centered = x - mu
    variance = tf.reduce_mean(tf.square(centered), reduce_axes, keep_dims=True)
    inverse_std = tf.rsqrt(variance + epsilon)

    with tf.variable_scope(scope):
        if reuse:
            tf.get_variable_scope().reuse_variables()
        gamma = tf.get_variable(
            'ln_gamma', [num_units],
            initializer=tf.constant_initializer(gamma_start))
        beta = None
        if use_bias:
            beta = tf.get_variable(
                'ln_beta', [num_units],
                initializer=tf.constant_initializer(0.0))

    normalized = gamma * centered * inverse_std
    if not use_bias:
        return normalized
    normalized += beta
    return normalized
def ae_latent_softmax(latents_pred, latents_discrete_hot, vocab_size, hparams):
    """Latent prediction and loss.

    Projects `latents_pred` to `vocab_size` logits (dense layer
    "logits_dense" in scope "latent_logits"), optionally rescales the logits
    by the reciprocal of their global RMS, and returns both a sample drawn
    from them and the cross-entropy against the one-hot targets.

    Args:
        latents_pred: Tensor of shape [..., depth].
        latents_discrete_hot: Tensor of shape [..., vocab_size].
        vocab_size: an int representing the vocab size.
        hparams: HParams; reads `logit_normalization`, `sampling_method`
            and `sampling_temp`.

    Returns:
        sample: Tensor of shape [...], a sample from a multinomial
            distribution.
        loss: Tensor of shape [...], the softmax cross-entropy.
    """
    with tf.variable_scope("latent_logits"):
        logits = tf.layers.dense(
            latents_pred, vocab_size, name="logits_dense")
        if hparams.logit_normalization:
            mean_square = tf.reduce_mean(tf.square(logits))
            logits *= tf.rsqrt(1e-8 + mean_square)
        xent = tf.nn.softmax_cross_entropy_with_logits_v2(
            labels=latents_discrete_hot, logits=logits)

        # TODO(trandustin): tease this out from ae_latent_softmax.
        # we use just the loss portion to anchor prior / encoder on text.
        drawn = multinomial_sample(
            logits, vocab_size,
            hparams.sampling_method, hparams.sampling_temp)
        return drawn, xent
def l2_normalize(incoming, dim, epsilon=1e-12, name="l2_normalize"):
    """ L2 Normalization.

    Scales every 1-D slice along dimension `dim` by the reciprocal of its
    L2 norm. For a 1-D tensor with `dim = 0` this computes

    ```
    output = x / sqrt(max(sum(x**2), epsilon))
    ```

    Arguments:
        incoming: `Tensor`. Incoming Tensor.
        dim: `int`. Dimension along which to normalize.
        epsilon: `float`. Lower bound on the squared norm, so the divisor is
            never smaller than `sqrt(epsilon)`.
        name: `str`. A name for this layer (optional).

    Returns:
        A `Tensor` with the same shape as `incoming`.
    """
    with tf.name_scope(name) as name:
        tensor = tf.convert_to_tensor(incoming, name="x")
        sum_of_squares = tf.reduce_sum(
            tf.square(tensor), [dim], keep_dims=True)
        inverse_norm = tf.rsqrt(tf.maximum(sum_of_squares, epsilon))
        return tf.multiply(tensor, inverse_norm, name=name)
def call(self, x, epsilon=1e-6):
    """Layer-normalize `x` over its last axis using `self.scale` / `self.bias`.

    Args:
        x: Input tensor.
        epsilon: Added to the variance before `rsqrt`.

    Returns:
        `(x - mean) * rsqrt(variance + epsilon) * self.scale + self.bias`.
    """
    mu = tf.reduce_mean(x, axis=[-1], keepdims=True)
    var = tf.reduce_mean(tf.square(x - mu), axis=[-1], keepdims=True)
    standardized = (x - mu) * tf.rsqrt(var + epsilon)
    return standardized * self.scale + self.bias
def diet_expert_internal(x):
    """Two bias-free dense layers with a fixed output rescaling.

    `x` is projected to `hidden_size` units (taken from the enclosing scope)
    with a ReLU, projected back to its own last dimension, and multiplied by
    `rsqrt(dim * hidden_size)`.
    """
    out_dim = x.get_shape().as_list()[-1]
    hidden = tf.layers.dense(
        x, hidden_size, activation=tf.nn.relu, use_bias=False)
    projected = tf.layers.dense(hidden, out_dim, use_bias=False)
    return projected * tf.rsqrt(tf.to_float(out_dim * hidden_size))
def layer_norm_compute(x, epsilon, scale, bias):
    """Layer norm raw computation.

    `epsilon`, `scale` and `bias` are first passed through `cast_like`
    against `x`. Mean and variance over the last axis are obtained from
    `tf.nn.sufficient_statistics` / `tf.nn.normalize_moments`.

    Returns:
        `(x - mean) * rsqrt(variance + epsilon) * scale + bias`.
    """
    epsilon = cast_like(epsilon, x)
    scale = cast_like(scale, x)
    bias = cast_like(bias, x)

    counts, mean_ss, var_ss, _unused_shift = tf.nn.sufficient_statistics(
        x, axes=[-1], keep_dims=True)
    mu, sigma_sq = tf.nn.normalize_moments(counts, mean_ss, var_ss, None)

    standardized = (x - mu) * tf.rsqrt(sigma_sq + epsilon)
    return standardized * scale + bias
def multihead_attn(q, k, v):
    """Masked, scaled dot-product attention.

    q, k, v have shape [batch, heads, sequence, features].

    Scores `q @ k^T` are scaled by `rsqrt` of the last entry of
    `shape_list(v)`, masked with `mask_attn_weights`, normalized with
    `softmax`, and used to mix `v`.
    """
    scores = tf.matmul(q, k, transpose_b=True)
    feature_dim = shape_list(v)[-1]
    scores = scores * tf.rsqrt(tf.cast(feature_dim, scores.dtype))
    weights = softmax(mask_attn_weights(scores))
    return tf.matmul(weights, v)
def instance_norm(x, epsilon=1e-8):
    """Normalize a 4-D tensor over axes 2 and 3, computing in float32.

    The input is cast to float32, centered and divided by the RMS over axes
    [2, 3] (H and W under the NCHW layout noted on the assert), then cast
    back to its original dtype. No learned parameters are created.

    Args:
        x: Rank-4 tensor (asserted).
        epsilon: Added to the mean square before `rsqrt`.

    Returns:
        Tensor with the shape and dtype of `x`.
    """
    assert len(x.shape) == 4  # NCHW
    spatial_axes = [2, 3]
    with tf.variable_scope('InstanceNorm'):
        input_dtype = x.dtype
        x32 = tf.cast(x, tf.float32)
        centered = x32 - tf.reduce_mean(
            x32, axis=spatial_axes, keepdims=True)
        eps = tf.constant(epsilon, dtype=centered.dtype, name='epsilon')
        mean_square = tf.reduce_mean(
            tf.square(centered), axis=spatial_axes, keepdims=True)
        normalized = centered * tf.rsqrt(mean_square + eps)
        return tf.cast(normalized, input_dtype)
def instance_norm(input, name="instance_norm"):
    """Instance normalization with a learned per-channel scale and offset.

    Moments are taken over axes [1, 2]; the channel count is read from
    axis 3, so the input is presumably NHWC (confirm against callers).

    Args:
        input: Tensor whose axis 3 is the channel axis.
        name: Variable scope holding "scale" and "offset".

    Returns:
        `scale * (input - mean) * rsqrt(variance + 1e-5) + offset`.
    """
    eps = 1e-5
    with tf.variable_scope(name):
        channels = input.get_shape()[3]
        gain = tf.get_variable(
            "scale", [channels],
            initializer=tf.random_normal_initializer(
                1.0, 0.02, dtype=tf.float32))
        shift = tf.get_variable(
            "offset", [channels],
            initializer=tf.constant_initializer(0.0))
        mu, var = tf.nn.moments(input, axes=[1, 2], keep_dims=True)
        inverse_std = tf.rsqrt(var + eps)
        standardized = (input - mu) * inverse_std
        return gain * standardized + shift
def norm(x, scope, *, axis=-1, epsilon=1e-5):
    """Normalize to mean = 0, std = 1, then do a diagonal affine transform.

    Args:
        x: Input tensor.
        scope: Variable scope holding gain 'g' (init 1) and bias 'b'
            (init 0), both sized by the last dimension of `x`.
        axis: Axis over which mean and variance are computed.
        epsilon: Added to the variance before `rsqrt`.

    Returns:
        The normalized tensor times `g` plus `b`.
    """
    with tf.variable_scope(scope):
        width = x.shape[-1]
        gain = tf.get_variable(
            'g', [width], initializer=tf.constant_initializer(1))
        bias = tf.get_variable(
            'b', [width], initializer=tf.constant_initializer(0))
        mu = tf.reduce_mean(x, axis=axis, keepdims=True)
        var = tf.reduce_mean(tf.square(x - mu), axis=axis, keepdims=True)
        standardized = (x - mu) * tf.rsqrt(var + epsilon)
        return standardized * gain + bias
def transformer_pointer_prediction_layer(feature_name, encoder_output, x, encoder_decoder_attention_bias, hparams, features, loss_mask, layer_collection=None):
    """Layer that predicts the start or end token position.

    Scores every encoder position against every decoder position with a
    scaled dot product, computes the cross-entropy of those scores against
    the gold positions in `features`, and then feeds the encoder states at
    the gold positions back into the decoder stream.

    Args:
      feature_name: 'targets_start_token' or 'targets_end_token'
      encoder_output: [batch_size, input_length, hidden_size] tensor with
        encoder outputs. May also be a list of such tensors, in which case
        element 1 is used for the pointer lookup and the sum of all elements
        is used for the attention keys.
      x: [batch_size, target_length, 1, hidden_size] tensor with decoder outputs
      encoder_decoder_attention_bias: [batch_size, input_length, target_length]
        attention mask
      hparams: Hyper parameters
      features: Feature dictionary; the labels are read from
        "<feature_name>_raw".
      loss_mask: [batch_size, target_length] mask for loss computation.
      layer_collection: Layer collection

    Returns:
      (x, logits, loss)
    """
    if isinstance(encoder_output, list):
        pointer_encoder_output = encoder_output[1]
        encoder_output = sum(encoder_output)
    else:
        pointer_encoder_output = encoder_output
    # NOTE(review): the original indentation was lost; every statement below
    # is assumed to live inside this variable scope -- confirm, since it
    # determines the names of variables created by the helper calls.
    with tf.variable_scope("%s_prediction" % feature_name):
        x = maybe_flatten4d3d(x)
        encoder_decoder_attention_bias = common_layers.flatten4d3d(
            encoder_decoder_attention_bias)
        q = common_attention.compute_attention_component(x, hparams.hidden_size)
        k = common_attention.compute_attention_component(encoder_output, hparams.hidden_size)
        # Scaled dot-product attention
        scalar = tf.rsqrt(tf.to_float(common_layers.shape_list(q)[2]))
        logits = tf.matmul(q * scalar, k, transpose_b=True)
        logits += encoder_decoder_attention_bias
        labels = features["%s_raw" % feature_name]
        xent = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits, labels=labels)
        # Summed (not averaged) over the positions selected by loss_mask.
        loss = tf.reduce_sum(xent * loss_mask)
        # Look up encoder states at the gold positions and merge them into
        # the decoder stream via the standard pre-/post-process helpers.
        pointer_out = gather_2d(pointer_encoder_output, labels)
        y = common_layers.layer_preprocess(
            pointer_out, hparams, layer_collection=layer_collection)
        x = common_layers.layer_postprocess(x, y, hparams)
        return x, logits, loss
def _instance_norm(input):
    """Instance Normalization.

    Moments are taken over axes [1, 2]; the channel count is read from
    axis 3. The learned "scale" and "offset" are created through the
    `_weights` / `_biases` helpers inside the "instance_norm" scope.

    Returns:
        `scale * (input - mean) * rsqrt(variance + 1e-5) + offset`.
    """
    eps = 1e-5
    with tf.variable_scope("instance_norm"):
        channels = input.get_shape()[3]
        gain = _weights("scale", [channels], mean=1.0)
        shift = _biases("offset", [channels])
        mu, var = tf.nn.moments(input, axes=[1, 2], keep_dims=True)
        inverse_std = tf.rsqrt(var + eps)
        standardized = (input - mu) * inverse_std
        return gain * standardized + shift
def batch_norm(input_, name="batch_norm"):
    """Normalize `input_` over axes [1, 2] with a learned scale and offset.

    Despite the name, the statistics come from `tf.nn.moments` over axes
    [1, 2] of the input itself; no moving averages are kept.

    Args:
        input_: Tensor whose last axis sizes the learned parameters.
        name: Variable scope holding "scale" and "offset".

    Returns:
        `scale * (input_ - mean) * rsqrt(variance + 1e-5) + offset`.
    """
    eps = 1e-5
    with tf.variable_scope(name):
        width = input_.get_shape()[-1]
        gain = tf.get_variable(
            "scale", [width],
            initializer=tf.random_normal_initializer(
                1.0, 0.02, dtype=tf.float32))
        shift = tf.get_variable(
            "offset", [width],
            initializer=tf.constant_initializer(0.0))
        mu, var = tf.nn.moments(input_, axes=[1, 2], keep_dims=True)
        inverse_std = tf.rsqrt(var + eps)
        standardized = (input_ - mu) * inverse_std
        return gain * standardized + shift
def normalize(kernel, g, axis, epsilon):
    """Scale `kernel` to (approximately) unit L2 norm along `axis`.

    This resembles weight normalization, except that the normalization axis
    is chosen by the caller. `epsilon` is added to the squared norm before
    the reciprocal square root (the original author's note: 1e-3 works).

    Args:
        kernel: Tensor to normalize.
        g: Optional gain multiplied in after normalization; skipped if None.
        axis: Axis or axes over which the squared norm is summed.
        epsilon: Stabilizer added to the squared norm.

    Returns:
        The normalized (and optionally rescaled) kernel.
    """
    squared_norm = tf.reduce_sum(tf.square(kernel), axis=axis, keepdims=True)
    unit_kernel = kernel * tf.rsqrt(squared_norm + epsilon)
    if g is None:
        return unit_kernel
    return unit_kernel * g
def layer_norm(input_tensor, name=None, epsilon=1e-5):
    """Run layer normalization on the last dimension of the tensor.

    Args:
        input_tensor: Tensor with a statically known last dimension (read
            through `.shape[-1].value`).
        name: Optional suffix; the variable scope is `LayerNorm_<name>` when
            given, otherwise the default name 'LayerNorm' is used.
        epsilon: Added to the variance before `rsqrt`.

    Returns:
        The normalized tensor times 'gamma' (init 1) plus 'beta' (init 0).
    """
    scope_name = None if name is None else f'LayerNorm_{name}'
    with tf.variable_scope(scope_name, default_name='LayerNorm'):
        width = input_tensor.shape[-1].value
        gamma = tf.get_variable(
            'gamma', [width], initializer=tf.constant_initializer(1))
        beta = tf.get_variable(
            'beta', [width], initializer=tf.constant_initializer(0))
        mu = tf.reduce_mean(input_tensor, axis=-1, keepdims=True)
        variance = tf.reduce_mean(
            tf.square(input_tensor - mu), axis=-1, keepdims=True)
        standardized = (input_tensor - mu) * tf.rsqrt(variance + epsilon)
        return standardized * gamma + beta
def ae_latent_softmax(latents_pred, latents_discrete, hparams):
    """Latent prediction and loss.

    With fewer than two decode blocks, a single softmax over
    `2**hparams.z_size` classes is used. Otherwise the latent code is split
    into `hparams.num_decode_blocks` equally sized groups of bits, each
    predicted by its own dense layer; the per-block losses are summed and
    the per-block samples recombined into one integer code.

    Args:
        latents_pred: Input to the logits layer(s).
        latents_discrete: Target codes, or None to skip the loss. With
            `hparams.soft_em` these are one-hot multinomial samples;
            otherwise they are used as sparse integer labels.
        hparams: Reads `z_size`, `num_decode_blocks`, `logit_normalization`,
            `soft_em` and `sampling_temp`.

    Returns:
        (sample, loss); `loss` is None when `latents_discrete` is None.
    """
    vocab_size = 2**hparams.z_size

    if hparams.num_decode_blocks < 2:
        logits = tf.layers.dense(
            latents_pred, vocab_size, name="extra_logits")
        if hparams.logit_normalization:
            logits *= tf.rsqrt(1e-8 + tf.reduce_mean(tf.square(logits)))

        loss = None
        if latents_discrete is not None:
            if hparams.soft_em:
                # latents_discrete is actually one-hot of multinomial samples
                assert hparams.num_decode_blocks == 1
                loss = tf.nn.softmax_cross_entropy_with_logits_v2(
                    labels=latents_discrete, logits=logits)
            else:
                loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=latents_discrete, logits=logits)
        drawn = multinomial_sample(logits, vocab_size, hparams.sampling_temp)
        return drawn, loss

    # Multi-block case.
    vocab_bits = int(math.log(vocab_size, 2))
    assert vocab_size == 2**vocab_bits
    assert vocab_bits % hparams.num_decode_blocks == 0
    block_vocab_size = 2**(vocab_bits // hparams.num_decode_blocks)

    block_logits = [
        tf.layers.dense(
            latents_pred, block_vocab_size, name="extra_logits_%d" % idx)
        for idx in range(hparams.num_decode_blocks)
    ]

    loss = None
    if latents_discrete is not None:
        block_losses = []
        for idx in range(hparams.num_decode_blocks):
            # Digit `idx` of the code written in base block_vocab_size.
            digit = tf.floormod(
                tf.floordiv(latents_discrete, block_vocab_size**idx),
                block_vocab_size)
            block_losses.append(
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=digit, logits=block_logits[idx]))
        loss = sum(block_losses)

    block_samples = [
        multinomial_sample(lg, block_vocab_size, hparams.sampling_temp)
        for lg in block_logits
    ]
    # Reassemble the per-block digits into a single integer code.
    weighted = [
        digit * block_vocab_size**idx
        for idx, digit in enumerate(block_samples)
    ]
    return sum(weighted), loss
def decode_batchnorm(batchnorm_module):
    """Calculates the neuron-wise multipliers and biases of the batch norm layer.

    The layer is folded into the affine form `y = w * x + b` with
    `w = scale / sqrt(variance + eps)` and `b = offset - w * mean`, where
    `scale` / `offset` are dropped when the module does not have them.

    Note that, in the case of a convolution, the returned bias will have
    spatial dimensions.

    Args:
        batchnorm_module: `layers.BatchNorm` or `snt.BatchNorm` module.

    Returns:
        w: 1D tensor of shape (output_size) or 3D tensor of shape
            (output_height, output_width, output_channels) containing
            neuron-wise multipliers for the batch norm layer.
        b: 1D tensor of shape (output_size) or 3D tensor of shape
            (output_height, output_width, output_channels) containing
            neuron-wise biases for the batch norm layer.
    """
    if isinstance(batchnorm_module, layers.BatchNorm):
        mean = batchnorm_module.mean
        variance = batchnorm_module.variance
        eps = batchnorm_module.epsilon
        gain = batchnorm_module.scale
        shift = batchnorm_module.bias
    else:
        assert isinstance(batchnorm_module, snt.BatchNorm)
        mean = batchnorm_module.moving_mean
        variance = batchnorm_module.moving_variance
        eps = batchnorm_module._eps  # pylint: disable=protected-access
        # Accessing gamma / beta raises snt.Error when the module was built
        # without them; treat that as "not present".
        try:
            gain = batchnorm_module.gamma
        except snt.Error:
            gain = None
        try:
            shift = batchnorm_module.beta
        except snt.Error:
            shift = None

    multiplier = tf.rsqrt(variance + eps)
    if gain is not None:
        multiplier *= gain

    bias = -multiplier * mean
    if shift is not None:
        bias += shift

    # Batchnorm vars have a redundant leading dim.
    return tf.squeeze(multiplier, axis=0), tf.squeeze(bias, axis=0)
def _attn(q, k, v, train=False, scale=False):
    """Masked attention with optional score scaling and dropout.

    `q` is multiplied with `k` without any transposition, so the caller
    presumably supplies `k` already transposed (confirm against the caller).

    Args:
        q, k, v: Query, key and value tensors.
        train: Forwarded to `dropout` together with the module-level
            `attn_pdrop`.
        scale: If true, scores are multiplied by `rsqrt` of the last entry
            of `shape_list(v)`.

    Returns:
        The attention-weighted combination of `v`.
    """
    scores = tf.matmul(q, k)
    if scale:
        feature_dim = shape_list(v)[-1]
        scores = scores * tf.rsqrt(tf.cast(feature_dim, tf.float32))

    scores = mask_attn_weights(scores)
    weights = tf.nn.softmax(scores)
    weights = dropout(weights, attn_pdrop, train)
    return tf.matmul(weights, v)
def instance_norm(inputs,
                  is_training,
                  name='',
                  data_format='channels_first',
                  epsilon=1e-5,
                  beta_initializer=tf.constant_initializer(0.0),
                  gamma_initializer=tf.constant_initializer(1.0)):
    """Instance normalization for either channel layout.

    The channel index and the image axes are resolved through
    `get_channel_index` / `get_image_axes`. Moments are taken over the image
    axes, and the learned per-channel 'offset' / 'scale' are reshaped so
    that they broadcast along the channel axis.

    Args:
        inputs: Input tensor with a known rank.
        is_training: Passed as `trainable` when creating the variables.
        name: Variable scope name.
        data_format: Forwarded to the axis helpers.
        epsilon: Added to the variance before `rsqrt`.
        beta_initializer: Initializer for 'offset'.
        gamma_initializer: Initializer for 'scale'.

    Returns:
        The normalized tensor, wrapped in an identity op named 'output'.
    """
    with tf.variable_scope(name):
        channel_axis = get_channel_index(inputs, data_format)
        spatial_axes = get_image_axes(inputs, data_format=data_format)
        channels = inputs.get_shape()[channel_axis]

        mu, var = tf.nn.moments(inputs, axes=spatial_axes, keep_dims=True)
        inverse_std = tf.rsqrt(var + epsilon)
        standardized = (inputs - mu) * inverse_std

        shift = tf.get_variable(
            'offset', [channels],
            trainable=is_training, initializer=beta_initializer)
        gain = tf.get_variable(
            'scale', [channels],
            trainable=is_training, initializer=gamma_initializer)

        # All-ones shape except along the channel axis, for broadcasting.
        broadcast_shape = [1] * inputs.shape.ndims
        broadcast_shape[channel_axis] = channels
        shift = tf.reshape(shift, broadcast_shape)
        gain = tf.reshape(gain, broadcast_shape)
        return tf.identity(gain * standardized + shift, name='output')
def layer_norm(inp, scale, bias, eps=1e-6):
    """Normalize each example over axes 1, 2 and 3 (NHWC format assumed).

    Args:
        inp: Rank-4 tensor.
        scale: Optional multiplier applied after normalization.
        bias: Optional offset added after scaling.
        eps: Added to the variance before `rsqrt`.

    Returns:
        `rsqrt(var + eps) * (inp - mean)`, then optionally `* scale` and
        `+ bias`.
    """
    # The values are unused; the unpacking only succeeds when inp.shape has
    # exactly four entries.
    _batch, _height, _width, _channels = inp.shape

    mu, variance = tf.nn.moments(inp, [1, 2, 3], keep_dims=True)
    inverse_std = tf.rsqrt(variance + eps)
    result = inverse_std * (inp - mu)
    if scale is not None:
        result = result * scale
    if bias is not None:
        result = result + bias
    return result
def l2_batch_normalize(x, epsilon=1e-12, scope=None):
    """
    Helper function to normalize a batch of vectors.

    Each example is flattened, divided by its largest absolute entry (so
    that squaring cannot overflow), scaled to unit L2 norm, and reshaped
    back to the original shape.

    :param x: the input placeholder
    :param epsilon: stabilizes division
    :param scope: optional name scope; also used as the name of the final
                  reshape op
    :return: the batch of l2 normalized vector
    """
    with tf.name_scope(scope, "l2_batch_normalize") as scope:
        original_shape = tf.shape(x)
        flat = tf.contrib.layers.flatten(x)
        flat /= (epsilon + reduce_max(tf.abs(flat), 1, keepdims=True))
        sum_of_squares = reduce_sum(tf.square(flat), 1, keepdims=True)
        inverse_norm = tf.rsqrt(np.sqrt(epsilon) + sum_of_squares)
        unit = tf.multiply(flat, inverse_norm)
        return tf.reshape(unit, original_shape, scope)
def pixel_norm(images, epsilon=1.0e-8):
    """Pixel normalization.

    For each pixel a[i,j,k] of image in HWC format, normalize its value to
    b[i,j,k] = a[i,j,k] / SQRT(SUM_k(a[i,j,k]^2) / C + eps).

    Args:
        images: A 4D `Tensor` of NHWC format.
        epsilon: A small positive number to avoid division by zero.

    Returns:
        A 4D `Tensor` with pixel-wise normalized channels.
    """
    channel_mean_square = tf.reduce_mean(
        tf.square(images), axis=3, keepdims=True)
    return images * tf.rsqrt(channel_mean_square + epsilon)