Exemple #1
0
def cosine_distances(test, support):
  """Computes pairwise cosine similarities between provided tensors.

  Every row of `test` and `support` is rescaled to (approximately) unit L2
  norm and the Gram matrix of the rescaled rows is returned, so entry (i, j)
  is the cosine of the angle between test[i] and support[j]. Despite the
  function name, the result is a similarity: larger values mean the rows
  point in more similar directions.

  Parameters
  ----------
  test: tf.Tensor
    Of shape (n_test, n_feat)
  support: tf.Tensor
    Of shape (n_support, n_feat)

  Returns
  -------
  tf.Tensor:
    Of shape (n_test, n_support)
  """
  # The epsilon has to live inside rsqrt: rsqrt(0) is inf, and adding a
  # constant afterwards leaves it inf, which turns an all-zero row into NaN
  # once it is multiplied by its reciprocal norm.
  rnorm_test = tf.rsqrt(
      tf.reduce_sum(tf.square(test), 1, keep_dims=True) + 1e-7)
  rnorm_support = tf.rsqrt(
      tf.reduce_sum(tf.square(support), 1, keep_dims=True) + 1e-7)
  test_normalized = test * rnorm_test
  support_normalized = support * rnorm_support

  # Transpose so that the matmul contracts over the feature axis.
  support_normalized_t = tf.transpose(support_normalized, perm=[1, 0])
  g = tf.matmul(test_normalized, support_normalized_t)  # Gram matrix
  return g
Exemple #2
0
  def _apply_noisy_update(self, mom, grad, var):
    """Builds the noisy update following preconditioned Langevin dynamics.

    Args:
      mom: variable holding the moving average of squared gradients; it is
        overwritten with the refreshed average before the sample is drawn.
      grad: gradient tensor; its dtype is used for every cast below.
      var: the variable being optimized, used to differentiate the
        preconditioner.

    Returns:
      A tensor sampled from a normal distribution whose mean is the
      preconditioned drift term and whose stddev is the scaled noise level.
    """
    dtype = grad.dtype

    # Noise is only injected once the counter has passed the burn-in value.
    noise_scale = tf.where(
        tf.squeeze(self._counter > self._burnin),
        tf.cast(tf.rsqrt(self._learning_rate), dtype),
        tf.zeros([], dtype))

    # Exponentially weighted moving average of squared gradients.
    # Not thread safe.
    decay = tf.cast(self._decay_tensor, dtype)
    updated_mom = decay * mom + (1. - decay) * tf.square(grad)
    precond = tf.rsqrt(updated_mom + tf.cast(self._diagonal_bias, dtype))

    # Gradients of the preconditioner with respect to the variable.
    _, precond_grads = diag_jacobian(
        xs=var,
        ys=precond,
        parallel_iterations=self._parallel_iterations)

    drift = precond * grad * tf.cast(self._data_size, dtype)
    mean = 0.5 * (drift - precond_grads[0])
    noise_scale = noise_scale * tf.sqrt(precond)
    sample_shape = tf.broadcast_dynamic_shape(tf.shape(mean),
                                              tf.shape(noise_scale))

    # Force the moment update to run before the sample is produced.
    store_mom = tf.assign(mom, updated_mom)
    with tf.control_dependencies([store_mom]):
      return tf.random_normal(shape=sample_shape,
                              mean=mean,
                              stddev=noise_scale,
                              dtype=dtype)
Exemple #3
0
def batch_normalized_linear_layer(state_below, scope_name, n_inputs, n_outputs, stddev, wd, eps=.00001, test=False):
    """
    Linear layer followed by batch normalization and a randomized ReLU.

    In training mode (``test=False``) the activations are normalized with the
    moments of the current batch, and those moments are also written into the
    non-trainable ``bn_mean`` / ``bn_variance`` variables. In test mode the
    stored variables are used for the normalization instead.
    """
    training = not test
    with tf.variable_scope(scope_name) as scope:
        bn_op_name = scope.name + "_bn"

        weight = _variable_with_weight_decay(
            "weights", shape=[n_inputs, n_outputs], stddev=stddev, wd=wd)
        act = tf.matmul(state_below, weight)

        # Per-unit statistics of the current batch.
        batch_mean, batch_variance = tf.nn.moments(act, [0])

        # Variables that remember the statistics for use at test time.
        mean = _variable_on_cpu('bn_mean', [n_outputs], tf.constant_initializer(0.0), trainable=False)
        variance = _variable_on_cpu('bn_variance', [n_outputs], tf.constant_initializer(1.0), trainable=False)

        if training:
            assign_mean = mean.assign(batch_mean)
            assign_variance = variance.assign(batch_variance)
            used_mean, used_variance = batch_mean, batch_variance
        else:
            used_mean, used_variance = mean, variance
        act_bn = tf.mul(act - used_mean, tf.rsqrt(used_variance + eps), name=bn_op_name)

        # Learned shift and scale applied after normalization.
        beta = _variable_on_cpu("beta", [n_outputs], tf.constant_initializer(0.0))
        gamma = _variable_on_cpu("gamma", [n_outputs], tf.constant_initializer(1.0))
        bn = tf.add(tf.mul(act_bn, gamma), beta)

        output = randomized_relu(bn, .1, name=scope.name, is_training=training)
        if training:
            # Make sure the stored statistics are refreshed whenever the
            # output is evaluated.
            output = control_flow_ops.with_dependencies(
                dependencies=[assign_mean, assign_variance], output_tensor=output)
        _activation_summary(output)
    return output
Exemple #4
0
def batch_normalized_conv_layer(state_below, scope_name, n_inputs, n_outputs, filter_shape, stddev, wd, eps=.00001, test=False):
    """
    Stride-1 'SAME' convolution followed by batch normalization and a
    randomized ReLU.

    In training mode (``test=False``) the per-channel moments of the current
    batch are used for normalization and are also written into the
    ``bn_mean`` / ``bn_variance`` variables. In test mode the stored
    variables are used instead.
    """
    training = not test
    with tf.variable_scope(scope_name) as scope:
        bn_op_name = scope.name + "_bn"

        kernel_shape = [filter_shape[0], filter_shape[1], n_inputs, n_outputs]
        kernel = _variable_with_weight_decay(
            "weights", shape=kernel_shape, stddev=stddev, wd=wd)
        conv = tf.nn.conv2d(state_below, kernel, [1, 1, 1, 1], padding='SAME')

        # Statistics reduced over every axis except the last one.
        batch_mean, batch_variance = tf.nn.moments(conv, [0, 1, 2])

        # Variables that remember the statistics for use at test time.
        mean = _variable_on_cpu("bn_mean", [n_outputs], tf.constant_initializer(0.0), False)
        variance = _variable_on_cpu("bn_variance", [n_outputs], tf.constant_initializer(1.0), False)

        if training:
            assign_mean = mean.assign(batch_mean)
            assign_variance = variance.assign(batch_variance)
            used_mean, used_variance = batch_mean, batch_variance
        else:
            used_mean, used_variance = mean, variance
        conv_bn = tf.mul(conv - used_mean, tf.rsqrt(used_variance + eps), name=bn_op_name)

        # Learned shift and scale applied after normalization.
        beta = _variable_on_cpu("beta", [n_outputs], tf.constant_initializer(0.0))
        gamma = _variable_on_cpu("gamma", [n_outputs], tf.constant_initializer(1.0))
        bn = tf.add(tf.mul(conv_bn, gamma), beta)

        output = randomized_relu(bn, .1, name=scope.name, is_training=training)
        if training:
            # Make sure the stored statistics are refreshed whenever the
            # output is evaluated.
            output = control_flow_ops.with_dependencies(
                dependencies=[assign_mean, assign_variance], output_tensor=output)
        _activation_summary(output)

    return output
Exemple #5
0
 def _resource_apply_dense(self, grad, var):
   """Builds the dense update op for `var` given its gradient `grad`.

   The gradient is rescaled by the inverse square root of a decayed average
   of its squared values (kept either as one full-size slot "v" or as two
   factored slots "vr"/"vc"), optionally clipped by its RMS, optionally
   smoothed through the "m" slot, and finally subtracted from `var`.

   Returns:
     A grouped op containing the variable update and all slot updates.
   """
   # The tiny constant keeps the squared gradient strictly positive so the
   # rsqrt calls below never see an exact zero.
   grad_squared = tf.square(grad) + 1e-30
   grad_squared_mean = tf.reduce_mean(grad_squared)
   decay_rate = self._decay_rate
   update_scale = self._learning_rate
   if self._multiply_by_parameter_scale:
     update_scale *= self._parameter_scale(var)
   # HACK: Make things dependent on grad.
   # This confounds the XLA rewriter and keeps it from fusing computations
   # across different variables.  This fusion is bad for HBM usage, since
   # it causes the gradients to persist in memory.
   decay_rate += grad_squared_mean * 1e-30
   update_scale += grad_squared_mean * 1e-30
   # END HACK
   mixing_rate = 1.0 - decay_rate
   shape = var.get_shape().as_list()
   updates = []
   if self._should_use_factored_second_moment_estimate(shape):
     # Factored estimate: keep only means over axis 1 and over axis 0
     # instead of a full-size accumulator.
     # NOTE(review): reducing over axes 0 and 1 presumably assumes a 2-D
     # variable - confirm against _should_use_factored_second_moment_estimate.
     grad_squared_row_mean = tf.reduce_mean(grad_squared, 1)
     grad_squared_col_mean = tf.reduce_mean(grad_squared, 0)
     vr = self.get_slot(var, "vr")
     new_vr = (decay_rate * vr + mixing_rate * grad_squared_row_mean)
     vc = self.get_slot(var, "vc")
     new_vc = (decay_rate * vc + mixing_rate * grad_squared_col_mean)
     vr_update = tf.assign(vr, new_vr, use_locking=self._use_locking)
     vc_update = tf.assign(vc, new_vc, use_locking=self._use_locking)
     updates = [vr_update, vc_update]
     # Dividing the row factor by its overall mean keeps the product of the
     # row and column factors on the scale of a single second moment.
     long_term_mean = tf.reduce_mean(new_vr)
     r_factor = tf.rsqrt(new_vr / long_term_mean)
     c_factor = tf.rsqrt(new_vc)
     x = grad * tf.expand_dims(r_factor, 1) * tf.expand_dims(c_factor, 0)
   else:
     # Unfactored estimate: one accumulator entry per gradient element.
     v = self.get_slot(var, "v")
     new_v = decay_rate * v + mixing_rate * grad_squared
     v_update = tf.assign(v, new_v, use_locking=self._use_locking)
     updates = [v_update]
     x = grad * tf.rsqrt(new_v)
   if self._clipping_threshold is not None:
     # Shrink the update only when its RMS exceeds the threshold.
     clipping_denom = tf.maximum(1.0, reduce_rms(x) / self._clipping_threshold)
     x /= clipping_denom
   subtrahend = update_scale * x
   if self._beta1:
     # Optional exponential moving average of the update itself.
     m = self.get_slot(var, "m")
     new_m = self._beta1 * m + (1.0 - self._beta1) * subtrahend
     updates.append(tf.assign(m, new_m, use_locking=self._use_locking))
     subtrahend = new_m
   var_update = tf.assign_sub(var, subtrahend, use_locking=self._use_locking)
   updates = [var_update] + updates
   return tf.group(*updates)
Exemple #6
0
 def _opsBatchNorm(self, x, m, v, beta, gamma, epsilon,
                   scale_after_normalization):
   """Batch normalization composed from elementary TensorFlow ops.

   Computes (x - m) * rsqrt(v + epsilon), multiplies by `gamma` only when
   `scale_after_normalization` is truthy, and always adds `beta`.
   """
   centered = x - m
   inv_std = tf.rsqrt(v + epsilon)
   normalized = centered * inv_std
   scaled = gamma * normalized if scale_after_normalization else normalized
   return scaled + beta
Exemple #7
0
def l2_normalize(incoming, dim, epsilon=1e-12, name="l2_normalize"):
    """ L2 Normalization.

    Normalizes along dimension `dim` using an L2 norm.

    For a 1-D tensor with `dim = 0`, computes
    ```
    output = x / sqrt(max(sum(x**2), epsilon))
    ```

    For `x` with more dimensions, independently normalizes each 1-D slice along
    dimension `dim`.

    Arguments:
        incoming: `Tensor`. Incoming Tensor.
        dim: `int`. Dimension along which to normalize.
        epsilon: `float`. A lower bound value for the norm. Will use
            `sqrt(epsilon)` as the divisor if `norm < sqrt(epsilon)`.
        name: `str`. A name for this layer (optional).

    Returns:
      A `Tensor` with the same shape as `incoming`.
    """
    with tf.variable_op_scope([incoming], name) as name:
        # Go through the public conversion entry point; `tf.ops` is not part
        # of the documented TensorFlow API surface.
        x = tf.convert_to_tensor(incoming, name="x")
        square_sum = tf.reduce_sum(tf.square(x), [dim], keep_dims=True)
        # Clamp the squared norm from below so all-zero slices do not
        # produce an infinite scale factor.
        x_inv_norm = tf.rsqrt(tf.maximum(square_sum, epsilon))

    # NOTE(review): `name` is whatever `variable_op_scope` yields; confirm it
    # is acceptable as an op name for the TensorFlow version in use.
    return tf.mul(x, x_inv_norm, name=name)
def ae_latent_softmax(latents_pred, latents_discrete_hot, vocab_size, hparams):
  """Predicts latent logits, then computes their loss and a sample.

  Args:
    latents_pred: Tensor of shape [..., depth].
    latents_discrete_hot: Tensor of shape [..., vocab_size].
    vocab_size: an int representing the vocab size.
    hparams: tf.contrib.training.HParams.

  Returns:
    sample: Tensor of shape [...], a sample from a multinomial distribution.
    loss: Tensor of shape [...], the softmax cross-entropy.
  """
  with tf.variable_scope("latent_logits"):
    logits = tf.layers.dense(latents_pred, vocab_size, name="logits_dense")

    if hparams.logit_normalization:
      # Divide by the root-mean-square taken over every logit.
      mean_square = tf.reduce_mean(tf.square(logits))
      logits = logits * tf.rsqrt(1e-8 + mean_square)

    loss = tf.nn.softmax_cross_entropy_with_logits_v2(
        labels=latents_discrete_hot, logits=logits)

    # TODO(trandustin): tease this out from ae_latent_softmax.
    # we use just the loss portion to anchor prior / encoder on text.
    sample = multinomial_sample(
        logits, vocab_size, hparams.sampling_method, hparams.sampling_temp)
    return sample, loss
Exemple #9
0
def layer_norm_all(h,
                   batch_size,
                   base,
                   num_units,
                   scope='layer_norm',
                   reuse=False,
                   gamma_start=1.0,
                   epsilon=1e-3,
                   use_bias=True):
  """Layer Norm (faster version, but not using defun).

  Normalizes `base` groups of `num_units` features in parallel (e.g. the
  i, g, j, o pre-activations of an LSTM, where base is 4).

  Args:
    h: tensor that can be reshaped to [batch_size, base, num_units].
    batch_size: size of the leading axis of `h`.
    base: number of groups that are normalized independently.
    num_units: number of features in each group.
    scope: variable scope holding `ln_gamma` and `ln_beta`.
    reuse: if truthy, reuse existing variables in `scope`.
    gamma_start: initial value of the scale variable.
    epsilon: constant added to the variance before the inverse square root.
    use_bias: whether to create and add the `ln_beta` offset.

  Returns:
    Tensor of shape [batch_size, base * num_units].
  """
  # Reshape so that every group gets its own mean and variance.
  h_reshape = tf.reshape(h, [batch_size, base, num_units])
  mean = tf.reduce_mean(h_reshape, [2], keep_dims=True)
  var = tf.reduce_mean(tf.square(h_reshape - mean), [2], keep_dims=True)
  epsilon = tf.constant(epsilon)
  rstd = tf.rsqrt(var + epsilon)
  h_reshape = (h_reshape - mean) * rstd
  # reshape back to original
  h = tf.reshape(h_reshape, [batch_size, base * num_units])
  # The affine parameters must match the flattened width. This used to be
  # hard-coded to 4 * num_units, which only lines up when base == 4.
  param_shape = [base * num_units]
  with tf.variable_scope(scope):
    if reuse:
      tf.get_variable_scope().reuse_variables()
    gamma = tf.get_variable(
        'ln_gamma', param_shape,
        initializer=tf.constant_initializer(gamma_start))
    if use_bias:
      beta = tf.get_variable(
          'ln_beta', param_shape, initializer=tf.constant_initializer(0.0))
  if use_bias:
    return gamma * h + beta
  return gamma * h
Exemple #10
0
def BatchClipByL2norm(t, upper_bound, name=None):
  """Clip an array of tensors by L2 norm.

  Every dimension-0 slice of `t` (each row, for a matrix) is scaled down so
  that its L2 norm does not exceed `upper_bound`; slices already within the
  bound are left essentially unchanged. Each slice corresponds to one
  example in the batch.

  Args:
    t: the input tensor.
    upper_bound: the upperbound of the L2 norm.
    name: optional name.
  Returns:
    the clipped tensor.
  """

  assert upper_bound > 0
  with tf.op_scope([t, upper_bound], name, "batch_clip_by_l2norm") as name:
    original_shape = tf.shape(t)
    n_rows = tf.slice(original_shape, [0], [1])
    # Flatten everything but the batch axis.
    flat = tf.reshape(t, tf.concat(0, [n_rows, [-1]]))
    inv_bound = tf.fill(tf.slice(original_shape, [0], [1]),
                        tf.constant(1.0/upper_bound))
    # Add a small number to avoid divide by 0
    squared_norms = tf.reduce_sum(flat * flat, [1])
    inv_norms = tf.rsqrt(squared_norms + 0.000001)
    # min(1/norm, 1/bound) * bound is ~1 within the bound, bound/norm beyond.
    row_scale = tf.minimum(inv_norms, inv_bound) * upper_bound
    rescaled = tf.matmul(tf.diag(row_scale), flat)
    clipped_t = tf.reshape(rescaled, original_shape, name=name)
  return clipped_t
Exemple #11
0
 def _norm(x, g=None, b=None, e=1e-5, axis=[1]):
     """Normalize `x` to zero mean and unit variance over `axis`.

     The affine transform `x * g + b` is applied only when both `g` and `b`
     are given; otherwise the plain normalized tensor is returned.
     """
     mean = tf.reduce_mean(x, axis=axis, keepdims=True)
     variance = tf.reduce_mean(tf.square(x - mean), axis=axis, keepdims=True)
     normalized = (x - mean) * tf.rsqrt(variance + e)
     if g is None or b is None:
         return normalized
     return normalized * g + b
  def compute_next_h_d(self, meta_opt, w_bot, w_top, bias, x, z, d, backward_w):
    """Propagate error back down the network while computing hidden state.

    Returns the hidden state `h` and the error signal `d`. When `backward_w`
    is None, `d` is returned untouched; otherwise it is replaced by the
    freshly computed delta projected through `backward_w` and rescaled by
    the inverse root-mean-square over its last axis.
    """
    z = x if z is None else z

    # Shape per the original annotation: [bs x 60 x h_channels]
    h = meta_opt.compute_h(x, z, d, bias, w_bot, w_top)

    # compute the next d
    delta = meta_opt.next_delta(z, h, d)

    if backward_w is None:
      return h, d

    def project_delta(w, delta):
      # Move the channel axis forward, multiply by w transposed, move it back.
      swapped = tf.transpose(delta, [0, 2, 1])
      projected = snt.BatchApply(
          lambda x: tf.matmul(x, w, transpose_b=True))(swapped)
      return tf.transpose(projected, [0, 2, 1])

    # replace the "backward pass" with a random matrix.
    d = project_delta(backward_w, delta)
    mean_square = tf.reduce_mean(tf.square(d), [2], keepdims=True)
    d = d * tf.rsqrt(1e-6 + mean_square)

    return h, d
Exemple #13
0
def multihead_attn(q, k, v):
  """Scaled dot-product attention.

  Per the original annotation, q, k and v have shape
  [batch, heads, sequence, features]. The scores are scaled by the inverse
  square root of the static size of the last axis of `v`.
  """
  scores = tf.matmul(q, k, transpose_b=True)
  inv_scale = tf.rsqrt(tf.cast(v.shape[-1].value, scores.dtype))
  weights = tf.nn.softmax(scores * inv_scale)
  return tf.matmul(weights, v)
def simple_attention(target, source, bias=None, summaries=True):
  """A simple attention function.

  Both inputs are flattened over their two middle axes, scaled dot-product
  attention from `target` to `source` is computed, and the result is
  reshaped back to the shape of `target`.

  Args:
    target: a `Tensor` with shape
     `[batch, target_timesteps_1, target_timesteps_2, depth]`. The code reads
     the fourth shape entry, so a rank-4 input is required.
    source: a `Tensor` with shape
     `[batch, source_timesteps_1, source_timesteps_2, depth]`.
    bias: an optional `Tensor` with shape `[batch, timesteps, 1, 1]` used
     to mask the attention to not attend to padding of input.
    summaries: Boolean, whether to output summaries.

  Returns:
    a `Tensor` with same shape as `target`
  """
  # The input tensors go in `values`; passed positionally they would land in
  # the `default_name` slot of tf.name_scope and be silently ignored.
  with tf.name_scope("simple_attention", values=[target, source]):
    target_shape = tf.shape(target)
    source_shape = tf.shape(source)
    target = tf.reshape(target, [
        target_shape[0], target_shape[1] * target_shape[2], target_shape[3]
    ])
    source = tf.reshape(source, [
        source_shape[0], source_shape[1] * source_shape[2], source_shape[3]
    ])
    attention = tf.matmul(target, source, transpose_b=True)
    # Scale the scores by 1 / sqrt(depth).
    attention *= tf.rsqrt(tf.to_float(tf.shape(target)[2]))
    if bias is not None:
      attention += tf.expand_dims(tf.squeeze(bias, axis=[2, 3]), axis=1)
    attention = tf.nn.softmax(attention)
    # Emit the image summary only when the scope is not in reuse mode.
    if summaries and not tf.get_variable_scope().reuse:
      tf.summary.image("attention", tf.expand_dims(attention, 3), max_outputs=5)
    attended = tf.matmul(attention, source)
    return tf.reshape(attended, target_shape)
Exemple #15
0
def layer_norm(x: tf.Tensor, epsilon: float = 1e-6) -> tf.Tensor:
    """Layer-normalize ``x`` over its last dimension.

    Creates (or fetches) ``gamma`` and ``beta`` variables in the
    ``LayerNorm`` scope, sized to the last dimension of ``x``.

    Implementation based on tensor2tensor.

    Arguments:
        x: The ``Tensor`` to normalize.
        epsilon: The smoothing parameter of the normalization.

    Returns:
        The normalized tensor, scaled by ``gamma`` and shifted by ``beta``.
    """
    with tf.variable_scope("LayerNorm"):
        param_shape = [x.get_shape()[-1]]
        gamma = get_variable(
            name="gamma", shape=param_shape, dtype=tf.float32,
            initializer=tf.ones_initializer())
        beta = get_variable(
            name="beta", shape=param_shape, dtype=tf.float32,
            initializer=tf.zeros_initializer())

        mean = tf.reduce_mean(x, axis=[-1], keepdims=True)
        squared_dev = tf.square(x - mean)
        variance = tf.reduce_mean(squared_dev, axis=[-1], keepdims=True)

        centered = x - mean
        inv_std = tf.rsqrt(variance + epsilon)
        return centered * inv_std * gamma + beta
Exemple #16
0
def inst_norm(x, train, data_format='NHWC', name=None, affine=False, act=lrelu, epsilon=1e-5):
    """Instance normalization with optional affine transform and activation.

    Rank-4 inputs are normalized over axes 1 and 2 (after converting NCHW to
    NHWC when requested); other inputs are normalized over axis 1. When
    `affine` is set, learned `scale` and `shift` variables are applied. The
    result is passed through `act` unless `act` is None. `train` is not used
    by this function.
    """
    with tf.variable_scope(name, default_name='Inst', reuse=None):
        if x.get_shape().ndims == 4 and data_format == 'NCHW':
            x = nchw_to_nhwc(x)

        mean_dim = [1,2] if x.get_shape().ndims == 4 else [1]

        mu, sigma_sq = tf.nn.moments(x, mean_dim, keep_dims=True)
        inv_std = tf.rsqrt(sigma_sq+epsilon)
        out = (x-mu)*inv_std

        if affine:
            var_shape = [x.get_shape()[-1]]
            shift = slim.model_variable('shift', shape=var_shape, initializer=tf.zeros_initializer)
            scale = slim.model_variable('scale', shape=var_shape, initializer=tf.ones_initializer)
            out = scale*out + shift

        # Restore the caller's layout if the input was converted above.
        if x.get_shape().ndims == 4 and data_format == 'NCHW':
            out = nhwc_to_nchw(out)

        return out if act is None else act(out)
Exemple #17
0
def layer_norm(x,
               num_units,
               scope='layer_norm',
               reuse=False,
               gamma_start=1.0,
               epsilon=1e-3,
               use_bias=True):
  """Layer normalization over axis 1 with a learned scale and optional bias.

  `ln_gamma` (and `ln_beta` when `use_bias` is set) are created in `scope`
  with `num_units` entries; existing variables are reused when `reuse` is
  exactly True.
  """
  centered = x - tf.reduce_mean(x, [1], keep_dims=True)
  variance = tf.reduce_mean(tf.square(centered), [1], keep_dims=True)
  inv_std = tf.rsqrt(variance + epsilon)

  with tf.variable_scope(scope):
    if reuse is True:
      tf.get_variable_scope().reuse_variables()
    gamma = tf.get_variable(
        'ln_gamma', [num_units],
        initializer=tf.constant_initializer(gamma_start))
    if use_bias:
      beta = tf.get_variable(
          'ln_beta', [num_units], initializer=tf.constant_initializer(0.0))

  normalized = gamma * centered * inv_std
  if not use_bias:
    return normalized
  return normalized + beta
Exemple #18
0
def ae_latent_softmax(latents_pred, latents_discrete_hot, hparams):
  """Predicts latent logits, then computes their loss and a sample.

  The vocabulary size is derived as 2 ** hparams.bottleneck_bits.

  Args:
    latents_pred: Tensor of shape [..., depth].
    latents_discrete_hot: Tensor of shape [..., vocab_size].
    hparams: tf.contrib.training.HParams.

  Returns:
    sample: Tensor of shape [...], a sample from a multinomial distribution.
    loss: Tensor of shape [...], the softmax cross-entropy.
  """
  vocab_size = 2**hparams.bottleneck_bits
  with tf.variable_scope("latent_logits"):
    logits = tf.layers.dense(latents_pred, vocab_size, name="logits_dense")

    if hparams.logit_normalization:
      # Divide by the root-mean-square taken over every logit.
      mean_square = tf.reduce_mean(tf.square(logits))
      logits = logits * tf.rsqrt(1e-8 + mean_square)

    loss = tf.nn.softmax_cross_entropy_with_logits_v2(
        labels=latents_discrete_hot, logits=logits)
    sample = multinomial_sample(
        logits, vocab_size, hparams.sampling_method, hparams.sampling_temp)
    return sample, loss
Exemple #19
0
def full_batchnorm(pre_activations, batch, epsilon=1e-8, train=True,
                   beta_init=tf.constant_initializer(0),
                   gamma_init=tf.constant_initializer(1)):
    """Does full batch normalisation of pre activations.
    Expects to get given something pre-nonlinearity.

    This is only set up for feed forward nets, in order to work properly for
    recurrent nets we will need to know what step we are up to, as in the
    paper they calculate population statistics at every time step.

    Args:
      pre_activations: the logits who will be normalised. We assume this is
        of shape [batch_size, num_units]
      batch: the data which generated the logits. Currently unused; it is
        kept so existing callers keep working.
      epsilon: constant added to the variance before the inverse square root.
      train: currently unused; statistics are always taken from the batch
        passed in, no running averages are maintained here.
      beta_init: initializer for the per-unit offset `beta`.
      gamma_init: initializer for the per-unit scale `gamma`.

    Returns:
      batch normalised activations.
    """
    # beta and gamma are per-unit parameters, so they are sized by the last
    # axis (num_units), not by the batch axis.
    num_units = pre_activations.get_shape()[-1]
    # Pass the initializers through; otherwise the variables fall back to the
    # scope's default initializer instead of starting at 0 and 1.
    beta = tf.get_variable('beta', [num_units], initializer=beta_init)
    gamma = tf.get_variable('gamma', [num_units], initializer=gamma_init)
    mean, variance = tf.nn.moments(pre_activations, [0])
    isqr = tf.rsqrt(variance+epsilon)
    centered = pre_activations - mean
    return beta + gamma * centered * isqr
Exemple #20
0
 def grad(grad_ys):
   """Scale `grad_ys` by 0.5 * rsqrt(x), staying finite where x == 0.

   `x` comes from the enclosing scope. Where x is exactly zero the factor is
   replaced by the square root of the largest finite value of x's dtype.
   """
   numpy_dtype = x.dtype.as_numpy_dtype()
   large_float_like_x = np.sqrt(np.finfo(numpy_dtype).max)
   at_zero = tf.equal(x, 0)
   finite_fallback = tf.fill(x.shape, large_float_like_x)
   regular_grads = 0.5 * tf.rsqrt(x)
   safe_grads = tf.where(at_zero, finite_fallback, regular_grads)
   return grad_ys * safe_grads
Exemple #21
0
def clip_weight_norm(t, clip_norm, name=None):
  """Rescale `t` so that L2 norms taken along axis 0 do not exceed `clip_norm`.

  Each slice is multiplied by clip_norm * min(1 / norm, 1 / clip_norm), which
  is 1 while the norm is within the bound and clip_norm / norm beyond it.
  """
  with tf.op_scope([t, clip_norm], name, "clip_weight_norm") as scope:
    squared_norms = tf.reduce_sum(t * t, 0)
    inv_norms = tf.rsqrt(squared_norms)
    scaled = t * clip_norm
    factor = tf.minimum(inv_norms, tf.constant(1.0 / clip_norm))
    tclip = tf.identity(scaled * factor)

    return tclip
Exemple #22
0
 def call(self, x, epsilon=1e-6):
   """Layer-normalize `x` over its last axis, computing in float32.

   The input is cast to float32, normalized, transformed with `self.scale`
   and `self.bias`, and cast back to the dtype it arrived with.
   """
   input_dtype = x.dtype
   x32 = tf.cast(x=x, dtype=tf.float32)
   mean = tf.reduce_mean(x32, axis=[-1], keepdims=True)
   squared_dev = tf.square(x32 - mean)
   variance = tf.reduce_mean(squared_dev, axis=[-1], keepdims=True)
   normalized = (x32 - mean) * tf.rsqrt(variance + epsilon)
   affine = normalized * self.scale + self.bias
   return tf.cast(x=affine, dtype=input_dtype)
 def true_log_joint(features, prior_precision, w, y):
   """Log joint density of weights `w` and observations `y`.

   Sums the log-density of `w` under a zero-mean normal with scale
   rsqrt(prior_precision) and the log-density of `y` under a unit-scale
   normal centred on `features` contracted with `w`.
   """
   prior = tfd.Normal(loc=0., scale=tf.rsqrt(prior_precision))
   log_prior = tf.reduce_sum(prior.log_prob(w))

   predictions = tf.tensordot(features, w, [[1], [0]])
   likelihood = tfd.Normal(loc=predictions, scale=1.)
   log_likelihood = tf.reduce_sum(likelihood.log_prob(y))

   return log_prior + log_likelihood
Exemple #24
0
def BatchRenorm(x, rmax, dmax, decay=0.9, epsilon=1e-5,
                use_scale=True, use_bias=True):
    """
    Batch Renormalization layer, as described in the paper:
    `Batch Renormalization: Towards Reducing Minibatch Dependence in Batch-Normalized Models
    <https://arxiv.org/abs/1702.03275>`_.

    Args:
        x (tf.Tensor): a NHWC or NC tensor.
        rmax, dmax (tf.Tensor): a scalar tensor, the maximum allowed corrections.
        decay (float): decay rate of moving average.
        epsilon (float): epsilon to avoid divide-by-zero.
        use_scale, use_bias (bool): whether to use the extra affine transformation or not.

    Returns:
        tf.Tensor: a tensor named ``output`` with the same shape of x.

    Variable Names:

    * ``beta``: the bias term.
    * ``gamma``: the scale term. Input will be transformed by ``x * gamma + beta``.
    * ``mean/EMA``: the moving average of mean.
    * ``variance/EMA``: the moving average of variance.
    """

    shape = x.get_shape().as_list()
    assert len(shape) in [2, 4]
    n_out = shape[-1]
    # Rank-2 input is temporarily expanded to rank 4 and squeezed back at
    # the end, so the same normalization ops serve both cases.
    if len(shape) == 2:
        x = tf.reshape(x, [-1, 1, 1, n_out])
    beta, gamma, moving_mean, moving_var = get_bn_variables(
        n_out, use_scale, use_bias, tf.constant_initializer(1.0))

    ctx = get_current_tower_context()
    use_local_stat = ctx.is_training
    # for BatchRenorm, use_local_stat should always be is_training, unless a
    # different usage comes out in the future.

    if use_local_stat:
        xn, batch_mean, batch_var = tf.nn.fused_batch_norm(x, gamma, beta,
                                                           epsilon=epsilon, is_training=True)
        inv_sigma = tf.rsqrt(moving_var, 'inv_sigma')
        # r and d are the renormalization corrections; they are clipped to
        # [1/rmax, rmax] and [-dmax, dmax] and excluded from backprop.
        r = tf.stop_gradient(tf.clip_by_value(
            tf.sqrt(batch_var) * inv_sigma, 1.0 / rmax, rmax))
        d = tf.stop_gradient(tf.clip_by_value(
            (batch_mean - moving_mean) * inv_sigma,
            -dmax, dmax))
        xn = xn * r + d
    else:
        # Inference: normalize with the moving statistics only.
        xn = tf.nn.batch_normalization(
            x, moving_mean, moving_var, beta, gamma, epsilon)

    if len(shape) == 2:
        xn = tf.squeeze(xn, [1, 2])
    # NOTE(review): batch_mean / batch_var only exist when use_local_stat is
    # true; this presumably relies on is_main_training_tower implying
    # is_training - confirm against the tower context implementation.
    if ctx.is_main_training_tower:
        return update_bn_ema(xn, batch_mean, batch_var, moving_mean, moving_var, decay)
    else:
        return tf.identity(xn, name='output')
    def get_function(points, mu, sigma):
        """Evaluate the multivariate normal density f_ik for every point/cluster pair.

        Uses `coef`, `n` and `k` from the enclosing scope. Shapes quoted in
        the comments below are the original author's annotations.

        Returns a tensor reshaped to [n, k].
        """
        # Normalizing constant ((2pi)^p*|S_k|)^-1/2, annotated as [k].
        norm_const = coef*tf.rsqrt(tf.batch_matrix_determinant(sigma))
        norm_const = tf.tile(tf.reshape(norm_const, [1,k]), [n,1])  # [n,k]

        # Deviations x_i-u_k, annotated as [n*k, p, 1].
        deviations = tf.sub(tf.tile(points, [k,1,1]), tf.tile(mu, [n,1,1]))
        tiled_sigma = tf.tile(sigma, [n,1,1])  # [n*k,p,p]

        # Quadratic form d' * S^-1 * d, annotated as [n*k, 1, 1].
        deviations_t = tf.transpose(deviations, perm=[0,2,1])
        solved = tf.batch_matmul(tf.batch_matrix_inverse(tiled_sigma), deviations)
        quadratic = tf.batch_matmul(deviations_t, solved)
        kernel = tf.reshape(tf.exp(-0.5*quadratic), [n,k])

        # Density for each vector under each cluster's parameters.
        return tf.mul(norm_const, kernel)
 def model(features, prior_precision):
   """Linear regression model with a normal prior on the weights.

   Draws `w` from a zero-mean normal with scale rsqrt(prior_precision), one
   entry per column of `features`, and returns the unit-scale normal random
   variable `y` centred on `features` contracted with `w`.
   """
   prior_scale = tf.rsqrt(prior_precision)
   w = ed.Normal(
       loc=0., scale=prior_scale, sample_shape=features.shape[1], name="w")
   predictions = tf.tensordot(features, w, [[1], [0]])
   return ed.Normal(loc=predictions, scale=1., name="y")
Exemple #27
0
 def normalize(self, x, train=True):
   """Returns a batch-normalized version of x.

   With `train` set, the moments of `x` over axis 0 are used and stored in
   `self.mean` / `self.variance` (the stored variance is multiplied by
   `self.keep_prob_prior`). Otherwise the averages tracked by
   `self.ewma_trainer` are used, falling back to `self.epsilon` when an
   average is not available.
   """
   if not train:
     mean = self.ewma_trainer.average(self.mean) or self.epsilon
     variance = self.ewma_trainer.average(self.variance) or self.epsilon
     local_beta = tf.identity(self.beta)
     local_gamma = tf.identity(self.gamma)
     inference_bn = tf.mul((x-mean), tf.rsqrt(variance + self.epsilon), name="act1_bn")
     return tf.add(tf.mul(inference_bn, local_gamma), local_beta)

   mean, variance = tf.nn.moments(x, [0])
   store_mean = self.mean.assign(mean)
   store_variance = self.variance.assign(tf.mul(variance, self.keep_prob_prior))
   # Evaluating the output also refreshes the stored statistics.
   with tf.control_dependencies([store_mean, store_variance]):
     training_bn = tf.mul((x - mean), tf.rsqrt(variance + self.epsilon), name="act_bn")
     return tf.add(tf.mul(training_bn, self.gamma), self.beta)
     
Exemple #28
0
def get_weightnormed_matrix(shape, axis=1, name='weightnorm',
                            V_init=tf.random_normal_initializer(stddev=0.015),
                            train_gains=True, dtype=tf.float32,
                            trainable=True, squared=False):
    """Returns a matrix weightnormed across a given index.

    Creates up to two variables:
      - V, the unnormalised matrix, initialised with `V_init`
      - g, a column of gains initialised to 1s (only created when `axis` is
        truthy; for `axis=0` a constant gain of 1.0 is used)

    and returns g * V / norm, where the norm is the L2 norm of V taken along
    `axis` (or its square when `squared` is set).

    Args:
        shape: sequence of 2 ints. We are only dealing with matrices
            here.
        axis: how to do the normalising, defaults to 1, which is likely
            to be what you want if your data is `[batch_size x d]`.
        name: name for the scope, defaults to weightnorm
        V_init: initialiser for the unnormalised part of the matrix.
        train_gains: if false, gains will be always one.
        dtype: type for the created variables.
        trainable: whether the matrix should be added to the tensorflow
            trainable variables collection.
        squared: if true, don't take the square root and just divide by the
            squared norm.

    Returns:
        Tensor: the matrix whose rows or columns will never exceed the learned
            norm.

    Raises:
        ValueError: if `shape` does not have exactly two entries.
    """
    if len(shape) != 2:
        raise ValueError(
            'Expected two dimensional shape, but it is {}'.format(shape))

    with tf.name_scope(name):
        unnormed_w = tf.get_variable(
            name+'_V', shape,
            trainable=trainable, initializer=V_init, dtype=dtype)

        gains = 1.0
        if axis:
            gains = tf.get_variable(
                name+'_g', [shape[0], 1],
                trainable=train_gains,
                initializer=tf.constant_initializer(1.0),
                dtype=dtype)

        sqr_norms = tf.reduce_sum(
            tf.square(unnormed_w), axis=axis, keep_dims=True)
        inv_norms = 1.0 / sqr_norms if squared else tf.rsqrt(sqr_norms)

        return gains * unnormed_w * inv_norms
Exemple #29
0
def instance_norm(input, name="instance_norm"):
    """Instance normalization over axes 1 and 2 with learned scale and offset.

    `scale` and `offset` are created in the `name` scope and sized by the
    fourth dimension of `input`.
    """
    with tf.variable_scope(name):
        n_channels = input.get_shape()[3]
        scale = tf.get_variable(
            "scale", [n_channels],
            initializer=tf.random_normal_initializer(1.0, 0.02, dtype=tf.float32))
        offset = tf.get_variable(
            "offset", [n_channels], initializer=tf.constant_initializer(0.0))

        mean, variance = tf.nn.moments(input, axes=[1,2], keep_dims=True)
        inv_std = tf.rsqrt(variance + 1e-5)
        normalized = (input-mean)*inv_std
        return scale*normalized + offset
Exemple #30
0
def batch_norm(x,  name="batch_norm"):
    """Normalize `x` over axes 1 and 2, then apply a learned scale and center.

    `scale` and `center` are created in the `name` scope and sized by the
    fourth dimension of `x`.
    """
    with tf.variable_scope(name):
        n_channels = x.get_shape()[3]
        scale = tf.get_variable(
            "scale", [n_channels],
            initializer=tf.random_normal_initializer(1.0, 0.02, dtype=tf.float32))
        center = tf.get_variable(
            "center", [n_channels],
            initializer=tf.constant_initializer(0.0, dtype = tf.float32))

        mean, variance = tf.nn.moments(x, axes=[1,2], keep_dims=True)
        inv_std = tf.rsqrt(variance + 1e-6)
        standardized = (x-mean)*inv_std
        return standardized * scale + center
Exemple #31
0
def training(features, labels, mode):
    """Build the `tf.estimator.EstimatorSpec` for the cost-map / RRT* model.

    The cost network is instantiated twice through `net_archs.conv_net`
    (once with `is_training=True`, once with `reuse=True` and
    `is_training=False`). `rrt_star_module.rrt_star` is then run `N_REP_NET`
    times on the training-mode cost output, and the per-run outputs, path
    distances, dissimilarities and MSE values are averaged.

    Relies on names defined outside this function: `net_archs`,
    `rrt_star_module`, `metric_path_module`, `dropout`, `N_REP_NET`,
    `sum_eps` and `learning_rate`.

    Args:
        features: Mapping read at the keys 'images' and 'labels'.
        labels: Forwarded to `net_archs.conv_net`; everywhere else the
            function reads `features['labels']` instead.
        mode: A `tf.estimator.ModeKeys` value selecting which spec is built.

    Returns:
        A `tf.estimator.EstimatorSpec`: predictions only for PREDICT, loss
        and metrics for EVAL, and predictions, loss and train_op otherwise.
    """

    # Running extrema of the per-run path distance. They are only consumed by
    # summaries that are currently commented out below.
    dist_min = 9999999.0
    dist_max = 0.0
    dist = []

    # Same network built twice: a training-mode copy and a variable-sharing
    # (reuse=True) inference-mode copy.
    cost_train = net_archs.conv_net(features,
                                    labels,
                                    dropout,
                                    reuse=False,
                                    is_training=True)
    cost_test = net_archs.conv_net(features,
                                   labels,
                                   dropout,
                                   reuse=True,
                                   is_training=False)

    # First of the N_REP_NET planner runs (index 0).
    logits_test = rrt_star_module.rrt_star(features['images'], cost_train,
                                           features['labels'], 0)

    dist_paths_rrt, dissimilarity = metric_path_module.metric_path(
        logits_test, features['labels'], features['images'], 0)
    mse_rrt = tf.losses.mean_squared_error(logits_test, features['labels'])
    dist.append(dist_paths_rrt)

    dist_min = tf.cond(dist_paths_rrt < dist_min, lambda: dist_paths_rrt,
                       lambda: dist_min)
    dist_max = tf.cond(dist_paths_rrt > dist_max, lambda: dist_paths_rrt,
                       lambda: dist_max)

    # Remaining runs (indices 1 .. N_REP_NET - 1); results are accumulated
    # here and divided by N_REP_NET after the loop.
    for i in range(N_REP_NET - 1):
        logits_test_i = rrt_star_module.rrt_star(features['images'],
                                                 cost_train,
                                                 features['labels'], i + 1)
        # NOTE(review): the last argument stays 0 here while rrt_star receives
        # i + 1 -- confirm this is intended.
        dist_paths_rrt_aux, dissimilarity_aux = metric_path_module.metric_path(
            logits_test_i, features['labels'], features['images'], 0)
        mse_rrt += tf.losses.mean_squared_error(logits_test_i,
                                                features['labels'])
        logits_test += logits_test_i
        dissimilarity += dissimilarity_aux
        dist.append(dist_paths_rrt_aux)
        dist_paths_rrt += dist_paths_rrt_aux
        dist_min = tf.cond(dist_paths_rrt_aux < dist_min,
                           lambda: dist_paths_rrt_aux, lambda: dist_min)
        dist_max = tf.cond(dist_paths_rrt_aux > dist_max,
                           lambda: dist_paths_rrt_aux, lambda: dist_max)

    # Turn the accumulated sums into means over the runs.
    logits_test /= N_REP_NET
    dist_paths_rrt /= N_REP_NET
    dissimilarity /= N_REP_NET
    mse_rrt /= N_REP_NET

    tf_dist = tf.stack(dist)

    size_dist = float(N_REP_NET)

    # Standard deviation of the per-run distances around their mean, and the
    # standard error stddev / sqrt(N_REP_NET).
    stddev = tf.sqrt(
        tf.reduce_sum(tf.pow(tf_dist - dist_paths_rrt, 2)) / size_dist)
    stderror = stddev * tf.rsqrt(size_dist)

    mse_cost = tf.losses.mean_squared_error(cost_test,
                                            1.0 - features['labels'])
    log_likelihood = tf.reduce_sum(
        tf.multiply(cost_test, features['labels'] - logits_test + sum_eps))

    loss_op = log_likelihood
    # NOTE(review): log_image is never used after this assignment.
    log_image = tf.log(
        tf.clip_by_value(tf.reshape(cost_test, shape=[-1, 200, 200, 1]), 0,
                         0.1))

    # Image summaries; every tensor is reshaped to [-1, 200, 200, 1].
    tf.summary.image('images',
                     tf.reshape(features['images'], shape=[-1, 200, 200, 1]))
    tf.summary.image('rrt_in', tf.reshape(cost_test, shape=[-1, 200, 200, 1]))
    tf.summary.image('rrt_out', tf.reshape(logits_test,
                                           shape=[-1, 200, 200, 1]))
    tf.summary.image('label',
                     tf.reshape(features['labels'], shape=[-1, 200, 200, 1]))

    #~ tf.summary.scalar('dist_between_paths', dist_paths_rrt)
    #~ tf.summary.scalar('dist_min', dist_min)
    #~ tf.summary.scalar('dist_max', dist_max)
    #~ tf.summary.scalar('dissimilarity', dissimilarity)
    tf.summary.scalar('mse_rrt', mse_rrt)
    tf.summary.scalar('mse_cost', mse_cost)
    tf.summary.scalar('log_likelihood', log_likelihood)
    #~ tf.summary.scalar('stderror', stderror)

    # The optimizer and train op are created for every mode, although only the
    # final (training) branch below passes train_op on.
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
    train_op = optimizer.minimize(mse_rrt,
                                  global_step=tf.train.get_global_step())

    if mode == tf.estimator.ModeKeys.PREDICT:
        #estim_specs = tf.estimator.EstimatorSpec(mode, predictions=cost_test)
        estim_specs = tf.estimator.EstimatorSpec(mode, predictions=logits_test)
    elif mode == tf.estimator.ModeKeys.EVAL:
        metrics = {
            'mse_rrt': tf.metrics.mean(mse_rrt),
            'mse_cost': tf.metrics.mean(mse_cost),
            'log_likelihood': tf.metrics.mean(log_likelihood)
        }
        # Evaluation reports loss_op (log_likelihood), whereas training
        # minimizes and reports mse_rrt.
        estim_specs = tf.estimator.EstimatorSpec(mode,
                                                 loss=loss_op,
                                                 eval_metric_ops=metrics)
    else:
        estim_specs = tf.estimator.EstimatorSpec(mode=mode,
                                                 predictions=logits_test,
                                                 loss=mse_rrt,
                                                 train_op=train_op)
    return estim_specs
Exemple #32
0
def pixel_norm(x, epsilon=1e-8):
    """Divide `x` by the root-mean-square of its values along the last axis.

    Args:
        x: Tensor to normalize.
        epsilon: Added to the mean square before the reciprocal square root.

    Returns:
        `x * rsqrt(mean(x ** 2, axis=-1, keepdims=True) + epsilon)`.
    """
    mean_square = tf.reduce_mean(tf.square(x), axis=-1, keepdims=True)
    inv_rms = tf.rsqrt(mean_square + epsilon)
    return x * inv_rms
Exemple #33
0
    def model_fn(features, labels, mode, params):
        """Build the `TPUEstimatorSpec` for the transformer model.

        Uses names from the enclosing scope: `bert_config`,
        `init_checkpoint`, `use_tpu` and `learning_rate`.

        Args:
            features: Mapping read at the keys 'input_ids' and 'target_ids'.
            labels: Not read by this function.
            mode: A `tf.estimator.ModeKeys` value; only TRAIN and EVAL are
                supported.
            params: Not read by this function.

        Returns:
            A `tf.contrib.tpu.TPUEstimatorSpec` carrying the loss and, for
            TRAIN, the train op.

        Raises:
            ValueError: If `mode` is neither TRAIN nor EVAL.
        """
        tf.logging.info('*** Features ***')
        for name in sorted(features.keys()):
            tf.logging.info(
                '  name = %s, shape = %s' % (name, features[name].shape)
            )

        inputs = features['input_ids']
        targets = features['target_ids']

        is_training = mode == tf.estimator.ModeKeys.TRAIN

        model = modeling.TransformerModel(bert_config)
        # Only `logits` is used below; `llh` and `pred_ids` are not read.
        (llh, logits, pred_ids), _ = model(
            inputs, target_ids=targets, training=is_training
        )

        total_loss = padded_cross_entropy_loss(
            logits,
            targets,
            bert_config['label_smoothing'],
            bert_config['vocab_size'],
        )

        tvars = tf.trainable_variables()

        # Optional warm start from `init_checkpoint`. On TPU the
        # initialization is deferred into a scaffold function.
        initialized_variable_names = {}
        scaffold_fn = None
        if init_checkpoint:
            (
                assignment_map,
                initialized_variable_names,
            ) = get_assignment_map_from_checkpoint(tvars, init_checkpoint)
            if use_tpu:

                def tpu_scaffold():
                    tf.train.init_from_checkpoint(
                        init_checkpoint, assignment_map
                    )
                    return tf.train.Scaffold()

                scaffold_fn = tpu_scaffold
            else:
                tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

        tf.logging.info('**** Trainable Variables ****')
        print(initialized_variable_names)
        for var in tvars:
            init_string = ''
            if var.name in initialized_variable_names:
                init_string = ', *INIT_FROM_CKPT*'
            tf.logging.info(
                '  name = %s, shape = %s%s', var.name, var.shape, init_string
            )

        output_spec = None
        if mode == tf.estimator.ModeKeys.TRAIN:

            init_lr = learning_rate
            global_step = tf.train.get_global_step()
            # rsqrt(10000) == 0.01, so the rate equals init_lr while
            # global_step <= 10000 and afterwards decays as
            # init_lr * 100 / sqrt(global_step).
            lr = (
                init_lr
                / 0.01
                * tf.rsqrt(tf.maximum(tf.to_float(global_step), 10000))
            )

            optimizer = adafactor.AdafactorOptimizer(
                learning_rate=lr,
                decay_rate=adafactor.adafactor_decay_rate_pow(0.8),
                beta1=0.0,
            )
            if use_tpu:
                optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)

            train_op = optimizer.minimize(total_loss, global_step=global_step)

            # if not bert_config['use_bias']:
            #     logging.info('Fixing position embedding, i.e. not trainable.')
            #     posemb = 'pegasus/embeddings/position_embeddings'
            #     tvars = list(
            #         filter(lambda v: v.name.split(':')[0] != posemb, tvars)
            #     )

            # gradients = optimizer.compute_gradients(total_loss, tvars)

            # train_op = optimization.create_optimizer(
            #     total_loss,
            #     learning_rate,
            #     num_train_steps,
            #     num_warmup_steps,
            #     use_tpu,
            # )

            output_spec = tf.contrib.tpu.TPUEstimatorSpec(
                mode=mode,
                loss=total_loss,
                train_op=train_op,
                scaffold_fn=scaffold_fn,
            )
        elif mode == tf.estimator.ModeKeys.EVAL:

            # Evaluation reports the loss only; no extra metrics are attached.
            output_spec = tf.contrib.tpu.TPUEstimatorSpec(
                mode=mode,
                loss=total_loss,
                eval_metrics=None,
                scaffold_fn=scaffold_fn,
            )
        else:
            raise ValueError(
                'Only TRAIN and EVAL modes are supported: %s' % (mode)
            )

        return output_spec
Exemple #34
0
 def __call__(self, x):
     """Normalize `x` over its last axis, then apply `self.g` and `self.b`.

     Asserts that the last dimension of `x` matches the shapes of `self.g`
     and `self.b`.

     Returns:
         `(x - mean) * rsqrt(variance + self.eps) * self.g + self.b`, with
         mean and variance taken along the last axis.
     """
     assert [x.shape[-1]] == self.g.shape == self.b.shape
     mean = tf.reduce_mean(x, axis=-1, keepdims=True)
     variance = tf.reduce_mean(
         tf.squared_difference(x, mean), axis=-1, keepdims=True)
     normalized = (x - mean) * tf.rsqrt(variance + self.eps)
     return normalized * self.g + self.b
 def call(self, x, epsilon=1e-6):
     """Normalize `x` over its last axis and apply `self.scale`/`self.bias`.

     Args:
         x: Tensor to normalize.
         epsilon: Added to the variance before the reciprocal square root.

     Returns:
         `(x - mean) * rsqrt(variance + epsilon) * self.scale + self.bias`.
     """
     last_axis = [-1]
     mean = tf.reduce_mean(x, axis=last_axis, keepdims=True)
     variance = tf.reduce_mean(
         tf.square(x - mean), axis=last_axis, keepdims=True)
     inv_stddev = tf.rsqrt(variance + epsilon)
     return (x - mean) * inv_stddev * self.scale + self.bias
Exemple #36
0
 def _learning_rate_default(self, multiply_by_parameter_scale):
     """Return the default learning-rate tensor.

     The base rate is `min(rsqrt(step_num() + 1.0), 0.01)`. When
     `multiply_by_parameter_scale` is falsy the base rate is further
     multiplied by 0.05.
     """
     base_rate = tf.minimum(tf.rsqrt(step_num() + 1.0), 0.01)
     if multiply_by_parameter_scale:
         return base_rate
     return base_rate * 0.05
Exemple #37
0
def apply_cmvn(feats, mean, variance, epsilon=1e-9):
    ''' Normalize `feats` with the supplied `mean` and `variance`.

    Returns `(feats - mean) * rsqrt(variance + epsilon)`.
    '''
    centered = feats - mean
    inv_stddev = tf.rsqrt(variance + epsilon)
    return centered * inv_stddev
Exemple #38
0
def tfe_rsqrt(t):
  """Return `tf.rsqrt(t)`, the reciprocal square root of `t`."""
  reciprocal_sqrt = tf.rsqrt(t)
  return reciprocal_sqrt
Exemple #39
0
def fit_one_step(
    model_matrix,
    response,
    model,
    model_coefficients_start=None,
    predicted_linear_response_start=None,
    l2_regularizer=None,
    dispersion=None,
    offset=None,
    learning_rate=None,
    fast_unsafe_numerics=True,
    name=None):
  """Runs one step of Fisher scoring.

  Args:
    model_matrix: (Batch of) `float`-like, matrix-shaped `Tensor` where each row
      represents a sample's features.
    response: (Batch of) vector-shaped `Tensor` where each element represents a
      sample's observed response (to the corresponding row of features). Must
      have same `dtype` as `model_matrix`.
    model: `tfp.glm.ExponentialFamily`-like instance used to construct the
      negative log-likelihood loss, gradient, and expected Hessian (i.e., the
      Fisher information matrix).
    model_coefficients_start: Optional (batch of) vector-shaped `Tensor`
      representing the initial model coefficients, one for each column in
      `model_matrix`. Must have same `dtype` as `model_matrix`.
      Default value: Zeros.
    predicted_linear_response_start: Optional `Tensor` with `shape`, `dtype`
      matching `response`; represents `offset` shifted initial linear
      predictions based on `model_coefficients_start`.
      Default value: `offset` if `model_coefficients_start is None`, and
      `tfp.math.matvecmul(model_matrix, model_coefficients_start) + offset`
      otherwise.
    l2_regularizer: Optional scalar `Tensor` representing L2 regularization
      penalty, i.e.,
      `loss(w) = sum{-log p(y[i]|x[i],w) : i=1..n} + l2_regularizer ||w||_2^2`.
      Default value: `None` (i.e., no L2 regularization).
    dispersion: Optional (batch of) `Tensor` representing `response` dispersion,
      i.e., as in, `p(y|theta) := exp((y theta - A(theta)) / dispersion)`.
      Must broadcast with rows of `model_matrix`.
      Default value: `None` (i.e., "no dispersion").
    offset: Optional `Tensor` representing constant shift applied to
      `predicted_linear_response`.  Must broadcast to `response`.
      Default value: `None` (i.e., `tf.zeros_like(response)`).
    learning_rate: Optional (batch of) scalar `Tensor` used to dampen iterative
      progress. Typically only needed if optimization diverges, should be no
      larger than `1` and typically very close to `1`.
      Default value: `None` (i.e., `1`).
    fast_unsafe_numerics: Optional Python `bool` indicating if solve should be
      based on Cholesky or QR decomposition.
      Default value: `True` (i.e., "prefer speed via Cholesky decomposition").
    name: Python `str` used as name prefix to ops created by this function.
      Default value: `"fit_one_step"`.

  Returns:
    model_coefficients: (Batch of) vector-shaped `Tensor`; represents the
      next estimate of the model coefficients, one for each column in
      `model_matrix`.
    predicted_linear_response: `response`-shaped `Tensor` representing linear
      predictions based on new `model_coefficients`, i.e.,
      `tfp.math.matvecmul(model_matrix, model_coefficients_next) + offset`.
  """
  graph_deps = [model_matrix, response, model_coefficients_start,
                predicted_linear_response_start, dispersion, learning_rate]
  with tf.name_scope(name, 'fit_one_step', graph_deps):

    # Normalize the inputs and fill in defaults via the shared helper.
    [
        model_matrix,
        response,
        model_coefficients_start,
        predicted_linear_response_start,
        offset,
    ] = prepare_args(
        model_matrix,
        response,
        model_coefficients_start,
        predicted_linear_response_start,
        offset)

    # Compute: mean, grad(mean, predicted_linear_response_start), and variance.
    mean, variance, grad_mean = model(predicted_linear_response_start)

    # If either `grad_mean` or `variance` is non-finite or zero, then we'll
    # replace it with a value such that the row is zeroed out. Although this
    # procedure may seem circuitous, it is necessary to ensure this algorithm is
    # itself differentiable.
    is_valid = (tf.is_finite(grad_mean) & tf.not_equal(grad_mean, 0.) &
                tf.is_finite(variance) & (variance > 0.))
    def mask_if_invalid(x, mask):
      # Keep `x` where the sample is valid; elsewhere substitute the constant
      # `mask` (cast to the dtype of `x`).
      mask = tf.fill(tf.shape(x), value=np.array(mask, x.dtype.as_numpy_dtype))
      return tf.where(is_valid, x, mask)

    # Run one step of iteratively reweighted least-squares.
    # Compute "`z`", the adjusted predicted linear response.
    # z = predicted_linear_response_start
    #     + learning_rate * (response - mean) / grad_mean
    # Invalid `grad_mean` entries are replaced by 1 so the division stays
    # finite for those rows.
    z = (response - mean) / mask_if_invalid(grad_mean, 1.)
    # TODO(jvdillon): Rather than use learning rate, we should consider using
    # backtracking line search.
    if learning_rate is not None:
      z *= learning_rate[..., tf.newaxis]
    z += predicted_linear_response_start

    # Compute "`w`", the per-sample weight.
    if dispersion is not None:
      # For convenience, we'll now scale the variance by the dispersion factor.
      variance *= dispersion
    # For invalid rows grad_mean becomes 0 and variance becomes inf, so the
    # weight is 0 * rsqrt(inf) = 0 and the row drops out of the solve.
    w = (mask_if_invalid(grad_mean, 0.) *
         tf.rsqrt(mask_if_invalid(variance, np.inf)))

    a = model_matrix * w[..., tf.newaxis]
    b = z * w
    # Solve `min{ || A @ model_coefficients - b ||_2**2 : model_coefficients }`
    # where `@` denotes `matmul`.

    # Use a statically known regularizer value when one can be extracted.
    if l2_regularizer is None:
      l2_regularizer = np.array(0, a.dtype.as_numpy_dtype)
    else:
      l2_regularizer_ = distributions_util.maybe_get_static_value(
          l2_regularizer, a.dtype.as_numpy_dtype)
      if l2_regularizer_ is not None:
        l2_regularizer = l2_regularizer_

    def _embed_l2_regularization():
      """Adds synthetic observations to implement L2 regularization."""
      # `tf.matrix_solve_ls` does not respect the `l2_regularization` argument
      # when `fast_unsafe_numerics` is `False`. This function adds synthetic
      # observations to the data to implement the regularization instead.
      # Adding observations `sqrt(l2_regularizer) * I` is mathematically
      # equivalent to adding the term
      # `-l2_regularizer ||coefficients||_2**2` to the log-likelihood.
      num_model_coefficients = num_cols(model_matrix)
      batch_shape = tf.shape(model_matrix)[:-2]
      eye = tf.eye(
          num_model_coefficients, batch_shape=batch_shape, dtype=a.dtype)
      a_ = tf.concat([a, tf.sqrt(l2_regularizer) * eye], axis=-2)
      b_ = distributions_util.pad(
          b, count=num_model_coefficients, axis=-1, back=True)
      # Return l2_regularizer=0 since it's now embedded.
      l2_regularizer_ = np.array(0, a.dtype.as_numpy_dtype)
      return a_, b_, l2_regularizer_

    # Embed the regularizer into the data only when the QR path is requested
    # and the regularizer is positive.
    a, b, l2_regularizer = smart_cond.smart_cond(
        smart_reduce_all([not(fast_unsafe_numerics),
                          l2_regularizer > 0.]),
        _embed_l2_regularization,
        lambda: (a, b, l2_regularizer))

    model_coefficients_next = tf.matrix_solve_ls(
        a, b[..., tf.newaxis],
        fast=fast_unsafe_numerics,
        l2_regularizer=l2_regularizer,
        name='model_coefficients_next')
    # Drop the trailing axis that was added to `b` for the solve.
    model_coefficients_next = model_coefficients_next[..., 0]

    # TODO(b/79122261): The approach used in `matrix_solve_ls` could be made
    # faster by avoiding explicitly forming Q and instead keeping the
    # factorization in 'implicit' form with stacked (rescaled) Householder
    # vectors underneath the 'R' and then applying the (accumulated)
    # reflectors in the appropriate order to apply Q'. However, we don't
    # presently do this because we lack core TF functionality. For reference,
    # the vanilla QR approach is:
    #   q, r = tf.linalg.qr(a)
    #   c = tf.matmul(q, b, adjoint_a=True)
    #   model_coefficients_next = tf.matrix_triangular_solve(
    #       r, c, lower=False, name='model_coefficients_next')

    predicted_linear_response_next = calculate_linear_predictor(
        model_matrix,
        model_coefficients_next,
        offset,
        name='predicted_linear_response_next')

    return model_coefficients_next, predicted_linear_response_next
Exemple #40
0
 def testRenames(self):
     """Check `tf.acos` and `tf.rsqrt` against known values."""
     cases = (
         (1.04719755, tf.acos(0.5)),
         (0.5, tf.rsqrt(4.0)),
     )
     for expected, actual in cases:
         self.assertAllClose(expected, actual)
Exemple #41
0
    def bottom(self, x):
        """Use batchnorm instead of CMVN and shorten the stft with strided convs.

        When `audio_preproc_in_bottom` is set, mel filterbank features are
        computed from the input here and normalized with statistics
        estimated from the data; otherwise the input is used as is. Two
        stride-2 convolutions and a final frequency-collapsing convolution
        follow.

        Args:
          x: float32 tensor with shape [batch_size, len, 1, freqs * channels]

        Returns:
          float32 tensor with shape [batch_size, shorter_len, 1, hidden_size]
        """
        inputs = x
        p = self._model_hparams

        num_mel_bins = p.audio_num_mel_bins
        num_channels = 3 if p.audio_add_delta_deltas else 1

        with tf.variable_scope(self.name):
            if p.audio_preproc_in_bottom:
                # Compute filterbanks
                with tf.variable_scope("fbanks"):
                    waveforms = tf.squeeze(inputs, [2, 3])
                    mel_fbanks = common_audio.compute_mel_filterbank_features(
                        waveforms,
                        sample_rate=p.audio_sample_rate,
                        dither=p.audio_dither,
                        preemphasis=p.audio_preemphasis,
                        frame_length=p.audio_frame_length,
                        frame_step=p.audio_frame_step,
                        lower_edge_hertz=p.audio_lower_edge_hertz,
                        upper_edge_hertz=p.audio_upper_edge_hertz,
                        num_mel_bins=p.audio_num_mel_bins,
                        apply_mask=True)
                    if p.audio_add_delta_deltas:
                        mel_fbanks = common_audio.add_delta_deltas(mel_fbanks)
                    # Split the trailing feature axis into
                    # [num_mel_bins, num_channels].
                    x = tf.reshape(
                        mel_fbanks,
                        common_layers.shape_list(mel_fbanks)[:2] +
                        [num_mel_bins, num_channels])

                    nonpadding_mask = 1. - common_attention.embedding_to_padding(
                        x)
                    num_of_nonpadding_elements = tf.reduce_sum(
                        nonpadding_mask) * num_mel_bins * num_channels

                    # This replaces CMVN estimation on data
                    var_epsilon = 1e-09
                    mean = tf.reduce_sum(x, axis=[
                        1
                    ], keepdims=True) / num_of_nonpadding_elements
                    # Expanded form of sum((x - mean)**2) / N, with
                    # N = num_of_nonpadding_elements:
                    # (N * mean**2 - 2 * mean * sum(x) + sum(x**2)) / N.
                    variance = (
                        num_of_nonpadding_elements * mean**2. -
                        2. * mean * tf.reduce_sum(x, axis=[1], keepdims=True) +
                        tf.reduce_sum(x**2, axis=[1], keepdims=True)
                    ) / num_of_nonpadding_elements
                    # Normalize, then multiply by the non-padding mask.
                    x = (x - mean) * tf.rsqrt(variance +
                                              var_epsilon) * tf.expand_dims(
                                                  nonpadding_mask, -1)
            else:
                x = inputs

            # The convention is that the models are flattened along the spatial,
            # dimensions, thus the speech preprocessor treats frequencies and
            # channels as image colors (last axis)
            x.set_shape([None, None, num_mel_bins, num_channels])

            # TODO(chorowski): how to specify bottom's hparams and avoid hardcoding?
            # Pad 8 extra steps at the end of axis 1 before the two stride-2
            # convolutions.
            x = tf.pad(x, [[0, 0], [0, 8], [0, 0], [0, 0]])
            for _ in range(2):
                x = tf.layers.conv2d(x, 128, (3, 3), (2, 2), use_bias=False)
                x = common_layers.layer_norm(x)
                x = tf.nn.relu(x)

            xshape = common_layers.shape_list(x)
            # apply a conv that will remove all frequencies and at the same time
            # project the output into desired hidden_size
            x = tf.pad(x, [[0, 0], [0, 2], [0, 0], [0, 0]])
            x = tf.layers.conv2d(x,
                                 p.hidden_size, (3, xshape[2]),
                                 use_bias=False)

            # The kernel spans the full axis-2 extent, so that axis must
            # collapse to size 1.
            assert common_layers.shape_list(x)[2] == 1
            x = common_layers.layer_norm(x)
            x = tf.nn.relu(x)
        return x
Exemple #42
0
    def call(self, inputs, training=None):
        """Normalize `inputs` and rescale them by their per-sample norm.

        Inference uses the stored moving mean and variance. Training uses
        statistics computed from `inputs` and registers moving-average
        updates for both stored statistics through `self.add_update`.

        Args:
            inputs: Tensor whose static rank is available through
                `K.int_shape`.
            training: `False`/`0` builds only the inference path. Any other
                value builds both paths and lets `K.in_train_phase` choose
                between them.

        Returns:
            The normalized tensor, multiplied by `self.gamma` when
            `self.scale` is set and shifted by `self.beta` when
            `self.center` is set.
        """
        axis = self.axis
        input_shape = K.int_shape(inputs)
        ndim = len(input_shape)
        dim = input_shape[axis]
        dtype = K.dtype(inputs)

        if ndim > 2:
            # Every non-batch axis other than the normalized one.
            image_axes = [ax for ax in range(1, ndim) if ax != axis]
            if self.image_size is not None:
                scale_squared_norm = tf.cast(1.0 / self.image_size, dtype=dtype)
            else:
                num_pixels = K.prod([K.shape(inputs)[ax] for ax in image_axes])
                scale_squared_norm = 1.0 / K.cast(num_pixels, dtype=dtype)
            if self.scale_coe != 1.0:
                scale_squared_norm /= self.scale_coe

        # Shape that broadcasts a per-feature vector against `inputs`.
        broadcast_shape = [1] * ndim
        broadcast_shape[axis] = dim

        def unitized_inference():
            """Normalize with the stored moving statistics."""
            broadcasted_moving_mean = K.reshape(
                self.moving_mean, broadcast_shape
            )
            broadcasted_moving_variance = K.reshape(
                self.moving_variance, broadcast_shape
            )

            broadcasted_moving_variance += self.epsilon
            scale = tf.rsqrt(broadcasted_moving_variance)
            centered_inputs = inputs - broadcasted_moving_mean
            if ndim > 2:
                squared_inputs = tf.reduce_mean(
                    centered_inputs**2, image_axes, True
                )
            else:
                squared_inputs = centered_inputs**2
            normalized_inputs = squared_inputs / broadcasted_moving_variance
            squared_norm = tf.reduce_sum(normalized_inputs, [axis], True)
            if ndim > 2:
                squared_norm *= scale_squared_norm

            # alpha blends between dividing by the norm and leaving the
            # scale unchanged.
            alpha = K.reshape(self.alpha, broadcast_shape)
            scale *= alpha * tf.rsqrt(squared_norm + self.epsilon) + (1 - alpha)
            if self.scale:
                scale *= K.reshape(self.gamma, broadcast_shape)
            outputs = scale * centered_inputs

            if self.center:
                outputs += K.reshape(self.beta, broadcast_shape)

            return outputs

        if training in {0, False}:
            # Explicit inference: return the inference path directly. The
            # previous code called the undefined name `unitize_inference`
            # here, which raised NameError.
            return unitized_inference()

        reduction_axes = [ax for ax in range(ndim) if ax != axis]
        mean = tf.reduce_mean(inputs, reduction_axes, False)
        broadcasted_mean = K.reshape(mean, broadcast_shape)
        centered_inputs = inputs - broadcasted_mean
        if ndim > 2:
            squared_inputs = tf.reduce_mean(
                centered_inputs**2, image_axes, True
            )
        else:
            squared_inputs = centered_inputs**2
        broadcasted_variance = tf.reduce_mean(squared_inputs, [0], True)
        # The loop variable is deliberately not called `axis`: under Python 2
        # a comprehension variable leaks and would clobber the outer `axis`
        # that is still needed below.
        sample_size = K.prod([
            K.shape(inputs)[ax] for ax in reduction_axes
        ])
        sample_size = K.cast(sample_size, dtype=dtype)
        # Rescale by n / (n - (1 + epsilon)) before the value is stored as
        # the moving variance.
        broadcasted_variance *= sample_size / (sample_size - (1.0 + self.epsilon))
        variance = tf.squeeze(broadcasted_variance, reduction_axes)

        self.add_update(
            [
                K.moving_average_update(
                    self.moving_mean, mean, self.momentum
                ),
                K.moving_average_update(
                    self.moving_variance, variance, self.momentum
                )
            ],
            inputs
        )

        broadcasted_variance += self.epsilon
        scale = tf.rsqrt(broadcasted_variance)

        normalized_inputs = squared_inputs / broadcasted_variance
        squared_norm = tf.reduce_sum(normalized_inputs, [axis], True)
        if ndim > 2:
            squared_norm *= scale_squared_norm
        alpha = K.reshape(self.alpha, broadcast_shape)
        scale *= alpha * tf.rsqrt(squared_norm + self.epsilon) + (1 - alpha)

        if self.scale:
            scale *= K.reshape(self.gamma, broadcast_shape)

        unitized_inputs = scale * centered_inputs

        if self.center:
            unitized_inputs += K.reshape(self.beta, broadcast_shape)

        return K.in_train_phase(
            unitized_inputs, unitized_inference, training=training
        )
def pixel_norm(x, epsilon=1e-8):
    """Divide `x` by the root-mean-square of its values along axis 1.

    The ops are created inside the "PixelNorm" variable scope.

    Args:
        x: Tensor to normalize.
        epsilon: Added to the mean square before the reciprocal square root.

    Returns:
        `x * rsqrt(mean(x ** 2, axis=1, keepdims=True) + epsilon)`.
    """
    with tf.variable_scope("PixelNorm"):
        mean_square = tf.reduce_mean(tf.square(x), axis=1, keepdims=True)
        inv_rms = tf.rsqrt(mean_square + epsilon)
        return x * inv_rms
def modulated_conv2d_layer(x,
                           y,
                           fmaps,
                           kernel,
                           up=False,
                           down=False,
                           demodulate=True,
                           resample_kernel=None,
                           gain=1,
                           use_wscale=True,
                           lrmul=1,
                           fused_modconv=True,
                           weight_var='weight',
                           mod_weight_var='mod_weight',
                           mod_bias_var='mod_bias'):
    """Convolve `x` with weights modulated per sample by a style from `y`.

    The convolution weight is scaled per input feature map by a style
    computed from `y`, optionally demodulated per output feature map, and
    then applied as an NCHW convolution with optional up/downsampling.

    Args:
        x: Input activations; dimension 1 is read as the number of input
            feature maps and the convolutions use `data_format='NCHW'`.
        y: Input to the dense layer that produces the per-sample style.
        fmaps: Number of output feature maps.
        kernel: Kernel size; must be odd and at least 1.
        up: Use `upsample_conv_2d` for the convolution.
        down: Use `conv_downsample_2d`; mutually exclusive with `up`.
        demodulate: Rescale the weights per output feature map by the
            reciprocal square root of their summed squares.
        resample_kernel: Passed as `k` to the resampling convolutions.
        gain, use_wscale, lrmul, weight_var: Forwarded to `get_weight`.
        fused_modconv: If true, fold the minibatch into convolution groups
            and run one convolution with the per-sample weights; otherwise
            scale the activations before and after a shared-weight
            convolution.
        mod_weight_var: Variable name given to the style dense layer.
        mod_bias_var: Variable name given to the style bias.

    Returns:
        The convolved activations.
    """
    assert not (up and down)
    assert kernel >= 1 and kernel % 2 == 1

    # Get weight.
    w = get_weight([kernel, kernel, x.shape[1].value, fmaps],
                   gain=gain,
                   use_wscale=use_wscale,
                   lrmul=lrmul,
                   weight_var=weight_var)
    ww = w[np.newaxis]  # [BkkIO] Introduce minibatch dimension.

    # Modulate.
    s = dense_layer(
        y, fmaps=x.shape[1].value,
        weight_var=mod_weight_var)  # [BI] Transform incoming W to style.
    s = apply_bias_act(
        s, bias_var=mod_bias_var) + 1  # [BI] Add bias (initially 1).
    ww *= tf.cast(s[:, np.newaxis, np.newaxis, :, np.newaxis],
                  w.dtype)  # [BkkIO] Scale input feature maps.

    # Demodulate.
    if demodulate:
        # 1e-8 keeps rsqrt finite when the summed squares are zero.
        d = tf.rsqrt(tf.reduce_sum(tf.square(ww), axis=[1, 2, 3]) +
                     1e-8)  # [BO] Scaling factor.
        ww *= d[:, np.newaxis, np.newaxis,
                np.newaxis, :]  # [BkkIO] Scale output feature maps.

    # Reshape/scale input.
    if fused_modconv:
        x = tf.reshape(x,
                       [1, -1, x.shape[2], x.shape[3]
                        ])  # Fused => reshape minibatch to convolution groups.
        w = tf.reshape(tf.transpose(ww, [1, 2, 3, 0, 4]),
                       [ww.shape[1], ww.shape[2], ww.shape[3], -1])
    else:
        x *= tf.cast(s[:, :, np.newaxis, np.newaxis],
                     x.dtype)  # [BIhw] Not fused => scale input activations.

    # Convolution with optional up/downsampling.
    if up:
        x = upsample_conv_2d(x,
                             tf.cast(w, x.dtype),
                             data_format='NCHW',
                             k=resample_kernel)
    elif down:
        x = conv_downsample_2d(x,
                               tf.cast(w, x.dtype),
                               data_format='NCHW',
                               k=resample_kernel)
    else:
        x = tf.nn.conv2d(x,
                         tf.cast(w, x.dtype),
                         data_format='NCHW',
                         strides=[1, 1, 1, 1],
                         padding='SAME')

    # Reshape/scale output.
    if fused_modconv:
        x = tf.reshape(
            x, [-1, fmaps, x.shape[2], x.shape[3]
                ])  # Fused => reshape convolution groups back to minibatch.
    elif demodulate:
        x *= tf.cast(d[:, :, np.newaxis, np.newaxis],
                     x.dtype)  # [BOhw] Not fused => scale output activations.
    return x
def normalize_vector(d, scope=None):
    """Scale `d` to (approximately) unit L2 norm over all non-leading axes.

    Args:
        d: Tensor with a statically known rank (`len(d.shape)` is used).
        scope: Optional name scope; defaults to 'norm_vec'.

    Returns:
        `d * rsqrt(1e-6 + sum(d ** 2))`, where the sum runs over every axis
        except axis 0 and keeps its dimensions for broadcasting.
    """
    with tf.name_scope(scope, 'norm_vec'):
        # Materialize the axes as a list: under Python 3 `range` is a lazy
        # object that TensorFlow cannot convert to an int32 axis tensor.
        reduce_axes = list(range(1, len(d.shape)))
        squared_norm = tf.reduce_sum(
            tf.square(d), axis=reduce_axes, keep_dims=True)
        # 1e-6 keeps the reciprocal square root finite for all-zero inputs.
        output = d * tf.rsqrt(1e-6 + squared_norm)
    return output
Exemple #46
0
def apply_local_cmvn(feats, epsilon=1e-9):
    ''' Normalize `feats` with mean and variance computed along axis 1.

    feats: (NHWC)
    Returns `(feats - mean) * rsqrt(var + epsilon)`; both statistics are
    re-expanded on axis 1 so they broadcast against `feats`.
    '''
    axis1_mean = keras_backend.mean(feats, axis=1)
    axis1_var = keras_backend.var(feats, axis=1)
    centered = feats - tf.expand_dims(axis1_mean, axis=1)
    inv_stddev = tf.rsqrt(tf.expand_dims(axis1_var, axis=1) + epsilon)
    return centered * inv_stddev
Exemple #47
0
else:
    ff_loss_reg = ff_loss

# Plain gradient-descent step on the (possibly regularized) loss.
opt = tf.train.GradientDescentOptimizer(learning_rate).minimize(
    ff_loss_reg, global_step=batch)

# One assign op per variable in S_vars: a group-wise shrink over the last
# axis ("GS") followed by an element-wise shrink ("ES").
op_list = []
if FLAGS.cges:
    # Normalization parameter
    glayerwise = utils.glayerwise
    elayerwise = utils.elayerwise
    for vind, var in enumerate(S_vars):
        # GS
        group_sum = tf.reduce_sum(tf.square(var), -1)
        g_param = learning_rate * FLAGS.lamb * (FLAGS.mu - vind * FLAGS.chvar)
        gl_comp = 1. - g_param * glayerwise[vind] * tf.rsqrt(group_sum)
        # Keep only the positive part. A select is used instead of a 0/1 mask
        # multiply: for an all-zero group rsqrt(0) is inf, gl_comp becomes
        # -inf (or NaN), and 0 * -inf would write NaN into the variable.
        gl_plus = tf.where(gl_comp > 0, gl_comp, tf.zeros_like(gl_comp))
        gl_stack = tf.stack([gl_plus for _ in range(var.get_shape()[-1])], -1)
        gl_op = gl_stack * var

        # ES
        e_param = learning_rate * FLAGS.lamb * (
            (1. - FLAGS.mu) + vind * FLAGS.chvar)
        W_sum = e_param * elayerwise[vind] * tf.reduce_sum(tf.abs(gl_op), -1)
        W_sum_stack = tf.stack([W_sum for _ in range(gl_op.get_shape()[-1])],
                               -1)
        el_comp = tf.abs(gl_op) - W_sum_stack
        el_plus = tf.cast(el_comp > 0, tf.float32) * el_comp
        # Restore the sign after thresholding the magnitudes.
        cges_op = var.assign(el_plus * tf.sign(gl_op))
        op_list.append(cges_op)
Exemple #48
0
def layer_norm_compute_python(x, epsilon, scale, bias):
    """Normalize `x` over its last axis, then apply `scale` and `bias`.

    Args:
        x: Input tensor.
        epsilon: Added to the variance before the reciprocal square root.
        scale: Multiplier applied to the normalized values.
        bias: Offset added after scaling.

    Returns:
        ``(x - mean) * rsqrt(variance + epsilon) * scale + bias`` where mean
        and variance are taken over the last axis.
    """
    last_axis_mean = tf.reduce_mean(x, axis=[-1], keep_dims=True)
    centered = x - last_axis_mean
    last_axis_var = tf.reduce_mean(
        tf.square(centered), axis=[-1], keep_dims=True)
    normalized = centered * tf.rsqrt(last_axis_var + epsilon)
    return normalized * scale + bias
    def layerfrn(input_tensor, name, eps=1e-6, learn_eps=True, scale=True):
        """
        Normalize by the mean square over axes [1, 2], apply a per-channel
        affine transform, then clamp from below with a learned threshold.

        :param input_tensor: tensor to normalize; the per-channel variables
            have shape [1, 1, 1, C] where C is the static size of its last axis
        :param name: variable scope under which all variables are created
        :param eps: initial value of the 'eps' variable; also used as the
            initial value of 't_thresh'
        :param learn_eps: whether the 'eps' variable is trainable
        :param scale: whether the 'gamma' variable is trainable
        :return: tf.maximum(gamma * normalized + beta, t_thresh)
        """
        with tf.variable_scope(name_or_scope=name):
            input_channels = input_tensor.get_shape().as_list()[-1]
            per_channel_shape = [1, 1, 1, input_channels]

            def _channel_variable(var_name, initializer, trainable):
                # Every variable of this layer shares shape and dtype.
                return tf.get_variable(name=var_name,
                                       shape=per_channel_shape,
                                       dtype=tf.float32,
                                       initializer=initializer,
                                       trainable=trainable)

            # compute norm
            norm_square = tf.reduce_mean(
                input_tensor=tf.pow(input_tensor, 2, name='power'),
                axis=[1, 2],
                keepdims=True)

            gamma = _channel_variable(
                'gamma', tf.ones_initializer(), bool(scale))
            beta = _channel_variable('beta', tf.zeros_initializer(), True)

            # apply frn
            eps_ = _channel_variable(
                'eps', tf.constant_initializer(eps), bool(learn_eps))
            frn = input_tensor * tf.rsqrt(norm_square + tf.abs(eps_))
            frn = gamma * frn + beta

            # apply tlu
            t_thresh = _channel_variable(
                't_thresh', tf.constant_initializer(eps), True)
            frn_output = tf.maximum(frn, t_thresh, 'frn_output')
        return frn_output
Exemple #50
0
 def _resource_apply_dense(self, grad, var):
     """Build the op that applies one optimizer step to `var` given `grad`.

     The update divides the gradient by the square root of a decayed average
     of squared gradients. Depending on
     `_should_use_factored_second_moment_estimate(shape)`, that average is
     kept either as row/column means (slots "vr"/"vc") or as a full tensor
     (slot "v"). Optional update clipping, momentum (slot "m"), bfloat16
     parameter encoding and simulated quantization are applied as configured
     on `self`.

     Args:
       grad: gradient tensor for `var`; cast to float32 before use.
       var: the variable to update.

     Returns:
       A grouped op containing the assignment to `var` and all slot updates.
     """
     grad = tf.to_float(grad)
     # The tiny constant keeps the squared gradient strictly positive;
     # presumably this protects the rsqrt calls below from zeros.
     grad_squared = tf.square(grad) + 1e-30
     grad_squared_mean = tf.reduce_mean(grad_squared)
     decay_rate = self._decay_rate
     update_scale = self._learning_rate
     old_val = var
     if var.dtype.base_dtype == tf.bfloat16:
         # bfloat16 variables are stored encoded; decode to float32 first.
         old_val = tf.to_float(self._parameter_encoding.decode(old_val))
     if self._multiply_by_parameter_scale:
         update_scale *= tf.to_float(self._parameter_scale(old_val))
     # HACK: Make things dependent on grad.
     # This confounds the XLA rewriter and keeps it from fusing computations
     # across different variables.  This fusion is a bad for HBM usage, since
     # it causes the gradients to persist in memory.
     decay_rate += grad_squared_mean * 1e-30
     update_scale += grad_squared_mean * 1e-30
     # END HACK
     mixing_rate = 1.0 - decay_rate
     shape = var.get_shape().as_list()
     updates = []
     if self._should_use_factored_second_moment_estimate(shape):
         # Factored estimate: keep only the means over the last axis ("vr")
         # and over the second-to-last axis ("vc") of the squared gradient.
         grad_squared_row_mean = tf.reduce_mean(grad_squared, -1)
         grad_squared_col_mean = tf.reduce_mean(grad_squared, -2)
         vr = self.get_slot(var, "vr")
         new_vr = (decay_rate * vr + mixing_rate * grad_squared_row_mean)
         vc = self.get_slot(var, "vc")
         new_vc = (decay_rate * vc + mixing_rate * grad_squared_col_mean)
         vr_update = tf.assign(vr, new_vr, use_locking=self._use_locking)
         vc_update = tf.assign(vc, new_vc, use_locking=self._use_locking)
         updates = [vr_update, vc_update]
         # Row factors are taken relative to their mean over the last axis.
         long_term_mean = tf.reduce_mean(new_vr, -1, keep_dims=True)
         r_factor = tf.rsqrt(new_vr / long_term_mean)
         c_factor = tf.rsqrt(new_vc)
         x = grad * tf.expand_dims(r_factor, -1) * tf.expand_dims(
             c_factor, -2)
     else:
         # Unfactored estimate: one accumulator entry per parameter.
         v = self.get_slot(var, "v")
         new_v = decay_rate * v + mixing_rate * grad_squared
         v_update = tf.assign(v, new_v, use_locking=self._use_locking)
         updates = [v_update]
         x = grad * tf.rsqrt(new_v)
     if self._clipping_threshold is not None:
         # Scale the update down only when its RMS exceeds the threshold.
         clipping_denom = tf.maximum(
             1.0,
             reduce_rms(x) / self._clipping_threshold)
         x /= clipping_denom
     subtrahend = update_scale * x
     if self._beta1:
         # Momentum: blend the new update into slot "m" and apply the blend.
         m = self.get_slot(var, "m")
         new_m = self._beta1 * tf.to_float(m) + (1.0 -
                                                 self._beta1) * subtrahend
         subtrahend = new_m
         new_m = tf.cast(new_m, var.dtype)
         updates.append(tf.assign(m, new_m, use_locking=self._use_locking))
     new_val = tf.to_float(old_val) - subtrahend
     if var.dtype.base_dtype == tf.bfloat16:
         new_val = self._parameter_encoding.encode(new_val,
                                                   self._quantization_noise)
     if self._simulated_quantize_bits:
         # NOTE(review): this branch recomputes from `var - subtrahend`
         # rather than `new_val`, so it overwrites the encoded value from the
         # bfloat16 branch above when both are active -- confirm intended.
         new_val = quantization.simulated_quantize(
             var - subtrahend, self._simulated_quantize_bits,
             self._quantization_noise)
     var_update = tf.assign(var, new_val, use_locking=self._use_locking)
     updates = [var_update] + updates
     return tf.group(*updates)
Exemple #51
0
 def __call__(self, x, name=None):
     """Normalize `x` over axis 1, then apply `self.gain` and `self.bias`.

     Args:
       x: input tensor; moments are taken over axis 1 with the axis kept.
       name: optional scope name; falls back to `self.name` when falsy.

     Returns:
       ``(x - mean) * rsqrt(var + 1e-12) * self.gain + self.bias``.
     """
     with scope(name or self.name):
         axis1_mean, axis1_var = tf.nn.moments(x, 1, keep_dims=True)
         centered = x - axis1_mean
         normalized = centered * tf.rsqrt(axis1_var + 1e-12)
         return normalized * self.gain + self.bias
Exemple #52
0
 def __call__(self, shape, dtype=None, partition_info=None):
     """Sample normal values with stddev 1/sqrt(prod(shape[:-1]) + 1e-7).

     Args:
       shape: shape of the tensor to create; every dimension except the
         last contributes to the scaling product.
       dtype: dtype of the result; `self.dtype` is used when None.
       partition_info: accepted for interface compatibility and ignored.

     Returns:
       A tensor of `shape` drawn from `tf.random_normal`.
     """
     del partition_info
     if dtype is None:
         dtype = self.dtype
     leading_size = tf.cast(tf.reduce_prod(shape[:-1]), tf.float32)
     stddev = tf.rsqrt(leading_size + 1e-7)
     return tf.random_normal(shape, stddev=stddev, dtype=dtype)
 def center_y(y):
     """Subtract the global mean of `y`, then rescale it.

     After centering, `y` is multiplied by the reciprocal square root of the
     mean (over all remaining entries) of the squared values summed along
     axis 1.

     Args:
       y: tensor to center and rescale.

     Returns:
       The centered and rescaled tensor.
     """
     y -= tf.reduce_mean(y)
     axis1_energy = tf.reduce_sum(y**2, axis=[1], keep_dims=True)
     inv_scale = tf.rsqrt(tf.reduce_mean(axis1_energy))
     y *= inv_scale
     return y
Exemple #54
0
def BatchRenorm(x, rmax, dmax, decay=0.9, epsilon=1e-5,
                use_scale=True, use_bias=True, data_format='NHWC'):
    """
    Batch Renormalization layer, as described in the paper:
    `Batch Renormalization: Towards Reducing Minibatch Dependence in Batch-Normalized Models
    <https://arxiv.org/abs/1702.03275>`_.

    Args:
        x (tf.Tensor): a NHWC or NC tensor.
        rmax, dmax (tf.Tensor): a scalar tensor, the maximum allowed corrections.
        decay (float): decay rate of moving average.
        epsilon (float): epsilon to avoid divide-by-zero.
        use_scale, use_bias (bool): whether to use the extra affine transformation or not.
        data_format (str): 'NHWC' or 'NCHW'. Ignored (forced to 'NHWC') for
            2D input.

    Returns:
        tf.Tensor: a tensor named ``output`` with the same shape of x.

    Variable Names:

    * ``beta``: the bias term.
    * ``gamma``: the scale term. Input will be transformed by ``x * gamma + beta``.
    * ``mean/EMA``: the moving average of mean.
    * ``variance/EMA``: the moving average of variance.
    """

    shape = x.get_shape().as_list()
    ndims = len(shape)
    assert ndims in [2, 4]
    if ndims == 2:
        data_format = 'NHWC'    # error using NCHW? (see #190)
    if data_format == 'NCHW':
        n_out = shape[1]
    else:
        n_out = shape[-1]  # channel
    assert n_out is not None, "Input to BatchRenorm cannot have unknown channels!"

    beta, gamma, moving_mean, moving_var = get_bn_variables(
        n_out, use_scale, use_bias, tf.constant_initializer(1.0))

    ctx = get_current_tower_context()
    use_local_stat = ctx.is_training
    # for BatchRenorm, use_local_stat should always be is_training, unless a
    # different usage comes out in the future.

    if use_local_stat:
        if ndims == 2:
            # fused_batch_norm is fed a 4D tensor; squeezed back below.
            x = tf.reshape(x, [-1, 1, 1, n_out])

        xn, batch_mean, batch_var = tf.nn.fused_batch_norm(
            x, gamma, beta, epsilon=epsilon, is_training=True, data_format=data_format)

        # r and d are the renormalization corrections; gradients are stopped
        # so they act as constants during backprop.
        inv_sigma = tf.rsqrt(moving_var, 'inv_sigma')
        r = tf.stop_gradient(tf.clip_by_value(
            tf.sqrt(batch_var) * inv_sigma, 1.0 / rmax, rmax))
        d = tf.stop_gradient(tf.clip_by_value(
            (batch_mean - moving_mean) * inv_sigma,
            -dmax, dmax))
        r = reshape_for_bn(r, ndims, n_out, data_format)
        d = reshape_for_bn(d, ndims, n_out, data_format)
        xn = xn * r + d

        if ndims == 2:
            xn = tf.squeeze(xn, [1, 2])

    else:
        if ndims == 4 and data_format == 'NCHW':
            [g, b, mm, mv] = [reshape_for_bn(_, ndims, n_out, data_format)
                              for _ in [gamma, beta, moving_mean, moving_var]]
            xn = tf.nn.batch_normalization(x, mm, mv, b, g, epsilon)
        else:
            xn = tf.nn.batch_normalization(
                x, moving_mean, moving_var, beta, gamma, epsilon)

    # training also needs EMA, so ideally we should maintain it on every tower.
    # batch_mean / batch_var only exist when local statistics were computed,
    # so the EMA update must be skipped otherwise (e.g. an inference tower
    # that owns its variables) instead of failing with a NameError.
    if use_local_stat and (ctx.is_main_training_tower or ctx.has_own_variables):
        ret = update_bn_ema(xn, batch_mean, batch_var, moving_mean, moving_var, decay)
    else:
        ret = tf.identity(xn, name='output')

    vh = ret.variables = VariableHolder(mean=moving_mean, variance=moving_var)
    if use_scale:
        vh.gamma = gamma
    if use_bias:
        vh.beta = beta
    return ret
def adaptive_instance_norm(x, mu, sigma):
    """Normalize `x` over axes [1, 2], then rescale by `sigma` and shift by `mu`.

    Args:
        x: input tensor; moments are computed over axes 1 and 2 with the
            axes kept for broadcasting.
        mu: offset added to the normalized result.
        sigma: multiplier applied to the centered input.

    Returns:
        ``sigma * (x - mean) * rsqrt(variance + EPS) + mu``.
    """
    x_mean, x_var = tf.nn.moments(x, axes=[1, 2], keep_dims=True)
    inv_std = tf.rsqrt(x_var + EPS)
    centered = x - x_mean
    return sigma * centered * inv_std + mu
Exemple #56
0
 def layer_pixel_norm(self, net, args, options):
     """Divide `net` by the root mean square of its values along axis 1.

     Args:
       net: tensor to normalize.
       args: unused by this method.
       options: unused by this method.

     Returns:
       ``net * rsqrt(mean(net ** 2, axis=1) + 1e-8)``.
     """
     epsilon = 1e-8
     mean_square = tf.reduce_mean(tf.square(net), axis=1, keepdims=True)
     return net * tf.rsqrt(mean_square + epsilon)
def pixel_norm(x):
    """Divide `x` by the root mean square of its values along axis 1.

    The ops are created inside the 'PixelNorm' variable scope and 1e-8 is
    added to the mean square before the reciprocal square root.
    """
    with tf.variable_scope('PixelNorm'):
        mean_square = tf.reduce_mean(tf.square(x), axis=1, keep_dims=True)
        inv_rms = tf.rsqrt(mean_square + 1e-8)
        return x * inv_rms
Exemple #58
0
def pixel_norm(x, epsilon=1e-8):
    """Divide `x` by the root mean square of its values along axis 1.

    Args:
        x: tensor to normalize.
        epsilon: added to the mean square before the reciprocal square
            root; converted to a constant of `x`'s dtype.

    Returns:
        Tensor with the same shape as `x`.
    """
    with tf.variable_scope('PixelNorm'):
        # Match the constant's dtype to the input's.
        epsilon = tf.constant(epsilon, dtype=x.dtype, name='epsilon')
        mean_square = tf.reduce_mean(tf.square(x), axis=1, keepdims=True)
        return x * tf.rsqrt(mean_square + epsilon)
 def __l1_normalize(x, dim, epsilon=1e-12, name=None):
     """Scale `x` by the reciprocal square root of its absolute sum along `dim`.

     Args:
       x: tensor to scale.
       dim: axis over which the absolute values are summed (kept for
         broadcasting).
       epsilon: lower bound applied to the sum before the rsqrt.
       name: optional name for the resulting multiply op.

     Returns:
       ``x * rsqrt(max(sum(|x|, dim), epsilon))``.
     """
     # NOTE(review): this divides by sqrt(sum|x|), not by sum|x|, so the
     # result does not have unit L1 norm -- confirm whether that is intended.
     abs_sum = tf.reduce_sum(tf.abs(x), [dim], keep_dims=True)
     clamped_sum = tf.maximum(abs_sum, epsilon)
     inv_scale = tf.rsqrt(clamped_sum)
     return tf.mul(x, inv_scale, name=name)
Exemple #60
0
 def __call__(self, inp):
     """Normalize `inp` over its last axis, then apply scale and bias.

     Ops are created inside the variable scope named `self.name`.

     Args:
       inp: tensor to normalize.

     Returns:
       ``(inp - mean) * rsqrt(variance + self.epsilon) * self.scale
       + self.bias`` with mean and variance taken over the last axis.
     """
     with tf.variable_scope(self.name):
         last_axis_mean = tf.reduce_mean(inp, axis=[-1], keep_dims=True)
         sq_dev = tf.square(inp - last_axis_mean)
         last_axis_var = tf.reduce_mean(sq_dev, axis=[-1], keep_dims=True)
         inv_std = tf.rsqrt(last_axis_var + self.epsilon)
         normalized = (inp - last_axis_mean) * inv_std
         return normalized * self.scale + self.bias