Ejemplo n.º 1
0
def _r2(probabilities, targets, weights=None):
  """Builds a mean metric over an R^2-style score.

  The score is one minus the sum (over whatever axes remain after reducing
  along axis 0) of the ratio between the residual sum of squares and the
  total sum of squares.

  Args:
    probabilities: Predicted values, compared element-wise with `targets`.
    targets: Ground-truth values; cast to float32 before use.
    weights: Optional weights forwarded unchanged to `metrics.mean`.

  Returns:
    The result of `metrics.mean(score, weights=weights)`.
  """
  float_targets = math_ops.cast(targets, dtypes.float32)
  target_mean = math_ops.reduce_mean(float_targets, 0)
  # Total variation of the targets around their mean along axis 0.
  total_ss = math_ops.reduce_sum(
      math_ops.squared_difference(float_targets, target_mean), 0)
  # Variation left unexplained by the predictions.
  residual_ss = math_ops.reduce_sum(
      math_ops.squared_difference(float_targets, probabilities), 0)
  unexplained = math_ops.reduce_sum(residual_ss / total_ss)
  return metrics.mean(1 - unexplained, weights=weights)
Ejemplo n.º 2
0
def _mean_squared_loss(logits, target):
  """Returns the unreduced squared difference between logits and target.

  Args:
    logits: Tensor of predictions.
    target: Tensor of targets. A rank-1 target gets an axis appended at
      position 1 so that the subtraction cannot broadcast it against `logits`.

  Returns:
    `squared_difference(logits, to_float(target))`, element-wise.
  """
  target_rank = len(target.get_shape())
  if target_rank == 1:
    # Guards against silent broadcasting inside the subtraction.
    target = array_ops.expand_dims(target, axis=1)

  # Static shape check before any arithmetic is built.
  logits.get_shape().assert_is_compatible_with(target.get_shape())
  float_target = math_ops.to_float(target)
  return math_ops.squared_difference(logits, float_target)
  def testSampleConsistentStats(self):
    """Compares sample statistics of `Independent` with its analytic ones."""
    loc = np.float32([[-1., 1], [1, -1]])
    scale = np.float32([1., 0.5])
    num_samples = int(1e4)
    with self.test_session() as sess:
      base = mvn_diag_lib.MultivariateNormalDiag(
          loc=loc,
          scale_identity_multiplier=scale)
      ind = independent_lib.Independent(
          distribution=base, reduce_batch_ndims=1)

      x = ind.sample(num_samples, seed=42)
      sample_mean = math_ops.reduce_mean(x, axis=0)
      sample_var = math_ops.reduce_mean(
          math_ops.squared_difference(x, sample_mean), axis=0)
      sample_std = math_ops.sqrt(sample_var)
      sample_entropy = -math_ops.reduce_mean(ind.log_prob(x), axis=0)

      # Fetch everything in one run so all statistics share the same draw.
      sampled = [sample_mean, sample_var, sample_std, sample_entropy]
      analytic = [
          ind.mean(), ind.variance(), ind.stddev(), ind.entropy(), ind.mode()]
      results = sess.run(sampled + analytic)
      sampled_vals = results[:4]
      analytic_vals = results[4:]

      # Relative tolerances for mean, variance, stddev and entropy.
      tolerances = (0.02, 0.04, 0.02, 0.01)
      for got, want, rtol in zip(sampled_vals, analytic_vals, tolerances):
        self.assertAllClose(got, want, rtol=rtol, atol=0.)
      self.assertAllClose(loc, analytic_vals[4], rtol=1e-6, atol=0.)
Ejemplo n.º 4
0
def mean_squared_error(predictions, labels=None, weights=1.0, scope=None):
  """Adds a Sum-of-Squares loss to the training procedure.

  `weights` acts as a coefficient for the loss. If a scalar is provided, then
  the loss is simply scaled by the given value. If `weights` is a tensor of size
  [batch_size], then the total loss for each sample of the batch is rescaled
  by the corresponding element in the `weights` vector. If the shape of
  `weights` matches the shape of `predictions`, then the loss of each
  measurable element of `predictions` is scaled by the corresponding value of
  `weights`.

  Args:
    predictions: The predicted outputs.
    labels: The ground truth output tensor, same dimensions as 'predictions'.
      Defaults to None in the signature, but a value is required.
    weights: Coefficients for the loss: a scalar, a tensor of shape
      [batch_size] or a tensor whose shape matches `predictions`.
    scope: The scope for the operations performed in computing the loss.

  Returns:
    A scalar `Tensor` representing the loss value.

  Raises:
    ValueError: If `predictions` or `labels` is None, if the shape of
      `predictions` doesn't match that of `labels`, or if the shape of
      `weights` is invalid.
  """
  # Fail with the documented ValueError instead of an AttributeError from
  # calling `.get_shape()` on None further down.
  if labels is None:
    raise ValueError("labels must not be None.")
  if predictions is None:
    raise ValueError("predictions must not be None.")
  with ops.name_scope(scope, "mean_squared_error",
                      [predictions, labels, weights]) as scope:
    predictions.get_shape().assert_is_compatible_with(labels.get_shape())
    predictions = math_ops.cast(predictions, dtypes.float32)
    labels = math_ops.cast(labels, dtypes.float32)
    losses = math_ops.squared_difference(predictions, labels)
    return compute_weighted_loss(losses, weights, scope=scope)
Ejemplo n.º 5
0
def exact_gaussian_kernel(x, y, stddev):
  r"""Evaluates the exact Gaussian kernel between the rows of x and y.

  For vectors u, v the Gaussian kernel is
       K(u, v) = exp(-||u-v||^2 / (2* stddev^2))
  with ||.|| the l2-norm. x and y may each be a vector or a matrix. Vectors
  must share the same dimension; matrices must share the same number of
  columns. For matrices the result holds K(u, v) for every pair (u, v) with
  u a row of x and v a row of y.

  Args:
    x: a tensor of rank 1 or 2. Its shape should be either [dim] or [m, dim].
    y: a tensor of rank 1 or 2. Its shape should be either [dim] or [n, dim].
    stddev: The width of the Gaussian kernel.

  Returns:
    A single value (scalar) with shape (1, 1) (if x, y are vectors) or a matrix
      of shape (m, n) with entries K(u, v) (where K is the Gaussian kernel) for
      all (u,v) pairs where u, v are rows from x and y respectively.

  Raises:
    ValueError: if the shapes of x, y are not compatible.
  """
  x_aligned, y_aligned = _align_matrices(x, y)
  pairwise_sq_diff = math_ops.squared_difference(x_aligned, y_aligned)
  # Summing over axis 2 collapses the feature axis of the aligned tensors.
  sq_l2_norm = math_ops.reduce_sum(pairwise_sq_diff, 2)
  negated = -sq_l2_norm
  return math_ops.exp(negated / (2 * stddev * stddev))
Ejemplo n.º 6
0
def contrastive_loss(labels, embeddings_anchor, embeddings_positive,
                     margin=1.0):
  """Computes the contrastive loss.

  Pairs labelled 1 are pulled together (penalised by their squared distance);
  pairs labelled 0 are pushed apart until they are at least `margin` away
  (penalised by the squared shortfall).
  See: http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf

  Args:
    labels: 1-D tf.int32 `Tensor` with shape [batch_size] of
      binary labels indicating positive vs negative pair.
    embeddings_anchor: 2-D float `Tensor` of embedding vectors for the anchor
      images. Embeddings should be l2 normalized.
    embeddings_positive: 2-D float `Tensor` of embedding vectors for the
      positive images. Embeddings should be l2 normalized.
    margin: margin term in the loss definition.

  Returns:
    contrastive_loss: tf.float32 scalar.
  """
  # Euclidean distance of every (anchor, positive) pair.
  sq_diff = math_ops.squared_difference(embeddings_anchor, embeddings_positive)
  distances = math_ops.sqrt(math_ops.reduce_sum(sq_diff, 1))

  # Siamese-network contrastive terms; label is {0,1} for neg, pos.
  positive_term = math_ops.to_float(labels) * math_ops.square(distances)
  negative_weight = 1. - math_ops.to_float(labels)
  margin_shortfall = math_ops.maximum(margin - distances, 0.)
  negative_term = negative_weight * math_ops.square(margin_shortfall)
  return math_ops.reduce_mean(
      positive_term + negative_term, name='contrastive_loss')
Ejemplo n.º 7
0
 def testSquaredDifference(self):
   """Checks squared_difference against NumPy on broadcast int32 inputs."""
   lhs = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int32)
   rhs = np.array([-3, -2, -1], dtype=np.int32)
   delta = lhs - rhs
   expected = delta * delta
   with self.test_session():
     actual = math_ops.squared_difference(lhs, rhs).eval()
     self.assertAllClose(expected, actual)
Ejemplo n.º 8
0
 def testSquaredDifference(self):
   """Checks squared_difference against NumPy for int32 and float16."""
   for dtype in [np.int32, np.float16]:
     lhs = np.array([[1, 2, 3], [4, 5, 6]], dtype=dtype)
     rhs = np.array([-3, -2, -1], dtype=dtype)
     delta = lhs - rhs
     expected = delta * delta
     with self.test_session(use_gpu=True):
       actual = math_ops.squared_difference(lhs, rhs).eval()
       self.assertAllClose(expected, actual)
Ejemplo n.º 9
0
 def testSquaredDifference(self):
   """Checks squared_difference against NumPy for int32 and float16."""
   for dtype in [np.int32, np.float16]:
     lhs = np.array([[1, 2, 3], [4, 5, 6]], dtype=dtype)
     rhs = np.array([-3, -2, -1], dtype=dtype)
     delta = lhs - rhs
     expected = delta * delta
     with test_util.device(use_gpu=True):
       actual = self.evaluate(math_ops.squared_difference(lhs, rhs))
       self.assertAllClose(expected, actual)
Ejemplo n.º 10
0
def moments(x, axes, name=None, keep_dims=False):
  """Calculate the mean and variance of `x`.

  Both moments are obtained by aggregating the contents of `x` across `axes`.
  For a 1-D `x` with `axes = [0]` this is simply the mean and variance of a
  vector.

  When using these moments for batch normalization (see
  `tf.nn.batch_normalization`):
    * for so-called "global normalization", used with convolutional filters with
      shape `[batch, height, width, depth]`, pass `axes=[0, 1, 2]`.
    * for simple batch normalization pass `axes=[0]` (batch only).

  Args:
    x: A `Tensor`.
    axes: array of ints.  Axes along which to compute mean and
      variance.
    name: Name used to scope the operations that compute the moments.
    keep_dims: produce moments with the same dimensionality as the input.

  Returns:
    Two `Tensor` objects: `mean` and `variance`.
  """
  with ops.op_scope([x, axes], name, "moments"):
    x = ops.convert_to_tensor(x, name="x")
    static_shape = x.get_shape()
    axes_statically_known = all(
        static_shape[d].value is not None for d in axes)
    if axes_statically_known:
      # Every reduced dimension is known, so 1/N can be a graph constant.
      element_count = 1.0
      for d in set(axes):
        element_count *= static_shape[d].value
      divisor = constant_op.constant(
          1.0 / element_count, x.dtype, name="divisor")
    else:
      # Fall back to computing 1/N from the runtime shape.
      element_count = constant_op.constant(1.0, dtype=x.dtype)
      runtime_shape = array_ops.shape(x)
      for d in set(axes):
        element_count *= math_ops.cast(runtime_shape[d], x.dtype)
      divisor = math_ops.inv(element_count, name="divisor")
    reduction_axes = constant_op.constant(axes, name="axes")
    # Note: Mean is deliberately avoided here because it is very slow on GPU.
    # The mean keeps its reduced dims so it broadcasts against `x` below.
    total = math_ops.reduce_sum(x, reduction_axes, keep_dims=True)
    mean = math_ops.mul(total, divisor, name="mean")
    sq_deviation_sum = math_ops.reduce_sum(
        math_ops.squared_difference(x, mean),
        reduction_axes,
        keep_dims=keep_dims)
    var = math_ops.mul(sq_deviation_sum, divisor, name="variance")
    if not keep_dims:
      mean = array_ops.squeeze(mean, squeeze_dims=axes)
    return mean, var
Ejemplo n.º 11
0
 def testComplexSquaredDifference(self):
   """Checks squared_difference on complex inputs against conj(d) * d."""
   for dtype in [np.complex64, np.complex128]:
     lhs = np.array([[1 + 3j, 2 + 2j, 3 + 1j], [4 - 1j, 5 - 2j, 6 - 3j]],
                    dtype=dtype)
     rhs = np.array([-3 + 1j, -2 + 2j, -1 + 3j], dtype=dtype)
     delta = lhs - rhs
     expected = np.conj(delta) * delta
     with test_util.device(use_gpu=False):
       actual = self.evaluate(math_ops.squared_difference(lhs, rhs))
       self.assertAllClose(expected, actual)
Ejemplo n.º 12
0
def sufficient_statistics(x, axes, shift=None, keep_dims=False, name=None):
  """Calculate the sufficient statistics for the mean and variance of `x`.

  These sufficient statistics are computed using the one pass algorithm on
  an input that's optionally shifted. See:
  https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Computing_shifted_data

  Args:
    x: A `Tensor`.
    axes: Array of ints. Axes along which to compute mean and variance.
    shift: A `Tensor` containing the value by which to shift the data for
      numerical stability, or `None` if no shift is to be performed. A shift
      close to the true mean provides the most numerically stable results.
    keep_dims: produce statistics with the same dimensionality as the input.
    name: Name used to scope the operations that compute the sufficient stats.

  Returns:
    Four `Tensor` objects of the same type as `x`:
    * the count (number of elements to average over).
    * the (possibly shifted) sum of the elements in the array.
    * the (possibly shifted) sum of squares of the elements in the array.
    * the shift by which the mean must be corrected or None if `shift` is None.
  """
  with ops.op_scope([x, axes, shift], name, "sufficient_statistics"):
    x = ops.convert_to_tensor(x, name="x")
    x_shape = x.get_shape()
    if x_shape.is_fully_defined():
      # Static shape: the count is the product of the reduced dimensions.
      # The set is built once instead of once per dimension.
      reduced_axes = set(axes)
      counts = 1
      for d in xrange(x_shape.ndims):
        if d in reduced_axes:
          counts *= x_shape[d].value
      counts = constant_op.constant(counts, dtype=x.dtype)
    else:  # shape needs to be inferred at runtime.
      x_shape = array_ops.shape(x)
      # Boolean vector over dimensions, True where the dimension is reduced.
      select_axes = sparse_ops.sparse_to_dense(axes, array_ops.shape(x_shape),
                                               True, False)
      # Shape of `x` with every reduced dimension replaced by 1.
      m_shape = math_ops.select(select_axes, array_ops.ones_like(x_shape),
                                x_shape)
      counts = math_ops.cast(
          math_ops.reduce_prod(x_shape / m_shape),
          x.dtype,
          name="count")
    if shift is not None:
      shift = ops.convert_to_tensor(shift, name="shift")
      m_ss = math_ops.sub(x, shift)
      v_ss = math_ops.squared_difference(x, shift)
    else:  # no shift.
      m_ss = x
      v_ss = math_ops.square(x)
    m_ss = math_ops.reduce_sum(m_ss, axes, keep_dims=keep_dims, name="mean_ss")
    v_ss = math_ops.reduce_sum(v_ss, axes, keep_dims=keep_dims, name="var_ss")
  return counts, m_ss, v_ss, shift
def _reduce_variance(x, axis=None, biased=True, keepdims=False):
  """Computes the variance of `x` along `axis`.

  Args:
    x: Value convertible to a tensor.
    axis: Axis or axes to reduce over, forwarded to `reduce_mean`.
    biased: If True, return the mean squared deviation as is. If False,
      rescale it by n / (n - 1), where n comes from `_axis_size(x, axis)`.
    keepdims: Whether the reduced axes are kept in the result.

  Returns:
    The biased or rescaled variance tensor.
  """
  with ops.name_scope("reduce_variance"):
    x = ops.convert_to_tensor(x, name="x")
    # The mean keeps its dims so it broadcasts against `x`.
    mean = math_ops.reduce_mean(x, axis=axis, keepdims=True)
    sq_deviation = math_ops.squared_difference(x, mean)
    biased_var = math_ops.reduce_mean(
        sq_deviation, axis=axis, keepdims=keepdims)
    if not biased:
      n = _axis_size(x, axis)
      return (n / (n - 1.)) * biased_var
    return biased_var
Ejemplo n.º 14
0
def mean_only_frechet_classifier_distance_from_activations(
    real_activations, generated_activations):
  """Classifier distance for evaluating a generative model from activations.

  Given two Gaussian distributions with means m and m_w and covariance matrices
  C and C_w, this function calculates

                                |m - m_w|^2

  which captures how different the distributions of real images and generated
  images (or more accurately, their visual features) are. Note that unlike the
  Inception score, this is a true distance and utilizes information about real
  world images.

  Note that when computed using sample means and sample covariance matrices,
  Frechet distance is biased. It is more biased for small sample sizes. (e.g.
  even if the two distributions are the same, for a small sample size, the
  expected Frechet distance is large). It is important to use the same
  sample size to compute frechet classifier distance when comparing two
  generative models.

  In this variant, only the difference between the means of the fitted
  Gaussians is computed. The computation leads to O(n) vs. O(n^2) memory usage,
  yet still retains much of the same information as FID.

  Args:
    real_activations: 2D array of activations of real images of size
      [num_images, num_dims] to use to compute Frechet Inception distance.
    generated_activations: 2D array of activations of generated images of size
      [num_images, num_dims] to use to compute Frechet Inception distance.

  Returns:
    The mean-only Frechet Inception distance. A floating-point scalar with the
    dtype of `real_activations`.
  """
  real_activations.shape.assert_has_rank(2)
  generated_activations.shape.assert_has_rank(2)

  # The arithmetic runs in float64; the result is cast back at the end.
  original_dtype = real_activations.dtype
  needs_cast = original_dtype != dtypes.float64
  if needs_cast:
    real_activations = math_ops.cast(real_activations, dtypes.float64)
    generated_activations = math_ops.cast(generated_activations, dtypes.float64)

  # Per-feature means over the image axis.
  real_mean = math_ops.reduce_mean(real_activations, 0)
  generated_mean = math_ops.reduce_mean(generated_activations, 0)

  # Squared distance between the means; equivalent to L2 but more stable.
  mofid = math_ops.reduce_sum(
      math_ops.squared_difference(real_mean, generated_mean))
  if needs_cast:
    return math_ops.cast(mofid, original_dtype)
  return mofid
Ejemplo n.º 15
0
def mean_squared_error(
    labels, predictions, weights=1.0, scope=None,
    loss_collection=ops.GraphKeys.LOSSES,
    reduction=Reduction.SUM_BY_NONZERO_WEIGHTS):
  """Adds a Sum-of-Squares loss to the training procedure.

  `weights` is a coefficient for the loss. A scalar simply scales the loss.
  A tensor of size `[batch_size]` rescales the total loss of each sample by
  the matching element. A tensor with the same shape as `predictions` scales
  the loss of each measurable element of `predictions` individually.

  Args:
    labels: The ground truth output tensor, same dimensions as 'predictions'.
    predictions: The predicted outputs.
    weights: Optional `Tensor` whose rank is either 0, or the same rank as
      `labels`, and must be broadcastable to `labels` (i.e., all dimensions must
      be either `1`, or the same as the corresponding `losses` dimension).
    scope: The scope for the operations performed in computing the loss.
    loss_collection: collection to which the loss will be added.
    reduction: Type of reduction to apply to loss.

  Returns:
    Weighted loss float `Tensor`. If `reduction` is `NONE`, this has the same
    shape as `labels`; otherwise, it is scalar.

  Raises:
    ValueError: If the shape of `predictions` doesn't match that of `labels` or
      if the shape of `weights` is invalid.  Also if `labels` or `predictions`
      is None.

  @compatibility(eager)
  The `loss_collection` argument is ignored when executing eagerly. Consider
  holding on to the return value or collecting losses via a `tf.keras.Model`.
  @end_compatibility
  """
  if labels is None:
    raise ValueError("labels must not be None.")
  if predictions is None:
    raise ValueError("predictions must not be None.")
  with ops.name_scope(scope, "mean_squared_error",
                      (predictions, labels, weights)) as loss_scope:
    float_predictions = math_ops.to_float(predictions)
    float_labels = math_ops.to_float(labels)
    # Shapes are validated after the float conversion.
    float_predictions.get_shape().assert_is_compatible_with(
        float_labels.get_shape())
    squared_errors = math_ops.squared_difference(
        float_predictions, float_labels)
    return compute_weighted_loss(
        squared_errors, weights, loss_scope, loss_collection,
        reduction=reduction)
Ejemplo n.º 16
0
def psnr(y_true, y_pred):
    """Computes a PSNR value (in dB, peak 255) between two image tensors.

    Both inputs are projected to a single channel with a fixed weight column
    before the mean squared error is taken over the last three axes.

    NOTE(review): the weights look like RGB-to-luma coefficients, which would
    mean the last axis holds three colour channels - confirm. The reason for
    returning 0 when `y_pred.shape[3] == 5` is not visible from this function.

    Args:
        y_true: Reference images.
        y_pred: Predicted images; indexed at `shape[3]`.

    Returns:
        0 if `y_pred.shape[3] == 5`, otherwise `10 * log10(255^2 / mse)`.
    """
    if y_pred.shape[3] == 5:
        return 0
    channel_weights = [[65.481], [128.553], [24.966]]
    luma_true = tf.matmul(y_true, channel_weights) / 255.0 + 16.0
    luma_pred = tf.matmul(y_pred, channel_weights) / 255.0 + 16.0
    mse = math_ops.reduce_mean(
        math_ops.squared_difference(luma_true, luma_pred), [-3, -2, -1])

    # log10(r) computed as ln(r) / ln(10).
    ratio = 255.0 * 255.0 / mse
    ln_ratio = tf.compat.v1.log(ratio)
    ln_ten = tf.compat.v1.log(tf.constant(10, dtype=ln_ratio.dtype))
    return 10 * (ln_ratio / ln_ten)
Ejemplo n.º 17
0
def mean_squared_error(labels,
                       predictions,
                       weights=1.0,
                       scope=None,
                       loss_collection=ops.GraphKeys.LOSSES,
                       reduction=Reduction.SUM_BY_NONZERO_WEIGHTS):
    """Adds a Sum-of-Squares loss to the training procedure.

    `weights` is a coefficient for the loss. A scalar simply scales the loss.
    A tensor of size `[batch_size]` rescales the total loss of each sample by
    the matching element. A tensor with the same shape as `predictions`
    scales the loss of each measurable element of `predictions` individually.

    Args:
      labels: The ground truth output tensor, same dimensions as
        'predictions'.
      predictions: The predicted outputs.
      weights: Optional `Tensor` whose rank is either 0, or the same rank as
        `labels`, and must be broadcastable to `labels` (i.e., all dimensions
        must be either `1`, or the same as the corresponding `losses`
        dimension).
      scope: The scope for the operations performed in computing the loss.
      loss_collection: collection to which the loss will be added.
      reduction: Type of reduction to apply to loss.

    Returns:
      Weighted loss float `Tensor`. If `reduction` is `NONE`, this has the
      same shape as `labels`; otherwise, it is scalar.

    Raises:
      ValueError: If the shape of `predictions` doesn't match that of `labels`
        or if the shape of `weights` is invalid.  Also if `labels` or
        `predictions` is None.
    """
    if labels is None:
        raise ValueError("labels must not be None.")
    if predictions is None:
        raise ValueError("predictions must not be None.")
    scope_values = (predictions, labels, weights)
    with ops.name_scope(scope, "mean_squared_error",
                        scope_values) as loss_scope:
        preds_f = math_ops.to_float(predictions)
        labels_f = math_ops.to_float(labels)
        # Shapes are validated after the float conversion.
        preds_f.get_shape().assert_is_compatible_with(labels_f.get_shape())
        per_element = math_ops.squared_difference(preds_f, labels_f)
        return compute_weighted_loss(per_element,
                                     weights,
                                     loss_scope,
                                     loss_collection,
                                     reduction=reduction)
Ejemplo n.º 18
0
    def _get_mse(y_true, y_pred, mask):
        """
        Computes a squared-error total normalised by the mask sum.

        The sum of squared differences between `y_pred` and `y_true` is
        divided by `sum(mask) * y_true.shape[-1]`, i.e. the mask total scaled
        by the size of the last axis of `y_true`.

        NOTE(review): the numerator is not multiplied by `mask`; presumably
        masked positions are already zero in both tensors - confirm with the
        caller. The mask is described upstream as (batch_size, seq_len).

        :param y_true: target tensor
        :param y_pred: predicted tensor
        :param mask: per-time-step mask, cast to float32 before summing
        :return: squared-error total divided by the scaled mask sum
        """
        squared_error_total = tf.reduce_sum(
            math_ops.squared_difference(y_pred, y_true))
        mask_total = tf.reduce_sum(tf.cast(mask, tf.float32))
        normalizer = tf.cast(mask_total * y_true.shape[-1],
                             dtype=squared_error_total.dtype)

        return squared_error_total / normalizer
Ejemplo n.º 19
0
 def _variance(self):
   """Returns the mixture variance via the law of total variance."""
   with ops.control_dependencies(self._runtime_assertions):
     # Var(Y) = E[Var(Y|X)] + Var(E[Y|X]), with X the mixture component.
     mix_axis = -1 - self._event_ndims
     probs = self._pad_mix_dims(
         self.mixture_distribution.probs)                   # [B, k, [1]*e]
     # First term: component variances weighted by mixture probabilities.
     weighted_component_var = probs * self.components_distribution.variance()
     mean_cond_var = math_ops.reduce_sum(
         weighted_component_var, axis=mix_axis)             # [B, E]
     # Second term: spread of the component means around the overall mean.
     component_means = self.components_distribution.mean()
     overall_mean = self._pad_sample_dims(self._mean())
     weighted_sq_dev = probs * math_ops.squared_difference(
         component_means, overall_mean)
     var_cond_mean = math_ops.reduce_sum(
         weighted_sq_dev, axis=mix_axis)                    # [B, E]
     return mean_cond_var + var_cond_mean                   # [B, E]
Ejemplo n.º 20
0
    def loss_msr_sequence_relative(y_true, y_predicted):
        """Discounted relative squared-error loss over a sequence.

        The squared error is divided by `y_true**2 + 0.01`, averaged over the
        last axis, cut to the time steps from `wash_out_len` onward, and
        reduced with a matrix-vector product against `discount_vector`.
        `wash_out_len` and `discount_vector` come from the enclosing scope.
        """
        y_predicted = ops.convert_to_tensor_v2_with_dispatch(y_predicted)
        y_true = math_ops.cast(y_true, y_predicted.dtype)
        squared_error = math_ops.squared_difference(y_predicted, y_true)
        # The 0.01 offset keeps the denominator strictly positive.
        relative_scale = math_ops.square(y_true) + 0.01
        per_step = K.mean(squared_error / relative_scale, axis=-1)

        # per_step is presumably [batch_size, time steps]; drop the first
        # wash_out_len time steps.
        kept_steps = per_step[:, wash_out_len:]

        # Discounted sum of the per-step losses of each series.
        return tf.linalg.matvec(kept_steps, discount_vector)
Ejemplo n.º 21
0
 def _variance(self):
     """Returns the mixture variance via the law of total variance."""
     with ops.control_dependencies(self._runtime_assertions):
         # Var(Y) = E[Var(Y|X)] + Var(E[Y|X]), with X the mixture component.
         reduce_axis = -1 - self._event_ndims
         probs = self._pad_mix_dims(
             self.mixture_distribution.probs)  # [B, k, [1]*e]
         # E[Var(Y|X)]: probability-weighted component variances.
         expected_var = probs * self.components_distribution.variance()
         mean_cond_var = math_ops.reduce_sum(
             expected_var, axis=reduce_axis)  # [B, E]
         # Var(E[Y|X]): weighted squared deviation of the component means.
         means = self.components_distribution.mean()
         padded_total_mean = self._pad_sample_dims(self._mean())
         deviation = math_ops.squared_difference(means, padded_total_mean)
         var_cond_mean = math_ops.reduce_sum(
             probs * deviation, axis=reduce_axis)  # [B, E]
         return mean_cond_var + var_cond_mean  # [B, E]
Ejemplo n.º 22
0
def MaskedMSE(output, target, lengths):
    """Squared-error loss over masked sequences, divided by the mask sum.

    A mask is built from `lengths` with `tf.sequence_mask`, transposed and
    given a trailing axis, then multiplied into both `output` and `target`.
    The summed squared difference is divided by the sum of the mask, and the
    result is also written to a scalar summary named "loss".

    NOTE(review): the transpose suggests `output` and `target` are
    time-major [time, batch, features] - confirm. The divisor counts mask
    entries only, not entries times the feature size.

    Args:
        output: Predicted sequences.
        target: Target sequences.
        lengths: Sequence lengths passed to `tf.sequence_mask`.

    Returns:
        The normalised loss tensor.
    """
    with tf.name_scope('MaskedMSE'):
        batch_major_mask = tf.sequence_mask(lengths)
        transposed_mask = tf.transpose(batch_major_mask, [1, 0])
        mask = tf.cast(tf.expand_dims(transposed_mask, 2), dtype=tf.float32)
        # Zero out everything beyond each sequence length.
        masked_output = tf.multiply(output, mask)
        masked_target = tf.multiply(target, mask)
        squared_error_total = tf.reduce_sum(
            math_ops.squared_difference(masked_output, masked_target))

        loss = squared_error_total / tf.reduce_sum(mask)
        tf.summary.scalar("loss", loss)
        return loss
Ejemplo n.º 23
0
def mean_squared_error_velocity(y_true, y_pred, epsilon=1e-08):
    """Squared error on a gated prediction channel.

    Channel 0 of `y_pred` is multiplied by channel 0 of `y_true` and compared
    with channel 2 of `y_true`. The summed squared difference is divided by
    the number of non-zero gated predictions (at least 1), and `epsilon` is
    added to the quotient.

    NOTE(review): channel 0 of `y_true` is presumably a "note is played"
    indicator and channel 2 the target velocity - confirm against the data
    layout.

    Args:
        y_true: Target tensor, indexed on four axes.
        y_pred: Prediction tensor, indexed on four axes.
        epsilon: Constant added to the final loss value.

    Returns:
        The normalised loss plus `epsilon`.
    """
    y_pred = ops.convert_to_tensor_v2_with_dispatch(y_pred)
    y_true = math_ops.cast(y_true, y_pred.dtype)

    # Mask the prediction error wherever the note is not played.
    predicted_channel = y_pred[:, :, :, 0:1]
    gate = y_true[:, :, :, 0:1]
    y_pred = predicted_channel * gate
    nonzero_count = tf.math.count_nonzero(y_pred, dtype=tf.dtypes.float32)
    # Clamp to 1 to avoid division by zero.
    count_non_zero = tf.math.maximum(nonzero_count, 1)

    target_channel = y_true[:, :, :, 2:3]
    squared_error_total = K.sum(
        math_ops.squared_difference(y_pred, target_channel))
    return squared_error_total / count_non_zero + epsilon
Ejemplo n.º 24
0
 def loss_op(self, targets, prediction_ops):
   """Builds the per-element training loss for the configured loss type.

   Args:
     targets: Tensor of target values.
     prediction_ops: Dict with a "mean" entry and, for the normal likelihood
       loss, a "covariance" entry.

   Returns:
     The summed loss divided by the number of elements in `targets`.
   """
   predicted_mean = prediction_ops["mean"]
   if self.loss == ARModel.NORMAL_LIKELIHOOD_LOSS:
     # Floor the covariance at 1e-5 before taking the square root.
     floored_covariance = gen_math_ops.maximum(
         prediction_ops["covariance"], 1e-5)
     sigma = math_ops.sqrt(floored_covariance)
     log_prob = math_utils.normal_log_prob(targets, sigma, predicted_mean)
     total_loss = -math_ops.reduce_sum(log_prob)
   else:
     assert self.loss == ARModel.SQUARED_LOSS, self.loss
     total_loss = math_ops.reduce_sum(
         math_ops.squared_difference(predicted_mean, targets))
   num_elements = math_ops.cast(
       math_ops.reduce_prod(array_ops.shape(targets)), total_loss.dtype)
   return total_loss / num_elements
Ejemplo n.º 25
0
def mean_squared_error(y_true, y_pred):
  """Computes the mean squared error between labels and predictions.

  `loss = mean(square(y_true - y_pred), axis=-1)`

  `y_true` is cast to the dtype of `y_pred` before the difference is taken.

  Args:
    y_true: Ground truth values. shape = `[batch_size, d0, .. dN]`.
    y_pred: The predicted values. shape = `[batch_size, d0, .. dN]`.

  Returns:
    Mean squared error values. shape = `[batch_size, d0, .. dN-1]`.
  """
  predictions = ops.convert_to_tensor(y_pred)
  targets = math_ops.cast(y_true, predictions.dtype)
  squared_error = math_ops.squared_difference(predictions, targets)
  return K.mean(squared_error, axis=-1)
Ejemplo n.º 26
0
    def testSampleConsistentStats(self):
        """Compares sample statistics of `Independent` with analytic ones."""
        loc = np.float32([[-1., 1], [1, -1]])
        scale = np.float32([1., 0.5])
        n_samp = 1e4
        with self.test_session() as sess:
            mvn = mvn_diag_lib.MultivariateNormalDiag(
                loc=loc, scale_identity_multiplier=scale)
            ind = independent_lib.Independent(distribution=mvn,
                                              reduce_batch_ndims=1)

            draws = ind.sample(int(n_samp), seed=42)
            sample_mean = math_ops.reduce_mean(draws, axis=0)
            centered_sq = math_ops.squared_difference(draws, sample_mean)
            sample_var = math_ops.reduce_mean(centered_sq, axis=0)
            sample_std = math_ops.sqrt(sample_var)
            sample_entropy = -math_ops.reduce_mean(ind.log_prob(draws),
                                                   axis=0)

            # One run call so every statistic is based on the same draw.
            empirical = [sample_mean, sample_var, sample_std, sample_entropy]
            analytic = [
                ind.mean(),
                ind.variance(),
                ind.stddev(),
                ind.entropy(),
            ]
            values = sess.run(empirical + analytic + [ind.mode()])
            actual_mode_ = values[-1]

            # Relative tolerances for mean, variance, stddev and entropy;
            # analytic values sit four slots after the empirical ones.
            tolerances = (0.02, 0.04, 0.02, 0.01)
            for idx, rtol in enumerate(tolerances):
                self.assertAllClose(values[idx],
                                    values[idx + len(tolerances)],
                                    rtol=rtol,
                                    atol=0.)
            self.assertAllClose(loc, actual_mode_, rtol=1e-6, atol=0.)
Ejemplo n.º 27
0
def per_example_squared_loss(labels, weights, predictions):
  """Weighted squared loss for each example.

  Args:
    labels: Rank 2 (N, D) tensor of per-example labels.
    weights: Rank 2 (N, 1) tensor of per-example weights.
    predictions: Rank 2 (N, D) tensor of per-example predictions.

  Returns:
    loss: A Rank 2 (N, 1) tensor of per-example squared loss.
    update_op: A no-op, returned in the place of an update operation.
  """
  squared_error = math_ops.squared_difference(predictions, labels)
  # Sum over axis 1, keeping that axis so the weights multiply row-wise.
  unweighted_loss = math_ops.reduce_sum(squared_error, 1, keepdims=True)
  weighted_loss = unweighted_loss * weights

  return weighted_loss, control_flow_ops.no_op()
Ejemplo n.º 28
0
def least_squares_generator_loss(
        discriminator_gen_outputs,
        real_label=1,
        weights=1.0,
        scope=None,
        loss_collection=ops.GraphKeys.LOSSES,
        reduction=losses.Reduction.SUM_BY_NONZERO_WEIGHTS,
        add_summaries=False):
    """Least squares generator loss.

    This loss comes from `Least Squares Generative Adversarial Networks`
    (https://arxiv.org/abs/1611.04076).

    L = 1/2 * (D(G(z)) - `real_label`) ** 2

    where D(y) are discriminator logits.

    Args:
      discriminator_gen_outputs: Discriminator output on generated data.
        Expected to be in the range of (-inf, inf).
      real_label: The value that the generator is trying to get the
        discriminator to output on generated data.
      weights: Optional `Tensor` whose rank is either 0, or the same rank as
        `discriminator_gen_outputs`, and must be broadcastable to
        `discriminator_gen_outputs` (i.e., all dimensions must be either `1`,
        or the same as the corresponding dimension).
      scope: The scope for the operations performed in computing the loss.
      loss_collection: collection to which this loss will be added.
      reduction: A `tf.losses.Reduction` to apply to loss.
      add_summaries: Whether or not to add summaries for the loss.

    Returns:
      A loss Tensor. The shape depends on `reduction`.
    """
    with ops.name_scope(scope, 'lsq_generator_loss',
                        (discriminator_gen_outputs, real_label)) as scope:
        gen_logits = math_ops.to_float(discriminator_gen_outputs)
        # Half the squared distance to the label the generator aims for.
        half_squared_error = math_ops.squared_difference(
            gen_logits, real_label) / 2.0
        loss = losses.compute_weighted_loss(
            half_squared_error, weights, scope, loss_collection, reduction)

    if add_summaries:
        summary.scalar('generator_lsq_loss', loss)

    return loss
Ejemplo n.º 29
0
def sufficient_statistics(x, axes, shift=None, keep_dims=False, name=None):
  """Compute sufficient statistics for the mean and variance of `x`.

  Uses the one-pass algorithm on an optionally shifted input. See:
  https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Computing_shifted_data

  Args:
    x: A `Tensor`.
    axes: Array of ints. Axes along which to compute mean and variance.
      Duplicate entries are removed before use.
    shift: A `Tensor` containing the value by which to shift the data for
      numerical stability, or `None` if no shift is to be performed. A shift
      close to the true mean provides the most numerically stable results.
    keep_dims: Produce statistics with the same dimensionality as the input.
    name: Name used to scope the operations that compute the sufficient stats.

  Returns:
    Four `Tensor` objects of the same type as `x`:

    * the count (number of elements to average over).
    * the (possibly shifted) sum of the elements in the array.
    * the (possibly shifted) sum of squares of the elements in the array.
    * the shift by which the mean must be corrected or None if `shift` is None.
  """
  axes = list(set(axes))
  with ops.name_scope(name, "sufficient_statistics", [x, shift]):
    x = ops.convert_to_tensor(x, name="x")
    static_shape = x.get_shape()
    reduced_dims = [static_shape[d].value for d in axes]
    if any(dim is None for dim in reduced_dims):
      # At least one reduced dimension is unknown statically, so the count
      # has to be computed from the runtime shape.
      runtime_dims = array_ops.gather(
          math_ops.cast(array_ops.shape(x), x.dtype), axes)
      counts = math_ops.reduce_prod(runtime_dims, name="count")
    else:
      # All reduced dimensions are known: fold the count into a constant.
      num_elements = 1
      for dim in reduced_dims:
        num_elements *= dim
      counts = constant_op.constant(num_elements, dtype=x.dtype)
    if shift is None:
      m_ss = x
      v_ss = math_ops.square(x)
    else:
      shift = ops.convert_to_tensor(shift, name="shift")
      m_ss = math_ops.subtract(x, shift)
      v_ss = math_ops.squared_difference(x, shift)
    m_ss = math_ops.reduce_sum(m_ss, axes, keep_dims=keep_dims, name="mean_ss")
    v_ss = math_ops.reduce_sum(v_ss, axes, keep_dims=keep_dims, name="var_ss")
  return counts, m_ss, v_ss, shift
Ejemplo n.º 30
0
 def _variance(self):
   """Return the variance, which is only defined for `concentration > 2`.

   The value is `rate**2 / (concentration - 1)**2 / (concentration - 2)`.
   When `allow_nan_stats` is true, entries with `concentration <= 2` are
   replaced by NaN; otherwise an assertion that `2 < concentration` is
   attached to the result.
   """
   rate_squared = math_ops.square(self.rate)
   conc_minus_one_squared = math_ops.squared_difference(
       self.concentration, 1.)
   var = rate_squared / conc_minus_one_squared / (self.concentration - 2.)
   if not self.allow_nan_stats:
     concentration_check = check_ops.assert_less(
         constant_op.constant(2., dtype=self.dtype),
         self.concentration,
         message="variance undefined when any concentration <= 2")
     return control_flow_ops.with_dependencies([concentration_check], var)
   # NaN tensor with the batch shape, used where the variance is undefined.
   nan = array_ops.fill(
       self.batch_shape_tensor(),
       np.array(np.nan, dtype=self.dtype.as_numpy_dtype()),
       name="nan")
   return array_ops.where(self.concentration > 2., var, nan)
Ejemplo n.º 31
0
 def _variance(self):
     """Return the variance, which is only defined for `concentration > 2`.

     The value is `rate**2 / (concentration - 1)**2 / (concentration - 2)`.
     With `allow_nan_stats` set, undefined entries become NaN; otherwise the
     result carries an assertion that every concentration exceeds 2.
     """
     squared_rate = math_ops.square(self.rate)
     squared_shifted_concentration = math_ops.squared_difference(
         self.concentration, 1.)
     var = (squared_rate / squared_shifted_concentration /
            (self.concentration - 2.))
     if not self.allow_nan_stats:
         # Fail at run time rather than silently returning a bogus value.
         guard = check_ops.assert_less(
             constant_op.constant(2., dtype=self.dtype),
             self.concentration,
             message="variance undefined when any concentration <= 2")
         return control_flow_ops.with_dependencies([guard], var)
     nan_value = np.array(np.nan, dtype=self.dtype.as_numpy_dtype())
     nan = array_ops.fill(self.batch_shape_tensor(), nan_value, name="nan")
     return array_ops.where(self.concentration > 2., var, nan)
Ejemplo n.º 32
0
  def _calculate_mean_and_var(self, x, axes, keep_dims):
    """Compute the mean and variance of `x` over `axes`.

    Inside a replica context the per-replica sums, squared sums and batch
    sizes are all-reduced (summed), so the statistics cover every replica.
    Outside a replica context the statistics are computed locally.

    Args:
      x: Input tensor. float16 inputs are processed in float32 and the
        results are cast back to float16.
      axes: Sequence of ints, the axes to reduce over. `axes[0]` is treated
        as the batch axis; the remaining entries are the other reduced axes.
      keep_dims: If false, the reduced axes are squeezed out of the results.

    Returns:
      A `(mean, variance)` tuple. The variance is the biased (population)
      variance.
    """

    with backend.name_scope('moments'):
      # The dynamic range of fp16 is too limited to support the collection of
      # sufficient statistics. As a workaround we simply perform the operations
      # on 32-bit floats before converting the mean and variance back to fp16
      y = math_ops.cast(x, dtypes.float32) if x.dtype == dtypes.float16 else x
      replica_ctx = ds.get_replica_context()
      if replica_ctx:
        local_sum = math_ops.reduce_sum(y, axis=axes, keepdims=True)
        local_squared_sum = math_ops.reduce_sum(math_ops.square(y), axis=axes,
                                                keepdims=True)
        y_shape = array_ops.shape_v2(y)
        batch_size = math_ops.cast(y_shape[axes[0]], dtypes.float32)
        y_sum, y_squared_sum, global_batch_size = (
            replica_ctx.all_reduce(reduce_util.ReduceOp.SUM, [
                local_sum, local_squared_sum, batch_size]))

        # Sizes of the non-batch reduced axes. These must be looked up through
        # `axes` itself: indexing the shape with positions 1..len(axes)-1 is
        # only right when the reduced axes happen to be 0..len(axes)-1, and
        # picks the wrong dimensions for e.g. channels-first axes [0, 2, 3].
        axes_vals = [y_shape[axis] for axis in axes[1:]]
        multiplier = math_ops.cast(math_ops.reduce_prod(axes_vals),
                                   dtypes.float32)
        multiplier = multiplier * global_batch_size

        mean = y_sum / multiplier
        y_squared_mean = y_squared_sum / multiplier
        # var = E(x^2) - E(x)^2
        variance = y_squared_mean - math_ops.square(mean)
      else:
        # Compute true mean while keeping the dims for proper broadcasting.
        mean = math_ops.reduce_mean(y, axes, keepdims=True, name='mean')
        # sample variance, not unbiased variance
        # Note: stop_gradient does not change the gradient that gets
        #       backpropagated to the mean from the variance calculation,
        #       because that gradient is zero
        variance = math_ops.reduce_mean(
            math_ops.squared_difference(y, array_ops.stop_gradient(mean)),
            axes,
            keepdims=True,
            name='variance')
      if not keep_dims:
        mean = array_ops.squeeze(mean, axes)
        variance = array_ops.squeeze(variance, axes)
      if x.dtype == dtypes.float16:
        return (math_ops.cast(mean, dtypes.float16),
                math_ops.cast(variance, dtypes.float16))
      else:
        return (mean, variance)
Ejemplo n.º 33
0
def least_squares_generator_loss(
    discriminator_gen_outputs,
    real_label=1,
    weights=1.0,
    scope=None,
    loss_collection=ops.GraphKeys.LOSSES,
    reduction=losses.Reduction.SUM_BY_NONZERO_WEIGHTS,
    add_summaries=False):
  """Compute the generator loss of a least-squares GAN.

  Implements the generator objective from `Least Squares Generative
  Adversarial Networks` (https://arxiv.org/abs/1611.04076):

    L = 1/2 * (D(G(z)) - `real_label`) ** 2

  where D(y) are discriminator logits.

  Args:
    discriminator_gen_outputs: Discriminator output on generated data. Expected
      to be in the range of (-inf, inf). Converted to float.
    real_label: The value the generator wants the discriminator to output on
      generated data.
    weights: Optional `Tensor` whose rank is either 0, or the same rank as
      `discriminator_gen_outputs`, and must be broadcastable to
      `discriminator_gen_outputs` (i.e., all dimensions must be either `1`, or
      the same as the corresponding dimension).
    scope: The scope for the operations performed in computing the loss.
    loss_collection: Collection to which this loss will be added.
    reduction: A `tf.losses.Reduction` to apply to the loss.
    add_summaries: Whether or not to add a scalar summary for the loss.

  Returns:
    A loss Tensor. The shape depends on `reduction`.
  """
  scope_inputs = (discriminator_gen_outputs, real_label)
  with ops.name_scope(scope, 'lsq_generator_loss', scope_inputs) as loss_scope:
    gen_logits = math_ops.to_float(discriminator_gen_outputs)
    half_squared_error = math_ops.squared_difference(
        gen_logits, real_label) / 2.0
    # Weighting, reduction and collection registration are delegated.
    loss = losses.compute_weighted_loss(
        half_squared_error, weights, loss_scope, loss_collection, reduction)

  if add_summaries:
    summary.scalar('generator_lsq_loss', loss)

  return loss
Ejemplo n.º 34
0
def mean_squared_logarithmic_error(y_true, y_pred):
  """Computes the mean squared logarithmic error between `y_true` and `y_pred`.

  Both inputs are floored at `K.epsilon()` and incremented by one before the
  logarithm is taken:

  `loss = mean(square(log(y_true + 1) - log(y_pred + 1)), axis=-1)`

  Args:
    y_true: Ground truth values. shape = `[batch_size, d0, .. dN]`. Cast to
      the dtype of `y_pred`.
    y_pred: The predicted values. shape = `[batch_size, d0, .. dN]`.

  Returns:
    Mean squared logarithmic error values. shape = `[batch_size, d0, .. dN-1]`.
  """
  y_pred = ops.convert_to_tensor(y_pred)
  y_true = math_ops.cast(y_true, y_pred.dtype)
  floor = K.epsilon()
  log1p_pred = math_ops.log(K.maximum(y_pred, floor) + 1.)
  log1p_true = math_ops.log(K.maximum(y_true, floor) + 1.)
  squared_log_error = math_ops.squared_difference(log1p_pred, log1p_true)
  return K.mean(squared_log_error, axis=-1)
Ejemplo n.º 35
0
def mean_squared_error(y_true, y_pred):
    """Return the mean squared error between labels and predictions.

    The element-wise squared difference is averaged over the last axis:

    `loss = mean(square(y_true - y_pred), axis=-1)`

    Args:
      y_true: Ground truth values. shape = `[batch_size, d0, .. dN]`. Cast to
        the dtype of `y_pred`.
      y_pred: The predicted values. shape = `[batch_size, d0, .. dN]`.

    Returns:
      Mean squared error values. shape = `[batch_size, d0, .. dN-1]`.
    """
    y_pred = ops.convert_to_tensor(y_pred)
    y_true = math_ops.cast(y_true, y_pred.dtype)
    squared_error = math_ops.squared_difference(y_pred, y_true)
    return K.mean(squared_error, axis=-1)
Ejemplo n.º 36
0
def per_example_squared_loss(labels, weights, predictions):
    """Compute the weighted squared loss of each example.

    Args:
      labels: Rank 2 (N, D) tensor of per-example labels.
      weights: Rank 2 (N, 1) tensor of per-example weights.
      predictions: Rank 2 (N, D) tensor of per-example predictions.

    Returns:
      loss: A Rank 2 (N, 1) tensor of per-example squared loss, i.e. the
        squared differences summed over axis 1 and multiplied by `weights`.
      update_op: A no-op; this loss has no internal state to update.
    """
    squared_error = math_ops.squared_difference(predictions, labels)
    summed_error = math_ops.reduce_sum(squared_error, 1, keepdims=True)
    weighted_loss = summed_error * weights
    return weighted_loss, control_flow_ops.no_op()
Ejemplo n.º 37
0
def _BatchNormGrad(grad_y, x, scale, epsilon, data_format):
    """Compute the gradients of BatchNorm with respect to its 3 inputs.

    Args:
      grad_y: A `Tensor` of 4 dimensions for gradient for y.
      x: A `Tensor` of 4 dimensions for x.
      scale: A `Tensor` of 1 dimension for scaling.
      epsilon: A small float number added to the variance of x.
      data_format: The data format for input. Either b"NHWC" or b"NCHW".

    Returns:
      A tuple (grad_x, grad_scale, grad_offset), where grad_x is the gradient
      for x, grad_scale the gradient for scale, and grad_offset the gradient
      for offset.
    """
    if data_format != b"NHWC":
        # Channels are not last: keep reduced dims so the statistics still
        # broadcast against x, and give scale a matching 4-D shape.
        keep_dims = True
        reduce_axis = [0, 2, 3]
        scale = array_ops.reshape(scale, [1, array_ops.size(scale), 1, 1])
    else:
        keep_dims = False
        reduce_axis = [0, 1, 2]
    grad_y_mean = math_ops.reduce_mean(grad_y,
                                       reduce_axis,
                                       keep_dims=keep_dims)
    x_mean = math_ops.reduce_mean(x, reduce_axis, keep_dims=keep_dims)
    squared_deviation = math_ops.squared_difference(
        x, array_ops.stop_gradient(x_mean))
    x_var = math_ops.reduce_mean(squared_deviation,
                                 reduce_axis,
                                 keep_dims=keep_dims)
    grad_y_centered = grad_y - grad_y_mean
    x_centered = x - x_mean
    grad_x_cov = math_ops.reduce_mean(grad_y * x_centered,
                                      axis=reduce_axis,
                                      keep_dims=keep_dims)
    scaled_inv_std = scale * math_ops.rsqrt(x_var + epsilon)
    correction = math_ops.reciprocal(x_var + epsilon) * grad_x_cov * x_centered
    grad_x = scaled_inv_std * (grad_y_centered - correction)
    inv_std = math_ops.rsqrt(x_var + epsilon)
    grad_x_sum = math_ops.reduce_sum(grad_y * x_centered,
                                     axis=reduce_axis,
                                     keep_dims=keep_dims)
    grad_scale = inv_std * grad_x_sum
    if data_format == b"NCHW":
        # Drop the singleton dims that keep_dims=True left behind.
        grad_scale = array_ops.squeeze(grad_scale)
    grad_offset = math_ops.reduce_sum(grad_y, axis=reduce_axis)
    return grad_x, grad_scale, grad_offset
Ejemplo n.º 38
0
def mse_align_to_y_true(y_true, y_pred):
    """Mean squared error over the last two axes, ignoring NaN targets.

    The inputs are first passed through `_align_y_pre_process`. NaNs in both
    aligned tensors are replaced by zero, the squared differences are summed
    over the last two axes, and the sum is divided by the number of non-NaN
    entries of the aligned `y_true` along those axes.

    NOTE(review): where the aligned `y_true` is NaN but the aligned `y_pred`
    is not, the prediction still adds `y_pred ** 2` to the sum. Confirm that
    `_align_y_pre_process` makes the NaN positions coincide.
    """
    aligned_true, aligned_pred = _align_y_pre_process(y_true, y_pred)
    true_is_nan = tf.math.is_nan(aligned_true)
    valid_count = tf.math.count_nonzero(tf.where(true_is_nan, 0, 1),
                                        axis=[-1, -2],
                                        dtype=tf.dtypes.float32)
    clean_true = tf.where(true_is_nan, 0., aligned_true)
    clean_pred = tf.where(tf.math.is_nan(aligned_pred), 0., aligned_pred)

    squared_error = math_ops.squared_difference(clean_pred, clean_true)
    total_error = tf.math.reduce_sum(squared_error, axis=[-1, -2])
    return tf.math.divide(total_error, valid_count)
Ejemplo n.º 39
0
  def _testGrad(self, left_shape, right_shape):
    """Check the gradients of `squared_difference` for a pair of shapes.

    Random normal inputs of the given shapes are fed through
    `math_ops.squared_difference`, and the gradient error reported by the
    gradient checker for each input must be below 1e-10. The output shape is
    taken to be the shape with more dimensions (`right_shape` on a tie).
    """
    output_shape = (
        left_shape if len(left_shape) > len(right_shape) else right_shape)
    left_values = np.random.randn(*left_shape)
    right_values = np.random.randn(*right_shape)

    with self.cached_session(use_gpu=True):
      left_tensor = constant_op.constant(left_values, shape=left_shape)
      right_tensor = constant_op.constant(right_values, shape=right_shape)
      output = math_ops.squared_difference(left_tensor, right_tensor)
      left_err = gradient_checker.compute_gradient_error(
          left_tensor,
          left_shape,
          output,
          output_shape,
          x_init_value=left_values)
      right_err = gradient_checker.compute_gradient_error(
          right_tensor,
          right_shape,
          output,
          output_shape,
          x_init_value=right_values)
    self.assertLess(left_err, 1e-10)
    self.assertLess(right_err, 1e-10)
Ejemplo n.º 40
0
def _BatchNormGrad(grad_y, x, scale, epsilon, data_format):
  """Compute the gradients of BatchNorm with respect to its 3 inputs.

  Args:
    grad_y: A `Tensor` of 4 dimensions for gradient for y.
    x: A `Tensor` of 4 dimensions for x.
    scale: A `Tensor` of 1 dimension for scaling.
    epsilon: A small float number added to the variance of x.
    data_format: The data format for input. Either b"NHWC" or b"NCHW".

  Returns:
    A tuple (grad_x, grad_scale, grad_offset), where grad_x is the gradient
    for x, grad_scale the gradient for scale, and grad_offset the gradient
    for offset.
  """
  if data_format != b"NHWC":
    # Channels are not last: keep reduced dims so the statistics still
    # broadcast against x, and give scale a matching 4-D shape.
    keep_dims = True
    reduce_axis = [0, 2, 3]
    scale = array_ops.reshape(scale, [1, array_ops.size(scale), 1, 1])
  else:
    keep_dims = False
    reduce_axis = [0, 1, 2]
  grad_y_mean = math_ops.reduce_mean(grad_y, reduce_axis, keep_dims=keep_dims)
  x_mean = math_ops.reduce_mean(x, reduce_axis, keep_dims=keep_dims)
  squared_deviation = math_ops.squared_difference(
      x, array_ops.stop_gradient(x_mean))
  x_var = math_ops.reduce_mean(
      squared_deviation, reduce_axis, keep_dims=keep_dims)
  grad_y_centered = grad_y - grad_y_mean
  x_centered = x - x_mean
  grad_x_cov = math_ops.reduce_mean(
      grad_y * x_centered, axis=reduce_axis, keep_dims=keep_dims)
  scaled_inv_std = scale * math_ops.rsqrt(x_var + epsilon)
  correction = math_ops.reciprocal(x_var + epsilon) * grad_x_cov * x_centered
  grad_x = scaled_inv_std * (grad_y_centered - correction)
  inv_std = math_ops.rsqrt(x_var + epsilon)
  grad_x_sum = math_ops.reduce_sum(
      grad_y * x_centered, axis=reduce_axis, keep_dims=keep_dims)
  grad_scale = inv_std * grad_x_sum
  if data_format == b"NCHW":
    # Drop the singleton dims that keep_dims=True left behind.
    grad_scale = array_ops.squeeze(grad_scale)
  grad_offset = math_ops.reduce_sum(grad_y, axis=reduce_axis)
  return grad_x, grad_scale, grad_offset
Ejemplo n.º 41
0
def _kl_normal_normal(n_a, n_b, name=None):
  """Compute the batched KL divergence KL(n_a || n_b) of two Normals.

  The result is
  `(loc_a - loc_b)**2 / (2 * scale_b**2) + 0.5 * (r - 1 - log(r))`
  with `r = scale_a**2 / scale_b**2`.

  Args:
    n_a: instance of a Normal distribution object.
    n_b: instance of a Normal distribution object.
    name: (optional) Name to use for created operations.
      default is "kl_normal_normal".

  Returns:
    Batchwise KL(n_a || n_b)
  """
  with ops.name_scope(name, "kl_normal_normal", [n_a.loc, n_b.loc]):
    # Constants are created in n_a's dtype so no implicit casts are needed.
    one = constant_op.constant(1, dtype=n_a.dtype)
    two = constant_op.constant(2, dtype=n_a.dtype)
    half = constant_op.constant(0.5, dtype=n_a.dtype)
    var_a = math_ops.square(n_a.scale)
    var_b = math_ops.square(n_b.scale)
    var_ratio = var_a / var_b
    loc_term = math_ops.squared_difference(n_a.loc, n_b.loc) / (two * var_b)
    scale_term = half * (var_ratio - one - math_ops.log(var_ratio))
    return loc_term + scale_term
Ejemplo n.º 42
0
def _kl_normal_normal(n_a, n_b, name=None):
  """Compute the batched KL divergence KL(n_a || n_b) of two Normals.

  The result is
  `(loc_a - loc_b)**2 / (2 * scale_b**2) + 0.5 * (r - 1 - log(r))`
  with `r = scale_a**2 / scale_b**2`.

  Args:
    n_a: instance of a Normal distribution object.
    n_b: instance of a Normal distribution object.
    name: (optional) Name to use for created operations.
      default is "kl_normal_normal".

  Returns:
    Batchwise KL(n_a || n_b)
  """
  with ops.name_scope(name, "kl_normal_normal", [n_a.loc, n_b.loc]):
    # Constants are created in n_a's dtype so no implicit casts are needed.
    one = constant_op.constant(1, dtype=n_a.dtype)
    two = constant_op.constant(2, dtype=n_a.dtype)
    half = constant_op.constant(0.5, dtype=n_a.dtype)
    variance_a = math_ops.square(n_a.scale)
    variance_b = math_ops.square(n_b.scale)
    variance_ratio = variance_a / variance_b
    mean_shift_term = (
        math_ops.squared_difference(n_a.loc, n_b.loc) / (two * variance_b))
    variance_term = half * (
        variance_ratio - one - math_ops.log(variance_ratio))
    return mean_shift_term + variance_term
Ejemplo n.º 43
0
def dsnt_mse(y_true, y_pred):
    """Return the mean squared error between labels and predictions.

    The element-wise squared difference is averaged over the last axis:

    `loss = mean(square(y_true - y_pred), axis=-1)`

    Args:
      y_true: Ground truth values. shape = `[batch_size, d0, .. dN]`. Cast to
        the dtype of `y_pred`.
      y_pred: The predicted values. shape = `[batch_size, d0, .. dN]`.

    Returns:
      Mean squared error values. shape = `[batch_size, d0, .. dN-1]`.
    """
    y_pred = ops.convert_to_tensor_v2(y_pred)
    y_true = math_ops.cast(y_true, y_pred.dtype)
    squared_error = math_ops.squared_difference(y_pred, y_true)
    return K.mean(squared_error, axis=-1)
    def update_state(self, y_true, y_pred, sample_weight=None):
        """Add the weighted mean squared error of a batch to `self.metric`.

        `y_true` packs two things along its second axis: columns `[:4]` are
        the targets and columns `[4:]` are the weights applied to the squared
        errors. For each row the weighted squared errors are summed and
        divided by the sum of that row's weights; the mean over rows is then
        added to `self.metric`.

        Args:
          y_true: Tensor whose first 4 columns are targets and whose remaining
            columns are weights.
          y_pred: Predictions compared against the targets.
          sample_weight: Unused.

        Returns:
          The result of `self.metric.assign_add`.
        """
        weights = y_true[:, 4:]
        targets = y_true[:, :4]

        y_pred = ops.convert_to_tensor(y_pred)
        targets = math_ops.cast(targets, y_pred.dtype)

        weighted_squared_error = tf.math.multiply(
            math_ops.squared_difference(y_pred, targets), weights)
        per_row_error = (K.sum(weighted_squared_error, axis=-1) /
                         K.sum(weights, axis=-1))
        loss = K.mean(per_row_error, axis=-1)

        return self.metric.assign_add(loss)
Ejemplo n.º 45
0
  def _define_diag_covariance_probs(self, shard_id, shard):
    """Defines the diagonal covariance probabilities per example in a class.

    The result is stored in `self._probs[shard_id]`; nothing is returned.
    According to the original author's notes the stored matrix is
    num_examples X num_classes.

    Args:
      shard_id: id of the current shard.
      shard: current data shard, 1 X num_examples X dimensions.
    """
    # num_classes X 1
    # TODO(xavigonzalvo): look into alternatives to log for
    # reparametrization of variance parameters.
    log_det = math_ops.reduce_sum(
        math_ops.log(self._covs + 1e-3), 1, keepdims=True)
    squared_dist = math_ops.squared_difference(shard, self._means)
    # 1e-3 keeps the inverse (and the log above) finite for tiny covariances.
    inverse_cov = array_ops.expand_dims(1.0 / (self._covs + 1e-3), 2)
    # num_classes X num_examples
    scaled_dist = math_ops.matmul(squared_dist, inverse_cov)
    scaled_dist = array_ops.transpose(array_ops.squeeze(scaled_dist, [2]))
    log_norm_const = (
        math_ops.to_float(self._dimensions) * math_ops.log(2.0 * np.pi))
    log_det_row = array_ops.transpose(log_det)
    self._probs[shard_id] = -0.5 * (
        log_norm_const + log_det_row + scaled_dist)
def frechet_classifier_distance_from_activations(real_activations,
                                                 generated_activations):
    """Compute the Frechet distance between two sets of activations.

    With `m`, `m_w` the column means and `C`, `C_w` the sample covariance
    matrices (normalized by `num_examples - 1`) of the real and generated
    activations, the result is

        |m - m_w|^2 + Tr(C + C_w) - 2 * trace_sqrt_product(C, C_w)

    The computation runs in float64; the result is cast back to the dtype of
    `real_activations` when that dtype is not float64.

    Args:
      real_activations: Rank-2 Tensor of activations of real data.
      generated_activations: Rank-2 Tensor of activations of generated data.

    Returns:
      The distance, in the dtype of `real_activations`.
    """
    real_activations.shape.assert_has_rank(2)
    generated_activations.shape.assert_has_rank(2)

    activations_dtype = real_activations.dtype
    needs_cast = activations_dtype != dtypes.float64
    if needs_cast:
        real_activations = math_ops.to_double(real_activations)
        generated_activations = math_ops.to_double(generated_activations)

    mean_real = math_ops.reduce_mean(real_activations, 0)
    mean_gen = math_ops.reduce_mean(generated_activations, 0)
    count_real = math_ops.to_double(array_ops.shape(real_activations)[0])
    count_gen = math_ops.to_double(array_ops.shape(generated_activations)[0])

    # Sample covariance of the real activations.
    centered_real = real_activations - mean_real
    cov_real = math_ops.matmul(
        centered_real, centered_real, transpose_a=True) / (count_real - 1)

    # Sample covariance of the generated activations.
    centered_gen = generated_activations - mean_gen
    cov_gen = math_ops.matmul(
        centered_gen, centered_gen, transpose_a=True) / (count_gen - 1)

    sqrt_trace_component = trace_sqrt_product(cov_real, cov_gen)

    trace_term = (
        math_ops.trace(cov_real + cov_gen) - 2.0 * sqrt_trace_component)
    mean_term = math_ops.reduce_sum(
        math_ops.squared_difference(mean_real, mean_gen))

    fid = trace_term + mean_term
    if needs_cast:
        fid = math_ops.cast(fid, activations_dtype)

    return fid
Ejemplo n.º 47
0
def contrastive_loss(labels,
                     embeddings_anchor,
                     embeddings_positive,
                     margin=1.0):
    """Computes the contrastive loss.

    Pairs labelled 1 are penalized by their squared Euclidean distance; pairs
    labelled 0 are penalized by `max(margin - distance, 0) ** 2`. The loss is
    the mean over all pairs.
    See: http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf

    Args:
      labels: 1-D tf.int32 `Tensor` with shape [batch_size] of
        binary labels indicating positive vs negative pair.
      embeddings_anchor: 2-D float `Tensor` of embedding vectors for the anchor
        images. Embeddings should be l2 normalized.
      embeddings_positive: 2-D float `Tensor` of embedding vectors for the
        positive images. Embeddings should be l2 normalized.
      margin: margin term in the loss definition.

    Returns:
      contrastive_loss: tf.float32 scalar.
    """
    # Euclidean distance of every (anchor, positive) pair.
    squared_diffs = math_ops.squared_difference(embeddings_anchor,
                                                embeddings_positive)
    distances = math_ops.sqrt(math_ops.reduce_sum(squared_diffs, 1))

    # label here is {0,1} for neg, pos.
    positive_weight = math_ops.cast(labels, distances.dtype)
    positive_term = positive_weight * math_ops.square(distances)
    negative_weight = 1. - math_ops.cast(labels, distances.dtype)
    negative_term = negative_weight * math_ops.square(
        math_ops.maximum(margin - distances, 0.))
    return math_ops.reduce_mean(positive_term + negative_term,
                                name='contrastive_loss')
Ejemplo n.º 48
0
def mean_squared_error(y_true, y_pred):
    """Return `mean(square(y_pred - y_true), axis=-1)`.

    `y_pred` is converted to a tensor and `y_true` is cast to its dtype
    before the squared difference is averaged over the last axis.
    """
    y_pred = ops.convert_to_tensor(y_pred)
    y_true = math_ops.cast(y_true, y_pred.dtype)
    squared_error = math_ops.squared_difference(y_pred, y_true)
    return K.mean(squared_error, axis=-1)
Ejemplo n.º 49
0
def _reduce_variance(x, axis=None, keepdims=False):
  """Return the biased variance of `x` along `axis`.

  The mean is computed with `keepdims=True` so it broadcasts against `x`;
  the caller's `keepdims` only applies to the final reduction.
  """
  mean = math_ops.reduce_mean(x, axis, keepdims=True)
  squared_deviation = math_ops.squared_difference(x, mean)
  return math_ops.reduce_mean(squared_deviation, axis, keepdims)
Ejemplo n.º 50
0
def _BatchNormGrad(grad_y,
                   x,
                   scale,
                   pop_mean,
                   pop_var,
                   epsilon,
                   data_format,
                   is_training=True):
  """Compute the gradients of BatchNorm with respect to its 3 inputs.

  Args:
    grad_y: A `Tensor` of 4 dimensions for gradient for y.
    x: A `Tensor` of 4 dimensions for x.
    scale: A `Tensor` of 1 dimension for scaling.
    pop_mean: A `Tensor` of 1 dimension for the population mean. Only used when
      is_training=False.
    pop_var: A `Tensor` of 1 dimension for the population variance. Only used
      when is_training=False.
    epsilon: A small float number added to the variance of x.
    data_format: The data format for input. Either b"NHWC" or b"NCHW".
    is_training: A bool value to indicate the operation is for training
      (default) or inference.

  Returns:
    A tuple (grad_x, grad_scale, grad_offset), where grad_x is the gradient
    for x (cast back to the dtype of x), grad_scale the gradient for scale,
    and grad_offset the gradient for offset.
  """
  x_dtype = x.dtype.base_dtype
  if x_dtype == dtypes.float16:
    # float16 math is too imprecise, so we do the batch norm gradient
    # computations in float32.
    x = math_ops.cast(x, dtypes.float32)
    grad_y = math_ops.cast(grad_y, dtypes.float32)

  if not is_training:
    # Inference: the statistics are the fixed population moments.
    if data_format == b"NHWC":
      reduce_axis = [0, 1, 2]
    else:
      reduce_axis = [0, 2, 3]
      channel_shape = [1, array_ops.size(pop_mean), 1, 1]
      pop_mean = array_ops.reshape(pop_mean, channel_shape)
      pop_var = array_ops.reshape(pop_var, channel_shape)
      scale = array_ops.reshape(scale, channel_shape)

    grad_offset = math_ops.reduce_sum(grad_y, axis=reduce_axis)
    inv_std = math_ops.rsqrt(pop_var + epsilon)
    grad_scale = math_ops.reduce_sum(
        grad_y * (x - pop_mean) * inv_std, axis=reduce_axis)
    grad_x = grad_y * scale * inv_std
    return math_ops.cast(grad_x, x_dtype), grad_scale, grad_offset

  # Training: the statistics are the batch moments of x.
  if data_format == b"NHWC":
    keepdims = False
    reduce_axis = [0, 1, 2]
  else:
    # Keep reduced dims so the statistics still broadcast against x, and give
    # scale a matching 4-D shape.
    keepdims = True
    reduce_axis = [0, 2, 3]
    scale = array_ops.reshape(scale, [1, array_ops.size(scale), 1, 1])
  grad_y_mean = math_ops.reduce_mean(grad_y, reduce_axis, keepdims=keepdims)
  x_mean = math_ops.reduce_mean(x, reduce_axis, keepdims=keepdims)
  squared_deviation = math_ops.squared_difference(
      x, array_ops.stop_gradient(x_mean))
  x_var = math_ops.reduce_mean(
      squared_deviation, reduce_axis, keepdims=keepdims)
  grad_y_centered = grad_y - grad_y_mean
  x_centered = x - x_mean
  grad_x_cov = math_ops.reduce_mean(
      grad_y * x_centered, axis=reduce_axis, keepdims=keepdims)
  scaled_inv_std = scale * math_ops.rsqrt(x_var + epsilon)
  correction = math_ops.reciprocal(x_var + epsilon) * grad_x_cov * x_centered
  grad_x = scaled_inv_std * (grad_y_centered - correction)
  batch_inv_std = math_ops.rsqrt(x_var + epsilon)
  grad_x_sum = math_ops.reduce_sum(
      grad_y * x_centered, axis=reduce_axis, keepdims=keepdims)
  grad_scale = batch_inv_std * grad_x_sum
  if data_format == b"NCHW":
    # Drop the singleton dims that keepdims=True left behind.
    grad_scale = array_ops.squeeze(grad_scale)
  grad_offset = math_ops.reduce_sum(grad_y, axis=reduce_axis)
  return math_ops.cast(grad_x, x_dtype), grad_scale, grad_offset
Ejemplo n.º 51
0
Archivo: nn.py Proyecto: 01-/tensorflow
def sufficient_statistics(x, axes, shift=False, keep_dims=False, name=None):
  """Compute sufficient statistics for the mean and variance of `x`.

  Uses the one-pass algorithm on an input that is optionally shifted by the
  value of the 1st element in `x` along the reduced axes. See:
  https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Computing_shifted_data
  Unfortunately, in some cases using a random individual sample as the shift
  value leads experimentally to very poor numerical stability, so it is disabled
  by default. The one-pass approach might have to be revised accordingly.

  Args:
    x: A `Tensor`.
    axes: Array of ints. Axes along which to compute mean and variance.
    shift: If true, shift the data to provide more numerically stable results.
    keep_dims: produce statistics with the same dimensionality as the input.
    name: Name used to scope the operations that compute the sufficient stats.

  Returns:
    Four `Tensor` objects of the same type as `x`:
    * the count (number of elements to average over).
    * the (possibly shifted) sum of the elements in the array.
    * the (possibly shifted) sum of squares of the elements in the array.
    * the shift by which the mean must be corrected or None if `shift` is False.
  """
  with ops.op_scope([x, axes], name, "sufficient_statistics"):
    x = ops.convert_to_tensor(x, name="x")
    x_shape = x.get_shape()
    if x_shape.is_fully_defined():
      # Static shape: fold the count into a constant and build m_shape, the
      # shape of x with every reduced axis collapsed to 1.
      reduced_axes = set(axes)
      counts = 1
      m_shape = []
      for d in xrange(x_shape.ndims):
        if d in reduced_axes:
          counts *= x_shape[d].value
          m_shape.append(1)
        else:
          m_shape.append(x_shape[d].value)
      counts = constant_op.constant(counts, dtype=x.dtype)
    else:  # shape needs to be inferred at runtime.
      x_shape = array_ops.shape(x)
      is_reduced_axis = sparse_ops.sparse_to_dense(
          axes, array_ops.shape(x_shape), True, False)
      m_shape = math_ops.select(
          is_reduced_axis, array_ops.ones_like(x_shape), x_shape)
      counts = math_ops.cast(
          math_ops.reduce_prod(x_shape / m_shape),
          x.dtype,
          name="count")
    if not shift:
      m_ss = x
      v_ss = math_ops.square(x)
      shift_value = None
    else:
      # The shift is the slice of x starting at the origin with shape m_shape.
      shift_value = array_ops.slice(x, array_ops.zeros_like(m_shape), m_shape)
      m_ss = math_ops.sub(x, shift_value)
      v_ss = math_ops.squared_difference(x, shift_value)
      if keep_dims:
        shift_value = array_ops.identity(shift_value, name="shift")
      else:
        shift_value = array_ops.squeeze(shift_value,
                                        squeeze_dims=axes,
                                        name="shift")
    m_ss = math_ops.reduce_sum(m_ss, axes, keep_dims=keep_dims, name="mean_ss")
    v_ss = math_ops.reduce_sum(v_ss, axes, keep_dims=keep_dims, name="var_ss")
  return counts, m_ss, v_ss, shift_value
Ejemplo n.º 52
0
def mean_squared_logarithmic_error(y_true, y_pred):
    """Return `mean(square(log(y_pred + 1) - log(y_true + 1)), axis=-1)`.

    Both inputs are clipped from below at `K.epsilon()` before the `+ 1` and
    the logarithm; `y_true` is cast to the dtype of `y_pred`.
    """
    y_pred = ops.convert_to_tensor(y_pred)
    y_true = math_ops.cast(y_true, y_pred.dtype)
    lower_bound = K.epsilon()
    log1p_pred = math_ops.log(K.clip(y_pred, lower_bound, None) + 1.)
    log1p_true = math_ops.log(K.clip(y_true, lower_bound, None) + 1.)
    squared_log_error = math_ops.squared_difference(log1p_pred, log1p_true)
    return K.mean(squared_log_error, axis=-1)
Ejemplo n.º 53
0
def mean_squared_logarithmic_error(y_true, y_pred):
  """Return `mean(square(log(y_pred + 1) - log(y_true + 1)), axis=-1)`.

  Both inputs are clipped from below at `K.epsilon()` before the `+ 1` and
  the logarithm; `y_true` is cast to the dtype of `y_pred`.
  """
  y_pred = ops.convert_to_tensor(y_pred)
  y_true = math_ops.cast(y_true, y_pred.dtype)
  lower_bound = K.epsilon()
  log1p_pred = math_ops.log(K.clip(y_pred, lower_bound, None) + 1.)
  log1p_true = math_ops.log(K.clip(y_true, lower_bound, None) + 1.)
  squared_log_error = math_ops.squared_difference(log1p_pred, log1p_true)
  return K.mean(squared_log_error, axis=-1)
def frechet_classifier_distance_from_activations(real_activations,
                                                 generated_activations):
    """Classifier distance for evaluating a generative model.

    This method computes the Frechet classifier distance from activations of
    real images and generated images. This can be used independently of the
    frechet_classifier_distance() method, especially in the case of using large
    batches during evaluation where we would like to precompute all of the
    activations before computing the classifier distance.

    This technique is described in detail in https://arxiv.org/abs/1706.08500.
    Given two Gaussian distributions with means m and m_w and covariance
    matrices C and C_w, this function calculates

                  |m - m_w|^2 + Tr(C + C_w - 2(C * C_w)^(1/2))

    which captures how different the distributions of real images and generated
    images (or more accurately, their visual features) are. Note that unlike the
    Inception score, this is a true distance and utilizes information about real
    world images.

    Note that when computed using sample means and sample covariance matrices,
    Frechet distance is biased. It is more biased for small sample sizes. (e.g.
    even if the two distributions are the same, for a small sample size, the
    expected Frechet distance is large). It is important to use the same
    sample size to compute frechet classifier distance when comparing two
    generative models.

    All intermediate arithmetic is carried out in float64; the result is cast
    back to the dtype of `real_activations` before being returned.

    Args:
      real_activations: 2D Tensor containing activations of real data. Shape is
        [batch_size, activation_size].
      generated_activations: 2D Tensor containing activations of generated data.
        Shape is [batch_size, activation_size].

    Returns:
     The Frechet Inception distance. A floating-point scalar with the same
     dtype as `real_activations`.

    """
    real_activations.shape.assert_has_rank(2)
    generated_activations.shape.assert_has_rank(2)

    activations_dtype = real_activations.dtype
    # Promote each input on its own: guarding both casts with the dtype of
    # `real_activations` alone would leave a non-float64 `generated_activations`
    # untouched whenever the real activations are already float64.
    if real_activations.dtype != dtypes.float64:
        real_activations = math_ops.cast(real_activations, dtypes.float64)
    if generated_activations.dtype != dtypes.float64:
        generated_activations = math_ops.cast(
            generated_activations, dtypes.float64)

    # Compute mean and covariance matrices of activations.
    m = math_ops.reduce_mean(real_activations, 0)
    m_w = math_ops.reduce_mean(generated_activations, 0)
    num_examples_real = math_ops.cast(
        array_ops.shape(real_activations)[0], dtypes.float64)
    num_examples_generated = math_ops.cast(
        array_ops.shape(generated_activations)[0], dtypes.float64)

    # sigma = (1 / (n - 1)) * (X - mu) (X - mu)^T
    real_centered = real_activations - m
    sigma = math_ops.matmul(real_centered, real_centered,
                            transpose_a=True) / (num_examples_real - 1)

    gen_centered = generated_activations - m_w
    sigma_w = math_ops.matmul(gen_centered, gen_centered,
                              transpose_a=True) / (num_examples_generated - 1)

    # Find the Tr(sqrt(sigma sigma_w)) component of FID
    sqrt_trace_component = trace_sqrt_product(sigma, sigma_w)

    # Compute the two components of FID.

    # First the covariance component.
    # Here, note that trace(A + B) = trace(A) + trace(B)
    trace = math_ops.trace(sigma + sigma_w) - 2.0 * sqrt_trace_component

    # Next the distance between means.
    mean = math_ops.reduce_sum(math_ops.squared_difference(
        m, m_w))  # Equivalent to L2 but more stable.
    fid = trace + mean
    # Hand the result back in the caller's original precision.
    if activations_dtype != dtypes.float64:
        fid = math_ops.cast(fid, activations_dtype)

    return fid
Ejemplo n.º 55
0
def diagonal_only_frechet_classifier_distance_from_activations(
    real_activations, generated_activations):
  """Classifier distance for evaluating a generative model.

  This is based on the Frechet Inception distance, but for an arbitrary
  classifier.

  This technique is described in detail in https://arxiv.org/abs/1706.08500.
  Given two Gaussian distributions with means m and m_w and covariance matrices
  C and C_w, this function calculates

          |m - m_w|^2 + (sigma + sigma_w - 2(sigma x sigma_w)^(1/2))

  which captures how different the distributions of real images and generated
  images (or more accurately, their visual features) are. Note that unlike the
  Inception score, this is a true distance and utilizes information about real
  world images. In this variant, we compute diagonal-only covariance matrices.
  As a result, instead of computing an expensive matrix square root, we can do
  something much simpler, and has O(n) vs O(n^2) space complexity.

  Note that when computed using sample means and sample covariance matrices,
  Frechet distance is biased. It is more biased for small sample sizes. (e.g.
  even if the two distributions are the same, for a small sample size, the
  expected Frechet distance is large). It is important to use the same
  sample size to compute frechet classifier distance when comparing two
  generative models.

  All intermediate arithmetic is carried out in float64; the result is cast
  back to the dtype of `real_activations` before being returned.

  Args:
    real_activations: Real images to use to compute Frechet Inception distance.
      Must be a rank-2 tensor.
    generated_activations: Generated images to use to compute Frechet Inception
      distance. Must be a rank-2 tensor.

  Returns:
    The diagonal-only Frechet Inception distance. A floating-point scalar with
    the same dtype as `real_activations`.

  Raises:
    ValueError: If the shape of the variance and mean vectors are not equal.
  """
  real_activations.shape.assert_has_rank(2)
  generated_activations.shape.assert_has_rank(2)

  activations_dtype = real_activations.dtype
  # Promote each input on its own: guarding both casts with the dtype of
  # `real_activations` alone would leave a non-float64 `generated_activations`
  # untouched whenever the real activations are already float64.
  if real_activations.dtype != dtypes.float64:
    real_activations = math_ops.cast(real_activations, dtypes.float64)
  if generated_activations.dtype != dtypes.float64:
    generated_activations = math_ops.cast(generated_activations, dtypes.float64)

  # Compute per-feature mean and variance of activations (reduced over axis 0).
  m, var = nn_impl.moments(real_activations, axes=[0])
  m_w, var_w = nn_impl.moments(generated_activations, axes=[0])

  actual_shape = var.get_shape()
  expected_shape = m.get_shape()

  if actual_shape != expected_shape:
    raise ValueError('shape: {} must match expected shape: {}'.format(
        actual_shape, expected_shape))

  # Compute the two components of FID.

  # First the covariance component.
  # Here, note that trace(A + B) = trace(A) + trace(B)
  trace = math_ops.reduce_sum(
      (var + var_w) - 2.0 * math_ops.sqrt(math_ops.multiply(var, var_w)))

  # Next the distance between means.
  mean = math_ops.reduce_sum(
      math_ops.squared_difference(m, m_w))  # Equivalent to L2 but more stable.
  dofid = trace + mean
  # Hand the result back in the caller's original precision.
  if activations_dtype != dtypes.float64:
    dofid = math_ops.cast(dofid, activations_dtype)

  return dofid
Ejemplo n.º 56
0
def weighted_moments(x, axes, frequency_weights, name=None, keep_dims=False):
  """Computes the mean and variance of `x` weighted by `frequency_weights`.

  A two-pass scheme is used: the weighted mean is formed first and the
  weighted squared deviations from it are then reduced. float16 input is
  promoted to float32 for the computation and the results are cast back to
  float16 at the end.

  Args:
    x: A tensor.
    axes: 1-d tensor of int32 values; these are the axes along which
      to compute mean and variance.
    frequency_weights: A tensor of positive weights which can be
      broadcast with x. Cast to the (possibly promoted) dtype of `x` when the
      dtypes differ.
    name: Name used to scope the operation.
    keep_dims: Produce moments with the same dimensionality as the input.

  Returns:
    Two tensors: `weighted_mean` and `weighted_variance`.
  """
  with ops.name_scope(name, "weighted_moments", [x, frequency_weights, axes]):
    x = ops.convert_to_tensor(x, name="x")
    frequency_weights = ops.convert_to_tensor(
        frequency_weights, name="frequency_weights")

    # Half precision is too coarse for the accumulations below, so compute in
    # float32 and convert back just before returning.
    is_half_precision = x.dtype == dtypes.float16
    if is_half_precision:
      x = math_ops.cast(x, dtypes.float32)

    if frequency_weights.dtype != x.dtype:
      frequency_weights = math_ops.cast(frequency_weights, x.dtype)

    # Every reduction keeps the reduced dimensions (regardless of `keep_dims`)
    # so that intermediate results stay broadcast-compatible with `x`; the
    # extra dimensions are squeezed away at the end when requested.
    weighted_x = frequency_weights * x
    weighted_x_total = math_ops.reduce_sum(
        weighted_x, axes, name="weighted_input_sum", keep_dims=True)

    # The weights only need to be broadcast-compatible with `x`, not the same
    # shape. Adding zeros shaped like `x` expands them to one weight per
    # element, so the total weight can be reduced over the same axes without
    # reasoning about the broadcast rules by hand.
    per_element_weights = frequency_weights + array_ops.zeros_like(x)
    weight_total = math_ops.reduce_sum(
        per_element_weights, axes, name="sum_of_weights", keep_dims=True)
    inverse_weight_total = math_ops.reciprocal(
        weight_total, name="inv_weight_sum")

    mean = math_ops.mul(weighted_x_total, inverse_weight_total)

    # Second pass: weighted squared deviations from the mean.
    weighted_sq_dev = frequency_weights * math_ops.squared_difference(x, mean)
    weighted_sq_dev_total = math_ops.reduce_sum(
        weighted_sq_dev, axes, name="weighted_distsq", keep_dims=True)
    variance = math_ops.mul(weighted_sq_dev_total, inverse_weight_total)

    moments = (mean, variance)
    if not keep_dims:
      moments = tuple(
          array_ops.squeeze(moment, squeeze_dims=axes) for moment in moments)
    if is_half_precision:
      moments = tuple(
          math_ops.cast(moment, dtypes.float16) for moment in moments)
    return moments
Ejemplo n.º 57
0
 def _mean_squared_error(self, targets, outputs, mask):
     """Returns the summed squared error divided by the sum of `mask`.

     NOTE(review): `mask` only contributes to the denominator; it is not
     multiplied into the element-wise squared errors. Presumably `targets`
     and `outputs` are already masked by the caller -- confirm.
     """
     squared_error_total = tf.reduce_sum(
         math_ops.squared_difference(targets, outputs))
     # TODO: Make the below safe to div by zero
     mask_total = tf.reduce_sum(mask)
     return squared_error_total / mask_total
Ejemplo n.º 58
0
def frechet_classifier_distance_from_activations(real_activations,
                                                 generated_activations):
  """Classifier distance for evaluating a generative model.

  This method computes the Frechet classifier distance from activations of
  real images and generated images. This can be used independently of the
  frechet_classifier_distance() method, especially in the case of using large
  batches during evaluation where we would like to precompute all of the
  activations before computing the classifier distance.

  This technique is described in detail in https://arxiv.org/abs/1706.08500.
  Given two Gaussian distributions with means m and m_w and covariance matrices
  C and C_w, this function calculates

                |m - m_w|^2 + Tr(C + C_w - 2(C * C_w)^(1/2))

  which captures how different the distributions of real images and generated
  images (or more accurately, their visual features) are. Note that unlike the
  Inception score, this is a true distance and utilizes information about real
  world images.

  Note that when computed using sample means and sample covariance matrices,
  Frechet distance is biased. It is more biased for small sample sizes. (e.g.
  even if the two distributions are the same, for a small sample size, the
  expected Frechet distance is large). It is important to use the same
  sample size to compute frechet classifier distance when comparing two
  generative models.

  All intermediate arithmetic is carried out in float64; the result is cast
  back to the dtype of `real_activations` before being returned.

  Args:
    real_activations: 2D Tensor containing activations of real data. Shape is
      [batch_size, activation_size].
    generated_activations: 2D Tensor containing activations of generated data.
      Shape is [batch_size, activation_size].

  Returns:
   The Frechet Inception distance. A floating-point scalar with the same
   dtype as `real_activations`.

  """
  real_activations.shape.assert_has_rank(2)
  generated_activations.shape.assert_has_rank(2)

  activations_dtype = real_activations.dtype
  # Promote each input on its own: guarding both casts with the dtype of
  # `real_activations` alone would leave a non-float64 `generated_activations`
  # untouched whenever the real activations are already float64.
  if real_activations.dtype != dtypes.float64:
    real_activations = math_ops.cast(real_activations, dtypes.float64)
  if generated_activations.dtype != dtypes.float64:
    generated_activations = math_ops.cast(generated_activations, dtypes.float64)

  # Compute mean and covariance matrices of activations.
  m = math_ops.reduce_mean(real_activations, 0)
  m_w = math_ops.reduce_mean(generated_activations, 0)
  num_examples_real = math_ops.cast(
      array_ops.shape(real_activations)[0], dtypes.float64)
  num_examples_generated = math_ops.cast(
      array_ops.shape(generated_activations)[0], dtypes.float64)

  # sigma = (1 / (n - 1)) * (X - mu) (X - mu)^T
  real_centered = real_activations - m
  sigma = math_ops.matmul(
      real_centered, real_centered, transpose_a=True) / (
          num_examples_real - 1)

  gen_centered = generated_activations - m_w
  sigma_w = math_ops.matmul(
      gen_centered, gen_centered, transpose_a=True) / (
          num_examples_generated - 1)

  # Find the Tr(sqrt(sigma sigma_w)) component of FID
  sqrt_trace_component = trace_sqrt_product(sigma, sigma_w)

  # Compute the two components of FID.

  # First the covariance component.
  # Here, note that trace(A + B) = trace(A) + trace(B)
  trace = math_ops.trace(sigma + sigma_w) - 2.0 * sqrt_trace_component

  # Next the distance between means.
  mean = math_ops.reduce_sum(
      math_ops.squared_difference(m, m_w))  # Equivalent to L2 but more stable.
  fid = trace + mean
  # Hand the result back in the caller's original precision.
  if activations_dtype != dtypes.float64:
    fid = math_ops.cast(fid, activations_dtype)

  return fid