def _r2(probabilities, targets, weights=None):
    """Builds an R^2-style (coefficient of determination) metric.

    The score is `1 - sum(residual_ss / total_ss)`, where both sums of squares
    are reduced over axis 0, and the score is then fed to `metrics.mean`.

    Args:
      probabilities: Predicted values, compared element-wise with `targets`.
      targets: Ground-truth values; cast to float32 before use.
      weights: Optional weights forwarded to `metrics.mean`.

    Returns:
      The result of `metrics.mean` applied to the score.
    """
    float_targets = math_ops.cast(targets, dtypes.float32)
    target_mean = math_ops.reduce_mean(float_targets, 0)

    # Total sum of squares: spread of the targets around their own mean.
    total_sq = math_ops.squared_difference(float_targets, target_mean)
    ss_total = math_ops.reduce_sum(total_sq, 0)

    # Residual sum of squares: spread of the targets around the predictions.
    residual_sq = math_ops.squared_difference(float_targets, probabilities)
    ss_residual = math_ops.reduce_sum(residual_sq, 0)

    r2_score = 1 - math_ops.reduce_sum(ss_residual / ss_total)
    return metrics.mean(r2_score, weights=weights)
def _mean_squared_loss(logits, target):
    """Returns the element-wise squared difference of `logits` and `target`.

    Args:
      logits: Predicted values.
      target: Target values; converted with `math_ops.to_float`. A rank-1
        target is expanded to a column first.

    Returns:
      The tensor produced by `math_ops.squared_difference`.
    """
    target_is_vector = len(target.get_shape()) == 1
    if target_is_vector:
        # A rank-1 target would otherwise broadcast against `logits` inside
        # the subtraction, so give it an explicit trailing dimension.
        target = array_ops.expand_dims(target, axis=1)
    logits.get_shape().assert_is_compatible_with(target.get_shape())
    float_target = math_ops.to_float(target)
    return math_ops.squared_difference(logits, float_target)
def testSampleConsistentStats(self):
    """Compares sample statistics of `Independent` with its analytic ones."""
    loc = np.float32([[-1., 1], [1, -1]])
    scale = np.float32([1., 0.5])
    num_samples = int(1e4)
    with self.test_session() as sess:
        base = mvn_diag_lib.MultivariateNormalDiag(
            loc=loc, scale_identity_multiplier=scale)
        ind = independent_lib.Independent(
            distribution=base, reduce_batch_ndims=1)

        draws = ind.sample(num_samples, seed=42)
        sample_mean = math_ops.reduce_mean(draws, axis=0)
        centered_sq = math_ops.squared_difference(draws, sample_mean)
        sample_var = math_ops.reduce_mean(centered_sq, axis=0)
        sample_std = math_ops.sqrt(sample_var)
        sample_entropy = -math_ops.reduce_mean(ind.log_prob(draws), axis=0)

        # Everything is fetched in one run so that all sample statistics are
        # derived from the same set of draws.
        fetches = [
            sample_mean, sample_var, sample_std, sample_entropy,
            ind.mean(), ind.variance(), ind.stddev(), ind.entropy(),
            ind.mode(),
        ]
        (sample_mean_, sample_var_, sample_std_, sample_entropy_,
         actual_mean_, actual_var_, actual_std_, actual_entropy_,
         actual_mode_) = sess.run(fetches)

        # (observed, expected, relative tolerance)
        comparisons = (
            (sample_mean_, actual_mean_, 0.02),
            (sample_var_, actual_var_, 0.04),
            (sample_std_, actual_std_, 0.02),
            (sample_entropy_, actual_entropy_, 0.01),
            (loc, actual_mode_, 1e-6),
        )
        for observed, expected, rtol in comparisons:
            self.assertAllClose(observed, expected, rtol=rtol, atol=0.)
def mean_squared_error(predictions, labels=None, weights=1.0, scope=None):
    """Adds a Sum-of-Squares loss to the training procedure.

    `weights` acts as a coefficient for the loss. If a scalar is provided,
    then the loss is simply scaled by the given value. If `weights` is a
    tensor of size [batch_size], then the total loss for each sample of the
    batch is rescaled by the corresponding element in the `weights` vector.
    If the shape of `weights` matches the shape of `predictions`, then the
    loss of each measurable element of `predictions` is scaled by the
    corresponding value of `weights`.

    Args:
      predictions: The predicted outputs.
      labels: The ground truth output tensor, same dimensions as
        `predictions`. Although the signature defaults it to `None`, a value
        is required.
      weights: Coefficients for the loss: a scalar, a tensor of shape
        [batch_size] or a tensor whose shape matches `predictions`.
      scope: The scope for the operations performed in computing the loss.

    Returns:
      A scalar `Tensor` representing the loss value.

    Raises:
      ValueError: If `labels` or `predictions` is None, if the shape of
        `predictions` doesn't match that of `labels`, or if the shape of
        `weights` is invalid.
    """
    # Fail with the documented ValueError instead of an AttributeError from
    # calling `.get_shape()` on None further down.
    if labels is None:
        raise ValueError("labels must not be None.")
    if predictions is None:
        raise ValueError("predictions must not be None.")
    with ops.name_scope(scope, "mean_squared_error",
                        [predictions, labels, weights]) as scope:
        predictions.get_shape().assert_is_compatible_with(labels.get_shape())
        predictions = math_ops.cast(predictions, dtypes.float32)
        labels = math_ops.cast(labels, dtypes.float32)
        losses = math_ops.squared_difference(predictions, labels)
        return compute_weighted_loss(losses, weights, scope=scope)
def exact_gaussian_kernel(x, y, stddev):
    r"""Computes exact Gaussian kernel value(s) for tensors x and y.

    For vectors u, v the Gaussian kernel is
      K(u, v) = exp(-||u-v||^2 / (2 * stddev^2))
    with ||.|| the l2-norm.

    x and y may each be a vector or a matrix. Vectors must share the same
    dimension; matrices must share the same number of columns. For matrices
    the result holds K(u, v) for every pair (u, v) with u a row of x and v a
    row of y.

    Args:
      x: a tensor of rank 1 or 2. Its shape should be [dim] or [m, dim].
      y: a tensor of rank 1 or 2. Its shape should be [dim] or [n, dim].
      stddev: The width of the Gaussian kernel.

    Returns:
      A single value with shape (1, 1) when x and y are vectors, otherwise a
      matrix of shape (m, n) whose entries are K(u, v) for all row pairs.

    Raises:
      ValueError: if the shapes of x, y are not compatible.
    """
    lhs, rhs = _align_matrices(x, y)
    pairwise_sq = math_ops.squared_difference(lhs, rhs)
    # Summing over axis 2 collapses the feature dimension of the aligned
    # operands, leaving one squared distance per (row of x, row of y) pair.
    squared_distance = math_ops.reduce_sum(pairwise_sq, 2)
    exponent = -squared_distance / (2 * stddev * stddev)
    return math_ops.exp(exponent)
def contrastive_loss(labels, embeddings_anchor, embeddings_positive,
                     margin=1.0):
    """Computes the contrastive loss.

    Pairs labelled 1 are penalised by their squared distance; pairs labelled
    0 are penalised by the squared amount by which their distance falls short
    of `margin`.
    See: http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf

    Args:
      labels: 1-D tf.int32 `Tensor` with shape [batch_size] of binary labels
        indicating positive vs negative pair.
      embeddings_anchor: 2-D float `Tensor` of embedding vectors for the
        anchor images. Embeddings should be l2 normalized.
      embeddings_positive: 2-D float `Tensor` of embedding vectors for the
        positive images. Embeddings should be l2 normalized.
      margin: margin term in the loss definition.

    Returns:
      contrastive_loss: tf.float32 scalar.
    """
    # Euclidean distance of every (anchor, positive) pair.
    pair_sq_diff = math_ops.squared_difference(
        embeddings_anchor, embeddings_positive)
    distances = math_ops.sqrt(math_ops.reduce_sum(pair_sq_diff, 1))

    # Labels are {0, 1} for negative / positive pairs.
    positive_term = math_ops.to_float(labels) * math_ops.square(distances)
    negative_term = (1. - math_ops.to_float(labels)) * math_ops.square(
        math_ops.maximum(margin - distances, 0.))
    return math_ops.reduce_mean(
        positive_term + negative_term, name='contrastive_loss')
def testSquaredDifference(self):
    """Checks `squared_difference` against NumPy for broadcast int32 inputs."""
    x = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int32)
    y = np.array([-3, -2, -1], dtype=np.int32)
    delta = x - y
    expected = delta * delta
    with self.test_session():
        actual = math_ops.squared_difference(x, y).eval()
        self.assertAllClose(expected, actual)
def testSquaredDifference(self):
    """Checks `squared_difference` against NumPy for int32 and float16."""
    lhs_values = [[1, 2, 3], [4, 5, 6]]
    rhs_values = [-3, -2, -1]
    for dtype in (np.int32, np.float16):
        x = np.array(lhs_values, dtype=dtype)
        y = np.array(rhs_values, dtype=dtype)
        expected = np.multiply(x - y, x - y)
        with self.test_session(use_gpu=True):
            actual = math_ops.squared_difference(x, y).eval()
            self.assertAllClose(expected, actual)
def testSquaredDifference(self):
    """Checks `squared_difference` against NumPy for int32 and float16."""

    def check(dtype):
        # One comparison of the op against the NumPy reference for `dtype`.
        x = np.array([[1, 2, 3], [4, 5, 6]], dtype=dtype)
        y = np.array([-3, -2, -1], dtype=dtype)
        reference = (x - y) * (x - y)
        with test_util.device(use_gpu=True):
            computed = self.evaluate(math_ops.squared_difference(x, y))
            self.assertAllClose(reference, computed)

    for dtype in [np.int32, np.float16]:
        check(dtype)
def moments(x, axes, name=None, keep_dims=False):
    """Calculate the mean and variance of `x`.

    The mean and variance are calculated by aggregating the contents of `x`
    across `axes`. If `x` is 1-D and `axes = [0]` this is just the mean and
    variance of a vector.

    When using these moments for batch normalization (see
    `tf.nn.batch_normalization`):
      * for so-called "global normalization", used with convolutional filters
        with shape `[batch, height, width, depth]`, pass `axes=[0, 1, 2]`.
      * for simple batch normalization pass `axes=[0]` (batch only).

    Args:
      x: A `Tensor`.
      axes: array of ints. Axes along which to compute mean and variance.
      name: Name used to scope the operations that compute the moments.
      keep_dims: produce moments with the same dimensionality as the input.

    Returns:
      Two `Tensor` objects: `mean` and `variance`.
    """
    with ops.op_scope([x, axes], name, "moments"):
        x = ops.convert_to_tensor(x, name="x")
        static_shape = x.get_shape()
        if any(static_shape[d].value is None for d in axes):
            # At least one reduced dimension is only known at run time, so the
            # element count has to be assembled from the dynamic shape.
            divisor = constant_op.constant(1.0, dtype=x.dtype)
            runtime_shape = array_ops.shape(x)
            for d in set(axes):
                divisor *= math_ops.cast(runtime_shape[d], x.dtype)
            divisor = math_ops.inv(divisor, name="divisor")
        else:
            # All reduced dimensions are static: fold the reciprocal of the
            # element count into a constant.
            num_elements = 1.0
            for d in set(axes):
                num_elements *= x.get_shape()[d].value
            divisor = constant_op.constant(
                1.0 / num_elements, x.dtype, name="divisor")
        constant_axes = constant_op.constant(axes, name="axes")

        # Sum-times-reciprocal is used instead of Mean because Mean is very
        # slow on GPU. The mean always keeps its dims so that it broadcasts
        # against `x` in the variance computation.
        summed = math_ops.reduce_sum(x, constant_axes, keep_dims=True)
        mean = math_ops.mul(summed, divisor, name="mean")
        summed_sq_dev = math_ops.reduce_sum(
            math_ops.squared_difference(x, mean),
            constant_axes,
            keep_dims=keep_dims)
        var = math_ops.mul(summed_sq_dev, divisor, name="variance")

        if not keep_dims:
            mean = array_ops.squeeze(mean, squeeze_dims=axes)
        return mean, var
def testComplexSquaredDifference(self):
    """Checks `squared_difference` on complex inputs against conj(d) * d."""
    x_values = [[1 + 3j, 2 + 2j, 3 + 1j], [4 - 1j, 5 - 2j, 6 - 3j]]
    y_values = [-3 + 1j, -2 + 2j, -1 + 3j]
    for dtype in (np.complex64, np.complex128):
        x = np.array(x_values, dtype=dtype)
        y = np.array(y_values, dtype=dtype)
        delta = x - y
        expected = np.conj(delta) * delta
        with test_util.device(use_gpu=False):
            actual = self.evaluate(math_ops.squared_difference(x, y))
            self.assertAllClose(expected, actual)
def sufficient_statistics(x, axes, shift=None, keep_dims=False, name=None):
    """Calculate the sufficient statistics for the mean and variance of `x`.

    These sufficient statistics are computed using the one pass algorithm on
    an input that's optionally shifted. See:
    https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Computing_shifted_data

    Args:
      x: A `Tensor`.
      axes: Array of ints. Axes along which to compute mean and variance.
      shift: A `Tensor` containing the value by which to shift the data for
        numerical stability, or `None` if no shift is to be performed. A shift
        close to the true mean provides the most numerically stable results.
      keep_dims: produce statistics with the same dimensionality as the input.
      name: Name used to scope the operations that compute the sufficient
        stats.

    Returns:
      Four `Tensor` objects of the same type as `x`:
      * the count (number of elements to average over).
      * the (possibly shifted) sum of the elements in the array.
      * the (possibly shifted) sum of squares of the elements in the array.
      * the shift by which the mean must be corrected or None if `shift` is
        None.
    """
    with ops.op_scope([x, axes, shift], name, "sufficient_statistics"):
        x = ops.convert_to_tensor(x, name="x")
        x_shape = x.get_shape()
        if x_shape.is_fully_defined():
            # Static shape: the count is the product of the reduced dims.
            # The set is built once rather than once per dimension.
            reduced_axes = set(axes)
            counts = 1
            for d in range(x_shape.ndims):
                if d in reduced_axes:
                    counts *= x_shape[d].value
            counts = constant_op.constant(counts, dtype=x.dtype)
        else:  # shape needs to be inferred at runtime.
            x_shape = array_ops.shape(x)
            select_axes = sparse_ops.sparse_to_dense(
                axes, array_ops.shape(x_shape), True, False)
            # `m_shape` equals the input shape with every reduced dim set to
            # 1, so `x_shape / m_shape` keeps only the reduced dim sizes.
            m_shape = math_ops.select(
                select_axes, array_ops.ones_like(x_shape), x_shape)
            counts = math_ops.cast(
                math_ops.reduce_prod(x_shape / m_shape), x.dtype,
                name="count")
        if shift is not None:
            shift = ops.convert_to_tensor(shift, name="shift")
            m_ss = math_ops.sub(x, shift)
            v_ss = math_ops.squared_difference(x, shift)
        else:  # no shift.
            m_ss = x
            v_ss = math_ops.square(x)
        m_ss = math_ops.reduce_sum(
            m_ss, axes, keep_dims=keep_dims, name="mean_ss")
        v_ss = math_ops.reduce_sum(
            v_ss, axes, keep_dims=keep_dims, name="var_ss")
    return counts, m_ss, v_ss, shift
def _reduce_variance(x, axis=None, biased=True, keepdims=False):
    """Computes the variance of `x` along `axis`.

    Args:
      x: Value convertible to a tensor.
      axis: Axis or axes to reduce over, forwarded to `reduce_mean`.
      biased: If True, return the mean squared deviation as is. Otherwise
        rescale it by `n / (n - 1)`, with `n` taken from `_axis_size`.
      keepdims: Whether the reduced dimensions are kept in the result.

    Returns:
      The biased or rescaled variance tensor.
    """
    with ops.name_scope("reduce_variance"):
        x = ops.convert_to_tensor(x, name="x")
        # The mean keeps its dims so it broadcasts against `x` below.
        center = math_ops.reduce_mean(x, axis=axis, keepdims=True)
        sq_deviation = math_ops.squared_difference(x, center)
        biased_var = math_ops.reduce_mean(
            sq_deviation, axis=axis, keepdims=keepdims)
        if not biased:
            n = _axis_size(x, axis)
            return (n / (n - 1.)) * biased_var
        return biased_var
def mean_only_frechet_classifier_distance_from_activations(
        real_activations, generated_activations):
    """Classifier distance for evaluating a generative model from activations.

    Given two Gaussian distributions with means m and m_w and covariance
    matrices C and C_w, this function calculates

                  |m - m_w|^2

    which captures how different the distributions of real images and
    generated images (or more accurately, their visual features) are. Unlike
    the Inception score, this is a true distance and utilizes information
    about real world images.

    When computed from sample statistics the Frechet distance is biased, and
    more so for small sample sizes (even for identical distributions the
    expected distance is large when few samples are used). Use the same sample
    size when comparing two generative models.

    This variant only uses the difference between the means of the fitted
    Gaussians, which needs O(n) rather than O(n^2) memory while retaining much
    of the same information as FID.

    Args:
      real_activations: 2D array of activations of real images of size
        [num_images, num_dims] to use to compute Frechet Inception distance.
      generated_activations: 2D array of activations of generated images of
        size [num_images, num_dims] to use to compute Frechet Inception
        distance.

    Returns:
      The mean-only Frechet Inception distance. A floating-point scalar of the
      same type as the output of the activations.
    """
    real_activations.shape.assert_has_rank(2)
    generated_activations.shape.assert_has_rank(2)

    # The arithmetic runs in float64; the result is cast back at the end.
    activations_dtype = real_activations.dtype
    needs_cast = activations_dtype != dtypes.float64
    if needs_cast:
        real_activations = math_ops.cast(real_activations, dtypes.float64)
        generated_activations = math_ops.cast(
            generated_activations, dtypes.float64)

    # Per-dimension means of both activation sets.
    real_mean = math_ops.reduce_mean(real_activations, 0)
    generated_mean = math_ops.reduce_mean(generated_activations, 0)

    # Squared distance between the means; equivalent to L2 but more stable.
    mofid = math_ops.reduce_sum(
        math_ops.squared_difference(real_mean, generated_mean))
    if needs_cast:
        mofid = math_ops.cast(mofid, activations_dtype)
    return mofid
def mean_squared_error(
        labels, predictions, weights=1.0, scope=None,
        loss_collection=ops.GraphKeys.LOSSES,
        reduction=Reduction.SUM_BY_NONZERO_WEIGHTS):
    """Adds a Sum-of-Squares loss to the training procedure.

    `weights` acts as a coefficient for the loss. A scalar simply scales the
    loss. A tensor of size `[batch_size]` rescales the total loss of each
    sample by the matching element. If the shape of `weights` matches the
    shape of `predictions`, the loss of each measurable element of
    `predictions` is scaled by the corresponding value of `weights`.

    Args:
      labels: The ground truth output tensor, same dimensions as
        'predictions'.
      predictions: The predicted outputs.
      weights: Optional `Tensor` whose rank is either 0, or the same rank as
        `labels`, and must be broadcastable to `labels` (i.e., all dimensions
        must be either `1`, or the same as the corresponding `losses`
        dimension).
      scope: The scope for the operations performed in computing the loss.
      loss_collection: collection to which the loss will be added.
      reduction: Type of reduction to apply to loss.

    Returns:
      Weighted loss float `Tensor`. If `reduction` is `NONE`, this has the
      same shape as `labels`; otherwise, it is scalar.

    Raises:
      ValueError: If the shape of `predictions` doesn't match that of
        `labels` or if the shape of `weights` is invalid. Also if `labels` or
        `predictions` is None.

    @compatibility(eager)
    The `loss_collection` argument is ignored when executing eagerly. Consider
    holding on to the return value or collecting losses via a
    `tf.keras.Model`.
    @end_compatibility
    """
    if labels is None:
        raise ValueError("labels must not be None.")
    if predictions is None:
        raise ValueError("predictions must not be None.")

    scope_values = (predictions, labels, weights)
    with ops.name_scope(
            scope, "mean_squared_error", scope_values) as scope:
        predictions = math_ops.to_float(predictions)
        labels = math_ops.to_float(labels)
        predictions.get_shape().assert_is_compatible_with(labels.get_shape())
        squared_errors = math_ops.squared_difference(predictions, labels)
        return compute_weighted_loss(
            squared_errors, weights, scope, loss_collection,
            reduction=reduction)
def psnr(y_true, y_pred):
    """Computes a PSNR value, in dB with peak 255, on a weighted channel sum.

    Both inputs are multiplied by the 3x1 weights
    (65.481, 128.553, 24.966), divided by 255 and offset by 16 before the
    mean squared error is taken over the last three axes.
    NOTE(review): these look like RGB-to-luma coefficients - confirm against
    the caller's colour space and value range.

    Args:
      y_true: Reference tensor whose last dimension is 3 (it is matrix
        multiplied by a 3x1 matrix).
      y_pred: Predicted tensor, same layout as `y_true`.

    Returns:
      The Python int 0 when `y_pred.shape[3] == 5`, otherwise a tensor holding
      `10 * log10(255^2 / mse)`.
    """
    if y_pred.shape[3] == 5:
        return 0

    def weighted_channel_sum(img):
        # Collapses the last axis with fixed weights, then rescales/offsets.
        return tf.matmul(img, [[65.481], [128.553], [24.966]]) / 255.0 + 16.0

    reference = weighted_channel_sum(y_true)
    estimate = weighted_channel_sum(y_pred)
    mse = math_ops.reduce_mean(
        math_ops.squared_difference(reference, estimate), [-3, -2, -1])

    # log10(v) expressed through the natural log: ln(v) / ln(10).
    numerator = tf.compat.v1.log(255.0 * 255.0 / mse)
    denominator = tf.compat.v1.log(tf.constant(10, dtype=numerator.dtype))
    return 10 * (numerator / denominator)
def mean_squared_error(labels, predictions, weights=1.0, scope=None,
                       loss_collection=ops.GraphKeys.LOSSES,
                       reduction=Reduction.SUM_BY_NONZERO_WEIGHTS):
    """Adds a Sum-of-Squares loss to the training procedure.

    `weights` is a coefficient for the loss:
      * a scalar scales the whole loss;
      * a tensor of size `[batch_size]` rescales the total loss of each
        sample of the batch by the corresponding element;
      * a tensor whose shape matches `predictions` scales the loss of each
        measurable element of `predictions` individually.

    Args:
      labels: The ground truth output tensor, same dimensions as
        'predictions'.
      predictions: The predicted outputs.
      weights: Optional `Tensor` whose rank is either 0, or the same rank as
        `labels`, and must be broadcastable to `labels` (i.e., all dimensions
        must be either `1`, or the same as the corresponding `losses`
        dimension).
      scope: The scope for the operations performed in computing the loss.
      loss_collection: collection to which the loss will be added.
      reduction: Type of reduction to apply to loss.

    Returns:
      Weighted loss float `Tensor`. If `reduction` is `NONE`, this has the
      same shape as `labels`; otherwise, it is scalar.

    Raises:
      ValueError: If the shape of `predictions` doesn't match that of
        `labels` or if the shape of `weights` is invalid. Also if `labels` or
        `predictions` is None.
    """
    # Reject missing inputs before any graph ops are created.
    if labels is None:
        raise ValueError("labels must not be None.")
    if predictions is None:
        raise ValueError("predictions must not be None.")

    with ops.name_scope(scope, "mean_squared_error",
                        (predictions, labels, weights)) as scope:
        float_predictions = math_ops.to_float(predictions)
        float_labels = math_ops.to_float(labels)
        float_predictions.get_shape().assert_is_compatible_with(
            float_labels.get_shape())
        per_element = math_ops.squared_difference(
            float_predictions, float_labels)
        weighted = compute_weighted_loss(
            per_element, weights, scope, loss_collection, reduction=reduction)
        return weighted
def _get_mse(y_true, y_pred, mask):
    """
    Calculates a mean squared error normalised by the mask.

    The squared differences of all elements are summed, then divided by the
    sum of the mask multiplied by the size of the last dimension of
    ``y_true``. The mask is not applied to the differences here.
    NOTE(review): presumably the inputs are already masked and the mask has
    shape (batch_size, seq_len) - confirm with the caller.

    :param y_true: ground-truth tensor
    :param y_pred: predicted tensor
    :param mask: tensor marking which entries count; cast to float32
    :return: summed squared error divided by the normaliser
    """
    squared_error_total = tf.reduce_sum(
        math_ops.squared_difference(y_pred, y_true))
    # Scale the mask sum by the last dimension of y_true.
    normaliser = tf.reduce_sum(tf.cast(mask, tf.float32)) * y_true.shape[-1]
    normaliser = tf.cast(normaliser, dtype=squared_error_total.dtype)
    return squared_error_total / normaliser
def _variance(self):
    """Returns the mixture variance via the law of total variance.

    Var(Y) = E[Var(Y|X)] + Var(E[Y|X]), where the expectation over X is a
    `probs`-weighted sum over the component axis `-1 - self._event_ndims`.
    """
    with ops.control_dependencies(self._runtime_assertions):
        probs = self._pad_mix_dims(
            self.mixture_distribution.probs)  # [B, k, [1]*e]

        # E[Var(Y|X)]: weighted average of the component variances.
        weighted_component_var = (
            probs * self.components_distribution.variance())
        mean_cond_var = math_ops.reduce_sum(
            weighted_component_var,
            axis=-1 - self._event_ndims)  # [B, E]

        # Var(E[Y|X]): weighted spread of the component means around the
        # overall mean.
        mean_spread = math_ops.squared_difference(
            self.components_distribution.mean(),
            self._pad_sample_dims(self._mean()))
        var_cond_mean = math_ops.reduce_sum(
            probs * mean_spread,
            axis=-1 - self._event_ndims)  # [B, E]

        return mean_cond_var + var_cond_mean  # [B, E]
def loss_msr_sequence_relative(y_true, y_predicted):
    """Relative squared-error loss, discounted along the second axis.

    Each squared error is divided by `y_true**2 + 0.01`, averaged over the
    last axis, truncated to indices `wash_out_len:` on axis 1 and finally
    combined with the module-level `discount_vector` through a
    matrix-vector product.

    Args:
      y_true: Ground-truth values; cast to the dtype of `y_predicted`.
      y_predicted: Predicted values.

    Returns:
      The result of `tf.linalg.matvec(losses, discount_vector)`.
    """
    y_predicted = ops.convert_to_tensor_v2_with_dispatch(y_predicted)
    y_true = math_ops.cast(y_true, y_predicted.dtype)

    squared_error = math_ops.squared_difference(y_predicted, y_true)
    relative_error = squared_error / (math_ops.square(y_true) + 0.01)
    # One loss value per (batch element, time step).
    per_step = K.mean(relative_error, axis=-1)
    # Drop the first `wash_out_len` steps.
    per_step = per_step[:, wash_out_len:]
    # Discounted sum of the remaining per-step losses.
    return tf.linalg.matvec(per_step, discount_vector)
def MaskedMSE(output, target, lengths):
    """Squared error over masked sequences, normalised by the mask sum.

    The mask is `tf.sequence_mask(lengths)` transposed to time-major order
    with a trailing singleton axis, so it multiplies `output` and `target`
    by broadcasting. A scalar summary named "loss" is also written.
    NOTE(review): presumably `output`/`target` are [time, batch, features] -
    confirm with the caller.

    Args:
      output: Predicted sequence tensor.
      target: Target sequence tensor.
      lengths: Valid length of each sequence.

    Returns:
      Sum of squared differences of the masked tensors divided by the sum of
      the mask.
    """
    with tf.name_scope('MaskedMSE'):
        valid = tf.sequence_mask(lengths)
        valid = tf.transpose(valid, [1, 0])
        valid = tf.expand_dims(valid, 2)
        mask = tf.cast(valid, dtype=tf.float32)

        masked_output = tf.multiply(output, mask)
        masked_target = tf.multiply(target, mask)
        squared_error = tf.reduce_sum(
            math_ops.squared_difference(masked_output, masked_target))
        loss = squared_error / tf.reduce_sum(mask)
        tf.summary.scalar("loss", loss)
        return loss
def mean_squared_error_velocity(y_true, y_pred, epsilon=1e-08):
    """Squared error of channel 0 of `y_pred` against channel 2 of `y_true`.

    Channel 0 of `y_pred` is first multiplied by channel 0 of `y_true`, which
    zeroes the prediction wherever that channel of `y_true` is zero. The
    summed squared error is divided by the number of non-zero masked
    predictions (at least 1) and `epsilon` is added to the quotient.
    NOTE(review): presumably channel 0 of `y_true` flags a played note and
    channel 2 holds its velocity - confirm with the data pipeline.

    Args:
      y_true: Rank-4 ground-truth tensor; cast to the dtype of `y_pred`.
      y_pred: Rank-4 prediction tensor.
      epsilon: Constant added to the final value.

    Returns:
      The normalised squared error plus `epsilon`.
    """
    y_pred = ops.convert_to_tensor_v2_with_dispatch(y_pred)
    y_true = math_ops.cast(y_true, y_pred.dtype)

    # Mask the prediction with channel 0 of the ground truth.
    y_pred = y_pred[:, :, :, 0:1] * y_true[:, :, :, 0:1]

    # Clamp to 1 to avoid division by zero.
    nonzero = tf.math.count_nonzero(y_pred, dtype=tf.dtypes.float32)
    count_non_zero = tf.math.maximum(nonzero, 1)

    squared_error = math_ops.squared_difference(
        y_pred, y_true[:, :, :, 2:3])
    return K.sum(squared_error) / count_non_zero + epsilon
def loss_op(self, targets, prediction_ops):
    """Create loss_op.

    Args:
      targets: Tensor of target values.
      prediction_ops: Dict with a "mean" entry and, for the normal likelihood
        loss, a "covariance" entry.

    Returns:
      The summed loss divided by the number of elements in `targets`.
    """
    prediction = prediction_ops["mean"]
    if self.loss == ARModel.NORMAL_LIKELIHOOD_LOSS:
        covariance = prediction_ops["covariance"]
        # Floor the covariance at 1e-5 before taking the square root.
        sigma = math_ops.sqrt(gen_math_ops.maximum(covariance, 1e-5))
        log_prob = math_utils.normal_log_prob(targets, sigma, prediction)
        total_loss = -math_ops.reduce_sum(log_prob)
    else:
        assert self.loss == ARModel.SQUARED_LOSS, self.loss
        total_loss = math_ops.reduce_sum(
            math_ops.squared_difference(prediction, targets))

    num_elements = math_ops.cast(
        math_ops.reduce_prod(array_ops.shape(targets)), total_loss.dtype)
    return total_loss / num_elements
def mean_squared_error(y_true, y_pred):
    """Computes the mean squared error between labels and predictions.

    `loss = mean(square(y_true - y_pred), axis=-1)`

    Args:
      y_true: Ground truth values. shape = `[batch_size, d0, .. dN]`.
      y_pred: The predicted values. shape = `[batch_size, d0, .. dN]`.

    Returns:
      Mean squared error values. shape = `[batch_size, d0, .. dN-1]`.
    """
    predictions = ops.convert_to_tensor(y_pred)
    # Labels follow the dtype of the predictions.
    labels = math_ops.cast(y_true, predictions.dtype)
    squared_error = math_ops.squared_difference(predictions, labels)
    return K.mean(squared_error, axis=-1)
def testSampleConsistentStats(self):
    """Sample mean/var/std/entropy and mode must match the analytic values."""
    n_samp = 1e4
    loc = np.float32([[-1., 1], [1, -1]])
    scale = np.float32([1., 0.5])
    with self.test_session() as sess:
        ind = independent_lib.Independent(
            distribution=mvn_diag_lib.MultivariateNormalDiag(
                loc=loc, scale_identity_multiplier=scale),
            reduce_batch_ndims=1)

        x = ind.sample(int(n_samp), seed=42)
        sample_mean = math_ops.reduce_mean(x, axis=0)
        sample_var = math_ops.reduce_mean(
            math_ops.squared_difference(x, sample_mean), axis=0)
        sample_std = math_ops.sqrt(sample_var)
        sample_entropy = -math_ops.reduce_mean(ind.log_prob(x), axis=0)

        # Single run: four sample estimates, their four analytic
        # counterparts, then the mode.
        results = sess.run([
            sample_mean, sample_var, sample_std, sample_entropy,
            ind.mean(), ind.variance(), ind.stddev(), ind.entropy(),
            ind.mode(),
        ])
        estimates = results[:4]
        analytic = results[4:8]
        actual_mode_ = results[8]

        tolerances = (0.02, 0.04, 0.02, 0.01)
        for estimate, exact, rtol in zip(estimates, analytic, tolerances):
            self.assertAllClose(estimate, exact, rtol=rtol, atol=0.)
        self.assertAllClose(loc, actual_mode_, rtol=1e-6, atol=0.)
def per_example_squared_loss(labels, weights, predictions):
    """Squared loss given labels, example weights and predictions.

    Args:
      labels: Rank 2 (N, D) tensor of per-example labels.
      weights: Rank 2 (N, 1) tensor of per-example weights.
      predictions: Rank 2 (N, D) tensor of per-example predictions.

    Returns:
      loss: A Rank 2 (N, 1) tensor of per-example squared loss.
      update_op: An update operation to update the loss's internal state;
        here a no-op.
    """
    squared_error = math_ops.squared_difference(predictions, labels)
    # Sum over axis 1, keeping the axis so the result lines up with `weights`.
    per_example = math_ops.reduce_sum(squared_error, 1, keepdims=True)
    weighted_loss = per_example * weights
    update_op = control_flow_ops.no_op()
    return weighted_loss, update_op
def least_squares_generator_loss(
        discriminator_gen_outputs,
        real_label=1,
        weights=1.0,
        scope=None,
        loss_collection=ops.GraphKeys.LOSSES,
        reduction=losses.Reduction.SUM_BY_NONZERO_WEIGHTS,
        add_summaries=False):
    """Least squares generator loss.

    This loss comes from `Least Squares Generative Adversarial Networks`
    (https://arxiv.org/abs/1611.04076).

    L = 1/2 * (D(G(z)) - `real_label`) ** 2

    where D(y) are discriminator logits.

    Args:
      discriminator_gen_outputs: Discriminator output on generated data.
        Expected to be in the range of (-inf, inf).
      real_label: The value that the generator is trying to get the
        discriminator to output on generated data.
      weights: Optional `Tensor` whose rank is either 0, or the same rank as
        `discriminator_gen_outputs`, and must be broadcastable to
        `discriminator_gen_outputs` (i.e., all dimensions must be either `1`,
        or the same as the corresponding dimension).
      scope: The scope for the operations performed in computing the loss.
      loss_collection: collection to which this loss will be added.
      reduction: A `tf.losses.Reduction` to apply to loss.
      add_summaries: Whether or not to add summaries for the loss.

    Returns:
      A loss Tensor. The shape depends on `reduction`.
    """
    scope_values = (discriminator_gen_outputs, real_label)
    with ops.name_scope(scope, 'lsq_generator_loss', scope_values) as scope:
        gen_outputs = math_ops.to_float(discriminator_gen_outputs)
        half_squared_error = math_ops.squared_difference(
            gen_outputs, real_label) / 2.0
        loss = losses.compute_weighted_loss(
            half_squared_error, weights, scope, loss_collection, reduction)

    if add_summaries:
        summary.scalar('generator_lsq_loss', loss)

    return loss
def sufficient_statistics(x, axes, shift=None, keep_dims=False, name=None):
    """Calculate the sufficient statistics for the mean and variance of `x`.

    The statistics come from the one pass algorithm applied to an optionally
    shifted input. See:
    https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Computing_shifted_data

    Args:
      x: A `Tensor`.
      axes: Array of ints. Axes along which to compute mean and variance.
        Duplicates are removed before use.
      shift: A `Tensor` containing the value by which to shift the data for
        numerical stability, or `None` if no shift is to be performed. A shift
        close to the true mean provides the most numerically stable results.
      keep_dims: produce statistics with the same dimensionality as the input.
      name: Name used to scope the operations that compute the sufficient
        stats.

    Returns:
      Four `Tensor` objects of the same type as `x`:
      * the count (number of elements to average over).
      * the (possibly shifted) sum of the elements in the array.
      * the (possibly shifted) sum of squares of the elements in the array.
      * the shift by which the mean must be corrected or None if `shift` is
        None.
    """
    axes = list(set(axes))
    with ops.name_scope(name, "sufficient_statistics", [x, shift]):
        x = ops.convert_to_tensor(x, name="x")
        static_shape = x.get_shape()
        if any(static_shape[d].value is None for d in axes):
            # Some reduced dimension is unknown statically: take the sizes
            # from the runtime shape instead.
            runtime_dims = array_ops.gather(
                math_ops.cast(array_ops.shape(x), x.dtype), axes)
            counts = math_ops.reduce_prod(runtime_dims, name="count")
        else:
            num_elements = 1
            for d in axes:
                num_elements *= static_shape[d].value
            counts = constant_op.constant(num_elements, dtype=x.dtype)

        if shift is None:
            m_ss = x
            v_ss = math_ops.square(x)
        else:
            shift = ops.convert_to_tensor(shift, name="shift")
            m_ss = math_ops.subtract(x, shift)
            v_ss = math_ops.squared_difference(x, shift)

        m_ss = math_ops.reduce_sum(
            m_ss, axes, keep_dims=keep_dims, name="mean_ss")
        v_ss = math_ops.reduce_sum(
            v_ss, axes, keep_dims=keep_dims, name="var_ss")
    return counts, m_ss, v_ss, shift
def _variance(self):
    """Returns `rate^2 / ((concentration - 1)^2 * (concentration - 2))`.

    The value is only used where `concentration > 2`. With `allow_nan_stats`
    the other entries become NaN; without it an assertion is attached that
    fails when any `concentration <= 2`.
    """
    rate_sq = math_ops.square(self.rate)
    shifted_sq = math_ops.squared_difference(self.concentration, 1.)
    var = rate_sq / shifted_sq / (self.concentration - 2.)

    if self.allow_nan_stats:
        nan_value = np.array(np.nan, dtype=self.dtype.as_numpy_dtype())
        nan = array_ops.fill(
            self.batch_shape_tensor(), nan_value, name="nan")
        return array_ops.where(self.concentration > 2., var, nan)

    concentration_check = check_ops.assert_less(
        constant_op.constant(2., dtype=self.dtype),
        self.concentration,
        message="variance undefined when any concentration <= 2")
    return control_flow_ops.with_dependencies([concentration_check], var)
def _variance(self):
    """Computes the variance, defined only where `concentration > 2`.

    The formula is `rate^2 / (concentration - 1)^2 / (concentration - 2)`.
    Entries outside the valid range are NaN when `allow_nan_stats` is set;
    otherwise the result carries an assertion that `2 < concentration`.
    """
    var = math_ops.square(self.rate)
    var = var / math_ops.squared_difference(self.concentration, 1.)
    var = var / (self.concentration - 2.)

    if not self.allow_nan_stats:
        return control_flow_ops.with_dependencies(
            [
                check_ops.assert_less(
                    constant_op.constant(2., dtype=self.dtype),
                    self.concentration,
                    message=(
                        "variance undefined when any concentration <= 2")),
            ],
            var)

    nan = array_ops.fill(
        self.batch_shape_tensor(),
        np.array(np.nan, dtype=self.dtype.as_numpy_dtype()),
        name="nan")
    return array_ops.where(self.concentration > 2., var, nan)
def _calculate_mean_and_var(self, x, axes, keep_dims):
    """Computes the mean and variance of `x` over `axes`.

    Inside a replica context the per-replica sums, sums of squares and batch
    size are combined with a SUM all-reduce before the moments are formed.
    Otherwise the moments are plain `reduce_mean`s on the local tensor.

    Args:
      x: Input tensor. float16 input is processed in float32 and the results
        are cast back to float16.
      axes: Axes to reduce over.
      keep_dims: If False, the reduced axes are squeezed out of both results.

    Returns:
      A `(mean, variance)` tuple.
    """

    with backend.name_scope('moments'):
        # The dynamic range of fp16 is too limited to support the collection
        # of sufficient statistics. As a workaround we simply perform the
        # operations on 32-bit floats before converting the mean and variance
        # back to fp16
        y = math_ops.cast(x, dtypes.float32) if x.dtype == dtypes.float16 else x
        replica_ctx = ds.get_replica_context()
        if replica_ctx:
            local_sum = math_ops.reduce_sum(y, axis=axes, keepdims=True)
            local_squared_sum = math_ops.reduce_sum(math_ops.square(y), axis=axes,
                                                    keepdims=True)
            # Size of dimension 0 of the local tensor, as float32.
            batch_size = math_ops.cast(array_ops.shape_v2(y)[0],
                                       dtypes.float32)
            # One SUM all-reduce over the three local values.
            y_sum, y_squared_sum, global_batch_size = (
                replica_ctx.all_reduce(reduce_util.ReduceOp.SUM, [
                    local_sum, local_squared_sum, batch_size]))

            # Sizes of dimensions 1 .. len(axes) - 1 of `y`. The indices come
            # from `range`, not from the entries of `axes`.
            # NOTE(review): presumably assumes axes == [0, 1, ..., k] -
            # confirm against callers.
            axes_vals = [(array_ops.shape_v2(y))[i] for i in range(1, len(axes))]
            multiplier = math_ops.cast(math_ops.reduce_prod(axes_vals),
                                       dtypes.float32)
            # Number of elements per reduced slice: non-batch dims times the
            # all-reduced batch size.
            multiplier = multiplier * global_batch_size

            mean = y_sum / multiplier
            y_squared_mean = y_squared_sum / multiplier
            # var = E(x^2) - E(x)^2
            variance = y_squared_mean - math_ops.square(mean)
        else:
            # Compute true mean while keeping the dims for proper broadcasting.
            mean = math_ops.reduce_mean(y, axes, keepdims=True, name='mean')
            # sample variance, not unbiased variance
            # Note: stop_gradient does not change the gradient that gets
            #       backpropagated to the mean from the variance calculation,
            #       because that gradient is zero
            variance = math_ops.reduce_mean(
                math_ops.squared_difference(y, array_ops.stop_gradient(mean)),
                axes,
                keepdims=True,
                name='variance')
        if not keep_dims:
            mean = array_ops.squeeze(mean, axes)
            variance = array_ops.squeeze(variance, axes)
        if x.dtype == dtypes.float16:
            # Hand back float16 results for float16 input.
            return (math_ops.cast(mean, dtypes.float16),
                    math_ops.cast(variance, dtypes.float16))
        else:
            return (mean, variance)
def least_squares_generator_loss(
        discriminator_gen_outputs, real_label=1, weights=1.0, scope=None,
        loss_collection=ops.GraphKeys.LOSSES,
        reduction=losses.Reduction.SUM_BY_NONZERO_WEIGHTS,
        add_summaries=False):
    """Least squares generator loss.

    Taken from `Least Squares Generative Adversarial Networks`
    (https://arxiv.org/abs/1611.04076):

    L = 1/2 * (D(G(z)) - `real_label`) ** 2

    where D(y) are discriminator logits.

    Args:
      discriminator_gen_outputs: Discriminator output on generated data.
        Expected to be in the range of (-inf, inf).
      real_label: The value that the generator is trying to get the
        discriminator to output on generated data.
      weights: Optional `Tensor` whose rank is either 0, or the same rank as
        `discriminator_gen_outputs`, and must be broadcastable to
        `discriminator_gen_outputs` (i.e., all dimensions must be either `1`,
        or the same as the corresponding dimension).
      scope: The scope for the operations performed in computing the loss.
      loss_collection: collection to which this loss will be added.
      reduction: A `tf.losses.Reduction` to apply to loss.
      add_summaries: Whether or not to add summaries for the loss.

    Returns:
      A loss Tensor. The shape depends on `reduction`.
    """
    with ops.name_scope(
            scope, 'lsq_generator_loss',
            (discriminator_gen_outputs, real_label)) as scope:
        discriminator_gen_outputs = math_ops.to_float(
            discriminator_gen_outputs)
        squared_error = math_ops.squared_difference(
            discriminator_gen_outputs, real_label)
        unweighted = squared_error / 2.0
        loss = losses.compute_weighted_loss(
            unweighted, weights, scope, loss_collection, reduction)

    # The summary is optional; the loss itself is returned either way.
    if add_summaries:
        summary.scalar('generator_lsq_loss', loss)
    return loss
def mean_squared_logarithmic_error(y_true, y_pred):
    """Computes the mean squared logarithmic error between `y_true` and `y_pred`.

    Both inputs are clamped from below at `K.epsilon()` and shifted by one
    before the logarithm is taken:

    `loss = mean(square(log(max(y_pred, eps) + 1) - log(max(y_true, eps) + 1)),
    axis=-1)`

    Args:
      y_true: Ground truth values. shape = `[batch_size, d0, .. dN]`. Cast to
        the dtype of `y_pred`.
      y_pred: The predicted values. shape = `[batch_size, d0, .. dN]`.

    Returns:
      Mean squared logarithmic error values.
      shape = `[batch_size, d0, .. dN-1]`.
    """
    y_pred = ops.convert_to_tensor(y_pred)
    y_true = math_ops.cast(y_true, y_pred.dtype)

    pred_floor = K.maximum(y_pred, K.epsilon())
    log_pred = math_ops.log(pred_floor + 1.)
    true_floor = K.maximum(y_true, K.epsilon())
    log_true = math_ops.log(true_floor + 1.)

    squared_log_error = math_ops.squared_difference(log_pred, log_true)
    return K.mean(squared_log_error, axis=-1)
def mean_squared_error(y_true, y_pred):
    """Computes the mean squared error between labels and predictions.

    The element-wise squared difference is averaged over the last dimension:

    `loss = mean(square(y_true - y_pred), axis=-1)`

    Args:
      y_true: Ground truth values. shape = `[batch_size, d0, .. dN]`. Cast to
        the dtype of `y_pred`.
      y_pred: The predicted values. shape = `[batch_size, d0, .. dN]`.

    Returns:
      Mean squared error values. shape = `[batch_size, d0, .. dN-1]`.
    """
    predictions = ops.convert_to_tensor(y_pred)
    targets = math_ops.cast(y_true, predictions.dtype)
    squared_error = math_ops.squared_difference(predictions, targets)
    return K.mean(squared_error, axis=-1)
def per_example_squared_loss(labels, weights, predictions):
    """Squared loss given labels, example weights and predictions.

    The squared error is summed over axis 1 (kept as a size-1 axis) and then
    multiplied by `weights`.

    Args:
      labels: Rank 2 (N, D) tensor of per-example labels.
      weights: Rank 2 (N, 1) tensor of per-example weights.
      predictions: Rank 2 (N, D) tensor of per-example predictions.

    Returns:
      loss: A Rank 2 (N, 1) tensor of per-example squared loss.
      update_op: A no-op; this loss keeps no internal state to update.
    """
    squared_error = math_ops.squared_difference(predictions, labels)
    summed_error = math_ops.reduce_sum(squared_error, 1, keepdims=True)
    weighted_loss = summed_error * weights
    return weighted_loss, control_flow_ops.no_op()
def _BatchNormGrad(grad_y, x, scale, epsilon, data_format):
    """Returns the gradients for the 3 inputs of BatchNorm.

    Args:
      grad_y: A `Tensor` of 4 dimensions for gradient for y.
      x: A `Tensor` of 4 dimensions for x.
      scale: A `Tensor` of 1 dimension for scaling.
      epsilon: A small float number added to the variance of x.
      data_format: The data format for input. Either b"NHWC" or b"NCHW".

    Returns:
      A tuple (grad_x, grad_scale, grad_offset), where grad_x is the gradient
      for x, grad_scale the gradient for scale, and grad_offset the gradient
      for offset.
    """
    if data_format == b"NHWC":
        keep_dims = False
        reduce_axis = [0, 1, 2]
    else:
        keep_dims = True
        reduce_axis = [0, 2, 3]
        # With the channel on axis 1 the reduced statistics keep a
        # [1, C, 1, 1] shape; give `scale` the same layout so it broadcasts.
        shape = [1, array_ops.size(scale), 1, 1]
        scale = array_ops.reshape(scale, shape)
    mean_grad_y = math_ops.reduce_mean(grad_y, reduce_axis, keep_dims=keep_dims)
    mean_x = math_ops.reduce_mean(x, reduce_axis, keep_dims=keep_dims)
    var_x = math_ops.reduce_mean(
        math_ops.squared_difference(x, array_ops.stop_gradient(mean_x)),
        reduce_axis,
        keep_dims=keep_dims)
    grad_y_offset = grad_y - mean_grad_y
    x_offset = x - mean_x
    mean = math_ops.reduce_mean(
        grad_y * x_offset, axis=reduce_axis, keep_dims=keep_dims)
    grad_x = scale * math_ops.rsqrt(var_x + epsilon) * (
        grad_y_offset - math_ops.reciprocal(var_x + epsilon) * mean * x_offset)
    grad_scale = math_ops.rsqrt(var_x + epsilon) * math_ops.reduce_sum(
        grad_y * x_offset, axis=reduce_axis, keep_dims=keep_dims)
    if data_format == b"NCHW":
        # Drop only the reduced axes. An unrestricted squeeze would also
        # remove the channel axis when there is a single channel, yielding a
        # scalar instead of a length-1 vector matching `scale`.
        grad_scale = array_ops.squeeze(grad_scale, reduce_axis)
    grad_offset = math_ops.reduce_sum(grad_y, axis=reduce_axis)
    return grad_x, grad_scale, grad_offset
def mse_align_to_y_true(y_true, y_pred):
    """Mean squared error over the last two axes, ignoring NaN targets.

    The inputs are first passed through `_align_y_pre_process`. NaN entries in
    either tensor are replaced by zero before the squared difference is taken,
    and the sum over the last two axes is divided by the number of non-NaN
    target entries.

    Args:
      y_true: Ground truth values; NaN marks entries excluded from the count.
      y_pred: Predicted values.

    Returns:
      The summed squared error divided by the count of non-NaN targets,
      reduced over the last two axes.
    """
    y_true_to_calc, y_pred_to_calc = _align_y_pre_process(y_true, y_pred)

    true_is_nan = tf.math.is_nan(y_true_to_calc)
    valid_count = tf.math.count_nonzero(
        tf.where(true_is_nan, 0, 1),
        axis=[-1, -2],
        dtype=tf.dtypes.float32)

    true_filled = tf.where(true_is_nan, 0., y_true_to_calc)
    pred_filled = tf.where(
        tf.math.is_nan(y_pred_to_calc), 0., y_pred_to_calc)

    squared_error = math_ops.squared_difference(pred_filled, true_filled)
    error_sum = tf.math.reduce_sum(squared_error, axis=[-1, -2])
    return tf.math.divide(error_sum, valid_count)
def _testGrad(self, left_shape, right_shape):
    """Checks the numeric gradient of squared_difference w.r.t. both inputs.

    Random inputs of the two given shapes are fed to
    `math_ops.squared_difference`; the gradient error for each input must be
    below 1e-10. The output shape is taken to be whichever input shape has
    the higher rank (the right one on a tie).
    """
    longer_left = len(left_shape) > len(right_shape)
    output_shape = left_shape if longer_left else right_shape

    left_values = np.random.randn(*left_shape)
    right_values = np.random.randn(*right_shape)

    with self.cached_session(use_gpu=True):
        left_tensor = constant_op.constant(left_values, shape=left_shape)
        right_tensor = constant_op.constant(right_values, shape=right_shape)
        output = math_ops.squared_difference(left_tensor, right_tensor)
        left_err = gradient_checker.compute_gradient_error(
            left_tensor,
            left_shape,
            output,
            output_shape,
            x_init_value=left_values)
        right_err = gradient_checker.compute_gradient_error(
            right_tensor,
            right_shape,
            output,
            output_shape,
            x_init_value=right_values)
    self.assertLess(left_err, 1e-10)
    self.assertLess(right_err, 1e-10)
def _BatchNormGrad(grad_y, x, scale, epsilon, data_format):
    """Returns the gradients for the 3 inputs of BatchNorm.

    Args:
      grad_y: A `Tensor` of 4 dimensions for gradient for y.
      x: A `Tensor` of 4 dimensions for x.
      scale: A `Tensor` of 1 dimension for scaling.
      epsilon: A small float number added to the variance of x.
      data_format: The data format for input. Either b"NHWC" or b"NCHW".

    Returns:
      A tuple (grad_x, grad_scale, grad_offset), where grad_x is the gradient
      for x, grad_scale the gradient for scale, and grad_offset the gradient
      for offset.
    """
    if data_format == b"NHWC":
        keep_dims = False
        reduce_axis = [0, 1, 2]
    else:
        keep_dims = True
        reduce_axis = [0, 2, 3]
        # With the channel on axis 1 the reduced statistics keep a
        # [1, C, 1, 1] shape; give `scale` the same layout so it broadcasts.
        shape = [1, array_ops.size(scale), 1, 1]
        scale = array_ops.reshape(scale, shape)
    mean_grad_y = math_ops.reduce_mean(grad_y, reduce_axis, keep_dims=keep_dims)
    mean_x = math_ops.reduce_mean(x, reduce_axis, keep_dims=keep_dims)
    var_x = math_ops.reduce_mean(
        math_ops.squared_difference(x, array_ops.stop_gradient(mean_x)),
        reduce_axis,
        keep_dims=keep_dims)
    grad_y_offset = grad_y - mean_grad_y
    x_offset = x - mean_x
    mean = math_ops.reduce_mean(
        grad_y * x_offset, axis=reduce_axis, keep_dims=keep_dims)
    grad_x = scale * math_ops.rsqrt(var_x + epsilon) * (
        grad_y_offset - math_ops.reciprocal(var_x + epsilon) * mean * x_offset)
    grad_scale = math_ops.rsqrt(var_x + epsilon) * math_ops.reduce_sum(
        grad_y * x_offset, axis=reduce_axis, keep_dims=keep_dims)
    if data_format == b"NCHW":
        # Drop only the reduced axes. An unrestricted squeeze would also
        # remove the channel axis when there is a single channel, yielding a
        # scalar instead of a length-1 vector matching `scale`.
        grad_scale = array_ops.squeeze(grad_scale, reduce_axis)
    grad_offset = math_ops.reduce_sum(grad_y, axis=reduce_axis)
    return grad_x, grad_scale, grad_offset
def _kl_normal_normal(n_a, n_b, name=None):
    """Calculate the batched KL divergence KL(n_a || n_b) with n_a and n_b Normal.

    The result is
    `(loc_a - loc_b)^2 / (2 * scale_b^2) + 0.5 * (r - 1 - log(r))`
    with `r = scale_a^2 / scale_b^2`.

    Args:
      n_a: instance of a Normal distribution object.
      n_b: instance of a Normal distribution object.
      name: (optional) Name to use for created operations.
        default is "kl_normal_normal".

    Returns:
      Batchwise KL(n_a || n_b)
    """
    with ops.name_scope(name, "kl_normal_normal", [n_a.loc, n_b.loc]):
        dtype = n_a.dtype
        one = constant_op.constant(1, dtype=dtype)
        two = constant_op.constant(2, dtype=dtype)
        half = constant_op.constant(0.5, dtype=dtype)

        var_a = math_ops.square(n_a.scale)
        var_b = math_ops.square(n_b.scale)
        var_ratio = var_a / var_b

        mean_term = (
            math_ops.squared_difference(n_a.loc, n_b.loc) / (two * var_b))
        variance_term = half * (var_ratio - one - math_ops.log(var_ratio))
        return mean_term + variance_term
def dsnt_mse(y_true, y_pred):
    """Computes the mean squared error between labels and predictions.

    The element-wise squared difference is averaged over the last dimension:

    `loss = mean(square(y_true - y_pred), axis=-1)`

    Standalone usage (shown with the Keras loss of the same form):

    >>> y_true = np.random.randint(0, 2, size=(2, 3))
    >>> y_pred = np.random.random(size=(2, 3))
    >>> loss = tf.keras.losses.mean_squared_error(y_true, y_pred)
    >>> assert loss.shape == (2,)
    >>> assert np.array_equal(
    ...     loss.numpy(), np.mean(np.square(y_true - y_pred), axis=-1))

    Args:
      y_true: Ground truth values. shape = `[batch_size, d0, .. dN]`. Cast to
        the dtype of `y_pred`.
      y_pred: The predicted values. shape = `[batch_size, d0, .. dN]`.

    Returns:
      Mean squared error values. shape = `[batch_size, d0, .. dN-1]`.
    """
    predictions = ops.convert_to_tensor_v2(y_pred)
    targets = math_ops.cast(y_true, predictions.dtype)
    squared_error = math_ops.squared_difference(predictions, targets)
    return K.mean(squared_error, axis=-1)
def update_state(self, y_true, y_pred, sample_weight=None):
    """Accumulates a weighted squared error into `self.metric`.

    `y_true` packs two things along axis 1: the first four columns are the
    targets and the remaining columns are per-element weights. For each row
    the weighted squared errors are summed and divided by the row's weight
    sum; the mean of those per-row values is added to `self.metric`.

    Args:
      y_true: Rank-2 tensor; columns `[:4]` are targets, columns `[4:]` are
        weights.
      y_pred: Predictions, compared against the four target columns.
      sample_weight: Accepted for interface compatibility; not used.

    Returns:
      The result of `self.metric.assign_add`.
    """
    weights = y_true[:, 4:]
    targets = y_true[:, :4]

    y_pred = ops.convert_to_tensor(y_pred)
    targets = math_ops.cast(targets, y_pred.dtype)

    weighted_sq_error = tf.math.multiply(
        math_ops.squared_difference(y_pred, targets), weights)
    per_row_error = (
        K.sum(weighted_sq_error, axis=-1) / K.sum(weights, axis=-1))
    loss = K.mean(per_row_error, axis=-1)

    return self.metric.assign_add(loss)
def _define_diag_covariance_probs(self, shard_id, shard):
    """Defines the diagonal covariance probabilities per example in a class.

    The result, a matrix num_examples * num_classes, is stored in
    `self._probs[shard_id]`.

    Args:
      shard_id: id of the current shard.
      shard: current data shard, 1 X num_examples X dimensions.
    """
    # A small constant keeps the log and the reciprocal away from zero.
    # TODO(xavigonzalvo): look into alternatives to log for
    # reparametrization of variance parameters.
    # num_classes X 1
    log_det = math_ops.reduce_sum(
        math_ops.log(self._covs + 1e-3), 1, keepdims=True)
    squared_dev = math_ops.squared_difference(shard, self._means)
    inverse_cov = array_ops.expand_dims(1.0 / (self._covs + 1e-3), 2)
    # num_classes X num_examples
    scaled_dev = math_ops.matmul(squared_dev, inverse_cov)
    scaled_dev = array_ops.transpose(array_ops.squeeze(scaled_dev, [2]))
    normalizer = (
        math_ops.to_float(self._dimensions) * math_ops.log(2.0 * np.pi))
    self._probs[shard_id] = -0.5 * (
        normalizer + array_ops.transpose(log_det) + scaled_dev)
def frechet_classifier_distance_from_activations(real_activations,
                                                 generated_activations):
    """Frechet classifier distance between two sets of activations.

    Computes `|m - m_w|^2 + Tr(sigma + sigma_w) - 2 * Tr(sqrt(sigma sigma_w))`
    from the sample means and (n - 1)-normalized sample covariances of the
    two inputs. The computation runs in float64 and the result is cast back
    to the dtype of `real_activations`.

    Args:
      real_activations: Rank-2 tensor of activations of real data.
      generated_activations: Rank-2 tensor of activations of generated data.

    Returns:
      The Frechet distance, a scalar of the dtype of `real_activations`.
    """
    real_activations.shape.assert_has_rank(2)
    generated_activations.shape.assert_has_rank(2)

    activations_dtype = real_activations.dtype
    needs_cast = activations_dtype != dtypes.float64
    if needs_cast:
        real_activations = math_ops.to_double(real_activations)
        generated_activations = math_ops.to_double(generated_activations)

    # Sample means.
    real_mean = math_ops.reduce_mean(real_activations, 0)
    gen_mean = math_ops.reduce_mean(generated_activations, 0)
    real_count = math_ops.to_double(array_ops.shape(real_activations)[0])
    gen_count = math_ops.to_double(array_ops.shape(generated_activations)[0])

    # Sample covariances: (X - mu)^T (X - mu) / (n - 1).
    real_centered = real_activations - real_mean
    real_cov = math_ops.matmul(
        real_centered, real_centered, transpose_a=True) / (real_count - 1)
    gen_centered = generated_activations - gen_mean
    gen_cov = math_ops.matmul(
        gen_centered, gen_centered, transpose_a=True) / (gen_count - 1)

    sqrt_trace_component = trace_sqrt_product(real_cov, gen_cov)

    covariance_term = (
        math_ops.trace(real_cov + gen_cov) - 2.0 * sqrt_trace_component)
    mean_term = math_ops.reduce_sum(
        math_ops.squared_difference(real_mean, gen_mean))
    fid = covariance_term + mean_term
    if needs_cast:
        fid = math_ops.cast(fid, activations_dtype)
    return fid
def contrastive_loss(labels, embeddings_anchor, embeddings_positive,
                     margin=1.0):
    """Computes the contrastive loss.

    This loss encourages the embedding to be close to each other for
    the samples of the same label and the embedding to be far apart at least
    by the margin constant for the samples of different labels.
    See: http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf

    Args:
      labels: 1-D tf.int32 `Tensor` with shape [batch_size] of
        binary labels indicating positive vs negative pair.
      embeddings_anchor: 2-D float `Tensor` of embedding vectors for the
        anchor images. Embeddings should be l2 normalized.
      embeddings_positive: 2-D float `Tensor` of embedding vectors for the
        positive images. Embeddings should be l2 normalized.
      margin: margin term in the loss definition.

    Returns:
      contrastive_loss: scalar `Tensor`, the mean of the per-pair losses.
    """
    # Euclidean distance of every (anchor, positive) pair.
    squared_diff = math_ops.squared_difference(
        embeddings_anchor, embeddings_positive)
    distances = math_ops.sqrt(math_ops.reduce_sum(squared_diff, 1))

    # Labels are {0, 1} for negative / positive pairs: positive pairs are
    # pulled together, negative pairs are pushed out to the margin.
    positive_weight = math_ops.cast(labels, distances.dtype)
    pull_term = positive_weight * math_ops.square(distances)
    negative_weight = 1. - math_ops.cast(labels, distances.dtype)
    push_term = negative_weight * math_ops.square(
        math_ops.maximum(margin - distances, 0.))

    return math_ops.reduce_mean(pull_term + push_term, name='contrastive_loss')
def mean_squared_error(y_true, y_pred):
    """Returns the squared error averaged over the last axis.

    `y_pred` is converted to a tensor and `y_true` is cast to its dtype
    before the element-wise squared difference is taken.
    """
    predictions = ops.convert_to_tensor(y_pred)
    targets = math_ops.cast(y_true, predictions.dtype)
    squared_error = math_ops.squared_difference(predictions, targets)
    return K.mean(squared_error, axis=-1)
def _reduce_variance(x, axis=None, keepdims=False):
    """Returns the biased (divide-by-N) variance of `x` along `axis`.

    The mean is computed with its dimensions kept so that it broadcasts
    against `x`; `keepdims` only affects the returned tensor.
    """
    centre = math_ops.reduce_mean(x, axis, keepdims=True)
    squared_deviation = math_ops.squared_difference(x, centre)
    return math_ops.reduce_mean(squared_deviation, axis, keepdims)
def _BatchNormGrad(grad_y,
                   x,
                   scale,
                   pop_mean,
                   pop_var,
                   epsilon,
                   data_format,
                   is_training=True):
    """Returns the gradients for the 3 inputs of BatchNorm.

    Args:
      grad_y: A `Tensor` of 4 dimensions for gradient for y.
      x: A `Tensor` of 4 dimensions for x.
      scale: A `Tensor` of 1 dimension for scaling.
      pop_mean: A `Tensor` of 1 dimension for the population mean. Only used
        when is_training=False.
      pop_var: A `Tensor` of 1 dimension for the population variance. Only
        used when is_training=False.
      epsilon: A small float number added to the variance of x.
      data_format: The data format for input. Either b"NHWC" or b"NCHW".
      is_training: A bool value to indicate the operation is for training
        (default) or inference.

    Returns:
      A tuple (grad_x, grad_scale, grad_offset), where grad_x is the gradient
      for x, grad_scale the gradient for scale, and grad_offset the gradient
      for offset. grad_x is cast back to the dtype of `x`.
    """
    x_dtype = x.dtype.base_dtype
    if x_dtype == dtypes.float16:
        # float16 math is too imprecise, so we do the batch norm gradient
        # computations in float32.
        x = math_ops.cast(x, dtypes.float32)
        grad_y = math_ops.cast(grad_y, dtypes.float32)
    if is_training:
        if data_format == b"NHWC":
            keepdims = False
            reduce_axis = [0, 1, 2]
        else:
            keepdims = True
            reduce_axis = [0, 2, 3]
            # With the channel on axis 1 the reduced statistics keep a
            # [1, C, 1, 1] shape; give `scale` the same layout so it
            # broadcasts.
            shape = [1, array_ops.size(scale), 1, 1]
            scale = array_ops.reshape(scale, shape)
        mean_grad_y = math_ops.reduce_mean(
            grad_y, reduce_axis, keepdims=keepdims)
        mean_x = math_ops.reduce_mean(x, reduce_axis, keepdims=keepdims)
        var_x = math_ops.reduce_mean(
            math_ops.squared_difference(x, array_ops.stop_gradient(mean_x)),
            reduce_axis,
            keepdims=keepdims)
        grad_y_offset = grad_y - mean_grad_y
        x_offset = x - mean_x
        mean = math_ops.reduce_mean(
            grad_y * x_offset, axis=reduce_axis, keepdims=keepdims)
        grad_x = scale * math_ops.rsqrt(var_x + epsilon) * (
            grad_y_offset -
            math_ops.reciprocal(var_x + epsilon) * mean * x_offset)
        grad_scale = math_ops.rsqrt(var_x + epsilon) * math_ops.reduce_sum(
            grad_y * x_offset, axis=reduce_axis, keepdims=keepdims)
        if data_format == b"NCHW":
            # Drop only the reduced axes. An unrestricted squeeze would also
            # remove the channel axis when there is a single channel,
            # yielding a scalar instead of a length-1 vector matching `scale`.
            grad_scale = array_ops.squeeze(grad_scale, reduce_axis)
        grad_offset = math_ops.reduce_sum(grad_y, axis=reduce_axis)
        return math_ops.cast(grad_x, x_dtype), grad_scale, grad_offset
    else:
        if data_format == b"NHWC":
            reduce_axis = [0, 1, 2]
        else:
            reduce_axis = [0, 2, 3]
            # Lay the per-channel vectors out as [1, C, 1, 1] so they
            # broadcast against channel-first inputs.
            shape = [1, array_ops.size(pop_mean), 1, 1]
            pop_mean = array_ops.reshape(pop_mean, shape)
            pop_var = array_ops.reshape(pop_var, shape)
            scale = array_ops.reshape(scale, shape)

        grad_offset = math_ops.reduce_sum(grad_y, axis=reduce_axis)
        var_rsqrt = math_ops.rsqrt(pop_var + epsilon)
        grad_scale = math_ops.reduce_sum(
            grad_y * (x - pop_mean) * var_rsqrt, axis=reduce_axis)
        grad_x = grad_y * scale * var_rsqrt
        return math_ops.cast(grad_x, x_dtype), grad_scale, grad_offset
def sufficient_statistics(x, axes, shift=False, keep_dims=False, name=None):
    """Calculate the sufficient statistics for the mean and variance of `x`.

    These sufficient statistics are computed using the one pass algorithm on
    an input that's optionally shifted using the value of the 1st element in
    `x`. See:
    https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Computing_shifted_data
    Unfortunately, in some cases using a random individual sample as the shift
    value leads experimentally to very poor numerical stability, so it is
    disabled by default. The one-pass approach might have to be revised
    accordingly.

    Args:
      x: A `Tensor`.
      axes: Array of ints. Axes along which to compute mean and variance.
      shift: If true, shift the data to provide more numerically stable
        results.
      keep_dims: produce statistics with the same dimensionality as the input.
      name: Name used to scope the operations that compute the sufficient
        stats.

    Returns:
      Four `Tensor` objects of the same type as `x`:
      * the count (number of elements to average over).
      * the (possibly shifted) sum of the elements in the array.
      * the (possibly shifted) sum of squares of the elements in the array.
      * the shift by which the mean must be corrected or None if `shift` is
        False.
    """
    with ops.op_scope([x, axes], name, "sufficient_statistics"):
        x = ops.convert_to_tensor(x, name="x")
        static_shape = x.get_shape()
        if static_shape.is_fully_defined():
            # Shape is known statically: count the reduced elements in
            # Python and build the shape of `x` with reduced axes set to 1.
            reduced_axes = set(axes)
            counts = 1
            m_shape = []
            for d, dimension in enumerate(static_shape.dims):
                if d in reduced_axes:
                    counts *= dimension.value
                    m_shape.append(1)
                else:
                    m_shape.append(dimension.value)
            counts = constant_op.constant(counts, dtype=x.dtype)
        else:
            # shape needs to be inferred at runtime.
            dynamic_shape = array_ops.shape(x)
            axis_mask = sparse_ops.sparse_to_dense(
                axes, array_ops.shape(dynamic_shape), True, False)
            m_shape = math_ops.select(
                axis_mask, array_ops.ones_like(dynamic_shape), dynamic_shape)
            counts = math_ops.cast(
                math_ops.reduce_prod(dynamic_shape / m_shape),
                x.dtype,
                name="count")

        if not shift:
            m_ss = x
            v_ss = math_ops.square(x)
            shift_value = None
        else:
            # The shift is the slice of `x` starting at the origin with
            # extent 1 along every reduced axis.
            shift_value = array_ops.slice(
                x, array_ops.zeros_like(m_shape), m_shape)
            m_ss = math_ops.sub(x, shift_value)
            v_ss = math_ops.squared_difference(x, shift_value)
            if keep_dims:
                shift_value = array_ops.identity(shift_value, name="shift")
            else:
                shift_value = array_ops.squeeze(
                    shift_value, squeeze_dims=axes, name="shift")

        m_ss = math_ops.reduce_sum(
            m_ss, axes, keep_dims=keep_dims, name="mean_ss")
        v_ss = math_ops.reduce_sum(
            v_ss, axes, keep_dims=keep_dims, name="var_ss")
    return counts, m_ss, v_ss, shift_value
def mean_squared_logarithmic_error(y_true, y_pred):
    """Returns the mean squared logarithmic error over the last axis.

    Both inputs are clipped from below at `K.epsilon()` and shifted by one
    before the logarithm is taken; the squared difference of the two logs is
    then averaged over the last axis. `y_true` is cast to the dtype of
    `y_pred`.
    """
    y_pred = ops.convert_to_tensor(y_pred)
    y_true = math_ops.cast(y_true, y_pred.dtype)

    clipped_pred = K.clip(y_pred, K.epsilon(), None)
    log_pred = math_ops.log(clipped_pred + 1.)
    clipped_true = K.clip(y_true, K.epsilon(), None)
    log_true = math_ops.log(clipped_true + 1.)

    squared_log_error = math_ops.squared_difference(log_pred, log_true)
    return K.mean(squared_log_error, axis=-1)
def frechet_classifier_distance_from_activations(real_activations,
                                                 generated_activations):
    """Classifier distance for evaluating a generative model.

    This method computes the Frechet classifier distance from activations of
    real images and generated images. This can be used independently of the
    frechet_classifier_distance() method, especially in the case of using
    large batches during evaluation where we would like to precompute all of
    the activations before computing the classifier distance.

    This technique is described in detail in
    https://arxiv.org/abs/1706.08500. Given two Gaussian distributions with
    means m and m_w and covariance matrices C and C_w, this function
    calculates

                  |m - m_w|^2 + Tr(C + C_w - 2(C * C_w)^(1/2))

    which captures how different the distributions of real images and
    generated images (or more accurately, their visual features) are. Note
    that unlike the Inception score, this is a true distance and utilizes
    information about real world images.

    Note that when computed using sample means and sample covariance
    matrices, Frechet distance is biased. It is more biased for small sample
    sizes. (e.g. even if the two distributions are the same, for a small
    sample size, the expected Frechet distance is large). It is important to
    use the same sample size to compute frechet classifier distance when
    comparing two generative models.

    Args:
      real_activations: 2D Tensor containing activations of real data. Shape
        is [batch_size, activation_size].
      generated_activations: 2D Tensor containing activations of generated
        data. Shape is [batch_size, activation_size].

    Returns:
      The Frechet Inception distance. A floating-point scalar of the same
      type as the output of the activations.
    """
    real_activations.shape.assert_has_rank(2)
    generated_activations.shape.assert_has_rank(2)

    # All arithmetic is done in float64; remember whether to cast back.
    activations_dtype = real_activations.dtype
    needs_cast = activations_dtype != dtypes.float64
    if needs_cast:
        real_activations = math_ops.to_double(real_activations)
        generated_activations = math_ops.to_double(generated_activations)

    # Sample means of both activation sets.
    real_mean = math_ops.reduce_mean(real_activations, 0)
    gen_mean = math_ops.reduce_mean(generated_activations, 0)
    real_count = math_ops.to_double(array_ops.shape(real_activations)[0])
    gen_count = math_ops.to_double(array_ops.shape(generated_activations)[0])

    # Sample covariances: sigma = (1 / (n - 1)) * (X - mu)^T (X - mu)
    real_centered = real_activations - real_mean
    real_cov = math_ops.matmul(
        real_centered, real_centered, transpose_a=True) / (real_count - 1)
    gen_centered = generated_activations - gen_mean
    gen_cov = math_ops.matmul(
        gen_centered, gen_centered, transpose_a=True) / (gen_count - 1)

    # Tr(sqrt(sigma sigma_w)) component of FID.
    sqrt_trace_component = trace_sqrt_product(real_cov, gen_cov)

    # Covariance component; note that trace(A + B) = trace(A) + trace(B).
    covariance_term = (
        math_ops.trace(real_cov + gen_cov) - 2.0 * sqrt_trace_component)

    # Squared distance between the means.
    # Equivalent to L2 but more stable.
    mean_term = math_ops.reduce_sum(
        math_ops.squared_difference(real_mean, gen_mean))

    fid = covariance_term + mean_term
    if needs_cast:
        fid = math_ops.cast(fid, activations_dtype)
    return fid
def diagonal_only_frechet_classifier_distance_from_activations(
        real_activations, generated_activations):
    """Classifier distance for evaluating a generative model.

    This is based on the Frechet Inception distance, but for an arbitrary
    classifier.

    This technique is described in detail in
    https://arxiv.org/abs/1706.08500. Given two Gaussian distributions with
    means m and m_w and covariance matrices C and C_w, this function
    calculates

            |m - m_w|^2 + (sigma + sigma_w - 2(sigma x sigma_w)^(1/2))

    which captures how different the distributions of real images and
    generated images (or more accurately, their visual features) are. Note
    that unlike the Inception score, this is a true distance and utilizes
    information about real world images. In this variant, we compute
    diagonal-only covariance matrices. As a result, instead of computing an
    expensive matrix square root, we can do something much simpler, and has
    O(n) vs O(n^2) space complexity.

    Note that when computed using sample means and sample covariance
    matrices, Frechet distance is biased. It is more biased for small sample
    sizes. (e.g. even if the two distributions are the same, for a small
    sample size, the expected Frechet distance is large). It is important to
    use the same sample size to compute frechet classifier distance when
    comparing two generative models.

    Args:
      real_activations: Real images to use to compute Frechet Inception
        distance.
      generated_activations: Generated images to use to compute Frechet
        Inception distance.

    Returns:
      The diagonal-only Frechet Inception distance. A floating-point scalar
      of the same type as the output of the activations.

    Raises:
      ValueError: If the shape of the variance and mean vectors are not equal.
    """
    real_activations.shape.assert_has_rank(2)
    generated_activations.shape.assert_has_rank(2)

    # All arithmetic is done in float64; remember whether to cast back.
    activations_dtype = real_activations.dtype
    needs_cast = activations_dtype != dtypes.float64
    if needs_cast:
        real_activations = math_ops.cast(real_activations, dtypes.float64)
        generated_activations = math_ops.cast(
            generated_activations, dtypes.float64)

    # Per-feature mean and variance of each activation set.
    real_mean, real_var = nn_impl.moments(real_activations, axes=[0])
    gen_mean, gen_var = nn_impl.moments(generated_activations, axes=[0])

    actual_shape = real_var.get_shape()
    expected_shape = real_mean.get_shape()
    if actual_shape != expected_shape:
        raise ValueError('shape: {} must match expected shape: {}'.format(
            actual_shape, expected_shape))

    # Covariance component; note that trace(A + B) = trace(A) + trace(B).
    geometric_mean_var = math_ops.sqrt(math_ops.multiply(real_var, gen_var))
    covariance_term = math_ops.reduce_sum(
        (real_var + gen_var) - 2.0 * geometric_mean_var)

    # Squared distance between the means.
    # Equivalent to L2 but more stable.
    mean_term = math_ops.reduce_sum(
        math_ops.squared_difference(real_mean, gen_mean))

    dofid = covariance_term + mean_term
    if needs_cast:
        dofid = math_ops.cast(dofid, activations_dtype)
    return dofid
def weighted_moments(x, axes, frequency_weights, name=None, keep_dims=False):
    """Returns the frequency-weighted mean and variance of `x`.

    Args:
      x: A tensor.
      axes: 1-d tensor of int32 values; these are the axes along which
        to compute mean and variance.
      frequency_weights: A tensor of positive weights which can be
        broadcast with x.
      name: Name used to scope the operation.
      keep_dims: Produce moments with the same dimensionality as the input.

    Returns:
      Two tensors: `weighted_mean` and `weighted_variance`.
    """
    with ops.name_scope(name, "weighted_moments",
                        [x, frequency_weights, axes]):
        x = ops.convert_to_tensor(x, name="x")
        frequency_weights = ops.convert_to_tensor(
            frequency_weights, name="frequency_weights")

        # A plain two-pass method is used here. float16 inputs are promoted
        # to float32 for the computation and the results cast back at the end.
        is_half = x.dtype == dtypes.float16
        if is_half:
            x = math_ops.cast(x, dtypes.float32)

        if frequency_weights.dtype != x.dtype:
            frequency_weights = math_ops.cast(frequency_weights, x.dtype)

        # Every reduction below keeps its dims regardless of `keep_dims`, so
        # that intermediate results stay broadcast-compatible with the inputs.
        weighted_input_sum = math_ops.reduce_sum(
            frequency_weights * x,
            axes,
            name="weighted_input_sum",
            keep_dims=True)

        # The weights are only required to be broadcast-compatible with x.
        # Adding zeros shaped like x expands them to a per-item weight with
        # the same shape as (frequency_weights * x), which avoids reasoning
        # through the broadcast rules to get a correct sum of weights.
        per_item_weights = frequency_weights + array_ops.zeros_like(x)
        sum_of_weights = math_ops.reduce_sum(
            per_item_weights, axes, name="sum_of_weights", keep_dims=True)
        inv_weight_sum = math_ops.reciprocal(
            sum_of_weights, name="inv_weight_sum")

        weighted_mean = math_ops.mul(weighted_input_sum, inv_weight_sum)

        # Second pass: weighted squared distance from the weighted mean.
        weighted_distsq = math_ops.reduce_sum(
            frequency_weights * math_ops.squared_difference(x, weighted_mean),
            axes,
            name="weighted_distsq",
            keep_dims=True)
        weighted_variance = math_ops.mul(weighted_distsq, inv_weight_sum)

        if not keep_dims:
            weighted_mean = array_ops.squeeze(
                weighted_mean, squeeze_dims=axes)
            weighted_variance = array_ops.squeeze(
                weighted_variance, squeeze_dims=axes)

        if is_half:
            weighted_mean = math_ops.cast(weighted_mean, dtypes.float16)
            weighted_variance = math_ops.cast(
                weighted_variance, dtypes.float16)

        return weighted_mean, weighted_variance
def _mean_squared_error(self, targets, outputs, mask):
    """Returns the summed squared error divided by the sum of `mask`.

    Args:
      targets: Target values.
      outputs: Predicted values, broadcast-compatible with `targets`.
      mask: Tensor whose sum is used as the normaliser. It must have the same
        floating-point dtype as the squared error.

    Returns:
      A scalar tensor: `sum((targets - outputs)^2) / sum(mask)`, or 0 when
      `sum(mask)` is 0.
    """
    # NOTE(review): the mask only normalises the sum; it is not multiplied
    # into the squared error. Presumably masked positions are already zeroed
    # in both inputs by the caller -- confirm.
    squared_error = math_ops.squared_difference(targets, outputs)
    # divide_no_nan yields 0 instead of NaN/inf when the mask sums to zero.
    mse = tf.math.divide_no_nan(
        tf.reduce_sum(squared_error), tf.reduce_sum(mask))
    return mse
def frechet_classifier_distance_from_activations(real_activations,
                                                 generated_activations):
    """Classifier distance for evaluating a generative model.

    This method computes the Frechet classifier distance from activations of
    real images and generated images. This can be used independently of the
    frechet_classifier_distance() method, especially in the case of using
    large batches during evaluation where we would like to precompute all of
    the activations before computing the classifier distance.

    This technique is described in detail in
    https://arxiv.org/abs/1706.08500. Given two Gaussian distributions with
    means m and m_w and covariance matrices C and C_w, this function
    calculates

                  |m - m_w|^2 + Tr(C + C_w - 2(C * C_w)^(1/2))

    which captures how different the distributions of real images and
    generated images (or more accurately, their visual features) are. Note
    that unlike the Inception score, this is a true distance and utilizes
    information about real world images.

    Note that when computed using sample means and sample covariance
    matrices, Frechet distance is biased. It is more biased for small sample
    sizes. (e.g. even if the two distributions are the same, for a small
    sample size, the expected Frechet distance is large). It is important to
    use the same sample size to compute frechet classifier distance when
    comparing two generative models.

    Args:
      real_activations: 2D Tensor containing activations of real data. Shape
        is [batch_size, activation_size].
      generated_activations: 2D Tensor containing activations of generated
        data. Shape is [batch_size, activation_size].

    Returns:
      The Frechet Inception distance. A floating-point scalar of the same
      type as the output of the activations.
    """
    real_activations.shape.assert_has_rank(2)
    generated_activations.shape.assert_has_rank(2)

    # All arithmetic is done in float64; remember whether to cast back.
    activations_dtype = real_activations.dtype
    needs_cast = activations_dtype != dtypes.float64
    if needs_cast:
        real_activations = math_ops.cast(real_activations, dtypes.float64)
        generated_activations = math_ops.cast(
            generated_activations, dtypes.float64)

    # Sample means of both activation sets.
    real_mean = math_ops.reduce_mean(real_activations, 0)
    gen_mean = math_ops.reduce_mean(generated_activations, 0)
    real_count = math_ops.cast(
        array_ops.shape(real_activations)[0], dtypes.float64)
    gen_count = math_ops.cast(
        array_ops.shape(generated_activations)[0], dtypes.float64)

    # Sample covariances: sigma = (1 / (n - 1)) * (X - mu)^T (X - mu)
    real_centered = real_activations - real_mean
    real_cov = math_ops.matmul(
        real_centered, real_centered, transpose_a=True) / (real_count - 1)
    gen_centered = generated_activations - gen_mean
    gen_cov = math_ops.matmul(
        gen_centered, gen_centered, transpose_a=True) / (gen_count - 1)

    # Tr(sqrt(sigma sigma_w)) component of FID.
    sqrt_trace_component = trace_sqrt_product(real_cov, gen_cov)

    # Covariance component; note that trace(A + B) = trace(A) + trace(B).
    covariance_term = (
        math_ops.trace(real_cov + gen_cov) - 2.0 * sqrt_trace_component)

    # Squared distance between the means.
    # Equivalent to L2 but more stable.
    mean_term = math_ops.reduce_sum(
        math_ops.squared_difference(real_mean, gen_mean))

    fid = covariance_term + mean_term
    if needs_cast:
        fid = math_ops.cast(fid, activations_dtype)
    return fid