  def variance(self, name="variance"):
    """Variance of each batch member.

    Variance for inverse gamma is defined only for `alpha > 2`. If
    `self.strict_statistics` is `True`, an exception will be raised rather
    than returning `NaN`.

    Args:
      name: A name to give this op.

    Returns:
      The variance for every batch member, a `Tensor` with same `dtype` as self.
    """
    alpha = self._alpha
    beta = self._beta
    with ops.name_scope(self.name):
      with ops.op_scope([alpha, beta], name):
        var_if_defined = (math_ops.square(self._beta) /
                          (math_ops.square(self._alpha - 1.0) *
                           (self._alpha - 2.0)))
        if self.strict_statistics:
          two = ops.convert_to_tensor(2.0, dtype=self.dtype)
          return control_flow_ops.with_dependencies(
              [check_ops.assert_less(two, alpha)], var_if_defined)
        else:
          alpha_gt_2 = alpha > 2.0
          nan = np.nan * self._ones()
          return math_ops.select(alpha_gt_2, var_if_defined, nan)
def npairs_loss(labels, embeddings_anchor, embeddings_positive,
                reg_lambda=0.002, print_losses=False):
  """Computes the npairs loss.

  Npairs loss expects paired data where a pair is composed of samples from the
  same label and each pair in the minibatch has a different label. The loss
  has two components. The first component is the L2 regularizer on the
  embedding vectors. The second component is the sum of cross entropy losses
  which take each row of the pair-wise similarity matrix as logits and
  the remapped one-hot labels as labels.

  See: http://www.nec-labs.com/uploads/images/Department-Images/MediaAnalytics/papers/nips16_npairmetriclearning.pdf

  Args:
    labels: 1-D tf.int32 `Tensor` of shape [batch_size/2].
    embeddings_anchor: 2-D Tensor of shape [batch_size/2, embedding_dim] for the
      embedding vectors for the anchor images. Embeddings should not be
      l2 normalized.
    embeddings_positive: 2-D Tensor of shape [batch_size/2, embedding_dim] for the
      embedding vectors for the positive images. Embeddings should not be
      l2 normalized.
    reg_lambda: Float. L2 regularization term on the embedding vectors.
    print_losses: Boolean. Option to print the xent and l2loss.

  Returns:
    npairs_loss: tf.float32 scalar.
  """
  # pylint: enable=line-too-long
  # Add the regularizer on the embedding.
  reg_anchor = math_ops.reduce_mean(
      math_ops.reduce_sum(math_ops.square(embeddings_anchor), 1))
  reg_positive = math_ops.reduce_mean(
      math_ops.reduce_sum(math_ops.square(embeddings_positive), 1))
  l2loss = math_ops.multiply(
      0.25 * reg_lambda, reg_anchor + reg_positive, name='l2loss')

  # Get per pair similarities.
  similarity_matrix = math_ops.matmul(
      embeddings_anchor, embeddings_positive, transpose_a=False,
      transpose_b=True)

  # Reshape [batch_size] label tensor to a [batch_size, 1] label tensor.
  lshape = array_ops.shape(labels)
  assert lshape.shape == 1
  labels = array_ops.reshape(labels, [lshape[0], 1])

  labels_remapped = math_ops.to_float(
      math_ops.equal(labels, array_ops.transpose(labels)))
  labels_remapped /= math_ops.reduce_sum(labels_remapped, 1, keepdims=True)

  # Add the softmax loss.
  xent_loss = nn.softmax_cross_entropy_with_logits(
      logits=similarity_matrix, labels=labels_remapped)
  xent_loss = math_ops.reduce_mean(xent_loss, name='xentropy')

  if print_losses:
    xent_loss = logging_ops.Print(
        xent_loss, ['cross entropy:', xent_loss, 'l2loss:', l2loss])

  return l2loss + xent_loss
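# --- Illustrative NumPy sketch (an assumption, not part of the original
# source). It mirrors the npairs computation above: an L2 regularizer on the
# raw embeddings plus softmax cross entropy over the anchor/positive similarity
# matrix with remapped one-hot labels. The shapes and values are made up.
import numpy as np

def npairs_loss_np(labels, anchor, positive, reg_lambda=0.002):
  # L2 regularizer on the (unnormalized) embeddings.
  l2loss = 0.25 * reg_lambda * (
      np.mean(np.sum(anchor ** 2, axis=1)) +
      np.mean(np.sum(positive ** 2, axis=1)))
  # Pairwise similarities and remapped (row-normalized) labels.
  logits = anchor @ positive.T                      # [n, n]
  remapped = (labels[:, None] == labels[None, :]).astype(np.float64)
  remapped /= remapped.sum(axis=1, keepdims=True)
  # Row-wise softmax cross entropy with soft labels.
  log_softmax = logits - np.log(np.exp(logits).sum(axis=1, keepdims=True))
  xent = -np.mean(np.sum(remapped * log_softmax, axis=1))
  return l2loss + xent

rng = np.random.RandomState(0)
labels_np = np.arange(4)                            # one distinct label per pair
print(npairs_loss_np(labels_np, rng.randn(4, 8), rng.randn(4, 8)))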
    def body(it, cost):
      embedding = embedding_ops.embedding_lookup(embedding_matrix, [0])
      cost = control_flow_ops.cond(
          math_ops.equal(it, 3), lambda: math_ops.square(cost),
          (lambda: cost + math_ops.reduce_sum(embedding)))
      return it + 1, cost

    _, cost = control_flow_ops.while_loop(
        cond, body, [constant_op.constant(0),
                     constant_op.constant(0.0)])

    dynamic_grads = gradients_impl.gradients(cost, [embedding_matrix])[0]
    dynamic_grads = math_ops.segment_sum(dynamic_grads.values,
                                         dynamic_grads.indices)

    embedding = embedding_ops.embedding_lookup(embedding_matrix, [0])
    static = math_ops.square(
        math_ops.reduce_sum(embedding) + math_ops.reduce_sum(embedding) +
        math_ops.reduce_sum(embedding)) + math_ops.reduce_sum(embedding)
    static_grads = gradients_impl.gradients(static, [embedding_matrix])[0]
    static_grads = math_ops.segment_sum(static_grads.values,
                                        static_grads.indices)

    with self.cached_session():
      self.evaluate(variables.global_variables_initializer())
      self.assertAllEqual(*self.evaluate([static_grads, dynamic_grads]))
  def test_optimize(self):
    scalar = variables.Variable(random_ops.random_normal([]), 'scalar')
    vector = variables.Variable(random_ops.random_normal([2]), 'vector')
    matrix = variables.Variable(random_ops.random_normal([2, 3]), 'matrix')

    minimum_location = constant_op.constant(np.arange(9), dtype=dtypes.float32)

    loss = math_ops.reduce_sum(
        math_ops.square(vector - minimum_location[:2])) / 2.
    loss += math_ops.reduce_sum(
        math_ops.square(scalar - minimum_location[2])) / 2.
    loss += math_ops.reduce_sum(
        math_ops.square(
            matrix - array_ops.reshape(minimum_location[3:], [2, 3]))) / 2.

    optimizer = MockOptimizerInterface(loss)

    with self.test_session() as sess:
      sess.run(variables.global_variables_initializer())

      optimizer.minimize(sess)

      self.assertAllClose(np.arange(2), sess.run(vector))
      self.assertAllClose(np.arange(1) + 2, sess.run(scalar))
      self.assertAllClose(np.arange(6).reshape(2, 3) + 3, sess.run(matrix))
def contrastive_loss(labels, embeddings_anchor, embeddings_positive,
                     margin=1.0):
  """Computes the contrastive loss.

  This loss encourages the embeddings to be close to each other for samples
  of the same label and to be at least the margin constant apart for samples
  of different labels.
  See: http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf

  Args:
    labels: 1-D tf.int32 `Tensor` with shape [batch_size] of
      binary labels indicating positive vs negative pair.
    embeddings_anchor: 2-D float `Tensor` of embedding vectors for the anchor
      images. Embeddings should be l2 normalized.
    embeddings_positive: 2-D float `Tensor` of embedding vectors for the
      positive images. Embeddings should be l2 normalized.
    margin: margin term in the loss definition.

  Returns:
    contrastive_loss: tf.float32 scalar.
  """
  # Get per pair distances
  distances = math_ops.sqrt(
      math_ops.reduce_sum(
          math_ops.square(embeddings_anchor - embeddings_positive), 1))

  # Add contrastive loss for the siamese network.
  #   label here is {0,1} for neg, pos.
  return math_ops.reduce_mean(
      math_ops.to_float(labels) * math_ops.square(distances) +
      (1. - math_ops.to_float(labels)) *
      math_ops.square(math_ops.maximum(margin - distances, 0.)),
      name='contrastive_loss')
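# --- Minimal NumPy sketch (assumed example data, not from the original source)
# of the contrastive loss above: positive pairs (label 1) are penalized by
# their squared distance, negative pairs (label 0) by the squared hinge on
# (margin - distance).
import numpy as np

def contrastive_loss_np(labels, anchor, positive, margin=1.0):
  d = np.sqrt(np.sum((anchor - positive) ** 2, axis=1))
  pos_term = labels * d ** 2
  neg_term = (1.0 - labels) * np.maximum(margin - d, 0.0) ** 2
  return np.mean(pos_term + neg_term)

rng = np.random.RandomState(0)
pair_labels = np.array([1.0, 0.0, 1.0, 0.0])
print(contrastive_loss_np(pair_labels, rng.randn(4, 8), rng.randn(4, 8)))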
def per_example_quantile_regression_loss(labels, weights, predictions,
                                         quantile):
  """Smoothed loss for quantile regression.

  The standard quantile regression loss is quantile*(y-y') when y>y' and
  (quantile-1)*(y-y') otherwise, where y' is a prediction and y is a label. The
  implementation below is this loss, but squared in the region where the loss
  value is < 1.

  Args:
    labels: Rank 2 (N, D) tensor of per-example labels.
    weights: Rank 2 (N, 1) tensor of per-example weights.
    predictions: Rank 2 (N, D) tensor of per-example predictions.
    quantile: The quantile to use.

  Returns:
    loss: A Rank 2 (N, 1) tensor of per-example quantile loss.
    update_op: An update operation to update the loss's internal state.
  """
  labels = math_ops.to_float(labels)
  error = labels - predictions
  square_loss_right = array_ops.where(error * quantile < 1.0,
                                      math_ops.square(quantile * error),
                                      quantile * error)
  square_loss_left = array_ops.where(error * (quantile - 1) < 1,
                                     math_ops.square((quantile - 1) * error),
                                     (quantile - 1) * error)

  unweighted_loss = array_ops.where(error > 0, square_loss_right,
                                    square_loss_left)
  if weights is None:
    return unweighted_loss, control_flow_ops.no_op()
  else:
    return unweighted_loss * weights, control_flow_ops.no_op()
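# --- Hypothetical numeric check (not from the original source) of the smoothed
# quantile loss above: for small errors the loss is the square of
# quantile * error (or (quantile - 1) * error), for large errors it stays
# linear.
import numpy as np

def quantile_loss_np(labels, predictions, quantile):
  error = labels - predictions
  right = np.where(error * quantile < 1.0,
                   (quantile * error) ** 2, quantile * error)
  left = np.where(error * (quantile - 1.0) < 1.0,
                  ((quantile - 1.0) * error) ** 2, (quantile - 1.0) * error)
  return np.where(error > 0, right, left)

labels_np = np.array([[0.5], [10.0], [-10.0]])
predictions_np = np.array([[0.0], [0.0], [0.0]])
# Small positive error is squared (0.45**2); large errors stay linear.
print(quantile_loss_np(labels_np, predictions_np, quantile=0.9))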
  def variance(self, name="variance"):
    """Variance of each batch member.

    Variance for inverse gamma is defined only for `alpha > 2`. If
    `self.allow_nan_stats` is `False`, an exception will be raised rather
    than returning `NaN`.

    Args:
      name: A name to give this op.

    Returns:
      The variance for every batch member, a `Tensor` with same `dtype` as self.
    """
    alpha = self._alpha
    beta = self._beta
    with ops.name_scope(self.name):
      with ops.op_scope([alpha, beta], name):
        var_if_defined = (math_ops.square(self._beta) /
                          (math_ops.square(self._alpha - 1.0) *
                           (self._alpha - 2.0)))
        if self.allow_nan_stats:
          alpha_gt_2 = alpha > 2.0
          nan = np.nan * self._ones()
          return math_ops.select(alpha_gt_2, var_if_defined, nan)
        else:
          two = constant_op.constant(2.0, dtype=self.dtype)
          return control_flow_ops.with_dependencies(
              [check_ops.assert_less(
                  two, alpha,
                  message="variance not defined for components of alpha <= 2")],
              var_if_defined)
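# --- Quick Monte Carlo sanity check (illustrative, not part of the original
# source) of the closed form used above: Var = beta^2 / ((alpha - 1)^2 *
# (alpha - 2)) for alpha > 2, compared against sampled inverse gamma draws.
import numpy as np

alpha, beta = 5.0, 2.0
closed_form = beta ** 2 / ((alpha - 1.0) ** 2 * (alpha - 2.0))
samples = 1.0 / np.random.default_rng(0).gamma(shape=alpha, scale=1.0 / beta,
                                               size=2_000_000)
print(closed_form, samples.var())  # the two values should roughly agree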
def _adaptive_max_norm(norm, std_factor, decay, global_step, epsilon, name):
  """Find max_norm given norm and previous average."""
  with vs.variable_scope(name, "AdaptiveMaxNorm", [norm]):
    log_norm = math_ops.log(norm + epsilon)

    def moving_average(name, value, decay):
      moving_average_variable = vs.get_variable(
          name,
          shape=value.get_shape(),
          dtype=value.dtype,
          initializer=init_ops.zeros_initializer(),
          trainable=False)
      return moving_averages.assign_moving_average(
          moving_average_variable, value, decay, zero_debias=False)

    # quicker adaptation at the beginning
    if global_step is not None:
      n = math_ops.to_float(global_step)
      decay = math_ops.minimum(decay, n / (n + 1.))

    # update averages
    mean = moving_average("mean", log_norm, decay)
    sq_mean = moving_average("sq_mean", math_ops.square(log_norm), decay)

    variance = sq_mean - math_ops.square(mean)
    std = math_ops.sqrt(math_ops.maximum(epsilon, variance))
    max_norms = math_ops.exp(mean + std_factor * std)
    return max_norms, mean
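# --- Plain-Python sketch (illustrative assumption, not the original code) of
# the adaptive max-norm rule above: keep moving averages of log(norm) and
# log(norm)**2, and clip at exp(mean + std_factor * std). The example norms are
# made up; the outlier 5.0 barely moves the threshold.
import numpy as np

def adaptive_max_norm_np(norms, std_factor=2.0, decay=0.95, epsilon=1e-8):
  mean = sq_mean = 0.0
  for step, norm in enumerate(norms):
    d = min(decay, step / (step + 1.0))      # quicker adaptation early on
    log_norm = np.log(norm + epsilon)
    mean = d * mean + (1.0 - d) * log_norm
    sq_mean = d * sq_mean + (1.0 - d) * log_norm ** 2
  std = np.sqrt(max(epsilon, sq_mean - mean ** 2))
  return np.exp(mean + std_factor * std)

print(adaptive_max_norm_np([1.0, 1.2, 0.9, 5.0, 1.1]))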
  def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    shapes = [K.int_shape(p) for p in params]
    accumulators = [K.zeros(shape) for shape in shapes]
    delta_accumulators = [K.zeros(shape) for shape in shapes]
    self.weights = accumulators + delta_accumulators
    self.updates = [state_ops.assign_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
      lr = lr * (  # pylint: disable=g-no-augmented-assignment
          1. / (1. + self.decay * math_ops.cast(self.iterations,
                                                K.dtype(self.decay))))

    for p, g, a, d_a in zip(params, grads, accumulators, delta_accumulators):
      # update accumulator
      new_a = self.rho * a + (1. - self.rho) * math_ops.square(g)
      self.updates.append(state_ops.assign(a, new_a))

      # use the new accumulator and the *old* delta_accumulator
      update = g * K.sqrt(d_a + self.epsilon) / K.sqrt(new_a + self.epsilon)
      new_p = p - lr * update

      # Apply constraints.
      if getattr(p, 'constraint', None) is not None:
        new_p = p.constraint(new_p)

      self.updates.append(state_ops.assign(p, new_p))

      # update delta_accumulator
      new_d_a = self.rho * d_a + (1 - self.rho) * math_ops.square(update)
      self.updates.append(state_ops.assign(d_a, new_d_a))
    return self.updates
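# --- Plain-NumPy sketch (illustrative assumption, not the original Keras code)
# of the Adadelta step performed in get_updates above for one scalar parameter:
# accumulate squared gradients, scale the step by the ratio of the old delta
# accumulator to the new gradient accumulator, then accumulate the squared
# update.
import numpy as np

def adadelta_step(p, g, a, d_a, lr=1.0, rho=0.95, eps=1e-7):
  new_a = rho * a + (1.0 - rho) * g ** 2              # gradient accumulator
  update = g * np.sqrt(d_a + eps) / np.sqrt(new_a + eps)
  new_p = p - lr * update                             # parameter step
  new_d_a = rho * d_a + (1.0 - rho) * update ** 2     # delta accumulator
  return new_p, new_a, new_d_a

p, a, d_a = 1.0, 0.0, 0.0
for _ in range(3):
  g = 2.0 * p                                         # gradient of p**2
  p, a, d_a = adadelta_step(p, g, a, d_a)
print(p)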
def _Atan2Grad(op, grad):
  """Returns grad * x / (x^2 + y^2), grad * -y / (x^2 + y^2)."""
  y = op.inputs[0]
  x = op.inputs[1]
  with ops.control_dependencies([grad]):
    grad_inv = grad / (math_ops.square(x) + math_ops.square(y))
    return x * grad_inv, -y * grad_inv
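# --- Finite-difference check (illustrative, not from the original source) of
# the atan2 gradients used above: d/dy atan2(y, x) = x / (x^2 + y^2) and
# d/dx atan2(y, x) = -y / (x^2 + y^2). The point (y, x) is chosen arbitrarily,
# away from the branch cut.
import numpy as np

y, x, h = 0.7, -1.3, 1e-6
dy_analytic = x / (x ** 2 + y ** 2)
dx_analytic = -y / (x ** 2 + y ** 2)
dy_numeric = (np.arctan2(y + h, x) - np.arctan2(y - h, x)) / (2 * h)
dx_numeric = (np.arctan2(y, x + h) - np.arctan2(y, x - h)) / (2 * h)
print(dy_analytic, dy_numeric)   # should agree closely
print(dx_analytic, dx_numeric)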
def _cross_squared_distance_matrix(x, y):
  """Pairwise squared distance between two (batch) matrices' rows (2nd dim).

  Computes the pairwise distances between rows of x and rows of y
  Args:
    x: [batch_size, n, d] float `Tensor`
    y: [batch_size, m, d] float `Tensor`

  Returns:
    squared_dists: [batch_size, n, m] float `Tensor`, where
    squared_dists[b,i,j] = ||x[b,i,:] - y[b,j,:]||^2
  """
  x_norm_squared = math_ops.reduce_sum(math_ops.square(x), 2)
  y_norm_squared = math_ops.reduce_sum(math_ops.square(y), 2)

  # Expand so that we can broadcast.
  x_norm_squared_tile = array_ops.expand_dims(x_norm_squared, 2)
  y_norm_squared_tile = array_ops.expand_dims(y_norm_squared, 1)

  x_y_transpose = math_ops.matmul(x, y, adjoint_b=True)

  # squared_dists[b,i,j] = ||x_bi - y_bj||^2 = x_bi'x_bi - 2x_bi'y_bj + y_bj'y_bj
  squared_dists = x_norm_squared_tile - 2 * x_y_transpose + y_norm_squared_tile

  return squared_dists
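# --- NumPy cross-check (illustrative, not part of the original source) of the
# expansion used in _cross_squared_distance_matrix above:
# ||x_bi - y_bj||^2 = ||x_bi||^2 - 2 x_bi . y_bj + ||y_bj||^2.
import numpy as np

rng = np.random.RandomState(0)
x = rng.randn(2, 4, 3)   # [batch_size, n, d]
y = rng.randn(2, 5, 3)   # [batch_size, m, d]

expanded = (np.sum(x ** 2, -1)[:, :, None]
            - 2.0 * np.einsum('bnd,bmd->bnm', x, y)
            + np.sum(y ** 2, -1)[:, None, :])
direct = np.sum((x[:, :, None, :] - y[:, None, :, :]) ** 2, -1)
print(np.allclose(expanded, direct))   # True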
  def _compute_euclidean_distance(cls, inputs, clusters):
    """Computes Euclidean distance between each input and each cluster center.

    Args:
      inputs: list of input Tensors.
      clusters: cluster Tensor.

    Returns:
      list of Tensors, where each element corresponds to each element in inputs.
      The value is the distance of each row to all the cluster centers.
    """
    output = []
    for inp in inputs:
      with ops.colocate_with(inp, ignore_existing=True):
        # Computes Euclidean distance. Note the first and third terms are
        # broadcast additions.
        squared_distance = (
            math_ops.reduce_sum(math_ops.square(inp), 1, keep_dims=True) -
            2 * math_ops.matmul(inp, clusters, transpose_b=True) +
            array_ops.transpose(
                math_ops.reduce_sum(
                    math_ops.square(clusters), 1, keep_dims=True)))
        output.append(squared_distance)

    return output
def _r2(probabilities, targets):
    if targets.get_shape().ndims == 1:
        targets = array_ops.expand_dims(targets, -1)
    y_mean = math_ops.reduce_mean(targets, 0)
    squares_total = math_ops.reduce_sum(math_ops.square(targets - y_mean), 0)
    squares_residuals = math_ops.reduce_sum(math_ops.square(targets - probabilities), 0)
    score = 1 - math_ops.reduce_sum(squares_residuals / squares_total)
    return metric_ops.streaming_mean(score)
def _r2(probabilities, targets, weights=None):
  targets = math_ops.to_float(targets)
  y_mean = math_ops.reduce_mean(targets, 0)
  squares_total = math_ops.reduce_sum(math_ops.square(targets - y_mean), 0)
  squares_residuals = math_ops.reduce_sum(
      math_ops.square(targets - probabilities), 0)
  score = 1 - math_ops.reduce_sum(squares_residuals / squares_total)
  return metric_ops.streaming_mean(score, weights=weights)
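# --- Small NumPy illustration (assumed example data, not from the original
# source) of the R^2 score computed above: 1 - sum(residual squares) /
# sum(total squares), accumulated over output dimensions.
import numpy as np

targets = np.array([[1.0], [2.0], [3.0], [4.0]])
predictions = np.array([[1.1], [1.9], [3.2], [3.8]])
y_mean = targets.mean(axis=0)
squares_total = np.sum((targets - y_mean) ** 2, axis=0)
squares_residuals = np.sum((targets - predictions) ** 2, axis=0)
print(1.0 - np.sum(squares_residuals / squares_total))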
def normal_conjugates_known_sigma_posterior(prior, sigma, s, n):
  """Posterior Normal distribution with conjugate prior on the mean.

  This model assumes that `n` observations (with sum `s`) come from a
  Normal with unknown mean `mu` (described by the Normal `prior`)
  and known variance `sigma^2`.  The "known sigma posterior" is
  the distribution of the unknown `mu`.

  Accepts a prior Normal distribution object, having parameters
  `mu0` and `sigma0`, as well as known `sigma` values of the predictive
  distribution(s) (also assumed Normal),
  and statistical estimates `s` (the sum(s) of the observations) and
  `n` (the number(s) of observations).

  Returns a posterior (also Normal) distribution object, with parameters
  `(mu', sigma'^2)`, where:

  ```
  mu ~ N(mu', sigma'^2)
  sigma'^2 = 1/(1/sigma0^2 + n/sigma^2),
  mu' = (mu0/sigma0^2 + s/sigma^2) * sigma'^2.
  ```

  Distribution parameters from `prior`, as well as `sigma`, `s`, and `n`,
  will broadcast in the case of multidimensional sets of parameters.

  Args:
    prior: `Normal` object of type `dtype`:
      the prior distribution having parameters `(mu0, sigma0)`.
    sigma: tensor of type `dtype`, taking values `sigma > 0`.
      The known stddev parameter(s).
    s: Tensor of type `dtype`.  The sum(s) of observations.
    n: Tensor of type `int`.  The number(s) of observations.

  Returns:
    A new Normal posterior distribution object for the unknown observation
    mean `mu`.

  Raises:
    TypeError: if dtype of `s` does not match `dtype`, or `prior` is not a
      Normal object.
  """
  if not isinstance(prior, Normal):
    raise TypeError("Expected prior to be an instance of type Normal")

  if s.dtype != prior.dtype:
    raise TypeError(
        "Observation sum s.dtype does not match prior dtype: %s vs. %s"
        % (s.dtype, prior.dtype))

  n = math_ops.cast(n, prior.dtype)
  sigma0_2 = math_ops.square(prior.sigma)
  sigma_2 = math_ops.square(sigma)
  sigmap_2 = 1.0/(1/sigma0_2 + n/sigma_2)
  return Normal(
      mu=(prior.mu/sigma0_2 + s/sigma_2) * sigmap_2,
      sigma=math_ops.sqrt(sigmap_2))
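# --- Grid-based sanity check (illustrative, not from the original source) of
# the conjugate update above: the analytic posterior N(mu', sigma'^2) should
# match the normalized product of the Normal prior and the Normal likelihood.
# All numbers below are made-up example values.
import numpy as np

mu0, sigma0, sigma = 0.0, 2.0, 1.0
obs = np.array([0.8, 1.2, 0.5, 1.0])
n, s = len(obs), obs.sum()

sigmap_2 = 1.0 / (1.0 / sigma0 ** 2 + n / sigma ** 2)
mup = (mu0 / sigma0 ** 2 + s / sigma ** 2) * sigmap_2

mu_grid = np.linspace(-3.0, 3.0, 20001)
dmu = mu_grid[1] - mu_grid[0]
log_post = (-0.5 * (mu_grid - mu0) ** 2 / sigma0 ** 2
            - 0.5 * np.sum((obs[:, None] - mu_grid) ** 2, axis=0) / sigma ** 2)
post = np.exp(log_post - log_post.max())
post /= post.sum() * dmu
grid_mean = (mu_grid * post).sum() * dmu
grid_var = ((mu_grid - grid_mean) ** 2 * post).sum() * dmu
print(mup, grid_mean)        # should roughly agree
print(sigmap_2, grid_var)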
  def _get_coordinatewise_learning_rate(self, grad, var):
    # Compute the learning rate using a moving average for the diagonal of BB^T
    avg_first = self.get_slot(var, 'first_moment')
    avg_second = self.get_slot(var, 'second_moment')
    decay_tensor = math_ops.cast(self._decay_tensor, var.dtype)
    batch_size = math_ops.cast(self._batch_size_tensor, var.dtype)

    # Create an estimator for the moving average of gradient mean and variance
    # via Welford's algorithm
    if isinstance(grad, ops.Tensor):
      delta = grad - avg_first
      first_moment_update = avg_first.assign_add(
          array_ops.where(self._counter < 1, math_ops.cast(1, var.dtype),
                          1. - decay_tensor) * delta)

      with ops.control_dependencies([first_moment_update]):
        second_moment_update = avg_second.assign_add(
            math_ops.cast(self._counter < 1, var.dtype) *
            -(1. - decay_tensor) * (
                avg_second - decay_tensor  * math_ops.square(delta)))
      diag_preconditioner = control_flow_ops.with_dependencies(
          [second_moment_update],
          clip_ops.clip_by_value(avg_second, 1e-12, 1e12))
    elif isinstance(grad, ops.IndexedSlices):
      delta = grad.values - array_ops.gather_nd(avg_first, grad.indices)
      first_moment_update = state_ops.scatter_add(
          avg_first,
          grad.indices,
          array_ops.where(self._counter < 1,
                          math_ops.cast(1., var.dtype),
                          1. - decay_tensor) * delta)

      with ops.control_dependencies([first_moment_update]):
        avg_second = state_ops.scatter_add(
            avg_second,
            grad.indices,
            math_ops.cast(self._counter < 1, var.dtype) *
            -(1. - decay_tensor) * (
                array_ops.gather_nd(avg_second, grad.indices) - decay_tensor *
                math_ops.square(delta)))
        avg_second = array_ops.gather_nd(avg_second, grad.indices)
        # TODO(b/70783772)
        diag_preconditioner = clip_ops.clip_by_value(avg_second, 1e-12, 1e12)
    else:
      raise errors.InvalidArgumentError(
          None, None, 'grad must be of type Tensor or IndexedSlices')

    diag_preconditioner *= batch_size

    if self._use_single_learning_rate:
      diag_preconditioner = math_ops.reduce_mean(diag_preconditioner)

    # From Theorem 2 Corollary 1 of Mandt et al. 2017
    return 2. * batch_size / (
        math_ops.cast(self._total_num_examples, var.dtype.base_dtype) *
        diag_preconditioner)
  def testNoIntegerGradient6(self):
    k = constant_op.constant(3)
    x = math_ops.to_float(k)
    grad_1, = gradients_impl.gradients(k * k, k)
    grad_2, = gradients_impl.gradients(x * x, k)
    grad_3, = gradients_impl.gradients(math_ops.square(k), k)
    grad_4, = gradients_impl.gradients(math_ops.square(x), k)
    self.assertIsNone(grad_1)
    self.assertIsNone(grad_2)
    self.assertIsNone(grad_3)
    self.assertIsNone(grad_4)
def calculate_loss(input_mat, row_factors, col_factors, regularization=None,
                   w0=1., row_weights=None, col_weights=None):
  """Calculates the loss of a given factorization.

  Uses a non-distributed method, different from the one implemented in the
  WALS model. The weight of an observed entry (i, j) (i.e. such that
  input_mat[i, j] is non-zero) is (w0 + row_weights[i] * col_weights[j]).

  Args:
    input_mat: The input matrix, a SparseTensor of rank 2.
    row_factors: The row factors, a dense Tensor of rank 2.
    col_factors: The col factors, a dense Tensor of rank 2.
    regularization: the regularization coefficient, a scalar.
    w0: the weight of unobserved entries. A scalar.
    row_weights: A dense tensor of rank 1.
    col_weights: A dense tensor of rank 1.

  Returns:
    The total loss.
  """
  wr = (array_ops.expand_dims(row_weights, 1) if row_weights is not None
        else constant_op.constant(1.))
  wc = (array_ops.expand_dims(col_weights, 0) if col_weights is not None
        else constant_op.constant(1.))
  reg = (regularization if regularization is not None
         else constant_op.constant(0.))

  row_indices, col_indices = array_ops.split(input_mat.indices,
                                             axis=1,
                                             num_or_size_splits=2)
  gathered_row_factors = array_ops.gather(row_factors, row_indices)
  gathered_col_factors = array_ops.gather(col_factors, col_indices)
  sp_approx_vals = array_ops.squeeze(math_ops.matmul(
      gathered_row_factors, gathered_col_factors, adjoint_b=True))
  sp_approx = sparse_tensor.SparseTensor(
      indices=input_mat.indices,
      values=sp_approx_vals,
      dense_shape=input_mat.dense_shape)

  sp_approx_sq = math_ops.square(sp_approx)
  row_norm = math_ops.reduce_sum(math_ops.square(row_factors))
  col_norm = math_ops.reduce_sum(math_ops.square(col_factors))
  row_col_norm = math_ops.reduce_sum(math_ops.square(math_ops.matmul(
      row_factors, col_factors, transpose_b=True)))

  resid = sparse_ops.sparse_add(input_mat, sp_approx * (-1))
  resid_sq = math_ops.square(resid)
  loss = w0 * (
      sparse_ops.sparse_reduce_sum(resid_sq) -
      sparse_ops.sparse_reduce_sum(sp_approx_sq)
      )
  loss += (sparse_ops.sparse_reduce_sum(wr * (resid_sq * wc)) +
           w0 * row_col_norm + reg * (row_norm + col_norm))
  return loss.eval()
def known_sigma_predictive(prior, sigma, s, n):
  """Return the posterior predictive distribution with known sigma.

  Accepts a prior Gaussian distribution, having parameters `mu0` and `sigma0`,
  a known `sigma` of the predictive distribution (also assumed Gaussian),
  and statistical estimates `s` (the sum of the observations) and
  `n` (the number of observations).

  Calculates the Gaussian distribution p(x | sigma):
  ```
    p(x | sigma) = int N(x | mu, sigma^2) N(mu | prior.mu, prior.sigma^2) dmu
                 = N(x | prior.mu, sigma^2 + prior.sigma^2)
  ```

  Returns the predictive posterior distribution object, with parameters
  `(mu', sigma'^2)`, where:
  ```
  sigma_n^2 = 1/(1/sigma0^2 + n/sigma^2),
  mu' = (mu0/sigma0^2 + s/sigma^2) * sigma_n^2.
  sigma'^2 = sigma_n^2 + sigma^2,
  ```

  Args:
    prior: `Normal` object of type `dtype`, the prior distribution having
      parameters `(mu0, sigma0)`.
    sigma: Scalar of type `dtype`, `sigma > 0`.  The known stddev parameter.
    s: Scalar, of type `dtype`, the sum of observations.
    n: Scalar int, the number of observations.

  Returns:
    A new Gaussian posterior distribution.

  Raises:
    TypeError: if dtype of `s` does not match `dtype`, or `prior` is not a
      Gaussian object.
  """
  if not isinstance(prior, Gaussian):
    raise TypeError("Expected prior to be an instance of type Gaussian")

  if s.dtype != prior.dtype:
    raise TypeError(
        "Observation sum s.dtype does not match prior dtype: %s vs. %s"
        % (s.dtype, prior.dtype))

  n = math_ops.cast(n, prior.dtype)
  sigma0_2 = math_ops.square(prior.sigma)
  sigma_2 = math_ops.square(sigma)
  sigmap_2 = 1.0/(1/sigma0_2 + n/sigma_2)
  return Gaussian(
      mu=(prior.mu/sigma0_2 + s/sigma_2) * sigmap_2,
      sigma=math_ops.sqrt(sigmap_2 + sigma_2))
  def testSample(self):
    with self.test_session():
      scale = make_pd(1., 2)
      df = 4

      chol_w = distributions.WishartCholesky(
          df, chol(scale), cholesky_input_output_matrices=False)

      x = chol_w.sample(1, seed=42).eval()
      chol_x = [chol(x[0])]

      full_w = distributions.WishartFull(
          df, scale, cholesky_input_output_matrices=False)
      self.assertAllClose(x, full_w.sample(1, seed=42).eval())

      chol_w_chol = distributions.WishartCholesky(
          df, chol(scale), cholesky_input_output_matrices=True)
      self.assertAllClose(chol_x, chol_w_chol.sample(1, seed=42).eval())
      eigen_values = array_ops.matrix_diag_part(
          chol_w_chol.sample(
              1000, seed=42))
      np.testing.assert_array_less(0., eigen_values.eval())

      full_w_chol = distributions.WishartFull(
          df, scale, cholesky_input_output_matrices=True)
      self.assertAllClose(chol_x, full_w_chol.sample(1, seed=42).eval())
      eigen_values = array_ops.matrix_diag_part(
          full_w_chol.sample(
              1000, seed=42))
      np.testing.assert_array_less(0., eigen_values.eval())

      # Check first and second moments.
      df = 4.
      chol_w = distributions.WishartCholesky(
          df=df,
          scale=chol(make_pd(1., 3)),
          cholesky_input_output_matrices=False)
      x = chol_w.sample(10000, seed=42)
      self.assertAllEqual((10000, 3, 3), x.get_shape())

      moment1_estimate = math_ops.reduce_mean(x, reduction_indices=[0]).eval()
      self.assertAllClose(chol_w.mean().eval(), moment1_estimate, rtol=0.05)

      # The Variance estimate uses the squares rather than outer-products
      # because Wishart.Variance is the diagonal of the Wishart covariance
      # matrix.
      variance_estimate = (math_ops.reduce_mean(
          math_ops.square(x), reduction_indices=[0]) -
                           math_ops.square(moment1_estimate)).eval()
      self.assertAllClose(
          chol_w.variance().eval(), variance_estimate, rtol=0.05)
  def _l2_loss(self):
    """Computes the l2 loss of the model."""
    with name_scope('l2_loss'):
      sparse_weights = self._convert_n_to_tensor(self._variables[
          'sparse_features_weights'])
      dense_weights = self._convert_n_to_tensor(self._variables[
          'dense_features_weights'])
      l2 = self._options['symmetric_l2_regularization']
      loss = 0
      for w in sparse_weights:
        loss += l2 * math_ops.reduce_sum(math_ops.square(w))
      for w in dense_weights:
        loss += l2 * math_ops.reduce_sum(math_ops.square(w))
      return loss
  def testDependentYs(self):
    with self.test_session():
      x = constant_op.constant(3.0)
      y = math_ops.square(x)
      y1 = math_ops.square(y)
      y2 = math_ops.square(y1)
      g = gradients.gradients([y, y2], x)
      self.assertAllClose(17502.0, g[0].eval())
      g = gradients.gradients(y + y2, x)
      self.assertAllClose(17502.0, g[0].eval())
      z = array_ops.identity(y)
      z2 = array_ops.identity(y2)
      g = gradients.gradients([z, z2], x)
      self.assertAllClose(17502.0, g[0].eval())
  def _variance(self):
    var = (math_ops.square(self.beta) /
           (math_ops.square(self.alpha - 1.) * (self.alpha - 2.)))
    if self.allow_nan_stats:
      nan = np.array(np.nan, dtype=self.dtype.as_numpy_dtype())
      return array_ops.where(
          self.alpha > 2., var,
          array_ops.fill(self.batch_shape(), nan, name="nan"))
    else:
      return control_flow_ops.with_dependencies([
          check_ops.assert_less(
              constant_op.constant(2., dtype=self.dtype), self.alpha,
              message="variance not defined for components of alpha <= 2"),
      ], var)
def _ErfGrad(op, grad):
  """Returns grad * 2/sqrt(pi) * exp(-x**2)."""
  x = op.inputs[0]
  two_over_root_pi = constant_op.constant(2 / np.sqrt(np.pi), dtype=grad.dtype)
  with ops.control_dependencies([grad]):
    x = math_ops.conj(x)
    return grad * two_over_root_pi * math_ops.exp(-math_ops.square(x))
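# --- Finite-difference check (illustrative, not from the original source) of
# the erf gradient used above: d/dx erf(x) = 2/sqrt(pi) * exp(-x**2).
import math

x, h = 0.4, 1e-6
analytic = 2.0 / math.sqrt(math.pi) * math.exp(-x ** 2)
numeric = (math.erf(x + h) - math.erf(x - h)) / (2.0 * h)
print(analytic, numeric)   # should agree to several decimal places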
def l2_normalize(x, dim, epsilon=1e-12, name=None):
  """Normalizes along dimension `dim` using an L2 norm.

  For a 1-D tensor with `dim = 0`, computes

      output = x / sqrt(max(sum(x**2), epsilon))

  For `x` with more dimensions, independently normalizes each 1-D slice along
  dimension `dim`.

  Args:
    x: A `Tensor`.
    dim: Dimension along which to normalize.
    epsilon: A lower bound value for the norm. Will use `sqrt(epsilon)` as the
      divisor if `norm < sqrt(epsilon)`.
    name: A name for this operation (optional).

  Returns:
    A `Tensor` with the same shape as `x`.
  """
  with ops.op_scope([x], name, "l2_normalize") as name:
    x = ops.convert_to_tensor(x, name="x")
    square_sum = math_ops.reduce_sum(math_ops.square(x), [dim], keep_dims=True)
    x_inv_norm = math_ops.rsqrt(math_ops.maximum(square_sum, epsilon))
    return math_ops.mul(x, x_inv_norm, name=name)
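# --- Illustrative NumPy equivalent (an assumption, not the original source) of
# the l2_normalize computation above: divide by sqrt(max(sum(x**2), epsilon))
# so that all-zero rows stay finite instead of producing NaN.
import numpy as np

def l2_normalize_np(x, axis, epsilon=1e-12):
  square_sum = np.sum(np.square(x), axis=axis, keepdims=True)
  return x * (1.0 / np.sqrt(np.maximum(square_sum, epsilon)))

x = np.array([[3.0, 4.0], [0.0, 0.0]])
print(l2_normalize_np(x, axis=1))   # first row becomes [0.6, 0.8]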
 def squared_frobenius_norm(x):
   """Helper to make KL calculation slightly more readable."""
   # http://mathworld.wolfram.com/FrobeniusNorm.html
   # The gradient of KL[p,q] is not defined when p==q. The culprit is
   # linalg_ops.norm, i.e., we cannot use the commented out code.
   # return math_ops.square(linalg_ops.norm(x, ord="fro", axis=[-2, -1]))
   return math_ops.reduce_sum(math_ops.square(x), axis=[-2, -1])
def normalize_moments(counts, mean_ss, variance_ss, shift, name=None):
  """Calculate the mean and variance of based on the sufficient statistics.

  Args:
    counts: A `Tensor` containing the total count of the data (one value).
    mean_ss: A `Tensor` containing the mean sufficient statistics: the (possibly
      shifted) sum of the elements to average over.
    variance_ss: A `Tensor` containing the variance sufficient statistics: the
      (possibly shifted) squared sum of the data to compute the variance over.
    shift: A `Tensor` containing the value by which the data is shifted for
      numerical stability, or `None` if no shift was performed.
    name: Name used to scope the operations that compute the moments.

  Returns:
    Two `Tensor` objects: `mean` and `variance`.
  """
  with ops.op_scope([counts, mean_ss, variance_ss, shift], name, "normalize"):
    divisor = math_ops.inv(counts, name="divisor")
    if shift is not None:
      shifted_mean = math_ops.mul(mean_ss, divisor, name="shifted_mean")
      mean = math_ops.add(shifted_mean, shift, name="mean")
    else:  # no shift.
      shifted_mean = math_ops.mul(mean_ss, divisor, name="mean")
      mean = shifted_mean
    variance = math_ops.sub(
        math_ops.mul(variance_ss, divisor),
        math_ops.square(shifted_mean),
        name="variance")
  return (mean, variance)
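# --- NumPy sanity check (illustrative, not from the original source) of the
# shifted sufficient-statistics formulas above:
# mean = shift + mean_ss / counts,
# variance = variance_ss / counts - (mean_ss / counts)**2.
import numpy as np

data = np.array([4.1, 3.9, 4.3, 4.0])
shift = 4.0
counts = float(data.size)
mean_ss = np.sum(data - shift)
variance_ss = np.sum((data - shift) ** 2)

mean = shift + mean_ss / counts
variance = variance_ss / counts - (mean_ss / counts) ** 2
print(mean, data.mean())          # should match
print(variance, data.var())       # should match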
  def variance(self, name="variance"):
    """Variance.

    Variance is defined as,

    ```none
    Var = E[(X - E[X])**2]
    ```

    where `X` is the random variable associated with this distribution, `E`
    denotes expectation, and `Var.shape = batch_shape + event_shape`.

    Args:
      name: Python `str` prepended to names of ops created by this function.

    Returns:
      variance: Floating-point `Tensor` with shape identical to
        `batch_shape + event_shape`, i.e., the same shape as `self.mean()`.
    """
    with self._name_scope(name):
      try:
        return self._variance()
      except NotImplementedError as original_exception:
        try:
          return math_ops.square(self._stddev())
        except NotImplementedError:
          raise original_exception
def pearson(logu, name=None):
  """The Pearson Csiszar-function in log-space.

  A Csiszar-function is a member of,

  ```none
  F = { f:R_+ to R : f convex }.
  ```

  The Pearson Csiszar-function is:

  ```none
  f(u) = (u - 1)**2
  ```

  Warning: this function makes non-log-space calculations and may therefore be
  numerically unstable for `|logu| >> 0`.

  Args:
    logu: Floating-type `Tensor` representing `log(u)` from above.
    name: Python `str` name prefixed to Ops created by this function.

  Returns:
    pearson_of_u: Floating-type `Tensor` of the Csiszar-function evaluated at
      `u = exp(logu)`.
  """

  with ops.name_scope(name, "pearson", [logu]):
    logu = ops.convert_to_tensor(logu, name="logu")
    return math_ops.square(math_ops.expm1(logu))
  def testIdentifyGradientWorksOnMultipleLosses(self):
    grad_debugger_1 = debug_gradients.GradientsDebugger()
    grad_debugger_2 = debug_gradients.GradientsDebugger()

    y = math_ops.add(self.w, -1.0, name="y")
    debug_y = grad_debugger_1.identify_gradient(y)
    z1 = math_ops.square(debug_y, name="z1")

    debug_y = grad_debugger_2.identify_gradient(y)
    z2 = math_ops.sqrt(debug_y, name="z2")

    with grad_debugger_1:
      gradient_descent.GradientDescentOptimizer(0.1).minimize(z1)
    with grad_debugger_2:
      gradient_descent.GradientDescentOptimizer(0.1).minimize(z2)

    dz1_dy = grad_debugger_1.gradient_tensor(y)
    dz2_dy = grad_debugger_2.gradient_tensor(y)
    self.assertIsInstance(dz1_dy, ops.Tensor)
    self.assertIsInstance(dz2_dy, ops.Tensor)
    self.assertIsNot(dz1_dy, dz2_dy)

    self.sess.run(variables.global_variables_initializer())
    self.assertAllClose(5.0 ** 2, self.sess.run(z1))
    self.assertAllClose(5.0 ** 0.5, self.sess.run(z2))
    self.assertAllClose(2.0 * 5.0, self.sess.run(dz1_dy))
    self.assertAllClose(0.5 * (5.0 ** -0.5), self.sess.run(dz2_dy))
  def _log_unnormalized_prob(self, x):
    return -math_ops.log1p(math_ops.square(self._z(x)))
def frechet_classifier_distance(real_images,
                                generated_images,
                                classifier_fn,
                                num_batches=1):
    """Classifier distance for evaluating a generative model.

  This is based on the Frechet Inception distance, but for an arbitrary
  classifier.

  This technique is described in detail in https://arxiv.org/abs/1706.08500.
  Given two Gaussian distributions with means m and m_w and covariance matrices
  C and C_w, this function calculates

  |m - m_w|^2 + Tr(C + C_w - 2(C * C_w)^(1/2))

  which captures how different the distributions of real images and generated
  images (or more accurately, their visual features) are. Note that unlike the
  Inception score, this is a true distance and utilizes information about real
  world images.

  Note that when computed using sample means and sample covariance matrices,
  the Frechet distance is biased. The bias is larger for small sample sizes
  (e.g. even if the two distributions are the same, for a small sample size the
  expected Frechet distance is large). It is therefore important to use the
  same sample size when comparing two generative models with the Frechet
  classifier distance.

  Args:
    real_images: Real images to use to compute Frechet Inception distance.
    generated_images: Generated images to use to compute Frechet Inception
      distance.
    classifier_fn: A function that takes images and produces activations
      based on a classifier.
    num_batches: Number of batches to split the images into in order to run
      them efficiently through the classifier network.

  Returns:
    The Frechet Inception distance. A floating-point scalar of the same type
    as the output of `classifier_fn`
  """

    real_images_list = array_ops.split(real_images,
                                       num_or_size_splits=num_batches)
    generated_images_list = array_ops.split(generated_images,
                                            num_or_size_splits=num_batches)

    imgs = array_ops.stack(real_images_list + generated_images_list)

    # Compute the activations using the memory-efficient `map_fn`.
    activations = functional_ops.map_fn(fn=classifier_fn,
                                        elems=imgs,
                                        parallel_iterations=1,
                                        back_prop=False,
                                        swap_memory=True,
                                        name='RunClassifier')

    activations_dtype = activations.dtype
    # Split the activations by the real and generated images.
    real_a, gen_a = array_ops.split(activations, [num_batches, num_batches], 0)

    # Ensure the activations have the right shapes.
    real_a = array_ops.concat(array_ops.unstack(real_a), 0)
    gen_a = array_ops.concat(array_ops.unstack(gen_a), 0)
    if activations_dtype != dtypes.float64:
        real_a = math_ops.to_double(real_a)
        gen_a = math_ops.to_double(gen_a)

    real_a.shape.assert_has_rank(2)
    gen_a.shape.assert_has_rank(2)

    # Compute mean and covariance matrices of activations.
    m = math_ops.reduce_mean(real_a, 0)
    m_v = math_ops.reduce_mean(gen_a, 0)
    num_examples = math_ops.to_double(array_ops.shape(real_a)[0])

    # sigma = (1 / (n - 1)) * (X - mu) (X - mu)^T
    sigma = math_ops.matmul(real_a - m, real_a - m,
                            transpose_a=True) / (num_examples - 1)

    sigma_v = math_ops.matmul(gen_a - m_v, gen_a - m_v,
                              transpose_a=True) / (num_examples - 1)

    # Find the Tr(sqrt(sigma sigma_v)) component of FID
    sqrt_trace_component = trace_sqrt_product(sigma, sigma_v)

    # Compute the two components of FID.

    # First the covariance component.
    # Here, note that trace(A + B) = trace(A) + trace(B)
    trace = math_ops.trace(sigma + sigma_v) - 2.0 * sqrt_trace_component

    # Next the distance between means.
    mean = math_ops.square(linalg_ops.norm(m - m_v))  # This uses the L2 norm.
    fid = trace + mean
    if activations_dtype != dtypes.float64:
        fid = math_ops.cast(fid, activations_dtype)

    return fid
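# --- Plain-NumPy sketch (illustrative, not the original implementation) of the
# Frechet distance formula used above,
# |m - m_w|^2 + Tr(C + C_w - 2 (C C_w)^(1/2)), computed from two made-up sets
# of activations; Tr((C C_w)^(1/2)) is taken as the sum of the square roots of
# the eigenvalues of C @ C_w.
import numpy as np

def frechet_distance_np(real_a, gen_a):
  m, m_v = real_a.mean(0), gen_a.mean(0)
  c = np.cov(real_a, rowvar=False)
  c_v = np.cov(gen_a, rowvar=False)
  eigvals = np.linalg.eigvals(c @ c_v)
  sqrt_trace = np.sqrt(np.abs(eigvals)).sum()
  return np.sum((m - m_v) ** 2) + np.trace(c + c_v) - 2.0 * sqrt_trace

rng = np.random.RandomState(0)
real_a = rng.randn(512, 16)
gen_a = rng.randn(512, 16) + 0.5       # shifted "generated" activations
print(frechet_distance_np(real_a, gen_a))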
    def _renorm_correction_and_moments(self, mean, variance, training):
        """Returns the correction and update values for renorm."""
        stddev = math_ops.sqrt(variance + self.epsilon)
        # Compute the average mean and standard deviation, as if they were
        # initialized with this batch's moments.
        mixed_renorm_mean = (self.renorm_mean +
                             (1. - self.renorm_mean_weight) * mean)
        mixed_renorm_stddev = (self.renorm_stddev +
                               (1. - self.renorm_stddev_weight) * stddev)
        # Compute the corrections for batch renorm.
        r = stddev / mixed_renorm_stddev
        d = (mean - mixed_renorm_mean) / mixed_renorm_stddev
        # Ensure the corrections use pre-update moving averages.
        with ops.control_dependencies([r, d]):
            mean = array_ops.identity(mean)
            stddev = array_ops.identity(stddev)
        rmin, rmax, dmax = [
            self.renorm_clipping.get(key) for key in ['rmin', 'rmax', 'dmax']
        ]
        if rmin is not None:
            r = math_ops.maximum(r, rmin)
        if rmax is not None:
            r = math_ops.minimum(r, rmax)
        if dmax is not None:
            d = math_ops.maximum(d, -dmax)
            d = math_ops.minimum(d, dmax)
        # When not training, use r=1, d=0.
        r = tf_utils.smart_cond(training, lambda: r,
                                lambda: array_ops.ones_like(r))
        d = tf_utils.smart_cond(training, lambda: d,
                                lambda: array_ops.zeros_like(d))

        def _update_renorm_variable(var, weight, value):
            """Updates a moving average and weight, returns the unbiased value."""
            value = array_ops.identity(value)

            def _do_update():
                """Updates the var and weight, returns their updated ratio."""
                # Update the variables without zero debiasing. The debiasing will be
                # accomplished by dividing the exponential moving average by the weight.
                # For example, after a single update, the moving average would be
                # (1-decay) * value. and the weight will be 1-decay, with their ratio
                # giving the value.
                # Make sure the weight is not updated until before r and d computation.
                with ops.control_dependencies([value]):
                    weight_value = array_ops.constant(1., dtype=weight.dtype)
                new_var = self._assign_moving_average(var, value,
                                                      self.renorm_momentum)
                new_weight = self._assign_moving_average(
                    weight, weight_value, self.renorm_momentum)
                # TODO(yuefengz): the updates to var and weighted can not be batched
                # together if we fetch their updated values here. Consider calculating
                # new values and delaying the updates.
                return new_var / new_weight

            def _fake_update():
                return array_ops.identity(var)

            return tf_utils.smart_cond(training, _do_update, _fake_update)

        # TODO(yuefengz): colocate the operations
        new_mean = _update_renorm_variable(self.renorm_mean,
                                           self.renorm_mean_weight, mean)
        new_stddev = _update_renorm_variable(self.renorm_stddev,
                                             self.renorm_stddev_weight, stddev)
        # Make sqrt(moving_variance + epsilon) = new_stddev.
        new_variance = math_ops.square(new_stddev) - self.epsilon

        return (r, d, new_mean, new_variance)
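# --- Simplified NumPy sketch (an assumption, not the original code) of the
# batch renorm correction computed above: r rescales and d shifts the
# batch-normalized output toward the moving statistics, with r clipped to
# [rmin, rmax] and d to [-dmax, dmax]. Unlike the method above, this sketch
# uses plain moving statistics rather than the mixed estimates, and the
# clipping values are assumed defaults.
import numpy as np

def renorm_correction(batch_mean, batch_std, moving_mean, moving_std,
                      rmin=1.0 / 3.0, rmax=3.0, dmax=5.0):
  r = np.clip(batch_std / moving_std, rmin, rmax)
  d = np.clip((batch_mean - moving_mean) / moving_std, -dmax, dmax)
  return r, d

print(renorm_correction(batch_mean=0.4, batch_std=1.5,
                        moving_mean=0.0, moving_std=1.0))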
  def loss(x):
    return math_ops.square(asinp1(x))
  def _variance(self):
    return self.concentration / math_ops.square(self.rate)
    def _renorm_correction_and_moments(self, mean, variance, training):
        """Returns the correction and update values for renorm."""
        stddev = math_ops.sqrt(variance + self.epsilon)
        # Compute the average mean and standard deviation, as if they were
        # initialized with this batch's moments.
        mixed_renorm_mean = (self.renorm_mean +
                             (1. - self.renorm_mean_weight) * mean)
        mixed_renorm_stddev = (self.renorm_stddev +
                               (1. - self.renorm_stddev_weight) * stddev)
        # Compute the corrections for batch renorm.
        r = stddev / mixed_renorm_stddev
        d = (mean - mixed_renorm_mean) / mixed_renorm_stddev
        # Ensure the corrections use pre-update moving averages.
        with ops.control_dependencies([r, d]):
            mean = array_ops.identity(mean)
            stddev = array_ops.identity(stddev)
        rmin, rmax, dmax = [
            self.renorm_clipping.get(key) for key in ['rmin', 'rmax', 'dmax']
        ]
        if rmin is not None:
            r = math_ops.maximum(r, rmin)
        if rmax is not None:
            r = math_ops.minimum(r, rmax)
        if dmax is not None:
            d = math_ops.maximum(d, -dmax)
            d = math_ops.minimum(d, dmax)
        # When not training, use r=1, d=0, and decay=1 meaning no updates.
        r = _smart_select(training, lambda: r, lambda: array_ops.ones_like(r))
        d = _smart_select(training, lambda: d, lambda: array_ops.zeros_like(d))
        decay = _smart_select(training, lambda: self.renorm_momentum,
                              lambda: 1.)

        def _update_renorm_variable(var, weight, value):
            """Updates a moving average and weight, returns the unbiased value."""
            # Update the variables without zero debiasing. The debiasing will be
            # accomplished by dividing the exponential moving average by the weight.
            # For example, after a single update, the moving average would be
            # (1-decay) * value. and the weight will be 1-decay, with their ratio
            # giving value.
            # Make sure the weight is not updated until before r and d computation.
            value = array_ops.identity(value)
            with ops.control_dependencies([value]):
                weight_value = array_ops.constant(1., dtype=weight.dtype)
            new_var = moving_averages.assign_moving_average(var,
                                                            value,
                                                            decay,
                                                            zero_debias=False)
            new_weight = moving_averages.assign_moving_average(
                weight, weight_value, decay, zero_debias=False)
            return new_var / new_weight

        with ops.colocate_with(self.moving_mean):
            new_mean = _update_renorm_variable(self.renorm_mean,
                                               self.renorm_mean_weight, mean)
        with ops.colocate_with(self.moving_variance):
            new_stddev = _update_renorm_variable(self.renorm_stddev,
                                                 self.renorm_stddev_weight,
                                                 stddev)
            # Make sqrt(moving_variance + epsilon) = new_stddev.
            new_variance = math_ops.square(new_stddev) - self.epsilon

        return (r, d, new_mean, new_variance)
  def _map_fn(x, y, z):
    return math_ops.square(x), math_ops.square(y), math_ops.square(z)
  def _variance(self):
    return math_ops.square(self.std())
def per_example_squared_hinge_loss(labels, weights, predictions):
    loss = losses.hinge_loss(labels=labels,
                             logits=predictions,
                             weights=weights)
    return math_ops.square(loss), control_flow_ops.no_op()
def kernel(step_size, n_leapfrog_steps, x, target_log_prob_fn, event_dims=(),
           x_log_prob=None, x_grad=None, skip_metropolis_step=False, name=None):
  """Runs one iteration of Hamiltonian Monte Carlo.

  Hamiltonian Monte Carlo (HMC) is a Markov chain Monte Carlo (MCMC)
  algorithm that takes a series of gradient-informed steps to produce
  a Metropolis proposal. This function applies one step of HMC to
  randomly update the variable `x`.

  This function can update multiple chains in parallel. It assumes
  that all dimensions of `x` not specified in `event_dims` are
  independent, and should therefore be updated independently. The
  output of `target_log_prob_fn()` should sum log-probabilities across
  all event dimensions. Slices along dimensions not in `event_dims`
  may have different target distributions; for example, if
  `event_dims == (1,)`, then `x[0, :]` could have a different target
  distribution from x[1, :]. This is up to `target_log_prob_fn()`.

  Args:
    step_size: Scalar step size or array of step sizes for the
      leapfrog integrator. Broadcasts to the shape of
      `x`. Larger step sizes lead to faster progress, but
      too-large step sizes make rejection exponentially more likely.
      When possible, it's often helpful to match per-variable step
      sizes to the standard deviations of the target distribution in
      each variable.
    n_leapfrog_steps: Integer number of steps to run the leapfrog
      integrator for. Total progress per HMC step is roughly
      proportional to step_size * n_leapfrog_steps.
    x: Tensor containing the value(s) of the random variable(s) to update.
    target_log_prob_fn: Python callable which takes an argument like `initial_x`
      and returns its (possibly unnormalized) log-density under the target
      distribution.
    event_dims: List of dimensions that should not be treated as
      independent. This allows for multiple chains to be run independently
      in parallel. Default is (), i.e., all dimensions are independent.
    x_log_prob (optional): Tensor containing the cached output of a previous
      call to `target_log_prob_fn()` evaluated at `x` (such as that provided by
      a previous call to `kernel()`). Providing `x_log_prob` and
      `x_grad` saves one gradient computation per call to `kernel()`.
    x_grad (optional): Tensor containing the cached gradient of
      `target_log_prob_fn()` evaluated at `x` (such as that provided by
      a previous call to `kernel()`). Providing `x_log_prob` and
      `x_grad` saves one gradient computation per call to `kernel()`.
    skip_metropolis_step (optional): boolean specifying whether to skip the
      Metropolis-Hastings step and directly return the newly proposed values
      by the integrator. The acceptance probabilities returned remain unchanged.
    name: Python `str` name prefixed to Ops created by this function.

  Returns:
    updated_x: The updated variable(s) x. Has shape matching `initial_x`.
    acceptance_probs: Tensor with the acceptance probabilities for the final
      iteration. This is useful for diagnosing step size problems etc. Has
      shape matching `target_log_prob_fn(initial_x)`.
    new_log_prob: The value of `target_log_prob_fn()` evaluated at `updated_x`.
    new_grad: The value of the gradient of `target_log_prob_fn()` evaluated at
      `updated_x`.

  #### Examples:

  ```python
  # Tuning acceptance rates:
  target_accept_rate = 0.631
  def target_log_prob(x):
    # Standard normal
    return tf.reduce_sum(-0.5 * tf.square(x))
  initial_x = tf.zeros([10])
  initial_log_prob = target_log_prob(initial_x)
  initial_grad = tf.gradients(initial_log_prob, initial_x)[0]
  # Algorithm state
  x = tf.Variable(initial_x, name='x')
  step_size = tf.Variable(1., name='step_size')
  last_log_prob = tf.Variable(initial_log_prob, name='last_log_prob')
  last_grad = tf.Variable(initial_grad, name='last_grad')
  # Compute updates
  new_x, acceptance_prob, log_prob, grad = hmc.kernel(step_size, 3, x,
                                                      target_log_prob,
                                                      event_dims=[0],
                                                      x_log_prob=last_log_prob)
  x_update = tf.assign(x, new_x)
  log_prob_update = tf.assign(last_log_prob, log_prob)
  grad_update = tf.assign(last_grad, grad)
  step_size_update = tf.assign(step_size,
                               tf.where(acceptance_prob > target_accept_rate,
                                        step_size * 1.01, step_size / 1.01))
  adaptive_updates = [x_update, log_prob_update, grad_update, step_size_update]
  sampling_updates = [x_update, log_prob_update, grad_update]

  sess = tf.Session()
  sess.run(tf.global_variables_initializer())
  # Warm up the sampler and adapt the step size
  for i in xrange(500):
    sess.run(adaptive_updates)
  # Collect samples without adapting step size
  samples = np.zeros([500, 10])
  for i in xrange(500):
    x_val, _ = sess.run([new_x, sampling_updates])
    samples[i] = x_val
  ```

  ```python
  # Empirical-Bayes estimation of a hyperparameter by MCMC-EM:

  # Problem setup
  N = 150
  D = 10
  x = np.random.randn(N, D).astype(np.float32)
  true_sigma = 0.5
  true_beta = true_sigma * np.random.randn(D).astype(np.float32)
  y = x.dot(true_beta) + np.random.randn(N).astype(np.float32)

  def log_prior(beta, log_sigma):
    return tf.reduce_sum(-0.5 / tf.exp(2 * log_sigma) * tf.square(beta) -
                         log_sigma)
  def regression_log_joint(beta, log_sigma, x, y):
    # This function returns log p(beta | log_sigma) + log p(y | x, beta).
    means = tf.matmul(tf.expand_dims(beta, 0), x, transpose_b=True)
    means = tf.squeeze(means)
    log_likelihood = tf.reduce_sum(-0.5 * tf.square(y - means))
    return log_prior(beta, log_sigma) + log_likelihood
  def log_joint_partial(beta):
    return regression_log_joint(beta, log_sigma, x, y)
  # Our estimate of log(sigma)
  log_sigma = tf.Variable(0., name='log_sigma')
  # The state of the Markov chain
  beta = tf.Variable(tf.random_normal([x.shape[1]]), name='beta')
  new_beta, _, _, _ = hmc.kernel(0.1, 5, beta, log_joint_partial,
                                 event_dims=[0])
  beta_update = tf.assign(beta, new_beta)
  optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01)
  with tf.control_dependencies([beta_update]):
    log_sigma_update = optimizer.minimize(-log_prior(beta, log_sigma),
                                          var_list=[log_sigma])

  sess = tf.Session()
  sess.run(tf.global_variables_initializer())
  log_sigma_history = np.zeros(1000)
  for i in xrange(1000):
    log_sigma_val, _ = sess.run([log_sigma, log_sigma_update])
    log_sigma_history[i] = log_sigma_val
  # Should converge to something close to true_sigma
  plt.plot(np.exp(log_sigma_history))
  ```
  """
  with ops.name_scope(name, 'hmc_kernel', [step_size, n_leapfrog_steps, x]):
    potential_and_grad = _make_potential_and_grad(target_log_prob_fn)
    x = ops.convert_to_tensor(x, name='x')

    x_shape = array_ops.shape(x)
    m = random_ops.random_normal(x_shape, dtype=x.dtype)

    kinetic_0 = 0.5 * math_ops.reduce_sum(math_ops.square(m), event_dims)

    if (x_log_prob is not None) and (x_grad is not None):
      log_potential_0, grad_0 = -x_log_prob, -x_grad  # pylint: disable=invalid-unary-operand-type
    else:
      if x_log_prob is not None:
        logging.warn('x_log_prob was provided, but x_grad was not,'
                     ' so x_log_prob was not used.')
      if x_grad is not None:
        logging.warn('x_grad was provided, but x_log_prob was not,'
                     ' so x_grad was not used.')
      log_potential_0, grad_0 = potential_and_grad(x)

    new_x, new_m, log_potential_1, grad_1 = leapfrog_integrator(
        step_size, n_leapfrog_steps, x, m, potential_and_grad, grad_0)

    kinetic_1 = 0.5 * math_ops.reduce_sum(math_ops.square(new_m), event_dims)

    energy_change = log_potential_1 - log_potential_0 + kinetic_1 - kinetic_0
    # Treat NaN as infinite energy (and therefore guaranteed rejection).
    energy_change = array_ops.where(
        math_ops.is_nan(energy_change),
        array_ops.fill(array_ops.shape(energy_change),
                       energy_change.dtype.as_numpy_dtype(np.inf)),
        energy_change)
    acceptance_probs = math_ops.exp(math_ops.minimum(-energy_change, 0.))
    
    # If we are skipping the MH step directly return
    if skip_metropolis_step:
      return new_x, acceptance_probs, -log_potential_1, -grad_1
    
    accepted = (
        random_ops.random_uniform(
            array_ops.shape(acceptance_probs), dtype=x.dtype)
        < acceptance_probs)
    new_log_prob = -array_ops.where(accepted, log_potential_1, log_potential_0)

    # TODO(b/65738010): This should work, but it doesn't for now.
    # reduced_shape = math_ops.reduced_shape(x_shape, event_dims)
    reduced_shape = array_ops.shape(math_ops.reduce_sum(x, event_dims,
                                                        keep_dims=True))
    accepted = array_ops.reshape(accepted, reduced_shape)
    accepted = math_ops.logical_or(
        accepted, math_ops.cast(array_ops.zeros_like(x), dtypes.bool))
    new_x = array_ops.where(accepted, new_x, x)
    new_grad = -array_ops.where(accepted, grad_1, grad_0)

  # TODO(langmore) Gradients of acceptance_probs and new_log_prob with respect
  # to initial_x will propagate NaNs (see testNanFromGradsDontPropagate).  This
  # should be fixed.
  return new_x, acceptance_probs, new_log_prob, new_grad
  def variance(self, name="variance"):
    with ops.name_scope(self.name):
      with ops.op_scope([self.range()], name):
        return math_ops.square(self.range()) / 12.
def npairs_loss(labels,
                embeddings_anchor,
                embeddings_positive,
                reg_lambda=0.002,
                print_losses=False):
    """Computes the npairs loss.

  Npairs loss expects paired data where a pair is composed of samples from the
  same label and each pair in the minibatch has a different label. The loss
  has two components. The first component is the L2 regularizer on the
  embedding vectors. The second component is the sum of cross entropy losses
  which take each row of the pair-wise similarity matrix as logits and
  the remapped one-hot labels as labels.

  See: http://www.nec-labs.com/uploads/images/Department-Images/MediaAnalytics/papers/nips16_npairmetriclearning.pdf

  Args:
    labels: 1-D tf.int32 `Tensor` of shape [batch_size/2].
    embeddings_anchor: 2-D Tensor of shape [batch_size/2, embedding_dim] for the
      embedding vectors for the anchor images. Embeddings should not be
      l2 normalized.
    embeddings_positive: 2-D Tensor of shape [batch_size/2, embedding_dim] for the
      embedding vectors for the positive images. Embeddings should not be
      l2 normalized.
    reg_lambda: Float. L2 regularization term on the embedding vectors.
    print_losses: Boolean. Option to print the xent and l2loss.

  Returns:
    npairs_loss: tf.float32 scalar.
  """
    # pylint: enable=line-too-long
    # Add the regularizer on the embedding.
    reg_anchor = math_ops.reduce_mean(
        math_ops.reduce_sum(math_ops.square(embeddings_anchor), 1))
    reg_positive = math_ops.reduce_mean(
        math_ops.reduce_sum(math_ops.square(embeddings_positive), 1))
    l2loss = math_ops.multiply(0.25 * reg_lambda,
                               reg_anchor + reg_positive,
                               name='l2loss')

    # Get per pair similarities.
    similarity_matrix = math_ops.matmul(embeddings_anchor,
                                        embeddings_positive,
                                        transpose_a=False,
                                        transpose_b=True)

    # Reshape [batch_size] label tensor to a [batch_size, 1] label tensor.
    lshape = array_ops.shape(labels)
    assert lshape.shape == 1
    labels = array_ops.reshape(labels, [lshape[0], 1])

    labels_remapped = math_ops.to_float(
        math_ops.equal(labels, array_ops.transpose(labels)))
    labels_remapped /= math_ops.reduce_sum(labels_remapped, 1, keepdims=True)

    # Add the softmax loss.
    xent_loss = nn.softmax_cross_entropy_with_logits(logits=similarity_matrix,
                                                     labels=labels_remapped)
    xent_loss = math_ops.reduce_mean(xent_loss, name='xentropy')

    if print_losses:
        xent_loss = logging_ops.Print(
            xent_loss, ['cross entropy:', xent_loss, 'l2loss:', l2loss])

    return l2loss + xent_loss
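# Hypothetical call of npairs_loss (TF1-style graph mode), following the
# shapes in the docstring above: labels is a [batch_size/2] int32 tensor and
# the two embedding tensors are [batch_size/2, embedding_dim]. The toy values
# below are illustrative only.
import numpy as np
import tensorflow as tf

labels = tf.constant(np.array([0, 1, 2, 3], dtype=np.int32))       # [B]
anchor = tf.constant(np.random.randn(4, 8).astype(np.float32))     # [B, D]
positive = tf.constant(np.random.randn(4, 8).astype(np.float32))   # [B, D]
loss = npairs_loss(labels, anchor, positive, reg_lambda=0.002)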
def kernel_classifier_distance_and_std_from_activations(
        real_activations,
        generated_activations,
        max_block_size=1024,
        dtype=None):
    """Kernel "classifier" distance for evaluating a generative model.

  This method computes the kernel classifier distance from activations of
  real images and generated images. This can be used independently of the
  kernel_classifier_distance() method, especially in the case of using large
  batches during evaluation where we would like to precompute all of the
  activations before computing the classifier distance, or if we want to
  compute multiple metrics based on the same images. It also returns a rough
  estimate of the standard error of the estimator.

  This technique is described in detail in https://arxiv.org/abs/1801.01401.
  Given two distributions P and Q of activations, this function calculates

      E_{X, X' ~ P}[k(X, X')] + E_{Y, Y' ~ Q}[k(Y, Y')]
        - 2 E_{X ~ P, Y ~ Q}[k(X, Y)]

  where k is the polynomial kernel

      k(x, y) = ( x^T y / dimension + 1 )^3.

  This captures how different the distributions of real and generated images'
  visual features are. Like the Frechet distance (and unlike the Inception
  score), this is a true distance and incorporates information about the
  target images. Unlike the Frechet score, this function computes an
  *unbiased* and asymptotically normal estimator, which makes comparing
  estimates across models much more intuitive.

  The estimator used takes time quadratic in max_block_size. Larger values of
  max_block_size will decrease the variance of the estimator but increase the
  computational cost. This differs slightly from the estimator used by the
  original paper; it is the block estimator of https://arxiv.org/abs/1307.1954.
  The estimate of the standard error will also be more reliable when there are
  more blocks, i.e. when max_block_size is smaller.

  NOTE: the blocking code assumes that real_activations and
  generated_activations are both in random order. If either is sorted in a
  meaningful order, the estimator will behave poorly.

  Args:
    real_activations: 2D Tensor containing activations of real data. Shape is
      [batch_size, activation_size].
    generated_activations: 2D Tensor containing activations of generated data.
      Shape is [batch_size, activation_size].
    max_block_size: integer, default 1024. The distance estimator splits samples
      into blocks for computational efficiency. Larger values are more
      computationally expensive but decrease the variance of the distance
      estimate. Having a smaller block size also gives a better estimate of the
      standard error.
    dtype: If not None, coerce activations to this dtype before computations.

  Returns:
   The Kernel Inception Distance. A floating-point scalar of the same type
     as the output of the activations.
   An estimate of the standard error of the distance estimator (a scalar of
     the same type).
  """

    real_activations.shape.assert_has_rank(2)
    generated_activations.shape.assert_has_rank(2)
    real_activations.shape[1].assert_is_compatible_with(
        generated_activations.shape[1])

    if dtype is None:
        dtype = real_activations.dtype
        assert generated_activations.dtype == dtype
    else:
        real_activations = math_ops.cast(real_activations, dtype)
        generated_activations = math_ops.cast(generated_activations, dtype)

    # Figure out how to split the activations into blocks of approximately
    # equal size, with none larger than max_block_size.
    n_r = array_ops.shape(real_activations)[0]
    n_g = array_ops.shape(generated_activations)[0]

    n_bigger = math_ops.maximum(n_r, n_g)
    n_blocks = math_ops.cast(math_ops.ceil(n_bigger / max_block_size),
                             dtypes.int32)

    v_r = n_r // n_blocks
    v_g = n_g // n_blocks

    n_plusone_r = n_r - v_r * n_blocks
    n_plusone_g = n_g - v_g * n_blocks

    sizes_r = array_ops.concat([
        array_ops.fill([n_blocks - n_plusone_r], v_r),
        array_ops.fill([n_plusone_r], v_r + 1),
    ], 0)
    sizes_g = array_ops.concat([
        array_ops.fill([n_blocks - n_plusone_g], v_g),
        array_ops.fill([n_plusone_g], v_g + 1),
    ], 0)

    zero = array_ops.zeros([1], dtype=dtypes.int32)
    inds_r = array_ops.concat([zero, math_ops.cumsum(sizes_r)], 0)
    inds_g = array_ops.concat([zero, math_ops.cumsum(sizes_g)], 0)

    dim = math_ops.cast(real_activations.shape[1], dtype)

    def compute_kid_block(i):
        """Computes the ith block of the KID estimate."""
        r_s = inds_r[i]
        r_e = inds_r[i + 1]
        r = real_activations[r_s:r_e]
        m = math_ops.cast(r_e - r_s, dtype)

        g_s = inds_g[i]
        g_e = inds_g[i + 1]
        g = generated_activations[g_s:g_e]
        n = math_ops.cast(g_e - g_s, dtype)

        k_rr = (math_ops.matmul(r, r, transpose_b=True) / dim + 1)**3
        k_rg = (math_ops.matmul(r, g, transpose_b=True) / dim + 1)**3
        k_gg = (math_ops.matmul(g, g, transpose_b=True) / dim + 1)**3
        return (-2 * math_ops.reduce_mean(k_rg) +
                (math_ops.reduce_sum(k_rr) - math_ops.trace(k_rr)) /
                (m * (m - 1)) +
                (math_ops.reduce_sum(k_gg) - math_ops.trace(k_gg)) / (n *
                                                                      (n - 1)))

    ests = map_fn.map_fn(compute_kid_block,
                         math_ops.range(n_blocks),
                         dtype=dtype,
                         back_prop=False)

    mn = math_ops.reduce_mean(ests)

    # nn_impl.moments doesn't use the Bessel correction, which we want here
    n_blocks_ = math_ops.cast(n_blocks, dtype)
    var = control_flow_ops.cond(
        math_ops.less_equal(n_blocks, 1),
        lambda: array_ops.constant(float('nan'), dtype=dtype),
        lambda: math_ops.reduce_sum(math_ops.square(ests - mn)) /
        (n_blocks_ - 1))

    return mn, math_ops.sqrt(var / n_blocks_)
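# The block estimator above is the unbiased MMD^2 with the cubic polynomial
# kernel k(x, y) = (x.y / dim + 1)**3 described in the docstring. A
# single-block NumPy version of the same quantity (illustrative only):
import numpy as np

def kid_single_block(real, gen):
    # real: [m, dim], gen: [n, dim] activation matrices.
    dim = real.shape[1]
    k_rr = (real @ real.T / dim + 1) ** 3
    k_rg = (real @ gen.T / dim + 1) ** 3
    k_gg = (gen @ gen.T / dim + 1) ** 3
    m, n = float(real.shape[0]), float(gen.shape[0])
    return ((np.sum(k_rr) - np.trace(k_rr)) / (m * (m - 1)) +
            (np.sum(k_gg) - np.trace(k_gg)) / (n * (n - 1)) -
            2 * np.mean(k_rg))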
    def unregularized_loss(self, examples):
        """Add operations to compute the loss (without the regularization loss).

    Args:
      examples: Examples to compute unregularized loss on.

    Returns:
      An Operation that computes mean (unregularized) loss for given set of
      examples.

    Raises:
      ValueError: if examples are not well defined.
    """
        self._assertSpecified([
            'example_labels', 'example_weights', 'sparse_features',
            'dense_features'
        ], examples)
        self._assertList(['sparse_features', 'dense_features'], examples)
        with name_scope('sdca/unregularized_loss'):
            predictions = math_ops.cast(self._linear_predictions(examples),
                                        dtypes.float64)
            labels = math_ops.cast(
                internal_convert_to_tensor(examples['example_labels']),
                dtypes.float64)
            weights = math_ops.cast(
                internal_convert_to_tensor(examples['example_weights']),
                dtypes.float64)

            if self._options['loss_type'] == 'logistic_loss':
                return math_ops.reduce_sum(
                    math_ops.multiply(
                        sigmoid_cross_entropy_with_logits(labels=labels,
                                                          logits=predictions),
                        weights)) / math_ops.reduce_sum(weights)

            if self._options['loss_type'] == 'poisson_loss':
                return math_ops.reduce_sum(
                    math_ops.multiply(
                        log_poisson_loss(targets=labels,
                                         log_input=predictions),
                        weights)) / math_ops.reduce_sum(weights)

            if self._options['loss_type'] in [
                    'hinge_loss', 'smooth_hinge_loss'
            ]:
                # hinge_loss = max{0, 1 - y_i w*x} where y_i \in {-1, 1}. So, we need to
                # first convert 0/1 labels into -1/1 labels.
                all_ones = array_ops.ones_like(predictions)
                adjusted_labels = math_ops.subtract(2 * labels, all_ones)
                # Tensor that contains (unweighted) error (hinge loss) per
                # example.
                error = nn_ops.relu(
                    math_ops.subtract(
                        all_ones,
                        math_ops.multiply(adjusted_labels, predictions)))
                weighted_error = math_ops.multiply(error, weights)
                return math_ops.reduce_sum(
                    weighted_error) / math_ops.reduce_sum(weights)

            # squared loss
            err = math_ops.subtract(labels, predictions)

            weighted_squared_err = math_ops.multiply(math_ops.square(err),
                                                     weights)
            # SDCA squared loss function is sum(err^2) / (2*sum(weights))
            return (math_ops.reduce_sum(weighted_squared_err) /
                    (2.0 * math_ops.reduce_sum(weights)))
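# As the comments above note, hinge loss expects labels in {-1, +1} (hence the
# 2*labels - 1 mapping) and the SDCA squared loss is normalized by
# 2 * sum(weights). A NumPy sketch of those two branches (illustrative only):
import numpy as np

def hinge_loss_mean(labels01, predictions, weights):
    adjusted = 2.0 * labels01 - 1.0               # {0, 1} -> {-1, +1}
    error = np.maximum(0.0, 1.0 - adjusted * predictions)
    return np.sum(error * weights) / np.sum(weights)

def sdca_squared_loss(labels, predictions, weights):
    err = labels - predictions
    return np.sum(weights * np.square(err)) / (2.0 * np.sum(weights))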
def _SvdGrad(op, grad_s, grad_u, grad_v):
    """Gradient for Svd based on Giles' algorithm. Reference at top of file."""
    def _Adjoint(x):
        return math_ops.conj(array_ops.matrix_transpose(x))

    if op.get_attr("compute_uv") and not op.get_attr("full_matrices"):
        raise NotImplementedError(
            "SVD gradient is not implemented for compute_uv=True and "
            "full_matrices=False.")

    a = op.inputs[0]
    a_shape = a.get_shape().with_rank_at_least(2)

    if op.get_attr("compute_uv"):
        # TODO(rmlarsen): Make this work with complex types.
        if a.dtype.is_complex:
            raise NotImplementedError(
                "SVD gradient is not implemented for complex types and "
                "compute_uv=True.")
        grad_u_shape = grad_u.get_shape().with_rank_at_least(2)
        grad_v_shape = grad_v.get_shape().with_rank_at_least(2)
        m = a_shape[-2].merge_with(grad_u_shape[-2])
        n = a_shape[-1].merge_with(grad_v_shape[-2])
        batch_shape = a_shape[:-2].merge_with(grad_u_shape[:-2]).merge_with(
            grad_v_shape[:-2])
        a_shape = batch_shape.concatenate([m, n])

    m = a_shape[-2].value
    n = a_shape[-1].value
    # TODO(rmlarsen): Make this work with placeholders.
    if m is None or n is None:
        raise NotImplementedError(
            "SVD gradient has not been implemented for input with unknown "
            "inner matrix shape.")

    if not op.get_attr("full_matrices") or not op.get_attr("compute_uv"):
        s, u, v = linalg_ops.svd(a, compute_uv=True, full_matrices=True)
    else:
        s = op.outputs[0]
        u = op.outputs[1]
        v = op.outputs[2]

    use_adjoint = False
    if m > n:
        # Compute the gradient for A^H = V * S^T * U^H, and (implicitly) take the
        # Hermitian transpose of the gradient at the end.
        use_adjoint = True
        m, n = n, m
        u, v = v, u
        grad_u, grad_v = grad_v, grad_u

    with ops.control_dependencies([grad_s, grad_u, grad_v]):
        grad_s_mat = array_ops.matrix_diag(grad_s)
        if not op.get_attr("compute_uv"):
            if use_adjoint:
                grad_a = math_ops.matmul(v[..., :, :m],
                                         math_ops.matmul(u, grad_s_mat),
                                         adjoint_b=True)
            else:
                grad_a = math_ops.matmul(
                    u,
                    math_ops.matmul(grad_s_mat, v[..., :, :m], adjoint_b=True))
            grad_a.set_shape(a_shape)
            return grad_a

        # TODO(rmlarsen): Define a gradient that is numerically stable for
        # abs(m-n) > 1. Currently this does not work because there are effectively
        # multiple singular values with value zero. I am not sure if this is a true
        # instability or if it simply throws off the finite difference gradient
        # checker.
        if abs(m - n) > 1:
            raise NotImplementedError(
                "svd gradient is not implemented for abs(m - n) > 1")
        s_mat = array_ops.matrix_diag(s)
        s2 = math_ops.square(s)

        # NOTICE: Because of the term involving f, the gradient becomes
        # infinite (or NaN in practice) when singular values are not unique.
        # Mathematically this should not be surprising, since for (k-fold)
        # degenerate singular values, the corresponding singular vectors are
        # only defined up a (k-dimensional) subspace. In practice, this can
        # lead to numerical instability when singular values are close but not
        # exactly equal.
        f = array_ops.matrix_set_diag(
            math_ops.reciprocal(
                array_ops.expand_dims(s2, -2) - array_ops.expand_dims(s2, -1)),
            array_ops.zeros_like(s))
        s_inv_mat = array_ops.matrix_diag(math_ops.reciprocal(s))
        u_gu = math_ops.matmul(u, grad_u, adjoint_a=True)
        v_gv = math_ops.matmul(v, grad_v, adjoint_a=True)

        if m == n:
            f_u = f * u_gu
            f_v = f * v_gv
        else:
            dv2 = array_ops.matrix_transpose(
                v_gv[..., m:n, :m]) - v_gv[..., :m, m:n]
            f_u = f * u_gu
            f_v = f * v_gv[..., :m, :m]

        grad_a_nouv = (grad_s_mat +
                       math_ops.matmul(f_u + _Adjoint(f_u), s_mat) +
                       math_ops.matmul(s_mat, f_v + _Adjoint(f_v)))

        if m != n:
            grad_a_nouv = array_ops.concat(
                [grad_a_nouv, math_ops.matmul(s_inv_mat, dv2)], -1)

        if use_adjoint:
            # Use (U X V^H)^H = V (U X)^H.
            grad_a = math_ops.matmul(v,
                                     math_ops.matmul(u, grad_a_nouv),
                                     adjoint_b=True)
        else:
            grad_a = math_ops.matmul(
                u, math_ops.matmul(grad_a_nouv, v, adjoint_b=True))

        grad_a.set_shape(a_shape)
        return grad_a
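# The matrix `f` above has entries f[i, j] = 1 / (s[j]**2 - s[i]**2) with a
# zeroed diagonal, which is why nearly equal singular values make the gradient
# blow up. A NumPy sketch of its construction (illustrative only):
import numpy as np

s = np.array([3.0, 2.0, 1.0])
s2 = s ** 2
with np.errstate(divide='ignore'):
    f = 1.0 / (s2[None, :] - s2[:, None])  # expand_dims(s2, -2) - expand_dims(s2, -1)
np.fill_diagonal(f, 0.0)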
def _log_unnormalized_prob(self, x):
    #         return self._z(x) / self.scale
    return -0.5 * math_ops.square(self._z(x))
def _TanhGrad(op, grad):
    """Returns grad * (1 - tanh(x) * tanh(x))."""
    y = op.outputs[0]  # y = tanh(x)
    with ops.control_dependencies([grad.op]):
        return grad * (1 - math_ops.square(y))
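# A quick finite-difference check of the identity used above,
# d tanh(x)/dx = 1 - tanh(x)**2 (NumPy, illustrative only):
import numpy as np

x = np.linspace(-3., 3., 7)
eps = 1e-6
numeric = (np.tanh(x + eps) - np.tanh(x - eps)) / (2 * eps)
analytic = 1 - np.tanh(x) ** 2
assert np.allclose(numeric, analytic, atol=1e-8)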
def my_metric_tool_cond(y_true, y_pred):
    return K.mean(math_ops.square(y_pred - y_true))
def call(self, inputs):
    return math_ops.square(inputs)
def _InvGrad(op, grad):
    """Returns -grad * (1 / x^2)."""
    y = op.outputs[0]  # y = 1 / x
    # Added control dependencies to prevent -x^2 from being computed too early.
    with ops.control_dependencies([grad.op]):
        return grad * (-math_ops.square(y))
    def _process_input_helper(self,
                              update_row_factors,
                              sp_input=None,
                              transpose_input=False,
                              row_weights=None):
        """Creates the graph for processing a sparse slice of input.

    Args:
      update_row_factors: if True, update or project the row_factors, else
        update or project the column factors.
      sp_input: Please refer to comments for update_row_factors,
        update_col_factors, project_row_factors, and project_col_factors for
        restrictions.
      transpose_input: If True, the input is logically transposed and then the
        corresponding rows/columns of the transposed input are updated.
      row_weights: If not None, this is the row/column weights to be used for
        the update or projection. If None, use the corresponding weights from
        the model. Note that the feature (column/row) weights will be
        determined by the model. When not None, it can either be a scalar or
        a rank-1 tensor with the same number of elements as the number of rows
        or columns to be updated/projected.

    Returns:
      A tuple consisting of the following elements:
      new_values: New values for the row/column factors.
      update_op: An op that assigns the newly computed values to the row/column
        factors.
      unregularized_loss: A tensor (scalar) that contains the normalized
        minibatch loss corresponding to sp_input, without the regularization
        term. Add the regularization term below to yield the loss.
      regularization: A tensor (scalar) that contains the normalized
        regularization term for the minibatch loss corresponding to sp_input.
      sum_weights: The sum of the weights corresponding to sp_input. This
        can be used with unregularized loss to calculate the root weighted
        squared error.
    """
        assert isinstance(sp_input, sparse_tensor.SparseTensor)

        if update_row_factors:
            left = self._row_factors
            right_factors = self._col_factors_cache
            row_wt = self._row_wt_cache
            col_wt = self._col_wt_cache
            total_rows = self._input_rows
            total_cols = self._input_cols
            sharding_func = WALSModel._get_sharding_func(
                self._input_rows, self._num_row_shards)
            gramian = self._col_gramian_cache
        else:
            left = self._col_factors
            right_factors = self._row_factors_cache
            row_wt = self._col_wt_cache
            col_wt = self._row_wt_cache
            total_rows = self._input_cols
            total_cols = self._input_rows
            sharding_func = WALSModel._get_sharding_func(
                self._input_cols, self._num_col_shards)
            gramian = self._row_gramian_cache
            transpose_input = not transpose_input

        # Note that the row indices of sp_input are based on the original full input.
        # Here we reindex the rows and give them contiguous ids starting at 0.
        # We use tf.unique to achieve this reindexing. Note that this is done so
        # that the downstream kernel can assume that the input is "dense" along the
        # row dimension.
        row_ids, col_ids = array_ops.split(value=sp_input.indices,
                                           num_or_size_splits=2,
                                           axis=1)
        update_row_indices, all_row_ids = array_ops.unique(row_ids[:, 0])
        update_col_indices, all_col_ids = array_ops.unique(col_ids[:, 0])
        col_ids = array_ops.expand_dims(
            math_ops.cast(all_col_ids, dtypes.int64), 1)
        row_ids = array_ops.expand_dims(
            math_ops.cast(all_row_ids, dtypes.int64), 1)

        if transpose_input:
            update_indices = update_col_indices
            row_shape = [
                math_ops.cast(
                    array_ops.shape(update_row_indices)[0], dtypes.int64)
            ]
            gather_indices = update_row_indices
        else:
            update_indices = update_row_indices
            row_shape = [
                math_ops.cast(
                    array_ops.shape(update_col_indices)[0], dtypes.int64)
            ]
            gather_indices = update_col_indices

        num_rows = math_ops.cast(
            array_ops.shape(update_indices)[0], dtypes.int64)
        col_shape = [num_rows]
        right = embedding_ops.embedding_lookup(right_factors,
                                               gather_indices,
                                               partition_strategy="div")
        new_sp_indices = array_ops.concat([row_ids, col_ids], 1)
        new_sp_shape = (array_ops.concat([row_shape, col_shape], 0)
                        if transpose_input else array_ops.concat(
                            [col_shape, row_shape], 0))
        new_sp_input = sparse_tensor.SparseTensor(indices=new_sp_indices,
                                                  values=sp_input.values,
                                                  dense_shape=new_sp_shape)

        # Compute lhs and rhs of the normal equations
        total_lhs = (self._unobserved_weight * gramian)
        if self._regularization_matrix is not None:
            total_lhs += self._regularization_matrix
        if self._row_weights is None:
            # Special case of ALS. Use a much simpler update rule.
            total_rhs = (self._unobserved_weight *
                         sparse_ops.sparse_tensor_dense_matmul(
                             new_sp_input, right, adjoint_a=transpose_input))
            # TODO(rmlarsen): handle transposing in tf.matrix_solve instead of
            # transposing explicitly.
            # TODO(rmlarsen): multi-thread tf.matrix_solve.
            new_left_values = array_ops.transpose(
                linalg_ops.matrix_solve(total_lhs,
                                        array_ops.transpose(total_rhs)))
        else:
            if row_weights is None:
                # TODO(yifanchen): Add special handling for single shard without using
                # embedding_lookup and perform benchmarks for those cases. Same for
                # col_weights lookup below.
                row_weights_slice = embedding_ops.embedding_lookup(
                    row_wt, update_indices, partition_strategy="div")
            else:
                num_indices = array_ops.shape(update_indices)[0]
                with ops.control_dependencies([
                        check_ops.assert_less_equal(
                            array_ops.rank(row_weights), 1)
                ]):
                    row_weights_slice = control_flow_ops.cond(
                        math_ops.equal(array_ops.rank(row_weights), 0), lambda:
                        (array_ops.ones([num_indices]) * row_weights),
                        lambda: math_ops.cast(row_weights, dtypes.float32))

            col_weights = embedding_ops.embedding_lookup(
                col_wt, gather_indices, partition_strategy="div")
            partial_lhs, total_rhs = (
                gen_factorization_ops.wals_compute_partial_lhs_and_rhs(
                    right,
                    col_weights,
                    self._unobserved_weight,
                    row_weights_slice,
                    new_sp_input.indices,
                    new_sp_input.values,
                    num_rows,
                    transpose_input,
                    name="wals_compute_partial_lhs_rhs"))
            total_lhs = array_ops.expand_dims(total_lhs, 0) + partial_lhs
            total_rhs = array_ops.expand_dims(total_rhs, -1)
            new_left_values = array_ops.squeeze(
                linalg_ops.matrix_solve(total_lhs, total_rhs), [2])

        update_op_name = "row_update" if update_row_factors else "col_update"
        update_op = self.scatter_update(left,
                                        update_indices,
                                        new_left_values,
                                        sharding_func,
                                        name=update_op_name)

        # Create the loss subgraph
        loss_sp_input = (sparse_ops.sparse_transpose(new_sp_input)
                         if transpose_input else new_sp_input)
        # sp_approx is the low rank estimate of the input matrix, formed by
        # computing the product <u_i, v_j> for (i, j) in loss_sp_input.indices.
        sp_approx_vals = gen_factorization_ops.masked_matmul(
            new_left_values,
            right,
            loss_sp_input.indices,
            transpose_a=False,
            transpose_b=True)
        sp_approx = sparse_tensor.SparseTensor(loss_sp_input.indices,
                                               sp_approx_vals,
                                               loss_sp_input.dense_shape)
        sp_approx_sq = math_ops.square(sp_approx)
        sp_residual = sparse_ops.sparse_add(loss_sp_input, sp_approx * (-1))
        sp_residual_sq = math_ops.square(sp_residual)
        row_wt_mat = (constant_op.constant(0.) if self._row_weights is None
                      else array_ops.expand_dims(row_weights_slice, 1))
        col_wt_mat = (constant_op.constant(0.) if self._col_weights is None
                      else array_ops.expand_dims(col_weights, 0))

        # We return the normalized loss
        partial_row_gramian = math_ops.matmul(new_left_values,
                                              new_left_values,
                                              transpose_a=True)
        normalization_factor = total_rows / math_ops.cast(
            num_rows, dtypes.float32)

        unregularized_loss = (
            self._unobserved_weight * (  # pyformat line break
                sparse_ops.sparse_reduce_sum(sp_residual_sq) -  # pyformat break
                sparse_ops.sparse_reduce_sum(sp_approx_sq) +  # pyformat break
                math_ops.trace(math_ops.matmul(partial_row_gramian, gramian)))
            + sparse_ops.sparse_reduce_sum(
                row_wt_mat *
                (sp_residual_sq * col_wt_mat))) * normalization_factor

        if self._regularization is not None:
            regularization = self._regularization * (
                math_ops.trace(partial_row_gramian) * normalization_factor +
                math_ops.trace(gramian))
        else:
            regularization = constant_op.constant(0.)

        sum_weights = self._unobserved_weight * math_ops.cast(
            total_rows * total_cols, dtypes.float32)
        if self._row_weights is not None and self._col_weights is not None:
            ones = sparse_tensor.SparseTensor(
                indices=loss_sp_input.indices,
                values=array_ops.ones(array_ops.shape(loss_sp_input.values)),
                dense_shape=loss_sp_input.dense_shape)
            sum_weights += sparse_ops.sparse_reduce_sum(
                row_wt_mat * (ones * col_wt_mat)) * normalization_factor

        return (new_left_values, update_op, unregularized_loss, regularization,
                sum_weights)
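# In the row_weights-is-None (plain ALS) branch above, each batch of row
# factors solves the normal equations
#     (unobserved_weight * gramian + regularization) u = unobserved_weight * A_rows V,
# where gramian = V^T V. A dense NumPy sketch of that update, ignoring
# sharding and sparsity (illustrative only):
import numpy as np

def als_row_update(a_rows, v, unobserved_weight, reg):
    # a_rows: [num_rows, num_cols] dense input slice, v: [num_cols, k] factors.
    k = v.shape[1]
    lhs = unobserved_weight * (v.T @ v) + reg * np.eye(k)   # total_lhs
    rhs = unobserved_weight * (a_rows @ v)                  # total_rhs, [num_rows, k]
    return np.linalg.solve(lhs, rhs.T).T                    # new row factor values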
def train_step(data):
    return math_ops.square(data)
    def _log_prob(self, x):
        if self.cholesky_input_output_matrices:
            x_sqrt = x
        else:
            # Complexity: O(nbk^3)
            x_sqrt = linalg_ops.cholesky(x)

        batch_shape = self.batch_shape_tensor()
        event_shape = self.event_shape_tensor()
        ndims = array_ops.rank(x_sqrt)
        # sample_ndims = ndims - batch_ndims - event_ndims
        sample_ndims = ndims - array_ops.shape(batch_shape)[0] - 2
        sample_shape = array_ops.strided_slice(array_ops.shape(x_sqrt), [0],
                                               [sample_ndims])

        # We need to be able to pre-multiply each matrix by its corresponding
        # batch scale matrix. Since a Distribution Tensor supports multiple
        # samples per batch, this means we need to reshape the input matrix `x`
        # so that the first b dimensions are batch dimensions and the last two
        # are of shape [dimension, dimension * number_of_samples]. Doing these
        # gymnastics allows us to do a batch_solve.
        #
        # After we're done with sqrt_solve (the batch operation) we need to undo
        # this reshaping so what we're left with is a Tensor partitionable by
        # sample, batch, event dimensions.

        # Complexity: O(nbk**2) since transpose must access every element.
        scale_sqrt_inv_x_sqrt = x_sqrt
        perm = array_ops.concat([
            math_ops.range(sample_ndims, ndims),
            math_ops.range(0, sample_ndims)
        ], 0)
        scale_sqrt_inv_x_sqrt = array_ops.transpose(scale_sqrt_inv_x_sqrt,
                                                    perm)
        shape = array_ops.concat(
            (batch_shape,
             (math_ops.cast(self.dimension, dtype=dtypes.int32), -1)), 0)
        scale_sqrt_inv_x_sqrt = array_ops.reshape(scale_sqrt_inv_x_sqrt, shape)

        # Complexity: O(nbM*k) where M is the complexity of the operator solving
        # a vector system. E.g., for LinearOperatorDiag, each solve is O(k), so
        # this complexity is O(nbk**2). For LinearOperatorLowerTriangular,
        # each solve is O(k**2) so this step has complexity O(nbk^3).
        scale_sqrt_inv_x_sqrt = self.scale_operator.solve(
            scale_sqrt_inv_x_sqrt)

        # Undo make batch-op ready.
        # Complexity: O(nbk**2)
        shape = array_ops.concat([batch_shape, event_shape, sample_shape], 0)
        scale_sqrt_inv_x_sqrt = array_ops.reshape(scale_sqrt_inv_x_sqrt, shape)
        perm = array_ops.concat([
            math_ops.range(ndims - sample_ndims, ndims),
            math_ops.range(0, ndims - sample_ndims)
        ], 0)
        scale_sqrt_inv_x_sqrt = array_ops.transpose(scale_sqrt_inv_x_sqrt,
                                                    perm)

        # Write V = SS', X = LL'. Then:
        # tr[inv(V) X] = tr[inv(S)' inv(S) L L']
        #              = tr[inv(S) L L' inv(S)']
        #              = tr[(inv(S) L) (inv(S) L)']
        #              = sum_{ik} (inv(S) L)_{ik}**2
        # The second equality follows from the cyclic permutation property.
        # Complexity: O(nbk**2)
        trace_scale_inv_x = math_ops.reduce_sum(
            math_ops.square(scale_sqrt_inv_x_sqrt), axis=[-2, -1])

        # Complexity: O(nbk)
        half_log_det_x = math_ops.reduce_sum(math_ops.log(
            array_ops.matrix_diag_part(x_sqrt)),
                                             axis=[-1])

        # Complexity: O(nbk**2)
        log_prob = ((self.df - self.dimension - 1.) * half_log_det_x -
                    0.5 * trace_scale_inv_x - self.log_normalization())

        # Set shape hints.
        # Try to merge what we know from the input then what we know from the
        # parameters of this distribution.
        if x.get_shape().ndims is not None:
            log_prob.set_shape(x.get_shape()[:-2])
        if (log_prob.get_shape().ndims is not None
                and self.batch_shape.ndims is not None
                and self.batch_shape.ndims > 0):
            log_prob.get_shape()[-self.batch_shape.ndims:].merge_with(
                self.batch_shape)

        return log_prob
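# The comment block above uses tr[inv(V) X] = sum_{ik} (inv(S) L)_{ik}**2 for
# V = S S' and X = L L'. A small NumPy check of that identity (illustrative
# only):
import numpy as np

k = 4
rng = np.random.default_rng(0)
S = np.tril(rng.normal(size=(k, k))) + k * np.eye(k)  # Cholesky-like factor of V
L = np.tril(rng.normal(size=(k, k))) + k * np.eye(k)  # Cholesky-like factor of X
lhs = np.trace(np.linalg.solve(S @ S.T, L @ L.T))
rhs = np.sum(np.square(np.linalg.solve(S, L)))
assert np.allclose(lhs, rhs)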
def _SvdGrad(op, grad_s, grad_u, grad_v):
  """Gradient for the singular value decomposition."""

  # The derivation for the compute_uv=False case, and most of
  # the derivation for the full_matrices=True case, are in
  # Giles' paper (see reference at top of file).  A derivation for
  # the full_matrices=False case is available at
  # https://j-towns.github.io/papers/svd-derivative.pdf
  # The derivation for complex valued SVD can be found in
  # https://re-ra.xyz/misc/complexsvd.pdf or
  # https://giggleliu.github.io/2019/04/02/einsumbp.html
  a = op.inputs[0]
  a_shape = a.get_shape().with_rank_at_least(2)
  grad_s = math_ops.cast(grad_s, a.dtype)
  grad_s_mat = array_ops.matrix_diag(grad_s)

  if not op.get_attr("compute_uv"):
    s, u, v = linalg_ops.svd(a, compute_uv=True)
    grad_a = math_ops.matmul(u, math_ops.matmul(grad_s_mat, v, adjoint_b=True))
    grad_a.set_shape(a_shape)
    return grad_a

  full_matrices = op.get_attr("full_matrices")

  grad_u_shape = grad_u.get_shape().with_rank_at_least(2)
  grad_v_shape = grad_v.get_shape().with_rank_at_least(2)
  m = a_shape.dims[-2].merge_with(grad_u_shape[-2])
  n = a_shape.dims[-1].merge_with(grad_v_shape[-2])
  batch_shape = a_shape[:-2].merge_with(grad_u_shape[:-2]).merge_with(
      grad_v_shape[:-2])
  a_shape = batch_shape.concatenate([m, n])

  m = a_shape.dims[-2].value
  n = a_shape.dims[-1].value
  # TODO(rmlarsen): Make this work with placeholders.
  if m is None or n is None:
    raise NotImplementedError(
        "SVD gradient has not been implemented for input with unknown "
        "inner matrix shape.")

  s = op.outputs[0]
  u = op.outputs[1]
  v = op.outputs[2]
  s = math_ops.cast(s, a.dtype)

  use_adjoint = False
  if m > n:
    # Compute the gradient for A^H = V * S^T * U^H, and (implicitly) take the
    # Hermitian transpose of the gradient at the end.
    use_adjoint = True
    m, n = n, m
    u, v = v, u
    grad_u, grad_v = grad_v, grad_u

  with ops.control_dependencies([grad_s, grad_u, grad_v]):
    if full_matrices and abs(m - n) > 1:
      raise NotImplementedError(
          "svd gradient is not implemented for abs(m - n) > 1 "
          f"when full_matrices is True. Received: m={m} and n={n} from "
          f"op input={a} with shape={a_shape}.")
    s_mat = array_ops.matrix_diag(s)
    s2 = math_ops.square(s)

    # NOTICE: Because of the term involving f, the gradient becomes
    # infinite (or NaN in practice) when singular values are not unique.
    # Mathematically this should not be surprising, since for (k-fold)
    # degenerate singular values, the corresponding singular vectors are
    # only defined up a (k-dimensional) subspace. In practice, this can
    # lead to numerical instability when singular values are close but not
    # exactly equal.

    s_shape = array_ops.shape(s)
    f = array_ops.matrix_set_diag(
        _SafeReciprocal(
            array_ops.expand_dims(s2, -2) - array_ops.expand_dims(s2, -1)),
        array_ops.zeros_like(s))
    s_inv_mat = array_ops.matrix_diag(_SafeReciprocal(s))

    v1 = v[..., :, :m]
    grad_v1 = grad_v[..., :, :m]

    u_gu = math_ops.matmul(u, grad_u, adjoint_a=True)
    v_gv = math_ops.matmul(v1, grad_v1, adjoint_a=True)

    f_u = f * u_gu
    f_v = f * v_gv

    term1_nouv = (
        grad_s_mat + math_ops.matmul(f_u + _linalg.adjoint(f_u), s_mat) +
        math_ops.matmul(s_mat, f_v + _linalg.adjoint(f_v)))

    term1 = math_ops.matmul(u, math_ops.matmul(term1_nouv, v1, adjoint_b=True))

    if m == n:
      grad_a_before_transpose = term1
    else:
      gv1t = array_ops.matrix_transpose(grad_v1, conjugate=True)
      gv1t_v1 = math_ops.matmul(gv1t, v1)
      term2_nous = gv1t - math_ops.matmul(gv1t_v1, v1, adjoint_b=True)

      if full_matrices:
        v2 = v[..., :, m:n]
        grad_v2 = grad_v[..., :, m:n]

        v1t_gv2 = math_ops.matmul(v1, grad_v2, adjoint_a=True)
        term2_nous -= math_ops.matmul(v1t_gv2, v2, adjoint_b=True)

      u_s_inv = math_ops.matmul(u, s_inv_mat)
      term2 = math_ops.matmul(u_s_inv, term2_nous)

      grad_a_before_transpose = term1 + term2

    if a.dtype.is_complex:
      eye = _linalg.eye(s_shape[-1], batch_shape=s_shape[:-1], dtype=a.dtype)
      l = eye * v_gv
      term3_nouv = math_ops.matmul(s_inv_mat, _linalg.adjoint(l) - l)
      term3 = 1 / 2. * math_ops.matmul(
          u, math_ops.matmul(term3_nouv, v1, adjoint_b=True))

      grad_a_before_transpose += term3

    if use_adjoint:
      grad_a = array_ops.matrix_transpose(
          grad_a_before_transpose, conjugate=True)
    else:
      grad_a = grad_a_before_transpose

    grad_a.set_shape(a_shape)
    return grad_a
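# For compute_uv=False the gradient above reduces to
# grad_a = U diag(grad_s) V^H, i.e. d s_i / d A = u_i v_i^T. A
# finite-difference spot check in NumPy (illustrative only):
import numpy as np

rng = np.random.default_rng(1)
a = rng.normal(size=(5, 3))
u, s, vt = np.linalg.svd(a, full_matrices=False)
analytic = np.outer(u[:, 0], vt[0, :])  # gradient of the largest singular value

eps = 1e-6
da = np.zeros_like(a)
da[2, 1] = eps
numeric = (np.linalg.svd(a + da, compute_uv=False)[0] -
           np.linalg.svd(a - da, compute_uv=False)[0]) / (2 * eps)
assert np.allclose(numeric, analytic[2, 1], atol=1e-5)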
  def _mean_of_covariance_given_quadrature_component(self, diag_only):
    p = self.mixture_distribution.probs

    # To compute E[Cov(Z|V)], we'll add matrices within three categories:
    # scaled-identity, diagonal, and full. Then we'll combine these at the end.
    scale_identity_multiplier = None
    diag = None
    full = None

    for k, aff in enumerate(self.interpolated_affine):
      s = aff.scale  # Just in case aff.scale has side-effects, we'll call once.
      if (s is None
          or isinstance(s, linop_identity_lib.LinearOperatorIdentity)):
        scale_identity_multiplier = add(scale_identity_multiplier,
                                        p[..., k, array_ops.newaxis])
      elif isinstance(s, linop_identity_lib.LinearOperatorScaledIdentity):
        scale_identity_multiplier = add(
            scale_identity_multiplier,
            (p[..., k, array_ops.newaxis] * math_ops.square(s.multiplier)))
      elif isinstance(s, linop_diag_lib.LinearOperatorDiag):
        diag = add(diag, (p[..., k, array_ops.newaxis] *
                          math_ops.square(s.diag_part())))
      else:
        x = (p[..., k, array_ops.newaxis, array_ops.newaxis] *
             s.matmul(s.to_dense(), adjoint_arg=True))
        if diag_only:
          x = array_ops.matrix_diag_part(x)
        full = add(full, x)

    # We must now account for the fact that the base distribution might have a
    # non-unity variance. Recall that, since X ~ iid Law(X_0),
    #   `Cov(SX+m) = S Cov(X) S.T = S S.T Diag(Var(X_0))`.
    # We can scale by `Var(X)` (vs `Cov(X)`) since X corresponds to `d` iid
    # samples from a scalar-event distribution.
    v = self.distribution.variance()
    if scale_identity_multiplier is not None:
      scale_identity_multiplier *= v
    if diag is not None:
      diag *= v[..., array_ops.newaxis]
    if full is not None:
      full *= v[..., array_ops.newaxis]

    if diag_only:
      # Apparently we don't need the full matrix, just the diagonal.
      r = add(diag, full)
      if r is None and scale_identity_multiplier is not None:
        ones = array_ops.ones(self.event_shape_tensor(), dtype=self.dtype)
        return scale_identity_multiplier[..., array_ops.newaxis] * ones
      return add(r, scale_identity_multiplier)

    # `None` indicates we don't know if the result is positive-definite.
    is_positive_definite = (True if all(aff.scale.is_positive_definite
                                        for aff in self.endpoint_affine)
                            else None)

    to_add = []
    if diag is not None:
      to_add.append(linop_diag_lib.LinearOperatorDiag(
          diag=diag,
          is_positive_definite=is_positive_definite))
    if full is not None:
      to_add.append(linop_full_lib.LinearOperatorFullMatrix(
          matrix=full,
          is_positive_definite=is_positive_definite))
    if scale_identity_multiplier is not None:
      to_add.append(linop_identity_lib.LinearOperatorScaledIdentity(
          num_rows=self.event_shape_tensor()[0],
          multiplier=scale_identity_multiplier,
          is_positive_definite=is_positive_definite))

    return (linop_add_lib.add_operators(to_add)[0].to_dense()
            if to_add else None)
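# The comment above relies on Cov(S X + m) = S diag(Var(X_0)) S^T when the
# components of X are iid copies of a scalar X_0. A quick Monte Carlo check in
# NumPy (illustrative only):
import numpy as np

rng = np.random.default_rng(2)
S = rng.normal(size=(3, 3))
var_x0 = 1.7
x = rng.normal(scale=np.sqrt(var_x0), size=(200000, 3))  # iid scalar samples
y = x @ S.T + 0.3                                         # affine map S x + m
print(np.cov(y, rowvar=False))          # empirical covariance
print(S @ (var_x0 * np.eye(3)) @ S.T)   # S diag(Var(X_0)) S^T, same up to MC error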
def my_metric_allowances(y_true, y_pred):
    return K.mean(math_ops.square(y_pred - y_true))
def f(x):
    return math_ops.square(x)
def __call__(self, w):
    return w / (K.epsilon() + K.sqrt(
        math_ops.reduce_sum(
            math_ops.square(w), axis=self.axis, keepdims=True)))
def map_fn(x):
    return math_ops.log(math_ops.square(x) + 1)
def _log_prob(self, x):
    return (-0.5 * math.log(2. * math.pi) - math_ops.log(self.sigma) -
            0.5 * math_ops.square(self._z(x)))
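# The expression above is the Normal log-density written in terms of
# z = (x - loc) / sigma: log p(x) = -0.5*log(2*pi) - log(sigma) - 0.5*z**2.
# A quick check against scipy (illustrative only; mu and sigma stand in for
# the distribution's parameters):
import math
from scipy.stats import norm

x, mu, sigma = 1.3, 0.5, 2.0
z = (x - mu) / sigma
log_prob = -0.5 * math.log(2. * math.pi) - math.log(sigma) - 0.5 * z ** 2
assert abs(log_prob - norm.logpdf(x, loc=mu, scale=sigma)) < 1e-12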