Exemple #1
  def update_state(self, values, sample_weight=None):
    """Accumulates statistics for computing the mean.

    For example, if `values` is [1, 3, 5, 7] then the mean is 4. If
    the `sample_weight` is specified as [1, 1, 0, 0] then the mean would be 2.

    Args:
      values: Per-example value.
      sample_weight: Optional weighting of each example. Defaults to 1.
    """
    values = math_ops.cast(values, self._dtype)
    if sample_weight is None:
      # Unweighted case: every element contributes a count of one.
      num_values = math_ops.cast(array_ops.size(values), self._dtype)
    else:
      sample_weight = math_ops.cast(sample_weight, self._dtype)

      # Update dimensions of weights to match with values.
      values, _, sample_weight = _squeeze_or_expand_dimensions(
          values, None, sample_weight)
      sample_weight = weights_broadcast_ops.broadcast_weights(
          sample_weight, values)
      # The denominator of the weighted mean is the total weight.
      num_values = math_ops.reduce_sum(sample_weight)
      values = math_ops.multiply(values, sample_weight)
    values = math_ops.reduce_sum(values)

    # Update state variables
    state_ops.assign_add(self.total, values)
    state_ops.assign_add(self.count, num_values)
  def _compute_euclidean_distance(cls, inputs, clusters):
    """Computes Euclidean distance between each input and each cluster center.

    The value computed per input is the squared distance, expanded as
    `|x|^2 - 2 * x . c + |c|^2`.

    Args:
      inputs: list of input Tensors.
      clusters: cluster Tensor.

    Returns:
      list of Tensors, where each element corresponds to each element in inputs.
      The value is the distance of each row to all the cluster centers.
    """
    # NOTE(review): the first parameter is `cls` but no `@classmethod`
    # decorator is visible above this definition -- confirm at the call site.
    output = []
    for inp in inputs:
      with ops.colocate_with(inp, ignore_existing=True):
        # Computes Euclidean distance. Note the first and third terms are
        # broadcast additions.
        squared_distance = (
            math_ops.reduce_sum(math_ops.square(inp), 1, keep_dims=True) -
            2 * math_ops.matmul(inp, clusters, transpose_b=True) +
            array_ops.transpose(
                math_ops.reduce_sum(
                    math_ops.square(clusters), 1, keep_dims=True)))
        output.append(squared_distance)

    return output
def _num_present(losses, weights, per_batch=False):
  """Computes the number of elements in the loss function induced by `weights`.

  A given weights tensor induces different numbers of usable elements in the
  `losses` tensor. The `weights` tensor is broadcast across `losses` for all
  possible dimensions. For example, if `losses` is a tensor of dimension
  `[4, 5, 6, 3]` and `weights` is a tensor of shape `[4, 5]`, then `weights` is,
  in effect, tiled to match the shape of `losses`. Following this effective
  tile, the total number of present elements is the number of non-zero weights.

  Args:
    losses: `Tensor` of shape `[batch_size, d1, ... dN]`.
    weights: `Tensor` of shape `[]`, `[batch_size]` or
      `[batch_size, d1, ... dK]`, where K < N.
    per_batch: Whether to return the number of elements per batch or as a sum
      total.

  Returns:
    The number of present (non-zero) elements in the losses tensor. If
      `per_batch` is `True`, the value is returned as a tensor of size
      `[batch_size]`. Otherwise, a single scalar tensor is returned.
  """
  with ops.name_scope(None, "num_present", (losses, weights)) as scope:
    weights = math_ops.to_float(weights)
    # Indicator tensor: 0 where the weight is exactly zero, 1 elsewhere.
    present = array_ops.where(
        math_ops.equal(weights, 0.0),
        array_ops.zeros_like(weights),
        array_ops.ones_like(weights))
    present = weights_broadcast_ops.broadcast_weights(present, losses)
    if per_batch:
      # Sum over every axis except the leading (batch) one.
      return math_ops.reduce_sum(
          present, axis=math_ops.range(1, array_ops.rank(present)),
          keep_dims=True, name=scope)
    return math_ops.reduce_sum(present, name=scope)
Exemple #4
  def _integrator_conserves_energy(self, x, event_dims, sess,
                                   feed_dict=None):
    """Checks that a long leapfrog trajectory approximately conserves energy.

    Energy is the potential (negative log-probability) plus the kinetic term
    `0.5 * sum(m * m)` reduced over `event_dims`. The test runs 1000 leapfrog
    steps and asserts that every energy changes by less than 1.

    Args:
      x: Starting position tensor.
      event_dims: Axes of `x` that are reduced when summing the kinetic energy.
      sess: Session used to evaluate the graph.
      feed_dict: Optional feed dict; a fresh one is created when `None`. The
        step size and step count placeholders are added to it.
    """
    def potential_and_grad(x):
      log_prob, grad = self._log_gamma_log_prob_grad(x, event_dims)
      return -log_prob, -grad

    step_size = array_ops.placeholder(np.float32, [], name='step_size')
    hmc_lf_steps = array_ops.placeholder(np.int32, [], name='hmc_lf_steps')

    if feed_dict is None:
      feed_dict = {}
    feed_dict[hmc_lf_steps] = 1000

    m = random_ops.random_normal(array_ops.shape(x))
    potential_0, grad_0 = potential_and_grad(x)
    old_energy = potential_0 + 0.5 * math_ops.reduce_sum(m * m,
                                                         event_dims)

    _, new_m, potential_1, _ = (
        hmc.leapfrog_integrator(step_size, hmc_lf_steps, x,
                                m, potential_and_grad, grad_0))

    new_energy = potential_1 + 0.5 * math_ops.reduce_sum(new_m * new_m,
                                                         event_dims)

    # The step size is scaled down by the number of event dimensions.
    x_shape = sess.run(x, feed_dict).shape
    n_event_dims = self._n_event_dims(x_shape, event_dims)
    feed_dict[step_size] = 0.1 / n_event_dims
    old_energy_val, new_energy_val = sess.run([old_energy, new_energy],
                                              feed_dict)
    logging.vlog(1, 'average energy change: {}'.format(
        abs(old_energy_val - new_energy_val).mean()))

    # `np.bool` was removed from NumPy; the builtin `bool` is equivalent here.
    self.assertAllEqual(np.ones_like(new_energy_val, dtype=bool),
                        abs(old_energy_val - new_energy_val) < 1.)
  def _inverse_log_det_jacobian(self, y, use_saved_statistics=False):
    """Returns `sum(log(gamma)) - 0.5 * sum(log(variance + epsilon))`.

    Args:
      y: Input tensor; its static shape must be fully defined.
      use_saved_statistics: If true (or when not training), the layer's
        moving variance is used instead of the variance of `y`.

    Returns:
      A scalar tensor.

    Raises:
      ValueError: If the shape of `y` is not fully defined.
    """
    if not y.shape.is_fully_defined():
      raise ValueError("Input must have shape known at graph construction.")
    input_shape = np.int32(y.shape.as_list())

    if not self.batchnorm.built:
      # Create variables.
      # NOTE(review): the statement under this comment was missing; building
      # the layer from the input shape is assumed -- confirm.
      self.batchnorm.build(input_shape)

    event_dims = self.batchnorm.axis
    reduction_axes = [i for i in range(len(input_shape)) if i not in event_dims]

    if use_saved_statistics or not self._training:
      log_variance = math_ops.log(
          self.batchnorm.moving_variance + self.batchnorm.epsilon)
    else:
      # At training-time, ildj is computed from the mean and log-variance across
      # the current minibatch.
      _, v = nn.moments(y, axes=reduction_axes, keepdims=True)
      log_variance = math_ops.log(v + self.batchnorm.epsilon)

    # `gamma` and `log Var(y)` reductions over event_dims.
    # Log(total change in area from gamma term).
    log_total_gamma = math_ops.reduce_sum(math_ops.log(self.batchnorm.gamma))

    # Log(total change in area from log-variance term).
    log_total_variance = math_ops.reduce_sum(log_variance)
    # The ildj is scalar, as it does not depend on the values of x and are
    # constant across minibatch elements.
    return log_total_gamma - 0.5 * log_total_variance
 def feature_importances(self):
   """Returns per-tree counts summed across trees and normalized to sum to 1."""
   # NOTE(review): the element expression of this comprehension was missing;
   # `self.trees[i].feature_usage_counts()` is assumed -- confirm against the
   # tree class.
   tree_counts = [
       self.trees[i].feature_usage_counts()
       for i in range(self.params.num_trees)
   ]
   # Stack along a new leading axis, then add the trees together.
   total_counts = math_ops.reduce_sum(array_ops.stack(tree_counts, 0), 0)
   return total_counts / math_ops.reduce_sum(total_counts)
  def testPartialShapes(self):
    """Checks static shape inference of reduce_sum with partial information."""
    # Fixed seed so the random inputs are reproducible between runs.
    np.random.seed(1618)

    # Input shape is unknown.
    reduction_axes = [1, 2]
    c_unknown = array_ops.placeholder(dtypes.float32)
    s_unknown = math_ops.reduce_sum(c_unknown, reduction_axes)
    self.assertEqual(tensor_shape.unknown_shape(), s_unknown.get_shape())

    np_input = np.random.randn(3, 3, 3)
    self._compareAll(np_input, reduction_axes, {c_unknown: np_input})

    # Input shape only has known rank.
    c_known_rank = array_ops.placeholder(dtypes.float32)
    # Without this the placeholder has unknown rank and the ndims check below
    # could not hold.
    c_known_rank.set_shape(tensor_shape.unknown_shape(ndims=3))
    s_known_rank = math_ops.reduce_sum(
        c_known_rank, reduction_axes, keep_dims=True)
    self.assertEqual(3, s_known_rank.get_shape().ndims)

    np_input = np.random.randn(3, 3, 3)
    self._compareAll(np_input, reduction_axes, {c_known_rank: np_input})

    # Reduction indices are unknown.
    unknown_indices = array_ops.placeholder(dtypes.int32)
    c_unknown_indices = constant_op.constant([[10.0], [20.0]])
    s_unknown_indices = math_ops.reduce_sum(
        c_unknown_indices, unknown_indices, keep_dims=False)
    # With unknown indices and no kept dims, nothing is known statically.
    self.assertEqual(tensor_shape.unknown_shape(),
                     s_unknown_indices.get_shape())
    s_unknown_indices_keep = math_ops.reduce_sum(
        c_unknown_indices, unknown_indices, keep_dims=True)
    self.assertEqual(2, s_unknown_indices_keep.get_shape().ndims)
  def doTestIndexedSlicesGradientInCondInWhileLoop(self, use_resource=False):
    """Compares gradients through cond-in-while with an unrolled equivalent.

    The loop runs five iterations: it adds `reduce_sum(embedding)` to the cost
    on every iteration except iteration 3, where it squares the cost. The
    `static` expression below is the same computation written out by hand.

    Args:
      use_resource: Forwarded to `get_variable` for the embedding matrix.
    """
    with ops.Graph().as_default():
      # NOTE(review): the trailing arguments of this call were missing; the
      # initializer choice is an assumption -- confirm.
      embedding_matrix = variable_scope.get_variable(
          "embedding_matrix", [5, 5],
          initializer=init_ops.random_normal_initializer(),
          use_resource=use_resource)

      def Cond(it, _):
        return it < 5

      def Body(it, cost):
        embedding = embedding_ops.embedding_lookup(embedding_matrix, [0])
        cost = control_flow_ops.cond(
            math_ops.equal(it, 3), lambda: math_ops.square(cost),
            lambda: cost + math_ops.reduce_sum(embedding))
        return it + 1, cost

      _, cost = control_flow_ops.while_loop(
          Cond, Body, [constant_op.constant(0), constant_op.constant(0.0)])

      # Combine slices that share an index so both sides are comparable.
      dynamic_grads = gradients_impl.gradients(cost, [embedding_matrix])[0]
      dynamic_grads = math_ops.segment_sum(dynamic_grads.values,
                                           dynamic_grads.indices)

      embedding = embedding_ops.embedding_lookup(embedding_matrix, [0])
      static = math_ops.square(
          math_ops.reduce_sum(embedding) + math_ops.reduce_sum(embedding) +
          math_ops.reduce_sum(embedding)) + math_ops.reduce_sum(embedding)
      static_grads = gradients_impl.gradients(static, [embedding_matrix])[0]
      static_grads = math_ops.segment_sum(static_grads.values,
                                          static_grads.indices)

      with self.test_session() as sess:
        # Variables must be initialized before the gradients are evaluated.
        sess.run(variables.global_variables_initializer())
        self.assertAllEqual(*sess.run([static_grads, dynamic_grads]))
def _estimate_data_distribution(labels, num_classes, smoothing_constant=10):
  """Estimate data distribution as labels are seen.

  Args:
    labels: Class labels for the current batch, passed to `one_hot`.
    num_classes: Number of classes.
    smoothing_constant: Positive initial count given to every class.

  Returns:
    A float32 tensor of per-class counts divided by their sum.

  Raises:
    ValueError: If `smoothing_constant` is not positive.
  """
  # Variable to track running count of classes. Smooth by a nonzero value to
  # avoid division-by-zero. Higher values provide more stability at the cost of
  # slower convergence.
  if smoothing_constant <= 0:
    raise ValueError('smoothing_constant must be nonzero.')
  num_examples_per_class_seen = variables.Variable(
      initial_value=[smoothing_constant] * num_classes, trainable=False,
      name='class_count', dtype=dtypes.int64)

  # Update the class-count based on what labels are seen in batch.
  num_examples_per_class_seen = num_examples_per_class_seen.assign_add(
      math_ops.reduce_sum(array_ops.one_hot(labels, num_classes,
                                            dtype=dtypes.int64), 0))

  # Normalize count into a probability.
  # NOTE: Without the `+= 0` line below, the test
  # `testMultiThreadedEstimateDataDistribution` fails. The reason is that
  # before this line, `num_examples_per_class_seen` is a Tensor that shares a
  # buffer with an underlying `ref` object. When the `ref` is changed by another
  # thread, `num_examples_per_class_seen` changes as well. Since this can happen
  # in the middle of the normalization computation, we get probabilities that
  # are very far from summing to one. Adding `+= 0` copies the contents of the
  # tensor to a new buffer, which will be consistent from the start to the end
  # of the normalization computation.
  num_examples_per_class_seen += 0
  init_prob_estimate = math_ops.truediv(
      num_examples_per_class_seen,
      math_ops.reduce_sum(num_examples_per_class_seen))

  # Must return float32 (not float64) to agree with downstream `_verify_input`
  # checks.
  return math_ops.cast(init_prob_estimate, dtypes.float32)
Exemple #10
  def _reduce_jacobian_det_over_event(
      self, y, ildj, min_event_ndims, event_ndims):
    """Reduce jacobian over event_ndims - min_event_ndims.

    Args:
      y: Tensor whose trailing dimensions give the event shape.
      ildj: Tensor to be summed over the extra event dimensions.
      min_event_ndims: Number of event dimensions `ildj` already accounts for.
      event_ndims: Total number of event dimensions.

    Returns:
      `ildj` summed over the axes given by `self._get_event_reduce_dims`.
    """
    if not self.is_constant_jacobian:
      # The value being reduced is `ildj`; the reduce dims are the axes.
      return math_ops.reduce_sum(
          ildj,
          self._get_event_reduce_dims(min_event_ndims, event_ndims))

    # In this case, we need to tile the jacobian over the event and reduce.
    y_rank = array_ops.rank(y)
    y_shape = array_ops.shape(y)[
        y_rank - event_ndims : y_rank - min_event_ndims]

    ones = array_ops.ones(y_shape, ildj.dtype)
    reduced_ildj = math_ops.reduce_sum(
        ones * ildj,
        axis=self._get_event_reduce_dims(min_event_ndims, event_ndims))
    # The multiplication by ones can change the inferred static shape so we try
    # to recover as much as possible.
    if (isinstance(event_ndims, int) and
        y.get_shape().ndims and ildj.get_shape().ndims):
      y_shape = y.get_shape()
      y_shape = y_shape[y_shape.ndims - event_ndims :
                        y_shape.ndims - min_event_ndims]
      ildj_shape = ildj.get_shape()
      broadcast_shape = array_ops.broadcast_static_shape(
          ildj_shape, y_shape)
      # Drop the reduced trailing dims from the broadcast shape.
      reduced_ildj.set_shape(
          broadcast_shape[: broadcast_shape.ndims - (
              event_ndims - min_event_ndims)])

    return reduced_ildj
  def testConstraints(self):
    """Checks that Conv1D/2D/3D expose the constraints they were built with."""
    # Conv1D
    k_constraint = lambda x: x / math_ops.reduce_sum(x)
    b_constraint = lambda x: x / math_ops.reduce_max(x)
    conv1d = conv_layers.Conv1D(2, 3,
                                kernel_constraint=k_constraint,
                                bias_constraint=b_constraint)
    inputs = random_ops.random_uniform((5, 3, 5), seed=1)
    # Calling the layer builds it.
    conv1d(inputs)
    self.assertEqual(conv1d.kernel_constraint, k_constraint)
    self.assertEqual(conv1d.bias_constraint, b_constraint)

    # Conv2D
    k_constraint = lambda x: x / math_ops.reduce_sum(x)
    b_constraint = lambda x: x / math_ops.reduce_max(x)
    conv2d = conv_layers.Conv2D(2, 3,
                                kernel_constraint=k_constraint,
                                bias_constraint=b_constraint)
    inputs = random_ops.random_uniform((5, 3, 3, 5), seed=1)
    conv2d(inputs)
    self.assertEqual(conv2d.kernel_constraint, k_constraint)
    self.assertEqual(conv2d.bias_constraint, b_constraint)

    # Conv3D
    k_constraint = lambda x: x / math_ops.reduce_sum(x)
    b_constraint = lambda x: x / math_ops.reduce_max(x)
    conv3d = conv_layers.Conv3D(2, 3,
                                kernel_constraint=k_constraint,
                                bias_constraint=b_constraint)
    inputs = random_ops.random_uniform((5, 3, 3, 3, 5), seed=1)
    conv3d(inputs)
    self.assertEqual(conv3d.kernel_constraint, k_constraint)
    self.assertEqual(conv3d.bias_constraint, b_constraint)
Exemple #12
  def call(self, values, weights=None):
    """Accumulate statistics for computing the mean.

    For example, if values is [1, 3, 5, 7] then the mean is 4.
    If the weights were specified as [1, 1, 0, 0] then the mean would be 2.

    Args:
      values: Tensor with the per-example value.
      weights: Optional weighting of each example. Defaults to 1.

    Returns:
      The arguments, for easy chaining.
    """
    if weights is None:
      # Unweighted: the denominator grows by the number of elements.
      self.denom.assign_add(
          math_ops.cast(array_ops.identity(array_ops.size(values)), self.dtype))
      values = math_ops.reduce_sum(values)
      self.numer.assign_add(math_ops.cast(values, self.dtype))
    else:
      # Weighted: the denominator grows by the total weight and the numerator
      # by the weighted sum.
      weights = math_ops.cast(weights, self.dtype)
      self.denom.assign_add(math_ops.reduce_sum(weights))
      values = math_ops.cast(values, self.dtype) * weights
      self.numer.assign_add(math_ops.reduce_sum(values))
    if weights is None:
      return values
    return values, weights
  def testVariablesAcrossGraphs(self):
    """Checks `optimizer.variables()` is scoped to the current graph.

    The same optimizer instance is used in two separate graphs; in each one
    its variables must be named after that graph's own model variables.
    """
    optimizer = momentum_lib.MomentumOptimizer(0.01, 0.5)
    with ops.Graph().as_default():
      var0 = resource_variable_ops.ResourceVariable(
          [1.0, 2.0], dtype=dtypes.float32, name="var0")
      var1 = resource_variable_ops.ResourceVariable(
          [3.0, 4.0], dtype=dtypes.float32, name="var1")
      if context.executing_eagerly():
        # Eager mode takes a callable loss.
        loss = lambda: math_ops.reduce_sum(var0 + var1)
      else:
        loss = math_ops.reduce_sum(var0 + var1)
      # `minimize` is what creates the optimizer's own variables.
      optimizer.minimize(loss)
      optimizer_variables = optimizer.variables()
      self.assertStartsWith(optimizer_variables[0].name, "var0")
      self.assertStartsWith(optimizer_variables[1].name, "var1")
      self.assertEqual(2, len(optimizer_variables))

    with ops.Graph().as_default():
      var2 = resource_variable_ops.ResourceVariable(
          [1.0, 2.0], dtype=dtypes.float32, name="var2")
      var3 = resource_variable_ops.ResourceVariable(
          [3.0, 4.0], dtype=dtypes.float32, name="var3")
      if context.executing_eagerly():
        loss = lambda: math_ops.reduce_sum(var2 + var3)
      else:
        loss = math_ops.reduce_sum(var2 + var3)
      optimizer.minimize(loss)
      optimizer_variables = optimizer.variables()
      self.assertStartsWith(optimizer_variables[0].name, "var2")
      self.assertStartsWith(optimizer_variables[1].name, "var3")
      self.assertEqual(2, len(optimizer_variables))
def accuracy(predictions, labels, weights=None):
  """Computes the percentage of times that predictions matches labels.

  Args:
    predictions: the predicted values, a `Tensor` whose dtype and shape
                 matches 'labels'.
    labels: the ground truth values, a `Tensor` of any shape and
            bool, integer, or string dtype.
    weights: None or `Tensor` of float values to reweight the accuracy.

  Returns:
    Accuracy `Tensor`.

  Raises:
    ValueError: if dtypes don't match or
                if dtype is not bool, integer, or string.
  """
  if not (labels.dtype.is_integer or
          labels.dtype in (dtypes.bool, dtypes.string)):
    raise ValueError(
        'Labels should have bool, integer, or string dtype, not %r' %
        labels.dtype)
  if not labels.dtype.is_compatible_with(predictions.dtype):
    raise ValueError('Dtypes of predictions and labels should match. '
                     'Given: predictions (%r) and labels (%r)' %
                     (predictions.dtype, labels.dtype))
  with ops.name_scope('accuracy', values=[predictions, labels]):
    is_correct = math_ops.cast(
        math_ops.equal(predictions, labels), dtypes.float32)
    if weights is not None:
      is_correct = math_ops.mul(is_correct, weights)
      # Broadcast the weights to the shape of `is_correct` so that the
      # denominator counts every element's weight.
      num_values = math_ops.mul(weights, array_ops.ones_like(is_correct))
      return math_ops.div(math_ops.reduce_sum(is_correct),
                          math_ops.reduce_sum(num_values))
    return math_ops.reduce_mean(is_correct)
Exemple #15
def _SubGrad(op, grad):
    """Returns a pair of gradients, one per input of `op`.

    `grad` is summed over the axes reported by `_broadcast_gradient_args`
    for each input and reshaped to that input's shape. The second
    gradient is negated.
    """
    first, second = op.inputs[0], op.inputs[1]
    first_shape = array_ops.shape(first)
    second_shape = array_ops.shape(second)
    first_axes, second_axes = gen_array_ops._broadcast_gradient_args(
        first_shape, second_shape)
    first_grad = array_ops.reshape(
        math_ops.reduce_sum(grad, first_axes), first_shape)
    second_grad = array_ops.reshape(
        -math_ops.reduce_sum(grad, second_axes), second_shape)
    return (first_grad, second_grad)
def _get_batch(per_class_queues, probs, batch_size):
  """Generates batches according to per-class-probabilities.

  Args:
    per_class_queues: Indexable collection of queues, one per class.
    probs: NumPy array of class probabilities; its `size` is the class count.
    batch_size: Number of samples drawn from the multinomial.

  Returns:
    A `(values, labels)` pair: the concatenated per-class values and the
    matching int32 class labels.
  """
  num_classes = probs.size
  # Number of examples per class is governed by a multinomial distribution.
  # Note: multinomial takes unnormalized log probabilities for its first
  # argument, of dimension [batch_size, num_classes].
  examples = random_ops.multinomial(
      np.expand_dims(np.log(probs), 0), batch_size)

  # Prepare the data and label batches.
  val_list = []
  label_list = []
  for i in range(num_classes):
    num_examples = math_ops.reduce_sum(
        math_ops.cast(math_ops.equal(examples, i), dtypes.int32))
    # NOTE(review): this statement was missing; dequeuing `num_examples`
    # items from the class queue is assumed -- confirm.
    val_list.append(per_class_queues[i].dequeue_many(num_examples))
    label_list.append(array_ops.ones([num_examples], dtype=dtypes.int32) * i)

  # Create a tensor of labels.
  batch_labels = array_ops.concat(0, label_list)

  # Debug instrumentation.
  sample_tags = ['stratified_sample/samples_class%i' % i for i in
                 range(num_classes)]
  logging_ops.scalar_summary(sample_tags, math_ops.reduce_sum(
      array_ops.one_hot(batch_labels, num_classes), 0))

  return array_ops.concat(0, val_list), batch_labels
 def nonempty_lbeta():
     # Computes sum(lgamma(x)) - lgamma(sum(x)) along the last axis. `x` comes
     # from the enclosing scope.
     sum_of_log_gammas = math_ops.reduce_sum(
         math_ops.lgamma(x), reduction_indices=[-1])
     log_gamma_of_sum = math_ops.lgamma(
         math_ops.reduce_sum(x, reduction_indices=[-1]))
     return sum_of_log_gammas - log_gamma_of_sum
  def loss(self, logits, target, features):
    """Returns loss tensor for this head.

    The loss returned is the weighted average.

      L = sum_{i} w_{i} * l_{i} / sum_{i} w_{i}

    Args:
      logits: logits, a float tensor.
      target: either a tensor for labels or in multihead case, a dict of string
        to target tensor.
      features: features dict.

    Returns:
      Loss tensor.
    """
    target = target[self.name] if isinstance(target, dict) else target
    loss_unweighted = self._loss_fn(logits, target)

    weight_tensor = self.get_weight_tensor(features)
    if weight_tensor is None:
      return math_ops.reduce_mean(loss_unweighted, name="loss")
    loss_weighted = self._weighted_loss(loss_unweighted, weight_tensor)
    # Weighted average: total weighted loss over total weight.
    return math_ops.div(
        math_ops.reduce_sum(loss_weighted),
        math_ops.cast(math_ops.reduce_sum(weight_tensor), dtypes.float32),
        name="loss")
 def _LossFunc():
   """Returns the summed outputs of two CudnnGRU layers run on zero inputs."""
   first_output, _ = cudnn_rnn.CudnnGRU(1, 100)(
       array_ops.zeros([28, 100, 28]))
   second_output, _ = cudnn_rnn.CudnnGRU(1, 100)(
       array_ops.zeros([28, 100, 100]))
   return (math_ops.reduce_sum(first_output) +
           math_ops.reduce_sum(second_output))
Exemple #20
  def loss(self, logits, target, features):
    """Returns loss tensor for this head.

    Without weights this is the mean of the unweighted loss. With weights,
    the loss and weights are both flattened, multiplied element-wise, and the
    sum is divided by the total weight.

    Args:
      logits: logits, a float tensor.
      target: either a tensor for labels or in multihead case, a dict of string
        to target tensor.
      features: features dict.

    Returns:
      Loss tensor.
    """
    target = target[self.name] if isinstance(target, dict) else target
    loss_unweighted = self._loss_fn(logits, target)

    weight_tensor = self.get_weight_tensor(features)
    if weight_tensor is None:
      return math_ops.reduce_mean(loss_unweighted, name="loss")
    else:
      loss_unweighted = array_ops.reshape(loss_unweighted, shape=(-1,))
      loss_weighted = math_ops.mul(
          loss_unweighted, array_ops.reshape(weight_tensor, shape=(-1,)))
      # NOTE(review): the arguments of this call were missing; a weighted
      # average (sum of weighted loss over sum of weights) is assumed.
      return math_ops.div(
          math_ops.reduce_sum(loss_weighted),
          math_ops.to_float(math_ops.reduce_sum(weight_tensor)),
          name="loss")
Exemple #21
  def _squared_fisher_norm(self, grads_and_vars, precon_grads_and_vars):
    """Computes the squared (approximate) Fisher norm of the updates.

    This is defined as v^T F v, where F is the approximate Fisher matrix
    as computed by the estimator, and v = F^{-1} g, where g is the gradient.
    This is computed efficiently as v^T g.

    Args:
      grads_and_vars: List of (gradient, variable) pairs.
      precon_grads_and_vars: List of (preconditioned gradient, variable) pairs.
        Must be the result of calling `self._fisher_est.multiply_inverse`
        on `grads_and_vars`.

    Returns:
      Scalar representing the squared norm.

    Raises:
      ValueError: if the two list arguments do not contain the same variables,
        in the same order.
    """
    # Identity comparison: both lists must refer to the very same variables.
    for (_, gvar), (_, pgvar) in zip(grads_and_vars, precon_grads_and_vars):
      if gvar is not pgvar:
        raise ValueError("The variables referenced by the two arguments "
                         "must match.")
    # One inner product per variable; their sum is v^T g.
    terms = [
        math_ops.reduce_sum(grad * pgrad)
        for (grad, _), (pgrad, _) in zip(grads_and_vars, precon_grads_and_vars)
    ]
    return math_ops.reduce_sum(terms)
  def test_optimize(self):
    """Minimizes a quadratic bowl and checks each variable hits its target.

    The loss is half the squared distance between the variables and slices of
    `minimum_location`: elements 0-1 for `vector`, element 2 for `scalar` and
    elements 3-8 (reshaped to 2x3) for `matrix`.
    """
    scalar = variables.Variable(random_ops.random_normal([]), 'scalar')
    vector = variables.Variable(random_ops.random_normal([2]), 'vector')
    matrix = variables.Variable(random_ops.random_normal([2, 3]), 'matrix')

    minimum_location = constant_op.constant(np.arange(9), dtype=dtypes.float32)

    loss = math_ops.reduce_sum(
        math_ops.square(vector - minimum_location[:2])) / 2.
    loss += math_ops.reduce_sum(
        math_ops.square(scalar - minimum_location[2])) / 2.
    loss += math_ops.reduce_sum(
        math_ops.square(
            matrix - array_ops.reshape(minimum_location[3:], [2, 3]))) / 2.

    optimizer = MockOptimizerInterface(loss)

    with self.test_session() as sess:
      sess.run(variables.global_variables_initializer())

      optimizer.minimize(sess)

      self.assertAllClose(np.arange(2), sess.run(vector))
      self.assertAllClose(np.arange(1) + 2, sess.run(scalar))
      self.assertAllClose(np.arange(6).reshape(2, 3) + 3, sess.run(matrix))
Exemple #23
 def __call__(self, x):
   # Adds up whichever penalties are enabled: `l1 * |x|` and/or `l2 * x^2`,
   # each summed over all elements. A falsy factor disables that term, and
   # with both disabled the result is the float 0.
   penalty = 0.
   if self.l1:
     l1_term = math_ops.reduce_sum(self.l1 * math_ops.abs(x))
     penalty = penalty + l1_term
   if self.l2:
     l2_term = math_ops.reduce_sum(self.l2 * math_ops.square(x))
     penalty = penalty + l2_term
   return penalty
def npairs_loss(labels, embeddings_anchor, embeddings_positive,
                reg_lambda=0.002, print_losses=False):
  """Computes the npairs loss.

  Npairs loss expects paired data where a pair is composed of samples from the
  same labels and each pairs in the minibatch have different labels. The loss
  has two components. The first component is the L2 regularizer on the
  embedding vectors. The second component is the sum of cross entropy loss
  which takes each row of the pair-wise similarity matrix as logits and
  the remapped one-hot labels as labels.

  See: http://www.nec-labs.com/uploads/images/Department-Images/MediaAnalytics/papers/nips16_npairmetriclearning.pdf

  Args:
    labels: 1-D tf.int32 `Tensor` of shape [batch_size/2].
    embeddings_anchor: 2-D Tensor of shape [batch_size/2, embedding_dim] for the
      embedding vectors for the anchor images. Embeddings should not be
      l2 normalized.
    embeddings_positive: 2-D Tensor of shape [batch_size/2, embedding_dim] for the
      embedding vectors for the positive images. Embeddings should not be
      l2 normalized.
    reg_lambda: Float. L2 regularization term on the embedding vectors.
    print_losses: Boolean. Option to print the xent and l2loss.

  Returns:
    npairs_loss: tf.float32 scalar.
  """
  # pylint: enable=line-too-long
  # Add the regularizer on the embedding.
  reg_anchor = math_ops.reduce_mean(
      math_ops.reduce_sum(math_ops.square(embeddings_anchor), 1))
  reg_positive = math_ops.reduce_mean(
      math_ops.reduce_sum(math_ops.square(embeddings_positive), 1))
  l2loss = math_ops.multiply(
      0.25 * reg_lambda, reg_anchor + reg_positive, name='l2loss')

  # Get per pair similarities.
  similarity_matrix = math_ops.matmul(
      embeddings_anchor, embeddings_positive, transpose_a=False,
      transpose_b=True)

  # Reshape [batch_size] label tensor to a [batch_size, 1] label tensor.
  lshape = array_ops.shape(labels)
  assert lshape.shape == 1
  labels = array_ops.reshape(labels, [lshape[0], 1])

  # Entry (i, j) is 1 when labels i and j are equal; each row is then
  # normalized to sum to 1.
  labels_remapped = math_ops.to_float(
      math_ops.equal(labels, array_ops.transpose(labels)))
  labels_remapped /= math_ops.reduce_sum(labels_remapped, 1, keepdims=True)

  # Add the softmax loss.
  xent_loss = nn.softmax_cross_entropy_with_logits(
      logits=similarity_matrix, labels=labels_remapped)
  xent_loss = math_ops.reduce_mean(xent_loss, name='xentropy')

  if print_losses:
    xent_loss = logging_ops.Print(
        xent_loss, ['cross entropy:', xent_loss, 'l2loss:', l2loss])

  return l2loss + xent_loss
Exemple #25
  def call(self, values, weights=None):
    """Accumulate statistics for computing the mean.

    For example, if values is [1, 3, 5, 7] then the mean is 4.
    If the weights were specified as [1, 1, 0, 0] then the mean would be 2.

    Args:
      values: Tensor with the per-example value.
      weights: Optional weighting of each example. Defaults to 1.
    """
    if not self.built:  # False only in the first call().
      # NOTE(review): the trailing arguments of both `add_variable` calls were
      # missing; float64 accumulators starting at zero are assumed -- confirm.
      self.numer = self.add_variable(name="numer", shape=(),
                                     dtype=dtypes.float64,
                                     initializer=init_ops.zeros_initializer)
      self.denom = self.add_variable(name="denom", shape=(),
                                     dtype=dtypes.float64,
                                     initializer=init_ops.zeros_initializer)
    if weights is None:
      # Unweighted: the denominator grows by the number of elements.
      self.denom.assign_add(
          math_ops.cast(array_ops.size(values), dtypes.float64))
      values = math_ops.reduce_sum(values)
      self.numer.assign_add(math_ops.cast(values, dtypes.float64))
    else:
      # Weighted: the denominator grows by the total weight and the numerator
      # by the weighted sum.
      weights = math_ops.cast(weights, dtypes.float64)
      self.denom.assign_add(math_ops.reduce_sum(weights))
      values = math_ops.cast(values, dtypes.float64) * weights
      self.numer.assign_add(math_ops.reduce_sum(values))
Exemple #26
 def approximate_hessian(self, grads_and_vars, name=None, gate_gradients=None):
   """Differentiates a random projection of the gradients w.r.t. the variables.

   I haven't tested this yet so I have no idea if it works, but even if it
   does it's probably super slow, and either way nothing else has been modified
   to deal with it.

   Args:
     grads_and_vars: Iterable of (gradient, variable) pairs. Gradients may be
       `None`, dense tensors, or objects with `indices` and `values`.
     name: Unused.
     gate_gradients: Optional gating mode compared against
       `Optimizer.GATE_OP`. Defaults to `Optimizer.GATE_OP`.

   Returns:
     An iterator of (gradient, variable, hessian-term) triples.
   """
   # NOTE(review): `gate_gradients` was referenced but never defined in the
   # original body; it is exposed here as an optional keyword -- confirm.
   if gate_gradients is None:
     gate_gradients = Optimizer.GATE_OP
   gv = 0
   var_refs = []
   for g_t, x_tm1 in grads_and_vars:
     # NOTE(review): nothing populated `var_refs` in the original; every
     # variable is recorded so the result lines up with `grads_and_vars`.
     var_refs.append(x_tm1)
     if g_t is None:
       continue
     with ops.name_scope('update_' + x_tm1.op.name), ops.device(x_tm1.device):
       if isinstance(g_t, ops.Tensor):
         gv += math_ops.reduce_sum(g_t * random_ops.random_normal(g_t.get_shape()))
       else:
         # Merge rows that share an index before projecting.
         idxs, idxs_ = array_ops.unique(g_t.indices)
         g_t_ = math_ops.unsorted_segment_sum(g_t.values, idxs_, array_ops.size(idxs))
         gv += math_ops.reduce_sum(g_t_ * random_ops.random_normal(g_t_.get_shape()))
   hesses = gradients.gradients(gv, var_refs,
                                gate_gradients=(gate_gradients == Optimizer.GATE_OP))
   return zip([g_t for g_t, _ in grads_and_vars], [x_tm1 for _, x_tm1 in grads_and_vars], hesses)
  def test_tensor_array_grad(self):
    """Compares pfor output and gradients against a hand-written equivalent."""
    source = constant_op.constant(np.random.rand(3, 4, 2), dtype=dtypes.float32)
    source_array = tensor_array_ops.TensorArray(dtypes.float32, size=3)
    source_array = source_array.unstack(source)

    def loop_fn(i):

      def accumulate(step, running_sum):
        # Read element `step`, view it as a [4, 2] matrix and take row `i`.
        element = source_array.gather([step])
        element = array_ops.gather(array_ops.reshape(element, [4, 2]), i)
        return step + 1, running_sum + element

      _, row_total = control_flow_ops.while_loop(
          lambda step, _: step < 3, accumulate, (0, array_ops.zeros([2])))
      row_product = math_ops.reduce_prod(row_total)
      return row_product, gradient_ops.gradients(row_product, source)[0]

    pfor_out, pfor_out_grad = pfor_control_flow_ops.pfor(loop_fn, 4)
    # Note that tf.while_loop does not work in the setup above. So we manually
    # construct the equivalent computation of the above loops here.
    expected_out = math_ops.reduce_sum(source, axis=[0])
    expected_out = math_ops.reduce_prod(expected_out, axis=[1])
    # Gradients of the expected output accumulate across its elements, so the
    # pfor gradients get the same aggregation before comparing.
    expected_grad = gradient_ops.gradients(expected_out, source)[0]
    summed_pfor_grad = math_ops.reduce_sum(pfor_out_grad, axis=[0])

    with session.Session() as sess:
      actual, expected, actual_grad, expected_grad_val = sess.run(
          [pfor_out, expected_out, summed_pfor_grad, expected_grad])
      self.assertAllClose(actual, expected)
      self.assertAllClose(actual_grad, expected_grad_val)
Exemple #28
def _scale_losses(losses, weights):
  """Computes the scaled loss.

  Args:
    losses: `Tensor` of shape `[batch_size, d1, ... dN]`.
    weights: `Tensor` of shape `[]`, `[batch_size]` or
      `[batch_size, d1, ... dN]`. The `losses` are reduced (`tf.reduce_sum`)
      until its dimension matches that of `weights` at which point the reduced
      `losses` are element-wise multiplied by `weights` and a final `reduce_sum`
      is computed on the result. Conceptually, this operation is similar to
      broadcasting (tiling) `weights` to be the same shape as `losses`,
      performing an element-wise multiplication, and summing the result. Note,
      however, that the dimension matching is right-to-left, not left-to-right;
      i.e., the opposite of standard NumPy/Tensorflow broadcasting.

  Returns:
    A scalar tf.float32 `Tensor` whose value represents the sum of the scaled
    `losses`.
  """
  # First, compute the sum of the losses over all elements: every axis of
  # `losses` beyond the rank of `weights` is summed away.
  start_index = max(0, weights.get_shape().ndims)
  reduction_indices = list(range(start_index, losses.get_shape().ndims))
  reduced_losses = math_ops.reduce_sum(
      losses, reduction_indices=reduction_indices)
  # The reduced losses now have the rank of `weights`; scale and total them.
  reduced_losses = math_ops.multiply(reduced_losses, weights)
  return math_ops.reduce_sum(reduced_losses)
    def body(it, cost):
      embedding = embedding_ops.embedding_lookup(embedding_matrix, [0])
      cost = control_flow_ops.cond(
          math_ops.equal(it, 3), lambda: math_ops.square(cost),
          (lambda: cost + math_ops.reduce_sum(embedding)))
      return it + 1, cost

      _, cost = control_flow_ops.while_loop(
          cond, body, [constant_op.constant(0),

      dynamic_grads = gradients_impl.gradients(cost, [embedding_matrix])[0]
      dynamic_grads = math_ops.segment_sum(dynamic_grads.values,

      embedding = embedding_ops.embedding_lookup(embedding_matrix, [0])
      static = math_ops.square(
          math_ops.reduce_sum(embedding) + math_ops.reduce_sum(embedding) +
          math_ops.reduce_sum(embedding)) + math_ops.reduce_sum(embedding)
      static_grads = gradients_impl.gradients(static, [embedding_matrix])[0]
      static_grads = math_ops.segment_sum(static_grads.values,

      with self.cached_session():
        self.assertAllEqual(*self.evaluate([static_grads, dynamic_grads]))
 def _log_prob(self, x):
   """Returns the log-density of the sample `x`.

   The sample and the logits are broadcast against each other whenever their
   static shapes are not fully defined or differ. Both are then flattened to
   `[-1, self.event_size]`, and the result is reshaped to the shape of the
   logits with the last axis reduced away.
   """
   sample = self._assert_valid_sample(x)
   logits = self.logits
   # Broadcast logits or the sample if need be.
   needs_broadcast = (
       not sample.get_shape().is_fully_defined()
       or not logits.get_shape().is_fully_defined()
       or sample.get_shape() != logits.get_shape())
   if needs_broadcast:
     logits = array_ops.ones_like(sample, dtype=logits.dtype) * logits
     sample = array_ops.ones_like(logits, dtype=sample.dtype) * sample
   batch_shape = array_ops.shape(math_ops.reduce_sum(logits, axis=[-1]))
   logits_2d = array_ops.reshape(logits, [-1, self.event_size])
   sample_2d = array_ops.reshape(sample, [-1, self.event_size])
   # Normalization constant: lgamma(k) + (k - 1) * log(temperature).
   k = math_ops.cast(self.event_size, sample.dtype)
   log_norm_const = (
       math_ops.lgamma(k) + (k - 1.) * math_ops.log(self.temperature))
   # Unnormalized density, summed over the event axis.
   scaled = logits_2d - sample_2d * self._temperature_2d
   log_unnorm_prob = math_ops.reduce_sum(
       nn_ops.log_softmax(scaled), [-1], keepdims=False)
   # Combine both terms and restore the shape derived from the logits.
   return array_ops.reshape(log_norm_const + log_unnorm_prob, batch_shape)
Exemple #31
    def apply_gradients(self, grads_and_vars, global_step=None, name=None):
        """Apply gradients to variables.

    This contains most of the synchronization implementation and also wraps the
    apply_gradients() from the real optimizer.

      grads_and_vars: List of (gradient, variable) pairs as returned by
      global_step: Optional Variable to increment by one after the
        variables have been updated.
      name: Optional name for the returned operation.  Default to the
        name passed to the Optimizer constructor.

      train_op: The op to dequeue a token so the replicas can exit this batch
      and start the next one. This is executed by each replica.

      ValueError: If the grads_and_vars is empty.
      ValueError: If global step is not provided, the staleness cannot be
        if not grads_and_vars:
            raise ValueError("Must supply at least one variable")

        if global_step is None:
            raise ValueError("Global step is required to check staleness")

        self._global_step = global_step
        train_ops = []
        aggregated_grad = []
        inputs = []
        var_list = []
        for x in grads_and_vars:

        with ops.device(global_step.device):
            self._local_steps = variables.Variable(array_ops.zeros(
                [self._total_num_replicas], dtype=global_step.dtype),

        # Check staleness. Note that this has to be ref(), otherwise identity will
        # be accessed and it will be old values.
        local_step = array_ops.slice(self._local_steps.ref(),
                                                       (1, )), [1],
        local_step = array_ops.reshape(local_step, ())
        is_stale = math_ops.less(local_step, global_step)

        with ops.op_scope(inputs, None, self._name):
            for grad, var in grads_and_vars:
                with ops.device(var.device):
                    if isinstance(grad, ops.Tensor):
                        gradient_queue = (data_flow_ops.FIFOQueue(
                            self._tokens_per_step * 2,
                            (gradient_queue, var.device))

                        # Aggregate all gradients
                        gradients = gradient_queue.dequeue_many(
                            math_ops.reduce_sum(gradients, [0]))
                    elif grad is None:
                        aggregated_grad.append(None)  # pass-through.
                        if not isinstance(grad, ops.IndexedSlices):
                            raise ValueError("Unknown grad type!")
                            self._aggregate_sparse_grad(grad, var, train_ops))

            aggregated_grads_and_vars = zip(aggregated_grad, var_list)

            # sync_op will be assigned to the same device as the global step.
            with ops.device(global_step.device), ops.name_scope(""):
                update_op = self._opt.apply_gradients(
                    aggregated_grads_and_vars, global_step)

            # Create token queue.
            with ops.device(global_step.device), ops.name_scope(""):
                sync_token_queue = (data_flow_ops.FIFOQueue(
                self._sync_token_queue = sync_token_queue

                # dummy_queue is passed to the queue runner. Don't use the real queues
                # because the queue runner doesn't automatically reopen it once it
                # closed queues in PS devices.
                dummy_queue = (data_flow_ops.FIFOQueue(
            # Clear all the gradients queues in case there are stale gradients.
            clear_queue_ops = []
            with ops.control_dependencies([update_op]):
                for queue, dev in self._one_element_queue_list:
                    with ops.device(dev):
                        stale_grads = queue.dequeue_many(queue.size())

                for queue, dev in self._sparse_grad_queues_and_devs:
                    with ops.device(dev):
                        _, stale_indices = queue.dequeue_many(queue.size())

            with ops.device(global_step.device):
                self._clean_up_op = control_flow_ops.abort(
                    error_msg="From sync_replicas")

            # According to the staleness, select between the enqueue op (real_grad)
            # or no-op (no_op_grad). Effectively dropping all the stale gradients.
            no_op_grad = lambda: [
            real_grad = lambda: [control_flow_ops.group(*train_ops)]
            final_train_ops = control_flow_ops.cond(is_stale, no_op_grad,

            with ops.device(global_step.device), ops.name_scope(""):
                # Replicas have to wait until they can get a token from the token queue.
                with ops.control_dependencies([final_train_ops]):
                    token = sync_token_queue.dequeue()
                    train_op = state_ops.scatter_update(
                        self._local_steps, self._replica_id, token)

                with ops.control_dependencies(clear_queue_ops):
                    # Sync_op needs to insert tokens to the token queue at the end of the
                    # step so the replicas can fetch them to start the next step.
                    # Note that ref() is used to avoid reading from the identity with old
                    # the step.
                    tokens = array_ops.fill([self._tokens_per_step],
                    sync_op = sync_token_queue.enqueue_many((tokens, ))

                if self._variable_averages is not None:
                    with ops.control_dependencies([sync_op
                                                   ]), ops.name_scope(""):
                        sync_op = self._variable_averages.apply(

                self._chief_queue_runner = queue_runner.QueueRunner(
                    dummy_queue, [sync_op])
                self._gradients_applied = True
                return train_op
 def error_function():
   x = random_ops.random_uniform((2, 10))
   y = random_ops.random_uniform((10, 2))
       math_ops.reduce_sum(math_ops.matmul(x, y)))
   return x
Exemple #33
 def _log_abs_determinant(self):
     return math_ops.reduce_sum(math_ops.log(math_ops.abs(
    def vimco_advantage_fn(_, loss, name=None):
        """Internal VIMCO function.

      _: ignored `StochasticTensor`.
      loss: The loss `Tensor`.
      name: Python string, the name scope to use.

      The advantage `Tensor`.
        with ops.name_scope(name, "VIMCOAdvantage", values=[loss]):
            loss = ops.convert_to_tensor(loss)
            loss_shape = loss.get_shape()
            loss_num_elements = loss_shape[0].value
            n = math_ops.cast(loss_num_elements or array_ops.shape(loss)[0],

            if have_log_loss:
                log_loss = loss
                log_loss = math_ops.log(loss)

            # Calculate L_hat, Eq. (4) -- stably
            log_mean = math_ops.reduce_logsumexp(log_loss,
                                                 [0]) - math_ops.log(n)

            # expand_dims: Expand shape [a, b, c] to [a, 1, b, c]
            log_loss_expanded = array_ops.expand_dims(log_loss, [1])

            # divide: log_loss_sub with shape [a, a, b, c], where
            #  log_loss_sub[i] = log_loss - log_loss[i]
            #       = [ log_loss[j] - log_loss[i] for rows j = 0 ... i - 1     ]
            #         [ zeros                                                  ]
            #         [ log_loss[j] - log_loss[i] for rows j = i + 1 ... a - 1 ]
            log_loss_sub = log_loss - log_loss_expanded

            # reduce_sum: Sums each row across all the sub[i]'s; result is:
            #   reduce_sum[j] = (n - 1) * log_loss[j] - (sum_{i != j} loss[i])
            # divide by (n - 1) to get:
            #   geometric_reduction[j] =
            #     log_loss[j] - (sum_{i != j} log_loss[i]) / (n - 1)
            geometric_reduction = math_ops.reduce_sum(log_loss_sub,
                                                      [0]) / (n - 1)

            # subtract this from the original log_loss to get the baseline:
            #   geometric_mean[j] = exp((sum_{i != j} log_loss[i]) / (n - 1))
            log_geometric_mean = log_loss - geometric_reduction

            ## Equation (9)

            # Calculate sum_{i != j} loss[i] -- via exp(reduce_logsumexp(.))
            # reduce_logsumexp: log-sum-exp each row across all the
            # -sub[i]'s, result is:
            #  exp(reduce_logsumexp[j]) =
            #    1 + sum_{i != j} exp(log_loss[i] - log_loss[j])
            log_local_learning_reduction = math_ops.reduce_logsumexp(
                -log_loss_sub, [0])

            # convert local_learning_reduction to the sum-exp of the log-sum-exp
            #  (local_learning_reduction[j] - 1) * exp(log_loss[j])
            #    = sum_{i != j} exp(log_loss[i])
            local_learning_log_sum = (_logexpm1(log_local_learning_reduction) +

            # Add (logaddexp) the local learning signals (Eq. 9)
            local_learning_signal = (math_ops.reduce_logsumexp(
                    (local_learning_log_sum, log_geometric_mean)), [0]) -

            advantage = log_mean - local_learning_signal

            return advantage
Exemple #35
 def _v2_loss():
     """Returns the sum of squares of the second variable held by `root`.

     `root` comes from the enclosing scope. When it holds exactly one
     variable, a second one (initial value 2.) is created and stored first, so
     the index lookup below always finds it.
     """
     if len(root.variables) == 1:
         v2 = variables.Variable(2.)
         # Keep the new variable; without this the lookup of index 1 below
         # fails on the first call and `v2` is silently discarded.
         # NOTE(review): assumes `root.variables` is list-like — confirm.
         root.variables.append(v2)
     return math_ops.reduce_sum(root.variables[1]**2)
 def f(x):
     pointwise = math_ops.sin(x) * math_ops.tan(x)
     return math_ops.reduce_prod(pointwise +
 def _f(x):
     """Sums every entry of `x[:, None] * mat * x[None, :]`.

     `mat` is taken from the enclosing scope.
     """
     column = x[:, None]
     row = x[None, :]
     return math_ops.reduce_sum(column * mat * row)
 def _batch_log_det(self):
   """Returns twice the sum of `log(self._diag)` over the last axis."""
   log_diag = math_ops.log(self._diag)
   return 2 * math_ops.reduce_sum(log_diag, reduction_indices=[-1])
Exemple #39
 def first(x):
   """Softmax cross-entropy of logits `x` against a `[[0.0]]` label, summed over axis 0."""
   zero_label = constant_op.constant([[0.0]])
   xent = nn_ops.softmax_cross_entropy_with_logits(
       labels=zero_label, logits=x)
   return math_ops.reduce_sum(xent, constant_op.constant([0]))
Exemple #40
 def testReduceAllDims(self):
     """reduce_sum without an axis argument sums every element of the input."""
     matrix = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int32)
     with self.test_session():
         total = math_ops.reduce_sum(matrix).eval()
         self.assertEqual(total, 21)
 def fn(info):
     """Returns the sum of `price` over every toy in `info.toys`."""
     stacked_prices = array_ops.stack([toy.price for toy in info.toys])
     return math_ops.reduce_sum(stacked_prices)
Exemple #42
 def second(x):
   """Gradient of `first` with respect to its argument at `x`, summed over axis 0."""
   grad_of_first = backprop.gradients_function(first, [0])
   gradient = grad_of_first(x)[0]
   return math_ops.reduce_sum(gradient, constant_op.constant([0]))
def _FillGrad(_, grad):
    """Returns `(None, reduce_sum(grad))`.

    No gradient is produced for the first slot; for the second slot the
    incoming `grad` is summed over all of its elements. The first argument is
    ignored.
    """
    summed_grad = math_ops.reduce_sum(grad)
    return None, summed_grad
        def attention(decoder_state, coverage=None):
            """Calculate the context vector and attention distribution from the decoder state.

            Relies on names from the enclosing scope: `linear`,
            `attention_vec_size`, `use_coverage`, `w_c`, `v`,
            `encoder_features`, `encoder_states`, `batch_size` and `attn_size`.

            Args:
              decoder_state: state of the decoder
              coverage: Optional. Previous timestep's coverage vector, shape (batch_size, attn_len, 1, 1).

            Returns:
              context_vector: weighted sum of encoder_states
              attn_dist: attention distribution
              coverage: new coverage vector. shape (batch_size, attn_len, 1, 1)
            """
            with variable_scope.variable_scope("Attention"):
                # Pass the decoder state through a linear layer (this is W_s s_t + b_attn in the paper)
                decoder_features = linear(
                    decoder_state, attention_vec_size,
                    True)  # shape (batch_size, attention_vec_size)
                decoder_features = tf.expand_dims(
                    tf.expand_dims(decoder_features, 1),
                    1)  # reshape to (batch_size, 1, 1, attention_vec_size)

                if use_coverage and coverage is not None:  # non-first step of coverage
                    # Multiply coverage vector by w_c to get coverage_features.
                    coverage_features = nn_ops.conv2d(
                        coverage, w_c, [1, 1, 1, 1], "SAME"
                    )  # c has shape (batch_size, attn_length, 1, attention_vec_size)

                    # Calculate v^T tanh(W_h h_i + W_s s_t + w_c c_i^t + b_attn)
                    e = math_ops.reduce_sum(
                        v * math_ops.tanh(encoder_features + decoder_features +
                                          coverage_features),
                        [2, 3])  # shape (batch_size,attn_length)
                    #e = tf.multiply(e, rel_scores)
                    # Take softmax of e to get the attention distribution
                    attn_dist = nn_ops.softmax(
                        e)  # shape (batch_size, attn_length)

                    # Update coverage vector
                    coverage += array_ops.reshape(attn_dist,
                                                  [batch_size, -1, 1, 1])
                else:
                    # Calculate v^T tanh(W_h h_i + W_s s_t + b_attn)
                    e = math_ops.reduce_sum(
                        v * math_ops.tanh(encoder_features + decoder_features),
                        [2, 3])  # calculate e
                    #e = tf.multiply(e, rel_scores)
                    # Take softmax of e to get the attention distribution
                    attn_dist = nn_ops.softmax(
                        e)  # shape (batch_size, attn_length)

                    if use_coverage:  # first step of training
                        coverage = tf.expand_dims(tf.expand_dims(attn_dist, 2),
                                                  2)  # initialize coverage

                # Calculate the context vector from attn_dist and encoder_states
                context_vector = math_ops.reduce_sum(
                    array_ops.reshape(attn_dist, [batch_size, -1, 1, 1]) *
                    encoder_states, [1, 2])  # shape (batch_size, attn_size).
                context_vector = array_ops.reshape(context_vector,
                                                   [-1, attn_size])

            return context_vector, attn_dist, coverage
Exemple #45
  def create_estimator_spec(
      self, features, mode, logits, labels=None, train_op_fn=None,
    """Returns an `EstimatorSpec`.

      features: Input `dict` of `Tensor` or `SparseTensor` objects.
      mode: Estimator's `ModeKeys`.
      logits: logits `Tensor` with shape `[D0, D1, ... DN, n_classes]`.
        For many applications, the shape is `[batch_size, n_classes]`.
      labels: Labels with shape matching `logits`. Can be multi-hot `Tensor`
        with shape `[D0, D1, ... DN, n_classes]` or `SparseTensor` with
        `dense_shape` `[D0, D1, ... DN, ?]`. `labels` is required argument when
        `mode` equals `TRAIN` or `EVAL`.
      train_op_fn: Function that takes a scalar loss `Tensor` and returns
        `train_op`. Required in TRAIN mode.
      regularization_losses: A list of additional scalar losses to be added to
        the training loss, such as regularization losses. These losses are
        usually expressed as a batch average, so for best results users need to
        set `loss_reduction=SUM_OVER_BATCH_SIZE` or
        `loss_reduction=SUM_OVER_NONZERO_WEIGHTS` when creating the head to
        avoid scaling errors.
      ValueError: If `train_op_fn` is `None` in TRAIN mode.
    with ops.name_scope(self._name, 'head'):
      logits = head_lib._check_logits_final_dim(logits, self.logits_dimension)  # pylint:disable=protected-access

      # Predict.
      pred_keys = prediction_keys.PredictionKeys
      with ops.name_scope(None, 'predictions', (logits,)):
        probabilities = math_ops.sigmoid(logits, name=pred_keys.PROBABILITIES)
        predictions = {
            pred_keys.LOGITS: logits,
            pred_keys.PROBABILITIES: probabilities,
      if mode == model_fn.ModeKeys.PREDICT:
        classifier_output = head_lib._classification_output(  # pylint:disable=protected-access
            scores=probabilities, n_classes=self._n_classes,
        return model_fn.EstimatorSpec(
                _DEFAULT_SERVING_KEY: classifier_output,
                head_lib._CLASSIFY_SERVING_KEY: classifier_output,  # pylint:disable=protected-access
                head_lib._PREDICT_SERVING_KEY: (  # pylint:disable=protected-access

      (training_loss, unreduced_loss, weights,
       processed_labels) = self.create_loss(
           features=features, mode=mode, logits=logits, labels=labels)
      if regularization_losses:
        regularization_loss = math_ops.add_n(regularization_losses)
        regularized_training_loss = math_ops.add_n(
            [training_loss, regularization_loss])
        regularization_loss = None
        regularized_training_loss = training_loss

      # Eval.
      if mode == model_fn.ModeKeys.EVAL:
        return model_fn.EstimatorSpec(

      # Train.
      if train_op_fn is None:
        raise ValueError('train_op_fn can not be None.')
      # Only summarize mean_loss for SUM reduction to preserve backwards
      # compatibility. Otherwise skip it to avoid unnecessary computation.
      if self._loss_reduction == losses.Reduction.SUM:
        example_weight_sum = math_ops.reduce_sum(
            weights * array_ops.ones_like(unreduced_loss))
        mean_loss = training_loss / example_weight_sum
        mean_loss = None
    with ops.name_scope(''):
      keys = metric_keys.MetricKeys
          head_lib._summary_key(self._name, keys.LOSS),  # pylint:disable=protected-access
      if mean_loss is not None:
            head_lib._summary_key(self._name, keys.LOSS_MEAN),  # pylint:disable=protected-access
      if regularization_loss is not None:
            head_lib._summary_key(self._name, keys.LOSS_REGULARIZATION),  # pylint:disable=protected-access
    return model_fn.EstimatorSpec(
Exemple #46
 def testReduceExplicitDims(self):
   """reduce_sum rejects a rank-2 `axis` argument with a ValueError."""
   matrix = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int32)
   rank2_axis = np.array([[0], [1]])
   expect_error = self.assertRaisesRegexp(
       ValueError, "must have rank at most 1")
   with expect_error:
     math_ops.reduce_sum(matrix, rank2_axis)
Exemple #47
def mean_pairwise_squared_error(
    labels, predictions, weights=1.0, scope=None,
  """Adds a pairwise-errors-squared loss to the training procedure.

  Unlike `mean_squared_error`, which is a measure of the differences between
  corresponding elements of `predictions` and `labels`,
  `mean_pairwise_squared_error` is a measure of the differences between pairs of
  corresponding elements of `predictions` and `labels`.

  For example, if `labels`=[a, b, c] and `predictions`=[x, y, z], there are
  three pairs of differences are summed to compute the loss:
    loss = [ ((a-b) - (x-y)).^2 + ((a-c) - (x-z)).^2 + ((b-c) - (y-z)).^2 ] / 3

  Note that since the inputs are of shape `[batch_size, d0, ... dN]`, the
  corresponding pairs are computed within each batch sample but not across
  samples within a batch. For example, if `predictions` represents a batch of
  16 grayscale images of dimension [batch_size, 100, 200], then the set of pairs
  is drawn from each image, but not across images.

  `weights` acts as a coefficient for the loss. If a scalar is provided, then
  the loss is simply scaled by the given value. If `weights` is a tensor of size
  `[batch_size]`, then the total loss for each sample of the batch is rescaled
  by the corresponding element in the `weights` vector.

    labels: The ground truth output tensor, whose shape must match the shape of
    predictions: The predicted outputs, a tensor of size
      `[batch_size, d0, .. dN]` where N+1 is the total number of dimensions in
    weights: Coefficients for the loss a scalar, a tensor of shape
      `[batch_size]` or a tensor whose shape matches `predictions`.
    scope: The scope for the operations performed in computing the loss.
    loss_collection: collection to which the loss will be added.

    A scalar `Tensor` that returns the weighted loss.

    ValueError: If the shape of `predictions` doesn't match that of `labels` or
      if the shape of `weights` is invalid.  Also if `labels` or `predictions`
      is None.
  if labels is None:
    raise ValueError("labels must not be None.")
  if predictions is None:
    raise ValueError("predictions must not be None.")
  with ops.name_scope(scope, "mean_pairwise_squared_error",
                      (predictions, labels, weights)) as scope:
    weights = math_ops.to_float(weights)
    labels = math_ops.to_float(labels)
    with ops.control_dependencies((
        weights_broadcast_ops.assert_broadcastable(weights, labels),)):
      predictions = math_ops.to_float(predictions)

      diffs = math_ops.subtract(predictions, labels)

      reduction_indices = math_ops.range(1, array_ops.rank(diffs))

      sum_squares_diff_per_batch = math_ops.reduce_sum(
      num_present_per_batch = _num_present(diffs, weights, per_batch=True)

      term1 = 2.0 * _safe_div(sum_squares_diff_per_batch,
                              num_present_per_batch - 1)

      sum_diff = math_ops.reduce_sum(
          diffs, reduction_indices=reduction_indices, keepdims=True)
      term2 = 2.0 * _safe_div(
          math_ops.multiply(num_present_per_batch, num_present_per_batch - 1))

      weighted_losses = math_ops.multiply(term1 - term2, weights)
      loss = math_ops.reduce_sum(weighted_losses)

      mean_loss = array_ops.where(
          math_ops.reduce_sum(num_present_per_batch) > 0,
      util.add_loss(mean_loss, loss_collection)
      return mean_loss
Exemple #48
def triplet_semihard_loss(labels, embeddings, metric, margin=1.0):
    """Computes the triplet loss with semi-hard negative mining.
    The loss encourages the positive distances (between a pair of embeddings with
    the same labels) to be smaller than the minimum negative distance among
    which are at least greater than the positive distance plus the margin constant
    (called semi-hard negative) in the mini-batch. If no such negative exists,
    uses the largest negative distance instead.
    See: https://arxiv.org/abs/1503.03832.
        labels: 1-D tf.int32 `Tensor` with shape [batch_size] of
        multiclass integer labels.
        embeddings: 2-D float `Tensor` of embedding vectors.
        margin: Float, margin term in the loss definition.
        triplet_loss: tf.float32 scalar.
    # Reshape [batch_size] label tensor to a [batch_size, 1] label tensor.
    lshape = array_ops.shape(labels)
    assert lshape.shape == 1
    labels = array_ops.reshape(labels, [lshape[0], 1])

    # Build pairwise squared distance matrix.
    pdist_matrix = metric(embeddings)
    # Build pairwise binary adjacency matrix.
    adjacency = math_ops.equal(labels, array_ops.transpose(labels))
    # Invert so we can select negatives only.
    adjacency_not = math_ops.logical_not(adjacency)

    batch_size = array_ops.size(labels)

    # Compute the mask.
    pdist_matrix_tile = array_ops.tile(pdist_matrix, [batch_size, 1])
    mask = math_ops.logical_and(
        array_ops.tile(adjacency_not, [batch_size, 1]),
            array_ops.reshape(array_ops.transpose(pdist_matrix), [-1, 1])))
    mask_final = array_ops.reshape(
            math_ops.reduce_sum(math_ops.cast(mask, dtype=dtypes.float32),
                                keepdims=True), 0.0), [batch_size, batch_size])
    mask_final = array_ops.transpose(mask_final)

    adjacency_not = math_ops.cast(adjacency_not, dtype=dtypes.float32)
    mask = math_ops.cast(mask, dtype=dtypes.float32)

    # negatives_outside: smallest D_an where D_an > D_ap.
    negatives_outside = array_ops.reshape(
        masked_minimum(pdist_matrix_tile, mask), [batch_size, batch_size])
    negatives_outside = array_ops.transpose(negatives_outside)

    # negatives_inside: largest D_an.
    negatives_inside = array_ops.tile(
        masked_maximum(pdist_matrix, adjacency_not), [1, batch_size])
    semi_hard_negatives = array_ops.where(mask_final, negatives_outside,

    loss_mat = math_ops.add(margin, pdist_matrix - semi_hard_negatives)

    mask_positives = math_ops.cast(adjacency,
                                   dtype=dtypes.float32) - array_ops.diag(

    # In lifted-struct, the authors multiply 0.5 for upper triangular
    #   in semihard, they take all positive pairs except the diagonal.
    num_positives = math_ops.reduce_sum(mask_positives)

    triplet_loss = math_ops.truediv(math_ops.reduce_sum(
        math_ops.maximum(math_ops.multiply(loss_mat, mask_positives), 0.0)),

    return triplet_loss, 0
 def train_step(data):
     """Returns `-data` when the sum of `data` is negative, else `data` unchanged."""
     total = math_ops.reduce_sum(data)
     # Kept as a plain `if` statement with early return, as in the original.
     if total < 0:
         return -data
     return data
Exemple #50
 def Loss(x, y):
     """Returns the sum of all entries of the matrix product of `x` and `y`.

     `adjoint_a` and `adjoint_b` come from the enclosing scope and are
     forwarded as the third and fourth positional arguments of `matmul`.
     NOTE(review): in the public `tf.matmul` signature those positions are
     `transpose_a`/`transpose_b` — confirm this is intended.
     """
     product = math_ops.matmul(x, y, adjoint_a, adjoint_b)
     return math_ops.reduce_sum(product)
 def _log_unnormalized_prob(self, x):
   """Returns `sum((concentration - 1) * log(x))` over the last axis.

   `x` is first passed through `self._maybe_assert_valid_sample`.
   """
   sample = self._maybe_assert_valid_sample(x)
   weighted_log = (self.concentration - 1.) * math_ops.log(sample)
   return math_ops.reduce_sum(weighted_log, -1)
Exemple #52
def compute_weighted_loss(
    losses, weights=1.0, scope=None, loss_collection=ops.GraphKeys.LOSSES,
  """Computes the weighted loss.

    losses: `Tensor` of shape `[batch_size, d1, ... dN]`.
    weights: Optional `Tensor` whose rank is either 0, or the same rank as
      `losses`, and must be broadcastable to `losses` (i.e., all dimensions must
      be either `1`, or the same as the corresponding `losses` dimension).
    scope: the scope for the operations performed in computing the loss.
    loss_collection: the loss will be added to these collections.
    reduction: Type of reduction to apply to loss.

    Weighted loss `Tensor` of the same type as `losses`. If `reduction` is
    `NONE`, this has the same shape as `losses`; otherwise, it is scalar.

    ValueError: If `weights` is `None` or the shape is not compatible with
      `losses`, or if the number of dimensions (rank) of either `losses` or
      `weights` is missing.

    When calculating the gradient of a weighted loss contributions from
    both `losses` and `weights` are considered. If your `weights` depend
    on some model parameters but you do not want this to affect the loss
    gradient, you need to apply @{tf.stop_gradient} to `weights` before
    passing them to `compute_weighted_loss`.
  with ops.name_scope(scope, "weighted_loss", (losses, weights)):
    # Save the `reduction` argument for loss normalization when distributing
    # to multiple towers.
    # TODO(josh11b): Associate it with the returned op for more precision.
    ops.get_default_graph()._last_loss_reduction = reduction  # pylint: disable=protected-access

    with ops.control_dependencies((
        weights_broadcast_ops.assert_broadcastable(weights, losses),)):
      losses = ops.convert_to_tensor(losses)
      input_dtype = losses.dtype
      losses = math_ops.to_float(losses)
      weights = math_ops.to_float(weights)
      weighted_losses = math_ops.multiply(losses, weights)
      if reduction == Reduction.NONE:
        loss = weighted_losses
        loss = math_ops.reduce_sum(weighted_losses)
        if reduction == Reduction.MEAN:
          loss = _safe_mean(
              math_ops.reduce_sum(array_ops.ones_like(losses) * weights))
        elif (reduction == Reduction.SUM_BY_NONZERO_WEIGHTS or
              reduction == Reduction.SUM_OVER_NONZERO_WEIGHTS):
          loss = _safe_mean(loss, _num_present(losses, weights))
        elif reduction == Reduction.SUM_OVER_BATCH_SIZE:
          loss = _safe_mean(loss, _num_elements(losses))

      # Convert the result back to the input type.
      loss = math_ops.cast(loss, input_dtype)
      util.add_loss(loss, loss_collection)
      return loss
 def _entropy(self):
     """Returns `-sum(log_softmax(logits) * probs)` over the last axis."""
     log_probs = nn_ops.log_softmax(self.logits)
     weighted = log_probs * self.probs
     return -math_ops.reduce_sum(weighted, axis=-1)
Exemple #54
    def __init__(self,
        """Initialize a batch of DirichletMultinomial distributions.

      n:  Non-negative floating point tensor, whose dtype is the same as
        `alpha`. The shape is broadcastable to `[N1,..., Nm]` with `m >= 0`.
        Defines this as a batch of `N1 x ... x Nm` different Dirichlet
        multinomial distributions. Its components should be equal to integer
      alpha: Positive floating point tensor, whose dtype is the same as
        `n` with shape broadcastable to `[N1,..., Nm, k]` `m >= 0`.  Defines
        this as a batch of `N1 x ... x Nm` different `k` class Dirichlet
        multinomial distributions.
      validate_args: `Boolean`, default `False`.  Whether to assert valid
        values for parameters `alpha` and `n`, and `x` in `prob` and
        `log_prob`.  If `False`, correct behavior is not guaranteed.
      allow_nan_stats: `Boolean`, default `True`.  If `False`, raise an
        exception if a statistic (e.g. mean/mode/etc...) is undefined for any
        batch member.  If `True`, batch members with valid parameters leading to
        undefined statistics will return NaN for this statistic.
      name: The name to prefix Ops created by this distribution class.


    # Define 1-batch of 2-class Dirichlet multinomial distribution,
    # also known as a beta-binomial.
    dist = DirichletMultinomial(2.0, [1.1, 2.0])

    # Define a 2-batch of 3-class distributions.
    dist = DirichletMultinomial([3., 4], [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])

        parameters = locals()
        with ops.name_scope(name, values=[n, alpha]) as ns:
            # Broadcasting works because:
            # * The broadcasting convention is to prepend dimensions of size [1], and
            #   we use the last dimension for the distribution, wherease
            #   the batch dimensions are the leading dimensions, which forces the
            #   distribution dimension to be defined explicitly (i.e. it cannot be
            #   created automatically by prepending).  This forces enough
            #   explicitivity.
            #   * All calls involving `counts` eventually require a broadcast between
            #   `counts` and alpha.
            self._alpha = self._assert_valid_alpha(alpha, validate_args)
            self._n = self._assert_valid_n(n, validate_args)
            self._alpha_sum = math_ops.reduce_sum(self._alpha,
        super(DirichletMultinomial, self).__init__(
            graph_parents=[self._alpha, self._n, self._alpha_sum],
Exemple #55
 def test_none(self):
   """reduce_sum with no axes matches a plain reduce_sum wrapped with no axes."""
   actual = ops.reduce_sum(self.original_lt)
   expected_tensor = math_ops.reduce_sum(self.original_lt.tensor)
   expected = core.LabeledTensor(expected_tensor, [])
   self.assertLabeledTensorsEqual(actual, expected)
 def worker_fn(iterator):
     """Takes the next element from `iterator` and returns its sum."""
     element = next(iterator)
     return math_ops.reduce_sum(element)
def einsum(equation, *inputs):
  """A generalized contraction between tensors of arbitrary dimension.

  This function returns a tensor whose elements are defined by `equation`,
  which is written in a shorthand form inspired by the Einstein summation
  convention.  As an example, consider multiplying two matrices
  A and B to form a matrix C.  The elements of C are given by:

    C[i,k] = sum_j A[i,j] * B[j,k]

  The corresponding `equation` is:

    ij,jk->ik

  In general, the `equation` is obtained from the more familiar element-wise
  equation by
    1. removing variable names, brackets, and commas,
    2. replacing "*" with ",",
    3. dropping summation signs, and
    4. moving the output to the right, and replacing "=" with "->".

  Many common operations can be expressed in this way.  For example:

  # Matrix multiplication
  >>> einsum('ij,jk->ik', m0, m1)  # output[i,k] = sum_j m0[i,j] * m1[j, k]

  # Dot product
  >>> einsum('i,i->', u, v)  # output = sum_i u[i]*v[i]

  # Outer product
  >>> einsum('i,j->ij', u, v)  # output[i,j] = u[i]*v[j]

  # Transpose
  >>> einsum('ij->ji', m)  # output[j,i] = m[i,j]

  # Batch matrix multiplication
  >>> einsum('aij,ajk->aik', s, t)  # out[a,i,k] = sum_j s[a,i,j] * t[a, j, k]

  This function behaves like `numpy.einsum`, but does not support:
  * Ellipses (subscripts like `ij...,jk...->ik...`)
  * Subscripts where an axis appears more than once for a single input
    (e.g. `ijj,k->ik`).

  An index that is summed over more than two inputs (e.g. `ij,ij,jk->ik`) is
  not handled by the pairwise reduction below; such equations are delegated to
  `_exponential_space_einsum` after a warning is logged.

  Spaces in `equation` are ignored.  Only lowercase ASCII letters are accepted
  as axis labels.

  Args:
    equation: a `str` describing the contraction, in the same format as
      `numpy.einsum`.
    *inputs: the inputs to contract (each one a `Tensor`), whose shapes should
      be consistent with `equation`.

  Returns:
    The contracted `Tensor`, with shape determined by `equation`.

  Raises:
    ValueError: If
      - the format of `equation` is incorrect,
      - the number of inputs implied by `equation` does not match `len(inputs)`,
      - an axis appears in the output subscripts but not in any of the inputs,
      - the number of dimensions of an input differs from the number of
        indices in its subscript, or
      - the input shapes are inconsistent along a particular axis.
  """
  # NOTE(review): the line that opened the warning call below was lost in the
  # visible source; only its message survived.  The stdlib logger is imported
  # locally so this block does not rely on a module-level name that cannot be
  # confirmed from here.
  import logging

  # Drop spaces first so that equations written as 'ij, jk -> ik' are parsed
  # instead of being rejected by the anchored pattern below.
  equation = equation.replace(' ', '')

  if '...' in equation:
    raise ValueError("Subscripts with ellipses are not yet supported.")

  # The pattern is anchored at both ends: an unanchored match would accept a
  # valid prefix and silently ignore whatever follows it (e.g. 'ij,jk->iK').
  match = re.match(r'^([a-z,]+)(->[a-z]*)?$', equation)
  if not match:
    raise ValueError('Indices have incorrect format: %s' % equation)

  inputs = list(inputs)
  input_axis_labels = match.group(1).split(',')

  if len(inputs) != len(input_axis_labels):
    raise ValueError('Got %d arguments for equation "%s", expecting %d' % (
        len(inputs), equation, len(input_axis_labels)))

  axis_labels = set(''.join(input_axis_labels))
  if match.group(2):
    output_axis_labels = match.group(2)[2:]
  else:
    # infer the output subscripts if not given, assume alphabetical order
    indices = ''.join(sorted(axis_labels))
    counts = {ax: 0 for ax in indices}
    for axes_ in input_axis_labels:
      for ax in axes_:
        counts[ax] += 1

    # Labels that occur exactly once across all inputs are kept in the output.
    output_axis_labels = ''.join(sorted(
        ax for ax in indices
        if counts[ax] == 1))

  for a in axis_labels:
    input_count = sum(1 for s in input_axis_labels if a in s)
    if input_count > 2 and a not in output_axis_labels:
      logging.warning(
          'Falling back to exponential-space implementation of einsum() because'
          ' index "%s" is summed over more than two inputs.', a)
      return _exponential_space_einsum(equation, *inputs)

  # Fold the inputs together left to right, contracting at each step the axes
  # shared by the running result and the next input that are not part of the
  # requested output.
  temp = inputs[0]
  temp_axis_labels = input_axis_labels[0]
  for next_input, next_axis_labels in zip(inputs[1:], input_axis_labels[1:]):
    axes_to_sum = ((set(temp_axis_labels) & set(next_axis_labels))
                   - set(output_axis_labels))
    # NOTE(review): the argument list after `temp` was truncated in the visible
    # source; it is reconstructed here as (labels, next input, its labels,
    # axes to sum).  Confirm against the definition of `_einsum_reduction`.
    temp, temp_axis_labels = _einsum_reduction(temp,
                                               temp_axis_labels,
                                               next_input,
                                               next_axis_labels,
                                               axes_to_sum)

  # Sum out any axes still present that the output does not ask for.
  missing_indices = set(temp_axis_labels) - set(output_axis_labels)
  if missing_indices:
    reduction_indices = [i for i, a in enumerate(temp_axis_labels)
                         if a not in output_axis_labels]
    temp = math_ops.reduce_sum(temp, reduction_indices=reduction_indices)
    temp_axis_labels = ''.join(a for a in temp_axis_labels
                               if a in output_axis_labels)

  if sorted(temp_axis_labels) != sorted(output_axis_labels):
    raise ValueError('Invalid equation: %s' % equation)

  # Reorder the remaining axes into the order requested by the output.
  perm = [temp_axis_labels.index(a) for a in output_axis_labels]
  return _transpose_if_necessary(temp, perm)
 def body(it, cost):
     """Add the sum of row 0 of `embedding_matrix` to `cost`; advance `it`."""
     looked_up = embedding_ops.embedding_lookup(embedding_matrix, [0])
     row_total = math_ops.reduce_sum(looked_up)
     cost += row_total
     next_it = it + 1
     return next_it, cost
def _exponential_space_einsum(equation, *inputs):
  """Fallback implementation that supports summing an index over > 2 inputs.

  Every input is transposed so that its axes follow one common order (summed
  axes first, then the output axes in output order), reshaped so that it has a
  size-1 dimension for each axis it lacks, and the reshaped inputs are
  multiplied together.  The axes that are not part of the output are then
  summed out of the product.

  Spaces in `equation` are ignored.  Only lowercase ASCII letters are accepted
  as axis labels.

  Args:
    equation: a `str` describing the contraction, e.g. `'ij,ij,jk->ik'`.
    *inputs: the inputs to contract; each must provide `get_shape()`.

  Returns:
    The result of `math_ops.reduce_sum` over the product of the reshaped
    inputs.

  Raises:
    ValueError: If
      - `equation` contains an ellipsis or is otherwise malformed,
      - the number of inputs does not match the number of input subscripts,
      - the output names an axis that no input has,
      - an input's rank differs from the length of its subscript,
      - an axis is repeated within a single input's subscript, or
      - two inputs disagree on the static size (> 1) of a shared axis.
  """
  # Drop spaces first so that equations written as 'ij, jk -> ik' are parsed
  # instead of being rejected by the anchored pattern below.
  equation = equation.replace(' ', '')

  if '...' in equation:
    raise ValueError("Subscripts with ellipses are not yet supported.")

  # The pattern is anchored at both ends: an unanchored match would accept a
  # valid prefix and silently ignore whatever follows it.
  match = re.match(r'^([a-z,]+)(->[a-z]*)?$', equation)
  if not match:
    raise ValueError('Indices have incorrect format: %s' % equation)

  inputs = list(inputs)
  idx_in = match.group(1).split(',')
  idx_all = set(''.join(idx_in))
  indices = ''.join(sorted(idx_all))

  if match.group(2):
    idx_out = match.group(2)[2:]
  else:
    # infer the output subscripts if not given, assume alphabetical order
    counts = {ax: 0 for ax in indices}
    for axes_ in idx_in:
      for ax in axes_:
        counts[ax] += 1

    # Labels that occur exactly once across all inputs are kept in the output.
    idx_out = ''.join(sorted(
        ax for ax in indices
        if counts[ax] == 1))

  if len(idx_in) != len(inputs):
    raise ValueError(
        'Expected %d inputs but got %d' % (len(idx_in), len(inputs)))

  missing_idx = set(idx_out).difference(idx_all)
  if missing_idx:
    raise ValueError('Unknown output axes: %s' % missing_idx)

  # Common axis order: summed axes first (alphabetical), then the output axes
  # in the order the output requests them.
  axis_order = {}
  for ax in indices:
    if ax not in idx_out:
      axis_order[ax] = len(axis_order)
  for ax in idx_out:
    axis_order[ax] = len(axis_order)

  # transpose inputs so axes are in order
  for i, (input_, axes_) in enumerate(zip(inputs, idx_in)):
    if input_.get_shape().ndims != len(axes_):
      raise ValueError(
          'Input %d with axes %s has incorrect'
          ' number of dimensions (expected %d, got %d)' % (
              i, axes_, len(axes_), input_.get_shape().ndims))

    if len(set(axes_)) != len(axes_):
      raise ValueError(
          'Subscript not supported: an axis appears more than once: %s' % axes_)

    sorted_idx = sorted(axes_, key=axis_order.get)
    if list(axes_) != sorted_idx:
      permuted = [axes_.find(ax) for ax in sorted_idx]
      inputs[i] = array_ops.transpose(input_, permuted)
      idx_in[i] = sorted_idx

  reduction_idx = []
  # Only an unknown (None) static size becomes -1.  A static size of 0 is kept
  # as 0 rather than being treated as unknown.
  shapes = [[-1 if dim is None else dim
             for dim in tensor.get_shape().as_list()]
            for tensor in inputs]

  # validate shapes for broadcasting
  for j, ax in enumerate(sorted(idx_all, key=axis_order.get)):
    dims = []
    for i, idx in enumerate(idx_in):
      if ax not in idx:
        # This input lacks the axis: give it a size-1 dimension at position j.
        shapes[i].insert(j, 1)
      else:
        dim = shapes[i][j]
        # Sizes of 1 and unknown sizes (-1) are not compared across inputs.
        if isinstance(dim, int) and dim > 1:
          dims.append(dim)

    if len(set(dims)) > 1:
      raise ValueError('Dimension mismatch on axis: %s' % ax)

    if ax not in idx_out:
      reduction_idx.append(j)

  # reshape, multiply
  expanded_inputs = [array_ops.reshape(input_, shape)
                     for input_, shape in zip(inputs, shapes)]
  expanded_output = 1
  for input_ in expanded_inputs:
    expanded_output *= input_

  # contract
  return math_ops.reduce_sum(expanded_output, reduction_idx)
Exemple #60
 def test_drop_scalar_axis(self):
   """Check `ops.reduce_sum` over 'channel' against a sum over tensor axis 1.

   The expected `LabeledTensor` carries the axes `a0`, `a2` and `a3`.
   """
   actual_lt = ops.reduce_sum(self.original_lt, 'channel')
   expected_tensor = math_ops.reduce_sum(self.original_lt.tensor, 1)
   remaining_axes = [self.a0, self.a2, self.a3]
   expected_lt = core.LabeledTensor(expected_tensor, remaining_axes)
   self.assertLabeledTensorsEqual(actual_lt, expected_lt)