def _Update_global_variables():
   local_vars = [v for g, v in grads_and_vars if g is not None]
   global_center_vars = [self._global_map[var] for var in local_vars]
   local_center_vars = [self._local_map[var] for var in local_vars]
   local_center_vars_update = []
   for lvar, var in zip(local_center_vars, global_center_vars):
     local_center_vars_update.append(lvar.assign(var))
   update_ops = []
   differences = []
   with ops.control_dependencies(local_center_vars_update):
     for v, lv in zip(local_vars, local_center_vars):
       with ops.device(v.device):
         differences.append(math_ops.subtract(v, lv))
     for lvar, diff in zip(local_vars, differences):
       with ops.device(lvar.device):
         update_ops.append(
             state_ops.assign_sub(lvar,
                                  math_ops.multiply(self._moving_rate,
                                                    diff)))
     for var, diff in zip(global_center_vars, differences):
       with ops.device(var.device):
         update_ops.append(
             state_ops.assign_add(var,
                                  math_ops.multiply(self._moving_rate,
                                                    diff)))
     if global_step:
       with ops.colocate_with(global_step):
         update_ops.append(state_ops.assign_add(global_step, 1))
   variable_update = control_flow_ops.group(*(update_ops))
   return variable_update
Example #2
0
def normalize_moments(counts, mean_ss, variance_ss, shift, name=None):
  """Calculate the mean and variance of based on the sufficient statistics.

  Args:
    counts: A `Tensor` containing a the total count of the data (one value).
    mean_ss: A `Tensor` containing the mean sufficient statistics: the (possibly
      shifted) sum of the elements to average over.
    variance_ss: A `Tensor` containing the variance sufficient statistics: the
      (possibly shifted) squared sum of the data to compute the variance over.
    shift: A `Tensor` containing the value by which the data is shifted for
      numerical stability, or `None` if no shift was performed.
    name: Name used to scope the operations that compute the moments.

  Returns:
    Two `Tensor` objects: `mean` and `variance`.
  """
  with ops.name_scope(name, "normalize", [counts, mean_ss, variance_ss, shift]):
    divisor = math_ops.reciprocal(counts, name="divisor")
    if shift is not None:
      shifted_mean = math_ops.multiply(mean_ss, divisor, name="shifted_mean")
      mean = math_ops.add(shifted_mean, shift, name="mean")
    else:  # no shift.
      shifted_mean = math_ops.multiply(mean_ss, divisor, name="mean")
      mean = shifted_mean
    variance = math_ops.subtract(math_ops.multiply(variance_ss, divisor),
                                 math_ops.square(shifted_mean),
                                 name="variance")
  return (mean, variance)
 def Foo():
   x = constant_op.constant(10.0, name="x")
   y = math_ops.multiply(x, c, name="y")
   # Regression test for b/122564611.
   z = math_ops.multiply(c, y, name="z")
   g = gradients_impl.gradients(z, x)
   return g[0]
Example #4
0
  def setUp(self):
    """Test setup.

    Structure of the forward graph:
              f
             | |
        -----   -----
        |           |
        d           e
       | |         | |
    ---   ---------  ---
    |         |        |
    a         b        c

    Construct a backward graph using the GradientDescentOptimizer.
    """

    self.a = variables.Variable(1.0, name="a")
    self.b = variables.Variable(2.0, name="b")
    self.c = variables.Variable(4.0, name="c")
    self.d = math_ops.multiply(self.a, self.b, name="d")
    self.e = math_ops.multiply(self.b, self.c, name="e")
    self.f = math_ops.multiply(self.d, self.e, name="f")

    # Gradient descent optimizer that minimizes g.
    gradient_descent.GradientDescentOptimizer(0.01).minimize(
        self.f, name="optim")

    rewriter_config = rewriter_config_pb2.RewriterConfig(
        disable_model_pruning=True)
    graph_options = config_pb2.GraphOptions(rewrite_options=rewriter_config)
    config = config_pb2.ConfigProto(graph_options=graph_options)
    self.sess = session.Session(config=config)
    self.sess.run(variables.global_variables_initializer())
Example #5
0
def logloss(y_true, y_pred):
  y_pred = ops.convert_to_tensor(y_pred)
  y_true = math_ops.cast(y_true, y_pred.dtype)
  losses = math_ops.multiply(y_true, math_ops.log(y_pred + K.epsilon()))
  losses += math_ops.multiply((1 - y_true),
                              math_ops.log(1 - y_pred + K.epsilon()))
  return K.mean(-losses, axis=-1)
  def decayed_lr(learning_rate, global_step, decay_steps, end_learning_rate,
                 power, cycle, name):
    """Helper to recompute learning rate; most helpful in eager-mode."""
    with ops.name_scope(
        name, "PolynomialDecay",
        [learning_rate, global_step, decay_steps, end_learning_rate, power]
    ) as name:
      learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
      dtype = learning_rate.dtype
      end_learning_rate = math_ops.cast(end_learning_rate, dtype)
      power = math_ops.cast(power, dtype)

      global_step_recomp = math_ops.cast(global_step, dtype)
      decay_steps_recomp = math_ops.cast(decay_steps, dtype)
      if cycle:
        # Find the first multiple of decay_steps that is bigger than
        # global_step. If global_step is zero set the multiplier to 1
        multiplier = control_flow_ops.cond(
            math_ops.equal(global_step_recomp, 0), lambda: 1.0,
            lambda: math_ops.ceil(global_step_recomp / decay_steps))
        decay_steps_recomp = math_ops.multiply(decay_steps_recomp, multiplier)
      else:
        # Make sure that the global_step used is not bigger than decay_steps.
        global_step_recomp = math_ops.minimum(global_step_recomp, decay_steps)

      p = math_ops.div(global_step_recomp, decay_steps_recomp)
      return math_ops.add(
          math_ops.multiply(learning_rate - end_learning_rate,
                            math_ops.pow(1 - p, power)),
          end_learning_rate,
          name=name)
  def __call__(self, step):
    with ops.name_scope(
        self.name, "PolynomialDecay",
        [self.initial_learning_rate, step, self.decay_steps,
         self.end_learning_rate, self.power]
    ) as name:
      initial_learning_rate = ops.convert_to_tensor(
          self.initial_learning_rate, name="initial_learning_rate")
      dtype = initial_learning_rate.dtype
      end_learning_rate = math_ops.cast(self.end_learning_rate, dtype)
      power = math_ops.cast(self.power, dtype)

      global_step_recomp = math_ops.cast(step, dtype)
      decay_steps_recomp = math_ops.cast(self.decay_steps, dtype)
      if self.cycle:
        # Find the first multiple of decay_steps that is bigger than
        # global_step. If global_step is zero set the multiplier to 1
        multiplier = control_flow_ops.cond(
            math_ops.equal(global_step_recomp, 0), lambda: 1.0,
            lambda: math_ops.ceil(global_step_recomp / self.decay_steps))
        decay_steps_recomp = math_ops.multiply(decay_steps_recomp, multiplier)
      else:
        # Make sure that the global_step used is not bigger than decay_steps.
        global_step_recomp = math_ops.minimum(global_step_recomp,
                                              self.decay_steps)

      p = math_ops.div(global_step_recomp, decay_steps_recomp)
      return math_ops.add(
          math_ops.multiply(initial_learning_rate - end_learning_rate,
                            math_ops.pow(1 - p, power)),
          end_learning_rate,
          name=name)
Example #8
0
  def test_graph_replace_gradients(self):
    ops.reset_default_graph()
    w = variables.VariableV1(0.0, name="w")
    y = math_ops.multiply(math_ops.multiply(w, w, name="mul1"), w, name="mul2")
    g = gradients_impl.gradients(y, w, name="grad")[0]

    # Extract the operations.
    replacement_ts = {w.value(): g}
    original_mul1_grad = (ops.get_default_graph().
                          get_operation_by_name("grad/mul1_grad/Mul_1"))

    # Should not raise exception.
    res = ge.graph_replace(g, replacement_ts, dst_scope="res")

    # Extract the operations after graph_replace.
    result_mul1_grad = (ops.get_default_graph().
                        get_operation_by_name("res/grad/mul1_grad/Mul_1"))

    # Make sure _original_ops are as expected.
    self.assertEqual(original_mul1_grad._original_op.name, u"mul1")
    self.assertEqual(result_mul1_grad._original_op.name, u"res/mul1")
    self.assertNotEqual(res.name, g.name)
    with session.Session() as sess:
      sess.run(variables.global_variables_initializer())
      g_val, res_val = sess.run([g, res])
    self.assertNear(g_val, 0.0, ERROR_TOLERANCE)
    self.assertNear(res_val, 0.0, ERROR_TOLERANCE)
Example #9
0
def accuracy(predictions, labels, weights=None):
  """Computes the percentage of times that predictions matches labels.

  Args:
    predictions: the predicted values, a `Tensor` whose dtype and shape
                 matches 'labels'.
    labels: the ground truth values, a `Tensor` of any shape and
            bool, integer, or string dtype.
    weights: None or `Tensor` of float values to reweight the accuracy.

  Returns:
    Accuracy `Tensor`.

  Raises:
    ValueError: if dtypes don't match or
                if dtype is not bool, integer, or string.
  """
  if not (labels.dtype.is_integer or
          labels.dtype in (dtypes.bool, dtypes.string)):
    raise ValueError(
        'Labels should have bool, integer, or string dtype, not %r' %
        labels.dtype)
  if not labels.dtype.is_compatible_with(predictions.dtype):
    raise ValueError('Dtypes of predictions and labels should match. '
                     'Given: predictions (%r) and labels (%r)' %
                     (predictions.dtype, labels.dtype))
  with ops.name_scope('accuracy', values=[predictions, labels]):
    is_correct = math_ops.cast(
        math_ops.equal(predictions, labels), dtypes.float32)
    if weights is not None:
      is_correct = math_ops.multiply(is_correct, weights)
      num_values = math_ops.multiply(weights, array_ops.ones_like(is_correct))
      return math_ops.div(math_ops.reduce_sum(is_correct),
                          math_ops.reduce_sum(num_values))
    return math_ops.reduce_mean(is_correct)
Example #10
0
  def setUp(self):
    self.a = variables.VariableV1(2.0, name="a")
    self.b = variables.VariableV1(3.0, name="b")

    self.c = math_ops.multiply(self.a, self.b, name="c")  # Should be 6.0.
    self.d = math_ops.multiply(self.a, self.a, name="d")  # Should be 4.0.

    self.e = math_ops.multiply(self.d, self.c, name="e")  # Should be 24.0.

    self.f_y = constant_op.constant(0.30, name="f_y")
    self.f = math_ops.div(self.b, self.f_y, name="f")  # Should be 10.0.

    # The there nodes x, y and z form a graph with "cross-links" in. I.e., x
    # and y are both direct inputs to z, but x is also a direct input to y.
    self.x = variables.VariableV1(2.0, name="x")  # Should be 2.0
    self.y = math_ops.negative(self.x, name="y")  # Should be -2.0.

    self.z = math_ops.multiply(self.x, self.y, name="z")  # Should be -4.0.

    rewriter_config = rewriter_config_pb2.RewriterConfig(
        disable_model_pruning=True,
        arithmetic_optimization=rewriter_config_pb2.RewriterConfig.OFF,
        constant_folding=rewriter_config_pb2.RewriterConfig.OFF)
    graph_options = config_pb2.GraphOptions(rewrite_options=rewriter_config)
    config = config_pb2.ConfigProto(graph_options=graph_options)
    self.sess = session.Session(config=config)
    self.sess.run(variables.global_variables_initializer())
Example #11
0
def log_loss(predictions, labels=None, weights=1.0, epsilon=1e-7, scope=None):
  """Adds a Log Loss term to the training procedure.

  `weights` acts as a coefficient for the loss. If a scalar is provided, then
  the loss is simply scaled by the given value. If `weights` is a tensor of size
  [batch_size], then the total loss for each sample of the batch is rescaled
  by the corresponding element in the `weights` vector. If the shape of
  `weights` matches the shape of `predictions`, then the loss of each
  measurable element of `predictions` is scaled by the corresponding value of
  `weights`.

  Args:
    predictions: The predicted outputs.
    labels: The ground truth output tensor, same dimensions as 'predictions'.
    weights: Coefficients for the loss a scalar, a tensor of shape
      [batch_size] or a tensor whose shape matches `predictions`.
    epsilon: A small increment to add to avoid taking a log of zero.
    scope: The scope for the operations performed in computing the loss.

  Returns:
    A scalar `Tensor` representing the loss value.

  Raises:
    ValueError: If the shape of `predictions` doesn't match that of `labels` or
      if the shape of `weights` is invalid.
  """
  with ops.name_scope(scope, "log_loss",
                      [predictions, labels, weights]) as scope:
    predictions.get_shape().assert_is_compatible_with(labels.get_shape())
    predictions = math_ops.to_float(predictions)
    labels = math_ops.to_float(labels)
    losses = -math_ops.multiply(
        labels, math_ops.log(predictions + epsilon)) - math_ops.multiply(
            (1 - labels), math_ops.log(1 - predictions + epsilon))
    return compute_weighted_loss(losses, weights, scope=scope)
Example #12
0
def _SquareGrad(op, grad):
  x = op.inputs[0]
  # Added control dependencies to prevent 2*x from being computed too early.
  with ops.control_dependencies([grad]):
    x = math_ops.conj(x)
    y = constant_op.constant(2.0, dtype=x.dtype)
    return math_ops.multiply(grad, math_ops.multiply(x, y))
Example #13
0
  def setUp(self):
    """Test setup.

    Structure of the forward graph:
              f
             | |
        -----   -----
        |           |
        d           e
       | |         | |
    ---   ---------  ---
    |         |        |
    a         b        c

    Construct a backward graph using the GradientDescentOptimizer.
    """

    self.a = variables.Variable(1.0, name="a")
    self.b = variables.Variable(2.0, name="b")
    self.c = variables.Variable(4.0, name="c")
    self.d = math_ops.multiply(self.a, self.b, name="d")
    self.e = math_ops.multiply(self.b, self.c, name="e")
    self.f = math_ops.multiply(self.d, self.e, name="f")

    # Gradient descent optimizer that minimizes g.
    gradient_descent.GradientDescentOptimizer(0.01).minimize(
        self.f, name="optim")

    self.sess = session.Session()
    self.sess.run(variables.global_variables_initializer())
Example #14
0
def huber_loss(y_true, y_pred, delta=1.0):
  """Computes Huber loss value.

  For each value x in `error=y_true-y_pred`, the following is calculated:

  ```
  0.5 * x^2                  if |x| <= d
  0.5 * d^2 + d * (|x| - d)  if |x| > d
  ```
  where d is `delta`. See: https://en.wikipedia.org/wiki/Huber_loss

  Args:
    y_true: tensor of true targets.
    y_pred: tensor of predicted targets.
    delta: A float, the point where the Huber loss function changes from a
      quadratic to linear.

  Returns:
    Tensor with one scalar loss entry per sample.
  """
  y_pred = math_ops.cast(y_pred, dtype=K.floatx())
  y_true = math_ops.cast(y_true, dtype=K.floatx())
  error = math_ops.subtract(y_pred, y_true)
  abs_error = math_ops.abs(error)
  quadratic = math_ops.minimum(abs_error, delta)
  linear = math_ops.subtract(abs_error, quadratic)
  return math_ops.add(
      math_ops.multiply(
          ops.convert_to_tensor(0.5, dtype=quadratic.dtype),
          math_ops.multiply(quadratic, quadratic)),
      math_ops.multiply(delta, linear))
Example #15
0
  def unregularized_loss(self, examples):
    """Add operations to compute the loss (without the regularization loss).

    Args:
      examples: Examples to compute unregularized loss on.

    Returns:
      An Operation that computes mean (unregularized) loss for given set of
      examples.

    Raises:
      ValueError: if examples are not well defined.
    """
    self._assertSpecified([
        'example_labels', 'example_weights', 'sparse_features', 'dense_features'
    ], examples)
    self._assertList(['sparse_features', 'dense_features'], examples)
    with name_scope('sdca/unregularized_loss'):
      predictions = math_ops.cast(
          self._linear_predictions(examples), dtypes.float64)
      labels = math_ops.cast(
          internal_convert_to_tensor(examples['example_labels']),
          dtypes.float64)
      weights = math_ops.cast(
          internal_convert_to_tensor(examples['example_weights']),
          dtypes.float64)

      if self._options['loss_type'] == 'logistic_loss':
        return math_ops.reduce_sum(math_ops.multiply(
            sigmoid_cross_entropy_with_logits(labels=labels,
                                              logits=predictions),
            weights)) / math_ops.reduce_sum(weights)

      if self._options['loss_type'] == 'poisson_loss':
        return math_ops.reduce_sum(math_ops.multiply(
            log_poisson_loss(targets=labels, log_input=predictions),
            weights)) / math_ops.reduce_sum(weights)

      if self._options['loss_type'] in ['hinge_loss', 'smooth_hinge_loss']:
        # hinge_loss = max{0, 1 - y_i w*x} where y_i \in {-1, 1}. So, we need to
        # first convert 0/1 labels into -1/1 labels.
        all_ones = array_ops.ones_like(predictions)
        adjusted_labels = math_ops.subtract(2 * labels, all_ones)
        # Tensor that contains (unweighted) error (hinge loss) per
        # example.
        error = nn_ops.relu(
            math_ops.subtract(all_ones,
                              math_ops.multiply(adjusted_labels, predictions)))
        weighted_error = math_ops.multiply(error, weights)
        return math_ops.reduce_sum(weighted_error) / math_ops.reduce_sum(
            weights)

      # squared loss
      err = math_ops.subtract(labels, predictions)

      weighted_squared_err = math_ops.multiply(math_ops.square(err), weights)
      # SDCA squared loss function is sum(err^2) / (2*sum(weights))
      return (math_ops.reduce_sum(weighted_squared_err) /
              (2.0 * math_ops.reduce_sum(weights)))
 def testSmartCondTrue(self):
   with ops.Graph().as_default():
     with session.Session():
       x = constant_op.constant(2)
       y = constant_op.constant(5)
       z = smart_cond.smart_cond(True, lambda: math_ops.multiply(x, 16),
                                 lambda: math_ops.multiply(y, 5))
       self.assertEqual(z.eval(), 32)
 def testSmartCondFalse(self):
   with ops.Graph().as_default():
     with session.Session():
       x = constant_op.constant(4)
       y = constant_op.constant(3)
       z = smart_cond.smart_cond(False, lambda: math_ops.multiply(x, 16),
                                 lambda: math_ops.multiply(y, 3))
       self.assertEqual(z.eval(), 9)
 def fn():
   two = constant_op.constant(2.0, name='two')
   ten = constant_op.constant(10.0, name='ten')
   twenty = math_ops.multiply(two, ten, name='twenty')
   three = constant_op.constant(3.0, name='three')
   with framework_ops.colocate_with(twenty):
     thirty = math_ops.multiply(three, ten, name='thirty')
   return ten, twenty, thirty
 def decayed_lr():
   """Helper to recompute learning rate; most helpful in eager-mode."""
   global_step_recomp = math_ops.cast(global_step, dtype)
   p = global_step_recomp / decay_steps
   if staircase:
     p = math_ops.floor(p)
   exponent = math_ops.exp(
       math_ops.multiply(math_ops.negative(decay_rate), p))
   return math_ops.multiply(learning_rate, exponent, name=name)
Example #20
0
  def testFeedOneHandleDirectly(self):
    with self.test_session() as sess:
      a = constant_op.constant(10.0)
      b = constant_op.constant(5.0)
      c = math_ops.multiply(a, b)
      d = math_ops.multiply(c, c)

      h_c = sess.run(session_ops.get_session_handle(c))

      self.assertAllClose(2500.0, sess.run(d, feed_dict={c: h_c}))
  def testScan_Simple(self):
    elems = constant_op.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], name="data")
    v = constant_op.constant(2.0, name="v")

    # pylint: disable=unnecessary-lambda
    r = functional_ops.scan(lambda a, x: math_ops.multiply(a, x), elems)
    self.assertAllEqual([1., 2., 6., 24., 120., 720.], self.evaluate(r))

    r = functional_ops.scan(
        lambda a, x: math_ops.multiply(a, x), elems, initializer=v)
    self.assertAllEqual([2., 4., 12., 48., 240., 1440.], self.evaluate(r))
Example #22
0
  def testHandleAndValue(self):
    with self.test_session() as sess:
      # Return a handle and a value.
      a = constant_op.constant(10)
      b = constant_op.constant(5)
      c = math_ops.multiply(a, b)
      h = session_ops.get_session_handle(c)
      v = math_ops.multiply(a, c)
      h, v = sess.run([h, v])

      self.assertEqual(50, h.eval())
      self.assertEqual(500, v)
Example #23
0
def log_loss(labels, predictions, weights=1.0, epsilon=1e-7, scope=None,
             loss_collection=ops.GraphKeys.LOSSES,
             reduction=Reduction.SUM_BY_NONZERO_WEIGHTS):
  """Adds a Log Loss term to the training procedure.

  `weights` acts as a coefficient for the loss. If a scalar is provided, then
  the loss is simply scaled by the given value. If `weights` is a tensor of size
  `[batch_size]`, then the total loss for each sample of the batch is rescaled
  by the corresponding element in the `weights` vector. If the shape of
  `weights` matches the shape of `predictions`, then the loss of each
  measurable element of `predictions` is scaled by the corresponding value of
  `weights`.

  Args:
    labels: The ground truth output tensor, same dimensions as 'predictions'.
    predictions: The predicted outputs.
    weights: Optional `Tensor` whose rank is either 0, or the same rank as
      `labels`, and must be broadcastable to `labels` (i.e., all dimensions must
      be either `1`, or the same as the corresponding `losses` dimension).
    epsilon: A small increment to add to avoid taking a log of zero.
    scope: The scope for the operations performed in computing the loss.
    loss_collection: collection to which the loss will be added.
    reduction: Type of reduction to apply to loss.

  Returns:
    Weighted loss float `Tensor`. If `reduction` is `NONE`, this has the same
    shape as `labels`; otherwise, it is scalar.

  Raises:
    ValueError: If the shape of `predictions` doesn't match that of `labels` or
      if the shape of `weights` is invalid.  Also if `labels` or `predictions`
      is None.

  @compatibility(eager)
  The `loss_collection` argument is ignored when executing eagerly. Consider
  holding on to the return value or collecting losses via a `tf.keras.Model`.
  @end_compatibility
  """
  if labels is None:
    raise ValueError("labels must not be None.")
  if predictions is None:
    raise ValueError("predictions must not be None.")
  with ops.name_scope(scope, "log_loss",
                      (predictions, labels, weights)) as scope:
    predictions = math_ops.to_float(predictions)
    labels = math_ops.to_float(labels)
    predictions.get_shape().assert_is_compatible_with(labels.get_shape())
    losses = -math_ops.multiply(
        labels,
        math_ops.log(predictions + epsilon)) - math_ops.multiply(
            (1 - labels), math_ops.log(1 - predictions + epsilon))
    return compute_weighted_loss(
        losses, weights, scope, loss_collection, reduction=reduction)
  def testFold_Grad(self):
    with self.cached_session():
      elems = constant_op.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], name="data")
      v = constant_op.constant(2.0, name="v")
      r = functional_ops.foldl(
          lambda a, x: math_ops.multiply(a, x), elems, initializer=v)
      r = gradients_impl.gradients(r, v)[0]
      self.assertAllEqual(720.0, self.evaluate(r))

      r = functional_ops.foldr(
          lambda a, x: math_ops.multiply(a, x), elems, initializer=v)
      r = gradients_impl.gradients(r, v)[0]
      self.assertAllEqual(720.0, self.evaluate(r))
  def testFoldl_Simple(self):
    elems = constant_op.constant([1, 2, 3, 4, 5, 6], name="data")

    r = functional_ops.foldl(
        lambda a, x: math_ops.multiply(math_ops.add(a, x), 2),
        elems)
    self.assertAllEqual(208, self.evaluate(r))

    r = functional_ops.foldl(
        lambda a, x: math_ops.multiply(math_ops.add(a, x), 2),
        elems,
        initializer=10)
    self.assertAllEqual(880, self.evaluate(r))
Example #26
0
  def testHandleBasic(self):
    with self.test_session() as sess:
      # Return a handle.
      a = constant_op.constant(10)
      b = constant_op.constant(5)
      c = math_ops.multiply(a, b)
      h = session_ops.get_session_handle(c)
      h = sess.run(h)

      # Feed a tensor handle.
      f, x = session_ops.get_session_tensor(h.handle, dtypes.int32)
      y = math_ops.multiply(x, 10)
      self.assertEqual(500, sess.run(y, feed_dict={f: h.handle}))
Example #27
0
def _num_present(losses, weights, per_batch=False):
  """Computes the number of elements in the loss function induced by `weights`.

  A given weights tensor induces different numbers of usable elements in the
  `losses` tensor. The `weights` tensor is broadcast across `losses` for all
  possible dimensions. For example, if `losses` is a tensor of dimension
  `[4, 5, 6, 3]` and `weights` is a tensor of shape `[4, 5]`, then `weights` is,
  in effect, tiled to match the shape of `losses`. Following this effective
  tile, the total number of present elements is the number of non-zero weights.

  Args:
    losses: `Tensor` of shape `[batch_size, d1, ... dN]`.
    weights: `Tensor` of shape `[]`, `[batch_size]` or
      `[batch_size, d1, ... dK]`, where K < N.
    per_batch: Whether to return the number of elements per batch or as a sum
      total.

  Returns:
    The number of present (non-zero) elements in the losses tensor. If
      `per_batch` is `True`, the value is returned as a tensor of size
      `[batch_size]`. Otherwise, a single scalar tensor is returned.
  """
  # If weights is a scalar, its easy to compute:
  if weights.get_shape().ndims == 0:
    if losses.get_shape().ndims == 0:
      batch_size = 1
    else:
      batch_size = array_ops.reshape(array_ops.slice(array_ops.shape(losses),
                                                     [0], [1]), [])
    num_per_batch = math_ops.div(math_ops.to_float(array_ops.size(losses)),
                                 math_ops.to_float(batch_size))
    num_per_batch = array_ops.where(math_ops.equal(weights, 0),
                                    0.0, num_per_batch)
    num_per_batch = math_ops.multiply(array_ops.ones(
        array_ops.reshape(batch_size, [1])), num_per_batch)
    return num_per_batch if per_batch else math_ops.reduce_sum(num_per_batch)

  # First, count the number of nonzero weights.
  if weights.get_shape().ndims >= 1:
    reduction_indices = list(range(1, weights.get_shape().ndims))
    num_nonzero_per_batch = math_ops.reduce_sum(
        math_ops.to_float(math_ops.not_equal(weights, 0)),
        reduction_indices=reduction_indices)

  # Next, determine the number of elements that weight would broadcast to:
  broadcast_dims = array_ops.slice(array_ops.shape(losses),
                                   [weights.get_shape().ndims], [-1])
  num_to_broadcast = math_ops.to_float(math_ops.reduce_prod(broadcast_dims))

  num_per_batch = math_ops.multiply(num_nonzero_per_batch, num_to_broadcast)
  return num_per_batch if per_batch else math_ops.reduce_sum(num_per_batch)
Example #28
0
  def testTrainingLoop(self):
    with imperative_mode.ImperativeMode(self._target) as mode:
      w = variables.Variable(np.random.rand(3))
      x = constant_op.constant(np.random.rand(3))
      y = math_ops.multiply(x, w)
      dw = gradients_impl.gradients(y, w)
      self.assertAllClose(dw[0].value, x.value)

      for _ in range(3):
        with mode.new_step():
          x = constant_op.constant(np.random.rand(3))
          y = math_ops.multiply(x, w)
          dw = gradients_impl.gradients(y, w)
          self.assertAllClose(dw[0].value, x.value)
  def testFoldr_Simple(self):
    with self.test_session():
      elems = constant_op.constant([1, 2, 3, 4, 5, 6], name="data")

      r = functional_ops.foldr(
          lambda a, x: math_ops.multiply(math_ops.add(a, x), 2),
          elems)
      self.assertAllEqual(450, r.eval())

      r = functional_ops.foldr(
          lambda a, x: math_ops.multiply(math_ops.add(a, x), 2),
          elems,
          initializer=10)
      self.assertAllEqual(1282, r.eval())
Example #30
0
  def _update_mask(self, weights, threshold):
    """Updates the mask for a given weight tensor.

    This functions first computes the cdf of the weight tensor, and estimates
    the threshold value such that 'desired_sparsity' fraction of weights
    have magnitude less than the threshold.

    Args:
      weights: The weight tensor that needs to be masked.
      threshold: The current threshold value. The function will compute a new
        threshold and return the exponential moving average using the current
        value of threshold

    Returns:
      new_threshold: The new value of the threshold based on weights, and
        sparsity at the current global_step
      new_mask: A numpy array of the same size and shape as weights containing
        0 or 1 to indicate which of the values in weights falls below
        the threshold

    Raises:
      ValueError: if sparsity is not defined
    """
    if self._sparsity is None:
      raise ValueError('Sparsity variable undefined')

    sparsity = self._get_sparsity(weights.op.name)

    with ops.name_scope(weights.op.name + '_pruning_ops'):
      abs_weights = math_ops.abs(weights)
      max_value = math_ops.reduce_max(abs_weights)
      cdf_fn = pruning_utils.compute_cdf_from_histogram
      if self._spec.use_tpu:
        cdf_fn = pruning_utils.compute_cdf

      norm_cdf = cdf_fn(abs_weights, [0.0, max_value], nbins=self._spec.nbins)
      current_threshold = math_ops.multiply(
          math_ops.div(
              math_ops.reduce_sum(
                  math_ops.cast(
                      math_ops.less(norm_cdf, sparsity), dtypes.float32)),
              float(self._spec.nbins)), max_value)

      smoothed_threshold = math_ops.add_n([
          math_ops.multiply(current_threshold, 1 - self._spec.threshold_decay),
          math_ops.multiply(threshold, self._spec.threshold_decay)
      ])
      new_mask = math_ops.cast(
          math_ops.greater(abs_weights, smoothed_threshold), dtypes.float32)
    return smoothed_threshold, new_mask
Example #31
0
def compute_weighted_loss(
    losses, weights=1.0, scope=None, loss_collection=ops.GraphKeys.LOSSES,
    reduction=Reduction.SUM_BY_NONZERO_WEIGHTS):
  """Computes the weighted loss.

  Args:
    losses: `Tensor` of shape `[batch_size, d1, ... dN]`.
    weights: Optional `Tensor` whose rank is either 0, or the same rank as
      `losses`, and must be broadcastable to `losses` (i.e., all dimensions must
      be either `1`, or the same as the corresponding `losses` dimension).
    scope: the scope for the operations performed in computing the loss.
    loss_collection: the loss will be added to these collections.
    reduction: Type of reduction to apply to loss.

  Returns:
    Weighted loss `Tensor` of the same type as `losses`. If `reduction` is
    `NONE`, this has the same shape as `losses`; otherwise, it is scalar.

  Raises:
    ValueError: If `weights` is `None` or the shape is not compatible with
      `losses`, or if the number of dimensions (rank) of either `losses` or
      `weights` is missing.

  Note:
    When calculating the gradient of a weighted loss contributions from
    both `losses` and `weights` are considered. If your `weights` depend
    on some model parameters but you do not want this to affect the loss
    gradient, you need to apply `tf.stop_gradient` to `weights` before
    passing them to `compute_weighted_loss`.

  @compatibility(eager)
  The `loss_collection` argument is ignored when executing eagerly. Consider
  holding on to the return value or collecting losses via a `tf.keras.Model`.
  @end_compatibility
  """
  Reduction.validate(reduction)
  with ops.name_scope(scope, "weighted_loss", (losses, weights)):
    # Save the `reduction` argument for loss normalization when distributing
    # to multiple replicas. Used only for estimator + v1 optimizer flow.
    ops.get_default_graph()._last_loss_reduction = reduction  # pylint: disable=protected-access

    with ops.control_dependencies((
        weights_broadcast_ops.assert_broadcastable(weights, losses),)):
      losses = ops.convert_to_tensor(losses)
      input_dtype = losses.dtype
      losses = math_ops.cast(losses, dtype=dtypes.float32)
      weights = math_ops.cast(weights, dtype=dtypes.float32)
      weighted_losses = math_ops.multiply(losses, weights)
      if reduction == Reduction.NONE:
        loss = weighted_losses
      else:
        loss = math_ops.reduce_sum(weighted_losses)
        if reduction == Reduction.MEAN:
          loss = _safe_mean(
              loss, math_ops.reduce_sum(array_ops.ones_like(losses) * weights))
        elif (reduction == Reduction.SUM_BY_NONZERO_WEIGHTS or
              reduction == Reduction.SUM_OVER_NONZERO_WEIGHTS):
          loss = _safe_mean(loss, _num_present(losses, weights))
        elif reduction == Reduction.SUM_OVER_BATCH_SIZE:
          loss = _safe_mean(loss, _num_elements(losses))

      # Convert the result back to the input type.
      loss = math_ops.cast(loss, input_dtype)
      util.add_loss(loss, loss_collection)
      return loss
Example #32
0
def triplet_semihard_loss(labels, embeddings, margin=1.0):
    """Computes the triplet loss with semi-hard negative mining.
      The loss encourages the positive distances (between a pair of embeddings with
      the same labels) to be smaller than the minimum negative distance among
      which are at least greater than the positive distance plus the margin constant
      (called semi-hard negative) in the mini-batch. If no such negative exists,
      uses the largest negative distance instead.
      See: https://arxiv.org/abs/1503.03832.
      Args:
        labels: 1-D tf.int32 `Tensor` with shape [batch_size] of
          multiclass integer labels.
        embeddings: 2-D float `Tensor` of embedding vectors. Embeddings should
          be l2 normalized.
        margin: Float, margin term in the loss definition.
      Returns:
        triplet_loss: tf.float32 scalar.
    """
    # Reshape [batch_size] label tensor to a [batch_size, 1] label tensor.
    lshape = array_ops.shape(labels)
    assert lshape.shape == 1
    labels = array_ops.reshape(labels, [lshape[0], 1])

    # Build pairwise squared distance matrix.
    pdist_matrix = pairwise_distance(embeddings, squared=True)
    # Build pairwise binary adjacency matrix.
    adjacency = math_ops.equal(labels, array_ops.transpose(labels))
    # Invert so we can select negatives only.
    adjacency_not = math_ops.logical_not(adjacency)

    batch_size = array_ops.size(labels)

    # Compute the mask.
    pdist_matrix_tile = array_ops.tile(pdist_matrix, [batch_size, 1])
    mask = math_ops.logical_and(
        array_ops.tile(adjacency_not, [batch_size, 1]),
        math_ops.greater(
            pdist_matrix_tile,
            array_ops.reshape(array_ops.transpose(pdist_matrix), [-1, 1])))
    mask_final = array_ops.reshape(
        math_ops.greater(
            math_ops.reduce_sum(math_ops.cast(mask, dtype=dtypes.float32),
                                1,
                                keep_dims=True), 0.0),
        [batch_size, batch_size])
    mask_final = array_ops.transpose(mask_final)

    adjacency_not = math_ops.cast(adjacency_not, dtype=dtypes.float32)
    mask = math_ops.cast(mask, dtype=dtypes.float32)

    # negatives_outside: smallest D_an where D_an > D_ap.
    negatives_outside = array_ops.reshape(
        masked_minimum(pdist_matrix_tile, mask), [batch_size, batch_size])
    negatives_outside = array_ops.transpose(negatives_outside)

    # negatives_inside: largest D_an.
    negatives_inside = array_ops.tile(
        masked_maximum(pdist_matrix, adjacency_not), [1, batch_size])
    semi_hard_negatives = array_ops.where(mask_final, negatives_outside,
                                          negatives_inside)

    loss_mat = math_ops.add(margin, pdist_matrix - semi_hard_negatives)

    mask_positives = math_ops.cast(adjacency,
                                   dtype=dtypes.float32) - array_ops.diag(
                                       array_ops.ones([batch_size]))

    # In lifted-struct, the authors multiply 0.5 for upper triangular
    #   in semihard, they take all positive pairs except the diagonal.
    num_positives = math_ops.reduce_sum(mask_positives)

    _triplet_loss = math_ops.truediv(math_ops.reduce_sum(
        math_ops.maximum(math_ops.multiply(loss_mat, mask_positives), 0.0)),
                                     num_positives,
                                     name='triplet_semihard_loss')
    return _triplet_loss
Example #33
0
def logloss(y_true, y_pred, epsilon=1e-7):
    losses = math_ops.multiply(y_true, math_ops.log(y_pred + epsilon))
    losses += math_ops.multiply((1 - y_true),
                                math_ops.log(1 - y_pred + epsilon))
    return K.mean(-losses, axis=-1)
Example #34
0
 def _double_values(x):
   custom = op_hint.OpHint("add_test")
   x, = custom.add_inputs(x)
   output = math_ops.multiply(x, x)
   output, = custom.add_outputs(output)
   return output
def npairs_loss_multilabel(sparse_labels, embeddings_anchor,
                           embeddings_positive, reg_lambda=0.002,
                           print_losses=False):
  r"""Computes the npairs loss with multilabel data.

  Npairs loss expects paired data where a pair is composed of samples from the
  same labels and each pairs in the minibatch have different labels. The loss
  has two components. The first component is the L2 regularizer on the
  embedding vectors. The second component is the sum of cross entropy loss
  which takes each row of the pair-wise similarity matrix as logits and
  the remapped one-hot labels as labels. Here, the similarity is defined by the
  dot product between two embedding vectors. S_{i,j} = f(x_i)^T f(x_j)

  To deal with multilabel inputs, we use the count of label intersection
  i.e. L_{i,j} = | set_of_labels_for(i) \cap set_of_labels_for(j) |
  Then we normalize each rows of the count based label matrix so that each row
  sums to one.

  Args:
    sparse_labels: List of 1-D Boolean `SparseTensor` of dense_shape
                   [batch_size/2, num_classes] labels for the anchor-pos pairs.
    embeddings_anchor: 2-D `Tensor` of shape [batch_size/2, embedding_dim] for
      the embedding vectors for the anchor images. Embeddings should not be
      l2 normalized.
    embeddings_positive: 2-D `Tensor` of shape [batch_size/2, embedding_dim] for
      the embedding vectors for the positive images. Embeddings should not be
      l2 normalized.
    reg_lambda: Float. L2 regularization term on the embedding vectors.
    print_losses: Boolean. Option to print the xent and l2loss.

  Returns:
    npairs_loss: tf.float32 scalar.
  Raises:
    TypeError: When the specified sparse_labels is not a `SparseTensor`.
  """
  if False in [isinstance(
      l, sparse_tensor.SparseTensor) for l in sparse_labels]:
    raise TypeError(
        'sparse_labels must be a list of SparseTensors, but got %s' % str(
            sparse_labels))

  with ops.name_scope('NpairsLossMultiLabel'):
    # Add the regularizer on the embedding.
    reg_anchor = math_ops.reduce_mean(
        math_ops.reduce_sum(math_ops.square(embeddings_anchor), 1))
    reg_positive = math_ops.reduce_mean(
        math_ops.reduce_sum(math_ops.square(embeddings_positive), 1))
    l2loss = math_ops.multiply(0.25 * reg_lambda,
                               reg_anchor + reg_positive, name='l2loss')

    # Get per pair similarities.
    similarity_matrix = math_ops.matmul(
        embeddings_anchor, embeddings_positive, transpose_a=False,
        transpose_b=True)

    # TODO(coreylynch): need to check the sparse values
    # TODO(coreylynch): are composed only of 0's and 1's.

    multilabel_adjacency_matrix = _build_multilabel_adjacency(sparse_labels)
    labels_remapped = math_ops.to_float(multilabel_adjacency_matrix)
    labels_remapped /= math_ops.reduce_sum(labels_remapped, 1, keep_dims=True)

    # Add the softmax loss.
    xent_loss = nn.softmax_cross_entropy_with_logits(
        logits=similarity_matrix, labels=labels_remapped)
    xent_loss = math_ops.reduce_mean(xent_loss, name='xentropy')

    if print_losses:
      xent_loss = logging_ops.Print(
          xent_loss, ['cross entropy:', xent_loss, 'l2loss:', l2loss])

    return l2loss + xent_loss
Example #36
0
    def testStripUnusedMultipleInputs(self):
        input_graph_name = "input_graph.pb"
        output_graph_name = "output_graph.pb"

        # We'll create an input graph that multiplies two input nodes.
        with ops.Graph().as_default():
            constant_node1 = constant_op.constant(1.0, name="constant_node1")
            constant_node2 = constant_op.constant(2.0, name="constant_node2")
            input_node1 = math_ops.sub(constant_node1, 3.0, name="input_node1")
            input_node2 = math_ops.sub(constant_node2, 5.0, name="input_node2")
            output_node = math_ops.multiply(input_node1,
                                            input_node2,
                                            name="output_node")
            math_ops.add(output_node, 2.0, name="later_node")
            sess = session.Session()
            output = sess.run(output_node)
            self.assertNear(6.0, output, 0.00001)
            graph_io.write_graph(sess.graph, self.get_temp_dir(),
                                 input_graph_name)

        # We save out the graph to disk, and then call the const conversion
        # routine.
        input_graph_path = os.path.join(self.get_temp_dir(), input_graph_name)
        input_binary = False
        input_node_names = "input_node1,input_node2"
        input_node_types = [
            dtypes.float32.as_datatype_enum, dtypes.float32.as_datatype_enum
        ]
        output_binary = True
        output_node_names = "output_node"
        output_graph_path = os.path.join(self.get_temp_dir(),
                                         output_graph_name)

        strip_unused_lib.strip_unused_from_files(
            input_graph_path, input_binary, output_graph_path, output_binary,
            input_node_names, output_node_names, input_node_types)

        # Now we make sure the variable is now a constant, and that the graph still
        # produces the expected result.
        with ops.Graph().as_default():
            output_graph_def = graph_pb2.GraphDef()
            with open(output_graph_path, "rb") as f:
                output_graph_def.ParseFromString(f.read())
                _ = importer.import_graph_def(output_graph_def, name="")

            self.assertEqual(3, len(output_graph_def.node))
            for node in output_graph_def.node:
                self.assertNotEqual("Add", node.op)
                self.assertNotEqual("Sub", node.op)
                if node.name == input_node_names:
                    self.assertTrue("shape" in node.attr)

            with session.Session() as sess:
                input_node1 = sess.graph.get_tensor_by_name("input_node1:0")
                input_node2 = sess.graph.get_tensor_by_name("input_node2:0")
                output_node = sess.graph.get_tensor_by_name("output_node:0")
                output = sess.run(output_node,
                                  feed_dict={
                                      input_node1: [10.0],
                                      input_node2: [-5.0]
                                  })
                self.assertNear(-50.0, output, 0.00001)
def _einsum_reduction(t0, t0_axis_labels, t1, t1_axis_labels, axes_to_sum):
    """Helper for einsum() that computes the result of a two-argument einsum().

  Args:
    t0: a `Tensor`
    t0_axis_labels: a string of axis labels.  This string's length must equal
      the rank of t0.
    t1: a `Tensor`
    t1_axis_labels: a string to axis labels.  This string's length must equal
      the rank of t1.
    axes_to_sum: set of labels of axes to be summed over

  Returns:
    A `Tensor` whose elements are obtained by summing, over all axes in
    `axes_to_sum`, the corresponding elements of `t0` and `t1`.

    For example, if t0_axis_labels == 'abijk', t1_axis_labels == 'acjkl', and
    axes_to_sum == {j,k}, this will return a tensor x where

      out[a,b,c,i,l] = sum_j sum_k t0[a,b,i,j,k] * t1[a,c,j,k,l]

  Raises:
    ValueError: if the rank of `t0` does not match the length of
      `t0_axis_labels`, or that of `t1` does not match the length of
      `t1_axis_labels`.
  """
    if len(t0_axis_labels) != len(t0.get_shape()):
        raise ValueError()
    if len(t1_axis_labels) != len(t1.get_shape()):
        raise ValueError()

    # This function computes the result of a two-argument einsum() using batch
    # matrix multiplication.  This involves
    # 1. transposing t0 and t1 so that axes are in the correct order for
    #    batch matrix multiplication, and
    # 2. reshaping t0 and t1 so that they are both of rank 3.

    # First, we divide axes into three groups:
    #  * "preserved" axes are present in both inputs and the output
    #  * "summed" axes are present in both inputs but not the output
    #  * "broadcast" axes are present in exactly one input and the output
    #
    # As an example, if the einsum is abijk,acjkl->abcil, then "a" is a
    # preserved axis, "b" and "c" are broadcast axes, and "j" and "k" are
    # summed axes.
    assert all(a in t0_axis_labels and a in t1_axis_labels
               for a in axes_to_sum)
    preserved_axes = (set(t0_axis_labels) & set(t1_axis_labels)) - axes_to_sum
    broadcast_axes = {}
    for i, sym_list in enumerate([t0_axis_labels, t1_axis_labels]):
        broadcast_axes[i] = set(sym_list) - preserved_axes - axes_to_sum

    # Reorder the axes so that:
    # 1. preserved axes come first in both inputs
    # 2. in input 0, broadcast axes come next, followed by summed axes
    # 3. in input 1, summed axes come next, followed by broadcast axes
    def sort_key(input_index, a):
        if a in preserved_axes:
            return (-1, a)
        elif ((input_index == 0 and a in broadcast_axes[0])
              or (input_index == 1 and a in axes_to_sum)):
            return (0, a)
        else:
            return (1, a)

    axis_labels = [t0_axis_labels, t1_axis_labels]
    sorted_axes = [
        sorted(sym_list, key=lambda a: sort_key(i, a))
        for i, sym_list in enumerate(axis_labels)
    ]
    inputs = [t0, t1]
    for i, axes_str in enumerate(axis_labels):
        perm = [axes_str.find(a) for a in sorted_axes[i]]
        inputs[i] = _transpose_if_necessary(inputs[i], perm)
    t0, t1 = inputs

    if not axes_to_sum:
        # In the special case where there are no axes to sum over, reduce to mul()
        # rather than to batch matrix multiplication.
        for _ in broadcast_axes[1]:
            t0 = array_ops.expand_dims(t0, -1)
        for _ in broadcast_axes[0]:
            t1 = array_ops.expand_dims(t1, len(preserved_axes))
        product = math_ops.multiply(t0, t1)
        product_axes = sorted_axes[0] + sorted_axes[1][len(preserved_axes):]
        return product, ''.join(product_axes)
    else:
        # Reduce to matmul().

        # Reshape both inputs so as to combine multiple broadcast axes
        # into a single axis, and combine multiple summed axes into a
        # single axis.

        t0_shape = tuple(x.value for x in t0.get_shape())
        num_broadcast_elements_t0 = _total_size(
            t0_shape[len(preserved_axes):-len(axes_to_sum)])
        num_summed_elements = _total_size(t0_shape[-len(axes_to_sum):])
        new_shape = t0_shape[:len(preserved_axes)] + (
            num_broadcast_elements_t0, num_summed_elements)
        t0 = _reshape_if_necessary(t0, new_shape)

        t1_shape = tuple(x.value for x in t1.get_shape())
        num_broadcast_elements_t1 = _total_size(t1_shape[len(preserved_axes) +
                                                         len(axes_to_sum):])
        new_shape = t1_shape[:len(preserved_axes)] + (
            num_summed_elements, num_broadcast_elements_t1)
        t1 = _reshape_if_necessary(t1, new_shape)

        product = math_ops.matmul(t0, t1)

        # Undo compaction of broadcast axes
        uncompacted_shape = (
            t0_shape[:len(preserved_axes) + len(broadcast_axes[0])] +
            t1_shape[len(t1_shape) - len(broadcast_axes[1]):])
        product = _reshape_if_necessary(product, uncompacted_shape)

        product_axes = (
            sorted_axes[0][:len(preserved_axes) + len(broadcast_axes[0])] +
            sorted_axes[1][len(sorted_axes[1]) - len(broadcast_axes[1]):])

        return product, ''.join(product_axes)
Example #38
0
def _FindFusedBatchNorms(graph):
  """Finds all ops and tensors related to found FusedBatchNorms.

  Args:
    graph: Graph to inspect.

  Yields:
    _FusedBatchNormMatches.
  """
  input_pattern = graph_matcher.OpTypePattern('*')
  weight_pattern = graph_matcher.OpTypePattern('*')
  gamma_pattern = graph_matcher.OpTypePattern('*')
  beta_pattern = graph_matcher.OpTypePattern('*')
  mean_pattern = graph_matcher.OpTypePattern('*')
  variance_pattern = graph_matcher.OpTypePattern('*')

  moving_average_pattern = graph_matcher.OpTypePattern('*')
  bn_decay_pattern = graph_matcher.OpTypePattern('*')
  layer_pattern = graph_matcher.OpTypePattern(
      'Conv2D|DepthwiseConv2dNative|MatMul',
      inputs=[input_pattern, weight_pattern])
  # MatMul has a Reshape between it and FusedBatchNorm.
  matmul_reshape_pattern = graph_matcher.OpTypePattern(
      'Reshape', inputs=[layer_pattern,
                         graph_matcher.OpTypePattern('*')])

  batch_norm_pattern = graph_matcher.OpTypePattern(
      'FusedBatchNorm',
      inputs=[
          graph_matcher.OneofPattern([matmul_reshape_pattern, layer_pattern]),
          gamma_pattern, beta_pattern, mean_pattern, variance_pattern
      ])
  matmul_bn_output_reshape_pattern = graph_matcher.OpTypePattern(
      'Reshape', inputs=[batch_norm_pattern,
                         graph_matcher.OpTypePattern('*')])

  bn_matcher = graph_matcher.GraphMatcher(
      graph_matcher.OneofPattern(
          [matmul_bn_output_reshape_pattern, batch_norm_pattern]))

  moving_average_sub_pattern = graph_matcher.OpTypePattern(
      'Sub', inputs=[moving_average_pattern, batch_norm_pattern])
  moving_average_mul_pattern = graph_matcher.OpTypePattern(
      'Mul', inputs=[moving_average_sub_pattern, bn_decay_pattern])

  moving_avg_mul_matcher = graph_matcher.GraphMatcher(
      moving_average_mul_pattern)

  for match_result in bn_matcher.match_graph(graph):
    moving_mean_tensor = None
    moving_variance_tensor = None
    bn_decay_mean_tensor = None
    bn_decay_var_tensor = None
    layer_op = match_result.get_op(layer_pattern)
    layer_tensor = match_result.get_tensor(layer_pattern)
    bn_op = match_result.get_op(batch_norm_pattern)
    batch_epsilon_tensor = bn_op.get_attr('epsilon')

    # In the MatMul case, the output of batch norm is reshaped back into a
    # 2D tensor, so the output_tensor is the output of the Reshape op.
    output_tensor = bn_op.outputs[0]
    if layer_op.type == 'MatMul':
      output_reshape_op = match_result.get_op(matmul_bn_output_reshape_pattern)
      # If the matcher didn't match matmul_bn_output_reshape, there will be
      # another match for this 'MatMul' later, so we can skip this one.
      if output_reshape_op is None:
        continue
      output_tensor = output_reshape_op.outputs[0]

    input_tensor = match_result.get_tensor(input_pattern)
    weight_tensor = match_result.get_tensor(weight_pattern)
    gamma_tensor = match_result.get_tensor(gamma_pattern)
    beta_tensor = match_result.get_tensor(beta_pattern)
    # FusedBatchNorm in training is different from that in inference. It takes
    # empty 'mean' and empty 'variance', and produces the mean and the variance
    # of the batch. Therefore, when is_training is true, mean_tensor and
    # variance_tensor point to 1st and 2nd (0-based) output of bn_op,
    # respectively; when is_training is false, they point to bn_op's inputs.
    is_training = bn_op.get_attr('is_training')
    if is_training:
      # FusedBatchNormGrad doesn't compute gradients of the batch_mean and
      # batch_variance outputs, so we need to substitute our own custom
      # gradient.
      # TODO(suharshs, raghuramank): Find a way to avoid needing this hack.
      # pylint: disable=protected-access
      bn_op._set_attr(
          '_gradient_op_type',
          attr_value_pb2.AttrValue(s=compat.as_bytes('FoldFusedBatchNormGrad')))
      # pylint: enable=protected-access
      mean_tensor = bn_op.outputs[1]
      # The batch variance used during forward and backward prop is biased,
      # i.e it is calculated as: V=sum(x(k)-mu)^2/N. For the moving average
      # calculation, the variance is corrected by the term N/N-1 (Bessel's
      # correction). The variance tensor read from FuseBatchNorm has bessel's
      # correction applied, so we undo it here.
      scope, sep, _ = bn_op.name.rpartition('/')
      g = ops.get_default_graph()
      with g.as_default(), g.name_scope(scope + sep):
        n = math_ops.cast(
            array_ops.size(layer_tensor) / array_ops.size(mean_tensor),
            dtypes.float32)
        variance_tensor = math_ops.multiply(
            bn_op.outputs[2], (n - 1) / n, name='Undo_Bessel_Correction')
      # TODO(suharshs): Find a way to get rid of this inner match.
      for mul_match_result in moving_avg_mul_matcher.match_graph(graph):
        sub_op = mul_match_result.get_op(moving_average_sub_pattern)
        if sub_op.inputs[1].name == bn_op.outputs[1].name:
          # During training: Batch Mean is bn_op.outputs[1]
          moving_mean_tensor = sub_op.inputs[0]
          bn_decay_mean_tensor = mul_match_result.get_tensor(bn_decay_pattern)
        if sub_op.inputs[1].name == bn_op.outputs[2].name:
          # During training: Batch Var is bn_op.outputs[2]
          moving_variance_tensor = sub_op.inputs[0]
          bn_decay_var_tensor = mul_match_result.get_tensor(bn_decay_pattern)
    else:
      mean_tensor = match_result.get_tensor(mean_pattern)
      variance_tensor = match_result.get_tensor(variance_pattern)

    yield _BatchNormMatch(
        layer_op=layer_op,
        bn_op=bn_op,
        output_tensor=output_tensor,
        input_tensor=input_tensor,
        weight_tensor=weight_tensor,
        gamma_tensor=gamma_tensor,
        beta_tensor=beta_tensor,
        mean_tensor=mean_tensor,
        variance_tensor=variance_tensor,
        moving_mean_tensor=moving_mean_tensor,
        moving_variance_tensor=moving_variance_tensor,
        bn_decay_mean_tensor=bn_decay_mean_tensor,
        bn_decay_var_tensor=bn_decay_var_tensor,
        batch_epsilon_tensor=batch_epsilon_tensor)
    def testArithmeticOptimizationActive(self):
        """Tests that tfdbg can dump the tensor from nodes created by Grappler."""
        with session.Session(
                config=_grappler_enabled_session_config()) as sess:
            u = variables.VariableV1([[1, 2], [3, 4]],
                                     name="u",
                                     dtype=dtypes.float32)
            # The next two ops should be optimized by Grappler into a single op:
            # either an AddN op or a Mul op.
            x = math_ops.add(u, u)
            x = math_ops.add(x, u)
            y = math_ops.multiply(x, u)

            sess.run(variables.global_variables_initializer())

            run_options = config_pb2.RunOptions(output_partition_graphs=True)
            debug_utils.watch_graph(run_options,
                                    sess.graph,
                                    debug_ops=["DebugIdentity"],
                                    debug_urls=[self._debug_url])

            run_metadata = config_pb2.RunMetadata()
            run_result = sess.run(y,
                                  options=run_options,
                                  run_metadata=run_metadata)
            self.assertAllClose(run_result, [[3, 12], [27, 48]])

            dump_data = debug_data.DebugDumpDir(
                self._dump_root,
                partition_graphs=run_metadata.partition_graphs,
                validate=True)

            original_node_names = set(
                [op.name for op in sess.graph.get_operations()])
            dumped_node_names = set(dump_data.nodes())
            grappler_created_node_names = dumped_node_names - original_node_names
            grappler_removed_node_names = original_node_names - dumped_node_names

            # Assert that Grappler should have replaced some of the nodes from the
            # original graph with new nodes.
            self.assertTrue(grappler_created_node_names)
            self.assertTrue(grappler_removed_node_names)

            # Iterate through the nodes created by Grappler. One of them should be
            # be the result of replacing the original add ops with an AddN op or a
            # Mul op.
            found_optimized_node = False
            for grappler_node_name in grappler_created_node_names:
                node_op_type = dump_data.node_op_type(grappler_node_name)
                # Look for the node created by Grappler's arithmetic optimization.
                if ((test_util.IsMklEnabled()
                     and node_op_type in ("_MklAddN", "Mul"))
                        or (node_op_type in ("AddN", "Mul"))):
                    datum = dump_data.get_tensors(grappler_node_name, 0,
                                                  "DebugIdentity")
                    self.assertEqual(1, len(datum))
                    self.assertAllClose(datum[0], [[3, 6], [9, 12]])
                    found_optimized_node = True
                    break
            self.assertTrue(
                found_optimized_node,
                "Failed to find optimized node created by Grappler's arithmetic "
                "optimization.")
Example #40
0
def _ComputeBatchNormCorrections(context, match, freeze_batch_norm_delay,
                                 fused_batch_norm):
  """Computes batch norm correction params.

     Before batch normalization is frozen:
     We use batch statistics for batch norm.
       correction_scale = sigma_b/sigma_mv
       correction_recip = 1/correction_scale
       correction_offset = 0

     After batch normalization is frozen:
      correction_scale = sigma_b/sigma_mv
      correction_recip = 1
      correction_offset =  gamma*(mu_b/sigma_b-mu_mv/sigma_mv).

     Batch norm is frozen if global_step > bn_freeze_delay.
     The corrections ensure that:
     a) The weights are quantized after scaling by gamma/sigma_mv. This enables
     smoother training as the scaling on the weights changes slowly, rather than
     jump across mini-batches
     b) Changing the values of the corrections allows for one to switch between
     using batch statistics to using moving mean and average, without requiring
     changes to batch_norm


  Args:
    context: The scope under which we look for batch norm params
    match: Object containg required batch norm tensors for correction
      computation.
    freeze_batch_norm_delay: Delay in steps at which computation switches
      from regular batch norm to frozen mean and variance.
    fused_batch_norm: Bool, true if fused batch norm is used.

  Returns:
    A tuple of correction_scale, correction_recip, correction_offset
  """

  g = ops.get_default_graph()
  with g.name_scope(context + '/batch_norm_correction'):
    recip_sigma_mv = math_ops.rsqrt(
        match.moving_variance_tensor + match.batch_epsilon_tensor)
    recip_sigma = math_ops.rsqrt(
        match.variance_tensor + match.batch_epsilon_tensor)
    correction_scale = math_ops.divide(
        recip_sigma_mv, recip_sigma, name='scale_compute')
    correction_scale = array_ops.identity(
        correction_scale, name='correction_scale')
    correction_recip = math_ops.reciprocal(
        correction_scale, name='reciprocal_compute')
    correction_offset = math_ops.multiply(
        match.gamma_tensor,
        match.mean_tensor * recip_sigma -
        match.moving_mean_tensor * recip_sigma_mv,
        name='offset_compute')

    if freeze_batch_norm_delay is not None:
      use_mv_avg = math_ops.greater_equal(
          common.CreateOrGetQuantizationStep(),
          freeze_batch_norm_delay,
          name='use_moving_average')
    else:
      use_mv_avg = False

    bn_decay_zero = 0.0
    bn_decay_mean_consumers = list(match.bn_decay_mean_tensor.consumers())
    bn_decay_var_consumers = list(match.bn_decay_mean_tensor.consumers())

    bn_decay_mean_out = utils.smart_cond(
        use_mv_avg,
        lambda: bn_decay_zero,
        lambda: match.bn_decay_mean_tensor,
        name='freeze_moving_mean')
    graph_editor.reroute_ts(
        [bn_decay_mean_out], [match.bn_decay_mean_tensor],
        can_modify=bn_decay_mean_consumers)

    if fused_batch_norm is False:
      bn_decay_var_consumers = list(match.bn_decay_var_tensor.consumers())
      bn_decay_var_out = utils.smart_cond(
          use_mv_avg,
          lambda: bn_decay_zero,
          lambda: match.bn_decay_var_tensor,
          name='freeze_moving_var')
      graph_editor.reroute_ts(
          [bn_decay_var_out], [match.bn_decay_var_tensor],
          can_modify=bn_decay_var_consumers)

    correction_recip = utils.smart_cond(
        use_mv_avg,
        lambda: array_ops.ones(correction_scale.shape),
        lambda: correction_recip,
        name='correction_recip')

    correction_offset = utils.smart_cond(
        use_mv_avg,
        lambda: correction_offset,
        lambda: array_ops.zeros(correction_offset.shape),
        name='correction_offset')
  return correction_scale, correction_recip, correction_offset
Example #41
0
def _CreateFoldedOp(graph, context, has_scaling, freeze_batch_norm_delay,
                    is_training):
  """Folds in batch norm layer into preceding convolution or FC layer.

  Creates 3 new nodes, connects their inputs and adds them to the graph:
  mul is cloned into mul_fold, Conv2D or MatMul, or DepthwiseConv2d is cloned
  into respective *_Fold, add is cloned into add_fold.

  Args:
    graph: Graph to modify.
    context: String, batch norm context, i.e. node into which BatchNorm is
      nested.
    has_scaling: Whether the batch norm has scaling enabled.
    freeze_batch_norm_delay: How many steps to wait before freezing moving mean
      and variance and using them for batch normalization.
    is_training: Bool, true if training.

  Raises:
    ValueError: When operation type is not supported, or input and output tensor
      shapes mismatch for created operations: mul_fold, add_fold.

  Returns:
    A pair of Operations, the first is the original consumer node of the batch
      norm (../BatchNorm/batchnorm/add_1), the second is the consumer node of
      the folded graph (add_fold).
  """
  mul_scale_name = 'mul_1' if has_scaling else 'mul'
  mul_scale = graph.get_operation_by_name(context +
                                          '/BatchNorm/batchnorm/' +
                                          mul_scale_name)
  op_below = mul_scale.inputs[0].op
  weights = op_below.inputs[1]
  match = _GetBatchNormParams(
      graph=graph, context=context, has_scaling=has_scaling)
  correction_scale, correction_recip, correction_offset = None, None, None
  if is_training:
    correction_scale, correction_recip, correction_offset = (
        _ComputeBatchNormCorrections(
            context=context,
            match=match,
            freeze_batch_norm_delay=freeze_batch_norm_delay,
            fused_batch_norm=False))
  # Special handling for weights of depthwise convolution.
  if op_below.type == 'DepthwiseConv2dNative':
    new_shape = [
        weights.get_shape().as_list()[2],
        weights.get_shape().as_list()[3]
    ]
    scale_name = 'mul' if has_scaling else 'Rsqrt'
    scale = graph.get_operation_by_name(
        context + '/BatchNorm/batchnorm/' + scale_name)
    scale = array_ops.reshape(scale.outputs[0], new_shape,
                              context + '/scale_reshape')

    if correction_scale is not None:
      correction_scale = array_ops.reshape(correction_scale, new_shape,
                                           context + '/correction_reshape')
      with ops.device(mul_scale.device):
        weights = math_ops.multiply(correction_scale, weights,
                                    context + '/correction_mult')

    mul_fold = _CloneOp(mul_scale, context + '/mul_fold', [(0, weights),
                                                           (1, scale)])
  elif op_below.type in ['Conv2D', 'MatMul']:

    if correction_scale is not None:
      with ops.device(mul_scale.device):
        weights = math_ops.multiply(correction_scale, weights,
                                    context + '/correction_mult')
    mul_fold = _CloneOp(mul_scale, context + '/mul_fold', [(0, weights)])
  else:
    raise ValueError('Cannot handle operation of type: %s' % op_below.op)
  _AssertShapesMatch('mul_fold', mul_fold.inputs[0], mul_fold.outputs[0])

  conv_or_fc_folded = _CloneOp(op_below, op_below.name + '_Fold',
                               [(1, mul_fold.outputs[0])])

  add_shift = graph.get_operation_by_name(
      context + '/BatchNorm/batchnorm/add_1')

  corrected_output = conv_or_fc_folded.outputs[0]
  if correction_offset is not None:
    with ops.device(conv_or_fc_folded.device):
      corrected_output = math_ops.multiply(correction_recip, corrected_output,
                                           context + '/post_conv_mul')
      corrected_output = math_ops.add(corrected_output, (correction_offset),
                                      context + '/correction_add')
  add_fold = _CloneOp(add_shift, context + '/add_fold', [(0, corrected_output)])
  _AssertShapesMatch('add_fold', add_fold.inputs[0], add_fold.outputs[0])
  return add_shift, add_fold
 def Inner():
     z = math_ops.multiply(y, 3.0, name="z")
     g = gradients_impl.gradients(z, y)
     return g[0]
Example #43
0
 def __rmul__(self, other):
   return math_ops.multiply(other, self)
Example #44
0
 def _benchmark_tf_multiply_op(self, m, num_iters):
   func = lambda: math_ops.multiply(m, m)
   self._run(func, num_iters)
Example #45
0
 def __mul__(self, other):
   return math_ops.multiply(self, other)
Example #46
0
def update_confusion_matrix_variables(variables_to_update,
                                      y_true,
                                      y_pred,
                                      thresholds,
                                      top_k=None,
                                      class_id=None,
                                      sample_weight=None,
                                      multi_label=False,
                                      label_weights=None):
    """Returns op to update the given confusion matrix variables.

  For every pair of values in y_true and y_pred:

  true_positive: y_true == True and y_pred > thresholds
  false_negatives: y_true == True and y_pred <= thresholds
  true_negatives: y_true == False and y_pred <= thresholds
  false_positive: y_true == False and y_pred > thresholds

  The results will be weighted and added together. When multiple thresholds are
  provided, we will repeat the same for every threshold.

  For estimation of these metrics over a stream of data, the function creates an
  `update_op` operation that updates the given variables.

  If `sample_weight` is `None`, weights default to 1.
  Use weights of 0 to mask values.

  Args:
    variables_to_update: Dictionary with 'tp', 'fn', 'tn', 'fp' as valid keys
      and corresponding variables to update as values.
    y_true: A `Tensor` whose shape matches `y_pred`. Will be cast to `bool`.
    y_pred: A floating point `Tensor` of arbitrary shape and whose values are in
      the range `[0, 1]`.
    thresholds: A float value, float tensor, python list, or tuple of float
      thresholds in `[0, 1]`, or NEG_INF (used when top_k is set).
    top_k: Optional int, indicates that the positive labels should be limited to
      the top k predictions.
    class_id: Optional int, limits the prediction and labels to the class
      specified by this argument.
    sample_weight: Optional `Tensor` whose rank is either 0, or the same rank as
      `y_true`, and must be broadcastable to `y_true` (i.e., all dimensions must
      be either `1`, or the same as the corresponding `y_true` dimension).
    multi_label: Optional boolean indicating whether multidimensional
      prediction/labels should be treated as multilabel responses, or flattened
      into a single label. When True, the valus of `variables_to_update` must
      have a second dimension equal to the number of labels in y_true and
      y_pred, and those tensors must not be RaggedTensors.
    label_weights: (optional) tensor of non-negative weights for multilabel
      data. The weights are applied when calculating TP, FP, FN, and TN without
      explicit multilabel handling (i.e. when the data is to be flattened).

  Returns:
    Update op.

  Raises:
    ValueError: If `y_pred` and `y_true` have mismatched shapes, or if
      `sample_weight` is not `None` and its shape doesn't match `y_pred`, or if
      `variables_to_update` contains invalid keys.
  """
    if multi_label and label_weights is not None:
        raise ValueError(
            '`label_weights` for multilabel data should be handled '
            'outside of `update_confusion_matrix_variables` when '
            '`multi_label` is True.')
    if variables_to_update is None:
        return
    if not any(key
               for key in variables_to_update if key in list(ConfusionMatrix)):
        raise ValueError(
            'Please provide at least one valid confusion matrix '
            'variable to update. Valid variable key options are: "{}". '
            'Received: "{}"'.format(list(ConfusionMatrix),
                                    variables_to_update.keys()))

    variable_dtype = list(variables_to_update.values())[0].dtype

    y_true = math_ops.cast(y_true, dtype=variable_dtype)
    y_pred = math_ops.cast(y_pred, dtype=variable_dtype)
    thresholds = ops.convert_to_tensor_v2_with_dispatch(thresholds,
                                                        dtype=variable_dtype)
    num_thresholds = thresholds.shape[0]
    if multi_label:
        one_thresh = math_ops.equal(math_ops.cast(1, dtype=dtypes.int32),
                                    array_ops.rank(thresholds),
                                    name='one_set_of_thresholds_cond')
    else:
        [y_pred, y_true
         ], _ = ragged_assert_compatible_and_get_flat_values([y_pred, y_true],
                                                             sample_weight)
        one_thresh = math_ops.cast(True, dtype=dtypes.bool)

    invalid_keys = [
        key for key in variables_to_update if key not in list(ConfusionMatrix)
    ]
    if invalid_keys:
        raise ValueError(
            'Invalid keys: {}. Valid variable key options are: "{}"'.format(
                invalid_keys, list(ConfusionMatrix)))

    with ops.control_dependencies([
            check_ops.assert_greater_equal(y_pred,
                                           math_ops.cast(0.0,
                                                         dtype=y_pred.dtype),
                                           message='predictions must be >= 0'),
            check_ops.assert_less_equal(y_pred,
                                        math_ops.cast(1.0, dtype=y_pred.dtype),
                                        message='predictions must be <= 1')
    ]):
        if sample_weight is None:
            y_pred, y_true = losses_utils.squeeze_or_expand_dimensions(
                y_pred, y_true)
        else:
            sample_weight = math_ops.cast(sample_weight, dtype=variable_dtype)
            y_pred, y_true, sample_weight = (
                losses_utils.squeeze_or_expand_dimensions(
                    y_pred, y_true, sample_weight=sample_weight))
    y_pred.shape.assert_is_compatible_with(y_true.shape)

    if top_k is not None:
        y_pred = _filter_top_k(y_pred, top_k)
    if class_id is not None:
        y_true = y_true[..., class_id]
        y_pred = y_pred[..., class_id]

    pred_shape = array_ops.shape(y_pred)
    num_predictions = pred_shape[0]
    if y_pred.shape.ndims == 1:
        num_labels = 1
    else:
        num_labels = gen_math_ops.Prod(input=pred_shape[1:], axis=0)
    thresh_label_tile = control_flow_ops.cond(
        one_thresh, lambda: num_labels,
        lambda: math_ops.cast(1, dtype=dtypes.int32))

    # Reshape predictions and labels, adding a dim for thresholding.
    if multi_label:
        predictions_extra_dim = array_ops.expand_dims(y_pred, 0)
        labels_extra_dim = array_ops.expand_dims(
            math_ops.cast(y_true, dtype=dtypes.bool), 0)
    else:
        # Flatten predictions and labels when not multilabel.
        predictions_extra_dim = array_ops.reshape(y_pred, [1, -1])
        labels_extra_dim = array_ops.reshape(
            math_ops.cast(y_true, dtype=dtypes.bool), [1, -1])

    # Tile the thresholds for every prediction.
    if multi_label:
        thresh_pretile_shape = [num_thresholds, 1, -1]
        thresh_tiles = [1, num_predictions, thresh_label_tile]
        data_tiles = [num_thresholds, 1, 1]
    else:
        thresh_pretile_shape = [num_thresholds, -1]
        thresh_tiles = [1, num_predictions * num_labels]
        data_tiles = [num_thresholds, 1]

    thresh_tiled = array_ops.tile(
        array_ops.reshape(thresholds, thresh_pretile_shape),
        array_ops.stack(thresh_tiles))

    # Tile the predictions for every threshold.
    preds_tiled = array_ops.tile(predictions_extra_dim, data_tiles)

    # Compare predictions and threshold.
    pred_is_pos = math_ops.greater(preds_tiled, thresh_tiled)

    # Tile labels by number of thresholds
    label_is_pos = array_ops.tile(labels_extra_dim, data_tiles)

    if sample_weight is not None:
        sample_weight = weights_broadcast_ops.broadcast_weights(
            math_ops.cast(sample_weight, dtype=variable_dtype), y_pred)
        weights_tiled = array_ops.tile(
            array_ops.reshape(sample_weight, thresh_tiles), data_tiles)
    else:
        weights_tiled = None

    if label_weights is not None and not multi_label:
        label_weights = array_ops.expand_dims(label_weights, 0)
        label_weights = weights_broadcast_ops.broadcast_weights(
            label_weights, y_pred)
        label_weights_tiled = array_ops.tile(
            array_ops.reshape(label_weights, thresh_tiles), data_tiles)
        if weights_tiled is None:
            weights_tiled = label_weights_tiled
        else:
            weights_tiled = math_ops.multiply(weights_tiled,
                                              label_weights_tiled)

    update_ops = []

    def weighted_assign_add(label, pred, weights, var):
        label_and_pred = math_ops.cast(math_ops.logical_and(label, pred),
                                       dtype=var.dtype)
        if weights is not None:
            label_and_pred *= math_ops.cast(weights, dtype=var.dtype)
        return var.assign_add(math_ops.reduce_sum(label_and_pred, 1))

    loop_vars = {
        ConfusionMatrix.TRUE_POSITIVES: (label_is_pos, pred_is_pos),
    }
    update_tn = ConfusionMatrix.TRUE_NEGATIVES in variables_to_update
    update_fp = ConfusionMatrix.FALSE_POSITIVES in variables_to_update
    update_fn = ConfusionMatrix.FALSE_NEGATIVES in variables_to_update

    if update_fn or update_tn:
        pred_is_neg = math_ops.logical_not(pred_is_pos)
        loop_vars[ConfusionMatrix.FALSE_NEGATIVES] = (label_is_pos,
                                                      pred_is_neg)

    if update_fp or update_tn:
        label_is_neg = math_ops.logical_not(label_is_pos)
        loop_vars[ConfusionMatrix.FALSE_POSITIVES] = (label_is_neg,
                                                      pred_is_pos)
        if update_tn:
            loop_vars[ConfusionMatrix.TRUE_NEGATIVES] = (label_is_neg,
                                                         pred_is_neg)

    for matrix_cond, (label, pred) in loop_vars.items():

        if matrix_cond in variables_to_update:
            update_ops.append(
                weighted_assign_add(label, pred, weights_tiled,
                                    variables_to_update[matrix_cond]))

    return control_flow_ops.group(update_ops)
def lifted_struct_loss(labels, embeddings, margin=1.0):
  """Computes the lifted structured loss.

  The loss encourages the positive distances (between a pair of embeddings
  with the same labels) to be smaller than any negative distances (between a
  pair of embeddings with different labels) in the mini-batch in a way
  that is differentiable with respect to the embedding vectors.
  See: https://arxiv.org/abs/1511.06452.

  Args:
    labels: 1-D tf.int32 `Tensor` with shape [batch_size] of
      multiclass integer labels.
    embeddings: 2-D float `Tensor` of embedding vectors. Embeddings should not
      be l2 normalized.
    margin: Float, margin term in the loss definition.

  Returns:
    lifted_loss: tf.float32 scalar.
  """
  # Reshape [batch_size] label tensor to a [batch_size, 1] label tensor.
  lshape = array_ops.shape(labels)
  assert lshape.shape == 1
  labels = array_ops.reshape(labels, [lshape[0], 1])

  # Build pairwise squared distance matrix.
  pairwise_distances = pairwise_distance(embeddings)

  # Build pairwise binary adjacency matrix.
  adjacency = math_ops.equal(labels, array_ops.transpose(labels))
  # Invert so we can select negatives only.
  adjacency_not = math_ops.logical_not(adjacency)

  batch_size = array_ops.size(labels)

  diff = margin - pairwise_distances
  mask = math_ops.cast(adjacency_not, dtype=dtypes.float32)
  # Safe maximum: Temporarily shift negative distances
  #   above zero before taking max.
  #     this is to take the max only among negatives.
  row_minimums = math_ops.reduce_min(diff, 1, keep_dims=True)
  row_negative_maximums = math_ops.reduce_max(
      math_ops.multiply(
          diff - row_minimums, mask), 1, keep_dims=True) + row_minimums

  # Compute the loss.
  # Keep track of matrix of maximums where M_ij = max(m_i, m_j)
  #   where m_i is the max of alpha - negative D_i's.
  # This matches the Caffe loss layer implementation at:
  #   https://github.com/rksltnl/Caffe-Deep-Metric-Learning-CVPR16/blob/0efd7544a9846f58df923c8b992198ba5c355454/src/caffe/layers/lifted_struct_similarity_softmax_layer.cpp  # pylint: disable=line-too-long

  max_elements = math_ops.maximum(
      row_negative_maximums, array_ops.transpose(row_negative_maximums))
  diff_tiled = array_ops.tile(diff, [batch_size, 1])
  mask_tiled = array_ops.tile(mask, [batch_size, 1])
  max_elements_vect = array_ops.reshape(
      array_ops.transpose(max_elements), [-1, 1])

  loss_exp_left = array_ops.reshape(
      math_ops.reduce_sum(math_ops.multiply(
          math_ops.exp(
              diff_tiled - max_elements_vect),
          mask_tiled), 1, keep_dims=True), [batch_size, batch_size])

  loss_mat = max_elements + math_ops.log(
      loss_exp_left + array_ops.transpose(loss_exp_left))
  # Add the positive distance.
  loss_mat += pairwise_distances

  mask_positives = math_ops.cast(
      adjacency, dtype=dtypes.float32) - array_ops.diag(
          array_ops.ones([batch_size]))

  # *0.5 for upper triangular, and another *0.5 for 1/2 factor for loss^2.
  num_positives = math_ops.reduce_sum(mask_positives) / 2.0

  lifted_loss = math_ops.truediv(
      0.25 * math_ops.reduce_sum(
          math_ops.square(
              math_ops.maximum(
                  math_ops.multiply(loss_mat, mask_positives), 0.0))),
      num_positives,
      name='liftedstruct_loss')
  return lifted_loss
Example #48
0
def polynomial_decay(learning_rate,
                     global_step,
                     decay_steps,
                     end_learning_rate=0.0001,
                     power=1.0,
                     cycle=False,
                     name=None):
    """Applies a polynomial decay to the learning rate.

  It is commonly observed that a monotonically decreasing learning rate, whose
  degree of change is carefully chosen, results in a better performing model.
  This function applies a polynomial decay function to a provided initial
  `learning_rate` to reach an `end_learning_rate` in the given `decay_steps`.

  It requires a `global_step` value to compute the decayed learning rate.  You
  can just pass a TensorFlow variable that you increment at each training step.

  The function returns the decayed learning rate.  It is computed as:

  ```python
  global_step = min(global_step, decay_steps)
  decayed_learning_rate = (learning_rate - end_learning_rate) *
                          (1 - global_step / decay_steps) ^ (power) +
                          end_learning_rate

  ```

  If `cycle` is True then a multiple of `decay_steps` is used, the first one
  that is bigger than `global_steps`.

  ```python
  decay_steps = decay_steps * ceil(global_step / decay_steps)
  decayed_learning_rate = (learning_rate - end_learning_rate) *
                          (1 - global_step / decay_steps) ^ (power) +
                          end_learning_rate

  ```

  Example: decay from 0.1 to 0.01 in 10000 steps using sqrt (i.e. power=0.5):

  ```python
  ...
  global_step = tf.Variable(0, trainable=False)
  starter_learning_rate = 0.1
  end_learning_rate = 0.01
  decay_steps = 10000
  learning_rate = tf.train.polynomial_decay(starter_learning_rate, global_step,
                                            decay_steps, end_learning_rate,
                                            power=0.5)
  # Passing global_step to minimize() will increment it at each step.
  learning_step = (
      tf.train.GradientDescentOptimizer(learning_rate)
      .minimize(...my loss..., global_step=global_step)
  )
  ```

  Args:
    learning_rate: A scalar `float32` or `float64` `Tensor` or a
      Python number.  The initial learning rate.
    global_step: A scalar `int32` or `int64` `Tensor` or a Python number.
      Global step to use for the decay computation.  Must not be negative.
    decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
      Must be positive.  See the decay computation above.
    end_learning_rate: A scalar `float32` or `float64` `Tensor` or a
      Python number.  The minimal end learning rate.
    power: A scalar `float32` or `float64` `Tensor` or a
      Python number.  The power of the polynomial. Defaults to sqrt, i.e. 0.5.
    cycle: A boolean, whether or not it should cycle beyond decay_steps.
    name: String.  Optional name of the operation. Defaults to
      'PolynomialDecay'.

  Returns:
    A scalar `Tensor` of the same type as `learning_rate`.  The decayed
    learning rate.

  Raises:
    ValueError: if `global_step` is not supplied.
  """
    if global_step is None:
        raise ValueError("global_step is required for polynomial_decay.")
    with ops.name_scope(
            name, "PolynomialDecay",
        [learning_rate, global_step, decay_steps, end_learning_rate, power
         ]) as name:
        learning_rate = ops.convert_to_tensor(learning_rate,
                                              name="learning_rate")
        dtype = learning_rate.dtype
        global_step = math_ops.cast(global_step, dtype)
        decay_steps = math_ops.cast(decay_steps, dtype)
        end_learning_rate = math_ops.cast(end_learning_rate, dtype)
        power = math_ops.cast(power, dtype)
        if cycle:
            # Find the first multiple of decay_steps that is bigger than global_step.
            decay_steps = math_ops.multiply(
                decay_steps, math_ops.ceil(global_step / decay_steps))
        else:
            # Make sure that the global_step used is not bigger than decay_steps.
            global_step = math_ops.minimum(global_step, decay_steps)

        p = math_ops.div(global_step, decay_steps)
        return math_ops.add(math_ops.multiply(
            learning_rate - end_learning_rate, math_ops.pow(1 - p, power)),
                            end_learning_rate,
                            name=name)
Example #49
0
 def loss_fn(ev):
   emb = embedding_ops.embedding_lookup(
       ev, math_ops.cast([0, 1, 2, 5, 6, 7], dtypes.int64))
   fun = math_ops.multiply(emb, 2.0, name='multiply')
   loss = math_ops.reduce_sum(fun, name='reduce_sum')
   return loss
Example #50
0
def weighted_moments(x, axes, frequency_weights, name=None, keep_dims=False):
  """Returns the frequency-weighted mean and variance of `x`.

  Args:
    x: A tensor.
    axes: 1-d tensor of int32 values; these are the axes along which
      to compute mean and variance.
    frequency_weights: A tensor of positive weights which can be
      broadcast with x.
    name: Name used to scope the operation.
    keep_dims: Produce moments with the same dimensionality as the input.

  Returns:
    Two tensors: `weighted_mean` and `weighted_variance`.
  """
  with ops.name_scope(name, "weighted_moments", [x, frequency_weights, axes]):
    x = ops.convert_to_tensor(x, name="x")
    frequency_weights = ops.convert_to_tensor(
        frequency_weights, name="frequency_weights")

    # Unlike moments(), this just uses a simpler two-pass method.

    # See comment in moments() WRT precision; it applies here too.
    needs_cast = x.dtype == dtypes.float16
    if needs_cast:
      x = math_ops.cast(x, dtypes.float32)

    if frequency_weights.dtype != x.dtype:
      frequency_weights = math_ops.cast(frequency_weights, x.dtype)

    # Note that we use keep_dims=True for our reductions regardless of the arg;
    # this is so that the results remain broadcast-compatible with the inputs.
    weighted_input_sum = math_ops.reduce_sum(
        frequency_weights * x, axes, name="weighted_input_sum", keep_dims=True)

    # The shape of the weights isn't necessarily the same as x's
    # shape, just broadcast-compatible with it -- so this expression
    # performs broadcasting to give a per-item weight, with the same
    # shape as (freqency_weights * x). This avoids having to reason
    # through all the broadcast logic to compute a correct
    # sum_of_weights.
    broadcasted_weights = frequency_weights + array_ops.zeros_like(x)

    sum_of_weights = math_ops.reduce_sum(
        broadcasted_weights, axes, name="sum_of_weights", keep_dims=True)

    divisor = math_ops.reciprocal(sum_of_weights, name="inv_weight_sum")

    weighted_mean = math_ops.multiply(weighted_input_sum, divisor)

    # Have the weighted mean; now on to variance:
    weighted_distsq = math_ops.reduce_sum(
        frequency_weights * math_ops.squared_difference(x, weighted_mean),
        axes,
        name="weighted_distsq",
        keep_dims=True)

    weighted_variance = math_ops.multiply(weighted_distsq, divisor)

    if not keep_dims:
      weighted_mean = array_ops.squeeze(weighted_mean, squeeze_dims=axes)
      weighted_variance = array_ops.squeeze(
          weighted_variance, squeeze_dims=axes)

    if needs_cast:
      weighted_mean = math_ops.cast(weighted_mean, dtypes.float16)
      weighted_variance = math_ops.cast(weighted_variance, dtypes.float16)

    return weighted_mean, weighted_variance
Example #51
0
def inverse_time_decay(learning_rate,
                       global_step,
                       decay_steps,
                       decay_rate,
                       staircase=False,
                       name=None):
    """Applies inverse time decay to the initial learning rate.

  When training a model, it is often recommended to lower the learning rate as
  the training progresses.  This function applies an inverse decay function
  to a provided initial learning rate.  It requires an `global_step` value to
  compute the decayed learning rate.  You can just pass a TensorFlow variable
  that you increment at each training step.

  The function returns the decayed learning rate.  It is computed as:

  ```python
  decayed_learning_rate = learning_rate / (1 + decay_rate * t)
  ```

  Example: decay 1/t with a rate of 0.5:

  ```python
  ...
  global_step = tf.Variable(0, trainable=False)
  learning_rate = 0.1
  k = 0.5
  learning_rate = tf.train.inverse_time_decay(learning_rate, global_step, k)

  # Passing global_step to minimize() will increment it at each step.
  learning_step = (
      tf.train.GradientDescentOptimizer(learning_rate)
      .minimize(...my loss..., global_step=global_step)
  )
  ```

  Args:
    learning_rate: A scalar `float32` or `float64` `Tensor` or a
      Python number.  The initial learning rate.
    global_step: A Python number.
      Global step to use for the decay computation.  Must not be negative.
    decay_steps: How often to apply decay.
    decay_rate: A Python number.  The decay rate.
    staircase: Whether to apply decay in a discrete staircase, as opposed to
      continuous, fashion.
    name: String.  Optional name of the operation.  Defaults to
      'InverseTimeDecay'.

  Returns:
    A scalar `Tensor` of the same type as `learning_rate`.  The decayed
    learning rate.

  Raises:
    ValueError: if `global_step` is not supplied.
  """
    if global_step is None:
        raise ValueError("global_step is required for inverse_time_decay.")
    with ops.name_scope(name, "InverseTimeDecay",
                        [learning_rate, global_step, decay_rate]) as name:
        learning_rate = ops.convert_to_tensor(learning_rate,
                                              name="learning_rate")
        dtype = learning_rate.dtype
        global_step = math_ops.cast(global_step, dtype)
        decay_steps = math_ops.cast(decay_steps, dtype)
        decay_rate = math_ops.cast(decay_rate, dtype)
        p = global_step / decay_steps
        if staircase:
            p = math_ops.floor(p)
        const = math_ops.cast(constant_op.constant(1), learning_rate.dtype)
        denom = math_ops.add(const, math_ops.multiply(decay_rate, p))
        return math_ops.div(learning_rate, denom, name=name)
Example #52
0
 def _CloneMul(self, op, inputs, new_name):
   del op  # Unused.
   return math_ops.multiply(inputs[0], inputs[1], name=new_name).op
Example #53
0
def exponential_decay(learning_rate,
                      global_step,
                      decay_steps,
                      decay_rate,
                      staircase=False,
                      name=None):
    """Applies exponential decay to the learning rate.

  When training a model, it is often recommended to lower the learning rate as
  the training progresses.  This function applies an exponential decay function
  to a provided initial learning rate.  It requires a `global_step` value to
  compute the decayed learning rate.  You can just pass a TensorFlow variable
  that you increment at each training step.

  The function returns the decayed learning rate.  It is computed as:

  ```python
  decayed_learning_rate = learning_rate *
                          decay_rate ^ (global_step / decay_steps)
  ```

  If the argument `staircase` is `True`, then `global_step / decay_steps` is an
  integer division and the decayed learning rate follows a staircase function.

  Example: decay every 100000 steps with a base of 0.96:

  ```python
  ...
  global_step = tf.Variable(0, trainable=False)
  starter_learning_rate = 0.1
  learning_rate = tf.train.exponential_decay(starter_learning_rate, global_step,
                                             100000, 0.96, staircase=True)
  # Passing global_step to minimize() will increment it at each step.
  learning_step = (
      tf.train.GradientDescentOptimizer(learning_rate)
      .minimize(...my loss..., global_step=global_step)
  )
  ```

  Args:
    learning_rate: A scalar `float32` or `float64` `Tensor` or a
      Python number.  The initial learning rate.
    global_step: A scalar `int32` or `int64` `Tensor` or a Python number.
      Global step to use for the decay computation.  Must not be negative.
    decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
      Must be positive.  See the decay computation above.
    decay_rate: A scalar `float32` or `float64` `Tensor` or a
      Python number.  The decay rate.
    staircase: Boolean.  If `True` decay the learning rate at discrete intervals
    name: String.  Optional name of the operation.  Defaults to
      'ExponentialDecay'.

  Returns:
    A scalar `Tensor` of the same type as `learning_rate`.  The decayed
    learning rate.

  Raises:
    ValueError: if `global_step` is not supplied.
  """
    if global_step is None:
        raise ValueError("global_step is required for exponential_decay.")
    with ops.name_scope(
            name, "ExponentialDecay",
        [learning_rate, global_step, decay_steps, decay_rate]) as name:
        learning_rate = ops.convert_to_tensor(learning_rate,
                                              name="learning_rate")
        dtype = learning_rate.dtype
        global_step = math_ops.cast(global_step, dtype)
        decay_steps = math_ops.cast(decay_steps, dtype)
        decay_rate = math_ops.cast(decay_rate, dtype)
        p = global_step / decay_steps
        if staircase:
            p = math_ops.floor(p)
        return math_ops.multiply(learning_rate,
                                 math_ops.pow(decay_rate, p),
                                 name=name)
Example #54
0
def huber_loss(labels, predictions, weights=1.0, delta=1.0, scope=None,
               loss_collection=ops.GraphKeys.LOSSES,
               reduction=Reduction.SUM_BY_NONZERO_WEIGHTS):
  """Adds a Huber Loss term to the training procedure.

  For each value x in `error=labels-predictions`, the following is calculated:

  ```
    0.5 * x^2                  if |x| <= d
    0.5 * d^2 + d * (|x| - d)  if |x| > d
  ```

  where d is `delta`.

  See: https://en.wikipedia.org/wiki/Huber_loss

  `weights` acts as a coefficient for the loss. If a scalar is provided, then
  the loss is simply scaled by the given value. If `weights` is a tensor of size
  `[batch_size]`, then the total loss for each sample of the batch is rescaled
  by the corresponding element in the `weights` vector. If the shape of
  `weights` matches the shape of `predictions`, then the loss of each
  measurable element of `predictions` is scaled by the corresponding value of
  `weights`.

  Args:
    labels: The ground truth output tensor, same dimensions as 'predictions'.
    predictions: The predicted outputs.
    weights: Optional `Tensor` whose rank is either 0, or the same rank as
      `labels`, and must be broadcastable to `labels` (i.e., all dimensions must
      be either `1`, or the same as the corresponding `losses` dimension).
    delta: `float`, the point where the huber loss function
      changes from a quadratic to linear.
    scope: The scope for the operations performed in computing the loss.
    loss_collection: collection to which the loss will be added.
    reduction: Type of reduction to apply to loss.

  Returns:
    Weighted loss float `Tensor`. If `reduction` is `NONE`, this has the same
    shape as `labels`; otherwise, it is scalar.

  Raises:
    ValueError: If the shape of `predictions` doesn't match that of `labels` or
      if the shape of `weights` is invalid.  Also if `labels` or
     `predictions` is None.

  @compatibility(eager)
  The `loss_collection` argument is ignored when executing eagerly. Consider
  holding on to the return value or collecting losses via a `tf.keras.Model`.
  @end_compatibility
  """
  if labels is None:
    raise ValueError("labels must not be None.")
  if predictions is None:
    raise ValueError("predictions must not be None.")
  with ops.name_scope(scope, "huber_loss",
                      (predictions, labels, weights)) as scope:
    predictions = math_ops.cast(predictions, dtype=dtypes.float32)
    labels = math_ops.cast(labels, dtype=dtypes.float32)
    predictions.get_shape().assert_is_compatible_with(labels.get_shape())
    error = math_ops.subtract(predictions, labels)
    abs_error = math_ops.abs(error)
    quadratic = math_ops.minimum(abs_error, delta)
    # The following expression is the same in value as
    # tf.maximum(abs_error - delta, 0), but importantly the gradient for the
    # expression when abs_error == delta is 0 (for tf.maximum it would be 1).
    # This is necessary to avoid doubling the gradient, since there is already a
    # nonzero contribution to the gradient from the quadratic term.
    linear = math_ops.subtract(abs_error, quadratic)
    losses = math_ops.add(
        math_ops.multiply(
            ops.convert_to_tensor(0.5, dtype=quadratic.dtype),
            math_ops.multiply(quadratic, quadratic)),
        math_ops.multiply(delta, linear))
    return compute_weighted_loss(
        losses, weights, scope, loss_collection, reduction=reduction)
Example #55
0
 def f(x):
     return (math_ops.multiply(obj.v,
                               x), math_ops.multiply(obj.v,
                                                     (x + 1)), None)
Example #56
0
def _compute_sampled_logits(weights,
                            biases,
                            labels,
                            inputs,
                            num_sampled,
                            num_classes,
                            num_true=1,
                            sampled_values=None,
                            subtract_log_q=True,
                            remove_accidental_hits=False,
                            partition_strategy="mod",
                            name=None):
  """Helper function for nce_loss and sampled_softmax_loss functions.

  Computes sampled output training logits and labels suitable for implementing
  e.g. noise-contrastive estimation (see nce_loss) or sampled softmax (see
  sampled_softmax_loss).

  Note: In the case where num_true > 1, we assign to each target class
  the target probability 1 / num_true so that the target probabilities
  sum to 1 per-example.

  Args:
    weights: A `Tensor` of shape `[num_classes, dim]`, or a list of `Tensor`
        objects whose concatenation along dimension 0 has shape
        `[num_classes, dim]`.  The (possibly-partitioned) class embeddings.
    biases: A `Tensor` of shape `[num_classes]`.  The class biases.
    labels: A `Tensor` of type `int64` and shape `[batch_size,
        num_true]`. The target classes.  Note that this format differs from
        the `labels` argument of `nn.softmax_cross_entropy_with_logits`.
    inputs: A `Tensor` of shape `[batch_size, dim]`.  The forward
        activations of the input network.
    num_sampled: An `int`.  The number of classes to randomly sample per batch.
    num_classes: An `int`. The number of possible classes.
    num_true: An `int`.  The number of target classes per training example.
    sampled_values: a tuple of (`sampled_candidates`, `true_expected_count`,
        `sampled_expected_count`) returned by a `*_candidate_sampler` function.
        (if None, we default to `log_uniform_candidate_sampler`)
    subtract_log_q: A `bool`.  whether to subtract the log expected count of
        the labels in the sample to get the logits of the true labels.
        Default is True.  Turn off for Negative Sampling.
    remove_accidental_hits:  A `bool`.  whether to remove "accidental hits"
        where a sampled class equals one of the target classes.  Default is
        False.
    partition_strategy: A string specifying the partitioning strategy, relevant
        if `len(weights) > 1`. Currently `"div"` and `"mod"` are supported.
        Default is `"mod"`. See `tf.nn.embedding_lookup` for more details.
    name: A name for the operation (optional).
  Returns:
    out_logits, out_labels: `Tensor` objects each with shape
        `[batch_size, num_true + num_sampled]`, for passing to either
        `nn.sigmoid_cross_entropy_with_logits` (NCE) or
        `nn.softmax_cross_entropy_with_logits` (sampled softmax).
  """

  if isinstance(weights, variables.PartitionedVariable):
    weights = list(weights)
  if not isinstance(weights, list):
    weights = [weights]

  with ops.name_scope(name, "compute_sampled_logits",
                      weights + [biases, inputs, labels]):
    if labels.dtype != dtypes.int64:
      labels = math_ops.cast(labels, dtypes.int64)
    labels_flat = array_ops.reshape(labels, [-1])

    # Sample the negative labels.
    #   sampled shape: [num_sampled] tensor
    #   true_expected_count shape = [batch_size, 1] tensor
    #   sampled_expected_count shape = [num_sampled] tensor
    if sampled_values is None:
      sampled_values = candidate_sampling_ops.log_uniform_candidate_sampler(
          true_classes=labels,
          num_true=num_true,
          num_sampled=num_sampled,
          unique=True,
          range_max=num_classes)
    # NOTE: pylint cannot tell that 'sampled_values' is a sequence
    # pylint: disable=unpacking-non-sequence
    sampled, true_expected_count, sampled_expected_count = sampled_values
    # pylint: enable=unpacking-non-sequence

    # labels_flat is a [batch_size * num_true] tensor
    # sampled is a [num_sampled] int tensor
    all_ids = array_ops.concat([labels_flat, sampled], 0)

    # weights shape is [num_classes, dim]
    all_w = embedding_ops.embedding_lookup(
        weights, all_ids, partition_strategy=partition_strategy)
    all_b = embedding_ops.embedding_lookup(biases, all_ids)
    # true_w shape is [batch_size * num_true, dim]
    # true_b is a [batch_size * num_true] tensor
    true_w = array_ops.slice(
        all_w, [0, 0], array_ops.stack([array_ops.shape(labels_flat)[0], -1]))
    true_b = array_ops.slice(all_b, [0], array_ops.shape(labels_flat))

    # inputs shape is [batch_size, dim]
    # true_w shape is [batch_size * num_true, dim]
    # row_wise_dots is [batch_size, num_true, dim]
    dim = array_ops.shape(true_w)[1:2]
    new_true_w_shape = array_ops.concat([[-1, num_true], dim], 0)
    row_wise_dots = math_ops.multiply(
        array_ops.expand_dims(inputs, 1),
        array_ops.reshape(true_w, new_true_w_shape))
    # We want the row-wise dot plus biases which yields a
    # [batch_size, num_true] tensor of true_logits.
    dots_as_matrix = array_ops.reshape(row_wise_dots,
                                       array_ops.concat([[-1], dim], 0))
    true_logits = array_ops.reshape(_sum_rows(dots_as_matrix), [-1, num_true])
    true_b = array_ops.reshape(true_b, [-1, num_true])
    true_logits += true_b

    # Lookup weights and biases for sampled labels.
    #   sampled_w shape is [num_sampled, dim]
    #   sampled_b is a [num_sampled] float tensor
    sampled_w = array_ops.slice(
        all_w, array_ops.stack([array_ops.shape(labels_flat)[0], 0]), [-1, -1])
    sampled_b = array_ops.slice(all_b, array_ops.shape(labels_flat), [-1])

    # inputs has shape [batch_size, dim]
    # sampled_w has shape [num_sampled, dim]
    # sampled_b has shape [num_sampled]
    # Apply X*W'+B, which yields [batch_size, num_sampled]
    sampled_logits = math_ops.matmul(
        inputs, sampled_w, transpose_b=True) + sampled_b

    if remove_accidental_hits:
      acc_hits = candidate_sampling_ops.compute_accidental_hits(
          labels, sampled, num_true=num_true)
      acc_indices, acc_ids, acc_weights = acc_hits

      # This is how SparseToDense expects the indices.
      acc_indices_2d = array_ops.reshape(acc_indices, [-1, 1])
      acc_ids_2d_int32 = array_ops.reshape(
          math_ops.cast(acc_ids, dtypes.int32), [-1, 1])
      sparse_indices = array_ops.concat([acc_indices_2d, acc_ids_2d_int32], 1,
                                        "sparse_indices")
      # Create sampled_logits_shape = [batch_size, num_sampled]
      sampled_logits_shape = array_ops.concat(
          [array_ops.shape(labels)[:1], array_ops.expand_dims(num_sampled, 0)],
          0)
      if sampled_logits.dtype != acc_weights.dtype:
        acc_weights = math_ops.cast(acc_weights, sampled_logits.dtype)
      sampled_logits += sparse_ops.sparse_to_dense(
          sparse_indices,
          sampled_logits_shape,
          acc_weights,
          default_value=0.0,
          validate_indices=False)

    if subtract_log_q:
      # Subtract log of Q(l), prior probability that l appears in sampled.
      true_logits -= math_ops.log(true_expected_count)
      sampled_logits -= math_ops.log(sampled_expected_count)

    # Construct output logits and labels. The true labels/logits start at col 0.
    out_logits = array_ops.concat([true_logits, sampled_logits], 1)
    # true_logits is a float tensor, ones_like(true_logits) is a float tensor
    # of ones. We then divide by num_true to ensure the per-example labels sum
    # to 1.0, i.e. form a proper probability distribution.
    out_labels = array_ops.concat([
        array_ops.ones_like(true_logits) / num_true,
        array_ops.zeros_like(sampled_logits)
    ], 1)

  return out_logits, out_labels
 def Foo():
     x = constant_op.constant(10.0, name="x")
     y = math_ops.multiply(x, c, name="y")
     z = math_ops.multiply(y, 3.0, name="z")
     g = gradients_impl.gradients(z, x)
     return g[0]
Example #58
0
def mean_pairwise_squared_error(
    labels, predictions, weights=1.0, scope=None,
    loss_collection=ops.GraphKeys.LOSSES):
  """Adds a pairwise-errors-squared loss to the training procedure.

  Unlike `mean_squared_error`, which is a measure of the differences between
  corresponding elements of `predictions` and `labels`,
  `mean_pairwise_squared_error` is a measure of the differences between pairs of
  corresponding elements of `predictions` and `labels`.

  For example, if `labels`=[a, b, c] and `predictions`=[x, y, z], there are
  three pairs of differences are summed to compute the loss:
    loss = [ ((a-b) - (x-y)).^2 + ((a-c) - (x-z)).^2 + ((b-c) - (y-z)).^2 ] / 3

  Note that since the inputs are of shape `[batch_size, d0, ... dN]`, the
  corresponding pairs are computed within each batch sample but not across
  samples within a batch. For example, if `predictions` represents a batch of
  16 grayscale images of dimension [batch_size, 100, 200], then the set of pairs
  is drawn from each image, but not across images.

  `weights` acts as a coefficient for the loss. If a scalar is provided, then
  the loss is simply scaled by the given value. If `weights` is a tensor of size
  `[batch_size]`, then the total loss for each sample of the batch is rescaled
  by the corresponding element in the `weights` vector.

  Args:
    labels: The ground truth output tensor, whose shape must match the shape of
      `predictions`.
    predictions: The predicted outputs, a tensor of size
      `[batch_size, d0, .. dN]` where N+1 is the total number of dimensions in
      `predictions`.
    weights: Coefficients for the loss a scalar, a tensor of shape
      `[batch_size]` or a tensor whose shape matches `predictions`.
    scope: The scope for the operations performed in computing the loss.
    loss_collection: collection to which the loss will be added.

  Returns:
    A scalar `Tensor` that returns the weighted loss.

  Raises:
    ValueError: If the shape of `predictions` doesn't match that of `labels` or
      if the shape of `weights` is invalid.  Also if `labels` or `predictions`
      is None.

  @compatibility(eager)
  The `loss_collection` argument is ignored when executing eagerly. Consider
  holding on to the return value or collecting losses via a `tf.keras.Model`.
  @end_compatibility
  """
  if labels is None:
    raise ValueError("labels must not be None.")
  if predictions is None:
    raise ValueError("predictions must not be None.")
  with ops.name_scope(scope, "mean_pairwise_squared_error",
                      (predictions, labels, weights)) as scope:
    weights = math_ops.cast(weights, dtype=dtypes.float32)
    labels = math_ops.cast(labels, dtype=dtypes.float32)
    with ops.control_dependencies((
        weights_broadcast_ops.assert_broadcastable(weights, labels),)):
      predictions = math_ops.cast(predictions, dtype=dtypes.float32)
      predictions.get_shape().assert_is_compatible_with(labels.get_shape())

      diffs = math_ops.subtract(predictions, labels)

      axis = math_ops.range(1, array_ops.rank(diffs))

      sum_squares_diff_per_batch = math_ops.reduce_sum(
          math_ops.square(diffs), axis=axis, keepdims=True)
      num_present_per_batch = _num_present(diffs, weights, per_batch=True)

      term1 = 2.0 * math_ops.div_no_nan(
          sum_squares_diff_per_batch,
          math_ops.maximum(num_present_per_batch - 1, 0),
          name="value")

      sum_diff = math_ops.reduce_sum(diffs, axis=axis, keepdims=True)
      term2 = 2.0 * math_ops.div_no_nan(
          math_ops.square(sum_diff),
          math_ops.maximum(
              math_ops.multiply(num_present_per_batch,
                                num_present_per_batch - 1), 0),
          name="value")

      weighted_losses = math_ops.multiply(term1 - term2, weights)
      loss = math_ops.reduce_sum(weighted_losses)

      mean_loss = array_ops.where(
          math_ops.reduce_sum(num_present_per_batch) > 0,
          loss,
          array_ops.zeros_like(loss),
          name="value")
      util.add_loss(mean_loss, loss_collection)
      return mean_loss
 def Foo():
     y = math_ops.multiply(var, 2.0, name="y")
     g = gradients_impl.gradients(y, var)
     return g[0]
Example #60
0
def _FoldFusedBatchNorms(graph, is_training, freeze_batch_norm_delay):
  """Finds fused batch norm layers and folds them into preceding layers.

  Folding only affects the following layers: Conv2D, fully connected, depthwise
  convolution.

  Args:
    graph: Graph to walk and modify.
    is_training: Bool, true if training.
    freeze_batch_norm_delay: How many steps to wait before freezing moving mean
      and variance and using them for batch normalization.

  Raises:
    ValueError: When batch norm folding fails.
  """
  for match in _FindFusedBatchNorms(graph):
    scope, sep, _ = match.layer_op.name.rpartition('/')
    # Make sure new ops are added to `graph` and put on the same device as
    # `bn_op`. The '/' (i.e. `sep`) ensures that we reuse the existing scope
    # named `scope`. Otherwise, TF creates a unique scope whose name starts with
    # `scope`.
    with graph.as_default(), graph.name_scope(scope + sep):
      with graph.name_scope(scope + sep + 'BatchNorm_Fold' + sep):
        # new weights = old weights * gamma / sqrt(variance + epsilon)
        # new biases = -mean * gamma / sqrt(variance + epsilon) + beta
        multiplier_tensor = match.gamma_tensor * math_ops.rsqrt(
            match.variance_tensor + match.bn_op.get_attr('epsilon'))
        bias_tensor = math_ops.subtract(
            match.beta_tensor,
            match.mean_tensor * multiplier_tensor,
            name='bias')

        correction_scale, correction_recip, correction_offset = None, None, None
        if is_training:
          correction_scale, correction_recip, correction_offset = (
              _ComputeBatchNormCorrections(
                  context='',
                  match=match,
                  freeze_batch_norm_delay=freeze_batch_norm_delay,
                  fused_batch_norm=True))
        # The shape of depthwise weights is different, so we need to reshape the
        # multiplier_tensor to ensure that the scaled_weight_tensor has the
        # expected shape.
        weights = match.weight_tensor
        if match.layer_op.type == 'DepthwiseConv2dNative':
          new_shape = [
              match.weight_tensor.get_shape().as_list()[2],
              match.weight_tensor.get_shape().as_list()[3]
          ]
          multiplier_tensor = array_ops.reshape(
              multiplier_tensor, new_shape, name='scale_reshape')

          if correction_scale is not None:
            correction_scale = array_ops.reshape(
                correction_scale, new_shape, name='correction_reshape')

      if correction_scale is not None:
        weights = math_ops.multiply(
            correction_scale, weights, name='correction_mult')

      scaled_weight_tensor = math_ops.multiply(
          weights, multiplier_tensor, name='mul_fold')
      new_layer_tensor = _CloneWithNewOperands(
          match.layer_op, match.input_tensor, scaled_weight_tensor)

      if correction_recip is not None:
        new_layer_tensor = math_ops.multiply(
            correction_recip, new_layer_tensor, name='post_conv_mul')
        new_layer_tensor = math_ops.add(new_layer_tensor, (correction_offset),
                                        'correction_add')

      bias_add_tensor = math_ops.add(
          new_layer_tensor, bias_tensor, name='add_fold')

      nodes_modified_count = graph_editor.reroute_ts(bias_add_tensor,
                                                     match.output_tensor)
      if nodes_modified_count != 1:
        raise ValueError(
            'Unexpected inputs to op: %s' % match.output_tensor.name)