Beispiel #1
0
  def _log_prob(self, x):
    x = self._assert_valid_sample(x)
    # broadcast logits or x if need be.
    logits = self.logits
    if (not x.get_shape().is_fully_defined() or
        not logits.get_shape().is_fully_defined() or
        x.get_shape() != logits.get_shape()):
      logits = array_ops.ones_like(x, dtype=logits.dtype) * logits
      x = array_ops.ones_like(logits, dtype=x.dtype) * x

    logits_shape = array_ops.shape(logits)
    if logits.get_shape().ndims == 2:
      logits_2d = logits
      x_2d = x
    else:
      logits_2d = array_ops.reshape(logits, [-1, self.event_size])
      x_2d = array_ops.reshape(x, [-1, self.event_size])
    # compute the normalization constant
    log_norm_const = (math_ops.lgamma(self.event_size)
                      + (self.event_size - 1)
                      * math_ops.log(self.temperature))
    # compute the unnormalized density
    log_softmax = nn_ops.log_softmax(logits_2d - x_2d * self.temperature)
    log_unnorm_prob = math_ops.reduce_sum(log_softmax, [-1], keep_dims=False)
    # combine unnormalized density with normalization constant
    log_prob = log_norm_const + log_unnorm_prob
    ret = array_ops.reshape(log_prob, logits_shape)
    return ret
Beispiel #2
0
  def log_prob(self, event, name="log_prob"):
    """Log of the probability mass function.

    Args:
      event: `int32` or `int64` binary Tensor.
      name: A name for this operation (optional).

    Returns:
      The log-probabilities of the events.
    """
    # TODO(jaana): The current sigmoid_cross_entropy_with_logits has
    # inconsistent  behavior for logits = inf/-inf.
    with ops.name_scope(self.name):
      with ops.name_scope(name, values=[self.logits, event]):
        event = ops.convert_to_tensor(event, name="event")
        event = math_ops.cast(event, self.logits.dtype)
        logits = self.logits
        # sigmoid_cross_entropy_with_logits doesn't broadcast shape,
        # so we do this here.
        # TODO(b/30637701): Check dynamic shape, and don't broadcast if the
        # dynamic shapes are the same.
        if (not event.get_shape().is_fully_defined() or
            not logits.get_shape().is_fully_defined() or
            event.get_shape() != logits.get_shape()):
          logits = array_ops.ones_like(event) * logits
          event = array_ops.ones_like(logits) * event
        return -nn.sigmoid_cross_entropy_with_logits(logits, event)
 def _log_prob(self, x):
   x = self._assert_valid_sample(x)
   # broadcast logits or x if need be.
   logits = self.logits
   if (not x.get_shape().is_fully_defined() or
       not logits.get_shape().is_fully_defined() or
       x.get_shape() != logits.get_shape()):
     logits = array_ops.ones_like(x, dtype=logits.dtype) * logits
     x = array_ops.ones_like(logits, dtype=x.dtype) * x
   logits_shape = array_ops.shape(math_ops.reduce_sum(logits, axis=[-1]))
   logits_2d = array_ops.reshape(logits, [-1, self.event_size])
   x_2d = array_ops.reshape(x, [-1, self.event_size])
   # compute the normalization constant
   k = math_ops.cast(self.event_size, x.dtype)
   log_norm_const = (math_ops.lgamma(k)
                     + (k - 1.)
                     * math_ops.log(self.temperature))
   # compute the unnormalized density
   log_softmax = nn_ops.log_softmax(logits_2d - x_2d * self._temperature_2d)
   log_unnorm_prob = math_ops.reduce_sum(log_softmax, [-1], keepdims=False)
   # combine unnormalized density with normalization constant
   log_prob = log_norm_const + log_unnorm_prob
   # Reshapes log_prob to be consistent with shape of user-supplied logits
   ret = array_ops.reshape(log_prob, logits_shape)
   return ret
Beispiel #4
0
  def full_exp_with_logits(name, labels=None, logits=None):
    """Computes exponential loss given `logits`.

    Args:
      name: A name for the operation (optional).
      labels: A `Tensor` of the same type and shape as `logits`.
      logits: A `Tensor` of type `float32` or `float64`.

    Returns:
      A `Tensor` of the same shape as `logits` with the componentwise
      exponential losses.

    Raises:
      ValueError: If `logits` and `labels` do not have the same shape.
    """
    with ops.name_scope(name, "exp_loss", [logits, labels]) as name:
      logits = ops.convert_to_tensor(logits, name="logits")
      labels = ops.convert_to_tensor(labels, name="labels")
      try:
        labels.get_shape().merge_with(logits.get_shape())
      except ValueError:
        raise ValueError("logits and labels must have the same shape (%s vs %s)"
                         % (logits.get_shape(), labels.get_shape()))

    # Default threshold of 0 to switch between classes
    zeros = array_ops.zeros_like(logits, dtype=logits.dtype)
    ones = array_ops.ones_like(logits, dtype=logits.dtype)
    neg_ones = -array_ops.ones_like(logits, dtype=logits.dtype)

    # Convert labels to 1 and -1
    cond_labels = (labels > zeros)
    labels_converted = array_ops.where(cond_labels, ones, neg_ones)

    return math_ops.exp(-1.0 * logits * labels_converted)
Beispiel #5
0
 def grad(dmm, dr):
   return [
       math_ops.matmul(dmm, b, transpose_b=True) +
       math_ops.matmul(array_ops.ones_like(b * dr), b, transpose_b=True),
       math_ops.matmul(a, dmm, transpose_b=True) +
       math_ops.matmul(a, array_ops.ones_like(a) * dr, transpose_b=True)
   ]
Beispiel #6
0
  def _sample_n(self, n, seed=None):
    n_draws = math_ops.cast(self.total_count, dtype=dtypes.int32)
    k = self.event_shape_tensor()[0]

    # broadcast the total_count and logits to same shape
    n_draws = array_ops.ones_like(
        self.logits[..., 0], dtype=n_draws.dtype) * n_draws
    logits = array_ops.ones_like(
        n_draws[..., array_ops.newaxis], dtype=self.logits.dtype) * self.logits

    # flatten the total_count and logits
    flat_logits = array_ops.reshape(logits, [-1, k])  # [B1B2...Bm, k]
    flat_ndraws = n * array_ops.reshape(n_draws, [-1])  # [B1B2...Bm]

    # computes each total_count and logits situation by map_fn
    def _sample_single(args):
      logits, n_draw = args[0], args[1]  # [K], []
      x = random_ops.multinomial(logits[array_ops.newaxis, ...], n_draw,
                                 seed)  # [1, n*n_draw]
      x = array_ops.reshape(x, shape=[n, -1])  # [n, n_draw]
      x = math_ops.reduce_sum(array_ops.one_hot(x, depth=k), axis=-2)  # [n, k]
      return x

    x = functional_ops.map_fn(
        _sample_single, [flat_logits, flat_ndraws],
        dtype=self.dtype)  # [B1B2...Bm, n, k]

    # reshape the results to proper shape
    x = array_ops.transpose(x, perm=[1, 0, 2])
    final_shape = array_ops.concat([[n], self.batch_shape_tensor(), [k]], 0)
    x = array_ops.reshape(x, final_shape)  # [n, B1, B2,..., Bm, k]
    return x
Beispiel #7
0
def _BiasAddGradGrad(op, received_grad):
  """Gradient for the BiasAddGrad op.

  Args:
    op: BiasAddGrad op for which we are calculating gradients.
    received_grad: The gradients passed to the BiasAddGrad op.

  Returns:
    A single gradient Tensor for the input to BiasAddGrad (which
    is the gradient of the bias term in BiasAdd)
  """

  try:
    data_format = op.get_attr("data_format")
  except ValueError:
    data_format = None

  shape = array_ops.shape(op.inputs[0])
  rank = array_ops.rank(op.inputs[0])
  bias_shape = array_ops.shape(received_grad)

  if data_format == b"NCHW":
    expanded_shape = array_ops.concat([
        array_ops.ones_like(shape[:-3]), bias_shape,
        array_ops.ones_like(shape[-2:])
    ], 0)
    tile_mults = array_ops.concat([shape[:-3], [1], shape[-2:]], 0)
  else:
    expanded_shape = array_ops.concat(
        [array_ops.ones_like(shape[:-1]), bias_shape], 0)
    tile_mults = array_ops.concat([shape[:-1], [1]], 0)

  expanded_grad = array_ops.reshape(received_grad, expanded_shape)
  return array_ops.tile(expanded_grad, tile_mults)
Beispiel #8
0
  def sample_n(self, n, seed=None, name="sample_n"):
    """Sample `n` observations from the Beta Distributions.

    Args:
      n: `Scalar` `Tensor` of type `int32` or `int64`, the number of
        observations to sample.
      seed: Python integer, the random seed.
      name: The name to give this op.

    Returns:
      samples: `[n, ...]`, a `Tensor` of `n` samples for each
        of the distributions determined by broadcasting the hyperparameters.
    """
    with ops.name_scope(self.name):
      with ops.name_scope(name, values=[self.a, self.b, n]):
        a = array_ops.ones_like(self._a_b_sum, dtype=self.dtype) * self.a
        b = array_ops.ones_like(self._a_b_sum, dtype=self.dtype) * self.b
        n = ops.convert_to_tensor(n, name="n")

        gamma1_sample = random_ops.random_gamma(
            [n,], a, dtype=self.dtype, seed=seed)
        gamma2_sample = random_ops.random_gamma(
            [n,], b, dtype=self.dtype, seed=seed)

        beta_sample = gamma1_sample / (gamma1_sample + gamma2_sample)

        n_val = tensor_util.constant_value(n)
        final_shape = tensor_shape.vector(n_val).concatenate(
            self._a_b_sum.get_shape())

        beta_sample.set_shape(final_shape)
        return beta_sample
Beispiel #9
0
 def _sample_n(self, n, seed=None):
     a = array_ops.ones_like(self.a_b_sum, dtype=self.dtype) * self.a
     b = array_ops.ones_like(self.a_b_sum, dtype=self.dtype) * self.b
     gamma1_sample = random_ops.random_gamma([n], a, dtype=self.dtype, seed=seed)
     gamma2_sample = random_ops.random_gamma([n], b, dtype=self.dtype, seed=seed)
     beta_sample = gamma1_sample / (gamma1_sample + gamma2_sample)
     return beta_sample
Beispiel #10
0
 def _sample_n(self, n, seed=None):
   a = array_ops.ones_like(self.a_b_sum, dtype=self.dtype) * self.a
   b = array_ops.ones_like(self.a_b_sum, dtype=self.dtype) * self.b
   gamma1_sample = random_ops.random_gamma(
       [n,], a, dtype=self.dtype, seed=seed)
   gamma2_sample = random_ops.random_gamma(
       [n,], b, dtype=self.dtype,
       seed=distribution_util.gen_new_seed(seed, "beta"))
   beta_sample = gamma1_sample / (gamma1_sample + gamma2_sample)
   return beta_sample
 def grad_fn(inputs, trainable_variables, unused_outputs,
             unused_grad_outputs):
   grad_inputs = [
       array_ops.ones_like(t) * (i + 1.) for i, t in enumerate(inputs)
   ]
   grad_vars = [
       array_ops.ones_like(t) * (i + len(inputs) + 1.)
       for i, t in enumerate(trainable_variables)
   ]
   return grad_inputs, grad_vars
Beispiel #12
0
  def exp_with_logits(name, eps, labels=None, logits=None):
    """Computes exponential loss given `logits`.

    The loss returns is exp(-targets*modified_predictions), where
    modified_predictions are 1 if sigmoid is >= 0.5+eps (eg we predict positive
    class), -1 if sigmoid < 0.5-eps (e.g. we predict negative class) and ax+b in
    the interval 0.5-eps, 0.5+eps, where a = 1/eps, b=1/(2eps).

    Args:
      name: A name for the operation (optional).
      eps: For the range (0.5-eps, 0.5+eps) we set the predictions to be ax+b.
      labels: A `Tensor` of the same type and shape as `logits`.
      logits: A `Tensor` of type `float32` or `float64`.

    Returns:
      A `Tensor` of the same shape as `logits` with the componentwise
      exponential losses.

    Raises:
      ValueError: If `logits` and `labels` do not have the same shape.
    """
    with ops.name_scope(name, "exp_loss", [logits, labels]) as name:
      logits = ops.convert_to_tensor(logits, name="logits")
      labels = ops.convert_to_tensor(labels, name="labels")
      try:
        labels.get_shape().merge_with(logits.get_shape())
      except ValueError:
        raise ValueError("logits and labels must have the same shape (%s vs %s)"
                         % (logits.get_shape(), labels.get_shape()))

    # Default threshold to switch between classes
    zeros = array_ops.zeros_like(logits, dtype=logits.dtype)
    ones = array_ops.ones_like(logits, dtype=logits.dtype)
    neg_ones = -array_ops.ones_like(logits, dtype=logits.dtype)

    # Convert labels to 1 and -1
    cond_labels = (labels > zeros)
    labels_converted = array_ops.where(cond_labels, ones, neg_ones)

    # Convert predictions to 1 and -1
    # The loss we build is min(1, max(-1,ax+b))
    # where a=1/eps, b=-1/2eps.

    a = 1.0 / eps
    b = -1.0 / 2 / eps
    probs = math_ops.sigmoid(logits)
    y = a * probs + b
    # Build max(-1, ax+b)
    cond = (y < -1)
    max_res = array_ops.where(cond, neg_ones, y)
    # Build min part
    cond = (max_res > 1)
    min_res = array_ops.where(cond, ones, max_res)
    preds_converted = min_res
    return math_ops.exp(-preds_converted * labels_converted)
Beispiel #13
0
 def _moment(self, n):
   """Compute the n'th (uncentered) moment."""
   expanded_concentration1 = array_ops.ones_like(
       self.total_concentration, dtype=self.dtype) * self.concentration1
   expanded_concentration0 = array_ops.ones_like(
       self.total_concentration, dtype=self.dtype) * self.concentration0
   beta_arg0 = 1 + n / expanded_concentration1
   beta_arg = array_ops.stack([beta_arg0, expanded_concentration0], -1)
   log_moment = math_ops.log(expanded_concentration0) + special_math_ops.lbeta(
       beta_arg)
   return math_ops.exp(log_moment)
Beispiel #14
0
  def _sample_n(self, n, seed=None):
    expanded_concentration1 = array_ops.ones_like(
        self.total_concentration, dtype=self.dtype) * self.concentration1
    expanded_concentration0 = array_ops.ones_like(
        self.total_concentration, dtype=self.dtype) * self.concentration0
    shape = array_ops.concat([[n], self.batch_shape_tensor()], 0)
    uniform_sample = random_ops.random_uniform(
        shape=shape, minval=0.0, maxval=1.0, dtype=self.dtype, seed=seed)

    kumaraswamy_sample = (1 - uniform_sample**(1. / expanded_concentration0))**(
        1. / expanded_concentration1)
    return kumaraswamy_sample
Beispiel #15
0
  def _log_prob(self, counts):
    if self.validate_args:
      counts = distribution_util.embed_check_nonnegative_discrete(
          counts, check_integer=True)
    counts *= array_ops.ones_like(self.probs)
    probs = self.probs * array_ops.ones_like(counts)

    safe_domain = array_ops.where(
        math_ops.equal(counts, 0.),
        array_ops.zeros_like(probs),
        probs)
    return counts * math_ops.log1p(-safe_domain) + math_ops.log(probs)
Beispiel #16
0
 def _log_prob(self, x):
   if self.validate_args:
     x = distribution_util.embed_check_nonnegative_integer_form(x)
   else:
     # For consistency with cdf, we take the floor.
     x = math_ops.floor(x)
   x *= array_ops.ones_like(self.probs)
   probs = self.probs * array_ops.ones_like(x)
   safe_domain = array_ops.where(
       math_ops.equal(x, 0.),
       array_ops.zeros_like(probs),
       probs)
   return x * math_ops.log1p(-safe_domain) + math_ops.log(probs)
def broadcast_weights(weights, values):
  """Broadcast `weights` to the same shape as `values`.

  This returns a version of `weights` following the same broadcast rules as
  `mul(weights, values)`, but limited to the weights shapes allowed by
  `assert_broadcastable`. When computing a weighted average, use this function
  to broadcast `weights` before summing them; e.g.,
  `reduce_sum(w * v) / reduce_sum(_broadcast_weights(w, v))`.

  Args:
    weights: `Tensor` whose shape is broadcastable to `values` according to the
      rules of `assert_broadcastable`.
    values: `Tensor` of any shape.

  Returns:
    `weights` broadcast to `values` shape according to the rules of
      `assert_broadcastable`.
  """
  with ops.name_scope(None, "broadcast_weights", (weights, values)) as scope:
    values = ops.convert_to_tensor(values, name="values")
    weights = ops.convert_to_tensor(
        weights, dtype=values.dtype.base_dtype, name="weights")

    # Try static check for exact match.
    weights_shape = weights.get_shape()
    values_shape = values.get_shape()
    if (weights_shape.is_fully_defined() and
        values_shape.is_fully_defined() and
        weights_shape.is_compatible_with(values_shape)):
      return weights

    with ops.control_dependencies((assert_broadcastable(weights, values),)):
      return math_ops.multiply(
          weights, array_ops.ones_like(values), name=scope)
def reduce_mean(rt_input, axis=None, name=None):
  """For docs, see: _RAGGED_REDUCE_DOCSTRING."""
  with ops.name_scope(name, 'RaggedReduceMean', [rt_input, axis]):
    total = reduce_sum(rt_input, axis)
    if ragged_tensor.is_ragged(rt_input):
      ones = ragged_factory_ops.from_nested_row_splits(
          array_ops.ones_like(rt_input.inner_values),
          rt_input.nested_row_splits)
    else:
      ones = array_ops.ones_like(rt_input)
    count = reduce_sum(ones, axis)
    if ragged_tensor.is_ragged(total):
      return ragged_factory_ops.from_nested_row_splits(
          total.inner_values / count.inner_values, total.nested_row_splits)
    else:
      return total / count
Beispiel #19
0
  def testContribSignalSTFT(self):
    ws = 512
    hs = 128
    dims = (ws * 20,)
    shape = BATCH_DIMS + dims
    data = np.arange(np.prod(shape)) / np.prod(dims)
    np.random.seed(123)
    np.random.shuffle(data)
    data = np.reshape(data.astype(np.float32), shape)
    window = sps.get_window("hann", ws)
    expected = sps.stft(
        data, nperseg=ws, noverlap=ws - hs, boundary=None, window=window)[2]
    expected = np.swapaxes(expected, -1, -2)
    expected *= window.sum()  # scipy divides by window sum
    with self.test_session() as sess:
      with self.test_scope():
        ph = array_ops.placeholder(
            dtypes.as_dtype(data.dtype), shape=data.shape)
        out = signal.stft(ph, ws, hs)
        grad = gradients_impl.gradients(out, ph,
                                        grad_ys=array_ops.ones_like(out))

      # For gradients, we simply verify that they compile & execute.
      value, _ = sess.run([out, grad], {ph: data})
      self.assertAllClose(expected, value, rtol=RTOL, atol=ATOL)
Beispiel #20
0
 def _variance(self):
   p = self.p * array_ops.expand_dims(array_ops.ones_like(self.n), -1)
   outer_prod = math_ops.batch_matmul(
       array_ops.expand_dims(self._mean_val, -1),
       array_ops.expand_dims(p, -2))
   return array_ops.batch_matrix_set_diag(
       -outer_prod, self._mean_val - self._mean_val * p)
Beispiel #21
0
  def mode(self, name="mode"):
    """Mode of the distribution.

    Note that the mode for the Beta distribution is only defined
    when `alpha > 1`. This returns the mode when `alpha > 1`,
    and NaN otherwise. If `self.allow_nan_stats` is `False`, an exception
    will be raised rather than returning `NaN`.

    Args:
      name: The name for this op.

    Returns:
      Mode of the Dirichlet distribution.
    """
    with ops.name_scope(self.name):
      with ops.op_scope([self._alpha, self._alpha_0], name):
        one = constant_op.constant(1, self.dtype)
        mode = (self._alpha - 1)/ (
            array_ops.expand_dims(self._alpha_0, -1) - math_ops.cast(
                self.event_shape()[0], self.dtype))

        if self.allow_nan_stats:
          return math_ops.select(
              math_ops.greater(self._alpha, 1),
              mode,
              (constant_op.constant(float("NaN"), dtype=self.dtype) *
               array_ops.ones_like(self._alpha, dtype=self.dtype)))
        else:
          return control_flow_ops.with_dependencies([
              check_ops.assert_less(
                  one, self._alpha,
                  message="mode not defined for components of alpha <= 1")
          ], mode)
Beispiel #22
0
  def testMaskingSingleInput(self):

    class MaskedLayer(base_layers.Layer):

      def call(self, inputs, mask=None):
        if mask is not None:
          return inputs * mask
        return inputs

      def compute_mask(self, inputs, mask=None):
        return array_ops.ones_like(inputs)

    if context.in_graph_mode():
      x = base_layers.Input(shape=(32,))
      y = MaskedLayer()(x)  # pylint: disable=not-callable
      network = base_layers.Network(x, y)

      # test callability on Input
      x_2 = base_layers.Input(shape=(32,))
      y_2 = network(x_2)
      self.assertEqual(y_2.get_shape().as_list(), [None, 32])

      # test callability on regular tensor
      x_2 = array_ops.placeholder(dtype='float32', shape=(None, 32))
      y_2 = network(x_2)
      self.assertEqual(y_2.get_shape().as_list(), [None, 32])
    else:
      a = constant_op.constant([2] * 32)
      mask = constant_op.constant([0, 1] * 16)
      a._keras_mask = mask
      b = MaskedLayer().apply(a)
      self.assertTrue(hasattr(b, '_keras_mask'))
      self.assertAllEqual(self.evaluate(array_ops.ones_like(mask)),
                          self.evaluate(getattr(b, '_keras_mask')))
      self.assertAllEqual(self.evaluate(a * mask), self.evaluate(b))
  def testCustomGrad(self):

    def fn(a, b, c):
      return core_layers.dense(a, 10, use_bias=False) + math_ops.matmul(b, c)

    def grad_fn(inputs, trainable_variables, unused_outputs,
                unused_grad_outputs):
      grad_inputs = [
          array_ops.ones_like(t) * (i + 1.) for i, t in enumerate(inputs)
      ]
      grad_vars = [
          array_ops.ones_like(t) * (i + len(inputs) + 1.)
          for i, t in enumerate(trainable_variables)
      ]
      return grad_inputs, grad_vars

    a = random_ops.random_uniform([11, 6])
    b = random_ops.random_uniform([11, 7])
    c = random_ops.random_uniform([7, 10])
    w = random_ops.random_uniform([6, 10])
    out = rev_block_lib._fn_with_custom_grad(grad_fn)(fn)(a, b, c)
    loss = math_ops.reduce_mean(out)
    grads = gradients_impl.gradients(
        loss, [a, b, c, variables.trainable_variables()[0]])
    expected_grads = [
        array_ops.ones_like(t) * (i + 1.) for i, t in enumerate([a, b, c, w])
    ]
    with self.test_session() as sess:
      sess.run(variables.global_variables_initializer())
      g_val, eg_val = sess.run([grads, expected_grads])
      for g1, g2 in zip(g_val, eg_val):
        self.assertAllClose(g1, g2)
  def _survival_function(self, y):
    lower_cutoff = self._lower_cutoff
    upper_cutoff = self._upper_cutoff

    # Recall the promise:
    # survival_function(y) := P[Y > y]
    #                       = 0, if y >= upper_cutoff,
    #                       = 1, if y < lower_cutoff,
    #                       = P[X > y], otherwise.

    # P[Y > j] = P[ceiling(Y) > j] since mass is only at integers, not in
    # between.
    j = math_ops.ceil(y)

    # P[X > j], used when lower_cutoff < X < upper_cutoff.
    result_so_far = self.distribution.survival_function(j)

    # Broadcast, because it's possible that this is a single distribution being
    # evaluated on a number of samples, or something like that.
    j += array_ops.zeros_like(result_so_far)

    # Re-define values at the cutoffs.
    if lower_cutoff is not None:
      result_so_far = math_ops.select(j < lower_cutoff,
                                      array_ops.ones_like(result_so_far),
                                      result_so_far)
    if upper_cutoff is not None:
      result_so_far = math_ops.select(j >= upper_cutoff,
                                      array_ops.zeros_like(result_so_far),
                                      result_so_far)

    return result_so_far
def hinge_loss(labels, logits, weights=1.0, scope=None,
               loss_collection=ops.GraphKeys.LOSSES,
               reduction=Reduction.SUM_BY_NONZERO_WEIGHTS):
  """Adds a hinge loss to the training procedure.

  Args:
    labels: The ground truth output tensor. Its shape should match the shape of
      logits. The values of the tensor are expected to be 0.0 or 1.0.
    logits: The logits, a float tensor.
    weights: Optional `Tensor` whose rank is either 0, or the same rank as
      `labels`, and must be broadcastable to `labels` (i.e., all dimensions must
      be either `1`, or the same as the corresponding `losses` dimension).
    scope: The scope for the operations performed in computing the loss.
    loss_collection: collection to which the loss will be added.
    reduction: Type of reduction to apply to loss.

  Returns:
    Weighted loss float `Tensor`. If `reduction` is `NONE`, this has the same
    shape as `labels`; otherwise, it is scalar.

  Raises:
    ValueError: If the shapes of `logits` and `labels` don't match.
  """
  with ops.name_scope(scope, "hinge_loss", (logits, labels, weights)) as scope:
    logits = math_ops.to_float(logits)
    labels = math_ops.to_float(labels)
    logits.get_shape().assert_is_compatible_with(labels.get_shape())
    # We first need to convert binary labels to -1/1 labels (as floats).
    all_ones = array_ops.ones_like(labels)
    labels = math_ops.subtract(2 * labels, all_ones)
    losses = nn_ops.relu(
        math_ops.subtract(all_ones, math_ops.multiply(labels, logits)))
    return compute_weighted_loss(
        losses, weights, scope, loss_collection, reduction=reduction)
Beispiel #26
0
  def gradient_clipping(grads_and_vars):
    """Internal function for adaptive clipping."""
    grads, variables = zip(*grads_and_vars)

    norm = clip_ops.global_norm(grads)

    max_norm, log_mean = _adaptive_max_norm(norm, std_factor, decay,
                                            global_step, epsilon, name)

    # reports the max gradient norm for debugging
    if report_summary:
      summary.scalar("global_norm/adaptive_max_gradient_norm", max_norm)

    # factor will be 1. if norm is smaller than max_norm
    factor = array_ops.where(norm < max_norm,
                             array_ops.ones_like(norm),
                             math_ops.exp(log_mean) / norm)

    if static_max_norm is not None:
      factor = math_ops.minimum(static_max_norm / norm, factor)

    # apply factor
    clipped_grads = []
    for grad in grads:
      if grad is None:
        clipped_grads.append(None)
      elif isinstance(grad, ops.IndexedSlices):
        clipped_grads.append(
            ops.IndexedSlices(grad.values * factor, grad.indices,
                              grad.dense_shape))
      else:
        clipped_grads.append(grad * factor)

    return list(zip(clipped_grads, variables))
 def call(self, inputs, mask=None):
   self._validate_call_args(inputs=inputs, mask=mask)
   q = inputs[0]
   v = inputs[1]
   k = inputs[2] if len(inputs) > 2 else v
   q_mask = mask[0] if mask else None
   v_mask = mask[1] if mask else None
   scores = self._calculate_scores(query=q, key=k)
   if v_mask is not None:
     # Mask of shape [batch_size, 1, Tv].
     v_mask = array_ops.expand_dims(v_mask, axis=-2)
   if self.causal:
     # Creates a lower triangular mask, so position i cannot attend to
     # positions j>i. This prevents the flow of information from the future
     # into the past.
     scores_shape = array_ops.shape(scores)
     # causal_mask_shape = [1, Tq, Tv].
     causal_mask_shape = array_ops.concat(
         [array_ops.ones_like(scores_shape[:-2]), scores_shape[-2:]],
         axis=0)
     causal_mask = _lower_triangular_mask(causal_mask_shape)
   else:
     causal_mask = None
   scores_mask = _merge_masks(v_mask, causal_mask)
   result = self._apply_scores(scores=scores, value=v, scores_mask=scores_mask)
   if q_mask is not None:
     # Mask of shape [batch_size, Tq, 1].
     q_mask = array_ops.expand_dims(q_mask, axis=-1)
     result *= math_ops.cast(q_mask, dtype=result.dtype)
   return result
Beispiel #28
0
def _num_present(losses, weights, per_batch=False):
  """Computes the number of elements in the loss function induced by `weights`.

  A given weights tensor induces different numbers of usable elements in the
  `losses` tensor. The `weights` tensor is broadcast across `losses` for all
  possible dimensions. For example, if `losses` is a tensor of dimension
  `[4, 5, 6, 3]` and `weights` is a tensor of shape `[4, 5]`, then `weights` is,
  in effect, tiled to match the shape of `losses`. Following this effective
  tile, the total number of present elements is the number of non-zero weights.

  Args:
    losses: `Tensor` of shape `[batch_size, d1, ... dN]`.
    weights: `Tensor` of shape `[]`, `[batch_size]` or
      `[batch_size, d1, ... dK]`, where K < N.
    per_batch: Whether to return the number of elements per batch or as a sum
      total.

  Returns:
    The number of present (non-zero) elements in the losses tensor. If
      `per_batch` is `True`, the value is returned as a tensor of size
      `[batch_size]`. Otherwise, a single scalar tensor is returned.
  """
  with ops.name_scope(None, "num_present", (losses, weights)) as scope:
    weights = math_ops.to_float(weights)
    present = array_ops.where(
        math_ops.equal(weights, 0.0),
        array_ops.zeros_like(weights),
        array_ops.ones_like(weights))
    present = weights_broadcast_ops.broadcast_weights(present, losses)
    if per_batch:
      return math_ops.reduce_sum(
          present, axis=math_ops.range(1, array_ops.rank(present)),
          keep_dims=True, name=scope)
    return math_ops.reduce_sum(present, name=scope)
Beispiel #29
0
  def pdf(self, x, name="pdf"):
    """The PDF of observations in `x` under these Uniform distribution(s).

    Args:
      x: tensor of dtype `dtype`, must be broadcastable with `a` and `b`.
      name: The name to give this op.

    Returns:
      pdf: tensor of dtype `dtype`, the pdf values of `x`. If `x` is `nan`, will
          return `nan`.
    """
    with ops.name_scope(self.name):
      with ops.op_scope([self.a, self.b, x], name):
        x = ops.convert_to_tensor(x, name="x")
        if x.dtype != self.dtype:
          raise TypeError("Input x dtype does not match dtype: %s vs. %s" %
                          (x.dtype, self.dtype))

        broadcasted_x = x * self._ones()
        return math_ops.select(
            math_ops.is_nan(broadcasted_x), broadcasted_x, math_ops.select(
                math_ops.logical_or(broadcasted_x < self.a,
                                    broadcasted_x > self.b),
                array_ops.zeros_like(broadcasted_x),
                (1.0 / self.range()) * array_ops.ones_like(broadcasted_x)))
  def _log_cdf(self, y):
    lower_cutoff = self._lower_cutoff
    upper_cutoff = self._upper_cutoff

    # Recall the promise:
    # cdf(y) := P[Y <= y]
    #         = 1, if y >= upper_cutoff,
    #         = 0, if y < lower_cutoff,
    #         = P[X <= y], otherwise.

    # P[Y <= j] = P[floor(Y) <= j] since mass is only at integers, not in
    # between.
    j = math_ops.floor(y)

    result_so_far = self.distribution.log_cdf(j)

    # Broadcast, because it's possible that this is a single distribution being
    # evaluated on a number of samples, or something like that.
    j += array_ops.zeros_like(result_so_far)

    # Re-define values at the cutoffs.
    if lower_cutoff is not None:
      neg_inf = -np.inf * array_ops.ones_like(result_so_far)
      result_so_far = math_ops.select(j < lower_cutoff, neg_inf, result_so_far)
    if upper_cutoff is not None:
      result_so_far = math_ops.select(j >= upper_cutoff,
                                      array_ops.zeros_like(result_so_far),
                                      result_so_far)

    return result_so_far
Beispiel #31
0
 def _stddev(self):
     return self.scale * array_ops.ones_like(
         self.loc) * math.pi / math.sqrt(3)
Beispiel #32
0
def _ndtri(p):
    """Implements ndtri core logic."""

    # Constants used in piece-wise rational approximations. Taken from the cephes
    # library:
    # https://github.com/scipy/scipy/blob/master/scipy/special/cephes/ndtri.c
    p0 = list(
        reversed([
            -5.99633501014107895267E1, 9.80010754185999661536E1,
            -5.66762857469070293439E1, 1.39312609387279679503E1,
            -1.23916583867381258016E0
        ]))
    q0 = list(
        reversed([
            1.0, 1.95448858338141759834E0, 4.67627912898881538453E0,
            8.63602421390890590575E1, -2.25462687854119370527E2,
            2.00260212380060660359E2, -8.20372256168333339912E1,
            1.59056225126211695515E1, -1.18331621121330003142E0
        ]))
    p1 = list(
        reversed([
            4.05544892305962419923E0, 3.15251094599893866154E1,
            5.71628192246421288162E1, 4.40805073893200834700E1,
            1.46849561928858024014E1, 2.18663306850790267539E0,
            -1.40256079171354495875E-1, -3.50424626827848203418E-2,
            -8.57456785154685413611E-4
        ]))
    q1 = list(
        reversed([
            1.0, 1.57799883256466749731E1, 4.53907635128879210584E1,
            4.13172038254672030440E1, 1.50425385692907503408E1,
            2.50464946208309415979E0, -1.42182922854787788574E-1,
            -3.80806407691578277194E-2, -9.33259480895457427372E-4
        ]))
    p2 = list(
        reversed([
            3.23774891776946035970E0, 6.91522889068984211695E0,
            3.93881025292474443415E0, 1.33303460815807542389E0,
            2.01485389549179081538E-1, 1.23716634817820021358E-2,
            3.01581553508235416007E-4, 2.65806974686737550832E-6,
            6.23974539184983293730E-9
        ]))
    q2 = list(
        reversed([
            1.0, 6.02427039364742014255E0, 3.67983563856160859403E0,
            1.37702099489081330271E0, 2.16236993594496635890E-1,
            1.34204006088543189037E-2, 3.28014464682127739104E-4,
            2.89247864745380683936E-6, 6.79019408009981274425E-9
        ]))

    def _create_polynomial(var, coeffs):
        """Compute n_th order polynomial via Horner's method."""
        if not coeffs:
            return 0.
        return coeffs[0] + _create_polynomial(var, coeffs[1:]) * var

    maybe_complement_p = array_ops.where(p > 1. - np.exp(-2.), 1. - p, p)
    # Write in an arbitrary value in place of 0 for p since 0 will cause NaNs
    # later on. The result from the computation when p == 0 is not used so any
    # number that doesn't result in NaNs is fine.
    sanitized_mcp = array_ops.where(maybe_complement_p <= 0.,
                                    0.5 * array_ops.ones_like(p),
                                    maybe_complement_p)

    # Compute x for p > exp(-2): x/sqrt(2pi) = w + w**3 P0(w**2)/Q0(w**2).
    w = sanitized_mcp - 0.5
    ww = w**2
    x_for_big_p = w + w * ww * (_create_polynomial(ww, p0) /
                                _create_polynomial(ww, q0))
    x_for_big_p *= -np.sqrt(2. * np.pi)

    # Compute x for p <= exp(-2): x = z - log(z)/z - (1/z) P(1/z) / Q(1/z),
    # where z = sqrt(-2. * log(p)), and P/Q are chosen between two different
    # arrays based on wether p < exp(-32).
    z = math_ops.sqrt(-2. * math_ops.log(sanitized_mcp))
    first_term = z - math_ops.log(z) / z
    second_term_small_p = (_create_polynomial(1. / z, p2) /
                           _create_polynomial(1. / z, q2)) / z
    second_term_otherwise = (_create_polynomial(1. / z, p1) /
                             _create_polynomial(1. / z, q1)) / z
    x_for_small_p = first_term - second_term_small_p
    x_otherwise = first_term - second_term_otherwise

    x = array_ops.where(sanitized_mcp > np.exp(-2.), x_for_big_p,
                        array_ops.where(z >= 8.0, x_for_small_p, x_otherwise))

    x = array_ops.where(p > 1. - np.exp(-2.), x, -x)
    infinity = constant_op.constant(np.inf,
                                    dtype=x.dtype) * array_ops.ones_like(x)
    x_nan_replaced = array_ops.where(p <= 0.0, -infinity,
                                     array_ops.where(p >= 1.0, infinity, x))
    return x_nan_replaced
 def _compute_ones_matrix_shape(self):
     if self.batch_shape.is_fully_defined():
         batch_shape = self.batch_shape.concatenate([1, 1])
         return np.ones_like(batch_shape, dtype=np.int32)
     return array_ops.concat(
         [array_ops.ones_like(self.batch_shape_tensor()), [1, 1]], 0)
Beispiel #34
0
 def CheckUnitary(self, x, tol):
     # Tests that x[...,:,:]^H * x[...,:,:] is close to the identity.
     xx = math_ops.matmul(x, x, adjoint_a=True)
     identity = array_ops.matrix_band_part(array_ops.ones_like(xx), 0, 0)
     self.assertAllClose(identity, xx, atol=tol)
 def big():
     return array_ops.ones_like(gini, dtype=dtypes.float32) * 10000000.
    def training_graph(self,
                       input_data,
                       input_labels,
                       random_seed,
                       data_spec,
                       sparse_features=None,
                       input_weights=None):
        """Constructs a TF graph for training a random tree.

    Args:
      input_data: A tensor or placeholder for input data.
      input_labels: A tensor or placeholder for labels associated with
        input_data.
      random_seed: The random number generator seed to use for this tree.  0
        means use the current time as the seed.
      data_spec: A data_ops.TensorForestDataSpec object specifying the
        original feature/columns of the data.
      sparse_features: A tf.SparseTensor for sparse input data.
      input_weights: A float tensor or placeholder holding per-input weights,
        or None if all inputs are to be weighted equally.

    Returns:
      The last op in the random tree training graph.
    """
        epoch = math_ops.to_int32(get_epoch_variable())

        serialized_input_spec = data_spec.SerializeToString()

        if input_weights is None:
            input_weights = []

        if input_data is None:
            input_data = []

        sparse_indices = []
        sparse_values = []
        sparse_shape = []
        if sparse_features is not None:
            sparse_indices = sparse_features.indices
            sparse_values = sparse_features.values
            sparse_shape = sparse_features.dense_shape

        # Count extremely random stats.
        (node_sums, node_squares, splits_indices, splits_sums, splits_squares,
         totals_indices, totals_sums, totals_squares,
         input_leaves) = (tensor_forest_ops.count_extremely_random_stats(
             input_data,
             sparse_indices,
             sparse_values,
             sparse_shape,
             input_labels,
             input_weights,
             self.variables.tree,
             self.variables.tree_thresholds,
             self.variables.node_to_accumulator_map,
             self.variables.candidate_split_features,
             self.variables.candidate_split_thresholds,
             self.variables.start_epoch,
             epoch,
             input_spec=serialized_input_spec,
             num_classes=self.params.num_output_columns,
             regression=self.params.regression))
        node_update_ops = []
        node_update_ops.append(
            state_ops.assign_add(self.variables.node_sums, node_sums))

        splits_update_ops = []
        splits_update_ops.append(
            tensor_forest_ops.scatter_add_ndim(
                self.variables.candidate_split_sums, splits_indices,
                splits_sums))
        splits_update_ops.append(
            tensor_forest_ops.scatter_add_ndim(self.variables.accumulator_sums,
                                               totals_indices, totals_sums))

        if self.params.regression:
            node_update_ops.append(
                state_ops.assign_add(self.variables.node_squares,
                                     node_squares))
            splits_update_ops.append(
                tensor_forest_ops.scatter_add_ndim(
                    self.variables.candidate_split_squares, splits_indices,
                    splits_squares))
            splits_update_ops.append(
                tensor_forest_ops.scatter_add_ndim(
                    self.variables.accumulator_squares, totals_indices,
                    totals_squares))

        # Sample inputs.
        update_indices, feature_updates, threshold_updates = (
            tensor_forest_ops.sample_inputs(
                input_data,
                sparse_indices,
                sparse_values,
                sparse_shape,
                input_weights,
                self.variables.node_to_accumulator_map,
                input_leaves,
                self.variables.candidate_split_features,
                self.variables.candidate_split_thresholds,
                input_spec=serialized_input_spec,
                split_initializations_per_input=(
                    self.params.split_initializations_per_input),
                split_sampling_random_seed=random_seed))
        update_features_op = state_ops.scatter_update(
            self.variables.candidate_split_features, update_indices,
            feature_updates)
        update_thresholds_op = state_ops.scatter_update(
            self.variables.candidate_split_thresholds, update_indices,
            threshold_updates)

        # Calculate finished nodes.
        with ops.control_dependencies(splits_update_ops):
            # Passing input_leaves to finished nodes here means that nodes that
            # have become stale won't be deallocated until an input reaches them,
            # because we're trying to avoid considering every fertile node for
            # performance reasons.
            finished, stale = tensor_forest_ops.finished_nodes(
                input_leaves,
                self.variables.node_to_accumulator_map,
                self.variables.candidate_split_sums,
                self.variables.candidate_split_squares,
                self.variables.accumulator_sums,
                self.variables.accumulator_squares,
                self.variables.start_epoch,
                epoch,
                num_split_after_samples=self.params.split_after_samples,
                min_split_samples=self.params.min_split_samples,
                dominate_method=self.params.dominate_method,
                dominate_fraction=self.params.dominate_fraction)

        # Update leaf scores.
        # TODO(thomaswc): Store the leaf scores in a TopN and only update the
        # scores of the leaves that were touched by this batch of input.
        children = array_ops.squeeze(array_ops.slice(self.variables.tree,
                                                     [0, 0], [-1, 1]),
                                     squeeze_dims=[1])
        is_leaf = math_ops.equal(constants.LEAF_NODE, children)
        leaves = math_ops.to_int32(
            array_ops.squeeze(array_ops.where(is_leaf), squeeze_dims=[1]))
        non_fertile_leaves = array_ops.boolean_mask(
            leaves,
            math_ops.less(
                array_ops.gather(self.variables.node_to_accumulator_map,
                                 leaves), 0))

        # TODO(gilberth): It should be possible to limit the number of non
        # fertile leaves we calculate scores for, especially since we can only take
        # at most array_ops.shape(finished)[0] of them.
        with ops.control_dependencies(node_update_ops):
            sums = array_ops.gather(self.variables.node_sums,
                                    non_fertile_leaves)
            if self.params.regression:
                squares = array_ops.gather(self.variables.node_squares,
                                           non_fertile_leaves)
                non_fertile_leaf_scores = self._variance(sums, squares)
            else:
                non_fertile_leaf_scores = self._weighted_gini(sums)

        # Calculate best splits.
        with ops.control_dependencies(splits_update_ops):
            split_indices = tensor_forest_ops.best_splits(
                finished,
                self.variables.node_to_accumulator_map,
                self.variables.candidate_split_sums,
                self.variables.candidate_split_squares,
                self.variables.accumulator_sums,
                self.variables.accumulator_squares,
                regression=self.params.regression)

        # Grow tree.
        with ops.control_dependencies(
            [update_features_op, update_thresholds_op]):
            (tree_update_indices, tree_children_updates,
             tree_threshold_updates, new_eot) = (tensor_forest_ops.grow_tree(
                 self.variables.end_of_tree,
                 self.variables.node_to_accumulator_map, finished,
                 split_indices, self.variables.candidate_split_features,
                 self.variables.candidate_split_thresholds))
            tree_update_op = state_ops.scatter_update(self.variables.tree,
                                                      tree_update_indices,
                                                      tree_children_updates)
            thresholds_update_op = state_ops.scatter_update(
                self.variables.tree_thresholds, tree_update_indices,
                tree_threshold_updates)
            # TODO(thomaswc): Only update the epoch on the new leaves.
            new_epoch_updates = epoch * array_ops.ones_like(
                tree_threshold_updates, dtype=dtypes.int32)
            epoch_update_op = state_ops.scatter_update(
                self.variables.start_epoch, tree_update_indices,
                new_epoch_updates)

        # Update fertile slots.
        with ops.control_dependencies([tree_update_op]):
            (n2a_map_updates, a2n_map_updates, accumulators_cleared,
             accumulators_allocated) = (tensor_forest_ops.update_fertile_slots(
                 finished,
                 non_fertile_leaves,
                 non_fertile_leaf_scores,
                 self.variables.end_of_tree,
                 self.variables.accumulator_sums,
                 self.variables.node_to_accumulator_map,
                 stale,
                 self.variables.node_sums,
                 regression=self.params.regression))

        # Ensure end_of_tree doesn't get updated until UpdateFertileSlots has
        # used it to calculate new leaves.
        gated_new_eot, = control_flow_ops.tuple(
            [new_eot], control_inputs=[n2a_map_updates])
        eot_update_op = state_ops.assign(self.variables.end_of_tree,
                                         gated_new_eot)

        updates = []
        updates.append(eot_update_op)
        updates.append(tree_update_op)
        updates.append(thresholds_update_op)
        updates.append(epoch_update_op)

        updates.append(
            state_ops.scatter_update(self.variables.node_to_accumulator_map,
                                     n2a_map_updates[0], n2a_map_updates[1]))

        updates.append(
            state_ops.scatter_update(self.variables.accumulator_to_node_map,
                                     a2n_map_updates[0], a2n_map_updates[1]))

        cleared_and_allocated_accumulators = array_ops.concat_v2(
            [accumulators_cleared, accumulators_allocated], 0)

        # Calculate values to put into scatter update for candidate counts.
        # Candidate split counts are always reset back to 0 for both cleared
        # and allocated accumulators. This means some accumulators might be doubly
        # reset to 0 if the were released and not allocated, then later allocated.
        split_values = array_ops.tile(
            array_ops.expand_dims(
                array_ops.expand_dims(
                    array_ops.zeros_like(cleared_and_allocated_accumulators,
                                         dtype=dtypes.float32), 1), 2),
            [
                1, self.params.num_splits_to_consider,
                self.params.num_output_columns
            ])
        updates.append(
            state_ops.scatter_update(self.variables.candidate_split_sums,
                                     cleared_and_allocated_accumulators,
                                     split_values))
        if self.params.regression:
            updates.append(
                state_ops.scatter_update(
                    self.variables.candidate_split_squares,
                    cleared_and_allocated_accumulators, split_values))

        # Calculate values to put into scatter update for total counts.
        total_cleared = array_ops.tile(
            array_ops.expand_dims(
                math_ops.negative(
                    array_ops.ones_like(accumulators_cleared,
                                        dtype=dtypes.float32)), 1),
            [1, self.params.num_output_columns])
        total_reset = array_ops.tile(
            array_ops.expand_dims(
                array_ops.zeros_like(accumulators_allocated,
                                     dtype=dtypes.float32), 1),
            [1, self.params.num_output_columns])
        accumulator_updates = array_ops.concat_v2([total_cleared, total_reset],
                                                  0)
        updates.append(
            state_ops.scatter_update(self.variables.accumulator_sums,
                                     cleared_and_allocated_accumulators,
                                     accumulator_updates))
        if self.params.regression:
            updates.append(
                state_ops.scatter_update(self.variables.accumulator_squares,
                                         cleared_and_allocated_accumulators,
                                         accumulator_updates))

        # Calculate values to put into scatter update for candidate splits.
        split_features_updates = array_ops.tile(
            array_ops.expand_dims(
                math_ops.negative(
                    array_ops.ones_like(cleared_and_allocated_accumulators)),
                1), [1, self.params.num_splits_to_consider])
        updates.append(
            state_ops.scatter_update(self.variables.candidate_split_features,
                                     cleared_and_allocated_accumulators,
                                     split_features_updates))

        updates += self.finish_iteration()

        return control_flow_ops.group(*updates)
Beispiel #37
0
 def _mean(self):
     return self.loc * array_ops.ones_like(self.scale)
Beispiel #38
0
 def _stddev(self):
     return self.scale * array_ops.ones_like(self.loc)
Beispiel #39
0
 def _ones(self):
     return array_ops.ones_like(self._df + self._mu + self._sigma)
Beispiel #40
0
 def tpu_fn():
   results = mid_level_api.dequeue()
   mid_level_api.apply_gradients((None, None,
                                  array_ops.ones_like(results[2])))
   return results
def random_normal_correlated_columns(shape,
                                     mean=0.0,
                                     stddev=1.0,
                                     dtype=dtypes.float32,
                                     eps=1e-4,
                                     seed=None):
    """Batch matrix with (possibly complex) Gaussian entries and correlated cols.

  Returns random batch matrix `A` with specified element-wise `mean`, `stddev`,
  living close to an embedded hyperplane.

  Suppose `shape[-2:] = (M, N)`.

  If `M < N`, `A` is a random `M x N` [batch] matrix with iid Gaussian entries.

  If `M >= N`, then the colums of `A` will be made almost dependent as follows:

  ```
  L = random normal N x N-1 matrix, mean = 0, stddev = 1 / sqrt(N - 1)
  B = random normal M x N-1 matrix, mean = 0, stddev = stddev.

  G = (L B^H)^H, a random normal M x N matrix, living on N-1 dim hyperplane
  E = a random normal M x N matrix, mean = 0, stddev = eps
  mu = a constant M x N matrix, equal to the argument "mean"

  A = G + E + mu
  ```

  Args:
    shape:  Python list of integers.
      Shape of the returned tensor.  Must be at least length two.
    mean:  `Tensor` giving mean of normal to sample from.
    stddev:  `Tensor` giving stdev of normal to sample from.
    dtype:  `TensorFlow` `dtype` or numpy dtype
    eps:  Distance each column is perturbed from the low-dimensional subspace.
    seed:  Python integer seed for the RNG.

  Returns:
    `Tensor` with desired shape and dtype.

  Raises:
    ValueError:  If `shape` is not at least length 2.
  """
    dtype = dtypes.as_dtype(dtype)

    if len(shape) < 2:
        raise ValueError(
            "Argument shape must be at least length 2.  Found: %s" % shape)

    # Shape is the final shape, e.g. [..., M, N]
    shape = list(shape)
    batch_shape = shape[:-2]
    m, n = shape[-2:]

    # If there is only one column, "they" are by definition correlated.
    if n < 2 or n < m:
        return random_normal(shape,
                             mean=mean,
                             stddev=stddev,
                             dtype=dtype,
                             seed=seed)

    # Shape of the matrix with only n - 1 columns that we will embed in higher
    # dimensional space.
    smaller_shape = batch_shape + [m, n - 1]

    # Shape of the embedding matrix, mapping batch matrices
    # from [..., N-1, M] to [..., N, M]
    embedding_mat_shape = batch_shape + [n, n - 1]

    # This stddev for the embedding_mat ensures final result has correct stddev.
    stddev_mat = 1 / np.sqrt(n - 1)

    with ops.name_scope("random_normal_correlated_columns"):
        smaller_mat = random_normal(smaller_shape,
                                    mean=0.0,
                                    stddev=stddev_mat,
                                    dtype=dtype,
                                    seed=seed)

        if seed is not None:
            seed += 1287

        embedding_mat = random_normal(embedding_mat_shape,
                                      dtype=dtype,
                                      seed=seed)

        embedded_t = math_ops.matmul(embedding_mat,
                                     smaller_mat,
                                     transpose_b=True)
        embedded = array_ops.matrix_transpose(embedded_t)

        mean_mat = array_ops.ones_like(embedded) * mean

        return embedded + random_normal(shape, stddev=eps,
                                        dtype=dtype) + mean_mat
Beispiel #42
0
 def call(self, inputs, training=None):
     if training is None:
         training = keras.backend.learning_phase()
     return tf_utils.smart_cond(
         training, lambda: array_ops.ones_like(inputs),
         lambda: array_ops.zeros_like(inputs))
Beispiel #43
0
 def _entropy(self):
     # Use broadcasting rules to calculate the full broadcast scale.
     scale = self.scale * array_ops.ones_like(self.loc)
     return 0.5 * math.log(2. * math.pi * math.e) + math_ops.log(scale)
    def training_graph(self, input_data, input_labels, **tree_kwargs):
        """Constructs a TF graph for training a random forest.

    Args:
      input_data: A tensor or dict of string->Tensor for input data.
      input_labels: A tensor or placeholder for labels associated with
        input_data.
      **tree_kwargs: Keyword arguments passed to each tree's training_graph.

    Returns:
      The last op in the random forest training graph.

    Raises:
      NotImplementedError: If trying to use bagging with sparse features.
    """
        processed_dense_features, processed_sparse_features, data_spec = (
            data_ops.ParseDataTensorOrDict(input_data))

        if input_labels is not None:
            labels = data_ops.ParseLabelTensorOrDict(input_labels)

        data_spec = data_spec or self.get_default_data_spec(input_data)

        tree_graphs = []
        for i in range(self.params.num_trees):
            with ops.device(self.device_assigner.get_device(i)):
                seed = self.params.base_random_seed
                if seed != 0:
                    seed += i
                # If using bagging, randomly select some of the input.
                tree_data = processed_dense_features
                tree_labels = labels
                if self.params.bagging_fraction < 1.0:
                    # TODO(gilberth): Support bagging for sparse features.
                    if processed_sparse_features is not None:
                        raise NotImplementedError(
                            'Bagging not supported with sparse features.')
                    # TODO(thomaswc): This does sampling without replacment.  Consider
                    # also allowing sampling with replacement as an option.
                    batch_size = array_ops.strided_slice(
                        array_ops.shape(processed_dense_features), [0], [1])
                    r = random_ops.random_uniform(batch_size, seed=seed)
                    mask = math_ops.less(
                        r,
                        array_ops.ones_like(r) * self.params.bagging_fraction)
                    gather_indices = array_ops.squeeze(array_ops.where(mask),
                                                       squeeze_dims=[1])
                    # TODO(thomaswc): Calculate out-of-bag data and labels, and store
                    # them for use in calculating statistics later.
                    tree_data = array_ops.gather(processed_dense_features,
                                                 gather_indices)
                    tree_labels = array_ops.gather(labels, gather_indices)
                if self.params.bagged_features:
                    if processed_sparse_features is not None:
                        raise NotImplementedError(
                            'Feature bagging not supported with sparse features.'
                        )
                    tree_data = self._bag_features(i, tree_data)

                initialization = self.trees[i].tree_initialization()

                with ops.control_dependencies([initialization]):
                    tree_graphs.append(self.trees[i].training_graph(
                        tree_data,
                        tree_labels,
                        seed,
                        data_spec=data_spec,
                        sparse_features=processed_sparse_features,
                        **tree_kwargs))

        return control_flow_ops.group(*tree_graphs, name='train')
Beispiel #45
0
 def _entropy(self):
     # Use broadcasting rules to calculate the full broadcast sigma.
     scale = self.scale * array_ops.ones_like(self.loc)
     return 2 + math_ops.log(scale)
Beispiel #46
0
def confusion_matrix(labels,
                     predictions,
                     num_classes=None,
                     dtype=dtypes.int32,
                     name=None,
                     weights=None):
    """Computes the confusion matrix from predictions and labels.

  Calculate the Confusion Matrix for a pair of prediction and
  label 1-D int arrays.

  The matrix columns represent the prediction labels and the rows represent the
  real labels. The confusion matrix is always a 2-D array of shape `[n, n]`,
  where `n` is the number of valid labels for a given classification task. Both
  prediction and labels must be 1-D arrays of the same shape in order for this
  function to work.

  If `num_classes` is None, then `num_classes` will be set to the one plus
  the maximum value in either predictions or labels.
  Class labels are expected to start at 0. E.g., if `num_classes` was
  three, then the possible labels would be `[0, 1, 2]`.

  If `weights` is not `None`, then each prediction contributes its
  corresponding weight to the total value of the confusion matrix cell.

  For example:

  ```python
    tf.contrib.metrics.confusion_matrix([1, 2, 4], [2, 2, 4]) ==>
        [[0 0 0 0 0]
         [0 0 1 0 0]
         [0 0 1 0 0]
         [0 0 0 0 0]
         [0 0 0 0 1]]
  ```

  Note that the possible labels are assumed to be `[0, 1, 2, 3, 4]`,
  resulting in a 5x5 confusion matrix.

  Args:
    labels: 1-D `Tensor` of real labels for the classification task.
    predictions: 1-D `Tensor` of predictions for a given classification.
    num_classes: The possible number of labels the classification task can
                 have. If this value is not provided, it will be calculated
                 using both predictions and labels array.
    dtype: Data type of the confusion matrix.
    name: Scope name.
    weights: An optional `Tensor` whose shape matches `predictions`.

  Returns:
    A k X k matrix representing the confusion matrix, where k is the number of
    possible labels in the classification task.

  Raises:
    ValueError: If both predictions and labels are not 1-D vectors and have
      mismatched shapes, or if `weights` is not `None` and its shape doesn't
      match `predictions`.
  """
    with ops.name_scope(name, 'confusion_matrix',
                        (predictions, labels, num_classes, weights)) as name:
        labels, predictions = remove_squeezable_dimensions(
            ops.convert_to_tensor(labels, name='labels'),
            ops.convert_to_tensor(predictions, name='predictions'))
        predictions = math_ops.cast(predictions, dtypes.int64)
        labels = math_ops.cast(labels, dtypes.int64)

        if num_classes is None:
            num_classes = math_ops.maximum(math_ops.reduce_max(predictions),
                                           math_ops.reduce_max(labels)) + 1

        if weights is not None:
            predictions.get_shape().assert_is_compatible_with(
                weights.get_shape())
            weights = math_ops.cast(weights, dtype)

        shape = array_ops.stack([num_classes, num_classes])
        indices = array_ops.transpose(array_ops.stack([labels, predictions]))
        values = (array_ops.ones_like(predictions, dtype)
                  if weights is None else weights)
        cm_sparse = sparse_tensor.SparseTensor(
            indices=indices,
            values=values,
            dense_shape=math_ops.to_int64(shape))
        zero_matrix = array_ops.zeros(math_ops.to_int32(shape), dtype)

        return sparse_ops.sparse_add(zero_matrix, cm_sparse)
def _compute_sampled_logits(weights,
                            biases,
                            inputs,
                            labels,
                            num_sampled,
                            num_classes,
                            num_true=1,
                            sampled_values=None,
                            subtract_log_q=True,
                            remove_accidental_hits=False,
                            partition_strategy="mod",
                            name=None):
    """Helper function for nce_loss and sampled_softmax_loss functions.

  Computes sampled output training logits and labels suitable for implementing
  e.g. noise-contrastive estimation (see nce_loss) or sampled softmax (see
  sampled_softmax_loss).

  Note: In the case where num_true > 1, we assign to each target class
  the target probability 1 / num_true so that the target probabilities
  sum to 1 per-example.

  Args:
    weights: A `Tensor` of shape `[num_classes, dim]`, or a list of `Tensor`
        objects whose concatenation along dimension 0 has shape
        `[num_classes, dim]`.  The (possibly-partitioned) class embeddings.
    biases: A `Tensor` of shape `[num_classes]`.  The class biases.
    inputs: A `Tensor` of shape `[batch_size, dim]`.  The forward
        activations of the input network.
    labels: A `Tensor` of type `int64` and shape `[batch_size,
        num_true]`. The target classes.  Note that this format differs from
        the `labels` argument of `nn.softmax_cross_entropy_with_logits`.
    num_sampled: An `int`.  The number of classes to randomly sample per batch.
    num_classes: An `int`. The number of possible classes.
    num_true: An `int`.  The number of target classes per training example.
    sampled_values: a tuple of (`sampled_candidates`, `true_expected_count`,
        `sampled_expected_count`) returned by a `*_candidate_sampler` function.
        (if None, we default to `log_uniform_candidate_sampler`)
    subtract_log_q: A `bool`.  whether to subtract the log expected count of
        the labels in the sample to get the logits of the true labels.
        Default is True.  Turn off for Negative Sampling.
    remove_accidental_hits:  A `bool`.  whether to remove "accidental hits"
        where a sampled class equals one of the target classes.  Default is
        False.
    partition_strategy: A string specifying the partitioning strategy, relevant
        if `len(weights) > 1`. Currently `"div"` and `"mod"` are supported.
        Default is `"mod"`. See `tf.nn.embedding_lookup` for more details.
    name: A name for the operation (optional).
  Returns:
    out_logits, out_labels: `Tensor` objects each with shape
        `[batch_size, num_true + num_sampled]`, for passing to either
        `nn.sigmoid_cross_entropy_with_logits` (NCE) or
        `nn.softmax_cross_entropy_with_logits` (sampled softmax).
  """

    if not isinstance(weights, list):
        weights = [weights]

    with ops.op_scope(weights + [biases, inputs, labels], name,
                      "compute_sampled_logits"):
        if labels.dtype != dtypes.int64:
            labels = math_ops.cast(labels, dtypes.int64)
        labels_flat = array_ops.reshape(labels, [-1])

        # Sample the negative labels.
        #   sampled shape: [num_sampled] tensor
        #   true_expected_count shape = [batch_size, 1] tensor
        #   sampled_expected_count shape = [num_sampled] tensor
        if sampled_values is None:
            sampled_values = candidate_sampling_ops.log_uniform_candidate_sampler(
                true_classes=labels,
                num_true=num_true,
                num_sampled=num_sampled,
                unique=True,
                range_max=num_classes)
        # NOTE: pylint cannot tell that 'sampled_values' is a sequence
        # pylint: disable=unpacking-non-sequence
        sampled, true_expected_count, sampled_expected_count = sampled_values
        # pylint: enable=unpacking-non-sequence

        # labels_flat is a [batch_size * num_true] tensor
        # sampled is a [num_sampled] int tensor
        all_ids = array_ops.concat(0, [labels_flat, sampled])

        # weights shape is [num_classes, dim]
        all_w = embedding_ops.embedding_lookup(
            weights, all_ids, partition_strategy=partition_strategy)
        all_b = embedding_ops.embedding_lookup(biases, all_ids)
        # true_w shape is [batch_size * num_true, dim]
        # true_b is a [batch_size * num_true] tensor
        true_w = array_ops.slice(
            all_w, [0, 0],
            array_ops.pack([array_ops.shape(labels_flat)[0], -1]))
        true_b = array_ops.slice(all_b, [0], array_ops.shape(labels_flat))

        # inputs shape is [batch_size, dim]
        # true_w shape is [batch_size * num_true, dim]
        # row_wise_dots is [batch_size, num_true, dim]
        dim = array_ops.shape(true_w)[1:2]
        new_true_w_shape = array_ops.concat(0, [[-1, num_true], dim])
        row_wise_dots = math_ops.mul(
            array_ops.expand_dims(inputs, 1),
            array_ops.reshape(true_w, new_true_w_shape))
        # We want the row-wise dot plus biases which yields a
        # [batch_size, num_true] tensor of true_logits.
        dots_as_matrix = array_ops.reshape(row_wise_dots,
                                           array_ops.concat(0, [[-1], dim]))
        true_logits = array_ops.reshape(_sum_rows(dots_as_matrix),
                                        [-1, num_true])
        true_b = array_ops.reshape(true_b, [-1, num_true])
        true_logits += true_b

        # Lookup weights and biases for sampled labels.
        #   sampled_w shape is [num_sampled, dim]
        #   sampled_b is a [num_sampled] float tensor
        sampled_w = array_ops.slice(
            all_w, array_ops.pack([array_ops.shape(labels_flat)[0], 0]),
            [-1, -1])
        sampled_b = array_ops.slice(all_b, array_ops.shape(labels_flat), [-1])

        # inputs has shape [batch_size, dim]
        # sampled_w has shape [num_sampled, dim]
        # sampled_b has shape [num_sampled]
        # Apply X*W'+B, which yields [batch_size, num_sampled]
        sampled_logits = math_ops.matmul(inputs, sampled_w,
                                         transpose_b=True) + sampled_b

        if remove_accidental_hits:
            acc_hits = candidate_sampling_ops.compute_accidental_hits(
                labels, sampled, num_true=num_true)
            acc_indices, acc_ids, acc_weights = acc_hits

            # This is how SparseToDense expects the indices.
            acc_indices_2d = array_ops.reshape(acc_indices, [-1, 1])
            acc_ids_2d_int32 = array_ops.reshape(
                math_ops.cast(acc_ids, dtypes.int32), [-1, 1])
            sparse_indices = array_ops.concat(
                1, [acc_indices_2d, acc_ids_2d_int32], "sparse_indices")
            # Create sampled_logits_shape = [batch_size, num_sampled]
            sampled_logits_shape = array_ops.concat(0, [
                array_ops.shape(labels)[:1],
                array_ops.expand_dims(num_sampled, 0)
            ])
            if sampled_logits.dtype != acc_weights.dtype:
                acc_weights = math_ops.cast(acc_weights, sampled_logits.dtype)
            sampled_logits += sparse_ops.sparse_to_dense(
                sparse_indices,
                sampled_logits_shape,
                acc_weights,
                default_value=0.0,
                validate_indices=False)

        if subtract_log_q:
            # Subtract log of Q(l), prior probability that l appears in sampled.
            true_logits -= math_ops.log(true_expected_count)
            sampled_logits -= math_ops.log(sampled_expected_count)

        # Construct output logits and labels. The true labels/logits start at col 0.
        out_logits = array_ops.concat(1, [true_logits, sampled_logits])
        # true_logits is a float tensor, ones_like(true_logits) is a float tensor
        # of ones. We then divide by num_true to ensure the per-example labels sum
        # to 1.0, i.e. form a proper probability distribution.
        out_labels = array_ops.concat(1, [
            array_ops.ones_like(true_logits) / num_true,
            array_ops.zeros_like(sampled_logits)
        ])

    return out_logits, out_labels
Beispiel #48
0
 def _unbounded_exponential_log_prob(x):
     """An exponential distribution with log-likelihood NaN for x < 0."""
     per_element_potentials = array_ops.where(
         x < 0, np.nan * array_ops.ones_like(x), -x)
     return math_ops.reduce_sum(per_element_potentials)
    def _create_tpu_estimator_spec(self,
                                   features,
                                   mode,
                                   logits,
                                   labels=None,
                                   optimizer=None,
                                   train_op_fn=None,
                                   regularization_losses=None):
        """Returns an `model_fn._TPUEstimatorSpec`.

    Args:
      features: Input `dict` of `Tensor` or `SparseTensor` objects.
      mode: Estimator's `ModeKeys`.
      logits: logits `Tensor` with shape `[D0, D1, ... DN, n_classes]`.
        For many applications, the shape is `[batch_size, n_classes]`.
      labels: Labels with shape matching `logits`. Can be multi-hot `Tensor`
        with shape `[D0, D1, ... DN, n_classes]` or `SparseTensor` with
        `dense_shape` `[D0, D1, ... DN, ?]`. `labels` is required argument when
        `mode` equals `TRAIN` or `EVAL`.
      optimizer: `Optimizer` instance to optimize the loss in TRAIN mode.
        Namely, sets `train_op = optimizer.minimize(loss, global_step)`, which
        updates variables and increments `global_step`.
      train_op_fn: Function that takes a scalar loss `Tensor` and returns
        `train_op`. Used if `optimizer` is `None`.
      regularization_losses: A list of additional scalar losses to be added to
        the training loss, such as regularization losses. These losses are
        usually expressed as a batch average, so for best results users need to
        set `loss_reduction=SUM_OVER_BATCH_SIZE` or
        `loss_reduction=SUM_OVER_NONZERO_WEIGHTS` when creating the head to
        avoid scaling errors.
    Returns:
      `model_fn._TPUEstimatorSpec`.
    Raises:
      ValueError: If both `train_op_fn` and `optimizer` are `None` in TRAIN
        mode, or if both are set.
    """
        with ops.name_scope(self._name, 'head'):
            logits = head_lib._check_logits_final_dim(logits,
                                                      self.logits_dimension)  # pylint:disable=protected-access

            # Predict.
            pred_keys = prediction_keys.PredictionKeys
            with ops.name_scope(None, 'predictions', (logits, )):
                probabilities = math_ops.sigmoid(logits,
                                                 name=pred_keys.PROBABILITIES)
                predictions = {
                    pred_keys.LOGITS: logits,
                    pred_keys.PROBABILITIES: probabilities,
                }
            if mode == model_fn.ModeKeys.PREDICT:
                classifier_output = head_lib._classification_output(  # pylint:disable=protected-access
                    scores=probabilities,
                    n_classes=self._n_classes,
                    label_vocabulary=self._label_vocabulary)
                return model_fn._TPUEstimatorSpec(  # pylint:disable=protected-access
                    mode=model_fn.ModeKeys.PREDICT,
                    predictions=predictions,
                    export_outputs={
                        _DEFAULT_SERVING_KEY: classifier_output,
                        head_lib._CLASSIFY_SERVING_KEY: classifier_output,  # pylint:disable=protected-access
                        head_lib._PREDICT_SERVING_KEY: (  # pylint:disable=protected-access
                            export_output.PredictOutput(predictions))
                    })

            (training_loss, unreduced_loss, weights,
             processed_labels) = self.create_loss(features=features,
                                                  mode=mode,
                                                  logits=logits,
                                                  labels=labels)
            if regularization_losses:
                regularization_loss = math_ops.add_n(regularization_losses)
                regularized_training_loss = math_ops.add_n(
                    [training_loss, regularization_loss])
            else:
                regularization_loss = None
                regularized_training_loss = training_loss

            # Eval.
            if mode == model_fn.ModeKeys.EVAL:
                return model_fn._TPUEstimatorSpec(  # pylint:disable=protected-access
                    mode=model_fn.ModeKeys.EVAL,
                    predictions=predictions,
                    loss=regularized_training_loss,
                    eval_metrics=head_lib._create_eval_metrics_tuple(  # pylint:disable=protected-access
                        self._eval_metric_ops, {
                            'labels': processed_labels,
                            'probabilities': probabilities,
                            'weights': weights,
                            'unreduced_loss': unreduced_loss,
                            'regularization_loss': regularization_loss,
                        }))

            # Train.
            if optimizer is not None:
                if train_op_fn is not None:
                    raise ValueError(
                        'train_op_fn and optimizer cannot both be set.')
                train_op = optimizer.minimize(
                    regularized_training_loss,
                    global_step=training_util.get_global_step())
            elif train_op_fn is not None:
                train_op = train_op_fn(regularized_training_loss)
            else:
                raise ValueError(
                    'train_op_fn and optimizer cannot both be None.')
            train_op = head_lib._append_update_ops(train_op)  # pylint:disable=protected-access
            # Only summarize mean_loss for SUM reduction to preserve backwards
            # compatibility. Otherwise skip it to avoid unnecessary computation.
            if self._loss_reduction == losses.Reduction.SUM:
                example_weight_sum = math_ops.reduce_sum(
                    weights * array_ops.ones_like(unreduced_loss))
                mean_loss = training_loss / example_weight_sum
            else:
                mean_loss = None
        with ops.name_scope(''):
            keys = metric_keys.MetricKeys
            summary.scalar(
                head_lib._summary_key(self._name, keys.LOSS),  # pylint:disable=protected-access
                regularized_training_loss)
            if mean_loss is not None:
                summary.scalar(
                    head_lib._summary_key(self._name, keys.LOSS_MEAN),  # pylint:disable=protected-access
                    mean_loss)
            if regularization_loss is not None:
                summary.scalar(
                    head_lib._summary_key(self._name,
                                          keys.LOSS_REGULARIZATION),  # pylint:disable=protected-access
                    regularization_loss)
        return model_fn._TPUEstimatorSpec(  # pylint:disable=protected-access
            mode=model_fn.ModeKeys.TRAIN,
            predictions=predictions,
            loss=regularized_training_loss,
            train_op=train_op)
Beispiel #50
0
def clip_by_norm(t, clip_norm, axes=None, name=None):
  """Clips tensor values to a maximum L2-norm.

  Given a tensor `t`, and a maximum clip value `clip_norm`, this operation
  normalizes `t` so that its L2-norm is less than or equal to `clip_norm`,
  along the dimensions given in `axes`. Specifically, in the default case
  where all dimensions are used for calculation, if the L2-norm of `t` is
  already less than or equal to `clip_norm`, then `t` is not modified. If
  the L2-norm is greater than `clip_norm`, then this operation returns a
  tensor of the same type and shape as `t` with its values set to:

  `t * clip_norm / l2norm(t)`

  In this case, the L2-norm of the output tensor is `clip_norm`.

  As another example, if `t` is a matrix and `axes == [1]`, then each row
  of the output will have L2-norm less than or equal to `clip_norm`. If
  `axes == [0]` instead, each column of the output will be clipped.

  Code example:

  >>> some_nums = tf.constant([[1, 2, 3, 4, 5]], dtype=tf.float32)
  >>> tf.clip_by_norm(some_nums, 2.0).numpy()
  array([[0.26967996, 0.5393599 , 0.80903983, 1.0787199 , 1.3483998 ]],
        dtype=float32)

  This operation is typically used to clip gradients before applying them with
  an optimizer.  Most gradient data is a collection of different shaped tensors
  for different parts of the model.  Thus, this is a common usage:

  ```
  # Get your gradients after training
  loss_value, grads = grad(model, features, labels)

  # Apply some clipping
  grads = [tf.clip_by_norm(g, norm)
               for g in grads]

  # Continue on with training
  optimizer.apply_gradients(grads)
  ```

  Args:
    t: A `Tensor` or `IndexedSlices`.  This must be a floating point type.
    clip_norm: A 0-D (scalar) `Tensor` > 0. A maximum clipping value, also
      floating point
    axes: A 1-D (vector) `Tensor` of type int32 containing the dimensions
      to use for computing the L2-norm. If `None` (the default), uses all
      dimensions.
    name: A name for the operation (optional).

  Returns:
    A clipped `Tensor` or `IndexedSlices`.

  Raises:
    ValueError: If the clip_norm tensor is not a 0-D scalar tensor.
    TypeError: If dtype of the input is not a floating point or
      complex type.
  """
  with ops.name_scope(name, "clip_by_norm", [t, clip_norm]) as name:
    values = ops.convert_to_tensor(
        t.values if isinstance(t, ops.IndexedSlices) else t, name="t")

    # Calculate L2-norm, clip elements by ratio of clip_norm to L2-norm
    l2sum = math_ops.reduce_sum(values * values, axes, keepdims=True)
    pred = l2sum > 0
    # Two-tap tf.where trick to bypass NaN gradients
    l2sum_safe = array_ops.where(pred, l2sum, array_ops.ones_like(l2sum))
    l2norm = array_ops.where(pred, math_ops.sqrt(l2sum_safe), l2sum)
    intermediate = values * clip_norm
    # Assert that the shape is compatible with the initial shape,
    # to prevent unintentional broadcasting.
    values.shape.assert_is_compatible_with(intermediate.shape)
    values_clip = array_ops.identity(
        intermediate / math_ops.maximum(l2norm, clip_norm), name=name)

    if isinstance(t, ops.IndexedSlices):
      return ops.IndexedSlices(values_clip, t.indices, t.dense_shape)

    return values_clip
Beispiel #51
0
    def _mini_batch_training_op(self, inputs, cluster_idx_list,
                                cluster_centers, total_counts):
        """Creates an op for training for mini batch case.

    Args:
      inputs: list of input Tensors.
      cluster_idx_list: A vector (or list of vectors). Each element in the
        vector corresponds to an input row in 'inp' and specifies the cluster id
        corresponding to the input.
      cluster_centers: Tensor Ref of cluster centers.
      total_counts: Tensor Ref of cluster counts.

    Returns:
      An op for doing an update of mini-batch k-means.
    """
        update_ops = []
        for inp, cluster_idx in zip(inputs, cluster_idx_list):
            with ops.colocate_with(inp, ignore_existing=True):
                assert total_counts is not None
                cluster_idx = array_ops.reshape(cluster_idx, [-1])
                # Dedupe the unique ids of cluster_centers being updated so that updates
                # can be locally aggregated.
                unique_ids, unique_idx = array_ops.unique(cluster_idx)
                num_unique_cluster_idx = array_ops.size(unique_ids)
                # Fetch the old values of counts and cluster_centers.
                with ops.colocate_with(total_counts, ignore_existing=True):
                    old_counts = array_ops.gather(total_counts, unique_ids)
                # TODO(agarwal): This colocation seems to run into problems. Fix it.
                with ops.colocate_with(cluster_centers, ignore_existing=True):
                    old_cluster_centers = array_ops.gather(
                        cluster_centers, unique_ids)
                # Locally aggregate the increment to counts.
                count_updates = math_ops.unsorted_segment_sum(
                    array_ops.ones_like(unique_idx, dtype=total_counts.dtype),
                    unique_idx, num_unique_cluster_idx)
                # Locally compute the sum of inputs mapped to each id.
                # For a cluster with old cluster value x, old count n, and with data
                # d_1,...d_k newly assigned to it, we recompute the new value as
                # \\(x += (sum_i(d_i) - k * x) / (n + k)\\).
                # Compute \\(sum_i(d_i)\\), see comment above.
                cluster_center_updates = math_ops.unsorted_segment_sum(
                    inp, unique_idx, num_unique_cluster_idx)
                # Shape to enable broadcasting count_updates and learning_rate to inp.
                # It extends the shape with 1's to match the rank of inp.
                broadcast_shape = array_ops.concat([
                    array_ops.reshape(num_unique_cluster_idx, [1]),
                    array_ops.ones(array_ops.reshape(
                        array_ops.rank(inp) - 1, [1]),
                                   dtype=dtypes.int32)
                ], 0)
                # Subtract k * x, see comment above.
                cluster_center_updates -= math_ops.cast(
                    array_ops.reshape(count_updates, broadcast_shape),
                    inp.dtype) * old_cluster_centers
                learning_rate = math_ops.reciprocal(
                    math_ops.cast(old_counts + count_updates, inp.dtype))
                learning_rate = array_ops.reshape(learning_rate,
                                                  broadcast_shape)
                # scale by 1 / (n + k), see comment above.
                cluster_center_updates *= learning_rate
                # Apply the updates.
            update_counts = state_ops.scatter_add(total_counts, unique_ids,
                                                  count_updates)
            update_cluster_centers = state_ops.scatter_add(
                cluster_centers, unique_ids, cluster_center_updates)
            update_ops.extend([update_counts, update_cluster_centers])
        return control_flow_ops.group(*update_ops)
Beispiel #52
0
 def mean(self):
     return self._mu * array_ops.ones_like(self._sigma)
Beispiel #53
0
def _list_mle_loss(labels,
                   logits,
                   weights=None,
                   lambda_weight=None,
                   reduction=core_losses.Reduction.SUM_BY_NONZERO_WEIGHTS,
                   name=None,
                   seed=None):
  """Computes the ListMLE loss [Xia et al.

  2008] for a list.

  Given the labels of graded relevance l_i and the logits s_i, we calculate
  the ListMLE loss for the given list.

  The `lambda_weight` re-weights examples based on l_i and r_i.
  The recommended weighting scheme is the formulation presented in the
  "Position-Aware ListMLE" paper (Lan et. al) and available using
  create_p_list_mle_lambda_weight() factory function above.

  Args:
    labels: A `Tensor` of the same shape as `logits` representing graded
      relevance.
    logits: A `Tensor` with shape [batch_size, list_size]. Each value is the
      ranking score of the corresponding item.
    weights: A scalar, a `Tensor` with shape [batch_size, 1] for list-wise
      weights, or a `Tensor` with shape [batch_size, list_size] for item-wise
      weights.
    lambda_weight: A `DCGLambdaWeight` instance.
    reduction: One of `tf.losses.Reduction` except `NONE`. Describes how to
      reduce training loss over batch.
    name: A string used as the name for this loss.
    seed: A randomization seed used when shuffling ground truth permutations.

  Returns:
    An op for the ListMLE loss.
  """
  with ops.name_scope(name, 'list_mle_loss', (labels, logits, weights)):
    is_label_valid = utils.is_label_valid(labels)
    # Reset the invalid labels to 0 and reset the invalid logits to a logit with
    # ~= 0 contribution.
    labels = array_ops.where(is_label_valid, labels,
                             array_ops.zeros_like(labels))
    logits = array_ops.where(
        is_label_valid, logits,
        math_ops.log(_EPSILON) * array_ops.ones_like(logits))
    weights = 1.0 if weights is None else ops.convert_to_tensor(weights)
    weights = array_ops.squeeze(weights)

    # Shuffle labels and logits to add randomness to sort.
    shuffled_indices = utils.shuffle_valid_indices(is_label_valid, seed)
    shuffled_labels = array_ops.gather_nd(labels, shuffled_indices)
    shuffled_logits = array_ops.gather_nd(logits, shuffled_indices)

    sorted_labels, sorted_logits = utils.sort_by_scores(
        shuffled_labels, [shuffled_labels, shuffled_logits])

    raw_max = math_ops.reduce_max(sorted_logits, axis=1, keepdims=True)
    sorted_logits = sorted_logits - raw_max
    sums = math_ops.cumsum(math_ops.exp(sorted_logits), axis=1, reverse=True)
    sums = math_ops.log(sums) - sorted_logits

    if lambda_weight is not None and isinstance(lambda_weight,
                                                ListMLELambdaWeight):
      sums *= lambda_weight.individual_weights(sorted_labels)

    negative_log_likelihood = math_ops.reduce_sum(sums, 1)

    return core_losses.compute_weighted_loss(
        negative_log_likelihood, weights=weights, reduction=reduction)
Beispiel #54
0
 def _ones(self):
     return array_ops.ones_like(self.a + self.b)
  def call(self, inputs, training=None):
    if training is None:
      training = K.learning_phase()

    in_eager_mode = context.executing_eagerly()
    if self.virtual_batch_size is not None:
      # Virtual batches (aka ghost batches) can be simulated by reshaping the
      # Tensor and reusing the existing batch norm implementation
      original_shape = [-1] + inputs.shape.as_list()[1:]
      expanded_shape = [self.virtual_batch_size, -1] + original_shape[1:]

      # Will cause errors if virtual_batch_size does not divide the batch size
      inputs = array_ops.reshape(inputs, expanded_shape)

      def undo_virtual_batching(outputs):
        outputs = array_ops.reshape(outputs, original_shape)
        return outputs

    if self.fused:
      outputs = self._fused_batch_norm(inputs, training=training)
      if self.virtual_batch_size is not None:
        # Currently never reaches here since fused_batch_norm does not support
        # virtual batching
        outputs = undo_virtual_batching(outputs)
      return outputs

    # Compute the axes along which to reduce the mean / variance
    input_shape = inputs.get_shape()
    ndims = len(input_shape)
    reduction_axes = [i for i in range(ndims) if i not in self.axis]
    if self.virtual_batch_size is not None:
      del reduction_axes[1]     # Do not reduce along virtual batch dim

    # Broadcasting only necessary for single-axis batch norm where the axis is
    # not the last dimension
    broadcast_shape = [1] * ndims
    broadcast_shape[self.axis[0]] = input_shape.dims[self.axis[0]].value
    def _broadcast(v):
      if (v is not None and
          len(v.get_shape()) != ndims and
          reduction_axes != list(range(ndims - 1))):
        return array_ops.reshape(v, broadcast_shape)
      return v

    scale, offset = _broadcast(self.gamma), _broadcast(self.beta)

    def _compose_transforms(scale, offset, then_scale, then_offset):
      if then_scale is not None:
        scale *= then_scale
        offset *= then_scale
      if then_offset is not None:
        offset += then_offset
      return (scale, offset)

    # Determine a boolean value for `training`: could be True, False, or None.
    training_value = tf_utils.constant_value(training)
    if training_value is not False:
      if self.adjustment:
        adj_scale, adj_bias = self.adjustment(array_ops.shape(inputs))
        # Adjust only during training.
        adj_scale = tf_utils.smart_cond(training,
                                        lambda: adj_scale,
                                        lambda: array_ops.ones_like(adj_scale))
        adj_bias = tf_utils.smart_cond(training,
                                       lambda: adj_bias,
                                       lambda: array_ops.zeros_like(adj_bias))
        scale, offset = _compose_transforms(adj_scale, adj_bias, scale, offset)

      # Some of the computations here are not necessary when training==False
      # but not a constant. However, this makes the code simpler.
      keep_dims = self.virtual_batch_size is not None or len(self.axis) > 1
      mean, variance = self._moments(
          math_ops.cast(inputs, self._param_dtype),
          reduction_axes,
          keep_dims=keep_dims)

      moving_mean = self.moving_mean
      moving_variance = self.moving_variance

      mean = tf_utils.smart_cond(training,
                                 lambda: mean,
                                 lambda: moving_mean)
      variance = tf_utils.smart_cond(training,
                                     lambda: variance,
                                     lambda: moving_variance)

      if self.virtual_batch_size is not None:
        # This isn't strictly correct since in ghost batch norm, you are
        # supposed to sequentially update the moving_mean and moving_variance
        # with each sub-batch. However, since the moving statistics are only
        # used during evaluation, it is more efficient to just update in one
        # step and should not make a significant difference in the result.
        new_mean = math_ops.reduce_mean(mean, axis=1, keepdims=True)
        new_variance = math_ops.reduce_mean(variance, axis=1, keepdims=True)
      else:
        new_mean, new_variance = mean, variance

      if self.renorm:
        r, d, new_mean, new_variance = self._renorm_correction_and_moments(
            new_mean, new_variance, training)
        # When training, the normalized values (say, x) will be transformed as
        # x * gamma + beta without renorm, and (x * r + d) * gamma + beta
        # = x * (r * gamma) + (d * gamma + beta) with renorm.
        r = _broadcast(array_ops.stop_gradient(r, name='renorm_r'))
        d = _broadcast(array_ops.stop_gradient(d, name='renorm_d'))
        scale, offset = _compose_transforms(r, d, scale, offset)

      if distribution_strategy_context.in_cross_replica_context():
        strategy = distribution_strategy_context.get_strategy()

        def _do_update(var, value):
          """Compute the updates for mean and variance."""
          if in_eager_mode and not self.trainable:
            return
          return strategy.extended.update(
              var, self._assign_moving_average, (value, self.momentum),
              group=False)
        # We need to unwrap the moving_mean or moving_variance in the case of
        # training being false to match the output of true_fn and false_fn
        # in the smart cond.
        mean_update = tf_utils.smart_cond(
            training,
            lambda: _do_update(self.moving_mean, new_mean),
            lambda: strategy.unwrap(self.moving_mean))
        variance_update = tf_utils.smart_cond(
            training,
            lambda: _do_update(self.moving_variance, new_variance),
            lambda: strategy.unwrap(self.moving_variance))
      else:
        def _do_update(var, value):
          """Compute the updates for mean and variance."""
          if in_eager_mode and not self.trainable:
            return
          return self._assign_moving_average(var, value, self.momentum)
        mean_update = tf_utils.smart_cond(
            training,
            lambda: _do_update(self.moving_mean, new_mean),
            lambda: self.moving_mean)
        variance_update = tf_utils.smart_cond(
            training,
            lambda: _do_update(self.moving_variance, new_variance),
            lambda: self.moving_variance)
      if not context.executing_eagerly():
        self.add_update(mean_update, inputs=True)
        self.add_update(variance_update, inputs=True)

    else:
      mean, variance = self.moving_mean, self.moving_variance

    mean = math_ops.cast(mean, inputs.dtype)
    variance = math_ops.cast(variance, inputs.dtype)
    if offset is not None:
      offset = math_ops.cast(offset, inputs.dtype)
    if scale is not None:
      scale = math_ops.cast(scale, inputs.dtype)
    # TODO(reedwm): Maybe do math in float32 if given float16 inputs, if doing
    # math in float16 hurts validation accuracy of popular models like resnet.
    outputs = nn.batch_normalization(inputs,
                                     _broadcast(mean),
                                     _broadcast(variance),
                                     offset,
                                     scale,
                                     self.epsilon)
    # If some components of the shape got lost due to adjustments, fix that.
    outputs.set_shape(input_shape)

    if self.virtual_batch_size is not None:
      outputs = undo_virtual_batching(outputs)
    return outputs
Beispiel #56
0
    def _renorm_correction_and_moments(self, mean, variance, training):
        """Returns the correction and update values for renorm."""
        stddev = math_ops.sqrt(variance + self.epsilon)
        # Compute the average mean and standard deviation, as if they were
        # initialized with this batch's moments.
        mixed_renorm_mean = (self.renorm_mean +
                             (1. - self.renorm_mean_weight) * mean)
        mixed_renorm_stddev = (self.renorm_stddev +
                               (1. - self.renorm_stddev_weight) * stddev)
        # Compute the corrections for batch renorm.
        r = stddev / mixed_renorm_stddev
        d = (mean - mixed_renorm_mean) / mixed_renorm_stddev
        # Ensure the corrections use pre-update moving averages.
        with ops.control_dependencies([r, d]):
            mean = array_ops.identity(mean)
            stddev = array_ops.identity(stddev)
        rmin, rmax, dmax = [
            self.renorm_clipping.get(key) for key in ['rmin', 'rmax', 'dmax']
        ]
        if rmin is not None:
            r = math_ops.maximum(r, rmin)
        if rmax is not None:
            r = math_ops.minimum(r, rmax)
        if dmax is not None:
            d = math_ops.maximum(d, -dmax)
            d = math_ops.minimum(d, dmax)
        # When not training, use r=1, d=0, and decay=1 meaning no updates.
        r = _smart_select(training, lambda: r, lambda: array_ops.ones_like(r))
        d = _smart_select(training, lambda: d, lambda: array_ops.zeros_like(d))
        decay = _smart_select(training, lambda: self.renorm_momentum,
                              lambda: 1.)

        def _update_renorm_variable(var, weight, value):
            """Updates a moving average and weight, returns the unbiased value."""
            # Update the variables without zero debiasing. The debiasing will be
            # accomplished by dividing the exponential moving average by the weight.
            # For example, after a single update, the moving average would be
            # (1-decay) * value. and the weight will be 1-decay, with their ratio
            # giving value.
            # Make sure the weight is not updated until before r and d computation.
            value = array_ops.identity(value)
            with ops.control_dependencies([value]):
                weight_value = array_ops.constant(1., dtype=weight.dtype)
            new_var = moving_averages.assign_moving_average(var,
                                                            value,
                                                            decay,
                                                            zero_debias=False)
            new_weight = moving_averages.assign_moving_average(
                weight, weight_value, decay, zero_debias=False)
            return new_var / new_weight

        with ops.colocate_with(self.moving_mean):
            new_mean = _update_renorm_variable(self.renorm_mean,
                                               self.renorm_mean_weight, mean)
        with ops.colocate_with(self.moving_variance):
            new_stddev = _update_renorm_variable(self.renorm_stddev,
                                                 self.renorm_stddev_weight,
                                                 stddev)
            # Make sqrt(moving_variance + epsilon) = new_stddev.
            new_variance = math_ops.square(new_stddev) - self.epsilon

        return (r, d, new_mean, new_variance)
  def _renorm_correction_and_moments(self, mean, variance, training):
    """Returns the correction and update values for renorm."""
    stddev = math_ops.sqrt(variance + self.epsilon)
    # Compute the average mean and standard deviation, as if they were
    # initialized with this batch's moments.
    mixed_renorm_mean = (self.renorm_mean +
                         (1. - self.renorm_mean_weight) * mean)
    mixed_renorm_stddev = (self.renorm_stddev +
                           (1. - self.renorm_stddev_weight) * stddev)
    # Compute the corrections for batch renorm.
    r = stddev / mixed_renorm_stddev
    d = (mean - mixed_renorm_mean) / mixed_renorm_stddev
    # Ensure the corrections use pre-update moving averages.
    with ops.control_dependencies([r, d]):
      mean = array_ops.identity(mean)
      stddev = array_ops.identity(stddev)
    rmin, rmax, dmax = [self.renorm_clipping.get(key)
                        for key in ['rmin', 'rmax', 'dmax']]
    if rmin is not None:
      r = math_ops.maximum(r, rmin)
    if rmax is not None:
      r = math_ops.minimum(r, rmax)
    if dmax is not None:
      d = math_ops.maximum(d, -dmax)
      d = math_ops.minimum(d, dmax)
    # When not training, use r=1, d=0.
    r = tf_utils.smart_cond(training, lambda: r, lambda: array_ops.ones_like(r))
    d = tf_utils.smart_cond(training,
                            lambda: d,
                            lambda: array_ops.zeros_like(d))

    def _update_renorm_variable(var, weight, value):
      """Updates a moving average and weight, returns the unbiased value."""
      value = array_ops.identity(value)
      def _do_update():
        """Updates the var and weight, returns their updated ratio."""
        # Update the variables without zero debiasing. The debiasing will be
        # accomplished by dividing the exponential moving average by the weight.
        # For example, after a single update, the moving average would be
        # (1-decay) * value. and the weight will be 1-decay, with their ratio
        # giving the value.
        # Make sure the weight is not updated until before r and d computation.
        with ops.control_dependencies([value]):
          weight_value = array_ops.constant(1., dtype=weight.dtype)
        new_var = self._assign_moving_average(var, value, self.renorm_momentum)
        new_weight = self._assign_moving_average(weight, weight_value,
                                                 self.renorm_momentum)
        # TODO(yuefengz): the updates to var and weighted can not be batched
        # together if we fetch their updated values here. Consider calculating
        # new values and delaying the updates.
        return new_var / new_weight

      def _fake_update():
        return array_ops.identity(var)
      return tf_utils.smart_cond(training, _do_update, _fake_update)

    # TODO(yuefengz): colocate the operations
    new_mean = _update_renorm_variable(self.renorm_mean,
                                       self.renorm_mean_weight, mean)
    new_stddev = _update_renorm_variable(self.renorm_stddev,
                                         self.renorm_stddev_weight, stddev)
    # Make sqrt(moving_variance + epsilon) = new_stddev.
    new_variance = math_ops.square(new_stddev) - self.epsilon

    return (r, d, new_mean, new_variance)
Beispiel #58
0
 def _variance(self):
     p = self.probs * array_ops.ones_like(
         self.total_count)[..., array_ops.newaxis]
     return self._mean_val - self._mean_val * p
Beispiel #59
0
 def call(self, inputs):
     return keras.backend.in_train_phase(
         lambda: array_ops.ones_like(inputs),
         lambda: array_ops.zeros_like(inputs))
Beispiel #60
0
    def _create_hook_ops(self, input_row_indices, input_col_indices,
                         train_ops):
        """Creates ops to update is_row_sweep_var, global_step and completed_sweeps.

    Creates two boolean tensors `processed_rows` and `processed_cols`, which
    keep track of which rows/cols have been processed during the current sweep.
    Returns ops that should be run after each row / col update.
      - When `self._is_row_sweep_var` is True, it sets
        processed_rows[input_row_indices] to True.
      - When `self._is_row_sweep_var` is False, it sets
        processed_cols[input_col_indices] to True.

    Args:
      input_row_indices: A Tensor. The indices of the input rows that are
        processed during the current sweep.
      input_col_indices: A Tensor. The indices of the input columns that
        are processed during the current sweep.
      train_ops: A list of ops. The ops created by this function have control
        dependencies on `train_ops`.

    Returns:
      A tuple consisting of:
        update_op: An op to be run jointly with training. It updates the state
          and increments counters (global step and completed sweeps).
        is_sweep_done_var: A Boolean tf.Variable, specifies whether the sweep is
          done, i.e. all rows (during a row sweep) or all columns (during a
          column sweep) have been processed.
        switch_op: An op to be run in `self.before_run` when the sweep is done.
    """
        processed_rows_init = array_ops.fill(dims=[self._num_rows],
                                             value=False)
        with ops.colocate_with(processed_rows_init):
            processed_rows = variable_scope.variable(
                processed_rows_init,
                collections=[ops.GraphKeys.GLOBAL_VARIABLES],
                trainable=False,
                name="sweep_hook_processed_rows")
        processed_cols_init = array_ops.fill(dims=[self._num_cols],
                                             value=False)
        with ops.colocate_with(processed_cols_init):
            processed_cols = variable_scope.variable(
                processed_cols_init,
                collections=[ops.GraphKeys.GLOBAL_VARIABLES],
                trainable=False,
                name="sweep_hook_processed_cols")
        switch_ops = control_flow_ops.group(
            state_ops.assign(self._is_row_sweep_var,
                             math_ops.logical_not(self._is_row_sweep_var)),
            state_ops.assign(processed_rows, processed_rows_init),
            state_ops.assign(processed_cols, processed_cols_init))
        is_sweep_done_var = variable_scope.variable(
            False,
            collections=[ops.GraphKeys.GLOBAL_VARIABLES],
            trainable=False,
            name="is_sweep_done")

        # After running the `train_ops`, updates `processed_rows` or
        # `processed_cols` tensors, depending on whether this is a row or col sweep.
        with ops.control_dependencies(train_ops):
            with ops.colocate_with(processed_rows):
                update_processed_rows = state_ops.scatter_update(
                    processed_rows, input_row_indices,
                    math_ops.logical_and(
                        self._is_row_sweep_var,
                        array_ops.ones_like(input_row_indices,
                                            dtype=dtypes.bool)))
            with ops.colocate_with(processed_cols):
                update_processed_cols = state_ops.scatter_update(
                    processed_cols, input_col_indices,
                    math_ops.logical_and(
                        math_ops.logical_not(self._is_row_sweep_var),
                        array_ops.ones_like(input_col_indices,
                                            dtype=dtypes.bool)))
            update_processed_op = control_flow_ops.group(
                update_processed_rows, update_processed_cols)

            with ops.control_dependencies([update_processed_op]):
                is_sweep_done = math_ops.logical_or(
                    math_ops.reduce_all(processed_rows),
                    math_ops.reduce_all(processed_cols))
                # Increments global step.
                global_step = framework_variables.get_global_step()
                if global_step is not None:
                    global_step_incr_op = state_ops.assign_add(
                        global_step, 1, name="global_step_incr").op
                else:
                    global_step_incr_op = control_flow_ops.no_op()
                # Increments completed sweeps.
                completed_sweeps_incr_op = state_ops.assign_add(
                    self._completed_sweeps_var,
                    math_ops.cast(is_sweep_done, dtypes.int32),
                    use_locking=True).op
                update_ops = control_flow_ops.group(
                    global_step_incr_op, completed_sweeps_incr_op,
                    state_ops.assign(is_sweep_done_var, is_sweep_done))

        return update_ops, is_sweep_done_var, switch_ops