  # NOTE: this snippet is a method of an embedding-lookup test case; it assumes
  # the surrounding test module's imports: numpy as np, six.moves.xrange, and
  # the TensorFlow internal modules array_ops, embedding_ops, linalg_ops, and
  # math_ops from tensorflow.python.ops.
  def testTransform(self):
   # This tests all combinations of:
   #   - ids rank 0, 1, >1
   #   - params sharded/unsharded
   # It always applies max_norm.
   np.random.seed(8)
   l2_norm = 2.
   with self.cached_session():
     # Param values are in [l2_norm, l2_norm+1) so it will always clip.
     params = np.random.rand(6, 3) + l2_norm
     params_norm = l2_norm * params / np.sqrt(
         np.sum(params * params, axis=1, keepdims=True))
     # Compute the norm of each embedding. This will change the embedding
     # rank to 0.
     params_norm = np.linalg.norm(params_norm, axis=1)
     transform = lambda x: linalg_ops.norm(x, axis=1)
     for ids_shape in (), (3), (4, 3), (2, 3, 4):
       # Test ids rank 0, 1, 2, 3.
       ids = np.random.randint(
           params.shape[0], size=np.prod(ids_shape,
                                         dtype=np.int64)).reshape(ids_shape)
       # Compare nonsharded to gather.
       simple = embedding_ops._embedding_lookup_and_transform(
           params, ids, max_norm=l2_norm, transform_fn=transform).eval()
       self.assertAllClose(simple, array_ops.gather(params_norm, ids).eval())
       # Run a few different sharded versions.
       for procs in 1, 2, 3:
         stride = procs * math_ops.range(params.shape[0] // procs)
         split_params = [
             array_ops.gather(params, stride + p) for p in xrange(procs)
         ]
         sharded = embedding_ops._embedding_lookup_and_transform(
             split_params, ids, max_norm=l2_norm,
             transform_fn=transform).eval()
         # assertAllClose is used here as different implementations of sqrt may
         # be used to compute each of the values being compared.  For example,
         # on AVX512 builds the embedding operation makes use of Eigen's fast
         # vectorized square root algorithm for doubles.  These different
         # implementations of sqrt are not guaranteed to produce exactly the
         # same results. Therefore, an exact comparison cannot be made.
         self.assertAllClose(simple, sharded)
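The expected values in testTransform above reduce to a constant: every row of params has L2 norm greater than l2_norm, so clipping each row to max_norm=l2_norm and then taking its norm yields exactly l2_norm. A minimal NumPy sketch of that reasoning (standalone illustration, not part of the TensorFlow test):

import numpy as np

np.random.seed(8)
l2_norm = 2.
# Row values lie in [l2_norm, l2_norm + 1), so every row norm exceeds l2_norm.
params = np.random.rand(6, 3) + l2_norm
# Clip each row to norm l2_norm, then measure the resulting norms.
clipped = l2_norm * params / np.linalg.norm(params, axis=1, keepdims=True)
print(np.linalg.norm(clipped, axis=1))  # -> approximately [2. 2. 2. 2. 2. 2.]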
# Imports used by this helper (TensorFlow internal modules).
from tensorflow.python.framework import dtypes
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import embedding_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import nn


def _rank_resample(weights, biases, inputs, sampled_values, num_resampled,
                   resampling_temperature, partition_strategy):
  """A helper function for rank_sampled_softmax_loss.

  This computes, for each i in `sampled_values`,

      log(sum_j exp((w_i * x_j + b_i) / resampling_temperature))

  where w_i, b_i are the weight and bias of the i-th class, respectively,
  and j ranges over the rows of `inputs`. For efficiency, we rearrange the
  computation to

      log(sum_j exp(w_i * (x_j / resampling_temperature))) +
          b_i / resampling_temperature.

  This translates to the following batched computation using tensorflow ops:

      reduce_logsumexp(matmul(embeddings,
                       transpose(inputs / resampling_temperature))) +
          biases / resampling_temperature

  The computation of the first term is colocated with the embeddings using
  `transform_fn` in `embedding_ops._embedding_lookup_and_transform`. The second
  term, not the bottleneck, is computed at the worker.

  Args:
    weights: From `rank_sampled_softmax_loss`.
    biases: From `rank_sampled_softmax_loss`.
    inputs: From `rank_sampled_softmax_loss`.
    sampled_values: A tuple of (`sampled_candidates`, `true_expected_count`,
        `sampled_expected_count`) returned by a `*_candidate_sampler` function.
    num_resampled: An `int`. This many values are selected from
        `sampled_values` using the adaptive resampling algorithm. The caller
        must ensure that `num_resampled` is less than the size of
        `sampled_values`.
    resampling_temperature: A scalar `Tensor` with the temperature parameter
        for the adaptive resampling algorithm.
    partition_strategy: From `rank_sampled_softmax_loss`.

  Returns:
    A tuple of (`resampled_candidates`, `true_expected_count`,
        `resampled_expected_count`), similar to `sampled_values` but sampled
        down to `num_resampled` values.
  """
  # This code supports passing a Tensor for num_resampled, but since it is only
  # called with an int, that's what we specify in the arg list. If this
  # function is ever externalized, we should change the doc to support Tensor.

  sampled, true_expected_count, sampled_expected_count = sampled_values

  sampled = math_ops.cast(array_ops.stop_gradient(sampled), dtypes.int64)
  true_expected_count = array_ops.stop_gradient(true_expected_count)
  sampled_expected_count = array_ops.stop_gradient(sampled_expected_count)

  reweighted_inputs = inputs / resampling_temperature

  def logsumexp_logit(embeddings):
    return math_ops.reduce_logsumexp(
        math_ops.matmul(embeddings, reweighted_inputs, transpose_b=True),
        axis=1,
        keepdims=False)

  # Calling this protected form of embedding_lookup allows co-locating
  # the logsumexp computation with the partitioned weights, which yields
  # a large speedup in practice.
  sampled_logits = embedding_ops._embedding_lookup_and_transform(  # pylint: disable=protected-access
      weights, sampled, partition_strategy, transform_fn=logsumexp_logit)
  sampled_b = array_ops.reshape(
      embedding_ops.embedding_lookup(biases, sampled, partition_strategy), [-1])
  sampled_logits += sampled_b / resampling_temperature

  _, resampled_indices = nn.top_k(sampled_logits, k=num_resampled, sorted=False)
  resampled = array_ops.gather(sampled, indices=resampled_indices)
  resampled_expected_count = array_ops.gather(
      sampled_expected_count, indices=resampled_indices)

  return resampled, true_expected_count, resampled_expected_count
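The rearrangement in the docstring, log(sum_j exp((w_i * x_j + b_i) / T)) = log(sum_j exp(w_i * (x_j / T))) + b_i / T, holds because exp(b_i / T) is constant in j and factors out of the sum. A standalone NumPy sketch (illustration only, not part of the library code) checking the identity numerically:

import numpy as np

rng = np.random.RandomState(0)
temperature = 0.5
weights = rng.randn(4, 3)   # w_i: one row per sampled class
biases = rng.randn(4)       # b_i
inputs = rng.randn(5, 3)    # x_j: one row per input

def logsumexp(a, axis):
  # Plain log-sum-exp; adequate here because the values are small.
  return np.log(np.sum(np.exp(a), axis=axis))

direct = logsumexp((inputs @ weights.T + biases) / temperature, axis=0)
rearranged = (logsumexp((inputs / temperature) @ weights.T, axis=0)
              + biases / temperature)
print(np.allclose(direct, rearranged))  # True

This mirrors logsumexp_logit above: the reduce_logsumexp term is computed per shard so that the reduction is colocated with the partitioned weights, and only the cheap bias term is added at the worker.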