def test_step(self):
    dummy_cell_state = array_ops.zeros([self.batch_size, self.beam_width])
    beam_state = beam_search_decoder.BeamSearchDecoderState(
        cell_state=dummy_cell_state,
        log_probs=nn_ops.log_softmax(
            array_ops.ones([self.batch_size, self.beam_width])),
        lengths=constant_op.constant(
            2, shape=[self.batch_size, self.beam_width], dtype=dtypes.int64),
        finished=array_ops.zeros(
            [self.batch_size, self.beam_width], dtype=dtypes.bool),
        accumulated_attention_probs=())

    logits_ = np.full([self.batch_size, self.beam_width, self.vocab_size],
                      0.0001)
    logits_[0, 0, 2] = 1.9
    logits_[0, 0, 3] = 2.1
    logits_[0, 1, 3] = 3.1
    logits_[0, 1, 4] = 0.9
    logits_[1, 0, 1] = 0.5
    logits_[1, 1, 2] = 2.7
    logits_[1, 2, 2] = 10.0
    logits_[1, 2, 3] = 0.2
    logits = ops.convert_to_tensor(logits_, dtype=dtypes.float32)
    log_probs = nn_ops.log_softmax(logits)

    outputs, next_beam_state = beam_search_decoder._beam_search_step(
        time=2,
        logits=logits,
        next_cell_state=dummy_cell_state,
        beam_state=beam_state,
        batch_size=ops.convert_to_tensor(self.batch_size),
        beam_width=self.beam_width,
        end_token=self.end_token,
        length_penalty_weight=self.length_penalty_weight,
        coverage_penalty_weight=self.coverage_penalty_weight)

    with self.cached_session() as sess:
      outputs_, next_state_, state_, log_probs_ = sess.run(
          [outputs, next_beam_state, beam_state, log_probs])

    self.assertAllEqual(outputs_.predicted_ids, [[3, 3, 2], [2, 2, 1]])
    self.assertAllEqual(outputs_.parent_ids, [[1, 0, 0], [2, 1, 0]])
    self.assertAllEqual(next_state_.lengths, [[3, 3, 3], [3, 3, 3]])
    self.assertAllEqual(next_state_.finished,
                        [[False, False, False], [False, False, False]])

    expected_log_probs = []
    expected_log_probs.append(state_.log_probs[0][[1, 0, 0]])
    expected_log_probs.append(state_.log_probs[1][[2, 1, 0]])  # 0 --> 1
    expected_log_probs[0][0] += log_probs_[0, 1, 3]
    expected_log_probs[0][1] += log_probs_[0, 0, 3]
    expected_log_probs[0][2] += log_probs_[0, 0, 2]
    expected_log_probs[1][0] += log_probs_[1, 2, 2]
    expected_log_probs[1][1] += log_probs_[1, 1, 2]
    expected_log_probs[1][2] += log_probs_[1, 0, 1]
    self.assertAllEqual(next_state_.log_probs, expected_log_probs)
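The fixture values referenced above (self.batch_size and friends) are not part of this snippet; a minimal setUp sketch consistent with the logits shape and the assertions, where the class name and the two penalty weights are assumptions (the penalties are presumed disabled):

def setUp(self):
    super(BeamSearchStepTest, self).setUp()  # class name is assumed
    self.batch_size = 2                  # logits_ has two batch entries
    self.beam_width = 3                  # three beams per batch entry
    self.vocab_size = 5                  # token ids 0..4 appear in logits_
    self.end_token = 0                   # assumed; id 0 marks finished beams
    self.length_penalty_weight = 0.0     # assumed disabled
    self.coverage_penalty_weight = 0.0   # assumed disabled
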
Example #2
 def testLogSoftmaxAxes(self):
   arr = np.linspace(0., 1, 12).reshape(3, 4)
   x_neg_axis = nn_ops.log_softmax(arr, axis=-2)
   y_pos_axis = nn_ops.log_softmax(arr, axis=0)
   z_gt_axis = nn_ops.log_softmax(arr, axis=0)
   x_neg_axis_tf = self.evaluate(x_neg_axis)
   y_pos_axis_tf = self.evaluate(y_pos_axis)
   z_gt_axis_tf = self.evaluate(z_gt_axis)
   eps = 1e-3
   self.assertAllClose(x_neg_axis_tf, y_pos_axis_tf, eps)
   self.assertAllClose(y_pos_axis_tf, z_gt_axis_tf, eps)
  def test_step_with_eos(self):
    dummy_cell_state = array_ops.zeros([self.batch_size, self.beam_width])
    beam_state = beam_search_decoder.BeamSearchDecoderState(
        cell_state=dummy_cell_state,
        log_probs=nn_ops.log_softmax(
            array_ops.ones([self.batch_size, self.beam_width])),
        lengths=ops.convert_to_tensor(
            [[2, 1, 2], [2, 2, 1]], dtype=dtypes.int32),
        finished=ops.convert_to_tensor(
            [[False, True, False], [False, False, True]], dtype=dtypes.bool))

    logits_ = np.full([self.batch_size, self.beam_width, self.vocab_size],
                      0.0001)
    logits_[0, 0, 2] = 1.9
    logits_[0, 0, 3] = 2.1
    logits_[0, 1, 3] = 3.1
    logits_[0, 1, 4] = 0.9
    logits_[1, 0, 1] = 0.5
    logits_[1, 1, 2] = 5.7  # why does this not work when it's 2.7?
    logits_[1, 2, 2] = 1.0
    logits_[1, 2, 3] = 0.2
    logits = ops.convert_to_tensor(logits_, dtype=dtypes.float32)
    log_probs = nn_ops.log_softmax(logits)

    outputs, next_beam_state = beam_search_decoder._beam_search_step(
        time=2,
        logits=logits,
        beam_state=beam_state,
        batch_size=ops.convert_to_tensor(self.batch_size),
        beam_width=self.beam_width,
        end_token=self.end_token,
        length_penalty_weight=self.length_penalty_weight)

    with self.test_session() as sess:
      outputs_, next_state_, state_, log_probs_ = sess.run(
          [outputs, next_beam_state, beam_state, log_probs])

    np.testing.assert_array_equal(outputs_.parent_ids, [[1, 0, 0], [1, 2, 0]])
    np.testing.assert_array_equal(outputs_.predicted_ids, [[0, 3, 2], [2, 0,
                                                                       1]])
    np.testing.assert_array_equal(next_state_.lengths, [[1, 3, 3], [3, 1, 3]])
    np.testing.assert_array_equal(next_state_.finished, [[True, False, False],
                                                         [False, True, False]])

    expected_log_probs = []
    expected_log_probs.append(state_.log_probs[0][[1, 0, 0]])
    expected_log_probs.append(state_.log_probs[1][[1, 2, 0]])
    expected_log_probs[0][1] += log_probs_[0, 0, 3]
    expected_log_probs[0][2] += log_probs_[0, 0, 2]
    expected_log_probs[1][0] += log_probs_[1, 1, 2]
    expected_log_probs[1][2] += log_probs_[1, 0, 1]
    np.testing.assert_array_equal(next_state_.log_probs, expected_log_probs)
Example #4
  def _log_prob(self, x):
    x = self._assert_valid_sample(x)
    # broadcast logits or x if need be.
    logits = self.logits
    if (not x.get_shape().is_fully_defined() or
        not logits.get_shape().is_fully_defined() or
        x.get_shape() != logits.get_shape()):
      logits = array_ops.ones_like(x, dtype=logits.dtype) * logits
      x = array_ops.ones_like(logits, dtype=x.dtype) * x

    logits_shape = array_ops.shape(logits)
    if logits.get_shape().ndims == 2:
      logits_2d = logits
      x_2d = x
    else:
      logits_2d = array_ops.reshape(logits, [-1, self.event_size])
      x_2d = array_ops.reshape(x, [-1, self.event_size])
    # compute the normalization constant
    log_norm_const = (math_ops.lgamma(self.event_size)
                      + (self.event_size - 1)
                      * math_ops.log(self.temperature))
    # compute the unnormalized density
    log_softmax = nn_ops.log_softmax(logits_2d - x_2d * self.temperature)
    log_unnorm_prob = math_ops.reduce_sum(log_softmax, [-1], keep_dims=False)
    # combine unnormalized density with normalization constant
    log_prob = log_norm_const + log_unnorm_prob
    ret = array_ops.reshape(log_prob, logits_shape)
    return ret
 def _log_prob(self, x):
   x = self._assert_valid_sample(x)
   # broadcast logits or x if need be.
   logits = self.logits
   if (not x.get_shape().is_fully_defined() or
       not logits.get_shape().is_fully_defined() or
       x.get_shape() != logits.get_shape()):
     logits = array_ops.ones_like(x, dtype=logits.dtype) * logits
     x = array_ops.ones_like(logits, dtype=x.dtype) * x
   logits_shape = array_ops.shape(math_ops.reduce_sum(logits, axis=[-1]))
   logits_2d = array_ops.reshape(logits, [-1, self.event_size])
   x_2d = array_ops.reshape(x, [-1, self.event_size])
   # compute the normalization constant
   k = math_ops.cast(self.event_size, x.dtype)
   log_norm_const = (math_ops.lgamma(k)
                     + (k - 1.)
                     * math_ops.log(self.temperature))
   # compute the unnormalized density
   log_softmax = nn_ops.log_softmax(logits_2d - x_2d * self._temperature_2d)
   log_unnorm_prob = math_ops.reduce_sum(log_softmax, [-1], keepdims=False)
   # combine unnormalized density with normalization constant
   log_prob = log_norm_const + log_unnorm_prob
   # Reshapes log_prob to be consistent with shape of user-supplied logits
   ret = array_ops.reshape(log_prob, logits_shape)
   return ret
def _kl_divergence(p, p_logits, q):
  """Computes the Kullback-Liebler divergence between p and q.

  This function uses p's logits in some places to improve numerical stability.

  Specifically:

  KL(p || q) = sum[ p * log(p / q) ]
    = sum[ p * ( log(p)                - log(q) ) ]
    = sum[ p * ( log_softmax(p_logits) - log(q) ) ]

  Args:
    p: A 2-D floating-point Tensor p_ij, where `i` corresponds to the minibatch
      example and `j` corresponds to the probability of being in class `j`.
    p_logits: A 2-D floating-point Tensor corresponding to logits for `p`.
    q: A 1-D floating-point Tensor, where q_j corresponds to the probability
      of class `j`.

  Returns:
    KL divergence between two distributions. Output dimension is 1D, one entry
    per distribution in `p`.

  Raises:
    ValueError: If any of the inputs aren't floating-point.
    ValueError: If p or p_logits aren't 2D.
    ValueError: If q isn't 1D.
  """
  for tensor in [p, p_logits, q]:
    if not tensor.dtype.is_floating:
      raise ValueError('Input %s must be floating type.' % tensor.name)
  p.shape.assert_has_rank(2)
  p_logits.shape.assert_has_rank(2)
  q.shape.assert_has_rank(1)
  return math_ops.reduce_sum(
      p * (nn_ops.log_softmax(p_logits) - math_ops.log(q)), axis=1)
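As a quick sanity check (not part of the library code above), the log_softmax formulation of the KL divergence can be compared against the naive sum[p * log(p / q)] with NumPy; the array values below are arbitrary:

import numpy as np

def np_log_softmax(z, axis=-1):
  # Stable log-softmax: shift by the max, then subtract the log-sum-exp.
  z = z - z.max(axis=axis, keepdims=True)
  return z - np.log(np.exp(z).sum(axis=axis, keepdims=True))

p_logits = np.array([[1.0, 2.0, 0.5], [0.0, -1.0, 3.0]])
p = np.exp(np_log_softmax(p_logits))   # rows of p sum to 1
q = np.array([0.2, 0.3, 0.5])

kl_stable = (p * (np_log_softmax(p_logits) - np.log(q))).sum(axis=1)
kl_naive = (p * np.log(p / q)).sum(axis=1)
assert np.allclose(kl_stable, kl_naive)
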
 def _log_cdf(self, x):
   x = self._pad_sample_dims(x)
   log_cdf_x = self.components_distribution.log_cdf(x)      # [S, B, k]
   log_mix_prob = nn_ops.log_softmax(
       self.mixture_distribution.logits, axis=-1)           # [B, k]
   return math_ops.reduce_logsumexp(
       log_cdf_x + log_mix_prob, axis=-1)                   # [S, B]
Example #8
def _SoftmaxCrossEntropyWithLogitsGrad(op, grad_loss, grad_grad):
  """Gradient function for SoftmaxCrossEntropyWithLogits."""
  # grad_loss is the backprop for cost, and we multiply it with the gradients
  # (which is output[1])
  # grad_grad is the backprop for softmax gradient.
  #
  # Second derivative is just softmax derivative w.r.t. logits.
  softmax_grad = op.outputs[1]
  grad = _BroadcastMul(grad_loss, softmax_grad)

  def IsZero(g):
    # Some introspection to check if the gradient is feeding zeros
    if context.executing_eagerly():
      # TODO(apassos) add an efficient way to detect eager zeros here.
      return False
    if g.op.type in ("ZerosLike", "Zeros"):
      return True
    const_fill_value = tensor_util.constant_value(g)
    return const_fill_value is not None and (const_fill_value == 0).all()

  logits = op.inputs[0]
  if grad_grad is not None and not IsZero(grad_grad):
    softmax = nn_ops.softmax(logits)

    grad += ((grad_grad - array_ops.squeeze(
        math_ops.matmul(
            array_ops.expand_dims(grad_grad, 1),
            array_ops.expand_dims(softmax, 2)),
        axis=1)) * softmax)

  return grad, _BroadcastMul(grad_loss, -nn_ops.log_softmax(logits))
  def _sample_n(self, n, seed=None):
    sample_shape = array_ops.concat(([n], array_ops.shape(self.logits)), 0)
    logits = self.logits * array_ops.ones(sample_shape)
    logits_2d = array_ops.reshape(logits, [-1, self.event_size])
    np_dtype = self.dtype.as_numpy_dtype

    # Uniform variates must be sampled from the interval (0,1] rather than
    # [0,1], as they are passed through log() to compute Gumbel variates.
    # We need to use np.finfo(np_dtype).tiny because it is the smallest,
    # positive, "normal" number.  A "normal" number is such that the mantissa
    # has an implicit leading 1.  Normal, positive numbers x, y have the
    # reasonable property that: x + y >= max(x, y).
    # minval=np.nextafter(np.float32(0), 1) can cause
    # tf.random_uniform(dtype=tf.float32) to sample 0.

    uniform = random_ops.random_uniform(shape=array_ops.shape(logits_2d),
                                        minval=np.finfo(np_dtype).tiny,
                                        maxval=1,
                                        dtype=self.dtype,
                                        seed=seed)
    gumbel = -math_ops.log(-math_ops.log(uniform))
    noisy_logits = math_ops.div(gumbel + logits_2d, self._temperature_2d)
    samples = nn_ops.log_softmax(noisy_logits)
    ret = array_ops.reshape(samples, sample_shape)
    return ret
Example #10
  def testEntropyGradient(self):
    with self.cached_session() as sess:
      logits = constant_op.constant([[1., 2., 3.], [2., 5., 1.]])

      probabilities = nn_ops.softmax(logits)
      log_probabilities = nn_ops.log_softmax(logits)
      true_entropy = - math_ops.reduce_sum(
          probabilities * log_probabilities, axis=-1)

      categorical_distribution = categorical.Categorical(probs=probabilities)
      categorical_entropy = categorical_distribution.entropy()

      # works
      true_entropy_g = gradients_impl.gradients(true_entropy, [logits])
      categorical_entropy_g = gradients_impl.gradients(
          categorical_entropy, [logits])

      res = sess.run({"true_entropy": true_entropy,
                      "categorical_entropy": categorical_entropy,
                      "true_entropy_g": true_entropy_g,
                      "categorical_entropy_g": categorical_entropy_g})
      self.assertAllClose(res["true_entropy"],
                          res["categorical_entropy"])
      self.assertAllClose(res["true_entropy_g"],
                          res["categorical_entropy_g"])
 def _log_prob(self, x):
   with ops.control_dependencies(self._runtime_assertions):
     x = self._pad_sample_dims(x)
     log_prob_x = self.components_distribution.log_prob(x)  # [S, B, k]
     log_mix_prob = nn_ops.log_softmax(
         self.mixture_distribution.logits, axis=-1)         # [B, k]
     return math_ops.reduce_logsumexp(
         log_prob_x + log_mix_prob, axis=-1)                # [S, B]
Example #12
 def testGradient(self, x_shape):
   x_np = np.random.randn(*x_shape).astype(np.float64)
   with self.cached_session():
     x_tf = constant_op.constant(x_np)
     y_tf = nn_ops.log_softmax(x_tf)
     err = gradient_checker.compute_gradient_error(x_tf, x_shape, y_tf,
                                                   x_shape)
   eps = 1e-7
   self.assertLess(err, eps)
Example #13
 def testLogSoftmax(self):
   x_shape = [5, 10]
   x_np = np.random.randn(*x_shape).astype(np.float32)
   y_np = self._log_softmax(x_np)
   x_tf = constant_op.constant(x_np)
   y_tf = nn_ops.log_softmax(x_tf)
   y_tf_np = self.evaluate(y_tf)
   eps = 1e-3
   self.assertAllClose(y_tf_np, y_np, eps)
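The NumPy reference helper self._log_softmax used above is not included in the snippet; a minimal sketch of what it presumably computes (shift by the row max for stability, then subtract the log of the summed exponentials):

 def _log_softmax(self, x):
   # Assumed reference helper for 2-D inputs.
   assert len(x.shape) == 2
   m = x.max(axis=1, keepdims=True)
   u = np.exp(x - m)
   return (x - m) - np.log(np.sum(u, axis=1, keepdims=True))
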
def _kl_categorical_categorical(a, b, name=None):
  """Calculate the batched KL divergence KL(a || b) with a, b OneHotCategorical.

  Args:
    a: instance of a OneHotCategorical distribution object.
    b: instance of a OneHotCategorical distribution object.
    name: (optional) Name to use for created operations.
      default is "kl_categorical_categorical".

  Returns:
    Batchwise KL(a || b)
  """
  with ops.name_scope(
      name, "kl_categorical_categorical", [a.logits, b.logits]):
    # sum(p*ln(p/q))
    return math_ops.reduce_sum(
        nn_ops.softmax(a.logits)*(nn_ops.log_softmax(a.logits)
            - nn_ops.log_softmax(b.logits)), reduction_indices=[-1])
Example #15
def _kl_categorical_categorical(a, b, name=None):
  """Calculate the batched KL divergence KL(a || b) with a and b Categorical.

  Args:
    a: instance of a Categorical distribution object.
    b: instance of a Categorical distribution object.
    name: (optional) Name to use for created operations.
      default is "kl_categorical_categorical".

  Returns:
    Batchwise KL(a || b)
  """
  with ops.name_scope(name, "kl_categorical_categorical",
                      values=[a.logits, b.logits]):
    # sum(probs log(probs / (1 - probs)))
    delta_log_probs1 = (nn_ops.log_softmax(a.logits) -
                        nn_ops.log_softmax(b.logits))
    return math_ops.reduce_sum(nn_ops.softmax(a.logits) * delta_log_probs1,
                               axis=-1)
Example #16
def ctc_loss_and_grad(logits, labels, label_length, logit_length, unique=None):
  """Computes the CTC loss and gradients.

  Most users will want fwd_bwd.ctc_loss

  This function returns the computed gradient, it does not have a gradient
  of its own defined.

  Args:
    logits: tensor of shape [frames, batch_size, num_labels]
    labels: tensor of shape [batch_size, max_label_seq_length]
    label_length: tensor of shape [batch_size]
      Length of reference label sequence in labels.
    logit_length: tensor of shape [batch_size]
      Length of input sequence in logits.
    unique: (optional) unique label indices as computed by unique(labels)
      If supplied, enables an implementation that is faster and more memory
      efficient on TPU.

  Returns:
    loss: tensor of shape [batch_size]
    gradient: tensor of shape [frames, batch_size, num_labels]
  """

  num_labels = _get_dim(logits, 2)
  max_label_seq_length = _get_dim(labels, 1)

  ilabel_log_probs = nn_ops.log_softmax(logits)
  state_log_probs = _ilabel_to_state(labels, num_labels, ilabel_log_probs)
  state_trans_probs = _ctc_state_trans(labels)
  initial_state_log_probs, final_state_log_probs = ctc_state_log_probs(
      label_length, max_label_seq_length)
  fwd_bwd_log_probs, log_likelihood = _forward_backward_log(
      state_trans_log_probs=math_ops.log(state_trans_probs),
      initial_state_log_probs=initial_state_log_probs,
      final_state_log_probs=final_state_log_probs,
      observed_log_probs=state_log_probs,
      sequence_length=logit_length)

  if unique:
    olabel_log_probs = _state_to_olabel_unique(
        labels, num_labels, fwd_bwd_log_probs, unique)
  else:
    olabel_log_probs = _state_to_olabel(labels, num_labels, fwd_bwd_log_probs)

  grad = math_ops.exp(ilabel_log_probs) - math_ops.exp(olabel_log_probs)
  loss = -log_likelihood
  return loss, grad
 def _testOverflow(self, use_gpu=False):
     if use_gpu:
         type = np.float32
     else:
         type = np.float64
     max = np.finfo(type).max
     features = np.array([[1.0, 1.0, 1.0, 1.0], [max, 1.0, 2.0, 3.0]]).astype(type)
     with self.test_session(use_gpu=use_gpu):
         tf_log_softmax = nn_ops.log_softmax(features)
         out = tf_log_softmax.eval()
     self.assertAllClose(
         np.array([[-1.386294, -1.386294, -1.386294, -1.386294], [0, -max, -max, -max]]),
         out,
         rtol=1.0e-5,
         atol=1.0e-5,
     )
Example #18
 def _testOverflow(self, use_gpu=False):
   if use_gpu:
     type = np.float32  # pylint: disable=redefined-builtin
   else:
     type = np.float64  # pylint: disable=redefined-builtin
   max = np.finfo(type).max  # pylint: disable=redefined-builtin
   features = np.array([[1., 1., 1., 1.], [max, 1., 2., 3.]]).astype(type)
   with self.test_session(use_gpu=use_gpu):
     tf_log_softmax = nn_ops.log_softmax(features)
     out = tf_log_softmax.eval()
   self.assertAllClose(
       np.array([[-1.386294, -1.386294, -1.386294, -1.386294],
                 [0, -max, -max, -max]]),
       out,
       rtol=1.e-5,
       atol=1.e-5)
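The overflow tests above depend on log_softmax being computed in the shifted form x - max(x) - log(sum(exp(x - max(x)))) rather than as log(softmax(x)); a small NumPy illustration of why the naive form breaks down for large inputs (values chosen to mirror the test):

import numpy as np

x = np.array([np.finfo(np.float32).max, 1.0, 2.0, 3.0], dtype=np.float32)

# Naive form: exp(float32 max) overflows to inf, so the softmax becomes
# nan/0 and its log is nan/-inf.
naive = np.log(np.exp(x) / np.exp(x).sum())

# Shifted form (what log_softmax effectively does): subtract the max first.
shifted = (x - x.max()) - np.log(np.exp(x - x.max()).sum())
# shifted is approximately [0, -max, -max, -max], matching the assertion above.
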
 def _sample_n(self, n, seed=None):
     sample_shape = array_ops.concat_v2(([n], array_ops.shape(self.logits)), 0)
     logits = self.logits * array_ops.ones(sample_shape)
     if logits.get_shape().ndims == 2:
         logits_2d = logits
     else:
         logits_2d = array_ops.reshape(logits, [-1, self.num_classes])
     np_dtype = self.dtype.as_numpy_dtype
     minval = np.nextafter(np_dtype(0), np_dtype(1))
     uniform = random_ops.random_uniform(
         shape=array_ops.shape(logits_2d), minval=minval, maxval=1, dtype=self.dtype, seed=seed
     )
     gumbel = -math_ops.log(-math_ops.log(uniform))
     noisy_logits = math_ops.div(gumbel + logits_2d, self.temperature)
     samples = nn_ops.log_softmax(noisy_logits)
     ret = array_ops.reshape(samples, sample_shape)
     return ret
 def _testSoftmax(self, np_features, dim=-1, log=False, use_gpu=False):
     # A previous version of the code checked the op name rather than the op type
     # to distinguish between log and non-log.  Use an arbitrary name to catch
     # this bug in future.
     name = "arbitrary"
     np_softmax = self._npSoftmax(np_features, dim=dim, log=log)
     with self.test_session(use_gpu=use_gpu):
         if log:
             tf_softmax = nn_ops.log_softmax(np_features, dim=dim, name=name)
         else:
             tf_softmax = nn_ops.softmax(np_features, dim=dim, name=name)
         out = tf_softmax.eval()
     self.assertAllCloseAccordingToType(np_softmax, out)
     self.assertShapeEqual(np_softmax, tf_softmax)
     if not log:
         # Bonus check: the softmaxes should add to one in dimension dim.
         sum_along_dim = np.sum(out, axis=dim)
         self.assertAllCloseAccordingToType(np.ones(sum_along_dim.shape), sum_along_dim)
 def _sample_n(self, n, seed=None):
   sample_shape = array_ops.concat([[n], array_ops.shape(self.logits)], 0)
   logits = self.logits * array_ops.ones(sample_shape, dtype=self.dtype)
   logits_2d = array_ops.reshape(logits, [-1, self.event_size])
   # Uniform variates must be sampled from the open-interval `(0, 1)` rather
   # than `[0, 1)`. To do so, we use `np.finfo(self.dtype.as_numpy_dtype).tiny`
   # because it is the smallest, positive, "normal" number. A "normal" number
   # is such that the mantissa has an implicit leading 1. Normal, positive
   # numbers x, y have the reasonable property that, `x + y >= max(x, y)`. In
   # this case, a subnormal number (i.e., np.nextafter) can cause us to sample
   # 0.
   uniform = random_ops.random_uniform(
       shape=array_ops.shape(logits_2d),
       minval=np.finfo(self.dtype.as_numpy_dtype).tiny,
       maxval=1.,
       dtype=self.dtype,
       seed=seed)
   gumbel = -math_ops.log(-math_ops.log(uniform))
   noisy_logits = math_ops.div(gumbel + logits_2d, self._temperature_2d)
   samples = nn_ops.log_softmax(noisy_logits)
   ret = array_ops.reshape(samples, sample_shape)
   return ret
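For reference, the Gumbel perturbation used above can be written directly in NumPy; this is only an illustrative sketch of the sampling trick, in which an (exp-)relaxed one-hot sample is log_softmax((logits + gumbel) / temperature):

import numpy as np

def np_log_softmax(z, axis=-1):
  z = z - z.max(axis=axis, keepdims=True)
  return z - np.log(np.exp(z).sum(axis=axis, keepdims=True))

logits = np.array([0.5, 1.5, -0.3])
temperature = 0.7
rng = np.random.default_rng(0)

# Keep the uniforms away from 0 so that -log(-log(u)) stays finite.
u = rng.uniform(low=np.finfo(np.float64).tiny, high=1.0, size=logits.shape)
gumbel = -np.log(-np.log(u))
sample = np_log_softmax((logits + gumbel) / temperature)
# exp(sample) lies on the simplex; as temperature -> 0 it approaches a one-hot vector.
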
def _beam_search_step(time, logits, next_cell_state, beam_state, batch_size,
                      beam_width, end_token, length_penalty_weight):
  """Performs a single step of Beam Search Decoding.

  Args:
    time: Beam search time step, should start at 0. At time 0 we assume
      that all beams are equal and consider only the first beam for
      continuations.
    logits: Logits at the current time step. A tensor of shape
      `[batch_size, beam_width, vocab_size]`
    next_cell_state: The next state from the cell, e.g. an instance of
      AttentionWrapperState if the cell is attentional.
    beam_state: Current state of the beam search.
      An instance of `BeamSearchDecoderState`.
    batch_size: The batch size for this input.
    beam_width: Python int.  The size of the beams.
    end_token: The int32 end token.
    length_penalty_weight: Float weight to penalize length. Disabled with 0.0.

  Returns:
    A new beam state.
  """
  static_batch_size = tensor_util.constant_value(batch_size)

  # Calculate the current lengths of the predictions
  prediction_lengths = beam_state.lengths
  previously_finished = beam_state.finished

  # Calculate the total log probs for the new hypotheses
  # Final Shape: [batch_size, beam_width, vocab_size]
  step_log_probs = nn_ops.log_softmax(logits)
  step_log_probs = _mask_probs(step_log_probs, end_token, previously_finished)
  total_probs = array_ops.expand_dims(beam_state.log_probs, 2) + step_log_probs

  # Calculate the continuation lengths by adding to all continuing beams.
  vocab_size = logits.shape[-1].value or array_ops.shape(logits)[-1]
  lengths_to_add = array_ops.one_hot(
      indices=array_ops.fill([batch_size, beam_width], end_token),
      depth=vocab_size,
      on_value=np.int64(0),
      off_value=np.int64(1),
      dtype=dtypes.int64)
  add_mask = math_ops.to_int64(math_ops.logical_not(previously_finished))
  lengths_to_add *= array_ops.expand_dims(add_mask, 2)
  new_prediction_lengths = (
      lengths_to_add + array_ops.expand_dims(prediction_lengths, 2))

  # Calculate the scores for each beam
  scores = _get_scores(
      log_probs=total_probs,
      sequence_lengths=new_prediction_lengths,
      length_penalty_weight=length_penalty_weight)

  time = ops.convert_to_tensor(time, name="time")
  # During the first time step we only consider the initial beam
  scores_flat = array_ops.reshape(scores, [batch_size, -1])

  # Pick the next beams according to the specified successors function
  next_beam_size = ops.convert_to_tensor(
      beam_width, dtype=dtypes.int32, name="beam_width")
  next_beam_scores, word_indices = nn_ops.top_k(scores_flat, k=next_beam_size)

  next_beam_scores.set_shape([static_batch_size, beam_width])
  word_indices.set_shape([static_batch_size, beam_width])

  # Pick out the probs, beam_ids, and states according to the chosen predictions
  next_beam_probs = _tensor_gather_helper(
      gather_indices=word_indices,
      gather_from=total_probs,
      batch_size=batch_size,
      range_size=beam_width * vocab_size,
      gather_shape=[-1],
      name="next_beam_probs")
  # Note: just doing the following
  #   math_ops.to_int32(word_indices % vocab_size,
  #       name="next_beam_word_ids")
  # would be a lot cleaner but for reasons unclear, that hides the results of
  # the op which prevents capturing it with tfdbg debug ops.
  raw_next_word_ids = math_ops.mod(
      word_indices, vocab_size, name="next_beam_word_ids")
  next_word_ids = math_ops.to_int32(raw_next_word_ids)
  next_beam_ids = math_ops.to_int32(
      word_indices / vocab_size, name="next_beam_parent_ids")

  # Append new ids to current predictions
  previously_finished = _tensor_gather_helper(
      gather_indices=next_beam_ids,
      gather_from=previously_finished,
      batch_size=batch_size,
      range_size=beam_width,
      gather_shape=[-1])
  next_finished = math_ops.logical_or(
      previously_finished,
      math_ops.equal(next_word_ids, end_token),
      name="next_beam_finished")

  # Calculate the length of the next predictions.
  # 1. Finished beams remain unchanged.
  # 2. Beams that are now finished (EOS predicted) have their length
  #    increased by 1.
  # 3. Beams that are not yet finished have their length increased by 1.
  lengths_to_add = math_ops.to_int64(math_ops.logical_not(previously_finished))
  next_prediction_len = _tensor_gather_helper(
      gather_indices=next_beam_ids,
      gather_from=beam_state.lengths,
      batch_size=batch_size,
      range_size=beam_width,
      gather_shape=[-1])
  next_prediction_len += lengths_to_add

  # Pick out the cell_states according to the next_beam_ids. We use a
  # different gather_shape here because the cell_state tensors, i.e.
  # the tensors that would be gathered from, all have dimension
  # greater than two and we need to preserve those dimensions.
  # pylint: disable=g-long-lambda
  next_cell_state = nest.map_structure(
      lambda gather_from: _maybe_tensor_gather_helper(
          gather_indices=next_beam_ids,
          gather_from=gather_from,
          batch_size=batch_size,
          range_size=beam_width,
          gather_shape=[batch_size * beam_width, -1]),
      next_cell_state)
  # pylint: enable=g-long-lambda

  next_state = BeamSearchDecoderState(
      cell_state=next_cell_state,
      log_probs=next_beam_probs,
      lengths=next_prediction_len,
      finished=next_finished)

  output = BeamSearchDecoderOutput(
      scores=next_beam_scores,
      predicted_ids=next_word_ids,
      parent_ids=next_beam_ids)

  return output, next_state
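The central indexing trick in _beam_search_step is that scores are flattened to [batch_size, beam_width * vocab_size] before top_k, so every selected index simultaneously encodes a parent beam and a word id; a tiny NumPy illustration for one batch entry (beam_width=3, vocab_size=5 as in the tests above):

import numpy as np

beam_width, vocab_size = 3, 5
scores = np.random.randn(beam_width, vocab_size)      # scores for one batch entry
flat = scores.reshape(-1)                             # [beam_width * vocab_size]
word_indices = np.argsort(flat)[::-1][:beam_width]    # top-k over the flattened axis

next_beam_ids = word_indices // vocab_size   # which beam each hypothesis extends
next_word_ids = word_indices % vocab_size    # which token it appends
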
  def test_step(self):

    def get_probs():
      """this simulates the initialize method in BeamSearchDecoder."""
      log_prob_mask = array_ops.one_hot(
          array_ops.zeros([self.batch_size], dtype=dtypes.int32),
          depth=self.beam_width,
          on_value=True,
          off_value=False,
          dtype=dtypes.bool)

      log_prob_zeros = array_ops.zeros(
          [self.batch_size, self.beam_width], dtype=dtypes.float32)
      log_prob_neg_inf = array_ops.ones(
          [self.batch_size, self.beam_width], dtype=dtypes.float32) * -np.Inf

      log_probs = array_ops.where(log_prob_mask, log_prob_zeros,
                                  log_prob_neg_inf)
      return log_probs

    log_probs = get_probs()
    dummy_cell_state = array_ops.zeros([self.batch_size, self.beam_width])

    # pylint: disable=invalid-name
    _finished = array_ops.one_hot(
        array_ops.zeros([self.batch_size], dtype=dtypes.int32),
        depth=self.beam_width,
        on_value=False,
        off_value=True,
        dtype=dtypes.bool)
    _lengths = np.zeros([self.batch_size, self.beam_width], dtype=np.int64)
    _lengths[:, 0] = 2
    _lengths = constant_op.constant(_lengths, dtype=dtypes.int64)

    beam_state = beam_search_decoder.BeamSearchDecoderState(
        cell_state=dummy_cell_state,
        log_probs=log_probs,
        lengths=_lengths,
        finished=_finished,
        accumulated_attention_probs=())

    logits_ = np.full([self.batch_size, self.beam_width, self.vocab_size],
                      0.0001)
    logits_[0, 0, 2] = 1.9
    logits_[0, 0, 3] = 2.1
    logits_[0, 1, 3] = 3.1
    logits_[0, 1, 4] = 0.9
    logits_[1, 0, 1] = 0.5
    logits_[1, 1, 2] = 2.7
    logits_[1, 2, 2] = 10.0
    logits_[1, 2, 3] = 0.2
    logits = constant_op.constant(logits_, dtype=dtypes.float32)
    log_probs = nn_ops.log_softmax(logits)

    outputs, next_beam_state = beam_search_decoder._beam_search_step(
        time=2,
        logits=logits,
        next_cell_state=dummy_cell_state,
        beam_state=beam_state,
        batch_size=ops.convert_to_tensor(self.batch_size),
        beam_width=self.beam_width,
        end_token=self.end_token,
        length_penalty_weight=self.length_penalty_weight,
        coverage_penalty_weight=self.coverage_penalty_weight)

    with self.cached_session() as sess:
      outputs_, next_state_, _, _ = sess.run(
          [outputs, next_beam_state, beam_state, log_probs])

    self.assertEqual(outputs_.predicted_ids[0, 0], 3)
    self.assertEqual(outputs_.predicted_ids[0, 1], 2)
    self.assertEqual(outputs_.predicted_ids[1, 0], 1)
    neg_inf = -np.Inf
    self.assertAllEqual(
        next_state_.log_probs[:, -3:],
        [[neg_inf, neg_inf, neg_inf], [neg_inf, neg_inf, neg_inf]])
    self.assertEqual((next_state_.log_probs[:, :-3] > neg_inf).all(), True)
    self.assertEqual((next_state_.lengths[:, :-3] > 0).all(), True)
    self.assertAllEqual(next_state_.lengths[:, -3:], [[0, 0, 0], [0, 0, 0]])
Example #24
 def _entropy(self):
   return -math_ops.reduce_sum(
       nn_ops.log_softmax(self.logits) * self.probs, axis=-1)
def _beam_search_step(time, logits, beam_state, batch_size, beam_width,
                      end_token, length_penalty_weight):
    """Performs a single step of Beam Search Decoding.

  Args:
    time: Beam search time step, should start at 0. At time 0 we assume
      that all beams are equal and consider only the first beam for
      continuations.
    logits: Logits at the current time step. A tensor of shape
      `[batch_size, beam_width, vocab_size]`
    beam_state: Current state of the beam search.
      An instance of `BeamSearchDecoderState`.
    batch_size: The batch size for this input.
    beam_width: Python int.  The size of the beams.
    end_token: The int32 end token.
    length_penalty_weight: Float weight to penalize length. Disabled with 0.0.

  Returns:
    A new beam state.
  """
    static_batch_size = tensor_util.constant_value(batch_size)

    # Calculate the current lengths of the predictions
    prediction_lengths = beam_state.lengths
    previously_finished = beam_state.finished

    # Calculate the total log probs for the new hypotheses
    # Final Shape: [batch_size, beam_width, vocab_size]
    step_log_probs = nn_ops.log_softmax(logits)
    step_log_probs = _mask_probs(step_log_probs, end_token,
                                 previously_finished)
    total_probs = array_ops.expand_dims(beam_state.log_probs,
                                        2) + step_log_probs

    # Calculate the continuation lengths by adding to all continuing beams.
    vocab_size = logits.shape[-1].value
    lengths_to_add = array_ops.one_hot(indices=array_ops.tile(
        array_ops.reshape(end_token, [1, 1]), [batch_size, beam_width]),
                                       depth=vocab_size,
                                       on_value=0,
                                       off_value=1)
    add_mask = (1 - math_ops.to_int32(previously_finished))
    lengths_to_add = array_ops.expand_dims(add_mask, 2) * lengths_to_add
    new_prediction_lengths = (lengths_to_add +
                              array_ops.expand_dims(prediction_lengths, 2))

    # Calculate the scores for each beam
    scores = _get_scores(log_probs=total_probs,
                         sequence_lengths=new_prediction_lengths,
                         length_penalty_weight=length_penalty_weight)

    time = ops.convert_to_tensor(time, name="time")
    # During the first time step we only consider the initial beam
    scores_flat = control_flow_ops.cond(
        time > 0, lambda: array_ops.reshape(scores, [batch_size, -1]),
        lambda: scores[:, 0])

    # Pick the next beams according to the specified successors function
    next_beam_scores, word_indices = nn_ops.top_k(scores_flat, k=beam_width)
    next_beam_scores.set_shape([static_batch_size, beam_width])
    word_indices.set_shape([static_batch_size, beam_width])

    # Pick out the probs, beam_ids, and states according to the chosen predictions
    next_beam_probs = _tensor_gather_helper(
        gather_indices=word_indices,
        gather_from=total_probs,
        range_input=batch_size,
        range_size=beam_width * vocab_size,
        final_shape=[static_batch_size, beam_width])

    next_word_ids = math_ops.to_int32(word_indices % vocab_size)
    next_beam_ids = math_ops.to_int32(word_indices / vocab_size)

    # Append new ids to current predictions
    previously_finished = _tensor_gather_helper(
        gather_indices=next_beam_ids,
        gather_from=previously_finished,
        range_input=batch_size,
        range_size=beam_width,
        final_shape=[static_batch_size, beam_width])
    next_finished = math_ops.logical_or(
        previously_finished, math_ops.equal(next_word_ids, end_token))

    # Calculate the length of the next predictions.
    # 1. Finished beams remain unchanged
    # 2. Beams that are now finished (EOS predicted) remain unchanged
    # 3. Beams that are not yet finished have their length increased by 1
    lengths_to_add = math_ops.to_int32(
        math_ops.not_equal(next_word_ids, end_token))
    lengths_to_add = (1 - math_ops.to_int32(next_finished)) * lengths_to_add
    next_prediction_len = _tensor_gather_helper(
        gather_indices=next_beam_ids,
        gather_from=beam_state.lengths,
        range_input=batch_size,
        range_size=beam_width,
        final_shape=[static_batch_size, beam_width])
    next_prediction_len += lengths_to_add

    next_state = BeamSearchDecoderState(cell_state=beam_state.cell_state,
                                        log_probs=next_beam_probs,
                                        lengths=next_prediction_len,
                                        finished=next_finished)

    output = BeamSearchDecoderOutput(scores=next_beam_scores,
                                     predicted_ids=next_word_ids,
                                     parent_ids=next_beam_ids)

    return output, next_state
Example #26
def _beam_search_step(time, logits, next_cell_state, beam_state, batch_size,
                      beam_width, end_token, length_penalty_weight):
    """Performs a single step of Beam Search Decoding.

  Args:
    time: Beam search time step, should start at 0. At time 0 we assume
      that all beams are equal and consider only the first beam for
      continuations.
    logits: Logits at the current time step. A tensor of shape
      `[batch_size, beam_width, vocab_size]`
    next_cell_state: The next state from the cell, e.g. an instance of
      AttentionWrapperState if the cell is attentional.
    beam_state: Current state of the beam search.
      An instance of `BeamSearchDecoderState`.
    batch_size: The batch size for this input.
    beam_width: Python int.  The size of the beams.
    end_token: The int32 end token.
    length_penalty_weight: Float weight to penalize length. Disabled with 0.0.

  Returns:
    A new beam state.
  """
    static_batch_size = tensor_util.constant_value(batch_size)

    # Calculate the current lengths of the predictions
    prediction_lengths = beam_state.lengths
    previously_finished = beam_state.finished

    # Calculate the total log probs for the new hypotheses
    # Final Shape: [batch_size, beam_width, vocab_size]
    step_log_probs = nn_ops.log_softmax(logits)
    step_log_probs = _mask_probs(step_log_probs, end_token,
                                 previously_finished)
    total_probs = array_ops.expand_dims(beam_state.log_probs,
                                        2) + step_log_probs

    # Calculate the continuation lengths by adding to all continuing beams.
    vocab_size = logits.shape[-1].value or array_ops.shape(logits)[-1]
    lengths_to_add = array_ops.one_hot(indices=array_ops.fill(
        [batch_size, beam_width], end_token),
                                       depth=vocab_size,
                                       on_value=np.int64(0),
                                       off_value=np.int64(1),
                                       dtype=dtypes.int64)
    add_mask = math_ops.to_int64(math_ops.logical_not(previously_finished))
    lengths_to_add *= array_ops.expand_dims(add_mask, 2)
    new_prediction_lengths = (lengths_to_add +
                              array_ops.expand_dims(prediction_lengths, 2))

    # Calculate the scores for each beam
    scores = _get_scores(log_probs=total_probs,
                         sequence_lengths=new_prediction_lengths,
                         length_penalty_weight=length_penalty_weight)

    time = ops.convert_to_tensor(time, name="time")
    # During the first time step we only consider the initial beam
    scores_flat = array_ops.reshape(scores, [batch_size, -1])

    # Pick the next beams according to the specified successors function
    next_beam_size = ops.convert_to_tensor(beam_width,
                                           dtype=dtypes.int32,
                                           name="beam_width")
    next_beam_scores, word_indices = nn_ops.top_k(scores_flat,
                                                  k=next_beam_size)

    next_beam_scores.set_shape([static_batch_size, beam_width])
    word_indices.set_shape([static_batch_size, beam_width])

    # Pick out the probs, beam_ids, and states according to the chosen predictions
    next_beam_probs = _tensor_gather_helper(gather_indices=word_indices,
                                            gather_from=total_probs,
                                            batch_size=batch_size,
                                            range_size=beam_width * vocab_size,
                                            gather_shape=[-1],
                                            name="next_beam_probs")
    # Note: just doing the following
    #   math_ops.to_int32(word_indices % vocab_size,
    #       name="next_beam_word_ids")
    # would be a lot cleaner but for reasons unclear, that hides the results of
    # the op which prevents capturing it with tfdbg debug ops.
    raw_next_word_ids = math_ops.mod(word_indices,
                                     vocab_size,
                                     name="next_beam_word_ids")
    next_word_ids = math_ops.to_int32(raw_next_word_ids)
    next_beam_ids = math_ops.to_int32(word_indices / vocab_size,
                                      name="next_beam_parent_ids")

    # Append new ids to current predictions
    previously_finished = _tensor_gather_helper(
        gather_indices=next_beam_ids,
        gather_from=previously_finished,
        batch_size=batch_size,
        range_size=beam_width,
        gather_shape=[-1])
    next_finished = math_ops.logical_or(previously_finished,
                                        math_ops.equal(next_word_ids,
                                                       end_token),
                                        name="next_beam_finished")

    # Calculate the length of the next predictions.
    # 1. Finished beams remain unchanged.
    # 2. Beams that are now finished (EOS predicted) have their length
    #    increased by 1.
    # 3. Beams that are not yet finished have their length increased by 1.
    lengths_to_add = math_ops.to_int64(
        math_ops.logical_not(previously_finished))
    next_prediction_len = _tensor_gather_helper(gather_indices=next_beam_ids,
                                                gather_from=beam_state.lengths,
                                                batch_size=batch_size,
                                                range_size=beam_width,
                                                gather_shape=[-1])
    next_prediction_len += lengths_to_add

    # Pick out the cell_states according to the next_beam_ids. We use a
    # different gather_shape here because the cell_state tensors, i.e.
    # the tensors that would be gathered from, all have dimension
    # greater than two and we need to preserve those dimensions.
    # pylint: disable=g-long-lambda
    next_cell_state = nest.map_structure(
        lambda gather_from: _maybe_tensor_gather_helper(
            gather_indices=next_beam_ids,
            gather_from=gather_from,
            batch_size=batch_size,
            range_size=beam_width,
            gather_shape=[batch_size * beam_width, -1]), next_cell_state)
    # pylint: enable=g-long-lambda

    next_state = BeamSearchDecoderState(cell_state=next_cell_state,
                                        log_probs=next_beam_probs,
                                        lengths=next_prediction_len,
                                        finished=next_finished)

    output = BeamSearchDecoderOutput(scores=next_beam_scores,
                                     predicted_ids=next_word_ids,
                                     parent_ids=next_beam_ids)

    return output, next_state
 def _log_unnormalized_prob(self, counts):
     counts = self._maybe_assert_valid_sample(counts)
     return math_ops.reduce_sum(counts * nn_ops.log_softmax(self.logits),
                                -1)
def _beam_search_step(time, logits, next_cell_state, beam_state, batch_size,
                      beam_width, end_token, length_penalty_weight):
    """Performs a single step of Beam Search Decoding.
    Args:
      time: Beam search time step, should start at 0. At time 0 we assume
        that all beams are equal and consider only the first beam for
        continuations.
      logits: Logits at the current time step. A tensor of shape
        `[batch_size, beam_width, vocab_size]`
      next_cell_state: The next state from the cell, e.g. an instance of
        AttentionWrapperState if the cell is attentional.
      beam_state: Current state of the beam search.
        An instance of `BeamSearchDecoderState`.
      batch_size: The batch size for this input.
      beam_width: Python int.  The size of the beams.
      end_token: The int32 end token.
      length_penalty_weight: Float weight to penalize length. Disabled with 0.0.
    Returns:
      A new beam state.
    """
    static_batch_size = tensor_util.constant_value(batch_size)

    # Calculate the current lengths of the predictions
    prediction_lengths = beam_state.lengths
    previously_finished = beam_state.finished

    # Calculate the total log probs for the new hypotheses
    # Final Shape: [batch_size, beam_width, vocab_size]
    step_log_probs = nn_ops.log_softmax(logits)
    step_log_probs = _mask_probs(
        step_log_probs, end_token, previously_finished)
    total_probs = tf.expand_dims(
        beam_state.log_probs, axis=2) + step_log_probs

    # Calculate the continuation lengths by adding to all continuing beams.
    vocab_size = logits.shape[-1].value
    lengths_to_add = tf.one_hot(
        indices=tf.tile(
            tf.reshape(end_token, [1, 1]), [batch_size, beam_width]),
        depth=vocab_size,
        on_value=0,
        off_value=1)
    add_mask = (1 - tf.to_int32(previously_finished))
    lengths_to_add = tf.expand_dims(add_mask, 2) * lengths_to_add
    new_prediction_lengths = (
        lengths_to_add + tf.expand_dims(prediction_lengths, 2))

    # Calculate the scores for each beam
    scores = _get_scores(
        log_probs=total_probs,
        sequence_lengths=new_prediction_lengths,
        length_penalty_weight=length_penalty_weight)

    time = ops.convert_to_tensor(time, name="time")
    # During the first time step we only consider the initial beam
    scores_shape = tf.shape(scores)
    scores_flat = tf.cond(
        time > 0,
        lambda: tf.reshape(scores, [batch_size, -1]),
        lambda: scores[:, 0])
    num_available_beam = tf.cond(
        time > 0, lambda: tf.reduce_prod(scores_shape[1:]),
        lambda: tf.reduce_prod(scores_shape[2:]))

    # Pick the next beams according to the specified successors function
    next_beam_size = tf.minimum(
        ops.convert_to_tensor(
            beam_width, dtype=dtypes.int32, name="beam_width"),
        num_available_beam)
    next_beam_scores, word_indices = nn_ops.top_k(
        scores_flat, k=next_beam_size)
    next_beam_scores.set_shape([static_batch_size, beam_width])
    word_indices.set_shape([static_batch_size, beam_width])

    # Pick out the probs, beam_ids, and states according to the chosen
    # predictions
    next_beam_probs = _tensor_gather_helper(
        gather_indices=word_indices,
        gather_from=total_probs,
        batch_size=batch_size,
        range_size=beam_width * vocab_size,
        gather_shape=[-1])
    next_word_ids = tf.to_int32(word_indices % vocab_size)
    next_beam_ids = tf.to_int32(word_indices / vocab_size)

    # Append new ids to current predictions
    previously_finished = _tensor_gather_helper(
        gather_indices=next_beam_ids,
        gather_from=previously_finished,
        batch_size=batch_size,
        range_size=beam_width,
        gather_shape=[-1])
    next_finished = tf.logical_or(previously_finished,
                                  tf.equal(next_word_ids, end_token))

    # Calculate the length of the next predictions.
    # 1. Finished beams remain unchanged
    # 2. Beams that are now finished (EOS predicted) remain unchanged
    # 3. Beams that are not yet finished have their length increased by 1
    lengths_to_add = tf.to_int32(
        tf.not_equal(next_word_ids, end_token))
    lengths_to_add = (1 - tf.to_int32(next_finished)) * lengths_to_add
    next_prediction_len = _tensor_gather_helper(
        gather_indices=next_beam_ids,
        gather_from=beam_state.lengths,
        batch_size=batch_size,
        range_size=beam_width,
        gather_shape=[-1])
    next_prediction_len += lengths_to_add

    # Pick out the cell_states according to the next_beam_ids. We use a
    # different gather_shape here because the cell_state tensors, i.e.
    # the tensors that would be gathered from, all have dimension
    # greater than two and we need to preserve those dimensions.
    next_cell_state = nest.map_structure(
        lambda gather_from: _maybe_tensor_gather_helper(
            gather_indices=next_beam_ids,
            gather_from=gather_from,
            batch_size=batch_size,
            range_size=beam_width,
            gather_shape=[batch_size * beam_width, -1]),
        next_cell_state)

    next_state = BeamSearchDecoderState(
        cell_state=next_cell_state,
        log_probs=next_beam_probs,
        lengths=next_prediction_len,
        finished=next_finished)

    output = BeamSearchDecoderOutput(
        scores=next_beam_scores,
        predicted_ids=next_word_ids,
        parent_ids=next_beam_ids)

    return output, next_state
Example #29
def _beam_search_step(time, logits, next_cell_state, beam_state, batch_size,
                      beam_width, end_token, length_penalty_weight):
    """Performs a single step of Beam Search Decoding.

  Args:
    time: Beam search time step, should start at 0. At time 0 we assume
      that all beams are equal and consider only the first beam for
      continuations.
    logits: Logits at the current time step. A tensor of shape
      `[batch_size, beam_width, vocab_size]`
    next_cell_state: The next state from the cell, e.g. an instance of
      AttentionWrapperState if the cell is attentional.
    beam_state: Current state of the beam search.
      An instance of `BeamSearchDecoderState`.
    batch_size: The batch size for this input.
    beam_width: Python int.  The size of the beams.
    end_token: The int32 end token.
    length_penalty_weight: Float weight to penalize length. Disabled with 0.0.

  Returns:
    A new beam state.
  """

    static_batch_size = tensor_util.constant_value(batch_size)

    # Calculate the current lengths of the predictions
    prediction_lengths = beam_state.lengths
    previously_finished = beam_state.finished

    # Calculate the total log probs for the new hypotheses
    # Final Shape: [batch_size, beam_width, vocab_size]
    step_log_probs = nn_ops.log_softmax(logits)
    #step_log_probs",Tensor shape=(?, 10, 56136)
    step_log_probs = _mask_probs(step_log_probs, end_token,
                                 previously_finished)
    #step_log_probs_masked (?, 10, 56136)
    total_probs = array_ops.expand_dims(beam_state.log_probs,
                                        2) + step_log_probs
    #total_probs (?, 10, 56136)
    # Calculate the continuation lengths by adding to all continuing beams.
    vocab_size = logits.shape[-1].value or array_ops.shape(logits)[-1]
    lengths_to_add = array_ops.one_hot(
        indices=array_ops.tile(array_ops.reshape(end_token, [1, 1]),
                               [batch_size, beam_width]),
        depth=vocab_size,
        on_value=constant_op.constant(0, dtype=dtypes.int64),
        off_value=constant_op.constant(1, dtype=dtypes.int64),
        dtype=dtypes.int64)
    #lengths_to_add shape=(?, 10, 56136)
    add_mask = (1 - math_ops.to_int64(previously_finished))
    #add_mask shape=(?, 10), dtype=int64
    lengths_to_add = array_ops.expand_dims(add_mask, 2) * lengths_to_add
    #lengths_to_add shape=(?, 10, 56136)
    new_prediction_lengths = (lengths_to_add +
                              array_ops.expand_dims(prediction_lengths, 2))
    #new_prediction_lengths shape=(?, 10, 56136)
    # Calculate the scores for each beam
    scores = _get_scores(log_probs=total_probs,
                         sequence_lengths=new_prediction_lengths,
                         length_penalty_weight=length_penalty_weight)
    scores_mask = tf.constant([step_log_probs.dtype.min, 0],
                              dtype=dtypes.float32,
                              shape=[vocab_size],
                              name='mask')
    scores_masked = tf.add(scores, scores_mask)
    scores_mask2 = tf.constant([0, 0, 0, 0, 0, step_log_probs.dtype.min, 0],
                               dtype=dtypes.float32,
                               shape=[vocab_size],
                               name='mask2')
    scores_masked = tf.add(scores_mask2, scores_masked)

    def new_scores(scores_masked):
        scores_no_stop = tf.constant([0, 0, step_log_probs.dtype.min, 0],
                                     dtype=dtypes.float32,
                                     shape=[vocab_size],
                                     name='no_stop')
        scores = tf.add(scores_masked, scores_no_stop)
        return scores

    #constrain the length
    scores = control_flow_ops.cond(
        #time <9 ,
        time < 0,
        lambda: new_scores(scores_masked),
        lambda: scores_masked)

    #scores shape=(?, 10, 56136)
    #[batch_size, beam_width, vocab_size]
    time = ops.convert_to_tensor(time, name="time")
    # During the first time step we only consider the initial beam
    scores_shape = array_ops.shape(scores)
    #scores_shape" shape=(3,)
    print("scores_shape", scores_shape)
    scores_to_flat_1 = array_ops.reshape(scores, [batch_size, 2, -1])
    print("scores_to_flat_1", scores_to_flat_1)
    scores_to_0 = scores[:, 0]
    scores_to_1 = scores[:, -1]
    scores_to_flat_2 = tf.concat([scores_to_0, scores_to_1], 1)
    scores_flat = control_flow_ops.cond(
        time > 0, lambda: scores_to_flat_1,
        lambda: array_ops.reshape(scores_to_flat_2, [batch_size, 2, -1]))
    num_available_beam = control_flow_ops.cond(
        time > 0, lambda: math_ops.reduce_prod(scores_shape[1:]),
        lambda: math_ops.reduce_prod(scores_shape[2:]))
    #scores_flat", shape=(?, ?)
    #num_available_beam" shape=()
    # Pick the next beams according to the specified successors function
    next_beam_size = math_ops.minimum(
        ops.convert_to_tensor(beam_width,
                              dtype=dtypes.int32,
                              name="beam_width"), num_available_beam)
    #scores_t = tf.reshape(scores_flat,[batch_size,2,-1])
    ############################
    #input_words=['entrencheds01', 'entrencheds02', 'forgev01', 'forgev04', \
    #             'hitn02', 'hitn03', 'vaultn02', 'vaultn04', 'deepa03', \
    #             'deeps02', 'admitv01', 'admitv02', 'plantn01', 'plantn02',\
    #             'squaren01', 'squaren05', 'drawv05', 'drawv06', 'spellv03', \
    #             'spellv02', 'shotn02', 'shotn04', 'coachv01', 'coachv02', 'casen05',\
    #             'casen09', 'focusn01', 'focusn02', 'tasten01', 'tasten04', 'footn01', \
    #             'footv01']
    input_words = get_words()
    return_list = prior_scores(input_words)
    return_array = np.array(return_list)
    return_tensor = tf.convert_to_tensor(return_array)
    tiling = [1, 5, 1]
    prior_mask = tf.tile(tf.expand_dims(return_tensor, 1), tiling)
    prior_mask = tf.cast(prior_mask, tf.float32)
    prior_mask = array_ops.reshape(prior_mask, [batch_size, -1])
    #print ("prior_mask",prior_mask)
    scores_sum = tf.reduce_sum(scores_to_flat_1, 1)

    #print ("scores_sum_1",scores_sum)
    #def cal_scores_sum(scores_sum,prior_mask):
    #    return tf.add(scores_sum,prior_mask)
    #scores_sum = control_flow_ops.cond(
    #    time > 0,
    #    lambda: cal_scores_sum(scores_sum,prior_mask),
    #    lambda: scores_sum)
    #scores_sum=tf.add(scores_sum,prior_mask)
    #print ("scores_sum_2",scores_sum)
    ############################

    #scores_final=tf.concat([scores_sum, scores_sum],1)
    def cal_scores_indices(scores_to_0, scores_to_1):
        next_beam_scores_1, word_indices_1 = nn_ops.top_k(scores_to_0, k=5)
        print("ori next_beam_scores_1,word_indices_1", next_beam_scores_1)
        print("ori word_indices_1", word_indices_1)
        next_beam_scores_2, word_indices_2 = nn_ops.top_k(scores_to_1, k=5)
        next_beam_scores = tf.concat([next_beam_scores_1, next_beam_scores_2],
                                     1)
        word_indices = tf.concat(
            [word_indices_1, word_indices_2 + 9 * vocab_size], 1)
        return next_beam_scores, word_indices

    def cal_scores_indices_t1(scores_final, next_beam_size):
        next_beam_scores_1, word_indices_1 = nn_ops.top_k(scores_final, k=5)
        #next_beam_scores_1, word_indices_1=sample(next_beam_scores_1,word_indices_1)
        print("next_beam_scores_1", next_beam_scores_1)
        print("word_indices_1", word_indices_1)
        next_beam_scores = tf.concat([next_beam_scores_1, next_beam_scores_1],
                                     1)
        word_indices = tf.concat(
            [word_indices_1, word_indices_1 + 5 * vocab_size], 1)
        return next_beam_scores, word_indices

    next_beam_scores, word_indices = control_flow_ops.cond(
        time > 0, lambda: cal_scores_indices_t1(scores_sum, next_beam_size),
        lambda: cal_scores_indices(scores_to_0, scores_to_1))
    print('next_beam_scores.shape', next_beam_scores.shape)
    next_beam_scores.set_shape([static_batch_size, beam_width])
    word_indices.set_shape([static_batch_size, beam_width])
    #shape=(?, ?)
    # Pick out the probs, beam_ids, and states according to the chosen predictions

    next_beam_probs = _tensor_gather_helper(gather_indices=word_indices,
                                            gather_from=total_probs,
                                            batch_size=batch_size,
                                            range_size=beam_width * vocab_size,
                                            gather_shape=[-1],
                                            name="next_beam_probs")
    # Note: just doing the following
    #   math_ops.to_int32(word_indices % vocab_size,
    #       name="next_beam_word_ids")
    # would be a lot cleaner but for reasons unclear, that hides the results of
    # the op which prevents capturing it with tfdbg debug ops.
    raw_next_word_ids = math_ops.mod(word_indices,
                                     vocab_size,
                                     name="next_beam_word_ids")
    #raw_next_word_ids shape=(?, 10)
    next_word_ids = math_ops.to_int32(raw_next_word_ids)
    next_beam_ids = math_ops.to_int32(word_indices / vocab_size,
                                      name="next_beam_parent_ids")

    # Append new ids to current predictions
    previously_finished = _tensor_gather_helper(
        gather_indices=next_beam_ids,
        gather_from=previously_finished,
        batch_size=batch_size,
        range_size=beam_width,
        gather_shape=[-1])
    next_finished = math_ops.logical_or(previously_finished,
                                        math_ops.equal(next_word_ids,
                                                       end_token),
                                        name="next_beam_finished")

    # Calculate the length of the next predictions.
    # 1. Finished beams remain unchanged
    # 2. Beams that are now finished (EOS predicted) remain unchanged
    # 3. Beams that are not yet finished have their length increased by 1
    lengths_to_add = math_ops.to_int64(
        math_ops.not_equal(next_word_ids, end_token))
    lengths_to_add = (1 - math_ops.to_int64(next_finished)) * lengths_to_add
    next_prediction_len = _tensor_gather_helper(gather_indices=next_beam_ids,
                                                gather_from=beam_state.lengths,
                                                batch_size=batch_size,
                                                range_size=beam_width,
                                                gather_shape=[-1])
    next_prediction_len += lengths_to_add

    # Pick out the cell_states according to the next_beam_ids. We use a
    # different gather_shape here because the cell_state tensors, i.e.
    # the tensors that would be gathered from, all have dimension
    # greater than two and we need to preserve those dimensions.
    # pylint: disable=g-long-lambda
    next_cell_state = nest.map_structure(
        lambda gather_from: _maybe_tensor_gather_helper(
            gather_indices=next_beam_ids,
            gather_from=gather_from,
            batch_size=batch_size,
            range_size=beam_width,
            gather_shape=[batch_size * beam_width, -1]), next_cell_state)
    # pylint: enable=g-long-lambda

    next_state = BeamSearchDecoderState(cell_state=next_cell_state,
                                        log_probs=next_beam_probs,
                                        lengths=next_prediction_len,
                                        finished=next_finished)
    print('next_beam_probs', next_beam_probs)
    output = BeamSearchDecoderOutput(scores=next_beam_scores,
                                     predicted_ids=next_word_ids,
                                     parent_ids=next_beam_ids)

    return output, next_state
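# The step above selects the top k over scores flattened to
# [batch_size, beam_width * vocab_size] and then recovers the parent beam and the
# emitted token with integer division and modulo. A minimal numpy sketch of that
# decomposition; the toy sizes below are assumptions, not values from the code above.
import numpy as np

beam_width, vocab_size = 3, 5
scores = np.random.rand(2, beam_width * vocab_size)          # flattened scores per batch entry
flat_idx = np.argsort(-scores, axis=1)[:, :beam_width]       # indices of the top-k flat scores
parent_beam_ids = flat_idx // vocab_size                     # which existing beam each pick extends
word_ids = flat_idx % vocab_size                             # which vocabulary token was appended
assert (parent_beam_ids * vocab_size + word_ids == flat_idx).all()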
    def test_step(self):
        def get_probs():
            """this simulates the initialize method in BeamSearchDecoder."""
            log_prob_mask = array_ops.one_hot(array_ops.zeros(
                [self.batch_size], dtype=dtypes.int32),
                                              depth=self.beam_width,
                                              on_value=True,
                                              off_value=False,
                                              dtype=dtypes.bool)

            log_prob_zeros = array_ops.zeros(
                [self.batch_size, self.beam_width], dtype=dtypes.float32)
            log_prob_neg_inf = array_ops.ones(
                [self.batch_size, self.beam_width],
                dtype=dtypes.float32) * -np.Inf

            log_probs = array_ops.where(log_prob_mask, log_prob_zeros,
                                        log_prob_neg_inf)
            return log_probs

        log_probs = get_probs()
        dummy_cell_state = array_ops.zeros([self.batch_size, self.beam_width])

        # pylint: disable=invalid-name
        _finished = array_ops.one_hot(array_ops.zeros([self.batch_size],
                                                      dtype=dtypes.int32),
                                      depth=self.beam_width,
                                      on_value=False,
                                      off_value=True,
                                      dtype=dtypes.bool)
        _lengths = np.zeros([self.batch_size, self.beam_width], dtype=np.int64)
        _lengths[:, 0] = 2
        _lengths = constant_op.constant(_lengths, dtype=dtypes.int64)

        beam_state = beam_search_decoder.BeamSearchDecoderState(
            cell_state=dummy_cell_state,
            log_probs=log_probs,
            lengths=_lengths,
            finished=_finished)

        logits_ = np.full([self.batch_size, self.beam_width, self.vocab_size],
                          0.0001)
        logits_[0, 0, 2] = 1.9
        logits_[0, 0, 3] = 2.1
        logits_[0, 1, 3] = 3.1
        logits_[0, 1, 4] = 0.9
        logits_[1, 0, 1] = 0.5
        logits_[1, 1, 2] = 2.7
        logits_[1, 2, 2] = 10.0
        logits_[1, 2, 3] = 0.2
        logits = constant_op.constant(logits_, dtype=dtypes.float32)
        log_probs = nn_ops.log_softmax(logits)

        outputs, next_beam_state = beam_search_decoder._beam_search_step(
            time=2,
            logits=logits,
            next_cell_state=dummy_cell_state,
            beam_state=beam_state,
            batch_size=ops.convert_to_tensor(self.batch_size),
            beam_width=self.beam_width,
            end_token=self.end_token,
            length_penalty_weight=self.length_penalty_weight)

        with self.test_session() as sess:
            outputs_, next_state_, _, _ = sess.run(
                [outputs, next_beam_state, beam_state, log_probs])

        self.assertEqual(outputs_.predicted_ids[0, 0], 3)
        self.assertEqual(outputs_.predicted_ids[0, 1], 2)
        self.assertEqual(outputs_.predicted_ids[1, 0], 1)
        neg_inf = -np.Inf
        self.assertAllEqual(
            next_state_.log_probs[:, -3:],
            [[neg_inf, neg_inf, neg_inf], [neg_inf, neg_inf, neg_inf]])
        self.assertEqual((next_state_.log_probs[:, :-3] > neg_inf).all(), True)
        self.assertEqual((next_state_.lengths[:, :-3] > 0).all(), True)
        self.assertAllEqual(next_state_.lengths[:, -3:],
                            [[0, 0, 0], [0, 0, 0]])
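# get_probs above initializes the search so that only beam 0 can contribute at the
# first step. A minimal numpy sketch of the same masking; batch and beam sizes are
# assumptions, not the test fixture's values.
import numpy as np

batch_size, beam_width = 2, 3
mask = np.zeros([batch_size, beam_width], dtype=bool)
mask[:, 0] = True                               # one_hot of zeros: only beam 0 is "on"
init_log_probs = np.where(mask, 0.0, -np.inf)   # beam 0 -> log prob 0, the rest -> -inf
# array([[  0., -inf, -inf],
#        [  0., -inf, -inf]])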
Example #31
0
def _beam_search_step(time, logits, beam_state, batch_size, beam_width,
                      end_token, length_penalty_weight):
  """Performs a single step of Beam Search Decoding.

  Args:
    time: Beam search time step, should start at 0. At time 0 we assume
      that all beams are equal and consider only the first beam for
      continuations.
    logits: Logits at the current time step. A tensor of shape
      `[batch_size, beam_width, vocab_size]`.
    beam_state: Current state of the beam search. An instance of
      `BeamSearchDecoderState`.
    batch_size: The batch size for this input.
    beam_width: The size of the beams.
    end_token: The int32 end token.
    length_penalty_weight: Float weight to penalize length. Disabled with 0.0.

  Returns:
    A `(BeamSearchDecoderOutput, BeamSearchDecoderState)` tuple: the chosen beams
    for this step and the new beam state.
  """
  static_batch_size = tensor_util.constant_value(batch_size)

  # Calculate the current lengths of the predictions
  prediction_lengths = beam_state.lengths
  previously_finished = beam_state.finished

  # Calculate the total log probs for the new hypotheses
  # Final Shape: [batch_size, beam_width, vocab_size]
  probs = nn_ops.log_softmax(logits)
  probs = _mask_probs(probs, end_token, previously_finished)
  total_probs = array_ops.expand_dims(beam_state.log_probs, 2) + probs

  # Calculate the continuation lengths by adding to all continuing beams.
  vocab_size = logits.get_shape().as_list()[-1]
  lengths_to_add = array_ops.one_hot(
      array_ops.tile(
          array_ops.reshape(end_token, [1, 1]), [batch_size, beam_width]),
      vocab_size, 0, 1)
  add_mask = (1 - math_ops.to_int32(previously_finished))
  lengths_to_add = array_ops.expand_dims(add_mask, 2) * lengths_to_add
  new_prediction_lengths = array_ops.expand_dims(prediction_lengths,
                                                 2) + lengths_to_add

  # Calculate the scores for each beam
  scores = _get_scores(
      log_probs=total_probs,
      sequence_lengths=new_prediction_lengths,
      length_penalty_weight=length_penalty_weight)

  scores_flat = array_ops.reshape(scores, [batch_size, -1])
  # During the first time step we only consider the initial beam
  scores_flat = control_flow_ops.cond(
      ops.convert_to_tensor(time) > 0, lambda: scores_flat,
      lambda: scores[:, 0])

  # Pick the next beams according to the specified successors function
  next_beam_scores, word_indices = nn_ops.top_k(scores_flat, k=beam_width)
  next_beam_scores.set_shape([static_batch_size, beam_width])
  word_indices.set_shape([static_batch_size, beam_width])

  # Pick out the probs, beam_ids, and states according to the chosen predictions
  next_beam_probs = _tensor_gather_helper(
      gather_indices=word_indices,
      gather_from=total_probs,
      range_input=batch_size,
      range_size=beam_width * vocab_size,
      final_shape=[static_batch_size, beam_width])

  next_word_ids = math_ops.to_int32(word_indices % vocab_size)
  next_beam_ids = math_ops.to_int32(word_indices / vocab_size)

  # Append new ids to current predictions
  previously_finished = _tensor_gather_helper(
      gather_indices=next_beam_ids,
      gather_from=previously_finished,
      range_input=batch_size,
      range_size=beam_width,
      final_shape=[static_batch_size, beam_width])
  next_finished = math_ops.logical_or(previously_finished,
                                      math_ops.equal(next_word_ids, end_token))

  # Calculate the length of the next predictions.
  # 1. Finished beams remain unchanged
  # 2. Beams that are now finished (EOS predicted) remain unchanged
  # 3. Beams that are not yet finished have their length increased by 1
  lengths_to_add = math_ops.to_int32(
      math_ops.not_equal(next_word_ids, end_token))
  lengths_to_add = (1 - math_ops.to_int32(next_finished)) * lengths_to_add
  next_prediction_len = _tensor_gather_helper(
      gather_indices=next_beam_ids,
      gather_from=beam_state.lengths,
      range_input=batch_size,
      range_size=beam_width,
      final_shape=[static_batch_size, beam_width])
  next_prediction_len += lengths_to_add

  next_state = BeamSearchDecoderState(
      cell_state=beam_state.cell_state,
      log_probs=next_beam_probs,
      lengths=next_prediction_len,
      finished=next_finished)

  output = BeamSearchDecoderOutput(
      scores=next_beam_scores,
      predicted_ids=next_word_ids,
      parent_ids=next_beam_ids)

  return output, next_state
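# The length update above can be stated compactly: a beam grows by one only if it did
# not just emit end_token and was not already finished. A minimal numpy sketch of that
# rule; the toy ids and flags below are assumptions, not values from the function above.
import numpy as np

end_token = 0
next_word_ids = np.array([[4, 0, 7]])            # beam 1 emits EOS at this step
next_finished = np.array([[False, True, True]])  # beam 2 had already finished earlier
lengths_to_add = (next_word_ids != end_token).astype(np.int64)
lengths_to_add = (1 - next_finished.astype(np.int64)) * lengths_to_add
# -> [[1, 0, 0]]: only the still-running beam that emitted a normal token gets longer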
 def _entropy(self):
   return -math_ops.reduce_sum(
       nn_ops.log_softmax(self.logits) * self.probs, axis=-1)
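# A quick numeric sanity check (a sketch, not part of the distribution class) that the
# expression above equals the usual categorical entropy -sum(p * log p) with
# p = softmax(logits); the toy logits are an assumption.
import numpy as np

logits = np.array([2.0, 0.5, -1.0])
p = np.exp(logits) / np.exp(logits).sum()        # softmax(logits), i.e. self.probs
log_p = logits - np.log(np.exp(logits).sum())    # log_softmax(logits)
entropy = -np.sum(log_p * p)                     # same value as -sum(p * log p)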
Example #33
0
 def _log_unnormalized_prob(self, counts):
   counts = self._maybe_assert_valid_sample(counts)
   return math_ops.reduce_sum(counts * nn_ops.log_softmax(self.logits), -1)
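# A minimal numpy sketch of the same computation for a multinomial-style distribution:
# the unnormalized log probability of observed counts is counts dotted with
# log_softmax(logits), leaving out the multinomial coefficient. Toy values below are
# assumptions, not taken from the class above.
import numpy as np

logits = np.array([1.0, 2.0, 0.5])
counts = np.array([2.0, 1.0, 0.0])               # e.g. three draws: two of class 0, one of class 1
log_p = logits - np.log(np.exp(logits).sum())    # log_softmax(logits)
log_unnormalized_prob = np.sum(counts * log_p)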