def test_step(self):
  dummy_cell_state = array_ops.zeros([self.batch_size, self.beam_width])
  beam_state = beam_search_decoder.BeamSearchDecoderState(
      cell_state=dummy_cell_state,
      log_probs=nn_ops.log_softmax(
          array_ops.ones([self.batch_size, self.beam_width])),
      lengths=constant_op.constant(
          2, shape=[self.batch_size, self.beam_width], dtype=dtypes.int64),
      finished=array_ops.zeros(
          [self.batch_size, self.beam_width], dtype=dtypes.bool),
      accumulated_attention_probs=())

  logits_ = np.full([self.batch_size, self.beam_width, self.vocab_size],
                    0.0001)
  logits_[0, 0, 2] = 1.9
  logits_[0, 0, 3] = 2.1
  logits_[0, 1, 3] = 3.1
  logits_[0, 1, 4] = 0.9
  logits_[1, 0, 1] = 0.5
  logits_[1, 1, 2] = 2.7
  logits_[1, 2, 2] = 10.0
  logits_[1, 2, 3] = 0.2
  logits = ops.convert_to_tensor(logits_, dtype=dtypes.float32)
  log_probs = nn_ops.log_softmax(logits)

  outputs, next_beam_state = beam_search_decoder._beam_search_step(
      time=2,
      logits=logits,
      next_cell_state=dummy_cell_state,
      beam_state=beam_state,
      batch_size=ops.convert_to_tensor(self.batch_size),
      beam_width=self.beam_width,
      end_token=self.end_token,
      length_penalty_weight=self.length_penalty_weight,
      coverage_penalty_weight=self.coverage_penalty_weight)

  with self.cached_session() as sess:
    outputs_, next_state_, state_, log_probs_ = sess.run(
        [outputs, next_beam_state, beam_state, log_probs])

  self.assertAllEqual(outputs_.predicted_ids, [[3, 3, 2], [2, 2, 1]])
  self.assertAllEqual(outputs_.parent_ids, [[1, 0, 0], [2, 1, 0]])
  self.assertAllEqual(next_state_.lengths, [[3, 3, 3], [3, 3, 3]])
  self.assertAllEqual(next_state_.finished,
                      [[False, False, False], [False, False, False]])

  expected_log_probs = []
  expected_log_probs.append(state_.log_probs[0][[1, 0, 0]])
  expected_log_probs.append(state_.log_probs[1][[2, 1, 0]])  # 0 --> 1
  expected_log_probs[0][0] += log_probs_[0, 1, 3]
  expected_log_probs[0][1] += log_probs_[0, 0, 3]
  expected_log_probs[0][2] += log_probs_[0, 0, 2]
  expected_log_probs[1][0] += log_probs_[1, 2, 2]
  expected_log_probs[1][1] += log_probs_[1, 1, 2]
  expected_log_probs[1][2] += log_probs_[1, 0, 1]
  self.assertAllEqual(next_state_.log_probs, expected_log_probs)
def testLogSoftmaxAxes(self):
  arr = np.linspace(0., 1, 12).reshape(3, 4)
  x_neg_axis = nn_ops.log_softmax(arr, axis=-2)
  y_pos_axis = nn_ops.log_softmax(arr, axis=0)
  z_gt_axis = nn_ops.log_softmax(arr, axis=4)
  x_neg_axis_tf = self.evaluate(x_neg_axis)
  y_pos_axis_tf = self.evaluate(y_pos_axis)
  z_gt_axis_tf = self.evaluate(z_gt_axis)
  eps = 1e-3
  self.assertAllClose(x_neg_axis_tf, y_pos_axis_tf, eps)
  self.assertAllClose(y_pos_axis_tf, z_gt_axis_tf, eps)
def test_step_with_eos(self):
  dummy_cell_state = array_ops.zeros([self.batch_size, self.beam_width])
  beam_state = beam_search_decoder.BeamSearchDecoderState(
      cell_state=dummy_cell_state,
      log_probs=nn_ops.log_softmax(
          array_ops.ones([self.batch_size, self.beam_width])),
      lengths=ops.convert_to_tensor([[2, 1, 2], [2, 2, 1]],
                                    dtype=dtypes.int32),
      finished=ops.convert_to_tensor(
          [[False, True, False], [False, False, True]], dtype=dtypes.bool))

  logits_ = np.full([self.batch_size, self.beam_width, self.vocab_size],
                    0.0001)
  logits_[0, 0, 2] = 1.9
  logits_[0, 0, 3] = 2.1
  logits_[0, 1, 3] = 3.1
  logits_[0, 1, 4] = 0.9
  logits_[1, 0, 1] = 0.5
  logits_[1, 1, 2] = 5.7  # why does this not work when it's 2.7?
  logits_[1, 2, 2] = 1.0
  logits_[1, 2, 3] = 0.2
  logits = ops.convert_to_tensor(logits_, dtype=dtypes.float32)
  log_probs = nn_ops.log_softmax(logits)

  outputs, next_beam_state = beam_search_decoder._beam_search_step(
      time=2,
      logits=logits,
      beam_state=beam_state,
      batch_size=ops.convert_to_tensor(self.batch_size),
      beam_width=self.beam_width,
      end_token=self.end_token,
      length_penalty_weight=self.length_penalty_weight)

  with self.test_session() as sess:
    outputs_, next_state_, state_, log_probs_ = sess.run(
        [outputs, next_beam_state, beam_state, log_probs])

  np.testing.assert_array_equal(outputs_.parent_ids, [[1, 0, 0], [1, 2, 0]])
  np.testing.assert_array_equal(outputs_.predicted_ids,
                                [[0, 3, 2], [2, 0, 1]])
  np.testing.assert_array_equal(next_state_.lengths, [[1, 3, 3], [3, 1, 3]])
  np.testing.assert_array_equal(next_state_.finished,
                                [[True, False, False], [False, True, False]])

  expected_log_probs = []
  expected_log_probs.append(state_.log_probs[0][[1, 0, 0]])
  expected_log_probs.append(state_.log_probs[1][[1, 2, 0]])
  expected_log_probs[0][1] += log_probs_[0, 0, 3]
  expected_log_probs[0][2] += log_probs_[0, 0, 2]
  expected_log_probs[1][0] += log_probs_[1, 1, 2]
  expected_log_probs[1][2] += log_probs_[1, 0, 1]
  np.testing.assert_array_equal(next_state_.log_probs, expected_log_probs)
def _log_prob(self, x):
  x = self._assert_valid_sample(x)
  # broadcast logits or x if need be.
  logits = self.logits
  if (not x.get_shape().is_fully_defined() or
      not logits.get_shape().is_fully_defined() or
      x.get_shape() != logits.get_shape()):
    logits = array_ops.ones_like(x, dtype=logits.dtype) * logits
    x = array_ops.ones_like(logits, dtype=x.dtype) * x

  logits_shape = array_ops.shape(logits)
  if logits.get_shape().ndims == 2:
    logits_2d = logits
    x_2d = x
  else:
    logits_2d = array_ops.reshape(logits, [-1, self.event_size])
    x_2d = array_ops.reshape(x, [-1, self.event_size])
  # compute the normalization constant
  log_norm_const = (math_ops.lgamma(self.event_size)
                    + (self.event_size - 1) * math_ops.log(self.temperature))
  # compute the unnormalized density
  log_softmax = nn_ops.log_softmax(logits_2d - x_2d * self.temperature)
  log_unnorm_prob = math_ops.reduce_sum(log_softmax, [-1], keep_dims=False)
  # combine unnormalized density with normalization constant
  log_prob = log_norm_const + log_unnorm_prob
  ret = array_ops.reshape(log_prob, logits_shape)
  return ret
def _log_prob(self, x):
  x = self._assert_valid_sample(x)
  # broadcast logits or x if need be.
  logits = self.logits
  if (not x.get_shape().is_fully_defined() or
      not logits.get_shape().is_fully_defined() or
      x.get_shape() != logits.get_shape()):
    logits = array_ops.ones_like(x, dtype=logits.dtype) * logits
    x = array_ops.ones_like(logits, dtype=x.dtype) * x
  logits_shape = array_ops.shape(math_ops.reduce_sum(logits, axis=[-1]))
  logits_2d = array_ops.reshape(logits, [-1, self.event_size])
  x_2d = array_ops.reshape(x, [-1, self.event_size])
  # compute the normalization constant
  k = math_ops.cast(self.event_size, x.dtype)
  log_norm_const = (math_ops.lgamma(k)
                    + (k - 1.) * math_ops.log(self.temperature))
  # compute the unnormalized density
  log_softmax = nn_ops.log_softmax(logits_2d - x_2d * self._temperature_2d)
  log_unnorm_prob = math_ops.reduce_sum(log_softmax, [-1], keepdims=False)
  # combine unnormalized density with normalization constant
  log_prob = log_norm_const + log_unnorm_prob
  # Reshapes log_prob to be consistent with shape of user-supplied logits
  ret = array_ops.reshape(log_prob, logits_shape)
  return ret
def _kl_divergence(p, p_logits, q):
  """Computes the Kullback-Leibler divergence between p and q.

  This function uses p's logits in some places to improve numerical stability.

  Specifically:

  KL(p || q) = sum[ p * log(p / q) ]
    = sum[ p * ( log(p)                - log(q) ) ]
    = sum[ p * ( log_softmax(p_logits) - log(q) ) ]

  Args:
    p: A 2-D floating-point Tensor p_ij, where `i` corresponds to the
      minibatch example and `j` corresponds to the probability of being in
      class `j`.
    p_logits: A 2-D floating-point Tensor corresponding to logits for `p`.
    q: A 1-D floating-point Tensor, where q_j corresponds to the probability
      of class `j`.

  Returns:
    KL divergence between two distributions. Output dimension is 1D, one
    entry per distribution in `p`.

  Raises:
    ValueError: If any of the inputs aren't floating-point.
    ValueError: If p or p_logits aren't 2D.
    ValueError: If q isn't 1D.
  """
  for tensor in [p, p_logits, q]:
    if not tensor.dtype.is_floating:
      raise ValueError('Input %s must be floating type.' % tensor.name)
  p.shape.assert_has_rank(2)
  p_logits.shape.assert_has_rank(2)
  q.shape.assert_has_rank(1)
  return math_ops.reduce_sum(
      p * (nn_ops.log_softmax(p_logits) - math_ops.log(q)), axis=1)
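# --- Illustrative addition (not from the original source) ---
# A minimal NumPy check of the identity the docstring above relies on,
# KL(p || q) = sum_j p_j * (log_softmax(p_logits)_j - log q_j).
# All names below are local to this sketch.
import numpy as np

def _np_log_softmax(z):
  z = z - z.max(axis=-1, keepdims=True)  # shift for numerical stability
  return z - np.log(np.exp(z).sum(axis=-1, keepdims=True))

p_logits_np = np.array([[0.5, 1.0, -2.0], [3.0, 0.0, 0.0]])
p_np = np.exp(_np_log_softmax(p_logits_np))  # rows sum to 1
q_np = np.array([0.2, 0.5, 0.3])             # fixed reference distribution

kl_via_logits = (p_np * (_np_log_softmax(p_logits_np)
                         - np.log(q_np))).sum(axis=1)
kl_direct = (p_np * (np.log(p_np) - np.log(q_np))).sum(axis=1)
assert np.allclose(kl_via_logits, kl_direct)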
def _log_cdf(self, x):
  x = self._pad_sample_dims(x)
  log_cdf_x = self.components_distribution.log_cdf(x)  # [S, B, k]
  log_mix_prob = nn_ops.log_softmax(
      self.mixture_distribution.logits, axis=-1)  # [B, k]
  return math_ops.reduce_logsumexp(
      log_cdf_x + log_mix_prob, axis=-1)  # [S, B]
def _SoftmaxCrossEntropyWithLogitsGrad(op, grad_loss, grad_grad):
  """Gradient function for SoftmaxCrossEntropyWithLogits."""
  # grad_loss is the backprop for cost, and we multiply it with the gradients
  # (which is output[1])
  # grad_grad is the backprop for softmax gradient.
  #
  # Second derivative is just softmax derivative w.r.t. logits.
  softmax_grad = op.outputs[1]
  grad = _BroadcastMul(grad_loss, softmax_grad)

  def IsZero(g):
    # Some introspection to check if the gradient is feeding zeros
    if context.executing_eagerly():
      # TODO(apassos) add an efficient way to detect eager zeros here.
      return False
    if g.op.type in ("ZerosLike", "Zeros"):
      return True
    const_fill_value = tensor_util.constant_value(g)
    return const_fill_value is not None and (const_fill_value == 0).all()

  logits = op.inputs[0]
  if grad_grad is not None and not IsZero(grad_grad):
    softmax = nn_ops.softmax(logits)

    grad += ((grad_grad - array_ops.squeeze(
        math_ops.matmul(
            array_ops.expand_dims(grad_grad, 1),
            array_ops.expand_dims(softmax, 2)),
        axis=1)) * softmax)

  return grad, _BroadcastMul(grad_loss, -nn_ops.log_softmax(logits))
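# --- Illustrative addition (not from the original source) ---
# The correction term above is the softmax Jacobian, diag(s) - s s^T, applied
# to grad_grad. A small NumPy check of that identity (toy values only):
import numpy as np

logits_np = np.array([0.5, 1.5, -1.0])
s = np.exp(logits_np - logits_np.max())
s /= s.sum()                                  # softmax of the logits
v = np.array([0.3, -0.2, 0.7])                # stand-in for grad_grad

via_code_form = (v - v.dot(s)) * s            # mirrors the matmul/squeeze form
via_jacobian = (np.diag(s) - np.outer(s, s)).dot(v)
assert np.allclose(via_code_form, via_jacobian)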
def _sample_n(self, n, seed=None):
  sample_shape = array_ops.concat(([n], array_ops.shape(self.logits)), 0)
  logits = self.logits * array_ops.ones(sample_shape)
  logits_2d = array_ops.reshape(logits, [-1, self.event_size])
  np_dtype = self.dtype.as_numpy_dtype

  # Uniform variates must be sampled from the interval (0, 1] rather than
  # [0, 1], as they are passed through log() to compute Gumbel variates.
  # We need to use np.finfo(np_dtype).tiny because it is the smallest,
  # positive, "normal" number. A "normal" number is such that the mantissa
  # has an implicit leading 1. Normal, positive numbers x, y have the
  # reasonable property that: x + y >= max(x, y).
  # minval=np.nextafter(np.float32(0), 1) can cause
  # tf.random_uniform(dtype=tf.float32) to sample 0.
  uniform = random_ops.random_uniform(
      shape=array_ops.shape(logits_2d),
      minval=np.finfo(np_dtype).tiny,
      maxval=1,
      dtype=self.dtype,
      seed=seed)
  gumbel = -math_ops.log(-math_ops.log(uniform))
  noisy_logits = math_ops.div(gumbel + logits_2d, self._temperature_2d)
  samples = nn_ops.log_softmax(noisy_logits)
  ret = array_ops.reshape(samples, sample_shape)
  return ret
def testEntropyGradient(self):
  with self.cached_session() as sess:
    logits = constant_op.constant([[1., 2., 3.], [2., 5., 1.]])

    probabilities = nn_ops.softmax(logits)
    log_probabilities = nn_ops.log_softmax(logits)
    true_entropy = -math_ops.reduce_sum(
        probabilities * log_probabilities, axis=-1)

    categorical_distribution = categorical.Categorical(probs=probabilities)
    categorical_entropy = categorical_distribution.entropy()

    # works
    true_entropy_g = gradients_impl.gradients(true_entropy, [logits])
    categorical_entropy_g = gradients_impl.gradients(
        categorical_entropy, [logits])

    res = sess.run({"true_entropy": true_entropy,
                    "categorical_entropy": categorical_entropy,
                    "true_entropy_g": true_entropy_g,
                    "categorical_entropy_g": categorical_entropy_g})
    self.assertAllClose(res["true_entropy"], res["categorical_entropy"])
    self.assertAllClose(res["true_entropy_g"], res["categorical_entropy_g"])
def _log_prob(self, x):
  with ops.control_dependencies(self._runtime_assertions):
    x = self._pad_sample_dims(x)
    log_prob_x = self.components_distribution.log_prob(x)  # [S, B, k]
    log_mix_prob = nn_ops.log_softmax(
        self.mixture_distribution.logits, axis=-1)  # [B, k]
    return math_ops.reduce_logsumexp(
        log_prob_x + log_mix_prob, axis=-1)  # [S, B]
def testGradient(self, x_shape):
  x_np = np.random.randn(*x_shape).astype(np.float64)
  with self.cached_session():
    x_tf = constant_op.constant(x_np)
    y_tf = nn_ops.log_softmax(x_tf)
    err = gradient_checker.compute_gradient_error(x_tf, x_shape, y_tf,
                                                  x_shape)
  eps = 1e-7
  self.assertLess(err, eps)
def testLogSoftmax(self):
  x_shape = [5, 10]
  x_np = np.random.randn(*x_shape).astype(np.float32)
  y_np = self._log_softmax(x_np)
  x_tf = constant_op.constant(x_np)
  y_tf = nn_ops.log_softmax(x_tf)
  y_tf_np = self.evaluate(y_tf)
  eps = 1e-3
  self.assertAllClose(y_tf_np, y_np, eps)
def _kl_categorical_categorical(a, b, name=None):
  """Calculate the batched KL divergence KL(a || b) with a, b OneHotCategorical.

  Args:
    a: instance of a OneHotCategorical distribution object.
    b: instance of a OneHotCategorical distribution object.
    name: (optional) Name to use for created operations.
      default is "kl_categorical_categorical".

  Returns:
    Batchwise KL(a || b)
  """
  with ops.name_scope(name, "kl_categorical_categorical",
                      [a.logits, b.logits]):
    # sum(p * ln(p / q))
    return math_ops.reduce_sum(
        nn_ops.softmax(a.logits) * (nn_ops.log_softmax(a.logits)
                                    - nn_ops.log_softmax(b.logits)),
        reduction_indices=[-1])
def _kl_categorical_categorical(a, b, name=None):
  """Calculate the batched KL divergence KL(a || b) with a and b Categorical.

  Args:
    a: instance of a Categorical distribution object.
    b: instance of a Categorical distribution object.
    name: (optional) Name to use for created operations.
      default is "kl_categorical_categorical".

  Returns:
    Batchwise KL(a || b)
  """
  with ops.name_scope(name, "kl_categorical_categorical",
                      values=[a.logits, b.logits]):
    # sum(p * log(p / q))
    delta_log_probs1 = (nn_ops.log_softmax(a.logits)
                        - nn_ops.log_softmax(b.logits))
    return math_ops.reduce_sum(nn_ops.softmax(a.logits) * delta_log_probs1,
                               axis=-1)
def ctc_loss_and_grad(logits, labels, label_length, logit_length, unique=None):
  """Computes the CTC loss and gradients.

  Most users will want fwd_bwd.ctc_loss

  This function returns the computed gradient, it does not have a gradient
  of its own defined.

  Args:
    logits: tensor of shape [frames, batch_size, num_labels]
    labels: tensor of shape [batch_size, max_label_seq_length]
    label_length: tensor of shape [batch_size]
      Length of reference label sequence in labels.
    logit_length: tensor of shape [batch_size]
      Length of input sequence in logits.
    unique: (optional) unique label indices as computed by unique(labels)
      If supplied, enables an implementation that is faster and more memory
      efficient on TPU.

  Returns:
    loss: tensor of shape [batch_size]
    gradient: tensor of shape [frames, batch_size, num_labels]
  """
  num_labels = _get_dim(logits, 2)
  max_label_seq_length = _get_dim(labels, 1)

  ilabel_log_probs = nn_ops.log_softmax(logits)
  state_log_probs = _ilabel_to_state(labels, num_labels, ilabel_log_probs)
  state_trans_probs = _ctc_state_trans(labels)
  initial_state_log_probs, final_state_log_probs = ctc_state_log_probs(
      label_length, max_label_seq_length)
  fwd_bwd_log_probs, log_likelihood = _forward_backward_log(
      state_trans_log_probs=math_ops.log(state_trans_probs),
      initial_state_log_probs=initial_state_log_probs,
      final_state_log_probs=final_state_log_probs,
      observed_log_probs=state_log_probs,
      sequence_length=logit_length)

  if unique:
    olabel_log_probs = _state_to_olabel_unique(labels, num_labels,
                                               fwd_bwd_log_probs, unique)
  else:
    olabel_log_probs = _state_to_olabel(labels, num_labels, fwd_bwd_log_probs)

  grad = math_ops.exp(ilabel_log_probs) - math_ops.exp(olabel_log_probs)
  loss = -log_likelihood

  return loss, grad
def _testOverflow(self, use_gpu=False):
  if use_gpu:
    type = np.float32
  else:
    type = np.float64
  max = np.finfo(type).max
  features = np.array([[1.0, 1.0, 1.0, 1.0],
                       [max, 1.0, 2.0, 3.0]]).astype(type)
  with self.test_session(use_gpu=use_gpu):
    tf_log_softmax = nn_ops.log_softmax(features)
    out = tf_log_softmax.eval()
  self.assertAllClose(
      np.array([[-1.386294, -1.386294, -1.386294, -1.386294],
                [0, -max, -max, -max]]),
      out,
      rtol=1.0e-5,
      atol=1.0e-5)
def _testOverflow(self, use_gpu=False):
  if use_gpu:
    type = np.float32  # pylint: disable=redefined-builtin
  else:
    type = np.float64  # pylint: disable=redefined-builtin
  max = np.finfo(type).max  # pylint: disable=redefined-builtin
  features = np.array([[1., 1., 1., 1.], [max, 1., 2., 3.]]).astype(type)
  with self.test_session(use_gpu=use_gpu):
    tf_log_softmax = nn_ops.log_softmax(features)
    out = tf_log_softmax.eval()
  self.assertAllClose(
      np.array([[-1.386294, -1.386294, -1.386294, -1.386294],
                [0, -max, -max, -max]]),
      out,
      rtol=1.e-5,
      atol=1.e-5)
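# --- Illustrative addition (not from the original source) ---
# Why the overflow tests above expect finite values: log_softmax can be
# computed as (x - m) - log(sum(exp(x - m))) with m = max(x), so even a row
# containing np.finfo(dtype).max stays finite, whereas exponentiating the raw
# inputs would overflow. A NumPy sketch of that shifted formulation:
import numpy as np

def _stable_log_softmax(x):
  m = x.max(axis=-1, keepdims=True)
  shifted = x - m
  return shifted - np.log(np.exp(shifted).sum(axis=-1, keepdims=True))

big = np.finfo(np.float64).max
row = np.array([[big, 1.0, 2.0, 3.0]])
print(_stable_log_softmax(row))  # approximately [0, -max, -max, -max]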
def _sample_n(self, n, seed=None):
  sample_shape = array_ops.concat_v2(([n], array_ops.shape(self.logits)), 0)
  logits = self.logits * array_ops.ones(sample_shape)
  if logits.get_shape().ndims == 2:
    logits_2d = logits
  else:
    logits_2d = array_ops.reshape(logits, [-1, self.num_classes])
  np_dtype = self.dtype.as_numpy_dtype()
  minval = np.nextafter(np_dtype(0), np_dtype(1))
  uniform = random_ops.random_uniform(
      shape=array_ops.shape(logits_2d),
      minval=minval,
      maxval=1,
      dtype=self.dtype,
      seed=seed)
  gumbel = -math_ops.log(-math_ops.log(uniform))
  noisy_logits = math_ops.div(gumbel + logits_2d, self.temperature)
  samples = nn_ops.log_softmax(noisy_logits)
  ret = array_ops.reshape(samples, sample_shape)
  return ret
def _testSoftmax(self, np_features, dim=-1, log=False, use_gpu=False):
  # A previous version of the code checked the op name rather than the op
  # type to distinguish between log and non-log. Use an arbitrary name to
  # catch this bug in future.
  name = "arbitrary"
  np_softmax = self._npSoftmax(np_features, dim=dim, log=log)
  with self.test_session(use_gpu=use_gpu):
    if log:
      tf_softmax = nn_ops.log_softmax(np_features, dim=dim, name=name)
    else:
      tf_softmax = nn_ops.softmax(np_features, dim=dim, name=name)
    out = tf_softmax.eval()
  self.assertAllCloseAccordingToType(np_softmax, out)
  self.assertShapeEqual(np_softmax, tf_softmax)
  if not log:
    # Bonus check: the softmaxes should add to one in dimension dim.
    sum_along_dim = np.sum(out, axis=dim)
    self.assertAllCloseAccordingToType(
        np.ones(sum_along_dim.shape), sum_along_dim)
def _sample_n(self, n, seed=None):
  sample_shape = array_ops.concat([[n], array_ops.shape(self.logits)], 0)
  logits = self.logits * array_ops.ones(sample_shape, dtype=self.dtype)
  logits_2d = array_ops.reshape(logits, [-1, self.event_size])
  # Uniform variates must be sampled from the open-interval `(0, 1)` rather
  # than `[0, 1)`. To do so, we use
  # `np.finfo(self.dtype.as_numpy_dtype).tiny` because it is the smallest,
  # positive, "normal" number. A "normal" number is such that the mantissa
  # has an implicit leading 1. Normal, positive numbers x, y have the
  # reasonable property that, `x + y >= max(x, y)`. In this case, a subnormal
  # number (i.e., np.nextafter) can cause us to sample 0.
  uniform = random_ops.random_uniform(
      shape=array_ops.shape(logits_2d),
      minval=np.finfo(self.dtype.as_numpy_dtype).tiny,
      maxval=1.,
      dtype=self.dtype,
      seed=seed)
  gumbel = -math_ops.log(-math_ops.log(uniform))
  noisy_logits = math_ops.div(gumbel + logits_2d, self._temperature_2d)
  samples = nn_ops.log_softmax(noisy_logits)
  ret = array_ops.reshape(samples, sample_shape)
  return ret
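# --- Illustrative addition (not from the original source) ---
# The same sampling recipe in NumPy: draw uniforms from (0, 1], turn them into
# Gumbel noise, add the logits, divide by the temperature, and log_softmax the
# result. The function name and arguments here are invented for this sketch.
import numpy as np

def np_exp_concrete_sample(logits, temperature, n, rng):
  k = logits.shape[-1]
  # tiny is the smallest positive normal float, so log(u) is finite and the
  # Gumbel variate -log(-log(u)) is well defined.
  tiny = np.finfo(np.float64).tiny
  u = rng.uniform(low=tiny, high=1.0, size=(n, k))
  gumbel = -np.log(-np.log(u))
  z = (gumbel + logits) / temperature
  m = z.max(axis=-1, keepdims=True)
  return (z - m) - np.log(np.exp(z - m).sum(axis=-1, keepdims=True))

samples = np_exp_concrete_sample(np.array([0., 1., 2.]), 0.5, 4,
                                 np.random.default_rng(0))
assert np.allclose(np.exp(samples).sum(axis=-1), 1.0)  # rows are log-probs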
def _beam_search_step(time, logits, next_cell_state, beam_state, batch_size,
                      beam_width, end_token, length_penalty_weight):
  """Performs a single step of Beam Search Decoding.

  Args:
    time: Beam search time step, should start at 0. At time 0 we assume
      that all beams are equal and consider only the first beam for
      continuations.
    logits: Logits at the current time step. A tensor of shape
      `[batch_size, beam_width, vocab_size]`
    next_cell_state: The next state from the cell, e.g. an instance of
      AttentionWrapperState if the cell is attentional.
    beam_state: Current state of the beam search.
      An instance of `BeamSearchDecoderState`.
    batch_size: The batch size for this input.
    beam_width: Python int. The size of the beams.
    end_token: The int32 end token.
    length_penalty_weight: Float weight to penalize length. Disabled with 0.0.

  Returns:
    A new beam state.
  """
  static_batch_size = tensor_util.constant_value(batch_size)

  # Calculate the current lengths of the predictions
  prediction_lengths = beam_state.lengths
  previously_finished = beam_state.finished

  # Calculate the total log probs for the new hypotheses
  # Final Shape: [batch_size, beam_width, vocab_size]
  step_log_probs = nn_ops.log_softmax(logits)
  step_log_probs = _mask_probs(step_log_probs, end_token, previously_finished)
  total_probs = array_ops.expand_dims(beam_state.log_probs, 2) + step_log_probs

  # Calculate the continuation lengths by adding to all continuing beams.
  vocab_size = logits.shape[-1].value or array_ops.shape(logits)[-1]
  lengths_to_add = array_ops.one_hot(
      indices=array_ops.fill([batch_size, beam_width], end_token),
      depth=vocab_size,
      on_value=np.int64(0),
      off_value=np.int64(1),
      dtype=dtypes.int64)
  add_mask = math_ops.to_int64(math_ops.logical_not(previously_finished))
  lengths_to_add *= array_ops.expand_dims(add_mask, 2)
  new_prediction_lengths = (
      lengths_to_add + array_ops.expand_dims(prediction_lengths, 2))

  # Calculate the scores for each beam
  scores = _get_scores(
      log_probs=total_probs,
      sequence_lengths=new_prediction_lengths,
      length_penalty_weight=length_penalty_weight)

  time = ops.convert_to_tensor(time, name="time")
  # During the first time step we only consider the initial beam
  scores_flat = array_ops.reshape(scores, [batch_size, -1])

  # Pick the next beams according to the specified successors function
  next_beam_size = ops.convert_to_tensor(
      beam_width, dtype=dtypes.int32, name="beam_width")
  next_beam_scores, word_indices = nn_ops.top_k(scores_flat, k=next_beam_size)
  next_beam_scores.set_shape([static_batch_size, beam_width])
  word_indices.set_shape([static_batch_size, beam_width])

  # Pick out the probs, beam_ids, and states according to the chosen
  # predictions
  next_beam_probs = _tensor_gather_helper(
      gather_indices=word_indices,
      gather_from=total_probs,
      batch_size=batch_size,
      range_size=beam_width * vocab_size,
      gather_shape=[-1],
      name="next_beam_probs")
  # Note: just doing the following
  #   math_ops.to_int32(word_indices % vocab_size,
  #       name="next_beam_word_ids")
  # would be a lot cleaner but for reasons unclear, that hides the results of
  # the op which prevents capturing it with tfdbg debug ops.
  raw_next_word_ids = math_ops.mod(
      word_indices, vocab_size, name="next_beam_word_ids")
  next_word_ids = math_ops.to_int32(raw_next_word_ids)
  next_beam_ids = math_ops.to_int32(
      word_indices / vocab_size, name="next_beam_parent_ids")

  # Append new ids to current predictions
  previously_finished = _tensor_gather_helper(
      gather_indices=next_beam_ids,
      gather_from=previously_finished,
      batch_size=batch_size,
      range_size=beam_width,
      gather_shape=[-1])
  next_finished = math_ops.logical_or(
      previously_finished,
      math_ops.equal(next_word_ids, end_token),
      name="next_beam_finished")

  # Calculate the length of the next predictions.
  # 1. Finished beams remain unchanged.
  # 2. Beams that are now finished (EOS predicted) have their length
  #    increased by 1.
  # 3. Beams that are not yet finished have their length increased by 1.
  lengths_to_add = math_ops.to_int64(math_ops.logical_not(previously_finished))
  next_prediction_len = _tensor_gather_helper(
      gather_indices=next_beam_ids,
      gather_from=beam_state.lengths,
      batch_size=batch_size,
      range_size=beam_width,
      gather_shape=[-1])
  next_prediction_len += lengths_to_add

  # Pick out the cell_states according to the next_beam_ids. We use a
  # different gather_shape here because the cell_state tensors, i.e.
  # the tensors that would be gathered from, all have dimension
  # greater than two and we need to preserve those dimensions.
  # pylint: disable=g-long-lambda
  next_cell_state = nest.map_structure(
      lambda gather_from: _maybe_tensor_gather_helper(
          gather_indices=next_beam_ids,
          gather_from=gather_from,
          batch_size=batch_size,
          range_size=beam_width,
          gather_shape=[batch_size * beam_width, -1]),
      next_cell_state)
  # pylint: enable=g-long-lambda

  next_state = BeamSearchDecoderState(
      cell_state=next_cell_state,
      log_probs=next_beam_probs,
      lengths=next_prediction_len,
      finished=next_finished)

  output = BeamSearchDecoderOutput(
      scores=next_beam_scores,
      predicted_ids=next_word_ids,
      parent_ids=next_beam_ids)

  return output, next_state
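# --- Illustrative addition (not from the original source) ---
# How the mod/div step above splits a flattened top-k index over the
# [beam_width * vocab_size] axis into a word id and a parent beam id
# (toy sizes, plain NumPy, no decoder state involved):
import numpy as np

beam_width, vocab_size = 3, 5
scores_flat = np.arange(beam_width * vocab_size, dtype=np.float32)
word_indices = np.argsort(scores_flat)[::-1][:beam_width]  # top-k indices

next_word_ids = word_indices % vocab_size    # token chosen within the beam
next_beam_ids = word_indices // vocab_size   # parent beam it came from
print(next_word_ids, next_beam_ids)          # [4 3 2] and [2 2 2] here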
def test_step(self):

  def get_probs():
    """this simulates the initialize method in BeamSearchDecoder."""
    log_prob_mask = array_ops.one_hot(
        array_ops.zeros([self.batch_size], dtype=dtypes.int32),
        depth=self.beam_width,
        on_value=True,
        off_value=False,
        dtype=dtypes.bool)

    log_prob_zeros = array_ops.zeros(
        [self.batch_size, self.beam_width], dtype=dtypes.float32)
    log_prob_neg_inf = array_ops.ones(
        [self.batch_size, self.beam_width], dtype=dtypes.float32) * -np.Inf

    log_probs = array_ops.where(log_prob_mask, log_prob_zeros,
                                log_prob_neg_inf)
    return log_probs

  log_probs = get_probs()
  dummy_cell_state = array_ops.zeros([self.batch_size, self.beam_width])

  # pylint: disable=invalid-name
  _finished = array_ops.one_hot(
      array_ops.zeros([self.batch_size], dtype=dtypes.int32),
      depth=self.beam_width,
      on_value=False,
      off_value=True,
      dtype=dtypes.bool)
  _lengths = np.zeros([self.batch_size, self.beam_width], dtype=np.int64)
  _lengths[:, 0] = 2
  _lengths = constant_op.constant(_lengths, dtype=dtypes.int64)

  beam_state = beam_search_decoder.BeamSearchDecoderState(
      cell_state=dummy_cell_state,
      log_probs=log_probs,
      lengths=_lengths,
      finished=_finished,
      accumulated_attention_probs=())

  logits_ = np.full([self.batch_size, self.beam_width, self.vocab_size],
                    0.0001)
  logits_[0, 0, 2] = 1.9
  logits_[0, 0, 3] = 2.1
  logits_[0, 1, 3] = 3.1
  logits_[0, 1, 4] = 0.9
  logits_[1, 0, 1] = 0.5
  logits_[1, 1, 2] = 2.7
  logits_[1, 2, 2] = 10.0
  logits_[1, 2, 3] = 0.2
  logits = constant_op.constant(logits_, dtype=dtypes.float32)
  log_probs = nn_ops.log_softmax(logits)

  outputs, next_beam_state = beam_search_decoder._beam_search_step(
      time=2,
      logits=logits,
      next_cell_state=dummy_cell_state,
      beam_state=beam_state,
      batch_size=ops.convert_to_tensor(self.batch_size),
      beam_width=self.beam_width,
      end_token=self.end_token,
      length_penalty_weight=self.length_penalty_weight,
      coverage_penalty_weight=self.coverage_penalty_weight)

  with self.cached_session() as sess:
    outputs_, next_state_, _, _ = sess.run(
        [outputs, next_beam_state, beam_state, log_probs])

  self.assertEqual(outputs_.predicted_ids[0, 0], 3)
  self.assertEqual(outputs_.predicted_ids[0, 1], 2)
  self.assertEqual(outputs_.predicted_ids[1, 0], 1)
  neg_inf = -np.Inf
  self.assertAllEqual(
      next_state_.log_probs[:, -3:],
      [[neg_inf, neg_inf, neg_inf], [neg_inf, neg_inf, neg_inf]])
  self.assertEqual((next_state_.log_probs[:, :-3] > neg_inf).all(), True)
  self.assertEqual((next_state_.lengths[:, :-3] > 0).all(), True)
  self.assertAllEqual(next_state_.lengths[:, -3:], [[0, 0, 0], [0, 0, 0]])
def _entropy(self):
  return -math_ops.reduce_sum(
      nn_ops.log_softmax(self.logits) * self.probs, axis=-1)
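# --- Illustrative addition (not from the original source) ---
# The entropy above is H(p) = -sum_j p_j * log p_j with p = softmax(logits);
# quick NumPy check of that identity on arbitrary logits:
import numpy as np

logits_np = np.array([1.0, 2.0, 3.0])
shifted = logits_np - logits_np.max()
log_p = shifted - np.log(np.exp(shifted).sum())
p = np.exp(log_p)
assert np.isclose(-(p * log_p).sum(), -(p * np.log(p)).sum())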
def _beam_search_step(time, logits, beam_state, batch_size, beam_width,
                      end_token, length_penalty_weight):
  """Performs a single step of Beam Search Decoding.

  Args:
    time: Beam search time step, should start at 0. At time 0 we assume
      that all beams are equal and consider only the first beam for
      continuations.
    logits: Logits at the current time step. A tensor of shape
      `[batch_size, beam_width, vocab_size]`
    beam_state: Current state of the beam search.
      An instance of `BeamSearchDecoderState`.
    batch_size: The batch size for this input.
    beam_width: Python int. The size of the beams.
    end_token: The int32 end token.
    length_penalty_weight: Float weight to penalize length. Disabled with 0.0.

  Returns:
    A new beam state.
  """
  static_batch_size = tensor_util.constant_value(batch_size)

  # Calculate the current lengths of the predictions
  prediction_lengths = beam_state.lengths
  previously_finished = beam_state.finished

  # Calculate the total log probs for the new hypotheses
  # Final Shape: [batch_size, beam_width, vocab_size]
  step_log_probs = nn_ops.log_softmax(logits)
  step_log_probs = _mask_probs(step_log_probs, end_token, previously_finished)
  total_probs = array_ops.expand_dims(beam_state.log_probs, 2) + step_log_probs

  # Calculate the continuation lengths by adding to all continuing beams.
  vocab_size = logits.shape[-1].value
  lengths_to_add = array_ops.one_hot(
      indices=array_ops.tile(
          array_ops.reshape(end_token, [1, 1]), [batch_size, beam_width]),
      depth=vocab_size,
      on_value=0,
      off_value=1)
  add_mask = (1 - math_ops.to_int32(previously_finished))
  lengths_to_add = array_ops.expand_dims(add_mask, 2) * lengths_to_add
  new_prediction_lengths = (
      lengths_to_add + array_ops.expand_dims(prediction_lengths, 2))

  # Calculate the scores for each beam
  scores = _get_scores(
      log_probs=total_probs,
      sequence_lengths=new_prediction_lengths,
      length_penalty_weight=length_penalty_weight)

  time = ops.convert_to_tensor(time, name="time")
  # During the first time step we only consider the initial beam
  scores_flat = control_flow_ops.cond(
      time > 0,
      lambda: array_ops.reshape(scores, [batch_size, -1]),
      lambda: scores[:, 0])

  # Pick the next beams according to the specified successors function
  next_beam_scores, word_indices = nn_ops.top_k(scores_flat, k=beam_width)
  next_beam_scores.set_shape([static_batch_size, beam_width])
  word_indices.set_shape([static_batch_size, beam_width])

  # Pick out the probs, beam_ids, and states according to the chosen
  # predictions
  next_beam_probs = _tensor_gather_helper(
      gather_indices=word_indices,
      gather_from=total_probs,
      range_input=batch_size,
      range_size=beam_width * vocab_size,
      final_shape=[static_batch_size, beam_width])
  next_word_ids = math_ops.to_int32(word_indices % vocab_size)
  next_beam_ids = math_ops.to_int32(word_indices / vocab_size)

  # Append new ids to current predictions
  previously_finished = _tensor_gather_helper(
      gather_indices=next_beam_ids,
      gather_from=previously_finished,
      range_input=batch_size,
      range_size=beam_width,
      final_shape=[static_batch_size, beam_width])
  next_finished = math_ops.logical_or(previously_finished,
                                      math_ops.equal(next_word_ids, end_token))

  # Calculate the length of the next predictions.
  # 1. Finished beams remain unchanged
  # 2. Beams that are now finished (EOS predicted) remain unchanged
  # 3. Beams that are not yet finished have their length increased by 1
  lengths_to_add = math_ops.to_int32(
      math_ops.not_equal(next_word_ids, end_token))
  lengths_to_add = (1 - math_ops.to_int32(next_finished)) * lengths_to_add
  next_prediction_len = _tensor_gather_helper(
      gather_indices=next_beam_ids,
      gather_from=beam_state.lengths,
      range_input=batch_size,
      range_size=beam_width,
      final_shape=[static_batch_size, beam_width])
  next_prediction_len += lengths_to_add

  next_state = BeamSearchDecoderState(
      cell_state=beam_state.cell_state,
      log_probs=next_beam_probs,
      lengths=next_prediction_len,
      finished=next_finished)

  output = BeamSearchDecoderOutput(
      scores=next_beam_scores,
      predicted_ids=next_word_ids,
      parent_ids=next_beam_ids)

  return output, next_state
def _beam_search_step(time, logits, next_cell_state, beam_state, batch_size,
                      beam_width, end_token, length_penalty_weight):
  """Performs a single step of Beam Search Decoding.

  Args:
    time: Beam search time step, should start at 0. At time 0 we assume
      that all beams are equal and consider only the first beam for
      continuations.
    logits: Logits at the current time step. A tensor of shape
      `[batch_size, beam_width, vocab_size]`
    next_cell_state: The next state from the cell, e.g. an instance of
      AttentionWrapperState if the cell is attentional.
    beam_state: Current state of the beam search.
      An instance of `BeamSearchDecoderState`.
    batch_size: The batch size for this input.
    beam_width: Python int. The size of the beams.
    end_token: The int32 end token.
    length_penalty_weight: Float weight to penalize length. Disabled with 0.0.

  Returns:
    A new beam state.
  """
  static_batch_size = tensor_util.constant_value(batch_size)

  # Calculate the current lengths of the predictions
  prediction_lengths = beam_state.lengths
  previously_finished = beam_state.finished

  # Calculate the total log probs for the new hypotheses
  # Final Shape: [batch_size, beam_width, vocab_size]
  step_log_probs = nn_ops.log_softmax(logits)
  step_log_probs = _mask_probs(step_log_probs, end_token, previously_finished)
  total_probs = array_ops.expand_dims(beam_state.log_probs, 2) + step_log_probs

  # Calculate the continuation lengths by adding to all continuing beams.
  vocab_size = logits.shape[-1].value or array_ops.shape(logits)[-1]
  lengths_to_add = array_ops.one_hot(
      indices=array_ops.fill([batch_size, beam_width], end_token),
      depth=vocab_size,
      on_value=np.int64(0),
      off_value=np.int64(1),
      dtype=dtypes.int64)
  add_mask = math_ops.to_int64(math_ops.logical_not(previously_finished))
  lengths_to_add *= array_ops.expand_dims(add_mask, 2)
  new_prediction_lengths = (
      lengths_to_add + array_ops.expand_dims(prediction_lengths, 2))

  # Calculate the scores for each beam
  scores = _get_scores(
      log_probs=total_probs,
      sequence_lengths=new_prediction_lengths,
      length_penalty_weight=length_penalty_weight)

  time = ops.convert_to_tensor(time, name="time")
  # During the first time step we only consider the initial beam
  scores_flat = array_ops.reshape(scores, [batch_size, -1])

  # Pick the next beams according to the specified successors function
  next_beam_size = ops.convert_to_tensor(
      beam_width, dtype=dtypes.int32, name="beam_width")
  next_beam_scores, word_indices = nn_ops.top_k(scores_flat, k=next_beam_size)
  next_beam_scores.set_shape([static_batch_size, beam_width])
  word_indices.set_shape([static_batch_size, beam_width])

  # Pick out the probs, beam_ids, and states according to the chosen
  # predictions
  next_beam_probs = _tensor_gather_helper(
      gather_indices=word_indices,
      gather_from=total_probs,
      batch_size=batch_size,
      range_size=beam_width * vocab_size,
      gather_shape=[-1],
      name="next_beam_probs")
  # Note: just doing the following
  #   math_ops.to_int32(word_indices % vocab_size,
  #       name="next_beam_word_ids")
  # would be a lot cleaner but for reasons unclear, that hides the results of
  # the op which prevents capturing it with tfdbg debug ops.
  raw_next_word_ids = math_ops.mod(
      word_indices, vocab_size, name="next_beam_word_ids")
  next_word_ids = math_ops.to_int32(raw_next_word_ids)
  next_beam_ids = math_ops.to_int32(
      word_indices / vocab_size, name="next_beam_parent_ids")

  # Append new ids to current predictions
  previously_finished = _tensor_gather_helper(
      gather_indices=next_beam_ids,
      gather_from=previously_finished,
      batch_size=batch_size,
      range_size=beam_width,
      gather_shape=[-1])
  next_finished = math_ops.logical_or(
      previously_finished,
      math_ops.equal(next_word_ids, end_token),
      name="next_beam_finished")

  # Calculate the length of the next predictions.
  # 1. Finished beams remain unchanged.
  # 2. Beams that are now finished (EOS predicted) have their length
  #    increased by 1.
  # 3. Beams that are not yet finished have their length increased by 1.
  lengths_to_add = math_ops.to_int64(math_ops.logical_not(previously_finished))
  next_prediction_len = _tensor_gather_helper(
      gather_indices=next_beam_ids,
      gather_from=beam_state.lengths,
      batch_size=batch_size,
      range_size=beam_width,
      gather_shape=[-1])
  next_prediction_len += lengths_to_add

  # Pick out the cell_states according to the next_beam_ids. We use a
  # different gather_shape here because the cell_state tensors, i.e.
  # the tensors that would be gathered from, all have dimension
  # greater than two and we need to preserve those dimensions.
  # pylint: disable=g-long-lambda
  next_cell_state = nest.map_structure(
      lambda gather_from: _maybe_tensor_gather_helper(
          gather_indices=next_beam_ids,
          gather_from=gather_from,
          batch_size=batch_size,
          range_size=beam_width,
          gather_shape=[batch_size * beam_width, -1]),
      next_cell_state)
  # pylint: enable=g-long-lambda

  next_state = BeamSearchDecoderState(
      cell_state=next_cell_state,
      log_probs=next_beam_probs,
      lengths=next_prediction_len,
      finished=next_finished)

  output = BeamSearchDecoderOutput(
      scores=next_beam_scores,
      predicted_ids=next_word_ids,
      parent_ids=next_beam_ids)

  return output, next_state
def _log_unnormalized_prob(self, counts):
  counts = self._maybe_assert_valid_sample(counts)
  return math_ops.reduce_sum(counts * nn_ops.log_softmax(self.logits), -1)
def _beam_search_step(time, logits, next_cell_state, beam_state, batch_size,
                      beam_width, end_token, length_penalty_weight):
  """Performs a single step of Beam Search Decoding.

  Args:
    time: Beam search time step, should start at 0. At time 0 we assume
      that all beams are equal and consider only the first beam for
      continuations.
    logits: Logits at the current time step. A tensor of shape
      `[batch_size, beam_width, vocab_size]`
    next_cell_state: The next state from the cell, e.g. an instance of
      AttentionWrapperState if the cell is attentional.
    beam_state: Current state of the beam search.
      An instance of `BeamSearchDecoderState`.
    batch_size: The batch size for this input.
    beam_width: Python int. The size of the beams.
    end_token: The int32 end token.
    length_penalty_weight: Float weight to penalize length. Disabled with 0.0.

  Returns:
    A new beam state.
  """
  static_batch_size = tensor_util.constant_value(batch_size)

  # Calculate the current lengths of the predictions
  prediction_lengths = beam_state.lengths
  previously_finished = beam_state.finished

  # Calculate the total log probs for the new hypotheses
  # Final Shape: [batch_size, beam_width, vocab_size]
  step_log_probs = nn_ops.log_softmax(logits)
  step_log_probs = _mask_probs(step_log_probs, end_token, previously_finished)
  total_probs = tf.expand_dims(beam_state.log_probs, axis=2) + step_log_probs

  # Calculate the continuation lengths by adding to all continuing beams.
  vocab_size = logits.shape[-1].value
  lengths_to_add = tf.one_hot(
      indices=tf.tile(tf.reshape(end_token, [1, 1]), [batch_size, beam_width]),
      depth=vocab_size,
      on_value=0,
      off_value=1)
  add_mask = (1 - tf.to_int32(previously_finished))
  lengths_to_add = tf.expand_dims(add_mask, 2) * lengths_to_add
  new_prediction_lengths = (
      lengths_to_add + tf.expand_dims(prediction_lengths, 2))

  # Calculate the scores for each beam
  scores = _get_scores(
      log_probs=total_probs,
      sequence_lengths=new_prediction_lengths,
      length_penalty_weight=length_penalty_weight)

  time = ops.convert_to_tensor(time, name="time")
  # During the first time step we only consider the initial beam
  scores_shape = tf.shape(scores)
  scores_flat = tf.cond(
      time > 0,
      lambda: tf.reshape(scores, [batch_size, -1]),
      lambda: scores[:, 0])
  num_available_beam = tf.cond(
      time > 0,
      lambda: tf.reduce_prod(scores_shape[1:]),
      lambda: tf.reduce_prod(scores_shape[2:]))

  # Pick the next beams according to the specified successors function
  next_beam_size = tf.minimum(
      ops.convert_to_tensor(beam_width, dtype=dtypes.int32, name="beam_width"),
      num_available_beam)
  next_beam_scores, word_indices = nn_ops.top_k(scores_flat, k=next_beam_size)
  next_beam_scores.set_shape([static_batch_size, beam_width])
  word_indices.set_shape([static_batch_size, beam_width])

  # Pick out the probs, beam_ids, and states according to the chosen
  # predictions
  next_beam_probs = _tensor_gather_helper(
      gather_indices=word_indices,
      gather_from=total_probs,
      batch_size=batch_size,
      range_size=beam_width * vocab_size,
      gather_shape=[-1])
  next_word_ids = tf.to_int32(word_indices % vocab_size)
  next_beam_ids = tf.to_int32(word_indices / vocab_size)

  # Append new ids to current predictions
  previously_finished = _tensor_gather_helper(
      gather_indices=next_beam_ids,
      gather_from=previously_finished,
      batch_size=batch_size,
      range_size=beam_width,
      gather_shape=[-1])
  next_finished = tf.logical_or(previously_finished,
                                tf.equal(next_word_ids, end_token))

  # Calculate the length of the next predictions.
  # 1. Finished beams remain unchanged
  # 2. Beams that are now finished (EOS predicted) remain unchanged
  # 3. Beams that are not yet finished have their length increased by 1
  lengths_to_add = tf.to_int32(tf.not_equal(next_word_ids, end_token))
  lengths_to_add = (1 - tf.to_int32(next_finished)) * lengths_to_add
  next_prediction_len = _tensor_gather_helper(
      gather_indices=next_beam_ids,
      gather_from=beam_state.lengths,
      batch_size=batch_size,
      range_size=beam_width,
      gather_shape=[-1])
  next_prediction_len += lengths_to_add

  # Pick out the cell_states according to the next_beam_ids. We use a
  # different gather_shape here because the cell_state tensors, i.e.
  # the tensors that would be gathered from, all have dimension
  # greater than two and we need to preserve those dimensions.
  next_cell_state = nest.map_structure(
      lambda gather_from: _maybe_tensor_gather_helper(
          gather_indices=next_beam_ids,
          gather_from=gather_from,
          batch_size=batch_size,
          range_size=beam_width,
          gather_shape=[batch_size * beam_width, -1]),
      next_cell_state)

  next_state = BeamSearchDecoderState(
      cell_state=next_cell_state,
      log_probs=next_beam_probs,
      lengths=next_prediction_len,
      finished=next_finished)

  output = BeamSearchDecoderOutput(
      scores=next_beam_scores,
      predicted_ids=next_word_ids,
      parent_ids=next_beam_ids)

  return output, next_state
def _beam_search_step(time, logits, next_cell_state, beam_state, batch_size,
                      beam_width, end_token, length_penalty_weight):
  """Performs a single step of Beam Search Decoding.

  Args:
    time: Beam search time step, should start at 0. At time 0 we assume
      that all beams are equal and consider only the first beam for
      continuations.
    logits: Logits at the current time step. A tensor of shape
      `[batch_size, beam_width, vocab_size]`
    next_cell_state: The next state from the cell, e.g. an instance of
      AttentionWrapperState if the cell is attentional.
    beam_state: Current state of the beam search.
      An instance of `BeamSearchDecoderState`.
    batch_size: The batch size for this input.
    beam_width: Python int. The size of the beams.
    end_token: The int32 end token.
    length_penalty_weight: Float weight to penalize length. Disabled with 0.0.

  Returns:
    A new beam state.
  """
  static_batch_size = tensor_util.constant_value(batch_size)

  # Calculate the current lengths of the predictions
  prediction_lengths = beam_state.lengths
  previously_finished = beam_state.finished

  # Calculate the total log probs for the new hypotheses
  # Final Shape: [batch_size, beam_width, vocab_size]
  step_log_probs = nn_ops.log_softmax(logits)  # step_log_probs shape=(?, 10, 56136)
  step_log_probs = _mask_probs(step_log_probs, end_token,
                               previously_finished)  # step_log_probs_masked (?, 10, 56136)
  total_probs = (array_ops.expand_dims(beam_state.log_probs, 2)
                 + step_log_probs)  # total_probs (?, 10, 56136)

  # Calculate the continuation lengths by adding to all continuing beams.
  vocab_size = logits.shape[-1].value or array_ops.shape(logits)[-1]
  lengths_to_add = array_ops.one_hot(
      indices=array_ops.tile(array_ops.reshape(end_token, [1, 1]),
                             [batch_size, beam_width]),
      depth=vocab_size,
      on_value=constant_op.constant(0, dtype=dtypes.int64),
      off_value=constant_op.constant(1, dtype=dtypes.int64),
      dtype=dtypes.int64)  # lengths_to_add shape=(?, 10, 56136)
  add_mask = (1 - math_ops.to_int64(previously_finished))  # add_mask shape=(?, 10), dtype=int64
  lengths_to_add = array_ops.expand_dims(add_mask, 2) * lengths_to_add  # (?, 10, 56136)
  new_prediction_lengths = (lengths_to_add +
                            array_ops.expand_dims(prediction_lengths, 2))  # (?, 10, 56136)

  # Calculate the scores for each beam
  scores = _get_scores(
      log_probs=total_probs,
      sequence_lengths=new_prediction_lengths,
      length_penalty_weight=length_penalty_weight)

  scores_mask = tf.constant([step_log_probs.dtype.min, 0],
                            dtype=dtypes.float32, shape=[vocab_size],
                            name='mask')
  scores_masked = tf.add(scores, scores_mask)
  scores_mask2 = tf.constant([0, 0, 0, 0, 0, step_log_probs.dtype.min, 0],
                             dtype=dtypes.float32, shape=[vocab_size],
                             name='mask2')
  scores_masked = tf.add(scores_mask2, scores_masked)

  def new_scores(scores_masked):
    scores_no_stop = tf.constant([0, 0, step_log_probs.dtype.min, 0],
                                 dtype=dtypes.float32, shape=[vocab_size],
                                 name='no_stop')
    scores = tf.add(scores_masked, scores_no_stop)
    return scores

  # constrain the length
  scores = control_flow_ops.cond(  # time < 9,
      time < 0,
      lambda: new_scores(scores_masked),
      lambda: scores_masked)  # scores shape=(?, 10, 56136) [batch_size, beam_width, vocab_size]

  time = ops.convert_to_tensor(time, name="time")
  # During the first time step we only consider the initial beam
  scores_shape = array_ops.shape(scores)  # scores_shape shape=(3,)
  print("scores_shape", scores_shape)
  scores_to_flat_1 = array_ops.reshape(scores, [batch_size, 2, -1])
  print("scores_to_flat_1", scores_to_flat_1)
  scores_to_0 = scores[:, 0]
  scores_to_1 = scores[:, -1]
  scores_to_flat_2 = tf.concat([scores_to_0, scores_to_1], 1)
  scores_flat = control_flow_ops.cond(
      time > 0,
      lambda: scores_to_flat_1,
      lambda: array_ops.reshape(scores_to_flat_2, [batch_size, 2, -1]))
  num_available_beam = control_flow_ops.cond(
      time > 0,
      lambda: math_ops.reduce_prod(scores_shape[1:]),
      lambda: math_ops.reduce_prod(scores_shape[2:]))
  # scores_flat shape=(?, ?); num_available_beam shape=()

  # Pick the next beams according to the specified successors function
  next_beam_size = math_ops.minimum(
      ops.convert_to_tensor(beam_width, dtype=dtypes.int32,
                            name="beam_width"),
      num_available_beam)
  # scores_t = tf.reshape(scores_flat, [batch_size, 2, -1])

  ############################
  # input_words = ['entrencheds01', 'entrencheds02', 'forgev01', 'forgev04',
  #                'hitn02', 'hitn03', 'vaultn02', 'vaultn04', 'deepa03',
  #                'deeps02', 'admitv01', 'admitv02', 'plantn01', 'plantn02',
  #                'squaren01', 'squaren05', 'drawv05', 'drawv06', 'spellv03',
  #                'spellv02', 'shotn02', 'shotn04', 'coachv01', 'coachv02',
  #                'casen05', 'casen09', 'focusn01', 'focusn02', 'tasten01',
  #                'tasten04', 'footn01', 'footv01']
  input_words = get_words()
  return_list = prior_scores(input_words)
  return_array = np.array(return_list)
  return_tensor = tf.convert_to_tensor(return_array)
  tiling = [1, 5, 1]
  prior_mask = tf.tile(tf.expand_dims(return_tensor, 1), tiling)
  prior_mask = tf.cast(prior_mask, tf.float32)
  prior_mask = array_ops.reshape(prior_mask, [batch_size, -1])
  # print("prior_mask", prior_mask)
  scores_sum = tf.reduce_sum(scores_to_flat_1, 1)
  # print("scores_sum_1", scores_sum)
  # def cal_scores_sum(scores_sum, prior_mask):
  #   return tf.add(scores_sum, prior_mask)
  # scores_sum = control_flow_ops.cond(
  #     time > 0,
  #     lambda: cal_scores_sum(scores_sum, prior_mask),
  #     lambda: scores_sum)
  # scores_sum = tf.add(scores_sum, prior_mask)
  # print("scores_sum_2", scores_sum)
  ############################
  # scores_final = tf.concat([scores_sum, scores_sum], 1)

  def cal_scores_indices(scores_to_0, scores_to_1):
    next_beam_scores_1, word_indices_1 = nn_ops.top_k(scores_to_0, k=5)
    print("ori next_beam_scores_1,word_indices_1", next_beam_scores_1)
    print("ori word_indices_1", word_indices_1)
    next_beam_scores_2, word_indices_2 = nn_ops.top_k(scores_to_1, k=5)
    next_beam_scores = tf.concat([next_beam_scores_1, next_beam_scores_2], 1)
    word_indices = tf.concat(
        [word_indices_1, word_indices_2 + 9 * vocab_size], 1)
    return next_beam_scores, word_indices

  def cal_scores_indices_t1(scores_final, next_beam_size):
    next_beam_scores_1, word_indices_1 = nn_ops.top_k(scores_final, k=5)
    # next_beam_scores_1, word_indices_1 = sample(next_beam_scores_1,
    #                                             word_indices_1)
    print("next_beam_scores_1", next_beam_scores_1)
    print("word_indices_1", word_indices_1)
    next_beam_scores = tf.concat([next_beam_scores_1, next_beam_scores_1], 1)
    word_indices = tf.concat(
        [word_indices_1, word_indices_1 + 5 * vocab_size], 1)
    return next_beam_scores, word_indices

  next_beam_scores, word_indices = control_flow_ops.cond(
      time > 0,
      lambda: cal_scores_indices_t1(scores_sum, next_beam_size),
      lambda: cal_scores_indices(scores_to_0, scores_to_1))

  print('next_beam_scores.shape', next_beam_scores.shape)
  next_beam_scores.set_shape([static_batch_size, beam_width])
  word_indices.set_shape([static_batch_size, beam_width])  # shape=(?, ?)

  # Pick out the probs, beam_ids, and states according to the chosen
  # predictions
  next_beam_probs = _tensor_gather_helper(
      gather_indices=word_indices,
      gather_from=total_probs,
      batch_size=batch_size,
      range_size=beam_width * vocab_size,
      gather_shape=[-1],
      name="next_beam_probs")
  # Note: just doing the following
  #   math_ops.to_int32(word_indices % vocab_size,
  #       name="next_beam_word_ids")
  # would be a lot cleaner but for reasons unclear, that hides the results of
  # the op which prevents capturing it with tfdbg debug ops.
  raw_next_word_ids = math_ops.mod(word_indices, vocab_size,
                                   name="next_beam_word_ids")  # shape=(?, 10)
  next_word_ids = math_ops.to_int32(raw_next_word_ids)
  next_beam_ids = math_ops.to_int32(word_indices / vocab_size,
                                    name="next_beam_parent_ids")

  # Append new ids to current predictions
  previously_finished = _tensor_gather_helper(
      gather_indices=next_beam_ids,
      gather_from=previously_finished,
      batch_size=batch_size,
      range_size=beam_width,
      gather_shape=[-1])
  next_finished = math_ops.logical_or(
      previously_finished,
      math_ops.equal(next_word_ids, end_token),
      name="next_beam_finished")

  # Calculate the length of the next predictions.
  # 1. Finished beams remain unchanged
  # 2. Beams that are now finished (EOS predicted) remain unchanged
  # 3. Beams that are not yet finished have their length increased by 1
  lengths_to_add = math_ops.to_int64(
      math_ops.not_equal(next_word_ids, end_token))
  lengths_to_add = (1 - math_ops.to_int64(next_finished)) * lengths_to_add
  next_prediction_len = _tensor_gather_helper(
      gather_indices=next_beam_ids,
      gather_from=beam_state.lengths,
      batch_size=batch_size,
      range_size=beam_width,
      gather_shape=[-1])
  next_prediction_len += lengths_to_add

  # Pick out the cell_states according to the next_beam_ids. We use a
  # different gather_shape here because the cell_state tensors, i.e.
  # the tensors that would be gathered from, all have dimension
  # greater than two and we need to preserve those dimensions.
  # pylint: disable=g-long-lambda
  next_cell_state = nest.map_structure(
      lambda gather_from: _maybe_tensor_gather_helper(
          gather_indices=next_beam_ids,
          gather_from=gather_from,
          batch_size=batch_size,
          range_size=beam_width,
          gather_shape=[batch_size * beam_width, -1]),
      next_cell_state)
  # pylint: enable=g-long-lambda

  next_state = BeamSearchDecoderState(
      cell_state=next_cell_state,
      log_probs=next_beam_probs,
      lengths=next_prediction_len,
      finished=next_finished)
  print('next_beam_probs', next_beam_probs)

  output = BeamSearchDecoderOutput(
      scores=next_beam_scores,
      predicted_ids=next_word_ids,
      parent_ids=next_beam_ids)

  return output, next_state
def test_step(self):

  def get_probs():
    """this simulates the initialize method in BeamSearchDecoder."""
    log_prob_mask = array_ops.one_hot(
        array_ops.zeros([self.batch_size], dtype=dtypes.int32),
        depth=self.beam_width,
        on_value=True,
        off_value=False,
        dtype=dtypes.bool)

    log_prob_zeros = array_ops.zeros(
        [self.batch_size, self.beam_width], dtype=dtypes.float32)
    log_prob_neg_inf = array_ops.ones(
        [self.batch_size, self.beam_width], dtype=dtypes.float32) * -np.Inf

    log_probs = array_ops.where(log_prob_mask, log_prob_zeros,
                                log_prob_neg_inf)
    return log_probs

  log_probs = get_probs()
  dummy_cell_state = array_ops.zeros([self.batch_size, self.beam_width])

  # pylint: disable=invalid-name
  _finished = array_ops.one_hot(
      array_ops.zeros([self.batch_size], dtype=dtypes.int32),
      depth=self.beam_width,
      on_value=False,
      off_value=True,
      dtype=dtypes.bool)
  _lengths = np.zeros([self.batch_size, self.beam_width], dtype=np.int64)
  _lengths[:, 0] = 2
  _lengths = constant_op.constant(_lengths, dtype=dtypes.int64)

  beam_state = beam_search_decoder.BeamSearchDecoderState(
      cell_state=dummy_cell_state,
      log_probs=log_probs,
      lengths=_lengths,
      finished=_finished)

  logits_ = np.full([self.batch_size, self.beam_width, self.vocab_size],
                    0.0001)
  logits_[0, 0, 2] = 1.9
  logits_[0, 0, 3] = 2.1
  logits_[0, 1, 3] = 3.1
  logits_[0, 1, 4] = 0.9
  logits_[1, 0, 1] = 0.5
  logits_[1, 1, 2] = 2.7
  logits_[1, 2, 2] = 10.0
  logits_[1, 2, 3] = 0.2
  logits = constant_op.constant(logits_, dtype=dtypes.float32)
  log_probs = nn_ops.log_softmax(logits)

  outputs, next_beam_state = beam_search_decoder._beam_search_step(
      time=2,
      logits=logits,
      next_cell_state=dummy_cell_state,
      beam_state=beam_state,
      batch_size=ops.convert_to_tensor(self.batch_size),
      beam_width=self.beam_width,
      end_token=self.end_token,
      length_penalty_weight=self.length_penalty_weight)

  with self.test_session() as sess:
    outputs_, next_state_, _, _ = sess.run(
        [outputs, next_beam_state, beam_state, log_probs])

  self.assertEqual(outputs_.predicted_ids[0, 0], 3)
  self.assertEqual(outputs_.predicted_ids[0, 1], 2)
  self.assertEqual(outputs_.predicted_ids[1, 0], 1)
  neg_inf = -np.Inf
  self.assertAllEqual(
      next_state_.log_probs[:, -3:],
      [[neg_inf, neg_inf, neg_inf], [neg_inf, neg_inf, neg_inf]])
  self.assertEqual((next_state_.log_probs[:, :-3] > neg_inf).all(), True)
  self.assertEqual((next_state_.lengths[:, :-3] > 0).all(), True)
  self.assertAllEqual(next_state_.lengths[:, -3:], [[0, 0, 0], [0, 0, 0]])
def _beam_search_step(time, logits, beam_state, batch_size, beam_width,
                      end_token, length_penalty_weight):
  """Performs a single step of Beam Search Decoding.

  Args:
    time: Beam search time step, should start at 0. At time 0 we assume
      that all beams are equal and consider only the first beam for
      continuations.
    logits: Logits at the current time step. A tensor of shape
      `[B, vocab_size]`
    beam_state: Current state of the beam search. An instance of `BeamState`
    batch_size: The batch size for this input.
    beam_width: The size of the beams.
    end_token: The int32 end token.
    length_penalty_weight: Float weight to penalize length. Disabled with 0.0.

  Returns:
    A new beam state.
  """
  static_batch_size = tensor_util.constant_value(batch_size)

  # Calculate the current lengths of the predictions
  prediction_lengths = beam_state.lengths
  previously_finished = beam_state.finished

  # Calculate the total log probs for the new hypotheses
  # Final Shape: [batch_size, beam_width, vocab_size]
  probs = nn_ops.log_softmax(logits)
  probs = _mask_probs(probs, end_token, previously_finished)
  total_probs = array_ops.expand_dims(beam_state.log_probs, 2) + probs

  # Calculate the continuation lengths by adding to all continuing beams.
  vocab_size = logits.get_shape().as_list()[-1]
  lengths_to_add = array_ops.one_hot(
      array_ops.tile(
          array_ops.reshape(end_token, [1, 1]), [batch_size, beam_width]),
      vocab_size, 0, 1)
  add_mask = (1 - math_ops.to_int32(previously_finished))
  lengths_to_add = array_ops.expand_dims(add_mask, 2) * lengths_to_add
  new_prediction_lengths = (
      array_ops.expand_dims(prediction_lengths, 2) + lengths_to_add)

  # Calculate the scores for each beam
  scores = _get_scores(
      log_probs=total_probs,
      sequence_lengths=new_prediction_lengths,
      length_penalty_weight=length_penalty_weight)
  scores_flat = array_ops.reshape(scores, [batch_size, -1])
  # During the first time step we only consider the initial beam
  scores_flat = control_flow_ops.cond(
      ops.convert_to_tensor(time) > 0,
      lambda: scores_flat,
      lambda: scores[:, 0])

  # Pick the next beams according to the specified successors function
  next_beam_scores, word_indices = nn_ops.top_k(scores_flat, k=beam_width)
  next_beam_scores.set_shape([static_batch_size, beam_width])
  word_indices.set_shape([static_batch_size, beam_width])

  # Pick out the probs, beam_ids, and states according to the chosen
  # predictions
  next_beam_probs = _tensor_gather_helper(
      gather_indices=word_indices,
      gather_from=total_probs,
      range_input=batch_size,
      range_size=beam_width * vocab_size,
      final_shape=[static_batch_size, beam_width])
  next_word_ids = math_ops.to_int32(word_indices % vocab_size)
  next_beam_ids = math_ops.to_int32(word_indices / vocab_size)

  # Append new ids to current predictions
  previously_finished = _tensor_gather_helper(
      gather_indices=next_beam_ids,
      gather_from=previously_finished,
      range_input=batch_size,
      range_size=beam_width,
      final_shape=[static_batch_size, beam_width])
  next_finished = math_ops.logical_or(previously_finished,
                                      math_ops.equal(next_word_ids, end_token))

  # Calculate the length of the next predictions.
  # 1. Finished beams remain unchanged
  # 2. Beams that are now finished (EOS predicted) remain unchanged
  # 3. Beams that are not yet finished have their length increased by 1
  lengths_to_add = math_ops.to_int32(
      math_ops.not_equal(next_word_ids, end_token))
  lengths_to_add = (1 - math_ops.to_int32(next_finished)) * lengths_to_add
  next_prediction_len = _tensor_gather_helper(
      gather_indices=next_beam_ids,
      gather_from=beam_state.lengths,
      range_input=batch_size,
      range_size=beam_width,
      final_shape=[static_batch_size, beam_width])
  next_prediction_len += lengths_to_add

  next_state = BeamSearchDecoderState(
      cell_state=beam_state.cell_state,
      log_probs=next_beam_probs,
      lengths=next_prediction_len,
      finished=next_finished)

  output = BeamSearchDecoderOutput(
      scores=next_beam_scores,
      predicted_ids=next_word_ids,
      parent_ids=next_beam_ids)

  return output, next_state