def testGreedyWithCornerCase(self):
  batch_size = 1
  beam_size = 1
  vocab_size = 3
  decode_length = 2

  initial_ids = tf.constant([0] * batch_size)  # GO
  probabilities = tf.constant([[0.2, 0.1, 0.7], [0.4, 0.1, 0.5]])

  def symbols_to_logits(ids):
    pos = tf.shape(ids)[1]
    logits = tf.to_float(tf.log(probabilities[pos - 1, :]))
    return logits

  final_ids, final_probs = beam_search.beam_search(symbols_to_logits,
                                                   initial_ids, beam_size,
                                                   decode_length, vocab_size,
                                                   0.0, eos_id=1)

  with self.test_session():
    ids = final_ids.eval()
    probs = final_probs.eval()
  self.assertAllEqual([[[0, 2, 2]]], ids)
  self.assertAllClose([[0.7 * 0.5]], np.exp(probs))
def testNotGreedyBeamTwo(self):
  batch_size = 1
  beam_size = 2
  vocab_size = 3
  decode_length = 3

  initial_ids = tf.constant([0] * batch_size)  # GO
  probabilities = tf.constant([[[0.1, 0.1, 0.8], [0.1, 0.1, 0.8]],
                               [[0.4, 0.5, 0.1], [0.2, 0.4, 0.4]],
                               [[0.05, 0.9, 0.05], [0.4, 0.4, 0.2]]])

  def symbols_to_logits(ids):
    pos = tf.shape(ids)[1]
    logits = tf.to_float(tf.log(probabilities[pos - 1, :]))
    return logits

  final_ids, final_probs = beam_search.beam_search(symbols_to_logits,
                                                   initial_ids, beam_size,
                                                   decode_length, vocab_size,
                                                   0.0, eos_id=1)

  with self.test_session():
    ids = final_ids.eval()
    probs = final_probs.eval()
  self.assertAllEqual([[[0, 2, 1, 0], [0, 2, 0, 1]]], ids)
  self.assertAllClose([[0.8 * 0.5, 0.8 * 0.4 * 0.9]], np.exp(probs))
def testGreedyBatchOne(self):
  batch_size = 1
  beam_size = 1
  vocab_size = 2
  decode_length = 3

  initial_ids = tf.constant([0] * batch_size)  # GO

  # Test that beam search finds the most probable sequence.
  # These probabilities represent the following search tree:
  #
  #               GO (0)
  #                / \
  #               /   \
  #              /     \
  #             /       \
  #          0(0.7)    1(0.3)
  #            / \
  #           /   \
  #          /     \
  #      0(0.4)   1(0.6)
  #        / \
  #       /   \
  #      /     \
  #   0(0.5)  1(0.5)
  #
  # and the following decoding probabilities:
  #   0000 - 0.7 * 0.4 * 0.5
  #   0001 - 0.7 * 0.4 * 0.5
  #   001  - 0.7 * 0.6 (best)
  #   01   - 0.3
  #
  # 001 is the most likely sequence under these probabilities.
  probabilities = tf.constant([[[0.7, 0.3]], [[0.4, 0.6]], [[0.5, 0.5]]])

  def symbols_to_logits(ids):
    pos = tf.shape(ids)[1]
    logits = tf.to_float(tf.log(probabilities[pos - 1, :]))
    return logits

  final_ids, final_probs = beam_search.beam_search(symbols_to_logits,
                                                   initial_ids, beam_size,
                                                   decode_length, vocab_size,
                                                   0.0, eos_id=1)

  with self.test_session():
    ids = final_ids.eval()
    probs = final_probs.eval()
  self.assertAllEqual([[[0, 0, 1]]], ids)
  self.assertAllClose([[0.7 * 0.6]], np.exp(probs))
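# Sanity arithmetic for the tree above (illustrative only, not part of the
# test). With the probabilities defined in the test:
#   p(001)  = 0.7 * 0.6       = 0.42
#   p(01)   = 0.3
#   p(0000) = 0.7 * 0.4 * 0.5 = 0.14
#   p(0001) = 0.7 * 0.4 * 0.5 = 0.14
# so a beam of size 1, which follows the locally best token at each step,
# also recovers the globally best sequence [0, 0, 1] here.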
def testNotGreedyBatchTwoBeamTwoWithAlpha(self):
  batch_size = 2
  beam_size = 2
  vocab_size = 3
  decode_length = 3

  initial_ids = tf.constant([0] * batch_size)  # GO

  # Probabilities for position * batch * beam * vocab.
  # Probabilities have been set such that with alpha = 3.5, the less probable
  # but longer sequence will have a better score than the shorter sequence
  # with the higher log prob in batch 1, and the order will be reversed in
  # batch 2. That is, in batch 2 the shorter sequence will still have a
  # higher score in spite of the length penalty.
  probabilities = tf.constant([[[[0.1, 0.1, 0.8], [0.1, 0.1, 0.8]],
                                [[0.1, 0.1, 0.8], [0.1, 0.1, 0.8]]],
                               [[[0.4, 0.5, 0.1], [0.2, 0.4, 0.4]],
                                [[0.3, 0.6, 0.1], [0.2, 0.4, 0.4]]],
                               [[[0.05, 0.9, 0.05], [0.4, 0.4, 0.2]],
                                [[0.05, 0.9, 0.05], [0.4, 0.4, 0.2]]]])

  def symbols_to_logits(ids):
    pos = tf.shape(ids)[1]
    logits = tf.to_float(tf.log(probabilities[pos - 1, :]))
    return logits

  final_ids, final_scores = beam_search.beam_search(symbols_to_logits,
                                                    initial_ids, beam_size,
                                                    decode_length, vocab_size,
                                                    3.5, eos_id=1)

  with self.test_session():
    ids = final_ids.eval()
    scores = final_scores.eval()
  self.assertAllEqual(
      [[[0, 2, 0, 1], [0, 2, 1, 0]], [[0, 2, 1, 0], [0, 2, 0, 1]]], ids)
  self.assertAllClose([[
      np.log(0.8 * 0.4 * 0.9) / (8. / 6.)**3.5,
      np.log(0.8 * 0.5) / (7. / 6.)**3.5
  ], [
      np.log(0.8 * 0.6) / (7. / 6.)**3.5,
      np.log(0.8 * 0.3 * 0.9) / (8. / 6.)**3.5
  ]], scores)
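# The divisors in the expected scores above come from a GNMT-style length
# penalty: score = log_prob / ((5 + decoded_length) / 6) ** alpha, where
# decoded_length counts generated tokens up to and including EOS (trailing 0s
# in the ids are padding). A minimal illustrative sketch assuming that
# formula; `expected_score` is a hypothetical helper, not part of
# beam_search's API:
def expected_score(log_prob, decoded_length, alpha=3.5):
  length_penalty = ((5. + decoded_length) / 6.) ** alpha
  return log_prob / length_penalty

# e.g. expected_score(np.log(0.8 * 0.4 * 0.9), 3) equals
# np.log(0.8 * 0.4 * 0.9) / (8. / 6.)**3.5, the first batch-1 score above,
# and expected_score(np.log(0.8 * 0.5), 2) equals the second.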
def testShapes(self):
  batch_size = 2
  beam_size = 3
  vocab_size = 4
  decode_length = 10

  initial_ids = tf.constant([0, 0])  # GO

  def symbols_to_logits(_):
    # Just return random logits
    return tf.random_uniform((batch_size * beam_size, vocab_size))

  final_ids, final_probs = beam_search.beam_search(
      symbols_to_logits, initial_ids, beam_size, decode_length, vocab_size,
      0.)

  self.assertEqual(final_ids.get_shape().as_list(), [None, beam_size, None])
  self.assertEqual(final_probs.get_shape().as_list(), [None, beam_size])
def _beam_decode(self, features, decode_length, beam_size, top_beams,
                 last_position_only, alpha):
  """Beam search decoding.

  Args:
    features: a map of string to `Tensor`.
    decode_length: an integer. How many additional timesteps to decode.
    beam_size: number of beams.
    top_beams: an integer. How many of the beams to return.
    last_position_only: a boolean, speed-up by computing last position only.
    alpha: Float that controls the length penalty. The larger the alpha, the
      stronger the preference for longer translations.

  Returns:
    samples: an integer `Tensor`. Top samples from the beam search.
  """
  batch_size = tf.shape(features["inputs"])[0]
  batch_size = tf.Print(batch_size, [batch_size], "beam_decode batch_size=")

  def symbols_to_logits_fn(ids):
    """Go from ids to logits."""
    ids = tf.expand_dims(tf.expand_dims(ids, axis=2), axis=3)
    ids = tf.pad(ids[:, 1:], [[0, 0], [0, 1], [0, 0], [0, 0]])
    if "partial_targets" in features:
      pt = features["partial_targets"]
      pt_length = tf.shape(pt)[1]
      pt = tf.tile(pt, [1, beam_size])
      pt = tf.reshape(pt, [batch_size * beam_size, pt_length, 1, 1])
      ids = tf.concat([pt, ids], axis=1)

    features["targets"] = ids
    self._coverage = None
    sharded_logits, _ = self.model_fn(
        features, False, last_position_only=last_position_only)
    # Now self._coverage is a coverage tensor for the first datashard.
    # It has shape [batch_size] and contains floats between 0 and
    # source_length.
    logits = sharded_logits[0]  # Assuming we have one shard.
    if last_position_only:
      return tf.squeeze(logits, axis=[1, 2, 3])
    current_output_position = tf.shape(ids)[1] - 1  # -1 due to the pad above.
    logits = logits[:, current_output_position, :, :]
    return tf.squeeze(logits, axis=[1, 2])

  initial_ids = tf.zeros([batch_size], dtype=tf.int32)

  inputs_old = features["inputs"]
  features["inputs"] = tf.expand_dims(features["inputs"], 1)
  if len(features["inputs"].shape) < 5:
    features["inputs"] = tf.expand_dims(features["inputs"], 4)
  # Expand the inputs to the beam size.
  features["inputs"] = tf.tile(features["inputs"], [1, beam_size, 1, 1, 1])
  s = tf.shape(features["inputs"])
  features["inputs"] = tf.reshape(features["inputs"],
                                  [s[0] * s[1], s[2], s[3], s[4]])

  target_modality = self._hparams.problems[self._problem_idx].target_modality
  vocab_size = target_modality.top_dimensionality

  # Set the decode length to input length + decode_length.
  decode_length = tf.constant(decode_length)
  if "partial_targets" not in features:
    decode_length += tf.shape(features["inputs"])[1]
  ids, scores = beam_search.beam_search(symbols_to_logits_fn, initial_ids,
                                        beam_size, decode_length, vocab_size,
                                        alpha)

  # Set inputs back to the unexpanded inputs so we don't confuse the
  # Estimator.
  features["inputs"] = inputs_old

  # Return `top_beams` decodings (also remove the initial id from the beam
  # search results).
  return_scores = False  # TODO(lukaszkaiser): make it work multi-problem.
  if top_beams == 1:
    if return_scores:
      return {"outputs": ids[:, 0, 1:], "scores": scores}
    return ids[:, 0, 1:]
  else:
    if return_scores:
      return {"outputs": ids[:, :top_beams, 1:], "scores": scores}
    return ids[:, :top_beams, 1:]
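# The expand_dims/tile/reshape sequence above copies every batch entry
# `beam_size` times so the model sees a flat [batch_size * beam_size, ...]
# batch, matching the ids that symbols_to_logits_fn receives. A minimal,
# self-contained sketch of that pattern (illustrative; `expand_to_beam_size`
# is a hypothetical helper, not part of this class):
import tensorflow as tf

def expand_to_beam_size(tensor, beam_size):
  """Tiles a [batch, ...] tensor with static rank into [batch * beam_size, ...]."""
  tensor = tf.expand_dims(tensor, axis=1)                   # [batch, 1, ...]
  tile_dims = [1, beam_size] + [1] * (tensor.shape.ndims - 2)
  tensor = tf.tile(tensor, tile_dims)                       # [batch, beam, ...]
  s = tf.shape(tensor)
  return tf.reshape(tensor, tf.concat([[s[0] * s[1]], s[2:]], axis=0))

# e.g. a [2, 7, 1, 1] inputs tensor becomes [2 * beam_size, 7, 1, 1], the
# flattened layout used throughout the beam search loop.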