def _make_seq2seq_model(temp_dir):
    """Build and initialize a tiny Transformer seq2seq model for tests.

    Writes a 4-token vocabulary file into ``temp_dir`` and uses it for both
    the source and target side of the model.
    """
    vocab_path = test_util.make_vocab(
        os.path.join(temp_dir, "vocab.txt"), ["1", "2", "3", "4"]
    )
    transformer = models.Transformer(
        source_inputter=inputters.WordEmbedder(20),
        target_inputter=inputters.WordEmbedder(20),
        num_layers=3,
        num_units=20,
        num_heads=4,
        ffn_inner_dim=40,
    )
    transformer.initialize(
        dict(source_vocabulary=vocab_path, target_vocabulary=vocab_path)
    )
    return transformer
def testWordEmbedderForDecoder(self):
    """Decoder-mode features carry BOS/EOS and report both length variants."""
    vocab_path = test_util.make_vocab(
        os.path.join(self.get_temp_dir(), "vocab.txt"),
        ["the", "world", "hello", "toto"],
    )
    inputter = text_inputter.WordEmbedder(embedding_size=10)
    inputter.set_decoder_mode(mark_start=True, mark_end=True)
    inputter.initialize({"vocabulary": vocab_path})
    decoded = self.evaluate(inputter.make_features(tf.constant("hello world")))
    # Length counts the start marker; ignoring special tokens drops it.
    self.assertEqual(decoded["length"], 3)
    self.assertEqual(inputter.get_length(decoded, ignore_special_tokens=True), 2)
    # "ids" is BOS-prefixed input; "ids_out" is the EOS-suffixed shifted target.
    self.assertAllEqual(decoded["ids"], [1, 5, 4])
    self.assertAllEqual(decoded["ids_out"], [5, 4, 2])
def testWordEmbedderForDecoder(self):
    """Decoder-mode features include OOV tokens alongside BOS/EOS marking.

    NOTE(review): another method in this source carries the same name; if
    both are defined on the same class, this later definition shadows the
    earlier one — confirm they belong to different test classes.
    """
    vocab_path = test_util.make_vocab(
        os.path.join(self.get_temp_dir(), "vocab.txt"),
        ["the", "world", "hello", "toto"],
    )
    inputter = text_inputter.WordEmbedder(embedding_size=10)
    inputter.set_decoder_mode(mark_start=True, mark_end=True)
    inputter.initialize({"vocabulary": vocab_path})
    decoded = inputter.make_features(tf.constant("hello world !"))
    # Length includes the start marker; "!" is out of vocabulary (id 7).
    self.assertEqual(decoded["length"], 4)
    self.assertEqual(inputter.get_length(decoded, ignore_special_tokens=True), 3)
    self.assertAllEqual(decoded["ids"], [1, 5, 4, 7])
    self.assertAllEqual(decoded["ids_out"], [5, 4, 7, 2])
    # The unknown token should be surfaced by get_oov_tokens.
    oov = inputter.get_oov_tokens(decoded)
    self.assertListEqual(oov.numpy().flatten().tolist(), [b"!"])
def testBeamSearchWithMultiSourceEncoder(self):
    """Tracing the serving function succeeds with beam search + two sources."""
    vocab_path = test_util.make_vocab(
        os.path.join(self.get_temp_dir(), "vocab.txt"), ["1", "2", "3"]
    )
    # Both source inputters and the target share one vocabulary file.
    data_config = {
        "source_1_vocabulary": vocab_path,
        "source_2_vocabulary": vocab_path,
        "target_vocabulary": vocab_path,
    }
    multi_source = inputters.ParallelInputter(
        [inputters.WordEmbedder(32), inputters.WordEmbedder(32)]
    )
    model = models.Transformer(
        multi_source,
        inputters.WordEmbedder(32),
        num_layers=3,
        num_units=32,
        num_heads=8,
        ffn_inner_dim=64,
    )
    model.initialize(data_config, params={"beam_width": 2})
    # The test passes if concretizing the serving graph does not raise.
    model.serve_function().get_concrete_function()
def testLanguageModelInputter(self):
    """LanguageModelInputter honors sequence_controls, with a legacy fallback."""
    vocab_path = test_util.make_vocab(
        os.path.join(self.get_temp_dir(), "vocab.txt"), ["a", "b", "c"]
    )

    # Explicit sequence controls: start marker only, no end marker.
    lm_inputter = models.LanguageModelInputter(embedding_size=10)
    lm_inputter.initialize(
        {
            "vocabulary": vocab_path,
            "sequence_controls": {"start": True, "end": False},
        }
    )
    features, labels = self.evaluate(lm_inputter.make_features(tf.constant("a b c")))
    self.assertAllEqual(features["ids"], [1, 3, 4, 5])
    self.assertEqual(features["length"], 4)
    self.assertAllEqual(labels["ids"], [1, 3, 4])
    self.assertAllEqual(labels["ids_out"], [3, 4, 5])
    self.assertEqual(labels["length"], 3)

    # Backward compatibility mode: no sequence_controls configured.
    lm_inputter = models.LanguageModelInputter(embedding_size=10)
    lm_inputter.initialize({"vocabulary": vocab_path})
    features, labels = self.evaluate(lm_inputter.make_features(tf.constant("a b c")))
    self.assertAllEqual(features["ids"], [3, 4, 5])
    self.assertEqual(features["length"], 3)
    self.assertAllEqual(labels["ids"], [3, 4, 5])
    self.assertAllEqual(labels["ids_out"], [4, 5, 2])
    self.assertEqual(labels["length"], 3)
def testCheckpointExport(self):
    """A checkpoint export can be restored into a fresh model of the same shape."""
    vocab_path = test_util.make_vocab(
        os.path.join(self.get_temp_dir(), "vocab.txt"), ["a", "b", "c"]
    )
    data_config = dict(source_vocabulary=vocab_path, target_vocabulary=vocab_path)
    export_dir = self.get_temp_dir()

    # Build a model, capture its embedding, and export it as a checkpoint.
    source_model = models.TransformerBase()
    source_model.initialize(data_config)
    source_model.create_variables()
    embedding_before = source_model.features_inputter.embedding.numpy()
    exporters.CheckpointExporter().export(source_model, export_dir)

    # Restore into a brand-new model and compare the embedding weights.
    restored_model = models.TransformerBase()
    restored_model.initialize(data_config)
    restored_model.create_variables()
    tf.train.Checkpoint(model=restored_model).restore(
        os.path.join(export_dir, "ckpt")
    )
    embedding_after = restored_model.features_inputter.embedding.numpy()
    self.assertAllEqual(embedding_after, embedding_before)
def _create_vocab(temp_dir):
    """Write a 3-token vocabulary file in ``temp_dir``.

    Returns a tuple ``(vocab, vocab_path)`` where ``vocab`` is whatever
    ``test_util.make_vocab`` produces and ``vocab_path`` is the file path.
    """
    path = os.path.join(temp_dir, "vocab.txt")
    return test_util.make_vocab(path, ["a", "b", "c"]), path