def testExampleInputter(self):
    """ExampleInputter should yield a (features, labels) pair with the expected fields."""
    vocab_path = self._makeTextFile("vocab.txt", ["the", "world", "hello", "toto"])
    data_path = self._makeTextFile("data.txt", ["hello world !"])
    src = text_inputter.WordEmbedder("vocabulary_file_1", embedding_size=10)
    tgt = text_inputter.WordEmbedder("vocabulary_file_1", embedding_size=10)
    example_inputter = inputter.ExampleInputter(src, tgt)
    self.assertEqual(example_inputter.num_outputs, 2)
    features, transformed = self._makeDataset(
        example_inputter,
        [data_path, data_path],
        metadata={"vocabulary_file_1": vocab_path, "vocabulary_file_2": vocab_path})
    self.assertIsInstance(features, tuple)
    self.assertEqual(len(features), 2)
    self.assertEqual(len(transformed), 2)
    features, labels = features
    # Only the target side ("labels") carries the shifted "ids_out" field.
    for field in ("ids", "length", "tokens"):
        self.assertIn(field, features)
    for field in ("ids", "ids_out", "length", "tokens"):
        self.assertIn(field, labels)
def testWordEmbedderWithInGraphTokenizer(self):
    """With an in-graph tokenizer, the embedder should accept raw "text" input for serving."""
    vocab_path = self._makeTextFile("vocab.txt", ["the", "world", "hello", "■"])
    embedder = text_inputter.WordEmbedder(embedding_size=10)
    config = {
        "vocabulary": vocab_path,
        "tokenization": {"type": "CharacterTokenizer"},
    }
    embedder.initialize(config)
    self.assertIn("text", embedder.input_signature())
    self._testServing(embedder)
def testWordEmbedderWithInPlaceNoise(self, probability):
    """In-place noise should drop one word when applied (parameterized by probability)."""
    vocab_path = self._makeTextFile("vocab.txt", ["the", "world", "hello"])
    data_path = self._makeTextFile("data.txt", ["hello world !"])
    word_noiser = noise.WordNoiser(noises=[noise.WordOmission(1)])
    embedder = text_inputter.WordEmbedder(embedding_size=10)
    embedder.set_noise(word_noiser, probability=probability)
    features, transformed = self._makeDataset(
        embedder,
        data_path,
        data_config={"vocabulary": vocab_path},
        shapes={"tokens": [None, None], "ids": [None, None], "length": [None]},
    )
    # probability == 0 keeps all 3 tokens; otherwise one word was omitted.
    expected_length = 3 if probability == 0 else 2
    self.assertEqual(features["length"][0], expected_length)
def testNestedParallelInputterShareParameters(self):
    """Parameter sharing should propagate through nested ParallelInputter instances."""
    vocab_path = self._makeTextFile("vocab.txt", ["the", "world", "hello", "toto"])
    metadata = {"vocabulary_file": vocab_path}
    sources = [
        text_inputter.WordEmbedder("vocabulary_file", embedding_size=10),
        text_inputter.WordEmbedder("vocabulary_file", embedding_size=10),
    ]
    target = text_inputter.WordEmbedder("vocabulary_file", embedding_size=10)
    nested = inputter.ParallelInputter(
        [inputter.ParallelInputter(sources, share_parameters=True), target],
        share_parameters=True)
    nested.initialize(metadata)
    nested.build()
    # Every inputter should have been built against the same embedding variable.
    self.assertEqual(sources[0].embedding, target.embedding)
    self.assertEqual(sources[1].embedding, target.embedding)
def testWordEmbedderForDecoder(self):
    """Decoder mode should add start/end markers and expose a shifted "ids_out"."""
    vocab_path = test_util.make_vocab(
        os.path.join(self.get_temp_dir(), "vocab.txt"),
        ["the", "world", "hello", "toto"])
    embedder = text_inputter.WordEmbedder(embedding_size=10)
    embedder.set_decoder_mode(mark_start=True, mark_end=True)
    embedder.initialize({"vocabulary": vocab_path})
    features = self.evaluate(embedder.make_features(tf.constant("hello world")))
    # 2 tokens plus one special marker in each of "ids" / "ids_out".
    self.assertEqual(features["length"], 3)
    self.assertEqual(embedder.get_length(features, ignore_special_tokens=True), 2)
    self.assertAllEqual(features["ids"], [1, 5, 4])
    self.assertAllEqual(features["ids_out"], [5, 4, 2])
def testSequenceToSequenceInputter(self):
    """SequenceToSequenceInputter should produce aligned source features and target labels.

    The target labels must contain "ids" (prefixed with <s>) and "ids_out"
    (suffixed with </s>), shifted by one position relative to each other.
    """
    source_vocabulary = test_util.make_data_file(
        os.path.join(self.get_temp_dir(), "src_vocab.txt"),
        ["<blank>", "<s>", "</s>", "a", "b", "c", "d"])
    target_vocabulary = test_util.make_data_file(
        os.path.join(self.get_temp_dir(), "tgt_vocab.txt"),
        ["<blank>", "<s>", "</s>", "e", "f", "g", "h"])
    source_file = test_util.make_data_file(
        os.path.join(self.get_temp_dir(), "src.txt"),
        ["a c c", "b d", "a e"])
    target_file = test_util.make_data_file(
        os.path.join(self.get_temp_dir(), "tgt.txt"),
        ["f h g", "e h", "a e"])
    inputter = sequence_to_sequence.SequenceToSequenceInputter(
        text_inputter.WordEmbedder(embedding_size=20),
        text_inputter.WordEmbedder(embedding_size=20))
    inputter.initialize(dict(
        source_vocabulary=source_vocabulary,
        target_vocabulary=target_vocabulary))
    dataset = inputter.make_dataset([source_file, target_file])
    # Fix: iterators have no .next() method in Python 3; use the builtin next().
    element = next(iter(dataset))
    features, labels = inputter.make_features(element)
    self.assertIn("ids_out", labels)
    self.assertAllEqual(labels["ids"], [1, 4, 6, 5])
    self.assertAllEqual(labels["ids_out"], [4, 6, 5, 2])
    self.assertEqual(labels["length"], 4)
def testParallelInputter(self):
    """ParallelInputter should prefix features per sub-inputter and return one output each."""
    # Fix: vocab_file and data_file were used without ever being defined,
    # raising NameError; create them in the test temporary directory as the
    # other tests in this file do.
    vocab_file = os.path.join(self.get_temp_dir(), "vocab.txt")
    data_file = os.path.join(self.get_temp_dir(), "data.txt")
    with open(vocab_file, "w") as vocab:
        vocab.write("the\n" "world\n" "hello\n" "toto\n")
    with open(data_file, "w") as data:
        data.write("hello world !\n")
    parallel_inputter = inputter.ParallelInputter([
        text_inputter.WordEmbedder("vocabulary_file_1", embedding_size=10),
        text_inputter.WordEmbedder("vocabulary_file_2", embedding_size=5)])
    data, transformed = _first_element(
        parallel_inputter,
        [data_file, data_file],
        {"vocabulary_file_1": vocab_file, "vocabulary_file_2": vocab_file})
    self.assertEqual(2, len(parallel_inputter.get_length(data)))
    input_receiver = parallel_inputter.get_serving_input_receiver()
    self.assertIn("inputter_0_ids", input_receiver.features)
    self.assertIn("inputter_1_ids", input_receiver.features)
    with self.test_session() as sess:
        sess.run(tf.tables_initializer())
        sess.run(tf.global_variables_initializer())
        data, transformed = sess.run([data, transformed])
        # Transient text fields must not appear in the final features.
        self.assertNotIn("inputter_0_raw", data)
        self.assertNotIn("inputter_0_tokens", data)
        self.assertNotIn("inputter_1_raw", data)
        self.assertNotIn("inputter_1_tokens", data)
        self.assertIn("inputter_0_ids", data)
        self.assertIn("inputter_1_ids", data)
        self.assertEqual(2, len(transformed))
        self.assertAllEqual([1, 3, 10], transformed[0].shape)
        self.assertAllEqual([1, 3, 5], transformed[1].shape)
def testParallelInputter(self):
    """Two parallel word embedders should yield prefixed features and two transformed outputs."""
    vocab_path = os.path.join(self.get_temp_dir(), "vocab.txt")
    data_path = os.path.join(self.get_temp_dir(), "data.txt")
    with io.open(vocab_path, encoding="utf-8", mode="w") as vocab:
        vocab.write(u"the\n" u"world\n" u"hello\n" u"toto\n")
    with io.open(data_path, encoding="utf-8", mode="w") as data:
        data.write(u"hello world !\n")
    parallel_inputter = inputter.ParallelInputter([
        text_inputter.WordEmbedder("vocabulary_file_1", embedding_size=10),
        text_inputter.WordEmbedder("vocabulary_file_2", embedding_size=5)])
    features, transformed = self._makeDataset(
        parallel_inputter,
        [data_path, data_path],
        metadata={"vocabulary_file_1": vocab_path, "vocabulary_file_2": vocab_path},
        shapes={"inputter_0_ids": [None, None], "inputter_0_length": [None],
                "inputter_1_ids": [None, None], "inputter_1_length": [None]})
    self.assertEqual(2, len(parallel_inputter.get_length(features)))
    # Transient text fields must not leak into the final features.
    for name in ("inputter_0_raw", "inputter_0_tokens",
                 "inputter_1_raw", "inputter_1_tokens"):
        self.assertNotIn(name, features)
    with self.test_session() as sess:
        sess.run(tf.tables_initializer())
        sess.run(tf.global_variables_initializer())
        features, transformed = sess.run([features, transformed])
        self.assertEqual(2, len(transformed))
        self.assertAllEqual([1, 3, 10], transformed[0].shape)
        self.assertAllEqual([1, 3, 5], transformed[1].shape)
def testWordEmbedder(self):
    """WordEmbedder should map tokens to ids and embed them with the named variable."""
    vocab_path = self._makeTextFile("vocab.txt", ["the", "world", "hello", "toto"])
    data_path = self._makeTextFile("data.txt", ["hello world !"])
    embedder = text_inputter.WordEmbedder("vocabulary_file", embedding_size=10)
    features, transformed = self._makeDataset(
        embedder,
        data_path,
        metadata={"vocabulary_file": vocab_path},
        shapes={"tokens": [None, None], "ids": [None, None], "length": [None]})
    # The embedding variable keeps its canonical name.
    self.assertEqual(embedder.embedding.name, "w_embs:0")
    self.assertAllEqual([3], features["length"])
    # "!" is out of vocabulary and maps to the OOV id 4.
    self.assertAllEqual([[2, 1, 4]], features["ids"])
    self.assertAllEqual([1, 3, 10], transformed.shape)
def testParallelInputterSplitFeatures(self):
    """With combine_features=False, features come back as a (source, target) tuple."""
    vocab_path = self._makeTextFile("vocab.txt", ["the", "world", "hello", "toto"])
    data_path = self._makeTextFile("data.txt", ["hello world !"])
    src_embedder = text_inputter.WordEmbedder("vocabulary_file_1", embedding_size=10)
    tgt_embedder = text_inputter.WordEmbedder("vocabulary_file_1", embedding_size=10)
    tgt_embedder.is_target = True
    parallel_inputter = inputter.ParallelInputter(
        [src_embedder, tgt_embedder], combine_features=False)
    self.assertEqual(parallel_inputter.num_outputs, 2)
    features, transformed = self._makeDataset(
        parallel_inputter,
        [data_path, data_path],
        metadata={"vocabulary_file_1": vocab_path, "vocabulary_file_2": vocab_path})
    self.assertIsInstance(features, tuple)
    self.assertEqual(len(features), 2)
    self.assertEqual(len(transformed), 2)
    features, labels = features
    # Only the target side carries the shifted "ids_out" field.
    for field in ("ids", "length", "tokens"):
        self.assertIn(field, features)
    for field in ("ids", "ids_out", "length", "tokens"):
        self.assertIn(field, labels)
def testWordEmbedderForDecoder(self):
    """Decoder mode should add markers, shift "ids_out", and surface OOV tokens."""
    vocab_path = test_util.make_vocab(
        os.path.join(self.get_temp_dir(), "vocab.txt"),
        ["the", "world", "hello", "toto"],
    )
    embedder = text_inputter.WordEmbedder(embedding_size=10)
    embedder.set_decoder_mode(mark_start=True, mark_end=True)
    embedder.initialize({"vocabulary": vocab_path})
    features = embedder.make_features(tf.constant("hello world !"))
    # 3 tokens plus one special marker in each of "ids" / "ids_out".
    self.assertEqual(features["length"], 4)
    self.assertEqual(embedder.get_length(features, ignore_special_tokens=True), 3)
    self.assertAllEqual(features["ids"], [1, 5, 4, 7])
    self.assertAllEqual(features["ids_out"], [5, 4, 7, 2])
    # "!" is not in the vocabulary and should be reported as OOV.
    oov_tokens = embedder.get_oov_tokens(features)
    self.assertListEqual(oov_tokens.numpy().flatten().tolist(), [b"!"])
def testMixedInputter(self):
    """MixedInputter should concatenate word and char-conv embeddings into one output."""
    vocab_path = self._makeTextFile("vocab.txt", ["the", "world", "hello", "toto"])
    char_vocab_path = self._makeTextFile("vocab_alt.txt", ["h", "e", "l", "w", "o"])
    data_path = self._makeTextFile("data.txt", ["hello world !"])
    mixed_inputter = inputter.MixedInputter([
        text_inputter.WordEmbedder("vocabulary_file_1", embedding_size=10),
        text_inputter.CharConvEmbedder("vocabulary_file_2", 10, 5)],
        reducer=reducer.ConcatReducer())
    self.assertEqual(mixed_inputter.num_outputs, 1)
    features, transformed = self._makeDataset(
        mixed_inputter,
        data_path,
        metadata={"vocabulary_file_1": vocab_path, "vocabulary_file_2": char_vocab_path},
        shapes={"char_ids": [None, None, None], "ids": [None, None], "length": [None]})
    # Depth is the sum of both embedding sizes: 10 + 5.
    self.assertAllEqual([1, 3, 15], transformed.shape)
def testWordEmbedderBatchElement(self):
    """make_features on a batch of strings should pad ids and report per-example lengths."""
    vocab_path = self._makeTextFile(
        "vocab.txt", ["<blank>", "<s>", "</s>"] + list(map(str, range(10))))
    embedder = text_inputter.WordEmbedder(32)
    embedder.initialize(dict(vocabulary=vocab_path))
    features = embedder.make_features(["1 2 3", "1 2 3 4"])
    self.assertAllEqual(features["length"], [3, 4])
    # Shorter example is padded with the <blank> id 0.
    self.assertAllEqual(features["ids"], [[4, 5, 6, 0], [4, 5, 6, 7]])
    embedder.set_decoder_mode(mark_start=True, mark_end=True)
    features = embedder.make_features(["1 2 3", "1 2 3 4"])
    self.assertAllEqual(features["length"], [4, 5])
    self.assertAllEqual(features["ids"], [[1, 4, 5, 6, 0], [1, 4, 5, 6, 7]])
    self.assertAllEqual(features["ids_out"], [[4, 5, 6, 2, 0], [4, 5, 6, 7, 2]])
def testWordEmbedderWithPretrainedEmbeddings(self):
    """Pretrained embeddings should be loaded and looked up per input token."""
    data_path = self._makeTextFile("data.txt", ["hello world !"])
    vocab_path = self._makeTextFile("vocab.txt", ["the", "world", "hello", "toto"])
    embedding_path = self._makeEmbeddingsFile(
        [("hello", [1, 1]), ("world", [2, 2]), ("toto", [3, 3])])
    embedder = text_inputter.WordEmbedder(
        "vocabulary_file",
        embedding_file_key="embedding_file",
        embedding_file_with_header=False)
    features, transformed = self._makeDataset(
        embedder,
        data_path,
        metadata={"vocabulary_file": vocab_path, "embedding_file": embedding_path},
        shapes={"tokens": [None, None], "ids": [None, None], "length": [None]})
    # First two tokens resolve to their pretrained vectors.
    self.assertAllEqual([1, 1], transformed[0][0])
    self.assertAllEqual([2, 2], transformed[0][1])
def testWordEmbedder(self):
    """WordEmbedder should produce ids, lengths, embeddings, and the OOV tokens."""
    vocab_path = self._makeTextFile("vocab.txt", ["the", "world", "hello", "toto"])
    data_path = self._makeTextFile("data.txt", ["hello world !"])
    embedder = text_inputter.WordEmbedder(embedding_size=10)
    features, transformed = self._makeDataset(
        embedder,
        data_path,
        data_config={"vocabulary": vocab_path},
        shapes={"tokens": [None, None], "ids": [None, None], "length": [None]},
    )
    self.assertAllEqual([3], features["length"])
    self.assertAllEqual([[2, 1, 4]], features["ids"])
    self.assertAllEqual([1, 3, 10], transformed.shape)
    # "!" is not in the vocabulary and should be reported as OOV.
    oov_tokens = embedder.get_oov_tokens(features)
    self.assertListEqual(oov_tokens.numpy().flatten().tolist(), [b"!"])
def testWordEmbedder(self):
    """Graph-mode WordEmbedder test: run the lookup tables and check ids and shapes."""
    vocab_path = self._makeTextFile("vocab.txt", ["the", "world", "hello", "toto"])
    data_path = self._makeTextFile("data.txt", ["hello world !"])
    embedder = text_inputter.WordEmbedder("vocabulary_file", embedding_size=10)
    features, transformed = self._makeDataset(
        embedder,
        data_path,
        metadata={"vocabulary_file": vocab_path},
        shapes={"tokens": [None, None], "ids": [None, None], "length": [None]})
    with self.test_session() as sess:
        sess.run(tf.tables_initializer())
        sess.run(tf.global_variables_initializer())
        features, transformed = sess.run([features, transformed])
        self.assertAllEqual([3], features["length"])
        self.assertAllEqual([[2, 1, 4]], features["ids"])
        self.assertAllEqual([1, 3, 10], transformed.shape)
def testMixedInputter(self):
    """Graph-mode MixedInputter test: word + char-conv embeddings concatenated."""
    vocab_path = os.path.join(self.get_temp_dir(), "vocab.txt")
    char_vocab_path = os.path.join(self.get_temp_dir(), "vocab_alt.txt")
    data_path = os.path.join(self.get_temp_dir(), "data.txt")
    with open(vocab_path, "w") as vocab:
        vocab.write("the\n" "world\n" "hello\n" "toto\n")
    with open(char_vocab_path, "w") as vocab_alt:
        vocab_alt.write("h\n" "e\n" "l\n" "w\n" "o\n")
    with open(data_path, "w") as data:
        data.write("hello world !\n")
    mixed_inputter = inputter.MixedInputter(
        [
            text_inputter.WordEmbedder("vocabulary_file_1", embedding_size=10),
            text_inputter.CharConvEmbedder("vocabulary_file_2", 10, 5)
        ],
        reducer=reducer.ConcatReducer())
    features, transformed = self._makeDataset(
        mixed_inputter,
        data_path,
        metadata={
            "vocabulary_file_1": vocab_path,
            "vocabulary_file_2": char_vocab_path,
        },
        shapes={
            "char_ids": [None, None, None],
            "ids": [None, None],
            "length": [None],
        })
    # Tokens are transient and must not appear in the final features.
    self.assertNotIn("tokens", features)
    with self.test_session() as sess:
        sess.run(tf.tables_initializer())
        sess.run(tf.global_variables_initializer())
        features, transformed = sess.run([features, transformed])
        # Depth is the sum of both embedding sizes: 10 + 5.
        self.assertAllEqual([1, 3, 15], transformed.shape)
def testWordEmbedderWithNoise(self):
    """Non in-place noise should add "noisy_*" fields alongside the clean ones."""
    vocab_path = self._makeTextFile("vocab.txt", ["the", "world", "hello"])
    data_path = self._makeTextFile("data.txt", ["hello world !"])
    word_noiser = noise.WordNoiser(noises=[noise.WordOmission(1)])
    embedder = text_inputter.WordEmbedder(embedding_size=10)
    embedder.set_noise(word_noiser, in_place=False)
    expected_shapes = {
        "tokens": [None, None],
        "ids": [None, None],
        "length": [None],
        "noisy_tokens": [None, None],
        "noisy_ids": [None, None],
        "noisy_length": [None],
    }
    features, transformed = self._makeDataset(
        embedder,
        data_path,
        data_config={"vocabulary": vocab_path},
        shapes=expected_shapes)
    # One word was omitted from the noisy copy.
    self.assertEqual(features["noisy_length"][0], features["length"][0] - 1)
def testWordEmbedderTarget(self):
    """A target WordEmbedder should mark sequences with <s>/</s> and shift "ids_out"."""
    vocab_path = self._makeTextFile(
        "vocab.txt",
        ["<blank>", "<s>", "</s>", "the", "world", "hello", "toto"])
    data_path = self._makeTextFile("data.txt", ["hello world !"])
    embedder = text_inputter.WordEmbedder("vocabulary_file", embedding_size=10)
    embedder.is_target = True
    features, transformed = self._makeDataset(
        embedder,
        data_path,
        metadata={"vocabulary_file": vocab_path},
        shapes={
            "tokens": [None, None],
            "ids": [None, None],
            "ids_out": [None, None],
            "length": [None],
        })
    # 3 tokens plus one special marker; "ids" starts with <s>, "ids_out" ends with </s>.
    self.assertAllEqual([4], features["length"])
    self.assertAllEqual([[1, 5, 4, 7]], features["ids"])
    self.assertAllEqual([[5, 4, 7, 2]], features["ids_out"])
def testMixedInputter(self):
    """MixedInputter serving test: word + char-conv features, concatenated output."""
    # Fix: vocab_file, vocab_alt_file, and data_file were used without ever
    # being defined, raising NameError; create them in the test temporary
    # directory as the other tests in this file do.
    vocab_file = os.path.join(self.get_temp_dir(), "vocab.txt")
    vocab_alt_file = os.path.join(self.get_temp_dir(), "vocab_alt.txt")
    data_file = os.path.join(self.get_temp_dir(), "data.txt")
    with open(vocab_file, "w") as vocab:
        vocab.write("the\n" "world\n" "hello\n" "toto\n")
    with open(vocab_alt_file, "w") as vocab_alt:
        vocab_alt.write("h\n" "e\n" "l\n" "w\n" "o\n")
    with open(data_file, "w") as data:
        data.write("hello world !\n")
    mixed_inputter = inputter.MixedInputter([
        text_inputter.WordEmbedder("vocabulary_file_1", embedding_size=10),
        text_inputter.CharConvEmbedder("vocabulary_file_2", 10, 5)],
        reducer=reducer.ConcatReducer())
    data, transformed = _first_element(
        mixed_inputter,
        data_file,
        {"vocabulary_file_1": vocab_file, "vocabulary_file_2": vocab_alt_file})
    input_receiver = mixed_inputter.get_serving_input_receiver()
    self.assertIn("ids", input_receiver.features)
    self.assertIn("char_ids", input_receiver.features)
    with self.test_session() as sess:
        sess.run(tf.tables_initializer())
        sess.run(tf.global_variables_initializer())
        data, transformed = sess.run([data, transformed])
        # Transient text fields must not appear in the final features.
        self.assertNotIn("raw", data)
        self.assertNotIn("tokens", data)
        self.assertIn("ids", data)
        self.assertIn("char_ids", data)
        # Depth is the sum of both embedding sizes: 10 + 5.
        self.assertAllEqual([1, 3, 15], transformed.shape)
def testWordEmbedderWithPretrainedEmbeddings(self):
    """Graph-mode test: pretrained vectors are loaded and looked up per token."""
    vocab_path = os.path.join(self.get_temp_dir(), "vocab.txt")
    data_path = os.path.join(self.get_temp_dir(), "data.txt")
    embedding_path = os.path.join(self.get_temp_dir(), "embedding.txt")
    with io.open(embedding_path, encoding="utf-8", mode="w") as embedding:
        embedding.write(u"hello 1 1\n" u"world 2 2\n" u"toto 3 3\n")
    with io.open(vocab_path, encoding="utf-8", mode="w") as vocab:
        vocab.write(u"the\n" u"world\n" u"hello\n" u"toto\n")
    with io.open(data_path, encoding="utf-8", mode="w") as data:
        data.write(u"hello world !\n")
    embedder = text_inputter.WordEmbedder(
        "vocabulary_file",
        embedding_file_key="embedding_file",
        embedding_file_with_header=False)
    features, transformed = self._makeDataset(
        embedder,
        data_path,
        metadata={
            "vocabulary_file": vocab_path,
            "embedding_file": embedding_path,
        },
        shapes={
            "tokens": [None, None],
            "ids": [None, None],
            "length": [None],
        })
    with self.test_session() as sess:
        sess.run(tf.tables_initializer())
        sess.run(tf.global_variables_initializer())
        features, transformed = sess.run([features, transformed])
        # First two tokens resolve to their pretrained vectors.
        self.assertAllEqual([1, 1], transformed[0][0])
        self.assertAllEqual([2, 2], transformed[0][1])
def testWordEmbedderMissingInitialization(self):
    """Using an embedder before initialize() should raise a RuntimeError mentioning it."""
    embedder = text_inputter.WordEmbedder()
    with self.assertRaisesRegex(RuntimeError, "initialize"):
        embedder.input_signature()
    with self.assertRaisesRegex(RuntimeError, "initialize"):
        embedder.make_features("Hello world !")