def testCharConvEmbedder(self):
    """Checks character ids lookup and the convolution output shape."""
    vocab_path = os.path.join(self.get_temp_dir(), "vocab.txt")
    corpus_path = os.path.join(self.get_temp_dir(), "data.txt")
    with io.open(vocab_path, encoding="utf-8", mode="w") as vocab_f:
        for symbol in (u"h", u"e", u"l", u"w", u"o"):
            vocab_f.write(symbol + u"\n")
    with io.open(corpus_path, encoding="utf-8", mode="w") as corpus_f:
        corpus_f.write(u"hello world !\n")
    embedder = text_inputter.CharConvEmbedder("vocabulary_file", 10, 5)
    features, transformed = self._makeDataset(
        embedder,
        corpus_path,
        metadata={"vocabulary_file": vocab_path},
        shapes={"char_ids": [None, None, None], "length": [None]})
    with self.test_session() as sess:
        sess.run(tf.tables_initializer())
        sess.run(tf.global_variables_initializer())
        features, transformed = sess.run([features, transformed])
        self.assertAllEqual([3], features["length"])
        # Out-of-vocabulary characters map to id 5 (vocab size), and short
        # words are padded with that same id.
        self.assertAllEqual(
            [[[0, 1, 2, 2, 4], [3, 4, 5, 2, 5], [5, 5, 5, 5, 5]]],
            features["char_ids"])
        self.assertAllEqual([1, 3, 5], transformed.shape)
def testWordEmbedder(self):
    """Checks token-to-id lookup and the word embedding output shape."""
    vocab_path = os.path.join(self.get_temp_dir(), "vocab.txt")
    corpus_path = os.path.join(self.get_temp_dir(), "data.txt")
    with io.open(vocab_path, encoding="utf-8", mode="w") as vocab_f:
        for token in (u"the", u"world", u"hello", u"toto"):
            vocab_f.write(token + u"\n")
    with io.open(corpus_path, encoding="utf-8", mode="w") as corpus_f:
        corpus_f.write(u"hello world !\n")
    embedder = text_inputter.WordEmbedder("vocabulary_file", embedding_size=10)
    features, transformed = self._makeDataset(
        embedder,
        corpus_path,
        metadata={"vocabulary_file": vocab_path},
        shapes={"tokens": [None, None], "ids": [None, None], "length": [None]})
    with self.test_session() as sess:
        sess.run(tf.tables_initializer())
        sess.run(tf.global_variables_initializer())
        features, transformed = sess.run([features, transformed])
        self.assertAllEqual([3], features["length"])
        # "!" is out of vocabulary and maps to id 4 (the vocab size).
        self.assertAllEqual([[2, 1, 4]], features["ids"])
        self.assertAllEqual([1, 3, 10], transformed.shape)
def testWordEmbedderWithPretrainedEmbeddings(self):
    """Checks that pretrained embedding vectors are loaded and looked up."""
    vocab_path = os.path.join(self.get_temp_dir(), "vocab.txt")
    corpus_path = os.path.join(self.get_temp_dir(), "data.txt")
    embedding_path = os.path.join(self.get_temp_dir(), "embedding.txt")
    with io.open(embedding_path, encoding="utf-8", mode="w") as embedding_f:
        for entry in (u"hello 1 1", u"world 2 2", u"toto 3 3"):
            embedding_f.write(entry + u"\n")
    with io.open(vocab_path, encoding="utf-8", mode="w") as vocab_f:
        for token in (u"the", u"world", u"hello", u"toto"):
            vocab_f.write(token + u"\n")
    with io.open(corpus_path, encoding="utf-8", mode="w") as corpus_f:
        corpus_f.write(u"hello world !\n")
    embedder = text_inputter.WordEmbedder(
        "vocabulary_file",
        embedding_file_key="embedding_file",
        embedding_file_with_header=False)
    features, transformed = self._makeDataset(
        embedder,
        corpus_path,
        metadata={
            "vocabulary_file": vocab_path,
            "embedding_file": embedding_path},
        shapes={"ids": [None, None], "length": [None]})
    with self.test_session() as sess:
        sess.run(tf.tables_initializer())
        sess.run(tf.global_variables_initializer())
        features, transformed = sess.run([features, transformed])
        # The embedded sequence must reproduce the pretrained vectors
        # for "hello" and "world".
        self.assertAllEqual([1, 1], transformed[0][0])
        self.assertAllEqual([2, 2], transformed[0][1])
def testParallelInputter(self):
    """Checks that ParallelInputter reads aligned files and prefixes features.

    Fix: use ``io.open`` with an explicit UTF-8 encoding and unicode
    literals, consistent with the other tests in this file (plain ``open``
    has no explicit encoding and would reject ``str`` payloads differently
    across Python 2/3).
    """
    vocab_file = os.path.join(self.get_temp_dir(), "vocab.txt")
    data_file = os.path.join(self.get_temp_dir(), "data.txt")
    with io.open(vocab_file, encoding="utf-8", mode="w") as vocab:
        vocab.write(u"the\n"
                    u"world\n"
                    u"hello\n"
                    u"toto\n")
    with io.open(data_file, encoding="utf-8", mode="w") as data:
        data.write(u"hello world !\n")
    # The same file is fed to both nested inputters.
    data_files = [data_file, data_file]
    parallel_inputter = inputter.ParallelInputter([
        text_inputter.WordEmbedder("vocabulary_file_1", embedding_size=10),
        text_inputter.WordEmbedder("vocabulary_file_2", embedding_size=5)])
    features, transformed = self._makeDataset(
        parallel_inputter,
        data_files,
        metadata={
            "vocabulary_file_1": vocab_file,
            "vocabulary_file_2": vocab_file},
        shapes={
            "inputter_0_ids": [None, None],
            "inputter_0_length": [None],
            "inputter_1_ids": [None, None],
            "inputter_1_length": [None]})
    self.assertEqual(2, len(parallel_inputter.get_length(features)))
    # Intermediate string features must not leak into the final features.
    self.assertNotIn("inputter_0_raw", features)
    self.assertNotIn("inputter_0_tokens", features)
    self.assertNotIn("inputter_1_raw", features)
    self.assertNotIn("inputter_1_tokens", features)
    with self.test_session() as sess:
        sess.run(tf.tables_initializer())
        sess.run(tf.global_variables_initializer())
        features, transformed = sess.run([features, transformed])
        # One output per nested inputter, each with its own embedding size.
        self.assertEqual(2, len(transformed))
        self.assertAllEqual([1, 3, 10], transformed[0].shape)
        self.assertAllEqual([1, 3, 5], transformed[1].shape)
def testMixedInputter(self):
    """Checks that MixedInputter concatenates word and character outputs.

    Fix: use ``io.open`` with an explicit UTF-8 encoding and unicode
    literals, consistent with the other tests in this file.
    """
    vocab_file = os.path.join(self.get_temp_dir(), "vocab.txt")
    vocab_alt_file = os.path.join(self.get_temp_dir(), "vocab_alt.txt")
    data_file = os.path.join(self.get_temp_dir(), "data.txt")
    with io.open(vocab_file, encoding="utf-8", mode="w") as vocab:
        vocab.write(u"the\n"
                    u"world\n"
                    u"hello\n"
                    u"toto\n")
    with io.open(vocab_alt_file, encoding="utf-8", mode="w") as vocab_alt:
        vocab_alt.write(u"h\n"
                        u"e\n"
                        u"l\n"
                        u"w\n"
                        u"o\n")
    with io.open(data_file, encoding="utf-8", mode="w") as data:
        data.write(u"hello world !\n")
    mixed_inputter = inputter.MixedInputter(
        [
            text_inputter.WordEmbedder("vocabulary_file_1", embedding_size=10),
            text_inputter.CharConvEmbedder("vocabulary_file_2", 10, 5)
        ],
        reducer=reducer.ConcatReducer())
    features, transformed = self._makeDataset(
        mixed_inputter,
        data_file,
        metadata={
            "vocabulary_file_1": vocab_file,
            "vocabulary_file_2": vocab_alt_file},
        shapes={
            "char_ids": [None, None, None],
            "ids": [None, None],
            "length": [None]})
    # Intermediate string features must not leak into the final features.
    self.assertNotIn("tokens", features)
    with self.test_session() as sess:
        sess.run(tf.tables_initializer())
        sess.run(tf.global_variables_initializer())
        features, transformed = sess.run([features, transformed])
        # Depth is the concatenation of both outputs: 10 (word) + 5 (char).
        self.assertAllEqual([1, 3, 15], transformed.shape)
def testCharRNNEmbedder(self):
    """Checks the output shape of the character RNN embedder.

    Fix: use ``io.open`` with an explicit UTF-8 encoding and unicode
    literals, consistent with the other tests in this file.
    """
    vocab_file = os.path.join(self.get_temp_dir(), "vocab.txt")
    data_file = os.path.join(self.get_temp_dir(), "data.txt")
    with io.open(vocab_file, encoding="utf-8", mode="w") as vocab:
        vocab.write(u"h\n"
                    u"e\n"
                    u"l\n"
                    u"w\n"
                    u"o\n")
    with io.open(data_file, encoding="utf-8", mode="w") as data:
        data.write(u"hello world !\n")
    embedder = text_inputter.CharRNNEmbedder("vocabulary_file", 10, 5)
    features, transformed = self._makeDataset(
        embedder,
        data_file,
        metadata={"vocabulary_file": vocab_file},
        shapes={
            "char_ids": [None, None, None],
            "length": [None]})
    with self.test_session() as sess:
        sess.run(tf.tables_initializer())
        sess.run(tf.global_variables_initializer())
        features, transformed = sess.run([features, transformed])
        self.assertAllEqual([1, 3, 5], transformed.shape)