# Example 1
    def testCharConvEmbedder(self):
        """Checks char ids and output depth of a CharConvEmbedder on one line."""
        vocab_path = os.path.join(self.get_temp_dir(), "vocab.txt")
        data_path = os.path.join(self.get_temp_dir(), "data.txt")

        # Character vocabulary and a single-sentence data file.
        with io.open(vocab_path, encoding="utf-8", mode="w") as vocab_out:
            vocab_out.write(u"h\ne\nl\nw\no\n")
        with io.open(data_path, encoding="utf-8", mode="w") as data_out:
            data_out.write(u"hello world !\n")

        embedder = text_inputter.CharConvEmbedder("vocabulary_file", 10, 5)
        expected_shapes = {
            "char_ids": [None, None, None],
            "length": [None],
        }
        features, transformed = self._makeDataset(
            embedder,
            data_path,
            metadata={"vocabulary_file": vocab_path},
            shapes=expected_shapes)

        with self.test_session() as sess:
            sess.run(tf.tables_initializer())
            sess.run(tf.global_variables_initializer())
            features, transformed = sess.run([features, transformed])
            # 3 tokens; unknown chars map to the OOV id, padding fills "!".
            self.assertAllEqual([3], features["length"])
            self.assertAllEqual(
                [[[0, 1, 2, 2, 4], [3, 4, 5, 2, 5], [5, 5, 5, 5, 5]]],
                features["char_ids"])
            # Output is [batch, time, num_outputs].
            self.assertAllEqual([1, 3, 5], transformed.shape)
# Example 2
    def testWordEmbedder(self):
        """Checks token ids and embedding depth of a WordEmbedder."""
        vocab_path = os.path.join(self.get_temp_dir(), "vocab.txt")
        data_path = os.path.join(self.get_temp_dir(), "data.txt")

        # Word vocabulary and a single-sentence data file.
        with io.open(vocab_path, encoding="utf-8", mode="w") as vocab_out:
            vocab_out.write(u"the\nworld\nhello\ntoto\n")
        with io.open(data_path, encoding="utf-8", mode="w") as data_out:
            data_out.write(u"hello world !\n")

        embedder = text_inputter.WordEmbedder("vocabulary_file",
                                              embedding_size=10)
        expected_shapes = {
            "tokens": [None, None],
            "ids": [None, None],
            "length": [None],
        }
        features, transformed = self._makeDataset(
            embedder,
            data_path,
            metadata={"vocabulary_file": vocab_path},
            shapes=expected_shapes)

        with self.test_session() as sess:
            sess.run(tf.tables_initializer())
            sess.run(tf.global_variables_initializer())
            features, transformed = sess.run([features, transformed])
            # 3 tokens; "!" is out of vocabulary and maps to id 4.
            self.assertAllEqual([3], features["length"])
            self.assertAllEqual([[2, 1, 4]], features["ids"])
            # Output is [batch, time, embedding_size].
            self.assertAllEqual([1, 3, 10], transformed.shape)
# Example 3
    def testWordEmbedderWithPretrainedEmbeddings(self):
        """Checks that pretrained embedding vectors are loaded per token."""
        vocab_path = os.path.join(self.get_temp_dir(), "vocab.txt")
        data_path = os.path.join(self.get_temp_dir(), "data.txt")
        embedding_path = os.path.join(self.get_temp_dir(), "embedding.txt")

        # Pretrained embeddings in "word v1 v2" format, no header line.
        with io.open(embedding_path, encoding="utf-8", mode="w") as emb_out:
            emb_out.write(u"hello 1 1\nworld 2 2\ntoto 3 3\n")
        with io.open(vocab_path, encoding="utf-8", mode="w") as vocab_out:
            vocab_out.write(u"the\nworld\nhello\ntoto\n")
        with io.open(data_path, encoding="utf-8", mode="w") as data_out:
            data_out.write(u"hello world !\n")

        embedder = text_inputter.WordEmbedder(
            "vocabulary_file",
            embedding_file_key="embedding_file",
            embedding_file_with_header=False)
        metadata = {
            "vocabulary_file": vocab_path,
            "embedding_file": embedding_path,
        }
        features, transformed = self._makeDataset(
            embedder,
            data_path,
            metadata=metadata,
            shapes={
                "ids": [None, None],
                "length": [None],
            })

        with self.test_session() as sess:
            sess.run(tf.tables_initializer())
            sess.run(tf.global_variables_initializer())
            features, transformed = sess.run([features, transformed])
            # "hello" and "world" should receive their pretrained vectors.
            self.assertAllEqual([1, 1], transformed[0][0])
            self.assertAllEqual([2, 2], transformed[0][1])
# Example 4
    def testParallelInputter(self):
        """Checks that a ParallelInputter exposes prefixed per-inputter features.

        Two WordEmbedders read the same data file; features are prefixed with
        "inputter_N_" and raw/token features are dropped from the output.
        """
        vocab_file = os.path.join(self.get_temp_dir(), "vocab.txt")
        data_file = os.path.join(self.get_temp_dir(), "data.txt")

        # Use io.open with an explicit UTF-8 encoding for consistency with
        # the other tests in this file (and Python 2/3 portability).
        with io.open(vocab_file, encoding="utf-8", mode="w") as vocab:
            vocab.write(u"the\n" u"world\n" u"hello\n" u"toto\n")
        with io.open(data_file, encoding="utf-8", mode="w") as data:
            data.write(u"hello world !\n")

        # Both inputters read the same file, with different embedding sizes.
        data_files = [data_file, data_file]

        parallel_inputter = inputter.ParallelInputter([
            text_inputter.WordEmbedder("vocabulary_file_1", embedding_size=10),
            text_inputter.WordEmbedder("vocabulary_file_2", embedding_size=5)
        ])
        features, transformed = self._makeDataset(
            parallel_inputter,
            data_files,
            metadata={
                "vocabulary_file_1": vocab_file,
                "vocabulary_file_2": vocab_file
            },
            shapes={
                "inputter_0_ids": [None, None],
                "inputter_0_length": [None],
                "inputter_1_ids": [None, None],
                "inputter_1_length": [None]
            })

        self.assertEqual(2, len(parallel_inputter.get_length(features)))
        # Raw and tokens features should not be exported by the parallel view.
        self.assertNotIn("inputter_0_raw", features)
        self.assertNotIn("inputter_0_tokens", features)
        self.assertNotIn("inputter_1_raw", features)
        self.assertNotIn("inputter_1_tokens", features)

        with self.test_session() as sess:
            sess.run(tf.tables_initializer())
            sess.run(tf.global_variables_initializer())
            features, transformed = sess.run([features, transformed])
            # One output per inputter, each [batch, time, embedding_size].
            self.assertEqual(2, len(transformed))
            self.assertAllEqual([1, 3, 10], transformed[0].shape)
            self.assertAllEqual([1, 3, 5], transformed[1].shape)
# Example 5
    def testMixedInputter(self):
        """Checks that a MixedInputter concatenates word and char embeddings.

        A WordEmbedder (size 10) and a CharConvEmbedder (output 5) are reduced
        with ConcatReducer, so the final depth is 15.
        """
        vocab_file = os.path.join(self.get_temp_dir(), "vocab.txt")
        vocab_alt_file = os.path.join(self.get_temp_dir(), "vocab_alt.txt")
        data_file = os.path.join(self.get_temp_dir(), "data.txt")

        # Use io.open with an explicit UTF-8 encoding for consistency with
        # the other tests in this file (and Python 2/3 portability).
        with io.open(vocab_file, encoding="utf-8", mode="w") as vocab:
            vocab.write(u"the\n" u"world\n" u"hello\n" u"toto\n")
        with io.open(vocab_alt_file, encoding="utf-8", mode="w") as vocab_alt:
            vocab_alt.write(u"h\n" u"e\n" u"l\n" u"w\n" u"o\n")
        with io.open(data_file, encoding="utf-8", mode="w") as data:
            data.write(u"hello world !\n")

        mixed_inputter = inputter.MixedInputter(
            [
                text_inputter.WordEmbedder("vocabulary_file_1",
                                           embedding_size=10),
                text_inputter.CharConvEmbedder("vocabulary_file_2", 10, 5)
            ],
            reducer=reducer.ConcatReducer())
        features, transformed = self._makeDataset(
            mixed_inputter,
            data_file,
            metadata={
                "vocabulary_file_1": vocab_file,
                "vocabulary_file_2": vocab_alt_file
            },
            shapes={
                "char_ids": [None, None, None],
                "ids": [None, None],
                "length": [None]
            })

        # The tokens feature should not be exported.
        self.assertNotIn("tokens", features)

        with self.test_session() as sess:
            sess.run(tf.tables_initializer())
            sess.run(tf.global_variables_initializer())
            features, transformed = sess.run([features, transformed])
            # Concatenated depth: 10 (word) + 5 (char conv) = 15.
            self.assertAllEqual([1, 3, 15], transformed.shape)
# Example 6
    def testCharRNNEmbedder(self):
        """Checks the output shape of a CharRNNEmbedder on a single sentence."""
        vocab_file = os.path.join(self.get_temp_dir(), "vocab.txt")
        data_file = os.path.join(self.get_temp_dir(), "data.txt")

        # Use io.open with an explicit UTF-8 encoding for consistency with
        # the other tests in this file (and Python 2/3 portability).
        with io.open(vocab_file, encoding="utf-8", mode="w") as vocab:
            vocab.write(u"h\n" u"e\n" u"l\n" u"w\n" u"o\n")
        with io.open(data_file, encoding="utf-8", mode="w") as data:
            data.write(u"hello world !\n")

        embedder = text_inputter.CharRNNEmbedder("vocabulary_file", 10, 5)
        features, transformed = self._makeDataset(
            embedder,
            data_file,
            metadata={"vocabulary_file": vocab_file},
            shapes={
                "char_ids": [None, None, None],
                "length": [None]
            })

        with self.test_session() as sess:
            sess.run(tf.tables_initializer())
            sess.run(tf.global_variables_initializer())
            features, transformed = sess.run([features, transformed])
            # Output is [batch, time, rnn_output_size].
            self.assertAllEqual([1, 3, 5], transformed.shape)