Example #1
0
    def testCharConvEmbedder(self):
        """Run CharConvEmbedder on a one-line corpus and check its features.

        Writes a five-character vocabulary and the sentence "hello world !"
        to temporary files, builds the dataset through ``_makeDataset`` and
        asserts the reported length, the character ids and the output shape.
        """
        vocab_path = os.path.join(self.get_temp_dir(), "vocab.txt")
        corpus_path = os.path.join(self.get_temp_dir(), "data.txt")

        with io.open(vocab_path, encoding="utf-8", mode="w") as vocab_out:
            # One character per line: h, e, l, w, o.
            vocab_out.write(u"h\ne\nl\nw\no\n")
        with io.open(corpus_path, encoding="utf-8", mode="w") as corpus_out:
            corpus_out.write(u"hello world !\n")

        char_embedder = text_inputter.CharConvEmbedder(
            "vocabulary_file", 10, 5)
        shape_spec = {
            "char_ids": [None, None, None],
            "length": [None],
        }
        feature_tensors, output_tensor = self._makeDataset(
            char_embedder,
            corpus_path,
            metadata={"vocabulary_file": vocab_path},
            shapes=shape_spec)

        # Id 5 shows up for "r", "d", "!" and for the trailing positions of
        # the short token, so it presumably doubles as the out-of-vocabulary
        # and padding id -- TODO confirm against the inputter implementation.
        expected_char_ids = [
            [[0, 1, 2, 2, 4], [3, 4, 5, 2, 5], [5, 5, 5, 5, 5]],
        ]

        with self.test_session() as sess:
            sess.run(tf.tables_initializer())
            sess.run(tf.global_variables_initializer())
            feature_values, output = sess.run(
                [feature_tensors, output_tensor])
            self.assertAllEqual([3], feature_values["length"])
            self.assertAllEqual(expected_char_ids, feature_values["char_ids"])
            self.assertAllEqual([1, 3, 5], output.shape)
Example #2
0
    def testMixedInputter(self):
        """Check the output of a word + character MixedInputter.

        Combines a WordEmbedder and a CharConvEmbedder with a ConcatReducer,
        asserts that the combination exposes a single output, and asserts
        that the transformed tensor has shape [1, 3, 15].
        """
        word_vocab = self._makeTextFile(
            "vocab.txt", ["the", "world", "hello", "toto"])
        char_vocab = self._makeTextFile(
            "vocab_alt.txt", ["h", "e", "l", "w", "o"])
        corpus = self._makeTextFile("data.txt", ["hello world !"])

        sub_inputters = [
            text_inputter.WordEmbedder(embedding_size=10),
            text_inputter.CharConvEmbedder(10, 5),
        ]
        mixed_inputter = inputter.MixedInputter(
            sub_inputters, reducer=reducer.ConcatReducer())
        self.assertEqual(mixed_inputter.num_outputs, 1)

        # Vocabulary keys are prefixed with the 1-based inputter position.
        vocabularies = {
            "1_vocabulary": word_vocab,
            "2_vocabulary": char_vocab,
        }
        shape_spec = {
            "char_ids": [None, None, None],
            "ids": [None, None],
            "length": [None],
        }
        _, output = self._makeDataset(
            mixed_inputter,
            corpus,
            data_config=vocabularies,
            shapes=shape_spec)

        # Last dimension is presumably 10 (word embedding) + 5 (char conv
        # outputs) after concatenation -- TODO confirm.
        self.assertAllEqual([1, 3, 15], output.shape)
Example #3
0
    def testMixedInputter(self):
        """Check the output shape of a word + character MixedInputter.

        Combines a WordEmbedder and a CharConvEmbedder with a ConcatReducer,
        asserts that the combination exposes a single output, then evaluates
        the dataset in a session and asserts the transformed shape.
        """
        word_vocab = self._makeTextFile(
            "vocab.txt", ["the", "world", "hello", "toto"])
        char_vocab = self._makeTextFile(
            "vocab_alt.txt", ["h", "e", "l", "w", "o"])
        corpus = self._makeTextFile("data.txt", ["hello world !"])

        sub_inputters = [
            text_inputter.WordEmbedder("vocabulary_file_1", embedding_size=10),
            text_inputter.CharConvEmbedder("vocabulary_file_2", 10, 5),
        ]
        mixed_inputter = inputter.MixedInputter(
            sub_inputters, reducer=reducer.ConcatReducer())
        self.assertEqual(mixed_inputter.num_outputs, 1)

        # Each metadata key matches the vocabulary key given to one embedder.
        vocab_metadata = {
            "vocabulary_file_1": word_vocab,
            "vocabulary_file_2": char_vocab,
        }
        shape_spec = {
            "char_ids": [None, None, None],
            "ids": [None, None],
            "length": [None],
        }
        feature_tensors, output_tensor = self._makeDataset(
            mixed_inputter,
            corpus,
            metadata=vocab_metadata,
            shapes=shape_spec)

        with self.test_session() as sess:
            sess.run(tf.tables_initializer())
            sess.run(tf.global_variables_initializer())
            _, output = sess.run([feature_tensors, output_tensor])
            # Last dimension is presumably 10 (word embedding) + 5 (char
            # conv outputs) after concatenation -- TODO confirm.
            self.assertAllEqual([1, 3, 15], output.shape)
Example #4
0
  def testCharConvEmbedder(self):
    """Run CharConvEmbedder on a one-line corpus and check its features.

    Writes a five-character vocabulary and the sentence "hello world !" to
    files in the test's temporary directory, fetches the first dataset
    element through ``_first_element``, checks the serving input receiver
    shapes, and asserts the evaluated features and output shape.
    """
    # Function-scope import so the path handling below does not rely on a
    # module-level import that is not visible from this block.
    import os

    # The file paths were previously referenced without being bound in this
    # method; create them under the per-test temporary directory.
    vocab_file = os.path.join(self.get_temp_dir(), "vocab.txt")
    data_file = os.path.join(self.get_temp_dir(), "data.txt")

    with open(vocab_file, "w") as vocab:
      vocab.write("h\n"
                  "e\n"
                  "l\n"
                  "w\n"
                  "o\n")
    with open(data_file, "w") as data_out:
      data_out.write("hello world !\n")

    embedder = text_inputter.CharConvEmbedder("vocabulary_file", 10, 5)
    features, transformed = _first_element(
        embedder, data_file, {"vocabulary_file": vocab_file})

    input_receiver = embedder.get_serving_input_receiver()
    self.assertAllEqual(
        [None, None, None],
        input_receiver.features["char_ids"].get_shape().as_list())
    self.assertAllEqual(
        [None],
        input_receiver.features["length"].get_shape().as_list())

    with self.test_session() as sess:
      sess.run(tf.tables_initializer())
      sess.run(tf.global_variables_initializer())
      features, transformed = sess.run([features, transformed])
      # Only the numeric features are expected in the evaluated element.
      self.assertNotIn("raw", features)
      self.assertNotIn("tokens", features)
      self.assertAllEqual([3], features["length"])
      # Id 5 appears for "r", "d", "!" and trailing positions, so it
      # presumably serves as the out-of-vocabulary and padding id
      # -- TODO confirm against the inputter implementation.
      self.assertAllEqual(
          [[[0, 1, 2, 2, 4], [3, 4, 5, 2, 5], [5, 5, 5, 5, 5]]],
          features["char_ids"])
      self.assertAllEqual([1, 3, 5], transformed.shape)
Example #5
0
    def testMixedInputter(self):
        """Check features and output shape of a word + character MixedInputter.

        Writes a word vocabulary, a character vocabulary and the sentence
        "hello world !" to files in the test's temporary directory, combines
        a WordEmbedder and a CharConvEmbedder with a ConcatReducer, and
        asserts which features are exposed and the transformed shape.
        """
        # Function-scope import so the path handling below does not rely on
        # a module-level import that is not visible from this block.
        import os

        # The file paths were previously referenced without being bound in
        # this method; create them under the per-test temporary directory.
        vocab_file = os.path.join(self.get_temp_dir(), "vocab.txt")
        vocab_alt_file = os.path.join(self.get_temp_dir(), "vocab_alt.txt")
        data_file = os.path.join(self.get_temp_dir(), "data.txt")

        with open(vocab_file, "w") as vocab:
            vocab.write("the\n" "world\n" "hello\n" "toto\n")
        with open(vocab_alt_file, "w") as vocab_alt:
            vocab_alt.write("h\n" "e\n" "l\n" "w\n" "o\n")
        with open(data_file, "w") as data_out:
            data_out.write("hello world !\n")

        mixed_inputter = inputter.MixedInputter(
            [
                text_inputter.WordEmbedder("vocabulary_file_1",
                                           embedding_size=10),
                text_inputter.CharConvEmbedder("vocabulary_file_2", 10, 5)
            ],
            reducer=reducer.ConcatReducer())

        features, transformed = _first_element(mixed_inputter, data_file, {
            "vocabulary_file_1": vocab_file,
            "vocabulary_file_2": vocab_alt_file
        })

        # The serving receiver must expose the inputs of both sub-inputters.
        input_receiver = mixed_inputter.get_serving_input_receiver()
        self.assertIn("ids", input_receiver.features)
        self.assertIn("char_ids", input_receiver.features)

        with self.test_session() as sess:
            sess.run(tf.tables_initializer())
            sess.run(tf.global_variables_initializer())
            features, transformed = sess.run([features, transformed])
            self.assertNotIn("raw", features)
            self.assertNotIn("tokens", features)
            self.assertIn("ids", features)
            self.assertIn("char_ids", features)
            # Last dimension is presumably 10 (word embedding) + 5 (char
            # conv outputs) after concatenation -- TODO confirm.
            self.assertAllEqual([1, 3, 15], transformed.shape)
Example #6
0
  def testCharConvEmbedder(self):
    """Run CharConvEmbedder on a one-line corpus and check its features.

    Builds a five-character vocabulary and a one-sentence data file with
    ``_makeTextFile``, passes the embedder through ``_makeDataset`` and
    asserts the reported length, the character ids and the output shape.
    """
    char_vocab = self._makeTextFile("vocab.txt", ["h", "e", "l", "w", "o"])
    corpus = self._makeTextFile("data.txt", ["hello world !"])

    char_embedder = text_inputter.CharConvEmbedder("vocabulary_file", 10, 5)
    shape_spec = {
        "char_ids": [None, None, None],
        "length": [None],
    }
    feature_values, output = self._makeDataset(
        char_embedder,
        corpus,
        metadata={"vocabulary_file": char_vocab},
        shapes=shape_spec)

    # Id 5 appears for "r", "d", "!" and trailing positions, so it
    # presumably serves as the out-of-vocabulary and padding id
    # -- TODO confirm against the inputter implementation.
    expected_char_ids = [
        [[0, 1, 2, 2, 4], [3, 4, 5, 2, 5], [5, 5, 5, 5, 5]],
    ]

    self.assertAllEqual([3], feature_values["length"])
    self.assertAllEqual(expected_char_ids, feature_values["char_ids"])
    self.assertAllEqual([1, 3, 5], output.shape)
Example #7
0
    def testMixedInputter(self):
        """Check features and output shape of a word + character MixedInputter.

        Writes a word vocabulary, a character vocabulary and the sentence
        "hello world !" to temporary files, combines a WordEmbedder and a
        CharConvEmbedder with a ConcatReducer, asserts that no "tokens"
        feature is returned, and asserts the evaluated output shape.
        """
        word_vocab_path = os.path.join(self.get_temp_dir(), "vocab.txt")
        char_vocab_path = os.path.join(self.get_temp_dir(), "vocab_alt.txt")
        corpus_path = os.path.join(self.get_temp_dir(), "data.txt")

        with open(word_vocab_path, "w") as word_vocab_out:
            word_vocab_out.write("the\nworld\nhello\ntoto\n")
        with open(char_vocab_path, "w") as char_vocab_out:
            char_vocab_out.write("h\ne\nl\nw\no\n")
        with open(corpus_path, "w") as corpus_out:
            corpus_out.write("hello world !\n")

        sub_inputters = [
            text_inputter.WordEmbedder("vocabulary_file_1", embedding_size=10),
            text_inputter.CharConvEmbedder("vocabulary_file_2", 10, 5),
        ]
        mixed_inputter = inputter.MixedInputter(
            sub_inputters, reducer=reducer.ConcatReducer())

        # Each metadata key matches the vocabulary key given to one embedder.
        vocab_metadata = {
            "vocabulary_file_1": word_vocab_path,
            "vocabulary_file_2": char_vocab_path,
        }
        shape_spec = {
            "char_ids": [None, None, None],
            "ids": [None, None],
            "length": [None],
        }
        feature_tensors, output_tensor = self._makeDataset(
            mixed_inputter,
            corpus_path,
            metadata=vocab_metadata,
            shapes=shape_spec)

        self.assertNotIn("tokens", feature_tensors)

        with self.test_session() as sess:
            sess.run(tf.tables_initializer())
            sess.run(tf.global_variables_initializer())
            _, output = sess.run([feature_tensors, output_tensor])
            # Last dimension is presumably 10 (word embedding) + 5 (char
            # conv outputs) after concatenation -- TODO confirm.
            self.assertAllEqual([1, 3, 15], output.shape)