Example 1
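Builds an ExampleInputter from source and target WordEmbedders and checks that the dataset yields a (features, labels) pair with the expected fields.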
  def testExampleInputter(self):
    vocab_file = self._makeTextFile("vocab.txt", ["the", "world", "hello", "toto"])
    data_file = self._makeTextFile("data.txt", ["hello world !"])

    source_inputter = text_inputter.WordEmbedder("vocabulary_file_1", embedding_size=10)
    target_inputter = text_inputter.WordEmbedder("vocabulary_file_2", embedding_size=10)
    example_inputter = inputter.ExampleInputter(source_inputter, target_inputter)
    self.assertEqual(example_inputter.num_outputs, 2)

    features, transformed = self._makeDataset(
        example_inputter,
        [data_file, data_file],
        metadata={"vocabulary_file_1": vocab_file, "vocabulary_file_2": vocab_file})

    self.assertIsInstance(features, tuple)
    self.assertEqual(len(features), 2)
    self.assertEqual(len(transformed), 2)
    features, labels = features
    for field in ("ids", "length", "tokens"):
      self.assertIn(field, features)
    for field in ("ids", "ids_out", "length", "tokens"):
      self.assertIn(field, labels)
Example 2
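Pairs a WordEmbedder with an in-graph CharacterTokenizer and checks that the serving input signature exposes a raw "text" field.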
    def testWordEmbedderWithInGraphTokenizer(self):
        vocab_file = self._makeTextFile("vocab.txt", ["the", "world", "hello", "■"])
        embedder = text_inputter.WordEmbedder(embedding_size=10)
        data_config = {
            "vocabulary": vocab_file,
            "tokenization": {"type": "CharacterTokenizer"},
        }
        embedder.initialize(data_config)
        self.assertIn("text", embedder.input_signature())
        self._testServing(embedder)
Example 3
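A parameterized test that applies in-place word-omission noise with probability 0 or 1 and checks the resulting sequence length.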
    @parameterized.expand([[0], [1]])  # assumed decorator supplying `probability`; values match the assertion below
    def testWordEmbedderWithInPlaceNoise(self, probability):
        vocab_file = self._makeTextFile("vocab.txt", ["the", "world", "hello"])
        data_file = self._makeTextFile("data.txt", ["hello world !"])
        noiser = noise.WordNoiser(noises=[noise.WordOmission(1)])
        embedder = text_inputter.WordEmbedder(embedding_size=10)
        embedder.set_noise(noiser, probability=probability)
        features, transformed = self._makeDataset(
            embedder,
            data_file,
            data_config={"vocabulary": vocab_file},
            shapes={"tokens": [None, None], "ids": [None, None], "length": [None]},
        )
        self.assertEqual(features["length"][0], 3 if probability == 0 else 2)
Example 4
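Nests ParallelInputters with share_parameters=True and asserts that every inputter reuses the target inputter's embedding variable.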
    def testNestedParallelInputterShareParameters(self):
        vocab_file = self._makeTextFile("vocab.txt", ["the", "world", "hello", "toto"])
        metadata = {"vocabulary_file": vocab_file}
        source_inputters = [
            text_inputter.WordEmbedder("vocabulary_file", embedding_size=10),
            text_inputter.WordEmbedder("vocabulary_file", embedding_size=10),
        ]
        target_inputter = text_inputter.WordEmbedder("vocabulary_file", embedding_size=10)
        inputters = [
            inputter.ParallelInputter(source_inputters, share_parameters=True),
            target_inputter,
        ]
        parallel_inputter = inputter.ParallelInputter(inputters, share_parameters=True)
        parallel_inputter.initialize(metadata)
        parallel_inputter.build()
        self.assertEqual(source_inputters[0].embedding, target_inputter.embedding)
        self.assertEqual(source_inputters[1].embedding, target_inputter.embedding)
Example 5
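Enables decoder mode on a WordEmbedder and checks the start/end markers in "ids" and the shifted "ids_out" sequence.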
    def testWordEmbedderForDecoder(self):
        vocab_file = test_util.make_vocab(
            os.path.join(self.get_temp_dir(), "vocab.txt"),
            ["the", "world", "hello", "toto"])
        embedder = text_inputter.WordEmbedder(embedding_size=10)
        embedder.set_decoder_mode(mark_start=True, mark_end=True)
        embedder.initialize({"vocabulary": vocab_file})
        features = self.evaluate(embedder.make_features(tf.constant("hello world")))
        self.assertEqual(features["length"], 3)
        self.assertEqual(embedder.get_length(features, ignore_special_tokens=True), 2)
        self.assertAllEqual(features["ids"], [1, 5, 4])
        self.assertAllEqual(features["ids_out"], [5, 4, 2])
Example 6
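Wires source and target WordEmbedders into a SequenceToSequenceInputter and checks the shifted target labels.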
 def testSequenceToSequenceInputter(self):
   source_vocabulary = test_util.make_data_file(
       os.path.join(self.get_temp_dir(), "src_vocab.txt"),
       ["<blank>", "<s>", "</s>", "a", "b", "c", "d"])
   target_vocabulary = test_util.make_data_file(
       os.path.join(self.get_temp_dir(), "tgt_vocab.txt"),
       ["<blank>", "<s>", "</s>", "e", "f", "g", "h"])
   source_file = test_util.make_data_file(
       os.path.join(self.get_temp_dir(), "src.txt"), ["a c c", "b d", "a e"])
   target_file = test_util.make_data_file(
       os.path.join(self.get_temp_dir(), "tgt.txt"), ["f h g", "e h", "a e"])
   inputter = sequence_to_sequence.SequenceToSequenceInputter(
       text_inputter.WordEmbedder(embedding_size=20),
       text_inputter.WordEmbedder(embedding_size=20))
   inputter.initialize(dict(
       source_vocabulary=source_vocabulary, target_vocabulary=target_vocabulary))
   dataset = inputter.make_dataset([source_file, target_file])
    element = next(iter(dataset))
   features, labels = inputter.make_features(element)
   self.assertIn("ids_out", labels)
   self.assertAllEqual(labels["ids"], [1, 4, 6, 5])
   self.assertAllEqual(labels["ids_out"], [4, 6, 5, 2])
   self.assertEqual(labels["length"], 4)
Example 7
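A TF1 session-based ParallelInputter test that checks the serving input receiver features and the shapes of the transformed outputs.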
  def testParallelInputter(self):
    vocab_file = os.path.join(self.get_temp_dir(), "vocab.txt")
    data_file = os.path.join(self.get_temp_dir(), "data.txt")

    with open(vocab_file, "w") as vocab:
      vocab.write("the\n"
                  "world\n"
                  "hello\n"
                  "toto\n")
    with open(data_file, "w") as data:
      data.write("hello world !\n")

    parallel_inputter = inputter.ParallelInputter([
        text_inputter.WordEmbedder("vocabulary_file_1", embedding_size=10),
        text_inputter.WordEmbedder("vocabulary_file_2", embedding_size=5)])

    data, transformed = _first_element(
        parallel_inputter,
        [data_file, data_file],
        {"vocabulary_file_1": vocab_file, "vocabulary_file_2": vocab_file})

    self.assertEqual(2, len(parallel_inputter.get_length(data)))

    input_receiver = parallel_inputter.get_serving_input_receiver()
    self.assertIn("inputter_0_ids", input_receiver.features)
    self.assertIn("inputter_1_ids", input_receiver.features)

    with self.test_session() as sess:
      sess.run(tf.tables_initializer())
      sess.run(tf.global_variables_initializer())
      data, transformed = sess.run([data, transformed])
      self.assertNotIn("inputter_0_raw", data)
      self.assertNotIn("inputter_0_tokens", data)
      self.assertNotIn("inputter_1_raw", data)
      self.assertNotIn("inputter_1_tokens", data)
      self.assertIn("inputter_0_ids", data)
      self.assertIn("inputter_1_ids", data)
      self.assertEqual(2, len(transformed))
      self.assertAllEqual([1, 3, 10], transformed[0].shape)
      self.assertAllEqual([1, 3, 5], transformed[1].shape)
Example 8
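Another TF1 session-based ParallelInputter test, validating feature names and shapes through _makeDataset.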
  def testParallelInputter(self):
    vocab_file = os.path.join(self.get_temp_dir(), "vocab.txt")
    data_file = os.path.join(self.get_temp_dir(), "data.txt")

    with io.open(vocab_file, encoding="utf-8", mode="w") as vocab:
      vocab.write(u"the\n"
                  u"world\n"
                  u"hello\n"
                  u"toto\n")
    with io.open(data_file, encoding="utf-8", mode="w") as data:
      data.write(u"hello world !\n")

    data_files = [data_file, data_file]

    parallel_inputter = inputter.ParallelInputter([
        text_inputter.WordEmbedder("vocabulary_file_1", embedding_size=10),
        text_inputter.WordEmbedder("vocabulary_file_2", embedding_size=5)])
    features, transformed = self._makeDataset(
        parallel_inputter,
        data_files,
        metadata={"vocabulary_file_1": vocab_file, "vocabulary_file_2": vocab_file},
        shapes={"inputter_0_ids": [None, None], "inputter_0_length": [None],
                "inputter_1_ids": [None, None], "inputter_1_length": [None]})

    self.assertEqual(2, len(parallel_inputter.get_length(features)))
    self.assertNotIn("inputter_0_raw", features)
    self.assertNotIn("inputter_0_tokens", features)
    self.assertNotIn("inputter_1_raw", features)
    self.assertNotIn("inputter_1_tokens", features)

    with self.test_session() as sess:
      sess.run(tf.tables_initializer())
      sess.run(tf.global_variables_initializer())
      features, transformed = sess.run([features, transformed])
      self.assertEqual(2, len(transformed))
      self.assertAllEqual([1, 3, 10], transformed[0].shape)
      self.assertAllEqual([1, 3, 5], transformed[1].shape)
Example 9
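A basic WordEmbedder test (TF1 metadata API) covering the embedding variable name, token ids, lengths, and the embedded output shape.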
  def testWordEmbedder(self):
    vocab_file = self._makeTextFile("vocab.txt", ["the", "world", "hello", "toto"])
    data_file = self._makeTextFile("data.txt", ["hello world !"])

    embedder = text_inputter.WordEmbedder("vocabulary_file", embedding_size=10)
    features, transformed = self._makeDataset(
        embedder,
        data_file,
        metadata={"vocabulary_file": vocab_file},
        shapes={"tokens": [None, None], "ids": [None, None], "length": [None]})

    self.assertEqual(embedder.embedding.name, "w_embs:0")
    self.assertAllEqual([3], features["length"])
    self.assertAllEqual([[2, 1, 4]], features["ids"])
    self.assertAllEqual([1, 3, 10], transformed.shape)
Example 10
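Uses ParallelInputter with combine_features=False so that source features and target labels come back as a tuple.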
  def testParallelInputterSplitFeatures(self):
    vocab_file = self._makeTextFile("vocab.txt", ["the", "world", "hello", "toto"])
    data_file = self._makeTextFile("data.txt", ["hello world !"])

    source_embedder = text_inputter.WordEmbedder("vocabulary_file_1", embedding_size=10)
    target_embedder = text_inputter.WordEmbedder("vocabulary_file_2", embedding_size=10)
    target_embedder.is_target = True
    parallel_inputter = inputter.ParallelInputter(
        [source_embedder, target_embedder], combine_features=False)
    self.assertEqual(parallel_inputter.num_outputs, 2)

    features, transformed = self._makeDataset(
        parallel_inputter,
        [data_file, data_file],
        metadata={"vocabulary_file_1": vocab_file, "vocabulary_file_2": vocab_file})

    self.assertIsInstance(features, tuple)
    self.assertEqual(len(features), 2)
    self.assertEqual(len(transformed), 2)
    features, labels = features
    for field in ("ids", "length", "tokens"):
      self.assertIn(field, features)
    for field in ("ids", "ids_out", "length", "tokens"):
      self.assertIn(field, labels)
Example 11
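An eager decoder-mode WordEmbedder test that also checks get_oov_tokens on out-of-vocabulary input.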
    def testWordEmbedderForDecoder(self):
        vocab_file = test_util.make_vocab(
            os.path.join(self.get_temp_dir(), "vocab.txt"),
            ["the", "world", "hello", "toto"],
        )
        embedder = text_inputter.WordEmbedder(embedding_size=10)
        embedder.set_decoder_mode(mark_start=True, mark_end=True)
        embedder.initialize({"vocabulary": vocab_file})
        features = embedder.make_features(tf.constant("hello world !"))
        self.assertEqual(features["length"], 4)
        self.assertEqual(embedder.get_length(features, ignore_special_tokens=True), 3)
        self.assertAllEqual(features["ids"], [1, 5, 4, 7])
        self.assertAllEqual(features["ids_out"], [5, 4, 7, 2])

        oov_tokens = embedder.get_oov_tokens(features)
        self.assertListEqual(oov_tokens.numpy().flatten().tolist(), [b"!"])
Example 12
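Concatenates a WordEmbedder and a CharConvEmbedder with a MixedInputter and checks the combined embedding depth.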
  def testMixedInputter(self):
    vocab_file = self._makeTextFile("vocab.txt", ["the", "world", "hello", "toto"])
    vocab_alt_file = self._makeTextFile("vocab_alt.txt", ["h", "e", "l", "w", "o"])
    data_file = self._makeTextFile("data.txt", ["hello world !"])

    mixed_inputter = inputter.MixedInputter([
        text_inputter.WordEmbedder("vocabulary_file_1", embedding_size=10),
        text_inputter.CharConvEmbedder("vocabulary_file_2", 10, 5)],
        reducer=reducer.ConcatReducer())
    self.assertEqual(mixed_inputter.num_outputs, 1)
    features, transformed = self._makeDataset(
        mixed_inputter,
        data_file,
        metadata={"vocabulary_file_1": vocab_file, "vocabulary_file_2": vocab_alt_file},
        shapes={"char_ids": [None, None, None], "ids": [None, None], "length": [None]})
    self.assertAllEqual([1, 3, 15], transformed.shape)
Example 13
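Feeds a batch of sentences to make_features and checks the padded ids, before and after enabling decoder mode.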
    def testWordEmbedderBatchElement(self):
        vocab_file = self._makeTextFile("vocab.txt",
                                        ["<blank>", "<s>", "</s>"] +
                                        list(map(str, range(10))))
        embedder = text_inputter.WordEmbedder(embedding_size=32)
        embedder.initialize(dict(vocabulary=vocab_file))
        features = embedder.make_features(["1 2 3", "1 2 3 4"])
        self.assertAllEqual(features["length"], [3, 4])
        self.assertAllEqual(features["ids"], [[4, 5, 6, 0], [4, 5, 6, 7]])

        embedder.set_decoder_mode(mark_start=True, mark_end=True)
        features = embedder.make_features(["1 2 3", "1 2 3 4"])
        self.assertAllEqual(features["length"], [4, 5])
        self.assertAllEqual(features["ids"],
                            [[1, 4, 5, 6, 0], [1, 4, 5, 6, 7]])
        self.assertAllEqual(features["ids_out"],
                            [[4, 5, 6, 2, 0], [4, 5, 6, 7, 2]])
Example 14
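Loads pretrained embeddings from a headerless embedding file and checks the looked-up vectors.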
  def testWordEmbedderWithPretrainedEmbeddings(self):
    data_file = self._makeTextFile("data.txt", ["hello world !"])
    vocab_file = self._makeTextFile("vocab.txt", ["the", "world", "hello", "toto"])
    embedding_file = self._makeEmbeddingsFile(
        [("hello", [1, 1]), ("world", [2, 2]), ("toto", [3, 3])])

    embedder = text_inputter.WordEmbedder(
        "vocabulary_file",
        embedding_file_key="embedding_file",
        embedding_file_with_header=False)
    features, transformed = self._makeDataset(
        embedder,
        data_file,
        metadata={"vocabulary_file": vocab_file, "embedding_file": embedding_file},
        shapes={"tokens": [None, None], "ids": [None, None], "length": [None]})

    self.assertAllEqual([1, 1], transformed[0][0])
    self.assertAllEqual([2, 2], transformed[0][1])
Example 15
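A basic WordEmbedder test (2.x data_config API) that also checks get_oov_tokens.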
    def testWordEmbedder(self):
        vocab_file = self._makeTextFile("vocab.txt", ["the", "world", "hello", "toto"])
        data_file = self._makeTextFile("data.txt", ["hello world !"])

        embedder = text_inputter.WordEmbedder(embedding_size=10)
        features, transformed = self._makeDataset(
            embedder,
            data_file,
            data_config={"vocabulary": vocab_file},
            shapes={"tokens": [None, None], "ids": [None, None], "length": [None]},
        )

        self.assertAllEqual([3], features["length"])
        self.assertAllEqual([[2, 1, 4]], features["ids"])
        self.assertAllEqual([1, 3, 10], transformed.shape)

        oov_tokens = embedder.get_oov_tokens(features)
        self.assertListEqual(oov_tokens.numpy().flatten().tolist(), [b"!"])
Example 16
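The TF1 session-based variant of the basic WordEmbedder test.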
  def testWordEmbedder(self):
    vocab_file = self._makeTextFile("vocab.txt", ["the", "world", "hello", "toto"])
    data_file = self._makeTextFile("data.txt", ["hello world !"])

    embedder = text_inputter.WordEmbedder("vocabulary_file", embedding_size=10)
    features, transformed = self._makeDataset(
        embedder,
        data_file,
        metadata={"vocabulary_file": vocab_file},
        shapes={"tokens": [None, None], "ids": [None, None], "length": [None]})

    with self.test_session() as sess:
      sess.run(tf.tables_initializer())
      sess.run(tf.global_variables_initializer())
      features, transformed = sess.run([features, transformed])
      self.assertAllEqual([3], features["length"])
      self.assertAllEqual([[2, 1, 4]], features["ids"])
      self.assertAllEqual([1, 3, 10], transformed.shape)
Example 17
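A TF1 session-based MixedInputter test that also asserts raw tokens are dropped from the features.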
    def testMixedInputter(self):
        vocab_file = os.path.join(self.get_temp_dir(), "vocab.txt")
        vocab_alt_file = os.path.join(self.get_temp_dir(), "vocab_alt.txt")
        data_file = os.path.join(self.get_temp_dir(), "data.txt")

        with open(vocab_file, "w") as vocab:
            vocab.write("the\n" "world\n" "hello\n" "toto\n")
        with open(vocab_alt_file, "w") as vocab_alt:
            vocab_alt.write("h\n" "e\n" "l\n" "w\n" "o\n")
        with open(data_file, "w") as data:
            data.write("hello world !\n")

        mixed_inputter = inputter.MixedInputter(
            [
                text_inputter.WordEmbedder("vocabulary_file_1",
                                           embedding_size=10),
                text_inputter.CharConvEmbedder("vocabulary_file_2", 10, 5)
            ],
            reducer=reducer.ConcatReducer())
        features, transformed = self._makeDataset(
            mixed_inputter,
            data_file,
            metadata={"vocabulary_file_1": vocab_file,
                      "vocabulary_file_2": vocab_alt_file},
            shapes={"char_ids": [None, None, None],
                    "ids": [None, None],
                    "length": [None]})

        self.assertNotIn("tokens", features)

        with self.test_session() as sess:
            sess.run(tf.tables_initializer())
            sess.run(tf.global_variables_initializer())
            features, transformed = sess.run([features, transformed])
            self.assertAllEqual([1, 3, 15], transformed.shape)
Example 18
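Applies word-omission noise with in_place=False, so noisy_* features are added alongside the original ones.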
    def testWordEmbedderWithNoise(self):
        vocab_file = self._makeTextFile("vocab.txt", ["the", "world", "hello"])
        data_file = self._makeTextFile("data.txt", ["hello world !"])
        noiser = noise.WordNoiser(noises=[noise.WordOmission(1)])
        embedder = text_inputter.WordEmbedder(embedding_size=10)
        embedder.set_noise(noiser, in_place=False)
        expected_shapes = {
            "tokens": [None, None],
            "ids": [None, None],
            "length": [None],
            "noisy_tokens": [None, None],
            "noisy_ids": [None, None],
            "noisy_length": [None],
        }
        features, transformed = self._makeDataset(
            embedder,
            data_file,
            data_config={"vocabulary": vocab_file},
            shapes=expected_shapes,
        )
        self.assertEqual(features["noisy_length"][0], features["length"][0] - 1)
Example 19
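Marks the embedder as a target inputter and checks the "ids" / "ids_out" sequences with start and end tokens.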
  def testWordEmbedderTarget(self):
    vocab_file = self._makeTextFile(
        "vocab.txt", ["<blank>", "<s>", "</s>", "the", "world", "hello", "toto"])
    data_file = self._makeTextFile("data.txt", ["hello world !"])

    embedder = text_inputter.WordEmbedder("vocabulary_file", embedding_size=10)
    embedder.is_target = True
    features, transformed = self._makeDataset(
        embedder,
        data_file,
        metadata={"vocabulary_file": vocab_file},
        shapes={
            "tokens": [None, None],
            "ids": [None, None],
            "ids_out": [None, None],
            "length": [None]
        })

    self.assertAllEqual([4], features["length"])
    self.assertAllEqual([[1, 5, 4, 7]], features["ids"])
    self.assertAllEqual([[5, 4, 7, 2]], features["ids_out"])
Example 20
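A TF1 session-based MixedInputter test checking that the serving input receiver exposes both word and character ids.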
  def testMixedInputter(self):
    vocab_file = os.path.join(self.get_temp_dir(), "vocab.txt")
    vocab_alt_file = os.path.join(self.get_temp_dir(), "vocab_alt.txt")
    data_file = os.path.join(self.get_temp_dir(), "data.txt")

    with open(vocab_file, "w") as vocab:
      vocab.write("the\n"
                  "world\n"
                  "hello\n"
                  "toto\n")
    with open(vocab_alt_file, "w") as vocab_alt:
      vocab_alt.write("h\n"
                      "e\n"
                      "l\n"
                      "w\n"
                      "o\n")
    with open(data_file, "w") as data:
      data.write("hello world !\n")

    mixed_inputter = inputter.MixedInputter([
        text_inputter.WordEmbedder("vocabulary_file_1", embedding_size=10),
        text_inputter.CharConvEmbedder("vocabulary_file_2", 10, 5)],
        reducer=reducer.ConcatReducer())

    data, transformed = _first_element(
        mixed_inputter,
        data_file,
        {"vocabulary_file_1": vocab_file, "vocabulary_file_2": vocab_alt_file})

    input_receiver = mixed_inputter.get_serving_input_receiver()
    self.assertIn("ids", input_receiver.features)
    self.assertIn("char_ids", input_receiver.features)

    with self.test_session() as sess:
      sess.run(tf.tables_initializer())
      sess.run(tf.global_variables_initializer())
      data, transformed = sess.run([data, transformed])
      self.assertNotIn("raw", data)
      self.assertNotIn("tokens", data)
      self.assertIn("ids", data)
      self.assertIn("char_ids", data)
      self.assertAllEqual([1, 3, 15], transformed.shape)
Example 21
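The TF1 session-based variant of the pretrained-embeddings test.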
    def testWordEmbedderWithPretrainedEmbeddings(self):
        vocab_file = os.path.join(self.get_temp_dir(), "vocab.txt")
        data_file = os.path.join(self.get_temp_dir(), "data.txt")
        embedding_file = os.path.join(self.get_temp_dir(), "embedding.txt")

        with io.open(embedding_file, encoding="utf-8", mode="w") as embedding:
            embedding.write(u"hello 1 1\n" u"world 2 2\n" u"toto 3 3\n")
        with io.open(vocab_file, encoding="utf-8", mode="w") as vocab:
            vocab.write(u"the\n" u"world\n" u"hello\n" u"toto\n")
        with io.open(data_file, encoding="utf-8", mode="w") as data:
            data.write(u"hello world !\n")

        embedder = text_inputter.WordEmbedder(
            "vocabulary_file",
            embedding_file_key="embedding_file",
            embedding_file_with_header=False)
        features, transformed = self._makeDataset(
            embedder,
            data_file,
            metadata={"vocabulary_file": vocab_file,
                      "embedding_file": embedding_file},
            shapes={"tokens": [None, None],
                    "ids": [None, None],
                    "length": [None]})

        with self.test_session() as sess:
            sess.run(tf.tables_initializer())
            sess.run(tf.global_variables_initializer())
            features, transformed = sess.run([features, transformed])
            self.assertAllEqual([1, 1], transformed[0][0])
            self.assertAllEqual([2, 2], transformed[0][1])
Example 22
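Checks that a WordEmbedder raises a RuntimeError when used before initialize() is called.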
    def testWordEmbedderMissingInitialization(self):
        embedder = text_inputter.WordEmbedder()
        with self.assertRaisesRegex(RuntimeError, "initialize"):
            embedder.input_signature()
        with self.assertRaisesRegex(RuntimeError, "initialize"):
            embedder.make_features("Hello world !")
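
Taken together, the examples follow one lifecycle: construct the inputter, call initialize() with a vocabulary configuration, then build features. A minimal sketch of that order, assuming the 2.x-style API used in Examples 13, 15, and 22; the import path follows the opennmt-tf package layout and the vocabulary path is hypothetical:

import tensorflow as tf
from opennmt.inputters import text_inputter  # import path assumed from the opennmt-tf layout

embedder = text_inputter.WordEmbedder(embedding_size=10)
# Calling input_signature() or make_features() at this point would raise the
# RuntimeError checked in Example 22, since no vocabulary is known yet.
embedder.initialize({"vocabulary": "vocab.txt"})  # hypothetical path
features = embedder.make_features(tf.constant("hello world"))
print(features["ids"], features["length"])  # token ids and sequence length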