def testNestedParallelInputterShareParameters(self):
    """Checks that share_parameters propagates through nested ParallelInputters."""
    vocab_file = self._makeTextFile("vocab.txt", ["the", "world", "hello", "toto"])
    data_config = {
        "1_1_vocabulary": vocab_file,
        "1_2_vocabulary": vocab_file,
        "2_vocabulary": vocab_file,
    }
    source_inputters = [
        text_inputter.WordEmbedder(embedding_size=10) for _ in range(2)
    ]
    target_inputter = text_inputter.WordEmbedder(embedding_size=10)
    parallel_inputter = inputter.ParallelInputter(
        [
            inputter.ParallelInputter(source_inputters, share_parameters=True),
            target_inputter,
        ],
        share_parameters=True,
    )
    parallel_inputter.initialize(data_config)
    parallel_inputter.build(None)
    # Every leaf inputter should end up holding the exact same embedding
    # variable, so compare by reference rather than by value.
    for source_inputter in source_inputters:
        self.assertEqual(source_inputter.embedding.ref(),
                         target_inputter.embedding.ref())
    def testNestedInputtersWithFlatDataFiles(self):
        """A nested ParallelInputter should accept a flat list of data files."""
        inner = inputter.ParallelInputter(
            [
                record_inputter.SequenceRecordInputter(10),
                record_inputter.SequenceRecordInputter(10),
            ],
            reducer=reducer.SumReducer(),
        )
        inputters = inputter.ParallelInputter(
            [record_inputter.SequenceRecordInputter(10), inner],
            reducer=reducer.ConcatReducer(),
        )

        # The structure mirrors the nesting: one flat leaf plus a nested pair.
        self.assertListEqual(inputters._structure(), [None, [None, None]])

        empty_file = os.path.join(self.get_temp_dir(), "test.txt")
        with open(empty_file, "w"):
            pass

        # Two files cannot be mapped onto the three leaves of the structure.
        with self.assertRaises(ValueError):
            inputters.make_inference_dataset(
                [empty_file, empty_file], batch_size=2)
        # Three files map one-to-one onto the leaves and should succeed.
        inputters.make_inference_dataset(
            [empty_file, empty_file, empty_file], batch_size=2)
# Example 3
 def testNestedParallelInputterShareParameters(self):
   """Nested ParallelInputter with share_parameters ties all embeddings."""
   vocab_file = self._makeTextFile("vocab.txt", ["the", "world", "hello", "toto"])
   metadata = {"vocabulary_file": vocab_file}
   source_inputters = [
       text_inputter.WordEmbedder("vocabulary_file", embedding_size=10)
       for _ in range(2)
   ]
   target_inputter = text_inputter.WordEmbedder("vocabulary_file", embedding_size=10)
   nested = inputter.ParallelInputter(
       [
           inputter.ParallelInputter(source_inputters, share_parameters=True),
           target_inputter,
       ],
       share_parameters=True)
   nested.initialize(metadata)
   nested.build()
   # All leaf inputters should share one and the same embedding variable.
   for source_inputter in source_inputters:
     self.assertEqual(source_inputter.embedding, target_inputter.embedding)
    def testBatchAutotuneDatasetMultiSource(self):
        """batch_autotune_mode should produce fixed, maximum-size batches."""
        vocab_file = self._makeTextFile("vocab.txt", ["1", "2", "3", "4"])
        data_file = self._makeTextFile("data.txt", ["hello world !"])
        source_inputter = inputter.ParallelInputter(
            [
                text_inputter.WordEmbedder(embedding_size=10),
                text_inputter.WordEmbedder(embedding_size=10),
            ]
        )
        target_inputter = text_inputter.WordEmbedder(embedding_size=10)
        target_inputter.set_decoder_mode(mark_start=True, mark_end=True)
        example_inputter = inputter.ExampleInputter(source_inputter, target_inputter)
        example_inputter.initialize(
            {
                "source_1_vocabulary": vocab_file,
                "source_2_vocabulary": vocab_file,
                "target_vocabulary": vocab_file,
            }
        )

        dataset = example_inputter.make_training_dataset(
            [data_file, data_file],
            data_file,
            batch_size=1024,
            batch_type="tokens",
            maximum_features_length=[100, 110],
            maximum_labels_length=120,
            batch_autotune_mode=True,
        )

        source, target = next(iter(dataset))
        # Each source feature is padded to its own maximum_features_length,
        # and the labels to maximum_labels_length, with a fixed batch of 8.
        for name, expected_shape in (
            ("inputter_0_ids", [8, 100]),
            ("inputter_1_ids", [8, 110]),
        ):
            self.assertListEqual(source[name].shape.as_list(), expected_shape)
        for name in ("ids", "ids_out"):
            self.assertListEqual(target[name].shape.as_list(), [8, 120])
    def testParallelInputter(self):
        """Two word embedders run in parallel over two aligned data files."""
        vocab_file = self._makeTextFile("vocab.txt", ["the", "world", "hello", "toto"])
        data_file = self._makeTextFile("data.txt", ["hello world !"])

        parallel_inputter = inputter.ParallelInputter(
            [
                text_inputter.WordEmbedder(embedding_size=10),
                text_inputter.WordEmbedder(embedding_size=5),
            ]
        )
        self.assertEqual(parallel_inputter.num_outputs, 2)

        expected_shapes = {}
        for index in (0, 1):
            expected_shapes["inputter_%d_ids" % index] = [None, None]
            expected_shapes["inputter_%d_length" % index] = [None]
        features, transformed = self._makeDataset(
            parallel_inputter,
            [data_file, data_file],
            data_config={"1_vocabulary": vocab_file, "2_vocabulary": vocab_file},
            shapes=expected_shapes,
        )

        self.assertEqual(2, len(parallel_inputter.get_length(features)))
        self.assertEqual(2, len(transformed))
        # One batch of 3 tokens, embedded at each inputter's embedding size.
        self.assertAllEqual([1, 3, 10], transformed[0].shape)
        self.assertAllEqual([1, 3, 5], transformed[1].shape)
# Example 6
    def testParallelInputter(self):
        """Parallel word embedders end-to-end with the TF1 session API.

        Fix: ``vocab_file`` and ``data_file`` were referenced without ever
        being defined, so the test raised NameError before doing any work.
        They are now created under the test temporary directory, matching
        the other variants of this test in the file.
        """
        vocab_file = os.path.join(self.get_temp_dir(), "vocab.txt")
        data_file = os.path.join(self.get_temp_dir(), "data.txt")

        with open(vocab_file, "w") as vocab:
            vocab.write("the\n" "world\n" "hello\n" "toto\n")
        with open(data_file, "w") as data:
            data.write("hello world !\n")

        parallel_inputter = inputter.ParallelInputter([
            text_inputter.WordEmbedder("vocabulary_file_1", embedding_size=10),
            text_inputter.WordEmbedder("vocabulary_file_2", embedding_size=5)
        ])

        data, transformed = _first_element(parallel_inputter,
                                           [data_file, data_file], {
                                               "vocabulary_file_1": vocab_file,
                                               "vocabulary_file_2": vocab_file
                                           })

        self.assertEqual(2, len(parallel_inputter.get_length(data)))

        # The serving receiver should expose one ids feature per sub-inputter.
        input_receiver = parallel_inputter.get_serving_input_receiver()
        self.assertIn("inputter_0_ids", input_receiver.features)
        self.assertIn("inputter_1_ids", input_receiver.features)

        with self.test_session() as sess:
            sess.run(tf.tables_initializer())
            sess.run(tf.global_variables_initializer())
            data, transformed = sess.run([data, transformed])
            self.assertIn("inputter_0_ids", data)
            self.assertIn("inputter_1_ids", data)
            self.assertEqual(2, len(transformed))
            # One batch of 3 tokens embedded at sizes 10 and 5 respectively.
            self.assertAllEqual([1, 3, 10], transformed[0].shape)
            self.assertAllEqual([1, 3, 5], transformed[1].shape)
# Example 7
  def testParallelInputter(self):
    """Parallel word embedders configured through vocabulary metadata keys."""
    vocab_file = self._makeTextFile("vocab.txt", ["the", "world", "hello", "toto"])
    data_file = self._makeTextFile("data.txt", ["hello world !"])

    parallel_inputter = inputter.ParallelInputter([
        text_inputter.WordEmbedder("vocabulary_file_1", embedding_size=10),
        text_inputter.WordEmbedder("vocabulary_file_2", embedding_size=5),
    ])
    self.assertEqual(parallel_inputter.num_outputs, 2)

    expected_shapes = {}
    for index in (0, 1):
      expected_shapes["inputter_%d_ids" % index] = [None, None]
      expected_shapes["inputter_%d_length" % index] = [None]
    features, transformed = self._makeDataset(
        parallel_inputter,
        [data_file, data_file],
        metadata={"vocabulary_file_1": vocab_file, "vocabulary_file_2": vocab_file},
        shapes=expected_shapes)

    self.assertEqual(2, len(parallel_inputter.get_length(features)))
    # Raw input strings should not leak into the feature dictionary.
    for index in (0, 1):
      self.assertNotIn("inputter_%d_raw" % index, features)

    with self.test_session() as sess:
      sess.run(tf.tables_initializer())
      sess.run(tf.global_variables_initializer())
      features, transformed = sess.run([features, transformed])
      self.assertEqual(2, len(transformed))
      # One batch of 3 tokens, embedded at sizes 10 and 5 respectively.
      self.assertAllEqual([1, 3, 10], transformed[0].shape)
      self.assertAllEqual([1, 3, 5], transformed[1].shape)
# Example 8
 def testParallelInputterShareParameters(self):
     """share_parameters should make sibling embedders reuse one variable."""
     vocab_file = self._makeTextFile("vocab.txt",
                                     ["the", "world", "hello", "toto"])
     data_config = {"1_vocabulary": vocab_file, "2_vocabulary": vocab_file}
     first = text_inputter.WordEmbedder(embedding_size=10)
     second = text_inputter.WordEmbedder(embedding_size=10)
     parallel_inputter = inputter.ParallelInputter(
         [first, second], share_parameters=True)
     parallel_inputter.initialize(data_config)
     parallel_inputter.build(None)
     # Compare by reference: both embedders must hold the same variable.
     self.assertEqual(first.embedding.experimental_ref(),
                      second.embedding.experimental_ref())
# Example 9
    def testParallelInputter(self):
        """Full parallel-inputter pipeline check using the TF1 session API."""
        vocab_file = os.path.join(self.get_temp_dir(), "vocab.txt")
        data_file = os.path.join(self.get_temp_dir(), "data.txt")

        with open(vocab_file, "w") as vocab:
            vocab.write("the\n" "world\n" "hello\n" "toto\n")
        with open(data_file, "w") as data:
            data.write("hello world !\n")

        parallel_inputter = inputter.ParallelInputter([
            text_inputter.WordEmbedder("vocabulary_file_1", embedding_size=10),
            text_inputter.WordEmbedder("vocabulary_file_2", embedding_size=5),
        ])

        expected_shapes = {}
        for index in (0, 1):
            expected_shapes["inputter_%d_ids" % index] = [None, None]
            expected_shapes["inputter_%d_length" % index] = [None]
        features, transformed = self._makeDataset(
            parallel_inputter,
            [data_file, data_file],
            metadata={
                "vocabulary_file_1": vocab_file,
                "vocabulary_file_2": vocab_file,
            },
            shapes=expected_shapes,
        )

        self.assertEqual(2, len(parallel_inputter.get_length(features)))
        # Neither raw strings nor token lists should leak into the features.
        for index in (0, 1):
            self.assertNotIn("inputter_%d_raw" % index, features)
            self.assertNotIn("inputter_%d_tokens" % index, features)

        with self.test_session() as sess:
            sess.run(tf.tables_initializer())
            sess.run(tf.global_variables_initializer())
            features, transformed = sess.run([features, transformed])
            self.assertEqual(2, len(transformed))
            # One batch of 3 tokens, embedded at sizes 10 and 5 respectively.
            self.assertAllEqual([1, 3, 10], transformed[0].shape)
            self.assertAllEqual([1, 3, 5], transformed[1].shape)
# Example 10
  def testParallelInputterSplitFeatures(self):
    """combine_features=False should yield a (features, labels) tuple."""
    vocab_file = self._makeTextFile("vocab.txt", ["the", "world", "hello", "toto"])
    data_file = self._makeTextFile("data.txt", ["hello world !"])

    src_embedder = text_inputter.WordEmbedder("vocabulary_file_1", embedding_size=10)
    tgt_embedder = text_inputter.WordEmbedder("vocabulary_file_1", embedding_size=10)
    tgt_embedder.is_target = True
    parallel_inputter = inputter.ParallelInputter(
        [src_embedder, tgt_embedder], combine_features=False)
    self.assertEqual(parallel_inputter.num_outputs, 2)

    features, transformed = self._makeDataset(
        parallel_inputter,
        [data_file, data_file],
        metadata={"vocabulary_file_1": vocab_file, "vocabulary_file_2": vocab_file})

    # With split features, the structure is a 2-tuple instead of one dict.
    self.assertIsInstance(features, tuple)
    self.assertEqual(len(features), 2)
    self.assertEqual(len(transformed), 2)
    features, labels = features
    for field in ("ids", "length", "tokens"):
      self.assertIn(field, features)
    # The target side additionally exposes the shifted ids_out field.
    for field in ("ids", "ids_out", "length", "tokens"):
      self.assertIn(field, labels)