# Example #1 (score: 0)
 def testWordOmission(self, count):
     """WordOmission drops up to `count` rows but never removes every row."""
     tokens = [["a", "b", ""], ["c", "", ""], ["d", "e", "f"], ["g", "", ""]]
     inputs = tf.constant(tokens, dtype=tf.string)
     outputs = self.evaluate(noise.WordOmission(count)(inputs))
     # The op keeps at least one word, so at most len(tokens) - 1 are omitted.
     omitted = min(count, len(tokens) - 1)
     self.assertEqual(outputs.shape[0], len(tokens) - omitted)
 def initialize(self, data_config, params=None):
     """Initialize the model, optionally attaching contrastive-learning noise.

     When the "contrastive_learning" parameter is set, a word-omission
     noiser is attached to the labels inputter (non in-place, so the
     original labels are kept alongside the noisy copies).
     """
     super(SequenceToSequence, self).initialize(data_config, params=params)
     if not self.params.get("contrastive_learning"):
         return
     # Use the simplest and most effective CL_one from the paper.
     # https://www.aclweb.org/anthology/P19-1623
     subword_token = self.params.get("decoding_subword_token", "■")
     noiser = noise.WordNoiser(
         noises=[noise.WordOmission(1)], subword_token=subword_token)
     self.labels_inputter.set_noise(noiser, in_place=False)
# Example #3 (score: 0)
 def testWordEmbedderWithInPlaceNoise(self, probability):
     """In-place noise rewrites the base features instead of adding new keys."""
     vocab_file = self._makeTextFile("vocab.txt", ["the", "world", "hello"])
     data_file = self._makeTextFile("data.txt", ["hello world !"])
     embedder = text_inputter.WordEmbedder(embedding_size=10)
     embedder.set_noise(
         noise.WordNoiser(noises=[noise.WordOmission(1)]),
         probability=probability)
     features, transformed = self._makeDataset(
         embedder,
         data_file,
         data_config={"vocabulary": vocab_file},
         shapes={"tokens": [None, None], "ids": [None, None], "length": [None]},
     )
     # With probability 0 nothing is dropped; otherwise one word is omitted
     # from the 3-token example "hello world !".
     expected_length = 3 if probability == 0 else 2
     self.assertEqual(features["length"][0], expected_length)
 def testWordEmbedderWithNoise(self):
     """Non in-place noise adds noisy_* feature variants alongside the originals."""
     vocab_file = self._makeTextFile("vocab.txt", ["the", "world", "hello"])
     data_file = self._makeTextFile("data.txt", ["hello world !"])
     noiser = noise.WordNoiser(noises=[noise.WordOmission(1)])
     embedder = text_inputter.WordEmbedder(embedding_size=10)
     embedder.set_noise(noiser, in_place=False)
     # Each base feature gets a "noisy_"-prefixed counterpart of the same shape.
     base_shapes = {
         "tokens": [None, None],
         "ids": [None, None],
         "length": [None],
     }
     noisy_shapes = {
         "noisy_%s" % key: value for key, value in base_shapes.items()}
     expected_shapes = dict(base_shapes, **noisy_shapes)
     features, transformed = self._makeDataset(
         embedder,
         data_file,
         data_config={"vocabulary": vocab_file},
         shapes=expected_shapes)
     # WordOmission(1) removes exactly one token from the noisy copy.
     self.assertEqual(features["noisy_length"][0],
                      features["length"][0] - 1)