def testWordOmission(self, count):
    """Check that WordOmission drops up to `count` words but never all of them."""
    # Each row is a word plus (possibly empty) subword continuation slots.
    words = [["a", "b", ""], ["c", "", ""], ["d", "e", "f"], ["g", "", ""]]
    tokens = tf.constant(words, dtype=tf.string)
    noisy = noise.WordOmission(count)(tokens)
    noisy = self.evaluate(noisy)
    # The op keeps at least one word, so at most len(words) - 1 can be omitted.
    max_omitted = min(count, len(words) - 1)
    self.assertEqual(noisy.shape[0], len(words) - max_omitted)
def initialize(self, data_config, params=None):
    """Initialize the model and, if configured, enable contrastive-learning noise.

    When the "contrastive_learning" parameter is set, the target inputter is
    given a single word-omission noiser — the CL_one strategy from
    https://www.aclweb.org/anthology/P19-1623 (the simplest and most
    effective variant reported in the paper).
    """
    super(SequenceToSequence, self).initialize(data_config, params=params)
    if not self.params.get("contrastive_learning"):
        return
    subword_token = self.params.get("decoding_subword_token", "■")
    noiser = noise.WordNoiser(
        noises=[noise.WordOmission(1)], subword_token=subword_token)
    self.labels_inputter.set_noise(noiser, in_place=False)
def testWordEmbedderWithInPlaceNoise(self, probability):
    """In-place noise should shorten the sequence only when probability > 0."""
    vocab_file = self._makeTextFile("vocab.txt", ["the", "world", "hello"])
    data_file = self._makeTextFile("data.txt", ["hello world !"])
    embedder = text_inputter.WordEmbedder(embedding_size=10)
    embedder.set_noise(
        noise.WordNoiser(noises=[noise.WordOmission(1)]),
        probability=probability)
    features, transformed = self._makeDataset(
        embedder,
        data_file,
        data_config={"vocabulary": vocab_file},
        shapes={"tokens": [None, None], "ids": [None, None], "length": [None]},
    )
    # One word is omitted when the noiser fires; otherwise all 3 tokens remain.
    expected_length = 3 if probability == 0 else 2
    self.assertEqual(features["length"][0], expected_length)
def testWordEmbedderWithNoise(self):
    """Non in-place noise should emit noisy_* features alongside the originals."""
    vocab_file = self._makeTextFile("vocab.txt", ["the", "world", "hello"])
    data_file = self._makeTextFile("data.txt", ["hello world !"])
    noiser = noise.WordNoiser(noises=[noise.WordOmission(1)])
    embedder = text_inputter.WordEmbedder(embedding_size=10)
    embedder.set_noise(noiser, in_place=False)
    # With in_place=False, the noisy variants appear as separate features.
    expected_shapes = {
        "tokens": [None, None],
        "ids": [None, None],
        "length": [None],
        "noisy_tokens": [None, None],
        "noisy_ids": [None, None],
        "noisy_length": [None],
    }
    features, transformed = self._makeDataset(
        embedder,
        data_file,
        data_config={"vocabulary": vocab_file},
        shapes=expected_shapes)
    # Exactly one word was omitted from the noisy copy of the sequence.
    self.assertEqual(features["noisy_length"][0], features["length"][0] - 1)