Code example #1
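This test excerpt (from OpenNMT-tf's inputter tests) checks non-destructive noise: with is_target = True and set_noise(noiser, in_place=False), the original features are kept and parallel noisy_* features are added alongside them. Since WordOmission(1) drops exactly one word, the noisy length must be one less than the original. Helpers such as _makeTextFile and _makeDataset come from the surrounding test class.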
 def testWordEmbedderWithNoise(self):
     vocab_file = self._makeTextFile("vocab.txt", ["the", "world", "hello"])
     data_file = self._makeTextFile("data.txt", ["hello world !"])
     # A noiser that removes exactly one word from each sequence.
     noiser = noise.WordNoiser(noises=[noise.WordOmission(1)])
     embedder = text_inputter.WordEmbedder("vocabulary_file",
                                           embedding_size=10)
     embedder.is_target = True
     # in_place=False keeps the original features and adds "noisy_*" copies.
     embedder.set_noise(noiser, in_place=False)
     expected_shapes = {
         "tokens": [None, None],
         "ids": [None, None],
         "ids_out": [None, None],
         "length": [None],
         "noisy_tokens": [None, None],
         "noisy_ids": [None, None],
         "noisy_ids_out": [None, None],
         "noisy_length": [None]
     }
     features, transformed = self._makeDataset(
         embedder,
         data_file,
         metadata={"vocabulary_file": vocab_file},
         shapes=expected_shapes)
     # One word was omitted, so the noisy sequence is one token shorter.
     self.assertEqual(features["noisy_length"][0],
                      features["length"][0] - 1)
Code example #2
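This excerpt from a SequenceToSequence model shows where contrastive learning hooks in: when the contrastive_learning parameter is set, the labels (target) inputter gets a WordOmission noiser so that each training example also yields a noisy variant, following the CL_one scheme of the cited paper.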
 def initialize(self, metadata, params=None):
   super(SequenceToSequence, self).initialize(metadata, params=params)
   if params and params.get("contrastive_learning"):
     subword_token = params.get("decoding_subword_token", "￭")
     # Use the simplest and most effective CL_one from the paper.
     # https://www.aclweb.org/anthology/P19-1623
     noiser = noise.WordNoiser(
         noises=[noise.WordOmission(1)],
         subword_token=subword_token,
         is_spacer=subword_token == "▁")
     # in_place=False keeps both the clean and the noisy target features.
     self.labels_inputter.set_noise(noiser, in_place=False)
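As a minimal usage sketch, and assuming an already constructed SequenceToSequence instance named model plus the metadata dict it expects (both names are placeholders), the branch above would be triggered like this:

 # Hypothetical usage sketch; "model" and "metadata" are placeholders.
 params = {
     "contrastive_learning": True,   # enables the WordOmission noiser above
     "decoding_subword_token": "￭",  # joiner token passed to the noiser
 }
 model.initialize(metadata, params=params)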
Code example #3
File: noise_test.py    Project: Byramklc/OneCeviri
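This test assembles a WordNoiser from individual modules (word dropout, replacement, and permutation) and applies it to a padded batch of subword tokens, asserting that the noisy output keeps the input's padded shape.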
 def testWordNoising(self):
     # "￭" is a subword joiner; empty strings pad the second sequence.
     tokens = tf.constant([["a￭", "b", "c￭", "d", "￭e"],
                           ["a", "b", "c", "", ""]])
     lengths = tf.constant([5, 3])
     noiser = noise.WordNoiser()
     noiser.add(noise.WordDropout(0.1))
     noiser.add(noise.WordReplacement(0.1))
     noiser.add(noise.WordPermutation(3))
     noisy_tokens, noisy_lengths = noiser(tokens, sequence_length=lengths)
     tokens, noisy_tokens = self.evaluate([tokens, noisy_tokens])
     # The noisy batch should keep the same padded shape as the input.
     self.assertAllEqual(noisy_tokens.shape, tokens.shape)
Code example #4
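A parameterized test for in-place noise: the noiser overwrites tokens, ids, and length directly instead of adding noisy_* copies, and is applied with the given probability. With probability 0 the three input tokens survive; otherwise WordOmission(1) leaves two.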
 def testWordEmbedderWithInPlaceNoise(self, probability):
     vocab_file = self._makeTextFile("vocab.txt", ["the", "world", "hello"])
     data_file = self._makeTextFile("data.txt", ["hello world !"])
     noiser = noise.WordNoiser(noises=[noise.WordOmission(1)])
     embedder = text_inputter.WordEmbedder("vocabulary_file",
                                           embedding_size=10)
     # In-place noise overwrites the features directly (no "noisy_*" keys)
     # and is applied with the given probability.
     embedder.set_noise(noiser, probability=probability)
     features, transformed = self._makeDataset(
         embedder,
         data_file,
         metadata={"vocabulary_file": vocab_file},
         shapes={
             "tokens": [None, None],
             "ids": [None, None],
             "length": [None]
         })
     # probability == 0: untouched (3 tokens); otherwise one word is omitted.
     self.assertEqual(features["length"][0], 3 if probability == 0 else 2)
Code example #5
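This helper turns a user-supplied list of noise specifications into a WordNoiser and applies it to a batch, with each list entry mapping one noise type name to its constructor argument(s).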
def _add_noise(tokens, lengths, params, subword_token):
    if not isinstance(params, list):
        raise ValueError("Expected a list of noise modules")
    noises = []
    for module in params:
        # Each module is a single-entry dict, e.g. {"dropout": 0.1}.
        noise_type, args = six.next(six.iteritems(module))
        if not isinstance(args, list):
            args = [args]
        noise_type = noise_type.lower()
        if noise_type == "dropout":
            noise_class = noise.WordDropout
        elif noise_type == "replacement":
            noise_class = noise.WordReplacement
        elif noise_type == "permutation":
            noise_class = noise.WordPermutation
        else:
            raise ValueError("Invalid noise type: %s" % noise_type)
        noises.append(noise_class(*args))
    noiser = noise.WordNoiser(noises=noises,
                              subword_token=subword_token,
                              is_spacer=subword_token == "▁")
    # keep_shape=True pads the noisy output back to the input shape.
    return noiser(tokens, lengths, keep_shape=True)
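A minimal sketch of the params structure this function parses; the values are illustrative only, and tokens/lengths are assumed to be tensors like those in code example #3:

# Hypothetical usage sketch for _add_noise.
params = [
    {"dropout": 0.1},      # -> noise.WordDropout(0.1)
    {"replacement": 0.1},  # -> noise.WordReplacement(0.1)
    {"permutation": 3},    # -> noise.WordPermutation(3)
]
noisy_tokens, noisy_lengths = _add_noise(
    tokens, lengths, params, subword_token="￭")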