def test_word_blank_with_eos(self):
        """Exercise word blanking on the BPE continuation-marker fixture with EOS.

        Builds the fixture with ``append_eos=True``, runs
        ``WordDropout.noising`` with ``vocab.unk()`` as its fourth argument
        under a fixed numpy seed, and hands the result to the
        ``assert_word_blanking_correct`` and ``assert_eos_at_end`` helpers.
        """
        vocab, tokens, lengths = self._get_test_data_with_bpe_cont_marker(
            append_eos=True
        )

        # Fixed seed keeps the noising outcome reproducible between runs.
        with data_utils.numpy_seed(1234):
            blanker = noising.WordDropout(vocab)
            noised_tokens, noised_lengths = blanker.noising(
                tokens, lengths, 0.2, vocab.unk()
            )
            self.assert_word_blanking_correct(
                x=tokens,
                x_noised=noised_tokens,
                x_len=lengths,
                l_noised=noised_lengths,
                unk=vocab.unk(),
            )
            self.assert_eos_at_end(
                x=noised_tokens, x_len=noised_lengths, eos=vocab.eos()
            )
    def test_word_blank_without_eos(self):
        """Word blanking on data built without EOS; expects no EOS at the end.

        Mirrors the with-EOS variant: same seed and same noising arguments,
        but the fixture is built with ``append_eos=False`` and the final
        check is ``assert_no_eos_at_end``.
        """
        vocab, tokens, lengths = self._get_test_data_with_bpe_cont_marker(
            append_eos=False
        )

        # Fixed seed keeps the noising outcome reproducible between runs.
        with data_utils.numpy_seed(1234):
            blanker = noising.WordDropout(vocab)
            noised_tokens, noised_lengths = blanker.noising(
                tokens, lengths, 0.2, vocab.unk()
            )
            self.assert_word_blanking_correct(
                x=tokens,
                x_noised=noised_tokens,
                x_len=lengths,
                l_noised=noised_lengths,
                unk=vocab.unk(),
            )
            self.assert_no_eos_at_end(
                x=noised_tokens, x_len=noised_lengths, eos=vocab.eos()
            )
Exemple #3
0
    def test_word_dropout(self):
        """Check that word dropout removes the first two tokens of example 0.

        Runs ``WordDropout.noising`` on the default fixture under a fixed
        numpy seed, then verifies that the first example is two tokens
        shorter and that the remaining tokens equal the originals shifted
        by two positions.
        """
        vocab, tokens, lengths = self._get_test_data()

        with data_utils.numpy_seed(1234):
            dropper = noising.WordDropout(vocab)
            noised_tokens, noised_lengths = dropper.noising(tokens, lengths, 0.2)

            # With this seed, only the first word (2 bpe tokens) of the first
            # example is expected to have been dropped.
            n_dropped = 2
            kept = noised_lengths[0]
            self.assertEqual(lengths[0] - n_dropped, kept)
            for pos in range(kept):
                self.assertEqual(
                    noised_tokens[pos][0], tokens[pos + n_dropped][0]
                )
Exemple #4
0
    def test_word_dropout_with_eos(self):
        """Exercise word dropout on the fixture built with ``append_eos=True``.

        Runs ``WordDropout.noising`` under a fixed numpy seed and hands the
        result to the ``assert_word_dropout_correct`` and
        ``assert_eos_at_end`` helpers.
        """
        vocab, tokens, lengths = self._get_test_data(append_eos=True)

        # Fixed seed keeps the noising outcome reproducible between runs.
        with data_utils.numpy_seed(1234):
            dropper = noising.WordDropout(vocab)
            noised_tokens, noised_lengths = dropper.noising(tokens, lengths, 0.2)
            self.assert_word_dropout_correct(
                x=tokens,
                x_noised=noised_tokens,
                x_len=lengths,
                l_noised=noised_lengths,
            )
            self.assert_eos_at_end(
                x=noised_tokens, x_len=noised_lengths, eos=vocab.eos()
            )
Exemple #5
0
    def test_word_blank(self):
        """Check that word blanking replaces the first two tokens of example 0.

        Runs ``WordDropout.noising`` with ``vocab.unk()`` as its fourth
        argument under a fixed numpy seed, then verifies that the length of
        the first example is unchanged, its first two positions hold
        ``vocab.unk()``, and every later position matches the input.
        """
        vocab, tokens, lengths = self._get_test_data()

        with data_utils.numpy_seed(1234):
            blanker = noising.WordDropout(vocab)
            noised_tokens, noised_lengths = blanker.noising(
                tokens, lengths, 0.2, vocab.unk()
            )

            # With this seed, only the first word (2 bpe tokens) of the first
            # example is expected to have been blanked out.
            n_blanked = 2
            total = noised_lengths[0]
            self.assertEqual(lengths[0], total)
            for pos in range(total):
                expected = vocab.unk() if pos < n_blanked else tokens[pos][0]
                self.assertEqual(noised_tokens[pos][0], expected)
Exemple #6
0
    def test_word_dropout_without_eos(self):
        """Word dropout on data built without EOS; expects no EOS at the end.

        Mirrors the with-EOS variant: same seed and same noising arguments,
        but the fixture is built with ``append_eos=False`` and the final
        check is ``assert_no_eos_at_end``.
        """
        vocab, tokens, lengths = self._get_test_data(append_eos=False)

        # Fixed seed keeps the noising outcome reproducible between runs.
        with data_utils.numpy_seed(1234):
            dropper = noising.WordDropout(vocab)
            noised_tokens, noised_lengths = dropper.noising(tokens, lengths, 0.2)
            self.assert_word_dropout_correct(
                x=tokens,
                x_noised=noised_tokens,
                x_len=lengths,
                l_noised=noised_lengths,
            )
            self.assert_no_eos_at_end(
                x=noised_tokens, x_len=noised_lengths, eos=vocab.eos()
            )