def test_word_blank_with_eos(self):
    """Check word blanking on BPE-continuation-marker data built with EOS.

    Runs ``WordDropout.noising`` under a fixed numpy seed, passing
    ``vocab.unk()`` as the extra argument, and delegates verification to
    ``assert_word_blanking_correct`` and ``assert_eos_at_end``.
    """
    vocab, x, x_len = self._get_test_data_with_bpe_cont_marker(append_eos=True)

    with data_utils.numpy_seed(1234):
        blanker = noising.WordDropout(vocab)
        # NOTE(review): 0.2 is presumably the per-word noising probability
        # -- confirm against WordDropout.noising.
        noised = blanker.noising(x, x_len, 0.2, vocab.unk())
        noised_tokens, noised_lengths = noised

        self.assert_word_blanking_correct(
            x=x,
            x_noised=noised_tokens,
            x_len=x_len,
            l_noised=noised_lengths,
            unk=vocab.unk(),
        )
        self.assert_eos_at_end(
            x=noised_tokens,
            x_len=noised_lengths,
            eos=vocab.eos(),
        )
def test_word_blank_without_eos(self):
    """Mirror of the with-EOS word blank test, for data built without EOS.

    The blanking check is the same; the final assertion instead requires
    that the noised output does not end with EOS.
    """
    vocab, x, x_len = self._get_test_data_with_bpe_cont_marker(append_eos=False)

    with data_utils.numpy_seed(1234):
        blanker = noising.WordDropout(vocab)
        # NOTE(review): 0.2 is presumably the per-word noising probability
        # -- confirm against WordDropout.noising.
        noised_tokens, noised_lengths = blanker.noising(
            x, x_len, 0.2, vocab.unk()
        )

        self.assert_word_blanking_correct(
            x=x,
            x_noised=noised_tokens,
            x_len=x_len,
            l_noised=noised_lengths,
            unk=vocab.unk(),
        )
        self.assert_no_eos_at_end(
            x=noised_tokens,
            x_len=noised_lengths,
            eos=vocab.eos(),
        )
def test_word_dropout(self):
    """Check word dropout output against a seed-specific expectation.

    With the numpy seed pinned to 1234, the test expects exactly the
    first word (2 BPE tokens) of the first example to be removed.
    """
    vocab, x, x_len = self._get_test_data()

    with data_utils.numpy_seed(1234):
        dropper = noising.WordDropout(vocab)
        # NOTE(review): 0.2 is presumably the per-word dropout probability
        # -- confirm against WordDropout.noising.
        noised_tokens, noised_lengths = dropper.noising(x, x_len, 0.2)

        # Number of leading BPE tokens expected to vanish from the first
        # example (its first word).
        num_dropped = 2
        self.assertEqual(x_len[0] - num_dropped, noised_lengths[0])

        # Each remaining position must hold the original token found
        # ``num_dropped`` positions later.
        for pos in range(noised_lengths[0]):
            self.assertEqual(noised_tokens[pos][0], x[pos + num_dropped][0])
def test_word_dropout_with_eos(self):
    """Check word dropout on test data built with ``append_eos=True``.

    Verification is delegated to ``assert_word_dropout_correct`` and
    ``assert_eos_at_end``; the numpy seed is pinned to 1234.
    """
    vocab, x, x_len = self._get_test_data(append_eos=True)

    with data_utils.numpy_seed(1234):
        dropper = noising.WordDropout(vocab)
        # NOTE(review): 0.2 is presumably the per-word dropout probability
        # -- confirm against WordDropout.noising.
        noised = dropper.noising(x, x_len, 0.2)
        noised_tokens, noised_lengths = noised

        self.assert_word_dropout_correct(
            x=x,
            x_noised=noised_tokens,
            x_len=x_len,
            l_noised=noised_lengths,
        )
        self.assert_eos_at_end(
            x=noised_tokens,
            x_len=noised_lengths,
            eos=vocab.eos(),
        )
def test_word_blank(self):
    """Check word blanking output against a seed-specific expectation.

    With the numpy seed pinned to 1234, the test expects exactly the
    first word (2 BPE tokens) of the first example to be replaced by
    ``vocab.unk()`` and every other token to be left untouched.
    """
    vocab, x, x_len = self._get_test_data()

    with data_utils.numpy_seed(1234):
        blanker = noising.WordDropout(vocab)
        # NOTE(review): 0.2 is presumably the per-word noising probability
        # -- confirm against WordDropout.noising.
        noised_tokens, noised_lengths = blanker.noising(
            x, x_len, 0.2, vocab.unk()
        )

        # Blanking substitutes tokens in place, so the length is unchanged.
        self.assertEqual(x_len[0], noised_lengths[0])

        for pos in range(noised_lengths[0]):
            actual = noised_tokens[pos][0]
            # The first two positions form the blanked word; the rest
            # must match the input.
            expected = vocab.unk() if pos < 2 else x[pos][0]
            self.assertEqual(actual, expected)
def test_word_dropout_without_eos(self):
    """Mirror of the with-EOS word dropout test, for data built without EOS.

    The dropout check is the same; the final assertion instead requires
    that the noised output does not end with EOS.
    """
    vocab, x, x_len = self._get_test_data(append_eos=False)

    with data_utils.numpy_seed(1234):
        dropper = noising.WordDropout(vocab)
        # NOTE(review): 0.2 is presumably the per-word dropout probability
        # -- confirm against WordDropout.noising.
        noised_tokens, noised_lengths = dropper.noising(x, x_len, 0.2)

        self.assert_word_dropout_correct(
            x=x,
            x_noised=noised_tokens,
            x_len=x_len,
            l_noised=noised_lengths,
        )

        eos_idx = vocab.eos()
        self.assert_no_eos_at_end(
            x=noised_tokens,
            x_len=noised_lengths,
            eos=eos_idx,
        )