def assert_word_shuffle_matches_expected(
    self,
    x,
    x_len,
    max_shuffle_distance: int,
    vocab: Dictionary,
    expected_shufle_maps: List[Dict[int, int]],
    expect_eos_at_end: bool,
    bpe_end_marker=None,
):
    """
    Run WordShuffle noising on ``x`` and check the result position by position.

    Args:
        x: Tensor of shape (T x B) = (sequence_length, batch_size)
        x_len: Tensor of length B = batch_size
        max_shuffle_distance: forwarded to ``WordShuffle.noising``
        vocab: dictionary handed to ``noising.WordShuffle``; its ``eos()``
            is used for the optional end-of-sentence check
        expected_shufle_maps: one mapping per example, in batch order. Each
            mapping is a Dict[old_index, new_index] giving, for a token at
            ``old_index`` in ``x``, the position it must occupy in the
            noised output.
        expect_eos_at_end: when True, additionally assert that the noised
            output ends with EOS.
        bpe_end_marker: str denoting the BPE end token. When it is given,
            the BPE continuation marker passed to the noiser is None;
            when it is None, the continuation marker "@@" is used instead.
    """
    # Exactly one of the two BPE markers is active at a time.
    bpe_cont_marker = "@@" if bpe_end_marker is None else None

    with data_utils.numpy_seed(1234):
        shuffler = noising.WordShuffle(
            vocab,
            bpe_cont_marker=bpe_cont_marker,
            bpe_end_marker=bpe_end_marker,
        )
        x_noised, l_noised = shuffler.noising(
            x, x_len, max_shuffle_distance=max_shuffle_distance
        )

    # Each example (batch column) carries its own expected permutation:
    # the token at old_pos in x must show up at new_pos in x_noised.
    for example_idx, position_map in enumerate(expected_shufle_maps):
        for old_pos, new_pos in position_map.items():
            self.assertEqual(
                x[old_pos][example_idx], x_noised[new_pos][example_idx]
            )

    # Shuffling only reorders tokens, so no example may change length.
    for length_before, length_after in zip(x_len, l_noised):
        self.assertEqual(length_before, length_after)

    if expect_eos_at_end:
        self.assert_eos_at_end(x=x_noised, x_len=l_noised, eos=vocab.eos())
def _random_shuffle(target_tokens, p, max_shuffle_distance):
    """
    Return ``target_tokens`` with a random subset of rows word-shuffled.

    ``self`` is not a parameter: it is read from the enclosing scope and
    must expose ``tgt_dict``.

    Args:
        target_tokens: 2-D token tensor; dim 1 is counted for lengths and the
            tensor is transposed before noising (presumably
            (batch, time) -- confirm against the caller).
        p: threshold compared against one uniform(0, 1) draw per row; a row
            takes its shuffled version when its draw is below ``p``.
        max_shuffle_distance: forwarded to ``WordShuffle.noising``.

    Returns:
        Tensor on ``target_tokens.device`` in which each row is either the
        shuffled row or the untouched original row.
    """
    shuffler = noising.WordShuffle(self.tgt_dict)

    # Non-pad token count per row = row width minus number of pad positions.
    is_pad = target_tokens.eq(self.tgt_dict.pad())
    lengths = is_pad.size(1) - is_pad.long().sum(1)

    # The noiser is fed CPU tensors in transposed layout; the result is
    # moved back to the input's device and transposed back afterwards.
    shuffled, _ = shuffler.noising(
        target_tokens.t().cpu(),
        lengths.cpu(),
        max_shuffle_distance,
    )
    shuffled = shuffled.to(target_tokens.device).t()

    # One uniform draw per row (shape: rows x 1), so a row is swapped as a
    # whole rather than token by token.
    row_draws = (
        target_tokens.clone().sum(dim=1, keepdim=True).float().uniform_(0, 1)
    )
    take_shuffled = row_draws < p

    # Boolean mask arithmetic selects the shuffled or the original row.
    return take_shuffled * shuffled + (~take_shuffled) * target_tokens
def test_word_shuffle_with_eos_nonbpe(self):
    """
    Word shuffle on non-BPE data with EOS appended.

    Runs the noiser at distance 0 and then distance 3, delegating the
    per-distance expectations to the matching assertion helper and
    checking after each run that EOS is still at the end.
    """
    vocab, x, x_len = self._get_test_data(append_eos=True, bpe=False)

    # (max shuffle distance, helper that verifies the outcome), in run order.
    scenarios = (
        (0, self.assert_no_shuffle_with_0_distance),
        (3, self.assert_nonbpe_shuffle_with_distance_3),
    )

    # Both noising calls run under the same seeded context, presumably so
    # that the distance-3 permutation is reproducible.
    with data_utils.numpy_seed(1234):
        shuffler = noising.WordShuffle(vocab, bpe_cont_marker=None)
        for distance, check_shuffle in scenarios:
            x_noised, l_noised = shuffler.noising(x, x_len, distance)
            check_shuffle(
                x=x, x_noised=x_noised, x_len=x_len, l_noised=l_noised
            )
            self.assert_eos_at_end(
                x=x_noised, x_len=l_noised, eos=vocab.eos()
            )
def test_word_shuffle(self):
    """
    Word shuffle with the default test data.

    Distance 0 must leave every token in place; distance 3 (under the
    fixed seed) must leave the first example untouched and permute the
    second example in the specific way asserted below.
    """
    vocab, x, x_len = self._get_test_data()

    with data_utils.numpy_seed(1234):
        shuffler = noising.WordShuffle(vocab)

        # --- distance 0: output tokens equal input tokens -----------------
        x_noised, l_noised = shuffler.noising(x, x_len, 0)
        for example_idx, length in enumerate(x_len):
            for pos in range(length):
                self.assertEqual(
                    x[pos][example_idx], x_noised[pos][example_idx]
                )
        self.assertEqual(x_len[0], l_noised[0])

        # --- distance 3 ---------------------------------------------------
        x_noised, l_noised = shuffler.noising(x, x_len, 3)

        # First example is expected to come back unchanged.
        for pos in range(x_len[0]):
            self.assertEqual(x[pos][0], x_noised[pos][0])

        # Expect the second example has the last three tokens shuffled
        # 6, 7, 8, 9 => 6, 8, 9, 7, where (8, 9) is a word
        # (keys are old positions, values are new positions).
        expected_positions = {0: 0, 1: 3, 2: 1, 3: 2}
        for old_pos, new_pos in expected_positions.items():
            self.assertEqual(x[old_pos][1], x_noised[new_pos][1])

        # Lengths of both examples are preserved.
        for example_idx in (0, 1):
            self.assertEqual(x_len[example_idx], l_noised[example_idx])
def test_word_shuffle_without_eos(self):
    """
    Same result as word shuffle with eos except no EOS at end.

    Runs the noiser at distance 0 and then distance 3 on data built with
    ``append_eos=False``; after each run the matching shuffle helper is
    applied and the output is checked to have no trailing EOS.
    """
    vocab, x, x_len = self._get_test_data(append_eos=False)

    # (max shuffle distance, helper that verifies the outcome), in run order.
    scenarios = (
        (0, self.assert_no_shuffle_with_0_distance),
        (3, self.assert_word_shuffle_with_distance_3),
    )

    # Both noising calls run under the same seeded context, presumably so
    # that the distance-3 permutation is reproducible.
    with data_utils.numpy_seed(1234):
        shuffler = noising.WordShuffle(vocab)
        for distance, check_shuffle in scenarios:
            x_noised, l_noised = shuffler.noising(x, x_len, distance)
            check_shuffle(
                x=x, x_noised=x_noised, x_len=x_len, l_noised=l_noised
            )
            self.assert_no_eos_at_end(
                x=x_noised, x_len=l_noised, eos=vocab.eos()
            )