def setUp(self): d = mock_dict() tokens_1 = torch.LongTensor([i for i in range(1, 5000, 2)]).view(1, -1) tokens_ds1 = TokenBlockDataset( tokens_1, sizes=[tokens_1.size(-1)], block_size=1, pad=0, eos=1, include_targets=False, ) self.dataset_1 = LanguagePairDataset(tokens_ds1, tokens_ds1.sizes, d, shuffle=False) tokens_2 = torch.LongTensor([i for i in range(0, 5000, 2)]).view(1, -1) tokens_ds2 = TokenBlockDataset( tokens_2, sizes=[tokens_2.size(-1)], block_size=1, pad=0, eos=1, include_targets=False, ) self.dataset_2 = LanguagePairDataset(tokens_ds2, tokens_ds2.sizes, d, shuffle=False)
def lang_pair_dataset(lengths: Sequence[int]) -> LanguagePairDataset: tokens = [[i] * l for i, l in enumerate(lengths)] return LanguagePairDataset(ListDataset(tokens), lengths, mock_dict())