def test_udpos_sequence_tagging(self):
    """Smoke test: the UDPOS sequence-tagging dataset loads with the
    expected split sizes, token/tag ids, and vocabularies."""
    from torchtext.experimental.datasets import UDPOS

    train_dataset, valid_dataset, test_dataset = UDPOS()

    # Each split has a fixed, known number of examples.
    for dataset, expected_len in ((train_dataset, 12543),
                                  (valid_dataset, 2002),
                                  (test_dataset, 2077)):
        self.assertEqual(len(dataset), expected_len)

    # Expected leading ids for the three per-example fields
    # (index 0, 1, 2) of the first ([0]) and last ([-1]) example
    # of each split.
    expected_ids = (
        (train_dataset, 0,
         ([262, 16, 5728, 45, 289, 701, 1160, 4436, 10660, 585],
          [8, 3, 8, 3, 9, 2, 4, 8, 8, 8],
          [5, 34, 5, 27, 7, 11, 14, 5, 5, 5])),
        (train_dataset, -1,
         ([9, 32, 169, 436, 59, 192, 30, 6, 117, 17],
          [5, 10, 11, 4, 11, 11, 3, 12, 11, 4],
          [6, 20, 8, 10, 8, 8, 24, 13, 8, 15])),
        (valid_dataset, 0,
         ([746, 3, 10633, 656, 25, 1334, 45],
          [6, 7, 8, 4, 7, 2, 3],
          [3, 4, 5, 16, 4, 2, 27])),
        (valid_dataset, -1,
         ([354, 4, 31, 17, 141, 421, 148, 6, 7, 78],
          [11, 3, 5, 4, 9, 2, 2, 12, 7, 11],
          [8, 12, 6, 15, 7, 2, 2, 13, 4, 8])),
        (test_dataset, 0,
         ([210, 54, 3115, 0, 12229, 0, 33],
          [5, 15, 8, 4, 6, 8, 3],
          [30, 3, 5, 14, 3, 5, 9])),
        (test_dataset, -1,
         ([116, 0, 6, 11, 412, 10, 0, 4, 0, 6],
          [5, 4, 12, 10, 9, 15, 4, 3, 4, 12],
          [6, 16, 13, 16, 7, 3, 19, 12, 19, 13])),
    )
    for dataset, row, fields in expected_ids:
        for col, ids in enumerate(fields):
            self.assertEqual(dataset[row][col][:10],
                             torch.tensor(ids).long())

    # Assert vocabs: three vocabularies with known sizes.
    vocabs = train_dataset.get_vocabs()
    self.assertEqual(len(vocabs), 3)
    self.assertEqual(len(vocabs[0]), 19674)
    self.assertEqual(len(vocabs[1]), 19)
    self.assertEqual(len(vocabs[2]), 52)

    # Assert token ids produced by the word vocabulary for a
    # sample sentence.
    word_vocab = vocabs[0]
    tokens_ids = [word_vocab[token]
                  for token in 'Two of them were being run'.split()]
    self.assertEqual(tokens_ids, [1206, 8, 69, 60, 157, 452])
def test_udpos_sequence_tagging(self):
    """Smoke test: UDPOS loads with the expected split sizes and ids,
    and the subset/raw entry points return the requested splits.

    NOTE(review): this method has the same name as an earlier
    ``test_udpos_sequence_tagging`` in this file, so it shadows that
    definition at class-creation time and only this one runs --
    consider renaming one of them.
    """
    from torchtext.experimental.datasets import UDPOS

    train_dataset, valid_dataset, test_dataset = UDPOS()
    # Compare lengths and the first/last examples' three fields
    # against known-good id sequences.
    self._helper_test_func(
        len(train_dataset), 12543,
        (train_dataset[0][0][:10], train_dataset[0][1][:10],
         train_dataset[0][2][:10], train_dataset[-1][0][:10],
         train_dataset[-1][1][:10], train_dataset[-1][2][:10]),
        ([262, 16, 5728, 45, 289, 701, 1160, 4436, 10660, 585],
         [8, 3, 8, 3, 9, 2, 4, 8, 8, 8],
         [5, 34, 5, 27, 7, 11, 14, 5, 5, 5],
         [9, 32, 169, 436, 59, 192, 30, 6, 117, 17],
         [5, 10, 11, 4, 11, 11, 3, 12, 11, 4],
         [6, 20, 8, 10, 8, 8, 24, 13, 8, 15]))
    self._helper_test_func(
        len(valid_dataset), 2002,
        (valid_dataset[0][0][:10], valid_dataset[0][1][:10],
         valid_dataset[0][2][:10], valid_dataset[-1][0][:10],
         valid_dataset[-1][1][:10], valid_dataset[-1][2][:10]),
        ([746, 3, 10633, 656, 25, 1334, 45],
         [6, 7, 8, 4, 7, 2, 3],
         [3, 4, 5, 16, 4, 2, 27],
         [354, 4, 31, 17, 141, 421, 148, 6, 7, 78],
         [11, 3, 5, 4, 9, 2, 2, 12, 7, 11],
         [8, 12, 6, 15, 7, 2, 2, 13, 4, 8]))
    self._helper_test_func(
        len(test_dataset), 2077,
        (test_dataset[0][0][:10], test_dataset[0][1][:10],
         test_dataset[0][2][:10], test_dataset[-1][0][:10],
         test_dataset[-1][1][:10], test_dataset[-1][2][:10]),
        ([210, 54, 3115, 0, 12229, 0, 33],
         [5, 15, 8, 4, 6, 8, 3],
         [30, 3, 5, 14, 3, 5, 9],
         [116, 0, 6, 11, 412, 10, 0, 4, 0, 6],
         [5, 4, 12, 10, 9, 15, 4, 3, 4, 12],
         [6, 16, 13, 16, 7, 3, 19, 12, 19, 13]))

    # Assert vocabs: hoist get_vocabs() into a local instead of
    # re-calling it for every check.
    vocabs = train_dataset.get_vocabs()
    self.assertEqual(len(vocabs), 3)
    self.assertEqual(len(vocabs[0]), 19674)
    self.assertEqual(len(vocabs[1]), 19)
    self.assertEqual(len(vocabs[2]), 52)

    # Assert token ids for a sample sentence.
    word_vocab = vocabs[0]
    tokens_ids = [word_vocab[token]
                  for token in 'Two of them were being run'.split()]
    self.assertEqual(tokens_ids, [1206, 8, 69, 60, 157, 452])

    # Add test for the subset of the standard datasets.
    # Fixed: ('train') is just the string 'train' -- a one-element
    # tuple needs the trailing comma.
    train_dataset, = UDPOS(data_select=('train',))
    self._helper_test_func(
        len(train_dataset), 12543,
        (train_dataset[0][0][:10], train_dataset[-1][2][:10]),
        ([262, 16, 5728, 45, 289, 701, 1160, 4436, 10660, 585],
         [6, 20, 8, 10, 8, 8, 24, 13, 8, 15]))

    # Raw iterators for a subset of splits yield untokenized strings.
    train_iter, valid_iter = torchtext.experimental.datasets.raw.UDPOS(
        data_select=('train', 'valid'))
    self._helper_test_func(
        len(train_iter), 12543,
        ' '.join(next(iter(train_iter))[0][:5]),
        ' '.join(['Al', '-', 'Zaman', ':', 'American']))
    self._helper_test_func(
        len(valid_iter), 2002,
        ' '.join(next(iter(valid_iter))[0][:5]),
        ' '.join(['From', 'the', 'AP', 'comes', 'this']))
    # Drop the iterator handles so the underlying resources can be
    # released before the next test.
    del train_iter, valid_iter