def test_conll_sequence_tagging(self):
    """End-to-end check of the CoNLL2000Chunking experimental dataset.

    Verifies dataset lengths, the first/last examples of the train and
    test splits (each example is a (word ids, POS-tag ids, chunk-tag ids)
    triple), the three built vocabularies, word-vocab lookups, the
    ``data_select`` subset path, and the raw iterator.
    """
    from torchtext.experimental.datasets import CoNLL2000Chunking

    # Smoke test to ensure CoNLL2000Chunking works properly.
    # (The original comment said "imdb" -- a copy/paste leftover.)
    train_dataset, test_dataset = CoNLL2000Chunking()
    self._helper_test_func(
        len(train_dataset), 8936,
        (train_dataset[0][0][:10], train_dataset[0][1][:10],
         train_dataset[0][2][:10], train_dataset[-1][0][:10],
         train_dataset[-1][1][:10], train_dataset[-1][2][:10]),
        ([11556, 9, 3, 1775, 17, 1164, 177, 6, 212, 317],
         [2, 3, 5, 2, 17, 12, 16, 15, 13, 5],
         [3, 6, 3, 2, 5, 7, 7, 7, 7, 3],
         [85, 17, 59, 6473, 288, 115, 72, 5, 2294, 2502],
         [18, 17, 12, 19, 10, 6, 3, 3, 4, 4],
         [3, 5, 7, 7, 3, 2, 6, 6, 3, 2]))
    self._helper_test_func(
        len(test_dataset), 2012,
        (test_dataset[0][0][:10], test_dataset[0][1][:10],
         test_dataset[0][2][:10], test_dataset[-1][0][:10],
         test_dataset[-1][1][:10], test_dataset[-1][2][:10]),
        ([0, 294, 73, 10, 13582, 194, 18, 24, 2414, 7],
         [4, 4, 4, 23, 4, 2, 11, 18, 11, 5],
         [3, 2, 2, 3, 2, 2, 5, 3, 5, 3],
         [51, 456, 560, 2, 11, 465, 2, 1413, 36, 60],
         [3, 4, 4, 8, 3, 2, 8, 4, 17, 16],
         [6, 3, 2, 4, 6, 3, 4, 3, 5, 7]))

    # Assert vocabs: one vocab each for words, POS tags, and chunk tags.
    self.assertEqual(len(train_dataset.get_vocabs()), 3)
    self.assertEqual(len(train_dataset.get_vocabs()[0]), 19124)
    self.assertEqual(len(train_dataset.get_vocabs()[1]), 46)
    self.assertEqual(len(train_dataset.get_vocabs()[2]), 24)

    # Assert token ids produced by the word vocabulary.
    word_vocab = train_dataset.get_vocabs()[0]
    tokens_ids = [
        word_vocab[token]
        for token in 'Two of them were being run'.split()
    ]
    self.assertEqual(tokens_ids, [970, 5, 135, 43, 214, 690])

    # Add test for the subset of the standard datasets.
    # FIX: the original passed data_select=('train') -- parentheses around
    # a lone string are not a tuple; a one-element tuple needs the
    # trailing comma ('train',).
    train_dataset, = CoNLL2000Chunking(data_select=('train',))
    self._helper_test_func(
        len(train_dataset), 8936,
        (train_dataset[0][0][:10], train_dataset[0][1][:10],
         train_dataset[0][2][:10], train_dataset[-1][0][:10],
         train_dataset[-1][1][:10], train_dataset[-1][2][:10]),
        ([11556, 9, 3, 1775, 17, 1164, 177, 6, 212, 317],
         [2, 3, 5, 2, 17, 12, 16, 15, 13, 5],
         [3, 6, 3, 2, 5, 7, 7, 7, 7, 3],
         [85, 17, 59, 6473, 288, 115, 72, 5, 2294, 2502],
         [18, 17, 12, 19, 10, 6, 3, 3, 4, 4],
         [3, 5, 7, 7, 3, 2, 6, 6, 3, 2]))

    # Raw iterator yields untokenized string columns; check the first
    # five words of the first training example.
    train_iter, = torchtext.experimental.datasets.raw.CoNLL2000Chunking(
        data_select=('train',))
    self._helper_test_func(
        len(train_iter), 8936,
        ' '.join(next(iter(train_iter))[0][:5]),
        ' '.join(['Confidence', 'in', 'the', 'pound', 'is']))
    del train_iter
def test_conll_sequence_tagging(self):
    """Check CoNLL2000Chunking dataset contents against known tensors.

    Each dataset example is a triple of long tensors:
    (word ids, POS-tag ids, chunk-tag ids).

    NOTE(review): this re-defines ``test_conll_sequence_tagging`` with the
    same name as an earlier definition in this file, so only this version
    is bound on the class and the earlier one never runs -- the two should
    be merged or given distinct names.
    """
    from torchtext.experimental.datasets import CoNLL2000Chunking

    # Smoke test to ensure CoNLL2000Chunking works properly.
    # (The original comment said "imdb" -- a copy/paste leftover.)
    train_dataset, test_dataset = CoNLL2000Chunking()
    self.assertEqual(len(train_dataset), 8936)
    self.assertEqual(len(test_dataset), 2012)

    # First and last training examples, first ten positions of each column.
    self.assertEqual(train_dataset[0][0][:10],
                     torch.tensor([11556, 9, 3, 1775, 17, 1164, 177, 6, 212, 317]).long())
    self.assertEqual(train_dataset[0][1][:10],
                     torch.tensor([2, 3, 5, 2, 17, 12, 16, 15, 13, 5]).long())
    self.assertEqual(train_dataset[0][2][:10],
                     torch.tensor([3, 6, 3, 2, 5, 7, 7, 7, 7, 3]).long())
    self.assertEqual(train_dataset[-1][0][:10],
                     torch.tensor([85, 17, 59, 6473, 288, 115, 72, 5, 2294, 2502]).long())
    self.assertEqual(train_dataset[-1][1][:10],
                     torch.tensor([18, 17, 12, 19, 10, 6, 3, 3, 4, 4]).long())
    self.assertEqual(train_dataset[-1][2][:10],
                     torch.tensor([3, 5, 7, 7, 3, 2, 6, 6, 3, 2]).long())

    # First and last test examples, first ten positions of each column.
    self.assertEqual(test_dataset[0][0][:10],
                     torch.tensor([0, 294, 73, 10, 13582, 194, 18, 24, 2414, 7]).long())
    self.assertEqual(test_dataset[0][1][:10],
                     torch.tensor([4, 4, 4, 23, 4, 2, 11, 18, 11, 5]).long())
    self.assertEqual(test_dataset[0][2][:10],
                     torch.tensor([3, 2, 2, 3, 2, 2, 5, 3, 5, 3]).long())
    self.assertEqual(test_dataset[-1][0][:10],
                     torch.tensor([51, 456, 560, 2, 11, 465, 2, 1413, 36, 60]).long())
    self.assertEqual(test_dataset[-1][1][:10],
                     torch.tensor([3, 4, 4, 8, 3, 2, 8, 4, 17, 16]).long())
    self.assertEqual(test_dataset[-1][2][:10],
                     torch.tensor([6, 3, 2, 4, 6, 3, 4, 3, 5, 7]).long())

    # Assert vocabs: one vocab each for words, POS tags, and chunk tags.
    self.assertEqual(len(train_dataset.get_vocabs()), 3)
    self.assertEqual(len(train_dataset.get_vocabs()[0]), 19124)
    self.assertEqual(len(train_dataset.get_vocabs()[1]), 46)
    self.assertEqual(len(train_dataset.get_vocabs()[2]), 24)

    # Assert token ids produced by the word vocabulary.
    word_vocab = train_dataset.get_vocabs()[0]
    tokens_ids = [
        word_vocab[token]
        for token in 'Two of them were being run'.split()
    ]
    self.assertEqual(tokens_ids, [970, 5, 135, 43, 214, 690])