Example 1
0
    def test_udpos_sequence_tagging(self):
        """Smoke-test the experimental UDPOS sequence-tagging dataset.

        Verifies, against hard-coded expected ids from a fixed vocab build:
        the sizes of the train/valid/test splits, the first ten token /
        POS-tag / chunk-tag ids of the first and last examples of each
        split, the number and sizes of the three vocabs, a direct token-id
        lookup, the ``data_select`` single-split code path, and the raw
        (string-yielding) iterator variant.
        """
        from torchtext.experimental.datasets import UDPOS

        # Smoke test to ensure UDPOS loads properly (fixed stale comment
        # that referred to "imdb").
        train_dataset, valid_dataset, test_dataset = UDPOS()
        self._helper_test_func(len(train_dataset), 12543, (train_dataset[0][0][:10], train_dataset[0][1][:10],
                                                           train_dataset[0][2][:10], train_dataset[-1][0][:10],
                                                           train_dataset[-1][1][:10], train_dataset[-1][2][:10]),
                               ([262, 16, 5728, 45, 289, 701, 1160, 4436, 10660, 585],
                                [8, 3, 8, 3, 9, 2, 4, 8, 8, 8],
                                [5, 34, 5, 27, 7, 11, 14, 5, 5, 5],
                                [9, 32, 169, 436, 59, 192, 30, 6, 117, 17],
                                [5, 10, 11, 4, 11, 11, 3, 12, 11, 4],
                                [6, 20, 8, 10, 8, 8, 24, 13, 8, 15]))
        self._helper_test_func(len(valid_dataset), 2002, (valid_dataset[0][0][:10], valid_dataset[0][1][:10],
                                                          valid_dataset[0][2][:10], valid_dataset[-1][0][:10],
                                                          valid_dataset[-1][1][:10], valid_dataset[-1][2][:10]),
                               ([746, 3, 10633, 656, 25, 1334, 45],
                                [6, 7, 8, 4, 7, 2, 3],
                                [3, 4, 5, 16, 4, 2, 27],
                                [354, 4, 31, 17, 141, 421, 148, 6, 7, 78],
                                [11, 3, 5, 4, 9, 2, 2, 12, 7, 11],
                                [8, 12, 6, 15, 7, 2, 2, 13, 4, 8]))
        self._helper_test_func(len(test_dataset), 2077, (test_dataset[0][0][:10], test_dataset[0][1][:10],
                                                         test_dataset[0][2][:10], test_dataset[-1][0][:10],
                                                         test_dataset[-1][1][:10], test_dataset[-1][2][:10]),
                               ([210, 54, 3115, 0, 12229, 0, 33],
                                [5, 15, 8, 4, 6, 8, 3],
                                [30, 3, 5, 14, 3, 5, 9],
                                [116, 0, 6, 11, 412, 10, 0, 4, 0, 6],
                                [5, 4, 12, 10, 9, 15, 4, 3, 4, 12],
                                [6, 16, 13, 16, 7, 3, 19, 12, 19, 13]))

        # Assert vocabs: one vocab per field (tokens, POS tags, chunk tags).
        self.assertEqual(len(train_dataset.get_vocabs()), 3)
        self.assertEqual(len(train_dataset.get_vocabs()[0]), 19674)
        self.assertEqual(len(train_dataset.get_vocabs()[1]), 19)
        self.assertEqual(len(train_dataset.get_vocabs()[2]), 52)

        # Assert token ids from the word (field-0) vocab.
        word_vocab = train_dataset.get_vocabs()[0]
        tokens_ids = [word_vocab[token] for token in 'Two of them were being run'.split()]
        self.assertEqual(tokens_ids, [1206, 8, 69, 60, 157, 452])

        # Add test for the subset of the standard datasets.
        # BUGFIX: ('train') is just the string 'train', not a tuple — the
        # trailing comma makes it the intended one-element tuple. (torchtext
        # happened to tolerate a bare string, so behavior is unchanged.)
        train_dataset, = UDPOS(data_select=('train',))
        self._helper_test_func(len(train_dataset), 12543, (train_dataset[0][0][:10], train_dataset[-1][2][:10]),
                               ([262, 16, 5728, 45, 289, 701, 1160, 4436, 10660, 585],
                                [6, 20, 8, 10, 8, 8, 24, 13, 8, 15]))
        # Raw variant yields untokenized/unnumericalized string columns.
        train_iter, valid_iter = torchtext.experimental.datasets.raw.UDPOS(data_select=('train', 'valid'))
        self._helper_test_func(len(train_iter), 12543, ' '.join(next(iter(train_iter))[0][:5]),
                               ' '.join(['Al', '-', 'Zaman', ':', 'American']))
        self._helper_test_func(len(valid_iter), 2002, ' '.join(next(iter(valid_iter))[0][:5]),
                               ' '.join(['From', 'the', 'AP', 'comes', 'this']))
        del train_iter, valid_iter
Example 2
0
    def test_udpos_sequence_tagging(self):
        """Smoke-test the experimental UDPOS sequence-tagging dataset.

        Checks split sizes, the first ten token / POS-tag / chunk-tag ids
        of the first and last examples of every split (against hard-coded
        expected values), the number and sizes of the three vocabs, and a
        direct token-to-id lookup in the word vocab.
        """
        from torchtext.experimental.datasets import UDPOS

        # Load all three standard splits and pin their sizes.
        train_dataset, valid_dataset, test_dataset = UDPOS()
        self.assertEqual(len(train_dataset), 12543)
        self.assertEqual(len(valid_dataset), 2002)
        self.assertEqual(len(test_dataset), 2077)

        # Expected id prefixes, per split: one (tokens, pos_tags, chunk_tags)
        # triple for the first example and one for the last example.
        expected_rows = [
            (train_dataset,
             [([262, 16, 5728, 45, 289, 701, 1160, 4436, 10660, 585],
               [8, 3, 8, 3, 9, 2, 4, 8, 8, 8],
               [5, 34, 5, 27, 7, 11, 14, 5, 5, 5]),
              ([9, 32, 169, 436, 59, 192, 30, 6, 117, 17],
               [5, 10, 11, 4, 11, 11, 3, 12, 11, 4],
               [6, 20, 8, 10, 8, 8, 24, 13, 8, 15])]),
            (valid_dataset,
             [([746, 3, 10633, 656, 25, 1334, 45],
               [6, 7, 8, 4, 7, 2, 3],
               [3, 4, 5, 16, 4, 2, 27]),
              ([354, 4, 31, 17, 141, 421, 148, 6, 7, 78],
               [11, 3, 5, 4, 9, 2, 2, 12, 7, 11],
               [8, 12, 6, 15, 7, 2, 2, 13, 4, 8])]),
            (test_dataset,
             [([210, 54, 3115, 0, 12229, 0, 33],
               [5, 15, 8, 4, 6, 8, 3],
               [30, 3, 5, 14, 3, 5, 9]),
              ([116, 0, 6, 11, 412, 10, 0, 4, 0, 6],
               [5, 4, 12, 10, 9, 15, 4, 3, 4, 12],
               [6, 16, 13, 16, 7, 3, 19, 12, 19, 13])]),
        ]
        for dataset, rows in expected_rows:
            for example_idx, triple in zip((0, -1), rows):
                for field_idx, ids in enumerate(triple):
                    self.assertEqual(dataset[example_idx][field_idx][:10],
                                     torch.tensor(ids).long())

        # Assert vocabs: one vocab per field (tokens, POS tags, chunk tags).
        self.assertEqual(len(train_dataset.get_vocabs()), 3)
        for field_idx, vocab_size in enumerate((19674, 19, 52)):
            self.assertEqual(len(train_dataset.get_vocabs()[field_idx]), vocab_size)

        # Assert token ids via a direct lookup in the word (field-0) vocab.
        word_vocab = train_dataset.get_vocabs()[0]
        sentence = 'Two of them were being run'
        token_ids = [word_vocab[word] for word in sentence.split()]
        self.assertEqual(token_ids, [1206, 8, 69, 60, 157, 452])