コード例 #1
0
    def test_conll_sequence_tagging(self):
        """End-to-end check of the experimental CoNLL2000Chunking dataset.

        Verifies: train/test split sizes, the first ten token/POS/chunk ids of
        the first and last examples in each split, the three vocab sizes,
        lookup of a known token sequence, and the train-only subset in both
        the dataset and raw-iterator forms.
        """
        from torchtext.experimental.datasets import CoNLL2000Chunking

        # Smoke test to ensure CoNLL2000Chunking loads and indexes properly.
        train_dataset, test_dataset = CoNLL2000Chunking()
        # Each example is a (tokens, pos_tags, chunk_tags) triple of id lists;
        # compare the first 10 ids of the first and last train examples.
        self._helper_test_func(
            len(train_dataset), 8936,
            (train_dataset[0][0][:10], train_dataset[0][1][:10],
             train_dataset[0][2][:10], train_dataset[-1][0][:10],
             train_dataset[-1][1][:10], train_dataset[-1][2][:10]),
            ([11556, 9, 3, 1775, 17, 1164, 177, 6, 212, 317], [
                2, 3, 5, 2, 17, 12, 16, 15, 13, 5
            ], [3, 6, 3, 2, 5, 7, 7, 7, 7, 3], [
                85, 17, 59, 6473, 288, 115, 72, 5, 2294, 2502
            ], [18, 17, 12, 19, 10, 6, 3, 3, 4, 4
                ], [3, 5, 7, 7, 3, 2, 6, 6, 3, 2]))
        # Same structural check for the test split.
        self._helper_test_func(
            len(test_dataset), 2012,
            (test_dataset[0][0][:10], test_dataset[0][1][:10],
             test_dataset[0][2][:10], test_dataset[-1][0][:10],
             test_dataset[-1][1][:10], test_dataset[-1][2][:10]),
            ([0, 294, 73, 10, 13582, 194, 18, 24, 2414, 7], [
                4, 4, 4, 23, 4, 2, 11, 18, 11, 5
            ], [3, 2, 2, 3, 2, 2, 5, 3, 5, 3
                ], [51, 456, 560, 2, 11, 465, 2, 1413, 36, 60],
             [3, 4, 4, 8, 3, 2, 8, 4, 17, 16], [6, 3, 2, 4, 6, 3, 4, 3, 5, 7]))

        # Assert vocabs: one each for tokens, POS tags, and chunk tags.
        self.assertEqual(len(train_dataset.get_vocabs()), 3)
        self.assertEqual(len(train_dataset.get_vocabs()[0]), 19124)
        self.assertEqual(len(train_dataset.get_vocabs()[1]), 46)
        self.assertEqual(len(train_dataset.get_vocabs()[2]), 24)

        # Assert token ids from the word vocab for a known sentence prefix.
        word_vocab = train_dataset.get_vocabs()[0]
        tokens_ids = [
            word_vocab[token]
            for token in 'Two of them were being run'.split()
        ]
        self.assertEqual(tokens_ids, [970, 5, 135, 43, 214, 690])

        # Add test for the subset of the standard datasets.
        # NOTE: ('train',) — the trailing comma is required for a one-element
        # tuple; the original ('train') was just the string 'train'.
        train_dataset, = CoNLL2000Chunking(data_select=('train',))
        self._helper_test_func(
            len(train_dataset), 8936,
            (train_dataset[0][0][:10], train_dataset[0][1][:10],
             train_dataset[0][2][:10], train_dataset[-1][0][:10],
             train_dataset[-1][1][:10], train_dataset[-1][2][:10]),
            ([11556, 9, 3, 1775, 17, 1164, 177, 6, 212, 317], [
                2, 3, 5, 2, 17, 12, 16, 15, 13, 5
            ], [3, 6, 3, 2, 5, 7, 7, 7, 7, 3], [
                85, 17, 59, 6473, 288, 115, 72, 5, 2294, 2502
            ], [18, 17, 12, 19, 10, 6, 3, 3, 4, 4
                ], [3, 5, 7, 7, 3, 2, 6, 6, 3, 2]))
        # The raw iterator yields untokenized string fields.
        train_iter, = torchtext.experimental.datasets.raw.CoNLL2000Chunking(
            data_select=('train',))
        self._helper_test_func(
            len(train_iter), 8936, ' '.join(next(iter(train_iter))[0][:5]),
            ' '.join(['Confidence', 'in', 'the', 'pound', 'is']))
        del train_iter
コード例 #2
0
    def test_conll_sequence_tagging(self):
        """End-to-end check of the experimental CoNLL2000Chunking dataset.

        Verifies: train/test split sizes, the first ten token/POS/chunk ids of
        the first and last examples in each split, the three vocab sizes, and
        lookup of a known token sequence in the word vocab.
        """
        from torchtext.experimental.datasets import CoNLL2000Chunking

        # Smoke test to ensure CoNLL2000Chunking loads and indexes properly.
        train_dataset, test_dataset = CoNLL2000Chunking()
        self.assertEqual(len(train_dataset), 8936)
        self.assertEqual(len(test_dataset), 2012)
        # Each example is a (tokens, pos_tags, chunk_tags) triple of id
        # tensors; compare the first 10 ids of the first and last examples.
        self.assertEqual(train_dataset[0][0][:10],
                         torch.tensor([11556, 9, 3, 1775, 17, 1164, 177, 6, 212, 317]).long())
        self.assertEqual(train_dataset[0][1][:10],
                         torch.tensor([2, 3, 5, 2, 17, 12, 16, 15, 13, 5]).long())
        self.assertEqual(train_dataset[0][2][:10],
                         torch.tensor([3, 6, 3, 2, 5, 7, 7, 7, 7, 3]).long())
        self.assertEqual(train_dataset[-1][0][:10],
                         torch.tensor([85, 17, 59, 6473, 288, 115, 72, 5, 2294, 2502]).long())
        self.assertEqual(train_dataset[-1][1][:10],
                         torch.tensor([18, 17, 12, 19, 10, 6, 3, 3, 4, 4]).long())
        self.assertEqual(train_dataset[-1][2][:10],
                         torch.tensor([3, 5, 7, 7, 3, 2, 6, 6, 3, 2]).long())

        # Same structural check for the test split.
        self.assertEqual(test_dataset[0][0][:10],
                         torch.tensor([0, 294, 73, 10, 13582, 194, 18, 24, 2414, 7]).long())
        self.assertEqual(test_dataset[0][1][:10],
                         torch.tensor([4, 4, 4, 23, 4, 2, 11, 18, 11, 5]).long())
        self.assertEqual(test_dataset[0][2][:10],
                         torch.tensor([3, 2, 2, 3, 2, 2, 5, 3, 5, 3]).long())
        self.assertEqual(test_dataset[-1][0][:10],
                         torch.tensor([51, 456, 560, 2, 11, 465, 2, 1413, 36, 60]).long())
        self.assertEqual(test_dataset[-1][1][:10],
                         torch.tensor([3, 4, 4, 8, 3, 2, 8, 4, 17, 16]).long())
        self.assertEqual(test_dataset[-1][2][:10],
                         torch.tensor([6, 3, 2, 4, 6, 3, 4, 3, 5, 7]).long())

        # Assert vocabs: one each for tokens, POS tags, and chunk tags.
        self.assertEqual(len(train_dataset.get_vocabs()), 3)
        self.assertEqual(len(train_dataset.get_vocabs()[0]), 19124)
        self.assertEqual(len(train_dataset.get_vocabs()[1]), 46)
        self.assertEqual(len(train_dataset.get_vocabs()[2]), 24)

        # Assert token ids from the word vocab for a known sentence prefix.
        word_vocab = train_dataset.get_vocabs()[0]
        tokens_ids = [word_vocab[token] for token in 'Two of them were being run'.split()]
        self.assertEqual(tokens_ids, [970, 5, 135, 43, 214, 690])