Example #1
    def test_imdb(self):
        from torchtext.experimental.datasets import IMDB
        from torchtext.vocab import Vocab
        # smoke test to ensure imdb works properly
        train_dataset, test_dataset = IMDB()
        self._helper_test_func(
            len(train_dataset), 25000, train_dataset[0][1][:10],
            [13, 1568, 13, 246, 35468, 43, 64, 398, 1135, 92])
        self._helper_test_func(len(test_dataset), 25000,
                               test_dataset[0][1][:10],
                               [13, 125, 1051, 5, 246, 1652, 8, 277, 66, 20])

        # Test API with a vocab input object
        old_vocab = train_dataset.get_vocab()
        new_vocab = Vocab(counter=old_vocab.freqs, max_size=2500)
        new_train_data, new_test_data = IMDB(vocab=new_vocab)

        # Test requesting only a subset of the standard splits
        train_dataset, = IMDB(data_select=('train',))
        self._helper_test_func(
            len(train_dataset), 25000, train_dataset[0][1][:10],
            [13, 1568, 13, 246, 35468, 43, 64, 398, 1135, 92])
        train_iter, test_iter = torchtext.experimental.datasets.raw.IMDB()
        self._helper_test_func(len(train_iter), 25000,
                               next(iter(train_iter))[1][:25],
                               'I rented I AM CURIOUS-YEL')
        self._helper_test_func(len(test_iter), 25000,
                               next(iter(test_iter))[1][:25],
                               'I love sci-fi and am will')
        del train_iter, test_iter
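
The _helper_test_func helper used throughout this example is not shown in the snippet. Judging from the call sites, it compares an actual length against an expected length and a small slice of the data against expected values; a minimal sketch of such a helper (name and behaviour inferred from the calls above, not taken from the torchtext test suite) might look like this:

    def _helper_test_func(self, actual_len, expected_len, actual_data, expected_data):
        # Assumed helper: check the dataset length, then compare a small
        # sample of its contents (tensor slice or string) with the
        # expected values.
        self.assertEqual(actual_len, expected_len)
        if torch.is_tensor(actual_data):
            actual_data = actual_data.tolist()
        self.assertEqual(actual_data, expected_data)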
Example #2
    def test_imdb(self):
        from torchtext.experimental.datasets import IMDB
        from torchtext.vocab import Vocab
        # smoke test to ensure imdb works properly
        train_dataset, test_dataset = IMDB()
        self.assertEqual(len(train_dataset), 25000)
        self.assertEqual(len(test_dataset), 25000)
        assert_allclose(
            train_dataset[0][1][:10],
            torch.tensor([13, 1568, 13, 246, 35468, 43, 64, 398, 1135,
                          92]).long())
        assert_allclose(
            train_dataset[-1][1][:10],
            torch.tensor([2, 71, 4555, 194, 3328, 15144, 42, 227, 148,
                          8]).long())
        assert_allclose(
            test_dataset[0][1][:10],
            torch.tensor([13, 125, 1051, 5, 246, 1652, 8, 277, 66, 20]).long())
        assert_allclose(
            test_dataset[-1][1][:10],
            torch.tensor([13, 1035, 14, 21, 28, 2, 1051, 1275, 1008,
                          3]).long())

        # Test API with a vocab input object
        old_vocab = train_dataset.get_vocab()
        new_vocab = Vocab(counter=old_vocab.freqs, max_size=2500)
        new_train_data, new_test_data = IMDB(vocab=new_vocab)
Example #3
    def test_imdb(self):
        from torchtext.experimental.datasets import IMDB
        from torchtext.vocab import Vocab
        # smoke test to ensure imdb works properly
        train_dataset, test_dataset = IMDB()
        self.assertEqual(len(train_dataset), 25000)
        self.assertEqual(len(test_dataset), 25000)

        # Test API with a vocab input object
        old_vocab = train_dataset.get_vocab()
        new_vocab = Vocab(counter=old_vocab.freqs, max_size=2500)
        new_train_data, new_test_data = IMDB(vocab=new_vocab)

        # Delete the dataset after we're done to save disk space on CI
        datafile = os.path.join(self.project_root, ".data", "imdb")
        conditional_remove(datafile)
        datafile = os.path.join(self.project_root, ".data", "aclImdb")
        conditional_remove(datafile)
        datafile = os.path.join(self.project_root, ".data",
                                "aclImdb_v1.tar.gz")
        conditional_remove(datafile)
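
conditional_remove is not defined in this snippet; from its usage it is presumably a small helper that deletes a file or a directory only if it exists, so the cleanup succeeds whether or not the dataset was actually downloaded. A possible sketch (an assumption, not necessarily the helper shipped with the test suite):

import os
import shutil

def conditional_remove(path):
    # Assumed helper: remove a file or an entire directory tree, but only
    # if the path exists, so repeated cleanup runs never fail.
    if os.path.isfile(path):
        os.remove(path)
    elif os.path.isdir(path):
        shutil.rmtree(path)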
Example #4
def init(config):
    ''' Loads the GloVe embeddings for the words that occur in the
        IMDB train set vocab, uses that vocab to create train,
        validation and test splits of the IMDB dataset, and extracts
        the id of the padding token.
    '''
    import os
    if not os.path.isdir('.data'):
        os.mkdir('.data')

    # Extract the initial vocab from the IMDB dataset
    vocab = IMDB(data_select='train')[0].get_vocab()
    # Build a GloVe-backed vocab based on the original vocab's
    # word frequencies
    glove_vocab = torchtext.vocab.Vocab(
        counter=vocab.freqs,
        max_size=MAX_VOCAB_SIZE,
        min_freq=MIN_FREQ,
        vectors=torchtext.vocab.GloVe(name='6B'))
    # Acquire the spaCy tokenizer for the vocab words
    tokenizer = get_tokenizer('spacy', 'en_core_web_sm')
    # Build the train and test IMDB sets with the previously created
    # GloVe vocab and the spaCy tokenizer
    train_set, test_set = IMDB(tokenizer=tokenizer, vocab=glove_vocab)

    # Extract the vocab of the acquired train set
    vocab = train_set.get_vocab()
    # Extract the token used for padding
    pad_id = vocab['<pad>']

    # Split the train set into train and validation sets
    train_set, valid_set = split_train_val(train_set)

    config['train'] = train_set
    config['val'] = valid_set
    config['test'] = test_set
    config['vocab'] = vocab
    config['pad_id'] = pad_id
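
A natural next step with the config filled in by init is to wrap each split in a DataLoader whose collate function pads every batch to a common length with config['pad_id']. The sketch below assumes each sample from the experimental IMDB dataset is a (label, token_ids_tensor) pair with a numeric label, as in the test snippets above; make_loader is an illustrative name, not part of torchtext:

import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

def make_loader(dataset, pad_id, batch_size=32, shuffle=False):
    # Assumes each sample is a (label, token_ids_tensor) pair; pads the
    # token id sequences in a batch to the length of the longest one.
    def collate(batch):
        labels = torch.tensor([int(label) for label, _ in batch])
        texts = pad_sequence([tokens for _, tokens in batch],
                             batch_first=True, padding_value=pad_id)
        return labels, texts

    return DataLoader(dataset, batch_size=batch_size,
                      shuffle=shuffle, collate_fn=collate)

# e.g. train_loader = make_loader(config['train'], config['pad_id'], shuffle=True)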
Example #5
    def test_imdb(self):
        from torchtext.experimental.datasets import IMDB
        from torchtext.vocab import Vocab
        # smoke test to ensure imdb works properly
        train_dataset, test_dataset = IMDB()
        self.assertEqual(len(train_dataset), 25000)
        self.assertEqual(len(test_dataset), 25000)
        assert_allclose(
            train_dataset[0][1][:10],
            torch.tensor([13, 1568, 13, 246, 35468, 43, 64, 398, 1135,
                          92]).long())
        assert_allclose(
            train_dataset[-1][1][:10],
            torch.tensor([2, 71, 4555, 194, 3328, 15144, 42, 227, 148,
                          8]).long())
        assert_allclose(
            test_dataset[0][1][:10],
            torch.tensor([13, 125, 1051, 5, 246, 1652, 8, 277, 66, 20]).long())
        assert_allclose(
            test_dataset[-1][1][:10],
            torch.tensor([13, 1035, 14, 21, 28, 2, 1051, 1275, 1008,
                          3]).long())

        # Test API with a vocab input object
        old_vocab = train_dataset.get_vocab()
        new_vocab = Vocab(counter=old_vocab.freqs, max_size=2500)
        new_train_data, new_test_data = IMDB(vocab=new_vocab)

        # Delete the dataset after we're done to save disk space on CI
        datafile = os.path.join(self.project_root, ".data", "imdb")
        conditional_remove(datafile)
        datafile = os.path.join(self.project_root, ".data", "aclImdb")
        conditional_remove(datafile)
        datafile = os.path.join(self.project_root, ".data",
                                "aclImdb_v1.tar.gz")
        conditional_remove(datafile)