Code example #1
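Smoke test for the experimental IMDB dataset: it checks that both splits hold 25,000 samples, compares the first ten token ids of the first and last samples in each split against known values, and then rebuilds the dataset with a 2,500-word Vocab derived from the original vocab's frequency counter.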
    def test_imdb(self):
        from torchtext.experimental.datasets import IMDB
        from torchtext.vocab import Vocab
        import torch
        from torch.testing import assert_allclose
        # smoke test to ensure imdb works properly
        train_dataset, test_dataset = IMDB()
        self.assertEqual(len(train_dataset), 25000)
        self.assertEqual(len(test_dataset), 25000)
        assert_allclose(
            train_dataset[0][1][:10],
            torch.tensor([13, 1568, 13, 246, 35468, 43, 64, 398, 1135,
                          92]).long())
        assert_allclose(
            train_dataset[-1][1][:10],
            torch.tensor([2, 71, 4555, 194, 3328, 15144, 42, 227, 148,
                          8]).long())
        assert_allclose(
            test_dataset[0][1][:10],
            torch.tensor([13, 125, 1051, 5, 246, 1652, 8, 277, 66, 20]).long())
        assert_allclose(
            test_dataset[-1][1][:10],
            torch.tensor([13, 1035, 14, 21, 28, 2, 1051, 1275, 1008,
                          3]).long())

        # Test API with a vocab input object
        old_vocab = train_dataset.get_vocab()
        new_vocab = Vocab(counter=old_vocab.freqs, max_size=2500)
        new_train_data, new_test_data = IMDB(vocab=new_vocab)
Code example #2
File: test_builtin_datasets.py  Project: zivlir/text
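The same smoke test reduced to the split-size checks and the vocab round trip, followed by removal of the downloaded archive and extracted directories to reclaim disk space on CI.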
    def test_imdb(self):
        import os
        from torchtext.experimental.datasets import IMDB
        from torchtext.vocab import Vocab
        # smoke test to ensure imdb works properly
        train_dataset, test_dataset = IMDB()
        self.assertEqual(len(train_dataset), 25000)
        self.assertEqual(len(test_dataset), 25000)

        # Test API with a vocab input object
        old_vocab = train_dataset.get_vocab()
        new_vocab = Vocab(counter=old_vocab.freqs, max_size=2500)
        new_train_data, new_test_data = IMDB(vocab=new_vocab)

        # Delete the dataset after we're done to save disk space on CI
        datafile = os.path.join(self.project_root, ".data", "imdb")
        conditional_remove(datafile)
        datafile = os.path.join(self.project_root, ".data", "aclImdb")
        conditional_remove(datafile)
        datafile = os.path.join(self.project_root, ".data",
                                "aclImdb_v1.tar.gz")
        conditional_remove(datafile)
Code example #3
File: test_builtin_datasets.py  Project: taylr/text
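A variant that asserts through a _helper_test_func utility. It also loads only the train split via data_select and exercises the raw (string-yielding) iterators from torchtext.experimental.datasets.raw.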
    def test_imdb(self):
        import torchtext.experimental.datasets.raw
        from torchtext.experimental.datasets import IMDB
        from torchtext.vocab import Vocab
        # smoke test to ensure imdb works properly
        train_dataset, test_dataset = IMDB()
        self._helper_test_func(
            len(train_dataset), 25000, train_dataset[0][1][:10],
            [13, 1568, 13, 246, 35468, 43, 64, 398, 1135, 92])
        self._helper_test_func(len(test_dataset), 25000,
                               test_dataset[0][1][:10],
                               [13, 125, 1051, 5, 246, 1652, 8, 277, 66, 20])

        # Test API with a vocab input object
        old_vocab = train_dataset.get_vocab()
        new_vocab = Vocab(counter=old_vocab.freqs, max_size=2500)
        new_train_data, new_test_data = IMDB(vocab=new_vocab)

        # Test loading only a subset of the standard splits
        train_dataset, = IMDB(data_select=('train',))
        self._helper_test_func(
            len(train_dataset), 25000, train_dataset[0][1][:10],
            [13, 1568, 13, 246, 35468, 43, 64, 398, 1135, 92])
        train_iter, test_iter = torchtext.experimental.datasets.raw.IMDB()
        self._helper_test_func(len(train_iter), 25000,
                               next(iter(train_iter))[1][:25],
                               'I rented I AM CURIOUS-YEL')
        self._helper_test_func(len(test_iter), 25000,
                               next(iter(test_iter))[1][:25],
                               'I love sci-fi and am will')
        del train_iter, test_iter
Code example #4
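A training-setup helper rather than a test: it builds a GloVe-backed Vocab from the IMDB train split's token frequencies, reloads the dataset with a spaCy tokenizer and that vocab, splits off a validation set, and records the splits, vocab, and pad token id in a config dict. MAX_VOCAB_SIZE, MIN_FREQ, and split_train_val are defined elsewhere in the source project.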
def init(config):
    '''Loads the GloVe embeddings for the words that occur in the
    IMDB train set vocab, uses that vocab to create train, validation
    and test sets for the IMDB dataset, and extracts the id of the
    pad token.
    '''
    import os
    import torchtext.vocab
    from torchtext.experimental.datasets import IMDB
    from torchtext.data.utils import get_tokenizer

    # MAX_VOCAB_SIZE, MIN_FREQ and split_train_val are assumed to be
    # defined elsewhere in this project.
    if not os.path.isdir('.data'):
        os.mkdir('.data')

    # Extract the initial vocab from the IMDB dataset
    vocab = IMDB(data_select='train')[0].get_vocab()
    # Create GloVe embeddings based on original vocab
    # word freqs
    glove_vocab = torchtext.vocab.Vocab(
        counter=vocab.freqs,
        max_size=MAX_VOCAB_SIZE,
        min_freq=MIN_FREQ,
        vectors=torchtext.vocab.GloVe(name='6B'))
    # Acquire the spaCy tokenizer for the vocab words
    tokenizer = get_tokenizer('spacy', 'en_core_web_sm')
    # Acquire the train and test IMDB sets with the previously created
    # GloVe vocab and spaCy tokenizer
    train_set, test_set = IMDB(tokenizer=tokenizer, vocab=glove_vocab)

    # Extract the vocab of the acquired train set
    vocab = train_set.get_vocab()
    # Extract the token used for padding
    pad_id = vocab['<pad>']

    # Split the train set into train and validation sets
    train_set, valid_set = split_train_val(train_set)

    config['train'] = train_set
    config['val'] = valid_set
    config['test'] = test_set
    config['vocab'] = vocab
    config['pad_id'] = pad_id
Code example #5
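A minimal smoke test: load both splits, check their sizes, and delete the downloaded files afterwards.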
    def test_imdb(self):
        import os
        from torchtext.experimental.datasets import IMDB
        # smoke test to ensure imdb works properly
        train_dataset, test_dataset = IMDB()
        self.assertEqual(len(train_dataset), 25000)
        self.assertEqual(len(test_dataset), 25000)

        # Delete the dataset after we're done to save disk space on CI
        datafile = os.path.join(self.project_root, ".data", "imdb")
        conditional_remove(datafile)
        datafile = os.path.join(self.project_root, ".data", "aclImdb")
        conditional_remove(datafile)
        datafile = os.path.join(self.project_root, ".data",
                                "aclImdb_v1.tar.gz")
        conditional_remove(datafile)
Code example #6
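This example combines the token-id assertions of example #1 with the CI cleanup of example #2.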
    def test_imdb(self):
        import os
        import torch
        from torch.testing import assert_allclose
        from torchtext.experimental.datasets import IMDB
        from torchtext.vocab import Vocab
        # smoke test to ensure imdb works properly
        train_dataset, test_dataset = IMDB()
        self.assertEqual(len(train_dataset), 25000)
        self.assertEqual(len(test_dataset), 25000)
        assert_allclose(
            train_dataset[0][1][:10],
            torch.tensor([13, 1568, 13, 246, 35468, 43, 64, 398, 1135,
                          92]).long())
        assert_allclose(
            train_dataset[-1][1][:10],
            torch.tensor([2, 71, 4555, 194, 3328, 15144, 42, 227, 148,
                          8]).long())
        assert_allclose(
            test_dataset[0][1][:10],
            torch.tensor([13, 125, 1051, 5, 246, 1652, 8, 277, 66, 20]).long())
        assert_allclose(
            test_dataset[-1][1][:10],
            torch.tensor([13, 1035, 14, 21, 28, 2, 1051, 1275, 1008,
                          3]).long())

        # Test API with a vocab input object
        old_vocab = train_dataset.get_vocab()
        new_vocab = Vocab(counter=old_vocab.freqs, max_size=2500)
        new_train_data, new_test_data = IMDB(vocab=new_vocab)

        # Delete the dataset after we're done to save disk space on CI
        datafile = os.path.join(self.project_root, ".data", "imdb")
        conditional_remove(datafile)
        datafile = os.path.join(self.project_root, ".data", "aclImdb")
        conditional_remove(datafile)
        datafile = os.path.join(self.project_root, ".data",
                                "aclImdb_v1.tar.gz")
        conditional_remove(datafile)
Code example #7
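Checks the token ids through _helper_test_func, then exercises the raw iterators from the stable torchtext.datasets module, comparing the first 25 characters of the first review in each split.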
    def test_imdb(self):
        import torchtext.datasets
        from torchtext.experimental.datasets import IMDB
        # smoke test to ensure imdb works properly
        train_dataset, test_dataset = IMDB()
        self._helper_test_func(
            len(train_dataset), 25000, train_dataset[0][1][:10],
            [13, 1568, 13, 246, 35468, 43, 64, 398, 1135, 92])
        self._helper_test_func(len(test_dataset), 25000,
                               test_dataset[0][1][:10],
                               [13, 125, 1051, 5, 246, 1652, 8, 277, 66, 20])

        # Test the raw iterators from the stable torchtext.datasets module
        train_iter, test_iter = torchtext.datasets.IMDB()
        self._helper_test_func(len(train_iter), 25000,
                               next(train_iter)[1][:25],
                               'I rented I AM CURIOUS-YEL')
        self._helper_test_func(len(test_iter), 25000,
                               next(test_iter)[1][:25],
                               'I love sci-fi and am will')
        del train_iter, test_iter
Code example #8
File: data.py  Project: jeroenvuurens/dl
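A convenience wrapper from a training library: it loads IMDB with the given tokenizer (defaulting to spaCy) and wraps the two splits in the project's TextData container.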
from torchtext.experimental.datasets import IMDB
from torchtext.data.utils import get_tokenizer

def imdb(tokenizer=None):
    if not tokenizer:
        tokenizer = get_tokenizer("spacy")
    # TextData is this project's own container (defined elsewhere in the repo)
    train_ds, valid_ds = IMDB(tokenizer=tokenizer)
    return TextData.from_datasets(train_ds, valid_ds)
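
Taken together, the snippets above revolve around one pattern of the experimental API: build the dataset once, derive a Vocab from its frequency counter, and reload the dataset with that vocab. Below is a minimal round-trip sketch assuming the same torchtext.experimental API as the examples (later torchtext releases removed this API):

from torchtext.experimental.datasets import IMDB
from torchtext.vocab import Vocab

# First pass: download IMDB and build the default vocab.
train_dataset, test_dataset = IMDB()

# Derive a smaller vocab from the observed token frequencies.
base_vocab = train_dataset.get_vocab()
small_vocab = Vocab(counter=base_vocab.freqs, max_size=2500)

# Second pass: rebuild both splits with the reduced vocab.
train_small, test_small = IMDB(vocab=small_vocab)

# Each sample is a (label, token_id_tensor) pair, which is why the
# tests above index samples as sample[1][:10] for the first ten ids.
label, token_ids = train_small[0]
print(label, token_ids[:10])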