def test_imdb(self):
    from torchtext.experimental.datasets import IMDB
    from torchtext.vocab import Vocab
    # smoke test to ensure imdb works properly
    train_dataset, test_dataset = IMDB()
    self.assertEqual(len(train_dataset), 25000)
    self.assertEqual(len(test_dataset), 25000)
    assert_allclose(train_dataset[0][1][:10],
                    torch.tensor([13, 1568, 13, 246, 35468, 43, 64, 398, 1135, 92]).long())
    assert_allclose(train_dataset[-1][1][:10],
                    torch.tensor([2, 71, 4555, 194, 3328, 15144, 42, 227, 148, 8]).long())
    assert_allclose(test_dataset[0][1][:10],
                    torch.tensor([13, 125, 1051, 5, 246, 1652, 8, 277, 66, 20]).long())
    assert_allclose(test_dataset[-1][1][:10],
                    torch.tensor([13, 1035, 14, 21, 28, 2, 1051, 1275, 1008, 3]).long())

    # Test API with a vocab input object
    old_vocab = train_dataset.get_vocab()
    new_vocab = Vocab(counter=old_vocab.freqs, max_size=2500)
    new_train_data, new_test_data = IMDB(vocab=new_vocab)

def test_imdb(self):
    from torchtext.experimental.datasets import IMDB
    from torchtext.vocab import Vocab
    # smoke test to ensure imdb works properly
    train_dataset, test_dataset = IMDB()
    self.assertEqual(len(train_dataset), 25000)
    self.assertEqual(len(test_dataset), 25000)

    # Test API with a vocab input object
    old_vocab = train_dataset.get_vocab()
    new_vocab = Vocab(counter=old_vocab.freqs, max_size=2500)
    new_train_data, new_test_data = IMDB(vocab=new_vocab)

    # Delete the dataset after we're done to save disk space on CI
    datafile = os.path.join(self.project_root, ".data", "imdb")
    conditional_remove(datafile)
    datafile = os.path.join(self.project_root, ".data", "aclImdb")
    conditional_remove(datafile)
    datafile = os.path.join(self.project_root, ".data", "aclImdb_v1.tar.gz")
    conditional_remove(datafile)

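# The cleanup steps above rely on a conditional_remove helper defined elsewhere
# in the test module. A plausible minimal version is sketched below as an
# assumption (not the project's actual implementation): it deletes a path only
# if it exists, handling both plain files and extracted dataset directories.
import os
import shutil

def conditional_remove(path):
    if os.path.isfile(path):
        os.remove(path)
    elif os.path.isdir(path):
        shutil.rmtree(path)
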
def test_imdb(self):
    from torchtext.experimental.datasets import IMDB
    from torchtext.vocab import Vocab
    # smoke test to ensure imdb works properly
    train_dataset, test_dataset = IMDB()
    self._helper_test_func(len(train_dataset), 25000, train_dataset[0][1][:10],
                           [13, 1568, 13, 246, 35468, 43, 64, 398, 1135, 92])
    self._helper_test_func(len(test_dataset), 25000, test_dataset[0][1][:10],
                           [13, 125, 1051, 5, 246, 1652, 8, 277, 66, 20])

    # Test API with a vocab input object
    old_vocab = train_dataset.get_vocab()
    new_vocab = Vocab(counter=old_vocab.freqs, max_size=2500)
    new_train_data, new_test_data = IMDB(vocab=new_vocab)

    # Add test for the subset of the standard datasets
    train_dataset, = IMDB(data_select=('train'))
    self._helper_test_func(len(train_dataset), 25000, train_dataset[0][1][:10],
                           [13, 1568, 13, 246, 35468, 43, 64, 398, 1135, 92])
    train_iter, test_iter = torchtext.experimental.datasets.raw.IMDB()
    self._helper_test_func(len(train_iter), 25000, next(iter(train_iter))[1][:25],
                           'I rented I AM CURIOUS-YEL')
    self._helper_test_func(len(test_iter), 25000, next(iter(test_iter))[1][:25],
                           'I love sci-fi and am will')
    del train_iter, test_iter

def init(config):
    '''
    Loads the GloVe embeddings for the words which occur in the IMDB train set
    vocab and uses that vocab to create train, validation and test sets for the
    IMDB dataset. Extracts the pad_id token.
    '''
    import os
    if not os.path.isdir('.data'):
        os.mkdir('.data')
    # Extract the initial vocab from the IMDB dataset
    vocab = IMDB(data_select='train')[0].get_vocab()
    # Create GloVe embeddings based on original vocab word freqs
    glove_vocab = torchtext.vocab.Vocab(counter=vocab.freqs,
                                        max_size=MAX_VOCAB_SIZE,
                                        min_freq=MIN_FREQ,
                                        vectors=torchtext.vocab.GloVe(name='6B'))
    # Acquire 'Spacy' tokenizer for the vocab words
    tokenizer = get_tokenizer('spacy', 'en_core_web_sm')
    # Acquire train and test IMDB sets with previously created
    # GloVe vocab and 'Spacy' tokenizer
    train_set, test_set = IMDB(tokenizer=tokenizer, vocab=glove_vocab)
    # Extract the vocab of the acquired train set
    vocab = train_set.get_vocab()
    # Extract the token used for padding
    pad_id = vocab['<pad>']
    # Split the train set into train and validation sets
    train_set, valid_set = split_train_val(train_set)

    config['train'] = train_set
    config['val'] = valid_set
    config['test'] = test_set
    config['vocab'] = vocab
    config['pad_id'] = pad_id

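# Hedged usage sketch for init() above. It assumes MAX_VOCAB_SIZE, MIN_FREQ and
# split_train_val are defined at module level (they are referenced but not
# shown in the snippet); the driver code below is illustrative only.
if __name__ == "__main__":
    config = {}
    init(config)
    # After init(), config holds the train/val/test datasets, the GloVe-backed
    # vocab and the integer id of the '<pad>' token.
    print(len(config['vocab']), config['pad_id'])
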
def test_imdb(self):
    from torchtext.experimental.datasets import IMDB
    # smoke test to ensure imdb works properly
    train_dataset, test_dataset = IMDB()
    self.assertEqual(len(train_dataset), 25000)
    self.assertEqual(len(test_dataset), 25000)

    # Delete the dataset after we're done to save disk space on CI
    datafile = os.path.join(self.project_root, ".data", "imdb")
    conditional_remove(datafile)
    datafile = os.path.join(self.project_root, ".data", "aclImdb")
    conditional_remove(datafile)
    datafile = os.path.join(self.project_root, ".data", "aclImdb_v1.tar.gz")
    conditional_remove(datafile)

def test_imdb(self):
    from torchtext.experimental.datasets import IMDB
    from torchtext.vocab import Vocab
    # smoke test to ensure imdb works properly
    train_dataset, test_dataset = IMDB()
    self.assertEqual(len(train_dataset), 25000)
    self.assertEqual(len(test_dataset), 25000)
    assert_allclose(train_dataset[0][1][:10],
                    torch.tensor([13, 1568, 13, 246, 35468, 43, 64, 398, 1135, 92]).long())
    assert_allclose(train_dataset[-1][1][:10],
                    torch.tensor([2, 71, 4555, 194, 3328, 15144, 42, 227, 148, 8]).long())
    assert_allclose(test_dataset[0][1][:10],
                    torch.tensor([13, 125, 1051, 5, 246, 1652, 8, 277, 66, 20]).long())
    assert_allclose(test_dataset[-1][1][:10],
                    torch.tensor([13, 1035, 14, 21, 28, 2, 1051, 1275, 1008, 3]).long())

    # Test API with a vocab input object
    old_vocab = train_dataset.get_vocab()
    new_vocab = Vocab(counter=old_vocab.freqs, max_size=2500)
    new_train_data, new_test_data = IMDB(vocab=new_vocab)

    # Delete the dataset after we're done to save disk space on CI
    datafile = os.path.join(self.project_root, ".data", "imdb")
    conditional_remove(datafile)
    datafile = os.path.join(self.project_root, ".data", "aclImdb")
    conditional_remove(datafile)
    datafile = os.path.join(self.project_root, ".data", "aclImdb_v1.tar.gz")
    conditional_remove(datafile)

def test_imdb(self):
    from torchtext.experimental.datasets import IMDB
    # smoke test to ensure imdb works properly
    train_dataset, test_dataset = IMDB()
    self._helper_test_func(len(train_dataset), 25000, train_dataset[0][1][:10],
                           [13, 1568, 13, 246, 35468, 43, 64, 398, 1135, 92])
    self._helper_test_func(len(test_dataset), 25000, test_dataset[0][1][:10],
                           [13, 125, 1051, 5, 246, 1652, 8, 277, 66, 20])

    # Add test for the subset of the standard datasets
    train_iter, test_iter = torchtext.datasets.IMDB()
    self._helper_test_func(len(train_iter), 25000, next(train_iter)[1][:25],
                           'I rented I AM CURIOUS-YEL')
    self._helper_test_func(len(test_iter), 25000, next(test_iter)[1][:25],
                           'I love sci-fi and am will')
    del train_iter, test_iter

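# Hedged sketch, not taken from the tests above: it assumes the raw IMDB
# iterators yield (label, text) pairs, as the assertions above suggest, and
# shows one plausible way to build a vocab from the raw stream with torchtext's
# documented helpers. Exact signatures vary across torchtext releases, so treat
# this as an illustration rather than the library's canonical pipeline.
import torchtext
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

def build_imdb_vocab():
    tokenizer = get_tokenizer('basic_english')
    train_iter, _ = torchtext.datasets.IMDB()
    # Stream the raw reviews through the tokenizer to collect token frequencies.
    return build_vocab_from_iterator(tokenizer(text) for _, text in train_iter)
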
def imdb(tokenizer=None):
    if not tokenizer:
        tokenizer = get_tokenizer("spacy")
    train_ds, valid_ds = IMDB(tokenizer=tokenizer)
    return TextData.from_datasets(train_ds, valid_ds)