def test_ngrams_func(self):
    """ngrams_func(n) keeps the unigrams and appends every 2..n-gram in order."""
    tokens = ['A', 'string', 'particularly', 'one', 'with', 'slightly']
    bigrams = ['A string', 'string particularly', 'particularly one',
               'one with', 'with slightly']
    trigrams = ['A string particularly', 'string particularly one',
                'particularly one with', 'one with slightly']

    # n=1: identity on the token list.
    assert ngrams_func(1)(tokens) == tokens
    # n=2: unigrams followed by all bigrams.
    assert ngrams_func(2)(tokens) == tokens + bigrams
    # n=3: unigrams, then bigrams, then trigrams.
    assert ngrams_func(3)(tokens) == tokens + bigrams + trigrams
def _setup_datasets(dataset_name, root, ngrams, vocab, tokenizer, split_):
    """Build wrapped TextClassificationDataset(s) for the requested split(s).

    Falls back to the "basic_english" tokenizer when none is given, and
    builds a vocab from the train split when `vocab` is None (raising
    TypeError if train was not selected).
    """
    if tokenizer is None:
        tokenizer = get_tokenizer("basic_english")
    # tokens -> tokens plus their n-grams
    text_transform = sequential_transforms(tokenizer, ngrams_func(ngrams))

    split = check_default_set(split_, ('train', 'test'), dataset_name)
    raw_datasets = raw.DATASETS[dataset_name](root=root, split=split)
    # Materialize raw text iterable dataset
    raw_data = dict(zip(split, map(list, raw_datasets)))

    if vocab is None:
        if "train" not in split:
            raise TypeError("Must pass a vocab if train is not selected.")
        logger_.info('Building Vocab based on train data')
        vocab = build_vocab(raw_data["train"], text_transform)
    logger_.info('Vocab has %d entries', len(vocab))

    # Extend the text pipeline with numericalization and tensor conversion.
    text_transform = sequential_transforms(
        text_transform, vocab_func(vocab), totensor(dtype=torch.long))

    # IMDB labels arrive as 'pos'/'neg' strings; map them to 1/0 first.
    if dataset_name == 'IMDB':
        label_transform = sequential_transforms(
            lambda x: 1 if x == 'pos' else 0, totensor(dtype=torch.long))
    else:
        label_transform = sequential_transforms(totensor(dtype=torch.long))

    logger_.info('Building datasets for {}'.format(split))
    datasets = tuple(
        TextClassificationDataset(raw_data[name], vocab,
                                  (label_transform, text_transform))
        for name in split)
    return wrap_datasets(datasets, split_)
def _setup_datasets(dataset_name, root, ngrams, vocab, tokenizer, data_select):
    """Create a TextClassificationDataset for each subset in `data_select`.

    Uses the "basic_english" tokenizer by default and derives a vocab from
    the train subset when `vocab` is None (TypeError if train is absent).
    """
    if tokenizer is None:
        tokenizer = get_tokenizer("basic_english")
    text_transform = sequential_transforms(tokenizer, ngrams_func(ngrams))

    data_select = check_default_set(data_select, ('train', 'test'))
    raw_datasets = raw.DATASETS[dataset_name](root=root,
                                              data_select=data_select)
    # Materialize raw text iterable dataset
    raw_data = dict(zip(data_select, (list(ds) for ds in raw_datasets)))

    if vocab is None:
        if "train" not in data_select:
            raise TypeError("Must pass a vocab if train is not selected.")
        vocab = build_vocab(raw_data["train"], text_transform)

    # Append numericalization + tensor conversion to the text pipeline.
    text_transform = sequential_transforms(
        text_transform, vocab_func(vocab), totensor(dtype=torch.long))

    # IMDB labels are 'pos'/'neg' strings; everything else is already numeric.
    label_transform = (
        sequential_transforms(lambda x: 1 if x == 'pos' else 0,
                              totensor(dtype=torch.long))
        if dataset_name == 'IMDB'
        else sequential_transforms(totensor(dtype=torch.long)))

    return tuple(
        TextClassificationDataset(raw_data[subset], vocab,
                                  (label_transform, text_transform))
        for subset in data_select)
def _setup_datasets(
    dataset_name,
    root=".data",
    ngrams=1,
    vocab=None,
    tokenizer=None,
    data_select=("train", "test"),
):
    """Build TextClassificationDataset(s) for the selected subsets.

    Args:
        dataset_name: key into raw.DATASETS selecting the raw dataset.
        root: directory where the raw data lives / is downloaded to.
        ngrams: n-gram order passed to ngrams_func.
        vocab: pre-built vocab; when None, one is built from the train data.
        tokenizer: text tokenizer; defaults to the "basic_english" tokenizer.
        data_select: subset name(s), each of "train"/"test"; a bare string
            is treated as a single-element selection.

    Returns:
        Tuple of TextClassificationDataset, one per entry in data_select.

    Raises:
        TypeError: if data_select contains an unsupported name, or if
            vocab is None and "train" is not selected.
    """
    if tokenizer is None:
        tokenizer = get_tokenizer("basic_english")
    text_transform = sequential_transforms(tokenizer, ngrams_func(ngrams))

    if isinstance(data_select, str):
        data_select = [data_select]
    # Set literal instead of set(tuple); same membership semantics.
    if not set(data_select).issubset({"train", "test"}):
        raise TypeError(
            "Given data selection {} is not supported!".format(data_select))

    train, test = raw.DATASETS[dataset_name](root=root)
    # Cache raw text iterable dataset. list(...) replaces the element-wise
    # copy comprehension — it produces the identical list of (label, txt)
    # pairs without re-packing each tuple.
    raw_data = {
        "train": list(train),
        "test": list(test),
    }

    if vocab is None:
        if "train" not in data_select:
            raise TypeError("Must pass a vocab if train is not selected.")
        vocab = build_vocab(raw_data["train"], text_transform)

    # Extend the text pipeline: tokens -> vocab ids -> long tensor.
    text_transform = sequential_transforms(text_transform, vocab_func(vocab),
                                           totensor(dtype=torch.long))

    # IMDB labels are the strings 'pos'/'neg'; map them to 1/0 first.
    if dataset_name == 'IMDB':
        label_transform = sequential_transforms(
            lambda x: 1 if x == 'pos' else 0, totensor(dtype=torch.long))
    else:
        label_transform = sequential_transforms(totensor(dtype=torch.long))

    return tuple(
        TextClassificationDataset(raw_data[item], vocab,
                                  (label_transform, text_transform))
        for item in data_select)