def test_drop_last_works(self):
    sampler = BucketBatchSampler(
        batch_size=2,
        padding_noise=0,
        sorting_keys=["text"],
        drop_last=True,
    )

    # We use a custom collate_fn for testing, which doesn't actually create tensors,
    # just the allennlp Batches.
    def collate_fn(x, **kwargs):
        return Batch(x)

    data_loader = MultiProcessDataLoader(
        self.get_mock_reader(),
        "fake_path",
        batch_sampler=sampler,
    )
    data_loader.collate_fn = collate_fn
    data_loader.index_with(self.vocab)
    batches = list(data_loader)
    stats = self.get_batches_stats(batches)

    # All batches have length batch_size.
    assert all(batch_len == 2 for batch_len in stats["batch_lengths"])

    # We should have lost one instance by skipping the last batch.
    assert stats["total_instances"] == len(self.instances) - 1
def read_and_check_instances(self, filepath: str, num_workers: int = 0):
    data_loader = MultiProcessDataLoader(
        self.reader, filepath, num_workers=num_workers, batch_size=1, start_method="spawn"
    )
    all_instances = []
    for instance in data_loader.iter_instances():
        all_instances.append(instance)

    # 100 files * 4 sentences per file.
    assert len(all_instances) == 100 * 4

    counts = Counter(fingerprint(instance) for instance in all_instances)

    # Should have the same four sentences, each repeated exactly 100 times.
    assert len(counts) == 4
    assert counts[("cats", "are", "animals", ".", "N", "V", "N", "N")] == 100
    assert counts[("dogs", "are", "animals", ".", "N", "V", "N", "N")] == 100
    assert counts[("snakes", "are", "animals", ".", "N", "V", "N", "N")] == 100
    assert counts[("birds", "are", "animals", ".", "N", "V", "N", "N")] == 100
def test_language_model_data_collator():
    """
    Ensure `LanguageModelingDataCollator` works.
    """
    norm_loader = MultiProcessDataLoader(MockDatasetReader(), "some path", batch_size=16)
    vocab = Vocabulary.from_instances(norm_loader.iter_instances())
    norm_loader.index_with(vocab)
    batch0 = list(norm_loader)[0]

    model_name = "epwalsh/bert-xsmall-dummy"
    data_collator = LanguageModelingDataCollator(model_name)
    mlm_loader = MultiProcessDataLoader(
        MockDatasetReader(), "some path", batch_size=16, collate_fn=data_collator
    )
    vocab = Vocabulary.from_instances(mlm_loader.iter_instances())
    mlm_loader.index_with(vocab)
    batch1 = list(mlm_loader)[0]

    norm_inputs = batch0["source"]["tokens"]["token_ids"]
    mlm_inputs = batch1["source"]["tokens"]["token_ids"]
    mlm_labels = batch1["source"]["tokens"]["labels"]

    # If we replace the masked positions in the MLM inputs with their labels,
    # we should recover the original inputs.
    assert torch.where(mlm_labels != -100, mlm_labels, mlm_inputs).tolist() == norm_inputs.tolist()
def build_data_loaders(
    config, dataset_reader: DatasetReader
) -> Tuple[MultiProcessDataLoader, MultiProcessDataLoader, MultiProcessDataLoader]:
    train_loader = MultiProcessDataLoader(
        dataset_reader, data_path='train', batch_size=config.batch_size_for_train, shuffle=False
    )
    dev_loader = MultiProcessDataLoader(
        dataset_reader, data_path='dev', batch_size=config.batch_size_for_eval, shuffle=False
    )
    test_loader = MultiProcessDataLoader(
        dataset_reader, data_path='test', batch_size=config.batch_size_for_eval, shuffle=False
    )
    return train_loader, dev_loader, test_loader
def test_batch_count(self):
    sampler = BucketBatchSampler(batch_size=2, padding_noise=0, sorting_keys=["text"])
    data_loader = MultiProcessDataLoader(
        self.get_mock_reader(), "fake_path", batch_sampler=sampler
    )
    data_loader.index_with(self.vocab)

    assert len(data_loader) == 3
def main():
    reader = StanfordSentimentTreeBankDatasetReader()
    train_path = 'https://s3.amazonaws.com/realworldnlpbook/data/stanfordSentimentTreebank/trees/train.txt'
    dev_path = 'https://s3.amazonaws.com/realworldnlpbook/data/stanfordSentimentTreebank/trees/dev.txt'

    sampler = BucketBatchSampler(batch_size=32, sorting_keys=["tokens"])
    train_data_loader = MultiProcessDataLoader(reader, train_path, batch_sampler=sampler)
    dev_data_loader = MultiProcessDataLoader(reader, dev_path, batch_sampler=sampler)

    # You can optionally specify the minimum count of tokens/labels.
    # `min_count={'tokens': 3}` here means that any tokens that appear fewer than three times
    # will be ignored and not included in the vocabulary.
    vocab = Vocabulary.from_instances(
        chain(train_data_loader.iter_instances(), dev_data_loader.iter_instances()),
        min_count={'tokens': 3})

    train_data_loader.index_with(vocab)
    dev_data_loader.index_with(vocab)

    token_embedding = Embedding(
        num_embeddings=vocab.get_vocab_size('tokens'),
        embedding_dim=EMBEDDING_DIM)

    # BasicTextFieldEmbedder takes a dict - we need an embedding just for tokens,
    # not for labels, which are used as-is as the "answer" of the sentence classification.
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

    # Seq2VecEncoder is a neural network abstraction that takes a sequence of something
    # (usually a sequence of embedded word vectors), processes it, and returns a single
    # vector. Oftentimes this is an RNN-based architecture (e.g., LSTM or GRU), but
    # AllenNLP also supports CNNs and other simple architectures (for example,
    # just averaging over the input vectors).
    encoder = PytorchSeq2VecWrapper(
        torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))

    model = LstmClassifier(word_embeddings, encoder, vocab)

    optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)

    trainer = GradientDescentTrainer(
        model=model,
        optimizer=optimizer,
        data_loader=train_data_loader,
        validation_data_loader=dev_data_loader,
        patience=10,
        num_epochs=20,
        cuda_device=-1)

    trainer.train()

    predictor = SentenceClassifierPredictor(model, dataset_reader=reader)
    logits = predictor.predict('This is the best movie ever!')['logits']
    label_id = np.argmax(logits)

    print(model.vocab.get_token_from_index(label_id, 'labels'))
def test_batches_per_epoch():
    loader = MultiProcessDataLoader(
        MockDatasetReader(), "some path", batch_size=4, batches_per_epoch=10
    )
    vocab = Vocabulary.from_instances(loader.iter_instances())
    loader.index_with(vocab)

    assert len(loader) == 10
    assert len(list(loader)) == 10
def test_load_to_cuda(options):
    reader = MockDatasetReader()
    loader = MultiProcessDataLoader(
        reader=reader,
        data_path="this doesn't matter",
        cuda_device=0,
        **options,
    )
    vocab = Vocabulary.from_instances(loader.iter_instances())
    loader.index_with(vocab)
    for batch in loader:
        assert batch["tensor"].device == torch.device("cuda:0")
def build_data_loaders(
    reader,
    train_data_path: str,
    validation_data_path: str,
) -> Tuple[DataLoader, DataLoader]:
    train_loader = MultiProcessDataLoader(reader, train_data_path, batch_size=8, shuffle=True)
    dev_loader = MultiProcessDataLoader(reader, validation_data_path, batch_size=8, shuffle=False)
    return train_loader, dev_loader
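# A minimal sketch of how these loaders could be wired into training, following
# the GradientDescentTrainer setup used in the example script elsewhere in this
# section. The `reader`, `model`, and file paths below are placeholder
# assumptions, not part of the original snippet.
from allennlp.data import Vocabulary
from allennlp.training import GradientDescentTrainer
import torch.optim as optim


def demo_train(reader, model):
    train_loader, dev_loader = build_data_loaders(reader, "train.tsv", "dev.tsv")

    # Both loaders must be indexed with the same vocabulary before iterating.
    vocab = Vocabulary.from_instances(train_loader.iter_instances())
    train_loader.index_with(vocab)
    dev_loader.index_with(vocab)

    trainer = GradientDescentTrainer(
        model=model,
        optimizer=optim.Adam(model.parameters()),
        data_loader=train_loader,
        validation_data_loader=dev_loader,
        num_epochs=5,
    )
    trainer.train()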
def test_batch_count(self):
    sampler = MaxTokensBatchSampler(max_tokens=8, padding_noise=0, sorting_keys=["text"])
    data_loader = MultiProcessDataLoader(
        self.get_mock_reader(), "fake_path", batch_sampler=sampler
    )

    assert len(data_loader) == 3
class TrainerTestBase(AllenNlpTestCase):
    def setup_method(self):
        super().setup_method()
        self.data_path = str(self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv")
        self.reader = SequenceTaggingDatasetReader()
        self.data_loader = MultiProcessDataLoader(self.reader, self.data_path, batch_size=2)
        self.data_loader_lazy = MultiProcessDataLoader(
            self.reader, self.data_path, batch_size=2, max_instances_in_memory=10
        )
        self.instances = list(self.data_loader.iter_instances())
        self.vocab = Vocabulary.from_instances(self.instances)
        self.data_loader.index_with(self.vocab)
        self.data_loader_lazy.index_with(self.vocab)
        self.model_params = Params(
            {
                "text_field_embedder": {
                    "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 5}}
                },
                "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2},
            }
        )
        self.model = SimpleTagger.from_params(vocab=self.vocab, params=self.model_params)
        self.optimizer = torch.optim.SGD(self.model.parameters(), 0.01, momentum=0.9)
        self.validation_data_loader = MultiProcessDataLoader(
            self.reader, self.data_path, batch_size=2
        )
        self.validation_data_loader.index_with(self.vocab)
def test_batch_count_with_drop_last(self):
    sampler = BucketBatchSampler(
        batch_size=2,
        padding_noise=0,
        sorting_keys=["text"],
        drop_last=True,
    )
    data_loader = MultiProcessDataLoader(
        self.get_mock_reader(), "fake_path", batch_sampler=sampler
    )

    assert len(data_loader) == 2
def test_with_multi_process_loading(self, lazy):
    readers = {"a": PlainTextReader(), "b": PlainTextReader(), "c": PlainTextReader()}
    reader = InterleavingDatasetReader(readers)
    data_dir = self.FIXTURES_ROOT / "data"
    file_path = {
        "a": data_dir / "babi.txt",
        "b": data_dir / "conll2003.txt",
        "c": data_dir / "conll2003.txt",
    }
    vocab = Vocabulary.from_instances(reader.read(file_path))
    loader = MultiProcessDataLoader(
        reader,
        file_path,
        num_workers=1,
        batch_size=1,
        max_instances_in_memory=2 if lazy else None,
    )
    loader.index_with(vocab)

    list(loader.iter_instances())
    list(loader)
def test_error_raised_when_text_fields_contain_token_indexers(max_instances_in_memory):
    """
    This tests that the MultiProcessDataLoader raises an error when num_workers > 0
    but the dataset reader doesn't implement apply_token_indexers().

    It also tests that errors raised within a worker process are propagated upwards
    to the main process, and that when that happens, all workers are successfully killed.
    """
    with pytest.raises(WorkerError, match="Make sure your dataset reader's text_to_instance()"):
        loader = MultiProcessDataLoader(
            MockOldDatasetReader(),
            "this-path-doesn't-matter",
            num_workers=2,
            max_instances_in_memory=max_instances_in_memory,
            batch_size=1,
        )
        list(loader.iter_instances())
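# A minimal sketch of the fix the error message above points to, assuming the
# AllenNLP 2.x reader API: create TextFields *without* token indexers in
# text_to_instance(), and attach the indexers in apply_token_indexers(), so that
# instances can be serialized across worker-process boundaries. MySafeReader
# and its fields are hypothetical.
from allennlp.data import DatasetReader, Instance
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import Token


class MySafeReader(DatasetReader):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self._token_indexers = {"tokens": SingleIdTokenIndexer()}

    def _read(self, file_path: str):
        with open(file_path) as data_file:
            for line in data_file:
                yield self.text_to_instance(line.strip())

    def text_to_instance(self, text: str) -> Instance:
        # No token_indexers passed here: they are attached later,
        # in the main process, via apply_token_indexers().
        tokens = [Token(t) for t in text.split()]
        return Instance({"text": TextField(tokens)})

    def apply_token_indexers(self, instance: Instance) -> None:
        instance.fields["text"].token_indexers = self._token_indexers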
def test_multiprocess_data_loader(options):
    reader = MockDatasetReader()
    data_path = "this doesn't matter"

    loader = MultiProcessDataLoader(reader=reader, data_path=data_path, **options)
    if not options.get("max_instances_in_memory"):
        # Instances should be loaded immediately if max_instances_in_memory is None.
        assert loader._instances

    instances: Iterable[Instance] = loader.iter_instances()

    # This should be a generator.
    assert not isinstance(instances, (list, tuple))

    instances = list(instances)
    assert len(instances) == MockDatasetReader.NUM_INSTANCES

    # Now build the vocab.
    vocab = Vocabulary.from_instances(instances)

    # Before indexing the loader, trying to iterate through batches will raise an error.
    with pytest.raises(ValueError, match="Did you forget to call DataLoader.index_with"):
        list(loader)

    loader.index_with(vocab)

    # Run through a couple of epochs to make sure we collect all of the instances.
    for epoch in range(2):
        indices: List[int] = []
        for batch in loader:
            for index in batch["index"]:
                indices.append(index)  # type: ignore
        # Ensure no duplicates.
        assert len(indices) == len(set(indices)), indices
        # Ensure all collected.
        assert len(indices) == MockDatasetReader.NUM_INSTANCES, epoch
def build_data_loader(data_reader: DatasetReader,
                      data_path: Path,
                      batch_size: int,
                      shuffle: bool = True) -> DataLoader:
    """
    Build an AllenNLP DataLoader.

    :param data_reader: The dataset reader used to load instances.
    :param data_path: Path to the data file to read.
    :param batch_size: Number of instances per batch.
    :param shuffle: Whether to shuffle the instances each epoch.
    :return loader: The data loader.
    """
    # Note that DataLoader is imported from allennlp above, *not* torch.
    # We need to get the allennlp-specific collate function, which is
    # what actually does indexing and batching.
    loader = MultiProcessDataLoader(reader=data_reader,
                                    data_path=data_path,
                                    batch_size=batch_size,
                                    shuffle=shuffle)
    return loader
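# A hypothetical call site for build_data_loader, reusing the
# SequenceTaggingDatasetReader seen elsewhere in this section; the file path is
# an assumption. Note that the loader still has to be indexed with a vocabulary
# before batches can be produced.
from pathlib import Path

from allennlp.data import Vocabulary
from allennlp.data.dataset_readers import SequenceTaggingDatasetReader


def demo_build_data_loader():
    reader = SequenceTaggingDatasetReader()
    loader = build_data_loader(reader, Path("data/sequence_tagging.tsv"), batch_size=2)
    vocab = Vocabulary.from_instances(loader.iter_instances())
    loader.index_with(vocab)
    return list(loader)  # each batch is a dict of indexed tensors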
def test_drop_last():
    """
    Ensures that the `drop_last` option is respected.
    """
    loader = MultiProcessDataLoader(MockDatasetReader(), "some path", batch_size=16, drop_last=True)
    vocab = Vocabulary.from_instances(loader.iter_instances())
    loader.index_with(vocab)

    # Should still load all instances. `drop_last` only affects batches.
    assert len(list(loader.iter_instances())) == MockDatasetReader.NUM_INSTANCES

    # Just here because the assertions below depend on the exact value of NUM_INSTANCES.
    assert MockDatasetReader.NUM_INSTANCES == 100

    # 100 instances with batch_size=16 gives 6 full batches; the 4 leftover
    # instances are dropped.
    batches = list(loader)
    for batch in batches:
        assert len(batch["index"]) == 16
    assert len(batches) == 6
def evaluate_transformers_checkpoint(
    data_path: str,
    model_config_path: str,
    checkpoint_model_name: str,
    checkpoint_tokenizer_name: str,
    batch_size: int,
    cuda_device: int,
    result_save_path: str,
):
    """
    Expected results for ``test.json`` from the Open Entity dataset:
    {'micro_precision': 0.7997806072235107, 'micro_recall': 0.7657563090324402,
     'micro_fscore': 0.7823987007141113}.

    Parameters
    ----------
    data_path : str
        Path to the input file.
    model_config_path : str
        A config file that defines the model architecture to evaluate.
    checkpoint_model_name : str
        The name of the checkpoint in the Hugging Face Model Hub.
    checkpoint_tokenizer_name : str
        This should be the name of the base pre-training model, because the
        tokenizer of the downstream task is sometimes not compatible with allennlp.
    batch_size : int
    cuda_device : int
    result_save_path : str
    """
    import_module_and_submodules("examples_allennlp")

    tokenizer_kwargs = {"additional_special_tokens": [ENT]}
    reader = EntityTypingReader(
        tokenizer=PretrainedTransformerTokenizer(
            model_name=checkpoint_tokenizer_name,
            add_special_tokens=True,
            tokenizer_kwargs=tokenizer_kwargs,
        ),
        token_indexers={
            "tokens": PretrainedTransformerIndexer(
                model_name=checkpoint_tokenizer_name, tokenizer_kwargs=tokenizer_kwargs
            )
        },
        use_entity_feature=True,
    )

    transformers_tokenizer = LukeTokenizer.from_pretrained(checkpoint_model_name)
    transformers_model = LukeForEntityClassification.from_pretrained(checkpoint_model_name)

    vocab = Vocabulary()
    vocab.add_transformer_vocab(transformers_tokenizer, "tokens")
    num_labels = len(transformers_model.config.id2label)
    labels = [transformers_model.config.id2label[i] for i in range(num_labels)]
    vocab.add_tokens_to_namespace(labels, namespace="labels")

    # Read the model.
    params = Params.from_file(
        model_config_path, ext_vars={"TRANSFORMERS_MODEL_NAME": checkpoint_model_name}
    )
    model = Model.from_params(params, vocab=vocab)
    model.classifier = transformers_model.classifier
    model.eval()

    # Set the GPU device to use.
    if cuda_device < 0:
        device = torch.device("cpu")
    else:
        device = torch.device(f"cuda:{cuda_device}")
    model = model.to(device)

    loader = MultiProcessDataLoader(reader, data_path, batch_size=batch_size, shuffle=False)
    loader.index_with(model.vocab)
    with torch.no_grad():
        for batch in tqdm.tqdm(loader):
            batch = nn_util.move_to_device(batch, device)
            output_dict = model(**batch)

    metrics = model.get_metrics(reset=True)
    print(metrics)
    if result_save_path is not None:
        with open(result_save_path, "w") as f:
            json.dump(metrics, f)
def evaluate_transformers_checkpoint(
    data_path: str,
    model_config_path: str,
    checkpoint_model_name: str,
    checkpoint_tokenizer_name: str,
    batch_size: int,
    cuda_device: int,
    result_save_path: str,
    prediction_save_path: str,
):
    """
    Expected results for the CoNLL-2003 NER English test set:
    {'f1': 0.9461946902654867, 'precision': 0.945859872611465,
     'recall': 0.9465297450424929}.

    Parameters
    ----------
    data_path : str
        Path to the input file.
    model_config_path : str
        A config file that defines the model architecture to evaluate.
    checkpoint_model_name : str
        The name of the checkpoint in the Hugging Face Model Hub.
    checkpoint_tokenizer_name : str
        This should be the name of the base pre-training model, because the
        tokenizer of the downstream task is sometimes not compatible with allennlp.
    batch_size : int
    cuda_device : int
    result_save_path : str
    prediction_save_path : str
    """
    import_module_and_submodules("examples_allennlp")

    reader = ConllSpanReader(
        tokenizer=PretrainedTransformerTokenizer(
            model_name=checkpoint_tokenizer_name,
            add_special_tokens=False,
            tokenizer_kwargs={"add_prefix_space": True},
        ),
        token_indexers={
            "tokens": PretrainedTransformerIndexer(model_name=checkpoint_tokenizer_name)
        },
        use_entity_feature=True,
    )

    transformers_tokenizer = LukeTokenizer.from_pretrained(checkpoint_model_name)
    transformers_model = LukeForEntitySpanClassification.from_pretrained(checkpoint_model_name)

    vocab = Vocabulary()
    vocab.add_transformer_vocab(transformers_tokenizer, "tokens")
    num_labels = len(transformers_model.config.id2label)
    labels = [transformers_model.config.id2label[i] for i in range(num_labels)]
    labels = ["O" if l == "NIL" else l for l in labels]
    vocab.add_tokens_to_namespace(labels, namespace="labels")

    # Read the model.
    params = Params.from_file(
        model_config_path, ext_vars={"TRANSFORMERS_MODEL_NAME": checkpoint_model_name}
    )
    if prediction_save_path is not None:
        params["prediction_save_path"] = prediction_save_path
    model = Model.from_params(params, vocab=vocab)
    model.classifier = transformers_model.classifier
    model.eval()

    # Set the GPU device to use.
    if cuda_device < 0:
        device = torch.device("cpu")
    else:
        device = torch.device(f"cuda:{cuda_device}")
    model = model.to(device)

    loader = MultiProcessDataLoader(reader, data_path, batch_size=batch_size, shuffle=False)
    loader.index_with(model.vocab)
    with torch.no_grad():
        for batch in tqdm.tqdm(loader):
            batch = nn_util.move_to_device(batch, device)
            output_dict = model(**batch)

    metrics = model.get_metrics(reset=True)
    print(metrics)
    if result_save_path is not None:
        with open(result_save_path, "w") as f:
            json.dump(metrics, f)