def test_caching_with_lazy_reader_in_multi_process_loader(self):
    data_file = (
        AllenNlpTestCase.FIXTURES_ROOT
        / "data"
        / "text_classification_json"
        / "imdb_corpus.jsonl"
    )
    reader = TextClassificationJsonReader(lazy=True, cache_directory=self.cache_directory)
    deque(
        PyTorchDataLoader(reader.read(data_file), collate_fn=lambda b: b[0], num_workers=2),
        maxlen=0,
    )

    # We shouldn't write to the cache when the data is being loaded from multiple
    # processes.
    cache_file = reader._get_cache_location_for_file_path(str(data_file))
    assert not os.path.exists(cache_file)

    # But try again from the main process and we should see the cache file.
    instances = list(reader.read(data_file))
    assert instances
    assert os.path.exists(cache_file)

    # Reading again from a multi-process loader should read from the cache.
    new_instances = list(
        PyTorchDataLoader(reader.read(data_file), collate_fn=lambda b: b[0], num_workers=2)
    )
    assert len(instances) == len(new_instances)
def compute_influence_values(
    self, training_loader: PyTorchDataLoader, validation_loader: PyTorchDataLoader
):
    # Re-wrap both datasets so each loader yields one instance at a time.
    training_loader = PyTorchDataLoader(training_loader.dataset, batch_size=1, shuffle=False)
    validation_loader = PyTorchDataLoader(validation_loader.dataset, batch_size=1, shuffle=False)

    influence_values = []
    validation_idx = []
    for batch in tqdm(iter(validation_loader)):
        assert len(batch["metadata"]) == 1, breakpoint()
        influence_values.append([])
        # Tuple of per-parameter tensors: H^-1 . Grad(L(z_test))
        ihvp = self.ihvp(batch, training_loader)
        validation_idx.append(batch["metadata"][0]["idx"])

        training_idx = []
        for train_ex in tqdm(iter(training_loader)):
            assert len(train_ex["metadata"]) == 1, breakpoint()
            train_grad = self.get_grad(train_ex)
            if_value = sum(
                (x * y).sum().item() for x, y in zip(ihvp, train_grad)
            ) / len(training_loader)
            influence_values[-1].append(if_value)
            training_idx.append(train_ex["metadata"][0]["idx"])

    return np.array(influence_values), training_idx, validation_idx
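# For reference, compute_influence_values above follows the influence-function
# formulation of Koh & Liang (2017), where the influence of a training point z on a
# test point z_test is
#     I(z, z_test) = - grad_theta L(z_test)^T  H^{-1}  grad_theta L(z),
# with H the Hessian of the training loss. The code takes the dot product of the
# inverse-Hessian-vector product (ihvp) with each training gradient and averages over
# the training set; note that it omits the leading minus sign of the textbook formula.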
def ihvp(self, test_example, training_loader):
    self._predictor._model.zero_grad()
    v = self.get_grad(test_example)
    if not self._use_hessian:
        return tuple(x.detach() for x in v)

    ihv_estimate = v
    training_loader = PyTorchDataLoader(training_loader.dataset, batch_size=5, shuffle=True)
    training_iter = iter(training_loader)
    for _ in tqdm(range(len(training_loader))):
        train_batch = next(training_iter)
        self._predictor._model.zero_grad()
        loss = self.get_outputs_for_batch(train_batch)
        hv = vhp_s(loss, self._valid_parameters, ihv_estimate)
        with torch.no_grad():
            ihv_estimate = tuple(
                _v + (1 - self._damping) * _ihv - _hv / self._scale
                for _v, _ihv, _hv in zip(v, ihv_estimate, hv)
            )

    return tuple(x.detach() for x in ihv_estimate)
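# The vhp_s helper called above is not defined in these snippets. Below is a minimal
# sketch of a Hessian-vector product via double backpropagation; the name and signature
# mirror the call site, but this is an assumed implementation, not the original one.
# It assumes `loss` is a scalar tensor and `params` / `vectors` are matching tuples of
# parameter tensors with requires_grad=True.
import torch


def vhp_s(loss, params, vectors):
    # First-order gradients, keeping the graph so we can differentiate them again.
    grads = torch.autograd.grad(loss, params, create_graph=True)
    # Differentiating <grads, vectors> w.r.t. params yields H @ vectors.
    dot = sum((g * v).sum() for g, v in zip(grads, vectors))
    return tuple(h.detach() for h in torch.autograd.grad(dot, params))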
def test_multi_processing_with_lazy_dataset_warns():
    def fake_instance_generator(file_name: str) -> Iterable[Instance]:
        yield from []

    with pytest.warns(UserWarning, match=r".*deadlocks.*"):
        PyTorchDataLoader(
            AllennlpLazyDataset(fake_instance_generator, "nonexistent_file"), num_workers=1
        )
def test_batch_of_entirely_empty_lists_works(self):
    dataset = AllennlpDataset([self.empty_instance, self.empty_instance], self.vocab)

    model = DummyModel(self.vocab)
    model.eval()
    loader = PyTorchDataLoader(dataset, batch_size=2)
    batch = next(iter(loader))
    model.forward(**batch)
def test_max_instances_with_multi_process_loader(self, num_workers):
    data_file = (
        AllenNlpTestCase.FIXTURES_ROOT
        / "data"
        / "text_classification_json"
        / "imdb_corpus.jsonl"
    )
    reader = TextClassificationJsonReader(max_instances=2, lazy=True)
    instances = list(
        PyTorchDataLoader(
            reader.read(data_file), collate_fn=lambda b: b[0], num_workers=num_workers
        )
    )
    assert len(instances) == 2
def main():
    args = get_args()

    # Read config.
    file_dict = json.loads(evaluate_file(args.training_config))
    model_dict = file_dict["model"]
    if args.use_bert:
        bert_name = model_dict["embedder"]["token_embedders"]["bert"]["model_name"]
    else:
        bert_name = None

    # Hack to replace components that we're setting in the script.
    for name in ["type", "embedder", "initializer", "module_initializer"]:
        del model_dict[name]

    # Create indexer.
    if args.use_bert:
        tok_indexers = {
            "bert": token_indexers.PretrainedTransformerMismatchedIndexer(
                bert_name, max_length=512
            )
        }
    else:
        tok_indexers = {"tokens": token_indexers.SingleIdTokenIndexer()}

    # Read input data.
    reader = DyGIEReader(
        max_span_width=8, token_indexers=tok_indexers, max_instances=args.max_instances
    )
    data = reader.read(file_dict["train_data_path"])
    vocab = vocabulary.Vocabulary.from_instances(data)
    data.index_with(vocab)

    # Create embedder.
    if args.use_bert:
        token_embedder = token_embedders.PretrainedTransformerMismatchedEmbedder(
            bert_name, max_length=512
        )
        embedder = text_field_embedders.BasicTextFieldEmbedder({"bert": token_embedder})
    else:
        token_embedder = token_embedders.Embedding(
            num_embeddings=vocab.get_vocab_size("tokens"), embedding_dim=100
        )
        embedder = text_field_embedders.BasicTextFieldEmbedder({"tokens": token_embedder})

    # Create iterator and model.
    iterator = PyTorchDataLoader(batch_size=1, dataset=data)
    if args.model_archive is None:
        model = dygie.DyGIE(vocab=vocab, embedder=embedder, **model_dict)
    else:
        model = dygie.DyGIE.from_archive(args.model_archive)

    # Run forward pass over a single entry.
    for batch in iterator:
        output_dict = model(**batch)
def run(args):
    predictor: Predictor = get_predictor(args)

    training_file = args.training_file
    validation_file = args.validation_file

    training_data = read_data(predictor._dataset_reader, training_file)
    validation_data = read_data(predictor._dataset_reader, validation_file)

    print("Indexing with Vocabulary")
    training_data.index_with(predictor._model.vocab)
    validation_data.index_with(predictor._model.vocab)

    training_loader = PyTorchDataLoader(
        training_data, batch_size=args.training_batch_size, shuffle=False
    )
    validation_loader = PyTorchDataLoader(
        validation_data, batch_size=args.validation_batch_size, shuffle=False
    )

    print("Computing Influence Values")
    if args.run_all:
        influencers = get_influencer_iterable(predictor, args)
    else:
        influencers = [get_influencer(predictor, args)]

    for influencer in influencers:
        influence_values, training_idx, validation_idx = influencer.compute_influence_values(
            training_loader, validation_loader
        )

        output_folder = args.output_folder
        output_subfolder = influencer.get_output_subfolder().strip()
        if len(output_subfolder) > 0:
            output_folder = os.path.join(output_folder, output_subfolder)

        print(f"Dumping stuff to {output_folder}")
        dump_results(influence_values, training_idx, validation_idx, output_folder)

    print("Job done. Rejoice !")
def test_batch_count(self):
    dataset = AllennlpDataset(self.instances, vocab=self.vocab)
    sampler = BucketBatchSampler(dataset, batch_size=2, padding_noise=0, sorting_keys=["text"])
    # We use a custom collate_fn for testing, which doesn't actually create tensors,
    # just the allennlp Batches.
    dataloader = PyTorchDataLoader(dataset, batch_sampler=sampler, collate_fn=lambda x: Batch(x))

    assert len(dataloader) == 3
def test_elmo_bilm(self):
    # get the raw data
    sentences, expected_lm_embeddings = self._load_sentences_embeddings()

    # load the test model
    elmo_bilm = _ElmoBiLm(self.options_file, self.weight_file)

    # Deal with the data.
    indexer = ELMoTokenCharactersIndexer()

    # For each sentence, first create a TextField, then create an instance
    instances = []
    for batch in zip(*sentences):
        for sentence in batch:
            tokens = [Token(token) for token in sentence.split()]
            field = TextField(tokens, {"character_ids": indexer})
            instance = Instance({"elmo": field})
            instances.append(instance)

    vocab = Vocabulary()
    dataset = AllennlpDataset(instances, vocab)

    # Now finally we can iterate through batches.
    loader = PyTorchDataLoader(dataset, 3)
    for i, batch in enumerate(loader):
        lm_embeddings = elmo_bilm(batch["elmo"]["character_ids"]["elmo_tokens"])
        top_layer_embeddings, mask = remove_sentence_boundaries(
            lm_embeddings["activations"][2], lm_embeddings["mask"]
        )

        # check the mask lengths
        lengths = mask.data.numpy().sum(axis=1)
        batch_sentences = [sentences[k][i] for k in range(3)]
        expected_lengths = [len(sentence.split()) for sentence in batch_sentences]
        assert lengths.tolist() == expected_lengths

        # get the expected embeddings and compare!
        expected_top_layer = [expected_lm_embeddings[k][i] for k in range(3)]
        for k in range(3):
            assert numpy.allclose(
                top_layer_embeddings[k, :lengths[k], :].data.numpy(),
                expected_top_layer[k],
                atol=1.0e-6,
            )
def test_trainer_respects_epoch_size_smaller_than_total(self):
    batches_per_epoch = 1
    num_epochs = 2
    data_loader_smaller_epoch = PyTorchDataLoader(
        self.instances,
        batch_size=2,
        collate_fn=allennlp_collate,
        batches_per_epoch=batches_per_epoch,
    )
    trainer = GradientDescentTrainer(
        self.model,
        self.optimizer,
        data_loader_smaller_epoch,
        validation_data_loader=self.validation_data_loader,
        num_epochs=num_epochs,
        serialization_dir=self.TEST_DIR,
    )
    assert trainer._batch_num_total == 0
    metrics = trainer.train()
    epoch = metrics["epoch"]
    assert epoch == num_epochs - 1
    assert trainer._batch_num_total == num_epochs * batches_per_epoch
def test_data_loader_lazy_epoch_size_correct_custom_epoch_size(self):
    batches_per_epoch = 3
    num_epochs = 3
    data_loader_custom_epoch_lazy = PyTorchDataLoader(
        self.instances_lazy,
        batch_size=2,
        collate_fn=allennlp_collate,
        batches_per_epoch=batches_per_epoch,
    )
    trainer = GradientDescentTrainer(
        self.model,
        self.optimizer,
        data_loader_custom_epoch_lazy,
        validation_data_loader=self.validation_data_loader,
        num_epochs=num_epochs,
        serialization_dir=self.TEST_DIR,
    )
    assert trainer._batch_num_total == 0
    metrics = trainer.train()
    epoch = metrics["epoch"]
    assert epoch == num_epochs - 1
    assert trainer._batch_num_total == num_epochs * batches_per_epoch
def test_total_loss_is_average_of_batch_loss(self):
    batches_per_epoch = 3

    data_loader_custom_epoch_lazy = PyTorchDataLoader(
        self.instances_lazy,
        batch_size=2,
        collate_fn=allennlp_collate,
        batches_per_epoch=batches_per_epoch,
    )

    class FakeBatchCallback(BatchCallback):
        def __call__(
            self,
            trainer: "GradientDescentTrainer",
            batch_inputs: List[List[TensorDict]],
            batch_outputs: List[Dict[str, Any]],
            batch_metrics: Dict[str, Any],
            epoch: int,
            batch_number: int,
            is_training: bool,
            is_master: bool,
        ) -> None:
            if not hasattr(trainer, "batch_losses"):
                trainer.batch_losses = []  # type: ignore
            trainer.batch_losses.append(batch_outputs[0]["loss"].item())  # type: ignore

    trainer = GradientDescentTrainer(
        self.model,
        self.optimizer,
        data_loader_custom_epoch_lazy,
        num_epochs=1,
        batch_callbacks=[FakeBatchCallback()],
    )
    metrics = trainer.train()

    assert metrics["training_loss"] == float(sum(trainer.batch_losses) / batches_per_epoch)
def test_drop_last_works(self):
    dataset = AllennlpDataset(self.instances, vocab=self.vocab)
    sampler = BucketBatchSampler(
        dataset,
        batch_size=2,
        padding_noise=0,
        sorting_keys=["text"],
        drop_last=True,
    )
    # We use a custom collate_fn for testing, which doesn't actually create tensors,
    # just the allennlp Batches.
    dataloader = PyTorchDataLoader(dataset, batch_sampler=sampler, collate_fn=lambda x: Batch(x))
    batches = [batch for batch in iter(dataloader)]
    stats = self.get_batches_stats(batches)

    # all batches have length batch_size
    assert all(batch_len == 2 for batch_len in stats["batch_lengths"])

    # we should have lost one instance by skipping the last batch
    assert stats["total_instances"] == len(self.instances) - 1
def test_loader_uses_all_instances_when_batches_per_epochs_set(lazy):
    NUM_INSTANCES = 20
    BATCH_SIZE = 2
    BATCHES_PER_EPOCH = 3
    EPOCHS = 4

    class FakeDatasetReader(DatasetReader):
        def _read(self, filename: str) -> Iterable[Instance]:
            for i in range(NUM_INSTANCES):
                yield Instance({"index": LabelField(i, skip_indexing=True)})

    reader = FakeDatasetReader(lazy=lazy)
    dataset = reader.read("blah")

    loader = PyTorchDataLoader(dataset, batch_size=BATCH_SIZE, batches_per_epoch=BATCHES_PER_EPOCH)
    epoch_batches = []
    for epoch in range(EPOCHS):
        batches = []
        for batch in loader:
            instances = []
            for index in batch["index"]:
                instances.append(index)
            batches.append(instances)
        epoch_batches.append(batches)

    assert epoch_batches == [
        # Epoch 0.
        [[0, 1], [2, 3], [4, 5]],
        # Epoch 1.
        [[6, 7], [8, 9], [10, 11]],
        # Epoch 2.
        [[12, 13], [14, 15], [16, 17]],
        # Epoch 3.
        [[18, 19], [0, 1], [2, 3]],
    ]
def main():
    args = get_args()

    # Read config.
    conf_dict = json.loads(evaluate_file(args.training_config))
    model_dict = conf_dict["model"]
    if args.use_bert:
        bert_name = model_dict["embedder"]["token_embedders"]["bert"]["model_name"]
        bert_max_length = model_dict["embedder"]["token_embedders"]["bert"]["max_length"]
    else:
        bert_name = None

    # Hack to replace components that we're setting in the script.
    for name in ["type", "embedder", "initializer", "module_initializer"]:
        del model_dict[name]

    # Create indexer.
    if args.use_bert:
        tok_indexers = {
            "bert": token_indexers.PretrainedTransformerMismatchedIndexer(
                bert_name, max_length=bert_max_length
            )
        }
    else:
        tok_indexers = {"tokens": token_indexers.SingleIdTokenIndexer()}

    # Read input data.
    reader_dict = conf_dict["dataset_reader"]
    reader = DyGIEReader(
        reader_dict["max_span_width"],
        max_trigger_span_width=reader_dict["max_trigger_span_width"],
        token_indexers=tok_indexers,
        max_instances=500,
    )
    # token_indexers=tok_indexers, max_instances=args.max_instances)
    data = reader.read(conf_dict["train_data_path"])
    vocab = vocabulary.Vocabulary.from_instances(data)
    data.index_with(vocab)

    # Create embedder.
    if args.use_bert:
        token_embedder = token_embedders.PretrainedTransformerMismatchedEmbedder(
            bert_name, max_length=bert_max_length
        )
        embedder = text_field_embedders.BasicTextFieldEmbedder({"bert": token_embedder})
    else:
        token_embedder = token_embedders.Embedding(
            num_embeddings=vocab.get_vocab_size("tokens"), embedding_dim=100
        )
        embedder = text_field_embedders.BasicTextFieldEmbedder({"tokens": token_embedder})

    # Create context layer: if not passthrough always use lstm when testing
    if model_dict["context_layer"]["type"] != "pass_through":
        del model_dict["context_layer"]["type"]
        model_dict["context_layer"]["input_size"] = embedder.get_output_dim()
        context_layer = seq2seq_encoders.LstmSeq2SeqEncoder(**model_dict["context_layer"])
    else:
        context_layer = seq2seq_encoders.PassThroughEncoder(embedder.get_output_dim())
    del model_dict["context_layer"]

    # Create iterator and model.
    iterator = PyTorchDataLoader(batch_size=1, dataset=data)
    if args.model_archive is None:
        model = dygie.DyGIE(
            vocab=vocab, embedder=embedder, context_layer=context_layer, **model_dict
        )
    else:
        model = dygie.DyGIE.from_archive(args.model_archive)

    # Run forward pass over a single entry.
    for batch in iterator:
        output_dict = model(**batch)
        print(output_dict)
model = TK(word_embedder, n_kernels=11, n_layers=2, n_tf_dim=300, n_tf_heads=10)

# todo optimizer, loss

print('Model', config["model"], 'total parameters:',
      sum(p.numel() for p in model.parameters() if p.requires_grad))
print('Network:', model)

#
# train
#

_triple_reader = IrTripleDatasetReader(lazy=True, max_doc_length=180, max_query_length=30)
_triple_reader = _triple_reader.read(config["train_data"])
_triple_reader.index_with(vocab)
loader = PyTorchDataLoader(_triple_reader, batch_size=32)

for epoch in range(2):
    for batch in Tqdm.tqdm(loader):
        # todo train loop
        pass

#
# eval (duplicate for validation inside train loop - but rename "loader", since
# otherwise it will overwrite the original train iterator, which is instantiated outside the loop)
#

_tuple_reader = IrLabeledTupleDatasetReader(lazy=True, max_doc_length=180, max_query_length=30)
_tuple_reader = _tuple_reader.read(config["test_data"])
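# One way the "todo train loop" above could be filled in for a pairwise ranking model.
# This is a sketch only: the batch field names ("query_tokens", "doc_pos_tokens",
# "doc_neg_tokens"), the TK forward signature, and the hyperparameters are assumptions,
# not part of the original template.
import torch

optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
criterion = torch.nn.MarginRankingLoss(margin=1.0)

for epoch in range(2):
    for batch in Tqdm.tqdm(loader):
        optimizer.zero_grad()
        scores_pos = model(batch["query_tokens"], batch["doc_pos_tokens"])
        scores_neg = model(batch["query_tokens"], batch["doc_neg_tokens"])
        # The positive document should be scored higher than the negative one.
        target = torch.ones_like(scores_pos)
        loss = criterion(scores_pos, scores_neg, target)
        loss.backward()
        optimizer.step()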
import tempfile

import torch
import torch.nn as nn
from torch.autograd import Variable

from allennlp.data import Vocabulary
from allennlp.data.dataloader import PyTorchDataLoader
from allennlp.modules.attention import DotProductAttention
from allennlp.modules import Embedding
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.seq2seq_encoders import RnnSeq2SeqEncoder, PytorchSeq2SeqWrapper
from allennlp.training.trainer import GradientDescentTrainer, Trainer
from allennlp.training.optimizers import AdamOptimizer

# CopyNetDatasetReader and CopyNetSeq2Seq are provided by the allennlp-models package
# (allennlp_models.generation in the 1.x releases) and are assumed to be importable here.

reader = CopyNetDatasetReader(target_namespace="trg")
train_dataset = reader.read('data/train.tsv')
train_loader = PyTorchDataLoader(train_dataset, batch_size=8, shuffle=True)
vocab = Vocabulary.from_instances(train_dataset)

EMBEDDING_DIM = 128
HIDDEN_DIM = 256
TARGET_EMBEDDING_DIM = 512

token_embedding = Embedding(
    embedding_dim=EMBEDDING_DIM, num_embeddings=vocab.get_vocab_size(namespace="tokens")
)
word_embedding = BasicTextFieldEmbedder({"token": token_embedding})
bi_rnn_encoder = RnnSeq2SeqEncoder(EMBEDDING_DIM, HIDDEN_DIM, 2, bidirectional=True)
dot_attn = DotProductAttention()

model = CopyNetSeq2Seq(
    vocab,
    word_embedding,
    bi_rnn_encoder,
    dot_attn,
    target_namespace="trg",
    target_embedding_dim=TARGET_EMBEDDING_DIM,
)

with tempfile.TemporaryDirectory() as serialization_dir:
    parameters = [
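# The snippet above is cut off mid-statement. The lines below are not its continuation
# but a separate minimal sketch of wiring an AllenNLP 1.x GradientDescentTrainer to a
# PyTorchDataLoader, reusing the `model`, `train_dataset`, `train_loader`, and `vocab`
# defined above; the hyperparameters are placeholders.
train_dataset.index_with(vocab)  # instances must be indexed before they can be tensorized

with tempfile.TemporaryDirectory() as serialization_dir:
    parameters = [(n, p) for n, p in model.named_parameters() if p.requires_grad]
    optimizer = AdamOptimizer(parameters, lr=1e-3)
    trainer = GradientDescentTrainer(
        model=model,
        optimizer=optimizer,
        data_loader=train_loader,
        num_epochs=5,
        serialization_dir=serialization_dir,
    )
    trainer.train()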