def test_restoring_works_with_older_checkpointing(self):
    trainer = Trainer(
        self.model,
        self.optimizer,
        self.data_loader,
        validation_data_loader=self.validation_data_loader,
        num_epochs=3,
        serialization_dir=self.TEST_DIR,
    )
    trainer.train()

    # Rewrite the saved training states into the older checkpoint format,
    # which stored a raw list of validation metrics instead of a metric tracker.
    for index in range(3):
        path = str(self.TEST_DIR / "training_state_epoch_{}.th".format(index))
        state = torch.load(path)
        state.pop("metric_tracker")
        state.pop("batch_num_total")
        state["val_metric_per_epoch"] = [0.4, 0.1, 0.8]
        torch.save(state, path)

    next_epoch = trainer._restore_checkpoint()
    best_epoch = trainer._metric_tracker.best_epoch

    # Loss decreases over the 3 epochs, but because we hard-coded the
    # validation metrics above (and lower is better), epoch 1 is best:
    assert next_epoch == 3
    assert best_epoch == 1
    assert trainer._metric_tracker._best_so_far == 0.1
    assert trainer._metric_tracker._epochs_with_no_improvement == 1

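# A small illustration (a hypothetical helper, not AllenNLP code) of how the
# legacy val_metric_per_epoch list above maps onto the restored tracker state,
# assuming the metric is minimized (the default validation metric is "-loss"):
def summarize_legacy_metrics(metrics):
    best_so_far = min(metrics)
    best_epoch = metrics.index(best_so_far)
    epochs_with_no_improvement = len(metrics) - 1 - best_epoch
    return best_epoch, best_so_far, epochs_with_no_improvement

assert summarize_legacy_metrics([0.4, 0.1, 0.8]) == (1, 0.1, 1)
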
def test_trainer_respects_keep_serialized_model_every_num_seconds(self):
    # To test:
    # Create an iterator that sleeps for 2.5 seconds per epoch, so the total
    # training time for one epoch is slightly greater than 2.5 seconds.
    # Run for 6 epochs, keeping the last 2 models and also keeping a model
    # every 5 seconds. The resulting checkpoints should then be the models
    # from epochs 2 and 4, plus the last two from epochs 5 and 6.
    class WaitingIterator(BasicIterator):
        def _create_batches(self, *args, **kwargs):
            time.sleep(2.5)
            return super()._create_batches(*args, **kwargs)

    iterator = WaitingIterator(batch_size=2)
    iterator.index_with(self.vocab)

    trainer = Trainer(
        self.model,
        self.optimizer,
        iterator,
        self.instances,
        num_epochs=6,
        serialization_dir=self.TEST_DIR,
        num_serialized_models_to_keep=2,
        keep_serialized_model_every_num_seconds=5,
    )
    trainer.train()

    # Now check the serialized files
    for prefix in ["model_state_epoch_*", "training_state_epoch_*"]:
        file_names = glob.glob(os.path.join(self.TEST_DIR, prefix))
        epochs = [int(re.search(r"_([0-9])\.th", fname).group(1)) for fname in file_names]
        # Epoch N is stored with N - 1 in the file name.
        assert sorted(epochs) == [1, 3, 4, 5]

def test_trainer_can_resume_training(self):
    trainer = Trainer(
        self.model,
        self.optimizer,
        self.data_loader,
        validation_data_loader=self.validation_data_loader,
        num_epochs=1,
        serialization_dir=self.TEST_DIR,
    )
    trainer.train()

    new_trainer = Trainer(
        self.model,
        self.optimizer,
        self.data_loader,
        validation_data_loader=self.validation_data_loader,
        num_epochs=3,
        serialization_dir=self.TEST_DIR,
    )
    epoch = new_trainer._restore_checkpoint()
    assert epoch == 1

    tracker = trainer._metric_tracker
    assert tracker.is_best_so_far()
    assert tracker._best_so_far is not None

    new_trainer.train()

def test_trainer_can_resume_training_for_exponential_moving_average(self):
    moving_average = ExponentialMovingAverage(self.model.named_parameters())
    trainer = Trainer(
        self.model,
        self.optimizer,
        self.iterator,
        self.instances,
        validation_dataset=self.instances,
        num_epochs=1,
        serialization_dir=self.TEST_DIR,
        moving_average=moving_average,
    )
    trainer.train()

    new_moving_average = ExponentialMovingAverage(self.model.named_parameters())
    new_trainer = Trainer(
        self.model,
        self.optimizer,
        self.iterator,
        self.instances,
        validation_dataset=self.instances,
        num_epochs=3,
        serialization_dir=self.TEST_DIR,
        moving_average=new_moving_average,
    )

    epoch = new_trainer._restore_checkpoint()  # pylint: disable=protected-access
    assert epoch == 1

    tracker = trainer._metric_tracker  # pylint: disable=protected-access
    assert tracker.is_best_so_far()
    assert tracker._best_so_far is not None  # pylint: disable=protected-access

    new_trainer.train()

def test_trainer_can_resume_training(self):
    trainer = Trainer(
        self.model,
        self.optimizer,
        self.iterator,
        self.instances,
        validation_dataset=self.instances,
        num_epochs=1,
        serialization_dir=self.TEST_DIR,
    )
    trainer.train()

    new_trainer = Trainer(
        self.model,
        self.optimizer,
        self.iterator,
        self.instances,
        validation_dataset=self.instances,
        num_epochs=3,
        serialization_dir=self.TEST_DIR,
    )

    epoch = new_trainer._restore_checkpoint()  # pylint: disable=protected-access
    assert epoch == 1

    tracker = trainer._metric_tracker  # pylint: disable=protected-access
    assert tracker.is_best_so_far()
    assert tracker._best_so_far is not None  # pylint: disable=protected-access

    new_trainer.train()

def test_trainer_can_resume_with_lr_scheduler(self):
    lr_scheduler = LearningRateScheduler.from_params(
        self.optimizer, Params({"type": "exponential", "gamma": 0.5})
    )
    trainer = Trainer(
        model=self.model,
        optimizer=self.optimizer,
        iterator=self.iterator,
        learning_rate_scheduler=lr_scheduler,
        train_dataset=self.instances,
        validation_dataset=self.instances,
        num_epochs=2,
        serialization_dir=self.TEST_DIR,
    )
    trainer.train()

    new_lr_scheduler = LearningRateScheduler.from_params(
        self.optimizer, Params({"type": "exponential", "gamma": 0.5})
    )
    new_trainer = Trainer(
        model=self.model,
        optimizer=self.optimizer,
        iterator=self.iterator,
        learning_rate_scheduler=new_lr_scheduler,
        train_dataset=self.instances,
        validation_dataset=self.instances,
        num_epochs=4,
        serialization_dir=self.TEST_DIR,
    )
    epoch = new_trainer._restore_checkpoint()
    assert epoch == 2
    assert new_trainer._learning_rate_scheduler.lr_scheduler.last_epoch == 1
    new_trainer.train()

def test_trainer_respects_keep_serialized_model_every_num_seconds(self):
    # To test:
    # Create a fake data loader that sleeps for 2.5 seconds per epoch, so the
    # total training time for one epoch is slightly greater than 2.5 seconds.
    # Run for 6 epochs, keeping the last 2 models and also keeping a model
    # every 5 seconds. The resulting checkpoints should then be the models
    # from epochs 2 and 4, plus the last two from epochs 5 and 6.
    class SlowDataLoader:
        data_loader = DataLoader(self.instances, batch_size=2, collate_fn=allennlp_collate)

        def __iter__(self):
            time.sleep(2.5)
            return iter(self.data_loader)

        def __len__(self):
            return len(self.data_loader)

    trainer = Trainer(
        self.model,
        self.optimizer,
        SlowDataLoader(),
        num_epochs=6,
        serialization_dir=self.TEST_DIR,
        num_serialized_models_to_keep=2,
        keep_serialized_model_every_num_seconds=5,
    )
    trainer.train()

    # Now check the serialized files
    for prefix in ["model_state_epoch_*", "training_state_epoch_*"]:
        file_names = glob.glob(os.path.join(self.TEST_DIR, prefix))
        epochs = [int(re.search(r"_([0-9])\.th", fname).group(1)) for fname in file_names]
        # Epoch N is stored with N - 1 in the file name.
        assert sorted(epochs) == [1, 3, 4, 5]

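# The retention rule that both test_trainer_respects_keep_serialized_model_every_num_seconds
# variants above exercise can be simulated in isolation. The sketch below is
# illustrative, not AllenNLP's Checkpointer implementation: it assumes each
# epoch finishes slightly more than 2.5 seconds after the previous one, and
# that an evicted checkpoint is kept permanently only when more than
# keep_serialized_model_every_num_seconds has elapsed since the last
# permanently kept one, with the clock starting at the beginning of training.
def surviving_epochs(num_epochs, num_to_keep, keep_every_seconds, epoch_duration=2.6):
    kept = []  # epochs kept permanently by the every-num-seconds rule
    window = []  # the most recent (epoch, save_time) checkpoints, up to num_to_keep
    last_permanent_time = 0.0  # training start
    for epoch in range(num_epochs):
        window.append((epoch, (epoch + 1) * epoch_duration))
        if len(window) > num_to_keep:
            oldest_epoch, oldest_time = window.pop(0)
            if oldest_time - last_permanent_time > keep_every_seconds:
                kept.append(oldest_epoch)
                last_permanent_time = oldest_time
    return sorted(kept + [epoch for epoch, _ in window])

# Matches the assertion in the tests above: epochs 2 and 4 survive the
# five-second rule, and epochs 5 and 6 are the last two kept (0-indexed files).
assert surviving_epochs(6, 2, 5) == [1, 3, 4, 5]
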
def test_trainer_can_run_and_resume_with_momentum_scheduler(self):
    scheduler = MomentumScheduler.from_params(
        optimizer=self.optimizer,
        params=Params({"type": "inverted_triangular", "cool_down": 2, "warm_up": 2}),
    )
    trainer = Trainer(
        model=self.model,
        optimizer=self.optimizer,
        data_loader=self.data_loader,
        momentum_scheduler=scheduler,
        validation_metric="-loss",
        validation_data_loader=self.validation_data_loader,
        num_epochs=4,
        serialization_dir=self.TEST_DIR,
    )
    trainer.train()

    new_scheduler = MomentumScheduler.from_params(
        optimizer=self.optimizer,
        params=Params({"type": "inverted_triangular", "cool_down": 2, "warm_up": 2}),
    )
    new_trainer = Trainer(
        model=self.model,
        optimizer=self.optimizer,
        data_loader=self.data_loader,
        momentum_scheduler=new_scheduler,
        validation_metric="-loss",
        validation_data_loader=self.validation_data_loader,
        num_epochs=6,
        serialization_dir=self.TEST_DIR,
    )
    epoch = new_trainer._restore_checkpoint()
    assert epoch == 4
    assert new_trainer._momentum_scheduler.last_epoch == 3
    new_trainer.train()

def test_trainer_can_resume_with_lr_scheduler(self):
    lr_scheduler = CosineWithRestarts(self.optimizer, t_initial=5)
    trainer = Trainer(
        model=self.model,
        optimizer=self.optimizer,
        data_loader=self.data_loader,
        learning_rate_scheduler=lr_scheduler,
        validation_data_loader=self.validation_data_loader,
        num_epochs=2,
        serialization_dir=self.TEST_DIR,
    )
    trainer.train()

    new_lr_scheduler = CosineWithRestarts(self.optimizer, t_initial=5)
    new_trainer = Trainer(
        model=self.model,
        optimizer=self.optimizer,
        data_loader=self.data_loader,
        learning_rate_scheduler=new_lr_scheduler,
        validation_data_loader=self.validation_data_loader,
        num_epochs=4,
        serialization_dir=self.TEST_DIR,
    )
    epoch = new_trainer._restore_checkpoint()
    assert epoch == 2
    assert new_trainer._learning_rate_scheduler.last_epoch == 1
    new_trainer.train()

def test_trainer_can_resume_training_for_exponential_moving_average(self):
    moving_average = ExponentialMovingAverage(self.model.named_parameters())
    trainer = Trainer(
        self.model,
        self.optimizer,
        self.data_loader,
        validation_data_loader=self.validation_data_loader,
        num_epochs=1,
        serialization_dir=self.TEST_DIR,
        moving_average=moving_average,
    )
    trainer.train()

    new_moving_average = ExponentialMovingAverage(self.model.named_parameters())
    new_trainer = Trainer(
        self.model,
        self.optimizer,
        self.data_loader,
        validation_data_loader=self.validation_data_loader,
        num_epochs=3,
        serialization_dir=self.TEST_DIR,
        moving_average=new_moving_average,
    )

    epoch = new_trainer._restore_checkpoint()
    assert epoch == 1

    tracker = trainer._metric_tracker
    assert tracker.is_best_so_far()
    assert tracker._best_so_far is not None

    new_trainer.train()

def test_trainer_can_resume_with_lr_scheduler(self):
    lr_scheduler = ExponentialLearningRateScheduler(self.optimizer, gamma=0.5)
    trainer = Trainer(
        model=self.model,
        optimizer=self.optimizer,
        iterator=self.iterator,
        learning_rate_scheduler=lr_scheduler,
        train_dataset=self.instances,
        validation_dataset=self.instances,
        num_epochs=2,
        serialization_dir=self.TEST_DIR,
    )
    trainer.train()

    new_lr_scheduler = ExponentialLearningRateScheduler(self.optimizer, gamma=0.5)
    new_trainer = Trainer(
        model=self.model,
        optimizer=self.optimizer,
        iterator=self.iterator,
        learning_rate_scheduler=new_lr_scheduler,
        train_dataset=self.instances,
        validation_dataset=self.instances,
        num_epochs=4,
        serialization_dir=self.TEST_DIR,
    )
    epoch = new_trainer._restore_checkpoint()
    assert epoch == 2
    assert new_trainer._learning_rate_scheduler.lr_scheduler.last_epoch == 1
    new_trainer.train()

def test_trainer_can_run_cuda(self):
    trainer = Trainer(
        self.model, self.optimizer, self.iterator, self.instances, num_epochs=2, cuda_device=0
    )
    trainer.train()

def test_trainer_raises_on_model_with_no_loss_key(self):
    class FakeModel(Model):
        def forward(self, **kwargs):  # pylint: disable=arguments-differ,unused-argument
            return {}

    with pytest.raises(RuntimeError):
        trainer = Trainer(
            FakeModel(None),
            self.optimizer,
            self.iterator,
            self.instances,
            num_epochs=2,
            serialization_dir=self.TEST_DIR,
        )
        trainer.train()

def test_trainer_can_log_histograms(self):
    # Enable activation logging.
    for module in self.model.modules():
        module.should_log_activations = True

    trainer = Trainer(
        self.model,
        self.optimizer,
        self.iterator,
        self.instances,
        num_epochs=3,
        serialization_dir=self.TEST_DIR,
        histogram_interval=2,
    )
    trainer.train()

def test_trainer_saves_models_at_specified_interval(self):
    iterator = BasicIterator(batch_size=4)
    iterator.index_with(self.vocab)

    trainer = Trainer(
        self.model,
        self.optimizer,
        iterator,
        self.instances,
        num_epochs=2,
        serialization_dir=self.TEST_DIR,
        model_save_interval=0.0001,
    )
    trainer.train()

    # Now check the serialized files for models saved during the epoch.
    prefix = "model_state_epoch_*"
    file_names = sorted(glob.glob(os.path.join(self.TEST_DIR, prefix)))
    epochs = [re.search(r"_([0-9\.\-]+)\.th", fname).group(1) for fname in file_names]
    # We should have checkpoints at the end of each epoch and during each, e.g.
    # [0.timestamp, 0, 1.timestamp, 1]
    assert len(epochs) == 4
    assert epochs[3] == "1"
    assert "." in epochs[0]

    # Now make certain we can restore from a timestamped checkpoint. To do so,
    # remove the end-of-epoch checkpoints for both epochs, so that we are
    # forced to restore from the timestamped checkpoints.
    for k in range(2):
        os.remove(os.path.join(self.TEST_DIR, "model_state_epoch_{}.th".format(k)))
        os.remove(os.path.join(self.TEST_DIR, "training_state_epoch_{}.th".format(k)))
    os.remove(os.path.join(self.TEST_DIR, "best.th"))

    restore_trainer = Trainer(
        self.model,
        self.optimizer,
        self.iterator,
        self.instances,
        num_epochs=2,
        serialization_dir=self.TEST_DIR,
        model_save_interval=0.0001,
    )
    epoch = restore_trainer._restore_checkpoint()
    assert epoch == 2
    # One batch per epoch.
    assert restore_trainer._batch_num_total == 2

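# The file names asserted above come in two shapes: end-of-epoch saves use a
# bare integer ("model_state_epoch_1.th"), while mid-epoch saves triggered by
# model_save_interval append a timestamp to the epoch number
# ("model_state_epoch_1.<timestamp>.th"). The helper below is a hypothetical
# illustration, not AllenNLP code, of how the test's regex separates the two;
# the exact timestamp string is an assumption made for the example.
import re

def parse_checkpoint_name(fname):
    epoch_id = re.search(r"_([0-9\.\-]+)\.th", fname).group(1)
    if "." in epoch_id:
        epoch, timestamp = epoch_id.split(".", 1)
        return int(epoch), timestamp  # mid-epoch save
    return int(epoch_id), None  # end-of-epoch save

assert parse_checkpoint_name("model_state_epoch_1.th") == (1, None)
assert parse_checkpoint_name("model_state_epoch_0.2020-02-01-12-00-00.th") == (
    0,
    "2020-02-01-12-00-00",
)
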
def test_trainer_can_run_exponential_moving_average(self):
    moving_average = ExponentialMovingAverage(self.model.named_parameters(), decay=0.9999)
    trainer = Trainer(
        model=self.model,
        optimizer=self.optimizer,
        iterator=self.iterator,
        train_dataset=self.instances,
        validation_dataset=self.instances,
        num_epochs=2,
        moving_average=moving_average,
    )
    trainer.train()

def test_trainer_can_log_learning_rates_tensorboard(self):
    iterator = BasicIterator(batch_size=4)
    iterator.index_with(self.vocab)

    trainer = Trainer(
        self.model,
        self.optimizer,
        iterator,
        self.instances,
        num_epochs=2,
        serialization_dir=self.TEST_DIR,
        should_log_learning_rate=True,
        summary_interval=2,
    )
    trainer.train()

def test_trainer_can_run_exponential_moving_average(self):
    moving_average = ExponentialMovingAverage(self.model.named_parameters(), decay=0.9999)
    trainer = Trainer(
        model=self.model,
        optimizer=self.optimizer,
        data_loader=self.data_loader,
        validation_data_loader=self.validation_data_loader,
        num_epochs=2,
        moving_average=moving_average,
    )
    trainer.train()

def test_trainer_can_run_with_lr_scheduler(self):
    lr_scheduler = ExponentialLearningRateScheduler(self.optimizer, gamma=0.5)
    trainer = Trainer(
        model=self.model,
        optimizer=self.optimizer,
        data_loader=self.data_loader,
        learning_rate_scheduler=lr_scheduler,
        validation_metric="-loss",
        validation_data_loader=self.validation_data_loader,
        num_epochs=2,
    )
    trainer.train()

def test_trainer_can_run_with_lr_scheduler(self):
    lr_params = Params({"type": "reduce_on_plateau"})
    lr_scheduler = LearningRateScheduler.from_params(self.optimizer, lr_params)
    trainer = Trainer(
        model=self.model,
        optimizer=self.optimizer,
        iterator=self.iterator,
        learning_rate_scheduler=lr_scheduler,
        validation_metric="-loss",
        train_dataset=self.instances,
        validation_dataset=self.instances,
        num_epochs=2,
    )
    trainer.train()

def test_trainer_respects_num_serialized_models_to_keep(self):
    trainer = Trainer(
        self.model,
        self.optimizer,
        self.iterator,
        self.instances,
        num_epochs=5,
        serialization_dir=self.TEST_DIR,
        num_serialized_models_to_keep=3,
    )
    trainer.train()

    # Now check the serialized files
    for prefix in ['model_state_epoch_*', 'training_state_epoch_*']:
        file_names = glob.glob(os.path.join(self.TEST_DIR, prefix))
        epochs = [int(re.search(r"_([0-9])\.th", fname).group(1)) for fname in file_names]
        assert sorted(epochs) == [2, 3, 4]

def test_trainer_can_log_learning_rates_tensorboard(self):
    data_loader = DataLoader(self.instances, batch_size=4, collate_fn=allennlp_collate)
    trainer = Trainer(
        self.model,
        self.optimizer,
        data_loader,
        num_epochs=2,
        serialization_dir=self.TEST_DIR,
        should_log_learning_rate=True,
        summary_interval=2,
    )
    trainer.train()

def test_trainer_can_run_with_lr_scheduler(self):
    lr_scheduler = ExponentialLearningRateScheduler(self.optimizer, gamma=0.5)
    trainer = Trainer(
        model=self.model,
        optimizer=self.optimizer,
        iterator=self.iterator,
        learning_rate_scheduler=lr_scheduler,
        validation_metric="-loss",
        train_dataset=self.instances,
        validation_dataset=self.instances,
        num_epochs=2,
    )
    trainer.train()

def test_production_rule_field_with_multiple_gpus(self):
    wikitables_dir = 'allennlp/tests/fixtures/data/wikitables/'
    wikitables_reader = WikiTablesDatasetReader(
        tables_directory=wikitables_dir,
        dpd_output_directory=wikitables_dir + 'dpd_output/',
    )
    instances = wikitables_reader.read(wikitables_dir + 'sample_data.examples')
    archive_path = (
        self.FIXTURES_ROOT / 'semantic_parsing' / 'wikitables' / 'serialization' / 'model.tar.gz'
    )
    model = load_archive(archive_path).model
    model.cuda()

    multigpu_iterator = BasicIterator(batch_size=4)
    multigpu_iterator.index_with(model.vocab)
    trainer = Trainer(
        model, self.optimizer, multigpu_iterator, instances, num_epochs=2, cuda_device=[0, 1]
    )
    trainer.train()

def test_trainer_raises_on_model_with_no_loss_key(self):
    class FakeModel(Model):
        def forward(self, **kwargs):
            return {}

    with pytest.raises(RuntimeError):
        trainer = Trainer(
            FakeModel(None),
            self.optimizer,
            self.data_loader,
            num_epochs=2,
            serialization_dir=self.TEST_DIR,
        )
        trainer.train()

def train():
    model.cuda(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        iterator=train_iterator,
        validation_iterator=validation_iterator,
        train_dataset=train_dataset,
        validation_dataset=validation_dataset,
        num_epochs=num_epochs,
        serialization_dir=serialization_dir,
        grad_clipping=grad_clipping,
        cuda_device=device,
    )
    trainer.train()

def test_multigpu_qanet(self):
    params = Params.from_file(self.param_file)
    vocab = Vocabulary.from_instances(self.instances)
    model = Model.from_params(vocab=vocab, params=params["model"]).cuda()
    # Optimize the freshly constructed model's parameters, not self.model's.
    optimizer = torch.optim.SGD(model.parameters(), 0.01, momentum=0.9)
    self.instances.index_with(model.vocab)
    loader = DataLoader(self.instances, batch_size=4)
    trainer = Trainer(model, optimizer, loader, num_epochs=2, cuda_device=[0, 1])
    trainer.train()

def test_multigpu_qanet(self):
    params = Params.from_file(self.param_file)
    vocab = Vocabulary.from_instances(self.instances)
    model = Model.from_params(vocab=vocab, params=params['model']).cuda()
    # Optimize the freshly constructed model's parameters, not self.model's.
    optimizer = torch.optim.SGD(model.parameters(), 0.01, momentum=0.9)
    multigpu_iterator = BasicIterator(batch_size=4)
    multigpu_iterator.index_with(model.vocab)
    trainer = Trainer(
        model, optimizer, multigpu_iterator, self.instances, num_epochs=2, cuda_device=[0, 1]
    )
    trainer.train()

def test_trainer_can_run_multiple_gpu(self):
    class MetaDataCheckWrapper(Model):
        """
        Checks that the metadata field has been correctly split across the batch dimension
        when running on multiple gpus.
        """

        def __init__(self, model):
            super().__init__(model.vocab)
            self.model = model

        def forward(self, **kwargs) -> Dict[str, torch.Tensor]:  # type: ignore # pylint: disable=arguments-differ
            assert 'metadata' in kwargs and 'tags' in kwargs, \
                f'tokens and metadata must be provided. Got {kwargs.keys()} instead.'
            batch_size = kwargs['tokens']['tokens'].size()[0]
            assert len(kwargs['metadata']) == batch_size, \
                f'metadata must be split appropriately. Expected {batch_size} elements, ' \
                f"got {len(kwargs['metadata'])} elements."
            return self.model.forward(**kwargs)

    multigpu_iterator = BasicIterator(batch_size=4)
    multigpu_iterator.index_with(self.vocab)
    trainer = Trainer(
        MetaDataCheckWrapper(self.model),
        self.optimizer,
        multigpu_iterator,
        self.instances,
        num_epochs=2,
        cuda_device=[0, 1],
    )
    metrics = trainer.train()
    assert 'peak_cpu_memory_MB' in metrics
    assert isinstance(metrics['peak_cpu_memory_MB'], float)
    assert metrics['peak_cpu_memory_MB'] > 0
    assert 'peak_gpu_0_memory_MB' in metrics
    assert isinstance(metrics['peak_gpu_0_memory_MB'], int)
    assert 'peak_gpu_1_memory_MB' in metrics
    assert isinstance(metrics['peak_gpu_1_memory_MB'], int)

def test_trainer_saves_metrics_every_epoch(self):
    trainer = Trainer(
        model=self.model,
        optimizer=self.optimizer,
        iterator=self.iterator,
        train_dataset=self.instances,
        validation_dataset=self.instances,
        num_epochs=5,
        serialization_dir=self.TEST_DIR,
        num_serialized_models_to_keep=3,
    )
    trainer.train()

    for epoch in range(5):
        epoch_file = self.TEST_DIR / f'metrics_epoch_{epoch}.json'
        assert epoch_file.exists()
        metrics = json.load(open(epoch_file))
        assert "validation_loss" in metrics
        assert "best_validation_loss" in metrics
        assert metrics.get("epoch") == epoch