def test_trainer_can_run_exponential_moving_average(self):
    """Training should run end-to-end with an ExponentialMovingAverage attached."""
    ema = ExponentialMovingAverage(self.model.named_parameters(), decay=0.9999)
    trainer = Trainer(
        model=self.model,
        optimizer=self.optimizer,
        data_loader=self.data_loader,
        validation_data_loader=self.validation_data_loader,
        moving_average=ema,
        num_epochs=2,
    )
    trainer.train()
def test_trainer_can_run_exponential_moving_average(self):
    """Training (iterator API) should run end-to-end with an EMA attached."""
    ema = ExponentialMovingAverage(self.model.named_parameters(), decay=0.9999)
    trainer = Trainer(
        model=self.model,
        optimizer=self.optimizer,
        iterator=self.iterator,
        train_dataset=self.instances,
        validation_dataset=self.instances,
        moving_average=ema,
        num_epochs=2,
    )
    trainer.train()
def test_trainer_can_run_amp(self):
    """Mixed-precision (apex opt level O1) training should run on the GPU."""
    self.model.cuda()
    amp_trainer = Trainer(
        self.model,
        self.optimizer,
        self.data_loader,
        num_epochs=2,
        cuda_device=0,
        opt_level="O1",
    )
    amp_trainer.train()
def test_passing_trainer_multiple_gpus_raises_error(self):
    """Passing a list of cuda devices to the Trainer should be rejected."""
    self.model.cuda()
    with pytest.raises(ConfigurationError):
        Trainer(
            self.model,
            self.optimizer,
            self.data_loader,
            cuda_device=[0, 1],
            num_epochs=2,
        )
def test_trainer_can_run_cuda(self):
    """A single-GPU run should report peak CPU and GPU memory metrics."""
    self.model.cuda()
    gpu_trainer = Trainer(
        self.model, self.optimizer, self.iterator, self.instances,
        num_epochs=2, cuda_device=0,
    )
    metrics = gpu_trainer.train()

    assert "peak_cpu_memory_MB" in metrics
    cpu_peak = metrics["peak_cpu_memory_MB"]
    assert isinstance(cpu_peak, float)
    assert cpu_peak > 0
    # GPU memory is reported in whole MB, hence int rather than float.
    assert "peak_gpu_0_memory_MB" in metrics
    assert isinstance(metrics["peak_gpu_0_memory_MB"], int)
def test_trainer_can_log_learning_rates_tensorboard(self):
    """Enabling learning-rate logging to tensorboard should not break training."""
    batch_iterator = BasicIterator(batch_size=4)
    batch_iterator.index_with(self.vocab)
    trainer = Trainer(
        self.model,
        self.optimizer,
        batch_iterator,
        self.instances,
        num_epochs=2,
        serialization_dir=self.TEST_DIR,
        summary_interval=2,
        should_log_learning_rate=True,
    )
    trainer.train()
def test_trainer_saves_and_loads_best_validation_metrics_correctly_2(self):
    """Checkpointing should persist the best-epoch validation metrics and restore
    them unchanged after resuming training from the serialization directory."""
    # Use +loss (treat higher loss as better) and run 1 epoch of original-training,
    # and one of restored-training; with +loss the first epoch remains the best.
    # Run 1 epoch of original training.
    trainer = Trainer(
        self.model,
        self.optimizer,
        self.iterator,
        self.instances,
        validation_dataset=self.instances,
        validation_metric="+loss",
        num_epochs=1,
        serialization_dir=self.TEST_DIR,
    )
    trainer.train()
    _ = trainer._restore_checkpoint()
    best_epoch_1 = trainer._metric_tracker.best_epoch
    best_validation_metrics_epoch_1 = trainer._metric_tracker.best_epoch_metrics
    # best_validation_metrics_epoch_1: {'accuracy': 0.75, 'accuracy3': 1.0, 'loss': 0.6243013441562653}
    assert isinstance(best_validation_metrics_epoch_1, dict)
    assert "loss" in best_validation_metrics_epoch_1

    # Run 1 more epoch of restored training.
    restore_trainer = Trainer(
        self.model,
        self.optimizer,
        self.iterator,
        self.instances,
        validation_dataset=self.instances,
        validation_metric="+loss",
        num_epochs=2,
        serialization_dir=self.TEST_DIR,
    )
    restore_trainer.train()
    _ = restore_trainer._restore_checkpoint()
    best_epoch_2 = restore_trainer._metric_tracker.best_epoch
    best_validation_metrics_epoch_2 = restore_trainer._metric_tracker.best_epoch_metrics
    # Because of using +loss, 2nd epoch won't be better than 1st. So best val metrics should be same.
    assert best_epoch_1 == best_epoch_2 == 0
    assert best_validation_metrics_epoch_2 == best_validation_metrics_epoch_1
def test_trainer_can_run(self):
    """End-to-end training should report well-typed best-validation metrics for
    both increasing (+) and decreasing (-) validation metrics."""

    def check_best_metrics(metrics):
        # Every best_validation_* metric is a float; the best epoch is an int.
        for key in ("best_validation_loss", "best_validation_accuracy",
                    "best_validation_accuracy3"):
            assert key in metrics
            assert isinstance(metrics[key], float)
        assert "best_epoch" in metrics
        assert isinstance(metrics["best_epoch"], int)

    trainer = Trainer(
        model=self.model,
        optimizer=self.optimizer,
        iterator=self.iterator,
        train_dataset=self.instances,
        validation_dataset=self.instances,
        num_epochs=2,
    )
    check_best_metrics(trainer.train())

    # Making sure that both increasing and decreasing validation metrics work.
    trainer = Trainer(
        model=self.model,
        optimizer=self.optimizer,
        iterator=self.iterator,
        train_dataset=self.instances,
        validation_dataset=self.instances,
        validation_metric="+loss",
        num_epochs=2,
    )
    metrics = trainer.train()
    check_best_metrics(metrics)
    assert "peak_cpu_memory_MB" in metrics
    assert isinstance(metrics["peak_cpu_memory_MB"], float)
    assert metrics["peak_cpu_memory_MB"] > 0
def test_trainer_can_run_with_lr_scheduler(self):
    """Training should run when an exponential LR scheduler is attached."""
    scheduler = ExponentialLearningRateScheduler(self.optimizer, gamma=0.5)
    trainer = Trainer(
        model=self.model,
        optimizer=self.optimizer,
        data_loader=self.data_loader,
        validation_data_loader=self.validation_data_loader,
        learning_rate_scheduler=scheduler,
        validation_metric="-loss",
        num_epochs=2,
    )
    trainer.train()
def test_trainer_can_run_with_lr_scheduler(self):
    """Training should run with a reduce-on-plateau LR scheduler built from Params."""
    scheduler = LearningRateScheduler.from_params(
        self.optimizer, Params({"type": "reduce_on_plateau"})
    )
    trainer = Trainer(
        model=self.model,
        optimizer=self.optimizer,
        iterator=self.iterator,
        train_dataset=self.instances,
        validation_dataset=self.instances,
        learning_rate_scheduler=scheduler,
        validation_metric="-loss",
        num_epochs=2,
    )
    trainer.train()
def test_restored_training_returns_best_epoch_metrics_even_if_no_better_epoch_is_found_after_restoring(
    self,
):
    """With +loss the second epoch is considered worse, so the restored run must
    still report the first epoch's best metrics."""
    # Instead of -loss, use +loss to assure 2nd epoch is considered worse.

    def build_trainer(num_epochs):
        # Both runs share the serialization dir so the second restores the first.
        return Trainer(
            self.model,
            self.optimizer,
            self.iterator,
            self.instances,
            validation_dataset=self.instances,
            validation_metric="+loss",
            num_epochs=num_epochs,
            serialization_dir=self.TEST_DIR,
        )

    # Run 1 epoch of original training, then 1 epoch of restored training.
    training_metrics = build_trainer(1).train()
    restored_metrics = build_trainer(2).train()

    for key in ("best_validation_loss", "best_validation_accuracy",
                "best_validation_accuracy3", "best_epoch"):
        assert key in restored_metrics

    # Epoch 2 validation loss should be lesser than that of Epoch 1
    assert training_metrics["best_validation_loss"] == restored_metrics["best_validation_loss"]
    assert training_metrics["best_epoch"] == 0
    assert training_metrics["validation_loss"] > restored_metrics["validation_loss"]
def test_trainer_can_log_histograms(self):
    """Histogram (activation) logging should not break training."""
    # enable activation logging
    for module in self.model.modules():
        module.should_log_activations = True

    histogram_trainer = Trainer(
        self.model,
        self.optimizer,
        self.iterator,
        self.instances,
        num_epochs=3,
        serialization_dir=self.TEST_DIR,
        histogram_interval=2,
    )
    histogram_trainer.train()
def test_should_stop_early_with_flat_lining_metric(self):
    # pylint: disable=protected-access
    """A metric that never changes should trigger early stopping, whether the
    validation metric is increasing (+) or decreasing (-)."""
    flat_metrics = [.2] * 6
    for direction in ("+test", "-test"):
        tracker = Trainer(
            self.model,
            self.optimizer,
            self.iterator,
            self.instances,
            validation_dataset=self.instances,
            num_epochs=3,
            serialization_dir=self.TEST_DIR,
            patience=5,
            validation_metric=direction,
        )._metric_tracker
        tracker.add_metrics(flat_metrics)
        assert tracker.should_stop_early
def test_trainer_raises_on_model_with_no_loss_key(self):
    """Training a model whose forward() omits the 'loss' key should raise."""
    class FakeModel(Model):
        def forward(self, **kwargs):  # pylint: disable=arguments-differ,unused-argument
            return {}

    with pytest.raises(RuntimeError):
        bad_trainer = Trainer(
            FakeModel(None),
            self.optimizer,
            self.iterator,
            self.instances,
            num_epochs=2,
            serialization_dir=self.TEST_DIR,
        )
        bad_trainer.train()
def test_trainer_can_run_with_lr_scheduler(self):
    """Training (iterator API) should run with an exponential LR scheduler."""
    scheduler = ExponentialLearningRateScheduler(self.optimizer, gamma=0.5)
    trainer = Trainer(
        model=self.model,
        optimizer=self.optimizer,
        iterator=self.iterator,
        train_dataset=self.instances,
        validation_dataset=self.instances,
        learning_rate_scheduler=scheduler,
        validation_metric="-loss",
        num_epochs=2,
    )
    trainer.train()
def test_trainer_can_log_learning_rates_tensorboard(self):
    """Enabling learning-rate logging to tensorboard should not break training."""
    loader = DataLoader(self.instances, batch_size=4, collate_fn=allennlp_collate)
    trainer = Trainer(
        self.model,
        self.optimizer,
        loader,
        num_epochs=2,
        serialization_dir=self.TEST_DIR,
        summary_interval=2,
        should_log_learning_rate=True,
    )
    trainer.train()
def test_trainer_can_run_and_resume_with_momentum_scheduler(self):
    """A momentum scheduler's state should survive checkpoint and restore."""

    def make_scheduler():
        # A fresh inverted-triangular scheduler with identical settings each time.
        return MomentumScheduler.from_params(
            self.optimizer,
            Params({"type": "inverted_triangular", "cool_down": 2, "warm_up": 2}),
        )

    trainer = Trainer(
        model=self.model,
        optimizer=self.optimizer,
        iterator=self.iterator,
        momentum_scheduler=make_scheduler(),
        validation_metric="-loss",
        train_dataset=self.instances,
        validation_dataset=self.instances,
        num_epochs=4,
        serialization_dir=self.TEST_DIR,
    )
    trainer.train()

    new_trainer = Trainer(
        model=self.model,
        optimizer=self.optimizer,
        iterator=self.iterator,
        momentum_scheduler=make_scheduler(),
        validation_metric="-loss",
        train_dataset=self.instances,
        validation_dataset=self.instances,
        num_epochs=6,
        serialization_dir=self.TEST_DIR,
    )
    epoch = new_trainer._restore_checkpoint()
    assert epoch == 4
    assert new_trainer._momentum_scheduler.last_epoch == 3
    new_trainer.train()
def test_trainer_respects_num_serialized_models_to_keep(self):
    """Only the most recent `num_serialized_models_to_keep` checkpoints survive
    training: with 5 epochs and keep=3, epochs 2-4 should remain on disk."""
    trainer = Trainer(self.model, self.optimizer, self.iterator, self.instances,
                      num_epochs=5,
                      serialization_dir=self.TEST_DIR,
                      num_serialized_models_to_keep=3)
    trainer.train()

    # Now check the serialized files
    for prefix in ['model_state_epoch_*', 'training_state_epoch_*']:
        file_names = glob.glob(os.path.join(self.TEST_DIR, prefix))
        # [0-9]+ (rather than a single digit) so this keeps working if
        # num_epochs is ever raised past 10.
        epochs = [int(re.search(r"_([0-9]+)\.th", fname).group(1))
                  for fname in file_names]
        assert sorted(epochs) == [2, 3, 4]
def test_production_rule_field_with_multiple_gpus(self):
    """A WikiTables model containing ProductionRuleFields should train on two GPUs."""
    fixtures_dir = 'allennlp/tests/fixtures/data/wikitables/'
    reader = WikiTablesDatasetReader(tables_directory=fixtures_dir,
                                     dpd_output_directory=fixtures_dir + 'dpd_output/')
    instances = reader.read(fixtures_dir + 'sample_data.examples')

    archive_path = self.FIXTURES_ROOT / 'semantic_parsing' / 'wikitables' / 'serialization' / 'model.tar.gz'
    model = load_archive(archive_path).model
    model.cuda()

    iterator = BasicIterator(batch_size=4)
    iterator.index_with(model.vocab)

    trainer = Trainer(model, self.optimizer, iterator, instances,
                      num_epochs=2, cuda_device=[0, 1])
    trainer.train()
def test_trainer_saves_models_at_specified_interval(self):
    """With model_save_interval set, mid-epoch timestamped checkpoints are written
    in addition to the end-of-epoch ones, and training can restore from them."""
    batch_iterator = BasicIterator(batch_size=4)
    batch_iterator.index_with(self.vocab)
    trainer = Trainer(self.model, self.optimizer, batch_iterator, self.instances,
                      num_epochs=2,
                      serialization_dir=self.TEST_DIR,
                      model_save_interval=0.0001)
    trainer.train()

    # Now check the serialized files for models saved during the epoch.
    pattern = 'model_state_epoch_*'
    saved = sorted(glob.glob(os.path.join(self.TEST_DIR, pattern)))
    epochs = [re.search(r"_([0-9\.\-]+)\.th", fname).group(1) for fname in saved]
    # We should have checkpoints at the end of each epoch and during each, e.g.
    # [0.timestamp, 0, 1.timestamp, 1]
    assert len(epochs) == 4
    assert epochs[3] == '1'
    assert '.' in epochs[0]

    # Now make certain we can restore from timestamped checkpoint.
    # To do so, remove the checkpoint from the end of epoch 1&2, so
    # that we are forced to restore from the timestamped checkpoints.
    for k in range(2):
        os.remove(os.path.join(self.TEST_DIR, 'model_state_epoch_{}.th'.format(k)))
        os.remove(os.path.join(self.TEST_DIR, 'training_state_epoch_{}.th'.format(k)))
    os.remove(os.path.join(self.TEST_DIR, 'best.th'))

    restore_trainer = Trainer(self.model, self.optimizer, self.iterator, self.instances,
                              num_epochs=2,
                              serialization_dir=self.TEST_DIR,
                              model_save_interval=0.0001)
    epoch = restore_trainer._restore_checkpoint()  # pylint: disable=protected-access
    assert epoch == 2
    # One batch per epoch.
    assert restore_trainer._batch_num_total == 2  # pylint: disable=protected-access
def test_trainer_raises_on_model_with_no_loss_key(self):
    """Training a model whose forward() returns no 'loss' entry should raise."""
    class FakeModel(Model):
        def forward(self, **kwargs):
            return {}

    with pytest.raises(RuntimeError):
        Trainer(
            FakeModel(None),
            self.optimizer,
            self.data_loader,
            num_epochs=2,
            serialization_dir=self.TEST_DIR,
        ).train()
def test_should_stop_early_with_invalid_patience(self):
    """Zero, negative, fractional, and string patience values must all be rejected."""
    expected_message = ('.* is an invalid value for "patience": '
                        'it must be a positive integer or None '
                        '\\(if you want to disable early stopping\\)')
    for bad_patience in (0, -1, -2, 1.5, 'None'):
        with pytest.raises(ConfigurationError, match=expected_message):
            Trainer(self.model, self.optimizer, self.iterator, self.instances,
                    validation_dataset=self.instances,
                    num_epochs=100,
                    patience=bad_patience,
                    validation_metric="+test")
def test_should_stop_early_with_invalid_patience(self):
    """Zero, negative, fractional, and string patience values must all be rejected.

    Note: the `message=` keyword of `pytest.raises` was deprecated in pytest 4.x
    and removed in 5.0 (it raises TypeError there), so the plain form is used;
    pytest's own failure output already names the missing exception.
    """
    for patience in [0, -1, -2, 1.5, 'None']:
        with pytest.raises(ConfigurationError):
            Trainer(self.model, self.optimizer, self.iterator, self.instances,
                    validation_dataset=self.instances,
                    num_epochs=100,
                    patience=patience,
                    validation_metric="+test")
def test_passing_trainer_multiple_gpus_raises_error(self):
    """A list of cuda devices should be rejected by the Trainer constructor."""
    self.model.cuda()
    batch_iterator = BasicIterator(batch_size=4)
    batch_iterator.index_with(self.vocab)
    with pytest.raises(ConfigurationError):
        Trainer(
            self.model,
            self.optimizer,
            batch_iterator,
            self.instances,
            cuda_device=[0, 1],
            num_epochs=2,
        )
def test_trainer_can_run(self):
    """End-to-end training should report well-typed best-validation metrics for
    both increasing (+) and decreasing (-) validation metrics."""

    def assert_metric_types(metrics):
        # Every best_validation_* metric is a float; the best epoch is an int.
        for name in ('best_validation_loss', 'best_validation_accuracy',
                     'best_validation_accuracy3'):
            assert name in metrics
            assert isinstance(metrics[name], float)
        assert 'best_epoch' in metrics
        assert isinstance(metrics['best_epoch'], int)

    trainer = Trainer(model=self.model,
                      optimizer=self.optimizer,
                      iterator=self.iterator,
                      train_dataset=self.instances,
                      validation_dataset=self.instances,
                      num_epochs=2)
    assert_metric_types(trainer.train())

    # Making sure that both increasing and decreasing validation metrics work.
    trainer = Trainer(model=self.model,
                      optimizer=self.optimizer,
                      iterator=self.iterator,
                      train_dataset=self.instances,
                      validation_dataset=self.instances,
                      validation_metric='+loss',
                      num_epochs=2)
    metrics = trainer.train()
    assert_metric_types(metrics)
    assert 'peak_cpu_memory_MB' in metrics
    assert isinstance(metrics['peak_cpu_memory_MB'], float)
    assert metrics['peak_cpu_memory_MB'] > 0
def test_trainer_can_resume_with_lr_scheduler(self):
    """LR-scheduler state should be restored from checkpoint when resuming."""

    def exponential_scheduler():
        # A fresh scheduler with identical settings for each training run.
        return LearningRateScheduler.from_params(
            self.optimizer, Params({"type": "exponential", "gamma": 0.5})
        )

    trainer = Trainer(
        model=self.model,
        optimizer=self.optimizer,
        iterator=self.iterator,
        learning_rate_scheduler=exponential_scheduler(),
        train_dataset=self.instances,
        validation_dataset=self.instances,
        num_epochs=2,
        serialization_dir=self.TEST_DIR,
    )
    trainer.train()

    new_trainer = Trainer(
        model=self.model,
        optimizer=self.optimizer,
        iterator=self.iterator,
        learning_rate_scheduler=exponential_scheduler(),
        train_dataset=self.instances,
        validation_dataset=self.instances,
        num_epochs=4,
        serialization_dir=self.TEST_DIR,
    )
    epoch = new_trainer._restore_checkpoint()
    assert epoch == 2
    assert new_trainer._learning_rate_scheduler.lr_scheduler.last_epoch == 1
    new_trainer.train()
def test_trainer_can_resume_training_for_exponential_moving_average(self):
    """An attached EMA should not prevent checkpoint restore and resumed training."""
    first_trainer = Trainer(
        self.model,
        self.optimizer,
        self.iterator,
        self.instances,
        validation_dataset=self.instances,
        num_epochs=1,
        serialization_dir=self.TEST_DIR,
        moving_average=ExponentialMovingAverage(self.model.named_parameters()),
    )
    first_trainer.train()

    second_trainer = Trainer(
        self.model,
        self.optimizer,
        self.iterator,
        self.instances,
        validation_dataset=self.instances,
        num_epochs=3,
        serialization_dir=self.TEST_DIR,
        moving_average=ExponentialMovingAverage(self.model.named_parameters()),
    )
    epoch = second_trainer._restore_checkpoint()
    assert epoch == 1

    # The first run's tracker should already hold a best metric.
    tracker = first_trainer._metric_tracker
    assert tracker.is_best_so_far()
    assert tracker._best_so_far is not None

    second_trainer.train()
def train():
    """Move the model to the configured device, build an Adam optimizer, and run
    a full training loop with the module-level configuration."""
    model.cuda(device)
    adam = torch.optim.Adam(model.parameters(), lr=learning_rate)
    trainer = Trainer(
        model=model,
        optimizer=adam,
        iterator=train_iterator,
        validation_iterator=validation_iterator,
        train_dataset=train_dataset,
        validation_dataset=validation_dataset,
        num_epochs=num_epochs,
        serialization_dir=serialization_dir,
        grad_clipping=grad_clipping,
        cuda_device=device,
    )
    trainer.train()
def test_multigpu_qanet(self):
    """QANet built from the fixture params should train across two GPUs."""
    params = Params.from_file(self.param_file)
    vocab = Vocabulary.from_instances(self.instances)
    model = Model.from_params(vocab=vocab, params=params["model"]).cuda()
    # Bug fix: the optimizer must hold the parameters of the freshly built
    # `model` that the Trainer trains — previously it used `self.model`, so
    # optimizer steps never updated the trained model's weights.
    optimizer = torch.optim.SGD(model.parameters(), 0.01, momentum=0.9)
    self.instances.index_with(model.vocab)
    loader = DataLoader(self.instances, batch_size=4)
    trainer = Trainer(model, optimizer, loader, num_epochs=2, cuda_device=[0, 1])
    trainer.train()
def test_multigpu_qanet(self):
    """QANet built from the fixture params should train across two GPUs
    (iterator API)."""
    params = Params.from_file(self.param_file)
    vocab = Vocabulary.from_instances(self.instances)
    model = Model.from_params(vocab=vocab, params=params['model']).cuda()
    # Bug fix: the optimizer must hold the parameters of the freshly built
    # `model` that the Trainer trains — previously it used `self.model`, so
    # optimizer steps never updated the trained model's weights.
    optimizer = torch.optim.SGD(model.parameters(), 0.01, momentum=0.9)
    multigpu_iterator = BasicIterator(batch_size=4)
    multigpu_iterator.index_with(model.vocab)
    trainer = Trainer(model, optimizer, multigpu_iterator, self.instances,
                      num_epochs=2, cuda_device=[0, 1])
    trainer.train()