def test_trainer_saves_and_loads_best_validation_metrics_correctly_1(self):
    # Use -loss and run 1 epoch of original training, then 1 epoch of restored training.

    # Run 1 epoch of original training.
    trainer = CallbackTrainer(self.model,
                              training_data=self.instances,
                              iterator=self.iterator,
                              optimizer=self.optimizer,
                              callbacks=self.default_callbacks(),
                              num_epochs=1,
                              serialization_dir=self.TEST_DIR)
    trainer.train()

    # Firing TRAINING_START restores the trainer's state from the serialized checkpoint.
    _ = trainer.handler.fire_event(Events.TRAINING_START)
    best_epoch_1 = trainer.metric_tracker.best_epoch
    best_validation_metrics_epoch_1 = trainer.metric_tracker.best_epoch_metrics

    # best_validation_metrics_epoch_1: {'accuracy': 0.75, 'accuracy3': 1.0, 'loss': 0.6243013441562653}
    assert isinstance(best_validation_metrics_epoch_1, dict)
    assert "loss" in best_validation_metrics_epoch_1

    # Run 1 more epoch of restored training.
    restore_trainer = CallbackTrainer(self.model,
                                      training_data=self.instances,
                                      iterator=self.iterator,
                                      optimizer=self.optimizer,
                                      callbacks=self.default_callbacks(),
                                      num_epochs=2,
                                      serialization_dir=self.TEST_DIR)
    restore_trainer.train()
    _ = restore_trainer.handler.fire_event(Events.TRAINING_START)
    best_epoch_2 = restore_trainer.metric_tracker.best_epoch
    best_validation_metrics_epoch_2 = restore_trainer.metric_tracker.best_epoch_metrics

    # Because the metric is -loss, the 2nd epoch should be better than the 1st,
    # so the best validation metrics should differ between the two runs.
    assert best_epoch_1 == 0 and best_epoch_2 == 1
    assert best_validation_metrics_epoch_2 != best_validation_metrics_epoch_1
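# For readability of the tests in this file: a hypothetical sketch of the
# `default_callbacks` helper they all call, which is defined elsewhere in the
# suite. The parameter list is reconstructed from the call sites below; the
# callback set and constructor signatures are assumptions, not the suite's
# verbatim fixture, and `batch_size` is accepted only for call-site compatibility.
def default_callbacks(self,
                      validation_metric: str = "-loss",
                      patience: int = None,
                      max_checkpoints: int = 20,
                      checkpoint_every: int = None,
                      model_save_interval: float = None,
                      serialization_dir: str = "__DEFAULT__",
                      validation_iterator: DataIterator = None,
                      batch_size: int = 2):
    if serialization_dir == "__DEFAULT__":
        serialization_dir = self.TEST_DIR
    checkpointer = Checkpointer(serialization_dir, checkpoint_every, max_checkpoints)
    tensorboard = TensorboardWriter(get_batch_num_total=lambda: None)
    return [LogToTensorboard(tensorboard),
            Checkpoint(checkpointer, model_save_interval),
            Validate(validation_data=self.instances,
                     validation_iterator=validation_iterator or self.iterator),
            TrackMetrics(patience, validation_metric)]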
def test_trainer_can_run_with_lr_scheduler(self):
    lr_params = Params({"type": "reduce_on_plateau"})
    lr_scheduler = LearningRateScheduler.from_params(self.optimizer, lr_params)
    callbacks = self.default_callbacks() + [UpdateLearningRate(lr_scheduler)]

    trainer = CallbackTrainer(model=self.model,
                              training_data=self.instances,
                              iterator=self.iterator,
                              optimizer=self.optimizer,
                              callbacks=callbacks,
                              num_epochs=2)
    trainer.train()
def test_validation_metrics_consistent_with_and_without_tracking(self):
    default_callbacks = self.default_callbacks(serialization_dir=None)
    default_callbacks_without_tracking = [
            callback for callback in default_callbacks
            if not isinstance(callback, TrackMetrics)
    ]
    trainer1 = CallbackTrainer(copy.deepcopy(self.model),
                               training_data=self.instances,
                               iterator=self.iterator,
                               optimizer=copy.deepcopy(self.optimizer),
                               callbacks=default_callbacks_without_tracking,
                               num_epochs=1,
                               serialization_dir=None)
    trainer1.train()

    trainer2 = CallbackTrainer(copy.deepcopy(self.model),
                               training_data=self.instances,
                               iterator=self.iterator,
                               optimizer=copy.deepcopy(self.optimizer),
                               callbacks=default_callbacks,
                               num_epochs=1,
                               serialization_dir=None)
    trainer2.train()

    metrics1 = trainer1.val_metrics
    metrics2 = trainer2.val_metrics
    assert metrics1.keys() == metrics2.keys()
    for key in ['accuracy', 'accuracy3', 'loss']:
        np.testing.assert_almost_equal(metrics1[key], metrics2[key])
def test_trainer_can_run_and_resume_with_momentum_scheduler(self):
    scheduler = MomentumScheduler.from_params(
            self.optimizer, Params({"type": "inverted_triangular", "cool_down": 2, "warm_up": 2}))
    callbacks = self.default_callbacks() + [UpdateMomentum(scheduler)]
    trainer = CallbackTrainer(model=self.model,
                              training_data=self.instances,
                              iterator=self.iterator,
                              optimizer=self.optimizer,
                              num_epochs=4,
                              callbacks=callbacks,
                              serialization_dir=self.TEST_DIR)
    trainer.train()

    new_scheduler = MomentumScheduler.from_params(
            self.optimizer, Params({"type": "inverted_triangular", "cool_down": 2, "warm_up": 2}))
    new_callbacks = self.default_callbacks() + [UpdateMomentum(new_scheduler)]
    new_trainer = CallbackTrainer(model=self.model,
                                  training_data=self.instances,
                                  iterator=self.iterator,
                                  optimizer=self.optimizer,
                                  num_epochs=6,
                                  callbacks=new_callbacks,
                                  serialization_dir=self.TEST_DIR)
    new_trainer.handler.fire_event(Events.TRAINING_START)
    # Four epochs have completed; the scheduler's last_epoch is zero-based.
    assert new_trainer.epoch_number == 4
    assert new_scheduler.last_epoch == 3
    new_trainer.train()
def test_trainer_raises_on_model_with_no_loss_key(self):
    class FakeModel(Model):
        def forward(self, **kwargs):  # pylint: disable=arguments-differ,unused-argument
            return {}

    with pytest.raises(RuntimeError):
        trainer = CallbackTrainer(FakeModel(None),
                                  training_data=self.instances,
                                  iterator=self.iterator,
                                  optimizer=self.optimizer,
                                  callbacks=self.default_callbacks(),
                                  num_epochs=2,
                                  serialization_dir=self.TEST_DIR)
        trainer.train()
def test_trainer_can_resume_training_for_exponential_moving_average(self):
    moving_average = ExponentialMovingAverage(self.model.named_parameters())
    callbacks = self.default_callbacks() + [MovingAverageCallback(moving_average)]

    trainer = CallbackTrainer(self.model,
                              self.optimizer,
                              num_epochs=1,
                              serialization_dir=self.TEST_DIR,
                              callbacks=callbacks)
    trainer.train()

    new_moving_average = ExponentialMovingAverage(self.model.named_parameters())
    new_callbacks = self.default_callbacks() + [MovingAverageCallback(new_moving_average)]

    new_trainer = CallbackTrainer(self.model,
                                  self.optimizer,
                                  num_epochs=3,
                                  serialization_dir=self.TEST_DIR,
                                  callbacks=new_callbacks)
    new_trainer.handler.fire_event(Events.RESTORE_CHECKPOINT)
    assert new_trainer.epoch_number == 1

    tracker = trainer.metric_tracker
    assert tracker.is_best_so_far()
    assert tracker._best_so_far is not None  # pylint: disable=protected-access

    new_trainer.train()
def test_trainer_can_resume_with_lr_scheduler(self):
    lr_scheduler = LearningRateScheduler.from_params(
            self.optimizer, Params({"type": "exponential", "gamma": 0.5}))
    callbacks = self.default_callbacks() + [LrsCallback(lr_scheduler)]
    trainer = CallbackTrainer(model=self.model,
                              optimizer=self.optimizer,
                              callbacks=callbacks,
                              num_epochs=2,
                              serialization_dir=self.TEST_DIR)
    trainer.train()

    new_lr_scheduler = LearningRateScheduler.from_params(
            self.optimizer, Params({"type": "exponential", "gamma": 0.5}))
    callbacks = self.default_callbacks() + [LrsCallback(new_lr_scheduler)]
    new_trainer = CallbackTrainer(model=self.model,
                                  optimizer=self.optimizer,
                                  callbacks=callbacks,
                                  num_epochs=4,
                                  serialization_dir=self.TEST_DIR)
    new_trainer.handler.fire_event(Events.RESTORE_CHECKPOINT)
    assert new_trainer.epoch_number == 2
    assert new_lr_scheduler.lr_scheduler.last_epoch == 1
    new_trainer.train()
@responses.activate  # intercept the HTTP POST below with the `responses` mock
def test_trainer_posts_to_url(self):
    url = 'http://slack.com?webhook=ewifjweoiwjef'
    responses.add(responses.POST, url)
    post_to_url = PostToUrl(url, message="only a test")
    callbacks = self.default_callbacks() + [post_to_url]

    trainer = CallbackTrainer(model=self.model,
                              optimizer=self.optimizer,
                              num_epochs=2,
                              callbacks=callbacks)
    trainer.train()

    assert len(responses.calls) == 1
    assert responses.calls[0].response.request.body == b'{"text": "only a test"}'
def test_trainer_can_resume_training(self):
    trainer = CallbackTrainer(self.model,
                              self.optimizer,
                              callbacks=self.default_callbacks(),
                              num_epochs=1,
                              serialization_dir=self.TEST_DIR)
    trainer.train()

    new_trainer = CallbackTrainer(self.model,
                                  self.optimizer,
                                  callbacks=self.default_callbacks(),
                                  num_epochs=3,
                                  serialization_dir=self.TEST_DIR)
    new_trainer.handler.fire_event(Events.RESTORE_CHECKPOINT)
    assert new_trainer.epoch_number == 1

    tracker = new_trainer.metric_tracker
    assert tracker is not None
    assert tracker.is_best_so_far()
    assert tracker._best_so_far is not None  # pylint: disable=protected-access

    new_trainer.train()
def test_trainer_can_run_ema_from_params(self):
    uma_params = Params({"moving_average": {"decay": 0.9999}})
    callbacks = self.default_callbacks() + [UpdateMovingAverage.from_params(uma_params, self.model)]

    trainer = CallbackTrainer(model=self.model,
                              training_data=self.instances,
                              iterator=self.iterator,
                              optimizer=self.optimizer,
                              num_epochs=2,
                              callbacks=callbacks)
    trainer.train()
def test_trainer_can_resume_with_lr_scheduler(self):
    lr_scheduler = LearningRateScheduler.from_params(
            self.optimizer, Params({"type": "exponential", "gamma": 0.5}))
    callbacks = self.default_callbacks() + [UpdateLearningRate(lr_scheduler)]
    trainer = CallbackTrainer(model=self.model,
                              training_data=self.instances,
                              iterator=self.iterator,
                              optimizer=self.optimizer,
                              callbacks=callbacks,
                              num_epochs=2,
                              serialization_dir=self.TEST_DIR)
    trainer.train()

    new_lr_scheduler = LearningRateScheduler.from_params(
            self.optimizer, Params({"type": "exponential", "gamma": 0.5}))
    callbacks = self.default_callbacks() + [UpdateLearningRate(new_lr_scheduler)]
    new_trainer = CallbackTrainer(model=self.model,
                                  training_data=self.instances,
                                  iterator=self.iterator,
                                  optimizer=self.optimizer,
                                  callbacks=callbacks,
                                  num_epochs=4,
                                  serialization_dir=self.TEST_DIR)
    new_trainer.handler.fire_event(Events.TRAINING_START)
    assert new_trainer.epoch_number == 2
    assert new_lr_scheduler.lr_scheduler.last_epoch == 1
    new_trainer.train()
def test_trainer_can_resume_training(self):
    trainer = CallbackTrainer(self.model,
                              training_data=self.instances,
                              iterator=self.iterator,
                              optimizer=self.optimizer,
                              callbacks=self.default_callbacks(),
                              num_epochs=1,
                              serialization_dir=self.TEST_DIR)
    trainer.train()

    new_trainer = CallbackTrainer(self.model,
                                  training_data=self.instances,
                                  iterator=self.iterator,
                                  optimizer=self.optimizer,
                                  callbacks=self.default_callbacks(),
                                  num_epochs=3,
                                  serialization_dir=self.TEST_DIR)
    new_trainer.handler.fire_event(Events.TRAINING_START)
    assert new_trainer.epoch_number == 1

    tracker = new_trainer.metric_tracker
    assert tracker is not None
    assert tracker.is_best_so_far()
    assert tracker._best_so_far is not None  # pylint: disable=protected-access

    new_trainer.train()
def test_trainer_can_log_learning_rates_tensorboard(self):
    callbacks = [cb for cb in self.default_callbacks() if not isinstance(cb, LogToTensorboard)]
    # The lambda: None is unfortunate, but it will get replaced by the callback.
    tensorboard = TensorboardWriter(lambda: None,
                                    should_log_learning_rate=True,
                                    summary_interval=2)
    callbacks.append(LogToTensorboard(tensorboard))

    trainer = CallbackTrainer(self.model,
                              training_data=self.instances,
                              iterator=self.iterator,
                              optimizer=self.optimizer,
                              num_epochs=2,
                              serialization_dir=self.TEST_DIR,
                              callbacks=callbacks)
    trainer.train()
def test_trainer_can_log_histograms(self):
    # Enable activation logging.
    for module in self.model.modules():
        module.should_log_activations = True

    callbacks = [cb for cb in self.default_callbacks() if not isinstance(cb, LogToTensorboard)]
    # The lambda: None is unfortunate, but it will get replaced by the callback.
    tensorboard = TensorboardWriter(lambda: None, histogram_interval=2)
    callbacks.append(LogToTensorboard(tensorboard))

    trainer = CallbackTrainer(self.model,
                              self.optimizer,
                              num_epochs=3,
                              serialization_dir=self.TEST_DIR,
                              callbacks=callbacks)
    trainer.train()
def test_trainer_saves_metrics_every_epoch(self):
    trainer = CallbackTrainer(model=self.model,
                              optimizer=self.optimizer,
                              num_epochs=5,
                              serialization_dir=self.TEST_DIR,
                              callbacks=self.default_callbacks(max_checkpoints=3))
    trainer.train()

    for epoch in range(5):
        epoch_file = self.TEST_DIR / f'metrics_epoch_{epoch}.json'
        assert epoch_file.exists()
        with open(epoch_file) as f:
            metrics = json.load(f)
        assert "validation_loss" in metrics
        assert "best_validation_loss" in metrics
        assert metrics.get("epoch") == epoch
def test_trainer_can_run_exponential_moving_average(self):
    moving_average = ExponentialMovingAverage(self.model.named_parameters(), decay=0.9999)
    callbacks = self.default_callbacks() + [UpdateMovingAverage(moving_average)]

    trainer = CallbackTrainer(model=self.model,
                              training_data=self.instances,
                              iterator=self.iterator,
                              optimizer=self.optimizer,
                              num_epochs=2,
                              callbacks=callbacks)
    trainer.train()
def test_handle_errors(self):
    class ErrorTest(Callback):
        """
        A callback with three triggers:
        * at BATCH_START, it raises a RuntimeError
        * at TRAINING_END, it sets a finished flag to True
        * at ERROR, it captures `trainer.exception`
        """
        def __init__(self) -> None:
            self.exc: Optional[Exception] = None
            self.finished_training = None

        @handle_event(Events.BATCH_START)
        def raise_exception(self, trainer):
            raise RuntimeError("problem starting batch")

        @handle_event(Events.TRAINING_END)
        def finish_training(self, trainer):
            self.finished_training = True

        @handle_event(Events.ERROR)
        def capture_error(self, trainer):
            self.exc = trainer.exception

    error_test = ErrorTest()
    callbacks = self.default_callbacks() + [error_test]

    original_trainer = CallbackTrainer(self.model,
                                       self.instances,
                                       self.iterator,
                                       self.optimizer,
                                       callbacks=callbacks,
                                       num_epochs=1,
                                       serialization_dir=self.TEST_DIR)

    with pytest.raises(RuntimeError):
        original_trainer.train()

    # The callback should have captured the exception.
    assert error_test.exc is not None
    assert error_test.exc.args == ("problem starting batch",)

    # The "finished" flag should never have been set to True.
    assert not error_test.finished_training
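# A self-contained toy (not AllenNLP's implementation) of the error-handling
# contract test_handle_errors exercises above: the training loop records the
# exception on the trainer, publishes an ERROR event so callbacks can inspect
# `trainer.exception`, then re-raises so the caller still sees the failure.
# `_ToyTrainer` and `on_error` are illustrative names, not suite APIs.
class _ToyTrainer:
    def __init__(self, on_error):
        self.exception = None
        self._on_error = on_error

    def train(self):
        try:
            # Stand-in for a batch that fails at BATCH_START.
            raise RuntimeError("problem starting batch")
        except Exception as exc:
            self.exception = exc   # what an ERROR handler reads
            self._on_error(self)   # analogous to firing Events.ERROR
            raise                  # callers (and pytest.raises) still see the failure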
def test_metric_only_considered_best_so_far_when_strictly_better_than_those_before_it_decreasing_metric(self):
    new_trainer = CallbackTrainer(self.model,
                                  training_data=self.instances,
                                  iterator=self.iterator,
                                  optimizer=self.optimizer,
                                  num_epochs=3,
                                  serialization_dir=self.TEST_DIR,
                                  callbacks=self.default_callbacks(patience=5))
    tracker = new_trainer.metric_tracker

    # when it is the only metric it should be considered the best
    new_tracker = copy.deepcopy(tracker)
    new_tracker.add_metric(1)
    assert new_tracker.is_best_so_far()

    # when it is the same as one before it it is not considered the best
    new_tracker = copy.deepcopy(tracker)
    new_tracker.add_metrics([0.3, 0.3, 0.3, 0.2, 0.5, 0.1, 0.3])
    assert not new_tracker.is_best_so_far()

    # when it is the best it is considered the best
    new_tracker = copy.deepcopy(tracker)
    new_tracker.add_metrics([0.3, 0.3, 0.3, 0.2, 0.5, 0.1, 0.0013])
    assert new_tracker.is_best_so_far()

    # when it is not the best it is not considered the best
    new_tracker = copy.deepcopy(tracker)
    new_tracker.add_metrics([0.3, 0.3, 0.3, 0.2, 0.5, 0.1, 13])
    assert not new_tracker.is_best_so_far()
def test_trainer_respects_num_serialized_models_to_keep(self):
    trainer = CallbackTrainer(self.model,
                              training_data=self.instances,
                              iterator=self.iterator,
                              optimizer=self.optimizer,
                              num_epochs=5,
                              serialization_dir=self.TEST_DIR,
                              callbacks=self.default_callbacks(max_checkpoints=3))
    trainer.train()

    # Now check the serialized files
    for prefix in ['model_state_epoch_*', 'training_state_epoch_*']:
        file_names = glob.glob(os.path.join(self.TEST_DIR, prefix))
        epochs = [int(re.search(r"_([0-9])\.th", fname).group(1)) for fname in file_names]
        assert sorted(epochs) == [2, 3, 4]
def test_should_stop_early_with_early_stopping_disabled(self):
    # Increasing metric
    trainer = CallbackTrainer(self.model,
                              self.optimizer,
                              num_epochs=100,
                              callbacks=self.default_callbacks(validation_metric="+test"))
    tracker = trainer.metric_tracker
    tracker.add_metrics([float(i) for i in reversed(range(20))])
    assert not tracker.should_stop_early()

    # Decreasing metric
    trainer = CallbackTrainer(self.model,
                              self.optimizer,
                              num_epochs=100,
                              callbacks=self.default_callbacks(validation_metric="-test"))
    tracker = trainer.metric_tracker
    tracker.add_metrics([float(i) for i in range(20)])
    assert not tracker.should_stop_early()
def test_trainer_saves_models_at_specified_interval(self):
    iterator = BasicIterator(batch_size=4)
    iterator.index_with(self.vocab)

    trainer = CallbackTrainer(self.model,
                              training_data=self.instances,
                              iterator=iterator,
                              optimizer=self.optimizer,
                              num_epochs=2,
                              serialization_dir=self.TEST_DIR,
                              callbacks=self.default_callbacks(model_save_interval=0.0001))
    trainer.train()

    # Now check the serialized files for models saved during the epoch.
    prefix = "model_state_epoch_*"
    file_names = sorted(glob.glob(os.path.join(self.TEST_DIR, prefix)))
    epochs = [re.search(r"_([0-9\.\-]+)\.th", fname).group(1) for fname in file_names]
    # We should have checkpoints at the end of each epoch and during each, e.g.
    # [0.timestamp, 0, 1.timestamp, 1]
    assert len(epochs) == 4
    assert epochs[3] == "1"
    assert "." in epochs[0]

    # Now make certain we can restore from a timestamped checkpoint. To do so,
    # remove the end-of-epoch checkpoints (epochs 0 and 1 in the file names) so
    # that we are forced to restore from the timestamped checkpoints.
    for k in range(2):
        os.remove(os.path.join(self.TEST_DIR, "model_state_epoch_{}.th".format(k)))
        os.remove(os.path.join(self.TEST_DIR, "training_state_epoch_{}.th".format(k)))
    os.remove(os.path.join(self.TEST_DIR, "best.th"))

    restore_trainer = CallbackTrainer(self.model,
                                      training_data=self.instances,
                                      iterator=iterator,
                                      optimizer=self.optimizer,
                                      num_epochs=2,
                                      serialization_dir=self.TEST_DIR,
                                      callbacks=self.default_callbacks(model_save_interval=0.0001))
    restore_trainer.handler.fire_event(Events.TRAINING_START)
    assert restore_trainer.epoch_number == 2
    # One batch per epoch.
    assert restore_trainer.batch_num_total == 2
def test_model_training(self):
    training_dataset = self.sample_instances if self.sample_only else self.train_instances
    # training_dataset = training_dataset[:500]
    validation_dataset = self.sample_instances if self.sample_only else self.test_instances
    # Parenthesize the conditional: `/` binds tighter than `if/else`, so without
    # parentheses the non-sample branch would be the bare string "serialized"
    # rather than a path under TEST_DATA_ROOT (see the demonstration below).
    serialization_dir = self.TEST_DATA_ROOT / ("serialized_sample" if self.sample_only else "serialized")
    tensorboard_dir = self.TEST_DATA_ROOT / "tensorboard"
    batch_size = 64

    train_iterator = BucketIterator(sorting_keys=[("question", "num_tokens")],
                                    padding_noise=0.0,
                                    batch_size=batch_size)
    val_iterator = BucketIterator(sorting_keys=[("question", "num_tokens")],
                                  padding_noise=0.0,
                                  batch_size=batch_size)
    train_iterator.index_with(vocab=self.vocab)
    val_iterator.index_with(vocab=self.vocab)

    tensorboard = TensorboardWriter(get_batch_num_total=lambda: np.ceil(len(training_dataset) / batch_size),
                                    serialization_dir=tensorboard_dir,
                                    summary_interval=5,
                                    histogram_interval=5,
                                    should_log_parameter_statistics=True)

    trainer = CallbackTrainer(model=self.model,
                              serialization_dir=serialization_dir,
                              iterator=train_iterator,
                              training_data=training_dataset,
                              num_epochs=20,
                              cuda_device=0,
                              optimizer=torch.optim.Adagrad(self.model.parameters()),
                              callbacks=[LogToTensorboard(tensorboard),
                                         Validate(validation_data=validation_dataset,
                                                  validation_iterator=val_iterator),
                                         TrackMetrics(),
                                         ResetMetricsCallback()])
    trainer.train()
    self.val_outputs_fp.close()
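# A minimal, self-contained demonstration of the operator-precedence pitfall
# that the parenthesized serialization_dir above avoids: a conditional
# expression binds looser than `/`, so `a / b if cond else c` parses as
# `(a / b) if cond else c`. Names here are illustrative only.
from pathlib import Path

_root = Path("/tmp/data")  # hypothetical stand-in for TEST_DATA_ROOT
assert (_root / "sample" if False else "full") == "full"          # bare string, root lost
assert _root / ("sample" if False else "full") == _root / "full"  # both branches under the root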
def test_trainer_can_run_cuda(self):
    self.model.cuda()
    trainer = CallbackTrainer(self.model,
                              training_data=self.instances,
                              iterator=self.iterator,
                              optimizer=self.optimizer,
                              num_epochs=2,
                              callbacks=self.default_callbacks(),
                              cuda_device=0)
    metrics = trainer.train()

    assert "peak_cpu_memory_MB" in metrics
    assert isinstance(metrics["peak_cpu_memory_MB"], float)
    assert metrics["peak_cpu_memory_MB"] > 0
    assert "peak_gpu_0_memory_MB" in metrics
    assert isinstance(metrics["peak_gpu_0_memory_MB"], int)
def test_trainer_saves_models_at_specified_interval(self):
    trainer = CallbackTrainer(self.model,
                              self.optimizer,
                              num_epochs=2,
                              serialization_dir=self.TEST_DIR,
                              callbacks=self.default_callbacks(batch_size=4),
                              model_save_interval=0.0001)
    trainer.train()

    # Now check the serialized files for models saved during the epoch.
    prefix = 'model_state_epoch_*'
    file_names = sorted(glob.glob(os.path.join(self.TEST_DIR, prefix)))
    epochs = [re.search(r"_([0-9\.\-]+)\.th", fname).group(1) for fname in file_names]
    # We should have checkpoints at the end of each epoch and during each, e.g.
    # [0.timestamp, 0, 1.timestamp, 1]
    assert len(epochs) == 4
    assert epochs[3] == '1'
    assert '.' in epochs[0]

    # Now make certain we can restore from a timestamped checkpoint. To do so,
    # remove the end-of-epoch checkpoints (epochs 0 and 1 in the file names) so
    # that we are forced to restore from the timestamped checkpoints.
    for k in range(2):
        os.remove(os.path.join(self.TEST_DIR, 'model_state_epoch_{}.th'.format(k)))
        os.remove(os.path.join(self.TEST_DIR, 'training_state_epoch_{}.th'.format(k)))
    os.remove(os.path.join(self.TEST_DIR, 'best.th'))

    restore_trainer = CallbackTrainer(self.model,
                                      self.optimizer,
                                      num_epochs=2,
                                      serialization_dir=self.TEST_DIR,
                                      callbacks=self.default_callbacks(),
                                      model_save_interval=0.0001)
    restore_trainer.handler.fire_event(Events.RESTORE_CHECKPOINT)
    assert restore_trainer.epoch_number == 2
    # One batch per epoch.
    assert restore_trainer.batch_num_total == 2
def test_should_stop_early_with_invalid_patience(self):
    for patience in [0, -1, -2, 1.5, 'None']:
        with pytest.raises(ConfigurationError):
            CallbackTrainer(self.model,
                            training_data=self.instances,
                            iterator=self.iterator,
                            optimizer=self.optimizer,
                            num_epochs=100,
                            callbacks=self.default_callbacks(patience=patience,
                                                             validation_metric="+test"))
def test_trainer_can_run_multiple_gpu(self):
    self.model.cuda()

    class MetaDataCheckWrapper(Model):
        """
        Checks that the metadata field has been correctly split across the batch
        dimension when running on multiple gpus.
        """
        def __init__(self, model):
            super().__init__(model.vocab)
            self.model = model

        def forward(self, **kwargs) -> Dict[str, torch.Tensor]:  # type: ignore
            assert (
                    "metadata" in kwargs and "tags" in kwargs
            ), f"tokens and metadata must be provided. Got {kwargs.keys()} instead."
            batch_size = kwargs["tokens"]["tokens"].size()[0]
            assert len(kwargs["metadata"]) == batch_size, (
                    f"metadata must be split appropriately. Expected {batch_size} elements, "
                    f"got {len(kwargs['metadata'])} elements.")
            return self.model.forward(**kwargs)

    multigpu_iterator = BasicIterator(batch_size=4)
    multigpu_iterator.index_with(self.vocab)
    trainer = CallbackTrainer(MetaDataCheckWrapper(self.model),
                              training_data=self.instances,
                              iterator=multigpu_iterator,
                              optimizer=self.optimizer,
                              num_epochs=2,
                              callbacks=self.default_callbacks(),
                              cuda_device=[0, 1])
    metrics = trainer.train()

    assert "peak_cpu_memory_MB" in metrics
    assert isinstance(metrics["peak_cpu_memory_MB"], float)
    assert metrics["peak_cpu_memory_MB"] > 0
    assert "peak_gpu_0_memory_MB" in metrics
    assert isinstance(metrics["peak_gpu_0_memory_MB"], int)
    assert "peak_gpu_1_memory_MB" in metrics
    assert isinstance(metrics["peak_gpu_1_memory_MB"], int)
def test_trainer_respects_keep_serialized_model_every_num_seconds(self):
    # To test:
    # Create an iterator that sleeps for 2.5 seconds per epoch, so the total
    # training time for one epoch is slightly greater than 2.5 seconds.
    # Run for 6 epochs, keeping the last 2 models, and also keeping a model
    # every 5 seconds. The resulting checkpoints should then be the models at
    # epochs 2 and 4, plus the last two at epochs 5 and 6.

    class WaitingIterator(BasicIterator):
        # pylint: disable=arguments-differ
        def _create_batches(self, *args, **kwargs):
            time.sleep(2.5)
            return super(WaitingIterator, self)._create_batches(*args, **kwargs)

    waiting_iterator = WaitingIterator(batch_size=2)
    waiting_iterator.index_with(self.vocab)

    # Don't want the validation iterator to wait.
    viterator = BasicIterator(batch_size=2)
    viterator.index_with(self.vocab)

    trainer = CallbackTrainer(self.model,
                              training_data=self.instances,
                              iterator=waiting_iterator,
                              optimizer=self.optimizer,
                              num_epochs=6,
                              serialization_dir=self.TEST_DIR,
                              callbacks=self.default_callbacks(max_checkpoints=2,
                                                               checkpoint_every=5,
                                                               validation_iterator=viterator))
    trainer.train()

    # Now check the serialized files
    for prefix in ['model_state_epoch_*', 'training_state_epoch_*']:
        file_names = glob.glob(os.path.join(self.TEST_DIR, prefix))
        epochs = [int(re.search(r"_([0-9])\.th", fname).group(1)) for fname in file_names]
        # Epoch N has N-1 in the file name.
        assert sorted(epochs) == [1, 3, 4, 5]
def test_model_training(self):
    serialization_dir = self.TEST_DATA_ROOT / "serialized_sample"
    tensorboard_dir = self.TEST_DATA_ROOT / "tensorboard.seq2seq"
    batch_size = 64

    train_iterator = BucketIterator(sorting_keys=[("source_tokens", "num_tokens")],
                                    padding_noise=0.0,
                                    batch_size=batch_size)
    train_iterator.index_with(vocab=self.vocab)

    tensorboard = TensorboardWriter(get_batch_num_total=lambda: np.ceil(len(self.train_instances) / batch_size),
                                    serialization_dir=tensorboard_dir,
                                    summary_interval=5,
                                    histogram_interval=5,
                                    should_log_parameter_statistics=True)

    trainer = CallbackTrainer(model=self.model,
                              serialization_dir=serialization_dir,
                              iterator=train_iterator,
                              training_data=self.train_instances,
                              num_epochs=1,
                              cuda_device=0,
                              optimizer=torch.optim.Adam(self.model.parameters(), lr=1e-3),
                              callbacks=[LogToTensorboard(tensorboard),
                                         Validate(validation_data=self.dev_instances,
                                                  validation_iterator=train_iterator),
                                         TrackMetrics(),
                                         ResetMetricsCallback()])

    for i in range(50):
        print('Epoch: {}'.format(i))
        trainer.train()

    import itertools
    predictor = Seq2SeqPredictor(self.model, self.reader)
    for instance in itertools.islice(self.dev_instances, 10):
        print('SOURCE:', instance.fields['source_tokens'].tokens)
        print('GOLD:', instance.fields['target_tokens'].tokens)
        print('PRED:', predictor.predict_instance(instance)['predicted_tokens'])
def test_trainer_can_run(self):
    trainer = CallbackTrainer(model=self.model,
                              optimizer=self.optimizer,
                              callbacks=self.default_callbacks(serialization_dir=None),
                              num_epochs=2)
    metrics = trainer.train()
    assert 'best_validation_loss' in metrics
    assert isinstance(metrics['best_validation_loss'], float)
    assert 'best_validation_accuracy' in metrics
    assert isinstance(metrics['best_validation_accuracy'], float)
    assert 'best_validation_accuracy3' in metrics
    assert isinstance(metrics['best_validation_accuracy3'], float)
    assert 'best_epoch' in metrics
    assert isinstance(metrics['best_epoch'], int)
    assert 'peak_cpu_memory_MB' in metrics

    # Making sure that both increasing and decreasing validation metrics work.
    trainer = CallbackTrainer(model=self.model,
                              optimizer=self.optimizer,
                              callbacks=self.default_callbacks(validation_metric="+loss",
                                                               serialization_dir=None),
                              num_epochs=2)
    metrics = trainer.train()
    assert 'best_validation_loss' in metrics
    assert isinstance(metrics['best_validation_loss'], float)
    assert 'best_validation_accuracy' in metrics
    assert isinstance(metrics['best_validation_accuracy'], float)
    assert 'best_validation_accuracy3' in metrics
    assert isinstance(metrics['best_validation_accuracy3'], float)
    assert 'best_epoch' in metrics
    assert isinstance(metrics['best_epoch'], int)
    assert 'peak_cpu_memory_MB' in metrics
    assert isinstance(metrics['peak_cpu_memory_MB'], float)
    assert metrics['peak_cpu_memory_MB'] > 0
def test_restored_training_returns_best_epoch_metrics_even_if_no_better_epoch_is_found_after_restoring(self):
    # Instead of -loss, use +loss so that the 2nd epoch is considered worse.

    # Run 1 epoch of original training.
    original_trainer = CallbackTrainer(self.model,
                                       training_data=self.instances,
                                       iterator=self.iterator,
                                       optimizer=self.optimizer,
                                       callbacks=self.default_callbacks(validation_metric="+loss"),
                                       num_epochs=1,
                                       serialization_dir=self.TEST_DIR)
    training_metrics = original_trainer.train()

    # Run 1 more epoch of restored training.
    restored_trainer = CallbackTrainer(self.model,
                                       training_data=self.instances,
                                       iterator=self.iterator,
                                       optimizer=self.optimizer,
                                       callbacks=self.default_callbacks(validation_metric="+loss"),
                                       num_epochs=2,
                                       serialization_dir=self.TEST_DIR)
    restored_metrics = restored_trainer.train()

    assert "best_validation_loss" in restored_metrics
    assert "best_validation_accuracy" in restored_metrics
    assert "best_validation_accuracy3" in restored_metrics
    assert "best_epoch" in restored_metrics

    # Epoch 2's validation loss should be lower than epoch 1's, so with +loss
    # the best epoch (and its metrics) should be unchanged after restoring.
    assert training_metrics["best_validation_loss"] == restored_metrics["best_validation_loss"]
    assert training_metrics["best_epoch"] == 0
    assert training_metrics["validation_loss"] > restored_metrics["validation_loss"]