def test_tensorboard_hparams_reload(tmpdir):
    model = EvalModelTemplate()
    trainer = Trainer(max_epochs=1, default_root_dir=tmpdir)
    assert trainer.log_dir == trainer.logger.log_dir
    trainer.fit(model)

    assert trainer.log_dir == trainer.logger.log_dir
    folder_path = trainer.log_dir

    # make sure yaml is there
    with open(os.path.join(folder_path, "hparams.yaml")) as file:
        # safe_load converts YAML scalar values to the Python dictionary format
        yaml_params = yaml.safe_load(file)
        assert yaml_params["b1"] == 0.5
        assert len(yaml_params.keys()) == 10

    # verify artifacts
    assert len(os.listdir(os.path.join(folder_path, "checkpoints"))) == 1

    # verify tb logs
    event_acc = EventAccumulator(folder_path)
    event_acc.Reload()

    data_pt_1_5 = b'\x12\x93\x01"\x0b\n\tdrop_prob"\x0c\n\nbatch_size"\r\n\x0bin_features"\x0f\n\rlearning_rate"' \
                  b'\x10\n\x0eoptimizer_name"\x0b\n\tdata_root"\x0e\n\x0cout_features"\x0c\n\nhidden_dim"' \
                  b'\x04\n\x02b1"\x04\n\x02b2*\r\n\x0b\x12\thp_metric'
    data_pt_1_6 = b'\x12\xa7\x01"\r\n\tdrop_prob \x03"\x0e\n\nbatch_size \x03"\x0f\n\x0bin_features \x03"' \
                  b'\x11\n\rlearning_rate \x03"\x12\n\x0eoptimizer_name \x01"\r\n\tdata_root \x01"' \
                  b'\x10\n\x0cout_features \x03"\x0e\n\nhidden_dim \x03"\x06\n\x02b1 \x03"' \
                  b'\x06\n\x02b2 \x03*\r\n\x0b\x12\thp_metric'
    hparams_data = data_pt_1_6 if LooseVersion(torch.__version__) >= LooseVersion("1.6.0") else data_pt_1_5

    assert event_acc.summary_metadata['_hparams_/experiment'].plugin_data.plugin_name == 'hparams'
    assert event_acc.summary_metadata['_hparams_/experiment'].plugin_data.content == hparams_data

def _new_model():
    # Create a model that tracks the epochs and batches it has seen.
    # Note: `hparams` comes from the enclosing test's scope.
    model = EvalModelTemplate(**hparams)
    model.num_epochs_seen = 0
    model.num_batches_seen = 0
    model.num_on_load_checkpoint_called = 0

    def increment_epoch(self):
        self.num_epochs_seen += 1

    def increment_batch(self, batch, batch_idx, dataloader_idx):
        self.num_batches_seen += 1

    def increment_on_load_checkpoint(self, _):
        self.num_on_load_checkpoint_called += 1

    # Bind methods to track the epoch and batch numbers the model has seen,
    # as well as the number of times on_load_checkpoint() has been called
    model.on_epoch_end = types.MethodType(increment_epoch, model)
    model.on_train_batch_start = types.MethodType(increment_batch, model)
    model.on_load_checkpoint = types.MethodType(increment_on_load_checkpoint, model)
    return model

def test_model_saving_loading(tmpdir):
    """Tests use case where trainer saves the model, and user loads it from tags independently."""
    model = EvalModelTemplate()

    # logger file to get meta
    logger = tutils.get_default_logger(tmpdir)

    # fit model
    trainer = Trainer(
        max_epochs=1,
        logger=logger,
        checkpoint_callback=ModelCheckpoint(dirpath=tmpdir),
        default_root_dir=tmpdir,
    )
    result = trainer.fit(model)

    # training complete
    assert result == 1, 'amp + ddp model failed to complete'

    # make a prediction
    dataloaders = model.test_dataloader()
    if not isinstance(dataloaders, list):
        dataloaders = [dataloaders]

    for dataloader in dataloaders:
        for batch in dataloader:
            break

    x, y = batch
    x = x.view(x.size(0), -1)

    # generate preds before saving model
    model.eval()
    pred_before_saving = model(x)

    # save model
    new_weights_path = os.path.join(tmpdir, 'save_test.ckpt')
    trainer.save_checkpoint(new_weights_path)

    # load new model
    hparams_path = tutils.get_data_path(logger, path_dir=tmpdir)
    hparams_path = os.path.join(hparams_path, 'hparams.yaml')
    model_2 = EvalModelTemplate.load_from_checkpoint(
        checkpoint_path=new_weights_path,
        hparams_file=hparams_path,
    )
    model_2.eval()

    # make prediction and assert that both predictions are the same
    new_pred = model_2(x)
    assert torch.all(torch.eq(pred_before_saving, new_pred)).item() == 1

def test_full_loop(tmpdir):
    dm = TrialMNISTDataModule(tmpdir)
    dm.prepare_data()
    dm.setup()

    model = EvalModelTemplate()

    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=3,
        weights_summary=None,
    )

    # fit model
    result = trainer.fit(model, dm)
    assert result == 1

    # test
    result = trainer.test(datamodule=dm)
    result = result[0]
    assert result['test_acc'] > 0.8

def test_horovod_multi_optimizer(tmpdir):
    model = TestGAN(**EvalModelTemplate.get_default_hparams())

    trainer_options = dict(
        default_root_dir=str(tmpdir),
        progress_bar_refresh_rate=0,
        max_epochs=1,
        limit_train_batches=0.4,
        limit_val_batches=0.2,
        deterministic=True,
        distributed_backend='horovod',
    )

    # fit model
    trainer = Trainer(**trainer_options)
    result = trainer.fit(model)
    assert result == 1, 'model failed to complete'

    assert len(trainer.optimizers) == 2
    for i, optimizer in enumerate(trainer.optimizers):
        assert hasattr(optimizer, 'synchronize'), 'optimizer has not been wrapped into DistributedOptimizer'

    def get_model_params(model):
        return set(model.parameters())

    def get_optimizer_params(optimizer):
        return set(p for group in optimizer.param_groups for p in group.get('params', []))

    assert get_model_params(model.generator) != get_model_params(model.discriminator)
    assert get_model_params(model.generator) == get_optimizer_params(trainer.optimizers[0])
    assert get_model_params(model.discriminator) == get_optimizer_params(trainer.optimizers[1])

def test_auto_scale_batch_size_set_model_attribute(tmpdir, use_hparams):
    """ Test that new batch size gets written to the correct hyperparameter attribute. """
    tutils.reset_seed()

    hparams = EvalModelTemplate.get_default_hparams()
    before_batch_size = hparams.get('batch_size')

    class HparamsEvalModelTemplate(EvalModelTemplate):

        def dataloader(self, *args, **kwargs):
            # artificially set batch_size so we can get a dataloader
            # remove it immediately after, because we want only self.hparams.batch_size
            setattr(self, "batch_size", before_batch_size)
            dataloader = super().dataloader(*args, **kwargs)
            del self.batch_size
            return dataloader

    datamodule_model = MNISTDataModule(data_dir=tmpdir, batch_size=111)  # this datamodule should get ignored!
    datamodule_fit = MNISTDataModule(data_dir=tmpdir, batch_size=before_batch_size)

    model_class = HparamsEvalModelTemplate if use_hparams else EvalModelTemplate
    model = model_class(**hparams)
    model.datamodule = datamodule_model  # unused when another module gets passed to .tune() / .fit()

    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=1,
        auto_scale_batch_size=True,
        gpus=1,
    )
    trainer.tune(model, datamodule_fit)
    after_batch_size = model.hparams.batch_size if use_hparams else model.batch_size
    assert trainer.datamodule == datamodule_fit
    assert before_batch_size != after_batch_size
    assert after_batch_size <= len(trainer.train_dataloader.dataset)
    assert datamodule_fit.batch_size == after_batch_size
    # should be left unchanged, since it was not passed to .tune()
    assert datamodule_model.batch_size == 111

def test_model_properties_resume_from_checkpoint(tmpdir):
    """ Test that properties like `current_epoch` and `global_step` in model and trainer are always the same. """
    model = EvalModelTemplate()
    checkpoint_callback = ModelCheckpoint(dirpath=tmpdir, monitor="early_stop_on", save_last=True)
    trainer_args = dict(
        default_root_dir=tmpdir,
        max_epochs=1,
        logger=False,
        callbacks=[checkpoint_callback, ModelTrainerPropertyParity()],  # this performs the assertions
    )
    trainer = Trainer(**trainer_args)
    trainer.fit(model)

    trainer_args.update(max_epochs=2)
    trainer = Trainer(**trainer_args, resume_from_checkpoint=str(tmpdir / "last.ckpt"))
    trainer.fit(model)

def test_full_loop(tmpdir):
    reset_seed()

    dm = TrialMNISTDataModule(tmpdir)

    model = EvalModelTemplate()

    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=3,
        weights_summary=None,
        deterministic=True,
    )

    # fit model
    result = trainer.fit(model, dm)
    assert result == 1

    # test
    result = trainer.test(datamodule=dm)
    result = result[0]
    assert result['test_acc'] > 0.8

def test_amp_gpu_ddp_slurm_managed(tmpdir):
    """Make sure DDP + AMP work."""
    # simulate setting slurm flags
    tutils.set_random_master_port()
    os.environ['SLURM_LOCALID'] = str(0)

    model = EvalModelTemplate()

    # exp file to get meta
    logger = tutils.get_default_logger(tmpdir)

    # exp file to get weights
    checkpoint = tutils.init_checkpoint_callback(logger)

    # fit model
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=1,
        gpus=[0],
        distributed_backend='ddp_spawn',
        precision=16,
        checkpoint_callback=checkpoint,
        logger=logger,
    )
    trainer.is_slurm_managing_tasks = True
    result = trainer.fit(model)

    # correct result and ok accuracy
    assert result == 1, 'amp + ddp model failed to complete'

    # test root model address
    assert trainer.accelerator_connector.resolve_root_node_address('abc') == 'abc'
    assert trainer.accelerator_connector.resolve_root_node_address('abc[23]') == 'abc23'
    assert trainer.accelerator_connector.resolve_root_node_address('abc[23-24]') == 'abc23'
    assert trainer.accelerator_connector.resolve_root_node_address('abc[23-24, 45-40, 40]') == 'abc23'

def test_model_checkpoint_none_monitor(tmpdir):
    model = EvalModelTemplate()
    epochs = 2
    checkpoint_callback = ModelCheckpoint(monitor=None, filepath=tmpdir, save_top_k=-1)
    trainer = Trainer(
        default_root_dir=tmpdir,
        early_stop_callback=False,
        checkpoint_callback=checkpoint_callback,
        max_epochs=epochs,
    )
    trainer.fit(model)

    # these should not be set if monitor is None
    assert checkpoint_callback.best_model_path == ''
    assert checkpoint_callback.best_model_score == 0
    assert checkpoint_callback.best_k_models == {}
    assert checkpoint_callback.kth_best_model_path == ''

    # check that the correct ckpts were created
    expected = ['lightning_logs']
    expected.extend(f'epoch={e}.ckpt' for e in range(epochs))
    assert set(os.listdir(tmpdir)) == set(expected)

def test_ckpt_metric_names(tmpdir):
    model = EvalModelTemplate()

    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=1,
        gradient_clip_val=1.0,
        overfit_batches=0.20,
        progress_bar_refresh_rate=0,
        limit_train_batches=0.01,
        limit_val_batches=0.01,
        checkpoint_callback=ModelCheckpoint(monitor='val_loss', filepath=tmpdir + "/{val_loss:.2f}"),
    )

    trainer.fit(model)

    # make sure the checkpoint we saved has the metric in the name
    ckpts = os.listdir(tmpdir)
    ckpts = [x for x in ckpts if "val_loss" in x]
    assert len(ckpts) == 1
    val = re.sub("[^0-9.]", "", ckpts[0])
    assert len(val) > 3

def test_model_checkpoint_period(tmpdir, period):
    model = EvalModelTemplate()
    epochs = 5
    checkpoint_callback = ModelCheckpoint(filepath=tmpdir, save_top_k=-1, period=period)
    trainer = Trainer(
        default_root_dir=tmpdir,
        early_stop_callback=False,
        checkpoint_callback=checkpoint_callback,
        max_epochs=epochs,
        limit_train_batches=0.1,
        limit_val_batches=0.1,
        logger=False,
    )
    trainer.fit(model)

    # check that the correct ckpts were created
    expected = [f'epoch={e}.ckpt' for e in range(epochs) if not (e + 1) % period] if period > 0 else []
    assert set(os.listdir(tmpdir)) == set(expected)

def test_grad_tracking_interval(tmpdir, log_every_n_steps):
    """ Test that gradient norms get tracked in the right interval and that the same keys get logged every time. """
    trainer = Trainer(
        default_root_dir=tmpdir,
        track_grad_norm=2,
        log_every_n_steps=log_every_n_steps,
        max_steps=10,
    )

    with patch.object(trainer.logger, "log_metrics") as mocked:
        model = EvalModelTemplate()
        trainer.fit(model)
        expected = trainer.global_step // log_every_n_steps

        grad_norm_dicts = []
        for _, kwargs in mocked.call_args_list:
            metrics = kwargs.get("metrics", {})
            grad_norm_dict = {k: v for k, v in metrics.items() if k.startswith("grad_")}
            if grad_norm_dict:
                grad_norm_dicts.append(grad_norm_dict)

        assert len(grad_norm_dicts) == expected
        assert all(grad_norm_dicts[0].keys() == g.keys() for g in grad_norm_dicts)

def test_dataloaders_load_every_epoch(tmpdir):
    os.environ['PL_DEV_DEBUG'] = '1'

    model = EvalModelTemplate()
    train_loader = model.train_dataloader()
    model.train_dataloader = None
    val_loader = model.val_dataloader()
    model.val_dataloader = None
    test_loader = model.test_dataloader()
    model.test_dataloader = None

    # logger file to get meta
    trainer = Trainer(
        default_root_dir=tmpdir,
        limit_train_batches=0.3,
        limit_val_batches=0.3,
        reload_dataloaders_every_epoch=True,
        max_epochs=3,
    )
    result = trainer.fit(model, train_loader, val_loader)

    trainer.test(test_dataloaders=test_loader)

    assert len(trainer.dev_debugger.val_dataloader_calls) == 4
    assert len(trainer.dev_debugger.train_dataloader_calls) == 3
    assert len(trainer.dev_debugger.test_dataloader_calls) == 1

    # verify the sequence
    calls = trainer.dev_debugger.dataloader_sequence_calls
    expected_sequence = [
        'val_dataloader',
        'train_dataloader',
        'val_dataloader',
        'train_dataloader',
        'val_dataloader',
        'train_dataloader',
        'val_dataloader',
        'test_dataloader',
    ]
    for call, expected in zip(calls, expected_sequence):
        assert call['name'] == expected

def test_train_loop_only(tmpdir):
    dm = TrialMNISTDataModule(tmpdir)
    dm.prepare_data()
    dm.setup()

    model = EvalModelTemplate()
    model.validation_step = None
    model.validation_step_end = None
    model.validation_epoch_end = None
    model.test_step = None
    model.test_step_end = None
    model.test_epoch_end = None

    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=3,
        weights_summary=None,
    )

    # fit model
    result = trainer.fit(model, dm)
    assert result == 1
    assert trainer.callback_metrics['loss'] < 0.50

def test_dataloaders_with_limit_percent_batches(tmpdir, limit_train_batches, limit_val_batches, limit_test_batches):
    """Verify num_batches for val & test dataloaders passed with batch limit in percent"""
    model = EvalModelTemplate()
    model.val_dataloader = model.val_dataloader__multiple_mixed_length
    model.test_dataloader = model.test_dataloader__multiple_mixed_length
    model.validation_step = model.validation_step__multiple_dataloaders
    model.validation_epoch_end = model.validation_epoch_end__multiple_dataloaders
    model.test_step = model.test_step__multiple_dataloaders
    model.test_epoch_end = model.test_epoch_end__multiple_dataloaders

    # train, multiple val and multiple test passed with percent_check
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=1,
        limit_train_batches=limit_train_batches,
        limit_val_batches=limit_val_batches,
        limit_test_batches=limit_test_batches,
    )
    trainer.fit(model)
    expected_train_batches = int(len(trainer.train_dataloader) * limit_train_batches)
    expected_val_batches = [
        int(len(dataloader) * limit_val_batches) for dataloader in trainer.val_dataloaders
    ]
    assert trainer.num_training_batches == expected_train_batches
    assert trainer.num_val_batches == expected_val_batches

    trainer.test(ckpt_path=None)
    expected_test_batches = [
        int(len(dataloader) * limit_test_batches) for dataloader in trainer.test_dataloaders
    ]
    assert trainer.num_test_batches == expected_test_batches

def test_callbacks_state_resume_from_checkpoint(enable_pl_optimizer, tmpdir):
    """ Test that resuming from a checkpoint restores callbacks that persist state. """
    model = EvalModelTemplate()
    callback_capture = CaptureCallbacksBeforeTraining()

    def get_trainer_args():
        checkpoint = ModelCheckpoint(dirpath=tmpdir, monitor="early_stop_on", save_last=True)
        trainer_args = dict(
            default_root_dir=tmpdir,
            max_steps=1,
            logger=False,
            enable_pl_optimizer=enable_pl_optimizer,
            callbacks=[
                checkpoint,
                callback_capture,
            ],
        )
        assert checkpoint.best_model_path == ""
        assert checkpoint.best_model_score is None
        return trainer_args

    # initial training
    trainer = Trainer(**get_trainer_args())
    trainer.fit(model)
    callbacks_before_resume = deepcopy(trainer.callbacks)

    # resumed training
    trainer = Trainer(**get_trainer_args(), resume_from_checkpoint=str(tmpdir / "last.ckpt"))
    trainer.fit(model)

    assert len(callbacks_before_resume) == len(callback_capture.callbacks)

    for before, after in zip(callbacks_before_resume, callback_capture.callbacks):
        if isinstance(before, ModelCheckpoint):
            assert before.best_model_path == after.best_model_path
            assert before.best_model_score == after.best_model_score

def test_resume_early_stopping_from_checkpoint(tmpdir): """ Prevent regressions to bugs: https://github.com/PyTorchLightning/pytorch-lightning/issues/1464 https://github.com/PyTorchLightning/pytorch-lightning/issues/1463 """ seed_everything(42) model = EvalModelTemplate() checkpoint_callback = ModelCheckpoint(dirpath=tmpdir, monitor="early_stop_on", save_top_k=1) early_stop_callback = EarlyStoppingTestRestore() trainer = Trainer( default_root_dir=tmpdir, checkpoint_callback=checkpoint_callback, callbacks=[early_stop_callback], num_sanity_val_steps=0, max_epochs=4, ) trainer.fit(model) checkpoint_filepath = checkpoint_callback.kth_best_model_path # ensure state is persisted properly checkpoint = torch.load(checkpoint_filepath) # the checkpoint saves "epoch + 1" early_stop_callback_state = early_stop_callback.saved_states[checkpoint["epoch"] - 1] assert 4 == len(early_stop_callback.saved_states) assert checkpoint["callbacks"][type(early_stop_callback)] == early_stop_callback_state # ensure state is reloaded properly (assertion in the callback) early_stop_callback = EarlyStoppingTestRestore(early_stop_callback_state) new_trainer = Trainer( default_root_dir=tmpdir, max_epochs=1, resume_from_checkpoint=checkpoint_filepath, callbacks=[early_stop_callback], ) with pytest.raises(MisconfigurationException, match=r'.*you restored a checkpoint with current_epoch*'): new_trainer.fit(model)
def test_resume_early_stopping_from_checkpoint(tmpdir): """ Prevent regressions to bugs: https://github.com/PyTorchLightning/pytorch-lightning/issues/1464 https://github.com/PyTorchLightning/pytorch-lightning/issues/1463 """ model = EvalModelTemplate() checkpoint_callback = ModelCheckpoint(save_top_k=1) early_stop_callback = EarlyStoppingTestRestore() trainer = Trainer( default_root_dir=tmpdir, checkpoint_callback=checkpoint_callback, early_stop_callback=early_stop_callback, num_sanity_val_steps=0, max_epochs=4, ) trainer.fit(model) checkpoint_filepath = checkpoint_callback.kth_best_model # ensure state is persisted properly checkpoint = torch.load(checkpoint_filepath) # the checkpoint saves "epoch + 1" early_stop_callback_state = early_stop_callback.saved_states[ checkpoint["epoch"] - 1] assert 4 == len(early_stop_callback.saved_states) assert checkpoint["callbacks"][type( early_stop_callback)] == early_stop_callback_state # ensure state is reloaded properly (assertion in the callback) early_stop_callback = EarlyStoppingTestRestore(early_stop_callback_state) new_trainer = Trainer( default_root_dir=tmpdir, max_epochs=2, resume_from_checkpoint=checkpoint_filepath, early_stop_callback=early_stop_callback, ) new_trainer.fit(model)
def test_trainer_interrupted_flag(tmpdir):
    """Test the flag denoting that a user interrupted training."""
    model = EvalModelTemplate()

    class InterruptCallback(Callback):

        def __init__(self):
            super().__init__()

        def on_train_batch_start(self, trainer, pl_module, batch, batch_idx, dataloader_idx):
            raise KeyboardInterrupt

    class HandleInterruptCallback(Callback):

        def __init__(self):
            super().__init__()
            self.exc_info = None

        def on_keyboard_interrupt(self, trainer, pl_module):
            self.exc_info = sys.exc_info()

    interrupt_callback = InterruptCallback()
    handle_interrupt_callback = HandleInterruptCallback()

    trainer = Trainer(
        callbacks=[interrupt_callback, handle_interrupt_callback],
        max_epochs=1,
        limit_val_batches=0.1,
        limit_train_batches=0.2,
        progress_bar_refresh_rate=0,
        logger=False,
        default_root_dir=tmpdir,
    )
    assert not trainer.interrupted
    assert handle_interrupt_callback.exc_info is None
    trainer.fit(model)
    assert trainer.interrupted
    assert isinstance(handle_interrupt_callback.exc_info[1], KeyboardInterrupt)

def test_mlflow_logger_dirs_creation(tmpdir):
    """ Test that the logger creates the folders and files in the right place. """
    assert not os.listdir(tmpdir)
    logger = MLFlowLogger('test', save_dir=tmpdir)
    assert logger.save_dir == tmpdir
    assert set(os.listdir(tmpdir)) == {'.trash'}
    run_id = logger.run_id
    exp_id = logger.experiment_id

    # multiple experiment calls should not lead to new experiment folders
    for i in range(2):
        _ = logger.experiment
        assert set(os.listdir(tmpdir)) == {'.trash', exp_id}
        assert set(os.listdir(tmpdir / exp_id)) == {run_id, 'meta.yaml'}

    model = EvalModelTemplate()
    trainer = Trainer(default_root_dir=tmpdir, logger=logger, max_epochs=1, limit_val_batches=3)
    trainer.fit(model)
    assert set(os.listdir(tmpdir / exp_id)) == {run_id, 'meta.yaml'}
    assert 'epoch' in os.listdir(tmpdir / exp_id / run_id / 'metrics')
    assert set(os.listdir(tmpdir / exp_id / run_id / 'params')) == model.hparams.keys()
    assert trainer.ckpt_path == trainer.weights_save_path == (tmpdir / exp_id / run_id / 'checkpoints')
    assert set(os.listdir(trainer.ckpt_path)) == {'epoch=0.ckpt'}

def test_model_checkpoint_save_last(tmpdir):
    """Tests that save_last produces only one last checkpoint."""
    model = EvalModelTemplate()
    epochs = 3
    ModelCheckpoint.CHECKPOINT_NAME_LAST = 'last-{epoch}'
    model_checkpoint = ModelCheckpoint(filepath=tmpdir, save_top_k=-1, save_last=True)
    trainer = Trainer(
        default_root_dir=tmpdir,
        early_stop_callback=False,
        checkpoint_callback=model_checkpoint,
        max_epochs=epochs,
    )
    trainer.fit(model)
    last_filename = model_checkpoint._format_checkpoint_name(ModelCheckpoint.CHECKPOINT_NAME_LAST, epochs - 1, {})
    last_filename = last_filename + '.ckpt'
    assert str(tmpdir / last_filename) == model_checkpoint.last_model_path
    assert set(os.listdir(tmpdir)) == set(
        [f'epoch={i}.ckpt' for i in range(epochs)] + [last_filename, 'lightning_logs']
    )
    ModelCheckpoint.CHECKPOINT_NAME_LAST = 'last'

def test_trainer_reset_correctly(tmpdir): """Check that all trainer parameters are reset correctly after scaling batch size.""" tutils.reset_seed() model = EvalModelTemplate() # logger file to get meta trainer = Trainer(default_root_dir=tmpdir, max_epochs=1) changed_attributes = [ "callbacks", "checkpoint_callback", "current_epoch", "limit_train_batches", "logger", "max_steps", "weights_summary", ] expected = {ca: getattr(trainer, ca) for ca in changed_attributes} trainer.tuner.scale_batch_size(model, max_trials=5) actual = {ca: getattr(trainer, ca) for ca in changed_attributes} assert actual == expected
def test_trainer_reset_correctly(tmpdir): """Check that all trainer parameters are reset correctly after lr_find()""" model = EvalModelTemplate() # logger file to get meta trainer = Trainer(default_root_dir=tmpdir, max_epochs=1) changed_attributes = [ "accumulate_grad_batches", "auto_lr_find", "callbacks", "checkpoint_callback", "current_epoch", "logger", "max_steps", ] expected = {ca: getattr(trainer, ca) for ca in changed_attributes} trainer.tuner.lr_find(model, num_training=5) actual = {ca: getattr(trainer, ca) for ca in changed_attributes} assert actual == expected assert model.trainer == trainer
def test_trainer_attached_to_dm(tmpdir):
    reset_seed()

    dm = TrialMNISTDataModule(tmpdir)

    model = EvalModelTemplate()

    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=3,
        weights_summary=None,
        deterministic=True,
    )

    # fit model
    result = trainer.fit(model, dm)
    assert result == 1
    assert dm.trainer is not None

    # test
    result = trainer.test(datamodule=dm)
    result = result[0]
    assert dm.trainer is not None

def test_wrong_train_setting(tmpdir):
    """
    * Test that an error is thrown when no `train_dataloader()` is defined
    * Test that an error is thrown when no `training_step()` is defined
    """
    tutils.reset_seed()
    hparams = tutils.get_default_hparams()
    trainer = Trainer(default_root_dir=tmpdir, max_epochs=1)

    with pytest.raises(MisconfigurationException):
        model = EvalModelTemplate(hparams)
        model.train_dataloader = None
        trainer.fit(model)

    with pytest.raises(MisconfigurationException):
        model = EvalModelTemplate(hparams)
        model.training_step = None
        trainer.fit(model)

def test_trainer_subclassing():
    model = EvalModelTemplate()

    # First way of pulling out args from signature is to list them
    class TrainerSubclass(Trainer):

        def __init__(self, custom_arg, *args, custom_kwarg='test', **kwargs):
            super().__init__(*args, **kwargs)
            self.custom_arg = custom_arg
            self.custom_kwarg = custom_kwarg

    trainer = TrainerSubclass(123, custom_kwarg='custom', fast_dev_run=True)
    result = trainer.fit(model)
    assert result == 1
    assert trainer.custom_arg == 123
    assert trainer.custom_kwarg == 'custom'
    assert trainer.fast_dev_run

    # Second way is to pop from the dict
    # It's a special case because Trainer does not have any positional args
    class TrainerSubclass(Trainer):

        def __init__(self, **kwargs):
            self.custom_arg = kwargs.pop('custom_arg', 0)
            self.custom_kwarg = kwargs.pop('custom_kwarg', 'test')
            super().__init__(**kwargs)

    trainer = TrainerSubclass(custom_kwarg='custom', fast_dev_run=True)
    result = trainer.fit(model)
    assert result == 1
    assert trainer.custom_kwarg == 'custom'
    assert trainer.fast_dev_run

    # when we pass in an unknown arg, the base class should complain
    with pytest.raises(TypeError, match=r"__init__\(\) got an unexpected keyword argument 'abcdefg'"):
        TrainerSubclass(abcdefg='unknown_arg')

def test_gpu_stats_monitor(tmpdir):
    """ Test GPU stats are logged using a logger. """
    model = EvalModelTemplate()
    gpu_stats = GPUStatsMonitor(intra_step_time=True)
    logger = CSVLogger(tmpdir)
    log_every_n_steps = 2

    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=2,
        limit_train_batches=7,
        log_every_n_steps=log_every_n_steps,
        gpus=1,
        callbacks=[gpu_stats],
        logger=logger,
    )

    results = trainer.fit(model)
    assert results

    path_csv = os.path.join(logger.log_dir, ExperimentWriter.NAME_METRICS_FILE)
    met_data = np.genfromtxt(path_csv, delimiter=',', names=True, deletechars='', replace_space=' ')

    batch_time_data = met_data['batch_time/intra_step (ms)']
    batch_time_data = batch_time_data[~np.isnan(batch_time_data)]
    assert batch_time_data.shape[0] == trainer.global_step // log_every_n_steps

    fields = [
        'utilization.gpu',
        'memory.used',
        'memory.free',
        'utilization.memory',
    ]
    for f in fields:
        assert any(f in h for h in met_data.dtype.names)

def test_multiple_test_dataloader(tmpdir, ckpt_path):
    """Verify multiple test_dataloader."""
    model_template = EvalModelTemplate()

    class MultipleTestDataloaderModel(EvalModelTemplate):

        def test_dataloader(self):
            return [self.dataloader(train=False), self.dataloader(train=False)]

        def test_step(self, batch, batch_idx, *args, **kwargs):
            return model_template.test_step__multiple_dataloaders(batch, batch_idx, *args, **kwargs)

    model = MultipleTestDataloaderModel()

    # fit model
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=1,
        limit_val_batches=10,
        limit_train_batches=100,
    )
    trainer.fit(model)
    if ckpt_path == 'specific':
        ckpt_path = trainer.checkpoint_callback.best_model_path
    trainer.test(ckpt_path=ckpt_path)

    # verify there are 2 test loaders
    assert len(trainer.test_dataloaders) == 2, 'Multiple test_dataloaders not initiated properly'

    # make sure predictions are good for each test set
    for dataloader in trainer.test_dataloaders:
        tpipes.run_prediction_eval_model_template(trainer.model, dataloader)

    # run the test method
    trainer.test(ckpt_path=ckpt_path)

def test_default_logger_callbacks_cpu_model(tmpdir):
    """Test each of the trainer options."""
    trainer_options = dict(
        default_root_dir=tmpdir,
        max_epochs=1,
        gradient_clip_val=1.0,
        overfit_batches=0.20,
        progress_bar_refresh_rate=0,
        limit_train_batches=0.01,
        limit_val_batches=0.01,
    )

    model = EvalModelTemplate()
    tpipes.run_model_test_without_loggers(trainer_options, model)

    # test freeze on cpu
    model.freeze()
    model.unfreeze()