def test_result_obj_on_tpu(tmpdir):
    """Run the model test with result-object steps and ``tpu_cores=8``.

    Each of the training/validation/test loops is pointed at its
    ``*_step_result_obj`` implementation, and the corresponding
    ``*_step_end`` / ``*_epoch_end`` hooks are set to ``None``.
    """
    seed_everything(1234)

    num_batches = 5
    num_epochs = 2

    model = EvalModelTemplate()

    # Same assignments for every loop, in training -> validation -> test order.
    for stage in ('training', 'validation', 'test'):
        setattr(model, f'{stage}_step', getattr(model, f'{stage}_step_result_obj'))
        setattr(model, f'{stage}_step_end', None)
        setattr(model, f'{stage}_epoch_end', None)

    trainer_options = {
        'default_root_dir': tmpdir,
        'max_epochs': num_epochs,
        'callbacks': [EarlyStopping()],
        'log_every_n_steps': 2,
        'limit_train_batches': num_batches,
        'weights_summary': None,
        'tpu_cores': 8,
    }

    tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False)
def test_result_obj_on_tpu(tmpdir):
    """Run the model test with result-object steps and ``tpu_cores=8``.

    ``PL_DEV_DEBUG`` is set to ``'1'`` for the duration of the run. The
    training/validation/test steps are replaced by their ``*_result_obj``
    variants and the matching ``*_step_end`` / ``*_epoch_end`` hooks are
    set to ``None``.
    """
    seed_everything(1234)

    # Remember the previous value so the debug flag does not leak into
    # whatever test runs next in the same process.
    previous_debug = os.environ.get('PL_DEV_DEBUG')
    os.environ['PL_DEV_DEBUG'] = '1'
    try:
        batches = 5
        epochs = 2

        model = EvalModelTemplate()

        model.training_step = model.training_step_result_obj
        model.training_step_end = None
        model.training_epoch_end = None

        model.validation_step = model.validation_step_result_obj
        model.validation_step_end = None
        model.validation_epoch_end = None

        model.test_step = model.test_step_result_obj
        model.test_step_end = None
        model.test_epoch_end = None

        trainer_options = dict(
            default_root_dir=tmpdir,
            max_epochs=epochs,
            early_stop_callback=True,
            row_log_interval=2,
            limit_train_batches=batches,
            weights_summary=None,
            tpu_cores=8,
        )

        tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False)
    finally:
        # Restore the environment exactly as it was before the test.
        if previous_debug is None:
            os.environ.pop('PL_DEV_DEBUG', None)
        else:
            os.environ['PL_DEV_DEBUG'] = previous_debug
def test_single_gpu_model(tmpdir, gpus):
    """Run the standard model test on the GPU selection passed in as ``gpus``.

    Uses one epoch over 10% of the train and validation batches with the
    progress bar disabled.
    """
    trainer_options = {
        'default_root_dir': tmpdir,
        'progress_bar_refresh_rate': 0,
        'max_epochs': 1,
        'limit_train_batches': 0.1,
        'limit_val_batches': 0.1,
        'gpus': gpus,
    }

    tpipes.run_model_test(trainer_options, EvalModelTemplate())
def test_cpu_model(tmpdir):
    """Run the standard model test with ``on_gpu=False``.

    Uses one epoch over 40% of the train and validation batches with the
    progress bar disabled.
    """
    trainer_options = {
        'default_root_dir': tmpdir,
        'progress_bar_refresh_rate': 0,
        'max_epochs': 1,
        'limit_train_batches': 0.4,
        'limit_val_batches': 0.4,
    }

    tpipes.run_model_test(trainer_options, EvalModelTemplate(), on_gpu=False)
def test_multi_gpu_none_backend(tmpdir):
    """Expect a ``UserWarning`` when running with ``gpus='-1'`` and no backend.

    No ``distributed_backend`` is passed in the trainer options; the model
    test must emit at least one ``UserWarning`` while it runs.
    """
    trainer_options = {
        'default_root_dir': tmpdir,
        'progress_bar_refresh_rate': 0,
        'max_epochs': 1,
        'limit_train_batches': 0.1,
        'limit_val_batches': 0.1,
        'gpus': '-1',
    }
    model = EvalModelTemplate()

    with pytest.warns(UserWarning):
        tpipes.run_model_test(trainer_options, model)
def test_cpu_model_with_amp(tmpdir):
    """Expect ``precision=16`` with ``on_gpu=False`` to be rejected.

    The model test must raise either ``MisconfigurationException`` or
    ``ModuleNotFoundError``.
    """
    expected_errors = (MisconfigurationException, ModuleNotFoundError)

    trainer_options = {
        'default_root_dir': tmpdir,
        'progress_bar_refresh_rate': 0,
        'max_epochs': 1,
        'limit_train_batches': 0.4,
        'limit_val_batches': 0.4,
        'precision': 16,
    }
    model = EvalModelTemplate()

    with pytest.raises(expected_errors):
        tpipes.run_model_test(trainer_options, model, on_gpu=False)
def test_multi_gpu_none_backend(tmpdir):
    """Run the model test on two GPUs with ``distributed_backend=None``.

    A random master port is selected first; the run covers one epoch over
    20% of the train and validation batches.
    """
    tutils.set_random_master_port()

    trainer_options = {
        'default_root_dir': tmpdir,
        'distributed_backend': None,
        'progress_bar_refresh_rate': 0,
        'max_epochs': 1,
        'limit_train_batches': 0.2,
        'limit_val_batches': 0.2,
        'gpus': 2,
    }

    tpipes.run_model_test(trainer_options, EvalModelTemplate())
def test_model_tpu_index(tmpdir, tpu_core):
    """Run the model test pinned to one TPU core index and check the device.

    ``tpu_cores`` is passed as the single-element list ``[tpu_core]``; after
    the run the default XLA device must be ``xla:<tpu_core>``.
    """
    trainer_options = {
        'default_root_dir': tmpdir,
        'progress_bar_refresh_rate': 0,
        'max_epochs': 1,
        'tpu_cores': [tpu_core],
        'limit_train_batches': 0.4,
        'limit_val_batches': 0.4,
    }

    tpipes.run_model_test(trainer_options, EvalModelTemplate(), on_gpu=False, with_hpc=False)

    default_device = torch_xla._XLAC._xla_get_default_device()
    assert default_device == f'xla:{tpu_core}'
def test_tpu_grad_norm(tmpdir):
    """Run the model test with ``tpu_cores=1`` and ``gradient_clip_val=0.1``."""
    trainer_options = {
        'default_root_dir': tmpdir,
        'progress_bar_refresh_rate': 0,
        'max_epochs': 1,
        'tpu_cores': 1,
        'limit_train_batches': 0.4,
        'limit_val_batches': 0.4,
        'gradient_clip_val': 0.1,
    }

    tpipes.run_model_test(trainer_options, EvalModelTemplate(), on_gpu=False, with_hpc=False)
def test_all_features_cpu_model(tmpdir):
    """Run the model test with several trainer options combined.

    Enables gradient clipping, batch overfitting, grad-norm tracking and
    gradient accumulation together, with ``on_gpu=False``.
    """
    trainer_options = {
        'default_root_dir': tmpdir,
        'gradient_clip_val': 1.0,
        'overfit_batches': 0.20,
        'track_grad_norm': 2,
        'progress_bar_refresh_rate': 0,
        'accumulate_grad_batches': 2,
        'max_epochs': 1,
        'limit_train_batches': 0.4,
        'limit_val_batches': 0.4,
    }

    tpipes.run_model_test(trainer_options, EvalModelTemplate(), on_gpu=False)
def test_model_tpu_cores_1(tmpdir):
    """Run the model test with ``distributed_backend='tpu'`` and ``tpu_cores=1``."""
    trainer_options = {
        'default_root_dir': tmpdir,
        'progress_bar_refresh_rate': 0,
        'max_epochs': 1,
        'distributed_backend': 'tpu',
        'tpu_cores': 1,
        'limit_train_batches': 0.4,
        'limit_val_batches': 0.4,
    }

    tpipes.run_model_test(trainer_options, EvalModelTemplate(), on_gpu=False, with_hpc=False)
def test_model_16bit_tpu_cores_1(tmpdir):
    """Run the model test with ``precision=16`` and ``tpu_cores=1``.

    After the run, the ``XLA_USE_BF16`` environment variable must be ``'1'``.
    """
    trainer_options = {
        'default_root_dir': tmpdir,
        'precision': 16,
        'progress_bar_refresh_rate': 0,
        'max_epochs': 1,
        'tpu_cores': 1,
        'limit_train_batches': 0.4,
        'limit_val_batches': 0.4,
    }

    tpipes.run_model_test(trainer_options, EvalModelTemplate(), on_gpu=False)

    bf16_flag = os.environ.get('XLA_USE_BF16')
    assert bf16_flag == '1', "XLA_USE_BF16 was not set in environment variables"
def test_multi_gpu_early_stop_ddp_spawn(tmpdir):
    """Run the model test with ``ddp_spawn`` on GPUs 0 and 1 plus early stopping.

    A random master port is selected first. An ``EarlyStopping`` callback is
    registered and ``max_epochs`` is set to 50.
    """
    tutils.set_random_master_port()

    trainer_options = {
        'default_root_dir': tmpdir,
        'callbacks': [EarlyStopping()],
        'max_epochs': 50,
        'limit_train_batches': 10,
        'limit_val_batches': 10,
        'gpus': [0, 1],
        'distributed_backend': 'ddp_spawn',
    }

    tpipes.run_model_test(trainer_options, EvalModelTemplate())
def test_multi_gpu_model_ddp_spawn(tmpdir):
    """Run the model test with ``ddp_spawn`` on GPUs 0 and 1.

    A random master port is selected first. Afterwards the memory helper
    ``memory.get_memory_profile`` is called with ``'min_max'``.
    """
    tutils.set_random_master_port()

    trainer_options = {
        'default_root_dir': tmpdir,
        'max_epochs': 1,
        'limit_train_batches': 10,
        'limit_val_batches': 10,
        'gpus': [0, 1],
        'distributed_backend': 'ddp_spawn',
        'progress_bar_refresh_rate': 0,
    }

    tpipes.run_model_test(trainer_options, EvalModelTemplate())

    # Exercise the memory helper; its return value is not checked.
    memory.get_memory_profile('min_max')
def test_model_tpu_cores_8(tmpdir):
    """Run the model test with ``tpu_cores=8``.

    Both the train and validation dataloader hooks are replaced with
    ``_serial_train_loader``.
    """
    trainer_options = {
        'default_root_dir': tmpdir,
        'progress_bar_refresh_rate': 0,
        'max_epochs': 1,
        'tpu_cores': 8,
        'limit_train_batches': 0.4,
        'limit_val_batches': 0.4,
    }

    model = EvalModelTemplate()

    # Eight cores need a large dataset, so swap in the shared loader for both.
    model.train_dataloader = _serial_train_loader
    model.val_dataloader = _serial_train_loader

    tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False)
def run_test_from_config(trainer_options):
    """Run the model test on ``EvalModelTemplate`` using ``trainer_options``.

    Args:
        trainer_options: mapping of trainer keyword arguments. It must contain
            ``'default_root_dir'``; a ``checkpoint_callback`` entry pointing at
            that directory is written into it in place.

    After the run, ``hvd.size()`` must be 2. When ``args.on_gpu`` is truthy, a
    Horovod ``Trainer`` is built and its ``root_gpu`` is compared with
    ``hvd.local_rank()``.
    """
    set_random_master_port()

    checkpoint_dir = trainer_options['default_root_dir']
    trainer_options.update({'checkpoint_callback': ModelCheckpoint(checkpoint_dir)})

    run_model_test(
        trainer_options,
        EvalModelTemplate(),
        on_gpu=args.on_gpu,
        version=0,
        with_hpc=False,
    )

    # Horovod should be initialized once training is done; if it is not,
    # this call raises.
    assert hvd.size() == 2

    if not args.on_gpu:
        return

    # Check the root_gpu property against the Horovod local rank.
    gpu_trainer = Trainer(gpus=1, distributed_backend='horovod', max_epochs=1)
    assert gpu_trainer.root_gpu == hvd.local_rank()
def test_multi_cpu_model_ddp(tmpdir):
    """Run the model test with ``ddp_cpu`` across two processes and no GPUs.

    A random master port is selected first.
    """
    tutils.set_random_master_port()

    trainer_options = {
        'default_root_dir': tmpdir,
        'progress_bar_refresh_rate': 0,
        'max_epochs': 1,
        'limit_train_batches': 0.4,
        'limit_val_batches': 0.2,
        'gpus': None,
        'num_processes': 2,
        'distributed_backend': 'ddp_cpu',
    }

    tpipes.run_model_test(trainer_options, EvalModelTemplate(), on_gpu=False)
def test_model_16bit_tpu_index_1(tmpdir):
    """Run the model test with ``precision=16`` pinned to TPU core index 1.

    After the run, the default XLA device must be ``'xla:1'`` and the
    ``XLA_USE_BF16`` environment variable must be ``'1'``.
    """
    trainer_options = {
        'default_root_dir': tmpdir,
        'precision': 16,
        'progress_bar_refresh_rate': 0,
        'max_epochs': 1,
        'distributed_backend': 'tpu',
        'tpu_cores': [1],
        'limit_train_batches': 0.4,
        'limit_val_batches': 0.4,
    }

    tpipes.run_model_test(trainer_options, EvalModelTemplate(), on_gpu=False)

    default_device = torch_xla._XLAC._xla_get_default_device()
    assert default_device == 'xla:1'

    bf16_flag = os.environ.get('XLA_USE_BF16')
    assert bf16_flag == '1', "XLA_USE_BF16 was not set in environment variables"
def test_model_saves_on_multi_gpu(tmpdir):
    """Export the model to ONNX after a ``ddp_spawn`` run on GPUs 0 and 1.

    A random master port is selected first. The export is written to
    ``model.onnx`` inside ``tmpdir`` and the file must exist afterwards.
    """
    tutils.set_random_master_port()

    trainer_options = {
        'default_root_dir': tmpdir,
        'max_epochs': 1,
        'limit_train_batches': 10,
        'limit_val_batches': 10,
        'gpus': [0, 1],
        'distributed_backend': 'ddp_spawn',
        'progress_bar_refresh_rate': 0,
    }

    model = EvalModelTemplate()
    tpipes.run_model_test(trainer_options, model)

    onnx_path = os.path.join(tmpdir, "model.onnx")
    model.to_onnx(onnx_path)

    assert os.path.exists(onnx_path)
def test_early_stopping_cpu_model(tmpdir):
    """Run the model test with an explicit ``EarlyStopping`` callback.

    The callback monitors ``'early_stop_on'`` with ``min_delta=0.1``. After the
    run, the model is frozen and unfrozen once.
    """
    early_stopping = EarlyStopping(monitor='early_stop_on', min_delta=0.1)

    trainer_options = {
        'default_root_dir': tmpdir,
        'early_stop_callback': early_stopping,
        'max_epochs': 2,
        'gradient_clip_val': 1.0,
        'overfit_batches': 0.20,
        'track_grad_norm': 2,
        'limit_train_batches': 0.1,
        'limit_val_batches': 0.1,
    }

    model = EvalModelTemplate()
    tpipes.run_model_test(trainer_options, model, on_gpu=False)

    # Exercise freeze/unfreeze after the run.
    model.freeze()
    model.unfreeze()
def test_base_tpu_model_8(tmpdir):
    """Run the model test with ``tpu_cores=8`` on a 15000-sample loader.

    Both dataloader hooks are replaced by a local factory that builds a
    ``DataLoader`` over ``TrialMNIST`` with ``batch_size=32``.
    """
    trainer_options = {
        'default_root_dir': tmpdir,
        'progress_bar_refresh_rate': 0,
        'max_epochs': 1,
        'tpu_cores': 8,
        'limit_train_batches': 0.4,
        'limit_val_batches': 0.4,
    }

    def _big_mnist_loader():
        # Eight cores need a large dataset.
        samples = TrialMNIST(download=True, num_samples=15000, digits=(0, 1, 2, 5, 8))
        return DataLoader(samples, batch_size=32)

    model = EvalModelTemplate()
    model.train_dataloader = _big_mnist_loader
    model.val_dataloader = _big_mnist_loader

    tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False)
def test_base_tpu_16bit_model_8_cores(tmpdir):
    """Run the model test with ``precision=16`` and ``tpu_cores=8``.

    Both dataloader hooks are replaced by a local factory that builds a
    ``DataLoader`` over a 15000-sample ``TrialMNIST`` with ``batch_size=32``.
    After the run, the ``XLA_USE_BF16`` environment variable must be ``'1'``.
    """
    trainer_options = {
        'default_root_dir': tmpdir,
        'precision': 16,
        'progress_bar_refresh_rate': 0,
        'max_epochs': 1,
        'tpu_cores': 8,
        'limit_train_batches': 0.4,
        'limit_val_batches': 0.4,
    }

    def _big_mnist_loader():
        # Eight cores need a large dataset.
        samples = TrialMNIST(download=True, num_samples=15000, digits=(0, 1, 2, 5, 8))
        return DataLoader(samples, batch_size=32)

    model = EvalModelTemplate()
    model.train_dataloader = _big_mnist_loader
    model.val_dataloader = _big_mnist_loader

    tpipes.run_model_test(trainer_options, model, on_gpu=False)

    bf16_flag = os.environ.get('XLA_USE_BF16')
    assert bf16_flag == '1', "XLA_USE_BF16 was not set in environment variables"