def run_model_test(trainer_options, model, on_gpu: bool = True, version=None, with_hpc: bool = True): reset_seed() save_dir = trainer_options['default_root_dir'] # logger file to get meta logger = get_default_logger(save_dir, version=version) trainer_options.update(logger=logger) if 'checkpoint_callback' not in trainer_options: # logger file to get weights checkpoint = init_checkpoint_callback(logger) trainer_options.update(checkpoint_callback=checkpoint) # fit model trainer = Trainer(**trainer_options) result = trainer.fit(model) # correct result and ok accuracy assert result == 1, 'amp + ddp model failed to complete' # test model loading pretrained_model = load_model_from_checkpoint( logger, trainer.checkpoint_callback.dirpath) # test new model accuracy test_loaders = model.test_dataloader() if not isinstance(test_loaders, list): test_loaders = [test_loaders] [ run_prediction(dataloader, pretrained_model) for dataloader in test_loaders ] if with_hpc: if trainer.use_ddp or trainer.use_ddp2: # on hpc this would work fine... but need to hack it for the purpose of the test trainer.model = pretrained_model trainer.optimizers, trainer.lr_schedulers, trainer.optimizer_frequencies = \ trainer.init_optimizers(pretrained_model) # test HPC loading / saving trainer.hpc_save(save_dir, logger) trainer.hpc_load(save_dir, on_gpu=on_gpu)
def run_test_from_config(trainer_options): """Trains the default model with the given config.""" set_random_master_port() reset_seed() ckpt_path = trainer_options['weights_save_path'] trainer_options.update(checkpoint_callback=ModelCheckpoint(ckpt_path)) model = EvalModelTemplate() trainer = Trainer(**trainer_options) result = trainer.fit(model) assert result == 1 # Horovod should be initialized following training. If not, this will raise an exception. assert hvd.size() == 2 if trainer.global_rank > 0: # on higher ranks the checkpoint location is unknown # we want to test checkpointing on rank 0 only assert not hasattr(trainer, 'ckpt_path') assert not trainer.checkpoint_callback.best_model_path return # test model loading pretrained_model = EvalModelTemplate.load_from_checkpoint( trainer.checkpoint_callback.best_model_path) # test new model accuracy test_loaders = model.test_dataloader() if not isinstance(test_loaders, list): test_loaders = [test_loaders] for dataloader in test_loaders: run_prediction(dataloader, pretrained_model) # test HPC loading / saving trainer.hpc_save(ckpt_path, trainer.logger) trainer.hpc_load(ckpt_path, on_gpu=args.on_gpu) if args.on_gpu: trainer = Trainer(gpus=1, distributed_backend='horovod', max_epochs=1) # Test the root_gpu property assert trainer.root_gpu == hvd.local_rank()
def run_model_test(trainer_options, model, on_gpu=True): save_dir = trainer_options['default_save_path'] # logger file to get meta logger = get_test_tube_logger(save_dir, False) # logger file to get weights checkpoint = init_checkpoint_callback(logger) # add these to the trainer options trainer_options['checkpoint_callback'] = checkpoint trainer_options['logger'] = logger # fit model trainer = Trainer(**trainer_options) result = trainer.fit(model) # correct result and ok accuracy assert result == 1, 'amp + ddp model failed to complete' # test model loading pretrained_model = load_model(logger, trainer.checkpoint_callback.filepath) # test new model accuracy test_loaders = model.test_dataloader() if not isinstance(test_loaders, list): test_loaders = [test_loaders] [ run_prediction(dataloader, pretrained_model) for dataloader in test_loaders ] if trainer.use_ddp or trainer.use_ddp2: # on hpc this would work fine... but need to hack it for the purpose of the test trainer.model = pretrained_model trainer.optimizers, trainer.lr_schedulers = pretrained_model.configure_optimizers( ) # test HPC loading / saving trainer.hpc_save(save_dir, logger) trainer.hpc_load(save_dir, on_gpu=on_gpu)
def run_gpu_model_test(trainer_options, model, hparams, on_gpu=True): save_dir = init_save_dir() # logger file to get meta logger = get_test_tube_logger(False) logger.log_hyperparams(hparams) logger.save() # logger file to get weights checkpoint = ModelCheckpoint(save_dir) # add these to the trainer options trainer_options['checkpoint_callback'] = checkpoint trainer_options['logger'] = logger # fit model trainer = Trainer(**trainer_options) result = trainer.fit(model) # correct result and ok accuracy assert result == 1, 'amp + ddp model failed to complete' # test model loading pretrained_model = load_model(logger.experiment, save_dir) # test new model accuracy [ run_prediction(dataloader, pretrained_model) for dataloader in model.test_dataloader() ] if trainer.use_ddp: # on hpc this would work fine... but need to hack it for the purpose of the test trainer.model = pretrained_model trainer.optimizers, trainer.lr_schedulers = pretrained_model.configure_optimizers( ) # test HPC loading / saving trainer.hpc_save(save_dir, logger) trainer.hpc_load(save_dir, on_gpu=on_gpu) clear_save_dir()
def run_gpu_model_test(trainer_options, model, hparams, on_gpu=True): save_dir = init_save_dir() # exp file to get meta exp = get_exp(False) exp.argparse(hparams) exp.save() # exp file to get weights checkpoint = ModelCheckpoint(save_dir) # add these to the trainer options trainer_options['checkpoint_callback'] = checkpoint trainer_options['experiment'] = exp # fit model trainer = Trainer(**trainer_options) result = trainer.fit(model) # correct result and ok accuracy assert result == 1, 'amp + ddp model failed to complete' # test model loading pretrained_model = load_model(exp, save_dir, on_gpu) # test model preds run_prediction(model.test_dataloader, pretrained_model) if trainer.use_ddp: # on hpc this would work fine... but need to hack it for the purpose of the test trainer.model = pretrained_model trainer.optimizers, trainer.lr_schedulers = pretrained_model.configure_optimizers( ) # test HPC loading / saving trainer.hpc_save(save_dir, exp) trainer.hpc_load(save_dir, on_gpu=on_gpu) clear_save_dir()
def test_dp_resume(tmpdir): """Make sure DP continues training correctly.""" hparams = EvalModelTemplate.get_default_hparams() model = EvalModelTemplate(**hparams) trainer_options = dict( max_epochs=1, gpus=2, distributed_backend='dp', ) # get logger logger = tutils.get_default_logger(tmpdir) # exp file to get weights # logger file to get weights checkpoint = tutils.init_checkpoint_callback(logger) # add these to the trainer options trainer_options['logger'] = logger trainer_options['checkpoint_callback'] = checkpoint # fit model trainer = Trainer(**trainer_options) trainer.is_slurm_managing_tasks = True result = trainer.fit(model) # track epoch before saving. Increment since we finished the current epoch, don't want to rerun real_global_epoch = trainer.current_epoch + 1 # correct result and ok accuracy assert result == 1, 'amp + dp model failed to complete' # --------------------------- # HPC LOAD/SAVE # --------------------------- # save trainer.hpc_save(tmpdir, logger) # init new trainer new_logger = tutils.get_default_logger(tmpdir, version=logger.version) trainer_options['logger'] = new_logger trainer_options['checkpoint_callback'] = ModelCheckpoint(tmpdir) trainer_options['limit_train_batches'] = 0.5 trainer_options['limit_val_batches'] = 0.2 trainer_options['max_epochs'] = 1 new_trainer = Trainer(**trainer_options) # set the epoch start hook so we can predict before the model does the full training def assert_good_acc(): assert new_trainer.current_epoch == real_global_epoch and new_trainer.current_epoch > 0 # if model and state loaded correctly, predictions will be good even though we # haven't trained with the new loaded model dp_model = new_trainer.model dp_model.eval() dataloader = trainer.train_dataloader tutils.run_prediction(dataloader, dp_model, dp=True) # new model model = EvalModelTemplate(**hparams) model.on_train_start = assert_good_acc # fit new model which should load hpc weights new_trainer.fit(model) # test freeze on gpu model.freeze() model.unfreeze()
def test_amp_gpu_ddp_slurm_managed(): """ Make sure DDP + AMP work :return: """ if not can_run_gpu_test(): return # simulate setting slurm flags os.environ['MASTER_PORT'] = str(np.random.randint(12000, 19000, 1)[0]) os.environ['SLURM_LOCALID'] = str(0) hparams = get_hparams() model = LightningTestModel(hparams) trainer_options = dict(show_progress_bar=True, max_nb_epochs=1, gpus=[0], distributed_backend='ddp', use_amp=True) save_dir = init_save_dir() # exp file to get meta exp = get_exp(False) exp.argparse(hparams) exp.save() # exp file to get weights checkpoint = ModelCheckpoint(save_dir) # add these to the trainer options trainer_options['checkpoint_callback'] = checkpoint trainer_options['experiment'] = exp # fit model trainer = Trainer(**trainer_options) trainer.is_slurm_managing_tasks = True result = trainer.fit(model) # correct result and ok accuracy assert result == 1, 'amp + ddp model failed to complete' # test root model address assert trainer.resolve_root_node_address('abc') == 'abc' assert trainer.resolve_root_node_address('abc[23]') == 'abc23' assert trainer.resolve_root_node_address('abc[23-24]') == 'abc23' assert trainer.resolve_root_node_address( 'abc[23-24, 45-40, 40]') == 'abc23' # test model loading with a map_location map_location = 'cuda:1' pretrained_model = load_model(exp, save_dir, True, map_location) # test model preds run_prediction(model.test_dataloader, pretrained_model) if trainer.use_ddp: # on hpc this would work fine... but need to hack it for the purpose of the test trainer.model = pretrained_model trainer.optimizers, trainer.lr_schedulers = pretrained_model.configure_optimizers( ) # test HPC loading / saving trainer.hpc_save(save_dir, exp) trainer.hpc_load(save_dir, on_gpu=True) # test freeze on gpu model.freeze() model.unfreeze() clear_save_dir()
def test_cpu_slurm_save_load(): """ Verify model save/load/checkpoint on CPU :return: """ hparams = get_hparams() model = LightningTestModel(hparams) save_dir = init_save_dir() # exp file to get meta exp = get_exp(False) exp.argparse(hparams) exp.save() version = exp.version trainer_options = dict(max_nb_epochs=1, experiment=exp, checkpoint_callback=ModelCheckpoint(save_dir)) # fit model trainer = Trainer(**trainer_options) result = trainer.fit(model) real_global_step = trainer.global_step # traning complete assert result == 1, 'amp + ddp model failed to complete' # predict with trained model before saving # make a prediction for batch in model.test_dataloader: break x, y = batch x = x.view(x.size(0), -1) model.eval() pred_before_saving = model(x) # test HPC saving # simulate snapshot on slurm saved_filepath = trainer.hpc_save(save_dir, exp) assert os.path.exists(saved_filepath) # new exp file to get meta exp = get_exp(False, version=version) exp.argparse(hparams) exp.save() trainer_options = dict( max_nb_epochs=1, experiment=exp, checkpoint_callback=ModelCheckpoint(save_dir), ) trainer = Trainer(**trainer_options) model = LightningTestModel(hparams) # set the epoch start hook so we can predict before the model does the full training def assert_pred_same(): assert trainer.global_step == real_global_step and trainer.global_step > 0 # predict with loaded model to make sure answers are the same trainer.model.eval() new_pred = trainer.model(x) assert torch.all(torch.eq(pred_before_saving, new_pred)).item() == 1 model.on_epoch_start = assert_pred_same # by calling fit again, we trigger training, loading weights from the cluster # and our hook to predict using current model before any more weight updates trainer.fit(model) clear_save_dir()
def test_cpu_slurm_save_load(): """ Verify model save/load/checkpoint on CPU :return: """ hparams = get_hparams() model = LightningTestModel(hparams) save_dir = init_save_dir() # exp file to get meta exp = get_exp(False) exp.argparse(hparams) exp.save() cluster_a = SlurmCluster() trainer_options = dict( max_nb_epochs=1, cluster=cluster_a, experiment=exp, checkpoint_callback=ModelCheckpoint(save_dir) ) # fit model trainer = Trainer(**trainer_options) result = trainer.fit(model) real_global_step = trainer.global_step # traning complete assert result == 1, 'amp + ddp model failed to complete' # predict with trained model before saving # make a prediction for batch in model.test_dataloader: break x, y = batch x = x.view(x.size(0), -1) model.eval() pred_before_saving = model(x) # test registering a save function trainer.enable_auto_hpc_walltime_manager() # test HPC saving # simulate snapshot on slurm saved_filepath = trainer.hpc_save(save_dir, exp) assert os.path.exists(saved_filepath) # wipe-out trainer and model # retrain with not much data... this simulates picking training back up after slurm # we want to see if the weights come back correctly continue_tng_hparams = get_hparams(continue_training=True, hpc_exp_number=cluster_a.hpc_exp_number) trainer_options = dict( max_nb_epochs=1, cluster=SlurmCluster(continue_tng_hparams), experiment=exp, checkpoint_callback=ModelCheckpoint(save_dir), ) trainer = Trainer(**trainer_options) model = LightningTestModel(hparams) # set the epoch start hook so we can predict before the model does the full training def assert_pred_same(): assert trainer.global_step == real_global_step and trainer.global_step > 0 # predict with loaded model to make sure answers are the same trainer.model.eval() new_pred = trainer.model(x) assert torch.all(torch.eq(pred_before_saving, new_pred)).item() == 1 model.on_epoch_start = assert_pred_same # by calling fit again, we trigger training, loading weights from the cluster # and our hook to predict using current model before any more weight updates trainer.fit(model) clear_save_dir()
def test_cpu_slurm_save_load(tmpdir): """Verify model save/load/checkpoint on CPU.""" hparams = EvalModelTemplate.get_default_hparams() model = EvalModelTemplate(hparams) # logger file to get meta logger = tutils.get_default_logger(tmpdir) version = logger.version # fit model trainer = Trainer(max_epochs=1, logger=logger, checkpoint_callback=ModelCheckpoint(tmpdir)) result = trainer.fit(model) real_global_step = trainer.global_step # traning complete assert result == 1, 'cpu model failed to complete' # predict with trained model before saving # make a prediction dataloaders = model.test_dataloader() if not isinstance(dataloaders, list): dataloaders = [dataloaders] for dataloader in dataloaders: for batch in dataloader: break x, y = batch x = x.view(x.size(0), -1) model.eval() pred_before_saving = model(x) # test HPC saving # simulate snapshot on slurm saved_filepath = trainer.hpc_save(tmpdir, logger) assert os.path.exists(saved_filepath) # new logger file to get meta logger = tutils.get_default_logger(tmpdir, version=version) trainer = Trainer( max_epochs=1, logger=logger, checkpoint_callback=ModelCheckpoint(tmpdir), ) model = EvalModelTemplate(hparams) # set the epoch start hook so we can predict before the model does the full training def assert_pred_same(): assert trainer.global_step == real_global_step and trainer.global_step > 0 # predict with loaded model to make sure answers are the same trainer.model.eval() new_pred = trainer.model(x) assert torch.all(torch.eq(pred_before_saving, new_pred)).item() == 1 model.on_epoch_start = assert_pred_same # by calling fit again, we trigger training, loading weights from the cluster # and our hook to predict using current model before any more weight updates trainer.fit(model)
def test_dp_resume(): """ Make sure DP continues training correctly :return: """ if not tutils.can_run_gpu_test(): return tutils.reset_seed() hparams = tutils.get_hparams() model = LightningTestModel(hparams) trainer_options = dict( show_progress_bar=True, max_nb_epochs=2, gpus=2, distributed_backend='dp', ) save_dir = tutils.init_save_dir() # get logger logger = tutils.get_test_tube_logger(debug=False) # exp file to get weights # logger file to get weights checkpoint = tutils.init_checkpoint_callback(logger) # add these to the trainer options trainer_options['logger'] = logger trainer_options['checkpoint_callback'] = checkpoint # fit model trainer = Trainer(**trainer_options) trainer.is_slurm_managing_tasks = True result = trainer.fit(model) # track epoch before saving real_global_epoch = trainer.current_epoch # correct result and ok accuracy assert result == 1, 'amp + dp model failed to complete' # --------------------------- # HPC LOAD/SAVE # --------------------------- # save trainer.hpc_save(save_dir, logger) # init new trainer new_logger = tutils.get_test_tube_logger(version=logger.version) trainer_options['logger'] = new_logger trainer_options['checkpoint_callback'] = ModelCheckpoint(save_dir) trainer_options['train_percent_check'] = 0.2 trainer_options['val_percent_check'] = 0.2 trainer_options['max_nb_epochs'] = 1 new_trainer = Trainer(**trainer_options) # set the epoch start hook so we can predict before the model does the full training def assert_good_acc(): assert new_trainer.current_epoch == real_global_epoch and new_trainer.current_epoch > 0 # if model and state loaded correctly, predictions will be good even though we # haven't trained with the new loaded model dp_model = new_trainer.model dp_model.eval() dataloader = trainer.get_train_dataloader() tutils.run_prediction(dataloader, dp_model, dp=True) # new model model = LightningTestModel(hparams) model.on_sanity_check_start = assert_good_acc # fit new model which should load hpc weights new_trainer.fit(model) # test freeze on gpu model.freeze() model.unfreeze() tutils.clear_save_dir()
def main(): """ Make sure DDP + AMP continue training correctly :return: """ hparams = get_hparams() model = LightningTestModel(hparams) trainer_options = dict( show_progress_bar=True, max_nb_epochs=4, gpus=2, distributed_backend='dp', ) save_dir = init_save_dir() # exp file to get meta exp = get_exp(False) exp.argparse(hparams) exp.save() # exp file to get weights checkpoint = ModelCheckpoint(save_dir) # add these to the trainer options trainer_options['experiment'] = exp trainer_options['checkpoint_callback'] = checkpoint # fit model trainer = Trainer(**trainer_options) trainer.is_slurm_managing_tasks = True result = trainer.fit(model) # track epoch before saving real_global_epoch = trainer.current_epoch # correct result and ok accuracy assert result == 1, 'amp + dp model failed to complete' # --------------------------- # HPC LOAD/SAVE # --------------------------- # save trainer.hpc_save(save_dir, exp) # init new trainer new_exp = get_exp(False, version=exp.version) trainer_options['experiment'] = new_exp trainer_options['checkpoint_callback'] = ModelCheckpoint(save_dir) trainer_options['train_percent_check'] = 0.2 trainer_options['val_percent_check'] = 0.2 trainer_options['max_nb_epochs'] = 1 new_trainer = Trainer(**trainer_options) # set the epoch start hook so we can predict before the model does the full training def assert_good_acc(): assert trainer.current_epoch == real_global_epoch and trainer.current_epoch > 0 # if model and state loaded correctly, predictions will be good even though we # haven't trained with the new loaded model dp_model = new_trainer.model dp_model.eval() _ = [run_prediction(dataloader, dp_model, dp=True) for dataloader in trainer.val_dataloader] # new model model = LightningTestModel(hparams) model.on_sanity_check_start = assert_good_acc # fit new model which should load hpc weights new_trainer.fit(model) # test freeze on gpu model.freeze() model.unfreeze() clear_save_dir()
def test_amp_gpu_ddp_slurm_managed(): """ Make sure DDP + AMP work :return: """ if not can_run_gpu_test(): return reset_seed() # simulate setting slurm flags set_random_master_port() os.environ['SLURM_LOCALID'] = str(0) hparams = get_hparams() model = LightningTestModel(hparams) trainer_options = dict(show_progress_bar=True, max_nb_epochs=1, gpus=[0], distributed_backend='ddp', use_amp=True) save_dir = init_save_dir() # exp file to get meta logger = get_test_tube_logger(False) # exp file to get weights checkpoint = init_checkpoint_callback(logger) # add these to the trainer options trainer_options['checkpoint_callback'] = checkpoint trainer_options['logger'] = logger # fit model trainer = Trainer(**trainer_options) trainer.is_slurm_managing_tasks = True result = trainer.fit(model) # correct result and ok accuracy assert result == 1, 'amp + ddp model failed to complete' # test root model address assert trainer.resolve_root_node_address('abc') == 'abc' assert trainer.resolve_root_node_address('abc[23]') == 'abc23' assert trainer.resolve_root_node_address('abc[23-24]') == 'abc23' assert trainer.resolve_root_node_address( 'abc[23-24, 45-40, 40]') == 'abc23' # test model loading with a map_location pretrained_model = load_model(logger.experiment, trainer.checkpoint_callback.filepath) # test model preds [ run_prediction(dataloader, pretrained_model) for dataloader in trainer.get_test_dataloaders() ] if trainer.use_ddp: # on hpc this would work fine... but need to hack it for the purpose of the test trainer.model = pretrained_model trainer.optimizers, trainer.lr_schedulers = pretrained_model.configure_optimizers( ) # test HPC loading / saving trainer.hpc_save(save_dir, logger) trainer.hpc_load(save_dir, on_gpu=True) # test freeze on gpu model.freeze() model.unfreeze() clear_save_dir()