def test_ddp_configure_ddp():
    """Test that DDPStrategy wraps the model with DistributedDataParallel only when fitting."""
    model = BoringModel()
    ddp_strategy = DDPStrategy()
    trainer = Trainer(
        max_epochs=1,
        strategy=ddp_strategy,
    )
    # test that the model gets wrapped when fitting
    trainer.state.fn = TrainerFn.FITTING
    trainer.strategy.connect(model)
    trainer.lightning_module.trainer = trainer
    trainer.strategy.setup_environment()
    assert isinstance(trainer.model, LightningModule)
    trainer.strategy.setup(trainer)
    # in DDPStrategy.configure_ddp(), the model is wrapped by DistributedDataParallel
    assert isinstance(trainer.model, DistributedDataParallel)

    ddp_strategy = DDPStrategy()
    trainer = Trainer(
        max_epochs=1,
        strategy=ddp_strategy,
    )
    # test that the model is not wrapped when the TrainerFn is not FITTING
    trainer.state.fn = TrainerFn.VALIDATING
    trainer.strategy.connect(model)
    trainer.lightning_module.trainer = trainer
    trainer.strategy.setup_environment()
    trainer.strategy.setup(trainer)
    # in DDPStrategy.configure_ddp(), the model remains a LightningModule
    assert isinstance(trainer.model, LightningModule)
def test_ddp_post_local_sgd_comm_hook(tmpdir):
    """Test for DDP post-localSGD hook."""
    model = BoringModel()
    strategy = DDPStrategy(
        ddp_comm_state=post_localSGD.PostLocalSGDState(
            process_group=None,
            subgroup=None,
            start_localSGD_iter=8,
        ),
        ddp_comm_hook=post_localSGD.post_localSGD_hook,
        model_averaging_period=4,
    )
    trainer = Trainer(
        fast_dev_run=True,
        gpus=2,
        strategy=strategy,
        default_root_dir=tmpdir,
        sync_batchnorm=True,
    )
    trainer.fit(model)
    trainer_comm_hook = trainer.strategy.model.get_ddp_logging_data().comm_hook
    expected_comm_hook = post_localSGD.post_localSGD_hook.__qualname__
    assert trainer_comm_hook == expected_comm_hook
    assert trainer.state.finished, f"Training failed with {trainer.state}"
def test_pluggable_accelerator():
    class TestAccelerator(Accelerator):
        @staticmethod
        def parse_devices(devices):
            return devices

        @staticmethod
        def get_parallel_devices(devices):
            return ["foo"] * devices

        @staticmethod
        def auto_device_count():
            return 3

        @staticmethod
        def is_available():
            return True

        @staticmethod
        def name():
            return "custom_acc_name"

    trainer = Trainer(accelerator=TestAccelerator(), devices=2, strategy="ddp")
    assert isinstance(trainer.accelerator, TestAccelerator)
    assert isinstance(trainer.strategy, DDPStrategy)
    assert trainer.strategy.parallel_devices == ["foo"] * 2

    trainer = Trainer(strategy=DDPStrategy(TestAccelerator()), devices="auto")
    assert isinstance(trainer.accelerator, TestAccelerator)
    assert isinstance(trainer.strategy, DDPStrategy)
    assert trainer.strategy.parallel_devices == ["foo"] * 3
def test_post_local_sgd_model_averaging_value_error(average_parameters_mock, tmpdir):
    """Test that a ValueError is raised when using DDP with post-localSGD and the optimizer is a
    ZeroRedundancyOptimizer."""
    from torch.distributed.optim import ZeroRedundancyOptimizer

    class OptimizerModel(BoringModel):
        def configure_optimizers(self):
            return ZeroRedundancyOptimizer(params=self.parameters(), optimizer_class=torch.optim.Adam, lr=0.01)

    model = OptimizerModel()
    strategy = DDPStrategy(
        ddp_comm_state=post_localSGD.PostLocalSGDState(
            process_group=None,
            subgroup=None,
            start_localSGD_iter=8,
        ),
        ddp_comm_hook=post_localSGD.post_localSGD_hook,
        model_averaging_period=4,
    )
    trainer = Trainer(
        fast_dev_run=True,
        gpus=2,
        strategy=strategy,
        default_root_dir=tmpdir,
        sync_batchnorm=True,
    )
    with pytest.raises(ValueError, match="Currently model averaging cannot work with a distributed optimizer"):
        trainer.fit(model)
    average_parameters_mock.assert_not_called()
def test_tpu_invalid_raises():
    strategy = TPUSpawnStrategy(accelerator=TPUAccelerator(), precision_plugin=PrecisionPlugin())
    with pytest.raises(ValueError, match="TPUAccelerator` can only be used with a `TPUPrecisionPlugin"):
        Trainer(strategy=strategy, devices=8)

    strategy = DDPStrategy(accelerator=TPUAccelerator(), precision_plugin=TPUPrecisionPlugin())
    with pytest.raises(ValueError, match="TPUAccelerator` can only be used with a `SingleTPUStrategy`"):
        Trainer(strategy=strategy, devices=8)
def test_configure_launcher_create_processes_externally():
    class MyClusterEnvironment(ClusterEnvironment):
        @property
        def creates_processes_externally(self):
            return True

        @property
        def main_address(self):
            return ""

        @property
        def main_port(self):
            return 8080

        @staticmethod
        def detect():
            return True

        def world_size(self):
            return 1

        def set_world_size(self):
            pass

        def global_rank(self):
            return 0

        def set_global_rank(self):
            pass

        def local_rank(self):
            return 0

        def node_rank(self):
            return 0

    ddp_strategy = DDPStrategy(cluster_environment=MyClusterEnvironment())
    assert ddp_strategy.launcher is None
    ddp_strategy._configure_launcher()
    assert ddp_strategy.launcher is None
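# A minimal usage sketch (an assumption, not part of the test above): a custom
# ClusterEnvironment such as `MyClusterEnvironment` is handed to the Trainer by
# passing it to the strategy; the accelerator/device settings are illustrative.
def example_trainer_with_custom_cluster_environment(env: ClusterEnvironment) -> Trainer:
    return Trainer(
        accelerator="gpu",
        devices=2,
        strategy=DDPStrategy(cluster_environment=env),
    )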
def test_ddp_dont_configure_sync_batchnorm(trainer_fn):
    model = BoringModelGPU()
    model.layer = torch.nn.BatchNorm1d(10)
    ddp_strategy = DDPStrategy()
    trainer = Trainer(accelerator="gpu", devices=1, strategy=ddp_strategy, sync_batchnorm=True)
    trainer.state.fn = trainer_fn
    trainer.strategy.connect(model)
    trainer.lightning_module.trainer = trainer
    trainer.strategy.setup_environment()
    assert isinstance(trainer.model, LightningModule)
    trainer.strategy.setup(trainer)
    # because the TrainerFn is not FITTING, the model is not configured with sync batchnorm
    assert not isinstance(trainer.strategy.model.layer, torch.nn.modules.batchnorm.SyncBatchNorm)
def main(v_cfg: DictConfig):
    print(OmegaConf.to_yaml(v_cfg))
    seed_everything(0)
    torch.autograd.set_detect_anomaly(True)
    early_stop_callback = EarlyStopping(
        patience=100,
        monitor="Validation Loss",
    )
    model_check_point = ModelCheckpoint(
        monitor='Valid mean spearman boost',
        save_top_k=1,
        save_last=True,
        mode="max",
        auto_insert_metric_name=True,
        # train_time_interval=timedelta(seconds=60 * 60)
    )
    trainer = Trainer(
        gpus=v_cfg["trainer"].gpu,
        enable_model_summary=False,
        strategy=DDPStrategy(
            process_group_backend="gloo" if platform.system() == "Windows" else "nccl",
            find_unused_parameters=False,
        ) if not v_cfg["trainer"]["evaluate"] else None,
        # early_stop_callback=early_stop_callback,
        callbacks=[model_check_point],
        auto_lr_find="learning_rate" if v_cfg["trainer"].auto_lr_find else False,
        max_epochs=3000,
        gradient_clip_val=0.1,
        check_val_every_n_epoch=1,
        replace_sampler_ddp=False,
    )
    model = Regress_hyper_parameters(v_cfg)
    if v_cfg["trainer"].resume_from_checkpoint is not None:
        state_dict = torch.load(v_cfg["trainer"].resume_from_checkpoint)["state_dict"]
        # for item in list(state_dict.keys()):
        #     if "point_feature_extractor" in item:
        #         state_dict.pop(item)
        model.load_state_dict(state_dict, strict=False)
    if v_cfg["trainer"].auto_lr_find:
        trainer.tune(model)
        print(model.learning_rate)
    # model.save('temp/model.pt')
    if v_cfg["trainer"].evaluate:
        trainer.test(model)
    else:
        trainer.fit(model)
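# A hedged sketch of how `main` above might be wired up as a Hydra entry point;
# the `config_path`/`config_name` values and the `_entry` wrapper are
# illustrative assumptions, not taken from the original project.
import hydra


@hydra.main(config_path="configs", config_name="config")
def _entry(v_cfg: DictConfig) -> None:
    main(v_cfg)


if __name__ == "__main__":
    _entry()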
def test_tpu_invalid_raises():
    training_type_plugin = TPUSpawnStrategy(accelerator=TPUAccelerator(), precision_plugin=Mock())
    with pytest.raises(ValueError, match="TPUAccelerator` can only be used with a `TPUPrecisionPlugin"):
        Trainer(strategy=training_type_plugin)

    training_type_plugin = DDPStrategy(accelerator=TPUAccelerator(), precision_plugin=TPUPrecisionPlugin())
    with pytest.raises(ValueError, match="TPUAccelerator` can only be used with a `SingleTPUStrategy`"):
        Trainer(strategy=training_type_plugin)
def test_ddp_fp16_compress_comm_hook(tmpdir):
    """Test for DDP FP16 compress hook."""
    model = BoringModel()
    strategy = DDPStrategy(ddp_comm_hook=default.fp16_compress_hook)
    trainer = Trainer(
        max_epochs=1,
        gpus=2,
        strategy=strategy,
        default_root_dir=tmpdir,
        sync_batchnorm=True,
        fast_dev_run=True,
    )
    trainer.fit(model)
    trainer_comm_hook = trainer.strategy.model.get_ddp_logging_data().comm_hook
    expected_comm_hook = default.fp16_compress_hook.__qualname__
    assert trainer_comm_hook == expected_comm_hook
    assert trainer.state.finished, f"Training failed with {trainer.state}"
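# For context, a rough sketch of what `ddp_comm_hook=default.fp16_compress_hook`
# configures under the hood: the hook is registered on the wrapped
# DistributedDataParallel module. The single-process gloo group and toy Linear
# model below are illustrative only and not part of the test.
def example_register_fp16_compress_hook():
    import os

    import torch
    import torch.distributed as dist
    from torch.nn.parallel import DistributedDataParallel

    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29500")
    dist.init_process_group("gloo", rank=0, world_size=1)  # minimal single-process group
    ddp_model = DistributedDataParallel(torch.nn.Linear(32, 2))
    # state=None means the hook uses the default process group for its allreduce
    ddp_model.register_comm_hook(state=None, hook=default.fp16_compress_hook)
    dist.destroy_process_group()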
def test_tpu_invalid_raises_set_precision_with_strategy():
    accelerator = TPUAccelerator()
    training_type_plugin = TPUSpawnStrategy(accelerator=accelerator, precision_plugin=object())
    with pytest.raises(ValueError, match="`TPUAccelerator` can only be used with a `TPUPrecisionPlugin`"):
        Trainer(strategy=training_type_plugin)

    accelerator = TPUAccelerator()
    training_type_plugin = DDPStrategy(accelerator=accelerator, precision_plugin=TPUPrecisionPlugin())
    with pytest.raises(
        ValueError,
        match="The `TPUAccelerator` can only be used with a `SingleTPUStrategy` or `TPUSpawnStrategy",
    ):
        Trainer(strategy=training_type_plugin)
def test_post_local_sgd_model_averaging(average_parameters_mock, tmpdir):
    """Test that when using DDP with post-localSGD, model averaging is called."""
    model = BoringModel()

    # test that regular ddp does not call model averaging
    trainer = Trainer(
        fast_dev_run=True,
        accelerator="gpu",
        devices=2,
        strategy="ddp",
        default_root_dir=tmpdir,
        sync_batchnorm=True,
        enable_progress_bar=False,
        enable_model_summary=False,
    )
    trainer.fit(model)
    average_parameters_mock.assert_not_called()

    # test that ddp with post-localSGD does call model averaging
    ddp_strategy = DDPStrategy(
        ddp_comm_state=post_localSGD.PostLocalSGDState(
            process_group=None,
            subgroup=None,
            start_localSGD_iter=8,
        ),
        ddp_comm_hook=post_localSGD.post_localSGD_hook,
        model_averaging_period=4,
    )
    trainer = Trainer(
        fast_dev_run=True,
        accelerator="gpu",
        devices=2,
        strategy=ddp_strategy,
        default_root_dir=tmpdir,
        sync_batchnorm=True,
    )
    trainer.fit(model)
    average_parameters_mock.assert_called()
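# Roughly what `model_averaging_period=4` enables inside the strategy once the
# post-localSGD hook switches to local updates: periodic parameter averaging
# across ranks. A standalone sketch with PyTorch's PeriodicModelAverager -- the
# toy model, optimizer, and single-process group below are illustrative only.
def example_periodic_model_averaging():
    import os

    import torch
    import torch.distributed as dist
    from torch.distributed.algorithms.model_averaging.averagers import PeriodicModelAverager

    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29501")
    dist.init_process_group("gloo", rank=0, world_size=1)
    model = torch.nn.Linear(32, 2)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
    averager = PeriodicModelAverager(period=4, warmup_steps=8)
    for _ in range(16):
        loss = model(torch.randn(8, 32)).sum()
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        # no-op during the first `warmup_steps` steps, then averages every `period` steps
        averager.average_parameters(model.parameters())
    dist.destroy_process_group()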
def test_ddp_fp16_compress_wrap_sgd_comm_hook(tmpdir):
    """Test for the DDP FP16 compress wrapper around the PowerSGD hook."""
    model = BoringModel()
    strategy = DDPStrategy(
        ddp_comm_state=powerSGD.PowerSGDState(process_group=None),
        ddp_comm_hook=powerSGD.powerSGD_hook,
        ddp_comm_wrapper=default.fp16_compress_wrapper,
    )
    trainer = Trainer(
        max_epochs=1,
        gpus=2,
        strategy=strategy,
        default_root_dir=tmpdir,
        sync_batchnorm=True,
        fast_dev_run=True,
    )
    trainer.fit(model)
    trainer_comm_hook = trainer.strategy.model.get_ddp_logging_data().comm_hook
    expected_comm_hook = default.fp16_compress_wrapper(powerSGD.powerSGD_hook).__qualname__
    assert trainer_comm_hook == expected_comm_hook
    assert trainer.state.finished, f"Training failed with {trainer.state}"
def test_ddp_strategy_set_timeout(mock_init_process_group):
    """Test that the timeout passed to DDPStrategy is forwarded to ``init_process_group``."""
    test_timedelta = timedelta(seconds=30)
    model = BoringModel()
    ddp_strategy = DDPStrategy(timeout=test_timedelta)
    trainer = Trainer(
        max_epochs=1,
        strategy=ddp_strategy,
    )
    # simulate the fitting phase so the strategy sets up the process group
    trainer.state.fn = TrainerFn.FITTING
    trainer.strategy.connect(model)
    trainer.lightning_module.trainer = trainer
    trainer.strategy.setup_environment()

    process_group_backend = trainer.strategy._get_process_group_backend()
    global_rank = trainer.strategy.cluster_environment.global_rank()
    world_size = trainer.strategy.cluster_environment.world_size()
    mock_init_process_group.assert_called_with(
        process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta
    )
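# In user code (outside this mocked test), the same timeout is simply passed to
# the strategy when building the Trainer -- a hedged sketch with illustrative
# accelerator, device, and timeout values:
def example_trainer_with_ddp_timeout() -> Trainer:
    return Trainer(
        accelerator="gpu",
        devices=2,
        strategy=DDPStrategy(timeout=timedelta(minutes=5)),
    )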
def process_args(args=None, return_io=False):
    """Process arguments for running training."""
    if not isinstance(args, argparse.Namespace):
        args = parse_args(args)

    args.loader_kwargs = dict()

    targs = dict(max_epochs=args.epochs)
    targs['accumulate_grad_batches'] = args.accumulate

    env = None

    if args.ipu:
        targs['accelerator'] = 'ipu'
        targs['devices'] = process_gpus(args.gpus)
    else:
        targs['gpus'] = process_gpus(args.gpus)
        targs['num_nodes'] = args.num_nodes
        if args.lsf:
            ##################################################################################
            # Currently coding against pytorch-lightning 1.4.3
            ##################################################################################
            if args.num_workers > 4:
                print0("num_workers (-k) > 4 can lead to hanging on Summit -- setting to 4", file=sys.stderr)
                args.num_workers = 4
            args.loader_kwargs['num_workers'] = 1  # Set as a default. This will get overridden elsewhere
            args.loader_kwargs['multiprocessing_context'] = 'spawn'
            env = LSFEnvironment()
        elif args.slurm:
            env = SLURMEnvironment()

        if env is not None:
            global RANK
            global SIZE
            try:
                RANK = env.global_rank()
                SIZE = env.world_size()
            except Exception:
                print(">>> Could not get global rank -- setting RANK to 0 and SIZE to 1", file=sys.stderr)
                RANK = 0
                SIZE = 1

        if targs['gpus'] is not None:
            targs['accelerator'] = 'gpu'
            if targs['gpus'] == 1:
                targs['devices'] = 1
            else:
                if env is None:
                    raise ValueError('Please specify environment (--lsf or --slurm) if using more than one GPU')
                # parallel_devices = [torch.device(i) for i in range(torch.cuda.device_count()) if i < targs['gpus']]
                # precision_plugin = NativeMixedPrecisionPlugin(16, 'cuda')
                torch.cuda.set_device(env.local_rank())
                targs['devices'] = targs['gpus']
                targs['strategy'] = DDPStrategy(
                    find_unused_parameters=False,
                    cluster_environment=env,
                    # accelerator=GPUAccelerator(),
                    # parallel_devices=parallel_devices,
                    # precision_plugin=precision_plugin,
                )
                print("---- Rank %s - Using GPUAccelerator with DDPStrategy" % env.global_rank(), file=sys.stderr)
        else:
            targs['accelerator'] = 'cpu'

    del args.gpus

    if args.sanity:
        if isinstance(args.sanity, str):
            args.sanity = int(args.sanity)
        else:
            args.sanity = 4000
        targs['limit_train_batches'] = args.sanity
        targs['limit_val_batches'] = args.sanity // 4

    if args.lr_find:
        targs['auto_lr_find'] = True
    del args.lr_find

    if args.checkpoint is not None:
        if os.path.exists(args.checkpoint):
            targs['resume_from_checkpoint'] = args.checkpoint
        else:
            warnings.warn(f"Ignoring -c/--checkpoint argument because {args.checkpoint} does not exist.")
            args.checkpoint = None

    if args.cuda_profile:
        targs['profiler'] = PyTorchProfiler(filename=f'pytorch_prof.{RANK:0{len(str(SIZE))}}', emit_nvtx=True)

    targs['replace_sampler_ddp'] = False

    args.loader_kwargs = dict()

    # make sure we are classifying if we are adding classifier layers
    # to a resnet features model
    if args.features_checkpoint is not None:
        if args.manifold:
            raise ValueError('Cannot use manifold loss (i.e. -M) if adding classifier (i.e. -F)')
        args.classify = True

    data_mod = DeepIndexDataModule(args, keep_open=True, seed=args.seed + RANK, rank=RANK, size=SIZE)

    # if classification problem, use the number of taxa as the number of outputs
    if args.classify:
        args.n_outputs = data_mod.dataset.n_outputs

    args.input_nc = 136 if args.tnf else len(data_mod.dataset.vocab)

    model = process_model(args, taxa_table=data_mod.dataset.difile.taxa_table)

    if args.num_workers > 0:
        data_mod.dataset.close()

    ret = [model, args, targs]
    if return_io:
        ret.append(io)
        ret.append(data_mod)

    return tuple(ret)
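# A hedged sketch of how the `(model, args, targs)` tuple returned by
# `process_args` might be consumed; the `Trainer(**targs)` call and the bare
# `fit(model)` invocation are assumptions, and the original project may wire
# the data module in differently.
def example_run_training(argv=None):
    model, args, targs = process_args(argv)
    trainer = Trainer(**targs)
    trainer.fit(model)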
def test_parallel_devices_in_strategy_conflict_with_accelerator(parallel_devices, accelerator):
    with pytest.raises(MisconfigurationException, match=r"parallel_devices set through"):
        Trainer(strategy=DDPStrategy(parallel_devices=parallel_devices), accelerator=accelerator)
@RunIf(min_gpus=2)
@mock.patch.dict(
    os.environ,
    {
        "CUDA_VISIBLE_DEVICES": "0,1",
        "SLURM_NTASKS": "2",
        "SLURM_JOB_NAME": "SOME_NAME",
        "SLURM_NODEID": "0",
        "SLURM_PROCID": "1",
        "SLURM_LOCALID": "1",
    },
)
@mock.patch("pytorch_lightning.strategies.DDPStrategy.setup_distributed", autospec=True)
@pytest.mark.parametrize("strategy", ["ddp", DDPStrategy()])
def test_strategy_choice_ddp_slurm(setup_distributed_mock, strategy):
    trainer = Trainer(fast_dev_run=True, strategy=strategy, gpus=2)
    assert trainer._accelerator_connector._is_slurm_managing_tasks()
    assert isinstance(trainer.accelerator, GPUAccelerator)
    assert isinstance(trainer.strategy, DDPStrategy)
    assert isinstance(trainer.strategy.cluster_environment, SLURMEnvironment)
    assert trainer.strategy.cluster_environment.local_rank() == 1
    assert trainer.strategy.local_rank == 1