def test_reset_seed_no_op():
    """Test that the reset_seed function is a no-op when seed_everything() was not used."""
    assert "PL_GLOBAL_SEED" not in os.environ
    seed_before = torch.initial_seed()
    seed_utils.reset_seed()
    assert torch.initial_seed() == seed_before
    assert "PL_GLOBAL_SEED" not in os.environ
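# A minimal sketch of the mechanism the reset_seed tests in this collection exercise:
# seed_everything() records the chosen seed in the PL_GLOBAL_SEED environment variable so that
# reset_seed() can restore it later, and stays a no-op when the variable was never set.
# The *_sketch helper names below are hypothetical illustrations, not the Lightning API.
import os
import random

import numpy as np
import torch


def seed_everything_sketch(seed: int) -> int:
    # record the seed so a later reset_seed_sketch() call can restore it
    os.environ["PL_GLOBAL_SEED"] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    return seed


def reset_seed_sketch() -> None:
    # no-op when seed_everything_sketch() was never called
    seed = os.environ.get("PL_GLOBAL_SEED")
    if seed is None:
        return
    seed_everything_sketch(int(seed))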
def setup_distributed(self):
    reset_seed()

    # determine which process we are and world size
    self.set_world_ranks()

    # set warning rank
    rank_zero_only.rank = self.global_rank

    # set up server using proc 0's ip address
    # try to init for 20 times at max in case ports are taken
    # where to store ip_table
    self.init_ddp_connection()

    # on global rank 0, let everyone know training is starting
    if self.is_global_zero and not torch.distributed.is_initialized():
        log.info("-" * 100)
        log.info(f"distributed_backend={self.distributed_backend}")
        log.info(f"All DDP processes registered. Starting ddp with {self.world_size} processes")
        log.info("-" * 100)

    # set the ranks and devices
    self.dist.rank = self.global_rank
    self.dist.device = self.root_device
def setup_distributed(self) -> None:
    reset_seed()

    # determine which process we are and world size
    self.set_world_ranks()

    self._init_bagua_distributed()
def _worker_setup(self, process_idx: int):
    reset_seed()
    self.set_world_ranks(process_idx)
    rank_zero_only.rank = self.global_rank
    init_dist_connection(
        self.cluster_environment, self.torch_distributed_backend, self.global_rank, self.world_size
    )
def new_process(self, process_idx: int, trainer, mp_queue) -> None:
    self.mp_queue = mp_queue

    reset_seed()

    self.tpu_local_core_rank = xm.get_local_ordinal()
    self.tpu_global_core_rank = xm.get_ordinal()

    # set warning rank
    rank_zero_only.rank = self.global_rank

    if self.tpu_global_core_rank != 0 and trainer.progress_bar_callback is not None:
        trainer.progress_bar_callback.disable()

    self.model_to_device()
    trainer.accelerator.setup_optimizers(trainer)
    trainer.precision_plugin.connect(self._model, None, None)

    self.barrier("pre-run-stage")

    results = trainer.run_stage()

    self.transfer_distrib_spawn_state_on_fit_end(results)

    # https://github.com/pytorch/xla/issues/1801#issuecomment-602799542
    self.barrier("end-process")

    # https://github.com/pytorch/xla/issues/2190#issuecomment-641665358
    if self.local_rank == 0:
        time.sleep(2)
def _worker_setup(self, process_idx: int):
    reset_seed()
    self.set_world_ranks(process_idx)
    rank_zero_only.rank = self.global_rank
    self._process_group_backend = self._get_process_group_backend()
    init_dist_connection(
        self.cluster_environment, self._process_group_backend, self.global_rank, self.world_size
    )
def test_reset_seed_everything(): """ Test that we can reset the seed to the initial value set by seed_everything() """ assert "PL_GLOBAL_SEED" not in os.environ seed_utils.seed_everything(123) assert os.environ["PL_GLOBAL_SEED"] == "123" before = torch.rand(1) seed_utils.reset_seed() after = torch.rand(1) assert torch.allclose(before, after)
def setup_environment(self) -> None:
    reset_seed()

    # set warning rank
    rank_zero_only.rank = self.global_rank

    self._process_group_backend = self._get_process_group_backend()
    assert self.cluster_environment is not None
    init_dist_connection(self.cluster_environment, self._process_group_backend)
    super().setup_environment()
def setup_distributed(self):
    reset_seed()

    # determine which process we are and world size
    self.set_world_ranks()

    self._init_deepspeed_distributed()

    if not self._config_initialized:
        self._format_config()
        self._config_initialized = True
def setup_distributed(self):
    log.detail(f"{self.__class__.__name__}: setting up distributed...")
    reset_seed()

    # determine which process we are and world size
    self.set_world_ranks()

    # set warning rank
    rank_zero_only.rank = self.global_rank

    self._process_group_backend = self._get_process_group_backend()
    init_dist_connection(self.cluster_environment, self._process_group_backend)
def setup_distributed(self):
    reset_seed()

    # determine which process we are and world size
    self.set_world_ranks()

    # set warning rank
    rank_zero_only.rank = self.global_rank

    # set up server using proc 0's ip address
    # try to init for 20 times at max in case ports are taken
    # where to store ip_table
    init_dist_connection(self.cluster_environment, self.torch_distributed_backend)
def new_process(self, process_idx, trainer, mp_queue):
    self.mp_queue = mp_queue

    reset_seed()

    self.set_world_ranks(process_idx)

    # set warning rank
    rank_zero_only.rank = self.global_rank  # type: ignore

    # set up server using proc 0's ip address
    # try to init for 20 times at max in case ports are taken
    # where to store ip_table
    self.init_ddp_connection(self.global_rank, self.world_size)

    # TODO: we moved it to the trainer.fit after calling pre_dispatch
    # ... need to double check that it is the correct place
    # self.trainer.call_setup_hook(self.model)

    # on global rank 0, let everyone know training is starting
    if self.is_global_zero and not torch.distributed.is_initialized():
        log.info("-" * 100)
        log.info(f"distributed_backend={self.distributed_backend}")
        log.info(f"All DDP processes registered. Starting ddp with {self.world_size} processes")
        log.info("-" * 100)

    # set the ranks and devices
    self.dist.rank = self.global_rank
    self.dist.device = self.root_device

    if self.sync_batchnorm:
        self.model = self.configure_sync_batchnorm(self.model)

    self.configure_ddp()

    # Moved here so that we can temporarily use the CPU while configuring DDP
    # and use ipex.DEVICE later on.
    # The reason for this move is that these tensors cannot be stored with IPEX;
    # another option might be to replace torch.save the way the IPEX accelerator does.
    # move the model to the correct device
    self.model_to_device()

    self.barrier()
    results = trainer.run_stage()

    # persist info in ddp_spawn
    self.transfer_distrib_spawn_state_on_fit_end(results)
def test_reset_seed_everything(workers):
    """Test that we can reset the seed to the initial value set by seed_everything()"""
    assert "PL_GLOBAL_SEED" not in os.environ
    assert "PL_SEED_WORKERS" not in os.environ
    seed_utils.seed_everything(123, workers)
    before = torch.rand(1)
    assert os.environ["PL_GLOBAL_SEED"] == "123"
    assert os.environ["PL_SEED_WORKERS"] == str(int(workers))

    seed_utils.reset_seed()
    after = torch.rand(1)
    assert os.environ["PL_GLOBAL_SEED"] == "123"
    assert os.environ["PL_SEED_WORKERS"] == str(int(workers))
    assert torch.allclose(before, after)
def setup_distributed(self):
    reset_seed()

    # determine which process we are and world size
    self.set_world_ranks()

    # set warning rank
    rank_zero_only.rank = self.global_rank

    # set up server using proc 0's ip address
    # try to init for 20 times at max in case ports are taken
    # where to store ip_table
    self.init_ddp_connection()

    # set the ranks and devices
    self.dist.rank = self.global_rank
    self.dist.device = self.root_device
def new_process(self, process_idx: int, trainer: "pl.Trainer", mp_queue: SimpleQueue) -> None:
    self.mp_queue = mp_queue

    reset_seed()

    self.set_world_ranks(process_idx)

    # set warning rank
    rank_zero_only.rank = self.global_rank

    # set up server using proc 0's ip address
    # try to init for 20 times at max in case ports are taken
    # where to store ip_table
    init_ddp_connection(
        self.cluster_environment, self.torch_distributed_backend, self.global_rank, self.world_size
    )

    # TODO: we moved it to the trainer.fit after calling pre_dispatch
    # ... need to double check that it is the correct place
    # self.trainer.call_setup_hook(self.model)

    # set the ranks and devices
    self.dist.rank = self.global_rank
    self.dist.device = self.root_device

    # move the model to the correct device
    self.model_to_device()

    if self.sync_batchnorm:
        self.model = self.configure_sync_batchnorm(self.model)

    # skip wrapping the model if we are not fitting as no gradients need to be exchanged
    trainer_fn = self.lightning_module.trainer.state.fn
    if trainer_fn == TrainerFn.FITTING:
        self.configure_ddp()

    self.barrier()

    results = trainer.run_stage()

    # persist info in ddp_spawn
    self.__transfer_distrib_spawn_state_on_fit_end(trainer, results)

    # ensure that spawned processes go through teardown before joining
    trainer._call_teardown_hook()
def new_process(self, process_idx, trainer, mp_queue):
    self.mp_queue = mp_queue

    reset_seed()

    self.set_world_ranks(process_idx)

    # set warning rank
    rank_zero_only.rank = self.global_rank

    # set up server using proc 0's ip address
    # try to init for 20 times at max in case ports are taken
    # where to store ip_table
    self.init_ddp_connection(self.global_rank, self.world_size)

    # TODO: we moved it to the trainer.fit after calling pre_dispatch
    # ... need to double check that it is the correct place
    # self.trainer.call_setup_hook(self.model)

    # on global rank 0, let everyone know training is starting
    if self.is_global_zero and not torch.distributed.is_initialized():
        log.info("-" * 100)
        log.info(f"distributed_backend={self.distributed_backend}")
        log.info(f"All DDP processes registered. Starting ddp with {self.world_size} processes")
        log.info("-" * 100)

    # set the ranks and devices
    self.dist.rank = self.global_rank
    self.dist.device = self.root_device

    # move the model to the correct device
    self.model_to_device()

    if self.sync_batchnorm:
        self.model = self.configure_sync_batchnorm(self.model)

    self.configure_ddp()

    self.barrier()

    results = trainer.run_stage()

    # persist info in ddp_spawn
    self.transfer_distrib_spawn_state_on_fit_end(results)
def new_process(self, process_idx, trainer, mp_queue):
    self.mp_queue = mp_queue

    reset_seed()

    self.set_world_ranks(process_idx)

    # set warning rank
    rank_zero_only.rank = self.global_rank

    # set up server using proc 0's ip address
    # try to init for 20 times at max in case ports are taken
    # where to store ip_table
    self.init_ddp_connection(self.global_rank, self.world_size)

    # TODO: we moved it to the trainer.fit after calling pre_dispatch
    # ... need to double check that it is the correct place
    # self.trainer.call_setup_hook(self.model)

    # set the ranks and devices
    self.dist.rank = self.global_rank
    self.dist.device = self.root_device

    # move the model to the correct device
    self.model_to_device()

    if self.sync_batchnorm:
        self.model = self.configure_sync_batchnorm(self.model)

    self.configure_ddp()

    self.barrier()

    results = trainer.run_stage()

    # persist info in ddp_spawn
    self.transfer_distrib_spawn_state_on_fit_end(results)

    # ensure that spawned processes go through teardown before joining
    trainer._call_teardown_hook()
def run_sanity_check(self, ref_model):
    using_val_step = ref_model.val_dataloader is not None and is_overridden('validation_step', ref_model)
    should_sanity_check = using_val_step and self.num_sanity_val_steps > 0 and self.limit_val_batches > 0

    # run tiny validation (if validation defined)
    # to make sure program won't crash during val
    if should_sanity_check:
        stage = self._running_stage
        self.sanity_checking = True

        # hook and callback
        self.on_sanity_check_start()

        # run eval step
        self.run_evaluation()

        self.on_sanity_check_end()

        self._running_stage = stage

        # reset the seed to what it was before the sanity check
        # prevents the sanity check from affecting random sampling in training
        reset_seed()
def _worker_setup(self, process_idx: int):
    reset_seed()
    self.tpu_local_core_rank = xm.get_local_ordinal()
    self.tpu_global_core_rank = xm.get_ordinal()
    rank_zero_only.rank = self.global_rank
def _worker_setup(self, process_idx: int):
    reset_seed()
    self.set_world_ranks(process_idx)
    rank_zero_only.rank = self.global_rank