def setup(self, trainer: "pl.Trainer") -> None: self._rank_0_will_call_children_scripts = self.broadcast( self._rank_0_will_call_children_scripts) if self._should_run_deadlock_detection(): self._share_information_to_prevent_deadlock() self.accelerator.setup(trainer) # move the model to the correct device self.model_to_device() trainer_fn = trainer.state.fn if trainer_fn == TrainerFn.FITTING: if self._layer_sync and self.model: self.model = self._layer_sync.apply(self.model) self.setup_precision_plugin() if trainer_fn == TrainerFn.FITTING: # set up optimizers after the module has been moved to the device # but before the module has been wrapped self.setup_optimizers(trainer) optimizers_to_device(self.optimizers, self.root_device) # skip wrapping the model if we are not fitting as no gradients need to be exchanged self._configure_bagua_model(trainer)
def teardown(self) -> None:
    """This method is called to teardown the training process.

    It is the right place to release memory and free other resources.
    """
    optimizers_to_device(self.optimizers, torch.device("cpu"))
    self.precision_plugin.teardown()

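# NOTE: a minimal sketch of what a helper like `optimizers_to_device` presumably
# does, assuming it only needs to relocate the optimizer *state* tensors (e.g.
# Adam's exp_avg buffers) while leaving the parameters alone; the helper names
# below are illustrative, not Lightning's actual implementation.
from typing import Iterable, Union

import torch
from torch.optim import Optimizer


def _optimizer_state_to(optimizer: Optimizer, device: Union[str, torch.device]) -> None:
    # move every tensor held in the optimizer state to the target device
    for state in optimizer.state.values():
        for key, value in state.items():
            if isinstance(value, torch.Tensor):
                state[key] = value.to(device)


def _optimizers_to(optimizers: Iterable[Optimizer], device: Union[str, torch.device]) -> None:
    for optimizer in optimizers:
        _optimizer_state_to(optimizer, device)
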
def setup(self, trainer: "pl.Trainer") -> None: self.accelerator.setup(trainer) self.setup_optimizers(trainer) self.setup_precision_plugin() optimizers_to_device(self.optimizers, self.root_device) self.init_deepspeed() self.barrier()
def setup(self, trainer: "pl.Trainer") -> None: # share ddp pids to all processes self._rank_0_will_call_children_scripts = self.broadcast( self._rank_0_will_call_children_scripts) if self._should_run_deadlock_detection(): self._share_information_to_prevent_deadlock() self.accelerator.setup(trainer) # move the model to the correct device self.model_to_device() # skip wrapping the model if we are not fitting as no gradients need to be exchanged trainer_fn = trainer.state.fn if trainer_fn == TrainerFn.FITTING: if self._layer_sync: self.model = self._layer_sync.apply(self.model) self.setup_precision_plugin() if trainer_fn == TrainerFn.FITTING: self.configure_ddp() # set up optimizers after the wrapped module has been moved to the device self.setup_optimizers(trainer) optimizers_to_device(self.optimizers, self.root_device) if _TORCH_GREATER_EQUAL_1_10 and trainer_fn == TrainerFn.FITTING: import torch.distributed.algorithms.ddp_comm_hooks.post_localSGD_hook as post_localSGD if isinstance(self._ddp_comm_state, post_localSGD.PostLocalSGDState): self._enable_model_averaging()
def configure_ddp(self) -> None:
    # set up optimizers after the wrapped module has been moved to the device
    self.setup_optimizers(self.lightning_module.trainer)
    self.model, self.optimizers = self._setup_model_and_optimizers(
        model=LightningShardedDataParallel(self.model),
        optimizers=self.optimizers,
    )
    optimizers_to_device(self.optimizers, self.root_device)

def configure_ddp(self) -> None:
    self._set_ddp_kwargs()
    self.setup_optimizers(self.model.trainer)
    self.model, self.optimizers = self._setup_model_and_optimizers(
        model=LightningShardedDataParallel(self.model),
        optimizers=self.optimizers,
    )
    optimizers_to_device(self.optimizers, self.root_device)

def configure_ddp(self) -> None:
    self.pre_configure_ddp()
    self.model = self._setup_model(LightningDistributedModule(self.model))
    self._register_ddp_hooks()

    # set up optimizers after the wrapped module has been moved to the device
    self.setup_optimizers(self.lightning_module.trainer)
    optimizers_to_device(self.optimizers, self.root_device)

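# NOTE: a rough sketch of the wrapping step that ``_setup_model`` performs in
# ``configure_ddp`` above, assuming a plain ``DistributedDataParallel`` wrap;
# ``wrap_with_ddp`` is an illustrative helper, not part of Lightning.
import torch
from torch.nn.parallel import DistributedDataParallel


def wrap_with_ddp(module: torch.nn.Module, device: torch.device) -> DistributedDataParallel:
    # move the module to its device first, then wrap it; optimizers are
    # (re)built only afterwards so they reference the wrapped module's parameters
    module = module.to(device)
    device_ids = [device.index] if device.type == "cuda" and device.index is not None else None
    return DistributedDataParallel(module, device_ids=device_ids)
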
def setup(self, trainer: "pl.Trainer") -> None: self.accelerator.setup(trainer) # we set the device so that optimizers can be created with distributed comms. self.lightning_module._device = self.root_device self.setup_optimizers(trainer) self.setup_precision_plugin() optimizers_to_device(self.optimizers, self.root_device) self.init_deepspeed() self.barrier()
def setup(self, trainer: "pl.Trainer") -> None: """Setup plugins for the trainer fit and creates optimizers. Args: trainer: the trainer instance """ self.accelerator.setup(trainer) self.setup_optimizers(trainer) self.setup_precision_plugin() optimizers_to_device(self.optimizers, self.root_device)
def setup(self, trainer: "pl.Trainer") -> None: self.accelerator.setup(trainer) if trainer.state.fn == TrainerFn.FITTING and self._layer_sync: self.model = self._layer_sync.apply(self.model) self.configure_ddp() self.barrier() self.setup_optimizers(trainer) optimizers_to_device(self.optimizers, self.root_device) self.setup_precision_plugin()
def teardown(self) -> None:
    """This method is called to teardown the training process.

    It is the right place to release memory and free other resources.
    """
    optimizers_to_device(self.optimizers, torch.device("cpu"))

    if self.lightning_module is not None:
        log.detail(f"{self.__class__.__name__}: moving model to CPU")
        self.lightning_module.cpu()

    self.precision_plugin.teardown()
    self.accelerator.teardown()

def setup(self, trainer: "pl.Trainer") -> None: self.accelerator.setup(trainer) if trainer.state.fn == TrainerFn.FITTING and self._layer_sync: assert self.model is not None self.model = self._layer_sync.apply(self.model) if not self.cpu_offload: self.model_to_device() self.barrier() self.setup_optimizers(trainer) optimizers_to_device(self.optimizers, self.root_device) self.setup_precision_plugin()
def setup(self, trainer: "pl.Trainer") -> None: self.accelerator.setup(trainer) if self.debug: os.environ["PT_XLA_DEBUG"] = str(1) shared_params = find_shared_parameters(self.model) self.model_to_device() set_shared_parameters(self.model.module, shared_params) self.setup_precision_plugin() if trainer.state.fn == TrainerFn.FITTING: self.setup_optimizers(trainer) optimizers_to_device(self.optimizers, self.root_device)
def setup(self, trainer: "pl.Trainer") -> None: self.accelerator.setup(trainer) # share ddp pids to all processes self._rank_0_will_call_children_scripts = self.broadcast(self._rank_0_will_call_children_scripts) if trainer.state.fn == TrainerFn.FITTING and self._layer_sync: assert self.model is not None self.model = self._layer_sync.apply(self.model) # we set the device so that optimizers can be created with distributed comms. assert self.lightning_module is not None self.lightning_module._device = self.root_device self.barrier() self.setup_optimizers(trainer) optimizers_to_device(self.optimizers, self.root_device) self.setup_precision_plugin()
def setup(self, trainer: "pl.Trainer") -> None: self.start_method = "fork" self.accelerator.setup(trainer) self.setup_optimizers(trainer) self.setup_precision_plugin() optimizers_to_device(self.optimizers, self.root_device) if self.debug: os.environ["PT_XLA_DEBUG"] = str(1) shared_params = find_shared_parameters(self.model) self.model_to_device() if is_overridden("on_post_move_to_device", self.lightning_module): self.model.module.on_post_move_to_device() else: set_shared_parameters(self.model.module, shared_params) self.setup_optimizers(trainer) self.precision_plugin.connect(self.model, None, None)