def configure_optimizers(self):
    self.setup_optimization()

    # Wrap the baseline optimizer with the optimizer class with master parameters
    if self.megatron_amp_o2 and self._optimizer is not None:
        if self.cfg.precision == 'bf16':
            fp32_grad_accum = True
            contiguous_grad_bucket = True
            async_grad_allreduce = True

        elif self.cfg.precision == 16:
            fp32_grad_accum = False
            # TODO: contiguous grad bucket for fp16 is also planned to be supported
            contiguous_grad_bucket = False
            async_grad_allreduce = False

        self._optimizer = MainParamsOptimizerWrapper(
            self._optimizer,
            fp32_grad_accum=fp32_grad_accum,
            contiguous_grad_bucket=contiguous_grad_bucket,
            async_grad_allreduce=async_grad_allreduce,
        )
        assert self._trainer.max_steps is not None, "'max_steps' is missing in trainer config."
        if hasattr(self._cfg.optim, 'sched'):
            sched_config = self._cfg.optim.sched
            sched_config['max_steps'] = self._trainer.max_steps
            self._scheduler = prepare_lr_scheduler(
                optimizer=self._optimizer,
                scheduler_config=sched_config,
                train_dataloader=self._train_dl,
            )

    if self._scheduler is None:
        return self._optimizer
    else:
        return [self._optimizer], [self._scheduler]
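# Illustrative sketch only, NOT the real MainParamsOptimizerWrapper from NeMo/Apex: the idea
# behind an O2-style master-parameter wrapper is to keep an fp32 copy of each bf16/fp16 model
# parameter, optionally move gradients to fp32, step the wrapped optimizer on the fp32 copies,
# and write the results back into the model parameters. The class name below is hypothetical.
import torch


class SimpleMasterParamsWrapper:
    """Illustrative: keeps fp32 master copies and steps the wrapped optimizer on them."""

    def __init__(self, optimizer, fp32_grad_accum=True):
        self.optimizer = optimizer
        self.fp32_grad_accum = fp32_grad_accum
        self.model_params = []
        self.master_params = []
        for group in optimizer.param_groups:
            masters = [p.detach().clone().float() for p in group['params']]
            self.model_params.extend(group['params'])
            self.master_params.extend(masters)
            group['params'] = masters  # the wrapped optimizer now updates the fp32 copies

    def step(self):
        # copy (optionally fp32-cast) gradients onto the master copies, then step
        for model_p, master_p in zip(self.model_params, self.master_params):
            if model_p.grad is not None:
                master_p.grad = model_p.grad.float() if self.fp32_grad_accum else model_p.grad
        self.optimizer.step()
        # write the updated fp32 values back into the low-precision model parameters
        with torch.no_grad():
            for model_p, master_p in zip(self.model_params, self.master_params):
                model_p.copy_(master_p)

    def zero_grad(self):
        self.optimizer.zero_grad()
        for model_p in self.model_params:
            model_p.grad = None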
def configure_optimizers(self):
    self.setup_optimization()

    # Wrap the baseline optimizer with the optimizer class with master parameters
    if self.megatron_amp_o2 and self._optimizer is not None:
        if self.cfg.precision == 'bf16':
            fp32_grad_accum = True
            contiguous_grad_bucket = True

        elif self.cfg.precision == 16:
            fp32_grad_accum = False
            # TODO: contiguous grad bucket for fp16 is also planned to be supported
            contiguous_grad_bucket = False
            raise ValueError(
                "fp16 training is not yet supported with O2. Please set megatron_amp_O2 to False in the model config."
            )

        # if using tensor parallel only, we can use async grad all-reduce
        if self.cfg.get('pipeline_model_parallel_size', 1) == 1:
            async_grad_allreduce = True
        else:
            async_grad_allreduce = False

        self._optimizer = MainParamsOptimizerWrapper(
            self._optimizer,
            fp32_grad_accum=fp32_grad_accum,
            contiguous_grad_bucket=contiguous_grad_bucket,
            async_grad_allreduce=async_grad_allreduce,
            grad_allreduce_chunk_size_mb=self.cfg.get('grad_allreduce_chunk_size_mb', 125),
        )

        assert self._trainer.max_steps is not None, "'max_steps' is missing in trainer config."
        sched_config = self._cfg.optim.sched
        sched_config['max_steps'] = self._trainer.max_steps
        self._scheduler = prepare_lr_scheduler(
            optimizer=self._optimizer, scheduler_config=sched_config, train_dataloader=self._train_dl
        )

    if self._scheduler is None:
        return self._optimizer
    else:
        return [self._optimizer], [self._scheduler]
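# Example (hypothetical values) of the config fields this variant reads: with tensor
# parallelism only (pipeline_model_parallel_size == 1) gradients can be all-reduced
# asynchronously, in chunks of `grad_allreduce_chunk_size_mb` megabytes.
from omegaconf import OmegaConf

example_cfg = OmegaConf.create(
    {
        'precision': 'bf16',
        'megatron_amp_O2': True,
        'tensor_model_parallel_size': 2,
        'pipeline_model_parallel_size': 1,    # 1 -> async_grad_allreduce = True above
        'grad_allreduce_chunk_size_mb': 125,  # default used by the cfg.get(...) call above
    }
)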
def configure_optimizers(self):
    self.setup_optimization()

    # Wrap the baseline optimizer with the optimizer class with master parameters
    if self.megatron_amp_o2 and self._optimizer is not None:
        if self.cfg.precision == 'bf16':
            fp32_grad_accum = True
            contiguous_grad_bucket = True

        elif self.cfg.precision == 16:
            fp32_grad_accum = False
            # TODO: contiguous grad bucket for fp16 is also planned to be supported
            contiguous_grad_bucket = False
            raise ValueError(
                "fp16 training is not yet supported with O2. Please set megatron_amp_O2 to False in the model config."
            )

        # TODO: this should be true when not using pipeline parallelism
        # we will support that for bf16 when we have async handler from apex
        # and we will support it for fp16 when we have it implemented in the O2 recipe
        async_grad_allreduce = False

        self._optimizer = MainParamsOptimizerWrapper(
            self._optimizer,
            fp32_grad_accum=fp32_grad_accum,
            contiguous_grad_bucket=contiguous_grad_bucket,
            async_grad_allreduce=async_grad_allreduce,
        )
        assert self._trainer.max_steps is not None, "'max_steps' is missing in trainer config."
        sched_config = self._cfg.optim.sched
        sched_config['max_steps'] = self._trainer.max_steps
        self._scheduler = prepare_lr_scheduler(
            optimizer=self._optimizer, scheduler_config=sched_config, train_dataloader=self._train_dl
        )

    if self._scheduler is None:
        return self._optimizer
    else:
        return [self._optimizer], [self._scheduler]
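# Minimal sketch of the pieces all three variants above rely on: trainer.max_steps must be
# set, and the scheduler section lives under cfg.optim.sched (its 'max_steps' is injected at
# runtime from trainer.max_steps). Scheduler name and values here are illustrative, not
# prescribed by the code above.
from omegaconf import OmegaConf

example_optim_cfg = OmegaConf.create(
    {
        'name': 'fused_adam',
        'lr': 2e-4,
        'sched': {
            'name': 'CosineAnnealing',
            'warmup_steps': 500,
            'min_lr': 2e-5,
        },
    }
)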
def setup_optimization(self, optim_config: Optional[Union[DictConfig, Dict]] = None):
    """
    Prepares an optimizer from a string name and its optional config parameters.

    Args:
        optim_config: A dictionary containing the following keys:

            * "lr": mandatory key for learning rate. Will raise ValueError if not provided.
            * "optimizer": string name pointing to one of the available optimizers in the registry. \
            If not provided, defaults to "adam".
            * "opt_args": Optional list of strings, in the format "arg_name=arg_value". \
            The list of "arg_value" will be parsed and a dictionary of optimizer kwargs \
            will be built and supplied to instantiate the optimizer.
    """
    # If config was not explicitly passed to us
    if optim_config is None:
        # See if internal config has `optim` namespace
        if self._cfg is not None and hasattr(self._cfg, 'optim'):
            optim_config = self._cfg.optim

    # If config is still None, or internal config has no Optim, return without instantiation
    if optim_config is None:
        logging.info('No optimizer config provided, therefore no optimizer was created')
        return

    else:
        # Preserve the configuration
        if not isinstance(optim_config, DictConfig):
            optim_config = OmegaConf.create(optim_config)

        # See if internal config has `optim` namespace before preservation
        if self._cfg is not None and hasattr(self._cfg, 'optim'):
            self._cfg.optim = optim_config

    # Setup optimizer and scheduler
    if optim_config is not None and isinstance(optim_config, DictConfig):
        optim_config = OmegaConf.to_container(optim_config)

    if 'sched' in optim_config and self._trainer is not None:
        if not isinstance(self._trainer.accumulate_grad_batches, int):
            raise ValueError("We do not currently support gradient accumulation that is not an integer.")

        if self._trainer.max_steps is None:
            # Store information needed to calculate max_steps
            optim_config['sched']['t_max_epochs'] = self._trainer.max_epochs
            optim_config['sched']['t_accumulate_grad_batches'] = self._trainer.accumulate_grad_batches
            if self._trainer.distributed_backend is None:
                optim_config['sched']['t_num_workers'] = self._trainer.num_gpus or 1
            elif self._trainer.distributed_backend == "ddp_cpu":
                optim_config['sched']['t_num_workers'] = self._trainer.num_processes * self._trainer.num_nodes
            elif self._trainer.distributed_backend == "ddp":
                optim_config['sched']['t_num_workers'] = self._trainer.num_gpus * self._trainer.num_nodes
            else:
                logging.warning(
                    f"The lightning trainer received accelerator: {self._trainer.distributed_backend}. We "
                    "recommend to use 'ddp' instead."
                )
                optim_config['sched']['t_num_workers'] = self._trainer.num_gpus * self._trainer.num_nodes
        else:
            optim_config['sched']['max_steps'] = self._trainer.max_steps

    # Force into DictConfig from nested structure
    optim_config = OmegaConf.create(optim_config)
    # Get back nested dict so it is mutable
    optim_config = OmegaConf.to_container(optim_config, resolve=True)

    # Extract scheduler config if inside optimizer config
    if 'sched' in optim_config:
        scheduler_config = optim_config.pop('sched')
    else:
        scheduler_config = None

    # Check if caller provided optimizer name, default to Adam otherwise
    optimizer_cls = optim_config.get('cls', None)

    if optimizer_cls is None:
        # Try to get optimizer name for dynamic resolution, defaulting to Adam
        optimizer_name = optim_config.get('name', 'adam')
    else:
        if inspect.isclass(optimizer_cls):
            optimizer_name = optimizer_cls.__name__.lower()
        else:
            # resolve the class name (lowercase) from the class path if not provided
            optimizer_name = optimizer_cls.split(".")[-1].lower()

    # We are guaranteed to have lr since it is required by the argparser
    # But maybe user forgot to pass it to this function
    lr = optim_config.get('lr', None)
    if lr is None:
        raise ValueError('`lr` must be passed to `optimizer_config` when setting up the optimization!')

    # Check if caller has optimizer kwargs, default to empty dictionary
    if 'args' in optim_config:
        optimizer_args = optim_config.pop('args')
        optimizer_args = optim.parse_optimizer_args(optimizer_name, optimizer_args)
    else:
        optimizer_args = copy.deepcopy(optim_config)

        # Remove extra parameters from optimizer_args nest
        # Assume all other parameters are to be passed into optimizer constructor
        optimizer_args.pop('name', None)
        optimizer_args.pop('cls', None)
        optimizer_args.pop('lr', None)

    # Actually instantiate the optimizer
    if optimizer_cls is not None:
        if inspect.isclass(optimizer_cls):
            optimizer = optimizer_cls(self.parameters(), lr=lr, **optimizer_args)
            logging.info("Optimizer config = %s", str(optimizer))

            self._optimizer = optimizer

        else:
            # Attempt class path resolution
            try:
                optimizer_cls = OmegaConf.create({'cls': optimizer_cls})
                optimizer_config = {'lr': lr}
                optimizer_config.update(optimizer_args)

                optimizer_instance = hydra.utils.instantiate(
                    optimizer_cls, self.parameters(), **optimizer_config
                )  # type: DictConfig

                logging.info("Optimizer config = %s", str(optimizer_instance))

                self._optimizer = optimizer_instance

            except Exception as e:
                logging.error(
                    "Could not instantiate class path - {} with kwargs {}".format(optimizer_cls, str(optimizer_config))
                )
                raise e

    else:
        optimizer = optim.get_optimizer(optimizer_name)
        optimizer = optimizer(self.parameters(), lr=lr, **optimizer_args)

        logging.info("Optimizer config = %s", str(optimizer))

        self._optimizer = optimizer

    # Try to instantiate scheduler for optimizer
    self._scheduler = prepare_lr_scheduler(
        optimizer=self._optimizer, scheduler_config=scheduler_config, train_dataloader=self._train_dl
    )

    # Return the optimizer with/without scheduler
    # This return allows multiple optimizers or schedulers to be created
    return self._optimizer, self._scheduler