def wrap_optimizer(self, optimizer: Any) -> Any:
    """
    Wrap a freshly created optimizer for use with Determined.

    Call this immediately after constructing the optimizer (e.g. inside
    ``build_estimator()``) and pass the returned object on to the
    Estimator: ``optimizer = wrap_optimizer(optimizer)``.
    """
    # Outside of an actual training run there is nothing to wrap.
    if not self.env.training:
        return optimizer

    self.optimizer_initialized = True
    if not self.hvd_config.use:
        return optimizer

    check.check_false(
        isinstance(optimizer, str),
        "Please specify an optimizer object instead of using a string name.",
    )

    hvd.require_horovod_type("tensorflow", "EstimatorContext.wrap_optimizer was called.")

    # Optionally compress gradient tensors to fp16 for the allreduce.
    if self.hvd_config.fp16_compression:
        compression = hvd.compression.Compression.fp16
    else:
        compression = hvd.compression.Compression.none

    optimizer = hvd.DistributedOptimizer(
        optimizer,
        compression=compression,
        aggregation_frequency=self.hvd_config.aggregation_frequency,
        average_aggregated_gradients=self.hvd_config.average_aggregated_gradients,
    )
    logging.debug("Initialized optimizer for distributed and optimized parallel training.")
    return optimizer
def _get_multi_gpu_model_and_optimizer(
    pre_compiled_model: tf.keras.Model,
    optimizer: tf.keras.optimizers.Optimizer,
    env: det.EnvContext,
    hvd_config: horovod.HorovodContext,
    profile_frequency: Optional[int],
    profile_filename: str,
) -> Tuple[tf.keras.Model, tf.keras.optimizers.Optimizer]:
    """Adapt a compiled model/optimizer pair to the available parallelism.

    With Horovod enabled, the optimizer is wrapped in a
    ``hvd.DistributedOptimizer``; without Horovod but with multiple local
    GPUs, the model is replicated via ``tf.keras.utils.multi_gpu_model``.
    Otherwise both objects are returned unchanged.
    """
    gpu_count = len(env.container_gpus)
    model_out, optimizer_out = pre_compiled_model, optimizer

    if hvd_config.use:
        # Horovod doesn't know how to handle string-based optimizers.
        if isinstance(optimizer, str):
            raise det.errors.InvalidExperimentException("string optimizers are not supported")
        optimizer_out = hvd.DistributedOptimizer(
            optimizer,
            **get_horovod_config(
                exp_config=env.experiment_config,
                hvd_config=hvd_config,
                profile_frequency=profile_frequency,
                profile_filename=profile_filename,
            ),
        )
    elif gpu_count > 1:
        model_out = tf.keras.utils.multi_gpu_model(pre_compiled_model, gpu_count)

    return model_out, optimizer_out
def _init_model_and_optimizer(self) -> None: self.context.model = self.trial.build_model() # TODO: Check that optimizer is not an amp optimizer. self.context.optimizer = self.trial.optimizer(self.context.model) self._init_device() self.context.model = self.context.model.to(self.device) if self.hvd_config.use: use_compression = self.hvd_config.fp16_compression self.context.optimizer = hvd.DistributedOptimizer( self.context.optimizer, named_parameters=self.context.model.named_parameters(), backward_passes_per_step=self.hvd_config.aggregation_frequency, compression=hvd.Compression.fp16 if use_compression else hvd.Compression.none, ) logging.debug("Initialized optimizer for distributed and optimized parallel training.") elif self.n_gpus > 1: check.eq( self.hvd_config.aggregation_frequency, 1, "Please enable `optimized_parallel` to use aggregation " "frequency greater than 1 for single machine multi-GPU " "training.", ) self.context.model = nn.DataParallel(self.context.model) logging.debug("Initialized mode for native parallel training.")
def _Optimizer(
    self, optimizer: torch.optim.Optimizer
) -> torch.optim.Optimizer:  # type: ignore
    """Register ``optimizer`` and return it, Horovod-wrapped when distributed.

    The optimizer must operate on models wrapped by :meth:`Model`. Under
    parallel/distributed training a ``horovod.DistributedOptimizer`` is
    returned in place of the raw optimizer.
    """
    # Apex AMP captures the optimizers at configuration time, so wrapping
    # must happen first.
    check.false(self._use_amp, "Must call Optimizer() before _configure_apex_amp.")

    if self.hvd_config.use:
        compression = (
            hvd.Compression.fp16
            if self.hvd_config.fp16_compression
            else hvd.Compression.none
        )
        optimizer = hvd.DistributedOptimizer(
            optimizer,
            named_parameters=self._filter_named_parameters(optimizer),
            backward_passes_per_step=self.hvd_config.aggregation_frequency,
            compression=compression,
        )
        logging.debug("Initialized optimizer for distributed and optimized parallel training.")

    self.optimizers.append(optimizer)
    return optimizer
def wrap_optimizer(
    self,
    optimizer: torch.optim.Optimizer,
    backward_passes_per_step: int = 1,
) -> torch.optim.Optimizer:
    """Register ``optimizer`` with Determined and return the (possibly wrapped) result.

    The optimizer must operate on models previously wrapped with
    :meth:`wrap_model`. Under parallel/distributed training the optimizer is
    wrapped in a ``horovod.DistributedOptimizer``.

    ``backward_passes_per_step`` tells Horovod how many gradient aggregation
    steps feed a single optimizer step within one ``train_batch`` call. It is
    usually the default of 1, but can be raised to support loops such as:

    .. code-block:: python

        def train_batch(
            self, batch: TorchData, epoch_idx: int, batch_idx: int
        ) -> Dict[str, torch.Tensor]:
            data, labels = batch
            output = self.model(data)
            loss1 = output['loss1']
            loss2 = output['loss2']
            self.context.backward(loss1)
            self.context.backward(loss2)
            self.context.step_optimizer(self.optimizer, backward_passes_per_step=2)
            return {"loss1": loss1, "loss2": loss2}
    """
    if self.env.managed_training:
        # Apex AMP must be configured after every optimizer is wrapped.
        check.false(
            self._use_apex, "Must call wrap_optimizer() before configure_apex_amp."
        )
        check.gt_eq(
            backward_passes_per_step,
            1,
            "backward_passes_per_step for local gradient aggregation must be >= 1",
        )
        if self.distributed.size > 1 and self._distributed_backend.use_horovod():
            compression = (
                hvd.Compression.fp16 if self._fp16_compression else hvd.Compression.none
            )
            optimizer = hvd.DistributedOptimizer(
                optimizer,
                named_parameters=self._filter_named_parameters(optimizer),
                backward_passes_per_step=backward_passes_per_step
                * self._aggregation_frequency,
                compression=compression,
            )
            logging.debug(
                "Initialized optimizer for distributed and optimized parallel training."
            )

    self.optimizers.append(optimizer)
    return optimizer
def _get_horovod_optimizer_if_using_horovod(
    self, optimizer: tf.keras.optimizers.Optimizer
) -> tf.keras.optimizers.Optimizer:
    """Return ``optimizer`` wrapped in a ``hvd.DistributedOptimizer`` when
    Horovod is in use; otherwise return it unchanged."""
    if self.hvd_config.use:
        # Horovod cannot wrap an optimizer that is given only by name.
        if isinstance(optimizer, str):
            raise det.errors.InvalidExperimentException("string optimizers are not supported")
        optimizer = hvd.DistributedOptimizer(
            optimizer,
            aggregation_frequency=self.hvd_config.aggregation_frequency,
            average_aggregated_gradients=self.hvd_config.average_aggregated_gradients,
        )
    return optimizer
def wrap_optimizer(self, optimizer: Any) -> Any:
    """
    Wrap a freshly created optimizer for use with Determined.

    Call this immediately after constructing the optimizer (e.g. inside
    ``build_estimator()``) and pass the returned object on to the
    Estimator: ``optimizer = wrap_optimizer(optimizer)``.
    """
    # Outside managed training there is nothing to wrap.
    if not self.env.managed_training:
        return optimizer

    self.optimizer_initialized = True
    if not self.hvd_config.use:
        return optimizer

    check.check_false(
        isinstance(optimizer, str),
        "Please specify an optimizer object instead of using a string name.",
    )

    hvd.require_horovod_type("tensorflow", "EstimatorContext.wrap_optimizer was called.")

    # The signature of our horovod optimizer changed after we rebased onto 0.21.
    params = inspect.signature(hvd.DistributedOptimizer).parameters
    agg_key = (
        "aggregation_frequency"
        if "aggregation_frequency" in params
        else "backward_passes_per_step"
    )
    compression = (
        hvd.compression.Compression.fp16
        if self.hvd_config.fp16_compression
        else hvd.compression.Compression.none
    )
    optimizer = hvd.DistributedOptimizer(
        optimizer,
        compression=compression,
        average_aggregated_gradients=self.hvd_config.average_aggregated_gradients,
        **{agg_key: self.hvd_config.aggregation_frequency},
    )
    logging.debug("Initialized optimizer for distributed and optimized parallel training.")
    return optimizer
def _init_model(self) -> None: self.optimizer = self.trial.optimizer(self.model) # TODO: Check that optimizer is not an amp optimizer. self._init_device() self.model = self.model.to(self.device) if self.hvd_config.use: use_compression = self.hvd_config.fp16_compression self.optimizer = hvd.DistributedOptimizer( self.optimizer, named_parameters=self.model.named_parameters(), backward_passes_per_step=self.hvd_config.aggregation_frequency, compression=hvd.Compression.fp16 if use_compression else hvd.Compression.none, ) logging.debug( "Initialized optimizer for distributed and optimized parallel training." ) elif self.n_gpus > 1: check.eq( self.hvd_config.aggregation_frequency, 1, "Please enable `optimized_parallel` to use aggregation " "frequency greater than 1 for single machine multi-GPU " "training.", ) self.model = nn.DataParallel(self.model) logging.debug("Initialized mode for native parallel training.") self.lr_helper = _LRHelper( self.trial.create_lr_scheduler(self.optimizer)) # If a load path is provided load weights and restore the data location. self._load() self._configure_amp() if self.hvd_config.use: hvd.broadcast_parameters(self.model.state_dict(), root_rank=0) hvd.broadcast_optimizer_state(self.optimizer, root_rank=0) # Initialize training and validation iterators. self.training_iterator = iter(self.training_loader)
def _get_horovod_optimizer_if_using_horovod(
    self, optimizer: tf.keras.optimizers.Optimizer
) -> tf.keras.optimizers.Optimizer:
    """Return ``optimizer`` wrapped in a ``hvd.DistributedOptimizer`` when
    Horovod is in use; otherwise return it unchanged."""
    if not self.hvd_config.use:
        return optimizer

    # Horovod doesn't know how to handle string-based optimizers.
    if isinstance(optimizer, str):
        raise det.errors.InvalidExperimentException("string optimizers are not supported")

    # The signature of our horovod optimizer changed after we rebased onto 0.21.
    params = inspect.signature(hvd.DistributedOptimizer).parameters
    agg_key = (
        "aggregation_frequency"
        if "aggregation_frequency" in params
        else "backward_passes_per_step"
    )
    kwargs = {
        "average_aggregated_gradients": self.hvd_config.average_aggregated_gradients,
        agg_key: self.hvd_config.aggregation_frequency,
    }  # type: Dict[str, Any]
    return hvd.DistributedOptimizer(optimizer, **kwargs)