Example #1
    def _step_optimizer(
        self,
        optimizer: torch.optim.Optimizer,  # type: ignore
        clip_grads: Optional[Callable[[Iterator], None]] = None,
    ) -> None:
        """
        Perform a single optimization step.

        This function must be called once for each optimizer. However, the order of
        different optimizers' steps can be specified by calling this function in different
        orders. Also, gradient accumulation across iterations is performed by the Determined
        training loop by setting the experiment configuration field
        ``optimizations.aggregation_frequency``.

        Arguments:
            optimizer(``torch.optim.Optimizer``): Which optimizer should be stepped.
            clip_grads(a function, optional): This function should have one argument for
                parameters in order to clip the gradients.
        """

        if self._should_communicate_and_update():
            # Communication needs to be synchronized so that it is completed
            # before we apply gradient clipping and `step()`.
            if self.hvd_config.use and not self._use_amp:
                optimizer.synchronize()

            parameters = ([
                p for group in optimizer.param_groups
                for p in group.get("params", [])
            ] if not self._use_amp else apex.amp.master_params(optimizer))

            if self.hvd_config.average_aggregated_gradients:
                self._average_gradients(
                    parameters=parameters,
                    divisor=self.hvd_config.aggregation_frequency)

            if clip_grads is not None:
                clip_grads(parameters)

            if self.hvd_config.use:
                with optimizer.skip_synchronize():
                    optimizer.step()
            else:
                optimizer.step()
            optimizer.zero_grad()
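A minimal usage sketch of the kind of ``clip_grads`` callable this method expects, paired with an illustrative call through the public ``step_optimizer`` wrapper shown in the later examples (the thresholds and the ``self.optimizer`` name are placeholders, not part of the API above):

import torch

def clip_by_norm(params):
    # Clip the global gradient norm of all parameters; 1.0 is an arbitrary example threshold.
    torch.nn.utils.clip_grad_norm_(params, 1.0)

def clip_by_value(params):
    # Alternative: clamp each gradient element to [-0.5, 0.5].
    torch.nn.utils.clip_grad_value_(params, 0.5)

# Inside a PyTorchTrial's train_batch(), via the public wrapper (names are illustrative):
# self.context.step_optimizer(self.optimizer, clip_grads=clip_by_norm)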
Example #2
    def step_optimizer(
        self,
        optimizer: torch.optim.Optimizer,
        clip_grads: Optional[Callable[[Iterator], None]] = None,
        auto_zero_grads: bool = True,
        scaler: Optional[Any] = None,
        # Should be torch.cuda.amp.GradScaler, but:
        #   * other implementations might be possible
        #   * requiring this type forces upgrades to PyTorch 1.6+
    ) -> None:
        """
        Perform a single optimization step.

        This function must be called once for each optimizer. However, the order of
        different optimizers' steps can be specified by calling this function in different
        orders. Also, gradient accumulation across iterations is performed by the Determined
        training loop by setting the experiment configuration field
        :ref:`optimizations.aggregation_frequency <config-aggregation-frequency>`.

        Here is a code example:

        .. code-block:: python

            def clip_grads(params):
                torch.nn.utils.clip_grad_norm_(params, 0.0001)

            self.context.step_optimizer(self.opt1, clip_grads)

        Arguments:
            optimizer(``torch.optim.Optimizer``): Which optimizer should be stepped.
            clip_grads(a function, optional): This function should have one argument for
                parameters in order to clip the gradients.
            auto_zero_grads(bool, optional): Automatically zero out gradients after
                stepping the optimizer. If false, you need to call ``optimizer.zero_grad()``
                manually. Note that if :ref:`optimizations.aggregation_frequency
                <config-aggregation-frequency>` is greater than 1, ``auto_zero_grads`` must be true.
            scaler(``torch.cuda.amp.GradScaler``, optional): The scaler to use for stepping the
                optimizer. This should be unset if not using AMP, and is necessary if
                ``wrap_scaler()`` was called directly.
        """

        check.true(
            auto_zero_grads or self._aggregation_frequency == 1,
            "if optimizations.aggregation_frequency is larger than 1, "
            "you can only set auto_zero_grads to be true. ",
        )

        if not self._should_communicate_and_update():
            return

        # Communication needs to be synchronized so that it is completed
        # before we apply gradient clipping and `step()`.
        # In the case of APEX this is called in backward() instead, so that it's inside the context
        # manager and before unscaling.
        # In the case of PyTorch DDP, losses are synchronized during the backwards() pass.
        if (self.distributed.size > 1
                and self._distributed_backend.use_horovod()
                and not self._use_apex):
            with self._record_timing("train_batch.sync_optimizers",
                                     accumulate=True):
                optimizer.synchronize()  # type: ignore

        parameters = ([
            p for group in optimizer.param_groups
            for p in group.get("params", [])
        ] if not self._use_apex else apex.amp.master_params(optimizer))

        if self._average_aggregated_gradients:
            self._average_gradients(parameters=parameters,
                                    divisor=self._aggregation_frequency)

        if clip_grads is not None:
            if self._scaler and self.experimental._auto_amp:
                self._scaler.unscale_(optimizer)
            clip_grads(parameters)

        # For stepping the optimizer we will operate on the scaler passed
        # in, or fall back to the wrapped scaler (if any).
        if scaler is None and self.experimental._auto_amp:
            scaler = self._scaler
        if scaler:

            def step_fn() -> None:
                scaler.step(optimizer)  # type: ignore

        else:
            step_fn = optimizer.step  # type: ignore

        # In the case of PyTorch DDP, losses are synchronized automatically on the backwards() pass
        if self.distributed.size > 1 and self._distributed_backend.use_horovod():
            with optimizer.skip_synchronize():  # type: ignore
                step_fn()
        else:
            step_fn()

        if auto_zero_grads:
            optimizer.zero_grad()
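A rough sketch of the AMP path, assuming ``torch.cuda.amp`` and a scaler wrapped directly via ``wrap_scaler()`` as the docstring mentions. The model, optimizer, and loss are tiny placeholders and the trial's data-loader methods are omitted; this is not the library's reference implementation, just an illustration of passing ``scaler=`` to ``step_optimizer()``:

import torch
from determined import pytorch as det_pytorch


class AmpTrialSketch(det_pytorch.PyTorchTrial):
    def __init__(self, context: det_pytorch.PyTorchTrialContext) -> None:
        self.context = context
        self.model = context.wrap_model(torch.nn.Linear(10, 2))
        self.optimizer = context.wrap_optimizer(
            torch.optim.SGD(self.model.parameters(), lr=0.1)
        )
        # The scaler is wrapped directly, so it must be passed to step_optimizer() below.
        self.scaler = context.wrap_scaler(torch.cuda.amp.GradScaler())

    def train_batch(self, batch, epoch_idx, batch_idx):
        data, labels = batch
        with torch.cuda.amp.autocast():
            loss = torch.nn.functional.cross_entropy(self.model(data), labels)
        # Scale the loss before backward; scaler.step() inside step_optimizer() handles unscaling.
        self.context.backward(self.scaler.scale(loss))
        self.context.step_optimizer(self.optimizer, scaler=self.scaler)
        self.scaler.update()
        return {"loss": loss}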
Example #3
    def step_optimizer(
        self,
        optimizer: torch.optim.Optimizer,
        clip_grads: Optional[Callable[[Iterator], None]] = None,
        auto_zero_grads: bool = True,
    ) -> None:
        """
        Perform a single optimization step.

        This function must be called once for each optimizer. However, the order of
        different optimizers' steps can be specified by calling this function in different
        orders. Also, gradient accumulation across iterations is performed by the Determined
        training loop by setting the experiment configuration field
        :ref:`optimizations.aggregation_frequency <config-aggregation-frequency>`.

        Here is a code example:

        .. code-block:: python

            def clip_grads(params):
                torch.nn.utils.clip_grad_norm_(params, 0.0001)

            self.context.step_optimizer(self.opt1, clip_grads)

        Arguments:
            optimizer(``torch.optim.Optimizer``): Which optimizer should be stepped.
            clip_grads(a function, optional): This function should have one argument for
                parameters in order to clip the gradients.
            auto_zero_grads(bool, optional): Automatically zero out gradients after
                stepping the optimizer. If false, you need to call ``optimizer.zero_grad()``
                manually. Note that if :ref:`optimizations.aggregation_frequency
                <config-aggregation-frequency>` is greater than 1, ``auto_zero_grads`` must be true.
        """

        check.true(
            auto_zero_grads or self.hvd_config.aggregation_frequency == 1,
            "if optimizations.aggregation_frequency is larger than 1, "
            "you can only set auto_zero_grads to be true. ",
        )
        if self._should_communicate_and_update():
            # Communication needs to be synchronized so that it is completed
            # before we apply gradient clipping and `step()`.
            if self.hvd_config.use and not self._use_amp:
                optimizer.synchronize()  # type: ignore

            parameters = (
                [p for group in optimizer.param_groups for p in group.get("params", [])]
                if not self._use_amp
                else apex.amp.master_params(optimizer)
            )

            if self.hvd_config.average_aggregated_gradients:
                self._average_gradients(
                    parameters=parameters, divisor=self.hvd_config.aggregation_frequency
                )

            if clip_grads is not None:
                clip_grads(parameters)

            if self.hvd_config.use:
                with optimizer.skip_synchronize():  # type: ignore
                    optimizer.step()
            else:
                optimizer.step()

            if auto_zero_grads:
                optimizer.zero_grad()
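A short sketch of the manual-zeroing path implied by ``auto_zero_grads=False``. This is a method-body excerpt from a hypothetical trial: ``self.model``, ``self.optimizer``, and ``self.context`` are assumed to have been wrapped in ``__init__``, and the pattern is only valid when ``optimizations.aggregation_frequency`` is 1, per the check above:

import torch

def train_batch(self, batch, epoch_idx, batch_idx):
    data, labels = batch
    loss = torch.nn.functional.cross_entropy(self.model(data), labels)
    self.context.backward(loss)
    self.context.step_optimizer(self.optimizer, auto_zero_grads=False)
    # Gradients are still populated here and can be inspected or logged before zeroing.
    self.optimizer.zero_grad()  # required, since auto_zero_grads=False skips it
    return {"loss": loss}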