Example #1
    def on_batch_end(self, state):
        loss = state.get_key(key="loss", inner_key=self.loss_key)
        if isinstance(loss, list):
            loss = torch.mean(torch.stack(loss))

        if self.prefix is not None:
            state.metrics.add_batch_value(metrics_dict={
                self.prefix: loss.item(),
            })

        if not state.need_backward:
            return

        self._accumulation_counter += 1
        if not self.fp16:
            model = state.model
            optimizer = state.get_key(key="optimizer",
                                      inner_key=self.optimizer_key)
            loss.backward()

            if (self._accumulation_counter + 1) % self.accumulation_steps == 0:
                self.grad_step(optimizer=optimizer,
                               optimizer_wd=self._optimizer_wd,
                               grad_clip_fn=self.grad_clip_fn)
                model.zero_grad()
                self._accumulation_counter = 0
        else:
            model = state.model
            model.zero_grad()
            optimizer = state.get_key(key="optimizer",
                                      inner_key=self.optimizer_key)
            loss = state.get_key(key="loss", inner_key=self.optimizer_key)
            scaled_loss = self.fp16_grad_scale * loss.float()
            scaled_loss.backward()

            master_params = list(optimizer.param_groups[0]["params"])
            model_params = list(
                filter(lambda p: p.requires_grad, model.parameters()))
            copy_grads(source=model_params, target=master_params)
            for param in master_params:
                param.grad.data.mul_(1. / self.fp16_grad_scale)
            self.grad_step(optimizer=optimizer,
                           optimizer_wd=self._optimizer_wd,
                           grad_clip_fn=self.grad_clip_fn)
            copy_params(source=master_params, target=model_params)
            torch.cuda.synchronize()
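
The non-fp16 branch above is plain gradient accumulation: loss.backward() runs on every batch, while grad_step and zero_grad run only once every accumulation_steps batches. Below is a minimal standalone sketch of that pattern outside the callback machinery; the model, optimizer, and toy loader are illustrative, not taken from the source.

import torch
import torch.nn as nn

model = nn.Linear(10, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)
criterion = nn.MSELoss()
accumulation_steps = 4

# Toy "loader": a few random batches.
loader = [(torch.randn(8, 10), torch.randn(8, 1)) for _ in range(8)]

for step, (features, targets) in enumerate(loader):
    loss = criterion(model(features), targets)
    loss.backward()                      # gradients accumulate across batches

    if (step + 1) % accumulation_steps == 0:
        optimizer.step()                 # apply the accumulated gradient
        model.zero_grad()                # start a fresh accumulation window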
Example #2
    def on_batch_end(self, state):
        if not state.is_train:
            return

        self.accumulation_counter += 1
        if not self.fp16:
            model = state.model
            optimizer = state.get_key(key="optimizer",
                                      inner_key=self.optimizer_key)
            loss = state.get_key(key="loss", inner_key=self.loss_key)
            loss.backward()

            if (self.accumulation_counter + 1) % self.accumulation_steps == 0:
                self.grad_step(optimizer=optimizer,
                               optimizer_wd=self.optimizer_wd,
                               grad_clip_fn=self.grad_clip_fn)
                model.zero_grad()
                self.accumulation_counter = 0
        else:
            model = state.model
            model.zero_grad()
            optimizer = state.get_key(key="optimizer",
                                      inner_key=self.optimizer_key)
            loss = state.get_key(key="loss", inner_key=self.optimizer_key)
            scaled_loss = self.fp16_grad_scale * loss.float()
            scaled_loss.backward()

            master_params = list(optimizer.param_groups[0]["params"])
            model_params = list(
                filter(lambda p: p.requires_grad, model.parameters()))
            copy_grads(source=model_params, target=master_params)
            for param in master_params:
                param.grad.data.mul_(1. / self.fp16_grad_scale)
            self.grad_step(optimizer=optimizer,
                           optimizer_wd=self.optimizer_wd,
                           grad_clip_fn=self.grad_clip_fn)
            copy_params(source=master_params, target=model_params)
            torch.cuda.synchronize()
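
The fp16 branch in both examples follows the manual "master weights" recipe that predates torch.cuda.amp: the optimizer holds fp32 copies of the parameters, the loss is scaled before backward so fp16 gradients do not underflow, the gradients are copied onto the fp32 copies and unscaled, the optimizer steps on the fp32 copies, and the updated weights are copied back into the fp16 model (copy_grads and copy_params are helper functions defined elsewhere in the source). Below is a minimal standalone sketch of the same idea with illustrative names; it assumes a CUDA device and is not the callback's actual implementation.

import torch
import torch.nn as nn
import torch.nn.functional as F

device = "cuda"
model = nn.Linear(10, 1).to(device).half()                # fp16 model
# fp32 "master" copies of the parameters, owned by the optimizer
master_params = [p.detach().clone().float().requires_grad_(True)
                 for p in model.parameters()]
optimizer = torch.optim.SGD(master_params, lr=1e-2)
fp16_grad_scale = 1024.0

features = torch.randn(8, 10, device=device).half()
targets = torch.randn(8, 1, device=device).half()

model.zero_grad()
loss = F.mse_loss(model(features), targets)
scaled_loss = fp16_grad_scale * loss.float()              # scale to avoid fp16 underflow
scaled_loss.backward()

# Analogue of copy_grads: move fp16 gradients onto the fp32 masters and unscale.
for master, param in zip(master_params, model.parameters()):
    master.grad = param.grad.detach().float() / fp16_grad_scale

optimizer.step()                                          # update the fp32 masters

# Analogue of copy_params: write the updated fp32 weights back into the fp16 model.
with torch.no_grad():
    for master, param in zip(master_params, model.parameters()):
        param.copy_(master)
torch.cuda.synchronize()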