def _rescale_gradients(self) -> Optional[float]:
    """
    Rescale (clip) gradients in place when a gradient-norm limit is configured.

    Returns the total norm reported by ``sparse_clip_norm``, or ``None`` when
    gradient rescaling is disabled (``self._grad_norm`` is falsy).
    """
    if not self._grad_norm:
        return None
    # Only parameters that actually received a gradient this step can be clipped.
    clippable = [param for param in self._model.parameters() if param.grad is not None]
    return sparse_clip_norm(clippable, self._grad_norm)
def test_sparse_clip_grad(self):
    """
    Clipping a sparse gradient should rescale its values to the requested norm.
    """
    # create a sparse embedding layer, then take gradient
    embedding = torch.nn.Embedding(100, 16, sparse=True)
    embedding.zero_grad()
    # torch.autograd.Variable is deprecated — plain tensors carry autograd
    # information, so the wrapper is a no-op and should not be used.
    ids = (torch.rand(17) * 100).long()
    # Set some of the ids to the same value so that the sparse gradient
    # has repeated indices. This tests some additional logic.
    ids[:5] = 5
    loss = embedding(ids).sum()
    loss.backward()
    assert is_sparse(embedding.weight.grad)
    # Now try to clip the gradients.
    _ = sparse_clip_norm([embedding.weight], 1.5)
    # Final norm should be 1.5. Coalesce the grad tensor directly (.data is
    # deprecated) and call .item() so we compare a Python float rather than a
    # 0-dim tensor to the expected value.
    grad = embedding.weight.grad.coalesce()  # pylint: disable=no-member
    self.assertAlmostEqual(grad._values().norm(2.0).item(), 1.5, places=5)  # pylint: disable=protected-access
def test_sparse_clip_grad(self):
    """Clipping a sparse gradient rescales its values to the requested norm."""
    # A sparse embedding makes the backward pass produce a sparse gradient.
    embedding = torch.nn.Embedding(100, 16, sparse=True)
    embedding.zero_grad()
    token_ids = (torch.rand(17) * 100).long()
    # Duplicate some ids so the sparse gradient contains repeated indices,
    # exercising the extra coalescing logic in sparse_clip_norm.
    token_ids[:5] = 5
    embedding(token_ids).sum().backward()
    assert is_sparse(embedding.weight.grad)
    # Clip, then verify the resulting gradient norm is exactly the limit.
    _ = sparse_clip_norm([embedding.weight], 1.5)
    coalesced = embedding.weight.grad.coalesce()  # pylint: disable=no-member
    clipped_norm = coalesced._values().norm(2.0).item()  # pylint: disable=protected-access
    self.assertAlmostEqual(clipped_norm, 1.5, places=5)