def scale_by_grad_accum_steps_wrapper_hook(
    hook_state, bucket: dist.GradBucket
) -> torch.futures.Future[torch.Tensor]:
    # Divide the bucket's gradients by the number of gradient accumulation
    # steps before handing the bucket to the wrapped hook. `hook` and `args`
    # are captured from the enclosing wrapper's scope.
    bucket.set_buffer(bucket.buffer().div_(args.gradient_accumulation_steps))
    fut = hook(hook_state, bucket)
    return fut
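
For context, a hook like this is usually produced by a factory that closes over the wrapped hook and the training arguments, which is where the free names `hook` and `args` come from. A minimal sketch of that pattern follows; the factory name and the `args` namespace with a `gradient_accumulation_steps` field are assumptions, not part of the snippet above.

import torch
import torch.distributed as dist

def scale_by_grad_accum_steps_wrapper(hook, args):
    # Hypothetical factory: returns a DDP comm hook that pre-scales each
    # gradient bucket by the accumulation step count before delegating to
    # the wrapped `hook`. Both `hook` and `args` are captured by the closure.
    def scale_by_grad_accum_steps_wrapper_hook(
        hook_state, bucket: dist.GradBucket
    ) -> torch.futures.Future[torch.Tensor]:
        bucket.set_buffer(bucket.buffer().div_(args.gradient_accumulation_steps))
        return hook(hook_state, bucket)

    return scale_by_grad_accum_steps_wrapper_hook
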
Example 2
def bf16_compress_wrapper_hook(
    hook_state, bucket: dist.GradBucket
) -> torch.futures.Future[torch.Tensor]:
    # Cast the bucket tensor to BF16 before handing it to the wrapped hook.
    # `hook` is captured from the enclosing wrapper's scope.
    bucket.set_buffer(bucket.buffer().to(torch.bfloat16))

    fut = hook(hook_state, bucket)

    def decompress(fut):
        decompressed_tensor = bucket.buffer()
        # Decompress in place to reduce the peak memory.
        # See: https://github.com/pytorch/pytorch/issues/45968
        decompressed_tensor.copy_(fut.value())
        return decompressed_tensor

    # Decompress after the wrapped hook has run.
    return fut.then(decompress)
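
A hedged usage sketch of how such wrappers are registered, assuming the process group is already initialized and `model` is an existing nn.Module (both assumptions). It composes PyTorch's built-in allreduce hook with the BF16 compression wrapper from torch.distributed.algorithms.ddp_comm_hooks, which is the wrapper the hook above belongs to.

import torch.distributed as dist
import torch.nn as nn
from torch.distributed.algorithms.ddp_comm_hooks import default_hooks

# `model` and the initialized process group are assumed to exist already.
ddp_model = nn.parallel.DistributedDataParallel(model)

# Compose the built-in allreduce hook with BF16 compression and register it;
# state=None lets allreduce_hook fall back to the default process group.
ddp_model.register_comm_hook(
    state=None,
    hook=default_hooks.bf16_compress_wrapper(default_hooks.allreduce_hook),
)

The decompress callback runs only after the all-reduce future completes, so the reduced BF16 result is copied back into the bucket buffer in place before DDP writes the gradients to the parameters, which keeps peak memory low.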