Esempio n. 1
0
    def _all_reduce_and_rescale_grads(self,
                                      grad_denom=1,
                                      buffer_size=1048576000):
        """All-reduce the model's gradients in chunks and rescale them in place.

        Gradients of every parameter of ``self.model`` that requires grad and
        currently has a gradient are collected. Small gradients are packed
        into a flat staging buffer so that one ``nccl.all_reduce`` call covers
        many tensors; a gradient that alone exceeds ``buffer_size`` bytes is
        all-reduced directly. Results are written back into the gradient
        tensors.

        Args:
            grad_denom: Denominator applied to the reduced gradients. The
                division is performed only when ``grad_denom > 1``; the same
                rule is used for packed and for directly reduced gradients.
            buffer_size: Maximum number of bytes packed into one all-reduce
                call.

        Returns:
            None. Gradients are modified in place; nothing happens when no
            parameter has a gradient.

        Note:
            Chunk accounting uses each gradient's own ``element_size()``,
            while the staging buffer is created from the first gradient
            (``grads[0].new``). The packing therefore assumes that all
            gradients share one dtype (fp32 according to the original
            docstring) -- TODO confirm against the caller.
        """
        grads = [
            p.grad.data for p in self.model.parameters()
            if p.requires_grad and p.grad is not None
        ]
        if not grads:
            return

        # One rule for both code paths, so packed and oversized gradients are
        # always scaled identically.
        rescale = grad_denom > 1

        def nbytes(g):
            return g.numel() * g.element_size()

        # Never allocate more staging elements than can actually be packed:
        # the byte budget bounds a chunk from above, and so does the total
        # size of all gradients small enough to be packed at all.
        max_elems = math.ceil(buffer_size / grads[0].element_size())
        packable_elems = sum(
            g.numel() for g in grads if nbytes(g) <= buffer_size)
        buffer_t = grads[0].new(min(max_elems, packable_elems)).zero_()

        def all_reduce_chunk(chunk):
            # Pack the chunk's gradients back to back into the staging buffer.
            offset = 0
            for g in chunk:
                numel = g.numel()
                buffer_t[offset:offset + numel].copy_(g.view(-1))
                offset += numel

            # Reduce and rescale only the filled prefix. The results are read
            # back from the same storage, so this relies on nccl.all_reduce
            # operating in place on the tensor it is given.
            used = buffer_t[:offset]
            nccl.all_reduce(used)
            if rescale:
                used.div_(grad_denom)

            # Scatter the reduced values back into the gradient tensors.
            offset = 0
            for g in chunk:
                numel = g.numel()
                g.view(-1).copy_(buffer_t[offset:offset + numel])
                offset += numel

        pending = []
        filled = 0
        for g in grads:
            sz = nbytes(g)
            if sz > buffer_size:
                # Gradient alone exceeds the budget: reduce it directly.
                nccl.all_reduce(g)
                if rescale:
                    g.div_(grad_denom)
            elif filled + sz > buffer_size:
                # Chunk is full: flush it and start a new one with this grad.
                all_reduce_chunk(pending)
                pending = [g]
                filled = sz
            else:
                pending.append(g)
                filled += sz
        if pending:
            all_reduce_chunk(pending)