def allreduce_grad(self, model):
        """Sum gradients across all workers in three stages: NCCL reduce
        inside each node, MPI allreduce between nodes, NCCL bcast back."""
        self._init_comms()
        stream = chainer.cuda.Stream.null

        grads = _memory_utility.extract_params_set_grad(model)
        float_size = 4  # gradients travel as 4-byte elements
        total_elems = sum(p.grad.size for p in grads)
        # Round up so the buffer splits evenly over the inter-node ranks.
        per_node_elems = int(math.ceil(total_elems / self.inter_size))
        per_node_bytes = per_node_elems * float_size
        buffer_bytes = per_node_bytes * self.inter_size

        self.gpu_buffer_a.assign(buffer_bytes)
        self.gpu_buffer_b.assign(buffer_bytes)
        _memory_utility.pack_params(grads, float_size, 'grad',
                                    self.gpu_buffer_a)

        # Intra-node reduce onto the node-local root (rank 0).
        self.intra_nccl_comm.reduce(self.gpu_buffer_a.ptr(),
                                    self.gpu_buffer_b.ptr(), total_elems,
                                    nccl.NCCL_FLOAT, nccl.NCCL_SUM, 0,
                                    stream.ptr)

        # Node-local roots all-reduce between nodes over MPI.
        if self.intra_rank == 0:
            _communication_utility.inter_allreduce_gpu(
                self.inter_mpi_comm, self.size, self.gpu_buffer_a,
                self.gpu_buffer_b, buffer_bytes, per_node_elems,
                per_node_bytes, stream)

        # Fan the result back out within each node.
        self.intra_nccl_comm.bcast(self.gpu_buffer_b.ptr(), total_elems,
                                   nccl.NCCL_FLOAT, 0, stream.ptr)

        _memory_utility.unpack_params(grads, float_size, 'grad',
                                      self.gpu_buffer_b)
    def _allreduce_grad_async(self, model, zero_fill, stream):
        """Asynchronously average gradients across workers on *stream*."""
        self._init_comms()
        params = _memory_utility.extract_params_set_grad(model, zero_fill)

        # Explicit ``is not None`` check: numpy dtype objects evaluate as
        # falsy on numpy <= 1.12.1, so plain truthiness would misfire.
        dtype = (self.allreduce_grad_dtype
                 if self.allreduce_grad_dtype is not None
                 else chainer.get_dtype())
        assert dtype is not None

        n_elems = _memory_utility.count_grad_elements(params, zero_fill)
        needs_sync = self._prepare_allreduce_pack_buffer(dtype, n_elems)
        if needs_sync and stream != chainer.cuda.Stream.null:
            # Buffer (re)allocation ran on the null stream; wait for it
            # before touching the buffers from a non-default stream.
            chainer.cuda.Stream.null.synchronize()

        # params -> buffer A
        self._pack_params_to_buffer(params, dtype, zero_fill, stream)

        # buffer A -(allreduce)-> buffer B -(divide by size)-> buffer A
        self.multi_node_mean_nccl(self.gpu_buffer_a, self.gpu_buffer_b,
                                  n_elems, dtype, stream)

        # buffer A -> params
        self._unpack_params_from_buffer(params, dtype, zero_fill, stream)
# ---- Beispiel (example) #3 ----
    def allreduce_grad(self, model):
        """All-reduce gradients with NCCL in their native dtype and divide
        by the communicator size to obtain the mean.

        In debug mode the buffers are validated before the collective and
        the result is checked to be finite afterwards.
        """
        self._init_comms()
        stream = chainer.cuda.Stream.null
        params = _memory_utility.extract_params_set_grad(model)
        if not params:
            # No parameter has a gradient: nothing to communicate, and
            # indexing params[0] below would raise IndexError.
            return

        dtype = params[0].grad.dtype
        itemsize = dtype.itemsize
        n_elems_total = sum(param.grad.size for param in params)
        n_bytes_total = n_elems_total * itemsize
        self.gpu_buffer_a.assign(n_bytes_total)
        self.gpu_buffer_b.assign(n_bytes_total)

        _memory_utility.pack_params(params, 'grad', self.gpu_buffer_a, dtype)

        if chainer.is_debug():
            stream.synchronize()
            array_a = self.gpu_buffer_a.array(n_elems_total)
            array_b = self.gpu_buffer_b.array(n_elems_total)
            self.check_ready_to_allreduce(array_a, array_b)

        # Same as PureNcclCommunicator's multi_node_mean but leave as it is
        self.intra_nccl_comm.allReduce(
            self.gpu_buffer_a.ptr(), self.gpu_buffer_b.ptr(), n_elems_total,
            _communication_utility._get_nccl_type_id(dtype), nccl.NCCL_SUM,
            stream.ptr)

        # Turn the element-wise sum into a mean.
        arr = self.gpu_buffer_b.array(n_elems_total, dtype=dtype)
        arr *= (1.0 / self.size)

        if chainer.is_debug():
            stream.synchronize()
            self.ensure_all_finite(arr)

        _memory_utility.unpack_params(params, 'grad', self.gpu_buffer_b, dtype)
    def _allreduce_grad_async(self, model, stream):
        """All-reduce gradients of *model* on *stream* (mean over workers).

        Gradients are packed into ``gpu_buffer_a``, summed with NCCL into
        ``gpu_buffer_b``, divided by the communicator size back into
        ``gpu_buffer_a``, and finally unpacked into the parameters.
        """
        self._init_comms()
        params = _memory_utility.extract_params_set_grad(model)
        grad_dtype = _get_param_grad_dtype(params[0])
        # If no explicit all-reduce dtype was configured, communicate in
        # the gradients' native dtype.
        if self.allreduce_grad_dtype is None:
            allreduce_grad_dtype = grad_dtype
        else:
            allreduce_grad_dtype = self.allreduce_grad_dtype
        n_elems = sum(param.grad.size for param in params)
        needs_sync = self._assign_for_allreduce_grad(grad_dtype,
                                                     allreduce_grad_dtype,
                                                     n_elems)
        # If buffers were (re)allocated on the null stream, wait for that
        # before touching them from a non-default stream.
        if stream != chainer.cuda.Stream.null and needs_sync:
            chainer.cuda.Stream.null.synchronize()

        self._pack_params_to_buffer(params, grad_dtype, allreduce_grad_dtype,
                                    n_elems, stream)
        self.nccl_comm.allReduce(self.gpu_buffer_a.ptr(),
                                 self.gpu_buffer_b.ptr(), n_elems,
                                 _get_nccl_type_id(allreduce_grad_dtype),
                                 nccl.NCCL_SUM, stream.ptr)
        # Divide by the communicator size to turn the sum into a mean.
        # NOTE(review): the kernel is compiled once and cached on ``self``;
        # if ``allreduce_grad_dtype`` ever differed between calls, the
        # cached kernel would keep the first dtype -- confirm this
        # cannot happen.
        if self.div_by_size is None:
            self.div_by_size = chainer.cuda.cupy.ElementwiseKernel(
                '{} x'.format(allreduce_grad_dtype.name),
                '{} y'.format(allreduce_grad_dtype.name),
                'y = x*(1.0/{})'.format(self.size), 'div_by_size')
        self.div_by_size(self.gpu_buffer_b.array(n_elems,
                                                 dtype=allreduce_grad_dtype),
                         self.gpu_buffer_a.array(n_elems,
                                                 dtype=allreduce_grad_dtype),
                         stream=stream)
        self._unpack_params_from_buffer(params, grad_dtype,
                                        allreduce_grad_dtype, n_elems, stream)
    def allreduce_grad(self, model):
        """NCCL all-reduce of gradients within the node, then divide by
        the communicator size to obtain the mean (native dtype)."""
        self._init_comms()
        null_stream = chainer.cuda.Stream.null

        params = _memory_utility.extract_params_set_grad(model)
        dtype = params[0].grad.dtype
        total = sum(p.grad.size for p in params)
        nbytes = total * dtype.itemsize
        self.gpu_buffer_a.assign(nbytes)
        self.gpu_buffer_b.assign(nbytes)

        _memory_utility.pack_params(params,
                                    dtype.itemsize,
                                    'grad',
                                    self.gpu_buffer_a,
                                    transfer_dtype=dtype)

        self.intra_nccl_comm.allReduce(
            self.gpu_buffer_a.ptr(), self.gpu_buffer_b.ptr(), total,
            _communication_utility._get_nccl_type_id(dtype), nccl.NCCL_SUM,
            null_stream.ptr)

        # sum -> mean
        summed = self.gpu_buffer_b.array(total, dtype=dtype)
        summed *= (1.0 / self.size)

        _memory_utility.unpack_params(params,
                                      dtype.itemsize,
                                      'grad',
                                      self.gpu_buffer_b,
                                      transfer_dtype=dtype)
# ---- Beispiel (example) #6 ----
 def multi_node_mean_grad(self, model, zero_fill=False):
     """Average every parameter gradient of *model* across workers,
     optionally materializing missing gradients as zeros first."""
     for p in _memory_utility.extract_params_set_grad(model, zero_fill):
         if zero_fill and p.grad is None:
             if p.data is None:
                 # Uninitialized parameter: nothing to average.
                 continue
             p.grad = p.xp.zeros_like(p.data)
         self._multi_node_mean(None, p.grad)
    def allreduce_grad(self, model):
        """Average gradients across workers: NCCL reduce inside each node,
        host-staged MPI Alltoall/Allgather between nodes, then NCCL bcast
        back (all in float32).

        The inter-node stage copies the summed buffer to pinned CPU
        buffers, exchanges slices with Alltoall, reduces and divides by
        ``self.size`` on the GPU, and gathers the result back.
        """
        self._init_comms()
        stream = chainer.cuda.Stream.null

        params = _memory_utility.extract_params_set_grad(model)
        itemsize = 4
        n_elems_total = sum(param.grad.size for param in params)
        # Pad so the buffer splits evenly over the inter-node ranks.
        n_elems_per_node = int(math.ceil(n_elems_total / self.inter_size))
        n_elems_buffer = n_elems_per_node * self.inter_size
        n_bytes_per_node = n_elems_per_node * itemsize
        n_bytes_buffer = n_bytes_per_node * self.inter_size

        self.gpu_buffer_a.assign(n_bytes_buffer)
        self.gpu_buffer_b.assign(n_bytes_buffer)

        allreduce_grad_dtype = np.float32

        _memory_utility.pack_params(
            params, 'grad', self.gpu_buffer_a, allreduce_grad_dtype)

        # Intra-node reduce
        self.intra_nccl_comm.reduce(
            self.gpu_buffer_a.ptr(), self.gpu_buffer_b.ptr(), n_elems_total,
            nccl.NCCL_FLOAT, nccl.NCCL_SUM, 0, stream.ptr)

        # Inter-node allreduce (node-local roots only)
        if self.intra_rank == 0:
            self.cpu_buffer_a.assign(n_bytes_buffer)
            self.cpu_buffer_b.assign(n_bytes_buffer)

            # Stage the node-local sum to host memory for MPI.
            arr_b = self.gpu_buffer_b.array(n_elems_buffer)
            arr_b.data.copy_to_host(self.cpu_buffer_b.ptr(), n_bytes_buffer)

            self.inter_mpi_comm.Alltoall(
                [self.cpu_buffer_b.buffer(n_bytes_buffer), mpi4py.MPI.FLOAT],
                [self.cpu_buffer_a.buffer(n_bytes_buffer), mpi4py.MPI.FLOAT])

            # Reduction in GPU
            arr_a = self.gpu_buffer_a.array(n_elems_buffer)
            arr_a.data.copy_from_host(self.cpu_buffer_a.ptr(), n_bytes_buffer)
            arr_a = arr_a.reshape(self.inter_size, n_elems_per_node)
            arr_a = arr_a.sum(axis=0)
            # Divide by the total worker count: the result is the mean.
            arr_a *= 1.0 / self.size
            arr_a.data.copy_to_host(self.cpu_buffer_a.ptr(), n_bytes_per_node)

            self.inter_mpi_comm.Allgather(
                [self.cpu_buffer_a.buffer(n_bytes_per_node), mpi4py.MPI.FLOAT],
                [self.cpu_buffer_b.buffer(n_bytes_buffer), mpi4py.MPI.FLOAT])

            arr_b.data.copy_from_host(self.cpu_buffer_b.ptr(), n_bytes_buffer)

        # Intra-node bcast
        self.intra_nccl_comm.bcast(
            self.gpu_buffer_b.ptr(), n_elems_total, nccl.NCCL_FLOAT, 0,
            stream.ptr)

        _memory_utility.unpack_params(
            params, 'grad', self.gpu_buffer_b, allreduce_grad_dtype)
# ---- Beispiel (example) #8 ----
 def allreduce_grad(self, model):
     """Sum each gradient across workers via MPI and divide by the
     communicator size.

     float16 gradients are promoted to float32 for the reduction; the
     division is performed while still in float32 and only the final
     mean is cast back to float16.
     """
     for param in _memory_utility.extract_params_set_grad(model):
         grad = param.grad
         is_float16 = grad.dtype == np.float16
         if is_float16:
             grad = grad.astype(np.float32)
         buf = _memory_utility.array_to_buffer_object(grad)
         self.mpi_comm.Allreduce(mpi4py.MPI.IN_PLACE, buf)
         # Divide before any cast back to float16: the undivided sum can
         # exceed the float16 range (~65504) and would become inf if cast
         # first, and dividing in float16 also loses precision.
         grad /= self.size
         if is_float16:
             param.grad = grad.astype(np.float16)
         else:
             param.grad = grad
# ---- Beispiel (example) #9 ----
    def multi_node_mean_grad(self, model, zero_fill=False):
        """Pack gradients into the GPU buffer and unpack them back.

        NOTE(review): no reduction or communication happens between the
        pack and unpack calls here -- presumably this is a single-process
        (dummy) communicator; confirm against the enclosing class.
        """
        params = _memory_utility.extract_params_set_grad(model, zero_fill)
        elem_bytes = 4
        total_elems = _memory_utility.count_grad_elements(params, zero_fill)
        self.gpu_buffer_a.assign(total_elems * elem_bytes)

        _memory_utility.pack_params(params, 'grad', self.gpu_buffer_a,
                                    zero_fill)
        _memory_utility.unpack_params(params, 'grad', self.gpu_buffer_a,
                                      zero_fill)
    def allreduce_grad(self, model):
        """Pack gradients into the GPU buffer and unpack them back.

        NOTE(review): no collective occurs between pack and unpack --
        presumably a single-process (dummy) communicator; confirm with
        callers.
        """
        params = _memory_utility.extract_params_set_grad(model)
        elem_bytes = 4
        total_bytes = sum(p.grad.size for p in params) * elem_bytes
        self.gpu_buffer_a.assign(total_bytes)

        _memory_utility.pack_params(params, elem_bytes, 'grad',
                                    self.gpu_buffer_a)
        _memory_utility.unpack_params(params, elem_bytes, 'grad',
                                      self.gpu_buffer_a)
    def multi_node_mean_grad(self, model, zero_fill=False):
        """Average gradients across workers with a 2-D scheme: intra-node
        reduce-scatter, inter-node allreduce among matching slices, then
        intra-node allgather (all in float32).
        """
        self._init_comms()
        stream = chainer.cuda.Stream.null

        params = _memory_utility.extract_params_set_grad(model, zero_fill)
        itemsize = 4
        n_elems_total = _memory_utility.count_grad_elements(params, zero_fill)
        # Pad so the element count splits evenly across all ranks.
        n_elems_per_node_2d = int(math.ceil(n_elems_total / self.size))
        n_elems_per_node_1d = n_elems_per_node_2d * self.inter_size
        n_bytes_per_node_1d = n_elems_per_node_1d * itemsize
        n_bytes_per_node_2d = n_elems_per_node_2d * itemsize
        n_bytes_buffer = n_bytes_per_node_2d * self.size

        self.gpu_buffer_a.assign(n_bytes_buffer)
        self.gpu_buffer_b.assign(n_bytes_buffer)

        allreduce_grad_dtype = np.float32

        _memory_utility.pack_params(params, 'grad', self.gpu_buffer_a,
                                    allreduce_grad_dtype, zero_fill)

        # Debug-only sanity check of both buffers before communicating.
        if chainer.is_debug():
            stream.synchronize()
            array_a = self.gpu_buffer_a.array(n_elems_total)
            array_b = self.gpu_buffer_b.array(n_elems_total)
            self._check_ready_to_allreduce(array_a, array_b)

        # Intra-node reduce-scatter (1st dimension)
        self.intra_nccl_comm.reduceScatter(self.gpu_buffer_a.ptr(),
                                           self.gpu_buffer_b.ptr(),
                                           n_elems_per_node_1d,
                                           nccl.NCCL_FLOAT, nccl.NCCL_SUM,
                                           stream.ptr)

        # Inter-node allreduce (2nd dimension)
        _communication_utility.inter_allreduce_gpu(
            self.inter_mpi_comm, self.size, self.gpu_buffer_a,
            self.gpu_buffer_b, n_bytes_per_node_1d, n_elems_per_node_2d,
            n_bytes_per_node_2d, stream)

        # Intra-node allgather (1st dimension)
        self.intra_nccl_comm.allGather(self.gpu_buffer_b.ptr(),
                                       self.gpu_buffer_a.ptr(),
                                       n_elems_per_node_1d, nccl.NCCL_FLOAT,
                                       stream.ptr)

        # Debug-only check that the reduced values are finite.
        if chainer.is_debug():
            stream.synchronize()
            self._ensure_all_finite(self.gpu_buffer_a.array(n_elems_total))

        _memory_utility.unpack_params(params, 'grad', self.gpu_buffer_a,
                                      allreduce_grad_dtype, zero_fill)
# ---- Beispiel (example) #12 ----
    def allreduce_grad(self, model, zero_fill=False):
        """Reduce gradients across workers: NCCL reduce to each node-local
        root, inter-node allreduce among roots, then NCCL broadcast back
        (all in float32).

        NOTE(review): no explicit division by the communicator size is
        visible here; presumably ``inter_allreduce_gpu`` produces the
        mean -- confirm against its implementation.
        """
        self._init_comms()
        stream = chainer.cuda.Stream.null

        params = _memory_utility.extract_params_set_grad(model, zero_fill)
        itemsize = 4
        n_elems_total = _memory_utility.count_grad_elements(params, zero_fill)
        # Pad so the buffer splits evenly over the inter-node ranks.
        n_elems_per_node = int(math.ceil(n_elems_total / self.inter_size))
        n_bytes_per_node = n_elems_per_node * itemsize
        n_bytes_buffer = n_bytes_per_node * self.inter_size

        self.gpu_buffer_a.assign(n_bytes_buffer)
        self.gpu_buffer_b.assign(n_bytes_buffer)

        allreduce_grad_dtype = np.float32

        _memory_utility.pack_params(params, 'grad', self.gpu_buffer_a,
                                    allreduce_grad_dtype, zero_fill, stream)

        # Debug-only sanity check before communicating.
        if chainer.is_debug():
            stream.synchronize()
            array_a = self.gpu_buffer_a.array(n_elems_total)
            array_b = self.gpu_buffer_b.array(n_elems_total)
            self.check_ready_to_allreduce(array_a, array_b)

        # Intra-node reduce
        self.intra_nccl_comm.reduce(self.gpu_buffer_a.ptr(),
                                    self.gpu_buffer_b.ptr(), n_elems_total,
                                    nccl.NCCL_FLOAT, nccl.NCCL_SUM, 0,
                                    stream.ptr)

        # Inter-node allreduce
        if self.intra_rank == 0:
            _communication_utility.inter_allreduce_gpu(
                self.inter_mpi_comm, self.size, self.gpu_buffer_a,
                self.gpu_buffer_b, n_bytes_buffer, n_elems_per_node,
                n_bytes_per_node, stream)

        # Intra-node bcast
        self.intra_nccl_comm.bcast(self.gpu_buffer_b.ptr(), n_elems_total,
                                   nccl.NCCL_FLOAT, 0, stream.ptr)

        # Debug-only check that the result is finite.
        if chainer.is_debug():
            stream.synchronize()
            self.ensure_all_finite(self.gpu_buffer_b.array(n_elems_total))

        _memory_utility.unpack_params(params, 'grad', self.gpu_buffer_b,
                                      allreduce_grad_dtype, zero_fill, stream)
# ---- Beispiel (example) #13 ----
    def allreduce_grad(self, model):
        """Average gradients across all workers via ``multi_node_mean``,
        communicating in float32."""
        params = _memory_utility.extract_params_set_grad(model)
        comm_dtype = np.float32
        n_elems = sum(p.grad.size for p in params)
        n_bytes = n_elems * 4  # float32 elements
        self.gpu_buffer_a.assign(n_bytes)
        self.gpu_buffer_b.assign(n_bytes)

        _memory_utility.pack_params(params, 'grad', self.gpu_buffer_a,
                                    comm_dtype)

        # buffer A -(mean over workers)-> buffer B
        self.multi_node_mean(self.gpu_buffer_a.array(n_elems),
                             self.gpu_buffer_b.array(n_elems))

        _memory_utility.unpack_params(params, 'grad', self.gpu_buffer_b,
                                      comm_dtype)
# ---- Beispiel (example) #14 ----
    def multi_node_mean_grad(self, model, zero_fill=False):
        """Pack gradients to the GPU buffer and unpack them back.

        NOTE(review): no reduction is visible between the pack and unpack
        helpers -- presumably a single-process (dummy) communicator;
        confirm with the enclosing class.
        """
        params = _memory_utility.extract_params_set_grad(model, zero_fill)
        n_elems = _memory_utility.count_grad_elements(params, zero_fill)
        self.gpu_buffer_a.assign(n_elems * 4)  # float32 elements

        self._pack_params_to_buffer(params,
                                    'grad',
                                    buffer=self.gpu_buffer_a,
                                    allreduce_grad_dtype=np.float32,
                                    zero_fill=zero_fill)
        self._unpack_params_from_buffer(params,
                                        'grad',
                                        buffer=self.gpu_buffer_a,
                                        allreduce_grad_dtype=np.float32,
                                        zero_fill=zero_fill)
# ---- Beispiel (example) #15 ----
    def allreduce_grad(self, model, zero_fill=False):
        """Average gradients across workers via ``multi_node_mean``,
        optionally zero-filling missing gradients (float32 transport)."""
        params = _memory_utility.extract_params_set_grad(model, zero_fill)
        comm_dtype = np.float32
        n_elems = _memory_utility.count_grad_elements(params, zero_fill)
        n_bytes = n_elems * 4  # float32 elements
        self.gpu_buffer_a.assign(n_bytes)
        self.gpu_buffer_b.assign(n_bytes)

        _memory_utility.pack_params(params, 'grad', self.gpu_buffer_a,
                                    comm_dtype, zero_fill)

        # buffer A -(mean over workers)-> buffer B
        self.multi_node_mean(self.gpu_buffer_a.array(n_elems),
                             self.gpu_buffer_b.array(n_elems))

        _memory_utility.unpack_params(params, 'grad', self.gpu_buffer_b,
                                      comm_dtype, zero_fill)
# ---- Beispiel (example) #16 ----
    def _allreduce_grad_async(self, model, stream):
        """All-reduce gradients on *stream* and divide by the communicator
        size (pack -> NCCL allreduce -> divide -> unpack)."""
        self._init_comms()
        params = _memory_utility.extract_params_set_grad(model)

        # NOTE: we need to explicitly check `is None` , because
        # numpy's dtype object is evaluated to False in numpy <= 1.12.1
        if self.allreduce_grad_dtype is not None:
            allreduce_grad_dtype = self.allreduce_grad_dtype
        else:
            allreduce_grad_dtype = chainer.get_dtype()

        assert allreduce_grad_dtype is not None

        n_elems = sum(param.grad.size for param in params)
        needs_sync = self._prepare_allreduce_pack_buffer(allreduce_grad_dtype,
                                                         n_elems)
        # Buffer (re)allocation ran on the null stream; wait for it before
        # touching the buffers from a non-default stream.
        if stream != chainer.cuda.Stream.null and needs_sync:
            chainer.cuda.Stream.null.synchronize()

        # pack grads from params -> buffer A
        self._pack_params_to_buffer(params, allreduce_grad_dtype, stream)
        # Allreduce from buffer A -> buffer B
        self.nccl_comm.allReduce(self.gpu_buffer_a.ptr(),
                                 self.gpu_buffer_b.ptr(), n_elems,
                                 _communication_utility._get_nccl_type_id(
                                     allreduce_grad_dtype),
                                 nccl.NCCL_SUM,
                                 stream.ptr)
        # div by comm_size from buffer B -> buffer A
        # NOTE(review): the kernel is compiled once and cached on ``self``;
        # if ``allreduce_grad_dtype`` ever differed between calls, the
        # cached kernel would keep the first dtype -- confirm this
        # cannot happen.
        if self.div_by_size is None:
            self.div_by_size = chainer.cuda.cupy.ElementwiseKernel(
                '{} x'.format(allreduce_grad_dtype.name),
                '{} y'.format(allreduce_grad_dtype.name),
                'y = x*(1.0/{})'.format(self.size), 'div_by_size')
        self.div_by_size(
            self.gpu_buffer_b.array(n_elems,
                                    dtype=allreduce_grad_dtype),
            self.gpu_buffer_a.array(n_elems,
                                    dtype=allreduce_grad_dtype),
            stream=stream)

        # unpack params from buffer A -> params
        self._unpack_params_from_buffer(params, allreduce_grad_dtype, stream)
    def allreduce_grad(self, model):
        """Sum gradients over MPI and divide by the communicator size.

        NOTE(review): the element size is hard-coded to 4 and the MPI
        datatype is FLOAT -- this assumes float32 gradients; confirm with
        callers.
        """
        params = _memory_utility.extract_params_set_grad(model)
        elem_bytes = 4
        n_elems = sum(p.grad.size for p in params)
        n_bytes = n_elems * elem_bytes
        self.gpu_buffer_a.assign(n_bytes)
        self.gpu_buffer_b.assign(n_bytes)

        _memory_utility.pack_params(
            params, elem_bytes, 'grad', self.gpu_buffer_a)

        self.mpi_comm.Allreduce(
            [self.gpu_buffer_a.buffer(n_bytes), mpi4py.MPI.FLOAT],
            [self.gpu_buffer_b.buffer(n_bytes), mpi4py.MPI.FLOAT])
        # sum -> mean
        summed = self.gpu_buffer_b.array(n_elems)
        summed *= (1.0 / self.size)

        _memory_utility.unpack_params(
            params, elem_bytes, 'grad', self.gpu_buffer_b)
    def allreduce_grad(self, model):
        """Reduce gradients across workers with a 2-D scheme: intra-node
        reduce-scatter, inter-node allreduce among matching slices, then
        intra-node allgather (all in float32).
        """
        self._init_comms()
        stream = chainer.cuda.Stream.null

        params = _memory_utility.extract_params_set_grad(model)
        itemsize = 4
        n_elems_total = sum(param.grad.size for param in params)
        # Pad so the element count splits evenly across all ranks.
        n_elems_per_node_2d = int(math.ceil(n_elems_total / self.size))
        n_elems_per_node_1d = n_elems_per_node_2d * self.inter_size
        n_bytes_per_node_1d = n_elems_per_node_1d * itemsize
        n_bytes_per_node_2d = n_elems_per_node_2d * itemsize
        n_bytes_buffer = n_bytes_per_node_2d * self.size

        self.gpu_buffer_a.assign(n_bytes_buffer)
        self.gpu_buffer_b.assign(n_bytes_buffer)

        allreduce_grad_dtype = np.float32

        _memory_utility.pack_params(params, 'grad', self.gpu_buffer_a,
                                    allreduce_grad_dtype)

        # Intra-node reduce-scatter (1st dimension)
        self.intra_nccl_comm.reduceScatter(self.gpu_buffer_a.ptr(),
                                           self.gpu_buffer_b.ptr(),
                                           n_elems_per_node_1d,
                                           nccl.NCCL_FLOAT, nccl.NCCL_SUM,
                                           stream.ptr)

        # Inter-node allreduce (2nd dimension)
        _communication_utility.inter_allreduce_gpu(
            self.inter_mpi_comm, self.size, self.gpu_buffer_a,
            self.gpu_buffer_b, n_bytes_per_node_1d, n_elems_per_node_2d,
            n_bytes_per_node_2d, stream)

        # Intra-node allgather (1st dimension)
        self.intra_nccl_comm.allGather(self.gpu_buffer_b.ptr(),
                                       self.gpu_buffer_a.ptr(),
                                       n_elems_per_node_1d, nccl.NCCL_FLOAT,
                                       stream.ptr)

        _memory_utility.unpack_params(params, 'grad', self.gpu_buffer_a,
                                      allreduce_grad_dtype)
    def allreduce_grad(self, model):
        """NCCL all-reduce of gradients within the node, then divide by
        the communicator size to obtain the mean (float32 transport)."""
        self._init_comms()
        null_stream = chainer.cuda.Stream.null

        params = _memory_utility.extract_params_set_grad(model)
        elem_bytes = 4  # float32 elements
        n_elems = sum(p.grad.size for p in params)
        n_bytes = n_elems * elem_bytes
        self.gpu_buffer_a.assign(n_bytes)
        self.gpu_buffer_b.assign(n_bytes)

        _memory_utility.pack_params(
            params, elem_bytes, 'grad', self.gpu_buffer_a)

        self.intra_nccl_comm.allReduce(
            self.gpu_buffer_a.ptr(), self.gpu_buffer_b.ptr(), n_elems,
            nccl.NCCL_FLOAT, nccl.NCCL_SUM, null_stream.ptr)

        # sum -> mean
        summed = self.gpu_buffer_b.array(n_elems)
        summed *= (1.0 / self.size)

        _memory_utility.unpack_params(
            params, elem_bytes, 'grad', self.gpu_buffer_b)
    def multi_node_mean_grad(self, model, zero_fill=False):
        """Average gradients across workers: NCCL reduce inside each node,
        host-staged MPI Alltoall/Allgather between nodes (dividing by
        ``self.size`` on the way), then NCCL bcast back (float32).
        """
        self._init_comms()
        stream = chainer.cuda.Stream.null

        params = _memory_utility.extract_params_set_grad(model, zero_fill)
        itemsize = 4
        n_elems_total = _memory_utility.count_grad_elements(params, zero_fill)
        # Pad so the buffer splits evenly over the inter-node ranks.
        n_elems_per_node = int(math.ceil(n_elems_total / self.inter_size))
        n_elems_buffer = n_elems_per_node * self.inter_size
        n_bytes_per_node = n_elems_per_node * itemsize
        n_bytes_buffer = n_bytes_per_node * self.inter_size

        self.gpu_buffer_a.assign(n_bytes_buffer)
        self.gpu_buffer_b.assign(n_bytes_buffer)

        allreduce_grad_dtype = np.float32

        self._pack_params_to_buffer(params,
                                    'grad',
                                    buffer=self.gpu_buffer_a,
                                    allreduce_grad_dtype=allreduce_grad_dtype,
                                    zero_fill=zero_fill)

        # Debug-only sanity check before communicating.
        if chainer.is_debug():
            stream.synchronize()
            array_a = self.gpu_buffer_a.array(n_elems_total)
            array_b = self.gpu_buffer_b.array(n_elems_total)
            self._check_ready_to_allreduce(array_a, array_b)

        # Intra-node reduce
        self.intra_nccl_comm.reduce(self.gpu_buffer_a.ptr(),
                                    self.gpu_buffer_b.ptr(), n_elems_total,
                                    nccl.NCCL_FLOAT, nccl.NCCL_SUM, 0,
                                    stream.ptr)

        # Inter-node allreduce (node-local roots only)
        if self.intra_rank == 0:
            self.cpu_buffer_a.assign(n_bytes_buffer)
            self.cpu_buffer_b.assign(n_bytes_buffer)

            # Stage the node-local sum to host memory for MPI.
            arr_b = self.gpu_buffer_b.array(n_elems_buffer)
            arr_b.data.copy_to_host(self.cpu_buffer_b.ptr(), n_bytes_buffer)

            self.inter_mpi_comm.Alltoall(
                [self.cpu_buffer_b.buffer(n_bytes_buffer), mpi4py.MPI.FLOAT],
                [self.cpu_buffer_a.buffer(n_bytes_buffer), mpi4py.MPI.FLOAT])

            # Reduction in GPU
            arr_a = self.gpu_buffer_a.array(n_elems_buffer)
            arr_a.data.copy_from_host(self.cpu_buffer_a.ptr(), n_bytes_buffer)
            arr_a = arr_a.reshape(self.inter_size, n_elems_per_node)
            arr_a = arr_a.sum(axis=0)
            # Divide by the total worker count: the result is the mean.
            arr_a *= 1.0 / self.size
            arr_a.data.copy_to_host(self.cpu_buffer_a.ptr(), n_bytes_per_node)

            self.inter_mpi_comm.Allgather(
                [self.cpu_buffer_a.buffer(n_bytes_per_node), mpi4py.MPI.FLOAT],
                [self.cpu_buffer_b.buffer(n_bytes_buffer), mpi4py.MPI.FLOAT])

            arr_b.data.copy_from_host(self.cpu_buffer_b.ptr(), n_bytes_buffer)

        # Intra-node bcast
        self.intra_nccl_comm.bcast(self.gpu_buffer_b.ptr(), n_elems_total,
                                   nccl.NCCL_FLOAT, 0, stream.ptr)

        # Debug-only check that the result is finite.
        if chainer.is_debug():
            stream.synchronize()
            self._ensure_all_finite(self.gpu_buffer_b.array(n_elems_total))

        self._unpack_params_from_buffer(params, 'grad', self.gpu_buffer_b,
                                        allreduce_grad_dtype, zero_fill)
 def allreduce_grad(self, model):
     """Sum each gradient across workers with an in-place MPI allreduce
     and divide by the communicator size."""
     for p in _memory_utility.extract_params_set_grad(model):
         mpi_buf = _memory_utility.array_to_buffer_object(p.grad)
         self.mpi_comm.Allreduce(mpi4py.MPI.IN_PLACE, mpi_buf)
         p.grad /= self.size