def allreduce_grad(self, model):
    """Allreduce (mean is taken elsewhere) of ``model``'s gradients.

    Three-phase hierarchical exchange:
      1. intra-node NCCL reduce to local rank 0,
      2. inter-node MPI allreduce among the rank-0 processes,
      3. intra-node NCCL bcast of the result.

    All work is issued on the null (default) CUDA stream.
    """
    self._init_comms()
    stream = chainer.cuda.Stream.null
    params = _memory_utility.extract_params_set_grad(model)
    itemsize = 4  # gradients are staged as 4-byte elements (fp32)
    n_elems_total = sum(param.grad.size for param in params)
    # Pad the buffer so it divides evenly across the inter-node group.
    n_elems_per_node = int(math.ceil(n_elems_total / self.inter_size))
    n_bytes_per_node = n_elems_per_node * itemsize
    n_bytes_buffer = n_bytes_per_node * self.inter_size
    self.gpu_buffer_a.assign(n_bytes_buffer)
    self.gpu_buffer_b.assign(n_bytes_buffer)
    _memory_utility.pack_params(params, itemsize, 'grad',
                                self.gpu_buffer_a)
    # Intra-node reduce
    self.intra_nccl_comm.reduce(self.gpu_buffer_a.ptr(),
                                self.gpu_buffer_b.ptr(), n_elems_total,
                                nccl.NCCL_FLOAT, nccl.NCCL_SUM, 0,
                                stream.ptr)
    # Inter-node allreduce
    # Only the local root (intra_rank 0) holds the node-wise sum, so only
    # it participates in the cross-node exchange.
    if self.intra_rank == 0:
        _communication_utility.inter_allreduce_gpu(
            self.inter_mpi_comm, self.size, self.gpu_buffer_a,
            self.gpu_buffer_b, n_bytes_buffer, n_elems_per_node,
            n_bytes_per_node, stream)
    # Intra-node bcast
    self.intra_nccl_comm.bcast(self.gpu_buffer_b.ptr(), n_elems_total,
                               nccl.NCCL_FLOAT, 0, stream.ptr)
    _memory_utility.unpack_params(params, itemsize, 'grad',
                                  self.gpu_buffer_b)
def _allreduce_grad_async(self, model, zero_fill, stream):
    """Asynchronously average ``model``'s gradients on ``stream``.

    Packs all gradients into GPU buffer A (optionally zero-filling
    uninitialized ones), runs the NCCL allreduce-and-divide helper
    (A -> B -> A), and unpacks the averaged values back into the params.
    """
    self._init_comms()
    params = _memory_utility.extract_params_set_grad(model, zero_fill)
    # NOTE: we need to explicitly check `is None`, because
    # numpy's dtype object is evaluated to False in numpy <= 1.12.1
    if self.allreduce_grad_dtype is not None:
        allreduce_grad_dtype = self.allreduce_grad_dtype
    else:
        allreduce_grad_dtype = chainer.get_dtype()
    assert allreduce_grad_dtype is not None
    n_elems = _memory_utility.count_grad_elements(params, zero_fill)
    needs_sync = self._prepare_allreduce_pack_buffer(
        allreduce_grad_dtype, n_elems)
    # If the buffers were (re)allocated, make sure pending default-stream
    # work is finished before using them on a non-default stream.
    if stream != chainer.cuda.Stream.null and needs_sync:
        chainer.cuda.Stream.null.synchronize()
    # pack grads from params -> buffer A
    self._pack_params_to_buffer(params, allreduce_grad_dtype, zero_fill,
                                stream)
    # Allreduce from buffer A -> buffer B
    # div by comm_size from buffer B -> buffer A
    self.multi_node_mean_nccl(self.gpu_buffer_a, self.gpu_buffer_b,
                              n_elems, allreduce_grad_dtype, stream)
    # unpack params from buffer A -> params
    self._unpack_params_from_buffer(params, allreduce_grad_dtype,
                                    zero_fill, stream)
def allreduce_grad(self, model):
    """Average ``model``'s gradients via one intra-communicator NCCL
    allreduce, using the gradients' own dtype for the exchange.

    In Chainer debug mode, buffers are validated before the collective
    and checked for non-finite values afterwards.
    """
    self._init_comms()
    stream = chainer.cuda.Stream.null
    params = _memory_utility.extract_params_set_grad(model)
    # assumes all params share the first param's grad dtype — TODO confirm
    dtype = params[0].grad.dtype
    itemsize = dtype.itemsize
    n_elems_total = sum(param.grad.size for param in params)
    n_bytes_total = n_elems_total * itemsize
    self.gpu_buffer_a.assign(n_bytes_total)
    self.gpu_buffer_b.assign(n_bytes_total)
    _memory_utility.pack_params(params, 'grad', self.gpu_buffer_a, dtype)
    if chainer.is_debug():
        stream.synchronize()
        array_a = self.gpu_buffer_a.array(n_elems_total)
        array_b = self.gpu_buffer_b.array(n_elems_total)
        self.check_ready_to_allreduce(array_a, array_b)
    # Same as PureNcclCommunicator's multi_node_mean but leave as it is
    self.intra_nccl_comm.allReduce(
        self.gpu_buffer_a.ptr(), self.gpu_buffer_b.ptr(), n_elems_total,
        _communication_utility._get_nccl_type_id(dtype), nccl.NCCL_SUM,
        stream.ptr)
    # Turn the sum into a mean in place on the GPU.
    arr = self.gpu_buffer_b.array(n_elems_total, dtype=dtype)
    arr *= (1.0 / self.size)
    if chainer.is_debug():
        stream.synchronize()
        self.ensure_all_finite(arr)
    _memory_utility.unpack_params(params, 'grad', self.gpu_buffer_b,
                                  dtype)
def _allreduce_grad_async(self, model, stream):
    """Asynchronously average ``model``'s gradients on ``stream``.

    Gradients (dtype ``grad_dtype``) may be exchanged in a different
    dtype (``allreduce_grad_dtype``) when the communicator was configured
    with one; casting happens inside the pack/unpack helpers.
    """
    self._init_comms()
    params = _memory_utility.extract_params_set_grad(model)
    grad_dtype = _get_param_grad_dtype(params[0])
    if self.allreduce_grad_dtype is None:
        allreduce_grad_dtype = grad_dtype
    else:
        allreduce_grad_dtype = self.allreduce_grad_dtype
    n_elems = sum(param.grad.size for param in params)
    needs_sync = self._assign_for_allreduce_grad(grad_dtype,
                                                 allreduce_grad_dtype,
                                                 n_elems)
    # If buffers were (re)assigned, wait for pending default-stream work
    # before touching them from a non-default stream.
    if stream != chainer.cuda.Stream.null and needs_sync:
        chainer.cuda.Stream.null.synchronize()
    self._pack_params_to_buffer(params, grad_dtype, allreduce_grad_dtype,
                                n_elems, stream)
    self.nccl_comm.allReduce(self.gpu_buffer_a.ptr(),
                             self.gpu_buffer_b.ptr(), n_elems,
                             _get_nccl_type_id(allreduce_grad_dtype),
                             nccl.NCCL_SUM, stream.ptr)
    # Lazily compile the element-wise sum->mean kernel once.
    # NOTE(review): the cached kernel bakes in allreduce_grad_dtype and
    # self.size; presumably both are fixed per communicator — confirm.
    if self.div_by_size is None:
        self.div_by_size = chainer.cuda.cupy.ElementwiseKernel(
            '{} x'.format(allreduce_grad_dtype.name),
            '{} y'.format(allreduce_grad_dtype.name),
            'y = x*(1.0/{})'.format(self.size),
            'div_by_size')
    # Divide summed buffer B into buffer A.
    self.div_by_size(
        self.gpu_buffer_b.array(n_elems, dtype=allreduce_grad_dtype),
        self.gpu_buffer_a.array(n_elems, dtype=allreduce_grad_dtype),
        stream=stream)
    self._unpack_params_from_buffer(params, grad_dtype,
                                    allreduce_grad_dtype, n_elems,
                                    stream)
def allreduce_grad(self, model):
    """Average ``model``'s gradients across the communicator with one
    intra-node NCCL allreduce, carried out in the gradients' own dtype.
    """
    self._init_comms()
    null_stream = chainer.cuda.Stream.null
    params = _memory_utility.extract_params_set_grad(model)
    grad_dtype = params[0].grad.dtype
    elem_size = grad_dtype.itemsize
    total_elems = sum(p.grad.size for p in params)
    total_bytes = elem_size * total_elems
    # Make sure both scratch buffers are large enough for this model.
    for scratch in (self.gpu_buffer_a, self.gpu_buffer_b):
        scratch.assign(total_bytes)
    _memory_utility.pack_params(params, elem_size, 'grad',
                                self.gpu_buffer_a,
                                transfer_dtype=grad_dtype)
    self.intra_nccl_comm.allReduce(
        self.gpu_buffer_a.ptr(), self.gpu_buffer_b.ptr(), total_elems,
        _communication_utility._get_nccl_type_id(grad_dtype),
        nccl.NCCL_SUM, null_stream.ptr)
    # Sum -> mean, in place on the GPU buffer.
    summed = self.gpu_buffer_b.array(total_elems, dtype=grad_dtype)
    summed *= 1.0 / self.size
    _memory_utility.unpack_params(params, elem_size, 'grad',
                                  self.gpu_buffer_b,
                                  transfer_dtype=grad_dtype)
def multi_node_mean_grad(self, model, zero_fill=False):
    """Average every parameter's gradient across workers, one array at a
    time.

    With ``zero_fill``, a parameter whose gradient is unset gets an
    all-zero gradient (matching its data) so every worker contributes to
    the collective; parameters with neither data nor grad are skipped.
    """
    params = _memory_utility.extract_params_set_grad(model, zero_fill)
    for param in params:
        grad = param.grad
        if grad is None and zero_fill:
            if param.data is None:
                continue  # nothing to synthesize a zero grad from
            grad = param.xp.zeros_like(param.data)
            param.grad = grad
        self._multi_node_mean(None, grad)
def allreduce_grad(self, model):
    """Average ``model``'s gradients hierarchically, staging the
    inter-node exchange through host memory.

    Phases: intra-node NCCL reduce to local rank 0; on rank 0 an MPI
    Alltoall / local GPU reduction / MPI Allgather over pinned CPU
    buffers; finally an intra-node NCCL bcast of the result.
    """
    self._init_comms()
    stream = chainer.cuda.Stream.null
    params = _memory_utility.extract_params_set_grad(model)
    itemsize = 4  # elements are exchanged as 4-byte floats
    n_elems_total = sum(param.grad.size for param in params)
    # Pad so each node owns an equal slice of the buffer.
    n_elems_per_node = int(math.ceil(n_elems_total / self.inter_size))
    n_elems_buffer = n_elems_per_node * self.inter_size
    n_bytes_per_node = n_elems_per_node * itemsize
    n_bytes_buffer = n_bytes_per_node * self.inter_size
    self.gpu_buffer_a.assign(n_bytes_buffer)
    self.gpu_buffer_b.assign(n_bytes_buffer)
    allreduce_grad_dtype = np.float32
    _memory_utility.pack_params(
        params, 'grad', self.gpu_buffer_a, allreduce_grad_dtype)
    # Intra-node reduce
    self.intra_nccl_comm.reduce(
        self.gpu_buffer_a.ptr(), self.gpu_buffer_b.ptr(), n_elems_total,
        nccl.NCCL_FLOAT, nccl.NCCL_SUM, 0, stream.ptr)
    # Inter-node allreduce
    if self.intra_rank == 0:
        self.cpu_buffer_a.assign(n_bytes_buffer)
        self.cpu_buffer_b.assign(n_bytes_buffer)
        arr_b = self.gpu_buffer_b.array(n_elems_buffer)
        # GPU -> host, then scatter each node's slice to its owner.
        arr_b.data.copy_to_host(self.cpu_buffer_b.ptr(), n_bytes_buffer)
        self.inter_mpi_comm.Alltoall(
            [self.cpu_buffer_b.buffer(n_bytes_buffer), mpi4py.MPI.FLOAT],
            [self.cpu_buffer_a.buffer(n_bytes_buffer), mpi4py.MPI.FLOAT])
        # Reduction in GPU
        arr_a = self.gpu_buffer_a.array(n_elems_buffer)
        arr_a.data.copy_from_host(self.cpu_buffer_a.ptr(), n_bytes_buffer)
        arr_a = arr_a.reshape(self.inter_size, n_elems_per_node)
        arr_a = arr_a.sum(axis=0)
        # Divide by the global size here so the final result is a mean.
        arr_a *= 1.0 / self.size
        arr_a.data.copy_to_host(self.cpu_buffer_a.ptr(), n_bytes_per_node)
        # Gather every node's reduced slice back into the full buffer.
        self.inter_mpi_comm.Allgather(
            [self.cpu_buffer_a.buffer(n_bytes_per_node), mpi4py.MPI.FLOAT],
            [self.cpu_buffer_b.buffer(n_bytes_buffer), mpi4py.MPI.FLOAT])
        arr_b.data.copy_from_host(self.cpu_buffer_b.ptr(), n_bytes_buffer)
    # Intra-node bcast
    self.intra_nccl_comm.bcast(
        self.gpu_buffer_b.ptr(), n_elems_total, nccl.NCCL_FLOAT, 0,
        stream.ptr)
    _memory_utility.unpack_params(
        params, 'grad', self.gpu_buffer_b, allreduce_grad_dtype)
def allreduce_grad(self, model):
    """Allreduce-mean the gradients of ``model`` via MPI, in place.

    float16 gradients are upcast to float32 for the reduction
    (presumably because the MPI reduction path here handles fp32 —
    confirm against the communicator's design). The division by the
    communicator size is done *before* downcasting back to float16:
    the raw cross-worker sum can exceed float16's representable range
    (~65504) even when the mean does not, so dividing first in float32
    avoids a spurious overflow to inf.
    """
    for param in _memory_utility.extract_params_set_grad(model):
        grad = param.grad
        is_float16 = param.grad.dtype == np.float16
        if is_float16:
            grad = grad.astype(np.float32)
        buf = _memory_utility.array_to_buffer_object(grad)
        self.mpi_comm.Allreduce(mpi4py.MPI.IN_PLACE, buf)
        # Sum -> mean while still in the reduction dtype; for fp16
        # params this happens in fp32, preventing overflow on downcast.
        grad /= self.size
        if is_float16:
            param.grad = grad.astype(np.float16)
def multi_node_mean_grad(self, model, zero_fill=False):
    """Pack gradients into the GPU buffer and unpack them straight back.

    No communication is performed by this communicator; only the GPU
    pack/unpack round-trip runs (e.g. to measure its overhead).
    """
    params = _memory_utility.extract_params_set_grad(model, zero_fill)
    float_size = 4  # gradients are staged as 4-byte elements
    total_elems = _memory_utility.count_grad_elements(params, zero_fill)
    self.gpu_buffer_a.assign(float_size * total_elems)
    _memory_utility.pack_params(params, 'grad', self.gpu_buffer_a,
                                zero_fill)
    _memory_utility.unpack_params(params, 'grad', self.gpu_buffer_a,
                                  zero_fill)
def allreduce_grad(self, model):
    """Pack gradients into the GPU buffer and unpack them straight back.

    This communicator exchanges nothing between processes; the method
    only exercises the pack/unpack round-trip.
    """
    params = _memory_utility.extract_params_set_grad(model)
    float_size = 4  # gradients are staged as 4-byte elements
    total_elems = sum(p.grad.size for p in params)
    self.gpu_buffer_a.assign(float_size * total_elems)
    _memory_utility.pack_params(params, float_size, 'grad',
                                self.gpu_buffer_a)
    _memory_utility.unpack_params(params, float_size, 'grad',
                                  self.gpu_buffer_a)
def multi_node_mean_grad(self, model, zero_fill=False):
    """Average ``model``'s gradients with a two-dimensional scheme.

    1st dimension (intra-node): NCCL reduce-scatter, so each local GPU
    owns a contiguous slice; 2nd dimension (inter-node): MPI allreduce
    of that slice; then an intra-node NCCL allgather reassembles the
    full averaged buffer. Debug mode adds pre/post validation.
    """
    self._init_comms()
    stream = chainer.cuda.Stream.null
    params = _memory_utility.extract_params_set_grad(model, zero_fill)
    itemsize = 4  # gradients are exchanged as 4-byte floats
    n_elems_total = _memory_utility.count_grad_elements(params, zero_fill)
    # Pad so the buffer splits evenly across all processes (2nd dim)
    # and across the node's GPUs (1st dim).
    n_elems_per_node_2d = int(math.ceil(n_elems_total / self.size))
    n_elems_per_node_1d = n_elems_per_node_2d * self.inter_size
    n_bytes_per_node_1d = n_elems_per_node_1d * itemsize
    n_bytes_per_node_2d = n_elems_per_node_2d * itemsize
    n_bytes_buffer = n_bytes_per_node_2d * self.size
    self.gpu_buffer_a.assign(n_bytes_buffer)
    self.gpu_buffer_b.assign(n_bytes_buffer)
    allreduce_grad_dtype = np.float32
    _memory_utility.pack_params(params, 'grad', self.gpu_buffer_a,
                                allreduce_grad_dtype, zero_fill)
    if chainer.is_debug():
        stream.synchronize()
        array_a = self.gpu_buffer_a.array(n_elems_total)
        array_b = self.gpu_buffer_b.array(n_elems_total)
        self._check_ready_to_allreduce(array_a, array_b)
    # Intra-node reduce-scatter (1st dimension)
    self.intra_nccl_comm.reduceScatter(self.gpu_buffer_a.ptr(),
                                       self.gpu_buffer_b.ptr(),
                                       n_elems_per_node_1d,
                                       nccl.NCCL_FLOAT, nccl.NCCL_SUM,
                                       stream.ptr)
    # Inter-node allreduce (2nd dimension)
    _communication_utility.inter_allreduce_gpu(
        self.inter_mpi_comm, self.size, self.gpu_buffer_a,
        self.gpu_buffer_b, n_bytes_per_node_1d, n_elems_per_node_2d,
        n_bytes_per_node_2d, stream)
    # Intra-node allgather (1st dimension)
    self.intra_nccl_comm.allGather(self.gpu_buffer_b.ptr(),
                                   self.gpu_buffer_a.ptr(),
                                   n_elems_per_node_1d, nccl.NCCL_FLOAT,
                                   stream.ptr)
    if chainer.is_debug():
        stream.synchronize()
        self._ensure_all_finite(self.gpu_buffer_a.array(n_elems_total))
    _memory_utility.unpack_params(params, 'grad', self.gpu_buffer_a,
                                  allreduce_grad_dtype, zero_fill)
def allreduce_grad(self, model, zero_fill=False):
    """Average ``model``'s gradients hierarchically.

    Intra-node NCCL reduce to local rank 0, inter-node allreduce among
    the rank-0 processes via ``inter_allreduce_gpu``, then intra-node
    NCCL bcast. Debug mode adds pre/post buffer validation.
    """
    self._init_comms()
    stream = chainer.cuda.Stream.null
    params = _memory_utility.extract_params_set_grad(model, zero_fill)
    itemsize = 4  # gradients are exchanged as 4-byte floats
    n_elems_total = _memory_utility.count_grad_elements(params, zero_fill)
    # Pad the buffer so it splits evenly across nodes.
    n_elems_per_node = int(math.ceil(n_elems_total / self.inter_size))
    n_bytes_per_node = n_elems_per_node * itemsize
    n_bytes_buffer = n_bytes_per_node * self.inter_size
    self.gpu_buffer_a.assign(n_bytes_buffer)
    self.gpu_buffer_b.assign(n_bytes_buffer)
    allreduce_grad_dtype = np.float32
    _memory_utility.pack_params(params, 'grad', self.gpu_buffer_a,
                                allreduce_grad_dtype, zero_fill, stream)
    if chainer.is_debug():
        stream.synchronize()
        array_a = self.gpu_buffer_a.array(n_elems_total)
        array_b = self.gpu_buffer_b.array(n_elems_total)
        self.check_ready_to_allreduce(array_a, array_b)
    # Intra-node reduce
    self.intra_nccl_comm.reduce(self.gpu_buffer_a.ptr(),
                                self.gpu_buffer_b.ptr(), n_elems_total,
                                nccl.NCCL_FLOAT, nccl.NCCL_SUM, 0,
                                stream.ptr)
    # Inter-node allreduce
    # Only the local root holds the node-wise sum after the reduce.
    if self.intra_rank == 0:
        _communication_utility.inter_allreduce_gpu(
            self.inter_mpi_comm, self.size, self.gpu_buffer_a,
            self.gpu_buffer_b, n_bytes_buffer, n_elems_per_node,
            n_bytes_per_node, stream)
    # Intra-node bcast
    self.intra_nccl_comm.bcast(self.gpu_buffer_b.ptr(), n_elems_total,
                               nccl.NCCL_FLOAT, 0, stream.ptr)
    if chainer.is_debug():
        stream.synchronize()
        self.ensure_all_finite(self.gpu_buffer_b.array(n_elems_total))
    _memory_utility.unpack_params(params, 'grad', self.gpu_buffer_b,
                                  allreduce_grad_dtype, zero_fill,
                                  stream)
def allreduce_grad(self, model):
    """Average gradients across workers: pack into GPU buffer A,
    compute the multi-node mean into buffer B, and unpack B back into
    the parameters. Gradients are exchanged as fp32.
    """
    params = _memory_utility.extract_params_set_grad(model)
    comm_dtype = np.float32
    total_elems = sum(p.grad.size for p in params)
    total_bytes = 4 * total_elems
    # Grow both scratch buffers if this model needs more room.
    for scratch in (self.gpu_buffer_a, self.gpu_buffer_b):
        scratch.assign(total_bytes)
    _memory_utility.pack_params(params, 'grad', self.gpu_buffer_a,
                                comm_dtype)
    self.multi_node_mean(self.gpu_buffer_a.array(total_elems),
                         self.gpu_buffer_b.array(total_elems))
    _memory_utility.unpack_params(params, 'grad', self.gpu_buffer_b,
                                  comm_dtype)
def multi_node_mean_grad(self, model, zero_fill=False):
    """Stage gradients through the GPU buffer and restore them.

    This communicator performs no inter-process exchange; the method
    only runs the pack/unpack round-trip in fp32.
    """
    params = _memory_utility.extract_params_set_grad(model, zero_fill)
    n_elems = _memory_utility.count_grad_elements(params, zero_fill)
    self.gpu_buffer_a.assign(n_elems * 4)  # 4 bytes per fp32 element
    self._pack_params_to_buffer(params, 'grad',
                                buffer=self.gpu_buffer_a,
                                allreduce_grad_dtype=np.float32,
                                zero_fill=zero_fill)
    self._unpack_params_from_buffer(params, 'grad',
                                    buffer=self.gpu_buffer_a,
                                    allreduce_grad_dtype=np.float32,
                                    zero_fill=zero_fill)
def allreduce_grad(self, model, zero_fill=False):
    """Average gradients across workers: pack into GPU buffer A,
    compute the multi-node mean into buffer B, and unpack B back into
    the parameters. Gradients are exchanged as fp32; ``zero_fill``
    substitutes zeros for unset gradients.
    """
    params = _memory_utility.extract_params_set_grad(model, zero_fill)
    comm_dtype = np.float32
    total_elems = _memory_utility.count_grad_elements(params, zero_fill)
    total_bytes = 4 * total_elems
    for scratch in (self.gpu_buffer_a, self.gpu_buffer_b):
        scratch.assign(total_bytes)
    _memory_utility.pack_params(params, 'grad', self.gpu_buffer_a,
                                comm_dtype, zero_fill)
    self.multi_node_mean(self.gpu_buffer_a.array(total_elems),
                         self.gpu_buffer_b.array(total_elems))
    _memory_utility.unpack_params(params, 'grad', self.gpu_buffer_b,
                                  comm_dtype, zero_fill)
def _allreduce_grad_async(self, model, stream):
    """Asynchronously average ``model``'s gradients on ``stream``.

    Packs grads into buffer A, NCCL-allreduces A into B, divides B by
    the communicator size into A with a lazily-compiled element-wise
    kernel, and unpacks A back into the parameters.
    """
    self._init_comms()
    params = _memory_utility.extract_params_set_grad(model)
    # NOTE: we need to explicitly check `is None`, because
    # numpy's dtype object is evaluated to False in numpy <= 1.12.1
    if self.allreduce_grad_dtype is not None:
        allreduce_grad_dtype = self.allreduce_grad_dtype
    else:
        allreduce_grad_dtype = chainer.get_dtype()
    assert allreduce_grad_dtype is not None
    n_elems = sum(param.grad.size for param in params)
    needs_sync = self._prepare_allreduce_pack_buffer(allreduce_grad_dtype,
                                                     n_elems)
    # If the buffers were (re)allocated, wait for pending default-stream
    # work before using them on a non-default stream.
    if stream != chainer.cuda.Stream.null and needs_sync:
        chainer.cuda.Stream.null.synchronize()
    # pack grads from params -> buffer A
    self._pack_params_to_buffer(params, allreduce_grad_dtype, stream)
    # Allreduce from buffer A -> buffer B
    self.nccl_comm.allReduce(self.gpu_buffer_a.ptr(),
                             self.gpu_buffer_b.ptr(), n_elems,
                             _communication_utility._get_nccl_type_id(
                                 allreduce_grad_dtype),
                             nccl.NCCL_SUM, stream.ptr)
    # div by comm_size from buffer B -> buffer A
    # NOTE(review): the cached kernel bakes in allreduce_grad_dtype and
    # self.size; presumably both are fixed per communicator — confirm.
    if self.div_by_size is None:
        self.div_by_size = chainer.cuda.cupy.ElementwiseKernel(
            '{} x'.format(allreduce_grad_dtype.name),
            '{} y'.format(allreduce_grad_dtype.name),
            'y = x*(1.0/{})'.format(self.size),
            'div_by_size')
    self.div_by_size(
        self.gpu_buffer_b.array(n_elems, dtype=allreduce_grad_dtype),
        self.gpu_buffer_a.array(n_elems, dtype=allreduce_grad_dtype),
        stream=stream)
    # unpack params from buffer A -> params
    self._unpack_params_from_buffer(params, allreduce_grad_dtype, stream)
def allreduce_grad(self, model):
    """Average gradients across all workers using one MPI Allreduce
    over flat fp32 GPU buffers, then unpack the mean back into the
    parameters.
    """
    params = _memory_utility.extract_params_set_grad(model)
    float_size = 4  # gradients are staged as 4-byte floats
    total_elems = sum(p.grad.size for p in params)
    total_bytes = float_size * total_elems
    self.gpu_buffer_a.assign(total_bytes)
    self.gpu_buffer_b.assign(total_bytes)
    _memory_utility.pack_params(params, float_size, 'grad',
                                self.gpu_buffer_a)
    self.mpi_comm.Allreduce(
        [self.gpu_buffer_a.buffer(total_bytes), mpi4py.MPI.FLOAT],
        [self.gpu_buffer_b.buffer(total_bytes), mpi4py.MPI.FLOAT])
    # Turn the cross-worker sum into a mean, in place on the GPU.
    mean = self.gpu_buffer_b.array(total_elems)
    mean *= 1.0 / self.size
    _memory_utility.unpack_params(params, float_size, 'grad',
                                  self.gpu_buffer_b)
def allreduce_grad(self, model):
    """Average ``model``'s gradients with a two-dimensional scheme.

    1st dimension (intra-node): NCCL reduce-scatter, leaving each local
    GPU with a contiguous slice; 2nd dimension (inter-node): allreduce
    of that slice via ``inter_allreduce_gpu``; then an intra-node NCCL
    allgather reassembles the full averaged buffer.
    """
    self._init_comms()
    stream = chainer.cuda.Stream.null
    params = _memory_utility.extract_params_set_grad(model)
    itemsize = 4  # gradients are exchanged as 4-byte floats
    n_elems_total = sum(param.grad.size for param in params)
    # Pad so the buffer splits evenly across all processes (2nd dim)
    # and across the node's GPUs (1st dim).
    n_elems_per_node_2d = int(math.ceil(n_elems_total / self.size))
    n_elems_per_node_1d = n_elems_per_node_2d * self.inter_size
    n_bytes_per_node_1d = n_elems_per_node_1d * itemsize
    n_bytes_per_node_2d = n_elems_per_node_2d * itemsize
    n_bytes_buffer = n_bytes_per_node_2d * self.size
    self.gpu_buffer_a.assign(n_bytes_buffer)
    self.gpu_buffer_b.assign(n_bytes_buffer)
    allreduce_grad_dtype = np.float32
    _memory_utility.pack_params(params, 'grad', self.gpu_buffer_a,
                                allreduce_grad_dtype)
    # Intra-node reduce-scatter (1st dimension)
    self.intra_nccl_comm.reduceScatter(self.gpu_buffer_a.ptr(),
                                       self.gpu_buffer_b.ptr(),
                                       n_elems_per_node_1d,
                                       nccl.NCCL_FLOAT, nccl.NCCL_SUM,
                                       stream.ptr)
    # Inter-node allreduce (2nd dimension)
    _communication_utility.inter_allreduce_gpu(
        self.inter_mpi_comm, self.size, self.gpu_buffer_a,
        self.gpu_buffer_b, n_bytes_per_node_1d, n_elems_per_node_2d,
        n_bytes_per_node_2d, stream)
    # Intra-node allgather (1st dimension)
    self.intra_nccl_comm.allGather(self.gpu_buffer_b.ptr(),
                                   self.gpu_buffer_a.ptr(),
                                   n_elems_per_node_1d, nccl.NCCL_FLOAT,
                                   stream.ptr)
    _memory_utility.unpack_params(params, 'grad', self.gpu_buffer_a,
                                  allreduce_grad_dtype)
def allreduce_grad(self, model):
    """Average gradients with one intra-node NCCL allreduce on the
    default stream, staged through flat fp32 GPU buffers.
    """
    self._init_comms()
    null_stream = chainer.cuda.Stream.null
    params = _memory_utility.extract_params_set_grad(model)
    float_size = 4  # gradients are staged as 4-byte floats
    total_elems = sum(p.grad.size for p in params)
    total_bytes = float_size * total_elems
    self.gpu_buffer_a.assign(total_bytes)
    self.gpu_buffer_b.assign(total_bytes)
    _memory_utility.pack_params(params, float_size, 'grad',
                                self.gpu_buffer_a)
    self.intra_nccl_comm.allReduce(
        self.gpu_buffer_a.ptr(), self.gpu_buffer_b.ptr(), total_elems,
        nccl.NCCL_FLOAT, nccl.NCCL_SUM, null_stream.ptr)
    # Sum -> mean, in place.
    summed = self.gpu_buffer_b.array(total_elems)
    summed *= 1.0 / self.size
    _memory_utility.unpack_params(params, float_size, 'grad',
                                  self.gpu_buffer_b)
def multi_node_mean_grad(self, model, zero_fill=False):
    """Average ``model``'s gradients hierarchically, staging the
    inter-node exchange through host memory.

    Phases: intra-node NCCL reduce to local rank 0; on rank 0 an MPI
    Alltoall / local GPU reduction / MPI Allgather over CPU buffers;
    then an intra-node NCCL bcast. Debug mode adds pre/post validation.
    """
    self._init_comms()
    stream = chainer.cuda.Stream.null
    params = _memory_utility.extract_params_set_grad(model, zero_fill)
    itemsize = 4  # gradients are exchanged as 4-byte floats
    n_elems_total = _memory_utility.count_grad_elements(params, zero_fill)
    # Pad so each node owns an equal slice of the buffer.
    n_elems_per_node = int(math.ceil(n_elems_total / self.inter_size))
    n_elems_buffer = n_elems_per_node * self.inter_size
    n_bytes_per_node = n_elems_per_node * itemsize
    n_bytes_buffer = n_bytes_per_node * self.inter_size
    self.gpu_buffer_a.assign(n_bytes_buffer)
    self.gpu_buffer_b.assign(n_bytes_buffer)
    allreduce_grad_dtype = np.float32
    self._pack_params_to_buffer(params, 'grad',
                                buffer=self.gpu_buffer_a,
                                allreduce_grad_dtype=allreduce_grad_dtype,
                                zero_fill=zero_fill)
    if chainer.is_debug():
        stream.synchronize()
        array_a = self.gpu_buffer_a.array(n_elems_total)
        array_b = self.gpu_buffer_b.array(n_elems_total)
        self._check_ready_to_allreduce(array_a, array_b)
    # Intra-node reduce
    self.intra_nccl_comm.reduce(self.gpu_buffer_a.ptr(),
                                self.gpu_buffer_b.ptr(), n_elems_total,
                                nccl.NCCL_FLOAT, nccl.NCCL_SUM, 0,
                                stream.ptr)
    # Inter-node allreduce
    if self.intra_rank == 0:
        self.cpu_buffer_a.assign(n_bytes_buffer)
        self.cpu_buffer_b.assign(n_bytes_buffer)
        arr_b = self.gpu_buffer_b.array(n_elems_buffer)
        # GPU -> host, then scatter each node's slice to its owner.
        arr_b.data.copy_to_host(self.cpu_buffer_b.ptr(), n_bytes_buffer)
        self.inter_mpi_comm.Alltoall(
            [self.cpu_buffer_b.buffer(n_bytes_buffer), mpi4py.MPI.FLOAT],
            [self.cpu_buffer_a.buffer(n_bytes_buffer), mpi4py.MPI.FLOAT])
        # Reduction in GPU
        arr_a = self.gpu_buffer_a.array(n_elems_buffer)
        arr_a.data.copy_from_host(self.cpu_buffer_a.ptr(), n_bytes_buffer)
        arr_a = arr_a.reshape(self.inter_size, n_elems_per_node)
        arr_a = arr_a.sum(axis=0)
        # Divide by the global size here so the final result is a mean.
        arr_a *= 1.0 / self.size
        arr_a.data.copy_to_host(self.cpu_buffer_a.ptr(), n_bytes_per_node)
        # Gather every node's reduced slice back into the full buffer.
        self.inter_mpi_comm.Allgather(
            [self.cpu_buffer_a.buffer(n_bytes_per_node), mpi4py.MPI.FLOAT],
            [self.cpu_buffer_b.buffer(n_bytes_buffer), mpi4py.MPI.FLOAT])
        arr_b.data.copy_from_host(self.cpu_buffer_b.ptr(),
                                  n_bytes_buffer)
    # Intra-node bcast
    self.intra_nccl_comm.bcast(self.gpu_buffer_b.ptr(), n_elems_total,
                               nccl.NCCL_FLOAT, 0, stream.ptr)
    if chainer.is_debug():
        stream.synchronize()
        self._ensure_all_finite(self.gpu_buffer_b.array(n_elems_total))
    self._unpack_params_from_buffer(params, 'grad', self.gpu_buffer_b,
                                    allreduce_grad_dtype, zero_fill)
def allreduce_grad(self, model):
    """Average gradients across all workers, issuing one in-place MPI
    Allreduce per parameter array.
    """
    for param in _memory_utility.extract_params_set_grad(model):
        view = _memory_utility.array_to_buffer_object(param.grad)
        self.mpi_comm.Allreduce(mpi4py.MPI.IN_PLACE, view)
        # Sum -> mean.
        param.grad /= self.size