def _allreduce_grad_async(self, model, zero_fill, stream):
    """All-reduce (mean) the gradients of ``model`` asynchronously on ``stream``.

    Gradients are packed into an internal GPU buffer, averaged across
    processes with NCCL, then unpacked back into the parameters.

    Args:
        model: Chainer model whose parameter gradients are averaged in place.
        zero_fill (bool): passed through to the pack/unpack helpers
            (presumably fills missing grads with zeros — TODO confirm
            against ``_memory_utility``).
        stream: CUDA stream on which the pack / allreduce / unpack work
            is enqueued.
    """
    self._init_comms()
    params = _memory_utility.extract_params_set_grad(model, zero_fill)

    # NOTE: we need to explicitly check `is None`, because
    # numpy's dtype object is evaluated to False in numpy <= 1.12.1
    if self.allreduce_grad_dtype is not None:
        allreduce_grad_dtype = self.allreduce_grad_dtype
    else:
        allreduce_grad_dtype = chainer.get_dtype()
    assert allreduce_grad_dtype is not None

    n_elems = _memory_utility.count_grad_elements(params, zero_fill)
    needs_sync = self._prepare_allreduce_pack_buffer(allreduce_grad_dtype,
                                                     n_elems)
    # If the pack buffer was (re)allocated on the null stream, make sure
    # that work is finished before using it from a non-null stream.
    if stream != chainer.cuda.Stream.null and needs_sync:
        chainer.cuda.Stream.null.synchronize()

    # pack grads from params -> buffer A
    self._pack_params_to_buffer(params, allreduce_grad_dtype, zero_fill,
                                stream)

    # Allreduce from buffer A -> buffer B
    # div by comm_size from buffer B -> buffer A
    self.multi_node_mean_nccl(self.gpu_buffer_a, self.gpu_buffer_b,
                              n_elems, allreduce_grad_dtype, stream)

    # unpack params from buffer A -> params
    self._unpack_params_from_buffer(params, allreduce_grad_dtype, zero_fill,
                                    stream)
def multi_node_mean_grad(self, model, zero_fill=False):
    """Average the gradients of ``model`` across all processes, in place.

    Gradients are flattened into one GPU buffer, averaged over the
    communicator, and written back to the parameters.

    Args:
        model: Chainer model whose parameter gradients are averaged.
        zero_fill (bool): passed through to the pack/unpack helpers.
    """
    params = _memory_utility.extract_params_set_grad(model, zero_fill)
    itemsize = 4  # gradients are packed as 32-bit floats
    n_elems_total = _memory_utility.count_grad_elements(params, zero_fill)
    n_bytes_total = n_elems_total * itemsize
    self.gpu_buffer_a.assign(n_bytes_total)

    _memory_utility.pack_params(params, 'grad', self.gpu_buffer_a,
                                zero_fill)
    # BUG FIX: the packed gradients were previously unpacked straight back
    # without any inter-process communication, so this method was a no-op
    # round trip. Average the flat buffer across processes in place
    # (src=None -> in-place mean on the given array), matching the
    # pack -> multi_node_mean -> unpack pattern used by the sibling
    # allreduce_grad implementation.
    self.multi_node_mean(None, self.gpu_buffer_a.array(n_elems_total))
    _memory_utility.unpack_params(params, 'grad', self.gpu_buffer_a,
                                  zero_fill)
def multi_node_mean_grad(self, model, zero_fill=False):
    """Average the gradients of ``model`` across all processes, in place.

    Uses a two-dimensional scheme: intra-node reduce-scatter, then an
    inter-node allreduce on each node's shard, then an intra-node
    allgather to redistribute the full averaged buffer.

    Args:
        model: Chainer model whose parameter gradients are averaged.
        zero_fill (bool): passed through to the pack/unpack helpers.
    """
    self._init_comms()
    stream = chainer.cuda.Stream.null
    params = _memory_utility.extract_params_set_grad(model, zero_fill)
    itemsize = 4  # gradients are packed as 32-bit floats (NCCL_FLOAT)
    n_elems_total = _memory_utility.count_grad_elements(params, zero_fill)
    # Shard sizes: per-process (2d) and per-node (1d = 2d * inter_size).
    # The buffer is padded up to size * per-process shard so the
    # reduce-scatter / allgather shards divide evenly.
    n_elems_per_node_2d = int(math.ceil(n_elems_total / self.size))
    n_elems_per_node_1d = n_elems_per_node_2d * self.inter_size
    n_bytes_per_node_1d = n_elems_per_node_1d * itemsize
    n_bytes_per_node_2d = n_elems_per_node_2d * itemsize
    n_bytes_buffer = n_bytes_per_node_2d * self.size
    self.gpu_buffer_a.assign(n_bytes_buffer)
    self.gpu_buffer_b.assign(n_bytes_buffer)
    allreduce_grad_dtype = np.float32
    _memory_utility.pack_params(params, 'grad', self.gpu_buffer_a,
                                allreduce_grad_dtype, zero_fill)

    if chainer.is_debug():
        # Debug-only sanity check that both buffers are usable before
        # the collective operations start.
        stream.synchronize()
        array_a = self.gpu_buffer_a.array(n_elems_total)
        array_b = self.gpu_buffer_b.array(n_elems_total)
        self._check_ready_to_allreduce(array_a, array_b)

    # Intra-node reduce-scatter (1st dimension)
    self.intra_nccl_comm.reduceScatter(self.gpu_buffer_a.ptr(),
                                       self.gpu_buffer_b.ptr(),
                                       n_elems_per_node_1d, nccl.NCCL_FLOAT,
                                       nccl.NCCL_SUM, stream.ptr)

    # Inter-node allreduce (2nd dimension)
    _communication_utility.inter_allreduce_gpu(
        self.inter_mpi_comm, self.size, self.gpu_buffer_a,
        self.gpu_buffer_b, n_bytes_per_node_1d, n_elems_per_node_2d,
        n_bytes_per_node_2d, stream)

    # Intra-node allgather (1st dimension)
    self.intra_nccl_comm.allGather(self.gpu_buffer_b.ptr(),
                                   self.gpu_buffer_a.ptr(),
                                   n_elems_per_node_1d, nccl.NCCL_FLOAT,
                                   stream.ptr)

    if chainer.is_debug():
        # Debug-only check that the averaged gradients contain no NaN/Inf.
        stream.synchronize()
        self._ensure_all_finite(self.gpu_buffer_a.array(n_elems_total))

    _memory_utility.unpack_params(params, 'grad', self.gpu_buffer_a,
                                  allreduce_grad_dtype, zero_fill)
def allreduce_grad(self, model, zero_fill=False):
    """All-reduce (average) the gradients of ``model`` across processes.

    Hierarchical scheme: NCCL reduce to the node-local root
    (intra_rank == 0), MPI allreduce among node roots, then NCCL
    broadcast back to all GPUs on the node.

    Args:
        model: Chainer model whose parameter gradients are averaged.
        zero_fill (bool): passed through to the pack/unpack helpers.
    """
    self._init_comms()
    stream = chainer.cuda.Stream.null
    params = _memory_utility.extract_params_set_grad(model, zero_fill)
    itemsize = 4  # gradients are packed as 32-bit floats (NCCL_FLOAT)
    n_elems_total = _memory_utility.count_grad_elements(params, zero_fill)
    # Pad the buffer so it splits evenly into inter_size shards for the
    # inter-node allreduce.
    n_elems_per_node = int(math.ceil(n_elems_total / self.inter_size))
    n_bytes_per_node = n_elems_per_node * itemsize
    n_bytes_buffer = n_bytes_per_node * self.inter_size
    self.gpu_buffer_a.assign(n_bytes_buffer)
    self.gpu_buffer_b.assign(n_bytes_buffer)
    allreduce_grad_dtype = np.float32
    _memory_utility.pack_params(params, 'grad', self.gpu_buffer_a,
                                allreduce_grad_dtype, zero_fill, stream)

    if chainer.is_debug():
        # Debug-only sanity check before the collectives start.
        stream.synchronize()
        array_a = self.gpu_buffer_a.array(n_elems_total)
        array_b = self.gpu_buffer_b.array(n_elems_total)
        self.check_ready_to_allreduce(array_a, array_b)

    # Intra-node reduce
    self.intra_nccl_comm.reduce(self.gpu_buffer_a.ptr(),
                                self.gpu_buffer_b.ptr(), n_elems_total,
                                nccl.NCCL_FLOAT, nccl.NCCL_SUM, 0,
                                stream.ptr)

    # Inter-node allreduce
    # Only the node-local root participates; other ranks wait at the
    # intra-node bcast below.
    if self.intra_rank == 0:
        _communication_utility.inter_allreduce_gpu(
            self.inter_mpi_comm, self.size, self.gpu_buffer_a,
            self.gpu_buffer_b, n_bytes_buffer, n_elems_per_node,
            n_bytes_per_node, stream)

    # Intra-node bcast
    self.intra_nccl_comm.bcast(self.gpu_buffer_b.ptr(), n_elems_total,
                               nccl.NCCL_FLOAT, 0, stream.ptr)

    if chainer.is_debug():
        # Debug-only check that the result contains no NaN/Inf.
        stream.synchronize()
        self.ensure_all_finite(self.gpu_buffer_b.array(n_elems_total))

    _memory_utility.unpack_params(params, 'grad', self.gpu_buffer_b,
                                  allreduce_grad_dtype, zero_fill, stream)
def multi_node_mean_grad(self, model, zero_fill=False):
    """Pack the gradients of ``model`` into the GPU buffer and unpack them.

    NOTE(review): no inter-process reduction is visible between the pack
    and unpack calls here — either the averaging happens inside
    ``_pack_params_to_buffer`` / ``_unpack_params_from_buffer`` or a
    mean/allreduce step is missing. Confirm against the communicator's
    helper implementations.

    Args:
        model: Chainer model whose parameter gradients are processed.
        zero_fill (bool): passed through to the pack/unpack helpers.
    """
    params = _memory_utility.extract_params_set_grad(model, zero_fill)
    itemsize = 4  # gradients are staged as 32-bit floats
    n_elems_total = _memory_utility.count_grad_elements(params, zero_fill)
    n_bytes_total = n_elems_total * itemsize
    self.gpu_buffer_a.assign(n_bytes_total)
    self._pack_params_to_buffer(params, 'grad',
                                buffer=self.gpu_buffer_a,
                                allreduce_grad_dtype=np.float32,
                                zero_fill=zero_fill)
    self._unpack_params_from_buffer(params, 'grad',
                                    buffer=self.gpu_buffer_a,
                                    allreduce_grad_dtype=np.float32,
                                    zero_fill=zero_fill)
def allreduce_grad(self, model, zero_fill=False):
    """Average the gradients of ``model`` across all processes.

    Flattens the gradients into one GPU buffer, averages it over the
    communicator via ``multi_node_mean``, and writes the result back.

    Args:
        model: Chainer model whose parameter gradients are averaged.
        zero_fill (bool): passed through to the pack/unpack helpers.
    """
    grad_dtype = np.float32
    target_params = _memory_utility.extract_params_set_grad(model, zero_fill)
    n_elems = _memory_utility.count_grad_elements(target_params, zero_fill)
    n_bytes = 4 * n_elems  # float32 itemsize

    # Both staging buffers must hold the full flattened gradient vector.
    for staging in (self.gpu_buffer_a, self.gpu_buffer_b):
        staging.assign(n_bytes)

    _memory_utility.pack_params(
        target_params, 'grad', self.gpu_buffer_a, grad_dtype, zero_fill)
    # buffer A -> mean across processes -> buffer B
    self.multi_node_mean(self.gpu_buffer_a.array(n_elems),
                         self.gpu_buffer_b.array(n_elems))
    _memory_utility.unpack_params(
        target_params, 'grad', self.gpu_buffer_b, grad_dtype, zero_fill)
def allreduce_grad(self, model, zero_fill=False):
    """All-reduce (average) gradients over the intra-node NCCL communicator.

    Unlike the float32-packed variants, this keeps the gradients' own
    dtype (taken from the first parameter) for packing and the NCCL
    collective.

    Args:
        model: Chainer model whose parameter gradients are averaged.
        zero_fill (bool): passed through to the pack/unpack helpers.
    """
    self._init_comms()
    stream = chainer.cuda.Stream.null
    params = _memory_utility.extract_params_set_grad(model, zero_fill)
    # Use the native gradient dtype; assumes all params share the dtype
    # of the first one — TODO confirm.
    dtype = params[0].grad.dtype
    itemsize = dtype.itemsize
    n_elems_total = _memory_utility.count_grad_elements(params, zero_fill)
    n_bytes_total = n_elems_total * itemsize
    self.gpu_buffer_a.assign(n_bytes_total)
    self.gpu_buffer_b.assign(n_bytes_total)
    _memory_utility.pack_params(params, 'grad', self.gpu_buffer_a, dtype,
                                zero_fill)

    if chainer.is_debug():
        # Debug-only sanity check before the collective starts.
        stream.synchronize()
        array_a = self.gpu_buffer_a.array(n_elems_total)
        array_b = self.gpu_buffer_b.array(n_elems_total)
        self.check_ready_to_allreduce(array_a, array_b)

    # Same as PureNcclCommunicator's multi_node_mean but leave as it is
    self.intra_nccl_comm.allReduce(
        self.gpu_buffer_a.ptr(), self.gpu_buffer_b.ptr(), n_elems_total,
        _communication_utility._get_nccl_type_id(dtype), nccl.NCCL_SUM,
        stream.ptr)
    # Divide the summed gradients by the communicator size to get the mean.
    arr = self.gpu_buffer_b.array(n_elems_total, dtype=dtype)
    arr *= (1.0 / self.size)

    if chainer.is_debug():
        # Debug-only check that the result contains no NaN/Inf.
        stream.synchronize()
        self.ensure_all_finite(arr)

    _memory_utility.unpack_params(params, 'grad', self.gpu_buffer_b, dtype,
                                  zero_fill)
def multi_node_mean_grad(self, model, zero_fill=False):
    """Average the gradients of ``model`` across all processes, in place.

    Hybrid scheme: NCCL reduce to the node-local root, then an
    MPI Alltoall / on-GPU partial sum / MPI Allgather among node roots
    (staged through pinned host buffers), then NCCL broadcast back to
    all GPUs on the node.

    Args:
        model: Chainer model whose parameter gradients are averaged.
        zero_fill (bool): passed through to the pack/unpack helpers.
    """
    self._init_comms()
    stream = chainer.cuda.Stream.null
    params = _memory_utility.extract_params_set_grad(model, zero_fill)
    itemsize = 4  # gradients are staged as 32-bit floats (NCCL/MPI FLOAT)
    n_elems_total = _memory_utility.count_grad_elements(params, zero_fill)
    # Pad the buffer so it splits evenly into inter_size shards.
    n_elems_per_node = int(math.ceil(n_elems_total / self.inter_size))
    n_elems_buffer = n_elems_per_node * self.inter_size
    n_bytes_per_node = n_elems_per_node * itemsize
    n_bytes_buffer = n_bytes_per_node * self.inter_size
    self.gpu_buffer_a.assign(n_bytes_buffer)
    self.gpu_buffer_b.assign(n_bytes_buffer)
    allreduce_grad_dtype = np.float32
    self._pack_params_to_buffer(params, 'grad',
                                buffer=self.gpu_buffer_a,
                                allreduce_grad_dtype=allreduce_grad_dtype,
                                zero_fill=zero_fill)

    if chainer.is_debug():
        # Debug-only sanity check before the collectives start.
        stream.synchronize()
        array_a = self.gpu_buffer_a.array(n_elems_total)
        array_b = self.gpu_buffer_b.array(n_elems_total)
        self._check_ready_to_allreduce(array_a, array_b)

    # Intra-node reduce
    self.intra_nccl_comm.reduce(self.gpu_buffer_a.ptr(),
                                self.gpu_buffer_b.ptr(), n_elems_total,
                                nccl.NCCL_FLOAT, nccl.NCCL_SUM, 0,
                                stream.ptr)

    # Inter-node allreduce
    # Only the node-local root participates; the result is staged through
    # host (CPU) buffers because the inter-node MPI path is not GPU-aware.
    if self.intra_rank == 0:
        self.cpu_buffer_a.assign(n_bytes_buffer)
        self.cpu_buffer_b.assign(n_bytes_buffer)
        arr_b = self.gpu_buffer_b.array(n_elems_buffer)
        arr_b.data.copy_to_host(self.cpu_buffer_b.ptr(), n_bytes_buffer)
        # Exchange shards: each node root ends up with its own shard from
        # every other node root.
        self.inter_mpi_comm.Alltoall(
            [self.cpu_buffer_b.buffer(n_bytes_buffer), mpi4py.MPI.FLOAT],
            [self.cpu_buffer_a.buffer(n_bytes_buffer), mpi4py.MPI.FLOAT])
        # Reduction in GPU
        arr_a = self.gpu_buffer_a.array(n_elems_buffer)
        arr_a.data.copy_from_host(self.cpu_buffer_a.ptr(), n_bytes_buffer)
        arr_a = arr_a.reshape(self.inter_size, n_elems_per_node)
        arr_a = arr_a.sum(axis=0)
        # Divide by the total process count so the final result is a mean.
        arr_a *= 1.0 / self.size
        arr_a.data.copy_to_host(self.cpu_buffer_a.ptr(), n_bytes_per_node)
        # Gather every node's averaged shard back into the full buffer.
        self.inter_mpi_comm.Allgather(
            [self.cpu_buffer_a.buffer(n_bytes_per_node), mpi4py.MPI.FLOAT],
            [self.cpu_buffer_b.buffer(n_bytes_buffer), mpi4py.MPI.FLOAT])
        arr_b.data.copy_from_host(self.cpu_buffer_b.ptr(),
                                  n_bytes_buffer)

    # Intra-node bcast
    self.intra_nccl_comm.bcast(self.gpu_buffer_b.ptr(), n_elems_total,
                               nccl.NCCL_FLOAT, 0, stream.ptr)

    if chainer.is_debug():
        # Debug-only check that the result contains no NaN/Inf.
        stream.synchronize()
        self._ensure_all_finite(self.gpu_buffer_b.array(n_elems_total))

    self._unpack_params_from_buffer(params, 'grad', self.gpu_buffer_b,
                                    allreduce_grad_dtype, zero_fill)