def multi_node_mean_nccl(self, gpu_buffer_a, gpu_buffer_b, n_elems, dtype,
                         stream=None):
    """Compute the mean of ``gpu_buffer_a`` across all processes with NCCL.

    Performs an allreduce (sum) followed by a division by the communicator
    size, i.e. ``gpu_buffer_a = Sigma(gpu_buffer_a, all-procs) / self.size``.
    ``gpu_buffer_b`` is used only as scratch space.

    Args:
        gpu_buffer_a: Device buffer holding the input; also receives the
            averaged result.
        gpu_buffer_b: Device buffer used as temporary workspace.
        n_elems (int): Number of elements to reduce.
        dtype: numpy dtype of the buffer elements.
        stream: CUDA stream to run on; defaults to the null stream.
    """
    # BUG FIX: resolve the default stream *before* any use.  The original
    # code dereferenced ``stream`` in the debug block while it could still
    # be None, raising AttributeError under chainer.is_debug().
    if stream is None:
        stream = chainer.cuda.Stream.null
    if chainer.is_debug():
        # Debug-only sanity check: wait for pending work, then validate
        # the buffers before communicating.
        stream.synchronize()
        array_a = gpu_buffer_a.array(n_elems, dtype=dtype)
        array_b = gpu_buffer_b.array(n_elems, dtype=dtype)
        self.check_ready_to_allreduce(array_a, array_b)
    self._init_comms()
    type_id = _communication_utility._get_nccl_type_id(dtype)
    # Sum into buffer B (A -> B).
    self.nccl_comm.allReduce(gpu_buffer_a.ptr(), gpu_buffer_b.ptr(),
                             n_elems, type_id, nccl.NCCL_SUM, stream.ptr)
    # Divide by communicator size while copying back (B -> A).
    div_by_size = chainer.cuda.cupy.ElementwiseKernel(
        '{} x'.format(dtype.name),
        '{} y'.format(dtype.name),
        'y = x*(1.0/{})'.format(self.size),
        'div_by_size')
    div_by_size(
        gpu_buffer_b.array(n_elems, dtype=dtype),
        gpu_buffer_a.array(n_elems, dtype=dtype),
        stream=stream)
    if chainer.is_debug():
        stream.synchronize()
        self.ensure_all_finite(gpu_buffer_a.array(n_elems, dtype=dtype))
def __init__(self, params, attr_name, zero_fill):
    """Gather device pointers, NCCL dtype ids and offsets of ``params``.

    Builds flat metadata arrays describing the ``attr_name`` array of
    every parameter, then mirrors them onto the GPU for use by the
    batched pack/unpack kernels.

    Args:
        params: Sequence of chainer parameters.
        attr_name (str): Attribute to collect (``'data'`` or ``'grad'``).
        zero_fill (bool): When True, missing gradients are replaced by
            zero arrays so they can participate in the batched transfer.
    """
    n_params = len(params)
    dptrs = np.empty(n_params, dtype=np.int64)
    type_ids = np.empty(n_params, dtype=np.int32)
    offsets = np.empty(n_params + 1, dtype=np.int32)
    offsets[0] = 0
    for i, param in enumerate(params):
        v = getattr(param, attr_name)
        if zero_fill and v is None and attr_name == 'grad':
            # Materialize an all-zero gradient in place of a missing one.
            v = param.xp.zeros_like(param.data)
            setattr(param, attr_name, v)
        xp = chainer.backend.get_array_module(v)
        if xp == cp:
            v_data = v.data
        elif xp == chx:
            v_data = _get_memory_pointer_from_chainerx(v)
        else:
            raise ValueError(
                '{} is from an unsupported array module'.format(type(v)))
        dptrs[i] = v_data.ptr
        if v.dtype not in [np.float16, np.float32, np.float64]:
            raise ValueError('dtype must be float16, float32 or float64.')
        type_ids[i] = _communication_utility._get_nccl_type_id(v.dtype)
        offsets[i + 1] = offsets[i] + v.size
    self.n_params = n_params
    self.n_elems = offsets[n_params]
    # Device-side copies for the CUDA kernels.
    self.size_csum = chainer.cuda.cupy.asarray(offsets)
    self.dtype = chainer.cuda.cupy.asarray(type_ids)
    self.dptr = chainer.cuda.cupy.asarray(dptrs)
def allreduce_grad(self, model):
    """All-reduce the gradients of ``model`` and average them in place."""
    self._init_comms()
    stream = chainer.cuda.Stream.null

    params = _memory_utility.extract_params_set_grad(model)
    # Assumes every grad shares the dtype of the first one.
    dtype = params[0].grad.dtype
    n_elems_total = sum(p.grad.size for p in params)
    n_bytes_total = dtype.itemsize * n_elems_total

    # Size the two staging buffers to the packed gradients.
    self.gpu_buffer_a.assign(n_bytes_total)
    self.gpu_buffer_b.assign(n_bytes_total)
    _memory_utility.pack_params(params, 'grad', self.gpu_buffer_a, dtype)

    if chainer.is_debug():
        # Debug-only: wait for pending work and validate the buffers.
        stream.synchronize()
        self.check_ready_to_allreduce(
            self.gpu_buffer_a.array(n_elems_total),
            self.gpu_buffer_b.array(n_elems_total))

    # Same as PureNcclCommunicator's multi_node_mean but leave as it is
    self.intra_nccl_comm.allReduce(
        self.gpu_buffer_a.ptr(), self.gpu_buffer_b.ptr(), n_elems_total,
        _communication_utility._get_nccl_type_id(dtype), nccl.NCCL_SUM,
        stream.ptr)

    # Sum -> mean, in place on buffer B.
    arr = self.gpu_buffer_b.array(n_elems_total, dtype=dtype)
    arr *= (1.0 / self.size)

    if chainer.is_debug():
        stream.synchronize()
        self.ensure_all_finite(arr)

    _memory_utility.unpack_params(params, 'grad', self.gpu_buffer_b, dtype)
def allreduce_grad(self, model):
    # All-reduce the gradients of ``model`` over the intra-node NCCL
    # communicator, then divide by the communicator size (mean).
    self._init_comms()
    stream = chainer.cuda.Stream.null
    params = _memory_utility.extract_params_set_grad(model)
    # Assumes all grads share the dtype of the first param — TODO confirm.
    dtype = params[0].grad.dtype
    itemsize = dtype.itemsize
    n_elems_total = sum(param.grad.size for param in params)
    n_bytes_total = n_elems_total * itemsize
    # Size both staging buffers to hold every packed gradient.
    self.gpu_buffer_a.assign(n_bytes_total)
    self.gpu_buffer_b.assign(n_bytes_total)
    _memory_utility.pack_params(params, 'grad', self.gpu_buffer_a, dtype)
    if chainer.is_debug():
        # Debug-only sanity check: wait for pending device work, then
        # validate the buffers before communicating.
        stream.synchronize()
        array_a = self.gpu_buffer_a.array(n_elems_total)
        array_b = self.gpu_buffer_b.array(n_elems_total)
        self.check_ready_to_allreduce(array_a, array_b)
    # Same as PureNcclCommunicator's multi_node_mean but leave as it is
    self.intra_nccl_comm.allReduce(
        self.gpu_buffer_a.ptr(), self.gpu_buffer_b.ptr(), n_elems_total,
        _communication_utility._get_nccl_type_id(dtype), nccl.NCCL_SUM,
        stream.ptr)
    arr = self.gpu_buffer_b.array(n_elems_total, dtype=dtype)
    # Turn the sum into a mean, in place.
    arr *= (1.0 / self.size)
    if chainer.is_debug():
        stream.synchronize()
        self.ensure_all_finite(arr)
    # Scatter the averaged buffer back into each param.grad.
    _memory_utility.unpack_params(params, 'grad', self.gpu_buffer_b, dtype)
def allreduce_grad(self, model):
    """Sum gradients over the intra-node NCCL communicator and average."""
    self._init_comms()
    stream = chainer.cuda.Stream.null

    params = _memory_utility.extract_params_set_grad(model)
    dtype = params[0].grad.dtype
    itemsize = dtype.itemsize
    n_elems_total = 0
    for param in params:
        n_elems_total += param.grad.size
    n_bytes_total = itemsize * n_elems_total

    self.gpu_buffer_a.assign(n_bytes_total)
    self.gpu_buffer_b.assign(n_bytes_total)

    # Gather all grads into one contiguous buffer (A), reduce into B.
    _memory_utility.pack_params(
        params, itemsize, 'grad', self.gpu_buffer_a, transfer_dtype=dtype)
    self.intra_nccl_comm.allReduce(
        self.gpu_buffer_a.ptr(), self.gpu_buffer_b.ptr(), n_elems_total,
        _communication_utility._get_nccl_type_id(dtype), nccl.NCCL_SUM,
        stream.ptr)

    # Convert sum to mean in place, then scatter back to the params.
    arr = self.gpu_buffer_b.array(n_elems_total, dtype=dtype)
    arr *= (1.0 / self.size)
    _memory_utility.unpack_params(
        params, itemsize, 'grad', self.gpu_buffer_b, transfer_dtype=dtype)
def bcast_data(self, model):
    """Broadcast parameter data of ``model`` from rank 0 within the node."""
    self._init_comms()
    null_stream = chainer.cuda.Stream.null

    params = _memory_utility.extract_params_set_data(model)
    dtype = params[0].data.dtype
    itemsize = dtype.itemsize
    total_elems = sum(p.data.size for p in params)
    self.gpu_buffer_a.assign(total_elems * itemsize)

    # Pack -> broadcast from rank 0 -> unpack.
    _memory_utility.pack_params(
        params, itemsize, 'data', self.gpu_buffer_a, transfer_dtype=dtype)
    self.intra_nccl_comm.bcast(
        self.gpu_buffer_a.ptr(), total_elems,
        _communication_utility._get_nccl_type_id(dtype), 0,
        null_stream.ptr)
    _memory_utility.unpack_params(
        params, itemsize, 'data', self.gpu_buffer_a, transfer_dtype=dtype)
def allreduce_grad(self, model):
    """All-reduce gradients within the node, then divide by the size."""
    self._init_comms()
    stream = chainer.cuda.Stream.null

    grads = _memory_utility.extract_params_set_grad(model)
    dtype = grads[0].grad.dtype
    itemsize = dtype.itemsize
    total = sum(g.grad.size for g in grads)

    # Both staging buffers must hold the full packed gradient vector.
    byte_count = total * itemsize
    self.gpu_buffer_a.assign(byte_count)
    self.gpu_buffer_b.assign(byte_count)

    _memory_utility.pack_params(
        params=grads, itemsize=itemsize, attr_name='grad',
        buffer=self.gpu_buffer_a, transfer_dtype=dtype)
    self.intra_nccl_comm.allReduce(
        self.gpu_buffer_a.ptr(), self.gpu_buffer_b.ptr(), total,
        _communication_utility._get_nccl_type_id(dtype), nccl.NCCL_SUM,
        stream.ptr)

    # In-place division turns the global sum into a mean.
    reduced = self.gpu_buffer_b.array(total, dtype=dtype)
    reduced *= (1.0 / self.size)
    _memory_utility.unpack_params(
        params=grads, itemsize=itemsize, attr_name='grad',
        buffer=self.gpu_buffer_b, transfer_dtype=dtype)
def _allreduce_grad_async(self, model, stream):
    # Asynchronously all-reduce the gradients of ``model`` on ``stream``
    # and divide by the communicator size (mean).
    self._init_comms()
    params = _memory_utility.extract_params_set_grad(model)
    grad_dtype = _get_param_grad_dtype(params[0])
    # Communication may run in a different dtype than the grads
    # (mixed-precision allreduce) when configured on the communicator.
    if self.allreduce_grad_dtype is None:
        allreduce_grad_dtype = grad_dtype
    else:
        allreduce_grad_dtype = self.allreduce_grad_dtype
    n_elems = sum(param.grad.size for param in params)
    needs_sync = self._assign_for_allreduce_grad(grad_dtype,
                                                 allreduce_grad_dtype,
                                                 n_elems)
    # If the buffers were (re)allocated, make that work on the null
    # stream visible before touching them from a non-null stream.
    if stream != chainer.cuda.Stream.null and needs_sync:
        chainer.cuda.Stream.null.synchronize()

    # Pack grads -> buffer A, allreduce A -> B, divide B -> A, unpack A.
    self._pack_params_to_buffer(params, grad_dtype, allreduce_grad_dtype,
                                n_elems, stream)
    self.nccl_comm.allReduce(
        self.gpu_buffer_a.ptr(), self.gpu_buffer_b.ptr(), n_elems,
        _communication_utility._get_nccl_type_id(allreduce_grad_dtype),
        nccl.NCCL_SUM, stream.ptr)
    if self.div_by_size is None:
        # Compile the divide-by-size kernel once and cache it.
        self.div_by_size = chainer.cuda.cupy.ElementwiseKernel(
            '{} x'.format(allreduce_grad_dtype.name),
            '{} y'.format(allreduce_grad_dtype.name),
            'y = x*(1.0/{})'.format(self.size),
            'div_by_size')
    self.div_by_size(
        self.gpu_buffer_b.array(n_elems, dtype=allreduce_grad_dtype),
        self.gpu_buffer_a.array(n_elems, dtype=allreduce_grad_dtype),
        stream=stream)
    self._unpack_params_from_buffer(params, grad_dtype,
                                    allreduce_grad_dtype, n_elems, stream)
def bcast_data(self, model):
    """Broadcast parameter data of ``model`` from rank 0 over NCCL."""
    self._init_comms()
    params = _memory_utility.extract_params_set_data(model)
    data_dtype = _get_param_data_dtype(params[0])
    n_elems = sum(p.data.size for p in params)
    n_bytes = data_dtype.itemsize * n_elems
    # Reallocate the scratch buffer only when the required size changed.
    if self.gpu_tmp_buffer.size != n_bytes:
        self.gpu_tmp_buffer.assign(n_bytes)
    stream = chainer.cuda.Stream.null

    _memory_utility.pack_params(
        params, data_dtype.itemsize, 'data', self.gpu_tmp_buffer, stream,
        transfer_dtype=data_dtype)
    self.nccl_comm.bcast(
        self.gpu_tmp_buffer.ptr(), n_elems,
        _communication_utility._get_nccl_type_id(data_dtype), 0,
        stream.ptr)
    _memory_utility.unpack_params(
        params, data_dtype.itemsize, 'data', self.gpu_tmp_buffer, stream,
        transfer_dtype=data_dtype)
def _batched_unpack_params(params_data, buffer, dtype):
    """Scatter a packed device buffer back into the individual params.

    Launches one CUDA kernel with one thread per packed element.
    """
    total = params_data.n_elems
    nccl_dtype = _communication_utility._get_nccl_type_id(dtype)
    threads_per_block = 128
    # Ceiling division: enough blocks to cover every element.
    grid_size = -(-total // threads_per_block)
    kernel = _cupy_batched_unpack_params()
    kernel((grid_size, ), (threads_per_block, ),
           (buffer.memory.ptr, nccl_dtype, total,
            params_data.dptr, params_data.dtype, params_data.size_csum,
            params_data.n_params))
def _batched_unpack_params(params_data, buffer, dtype):
    # Scatter the packed device ``buffer`` back into the individual
    # parameter arrays described by ``params_data`` with a single kernel.
    n_params = params_data.n_params
    n_elems = params_data.n_elems
    # Per-param metadata already resident on the device:
    params_dptr = params_data.dptr              # device pointers
    params_dtype = params_data.dtype            # NCCL dtype ids
    params_size_csum = params_data.size_csum    # cumulative element offsets
    buf_dtype = _communication_utility._get_nccl_type_id(dtype)
    n_threads = 128
    # One thread per element, rounded up to whole blocks.
    n_blocks = (n_elems + n_threads - 1) // n_threads
    _cupy_batched_unpack_params()(
        (n_blocks, ), (n_threads, ),
        (buffer.memory.ptr, buf_dtype, n_elems,
         params_dptr, params_dtype, params_size_csum, n_params))
def _batched_unpack_params(params_data, buffer, dtype, stream=None):
    """Scatter a packed device buffer back into the individual params.

    The kernel runs on ``stream`` (the current CuPy stream when not
    given), one thread per packed element.
    """
    total = params_data.n_elems
    nccl_dtype = _communication_utility._get_nccl_type_id(dtype)
    block = 128
    grid = -(-total // block)  # ceil(total / block)
    if stream is None:
        stream = cp.cuda.get_current_stream()
    with stream:
        _cupy_batched_unpack_params()(
            (grid, ), (block, ),
            (buffer.memory.ptr, nccl_dtype, total,
             params_data.dptr, params_data.dtype,
             params_data.size_csum, params_data.n_params))
def __init__(self, params, attr_name):
    """Record device pointers, NCCL dtype ids and offsets of ``params``.

    Only float16/float32 arrays are supported; anything else raises
    ``ValueError``.
    """
    n_params = len(params)
    dptrs = np.empty(n_params, dtype=np.int64)
    type_ids = np.empty(n_params, dtype=np.int32)
    offsets = np.empty(n_params + 1, dtype=np.int32)
    offsets[0] = 0
    for i, param in enumerate(params):
        v = getattr(param, attr_name)
        dptrs[i] = v.data.ptr
        if v.dtype not in [np.float16, np.float32]:
            raise ValueError('dtype must be float16 or float32.')
        type_ids[i] = _communication_utility._get_nccl_type_id(v.dtype)
        offsets[i + 1] = offsets[i] + v.size
    self.n_params = n_params
    self.n_elems = offsets[n_params]
    # Mirror the metadata onto the device for the batched kernels.
    self.size_csum = chainer.cuda.cupy.asarray(offsets)
    self.dtype = chainer.cuda.cupy.asarray(type_ids)
    self.dptr = chainer.cuda.cupy.asarray(dptrs)
def __init__(self, params, attr_name):
    # Collect flat metadata (device pointer, NCCL dtype id, cumulative
    # element offset) for the ``attr_name`` array of every parameter,
    # then mirror it onto the GPU for the batched pack/unpack kernels.
    n_params = len(params)
    params_dptr = np.empty(n_params, dtype=np.int64)
    params_dtype = np.empty(n_params, dtype=np.int32)
    params_size_csum = np.empty(n_params+1, dtype=np.int32)
    params_size_csum[0] = 0
    for i, param in enumerate(params):
        v = getattr(param, attr_name)
        params_dptr[i] = v.data.ptr
        # Only FP16/FP32 arrays are supported by this code path.
        if v.dtype not in [np.float16, np.float32]:
            raise ValueError('dtype must be float16 or float32.')
        params_dtype[i] = _communication_utility._get_nccl_type_id(v.dtype)
        params_size_csum[i+1] = params_size_csum[i] + v.size
    self.n_params = n_params
    # Total number of elements across all params.
    self.n_elems = params_size_csum[n_params]
    # Device-side copies consumed by the CUDA kernels.
    self.size_csum = chainer.cuda.cupy.asarray(params_size_csum)
    self.dtype = chainer.cuda.cupy.asarray(params_dtype)
    self.dptr = chainer.cuda.cupy.asarray(params_dptr)
def _allreduce_grad_async(self, model, stream):
    # Asynchronously all-reduce the gradients of ``model`` on ``stream``
    # and divide by the communicator size (mean).
    self._init_comms()
    params = _memory_utility.extract_params_set_grad(model)

    # NOTE: we need to explicitly check `is None` , because
    # numpy's dtype object is evaluated to False in numpy <= 1.12.1
    if self.allreduce_grad_dtype is not None:
        allreduce_grad_dtype = self.allreduce_grad_dtype
    else:
        allreduce_grad_dtype = chainer.get_dtype()
    assert allreduce_grad_dtype is not None

    n_elems = sum(param.grad.size for param in params)
    needs_sync = self._prepare_allreduce_pack_buffer(allreduce_grad_dtype,
                                                     n_elems)
    # If the pack buffers were (re)allocated on the null stream, make
    # that visible before using them on a non-null stream.
    if stream != chainer.cuda.Stream.null and needs_sync:
        chainer.cuda.Stream.null.synchronize()

    # pack grads from params -> buffer A
    self._pack_params_to_buffer(params, allreduce_grad_dtype, stream)

    # Allreduce from buffer A -> buffer B
    self.nccl_comm.allReduce(self.gpu_buffer_a.ptr(),
                             self.gpu_buffer_b.ptr(), n_elems,
                             _communication_utility._get_nccl_type_id(
                                 allreduce_grad_dtype),
                             nccl.NCCL_SUM,
                             stream.ptr)

    # div by comm_size from buffer B -> buffer A
    if self.div_by_size is None:
        # Compile the divide-by-size kernel once and cache it.
        self.div_by_size = chainer.cuda.cupy.ElementwiseKernel(
            '{} x'.format(allreduce_grad_dtype.name),
            '{} y'.format(allreduce_grad_dtype.name),
            'y = x*(1.0/{})'.format(self.size),
            'div_by_size')
    self.div_by_size(
        self.gpu_buffer_b.array(n_elems, dtype=allreduce_grad_dtype),
        self.gpu_buffer_a.array(n_elems, dtype=allreduce_grad_dtype),
        stream=stream)

    # unpack params from buffer A -> params
    self._unpack_params_from_buffer(params, allreduce_grad_dtype, stream)
def bcast_data(self, model):
    """Broadcast parameter data of ``model`` from intra-node rank 0."""
    self._init_comms()
    stream = chainer.cuda.Stream.null

    params = _memory_utility.extract_params_set_data(model)
    dtype = params[0].data.dtype
    n_elems_total = 0
    for param in params:
        n_elems_total += param.data.size
    # One contiguous staging buffer for all parameter data.
    self.gpu_buffer_a.assign(n_elems_total * dtype.itemsize)

    _memory_utility.pack_params(params, 'data', self.gpu_buffer_a, dtype)
    self.intra_nccl_comm.bcast(
        self.gpu_buffer_a.ptr(), n_elems_total,
        _communication_utility._get_nccl_type_id(dtype), 0, stream.ptr)
    _memory_utility.unpack_params(params, 'data', self.gpu_buffer_a, dtype)
def bcast_data(self, model):
    """Broadcast parameter data of ``model`` from rank 0 via NCCL."""
    self._init_comms()
    params = _memory_utility.extract_params_set_data(model)
    data_dtype = _get_param_data_dtype(params[0])
    n_elems = sum(param.data.size for param in params)
    required_bytes = n_elems * data_dtype.itemsize
    # Only grow/shrink the scratch buffer when the size actually changed.
    if self.gpu_tmp_buffer.size != required_bytes:
        self.gpu_tmp_buffer.assign(required_bytes)
    stream = chainer.cuda.Stream.null

    itemsize = data_dtype.itemsize
    type_id = _communication_utility._get_nccl_type_id(data_dtype)
    _memory_utility.pack_params(
        params, itemsize, 'data', self.gpu_tmp_buffer, stream,
        transfer_dtype=data_dtype)
    self.nccl_comm.bcast(self.gpu_tmp_buffer.ptr(), n_elems, type_id, 0,
                         stream.ptr)
    _memory_utility.unpack_params(
        params, itemsize, 'data', self.gpu_tmp_buffer, stream,
        transfer_dtype=data_dtype)
def __init__(self, params, attr_name, zero_fill):
    # Collect flat metadata (device pointer, NCCL dtype id, cumulative
    # element offset) for the ``attr_name`` array of every parameter and
    # mirror it onto the GPU for the batched pack/unpack kernels.
    # ``zero_fill`` replaces missing gradients with zero arrays so every
    # parameter participates in the batched transfer.
    n_params = len(params)
    params_dptr = np.empty(n_params, dtype=np.int64)
    params_dtype = np.empty(n_params, dtype=np.int32)
    params_size_csum = np.empty(n_params + 1, dtype=np.int32)
    params_size_csum[0] = 0
    for i, param in enumerate(params):
        v = getattr(param, attr_name)
        if attr_name == 'grad' and v is None and zero_fill:
            # Materialize an all-zero gradient in place of a missing one.
            v = param.xp.zeros_like(param.data)
            setattr(param, attr_name, v)
        # assumes v is a cupy ndarray (has .data.ptr) — TODO confirm
        params_dptr[i] = v.data.ptr
        if v.dtype not in [np.float16, np.float32, np.float64]:
            raise ValueError('dtype must be float16, float32 or float64.')
        params_dtype[i] = _communication_utility._get_nccl_type_id(v.dtype)
        params_size_csum[i + 1] = params_size_csum[i] + v.size
    self.n_params = n_params
    # Total number of elements across all params.
    self.n_elems = params_size_csum[n_params]
    # Device-side copies consumed by the CUDA kernels.
    self.size_csum = chainer.cuda.cupy.asarray(params_size_csum)
    self.dtype = chainer.cuda.cupy.asarray(params_dtype)
    self.dptr = chainer.cuda.cupy.asarray(params_dptr)
def _multi_node_mean_nccl(self, sendbuf, recvbuf,
                          n_elems, dtype, stream=None):
    """Compute mean of each element on each processes with NCCL.

    The function computes the mean of each element in ``sendbuf`` across
    all processes. The result is stored in ``recvbuf``. NCCL is used for
    communication.

    Args:
        sendbuf (numpy/cupy array): Input arrays.
        recvbuf (numpy/cupy array): Output arrays.
        n_elems (int): the number of elements in `sendbuf`.
        dtype: Data type of elements used in All-Reduce.
        stream: CUDA stream used for All-Reduce.
    """
    # BUG FIX: resolve the default stream *before* any use.  The original
    # code dereferenced ``stream`` in the debug block while it could
    # still be None, raising AttributeError under chainer.is_debug().
    if stream is None:
        stream = chainer.cuda.Stream.null
    if chainer.is_debug():
        # Debug-only sanity check: wait for pending work, then validate
        # the buffers before communicating.
        stream.synchronize()
        array_a = sendbuf.array(n_elems, dtype=dtype)
        array_b = recvbuf.array(n_elems, dtype=dtype)
        self._check_ready_to_allreduce(array_a, array_b)

    self._init_comms()
    type_id = _communication_utility._get_nccl_type_id(dtype)
    # Sum across processes: sendbuf -> recvbuf.
    self.nccl_comm.allReduce(sendbuf.ptr(), recvbuf.ptr(), n_elems,
                             type_id, nccl.NCCL_SUM, stream.ptr)
    # In-place division turns the sum into a mean.
    div_by_size = chainer.cuda.elementwise(
        '',
        '{} x'.format(dtype.name),
        'x *= (1.0/{})'.format(self.size),
        'div_by_size')
    div_by_size(
        recvbuf.array(n_elems, dtype=dtype),
        stream=stream)

    if chainer.is_debug():
        stream.synchronize()
        self._ensure_all_finite(recvbuf.array(n_elems, dtype=dtype))
def _allreduce_grad_async(self, model, stream):
    # Asynchronously all-reduce the gradients of ``model`` on ``stream``
    # and divide by the communicator size (mean).
    self._init_comms()
    params = _memory_utility.extract_params_set_grad(model)
    grad_dtype = _get_param_grad_dtype(params[0])
    # Communication may run in a different dtype than the grads
    # (mixed-precision allreduce) when configured on the communicator.
    if self.allreduce_grad_dtype is None:
        allreduce_grad_dtype = grad_dtype
    else:
        allreduce_grad_dtype = self.allreduce_grad_dtype
    n_elems = sum(param.grad.size for param in params)
    needs_sync = self._assign_for_allreduce_grad(grad_dtype,
                                                 allreduce_grad_dtype,
                                                 n_elems)
    # If the buffers were (re)allocated, make that null-stream work
    # visible before touching them from a non-null stream.
    if stream != chainer.cuda.Stream.null and needs_sync:
        chainer.cuda.Stream.null.synchronize()

    # Pack grads -> buffer A, allreduce A -> B, divide B -> A, unpack A.
    self._pack_params_to_buffer(params, grad_dtype, allreduce_grad_dtype,
                                n_elems, stream)
    self.nccl_comm.allReduce(self.gpu_buffer_a.ptr(),
                             self.gpu_buffer_b.ptr(), n_elems,
                             _communication_utility._get_nccl_type_id(
                                 allreduce_grad_dtype),
                             nccl.NCCL_SUM, stream.ptr)
    if self.div_by_size is None:
        # Compile the divide-by-size kernel once and cache it.
        self.div_by_size = chainer.cuda.cupy.ElementwiseKernel(
            '{} x'.format(allreduce_grad_dtype.name),
            '{} y'.format(allreduce_grad_dtype.name),
            'y = x*(1.0/{})'.format(self.size),
            'div_by_size')
    self.div_by_size(
        self.gpu_buffer_b.array(n_elems, dtype=allreduce_grad_dtype),
        self.gpu_buffer_a.array(n_elems, dtype=allreduce_grad_dtype),
        stream=stream)
    self._unpack_params_from_buffer(params, grad_dtype,
                                    allreduce_grad_dtype, n_elems, stream)
def check_allreduce_grad_mixed_dtype(param, model, use_gpu):
    # Checks the actual allreduce communication is performed
    # in the correct data type (FP16 or FP32)
    comm_class = param.communicator_class

    if not param.multi_node:
        ranks = _communication_utility.init_ranks(mpi_comm)
        # ranks[4] is the inter-node size — presumably; verify against
        # init_ranks' return layout.
        inter_size = ranks[4]
        if inter_size > 1:
            pytest.skip('This test is for single node only')

    if comm_class is PureNcclCommunicator:
        # Only PureNcclCommunicator accepts an explicit allreduce dtype.
        communicator = comm_class(
            mpi_comm, allreduce_grad_dtype=param.allreduce_grad_dtype,
            batched_copy=param.batched_copy)
    else:
        communicator = comm_class(mpi_comm)
    mpi_comm.barrier()

    # answer type: see the document of `create_communicator`
    global_dtype = param.global_dtype
    allreduce_dtype = param.allreduce_grad_dtype

    # assert test configuration.
    assert chainer.get_dtype() == global_dtype

    # Expected communication dtype: an explicit allreduce dtype wins,
    # otherwise it follows the global dtype (FP32 stays FP32, everything
    # else communicates in FP16).
    answer_dtype = None
    if allreduce_dtype == np.float16:
        answer_dtype = np.float16
    elif allreduce_dtype == np.float32:
        answer_dtype = np.float32
    else:
        if global_dtype == np.float32:
            answer_dtype = np.float32
        else:
            answer_dtype = np.float16

    if use_gpu:
        model.to_gpu()

    # Give each rank distinct gradient values so the averaged result is
    # predictable below.
    model.a.W.grad[:] = communicator.rank
    model.b.W.grad[:] = communicator.rank + 1
    model.c.b.grad[:] = communicator.rank + 2

    if isinstance(communicator, PureNcclCommunicator):
        communicator._init_comms()
        # Wrap the NCCL communicator so the allReduce call arguments can
        # be inspected after the fact.
        with mock.patch.object(communicator, 'nccl_comm',
                               wraps=communicator.nccl_comm) as mc:
            answer_dtype = _communication_utility._get_nccl_type_id(
                answer_dtype)

            communicator.allreduce_grad(model)

            # dtype that was used in the actual communication,
            # which is nccl_comm.allReduce
            call_args = mc.allReduce.call_args[0]
            actual_dtype = call_args[3]
            assert answer_dtype == actual_dtype
    else:
        # For other MPI-based communicators,
        # all communication should happen in FP32 as of now, so
        # here we just check the results are correct for
        # 16-32 mixed models.
        communicator.allreduce_grad(model)

    # Each rank contributed rank+k; the mean over size ranks is
    # (size-1)/2 + k.
    base = (communicator.size - 1.0) / 2
    chainer.testing.assert_allclose(model.a.W.grad,
                                    (base + 0) * np.ones((3, 2)))
    chainer.testing.assert_allclose(model.b.W.grad,
                                    (base + 1) * np.ones((4, 3)))

    mpi_comm.barrier()
    destroy_communicator(communicator)
def check_allreduce_grad_mixed_dtype(param, model, use_gpu):
    # Checks the actual allreduce communication is performed
    # in the correct data type (FP16 or FP32)
    comm_class = param.communicator_class

    if not param.multi_node:
        ranks = _communication_utility.init_ranks(mpi_comm)
        # ranks[4] is the inter-node size — presumably; verify against
        # init_ranks' return layout.
        inter_size = ranks[4]
        if inter_size > 1:
            pytest.skip('This test is for single node only')

    if comm_class is PureNcclCommunicator:
        # Only PureNcclCommunicator accepts an explicit allreduce dtype.
        communicator = comm_class(
            mpi_comm, allreduce_grad_dtype=param.allreduce_grad_dtype,
            batched_copy=param.batched_copy)
    else:
        communicator = comm_class(mpi_comm)
    mpi_comm.barrier()

    # answer type: see the document of `create_communicator`
    global_dtype = param.global_dtype
    allreduce_dtype = param.allreduce_grad_dtype

    # assert test configuration.
    assert chainer.get_dtype() == global_dtype

    # Expected communication dtype: an explicit allreduce dtype wins,
    # otherwise it follows the global dtype (FP32 stays FP32, everything
    # else communicates in FP16).
    answer_dtype = None
    if allreduce_dtype == np.float16:
        answer_dtype = np.float16
    elif allreduce_dtype == np.float32:
        answer_dtype = np.float32
    else:
        if global_dtype == np.float32:
            answer_dtype = np.float32
        else:
            answer_dtype = np.float16

    if use_gpu:
        model.to_gpu()

    # Give each rank distinct gradient values so the averaged result is
    # predictable below.
    model.a.W.grad[:] = communicator.rank
    model.b.W.grad[:] = communicator.rank + 1
    model.c.b.grad[:] = communicator.rank + 2

    if isinstance(communicator, PureNcclCommunicator):
        communicator._init_comms()
        # Wrap the NCCL communicator so the allReduce call arguments can
        # be inspected after the fact.
        with mock.patch.object(communicator, 'nccl_comm',
                               wraps=communicator.nccl_comm) as mc:
            answer_dtype = _communication_utility._get_nccl_type_id(
                answer_dtype)

            communicator.allreduce_grad(model)

            # dtype that was used in the actual communication,
            # which is nccl_comm.allReduce
            call_args = mc.allReduce.call_args[0]
            actual_dtype = call_args[3]
            assert answer_dtype == actual_dtype
    else:
        # For other MPI-based communicators,
        # all communication should happen in FP32 as of now, so
        # here we just check the results are correct for
        # 16-32 mixed models.
        communicator.allreduce_grad(model)

    # Each rank contributed rank+k; the mean over size ranks is
    # (size-1)/2 + k.
    base = (communicator.size - 1.0) / 2
    chainer.testing.assert_allclose(model.a.W.grad,
                                    (base + 0) * np.ones((3, 2)))
    chainer.testing.assert_allclose(model.b.W.grad,
                                    (base + 1) * np.ones((4, 3)))

    mpi_comm.barrier()
    destroy_communicator(communicator)