def multi_node_mean_nccl(self,
                             gpu_buffer_a,
                             gpu_buffer_b,
                             n_elems,
                             dtype,
                             stream=None):
        # Performs an allreduce followed by division by the communicator
        # size, i.e. computes the mean:
        #   gpu_buffer_a = sum(gpu_buffer_a over all processes) / self.size
        # gpu_buffer_b is used only as scratch space.
        # Resolve the default stream first so the debug synchronization
        # below does not dereference None.
        if stream is None:
            stream = chainer.cuda.Stream.null

        if chainer.is_debug():
            stream.synchronize()
            array_a = gpu_buffer_a.array(n_elems, dtype=dtype)
            array_b = gpu_buffer_b.array(n_elems, dtype=dtype)
            self.check_ready_to_allreduce(array_a, array_b)

        self._init_comms()
        type_id = _communication_utility._get_nccl_type_id(dtype)
        self.nccl_comm.allReduce(gpu_buffer_a.ptr(), gpu_buffer_b.ptr(),
                                 n_elems, type_id, nccl.NCCL_SUM, stream.ptr)
        div_by_size = chainer.cuda.cupy.ElementwiseKernel(
            '{} x'.format(dtype.name), '{} y'.format(dtype.name),
            'y = x*(1.0/{})'.format(self.size), 'div_by_size')
        div_by_size(gpu_buffer_b.array(n_elems, dtype=dtype),
                    gpu_buffer_a.array(n_elems, dtype=dtype),
                    stream=stream)

        if chainer.is_debug():
            stream.synchronize()
            self.ensure_all_finite(gpu_buffer_a.array(n_elems, dtype=dtype))
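The snippet above builds a one-off CuPy ElementwiseKernel to divide the summed buffer by the communicator size. Below is a minimal, self-contained sketch of that div_by_size pattern; it assumes CuPy is available and uses a hypothetical size in place of self.size.

import cupy as cp

size = 4  # hypothetical communicator size, stands in for self.size
div_by_size = cp.ElementwiseKernel(
    'float32 x', 'float32 y',
    'y = x * (1.0 / {})'.format(size), 'div_by_size')

x = cp.arange(8, dtype=cp.float32)   # pretend this is the allreduced sum
y = cp.empty_like(x)
div_by_size(x, y)                    # y now holds the element-wise mean
assert cp.allclose(y, x / size)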
Example #2
    def __init__(self, params, attr_name, zero_fill):
        n_params = len(params)
        params_dptr = np.empty(n_params, dtype=np.int64)
        params_dtype = np.empty(n_params, dtype=np.int32)
        params_size_csum = np.empty(n_params + 1, dtype=np.int32)
        params_size_csum[0] = 0
        for i, param in enumerate(params):
            v = getattr(param, attr_name)
            if attr_name == 'grad' and v is None and zero_fill:
                v = param.xp.zeros_like(param.data)
                setattr(param, attr_name, v)
            xp = chainer.backend.get_array_module(v)

            if xp == cp:
                v_data = v.data
            elif xp == chx:
                v_data = _get_memory_pointer_from_chainerx(v)
            else:
                raise ValueError(
                    '{} is from an unsupported array module'.format(type(v)))

            params_dptr[i] = v_data.ptr
            if v.dtype not in [np.float16, np.float32, np.float64]:
                raise ValueError('dtype must be float16, float32 or float64.')
            params_dtype[i] = _communication_utility._get_nccl_type_id(v.dtype)
            params_size_csum[i + 1] = params_size_csum[i] + v.size
        self.n_params = n_params
        self.n_elems = params_size_csum[n_params]
        self.size_csum = chainer.cuda.cupy.asarray(params_size_csum)
        self.dtype = chainer.cuda.cupy.asarray(params_dtype)
        self.dptr = chainer.cuda.cupy.asarray(params_dptr)
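The cumulative-size array built above is what lets the batched pack/unpack kernels map a flat buffer offset back to the parameter that owns it. A small host-side sketch of that bookkeeping, using made-up parameter sizes:

import numpy as np

sizes = [6, 4, 10]   # hypothetical per-parameter element counts
csum = np.zeros(len(sizes) + 1, dtype=np.int32)
csum[1:] = np.cumsum(sizes)

# Parameter i occupies the half-open slice [csum[i], csum[i + 1]) of the
# flat packed buffer, so any element index can be mapped back to its owner.
for i in range(len(sizes)):
    print(i, csum[i], csum[i + 1])   # -> (0, 0, 6), (1, 6, 10), (2, 10, 20)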
Example #3
    def allreduce_grad(self, model):
        self._init_comms()
        stream = chainer.cuda.Stream.null
        params = _memory_utility.extract_params_set_grad(model)

        dtype = params[0].grad.dtype
        itemsize = dtype.itemsize
        n_elems_total = sum(param.grad.size for param in params)
        n_bytes_total = n_elems_total * itemsize
        self.gpu_buffer_a.assign(n_bytes_total)
        self.gpu_buffer_b.assign(n_bytes_total)

        _memory_utility.pack_params(params, 'grad', self.gpu_buffer_a, dtype)

        if chainer.is_debug():
            stream.synchronize()
            array_a = self.gpu_buffer_a.array(n_elems_total)
            array_b = self.gpu_buffer_b.array(n_elems_total)
            self.check_ready_to_allreduce(array_a, array_b)

        # Same as PureNcclCommunicator's multi_node_mean, but kept inline as-is.
        self.intra_nccl_comm.allReduce(
            self.gpu_buffer_a.ptr(), self.gpu_buffer_b.ptr(), n_elems_total,
            _communication_utility._get_nccl_type_id(dtype), nccl.NCCL_SUM,
            stream.ptr)

        arr = self.gpu_buffer_b.array(n_elems_total, dtype=dtype)
        arr *= (1.0 / self.size)

        if chainer.is_debug():
            stream.synchronize()
            self.ensure_all_finite(arr)

        _memory_utility.unpack_params(params, 'grad', self.gpu_buffer_b, dtype)
    def allreduce_grad(self, model):
        self._init_comms()
        stream = chainer.cuda.Stream.null
        params = _memory_utility.extract_params_set_grad(model)

        dtype = params[0].grad.dtype
        itemsize = dtype.itemsize
        n_elems_total = sum(param.grad.size for param in params)
        n_bytes_total = n_elems_total * itemsize
        self.gpu_buffer_a.assign(n_bytes_total)
        self.gpu_buffer_b.assign(n_bytes_total)

        _memory_utility.pack_params(params,
                                    itemsize,
                                    'grad',
                                    self.gpu_buffer_a,
                                    transfer_dtype=dtype)

        self.intra_nccl_comm.allReduce(
            self.gpu_buffer_a.ptr(), self.gpu_buffer_b.ptr(), n_elems_total,
            _communication_utility._get_nccl_type_id(dtype), nccl.NCCL_SUM,
            stream.ptr)

        arr = self.gpu_buffer_b.array(n_elems_total, dtype=dtype)
        arr *= (1.0 / self.size)

        _memory_utility.unpack_params(params,
                                      itemsize,
                                      'grad',
                                      self.gpu_buffer_b,
                                      transfer_dtype=dtype)
    def bcast_data(self, model):
        self._init_comms()
        stream = chainer.cuda.Stream.null

        params = _memory_utility.extract_params_set_data(model)

        dtype = params[0].data.dtype
        itemsize = dtype.itemsize
        n_elems_total = sum(param.data.size for param in params)
        n_bytes_total = n_elems_total * itemsize
        self.gpu_buffer_a.assign(n_bytes_total)

        _memory_utility.pack_params(params,
                                    itemsize,
                                    'data',
                                    self.gpu_buffer_a,
                                    transfer_dtype=dtype)

        self.intra_nccl_comm.bcast(
            self.gpu_buffer_a.ptr(), n_elems_total,
            _communication_utility._get_nccl_type_id(dtype), 0, stream.ptr)

        _memory_utility.unpack_params(params,
                                      itemsize,
                                      'data',
                                      self.gpu_buffer_a,
                                      transfer_dtype=dtype)
    def allreduce_grad(self, model):
        self._init_comms()
        stream = chainer.cuda.Stream.null
        params = _memory_utility.extract_params_set_grad(model)

        dtype = params[0].grad.dtype
        itemsize = dtype.itemsize
        n_elems_total = sum(param.grad.size for param in params)
        n_bytes_total = n_elems_total * itemsize
        self.gpu_buffer_a.assign(n_bytes_total)
        self.gpu_buffer_b.assign(n_bytes_total)

        _memory_utility.pack_params(
            params, itemsize, 'grad', self.gpu_buffer_a, transfer_dtype=dtype)

        self.intra_nccl_comm.allReduce(
            self.gpu_buffer_a.ptr(), self.gpu_buffer_b.ptr(), n_elems_total,
            _communication_utility._get_nccl_type_id(dtype),
            nccl.NCCL_SUM, stream.ptr)

        arr = self.gpu_buffer_b.array(n_elems_total, dtype=dtype)
        arr *= (1.0 / self.size)

        _memory_utility.unpack_params(
            params, itemsize, 'grad', self.gpu_buffer_b, transfer_dtype=dtype)
    def _allreduce_grad_async(self, model, stream):
        self._init_comms()
        params = _memory_utility.extract_params_set_grad(model)
        grad_dtype = _get_param_grad_dtype(params[0])
        if self.allreduce_grad_dtype is None:
            allreduce_grad_dtype = grad_dtype
        else:
            allreduce_grad_dtype = self.allreduce_grad_dtype
        n_elems = sum(param.grad.size for param in params)
        needs_sync = self._assign_for_allreduce_grad(grad_dtype,
                                                     allreduce_grad_dtype,
                                                     n_elems)
        if stream != chainer.cuda.Stream.null and needs_sync:
            chainer.cuda.Stream.null.synchronize()

        self._pack_params_to_buffer(params, grad_dtype, allreduce_grad_dtype,
                                    n_elems, stream)
        self.nccl_comm.allReduce(
            self.gpu_buffer_a.ptr(), self.gpu_buffer_b.ptr(), n_elems,
            _communication_utility._get_nccl_type_id(allreduce_grad_dtype),
            nccl.NCCL_SUM, stream.ptr)
        if self.div_by_size is None:
            self.div_by_size = chainer.cuda.cupy.ElementwiseKernel(
                '{} x'.format(allreduce_grad_dtype.name),
                '{} y'.format(allreduce_grad_dtype.name),
                'y = x*(1.0/{})'.format(self.size), 'div_by_size')
        self.div_by_size(self.gpu_buffer_b.array(n_elems,
                                                 dtype=allreduce_grad_dtype),
                         self.gpu_buffer_a.array(n_elems,
                                                 dtype=allreduce_grad_dtype),
                         stream=stream)
        self._unpack_params_from_buffer(params, grad_dtype,
                                        allreduce_grad_dtype, n_elems, stream)
    def bcast_data(self, model):
        self._init_comms()
        params = _memory_utility.extract_params_set_data(model)
        data_dtype = _get_param_data_dtype(params[0])
        n_elems = sum(param.data.size for param in params)
        data_grad_n_bytes = data_dtype.itemsize * n_elems
        if self.gpu_tmp_buffer.size != data_grad_n_bytes:
            self.gpu_tmp_buffer.assign(data_grad_n_bytes)
        stream = chainer.cuda.Stream.null

        _memory_utility.pack_params(params,
                                    data_dtype.itemsize,
                                    'data',
                                    self.gpu_tmp_buffer,
                                    stream,
                                    transfer_dtype=data_dtype)
        self.nccl_comm.bcast(
            self.gpu_tmp_buffer.ptr(), n_elems,
            _communication_utility._get_nccl_type_id(data_dtype), 0,
            stream.ptr)
        _memory_utility.unpack_params(params,
                                      data_dtype.itemsize,
                                      'data',
                                      self.gpu_tmp_buffer,
                                      stream,
                                      transfer_dtype=data_dtype)
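All of the allreduce_grad and bcast_data variants above rely on the same shape-erasing step: every parameter's array is packed into one flat, contiguous GPU buffer before communication and unpacked afterwards. A host-side sketch of that pack/unpack round trip with NumPy (array shapes are made up):

import numpy as np

grads = [np.random.randn(3, 2).astype(np.float32),
         np.random.randn(4, 3).astype(np.float32)]

# Pack: concatenate the flattened gradients into one contiguous buffer.
flat = np.concatenate([g.ravel() for g in grads])

# ... the allreduce would operate on `flat` here ...

# Unpack: slice the buffer back into the original shapes.
offsets = np.cumsum([0] + [g.size for g in grads])
restored = [flat[offsets[i]:offsets[i + 1]].reshape(g.shape)
            for i, g in enumerate(grads)]
assert all(np.array_equal(a, b) for a, b in zip(grads, restored))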
Example #10
def _batched_unpack_params(params_data, buffer, dtype):
    n_params = params_data.n_params
    n_elems = params_data.n_elems
    params_dptr = params_data.dptr
    params_dtype = params_data.dtype
    params_size_csum = params_data.size_csum
    buf_dtype = _communication_utility._get_nccl_type_id(dtype)
    n_threads = 128
    n_blocks = (n_elems + n_threads - 1) // n_threads
    _cupy_batched_unpack_params()(
        (n_blocks, ), (n_threads, ),
        (buffer.memory.ptr, buf_dtype, n_elems,
         params_dptr, params_dtype, params_size_csum, n_params))
Example #12
def _batched_unpack_params(params_data, buffer, dtype, stream=None):
    n_params = params_data.n_params
    n_elems = params_data.n_elems
    params_dptr = params_data.dptr
    params_dtype = params_data.dtype
    params_size_csum = params_data.size_csum
    buf_dtype = _communication_utility._get_nccl_type_id(dtype)
    n_threads = 128
    n_blocks = (n_elems + n_threads - 1) // n_threads
    if stream is None:
        stream = cp.cuda.get_current_stream()
    with stream:
        _cupy_batched_unpack_params()(
            (n_blocks, ), (n_threads, ),
            (buffer.memory.ptr, buf_dtype, n_elems, params_dptr, params_dtype,
             params_size_csum, n_params))
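The grid sizing above follows the usual ceiling-division pattern: one CUDA thread per element, rounded up to whole blocks of n_threads. A quick host-side check of that arithmetic:

n_threads = 128
for n_elems in (1, 127, 128, 129, 1000):
    n_blocks = (n_elems + n_threads - 1) // n_threads   # ceil(n_elems / n_threads)
    assert n_blocks * n_threads >= n_elems
    assert (n_blocks - 1) * n_threads < n_elems
    print(n_elems, '->', n_blocks, 'block(s)')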
Example #13
    def __init__(self, params, attr_name):
        n_params = len(params)
        params_dptr = np.empty(n_params, dtype=np.int64)
        params_dtype = np.empty(n_params, dtype=np.int32)
        params_size_csum = np.empty(n_params + 1, dtype=np.int32)
        params_size_csum[0] = 0
        for i, param in enumerate(params):
            v = getattr(param, attr_name)
            params_dptr[i] = v.data.ptr
            if v.dtype not in [np.float16, np.float32]:
                raise ValueError('dtype must be float16 or float32.')
            params_dtype[i] = _communication_utility._get_nccl_type_id(v.dtype)
            params_size_csum[i + 1] = params_size_csum[i] + v.size
        self.n_params = n_params
        self.n_elems = params_size_csum[n_params]
        self.size_csum = chainer.cuda.cupy.asarray(params_size_csum)
        self.dtype = chainer.cuda.cupy.asarray(params_dtype)
        self.dptr = chainer.cuda.cupy.asarray(params_dptr)
Example #15
    def _allreduce_grad_async(self, model, stream):
        self._init_comms()
        params = _memory_utility.extract_params_set_grad(model)

        # NOTE: we need to explicitly check `is None` because numpy's dtype
        # object evaluates to False in numpy <= 1.12.1.
        if self.allreduce_grad_dtype is not None:
            allreduce_grad_dtype = self.allreduce_grad_dtype
        else:
            allreduce_grad_dtype = chainer.get_dtype()

        assert allreduce_grad_dtype is not None

        n_elems = sum(param.grad.size for param in params)
        needs_sync = self._prepare_allreduce_pack_buffer(allreduce_grad_dtype,
                                                         n_elems)
        if stream != chainer.cuda.Stream.null and needs_sync:
            chainer.cuda.Stream.null.synchronize()

        # pack grads from params -> buffer A
        self._pack_params_to_buffer(params, allreduce_grad_dtype, stream)
        # Allreduce from buffer A -> buffer B
        self.nccl_comm.allReduce(self.gpu_buffer_a.ptr(),
                                 self.gpu_buffer_b.ptr(), n_elems,
                                 _communication_utility._get_nccl_type_id(
                                     allreduce_grad_dtype),
                                 nccl.NCCL_SUM,
                                 stream.ptr)
        # div by comm_size from buffer B -> buffer A
        if self.div_by_size is None:
            self.div_by_size = chainer.cuda.cupy.ElementwiseKernel(
                '{} x'.format(allreduce_grad_dtype.name),
                '{} y'.format(allreduce_grad_dtype.name),
                'y = x*(1.0/{})'.format(self.size), 'div_by_size')
        self.div_by_size(
            self.gpu_buffer_b.array(n_elems,
                                    dtype=allreduce_grad_dtype),
            self.gpu_buffer_a.array(n_elems,
                                    dtype=allreduce_grad_dtype),
            stream=stream)

        # unpack params from buffer A -> params
        self._unpack_params_from_buffer(params, allreduce_grad_dtype, stream)
    def bcast_data(self, model):
        self._init_comms()
        stream = chainer.cuda.Stream.null

        params = _memory_utility.extract_params_set_data(model)

        dtype = params[0].data.dtype
        itemsize = dtype.itemsize
        n_elems_total = sum(param.data.size for param in params)
        n_bytes_total = n_elems_total * itemsize
        self.gpu_buffer_a.assign(n_bytes_total)

        _memory_utility.pack_params(params, 'data', self.gpu_buffer_a, dtype)

        self.intra_nccl_comm.bcast(
            self.gpu_buffer_a.ptr(), n_elems_total,
            _communication_utility._get_nccl_type_id(dtype),
            0, stream.ptr)

        _memory_utility.unpack_params(params, 'data', self.gpu_buffer_a, dtype)
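For reference, the broadcast step above has simple semantics: after bcast with root 0, every process ends up holding rank 0's packed buffer. A host-side simulation of that behaviour (no NCCL involved, and size is a hypothetical process count):

import numpy as np

size = 4
buffers = [np.full(5, rank, dtype=np.float32) for rank in range(size)]

root = 0
for rank in range(size):          # what ncclBcast with root 0 achieves
    buffers[rank][:] = buffers[root]

assert all(np.array_equal(b, buffers[root]) for b in buffers)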
Example #17
    def bcast_data(self, model):
        self._init_comms()
        params = _memory_utility.extract_params_set_data(model)
        data_dtype = _get_param_data_dtype(params[0])
        n_elems = sum(param.data.size for param in params)
        data_grad_n_bytes = data_dtype.itemsize * n_elems
        if self.gpu_tmp_buffer.size != data_grad_n_bytes:
            self.gpu_tmp_buffer.assign(data_grad_n_bytes)
        stream = chainer.cuda.Stream.null

        _memory_utility.pack_params(
            params, data_dtype.itemsize, 'data',
            self.gpu_tmp_buffer, stream, transfer_dtype=data_dtype)
        self.nccl_comm.bcast(self.gpu_tmp_buffer.ptr(), n_elems,
                             _communication_utility._get_nccl_type_id(
                                 data_dtype),
                             0, stream.ptr)
        _memory_utility.unpack_params(
            params, data_dtype.itemsize, 'data',
            self.gpu_tmp_buffer, stream, transfer_dtype=data_dtype)
Example #18
    def __init__(self, params, attr_name, zero_fill):
        n_params = len(params)
        params_dptr = np.empty(n_params, dtype=np.int64)
        params_dtype = np.empty(n_params, dtype=np.int32)
        params_size_csum = np.empty(n_params + 1, dtype=np.int32)
        params_size_csum[0] = 0
        for i, param in enumerate(params):
            v = getattr(param, attr_name)
            if attr_name == 'grad' and v is None and zero_fill:
                v = param.xp.zeros_like(param.data)
                setattr(param, attr_name, v)
            params_dptr[i] = v.data.ptr
            if v.dtype not in [np.float16, np.float32, np.float64]:
                raise ValueError('dtype must be float16, float32 or float64.')
            params_dtype[i] = _communication_utility._get_nccl_type_id(v.dtype)
            params_size_csum[i + 1] = params_size_csum[i] + v.size
        self.n_params = n_params
        self.n_elems = params_size_csum[n_params]
        self.size_csum = chainer.cuda.cupy.asarray(params_size_csum)
        self.dtype = chainer.cuda.cupy.asarray(params_dtype)
        self.dptr = chainer.cuda.cupy.asarray(params_dptr)
    def _multi_node_mean_nccl(self, sendbuf, recvbuf,
                              n_elems, dtype, stream=None):
        """Compute mean of each element on each processes with NCCL.

        The function compute mean of each element in ``sendbuf`` on each
        processes. The result is stored in ``recvbuf``. NCCL is used for
        communication.

        Args:
            sendbuf (numpy/cupy array): Input arrays.
            recvbuf (numpy/cupy array): Output arrays.
            n_elems (int): the number of elements in `sendbuf`.
            dtype: Data type of elements used in All-Reduce.
            stream: CUDA stream used for All-Reduce.

        """
        # Resolve the default stream first so the debug synchronization
        # below does not dereference None.
        if stream is None:
            stream = chainer.cuda.Stream.null

        if chainer.is_debug():
            stream.synchronize()
            array_a = sendbuf.array(n_elems, dtype=dtype)
            array_b = recvbuf.array(n_elems, dtype=dtype)
            self._check_ready_to_allreduce(array_a, array_b)

        self._init_comms()
        type_id = _communication_utility._get_nccl_type_id(dtype)
        self.nccl_comm.allReduce(sendbuf.ptr(),
                                 recvbuf.ptr(), n_elems,
                                 type_id, nccl.NCCL_SUM, stream.ptr)
        div_by_size = chainer.cuda.elementwise(
            '',
            '{} x'.format(dtype.name),
            'x *= (1.0/{})'.format(self.size), 'div_by_size')
        div_by_size(
            recvbuf.array(n_elems, dtype=dtype),
            stream=stream)

        if chainer.is_debug():
            stream.synchronize()
            self._ensure_all_finite(recvbuf.array(n_elems, dtype=dtype))
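Numerically, both _multi_node_mean_nccl variants reduce to the same two steps: an allreduce-sum followed by a division by the number of processes. A host-side sanity check of that identity in plain NumPy (size is a hypothetical process count):

import numpy as np

size = 4
local_grads = [np.full(3, rank, dtype=np.float32) for rank in range(size)]

summed = np.sum(local_grads, axis=0)    # what ncclAllReduce with NCCL_SUM yields
mean = summed * (1.0 / size)            # what the div_by_size kernel applies
assert np.allclose(mean, np.mean(local_grads, axis=0))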
Example #20
    def _allreduce_grad_async(self, model, stream):
        self._init_comms()
        params = _memory_utility.extract_params_set_grad(model)
        grad_dtype = _get_param_grad_dtype(params[0])
        if self.allreduce_grad_dtype is None:
            allreduce_grad_dtype = grad_dtype
        else:
            allreduce_grad_dtype = self.allreduce_grad_dtype
        n_elems = sum(param.grad.size for param in params)
        needs_sync = self._assign_for_allreduce_grad(grad_dtype,
                                                     allreduce_grad_dtype,
                                                     n_elems)
        if stream != chainer.cuda.Stream.null and needs_sync:
            chainer.cuda.Stream.null.synchronize()

        self._pack_params_to_buffer(params, grad_dtype, allreduce_grad_dtype,
                                    n_elems, stream)
        self.nccl_comm.allReduce(self.gpu_buffer_a.ptr(),
                                 self.gpu_buffer_b.ptr(), n_elems,
                                 _communication_utility._get_nccl_type_id(
                                     allreduce_grad_dtype),
                                 nccl.NCCL_SUM,
                                 stream.ptr)
        if self.div_by_size is None:
            self.div_by_size = chainer.cuda.cupy.ElementwiseKernel(
                '{} x'.format(allreduce_grad_dtype.name),
                '{} y'.format(allreduce_grad_dtype.name),
                'y = x*(1.0/{})'.format(self.size), 'div_by_size')
        self.div_by_size(
            self.gpu_buffer_b.array(n_elems,
                                    dtype=allreduce_grad_dtype),
            self.gpu_buffer_a.array(n_elems,
                                    dtype=allreduce_grad_dtype),
            stream=stream)
        self._unpack_params_from_buffer(params, grad_dtype,
                                        allreduce_grad_dtype, n_elems, stream)
Example #21
def check_allreduce_grad_mixed_dtype(param, model, use_gpu):
    # Checks that the actual allreduce communication is performed
    # in the correct data type (FP16 or FP32).
    comm_class = param.communicator_class

    if not param.multi_node:
        ranks = _communication_utility.init_ranks(mpi_comm)
        inter_size = ranks[4]
        if inter_size > 1:
            pytest.skip('This test is for single node only')

    if comm_class is PureNcclCommunicator:
        communicator = comm_class(
            mpi_comm, allreduce_grad_dtype=param.allreduce_grad_dtype,
            batched_copy=param.batched_copy)
    else:
        communicator = comm_class(mpi_comm)

    mpi_comm.barrier()

    # answer type: see the documentation of `create_communicator`
    global_dtype = param.global_dtype
    allreduce_dtype = param.allreduce_grad_dtype

    # assert test configuration.
    assert chainer.get_dtype() == global_dtype

    answer_dtype = None
    if allreduce_dtype == np.float16:
        answer_dtype = np.float16
    elif allreduce_dtype == np.float32:
        answer_dtype = np.float32
    else:
        if global_dtype == np.float32:
            answer_dtype = np.float32
        else:
            answer_dtype = np.float16

    if use_gpu:
        model.to_gpu()

    model.a.W.grad[:] = communicator.rank
    model.b.W.grad[:] = communicator.rank + 1
    model.c.b.grad[:] = communicator.rank + 2

    if isinstance(communicator, PureNcclCommunicator):
        communicator._init_comms()
        with mock.patch.object(communicator, 'nccl_comm',
                               wraps=communicator.nccl_comm) as mc:
            answer_dtype = _communication_utility._get_nccl_type_id(
                answer_dtype)

            communicator.allreduce_grad(model)

            # dtype that was used in the actual communication,
            # i.e. the one passed to nccl_comm.allReduce
            call_args = mc.allReduce.call_args[0]
            actual_dtype = call_args[3]
            assert answer_dtype == actual_dtype
    else:
        # For the other MPI-based communicators, all communication currently
        # happens in FP32, so here we just check that the results are correct
        # for FP16/FP32 mixed models.
        communicator.allreduce_grad(model)

    base = (communicator.size - 1.0) / 2
    chainer.testing.assert_allclose(model.a.W.grad,
                                    (base + 0) * np.ones((3, 2)))
    chainer.testing.assert_allclose(model.b.W.grad,
                                    (base + 1) * np.ones((4, 3)))

    mpi_comm.barrier()
    destroy_communicator(communicator)
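The expected values asserted above follow from simple arithmetic: rank r writes r (plus a per-layer offset) into its gradient, so the allreduce mean over ranks 0..size-1 is (0 + 1 + ... + (size - 1)) / size = (size - 1) / 2, which is exactly the base used in the checks. A tiny verification with a hypothetical communicator size:

size = 8
base = sum(range(size)) / float(size)   # mean of the ranks 0 .. size - 1
assert base == (size - 1) / 2.0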