Example #1
    def __init__(self, mpi_comm,
                 batched_copy=False):

        super(NonCudaAwareCommunicator, self).__init__(mpi_comm)
        if not nccl._available:
            raise RuntimeError(
                'NCCL is not available. '
                'Please confirm that NCCL is enabled in CuPy.'
            )
        if nccl.get_version() < 2302:
            warnings.warn('NCCL 2.2 and older versions are deprecated.',
                          DeprecationWarning)

        # We have to delay the initialization of communicators. This is because
        # NCCL's communicators use the current CUDA devices at the time of
        # initialization. Therefore, we have to initialize NCCL communicators
        # after users set the devices to use.
        self.inter_mpi_comm = None
        self.intra_nccl_comm = None

        self.gpu_buffer_a = _memory_utility.DeviceMemory()
        self.gpu_buffer_b = _memory_utility.DeviceMemory()
        self.cpu_buffer_a = _memory_utility.HostPinnedMemory()
        self.cpu_buffer_b = _memory_utility.HostPinnedMemory()

        self.batched_copy = batched_copy
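
The constructors above postpone creating the NCCL communicator because NCCL binds to whichever CUDA device is current at initialization time. Below is a minimal usage sketch of that ordering, assuming the standard chainermn.create_communicator factory and the communicator's intra_rank attribute; the communicator name is illustrative only.

import chainer
import chainermn

# Create the MPI-side communicator first; NCCL setup is deferred.
comm = chainermn.create_communicator('hierarchical')

# Select this process's GPU from its intra-node rank, so the lazily
# created NCCL communicator attaches to the intended device.
chainer.cuda.get_device(comm.intra_rank).use()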
Example #2
    def __init__(self, mpi_comm, allreduce_grad_dtype=None):
        super(PureNcclCommunicator, self).__init__(mpi_comm)
        if not nccl._available or nccl.get_version() < 2000:
            raise RuntimeError(
                'PureNcclCommunicator is only supported on NCCL 2.0+')

        # We have to delay the initialization of communicators. This is because
        # NCCL's communicators use the current CUDA devices at the time of
        # initialization. Therefore, we have to initialize NCCL communicators
        # after users set the devices to use.
        self.nccl_comm = None

        self.gpu_tmp_buffer = _memory_utility.DeviceMemory()
        self.gpu_buffer_a = _memory_utility.DeviceMemory()
        self.gpu_buffer_b = _memory_utility.DeviceMemory()

        if allreduce_grad_dtype is not None:
            self.allreduce_grad_dtype = np.dtype(allreduce_grad_dtype)
            if self.allreduce_grad_dtype.kind != 'f':
                raise ValueError('allreduce_grad_dtype must be '
                                 'numpy.float16, numpy.float32, '
                                 'numpy.float64, or None.')
        else:
            self.allreduce_grad_dtype = None
        self.grad_dtype_to_allreduce_dtype_kernel = None
        self.allreduce_dtype_to_grad_dtype_kernel = None
        self.div_by_size = None
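
The dtype check above only accepts floating-point dtypes for allreduce_grad_dtype. A small self-contained illustration of the same numpy kind test (the loop and names below are purely illustrative):

import numpy as np

# Only floating-point dtypes (kind 'f') pass the allreduce_grad_dtype check.
for candidate in (np.float16, np.float32, np.float64, np.int32):
    kind = np.dtype(candidate).kind
    print(np.dtype(candidate).name, 'accepted' if kind == 'f' else 'rejected')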
Example #3
    def __init__(self, mpi_comm):
        super(PureNcclCommunicator, self).__init__(mpi_comm)
        if not nccl._available:
            raise RuntimeError('PureNcclCommunicator requires NCCL 2.0+, '
                               'but NCCL is not available.')
        if nccl.get_build_version() < 2000:
            raise RuntimeError('PureNcclCommunicator requires NCCL 2.0+, '
                               'but found {}.'.format(
                                   nccl.get_build_version()))

        if nccl.get_version() < 2302:
            warnings.warn('NCCL 2.2 and older versions are deprecated.',
                          DeprecationWarning)

        # We have to delay the initialization of communicators. This is because
        # NCCL's communicators use the current CUDA devices at the time of
        # initialization. Therefore, we have to initialize NCCL communicators
        # after users set the devices to use.
        self.nccl_comm = None

        self.gpu_tmp_buffer = _memory_utility.DeviceMemory()
        self.gpu_buffer_a = _memory_utility.DeviceMemory()
        self.gpu_buffer_b = _memory_utility.DeviceMemory()

        with self.config_scope():
            self.allreduce_grad_dtype = None
        self.grad_dtype_to_allreduce_dtype_kernel = None
        self.allreduce_dtype_to_grad_dtype_kernel = None
        self.params_data = None
Example #4
    def __init__(self, mpi_comm, allreduce_grad_dtype=None):
        super(PureNcclCommunicator, self).__init__(mpi_comm, True)
        if nccl.get_version() < 2000:
            raise RuntimeError(
                'PureNcclCommunicator is only supported on NCCL 2.0+')
        self._init_ranks()

        self.inter_mpi_comm = None
        self.intra_mpi_comm = None
        self.intra_nccl_comm = None
        self.nccl_comm = None

        self.gpu_tmp_buffer = _memory_utility.DeviceMemory()
        self.gpu_allreduce_buffer_a = _memory_utility.DeviceMemory()
        self.gpu_allreduce_buffer_b = _memory_utility.DeviceMemory()

        if allreduce_grad_dtype is not None:
            self.allreduce_grad_dtype = np.dtype(allreduce_grad_dtype)
            if self.allreduce_grad_dtype.kind != 'f':
                raise ValueError('allreduce_grad_dtype must be '
                                 'numpy.float16, numpy.float32, '
                                 'numpy.float64, or None.')
        else:
            self.allreduce_grad_dtype = None
        self.grad_dtype_to_allreduce_dtype_kernel = None
        self.allreduce_dtype_to_grad_dtype_kernel = None
        self.div_by_size = None
Example #5
    def __init__(self, mpi_comm,
                 batched_copy=False):
        super(FlatCommunicator, self).__init__(mpi_comm)

        self.gpu_buffer_a = _memory_utility.DeviceMemory()
        self.gpu_buffer_b = _memory_utility.DeviceMemory()

        self.batched_copy = batched_copy
Example #6
    def __init__(self, mpi_comm):
        super(SingleNodeCommunicator, self).__init__(mpi_comm, use_nccl=True)

        if self.inter_size != 1:
            raise ValueError('SingleNodeCommunicator cannot be used under '
                             'multi-node settings')

        self.gpu_buffer_a = _memory_utility.DeviceMemory()
        self.gpu_buffer_b = _memory_utility.DeviceMemory()
Example #7
    def __init__(self, mpi_comm):
        super(PureNcclCommunicator, self).__init__(mpi_comm, True)
        if nccl.get_version() < 2000:
            raise RuntimeError(
                'PureNcclCommunicator is only supported on NCCL 2.0+')
        self._init_ranks()

        self.inter_mpi_comm = None
        self.intra_mpi_comm = None
        self.intra_nccl_comm = None
        self.nccl_comm = None

        self.gpu_buffer_a = _memory_utility.DeviceMemory()
        self.gpu_buffer_b = _memory_utility.DeviceMemory()
Example #8
    def __init__(self, mpi_comm):
        super(HierarchicalCommunicator, self).__init__(mpi_comm)
        if not nccl._available:
            raise RuntimeError('NCCL is not available. '
                               'Please confirm that NCCL is enabled in CuPy.')

        # We have to delay the initialization of communicators. This is because
        # NCCL's communicators use the current CUDA devices at the time of
        # initialization. Therefore, we have to initialize NCCL communicators
        # after users set the devices to use.
        self.inter_mpi_comm = None
        self.intra_nccl_comm = None

        self.gpu_buffer_a = _memory_utility.DeviceMemory()
        self.gpu_buffer_b = _memory_utility.DeviceMemory()
Example #9
def main():
    comm = MPI.COMM_WORLD
    intra_rank = utils.get_intra_rank(comm)
    chainer.cuda.get_device(intra_rank).use()
    mpi_print = common.create_mpi_print(comm)

    nelems_list = [2, 4, 8, 16, 32, 64, 128, 256]
    nelems_max = nelems_list[-1] * pow(2, 20)

    sendarr = cupy.random.rand(nelems_max, dtype=cupy.float32)
    recvarr = cupy.zeros((nelems_max, ), dtype=cupy.float32)
    if comm.rank == 0:
        print('array initialized...')

    sendbuf_gpu = _memory_utility.DeviceMemory()
    sendbuf_gpu.assign(nelems_max * 4)
    recvbuf_gpu = _memory_utility.DeviceMemory()
    recvbuf_gpu.assign(nelems_max * 4)
    if comm.rank == 0:
        print('GPU buffer initialized...')

    utils.pack([sendarr], sendbuf_gpu)
    if comm.rank == 0:
        print('packed...')

    for nelems in nelems_list:
        nelems *= pow(2, 20)

        sendbuf = [sendbuf_gpu.buffer(nelems * 4), MPI.FLOAT]
        recvbuf = ([recvbuf_gpu.buffer(nelems * 4), MPI.FLOAT]
                   if comm.rank == 0 else None)

        if comm.rank == 0:
            s_time = time.time()
        # WE MUST SYNC BEFORE COMMUNICATION !!!
        chainer.cuda.Stream.null.synchronize()
        comm.Reduce(sendbuf, recvbuf, root=0)

        if comm.rank == 0:
            print('COUNT {} MiBytes, TIME {} sec'.format(
                (nelems * 4) / pow(2, 20),
                time.time() - s_time))
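
Each printed line reports the payload size in MiB and the elapsed wall-clock time of one Reduce. A hypothetical helper (not part of the script above) that turns such a line into an effective bandwidth figure:

def bandwidth_mib_per_sec(mibytes, seconds):
    # e.g. a 256 MiB reduce completing in 0.05 s gives 5120 MiB/s.
    return mibytes / seconds

print(bandwidth_mib_per_sec(256.0, 0.05))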
Example #10
    def __init__(self, mpi_comm):
        super(SingleNodeCommunicator, self).__init__(mpi_comm)

        if self.inter_size != 1:
            raise ValueError('SingleNodeCommunicator cannot be used under '
                             'multi-node settings')
        if not nccl._available:
            raise RuntimeError('NCCL is not available. '
                               'Please confirm that NCCL is enabled in CuPy.')
        if nccl.get_version() < 2302:
            warnings.warn('NCCL 2.2 and older versions are deprecated.',
                          DeprecationWarning)

        # We have to delay the initialization of communicators. This is because
        # NCCL's communicators use the current CUDA devices at the time of
        # initialization. Therefore, we have to initialize NCCL communicators
        # after users set the devices to use.
        self.intra_nccl_comm = None

        self.gpu_buffer_a = _memory_utility.DeviceMemory()
        self.gpu_buffer_b = _memory_utility.DeviceMemory()
Example #11
    def __init__(self,
                 mpi_comm,
                 allreduce_grad_dtype=None,
                 batched_copy=False):
        super(PureNcclCommunicator, self).__init__(mpi_comm)
        if not nccl._available:
            raise RuntimeError('PureNcclCommunicator requires NCCL 2.0+, '
                               'but NCCL is not available.')
        if nccl.get_build_version() < 2000:
            raise RuntimeError('PureNcclCommunicator requires NCCL 2.0+, '
                               'but found {}.'.format(
                                   nccl.get_build_version()))

        if nccl.get_version() < 2302:
            warnings.warn('NCCL 2.2 and older versions are deprecated.',
                          DeprecationWarning)

        # We have to delay the initialization of communicators. This is because
        # NCCL's communicators use the current CUDA devices at the time of
        # initialization. Therefore, we have to initialize NCCL communicators
        # after users set the devices to use.
        self.nccl_comm = None

        self.gpu_tmp_buffer = _memory_utility.DeviceMemory()
        self.gpu_buffer_a = _memory_utility.DeviceMemory()
        self.gpu_buffer_b = _memory_utility.DeviceMemory()

        if allreduce_grad_dtype is not None:
            self.allreduce_grad_dtype = np.dtype(allreduce_grad_dtype)
            if self.allreduce_grad_dtype.kind != 'f':
                raise ValueError('allreduce_grad_dtype must be '
                                 'numpy.float16, numpy.float32, '
                                 'numpy.float64, or None.')
        else:
            self.allreduce_grad_dtype = None
        self.batched_copy = batched_copy
        self.grad_dtype_to_allreduce_dtype_kernel = None
        self.allreduce_dtype_to_grad_dtype_kernel = None
        self.params_data = None
Example #12
    def __init__(self, mpi_comm):
        super(PureNcclCommunicator, self).__init__(mpi_comm)

        # We have to delay the initialization of communicators. This is because
        # NCCL's communicators use the current CUDA devices at the time of
        # initialization. Therefore, we have to initialize NCCL communicators
        # after users set the devices to use.
        self.nccl_comm = None

        # GPU buffers
        self.gpu_buf_a = _memory_utility.DeviceMemory()
        self.gpu_buf_b = _memory_utility.DeviceMemory()

        # Data type used in communications
        self._rsv_comm_dtype = np.dtype(np.float32)
        self._agv_comm_dtype = np.dtype(np.float32)

        # GPU kernels. We do not generate them here for the same reason as above.
        self._rsv_memset_kernel = None
        self._agv_memset_kernel = None

        # array info
        self._ainfo = None
Example #13
    def __init__(self,
                 mpi_comm,
                 rsv_comm_dtype=np.float32,
                 agv_comm_dtype=np.float32,
                 use_hiercoll=False,
                 dims=None
                 ):
        super(PureNCCLCommunicator, self).__init__(mpi_comm)

        if use_hiercoll:
            if not _hiercoll_available:
                raise ValueError('use_hiercoll is True, '
                                 'but hiercoll.hiernccl is not available.')

            if dims is None:
                dims = []

        if dims is not None and not use_hiercoll:
            raise ValueError('dims is not None, '
                             'but use_hiercoll is False.')

        if use_hiercoll and mpi_comm.size != MPI.COMM_WORLD.size:
            raise ValueError(
                'HierColl with non-WORLD MPI Comm is not supported.')

        # None -> Non-hierarchical / pure NCCL
        # []   -> auto hierarchical selection (env var or optimizer)
        # [int]-> manual hierarchical selection
        self.dims = dims

        # We have to delay the initialization of communicators. This is because
        # NCCL's communicators use the current CUDA devices at the time of
        # initialization. Therefore, we have to initialize NCCL communicators
        # after users set the devices to use.
        self.nccl_comm = None

        # GPU buffers
        self.gpu_buf_a = _memory_utility.DeviceMemory()
        self.gpu_buf_b = _memory_utility.DeviceMemory()
        self.gpu_buf_c = _memory_utility.DeviceMemory()

        # Assume FP32 for data type
        self._arrs_dtype = np.dtype(np.float32)

        # Data type used in communications
        self._rsv_comm_dtype = np.dtype(rsv_comm_dtype)
        if self._rsv_comm_dtype.kind != 'f':
            raise ValueError('rsv_comm_dtype must be numpy.float16, '
                             'numpy.float32 or numpy.float64.')

        self._agv_comm_dtype = np.dtype(agv_comm_dtype)
        if self._agv_comm_dtype.kind != 'f':
            raise ValueError('agv_comm_dtype must be numpy.float16, '
                             'numpy.float32 or numpy.float64.')

        # GPU kernels. We do not generate them here for the same reason as above.
        self._cast_rsv_kernels = None
        self._cast_agv_kernels = None
        self._mean_kernel = None
        self._max_kernel = None
        self._memset_kernel = None

        # Packer to pack/unpack arrays
        self._packer = _utility.Packer()

        # For scaling in FP16
        self._scaled = False
        self._scaling_factors = None
        self._streams = None
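
The dims comment in the constructor above distinguishes three modes: None selects the plain pure-NCCL path, an empty list requests automatic hierarchy selection, and a list of integers fixes the hierarchy manually. A sketch of the three call forms, assuming hiercoll.hiernccl is importable and using the PureNCCLCommunicator class defined above:

import mpi4py.MPI

mpi_comm = mpi4py.MPI.COMM_WORLD

flat = PureNCCLCommunicator(mpi_comm)                      # dims=None: pure NCCL
auto = PureNCCLCommunicator(mpi_comm, use_hiercoll=True)   # dims=[]: automatic hierarchy
fixed = PureNCCLCommunicator(mpi_comm, use_hiercoll=True,
                             dims=[4])                     # dims=[4]: manual hierarchy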
Example #14
    def __init__(self, mpi_comm):
        super(NonCudaAwareCommunicator, self).__init__(mpi_comm, use_nccl=True)
        self.gpu_buffer_a = _memory_utility.DeviceMemory()
        self.gpu_buffer_b = _memory_utility.DeviceMemory()
        self.cpu_buffer_a = _memory_utility.HostPinnedMemory()
        self.cpu_buffer_b = _memory_utility.HostPinnedMemory()
Example #15
    def __init__(self, mpi_comm):
        super(DummyCommunicator, self).__init__(mpi_comm, use_nccl=True)

        self.gpu_buffer_a = _memory_utility.DeviceMemory()
Example #16
    def __init__(self, mpi_comm=mpi4py.MPI.COMM_WORLD):
        super(TwoDimensionalCommunicator, self).__init__(
            mpi_comm, use_nccl=True)
        self.gpu_buffer_a = _memory_utility.DeviceMemory()
        self.gpu_buffer_b = _memory_utility.DeviceMemory()