def __init__(self, mpi_comm, batched_copy=False):
    """Prepare communicator state; NCCL communicators are created later.

    Args:
        mpi_comm: MPI communicator forwarded to the base-class initializer.
        batched_copy: Value stored unchanged as ``self.batched_copy``.

    Raises:
        RuntimeError: If ``nccl._available`` is false.
    """
    super(NonCudaAwareCommunicator, self).__init__(mpi_comm)

    nccl_missing = not nccl._available
    if nccl_missing:
        message = ('NCCL is not available. '
                   'Please confirm that NCCL is enabled in CuPy.')
        raise RuntimeError(message)

    runtime_version = nccl.get_version()
    if runtime_version < 2302:
        warnings.warn('NCCL 2.2 and older versions are deprecated.',
                      DeprecationWarning)

    # An NCCL communicator binds to whichever CUDA device is current at
    # the moment it is created.  Creation is therefore postponed until
    # the user has selected a device; only placeholders are set here.
    self.inter_mpi_comm = self.intra_nccl_comm = None

    # Staging buffers: two device-side and two pinned host-side holders.
    device_memory = _memory_utility.DeviceMemory
    pinned_memory = _memory_utility.HostPinnedMemory
    self.gpu_buffer_a = device_memory()
    self.gpu_buffer_b = device_memory()
    self.cpu_buffer_a = pinned_memory()
    self.cpu_buffer_b = pinned_memory()

    self.batched_copy = batched_copy
def __init__(self, mpi_comm, allreduce_grad_dtype=None):
    """Prepare communicator state; the NCCL communicator is created later.

    Args:
        mpi_comm: MPI communicator forwarded to the base-class initializer.
        allreduce_grad_dtype: ``None`` or anything ``numpy.dtype`` accepts
            that resolves to a floating-point dtype.

    Raises:
        RuntimeError: If ``nccl._available`` is false or
            ``nccl.get_version()`` is below 2000.
        ValueError: If ``allreduce_grad_dtype`` is given but its NumPy
            dtype kind is not ``'f'``.
    """
    super(PureNcclCommunicator, self).__init__(mpi_comm)
    if not nccl._available or nccl.get_version() < 2000:
        raise RuntimeError(
            'PureNcclCommunicator is only supported on NCCL 2.0+')

    # We have to delay the initialization of communicators. This is because
    # NCCL's communicators use the current CUDA devices at the time of
    # initialization. Therefore, we have to initialize NCCL communicators
    # after users set the devices to use.
    self.nccl_comm = None

    self.gpu_tmp_buffer = _memory_utility.DeviceMemory()
    self.gpu_buffer_a = _memory_utility.DeviceMemory()
    self.gpu_buffer_b = _memory_utility.DeviceMemory()

    if allreduce_grad_dtype is not None:
        self.allreduce_grad_dtype = np.dtype(allreduce_grad_dtype)
        if self.allreduce_grad_dtype.kind != 'f':
            # Adjacent literals are concatenated verbatim, so each piece
            # carries its own trailing space.
            raise ValueError('allreduce_grad_dtype must be '
                             'numpy.float16, numpy.float32, '
                             'numpy.float64, or None.')
    else:
        self.allreduce_grad_dtype = None

    # Kernels are not built here; these stay None until assigned elsewhere.
    self.grad_dtype_to_allreduce_dtype_kernel = None
    self.allreduce_dtype_to_grad_dtype_kernel = None
    self.div_by_size = None
def __init__(self, mpi_comm):
    """Prepare communicator state; the NCCL communicator is created later.

    Args:
        mpi_comm: MPI communicator forwarded to the base-class initializer.

    Raises:
        RuntimeError: If ``nccl._available`` is false, or if
            ``nccl.get_build_version()`` is below 2000.
    """
    super(PureNcclCommunicator, self).__init__(mpi_comm)

    if not nccl._available:
        raise RuntimeError('PureNcclCommunicator requires NCCL 2.0+, '
                           'but NCCL is not available.')

    if nccl.get_build_version() < 2000:
        template = ('PureNcclCommunicator requires NCCL 2.0+, '
                    'but found {}.')
        raise RuntimeError(template.format(nccl.get_build_version()))

    if nccl.get_version() < 2302:
        warnings.warn('NCCL 2.2 and older versions are deprecated.',
                      DeprecationWarning)

    # An NCCL communicator binds to whichever CUDA device is current at
    # the moment it is created.  Creation is therefore postponed until
    # the user has selected a device; only a placeholder is set here.
    self.nccl_comm = None

    device_memory = _memory_utility.DeviceMemory
    self.gpu_tmp_buffer = device_memory()
    self.gpu_buffer_a = device_memory()
    self.gpu_buffer_b = device_memory()

    # NOTE(review): the source formatting was lost; this assumes only the
    # ``allreduce_grad_dtype`` assignment belongs inside ``config_scope()``
    # -- confirm against the class's other uses of ``config_scope``.
    with self.config_scope():
        self.allreduce_grad_dtype = None

    self.grad_dtype_to_allreduce_dtype_kernel = None
    self.allreduce_dtype_to_grad_dtype_kernel = None
    self.params_data = None
def __init__(self, mpi_comm, allreduce_grad_dtype=None):
    """Initialize ranks, communicator placeholders and allreduce buffers.

    Args:
        mpi_comm: MPI communicator forwarded to the base-class initializer
            (together with a positional ``True``).
        allreduce_grad_dtype: ``None`` or anything ``numpy.dtype`` accepts
            that resolves to a floating-point dtype.

    Raises:
        RuntimeError: If ``nccl.get_version()`` is below 2000.
        ValueError: If ``allreduce_grad_dtype`` is given but its NumPy
            dtype kind is not ``'f'``.
    """
    super(PureNcclCommunicator, self).__init__(mpi_comm, True)
    if nccl.get_version() < 2000:
        raise RuntimeError(
            'PureNcclCommunicator is only supported on NCCL 2.0+')
    self._init_ranks()

    # Communicator handles start out unset.
    self.inter_mpi_comm = None
    self.intra_mpi_comm = None
    self.intra_nccl_comm = None
    self.nccl_comm = None

    self.gpu_tmp_buffer = _memory_utility.DeviceMemory()
    self.gpu_allreduce_buffer_a = _memory_utility.DeviceMemory()
    self.gpu_allreduce_buffer_b = _memory_utility.DeviceMemory()

    if allreduce_grad_dtype is not None:
        self.allreduce_grad_dtype = np.dtype(allreduce_grad_dtype)
        if self.allreduce_grad_dtype.kind != 'f':
            # Adjacent literals are concatenated verbatim, so each piece
            # carries its own trailing space.
            raise ValueError('allreduce_grad_dtype must be '
                             'numpy.float16, numpy.float32, '
                             'numpy.float64, or None.')
    else:
        self.allreduce_grad_dtype = None

    # Kernels are not built here; these stay None until assigned elsewhere.
    self.grad_dtype_to_allreduce_dtype_kernel = None
    self.allreduce_dtype_to_grad_dtype_kernel = None
    self.div_by_size = None
def __init__(self, mpi_comm, batched_copy=False):
    """Initialize the base class and create two device-memory holders.

    Args:
        mpi_comm: MPI communicator forwarded to the base-class initializer.
        batched_copy: Value stored unchanged as ``self.batched_copy``.
    """
    super(FlatCommunicator, self).__init__(mpi_comm)

    device_memory = _memory_utility.DeviceMemory
    self.gpu_buffer_a = device_memory()
    self.gpu_buffer_b = device_memory()

    self.batched_copy = batched_copy
def __init__(self, mpi_comm):
    """Initialize the base class with NCCL enabled and create GPU buffers.

    Args:
        mpi_comm: MPI communicator forwarded to the base-class initializer.

    Raises:
        ValueError: If ``self.inter_size`` is not 1 after base
            initialization.
    """
    super(SingleNodeCommunicator, self).__init__(mpi_comm, use_nccl=True)

    spans_multiple_nodes = self.inter_size != 1
    if spans_multiple_nodes:
        message = ('SingleNodeCommunicator cannot be used under '
                   'multi-node settings')
        raise ValueError(message)

    device_memory = _memory_utility.DeviceMemory
    self.gpu_buffer_a = device_memory()
    self.gpu_buffer_b = device_memory()
def __init__(self, mpi_comm):
    """Initialize ranks, communicator placeholders and two GPU buffers.

    Args:
        mpi_comm: MPI communicator forwarded to the base-class initializer
            (together with a positional ``True``).

    Raises:
        RuntimeError: If ``nccl.get_version()`` is below 2000.
    """
    super(PureNcclCommunicator, self).__init__(mpi_comm, True)

    runtime_version = nccl.get_version()
    if runtime_version < 2000:
        raise RuntimeError(
            'PureNcclCommunicator is only supported on NCCL 2.0+')

    self._init_ranks()

    # All communicator handles start out unset.
    self.inter_mpi_comm = self.intra_mpi_comm = None
    self.intra_nccl_comm = self.nccl_comm = None

    device_memory = _memory_utility.DeviceMemory
    self.gpu_buffer_a = device_memory()
    self.gpu_buffer_b = device_memory()
def __init__(self, mpi_comm):
    """Prepare communicator state; NCCL communicators are created later.

    Args:
        mpi_comm: MPI communicator forwarded to the base-class initializer.

    Raises:
        RuntimeError: If ``nccl._available`` is false.
    """
    super(HierarchicalCommunicator, self).__init__(mpi_comm)

    nccl_missing = not nccl._available
    if nccl_missing:
        message = ('NCCL is not available. '
                   'Please confirm that NCCL is enabled in CuPy.')
        raise RuntimeError(message)

    # An NCCL communicator binds to whichever CUDA device is current at
    # the moment it is created.  Creation is therefore postponed until
    # the user has selected a device; only placeholders are set here.
    self.inter_mpi_comm = self.intra_nccl_comm = None

    device_memory = _memory_utility.DeviceMemory
    self.gpu_buffer_a = device_memory()
    self.gpu_buffer_b = device_memory()
def main():
    """Time ``comm.Reduce`` to rank 0 over GPU buffers of growing size.

    For each entry of the size list (in units of 2**20 float32 elements)
    the function reduces a prefix of a packed device buffer to rank 0 and,
    on rank 0 only, prints the payload size and the elapsed wall time.
    """
    comm = MPI.COMM_WORLD
    is_root = comm.rank == 0

    intra_rank = utils.get_intra_rank(comm)
    chainer.cuda.get_device(intra_rank).use()
    # Created as in the original script; not referenced again below.
    mpi_print = common.create_mpi_print(comm)

    mebi = pow(2, 20)
    bytes_per_elem = 4  # size multiplier used for the float32 payload
    nelems_list = [2, 4, 8, 16, 32, 64, 128, 256]
    nelems_max = nelems_list[-1] * mebi

    sendarr = cupy.random.rand(nelems_max, dtype=cupy.float32)
    # Allocated as in the original script; not referenced again below.
    recvarr = cupy.zeros((nelems_max, ), dtype=cupy.float32)
    if is_root:
        print('array initialized...')

    sendbuf_gpu = _memory_utility.DeviceMemory()
    sendbuf_gpu.assign(nelems_max * bytes_per_elem)
    recvbuf_gpu = _memory_utility.DeviceMemory()
    recvbuf_gpu.assign(nelems_max * bytes_per_elem)
    if is_root:
        print('GPU buffer initialized...')

    utils.pack([sendarr], sendbuf_gpu)
    if is_root:
        print('packed...')

    for mebi_elems in nelems_list:
        nelems = mebi_elems * mebi
        nbytes = nelems * bytes_per_elem

        sendbuf = [sendbuf_gpu.buffer(nbytes), MPI.FLOAT]
        if is_root:
            recvbuf = [recvbuf_gpu.buffer(nbytes), MPI.FLOAT]
            # The clock starts before the stream synchronization, so the
            # reported time includes it.
            s_time = time.time()
        else:
            recvbuf = None

        # The null stream must be synchronized before communication.
        chainer.cuda.Stream.null.synchronize()
        comm.Reduce(sendbuf, recvbuf, root=0)

        if is_root:
            elapsed = time.time() - s_time
            print('COUNT {} MiBytes, TIME {} sec'.format(
                nbytes / mebi, elapsed))
def __init__(self, mpi_comm):
    """Prepare communicator state; the NCCL communicator is created later.

    Args:
        mpi_comm: MPI communicator forwarded to the base-class initializer.

    Raises:
        ValueError: If ``self.inter_size`` is not 1 after base
            initialization.
        RuntimeError: If ``nccl._available`` is false.
    """
    super(SingleNodeCommunicator, self).__init__(mpi_comm)

    spans_multiple_nodes = self.inter_size != 1
    if spans_multiple_nodes:
        raise ValueError('SingleNodeCommunicator cannot be used under '
                         'multi-node settings')

    nccl_missing = not nccl._available
    if nccl_missing:
        raise RuntimeError('NCCL is not available. '
                           'Please confirm that NCCL is enabled in CuPy.')

    runtime_version = nccl.get_version()
    if runtime_version < 2302:
        warnings.warn('NCCL 2.2 and older versions are deprecated.',
                      DeprecationWarning)

    # An NCCL communicator binds to whichever CUDA device is current at
    # the moment it is created.  Creation is therefore postponed until
    # the user has selected a device; only a placeholder is set here.
    self.intra_nccl_comm = None

    device_memory = _memory_utility.DeviceMemory
    self.gpu_buffer_a = device_memory()
    self.gpu_buffer_b = device_memory()
def __init__(self, mpi_comm, allreduce_grad_dtype=None,
             batched_copy=False):
    """Prepare communicator state; the NCCL communicator is created later.

    Args:
        mpi_comm: MPI communicator forwarded to the base-class initializer.
        allreduce_grad_dtype: ``None`` or anything ``numpy.dtype`` accepts
            that resolves to a floating-point dtype.
        batched_copy: Value stored unchanged as ``self.batched_copy``.

    Raises:
        RuntimeError: If ``nccl._available`` is false, or if
            ``nccl.get_build_version()`` is below 2000.
        ValueError: If ``allreduce_grad_dtype`` is given but its NumPy
            dtype kind is not ``'f'``.
    """
    super(PureNcclCommunicator, self).__init__(mpi_comm)
    if not nccl._available:
        raise RuntimeError('PureNcclCommunicator requires NCCL 2.0+, '
                           'but NCCL is not available.')
    if nccl.get_build_version() < 2000:
        raise RuntimeError('PureNcclCommunicator requires NCCL 2.0+, '
                           'but found {}.'.format(
                               nccl.get_build_version()))

    if nccl.get_version() < 2302:
        warnings.warn('NCCL 2.2 and older versions are deprecated.',
                      DeprecationWarning)

    # We have to delay the initialization of communicators. This is because
    # NCCL's communicators use the current CUDA devices at the time of
    # initialization. Therefore, we have to initialize NCCL communicators
    # after users set the devices to use.
    self.nccl_comm = None

    self.gpu_tmp_buffer = _memory_utility.DeviceMemory()
    self.gpu_buffer_a = _memory_utility.DeviceMemory()
    self.gpu_buffer_b = _memory_utility.DeviceMemory()

    if allreduce_grad_dtype is not None:
        self.allreduce_grad_dtype = np.dtype(allreduce_grad_dtype)
        if self.allreduce_grad_dtype.kind != 'f':
            # Adjacent literals are concatenated verbatim, so each piece
            # carries its own trailing space.
            raise ValueError('allreduce_grad_dtype must be '
                             'numpy.float16, numpy.float32, '
                             'numpy.float64, or None.')
    else:
        self.allreduce_grad_dtype = None
    self.batched_copy = batched_copy

    # Kernels and parameter data are not built here; they stay None
    # until assigned elsewhere.
    self.grad_dtype_to_allreduce_dtype_kernel = None
    self.allreduce_dtype_to_grad_dtype_kernel = None
    self.params_data = None
def __init__(self, mpi_comm):
    """Set up placeholders, GPU buffers and float32 communication dtypes.

    Args:
        mpi_comm: MPI communicator forwarded to the base-class initializer.
    """
    super(PureNcclCommunicator, self).__init__(mpi_comm)

    # An NCCL communicator binds to whichever CUDA device is current at
    # the moment it is created.  Creation is therefore postponed until
    # the user has selected a device; only a placeholder is set here.
    self.nccl_comm = None

    # Device-side buffers.
    device_memory = _memory_utility.DeviceMemory
    self.gpu_buf_a = device_memory()
    self.gpu_buf_b = device_memory()

    # Both communication dtypes are float32.
    comm_dtype = np.dtype(np.float32)
    self._rsv_comm_dtype = comm_dtype
    self._agv_comm_dtype = comm_dtype

    # GPU kernels are deferred for the same reason as the communicator.
    self._rsv_memset_kernel = self._agv_memset_kernel = None

    # Array info, not yet populated.
    self._ainfo = None
def __init__(self, mpi_comm, rsv_comm_dtype=np.float32,
             agv_comm_dtype=np.float32, use_hiercoll=False, dims=None
             ):
    """Validate options and set up lazily-initialized communicator state.

    Args:
        mpi_comm: MPI communicator forwarded to the base-class initializer.
        rsv_comm_dtype: Anything ``numpy.dtype`` accepts that resolves to
            a floating-point dtype; stored as ``self._rsv_comm_dtype``.
        agv_comm_dtype: Same constraint; stored as
            ``self._agv_comm_dtype``.
        use_hiercoll: Whether hierarchical collectives are requested.
        dims: ``None``, or a list used for hierarchical selection. Only
            allowed when ``use_hiercoll`` is true; ``None`` becomes ``[]``
            in that case.

    Raises:
        ValueError: If ``use_hiercoll`` is set but ``_hiercoll_available``
            is false; if ``dims`` is given without ``use_hiercoll``; if
            ``use_hiercoll`` is set and ``mpi_comm.size`` differs from
            ``MPI.COMM_WORLD.size``; or if either communication dtype is
            not a floating-point dtype.
    """
    super(PureNCCLCommunicator, self).__init__(mpi_comm)

    if use_hiercoll:
        if not _hiercoll_available:
            raise ValueError('use_hiercoll is True, '
                             'but hiercoll.hiernccl is not available.')
        if dims is None:
            dims = []

    if dims is not None and not use_hiercoll:
        raise ValueError('dims is not None, '
                         'but use_hiercoll is False.')

    if use_hiercoll and mpi_comm.size != MPI.COMM_WORLD.size:
        raise ValueError(
            'HierColl with non-WORLD MPI Comm is not supported.')

    # None -> Non-hierarchical / pure NCCL
    # []   -> auto hierarchical selection (envv or optimizer)
    # [int]-> manual hierarchical selection
    self.dims = dims

    # We have to delay the initialization of communicators. This is because
    # NCCL's communicators use the current CUDA devices at the time of
    # initialization. Therefore, we have to initialize NCCL communicators
    # after users set the devices to use.
    self.nccl_comm = None

    # GPU buffers
    self.gpu_buf_a = _memory_utility.DeviceMemory()
    self.gpu_buf_b = _memory_utility.DeviceMemory()
    self.gpu_buf_c = _memory_utility.DeviceMemory()

    # Assume FP32 for data type
    self._arrs_dtype = np.dtype(np.float32)

    # Data type used in communications
    self._rsv_comm_dtype = np.dtype(rsv_comm_dtype)
    if self._rsv_comm_dtype.kind != 'f':
        raise ValueError('rsv_comm_dtype must be numpy.float16, '
                         'numpy.float32 or numpy.float64.')
    self._agv_comm_dtype = np.dtype(agv_comm_dtype)
    if self._agv_comm_dtype.kind != 'f':
        raise ValueError('agv_comm_dtype must be numpy.float16, '
                         'numpy.float32 or numpy.float64.')

    # GPU kernels. We don't generate here due to the same reason above
    self._cast_rsv_kernels = None
    self._cast_agv_kernels = None
    self._mean_kernel = None
    self._max_kernel = None
    self._memset_kernel = None

    # Packer to pack/unpack arrays
    self._packer = _utility.Packer()

    # For scaling in FP16
    self._scaled = False
    self._scaling_factors = None
    self._streams = None
def __init__(self, mpi_comm):
    """Initialize the base class with NCCL enabled and create buffers.

    Two device-memory holders and two pinned-host-memory holders are
    created.

    Args:
        mpi_comm: MPI communicator forwarded to the base-class initializer.
    """
    super(NonCudaAwareCommunicator, self).__init__(
        mpi_comm, use_nccl=True)

    device_memory = _memory_utility.DeviceMemory
    pinned_memory = _memory_utility.HostPinnedMemory
    self.gpu_buffer_a = device_memory()
    self.gpu_buffer_b = device_memory()
    self.cpu_buffer_a = pinned_memory()
    self.cpu_buffer_b = pinned_memory()
def __init__(self, mpi_comm):
    """Initialize the base class with NCCL enabled and one GPU buffer.

    Args:
        mpi_comm: MPI communicator forwarded to the base-class initializer.
    """
    base = super(DummyCommunicator, self)
    base.__init__(mpi_comm, use_nccl=True)

    device_memory = _memory_utility.DeviceMemory
    self.gpu_buffer_a = device_memory()
def __init__(self, mpi_comm=mpi4py.MPI.COMM_WORLD):
    """Initialize the base class with NCCL enabled and two GPU buffers.

    Args:
        mpi_comm: MPI communicator forwarded to the base-class
            initializer; defaults to ``mpi4py.MPI.COMM_WORLD``.
    """
    super(TwoDimensionalCommunicator, self).__init__(
        mpi_comm, use_nccl=True)

    device_memory = _memory_utility.DeviceMemory
    self.gpu_buffer_a = device_memory()
    self.gpu_buffer_b = device_memory()