def __init__(self, mpi_comm):
    super(PureNcclCommunicator, self).__init__(mpi_comm)

    if not nccl._available:
        raise RuntimeError('PureNcclCommunicator requires NCCL 2.0+, '
                           'but NCCL is not available.')
    if nccl.get_build_version() < 2000:
        raise RuntimeError('PureNcclCommunicator requires NCCL 2.0+, '
                           'but found {}.'.format(
                               nccl.get_build_version()))
    if nccl.get_version() < 2302:
        warnings.warn('NCCL 2.2 and older versions are deprecated.',
                      DeprecationWarning)

    # We have to delay the initialization of communicators. This is
    # because NCCL's communicators use the current CUDA devices at the
    # time of initialization. Therefore, we have to initialize NCCL
    # communicators after users set the devices to use.
    self.nccl_comm = None

    self.gpu_tmp_buffer = _memory_utility.DeviceMemory()
    self.gpu_buffer_a = _memory_utility.DeviceMemory()
    self.gpu_buffer_b = _memory_utility.DeviceMemory()

    with self.config_scope():
        self.allreduce_grad_dtype = None

    self.grad_dtype_to_allreduce_dtype_kernel = None
    self.allreduce_dtype_to_grad_dtype_kernel = None
    self.params_data = None
def create_communicator(param, use_gpu):
    if not param.multi_node:
        ranks = _communication_utility.init_ranks(mpi_comm)
        inter_size = ranks[4]
        if inter_size > 1:
            pytest.skip('This test is for single node only')

    if use_gpu and not param.nccl1 and nccl.get_build_version() < 2000:
        pytest.skip('This test requires NCCL version >= 2.0')

    communicator = param.communicator_class(mpi_comm)

    communicator.set_config('batched_copy', param.batched_copy)
    value = communicator.get_config('batched_copy')
    assert param.batched_copy == value

    with pytest.raises(ValueError):
        communicator.set_config('blah blah blah')

    if param.communicator_class is PureNcclCommunicator:
        communicator.set_config('allreduce_grad_dtype',
                                param.allreduce_grad_dtype)
        value = communicator.get_config('allreduce_grad_dtype')
        assert param.allreduce_grad_dtype == value

    if use_gpu:
        chainer.cuda.get_device_from_id(communicator.intra_rank).use()

    return communicator
def __init__(self, mpi_comm, allreduce_grad_dtype=None,
             batched_copy=False):
    super(PureNcclCommunicator, self).__init__(mpi_comm)

    if not nccl._available or nccl.get_build_version() < 2000:
        raise RuntimeError(
            'PureNcclCommunicator is only supported on NCCL 2.0+')

    # We have to delay the initialization of communicators. This is
    # because NCCL's communicators use the current CUDA devices at the
    # time of initialization. Therefore, we have to initialize NCCL
    # communicators after users set the devices to use.
    self.nccl_comm = None

    self.gpu_tmp_buffer = _memory_utility.DeviceMemory()
    self.gpu_buffer_a = _memory_utility.DeviceMemory()
    self.gpu_buffer_b = _memory_utility.DeviceMemory()

    if allreduce_grad_dtype is not None:
        self.allreduce_grad_dtype = np.dtype(allreduce_grad_dtype)
        if self.allreduce_grad_dtype.kind != 'f':
            raise ValueError(
                'allreduce_grad_dtype must be '
                'numpy.float16, numpy.float32, '
                'numpy.float64, or None.')
    else:
        self.allreduce_grad_dtype = None

    self.batched_copy = batched_copy
    self.grad_dtype_to_allreduce_dtype_kernel = None
    self.allreduce_dtype_to_grad_dtype_kernel = None
    self.div_by_size = None
    self.params_data = None
def create_communicator(communicator_class, mpi_comm, use_gpu):
    if PureNcclCommunicator == communicator_class:
        use_nccl = True
    else:
        use_nccl = False

    if use_gpu and not use_nccl and nccl.get_build_version() < 2000:
        pytest.skip('This test requires NCCL version >= 2.0')

    communicator = communicator_class(mpi_comm)

    if use_gpu:
        chainer.cuda.get_device_from_id(communicator.intra_rank).use()

    return communicator
def __init__(self, mpi_comm, allreduce_grad_dtype=None,
             batched_copy=False):
    super(PureNcclCommunicator, self).__init__(mpi_comm)

    if not nccl._available:
        raise RuntimeError('PureNcclCommunicator requires NCCL 2.0+, '
                           'but NCCL is not available.')
    if nccl.get_build_version() < 2000:
        raise RuntimeError('PureNcclCommunicator requires NCCL 2.0+, '
                           'but found {}.'.format(
                               nccl.get_build_version()))
    if nccl.get_version() < 2302:
        warnings.warn('NCCL 2.2 and older versions are deprecated.',
                      DeprecationWarning)

    # We have to delay the initialization of communicators. This is
    # because NCCL's communicators use the current CUDA devices at the
    # time of initialization. Therefore, we have to initialize NCCL
    # communicators after users set the devices to use.
    self.nccl_comm = None

    self.gpu_tmp_buffer = _memory_utility.DeviceMemory()
    self.gpu_buffer_a = _memory_utility.DeviceMemory()
    self.gpu_buffer_b = _memory_utility.DeviceMemory()

    if allreduce_grad_dtype is not None:
        self.allreduce_grad_dtype = np.dtype(allreduce_grad_dtype)
        if self.allreduce_grad_dtype.kind != 'f':
            raise ValueError('allreduce_grad_dtype must be '
                             'numpy.float16, numpy.float32, '
                             'numpy.float64, or None.')
    else:
        self.allreduce_grad_dtype = None

    self.batched_copy = batched_copy
    self.grad_dtype_to_allreduce_dtype_kernel = None
    self.allreduce_dtype_to_grad_dtype_kernel = None
    self.params_data = None
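A minimal usage sketch of the constructor above, under the assumptions that the process is launched through MPI, that NCCL 2.0+ is available, and that float16 gradients are acceptable for allreduce; the variable names are illustrative only, and the device-selection call mirrors the test helpers in this section.

import mpi4py.MPI
import numpy as np
import chainer

# Hypothetical setup: one MPI process per GPU.
mpi_comm = mpi4py.MPI.COMM_WORLD
comm = PureNcclCommunicator(mpi_comm,
                            allreduce_grad_dtype=np.float16,
                            batched_copy=True)
# Select the GPU before any collective runs, since the NCCL communicator
# is initialized lazily on the then-current CUDA device.
chainer.cuda.get_device_from_id(comm.intra_rank).use()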
def setup_gpu(self, device=None):
    if nccl.get_build_version() < 2000:
        pytest.skip('This test requires NCCL version >= 2.0')
    self.comm = chainermn.create_communicator('pure_nccl')
    device = self.comm.intra_rank
    chainer.cuda.get_device_from_id(device).use()

    self.target = DynamicExampleModel()
    self.target.to_gpu()
    self.target.a.W.data[:] = self.comm.rank
    self.target.b.W.data[:] = self.comm.rank + 1
    self.target.a.W.grad[:] = 0
    self.target.b.W.grad[:] = 0

    self.actual_optimizer = chainer.GradientMethod()
    self.actual_optimizer.create_update_rule = mock.MagicMock
def create_communicator(param, use_gpu):
    if not param.multi_node:
        ranks = _communication_utility.init_ranks(mpi_comm)
        inter_size = ranks[4]
        if inter_size > 1:
            pytest.skip('This test is for single node only')

    if use_gpu and not param.nccl1 and nccl.get_build_version() < 2000:
        pytest.skip('This test requires NCCL version >= 2.0')

    if param.communicator_class is PureNcclCommunicator:
        communicator = param.communicator_class(
            mpi_comm,
            allreduce_grad_dtype=param.allreduce_grad_dtype,
            batched_copy=param.batched_copy)
    else:
        communicator = param.communicator_class(mpi_comm)

    if use_gpu:
        chainer.cuda.get_device_from_id(communicator.intra_rank).use()

    return communicator
def setUp(self):
    if nccl.get_build_version() < 2000:
        pytest.skip('This test requires NCCL version >= 2.0')
    self.mpi_comm = mpi4py.MPI.COMM_WORLD