Example #1
    def __init__(self, mpi_comm):
        super(PureNcclCommunicator, self).__init__(mpi_comm)
        if not nccl._available:
            raise RuntimeError('PureNcclCommunicator requires NCCL 2.0+, '
                               'but NCCL is not available.')
        if nccl.get_build_version() < 2000:
            raise RuntimeError('PureNcclCommunicator requires NCCL 2.0+, '
                               'but found {}.'.format(
                                   nccl.get_build_version()))

        if nccl.get_version() < 2302:
            warnings.warn('NCCL 2.2 and older versions are deprecated.',
                          DeprecationWarning)

        # We have to delay the initialization of communicators. This is because
        # NCCL's communicators use the current CUDA devices at the time of
        # initialization. Therefore, we have to initialize NCCL communicators
        # after users set the devices to use.
        self.nccl_comm = None

        self.gpu_tmp_buffer = _memory_utility.DeviceMemory()
        self.gpu_buffer_a = _memory_utility.DeviceMemory()
        self.gpu_buffer_b = _memory_utility.DeviceMemory()

        with self.config_scope():
            self.allreduce_grad_dtype = None
        self.grad_dtype_to_allreduce_dtype_kernel = None
        self.allreduce_dtype_to_grad_dtype_kernel = None
        self.params_data = None
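A minimal usage sketch, assuming the ChainerMN API shown in the other snippets on this page; in practice the communicator is usually obtained through chainermn.create_communicator rather than constructed directly:

import chainer
import chainermn

comm = chainermn.create_communicator('pure_nccl')
# Pick this process's GPU before any collective runs; the NCCL communicator
# itself is created lazily, as the comment in the constructor explains.
chainer.cuda.get_device_from_id(comm.intra_rank).use()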
Example #2
def create_communicator(param, use_gpu):
    if not param.multi_node:
        ranks = _communication_utility.init_ranks(mpi_comm)
        inter_size = ranks[4]
        if inter_size > 1:
            pytest.skip('This test is for single node only')

    if use_gpu and not param.nccl1 and nccl.get_build_version() < 2000:
        pytest.skip('This test requires NCCL version >= 2.0')

    communicator = param.communicator_class(mpi_comm)
    communicator.set_config('batched_copy', param.batched_copy)
    value = communicator.get_config('batched_copy')
    assert param.batched_copy == value

    with pytest.raises(ValueError):
        communicator.set_config('blah blah blah')

    if param.communicator_class is PureNcclCommunicator:
        communicator.set_config('allreduce_grad_dtype',
                                param.allreduce_grad_dtype)
        value = communicator.get_config('allreduce_grad_dtype')
        assert param.allreduce_grad_dtype == value

    if use_gpu:
        chainer.cuda.get_device_from_id(communicator.intra_rank).use()

    return communicator
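In the original test suite the param argument comes from pytest parametrization; the stand-in below only illustrates the attributes this helper reads, and the SimpleNamespace construction is an assumption, not the real fixture:

import types

import mpi4py.MPI
import numpy as np

mpi_comm = mpi4py.MPI.COMM_WORLD  # module-level name the helper refers to

param = types.SimpleNamespace(
    communicator_class=PureNcclCommunicator,
    multi_node=False,
    nccl1=False,          # PureNcclCommunicator needs NCCL >= 2.0
    batched_copy=True,
    allreduce_grad_dtype=np.dtype(np.float16),
)
communicator = create_communicator(param, use_gpu=True)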
Example #3
    def __init__(self, mpi_comm, allreduce_grad_dtype=None,
                 batched_copy=False):
        super(PureNcclCommunicator, self).__init__(mpi_comm)
        if not nccl._available or nccl.get_build_version() < 2000:
            raise RuntimeError(
                'PureNcclCommunicator is only supported on NCCL 2.0+')

        # We have to delay the initialization of communicators. This is because
        # NCCL's communicators use the current CUDA devices at the time of
        # initialization. Therefore, we have to initialize NCCL communicators
        # after users set the devices to use.
        self.nccl_comm = None

        self.gpu_tmp_buffer = _memory_utility.DeviceMemory()
        self.gpu_buffer_a = _memory_utility.DeviceMemory()
        self.gpu_buffer_b = _memory_utility.DeviceMemory()

        if allreduce_grad_dtype is not None:
            self.allreduce_grad_dtype = np.dtype(allreduce_grad_dtype)
            if self.allreduce_grad_dtype.kind != 'f':
                raise ValueError(
                    'allreduce_grad_dtype must be '
                    'numpy.float16, numpy.float32, '
                    'numpy.float64, or None.')
        else:
            self.allreduce_grad_dtype = None
        self.batched_copy = batched_copy
        self.grad_dtype_to_allreduce_dtype_kernel = None
        self.allreduce_dtype_to_grad_dtype_kernel = None
        self.div_by_size = None
        self.params_data = None
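A direct construction sketch for this variant of the constructor; float16 is only one of the accepted dtypes, and the MPI communicator is passed the same way the test helpers below pass it:

import mpi4py.MPI
import numpy as np

comm = PureNcclCommunicator(
    mpi4py.MPI.COMM_WORLD,
    allreduce_grad_dtype=np.float16,  # gradients are converted to this dtype for the allreduce
    batched_copy=True)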
Example #5
def create_communicator(communicator_class, mpi_comm, use_gpu):
    if PureNcclCommunicator == communicator_class:
        use_nccl = True
    else:
        use_nccl = False

    if use_gpu and not use_nccl and nccl.get_build_version() < 2000:
        pytest.skip('This test requires NCCL version >= 2.0')
    communicator = communicator_class(mpi_comm)
    if use_gpu:
        chainer.cuda.get_device_from_id(communicator.intra_rank).use()

    return communicator
Example #7
    def __init__(self,
                 mpi_comm,
                 allreduce_grad_dtype=None,
                 batched_copy=False):
        super(PureNcclCommunicator, self).__init__(mpi_comm)
        if not nccl._available:
            raise RuntimeError('PureNcclCommunicator requires NCCL 2.0+, '
                               'but NCCL is not available.')
        if nccl.get_build_version() < 2000:
            raise RuntimeError('PureNcclCommunicator requires NCCL 2.0+, '
                               'but found {}.'.format(
                                   nccl.get_build_version()))

        if nccl.get_version() < 2302:
            warnings.warn('NCCL 2.2 and older versions are deprecated.',
                          DeprecationWarning)

        # We have to delay the initialization of communicators. This is because
        # NCCL's communicators use the current CUDA devices at the time of
        # initialization. Therefore, we have to initialize NCCL communicators
        # after users set the devices to use.
        self.nccl_comm = None

        self.gpu_tmp_buffer = _memory_utility.DeviceMemory()
        self.gpu_buffer_a = _memory_utility.DeviceMemory()
        self.gpu_buffer_b = _memory_utility.DeviceMemory()

        if allreduce_grad_dtype is not None:
            self.allreduce_grad_dtype = np.dtype(allreduce_grad_dtype)
            if self.allreduce_grad_dtype.kind != 'f':
                raise ValueError('allreduce_grad_dtype must be '
                                 'numpy.float16, numpy.float32, '
                                 'numpy.float64, or None.')
        else:
            self.allreduce_grad_dtype = None
        self.batched_copy = batched_copy
        self.grad_dtype_to_allreduce_dtype_kernel = None
        self.allreduce_dtype_to_grad_dtype_kernel = None
        self.params_data = None
Example #9
    def setup_gpu(self, device=None):
        if nccl.get_build_version() < 2000:
            pytest.skip('This test requires NCCL version >= 2.0')
        self.comm = chainermn.create_communicator('pure_nccl')
        device = self.comm.intra_rank
        chainer.cuda.get_device_from_id(device).use()
        self.target = DynamicExampleModel()
        self.target.to_gpu()
        self.target.a.W.data[:] = self.comm.rank
        self.target.b.W.data[:] = self.comm.rank + 1
        self.target.a.W.grad[:] = 0
        self.target.b.W.grad[:] = 0
        self.actual_optimizer = chainer.GradientMethod()
        self.actual_optimizer.create_update_rule = mock.MagicMock
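A sketch of how this fixture is typically used afterwards, assuming the test wraps the optimizer with chainermn.create_multi_node_optimizer; the method name is illustrative:

    def test_update(self):
        optimizer = chainermn.create_multi_node_optimizer(
            self.actual_optimizer, self.comm)
        optimizer.setup(self.target)
        # update() allreduces the gradients across workers before delegating
        # to the wrapped optimizer.
        optimizer.update()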
Example #10
def create_communicator(param, use_gpu):
    if not param.multi_node:
        ranks = _communication_utility.init_ranks(mpi_comm)
        inter_size = ranks[4]
        if inter_size > 1:
            pytest.skip('This test is for single node only')

    if use_gpu and not param.nccl1 and nccl.get_build_version() < 2000:
        pytest.skip('This test requires NCCL version >= 2.0')

    if param.communicator_class is PureNcclCommunicator:
        communicator = param.communicator_class(
            mpi_comm, allreduce_grad_dtype=param.allreduce_grad_dtype,
            batched_copy=param.batched_copy)
    else:
        communicator = param.communicator_class(mpi_comm)

    if use_gpu:
        chainer.cuda.get_device_from_id(communicator.intra_rank).use()

    return communicator
Example #12
    def setUp(self):
        if nccl.get_build_version() < 2000:
            pytest.skip('This test requires NCCL version >= 2.0')
        self.mpi_comm = mpi4py.MPI.COMM_WORLD
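A hypothetical continuation of this setUp, using the communicator class from the earlier examples; the test method and assertion are illustrative only:

    def test_size(self):
        communicator = PureNcclCommunicator(self.mpi_comm)
        # The ChainerMN communicator mirrors the size of the underlying MPI communicator.
        assert communicator.size == self.mpi_comm.Get_size()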