Example 1
    def _init_ranks(self):
        my_ranks = _communication_utility.init_ranks(self.mpi_comm)
        assert my_ranks[0] == self.mpi_comm.rank
        self._intra_rank = my_ranks[1]
        self._intra_size = my_ranks[2]
        self._inter_rank = my_ranks[3]
        self._inter_size = my_ranks[4]
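
As the assignments above show, init_ranks() returns a five-element list: the global MPI rank (asserted to equal mpi_comm.rank), followed by the intra-node rank and size and the inter-node rank and size. The helper below is a hypothetical sketch, not part of ChainerMN, that simply names those positions; the node-local vs. cross-node reading follows the later examples, where inter_size > 1 is used to detect a multi-node run.

def unpack_ranks(my_ranks):
    # Hypothetical helper: names the positions of the list returned by
    # _communication_utility.init_ranks(), following the assignments above.
    global_rank, intra_rank, intra_size, inter_rank, inter_size = my_ranks
    return {
        'global_rank': global_rank,   # equals mpi_comm.rank
        'intra_rank': intra_rank,     # rank of this process within its node
        'intra_size': intra_size,     # number of processes on this node
        'inter_rank': inter_rank,     # index of the node this process runs on
        'inter_size': inter_size,     # number of nodes; 1 means single-node
    }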
Example 2
def create_communicator(param, use_gpu):
    if not param.multi_node:
        ranks = _communication_utility.init_ranks(mpi_comm)
        inter_size = ranks[4]
        if inter_size > 1:
            pytest.skip('This test is for single node only')

    if use_gpu and not param.nccl1 and nccl.get_build_version() < 2000:
        pytest.skip('This test requires NCCL version >= 2.0')

    communicator = param.communicator_class(mpi_comm)
    communicator.set_config('batched_copy', param.batched_copy)
    value = communicator.get_config('batched_copy')
    assert param.batched_copy == value

    with pytest.raises(ValueError):
        communicator.set_config('blah blah blah')

    if param.communicator_class is PureNcclCommunicator:
        communicator.set_config('allreduce_grad_dtype',
                                param.allreduce_grad_dtype)
        value = communicator.get_config('allreduce_grad_dtype')
        assert param.allreduce_grad_dtype == value

    if use_gpu:
        chainer.cuda.get_device_from_id(communicator.intra_rank).use()

    return communicator
Example 3
def test_deprecation_single():
    ranks = _communication_utility.init_ranks(mpi_comm)
    inter_size = ranks[4]
    if inter_size > 1:
        pytest.skip('This test is for single node only')

    with chainer.testing.assert_warns(DeprecationWarning):
        chainermn.create_communicator('single_node')
Example 4
    def setUp(self):
        self.mpi_comm = mpi4py.MPI.COMM_WORLD

        if not self.multi_node:
            ranks = _communication_utility.init_ranks(self.mpi_comm)
            inter_size = ranks[4]
            if inter_size > 1:
                raise nose.plugins.skip.SkipTest()

        self.communicator = self.communicator_class(self.mpi_comm)

        if hasattr(self.communicator, 'intra_rank'):
            chainer.cuda.get_device(self.communicator.intra_rank).use()
Example 5
def create_communicator(param, use_gpu):
    if not param.multi_node:
        ranks = _communication_utility.init_ranks(mpi_comm)
        inter_size = ranks[4]
        if inter_size > 1:
            pytest.skip('This test is for single node only')

    if use_gpu and not param.nccl1 and nccl.get_version() < 2000:
        pytest.skip('This test requires NCCL version >= 2.0')

    communicator = param.communicator_class(mpi_comm)

    if use_gpu:
        chainer.cuda.get_device(communicator.intra_rank).use()

    return communicator
Example 6
def create_communicator(param, use_gpu):
    if not param.multi_node:
        ranks = _communication_utility.init_ranks(mpi_comm)
        inter_size = ranks[4]
        if inter_size > 1:
            pytest.skip('This test is for single node only')

    if use_gpu and not param.nccl1 and nccl.get_build_version() < 2000:
        pytest.skip('This test requires NCCL version >= 2.0')

    if param.communicator_class is PureNcclCommunicator:
        communicator = param.communicator_class(
            mpi_comm, allreduce_grad_dtype=param.allreduce_grad_dtype,
            batched_copy=param.batched_copy)
    else:
        communicator = param.communicator_class(mpi_comm)

    if use_gpu:
        chainer.cuda.get_device_from_id(communicator.intra_rank).use()

    return communicator
Example 7
def check_allreduce_grad_mixed_dtype(param, model, use_gpu):
    # Checks the actual allreduce communication is performed
    # in the correct data type (FP16 or FP32)
    comm_class = param.communicator_class

    if not param.multi_node:
        ranks = _communication_utility.init_ranks(mpi_comm)
        inter_size = ranks[4]
        if inter_size > 1:
            pytest.skip('This test is for single node only')

    if comm_class is PureNcclCommunicator:
        communicator = comm_class(
            mpi_comm, allreduce_grad_dtype=param.allreduce_grad_dtype,
            batched_copy=param.batched_copy)
    else:
        communicator = comm_class(mpi_comm)

    mpi_comm.barrier()

    # answer type: see the document of `create_communicator`
    global_dtype = param.global_dtype
    allreduce_dtype = param.allreduce_grad_dtype

    # assert test configuration.
    assert chainer.get_dtype() == global_dtype

    answer_dtype = None
    if allreduce_dtype == np.float16:
        answer_dtype = np.float16
    elif allreduce_dtype == np.float32:
        answer_dtype = np.float32
    else:
        if global_dtype == np.float32:
            answer_dtype = np.float32
        else:
            answer_dtype = np.float16

    if use_gpu:
        model.to_gpu()

    model.a.W.grad[:] = communicator.rank
    model.b.W.grad[:] = communicator.rank + 1
    model.c.b.grad[:] = communicator.rank + 2

    if isinstance(communicator, PureNcclCommunicator):
        communicator._init_comms()
        with mock.patch.object(communicator, 'nccl_comm',
                               wraps=communicator.nccl_comm) as mc:
            answer_dtype = _communication_utility._get_nccl_type_id(
                answer_dtype)

            communicator.allreduce_grad(model)

            # dtype that was used in the actual communication,
            # which is nccl_comm.allReduce
            call_args = mc.allReduce.call_args[0]
            actual_dtype = call_args[3]
            assert answer_dtype == actual_dtype
    else:
        # For other MPI-based communicators,
        # all communication should happen in FP32 as of now, so
        # here we just check the results are correct for
        # 16-32 mixed models.
        communicator.allreduce_grad(model)

    base = (communicator.size - 1.0) / 2
    chainer.testing.assert_allclose(model.a.W.grad,
                                    (base + 0) * np.ones((3, 2)))
    chainer.testing.assert_allclose(model.b.W.grad,
                                    (base + 1) * np.ones((4, 3)))

    mpi_comm.barrier()
    destroy_communicator(communicator)
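
The answer_dtype branching above reduces to a simple rule: an explicit FP16 or FP32 allreduce_grad_dtype wins; otherwise the global dtype decides, with FP16 as the fallback. A minimal sketch of that rule (the helper name is hypothetical, not part of the test suite):

import numpy as np

def expected_allreduce_dtype(allreduce_grad_dtype, global_dtype):
    # Condenses the answer_dtype branching in
    # check_allreduce_grad_mixed_dtype above.
    if allreduce_grad_dtype in (np.float16, np.float32):
        return allreduce_grad_dtype      # explicit setting wins
    if global_dtype == np.float32:
        return np.float32                # otherwise follow the global dtype
    return np.float16                    # default to FP16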