    def setup_workers(self):
        # Run this set-up only once.
        if self._initialized:
            return
        self._initialized = True

        self.model.cleargrads()
        for i in six.moves.range(1, len(self.gpus)):
            pipe, worker_end = multiprocessing.Pipe()
            worker = _Worker(
                i, worker_end, self.model, self.gpus, self.da,
                int(
                    float(self.batch) / len(self.gpus) /
                    self.train_batch_divide), self)
            worker.start()
            self._workers.append(worker)
            self._pipes.append(pipe)

        with cuda.Device(self.gpus[0]):
            self.model.to_gpu(self.gpus[0])
            if len(self.gpus) > 1:
                # Rank 0 generates the NCCL unique id, shares it with the
                # worker processes over the pipes, then joins the clique as rank 0.
                communication_id = nccl.get_unique_id()
                self._send_message(("set comm_id", communication_id))
                self.communication = nccl.NcclCommunicator(
                    len(self.gpus), communication_id, 0)
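For context, the worker side of this pattern is not shown above: each spawned process receives the broadcast communication_id over its pipe and joins the same NCCL clique using its own rank. A minimal sketch of that counterpart, assuming a pipe endpoint and the same gpus list; the message tag matches the one sent above, but the function and its arguments are hypothetical:

# Hypothetical worker-side counterpart to setup_workers() above.
from chainer import cuda
from cupy.cuda import nccl

def worker_setup(pipe, gpus, proc_id):
    with cuda.Device(gpus[proc_id]):
        # Block until the master sends the id it obtained from get_unique_id().
        tag, comm_id = pipe.recv()          # expects ("set comm_id", comm_id)
        assert tag == "set comm_id"
        # Same comm_id and world size on every process, but a distinct rank.
        return nccl.NcclCommunicator(len(gpus), comm_id, proc_id)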
Example 2
    def _init_with_mpi(self, n_devices, rank):
        # MPI is used only for management purposes,
        # so the rank may be different from the one specified.
        self._mpi_comm = MPI.COMM_WORLD
        self._mpi_rank = self._mpi_comm.Get_rank()
        self._mpi_comm.Barrier()
        nccl_id = None
        if self._mpi_rank == 0:
            nccl_id = nccl.get_unique_id()
        nccl_id = self._mpi_comm.bcast(nccl_id, root=0)
        # Initialize devices
        self._comm = nccl.NcclCommunicator(n_devices, nccl_id, rank)
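A self-contained variation of the same MPI bootstrap can help when testing the wiring end to end. This is only a sketch under assumptions not stated in the example above (mpi4py and CuPy installed, one visible GPU per rank, launched with something like mpiexec -n 2 python script.py); the allReduce call follows the cupy.cuda.nccl binding:

# Sketch: bootstrap NCCL over MPI, then run a small all-reduce as a smoke test.
import cupy as cp
from cupy.cuda import nccl
from mpi4py import MPI

comm = MPI.COMM_WORLD
rank, size = comm.Get_rank(), comm.Get_size()
cp.cuda.Device(rank % cp.cuda.runtime.getDeviceCount()).use()

# Rank 0 creates the unique id; every other rank receives it via MPI broadcast.
nccl_id = nccl.get_unique_id() if rank == 0 else None
nccl_id = comm.bcast(nccl_id, root=0)
nccl_comm = nccl.NcclCommunicator(size, nccl_id, rank)

# Each rank contributes a buffer filled with its own rank; the result is the sum.
x = cp.full(4, rank, dtype=cp.float32)
y = cp.empty_like(x)
stream = cp.cuda.Stream.null
nccl_comm.allReduce(x.data.ptr, y.data.ptr, x.size,
                    nccl.NCCL_FLOAT32, nccl.NCCL_SUM, stream.ptr)
stream.synchronize()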
Example 3
    def _init_with_tcp_store(self, n_devices, rank, host, port):
        nccl_id = None
        if rank == 0:
            self._store.run(host, port)
            nccl_id = nccl.get_unique_id()
            # get_unique_id() can return negative values due to Cython issues
            # with bytes and C strings. Shift them by 128 so that every value
            # is a valid byte, then send them to the proxy store as bytes.
            shifted_nccl_id = bytes([b + 128 for b in nccl_id])
            self._store_proxy['nccl_id'] = shifted_nccl_id
            self._store_proxy.barrier()
        else:
            self._store_proxy.barrier()
            nccl_id = self._store_proxy['nccl_id']
            nccl_id = tuple([int(b) - 128 for b in nccl_id])
        self._comm = nccl.NcclCommunicator(n_devices, nccl_id, rank)
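The shift by 128 works because the unique id is exposed as a sequence of signed char values in [-128, 127]; adding 128 maps them onto valid byte values in [0, 255], and subtracting 128 on the receiving side restores the original tuple, which is exactly what the else branch above does. A minimal round-trip check of that encoding:

# Round-trip the encoding used above: tuple of signed chars <-> bytes.
from cupy.cuda import nccl

nccl_id = nccl.get_unique_id()                   # ints in [-128, 127]
encoded = bytes(b + 128 for b in nccl_id)        # safe to store or send as bytes
decoded = tuple(int(b) - 128 for b in encoded)   # what non-root ranks rebuild
assert decoded == tuple(nccl_id)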
Example 4
    def _get_nccl_comm(self, device, devices):
        # Reuse a communicator that was already created for this device set.
        if str(devices) in self.nccl_comms:
            return self.nccl_comms[str(devices)]

        # Rank 0 generates the unique id; all ranks receive it via MPI broadcast.
        if self.rank == 0:
            nccl_comm_id = nccl.get_unique_id()
        else:
            nccl_comm_id = None

        nccl_comm_id = self.mpi_comm.bcast(nccl_comm_id)

        # Each rank joins the clique on its own device, then caches the result.
        with device:
            nccl_comm = nccl.NcclCommunicator(self.size, nccl_comm_id,
                                              self.rank)
            self.nccl_comms[str(devices)] = nccl_comm

        return nccl_comm
    def setup_workers(self):
        if self._initialized:
            return
        self._initialized = True

        self._master.cleargrads()
        for i in six.moves.range(1, len(self._devices)):
            pipe, worker_end = multiprocessing.Pipe()
            worker = _Worker(i, worker_end, self)
            worker.start()
            self._workers.append(worker)
            self._pipes.append(pipe)

        with cuda.Device(self._devices[0]):
            self._master.to_gpu(self._devices[0])
            if len(self._devices) > 1:
                comm_id = nccl.get_unique_id()
                self._send_message(("set comm_id", comm_id))
                self.comm = nccl.NcclCommunicator(len(self._devices),
                                                  comm_id, 0)
Example 7
    def __init__(self):
        if config.mpi4py_enabled:
            self.mpi_comm = MPI.COMM_WORLD
            self.size = self.mpi_comm.Get_size()
            self.rank = self.mpi_comm.Get_rank()
        else:
            self.size = 1
            self.rank = 0

        # One CUDA device per rank, assigned round-robin.
        self.device = Device(self.rank % cp.cuda.runtime.getDeviceCount())

        if config.nccl_enabled:
            # Rank 0 generates the NCCL unique id and broadcasts it over MPI.
            if self.rank == 0:
                nccl_comm_id = nccl.get_unique_id()
            else:
                nccl_comm_id = None

            nccl_comm_id = self.mpi_comm.bcast(nccl_comm_id)

            with self.device:
                self.nccl_comm = nccl.NcclCommunicator(
                    self.size, nccl_comm_id, self.rank)
Example 8
def get_nccl_unique_id():
    return nccl.get_unique_id()
Example 9
    def __init__(self, rank, world_size):
        global nccl
        from cupy.cuda import nccl
        self.rank = rank
        self.world_size = world_size
        self.unique_id = nccl.get_unique_id()
Example 10
    def make_sesame(self, intra_rank):
        # Select the GPU for this process before generating the unique id.
        print('Using GPU No.', intra_rank, 'to get the unique id')
        chainer.cuda.get_device_from_id(intra_rank).use()
        uid = nccl.get_unique_id()
        return json.dumps(uid)
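A receiving process would then parse the JSON back into the tuple form that NcclCommunicator expects. A sketch of that counterpart; the function name and arguments are illustrative, not part of the example above:

# Hypothetical receiver: rebuild the communicator from the JSON-encoded id.
import json
from cupy.cuda import nccl

def open_sesame(sesame, n_devices, rank):
    uid = tuple(json.loads(sesame))    # json.dumps turned the id tuple into a list
    return nccl.NcclCommunicator(n_devices, uid, rank)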
Example 11
def uid_gen(intra_rank):
    chainer.cuda.get_device_from_id(intra_rank).use()
    return json.dumps(nccl.get_unique_id())
Example 12
    def test_comm_size(self):
        id = nccl.get_unique_id()
        comm = nccl.NcclCommunicator(1, id, 0)
        assert 1 == comm.size()
Example 13
    def test_check_async_error(self):
        id = nccl.get_unique_id()
        comm = nccl.NcclCommunicator(1, id, 0)
        comm.check_async_error()
        comm.destroy()
Example 14
    def test_abort(self):
        id = nccl.get_unique_id()
        comm = nccl.NcclCommunicator(1, id, 0)
        comm.abort()
Example 15
    def test_single_proc_ring(self):
        id = nccl.get_unique_id()
        comm = nccl.NcclCommunicator(1, id, 0)
        assert 0 == comm.rank_id()
        comm.destroy()
Example 16
    def test_nccl(self):
        uid = libnccl.get_unique_id()
        comm = libnccl.NcclCommunicator(1, uid, 0)  # NOQA