def setup_workers(self):
    """Spawn one worker process per extra GPU and initialize NCCL.

    Idempotent: the second and later calls return immediately.
    """
    if self._initialized:
        return
    self._initialized = True

    self.model.cleargrads()
    # Per-worker minibatch size; invariant across the loop, so hoisted.
    per_worker_batch = int(
        float(self.batch) / len(self.gpus) / self.train_batch_divide)
    for gpu_index in six.moves.range(1, len(self.gpus)):
        master_end, worker_end = multiprocessing.Pipe()
        proc = _Worker(gpu_index, worker_end, self.model, self.gpus,
                       self.da, per_worker_batch, self)
        proc.start()
        self._workers.append(proc)
        self._pipes.append(master_end)

    with cuda.Device(self.gpus[0]):
        self.model.to_gpu(self.gpus[0])
        if len(self.gpus) > 1:
            # Master generates the NCCL id and shares it with the workers.
            communication_id = nccl.get_unique_id()
            self._send_message(("set comm_id", communication_id))
            self.communication = nccl.NcclCommunicator(
                len(self.gpus), communication_id, 0)
def _init_with_mpi(self, n_devices, rank):
    """Create the NCCL communicator, using MPI only to share the unique id.

    MPI is used only for management purposes, so the MPI rank may differ
    from the NCCL ``rank`` passed in.
    """
    self._mpi_comm = MPI.COMM_WORLD
    self._mpi_rank = self._mpi_comm.Get_rank()
    self._mpi_comm.Barrier()

    # Rank 0 produces the id; everyone receives it via broadcast.
    nccl_id = nccl.get_unique_id() if self._mpi_rank == 0 else None
    nccl_id = self._mpi_comm.bcast(nccl_id, root=0)

    # Initialize devices.
    self._comm = nccl.NcclCommunicator(n_devices, nccl_id, rank)
def _init_with_tcp_store(self, n_devices, rank, host, port):
    """Exchange the NCCL unique id through the TCP store, then build the comm."""
    nccl_id = None
    if rank == 0:
        self._store.run(host, port)
        nccl_id = nccl.get_unique_id()
        # get_unique_id return negative values due to cython issues
        # with bytes && c strings. We shift them by 128 to
        # make them positive and send them as bytes to the proxy store
        shifted = bytes(b + 128 for b in nccl_id)
        self._store_proxy['nccl_id'] = shifted
        self._store_proxy.barrier()
    else:
        self._store_proxy.barrier()
        raw = self._store_proxy['nccl_id']
        # Undo the +128 shift applied by rank 0 above.
        nccl_id = tuple(int(b) - 128 for b in raw)
    self._comm = nccl.NcclCommunicator(n_devices, nccl_id, rank)
def _get_nccl_comm(self, device, devices):
    """Return the NCCL communicator for *devices*, creating and caching it."""
    cache_key = str(devices)
    try:
        return self.nccl_comms[cache_key]
    except KeyError:
        pass

    # Rank 0 generates the unique id; all ranks receive it via MPI bcast.
    nccl_comm_id = nccl.get_unique_id() if self.rank == 0 else None
    nccl_comm_id = self.mpi_comm.bcast(nccl_comm_id)

    with device:
        comm = nccl.NcclCommunicator(self.size, nccl_comm_id, self.rank)
    self.nccl_comms[cache_key] = comm
    return comm
def setup_workers(self):
    """One-time setup: launch a worker per extra device and init NCCL."""
    if self._initialized:
        return
    self._initialized = True

    self._master.cleargrads()
    for device_index in six.moves.range(1, len(self._devices)):
        master_end, worker_end = multiprocessing.Pipe()
        proc = _Worker(device_index, worker_end, self)
        proc.start()
        self._workers.append(proc)
        self._pipes.append(master_end)

    with cuda.Device(self._devices[0]):
        self._master.to_gpu(self._devices[0])
        if len(self._devices) > 1:
            # Master creates the NCCL id and distributes it to workers.
            comm_id = nccl.get_unique_id()
            self._send_message(("set comm_id", comm_id))
            self.comm = nccl.NcclCommunicator(
                len(self._devices), comm_id, 0)
def __init__(self):
    """Discover the process layout and (optionally) create the NCCL comm.

    Falls back to a single-process layout (size=1, rank=0) when mpi4py
    support is disabled in the configuration.
    """
    if config.mpi4py_enabled:
        self.mpi_comm = MPI.COMM_WORLD
        self.size = self.mpi_comm.Get_size()
        self.rank = self.mpi_comm.Get_rank()
    else:
        self.size = 1
        self.rank = 0
    # Round-robin ranks over the visible GPUs.
    self.device = Device(self.rank % cp.cuda.runtime.getDeviceCount())
    if config.nccl_enabled:
        if self.rank == 0:
            nccl_comm_id = nccl.get_unique_id()
        else:
            nccl_comm_id = None
        # Only broadcast when MPI is actually available. Previously this
        # unconditionally called self.mpi_comm.bcast, which raised
        # AttributeError when nccl was enabled but mpi4py was not
        # (self.mpi_comm is never assigned in that branch). With a single
        # process, rank 0 already holds the id locally.
        if config.mpi4py_enabled:
            nccl_comm_id = self.mpi_comm.bcast(nccl_comm_id)
        with self.device:
            self.nccl_comm = nccl.NcclCommunicator(
                self.size, nccl_comm_id, self.rank)
def get_nccl_unique_id():
    """Return a fresh NCCL unique id for communicator construction."""
    unique_id = nccl.get_unique_id()
    return unique_id
def __init__(self, rank, world_size):
    """Store the process layout and generate this group's NCCL unique id."""
    # Bind the module-level `nccl` name lazily so cupy is only required
    # once an instance is actually constructed.
    global nccl
    from cupy.cuda import nccl
    self.rank, self.world_size = rank, world_size
    self.unique_id = nccl.get_unique_id()
def make_sesame(self, intra_rank):
    """Generate a NCCL unique id on GPU *intra_rank*, JSON-encoded."""
    print('Using GPU No.', intra_rank, " @to get unique id")
    # Make the requested device current before asking NCCL for an id.
    chainer.cuda.get_device_from_id(intra_rank).use()
    unique_id = nccl.get_unique_id()
    return json.dumps(unique_id)
def uid_gen(intra_rank):
    """Select GPU *intra_rank* and return a JSON-encoded NCCL unique id."""
    chainer.cuda.get_device_from_id(intra_rank).use()
    unique_id = nccl.get_unique_id()
    return json.dumps(unique_id)
def test_comm_size(self):
    """A single-rank communicator reports size 1."""
    # Renamed local from `id` to avoid shadowing the builtin.
    comm_id = nccl.get_unique_id()
    comm = nccl.NcclCommunicator(1, comm_id, 0)
    assert 1 == comm.size()
def test_check_async_error(self):
    """check_async_error on a fresh one-rank communicator does not raise."""
    # Renamed local from `id` to avoid shadowing the builtin.
    comm_id = nccl.get_unique_id()
    comm = nccl.NcclCommunicator(1, comm_id, 0)
    comm.check_async_error()
    comm.destroy()
def test_abort(self):
    """abort() on a fresh one-rank communicator does not raise."""
    # Renamed local from `id` to avoid shadowing the builtin.
    comm_id = nccl.get_unique_id()
    comm = nccl.NcclCommunicator(1, comm_id, 0)
    comm.abort()
def test_single_proc_ring(self):
    """A single-process ring communicator has rank 0."""
    # Renamed local from `id` to avoid shadowing the builtin.
    comm_id = nccl.get_unique_id()
    comm = nccl.NcclCommunicator(1, comm_id, 0)
    assert 0 == comm.rank_id()
    comm.destroy()
def test_nccl(self):
    """Smoke test: a one-rank NCCL communicator can be constructed."""
    unique_id = libnccl.get_unique_id()
    comm = libnccl.NcclCommunicator(1, unique_id, 0)  # NOQA