def setup_workers(self):
    """Start the workers and set up the master side; runs at most once.

    On the first call this clears the model gradients, creates one
    ``_Worker`` (connected through a ``multiprocessing.Pipe``) for every
    entry of ``self.gpus`` after the first, and moves the model to
    ``self.gpus[0]``.  When more than one GPU is configured, a fresh NCCL
    unique id is passed to ``self._send_message`` and the master creates
    its own communicator with rank 0.

    Subsequent calls return immediately.
    """
    # Work only once.
    if self._initialized:
        return
    self._initialized = True

    self.model.cleargrads()

    gpu_count = len(self.gpus)
    for rank in six.moves.range(1, gpu_count):
        master_end, worker_end = multiprocessing.Pipe()
        # Batch share handed to this worker, truncated to an int.
        share = int(float(self.batch) / gpu_count / self.train_batch_divide)
        child = _Worker(
            rank, worker_end, self.model, self.gpus, self.da, share, self)
        child.start()
        self._workers.append(child)
        self._pipes.append(master_end)

    master_gpu = self.gpus[0]
    with cuda.Device(master_gpu):
        self.model.to_gpu(master_gpu)
        if gpu_count > 1:
            communication_id = nccl.get_unique_id()
            self._send_message(("set comm_id", communication_id))
            self.communication = nccl.NcclCommunicator(
                gpu_count, communication_id, 0)
def setup(self):
    """Join the NCCL communicator and prepare the model and reporter.

    Receives a two-element message from ``self.pipe`` whose second item
    is the NCCL unique id, creates the communicator for this process,
    moves the model to ``self.device`` and registers the model with a
    new ``Reporter`` under the name ``'main'``.
    """
    _tag, nccl_uid = self.pipe.recv()
    self.comm = nccl.NcclCommunicator(
        self.n_devices, nccl_uid, self.proc_id)

    self.model.to_gpu(self.device)

    self.reporter = reporter.Reporter()
    self.reporter.add_observer('main', self.model)
def setup(self):
    """Join the NCCL communicator and prepare the model and reporter.

    Receives a two-element message from ``self.pipe`` whose second item
    is the NCCL unique id, creates the communicator for this process and
    moves the model to ``self.device``.  A new ``Reporter`` then observes
    the model as ``'main'`` together with every link yielded by
    ``namedlinks(skipself=True)``.
    """
    _tag, nccl_uid = self.pipe.recv()
    self.comm = nccl.NcclCommunicator(
        self.n_devices, nccl_uid, self.proc_id)

    self.model.to_device(self.device)

    self.reporter = reporter.Reporter()
    self.reporter.add_observer('main', self.model)
    child_links = self.model.namedlinks(skipself=True)
    self.reporter.add_observers('main', child_links)
def _init_with_mpi(self, n_devices, rank):
    """Create the NCCL communicator, sharing the unique id over MPI.

    MPI rank 0 generates the NCCL unique id and broadcasts it to every
    other MPI process; each process then creates its communicator with
    the caller-supplied ``n_devices`` and ``rank``.
    """
    # MPI is used only for management purposes,
    # so the rank may be different than the one specified.
    world = MPI.COMM_WORLD
    self._mpi_comm = world
    self._mpi_rank = world.Get_rank()
    world.Barrier()

    local_id = nccl.get_unique_id() if self._mpi_rank == 0 else None
    shared_id = world.bcast(local_id, root=0)

    # Initialize devices
    self._comm = nccl.NcclCommunicator(n_devices, shared_id, rank)
def _init_with_tcp_store(self, n_devices, rank, host, port):
    """Create the NCCL communicator, sharing the unique id via the store.

    Rank 0 runs the store on ``host``/``port``, generates the NCCL unique
    id and publishes it under the ``'nccl_id'`` key before reaching the
    barrier.  Every other rank waits on the barrier and then reads the id
    back.
    """
    if rank != 0:
        self._store_proxy.barrier()
        stored = self._store_proxy['nccl_id']
        # Undo the +128 shift applied by rank 0 before publishing.
        nccl_id = tuple(int(value) - 128 for value in stored)
    else:
        self._store.run(host, port)
        nccl_id = nccl.get_unique_id()
        # get_unique_id returns negative values due to cython issues
        # with bytes && c strings. We shift them by 128 to
        # make them positive and send them as bytes to the proxy store.
        self._store_proxy['nccl_id'] = bytes(
            value + 128 for value in nccl_id)
        self._store_proxy.barrier()
    self._comm = nccl.NcclCommunicator(n_devices, nccl_id, rank)
def _get_nccl_comm(self, device, devices):
    """Return the NCCL communicator cached for ``devices``, creating it if needed.

    Communicators are cached in ``self.nccl_comms`` keyed by
    ``str(devices)``.  On a cache miss, rank 0 generates a unique id,
    the id is broadcast through ``self.mpi_comm``, and the communicator
    is created inside the ``device`` context.
    """
    cache_key = str(devices)
    if cache_key in self.nccl_comms:
        return self.nccl_comms[cache_key]

    local_id = nccl.get_unique_id() if self.rank == 0 else None
    shared_id = self.mpi_comm.bcast(local_id)

    with device:
        communicator = nccl.NcclCommunicator(
            self.size, shared_id, self.rank)
    self.nccl_comms[cache_key] = communicator
    return communicator
def setup_workers(self):
    """Start the workers and set up the master side; runs at most once.

    On the first call this clears the master model's gradients, creates
    one ``_Worker`` (connected through a ``multiprocessing.Pipe``) for
    every entry of ``self._devices`` after the first, and moves the
    master model to ``self._devices[0]``.  When more than one device is
    configured, a fresh NCCL unique id is passed to
    ``self._send_message`` and the master creates its own communicator
    with rank 0.

    Subsequent calls return immediately.
    """
    if self._initialized:
        return
    self._initialized = True

    self._master.cleargrads()

    device_count = len(self._devices)
    for rank in six.moves.range(1, device_count):
        master_end, worker_end = multiprocessing.Pipe()
        child = _Worker(rank, worker_end, self)
        child.start()
        self._workers.append(child)
        self._pipes.append(master_end)

    master_device = self._devices[0]
    with cuda.Device(master_device):
        self._master.to_gpu(master_device)
        if device_count > 1:
            comm_id = nccl.get_unique_id()
            self._send_message(("set comm_id", comm_id))
            self.comm = nccl.NcclCommunicator(device_count, comm_id, 0)
def __init__(self):
    """Set up rank, size, device and (optionally) the NCCL communicator.

    With ``config.mpi4py_enabled`` the size and rank come from
    ``MPI.COMM_WORLD``; otherwise the object describes a single process
    (size 1, rank 0).  The device is chosen as the rank modulo the number
    of visible CUDA devices.

    With ``config.nccl_enabled`` rank 0 generates the NCCL unique id.
    The id is broadcast over MPI only when MPI is enabled: without MPI
    there is no ``self.mpi_comm`` to broadcast through, and the lone
    rank 0 already holds the id it generated.
    """
    if config.mpi4py_enabled:
        self.mpi_comm = MPI.COMM_WORLD
        self.size = self.mpi_comm.Get_size()
        self.rank = self.mpi_comm.Get_rank()
    else:
        self.size = 1
        self.rank = 0

    self.device = Device(self.rank % cp.cuda.runtime.getDeviceCount())

    if config.nccl_enabled:
        if self.rank == 0:
            nccl_comm_id = nccl.get_unique_id()
        else:
            nccl_comm_id = None
        # Previously the broadcast ran unconditionally and raised
        # AttributeError (no ``self.mpi_comm``) when MPI was disabled.
        if config.mpi4py_enabled:
            nccl_comm_id = self.mpi_comm.bcast(nccl_comm_id)
        with self.device:
            self.nccl_comm = nccl.NcclCommunicator(
                self.size, nccl_comm_id, self.rank)
def initialize(self, head_id):
    """Create this object's NCCL communicator.

    ``head_id`` is passed as the communicator's unique id, together with
    ``self.world_size`` and ``self.rank``.
    """
    communicator = nccl.NcclCommunicator(
        self.world_size,
        head_id,
        self.rank,
    )
    self.communicator = communicator
def construct(self, size, sesame, rank):
    """Create the NCCL communicator from a JSON-encoded unique id.

    ``sesame`` is a JSON string that decodes to a sequence; it is turned
    into a tuple and used as the communicator's unique id along with
    ``size`` and ``rank``.
    """
    # Imported lazily so cupy is only required when this method runs.
    from cupy.cuda import nccl

    decoded = json.loads(sesame)
    unique_id = tuple(decoded)
    self.nccl_comm = nccl.NcclCommunicator(size, unique_id, rank)
    print('NCCL initialized:', size, rank)
    assert self.nccl_comm is not None
def test_comm_size(self):
    """A communicator created for a single rank reports a size of 1."""
    unique_id = nccl.get_unique_id()
    communicator = nccl.NcclCommunicator(1, unique_id, 0)
    assert communicator.size() == 1
def test_check_async_error(self):
    """``check_async_error`` can be called on a fresh single-rank communicator."""
    unique_id = nccl.get_unique_id()
    communicator = nccl.NcclCommunicator(1, unique_id, 0)
    communicator.check_async_error()
    communicator.destroy()
def test_abort(self):
    """``abort`` can be called on a fresh single-rank communicator."""
    unique_id = nccl.get_unique_id()
    communicator = nccl.NcclCommunicator(1, unique_id, 0)
    communicator.abort()
def test_single_proc_ring(self):
    """A single-rank communicator created with rank 0 reports rank id 0."""
    unique_id = nccl.get_unique_id()
    communicator = nccl.NcclCommunicator(1, unique_id, 0)
    assert communicator.rank_id() == 0
    communicator.destroy()
def setup(self):
    """Join the NCCL communicator and move the model to this device.

    Receives a two-element message from ``self.pipe`` whose second item
    is the NCCL unique id, then creates the communicator from
    ``self.number_of_devices`` and ``self.process_id``.
    """
    _tag, nccl_uid = self.pipe.recv()
    self.communication = nccl.NcclCommunicator(
        self.number_of_devices,
        nccl_uid,
        self.process_id,
    )
    self.model.to_gpu(self.device)
def test_nccl(self):
    """Smoke test: a single-rank NCCL communicator can be constructed."""
    unique_id = libnccl.get_unique_id()
    # Only construction is exercised; the communicator itself is unused.
    communicator = libnccl.NcclCommunicator(1, unique_id, 0)  # NOQA