Example 1
def init_nccl_env(mpi_comm):
    import sys

    from pygpu import collectives as gpucoll
    from theano import gpuarray as theanoga

    # Get the default GPU context that Theano initialized for this process.
    gpu_name = None
    gpu_ctx = theanoga.get_context(gpu_name)
    commid = gpucoll.GpuCommCliqueId(gpu_ctx)
    mpi_rank = mpi_comm.Get_rank()
    mpi_size = mpi_comm.Get_size()

    # Rank 0 holds the freshly generated NCCL clique id; broadcast it over
    # MPI so that every rank joins the same communicator.
    data = commid.comm_id if mpi_rank == 0 else None
    data = mpi_comm.bcast(data, root=0)
    commid.comm_id = data

    comm = gpucoll.GpuComm(commid, mpi_size, mpi_rank)
    print('Init pygpu OK, rank %d' % mpi_rank)
    sys.stdout.flush()
    return comm
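
A minimal usage sketch, assuming mpi4py is installed, Theano is configured with one GPU per MPI rank, and the script is launched under mpirun; the launch command and script name are only illustrative:

# Hedged usage sketch (assumes mpi4py and one GPU per rank); launch with
# e.g. `mpirun -np 4 python script.py`.
from mpi4py import MPI

comm = init_nccl_env(MPI.COMM_WORLD)
# `comm` is a pygpu GpuComm; NCCL collectives (e.g. all_reduce) can now be
# issued on GpuArrays living in this process's GPU context.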
Example 2
    def _register_to_platoon(self):
        """
        Asks the Controller for configuration information and creates an
        NCCL communicator that participates in the local node's worker
        world.

        This requires that Theano has been imported. Through Theano, this
        method gets access to the single GPU context of this worker
        process. That context is to be used in all computations done by
        the worker's process.

        .. note::
           This initialization method must complete successfully before
           :meth:`all_reduce` becomes available and functional.

        .. versionadded:: 0.6.0

        """
        if pygpu:
            self.ctx_name = None
            self.gpuctx = theanoga.get_context(self.ctx_name)
            self.device = theanoconf.device
            self._local_id = gpucoll.GpuCommCliqueId(context=self.gpuctx)
            # Ask the controller for the info needed to join the local
            # node's worker world.
            response = self.send_req("platoon-get_platoon_info",
                                     info={'device': self.device,
                                           'local_id': self._local_id.comm_id.decode('utf-8')})
            self._local_id.comm_id = bytearray(
                response['local_id'].encode('utf-8'))
            self._local_size = response['local_size']
            self._local_rank = response['local_rank']
            self._local_comm = gpucoll.GpuComm(self._local_id,
                                               self._local_size,
                                               self._local_rank)
            self._multinode = response['multinode']
            self._global_size = response['global_size']
            self._global_rank = response['global_rank']
        else:
            raise AttributeError("pygpu or theano is not imported")
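
Once registration succeeds, the local communicator can be used for NCCL collectives. The following is a hedged sketch, not Platoon's actual code: `worker` is assumed to be an instance of the class above, and the `op='sum'` string is assumed to match pygpu's collectives API.

# Sketch only: `worker` is a hypothetical instance of the worker class above.
import numpy as np
from pygpu import gpuarray

worker._register_to_platoon()
src = gpuarray.array(np.ones(8, dtype='float32'),
                     context=worker.gpuctx)
# Sum `src` across all workers on this node; the result is a new GpuArray.
total = worker._local_comm.all_reduce(src, op='sum')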
Example 3
    def __init__(self, rank, world_size, port_pub_sub, port_push_pull, job_id):
        self.rank = rank
        self.world_size = world_size
        self.port_pub_sub = port_pub_sub
        self.port_push_pull = port_push_pull
        self.job_id = job_id
        self._lock = posix_ipc.Semaphore("{}_lock".format(self.job_id))
        self.gpu_ctx = gpuarray.get_context(None)
        self.local_id = collectives.GpuCommCliqueId(context=self.gpu_ctx)
        # Share one NCCL clique id among all local workers through a pickle
        # file: the first worker to acquire the lock writes its id, the
        # others read it back.
        self.lock()
        comm_id_file = 'comm_id.pkl'
        if not os.path.isfile(comm_id_file):
            comm_id = self.local_id.comm_id
            utils.dump_pkl(comm_id, comm_id_file)
        else:
            comm_id = utils.load_pkl(comm_id_file)
            self.local_id.comm_id = comm_id
        self.unlock()
        print('local_id ', self.local_id.comm_id)
        # The following call blocks until all workers have called it.
        self.local_comm = collectives.GpuComm(self.local_id, self.world_size,
                                              self.rank)
        self.init_socket()
        print('finish init worker with rank %d' % rank)
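
The `lock`/`unlock` helpers are referenced but not shown in this snippet. A minimal sketch of what they might look like, given the `posix_ipc` semaphore created above (an assumption, not the project's actual implementation):

    def lock(self):
        # Block until the named POSIX semaphore is acquired.
        self._lock.acquire()

    def unlock(self):
        # Release the semaphore so the next worker can enter the section.
        self._lock.release()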