Example #1
    def get_intranode_pair_comm(self, pair):
        '''a gpucomm between the two processes in the pair'''
        # pair is a size-two tuple of MPI ranks: the server (rank 0) and a worker

        from pygpu import collectives

        _local_id = collectives.GpuCommCliqueId(context=self.ctx)

        string = _local_id.comm_id.decode('utf-8')

        comm = self.comm
        rank = comm.rank
        size = comm.size

        if rank == pair[0]:
            _string = comm.recv(source=pair[1], tag=220)
        else:
            _string = string
            comm.send(_string, dest=pair[0], tag=220)

        _local_id.comm_id = bytearray(_string.encode('utf-8'))
        _local_size = len(pair)  # number of intra-node processes (a pair, so 2)

        if self.rank == pair[0]:
            _local_rank = 0
        else:
            _local_rank = 1

        gpucomm = collectives.GpuComm(_local_id, _local_size, _local_rank)

        return gpucomm
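For context, here is a minimal, self-contained sketch of the same comm-id exchange, assuming it is launched as mpirun -np 2 python pair_comm.py with one GPU per rank (the file name and rank-to-device mapping are illustrative assumptions, not part of the original class):

from mpi4py import MPI
import theano.gpuarray
from pygpu import collectives

comm = MPI.COMM_WORLD
theano.gpuarray.use("cuda%d" % comm.rank)  # bind one GPU per rank (assumed layout)
ctx = theano.gpuarray.get_context(None)

clique_id = collectives.GpuCommCliqueId(context=ctx)
if comm.rank == 0:
    # rank 0 generated the clique id; share it with the peer over MPI
    comm.send(clique_id.comm_id, dest=1, tag=220)
else:
    clique_id.comm_id = comm.recv(source=0, tag=220)

gpucomm = collectives.GpuComm(clique_id, comm.size, comm.rank)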
Example #2
 def __init__(self, n_gpu, rank, master_rank):
     import theano.gpuarray
     from pygpu import collectives as gpu_coll
     # `sync` (a shared dict plus a barrier) is assumed to be module-level state
     gpu_ctx = theano.gpuarray.get_context(None)
     clique_id = gpu_coll.GpuCommCliqueId(gpu_ctx)
     if rank == master_rank:
         sync.dict["gpu_comm_id"] = clique_id.comm_id
         sync.barrier.wait()
     else:
         sync.barrier.wait()
         clique_id.comm_id = sync.dict["gpu_comm_id"]
     self.comm = gpu_coll.GpuComm(clique_id, n_gpu, rank)
     self.n_gpu = n_gpu
     self.avg_fac = 1. / n_gpu
     self.master_rank = master_rank
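The sync object is not defined in this snippet; a minimal stand-in, assuming Python's multiprocessing primitives, might look like the following sketch (the class name and layout are illustrative):

import multiprocessing as mp

class Sync:
    def __init__(self, n_proc):
        self._manager = mp.Manager()        # keep a reference so the dict stays alive
        self.dict = self._manager.dict()    # shared storage for the comm_id
        self.barrier = mp.Barrier(n_proc)   # master writes before workers read

sync = Sync(2)  # e.g. two processes, one per GPU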
Example #3
 def init_comm(self, n_itr, log_interval_itrs):
     import theano.gpuarray
     from pygpu import collectives as gpu_coll
     gpu_ctx = theano.gpuarray.get_context(None)
     clique_id = gpu_coll.GpuCommCliqueId(gpu_ctx)
     self.par_objs.dict["gpu_comm_id"] = clique_id.comm_id
     self.par_objs.dict["n_itr"] = n_itr
     self.par_objs.dict["log_interval_itrs"] = log_interval_itrs
     self.par_objs.dict["initial_param_values"] = self.policy.get_param_values()
     self.par_objs.barrier.wait()
     gpu_comm = gpu_coll.GpuComm(clique_id, self.n_runners, self.rank)
     self.algo.optimizer.init_comm(gpu_comm, self.rank, self.n_runners)
Example #4
def init_gpu(rank, n_gpu, sync, is_master=True):
    """
    Happens after atexit.register(_close) in master and when g.forked=False,
    but before atexit.register(error_close) in workers, so should be careful.

    TODO: probably can simplify or otherwise improve the error catching.
    """
    dev_str = "cuda" + str(rank)
    try:
        import theano.gpuarray
        theano.gpuarray.use(dev_str)
        from pygpu import collectives as gpu_coll
        gpu_ctx = theano.gpuarray.get_context(None)
        clique_id = gpu_coll.GpuCommCliqueId(gpu_ctx)
    except ImportError as e:
        if is_master:
            raise e  # (only master raises ImportError, will join subprocesses)
        else:
            return  # (workers exit quietly)
    except Exception as e:
        sync.exct.workers_OK.value = False  # (let others know it failed)
        raise e
    finally:
        sync.init.barriers.gpu_inits[0].wait()
    if not sync.exct.workers_OK.value:
        return False  # (someone else failed)

    if is_master:
        sync.init.dict["comm_id"] = clique_id.comm_id
        sync.init.barriers.gpu_inits[1].wait()
    else:
        sync.init.barriers.gpu_inits[1].wait()
        clique_id.comm_id = sync.init.dict["comm_id"]

    try:
        gpu_comm = gpu_coll.GpuComm(clique_id, n_gpu, rank)
    except Exception as e:
        sync.exct.workers_OK.value = False
        raise e
    finally:
        sync.init.barriers.gpu_inits[2].wait()

    if not sync.exct.workers_OK.value:
        return False  # (someone else failed)
    else:
        return gpu_comm  # (success)
Example #5
def init_nccl_env(mpi_comm):
    import sys
    from pygpu import collectives as gpucoll
    from theano import gpuarray as theanoga

    gpu_name = None
    gpu_ctx = theanoga.get_context(gpu_name)
    commid = gpucoll.GpuCommCliqueId(gpu_ctx)
    mpi_rank = mpi_comm.Get_rank()
    mpi_size = mpi_comm.Get_size()

    data = commid.comm_id if mpi_rank == 0 else None
    data = mpi_comm.bcast(data, root=0)
    commid.comm_id = data

    comm = gpucoll.GpuComm(commid, mpi_size, mpi_rank)
    print('Init pygpu OK, rank %d' % mpi_rank)
    sys.stdout.flush()
    return comm
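A hedged driver sketch for the function above, run under mpirun with one GPU per rank; the asarray/all_reduce calls follow the public pygpu API, but treat the exact invocation (especially the op string) as an assumption:

import numpy as np
import theano.gpuarray
from mpi4py import MPI
import pygpu

mpi_comm = MPI.COMM_WORLD
theano.gpuarray.use("cuda%d" % mpi_comm.Get_rank())  # assumed rank-to-GPU mapping
gpucomm = init_nccl_env(mpi_comm)

ctx = theano.gpuarray.get_context(None)
x = pygpu.gpuarray.asarray(np.ones(4, dtype="float32"), context=ctx)
gpucomm.all_reduce(x, op="sum", dest=x)  # in-place sum across all ranks
print('rank %d -> %s' % (mpi_comm.Get_rank(), np.asarray(x)))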
Example #6
    def _register_to_platoon(self):
        """
        Asks the Controller for configuration information and creates a NCCL
        communicator that participates in the local node's worker world.

        This requires that Theano has been imported. Through Theano, this
        method gets access to the single GPU context of this worker process,
        which is to be used in all computations done by the worker's process.

        .. note::
           It is necessary that this initialization method is called
           successfully before :meth:`all_reduce` in order to be available
           and functional.

        .. versionadded:: 0.6.0

        """
        if pygpu:
            self.ctx_name = None
            self.gpuctx = theanoga.get_context(self.ctx_name)
            self.device = theanoconf.device
            self._local_id = gpucoll.GpuCommCliqueId(context=self.gpuctx)
            # ask the controller for this node's info needed to participate
            response = self.send_req(
                "platoon-get_platoon_info",
                info={'device': self.device,
                      'local_id': self._local_id.comm_id.decode('utf-8')})
            self._local_id.comm_id = bytearray(
                response['local_id'].encode('utf-8'))
            self._local_size = response['local_size']
            self._local_rank = response['local_rank']
            self._local_comm = gpucoll.GpuComm(self._local_id,
                                               self._local_size,
                                               self._local_rank)
            self._multinode = response['multinode']
            self._global_size = response['global_size']
            self._global_rank = response['global_rank']
        else:
            raise AttributeError("pygpu or theano is not imported")
Example #7
    def get_intranode_comm_pair(self, pre_random_array):

        _local_id = collectives.GpuCommCliqueId(context=self.ctx)

        string = _local_id.comm_id.decode('utf-8')

        import os
        pid = str(os.getpid())
        len_pid = len(pid)

        # replace the process-unique pid in the comm id with a pair-specific string so that an intranode gpucomm can be created

        pair = []
        for index, tmp_pair in enumerate(pre_random_array):
            if self.interrank in tmp_pair:
                pair = tmp_pair
                pair_index = index
                break

        assert pair_index <= 9  # the replacement below needs a single-digit index
        replacement = ''.join(('%d' % pair_index) for i in range(len_pid))
        _string = string.replace(pid, replacement)

        _local_id.comm_id = bytearray(_string.encode('utf-8'))
        _local_size = len(pair)  # intra-node workers in this comm (a pair, so 2); assumes running within a single node

        if self.interrank == pair[0]:
            _local_rank = 0
        else:
            _local_rank = 1

        gpucomm = collectives.GpuComm(_local_id, _local_size, _local_rank)

        if self.test:
            print('on rank %d, pair %s generated' % (self.interrank, pair))

        return gpucomm, pair
Example #8
    def get_intranode_comm(self):
        '''a gpucomm between all synchronous workers'''

        from pygpu import collectives

        _local_id = collectives.GpuCommCliqueId(context=self.ctx)

        string = _local_id.comm_id.decode('utf-8')

        comm = self.comm
        rank = comm.rank
        size = comm.size

        if rank == 0:
            _string = string
        else:
            _string = None

        _string = comm.bcast(_string, root=0)
        _local_id.comm_id = bytearray(_string.encode('utf-8'))

        # make intranode gpucomms, assuming running on multiple nodes
        # 1. get a list of all host-rank strings
        import os
        hosts = [os.uname()[1] + ",%d" % self.rank]
        import numpy as np
        hosts = np.array(comm.allgather(hosts)).flatten().tolist()
        # 2. get a list of local host-rank strings
        localhost = [host for host in hosts if host.startswith(os.uname()[1])]
        # 3. count how many local ranks by counting the local host-rank strings (_local_size)
        _local_size = len(localhost)
        _local_rank = 0
        # 4. give self a rank among those ranks (_local_rank)
        for index, host in enumerate(localhost):
            if host == os.uname()[1] + ",%d" % self.rank:
                _local_rank = index
                break

        self.gpucomm = collectives.GpuComm(_local_id, _local_size, _local_rank)
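The local-rank derivation in steps 1-4 can be hard to read inline; this standalone snippet reproduces it on fabricated hostnames and ranks (the node names and rank numbers are invented for illustration):

# fabricated "hostname,global_rank" strings, as produced by the allgather above
hosts = ['node1,0', 'node1,1', 'node2,2', 'node2,3']
me = 'node2,3'
localhost = [h for h in hosts if h.startswith('node2')]  # entries on my node
_local_size = len(localhost)       # -> 2
_local_rank = localhost.index(me)  # -> 1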
Example #9
def get_intranode_comm(rank, size, ctx):

    from pygpu import collectives

    _local_id = collectives.GpuCommCliqueId(context=ctx)

    string = _local_id.comm_id.decode('utf-8')

    import os
    pid = str(os.getpid())
    len_pid = len(pid)

    # replace the process-unique pid in the comm id with a universal "0..." string so that an intranode gpucomm can be created
    replacement = ''.join('0' for i in range(len_pid))
    _string = string.replace(pid, replacement)

    _local_id.comm_id = bytearray(_string.encode('utf-8'))
    _local_size = size  # intra-node workers (e.g. at most 8 per node on copper); assumes running within a single node
    _local_rank = rank  # assumes running within a single node

    gpucomm = collectives.GpuComm(_local_id, _local_size, _local_rank)

    return gpucomm
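To make the replacement trick concrete, here is its effect on a fabricated comm id (the pid and id bytes are invented for illustration; real comm ids are opaque NCCL byte strings):

pid = '12345'
string = 'nccl-12345-8a3f'  # invented id that happens to contain the pid
replacement = '0' * len(pid)
print(string.replace(pid, replacement))  # -> nccl-00000-8a3f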
Example #10
 def __init__(self, rank, world_size, port_pub_sub, port_push_pull, job_id):
     import os
     import posix_ipc
     from theano import gpuarray
     from pygpu import collectives
     # `utils` (dump_pkl/load_pkl helpers) is assumed to be a project-local module
     self.rank = rank
     self.world_size = world_size
     self.port_pub_sub = port_pub_sub
     self.port_push_pull = port_push_pull
     self.job_id = job_id
     self._lock = posix_ipc.Semaphore("{}_lock".format(self.job_id))
     self.gpu_ctx = gpuarray.get_context(None)
     self.local_id = collectives.GpuCommCliqueId(context=self.gpu_ctx)
     self.lock()
     # the first worker to take the lock writes its comm_id to a file;
     # later workers read that same id back so that all join one clique
     comm_id_file = 'comm_id.pkl'
     if not os.path.isfile(comm_id_file):
         comm_id = self.local_id.comm_id
         utils.dump_pkl(comm_id, comm_id_file)
     else:
         comm_id = utils.load_pkl(comm_id_file)
         self.local_id.comm_id = comm_id
     self.unlock()
     print('local_id', self.local_id.comm_id)
     # the following call blocks until all workers have called it
     self.local_comm = collectives.GpuComm(self.local_id, self.world_size, self.rank)
     self.init_socket()
     print('finished initializing worker with rank %d' % rank)