Example #1
    def update(self, src_rank):
        """Receive gradients and update"""
        keys = list(self.params.keys())
        grads = dict()
        recv_list = []
        for key in keys:
            to_recv = self.params[key]
            recv_list.append(torch.zeros(to_recv.size()).cuda())

        groupStart()
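        # Post a receive for every parameter tensor; the grouped calls are
        # all issued together once the group ends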
        for i in range(len(keys)):
            collective.recv(recv_list[i], src_rank, "default")
        groupEnd()

        for i in range(len(keys)):
            grads[keys[i]] = recv_list[i]

        self._inc_gradients(grads)
        if self.grad_counts == len(self.workers):
            #self.optimizer.zero_grad()
            #self._set_gradients(grads)
            self.optimizer.step()
            self.optimizer.zero_grad()

        return True
Example #2
    def test_send_recv(self):
        devs = [0, 1]
        comms = nccl.NcclCommunicator.initAll(devs)
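        # A single host thread drives both devices, so the send and recv
        # are grouped: nothing launches until groupEnd, which lets the
        # matching pair be posted without blocking each other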
        nccl.groupStart()
        for comm in comms:
            dev_id = comm.device_id()
            rank = comm.rank_id()
            assert rank == dev_id

            if rank == 0:
                with cuda.Device(dev_id):
                    sendbuf = cupy.arange(10, dtype=cupy.int64)
                    comm.send(sendbuf.data.ptr, 10, nccl.NCCL_INT64, 1,
                              cuda.Stream.null.ptr)
            elif rank == 1:
                with cuda.Device(dev_id):
                    recvbuf = cupy.zeros(10, dtype=cupy.int64)
                    comm.recv(recvbuf.data.ptr, 10, nccl.NCCL_INT64, 0,
                              cuda.Stream.null.ptr)
        nccl.groupEnd()

        # check result
        with cuda.Device(1):
            expected = cupy.arange(10, dtype=cupy.int64)
            assert (recvbuf == expected).all()
Example #3
 def send_recv(cls, comm, in_array, out_array, peer, stream=None):
     comm._check_contiguous(in_array)
     comm._check_contiguous(out_array)
     stream = comm._get_stream(stream)
     idtype, icount = comm._get_nccl_dtype_and_count(in_array)
     odtype, ocount = comm._get_nccl_dtype_and_count(out_array)
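     # Fusing the send and recv into one NCCL group lets both peers
     # exchange data without their point-to-point calls deadlocking
     # on each other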
     nccl.groupStart()
     cls._send(comm, in_array, peer, idtype, icount, stream)
     cls._recv(comm, out_array, peer, odtype, ocount, stream)
     nccl.groupEnd()
Example #4
 def send(cls, comm, array, peer, stream=None):
     arrays = cls._get_internal_arrays(array)
     shape_and_sizes = cls._get_shape_and_sizes(arrays, array.shape)
     cls._exchange_shape_and_sizes(comm, peer, shape_and_sizes, 'send',
                                   stream)
     # Naive approach: we send each of the subarrays one by one
     nccl.groupStart()
     for a in arrays:
         cls._send(comm, a, peer, a.dtype, a.size, stream)
     nccl.groupEnd()
Example #5
 def test_single_proc_single_dev(self):
     comms = nccl.NcclCommunicator.initAll(1)
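     # Grouping the per-communicator allReduce calls is how one process
     # drives several devices at once; initAll(1) yields a single
     # communicator here, so the group wraps just one call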
     nccl.groupStart()
     for comm in comms:
         cuda.Device(comm.device_id()).use()
         sendbuf = cupy.arange(10)
         recvbuf = cupy.zeros_like(sendbuf)
         comm.allReduce(sendbuf.data.ptr, recvbuf.data.ptr, 10,
                        nccl.NCCL_INT64, nccl.NCCL_SUM,
                        cuda.Stream.null.ptr)
     nccl.groupEnd()
     assert cupy.allclose(sendbuf, recvbuf)
Example #6
 def scatter(cls, comm, in_array, out_array, root=0, stream=None):
     # in_array is a list of sparse matrices
     if comm.rank == root:
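         # The root sends one sparse matrix per peer inside a single
         # group; its own shard is assigned locally below rather than sent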
         nccl.groupStart()
         for peer, s_a in enumerate(in_array):
             if peer != root:
                 cls.send(comm, s_a, peer, stream)
         nccl.groupEnd()
         cls._assign_arrays(out_array,
                            cls._get_internal_arrays(in_array[root]),
                            in_array[root].shape)
     else:
         cls.recv(comm, out_array, root, stream)
Example #7
    def compute(self):
        """Returns the loss, and send gradients to servers"""
        # First receive params from servers
        param_shards = []
        weights = self.get_weights(cpu=False)
        params = dict()
        # create the receive lists to group collective calls
        recv_list = []
        for i in range(self.num_ps):
            recv_list.append([])
            param_shard_keys = self.name_list[i]
            for key in param_shard_keys:
                to_recv = weights[key]
                recv_list[-1].append(torch.zeros(to_recv.size()).cuda())

        # Group the receives from every parameter server so they are all
        # issued together as one batch
        groupStart()
        for i in range(self.num_ps):
            for j in range(len(self.name_list[i])):
                collective.recv(recv_list[i][j], self.num_workers + i,
                                "default")
        groupEnd()
        for i in range(self.num_ps):
            param_shard_keys = self.name_list[i]
            for j in range(len(param_shard_keys)):
                params[param_shard_keys[j]] = recv_list[i][j]

        grad, loss = self.compute_gradients(params)
        split_grad = self.split_gradients(grad, self.assignments)
        groupStart()
        for i in range(self.num_ps):
            this_shard = self.index_shard(split_grad, i)
            for _, v in this_shard.items():
                collective.send(v, self.num_workers + i, "default")
        groupEnd()
        return loss
Example #8
 def scatter(cls, comm, in_array, out_array, root=0, stream=None):
     if in_array.shape[0] != comm._n_devices:
         raise RuntimeError(
             f'scatter requires in_array to have {comm._n_devices} '
             f'elements in its first dimension, found {in_array.shape}')
     comm._check_contiguous(in_array)
     comm._check_contiguous(out_array)
     stream = comm._get_stream(stream)
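     # One group: the root posts a send to every rank (itself included),
     # and every rank posts the matching receive from the root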
     nccl.groupStart()
     if root == comm.rank:
         for i in range(comm._n_devices):
             array = in_array[i]
             idtype, icount = comm._get_nccl_dtype_and_count(array)
             cls._send(comm, array, i, idtype, icount, stream)
     dtype, count = comm._get_nccl_dtype_and_count(out_array)
     cls._recv(comm, out_array, root, dtype, count, stream)
     nccl.groupEnd()
Example #9
 def recv(cls, comm, out_array, peer, stream=None):
     shape_and_sizes = cls._exchange_shape_and_sizes(
         comm, peer, (), 'recv', stream)
     # Change the array sizes in out_array to match the sent ones
     # Receive the three arrays
     # TODO(ecastill) dtype is not correct, it must match the internal
     # sparse matrix arrays dtype
     arrays = cls._get_internal_arrays(out_array)
     shape = tuple(shape_and_sizes[0:2])
     sizes = shape_and_sizes[2:]
     # TODO: use the out_array datatypes
     arrs = [cupy.empty(s, dtype=a.dtype) for s, a in zip(sizes, arrays)]
     nccl.groupStart()
     for a in arrs:
         cls._recv(comm, a, peer, a.dtype, a.size, stream)
     nccl.groupEnd()
     # Create a sparse matrix from the received arrays
     cls._assign_arrays(out_array, arrs, shape)
Example #10
 def gather(cls, comm, in_array, out_array, root=0, stream=None):
     # TODO(ecastill) out_array needs to have comm size in shape[0]
     if out_array.shape[0] != comm._n_devices:
         raise RuntimeError(
             f'gather requires out_array to have {comm._n_devices} '
             f'elements in its first dimension, found {out_array.shape}')
     comm._check_contiguous(in_array)
     comm._check_contiguous(out_array)
     stream = comm._get_stream(stream)
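     # One group: every rank sends its array to the root, while the root
     # also posts a receive from each rank (itself included)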
     nccl.groupStart()
     if root == comm.rank:
         for i in range(comm._n_devices):
             array = out_array[i]
             odtype, ocount = comm._get_nccl_dtype_and_count(array)
             cls._recv(comm, array, i, odtype, ocount, stream)
     dtype, count = comm._get_nccl_dtype_and_count(in_array)
     cls._send(comm, in_array, root, dtype, count, stream)
     nccl.groupEnd()
Example #11
 def reduce(cls, comm, in_array, out_array, root=0, op='sum', stream=None):
     arrays = cls._get_internal_arrays(in_array)
     # All the matrices must share the same size
     shape_and_sizes = cls._get_shape_and_sizes(arrays, in_array.shape)
     shape_and_sizes = cls._exchange_shape_and_sizes(
         comm, root, shape_and_sizes, 'gather', stream)
     if comm.rank == root:
         if _get_sparse_type(in_array) != _get_sparse_type(out_array):
             raise ValueError(
                 'in_array and out_array must be the same format')
         result = in_array
         partial = _make_sparse_empty(in_array.dtype,
                                      _get_sparse_type(in_array))
         # each device will send an array with a different size
         for peer, ss in enumerate(shape_and_sizes):
             shape = tuple(ss[0:2])
             sizes = ss[2:]
             arrays = [
                 cupy.empty(s, dtype=a.dtype)
                 for s, a in zip(sizes, arrays)
             ]
             if peer != root:
                 nccl.groupStart()
                 for a in arrays:
                     cls._recv(comm, a, peer, a.dtype, a.size, stream)
                 nccl.groupEnd()
                 cls._assign_arrays(partial, arrays, shape)
                 if op == 'sum':
                     result = result + partial
                 elif op == 'prod':
                     result = result * partial
                 else:
                     raise ValueError(
                         'Sparse matrix only supports sum/prod reduction')
         # TODO: check output types
         # If out_array is coo we need to convert result to coo before
         # reassigning
         cls._assign_arrays(out_array, cls._get_internal_arrays(result),
                            result.shape)
     else:
         nccl.groupStart()
         for a in arrays:
             cls._send(comm, a, root, a.dtype, a.size, stream)
         nccl.groupEnd()
Example #12
 def send_params(self, dst_rank):
     """ Send this param shard to the destination worker """
     count = 0
     groupStart()
     for name, v in self.params.items():
         collective.send(v, dst_rank, "default")
         if count < 1:
             count += 1
             logging.warning(f"{name} {v[0][0]}, {v.size()}")
         elif count < 2:
             count += 1
             logging.warning(f"{name} {v}, {v.size()}")
         elif count < 3:
             count += 1
             logging.warning(f"{name} {v}, {v.size()}")
         else:
             break
     groupEnd()
     time.sleep(5000)
Example #13
 def all_to_all(cls, comm, in_array, out_array, stream=None):
     # TODO(ecastill) out_array needs to have comm size in shape[0]
     if in_array.shape[0] != comm._n_devices:
         raise RuntimeError(
             f'all_to_all requires in_array to have {comm._n_devices} '
             f'elements in its first dimension, found {in_array.shape}')
     if out_array.shape[0] != comm._n_devices:
         raise RuntimeError(
             f'all_to_all requires out_array to have {comm._n_devices} '
             f'elements in its first dimension, found {out_array.shape}')
     comm._check_contiguous(in_array)
     comm._check_contiguous(out_array)
     stream = comm._get_stream(stream)
     idtype, icount = comm._get_nccl_dtype_and_count(in_array[0])
     odtype, ocount = comm._get_nccl_dtype_and_count(out_array[0])
     # TODO check out dtypes are the same as in dtypes
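     # Pairwise exchange: send this rank's i-th slice to peer i and
     # receive peer i's contribution, all fused into one NCCL group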
     nccl.groupStart()
     for i in range(comm._n_devices):
         cls._send(comm, in_array[i], i, idtype, icount, stream)
         cls._recv(comm, out_array[i], i, odtype, ocount, stream)
     nccl.groupEnd()
Example #14
    def compute(self):
        """Returns the loss, and send gradients to servers"""
        # First receive params from servers
        param_shards = []
        weights = self.get_weights(cpu=False)
        params = dict()
        # create the receive lists to group collective calls
        recv_list = []
        for i in range(self.num_ps):
            recv_list.append([])
            param_shard_keys = self.name_list[i]
            for key in param_shard_keys:
                to_recv = weights[key]
                recv_list[-1].append(torch.zeros(to_recv.size()).cuda())

        # Batch the receives from every parameter server into a single
        # batch_isend_irecv call
        recv_op = [
            dist.P2POp(dist.irecv, v, self.num_workers + i)
            for i in range(self.num_ps) for v in recv_list[i]
        ]
        reqs = dist.batch_isend_irecv(recv_op)
        for req in reqs:
            req.wait()
        for i in range(self.num_ps):
            param_shard_keys = self.name_list[i]
            for j in range(len(param_shard_keys)):
                params[param_shard_keys[j]] = recv_list[i][j]

        grad, loss = self.compute_gradients(params)
        split_grad = self.split_gradients(grad, self.assignments)
        groupStart()
        for i in range(self.num_ps):
            this_shard = self.index_shard(split_grad, i)
            for _, v in this_shard.items():
                collective.send(v, self.num_workers + i, "default")
        groupEnd()
        return loss
Example #15
    def broadcast(cls, comm, in_out_array, root=0, stream=None):
        arrays = cls._get_internal_arrays(in_out_array)
        if comm.rank == root:
            shape_and_sizes = cls._get_shape_and_sizes(arrays,
                                                       in_out_array.shape)
        else:
            shape_and_sizes = ()

        shape_and_sizes = cls._exchange_shape_and_sizes(
            comm, root, shape_and_sizes, 'bcast', stream)
        shape = tuple(shape_and_sizes[0:2])
        sizes = shape_and_sizes[2:]
        # Naive approach: we send each of the subarrays one by one
        if comm.rank != root:
            arrays = [
                cupy.empty(s, dtype=a.dtype) for s, a in zip(sizes, arrays)
            ]
        # TODO(ecastill): measure if it's faster to just concatenate
        # the arrays into a single one and send it
        nccl.groupStart()
        for a in arrays:
            _DenseNCCLCommunicator.broadcast(comm, a, root, stream)
        nccl.groupEnd()
        cls._assign_arrays(in_out_array, arrays, shape)
Example #16
 def send_recv(cls, comm, in_array, out_array, peer, stream=None):
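     # cls.send and cls.recv open their own NCCL groups; group calls
     # nest, so the work launches at the outermost groupEnd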
     nccl.groupStart()
     cls.send(comm, in_array, peer, stream)
     cls.recv(comm, out_array, peer, stream)
     nccl.groupEnd()