Example #1
    def allgather(self,
                  tensor_list,
                  tensor,
                  allgather_options=AllGatherOptions()):
        """Allgather tensors across the group into a list of  tensors.

        Args:
            tensor_list: the tensor list to store the results.
            tensor: the tensor to be allgather-ed across the group.
            allgather_options: allgather options.

        Returns:
            None
        """

        _check_inputs_compatibility_for_scatter_gather(tensor, tensor_list)
        comm = self._get_nccl_communicator()
        stream = self._get_cuda_stream()

        dtype = nccl_util.get_nccl_tensor_dtype(tensor)
        send_ptr = nccl_util.get_tensor_ptr(tensor)
        n_elems = nccl_util.get_tensor_n_elements(tensor)
        flattened = _flatten_for_scatter_gather(tensor_list, copy=False)
        recv_ptr = nccl_util.get_tensor_ptr(flattened)
        comm.allGather(send_ptr, recv_ptr, n_elems, dtype, stream.ptr)
        for i, t in enumerate(tensor_list):
            nccl_util.copy_tensor(t, flattened[i])
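
The _flatten_for_scatter_gather helper is not shown above. As a minimal sketch of the idea it implements, assuming CuPy tensors (the function below is illustrative, not the library's actual implementation):

import cupy as cp

def flatten_for_scatter_gather_sketch(tensor_list, copy=False):
    # Allocate one contiguous buffer shaped (len(tensor_list), *tensor_shape) so
    # NCCL can read or write every rank's chunk through a single pointer.
    first = tensor_list[0]
    buffer = cp.empty((len(tensor_list),) + first.shape, dtype=first.dtype)
    if copy:
        # reducescatter needs the input values inside the buffer; allgather only
        # needs the space, so it passes copy=False and copies results out afterwards.
        for i, t in enumerate(tensor_list):
            cp.copyto(buffer[i], t)
    return buffer
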
Example #2
 def collective_fn(input_tensor, output_tensor, comm, stream):
     # `reduce_options` and `root_rank` are captured from the enclosing
     # reduce call that defines this closure.
     comm.reduce(nccl_util.get_tensor_ptr(input_tensor),
                 nccl_util.get_tensor_ptr(output_tensor),
                 nccl_util.get_tensor_n_elements(input_tensor),
                 nccl_util.get_nccl_tensor_dtype(input_tensor),
                 nccl_util.get_nccl_reduce_op(reduce_options.reduceOp),
                 root_rank, stream.ptr)
Example #3
    def reducescatter(self,
                      tensor,
                      tensor_list,
                      reducescatter_options=ReduceScatterOptions()):
        """Reducescatter a list of tensors across the group.

        Args:
            tensor: the output tensor that receives this rank's reduced chunk.
            tensor_list: the list of tensors to be reduced and scattered.
            reducescatter_options: reducescatter options.

        Returns:
            None
        """
        _check_inputs_compatibility_for_scatter_gather(tensor, tensor_list)

        comm = self._get_nccl_communicator()
        stream = self._get_cuda_stream()
        dtype = nccl_util.get_nccl_tensor_dtype(tensor_list[0])
        n_elems = nccl_util.get_tensor_n_elements(tensor_list[0])
        reduce_op = nccl_util.get_nccl_reduce_op(
            reducescatter_options.reduceOp)

        # get the send_ptr
        flattened = _flatten_for_scatter_gather(tensor_list, copy=True)
        send_ptr = nccl_util.get_tensor_ptr(flattened)
        recv_ptr = nccl_util.get_tensor_ptr(tensor)
        comm.reduceScatter(send_ptr, recv_ptr, n_elems, dtype, reduce_op,
                           stream.ptr)
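
A rough per-rank usage sketch for the method above, assuming two ranks, CuPy tensors, an already-initialized group object, and SUM as the default reduce op (the group variable and world size are assumptions for illustration):

import cupy as cp

world_size = 2  # assumed group size
# Every rank contributes one input chunk per rank and receives one reduced chunk.
tensor_list = [cp.ones(4, dtype=cp.float32) for _ in range(world_size)]
output = cp.zeros(4, dtype=cp.float32)

group.reducescatter(output, tensor_list)
# With a SUM op, every rank's output now holds [2., 2., 2., 2.].
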
Example #4
 def collective_fn(input_tensor, output_tensor, comm, stream):
     # `root_rank` is captured from the enclosing broadcast call.
     comm.broadcast(
         nccl_util.get_tensor_ptr(input_tensor),
         nccl_util.get_tensor_ptr(output_tensor),
         nccl_util.get_tensor_n_elements(input_tensor),
         nccl_util.get_nccl_tensor_dtype(input_tensor), root_rank,
         stream.ptr)
Example #5
 def collective_fn(input_tensor, output_tensor, comm, stream):
     # `reducescatter_options` is captured from the enclosing reducescatter call;
     # the element count and dtype are taken from the smaller output tensor.
     comm.reduceScatter(
         nccl_util.get_tensor_ptr(input_tensor),
         nccl_util.get_tensor_ptr(output_tensor),
         nccl_util.get_tensor_n_elements(output_tensor),
         nccl_util.get_nccl_tensor_dtype(output_tensor),
         nccl_util.get_nccl_reduce_op(reducescatter_options.reduceOp),
         stream.ptr)
Example #6
 def collective_fn(input_tensor, output_tensor, comm, stream):
     comm.allGather(
         nccl_util.get_tensor_ptr(input_tensor),
         nccl_util.get_tensor_ptr(output_tensor),
         nccl_util.get_tensor_n_elements(input_tensor),
         nccl_util.get_nccl_tensor_dtype(input_tensor),
         stream.ptr,
     )
Example #7
 def p2p_fn(tensor, comm, stream, peer):
     # `recv_options` is captured from the enclosing recv call; a positive
     # n_elements overrides the element count derived from the tensor itself.
     comm.recv(
         nccl_util.get_tensor_ptr(tensor),
         recv_options.n_elements if recv_options.n_elements > 0 else
         nccl_util.get_tensor_n_elements(tensor),
         nccl_util.get_nccl_tensor_dtype(tensor),
         peer,
         stream.ptr,
     )
Example #8
    def broadcast(self, tensor, broadcast_options=BroadcastOptions()):
        """Broadcast tensor to all other processes following options.

        Args:
            tensor: the tensor to be broadcast.
            broadcast_options: broadcast options.

        Returns:
            None
        """
        comm = self._get_nccl_communicator()
        stream = self._get_cuda_stream()

        dtype = nccl_util.get_nccl_tensor_dtype(tensor)
        ptr = nccl_util.get_tensor_ptr(tensor)
        n_elems = nccl_util.get_tensor_n_elements(tensor)
        # in-place broadcast
        comm.broadcast(ptr, ptr, n_elems, dtype, broadcast_options.root_rank,
                       stream.ptr)
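
A minimal in-place usage sketch for broadcast, again assuming two ranks, CuPy tensors, and an initialized group object (rank and group are assumptions for illustration):

import cupy as cp

# On the root rank the tensor already holds the payload; every other rank
# passes a correctly shaped buffer that is overwritten in place.
if rank == 0:
    tensor = cp.arange(4, dtype=cp.float32)
else:
    tensor = cp.zeros(4, dtype=cp.float32)

opts = BroadcastOptions()
opts.root_rank = 0
group.broadcast(tensor, opts)
# Every rank now holds [0., 1., 2., 3.].
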
Example #9
    def reduce(self, tensor, reduce_options=ReduceOptions()):
        """Reduce tensor to a destination process following options.

        Args:
            tensor: the tensor to be reduced.
            reduce_options: reduce options.

        Returns:
            None
        """
        comm = self._get_nccl_communicator()
        stream = self._get_cuda_stream()

        dtype = nccl_util.get_nccl_tensor_dtype(tensor)
        ptr = nccl_util.get_tensor_ptr(tensor)
        n_elems = nccl_util.get_tensor_n_elements(tensor)
        reduce_op = nccl_util.get_nccl_reduce_op(reduce_options.reduceOp)

        # in-place reduce
        comm.reduce(ptr, ptr, n_elems, dtype, reduce_op,
                    reduce_options.root_rank, stream.ptr)
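
A matching sketch for the in-place reduce, under the same assumptions (two ranks, CuPy tensors, an initialized group, SUM as the default op):

import cupy as cp

tensor = cp.ones(4, dtype=cp.float32)  # every rank contributes ones

opts = ReduceOptions()
opts.root_rank = 0
group.reduce(tensor, opts)
# Only the root rank receives the reduced values [2., 2., 2., 2.]; the other
# ranks' buffers are left holding their original inputs.
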
Example #10
    def allreduce(self, tensor, allreduce_options=AllReduceOptions()):
        """AllReduce the tensor across the collective group following options.

        Args:
            tensor: the tensor to be reduced; it should reside on a GPU.
            allreduce_options: allreduce options.

        Returns:
            None
        """
        # obtain the communicator
        comm = self._get_nccl_communicator()
        # obtain the stream: using the default stream for now
        # TODO(Hao): implement a simple stream manager here
        stream = self._get_cuda_stream()

        dtype = nccl_util.get_nccl_tensor_dtype(tensor)
        ptr = nccl_util.get_tensor_ptr(tensor)
        n_elems = nccl_util.get_tensor_n_elements(tensor)
        reduce_op = nccl_util.get_nccl_reduce_op(allreduce_options.reduceOp)

        # in-place allreduce
        comm.allReduce(ptr, ptr, n_elems, dtype, reduce_op, stream.ptr)
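
And a sketch for the in-place allreduce under the same assumptions:

import cupy as cp

tensor = cp.ones(4, dtype=cp.float32) * (rank + 1)  # rank 0 holds ones, rank 1 holds twos

group.allreduce(tensor)  # default AllReduceOptions(); SUM is assumed
# Every rank's tensor now holds the element-wise sum [3., 3., 3., 3.].
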
Example #11
 def p2p_fn(tensor, comm, stream, peer):
     comm.recv(nccl_util.get_tensor_ptr(tensor),
               nccl_util.get_tensor_n_elements(tensor),
               nccl_util.get_nccl_tensor_dtype(tensor), peer,
               stream.ptr)