def recv_multigpu(tensor,
                  src_rank: int,
                  src_gpu_index: int,
                  group_name: str = "default"):
    """Receive a tensor from a remote GPU synchronously.

    The function assumes each process owns >1 GPUs, and the sender
    process and the receiver process have an equal number of GPUs.

    Args:
        tensor: the received tensor, located on a GPU.
        src_rank (int): the rank of the source process.
        src_gpu_index (int): the index of the source gpu on the src process.
        group_name (str): the name of the collective group.

    Returns:
        None

    Raises:
        RuntimeError: if NCCL/cupy is unavailable, or if ``src_rank`` is
            this process's own rank (use a local GPU-to-GPU copy instead).
    """
    if not types.cupy_available():
        raise RuntimeError("recv_multigpu call requires NCCL.")
    _check_single_tensor_input(tensor)
    g = _check_and_get_group(group_name)
    _check_rank_valid(g, src_rank)
    if src_rank == g.rank:
        # Bug fix: the message previously said "dst_rank", but this is the
        # receive path -- it is the *source* rank that equals self here.
        raise RuntimeError("The src_rank '{}' is self. Considering "
                           "doing GPU to GPU memcpy instead?".format(src_rank))
    opts = types.RecvOptions()
    opts.src_rank = src_rank
    opts.src_gpu_index = src_gpu_index
    g.recv([tensor], opts)
def reducescatter_multigpu(output_tensor_list,
                           input_tensor_lists,
                           group_name: str = "default",
                           op=types.ReduceOp.SUM):
    """Reducescatter a list of tensors across all GPUs.

    Args:
        output_tensor_list: the resulted list of tensors, with
            shape: num_gpus * shape(tensor).
        input_tensor_lists: the original tensors, with shape:
            num_gpus * world_size * shape(tensor).
        group_name (str): the name of the collective group.
        op: The reduce operation.

    Returns:
        None.
    """
    if not types.cupy_available():
        raise RuntimeError("Multigpu calls requires NCCL and Cupy.")
    # Validate inputs before resolving the group, mirroring the other
    # multigpu collectives.
    _check_tensor_lists_input(input_tensor_lists)
    _check_tensor_list_input(output_tensor_list)
    group = _check_and_get_group(group_name)
    options = types.ReduceScatterOptions()
    options.reduceOp = op
    group.reducescatter(output_tensor_list, input_tensor_lists, options)
def send_multigpu(tensor,
                  dst_rank: int,
                  dst_gpu_index: int,
                  group_name: str = "default"):
    """Send a tensor to a remote GPU synchronously.

    The function assumes each process owns >1 GPUs, and the sender
    process and the receiver process have an equal number of GPUs.

    Args:
        tensor: the tensor to send, located on a GPU.
        dst_rank (int): the rank of the destination process.
        dst_gpu_index (int): the destination gpu index.
        group_name (str): the name of the collective group.

    Returns:
        None
    """
    if not types.cupy_available():
        raise RuntimeError("send_multigpu call requires NCCL.")
    _check_single_tensor_input(tensor)
    group = _check_and_get_group(group_name)
    _check_rank_valid(group, dst_rank)
    # Sending to oneself is disallowed; a local device-to-device copy is
    # the right tool for that.
    if dst_rank == group.rank:
        raise RuntimeError("The dst_rank '{}' is self. Considering "
                           "doing GPU to GPU memcpy instead?".format(dst_rank))
    send_opts = types.SendOptions()
    send_opts.dst_rank = dst_rank
    send_opts.dst_gpu_index = dst_gpu_index
    group.send([tensor], send_opts)
def broadcast_multigpu(tensor_list,
                       src_rank: int = 0,
                       src_tensor: int = 0,
                       group_name: str = "default"):
    """Broadcast the tensor from a source GPU to all other GPUs.

    Args:
        tensor_list: the tensors to broadcast (src) or receive (dst).
        src_rank (int): the rank of the source process.
        src_tensor (int): the index of the source GPU on the source process.
        group_name (str): the collective group name to perform broadcast.

    Returns:
        None
    """
    if not types.cupy_available():
        raise RuntimeError("Multigpu calls requires NCCL and Cupy.")
    _check_tensor_list_input(tensor_list)
    group = _check_and_get_group(group_name)
    # Both the source rank and the source GPU slot must exist.
    _check_rank_valid(group, src_rank)
    _check_root_tensor_valid(len(tensor_list), src_tensor)
    bcast_opts = types.BroadcastOptions()
    bcast_opts.root_rank = src_rank
    bcast_opts.root_tensor = src_tensor
    group.broadcast(tensor_list, bcast_opts)
def reduce_multigpu(tensor_list: list,
                    dst_rank: int = 0,
                    dst_tensor: int = 0,
                    group_name: str = "default",
                    op=types.ReduceOp.SUM):
    """Reduce the tensor across the group to the destination rank
    and destination tensor.

    Args:
        tensor_list: the list of tensors to be reduced on this process;
            each tensor located on a GPU.
        dst_rank (int): the rank of the destination process.
        dst_tensor: the index of GPU at the destination.
        group_name (str): the collective group name to perform reduce.
        op: The reduce operation.

    Returns:
        None
    """
    if not types.cupy_available():
        raise RuntimeError("Multigpu calls requires NCCL and Cupy.")
    _check_tensor_list_input(tensor_list)
    group = _check_and_get_group(group_name)
    # Both the destination rank and the destination GPU slot must exist.
    _check_rank_valid(group, dst_rank)
    _check_root_tensor_valid(len(tensor_list), dst_tensor)
    reduce_opts = types.ReduceOptions()
    reduce_opts.reduceOp = op
    reduce_opts.root_rank = dst_rank
    reduce_opts.root_tensor = dst_tensor
    group.reduce(tensor_list, reduce_opts)
def _check_single_tensor_input(tensor): """Check if the tensor is with a supported type.""" if isinstance(tensor, np.ndarray): return if types.cupy_available(): if isinstance(tensor, types.cp.ndarray): return if types.torch_available(): if isinstance(tensor, types.th.Tensor): return raise RuntimeError("Unrecognized tensor type '{}'. Supported types are: " "np.ndarray, torch.Tensor, cupy.ndarray.".format( type(tensor)))
def synchronize(gpu_id: int):
    """Synchronize the current process to a give device.

    Args:
        gpu_id (int): the GPU device id to synchronize.

    Returns:
        None
    """
    if not types.cupy_available():
        raise RuntimeError("synchronize call requires CUDA and NCCL.")
    # Import lazily so the module stays importable without cupy installed.
    import cupy as cp
    device = cp.cuda.Device(gpu_id)
    device.synchronize()
def allreduce_multigpu(tensor_list: list,
                       group_name: str = "default",
                       op=types.ReduceOp.SUM):
    """Collective allreduce a list of tensors across the group.

    Args:
        tensor_list (List[tensor]): list of tensors to be allreduced,
            each on a GPU.
        group_name (str): the collective group name to perform allreduce.
        op: The reduce operation.

    Returns:
        None
    """
    if not types.cupy_available():
        raise RuntimeError("Multigpu calls requires NCCL and Cupy.")
    _check_tensor_list_input(tensor_list)
    g = _check_and_get_group(group_name)
    # Bug fix: the original did `opts = types.AllReduceOptions` (no call),
    # which bound the class itself and made `opts.reduceOp = op` mutate a
    # *class* attribute shared by all callers. Instantiate a fresh options
    # object instead, matching every other collective in this module.
    opts = types.AllReduceOptions()
    opts.reduceOp = op
    g.allreduce(tensor_list, opts)
def allgather_multigpu(output_tensor_lists: list,
                       input_tensor_list: list,
                       group_name: str = "default"):
    """Allgather tensors from each gpus of the group into lists.

    Args:
        output_tensor_lists (List[List[tensor]]): gathered results, with
            shape must be num_gpus * world_size * shape(tensor).
        input_tensor_list: (List[tensor]): a list of tensors, with shape
            num_gpus * shape(tensor).
        group_name (str): the name of the collective group.

    Returns:
        None
    """
    if not types.cupy_available():
        raise RuntimeError("Multigpu calls requires NCCL and Cupy.")
    # Validate shapes of outputs first, then inputs, as in the original.
    _check_tensor_lists_input(output_tensor_lists)
    _check_tensor_list_input(input_tensor_list)
    group = _check_and_get_group(group_name)
    gather_opts = types.AllGatherOptions()
    group.allgather(output_tensor_lists, input_tensor_list, gather_opts)