def send_multigpu(tensor,
                  dst_rank: int,
                  dst_gpu_index: int,
                  group_name: str = "default"):
    """Send a tensor to a remote GPU synchronously.

    The function assumes each process owns more than one GPU, and that the
    sender process and the receiver process have an equal number of GPUs.

    Args:
        tensor: the tensor to send, located on a GPU.
        dst_rank (int): the rank of the destination process.
        dst_gpu_index (int): the destination gpu index.
        group_name (str): the name of the collective group.

    Returns:
        None
    """
    if not types.cupy_available():
        raise RuntimeError("send_multigpu call requires NCCL.")
    _check_single_tensor_input(tensor)
    g = _check_and_get_group(group_name)
    _check_rank_valid(g, dst_rank)
    if dst_rank == g.rank:
        raise RuntimeError("The dst_rank '{}' is self. Consider doing a "
                           "GPU-to-GPU memcpy instead.".format(dst_rank))
    opts = types.SendOptions()
    opts.dst_rank = dst_rank
    opts.dst_gpu_index = dst_gpu_index
    g.send([tensor], opts)
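

# A minimal usage sketch for send_multigpu, kept as a comment so the module
# stays importable. Assumptions for illustration only: a collective group
# named "send_recv" with an NCCL backend has already been created across the
# participating Ray actors, and the peer rank posts a matching multi-GPU
# receive for the same shape and dtype.
#
#     import cupy as cp
#     import ray.util.collective as col
#
#     # On rank 0: allocate a CuPy tensor on one of this process's GPUs and
#     # send it to GPU index 0 of the process holding rank 1.
#     with cp.cuda.Device(1):
#         buf = cp.ones((4,), dtype=cp.float32)
#     col.send_multigpu(buf, dst_rank=1, dst_gpu_index=0,
#                       group_name="send_recv")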
def send(tensor, dst_rank: int, group_name: str = "default"):
    """Send a tensor to a remote process synchronously.

    Args:
        tensor: the tensor to send.
        dst_rank (int): the rank of the destination process.
        group_name (str): the name of the collective group.

    Returns:
        None
    """
    _check_single_tensor_input(tensor)
    g = _check_and_get_group(group_name)
    _check_rank_valid(g, dst_rank)
    if dst_rank == g.rank:
        raise RuntimeError(
            "The destination rank '{}' is self.".format(dst_rank))
    opts = types.SendOptions()
    opts.dst_rank = dst_rank
    g.send([tensor], opts)
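

# A minimal usage sketch for send, kept as a comment so the module stays
# importable. Assumptions for illustration only: a collective group named
# "send_recv" already exists, the backend supports the tensor type used, and
# the peer rank posts a matching receive for the same shape and dtype.
#
#     import torch
#     import ray.util.collective as col
#
#     # On rank 0: send a tensor to rank 1.
#     payload = torch.ones(4)
#     col.send(payload, dst_rank=1, group_name="send_recv")
#
#     # On rank 1: receive into a pre-allocated buffer of the same shape.
#     buf = torch.zeros(4)
#     col.recv(buf, src_rank=0, group_name="send_recv")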