def run(rank, world_size):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '29500'

    options = rpc.ProcessGroupRpcBackendOptions(
        num_send_recv_threads=16,
        rpc_timeout=0  # infinite timeout
    )

    if rank != 0:
        rpc.init_rpc(
            f"trainer{rank}",
            rank=rank,
            world_size=world_size,
            rpc_backend_options=options
        )
        # trainer passively waiting for ps to kick off training iterations
    else:
        rpc.init_rpc(
            "ps",
            rank=rank,
            world_size=world_size,
            rpc_backend_options=options
        )
        run_ps([f"trainer{r}" for r in range(1, world_size)])

    # block until all rpcs finish
    rpc.shutdown()

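A minimal launcher sketch (not part of the snippet above) showing how a function like run is typically spawned across processes; the world_size value and the __main__ guard are illustrative assumptions.

import torch.multiprocessing as mp

if __name__ == "__main__":
    # Assumed world size for illustration: one parameter server + three trainers.
    world_size = 4
    # mp.spawn passes the process index as the first argument to run().
    mp.spawn(run, args=(world_size,), nprocs=world_size, join=True)
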
def bench_mpi(args):
    guess_rank = int(os.environ["OMPI_COMM_WORLD_RANK"])
    os.environ["UCX_NET_DEVICES"] = best_device_map[guess_rank]

    torch.distributed.init_process_group(backend="mpi")

    os.environ["MASTER_ADDR"] = args.host
    os.environ["MASTER_PORT"] = "10639"
    if args.socket_name:
        os.environ["GLOO_SOCKET_IFNAME"] = args.socket_name
        os.environ["TP_SOCKET_IFNAME"] = args.socket_name

    init_method = f"tcp://{os.environ['MASTER_ADDR']}:{os.environ['MASTER_PORT']}"

    rank = torch.distributed.get_rank()
    world_size = torch.distributed.get_world_size()
    torch.cuda.set_device(rank % torch.cuda.device_count())

    rpc.init_rpc(
        f"Test{rank}",
        rank=rank,
        world_size=world_size,
        backend=rpc.BackendType.PROCESS_GROUP,
        rpc_backend_options=rpc.ProcessGroupRpcBackendOptions(
            rpc_timeout=20, init_method=init_method
        ),
    )

    initialize_model_parallel(1, world_size)
    init_random_seed(0)

    run_mp_worker(args, world_size)

    rpc.shutdown()
    torch.distributed.destroy_process_group()

def test_init_pg_and_rpc_with_same_socket(self):
    addr = DEFAULT_HOSTNAME
    port = common.find_free_port()

    os.environ["MASTER_ADDR"] = addr
    os.environ["MASTER_PORT"] = str(port)

    # We internally use a multi-tenant TCP store. Both PG and RPC should successfully
    # initialize even when using the same socket address.

    dist.init_process_group(
        backend="gloo",
        init_method="env://",
        rank=0,
        world_size=1,
    )

    backend_opts = rpc.ProcessGroupRpcBackendOptions(
        init_method=f"tcp://{addr}:{port}"
    )
    rpc.init_rpc(
        name="worker0",
        rank=0,
        world_size=1,
        rpc_backend_options=backend_opts,
    )

    rpc.shutdown()

def benchmark_multiprocess(rank, world_size, args):
    init_method_pgroup = "tcp://localhost:{}".format(MPI_PORT)
    # TODO(anj-s): Add regression benchmarks for nccl as well.
    torch.distributed.init_process_group(
        backend="gloo", rank=rank, world_size=world_size, init_method=init_method_pgroup
    )

    torch.cuda.set_device(rank % torch.cuda.device_count())
    # TODO(anj-s): Move to TensorPipeRpcBackendOptions.
    rpc.init_rpc(
        f"Test{rank}",
        rank=rank,
        world_size=world_size,
        backend=rpc.BackendType.PROCESS_GROUP,
        rpc_backend_options=rpc.ProcessGroupRpcBackendOptions(
            rpc_timeout=20, init_method="tcp://localhost:{}".format(RPC_PORT)
        ),
    )
    initialize_model_parallel(1, world_size)
    init_random_seed(0)
    run_mp_worker(args, world_size)

    rpc.shutdown()
    torch.distributed.destroy_process_group()

def run(i):
    global rank
    rank = i
    if i == world_size - 1:
        time.sleep(4)
        print("Process {} delayed start.".format(i))
    rpc.init_rpc("Rank" + str(i), rank=i, world_size=world_size,
                 rpc_backend_options=rpc.ProcessGroupRpcBackendOptions(
                     init_method="env://",
                     rpc_timeout=rpc.timedelta(seconds=2),
                     num_send_recv_threads=4
                 ))
    if i == 0:
        time.sleep(2)
        print("Process 0 exit.")
        exit(-1)
    t = time.time()
    reqs = []
    for j in range(messages):
        for r in range(world_size):
            reqs.append(rpc.rpc_async("Rank{}".format(r), test, args=()))
        for req, idx in zip(reqs, range(world_size)):
            try:
                print("{} Received from {} : {}".format(rank, idx, req.wait()))
            except RuntimeError:
                print("An error occurred while {} receiving results from {}"
                      .format(rank, idx))
        reqs.clear()
    print(time.time() - t)
    rpc.shutdown(graceful=False)

def __init__(self,
             name: str,
             rank: int = -1,
             world_size: int = None,
             init_method: str = "tcp://localhost:9100",
             rpc_timeout: float = 60,
             rpc_threads: int = 8):
    """
    Args:
        name: A unique name to identify the current process.
        rank: A unique rank of the current process. You do not need to specify
            it if you are using `torch.distributed.launch` or `torchelastic`.
        world_size: Size of the distributed world. You do not need to specify
            it if you are using `torch.distributed.launch` or `torchelastic`.
        init_method: Backend initialization method.
        rpc_timeout: Global rpc call timeout in seconds.
        rpc_threads: Rpc recv/send thread num.
    """
    self.world_size = world_size
    self.rank = rank
    self.name = name
    self.groups = {}
    self.group_create_signals = {}

    # "<rank-number>" is used as the unique name.
    rpc.init_rpc(self.name,
                 rank=rank,
                 world_size=world_size,
                 rpc_backend_options=rpc.ProcessGroupRpcBackendOptions(
                     init_method=init_method,
                     num_send_recv_threads=rpc_threads,
                     rpc_timeout=timedelta(seconds=rpc_timeout)
                 ))

    # get rank-name mapping
    self.rank_name_map = {}
    for wi in rpc._get_current_rpc_agent().get_worker_infos():
        self.rank_name_map[wi.id] = wi.name

    # Start role dispatching.
    self.started = True
    self.rpc_timeout = rpc_timeout

    # map for paired values and registered services
    self.value_lut = {}
    self.service_lut = {}
    self.lut_lock = Lock()
    self.lut_manager = self.rank_name_map[0]

def run(i):
    rpc.init_rpc("Rank" + str(i), rank=i, world_size=world_size,
                 rpc_backend_options=rpc.ProcessGroupRpcBackendOptions(
                     init_method="env://",
                     rpc_timeout=rpc.timedelta(seconds=60),
                     num_send_recv_threads=4
                 ))
    t = time.time()
    reqs = []
    for j in range(messages):
        for r in range(world_size):
            reqs.append(rpc.rpc_async("Rank{}".format(r), test, args=()))
    for req in reqs:
        req.wait()
    print(time.time() - t)
    rpc.shutdown()

def bench_mpi(args):
    guess_rank = int(os.environ["OMPI_COMM_WORLD_RANK"])
    world_size = int(os.environ["OMPI_COMM_WORLD_SIZE"])
    local_rank = int(os.environ["OMPI_COMM_WORLD_LOCAL_RANK"])
    os.environ["UCX_NET_DEVICES"] = best_device_map[local_rank]

    os.environ["MASTER_ADDR"] = args.host
    os.environ["MASTER_PORT"] = "10638"
    if args.socket_name:
        os.environ["GLOO_SOCKET_IFNAME"] = args.socket_name
        os.environ["TP_SOCKET_IFNAME"] = args.socket_name

    torch.distributed.init_process_group(backend="gloo", rank=guess_rank, world_size=world_size)

    os.environ["MASTER_ADDR"] = args.host
    os.environ["MASTER_PORT"] = "10639"
    init_method = f"tcp://{os.environ['MASTER_ADDR']}:{os.environ['MASTER_PORT']}"

    rank = torch.distributed.get_rank()
    world_size = torch.distributed.get_world_size()

    rpc.init_rpc(
        f"Test{rank}",
        rank=rank,
        world_size=world_size,
        backend=rpc.BackendType.PROCESS_GROUP,
        rpc_backend_options=rpc.ProcessGroupRpcBackendOptions(
            rpc_timeout=20, init_method=init_method
        ),
    )

    backends = {"model_parallel_backend": "nccl", "pipeline_backend": "mpi", "ddp_backend": "nccl"}
    initialize_model_parallel(1, world_size, **backends)
    init_random_seed(0)

    run_mp_worker(args, world_size)

    rpc.shutdown()
    torch.distributed.destroy_process_group()

def run_worker(rank, world_size, num_split):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '29500'
    options = rpc.ProcessGroupRpcBackendOptions(num_send_recv_threads=256)

    if rank == 0:
        rpc.init_rpc(
            "master",
            rank=rank,
            world_size=world_size,
            rpc_backend_options=options
        )
        run_master(num_split)
    else:
        rpc.init_rpc(
            f"worker{rank}",
            rank=rank,
            world_size=world_size,
            rpc_backend_options=options
        )
        pass

    # block until all rpcs finish
    rpc.shutdown()

def _init_torch_rpc_pg(
    self,
    master_addr,
    master_port,
    worker_idx,
    worker_num,
):
    # https://github.com/pytorch/pytorch/issues/55615
    # [BC-Breaking][RFC] Retire ProcessGroup Backend for RPC #55615
    str_init_method = "tcp://" + str(master_addr) + ":" + str(master_port)
    logging.info("str_init_method = {}".format(str_init_method))
    options = rpc.ProcessGroupRpcBackendOptions(
        num_send_recv_threads=4, init_method=str_init_method
    )
    rpc.init_rpc(
        WORKER.format(worker_idx),
        backend=dist.rpc.BackendType.PROCESS_GROUP,
        rank=worker_idx,
        world_size=worker_num,
        rpc_backend_options=options,
    )
    # torch.distributed.rpc.init_rpc('worker', rank=self.global_rank, world_size=self.world_size)
    logging.info("_init_rpc_with_process_group finished.")

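Since pytorch/pytorch#55615 retires the ProcessGroup RPC backend, here is a hedged sketch of a TensorPipe-based equivalent of the call above; the method name _init_torch_rpc_tp and the thread count are illustrative assumptions, not part of the original code.

def _init_torch_rpc_tp(self, master_addr, master_port, worker_idx, worker_num):
    # Sketch only: mirrors _init_torch_rpc_pg but uses the TensorPipe backend,
    # which is the default backend for rpc.init_rpc in recent PyTorch releases.
    str_init_method = "tcp://" + str(master_addr) + ":" + str(master_port)
    options = rpc.TensorPipeRpcBackendOptions(
        num_worker_threads=4,  # TensorPipe's analogue of num_send_recv_threads (assumed value)
        init_method=str_init_method,
    )
    rpc.init_rpc(
        WORKER.format(worker_idx),
        backend=rpc.BackendType.TENSORPIPE,
        rank=worker_idx,
        world_size=worker_num,
        rpc_backend_options=options,
    )
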
def dist_init(rank: int, world_size: int, filename: str, filename_rpc: str = "") -> bool:
    """
    Initialize torch distributed, based on a temporary file shared across ranks,
    which makes it possible for unrelated tests to be run concurrently.

    Return false if not enough GPUs present in the system.

    .. warning: This limits the usecase to all ranks being on the same node
    """

    try:
        torch.distributed.rpc.shutdown()
    except Exception:
        pass

    print(f"dist init r={rank}, world={world_size}")

    os.environ["WORLD_SIZE"] = str(world_size)
    os.environ["RANK"] = str(rank)
    url = "file://" + filename
    url_rpc = "file://" + filename_rpc

    if torch_version() >= (1, 6, 0):
        backend = "nccl" if torch.cuda.is_available() else "gloo"
        if backend == "nccl" and torch.cuda.device_count() < world_size:
            logging.warning("Requested world size cannot be reached on this machine, not enough GPUs")
            return False

        torch.distributed.init_process_group(backend=backend, rank=rank, world_size=world_size, init_method=url)

        rpc.init_rpc(
            f"Test{rank}",
            rank=rank,
            world_size=world_size,
            backend=rpc.BackendType.TENSORPIPE,
            rpc_backend_options=rpc.TensorPipeRpcBackendOptions(init_method=url_rpc),
        )

    else:
        if world_size > 1:
            # TensorPipe is not available in Torch 1.5
            rpc.init_rpc(
                name=f"Test{rank}",
                rank=rank,
                world_size=world_size,
                rpc_backend_options=rpc.ProcessGroupRpcBackendOptions(init_method=url_rpc),
            )
        elif torch.cuda.is_available():
            torch.distributed.init_process_group(backend="nccl", rank=rank, world_size=world_size, init_method=url)
        else:
            return False

    if torch.cuda.is_available() and torch.cuda.device_count():
        torch.cuda.set_device(rank % torch.cuda.device_count())

    return True

def dist_init(rank: int, world_size: int, filename: str, filename_rpc: str = "") -> bool:
    """
    Initialize torch distributed, based on a temporary file shared across ranks,
    which makes it possible for unrelated tests to be run concurrently.

    Return false if not enough GPUs present in the system.

    .. warning: This limits the usecase to all ranks being on the same node
    """

    try:
        torch.distributed.rpc.shutdown()
    except Exception:
        pass

    print(f"dist init r={rank}, world={world_size}")

    os.environ["WORLD_SIZE"] = str(world_size)
    os.environ["RANK"] = str(rank)
    url = "file://" + filename
    url_rpc = "file://" + filename_rpc

    if torch_version() >= (1, 6, 0):
        backend = "nccl" if torch.cuda.is_available() else "gloo"
        if backend == "nccl" and torch.cuda.device_count() < world_size:
            logging.warning("Requested world size cannot be reached on this machine, not enough GPUs")
            return False

        torch.distributed.init_process_group(backend=backend, rank=rank, world_size=world_size, init_method=url)

        tp_options = {"init_method": url_rpc}
        # Workaround for bug in torch v1.8.0. Should be fixed in v1.8.1
        if torch_version() == (1, 8, 0):
            if torch.cuda.is_available():
                # Workaround for https://github.com/pytorch/pytorch/issues/53844
                tp_options["_transports"] = ["ibv", "uv"]  # type: ignore
            else:
                # Workaround for https://github.com/pytorch/pytorch/issues/54266
                tp_options["_channels"] = ["mpt_uv", "basic", "cuda_ipc", "cuda_gdr", "cuda_xth", "cuda_basic"]  # type: ignore

        rpc.init_rpc(
            f"Test{rank}",
            rank=rank,
            world_size=world_size,
            backend=rpc.BackendType.TENSORPIPE,
            rpc_backend_options=rpc.TensorPipeRpcBackendOptions(**tp_options),
        )

    else:
        if world_size > 1:
            # TensorPipe is not available in Torch 1.5
            rpc.init_rpc(
                name=f"Test{rank}",
                rank=rank,
                world_size=world_size,
                rpc_backend_options=rpc.ProcessGroupRpcBackendOptions(init_method=url_rpc),
            )
        elif torch.cuda.is_available():
            torch.distributed.init_process_group(backend="nccl", rank=rank, world_size=world_size, init_method=url)
        else:
            return False

    if torch.cuda.is_available() and torch.cuda.device_count():
        torch.cuda.set_device(rank % torch.cuda.device_count())

    return True

def __init__(self,
             world_size: int,
             current_rank: int,
             roles: Dict[str, Tuple[type, int]],
             init_method: str = "tcp://localhost:9100",
             rpc_timeout: int = 60,
             rpc_threads: int = 4,
             rpc_role_dispatcher: Any = None):
    """
    Args:
        world_size: Size of the distributed world.
        current_rank: A unique rank of the current process.
        roles: A list of roles executed by all processes.
        init_method: Backend initialization method.
        rpc_timeout: Global rpc call timeout in seconds.
        rpc_threads: Rpc recv/send thread num.
        rpc_role_dispatcher: Rpc role dispatcher, by default it is
            :class:`~machin.parallel.distributed.RoleDispatcherElection`
            and uses :class:`machin.parallel.distributed.ElectionGroupStableRpc`
            as its internal election implementation.
    """
    self.world_size = world_size
    self.role_dict = roles
    # Maps role Tuple[str, int] to threads
    self.role_threads = {}
    self.current_rank = current_rank
    self.ranks = [i for i in range(world_size)]
    self.real_names = ["{}".format(i) for i in range(world_size)]
    self.groups = {}
    if rpc_role_dispatcher is not None:
        self.rpc_role_dispatcher = rpc_role_dispatcher
    else:
        role_names = list(roles.keys())
        role_nums = [val[1] for val in roles.values()]
        self.rpc_role_dispatcher = RoleDispatcherElection(
            current_rank, world_size, role_names, role_nums,
            ElectionGroupStableRpc(name="global",
                                   member_ranks=self.ranks,
                                   rank=current_rank,
                                   timeout=rpc_timeout)
        )

    # "<rank-number>" is used as the unique name.
    rpc.init_rpc("{}".format(self.current_rank),
                 rank=current_rank,
                 world_size=world_size,
                 rpc_backend_options=rpc.ProcessGroupRpcBackendOptions(
                     init_method=init_method,
                     num_send_recv_threads=rpc_threads,
                     rpc_timeout=timedelta(seconds=rpc_timeout)
                 ))

    # Start role dispatching.
    self.rpc_role_dispatcher.start()
    while True:
        self.rpc_role_dispatcher.get_role_update_cond().wait()
        for role in self.rpc_role_dispatcher.get_roles():
            if role not in self.role_threads:
                role_class = self.role_dict[role[0]][0]
                role_thread = Thread(target=_exec_role, args=(role_class(role[1]),))
                role_thread.start()
                self.role_threads[role] = role_thread