def _create_c10d_store(hostname, port, rank, world_size, timeout) -> Store: """ Smartly creates a c10d Store object on ``rank`` based on whether we need to re-use agent store. The TCPStore server is assumed to be hosted on ``hostname:port``. If ``torchelastic_use_agent_store()`` is ``True``, then it is assumed that the agent leader (node rank 0) hosts the TCPStore server (for which the endpoint is specified by the given ``hostname:port``). Hence ALL ranks will create and return a TCPStore client (e.g. ``start_daemon=False``). If ``torchelastic_use_agent_store()`` is ``False``, then rank 0 will host the TCPStore (with multi-tenancy) and it is assumed that rank 0's hostname and port are correctly passed via ``hostname`` and ``port``. All non-zero ranks will create and return a TCPStore client. """ # check if port is uint16_t if not 0 <= port < 2**16: raise ValueError( f"port must have value from 0 to 65535 but was {port}.") if _torchelastic_use_agent_store(): attempt = os.environ["TORCHELASTIC_RESTART_COUNT"] tcp_store = TCPStore(hostname, port, world_size, False, timeout) return PrefixStore(f"/worker/attempt_{attempt}", tcp_store) else: start_daemon = rank == 0 return TCPStore(hostname, port, world_size, start_daemon, timeout, multi_tenant=True)
def setup_tcpstore(rank, world_size, rdzv_version, rdzv_impl):
    """
    Create the TCPStore shared by the workers of one rendezvous version.

    Rank 0 takes the port of a socket obtained from ``_get_socket_with_port()``,
    publishes ``"<host>:<port>"`` through ``rdzv_impl.store_extra_data`` under
    the key ``"tcpstore_server"`` and then starts the TCPStore server on that
    endpoint. Every other rank reads the endpoint back with
    ``rdzv_impl.load_extra_data`` and connects to it as a client.

    Args:
        rank: rank of the calling worker; rank 0 hosts the store.
        world_size: number of workers, forwarded to ``TCPStore``.
        rdzv_version: rendezvous version the endpoint is stored under.
        rdzv_impl: object providing ``store_extra_data`` / ``load_extra_data``.

    Returns:
        The ``TCPStore`` (server on rank 0, client elsewhere).

    Raises:
        ValueError: on a non-zero rank, if the published endpoint is not of
            the form ``host:port``.
    """
    if rank == 0:
        import socket

        # FIXME: ideally, TCPStore should have an API that
        # accepts a pre-constructed socket.
        sock = _get_socket_with_port()
        try:
            host = socket.gethostname()
            port = sock.getsockname()[1]

            rdzv_impl.store_extra_data(
                rdzv_version, key="tcpstore_server", value=f"{host}:{port}"
            )
            log.info(f"Setting up TCPStore server on {host}:{port}")
        finally:
            # The placeholder socket must be released before TCPStore can bind
            # the same port (and must not leak if publishing fails).
            # FIXME: get rid of race-condition by improving TCPStore API
            sock.close()

        store = TCPStore(host, port, world_size, True)
        log.info(f"TCPStore server initialized on {host}:{port}")
        return store

    import re

    hostport = rdzv_impl.load_extra_data(rdzv_version, key="tcpstore_server")
    log.info(f"Rank {rank} will connect to TCPStore server at {hostport}")

    endpoint = re.match(r"(.+):(\d+)$", hostport)
    if endpoint is None:
        # Fail with a clear message rather than an AttributeError on ``None``.
        raise ValueError(
            f"Invalid TCPStore server endpoint {hostport!r}; expected 'host:port'."
        )
    host, port = endpoint.groups()
    return TCPStore(host, int(port), world_size, False)
def _env_rendezvous_handler(url: str, timeout: timedelta = default_pg_timeout, **kwargs):
    """
    Rendezvous handler for ``env://`` URLs.

    ``rank`` and ``world_size`` are taken from the URL query string when
    present and otherwise from the ``RANK`` / ``WORLD_SIZE`` environment
    variables. The master endpoint always comes from ``MASTER_ADDR`` and
    ``MASTER_PORT``.

    Yields exactly one ``(store, rank, world_size)`` tuple. Resuming the
    generator after that raises ``RuntimeError`` because re-rendezvous is not
    supported by this method. A required environment variable that is unset
    or empty results in the error built by ``_rendezvous_error``.
    """

    def _fail(msg):
        return _rendezvous_error("env:// rendezvous: " + msg)

    def _require_env(env_var: str) -> str:
        # Unset and empty values are both treated as "not set".
        value = os.environ.get(env_var, None)
        if value:
            return value
        raise _fail("environment variable %s expected, but not set" % env_var)

    parsed = urlparse(url)
    query: Dict[str, Union[int, str]]
    # mypy doesn't allow dict() to accept List of values (#257)
    query = dict(pair.split("=") for pair in filter(None, parsed.query.split("&")))  # type: ignore[misc, arg-type]

    rank: Optional[Union[str, int]]
    world_size: Optional[Union[str, int]]
    master_port: Optional[Union[str, int]]

    # Query-string values take precedence over the environment.
    rank = int(query["rank"] if "rank" in query else _require_env("RANK"))
    world_size = int(
        query["world_size"] if "world_size" in query else _require_env("WORLD_SIZE")
    )

    master_addr = _require_env("MASTER_ADDR")
    master_port = int(_require_env("MASTER_PORT"))

    agent_store_flag = os.environ.get("TORCHELASTIC_USE_AGENT_STORE", None)

    if agent_store_flag == "True":
        # With TORCHELASTIC_USE_AGENT_STORE set, the worker is assumed to have
        # been launched by the torchelastic agent, which already runs the TCP
        # store daemon on GROUP_RANK=0. Workers therefore only connect as
        # clients (daemon=False).
        client = TCPStore(master_addr, master_port, world_size, False, timeout)
        # Each if-else condition returns due to: https://github.com/python/mypy/issues/1191
        yield (PrefixStore("/worker", client), rank, world_size)
    else:
        # Rank 0 is the one that starts the TCP store daemon.
        store = TCPStore(  # type: ignore[call-arg]
            master_addr, master_port, world_size, rank == 0, timeout, multi_tenant=True
        )
        # Each if-else condition returns due to: https://github.com/python/mypy/issues/1191
        yield (store, rank, world_size)

    # If this configuration is invalidated, there is nothing we can do about it
    raise RuntimeError("Unable to perform rerendezvous using env:// method")
def _tcp_rendezvous_handler(url: str, timeout: timedelta = default_pg_timeout, **kwargs):
    """
    Rendezvous handler for ``tcp://host:port?rank=R&world_size=N`` URLs.

    The URL must carry a port as well as the ``rank`` and ``world_size`` query
    parameters; a missing one results in the error built by
    ``_rendezvous_error``. Rank 0 starts the multi-tenant TCPStore server, all
    other ranks connect to it.

    Yields exactly one ``(store, rank, world_size)`` tuple. Resuming the
    generator after that raises ``RuntimeError`` because re-rendezvous is not
    supported by this method.
    """

    def _fail(msg):
        return _rendezvous_error("tcp:// rendezvous: " + msg)

    parsed = urlparse(url)
    if not parsed.port:
        raise _fail("port number missing")

    query: Dict[str, Union[int, str]]
    # mypy doesn't allow dict() to accept List of values (#257)
    query = dict(pair.split("=") for pair in filter(None, parsed.query.split("&")))  # type: ignore[misc, arg-type]

    # Both parameters are mandatory; check them in a fixed order.
    required = (
        ("rank", "rank parameter missing"),
        ("world_size", "world size parameter missing"),
    )
    for key, complaint in required:
        if key not in query:
            raise _fail(complaint)

    rank = int(query["rank"])
    world_size = int(query["world_size"])

    assert parsed.hostname is not None
    store = TCPStore(  # type: ignore[call-arg]
        parsed.hostname, parsed.port, world_size, rank == 0, timeout, multi_tenant=True
    )
    yield (store, rank, world_size)

    # If this configuration is invalidated, there is nothing we can do about it
    raise RuntimeError("Unable to perform rerendezvous using tcp:// method")
def test_create_backend_returns_backend_if_is_host_is_false(self) -> None:
    """Backend creation succeeds with ``is_host="false"`` when a store is already hosted."""
    endpoint = (self._expected_endpoint_host, self._expected_endpoint_port)

    # Host the store at the expected endpoint ourselves. The local is unused on
    # purpose; presumably it only keeps the server alive while the nested test
    # runs.
    hosting_store = TCPStore(*endpoint, is_master=True)  # type: ignore[call-arg] # noqa: F841

    self._params.config["is_host"] = "false"

    self.test_create_backend_returns_backend()
def test_store_methods_forward_calls_to_inner(self):
    """Smoke test: ``set`` and ``close`` on a ``_ClosableStore`` wrapping a live TCPStore do not raise."""
    backing_store = TCPStore("127.0.0.1", 0, is_master=True)
    closable = _ClosableStore(backing_store)

    key = value = "dummy"
    closable.set(key, value)

    closable.close()
def test_create_backend_returns_backend_if_is_host_is_not_specified_and_store_already_exists(
    self,
) -> None:
    """Backend creation succeeds without an ``is_host`` setting when a store is already hosted."""
    endpoint = (self._expected_endpoint_host, self._expected_endpoint_port)

    # Host the store at the expected endpoint ourselves. The local is unused on
    # purpose; presumably it only keeps the server alive while the nested test
    # runs.
    hosting_store = TCPStore(*endpoint, is_master=True)  # type: ignore[call-arg] # noqa: F841

    # Drop the setting entirely so the backend has to work it out on its own.
    del self._params.config["is_host"]

    self.test_create_backend_returns_backend()
def _init_distributed_setting(self): """Initialize the distributed library and bind the worker to GPU. Return: True if distributed library is initialized successfully. """ if self._args.distributed_impl: logger.info( 'Distributed training is enabled - model: {}, distributed implementation: {}.' .format(self._name, self._args.distributed_impl)) if self._args.distributed_impl == DistributedImpl.HOROVOD: import horovod.torch as hvd hvd.init() self._world_size = int(hvd.size()) self._local_rank = int(hvd.local_rank()) self._global_rank = int(hvd.rank()) elif self._args.distributed_impl == DistributedImpl.DDP: if os.environ.get('WORLD_SIZE') is None or os.environ.get( 'LOCAL_RANK') is None: logger.error( 'Can not find WORLD_SIZE or LOCAL_RANK in env variables - model: {},' ' distributed implementation: {}.'.format( self._name, self._args.distributed_impl)) return False # torch >= 1.9.0a0 torch.distributed.elastic is used by default port = int(os.environ['MASTER_PORT']) + 1 addr = os.environ['MASTER_ADDR'] self._global_rank = int(os.environ['RANK']) self._local_rank = int(os.environ['LOCAL_RANK']) self._world_size = int(os.environ['WORLD_SIZE']) logger.debug('ip:{},port:{},rank:{},world:{}'.format( addr, port, self._global_rank, self._world_size)) store = PrefixStore( self._name, TCPStore(addr, port, self._world_size, self._global_rank == 0, timedelta(seconds=300))) torch.distributed.init_process_group( backend=self._args.distributed_backend.value, timeout=timedelta(seconds=300), rank=self._global_rank, world_size=self._world_size, store=store) else: logger.error( 'Unsupported distributed implementation - model: {}, distributed implementation: {}.' .format(self._name, self._args.distributed_impl)) return False if self._gpu_available: torch.cuda.set_device(self._local_rank) return True
def test_store_methods_raise_error_if_store_is_closed(self):
    """Using a ``_ClosableStore`` after ``close()`` must raise ``RuntimeError``."""
    backing_store = TCPStore("127.0.0.1", 0, is_master=True)
    closable = _ClosableStore(backing_store)

    key = value = "dummy"

    # Works while the store is open ...
    closable.set(key, value)
    closable.close()

    # ... and is rejected once it has been closed.
    expected_message = r"^The store is already closed.$"
    with self.assertRaisesRegex(RuntimeError, expected_message):
        closable.set(key, value)
def next_rendezvous(self) -> Tuple[Store, int, int]:
    """
    Return ``(store, rank, world_size)`` for this handler.

    The underlying TCPStore is created on the first call and cached in
    ``self._store``; rank 0 acts as the master. Each call returns a new
    ``PrefixStore`` over it, prefixed with ``self.run_id``.
    """
    log.info("Creating TCPStore as the c10d::Store implementation")

    backing_store = self._store
    if not backing_store:
        backing_store = TCPStore(
            self.master_addr,
            self.master_port,
            self.world_size,
            self.rank == 0,
            self.timeout,
        )
        self._store = backing_store

    return PrefixStore(self.run_id, backing_store), self.rank, self.world_size
def _create_tcp_store(params: RendezvousParameters) -> TCPStore:
    """
    Create the TCPStore used by the C10d rendezvous backend.

    The endpoint is parsed from ``params.endpoint`` (port 29500 when none is
    given). Whether this process hosts the store comes from the ``is_host``
    setting when present; otherwise it is inferred with
    ``_matches_machine_hostname``. The store timeout is the ``read_timeout``
    setting in seconds (60 by default).

    Raises:
        ValueError: if ``read_timeout`` is not positive.
        RendezvousConnectionError: if the store cannot be instantiated.
    """
    host, port = _parse_rendezvous_endpoint(params.endpoint, default_port=29500)

    # An explicit setting always wins; without one, guess from the endpoint's
    # host whether this machine is the one that should host the store.
    explicit_is_host = params.get_as_bool("is_host")
    if explicit_is_host is None:
        hosts_store = _matches_machine_hostname(host)
    else:
        hosts_store = explicit_is_host

    # Read timeout of the store, in seconds.
    read_timeout = cast(int, params.get_as_int("read_timeout", 60))
    if read_timeout <= 0:
        raise ValueError("The read timeout must be a positive integer.")

    # At most two attempts: the second one only happens when a guessed
    # "we are the host" turned out to be wrong (see the except clause).
    for _ in range(2):
        try:
            store = TCPStore(  # type: ignore[call-arg]
                host, port, is_master=hosts_store, timeout=timedelta(seconds=read_timeout)
            )

            if hosts_store:
                log.info(
                    f"Process {os.getpid()} hosts the TCP store for the C10d rendezvous backend."
                )

            break
        except (ValueError, RuntimeError) as exc:
            # Only a heuristically inferred "host" role gets a second chance as
            # a client: several processes of the same rendezvous may run on
            # this machine and just one of them ends up hosting the store.
            # A failed client attempt or an explicit setting is final.
            may_retry_as_client = hosts_store and explicit_is_host is None
            if not may_retry_as_client:
                raise RendezvousConnectionError(
                    "The connection to the C10d store has failed. See inner exception for details."
                ) from exc

            hosts_store = False

    return store
def setUpClass(cls) -> None:
    """Create one master TCPStore on ``localhost`` and keep it on the class as ``_store``."""
    # Port 0 presumably asks for an OS-assigned free port - confirm against TCPStore docs.
    host, port = "localhost", 0
    cls._store = TCPStore(host, port, is_master=True)  # type: ignore[call-arg]
def setUpClass(cls) -> None:
    """Create one master TCPStore on ``127.0.0.1`` and keep it on the class as ``_store``."""
    # Port 0 presumably asks for an OS-assigned free port - confirm against TCPStore docs.
    host, port = "127.0.0.1", 0
    cls._store = TCPStore(host, port, is_master=True)  # type: ignore[call-arg]