def test_get_or_default(self): params = RendezvousParameters( backend="foobar", endpoint="localhost", run_id="1234", min_nodes=1, max_nodes=1, timeout1=None, timeout2=10, ) self.assertEqual(30, params.get("timeout1", 30)) self.assertEqual(10, params.get("timeout2", 20)) self.assertEqual(60, params.get("timeout3", 60))
def test_get(self): params = RendezvousParameters( backend="foobar", endpoint="localhost", run_id="1234", min_nodes=1, max_nodes=1, timeout1=None, timeout2=10, ) with self.assertRaises(KeyError): params.get("timeout3") with self.assertRaises(KeyError): params.get("timeout1") self.assertEqual(10, params.get("timeout2"))
def create_rdzv_handler(rdzv_params: RendezvousParameters): """ Usage: :: rdzv_params = RendezvousParameters( backend="etcd", endpoint="192.168.0.42:2379", run_id="123", min_nodes=4, max_nodes=8, timeout=300, last_call_timeout=30, etcd_prefix="custom_prefix", protocol="https", cacert="/etc/kubernetes/certs/ca.crt", cert="/etc/kubernetes/certs/client.crt", key="/etc/kubernetes/certs/client.key") # -- or -- rdzv_params = RendezvousParameters( backend="etcd", endpoint="192.168.0.42:2379", run_id="123", min_nodes=4, max_nodes=8) etcd_rdzv_handler = create_etcd_rendezvous_handler(rdzv_params) Where: run_id - unique id for this training job instance, min_nodes - min number of workers expected to join the rendezvous, max_nodes - max number of workers allowed to join the rendezvous, defaults to min_workers is not specified. timeout - total timeout within which next_rendezvous is expected to succeed; a RendezvousTimeoutException is raised otherwise; Defaults is 600 (10 minutes). last_call_timeout - additional wait amount ("last call") after min number of workers has been reached. Defaults to 30 seconds. etcd_prefix - path prefix (from etcd root), inside which all etcd nodes will be created. Default is "/torchelastic/p2p". protocol - http (default) or https to access etcd. cacert - CA cert to access etcd, only makes sense with https. cert - client cert to access etcd, only makes sense with https. key - client key to access etcd, only makes sense with https. """ import re # Etcd endpoints. (Current url format only allows a single host) endpoint = rdzv_params.endpoint match = re.match(r"(.+):(\d+)$", endpoint) # check if port was provided if match: etcd_endpoints = ((match.group(1), int(match.group(2))), ) else: # Use default etcd port etcd_endpoints = ((endpoint, 2379), ) # Run ID value -> unique identifier of this training job instance: # typically a job_id or name assigned by the scheduler or user run_id = rdzv_params.run_id # Parse all of query parameters: etcd_prefix = rdzv_params.get("etcd_prefix", "/torchelastic/p2p") min_workers = rdzv_params.min_nodes max_workers = rdzv_params.max_nodes assert min_workers >= 1, "Min number of workers should be at least 1" assert ( max_workers >= min_workers ), "Max number of workers cannot be less than min number of workers" timeout = rdzv_params.get("timeout", CONST_DEFAULT_OVERALL_TIMEOUT) last_call_timeout = rdzv_params.get("last_call_timeout", CONST_DEFAULT_LAST_CALL_TIMEOUT) kwargs = _parse_etcd_client_params(rdzv_params.configs) # Etcd rendezvous implementation etcd_rdzv = EtcdRendezvous( endpoints=etcd_endpoints, prefix=etcd_prefix, run_id=run_id, num_min_workers=min_workers, num_max_workers=max_workers, timeout=timeout, last_call_timeout=last_call_timeout, **kwargs, ) return EtcdRendezvousHandler(rdzv_impl=etcd_rdzv)
def create_rdzv_handler(params: RendezvousParameters) -> RendezvousHandler: """ Usage: :: rdzv_params = RendezvousParameters( backend="etcd", endpoint="192.168.0.42:2379", run_id="123", min_nodes=4, max_nodes=8, timeout=300, last_call_timeout=30, etcd_prefix="custom_prefix", protocol="https", cacert="/etc/kubernetes/certs/ca.crt", cert="/etc/kubernetes/certs/client.crt", key="/etc/kubernetes/certs/client.key") # -- or -- rdzv_params = RendezvousParameters( backend="etcd", endpoint="192.168.0.42:2379", run_id="123", min_nodes=4, max_nodes=8) etcd_rdzv_handler = create_etcd_rendezvous_handler(rdzv_params) Where: run_id - unique id for this training job instance, min_nodes - min number of workers expected to join the rendezvous, max_nodes - max number of workers allowed to join the rendezvous, defaults to min_workers is not specified. timeout - total timeout within which next_rendezvous is expected to succeed; a RendezvousTimeoutException is raised otherwise; Defaults is 600 (10 minutes). last_call_timeout - additional wait amount ("last call") after min number of workers has been reached. Defaults to 30 seconds. etcd_prefix - path prefix (from etcd root), inside which all etcd nodes will be created. Default is "/torchelastic/p2p". protocol - http (default) or https to access etcd. cacert - CA cert to access etcd, only makes sense with https. cert - client cert to access etcd, only makes sense with https. key - client key to access etcd, only makes sense with https. """ client = _create_etcd_client(params) etcd_prefix = params.get("etcd_prefix", "/torchelastic/p2p") rdzv = EtcdRendezvous( client=client, prefix=etcd_prefix, run_id=params.run_id, num_min_workers=params.min_nodes, num_max_workers=params.max_nodes, timeout=params.timeout, last_call_timeout=params.last_call_timeout, ) return EtcdRendezvousHandler(rdzv_impl=rdzv)