def test_get_or_default(self):
    # Only ``timeout1`` is supplied as an extra keyword; ``timeout2`` is not.
    rdzv_kwargs = dict(
        backend="foobar",
        endpoint="localhost",
        run_id="1234",
        min_nodes=1,
        max_nodes=1,
        timeout1=10,
    )
    params = RendezvousParameters(**rdzv_kwargs)

    # A supplied key wins over the fallback passed to ``get``.
    self.assertEqual(10, params.get("timeout1", 20))
    # A key that was never supplied yields the fallback itself.
    self.assertEqual(60, params.get("timeout2", 60))
def _get_worker_spec(
    self,
    max_restarts=1,
    monitor_interval=1.0,
    role="test_trainer",
    local_world_size=8,
):
    """Return a ``WorkerSpec`` that runs ``do_nothing`` under an etcd rendezvous.

    Each call generates a new ``run_id`` from ``uuid.uuid4()`` and takes the
    endpoint from ``self._etcd_server``. The rendezvous is fixed to exactly
    one node (``min_nodes == max_nodes == 1``).
    """
    job_id = str(uuid.uuid4().int)
    handler = rdzv_registry.get_rendezvous_handler(
        RendezvousParameters(
            backend="etcd",
            endpoint=self._etcd_server.get_endpoint(),
            run_id=job_id,
            min_nodes=1,
            max_nodes=1,
        )
    )
    return WorkerSpec(
        role=role,
        local_world_size=local_world_size,
        fn=do_nothing,
        args=(),
        rdzv_handler=handler,
        max_restarts=max_restarts,
        monitor_interval=monitor_interval,
    )
def _get_worker_spec(
    self,
    fn=None,
    cmd=None,
    args=(),
    max_restarts=1,
    num_agents=1,
    monitor_interval=0.1,
    local_world_size=8,
):
    """Return a ``"test_trainer"`` ``WorkerSpec`` backed by an etcd rendezvous.

    ``num_agents`` is used as both the minimum and the maximum node count of
    the rendezvous. A new ``run_id`` is generated from ``uuid.uuid4()`` on
    every call. ``fn``, ``cmd`` and ``args`` are forwarded to ``WorkerSpec``
    unchanged.
    """
    job_id = str(uuid.uuid4().int)
    # The endpoint goes through an f-string, exactly as before, so it is
    # always handed over in its formatted string form.
    etcd_endpoint = f"{self._etcd_server.get_endpoint()}"
    handler = rdzv_registry.get_rendezvous_handler(
        RendezvousParameters(
            backend="etcd",
            endpoint=etcd_endpoint,
            run_id=job_id,
            min_nodes=num_agents,
            max_nodes=num_agents,
        )
    )
    return WorkerSpec(
        role="test_trainer",
        local_world_size=local_world_size,
        fn=fn,
        cmd=cmd,
        args=args,
        rdzv_handler=handler,
        max_restarts=max_restarts,
        monitor_interval=monitor_interval,
    )
def run_agent(run_id, etcd_host, etcd_port, start_method, worker_fn, worker_args=()):
    """Run one ``LocalElasticAgent`` that joins a two-node etcd rendezvous.

    The agent runs a single local worker (``local_world_size=1``) executing
    ``worker_fn(*worker_args)`` with up to three restarts. The return value of
    ``agent.run()`` is discarded.
    """
    handler = rdzv_registry.get_rendezvous_handler(
        RendezvousParameters(
            backend="etcd",
            endpoint=f"{etcd_host}:{etcd_port}",
            run_id=run_id,
            min_nodes=2,
            max_nodes=2,
        )
    )
    worker_spec = WorkerSpec(
        role="test_trainer",
        local_world_size=1,
        fn=worker_fn,
        args=worker_args,
        rdzv_handler=handler,
        max_restarts=3,
        monitor_interval=1,
    )
    LocalElasticAgent(worker_spec, start_method).run()
def get_worker_spec(
    self,
    node_config: Conf,
    min_nodes=1,
    max_nodes=1,
    max_restarts=0,
    monitor_interval=0.01,
):
    """Translate ``node_config`` into a ``WorkerSpec`` on the shared etcd rendezvous.

    The rendezvous uses ``self._etcd_server`` as its endpoint and
    ``self._run_id`` as its run id. Role, local world size, entrypoint, args,
    redirects and tee are taken from ``node_config``.
    """
    handler = rdzv_registry.get_rendezvous_handler(
        RendezvousParameters(
            backend="etcd",
            endpoint=self._etcd_server.get_endpoint(),
            run_id=self._run_id,
            min_nodes=min_nodes,
            max_nodes=max_nodes,
        )
    )
    worker_spec = WorkerSpec(
        role=node_config.role,
        local_world_size=node_config.local_world_size,
        entrypoint=node_config.entrypoint,
        args=node_config.args,
        rdzv_handler=handler,
        max_restarts=max_restarts,
        monitor_interval=monitor_interval,
        redirects=node_config.redirects,
        tee=node_config.tee,
    )
    return worker_spec
def test_no_factory_method_found(self):
    # Nothing is registered on this factory, so asking it for the "mock"
    # backend is expected to raise ValueError.
    empty_factory = RendezvousHandlerFactory()
    mock_params = RendezvousParameters(
        backend="mock",
        endpoint="",
        run_id="foobar",
        min_nodes=1,
        max_nodes=2,
    )

    with self.assertRaises(ValueError):
        empty_factory.create_rdzv_handler(mock_params)
def test_create_rdzv_handler(self):
    # Register a creator for the "mock" backend, then check that the factory
    # returns a handler of the expected type for matching parameters.
    rdzv_params = RendezvousParameters(
        backend="mock", endpoint="", run_id="foobar", min_nodes=1, max_nodes=2
    )

    factory = RendezvousHandlerFactory()
    factory.register("mock", create_mock_rdzv_handler)
    mock_rdzv_handler = factory.create_rdzv_handler(rdzv_params)

    # assertIsInstance reports the actual type on failure, whereas
    # assertTrue(isinstance(...)) only says "False is not true".
    self.assertIsInstance(mock_rdzv_handler, MockRendezvousHandler)
def _create_params(self) -> RendezvousParameters:
    """Build ``RendezvousParameters`` from the values stored on this instance.

    The five core fields come from the corresponding ``self._*`` attributes.
    Every entry of ``self._kwargs`` is passed through as an extra keyword.
    """
    core_fields = dict(
        backend=self._backend_name,
        endpoint=self._endpoint,
        run_id=self._run_id,
        min_nodes=self._min_num_nodes,
        max_nodes=self._max_num_nodes,
    )
    # Two separate unpackings keep the duplicate-keyword TypeError if
    # ``self._kwargs`` repeats one of the core field names.
    return RendezvousParameters(**core_fields, **self._kwargs)
def setUp(self) -> None:
    # The extra settings are given in a mixed-case / string form. The
    # expectations stored below are the lower-cased protocol and the integer
    # timeout.
    extra_config = {"protocol": "hTTp", "read_timeout": "10"}
    self._params = RendezvousParameters(
        backend="dummy_backend",
        endpoint=self._server.get_endpoint(),
        run_id="dummy_run_id",
        min_nodes=1,
        max_nodes=1,
        **extra_config,
    )

    self._expected_protocol, self._expected_read_timeout = "http", 10
def test_etcd_rdzv_basic_params(self):
    """
    Verify that a handler can be created from only the required parameters.
    """
    minimal_params = RendezvousParameters(
        backend="etcd",
        endpoint=f"{self._etcd_server.get_endpoint()}",
        run_id=f"{uuid.uuid4()}",
        min_nodes=1,
        max_nodes=1,
    )

    handler = create_rdzv_handler(minimal_params)

    self.assertIsNotNone(handler)
def setUp(self) -> None:
    # Extra settings are given as strings (with a mixed-case store type).
    # The expectations stored below are the split host/port, the TCPStore
    # class and a timedelta for the timeout.
    extra_config = {
        "is_host": "true",
        "store_type": "tCp",
        "read_timeout": "10",
    }
    self._params = RendezvousParameters(
        backend="dummy_backend",
        endpoint="127.0.0.1:29400",
        run_id="dummy_run_id",
        min_nodes=1,
        max_nodes=1,
        **extra_config,
    )

    self._expected_endpoint_host, self._expected_endpoint_port = "127.0.0.1", 29400
    self._expected_store_type = TCPStore
    self._expected_read_timeout = timedelta(seconds=10)
def test_get_backend(self):
    # A handler created for the "etcd" backend must report "etcd" from
    # get_backend().
    optional_config = {
        "timeout": 60,
        "last_call_timeout": 30,
        "protocol": "http",
    }
    job_id = str(uuid.uuid4())
    handler = create_rdzv_handler(
        RendezvousParameters(
            backend="etcd",
            endpoint=f"{self._etcd_server.get_endpoint()}",
            run_id=job_id,
            min_nodes=1,
            max_nodes=1,
            **optional_config,
        )
    )

    self.assertEqual("etcd", handler.get_backend())
def setUp(self) -> None:
    self._backend = DummyRendezvousBackend()

    # Port and timeouts are passed as strings. Only the store port and the
    # close timeout have integer expectations recorded below.
    string_config = {
        "store_port": "1234",
        "join_timeout": "50",
        "last_call_timeout": "60",
        "close_timeout": "70",
        "store_timeout": "80",
    }
    self._params = RendezvousParameters(
        backend=self._backend.name,
        endpoint="dummy_end_point",
        run_id="dummy_run_id",
        min_nodes=3,
        max_nodes=6,
        **string_config,
    )

    self._expected_store_port, self._expected_close_timeout = 1234, 70
def test_get(self):
    rdzv_kwargs = dict(
        backend="foobar",
        endpoint="localhost",
        run_id="1234",
        min_nodes=1,
        max_nodes=1,
        timeout1=None,
        timeout2=10,
    )
    params = RendezvousParameters(**rdzv_kwargs)

    # "timeout3" was never supplied and "timeout1" was supplied as None.
    # Without a default, both lookups are expected to raise KeyError.
    for absent_key in ("timeout3", "timeout1"):
        with self.assertRaises(KeyError):
            params.get(absent_key)

    self.assertEqual(10, params.get("timeout2"))
def _run_agent(
    run_id,
    etcd_host,
    etcd_port,
    min_size,
    max_size,
    func_to_run,
    args,
    local_world_size=8,
    role="test_trainer",
    output_dict=None,
    agent_barrier_timeout=300,
):
    """Run a fork-started ``LocalElasticAgent`` and optionally publish its result.

    The agent joins the etcd rendezvous at ``etcd_host:etcd_port`` with
    ``min_size``/``max_size`` as node bounds and runs ``func_to_run(*args)``
    on ``local_world_size`` workers. When ``output_dict`` is given, the pair
    ``(role, result)`` is stored in it under a new uuid-derived key.
    """
    handler = rdzv_registry.get_rendezvous_handler(
        RendezvousParameters(
            backend="etcd",
            endpoint=f"{etcd_host}:{etcd_port}",
            run_id=run_id,
            min_nodes=min_size,
            max_nodes=max_size,
        )
    )
    worker_spec = WorkerSpec(
        role=role,
        local_world_size=local_world_size,
        fn=func_to_run,
        args=args,
        rdzv_handler=handler,
        max_restarts=2,
        monitor_interval=1,
    )
    agent = LocalElasticAgent(
        worker_spec,
        start_method="fork",
        exit_barrier_timeout=agent_barrier_timeout,
    )
    run_result = agent.run()

    if output_dict is None:
        return
    output_dict[str(uuid.uuid4().int)] = (role, run_result)
def main(args=None):
    """Launch a ``LocalElasticAgent`` for the training script described by ``args``.

    Steps, in order: parse the arguments; with ``--standalone`` start a local
    ``EtcdServer`` and point the rendezvous arguments at it; default
    ``OMP_NUM_THREADS`` to 1 when it is unset and more than one process per
    node is requested; assemble the worker command line; create the
    rendezvous handler; run the agent.

    Args:
        args: forwarded unchanged to ``parse_args``.

    Raises:
        ValueError: if ``--no_python`` and ``--module`` are both given.
        SystemExit: with ``abs(failure.exit_code)`` of the lowest-ranked
            failed worker, if ``process_failure`` returns normally.
    """
    # ``None`` is passed straight through; ``parse_args`` is then expected to
    # fall back to ``sys.argv[1:]`` (the arguments after the program name).
    args = parse_args(args)

    min_nodes, max_nodes = parse_min_max_nnodes(args.nnodes)
    assert 0 < min_nodes <= max_nodes
    assert args.max_restarts >= 0

    etcd_server = None
    if args.standalone:
        etcd_server = EtcdServer()
        etcd_server.start()

    try:
        if etcd_server is not None:
            args.rdzv_backend = "etcd"
            args.rdzv_endpoint = etcd_server.get_endpoint()
            args.rdzv_id = str(uuid.uuid4())
            log.info(
                f"\n**************************************\n"
                f"Rendezvous info:\n"
                f"--rdzv_backend={args.rdzv_backend} "
                f"--rdzv_endpoint={args.rdzv_endpoint} "
                f"--rdzv_id={args.rdzv_id}\n"
                f"**************************************\n"
            )

        nproc_per_node = determine_local_world_size(args.nproc_per_node)
        if "OMP_NUM_THREADS" not in os.environ and nproc_per_node > 1:
            omp_num_threads = 1
            print(
                f"*****************************************\n"
                f"Setting OMP_NUM_THREADS environment variable for each process to be "
                f"{omp_num_threads} in default, to avoid your system being overloaded, "
                f"please further tune the variable for optimal performance in "
                f"your application as needed. \n"
                f"*****************************************"
            )
            # This env variable will be passed down to the subprocesses
            os.environ["OMP_NUM_THREADS"] = str(omp_num_threads)

        if args.no_python:
            if args.module:
                raise ValueError(
                    "Don't use both the '--no_python' flag"
                    " and the '--module' flag at the same time."
                )
            cmd = []
        else:
            cmd = [sys.executable, "-u"]
            if args.module:
                cmd.append("-m")
        cmd.append(args.training_script)
        cmd.extend(args.training_script_args)

        rdzv_parameters = RendezvousParameters(
            backend=args.rdzv_backend,
            endpoint=args.rdzv_endpoint,
            run_id=args.rdzv_id,
            min_nodes=min_nodes,
            max_nodes=max_nodes,
            **_parse_rdzv_conf(args.rdzv_conf),
        )
        rdzv_handler = rdzv_registry.get_rendezvous_handler(rdzv_parameters)
    except BaseException:
        # Setup failed before the run phase, so the ``finally`` below is never
        # reached. Stop the standalone etcd server here so it does not leak.
        if etcd_server is not None:
            etcd_server.stop()
        raise

    try:
        spec = WorkerSpec(
            role=args.role,
            local_world_size=nproc_per_node,
            cmd=cmd,
            rdzv_handler=rdzv_handler,
            max_restarts=args.max_restarts,
            monitor_interval=args.monitor_interval,
        )
        metrics.initialize_metrics()
        elastic_agent = LocalElasticAgent(spec, start_method=args.start_method)
        group_result = elastic_agent.run(spec.role)
        if group_result.is_failed():
            # Report the failure of the worker with the lowest rank.
            min_rank = min(group_result.failures.keys())
            failure = group_result.failures[min_rank]
            # Note: this line will raise an exception to indicate to the
            # scheduler process that something went wrong.
            # If any workers wrote the error file, it will be propagated
            # to the scheduler specific destination.
            process_failure(failure)
            msg = (
                f" *********************************************************************** \n"
                f" ***********************USER CODE FAILED WITH ERROR****************** \n\n"
                f" {get_failure_message(failure)} \n"
                f" ******************************************************************** \n\n"
                f" ******************************************************************** \n"
                f" "
            )
            log.warning(msg)
            # Expected (0-127), 0 - success, anything else - failure
            sys.exit(abs(failure.exit_code))
    finally:
        rdzv_handler.shutdown()
        if etcd_server is not None:
            etcd_server.stop()
        cleanup()
def main(args=None):
    """Launch a ``LocalElasticAgent`` and record agent status events.

    Steps, in order: parse the arguments; with ``--standalone`` start a local
    ``EtcdServer`` and point the rendezvous arguments at it; default
    ``OMP_NUM_THREADS`` to 1 when it is unset and more than one process per
    node is requested; assemble the worker command line; create the
    rendezvous handler; run the agent.

    Args:
        args: forwarded unchanged to ``parse_args``.

    Raises:
        ValueError: if ``--no_python`` and ``--module`` are both given.
        ChildFailedError: if the agent's run result reports failures.
    """
    # ``None`` is passed straight through; ``parse_args`` is then expected to
    # fall back to ``sys.argv[1:]`` (the arguments after the program name).
    args = parse_args(args)

    min_nodes, max_nodes = parse_min_max_nnodes(args.nnodes)
    assert 0 < min_nodes <= max_nodes
    assert args.max_restarts >= 0

    elastic_agent = None
    etcd_server = None
    if args.standalone:
        etcd_server = EtcdServer()
        etcd_server.start()

    try:
        if etcd_server is not None:
            args.rdzv_backend = "etcd"
            args.rdzv_endpoint = etcd_server.get_endpoint()
            args.rdzv_id = str(uuid.uuid4())
            log.info(
                f"\n**************************************\n"
                f"Rendezvous info:\n"
                f"--rdzv_backend={args.rdzv_backend} "
                f"--rdzv_endpoint={args.rdzv_endpoint} "
                f"--rdzv_id={args.rdzv_id}\n"
                f"**************************************\n"
            )

        nproc_per_node = determine_local_world_size(args.nproc_per_node)
        if "OMP_NUM_THREADS" not in os.environ and nproc_per_node > 1:
            omp_num_threads = 1
            print(
                f"*****************************************\n"
                f"Setting OMP_NUM_THREADS environment variable for each process to be "
                f"{omp_num_threads} in default, to avoid your system being overloaded, "
                f"please further tune the variable for optimal performance in "
                f"your application as needed. \n"
                f"*****************************************"
            )
            # This env variable will be passed down to the subprocesses
            os.environ["OMP_NUM_THREADS"] = str(omp_num_threads)

        if args.no_python:
            if args.module:
                raise ValueError(
                    "Don't use both the '--no_python' flag"
                    " and the '--module' flag at the same time."
                )
            cmd = []
        else:
            cmd = [sys.executable, "-u"]
            if args.module:
                cmd.append("-m")
        cmd.append(args.training_script)
        cmd.extend(args.training_script_args)

        rdzv_parameters = RendezvousParameters(
            backend=args.rdzv_backend,
            endpoint=args.rdzv_endpoint,
            run_id=args.rdzv_id,
            min_nodes=min_nodes,
            max_nodes=max_nodes,
            **_parse_rendezvous_config(args.rdzv_conf),
        )
        rdzv_handler = rdzv_registry.get_rendezvous_handler(rdzv_parameters)
    except BaseException:
        # Setup failed before the run phase, so the ``finally`` below is never
        # reached. Stop the standalone etcd server here so it does not leak.
        if etcd_server is not None:
            etcd_server.stop()
        raise

    try:
        # The first command element is the entrypoint; the rest are its args.
        spec = WorkerSpec(
            role=args.role,
            local_world_size=nproc_per_node,
            entrypoint=cmd[0],
            args=tuple(cmd[1:]),
            rdzv_handler=rdzv_handler,
            max_restarts=args.max_restarts,
            monitor_interval=args.monitor_interval,
            redirects=Std.from_str(args.redirects),
            tee=Std.from_str(args.tee),
        )
        metrics.initialize_metrics()
        elastic_agent = LocalElasticAgent(
            spec=spec, start_method=args.start_method, log_dir=args.log_dir
        )
        run_result = elastic_agent.run(spec.role)
        # Recorded as soon as ``run`` returns, before the result is inspected.
        events.record(elastic_agent.get_agent_status_event(WorkerState.SUCCEEDED))
        if run_result.is_failed():
            # ChildFailedError is treated specially by @record
            # if the error files for the failed children exist
            # @record will copy the first error (root cause)
            # to the error file of the launcher process
            raise ChildFailedError(
                name=args.training_script,
                failures=run_result.failures,
            )
    except ChildFailedError:
        # Worker failures propagate without an additional FAILED event.
        raise
    except Exception:
        if elastic_agent:
            events.record(elastic_agent.get_agent_status_event(WorkerState.FAILED))
        else:
            # The agent was never created; build the event from ``args``.
            events.record(_construct_event(args))
        raise
    finally:
        rdzv_handler.shutdown()
        if etcd_server is not None:
            etcd_server.stop()
def create_rdzv_handler(params: RendezvousParameters) -> RendezvousHandler:
    """
    Build an ``EtcdRendezvousHandler`` from generic rendezvous parameters.

    Usage:

    ::

        rdzv_params = RendezvousParameters(
            backend="etcd",
            endpoint="192.168.0.42:2379",
            run_id="123",
            min_nodes=4,
            max_nodes=8,
            timeout=300,
            last_call_timeout=30,
            etcd_prefix="custom_prefix",
            protocol="https",
            cacert="/etc/kubernetes/certs/ca.crt",
            cert="/etc/kubernetes/certs/client.crt",
            key="/etc/kubernetes/certs/client.key")
        # -- or --
        rdzv_params = RendezvousParameters(
            backend="etcd",
            endpoint="192.168.0.42:2379",
            run_id="123",
            min_nodes=4,
            max_nodes=8)

        etcd_rdzv_handler = create_rdzv_handler(rdzv_params)

    Where:
        run_id - unique id for this training job instance,
        min_nodes - min number of workers expected to join the rendezvous,
        max_nodes - max number of workers allowed to join the rendezvous,
            defaults to min_workers if not specified.
        timeout - total timeout within which next_rendezvous is expected to
            succeed; a RendezvousTimeoutException is raised otherwise.
            Default is 600 (10 minutes).
        last_call_timeout - additional wait amount ("last call") after
            min number of workers has been reached.
            Defaults to 30 seconds.
        etcd_prefix - path prefix (from etcd root), inside which all
            etcd nodes will be created.
            Default is "/torchelastic/p2p".
        protocol - http (default) or https to access etcd.
        cacert - CA cert to access etcd, only makes sense with https.
        cert - client cert to access etcd, only makes sense with https.
        key - client key to access etcd, only makes sense with https.
    """
    # The client is created first, then the prefix is resolved. Both timeouts
    # are read as attributes of ``params``; no default is applied here.
    rendezvous = EtcdRendezvous(
        client=_create_etcd_client(params),
        prefix=params.get("etcd_prefix", "/torchelastic/p2p"),
        run_id=params.run_id,
        num_min_workers=params.min_nodes,
        num_max_workers=params.max_nodes,
        timeout=params.timeout,
        last_call_timeout=params.last_call_timeout,
    )
    return EtcdRendezvousHandler(rdzv_impl=rendezvous)
def create_rdzv_handler(rdzv_params: RendezvousParameters):
    """
    Build an ``EtcdRendezvousHandler`` from generic rendezvous parameters.

    Usage:

    ::

        rdzv_params = RendezvousParameters(
            backend="etcd",
            endpoint="192.168.0.42:2379",
            run_id="123",
            min_nodes=4,
            max_nodes=8,
            timeout=300,
            last_call_timeout=30,
            etcd_prefix="custom_prefix",
            protocol="https",
            cacert="/etc/kubernetes/certs/ca.crt",
            cert="/etc/kubernetes/certs/client.crt",
            key="/etc/kubernetes/certs/client.key")
        # -- or --
        rdzv_params = RendezvousParameters(
            backend="etcd",
            endpoint="192.168.0.42:2379",
            run_id="123",
            min_nodes=4,
            max_nodes=8)

        etcd_rdzv_handler = create_rdzv_handler(rdzv_params)

    Where:
        run_id - unique id for this training job instance,
        min_nodes - min number of workers expected to join the rendezvous,
        max_nodes - max number of workers allowed to join the rendezvous,
            defaults to min_workers if not specified.
        timeout - total timeout within which next_rendezvous is expected to
            succeed; a RendezvousTimeoutException is raised otherwise.
            Default is 600 (10 minutes).
        last_call_timeout - additional wait amount ("last call") after
            min number of workers has been reached.
            Defaults to 30 seconds.
        etcd_prefix - path prefix (from etcd root), inside which all
            etcd nodes will be created.
            Default is "/torchelastic/p2p".
        protocol - http (default) or https to access etcd.
        cacert - CA cert to access etcd, only makes sense with https.
        cert - client cert to access etcd, only makes sense with https.
        key - client key to access etcd, only makes sense with https.
    """
    import re

    # Etcd endpoints. (Current url format only allows a single host)
    endpoint = rdzv_params.endpoint
    host_and_port = re.match(r"(.+):(\d+)$", endpoint)
    if host_and_port is None:
        # No trailing ":<digits>" in the endpoint: use the default etcd port.
        etcd_endpoints = ((endpoint, 2379),)
    else:
        host, port = host_and_port.groups()
        etcd_endpoints = ((host, int(port)),)

    etcd_prefix = rdzv_params.get("etcd_prefix", "/torchelastic/p2p")

    min_workers, max_workers = rdzv_params.min_nodes, rdzv_params.max_nodes
    assert min_workers >= 1, "Min number of workers should be at least 1"
    assert (
        max_workers >= min_workers
    ), "Max number of workers cannot be less than min number of workers"

    overall_timeout = rdzv_params.get("timeout", CONST_DEFAULT_OVERALL_TIMEOUT)
    final_call_timeout = rdzv_params.get(
        "last_call_timeout", CONST_DEFAULT_LAST_CALL_TIMEOUT
    )
    client_kwargs = _parse_etcd_client_params(rdzv_params.configs)

    # Run ID value -> unique identifier of this training job instance:
    # typically a job_id or name assigned by the scheduler or user
    rendezvous = EtcdRendezvous(
        endpoints=etcd_endpoints,
        prefix=etcd_prefix,
        run_id=rdzv_params.run_id,
        num_min_workers=min_workers,
        num_max_workers=max_workers,
        timeout=overall_timeout,
        last_call_timeout=final_call_timeout,
        **client_kwargs,
    )
    return EtcdRendezvousHandler(rdzv_impl=rendezvous)
def main(args=None):
    """Launch a ``LocalElasticAgent`` whose workers run ``wrapper_fn``.

    Steps, in order: parse the arguments; with ``--standalone`` start a local
    ``EtcdServer`` and point the rendezvous arguments at it; choose an
    ``OMP_NUM_THREADS`` value of 1 when the variable is unset and more than
    one process per node is requested; assemble the worker command line;
    create the rendezvous handler; run the agent. The chosen thread count
    (or ``None``) and the command are handed to ``wrapper_fn`` as its
    arguments.

    Args:
        args: forwarded unchanged to ``parse_args``.

    Raises:
        ValueError: if ``--no_python`` and ``--module`` are both given.
    """
    # ``None`` is passed straight through; ``parse_args`` is then expected to
    # fall back to ``sys.argv[1:]`` (the arguments after the program name).
    args = parse_args(args)

    min_nodes, max_nodes = parse_min_max_nnodes(args.nnodes)
    assert 0 < min_nodes <= max_nodes
    assert args.max_restarts >= 0

    etcd_server = None
    if args.standalone:
        etcd_server = EtcdServer()
        etcd_server.start()

    try:
        if etcd_server is not None:
            args.rdzv_backend = "etcd"
            args.rdzv_endpoint = etcd_server.get_endpoint()
            args.rdzv_id = str(uuid.uuid4())
            log.info(
                f"\n**************************************\n"
                f"Rendezvous info:\n"
                f"--rdzv_backend={args.rdzv_backend} "
                f"--rdzv_endpoint={args.rdzv_endpoint} "
                f"--rdzv_id={args.rdzv_id}\n"
                f"**************************************\n"
            )

        nproc_per_node = determine_local_world_size(args.nproc_per_node)
        # Stays ``None`` unless a value is chosen below. Unlike the environment
        # variable itself, it is passed on through the worker arguments.
        omp_num_threads = None
        if "OMP_NUM_THREADS" not in os.environ and nproc_per_node > 1:
            omp_num_threads = 1
            print(
                f"*****************************************\n"
                f"Setting OMP_NUM_THREADS environment variable for each process to be "
                f"{omp_num_threads} in default, to avoid your system being overloaded, "
                f"please further tune the variable for optimal performance in "
                f"your application as needed. \n"
                f"*****************************************"
            )

        if args.no_python:
            if args.module:
                raise ValueError(
                    "Don't use both the '--no_python' flag"
                    " and the '--module' flag at the same time."
                )
            cmd = []
        else:
            cmd = [sys.executable, "-u"]
            if args.module:
                cmd.append("-m")
        cmd.append(args.training_script)
        cmd.extend(args.training_script_args)

        rdzv_parameters = RendezvousParameters(
            backend=args.rdzv_backend,
            endpoint=args.rdzv_endpoint,
            run_id=args.rdzv_id,
            min_nodes=min_nodes,
            max_nodes=max_nodes,
            **_parse_rdzv_conf(args.rdzv_conf),
        )
        rdzv_handler = rdzv_registry.get_rendezvous_handler(rdzv_parameters)
    except BaseException:
        # Setup failed before the run phase, so the ``finally`` below is never
        # reached. Stop the standalone etcd server here so it does not leak.
        if etcd_server is not None:
            etcd_server.stop()
        raise

    try:
        spec = WorkerSpec(
            role=args.role,
            local_world_size=nproc_per_node,
            fn=wrapper_fn,
            args=(omp_num_threads, cmd),
            rdzv_handler=rdzv_handler,
            max_restarts=args.max_restarts,
            monitor_interval=args.monitor_interval,
        )
        metrics.initialize_metrics()
        elastic_agent = LocalElasticAgent(spec, start_method=args.start_method)
        elastic_agent.run(spec.role)
    finally:
        rdzv_handler.shutdown()
        if etcd_server is not None:
            etcd_server.stop()