def test_parse_rendezvous_raises_error_if_value_is_empty(self) -> None:
    # Every input below names option 'a' either without any '=' or with
    # nothing after the '='; each one must be rejected with the
    # "must have a value specified" error.
    expected_error = (
        r"^The rendezvous configuration option 'a' must have a value specified.$"
    )
    missing_value_inputs = (
        "b=dummy1,a,c=dummy2",
        "b=dummy1,c=dummy2,a",
        "b=dummy1,a=,c=dummy2",
        " a ",
    )

    for config_str in missing_value_inputs:
        # One sub-test per input so a failure pinpoints the offending string.
        with self.subTest(config_str=config_str), self.assertRaisesRegex(
            ValueError, expected_error
        ):
            _parse_rendezvous_config(config_str)
def elastic_config(args):
    """Translate parsed launcher arguments into a ``LaunchConfig``.

    Args:
        args: Object exposing the launcher options as attributes
            (``n_node``, ``n_proc``, ``rdzv_conf``, ``rdzv_backend``,
            ``node_rank``, ``rdzv_id``, ``role``, ``max_restarts``,
            ``monitor_interval``, ``start_method``, ``redirects``, ``tee``
            and ``log_dir``).

    Returns:
        The ``LaunchConfig`` assembled from those options.
    """
    node_floor, node_ceiling = parse_min_max_nodes(args.n_node)
    procs_per_node = local_world_size(args.n_proc)

    rendezvous_options = _parse_rendezvous_config(args.rdzv_conf)
    # With the "static" backend the node rank is forwarded to the
    # rendezvous layer through the config dict under the "rank" key.
    if args.rdzv_backend == "static":
        rendezvous_options["rank"] = args.node_rank

    endpoint = get_rdzv_endpoint(args, node_ceiling)

    return LaunchConfig(
        min_nodes=node_floor,
        max_nodes=node_ceiling,
        nproc_per_node=procs_per_node,
        run_id=args.rdzv_id,
        role=args.role,
        rdzv_endpoint=endpoint,
        rdzv_backend=args.rdzv_backend,
        rdzv_configs=rendezvous_options,
        max_restarts=args.max_restarts,
        monitor_interval=args.monitor_interval,
        start_method=args.start_method,
        redirects=Std.from_str(args.redirects),
        tee=Std.from_str(args.tee),
        log_dir=args.log_dir,
    )
def test_parse_rendezvous_returns_empty_dict_if_str_is_empty(self) -> None:
    # An empty string and a whitespace-only string must both parse to an
    # empty configuration dict.
    for config_str in ("", " "):
        with self.subTest(config_str=config_str):
            self.assertEqual(_parse_rendezvous_config(config_str), {})
def test_parse_rendezvous_raises_error_if_str_is_invalid(self) -> None:
    # Inputs with a trailing comma, an empty segment, or a segment lacking
    # a key must all be rejected with the generic "must be in format" error.
    expected_error = (
        r"^The rendezvous configuration string must be in format "
        r"<key1>=<value1>,...,<keyN>=<valueN>.$"
    )
    malformed_inputs = (
        "a=dummy1,",
        "a=dummy1,,c=dummy2",
        "a=dummy1, ,c=dummy2",
        "a=dummy1,= ,c=dummy2",
        "a=dummy1, = ,c=dummy2",
        "a=dummy1, =,c=dummy2",
        " , ",
    )

    for config_str in malformed_inputs:
        # One sub-test per input so a failure pinpoints the offending string.
        with self.subTest(config_str=config_str), self.assertRaisesRegex(
            ValueError, expected_error
        ):
            _parse_rendezvous_config(config_str)
def test_parse_rendezvous_config_returns_dict(self) -> None:
    # The input deliberately has stray spaces around keys and values, an
    # extra '=' inside the value of 'c', and keys out of alphabetical
    # order; the expected dict shows the trimmed keys/values and that only
    # the first '=' separates a key from its value.
    raw_config = " b= dummy2 ,c=dummy3=dummy4, a =dummy1,d=dummy5/dummy6"
    expected_config = dict(
        a="dummy1",
        b="dummy2",
        c="dummy3=dummy4",
        d="dummy5/dummy6",
    )

    self.assertEqual(_parse_rendezvous_config(raw_config), expected_config)
def config_from_args(args) -> Tuple[LaunchConfig, List[str]]:
    """Build the launch configuration and worker command line from ``args``.

    Side effect: when ``OMP_NUM_THREADS`` is not set in the environment and
    more than one process per node is requested, it is set to ``"1"`` in
    ``os.environ`` and a notice is printed.

    Args:
        args: Object exposing the parsed launcher options as attributes.

    Returns:
        A ``(config, cmd)`` pair: the ``LaunchConfig`` and the command as a
        list of strings (optional interpreter prefix, training script, then
        the script's own arguments).

    Raises:
        AssertionError: If the node bounds are not ``0 < min <= max`` or
            ``max_restarts`` is negative.
        ValueError: If ``no_python`` and ``module`` are both set.
    """
    node_floor, node_ceiling = parse_min_max_nnodes(args.nnodes)
    assert 0 < node_floor <= node_ceiling
    assert args.max_restarts >= 0

    procs_per_node = determine_local_world_size(args.nproc_per_node)
    if "OMP_NUM_THREADS" not in os.environ and procs_per_node > 1:
        default_omp_threads = 1
        notice = (
            f"*****************************************\n"
            f"Setting OMP_NUM_THREADS environment variable for each process to be "
            f"{default_omp_threads} in default, to avoid your system being overloaded, "
            f"please further tune the variable for optimal performance in "
            f"your application as needed. \n"
            f"*****************************************"
        )
        print(notice)
        # Exported here so the value is inherited by the worker subprocesses.
        os.environ["OMP_NUM_THREADS"] = str(default_omp_threads)

    config = LaunchConfig(
        min_nodes=node_floor,
        max_nodes=node_ceiling,
        nproc_per_node=procs_per_node,
        run_id=args.rdzv_id,
        role=args.role,
        rdzv_endpoint=args.rdzv_endpoint,
        rdzv_backend=args.rdzv_backend,
        rdzv_configs=_parse_rendezvous_config(args.rdzv_conf),
        max_restarts=args.max_restarts,
        monitor_interval=args.monitor_interval,
        start_method=args.start_method,
        redirects=Std.from_str(args.redirects),
        tee=Std.from_str(args.tee),
    )

    if args.no_python:
        # Without an interpreter in front there is nothing to hand "-m" to.
        if args.module:
            raise ValueError("Don't use both the '--no_python' flag"
                             " and the '--module' flag at the same time.")
        interpreter_prefix = []
    else:
        interpreter_prefix = [sys.executable, "-u"]
        if args.module:
            interpreter_prefix.append("-m")

    cmd = [*interpreter_prefix, args.training_script, *args.training_script_args]
    return config, cmd
def config_from_args(
        args) -> Tuple[LaunchConfig, Union[Callable, str], List[str]]:
    """Build the launch configuration, entrypoint and entrypoint arguments.

    Side effect: when ``OMP_NUM_THREADS`` is not set in the environment and
    more than one process per node is requested, it is set to ``"1"`` in
    ``os.environ`` and a notice is printed.

    Args:
        args: Object exposing the parsed launcher options as attributes.

    Returns:
        A ``(config, cmd, cmd_args)`` triple:

        * ``config`` - the ``LaunchConfig`` built from ``args``;
        * ``cmd`` - ``run_script_path`` when ``args.run_path`` is set, the
          Python interpreter path when running with Python, otherwise the
          training script itself;
        * ``cmd_args`` - the list of arguments to pass to ``cmd``.

    Raises:
        AssertionError: If the node bounds are not ``0 < min <= max`` or
            ``max_restarts`` is negative.
        ValueError: If ``no_python`` is set without ``use_env``, or together
            with ``module``.
    """
    min_nodes, max_nodes = parse_min_max_nnodes(args.nnodes)
    assert 0 < min_nodes <= max_nodes
    assert args.max_restarts >= 0

    nproc_per_node = determine_local_world_size(args.nproc_per_node)
    if "OMP_NUM_THREADS" not in os.environ and nproc_per_node > 1:
        omp_num_threads = 1
        print(
            f"*****************************************\n"
            f"Setting OMP_NUM_THREADS environment variable for each process to be "
            f"{omp_num_threads} in default, to avoid your system being overloaded, "
            f"please further tune the variable for optimal performance in "
            f"your application as needed. \n"
            f"*****************************************")
        # This env variable will be passed down to the subprocesses
        os.environ["OMP_NUM_THREADS"] = str(omp_num_threads)

    rdzv_configs = _parse_rendezvous_config(args.rdzv_conf)

    # With the "static" backend the node rank is forwarded to the
    # rendezvous layer through the config dict under the "rank" key.
    if args.rdzv_backend == "static":
        rdzv_configs["rank"] = args.node_rank

    rdzv_endpoint = get_rdzv_endpoint(args)

    config = LaunchConfig(
        min_nodes=min_nodes,
        max_nodes=max_nodes,
        nproc_per_node=nproc_per_node,
        run_id=args.rdzv_id,
        role=args.role,
        rdzv_endpoint=rdzv_endpoint,
        rdzv_backend=args.rdzv_backend,
        rdzv_configs=rdzv_configs,
        max_restarts=args.max_restarts,
        monitor_interval=args.monitor_interval,
        start_method=args.start_method,
        redirects=Std.from_str(args.redirects),
        tee=Std.from_str(args.tee),
        log_dir=args.log_dir,
    )

    with_python = not args.no_python
    cmd: Union[Callable, str]
    cmd_args = []
    if args.run_path:
        # The script is executed through the ``run_script_path`` callable
        # instead of a separate interpreter command.
        cmd = run_script_path
        cmd_args.append(args.training_script)
    else:
        if with_python:
            cmd = sys.executable
            cmd_args.append("-u")
            if args.module:
                cmd_args.append("-m")
            cmd_args.append(args.training_script)
        else:
            if not args.use_env:
                raise ValueError("When using the '--no_python' flag,"
                                 " you must also set the '--use_env' flag.")
            if args.module:
                raise ValueError("Don't use both the '--no_python' flag"
                                 " and the '--module' flag at the same time.")
            cmd = args.training_script
    if not args.use_env:
        # ``os.environ`` is a mapping, not a callable, so the hint shown to
        # users must use subscript syntax to be copy-paste correct.
        log.warning(
            "--use_env is deprecated and will be removed in future releases.\n"
            " Please read local_rank from `os.environ['LOCAL_RANK']` instead.")
        # NOTE(review): ``macros.local_rank`` is presumably a placeholder
        # substituted per worker at launch time - confirm against ``macros``.
        cmd_args.append(f"--local_rank={macros.local_rank}")
    cmd_args.extend(args.training_script_args)

    return config, cmd, cmd_args
def main(args=None):
    """Parse launcher arguments and run the workers under a local elastic agent.

    In standalone mode a local etcd server is started and used as the
    rendezvous backend. That server is stopped on every exit path that
    follows its successful start, including failures that happen before the
    agent is created.

    Side effect: when ``OMP_NUM_THREADS`` is not set in the environment and
    more than one process per node is requested, it is set to ``"1"`` in
    ``os.environ`` and a notice is printed.

    Args:
        args: Forwarded unchanged to ``parse_args``.

    Raises:
        AssertionError: If the node bounds are not ``0 < min <= max`` or
            ``max_restarts`` is negative.
        ValueError: If ``no_python`` and ``module`` are both set.
        ChildFailedError: If the agent's run result reports a failure.
    """
    args = parse_args(args)

    min_nodes, max_nodes = parse_min_max_nnodes(args.nnodes)
    assert 0 < min_nodes <= max_nodes
    assert args.max_restarts >= 0

    elastic_agent = None
    # Bound only after a successful ``start()`` so the outer ``finally``
    # never stops a server that was not started.
    etcd_server = None

    try:
        if args.standalone:
            standalone_server = EtcdServer()
            standalone_server.start()
            etcd_server = standalone_server

            args.rdzv_backend = "etcd"
            args.rdzv_endpoint = etcd_server.get_endpoint()
            args.rdzv_id = str(uuid.uuid4())
            log.info(f"\n**************************************\n"
                     f"Rendezvous info:\n"
                     f"--rdzv_backend={args.rdzv_backend} "
                     f"--rdzv_endpoint={args.rdzv_endpoint} "
                     f"--rdzv_id={args.rdzv_id}\n"
                     f"**************************************\n")

        nproc_per_node = determine_local_world_size(args.nproc_per_node)
        if "OMP_NUM_THREADS" not in os.environ and nproc_per_node > 1:
            omp_num_threads = 1
            print(
                f"*****************************************\n"
                f"Setting OMP_NUM_THREADS environment variable for each process to be "
                f"{omp_num_threads} in default, to avoid your system being overloaded, "
                f"please further tune the variable for optimal performance in "
                f"your application as needed. \n"
                f"*****************************************")
            # This env variable will be passed down to the subprocesses
            os.environ["OMP_NUM_THREADS"] = str(omp_num_threads)

        with_python = not args.no_python
        cmd = []
        if with_python:
            cmd = [sys.executable, "-u"]
            if args.module:
                cmd.append("-m")
        else:
            if args.module:
                raise ValueError("Don't use both the '--no_python' flag"
                                 " and the '--module' flag at the same time.")

        cmd.append(args.training_script)
        cmd.extend(args.training_script_args)

        rdzv_parameters = RendezvousParameters(
            backend=args.rdzv_backend,
            endpoint=args.rdzv_endpoint,
            run_id=args.rdzv_id,
            min_nodes=min_nodes,
            max_nodes=max_nodes,
            **_parse_rendezvous_config(args.rdzv_conf),
        )

        rdzv_handler = rdzv_registry.get_rendezvous_handler(rdzv_parameters)

        try:
            spec = WorkerSpec(
                role=args.role,
                local_world_size=nproc_per_node,
                entrypoint=cmd[0],
                args=(*cmd[1:], ),
                rdzv_handler=rdzv_handler,
                max_restarts=args.max_restarts,
                monitor_interval=args.monitor_interval,
                redirects=Std.from_str(args.redirects),
                tee=Std.from_str(args.tee),
            )
            metrics.initialize_metrics()
            elastic_agent = LocalElasticAgent(spec=spec,
                                              start_method=args.start_method,
                                              log_dir=args.log_dir)
            run_result = elastic_agent.run(spec.role)
            # The agent-level SUCCEEDED event is recorded as soon as ``run``
            # returns, before the result is checked for child failures.
            events.record(
                elastic_agent.get_agent_status_event(WorkerState.SUCCEEDED))
            if run_result.is_failed():
                # ChildFailedError is treated specially by @record
                # if the error files for the failed children exist
                # @record will copy the first error (root cause)
                # to the error file of the launcher process
                raise ChildFailedError(
                    name=args.training_script,
                    failures=run_result.failures,
                )
        except ChildFailedError:
            # Child failures propagate without an agent FAILED event.
            raise
        except Exception:
            if elastic_agent:
                events.record(
                    elastic_agent.get_agent_status_event(WorkerState.FAILED))
            else:
                # The agent was never created; fall back to an event built
                # from the arguments alone.
                events.record(_construct_event(args))
            raise
        finally:
            rdzv_handler.shutdown()
    finally:
        # Runs even if setup failed before the handler existed or if
        # ``rdzv_handler.shutdown()`` itself raised.
        if etcd_server is not None:
            etcd_server.stop()