def test_launch_elastic(self): run_id = str(uuid.uuid4().int) min_nodes = 1 max_nodes = 2 nproc_per_node = 4 # we are only launching 1 node (even though max = 2) world_size = nproc_per_node args = [ f"--nnodes={min_nodes}:{max_nodes}", f"--nproc_per_node={nproc_per_node}", "--rdzv_backend=etcd", f"--rdzv_endpoint={self._etcd_endpoint}", f"--rdzv_id={run_id}", "--monitor_interval=1", "--start_method=fork", path("bin/test_script.py"), f"--touch_file_dir={self.test_dir}", ] launch.main(args) # make sure all the workers ran # each worker touches a file with its global rank as the name self.assertSetEqual({str(i) for i in range(world_size)}, set(os.listdir(self.test_dir)))
def test_launch_elastic_multiple_agents(self): run_id = str(uuid.uuid4().int) min_nodes = 1 max_nodes = 2 nproc_per_node = 4 nnodes = 2 world_size = nnodes * nproc_per_node args = [ f"--nnodes={min_nodes}:{max_nodes}", f"--nproc_per_node={nproc_per_node}", "--rdzv_backend=etcd", f"--rdzv_endpoint={self._etcd_endpoint}", f"--rdzv_id={run_id}", "--monitor_interval=1", "--start_method=fork", path("bin/test_script.py"), f"--touch_file_dir={self.test_dir}", ] procs = [] for _ in range(nnodes - 1): p = mp.Process(target=launch.main, args=[args]) procs.append(p) p.start() launch.main(args) for i in range(nnodes - 1): p = procs[i] p.join() self.assertEqual(0, p.exitcode) # make sure all the workers ran # each worker touches a file with its global rank as the name self.assertSetEqual({str(i) for i in range(world_size)}, set(os.listdir(self.test_dir)))
def test_launch_elastic_agent_raise_exception(self, record_mock, mock_agent_run): """ Asserts that when the agent raises an exception the launcher re-raises the original exception """ run_id = str(uuid.uuid4().int) min_nodes = 1 max_nodes = 2 nproc_per_node = 4 args = [ f"--nnodes={min_nodes}:{max_nodes}", f"--nproc_per_node={nproc_per_node}", "--rdzv_backend=etcd", f"--rdzv_endpoint={self._etcd_endpoint}", f"--rdzv_id={run_id}", "--monitor_interval=1", "--max_restarts=0", "--start_method=fork", path("bin/test_script.py"), f"--touch_file_dir={self.test_dir}", ] mock_agent_run.side_effect = MockException with self.assertRaises(MockException): launch.main(args) record_mock.assert_called_once()
def test_launch_elastic_worker_raise_exception(self, record_mock): """ Asserts that when the worker program fails and lancher raieses exception to indicate that worker process failed """ run_id = str(uuid.uuid4().int) min_nodes = 1 max_nodes = 2 nproc_per_node = 4 args = [ f"--nnodes={min_nodes}:{max_nodes}", f"--nproc_per_node={nproc_per_node}", "--rdzv_backend=etcd", f"--rdzv_endpoint={self._etcd_endpoint}", f"--rdzv_id={run_id}", "--monitor_interval=1", "--max_restarts=0", "--start_method=fork", path("bin/test_script.py"), "--fail", ] with self.assertRaises(ChildFailedError): launch.main(args) record_mock.assert_called_once()
def test_launch_with_env_vars(self): run_id = str(uuid.uuid4().int) nnodes = 1 nproc_per_node = 4 world_size = nnodes * nproc_per_node os.environ["PET_NNODES"] = str(nnodes) os.environ["PET_NPROC_PER_NODE"] = str(nproc_per_node) os.environ["PET_RDZV_BACKEND"] = "etcd" os.environ["PET_RDZV_ENDPOINT"] = self._etcd_endpoint os.environ["PET_RDZV_ID"] = run_id os.environ["PET_MONITOR_INTERVAL"] = "1" os.environ["PET_START_METHOD"] = "fork" os.environ["PET_NO_PYTHON"] = "1" script_args = [path("bin/test_script.sh"), f"{self.test_dir}"] with self.assertRaises(ValueError): # --no_python cannot be used with --module os.environ["PET_MODULE"] = "1" launch.main(script_args) os.environ["PET_MODULE"] = "0" launch.main(script_args) # make sure all the workers ran # each worker touches a file with its global rank as the name self.assertSetEqual({str(i) for i in range(world_size)}, set(os.listdir(self.test_dir)))
def _test_nproc_launch_configuration(self, nproc_type, expected_number): run_id = str(uuid.uuid4().int) nnodes = 1 args = [ f"--nnodes={nnodes}", f"--nproc_per_node={nproc_type}", "--rdzv_backend=etcd", f"--rdzv_endpoint={self._etcd_endpoint}", f"--rdzv_id={run_id}", "--monitor_interval=1", "--start_method=fork", "--no_python", ] script_args = [path("bin/test_script.sh"), f"{self.test_dir}"] launch.main(args + script_args) world_size = nnodes * expected_number # make sure all the workers ran # each worker touches a file with its global rank as the name self.assertSetEqual({str(i) for i in range(world_size)}, set(os.listdir(self.test_dir)))
def test_launch_user_script_bash(self): run_id = str(uuid.uuid4().int) nnodes = 1 nproc_per_node = 4 world_size = nnodes * nproc_per_node args = [ f"--nnodes={nnodes}", f"--nproc_per_node={nproc_per_node}", "--rdzv_backend=etcd", f"--rdzv_endpoint={self._etcd_endpoint}", f"--rdzv_id={run_id}", "--monitor_interval=1", "--start_method=fork", "--no_python", ] script_args = [path("bin/test_script.sh"), f"{self.test_dir}"] with self.assertRaises(ValueError): # --no_python cannot be used with --module launch.main(args + ["--module"] + script_args) launch.main(args + script_args) # make sure all the workers ran # each worker touches a file with its global rank as the name self.assertSetEqual({str(i) for i in range(world_size)}, set(os.listdir(self.test_dir)))
def test_launch_user_script_python_caffe2_bc(self): nnodes = 1 nproc_per_node = 4 world_size = nnodes * nproc_per_node sock = get_socket_with_port() with closing(sock): master_port = sock.getsockname()[1] args = [ f"--nnodes={nnodes}", f"--nproc_per_node={nproc_per_node}", "--monitor_interval=1", "--start_method=fork", "--master_addr=localhost", f"--master_port={master_port}", "--node_rank=0", path("bin/test_script.py"), f"--touch_file_dir={self.test_dir}", ] launch.main(args) # make sure all the workers ran # each worker touches a file with its global rank as the name self.assertSetEqual({str(i) for i in range(world_size)}, set(os.listdir(self.test_dir)))
def test_init_method_env_with_torchelastic(self): port = get_free_port() launch.main([ "--run_path", "--nnodes=1", "--nproc_per_node=4", "--master_addr=localhost", f"--master_port={port}", "--monitor_interval=1", path("bin/test_script_init_method.py"), "--init_method=env://", ])
def test_is_torchelastic_launched(self): # launch test script with torchelastic and validate that # torch.distributed.is_torchelastic_launched() returns True out_file = f"{os.path.join(self.test_dir, 'out')}" launch.main([ "--run_path", "--nnodes=1", "--nproc_per_node=1", "--monitor_interval=1", path("bin/test_script_is_torchelastic_launched.py"), f"--out_file={out_file}", ]) with open(out_file, "r") as fp: is_torchelastic_launched = fp.readline() self.assertEqual("True", is_torchelastic_launched)
def test_launch_run_path(self): nnodes = 1 nproc_per_node = 4 world_size = nnodes * nproc_per_node args = [ "--run_path", f"--nnodes={nnodes}", f"--nproc_per_node={nproc_per_node}", "--monitor_interval=1", "--start_method=fork", path("bin/test_script.py"), f"--touch_file_dir={self.test_dir}", ] launch.main(args) # make sure all the workers ran # each worker touches a file with its global rank as the name self.assertSetEqual({str(i) for i in range(world_size)}, set(os.listdir(self.test_dir)))
def test_launch_shutdown(self, agent_mock_cls): nnodes = 1 nproc_per_node = 4 args = [ f"--nnodes={nnodes}", f"--nproc_per_node={nproc_per_node}", "--monitor_interval=1", "--start_method=fork", path("bin/test_script.py"), f"--touch_file_dir={self.test_dir}", ] agent_mock = Mock() agent_mock.run.return_value = RunResult(WorkerState.SUCCEEDED) agent_mock_cls.return_value = agent_mock rdzv_handler_mock = Mock() with patch( "torch.distributed.elastic.rendezvous.registry.get_rendezvous_handler" ) as param_mock: param_mock.return_value = rdzv_handler_mock launch.main(args) rdzv_handler_mock.shutdown.assert_called_once()
def _test_launch_user_script_python(self): run_id = str(uuid.uuid4().int) nnodes = 1 nproc_per_node = 4 world_size = nnodes * nproc_per_node args = [ f"--nnodes={nnodes}", f"--nproc_per_node={nproc_per_node}", "--rdzv_backend=etcd", f"--rdzv_endpoint={self._etcd_endpoint}", f"--rdzv_id={run_id}", "--monitor_interval=1", "--start_method=spawn", path("bin/test_script.py"), f"--touch_file_dir={self.test_dir}", ] launch.main(args) # make sure all the workers ran # each worker touches a file with its global rank as the name self.assertSetEqual({str(i) for i in range(world_size)}, set(os.listdir(self.test_dir)))
def launch_in_proc(args): launch.main(args)