def main(args=None):
    """Parse the launcher arguments and run the entrypoint via ``elastic_launch``.

    When ``args.standalone`` is set, a local ``EtcdServer`` is started and the
    rendezvous settings on ``args`` are overwritten to point at it
    (``rdzv_backend="etcd"``, the server's endpoint, and a random UUID as
    ``rdzv_id``). The server is stopped again before this function returns,
    whether the launch succeeded or raised.

    Args:
        args: Forwarded unchanged to ``parse_args``; defaults to ``None``.
    """
    args = parse_args(args)

    # Keep an explicit handle so cleanup depends on what was actually started,
    # not on re-reading ``args.standalone`` later.
    etcd_server = None
    if args.standalone:
        etcd_server = EtcdServer()
        etcd_server.start()

    # Everything after a successful start() runs under try/finally so that a
    # failure while building the config (not only during the launch itself)
    # still stops the standalone etcd server.
    try:
        if etcd_server is not None:
            args.rdzv_backend = "etcd"
            args.rdzv_endpoint = etcd_server.get_endpoint()
            args.rdzv_id = str(uuid.uuid4())
            log.info(
                f"\n**************************************\n"
                f"Rendezvous info:\n"
                f"--rdzv_backend={args.rdzv_backend} "
                f"--rdzv_endpoint={args.rdzv_endpoint} "
                f"--rdzv_id={args.rdzv_id}\n"
                f"**************************************\n"
            )

        config, cmd = config_from_args(args)
        # cmd[0] is the entrypoint; the remaining items are its arguments.
        elastic_launch(
            config=config,
            entrypoint=cmd[0],
        )(*cmd[1:])
    finally:
        if etcd_server is not None:
            etcd_server.stop()
def elastic_launch_wrapper():
    """Run the elastic launch of the Python test script from a plain function.

    ``elastic_launch`` is a class, so it is wrapped in a function here in
    order to make multiprocess return the correct exit code.
    """
    launch_config = self.get_test_launch_config(
        min_nodes, max_nodes, nproc_per_node, run_id
    )
    launcher = elastic_launch(launch_config, sys.executable)
    launcher("-u", path("bin/test_script.py"), f"--touch_file_dir={self.test_dir}")
def test_launch_elastic(self):
    """Launch the Python test script with a 1-to-2 node config and check the workers ran."""
    nproc_per_node = 4
    launcher = elastic_launch(
        self.get_test_launch_config(1, 2, nproc_per_node),
        sys.executable,
    )
    launcher("-u", path("bin/test_script.py"), f"--touch_file_dir={self.test_dir}")

    # The expected world size here is just the per-node process count.
    self.check_works_ran(nproc_per_node)
def test_launch_script_bash(self):
    """Launch the bash test script on a single node and check the workers ran."""
    nnodes, nproc_per_node = 1, 4
    launch_config = self.get_test_launch_config(nnodes, nnodes, nproc_per_node)

    # The shell script is the entrypoint itself and receives the test
    # directory as its only argument.
    launcher = elastic_launch(launch_config, path("bin/test_script.sh"))
    launcher(f"{self.test_dir}")

    self.check_works_ran(nnodes * nproc_per_node)
def test_launch_script_python(self):
    """Launch the Python test script on a single node and check the workers ran."""
    nnodes, nproc_per_node = 1, 4
    launch_config = self.get_test_launch_config(nnodes, nnodes, nproc_per_node)

    launcher = elastic_launch(launch_config, sys.executable)
    launcher("-u", path("bin/test_script.py"), f"--touch_file_dir={self.test_dir}")

    # Make sure all the workers ran: each worker touches a file with its
    # global rank as the name.
    self.check_works_ran(nnodes * nproc_per_node)
def test_launch_elastic_agent_raise_exception(self, record_mock, mock_agent_run):
    """
    Asserts that when the agent raises an exception
    the launcher re-raises the original exception.
    """
    mock_agent_run.side_effect = MockException

    with self.assertRaises(MockException):
        launcher = elastic_launch(
            self.get_test_launch_config(1, 2, 4),
            sys.executable,
        )
        launcher(
            "-u",
            path("bin/test_script.py"),
            f"--touch_file_dir={self.test_dir}",
        )

    # Checked once the expected exception has propagated out of the launch.
    record_mock.assert_called_once()
def test_launch_elastic_worker_raise_exception(self, record_mock):
    """
    Asserts that when the worker program fails, the launcher raises
    an exception to indicate that a worker process failed.
    """
    nproc_per_node = 4

    with self.assertRaises(ChildFailedError):
        launcher = elastic_launch(
            self.get_test_launch_config(1, 2, nproc_per_node),
            sys.executable,
        )
        # The test script is run with the "--fail" flag for this case.
        launcher("-u", path("bin/test_script.py"), "--fail")

    record_mock.assert_called_once()
def test_launch_shutdown(self, agent_mock_cls):
    """Check that the rendezvous handler's ``shutdown`` is called once by a launch.

    The object returned by ``agent_mock_cls`` is a mock whose ``run`` reports
    ``WorkerState.SUCCEEDED``, and ``get_rendezvous_handler`` is patched to
    hand back a mock handler.
    """
    fake_agent = Mock()
    fake_agent.run.return_value = RunResult(WorkerState.SUCCEEDED)
    agent_mock_cls.return_value = fake_agent

    fake_rdzv_handler = Mock()
    handler_factory_target = (
        "torch.distributed.elastic.rendezvous.registry.get_rendezvous_handler"
    )
    with patch(handler_factory_target) as param_mock:
        param_mock.return_value = fake_rdzv_handler
        launcher = elastic_launch(
            self.get_test_launch_config(1, 1, 4),
            sys.executable,
        )
        launcher(
            "-u",
            path("bin/test_script.py"),
            f"--touch_file_dir={self.test_dir}",
        )
        fake_rdzv_handler.shutdown.assert_called_once()
def test_launch_function(self):
    """Launch a Python function as the entrypoint and check the returned values."""
    nnodes, nproc_per_node = 1, 4
    launch_config = self.get_test_launch_config(nnodes, nnodes, nproc_per_node)

    # The entrypoint is called with no arguments; the result maps to one
    # value per entry, compared here in sorted order.
    res = elastic_launch(launch_config, simple_rank_scale)()

    self.assertEqual([10, 11, 12, 13], sorted(res.values()))