def test_launch_single_slot(
    mock_cluster_info: mock.MagicMock,
    mock_subprocess: mock.MagicMock,
) -> None:
    """Check the torch_distributed launcher on a one-address, one-slot cluster.

    The mocked cluster info describes a single container address with a
    single slot. The test expects the subprocess mock to be called exactly
    once with the unmodified script, and expects USE_TORCH_DISTRIBUTED to be
    absent from the environment afterwards.
    """
    mock_cluster_info.return_value = test_util.make_mock_cluster_info(["0.0.0.0"], 0, 1)

    entrypoint = ["python3", "-m", "determined.exec.harness", "my_module:MyTrial"]
    launcher_args = ["--max_restarts", "1"]

    with test_util.set_resources_id_env_var():
        launch.torch_distributed.main(launcher_args, entrypoint)

    # No wrapper commands are expected: the script itself is what gets launched.
    mock_subprocess.assert_called_once_with(entrypoint)
    assert os.environ.get("USE_TORCH_DISTRIBUTED") is None
def test_sshd_worker(
    mock_api_post: mock.MagicMock,
    mock_cluster_info: mock.MagicMock,
    mock_popen: mock.MagicMock,
) -> None:
    """Check the horovod launcher on a non-chief container of a two-node job.

    The mocked cluster info has two container addresses and one slot; the
    second positional argument (1) is presumably the container rank, making
    this container a worker -- confirm against test_util. The test expects
    the launcher to return the mocked process's wait() value, export the
    chief IP and USE_HOROVOD, start the pid-server + sshd command, and POST
    to the allocation's "daemon" endpoint.
    """
    info = test_util.make_mock_cluster_info(["0.0.0.0", "0.0.0.1"], 1, num_slots=1)
    mock_cluster_info.return_value = info

    hvd_args = ["ds1", "ds2"]
    script = ["s1", "s2"]

    # The worker is expected to run the pid server followed by the sshd command.
    pid_server_part, sshd_part = launch.horovod.create_sshd_worker_cmd(
        info.allocation_id,
        len(info.slot_ids),
    )
    expected_cmd = pid_server_part + sshd_part

    worker_proc = mock.MagicMock()
    worker_proc.wait.return_value = 99
    mock_popen.return_value = worker_proc

    with test_util.set_resources_id_env_var():
        exit_code = launch.horovod.main(hvd_args, script, True)
        assert exit_code == 99

    mock_cluster_info.assert_called_once()
    assert os.environ["DET_CHIEF_IP"] == info.container_addrs[0]
    assert os.environ["USE_HOROVOD"] == "1"
    mock_popen.assert_has_calls([mock.call(expected_cmd)])

    daemon_path = f"/api/v1/allocations/{info.allocation_id}/resources/resourcesId/daemon"
    expected_post = mock.call(info.master_url, path=daemon_path, cert=certs.cli_cert)
    mock_api_post.assert_has_calls([expected_post])

    worker_proc.wait.assert_called_once()
def test_launch_worker(
    mock_api: mock.MagicMock,
    mock_cluster_info: mock.MagicMock,
    mock_subprocess: mock.MagicMock,
) -> None:
    """Check the deepspeed launcher on a non-chief container of a two-node job.

    The test expects the launcher to read cluster info once, export the first
    container address as DET_CHIEF_IP, call the mocked API once, and start a
    single subprocess consisting of the pid-server command followed by the
    sshd command.
    """
    info = test_util.make_mock_cluster_info(["0.0.0.0", "0.0.0.1"], 1, 4)
    mock_cluster_info.return_value = info

    with test_util.set_resources_id_env_var():
        launch.deepspeed.main(["script"])

    mock_cluster_info.assert_called_once()
    chief_addr = info.container_addrs[0]
    assert os.environ["DET_CHIEF_IP"] == chief_addr
    mock_api.assert_called_once()

    # Workers only run the pid server wrapped around sshd; the script itself is not launched here.
    worker_cmd = (
        launch.deepspeed.create_pid_server_cmd(info.allocation_id, len(info.slot_ids))
        + launch.deepspeed.create_sshd_cmd()
    )
    mock_subprocess.assert_called_once_with(worker_cmd)
def test_launch_one_slot(
    mock_cluster_info: mock.MagicMock,
    mock_subprocess: mock.MagicMock,
) -> None:
    """Check the deepspeed launcher when the cluster has a single container address.

    The expected command is the pid server, the deepspeed run command
    targeting "localhost", the pid client, the log redirect, and finally the
    user script. The test also expects DET_CHIEF_IP and USE_DEEPSPEED to be
    exported.
    """
    info = test_util.make_mock_cluster_info(["0.0.0.0"], 0, 4)
    mock_cluster_info.return_value = info
    script = ["s1", "s2"]

    # Assemble the expected command in launch order.
    command_parts = (
        launch.deepspeed.create_pid_server_cmd(info.allocation_id, len(info.slot_ids)),
        launch.deepspeed.create_run_command("localhost", launch.deepspeed.hostfile_path),
        launch.deepspeed.create_pid_client_cmd(info.allocation_id),
        launch.deepspeed.create_log_redirect_cmd(),
        script,
    )
    expected_cmd = []
    for part in command_parts:
        expected_cmd = expected_cmd + part

    with test_util.set_resources_id_env_var():
        launch.deepspeed.main(script)

    mock_cluster_info.assert_called_once()
    assert os.environ["DET_CHIEF_IP"] == info.container_addrs[0]
    assert os.environ["USE_DEEPSPEED"] == "1"
    mock_subprocess.assert_called_once_with(expected_cmd)
def test_launch_distributed(
    mock_cluster_info: mock.MagicMock,
    mock_subprocess: mock.MagicMock,
) -> None:
    """Check the torch_distributed launcher on a two-address cluster.

    The test expects main() to return the mocked process's wait() value and
    to start exactly one subprocess: pid server, torch launch command, pid
    client, log redirect, then the user script. It also expects
    USE_TORCH_DISTRIBUTED and DET_CHIEF_IP to be exported.
    """
    info = test_util.make_mock_cluster_info(["0.0.0.0", "0.0.0.1"], 0, 2)
    mock_cluster_info.return_value = info

    script = ["python3", "-m", "determined.exec.harness", "my_module:MyTrial"]
    override_args = ["--max_restarts", "1"]

    # Arbitrary sentinel so the test can tell that main() forwards wait()'s result.
    expected_exit_code = 99
    launched_proc = mock.MagicMock()
    launched_proc.wait.return_value = expected_exit_code
    mock_subprocess.return_value = launched_proc

    with test_util.set_resources_id_env_var():
        assert launch.torch_distributed.main(override_args, script) == expected_exit_code

    num_slots = len(info.slot_ids)
    chief_addr = info.container_addrs[0]

    pid_server_cmd = launch.torch_distributed.create_pid_server_cmd(
        info.allocation_id, num_slots
    )
    torch_launch_cmd = launch.torch_distributed.create_launch_cmd(
        len(info.container_addrs),
        num_slots,
        info.container_rank,
        chief_addr,
        override_args,
    )
    pid_client_cmd = launch.torch_distributed.create_pid_client_cmd(info.allocation_id)
    log_redirect_cmd = launch.torch_distributed.create_log_redirect_cmd()

    expected_cmd = (
        pid_server_cmd + torch_launch_cmd + pid_client_cmd + log_redirect_cmd + script
    )
    mock_subprocess.assert_called_once_with(expected_cmd)

    assert os.environ["USE_TORCH_DISTRIBUTED"] == "True"
    assert os.environ["DET_CHIEF_IP"] == chief_addr
def test_launch_multi_slot_chief(
    mock_time: mock.MagicMock,
    mock_check_sshd: mock.MagicMock,
    mock_cluster_info: mock.MagicMock,
    mock_subprocess: mock.MagicMock,
) -> None:
    """Check the deepspeed launcher on the first container of a two-node job.

    The test expects the launcher to:
      * export DET_CHIEF_IP, USE_DEEPSPEED and PDSH_SSH_ARGS,
      * start sshd and then the full launch command (pid server, deepspeed
        run command, pid client, log redirect, user script),
      * call the sshd check once per container address with a deadline of
        the mocked start time plus 20,
      * wait on the launch process, then kill and wait on the sshd process.

    The deepspeed environment file that launch.deepspeed.main creates is
    removed in a ``finally`` clause so that a failing assertion does not
    leave it behind in the working directory.
    """
    cluster_info = test_util.make_mock_cluster_info(["0.0.0.0", "0.0.0.1"], 0, 4)
    mock_cluster_info.return_value = cluster_info

    # Freeze the clock value so the expected sshd-check deadline is deterministic.
    mock_start_time = time.time()
    mock_time.return_value = mock_start_time

    script = ["s1", "s2"]
    sshd_cmd = launch.deepspeed.create_sshd_cmd()
    pid_server_cmd = launch.deepspeed.create_pid_server_cmd(
        cluster_info.allocation_id, len(cluster_info.slot_ids)
    )
    deepspeed_cmd = launch.deepspeed.create_run_command(
        cluster_info.container_addrs[0], launch.deepspeed.hostfile_path
    )
    pid_client_cmd = launch.deepspeed.create_pid_client_cmd(cluster_info.allocation_id)
    log_redirect_cmd = launch.deepspeed.create_log_redirect_cmd()
    launch_cmd = pid_server_cmd + deepspeed_cmd + pid_client_cmd + log_redirect_cmd + script

    sshd_proc_mock = mock.MagicMock()
    launch_proc_mock = mock.MagicMock()

    def mock_process(cmd: List[str], *args: Any, **kwargs: Any) -> Any:
        """Route each spawned command to the mock standing in for that process."""
        if cmd == sshd_cmd:
            return sshd_proc_mock(*args, **kwargs)
        if cmd == launch_cmd:
            return launch_proc_mock(*args, **kwargs)
        return None

    mock_subprocess.side_effect = mock_process

    try:
        with test_util.set_resources_id_env_var():
            launch.deepspeed.main(script)

        mock_cluster_info.assert_called_once()
        assert os.environ["DET_CHIEF_IP"] == cluster_info.container_addrs[0]
        assert os.environ["USE_DEEPSPEED"] == "1"
        assert os.environ["PDSH_SSH_ARGS"] == (
            "-o PasswordAuthentication=no -o StrictHostKeyChecking=no "
            f"-p {constants.DTRAIN_SSH_PORT} -2 -a -x %h"
        )

        mock_subprocess.assert_has_calls([mock.call(sshd_cmd), mock.call(launch_cmd)])

        assert mock_check_sshd.call_count == len(cluster_info.container_addrs)
        mock_check_sshd.assert_has_calls(
            [
                mock.call(addr, mock_start_time + 20, constants.DTRAIN_SSH_PORT)
                for addr in cluster_info.container_addrs
            ]
        )

        # Calling a MagicMock again returns the same return_value object, i.e.
        # the very process objects handed out by mock_process above.
        launch_proc_mock().wait.assert_called_once()
        sshd_proc_mock().kill.assert_called_once()
        sshd_proc_mock().wait.assert_called_once()
    finally:
        # Cleanup deepspeed environment file created in launch.deepspeed.main
        deepspeed_env_path = os.path.join(os.getcwd(), DEEPSPEED_ENVIRONMENT_NAME)
        if os.path.isfile(deepspeed_env_path):
            os.remove(deepspeed_env_path)
def test_horovod_chief(
    mock_time: mock.MagicMock,
    mock_check_sshd: mock.MagicMock,
    mock_cluster_info: mock.MagicMock,
    mock_popen: mock.MagicMock,
    nslots: int,
    nnodes: int,
    autohorovod: bool,
) -> None:
    """Check the horovod launcher on the first container for several cluster shapes.

    Args:
        mock_time: Mock whose return value is pinned so the expected
            sshd-check deadline is deterministic.
        mock_check_sshd: Mock of the sshd check.
        mock_cluster_info: Mock returning the fabricated cluster info.
        mock_popen: Mock of the process launcher; its return value's wait()
            yields 99.
        nslots: Number of slots per container.
        nnodes: Number of container addresses to fabricate.
        autohorovod: Passed as the third argument to launch.horovod.main.

    nslots, nnodes and autohorovod are presumably supplied by a parametrize
    decorator that is not visible in this block.

    With autohorovod on a single node and single slot the script is expected
    to be launched directly; otherwise a full horovodrun command is expected
    and the sshd check must be called for every non-chief address.
    """
    # One distinct address per node ("0.0.0.0", "0.0.0.1", ...), so the
    # per-address sshd-check assertions below can tell the nodes apart.
    info = test_util.make_mock_cluster_info(
        [f"0.0.0.{i}" for i in range(nnodes)], 0, num_slots=nslots
    )
    experiment_config = info.trial._config
    mock_cluster_info.return_value = info

    mock_start_time = time.time()
    mock_time.return_value = mock_start_time

    hvd_args = ["ds1", "ds2"]
    script = ["s1", "s2"]

    pid_server_cmd = launch.horovod.create_hvd_pid_server_cmd(
        info.allocation_id, len(info.slot_ids)
    )
    hvd_cmd = horovod.create_run_command(
        num_proc_per_machine=len(info.slot_ids),
        ip_addresses=info.container_addrs,
        inter_node_network_interface=info.trial._inter_node_network_interface,
        optimizations=experiment_config["optimizations"],
        debug=False,
        optional_args=hvd_args,
    )
    worker_wrapper_cmd = launch.horovod.create_worker_wrapper_cmd(info.allocation_id)
    launch_cmd = pid_server_cmd + hvd_cmd + worker_wrapper_cmd + script

    mock_proc = mock.MagicMock()
    mock_proc.wait.return_value = 99
    mock_popen.return_value = mock_proc

    with test_util.set_resources_id_env_var():
        assert launch.horovod.main(hvd_args, script, autohorovod) == 99

    if autohorovod and nnodes == 1 and nslots == 1:
        # Single-slot --autohorovod: we should have just called the script directly.
        mock_popen.assert_has_calls([mock.call(script)])
        mock_check_sshd.assert_not_called()
    else:
        # Multi-slot or non --autohorovod: expect a full horovodrun command.
        mock_cluster_info.assert_called_once()
        assert os.environ["DET_CHIEF_IP"] == info.container_addrs[0]
        assert os.environ["USE_HOROVOD"] == "1"
        mock_popen.assert_has_calls([mock.call(launch_cmd)])

        # Only the non-chief addresses are expected to be checked for sshd.
        worker_addrs = info.container_addrs[1:]
        assert mock_check_sshd.call_count == len(worker_addrs)
        mock_check_sshd.assert_has_calls(
            [
                mock.call(addr, mock_start_time + 20, constants.DTRAIN_SSH_PORT)
                for addr in worker_addrs
            ]
        )

    mock_proc.wait.assert_called_once()