Ejemplo n.º 1
0
def test_launch_single_slot(
    mock_cluster_info: mock.MagicMock,
    mock_subprocess: mock.MagicMock,
) -> None:
    """Check that torch_distributed.main runs the bare script on a one-address cluster.

    The mocked cluster info lists a single container address; the trailing
    positional arguments (0 and 1) are presumably the container rank and the
    slot count -- confirm against test_util.make_mock_cluster_info.

    Args:
        mock_cluster_info: Mock whose return value supplies the cluster info.
        mock_subprocess: Mock expected to be called exactly once with the
            unmodified script.
    """
    mock_cluster_info.return_value = test_util.make_mock_cluster_info(["0.0.0.0"], 0, 1)
    entrypoint = ["python3", "-m", "determined.exec.harness", "my_module:MyTrial"]
    torch_args = ["--max_restarts", "1"]

    with test_util.set_resources_id_env_var():
        launch.torch_distributed.main(torch_args, entrypoint)

    # No wrapper commands are expected: the script is passed through as-is.
    mock_subprocess.assert_called_once_with(entrypoint)

    # Environment values are always strings, so "absent" is the same as `.get() is None`.
    assert "USE_TORCH_DISTRIBUTED" not in os.environ
def test_sshd_worker(
    mock_api_post: mock.MagicMock,
    mock_cluster_info: mock.MagicMock,
    mock_popen: mock.MagicMock,
) -> None:
    """Check the command, environment and API call made by horovod.main on a worker.

    The cluster info is built with two container addresses and a second
    positional argument of 1 (presumably the container rank, i.e. a
    non-chief node -- confirm against test_util.make_mock_cluster_info).

    Args:
        mock_api_post: Mock expected to receive the ".../daemon" POST.
        mock_cluster_info: Mock whose return value supplies the cluster info.
        mock_popen: Mock returning the fake process whose ``wait`` yields 99.
    """
    cluster = test_util.make_mock_cluster_info(["0.0.0.0", "0.0.0.1"], 1, num_slots=1)
    mock_cluster_info.return_value = cluster
    horovod_args = ["ds1", "ds2"]
    entrypoint = ["s1", "s2"]

    pid_server_cmd, run_sshd_cmd = launch.horovod.create_sshd_worker_cmd(
        cluster.allocation_id, len(cluster.slot_ids)
    )
    expected_cmd = pid_server_cmd + run_sshd_cmd

    # The fake process reports exit code 99, which main() must hand back.
    worker_proc = mock.MagicMock()
    worker_proc.wait.return_value = 99
    mock_popen.return_value = worker_proc

    with test_util.set_resources_id_env_var():
        exit_code = launch.horovod.main(horovod_args, entrypoint, True)
        assert exit_code == 99

    mock_cluster_info.assert_called_once()
    assert os.environ["DET_CHIEF_IP"] == cluster.container_addrs[0]
    assert os.environ["USE_HOROVOD"] == "1"

    mock_popen.assert_has_calls([mock.call(expected_cmd)])

    daemon_path = f"/api/v1/allocations/{cluster.allocation_id}/resources/resourcesId/daemon"
    expected_post = mock.call(cluster.master_url, path=daemon_path, cert=certs.cli_cert)
    mock_api_post.assert_has_calls([expected_post])

    worker_proc.wait.assert_called_once()
def test_launch_worker(
    mock_api: mock.MagicMock,
    mock_cluster_info: mock.MagicMock,
    mock_subprocess: mock.MagicMock,
) -> None:
    """Check that deepspeed.main on a worker runs the pid server plus sshd command.

    The cluster info is built with two container addresses and positional
    arguments 1 and 4 (presumably container rank and slot count -- confirm
    against test_util.make_mock_cluster_info).

    Args:
        mock_api: Mock expected to be called exactly once.
        mock_cluster_info: Mock whose return value supplies the cluster info.
        mock_subprocess: Mock expected to receive the combined command.
    """
    worker_info = test_util.make_mock_cluster_info(["0.0.0.0", "0.0.0.1"], 1, 4)
    mock_cluster_info.return_value = worker_info

    with test_util.set_resources_id_env_var():
        launch.deepspeed.main(["script"])

    mock_cluster_info.assert_called_once()
    assert os.environ["DET_CHIEF_IP"] == worker_info.container_addrs[0]

    mock_api.assert_called_once()

    # The expected command is assembled after main() ran, from the same helpers.
    slot_count = len(worker_info.slot_ids)
    pid_server_cmd = launch.deepspeed.create_pid_server_cmd(
        worker_info.allocation_id, slot_count
    )
    sshd_cmd = launch.deepspeed.create_sshd_cmd()

    mock_subprocess.assert_called_once_with(pid_server_cmd + sshd_cmd)
def test_launch_one_slot(
    mock_cluster_info: mock.MagicMock,
    mock_subprocess: mock.MagicMock,
) -> None:
    """Check the full command deepspeed.main builds for a single-address cluster.

    The expected command is the concatenation of the pid-server, deepspeed
    run (targeting "localhost"), pid-client and log-redirect commands,
    followed by the user script.

    Args:
        mock_cluster_info: Mock whose return value supplies the cluster info.
        mock_subprocess: Mock expected to be called once with that command.
    """
    info = test_util.make_mock_cluster_info(["0.0.0.0"], 0, 4)
    mock_cluster_info.return_value = info
    entrypoint = ["s1", "s2"]

    pid_server = launch.deepspeed.create_pid_server_cmd(
        info.allocation_id, len(info.slot_ids)
    )
    run_cmd = launch.deepspeed.create_run_command(
        "localhost", launch.deepspeed.hostfile_path
    )
    pid_client = launch.deepspeed.create_pid_client_cmd(info.allocation_id)
    log_redirect = launch.deepspeed.create_log_redirect_cmd()
    expected_cmd = (
        pid_server
        + run_cmd
        + pid_client
        + log_redirect
        + entrypoint
    )

    with test_util.set_resources_id_env_var():
        launch.deepspeed.main(entrypoint)

    mock_cluster_info.assert_called_once()
    assert os.environ["DET_CHIEF_IP"] == info.container_addrs[0]
    assert os.environ["USE_DEEPSPEED"] == "1"

    mock_subprocess.assert_called_once_with(expected_cmd)
Ejemplo n.º 5
0
def test_launch_distributed(
    mock_cluster_info: mock.MagicMock,
    mock_subprocess: mock.MagicMock,
) -> None:
    """Check the command and exit code of torch_distributed.main with two addresses.

    The fake process returned by ``mock_subprocess`` reports 99 from ``wait``
    and main() is expected to return that same value.

    Args:
        mock_cluster_info: Mock whose return value supplies the cluster info.
        mock_subprocess: Mock expected to be called once with the full
            launch command.
    """
    info = test_util.make_mock_cluster_info(["0.0.0.0", "0.0.0.1"], 0, 2)
    mock_cluster_info.return_value = info
    entrypoint = ["python3", "-m", "determined.exec.harness", "my_module:MyTrial"]
    torch_args = ["--max_restarts", "1"]

    expected_exit_code = 99
    fake_proc = mock.MagicMock()
    fake_proc.wait.return_value = expected_exit_code
    mock_subprocess.return_value = fake_proc

    with test_util.set_resources_id_env_var():
        exit_code = launch.torch_distributed.main(torch_args, entrypoint)
        assert exit_code == expected_exit_code

    # Grow the pid-server command in place, piece by piece, in launch order.
    slot_count = len(info.slot_ids)
    expected_cmd = launch.torch_distributed.create_pid_server_cmd(
        info.allocation_id, slot_count
    )
    expected_cmd.extend(
        launch.torch_distributed.create_launch_cmd(
            len(info.container_addrs),
            len(info.slot_ids),
            info.container_rank,
            info.container_addrs[0],
            torch_args,
        )
    )
    expected_cmd.extend(launch.torch_distributed.create_pid_client_cmd(info.allocation_id))
    expected_cmd.extend(launch.torch_distributed.create_log_redirect_cmd())
    expected_cmd.extend(entrypoint)

    mock_subprocess.assert_called_once_with(expected_cmd)

    assert os.environ["USE_TORCH_DISTRIBUTED"] == "True"
    assert os.environ["DET_CHIEF_IP"] == info.container_addrs[0]
def test_launch_multi_slot_chief(
    mock_time: mock.MagicMock,
    mock_check_sshd: mock.MagicMock,
    mock_cluster_info: mock.MagicMock,
    mock_subprocess: mock.MagicMock,
) -> None:
    """Check deepspeed.main with two container addresses and a rank argument of 0.

    Verifies the environment variables set by main(), that the sshd command
    and the full launch command are both started, that sshd is checked once
    per container address, and that the sshd process is killed and waited on.

    The deepspeed environment file that main() writes into the current
    working directory is removed in a ``finally`` clause so that a failing
    assertion (or an exception from main()) does not leave it behind.

    Args:
        mock_time: Mock whose return value is the frozen start time.
        mock_check_sshd: Mock expected to be called once per address.
        mock_cluster_info: Mock whose return value supplies the cluster info.
        mock_subprocess: Mock dispatching to per-command process mocks.
    """
    cluster_info = test_util.make_mock_cluster_info(["0.0.0.0", "0.0.0.1"], 0,
                                                    4)
    mock_cluster_info.return_value = cluster_info
    mock_start_time = time.time()
    mock_time.return_value = mock_start_time
    script = ["s1", "s2"]
    sshd_cmd = launch.deepspeed.create_sshd_cmd()
    pid_server_cmd = launch.deepspeed.create_pid_server_cmd(
        cluster_info.allocation_id, len(cluster_info.slot_ids))
    deepspeed_cmd = launch.deepspeed.create_run_command(
        cluster_info.container_addrs[0], launch.deepspeed.hostfile_path)
    pid_client_cmd = launch.deepspeed.create_pid_client_cmd(
        cluster_info.allocation_id)
    log_redirect_cmd = launch.deepspeed.create_log_redirect_cmd()

    launch_cmd = pid_server_cmd + deepspeed_cmd + pid_client_cmd + log_redirect_cmd + script

    sshd_proc_mock = mock.MagicMock()
    launch_proc_mock = mock.MagicMock()

    def mock_process(cmd: List[str], *args: Any, **kwargs: Any) -> Any:
        # Route each spawned command to its own mock so the sshd and launch
        # processes can be asserted on separately.
        if cmd == sshd_cmd:
            return sshd_proc_mock(*args, **kwargs)
        if cmd == launch_cmd:
            return launch_proc_mock(*args, **kwargs)
        return None

    mock_subprocess.side_effect = mock_process

    try:
        with test_util.set_resources_id_env_var():
            launch.deepspeed.main(script)

        mock_cluster_info.assert_called_once()
        assert os.environ["DET_CHIEF_IP"] == cluster_info.container_addrs[0]
        assert os.environ["USE_DEEPSPEED"] == "1"
        assert os.environ["PDSH_SSH_ARGS"] == (
            "-o PasswordAuthentication=no -o StrictHostKeyChecking=no "
            f"-p {constants.DTRAIN_SSH_PORT} -2 -a -x %h")

        mock_subprocess.assert_has_calls(
            [mock.call(sshd_cmd), mock.call(launch_cmd)])

        # Each address is expected to be checked with the frozen time plus 20
        # (presumably a deadline in seconds -- confirm against check_sshd).
        assert mock_check_sshd.call_count == len(cluster_info.container_addrs)
        mock_check_sshd.assert_has_calls([
            mock.call(addr, mock_start_time + 20, constants.DTRAIN_SSH_PORT)
            for addr in cluster_info.container_addrs
        ])

        # Calling a MagicMock again yields the same return_value object, i.e.
        # the process object that main() received.
        launch_proc_mock().wait.assert_called_once()

        sshd_proc_mock().kill.assert_called_once()
        sshd_proc_mock().wait.assert_called_once()
    finally:
        # Cleanup deepspeed environment file created in launch.deepspeed.main,
        # even when main() raised or an assertion above failed.
        deepspeed_env_path = os.path.join(os.getcwd(), DEEPSPEED_ENVIRONMENT_NAME)
        if os.path.isfile(deepspeed_env_path):
            os.remove(deepspeed_env_path)
def test_horovod_chief(
    mock_time: mock.MagicMock,
    mock_check_sshd: mock.MagicMock,
    mock_cluster_info: mock.MagicMock,
    mock_popen: mock.MagicMock,
    nslots: int,
    nnodes: int,
    autohorovod: bool,
) -> None:
    """Check horovod.main for varying slot counts, node counts and autohorovod.

    With ``autohorovod`` set and exactly one node and one slot, main() is
    expected to run the script directly and never check sshd. Otherwise it is
    expected to run the full pid-server + horovodrun + worker-wrapper command
    and to check sshd on every address except the first.

    Args:
        mock_time: Mock whose return value is the frozen start time.
        mock_check_sshd: Mock recording the sshd checks.
        mock_cluster_info: Mock whose return value supplies the cluster info.
        mock_popen: Mock returning the fake process whose ``wait`` yields 99.
        nslots: Slot count passed to the mock cluster info.
        nnodes: Number of container addresses to generate.
        autohorovod: Third argument forwarded to ``launch.horovod.main``.
    """
    # One distinct address per node. The f-prefix matters: without it every
    # node gets the literal string "0.0.0.{i}" and the per-address sshd
    # assertions below cannot tell the workers apart.
    info = test_util.make_mock_cluster_info(
        [f"0.0.0.{i}" for i in range(nnodes)], 0, num_slots=nslots)
    experiment_config = info.trial._config
    mock_cluster_info.return_value = info
    mock_start_time = time.time()
    mock_time.return_value = mock_start_time
    hvd_args = ["ds1", "ds2"]
    script = ["s1", "s2"]

    pid_server_cmd = launch.horovod.create_hvd_pid_server_cmd(
        info.allocation_id, len(info.slot_ids))

    hvd_cmd = horovod.create_run_command(
        num_proc_per_machine=len(info.slot_ids),
        ip_addresses=info.container_addrs,
        inter_node_network_interface=info.trial._inter_node_network_interface,
        optimizations=experiment_config["optimizations"],
        debug=False,
        optional_args=hvd_args,
    )

    worker_wrapper_cmd = launch.horovod.create_worker_wrapper_cmd(
        info.allocation_id)

    launch_cmd = pid_server_cmd + hvd_cmd + worker_wrapper_cmd + script

    # The fake process reports exit code 99, which main() must hand back.
    mock_proc = mock.MagicMock()
    mock_proc.wait.return_value = 99

    mock_popen.return_value = mock_proc

    with test_util.set_resources_id_env_var():
        assert launch.horovod.main(hvd_args, script, autohorovod) == 99

    if autohorovod and nnodes == 1 and nslots == 1:
        # Single-slot --autohorovod: we should have just called the script directly.
        mock_popen.assert_has_calls([mock.call(script)])
        mock_check_sshd.assert_not_called()
    else:
        # Multi-slot or non --autohorovod: expect a full horovodrun command.
        mock_cluster_info.assert_called_once()
        assert os.environ["DET_CHIEF_IP"] == info.container_addrs[0]
        assert os.environ["USE_HOROVOD"] == "1"

        mock_popen.assert_has_calls([mock.call(launch_cmd)])

        # Only the addresses after the first are expected to be checked, each
        # with the frozen time plus 20 as the second argument.
        assert mock_check_sshd.call_count == len(info.container_addrs[1:])
        mock_check_sshd.assert_has_calls([
            mock.call(addr, mock_start_time + 20, constants.DTRAIN_SSH_PORT)
            for addr in info.container_addrs[1:]
        ])

        mock_proc.wait.assert_called_once()