Example #1
def test_failed_job_status(ray_start_with_dashboard, disable_aiohttp_cache,
                           enable_test_module):
    address = ray_start_with_dashboard.address_info["webui_url"]
    assert wait_until_server_available(address)
    address = format_web_url(address)

    job_sleep_time_s = 5
    entrypoint_cmd = ('python -c"'
                      "import ray;"
                      "ray.init();"
                      "import time;"
                      f"time.sleep({job_sleep_time_s});"
                      "import sys;"
                      "sys.exit(1);"
                      '"')
    start_time_s = int(time.time())
    client = JobSubmissionClient(address)
    runtime_env = {"env_vars": {"RAY_TEST_456": "456"}}
    metadata = {"ray_test_789": "789"}
    job_id = client.submit_job(entrypoint=entrypoint_cmd,
                               metadata=metadata,
                               runtime_env=runtime_env)

    def wait_for_job_to_fail():
        data = _get_snapshot(address)

        legacy_job_failed = False
        job_failed = False

        # Test legacy job snapshot (one driver per job).
        for job_entry in data["data"]["snapshot"]["jobs"].values():
            if job_entry["status"] is not None:
                assert job_entry["config"]["metadata"][
                    "jobSubmissionId"] == job_id
                assert job_entry["status"] in {"PENDING", "RUNNING", "FAILED"}
                assert job_entry["statusMessage"] is not None
                legacy_job_failed = job_entry["status"] == "FAILED"

        # Test new jobs snapshot (0 to N drivers per job).
        for job_submission_id, entry in data["data"]["snapshot"][
                "jobSubmission"].items():
            if entry["status"] is not None:
                assert entry["status"] in {"PENDING", "RUNNING", "FAILED"}
                assert entry["message"] is not None
                # TODO(architkulkarni): Disable automatic camelcase.
                assert entry["runtimeEnv"] == {
                    "envVars": {
                        "RAYTest456": "456"
                    }
                }
                assert entry["metadata"] == {"rayTest789": "789"}
                assert entry["errorType"] is None
                assert abs(entry["startTime"] - start_time_s) <= 2
                if entry["status"] == "FAILED":
                    job_failed = True
                    assert entry[
                        "endTime"] >= entry["startTime"] + job_sleep_time_s
        return legacy_job_failed and job_failed

    wait_for_condition(wait_for_job_to_fail, timeout=10)
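The `_get_snapshot` helper used above is not part of this excerpt. A minimal sketch of what such a helper could look like, assuming the dashboard exposes its snapshot at `/api/snapshot` and that the `requests` library is available (both are assumptions, not confirmed by the excerpt):

import requests


def _get_snapshot(address: str) -> dict:
    # Hypothetical helper: fetch the dashboard snapshot that the assertions
    # above index into (data["data"]["snapshot"]["jobs"], etc.).
    response = requests.get(f"{address}/api/snapshot")
    response.raise_for_status()
    return response.json()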
Example #2
def test_ray_tune_basic(job_sdk_client: JobSubmissionClient):
    run_cmd = "python ray_tune_basic.py"
    job_id = job_sdk_client.submit_job(
        entrypoint=run_cmd,
        runtime_env={"working_dir": DRIVER_SCRIPT_DIR},
    )
    wait_for_condition(_check_job_succeeded,
                       timeout=30,
                       client=job_sdk_client,
                       job_id=job_id)
Example #3
def test_per_task_runtime_env(job_sdk_client: JobSubmissionClient):
    run_cmd = "python per_task_runtime_env.py"
    job_id = job_sdk_client.submit_job(
        entrypoint=run_cmd,
        runtime_env={"working_dir": DRIVER_SCRIPT_DIR},
    )

    wait_for_condition(_check_job_succeeded,
                       client=job_sdk_client,
                       job_id=job_id)
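Examples #2 and #3 call a `_check_job_succeeded` helper that is also not shown here. A plausible sketch, assuming the public `ray.job_submission` import path and that the helper simply polls the job's status (the real helper in the test suite may differ):

from ray.job_submission import JobStatus, JobSubmissionClient


def _check_job_succeeded(client: JobSubmissionClient, job_id: str) -> bool:
    # Hypothetical helper: report success once the job finishes, and fail fast
    # (attaching the job logs) if it has already failed or been stopped.
    status = client.get_job_status(job_id)
    if status in {JobStatus.FAILED, JobStatus.STOPPED}:
        logs = client.get_job_logs(job_id)
        raise RuntimeError(f"Job {job_id} ended with status {status}:\n{logs}")
    return status == JobStatus.SUCCEEDED

With a helper shaped like this, `wait_for_condition` keeps invoking it with the `client=` and `job_id=` keyword arguments passed through until it returns True or the timeout expires.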
Example #4
def test_cli_apis_sanity_check(ray_start_cluster):
    """Test all of CLI APIs work as expected."""
    cluster = ray_start_cluster
    cluster.add_node(num_cpus=2)
    ray.init(address=cluster.address)
    for _ in range(3):
        cluster.add_node(num_cpus=2)
    runner = CliRunner()

    client = JobSubmissionClient(
        f"http://{ray._private.worker.global_worker.node.address_info['webui_url']}"
    )

    @ray.remote
    def f():
        import time

        time.sleep(30)

    @ray.remote
    class Actor:
        pass

    obj = ray.put(3)  # noqa
    task = f.remote()  # noqa
    actor = Actor.remote()  # noqa
    actor_runtime_env = Actor.options(  # noqa
        runtime_env={
            "pip": ["requests"]
        }).remote()
    job_id = client.submit_job(  # noqa
        # Entrypoint shell command to execute
        entrypoint="ls", )
    pg = ray.util.placement_group(bundles=[{"CPU": 1}])  # noqa

    def verify_output(resource_name, necessary_substrings: List[str]):
        result = runner.invoke(cli_list, [resource_name])
        exit_code_correct = result.exit_code == 0
        substring_matched = all(substr in result.output
                                for substr in necessary_substrings)
        print(result.output)
        return exit_code_correct and substring_matched

    wait_for_condition(lambda: verify_output("actors", ["actor_id"]))
    wait_for_condition(lambda: verify_output("workers", ["worker_id"]))
    wait_for_condition(lambda: verify_output("nodes", ["node_id"]))
    wait_for_condition(
        lambda: verify_output("placement-groups", ["placement_group_id"]))
    wait_for_condition(lambda: verify_output("jobs", ["raysubmit"]))
    wait_for_condition(lambda: verify_output("tasks", ["task_id"]))
    wait_for_condition(lambda: verify_output("objects", ["object_id"]))
    wait_for_condition(lambda: verify_output("runtime-envs", ["runtime_env"]))
Example #5
def test_list_jobs(shutdown_only):
    ray.init()
    client = JobSubmissionClient(
        f"http://{ray.worker.global_worker.node.address_info['webui_url']}")
    job_id = client.submit_job(  # noqa
        # Entrypoint shell command to execute
        entrypoint="ls", )

    def verify():
        job_data = list(list_jobs().values())[0]
        job_id_from_api = list(list_jobs().keys())[0]
        correct_state = job_data["status"] == "SUCCEEDED"
        correct_id = job_id == job_id_from_api
        return correct_state and correct_id

    wait_for_condition(verify)
    print(list_jobs())
Example #6
def ray_job_submit(
    script_name: str,
    head_service: str,
    k8s_namespace: str = "default",
    ray_dashboard_port: int = 8265,
) -> str:
    """Submits a Python script via the Ray Job Submission API, using the Python SDK.
    Waits for successful completion of the job and returns the job logs as a string.

    Uses `kubectl port-forward` to access the Ray head's dashboard port.

    Scripts live in `tests/kuberay/scripts`. This directory is used as the working
    dir for the job.

    Args:
        script_name: The name of the script to submit.
        head_service: The name of the Ray head K8s service.
        k8s_namespace: K8s namespace the Ray cluster belongs to.
        ray_dashboard_port: The port on which the Ray head is running the Ray dashboard.
    """
    with _kubectl_port_forward(service=head_service,
                               namespace=k8s_namespace,
                               target_port=ray_dashboard_port) as local_port:
        # It takes a bit of time to establish the connection.
        # Try a few times to instantiate the JobSubmissionClient, as the client's
        # instantiation does not retry on connection errors.
        for attempt in range(1, 7):
            time.sleep(5)
            try:
                client = JobSubmissionClient(f"http://127.0.0.1:{local_port}")
                # Stop retrying once the client connects successfully.
                break
            except ConnectionError as e:
                if attempt < 6:
                    logger.info(
                        "Job client connection failed. Retrying in 5 seconds.")
                else:
                    raise e from None
        job_id = client.submit_job(
            entrypoint=f"python {script_name}",
            runtime_env={
                "working_dir": SCRIPTS_DIR,
                # Throw in some extra data for fun, to validate runtime envs.
                "pip": ["pytest==6.0.0"],
                "env_vars": {
                    "key_foo": "value_bar"
                },
            },
        )
        # Wait for the job to complete successfully.
        # This logic is copied from the Job Submission docs.
        start = time.time()
        timeout = 60
        while time.time() - start <= timeout:
            status = client.get_job_status(job_id)
            print(f"status: {status}")
            if status in {
                    JobStatus.SUCCEEDED, JobStatus.STOPPED, JobStatus.FAILED
            }:
                break
            time.sleep(5)

        assert status == JobStatus.SUCCEEDED
        return client.get_job_logs(job_id)
Example #7
async def test_state_data_source_client(ray_start_cluster):
    cluster = ray_start_cluster
    # head
    cluster.add_node(num_cpus=2)
    ray.init(address=cluster.address)
    # worker
    worker = cluster.add_node(num_cpus=2)

    GRPC_CHANNEL_OPTIONS = (
        ("grpc.enable_http_proxy", 0),
        ("grpc.max_send_message_length",
         ray_constants.GRPC_CPP_MAX_MESSAGE_SIZE),
        ("grpc.max_receive_message_length",
         ray_constants.GRPC_CPP_MAX_MESSAGE_SIZE),
    )
    gcs_channel = ray._private.utils.init_grpc_channel(cluster.address,
                                                       GRPC_CHANNEL_OPTIONS,
                                                       asynchronous=True)
    client = StateDataSourceClient(gcs_channel)
    """
    Test actor
    """
    result = await client.get_all_actor_info()
    assert isinstance(result, GetAllActorInfoReply)
    """
    Test placement group
    """
    result = await client.get_all_placement_group_info()
    assert isinstance(result, GetAllPlacementGroupReply)
    """
    Test node
    """
    result = await client.get_all_node_info()
    assert isinstance(result, GetAllNodeInfoReply)
    """
    Test worker info
    """
    result = await client.get_all_worker_info()
    assert isinstance(result, GetAllWorkerInfoReply)
    """
    Test job
    """
    job_client = JobSubmissionClient(
        f"http://{ray.worker.global_worker.node.address_info['webui_url']}")
    job_id = job_client.submit_job(  # noqa
        # Entrypoint shell command to execute
        entrypoint="ls", )
    result = client.get_job_info()
    assert list(result.keys())[0] == job_id
    assert isinstance(result, dict)
    """
    Test tasks
    """
    with pytest.raises(ValueError):
        # Since we didn't register this node id, it should raise an exception.
        result = await client.get_task_info("1234")

    wait_for_condition(lambda: len(ray.nodes()) == 2)
    for node in ray.nodes():
        node_id = node["NodeID"]
        ip = node["NodeManagerAddress"]
        port = int(node["NodeManagerPort"])
        client.register_raylet_client(node_id, ip, port)
        result = await client.get_task_info(node_id)
        assert isinstance(result, GetTasksInfoReply)

    assert len(client.get_all_registered_raylet_ids()) == 2
    """
    Test objects
    """
    with pytest.raises(ValueError):
        # Since we didn't register this node id, it should raise an exception.
        result = await client.get_object_info("1234")

    wait_for_condition(lambda: len(ray.nodes()) == 2)
    for node in ray.nodes():
        node_id = node["NodeID"]
        ip = node["NodeManagerAddress"]
        port = int(node["NodeManagerPort"])
        client.register_raylet_client(node_id, ip, port)
        result = await client.get_object_info(node_id)
        assert isinstance(result, GetNodeStatsReply)
    """
    Test the exception is raised when the RPC error occurs.
    """
    cluster.remove_node(worker)
    # Wait until the dead node information is propagated.
    wait_for_condition(lambda: len(
        list(filter(lambda node: node["Alive"], ray.nodes()))) == 1)
    for node in ray.nodes():
        node_id = node["NodeID"]
        if node["Alive"]:
            continue

        # Querying the dead node raises a gRPC error, which should be
        # translated into `StateSourceNetworkException`.
        with pytest.raises(StateSourceNetworkException):
            result = await client.get_object_info(node_id)

        # Make sure unregister API works as expected.
        client.unregister_raylet_client(node_id)
        assert len(client.get_all_registered_raylet_ids()) == 1
        # Since the node_id is unregistered, the API should raise ValueError.
        with pytest.raises(ValueError):
            result = await client.get_object_info(node_id)
Example #8
async def test_state_data_source_client(ray_start_cluster):
    cluster = ray_start_cluster
    # head
    cluster.add_node(num_cpus=2)
    ray.init(address=cluster.address)
    # worker
    worker = cluster.add_node(num_cpus=2)

    GRPC_CHANNEL_OPTIONS = (
        *ray_constants.GLOBAL_GRPC_OPTIONS,
        ("grpc.max_send_message_length",
         ray_constants.GRPC_CPP_MAX_MESSAGE_SIZE),
        ("grpc.max_receive_message_length",
         ray_constants.GRPC_CPP_MAX_MESSAGE_SIZE),
    )
    gcs_channel = ray._private.utils.init_grpc_channel(cluster.address,
                                                       GRPC_CHANNEL_OPTIONS,
                                                       asynchronous=True)
    client = StateDataSourceClient(gcs_channel)
    """
    Test actor
    """
    result = await client.get_all_actor_info()
    assert isinstance(result, GetAllActorInfoReply)
    """
    Test placement group
    """
    result = await client.get_all_placement_group_info()
    assert isinstance(result, GetAllPlacementGroupReply)
    """
    Test node
    """
    result = await client.get_all_node_info()
    assert isinstance(result, GetAllNodeInfoReply)
    """
    Test worker info
    """
    result = await client.get_all_worker_info()
    assert isinstance(result, GetAllWorkerInfoReply)
    """
    Test job
    """
    job_client = JobSubmissionClient(
        f"http://{ray._private.worker.global_worker.node.address_info['webui_url']}"
    )
    job_id = job_client.submit_job(  # noqa
        # Entrypoint shell command to execute
        entrypoint="ls", )
    result = client.get_job_info()
    assert list(result.keys())[0] == job_id
    assert isinstance(result, dict)
    """
    Test tasks
    """
    with pytest.raises(ValueError):
        # Since we didn't register this node id, it should raise an exception.
        result = await client.get_task_info("1234")

    wait_for_condition(lambda: len(ray.nodes()) == 2)
    for node in ray.nodes():
        node_id = node["NodeID"]
        ip = node["NodeManagerAddress"]
        port = int(node["NodeManagerPort"])
        client.register_raylet_client(node_id, ip, port)
        result = await client.get_task_info(node_id)
        assert isinstance(result, GetTasksInfoReply)

    assert len(client.get_all_registered_raylet_ids()) == 2
    """
    Test objects
    """
    with pytest.raises(ValueError):
        # Since we didn't register this node id, it should raise an exception.
        result = await client.get_object_info("1234")

    wait_for_condition(lambda: len(ray.nodes()) == 2)
    for node in ray.nodes():
        node_id = node["NodeID"]
        ip = node["NodeManagerAddress"]
        port = int(node["NodeManagerPort"])
        client.register_raylet_client(node_id, ip, port)
        result = await client.get_object_info(node_id)
        assert isinstance(result, GetNodeStatsReply)
    """
    Test runtime env
    """
    with pytest.raises(ValueError):
        # Since we didn't register this node id, it should raise an exception.
        result = await client.get_runtime_envs_info("1234")
    wait_for_condition(lambda: len(ray.nodes()) == 2)
    for node in ray.nodes():
        node_id = node["NodeID"]
        key = f"{dashboard_consts.DASHBOARD_AGENT_PORT_PREFIX}{node_id}"

        def get_port():
            return ray.experimental.internal_kv._internal_kv_get(
                key, namespace=ray_constants.KV_NAMESPACE_DASHBOARD)

        wait_for_condition(lambda: get_port() is not None)
        # The second index is the gRPC port
        port = json.loads(get_port())[1]
        ip = node["NodeManagerAddress"]
        client.register_agent_client(node_id, ip, port)
        result = await client.get_runtime_envs_info(node_id)
        assert isinstance(result, GetRuntimeEnvsInfoReply)
    """
    Test logs
    """
    with pytest.raises(ValueError):
        result = await client.list_logs("1234", "*")
    with pytest.raises(ValueError):
        result = await client.stream_log("1234", "raylet.out", True, 100, 1, 5)

    wait_for_condition(lambda: len(ray.nodes()) == 2)
    # The node information should've been registered in the previous section.
    for node in ray.nodes():
        node_id = node["NodeID"]
        result = await client.list_logs(node_id, timeout=30, glob_filter="*")
        assert isinstance(result, ListLogsReply)

        stream = await client.stream_log(node_id, "raylet.out", False, 10, 1,
                                         5)
        async for logs in stream:
            log_lines = len(logs.data.decode().split("\n"))
            assert isinstance(logs, StreamLogReply)
            assert log_lines >= 10
            assert log_lines <= 11
    """
    Test the exception is raised when the RPC error occurs.
    """
    cluster.remove_node(worker)
    # Wait until the dead node information is propagated.
    wait_for_condition(lambda: len(
        list(filter(lambda node: node["Alive"], ray.nodes()))) == 1)
    for node in ray.nodes():
        node_id = node["NodeID"]
        if node["Alive"]:
            continue

        # Querying the dead node raises a gRPC error, which should surface
        # as a `DataSourceUnavailable` exception.
        with pytest.raises(DataSourceUnavailable):
            await client.get_object_info(node_id)

        # Make sure unregister API works as expected.
        client.unregister_raylet_client(node_id)
        assert len(client.get_all_registered_raylet_ids()) == 1
        # Since the node_id is unregistered, the API should raise ValueError.
        with pytest.raises(ValueError):
            result = await client.get_object_info(node_id)
Example #9
def test_successful_job_status(ray_start_with_dashboard, disable_aiohttp_cache,
                               enable_test_module, address_suffix):
    address = ray_start_with_dashboard.address_info["webui_url"]
    assert wait_until_server_available(address)
    address = format_web_url(address)

    job_sleep_time_s = 5
    entrypoint = ('python -c"'
                  "import ray;"
                  "ray.init();"
                  "import time;"
                  f"time.sleep({job_sleep_time_s});"
                  '"')

    client = JobSubmissionClient(address + address_suffix)
    start_time_s = int(time.time())
    runtime_env = {"env_vars": {"RAY_TEST_123": "123"}}
    metadata = {"ray_test_456": "456"}
    job_id = client.submit_job(entrypoint=entrypoint,
                               metadata=metadata,
                               runtime_env=runtime_env)

    def wait_for_job_to_succeed():
        data = _get_snapshot(address)
        legacy_job_succeeded = False
        job_succeeded = False

        # Test legacy job snapshot (one driver per job).
        for job_entry in data["data"]["snapshot"]["jobs"].values():
            if job_entry["status"] is not None:
                assert job_entry["config"]["metadata"][
                    "jobSubmissionId"] == job_id
                assert job_entry["status"] in {
                    "PENDING", "RUNNING", "SUCCEEDED"
                }
                assert job_entry["statusMessage"] is not None
                legacy_job_succeeded = job_entry["status"] == "SUCCEEDED"

        # Test new jobs snapshot (0 to N drivers per job).
        assert data["data"]["snapshot"]["jobSubmission"]
        for job_submission_id, entry in data["data"]["snapshot"][
                "jobSubmission"].items():
            if entry["status"] is not None:
                assert entry["jobSubmissionId"] == job_id
                assert entry["entrypoint"] == entrypoint
                assert entry["status"] in {"PENDING", "RUNNING", "SUCCEEDED"}
                assert entry["message"] is not None
                # TODO(architkulkarni): Disable automatic camelcase.
                assert entry["runtimeEnv"] == {
                    "envVars": {
                        "RAYTest123": "123"
                    }
                }
                assert entry["metadata"] == {"rayTest456": "456"}
                assert entry["errorType"] is None
                assert abs(entry["startTime"] - start_time_s * 1000) <= 2000
                if entry["status"] == "SUCCEEDED":
                    job_succeeded = True
                    assert (entry["endTime"] >=
                            entry["startTime"] + job_sleep_time_s * 1000)

        print(f"Legacy job submission succeeded: {legacy_job_succeeded}")
        print(f"Job submission succeeded: {job_succeeded}")
        return legacy_job_succeeded and job_succeeded

    wait_for_condition(wait_for_job_to_succeed, timeout=45)