Example #1
def test_drivers_named_actors(call_ray_start):
    # This test will create some drivers that submit some tasks to the same
    # named actor.
    address = call_ray_start

    ray.init(address=address, namespace="")

    # Define a driver that creates a named actor then sleeps for a while.
    driver_script1 = """
import ray
import time
ray.init(address="{}", namespace="")
@ray.remote
class Counter:
    def __init__(self):
        self.count = 0
    def increment(self):
        self.count += 1
        return self.count
counter = Counter.options(name="Counter").remote()
time.sleep(100)
""".format(address)

    # Define a driver that submits to the named actor and exits.
    driver_script2 = """
import ray
import time
ray.init(address="{}", namespace="")
while True:
    try:
        counter = ray.get_actor("Counter")
        break
    except ValueError:
        time.sleep(1)
assert ray.get(counter.increment.remote()) == {}
print("success")
""".format(address, "{}")

    process_handle = run_string_as_driver_nonblocking(driver_script1)

    for i in range(3):
        driver_script = driver_script2.format(i + 1)
        out = run_string_as_driver(driver_script)
        assert "success" in out

    process_handle.kill()
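Note: every example on this page relies on the run_string_as_driver and run_string_as_driver_nonblocking helpers (presumably from ray._private.test_utils), which are not shown here and which accept more arguments (e.g. encode, env) than sketched below. A minimal sketch, assuming they simply wrap subprocess (the real implementations may differ):

import subprocess
import sys

def run_string_as_driver(script, env=None):
    # Run the script in a fresh Python interpreter, feeding it over stdin,
    # and return its stdout. check=True raises subprocess.CalledProcessError
    # when the driver exits with a non-zero code, which is the behavior the
    # namespace example below depends on.
    proc = subprocess.run(
        [sys.executable, "-"],
        input=script,
        text=True,
        capture_output=True,
        env=env,
        check=True,
    )
    return proc.stdout

def run_string_as_driver_nonblocking(script, env=None):
    # Start the driver without waiting for it; callers read
    # process_handle.stdout and eventually kill() the process.
    return subprocess.Popen(
        [sys.executable, "-c", script],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        env=env,
    )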
Example #2
def test_two_node_local_file(two_node_cluster, working_dir, client_mode):
    with open(os.path.join(working_dir, "test_file"), "w") as f:
        f.write("1")
    cluster, _ = two_node_cluster
    address, env, runtime_env_dir = start_client_server(cluster, client_mode)
    # Test runtime_env with working_dir.
    runtime_env = f"""{{  "working_dir": "{working_dir}" }}"""
    # Execute the following cmd in driver with runtime_env
    execute_statement = """
vals = ray.get([check_file.remote('test_file')] * 1000)
print(sum([int(v) for v in vals]))
"""
    script = driver_script.format(**locals())
    out = run_string_as_driver(script, env)
    assert out.strip().split()[-1] == "1000"
    assert len(list(Path(runtime_env_dir).iterdir())) == 1
    assert len(kv._internal_kv_list("gcs://")) == 0
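Note: this example (and several later ones, e.g. Examples #10, #13, #14, and #15) fills in a shared driver_script template with format(**locals()); the template itself is defined elsewhere in the test module and does not appear on this page. A rough, simplified sketch of what such a template might look like, with check_file as an illustrative stand-in for the helpers (run_test, TestActor, etc.) that the real template provides:

# Hypothetical template; the real one also handles Ray Client connections,
# job_config, and sys.path setup for the shipped working_dir.
driver_script = """
import ray

ray.init(address="{address}", namespace="", runtime_env={runtime_env})

@ray.remote
def check_file(name):
    # Read a file shipped to the worker via the working_dir runtime env.
    try:
        with open(name) as f:
            return f.read()
    except Exception:
        return "FAILED"

{execute_statement}

ray.shutdown()
"""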
Example #3
def test_namespace(ray_start_cluster):
    """
    Most of the "checks" in this test case rely on the fact that
    `run_string_as_driver` will throw an exception if the driver string exits
    with a non-zero exit code (e.g. when the driver scripts throws an
    exception). Since all of these drivers start named, detached actors, the
    most likely failure case would be a collision of named actors if they're
    put in the same namespace.

    This test checks that:
    * When two drivers don't specify a namespace, they are placed in different
      anonymous namespaces.
    * When two drivers specify a namespace, they collide.
    * The namespace name (as provided by the runtime context) is correct.
    """
    cluster = ray_start_cluster
    cluster.add_node(num_cpus=4, ray_client_server_port=50055)
    cluster.wait_for_nodes(1)

    template = """
import ray
ray.client("localhost:50055").namespace({namespace}).connect()

@ray.remote
class Foo:
    def ping(self):
        return "pong"

a = Foo.options(lifetime="detached", name="abc").remote()
ray.get(a.ping.remote())
print("Current namespace:", ray.get_runtime_context().namespace)
    """

    anon_driver = template.format(namespace="None")
    run_string_as_driver(anon_driver)
    # This second run will fail if the actors don't run in separate anonymous
    # namespaces.
    run_string_as_driver(anon_driver)

    run_in_namespace = template.format(namespace="'namespace'")
    script_output = run_string_as_driver(run_in_namespace)
    # The second run fails because the actors are run in the same namespace.
    with pytest.raises(subprocess.CalledProcessError):
        run_string_as_driver(run_in_namespace)

    assert "Current namespace: namespace" in script_output
    subprocess.check_output("ray stop --force", shell=True)
Example #4
def test_run_driver_twice(ray_start_regular):
    # We used to have issues 2165 and 2288:
    # https://github.com/ray-project/ray/issues/2165
    # https://github.com/ray-project/ray/issues/2288
    # Both report that a driver would hang when run a second time. This test
    # verifies the fix for those issues by running the same driver twice and
    # checking that both runs succeed.
    address_info = ray_start_regular
    driver_script = """
import ray
import ray.tune as tune
import os
import time

def train_func(config, reporter):  # add a reporter arg
    for i in range(2):
        time.sleep(0.1)
        reporter(timesteps_total=i, mean_accuracy=i+97)  # report metrics

os.environ["TUNE_RESUME_PROMPT_OFF"] = "True"
ray.init(address="{}", namespace="default_test_namespace")
ray.tune.register_trainable("train_func", train_func)

tune.run_experiments({{
    "my_experiment": {{
        "run": "train_func",
        "stop": {{"mean_accuracy": 99}},
        "config": {{
            "layer1": {{
                "class_name": tune.grid_search(["a"]),
                "config": {{"lr": tune.grid_search([1, 2])}}
            }},
        }},
        "local_dir": os.path.expanduser("~/tmp")
    }}
}})
print("success")
""".format(
        address_info["address"]
    )

    for i in range(2):
        out = run_string_as_driver(driver_script)
        assert "success" in out
Example #5
def test_remote_function_isolation(call_ray_start):
    # This test runs remote functions with the same names in two different
    # drivers and checks that they are isolated. Connect a driver to the
    # Ray cluster.
    address = call_ray_start

    ray.init(address=address)

    # Start another driver and make sure that it can define and call its
    # own remote functions with the same names.
    driver_script = """
import ray
import time
ray.init(address="{}")
@ray.remote
def f():
    return 3
@ray.remote
def g(x, y):
    return 4
for _ in range(10000):
    result = ray.get([f.remote(), g.remote(0, 0)])
    assert result == [3, 4]
print("success")
""".format(
        address
    )

    out = run_string_as_driver(driver_script)

    @ray.remote
    def f():
        return 1

    @ray.remote
    def g(x):
        return 2

    for _ in range(10000):
        result = ray.get([f.remote(), g.remote(0)])
        assert result == [1, 2]

    # Make sure the other driver succeeded.
    assert "success" in out
Example #6
def test_user_setup_function():
    script = """
import ray
ray.init()
@ray.remote
def get_pkg_dir():
    return ray._private.runtime_env.VAR

print("remote", ray.get(get_pkg_dir.remote()))
print("local", ray._private.runtime_env.VAR)


"""

    env = {"RAY_USER_SETUP_FUNCTION": "ray._private.test_utils.set_setup_func"}
    out = run_string_as_driver(script, dict(os.environ, **env))
    (remote_out, local_out) = out.strip().splitlines()[-2:]
    assert remote_out == "remote hello world"
    assert local_out == "local hello world"
Example #7
def test_two_node_uri(two_node_cluster, working_dir, client_mode):
    cluster, _ = two_node_cluster
    (address, env, PKG_DIR) = start_client_server(cluster, client_mode)
    with tempfile.NamedTemporaryFile(suffix="zip") as tmp_file:
        pkg_name = working_dir_pkg.get_project_package_name(
            working_dir, [], [])
        pkg_uri = working_dir_pkg.Protocol.PIN_GCS.value + "://" + pkg_name
        working_dir_pkg.create_project_package(working_dir, [], [],
                                               tmp_file.name)
        working_dir_pkg.push_package(pkg_uri, tmp_file.name)
        runtime_env = f"""{{ "uris": ["{pkg_uri}"] }}"""
        # Execute the following cmd in driver with runtime_env
        execute_statement = "print(sum(ray.get([run_test.remote()] * 1000)))"
    script = driver_script.format(**locals())
    out = run_string_as_driver(script, env)
    assert out.strip().split()[-1] == "1000"
    assert len(list(Path(PKG_DIR).iterdir())) == 1
    # pinned uri will not be deleted
    print(list(kv._internal_kv_list("")))
    assert len(kv._internal_kv_list("pingcs://")) == 1
Example #8
def test_large_dir_upload_message(start_cluster, option):
    cluster, address = start_cluster
    with tempfile.TemporaryDirectory() as tmp_dir:
        filepath = os.path.join(tmp_dir, "test_file.txt")
        if option == "working_dir":
            driver_script = f"""
import ray
ray.init("{address}", runtime_env={{"working_dir": "{tmp_dir}"}})
"""
        else:
            driver_script = f"""
import ray
ray.init("{address}", runtime_env={{"py_modules": ["{tmp_dir}"]}})
"""

        with open(filepath, "w") as f:
            f.write("Hi")

        output = run_string_as_driver(driver_script)
        assert "Pushing file package" in output
        assert "Successfully pushed file package" in output
        assert "warning" not in output.lower()
Example #9
def test_node_name_in_raylet_death():
    NODE_NAME = "RAY_TEST_RAYLET_DEATH_NODE_NAME"
    script = f"""
import ray
import time
import os

NUM_HEARTBEATS=10
HEARTBEAT_PERIOD=500
WAIT_BUFFER_SECONDS=5

os.environ["RAY_num_heartbeats_timeout"]=str(NUM_HEARTBEATS)
os.environ["RAY_raylet_heartbeat_period_milliseconds"]=str(HEARTBEAT_PERIOD)

ray.init(_node_name=\"{NODE_NAME}\")
# This will kill raylet without letting it exit gracefully.
ray.worker._global_node.kill_raylet()
time.sleep(NUM_HEARTBEATS * HEARTBEAT_PERIOD / 1000 + WAIT_BUFFER_SECONDS)
ray.shutdown()
    """
    out = run_string_as_driver(script)
    assert out.count(f"node name: {NODE_NAME} has been marked dead") == 1
Example #10
def test_jobconfig_compatible_2(ray_start_cluster_head, working_dir):
    # start job_config=something
    # start job_config=None
    cluster = ray_start_cluster_head
    (address, env, PKG_DIR) = start_client_server(cluster, True)
    runtime_env = """{  "py_modules": [test_module.__path__[0]] }"""
    # Make the first driver hang.
    execute_statement = """
time.sleep(600)
"""
    script = driver_script.format(**locals())
    proc = run_string_as_driver_nonblocking(script, env)
    time.sleep(5)
    runtime_env = None
    # Execute the following in the second one which should
    # succeed
    execute_statement = "print('OK')"
    script = driver_script.format(**locals())
    out = run_string_as_driver(script, env)
    assert out.strip().split()[-1] == "OK", out
    proc.kill()
    proc.wait()
Example #11
def test_storage_isolation(external_redis, call_ray_start, call_ray_start_2):
    script = """
import ray
ray.init("{address}", namespace="a")
@ray.remote
class A:
    def ready(self):
        return {val}
    pass

a = A.options(lifetime="detached", name="A").remote()
assert ray.get(a.ready.remote()) == {val}
    """
    run_string_as_driver(script.format(address=call_ray_start, val=1))
    run_string_as_driver(script.format(address=call_ray_start_2, val=2))

    script = """
import ray
ray.init("{address}", namespace="a")
a = ray.get_actor(name="A")
assert ray.get(a.ready.remote()) == {val}
"""
    run_string_as_driver(script.format(address=call_ray_start, val=1))
    run_string_as_driver(script.format(address=call_ray_start_2, val=2))
Example #12
def test_ray_start_non_head(call_ray_stop_only, monkeypatch):

    # Test that we can call ray start to connect to an existing cluster.

    # Test starting Ray with a port specified.
    check_call_ray(
        ["start", "--head", "--port", "7298", "--resources", '{"res_0": 1}'])

    # Test starting node connecting to the above cluster.
    check_call_ray([
        "start", "--address", "127.0.0.1:7298", "--resources", '{"res_1": 1}'
    ])

    # Test starting Ray with address `auto`.
    check_call_ray(
        ["start", "--address", "auto", "--resources", '{"res_2": 1}'])

    # Run tasks to verify nodes with custom resources are available.
    driver_script = """
import ray
ray.init()

@ray.remote
def f():
    return 1

assert ray.get(f.remote()) == 1
assert ray.get(f.options(resources={"res_0": 1}).remote()) == 1
assert ray.get(f.options(resources={"res_1": 1}).remote()) == 1
assert ray.get(f.options(resources={"res_2": 1}).remote()) == 1
print("success")
"""
    monkeypatch.setenv("RAY_ADDRESS", "auto")
    out = run_string_as_driver(driver_script)
    # Make sure the driver succeeded.
    assert "success" in out

    check_call_ray(["stop"])
Example #13
def test_jobconfig_compatible_1(ray_start_cluster_head, working_dir):
    # start job_config=None
    # start job_config=something
    cluster = ray_start_cluster_head
    (address, env, PKG_DIR) = start_client_server(cluster, True)
    runtime_env = None
    # Make the first driver hang.
    execute_statement = """
time.sleep(600)
"""
    script = driver_script.format(**locals())
    # Have one running with job config = None
    proc = run_string_as_driver_nonblocking(script, env)
    # Wait for it to come up.
    time.sleep(5)
    runtime_env = f"""{{  "working_dir": "{working_dir}" }}"""
    # Execute the second one, which should succeed.
    execute_statement = "print(sum(ray.get([run_test.remote()] * 1000)))"
    script = driver_script.format(**locals())
    out = run_string_as_driver(script, env)
    assert out.strip().split()[-1] == "1000"
    proc.kill()
    proc.wait()
Example #14
def test_detached_actors(ray_start_cluster_head, working_dir, client_mode):
    cluster = ray_start_cluster_head
    address, env, runtime_env_dir = start_client_server(cluster, client_mode)
    runtime_env = f"""{{  "working_dir": "{working_dir}" }}"""
    # Execute the following cmd in driver with runtime_env
    execute_statement = """
test_actor = TestActor.options(name="test_actor", lifetime="detached").remote()
print(sum(ray.get([test_actor.one.remote()] * 1000)))
"""
    script = driver_script.format(**locals())
    out = run_string_as_driver(script, env)
    assert out.strip().split()[-1] == "1000"
    # It's a detached actor, so it should still be there.
    assert len(kv._internal_kv_list("gcs://")) == 1
    assert len(list(Path(runtime_env_dir).iterdir())) == 2
    pkg_dir = [f for f in Path(runtime_env_dir).glob("*") if f.is_dir()][0]
    sys.path.insert(0, str(pkg_dir))
    test_actor = ray.get_actor("test_actor")
    assert sum(ray.get([test_actor.one.remote()] * 1000)) == 1000
    ray.kill(test_actor)
    time.sleep(5)
    assert len(list(Path(runtime_env_dir).iterdir())) == 1
    assert len(kv._internal_kv_list("gcs://")) == 0
Example #15
def test_jobconfig_compatible_3(ray_start_cluster_head, working_dir):
    # start job_config=something
    # start job_config=something else
    cluster = ray_start_cluster_head
    (address, env, PKG_DIR) = start_client_server(cluster, True)
    runtime_env = """{  "py_modules": [test_module.__path__[0]] }"""
    # Make the first driver hang.
    execute_statement = """
time.sleep(600)
"""
    script = driver_script.format(**locals())
    proc = run_string_as_driver_nonblocking(script, env)
    time.sleep(5)
    runtime_env = f"""
{{  "working_dir": test_module.__path__[0] }}"""  # noqa: F541
    # Execute the following cmd in the second one and ensure that
    # it is able to run.
    execute_statement = "print('OK')"
    script = driver_script.format(**locals())
    out = run_string_as_driver(script, env)
    proc.kill()
    proc.wait()
    assert out.strip().split()[-1] == "OK"
Example #16
def test_run_on_all_workers(ray_start_regular, tmp_path):
    # This test ensures that run_function_on_all_workers is executed
    # on all workers.
    lock_file = tmp_path / "lock"
    data_file = tmp_path / "data"
    driver_script = f"""
import ray
from filelock import FileLock
from pathlib import Path
import pickle

lock_file = r"{str(lock_file)}"
data_file = Path(r"{str(data_file)}")

def init_func(worker_info):
    with FileLock(lock_file):
        if data_file.exists():
            old = pickle.loads(data_file.read_bytes())
        else:
            old = []
        old.append(worker_info['worker'].worker_id)
        data_file.write_bytes(pickle.dumps(old))

ray.worker.global_worker.run_function_on_all_workers(init_func)
ray.init(address='auto')

@ray.remote
def ready():
    with FileLock(lock_file):
        worker_ids = pickle.loads(data_file.read_bytes())
        assert ray.worker.global_worker.worker_id in worker_ids

ray.get(ready.remote())
"""
    run_string_as_driver(driver_script)
    run_string_as_driver(driver_script)
    run_string_as_driver(driver_script)
Example #17
def test_cleanup_on_driver_exit(call_ray_start):
    # This test will create a driver that creates a bunch of objects and then
    # exits. The entries in the object table should be cleaned up.
    address = call_ray_start

    ray.init(address=address)

    # Define a driver that creates a bunch of objects and exits.
    driver_script = """
import time
import ray
import numpy as np
from ray._private.test_utils import object_memory_usage
import os


ray.init(address="{}")
object_refs = [ray.put(np.zeros(200 * 1024, dtype=np.uint8))
              for i in range(1000)]
start_time = time.time()
while time.time() - start_time < 30:
    if object_memory_usage() > 0:
        break
else:
    raise Exception("Objects did not appear in object table.")

@ray.remote
def f():
    time.sleep(1)

print("success")
# Submit some tasks without waiting for them to finish. Their workers should
# still get cleaned up eventually, even if they get started after the driver
# exits.
[f.remote() for _ in range(10)]
""".format(
        address
    )

    out = run_string_as_driver(driver_script)
    assert "success" in out

    # Make sure the objects are removed from the object table.
    start_time = time.time()
    while time.time() - start_time < 30:
        if object_memory_usage() == 0:
            break
    else:
        raise Exception("Objects were not all removed from object table.")

    def all_workers_exited():
        result = True
        print("list of idle workers:")
        for proc in psutil.process_iter():
            if ray_constants.WORKER_PROCESS_TYPE_IDLE_WORKER in proc.name():
                print(f"{proc}")
                result = False
        return result

    # Check that workers are eventually cleaned up.
    wait_for_condition(all_workers_exited, timeout=15, retry_interval_ms=1000)
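Note: wait_for_condition, used here and in the placement-group cleanup examples that follow, is another test helper that is not shown. A minimal sketch of the assumed behavior (poll a predicate until it returns True or a timeout expires):

import time

def wait_for_condition(condition_predictor, timeout=10, retry_interval_ms=100):
    # Repeatedly evaluate the predicate; raise if the timeout expires before
    # it returns True.
    start = time.time()
    while time.time() - start <= timeout:
        if condition_predictor():
            return
        time.sleep(retry_interval_ms / 1000.0)
    raise RuntimeError("The condition wasn't met before the timeout expired.")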
Example #18
def test_automatic_cleanup_detached_actors(ray_start_cluster):
    # Make sure the placement groups created by
    # detached actors are cleaned up properly.
    cluster = ray_start_cluster
    num_nodes = 3
    num_cpu_per_node = 2
    # Create a 3-node cluster.
    for _ in range(num_nodes):
        cluster.add_node(num_cpus=num_cpu_per_node)
    cluster.wait_for_nodes()

    info = ray.init(address=cluster.address,
                    namespace="default_test_namespace")
    available_cpus = ray.available_resources()["CPU"]
    assert available_cpus == num_nodes * num_cpu_per_node

    driver_code = f"""
import ray

ray.init(address="{info["address"]}", namespace="default_test_namespace")

def create_pg():
    pg = ray.util.placement_group(
            [{{"CPU": 1}} for _ in range(3)],
            strategy="STRICT_SPREAD")
    ray.get(pg.ready())
    return pg

# TODO(sang): Placement groups created by tasks launched by detached actor
# is not cleaned with the current protocol.
# @ray.remote(num_cpus=0)
# def f():
#     create_pg()

@ray.remote(num_cpus=0, max_restarts=1)
class A:
    def create_pg(self):
        create_pg()
    def create_child_pg(self):
        self.a = A.options(name="B").remote()
        ray.get(self.a.create_pg.remote())
    def kill_child_actor(self):
        ray.kill(self.a)
        try:
            ray.get(self.a.create_pg.remote())
        except Exception:
            pass

a = A.options(lifetime="detached", name="A").remote()
ray.get(a.create_pg.remote())
# TODO(sang): Currently, child tasks are cleaned when a detached actor
# is dead. We cannot test this scenario until it is fixed.
# ray.get(a.create_child_pg.remote())

ray.shutdown()
    """

    run_string_as_driver(driver_code)

    # Wait until the driver is reported as dead by GCS.
    def is_job_done():
        jobs = ray.state.jobs()
        for job in jobs:
            if job["IsDead"]:
                return True
        return False

    def assert_num_cpus(expected_num_cpus):
        if expected_num_cpus == 0:
            return "CPU" not in ray.available_resources()
        return ray.available_resources()["CPU"] == expected_num_cpus

    wait_for_condition(is_job_done)
    wait_for_condition(lambda: assert_num_cpus(num_nodes))

    # Make sure when a child actor spawned by a detached actor
    # is killed, the placement group is removed.
    a = ray.get_actor("A")
    # TODO(sang): child of detached actors
    # seem to be killed when jobs are done. We should fix this before
    # testing this scenario.
    # ray.get(a.kill_child_actor.remote())
    # assert assert_num_cpus(num_nodes)

    # Make sure placement groups are cleaned when detached actors are killed.
    ray.kill(a, no_restart=False)
    wait_for_condition(lambda: assert_num_cpus(num_nodes * num_cpu_per_node))
    # The detached actor a should've been restarted.
    # Recreate a placement group.
    ray.get(a.create_pg.remote())
    wait_for_condition(lambda: assert_num_cpus(num_nodes))
    # Kill it again and make sure the placement group
    # that is created is deleted again.
    ray.kill(a, no_restart=False)
    wait_for_condition(lambda: assert_num_cpus(num_nodes * num_cpu_per_node))
Example #19
def test_automatic_cleanup_job(ray_start_cluster):
    # Make sure the placement groups created by a job's driver,
    # actors, and tasks are cleaned up when the job is done.
    cluster = ray_start_cluster
    num_nodes = 3
    num_cpu_per_node = 4
    # Create a 3-node cluster.
    for _ in range(num_nodes):
        cluster.add_node(num_cpus=num_cpu_per_node)
    cluster.wait_for_nodes()

    info = ray.init(address=cluster.address)
    available_cpus = ray.available_resources()["CPU"]
    assert available_cpus == num_nodes * num_cpu_per_node

    driver_code = f"""
import ray

ray.init(address="{info["address"]}")

def create_pg():
    pg = ray.util.placement_group(
            [{{"CPU": 1}} for _ in range(3)],
            strategy="STRICT_SPREAD")
    ray.get(pg.ready())
    return pg

@ray.remote(num_cpus=0)
def f():
    create_pg()

@ray.remote(num_cpus=0)
class A:
    def create_pg(self):
        create_pg()

ray.get(f.remote())
a = A.remote()
ray.get(a.create_pg.remote())
# Create 2 pgs to make sure multiple placement groups that belong
# to a single job will be properly cleaned.
create_pg()
create_pg()

ray.shutdown()
    """

    run_string_as_driver(driver_code)

    # Wait until the driver is reported as dead by GCS.
    def is_job_done():
        jobs = ray.state.jobs()
        for job in jobs:
            if job["IsDead"]:
                return True
        return False

    def assert_num_cpus(expected_num_cpus):
        if expected_num_cpus == 0:
            return "CPU" not in ray.available_resources()
        return ray.available_resources()["CPU"] == expected_num_cpus

    wait_for_condition(is_job_done)
    available_cpus = ray.available_resources()["CPU"]
    wait_for_condition(lambda: assert_num_cpus(num_nodes * num_cpu_per_node))
Example #20
def test_working_dir_scale_up_in_new_driver(ray_start, tmp_dir, use_ray_client):
    with open("hello", "w") as f:
        f.write("world")

    driver1 = """
import os

import ray
from ray import serve

job_config = ray.job_config.JobConfig(runtime_env={{"working_dir": "."}})
if {use_ray_client}:
    ray.util.connect("{client_addr}", namespace="serve", job_config=job_config)
else:
    ray.init(address="auto", namespace="serve", job_config=job_config)

serve.start(detached=True)

@serve.deployment(version="1")
class Test:
    def __call__(self, *args):
        return os.getpid(), open("hello").read()

Test.deploy()
handle = Test.get_handle()
assert ray.get(handle.remote())[1] == "world"
""".format(
        use_ray_client=use_ray_client, client_addr=ray_start
    )

    run_string_as_driver(driver1)

    with open("hello", "w") as f:
        f.write("no longer world")

    driver2 = """
import ray
from ray import serve

job_config = ray.job_config.JobConfig(runtime_env={{"working_dir": "."}})
if {use_ray_client}:
    ray.util.connect("{client_addr}", namespace="serve", job_config=job_config)
else:
    ray.init(address="auto", namespace="serve", job_config=job_config)

serve.start(detached=True)

Test = serve.get_deployment("Test")
Test.options(num_replicas=2).deploy()
handle = Test.get_handle()
results = ray.get([handle.remote() for _ in range(1000)])
print(set(results))
assert all(r[1] == "world" for r in results), (
    "results should still come from the first env")
assert len(set(r[0] for r in results)) == 2, (
    "make sure there are two replicas")
Test.delete()
""".format(
        use_ray_client=use_ray_client, client_addr=ray_start
    )

    run_string_as_driver(driver2)
Example #21
def test_log_redirect_to_stderr(shutdown_only, capfd):

    log_components = {
        ray_constants.PROCESS_TYPE_DASHBOARD: "Dashboard head grpc address",
        ray_constants.PROCESS_TYPE_DASHBOARD_AGENT: "Dashboard agent grpc address",
        ray_constants.PROCESS_TYPE_GCS_SERVER: "Loading job table data",
        # No log monitor output if all components are writing to stderr.
        ray_constants.PROCESS_TYPE_LOG_MONITOR: "",
        ray_constants.PROCESS_TYPE_MONITOR: "Starting monitor using ray installation",
        ray_constants.PROCESS_TYPE_PYTHON_CORE_WORKER: "worker server started",
        ray_constants.PROCESS_TYPE_PYTHON_CORE_WORKER_DRIVER: "driver server started",
        # TODO(Clark): Add coverage for Ray Client.
        # ray_constants.PROCESS_TYPE_RAY_CLIENT_SERVER: "Starting Ray Client server",
        ray_constants.PROCESS_TYPE_RAY_CLIENT_SERVER: "",
        ray_constants.PROCESS_TYPE_RAYLET: "Starting object store with directory",
        # No reaper process run (kernel fate-sharing).
        ray_constants.PROCESS_TYPE_REAPER: "",
        # No reporter process run.
        ray_constants.PROCESS_TYPE_REPORTER: "",
        # No web UI process run.
        ray_constants.PROCESS_TYPE_WEB_UI: "",
        # Unused.
        ray_constants.PROCESS_TYPE_WORKER: "",
    }

    script = """
import os
from pathlib import Path

import ray

os.environ["RAY_LOG_TO_STDERR"] = "1"
ray.init()

session_dir = ray.worker.global_worker.node.address_info["session_dir"]
session_path = Path(session_dir)
log_dir_path = session_path / "logs"

# Run the basic workload.
@ray.remote
def f():
    for i in range(10):
        print(f"test {{i}}")

ray.get(f.remote())

log_component_names = {}

# Confirm that no log files are created for any of the components.
paths = list(path.stem for path in log_dir_path.iterdir())
assert set(log_component_names).isdisjoint(set(paths)), paths
""".format(
        str(list(log_components.keys()))
    )
    stderr = run_string_as_driver(script)

    # Make sure that the expected startup log records for each of the
    # components appear in the stderr stream.
    # stderr = capfd.readouterr().err
    for component, canonical_record in log_components.items():
        if not canonical_record:
            # Process not run or doesn't generate logs; skip.
            continue
        assert canonical_record in stderr, stderr
        if component == ray_constants.PROCESS_TYPE_REDIS_SERVER:
            # Redis doesn't expose hooks for custom log formats, so we aren't able to
            # inject the Redis server component name into the log records.
            continue
        # NOTE: We do a prefix match instead of including the enclosing right
        # parentheses since some components, like the core driver and worker, add a
        # unique ID suffix.
        assert f"({component}" in stderr, stderr
Example #22
def test_detached_actor_cleanup(ray_start_regular):
    @ray.remote
    class DetachedActor:
        def ping(self):
            return "pong"

    dup_actor_name = "actor"

    def create_and_kill_actor(actor_name):
        # Make sure the same name can be reused after the actor is killed.
        detached_actor = DetachedActor.options(lifetime="detached",
                                               name=actor_name).remote()
        # Wait for detached actor creation.
        assert ray.get(detached_actor.ping.remote()) == "pong"
        del detached_actor
        assert ray.util.list_named_actors() == [dup_actor_name]
        detached_actor = ray.get_actor(dup_actor_name)
        ray.kill(detached_actor)
        # Wait until actor dies.
        actor_status = ray.state.actors(
            actor_id=detached_actor._actor_id.hex())
        max_wait_time = 10
        wait_time = 0
        while actor_status["State"] != gcs_utils.ActorTableData.DEAD:
            actor_status = ray.state.actors(
                actor_id=detached_actor._actor_id.hex())
            time.sleep(1.0)
            wait_time += 1
            if wait_time >= max_wait_time:
                assert None, (
                    "It took too much time to kill an actor: {}".format(
                        detached_actor._actor_id))

    create_and_kill_actor(dup_actor_name)

    # This shouldn't break because the actor
    # name should have been cleaned up from the GCS.
    create_and_kill_actor(dup_actor_name)

    redis_address = ray_start_regular["redis_address"]
    driver_script = """
import ray
import ray._private.gcs_utils as gcs_utils
import time
ray.init(address="{}", namespace="default_test_namespace")

@ray.remote
class DetachedActor:
    def ping(self):
        return "pong"

# Make sure the same name can be reused after the actor is killed.
detached_actor = DetachedActor.options(lifetime="detached", name="{}").remote()
assert ray.get(detached_actor.ping.remote()) == "pong"
ray.kill(detached_actor)
# Wait until actor dies.
actor_status = ray.state.actors(actor_id=detached_actor._actor_id.hex())
max_wait_time = 10
wait_time = 0
while actor_status["State"] != gcs_utils.ActorTableData.DEAD:
    actor_status = ray.state.actors(actor_id=detached_actor._actor_id.hex())
    time.sleep(1.0)
    wait_time += 1
    if wait_time >= max_wait_time:
        assert None, (
            "It took too much time to kill an actor")
""".format(redis_address, dup_actor_name)

    run_string_as_driver(driver_script)
    # Make sure we can recreate a detached actor that was
    # created/killed by another script.
    create_and_kill_actor(dup_actor_name)
Example #23
def test_detached_actor(ray_start_regular):
    @ray.remote
    class DetachedActor:
        def ping(self):
            return "pong"

    with pytest.raises(TypeError):
        DetachedActor._remote(lifetime="detached", name=1)

    with pytest.raises(ValueError,
                       match="Actor name cannot be an empty string"):
        DetachedActor._remote(lifetime="detached", name="")

    with pytest.raises(ValueError):
        DetachedActor._remote(lifetime="detached", name="hi", namespace="")

    with pytest.raises(TypeError):
        DetachedActor._remote(lifetime="detached", name="hi", namespace=2)

    d = DetachedActor._remote(lifetime="detached", name="d_actor")
    assert ray.get(d.ping.remote()) == "pong"

    with pytest.raises(ValueError, match="Please use a different name"):
        DetachedActor._remote(lifetime="detached", name="d_actor")

    redis_address = ray_start_regular["redis_address"]

    get_actor_name = "d_actor"
    create_actor_name = "DetachedActor"
    driver_script = """
import ray
ray.init(address="{}", namespace="default_test_namespace")

name = "{}"
assert ray.util.list_named_actors() == [name]
existing_actor = ray.get_actor(name)
assert ray.get(existing_actor.ping.remote()) == "pong"

@ray.remote
def foo():
    return "bar"

@ray.remote
class NonDetachedActor:
    def foo(self):
        return "bar"

@ray.remote
class DetachedActor:
    def ping(self):
        return "pong"

    def foobar(self):
        actor = NonDetachedActor.remote()
        return ray.get([foo.remote(), actor.foo.remote()])

actor = DetachedActor._remote(lifetime="detached", name="{}")
ray.get(actor.ping.remote())
""".format(redis_address, get_actor_name, create_actor_name)

    run_string_as_driver(driver_script)
    assert len(ray.util.list_named_actors()) == 2
    assert get_actor_name in ray.util.list_named_actors()
    assert create_actor_name in ray.util.list_named_actors()
    detached_actor = ray.get_actor(create_actor_name)
    assert ray.get(detached_actor.ping.remote()) == "pong"
    # Verify that a detached actor is able to create tasks/actors
    # even if the driver of the detached actor has exited.
    assert ray.get(detached_actor.foobar.remote()) == ["bar", "bar"]
Example #24
def test_serve_snapshot(ray_start_with_dashboard):
    """Test detached and nondetached Serve instances running concurrently."""

    detached_serve_driver_script = f"""
import ray
from ray import serve

ray.init(
    address="{ray_start_with_dashboard['redis_address']}",
    namespace="serve")

serve.start(detached=True)

@serve.deployment
def my_func(request):
  return "hello"

my_func.deploy()

@serve.deployment(version="v1")
def my_func_deleted(request):
  return "hello"

my_func_deleted.deploy()
my_func_deleted.delete()
    """

    run_string_as_driver(detached_serve_driver_script)
    assert requests.get("http://127.0.0.1:8000/my_func").text == "hello"

    # Use a new port to avoid clobbering the first Serve instance.
    serve.start(http_options={"port": 8123})

    @serve.deployment(version="v1")
    def my_func_nondetached(request):
        return "hello"

    my_func_nondetached.deploy()

    assert requests.get(
        "http://127.0.0.1:8123/my_func_nondetached").text == "hello"

    webui_url = ray_start_with_dashboard["webui_url"]
    webui_url = format_web_url(webui_url)
    response = requests.get(f"{webui_url}/api/snapshot")
    response.raise_for_status()
    data = response.json()
    schema_path = os.path.join(os.path.dirname(dashboard.__file__),
                               "modules/snapshot/snapshot_schema.json")
    pprint.pprint(data)
    jsonschema.validate(instance=data, schema=json.load(open(schema_path)))

    assert len(data["data"]["snapshot"]["deployments"]) == 3

    entry = data["data"]["snapshot"]["deployments"][hashlib.sha1(
        "my_func".encode()).hexdigest()]
    assert entry["name"] == "my_func"
    assert entry["version"] == "None"
    assert entry["namespace"] == "serve"
    assert entry["httpRoute"] == "/my_func"
    assert entry["className"] == "my_func"
    assert entry["status"] == "RUNNING"
    assert entry["rayJobId"] is not None
    assert entry["startTime"] > 0
    assert entry["endTime"] == 0

    assert len(entry["actors"]) == 1
    actor_id = next(iter(entry["actors"]))
    metadata = data["data"]["snapshot"]["actors"][actor_id]["metadata"][
        "serve"]
    assert metadata["deploymentName"] == "my_func"
    assert metadata["version"] == "None"
    assert len(metadata["replicaTag"]) > 0

    entry_deleted = data["data"]["snapshot"]["deployments"][hashlib.sha1(
        "my_func_deleted".encode()).hexdigest()]
    assert entry_deleted["name"] == "my_func_deleted"
    assert entry_deleted["version"] == "v1"
    assert entry_deleted["namespace"] == "serve"
    assert entry_deleted["httpRoute"] == "/my_func_deleted"
    assert entry_deleted["className"] == "my_func_deleted"
    assert entry_deleted["status"] == "DELETED"
    assert entry["rayJobId"] is not None
    assert entry_deleted["startTime"] > 0
    assert entry_deleted["endTime"] > entry_deleted["startTime"]

    entry_nondetached = data["data"]["snapshot"]["deployments"][hashlib.sha1(
        "my_func_nondetached".encode()).hexdigest()]
    assert entry_nondetached["name"] == "my_func_nondetached"
    assert entry_nondetached["version"] == "v1"
    assert entry_nondetached["namespace"] == "default_test_namespace"
    assert entry_nondetached["httpRoute"] == "/my_func_nondetached"
    assert entry_nondetached["className"] == "my_func_nondetached"
    assert entry_nondetached["status"] == "RUNNING"
    assert entry_nondetached["rayJobId"] is not None
    assert entry_nondetached["startTime"] > 0
    assert entry_nondetached["endTime"] == 0

    assert len(entry_nondetached["actors"]) == 1
    actor_id = next(iter(entry_nondetached["actors"]))
    metadata = data["data"]["snapshot"]["actors"][actor_id]["metadata"][
        "serve"]
    assert metadata["deploymentName"] == "my_func_nondetached"
    assert metadata["version"] == "v1"
    assert len(metadata["replicaTag"]) > 0

    my_func_nondetached.delete()
Example #25
def test_ray_client(ray_client_instance):
    ray.util.connect(ray_client_instance, namespace="default_test_namespace")

    start = """
import ray
ray.util.connect("{}", namespace="default_test_namespace")

from ray import serve

serve.start(detached=True)
""".format(ray_client_instance)
    run_string_as_driver(start)

    deploy = """
import ray
ray.util.connect("{}", namespace="default_test_namespace")

from ray import serve

@serve.deployment(name="test1", route_prefix="/hello")
def f(*args):
    return "hello"

f.deploy()
""".format(ray_client_instance)
    run_string_as_driver(deploy)

    assert "test1" in serve.list_deployments()
    assert requests.get("http://*****:*****@app.get("/")
def hello():
    return "hello"

@serve.deployment
@serve.ingress(app)
class A:
    pass

A.deploy()
""".format(ray_client_instance)
    run_string_as_driver(fastapi)

    assert requests.get("http://localhost:8000/A").json() == "hello"

    serve.shutdown()
    ray.util.disconnect()
Example #26
def test_driver_exiting_when_worker_blocked(call_ray_start):
    # This test will create some drivers that submit some tasks and then
    # exit without waiting for the tasks to complete.
    address = call_ray_start

    ray.init(address=address)

    # Define a driver that creates two tasks, one that runs forever and the
    # other blocked on the first in a `ray.get`.
    driver_script = """
import time
import ray
ray.init(address="{}")
@ray.remote
def f():
    time.sleep(10**6)
@ray.remote
def g():
    ray.get(f.remote())
g.remote()
time.sleep(1)
print("success")
""".format(address)

    # Create some drivers and let them exit and make sure everything is
    # still alive.
    for _ in range(3):
        out = run_string_as_driver(driver_script)
        # Make sure the first driver ran to completion.
        assert "success" in out

    # Define a driver that creates two tasks, one that runs forever and the
    # other blocked on the first in a `ray.wait`.
    driver_script = """
import time
import ray
ray.init(address="{}")
@ray.remote
def f():
    time.sleep(10**6)
@ray.remote
def g():
    ray.wait([f.remote()])
g.remote()
time.sleep(1)
print("success")
""".format(address)

    # Create some drivers and let them exit and make sure everything is
    # still alive.
    for _ in range(3):
        out = run_string_as_driver(driver_script)
        # Make sure the first driver ran to completion.
        assert "success" in out

    # Define a driver that creates one task that depends on a nonexistent
    # object. This task will be queued as waiting to execute.
    driver_script_template = """
import time
import ray
ray.init(address="{}")
@ray.remote
def g(x):
    return
g.remote(ray.ObjectRef(ray._private.utils.hex_to_binary("{}")))
time.sleep(1)
print("success")
"""

    # Create some drivers and let them exit and make sure everything is
    # still alive.
    for _ in range(3):
        nonexistent_id = ray.ObjectRef.from_random()
        driver_script = driver_script_template.format(address,
                                                      nonexistent_id.hex())
        out = run_string_as_driver(driver_script)
        # Simulate the nonexistent dependency becoming available.
        ray.worker.global_worker.put_object(None, nonexistent_id)
        # Make sure the first driver ran to completion.
        assert "success" in out

    # Define a driver that calls `ray.wait` on a nonexistent object.
    driver_script_template = """
import time
import ray
ray.init(address="{}")
@ray.remote
def g():
    ray.wait(ray.ObjectRef(ray._private.utils.hex_to_binary("{}")))
g.remote()
time.sleep(1)
print("success")
"""

    # Create some drivers and let them exit and make sure everything is
    # still alive.
    for _ in range(3):
        nonexistent_id = ray.ObjectRef.from_random()
        driver_script = driver_script_template.format(address,
                                                      nonexistent_id.hex())
        out = run_string_as_driver(driver_script)
        # Simulate the nonexistent dependency becoming available.
        ray.worker.global_worker.put_object(None, nonexistent_id)
        # Make sure the first driver ran to completion.
        assert "success" in out

    @ray.remote
    def f():
        return 1

    # Make sure we can still talk with the raylet.
    ray.get(f.remote())
Example #27
def test_drivers_release_resources(call_ray_start):
    address = call_ray_start

    # Define a driver that creates an actor and exits.
    driver_script1 = """
import time
import ray

ray.init(address="{}")

@ray.remote
def f(duration):
    time.sleep(duration)

@ray.remote(num_gpus=1)
def g(duration):
    time.sleep(duration)

@ray.remote(num_gpus=1)
class Foo:
    def __init__(self):
        pass

# Make sure some resources are available for us to run tasks.
ray.get(f.remote(0))
ray.get(g.remote(0))

# Start a bunch of actors and tasks that use resources. These should all be
# cleaned up when this driver exits.
foos = [Foo.remote() for _ in range(100)]
[f.remote(10 ** 6) for _ in range(100)]

print("success")
""".format(
        address
    )

    driver_script2 = (
        driver_script1 + "import sys\nsys.stdout.flush()\ntime.sleep(10 ** 6)\n"
    )

    def wait_for_success_output(process_handle, timeout=10):
        # Wait until the process prints "success" and then return.
        start_time = time.time()
        while time.time() - start_time < timeout:
            output_line = ray._private.utils.decode(
                process_handle.stdout.readline()
            ).strip()
            print(output_line)
            if output_line == "success":
                return
            time.sleep(1)
        raise RayTestTimeoutException("Timed out waiting for process to print success.")

    # Make sure we can run this driver repeatedly, which means that resources
    # are getting released in between.
    for _ in range(5):
        out = run_string_as_driver(driver_script1)
        # Make sure the first driver ran to completion.
        assert "success" in out
        # Also make sure that this works when the driver exits ungracefully.
        process_handle = run_string_as_driver_nonblocking(driver_script2)
        wait_for_success_output(process_handle)
        # Kill the process ungracefully.
        process_handle.kill()
Example #28
def run_driver():
    output = run_string_as_driver(driver_script, encode="utf-8")
    assert "success" in output
Example #29
def test_error_isolation(call_ray_start):
    address = call_ray_start
    # Connect a driver to the Ray cluster.
    ray.init(address=address)

    # If a gRPC call exceeds its timeout, the call is cancelled on the client
    # side, but the server may still reply to it, leading to a missed message.
    # Using a sequence number to ensure no message is dropped could be the
    # long-term solution, but its complexity, and the fact that Ray
    # subscribers do not use deadlines in production, make it less preferred.
    # Therefore, a simpler workaround is used instead: a different subscriber
    # is used for each get_error_message() call.
    subscribers = [init_error_pubsub() for _ in range(3)]

    # There shouldn't be any errors yet.
    errors = get_error_message(subscribers[0], 1, timeout=2)
    assert len(errors) == 0

    error_string1 = "error_string1"
    error_string2 = "error_string2"

    @ray.remote
    def f():
        raise Exception(error_string1)

    # Run a remote function that throws an error.
    with pytest.raises(Exception):
        ray.get(f.remote())

    # Wait for the error to appear in Redis.
    errors = get_error_message(subscribers[1], 1)

    # Make sure we got the error.
    assert len(errors) == 1
    assert error_string1 in errors[0].error_message

    # Start another driver and make sure that it does not receive this
    # error. Make the other driver throw an error, and make sure it
    # receives that error.
    driver_script = """
import ray
import time
from ray._private.test_utils import init_error_pubsub, get_error_message

ray.init(address="{}")
subscribers = [init_error_pubsub() for _ in range(2)]
time.sleep(1)
errors = get_error_message(subscribers[0], 1, timeout=2)
assert len(errors) == 0

@ray.remote
def f():
    raise Exception("{}")

try:
    ray.get(f.remote())
except Exception as e:
    pass

errors = get_error_message(subscribers[1], 1)
assert len(errors) == 1

assert "{}" in errors[0].error_message

print("success")
""".format(
        address, error_string2, error_string2
    )

    out = run_string_as_driver(driver_script)
    # Make sure the other driver succeeded.
    assert "success" in out

    # Make sure that the other error message doesn't show up for this
    # driver.
    errors = get_error_message(subscribers[2], 1)
    assert len(errors) == 1
Example #30
def test_calling_start_ray_head(call_ray_stop_only):

    # Test that we can call ray start with various command line
    # parameters.

    # Test starting Ray with a redis port specified.
    check_call_ray(["start", "--head", "--port", "0"])
    check_call_ray(["stop"])

    # Test starting Ray with a node IP address specified.
    check_call_ray(
        ["start", "--head", "--node-ip-address", "127.0.0.1", "--port", "0"])
    check_call_ray(["stop"])

    # Test starting Ray with a system config parameter set.
    check_call_ray([
        "start",
        "--head",
        "--system-config",
        '{"metrics_report_interval_ms":100}',
        "--port",
        "0",
    ])
    check_call_ray(["stop"])

    # Test starting Ray with the object manager and node manager ports
    # specified.
    check_call_ray([
        "start",
        "--head",
        "--object-manager-port",
        "22345",
        "--node-manager-port",
        "54321",
        "--port",
        "0",
    ])
    check_call_ray(["stop"])

    # Test starting Ray with the worker port range specified.
    check_call_ray([
        "start",
        "--head",
        "--min-worker-port",
        "51000",
        "--max-worker-port",
        "51050",
        "--port",
        "0",
    ])
    check_call_ray(["stop"])

    # Test starting Ray with a worker port list.
    check_call_ray(["start", "--head", "--worker-port-list", "10002,10003"])
    check_call_ray(["stop"])

    # Test starting Ray with a non-int in the worker port list.
    with pytest.raises(subprocess.CalledProcessError):
        check_call_ray(["start", "--head", "--worker-port-list", "10002,a"])
    check_call_ray(["stop"])

    # Test starting Ray with an invalid port in the worker port list.
    with pytest.raises(subprocess.CalledProcessError):
        check_call_ray(["start", "--head", "--worker-port-list", "100"])
    check_call_ray(["stop"])

    # Test starting Ray with the number of CPUs specified.
    check_call_ray(["start", "--head", "--num-cpus", "2", "--port", "0"])
    check_call_ray(["stop"])

    # Test starting Ray with the number of GPUs specified.
    check_call_ray(["start", "--head", "--num-gpus", "100", "--port", "0"])
    check_call_ray(["stop"])

    # Test starting Ray with redis shard ports specified.
    check_call_ray([
        "start", "--head", "--redis-shard-ports", "6380,6381,6382", "--port",
        "0"
    ])
    check_call_ray(["stop"])

    # Test starting Ray with all arguments specified.
    check_call_ray([
        "start",
        "--head",
        "--redis-shard-ports",
        "6380,6381,6382",
        "--object-manager-port",
        "22345",
        "--num-cpus",
        "2",
        "--num-gpus",
        "0",
        "--resources",
        '{"Custom": 1}',
        "--port",
        "0",
    ])
    check_call_ray(["stop"])

    temp_dir = ray._private.utils.get_ray_temp_dir()

    # Test starting Ray with RAY_REDIS_ADDRESS env.
    _, proc = _start_redis_instance(
        REDIS_EXECUTABLE,
        temp_dir,
        8888,
        password=ray_constants.REDIS_DEFAULT_PASSWORD)
    os.environ["RAY_REDIS_ADDRESS"] = "127.0.0.1:8888"
    check_call_ray(["start", "--head"])
    check_call_ray(["stop"])
    proc.process.terminate()
    del os.environ["RAY_REDIS_ADDRESS"]

    # Test --block. Killing a child process should cause the command to exit.
    blocked = subprocess.Popen(
        ["ray", "start", "--head", "--block", "--port", "0"])

    blocked.poll()
    assert blocked.returncode is None
    # Make sure ray cluster is up
    run_string_as_driver("""
import ray
from time import sleep
for i in range(0, 5):
    try:
        ray.init(address='auto')
        break
    except:
        sleep(1)
""")

    # Make sure ray cluster is up
    run_string_as_driver("""
import ray
from time import sleep
for i in range(0, 5):
    try:
        ray.init(address='auto')
        break
    except:
        sleep(1)
""")

    kill_process_by_name("raylet", SIGKILL=True)
    wait_for_children_of_pid_to_exit(blocked.pid, timeout=30)
    blocked.wait()
    assert blocked.returncode != 0, "ray start shouldn't return 0 on bad exit"

    # Test --block. Killing the command should clean up all child processes.
    blocked = subprocess.Popen(
        ["ray", "start", "--head", "--block", "--port", "0"])
    blocked.poll()
    assert blocked.returncode is None

    # Include GCS, autoscaler monitor, client server, dashboard, raylet and
    # log_monitor.py
    num_children = 6
    if not detect_fate_sharing_support():
        # Account for ray_process_reaper.py
        num_children += 1
    # Check a set of child process commands & scripts instead?
    wait_for_children_of_pid(blocked.pid,
                             num_children=num_children,
                             timeout=30)

    blocked.terminate()
    wait_for_children_of_pid_to_exit(blocked.pid, timeout=30)
    blocked.wait()
    assert blocked.returncode != 0, "ray start shouldn't return 0 on bad exit"