def test_receive_late_worker_logs():
    """Check that task and actor prints show up in a short-lived driver's output."""
    # Marker string that every remote call prints exactly once.
    log_message = "some helpful debugging message"

    # The driver starts one actor and one task, waits for two rounds of both,
    # and then exits without any further delay.
    driver_script = """
import ray
import random
import time

log_message = "{}"

@ray.remote
class Actor:
    def log(self):
        print(log_message)

@ray.remote
def f():
    print(log_message)

ray.init(num_cpus=2)

a = Actor.remote()
ray.get([a.log.remote(), f.remote()])
ray.get([a.log.remote(), f.remote()])
""".format(log_message)

    # Two actor calls plus two task calls per run give four markers.
    expected_occurrences = 4
    for _attempt in range(2):
        driver_output = run_string_as_driver(driver_script)
        assert driver_output.count(log_message) == expected_occurrences
def test_detached_actors(ray_start_cluster_head, working_dir, client_mode):
    """Check that a detached actor keeps its runtime-env package alive.

    A driver creates a detached, named actor under a ``working_dir`` runtime
    env and exits. The test then verifies the package bookkeeping is still
    present, calls the actor from this process, kills it, and verifies the
    bookkeeping is gone afterwards.
    """
    cluster = ray_start_cluster_head
    (address, env, PKG_DIR) = start_client_server(cluster, client_mode)
    runtime_env = f"""{{ "working_dir": "{working_dir}" }}"""
    # Execute the following cmd in driver with runtime_env
    execute_statement = """
test_actor = TestActor.options(name="test_actor", lifetime="detached").remote()
print(sum(ray.get([test_actor.one.remote()] * 1000)))
"""
    # NOTE: the template is filled from the local variable names above, so
    # those names must not be changed.
    script = driver_script.format(**locals())
    out = run_string_as_driver(script, env)
    assert out.strip().split()[-1] == "1000"

    # It's a detached actor, so its package should still be there.
    assert len(kv._internal_kv_list("gcs://")) == 1
    assert len(list(Path(PKG_DIR).iterdir())) == 2

    pkg_dir = [f for f in Path(PKG_DIR).glob("*") if f.is_dir()][0]

    import sys
    from time import sleep

    # Make the unpacked package importable in this process while we talk to
    # the actor, and always undo the sys.path change so it cannot leak into
    # other tests running in the same interpreter.
    pkg_path = str(pkg_dir)
    sys.path.insert(0, pkg_path)
    try:
        test_actor = ray.get_actor("test_actor")
        assert sum(ray.get([test_actor.one.remote()] * 1000)) == 1000
        ray.kill(test_actor)
    finally:
        if pkg_path in sys.path:
            sys.path.remove(pkg_path)

    # Give the cluster time to clean up after the actor was killed.
    sleep(5)
    assert len(list(Path(PKG_DIR).iterdir())) == 1
    assert len(kv._internal_kv_list("gcs://")) == 0
def test_util_without_job_config(shutdown_only):
    """Run a client driver with ``job_config=None`` that imports a local module.

    A ``lib.py`` is written into a temporary directory, the process changes
    into that directory, a one-node cluster and a client server are started,
    and a driver script connects with ``job_config=None`` and calls a remote
    function that imports ``lib``. The driver output is printed.
    """
    from ray.cluster_utils import Cluster

    with tempfile.TemporaryDirectory() as tmp_dir:
        with (Path(tmp_dir) / "lib.py").open("w") as f:
            f.write("""
def one():
    return 1
""")
        old_dir = os.getcwd()
        os.chdir(tmp_dir)
        # Restore the working directory even if the test body fails;
        # otherwise the process would be left inside a directory that the
        # TemporaryDirectory context manager is about to delete.
        try:
            cluster = Cluster()
            cluster.add_node(num_cpus=1)
            ray.init(address=cluster.address)
            (address, env, PKG_DIR) = start_client_server(cluster, True)
            script = f"""
import ray
import ray.util
import os

ray.util.connect("{address}", job_config=None)

@ray.remote
def run():
    from lib import one
    return one()

print(ray.get([run.remote()])[0])
"""
            out = run_string_as_driver(script, env)
            print(out)
        finally:
            os.chdir(old_dir)
def test_namespace_client():
    """Check that a detached actor made by one client job is visible to another.

    A driver connects through the client server on port 8080 with an empty
    namespace and creates a detached actor named "Pinger". This process then
    connects with the same empty namespace, looks the actor up and calls it.
    """
    cluster = Cluster()
    try:
        cluster.add_node(num_cpus=4, ray_client_server_port=8080)
        cluster.wait_for_nodes(1)

        template = """
import ray
ray.util.connect("{address}", namespace="{namespace}")

@ray.remote
class DetachedActor:
    def ping(self):
        return "pong from other job"

actor = DetachedActor.options(name="Pinger", lifetime="detached").remote()
ray.get(actor.ping.remote())
print("Done!!!")
"""

        print(
            run_string_as_driver(
                template.format(address="localhost:8080", namespace="")))

        ray.util.connect("localhost:8080", namespace="")
        try:
            pinger = ray.get_actor("Pinger")
            assert ray.get(pinger.ping.remote()) == "pong from other job"
        finally:
            # Always drop the client connection, even on assertion failure.
            ray.util.disconnect()
    finally:
        # Run the teardown on every exit path so a failure here does not
        # leave a cluster running or client mode enabled for later tests.
        cluster.shutdown()
        # This piece of cleanup doesn't seem to happen automatically.
        ray._private.client_mode_hook._explicitly_disable_client_mode()
def test_client_tasks_and_actors_inherit_from_driver(conda_envs,
                                                     call_ray_start):
    """Check that client tasks and actors run in the driver's conda env.

    For each entry of ``REQUEST_VERSIONS`` a client connects with the
    matching conda runtime env and asserts that both a task and an actor
    report that version. While that client is connected, a second driver is
    run with the next version in the list.
    """
    for i, package_version in enumerate(REQUEST_VERSIONS):
        runtime_env = {"conda": f"package-{package_version}"}
        with ray.client("localhost:24001").env(runtime_env).connect():
            assert ray.get(get_requests_version.remote()) == package_version
            actor_handle = VersionActor.remote()
            assert ray.get(
                actor_handle.get_requests_version.remote()) == package_version

            # Ensure that we can have a second client connect using the other
            # conda environment. Wrap around using the real list length
            # instead of assuming exactly two versions.
            other_index = (i + 1) % len(REQUEST_VERSIONS)
            other_package_version = REQUEST_VERSIONS[other_index]
            run_string_as_driver(
                check_remote_client_conda.format(
                    package_version=other_package_version))
def test_job_timestamps(ray_start_regular):
    """Check the StartTime/EndTime/Timestamp fields reported for jobs.

    One driver runs to completion (sleeping about 1 s) and a second one is
    started in the background (sleeping 60 s). The job table is inspected
    while the second driver is running and again after it has been killed.
    """
    driver_template = """
import ray
from time import sleep

ray.init(address="{}")

print("My job id: ", str(ray.get_runtime_context().job_id))

{}
ray.shutdown()
"""

    redis_address = ray_start_regular["redis_address"]
    non_hanging = driver_template.format(redis_address, "sleep(1)")
    hanging_driver = driver_template.format(redis_address, "sleep(60)")

    out = run_string_as_driver(non_hanging)
    p = run_string_as_driver_nonblocking(hanging_driver)
    # Kill the background driver on every exit path so a failing assertion
    # does not leave a 60-second process behind.
    try:
        # The nonblocking process needs time to connect.
        time.sleep(1)

        jobs = sorted(ray.state.jobs(), key=lambda x: x["JobID"])

        driver = jobs[0]
        finished = jobs[1]
        running = jobs[2]

        # The initial driver timestamp/start time go down a different code
        # path.
        assert driver["Timestamp"] == driver["StartTime"]
        assert finished["Timestamp"] == finished["EndTime"]
        assert running["Timestamp"] == running["StartTime"]

        assert finished["EndTime"] > finished["StartTime"] > 0, out
        lapsed = finished["EndTime"] - finished["StartTime"]
        assert 0 < lapsed < 2000, f"Job should've taken ~1s, {finished}"

        assert running["StartTime"] > 0
        assert running["EndTime"] == 0
    finally:
        p.kill()

    # Give the second job time to clean itself up.
    time.sleep(1)

    jobs = sorted(ray.state.jobs(), key=lambda x: x["JobID"])

    # jobs[0] is the test case driver.
    finished = jobs[1]
    prev_running = jobs[2]

    assert finished["EndTime"] > finished["StartTime"] > 0, f"{finished}"
    assert finished["EndTime"] == finished["Timestamp"]
    lapsed = finished["EndTime"] - finished["StartTime"]
    assert 0 < lapsed < 2000, f"Job should've taken ~1s {finished}"

    assert prev_running["EndTime"] > prev_running["StartTime"] > 0
def test_named_but_not_detached(ray_start_regular):
    """Check that named, non-detached actors and their names do not outlive their owner."""
    redis_address = ray_start_regular["redis_address"]

    driver_script = """
import ray
ray.init(address="{}")

@ray.remote
class NotDetached:
    def ping(self):
        return "pong"

actor = NotDetached.options(name="actor").remote()
assert ray.get(actor.ping.remote()) == "pong"
handle = ray.get_actor("actor")
assert ray.get(handle.ping.remote()) == "pong"
""".format(redis_address)

    # The driver creates the actor; it is expected to be gone once the
    # driver has exited.
    run_string_as_driver(driver_script)

    # The actor was not created with a detached lifetime, so looking it up
    # or calling it from here has to fail.
    with pytest.raises(Exception):
        detached_actor = ray.get_actor("actor")
        ray.get(detached_actor.ping.remote())

    def check_name_available(name):
        """Return True once ``ray.get_actor(name)`` raises ValueError."""
        try:
            ray.get_actor(name)
        except ValueError:
            return True
        return False

    @ray.remote
    class A:
        pass

    # A name must become available again after an explicit kill ...
    killed_actor = A.options(name="my_actor_1").remote()
    ray.kill(killed_actor, no_restart=True)
    wait_for_condition(lambda: check_name_available("my_actor_1"))

    # ... and after the only handle to the actor is dropped.
    dropped_actor = A.options(name="my_actor_2").remote()
    del dropped_actor
    wait_for_condition(lambda: check_name_available("my_actor_2"))
def test_tmpdir_env_var(shutdown_only):
    """Check that ``RAY_TMPDIR`` determines where the session directory lives."""
    # The driver asserts on its own session_dir and prints a marker on success.
    script = """
import ray
context = ray.init()
assert context["session_dir"].startswith("/tmp/qqq/"), context
print("passed")
"""
    driver_env = {"RAY_TMPDIR": "/tmp/qqq"}
    result = run_string_as_driver(script, env=driver_env)
    assert "passed" in result, result
def test_two_node(two_node_cluster, working_dir):
    """Run the shared driver script against a two-node cluster with a working_dir runtime env."""
    cluster, _ = two_node_cluster
    # JSON-like snippet that is substituted into the driver template.
    runtime_env = f"""{{ "working_dir": "{working_dir}" }}"""
    script = driver_script.format(
        redis_address=cluster.address,
        working_dir=working_dir,
        runtime_env=runtime_env,
    )
    driver_output = run_string_as_driver(script)
    # The last whitespace-separated token of the output is the result.
    last_token = driver_output.split()[-1]
    assert last_token == "1000"
def test_single_node(ray_start_cluster_head, working_dir, client_mode):
    """Run 1000 ``run_test`` tasks under a working_dir runtime env on one node."""
    cluster = ray_start_cluster_head
    (address, env, PKG_DIR) = start_client_server(cluster, client_mode)
    runtime_env = f"""{{ "working_dir": "{working_dir}" }}"""
    execute_statement = "print(sum(ray.get([run_test.remote()] * 1000)))"
    # The template is filled from the local variable names above, so those
    # names must stay exactly as they are.
    script = driver_script.format(**locals())
    out = run_string_as_driver(script, env)

    # The last token printed by the driver is the sum of the task results.
    assert out.split()[-1] == "1000"
    # Exactly one entry is expected in the package directory afterwards.
    remaining_entries = list(Path(PKG_DIR).iterdir())
    assert len(remaining_entries) == 1
def test_two_node_module(two_node_cluster, working_dir, client_mode):
    """Run 1000 ``run_test`` tasks on two nodes with a py_modules runtime env."""
    cluster, _ = two_node_cluster
    (address, env, PKG_DIR) = start_client_server(cluster, client_mode)
    # This is source text evaluated inside the driver, not in this process.
    runtime_env = """{ "py_modules": [test_module.__path__[0]] }"""
    execute_statement = "print(sum(ray.get([run_test.remote()] * 1000)))"
    # The template is filled from the local variable names above, so those
    # names must stay exactly as they are.
    script = driver_script.format(**locals())
    out = run_string_as_driver(script, env)

    # The last token printed by the driver is the sum of the task results.
    assert out.split()[-1] == "1000"
    # Exactly one entry is expected in the package directory afterwards.
    remaining_entries = list(Path(PKG_DIR).iterdir())
    assert len(remaining_entries) == 1
def test_cleanup_on_driver_exit(call_ray_start):
    """Check that a driver's objects and workers are cleaned up after it exits.

    A second driver puts many objects into the object store, submits some
    tasks without waiting for them, and exits. This test then waits for the
    object table to become empty and for all idle worker processes to go
    away.
    """
    address = call_ray_start

    ray.init(address=address)

    # Define a driver that creates a bunch of objects and exits.
    driver_script = """
import time
import ray
import numpy as np
from ray.test_utils import object_memory_usage
import os

ray.init(address="{}")
object_refs = [ray.put(np.zeros(200 * 1024, dtype=np.uint8))
               for i in range(1000)]
start_time = time.time()
while time.time() - start_time < 30:
    if object_memory_usage() > 0:
        break
else:
    raise Exception("Objects did not appear in object table.")

@ray.remote
def f():
    time.sleep(1)

print("success")

# Submit some tasks without waiting for them to finish. Their workers should
# still get cleaned up eventually, even if they get started after the driver
# exits.
[f.remote() for _ in range(10)]
""".format(address)

    out = run_string_as_driver(driver_script)
    assert "success" in out

    # Make sure the objects are removed from the object table. The else
    # branch of the while loop only runs when the 30 s budget is exhausted.
    start_time = time.time()
    while time.time() - start_time < 30:
        if object_memory_usage() == 0:
            break
    else:
        raise Exception("Objects were not all removed from object table.")

    def all_workers_exited():
        """Return True when no running process name marks an idle worker."""
        for proc in psutil.process_iter():
            try:
                name = proc.name()
            except psutil.NoSuchProcess:
                # The process went away between being listed and being
                # queried, which is expected while workers are shutting
                # down. A vanished process counts as exited.
                continue
            if ray_constants.WORKER_PROCESS_TYPE_IDLE_WORKER in name:
                return False
        return True

    # Check that workers are eventually cleaned up.
    wait_for_condition(all_workers_exited)
def test_detached_actor(ray_start_regular):
    """Check argument validation for detached actors and their cross-driver visibility."""

    @ray.remote
    class DetachedActor:
        def ping(self):
            return "pong"

    # Invalid name/detached combinations, each paired with the expected
    # error message.
    rejected_calls = [
        ("Actor name cannot be an empty string", dict(detached=True,
                                                      name="")),
        ("Detached actors must be named", dict(detached=True)),
        ("Only detached actors can be named", dict(name="d_actor")),
    ]
    for expected_message, remote_kwargs in rejected_calls:
        with pytest.raises(ValueError, match=expected_message):
            DetachedActor._remote(**remote_kwargs)

    # The first creation succeeds; reusing the same name must be rejected.
    DetachedActor._remote(detached=True, name="d_actor")
    with pytest.raises(ValueError, match="Please use a different name"):
        DetachedActor._remote(detached=True, name="d_actor")

    redis_address = ray_start_regular["redis_address"]

    # The driver looks up the actor created above and creates another
    # detached actor under a second name.
    get_actor_name = "d_actor"
    create_actor_name = "DetachedActor"
    driver_script = """
import ray
ray.init(address="{}")

existing_actor = ray.util.get_actor("{}")
assert ray.get(existing_actor.ping.remote()) == "pong"

@ray.remote
class DetachedActor:
    def ping(self):
        return "pong"

actor = DetachedActor._remote(name="{}", detached=True)
ray.get(actor.ping.remote())
""".format(redis_address, get_actor_name, create_actor_name)

    run_string_as_driver(driver_script)

    # The actor created by the driver must still be reachable from here.
    detached_actor = ray.util.get_actor(create_actor_name)
    reply = ray.get(detached_actor.ping.remote())
    assert reply == "pong"
def test_two_node(two_node_cluster, working_dir):
    """Run 1000 ``run_test`` tasks on two nodes under a working_dir runtime env."""
    cluster, _ = two_node_cluster
    redis_address = cluster.address
    runtime_env = f"""{{ "working_dir": "{working_dir}" }}"""
    execute_statement = "print(sum(ray.get([run_test.remote()] * 1000)))"
    # The template is filled from the local variable names above, so those
    # names must stay exactly as they are.
    script = driver_script.format(**locals())
    out = run_string_as_driver(script)

    # The last token printed by the driver is the sum of the task results.
    assert out.split()[-1] == "1000"

    from ray._private.runtime_env import PKG_DIR
    # Exactly one entry is expected in the package directory afterwards.
    remaining_entries = list(Path(PKG_DIR).iterdir())
    assert len(remaining_entries) == 1
def test_two_node_module(two_node_cluster, working_dir):
    """Run the shared driver script on two nodes with a local_modules runtime env."""
    cluster, _ = two_node_cluster
    # This is source text evaluated inside the driver, not in this process.
    runtime_env = """{ "local_modules": [test_module] }"""
    script = driver_script.format(
        redis_address=cluster.address,
        working_dir=working_dir,
        runtime_env=runtime_env,
    )
    # Echo the generated driver source so it shows up in the test log.
    print(script)
    driver_output = run_string_as_driver(script)
    # The last whitespace-separated token of the output is the result.
    last_token = driver_output.split()[-1]
    assert last_token == "1000"
def test_startup_error_yields_clean_result(shutdown_only):
    """
    Check that an error while preparing the environment yields an actionable,
    clear error on the *client side*.

    ``proxier.ray_client_server_env_prep`` is patched with a function that
    always raises, and the ``get_error`` driver script is run against a
    proxier served on localhost:25030.
    """
    ray_instance = ray.init()

    def raise_not_rewrite(input: JobConfig):
        # Stand-in for the env-prep hook that fails unconditionally.
        raise RuntimeError("WEIRD_ERROR")

    server = proxier.serve_proxier("localhost:25030",
                                   ray_instance["redis_address"],
                                   ray_instance["session_dir"])
    # Stop the proxier on every exit path so a failing driver run does not
    # leave the server (and its port) behind.
    try:
        with patch.object(proxier, "ray_client_server_env_prep",
                          raise_not_rewrite):
            run_string_as_driver(get_error)
    finally:
        server.stop(0)
def test_single_node(ray_start_cluster_head, working_dir):
    """Run the shared driver script on a single node with a working_dir runtime env."""
    cluster = ray_start_cluster_head
    # JSON-like snippet that is substituted into the driver template.
    runtime_env = f"""{{ "working_dir": "{working_dir}" }}"""
    script = driver_script.format(
        redis_address=cluster.address,
        working_dir=working_dir,
        runtime_env=runtime_env,
    )
    driver_output = run_string_as_driver(script)
    # The last whitespace-separated token of the output is the result.
    last_token = driver_output.split()[-1]
    assert last_token == "1000"
def test_list_named_actors_namespace(ray_start_regular):
    """Verify that actor names are filtered on namespace by default."""
    address = ray_start_regular["redis_address"]

    # First driver: create a detached actor "hi" in namespace "test" and
    # check both the filtered and the all-namespaces listing from inside it.
    driver_script_1 = """
import ray
ray.init(address="{}", namespace="test")

@ray.remote
class A:
    pass

A.options(name="hi", lifetime="detached").remote()

assert len(ray.util.list_named_actors()) == 1
assert ray.util.list_named_actors() == ["hi"]
assert ray.util.list_named_actors(all_namespaces=True) == \
    [dict(name="hi", namespace="test")]
""".format(address)

    run_string_as_driver(driver_script_1)

    # From this process the default listing is empty, while the
    # all-namespaces listing reports the actor.
    assert not ray.util.list_named_actors()
    expected_entry = {"name": "hi", "namespace": "test"}
    assert ray.util.list_named_actors(all_namespaces=True) == [expected_entry]

    # Second driver: join namespace "test", see the actor, kill it, and
    # check that it no longer shows up.
    driver_script_2 = """
import ray
ray.init(address="{}", namespace="test")

assert ray.util.list_named_actors() == ["hi"]
assert ray.util.list_named_actors(all_namespaces=True) == \
    [dict(name="hi", namespace="test")]
ray.kill(ray.get_actor("hi"), no_restart=True)
assert not ray.util.list_named_actors()
""".format(address)

    run_string_as_driver(driver_script_2)

    # After the kill, neither listing reports anything from here.
    assert not ray.util.list_named_actors()
    assert not ray.util.list_named_actors(all_namespaces=True)
def test_two_node(two_node_cluster, working_dir, client_mode):
    """Run 1000 ``run_test`` tasks on two nodes under a working_dir runtime env."""
    cluster, _ = two_node_cluster
    (address, env, PKG_DIR) = start_client_server(cluster, client_mode)
    # Runtime env under test: a working_dir entry.
    runtime_env = f"""{{ "working_dir": "{working_dir}" }}"""
    # Statement the driver executes once the runtime env is in place.
    execute_statement = "print(sum(ray.get([run_test.remote()] * 1000)))"
    # The template is filled from the local variable names above, so those
    # names must stay exactly as they are.
    script = driver_script.format(**locals())
    out = run_string_as_driver(script, env)

    # The last token printed by the driver is the sum of the task results.
    assert out.split()[-1] == "1000"
    # Exactly one entry is expected in the package directory afterwards.
    remaining_entries = list(Path(PKG_DIR).iterdir())
    assert len(remaining_entries) == 1
def test_runtime_env_getter(ray_start_cluster_head, working_dir, client_mode):
    """Check that the runtime context in the driver reports the configured working_dir."""
    cluster = ray_start_cluster_head
    (address, env, PKG_DIR) = start_client_server(cluster, client_mode)
    runtime_env = f"""{{ "working_dir": "{working_dir}" }}"""
    # Statement the driver executes once the runtime env is in place.
    execute_statement = """
print(ray.get_runtime_context().runtime_env["working_dir"])
"""
    # The template is filled from the local variable names above, so those
    # names must stay exactly as they are.
    script = driver_script.format(**locals())
    out = run_string_as_driver(script, env)

    # The last token printed by the driver is the reported working_dir.
    assert out.split()[-1] == working_dir
def test_worker_capping_fifo(shutdown_only):
    """Check which of two initial workers is removed after another driver runs a task."""
    # num_cpus=2 starts two initial workers.
    info = ray.init(num_cpus=2)
    wait_for_condition(lambda: len(get_workers()) == 2)

    time.sleep(1)

    @ray.remote
    def getpid():
        return os.getpid()

    # Order the pair so that the worker that just ran getpid() comes second.
    first_in_queue, last_in_queue = get_workers()
    if first_in_queue.pid == ray.get(getpid.remote()):
        first_in_queue, last_in_queue = last_in_queue, first_in_queue

    # first_in_queue is now ahead of last_in_queue in the FIFO queue.
    driver_code = """
import ray
import time

ray.init(address="{}")

@ray.remote
def foo():
    pass

ray.get(foo.remote())
# Sleep a while to make sure an idle worker exits before this driver exits.
time.sleep(2)
ray.shutdown()
""".format(info["redis_address"])

    run_string_as_driver(driver_code)

    # The worker at the front of the queue should have been killed ...
    wait_for_pid_to_exit(first_in_queue.pid)

    # ... leaving only the other one behind.
    wait_for_condition(lambda: len(get_workers()) == 1)
    assert last_in_queue.pid == get_workers()[0].pid
def test_file_deleted_when_driver_exits(tmp_path, shutdown_only):
    """Check that spilled object files are removed after the driver is interrupted."""
    # Directory the driver is configured to spill objects into.
    temp_folder = tmp_path / "spill"
    temp_folder.mkdir()

    # The driver limits its object store to 75 MiB, puts enough data to
    # force spilling, and then sends itself the signal chosen via {signum}.
    driver = """
import json
import os
import signal
import numpy as np
import ray
ray.init(
    object_store_memory=75 * 1024 * 1024,
    _system_config={{
        "max_io_workers": 2,
        "min_spilling_size": 0,
        "automatic_object_spilling_enabled": True,
        "object_store_full_delay_ms": 100,
        "object_spilling_config": json.dumps({{
            "type": "filesystem",
            "params": {{
                "directory_path": "{temp_dir}"
            }}
        }}),
    }})
arr = np.random.rand(1024 * 1024)  # 8 MB data
replay_buffer = []
# Spill lots of objects
for _ in range(30):
    ref = None
    while ref is None:
        ref = ray.put(arr)
        replay_buffer.append(ref)
# Send sigterm to itself.
signum = {signum}
sig = None
if signum == 2:
    sig = signal.SIGINT
elif signum == 15:
    sig = signal.SIGTERM
os.kill(os.getpid(), sig)
"""

    # Run the driver with signal number 2 (SIGINT); the run is expected to
    # surface as a CalledProcessError.
    print("Sending sigint...")
    sigint_script = driver.format(temp_dir=str(temp_folder), signum=2)
    with pytest.raises(subprocess.CalledProcessError):
        print(run_string_as_driver(sigint_script))

    # The spill directory must end up empty.
    wait_for_condition(lambda: is_dir_empty(temp_folder, append_path=""))
def test_single_node(ray_start_cluster_head, working_dir, client_mode):
    """Run 1000 ``run_test`` tasks on one node and check package cleanup afterwards."""
    cluster = ray_start_cluster_head
    (address, env, PKG_DIR) = start_client_server(cluster, client_mode)
    # Runtime env under test: a working_dir entry.
    runtime_env = f"""{{ "working_dir": "{working_dir}" }}"""
    # Statement the driver executes once the runtime env is in place.
    execute_statement = "print(sum(ray.get([run_test.remote()] * 1000)))"
    # The template is filled from the local variable names above, so those
    # names must stay exactly as they are.
    script = driver_script.format(**locals())
    out = run_string_as_driver(script, env)

    # The last token printed by the driver is the sum of the task results.
    assert out.split()[-1] == "1000"
    # One entry is expected in the package directory and nothing under the
    # "gcs://" prefix of the internal KV store.
    remaining_entries = list(Path(PKG_DIR).iterdir())
    assert len(remaining_entries) == 1
    assert len(kv._internal_kv_list("gcs://")) == 0
def test_regular_actors(ray_start_cluster_head, working_dir, client_mode):
    """Call a named, non-detached actor 1000 times under a working_dir runtime env."""
    cluster = ray_start_cluster_head
    (address, env, PKG_DIR) = start_client_server(cluster, client_mode)
    runtime_env = f"""{{ "working_dir": "{working_dir}" }}"""
    # Statements the driver executes once the runtime env is in place.
    execute_statement = """
test_actor = TestActor.options(name="test_actor").remote()
print(sum(ray.get([test_actor.one.remote()] * 1000)))
"""
    # The template is filled from the local variable names above, so those
    # names must stay exactly as they are.
    script = driver_script.format(**locals())
    out = run_string_as_driver(script, env)

    # The last token printed by the driver is the sum of the actor results.
    assert out.split()[-1] == "1000"
    # Exactly one entry is expected in the package directory afterwards.
    remaining_entries = list(Path(PKG_DIR).iterdir())
    assert len(remaining_entries) == 1
def test_delay_in_rewriting_environment(shutdown_only):
    """
    Check that a delay in `ray_client_server_env_prep` does not break
    a Client connecting.

    The env-prep hook is patched with a function that sleeps for 6 seconds
    before returning its input unchanged, and the ``check_connection``
    driver script is run against a proxier served on localhost:25010.
    """
    # NOTE(review): these module-level settings are overwritten and not
    # restored here; 3 retries at 1 s each is shorter than the 6 s delay
    # below -- confirm that is the intended relationship.
    proxier.LOGSTREAM_RETRIES = 3
    proxier.LOGSTREAM_RETRY_INTERVAL_SEC = 1
    ray_instance = ray.init()

    def delay_in_rewrite(input: JobConfig):
        # Stand-in for the env-prep hook that is slow but otherwise a no-op.
        import time
        time.sleep(6)
        return input

    server = proxier.serve_proxier("localhost:25010",
                                   ray_instance["redis_address"],
                                   ray_instance["session_dir"])
    # Stop the proxier on every exit path so a failing driver run does not
    # leave the server (and its port) behind.
    try:
        with patch.object(proxier, "ray_client_server_env_prep",
                          delay_in_rewrite):
            run_string_as_driver(check_connection)
    finally:
        server.stop(0)
def test_infeasible_tasks(ray_start_cluster):
    """Check that infeasible tasks become runnable once a matching node is added."""
    cluster = ray_start_cluster

    @ray.remote
    def f():
        return

    # Start with a single node that only offers custom resource "0".
    cluster.add_node(resources={"0": 100})
    ray.init(address=cluster.address)

    # A task that needs resource "1" cannot be placed yet.
    x_id = f._remote(args=[], kwargs={}, resources={"1": 1})

    # Adding a node that offers resource "1" makes the task feasible, so
    # its result can be fetched.
    cluster.add_node(resources={"1": 100})
    ray.get(x_id)

    # Run a driver that submits a task needing resource "2" and exits
    # without waiting for it.
    driver_script = """
import ray

ray.init(address="{}")

@ray.remote(resources={})
def f():
{}pass  # This is a weird hack to insert some blank space.

f.remote()
""".format(cluster.address, "{str(2): 1}", " ")

    run_string_as_driver(driver_script)

    # Now add a node that offers resource "2".
    cluster.add_node(resources={"2": 100})

    # Tasks must still be schedulable against each of the three resources.
    pending = [
        f._remote(args=[], kwargs={}, resources={str(i): 1})
        for i in range(3)
    ]
    ray.get(pending)
def test_placement_groups(shutdown_only):
    """Check that placement group names are scoped by namespace."""
    info = ray.init(namespace="namespace")
    address = info["redis_address"]

    # First template field is the redis address, second is the namespace.
    driver_template = """
import ray

ray.init(address="{}", namespace="{}")

pg = ray.util.placement_group(bundles=[dict(CPU=1)], name="hello", lifetime="detached")
ray.get(pg.ready())
"""

    # Create a detached placement group named "hello" in another namespace.
    run_string_as_driver(driver_template.format(address, "different"))

    # Creating a group with the same name here should succeed because the
    # other one lives in a different namespace.
    probe = ray.util.placement_group(bundles=[{"CPU": 1}], name="hello")
    ray.get(probe.ready())
    ray.util.remove_placement_group(probe)

    # Poll (50 x 0.1 s) until the lookup raises ValueError, i.e. the group
    # created above is no longer found.
    removed = False
    for _ in range(50):
        try:
            ray.util.get_placement_group("hello")
        except ValueError:
            removed = True
            break
        time.sleep(0.1)

    assert removed, "This is an anti-flakey test measure"

    # Now create the same name in this namespace, from a different job.
    run_string_as_driver(driver_template.format(address, "namespace"))
def test_regular_actors(ray_start_cluster_head, working_dir):
    """Call a named, non-detached actor 1000 times under a working_dir runtime env."""
    cluster = ray_start_cluster_head
    redis_address = cluster.address
    runtime_env = f"""{{ "working_dir": "{working_dir}" }}"""
    # Statements the driver executes once the runtime env is in place.
    execute_statement = """
test_actor = TestActor.options(name="test_actor").remote()
print(sum(ray.get([test_actor.one.remote()] * 1000)))
"""
    # The template is filled from the local variable names above, so those
    # names must stay exactly as they are.
    script = driver_script.format(**locals())
    out = run_string_as_driver(script)

    # The last token printed by the driver is the sum of the actor results.
    assert out.split()[-1] == "1000"

    from ray._private.runtime_env import PKG_DIR
    # Exactly one entry is expected in the package directory afterwards.
    remaining_entries = list(Path(PKG_DIR).iterdir())
    assert len(remaining_entries) == 1
def test_connecting_in_local_case(ray_start_regular):
    """Check that a second driver can connect to the already running cluster."""
    address_info = ray_start_regular
    redis_address = address_info["redis_address"]

    # The driver only connects and prints a marker.
    driver_script = """
import ray
ray.init(address="{}")
print("success")
""".format(redis_address)

    driver_output = run_string_as_driver(driver_script)
    # The marker proves the other driver got past ray.init().
    assert "success" in driver_output
def test_driver_exiting_quickly(call_ray_start):
    """Run drivers that submit work and exit without waiting for it to finish."""
    address = call_ray_start

    ray.init(address=address)

    # Driver that creates an actor and exits right away.
    driver_script1 = """
import ray
ray.init(address="{}")
@ray.remote
class Foo:
    def __init__(self):
        pass
Foo.remote()
print("success")
""".format(address)

    # Driver that submits a task and exits right away.
    driver_script2 = """
import ray
ray.init(address="{}")
@ray.remote
def f():
    return 1
f.remote()
print("success")
""".format(address)

    # Alternate between the two drivers a few times; each one must run to
    # completion and print its marker.
    for _round in range(3):
        actor_driver_output = run_string_as_driver(driver_script1)
        assert "success" in actor_driver_output
        task_driver_output = run_string_as_driver(driver_script2)
        assert "success" in task_driver_output