def test_drivers_named_actors(call_ray_start):
    # This test will create some drivers that submit some tasks to the same
    # named actor.
    address = call_ray_start

    ray.init(address=address, namespace="")

    # Define a driver that creates a named actor then sleeps for a while.
    driver_script1 = """
import ray
import time
ray.init(address="{}", namespace="")

@ray.remote
class Counter:
    def __init__(self):
        self.count = 0

    def increment(self):
        self.count += 1
        return self.count

counter = Counter.options(name="Counter").remote()
time.sleep(100)
""".format(address)

    # Define a driver that submits to the named actor and exits.
    driver_script2 = """
import ray
import time
ray.init(address="{}", namespace="")

while True:
    try:
        counter = ray.get_actor("Counter")
        break
    except ValueError:
        time.sleep(1)

assert ray.get(counter.increment.remote()) == {}
print("success")
""".format(address, "{}")

    process_handle = run_string_as_driver_nonblocking(driver_script1)
    for i in range(3):
        driver_script = driver_script2.format(i + 1)
        out = run_string_as_driver(driver_script)
        assert "success" in out

    process_handle.kill()


def test_two_node_local_file(two_node_cluster, working_dir, client_mode):
    with open(os.path.join(working_dir, "test_file"), "w") as f:
        f.write("1")
    cluster, _ = two_node_cluster
    address, env, runtime_env_dir = start_client_server(cluster, client_mode)
    # Test runtime_env with working_dir.
    runtime_env = f"""{{ "working_dir": "{working_dir}" }}"""
    # Execute the following cmd in driver with runtime_env
    execute_statement = """
vals = ray.get([check_file.remote('test_file')] * 1000)
print(sum([int(v) for v in vals]))
"""
    script = driver_script.format(**locals())
    out = run_string_as_driver(script, env)
    assert out.strip().split()[-1] == "1000"
    assert len(list(Path(runtime_env_dir).iterdir())) == 1
    assert len(kv._internal_kv_list("gcs://")) == 0


def test_namespace(ray_start_cluster):
    """
    Most of the "checks" in this test case rely on the fact that
    `run_string_as_driver` will throw an exception if the driver string exits
    with a non-zero exit code (e.g. when the driver script throws an
    exception).

    Since all of these drivers start named, detached actors, the most likely
    failure case would be a collision of named actors if they're put in the
    same namespace.

    This test checks that:
    * When two drivers don't specify a namespace, they are placed in different
      anonymous namespaces.
    * When two drivers specify a namespace, they collide.
    * The namespace name (as provided by the runtime context) is correct.
    """
    cluster = ray_start_cluster
    cluster.add_node(num_cpus=4, ray_client_server_port=50055)
    cluster.wait_for_nodes(1)

    template = """
import ray
ray.client("localhost:50055").namespace({namespace}).connect()

@ray.remote
class Foo:
    def ping(self):
        return "pong"

a = Foo.options(lifetime="detached", name="abc").remote()
ray.get(a.ping.remote())
print("Current namespace:", ray.get_runtime_context().namespace)
"""

    anon_driver = template.format(namespace="None")
    run_string_as_driver(anon_driver)
    # This second run will fail if the actors don't run in separate anonymous
    # namespaces.
    run_string_as_driver(anon_driver)

    run_in_namespace = template.format(namespace="'namespace'")
    script_output = run_string_as_driver(run_in_namespace)
    # The second run fails because the actors are run in the same namespace.
    with pytest.raises(subprocess.CalledProcessError):
        run_string_as_driver(run_in_namespace)

    assert "Current namespace: namespace" in script_output
    subprocess.check_output("ray stop --force", shell=True)


def test_run_driver_twice(ray_start_regular):
    # We used to have issues 2165 and 2288:
    # https://github.com/ray-project/ray/issues/2165
    # https://github.com/ray-project/ray/issues/2288
    # Both reported that a driver hangs when run a second time.
    # This test verifies the fix for the above issues by running the same
    # driver twice and checking that both runs succeed.
    address_info = ray_start_regular
    driver_script = """
import ray
import ray.tune as tune
import os
import time

def train_func(config, reporter):  # add a reporter arg
    for i in range(2):
        time.sleep(0.1)
        reporter(timesteps_total=i, mean_accuracy=i+97)  # report metrics

os.environ["TUNE_RESUME_PROMPT_OFF"] = "True"
ray.init(address="{}", namespace="default_test_namespace")

ray.tune.register_trainable("train_func", train_func)
tune.run_experiments({{
    "my_experiment": {{
        "run": "train_func",
        "stop": {{"mean_accuracy": 99}},
        "config": {{
            "layer1": {{
                "class_name": tune.grid_search(["a"]),
                "config": {{"lr": tune.grid_search([1, 2])}}
            }},
        }},
        "local_dir": os.path.expanduser("~/tmp")
    }}
}})
print("success")
""".format(
        address_info["address"]
    )

    for i in range(2):
        out = run_string_as_driver(driver_script)
        assert "success" in out


def test_remote_function_isolation(call_ray_start):
    # This test will run multiple remote functions with the same names in
    # two different drivers. Connect a driver to the Ray cluster.
    address = call_ray_start

    ray.init(address=address)

    # Start another driver and make sure that it can define and call its
    # own commands with the same names.
    driver_script = """
import ray
import time
ray.init(address="{}")

@ray.remote
def f():
    return 3

@ray.remote
def g(x, y):
    return 4

for _ in range(10000):
    result = ray.get([f.remote(), g.remote(0, 0)])
    assert result == [3, 4]

print("success")
""".format(
        address
    )

    out = run_string_as_driver(driver_script)

    @ray.remote
    def f():
        return 1

    @ray.remote
    def g(x):
        return 2

    for _ in range(10000):
        result = ray.get([f.remote(), g.remote(0)])
        assert result == [1, 2]

    # Make sure the other driver succeeded.
    assert "success" in out


def test_user_setup_function():
    script = """
import ray
ray.init()

@ray.remote
def get_pkg_dir():
    return ray._private.runtime_env.VAR

print("remote", ray.get(get_pkg_dir.remote()))
print("local", ray._private.runtime_env.VAR)
"""

    env = {"RAY_USER_SETUP_FUNCTION": "ray._private.test_utils.set_setup_func"}
    out = run_string_as_driver(script, dict(os.environ, **env))
    (remote_out, local_out) = out.strip().splitlines()[-2:]
    assert remote_out == "remote hello world"
    assert local_out == "local hello world"


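# For reference, a minimal sketch of what a setup function wired up through
# RAY_USER_SETUP_FUNCTION might look like. This is an illustrative assumption,
# not the real body of ray._private.test_utils.set_setup_func: the test above
# only relies on the function running in every driver/worker at startup and
# setting ray._private.runtime_env.VAR to "hello world".
def _example_setup_func_sketch():
    # Hypothetical: set the module-level VAR that the driver script prints.
    import ray._private.runtime_env as runtime_env_module

    runtime_env_module.VAR = "hello world"

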
def test_two_node_uri(two_node_cluster, working_dir, client_mode):
    cluster, _ = two_node_cluster
    (address, env, PKG_DIR) = start_client_server(cluster, client_mode)
    with tempfile.NamedTemporaryFile(suffix="zip") as tmp_file:
        pkg_name = working_dir_pkg.get_project_package_name(
            working_dir, [], [])
        pkg_uri = working_dir_pkg.Protocol.PIN_GCS.value + "://" + pkg_name
        working_dir_pkg.create_project_package(working_dir, [], [],
                                               tmp_file.name)
        working_dir_pkg.push_package(pkg_uri, tmp_file.name)
        runtime_env = f"""{{ "uris": ["{pkg_uri}"] }}"""
        # Execute the following cmd in driver with runtime_env
        execute_statement = "print(sum(ray.get([run_test.remote()] * 1000)))"
    script = driver_script.format(**locals())
    out = run_string_as_driver(script, env)
    assert out.strip().split()[-1] == "1000"
    assert len(list(Path(PKG_DIR).iterdir())) == 1
    # pinned uri will not be deleted
    print(list(kv._internal_kv_list("")))
    assert len(kv._internal_kv_list("pingcs://")) == 1


def test_large_dir_upload_message(start_cluster, option):
    cluster, address = start_cluster
    with tempfile.TemporaryDirectory() as tmp_dir:
        filepath = os.path.join(tmp_dir, "test_file.txt")
        if option == "working_dir":
            driver_script = f"""
import ray
ray.init("{address}", runtime_env={{"working_dir": "{tmp_dir}"}})
"""
        else:
            driver_script = f"""
import ray
ray.init("{address}", runtime_env={{"py_modules": ["{tmp_dir}"]}})
"""

        with open(filepath, "w") as f:
            f.write("Hi")

        output = run_string_as_driver(driver_script)
        assert "Pushing file package" in output
        assert "Successfully pushed file package" in output
        assert "warning" not in output.lower()


def test_node_name_in_raylet_death():
    NODE_NAME = "RAY_TEST_RAYLET_DEATH_NODE_NAME"
    script = f"""
import ray
import time
import os

NUM_HEARTBEATS = 10
HEARTBEAT_PERIOD = 500
WAIT_BUFFER_SECONDS = 5

os.environ["RAY_num_heartbeats_timeout"] = str(NUM_HEARTBEATS)
os.environ["RAY_raylet_heartbeat_period_milliseconds"] = str(HEARTBEAT_PERIOD)

ray.init(_node_name=\"{NODE_NAME}\")
# This will kill raylet without letting it exit gracefully.
ray.worker._global_node.kill_raylet()
time.sleep(NUM_HEARTBEATS * HEARTBEAT_PERIOD / 1000 + WAIT_BUFFER_SECONDS)
ray.shutdown()
"""
    out = run_string_as_driver(script)
    assert out.count(f"node name: {NODE_NAME} has been marked dead") == 1


def test_jobconfig_compatible_2(ray_start_cluster_head, working_dir):
    # start job_config=something
    # start job_config=None
    cluster = ray_start_cluster_head
    (address, env, PKG_DIR) = start_client_server(cluster, True)
    runtime_env = """{  "py_modules": [test_module.__path__[0]] }"""
    # Make the first driver hang there.
    execute_statement = """
time.sleep(600)
"""
    script = driver_script.format(**locals())
    proc = run_string_as_driver_nonblocking(script, env)
    time.sleep(5)
    runtime_env = None
    # Execute the following in the second one, which should succeed.
    execute_statement = "print('OK')"
    script = driver_script.format(**locals())
    out = run_string_as_driver(script, env)
    assert out.strip().split()[-1] == "OK", out
    proc.kill()
    proc.wait()


def test_storage_isolation(external_redis, call_ray_start, call_ray_start_2):
    script = """
import ray
ray.init("{address}", namespace="a")

@ray.remote
class A:
    def ready(self):
        return {val}
    pass

a = A.options(lifetime="detached", name="A").remote()
assert ray.get(a.ready.remote()) == {val}
"""
    run_string_as_driver(script.format(address=call_ray_start, val=1))
    run_string_as_driver(script.format(address=call_ray_start_2, val=2))

    script = """
import ray
ray.init("{address}", namespace="a")

a = ray.get_actor(name="A")
assert ray.get(a.ready.remote()) == {val}
"""
    run_string_as_driver(script.format(address=call_ray_start, val=1))
    run_string_as_driver(script.format(address=call_ray_start_2, val=2))


def test_ray_start_non_head(call_ray_stop_only, monkeypatch):
    # Test that we can call ray start to connect to an existing cluster.

    # Test starting Ray with a port specified.
    check_call_ray(
        ["start", "--head", "--port", "7298", "--resources", '{"res_0": 1}'])

    # Test starting a node that connects to the above cluster.
    check_call_ray([
        "start", "--address", "127.0.0.1:7298", "--resources", '{"res_1": 1}'
    ])

    # Test starting Ray with address `auto`.
    check_call_ray(
        ["start", "--address", "auto", "--resources", '{"res_2": 1}'])

    # Run tasks to verify nodes with custom resources are available.
    driver_script = """
import ray
ray.init()

@ray.remote
def f():
    return 1

assert ray.get(f.remote()) == 1
assert ray.get(f.options(resources={"res_0": 1}).remote()) == 1
assert ray.get(f.options(resources={"res_1": 1}).remote()) == 1
assert ray.get(f.options(resources={"res_2": 1}).remote()) == 1
print("success")
"""
    monkeypatch.setenv("RAY_ADDRESS", "auto")
    out = run_string_as_driver(driver_script)
    # Make sure the driver succeeded.
    assert "success" in out

    check_call_ray(["stop"])


def test_jobconfig_compatible_1(ray_start_cluster_head, working_dir):
    # start job_config=None
    # start job_config=something
    cluster = ray_start_cluster_head
    (address, env, PKG_DIR) = start_client_server(cluster, True)
    runtime_env = None
    # Make the first driver hang there.
    execute_statement = """
time.sleep(600)
"""
    script = driver_script.format(**locals())
    # Have one running with job config = None
    proc = run_string_as_driver_nonblocking(script, env)
    # Wait for it to be up.
    time.sleep(5)
    runtime_env = f"""{{  "working_dir": "{working_dir}" }}"""
    # Execute the second one, which should work because Ray Client runs each
    # driver in its own job.
    execute_statement = "print(sum(ray.get([run_test.remote()] * 1000)))"
    script = driver_script.format(**locals())
    out = run_string_as_driver(script, env)
    assert out.strip().split()[-1] == "1000"
    proc.kill()
    proc.wait()


def test_detached_actors(ray_start_cluster_head, working_dir, client_mode):
    cluster = ray_start_cluster_head
    address, env, runtime_env_dir = start_client_server(cluster, client_mode)
    runtime_env = f"""{{ "working_dir": "{working_dir}" }}"""
    # Execute the following cmd in driver with runtime_env
    execute_statement = """
test_actor = TestActor.options(name="test_actor", lifetime="detached").remote()
print(sum(ray.get([test_actor.one.remote()] * 1000)))
"""
    script = driver_script.format(**locals())
    out = run_string_as_driver(script, env)
    assert out.strip().split()[-1] == "1000"
    # It's a detached actor, so it should still be there.
    assert len(kv._internal_kv_list("gcs://")) == 1
    assert len(list(Path(runtime_env_dir).iterdir())) == 2
    pkg_dir = [f for f in Path(runtime_env_dir).glob("*") if f.is_dir()][0]
    sys.path.insert(0, str(pkg_dir))
    test_actor = ray.get_actor("test_actor")
    assert sum(ray.get([test_actor.one.remote()] * 1000)) == 1000
    ray.kill(test_actor)
    time.sleep(5)
    assert len(list(Path(runtime_env_dir).iterdir())) == 1
    assert len(kv._internal_kv_list("gcs://")) == 0


def test_jobconfig_compatible_3(ray_start_cluster_head, working_dir):
    # start job_config=something
    # start job_config=something else
    cluster = ray_start_cluster_head
    (address, env, PKG_DIR) = start_client_server(cluster, True)
    runtime_env = """{  "py_modules": [test_module.__path__[0]] }"""
    # Make the first driver hang there.
    execute_statement = """
time.sleep(600)
"""
    script = driver_script.format(**locals())
    proc = run_string_as_driver_nonblocking(script, env)
    time.sleep(5)
    runtime_env = f"""
{{  "working_dir": test_module.__path__[0] }}"""  # noqa: F541
    # Execute the following cmd in the second one and ensure that
    # it is able to run.
    execute_statement = "print('OK')"
    script = driver_script.format(**locals())
    out = run_string_as_driver(script, env)
    proc.kill()
    proc.wait()
    assert out.strip().split()[-1] == "OK"


def test_run_on_all_workers(ray_start_regular, tmp_path):
    # This test ensures that run_function_on_all_workers is executed
    # on all workers.
    lock_file = tmp_path / "lock"
    data_file = tmp_path / "data"
    driver_script = f"""
import ray
from filelock import FileLock
from pathlib import Path
import pickle

lock_file = r"{str(lock_file)}"
data_file = Path(r"{str(data_file)}")

def init_func(worker_info):
    with FileLock(lock_file):
        if data_file.exists():
            old = pickle.loads(data_file.read_bytes())
        else:
            old = []
        old.append(worker_info['worker'].worker_id)
        data_file.write_bytes(pickle.dumps(old))

ray.worker.global_worker.run_function_on_all_workers(init_func)
ray.init(address='auto')

@ray.remote
def ready():
    with FileLock(lock_file):
        worker_ids = pickle.loads(data_file.read_bytes())
        assert ray.worker.global_worker.worker_id in worker_ids

ray.get(ready.remote())
"""
    run_string_as_driver(driver_script)
    run_string_as_driver(driver_script)
    run_string_as_driver(driver_script)


def test_cleanup_on_driver_exit(call_ray_start):
    # This test will create a driver that creates a bunch of objects and then
    # exits. The entries in the object table should be cleaned up.
    address = call_ray_start

    ray.init(address=address)

    # Define a driver that creates a bunch of objects and exits.
    driver_script = """
import time
import ray
import numpy as np
from ray._private.test_utils import object_memory_usage
import os

ray.init(address="{}")
object_refs = [ray.put(np.zeros(200 * 1024, dtype=np.uint8))
               for i in range(1000)]
start_time = time.time()
while time.time() - start_time < 30:
    if object_memory_usage() > 0:
        break
else:
    raise Exception("Objects did not appear in object table.")

@ray.remote
def f():
    time.sleep(1)

print("success")
# Submit some tasks without waiting for them to finish. Their workers should
# still get cleaned up eventually, even if they get started after the driver
# exits.
[f.remote() for _ in range(10)]
""".format(
        address
    )

    out = run_string_as_driver(driver_script)
    assert "success" in out

    # Make sure the objects are removed from the object table.
    start_time = time.time()
    while time.time() - start_time < 30:
        if object_memory_usage() == 0:
            break
    else:
        raise Exception("Objects were not all removed from object table.")

    def all_workers_exited():
        result = True
        print("list of idle workers:")
        for proc in psutil.process_iter():
            if ray_constants.WORKER_PROCESS_TYPE_IDLE_WORKER in proc.name():
                print(f"{proc}")
                result = False
        return result

    # Check that workers are eventually cleaned up.
    wait_for_condition(all_workers_exited, timeout=15, retry_interval_ms=1000)


def test_automatic_cleanup_detached_actors(ray_start_cluster):
    # Make sure the placement groups created by a
    # detached actor are cleaned up properly.
    cluster = ray_start_cluster
    num_nodes = 3
    num_cpu_per_node = 2
    # Create a 3-node cluster.
    for _ in range(num_nodes):
        cluster.add_node(num_cpus=num_cpu_per_node)
    cluster.wait_for_nodes()

    info = ray.init(
        address=cluster.address, namespace="default_test_namespace")
    available_cpus = ray.available_resources()["CPU"]
    assert available_cpus == num_nodes * num_cpu_per_node

    driver_code = f"""
import ray

ray.init(address="{info["address"]}", namespace="default_test_namespace")

def create_pg():
    pg = ray.util.placement_group(
        [{{"CPU": 1}} for _ in range(3)],
        strategy="STRICT_SPREAD")
    ray.get(pg.ready())
    return pg

# TODO(sang): Placement groups created by tasks launched by detached actor
# is not cleaned with the current protocol.
# @ray.remote(num_cpus=0)
# def f():
#     create_pg()

@ray.remote(num_cpus=0, max_restarts=1)
class A:
    def create_pg(self):
        create_pg()

    def create_child_pg(self):
        self.a = A.options(name="B").remote()
        ray.get(self.a.create_pg.remote())

    def kill_child_actor(self):
        ray.kill(self.a)
        try:
            ray.get(self.a.create_pg.remote())
        except Exception:
            pass

a = A.options(lifetime="detached", name="A").remote()
ray.get(a.create_pg.remote())
# TODO(sang): Currently, child tasks are cleaned when a detached actor
# is dead. We cannot test this scenario until it is fixed.
# ray.get(a.create_child_pg.remote())

ray.shutdown()
"""
    run_string_as_driver(driver_code)

    # Wait until the driver is reported as dead by GCS.
    def is_job_done():
        jobs = ray.state.jobs()
        for job in jobs:
            if job["IsDead"]:
                return True
        return False

    def assert_num_cpus(expected_num_cpus):
        if expected_num_cpus == 0:
            return "CPU" not in ray.available_resources()
        return ray.available_resources()["CPU"] == expected_num_cpus

    wait_for_condition(is_job_done)
    wait_for_condition(lambda: assert_num_cpus(num_nodes))

    # Make sure when a child actor spawned by a detached actor
    # is killed, the placement group is removed.
    a = ray.get_actor("A")
    # TODO(sang): child of detached actors
    # seem to be killed when jobs are done. We should fix this before
    # testing this scenario.
    # ray.get(a.kill_child_actor.remote())
    # assert assert_num_cpus(num_nodes)

    # Make sure placement groups are cleaned when detached actors are killed.
    ray.kill(a, no_restart=False)
    wait_for_condition(lambda: assert_num_cpus(num_nodes * num_cpu_per_node))
    # The detached actor a should've been restarted.
    # Recreate a placement group.
    ray.get(a.create_pg.remote())
    wait_for_condition(lambda: assert_num_cpus(num_nodes))
    # Kill it again and make sure the placement group
    # that is created is deleted again.
    ray.kill(a, no_restart=False)
    wait_for_condition(lambda: assert_num_cpus(num_nodes * num_cpu_per_node))


def test_automatic_cleanup_job(ray_start_cluster):
    # Make sure the placement groups created by a
    # job, actor, and task are cleaned up when the job is done.
    cluster = ray_start_cluster
    num_nodes = 3
    num_cpu_per_node = 4
    # Create a 3-node cluster.
    for _ in range(num_nodes):
        cluster.add_node(num_cpus=num_cpu_per_node)
    cluster.wait_for_nodes()

    info = ray.init(address=cluster.address)
    available_cpus = ray.available_resources()["CPU"]
    assert available_cpus == num_nodes * num_cpu_per_node

    driver_code = f"""
import ray

ray.init(address="{info["address"]}")

def create_pg():
    pg = ray.util.placement_group(
        [{{"CPU": 1}} for _ in range(3)],
        strategy="STRICT_SPREAD")
    ray.get(pg.ready())
    return pg

@ray.remote(num_cpus=0)
def f():
    create_pg()

@ray.remote(num_cpus=0)
class A:
    def create_pg(self):
        create_pg()

ray.get(f.remote())
a = A.remote()
ray.get(a.create_pg.remote())
# Create 2 pgs to make sure multiple placement groups that belong
# to a single job will be properly cleaned.
create_pg()
create_pg()

ray.shutdown()
"""
    run_string_as_driver(driver_code)

    # Wait until the driver is reported as dead by GCS.
    def is_job_done():
        jobs = ray.state.jobs()
        for job in jobs:
            if job["IsDead"]:
                return True
        return False

    def assert_num_cpus(expected_num_cpus):
        if expected_num_cpus == 0:
            return "CPU" not in ray.available_resources()
        return ray.available_resources()["CPU"] == expected_num_cpus

    wait_for_condition(is_job_done)
    available_cpus = ray.available_resources()["CPU"]
    wait_for_condition(lambda: assert_num_cpus(num_nodes * num_cpu_per_node))


def test_working_dir_scale_up_in_new_driver(ray_start, tmp_dir, use_ray_client):
    with open("hello", "w") as f:
        f.write("world")

    driver1 = """
import os

import ray
from ray import serve

job_config = ray.job_config.JobConfig(runtime_env={{"working_dir": "."}})

if {use_ray_client}:
    ray.util.connect("{client_addr}", namespace="serve", job_config=job_config)
else:
    ray.init(address="auto", namespace="serve", job_config=job_config)

serve.start(detached=True)

@serve.deployment(version="1")
class Test:
    def __call__(self, *args):
        return os.getpid(), open("hello").read()

Test.deploy()
handle = Test.get_handle()
assert ray.get(handle.remote())[1] == "world"
""".format(
        use_ray_client=use_ray_client, client_addr=ray_start
    )

    run_string_as_driver(driver1)

    with open("hello", "w") as f:
        f.write("no longer world")

    driver2 = """
import ray
from ray import serve

job_config = ray.job_config.JobConfig(runtime_env={{"working_dir": "."}})

if {use_ray_client}:
    ray.util.connect("{client_addr}", namespace="serve", job_config=job_config)
else:
    ray.init(address="auto", namespace="serve", job_config=job_config)

serve.start(detached=True)

Test = serve.get_deployment("Test")
Test.options(num_replicas=2).deploy()
handle = Test.get_handle()
results = ray.get([handle.remote() for _ in range(1000)])
print(set(results))
assert all(r[1] == "world" for r in results), (
    "results should still come from the first env")
assert len(set(r[0] for r in results)) == 2, (
    "make sure there are two replicas")
Test.delete()
""".format(
        use_ray_client=use_ray_client, client_addr=ray_start
    )

    run_string_as_driver(driver2)


def test_log_redirect_to_stderr(shutdown_only, capfd):
    log_components = {
        ray_constants.PROCESS_TYPE_DASHBOARD: "Dashboard head grpc address",
        ray_constants.PROCESS_TYPE_DASHBOARD_AGENT: "Dashboard agent grpc address",
        ray_constants.PROCESS_TYPE_GCS_SERVER: "Loading job table data",
        # No log monitor output if all components are writing to stderr.
        ray_constants.PROCESS_TYPE_LOG_MONITOR: "",
        ray_constants.PROCESS_TYPE_MONITOR: "Starting monitor using ray installation",
        ray_constants.PROCESS_TYPE_PYTHON_CORE_WORKER: "worker server started",
        ray_constants.PROCESS_TYPE_PYTHON_CORE_WORKER_DRIVER: "driver server started",
        # TODO(Clark): Add coverage for Ray Client.
        # ray_constants.PROCESS_TYPE_RAY_CLIENT_SERVER: "Starting Ray Client server",
        ray_constants.PROCESS_TYPE_RAY_CLIENT_SERVER: "",
        ray_constants.PROCESS_TYPE_RAYLET: "Starting object store with directory",
        # No reaper process run (kernel fate-sharing).
        ray_constants.PROCESS_TYPE_REAPER: "",
        # No reporter process run.
        ray_constants.PROCESS_TYPE_REPORTER: "",
        # No web UI process run.
        ray_constants.PROCESS_TYPE_WEB_UI: "",
        # Unused.
        ray_constants.PROCESS_TYPE_WORKER: "",
    }

    script = """
import os
from pathlib import Path

import ray

os.environ["RAY_LOG_TO_STDERR"] = "1"
ray.init()

session_dir = ray.worker.global_worker.node.address_info["session_dir"]
session_path = Path(session_dir)
log_dir_path = session_path / "logs"

# Run the basic workload.
@ray.remote
def f():
    for i in range(10):
        print(f"test {{i}}")

ray.get(f.remote())

log_component_names = {}

# Confirm that no log files are created for any of the components.
paths = list(path.stem for path in log_dir_path.iterdir())
assert set(log_component_names).isdisjoint(set(paths)), paths
""".format(
        str(list(log_components.keys()))
    )
    stderr = run_string_as_driver(script)

    # Make sure that the expected startup log records for each of the
    # components appear in the stderr stream.
    # stderr = capfd.readouterr().err
    for component, canonical_record in log_components.items():
        if not canonical_record:
            # Process not run or doesn't generate logs; skip.
            continue
        assert canonical_record in stderr, stderr
        if component == ray_constants.PROCESS_TYPE_REDIS_SERVER:
            # Redis doesn't expose hooks for custom log formats, so we aren't
            # able to inject the Redis server component name into the log
            # records.
            continue
        # NOTE: We do a prefix match instead of including the enclosing right
        # parentheses since some components, like the core driver and worker,
        # add a unique ID suffix.
        assert f"({component}" in stderr, stderr


def test_detached_actor_cleanup(ray_start_regular):
    @ray.remote
    class DetachedActor:
        def ping(self):
            return "pong"

    dup_actor_name = "actor"

    def create_and_kill_actor(actor_name):
        # Make sure the same name is creatable after killing it.
        detached_actor = DetachedActor.options(
            lifetime="detached", name=actor_name).remote()
        # Wait for detached actor creation.
        assert ray.get(detached_actor.ping.remote()) == "pong"
        del detached_actor
        assert ray.util.list_named_actors() == [dup_actor_name]
        detached_actor = ray.get_actor(dup_actor_name)
        ray.kill(detached_actor)
        # Wait until actor dies.
        actor_status = ray.state.actors(
            actor_id=detached_actor._actor_id.hex())
        max_wait_time = 10
        wait_time = 0
        while actor_status["State"] != gcs_utils.ActorTableData.DEAD:
            actor_status = ray.state.actors(
                actor_id=detached_actor._actor_id.hex())
            time.sleep(1.0)
            wait_time += 1
            if wait_time >= max_wait_time:
                assert None, (
                    "It took too much time to kill an actor: {}".format(
                        detached_actor._actor_id))

    create_and_kill_actor(dup_actor_name)

    # This shouldn't be broken because the actor
    # name should have been cleaned up from GCS.
    create_and_kill_actor(dup_actor_name)

    redis_address = ray_start_regular["redis_address"]
    driver_script = """
import ray
import ray._private.gcs_utils as gcs_utils
import time
ray.init(address="{}", namespace="default_test_namespace")

@ray.remote
class DetachedActor:
    def ping(self):
        return "pong"

# Make sure same name is creatable after killing it.
detached_actor = DetachedActor.options(lifetime="detached", name="{}").remote()
assert ray.get(detached_actor.ping.remote()) == "pong"
ray.kill(detached_actor)
# Wait until actor dies.
actor_status = ray.state.actors(actor_id=detached_actor._actor_id.hex())
max_wait_time = 10
wait_time = 0
while actor_status["State"] != gcs_utils.ActorTableData.DEAD:
    actor_status = ray.state.actors(actor_id=detached_actor._actor_id.hex())
    time.sleep(1.0)
    wait_time += 1
    if wait_time >= max_wait_time:
        assert None, (
            "It took too much time to kill an actor")
""".format(redis_address, dup_actor_name)
    run_string_as_driver(driver_script)
    # Make sure we can create a detached actor with the same name after it has
    # been created and killed by another script.
    create_and_kill_actor(dup_actor_name)


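# The manual polling loops above (in the test body and in the driver script)
# could also be expressed with the `wait_for_condition` helper that other
# tests in this file already use. This is only an illustrative sketch of that
# alternative, not part of the original test.
def _wait_until_actor_dead_sketch(actor, max_wait_time=10):
    # Poll the GCS actor table until the actor is marked DEAD, failing after
    # `max_wait_time` seconds. Mirrors the loop in test_detached_actor_cleanup.
    wait_for_condition(
        lambda: ray.state.actors(actor_id=actor._actor_id.hex())["State"]
        == gcs_utils.ActorTableData.DEAD,
        timeout=max_wait_time,
    )

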
def test_detached_actor(ray_start_regular):
    @ray.remote
    class DetachedActor:
        def ping(self):
            return "pong"

    with pytest.raises(TypeError):
        DetachedActor._remote(lifetime="detached", name=1)

    with pytest.raises(
            ValueError, match="Actor name cannot be an empty string"):
        DetachedActor._remote(lifetime="detached", name="")

    with pytest.raises(ValueError):
        DetachedActor._remote(lifetime="detached", name="hi", namespace="")

    with pytest.raises(TypeError):
        DetachedActor._remote(lifetime="detached", name="hi", namespace=2)

    d = DetachedActor._remote(lifetime="detached", name="d_actor")
    assert ray.get(d.ping.remote()) == "pong"

    with pytest.raises(ValueError, match="Please use a different name"):
        DetachedActor._remote(lifetime="detached", name="d_actor")

    redis_address = ray_start_regular["redis_address"]

    get_actor_name = "d_actor"
    create_actor_name = "DetachedActor"
    driver_script = """
import ray
ray.init(address="{}", namespace="default_test_namespace")

name = "{}"
assert ray.util.list_named_actors() == [name]
existing_actor = ray.get_actor(name)
assert ray.get(existing_actor.ping.remote()) == "pong"

@ray.remote
def foo():
    return "bar"

@ray.remote
class NonDetachedActor:
    def foo(self):
        return "bar"

@ray.remote
class DetachedActor:
    def ping(self):
        return "pong"

    def foobar(self):
        actor = NonDetachedActor.remote()
        return ray.get([foo.remote(), actor.foo.remote()])

actor = DetachedActor._remote(lifetime="detached", name="{}")
ray.get(actor.ping.remote())
""".format(redis_address, get_actor_name, create_actor_name)

    run_string_as_driver(driver_script)
    assert len(ray.util.list_named_actors()) == 2
    assert get_actor_name in ray.util.list_named_actors()
    assert create_actor_name in ray.util.list_named_actors()
    detached_actor = ray.get_actor(create_actor_name)
    assert ray.get(detached_actor.ping.remote()) == "pong"
    # Verify that a detached actor is able to create tasks/actors
    # even if the driver of the detached actor has exited.
    assert ray.get(detached_actor.foobar.remote()) == ["bar", "bar"]


def test_serve_snapshot(ray_start_with_dashboard):
    """Test detached and nondetached Serve instances running concurrently."""

    detached_serve_driver_script = f"""
import ray
from ray import serve

ray.init(
    address="{ray_start_with_dashboard['redis_address']}",
    namespace="serve")

serve.start(detached=True)

@serve.deployment
def my_func(request):
    return "hello"

my_func.deploy()

@serve.deployment(version="v1")
def my_func_deleted(request):
    return "hello"

my_func_deleted.deploy()
my_func_deleted.delete()
"""

    run_string_as_driver(detached_serve_driver_script)
    assert requests.get("http://127.0.0.1:8000/my_func").text == "hello"

    # Use a new port to avoid clobbering the first Serve instance.
    serve.start(http_options={"port": 8123})

    @serve.deployment(version="v1")
    def my_func_nondetached(request):
        return "hello"

    my_func_nondetached.deploy()

    assert requests.get(
        "http://127.0.0.1:8123/my_func_nondetached").text == "hello"

    webui_url = ray_start_with_dashboard["webui_url"]
    webui_url = format_web_url(webui_url)
    response = requests.get(f"{webui_url}/api/snapshot")
    response.raise_for_status()
    data = response.json()
    schema_path = os.path.join(
        os.path.dirname(dashboard.__file__),
        "modules/snapshot/snapshot_schema.json")
    pprint.pprint(data)
    jsonschema.validate(instance=data, schema=json.load(open(schema_path)))

    assert len(data["data"]["snapshot"]["deployments"]) == 3

    entry = data["data"]["snapshot"]["deployments"][hashlib.sha1(
        "my_func".encode()).hexdigest()]
    assert entry["name"] == "my_func"
    assert entry["version"] == "None"
    assert entry["namespace"] == "serve"
    assert entry["httpRoute"] == "/my_func"
    assert entry["className"] == "my_func"
    assert entry["status"] == "RUNNING"
    assert entry["rayJobId"] is not None
    assert entry["startTime"] > 0
    assert entry["endTime"] == 0

    assert len(entry["actors"]) == 1
    actor_id = next(iter(entry["actors"]))
    metadata = data["data"]["snapshot"]["actors"][actor_id]["metadata"][
        "serve"]
    assert metadata["deploymentName"] == "my_func"
    assert metadata["version"] == "None"
    assert len(metadata["replicaTag"]) > 0

    entry_deleted = data["data"]["snapshot"]["deployments"][hashlib.sha1(
        "my_func_deleted".encode()).hexdigest()]
    assert entry_deleted["name"] == "my_func_deleted"
    assert entry_deleted["version"] == "v1"
    assert entry_deleted["namespace"] == "serve"
    assert entry_deleted["httpRoute"] == "/my_func_deleted"
    assert entry_deleted["className"] == "my_func_deleted"
    assert entry_deleted["status"] == "DELETED"
    assert entry_deleted["rayJobId"] is not None
    assert entry_deleted["startTime"] > 0
    assert entry_deleted["endTime"] > entry_deleted["startTime"]

    entry_nondetached = data["data"]["snapshot"]["deployments"][hashlib.sha1(
        "my_func_nondetached".encode()).hexdigest()]
    assert entry_nondetached["name"] == "my_func_nondetached"
    assert entry_nondetached["version"] == "v1"
    assert entry_nondetached["namespace"] == "default_test_namespace"
    assert entry_nondetached["httpRoute"] == "/my_func_nondetached"
    assert entry_nondetached["className"] == "my_func_nondetached"
    assert entry_nondetached["status"] == "RUNNING"
    assert entry_nondetached["rayJobId"] is not None
    assert entry_nondetached["startTime"] > 0
    assert entry_nondetached["endTime"] == 0

    assert len(entry_nondetached["actors"]) == 1
    actor_id = next(iter(entry_nondetached["actors"]))
    metadata = data["data"]["snapshot"]["actors"][actor_id]["metadata"][
        "serve"]
    assert metadata["deploymentName"] == "my_func_nondetached"
    assert metadata["version"] == "v1"
    assert len(metadata["replicaTag"]) > 0

    my_func_nondetached.delete()


def test_ray_client(ray_client_instance):
    ray.util.connect(ray_client_instance, namespace="default_test_namespace")

    start = """
import ray
ray.util.connect("{}", namespace="default_test_namespace")

from ray import serve

serve.start(detached=True)
""".format(ray_client_instance)
    run_string_as_driver(start)

    deploy = """
import ray
ray.util.connect("{}", namespace="default_test_namespace")

from ray import serve

@serve.deployment(name="test1", route_prefix="/hello")
def f(*args):
    return "hello"

f.deploy()
""".format(ray_client_instance)
    run_string_as_driver(deploy)

    assert "test1" in serve.list_deployments()
    assert requests.get("http://*****:*****@app.get("/")
def hello():
    return "hello"

@serve.deployment
@serve.ingress(app)
class A:
    pass

A.deploy()
""".format(ray_client_instance)
    run_string_as_driver(fastapi)

    assert requests.get("http://localhost:8000/A").json() == "hello"

    serve.shutdown()
    ray.util.disconnect()


def test_driver_exiting_when_worker_blocked(call_ray_start):
    # This test will create some drivers that submit some tasks and then
    # exit without waiting for the tasks to complete.
    address = call_ray_start

    ray.init(address=address)

    # Define a driver that creates two tasks, one that runs forever and the
    # other blocked on the first in a `ray.get`.
    driver_script = """
import time
import ray
ray.init(address="{}")

@ray.remote
def f():
    time.sleep(10**6)

@ray.remote
def g():
    ray.get(f.remote())

g.remote()
time.sleep(1)
print("success")
""".format(address)

    # Create some drivers and let them exit and make sure everything is
    # still alive.
    for _ in range(3):
        out = run_string_as_driver(driver_script)
        # Make sure the first driver ran to completion.
        assert "success" in out

    # Define a driver that creates two tasks, one that runs forever and the
    # other blocked on the first in a `ray.wait`.
    driver_script = """
import time
import ray
ray.init(address="{}")

@ray.remote
def f():
    time.sleep(10**6)

@ray.remote
def g():
    ray.wait([f.remote()])

g.remote()
time.sleep(1)
print("success")
""".format(address)

    # Create some drivers and let them exit and make sure everything is
    # still alive.
    for _ in range(3):
        out = run_string_as_driver(driver_script)
        # Make sure the first driver ran to completion.
        assert "success" in out

    # Define a driver that creates one task that depends on a nonexistent
    # object. This task will be queued as waiting to execute.
    driver_script_template = """
import time
import ray
ray.init(address="{}")

@ray.remote
def g(x):
    return

g.remote(ray.ObjectRef(ray._private.utils.hex_to_binary("{}")))
time.sleep(1)
print("success")
"""

    # Create some drivers and let them exit and make sure everything is
    # still alive.
    for _ in range(3):
        nonexistent_id = ray.ObjectRef.from_random()
        driver_script = driver_script_template.format(
            address, nonexistent_id.hex())
        out = run_string_as_driver(driver_script)
        # Simulate the nonexistent dependency becoming available.
        ray.worker.global_worker.put_object(None, nonexistent_id)
        # Make sure the first driver ran to completion.
        assert "success" in out

    # Define a driver that calls `ray.wait` on a nonexistent object.
    driver_script_template = """
import time
import ray
ray.init(address="{}")

@ray.remote
def g():
    ray.wait(ray.ObjectRef(ray._private.utils.hex_to_binary("{}")))

g.remote()
time.sleep(1)
print("success")
"""

    # Create some drivers and let them exit and make sure everything is
    # still alive.
    for _ in range(3):
        nonexistent_id = ray.ObjectRef.from_random()
        driver_script = driver_script_template.format(
            address, nonexistent_id.hex())
        out = run_string_as_driver(driver_script)
        # Simulate the nonexistent dependency becoming available.
        ray.worker.global_worker.put_object(None, nonexistent_id)
        # Make sure the first driver ran to completion.
        assert "success" in out

    @ray.remote
    def f():
        return 1

    # Make sure we can still talk with the raylet.
    ray.get(f.remote())


def test_drivers_release_resources(call_ray_start):
    address = call_ray_start

    # Define a driver that creates an actor and exits.
    driver_script1 = """
import time
import ray

ray.init(address="{}")

@ray.remote
def f(duration):
    time.sleep(duration)

@ray.remote(num_gpus=1)
def g(duration):
    time.sleep(duration)

@ray.remote(num_gpus=1)
class Foo:
    def __init__(self):
        pass

# Make sure some resources are available for us to run tasks.
ray.get(f.remote(0))
ray.get(g.remote(0))

# Start a bunch of actors and tasks that use resources. These should all be
# cleaned up when this driver exits.
foos = [Foo.remote() for _ in range(100)]
[f.remote(10 ** 6) for _ in range(100)]

print("success")
""".format(
        address
    )

    driver_script2 = (
        driver_script1 + "import sys\nsys.stdout.flush()\ntime.sleep(10 ** 6)\n"
    )

    def wait_for_success_output(process_handle, timeout=10):
        # Wait until the process prints "success" and then return.
        start_time = time.time()
        while time.time() - start_time < timeout:
            output_line = ray._private.utils.decode(
                process_handle.stdout.readline()
            ).strip()
            print(output_line)
            if output_line == "success":
                return
            time.sleep(1)
        raise RayTestTimeoutException(
            "Timed out waiting for process to print success.")

    # Make sure we can run this driver repeatedly, which means that resources
    # are getting released in between.
    for _ in range(5):
        out = run_string_as_driver(driver_script1)
        # Make sure the first driver ran to completion.
        assert "success" in out
        # Also make sure that this works when the driver exits ungracefully.
        process_handle = run_string_as_driver_nonblocking(driver_script2)
        wait_for_success_output(process_handle)
        # Kill the process ungracefully.
        process_handle.kill()


def run_driver():
    output = run_string_as_driver(driver_script, encode="utf-8")
    assert "success" in output


def test_error_isolation(call_ray_start):
    address = call_ray_start
    # Connect a driver to the Ray cluster.
    ray.init(address=address)

    # If a GRPC call exceeds its timeout, the call is cancelled on the client
    # side but the server may still reply to it, leading to missed messages.
    # Using a sequence number to ensure no message is dropped could be the
    # long-term solution, but its complexity and the fact that Ray subscribers
    # do not use deadlines in production make it less preferred.
    # Therefore, a simpler workaround is used instead: a different subscriber
    # is used for each get_error_message() call.
    subscribers = [init_error_pubsub() for _ in range(3)]

    # There shouldn't be any errors yet.
    errors = get_error_message(subscribers[0], 1, timeout=2)
    assert len(errors) == 0

    error_string1 = "error_string1"
    error_string2 = "error_string2"

    @ray.remote
    def f():
        raise Exception(error_string1)

    # Run a remote function that throws an error.
    with pytest.raises(Exception):
        ray.get(f.remote())

    # Wait for the error to appear in Redis.
    errors = get_error_message(subscribers[1], 1)

    # Make sure we got the error.
    assert len(errors) == 1
    assert error_string1 in errors[0].error_message

    # Start another driver and make sure that it does not receive this
    # error. Make the other driver throw an error, and make sure it
    # receives that error.
    driver_script = """
import ray
import time
from ray._private.test_utils import init_error_pubsub, get_error_message

ray.init(address="{}")
subscribers = [init_error_pubsub() for _ in range(2)]
time.sleep(1)
errors = get_error_message(subscribers[0], 1, timeout=2)
assert len(errors) == 0

@ray.remote
def f():
    raise Exception("{}")

try:
    ray.get(f.remote())
except Exception as e:
    pass

errors = get_error_message(subscribers[1], 1)
assert len(errors) == 1

assert "{}" in errors[0].error_message

print("success")
""".format(
        address, error_string2, error_string2
    )

    out = run_string_as_driver(driver_script)
    # Make sure the other driver succeeded.
    assert "success" in out

    # Make sure that the other error message doesn't show up for this
    # driver.
    errors = get_error_message(subscribers[2], 1)
    assert len(errors) == 1


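# A small sketch of the "one subscriber per get_error_message() call"
# workaround described in the comment above; the test inlines the same
# pattern, so this helper is illustrative only and not part of the original.
def _get_errors_with_fresh_subscriber_sketch(num_errors, timeout=2):
    # Create a fresh pubsub subscriber so that a late reply to a previously
    # timed-out call cannot be misattributed to this one.
    subscriber = init_error_pubsub()
    return get_error_message(subscriber, num_errors, timeout=timeout)

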
def test_calling_start_ray_head(call_ray_stop_only):
    # Test that we can call ray start with various command line
    # parameters.

    # Test starting Ray with a redis port specified.
    check_call_ray(["start", "--head", "--port", "0"])
    check_call_ray(["stop"])

    # Test starting Ray with a node IP address specified.
    check_call_ray(
        ["start", "--head", "--node-ip-address", "127.0.0.1", "--port", "0"])
    check_call_ray(["stop"])

    # Test starting Ray with a system config parameter set.
    check_call_ray([
        "start",
        "--head",
        "--system-config",
        '{"metrics_report_interval_ms":100}',
        "--port",
        "0",
    ])
    check_call_ray(["stop"])

    # Test starting Ray with the object manager and node manager ports
    # specified.
    check_call_ray([
        "start",
        "--head",
        "--object-manager-port",
        "22345",
        "--node-manager-port",
        "54321",
        "--port",
        "0",
    ])
    check_call_ray(["stop"])

    # Test starting Ray with the worker port range specified.
    check_call_ray([
        "start",
        "--head",
        "--min-worker-port",
        "51000",
        "--max-worker-port",
        "51050",
        "--port",
        "0",
    ])
    check_call_ray(["stop"])

    # Test starting Ray with a worker port list.
    check_call_ray(["start", "--head", "--worker-port-list", "10002,10003"])
    check_call_ray(["stop"])

    # Test starting Ray with a non-int in the worker port list.
    with pytest.raises(subprocess.CalledProcessError):
        check_call_ray(["start", "--head", "--worker-port-list", "10002,a"])
    check_call_ray(["stop"])

    # Test starting Ray with an invalid port in the worker port list.
    with pytest.raises(subprocess.CalledProcessError):
        check_call_ray(["start", "--head", "--worker-port-list", "100"])
    check_call_ray(["stop"])

    # Test starting Ray with the number of CPUs specified.
    check_call_ray(["start", "--head", "--num-cpus", "2", "--port", "0"])
    check_call_ray(["stop"])

    # Test starting Ray with the number of GPUs specified.
    check_call_ray(["start", "--head", "--num-gpus", "100", "--port", "0"])
    check_call_ray(["stop"])

    # Test starting Ray with redis shard ports specified.
    check_call_ray([
        "start", "--head", "--redis-shard-ports", "6380,6381,6382", "--port",
        "0"
    ])
    check_call_ray(["stop"])

    # Test starting Ray with all arguments specified.
    check_call_ray([
        "start",
        "--head",
        "--redis-shard-ports",
        "6380,6381,6382",
        "--object-manager-port",
        "22345",
        "--num-cpus",
        "2",
        "--num-gpus",
        "0",
        "--resources",
        '{"Custom": 1}',
        "--port",
        "0",
    ])
    check_call_ray(["stop"])

    temp_dir = ray._private.utils.get_ray_temp_dir()

    # Test starting Ray with the RAY_REDIS_ADDRESS env variable.
    _, proc = _start_redis_instance(
        REDIS_EXECUTABLE,
        temp_dir,
        8888,
        password=ray_constants.REDIS_DEFAULT_PASSWORD)
    os.environ["RAY_REDIS_ADDRESS"] = "127.0.0.1:8888"
    check_call_ray(["start", "--head"])
    check_call_ray(["stop"])
    proc.process.terminate()
    del os.environ["RAY_REDIS_ADDRESS"]

    # Test --block. Killing a child process should cause the command to exit.
    blocked = subprocess.Popen(
        ["ray", "start", "--head", "--block", "--port", "0"])

    blocked.poll()
    assert blocked.returncode is None

    # Make sure ray cluster is up
    run_string_as_driver("""
import ray
from time import sleep
for i in range(0, 5):
    try:
        ray.init(address='auto')
        break
    except:
        sleep(1)
""")

    # Make sure ray cluster is up
    run_string_as_driver("""
import ray
from time import sleep
for i in range(0, 5):
    try:
        ray.init(address='auto')
        break
    except:
        sleep(1)
""")

    kill_process_by_name("raylet", SIGKILL=True)
    wait_for_children_of_pid_to_exit(blocked.pid, timeout=30)
    blocked.wait()
    assert blocked.returncode != 0, "ray start shouldn't return 0 on bad exit"

    # Test --block. Killing the command should clean up all child processes.
    blocked = subprocess.Popen(
        ["ray", "start", "--head", "--block", "--port", "0"])

    blocked.poll()
    assert blocked.returncode is None

    # Include GCS, autoscaler monitor, client server, dashboard, raylet and
    # log_monitor.py
    num_children = 6
    if not detect_fate_sharing_support():
        # Account for ray_process_reaper.py
        num_children += 1
    # Check a set of child process commands & scripts instead?
    wait_for_children_of_pid(
        blocked.pid, num_children=num_children, timeout=30)

    blocked.terminate()
    wait_for_children_of_pid_to_exit(blocked.pid, timeout=30)
    blocked.wait()
    assert blocked.returncode != 0, "ray start shouldn't return 0 on bad exit"