def test_run_runtime_env(ray_start_stop):
    """Test `serve run` with runtime_env passed in."""

    # With import path
    p = subprocess.Popen(
        [
            "serve",
            "run",
            "--address=auto",
            "ray.serve.tests.test_cli.metal_detector_node",
            "--runtime-env-json",
            '{"env_vars": {"buried_item": "lucky coin"} }',
        ]
    )
    wait_for_condition(
        lambda: ping_endpoint("MetalDetector") == "lucky coin", timeout=10
    )
    p.send_signal(signal.SIGINT)
    p.wait()

    # With config
    p = subprocess.Popen(
        [
            "serve",
            "run",
            "--address=auto",
            os.path.join(
                os.path.dirname(__file__),
                "test_config_files",
                "missing_runtime_env.yaml",
            ),
            "--runtime-env-json",
            (
                '{"py_modules": ["https://github.com/ray-project/test_deploy_group'
                '/archive/67971777e225600720f91f618cdfe71fc47f60ee.zip"],'
                '"working_dir": "http://nonexistentlink-q490123950ni34t"}'
            ),
            "--working-dir",
            (
                "https://github.com/ray-project/test_dag/archive/"
                "76a741f6de31df78411b1f302071cde46f098418.zip"
            ),
        ]
    )
    wait_for_condition(lambda: ping_endpoint("") == "wonderful world", timeout=15)
    p.send_signal(signal.SIGINT)
    p.wait()
def test_schedule_placement_groups_at_the_same_time(connect_to_client):
    ray.init(num_cpus=4)

    with connect_to_client_or_not(connect_to_client):
        pgs = [placement_group([{"CPU": 2}]) for _ in range(6)]
        wait_pgs = {pg.ready(): pg for pg in pgs}

        def is_all_placement_group_removed():
            ready, _ = ray.wait(list(wait_pgs.keys()), timeout=0.5)
            if ready:
                ready_pg = wait_pgs[ready[0]]
                remove_placement_group(ready_pg)
                del wait_pgs[ready[0]]

            if len(wait_pgs) == 0:
                return True
            return False

        wait_for_condition(is_all_placement_group_removed)

    ray.shutdown()
def test_task_summary(ray_start_cluster):
    cluster = ray_start_cluster
    cluster.add_node(num_cpus=2)
    ray.init(address=cluster.address)
    cluster.add_node(num_cpus=2)

    @ray.remote
    def run_long_time_task():
        time.sleep(30)
        return True

    @ray.remote
    def task_wait_for_dep(dep):
        print(dep)

    a = task_wait_for_dep.remote(run_long_time_task.remote())  # noqa
    b = task_wait_for_dep.remote(run_long_time_task.remote())  # noqa

    def verify():
        # task_name -> states
        task_summary = summarize_tasks()
        task_summary = task_summary["cluster"]["summary"]
        assert "task_wait_for_dep" in task_summary
        assert "run_long_time_task" in task_summary
        assert (
            task_summary["task_wait_for_dep"]["state_counts"][
                "WAITING_FOR_DEPENDENCIES"
            ]
            == 2
        )
        assert task_summary["run_long_time_task"]["state_counts"]["RUNNING"] == 2
        assert task_summary["task_wait_for_dep"]["type"] == "NORMAL_TASK"
        return True

    wait_for_condition(verify)

    """
    Test CLI
    """
    runner = CliRunner()
    result = runner.invoke(summary_state_cli_group, ["tasks"])
    assert "task_wait_for_dep" in result.output
    assert result.exit_code == 0
def test_e2e_basic_scale_up_down(serve_instance):
    """Send 100 requests and check that we autoscale up, and then back down."""
    signal = SignalActor.remote()

    @serve.deployment(
        _autoscaling_config={
            "metrics_interval_s": 0.1,
            "min_replicas": 1,
            "max_replicas": 2,
            "look_back_period_s": 0.2,
            "downscale_delay_s": 0,
            "upscale_delay_s": 0,
        },
        # We will send over a lot of queries. This will make sure replicas are
        # killed quickly during cleanup.
        _graceful_shutdown_timeout_s=1,
        max_concurrent_queries=1000,
        version="v1",
    )
    class A:
        def __call__(self):
            ray.get(signal.wait.remote())

    A.deploy()

    controller = serve_instance._controller
    start_time = get_deployment_start_time(controller, A)

    handle = A.get_handle()
    [handle.remote() for _ in range(100)]

    wait_for_condition(lambda: get_num_running_replicas(controller, A) >= 2)

    signal.send.remote()

    # As the queue is drained, we should scale back down.
    wait_for_condition(lambda: get_num_running_replicas(controller, A) <= 1)

    # Make sure start time did not change for the deployment
    assert get_deployment_start_time(controller, A) == start_time
def test_job_gc(call_ray_start):
    address = call_ray_start

    ray.init(address=address)

    driver = """
import ray

ray.init(address="{}")

@ray.remote
class Actor:
    def __init__(self):
        pass

_ = Actor.remote()
""".format(address)

    p = run_string_as_driver_nonblocking(driver)

    # Wait for actor to be created
    wait_for_num_actors(1)

    actor_table = ray.state.actors()
    assert len(actor_table) == 1

    job_table = ray.state.jobs()
    assert len(job_table) == 2  # dash

    # Kill the driver process.
    p.kill()
    p.wait()

    def actor_finish():
        actor_table = ray.state.actors()
        if len(actor_table) == 0:
            return True
        else:
            return False

    wait_for_condition(actor_finish)
def test_get_node_info_after_raylet_died(ray_start_cluster_head):
    cluster = ray_start_cluster_head

    def get_node_info():
        return ray._private.services.get_node_to_connect_for_driver(
            cluster.redis_address,
            cluster.gcs_address,
            cluster.head_node.node_ip_address,
            redis_password=cluster.redis_password,
        )

    assert get_node_info().raylet_socket_name == cluster.head_node.raylet_socket_name

    cluster.head_node.kill_raylet()
    wait_for_condition(
        lambda: not cluster.global_state.node_table()[0]["Alive"], timeout=30
    )
    with pytest.raises(RuntimeError):
        get_node_info()

    node2 = cluster.add_node()
    assert get_node_info().raylet_socket_name == node2.raylet_socket_name
def test_plugin_hang(ray_start_regular):
    env_key = MyPluginForHang.env_key

    @ray.remote(num_cpus=0.1)
    def f():
        return os.environ[env_key]

    refs = [
        f.options(
            # Avoid hitting the cache of runtime_env
            runtime_env={
                "plugins": {MY_PLUGIN_FOR_HANG_CLASS_PATH: {"name": "f1"}}
            }
        ).remote(),
        f.options(
            runtime_env={
                "plugins": {MY_PLUGIN_FOR_HANG_CLASS_PATH: {"name": "f2"}}
            }
        ).remote(),
    ]

    def condition():
        for ref in refs:
            try:
                res = ray.get(ref, timeout=1)
                print("result:", res)
                assert int(res) == 2
                return True
            except Exception as error:
                print(f"Got error: {error}")
                pass
        return False

    wait_for_condition(condition, timeout=60)
def _test_task_and_actor(capsys):
    @ray.remote
    def f():
        pass

    with pytest.raises(RuntimeEnvSetupError):
        ray.get(f.options(runtime_env={"pip": ["requests"]}).remote())

    def stderr_checker():
        captured = capsys.readouterr()
        return "ray[default]" in captured.err

    wait_for_condition(stderr_checker)

    @ray.remote
    class A:
        def task(self):
            pass

    A.options(runtime_env={"pip": ["requests"]}).remote()

    wait_for_condition(stderr_checker)
def test_subprocess_exception(self, job_manager):
    """
    Run a python script that raises an exception and ensure that:
    1) The job status is marked as failed.
    2) The job manager surfaces the exception message back to the logs API.
    3) No job supervisor actor is left hanging.
    4) The logs are empty.
    """
    run_cmd = f"python {_driver_script_path('script_with_exception.py')}"
    job_id = job_manager.submit_job(entrypoint=run_cmd)

    def cleaned_up():
        status = job_manager.get_job_status(job_id)
        if status.status != JobStatus.FAILED:
            return False
        if "Exception: Script failed with exception !" not in status.message:
            return False

        return job_manager._get_actor_for_job(job_id) is None

    wait_for_condition(cleaned_up)
def test_pass_metadata(self, job_manager):
    def dict_to_binary(d):
        return str(dict(sorted(d.items()))).encode("utf-8")

    print_metadata_cmd = (
        "python -c\""
        "import ray;"
        "ray.init();"
        "job_config=ray.worker.global_worker.core_worker.get_job_config();"
        "print(dict(sorted(job_config.metadata.items())))"
        "\""
    )

    # Check that we default to only the job ID.
    job_id = job_manager.submit_job(print_metadata_cmd)

    wait_for_condition(check_job_succeeded, job_manager=job_manager, job_id=job_id)
    assert job_manager.get_job_stdout(job_id) == dict_to_binary(
        {JOB_ID_METADATA_KEY: job_id}
    )

    # Check that we can pass custom metadata.
    job_id = job_manager.submit_job(
        print_metadata_cmd, metadata={"key1": "val1", "key2": "val2"}
    )

    wait_for_condition(check_job_succeeded, job_manager=job_manager, job_id=job_id)
    assert job_manager.get_job_stdout(job_id) == dict_to_binary(
        {JOB_ID_METADATA_KEY: job_id, "key1": "val1", "key2": "val2"}
    )
def test_updating_status_message(lower_slow_startup_threshold_and_reset):
    """Check that the status message reports when a Serve deployment is slow
    to be scheduled."""

    client = lower_slow_startup_threshold_and_reset

    @serve.deployment(
        num_replicas=5,
        ray_actor_options={"num_cpus": 1},
    )
    def f(*args):
        pass

    f.deploy(_blocking=False)

    def updating_message():
        deployment_status = client.get_serve_status().deployment_statuses[0]
        message_substring = "more than 1s to be scheduled."
        return (deployment_status.status == "UPDATING") and (
            message_substring in deployment_status.message
        )

    wait_for_condition(updating_message, timeout=20)
def test_user_logs(serve_instance):
    logger = logging.getLogger("ray.serve")
    msg = "user log message"
    name = "user_fn"

    @serve.deployment(name=name)
    def fn(*args):
        logger.info("user log message")
        return serve.get_replica_context().replica_tag

    fn.deploy()
    handle = fn.get_handle()

    f = io.StringIO()
    with redirect_stderr(f):

        def check_log(replica_tag: str):
            s = f.getvalue()
            return all([name in s, replica_tag in s, msg in s])

        replica_tag = ray.get(handle.remote())
        wait_for_condition(check_log, replica_tag=replica_tag)
def test_heartbeat_ip(shutdown_only):
    cluster = ray.init(
        num_cpus=1,
        _system_config={
            "report_worker_backlog": True,
        },
    )
    global_state_accessor = GlobalStateAccessor(
        cluster["redis_address"], ray.ray_constants.REDIS_DEFAULT_PASSWORD
    )
    global_state_accessor.connect()

    self_ip = ray.util.get_node_ip_address()

    def self_ip_is_set():
        message = global_state_accessor.get_all_resource_usage()
        if message is None:
            return False

        resource_usage = gcs_utils.ResourceUsageBatchData.FromString(message)
        resources_data = resource_usage.batch[0]
        return resources_data.node_manager_address == self_ip

    wait_for_condition(self_ip_is_set, timeout=2)
    global_state_accessor.disconnect()
def test_no_user_defined_method(serve_instance, use_class):
    """Check the default behavior when an actor crashes."""
    if use_class:

        @serve.deployment
        class A:
            def __call__(self, *args):
                return ray.get_runtime_context().current_actor

    else:

        @serve.deployment
        def A(*args):
            return ray.get_runtime_context().current_actor

    h = serve.run(A.bind())
    actor = ray.get(h.remote())
    ray.kill(actor)

    # This would time out if we waited for multiple health check failures.
    wait_for_condition(check_new_actor_started, handle=h, original_actors=actor)
def test_actor_scheduling_not_block_with_placement_group(ray_start_cluster):
    """Tests that scheduling a large number of actors is not blocked
    when placement groups are used.

    For more detailed information please refer to:
    https://github.com/ray-project/ray/issues/15801.
    """
    cluster = ray_start_cluster
    cluster.add_node(num_cpus=1)
    ray.init(address=cluster.address)

    @ray.remote(num_cpus=1)
    class A:
        def ready(self):
            pass

    actor_num = 1000
    pgs = [ray.util.placement_group([{"CPU": 1}]) for _ in range(actor_num)]
    actors = [A.options(placement_group=pg).remote() for pg in pgs]
    refs = [actor.ready.remote() for actor in actors]

    expected_created_num = 1

    def is_actor_created_number_correct():
        ready, not_ready = ray.wait(refs, num_returns=len(refs), timeout=1)
        return len(ready) == expected_created_num

    def is_pg_created_number_correct():
        created_pgs = [
            pg
            for _, pg in ray.util.placement_group_table().items()
            if pg["state"] == "CREATED"
        ]
        return len(created_pgs) == expected_created_num

    wait_for_condition(is_pg_created_number_correct, timeout=3)
    wait_for_condition(
        is_actor_created_number_correct, timeout=30, retry_interval_ms=0
    )

    # NOTE: we don't need to test that all the actors are created successfully.
    for _ in range(20):
        expected_created_num += 1
        cluster.add_node(num_cpus=1)

        wait_for_condition(is_pg_created_number_correct, timeout=10)
        # Make sure the node add event causes a waiting actor
        # to be created successfully in time.
        wait_for_condition(
            is_actor_created_number_correct, timeout=30, retry_interval_ms=0
        )
def test_stop_job_in_pending(self, job_manager):
    """
    Kick off a job that is in the PENDING state, stop the job, and ensure:
    1) The job can be stopped immediately with the correct JobStatus.
    2) No dangling subprocess is left behind.
    """
    start_signal_actor = SignalActor.remote()

    with tempfile.TemporaryDirectory() as tmp_dir:
        pid_file, _, job_id = _run_hanging_command(
            job_manager, tmp_dir, start_signal_actor=start_signal_actor
        )
        assert not os.path.exists(pid_file), (
            "driver subprocess should NOT be running while job is still PENDING."
        )

        assert job_manager.stop_job(job_id) is True
        # Send run signal to unblock run function
        ray.get(start_signal_actor.send.remote())
        wait_for_condition(
            check_job_stopped, job_manager=job_manager, job_id=job_id
        )
def test_scaledown_shared_objects(shutdown_only):
    cluster = AutoscalingCluster(
        head_resources={"CPU": 0},
        worker_node_types={
            "cpu_node": {
                "resources": {
                    "CPU": 1,
                    "object_store_memory": 100 * 1024 * 1024,
                },
                "node_config": {},
                "min_workers": 0,
                "max_workers": 5,
            },
        },
        idle_timeout_minutes=0.05,
    )

    try:
        cluster.start(_system_config={"scheduler_report_pinned_bytes_only": True})
        ray.init("auto")

        actors = [Actor.remote() for _ in range(5)]
        ray.get([a.f.remote() for a in actors])
        print("All five nodes launched")

        # Verify scale-up.
        wait_for_condition(lambda: ray.cluster_resources().get("CPU", 0) == 5)

        data = actors[0].create.remote(1024 * 1024 * 5)
        ray.get([a.recv.remote(data) for a in actors])
        print("Data broadcast successfully, deleting actors.")
        del actors

        # Verify scale-down.
        wait_for_condition(
            lambda: ray.cluster_resources().get("CPU", 0) == 1, timeout=30
        )
    finally:
        cluster.shutdown()
def test_kill_job_actor_in_pending(self, job_manager):
    """
    Kick off a job that is in the PENDING state, kill the job actor, and ensure:
    1) The job can be stopped immediately with the correct JobStatus.
    2) No dangling subprocess is left behind.
    """
    start_signal_actor = SignalActor.remote()

    with tempfile.TemporaryDirectory() as tmp_dir:
        pid_file, _, job_id = _run_hanging_command(
            job_manager, tmp_dir, start_signal_actor=start_signal_actor
        )

        assert not os.path.exists(pid_file), (
            "driver subprocess should NOT be running while job is still PENDING."
        )

        actor = job_manager._get_actor_for_job(job_id)
        ray.kill(actor, no_restart=True)
        wait_for_condition(check_job_failed, job_manager=job_manager, job_id=job_id)
def test_job_level_gc(
    self, runtime_env_disable_URI_cache, start_cluster, field, spec_format, tmp_path
):
    """Tests that job-level conda env is GC'd when the job exits."""
    # We must use a single-node cluster. If we simulate a multi-node
    # cluster then the conda installs will proceed simultaneously, one on
    # each node, but since they're actually running on the same machine we
    # get errors.
    cluster, address = start_cluster

    ray.init(
        address, runtime_env=generate_runtime_env_dict(field, spec_format, tmp_path)
    )

    @ray.remote
    def f():
        import pip_install_test  # noqa: F401

        return True

    # Ensure that the runtime env has been installed.
    assert ray.get(f.remote())

    # Sleep some seconds before checking that we didn't GC. Otherwise this
    # check may spuriously pass.
    time.sleep(2)
    assert not check_local_files_gced(cluster)

    ray.shutdown()

    wait_for_condition(lambda: check_local_files_gced(cluster), timeout=30)

    # Check that we can reconnect with the same env. (In other words, ensure
    # the conda env was fully deleted and not left in some kind of corrupted
    # state that prevents reinstalling the same conda env.)
    ray.init(
        address, runtime_env=generate_runtime_env_dict(field, spec_format, tmp_path)
    )

    assert ray.get(f.remote())
def test_stop_job_subprocess_cleanup_upon_stop(self, job_manager):
    """
    Ensure that a driver script's subprocesses are cleaned up properly when
    we stop a running job: SIGTERM first, SIGKILL after 3 seconds.
    """
    with tempfile.TemporaryDirectory() as tmp_dir:
        pid_file, _, job_id = _run_hanging_command(job_manager, tmp_dir)
        with open(pid_file, "r") as file:
            pid = int(file.read())
            assert psutil.pid_exists(pid), "driver subprocess should be running"

        assert job_manager.stop_job(job_id) is True
        wait_for_condition(check_job_stopped, job_manager=job_manager, job_id=job_id)

        # Ensure the driver subprocess gets cleaned up after the job reaches a
        # terminal state.
        wait_for_condition(check_subprocess_cleaned, pid=pid)
def test_delete_actor(ray_start_regular):
    with ray_start_client_server_pair() as pair:
        ray, server = pair

        @ray.remote
        class Accumulator:
            def __init__(self):
                self.acc = 0

            def inc(self):
                self.acc += 1

        actor = Accumulator.remote()
        actor.inc.remote()
        actor2 = Accumulator.remote()
        actor2.inc.remote()

        assert server_actor_ref_count(server, 2)()

        del actor

        wait_for_condition(server_actor_ref_count(server, 1), timeout=5)
async def test_failed_job(self, job_manager):
    """Test tailing logs for a job that unexpectedly exits."""
    with tempfile.TemporaryDirectory() as tmp_dir:
        pid_file, _, job_id = _run_hanging_command(job_manager, tmp_dir)

        await self._tail_and_assert_logs(
            job_id, job_manager, expected_log="Waiting...", num_iteration=5
        )

        # Kill the job unexpectedly.
        with open(pid_file, "r") as f:
            os.kill(int(f.read()), signal.SIGKILL)

        async for lines in job_manager.tail_job_logs(job_id):
            assert all(s == "Waiting..." for s in lines.strip().split("\n"))
            print(lines, end="")

        wait_for_condition(check_job_failed, job_manager=job_manager, job_id=job_id)
def test_node_physical_stats(enable_test_module, shutdown_only):
    addresses = ray.init(include_dashboard=True, num_cpus=6)

    @ray.remote(num_cpus=1)
    class Actor:
        def getpid(self):
            return os.getpid()

    actors = [Actor.remote() for _ in range(6)]
    actor_pids = ray.get([actor.getpid.remote() for actor in actors])
    actor_pids = set(actor_pids)

    webui_url = addresses["webui_url"]
    assert wait_until_server_available(webui_url) is True
    webui_url = format_web_url(webui_url)

    def _check_workers():
        try:
            resp = requests.get(webui_url + "/test/dump?key=node_physical_stats")
            resp.raise_for_status()
            result = resp.json()
            assert result["result"] is True
            node_physical_stats = result["data"]["nodePhysicalStats"]
            assert len(node_physical_stats) == 1
            current_stats = node_physical_stats[addresses["node_id"]]
            # Check Actor workers
            current_actor_pids = set()
            for worker in current_stats["workers"]:
                if "ray::Actor" in worker["cmdline"][0]:
                    current_actor_pids.add(worker["pid"])
            assert current_actor_pids == actor_pids
            # Check raylet cmdline
            assert "raylet" in current_stats["cmdline"][0]
            return True
        except Exception as ex:
            logger.info(ex)
            return False

    wait_for_condition(_check_workers, timeout=10)
def test_pass_returned_object_ref(one_worker_100MiB, use_ray_put, failure):
    @ray.remote
    def return_an_id():
        return [put_object(np.zeros(20 * 1024 * 1024, dtype=np.uint8), use_ray_put)]

    # TODO(edoakes): this fails with an ActorError with max_retries=1.
    @ray.remote(max_retries=0)
    def pending(ref, signal):
        ray.get(signal.wait.remote())
        ray.get(ref[0])
        if failure:
            os._exit(0)

    signal = SignalActor.remote()
    outer_oid = return_an_id.remote()
    inner_oid_binary = ray.get(outer_oid)[0].binary()
    pending_oid = pending.remote([outer_oid], signal)

    # Remove the local reference to the returned ID.
    del outer_oid

    # Check that the inner ID is pinned by the remote task ID and finishing
    # the task unpins the object.
    ray.get(signal.send.remote())
    try:
        # Should succeed because inner_oid is pinned if no failure.
        ray.get(pending_oid)
        assert not failure
    except ray.exceptions.WorkerCrashedError:
        assert failure

    def ref_not_exists():
        worker = ray.worker.global_worker
        inner_oid = ray.ObjectRef(inner_oid_binary)
        return not worker.core_worker.object_exists(inner_oid)

    wait_for_condition(ref_not_exists)
def test_stop_long_running_job(job_sdk_client):
    """
    Submit a job that runs for a while and stop it in the middle.
    """
    client = job_sdk_client

    with tempfile.TemporaryDirectory() as tmp_dir:
        path = Path(tmp_dir)
        driver_script = """
print('Hello !')
import time
time.sleep(300)  # This should never finish
raise RuntimeError('Intentionally failed.')
"""
        test_script_file = path / "test_script.py"
        with open(test_script_file, "w+") as file:
            file.write(driver_script)

        job_id = client.submit_job(
            entrypoint="python test_script.py", runtime_env={"working_dir": tmp_dir}
        )
        assert client.stop_job(job_id) is True
        wait_for_condition(_check_job_stopped, client=client, job_id=job_id)
def test_usage_file_error_message(monkeypatch, ray_start_cluster, reset_lib_usage):
    """
    Make sure the usage report file is generated with a proper error message
    when the report fails.
    """
    with monkeypatch.context() as m:
        m.setenv("RAY_USAGE_STATS_ENABLED", "1")
        m.setenv("RAY_USAGE_STATS_REPORT_URL", "http://127.0.0.1:8000")
        m.setenv("RAY_USAGE_STATS_REPORT_INTERVAL_S", "1")
        cluster = ray_start_cluster
        cluster.add_node(num_cpus=0)
        ray.init(address=cluster.address)

        global_node = ray._private.worker._global_node
        temp_dir = pathlib.Path(global_node.get_session_dir_path())
        try:
            wait_for_condition(lambda: file_exists(temp_dir), timeout=30)
        except Exception:
            print_dashboard_log()
            raise

        error_message = read_file(temp_dir, "error")
        failure_old = read_file(temp_dir, "usage_stats")["total_failed"]
        report_success = read_file(temp_dir, "success")
        # Test that the timestamp has been updated.
        assert (
            "HTTPConnectionPool(host='127.0.0.1', port=8000): "
            "Max retries exceeded with url:"
        ) in error_message
        assert not report_success
        try:
            wait_for_condition(
                lambda: failure_old
                < read_file(temp_dir, "usage_stats")["total_failed"]
            )
        except Exception:
            print_dashboard_log()
            read_file(temp_dir, "usage_stats")["total_failed"]
            raise
        assert read_file(temp_dir, "usage_stats")["total_success"] == 0
def test_replica_spread(ray_cluster):
    cluster = ray_cluster
    cluster.add_node(num_cpus=2)

    # NOTE(edoakes): we need to start serve before adding the worker node to
    # guarantee that the controller is placed on the head node (we should be
    # able to tolerate being placed on workers, but there's currently a bug).
    # We should add an explicit test for that in the future when it's fixed.
    cluster.connect(namespace=SERVE_NAMESPACE)
    serve.start(detached=True)

    worker_node = cluster.add_node(num_cpus=2)

    @serve.deployment(num_replicas=2)
    def get_node_id():
        return os.getpid(), ray.get_runtime_context().node_id.hex()

    h = serve.run(get_node_id.bind())

    def get_num_nodes():
        pids = set()
        node_ids = set()
        while len(pids) < 2:
            pid, node = ray.get(h.remote())
            pids.add(pid)
            node_ids.add(node)

        return len(node_ids)

    # Check that the two replicas are spread across the two nodes.
    wait_for_condition(lambda: get_num_nodes() == 2)

    # Kill the worker node. The second replica should get rescheduled on
    # the head node.
    cluster.remove_node(worker_node)

    # Check that the replica on the dead node can be rescheduled.
    wait_for_condition(lambda: get_num_nodes() == 1)
def test_delete_objects_delete_while_creating(object_spilling_config, shutdown_only):
    # Limit our object store to 75 MiB of memory.
    object_spilling_config, temp_folder = object_spilling_config
    address = ray.init(
        object_store_memory=75 * 1024 * 1024,
        _system_config={
            "max_io_workers": 4,
            "min_spilling_size": 0,
            "automatic_object_spilling_enabled": True,
            "object_store_full_delay_ms": 100,
            "object_spilling_config": object_spilling_config,
        },
    )
    arr = np.random.rand(1024 * 1024)  # 8 MB data
    replay_buffer = []

    for _ in range(80):
        ref = None
        while ref is None:
            ref = ray.put(arr)
            replay_buffer.append(ref)
        # Remove an entry from the replay buffer with 60% probability.
        if random.randint(0, 9) < 6:
            replay_buffer.pop()

    # Do random sampling.
    for _ in range(200):
        ref = random.choice(replay_buffer)
        sample = ray.get(ref, timeout=0)
        assert np.array_equal(sample, arr)

    # Finally, make sure all objects are deleted without a race condition.
    del replay_buffer
    del ref
    wait_for_condition(lambda: is_dir_empty(temp_folder))
    assert_no_thrashing(address["address"])
def test_cached_object(ray_start_cluster):
    config = {
        "num_heartbeats_timeout": 10,
        "raylet_heartbeat_period_milliseconds": 100,
        "object_timeout_milliseconds": 200,
    }
    cluster = ray_start_cluster
    # Head node with no resources.
    cluster.add_node(num_cpus=0, _system_config=config)
    ray.init(address=cluster.address)
    # Node to place the initial object.
    node_to_kill = cluster.add_node(
        num_cpus=1, resources={"node1": 1}, object_store_memory=10**8
    )
    cluster.add_node(num_cpus=1, resources={"node2": 1}, object_store_memory=10**8)
    cluster.wait_for_nodes()

    @ray.remote
    def large_object():
        return np.zeros(10**7, dtype=np.uint8)

    @ray.remote
    def dependent_task(x):
        return

    obj = large_object.options(resources={"node1": 1}).remote()
    ray.get(dependent_task.options(resources={"node2": 1}).remote(obj))

    cluster.remove_node(node_to_kill, allow_graceful=False)
    cluster.add_node(
        num_cpus=1, resources={"node1": 1}, object_store_memory=10**8
    )
    wait_for_condition(
        lambda: not all(node["Alive"] for node in ray.nodes()), timeout=10
    )

    for _ in range(20):
        large_object.options(resources={"node2": 1}).remote()
        ray.get(dependent_task.remote(obj))
def test_idempotence_after_controller_death(ray_start_stop, use_command: bool):
    """Check that the CLI is idempotent even if the controller dies."""
    config_file_name = os.path.join(
        os.path.dirname(__file__), "test_config_files", "basic_graph.yaml"
    )
    success_message_fragment = b"Sent deploy request successfully!"
    deploy_response = subprocess.check_output(["serve", "deploy", config_file_name])
    assert success_message_fragment in deploy_response

    ray.init(address="auto", namespace=SERVE_NAMESPACE)
    serve.start(detached=True)
    wait_for_condition(
        lambda: len(ray.util.list_named_actors(all_namespaces=True)) == 4,
        timeout=15,
    )

    # Kill controller
    if use_command:
        subprocess.check_output(["serve", "shutdown", "-y"])
    else:
        serve.shutdown()

    status_response = subprocess.check_output(["serve", "status"])
    status_info = yaml.safe_load(status_response)

    assert len(status_info["deployment_statuses"]) == 0

    deploy_response = subprocess.check_output(["serve", "deploy", config_file_name])
    assert success_message_fragment in deploy_response

    # Restore testing controller
    serve.start(detached=True)
    wait_for_condition(
        lambda: len(ray.util.list_named_actors(all_namespaces=True)) == 4,
        timeout=15,
    )
    serve.shutdown()
    ray.shutdown()
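# All of the tests above rely on a `wait_for_condition(condition, timeout=...,
# retry_interval_ms=..., **kwargs)` helper that repeatedly polls a predicate
# until it returns True or the timeout expires, forwarding any extra keyword
# arguments to the predicate. The sketch below is a minimal, self-contained
# approximation of that polling pattern, NOT Ray's actual implementation in
# ray._private.test_utils; the exact defaults, error handling, and error
# message here are assumptions for illustration only.
import time


def _wait_for_condition_sketch(condition, timeout=10, retry_interval_ms=100, **kwargs):
    """Poll `condition(**kwargs)` until it returns True or `timeout` seconds pass."""
    deadline = time.monotonic() + timeout
    last_error = None
    while time.monotonic() < deadline:
        try:
            if condition(**kwargs):
                return  # Condition met.
        except Exception as e:  # Tolerate transient failures while polling.
            last_error = e
        time.sleep(retry_interval_ms / 1000.0)
    raise RuntimeError(f"Condition was not met within {timeout}s: {last_error}")


# Example usage mirroring the calls in the tests above:
#     wait_for_condition(check_job_stopped, job_manager=job_manager, job_id=job_id)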