# Common imports assumed by the test excerpts in this section. Fixtures
# (one_worker_100MiB, ray_start_regular, pool_4_processes, ...) and helpers
# (SignalActor, put_object, _fill_object_store_and_get, check_refcounts,
# wait_for_condition, the Serve internals, the metrics API, ...) come from
# the Ray test suite and the Ray version each excerpt was written against.
import asyncio
import os
import random
import time
from collections import defaultdict
from signal import SIGKILL

import numpy as np
import pytest

import ray


def test_pending_task_dependency_pinning(one_worker_100MiB):
    @ray.remote
    def pending(input1, input2):
        return

    # The object that is ray.put here will go out of scope immediately, so if
    # pending task dependencies aren't considered, it will be evicted before
    # the ray.get below due to the subsequent ray.puts that fill up the object
    # store.
    np_array = np.zeros(40 * 1024 * 1024, dtype=np.uint8)
    signal = SignalActor.remote()
    obj_ref = pending.remote(np_array, signal.wait.remote())

    for _ in range(2):
        ray.put(np.zeros(40 * 1024 * 1024, dtype=np.uint8))

    ray.get(signal.send.remote())
    ray.get(obj_ref)
def test_async_callback(ray_start_regular_shared):
    global_set = set()

    ref = ray.put(None)
    ref._on_completed(lambda _: global_set.add("completed-1"))
    wait_for_condition(lambda: "completed-1" in global_set)

    signal = SignalActor.remote()

    @ray.remote
    def wait():
        ray.get(signal.wait.remote())

    ref = wait.remote()
    ref._on_completed(lambda _: global_set.add("completed-2"))
    assert "completed-2" not in global_set
    signal.send.remote()
    wait_for_condition(lambda: "completed-2" in global_set)
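# Every test in this section coordinates workers through a SignalActor helper
# that is imported from Ray's test utilities rather than defined here. Below
# is a minimal sketch of what it is assumed to look like, mirroring the
# inline Signal actors that a few tests further down define themselves:
# send() sets an asyncio.Event, and wait() blocks the caller until it is set.
@ray.remote(num_cpus=0)
class SignalActor:
    def __init__(self):
        self.ready_event = asyncio.Event()

    def send(self, clear=False):
        # Wake up every caller blocked in wait(). clear=True re-arms the
        # event so later wait() calls block again; the router tests below use
        # send(clear=True) to release one query at a time.
        self.ready_event.set()
        if clear:
            self.ready_event.clear()

    async def wait(self):
        await self.ready_event.wait()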
def test_recursively_nest_ids(one_worker_100MiB, use_ray_put, failure):
    @ray.remote(max_retries=1)
    def recursive(ref, signal, max_depth, depth=0):
        unwrapped = ray.get(ref[0])
        if depth == max_depth:
            ray.get(signal.wait.remote())
            if failure:
                os._exit(0)
            return
        else:
            return recursive.remote(unwrapped, signal, max_depth, depth + 1)

    signal = SignalActor.remote()

    max_depth = 5
    array_oid = put_object(
        np.zeros(20 * 1024 * 1024, dtype=np.uint8), use_ray_put)
    nested_oid = array_oid
    for _ in range(max_depth):
        nested_oid = ray.put([nested_oid])
    head_oid = recursive.remote([nested_oid], signal, max_depth)

    # Remove the local reference.
    array_oid_bytes = array_oid.binary()
    del array_oid, nested_oid

    tail_oid = head_oid
    for _ in range(max_depth):
        tail_oid = ray.get(tail_oid)

    # Check that the remote reference pins the object.
    _fill_object_store_and_get(array_oid_bytes)

    # Fulfill the dependency, causing the tail task to finish.
    ray.get(signal.send.remote())

    try:
        ray.get(tail_oid)
        assert not failure
    # TODO(edoakes): this should raise WorkerError.
    except ray.exceptions.ObjectLostError:
        assert failure

    # Reference should be gone, check that array gets evicted.
    _fill_object_store_and_get(array_oid_bytes, succeed=False)
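# put_object and _fill_object_store_and_get are reference-counting helpers
# that the surrounding tests use but this section never defines. A hedged
# sketch of their assumed behavior: put_object pins an object either via
# ray.put or as a task return value (selected by the use_ray_put test
# parameter), and _fill_object_store_and_get churns enough new objects
# through the 100MiB store to force eviction before checking whether the
# object under test survived.
@ray.remote
def _return_arg(obj):
    # Hypothetical helper task: pins obj as a task return value.
    return obj


def put_object(obj, use_ray_put):
    if use_ray_put:
        return ray.put(obj)
    else:
        return _return_arg.remote(obj)


def _fill_object_store_and_get(obj, succeed=True, object_MiB=20,
                               num_objects=5):
    for _ in range(num_objects):
        ray.put(np.zeros(object_MiB * 1024 * 1024, dtype=np.uint8))
    if isinstance(obj, bytes):
        obj = ray.ObjectID(obj)
    if succeed:
        # The object should still be retrievable after the churn.
        ray.get(obj)
    else:
        # An evicted/unpinned object should no longer come back in time.
        with pytest.raises(ray.exceptions.GetTimeoutError):
            ray.get(obj, timeout=0.5)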
def test_pass_returned_object_id(one_worker_100MiB, use_ray_put):
    @ray.remote(num_cpus=0)
    class Signal:
        def __init__(self):
            self.ready_event = asyncio.Event()

        def send(self):
            self.ready_event.set()

        async def wait(self):
            await self.ready_event.wait()

    @ray.remote
    def put():
        return

    @ray.remote
    def return_an_id():
        return [
            put_object(
                np.zeros(40 * 1024 * 1024, dtype=np.uint8), use_ray_put)
        ]

    @ray.remote
    def pending(ref):
        ray.get(ref[0])
        return ref[0]

    signal = SignalActor.remote()
    outer_oid = return_an_id.remote()
    pending_oid = pending.remote([outer_oid])

    # Remove the local reference to the returned ID.
    del outer_oid

    # Check that the inner ID is pinned by the remote task ID.
    _fill_object_store_and_get(pending_oid, succeed=False)
    ray.get(signal.send.remote())

    inner_oid = ray.get(pending_oid)
    inner_oid_binary = inner_oid.binary()
    _fill_object_store_and_get(inner_oid_binary)

    del pending_oid
    del inner_oid
    _fill_object_store_and_get(inner_oid_binary, succeed=False)
def test_recursive_serialized_reference(one_worker_100MiB, use_ray_put):
    @ray.remote(num_cpus=0)
    class Signal:
        def __init__(self):
            self.ready_event = asyncio.Event()

        def send(self):
            self.ready_event.set()

        async def wait(self):
            await self.ready_event.wait()

    @ray.remote
    def recursive(ref, signal, max_depth, depth=0):
        ray.get(ref[0])
        if depth == max_depth:
            return ray.get(signal.wait.remote())
        else:
            return recursive.remote(ref, signal, max_depth, depth + 1)

    signal = SignalActor.remote()

    max_depth = 5
    array_oid = put_object(
        np.zeros(40 * 1024 * 1024, dtype=np.uint8), use_ray_put)
    head_oid = recursive.remote([array_oid], signal, max_depth)

    # Remove the local reference.
    array_oid_bytes = array_oid.binary()
    del array_oid

    tail_oid = head_oid
    for _ in range(max_depth):
        tail_oid = ray.get(tail_oid)

    # Check that the remote reference pins the object.
    _fill_object_store_and_get(array_oid_bytes)

    # Fulfill the dependency, causing the tail task to finish.
    ray.get(signal.send.remote())
    assert ray.get(tail_oid) is None

    # Reference should be gone, check that array gets evicted.
    _fill_object_store_and_get(array_oid_bytes, succeed=False)
def test_dying_worker_get(ray_start_2_cpus):
    @ray.remote
    def sleep_forever(signal):
        ray.get(signal.send.remote())
        time.sleep(10**6)

    @ray.remote
    def get_worker_pid():
        return os.getpid()

    signal = SignalActor.remote()
    x_id = sleep_forever.remote(signal)
    ray.get(signal.wait.remote())
    # Get the PID of the other worker.
    worker_pid = ray.get(get_worker_pid.remote())

    @ray.remote
    def f(id_in_a_list):
        ray.get(id_in_a_list[0])

    # Have the worker wait in a get call.
    result_id = f.remote([x_id])
    time.sleep(1)

    # Make sure the task hasn't finished.
    ready_ids, _ = ray.wait([result_id], timeout=0)
    assert len(ready_ids) == 0

    # Kill the worker.
    os.kill(worker_pid, SIGKILL)
    time.sleep(0.1)

    # Make sure the sleep task hasn't finished.
    ready_ids, _ = ray.wait([x_id], timeout=0)
    assert len(ready_ids) == 0

    # Seal the object so the store attempts to notify the worker that the
    # get has been fulfilled.
    obj = np.ones(200 * 1024, dtype=np.uint8)
    ray.worker.global_worker.put_object(obj, x_id)
    time.sleep(0.1)

    # Make sure that nothing has died.
    assert ray._private.services.remaining_processes_alive()
def _setup_cluster_for_test(ray_start_cluster):
    NUM_NODES = 2
    cluster = ray_start_cluster
    # Add a head node.
    cluster.add_node(_system_config={"metrics_report_interval_ms": 1000})
    # Add worker nodes.
    [cluster.add_node() for _ in range(NUM_NODES - 1)]
    cluster.wait_for_nodes()
    ray.init(address=cluster.address)

    worker_should_exit = SignalActor.remote()

    # Generate some metrics from actor & tasks.
    @ray.remote
    def f():
        counter = Count("test_counter", description="desc")
        counter.record(1)
        ray.get(worker_should_exit.wait.remote())

    @ray.remote
    class A:
        async def ping(self):
            histogram = Histogram(
                "test_histogram", description="desc", boundaries=[0.1, 1.6])
            histogram.record(1.5)
            ray.get(worker_should_exit.wait.remote())

    a = A.remote()
    obj_refs = [f.remote(), a.ping.remote()]

    node_info_list = ray.nodes()
    prom_addresses = []
    for node_info in node_info_list:
        metrics_export_port = node_info["MetricsExportPort"]
        addr = node_info["NodeManagerAddress"]
        prom_addresses.append(f"{addr}:{metrics_export_port}")

    yield prom_addresses

    ray.get(worker_should_exit.send.remote())
    ray.get(obj_refs)
    ray.shutdown()
    cluster.shutdown()
def test_map_async(pool_4_processes):
    def f(args):
        index, signal = args
        ray.get(signal.wait.remote())
        return index, os.getpid()

    signal = SignalActor.remote()
    async_result = pool_4_processes.map_async(
        f, [(i, signal) for i in range(1000)])
    assert not async_result.ready()
    with pytest.raises(TimeoutError):
        async_result.get(timeout=0.01)
    async_result.wait(timeout=0.01)

    # Send the signal to finish the tasks.
    ray.get(signal.send.remote())
    async_result.wait(timeout=10)
    assert async_result.ready()
    assert async_result.successful()

    results = async_result.get()
    assert len(results) == 1000

    pid_counts = defaultdict(int)
    for i, (index, pid) in enumerate(results):
        assert i == index
        pid_counts[pid] += 1

    # Check that the functions are spread somewhat evenly.
    for count in pid_counts.values():
        assert count > 100

    def bad_func(index):
        if index == 50:
            raise Exception("test_map_async failure")

    async_result = pool_4_processes.map_async(bad_func, range(100))
    async_result.wait(10)
    assert async_result.ready()
    assert not async_result.successful()

    with pytest.raises(Exception, match="test_map_async failure"):
        async_result.get()
def test_close(pool_4_processes):
    def f(signal):
        ray.get(signal.wait.remote())
        return "hello"

    signal = SignalActor.remote()
    result = pool_4_processes.map_async(f, [signal for _ in range(4)])
    assert not result.ready()
    pool_4_processes.close()
    assert not result.ready()

    # Signal the head-of-line tasks to finish.
    ray.get(signal.send.remote())
    pool_4_processes.join()

    # close() shouldn't interrupt pending tasks, so check that they succeeded.
    result.wait(timeout=10)
    assert result.ready()
    assert result.successful()
    assert result.get() == ["hello"] * 4
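# pool_4_processes, used by the two pool tests above, is assumed to be a
# pytest fixture yielding a ray.util.multiprocessing.Pool with four worker
# processes, roughly as sketched here:
from ray.util.multiprocessing import Pool


@pytest.fixture
def pool_4_processes():
    pool = Pool(processes=4)
    yield pool
    # Tear down the pool and the Ray instance it started.
    pool.terminate()
    pool.join()
    ray.shutdown()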
def test_pass_returned_object_id(one_worker_100MiB, use_ray_put, failure):
    @ray.remote
    def return_an_id():
        return [
            put_object(
                np.zeros(40 * 1024 * 1024, dtype=np.uint8), use_ray_put)
        ]

    # TODO(edoakes): this fails with an ActorError with max_retries=1.
    @ray.remote(max_retries=0)
    def pending(ref, signal):
        ray.get(signal.wait.remote())
        ray.get(ref[0])
        if failure:
            os._exit(0)

    signal = SignalActor.remote()
    outer_oid = return_an_id.remote()
    inner_oid_binary = ray.get(outer_oid)[0].binary()
    pending_oid = pending.remote([outer_oid], signal)

    # Remove the local reference to the returned ID.
    del outer_oid

    # Check that the inner ID is pinned by the remote task ID and finishing
    # the task unpins the object.
    ray.get(signal.send.remote())
    try:
        # Should succeed because inner_oid is pinned if no failure.
        ray.get(pending_oid)
        assert not failure
    except ray.exceptions.RayWorkerError:
        assert failure

    def ref_not_exists():
        worker = ray.worker.global_worker
        inner_oid = ray.ObjectID(inner_oid_binary)
        return not worker.core_worker.object_exists(inner_oid)

    assert wait_for_condition(ref_not_exists)
def test_worker_holding_serialized_reference(one_worker_100MiB, use_ray_put):
    @ray.remote(num_cpus=0)
    class Signal:
        def __init__(self):
            self.ready_event = asyncio.Event()

        def send(self):
            self.ready_event.set()

        async def wait(self):
            await self.ready_event.wait()

    @ray.remote
    def child(dep1, dep2):
        return

    @ray.remote
    def launch_pending_task(ref, signal):
        return child.remote(ref[0], signal.wait.remote())

    signal = SignalActor.remote()

    # Test that the reference held by the actor isn't evicted.
    array_oid = put_object(
        np.zeros(40 * 1024 * 1024, dtype=np.uint8), use_ray_put)
    child_return_id = ray.get(launch_pending_task.remote([array_oid], signal))

    # Remove the local reference.
    array_oid_bytes = array_oid.binary()
    del array_oid

    # Test that the reference prevents the object from being evicted.
    _fill_object_store_and_get(array_oid_bytes)

    ray.get(signal.send.remote())
    ray.get(child_return_id)
    del child_return_id

    _fill_object_store_and_get(array_oid_bytes, succeed=False)
def test_recursively_pass_returned_object_id(one_worker_100MiB):
    @ray.remote
    def put():
        return np.zeros(40 * 1024 * 1024, dtype=np.uint8)

    @ray.remote
    def return_an_id():
        return [put.remote()]

    @ray.remote
    def recursive(ref, signal, max_depth, depth=0):
        ray.get(ref[0])
        if depth == max_depth:
            return ray.get(signal.wait.remote())
        else:
            return recursive.remote(ref, signal, max_depth, depth + 1)

    max_depth = 5
    outer_oid = return_an_id.remote()
    inner_oid_bytes = ray.get(outer_oid)[0].binary()
    signal = SignalActor.remote()
    head_oid = recursive.remote([outer_oid], signal, max_depth)

    # Remove the local reference.
    del outer_oid

    tail_oid = head_oid
    for _ in range(max_depth):
        tail_oid = ray.get(tail_oid)

    # Check that the remote reference pins the object.
    _fill_object_store_and_get(inner_oid_bytes)

    # Fulfill the dependency, causing the tail task to finish.
    ray.get(signal.send.remote())
    ray.get(tail_oid)

    # Reference should be gone, check that returned ID gets evicted.
    _fill_object_store_and_get(inner_oid_bytes, succeed=False)
def test_recursively_nest_ids(one_worker_100MiB):
    @ray.remote
    def recursive(ref, signal, max_depth, depth=0):
        unwrapped = ray.get(ref[0])
        if depth == max_depth:
            return ray.get(signal.wait.remote())
        else:
            return recursive.remote(unwrapped, signal, max_depth, depth + 1)

    @ray.remote
    def put():
        return np.zeros(40 * 1024 * 1024, dtype=np.uint8)

    signal = SignalActor.remote()

    max_depth = 5
    array_oid = put.remote()
    nested_oid = array_oid
    for _ in range(max_depth):
        nested_oid = ray.put([nested_oid])
    head_oid = recursive.remote([nested_oid], signal, max_depth)

    # Remove the local reference.
    array_oid_bytes = array_oid.binary()
    del array_oid, nested_oid

    tail_oid = head_oid
    for _ in range(max_depth):
        tail_oid = ray.get(tail_oid)

    # Check that the remote reference pins the object.
    _fill_object_store_and_get(array_oid_bytes)

    # Fulfill the dependency, causing the tail task to finish.
    ray.get(signal.send.remote())
    ray.get(tail_oid)

    # Reference should be gone, check that array gets evicted.
    _fill_object_store_and_get(array_oid_bytes, succeed=False)
def test_fast(shutdown_only, use_force):
    ray.init(num_cpus=2)

    @ray.remote
    def fast(y):
        return y

    signaler = SignalActor.remote()
    ids = list()
    for _ in range(100):
        x = fast.remote("a")
        # NOTE: If a non-force cancellation is attempted in the time between
        # a worker receiving a task and the worker executing it (specifically
        # the Python execution), the cancellation can fail.
        if not use_force:
            time.sleep(0.1)
        ray.cancel(x, force=use_force)
        ids.append(x)

    @ray.remote
    def wait_for(y):
        return y

    sig = signaler.wait.remote()
    for _ in range(5000):
        x = wait_for.remote(sig)
        ids.append(x)

    for idx in range(100, 5100):
        if random.random() > 0.95:
            ray.cancel(ids[idx], force=use_force)
    signaler.send.remote()

    for i, obj_ref in enumerate(ids):
        try:
            ray.get(obj_ref, timeout=120)
        except Exception as e:
            assert isinstance(
                e, valid_exceptions(use_force)), f"Failure on iteration: {i}"
def test_serve_graceful_shutdown(serve_instance):
    signal = SignalActor.remote()

    @serve.deployment(name="wait", max_concurrent_queries=10)
    class Wait:
        async def __call__(self, request):
            signal_actor = await request.body()
            await signal_actor.wait.remote()
            return ""

    Wait.config.experimental_graceful_shutdown_wait_loop_s = 0.5
    Wait.config.experimental_graceful_shutdown_timeout_s = 1000
    Wait.deploy()
    handle = Wait.get_handle()
    refs = [handle.remote(signal) for _ in range(10)]

    # Wait for all the queries to be enqueued.
    with pytest.raises(ray.exceptions.GetTimeoutError):
        ray.get(refs, timeout=1)

    @ray.remote(num_cpus=0)
    def do_blocking_delete():
        Wait.delete()

    # Now delete the backend. This should trigger the shutdown sequence.
    delete_ref = do_blocking_delete.remote()

    # The queries should be enqueued but not executed because they are
    # blocked by the signal actor.
    with pytest.raises(ray.exceptions.GetTimeoutError):
        ray.get(refs, timeout=1)

    signal.send.remote()

    # All the queries should be drained and executed without error.
    ray.get(refs)
    # Blocking delete should complete.
    ray.get(delete_ref)
def test_basic_serialized_reference(one_worker_100MiB, use_ray_put):
    @ray.remote
    def pending(ref, dep):
        ray.get(ref[0])

    array_oid = put_object(
        np.zeros(40 * 1024 * 1024, dtype=np.uint8), use_ray_put)
    signal = SignalActor.remote()
    oid = pending.remote([array_oid], signal.wait.remote())

    # Remove the local reference.
    array_oid_bytes = array_oid.binary()
    del array_oid

    # Check that the remote reference pins the object.
    _fill_object_store_and_get(array_oid_bytes)

    # Fulfill the dependency, causing the task to finish.
    ray.get(signal.send.remote())
    ray.get(oid)

    # Reference should be gone, check that array gets evicted.
    _fill_object_store_and_get(array_oid_bytes, succeed=False)
def test_remote_cancel(ray_start_regular, use_force):
    signaler = SignalActor.remote()

    @ray.remote
    def wait_for(y):
        return ray.get(y[0])

    @ray.remote
    def remote_wait(sg):
        return [wait_for.remote([sg[0]])]

    sig = signaler.wait.remote()

    outer = remote_wait.remote([sig])
    inner = ray.get(outer)[0]

    with pytest.raises(GetTimeoutError):
        ray.get(inner, timeout=1)

    ray.cancel(inner, force=use_force)

    with pytest.raises(valid_exceptions(use_force)):
        ray.get(inner, timeout=10)
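# valid_exceptions, shared by the cancellation tests above, is assumed to map
# the force flag to the exception types ray.cancel can surface: a plain
# cancellation raises TaskCancelledError (wrapped in RayTaskError when it
# propagates through dependent tasks), while a forced cancellation kills the
# worker and can also surface a worker-crash error (named WorkerCrashedError
# or RayWorkerError depending on the Ray version). One plausible definition:
def valid_exceptions(use_force):
    exceptions = (ray.exceptions.RayTaskError,
                  ray.exceptions.TaskCancelledError)
    if use_force:
        # Force-kill also crashes the executing worker.
        exceptions += (ray.exceptions.WorkerCrashedError, )
    return exceptions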
def test_ref_in_handle_input(serve_instance):
    # https://github.com/ray-project/ray/issues/12593
    unblock_worker_signal = SignalActor.remote()

    @serve.deployment
    async def blocked_by_ref(data):
        assert not isinstance(data, ray.ObjectRef)

    blocked_by_ref.deploy()
    handle = blocked_by_ref.get_handle()

    # Pass in a ref that's not ready yet.
    ref = unblock_worker_signal.wait.remote()
    worker_result = handle.remote(ref)

    # The worker shouldn't execute the request yet.
    with pytest.raises(GetTimeoutError):
        ray.get(worker_result, timeout=1)

    # Now unblock the worker.
    unblock_worker_signal.send.remote()
    ray.get(worker_result)
def test_ref_in_handle_input(serve_instance):
    # https://github.com/ray-project/ray/issues/12593
    unblock_worker_signal = SignalActor.remote()

    async def blocked_by_ref(serve_request):
        data = await serve_request.body()
        assert not isinstance(data, ray.ObjectRef)

    serve.create_backend("ref", blocked_by_ref)
    serve.create_endpoint("ref", backend="ref")
    handle = serve.get_handle("ref")

    # Pass in a ref that's not ready yet.
    ref = unblock_worker_signal.wait.remote()
    worker_result = handle.remote(ref)

    # The worker shouldn't execute the request yet.
    with pytest.raises(GetTimeoutError):
        ray.get(worker_result, timeout=1)

    # Now unblock the worker.
    unblock_worker_signal.send.remote()
    ray.get(worker_result)
def test_worker_holding_serialized_reference(one_worker_100MiB, use_ray_put,
                                             failure):
    @ray.remote(max_retries=1)
    def child(dep1, dep2):
        if failure:
            os._exit(0)
        return

    @ray.remote
    def launch_pending_task(ref, signal):
        return child.remote(ref[0], signal.wait.remote())

    signal = SignalActor.remote()

    # Test that the reference held by the actor isn't evicted.
    array_oid = put_object(
        np.zeros(40 * 1024 * 1024, dtype=np.uint8), use_ray_put)
    child_return_id = ray.get(launch_pending_task.remote([array_oid], signal))

    # Remove the local reference.
    array_oid_bytes = array_oid.binary()
    del array_oid

    # Test that the reference prevents the object from being evicted.
    _fill_object_store_and_get(array_oid_bytes)

    ray.get(signal.send.remote())
    try:
        ray.get(child_return_id)
        assert not failure
    except (ray.exceptions.RayWorkerError,
            ray.exceptions.UnreconstructableError):
        assert failure
    del child_return_id

    _fill_object_store_and_get(array_oid_bytes, succeed=False)
async def test_router_use_max_concurrency(serve_instance):
    # The VisibleRouter::get_queues method needs to pickle queries, so we
    # register a serializer here. In the regular code path, query
    # serialization is done manually by Serve for performance.
    ray.register_custom_serializer(Query, Query.ray_serialize,
                                   Query.ray_deserialize)
    signal = SignalActor.remote()

    @ray.remote
    class MockWorker:
        async def handle_request(self, request):
            await signal.wait.remote()
            return "DONE"

        def ready(self):
            pass

    class VisibleRouter(Router):
        def get_queues(self):
            return self.queries_counter, self.backend_queues

    worker = MockWorker.remote()
    q = ray.remote(VisibleRouter).remote()
    await q.setup.remote("")
    backend_name = "max-concurrent-test"
    config = BackendConfig({"max_concurrent_queries": 1})
    await q.set_traffic.remote("svc", TrafficPolicy({backend_name: 1.0}))
    await q.add_new_worker.remote(backend_name, "replica-tag", worker)
    await q.set_backend_config.remote(backend_name, config)

    # We send over two queries.
    first_query = q.enqueue_request.remote(RequestMetadata("svc", None), 1)
    second_query = q.enqueue_request.remote(RequestMetadata("svc", None), 1)

    # Neither query should be available.
    with pytest.raises(ray.exceptions.RayTimeoutError):
        ray.get([first_query, second_query], timeout=0.2)

    # Let's retrieve the router's internal state.
    queries_counter, backend_queues = await q.get_queues.remote()
    # There should be just one inflight request.
    assert queries_counter[backend_name][
        "max-concurrent-test:replica-tag"] == 1
    # The second query is buffered.
    assert len(backend_queues["max-concurrent-test"]) == 1

    # Let's unblock the first query.
    await signal.send.remote(clear=True)
    assert await first_query == "DONE"

    # The internal state of the router should have changed.
    queries_counter, backend_queues = await q.get_queues.remote()
    # There should still be one inflight request.
    assert queries_counter[backend_name][
        "max-concurrent-test:replica-tag"] == 1
    # But there shouldn't be any queries in the queue.
    assert len(backend_queues["max-concurrent-test"]) == 0

    # Unblock the second query.
    await signal.send.remote(clear=True)
    assert await second_query == "DONE"

    # Check the internal state of the router one more time.
    queries_counter, backend_queues = await q.get_queues.remote()
    assert queries_counter[backend_name][
        "max-concurrent-test:replica-tag"] == 0
    assert len(backend_queues["max-concurrent-test"]) == 0
def test_dependency_refcounts(ray_start_regular):
    @ray.remote
    def one_dep(dep, signal=None, fail=False):
        if signal is not None:
            ray.get(signal.wait.remote())
        if fail:
            raise Exception("failed on purpose")

    @ray.remote
    def one_dep_large(dep, signal=None):
        if signal is not None:
            ray.get(signal.wait.remote())
        # This will be spilled to plasma.
        return np.zeros(10 * 1024 * 1024, dtype=np.uint8)

    # Test that regular plasma dependency refcounts are decremented once the
    # task finishes.
    signal = SignalActor.remote()
    large_dep = ray.put(np.zeros(10 * 1024 * 1024, dtype=np.uint8))
    result = one_dep.remote(large_dep, signal=signal)
    check_refcounts({large_dep: (1, 1), result: (1, 0)})
    ray.get(signal.send.remote())
    # Reference count should be removed once the task finishes.
    check_refcounts({large_dep: (1, 0), result: (1, 0)})
    del large_dep, result
    check_refcounts({})

    # Test that inlined dependency refcounts are decremented once they are
    # inlined.
    signal = SignalActor.remote()
    dep = one_dep.remote(None, signal=signal)
    check_refcounts({dep: (1, 0)})
    result = one_dep.remote(dep)
    check_refcounts({dep: (1, 1), result: (1, 0)})
    ray.get(signal.send.remote())
    # Reference count should be removed as soon as the dependency is inlined.
    check_refcounts({dep: (1, 0), result: (1, 0)})
    del dep, result
    check_refcounts({})

    # Test that spilled plasma dependency refcounts are decremented once
    # the task finishes.
    signal1, signal2 = SignalActor.remote(), SignalActor.remote()
    dep = one_dep_large.remote(None, signal=signal1)
    check_refcounts({dep: (1, 0)})
    result = one_dep.remote(dep, signal=signal2)
    check_refcounts({dep: (1, 1), result: (1, 0)})
    ray.get(signal1.send.remote())
    ray.get(dep, timeout=10)
    # Reference count should remain because the dependency is in plasma.
    check_refcounts({dep: (1, 1), result: (1, 0)})
    ray.get(signal2.send.remote())
    # Reference count should be removed because the task finished.
    check_refcounts({dep: (1, 0), result: (1, 0)})
    del dep, result
    check_refcounts({})

    # Test that regular plasma dependency refcounts are decremented if a task
    # fails.
    signal = SignalActor.remote()
    large_dep = ray.put(np.zeros(10 * 1024 * 1024, dtype=np.uint8))
    result = one_dep.remote(large_dep, signal=signal, fail=True)
    check_refcounts({large_dep: (1, 1), result: (1, 0)})
    ray.get(signal.send.remote())
    # Reference count should be removed once the task finishes.
    check_refcounts({large_dep: (1, 0), result: (1, 0)})
    del large_dep, result
    check_refcounts({})

    # Test that spilled plasma dependency refcounts are decremented if a task
    # fails.
    signal1, signal2 = SignalActor.remote(), SignalActor.remote()
    dep = one_dep_large.remote(None, signal=signal1)
    check_refcounts({dep: (1, 0)})
    result = one_dep.remote(dep, signal=signal2, fail=True)
    check_refcounts({dep: (1, 1), result: (1, 0)})
    ray.get(signal1.send.remote())
    ray.get(dep, timeout=10)
    # Reference count should remain because the dependency is in plasma.
    check_refcounts({dep: (1, 1), result: (1, 0)})
    ray.get(signal2.send.remote())
    # Reference count should be removed because the task finished.
    check_refcounts({dep: (1, 0), result: (1, 0)})
    del dep, result
    check_refcounts({})
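# check_refcounts, used throughout the test above, is assumed to poll the
# core worker until its reference-count table matches the expected
# {ObjectID: (local_ref_count, submitted_task_ref_count)} mapping. A rough
# sketch under that assumption (the exact accessor and key names may differ
# by Ray version):
def check_refcounts(expected, timeout=10):
    start = time.time()
    while True:
        core_worker = ray.worker.global_worker.core_worker
        actual = core_worker.get_all_reference_counts()
        sanitized = {
            object_id: (counts["local"], counts["submitted"])
            for object_id, counts in actual.items()
        }
        if sanitized == expected:
            return
        if time.time() - start > timeout:
            raise AssertionError(
                f"Timed out waiting for refcounts {expected}, "
                f"got {sanitized}")
        time.sleep(0.1)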
def test_async_actor_task_retries(ray_start_regular):
    # https://github.com/ray-project/ray/issues/11683
    signal = SignalActor.remote()

    @ray.remote
    class DyingActor:
        def __init__(self):
            print("DyingActor init called")
            self.should_exit = False

        def set_should_exit(self):
            print("DyingActor.set_should_exit called")
            self.should_exit = True

        async def get(self, x, wait=False):
            print(f"DyingActor.get called with x={x}, wait={wait}")
            if self.should_exit:
                os._exit(0)
            if wait:
                await signal.wait.remote()
            return x

    # Normal in-order actor task retries should work.
    dying = DyingActor.options(
        max_restarts=-1,
        max_task_retries=-1,
    ).remote()
    assert ray.get(dying.get.remote(1)) == 1
    ray.get(dying.set_should_exit.remote())
    assert ray.get(dying.get.remote(42)) == 42

    # Now let's try out-of-order retries:
    # Task seqno 0 will return.
    # Task seqno 1 will be pending and retried later.
    # Task seqno 2 will return.
    # Task seqno 3 will crash the actor and be retried later.
    dying = DyingActor.options(
        max_restarts=-1,
        max_task_retries=-1,
    ).remote()

    # seqno 0
    ref_0 = dying.get.remote(0)
    assert ray.get(ref_0) == 0
    # seqno 1
    ref_1 = dying.get.remote(1, wait=True)
    # seqno 2
    ref_2 = dying.set_should_exit.remote()
    assert ray.get(ref_2) is None
    # seqno 3, this will crash the actor because the previous task set
    # should_exit to True.
    ref_3 = dying.get.remote(3)

    # At this point the actor should be restarted. The two pending tasks
    # [ref_1, ref_3] should be retried, but not the completed tasks [ref_0,
    # ref_2]. Critically, if ref_2 were retried, ref_3 could never return.
    ray.get(signal.send.remote())
    assert ray.get(ref_1) == 1
    assert ray.get(ref_3) == 3
async def test_router_use_max_concurrency(serve_instance):
    signal = SignalActor.remote()

    @ray.remote
    class MockWorker:
        async def handle_request(self, request):
            await signal.wait.remote()
            return "DONE"

        def ready(self):
            pass

    class VisibleRouter(Router):
        def get_queues(self):
            return self.queries_counter, self.backend_queues

    worker = MockWorker.remote()
    q = ray.remote(VisibleRouter).remote()
    await q.setup.remote("", serve_instance._controller_name)
    backend_name = "max-concurrent-test"
    config = BackendConfig(max_concurrent_queries=1)
    await q.set_traffic.remote("svc", TrafficPolicy({backend_name: 1.0}))
    await q.add_new_worker.remote(backend_name, "replica-tag", worker)
    await q.set_backend_config.remote(backend_name, config)

    # We send over two queries.
    first_query = q.enqueue_request.remote(
        RequestMetadata(get_random_letters(10), "svc", None), 1)
    second_query = q.enqueue_request.remote(
        RequestMetadata(get_random_letters(10), "svc", None), 1)

    # Neither query should be available.
    with pytest.raises(ray.exceptions.GetTimeoutError):
        ray.get([first_query, second_query], timeout=0.2)

    # Let's retrieve the router's internal state.
    queries_counter, backend_queues = await q.get_queues.remote()
    # There should be just one inflight request.
    assert queries_counter[backend_name][
        "max-concurrent-test:replica-tag"] == 1
    # The second query is buffered.
    assert len(backend_queues["max-concurrent-test"]) == 1

    # Let's unblock the first query.
    await signal.send.remote(clear=True)
    assert await first_query == "DONE"

    # The internal state of the router should have changed.
    queries_counter, backend_queues = await q.get_queues.remote()
    # There should still be one inflight request.
    assert queries_counter[backend_name][
        "max-concurrent-test:replica-tag"] == 1
    # But there shouldn't be any queries in the queue.
    assert len(backend_queues["max-concurrent-test"]) == 0

    # Unblock the second query.
    await signal.send.remote(clear=True)
    assert await second_query == "DONE"

    # Check the internal state of the router one more time.
    queries_counter, backend_queues = await q.get_queues.remote()
    assert queries_counter[backend_name][
        "max-concurrent-test:replica-tag"] == 0
    assert len(backend_queues["max-concurrent-test"]) == 0
def test_metrics_export_end_to_end(ray_start_cluster):
    NUM_NODES = 2
    cluster = ray_start_cluster
    # Add a head node.
    cluster.add_node(
        _internal_config=json.dumps({"metrics_report_interval_ms": 1000}))
    # Add worker nodes.
    [cluster.add_node() for _ in range(NUM_NODES - 1)]
    cluster.wait_for_nodes()
    ray.init(address=cluster.address)

    signal = SignalActor.remote()

    # Generate some metrics from actors & tasks.
    @ray.remote
    def f():
        counter = Count("test_counter", "desc", "unit", [])
        ray.get(signal.send.remote())
        while True:
            counter.record(1, {})
            time.sleep(0.1)

    @ray.remote
    class A:
        async def ready(self):
            pass

        async def ping(self):
            histogram = Histogram("test_histogram", "desc", "unit",
                                  [0, 1, 2], [])
            while True:
                histogram.record(1, {})
                await asyncio.sleep(0.1)

    obj_refs = [f.remote() for _ in range(30)]
    a = A.remote()
    obj_refs.append(a.ping.remote())

    # Make sure both the histogram and the counter are created.
    ray.get(a.ready.remote())
    ray.get(signal.wait.remote())

    node_info_list = ray.nodes()
    prom_addresses = []
    for node_info in node_info_list:
        metrics_export_port = node_info["MetricsExportPort"]
        addr = node_info["NodeManagerAddress"]
        prom_addresses.append(f"{addr}:{metrics_export_port}")

    # Make sure we can ping Prometheus endpoints.
    def fetch_prometheus(prom_addresses):
        components_dict = {}
        metric_names = set()
        for address in prom_addresses:
            if address not in components_dict:
                components_dict[address] = set()
            try:
                response = requests.get(f"http://{address}")
            except requests.exceptions.ConnectionError:
                return components_dict, metric_names

            for line in response.text.split("\n"):
                for family in text_string_to_metric_families(line):
                    for sample in family.samples:
                        metric_names.add(sample.name)
                        if "Component" in sample.labels:
                            components_dict[address].add(
                                sample.labels["Component"])
        return components_dict, metric_names

    def test_prometheus_endpoint():
        # TODO(Simon): Add a gcs_server after fixing metrics.
        components_dict, metric_names = fetch_prometheus(prom_addresses)

        # The raylet should be on every node.
        expected_components = {"raylet"}
        components_found = all(
            expected_components.issubset(components)
            for components in components_dict.values())

        # The core worker should be on at least one node.
        components_found = components_found and any(
            "core_worker" in components
            for components in components_dict.values())

        expected_metric_names = {"ray_test_counter", "ray_test_histogram_max"}
        metric_names_found = expected_metric_names.issubset(metric_names)

        return components_found and metric_names_found

    try:
        wait_for_condition(
            test_prometheus_endpoint,
            timeout=20,
            # Yield resources for other processes.
            retry_interval_ms=1000,
        )
    except RuntimeError:
        # This is for debugging when the test fails.
        raise RuntimeError(
            "Not all components were visible to "
            "Prometheus endpoints in time. "
            f"The components are {fetch_prometheus(prom_addresses)}")
    ray.shutdown()
async def test_replica_set(ray_instance):
    signal = SignalActor.remote()

    @ray.remote(num_cpus=0)
    class MockWorker:
        _num_queries = 0

        async def handle_request(self, request):
            self._num_queries += 1
            await signal.wait.remote()
            return "DONE"

        async def num_queries(self):
            return self._num_queries

    # We will test a scenario with two replicas in the replica set.
    rs = ReplicaSet()
    workers = [MockWorker.remote() for _ in range(2)]
    rs.set_max_concurrent_queries(1)
    rs.update_worker_replicas(workers)

    # Send two queries. They should go through the router but be blocked by
    # the signal actor.
    query = Query([], {}, TaskContext.Python,
                  RequestMetadata("request-id", "endpoint",
                                  TaskContext.Python))
    first_ref = await rs.assign_replica(query)
    second_ref = await rs.assign_replica(query)

    # These should be blocked by the signal actor.
    with pytest.raises(ray.exceptions.GetTimeoutError):
        ray.get([first_ref, second_ref], timeout=1)

    # Each replica should have exactly one inflight query. Let's make sure
    # the queries arrived there.
    for worker in workers:
        while await worker.num_queries.remote() != 1:
            await asyncio.sleep(1)

    # Let's try to send another query.
    third_ref_pending_task = asyncio.get_event_loop().create_task(
        rs.assign_replica(query))
    # We should fail to assign a replica, so this coroutine should still be
    # pending after some time.
    await asyncio.sleep(0.2)
    assert not third_ref_pending_task.done()

    # Let's unblock the two workers.
    await signal.send.remote()
    assert await first_ref == "DONE"
    assert await second_ref == "DONE"

    # The third request should be unblocked and sent to the first worker.
    # This means we should be able to get the object ref.
    third_ref = await third_ref_pending_task

    # Now that we have the object ref, let's get its result.
    await signal.send.remote()
    assert await third_ref == "DONE"

    # Finally, make sure that one of the replicas processed the third query.
    num_queries_set = {(await worker.num_queries.remote())
                       for worker in workers}
    assert num_queries_set == {2, 1}