def test_limit_concurrency(shutdown_only):
    ray.init(num_cpus=1)
    block_task = Semaphore.remote(0)
    block_driver = Semaphore.remote(0)
    ray.get([block_task.locked.remote(), block_driver.locked.remote()])

    @ray.remote(num_cpus=1)
    def foo():
        ray.get(block_driver.release.remote())
        ray.get(block_task.acquire.remote())

    refs = [foo.remote() for _ in range(20)]
    block_driver_refs = [block_driver.acquire.remote() for _ in range(20)]

    # Some of the tasks will run since we relax the cap, but not all, because
    # it should take exponentially long for the cap to be increased.
    ready, not_ready = ray.wait(block_driver_refs, timeout=10, num_returns=20)
    assert len(not_ready) >= 1

    # Now let the first 19 instances of foo finish, so the remaining task can
    # start to run.
    ray.get([block_task.release.remote() for _ in range(19)])

    ready, not_ready = ray.wait(block_driver_refs, timeout=10, num_returns=20)
    assert len(not_ready) == 0

    ready, not_ready = ray.wait(refs, num_returns=20, timeout=15)
    assert len(ready) == 19
    assert len(not_ready) == 1


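# All of these tests coordinate tasks and drivers through the `Semaphore`
# helper imported from `ray._private.test_utils`. For reference only, a
# minimal sketch of what such a helper likely looks like is below: an async
# actor wrapping `asyncio.Semaphore`, which is why `acquire`, `release`, and
# `locked` are all invoked with `.remote()`. This is an assumption for
# illustration, not the exact upstream implementation.
import asyncio

import ray


@ray.remote(num_cpus=0)
class Semaphore:
    def __init__(self, value=1):
        self._sema = asyncio.Semaphore(value=value)

    async def acquire(self):
        # Blocks (asynchronously) until a permit is available.
        await self._sema.acquire()

    async def release(self):
        # Returns a permit, waking up one pending acquire() if any.
        self._sema.release()

    async def locked(self):
        # True if acquire() would currently block (no permits left).
        return self._sema.locked()

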
def test_for_each_concur_sync(ray_start_regular_shared):
    main_wait = Semaphore.remote(value=0)
    test_wait = Semaphore.remote(value=0)

    def task(x):
        i, main_wait, test_wait = x
        ray.get(main_wait.release.remote())
        ray.get(test_wait.acquire.remote())
        return i + 10

    @ray.remote(num_cpus=0.01)
    def to_list(it):
        return list(it)

    it = from_items([(i, main_wait, test_wait) for i in range(8)], num_shards=2)
    it = it.for_each(task, max_concurrency=2, resources={"num_cpus": 0.01})

    list_promise = to_list.remote(it.gather_sync())

    for i in range(4):
        assert i in [0, 1, 2, 3]
        ray.get(main_wait.acquire.remote())

    # There should be exactly 4 tasks executing at this point.
    assert ray.get(main_wait.locked.remote()) is True, "Too much parallelism"

    for i in range(8):
        ray.get(test_wait.release.remote())

    assert repr(it) == "ParallelIterator[from_items[tuple, 8, shards=2].for_each()]"

    result_list = ray.get(list_promise)
    assert set(result_list) == set(range(10, 18))


def test_hybrid_policy(ray_start_cluster):
    cluster = ray_start_cluster
    num_nodes = 2
    num_cpus = 10
    for _ in range(num_nodes):
        cluster.add_node(num_cpus=num_cpus, memory=num_cpus)
    cluster.wait_for_nodes()
    ray.init(address=cluster.address)

    # `block_task` ensures that scheduled tasks do not return until all are
    # running.
    block_task = Semaphore.remote(0)
    # `block_driver` ensures that the driver does not allow tasks to continue
    # until all are running.
    block_driver = Semaphore.remote(0)

    # Add the memory resource because the CPU will be released during the
    # ray.get.
    @ray.remote(num_cpus=1, memory=1)
    def get_node():
        ray.get(block_driver.release.remote())
        ray.get(block_task.acquire.remote())
        return ray.worker.global_worker.current_node_id

    # Below the hybrid threshold we pack on the local node first.
    refs = [get_node.remote() for _ in range(5)]
    ray.get([block_driver.acquire.remote() for _ in refs])
    ray.get([block_task.release.remote() for _ in refs])
    nodes = ray.get(refs)
    assert len(set(nodes)) == 1

    # We pack the second node to the hybrid threshold.
    refs = [get_node.remote() for _ in range(10)]
    ray.get([block_driver.acquire.remote() for _ in refs])
    ray.get([block_task.release.remote() for _ in refs])
    nodes = ray.get(refs)
    counter = collections.Counter(nodes)
    for node_id in counter:
        print(f"{node_id}: {counter[node_id]}")
        assert counter[node_id] == 5

    # Once all nodes are past the hybrid threshold we round robin.
    # TODO (Alex): Ideally we could schedule fewer than 20 tasks here, but the
    # policy is imperfect if a resource report interrupts the process.
    refs = [get_node.remote() for _ in range(20)]
    ray.get([block_driver.acquire.remote() for _ in refs])
    ray.get([block_task.release.remote() for _ in refs])
    nodes = ray.get(refs)
    counter = collections.Counter(nodes)
    for node_id in counter:
        print(f"{node_id}: {counter[node_id]}")
        assert counter[node_id] == 10, counter


def test_worker_failed(ray_start_workers_separate_multinode):
    num_nodes, num_initial_workers = ray_start_workers_separate_multinode

    block_worker = Semaphore.remote(0)
    block_driver = Semaphore.remote(0)
    ray.get([block_worker.locked.remote(), block_driver.locked.remote()])

    # Acquire a custom resource that isn't released on `ray.get` to make sure
    # this task gets spread across all the nodes.
    @ray.remote(num_cpus=1, resources={"custom": 1})
    def get_pids():
        ray.get(block_driver.release.remote())
        ray.get(block_worker.acquire.remote())
        return os.getpid()

    total_num_workers = num_nodes * num_initial_workers
    pid_refs = [get_pids.remote() for _ in range(total_num_workers)]
    ray.get([block_driver.acquire.remote() for _ in range(total_num_workers)])
    ray.get([block_worker.release.remote() for _ in range(total_num_workers)])

    pids = set(ray.get(pid_refs))

    @ray.remote
    def f(x):
        time.sleep(0.5)
        return x

    # Submit more tasks than there are workers so that all workers and
    # cores are utilized.
    object_refs = [f.remote(i) for i in range(num_initial_workers * num_nodes)]
    object_refs += [f.remote(object_ref) for object_ref in object_refs]
    # Allow the tasks some time to begin executing.
    time.sleep(0.1)
    # Kill the workers as the tasks execute.
    for pid in pids:
        try:
            os.kill(pid, SIGKILL)
        except OSError:
            # The process may have already exited due to worker capping.
            pass
        time.sleep(0.1)
    # Make sure that we either get the object or we get an appropriate
    # exception.
    for object_ref in object_refs:
        try:
            ray.get(object_ref)
        except (ray.exceptions.RayTaskError, ray.exceptions.WorkerCrashedError):
            pass


def test_warning_for_too_many_nested_tasks(shutdown_only):
    # Check that we receive a warning when a workload requires too many
    # workers to be started.
    num_cpus = 2
    ray.init(num_cpus=num_cpus)
    p = init_error_pubsub()

    remote_wait = Semaphore.remote(value=0)
    nested_wait = Semaphore.remote(value=0)
    ray.get(
        [
            remote_wait.locked.remote(),
            nested_wait.locked.remote(),
        ]
    )

    @ray.remote(num_cpus=0.25)
    def f():
        time.sleep(1000)
        return 1

    @ray.remote(num_cpus=0.25)
    def h(nested_waits):
        nested_wait.release.remote()
        ray.get(nested_waits)
        ray.get(f.remote())

    @ray.remote(num_cpus=0.25)
    def g(remote_waits, nested_waits):
        # Block here so that the f tasks all get submitted to the scheduler
        # after the g tasks.
        remote_wait.release.remote()
        # Wait until every lock is released.
        ray.get(remote_waits)
        ray.get(h.remote(nested_waits))

    num_root_tasks = num_cpus * 4
    # Lock the root tasks until everything is scheduled.
    remote_waits = []
    nested_waits = []
    for _ in range(num_root_tasks):
        remote_waits.append(remote_wait.acquire.remote())
        nested_waits.append(nested_wait.acquire.remote())

    [g.remote(remote_waits, nested_waits) for _ in range(num_root_tasks)]

    errors = get_error_message(p, 1, ray_constants.WORKER_POOL_LARGE_ERROR)
    assert len(errors) == 1
    assert errors[0].type == ray_constants.WORKER_POOL_LARGE_ERROR
    p.close()


def test_many_queued_tasks():
    sema = Semaphore.remote(0)

    @ray.remote(num_cpus=1)
    def block():
        ray.get(sema.acquire.remote())

    @ray.remote(num_cpus=1)
    def f():
        pass

    num_cpus = int(ray.cluster_resources()["CPU"])
    blocked_tasks = []
    for _ in range(num_cpus):
        blocked_tasks.append(block.remote())

    print("Submitting many tasks")
    pending_tasks = []
    for _ in trange(MAX_QUEUED_TASKS):
        pending_tasks.append(f.remote())

    # Make sure all the tasks can actually run.
    for _ in range(num_cpus):
        sema.release.remote()

    print("Unblocking tasks")
    for ref in tqdm(pending_tasks):
        assert ray.get(ref) is None


def test_back_pressure(shutdown_only_with_initialization_check):
    ray.init()

    signal_actor = Semaphore.options(max_pending_calls=10).remote(value=0)

    try:
        for i in range(10):
            signal_actor.acquire.remote()
    except ray.exceptions.PendingCallsLimitExceeded:
        assert False

    with pytest.raises(ray.exceptions.PendingCallsLimitExceeded):
        signal_actor.acquire.remote()

    @ray.remote
    def release(signal_actor):
        ray.get(signal_actor.release.remote())
        return 1

    # Release the signal actor through a normal task, because actor tasks
    # will be back-pressured.
    for i in range(10):
        ray.get(release.remote(signal_actor))

    # Check that we can call the remote actor normally after the back
    # pressure is released.
    try:
        signal_actor.acquire.remote()
    except ray.exceptions.PendingCallsLimitExceeded:
        assert False

    ray.shutdown()


def test_task_status(ray_start_regular):
    address = ray_start_regular["address"]

    @ray.remote
    def dep(sema, x=None):
        ray.get(sema.acquire.remote())
        return

    @ray.remote(num_gpus=1)
    def impossible():
        pass

    # Filter out actor handle refs.
    def filtered_summary():
        data = "\n".join(
            [
                line
                for line in memory_summary(address, line_wrap=False).split("\n")
                if "ACTOR_HANDLE" not in line
            ]
        )
        print(data)
        return data

    sema = Semaphore.remote(value=0)
    x = dep.remote(sema)
    y = dep.remote(sema, x=x)
    im = impossible.remote()  # noqa

    # x and its semaphore task are scheduled. im cannot
    # be scheduled, so it is pending forever.
    wait_for_condition(lambda: count(filtered_summary(), WAITING_FOR_EXECUTION) == 2)
    wait_for_condition(lambda: count(filtered_summary(), WAITING_FOR_DEPENDENCIES) == 1)
    wait_for_condition(lambda: count(filtered_summary(), SCHEDULED) == 1)

    z = dep.remote(sema, x=x)
    wait_for_condition(lambda: count(filtered_summary(), WAITING_FOR_DEPENDENCIES) == 2)
    wait_for_condition(lambda: count(filtered_summary(), WAITING_FOR_EXECUTION) == 2)
    wait_for_condition(lambda: count(filtered_summary(), FINISHED) == 0)

    sema.release.remote()
    time.sleep(2)
    wait_for_condition(lambda: count(filtered_summary(), FINISHED) == 1)
    wait_for_condition(lambda: count(filtered_summary(), WAITING_FOR_DEPENDENCIES) == 0)
    # y, z, and two semaphore tasks are scheduled.
    wait_for_condition(lambda: count(filtered_summary(), WAITING_FOR_EXECUTION) == 4)

    sema.release.remote()
    wait_for_condition(lambda: count(filtered_summary(), FINISHED) == 2)
    wait_for_condition(lambda: count(filtered_summary(), WAITING_FOR_DEPENDENCIES) == 0)
    wait_for_condition(lambda: count(filtered_summary(), WAITING_FOR_EXECUTION) == 2)

    sema.release.remote()
    ray.get(y)
    ray.get(z)
    wait_for_condition(lambda: count(filtered_summary(), FINISHED) == 3)
    wait_for_condition(lambda: count(filtered_summary(), WAITING_FOR_DEPENDENCIES) == 0)
    wait_for_condition(lambda: count(filtered_summary(), WAITING_FOR_EXECUTION) == 0)
    wait_for_condition(lambda: count(filtered_summary(), SCHEDULED) == 1)


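# `memory_summary`, `wait_for_condition`, `count`, and the state-name
# constants (SCHEDULED, WAITING_FOR_EXECUTION, WAITING_FOR_DEPENDENCIES,
# FINISHED) come from Ray's test utilities. The sketch below is an assumption
# about the behavior the tests here rely on, not the exact upstream
# implementation: `count` counts how often a task-state string appears in the
# memory summary, and `wait_for_condition` polls a predicate until it holds.
import time


def count(memory_str, substr):
    # Number of occurrences of the given task-state string in the summary.
    return memory_str.count(substr)


def wait_for_condition(condition_predictor, timeout=10, retry_interval_ms=100):
    # Poll the predicate until it returns True or the timeout expires.
    start = time.time()
    while time.time() - start <= timeout:
        if condition_predictor():
            return
        time.sleep(retry_interval_ms / 1000.0)
    raise RuntimeError("The condition wasn't met before the timeout expired.")

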
def test_zero_cpu_scheduling(shutdown_only):
    ray.init(num_cpus=1)
    block_task = Semaphore.remote(0)
    block_driver = Semaphore.remote(0)

    @ray.remote(num_cpus=0)
    def foo():
        ray.get(block_driver.release.remote())
        ray.get(block_task.acquire.remote())

    foo.remote()
    foo.remote()

    ray.get(block_driver.acquire.remote())
    block_driver_ref = block_driver.acquire.remote()

    # Both tasks should be running, so the driver should be unblocked.
    ready, not_ready = ray.wait([block_driver_ref], timeout=1)
    assert len(not_ready) == 0


def test_task_status(ray_start_regular):
    address = ray_start_regular["address"]

    @ray.remote
    def dep(sema, x=None):
        ray.get(sema.acquire.remote())
        return

    # Filter out actor handle refs.
    def filtered_summary():
        return "\n".join(
            [
                line
                for line in memory_summary(address, line_wrap=False).split("\n")
                if "ACTOR_HANDLE" not in line
            ]
        )

    sema = Semaphore.remote(value=0)
    x = dep.remote(sema)
    y = dep.remote(sema, x=x)

    # x and its semaphore task are scheduled.
    wait_for_condition(lambda: count(filtered_summary(), SCHEDULED) == 2)
    wait_for_condition(lambda: count(filtered_summary(), WAITING_FOR_DEPENDENCIES) == 1)

    z = dep.remote(sema, x=x)
    wait_for_condition(lambda: count(filtered_summary(), WAITING_FOR_DEPENDENCIES) == 2)
    wait_for_condition(lambda: count(filtered_summary(), SCHEDULED) == 2)
    wait_for_condition(lambda: count(filtered_summary(), FINISHED) == 0)

    sema.release.remote()
    time.sleep(2)
    wait_for_condition(lambda: count(filtered_summary(), FINISHED) == 1)
    wait_for_condition(lambda: count(filtered_summary(), WAITING_FOR_DEPENDENCIES) == 0)
    # y, z, and two semaphore tasks are scheduled.
    wait_for_condition(lambda: count(filtered_summary(), SCHEDULED) == 4)

    sema.release.remote()
    wait_for_condition(lambda: count(filtered_summary(), FINISHED) == 2)
    wait_for_condition(lambda: count(filtered_summary(), WAITING_FOR_DEPENDENCIES) == 0)
    wait_for_condition(lambda: count(filtered_summary(), SCHEDULED) == 2)

    sema.release.remote()
    ray.get(y)
    ray.get(z)
    wait_for_condition(lambda: count(filtered_summary(), FINISHED) == 3)
    wait_for_condition(lambda: count(filtered_summary(), WAITING_FOR_DEPENDENCIES) == 0)
    wait_for_condition(lambda: count(filtered_summary(), SCHEDULED) == 0)


def test_multi_driver_logging(ray_start_regular):
    address_info = ray_start_regular
    address = address_info["address"]

    # ray.init(address=address)
    driver1_wait = Semaphore.options(name="driver1_wait").remote(value=0)
    driver2_wait = Semaphore.options(name="driver2_wait").remote(value=0)
    main_wait = Semaphore.options(name="main_wait").remote(value=0)

    # The creation of an actor is asynchronous.
    # We need to wait for the completion of the actor creation,
    # otherwise we can't get the actor by name.
    ray.get(driver1_wait.locked.remote())
    ray.get(driver2_wait.locked.remote())
    ray.get(main_wait.locked.remote())

    # Params are address, semaphore name, output1, output2.
    driver_script_template = """
import ray
import sys
from ray._private.test_utils import Semaphore

@ray.remote(num_cpus=0)
def remote_print(s, file=None):
    print(s, file=file)

ray.init(address="{}", namespace="default_test_namespace")

driver_wait = ray.get_actor("{}")
main_wait = ray.get_actor("main_wait")

ray.get(main_wait.release.remote())
ray.get(driver_wait.acquire.remote())

s1 = "{}"
ray.get(remote_print.remote(s1))

ray.get(main_wait.release.remote())
ray.get(driver_wait.acquire.remote())

s2 = "{}"
ray.get(remote_print.remote(s2))

ray.get(main_wait.release.remote())
"""

    p1 = run_string_as_driver_nonblocking(
        driver_script_template.format(address, "driver1_wait", "1", "2")
    )
    p2 = run_string_as_driver_nonblocking(
        driver_script_template.format(address, "driver2_wait", "3", "4")
    )

    ray.get(main_wait.acquire.remote())
    ray.get(main_wait.acquire.remote())
    # At this point both of the other drivers are fully initialized.

    ray.get(driver1_wait.release.remote())
    ray.get(driver2_wait.release.remote())

    # At this point driver1 should receive '1' and driver2 '3'.
    ray.get(main_wait.acquire.remote())
    ray.get(main_wait.acquire.remote())

    ray.get(driver1_wait.release.remote())
    ray.get(driver2_wait.release.remote())

    # At this point driver1 should receive '2' and driver2 '4'.
    ray.get(main_wait.acquire.remote())
    ray.get(main_wait.acquire.remote())

    driver1_out = p1.stdout.read().decode("ascii")
    driver2_out = p2.stdout.read().decode("ascii")
    if sys.platform == "win32":
        driver1_out = driver1_out.replace("\r", "")
        driver2_out = driver2_out.replace("\r", "")
    driver1_out_split = driver1_out.split("\n")
    driver2_out_split = driver2_out.split("\n")

    assert driver1_out_split[0][-1] == "1", driver1_out_split
    assert driver1_out_split[1][-1] == "2", driver1_out_split
    assert driver2_out_split[0][-1] == "3", driver2_out_split
    assert driver2_out_split[1][-1] == "4", driver2_out_split


def test_memory_util(ray_start_cluster):
    config = {
        "num_heartbeats_timeout": 10,
        "raylet_heartbeat_period_milliseconds": 100,
        "object_timeout_milliseconds": 200,
    }
    cluster = ray_start_cluster
    # Head node with no resources.
    cluster.add_node(
        num_cpus=0,
        resources={"head": 1},
        _system_config=config,
        enable_object_reconstruction=True,
    )
    ray.init(address=cluster.address)
    # Node to place the initial object.
    node_to_kill = cluster.add_node(
        num_cpus=1, resources={"node1": 1}, object_store_memory=10**8
    )
    cluster.wait_for_nodes()

    @ray.remote
    def large_object(sema=None):
        if sema is not None:
            ray.get(sema.acquire.remote())
        return np.zeros(10**7, dtype=np.uint8)

    @ray.remote
    def dependent_task(x, sema):
        ray.get(sema.acquire.remote())
        return x

    def stats():
        info = memory_summary(cluster.address, line_wrap=False)
        print(info)
        info = info.split("\n")
        reconstructing_waiting = [
            line
            for line in info
            if "Attempt #2" in line and WAITING_FOR_DEPENDENCIES in line
        ]
        reconstructing_scheduled = [
            line
            for line in info
            if "Attempt #2" in line and WAITING_FOR_EXECUTION in line
        ]
        reconstructing_finished = [
            line for line in info if "Attempt #2" in line and FINISHED in line
        ]
        return (
            len(reconstructing_waiting),
            len(reconstructing_scheduled),
            len(reconstructing_finished),
        )

    sema = Semaphore.options(resources={"head": 1}).remote(value=0)
    obj = large_object.options(resources={"node1": 1}).remote(sema)
    x = dependent_task.options(resources={"node1": 1}).remote(obj, sema)
    ref = dependent_task.options(resources={"node1": 1}).remote(x, sema)
    ray.get(sema.release.remote())
    ray.get(sema.release.remote())
    ray.get(sema.release.remote())
    ray.get(ref)
    wait_for_condition(lambda: stats() == (0, 0, 0))
    del ref

    cluster.remove_node(node_to_kill, allow_graceful=False)
    node_to_kill = cluster.add_node(
        num_cpus=1, resources={"node1": 1}, object_store_memory=10**8
    )

    ref = dependent_task.remote(x, sema)
    wait_for_condition(lambda: stats() == (1, 1, 0))
    ray.get(sema.release.remote())
    wait_for_condition(lambda: stats() == (0, 1, 1))
    ray.get(sema.release.remote())
    ray.get(sema.release.remote())
    ray.get(ref)
    wait_for_condition(lambda: stats() == (0, 0, 2))