def test_placement_group_status_no_bundle_demand(ray_start_cluster):
    """A removed placement group's ready() task must not show up as demand."""
    cluster = ray_start_cluster
    cluster.add_node(num_cpus=4)
    ray.init(address=cluster.address)

    @ray.remote
    def f():
        pass

    group = ray.util.placement_group([{"CPU": 1}])
    ray.get(group.ready())
    ray.util.remove_placement_group(group)
    wait_for_condition(lambda: is_placement_group_removed(group))

    # Request ready() again after the group is gone; this pending task
    # should not be reported as a resource demand.
    _ready_ref = group.ready()  # noqa

    def usage_reported():
        # Once "usage" is populated, the demand section has been
        # refreshed as well, so it is safe to read it.
        status = get_ray_status_output(cluster.address)
        return status["usage"] != ""

    wait_for_condition(usage_reported)

    # The demand section must not mention the stale ready() task.
    status = get_ray_status_output(cluster.address)
    assert status["demand"] == "(no resource demands)"
def test_placement_group_synchronous_registration(ray_start_cluster, connect_to_client):
    """A registered-but-unschedulable placement group can be removed right away."""
    cluster = ray_start_cluster
    # A single node with only one CPU.
    cluster.add_node(num_cpus=1)
    cluster.wait_for_nodes()
    ray.init(address=cluster.address)

    with connect_to_client_or_not(connect_to_client):
        # Two bundles under STRICT_PACK cannot fit on the one-CPU node,
        # so registration succeeds while scheduling never completes.
        pg = ray.util.placement_group(
            name="name",
            strategy="STRICT_PACK",
            bundles=[{"CPU": 1}, {"CPU": 1}],
        )
        # Registration is synchronous, so immediate removal must work
        # even though the group was never scheduled.
        ray.util.remove_placement_group(pg)
        wait_for_condition(lambda: is_placement_group_removed(pg))
def test_remove_placement_group(ray_start_cluster, connect_to_client):
    """Removing a placement group releases its resources and kills its users."""
    cluster = ray_start_cluster
    cluster.add_node(num_cpus=4)
    ray.init(address=cluster.address)

    @ray.remote
    def warmup():
        pass

    # Warm up the cluster.
    ray.get([warmup.remote() for _ in range(4)])

    with connect_to_client_or_not(connect_to_client):
        # Removing a placement group that was never created should be a no-op.
        random_group_id = PlacementGroupID.from_random()
        nonexistent_pg = PlacementGroup(random_group_id)
        for _ in range(3):
            ray.util.remove_placement_group(nonexistent_pg)

        # Removing a placement group immediately after creation should work.
        pg = ray.util.placement_group([{"CPU": 2}, {"CPU": 2}])
        assert pg.wait(10)
        ray.util.remove_placement_group(pg)
        wait_for_condition(lambda: is_placement_group_removed(pg))

        # Create a fresh placement group to exercise removal while in use.
        pg = ray.util.placement_group([{"CPU": 2}, {"CPU": 2}])
        assert pg.wait(10)

        # An actor that occupies one bundle's resources.
        @ray.remote(num_cpus=2)
        class A:
            def f(self):
                return 3

        # There is currently no way to prevent retries of tasks whose
        # placement group was removed, so disable retries for this test.
        # TODO(sang): Handle this edge case.
        @ray.remote(num_cpus=2, max_retries=0)
        def long_running_task():
            print(os.getpid())
            import time

            time.sleep(50)

        # Schedule a long-running task and an actor inside the group.
        pending_task = long_running_task.options(placement_group=pg).remote()
        actor = A.options(placement_group=pg).remote()
        assert ray.get(actor.f.remote()) == 3

        ray.util.remove_placement_group(pg)
        # Repeated removal of the same group should be a no-op.
        for _ in range(3):
            ray.util.remove_placement_group(pg)

        # The group's resources must be released: a task needing the
        # whole node can now be scheduled.
        @ray.remote(num_cpus=4)
        def f():
            return 3

        assert ray.get(f.remote()) == 3

        # The actor was killed along with its placement group, so any
        # subsequent call must fail.
        with pytest.raises(ray.exceptions.RayActorError, match="actor died"):
            ray.get(actor.f.remote(), timeout=3.0)

        # The long-running task's worker was killed as well.
        with pytest.raises(ray.exceptions.WorkerCrashedError):
            ray.get(pending_task)