def test_remove_placement_group(ray_start_cluster):
    """Exercise placement group removal on a single 4-CPU node.

    Covers, in order: removing a group that was never created, removing a
    group right after it becomes ready, and removing a group that still has
    a task and an actor scheduled in it.
    """
    cluster = ray_start_cluster
    cluster.add_node(num_cpus=4)
    ray.init(address=cluster.address)

    # Removal requests for a placement group that doesn't exist should
    # not do anything, even when repeated.
    unknown_pg = PlacementGroup(PlacementGroupID.from_random())
    for _ in range(3):
        ray.util.remove_placement_group(unknown_pg)

    # Removing a placement group as soon as it is created should work.
    short_lived_pg = ray.util.placement_group([{"CPU": 2}, {"CPU": 2}])
    assert short_lived_pg.wait(10)
    ray.util.remove_placement_group(short_lived_pg)

    def is_placement_group_removed():
        # The table entry may not carry a "state" key yet; treat that as
        # "not removed" so the caller keeps polling.
        entry = ray.util.placement_group_table(short_lived_pg)
        return "state" in entry and entry["state"] == "REMOVED"

    wait_for_condition(is_placement_group_removed)

    # Now create the placement group that will host a task and an actor.
    busy_pg = ray.util.placement_group([{"CPU": 2}, {"CPU": 2}])
    assert busy_pg.wait(10)

    # An actor that occupies resources.
    @ray.remote(num_cpus=2)
    class A:
        def f(self):
            return 3

    # Currently, there's no way to prevent tasks from being retried
    # for a removed placement group, so set max_retries=0 for testing.
    # TODO(sang): Handle this edge case.
    @ray.remote(num_cpus=2, max_retries=0)
    def long_running_task():
        print(os.getpid())
        import time
        time.sleep(50)

    # Schedule the long running task and the actor inside the group.
    task_ref = long_running_task.options(placement_group=busy_pg).remote()
    actor = A.options(placement_group=busy_pg).remote()
    assert ray.get(actor.f.remote()) == 3

    ray.util.remove_placement_group(busy_pg)
    # Subsequent remove requests shouldn't do anything.
    for _ in range(3):
        ray.util.remove_placement_group(busy_pg)

    # Make sure the placement group resources are released: this task
    # needs all 4 CPUs of the node to be schedulable.
    @ray.remote(num_cpus=4)
    def f():
        return 3

    assert ray.get(f.remote()) == 3

    # Since the placement group is removed, the actor should've been
    # killed, so calling it again should fail.
    with pytest.raises(ray.exceptions.RayActorError, match="actor died"):
        ray.get(actor.f.remote(), timeout=3.0)
    # The long running task is expected to fail as well.
    with pytest.raises(ray.exceptions.WorkerCrashedError):
        ray.get(task_ref)
def test_named_placement_group(ray_start_cluster):
    """Check lookup, name uniqueness and name reuse of named placement groups.

    A separate driver creates a detached, named placement group and exits.
    This test then looks the group up by name, schedules an actor in it,
    verifies the name cannot be reused while the group exists, verifies it
    can be reused after removal, and verifies that looking up an unknown
    name raises ``ValueError``.
    """
    cluster = ray_start_cluster
    for _ in range(2):
        cluster.add_node(num_cpus=3)
    cluster.wait_for_nodes()
    info = ray.init(address=cluster.address)
    global_placement_group_name = "named_placement_group"

    # Create a detached placement group with a name from another driver.
    driver_code = f"""
import ray

ray.init(address="{info["redis_address"]}")

pg = ray.util.placement_group(
        [{{"CPU": 1}} for _ in range(2)],
        strategy="STRICT_SPREAD",
        name="{global_placement_group_name}",
        lifetime="detached")
ray.get(pg.ready())

ray.shutdown()
    """

    run_string_as_driver(driver_code)

    # Wait until the driver is reported as dead by GCS.
    def is_job_done():
        # A job entry that carries "StopTime" is taken as a finished job.
        return any("StopTime" in job for job in ray.jobs())

    wait_for_condition(is_job_done)

    @ray.remote(num_cpus=1)
    class Actor:
        def ping(self):
            return "pong"

    # Get the named placement group and schedule an actor in it.
    placement_group = ray.util.get_placement_group(global_placement_group_name)
    assert placement_group is not None
    assert placement_group.wait(5)
    actor = Actor.options(
        placement_group=placement_group,
        placement_group_bundle_index=0).remote()
    assert ray.get(actor.ping.remote()) == "pong"

    # Creating another placement group with the same name must fail.
    with pytest.raises(RaySystemError):
        ray.util.placement_group(
            [{"CPU": 1} for _ in range(2)],
            strategy="STRICT_SPREAD",
            name=global_placement_group_name)

    # After the named placement group is removed, creating a new one
    # with the same name must succeed.
    ray.util.remove_placement_group(placement_group)
    same_name_pg = ray.util.placement_group(
        [{"CPU": 1} for _ in range(2)],
        strategy="STRICT_SPREAD",
        name=global_placement_group_name)
    assert same_name_pg.wait(10)

    # Looking up a placement group by a name that doesn't exist
    # must raise ValueError.
    with pytest.raises(ValueError):
        ray.util.get_placement_group("inexistent_pg")