def testOthersTakingResources(self):
    # Let someone occupy the head node
    pg = placement_group([{"CPU": 4, "GPU": 1}])
    ray.get(pg.ready())
    # We are left with the second node
    assert len(nodes()) == 1
    assert default_device(refresh=True) == "GPU"

    pg = placement_group([{"GPU": 1}])
    ray.get(pg.ready())
    # Default device should be CPU
    assert default_device(refresh=True) == "CPU"
    assert len(nodes()) == 1
def test_warning_for_infeasible_tasks(ray_start_regular, error_pubsub):
    p = error_pubsub
    # Check that we get warning messages for infeasible tasks.

    @ray.remote(num_gpus=1)
    def f():
        pass

    @ray.remote(resources={"Custom": 1})
    class Foo:
        pass

    # This task is infeasible.
    f.remote()
    errors = get_error_message(p, 1, ray_constants.INFEASIBLE_TASK_ERROR)
    assert len(errors) == 1
    assert errors[0].type == ray_constants.INFEASIBLE_TASK_ERROR

    # This actor placement task is infeasible.
    Foo.remote()
    errors = get_error_message(p, 1, ray_constants.INFEASIBLE_TASK_ERROR)
    assert len(errors) == 1
    assert errors[0].type == ray_constants.INFEASIBLE_TASK_ERROR

    # Placement group cannot be made, but no warnings should occur.
    pg = placement_group([{"GPU": 1}], strategy="STRICT_PACK")
    pg.ready()
    f.options(placement_group=pg).remote()

    errors = get_error_message(
        p, 1, ray_constants.INFEASIBLE_TASK_ERROR, timeout=5)
    assert len(errors) == 0, errors
def _create_placement_group(num_cpus_per_actor, num_actors):
    """
    Create Ray placement group to grab resources.

    Parameters
    ----------
    num_cpus_per_actor : int
        Number of CPUs per actor.
    num_actors : int
        Number of actors.

    Returns
    -------
    ray.util.PlacementGroup
        Placement group with grabbed resources.
    """
    cpu_bundle = {"CPU": num_cpus_per_actor}
    bundles = [cpu_bundle for _ in range(num_actors)]
    pg = placement_group(bundles, strategy="SPREAD")
    ready, _ = ray.wait([pg.ready()], timeout=100)
    # ``ray.wait`` returns an empty list (not None) on timeout.
    if not ready:
        raise TimeoutError("Placement group creation timeout.")
    return pg
def pg_launcher(num_pgs_to_create):
    print("Creating pgs")
    pgs = []
    for i in range(num_pgs_to_create):
        pgs.append(placement_group(bundles, strategy="STRICT_SPREAD"))

    pgs_removed = []
    pgs_unremoved = []
    # Randomly choose placement groups to remove.
    if pg_removal:
        print("removing pgs")
    for pg in pgs:
        if random() < 0.5 and pg_removal:
            pgs_removed.append(pg)
        else:
            pgs_unremoved.append(pg)
    print(len(pgs_unremoved))

    tasks = []
    # Randomly schedule tasks or actors on placement groups that
    # are not removed.
    for pg in pgs_unremoved:
        for i in range(num_nodes):
            tasks.append(
                mock_task.options(
                    placement_group=pg,
                    placement_group_bundle_index=i).remote())

    # Remove the rest of placement groups.
    if pg_removal:
        for pg in pgs_removed:
            remove_placement_group(pg)
    ray.get(tasks)

    # Since placement groups are scheduled, remove them.
    for pg in pgs_unremoved:
        remove_placement_group(pg)
def main():
    ray.init(address="auto")

    bundles = [{"CPU": 1, "GPU": 1}]
    bundles += [{"CPU": 1} for _ in range(NUM_CPU_BUNDLES)]

    pg = placement_group(bundles, strategy="PACK")
    ray.get(pg.ready())

    workers = [
        Worker.options(placement_group=pg).remote(i)
        for i in range(NUM_CPU_BUNDLES)
    ]
    trainer = Trainer.options(placement_group=pg).remote(0)

    start = time.time()
    while True:
        ray.get([workers[i].work.remote() for i in range(NUM_CPU_BUNDLES)])
        ray.get(trainer.train.remote())
        end = time.time()
        if end - start > RUNTIME:
            break

    if "TEST_OUTPUT_JSON" in os.environ:
        out_file = open(os.environ["TEST_OUTPUT_JSON"], "w")
        results = {}
        json.dump(results, out_file)
def pg_launcher(pre_created_pgs, num_pgs_to_create):
    pgs = []
    pgs += pre_created_pgs
    for i in range(num_pgs_to_create):
        pgs.append(
            placement_group(bundles, strategy="STRICT_SPREAD", name=str(i)))

    pgs_removed = []
    pgs_unremoved = []
    # Randomly choose placement groups to remove.
    for pg in pgs:
        if random() < .5:
            pgs_removed.append(pg)
        else:
            pgs_unremoved.append(pg)

    # Randomly schedule tasks or actors on placement groups that
    # are not removed.
    for pg in pgs_unremoved:
        # TODO(sang): Comment in this line causes GCS actor management
        # failure. We need to fix it.
        # if random() < .5:
        mock_task.options(placement_group=pg).remote()
        # else:
        #     MockActor.options(placement_group=pg).remote()

    # Remove the rest of placement groups.
    for pg in pgs_removed:
        remove_placement_group(pg)

    ray.get([pg.ready() for pg in pgs_unremoved], timeout=10)

    # Since placement groups are scheduled, remove them.
    for pg in pgs_unremoved:
        remove_placement_group(pg)
def test_placement_group_removal_leak_regression(ray_start_cluster):
    """Related issue:
        https://github.com/ray-project/ray/issues/19131
    """
    cluster = ray_start_cluster
    cluster.add_node(num_cpus=5)
    ray.init(address=cluster.address)

    TOTAL_CPUS = 8
    bundles = [{"CPU": 1, "GPU": 1}]
    bundles += [{"CPU": 1} for _ in range(TOTAL_CPUS - 1)]

    pg = placement_group(bundles, strategy="PACK")
    # Here, we simulate that the ready task is queued and
    # the new node is up. As soon as the new node is up,
    # the ready task is scheduled.
    # See https://github.com/ray-project/ray/pull/19138
    # for more details about the test.
    o = pg.ready()
    # Add an artificial delay until the new node is up.
    time.sleep(3)
    cluster.add_node(num_cpus=5, num_gpus=1)
    ray.get(o)
    bundle_resource_name = f"bundle_group_{pg.id.hex()}"
    expected_bundle_wildcard_val = TOTAL_CPUS * 1000

    # This should fail if there's a leakage
    # because the bundle resources are never returned properly.
    def check_bundle_leaks():
        bundle_resources = ray.available_resources()[bundle_resource_name]
        return expected_bundle_wildcard_val == bundle_resources

    wait_for_condition(check_bundle_leaks)
def test_pg_actor_workloads(ray_start_regular_with_external_redis):
    from ray.util.placement_group import placement_group

    bundle1 = {"CPU": 1}
    pg = placement_group([bundle1], strategy="STRICT_PACK")
    ray.get(pg.ready())

    @ray.remote
    class Counter:
        def r(self, v):
            return v

        def pid(self):
            import os
            return os.getpid()

    c = Counter.options(placement_group=pg).remote()
    r = ray.get(c.r.remote(10))
    assert r == 10

    print("GCS is killed")
    pid = ray.get(c.pid.remote())
    ray.worker._global_node.kill_gcs_server()
    assert ray.get(c.r.remote(10)) == 10

    ray.worker._global_node.start_gcs_server()

    for _ in range(100):
        assert pid == ray.get(c.pid.remote())
def spread_to_all_nodes(f: RemoteFunction):
    nodes = ray.state.nodes()
    resources = [{'CPU': f._num_cpus} for _ in range(len(nodes))]
    pg = placement_group(resources, strategy="STRICT_SPREAD")
    ray.get(pg.ready())
    yield len(nodes), pg
    remove_placement_group(pg)
def test_fractional_resources_handle_correct(ray_start_cluster):
    cluster = ray_start_cluster
    cluster.add_node(num_cpus=1000)
    ray.init(address=cluster.address)

    bundles = [{"CPU": 0.01} for _ in range(5)]
    pg = placement_group(bundles, strategy="SPREAD")

    ray.get(pg.ready(), timeout=10)
def test_placement_group_local_resource_view(monkeypatch, ray_start_cluster):
    """Please refer to https://github.com/ray-project/ray/pull/19911
    for more details.
    """
    with monkeypatch.context() as m:
        # Increase the broadcasting interval so that node resources arrive
        # at the raylet only after the local resources have all been
        # allocated.
        m.setenv("RAY_raylet_report_resources_period_milliseconds", "2000")
        m.setenv("RAY_grpc_based_resource_broadcast", "true")
        cluster = ray_start_cluster

        cluster.add_node(num_cpus=16, object_store_memory=1e9)
        cluster.wait_for_nodes()
        # We need to init here so that we can make sure it's connecting to
        # the raylet that only has CPU resources.
        # This is a hacky way to prevent scheduling from hanging: it would
        # schedule the <CPU:1> job to the node with the GPU, and for the
        # <GPU:1, CPU:1> task there would be no node with that resource.
        ray.init(address="auto")
        cluster.add_node(num_cpus=16, num_gpus=1)
        cluster.wait_for_nodes()
        NUM_CPU_BUNDLES = 30

        @ray.remote(num_cpus=1)
        class Worker(object):
            def __init__(self, i):
                self.i = i

            def work(self):
                time.sleep(0.1)
                print("work ", self.i)

        @ray.remote(num_cpus=1, num_gpus=1)
        class Trainer(object):
            def __init__(self, i):
                self.i = i

            def train(self):
                time.sleep(0.2)
                print("train ", self.i)

        bundles = [{"CPU": 1, "GPU": 1}]
        bundles += [{"CPU": 1} for _ in range(NUM_CPU_BUNDLES)]
        pg = placement_group(bundles, strategy="PACK")
        ray.get(pg.ready())

        # Local resources will be allocated; here we ensure the local view
        # is consistent and node resource updates are discarded.
        workers = [
            Worker.options(placement_group=pg).remote(i)
            for i in range(NUM_CPU_BUNDLES)
        ]
        trainer = Trainer.options(placement_group=pg).remote(0)
        ray.get([workers[i].work.remote() for i in range(NUM_CPU_BUNDLES)])
        ray.get(trainer.train.remote())
def _create_placement_group(num_cpus_per_actor, num_actors):
    cpu_bundle = {"CPU": num_cpus_per_actor}
    bundles = [cpu_bundle for _ in range(num_actors)]
    pg = placement_group(bundles, strategy="SPREAD")
    ready, _ = ray.wait([pg.ready()], timeout=100)
    # ``ray.wait`` returns an empty list (not None) on timeout.
    if not ready:
        raise TimeoutError("Placement group creation timeout.")
    return pg
def main():
    """Run long-running placement group creation/removal tests.

    This test runs 20 trials first and measures the P50 performance.
    After that, it runs trials for a long time and makes sure the P50
    creation/scheduling/removal performance has not regressed after the
    long-running job.
    """
    args, _ = parse_script_args()
    NUM_PG_AT_EACH_STAGE = args.num_pgs_stage
    NUM_PENDING_PG = args.num_pending_pgs
    TOTAL_STAGE = args.num_stages

    if args.local:
        ray.init(resources={"custom": 100, "pending": 1})
    else:
        ray.init(address="auto")

    assert ray.cluster_resources()["custom"] >= NUM_PG_AT_EACH_STAGE * 4
    assert ray.cluster_resources()["pending"] >= 1

    # Create pending placement groups.
    pending_pgs = []
    for _ in range(NUM_PENDING_PG):
        # Right now, we don't have infeasible pgs,
        # so this will simulate the pending pgs.
        pending_pgs.append(placement_group([{"pending": 1}], strategy="PACK"))

    (scheduling_perf, removing_perf,
     creation_perf) = run_trial(20, NUM_PG_AT_EACH_STAGE)
    (scheduling_perf_final, removing_perf_final,
     creation_perf_final) = run_trial(TOTAL_STAGE, NUM_PG_AT_EACH_STAGE)

    print(f"Scheduling performance 20 trials: {scheduling_perf}")
    print(
        f"Scheduling performance {TOTAL_STAGE} trials: {scheduling_perf_final}"
    )
    print(f"Removal performance 20 trials: {removing_perf}")
    print(f"Removal performance {TOTAL_STAGE} trials: {removing_perf_final}")
    print(f"Creation performance 20 trials: {creation_perf}")
    print(f"Creation performance {TOTAL_STAGE} trials: {creation_perf_final}")

    assert scheduling_perf["p50_ms"] * 100 > scheduling_perf_final["p50_ms"]
    assert removing_perf["p50_ms"] * 100 > removing_perf_final["p50_ms"]
    assert creation_perf["p50_ms"] * 100 > creation_perf_final["p50_ms"]

    if "TEST_OUTPUT_JSON" in os.environ:
        out_file = open(os.environ["TEST_OUTPUT_JSON"], "w")
        results = {}
        json.dump(results, out_file)
def test_warning_for_infeasible_tasks(ray_start_regular, error_pubsub):
    p = error_pubsub
    # Check that we get warning messages for infeasible tasks.

    @ray.remote(num_gpus=1)
    def f():
        pass

    @ray.remote(resources={"Custom": 1})
    class Foo:
        pass

    # This task is infeasible.
    f.remote()
    errors = get_error_message(p, 1, ray_constants.INFEASIBLE_TASK_ERROR)
    assert len(errors) == 1
    assert errors[0].type == ray_constants.INFEASIBLE_TASK_ERROR

    # This actor placement task is infeasible.
    foo = Foo.remote()
    print(foo)
    errors = get_error_message(p, 1, ray_constants.INFEASIBLE_TASK_ERROR)
    assert len(errors) == 1
    assert errors[0].type == ray_constants.INFEASIBLE_TASK_ERROR

    # Placement group cannot be made, but no warnings should occur.
    total_cpus = ray.cluster_resources()["CPU"]

    # Occupy one CPU with an actor.
    @ray.remote(num_cpus=1)
    class A:
        pass

    a = A.remote()
    print(a)

    @ray.remote(num_cpus=total_cpus)
    def g():
        pass

    pg = placement_group([{"CPU": total_cpus}], strategy="STRICT_PACK")
    g.options(placement_group=pg).remote()

    errors = get_error_message(
        p, 1, ray_constants.INFEASIBLE_TASK_ERROR, timeout=5)
    assert len(errors) == 0, errors
def test_many_placement_groups():
    @ray.remote(num_cpus=1, resources={"node": 0.02})
    def f1():
        sleep(10)
        pass

    @ray.remote(num_cpus=1)
    def f2():
        sleep(10)
        pass

    @ray.remote(resources={"node": 0.02})
    def f3():
        sleep(10)
        pass

    bundle1 = {"node": 0.02, "CPU": 1}
    bundle2 = {"CPU": 1}
    bundle3 = {"node": 0.02}

    pgs = []
    for _ in trange(MAX_PLACEMENT_GROUPS, desc="Creating pgs"):
        pg = placement_group(bundles=[bundle1, bundle2, bundle3])
        pgs.append(pg)

    for pg in tqdm(pgs, desc="Waiting for pgs to be ready"):
        ray.get(pg.ready())

    refs = []
    for pg in tqdm(pgs, desc="Scheduling tasks"):
        ref1 = f1.options(placement_group=pg).remote()
        ref2 = f2.options(placement_group=pg).remote()
        ref3 = f3.options(placement_group=pg).remote()
        refs.extend([ref1, ref2, ref3])

    for _ in trange(10, desc="Waiting"):
        sleep(1)

    with tqdm() as p_bar:
        while refs:
            done, refs = ray.wait(refs)
            p_bar.update()

    for pg in tqdm(pgs, desc="Cleaning up pgs"):
        remove_placement_group(pg)
def test_many_placement_groups():
    # @ray.remote(num_cpus=1, resources={"node": 0.02})
    @ray.remote
    class C1:
        def ping(self):
            return "pong"

    # @ray.remote(num_cpus=1)
    @ray.remote
    class C2:
        def ping(self):
            return "pong"

    # @ray.remote(resources={"node": 0.02})
    @ray.remote
    class C3:
        def ping(self):
            return "pong"

    bundle1 = {"node": 0.02, "CPU": 1}
    bundle2 = {"CPU": 1}
    bundle3 = {"node": 0.02}

    pgs = []
    for _ in tqdm.trange(MAX_PLACEMENT_GROUPS, desc="Creating pgs"):
        pg = placement_group(bundles=[bundle1, bundle2, bundle3])
        pgs.append(pg)

    for pg in tqdm.tqdm(pgs, desc="Waiting for pgs to be ready"):
        ray.get(pg.ready())

    actors = []
    for pg in tqdm.tqdm(pgs, desc="Scheduling tasks"):
        actors.append(C1.options(placement_group=pg).remote())
        actors.append(C2.options(placement_group=pg).remote())
        actors.append(C3.options(placement_group=pg).remote())

    not_ready = [actor.ping.remote() for actor in actors]
    for _ in tqdm.trange(len(actors)):
        ready, not_ready = ray.wait(not_ready)
        assert ray.get(*ready) == "pong"

    for pg in tqdm.tqdm(pgs, desc="Cleaning up pgs"):
        remove_placement_group(pg)
def test_schedule_placement_groups_at_the_same_time():
    ray.init(num_cpus=4)

    pgs = [placement_group([{"CPU": 2}]) for _ in range(6)]

    wait_pgs = {pg.ready(): pg for pg in pgs}

    def is_all_placement_group_removed():
        ready, _ = ray.wait(list(wait_pgs.keys()), timeout=0.5)
        if ready:
            ready_pg = wait_pgs[ready[0]]
            remove_placement_group(ready_pg)
            del wait_pgs[ready[0]]

        if len(wait_pgs) == 0:
            return True
        return False

    wait_for_condition(is_all_placement_group_removed)
def test_placement_group_gpu_unique_assigned(ray_start_cluster,
                                             connect_to_client):
    cluster = ray_start_cluster
    cluster.add_node(num_gpus=4, num_cpus=4)
    ray.init(address=cluster.address)
    gpu_ids_res = set()

    # Create a placement group with 4 bundles using 1 GPU each.
    num_gpus = 4
    bundles = [{"GPU": 1, "CPU": 1} for _ in range(num_gpus)]
    pg = placement_group(bundles)
    ray.get(pg.ready())

    # Actor using 1 GPU that has a method to get
    # $CUDA_VISIBLE_DEVICES env variable.
    @ray.remote(num_gpus=1, num_cpus=1)
    class Actor:
        def get_gpu(self):
            import os
            return os.environ["CUDA_VISIBLE_DEVICES"]

    # Create actors out of order.
    actors = []
    actors.append(
        Actor.options(
            placement_group=pg, placement_group_bundle_index=0).remote())
    actors.append(
        Actor.options(
            placement_group=pg, placement_group_bundle_index=3).remote())
    actors.append(
        Actor.options(
            placement_group=pg, placement_group_bundle_index=2).remote())
    actors.append(
        Actor.options(
            placement_group=pg, placement_group_bundle_index=1).remote())

    for actor in actors:
        gpu_ids = ray.get(actor.get_gpu.remote())
        assert len(gpu_ids) == 1
        gpu_ids_res.add(gpu_ids)

    assert len(gpu_ids_res) == 4
def test_infeasible_pg(ray_start_cluster):
    """Test infeasible pgs are scheduled after new nodes are added."""
    cluster = ray_start_cluster
    cluster.add_node(num_cpus=2)
    ray.init("auto")

    bundle = {"CPU": 4, "GPU": 1}
    pg = placement_group([bundle], name="worker_1", strategy="STRICT_PACK")

    # Placement group is infeasible.
    with pytest.raises(GetTimeoutError):
        ray.get(pg.ready(), timeout=3)

    state = ray.util.placement_group_table()[
        pg.id.hex()]["stats"]["scheduling_state"]
    assert state == "INFEASIBLE"

    # Add a new node. PG can now be scheduled.
    cluster.add_node(num_cpus=4, num_gpus=1)
    assert ray.get(pg.ready(), timeout=10)
def pg_launcher(pre_created_pgs, num_pgs_to_create):
    pgs = []
    pgs += pre_created_pgs
    for i in range(num_pgs_to_create):
        pgs.append(placement_group(BUNDLES, strategy="STRICT_SPREAD"))

    pgs_removed = []
    pgs_unremoved = []
    # Randomly choose placement groups to remove.
    for pg in pgs:
        if random() < .5:
            pgs_removed.append(pg)
        else:
            pgs_unremoved.append(pg)

    tasks = []
    max_actor_cnt = 5
    actor_cnt = 0
    actors = []
    # Randomly schedule tasks or actors on placement groups that
    # are not removed.
    for pg in pgs_unremoved:
        # TODO(sang): Comment in this line causes GCS actor management
        # failure. We need to fix it.
        if random() < .5:
            tasks.append(mock_task.options(placement_group=pg).remote())
        else:
            if actor_cnt < max_actor_cnt:
                actors.append(MockActor.options(placement_group=pg).remote())
                actor_cnt += 1

    # Remove the rest of placement groups.
    for pg in pgs_removed:
        remove_placement_group(pg)

    ray.get([pg.ready() for pg in pgs_unremoved])
    ray.get(tasks)
    ray.get([actor.ping.remote() for actor in actors])

    # Since placement groups are scheduled, remove them.
    for pg in pgs_unremoved:
        remove_placement_group(pg)
def test_chaos_defer(monkeypatch, ray_start_cluster):
    with monkeypatch.context() as m:
        m.setenv("RAY_grpc_based_resource_broadcast", "true")
        # defer for 3s
        m.setenv(
            "RAY_testing_asio_delay_us",
            "NodeManagerService.grpc_client.PrepareBundleResources=2000000:2000000",
        )
        m.setenv("RAY_event_stats", "true")
        cluster = ray_start_cluster
        cluster.add_node(num_cpus=1, object_store_memory=1e9)
        cluster.wait_for_nodes()
        ray.init(address="auto")
        # this will connect to gpu nodes
        cluster.add_node(num_cpus=0, num_gpus=1)
        bundle = [{"GPU": 1}, {"CPU": 1}]
        pg = placement_group(bundle)
        # PG will not be ready within 3s
        with pytest.raises(ray.exceptions.GetTimeoutError):
            ray.get(pg.ready(), timeout=1)
        # it'll be ready eventually
        ray.get(pg.ready())
def __call__(self, *args, **kwargs):
    kwargs.update(self._bound.kwargs)
    # Call with bound *args and **kwargs.
    return placement_group(*self._bound.args, **kwargs)
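# The method above looks like it belongs to a factory-style wrapper around
# placement_group. Below is a hedged sketch of what such a wrapper could look
# like; the class name `PlacementGroupFactory`, the `__init__`, and the use of
# `inspect.signature(...).bind(...)` to populate `_bound` are assumptions, not
# taken from the snippet itself.
import inspect

from ray.util.placement_group import placement_group


class PlacementGroupFactory:
    def __init__(self, *args, **kwargs):
        # Remember the arguments the caller supplied for placement_group
        # so the call can be replayed later, possibly with keyword overrides.
        self._bound = inspect.signature(placement_group).bind(*args, **kwargs)

    def __call__(self, *args, **kwargs):
        kwargs.update(self._bound.kwargs)
        # Call with bound *args and **kwargs.
        return placement_group(*self._bound.args, **kwargs)


# Example usage (requires ray.init() to have been called): build the factory
# once, create the placement group only when it is actually needed.
factory = PlacementGroupFactory([{"CPU": 1}], strategy="SPREAD")
pg = factory()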
        value = torch.randn(batch_size)
        input = torch.randn(batch_size)
        return {
            "input": input,
            "value": value,
            "actor_id": self.id,
        }


tot_gpus = 1
tot_cpus = 8

pg_cnt = min(int(tot_gpus / bundle1['GPU']), int(tot_cpus / bundle1['CPU']))
pgs = [
    placement_group([bundle1], strategy="STRICT_PACK") for _ in range(pg_cnt)
]

# Wait until the placement groups are created.
ray.get([pg.ready() for pg in pgs])

# You can look at placement group states using this API.
for pg in pgs:
    print(placement_group_table(pg))

actors = [
    AsyncActor.options(placement_group=pg).remote(i)
    for i, pg in enumerate(pgs)
]

# Get a list of the IP addresses of the nodes that have joined the cluster.
import ray
from ray.util.placement_group import (
    placement_group,
)
from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy

# Two "CPU"s are available.
ray.init(num_cpus=2)

# Create a placement group.
pg = placement_group([{"CPU": 2}])
ray.get(pg.ready())


# Now, 2 CPUs are not available anymore because
# they are pre-reserved by the placement group.
@ray.remote(num_cpus=2)
def f():
    return True


# Won't be scheduled because there are no 2 cpus.
f.remote()

# Will be scheduled because 2 cpus are reserved by the placement group.
f.options(scheduling_strategy=PlacementGroupSchedulingStrategy(
    placement_group=pg)).remote()
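# Follow-up sketch (not part of the snippet above): a placement group keeps
# its reserved resources until it is removed. Removing it with the same
# `remove_placement_group` API used by the other snippets in this collection
# hands the 2 CPUs back to the cluster, so a plain task can be scheduled
# again without a placement-group scheduling strategy.
from ray.util.placement_group import remove_placement_group

remove_placement_group(pg)
# Removal is asynchronous; once it completes, the previously pending plain
# `f.remote()` call (and any new one) can run on the freed CPUs.
assert ray.get(f.remote())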
time.sleep(5)
logger.info("Nodes have all joined. There are %s resources.",
            ray.cluster_resources())


def hey(_):
    time.sleep(0.01)  # Sleep for 10ms
    return b"hey"


num_connections = int(num_remote_cpus * 0.75)
num_threads = 2
time_to_run = "10s"

pg = placement_group(
    [{
        "CPU": 1
    } for _ in range(expected_num_nodes)],
    strategy="STRICT_SPREAD")
ray.get(pg.ready())

# The number of replicas is the number of cores remaining after accounting
# for the one HTTP proxy actor on each node, the "hey" requester task on each
# node, and the serve controller.
# num_replicas = expected_num_nodes * (cpus_per_node - 2) - 1
num_replicas = ray.available_resources()["CPU"]
logger.info("Starting %i replicas", num_replicas)
client.create_backend(
    "hey", hey, config=BackendConfig(num_replicas=num_replicas))
client.create_endpoint("hey", backend="hey", route="/hey")


@ray.remote
# In[ ]:


@ray.remote
def remote_fun(x):
    return x


# In[ ]:


#tag::placement_group[]
# Create a placement group.
cpu_bundle = {"CPU": 3}
mini_cpu_bundle = {"CPU": 1}
pg = placement_group([cpu_bundle, mini_cpu_bundle])
ray.get(pg.ready())
print(placement_group_table(pg))
print(ray.available_resources())
# Run remote_fun in cpu_bundle
handle = remote_fun.options(
    placement_group=pg, placement_group_bundle_index=0).remote(1)
#end::placement_group[]


# In[ ]:


#tag::runtime_env_local[]
@ray.remote(runtime_env=runtime_env)
def sup(x):
    from bs4 import BeautifulSoup
def run_trial(total_stage, num_pg_per_stage):
    creating_e2e_s = []
    removing_e2e_s = []
    # Create and remove placement groups.
    for i in range(total_stage):
        # Create pgs.
        pgs = []
        start = perf_counter()
        for _ in range(num_pg_per_stage):
            pgs.append(
                placement_group(
                    bundles=[{
                        "custom": 0.025
                    } for _ in range(4)],
                    strategy="PACK"))
        logger.info(f"Created {num_pg_per_stage} pgs.")
        ray.get([pg.ready() for pg in pgs])
        end = perf_counter()
        total_creating_time = end - start
        logger.info(f"Creating {num_pg_per_stage} pgs took "
                    f"{total_creating_time} seconds at stage {i}")
        creating_e2e_s.append(total_creating_time * 1000.0)

        # Remove pgs.
        start = perf_counter()
        for _, pg in enumerate(pgs):
            remove_placement_group(pg)
        end = perf_counter()
        total_removal_time = end - start
        logger.info(f"Removing {num_pg_per_stage} pgs took "
                    f"{total_removal_time} seconds at stage {i}")
        removing_e2e_s.append(total_removal_time * 1000.0)
        # time.sleep(1)

    # Calculate the scheduling latency (excluding queueing time).
    latencies = []
    for entry in ray.util.placement_group_table().values():
        latency = entry["stats"]["scheduling_latency_ms"]
        latencies.append(latency)
    latencies = sorted(latencies)
    removing_e2e_s = sorted(removing_e2e_s)
    creating_e2e_s = sorted(creating_e2e_s)

    def get_scheduling_perf(latencies):
        """Return P10, 50, 95, 99 latency"""
        p10 = latencies[int(len(latencies) * 0.1)]
        p50 = latencies[int(len(latencies) * 0.5)]
        p95 = latencies[int(len(latencies) * 0.95)]
        p99 = latencies[int(len(latencies) * 0.99)]
        return {"p10_ms": p10, "p50_ms": p50, "p95_ms": p95, "p99_ms": p99}

    scheduling_perf = get_scheduling_perf(latencies)
    removing_perf = get_scheduling_perf(removing_e2e_s)
    creation_perf = get_scheduling_perf(creating_e2e_s)

    wait_for_condition(
        lambda: (ray.cluster_resources()["custom"] ==
                 ray.available_resources()["custom"]),
        timeout=30,
    )
    wait_for_condition(
        lambda: (ray.cluster_resources()["pending"] ==
                 ray.available_resources()["pending"]),
        timeout=30,
    )
    return scheduling_perf, removing_perf, creation_perf
def verify_load_metrics(monitor, expected_resource_usage=None, timeout=30):
    request_resources(num_cpus=42)

    # Add placement groups.
    pg_demands = [{"GPU": 2}, {"extra_resource": 2}]
    strategy = "STRICT_PACK"
    pg = placement_group(pg_demands, strategy=strategy)
    pg.ready()
    time.sleep(2)  # Wait for placement groups to propagate.

    # Disable event clearing for test.
    monitor.event_summarizer.clear = lambda *a: None

    visited_atleast_once = [set(), set()]
    while True:
        monitor.update_load_metrics()
        monitor.update_resource_requests()
        monitor.update_event_summary()
        resource_usage = monitor.load_metrics._get_resource_usage()

        # Check resource request propagation.
        req = monitor.load_metrics.resource_requests
        assert req == [{"CPU": 1}] * 42, req

        pg_response_data = monitor.load_metrics.pending_placement_groups
        assert_correct_pg(pg_response_data, pg_demands, strategy)

        if "memory" in resource_usage[0]:
            del resource_usage[0]["memory"]
            visited_atleast_once[0].add("memory")
        if "object_store_memory" in resource_usage[0]:
            del resource_usage[0]["object_store_memory"]
            visited_atleast_once[0].add("object_store_memory")
        if "memory" in resource_usage[1]:
            del resource_usage[1]["memory"]
            visited_atleast_once[1].add("memory")
        if "object_store_memory" in resource_usage[1]:
            del resource_usage[1]["object_store_memory"]
            visited_atleast_once[1].add("object_store_memory")
        for key in list(resource_usage[0].keys()):
            if key.startswith("node:"):
                del resource_usage[0][key]
                visited_atleast_once[0].add("node:")
        for key in list(resource_usage[1].keys()):
            if key.startswith("node:"):
                del resource_usage[1][key]
                visited_atleast_once[1].add("node:")

        if expected_resource_usage is None:
            if all(x for x in resource_usage[0:]):
                break
        elif all(x == y
                 for x, y in zip(resource_usage, expected_resource_usage)):
            break
        else:
            timeout -= 1
            time.sleep(1)

        if timeout <= 0:
            raise ValueError("Timeout. {} != {}".format(
                resource_usage, expected_resource_usage))

    # Sanity check we emitted a resize event.
    assert any("Resized to" in x for x in monitor.event_summarizer.summary())

    assert visited_atleast_once[0] == {
        "memory", "object_store_memory", "node:"
    }
    assert visited_atleast_once[0] == visited_atleast_once[1]

    remove_placement_group(pg)

    return resource_usage
            ray.cluster_resources())

# Scenario 1: Create bunch of placement groups and measure how long
# it takes.
total_creating_time = 0
total_removing_time = 0
repeat = 1
total_trial = repeat * NUM_PG
BUNDLES = [{"pg_custom": 1}] * NUM_NODES

# Create and remove placement groups.
for _ in range(repeat):
    pgs = []
    for i in range(NUM_PG):
        start = perf_counter()
        pgs.append(placement_group(BUNDLES, strategy="PACK"))
        end = perf_counter()
        logger.info(f"append_group iteration {i}")
        total_creating_time += (end - start)

    ray.get([pg.ready() for pg in pgs])

    for i, pg in enumerate(pgs):
        start = perf_counter()
        remove_placement_group(pg)
        end = perf_counter()
        logger.info(f"remove_group iteration {i}")
        total_removing_time += (end - start)

# Validate the correctness.
assert ray.cluster_resources()[
import time

import ray
from ray.util.placement_group import (placement_group, placement_group_table,
                                      remove_placement_group)

if __name__ == "__main__":
    ray.init(num_cpus=2, resources={"extra_resources": 2})

    bundle_1 = {"CPU": 2}
    bundle_2 = {"extra_resources": 2}

    pg = placement_group([bundle_1, bundle_2], strategy="STRICT_PACK")

    # You can also use ray.wait.
    ready, unready = ray.wait([pg.ready()], timeout=5)
    print(f"placement group status:{ready}")
    print(placement_group_table(pg))

    time.sleep(10)
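    # A minimal, illustrative follow-up (not part of the original script):
    # the placement group holds its reserved CPU and "extra_resources" until
    # it is removed, so the script can hand them back explicitly using the
    # `remove_placement_group` API already imported above.
    remove_placement_group(pg)
    # Removal is asynchronous; shortly afterwards the reserved resources show
    # up again in ray.available_resources().
    time.sleep(1)
    print(ray.available_resources())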