import pytest

import ray
from ray import tune
from ray.data.context import DatasetContext
from ray.tune.error import TuneError
from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy


def test_warn_cpu():
    # Trainable that launches a nested 1-CPU task.
    def f(*a):
        @ray.remote(num_cpus=1)
        def f():
            pass

        ray.get(f.remote())

    with pytest.raises(TuneError):
        tune.run(f, verbose=0)

    with pytest.raises(TuneError):
        tune.run(
            f,
            resources_per_trial=tune.PlacementGroupFactory([{"CPU": 1}]),
            verbose=0,
        )

    # Trainable that launches a nested 1-CPU actor.
    def g(*a):
        @ray.remote(num_cpus=1)
        class Actor:
            def f(self):
                pass

        a = Actor.remote()
        ray.get(a.f.remote())

    with pytest.raises(TuneError):
        tune.run(g, verbose=0)

    with pytest.raises(TuneError):
        tune.run(
            g,
            resources_per_trial=tune.PlacementGroupFactory([{"CPU": 1}]),
            verbose=0,
        )

def test_bad_pg_slots():
    # The nested task asks for 2 CPUs, but each bundle in the placement group
    # provides only 1, so the task can never fit into a single bundle.
    def f(*a):
        @ray.remote(num_cpus=2)
        def f():
            pass

        ray.get(f.remote())

    with pytest.raises(TuneError):
        tune.run(
            f,
            resources_per_trial=tune.PlacementGroupFactory([{"CPU": 1}] * 2),
            verbose=0,
        )

def test_pg_slots_ok():
    def f(*a):
        @ray.remote(num_cpus=1)
        def f():
            pass

        @ray.remote(num_cpus=1)
        class Actor:
            def f(self):
                pass

        ray.get(f.remote())
        a = Actor.remote()
        ray.get(a.f.remote())

    tune.run(
        f,
        resources_per_trial=tune.PlacementGroupFactory([{"CPU": 1}] * 2),
        verbose=0,
    )

def test_dataset_ok():
    # Dataset tasks escape the trial's placement group by default, so this
    # runs even though the trial itself reserves only 1 CPU.
    def f(*a):
        ray.data.range(10).show()

    tune.run(f, verbose=0)

    # Forcing Dataset tasks into the trial's placement group means they need
    # their own bundles: with a single 1-CPU bundle nothing is left for them
    # and Tune raises; with a second bundle the run succeeds.
    def g(*a):
        ctx = DatasetContext.get_current()
        ctx.scheduling_strategy = PlacementGroupSchedulingStrategy(
            ray.util.get_current_placement_group()
        )
        ray.data.range(10).show()

    with pytest.raises(TuneError):
        tune.run(g, verbose=0)

    tune.run(
        g,
        resources_per_trial=tune.PlacementGroupFactory([{"CPU": 1}] * 2),
        verbose=0,
    )

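# A minimal sketch for invoking the checks above directly, mirroring the
# harness used across Ray's test modules (assumes tune.run auto-initializes
# a local Ray cluster; not part of the original excerpt):
if __name__ == "__main__":
    import sys

    sys.exit(pytest.main(["-v", __file__]))
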
from collections import Counter
from copy import deepcopy
from typing import List

from ray import tune


def allocation_to_pgf(alloc: List[str], resources_per_node=None):
    """Convert an AdaptDL allocation to a Tune PlacementGroupFactory."""
    if not resources_per_node:
        resources_per_node = {"CPU": 1.0}
        # `config` is the AdaptDL configuration module from the surrounding
        # integration package.
        if config.default_device() == "GPU":
            resources_per_node["GPU"] = 1.0

    def _construct_bundle(node, number_of_instances):
        resources = deepcopy(resources_per_node)
        resources["CPU"] *= number_of_instances
        if "GPU" in resources:
            resources["GPU"] *= number_of_instances
        # Pin the bundle to the physical node unless the node is virtual.
        if "adaptdl_virtual" not in node:
            resources[f"node:{node}"] = 0.01
        return resources

    assert len(alloc) > 0
    # The first bundle is reserved for the trainable itself; the tiny CPU
    # value keeps it from consuming a full core.
    resources = [{"CPU": 0.001}]
    alloc = Counter(alloc)
    for node, res in alloc.items():
        resources.append(_construct_bundle(node, res))
    return tune.PlacementGroupFactory(resources)

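# A usage sketch (hypothetical node names, not from a real cluster):
#
#   allocation_to_pgf(["node-a", "node-a", "node-b"])
#
# yields a factory over bundles like
#
#   [{"CPU": 0.001},
#    {"CPU": 2.0, "node:node-a": 0.01},
#    {"CPU": 1.0, "node:node-b": 0.01}]
#
# i.e. one 2-worker bundle pinned to node-a and one 1-worker bundle pinned
# to node-b, plus the token head bundle for the trainable.
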
if not MOCK:
    # __resources_start__
    tune.run(
        train_fn,
        resources_per_trial={"cpu": 2, "gpu": 0.5, "custom_resources": {"hdd": 80}},
    )
    # __resources_end__

    # __resources_pgf_start__
    tune.run(
        train_fn,
        resources_per_trial=tune.PlacementGroupFactory(
            [
                {"CPU": 2, "GPU": 0.5, "hdd": 80},
                {"CPU": 1},
                {"CPU": 1},
            ],
            strategy="PACK",
        ),
    )
    # __resources_pgf_end__

metric = None


# __modin_start__
def train_fn(config, checkpoint_dir=None):
    # some Modin operations here
    # import modin.pandas as pd
    tune.report(metric=metric)


tune.run(
# Create a cluster with 4 CPU slots available.
ray.init(num_cpus=4)

# This will error, since Tune has no resources reserved for Dataset tasks.
try:
    tune.run(objective)
except TuneError:
    print("This failed as expected")

# This runs fine, since there are 4 CPUs in the trial's placement group. The
# first CPU slot runs the objective function, leaving 3 for Dataset tasks.
tune.run(
    objective,
    resources_per_trial=tune.PlacementGroupFactory([{"CPU": 1}] * 4),
)
# __resource_allocation_2_end__
# fmt: on

# fmt: off
# __block_move_begin__
import ray
from ray.data.context import DatasetContext

ctx = DatasetContext.get_current()
ctx.optimize_fuse_stages = False


def map_udf(df):
# This Dataset workload will use reserved cluster resources for execution.
def objective(*args):
    ray.data.range(10).show()


# Create a cluster with 4 CPU slots available.
ray.init(num_cpus=4)

# This runs smoothly since _max_cpu_fraction_per_node is set to 0.8,
# effectively reserving 1 CPU for Datasets task execution.
tune.run(
    objective,
    num_samples=4,
    resources_per_trial=tune.PlacementGroupFactory(
        [{"CPU": 1}],
        _max_cpu_fraction_per_node=0.8,
    ),
)
# __resource_allocation_2_end__
# fmt: on
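# Worked numbers for the cap above (assuming only the 4-CPU node created by
# ray.init): placement groups may reserve at most 4 * 0.8 = 3.2 CPUs, so at
# most three of the four 1-CPU trial bundles can be scheduled at once, and
# at least one CPU always stays free for Dataset tasks.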