def testAllocateFreeResourcesWithIncreaseByTimes(self):
    """Check GPU-bundle growth with ``increase_by_times=2`` as trials terminate.

    Uses ``DistributeResources(add_bundles=True, increase_by={"GPU": 2},
    increase_by_times=2)`` on a base of one CPU bundle plus one two-GPU
    bundle. After each trial is marked ``TERMINATED`` the expected placement
    group is handed to ``self._allocateAndAssertNewResources``.
    """
    allocation_fn = DistributeResources(
        add_bundles=True, increase_by={"GPU": 2}, increase_by_times=2
    )
    scheduler = ResourceChangingScheduler(resources_allocation_function=allocation_fn)

    def expected_pgf(gpu_bundles):
        # One CPU bundle followed by ``gpu_bundles`` two-GPU bundles.
        return PlacementGroupFactory([{"CPU": 1}] + [{"GPU": 2}] * gpu_bundles)

    trial1, trial2, trial3, trial4 = self._prepareTrials(
        scheduler, PlacementGroupFactory([{"CPU": 1}, {"GPU": 2}])
    )

    outcome = scheduler.on_trial_result(
        self.trial_runner, trial1, {"metric": 1, "training_iteration": 4}
    )
    assert outcome == TrialScheduler.CONTINUE

    # (trial to terminate, trial to re-allocate, expected number of GPU bundles)
    steps = (
        (trial4, trial1, 2),
        (trial3, trial2, 2),
        (trial2, trial1, 3),
    )
    for finished, target, gpu_bundles in steps:
        finished.status = Trial.TERMINATED
        self._allocateAndAssertNewResources(
            target, scheduler, expected_pgf(gpu_bundles)
        )
def testAllocateFreeResourcesWithIncreaseBy(self):
    """Check single-bundle growth under ``DistributeResourcesToTopJob``.

    The allocation function is configured with ``add_bundles=False`` and
    ``increase_by={"CPU": 2, "GPU": 2}``. Trials report metrics, are marked
    ``TERMINATED`` one at a time, and the expected placement group is handed
    to ``self._allocateAndAssertNewResources`` after each termination.
    """
    top_job_fn = DistributeResourcesToTopJob(
        add_bundles=False,
        increase_by={"CPU": 2, "GPU": 2},
        metric="metric",
        mode="max",
    )
    scheduler = ResourceChangingScheduler(resources_allocation_function=top_job_fn)
    trial1, trial2, trial3, trial4 = self._prepareTrials(
        scheduler, PlacementGroupFactory([{"CPU": 2, "GPU": 2}])
    )

    def report(trial, value):
        # Feed one result to the scheduler and require it to keep the trial going.
        outcome = scheduler.on_trial_result(
            self.trial_runner, trial, {"metric": value, "training_iteration": 4}
        )
        assert outcome == TrialScheduler.CONTINUE

    def single_bundle(amount):
        # Expected factory: one bundle with equal CPU and GPU counts.
        return PlacementGroupFactory([{"CPU": amount, "GPU": amount}])

    report(trial2, 0.9)
    report(trial1, 1.0)

    trial4.status = Trial.TERMINATED
    self._allocateAndAssertNewResources(trial1, scheduler, single_bundle(4))

    report(trial2, 1.1)

    trial3.status = Trial.TERMINATED
    self._allocateAndAssertNewResources(
        trial2, scheduler, single_bundle(4), metric=1.1
    )

    trial2.status = Trial.TERMINATED
    self._allocateAndAssertNewResources(
        trial1, scheduler, single_bundle(8), metric=1.2
    )
def test_data_parallel_trainer(ray_start_8_cpus):
    """Tune an ``AssertingDataParallelTrainer`` under a resource-changing scheduler.

    Runs a five-value grid search over ``metric`` and asserts that no result
    carries an error and that the largest reported ``num_cpus`` exceeds
    ``num_workers + 1``.
    """
    num_workers = 2
    trainer = AssertingDataParallelTrainer(
        train_fn, scaling_config=ScalingConfig(num_workers=num_workers)
    )

    search_space = {
        "train_loop_config": {
            "num_epochs": 100,
            "metric": tune.grid_search([1, 2, 3, 4, 5]),
        }
    }
    resource_scheduler = ResourceChangingScheduler(
        ASHAScheduler(),
        resources_allocation_function=DistributeResources(
            add_bundles=True, reserve_resources={"CPU": 1}
        ),
    )
    tuning = TuneConfig(mode="max", metric="metric", scheduler=resource_scheduler)

    tuner = Tuner(
        trainer,
        param_space=search_space,
        tune_config=tuning,
        run_config=RunConfig(failure_config=FailureConfig(fail_fast=True)),
    )
    result_grid = tuner.fit()

    assert all(not result.error for result in result_grid)
    # + 1 for Trainable
    peak_cpus = result_grid.get_dataframe()["num_cpus"].max()
    assert peak_cpus > num_workers + 1
def testAllocateFreeResources(self):
    """Check CPU growth of a single bundle as other trials terminate.

    Uses ``DistributeResources(add_bundles=False)`` on a one-CPU base bundle.
    Each step optionally marks a trial ``TERMINATED`` and then hands the
    expected placement group to ``self._allocateAndAssertNewResources``.
    """
    scheduler = ResourceChangingScheduler(
        resources_allocation_function=DistributeResources(add_bundles=False)
    )
    trial1, trial2, trial3, trial4 = self._prepareTrials(
        scheduler, PlacementGroupFactory([{"CPU": 1, "GPU": 0}])
    )

    # (trial to terminate first or None, trial to re-allocate, expected CPUs)
    steps = (
        (None, trial1, 2),
        (None, trial2, 2),
        (trial4, trial1, 3),
        (trial3, trial1, 4),
        (trial2, trial1, 8),
    )
    for finished, target, cpus in steps:
        if finished is not None:
            finished.status = Trial.TERMINATED
        self._allocateAndAssertNewResources(
            target, scheduler, PlacementGroupFactory([{"CPU": cpus}])
        )
def _create_scheduler(self, scheduler_config, parameters):
    """Create the scheduler described by ``scheduler_config``.

    Args:
        scheduler_config: Mapping of scheduler options. ``type`` names the
            scheduler given to ``tune.create_scheduler``. The optional
            ``dynamic_resource_allocation`` flag (default ``False``) is
            consumed here and not forwarded. The mapping itself is left
            unmodified.
        parameters: Not used by this method.

    Returns:
        ``None`` when ``scheduler_config`` is empty or falsy; otherwise the
        created scheduler, wrapped in ``ResourceChangingScheduler`` when
        ``dynamic_resource_allocation`` is truthy.
    """
    if not scheduler_config:
        return None

    # Work on a shallow copy so the caller's config is not altered: popping the
    # flag in place would make a second call with the same config silently lose
    # dynamic allocation, and injecting the search space would leak it back
    # into the caller's settings.
    options = dict(scheduler_config)
    dynamic_resource_allocation = options.pop("dynamic_resource_allocation", False)

    scheduler_type = options.get("type")
    if scheduler_type == "pbt":
        options["hyperparam_mutations"] = self.search_space

    scheduler = tune.create_scheduler(scheduler_type, **options)
    if dynamic_resource_allocation:
        scheduler = ResourceChangingScheduler(
            scheduler, ray_resource_allocation_function
        )
    return scheduler
def testDeallocateResources(self):
    """Check that a trial holding extra GPUs is expected back at the base bundle.

    ``trial1`` is given a four-GPU placement group and ``trial4`` is set to
    ``PENDING``; ``self._allocateAndAssertNewResources`` is then called with
    the original two-GPU bundle as the expected result.
    """
    top_job_fn = DistributeResourcesToTopJob(
        add_bundles=False, increase_by={"GPU": 2}, metric="metric", mode="max"
    )
    scheduler = ResourceChangingScheduler(resources_allocation_function=top_job_fn)

    def one_cpu_with_gpus(gpus):
        # Single bundle holding one CPU and ``gpus`` GPUs.
        return PlacementGroupFactory([{"CPU": 1, "GPU": gpus}])

    trial1, _, _, trial4 = self._prepareTrials(scheduler, one_cpu_with_gpus(2))

    trial1.placement_group_factory = one_cpu_with_gpus(4)
    trial4.status = Trial.PENDING

    self._allocateAndAssertNewResources(trial1, scheduler, one_cpu_with_gpus(2))
def testDeallocateResources(self):
    """Check that a trial holding an extra GPU bundle is expected back at the base.

    ``trial1`` is given one CPU bundle plus two two-GPU bundles and ``trial4``
    is set to ``PENDING``; ``self._allocateAndAssertNewResources`` is then
    called with the original CPU-plus-one-GPU-bundle factory as the expected
    result.
    """
    allocation_fn = DistributeResources(add_bundles=True, increase_by={"GPU": 2})
    scheduler = ResourceChangingScheduler(resources_allocation_function=allocation_fn)

    trial1, _, _, trial4 = self._prepareTrials(
        scheduler, PlacementGroupFactory([{"CPU": 1}, {"GPU": 2}])
    )

    enlarged = PlacementGroupFactory([{"CPU": 1}] + [{"GPU": 2}] * 2)
    trial1.placement_group_factory = enlarged
    trial4.status = Trial.PENDING

    expected = PlacementGroupFactory([{"CPU": 1}, {"GPU": 2}])
    self._allocateAndAssertNewResources(trial1, scheduler, expected)
def test_gbdt_trainer(ray_start_8_cpus):
    """Tune an ``AssertingXGBoostTrainer`` under a resource-changing scheduler.

    Builds a 16-partition dataset from ``load_breast_cancer()``, runs a
    five-value grid search over ``eta``, and asserts that no result carries
    an error.
    """
    raw = load_breast_cancer()
    frame = pd.DataFrame(raw["data"], columns=raw["feature_names"])
    frame["target"] = raw["target"]
    train_ds = ray.data.from_pandas(frame).repartition(16)

    trainer = AssertingXGBoostTrainer(
        datasets={TRAIN_DATASET_KEY: train_ds},
        label_column="target",
        scaling_config=ScalingConfig(num_workers=2),
        params={
            "objective": "binary:logistic",
            "eval_metric": ["logloss"],
        },
    )

    search_space = {
        "num_boost_round": 100,
        "params": {
            "eta": tune.grid_search([0.28, 0.29, 0.3, 0.31, 0.32]),
        },
    }
    resource_scheduler = ResourceChangingScheduler(
        ASHAScheduler(),
        resources_allocation_function=DistributeResources(
            add_bundles=True, reserve_resources={"CPU": 1}
        ),
    )
    tuning = TuneConfig(
        mode="min", metric="train-logloss", scheduler=resource_scheduler
    )

    tuner = Tuner(
        trainer,
        param_space=search_space,
        tune_config=tuning,
        run_config=RunConfig(failure_config=FailureConfig(fail_fast=True)),
    )
    result_grid = tuner.fit()

    assert all(not result.error for result in result_grid)