def testAllocateFreeResourcesWithIncreaseByTimes(self):
    # Exercises DistributeResources with add_bundles=True, a {"GPU": 2}
    # increment and increase_by_times=2. As other trials are marked
    # TERMINATED, the expected request grows by whole GPU bundles.
    # NOTE(review): _prepareTrials and _allocateAndAssertNewResources are
    # defined elsewhere; their exact semantics are not visible here.

    def pgf_with_gpu_bundles(count):
        # One CPU bundle followed by `count` bundles of 2 GPUs each.
        return PlacementGroupFactory([{"CPU": 1}] + [{"GPU": 2}] * count)

    allocation_fn = DistributeResources(
        add_bundles=True, increase_by={"GPU": 2}, increase_by_times=2
    )
    scheduler = ResourceChangingScheduler(
        resources_allocation_function=allocation_fn
    )
    trial1, trial2, trial3, trial4 = self._prepareTrials(
        scheduler, pgf_with_gpu_bundles(1)
    )

    # Trial 1 reports a result before any reallocation is checked.
    reported = {"metric": 1, "training_iteration": 4}
    decision = scheduler.on_trial_result(self.trial_runner, trial1, reported)
    assert decision == TrialScheduler.CONTINUE

    trial4.status = Trial.TERMINATED
    self._allocateAndAssertNewResources(
        trial1, scheduler, pgf_with_gpu_bundles(2)
    )

    trial3.status = Trial.TERMINATED
    self._allocateAndAssertNewResources(
        trial2, scheduler, pgf_with_gpu_bundles(2)
    )

    trial2.status = Trial.TERMINATED
    self._allocateAndAssertNewResources(
        trial1, scheduler, pgf_with_gpu_bundles(3)
    )
def testAllocateFreeResourcesWithIncreaseBy(self):
    # Exercises DistributeResourcesToTopJob (mode="max" on "metric") with
    # add_bundles=False and an increment of 2 CPUs + 2 GPUs: the single
    # bundle is expected to grow in place as other trials terminate.
    # NOTE(review): _prepareTrials and _allocateAndAssertNewResources are
    # defined elsewhere; their exact semantics are not visible here.

    def single_bundle_pgf(amount):
        # One bundle holding `amount` CPUs and `amount` GPUs.
        return PlacementGroupFactory([{"CPU": amount, "GPU": amount}])

    def report_and_expect_continue(trial, metric_value):
        # Feed a result to the scheduler and require a CONTINUE decision.
        decision = scheduler.on_trial_result(
            self.trial_runner,
            trial,
            {"metric": metric_value, "training_iteration": 4},
        )
        assert decision == TrialScheduler.CONTINUE

    allocation_fn = DistributeResourcesToTopJob(
        add_bundles=False,
        increase_by={"CPU": 2, "GPU": 2},
        metric="metric",
        mode="max",
    )
    scheduler = ResourceChangingScheduler(
        resources_allocation_function=allocation_fn
    )
    trial1, trial2, trial3, trial4 = self._prepareTrials(
        scheduler, single_bundle_pgf(2)
    )

    report_and_expect_continue(trial2, 0.9)
    report_and_expect_continue(trial1, 1.0)

    trial4.status = Trial.TERMINATED
    self._allocateAndAssertNewResources(
        trial1, scheduler, single_bundle_pgf(4)
    )

    report_and_expect_continue(trial2, 1.1)

    trial3.status = Trial.TERMINATED
    self._allocateAndAssertNewResources(
        trial2, scheduler, single_bundle_pgf(4), metric=1.1
    )

    trial2.status = Trial.TERMINATED
    self._allocateAndAssertNewResources(
        trial1, scheduler, single_bundle_pgf(8), metric=1.2
    )
def testDeallocateResources(self):
    # Trial 1 starts with more GPUs (4) than the base request (2) while
    # trial 4 is set to PENDING; the expected new request for trial 1 is
    # the base request again.
    # NOTE(review): _prepareTrials and _allocateAndAssertNewResources are
    # defined elsewhere; their exact semantics are not visible here.

    def single_bundle_pgf(gpus):
        # One bundle holding 1 CPU and `gpus` GPUs.
        return PlacementGroupFactory([{"CPU": 1, "GPU": gpus}])

    allocation_fn = DistributeResourcesToTopJob(
        add_bundles=False, increase_by={"GPU": 2}, metric="metric", mode="max"
    )
    scheduler = ResourceChangingScheduler(
        resources_allocation_function=allocation_fn
    )
    trial1, _, _, trial4 = self._prepareTrials(scheduler, single_bundle_pgf(2))

    trial1.placement_group_factory = single_bundle_pgf(4)
    trial4.status = Trial.PENDING

    self._allocateAndAssertNewResources(
        trial1, scheduler, single_bundle_pgf(2)
    )
def default_resource_request(cls, config):
    """Build the placement group request for one trial.

    Each setting is looked up in ``config`` first and falls back to
    ``kwargs``. NOTE(review): ``kwargs`` is not defined in this block; it
    is presumably captured from an enclosing scope -- confirm.

    Args:
        config: Mapping that may contain ``num_workers``,
            ``num_cpus_per_worker``, ``use_gpu`` and ``use_local``.

    Returns:
        A ``PlacementGroupFactory`` with strategy ``"PACK"``: one
        ``{"CPU": 1}`` driver bundle (omitted when ``use_local`` is
        truthy) followed by ``num_workers`` worker bundles.
    """
    num_workers = config.get("num_workers", kwargs.get("num_workers", 1))
    num_cpus_per_worker = config.get(
        "num_cpus_per_worker", kwargs.get("num_cpus_per_worker", 1)
    )
    use_gpu = config.get("use_gpu", kwargs.get("use_gpu"))
    use_local = config.get("use_local", kwargs.get("use_local", False))

    # ``use_gpu`` has no fallback default, so it is None when set in
    # neither place; ``int(None)`` would raise TypeError. Treat a missing
    # value as "no GPU".
    gpus_per_worker = 0 if use_gpu is None else int(use_gpu)

    bundles = []
    if not use_local:
        # We need a separate bundle for the driver
        bundles.append({"CPU": 1})

    # Worker bundles: build a fresh dict per worker so the bundles do not
    # alias one shared object.
    bundles.extend(
        {"CPU": num_cpus_per_worker, "GPU": gpus_per_worker}
        for _ in range(num_workers)
    )
    return PlacementGroupFactory(bundles, strategy="PACK")
def pause(self, trial_runner):
    """Checkpoint this AdaptDLTrial to disk and drop its placement group.

    The runner's state is saved, written to a temporary checkpoint
    directory under ``self.logdir``, and recorded in ``self.restore_path``
    so the trial can be restored from it when it is resumed.
    """
    assert self.runner is not None

    # Fetch the runner's saved state.
    state_ref = self.runner.get_state.remote()
    saved_states = ray.get(self.runner.save_all_states.remote(state_ref))

    # Serialize to disk
    checkpoint_dir = FuncCheckpointUtil.mk_temp_checkpoint_dir(self.logdir)
    checkpoint_path = TrainableUtil.create_from_pickle(
        saved_states, checkpoint_dir
    )

    # The trial is restored from this path once it is resumed.
    self.restore_path = checkpoint_path

    # Hack to clear the allocation: swap in a tiny placeholder PG, which is
    # replaced by a real one on resume. Needed because Tune keeps PGs
    # around even for PAUSED trials.
    self.placement_group_factory = PlacementGroupFactory([{"CPU": 0.001}])

    # Force Tune to garbage-collect unneeded PGs so they can be reused.
    pg_manager = trial_runner.trial_executor._pg_manager
    pg_manager.reconcile_placement_groups([self])

    logger.debug(f"PAUSING {self} w/ checkpoint at {checkpoint_path}")
def testDeallocateResources(self):
    # Trial 1 starts with one extra GPU bundle beyond the base request
    # while trial 4 is set to PENDING; the expected new request for
    # trial 1 is the base request again.
    # NOTE(review): _prepareTrials and _allocateAndAssertNewResources are
    # defined elsewhere; their exact semantics are not visible here.

    def pgf_with_gpu_bundles(count):
        # One CPU bundle followed by `count` bundles of 2 GPUs each.
        return PlacementGroupFactory([{"CPU": 1}] + [{"GPU": 2}] * count)

    allocation_fn = DistributeResources(
        add_bundles=True, increase_by={"GPU": 2}
    )
    scheduler = ResourceChangingScheduler(
        resources_allocation_function=allocation_fn
    )
    trial1, _, _, trial4 = self._prepareTrials(
        scheduler, pgf_with_gpu_bundles(1)
    )

    trial1.placement_group_factory = pgf_with_gpu_bundles(2)
    trial4.status = Trial.PENDING

    self._allocateAndAssertNewResources(
        trial1, scheduler, pgf_with_gpu_bundles(1)
    )
def default_resource_request(cls, config: Dict) -> PlacementGroupFactory:
    """Return a PACK placement group: one trainer bundle plus workers.

    ``config`` is not read by this body. NOTE(review): ``use_gpu``,
    ``resources_per_worker`` and ``num_workers`` are not defined in this
    block; presumably they come from an enclosing scope -- confirm.

    Returns:
        A ``PlacementGroupFactory`` whose first bundle is ``{"CPU": 1}``
        and whose remaining ``num_workers`` bundles each hold 1 CPU,
        ``int(use_gpu)`` GPUs and any entries of ``resources_per_worker``
        (which override the CPU/GPU defaults on key collision).
    """
    base_resources = {"CPU": 1, "GPU": int(use_gpu)}
    if resources_per_worker is None:
        extra_resources = {}
    else:
        extra_resources = resources_per_worker

    # Trainer bundle first, then one independent dict per worker.
    bundles = [{"CPU": 1}]
    for _ in range(num_workers):
        bundles.append({**base_resources, **extra_resources})

    return PlacementGroupFactory(bundles, strategy="PACK")
def testAllocateFreeResources(self):
    # Exercises DistributeResources with add_bundles=False on a CPU-only
    # base request: the single bundle's CPU count is expected to grow as
    # other trials are marked TERMINATED.
    # NOTE(review): _prepareTrials and _allocateAndAssertNewResources are
    # defined elsewhere; their exact semantics are not visible here.
    scheduler = ResourceChangingScheduler(
        resources_allocation_function=DistributeResources(add_bundles=False)
    )
    trial1, trial2, trial3, trial4 = self._prepareTrials(
        scheduler, PlacementGroupFactory([{"CPU": 1, "GPU": 0}])
    )

    # Each step: (trial to terminate first or None, trial to reallocate,
    # expected CPU count of its single bundle).
    steps = [
        (None, trial1, 2),
        (None, trial2, 2),
        (trial4, trial1, 3),
        (trial3, trial1, 4),
        (trial2, trial1, 8),
    ]
    for finished, target, expected_cpus in steps:
        if finished is not None:
            finished.status = Trial.TERMINATED
        self._allocateAndAssertNewResources(
            target, scheduler, PlacementGroupFactory([{"CPU": expected_cpus}])
        )
def get_tune_ddp_resources(
    num_workers: int = 1, cpus_per_worker: int = 1, use_gpu: bool = False
) -> "PlacementGroupFactory":
    """Return the PlacementGroupFactory to use for Ray Tune.

    Args:
        num_workers: Number of worker bundles to request.
        cpus_per_worker: CPUs placed in each worker bundle.
        use_gpu: When true, each worker bundle also requests one GPU.

    Returns:
        A ``PlacementGroupFactory`` with strategy ``"PACK"`` made of one
        ``{"CPU": 1}`` head bundle followed by ``num_workers`` worker
        bundles.
    """
    # Imported at call time, as in the original. The return annotation is
    # a string because the name is not bound when the ``def`` is evaluated.
    from ray.tune import PlacementGroupFactory

    head_bundle = {"CPU": 1}
    worker_bundle = {"CPU": cpus_per_worker, "GPU": int(use_gpu)}
    # Copy per worker so the bundles do not share one dict object.
    worker_bundles = [worker_bundle.copy() for _ in range(num_workers)]

    return PlacementGroupFactory(
        [head_bundle] + worker_bundles, strategy="PACK"
    )
def default_resource_request(cls, config):
    """Return a two-bundle PlacementGroupFactory for this trainable.

    ``config`` is layered over ``cls._default_config`` to resolve
    ``num_gpus`` and ``num_envs_per_worker``. The placement strategy is
    read from ``config`` itself and defaults to ``"PACK"``.
    """
    cf = dict(cls._default_config, **config)

    # Single CPU for the local worker, which hosts the main model in this
    # example (num_workers=0), plus possibly n GPUs.
    local_worker_bundle = {"CPU": 1, "GPU": cf["num_gpus"]}

    # Separate bundle (meaning: possibly a different node) for the n
    # "remote" envs (set remote_worker_envs=True).
    remote_envs_bundle = {"CPU": cf["num_envs_per_worker"]}

    strategy = config.get("placement_strategy", "PACK")

    # All needed resources, already defined as device bundles.
    return PlacementGroupFactory(
        bundles=[local_worker_bundle, remote_envs_bundle],
        strategy=strategy,
    )
def _get_tune_resources(
    num_actors: int,
    cpus_per_actor: int,
    gpus_per_actor: int,
    resources_per_actor: Optional[Dict],
):
    """Return the object to pass as ``resources_per_trial`` to Ray Tune.

    When ``TUNE_USING_PG`` is true, the result is a
    ``PlacementGroupFactory`` (strategy ``"PACK"``) with one
    ``{"CPU": 1}`` head bundle and one bundle per actor. Otherwise it is
    a plain dict with ``cpu``, ``extra_cpu``, ``extra_gpu`` and
    ``extra_custom_resources`` entries, scaled by ``num_actors``.

    Raises:
        RuntimeError: If ``TUNE_INSTALLED`` is false.
    """
    if not TUNE_INSTALLED:
        raise RuntimeError("Tune is not installed, so `get_tune_resources` is "
                           "not supported. You can install Ray Tune via `pip "
                           "install ray[tune]`.")

    if TUNE_USING_PG:
        from ray.tune import PlacementGroupFactory

        if resources_per_actor is None:
            custom_per_actor = {}
        else:
            custom_per_actor = resources_per_actor

        # One independent dict per actor; custom resources override the
        # CPU/GPU defaults on key collision.
        actor_bundles = [
            {"CPU": cpus_per_actor, "GPU": gpus_per_actor, **custom_per_actor}
            for _ in range(num_actors)
        ]
        return PlacementGroupFactory(
            [{"CPU": 1}] + actor_bundles, strategy="PACK"
        )

    # Dict-style request: scale per-actor amounts by the actor count.
    custom_per_actor = resources_per_actor or {}
    scaled_custom = {}
    for resource_name, amount in custom_per_actor.items():
        scaled_custom[resource_name] = amount * num_actors

    return {
        "cpu": 1,
        "extra_cpu": cpus_per_actor * num_actors,
        "extra_gpu": gpus_per_actor * num_actors,
        "extra_custom_resources": scaled_custom,
    }