def _on_insufficient_samples(self, trial_runner: "trial_runner.TrialRunner",
                             trial: Trial, time: float) -> str:
    """Decide whether to pause ``trial`` when too few samples are available.

    The trial is paused only when both conditions hold:
    1. It has held its slot for more than ``self._min_time_slice``.
    2. At least one other live trial is waiting (PENDING or PAUSED).

    Otherwise the trial keeps running.
    """
    # Has this trial used up its minimum time slice since its last pause?
    should_pause = time - self._last_pause[trial] > self._min_time_slice
    if should_pause:
        # Only actually pause if some other live trial is waiting for a slot.
        should_pause = any(
            t.status in (Trial.PENDING, Trial.PAUSED)
            for t in trial_runner.get_live_trials())
    return TrialScheduler.PAUSE if should_pause else TrialScheduler.CONTINUE
def example_resources_allocation_function(
        trial_runner: "trial_runner.TrialRunner",
        trial: Trial,
        result: Dict[str, Any],
        scheduler: "ResourceChangingScheduler",
) -> Union[None, PlacementGroupFactory, Resources]:
    """This is a basic example of a resource allocating function.

    The function naively balances available CPUs over live trials.

    This function returns a new ``PlacementGroupFactory`` with updated
    resource requirements, or None. If the returned
    ``PlacementGroupFactory`` is equal by value to the one the
    trial has currently, the scheduler will skip the update process
    internally (same with None).

    See :class:`DistributeResources` for a more complex,
    robust approach.

    Args:
        trial_runner (TrialRunner): Trial runner for this Tune run.
            Can be used to obtain information about other trials.
        trial (Trial): The trial to allocate new resources to.
        result (Dict[str, Any]): The latest results of trial.
        scheduler (ResourceChangingScheduler): The scheduler calling
            the function.
    """

    # Get base trial resources as defined in
    # ``tune.run(resources_per_trial)``.
    # Use the public accessor rather than the private
    # ``_base_trial_resources`` attribute, consistent with the other
    # allocation policies in this module.
    base_trial_resource = scheduler.base_trial_resources

    # Don't bother if this is just the first iteration
    if result["training_iteration"] < 1:
        return None

    # default values if resources_per_trial is unspecified
    if base_trial_resource is None:
        base_trial_resource = PlacementGroupFactory([{"CPU": 1, "GPU": 0}])

    # Assume that the number of CPUs cannot go below what was
    # specified in tune.run
    min_cpu = base_trial_resource.required_resources.get("CPU", 0)

    # Get the number of CPUs available in total (not just free)
    total_available_cpus = (
        trial_runner.trial_executor._resource_updater.get_num_cpus())

    # Divide the total CPUs evenly among all live trials, but never go
    # below the per-trial minimum
    cpu_to_use = max(
        min_cpu,
        total_available_cpus // len(trial_runner.get_live_trials()))

    # Assign new CPUs to the trial in a PlacementGroupFactory
    return PlacementGroupFactory([{"CPU": cpu_to_use, "GPU": 0}])
def __call__(
        self,
        trial_runner: "trial_runner.TrialRunner",
        trial: Trial,
        result: Dict[str, Any],
        scheduler: "ResourceChangingScheduler"
) -> Union[None, PlacementGroupFactory]:
    """Evenly distribute cluster CPUs and GPUs among live trials.

    Returns a new ``PlacementGroupFactory`` with updated resource
    requirements for ``trial``, or None if no update should happen.

    Raises:
        ValueError: If the base trial resources are not a
            ``PlacementGroupFactory``.
    """
    # Get base trial resources as defined in
    # ``tune.run(resources_per_trial)``
    base_trial_resource = scheduler.base_trial_resources

    if not isinstance(base_trial_resource, PlacementGroupFactory):
        raise ValueError("evenly_distribute_cpus_gpus only supports"
                         " PlacementGroupFactories.")

    # Don't bother if this is just the first iteration
    if result["training_iteration"] < 1:
        return None

    # default values if resources_per_trial is unspecified
    if base_trial_resource is None:
        base_trial_resource = PlacementGroupFactory([{"CPU": 1, "GPU": 0}])

    # Assume that the number of CPUs and GPUs can't go below
    # what was specified in tune.run
    min_cpu = base_trial_resource.required_resources.get("CPU", 0)
    min_gpu = base_trial_resource.required_resources.get("GPU", 0)

    min_cpu_bundle = base_trial_resource.bundles[0].get("CPU", 0)
    min_gpu_bundle = base_trial_resource.bundles[0].get("GPU", 0)

    # Get the number of CPUs and GPUs available in total (not just free)
    total_available_cpus = (
        trial_runner.trial_executor._avail_resources.cpu)
    total_available_gpus = (
        trial_runner.trial_executor._avail_resources.gpu)

    # Set upper limits for resources based on number of live trials
    # to ensure that the trial cannot get more resources than it's
    # possible to run
    num_running_trials = len(trial_runner.get_live_trials())
    if min_cpu == 0:
        upper_cpu_limit = 0
    else:
        upper_cpu_limit = math.ceil(total_available_cpus /
                                    num_running_trials)
        # Round to nearest bundle minimum
        # eg. 8 CPUs between 3 trials with min 2 CPUs per bundle
        # -> 4, 2, 2
        if self.add_bundles:
            upper_cpu_limit = math.ceil(
                upper_cpu_limit / min_cpu_bundle) * min_cpu_bundle
        upper_cpu_limit = max(min_cpu, upper_cpu_limit)

    if min_gpu == 0:
        upper_gpu_limit = 0
    else:
        upper_gpu_limit = math.ceil(total_available_gpus /
                                    num_running_trials)
        # Ensure we don't go below per-bundle minimum
        # BUG FIX: round using the GPU bundle minimum; the previous code
        # divided by ``min_cpu_bundle``, producing wrong GPU limits.
        if self.add_bundles:
            upper_gpu_limit = math.ceil(
                upper_gpu_limit / min_gpu_bundle) * min_gpu_bundle
        upper_gpu_limit = max(min_gpu, upper_gpu_limit)

    # Function to check how many CPUs and GPUs a trial is using currently
    def get_used_cpus_and_gpus(t: Trial):
        return (t.placement_group_factory.required_resources.get("CPU", 0),
                t.placement_group_factory.required_resources.get("GPU", 0))

    # Check how many CPUs and GPUs are currently being used by this trial
    trial_used_cpus, trial_used_gpus = get_used_cpus_and_gpus(trial)

    # Check how many CPUs and GPUs are currently being used by live trials
    used_cpus_and_gpus = [
        get_used_cpus_and_gpus(t) for t in trial_runner.get_live_trials()
    ]
    used_cpus, used_gpus = zip(*used_cpus_and_gpus)
    used_cpus = sum(used_cpus)
    used_gpus = sum(used_gpus)

    # Calculate how many free CPUs and GPUs there are
    free_cpus = total_available_cpus - used_cpus
    free_gpus = total_available_gpus - used_gpus

    # Add free CPUs and GPUs enforcing upper and lower limits
    new_cpu = min(upper_cpu_limit, max(trial_used_cpus + free_cpus,
                                       min_cpu))
    new_gpu = min(upper_gpu_limit, max(trial_used_gpus + free_gpus,
                                       min_gpu))

    # Assign new CPUs and GPUs to the trial in a PlacementGroupFactory
    # If self.add_bundles, make new bundles out of the resources
    if self.add_bundles:
        if min_cpu_bundle and min_gpu_bundle:
            # BUG FIX: GPUs must be divided by the GPU bundle size; the
            # previous code used ``min_cpu_bundle`` for both resources.
            multiplier = min(new_cpu // min_cpu_bundle,
                             new_gpu // min_gpu_bundle)
        elif min_gpu_bundle:
            # BUG FIX: same here — use the GPU bundle size.
            multiplier = new_gpu // min_gpu_bundle
        else:
            multiplier = new_cpu // min_cpu_bundle
        new_bundles = [{
            "CPU": min_cpu_bundle,
            "GPU": min_gpu_bundle
        }] * int(multiplier)
    # Otherwise, just put them all in one bundle
    else:
        new_bundles = [{"CPU": new_cpu, "GPU": new_gpu}]
    return PlacementGroupFactory(new_bundles)
def on_trial_result(self, trial_runner: "trial_runner.TrialRunner",
                    trial: Trial, result: Dict) -> str:
    """Handle a new result for ``trial`` and decide its scheduling action.

    Validates that the configured time attribute and metric are present,
    then either continues the trial (burn-in / perturbation interval not
    yet reached) or triggers the PBT checkpoint-or-exploit step. In
    asynchronous mode the trial may be paused to let waiting trials run;
    in synchronous mode all trials are paused and perturbed together once
    every trial has reached the sync point.

    Returns:
        One of ``TrialScheduler.CONTINUE``, ``PAUSE`` or ``NOOP``.
    """
    # Warn (or raise, if require_attrs) when the time attribute is missing
    # from the result dict.
    if self._time_attr not in result:
        time_missing_msg = ("Cannot find time_attr {} "
                            "in trial result {}. Make sure that this "
                            "attribute is returned in the "
                            "results of your Trainable.".format(
                                self._time_attr, result))
        if self._require_attrs:
            raise RuntimeError(
                time_missing_msg +
                "If this error is expected, you can change this to "
                "a warning message by "
                "setting PBT(require_attrs=False)")
        else:
            if log_once("pbt-time_attr-error"):
                logger.warning(time_missing_msg)
    # Same check for the optimization metric.
    if self._metric not in result:
        metric_missing_msg = ("Cannot find metric {} in trial result {}. "
                              "Make sure that this attribute is returned "
                              "in the "
                              "results of your Trainable.".format(
                                  self._metric, result))
        if self._require_attrs:
            raise RuntimeError(
                metric_missing_msg + "If this error is expected, "
                "you can change this to a warning message by "
                "setting PBT(require_attrs=False)")
        else:
            if log_once("pbt-metric-error"):
                logger.warning(metric_missing_msg)

    # Without both attributes no PBT decision can be made; keep training.
    if self._metric not in result or self._time_attr not in result:
        return TrialScheduler.CONTINUE

    time = result[self._time_attr]
    state = self._trial_state[trial]

    # Continue training if burn-in period has not been reached, yet.
    if time < self._burn_in_period:
        return TrialScheduler.CONTINUE

    # Continue training if perturbation interval has not been reached, yet.
    if time - state.last_perturbation_time < self._perturbation_interval:
        return TrialScheduler.CONTINUE  # avoid checkpoint overhead

    # Record the latest score/time for this trial before deciding.
    self._save_trial_state(state, time, result, trial)

    if not self._synch:
        # Asynchronous mode: perturb this trial immediately.
        state.last_perturbation_time = time
        lower_quantile, upper_quantile = self._quantiles()
        decision = TrialScheduler.CONTINUE
        # Yield the slot if any other trial is waiting to run.
        for other_trial in trial_runner.get_trials():
            if other_trial.status in [Trial.PENDING, Trial.PAUSED]:
                decision = TrialScheduler.PAUSE
                break
        self._checkpoint_or_exploit(trial, trial_runner.trial_executor,
                                    upper_quantile, lower_quantile)
        # NOTE(review): exploitation may already have paused the trial; in
        # that case NOOP avoids double-handling it.
        return TrialScheduler.NOOP if trial.status == Trial.PAUSED else decision
    else:
        # Synchronous mode.
        # Wait until every live trial has trained up to the sync point.
        if any(self._trial_state[t].last_train_time <
               self._next_perturbation_sync and t != trial
               for t in trial_runner.get_live_trials()):
            logger.debug("Pausing trial {}".format(trial))
        else:
            # All trials are synced at the same timestep.
            lower_quantile, upper_quantile = self._quantiles()
            all_trials = trial_runner.get_trials()
            not_in_quantile = []
            for t in all_trials:
                if t not in lower_quantile and t not in upper_quantile:
                    not_in_quantile.append(t)
            # Move upper quantile trials to beginning and lower quantile
            # to end. This ensures that checkpointing of strong trials
            # occurs before exploiting of weaker ones.
            all_trials = upper_quantile + not_in_quantile + lower_quantile
            for t in all_trials:
                logger.debug("Perturbing Trial {}".format(t))
                self._trial_state[t].last_perturbation_time = time
                self._checkpoint_or_exploit(t, trial_runner.trial_executor,
                                            upper_quantile, lower_quantile)

            # Advance the sync point by at least one perturbation interval,
            # or to the furthest-trained trial if that is later.
            all_train_times = [
                self._trial_state[t].last_train_time
                for t in trial_runner.get_trials()
            ]
            max_last_train_time = max(all_train_times)
            self._next_perturbation_sync = max(
                self._next_perturbation_sync + self._perturbation_interval,
                max_last_train_time,
            )
        # In sync mode we should pause all trials once result comes in.
        # Once a perturbation step happens for all trials, they should
        # still all be paused.
        # choose_trial_to_run will then pick the next trial to run out of
        # the paused trials.
        return (TrialScheduler.NOOP
                if trial.status == Trial.PAUSED else TrialScheduler.PAUSE)
def __call__(
    self,
    trial_runner: "trial_runner.TrialRunner",
    trial: Trial,
    result: Dict[str, Any],
    scheduler: "ResourceChangingScheduler",
) -> Union[None, PlacementGroupFactory]:
    """Run resource allocation logic.

    Computes a new ``PlacementGroupFactory`` with updated resource
    requirements for ``trial``, or returns None. If the returned
    factory compares equal by value to the trial's current one, the
    scheduler skips the update internally (same as returning None).

    Args:
        trial_runner: Trial runner for this Tune run.
            Can be used to obtain information about other trials.
        trial: The trial to allocate new resources to.
        result: The latest results of trial.
        scheduler: The scheduler calling the function.
    """
    # Resources as requested in ``tune.run(resources_per_trial)``.
    base_resources = scheduler.base_trial_resources

    if not self._validate(base_trial_resource=base_resources,
                          result=result):
        return None

    # Fall back to a minimal request when resources_per_trial was omitted.
    if base_resources is None:
        base_resources = PlacementGroupFactory([{"CPU": 1, "GPU": 0}])

    # Work out the unit by which this trial's allocation may grow.
    if self.increase_by:
        step = self.increase_by
        assert not self._is_bundle_empty(step)
        assert step.get("CPU", 0) >= 0 and step.get("GPU", 0) >= 0
    elif self.add_bundles:
        # Grow by replicating the last base bundle.
        step = base_resources.bundles[-1]
    elif base_resources.bundles[0].get("GPU", 0):
        # GPU trial: grow one GPU at a time.
        step = {"GPU": 1}
    else:
        step = {"CPU": 1}

    base_bundles = deepcopy(base_resources.bundles)

    cluster_cpus, cluster_gpus = self._get_total_available_resources(
        trial_runner=trial_runner)

    live_trials = trial_runner.get_live_trials()
    # Total CPUs/GPUs currently claimed across every live trial.
    per_trial_usage = [
        self._get_used_cpus_and_gpus(t) for t in live_trials
    ]
    cpu_usage, gpu_usage = zip(*per_trial_usage)
    total_used_cpus = sum(cpu_usage)
    total_used_gpus = sum(gpu_usage)

    extra_bundles = self._get_new_added_bundles(
        trial,
        live_trials,
        base_bundles,
        step,
        cluster_cpus,
        cluster_gpus,
        total_used_cpus,
        total_used_gpus,
    )

    combined_bundles = self._add_two_bundles(
        base_bundles, extra_bundles, step, False
    )

    pgf = PlacementGroupFactory(combined_bundles)
    # Preserve the original head-bundle layout on the new factory.
    pgf._head_bundle_is_empty = base_resources._head_bundle_is_empty
    return pgf