Example #1
    def _on_insufficient_samples(self,
                                 trial_runner: "trial_runner.TrialRunner",
                                 trial: Trial, time: float) -> str:
        # Only pause if the trial has run for at least ``min_time_slice``
        # since it was last paused ...
        pause = time - self._last_pause[trial] > self._min_time_slice
        # ... and there is at least one other live trial waiting to run.
        pause = pause and [
            t for t in trial_runner.get_live_trials()
            if t.status in (Trial.PENDING, Trial.PAUSED)
        ]
        return TrialScheduler.PAUSE if pause else TrialScheduler.CONTINUE
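This helper appears to come from Ray Tune's MedianStoppingRule, which only pauses a trial once it has run for at least ``min_time_slice`` and another trial is waiting for resources. A minimal usage sketch under that assumption (the trainable and metric names below are placeholders):

from ray import tune
from ray.tune.schedulers import MedianStoppingRule

# Hypothetical setup: the pause/continue decision above is taken by the
# scheduler while trials report a "mean_loss" metric.
scheduler = MedianStoppingRule(
    time_attr="time_total_s",
    metric="mean_loss",
    mode="min",
    min_time_slice=30)  # let each trial run at least 30s before it can yield

tune.run(my_trainable, scheduler=scheduler, num_samples=8)  # my_trainable: placeholder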
    def example_resources_allocation_function(
        trial_runner: "trial_runner.TrialRunner",
        trial: Trial,
        result: Dict[str, Any],
        scheduler: "ResourceChangingScheduler",
    ) -> Union[None, PlacementGroupFactory, Resources]:
        """This is a basic example of a resource allocating function.

        The function naively balances available CPUs over live trials.

        This function returns a new ``PlacementGroupFactory`` with updated
        resource requirements, or None. If the returned
        ``PlacementGroupFactory`` is equal by value to the one the
        trial has currently, the scheduler will skip the update process
        internally (same with None).

        See :class:`DistributeResources` for a more complex,
        robust approach.

        Args:
            trial_runner (TrialRunner): Trial runner for this Tune run.
                Can be used to obtain information about other trials.
            trial (Trial): The trial to allocate new resources to.
            result (Dict[str, Any]): The latest results of the trial.
            scheduler (ResourceChangingScheduler): The scheduler calling
                the function.
        """

        # Get base trial resources as defined in
        # ``tune.run(resources_per_trial)``
        base_trial_resource = scheduler.base_trial_resources

        # Don't bother if this is just the first iteration
        if result["training_iteration"] < 1:
            return None

        # default values if resources_per_trial is unspecified
        if base_trial_resource is None:
            base_trial_resource = PlacementGroupFactory([{"CPU": 1, "GPU": 0}])

        # Assume that the number of CPUs cannot go below what was
        # specified in tune.run
        min_cpu = base_trial_resource.required_resources.get("CPU", 0)

        # Get the number of CPUs available in total (not just free)
        total_available_cpus = (
            trial_runner.trial_executor._resource_updater.get_num_cpus())

        # Divide the free CPUs among all live trials
        cpu_to_use = max(
            min_cpu,
            total_available_cpus // len(trial_runner.get_live_trials()))

        # Assign new CPUs to the trial in a PlacementGroupFactory
        return PlacementGroupFactory([{"CPU": cpu_to_use, "GPU": 0}])
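A minimal sketch of how an allocation function like this is typically wired in, assuming Ray Tune's ``ResourceChangingScheduler`` API (import paths can differ slightly between Ray versions; the trainable below is a placeholder):

from ray import tune
from ray.tune.schedulers.resource_changing_scheduler import (
    ResourceChangingScheduler)
from ray.tune.utils.placement_groups import PlacementGroupFactory

# Hypothetical wiring: the scheduler calls the allocation function on every
# reported result and resizes the trial whenever a different
# PlacementGroupFactory is returned.
scheduler = ResourceChangingScheduler(
    resources_allocation_function=example_resources_allocation_function)

tune.run(
    my_trainable,  # placeholder Trainable that can handle resource updates
    scheduler=scheduler,
    resources_per_trial=PlacementGroupFactory([{"CPU": 1, "GPU": 0}]),
    num_samples=4)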
    def __call__(
        self, trial_runner: "trial_runner.TrialRunner", trial: Trial,
        result: Dict[str, Any], scheduler: "ResourceChangingScheduler"
    ) -> Union[None, PlacementGroupFactory]:
        # Get base trial resources as defined in
        # ``tune.run(resources_per_trial)``
        base_trial_resource = scheduler.base_trial_resources

        # default values if resources_per_trial is unspecified
        if base_trial_resource is None:
            base_trial_resource = PlacementGroupFactory([{"CPU": 1, "GPU": 0}])

        if not isinstance(base_trial_resource, PlacementGroupFactory):
            raise ValueError("evenly_distribute_cpus_gpus only supports"
                             " PlacementGroupFactories.")

        # Don't bother if this is just the first iteration
        if result["training_iteration"] < 1:
            return None

        # Assume that the number of CPUs and GPUs can't go below
        # what was specified in tune.run
        min_cpu = base_trial_resource.required_resources.get("CPU", 0)
        min_gpu = base_trial_resource.required_resources.get("GPU", 0)

        min_cpu_bundle = base_trial_resource.bundles[0].get("CPU", 0)
        min_gpu_bundle = base_trial_resource.bundles[0].get("GPU", 0)

        # Get the number of CPUs and GPUs available in total (not just free)
        total_available_cpus = (
            trial_runner.trial_executor._avail_resources.cpu)
        total_available_gpus = (
            trial_runner.trial_executor._avail_resources.gpu)

        # Set upper limits for resources based on the number of live trials
        # to ensure that a single trial cannot request more resources than
        # the cluster can actually provide
        num_running_trials = len(trial_runner.get_live_trials())
        if min_cpu == 0:
            upper_cpu_limit = 0
        else:
            upper_cpu_limit = math.ceil(total_available_cpus /
                                        num_running_trials)
            # Round to nearest bundle minimum
            # eg. 8 CPUs between 3 trials with min 2 CPUs per bundle
            #   -> 4, 2, 2
            if self.add_bundles:
                upper_cpu_limit = math.ceil(
                    upper_cpu_limit / min_cpu_bundle) * min_cpu_bundle
            upper_cpu_limit = max(min_cpu, upper_cpu_limit)

        if min_gpu == 0:
            upper_gpu_limit = 0
        else:
            upper_gpu_limit = math.ceil(total_available_gpus /
                                        num_running_trials)
            # Round to nearest bundle minimum, as for CPUs above
            if self.add_bundles:
                upper_gpu_limit = math.ceil(
                    upper_gpu_limit / min_gpu_bundle) * min_gpu_bundle
            upper_gpu_limit = max(min_gpu, upper_gpu_limit)

        # Function to check how many CPUs and GPUs a trial is using currently
        def get_used_cpus_and_gpus(t: Trial):
            return (t.placement_group_factory.required_resources.get("CPU", 0),
                    t.placement_group_factory.required_resources.get("GPU", 0))

        # Check how many CPUs and GPUs are currently being used by this trial
        trial_used_cpus, trial_used_gpus = get_used_cpus_and_gpus(trial)

        # Check how many CPUs and GPUs are currently being used by live trials
        used_cpus_and_gpus = [
            get_used_cpus_and_gpus(t) for t in trial_runner.get_live_trials()
        ]
        used_cpus, used_gpus = zip(*used_cpus_and_gpus)
        used_cpus = sum(used_cpus)
        used_gpus = sum(used_gpus)

        # Calculate how many free CPUs and GPUs there are
        free_cpus = total_available_cpus - used_cpus
        free_gpus = total_available_gpus - used_gpus

        # Add free CPUs and GPUs enforcing upper and lower limits
        new_cpu = min(upper_cpu_limit, max(trial_used_cpus + free_cpus,
                                           min_cpu))
        new_gpu = min(upper_gpu_limit, max(trial_used_gpus + free_gpus,
                                           min_gpu))

        # Assign new CPUs and GPUs to the trial in a PlacementGroupFactory

        # If self.add_bundles, make new bundles out of the resources
        if self.add_bundles:
            if min_cpu_bundle and min_gpu_bundle:
                multiplier = min(new_cpu // min_cpu_bundle,
                                 new_gpu // min_gpu_bundle)
            elif min_gpu_bundle:
                multiplier = new_gpu // min_gpu_bundle
            else:
                multiplier = new_cpu // min_cpu_bundle
            new_bundles = [{
                "CPU": min_cpu_bundle,
                "GPU": min_gpu_bundle
            }] * int(multiplier)
        # Otherwise, just put them all in one bundle
        else:
            new_bundles = [{"CPU": new_cpu, "GPU": new_gpu}]
        return PlacementGroupFactory(new_bundles)
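To make the bundle rounding used above concrete, here is a small standalone check of the arithmetic (plain Python, no Ray required), matching the "8 CPUs between 3 trials with min 2 CPUs per bundle -> 4, 2, 2" comment:

import math

# First trial of three: divide 8 CPUs evenly, then round the per-trial cap
# up to a multiple of the 2-CPU bundle size.
total_available_cpus = 8
num_running_trials = 3
min_cpu_bundle = 2

upper_cpu_limit = math.ceil(total_available_cpus / num_running_trials)  # -> 3
upper_cpu_limit = math.ceil(
    upper_cpu_limit / min_cpu_bundle) * min_cpu_bundle                   # -> 4
print(upper_cpu_limit)  # 4; the two remaining trials keep their 2-CPU minimum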
Example #4
    def on_trial_result(self, trial_runner: "trial_runner.TrialRunner",
                        trial: Trial, result: Dict) -> str:
        if self._time_attr not in result:
            time_missing_msg = ("Cannot find time_attr {} "
                                "in trial result {}. Make sure that this "
                                "attribute is returned in the "
                                "results of your Trainable.".format(
                                    self._time_attr, result))
            if self._require_attrs:
                raise RuntimeError(
                    time_missing_msg +
                    " If this error is expected, you can change this to "
                    "a warning message by "
                    "setting PBT(require_attrs=False)")
            else:
                if log_once("pbt-time_attr-error"):
                    logger.warning(time_missing_msg)
        if self._metric not in result:
            metric_missing_msg = ("Cannot find metric {} in trial result {}. "
                                  "Make sure that this attribute is returned "
                                  "in the "
                                  "results of your Trainable.".format(
                                      self._metric, result))
            if self._require_attrs:
                raise RuntimeError(
                    metric_missing_msg + " If this error is expected, "
                    "you can change this to a warning message by "
                    "setting PBT(require_attrs=False)")
            else:
                if log_once("pbt-metric-error"):
                    logger.warning(metric_missing_msg)

        if self._metric not in result or self._time_attr not in result:
            return TrialScheduler.CONTINUE

        time = result[self._time_attr]
        state = self._trial_state[trial]

        # Continue training if the burn-in period has not been reached yet.
        if time < self._burn_in_period:
            return TrialScheduler.CONTINUE

        # Continue training if the perturbation interval has not been reached yet.
        if time - state.last_perturbation_time < self._perturbation_interval:
            return TrialScheduler.CONTINUE  # avoid checkpoint overhead

        self._save_trial_state(state, time, result, trial)

        if not self._synch:
            state.last_perturbation_time = time
            lower_quantile, upper_quantile = self._quantiles()
            decision = TrialScheduler.CONTINUE
            for other_trial in trial_runner.get_trials():
                if other_trial.status in [Trial.PENDING, Trial.PAUSED]:
                    decision = TrialScheduler.PAUSE
                    break
            self._checkpoint_or_exploit(trial, trial_runner.trial_executor,
                                        upper_quantile, lower_quantile)
            return (TrialScheduler.NOOP
                    if trial.status == Trial.PAUSED else decision)
        else:
            # Synchronous mode.
            if any(self._trial_state[t].last_train_time <
                   self._next_perturbation_sync and t != trial
                   for t in trial_runner.get_live_trials()):
                logger.debug("Pausing trial {}".format(trial))
            else:
                # All trials are synced at the same timestep.
                lower_quantile, upper_quantile = self._quantiles()
                all_trials = trial_runner.get_trials()
                not_in_quantile = []
                for t in all_trials:
                    if t not in lower_quantile and t not in upper_quantile:
                        not_in_quantile.append(t)
                # Move upper quantile trials to beginning and lower quantile
                # to end. This ensures that checkpointing of strong trials
                # occurs before exploiting of weaker ones.
                all_trials = upper_quantile + not_in_quantile + lower_quantile
                for t in all_trials:
                    logger.debug("Perturbing Trial {}".format(t))
                    self._trial_state[t].last_perturbation_time = time
                    self._checkpoint_or_exploit(t, trial_runner.trial_executor,
                                                upper_quantile, lower_quantile)

                all_train_times = [
                    self._trial_state[t].last_train_time
                    for t in trial_runner.get_trials()
                ]
                max_last_train_time = max(all_train_times)
                self._next_perturbation_sync = max(
                    self._next_perturbation_sync + self._perturbation_interval,
                    max_last_train_time,
                )
            # In sync mode we should pause all trials once result comes in.
            # Once a perturbation step happens for all trials, they should
            # still all be paused.
            # choose_trial_to_run will then pick the next trial to run out of
            # the paused trials.
            return (TrialScheduler.NOOP
                    if trial.status == Trial.PAUSED else TrialScheduler.PAUSE)
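For context, a hedged sketch of how a PBT scheduler with these knobs is typically constructed, assuming Ray Tune's ``PopulationBasedTraining`` API (the hyperparameters, metric, and trainable below are placeholders):

from ray import tune
from ray.tune.schedulers import PopulationBasedTraining

# Hypothetical configuration exercising the attributes referenced above:
# time_attr/metric feed the checks at the top of on_trial_result, while
# perturbation_interval and synch control when and how trials are perturbed.
pbt = PopulationBasedTraining(
    time_attr="training_iteration",
    metric="mean_accuracy",
    mode="max",
    perturbation_interval=4,
    require_attrs=True,
    synch=False,
    hyperparam_mutations={
        "lr": tune.loguniform(1e-4, 1e-1),
        "batch_size": [32, 64, 128],
    })

tune.run(my_trainable, scheduler=pbt, num_samples=8)  # my_trainable: placeholder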
    def __call__(
        self,
        trial_runner: "trial_runner.TrialRunner",
        trial: Trial,
        result: Dict[str, Any],
        scheduler: "ResourceChangingScheduler",
    ) -> Union[None, PlacementGroupFactory]:
        """Run resource allocation logic.

        Returns a new ``PlacementGroupFactory`` with updated
        resource requirements, or None. If the returned
        ``PlacementGroupFactory`` is equal by value to the one the
        trial has currently, the scheduler will skip the update process
        internally (same with None).

        Args:
            trial_runner: Trial runner for this Tune run.
                Can be used to obtain information about other trials.
            trial: The trial to allocate new resources to.
            result: The latest results of the trial.
            scheduler: The scheduler calling
                the function.
        """
        # Get base trial resources as defined in
        # ``tune.run(resources_per_trial)``
        base_trial_resource = scheduler.base_trial_resources

        if not self._validate(base_trial_resource=base_trial_resource, result=result):
            return None

        # default values if resources_per_trial is unspecified
        if base_trial_resource is None:
            base_trial_resource = PlacementGroupFactory([{"CPU": 1, "GPU": 0}])

        if self.increase_by:
            increase_by = self.increase_by
            assert not self._is_bundle_empty(increase_by)
            assert increase_by.get("CPU", 0) >= 0 and increase_by.get("GPU", 0) >= 0
        elif self.add_bundles:
            increase_by = base_trial_resource.bundles[-1]
        elif base_trial_resource.bundles[0].get("GPU", 0):
            increase_by = {"GPU": 1}
        else:
            increase_by = {"CPU": 1}

        base_bundles = deepcopy(base_trial_resource.bundles)

        (
            total_available_cpus,
            total_available_gpus,
        ) = self._get_total_available_resources(trial_runner=trial_runner)

        all_trials = trial_runner.get_live_trials()

        used_cpus_and_gpus = [self._get_used_cpus_and_gpus(t) for t in all_trials]
        used_cpus, used_gpus = zip(*used_cpus_and_gpus)
        used_cpus = sum(used_cpus)
        used_gpus = sum(used_gpus)

        added_bundles = self._get_new_added_bundles(
            trial,
            all_trials,
            base_bundles,
            increase_by,
            total_available_cpus,
            total_available_gpus,
            used_cpus,
            used_gpus,
        )

        new_bundles = self._add_two_bundles(
            base_bundles, added_bundles, increase_by, False
        )

        pgf = PlacementGroupFactory(new_bundles)
        pgf._head_bundle_is_empty = base_trial_resource._head_bundle_is_empty
        return pgf
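Since this ``__call__`` belongs to a resource-allocation policy class (``DistributeResources`` in Ray Tune), a minimal usage sketch under that assumption (the base scheduler, metric, and trainable are placeholders):

from ray import tune
from ray.tune.schedulers import ASHAScheduler
from ray.tune.schedulers.resource_changing_scheduler import (
    DistributeResources, ResourceChangingScheduler)

# Hypothetical wiring: wrap a base scheduler and let the policy grow each
# trial's bundles as other trials finish and free up cluster resources.
scheduler = ResourceChangingScheduler(
    base_scheduler=ASHAScheduler(metric="mean_accuracy", mode="max"),
    resources_allocation_function=DistributeResources(add_bundles=False))

tune.run(my_trainable, scheduler=scheduler, num_samples=8)  # placeholder trainable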