Example #1
0
 def choose_trial_to_run(
         self, trial_runner: "trial_runner.TrialRunner") -> Optional[Trial]:
     """Return the next trial that can be started, or None.

     Trials are scanned in the order given by ``trial_runner.get_trials()``.
     Every PENDING trial is considered before any PAUSED one; the first
     trial for which ``trial_runner.has_resources_for_trial`` is truthy
     wins.

     Args:
         trial_runner: Runner that supplies the trial list and the
             resource check.

     Returns:
         The selected trial, or None when no PENDING or PAUSED trial
         currently has resources.
     """
     # One complete pass per status, in priority order.
     for wanted_status in (Trial.PENDING, Trial.PAUSED):
         for candidate in trial_runner.get_trials():
             if (candidate.status == wanted_status
                     and trial_runner.has_resources_for_trial(candidate)):
                 return candidate
     return None
Example #2
0
File: pbt.py Project: smorad/ray
    def choose_trial_to_run(
        self, trial_runner: "trial_runner.TrialRunner"
    ) -> Optional[Trial]:
        """Pick the runnable trial with the smallest ``last_train_time``.

        A trial is a candidate when it is PENDING or PAUSED and the trial
        executor reports resources for it. When ``self._synch`` is set, it
        must additionally have a ``last_train_time`` strictly below
        ``self._next_perturbation_sync``.

        Args:
            trial_runner: Runner that supplies the trial list and the
                executor used for the resource check.

        Returns:
            The candidate with the lowest recorded ``last_train_time``
            (the earliest in runner order on ties), or None when there
            are no candidates.
        """

        def _last_train_time(t):
            return self._trial_state[t].last_train_time

        def _is_candidate(t) -> bool:
            if t.status not in (Trial.PENDING, Trial.PAUSED):
                return False
            if not trial_runner.trial_executor.has_resources_for_trial(t):
                return False
            if not self._synch:
                return True
            # Synchronous mode: only trials that have not yet reached the
            # next perturbation sync point may run.
            return _last_train_time(t) < self._next_perturbation_sync

        runnable = [t for t in trial_runner.get_trials() if _is_candidate(t)]
        if not runnable:
            return None
        # Stable sort, so ties keep the order of trial_runner.get_trials().
        return sorted(runnable, key=_last_train_time)[0]
Example #3
0
    def on_trial_result(self, trial_runner: "trial_runner.TrialRunner",
                        trial: Trial, result: Dict) -> str:
        """Apply any cached allocation change for ``trial``.

        When ``self._should_realloc`` is set and the allocation cache
        ``self._allocs`` is empty, the cache is first refilled by asking
        ``self._allocator`` to allocate all RUNNING/PENDING trials over the
        nodes derived from the executor's in-use placement groups.

        Args:
            trial_runner: Runner that supplies the trial list and executor.
            trial: Trial that produced the result.
            result: Result dict reported by the trial (not inspected here).

        Returns:
            ``TrialScheduler.CONTINUE`` if there is no cached allocation
            for the trial or it equals the current one,
            ``TrialScheduler.PAUSE`` if a RUNNING trial was given an empty
            allocation, and ``TrialScheduler.STOP`` if a replacement trial
            was created with a different allocation.
        """
        active_states = (Trial.RUNNING, Trial.PENDING)
        schedulable = [
            t for t in trial_runner.get_trials() if t.status in active_states
        ]

        if self._should_realloc and len(self._allocs) == 0:
            pg_manager = trial_runner.trial_executor._pg_manager
            pg_dicts = [pg.to_dict() for pg in pg_manager._in_use_pgs]
            nodes = config.nodes(pgs_to_resources(pg_dicts))
            self._allocs, _ = self._allocator.allocate(schedulable, nodes)

        # Each cached allocation is consumed at most once per trial.
        new_alloc = self._allocs.pop(trial.trial_id, None)
        if new_alloc is None:
            # Nothing cached for this trial: keep it running as is.
            return TrialScheduler.CONTINUE

        if new_alloc == [] and trial.status == Trial.RUNNING:
            # An empty allocation pauses the trial, but only if it runs.
            trial.pause(trial_runner)
            return TrialScheduler.PAUSE

        if new_alloc != trial.allocation:
            # Build the replacement with the new allocation, then stop
            # the trial that is being replaced.
            AdaptDLTrial.create_from(trial,
                                     trial_runner,
                                     new_alloc,
                                     copy_state=True)
            return TrialScheduler.STOP

        return TrialScheduler.CONTINUE
Example #4
0
 def _on_insufficient_samples(self,
                              trial_runner: "trial_runner.TrialRunner",
                              trial: Trial, time: float) -> str:
     """Decide whether ``trial`` should yield to waiting trials.

     Args:
         trial_runner: Runner that supplies the trial list.
         trial: Trial under consideration; key into ``self._last_pause``.
         time: Current time value, compared against the trial's last
             pause time.

     Returns:
         ``TrialScheduler.PAUSE`` when more than ``self._min_time_slice``
         has elapsed since the trial's last pause and at least one trial
         is PENDING or PAUSED; ``TrialScheduler.CONTINUE`` otherwise.
     """
     slice_used_up = time - self._last_pause[trial] > self._min_time_slice
     if not slice_used_up:
         # The trial list is not consulted until the slice has elapsed.
         return TrialScheduler.CONTINUE

     someone_waiting = any(
         other.status in (Trial.PENDING, Trial.PAUSED)
         for other in trial_runner.get_trials())
     if someone_waiting:
         return TrialScheduler.PAUSE
     return TrialScheduler.CONTINUE
Example #5
0
 def choose_trial_to_run(
         self, trial_runner: "trial_runner.TrialRunner") -> Optional[Trial]:
     """Return the next trial to run, preferring PENDING over PAUSED.

     A PENDING trial with available resources is returned unchanged. A
     PAUSED trial is only eligible while the allocation cache
     ``self._allocs`` is empty; in that case the value returned is a new
     trial built by ``AdaptDLTrial.create_from`` using the allocator's
     default allocation.

     Args:
         trial_runner: Runner that supplies the trial list and executor.

     Returns:
         A trial to run, or None when nothing is eligible.
     """

     def _fits(t) -> bool:
         return trial_runner.trial_executor.has_resources_for_trial(t)

     for candidate in trial_runner.get_trials():
         if candidate.status == Trial.PENDING and _fits(candidate):
             return candidate

     for candidate in trial_runner.get_trials():
         resumable = (candidate.status == Trial.PAUSED
                      and _fits(candidate)
                      and len(self._allocs) == 0)
         if not resumable:
             continue
         # Note: this puts the trial back to RUNNING. Trials may resume
         # when the allocation cache is empty and a sync point is reached.
         return AdaptDLTrial.create_from(
             candidate,
             trial_runner,
             self._allocator.default_allocation(),
             copy_state=True)

     return None
Example #6
0
    def choose_trial_to_run(
        self, trial_runner: "trial_runner.TrialRunner", allow_recurse: bool = True
    ) -> Optional[Trial]:
        """Return a PENDING trial with resources, processing brackets if idle.

        First, every non-None bracket of every entry in ``self._hyperbands``
        is scanned and the first PENDING trial the executor has resources
        for is returned.

        If none is found and no trial known to the runner is RUNNING, each
        bracket containing a PAUSED trial is passed to
        ``self._process_bracket``. If that leaves the bracket with a
        PENDING trial and ``allow_recurse`` is true, the search is repeated
        once with recursion disabled.

        Args:
            trial_runner: Runner that supplies the trial list and executor.
            allow_recurse: Whether a single retry is permitted after a
                bracket has been processed.

        Returns:
            A trial to run, or None.
        """
        for band in self._hyperbands:
            for bracket in band:
                # A band holds None where a bracket gets no resources.
                if bracket is None:
                    continue
                for candidate in bracket.current_trials():
                    if (
                        candidate.status == Trial.PENDING
                        and trial_runner.trial_executor.has_resources_for_trial(
                            candidate
                        )
                    ):
                        return candidate

        # Brackets are only processed while nothing is running.
        if any(t.status == Trial.RUNNING for t in trial_runner.get_trials()):
            return None

        for band in self._hyperbands:
            for bracket in band:
                if not bracket:
                    continue
                has_paused = any(
                    t.status == Trial.PAUSED for t in bracket.current_trials()
                )
                if not has_paused:
                    continue

                # This will change the trial state.
                self._process_bracket(trial_runner, bracket)

                # There might now be both PENDING and PAUSED trials, and
                # PAUSED trials will raise an error before the trial runner
                # tries again, so suggest a PENDING one right away.
                if not allow_recurse:
                    continue
                if any(t.status == Trial.PENDING for t in bracket.current_trials()):
                    return self.choose_trial_to_run(
                        trial_runner, allow_recurse=False
                    )

        return None
Example #7
0
    def on_trial_result(self, trial_runner: "trial_runner.TrialRunner",
                        trial: Trial, result: Dict) -> str:
        """Handle a result from ``trial`` and perturb when an interval ends.

        The result must contain both ``self._time_attr`` and
        ``self._metric``. If either is missing, the trial simply continues
        (after a one-time warning) unless ``self._require_attrs`` is set.

        Once at least ``self._perturbation_interval`` has passed since the
        trial's last perturbation, its state is saved and:

        * asynchronous mode (``self._synch`` false): the trial is perturbed
          immediately and is paused if any trial is PENDING or PAUSED;
        * synchronous mode: the trial is always paused. Perturbation of all
          trials happens only when no other trial has a ``last_train_time``
          below ``self._next_perturbation_sync``.

        Args:
            trial_runner: Runner that supplies the trial list.
            trial: Trial that produced the result.
            result: Result dict reported by the trial.

        Returns:
            ``TrialScheduler.CONTINUE`` or ``TrialScheduler.PAUSE``.

        Raises:
            RuntimeError: If ``self._require_attrs`` is set and the time
                attribute or the metric is missing from ``result``.
        """
        if self._time_attr not in result:
            time_missing_msg = "Cannot find time_attr {} " \
                               "in trial result {}. Make sure that this " \
                               "attribute is returned in the " \
                               "results of your Trainable.".format(
                                   self._time_attr, result)
            if self._require_attrs:
                # Leading space keeps the two sentences from running
                # together ("Trainable.If ...").
                raise RuntimeError(
                    time_missing_msg +
                    " If this error is expected, you can change this to "
                    "a warning message by "
                    "setting PBT(require_attrs=False)")
            elif log_once("pbt-time_attr-error"):
                logger.warning(time_missing_msg)
        if self._metric not in result:
            metric_missing_msg = "Cannot find metric {} in trial result {}. " \
                                 "Make sure that this attribute is returned " \
                                 "in the " \
                                 "results of your Trainable.".format(
                                     self._metric, result)
            if self._require_attrs:
                raise RuntimeError(
                    metric_missing_msg + " If this error is expected, "
                    "you can change this to a warning message by "
                    "setting PBT(require_attrs=False)")
            elif log_once("pbt-metric-error"):
                logger.warning(metric_missing_msg)

        if self._metric not in result or self._time_attr not in result:
            return TrialScheduler.CONTINUE

        time = result[self._time_attr]
        state = self._trial_state[trial]

        # Continue training if perturbation interval has not been reached yet.
        if time - state.last_perturbation_time < self._perturbation_interval:
            return TrialScheduler.CONTINUE  # avoid checkpoint overhead

        self._save_trial_state(state, time, result, trial)

        if not self._synch:
            state.last_perturbation_time = time
            lower_quantile, upper_quantile = self._quantiles()
            self._perturb_trial(trial, trial_runner, upper_quantile,
                                lower_quantile)
            # Yield time to other trials if any are waiting. A distinct
            # loop name avoids shadowing the ``trial`` parameter.
            if any(other.status in (Trial.PENDING, Trial.PAUSED)
                   for other in trial_runner.get_trials()):
                return TrialScheduler.PAUSE
            return TrialScheduler.CONTINUE

        # Synchronous mode.
        if any(self._trial_state[t].last_train_time <
               self._next_perturbation_sync and t != trial
               for t in trial_runner.get_trials()):
            logger.debug("Pausing trial {}".format(trial))
        else:
            # All trials are synced at the same timestep.
            lower_quantile, upper_quantile = self._quantiles()
            not_in_quantile = [
                t for t in trial_runner.get_trials()
                if t not in lower_quantile and t not in upper_quantile
            ]
            # Move upper quantile trials to beginning and lower quantile
            # to end. This ensures that checkpointing of strong trials
            # occurs before exploiting of weaker ones.
            ordered_trials = upper_quantile + not_in_quantile + lower_quantile
            for t in ordered_trials:
                logger.debug("Perturbing Trial {}".format(t))
                self._trial_state[t].last_perturbation_time = time
                self._perturb_trial(t, trial_runner, upper_quantile,
                                    lower_quantile)

            max_last_train_time = max(
                self._trial_state[t].last_train_time
                for t in trial_runner.get_trials())
            self._next_perturbation_sync = max(
                self._next_perturbation_sync + self._perturbation_interval,
                max_last_train_time)
        # In sync mode we should pause all trials once result comes in.
        # Once a perturbation step happens for all trials, they should
        # still all be paused.
        # choose_trial_to_run will then pick the next trial to run out of
        # the paused trials.
        return TrialScheduler.PAUSE