Example #1
    def has_resources(self, resources: Resources) -> bool:
        """Returns whether this runner has at least the specified resources.

        This refreshes the Ray cluster resources if the time since last update
        has exceeded self._refresh_period. This also assumes that the
        cluster is not resizing very frequently.
        """
        if resources.has_placement_group:
            return self._pg_manager.can_stage()

        self._update_avail_resources()
        currently_available = Resources.subtract(self._avail_resources,
                                                 self._committed_resources)
        have_space = (
            resources.cpu_total() <= currently_available.cpu
            and resources.gpu_total() <= currently_available.gpu
            and resources.memory_total() <= currently_available.memory
            and resources.object_store_memory_total() <=
            currently_available.object_store_memory and all(
                resources.get_res_total(res) <= currently_available.get(res)
                for res in resources.custom_resources))

        if have_space:
            # The assumption right now is that we block all trials if one
            # trial is queued.
            return True

        return False
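The check above boils down to a per-dimension fit test of the requested resources against the currently idle capacity. A minimal standalone sketch of that test (`fits` is a hypothetical helper that mirrors the expression in the example, assuming the same `Resources` API from `ray.tune.resources`):

from ray.tune.resources import Resources

def fits(request: Resources, available: Resources) -> bool:
    # Mirror of the comparison above: every requested dimension must fit
    # into the currently available capacity.
    return (request.cpu_total() <= available.cpu
            and request.gpu_total() <= available.gpu
            and request.memory_total() <= available.memory
            and request.object_store_memory_total()
            <= available.object_store_memory
            and all(request.get_res_total(r) <= available.get(r)
                    for r in request.custom_resources))

# e.g. a 2 CPU / 1 GPU request fits into 4 idle CPUs and 2 idle GPUs:
print(fits(Resources(cpu=2, gpu=1), Resources(cpu=4, gpu=2)))  # True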
Example #2
    def has_resources(self, resources: Resources) -> bool:
        """Returns whether this runner has at least the specified resources.

        This refreshes the Ray cluster resources if the time since last update
        has exceeded self._refresh_period. This also assumes that the
        cluster is not resizing very frequently.
        """
        if resources.has_placement_group:
            return self._pg_manager.can_stage()

        self._update_avail_resources()
        currently_available = Resources.subtract(self._avail_resources,
                                                 self._committed_resources)
        have_space = (
            resources.cpu_total() <= currently_available.cpu
            and resources.gpu_total() <= currently_available.gpu
            and resources.memory_total() <= currently_available.memory
            and resources.object_store_memory_total() <=
            currently_available.object_store_memory and all(
                resources.get_res_total(res) <= currently_available.get(res)
                for res in resources.custom_resources))

        if have_space:
            # The assumption right now is that we block all trials if one
            # trial is queued.
            self._trial_queued = False
            return True

        can_overcommit = self._queue_trials and not self._trial_queued
        if can_overcommit:
            self._trial_queued = True
            logger.warning(
                "Allowing trial to start even though the "
                "cluster does not have enough free resources. Trial actors "
                "may appear to hang until enough resources are added to the "
                "cluster (e.g., via autoscaling). You can disable this "
                "behavior by specifying `queue_trials=False` in "
                "ray.tune.run().")
            return True

        return False
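The second example differs from the first only in the overcommit branch: with queue_trials enabled and no trial queued yet, a single trial may start even though the cluster lacks free resources. As the warning text notes, the flag comes from ray.tune.run(); a minimal sketch of turning it off, assuming an older Ray Tune version where tune.run still accepts queue_trials:

from ray import tune

def trainable(config):
    # trivial trainable used only to illustrate the flag
    tune.report(score=0)

# queue_trials=False disables the overcommit branch shown above, so trials
# only start once the cluster actually has the requested resources.
tune.run(
    trainable,
    num_samples=2,
    resources_per_trial={"cpu": 1},
    queue_trials=False,
)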
Example #3
class FluidExecutor(TrialExecutor):
    def __init__(self, **kwargs):
        super().__init__(queue_trials=True)  # type: ignore

        # whether in testing environment without GPU
        self._fake_gpus = False

        # resources
        self._avail_resources = Resources(cpu=0, gpu=0)
        self._committed_resources = Resources(cpu=0, gpu=0)
        self._resources_initialized = False
        self._last_resource_refresh = float("-inf")
        # set of trials that have resources committed.
        # These are usually the trials in jobs_running, but a trial may be
        # in _trials_running and not in jobs_running because fetch_result
        # was called on it.
        # This is maintained solely by _commit_resources/_return_resources
        self._trials_running: Set[Trial] = set()

        # make sure our own GPU resources are created first in the cluster
        create_custom_gpu_res()
        self._update_avail_resources()

        logger.info(f"Init with resources: {self._avail_resources}")

        self.jobs_pending: List[PendingJob] = []

        # map from in_flight_future to the job
        self.jobs_running: Dict[ray.ObjectID, RunningJob] = {}

        # used to save the previous run future of paused trials
        self.jobs_paused: Dict[ray.ObjectID, RunningJob] = {}

        # async queue to stop runner
        self._trial_cleanup = _TrialCleanup()

        # metadata about each trial group
        self.trial_group_meta: List[TrialGroupMeta] = []
        # trial group assignment,
        # mapping from trial_id to group number
        self.trial_groups: Dict[str, TrialAndGroup] = {}

    @property
    def num_trial_groups(self) -> int:
        return len(self.trial_group_meta)

    def _detect_groups(self):
        """Go over pending jobs, and assign trialgroup to them if not already done.
        If new groups are discovered, otherwise run static
        """
        logger.debug(
            f"_detect_groups: self.jobs_pending={self.jobs_pending} self.trial_groups={self.trial_groups}"
        )
        # pending may already be assigned a group if it's an unpaused trial
        assigned, unassigned = partition(
            self.jobs_pending, lambda p: p.trial.trial_id in self.trial_groups)
        unassigned = list(unassigned)
        assigned = list(assigned)
        self.jobs_pending.clear()
        if unassigned:
            meta = TrialGroupMeta(
                self.num_trial_groups,
                unassigned,
            )
            self.trial_group_meta.append(meta)
            logger.debug("Assign group %d to unassigned trials: %s", meta.grp,
                         unassigned)
            for p in unassigned:
                self.trial_groups[p.trial.trial_id] = TrialAndGroup(
                    p.trial, meta.grp)
            # allocate resources
            self._fluid(meta)
        else:
            logger.debug("No new group")

        if assigned:
            # find each group with pending jobs and do dynamic
            groups = {self._find_group(p.trial) for p in assigned}
            for meta in groups:
                self._fluid(meta)
        else:
            logger.debug("No change in existing groups")

    def _dump_groups(self):
        """Dump group info for debugging"""
        logger.info("There are %d TrialGroup(s)", self.num_trial_groups)
        for grp in range(self.num_trial_groups):
            logger.info("TrialGroup %d", grp)
            for trial in self._trial_group(grp):
                if self._find_running(trial):
                    tag = "jobs_running"
                elif self._find_pending(trial):
                    tag = "jobs_pending"
                elif self._find_paused(trial):
                    tag = "jobs_paused"
                else:
                    tag = "none"
                logger.info("    Trial %s: [%s] queue [%s]", trial.trial_id,
                            trial.status, tag)
        logger.info("Idle Resources: %s",
                    self._resource_string(self.idle_resources))

    def _committed_resources_in_group(self, grp: int) -> Resources:
        """Compute all resources committed in this group"""
        used = Resources(cpu=0, gpu=0)
        for job in self.jobs_running.values():
            tg = self.trial_groups.get(job.trial.trial_id)
            if tg is not None and tg.group == grp:
                used = resources_add(used, job.trial.resources)
        return used

    def _fluid(self, meta: TrialGroupMeta):
        """Run fluid on a specific group"""
        self._dump_groups()
        # set of trials to consider
        A = {trial.trial_id for trial in self._trial_group(meta.grp)}
        logger.debug(
            f"_fluid: meta.perf.trials_missing_info={meta.perf.trials_missing_info} meta.trials={meta.trials}, meta.grp={meta.grp}, trial_groups={self.trial_groups}, A={A}"
        )
        # assignment of resources
        W: Dict[str, Resources] = {}
        # compute the new idle resources if every trial in this group were stopped
        M = resources_add(self.idle_resources,
                          self._committed_resources_in_group(meta.grp))

        if meta.perf.trials_missing_info:
            # there are still trials that need perf data,
            # restrict A to only these trials
            others = A.difference(meta.perf.trials_missing_info)
            A = meta.perf.trials_missing_info
            # set others to use 0 resource
            for tid in others:
                W[tid] = Resources(cpu=0, gpu=0)
            # use 1 gpu per trial to get reference perf
            for tid in A:
                r = Resources(cpu=1, gpu=1)
                Mp = Resources.subtract(M, r)
                if not Mp.is_nonnegative():
                    break
                M = Mp
                W[tid] = r
        else:
            # convert A to array for sorting
            A = np.array(list(A))
            # reference height (1 width)
            H1 = np.array([meta.perf.get_height(tid, 1) for tid in A])
            # sort by H1 in non-increasing order
            order = np.argsort(H1)[::-1]
            A = A[order]
            H1 = H1[order]
            # $$w_i = \min\left(\max\left(
            #     \left\lfloor \frac{h_{i,1}}{\sum_j h_{j,1}} n \right\rfloor,
            #     \frac{1}{c}\right), d\right)$$
            c = 1 / 2
            d = 4
            w = np.minimum(
                np.maximum(np.floor(H1 * np.size(H1) / np.sum(H1)), 1 / c), d)
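            # Illustrative example (assumed numbers): with H1 = [4, 2, 2] and
            # n = 3 trials, H1 * n / sum(H1) = [1.5, 0.75, 0.75]; flooring
            # gives [1, 0, 0], clamping below by 1/c = 2 and above by d = 4
            # yields w = [2, 2, 2] before the GPU normalization below.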
            # assign resources based on w
            w = w / w.sum() * self._avail_resources.gpu_total()
            resW = [Resources(cpu=1, gpu=g) for g in w]
            # write to W
            W = dict(zip(A, resW))

        self._ensure_W(W, meta)

    def _ensure_W(self, W: Dict[str, Resources], meta: TrialGroupMeta):
        """Adjust group resources given in W"""
        logger.debug(f"ensure_W: W={W} meta.trials={meta.trials}")
        # stop any trials with 0 res
        # this has to be done first to free up resources for others to use
        for trial_id, res in W.items():
            trial = self.trial_groups[trial_id].trial
            if res.cpu_total() + res.gpu_total() == 0:
                # add to paused, then ensure_stop; we do not change the trial's
                # status, which is visible from the outside
                running = self._find_running(trial)
                if running is not None:
                    # don't call pause_trial, which would trigger another fluid reschedule
                    self.jobs_paused[running.in_flight_future] = running
                self._ensure_stop(trial)
                trial.resources = res
                # add to pending
                self.start_trial(trial)
        # adjust any trials with different res, including any not already running
        for trial_id, res in W.items():
            # use trial group to map trial_id to trial
            trial = self.trial_groups[trial_id].trial

            if res.cpu_total() + res.gpu_total() == 0:
                # already handled in the loop above
                continue

            if (
                    # restart only when one allocation strictly dominates the
                    # other; equal allocations fall through unchanged
                    Resources.subtract(trial.resources, res).is_nonnegative()
                    != Resources.subtract(res,
                                          trial.resources).is_nonnegative()):
                running = self._find_running(trial)
                if running is not None:
                    # don't call pause_trial, which will trigger another fluid reschedule
                    self.jobs_paused[running.in_flight_future] = running

                self._ensure_stop(trial)

            # at this point, the job is always stopped but not in the pending queue,
            # because fluid clears the pending queue.
            trial.resources = res
            self._kickoff(PendingJob(trial, None, True), res)

    def _find_group(self, trial: Trial) -> TrialGroupMeta:
        return self.trial_group_meta[self.trial_groups[trial.trial_id].group]

    def _trial_group(self, grp: int) -> List[Trial]:
        return [v.trial for v in self.trial_groups.values() if v.group == grp]

    def _find_paused(self, trial: Trial) -> Optional[RunningJob]:
        for job in self.jobs_paused.values():
            if job.trial == trial:
                return job

    def _pop_paused(self, trial: Trial) -> Optional[RunningJob]:
        for fut, job in self.jobs_paused.items():
            if job.trial == trial:
                assert fut == job.in_flight_future
                return self.jobs_paused.pop(fut)

    def _find_running(self, trial: Trial) -> Optional[RunningJob]:
        for _, job in self.jobs_running.items():
            if job.trial == trial:
                return job
        logger.debug(
            f"Cloud not find running trial: {trial}, currently running ones are {[job for _, job in self.jobs_running.items()]}"
        )

    def _find_pending(self, trial: Trial) -> Optional[PendingJob]:
        for job in self.jobs_pending:
            if job.trial == trial:
                return job

    def _setup_remote_runner(self, trial: Trial, res: Resources,
                             reuse_allowed: bool) -> Any:
        trial.init_logger()
        # We checkpoint metadata here to try mitigating logdir duplication
        self.try_checkpoint_metadata(trial)
        remote_logdir = trial.logdir

        cls = ray.remote(
            num_cpus=res.cpu,
            num_gpus=0 if self._fake_gpus else res.gpu,
            memory=res.memory,
            object_store_memory=res.object_store_memory,
            resources=res.custom_resources,
        )(trial.get_trainable_cls())

        def logger_creator(config):
            # Set the working dir in the remote process, for user file writes
            os.makedirs(remote_logdir, exist_ok=True)
            if ray.worker._mode() != ray.worker.LOCAL_MODE:
                os.chdir(remote_logdir)
            return NoopLogger(config, remote_logdir)

        # Clear the Trial's location (to be updated later on result)
        # since we don't know where the remote runner is placed.
        trial.set_location(Location())
        logger.debug("Trial %s: Setting up new remote runner.", trial)
        # Logging for trials is handled centrally by TrialRunner, so
        # configure the remote runner to use a noop-logger.
        trial_config = copy.deepcopy(trial.config)
        trial_config[TRIAL_INFO] = TrialInfo(trial)
        kwargs = {
            "config": trial_config,
            "logger_creator": logger_creator,
        }
        if issubclass(trial.get_trainable_cls(), DurableTrainable):
            kwargs["remote_checkpoint_dir"] = trial.remote_checkpoint_dir

        with _change_working_directory(trial):
            return cls.remote(**kwargs)

    def _kickoff(self, pending: PendingJob,
                 res: Resources) -> Optional[RunningJob]:
        """Turn a pending job into a running one
        The pending job may be previously paused, or completely new.
        If paused, there will be a running job saved in the jobs_paused queue

        May return None if failed to start
        """
        trial = pending.trial
        # this is needed for the Trainer to set up distributed training
        # TODO: figure out what config key is also needed to set resource info
        trial.resources = res

        self._commit_resources(trial)
        try:
            reuse_allowed = (pending.checkpoint is not None
                             or trial.has_checkpoint())
            runner = self._setup_remote_runner(trial, res, reuse_allowed)
            trial.set_runner(runner)
            restore_job = self._restore(trial, pending.checkpoint)

            # the trial's status is already RUNNING (set in start_trial),
            # so from the outside it looks like a running trial

            # if the trial was previously paused
            prev_run = self._pop_paused(trial)
            if prev_run is not None:
                if restore_job is not None:
                    logger.error(
                        "A previously paused job is restoring; blocking on restore"
                    )
                    ray.get(restore_job.in_flight_future)
                # add back to running queue
                self.jobs_running[prev_run.in_flight_future] = prev_run
                return prev_run

            # if the trial is restoring
            if trial.is_restoring:
                # assert restore_job is not None
                return restore_job

            # actually start train op
            return self._ensure_train(trial)
        except Exception as e:
            if isinstance(e, AbortTrialExecution):
                logger.exception("Trial %s: Error starting runner, aborting!",
                                 trial)
            else:
                logger.exception("Trial %s: Unexpected error starting runner.",
                                 trial)
            time.sleep(2)
            error_msg = traceback.format_exc()
            self._ensure_stop(
                trial,
                error=True,
                error_msg=error_msg,
                stop_logger=True,
                # NOTE that we don't return the resources, since they may have been lost.
                release_resources=False,
                update_status=True,
            )

    def _ensure_train(self, trial: Trial) -> RunningJob:
        """Actually invoke the train op on the runner"""
        assert trial.runner is not None
        with _change_working_directory(trial):
            fut = trial.runner.train.remote()

        if isinstance(fut, dict):
            # local mode
            fut = _LocalWrapper(fut)
        running = RunningJob(trial, fut)
        self.jobs_running[fut] = running
        logger.debug(
            f"Set trial to running: {trial}, jobs_running={self.jobs_running}")
        return running

    def _ensure_stop(
        self,
        trial,
        error=False,
        error_msg="",
        stop_logger=True,
        release_resources=True,
        update_status=False,
    ):
        """Stops the trial and its logger
        Handles any error
        """
        logger.debug(f"_ensure_stop: trial.resources={trial.resources}")
        if stop_logger:
            trial.close_logger()

        prior_status = trial.status
        trial.set_location(Location())
        if update_status:
            self.set_status(trial, Trial.ERROR if error else Trial.TERMINATED)

        # remove from running
        in_flight = [
            j for _, j in self.jobs_running.items() if j.trial == trial
        ]
        for j in in_flight:
            self.jobs_running.pop(j.in_flight_future)
        if in_flight:
            if prior_status not in [Trial.RUNNING, Trial.ERROR]:
                assert False, "trial status invalid"
        # release resources
        if release_resources:
            self._return_resources(trial)

        # remove from trial group
        # del self.trial_groups[trial.trial_id]

        try:
            trial.write_error_log(error_msg)
            if hasattr(trial, "runner") and trial.runner:
                logger.debug("Trial %s: Destroying actor.", trial)
                with _change_working_directory(trial):
                    self._trial_cleanup.add(trial, actor=trial.runner)
        except Exception:
            logger.exception("Trial %s: Error stopping runner.", trial)
            self.set_status(trial, Trial.ERROR)
        finally:
            trial.set_runner(None)

    def has_resources(self, resources):
        """Tell the schedule algorithm to always submit trials to us"""
        return True

    def start_trial(self, trial, checkpoint=None, train=True):
        """Add to pending queue and reschedule"""
        logger.debug("start_trial %s", trial)
        # the trial is considered by the outside to be running
        self.set_status(trial, Trial.RUNNING)
        self.jobs_pending.append(PendingJob(trial, checkpoint, train))
        # The actual triggering is done in on_no_available_trials()

    def stop_trial(self, trial, error=False, error_msg=None, stop_logger=True):
        """Add to to-stop queue and reschedule"""
        logger.debug("stop_trial %s", trial)
        self._ensure_stop(trial,
                          error,
                          error_msg,
                          stop_logger,
                          update_status=True)
        meta = self._find_group(trial)
        self._fluid(meta)

    def continue_training(self, trial):
        # this is called after results have been fetched from a trial,
        # and should start another train op in place of the
        # finished one.
        running_job = self._find_running(trial)
        if running_job is not None:
            # skip if the trial is running
            return

        # start new one
        self._ensure_train(trial)

    def pause_trial(self, trial):
        logger.debug("pause_trial %s", trial)
        running = self._find_running(trial)
        if running is not None:
            # add to jobs_paused
            self.jobs_paused[running.in_flight_future] = running
        # the super impl will call stop_trial, which will then remove the job
        # from the running queue
        super().pause_trial(trial)

    def unpause_trial(self, trial):
        logger.debug("unpause_trial %s", trial)
        super().unpause_trial(trial)

    def resume_trial(self, trial):
        """Resumes PAUSED trials. This is a blocking call.
        This is not used by any algorithm
        """
        logger.debug("resume_trial %s", trial)
        assert trial.status == Trial.PAUSED, trial.status
        raise NotImplementedError

    def reset_trial(self, trial: Trial, new_config, new_experiment_tag):
        """Tries to invoke `Trainable.reset_config()` to reset trial.

        Args:
            trial (Trial): Trial to be reset.
            new_config (dict): New configuration for Trial trainable.
            new_experiment_tag (str): New experiment name for trial.

        Returns:
            True if `reset_config` is successful else False.
        """
        logger.debug("reset_trial %s", trial)
        trial.experiment_tag = new_experiment_tag
        trial.config = new_config
        trainable = trial.runner
        with _change_working_directory(trial):
            try:
                reset_val = ray.get(trainable.reset_config.remote(new_config),
                                    DEFAULT_GET_TIMEOUT)
            except RayTimeoutError:
                logger.exception("Trial %s: reset_config timed out.", trial)
                return False
        return reset_val

    def get_running_trials(self):
        return [job.trial for job in self.jobs_running.values()]

    def get_next_available_trial(self) -> Trial:
        """Return the next trial with ready result.
        Note that this doesn't remove the trial from running, fetch_result does that
        """
        futures = list(self.jobs_running.keys())
        # shuffle the list of futures because ray.wait
        # always returns the first available future, but we want to be fair
        random.shuffle(futures)
        [ready_fut], _ = ray.wait(futures, num_returns=1)
        return self.jobs_running[ready_fut].trial

    def get_next_failed_trial(self) -> Optional[Trial]:
        if ray.worker._mode() == ray.worker.LOCAL_MODE:
            return None

        alive_node_ips = {
            node["NodeManagerAddress"]
            for node in ray.state.nodes() if node["alive"]
        }
        for trial in self.get_running_trials():
            if trial.node_ip and trial.node_ip not in alive_node_ips:
                return trial
        return None

    def fetch_result(self, trial):
        """
        Note that this will remove the trial from the running queue, so
        continue_training/stop/pause must be called afterwards to keep the
        system state consistent.

        This is usually called from the runner, knowing that the future for
        this trial is ready.
        """
        running_job = self._find_running(trial)
        assert running_job, "Trial was not running"
        self.jobs_running.pop(running_job.in_flight_future)
        result = ray.get(running_job.in_flight_future, DEFAULT_GET_TIMEOUT)
        if isinstance(result, _LocalWrapper):
            result = result.unwrap()

        if isinstance(result, dict):
            # notify trial group
            meta = self._find_group(trial)
            meta.perf.on_trial_result(trial.trial_id, result)
        return result

    def debug_string(self):
        # TODO debug_string
        pass

    def _resource_string(self, res: Resources) -> str:
        """Returns a string describing the total resources available."""
        res_str = (f"{res.cpu} CPUs, {res.gpu} GPUs, "
                   f"{_to_gb(res.memory)} GiB heap, "
                   f"{_to_gb(res.object_store_memory)} GiB objects")
        if res.custom_resources:
            custom = ", ".join(f"{res.get_res_total(name)} {name}"
                               for name in res.custom_resources)
            res_str += f" ({custom})"
        return res_str

    def save(self, trial, storage=Checkpoint.PERSISTENT, result=None):
        """Saves the trial's state to a checkpoint asynchronously.

        Args:
            trial (Trial): The trial to be saved.
            storage (str): Where to store the checkpoint. Defaults to
                PERSISTENT.
            result (dict): The state of this trial as a dictionary to be saved.
                If result is None, the trial's last result will be used.

        Returns:
             Checkpoint object, or None if an Exception occurs.
        """
        result = result or trial.last_result
        with _change_working_directory(trial):
            if storage == Checkpoint.MEMORY:
                value = trial.runner.save_to_object.remote()
                checkpoint = Checkpoint(storage, value, result)
                trial.on_checkpoint(checkpoint)
            else:
                value = trial.runner.save.remote()
                checkpoint = Checkpoint(storage, value, result)
                trial.saving_to = checkpoint
                self.jobs_running[value] = RunningJob(trial, value)
        return checkpoint

    def _restore(self,
                 trial,
                 checkpoint=None,
                 block=False) -> Optional[RunningJob]:
        """Restores training state from a given model checkpoint.

        Args:
            trial (Trial): The trial to be restored.
            checkpoint (Checkpoint): The checkpoint to restore from. If None,
                the most recent PERSISTENT checkpoint is used. Defaults to
                None.
            block (bool): Whether or not to block on restore before returning.

        Raises:
            RuntimeError: This error is raised if no runner is found.
            AbortTrialExecution: This error is raised if the trial is
                ineligible for restoration, given the Tune input arguments.
        """
        if checkpoint is None or checkpoint.value is None:
            checkpoint = trial.checkpoint
        if checkpoint.value is None:
            return
        if trial.runner is None:
            raise RuntimeError(
                "Trial {}: Unable to restore - no runner found.".format(trial))
        value = checkpoint.value
        if checkpoint.storage == Checkpoint.MEMORY:
            logger.debug("Trial %s: Attempting restore from object", trial)
            # Note that we don't store the remote since in-memory checkpoints
            # don't guarantee fault tolerance and don't need to be waited on.
            with _change_working_directory(trial):
                trial.runner.restore_from_object.remote(value)
        else:
            logger.debug("Trial %s: Attempting restore from %s", trial, value)
            if issubclass(trial.get_trainable_cls(), DurableTrainable):
                with _change_working_directory(trial):
                    remote = trial.runner.restore.remote(value)
            elif trial.sync_on_checkpoint:
                # This provides FT backwards compatibility in the
                # case where a DurableTrainable is not provided.
                logger.warning("Trial %s: Reading checkpoint into memory.",
                               trial)
                data_dict = TrainableUtil.pickle_checkpoint(value)
                with _change_working_directory(trial):
                    remote = trial.runner.restore_from_object.remote(data_dict)
            else:
                raise AbortTrialExecution(
                    "Pass in `sync_on_checkpoint=True` for driver-based trial"
                    "restoration. Pass in an `upload_dir` and a Trainable "
                    "extending `DurableTrainable` for remote storage-based "
                    "restoration")

            if block:
                ray.get(remote)
            else:
                trial.restoring_from = checkpoint
                running_job = RunningJob(trial, remote)
                self.jobs_running[remote] = running_job
                return running_job

    def restore(self, trial, checkpoint=None, block=False):
        return self._restore(trial, checkpoint, block)

    def export_trial_if_needed(self, trial: Trial):
        """Exports model of this trial based on trial.export_formats.

        Returns:
            A dict that maps ExportFormats to successfully exported models.
        """
        if trial.export_formats and len(trial.export_formats) > 0:
            with _change_working_directory(trial):
                return ray.get(
                    trial.runner.export_model.remote(trial.export_formats),
                    DEFAULT_GET_TIMEOUT,
                )
        return {}

    def cleanup(self):
        self._trial_cleanup.cleanup(partial=False)

    def on_step_begin(self, trial_runner):
        """Before step() called, update the available resources."""
        self._update_avail_resources()

    def _update_avail_resources(self, num_retries=5):
        resources = None
        for i in range(num_retries):
            if i > 0:
                logger.warning(
                    "Cluster resources not detected or are 0. Attempt #"
                    "%s...", i + 1)
                time.sleep(0.5)
            try:
                resources = ray.cluster_resources()
            except Exception:
                # TODO(rliaw): Remove this when local mode is fixed.
                # https://github.com/ray-project/ray/issues/4147
                logger.debug("Using resources for local machine.")
                resources = ResourceSpec().resolve(True).to_resource_dict()
            if resources:
                break

        if not resources:
            # NOTE: This hides the possibility that Ray may be waiting for
            # clients to connect.
            resources = {}
            resources.setdefault("CPU", 0)
            resources.setdefault("GPU", 0)
            logger.warning("Cluster resources cannot be detected or are 0. "
                           "You can resume this experiment by passing in "
                           "`resume=True` to `run`.")

        resources = resources.copy()
        num_cpus = resources.pop("CPU", 0)
        num_gpus = resources.pop("GPU", 0)
        memory = ray_constants.from_memory_units(resources.pop("memory", 0))
        object_store_memory = ray_constants.from_memory_units(
            resources.pop("object_store_memory", 0))
        custom_resources = resources

        if num_gpus == 0:
            warnings.warn(
                "No GPU resources found, assuming local test, using CPU resources instead"
            )
            # local test
            num_gpus = num_cpus
            self._fake_gpus = True
        else:
            self._fake_gpus = False

        avail_resources = Resources(
            int(num_cpus),
            int(num_gpus),
            memory=int(memory),
            object_store_memory=int(object_store_memory),
            custom_resources=custom_resources,
        )

        assert (self.idle_resources.is_nonnegative()
                ), "Cluster removed resources from running trials!"

        self._avail_resources = avail_resources
        self._last_resource_refresh = time.time()
        self._resources_initialized = True

    @property
    def idle_resources(self) -> Resources:
        return Resources.subtract(self._avail_resources,
                                  self._committed_resources)

    def _commit_resources(self, trial: Trial):
        resources = trial.resources
        self._trials_running.add(trial)

        committed = self._committed_resources
        all_keys = set(resources.custom_resources).union(
            set(committed.custom_resources))

        custom_resources = {
            k: committed.get(k) + resources.get_res_total(k)
            for k in all_keys
        }

        self._committed_resources = Resources(
            committed.cpu + resources.cpu_total(),
            committed.gpu + resources.gpu_total(),
            committed.memory + resources.memory_total(),
            committed.object_store_memory +
            resources.object_store_memory_total(),
            custom_resources=custom_resources,
        )
        logger.debug(
            f"Committed res={resources} -> {self._committed_resources}")

    def _return_resources(self, trial: Trial):
        if trial not in self._trials_running:
            return
        logger.debug("Trial %s: Returning resources.", trial)
        self._trials_running.remove(trial)
        resources = trial.resources

        committed = self._committed_resources

        all_keys = set(resources.custom_resources).union(
            set(committed.custom_resources))

        custom_resources = {
            k: committed.get(k) - resources.get_res_total(k)
            for k in all_keys
        }
        self._committed_resources = Resources(
            committed.cpu - resources.cpu_total(),
            committed.gpu - resources.gpu_total(),
            committed.memory - resources.memory_total(),
            committed.object_store_memory -
            resources.object_store_memory_total(),
            custom_resources=custom_resources,
        )

        assert (self._committed_resources.is_nonnegative()
                ), "Resource invalid: {} - {} = {}".format(
                    committed, resources, self._committed_resources)

    def on_no_available_trials(self, trial_runner):
        """This is called when we get all trial from a batch from the search algo"""
        logger.debug("on_no_available_trials")
        self._detect_groups()
        super().on_no_available_trials(trial_runner)
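For completeness, a hypothetical wiring sketch showing how an executor like the one in Example #3 could be handed to Tune. It assumes FluidExecutor is defined in (or importable into) the same module, and that the Ray Tune version in use still accepts the advanced trial_executor argument on tune.run:

from ray import tune

def trainable(config):
    tune.report(score=config["x"])

# FluidExecutor here is the class from Example #3, assumed to be in scope;
# trial_executor is an advanced tune.run argument in older Ray Tune versions.
tune.run(
    trainable,
    config={"x": tune.grid_search([1, 2, 3])},
    trial_executor=FluidExecutor(),
)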