Code Example #1
def on_no_available_trials(self, trial_runner):
    if self._queue_trials:
        return
    for trial in trial_runner.get_trials():
        if trial.uses_placement_groups:
            return
        if trial.status == Trial.PENDING:
            if not self.has_resources_for_trial(trial):
                resource_string = trial.resources.summary_string()
                trial_resource_help_msg = trial.get_trainable_cls(
                ).resource_help(trial.config)
                autoscaling_msg = ""
                if is_ray_cluster():
                    autoscaling_msg = (
                        "Pass `queue_trials=True` in ray.tune.run() or "
                        "on the command line to queue trials until the "
                        "cluster scales up or resources become available. ")
                raise TuneError(
                    "Insufficient cluster resources to launch trial: "
                    f"trial requested {resource_string}, but the cluster "
                    f"has only {self.resource_string()}. "
                    f"{autoscaling_msg}"
                    f"{trial_resource_help_msg} ")
        elif trial.status == Trial.PAUSED:
            raise TuneError("There are paused trials, but no more pending "
                            "trials with sufficient resources.")
Code Example #2
def _may_warn_insufficient_resources(self, all_trials):
    # This is approximately saying we are not making progress.
    if len(all_trials) == self._all_trials_size:
        if self._no_running_trials_since == -1:
            self._no_running_trials_since = time.monotonic()
        elif (time.monotonic() - self._no_running_trials_since >
              _get_insufficient_resources_warning_threshold()):
            if not is_ray_cluster():  # autoscaler not enabled
                # If any of the pending trials cannot be fulfilled, that is
                # a good enough hint that trial resources are insufficient.
                for trial in all_trials:
                    if (trial.status is Trial.PENDING
                            and not _can_fulfill_no_autoscaler(trial)):
                        # TODO(xwjiang):
                        #  Raise an Error once #18608 is resolved.
                        logger.warning(
                            _get_insufficient_resources_error_msg(trial))
                        break
            else:
                # TODO(xwjiang): #17799.
                #  Output a more helpful msg for autoscaler.
                logger.warning(_get_insufficient_resources_warning_msg())
            self._no_running_trials_since = time.monotonic()
    else:
        self._no_running_trials_since = -1
    self._all_trials_size = len(all_trials)
Code Example #3
def on_no_available_trials(self, all_trials):
    """Tracks information across the life of the Tune loop and guesses
    whether the loop is stuck due to infeasible resources.
    If so, outputs warning messages.
    The logic should be conservative, non-intrusive and informative.
    For example, rate limiting is applied so that the messages are not
    spammy.
    """
    # This is approximately saying we are not making progress.
    if len(all_trials) == self._last_trial_num:
        if self._no_running_trials_since == -1:
            self._no_running_trials_since = time.monotonic()
        elif (time.monotonic() - self._no_running_trials_since >
              _get_insufficient_resources_warning_threshold()):
            if not is_ray_cluster():  # autoscaler not enabled
                # If any of the pending trials cannot be fulfilled, that is
                # a good enough hint that trial resources are insufficient.
                for trial in all_trials:
                    if (trial.status is Trial.PENDING
                            and not _can_fulfill_no_autoscaler(trial)):
                        # TODO(xwjiang):
                        #  Raise an Error once #18608 is resolved.
                        logger.warning(
                            _get_insufficient_resources_error_msg(trial))
                        break
            else:
                # TODO(xwjiang): #17799.
                #  Output a more helpful msg for autoscaler.
                logger.warning(_get_insufficient_resources_warning_msg())
            self._no_running_trials_since = time.monotonic()
    else:
        self._no_running_trials_since = -1
    self._last_trial_num = len(all_trials)
Code Example #4
def _get_warning_threshold() -> float:
    if is_ray_cluster():
        return float(
            os.environ.get(
                "TUNE_WARN_INSUFFICENT_RESOURCE_THRESHOLD_S_AUTOSCALER", "60"))
    else:
        return float(
            os.environ.get("TUNE_WARN_INSUFFICENT_RESOURCE_THRESHOLD_S", "1"))
Code Example #5
def _get_insufficient_resources_warning_threshold() -> float:
    if is_ray_cluster():
        return float(
            os.environ.get(
                "TUNE_WARN_INSUFFICENT_RESOURCE_THRESHOLD_S_AUTOSCALER", "60"))
    else:
        # Set the default to 10s so that we don't prematurely determine that
        # a cluster cannot fulfill the resources requirements.
        return float(
            os.environ.get("TUNE_WARN_INSUFFICENT_RESOURCE_THRESHOLD_S", "10"))
Code Example #6
def _get_insufficient_resources_warning_msg() -> str:
    msg = (
        f"No trial is running and no new trial has been started within"
        f" at least the last "
        f"{_get_insufficient_resources_warning_threshold()} seconds. "
        f"This could be due to the cluster not having enough "
        f"resources available to start the next trial. "
        f"Stop the tuning job and adjust the resources requested per trial "
        f"(possibly via `resources_per_trial` or via `num_workers` for rllib) "
        f"and/or add more resources to your Ray runtime.")
    if is_ray_cluster():
        return "Ignore this message if the cluster is autoscaling. " + msg
    else:
        return msg
Code Example #7
File: ray_trial_executor.py  Project: srikalyan/ray
    def __init__(self,
                 queue_trials=None,
                 reuse_actors=False,
                 ray_auto_init=None,
                 refresh_period=RESOURCE_REFRESH_PERIOD):
        if queue_trials is None:
            if os.environ.get("TUNE_DISABLE_QUEUE_TRIALS") == "1":
                logger.info("'TUNE_DISABLE_QUEUE_TRIALS=1' detected.")
                queue_trials = False
            elif is_ray_cluster():
                queue_trials = True

        if ray_auto_init is None:
            if os.environ.get("TUNE_DISABLE_AUTO_INIT") == "1":
                logger.info("'TUNE_DISABLE_AUTO_INIT=1' detected.")
                ray_auto_init = False
            else:
                ray_auto_init = True

        super(RayTrialExecutor, self).__init__(queue_trials)
        # Check whether we are launching a trial without resources in order
        # to kick off the autoscaler.
        self._trial_queued = False
        self._running = {}
        # Resuming a paused trial should not call trial.train.remote() again
        # (no new remote object ref should be generated), so paused trials
        # are stored in self._paused.
        self._paused = {}

        self._trial_cleanup = _TrialCleanup()
        self._reuse_actors = reuse_actors
        self._cached_actor = None

        self._avail_resources = Resources(cpu=0, gpu=0)
        self._committed_resources = Resources(cpu=0, gpu=0)
        self._resources_initialized = False
        self._refresh_period = refresh_period
        self._last_resource_refresh = float("-inf")
        self._last_nontrivial_wait = time.time()
        if not ray.is_initialized() and ray_auto_init:
            logger.info("Initializing Ray automatically."
                        "For cluster usage or custom Ray initialization, "
                        "call `ray.init(...)` before `tune.run`.")
            ray.init()

        if ray.is_initialized():
            self._update_avail_resources()
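The constructor above only consults the two environment variables when the corresponding arguments are left at None. A minimal sketch of forcing both defaults off through those variables (any value other than "1" is ignored by the checks above):

import os

os.environ["TUNE_DISABLE_QUEUE_TRIALS"] = "1"  # queue_trials defaults to False
os.environ["TUNE_DISABLE_AUTO_INIT"] = "1"  # no automatic ray.init() call
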
Code Example #8
def _get_warning_msg() -> str:
    if is_ray_cluster():
        return (
            f"If autoscaler is still scaling up, ignore this message. No "
            f"trial is running and no new trial has been started within at "
            f"least the last {_get_warning_threshold()} seconds. "
            f"This could be due to the cluster not having enough "
            f"resources available to start the next trial. Please stop the "
            f"tuning job and readjust resources_per_trial argument passed "
            f"into tune.run() as well as max_workers and worker_nodes "
            f"InstanceType specified in cluster.yaml.")
    else:
        return (f"No trial is running and no new trial has been started within"
                f" at least the last {_get_warning_threshold()} seconds. "
                f"This could be due to the cluster not having enough "
                f"resources available to start the next trial. Please stop "
                f"the tuning job and readjust resources_per_trial argument "
                f"passed into tune.run() and/or start a cluster with more "
                f"resources.")
Code Example #9
def main():
    os.environ["TUNE_DISABLE_AUTO_CALLBACK_LOGGERS"] = "1"  # Tweak

    ray.init(address="auto")

    num_samples = 1000
    results_per_second = 0.5
    trial_length_s = 100

    max_runtime = 120

    if is_ray_cluster():
        # Add constant overhead for SSH connection
        max_runtime = 120

    timed_tune_run(name="result throughput cluster",
                   num_samples=num_samples,
                   results_per_second=results_per_second,
                   trial_length_s=trial_length_s,
                   max_runtime=max_runtime,
                   sync_config=tune.SyncConfig(sync_to_driver=False))  # Tweak!
Code Example #10
def main():
    ray.init(address="auto")

    num_samples = 1000

    sleep_time = 0.1
    num_iters = 300

    expected_run_time = num_iters * sleep_time

    # Allow minimum of 20 % overhead (or 10 seconds for short runs)
    expected_run_time += max(expected_run_time * 0.2, 10.)

    if is_ray_cluster():
        # Add constant overhead for SSH connection
        expected_run_time += 0.3 * num_samples

    start_time = time.time()
    tune.run(
        my_naive_trainable,
        config={
            "score": tune.uniform(0., 1.),
            "num_iters": num_iters,
            "sleep_time": sleep_time
        },
        reuse_actors=True,
        verbose=2,
        num_samples=num_samples)
    time_taken = time.time() - start_time

    assert time_taken < expected_run_time, \
        f"The buffering test took {time_taken:.2f} seconds, but should not " \
        f"have exceeded {expected_run_time:.2f} seconds. Test failed."

    print(f"The buffering test took {time_taken:.2f} seconds, which "
          f"is below the budget of {expected_run_time:.2f} seconds. "
          f"Test successful.")