def testSubtraction(self):
    resource_1 = Resources(
        1,
        0,
        0,
        1,
        custom_resources={"a": 1, "b": 2},
        extra_custom_resources={"a": 1, "b": 1},
    )
    resource_2 = Resources(
        1,
        0,
        0,
        1,
        custom_resources={"a": 1, "b": 2},
        extra_custom_resources={"a": 1, "b": 1},
    )
    new_res = Resources.subtract(resource_1, resource_2)
    self.assertTrue(new_res.cpu == 0)
    self.assertTrue(new_res.gpu == 0)
    self.assertTrue(new_res.extra_cpu == 0)
    self.assertTrue(new_res.extra_gpu == 0)
    self.assertTrue(all(k == 0 for k in new_res.custom_resources.values()))
    self.assertTrue(
        all(k == 0 for k in new_res.extra_custom_resources.values()))
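# The test above relies on Resources.subtract being element-wise across all
# fields, including custom and extra custom resources. A minimal dict-based
# sketch of that semantics (hypothetical helper, not Ray's implementation):
def subtract_resource_dicts(original, to_remove):
    """Element-wise subtraction of two flat resource dicts."""
    keys = set(original) | set(to_remove)
    return {k: original.get(k, 0) - to_remove.get(k, 0) for k in keys}

assert subtract_resource_dicts(
    {"cpu": 1, "gpu": 0, "a": 1, "b": 2},
    {"cpu": 1, "gpu": 0, "a": 1, "b": 2},
) == {"cpu": 0, "gpu": 0, "a": 0, "b": 0}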
def default_resource_request(cls, config):
    cf = dict(cls._default_config, **config)
    Trainer._validate_config(cf)
    num_workers = cf["num_workers"] + cf["evaluation_num_workers"]
    # TODO(ekl): add custom resources here once tune supports them
    return Resources(
        cpu=cf["num_cpus_for_driver"],
        gpu=cf["num_gpus"],
        memory=cf["memory"],
        object_store_memory=cf["object_store_memory"],
        extra_cpu=cf["num_cpus_per_worker"] * num_workers,
        extra_gpu=cf["num_gpus_per_worker"] * num_workers,
        extra_memory=cf["memory_per_worker"] * num_workers,
        extra_object_store_memory=cf["object_store_memory_per_worker"] *
        num_workers)
def _to_pg_factory(
    resources: Optional[Resources],
    placement_group_factory: Optional[PlacementGroupFactory],
) -> PlacementGroupFactory:
    """Outputs the resource requirement in the form of a PGF.

    If `placement_group_factory` is None, `resources` will be converted to
    a PGF. If this is unsuccessful, an error will be raised.
    """
    if not placement_group_factory:
        if not resources:
            resources = Resources(cpu=1, gpu=0)
        placement_group_factory = resource_dict_to_pg_factory(resources)

    return placement_group_factory
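# A simplified illustration of the kind of mapping resource_dict_to_pg_factory
# performs: a bundle holding the trial's CPU/GPU request. The real Ray helper
# may split extra/worker resources into additional bundles; this sketch only
# shows the single-bundle case and is not the actual implementation.
def resources_to_bundles(cpu, gpu, custom=None):
    bundle = {"CPU": cpu, "GPU": gpu}
    bundle.update(custom or {})
    # Keep only non-zero entries so the bundle lists just what is requested.
    return [{k: v for k, v in bundle.items() if v}]

assert resources_to_bundles(cpu=1, gpu=0) == [{"CPU": 1}]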
def default_resource_request(
        cls, config: PartialTrainerConfigDict) -> Resources:
    cnf = dict(cls.options.rllib_defaults, **config)
    cls._validate_config(cnf)
    num_workers = cnf["num_workers"] + cnf["evaluation_num_workers"]
    return Resources(
        cpu=cnf["num_cpus_for_driver"],
        gpu=cnf["num_gpus"],
        memory=cnf["memory"],
        object_store_memory=cnf["object_store_memory"],
        extra_cpu=cnf["num_cpus_per_worker"] * num_workers,
        extra_gpu=cnf["num_gpus_per_worker"] * num_workers,
        extra_memory=cnf["memory_per_worker"] * num_workers,
        extra_object_store_memory=cnf["object_store_memory_per_worker"]
        * num_workers,
    )
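# Worked example of the arithmetic above under a hypothetical config: with
# 2 rollout workers plus 1 evaluation worker and 1 CPU each, the trial
# requests 1 driver CPU plus 3 extra CPUs for the workers.
example_cnf = {
    "num_workers": 2,
    "evaluation_num_workers": 1,
    "num_cpus_for_driver": 1,
    "num_cpus_per_worker": 1,
}
example_num_workers = (
    example_cnf["num_workers"] + example_cnf["evaluation_num_workers"]
)
assert example_cnf["num_cpus_for_driver"] == 1
assert example_cnf["num_cpus_per_worker"] * example_num_workers == 3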
def testMultiStepRun(self):
    ray.init(num_cpus=4, num_gpus=2)
    kwargs = {
        "stopping_criterion": {"training_iteration": 5},
        "resources": Resources(cpu=1, gpu=1),
    }
    trials = [Trial("__fake", **kwargs), Trial("__fake", **kwargs)]
    snapshot = TrialStatusSnapshot()
    runner = TrialRunner(callbacks=[TrialStatusSnapshotTaker(snapshot)])
    for t in trials:
        runner.add_trial(t)

    while not runner.is_finished():
        runner.step()

    self.assertTrue(snapshot.all_trials_are_terminated())
def testResultDone(self):
    """Tests that last_result is marked `done` after trial is complete."""
    ray.init(num_cpus=1, num_gpus=1)
    runner = TrialRunner()
    kwargs = {
        "stopping_criterion": {"training_iteration": 2},
        "resources": Resources(cpu=1, gpu=1),
    }
    runner.add_trial(Trial("__fake", **kwargs))
    trials = runner.get_trials()

    while not runner.is_finished():
        runner.step()

    self.assertEqual(trials[0].last_result[DONE], True)
def testMultiStepRun2(self):
    """Checks that runner.step throws when overstepping."""
    ray.init(num_cpus=1)
    runner = TrialRunner()
    kwargs = {
        "stopping_criterion": {"training_iteration": 2},
        "resources": Resources(cpu=1, gpu=0),
    }
    trials = [Trial("__fake", **kwargs)]
    for t in trials:
        runner.add_trial(t)

    while not runner.is_finished():
        runner.step()
    self.assertEqual(trials[0].status, Trial.TERMINATED)
    self.assertRaises(TuneError, runner.step)
def testChangeResources(self):
    """Checks that resource requirements can be changed on the fly."""
    ray.init(num_cpus=2)

    class ChangingScheduler(FIFOScheduler):
        def __init__(self):
            self._has_received_one_trial_result = False

        # For figuring out how many runner.step calls have happened.
        def has_received_one_trial_result(self):
            return self._has_received_one_trial_result

        def on_trial_result(self, trial_runner, trial, result):
            if result["training_iteration"] == 1:
                self._has_received_one_trial_result = True
                executor = trial_runner.trial_executor
                executor.pause_trial(trial)
                trial.update_resources(dict(cpu=2, gpu=0))
            return TrialScheduler.NOOP

    scheduler = ChangingScheduler()
    runner = TrialRunner(scheduler=scheduler)
    kwargs = {
        "stopping_criterion": {"training_iteration": 2},
        "resources": Resources(cpu=1, gpu=0),
    }
    trials = [Trial("__fake", **kwargs)]
    for t in trials:
        runner.add_trial(t)

    runner.step()
    self.assertEqual(trials[0].status, Trial.RUNNING)
    self.assertEqual(
        runner.trial_executor._pg_manager.occupied_resources().get("CPU"), 1
    )
    self.assertRaises(
        ValueError, lambda: trials[0].update_resources(dict(cpu=2, gpu=0))
    )

    while not scheduler.has_received_one_trial_result():
        runner.step()
    self.assertEqual(trials[0].status, Trial.PAUSED)

    # Extra step for the tune loop to stage the resource requests.
    runner.step()
    self.assertEqual(
        runner.trial_executor._pg_manager.occupied_resources().get("CPU"), 2
    )
def _update_avail_resources(self, num_retries=5):
    if time.time() - self._last_resource_refresh < self._refresh_period:
        return
    logger.debug("Checking Ray cluster resources.")
    resources = None
    for i in range(num_retries):
        if i > 0:
            logger.warning(
                "Cluster resources not detected or are 0. Attempt #%s...",
                i + 1)
            time.sleep(0.5)
        try:
            resources = ray.cluster_resources()
        except Exception as exc:
            # TODO(rliaw): Remove this when local mode is fixed.
            # https://github.com/ray-project/ray/issues/4147
            logger.debug(f"{exc}: Using resources for local machine.")
            resources = ResourceSpec().resolve(True).to_resource_dict()
        if resources:
            break

    if not resources:
        # NOTE: This hides the possibility that Ray may be waiting for
        # clients to connect.
        resources.setdefault("CPU", 0)
        resources.setdefault("GPU", 0)
        logger.warning("Cluster resources cannot be detected or are 0. "
                       "You can resume this experiment by passing in "
                       "`resume=True` to `run`.")

    resources = resources.copy()
    num_cpus = resources.pop("CPU", 0)
    num_gpus = resources.pop("GPU", 0)
    memory = ray_constants.from_memory_units(resources.pop("memory", 0))
    object_store_memory = ray_constants.from_memory_units(
        resources.pop("object_store_memory", 0))
    custom_resources = resources

    self._avail_resources = Resources(
        int(num_cpus),
        int(num_gpus),
        memory=int(memory),
        object_store_memory=int(object_store_memory),
        custom_resources=custom_resources)
    self._last_resource_refresh = time.time()
    self._resources_initialized = True
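# The early `return` above is a time-based throttle: the refresh is skipped if
# the previous one happened less than `_refresh_period` seconds ago. A minimal,
# self-contained sketch of the same pattern (names are illustrative):
import time

class ThrottledRefresher:
    def __init__(self, refresh_period=5.0):
        self._refresh_period = refresh_period
        self._last_refresh = float("-inf")

    def maybe_refresh(self, do_refresh):
        """Run `do_refresh` at most once per refresh period."""
        if time.time() - self._last_refresh < self._refresh_period:
            return False
        do_refresh()
        self._last_refresh = time.time()
        return True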
def _commit_resources(self, resources):
    committed = self._committed_resources
    all_keys = set(resources.custom_resources).union(
        set(committed.custom_resources))

    custom_resources = {
        k: committed.get(k) + resources.get_res_total(k)
        for k in all_keys
    }

    self._committed_resources = Resources(
        committed.cpu + resources.cpu_total(),
        committed.gpu + resources.gpu_total(),
        committed.memory + resources.memory_total(),
        committed.object_store_memory +
        resources.object_store_memory_total(),
        custom_resources=custom_resources)
def testCheckpointingAtEnd(self):
    ray.init(num_cpus=1, num_gpus=1)
    runner = TrialRunner()
    kwargs = {
        "stopping_criterion": {"training_iteration": 2},
        "checkpoint_at_end": True,
        "resources": Resources(cpu=1, gpu=1),
    }
    runner.add_trial(Trial("__fake", **kwargs))
    trials = runner.get_trials()

    while not runner.is_finished():
        runner.step()

    self.assertEqual(trials[0].last_result[DONE], True)
    self.assertEqual(trials[0].has_checkpoint(), True)
def default_resource_request(cls, config):
    """Configure the cluster resources used by this experiment."""
    num_gpus = config.get("num_gpus", 0)
    num_cpus = config.get("num_cpus", 1)

    if num_gpus > 0:
        # Assign extra CPUs for dataloaders
        workers = config.get("workers", 0)
        num_cpus = workers * num_gpus

    resource = Resources(
        cpu=0, gpu=0, extra_cpu=num_cpus, extra_gpu=num_gpus)
    return resource
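# Worked example of the request above under hypothetical config values: with
# {"num_gpus": 2, "num_cpus": 1, "workers": 4}, the driver reserves nothing
# (cpu=0, gpu=0) and instead asks for workers * num_gpus = 8 extra CPUs and
# 2 extra GPUs for the training actors.
example_config = {"num_gpus": 2, "num_cpus": 1, "workers": 4}
example_extra_gpu = example_config["num_gpus"]
example_extra_cpu = example_config["workers"] * example_config["num_gpus"]
assert (example_extra_cpu, example_extra_gpu) == (8, 2)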
def _return_resources(self, resources):
    committed = self._committed_resources

    all_keys = set(resources.custom_resources).union(
        set(committed.custom_resources))

    custom_resources = {
        k: committed.get(k) - resources.get_res_total(k)
        for k in all_keys
    }
    self._committed_resources = Resources(
        committed.cpu - resources.cpu_total(),
        committed.gpu - resources.gpu_total(),
        custom_resources=custom_resources)

    assert self._committed_resources.is_nonnegative(), (
        "Resource invalid: {}".format(resources))
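# The commit/return pair above is a running tally that must never go negative.
# A minimal dict-based sketch of the same invariant (illustrative only, not
# the executor's actual bookkeeping):
def commit_resources(committed, request):
    keys = set(committed) | set(request)
    return {k: committed.get(k, 0) + request.get(k, 0) for k in keys}

def return_resources(committed, request):
    keys = set(committed) | set(request)
    result = {k: committed.get(k, 0) - request.get(k, 0) for k in keys}
    assert all(v >= 0 for v in result.values()), f"Resource invalid: {request}"
    return result

state = commit_resources({}, {"CPU": 1, "GPU": 1})
state = return_resources(state, {"CPU": 1, "GPU": 1})
assert all(v == 0 for v in state.values())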
def _update_avail_resources(self, num_retries=5):
    if time.time() - self._last_resource_refresh < self._refresh_period:
        return
    logger.debug("Checking Ray cluster resources.")
    resources = None
    for i in range(num_retries):
        if i > 0:
            logger.warning(
                "Cluster resources not detected or are 0. Attempt #%s...",
                i + 1,
            )
            time.sleep(0.5)
        resources = ray.cluster_resources()
        if resources:
            break

    if not resources:
        # NOTE: This hides the possibility that Ray may be waiting for
        # clients to connect.
        resources.setdefault("CPU", 0)
        resources.setdefault("GPU", 0)
        logger.warning(
            "Cluster resources cannot be detected or are 0. "
            "You can resume this experiment by passing in "
            "`resume=True` to `run`."
        )

    resources = resources.copy()
    num_cpus = resources.pop("CPU", 0)
    num_gpus = resources.pop("GPU", 0)
    memory = ray_constants.from_memory_units(resources.pop("memory", 0))
    object_store_memory = ray_constants.from_memory_units(
        resources.pop("object_store_memory", 0)
    )
    custom_resources = resources

    self._avail_resources = Resources(
        int(num_cpus),
        int(num_gpus),
        memory=int(memory),
        object_store_memory=int(object_store_memory),
        custom_resources=custom_resources,
    )
    self._last_resource_refresh = time.time()
    self._resources_initialized = True
def testCustomResources(self):
    ray.init(num_cpus=4, num_gpus=2, resources={"a": 2})
    runner = TrialRunner()
    kwargs = {
        "stopping_criterion": {"training_iteration": 1},
        "resources": Resources(cpu=1, gpu=0, custom_resources={"a": 2}),
    }
    trials = [Trial("__fake", **kwargs), Trial("__fake", **kwargs)]
    for t in trials:
        runner.add_trial(t)

    runner.step()
    self.assertEqual(trials[0].status, Trial.RUNNING)
    self.assertEqual(trials[1].status, Trial.PENDING)

    runner.step()
    self.assertEqual(trials[0].status, Trial.TERMINATED)
    self.assertEqual(trials[1].status, Trial.PENDING)
def testPauseThenResume(self):
    ray.init(num_cpus=1, num_gpus=1)
    runner = TrialRunner()
    kwargs = {
        "stopping_criterion": {"training_iteration": 2},
        "resources": Resources(cpu=1, gpu=1),
    }
    runner.add_trial(Trial("__fake", **kwargs))
    trials = runner.get_trials()

    runner.step()  # Start trial
    runner.step()  # Process result
    self.assertEqual(trials[0].status, Trial.RUNNING)
    self.assertEqual(ray.get(trials[0].runner.get_info.remote()), None)
    self.assertEqual(ray.get(trials[0].runner.set_info.remote(1)), 1)

    runner.trial_executor.pause_trial(trials[0])
    self.assertEqual(trials[0].status, Trial.PAUSED)
def testErrorHandling(self):
    ray.init(num_cpus=4, num_gpus=2)
    runner = TrialRunner()
    kwargs = {
        "stopping_criterion": {"training_iteration": 1},
        "resources": Resources(cpu=1, gpu=1),
    }
    _global_registry.register(TRAINABLE_CLASS, "asdf", None)
    trials = [Trial("asdf", **kwargs), Trial("__fake", **kwargs)]
    for t in trials:
        runner.add_trial(t)

    runner.step()
    self.assertEqual(trials[0].status, Trial.ERROR)
    self.assertEqual(trials[1].status, Trial.PENDING)

    runner.step()
    self.assertEqual(trials[0].status, Trial.ERROR)
    self.assertEqual(trials[1].status, Trial.RUNNING)
def testStopTrial(self):
    ray.init(num_cpus=4, num_gpus=2)
    runner = TrialRunner()
    kwargs = {
        "stopping_criterion": {"training_iteration": 5},
        "resources": Resources(cpu=1, gpu=1),
    }
    trials = [
        Trial("__fake", **kwargs),
        Trial("__fake", **kwargs),
        Trial("__fake", **kwargs),
        Trial("__fake", **kwargs),
    ]
    for t in trials:
        runner.add_trial(t)

    runner.step()
    self.assertEqual(trials[0].status, Trial.RUNNING)
    self.assertEqual(trials[1].status, Trial.PENDING)

    # Stop trial while running
    runner.stop_trial(trials[0])
    self.assertEqual(trials[0].status, Trial.TERMINATED)
    self.assertEqual(trials[1].status, Trial.PENDING)

    runner.step()
    self.assertEqual(trials[0].status, Trial.TERMINATED)
    self.assertEqual(trials[1].status, Trial.RUNNING)
    self.assertEqual(trials[-1].status, Trial.PENDING)

    # Stop trial while pending
    runner.stop_trial(trials[-1])
    self.assertEqual(trials[0].status, Trial.TERMINATED)
    self.assertEqual(trials[1].status, Trial.RUNNING)
    self.assertEqual(trials[-1].status, Trial.TERMINATED)

    time.sleep(2)  # Wait for stopped placement group to free resources
    runner.step()
    self.assertEqual(trials[0].status, Trial.TERMINATED)
    self.assertEqual(trials[1].status, Trial.RUNNING)
    self.assertEqual(trials[2].status, Trial.RUNNING)
    self.assertEqual(trials[-1].status, Trial.TERMINATED)
def testFailureRecoveryMaxFailures(self):
    ray.init(num_cpus=1, num_gpus=1)
    runner = TrialRunner()
    kwargs = {
        "resources": Resources(cpu=1, gpu=1),
        "checkpoint_freq": 1,
        "max_failures": 2,
        "config": {
            "mock_error": True,
            "persistent_error": True,
        },
    }
    runner.add_trial(Trial("__fake", **kwargs))
    trials = runner.get_trials()

    while not runner.is_finished():
        runner.step()
    self.assertEqual(trials[0].status, Trial.ERROR)
    self.assertEqual(trials[0].num_failures, 3)
def testTrialErrorResumeTrue(self):
    ray.init(num_cpus=3, local_mode=True, include_dashboard=False)
    runner = TrialRunner(local_checkpoint_dir=self.tmpdir)
    kwargs = {
        "stopping_criterion": {"training_iteration": 4},
        "resources": Resources(cpu=1, gpu=0),
    }
    trials = [
        Trial("__fake", config={"mock_error": True}, **kwargs),
        Trial("__fake", **kwargs),
        Trial("__fake", **kwargs),
    ]
    for t in trials:
        runner.add_trial(t)

    while not runner.is_finished():
        runner.step()

    runner.checkpoint(force=True)

    assert trials[0].status == Trial.ERROR
    del runner

    new_runner = TrialRunner(
        run_errored_only=True, resume=True, local_checkpoint_dir=self.tmpdir)
    assert len(new_runner.get_trials()) == 3
    assert Trial.ERROR not in (t.status for t in new_runner.get_trials())
    # The below is just a check for standard behavior.
    disable_error = False
    for t in new_runner.get_trials():
        if t.config.get("mock_error"):
            t.config["mock_error"] = False
            disable_error = True
    assert disable_error

    while not new_runner.is_finished():
        new_runner.step()
    assert Trial.ERROR not in (t.status for t in new_runner.get_trials())
def testCheckpointingAtEnd(self):
    ray.init(num_cpus=1, num_gpus=1)
    runner = TrialRunner()
    kwargs = {
        "stopping_criterion": {"training_iteration": 2},
        "checkpoint_at_end": True,
        "resources": Resources(cpu=1, gpu=1),
    }
    runner.add_trial(Trial("__fake", **kwargs))
    trials = runner.get_trials()

    runner.step()  # Start trial
    self.assertEqual(trials[0].status, Trial.RUNNING)
    runner.step()  # Process result
    runner.step()  # Process result, dispatch save
    self.assertEqual(trials[0].last_result[DONE], True)
    runner.step()  # Process save
    self.assertEqual(trials[0].has_checkpoint(), True)
def update_resources(
        self, resources: Union[Dict, Callable, PlacementGroupFactory]):
    """EXPERIMENTAL: Updates the resource requirements.

    Should only be called when the trial is not running.

    Raises:
        ValueError if trial status is running.
    """
    if self.status is Trial.RUNNING:
        raise ValueError("Cannot update resources while Trial is running.")

    if isinstance(resources, PlacementGroupFactory):
        self.placement_group_factory = resources
    else:
        self.resources = Resources(**resources)

    self._setup_resources()

    self.invalidate_json_state()
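# Hedged usage sketch for update_resources: callers are expected to check the
# trial status first, as the ChangingScheduler tests in this file do by
# pausing or stopping the trial before updating it. `RUNNING` and
# `apply_update` are stand-ins for Trial.RUNNING and the actual update call.
RUNNING = "RUNNING"

def safe_update(trial_status, apply_update):
    """Apply a resource update only when the trial is not running."""
    if trial_status == RUNNING:
        raise ValueError("Cannot update resources while Trial is running.")
    apply_update()

safe_update("PAUSED", lambda: None)  # allowed
try:
    safe_update(RUNNING, lambda: None)  # mirrors the ValueError documented above
except ValueError:
    pass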
def testRestoreMetricsAfterCheckpointing(self):
    ray.init(num_cpus=1, num_gpus=1)

    observer = TrialResultObserver()
    runner = TrialRunner(callbacks=[observer])
    kwargs = {
        "stopping_criterion": {"training_iteration": 2},
        "resources": Resources(cpu=1, gpu=1),
        "checkpoint_freq": 1,
    }
    runner.add_trial(Trial("__fake", **kwargs))
    trials = runner.get_trials()

    while not runner.is_finished():
        runner.step()

    self.assertEqual(trials[0].status, Trial.TERMINATED)

    kwargs["restore_path"] = trials[0].checkpoint.dir_or_data
    kwargs.pop("stopping_criterion")
    kwargs.pop("checkpoint_freq")  # No checkpointing for next trial
    runner.add_trial(Trial("__fake", **kwargs))
    trials = runner.get_trials()

    observer.reset()
    while not observer.just_received_a_result():
        runner.step()
    self.assertEqual(trials[1].last_result["timesteps_since_restore"], 10)
    self.assertEqual(trials[1].last_result["iterations_since_restore"], 1)
    self.assertGreater(trials[1].last_result["time_since_restore"], 0)

    while not observer.just_received_a_result():
        runner.step()
    self.assertEqual(trials[1].last_result["timesteps_since_restore"], 20)
    self.assertEqual(trials[1].last_result["iterations_since_restore"], 2)
    self.assertGreater(trials[1].last_result["time_since_restore"], 0)

    self.addCleanup(shutil.rmtree, trials[0].checkpoint.dir_or_data)
def testFractionalGpus(self):
    ray.init(num_cpus=4, num_gpus=1)
    runner = TrialRunner()
    kwargs = {
        "resources": Resources(cpu=1, gpu=0.5),
    }
    trials = [
        Trial("__fake", **kwargs),
        Trial("__fake", **kwargs),
        Trial("__fake", **kwargs),
        Trial("__fake", **kwargs),
    ]
    for t in trials:
        runner.add_trial(t)

    for _ in range(10):
        runner.step()

    self.assertEqual(trials[0].status, Trial.RUNNING)
    self.assertEqual(trials[1].status, Trial.RUNNING)
    self.assertEqual(trials[2].status, Trial.PENDING)
    self.assertEqual(trials[3].status, Trial.PENDING)
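# The expected statuses above follow from simple packing arithmetic: with one
# physical GPU and 0.5 GPU requested per trial, only two trials fit at a time.
total_gpus = 1
gpu_per_trial = 0.5
max_concurrent = int(total_gpus / gpu_per_trial)
assert max_concurrent == 2  # trials[0] and trials[1] run; the rest stay PENDING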
def testFailureRecoveryDisabled(self):
    ray.init(num_cpus=1, num_gpus=1)
    searchalg, scheduler = create_mock_components()
    runner = TrialRunner(searchalg, scheduler=scheduler)
    kwargs = {
        "resources": Resources(cpu=1, gpu=1),
        "checkpoint_freq": 1,
        "max_failures": 0,
        "config": {
            "mock_error": True,
        },
    }
    runner.add_trial(Trial("__fake", **kwargs))
    trials = runner.get_trials()

    while not runner.is_finished():
        runner.step()

    self.assertEqual(trials[0].status, Trial.ERROR)
    self.assertEqual(trials[0].num_failures, 1)
    self.assertEqual(len(searchalg.errored_trials), 1)
    self.assertEqual(len(scheduler.errored_trials), 1)
def _commit_resources(self, trial: Trial):
    resources = trial.resources
    self._trials_running.add(trial)
    committed = self._committed_resources
    all_keys = set(resources.custom_resources).union(
        set(committed.custom_resources))

    custom_resources = {
        k: committed.get(k) + resources.get_res_total(k)
        for k in all_keys
    }

    self._committed_resources = Resources(
        committed.cpu + resources.cpu_total(),
        committed.gpu + resources.gpu_total(),
        committed.memory + resources.memory_total(),
        committed.object_store_memory +
        resources.object_store_memory_total(),
        custom_resources=custom_resources,
    )
    logger.debug(
        f"Committed res={resources} -> {self._committed_resources}")
def testFailFast(self):
    ray.init(num_cpus=1, num_gpus=1)
    runner = TrialRunner(fail_fast=True)
    kwargs = {
        "resources": Resources(cpu=1, gpu=1),
        "checkpoint_freq": 1,
        "max_failures": 0,
        "config": {
            "mock_error": True,
            "persistent_error": True,
        },
    }
    runner.add_trial(Trial("__fake", **kwargs))
    runner.add_trial(Trial("__fake", **kwargs))
    trials = runner.get_trials()

    while not runner.is_finished():
        runner.step()
    self.assertEqual(trials[0].status, Trial.ERROR)
    # With `fail_fast=True`, once one trial errors out, the remaining
    # trials are stopped with `TERMINATED` status.
    self.assertEqual(trials[1].status, Trial.TERMINATED)
    self.assertRaises(TuneError, lambda: runner.step())
def testFailFastRaise(self):
    ray.init(num_cpus=1, num_gpus=1)
    runner = TrialRunner(fail_fast=TrialRunner.RAISE)
    kwargs = {
        "resources": Resources(cpu=1, gpu=1),
        "checkpoint_freq": 1,
        "max_failures": 0,
        "config": {
            "mock_error": True,
            "persistent_error": True,
        },
    }
    runner.add_trial(Trial("__fake", **kwargs))
    runner.add_trial(Trial("__fake", **kwargs))
    trials = runner.get_trials()

    runner.step()  # Start trial
    self.assertEqual(trials[0].status, Trial.RUNNING)
    runner.step()  # Process result, dispatch save
    self.assertEqual(trials[0].status, Trial.RUNNING)
    runner.step()  # Process save
    with self.assertRaises(Exception):
        runner.step()  # Error
def testChangeResources(self):
    """Checks that resource requirements can be changed on the fly."""
    ray.init(num_cpus=2)

    class ChangingScheduler(FIFOScheduler):
        def on_trial_result(self, trial_runner, trial, result):
            if result["training_iteration"] == 1:
                executor = trial_runner.trial_executor
                executor.stop_trial(trial)
                trial.update_resources(dict(cpu=2, gpu=0))
                executor.start_trial(trial)
            return TrialScheduler.CONTINUE

    runner = TrialRunner(scheduler=ChangingScheduler())
    kwargs = {
        "stopping_criterion": {"training_iteration": 2},
        "resources": Resources(cpu=1, gpu=0),
    }
    trials = [Trial("__fake", **kwargs)]
    for t in trials:
        runner.add_trial(t)

    runner.step()
    self.assertEqual(trials[0].status, Trial.RUNNING)
    self.assertEqual(
        runner.trial_executor._pg_manager.occupied_resources().get("CPU"), 1)
    self.assertRaises(
        ValueError, lambda: trials[0].update_resources(dict(cpu=2, gpu=0)))

    runner.step()
    self.assertEqual(trials[0].status, Trial.RUNNING)
    self.assertEqual(
        runner.trial_executor._pg_manager.occupied_resources().get("CPU"), 2)
def testFailFastRaise(self):
    ray.init(num_cpus=1, num_gpus=1)
    runner = TrialRunner(fail_fast=TrialRunner.RAISE)
    kwargs = {
        "resources": Resources(cpu=1, gpu=1),
        "checkpoint_freq": 1,
        "max_failures": 0,
        "config": {
            "mock_error": True,
            "persistent_error": True,
        },
    }
    runner.add_trial(Trial("__fake", **kwargs))
    runner.add_trial(Trial("__fake", **kwargs))
    trials = runner.get_trials()

    with self.assertRaises(Exception):
        while not runner.is_finished():
            runner.step()

    # These checks are not critical; they only showcase the difference from
    # the non-RAISE fail_fast mode.
    self.assertEqual(trials[0].status, Trial.RUNNING)
    self.assertEqual(trials[1].status, Trial.PENDING)