def testMaxConcurrentSuggestions(self):
    """Checks that next_trials() supports throttling."""
    experiment_spec = {
        "run": "PPO",
        "num_samples": 6,
    }
    experiments = [Experiment.from_json("test", experiment_spec)]

    searcher = _MockSuggestionAlgorithm(max_concurrent=4)
    searcher.add_configurations(experiments)
    trials = searcher.next_trials()
    self.assertEqual(len(trials), 4)
    self.assertEqual(searcher.next_trials(), [])

    finished_trial = trials.pop()
    searcher.on_trial_complete(finished_trial.trial_id)
    self.assertEqual(len(searcher.next_trials()), 1)

    finished_trial = trials.pop()
    searcher.on_trial_complete(finished_trial.trial_id)

    finished_trial = trials.pop()
    searcher.on_trial_complete(finished_trial.trial_id)

    finished_trial = trials.pop()
    searcher.on_trial_complete(finished_trial.trial_id)
    self.assertEqual(len(searcher.next_trials()), 1)
    self.assertEqual(len(searcher.next_trials()), 0)

def run_experiments(experiments,
                    scheduler=None,
                    with_server=False,
                    server_port=TuneServer.DEFAULT_PORT,
                    verbose=True):
    """Tunes experiments.

    Args:
        experiments (Experiment | list | dict): Experiments to run.
        scheduler (TrialScheduler): Scheduler for executing
            the experiment. Choose among FIFO (default), MedianStopping,
            AsyncHyperBand, HyperBand, or HyperOpt.
        with_server (bool): Starts a background Tune server. Needed for
            using the Client API.
        server_port (int): Port number for launching TuneServer.
        verbose (bool): How much output should be printed for each trial.
    """

    if scheduler is None:
        scheduler = FIFOScheduler()

    runner = TrialRunner(
        scheduler,
        launch_web_server=with_server,
        server_port=server_port,
        verbose=verbose)

    exp_list = experiments
    if isinstance(experiments, Experiment):
        exp_list = [experiments]
    elif type(experiments) is dict:
        exp_list = [
            Experiment.from_json(name, spec)
            for name, spec in experiments.items()
        ]

    if (type(exp_list) is list
            and all(isinstance(exp, Experiment) for exp in exp_list)):
        for experiment in exp_list:
            scheduler.add_experiment(experiment, runner)
    else:
        raise TuneError("Invalid argument: {}".format(experiments))

    print(runner.debug_string(max_debug=99999))

    last_debug = 0
    while not runner.is_finished():
        runner.step()
        if time.time() - last_debug > DEBUG_PRINT_INTERVAL:
            print(runner.debug_string())
            last_debug = time.time()

    print(runner.debug_string(max_debug=99999))

    for trial in runner.get_trials():
        # TODO(rliaw): What about errored?
        if trial.status != Trial.TERMINATED:
            raise TuneError("Trial did not complete", trial)

    wait_for_log_sync()
    return runner.get_trials()

def create_searcher():
    class TestSuggestion(Searcher):
        def __init__(self, index):
            self.index = index
            self.returned_result = []
            super().__init__(metric="episode_reward_mean", mode="max")

        def suggest(self, trial_id):
            self.index += 1
            return {"test_variable": self.index}

        def on_trial_complete(self, trial_id, result=None, **kwargs):
            self.returned_result.append(result)

        def save(self, checkpoint_path):
            with open(checkpoint_path, "wb") as f:
                pickle.dump(self.__dict__, f)

        def restore(self, checkpoint_path):
            with open(checkpoint_path, "rb") as f:
                self.__dict__.update(pickle.load(f))

    searcher = TestSuggestion(0)
    searcher = ConcurrencyLimiter(searcher, max_concurrent=2)
    searcher = Repeater(searcher, repeat=3, set_index=False)
    search_alg = SearchGenerator(searcher)
    experiment_spec = {
        "run": "__fake",
        "num_samples": 20,
        "stop": {"training_iteration": 2},
    }
    experiments = [Experiment.from_json("test", experiment_spec)]
    search_alg.add_configurations(experiments)
    return search_alg

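# Hedged usage sketch (an assumption, not from the original tests): the search
# algorithm returned by create_searcher() can be handed to a TrialRunner and
# stepped to completion, mirroring the pattern used in the tests below.
# A running Ray instance (e.g. ray.init(num_cpus=4)) is assumed.
search_alg = create_searcher()
runner = TrialRunner(search_alg=search_alg)
while not runner.is_finished():
    runner.step()
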
def testSearchAlgFinishes(self):
    """Empty SearchAlg changing state in `next_trial` does not crash."""
    os.environ["TUNE_MAX_PENDING_TRIALS_PG"] = "1"

    class FinishFastAlg(_MockSuggestionAlgorithm):
        _index = 0

        def next_trial(self):
            spec = self._experiment.spec
            trial = None
            if self._index < spec["num_samples"]:
                trial = Trial(
                    spec.get("run"), stopping_criterion=spec.get("stop"))
            self._index += 1

            if self._index > 4:
                self.set_finished()

            return trial

        def suggest(self, trial_id):
            return {}

    ray.init(num_cpus=2, local_mode=True, include_dashboard=False)
    experiment_spec = {
        "run": "__fake",
        "num_samples": 2,
        "stop": {
            "training_iteration": 1
        }
    }
    searcher = FinishFastAlg()
    experiments = [Experiment.from_json("test", experiment_spec)]
    searcher.add_configurations(experiments)

    runner = TrialRunner(search_alg=searcher)
    self.assertFalse(runner.is_finished())
    runner.step()  # This launches a new run
    runner.step()  # This launches a 2nd run
    self.assertFalse(searcher.is_finished())
    self.assertFalse(runner.is_finished())
    runner.step()  # This kills the first run
    self.assertFalse(searcher.is_finished())
    self.assertFalse(runner.is_finished())
    runner.step()  # This kills the 2nd run
    self.assertFalse(searcher.is_finished())
    self.assertFalse(runner.is_finished())
    runner.step()  # This converts self._finished to True
    self.assertTrue(searcher.is_finished())
    self.assertRaises(TuneError, runner.step)

def testSearchAlgStalled(self):
    """Checks that runner and searcher state is maintained when stalled."""
    ray.init(num_cpus=4, num_gpus=2)
    experiment_spec = {
        "run": "__fake",
        "num_samples": 3,
        "stop": {
            "training_iteration": 1
        }
    }
    experiments = [Experiment.from_json("test", experiment_spec)]
    search_alg = _MockSuggestionAlgorithm(max_concurrent=1)
    search_alg.add_configurations(experiments)
    searcher = search_alg.searcher
    runner = TrialRunner(search_alg=search_alg)
    runner.step()
    trials = runner.get_trials()
    self.assertEqual(trials[0].status, Trial.RUNNING)

    runner.step()
    self.assertEqual(trials[0].status, Trial.TERMINATED)

    trials = runner.get_trials()
    runner.step()
    self.assertEqual(trials[1].status, Trial.RUNNING)
    self.assertEqual(len(searcher.live_trials), 1)

    searcher.stall = True

    runner.step()
    self.assertEqual(trials[1].status, Trial.TERMINATED)
    self.assertEqual(len(searcher.live_trials), 0)

    self.assertTrue(all(trial.is_finished() for trial in trials))
    self.assertFalse(search_alg.is_finished())
    self.assertFalse(runner.is_finished())

    searcher.stall = False

    runner.step()
    trials = runner.get_trials()
    self.assertEqual(trials[2].status, Trial.RUNNING)
    self.assertEqual(len(searcher.live_trials), 1)

    runner.step()
    self.assertEqual(trials[2].status, Trial.TERMINATED)
    self.assertEqual(len(searcher.live_trials), 0)
    self.assertTrue(search_alg.is_finished())
    self.assertTrue(runner.is_finished())

def testSearchAlgFinishes(self):
    """Empty SearchAlg changing state in `next_trials` does not crash."""

    class FinishFastAlg(SuggestionAlgorithm):
        _index = 0

        def next_trials(self):
            trials = []
            self._index += 1

            for trial in self._trial_generator:
                trials += [trial]
                break

            if self._index > 4:
                self._finished = True
            return trials

        def _suggest(self, trial_id):
            return {}

    ray.init(num_cpus=2)
    experiment_spec = {
        "run": "__fake",
        "num_samples": 2,
        "stop": {
            "training_iteration": 1
        }
    }
    searcher = FinishFastAlg()
    experiments = [Experiment.from_json("test", experiment_spec)]
    searcher.add_configurations(experiments)

    runner = TrialRunner(search_alg=searcher)
    self.assertFalse(runner.is_finished())
    runner.step()  # This launches a new run
    runner.step()  # This launches a 2nd run
    self.assertFalse(searcher.is_finished())
    self.assertFalse(runner.is_finished())
    runner.step()  # This kills the first run
    self.assertFalse(searcher.is_finished())
    self.assertFalse(runner.is_finished())
    runner.step()  # This kills the 2nd run
    self.assertFalse(searcher.is_finished())
    self.assertFalse(runner.is_finished())
    runner.step()  # This converts self._finished to True
    self.assertTrue(searcher.is_finished())
    self.assertRaises(TuneError, runner.step)

def testSearchAlgNotification(self):
    """Checks notification of trial to the Search Algorithm."""
    os.environ["TUNE_RESULT_BUFFER_LENGTH"] = "1"  # Don't finish early
    os.environ["TUNE_MAX_PENDING_TRIALS_PG"] = "1"

    ray.init(num_cpus=4, num_gpus=2)
    experiment_spec = {"run": "__fake", "stop": {"training_iteration": 2}}
    experiments = [Experiment.from_json("test", experiment_spec)]
    search_alg = _MockSuggestionAlgorithm()
    searcher = search_alg.searcher
    search_alg.add_configurations(experiments)
    runner = TrialRunner(search_alg=search_alg)

    while not runner.is_finished():
        runner.step()

    self.assertEqual(searcher.counter["result"], 1)
    self.assertEqual(searcher.counter["complete"], 1)

def testSearchAlgFinished(self):
    """Checks that SearchAlg is finished before all trials are done."""
    ray.init(num_cpus=4, num_gpus=2)
    experiment_spec = {"run": "__fake", "stop": {"training_iteration": 1}}
    experiments = [Experiment.from_json("test", experiment_spec)]
    searcher = _MockSuggestionAlgorithm(experiments, max_concurrent=10)
    runner = TrialRunner(search_alg=searcher)
    runner.step()
    trials = runner.get_trials()
    self.assertEqual(trials[0].status, Trial.RUNNING)
    self.assertTrue(searcher.is_finished())
    self.assertFalse(runner.is_finished())

    runner.step()
    self.assertEqual(trials[0].status, Trial.TERMINATED)
    self.assertEqual(len(searcher.live_trials), 0)
    self.assertTrue(searcher.is_finished())
    self.assertTrue(runner.is_finished())

def testSearchAlgNotification(self):
    """Checks notification of trial to the Search Algorithm."""
    ray.init(num_cpus=4, num_gpus=2)
    experiment_spec = {"run": "__fake", "stop": {"training_iteration": 2}}
    experiments = [Experiment.from_json("test", experiment_spec)]
    searcher = _MockSuggestionAlgorithm(experiments, max_concurrent=10)
    runner = TrialRunner(search_alg=searcher)
    runner.step()
    trials = runner.get_trials()
    self.assertEqual(trials[0].status, Trial.RUNNING)

    runner.step()
    self.assertEqual(trials[0].status, Trial.RUNNING)

    runner.step()
    self.assertEqual(trials[0].status, Trial.TERMINATED)

    self.assertEqual(searcher.counter["result"], 1)
    self.assertEqual(searcher.counter["complete"], 1)

def testSearchAlgSchedulerInteraction(self):
    """Checks that TrialScheduler killing a trial will notify SearchAlg."""

    class _MockScheduler(FIFOScheduler):
        def on_trial_result(self, *args, **kwargs):
            return TrialScheduler.STOP

    ray.init(num_cpus=4, num_gpus=2)
    experiment_spec = {"run": "__fake", "stop": {"training_iteration": 2}}
    experiments = [Experiment.from_json("test", experiment_spec)]
    searcher = _MockSuggestionAlgorithm(experiments, max_concurrent=10)
    runner = TrialRunner(search_alg=searcher, scheduler=_MockScheduler())
    runner.step()
    trials = runner.get_trials()
    self.assertEqual(trials[0].status, Trial.RUNNING)
    self.assertTrue(searcher.is_finished())
    self.assertFalse(runner.is_finished())

    runner.step()
    self.assertEqual(trials[0].status, Trial.TERMINATED)
    self.assertEqual(len(searcher.live_trials), 0)
    self.assertTrue(searcher.is_finished())
    self.assertTrue(runner.is_finished())

def testSearchAlgSchedulerEarlyStop(self):
    """Early termination notification to the Searcher can be turned off."""

    class _MockScheduler(FIFOScheduler):
        def on_trial_result(self, *args, **kwargs):
            return TrialScheduler.STOP

    ray.init(num_cpus=4, num_gpus=2)
    experiment_spec = {"run": "__fake", "stop": {"training_iteration": 2}}
    experiments = [Experiment.from_json("test", experiment_spec)]

    searcher = _MockSuggestionAlgorithm(use_early_stopped_trials=True)
    searcher.add_configurations(experiments)
    runner = TrialRunner(search_alg=searcher, scheduler=_MockScheduler())
    runner.step()
    runner.step()
    self.assertEqual(len(searcher.final_results), 1)

    searcher = _MockSuggestionAlgorithm(use_early_stopped_trials=False)
    searcher.add_configurations(experiments)
    runner = TrialRunner(search_alg=searcher, scheduler=_MockScheduler())
    runner.step()
    runner.step()
    self.assertEqual(len(searcher.final_results), 0)

def testSearchAlgFinishes(self):
    """SearchAlg changing state in `next_trials` does not crash."""

    class FinishFastAlg(SuggestionAlgorithm):
        def next_trials(self):
            self._finished = True
            return []

    ray.init(num_cpus=4, num_gpus=2)
    experiment_spec = {
        "run": "__fake",
        "num_samples": 3,
        "stop": {
            "training_iteration": 1
        }
    }
    searcher = FinishFastAlg()
    experiments = [Experiment.from_json("test", experiment_spec)]
    searcher.add_configurations(experiments)

    runner = TrialRunner(search_alg=searcher)
    runner.step()  # This should not fail
    self.assertTrue(searcher.is_finished())
    self.assertTrue(runner.is_finished())

def testSearchAlgNotification(self):
    """Checks notification of trial to the Search Algorithm."""
    os.environ["TUNE_RESULT_BUFFER_LENGTH"] = "1"  # Don't finish early
    ray.init(num_cpus=4, num_gpus=2)
    experiment_spec = {"run": "__fake", "stop": {"training_iteration": 2}}
    experiments = [Experiment.from_json("test", experiment_spec)]
    search_alg = _MockSuggestionAlgorithm()
    searcher = search_alg.searcher
    search_alg.add_configurations(experiments)
    runner = TrialRunner(search_alg=search_alg)
    runner.step()
    trials = runner.get_trials()
    self.assertEqual(trials[0].status, Trial.RUNNING)

    runner.step()
    self.assertEqual(trials[0].status, Trial.RUNNING)

    runner.step()
    self.assertEqual(trials[0].status, Trial.TERMINATED)

    self.assertEqual(searcher.counter["result"], 1)
    self.assertEqual(searcher.counter["complete"], 1)

def run_experiments(experiments,
                    scheduler=None,
                    with_server=False,
                    server_port=TuneServer.DEFAULT_PORT,
                    verbose=True,
                    queue_trials=False):
    """Tunes experiments.

    Args:
        experiments (Experiment | list | dict): Experiments to run.
        scheduler (TrialScheduler): Scheduler for executing
            the experiment. Choose among FIFO (default), MedianStopping,
            AsyncHyperBand, HyperBand, or HyperOpt.
        with_server (bool): Starts a background Tune server. Needed for
            using the Client API.
        server_port (int): Port number for launching TuneServer.
        verbose (bool): How much output should be printed for each trial.
        queue_trials (bool): Whether to queue trials when the cluster does
            not currently have enough resources to launch one. This should
            be set to True when running on an autoscaling cluster to enable
            automatic scale-up.

    Returns:
        List of Trial objects, holding data for each executed trial.
    """

    if scheduler is None:
        scheduler = FIFOScheduler()

    runner = TrialRunner(
        scheduler,
        launch_web_server=with_server,
        server_port=server_port,
        verbose=verbose,
        queue_trials=queue_trials)

    exp_list = experiments
    if isinstance(experiments, Experiment):
        exp_list = [experiments]
    elif type(experiments) is dict:
        exp_list = [
            Experiment.from_json(name, spec)
            for name, spec in experiments.items()
        ]

    if (type(exp_list) is list
            and all(isinstance(exp, Experiment) for exp in exp_list)):
        for experiment in exp_list:
            scheduler.add_experiment(experiment, runner)
    else:
        raise TuneError("Invalid argument: {}".format(experiments))

    print(runner.debug_string(max_debug=99999))

    last_debug = 0
    while not runner.is_finished():
        runner.step()
        if time.time() - last_debug > DEBUG_PRINT_INTERVAL:
            print(runner.debug_string())
            last_debug = time.time()

    print(runner.debug_string(max_debug=99999))

    errored_trials = []
    for trial in runner.get_trials():
        if trial.status != Trial.TERMINATED:
            errored_trials += [trial]

    if errored_trials:
        raise TuneError("Trials did not complete", errored_trials)

    wait_for_log_sync()
    return runner.get_trials()
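
# Hypothetical usage sketch (not part of the original module): it assumes a
# trainable registered as "PPO" (as in the throttling test above) and the
# imports already used here, and simply drives run_experiments() with a dict
# of experiment specs.
if __name__ == "__main__":
    example_specs = {
        "ppo_sweep": {
            "run": "PPO",
            "num_samples": 2,
            "stop": {"training_iteration": 1},
        },
    }
    completed_trials = run_experiments(example_specs, queue_trials=False)
    for trial in completed_trials:
        print(trial.status)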