def __init__(self,
             search_alg=None,
             scheduler=None,
             local_checkpoint_dir=None,
             remote_checkpoint_dir=None,
             sync_to_cloud=None,
             stopper=None,
             resume=False,
             server_port=None,
             fail_fast=False,
             checkpoint_period=None,
             trial_executor=None,
             callbacks=None,
             metric=None):
    """Set up runner state and optionally restore a previous experiment.

    Args:
        search_alg: Search algorithm; a new ``BasicVariantGenerator`` is
            created when this is falsy.
        scheduler: Trial scheduler; a new ``FIFOScheduler`` is created
            when this is falsy.
        local_checkpoint_dir: Directory used for the runner checkpoint
            file. It is created if it does not exist.
        remote_checkpoint_dir: Stored on the runner and forwarded to
            ``get_cloud_syncer``.
        sync_to_cloud: Forwarded to ``get_cloud_syncer``.
        stopper: Experiment stopper; a new ``NoopStopper`` is created
            when this is falsy.
        resume: Handed to ``self._validate_resume``. When that returns a
            truthy value a restore is attempted; the string
            ``"ERRORED_ONLY"`` (any letter case) makes the restore run
            with ``run_errored_only=True``.
        server_port: When not ``None``, a ``TuneServer`` is created for
            this runner on that port.
        fail_fast: Either a bool-like value or a string that upper-cases
            to ``TrialRunner.RAISE``. When truthy, a failed restore is
            re-raised instead of falling back to a fresh experiment.
        checkpoint_period: Stored as given; when ``None`` the value of
            the ``TUNE_GLOBAL_CHECKPOINT_S`` environment variable is used
            (``"auto"`` if unset).
        trial_executor: Trial executor; a new ``RayTrialExecutor`` is
            created when this is falsy.
        callbacks: Iterable wrapped in a ``CallbackList`` whose
            ``setup()`` is invoked here.
        metric: Stored as ``self._metric``.

    Raises:
        ValueError: If ``TRIALRUNNER_WALLTIME_LIMIT`` is present in the
            environment, or if ``fail_fast`` is a string other than
            ``TrialRunner.RAISE`` (case-insensitive).
    """
    self._search_alg = search_alg or BasicVariantGenerator()
    self._scheduler_alg = scheduler or FIFOScheduler()
    self.trial_executor = trial_executor or RayTrialExecutor()
    self._pending_trial_queue_times = {}

    # A value of 0 still allows adding one new (pending) trial, but it
    # keeps the runner from trying to fill the trial list.
    # `self.add_trial()` may update this value.
    self._max_pending_trials = 0

    self._metric = metric

    if "TRIALRUNNER_WALLTIME_LIMIT" in os.environ:
        raise ValueError(
            "The TRIALRUNNER_WALLTIME_LIMIT environment variable is "
            "deprecated. "
            "Use `tune.run(time_budget_s=limit)` instead.")

    self._total_time = 0
    self._iteration = 0
    self._has_errored = False

    # String values are normalised to upper case; the only accepted
    # string is TrialRunner.RAISE.
    self._fail_fast = fail_fast
    if isinstance(fail_fast, str):
        self._fail_fast = fail_fast.upper()
        if self._fail_fast != TrialRunner.RAISE:
            raise ValueError("fail_fast must be one of {bool, RAISE}. "
                             f"Got {self._fail_fast}.")
        warnings.warn(
            "fail_fast='raise' detected. Be careful when using this "
            "mode as resources (such as Ray processes, "
            "file descriptors, and temporary files) may not be "
            "cleaned up properly. To use "
            "a safer mode, use fail_fast=True.")

    self._server = None
    self._server_port = server_port
    if server_port is not None:
        self._server = TuneServer(self, self._server_port)

    self._trials = []
    self._cached_trial_decisions = {}
    self._queued_trial_decisions = {}
    self._updated_queue = False
    self._stop_queue = []
    self._should_stop_experiment = False  # used by TuneServer

    self._local_checkpoint_dir = local_checkpoint_dir
    if self._local_checkpoint_dir:
        os.makedirs(self._local_checkpoint_dir, exist_ok=True)

    self._remote_checkpoint_dir = remote_checkpoint_dir
    self._syncer = get_cloud_syncer(local_checkpoint_dir,
                                    remote_checkpoint_dir, sync_to_cloud)
    self._stopper = stopper or NoopStopper()
    self._resumed = False

    if not self._validate_resume(resume_type=resume):
        logger.debug("Starting a new experiment.")
    else:
        only_errored = (isinstance(resume, str)
                        and resume.upper() == "ERRORED_ONLY")
        try:
            self.resume(run_errored_only=only_errored)
            self._resumed = True
        except Exception as e:
            if has_verbosity(Verbosity.V3_TRIAL_DETAILS):
                logger.error(str(e))
            logger.exception("Runner restore failed.")
            if self._fail_fast:
                raise
            logger.info("Restarting experiment.")

    # Timing and session naming are (re)initialised after any restore.
    started_at = time.time()
    self._start_time = started_at
    self._last_checkpoint_time = float("-inf")
    self._session_str = datetime.fromtimestamp(started_at).strftime(
        "%Y-%m-%d_%H-%M-%S")

    if self._local_checkpoint_dir:
        ckpt_name = TrialRunner.CKPT_FILE_TMPL.format(self._session_str)
        self.checkpoint_file = os.path.join(self._local_checkpoint_dir,
                                            ckpt_name)
    else:
        self.checkpoint_file = None

    self._callbacks = CallbackList(callbacks or [])
    self._callbacks.setup()

    if checkpoint_period is None:
        checkpoint_period = os.environ.get("TUNE_GLOBAL_CHECKPOINT_S",
                                           "auto")
    self._checkpoint_period = checkpoint_period
    self._checkpoint_manager = self._create_checkpoint_manager()
def __init__(self,
             search_alg=None,
             scheduler=None,
             local_checkpoint_dir=None,
             remote_checkpoint_dir=None,
             sync_to_cloud=None,
             stopper=None,
             resume=False,
             server_port=None,
             fail_fast=False,
             verbose=True,
             checkpoint_period=None,
             trial_executor=None):
    """Set up runner state and optionally restore a previous experiment.

    Args:
        search_alg: Search algorithm; a new ``BasicVariantGenerator`` is
            created when this is falsy.
        scheduler: Trial scheduler; a new ``FIFOScheduler`` is created
            when this is falsy.
        local_checkpoint_dir: Directory used for the runner checkpoint
            file. It is created if it does not exist.
        remote_checkpoint_dir: Stored on the runner and forwarded to
            ``get_cloud_syncer``.
        sync_to_cloud: Forwarded to ``get_cloud_syncer``.
        stopper: Experiment stopper; a new ``NoopStopper`` is created
            when this is falsy.
        resume: Handed to ``self._validate_resume``. When that returns a
            truthy value a restore is attempted; the string
            ``"ERRORED_ONLY"`` (any letter case) makes the restore run
            with ``run_errored_only=True``.
        server_port: When not ``None``, a ``TuneServer`` is created for
            this runner on that port.
        fail_fast: Either a bool-like value or a string that upper-cases
            to ``TrialRunner.RAISE``. When truthy, a failed restore is
            re-raised instead of falling back to a fresh experiment.
        verbose: Stored as ``self._verbose``; when truthy, the message of
            a restore failure is additionally logged as an error.
        checkpoint_period: Stored as given; when ``None`` the value comes
            from ``env_integer("TUNE_GLOBAL_CHECKPOINT_S", 10)``.
        trial_executor: Trial executor; a new ``RayTrialExecutor`` is
            created when this is falsy.

    Raises:
        ValueError: If ``fail_fast`` is a string other than
            ``TrialRunner.RAISE`` (case-insensitive).
    """
    self._search_alg = search_alg or BasicVariantGenerator()
    self._scheduler_alg = scheduler or FIFOScheduler()
    self.trial_executor = trial_executor or RayTrialExecutor()

    # Debugging aid: trials may be halted once some wall time has
    # elapsed. TODO(ekl) consider exposing this in the API.
    walltime_limit = os.environ.get("TRIALRUNNER_WALLTIME_LIMIT",
                                    float("inf"))
    self._global_time_limit = float(walltime_limit)

    self._total_time = 0
    self._iteration = 0
    self._has_errored = False

    # String values are normalised to upper case; the only accepted
    # string is TrialRunner.RAISE.
    self._fail_fast = fail_fast
    if isinstance(fail_fast, str):
        self._fail_fast = fail_fast.upper()
        if self._fail_fast != TrialRunner.RAISE:
            raise ValueError("fail_fast must be one of {bool, RAISE}. "
                             f"Got {self._fail_fast}.")
        logger.warning(
            "fail_fast='raise' detected. Be careful when using this "
            "mode as resources (such as Ray processes, "
            "file descriptors, and temporary files) may not be "
            "cleaned up properly. To use "
            "a safer mode, use fail_fast=True.")

    self._verbose = verbose

    self._server = None
    self._server_port = server_port
    if server_port is not None:
        self._server = TuneServer(self, self._server_port)

    self._trials = []
    self._cached_trial_decisions = {}
    self._stop_queue = []
    self._should_stop_experiment = False  # used by TuneServer

    self._local_checkpoint_dir = local_checkpoint_dir
    if self._local_checkpoint_dir:
        os.makedirs(self._local_checkpoint_dir, exist_ok=True)

    self._remote_checkpoint_dir = remote_checkpoint_dir
    self._syncer = get_cloud_syncer(local_checkpoint_dir,
                                    remote_checkpoint_dir, sync_to_cloud)
    self._stopper = stopper or NoopStopper()
    self._resumed = False

    if not self._validate_resume(resume_type=resume):
        logger.debug("Starting a new experiment.")
    else:
        only_errored = (isinstance(resume, str)
                        and resume.upper() == "ERRORED_ONLY")
        try:
            self.resume(run_errored_only=only_errored)
            self._resumed = True
        except Exception as e:
            if self._verbose:
                logger.error(str(e))
            logger.exception("Runner restore failed.")
            if self._fail_fast:
                raise
            logger.info("Restarting experiment.")

    # Timing and session naming are (re)initialised after any restore.
    started_at = time.time()
    self._start_time = started_at
    self._last_checkpoint_time = float("-inf")

    if checkpoint_period is None:
        checkpoint_period = env_integer("TUNE_GLOBAL_CHECKPOINT_S", 10)
    self._checkpoint_period = checkpoint_period

    self._session_str = datetime.fromtimestamp(started_at).strftime(
        "%Y-%m-%d_%H-%M-%S")

    if self._local_checkpoint_dir:
        ckpt_name = TrialRunner.CKPT_FILE_TMPL.format(self._session_str)
        self.checkpoint_file = os.path.join(self._local_checkpoint_dir,
                                            ckpt_name)
    else:
        self.checkpoint_file = None
def __init__(self,
             search_alg=None,
             scheduler=None,
             launch_web_server=False,
             local_checkpoint_dir=None,
             remote_checkpoint_dir=None,
             sync_to_cloud=None,
             stopper=None,
             resume=False,
             server_port=TuneServer.DEFAULT_PORT,
             fail_fast=False,
             verbose=True,
             checkpoint_period=10,
             trial_executor=None):
    """Set up runner state and optionally restore a previous experiment.

    Args:
        search_alg: Search algorithm; a new ``BasicVariantGenerator`` is
            created when this is falsy.
        scheduler: Trial scheduler; a new ``FIFOScheduler`` is created
            when this is falsy.
        launch_web_server: When truthy, a ``TuneServer`` is created for
            this runner on ``server_port``.
        local_checkpoint_dir: Directory used for the runner checkpoint
            file. It is created if it does not exist.
        remote_checkpoint_dir: Stored on the runner and forwarded to
            ``get_cloud_syncer``.
        sync_to_cloud: Forwarded to ``get_cloud_syncer``.
        stopper: Experiment stopper; a new ``NoopStopper`` is created
            when this is falsy.
        resume: Handed to ``self._validate_resume``; when that returns a
            truthy value ``self.resume()`` is attempted. A failed restore
            is logged and the runner carries on as a new experiment.
        server_port: Port passed to ``TuneServer``.
        fail_fast: Stored as ``self._fail_fast``.
        verbose: Stored as ``self._verbose``.
        checkpoint_period: Stored as ``self._checkpoint_period``.
        trial_executor: Trial executor; a new ``RayTrialExecutor`` is
            created when this is falsy.
    """
    self._search_alg = search_alg or BasicVariantGenerator()
    self._scheduler_alg = scheduler or FIFOScheduler()
    self.trial_executor = trial_executor or RayTrialExecutor()

    # Debugging aid: trials may be halted once some wall time has
    # elapsed. TODO(ekl) consider exposing this in the API.
    walltime_limit = os.environ.get("TRIALRUNNER_WALLTIME_LIMIT",
                                    float("inf"))
    self._global_time_limit = float(walltime_limit)

    self._total_time = 0
    self._iteration = 0
    self._has_errored = False
    self._fail_fast = fail_fast
    self._verbose = verbose

    self._server = None
    self._server_port = server_port
    if launch_web_server:
        self._server = TuneServer(self, self._server_port)

    self._trials = []
    self._cached_trial_decisions = {}
    self._stop_queue = []
    self._should_stop_experiment = False  # used by TuneServer

    self._local_checkpoint_dir = local_checkpoint_dir
    if self._local_checkpoint_dir:
        os.makedirs(self._local_checkpoint_dir, exist_ok=True)

    self._remote_checkpoint_dir = remote_checkpoint_dir
    self._syncer = get_cloud_syncer(local_checkpoint_dir,
                                    remote_checkpoint_dir, sync_to_cloud)
    self._stopper = stopper or NoopStopper()
    self._resumed = False

    if not self._validate_resume(resume_type=resume):
        logger.debug("Starting a new experiment.")
    else:
        try:
            self.resume()
            logger.info("Resuming trial.")
            self._resumed = True
        except Exception:
            # Best effort: a failed restore falls through to a fresh run.
            logger.exception(
                "Runner restore failed. Restarting experiment.")

    # Timing and session naming are (re)initialised after any restore.
    started_at = time.time()
    self._start_time = started_at
    self._last_checkpoint_time = float("-inf")
    self._checkpoint_period = checkpoint_period
    self._session_str = datetime.fromtimestamp(started_at).strftime(
        "%Y-%m-%d_%H-%M-%S")

    if self._local_checkpoint_dir:
        ckpt_name = TrialRunner.CKPT_FILE_TMPL.format(self._session_str)
        self.checkpoint_file = os.path.join(self._local_checkpoint_dir,
                                            ckpt_name)
    else:
        self.checkpoint_file = None
def __init__(self,
             search_alg=None,
             scheduler=None,
             launch_web_server=False,
             local_checkpoint_dir=None,
             remote_checkpoint_dir=None,
             sync_to_cloud=None,
             stopper=None,
             resume=False,
             server_port=TuneServer.DEFAULT_PORT,
             verbose=True,
             checkpoint_period=10,
             trial_executor=None):
    """Create a new TrialRunner.

    Args:
        search_alg (SearchAlgorithm): Generates the Trial objects.
            Defaults to a new ``BasicVariantGenerator``.
        scheduler (TrialScheduler): Defaults to FIFOScheduler.
        launch_web_server (bool): Whether to start a TuneServer.
        local_checkpoint_dir (str): Path where global checkpoints are
            stored and restored from. Created if it does not exist.
        remote_checkpoint_dir (str): Remote path where global
            checkpoints are stored and restored from. Used if
            `resume` == REMOTE.
        sync_to_cloud (func|str): See `tune.py:run`.
        stopper: Custom class for stopping whole experiments. See
            ``Stopper``. Defaults to a new ``NoopStopper``.
        resume (str|False): See `tune.py:run`. A failed restore is
            logged and the runner carries on as a new experiment.
        server_port (int): Port number for launching TuneServer.
        verbose (bool): Flag for verbosity. If False, trial results
            will not be output.
        checkpoint_period: Stored as ``self._checkpoint_period``.
        trial_executor (TrialExecutor): Defaults to RayTrialExecutor.
    """
    self._search_alg = search_alg or BasicVariantGenerator()
    self._scheduler_alg = scheduler or FIFOScheduler()
    self.trial_executor = trial_executor or RayTrialExecutor()

    # Debugging aid: trials may be halted once some wall time has
    # elapsed. TODO(ekl) consider exposing this in the API.
    walltime_limit = os.environ.get("TRIALRUNNER_WALLTIME_LIMIT",
                                    float("inf"))
    self._global_time_limit = float(walltime_limit)

    self._total_time = 0
    self._iteration = 0
    self._verbose = verbose

    self._server = None
    self._server_port = server_port
    if launch_web_server:
        self._server = TuneServer(self, self._server_port)

    self._trials = []
    self._stop_queue = []

    self._local_checkpoint_dir = local_checkpoint_dir
    if self._local_checkpoint_dir:
        os.makedirs(self._local_checkpoint_dir, exist_ok=True)

    self._remote_checkpoint_dir = remote_checkpoint_dir
    self._syncer = get_cloud_syncer(local_checkpoint_dir,
                                    remote_checkpoint_dir, sync_to_cloud)
    self._stopper = stopper or NoopStopper()
    self._resumed = False

    if not self._validate_resume(resume_type=resume):
        logger.debug("Starting a new experiment.")
    else:
        try:
            self.resume()
            logger.info("Resuming trial.")
            self._resumed = True
        except Exception:
            # Best effort: a failed restore falls through to a fresh run.
            logger.exception(
                "Runner restore failed. Restarting experiment.")

    # Timing and session naming are (re)initialised after any restore.
    started_at = time.time()
    self._start_time = started_at
    self._last_checkpoint_time = float("-inf")
    self._checkpoint_period = checkpoint_period
    self._session_str = datetime.fromtimestamp(started_at).strftime(
        "%Y-%m-%d_%H-%M-%S")

    if self._local_checkpoint_dir:
        ckpt_name = TrialRunner.CKPT_FILE_TMPL.format(self._session_str)
        self.checkpoint_file = os.path.join(self._local_checkpoint_dir,
                                            ckpt_name)
    else:
        self.checkpoint_file = None