def run_tune(
    sync_to_driver: bool,
    upload_dir: Optional[str] = None,
    durable: bool = False,
    experiment_name: str = "cloud_test",
    indicator_file: str = "/tmp/tune_cloud_indicator",
):
    num_cpus_per_trial = int(os.environ.get("TUNE_NUM_CPUS_PER_TRIAL", "2"))

    if durable:
        trainable = tune.durable(train)
    else:
        trainable = train

    tune.run(
        trainable,
        name=experiment_name,
        resume="AUTO",
        num_samples=4,
        config={
            "max_iterations": 30,
            "sleep_time": 5,
            "checkpoint_freq": 2,
            "score_multiplied": tune.randint(0, 100),
        },
        sync_config=tune.SyncConfig(
            sync_to_driver=sync_to_driver,
            upload_dir=upload_dir,
            sync_on_checkpoint=True,
            cloud_sync_period=0.5,
        ),
        keep_checkpoints_num=2,
        resources_per_trial={"cpu": num_cpus_per_trial},
        callbacks=[IndicatorCallback(indicator_file=indicator_file)],
        verbose=2,
    )
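# A minimal sketch of what the IndicatorCallback used above could look like
# (hypothetical; the real callback is defined elsewhere in the test harness).
# It only assumes the public ray.tune.Callback interface: touch an indicator
# file on each driver step so an external process can see the run is alive.
from ray import tune


class IndicatorCallback(tune.Callback):
    def __init__(self, indicator_file: str):
        self.indicator_file = indicator_file

    def on_step_begin(self, iteration, trials, **info):
        # Create/refresh the indicator file at the start of every Tune driver step.
        with open(self.indicator_file, "wt") as fp:
            fp.write("1")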
def execute(
    self,
    config,
    dataset=None,
    training_set=None,
    validation_set=None,
    test_set=None,
    training_set_metadata=None,
    data_format=None,
    experiment_name="hyperopt",
    model_name="run",
    # model_load_path=None,
    # model_resume_path=None,
    skip_save_training_description=False,
    skip_save_training_statistics=False,
    skip_save_model=False,
    skip_save_progress=False,
    skip_save_log=False,
    skip_save_processed_input=True,
    skip_save_unprocessed_output=False,
    skip_save_predictions=False,
    skip_save_eval_stats=False,
    output_directory="results",
    gpus=None,
    gpu_memory_limit=None,
    allow_parallel_threads=True,
    callbacks=None,
    backend=None,
    random_seed=default_random_seed,
    debug=False,
    **kwargs,
) -> RayTuneResults:
    if isinstance(dataset, str) and not has_remote_protocol(dataset) and not os.path.isabs(dataset):
        dataset = os.path.abspath(dataset)

    if isinstance(backend, str):
        backend = initialize_backend(backend)

    if gpus is not None:
        raise ValueError(
            "Parameter `gpus` is not supported when using Ray Tune. "
            "Configure GPU resources with Ray and set `gpu_resources_per_trial` in your "
            "hyperopt config."
        )

    if gpu_memory_limit is None and 0 < self._gpu_resources_per_trial_non_none < 1:
        # Enforce fractional GPU utilization
        gpu_memory_limit = self.gpu_resources_per_trial

    hyperopt_dict = dict(
        config=config,
        dataset=dataset,
        training_set=training_set,
        validation_set=validation_set,
        test_set=test_set,
        training_set_metadata=training_set_metadata,
        data_format=data_format,
        experiment_name=experiment_name,
        model_name=model_name,
        # model_load_path=model_load_path,
        # model_resume_path=model_resume_path,
        eval_split=self.split,
        skip_save_training_description=skip_save_training_description,
        skip_save_training_statistics=skip_save_training_statistics,
        skip_save_model=skip_save_model,
        skip_save_progress=skip_save_progress,
        skip_save_log=skip_save_log,
        skip_save_processed_input=skip_save_processed_input,
        skip_save_unprocessed_output=skip_save_unprocessed_output,
        skip_save_predictions=skip_save_predictions,
        skip_save_eval_stats=skip_save_eval_stats,
        output_directory=output_directory,
        gpus=gpus,
        gpu_memory_limit=gpu_memory_limit,
        allow_parallel_threads=allow_parallel_threads,
        callbacks=callbacks,
        backend=backend,
        random_seed=random_seed,
        debug=debug,
    )

    mode = "min" if self.goal != MAXIMIZE else "max"
    metric = "metric_score"
    if self.search_alg_dict is not None:
        if TYPE not in self.search_alg_dict:
            logger.warning("WARNING: Kindly set type param for search_alg to utilize Tune's Search Algorithms.")
            search_alg = None
        else:
            search_alg_type = self.search_alg_dict[TYPE]
            search_alg = tune.create_searcher(search_alg_type, metric=metric, mode=mode, **self.search_alg_dict)
    else:
        search_alg = None

    if self.max_concurrent_trials:
        assert (
            self.max_concurrent_trials > 0
        ), f"`max_concurrent_trials` must be greater than 0, got {self.max_concurrent_trials}"
        if isinstance(search_alg, BasicVariantGenerator) or search_alg is None:
            search_alg = BasicVariantGenerator(max_concurrent=self.max_concurrent_trials)
        elif isinstance(search_alg, ConcurrencyLimiter):
            raise ValueError(
                "You have specified `max_concurrent_trials`, but the search "
                "algorithm is already a `ConcurrencyLimiter`. FIX THIS "
                "by setting `max_concurrent_trials=None`."
            )
        else:
            search_alg = ConcurrencyLimiter(search_alg, max_concurrent=self.max_concurrent_trials)

    resources_per_trial = {
        "cpu": self._cpu_resources_per_trial_non_none,
        "gpu": self._gpu_resources_per_trial_non_none,
    }

    def run_experiment_trial(config, local_hyperopt_dict, checkpoint_dir=None):
        return self._run_experiment(
            config, checkpoint_dir, local_hyperopt_dict, self.decode_ctx, _is_ray_backend(backend)
        )

    tune_config = {}
    tune_callbacks = []
    for callback in callbacks or []:
        run_experiment_trial, tune_config = callback.prepare_ray_tune(
            run_experiment_trial,
            tune_config,
            tune_callbacks,
        )

    if _is_ray_backend(backend):
        # We can't set the Trial actor's CPUs to 0, so we just request a very small amount.
        resources_per_trial = PlacementGroupFactory(
            [{"CPU": 0.001}] + ([{"CPU": 1, "GPU": 1}] * self._gpu_resources_per_trial_non_none)
            if self._gpu_resources_per_trial_non_none
            else [{"CPU": 0.001}] + [{"CPU": 1}] * self._cpu_resources_per_trial_non_none
        )

    if has_remote_protocol(output_directory):
        run_experiment_trial = tune.durable(run_experiment_trial)
        self.sync_config = tune.SyncConfig(sync_to_driver=False, upload_dir=output_directory)
        output_directory = None
    elif self.kubernetes_namespace:
        from ray.tune.integration.kubernetes import NamespacedKubernetesSyncer

        self.sync_config = tune.SyncConfig(sync_to_driver=NamespacedKubernetesSyncer(self.kubernetes_namespace))

    run_experiment_trial_params = tune.with_parameters(run_experiment_trial, local_hyperopt_dict=hyperopt_dict)
    register_trainable(f"trainable_func_f{hash_dict(config).decode('ascii')}", run_experiment_trial_params)

    analysis = tune.run(
        f"trainable_func_f{hash_dict(config).decode('ascii')}",
        config={
            **self.search_space,
            **tune_config,
        },
        scheduler=self.scheduler,
        search_alg=search_alg,
        num_samples=self.num_samples,
        keep_checkpoints_num=1,
        max_failures=1,  # retry a trial failure once
        resources_per_trial=resources_per_trial,
        time_budget_s=self.time_budget_s,
        sync_config=self.sync_config,
        local_dir=output_directory,
        metric=metric,
        mode=mode,
        trial_name_creator=lambda trial: f"trial_{trial.trial_id}",
        trial_dirname_creator=lambda trial: f"trial_{trial.trial_id}",
        callbacks=tune_callbacks,
    )

    if "metric_score" in analysis.results_df.columns:
        ordered_trials = analysis.results_df.sort_values("metric_score", ascending=self.goal != MAXIMIZE)

        # Catch NaNs in the edge case where a trial doesn't complete
        temp_ordered_trials = []
        for kwargs in ordered_trials.to_dict(orient="records"):
            for key in ["parameters", "training_stats", "eval_stats"]:
                if isinstance(kwargs[key], float):
                    kwargs[key] = {}
            temp_ordered_trials.append(kwargs)

        # Trials with empty eval_stats fields and non-empty training_stats fields ran intermediate
        # tune.report call(s) but were terminated before reporting eval_stats from post-train
        # evaluation (e.g., the trial was stopped due to the time budget or relatively poor performance).
        # For any such trial, run model evaluation for the best model in that trial and record the
        # results in ordered_trials, which is returned and persisted in hyperopt_statistics.json.
        for trial in temp_ordered_trials:
            if trial["eval_stats"] == "{}" and trial["training_stats"] != "{}":
                # Evaluate the best model on the eval_split, which is validation_set
                if validation_set is not None and validation_set.size > 0:
                    trial_path = trial["trial_dir"]
                    best_model_path = self._get_best_model_path(trial_path, analysis)
                    if best_model_path is not None:
                        self._evaluate_best_model(
                            trial,
                            trial_path,
                            best_model_path,
                            validation_set,
                            data_format,
                            skip_save_unprocessed_output,
                            skip_save_predictions,
                            skip_save_eval_stats,
                            gpus,
                            gpu_memory_limit,
                            allow_parallel_threads,
                            backend,
                            debug,
                        )
                    else:
                        logger.warning("Skipping evaluation as no model checkpoints were available")
                else:
                    logger.warning("Skipping evaluation as no validation set was provided")

        ordered_trials = [TrialResults.from_dict(load_json_values(kwargs)) for kwargs in temp_ordered_trials]
    else:
        logger.warning("No trials reported results; check if time budget lower than epoch latency")
        ordered_trials = []

    return RayTuneResults(ordered_trials=ordered_trials, experiment_analysis=analysis)
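# A possible shape for the hash_dict helper used above to build a unique trainable
# name (illustrative only; the real helper is imported from the surrounding project's
# utilities and may differ). It returns bytes, which is why the call sites do
# `.decode("ascii")` on the result.
import base64
import hashlib
import json


def hash_dict(d: dict, max_length: int = 6) -> bytes:
    """Deterministically hash a JSON-serializable dict to a short ASCII token."""
    serialized = json.dumps(d, sort_keys=True, ensure_ascii=True)
    digest = hashlib.md5(serialized.encode("utf-8")).digest()
    return base64.b64encode(digest)[:max_length]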
pass  # delete

tune.run(
    MyTrainableClass,
    sync_config=tune.SyncConfig(
        upload_dir="s3://my-log-dir", syncer=CustomSyncer()
    ),
)
# __log_2_end__

if not MOCK:
    # __s3_start__
    from ray import tune

    tune.run(
        tune.durable(train_fn),
        # ...,
        sync_config=tune.SyncConfig(upload_dir="s3://your-s3-bucket/durable-trial/"),
    )
    # __s3_end__

    # __sync_config_start__
    from ray import tune

    tune.run(
        train_fn,
        # ...,
        local_dir="/path/to/shared/storage",
        sync_config=tune.SyncConfig(
            # Do not sync because we are on shared storage
            syncer=None
        ),
    )
    # __sync_config_end__
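# A minimal sketch of the CustomSyncer referenced in the snippet above, assuming
# the Ray 2.x ray.tune.syncer.Syncer interface (sync_up / sync_down / delete).
# The "remote" directory here is treated as a locally mounted path purely for
# illustration; a real syncer would shell out to rsync, `aws s3 sync`, or similar.
import shutil
from typing import List, Optional

from ray.tune.syncer import Syncer


class CustomSyncer(Syncer):
    def sync_up(self, local_dir: str, remote_dir: str, exclude: Optional[List[str]] = None) -> bool:
        # Copy the trial/experiment directory to the "remote" location.
        shutil.copytree(local_dir, remote_dir, dirs_exist_ok=True)
        return True

    def sync_down(self, remote_dir: str, local_dir: str, exclude: Optional[List[str]] = None) -> bool:
        # Restore the directory from the "remote" location.
        shutil.copytree(remote_dir, local_dir, dirs_exist_ok=True)
        return True

    def delete(self, remote_dir: str) -> bool:
        shutil.rmtree(remote_dir, ignore_errors=True)
        return True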
def execute(
    self,
    config,
    dataset=None,
    training_set=None,
    validation_set=None,
    test_set=None,
    training_set_metadata=None,
    data_format=None,
    experiment_name="hyperopt",
    model_name="run",
    resume=None,
    skip_save_training_description=False,
    skip_save_training_statistics=False,
    skip_save_model=False,
    skip_save_progress=False,
    skip_save_log=False,
    skip_save_processed_input=True,
    skip_save_unprocessed_output=False,
    skip_save_predictions=False,
    skip_save_eval_stats=False,
    output_directory="results",
    gpus=None,
    gpu_memory_limit=None,
    allow_parallel_threads=True,
    callbacks=None,
    backend=None,
    random_seed=default_random_seed,
    debug=False,
    hyperopt_log_verbosity=3,
    features_eligible_for_shared_params=None,
    **kwargs,
) -> RayTuneResults:
    if isinstance(dataset, str) and not has_remote_protocol(dataset) and not os.path.isabs(dataset):
        dataset = os.path.abspath(dataset)

    if isinstance(backend, str):
        backend = initialize_backend(backend)

    if gpus is not None:
        raise ValueError(
            "Parameter `gpus` is not supported when using Ray Tune. "
            "Configure GPU resources with Ray and set `gpu_resources_per_trial` in your "
            "hyperopt config."
        )

    if gpu_memory_limit is None and 0 < self._gpu_resources_per_trial_non_none < 1:
        # Enforce fractional GPU utilization
        gpu_memory_limit = self.gpu_resources_per_trial

    hyperopt_dict = dict(
        config=config,
        dataset=dataset,
        training_set=training_set,
        validation_set=validation_set,
        test_set=test_set,
        training_set_metadata=training_set_metadata,
        data_format=data_format,
        experiment_name=experiment_name,
        model_name=model_name,
        eval_split=self.split,
        skip_save_training_description=skip_save_training_description,
        skip_save_training_statistics=skip_save_training_statistics,
        skip_save_model=skip_save_model,
        skip_save_progress=skip_save_progress,
        skip_save_log=skip_save_log,
        skip_save_processed_input=skip_save_processed_input,
        skip_save_unprocessed_output=skip_save_unprocessed_output,
        skip_save_predictions=skip_save_predictions,
        skip_save_eval_stats=skip_save_eval_stats,
        output_directory=output_directory,
        gpus=gpus,
        gpu_memory_limit=gpu_memory_limit,
        allow_parallel_threads=allow_parallel_threads,
        callbacks=callbacks,
        backend=backend,
        random_seed=random_seed,
        debug=debug,
    )

    mode = "min" if self.goal != MAXIMIZE else "max"
    metric = "metric_score"
    # If the random seed is not set, use the Ludwig seed
    self.search_algorithm.check_for_random_seed(random_seed)
    if self.search_algorithm.search_alg_dict is not None:
        if TYPE not in self.search_algorithm.search_alg_dict:
            candidate_search_algs = list(SEARCH_ALG_IMPORT.keys())
            logger.warning(
                "WARNING: search_alg type parameter missing, using 'variant_generator' as default. "
                f"These are possible values for the type parameter: {candidate_search_algs}."
            )
            search_alg = None
        else:
            search_alg_type = self.search_algorithm.search_alg_dict[TYPE]
            search_alg = tune.create_searcher(
                search_alg_type, metric=metric, mode=mode, **self.search_algorithm.search_alg_dict
            )
    else:
        search_alg = None

    if self.max_concurrent_trials:
        assert (
            self.max_concurrent_trials > 0
        ), f"`max_concurrent_trials` must be greater than 0, got {self.max_concurrent_trials}"
        if isinstance(search_alg, BasicVariantGenerator) or search_alg is None:
            search_alg = BasicVariantGenerator(max_concurrent=self.max_concurrent_trials)
        elif isinstance(search_alg, ConcurrencyLimiter):
            raise ValueError(
                "You have specified `max_concurrent_trials`, but the search "
                "algorithm is already a `ConcurrencyLimiter`. FIX THIS "
                "by setting `max_concurrent_trials=None`."
            )
        else:
            search_alg = ConcurrencyLimiter(search_alg, max_concurrent=self.max_concurrent_trials)

    resources_per_trial = {
        "cpu": self._cpu_resources_per_trial_non_none,
        "gpu": self._gpu_resources_per_trial_non_none,
    }

    def run_experiment_trial(config, local_hyperopt_dict, checkpoint_dir=None):
        return self._run_experiment(
            config,
            checkpoint_dir,
            local_hyperopt_dict,
            self.decode_ctx,
            features_eligible_for_shared_params,
            _is_ray_backend(backend),
        )

    tune_config = {}
    tune_callbacks = []
    for callback in callbacks or []:
        run_experiment_trial, tune_config = callback.prepare_ray_tune(
            run_experiment_trial,
            tune_config,
            tune_callbacks,
        )

    if _is_ray_backend(backend):
        # For now, we do not do distributed training on CPU (until spread scheduling is
        # implemented for Ray Train), but we do want to enable it when GPUs are specified.
        resources_per_trial = PlacementGroupFactory(
            [{}] + ([{"CPU": 0, "GPU": 1}] * self._gpu_resources_per_trial_non_none)
            if self._gpu_resources_per_trial_non_none
            else [{}] + [{"CPU": self._cpu_resources_per_trial_non_none}]
        )

    if has_remote_protocol(output_directory):
        run_experiment_trial = tune.durable(run_experiment_trial)
        self.sync_config = tune.SyncConfig(sync_to_driver=False, upload_dir=output_directory)
        if _ray_114:
            self.sync_client = get_node_to_storage_syncer(SyncConfig(upload_dir=output_directory))
        else:
            self.sync_client = get_cloud_sync_client(output_directory)
        output_directory = None
    elif self.kubernetes_namespace:
        from ray.tune.integration.kubernetes import KubernetesSyncClient, NamespacedKubernetesSyncer

        self.sync_config = tune.SyncConfig(sync_to_driver=NamespacedKubernetesSyncer(self.kubernetes_namespace))
        self.sync_client = KubernetesSyncClient(self.kubernetes_namespace)

    run_experiment_trial_params = tune.with_parameters(run_experiment_trial, local_hyperopt_dict=hyperopt_dict)
    register_trainable(f"trainable_func_f{hash_dict(config).decode('ascii')}", run_experiment_trial_params)

    # Note that resume="AUTO" will attempt to resume the experiment if possible, and
    # otherwise will start a new experiment:
    # https://docs.ray.io/en/latest/tune/tutorials/tune-stopping.html
    should_resume = "AUTO" if resume is None else resume

    try:
        analysis = tune.run(
            f"trainable_func_f{hash_dict(config).decode('ascii')}",
            name=experiment_name,
            config={
                **self.search_space,
                **tune_config,
            },
            scheduler=self.scheduler,
            search_alg=search_alg,
            num_samples=self.num_samples,
            keep_checkpoints_num=1,
            max_failures=1,  # retry a trial failure once
            resources_per_trial=resources_per_trial,
            time_budget_s=self.time_budget_s,
            sync_config=self.sync_config,
            local_dir=output_directory,
            metric=metric,
            mode=mode,
            trial_name_creator=lambda trial: f"trial_{trial.trial_id}",
            trial_dirname_creator=lambda trial: f"trial_{trial.trial_id}",
            callbacks=tune_callbacks,
            stop=CallbackStopper(callbacks),
            verbose=hyperopt_log_verbosity,
            resume=should_resume,
            log_to_file=True,
        )
    except Exception as e:
        # Explicitly raise a RuntimeError if an error is encountered during a Ray trial.
        # NOTE: Cascading the exception with "raise _ from e" still results in hanging.
        raise RuntimeError(f"Encountered Ray Tune error: {e}")

    if "metric_score" in analysis.results_df.columns:
        ordered_trials = analysis.results_df.sort_values("metric_score", ascending=self.goal != MAXIMIZE)

        # Catch NaNs in the edge case where a trial doesn't complete
        temp_ordered_trials = []
        for kwargs in ordered_trials.to_dict(orient="records"):
            for key in ["parameters", "training_stats", "eval_stats"]:
                if isinstance(kwargs[key], float):
                    kwargs[key] = {}
            temp_ordered_trials.append(kwargs)

        # Trials with empty eval_stats fields and non-empty training_stats fields ran intermediate
        # tune.report call(s) but were terminated before reporting eval_stats from post-train
        # evaluation (e.g., the trial was stopped due to the time budget or relatively poor performance).
        # For any such trial, run model evaluation for the best model in that trial and record the
        # results in ordered_trials, which is returned and persisted in hyperopt_statistics.json.
        for trial in temp_ordered_trials:
            if trial["eval_stats"] == "{}" and trial["training_stats"] != "{}":
                # Evaluate the best model on the eval_split, which is validation_set
                if validation_set is not None and validation_set.size > 0:
                    trial_path = trial["trial_dir"]
                    best_model_path = self._get_best_model_path(trial_path, analysis)
                    if best_model_path is not None:
                        self._evaluate_best_model(
                            trial,
                            trial_path,
                            best_model_path,
                            validation_set,
                            data_format,
                            skip_save_unprocessed_output,
                            skip_save_predictions,
                            skip_save_eval_stats,
                            gpus,
                            gpu_memory_limit,
                            allow_parallel_threads,
                            backend,
                            debug,
                        )
                    else:
                        logger.warning("Skipping evaluation as no model checkpoints were available")
                else:
                    logger.warning("Skipping evaluation as no validation set was provided")

        ordered_trials = [TrialResults.from_dict(load_json_values(kwargs)) for kwargs in temp_ordered_trials]
    else:
        logger.warning("No trials reported results; check if time budget lower than epoch latency")
        ordered_trials = []

    return RayTuneResults(ordered_trials=ordered_trials, experiment_analysis=analysis)
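# A minimal sketch of the CallbackStopper passed to tune.run above, assuming only
# the public ray.tune.Stopper interface. The `should_stop_hyperopt` hook on the
# user callbacks is illustrative, not the actual callback API of the project.
from ray.tune import Stopper


class CallbackStopper(Stopper):
    """Stops the whole experiment when any user-supplied callback requests it."""

    def __init__(self, callbacks):
        self.callbacks = callbacks or []

    def __call__(self, trial_id, result) -> bool:
        # Never stop an individual trial from here.
        return False

    def stop_all(self) -> bool:
        # Stop the experiment as soon as any callback asks for it.
        return any(cb.should_stop_hyperopt() for cb in self.callbacks)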