def get_abs_path(src_path, file_path):
    if has_remote_protocol(file_path):
        return file_path
    elif src_path is not None:
        return os.path.join(src_path, file_path)
    else:
        return file_path
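# Hedged usage sketch, not part of the original module: remote URLs pass through
# untouched, local relative paths are joined onto the dataset's source directory, and a
# missing source directory leaves the path unchanged. The literal paths are hypothetical.
def _example_get_abs_path_usage():  # pragma: no cover
    assert get_abs_path("/data/audio", "s3://bucket/clip.wav") == "s3://bucket/clip.wav"
    assert get_abs_path("/data/audio", "clip.wav") == os.path.join("/data/audio", "clip.wav")
    assert get_abs_path(None, "clip.wav") == "clip.wav"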
def _save_as_numpy(predictions, output_directory, saved_keys, backend):
    predictions = predictions[[c for c in predictions.columns if c not in saved_keys]]
    npy_filename = os.path.join(output_directory, "{}.npy")
    numpy_predictions = to_numpy_dataset(predictions, backend)
    for k, v in numpy_predictions.items():
        k = k.replace("<", "[").replace(">", "]")  # Replace <UNK> and <PAD> with [UNK], [PAD]
        if k not in saved_keys:
            if has_remote_protocol(output_directory):
                with open_file(npy_filename.format(make_safe_filename(k)), mode="wb") as f:
                    np.save(f, v)
            else:
                np.save(npy_filename.format(make_safe_filename(k)), v)
            saved_keys.add(k)
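# Hedged usage sketch, not part of the original module: each prediction column not yet
# in `saved_keys` is written to <output_directory>/<safe_column_name>.npy (locally or on
# remote storage) and then recorded in `saved_keys`, so repeated calls skip columns that
# were already saved. The dataframe, directory, and backend names below are hypothetical.
def _example_save_predictions(predictions_df, backend):  # pragma: no cover
    saved_keys = set()
    _save_as_numpy(predictions_df, "results/predictions", saved_keys, backend)
    # A second call with the same set writes nothing for the already-saved columns.
    _save_as_numpy(predictions_df, "results/predictions", saved_keys, backend)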
def execute(
    self,
    config,
    dataset=None,
    training_set=None,
    validation_set=None,
    test_set=None,
    training_set_metadata=None,
    data_format=None,
    experiment_name="hyperopt",
    model_name="run",
    # model_load_path=None,
    # model_resume_path=None,
    skip_save_training_description=False,
    skip_save_training_statistics=False,
    skip_save_model=False,
    skip_save_progress=False,
    skip_save_log=False,
    skip_save_processed_input=True,
    skip_save_unprocessed_output=False,
    skip_save_predictions=False,
    skip_save_eval_stats=False,
    output_directory="results",
    gpus=None,
    gpu_memory_limit=None,
    allow_parallel_threads=True,
    callbacks=None,
    backend=None,
    random_seed=default_random_seed,
    debug=False,
    **kwargs,
) -> RayTuneResults:
    if isinstance(dataset, str) and not has_remote_protocol(dataset) and not os.path.isabs(dataset):
        dataset = os.path.abspath(dataset)

    if isinstance(backend, str):
        backend = initialize_backend(backend)

    if gpus is not None:
        raise ValueError(
            "Parameter `gpus` is not supported when using Ray Tune. "
            "Configure GPU resources with Ray and set `gpu_resources_per_trial` in your "
            "hyperopt config."
        )

    if gpu_memory_limit is None and 0 < self._gpu_resources_per_trial_non_none < 1:
        # Enforce fractional GPU utilization
        gpu_memory_limit = self.gpu_resources_per_trial

    hyperopt_dict = dict(
        config=config,
        dataset=dataset,
        training_set=training_set,
        validation_set=validation_set,
        test_set=test_set,
        training_set_metadata=training_set_metadata,
        data_format=data_format,
        experiment_name=experiment_name,
        model_name=model_name,
        # model_load_path=model_load_path,
        # model_resume_path=model_resume_path,
        eval_split=self.split,
        skip_save_training_description=skip_save_training_description,
        skip_save_training_statistics=skip_save_training_statistics,
        skip_save_model=skip_save_model,
        skip_save_progress=skip_save_progress,
        skip_save_log=skip_save_log,
        skip_save_processed_input=skip_save_processed_input,
        skip_save_unprocessed_output=skip_save_unprocessed_output,
        skip_save_predictions=skip_save_predictions,
        skip_save_eval_stats=skip_save_eval_stats,
        output_directory=output_directory,
        gpus=gpus,
        gpu_memory_limit=gpu_memory_limit,
        allow_parallel_threads=allow_parallel_threads,
        callbacks=callbacks,
        backend=backend,
        random_seed=random_seed,
        debug=debug,
    )

    mode = "min" if self.goal != MAXIMIZE else "max"
    metric = "metric_score"
    if self.search_alg_dict is not None:
        if TYPE not in self.search_alg_dict:
            logger.warning("WARNING: Kindly set type param for search_alg to utilize Tune's Search Algorithms.")
            search_alg = None
        else:
            search_alg_type = self.search_alg_dict[TYPE]
            search_alg = tune.create_searcher(search_alg_type, metric=metric, mode=mode, **self.search_alg_dict)
    else:
        search_alg = None

    if self.max_concurrent_trials:
        assert (
            self.max_concurrent_trials > 0
        ), f"`max_concurrent_trials` must be greater than 0, got {self.max_concurrent_trials}"
        if isinstance(search_alg, BasicVariantGenerator) or search_alg is None:
            search_alg = BasicVariantGenerator(max_concurrent=self.max_concurrent_trials)
        elif isinstance(search_alg, ConcurrencyLimiter):
            raise ValueError(
                "You have specified `max_concurrent_trials`, but the search "
                "algorithm is already a `ConcurrencyLimiter`. FIX THIS "
                "by setting `max_concurrent_trials=None`."
            )
        else:
            search_alg = ConcurrencyLimiter(search_alg, max_concurrent=self.max_concurrent_trials)

    resources_per_trial = {
        "cpu": self._cpu_resources_per_trial_non_none,
        "gpu": self._gpu_resources_per_trial_non_none,
    }

    def run_experiment_trial(config, local_hyperopt_dict, checkpoint_dir=None):
        return self._run_experiment(
            config, checkpoint_dir, local_hyperopt_dict, self.decode_ctx, _is_ray_backend(backend)
        )

    tune_config = {}
    tune_callbacks = []
    for callback in callbacks or []:
        run_experiment_trial, tune_config = callback.prepare_ray_tune(
            run_experiment_trial,
            tune_config,
            tune_callbacks,
        )

    if _is_ray_backend(backend):
        # we can't set Trial actor's CPUs to 0 so we just go very low
        resources_per_trial = PlacementGroupFactory(
            [{"CPU": 0.001}] + ([{"CPU": 1, "GPU": 1}] * self._gpu_resources_per_trial_non_none)
            if self._gpu_resources_per_trial_non_none
            else [{"CPU": 0.001}] + [{"CPU": 1}] * self._cpu_resources_per_trial_non_none
        )

    if has_remote_protocol(output_directory):
        run_experiment_trial = tune.durable(run_experiment_trial)
        self.sync_config = tune.SyncConfig(sync_to_driver=False, upload_dir=output_directory)
        output_directory = None
    elif self.kubernetes_namespace:
        from ray.tune.integration.kubernetes import NamespacedKubernetesSyncer

        self.sync_config = tune.SyncConfig(sync_to_driver=NamespacedKubernetesSyncer(self.kubernetes_namespace))

    run_experiment_trial_params = tune.with_parameters(run_experiment_trial, local_hyperopt_dict=hyperopt_dict)
    register_trainable(f"trainable_func_f{hash_dict(config).decode('ascii')}", run_experiment_trial_params)

    analysis = tune.run(
        f"trainable_func_f{hash_dict(config).decode('ascii')}",
        config={
            **self.search_space,
            **tune_config,
        },
        scheduler=self.scheduler,
        search_alg=search_alg,
        num_samples=self.num_samples,
        keep_checkpoints_num=1,
        max_failures=1,  # retry a trial failure once
        resources_per_trial=resources_per_trial,
        time_budget_s=self.time_budget_s,
        sync_config=self.sync_config,
        local_dir=output_directory,
        metric=metric,
        mode=mode,
        trial_name_creator=lambda trial: f"trial_{trial.trial_id}",
        trial_dirname_creator=lambda trial: f"trial_{trial.trial_id}",
        callbacks=tune_callbacks,
    )

    if "metric_score" in analysis.results_df.columns:
        ordered_trials = analysis.results_df.sort_values("metric_score", ascending=self.goal != MAXIMIZE)

        # Catch nans in edge case where the trial doesn't complete
        temp_ordered_trials = []
        for kwargs in ordered_trials.to_dict(orient="records"):
            for key in ["parameters", "training_stats", "eval_stats"]:
                if isinstance(kwargs[key], float):
                    kwargs[key] = {}
            temp_ordered_trials.append(kwargs)

        # Trials w/empty eval_stats fields & non-empty training_stats fields ran intermediate
        # tune.report call(s) but were terminated before reporting eval_stats from post-train
        # evaluation (e.g., trial stopped due to time budget or relatively poor performance.)
        # For any such trials, run model evaluation for the best model in that trial & record
        # results in ordered_trials which is returned & is persisted in hyperopt_statistics.json.
        for trial in temp_ordered_trials:
            if trial["eval_stats"] == "{}" and trial["training_stats"] != "{}":
                # Evaluate the best model on the eval_split, which is validation_set
                if validation_set is not None and validation_set.size > 0:
                    trial_path = trial["trial_dir"]
                    best_model_path = self._get_best_model_path(trial_path, analysis)
                    if best_model_path is not None:
                        self._evaluate_best_model(
                            trial,
                            trial_path,
                            best_model_path,
                            validation_set,
                            data_format,
                            skip_save_unprocessed_output,
                            skip_save_predictions,
                            skip_save_eval_stats,
                            gpus,
                            gpu_memory_limit,
                            allow_parallel_threads,
                            backend,
                            debug,
                        )
                    else:
                        logger.warning("Skipping evaluation as no model checkpoints were available")
                else:
                    logger.warning("Skipping evaluation as no validation set was provided")

        ordered_trials = [TrialResults.from_dict(load_json_values(kwargs)) for kwargs in temp_ordered_trials]
    else:
        logger.warning("No trials reported results; check if time budget lower than epoch latency")
        ordered_trials = []

    return RayTuneResults(ordered_trials=ordered_trials, experiment_analysis=analysis)
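# Hedged sketch, not part of the original module: the executor attributes consumed by
# execute() above (num_samples, time_budget_s, cpu/gpu_resources_per_trial,
# max_concurrent_trials, search_alg) roughly correspond to a hyperopt config section
# like the one below; the exact schema keys can vary across Ludwig versions and the
# values are hypothetical.
_EXAMPLE_HYPEROPT_SECTION = {
    "goal": "minimize",
    "metric": "loss",
    "output_feature": "combined",
    "executor": {
        "type": "ray",
        "num_samples": 10,
        "time_budget_s": 3600,
        "cpu_resources_per_trial": 1,
        "gpu_resources_per_trial": 0.5,  # fractional GPU: gpu_memory_limit is derived above
        "max_concurrent_trials": 2,
    },
    "search_alg": {"type": "hyperopt"},
}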
def add_feature_data(
    feature_config, input_df, proc_df, metadata, preprocessing_parameters, backend, skip_save_processed_input
):
    set_default_value(feature_config["preprocessing"], "in_memory", preprocessing_parameters["in_memory"])

    name = feature_config[NAME]
    column = input_df[feature_config[COLUMN]]

    num_audio_files = len(column)
    if num_audio_files == 0:
        raise ValueError("There are no audio files in the dataset provided.")

    first_audio_entry = next(iter(column))
    logging.debug(f"Detected audio feature type is {type(first_audio_entry)}")

    if not isinstance(first_audio_entry, str) and not isinstance(first_audio_entry, torch.Tensor):
        raise ValueError(
            "Invalid audio feature data type. Detected type is {}, "
            "expected either string for local/remote file path or Torch Tensor.".format(type(first_audio_entry))
        )

    src_path = None
    if SRC in metadata:
        if isinstance(first_audio_entry, str) and not has_remote_protocol(first_audio_entry):
            src_path = os.path.dirname(os.path.abspath(metadata.get(SRC)))
    abs_path_column = backend.df_engine.map_objects(
        column, lambda row: get_abs_path(src_path, row) if isinstance(row, str) else row
    )

    num_audio_utterances = len(input_df[feature_config[COLUMN]])
    padding_value = preprocessing_parameters["padding_value"]
    normalization_type = preprocessing_parameters["norm"]

    feature_dim = metadata[name]["feature_dim"]
    max_length = metadata[name]["max_length"]
    audio_feature_dict = {
        key: value
        for key, value in preprocessing_parameters.items()
        if key in AUDIO_FEATURE_KEYS and value is not None
    }
    audio_file_length_limit_in_s = preprocessing_parameters["audio_file_length_limit_in_s"]

    if num_audio_utterances == 0:
        raise ValueError("There are no audio files in the dataset provided.")

    if feature_config[PREPROCESSING]["in_memory"]:
        audio_features = AudioFeatureMixin._process_in_memory(
            abs_path_column,
            audio_feature_dict,
            feature_dim,
            max_length,
            padding_value,
            normalization_type,
            audio_file_length_limit_in_s,
            backend,
        )
        proc_df[feature_config[PROC_COLUMN]] = audio_features
    else:
        backend.check_lazy_load_supported(feature_config)

    return proc_df
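# Hedged illustration, not part of the original module: the preprocessing parameters read
# above ("in_memory", "padding_value", "norm", "audio_file_length_limit_in_s") correspond
# to an audio input feature configuration roughly like the following; the feature name
# and values are hypothetical.
_EXAMPLE_AUDIO_FEATURE = {
    "name": "utterance",
    "type": "audio",
    "preprocessing": {
        "in_memory": True,  # False defers to backend.check_lazy_load_supported above
        "audio_file_length_limit_in_s": 7.5,
        "padding_value": 0.0,
        "norm": "per_file",
    },
}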
def execute(
    self,
    config,
    dataset=None,
    training_set=None,
    validation_set=None,
    test_set=None,
    training_set_metadata=None,
    data_format=None,
    experiment_name="hyperopt",
    model_name="run",
    resume=None,
    skip_save_training_description=False,
    skip_save_training_statistics=False,
    skip_save_model=False,
    skip_save_progress=False,
    skip_save_log=False,
    skip_save_processed_input=True,
    skip_save_unprocessed_output=False,
    skip_save_predictions=False,
    skip_save_eval_stats=False,
    output_directory="results",
    gpus=None,
    gpu_memory_limit=None,
    allow_parallel_threads=True,
    callbacks=None,
    backend=None,
    random_seed=default_random_seed,
    debug=False,
    hyperopt_log_verbosity=3,
    features_eligible_for_shared_params=None,
    **kwargs,
) -> RayTuneResults:
    if isinstance(dataset, str) and not has_remote_protocol(dataset) and not os.path.isabs(dataset):
        dataset = os.path.abspath(dataset)

    if isinstance(backend, str):
        backend = initialize_backend(backend)

    if gpus is not None:
        raise ValueError(
            "Parameter `gpus` is not supported when using Ray Tune. "
            "Configure GPU resources with Ray and set `gpu_resources_per_trial` in your "
            "hyperopt config."
        )

    if gpu_memory_limit is None and 0 < self._gpu_resources_per_trial_non_none < 1:
        # Enforce fractional GPU utilization
        gpu_memory_limit = self.gpu_resources_per_trial

    hyperopt_dict = dict(
        config=config,
        dataset=dataset,
        training_set=training_set,
        validation_set=validation_set,
        test_set=test_set,
        training_set_metadata=training_set_metadata,
        data_format=data_format,
        experiment_name=experiment_name,
        model_name=model_name,
        eval_split=self.split,
        skip_save_training_description=skip_save_training_description,
        skip_save_training_statistics=skip_save_training_statistics,
        skip_save_model=skip_save_model,
        skip_save_progress=skip_save_progress,
        skip_save_log=skip_save_log,
        skip_save_processed_input=skip_save_processed_input,
        skip_save_unprocessed_output=skip_save_unprocessed_output,
        skip_save_predictions=skip_save_predictions,
        skip_save_eval_stats=skip_save_eval_stats,
        output_directory=output_directory,
        gpus=gpus,
        gpu_memory_limit=gpu_memory_limit,
        allow_parallel_threads=allow_parallel_threads,
        callbacks=callbacks,
        backend=backend,
        random_seed=random_seed,
        debug=debug,
    )

    mode = "min" if self.goal != MAXIMIZE else "max"
    metric = "metric_score"
    # if random seed not set, use Ludwig seed
    self.search_algorithm.check_for_random_seed(random_seed)
    if self.search_algorithm.search_alg_dict is not None:
        if TYPE not in self.search_algorithm.search_alg_dict:
            candidate_search_algs = list(SEARCH_ALG_IMPORT.keys())
            logger.warning(
                "WARNING: search_alg type parameter missing, using 'variant_generator' as default. "
                f"These are possible values for the type parameter: {candidate_search_algs}."
            )
            search_alg = None
        else:
            search_alg_type = self.search_algorithm.search_alg_dict[TYPE]
            search_alg = tune.create_searcher(
                search_alg_type, metric=metric, mode=mode, **self.search_algorithm.search_alg_dict
            )
    else:
        search_alg = None

    if self.max_concurrent_trials:
        assert (
            self.max_concurrent_trials > 0
        ), f"`max_concurrent_trials` must be greater than 0, got {self.max_concurrent_trials}"
        if isinstance(search_alg, BasicVariantGenerator) or search_alg is None:
            search_alg = BasicVariantGenerator(max_concurrent=self.max_concurrent_trials)
        elif isinstance(search_alg, ConcurrencyLimiter):
            raise ValueError(
                "You have specified `max_concurrent_trials`, but the search "
                "algorithm is already a `ConcurrencyLimiter`. FIX THIS "
                "by setting `max_concurrent_trials=None`."
            )
        else:
            search_alg = ConcurrencyLimiter(search_alg, max_concurrent=self.max_concurrent_trials)

    resources_per_trial = {
        "cpu": self._cpu_resources_per_trial_non_none,
        "gpu": self._gpu_resources_per_trial_non_none,
    }

    def run_experiment_trial(config, local_hyperopt_dict, checkpoint_dir=None):
        return self._run_experiment(
            config,
            checkpoint_dir,
            local_hyperopt_dict,
            self.decode_ctx,
            features_eligible_for_shared_params,
            _is_ray_backend(backend),
        )

    tune_config = {}
    tune_callbacks = []
    for callback in callbacks or []:
        run_experiment_trial, tune_config = callback.prepare_ray_tune(
            run_experiment_trial,
            tune_config,
            tune_callbacks,
        )

    if _is_ray_backend(backend):
        # for now, we do not do distributed training on cpu (until spread scheduling is implemented for Ray Train)
        # but we do want to enable it when GPUs are specified
        resources_per_trial = PlacementGroupFactory(
            [{}] + ([{"CPU": 0, "GPU": 1}] * self._gpu_resources_per_trial_non_none)
            if self._gpu_resources_per_trial_non_none
            else [{}] + [{"CPU": self._cpu_resources_per_trial_non_none}]
        )

    if has_remote_protocol(output_directory):
        run_experiment_trial = tune.durable(run_experiment_trial)
        self.sync_config = tune.SyncConfig(sync_to_driver=False, upload_dir=output_directory)
        if _ray_114:
            self.sync_client = get_node_to_storage_syncer(SyncConfig(upload_dir=output_directory))
        else:
            self.sync_client = get_cloud_sync_client(output_directory)
        output_directory = None
    elif self.kubernetes_namespace:
        from ray.tune.integration.kubernetes import KubernetesSyncClient, NamespacedKubernetesSyncer

        self.sync_config = tune.SyncConfig(sync_to_driver=NamespacedKubernetesSyncer(self.kubernetes_namespace))
        self.sync_client = KubernetesSyncClient(self.kubernetes_namespace)

    run_experiment_trial_params = tune.with_parameters(run_experiment_trial, local_hyperopt_dict=hyperopt_dict)
    register_trainable(f"trainable_func_f{hash_dict(config).decode('ascii')}", run_experiment_trial_params)

    # Note that resume="AUTO" will attempt to resume the experiment if possible, and
    # otherwise will start a new experiment:
    # https://docs.ray.io/en/latest/tune/tutorials/tune-stopping.html
    should_resume = "AUTO" if resume is None else resume

    try:
        analysis = tune.run(
            f"trainable_func_f{hash_dict(config).decode('ascii')}",
            name=experiment_name,
            config={
                **self.search_space,
                **tune_config,
            },
            scheduler=self.scheduler,
            search_alg=search_alg,
            num_samples=self.num_samples,
            keep_checkpoints_num=1,
            max_failures=1,  # retry a trial failure once
            resources_per_trial=resources_per_trial,
            time_budget_s=self.time_budget_s,
            sync_config=self.sync_config,
            local_dir=output_directory,
            metric=metric,
            mode=mode,
            trial_name_creator=lambda trial: f"trial_{trial.trial_id}",
            trial_dirname_creator=lambda trial: f"trial_{trial.trial_id}",
            callbacks=tune_callbacks,
            stop=CallbackStopper(callbacks),
            verbose=hyperopt_log_verbosity,
            resume=should_resume,
            log_to_file=True,
        )
    except Exception as e:
        # Explicitly raise a RuntimeError if an error is encountered during a Ray trial.
        # NOTE: Cascading the exception with "raise _ from e" still results in hanging.
        raise RuntimeError(f"Encountered Ray Tune error: {e}")

    if "metric_score" in analysis.results_df.columns:
        ordered_trials = analysis.results_df.sort_values("metric_score", ascending=self.goal != MAXIMIZE)

        # Catch nans in edge case where the trial doesn't complete
        temp_ordered_trials = []
        for kwargs in ordered_trials.to_dict(orient="records"):
            for key in ["parameters", "training_stats", "eval_stats"]:
                if isinstance(kwargs[key], float):
                    kwargs[key] = {}
            temp_ordered_trials.append(kwargs)

        # Trials w/empty eval_stats fields & non-empty training_stats fields ran intermediate
        # tune.report call(s) but were terminated before reporting eval_stats from post-train
        # evaluation (e.g., trial stopped due to time budget or relatively poor performance.)
        # For any such trials, run model evaluation for the best model in that trial & record
        # results in ordered_trials which is returned & is persisted in hyperopt_statistics.json.
        for trial in temp_ordered_trials:
            if trial["eval_stats"] == "{}" and trial["training_stats"] != "{}":
                # Evaluate the best model on the eval_split, which is validation_set
                if validation_set is not None and validation_set.size > 0:
                    trial_path = trial["trial_dir"]
                    best_model_path = self._get_best_model_path(trial_path, analysis)
                    if best_model_path is not None:
                        self._evaluate_best_model(
                            trial,
                            trial_path,
                            best_model_path,
                            validation_set,
                            data_format,
                            skip_save_unprocessed_output,
                            skip_save_predictions,
                            skip_save_eval_stats,
                            gpus,
                            gpu_memory_limit,
                            allow_parallel_threads,
                            backend,
                            debug,
                        )
                    else:
                        logger.warning("Skipping evaluation as no model checkpoints were available")
                else:
                    logger.warning("Skipping evaluation as no validation set was provided")

        ordered_trials = [TrialResults.from_dict(load_json_values(kwargs)) for kwargs in temp_ordered_trials]
    else:
        logger.warning("No trials reported results; check if time budget lower than epoch latency")
        ordered_trials = []

    return RayTuneResults(ordered_trials=ordered_trials, experiment_analysis=analysis)
def add_feature_data(
    feature_config, input_df, proc_df, metadata, preprocessing_parameters, backend, skip_save_processed_input
):
    set_default_value(feature_config[PREPROCESSING], "in_memory", preprocessing_parameters["in_memory"])

    name = feature_config[NAME]
    column = input_df[feature_config[COLUMN]]

    src_path = None
    if SRC in metadata:
        src_path = os.path.dirname(os.path.abspath(metadata.get(SRC)))
    abs_path_column = backend.df_engine.map_objects(
        column,
        lambda row: get_abs_path(src_path, row) if isinstance(row, str) and not has_remote_protocol(row) else row,
    )

    (
        should_resize,
        width,
        height,
        num_channels,
        user_specified_num_channels,
    ) = ImageFeatureMixin._finalize_preprocessing_parameters(preprocessing_parameters, abs_path_column)

    metadata[name][PREPROCESSING]["height"] = height
    metadata[name][PREPROCESSING]["width"] = width
    metadata[name][PREPROCESSING]["num_channels"] = num_channels

    read_image_if_bytes_obj_and_resize = partial(
        ImageFeatureMixin._read_image_if_bytes_obj_and_resize,
        img_width=width,
        img_height=height,
        should_resize=should_resize,
        num_channels=num_channels,
        resize_method=preprocessing_parameters["resize_method"],
        user_specified_num_channels=user_specified_num_channels,
    )

    # TODO: alternatively use get_average_image() for unreachable images
    default_image = get_gray_default_image(num_channels, height, width)

    # check to see if the active backend can support lazy loading of
    # image features from the hdf5 cache.
    backend.check_lazy_load_supported(feature_config)

    in_memory = feature_config[PREPROCESSING]["in_memory"]
    if in_memory or skip_save_processed_input:
        metadata[name]["reshape"] = (num_channels, height, width)

        proc_col = backend.read_binary_files(abs_path_column, map_fn=read_image_if_bytes_obj_and_resize)
        proc_col = backend.df_engine.map_objects(proc_col, lambda row: row if row is not None else default_image)
        proc_df[feature_config[PROC_COLUMN]] = proc_col
    else:
        num_images = len(abs_path_column)

        data_fp = backend.cache.get_cache_path(wrap(metadata.get(SRC)), metadata.get(CHECKSUM), TRAINING)
        with upload_h5(data_fp) as h5_file:
            # todo future add multiprocessing/multithreading
            image_dataset = h5_file.create_dataset(
                feature_config[PROC_COLUMN] + "_data", (num_images, num_channels, height, width), dtype=np.uint8
            )
            for i, img_entry in enumerate(abs_path_column):
                res = read_image_if_bytes_obj_and_resize(img_entry)
                # store the processed channels-first image, falling back to the default gray image
                image_dataset[i, :, :, :] = res if res is not None else default_image
            h5_file.flush()

        proc_df[feature_config[PROC_COLUMN]] = np.arange(num_images)

    return proc_df