def data_imputation(self, which_sets=None):
    """Impute null values in the datasets named by `which_sets`, using the 'imputer' value in
    :attr:`preprocessing_params`"""
    imputer = self.preprocessing_params.get('imputer', None)
    which_sets = which_sets if which_sets else self.fit_input_sets

    for data_key in which_sets:
        data = self.__getattribute__(data_key)

        if data is not None:
            if callable(imputer):
                # Apply Function to Impute Data
                # TODO: Send either "self" or all attributes in self as other input to "imputer"
                # TODO: Force callable "imputer" to have **kwargs, or check for the args it expects and send only those
                self.__setattr__(data_key, imputer(data))
            elif isinstance(imputer, (int, float)):
                # Fill Null Data With Given Value
                self.__setattr__(data_key, data.fillna(imputer))

    G.log('Completed data_imputation preprocessing')
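# --- Hypothetical usage sketch (not part of the library) --------------------
# Illustrates the two `imputer` forms accepted by `data_imputation` above,
# under the assumption that `preprocessing_params["imputer"]` is either a
# scalar fill value or a callable that receives a DataFrame:
import numpy as np
import pandas as pd

_df = pd.DataFrame({"a": [1.0, np.nan, 3.0]})

# Scalar imputer: nulls are filled directly, as in `data.fillna(imputer)`
print(_df.fillna(-1))

# Callable imputer: receives the dataset and returns the imputed copy
def _median_imputer(data):
    return data.fillna(data.median())

print(_median_imputer(_df))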
def fit(self): """Train model according to :attr:`extra_params['fit']` (if appropriate) on training data""" try: self.model_history = self.model.fit(self.train_input, self.train_target) except Exception as _ex: G.warn(f"KerasModel.fit() failed with Exception: {_ex}\nAttempting standard fit method") super().fit() finally: #################### Record Epochs Elapsed if Model has 'epoch' Attribute #################### with suppress(AttributeError): # self.epochs_elapsed = len(self.model.epoch) self.epochs_elapsed = len(self.model_history.epoch) #################### Load Model Checkpoint if Possible #################### for callback in self.extra_params.get("callbacks", []): if callback.__class__.__name__ == "ModelCheckpoint": self.model.model.load_weights(callback.filepath)
def on_fold_end(self):
    """Log the fold's evaluation results and elapsed time"""
    content = format_fold_run(rep=self._rep, fold=self._fold, run="-")
    content += self.log_separator if not content.endswith(" ") else ""
    content += format_evaluation(self.last_evaluation_results, float_format=self.float_format)
    content += self.log_separator if not content.endswith(" ") else ""
    content += self.__elapsed_helper("folds")

    if G.Env.verbose >= 2 and G.Env.cv_params["n_splits"] > 1:
        G.log(content, previous_frame=inspect.currentframe().f_back, add_time=False)
    else:
        G.debug(content, previous_frame=inspect.currentframe().f_back, add_time=False)

    super().on_fold_end()
def experiment_workflow(self): """Define the actual experiment process, including execution, result saving, and cleanup""" if self.hyperparameter_key.exists is True: _ex = F'{self!r} has already been run' if self.do_raise_repeated is True: self._clean_up() raise RepeatedExperimentError(_ex) G.warn(_ex) self._initialize_random_seeds() self._initial_preprocessing() self.execute() recorders = RecorderList(file_blacklist=G.Env.file_blacklist) recorders.format_result() G.log(F'Saving results for Experiment: "{self.experiment_id}"') recorders.save_result() self._clean_up()
def on_run_start(self):
    """Log the run's fold/run position and random seed"""
    content = format_fold_run(rep=self._rep, fold=self._fold, run=self._run)
    content += self.log_separator if content != "" and self.current_seed else ""
    content += "Seed: {}".format(self.current_seed) if self.current_seed else ""

    if G.Env.verbose >= 4 and G.Env.runs > 1:
        G.log(content, previous_frame=inspect.currentframe().f_back, add_time=True)
    else:
        G.debug(content, previous_frame=inspect.currentframe().f_back, add_time=True)

    super().on_run_start()
def _optimization_loop(self, iteration=0): """Perform Experiment execution loop while `iteration` < `iterations`. At each iteration, an Experiment will be executed, its results will be logged, and it will be compared to the current best experiment Parameters ---------- iteration: Int, default=0 The current iteration in the optimization loop""" self.logger.print_optimization_header() while iteration < self.iterations: try: self._execute_experiment() except RepeatedExperimentError: # G.debug_(F'Skipping repeated Experiment: {_ex!s}\n') if len(self.similar_experiments) + len(self.tested_keys) >= self.search_space_size: G.log_(f"Hyperparameter search space has been exhausted") break self.skipped_iterations += 1 continue except StopIteration: if len(self.similar_experiments) + len(self.tested_keys) >= self.search_space_size: G.log_(f"Hyperparameter search space has been exhausted") break # G.debug_(f'Re-initializing hyperparameter grid after testing {len(self.tested_keys)} keys') self._set_hyperparameter_space() continue self.logger.print_result( self.current_hyperparameters_list, self.current_score, experiment_id=self.current_experiment.experiment_id, ) if ( (self.best_experiment is None) # First evaluation or (self.do_maximize and (self.best_score < self.current_score)) # New best max or (not self.do_maximize and (self.best_score > self.current_score)) # New best min ): self.best_experiment = self.current_experiment.experiment_id self.best_score = self.current_score iteration += 1
def _update_model_params(self):
    """Update random state of :attr:`model_init_params` according to :attr:`current_seed`"""
    # TODO: Add this to some workflow in Experiment class. For now it is never used, unless the subclass decides to...
    # `model_init_params` initialized to all algorithm hyperparameters - Works even if 'random_state' not explicitly given
    try:
        if 'random_state' in self.model_init_params:
            self.model_init_params['random_state'] = self.current_seed
        elif 'seed' in self.model_init_params:
            self.model_init_params['seed'] = self.current_seed
        else:
            G.log('Model has no random_state/seed parameter to update')
            # FLAG: HIGH PRIORITY BELOW
            # TODO: BELOW IS NOT THE CASE IF MODEL IS NN - SETTING THE GLOBAL RANDOM SEED DOES SOMETHING
            # TODO: If this is logged, there is no reason to execute multiple-run-averaging, so don't
            # TODO: ... Either 1) Set `runs` = 1 (this would mess with the environment key), or...
            # TODO: ... 2) Set the results of all subsequent runs to the results of the first run (this could be difficult)
            # FLAG: HIGH PRIORITY ABOVE
    except Exception as _ex:
        G.log("Failed to update model's random_state {}".format(_ex.__repr__()))
def _initial_preprocessing(self): """Perform preprocessing steps prior to executing fitting protocol (usually cross-validation), consisting of: 1) Split train/holdout data into respective train/holdout input and target data attributes, 2) Feature selection on input data sets, 3) Set target datasets to target_column contents, 4) Initialize PreprocessingPipeline to perform core preprocessing, 5) Set datasets to their (modified) counterparts in PreprocessingPipeline, 6) Log whether datasets changed""" #################### Preprocessing #################### # preprocessor = PreprocessingPipelineMixIn( # pipeline=[], preprocessing_params=dict(apply_standard_scale=True), features=self.features, # target_column=self.target_column, train_input_data=self.train_input_data, # train_target_data=self.train_target_data, holdout_input_data=self.holdout_input_data, # holdout_target_data=self.holdout_target_data, test_input_data=self.test_input_data, # fitting_guide=None, fail_gracefully=False, preprocessing_stage='infer' # ) # # # TODO: Switch from below direct calls to preprocessor.execute_pipeline() call # # TODO: After calling execute_pipeline(), set data attributes to their counterparts in preprocessor class # preprocessor.data_imputation() # preprocessor.target_data_transformation() # preprocessor.data_scaling() # # for dataset_name in preprocessor.all_input_sets + preprocessor.all_target_sets: # old_val, new_val = getattr(self, dataset_name), getattr(preprocessor, dataset_name) # G.log('Dataset: "{}" {} updated'.format(dataset_name, 'was not' if old_val.equals(new_val) else 'was')) # setattr(self, dataset_name, new_val) self.train_input_data = self.train_dataset.copy().loc[:, self. feature_selector] self.train_target_data = self.train_dataset.copy().loc[:, self. target_column] if isinstance(self.holdout_dataset, pd.DataFrame): self.holdout_input_data = self.holdout_dataset.copy( ).loc[:, self.feature_selector] self.holdout_target_data = self.holdout_dataset.copy( ).loc[:, self.target_column] if isinstance(self.test_dataset, pd.DataFrame): self.test_input_data = self.test_dataset.copy( ).loc[:, self.feature_selector] G.log("Initial preprocessing stage complete", 4)
def save_key(self): """Create an entry in the dict contained in the file at :attr:`cross_experiment_key.key`, whose key is :attr:`key`, and whose value is an empty list if :attr:`exists` is False""" if not self.exists: if self.cross_experiment_key.exists is False: _err = "Cannot save hyperparameter_key: '{}', before cross_experiment_key '{}'" raise ValueError( _err.format(self.key, self.cross_experiment_key.key)) key_path = f"{self.tested_keys_dir}/{self.cross_experiment_key.key}.json" add_to_json(key_path, [], key=self.key, condition=lambda _: self.key not in _.keys()) self.exists = True G.log(f'Saved {self.key_type}_key: "{self.key}"', 4) else: G.log( f'{self.key_type}_key "{self.key}" already exists - Skipped saving', 4)
def save_result(self): """Execute :meth:`save_result` for all classes in :attr:`recorders` Notes ----- When iterating through :attr:`recorders` and calling :meth:`save_result`, a check is performed for `exit_code`. Children classes of :class:`BaseRecorder` are NOT expected to explicitly return a value in their :meth:`save_result`. However, if a value is returned and `exit_code` == 'break', the result-saving loop will be broken, and no further results will be saved. In practice, this is only performed for the sake of :meth:`DescriptionRecorder.save_result`, which has the additional quality of being able to prevent any other result files from being saved if the result of :func:`DescriptionRecorder.do_full_save` returns False when given the formatted :attr:`DescriptionRecorder.result`. This can be useful when there are storage constraints, because it ensures that essential data - including keys and the results of the experiment - are saved (to ensure the experiment is not duplicated, and to provide some utility to Hyperparameter Optimization algorithms), while extra results like Predictions are not saved.""" for recorder in self.recorders: G.log(F'Saving result file for "{type(recorder).__name__}"') exit_code = recorder.save_result() if exit_code and exit_code == 'break': break
def save_result(self): """Save the Experiment description as a .json file, named after :attr:`experiment_id`. If :attr:`do_full_save` is a callable and returns False when given the description object, the result recording loop will be broken, and the remaining result files will not be saved Returns ------- 'break' This string will be returned if :attr:`do_full_save` is a callable and returns False when given the description object. This is the signal for :class:`recorders.RecorderList` to stop recording result files""" try: write_json(f"{self.result_path}/{self.experiment_id}.json", self.result, do_clear=False) except FileNotFoundError: make_dirs(self.result_path, exist_ok=False) write_json(f"{self.result_path}/{self.experiment_id}.json", self.result, do_clear=False) if (self.do_full_save is not None) and (not self.do_full_save(self.result)): G.warn("Breaking result-saving loop early! Remaining result files will not be saved") return "break"
def cross_validation_workflow(self): """Execute workflow for cross-validation process, consisting of the following tasks: 1) Create train and validation split indices for all folds, 2) Iterate through folds, performing cv_fold_workflow for each, 3) Average accumulated predictions over fold splits, 4) Evaluate final predictions, 5) Format final predictions to prepare for saving""" self.on_experiment_start() cv_indices = self.folds.split(self.train_input_data, self.train_target_data.iloc[:, 0]) new_shape = (self.cross_validation_params.get('n_repeats', 1), self.cross_validation_params['n_splits'], 2) reshaped_indices = np.reshape(np.array(list(cv_indices)), new_shape) for self._rep, repetition_indices in enumerate(reshaped_indices.tolist()): self.on_repetition_start() for self._fold, (self.train_index, self.validation_index) in enumerate(repetition_indices): self.cv_fold_workflow() self.on_repetition_end() self.on_experiment_end() G.log('')
def _initial_preprocessing(self): """Perform preprocessing steps prior to executing fitting protocol (usually cross-validation), consisting of: 1) Split train/holdout data into respective train/holdout input and target data attributes, 2) Execute `feature_engineer` to perform "pre_cv"-stage preprocessing, 3) Set datasets to their (modified) counterparts in `feature_engineer`""" self.train_input_data = self.train_dataset.copy().loc[:, self. feature_selector] self.train_target_data = self.train_dataset.copy().loc[:, self. target_column] if isinstance(self.holdout_dataset, pd.DataFrame): self.holdout_input_data = self.holdout_dataset.copy( ).loc[:, self.feature_selector] self.holdout_target_data = self.holdout_dataset.copy( ).loc[:, self.target_column] if isinstance(self.test_dataset, pd.DataFrame): self.test_input_data = self.test_dataset.copy( ).loc[:, self.feature_selector] if self.feature_engineer and callable(self.feature_engineer): self.feature_engineer( "pre_cv", train_inputs=self.train_input_data, train_targets=self.train_target_data, holdout_inputs=self.holdout_input_data, holdout_targets=self.holdout_target_data, test_inputs=self.test_input_data, ) self.train_input_data = self.feature_engineer.datasets[ "train_inputs"] self.train_target_data = self.feature_engineer.datasets[ "train_targets"] self.holdout_input_data = self.feature_engineer.datasets[ "holdout_inputs"] self.holdout_target_data = self.feature_engineer.datasets[ "holdout_targets"] self.test_input_data = self.feature_engineer.datasets[ "test_inputs"] G.log("Initial preprocessing stage complete", 4)
def go(self): """Begin hyperparameter optimization process after experiment guidelines have been set and search dimensions are in place. This process includes the following: setting the hyperparameter space; locating similar experiments to be used as learning material for :class:`SKOptimizationProtocol` s; and executing :meth:`_optimization_loop`, which actually sets off the Experiment execution process""" if self.model_initializer is None: raise ValueError("Experiment guidelines must be set before starting optimization") _reporter_params = dict(dict(do_maximize=self.do_maximize), **self.reporter_parameters) self.logger = OptimizationReporter([_.name for _ in self.dimensions], **_reporter_params) self.tested_keys = [] self._set_hyperparameter_space() self._find_similar_experiments() loop_start_time = datetime.now() self._optimization_loop() loop_end_time = datetime.now() G.log_(f"Optimization loop completed in {loop_end_time - loop_start_time}") G.log_(f'Best score was {self.best_score} from Experiment "{self.best_experiment}"')
def find(self): """Execute full result-finding workflow""" self._get_ids() G.debug_( f"Experiments matching cross-experiment key/algorithm: {len(self.experiment_ids)}" ) self._get_scored_params() self._filter_by_space() G.debug_( f"Experiments fitting in the given space: {len(self.hyperparameters_and_scores)}" ) if self.module_name == "keras": multi_targets = [("model_init_params", "compile_params", "optimizer")] if multi_targets[0] in self.space.names(): self._filter_by_guidelines_multi(multi_targets[0]) else: self._filter_by_guidelines() else: self._filter_by_guidelines() #################### Post-Process Similar Experiments #################### self._reinitialize_similar_experiments() G.debug_( f"Experiments matching current guidelines: {len(self.similar_experiments)}" )
def _get_current_hyperparameters(self):
    """Ask :attr:`optimizer` for the upcoming set of hyperparameters that should be searched,
    then format them to be used in the next Experiment

    Returns
    -------
    current_hyperparameters: Dict
        The next set of hyperparameters that will be searched"""
    _current_hyperparameters = self.optimizer.ask()

    if _current_hyperparameters == self.current_hyperparameters_list:
        new_parameters = self.space.rvs(random_state=None)[0]
        G.debug_("REPEATED asked={} new={}".format(_current_hyperparameters, new_parameters))
        _current_hyperparameters = new_parameters

    self.current_hyperparameters_list = _current_hyperparameters

    current_hyperparameters = dict(
        zip(self.space.names(use_location=False), self.current_hyperparameters_list)
    )

    return current_hyperparameters
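# --- Hypothetical sketch (not part of the library) --------------------------
# The final formatting step above, in isolation: dimension names are zipped
# with the ordered list of values suggested by the optimizer. The names and
# values below are illustrative only.
_names = ["max_depth", "learning_rate", "subsample"]
_asked = [5, 0.1, 0.8]
_current_hyperparameters = dict(zip(_names, _asked))
print(_current_hyperparameters)  # {'max_depth': 5, 'learning_rate': 0.1, 'subsample': 0.8}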
def on_fold_end(self):
    """Average the fold's run-level `epochs_elapsed` values into :attr:`stat_aggregates`"""
    # G.log('AggregatorEpochsElapsed.on_fold_end()')
    rep_key, fold_key = 'rep_{}'.format(self._rep), 'fold_{}'.format(self._fold)

    #################### Simple Average of Fold's Runs ####################
    try:
        self.stat_aggregates['epochs_elapsed'][fold_key]['simple_average'] = np.average(
            self.stat_aggregates['epochs_elapsed'][fold_key]['run_values']
        )
    except KeyError:
        # self.stat_aggregates does not have 'epochs_elapsed' key - epochs never recorded in first place
        pass
    except TypeError:
        G.warn('\n'.join([
            'TypeError encountered when averaging stat_aggregates[{}][{}]:'.format('epochs_elapsed', fold_key),
            '\tValues: {}'.format(self.stat_aggregates['epochs_elapsed'][fold_key]['run_values']),
            '\tTypes: {}'.format([type(_) for _ in self.stat_aggregates['epochs_elapsed'][fold_key]['run_values']]),
            'If the above values are numbers and you want them averaged, fix me ASAP! If not, ignore me'
        ]))

    super().on_fold_end()
def on_exp_start(self): """Prepare data prior to executing fitting protocol (cross-validation), by 1) Initializing formal :mod:`~hyperparameter_hunter.data.datasets` attributes, 2) Invoking `feature_engineer` to perform "pre_cv"-stage preprocessing, and 3) Updating datasets to include their (transformed) counterparts in `feature_engineer`""" #################### Build Datasets #################### data_kwargs = dict(feature_selector=self.feature_selector, target_column=self.target_column) self.data_train = TrainDataset(self.train_dataset, require_data=True, **data_kwargs) # TODO: Might be better to initialize `data_oof` with same data as `data_train` self.data_oof = OOFDataset(None, **data_kwargs) self.data_holdout = HoldoutDataset(self.holdout_dataset, **data_kwargs) self.data_test = TestDataset(self.test_dataset, feature_selector=self.feature_selector) #################### Perform Pre-CV Feature Engineering #################### self.feature_engineer( "pre_cv", train_inputs=deepcopy(self.data_train.input.d), train_targets=deepcopy(self.data_train.target.d), holdout_inputs=deepcopy(self.data_holdout.input.d), holdout_targets=deepcopy(self.data_holdout.target.d), test_inputs=deepcopy(self.data_test.input.d), ) self.data_train.input.T.d = self.feature_engineer.datasets[ "train_inputs"] self.data_train.target.T.d = self.feature_engineer.datasets[ "train_targets"] self.data_holdout.input.T.d = self.feature_engineer.datasets[ "holdout_inputs"] self.data_holdout.target.T.d = self.feature_engineer.datasets[ "holdout_targets"] self.data_test.input.T.d = self.feature_engineer.datasets[ "test_inputs"] G.log("Initial preprocessing stage complete", 4) super().on_exp_start()
def _optimization_loop(self, iteration=0): """Perform Experiment execution loop while `iteration` < `iterations`. At each iteration, an Experiment will be executed, its results will be logged, and it will be compared to the current best experiment Parameters ---------- iteration: Int, default=0 The current iteration in the optimization loop""" self.logger.print_optimization_header() while iteration < self.iterations: try: self._execute_experiment() except RepeatedExperimentError: # G.debug_(F'Skipping repeated Experiment: {_ex!s}\n') self.skipped_iterations += 1 continue except StopIteration: if len(self.tested_keys) >= self.search_space_size: G.log_( F'Hyperparameter search space has been exhausted after testing {len(self.tested_keys)} keys' ) break # G.debug_(F'Re-initializing hyperparameter grid after testing {len(self.tested_keys)} keys') self._set_hyperparameter_space() continue # TODO: :attr:`current_hyperparameters_list` only exists in Informed Protocols self.logger.print_result( self.current_hyperparameters_list, self.current_score, experiment_id=self.current_experiment.experiment_id) if (self.best_experiment is None) or (self.current_score > self.best_score): self.best_experiment = self.current_experiment.experiment_id self.best_score = self.current_score iteration += 1
def cross_validation_workflow(self): """Execute workflow for cross-validation process, consisting of the following tasks: 1) Create train and validation split indices for all folds, 2) Iterate through folds, performing `cv_fold_workflow` for each, 3) Average accumulated predictions over fold splits, 4) Evaluate final predictions, 5) Format final predictions to prepare for saving""" self.on_experiment_start() reshaped_indices = get_cv_indices(self.folds, self.cv_params, self.train_input_data, self.train_target_data.iloc[:, 0]) for self._rep, rep_indices in enumerate(reshaped_indices): self.on_repetition_start() for self._fold, (self.train_index, self.validation_index) in enumerate(rep_indices): self.cv_fold_workflow() self.on_repetition_end() self.on_experiment_end() G.log("")
def cv_fold_workflow(self): """Execute workflow for individual fold, consisting of the following tasks: Execute overridden :meth:`on_fold_start` tasks, 2) Perform cv_run_workflow for each run, 3) Execute overridden :meth:`on_fold_end` tasks""" self.on_fold_start() if self.feature_engineer and callable(self.feature_engineer): self.feature_engineer( "intra_cv", train_inputs=self.fold_train_input, train_targets=self.fold_train_target, validation_inputs=self.fold_validation_input, validation_targets=self.fold_validation_target, holdout_inputs=self.fold_holdout_input, holdout_targets=self.fold_holdout_target, test_inputs=self.fold_test_input, ) self.fold_train_input = self.feature_engineer.datasets[ "train_inputs"] self.fold_train_target = self.feature_engineer.datasets[ "train_targets"] self.fold_validation_input = self.feature_engineer.datasets[ "validation_inputs"] self.fold_validation_target = self.feature_engineer.datasets[ "validation_targets"] self.fold_holdout_input = self.feature_engineer.datasets[ "holdout_inputs"] self.fold_holdout_target = self.feature_engineer.datasets[ "holdout_targets"] self.fold_test_input = self.feature_engineer.datasets[ "test_inputs"] G.log("Intra-CV preprocessing stage complete", 4) for self._run in range(self.experiment_params.get("runs", 1)): self.cv_run_workflow() self.on_fold_end()
def update_custom_environment_params(self):
    """Try to update null parameters from environment_params_path, or DEFAULT_PARAMS"""
    allowed_parameter_keys = [
        k for k, v in signature(Environment).parameters.items() if v.kind == v.KEYWORD_ONLY
    ]
    user_defaults = {}

    try:
        user_defaults = read_json(self.environment_params_path)
    except (TypeError, OSError):
        # If `environment_params_path=None`, no error raised - `user_defaults` continues as {}
        if self.environment_params_path is not None:
            raise

    if not isinstance(user_defaults, dict):
        raise TypeError("environment_params_path must contain a dict, not {}".format(user_defaults))

    #################### Check user_defaults ####################
    for k, v in user_defaults.items():
        if k not in allowed_parameter_keys:
            G.warn(f"Invalid key ({k}) in user Environment parameters: {self.environment_params_path}")
        elif getattr(self, k) is None:
            setattr(self, k, v)
            G.debug(f"Environment.`{k}` set to user default: '{self.environment_params_path}'")

    #################### Check Module Default Environment Arguments ####################
    for k in allowed_parameter_keys:
        if getattr(self, k) is None:
            setattr(self, k, self.DEFAULT_PARAMS.get(k, None))
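# --- Hypothetical sketch (not part of the library) --------------------------
# Demonstrates how the keyword-only parameter names are collected above via
# `inspect.signature`. `_example_env` stands in for `Environment.__init__`;
# its parameters are illustrative only.
from inspect import signature

def _example_env(train_dataset, *, results_path=None, metrics=None, cv_params=None):
    pass

_allowed = [k for k, v in signature(_example_env).parameters.items() if v.kind == v.KEYWORD_ONLY]
print(_allowed)  # ['results_path', 'metrics', 'cv_params']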
def _create_script_backup(self): """Create and save a copy of the script that initialized the Experiment if allowed to, and if :attr:`source_script` ends with a ".py" extension""" #################### Attempt to Copy Source Script if Allowed #################### try: if not self.source_script.endswith(".py"): G.Env.result_paths["script_backup"] = None if G.Env.result_paths["script_backup"] is not None: try: self._source_copy_helper() except FileNotFoundError: make_dirs(self.result_paths["script_backup"], exist_ok=False) self._source_copy_helper() G.log( "Created source backup: '{}'".format(self.source_script), 4) else: G.log( "Skipped source backup: '{}'".format(self.source_script), 4) #################### Exception Handling #################### except AttributeError as _ex: if G.Env is None: raise EnvironmentInactiveError(extra="\n{!s}".format(_ex)) if not hasattr(G.Env, "result_paths"): raise EnvironmentInvalidError( extra=f"G.Env lacks 'result_paths' attr\n{_ex!s}") raise except KeyError as _ex: if "script_backup" not in G.Env.result_paths: raise EnvironmentInvalidError( extra= f"G.Env.result_paths lacks 'script_backup' key\n{_ex!s}") raise
def _create_script_backup(self): """Create and save a copy of the script that initialized the Experiment""" #################### Attempt to Copy Source Script if Allowed #################### try: if G.Env.result_paths['script_backup'] is not None: try: shutil.copyfile(self.source_script, F'{self.result_paths["script_backup"]}/{self.experiment_id}.py') except FileNotFoundError: os.makedirs(self.result_paths["script_backup"], exist_ok=False) shutil.copyfile(self.source_script, F'{self.result_paths["script_backup"]}/{self.experiment_id}.py') G.log('Created backup of file: "{}"'.format(self.source_script)) else: G.log('Skipped creating backup of file: "{}"'.format(self.source_script)) #################### Exception Handling #################### except AttributeError as _ex: if G.Env is None: raise EnvironmentInactiveError(extra='\n{!s}'.format(_ex)) if not hasattr(G.Env, 'result_paths'): raise EnvironmentInvalidError(extra='G.Env lacks "result_paths" attribute\n{!s}'.format(_ex)) raise except KeyError as _ex: if 'script_backup' not in G.Env.result_paths: raise EnvironmentInvalidError(extra='G.Env.result_paths lacks "script_backup" key\n{!s}'.format(_ex)) raise
def _ask(self): """Suggest next point at which to evaluate the objective Returns ------- Some point in :attr:`space`, which is random while less than `n_initial_points` observations have been `tell`-ed. After that, `base_estimator` is used to determine the next point Notes ----- If the suggested point has already been evaluated, a random point will be returned instead, optionally accompanied by a warning message (depending on :attr:`warn_on_re_ask`)""" if self._n_initial_points > 0 or self.base_estimator is None: # Does not copy `self.rng` in order to keep advancing random state return self.space.rvs(random_state=self.rng)[0] else: if not self.models: raise RuntimeError( "Random evaluations exhausted and no model has been fit") #################### Check for Repeated Suggestion #################### next_x = self._next_x # Check distances between `next_x` and all evaluated points min_delta_x = min( [self.space.distance(next_x, xi) for xi in self.Xi]) if abs(min_delta_x) <= 1e-8: # `next_x` has already been evaluated if self.warn_on_re_ask: G.warn_("Repeated suggestion: {}".format(next_x)) # Set `_next_x` to random point, then re-invoke `_ask` to validate new point self._next_x = self.space.rvs(random_state=self.rng)[0] return self._ask() # Return point computed from last call to `tell` return next_x
def save_key(self): """Create an entry in the dict corresponding to the file at :attr:`cross_experiment_key.key`, whose key is :attr:`key`, and whose value is an empty list if :attr:`exists` is False""" if not self.exists: if self.cross_experiment_key.exists is False: raise ValueError( 'Cannot save hyperparameter_key: "{}", before cross_experiment_key "{}" has been saved' .format(self.key, self.cross_experiment_key.key)) key_path = f"{self.tested_keys_dir}/{self.cross_experiment_key.key}.json" add_to_json( file_path=key_path, data_to_add=[], key=self.key, condition=lambda _: self.key not in _.keys(), ) self.exists = True G.log(f'Saved {self.key_type}_key: "{self.key}"') else: G.log( f'{self.key_type}_key "{self.key}" already exists - Skipped saving' )
def _generate_hyperparameter_key(self):
    """Set :attr:`hyperparameter_key` to a key to describe the experiment's hyperparameters"""
    parameters = dict(
        model_initializer=self.model_initializer,
        model_init_params=self.model_init_params,
        model_extra_params=self.model_extra_params,
        feature_engineer=self.feature_engineer,
        feature_selector=self.feature_selector,
        # FLAG: Should probably add :attr:`target_metric` to key - With option to ignore it?
    )

    self.hyperparameter_key = HyperparameterKeyMaker(parameters, self.cross_experiment_key)
    G.log("Hyperparameter Key: '{}'".format(self.hyperparameter_key))
    G.debug("Raw hyperparameters...")
    G.debug(self.hyperparameter_key.parameters)
def experiment_workflow(self): """Define the actual experiment process, including execution, result saving, and cleanup""" if self.hyperparameter_key.exists is True: _ex = f"{self!r} has already been run" if self.do_raise_repeated is True: self._clean_up() raise RepeatedExperimentError(_ex) G.debug(_ex) G.warn("WARNING: Duplicate experiment!") self._initialize_random_seeds() self.execute() #################### Save Experiment Results #################### recorders = RecorderList(file_blacklist=G.Env.file_blacklist, extra_recorders=G.Env.experiment_recorders) recorders.format_result() G.log(f"Saving results for Experiment: '{self.experiment_id}'") recorders.save_result() self._clean_up()
def find(self): """Execute full result-finding workflow""" self._get_ids() G.debug_( F'Experiments found with matching cross-experiment key and algorithm: {len(self.experiment_ids)}' ) self._get_scored_params() self._filter_by_space() G.debug_( F'Experiments whose hyperparameters fit in the currently defined space: {len(self.hyperparameters_and_scores)}' ) if self.module_name == 'keras': if ('model_init_params', 'compile_params', 'optimizer') in self.hyperparameter_space.get_names(): self._filter_by_guidelines_multi( ('model_init_params', 'compile_params', 'optimizer')) else: self._filter_by_guidelines() else: self._filter_by_guidelines() G.debug_( F'Experiments whose hyperparameters match the current guidelines: {len(self.similar_experiments)}' )
def set_experiment_guidelines(
    self,
    model_initializer,
    model_init_params,
    model_extra_params=None,
    feature_selector=None,
    preprocessing_pipeline=None,
    preprocessing_params=None,
    notes=None,
    do_raise_repeated=True,
):
    """Provide the arguments necessary to instantiate
    :class:`experiments.CrossValidationExperiment`. This method has the same signature as
    :meth:`experiments.BaseExperiment.__init__` except where noted

    Parameters
    ----------
    model_initializer: Class, or functools.partial, or class instance
        The algorithm class being used to initialize a model
    model_init_params: Dict, or object
        The dictionary of arguments given when creating a model instance with
        `model_initializer` via the `__init__` method of :class:`models.Model`. Any kwargs that
        are considered valid by the `__init__` method of `model_initializer` are valid in
        `model_init_params`
    model_extra_params: Dict, or None, default=None
        A dictionary of extra parameters passed to :class:`models.Model`. This is used to
        provide parameters to models' non-initialization methods (like `fit`, `predict`,
        `predict_proba`, etc.), and for neural networks
    feature_selector: List of str, callable, list of booleans, default=None
        The value provided when splitting apart the input data for all provided DataFrames.
        `feature_selector` is provided as the second argument for calls to
        `pandas.DataFrame.loc` in :meth:`BaseExperiment._initial_preprocessing`. If None,
        `feature_selector` is set to all columns in :attr:`train_dataset`, less
        :attr:`target_column`, and :attr:`id_column`
    preprocessing_pipeline: ...
        ... Experimental...
    preprocessing_params: ...
        ... Experimental...
    notes: String, or None, default=None
        Additional information about the Experiment that will be saved with the Experiment's
        description result file. This serves no purpose other than to facilitate saving
        Experiment details in a more readable format
    do_raise_repeated: Boolean, default=True
        If True and this Experiment locates a previous Experiment's results with matching
        Environment and Hyperparameter Keys, a RepeatedExperimentError will be raised. Else, a
        warning will be logged

    Notes
    -----
    The `auto_start` kwarg is not available here because
    :meth:`BaseOptimizationProtocol._execute_experiment` sets it to False in order to check for
    duplicated keys before running the whole Experiment. This is the most notable difference
    between calling :meth:`set_experiment_guidelines` and instantiating
    :class:`experiments.CrossValidationExperiment`"""
    self.model_initializer = model_initializer
    self.model_init_params = identify_algorithm_hyperparameters(self.model_initializer)
    try:
        self.model_init_params.update(model_init_params)
    except TypeError:
        self.model_init_params.update(dict(build_fn=model_init_params))

    self.model_extra_params = model_extra_params
    self.feature_selector = feature_selector
    self.preprocessing_pipeline = preprocessing_pipeline
    self.preprocessing_params = preprocessing_params
    self.notes = notes
    self.do_raise_repeated = do_raise_repeated

    if self.do_raise_repeated is False:
        G.warn_('WARNING: Setting `do_raise_repeated`=False will allow Experiments to be unnecessarily duplicated')

    self.algorithm_name, self.module_name = identify_algorithm(self.model_initializer)
    self._validate_guidelines()

    #################### Deal with Keras ####################
    if self.module_name == 'keras':
        reusable_build_fn, reusable_wrapper_params, dummy_layers, dummy_compile_params = keras_prep_workflow(
            self.model_initializer, self.model_init_params['build_fn'], self.model_extra_params, self.source_script
        )
        self.model_init_params = dict(build_fn=reusable_build_fn)
        self.model_extra_params = reusable_wrapper_params
        self.dummy_layers = dummy_layers
        self.dummy_compile_params = dummy_compile_params
        # FLAG: Deal with capitalization conflicts when comparing similar experiments: `optimizer`='Adam' vs 'adam'

    self.set_dimensions()