def environment_workflow(self):
    """Run every step needed to validate the environment and prepare it for Experiments

    The steps are invoked in a fixed order: custom-parameter update, parameter validation,
    result-path formatting and cross-experiment key generation. The resulting
    :attr:`cross_experiment_key` is then logged"""
    for step_name in (
        "update_custom_environment_params",
        "validate_parameters",
        "format_result_paths",
        "generate_cross_experiment_key",
    ):
        # Look each step up right before calling it, so the steps run strictly one after another
        getattr(self, step_name)()

    key_message = "Cross-Experiment Key: '{!s}'".format(self.cross_experiment_key)
    G.log(key_message)
def on_run_start(self):
    """Log the fold/run label (and the seed, when :attr:`current_seed` is truthy) at run start

    The separator is only inserted when both the label and the seed are present. After logging,
    the parent class's `on_run_start` is invoked"""
    label = format_fold_run(fold=self._fold, run=self._run)
    separator = format(self.log_separator if label != "" and self.current_seed else "")
    seed_text = "Seed: {}".format(self.current_seed) if self.current_seed else ""
    content = "".join(["", label, separator, seed_text])

    # Report the caller of this hook as the origin of the log message
    caller_frame = inspect.currentframe().f_back
    G.log(content, previous_frame=caller_frame, add_time=True)
    super().on_run_start()
def data_scaling(self, which_sets=None):
    """Standard-scale the :attr:`features` columns of the available input datasets

    Scaling only happens when `preprocessing_params["apply_standard_scale"]` is exactly True.
    The scaler is always fitted on :attr:`train_input_data`, and applied to each of the train,
    holdout and test input datasets whose name is listed in :attr:`all_input_sets`

    Parameters
    ----------
    which_sets: Iterable, or None, default=None
        Currently unused by the scaling logic (see TODO below). If falsy, it is replaced by
        :attr:`fit_input_sets`"""
    which_sets = which_sets or self.fit_input_sets

    # TODO: Expand method to include other scaling types by sending string param or callable for apply_scale arg
    apply_scale = self.preprocessing_params.get('apply_standard_scale', False)
    if apply_scale is True:
        scaler = StandardScaler()

        # TODO: Modify fitting process to use 'which_sets' and 'self.fit_input_sets' like 'data_imputation' method
        scaler.fit(self.train_input_data[self.features].values)

        for set_name in ('train_input_data', 'holdout_input_data', 'test_input_data'):
            if set_name not in self.all_input_sets:
                continue
            dataset = getattr(self, set_name)
            dataset[self.features] = scaler.transform(dataset[self.features].values)

    G.log(
        'Completed data_scaling preprocessing. preprocessing_params["apply_standard_scale"]={}'.format(
            self.preprocessing_params.get('apply_standard_scale', False)
        )
    )
def _create_script_backup(self):
    """Back up the script that initialized the Experiment, when that is permitted

    A backup is made only if `G.Env.result_paths["script_backup"]` is not None. If
    :attr:`source_script` does not end with ".py", that entry is first set to None, so the
    backup is skipped

    Raises
    ------
    EnvironmentInactiveError
        If an `AttributeError` occurs while `G.Env` is None
    EnvironmentInvalidError
        If `G.Env` lacks a `result_paths` attribute, or `result_paths` lacks a "script_backup"
        key"""
    #################### Attempt to Copy Source Script if Allowed ####################
    try:
        is_python_script = self.source_script.endswith(".py")
        if not is_python_script:
            G.Env.result_paths["script_backup"] = None

        if G.Env.result_paths["script_backup"] is None:
            G.log("Skipped source backup: '{}'".format(self.source_script), 4)
        else:
            self._source_copy_helper()
            G.log("Created source backup: '{}'".format(self.source_script), 4)
    #################### Exception Handling ####################
    except AttributeError as _ex:
        if G.Env is None:
            raise EnvironmentInactiveError(extra="\n{!s}".format(_ex))
        if not hasattr(G.Env, "result_paths"):
            raise EnvironmentInvalidError(extra=f"G.Env lacks 'result_paths' attr\n{_ex!s}")
        # Any other AttributeError is unrelated to the Environment's state - Let it propagate
        raise
    except KeyError as _ex:
        if "script_backup" not in G.Env.result_paths:
            raise EnvironmentInvalidError(
                extra=f"G.Env.result_paths lacks 'script_backup' key\n{_ex!s}"
            )
        raise
def cross_validation_workflow(self):
    """Execute the cross-validation workflow, consisting of the following tasks:
    1) Create train and validation split indices for all folds,
    2) Group the splits by repetition, and iterate through them, performing
    :meth:`cv_fold_workflow` for each fold.

    The experiment- and repetition-level hooks (`on_experiment_start`, `on_repetition_start`,
    `on_repetition_end`, `on_experiment_end`) are invoked around the loops. Averaging,
    evaluating and formatting of predictions are presumably handled by those hooks - They are
    not performed directly in this method

    Raises
    ------
    ValueError
        If the number of splits produced by :attr:`folds` is not equal to
        `n_repeats * n_splits`, as given by :attr:`cross_validation_params`

    Notes
    -----
    The splits are grouped with plain list slicing, rather than by reshaping a NumPy array.
    Train and validation index arrays generally differ in length, and building an array from
    such ragged sequences raises a ValueError in recent NumPy releases. When the lengths happen
    to be equal, the result is a 3-dimensional array that cannot be reshaped as intended"""
    self.on_experiment_start()

    cv_indices = self.folds.split(self.train_input_data, self.train_target_data.iloc[:, 0])
    n_repeats = self.cross_validation_params.get("n_repeats", 1)
    n_splits = self.cross_validation_params["n_splits"]

    fold_indices = list(cv_indices)
    if len(fold_indices) != n_repeats * n_splits:
        raise ValueError(
            "Expected {} cross-validation splits ({} repeats x {} splits), but received {}".format(
                n_repeats * n_splits, n_repeats, n_splits, len(fold_indices)
            )
        )

    for self._rep in range(n_repeats):
        # Consecutive groups of `n_splits` entries belong to the same repetition
        first = self._rep * n_splits
        repetition_indices = fold_indices[first:first + n_splits]

        self.on_repetition_start()

        for self._fold, (self.train_index, self.validation_index) in enumerate(repetition_indices):
            self.cv_fold_workflow()

        self.on_repetition_end()
    self.on_experiment_end()

    G.log("")
def _create_script_backup(self):
    """Back up the script that initialized the Experiment, when that is permitted

    A backup is made only if `G.Env.result_paths["script_backup"]` is not None. If the first
    copy attempt fails with `FileNotFoundError`, the directory at
    `self.result_paths["script_backup"]` is created, and the copy is attempted once more

    Raises
    ------
    EnvironmentInactiveError
        If an `AttributeError` occurs while `G.Env` is None
    EnvironmentInvalidError
        If `G.Env` lacks a `result_paths` attribute, or `result_paths` lacks a "script_backup"
        key"""
    #################### Attempt to Copy Source Script if Allowed ####################
    try:
        if G.Env.result_paths["script_backup"] is None:
            G.log("Skipped creating backup of file: '{}'".format(self.source_script))
        else:
            try:
                self._source_copy_helper()
            except FileNotFoundError:
                # Backup directory is missing - Create it, then retry the copy
                os.makedirs(self.result_paths["script_backup"], exist_ok=False)
                self._source_copy_helper()
            G.log("Created backup of file: '{}'".format(self.source_script))
    #################### Exception Handling ####################
    except AttributeError as _ex:
        if G.Env is None:
            raise EnvironmentInactiveError(extra="\n{!s}".format(_ex))
        if not hasattr(G.Env, "result_paths"):
            raise EnvironmentInvalidError(extra=f"G.Env lacks 'result_paths' attr\n{_ex!s}")
        # Any other AttributeError is unrelated to the Environment's state - Let it propagate
        raise
    except KeyError as _ex:
        if "script_backup" not in G.Env.result_paths:
            raise EnvironmentInvalidError(
                extra=f"G.Env.result_paths lacks 'script_backup' key\n{_ex!s}"
            )
        raise
def on_experiment_start(self):
    """Log a newline at the start of the experiment, then invoke the parent class's hook"""
    # Report the caller of this hook as the origin of the log message
    caller_frame = inspect.currentframe().f_back
    G.log('\n', previous_frame=caller_frame)
    super().on_experiment_start()
def save_key(self):
    """Write an empty JSON file named after :attr:`key` into :attr:`tested_keys_dir`, unless
    :attr:`exists` is already truthy, in which case saving is skipped. After a successful save,
    :attr:`exists` is set to True"""
    if self.exists:
        G.log(F'{self.key_type}_key "{self.key}" already exists - Skipped saving')
        return

    key_path = F'{self.tested_keys_dir}/{self.key}.json'
    write_json(key_path, {})
    self.exists = True
    G.log(F'Saved {self.key_type}_key: "{self.key}"')
def on_fold_start(self):
    """Log an empty message at the start of a fold, then invoke the parent class's hook"""
    # Report the caller of this hook as the origin of the log message
    caller_frame = inspect.currentframe().f_back
    G.log('', previous_frame=caller_frame)
    super().on_fold_start()
def on_run_end(self):
    """Log the fold/run label, the latest evaluation results and the elapsed time of the most
    recent run, joined by :attr:`log_separator`. Then invoke the parent class's hook"""
    label = format_fold_run(fold=self._fold, run=self._run)
    results = format_evaluation_results(
        self.last_evaluation_results, float_format=self.float_format
    )
    run_seconds = self.stat_aggregates['times']['runs'][-1]
    elapsed = 'Time Elapsed: {}'.format(sec_to_hms(run_seconds, as_str=True))

    message = self.log_separator.join([label, results, elapsed])
    G.log(message, previous_frame=inspect.currentframe().f_back)
    super().on_run_end()
def _validate_environment(self):
    """Claim the active Environment for this task

    Raises
    ------
    EnvironmentInactiveError
        If `G.Env` is None
    EnvironmentInvalidError
        If `G.Env.current_task` is already set"""
    if G.Env is None:
        raise EnvironmentInactiveError('')

    if G.Env.current_task is not None:
        raise EnvironmentInvalidError(
            'An experiment is in progress. It must finish before a new one can be started'
        )

    G.Env.current_task = self
    G.log(F'Validated Environment with key: "{self.cross_experiment_key}"')
def environment_workflow(self):
    """Run every step needed to validate the environment and prepare it for Experiments

    The steps are invoked in a fixed order: custom-parameter update, parameter validation,
    reassignment of :attr:`train_dataset` and :attr:`holdout_dataset` from the result of
    `define_holdout_set`, result-path formatting and cross-experiment key generation. The
    resulting :attr:`cross_experiment_key` is then logged"""
    for step_name in ("update_custom_environment_params", "validate_parameters"):
        getattr(self, step_name)()

    datasets = define_holdout_set(self.train_dataset, self.holdout_dataset, self.target_column)
    self.train_dataset, self.holdout_dataset = datasets

    for step_name in ("format_result_paths", "generate_cross_experiment_key"):
        getattr(self, step_name)()

    key_message = "Cross-Experiment Key: '{!s}'".format(self.cross_experiment_key)
    G.log(key_message)
def cv_fold_workflow(self):
    """Execute workflow for an individual fold, consisting of the following tasks:
    1) Execute overridden :meth:`on_fold_start` tasks,
    2) Perform :meth:`cv_run_workflow` for each run, with the run count taken from
    `experiment_params["runs"]` (default 1),
    3) Execute overridden :meth:`on_fold_end` tasks"""
    self.on_fold_start()
    G.log("Intra-CV preprocessing stage complete", 4)

    n_runs = self.experiment_params.get("runs", 1)
    # The loop variable is stored on the instance, so the current run index is visible elsewhere
    for self._run in range(n_runs):
        self.cv_run_workflow()

    self.on_fold_end()
def _validate_environment(self):
    """Claim the active Environment for this task

    Raises
    ------
    EnvironmentInactiveError
        If `G.Env` is None
    EnvironmentInvalidError
        If `G.Env.current_task` is already set"""
    if G.Env is None:
        raise EnvironmentInactiveError("")

    if G.Env.current_task is not None:
        raise EnvironmentInvalidError("Current experiment must finish before starting another")

    G.Env.current_task = self
    G.log(f"Validated Environment: '{self.cross_experiment_key}'")
def on_repetition_end(self):
    """Log the repetition number, the latest evaluation results and the elapsed time of the
    most recent repetition, preceded by an empty log message. Then invoke the parent class's
    hook"""
    content = 'Repetition {} AVG: '.format(self._rep)
    content += format_evaluation_results(
        self.last_evaluation_results, float_format=self.float_format
    )
    # Only add a separator when the results do not already end with a space
    if not content.endswith(' '):
        content += self.log_separator

    rep_seconds = self.stat_aggregates['times']['reps'][-1]
    content += 'Time Elapsed: {}'.format(sec_to_hms(rep_seconds, as_str=True))

    caller_frame = inspect.currentframe().f_back
    G.log('', previous_frame=caller_frame)
    G.log(content, previous_frame=caller_frame)
    super().on_repetition_end()
def on_experiment_end(self):
    """Log the final evaluation results and the total elapsed time, preceded by an empty log
    message. Then invoke the parent class's hook"""
    results = format_evaluation_results(
        self.last_evaluation_results, float_format=self.float_format
    )
    content = 'FINAL: ' + results
    # Only add a separator when the results do not already end with a space
    if not content.endswith(' '):
        content += self.log_separator

    total_seconds = self.stat_aggregates['times']['total_elapsed']
    content += 'Time Elapsed: {}'.format(sec_to_hms(total_seconds, as_str=True))

    G.log('')
    G.log(content, previous_frame=inspect.currentframe().f_back, add_time=False)
    super().on_experiment_end()
def target_data_transformation(self, which_sets=None):
    """Apply the "target_transformation" callable in :attr:`preprocessing_params` to the target
    datasets

    Parameters
    ----------
    which_sets: Iterable of str, or None, default=None
        Names of the attributes of `self` that hold the datasets to transform. If falsy,
        :attr:`fit_target_sets` is used

    Notes
    -----
    Each named dataset that is not None is replaced by the result of calling the transformation
    on it. If "target_transformation" is missing, or is not callable, no dataset is modified"""
    transformation = self.preprocessing_params.get("target_transformation", None)
    which_sets = which_sets if which_sets else self.fit_target_sets

    for data_key in which_sets:
        data = getattr(self, data_key)

        # Compare with None explicitly, as in `data_imputation`. Evaluating the truth value of
        # the dataset itself raises a ValueError for pandas DataFrames
        if callable(transformation) and data is not None:
            # TODO: Send either "self" or all attributes in self as other input to "transformation"
            # TODO: Force callable "transformation" to have **kwargs, or check for the args it expects and send only those
            setattr(self, data_key, transformation(data))

    G.log("Completed target_data_transformation preprocessing")
def on_experiment_end(self):
    """Log the final evaluation results and the "total_elapsed" time text, preceded by an
    empty log message. Then invoke the parent class's hook"""
    results = format_evaluation(self.last_evaluation_results, float_format=self.float_format)
    content = "FINAL: " + results
    # Only add a separator when the results do not already end with a space
    if not content.endswith(" "):
        content += self.log_separator
    content += self.__elapsed_helper("total_elapsed")

    G.log("")
    G.log(content, previous_frame=inspect.currentframe().f_back, add_time=False)
    super().on_experiment_end()
def on_repetition_end(self):
    """Report the repetition label, the latest evaluation results and the "reps" elapsed text

    The message goes to `G.log` when `G.Env.verbose` is at least 2 and more than one repetition
    is configured in `G.Env.cv_params`. Otherwise, it goes to `G.debug`. The parent class's
    hook is invoked afterwards"""
    content = format_fold_run(rep=self._rep, fold="-", run="-")
    if not content.endswith(" "):
        content += self.log_separator
    content += format_evaluation(self.last_evaluation_results, float_format=self.float_format)
    if not content.endswith(" "):
        content += self.log_separator
    content += self.__elapsed_helper("reps")

    is_loggable = G.Env.verbose >= 2 and G.Env.cv_params.get("n_repeats", 1) > 1
    emit = G.log if is_loggable else G.debug
    emit(content, previous_frame=inspect.currentframe().f_back)
    super().on_repetition_end()
def _generate_hyperparameter_key(self):
    """Build :attr:`hyperparameter_key` from the experiment's model, preprocessing and
    feature-selection attributes, together with :attr:`cross_experiment_key`. The generated key
    is then logged"""
    parameters = {
        "model_initializer": self.model_initializer,
        "model_init_params": self.model_init_params,
        "model_extra_params": self.model_extra_params,
        "preprocessing_pipeline": self.preprocessing_pipeline,
        "preprocessing_params": self.preprocessing_params,
        "feature_selector": self.feature_selector,
        # FLAG: Should probably add :attr:`target_metric` to key - With option to ignore it?
    }

    self.hyperparameter_key = HyperparameterKeyMaker(parameters, self.cross_experiment_key)
    key_message = 'Generated hyperparameter key: {}'.format(self.hyperparameter_key)
    G.log(key_message)
def on_repetition_end(self):
    """Log the repetition number, the latest evaluation results and the elapsed time of the
    most recent repetition, preceded by an empty log message. Then invoke the parent class's
    hook"""
    header = "Repetition {} AVG: ".format(self._rep)
    results = format_evaluation_results(
        self.last_evaluation_results, float_format=self.float_format
    )
    content = header + results
    # Only add a separator when the results do not already end with a space
    if not content.endswith(" "):
        content += self.log_separator

    rep_seconds = self.stat_aggregates["times"]["reps"][-1]
    content += "Time Elapsed: {}".format(sec_to_hms(rep_seconds, as_str=True))

    caller_frame = inspect.currentframe().f_back
    G.log("", previous_frame=caller_frame)
    G.log(content, previous_frame=caller_frame)
    super().on_repetition_end()
def on_run_end(self):
    """Report the rep/fold/run label, the latest evaluation results and the "runs" elapsed text

    The parts are joined by :attr:`log_separator`. The message goes to `G.log` when
    `G.Env.verbose` is at least 3 and `G.Env.runs` exceeds 1. Otherwise, it goes to `G.debug`.
    The parent class's hook is invoked afterwards"""
    label = format_fold_run(rep=self._rep, fold=self._fold, run=self._run)
    results = format_evaluation(self.last_evaluation_results, float_format=self.float_format)
    elapsed = self.__elapsed_helper("runs")
    content = [label, results, elapsed]

    emit = G.log if (G.Env.verbose >= 3 and G.Env.runs > 1) else G.debug
    emit(self.log_separator.join(content), previous_frame=inspect.currentframe().f_back)
    super().on_run_end()
def on_experiment_end(self):
    """Log the final evaluation results and the total elapsed time, preceded by an empty log
    message. Then invoke the parent class's hook"""
    results = format_evaluation_results(
        self.last_evaluation_results, float_format=self.float_format
    )
    content = "FINAL: " + results
    # Only add a separator when the results do not already end with a space
    if not content.endswith(" "):
        content += self.log_separator

    total_seconds = self.stat_aggregates["times"]["total_elapsed"]
    content += "Time Elapsed: {}".format(sec_to_hms(total_seconds, as_str=True))

    G.log("")
    G.log(content, previous_frame=inspect.currentframe().f_back, add_time=False)
    super().on_experiment_end()
def save_key(self):
    """Add an entry for :attr:`key`, with an empty list as its value, to the JSON file named
    after :attr:`cross_experiment_key.key`, unless :attr:`exists` is already truthy

    Raises
    ------
    ValueError
        If :attr:`cross_experiment_key.exists` is False, meaning the cross-experiment key has
        not been saved yet"""
    if self.exists:
        G.log(f'{self.key_type}_key "{self.key}" already exists - Skipped saving', 4)
        return

    if self.cross_experiment_key.exists is False:
        _err = "Cannot save hyperparameter_key: '{}', before cross_experiment_key '{}'"
        raise ValueError(_err.format(self.key, self.cross_experiment_key.key))

    key_path = f"{self.tested_keys_dir}/{self.cross_experiment_key.key}.json"
    # Only add the entry if the file's contents do not already have this key
    add_to_json(
        key_path, [], key=self.key, condition=lambda saved: self.key not in saved.keys()
    )

    self.exists = True
    G.log(f'Saved {self.key_type}_key: "{self.key}"', 4)
def save_key(self):
    """Add an entry for :attr:`key`, with an empty list as its value, to the JSON file named
    after :attr:`cross_experiment_key.key`, unless :attr:`exists` is already truthy

    Raises
    ------
    ValueError
        If :attr:`cross_experiment_key.exists` is False, meaning the cross-experiment key has
        not been saved yet"""
    if self.exists:
        G.log(F'{self.key_type}_key "{self.key}" already exists - Skipped saving')
        return

    if self.cross_experiment_key.exists is False:
        _err = 'Cannot save hyperparameter_key: "{}", before cross_experiment_key "{}" has been saved'
        raise ValueError(_err.format(self.key, self.cross_experiment_key.key))

    key_path = F'{self.tested_keys_dir}/{self.cross_experiment_key.key}.json'
    # Only add the entry if the file's contents do not already have this key
    add_to_json(
        file_path=key_path,
        data_to_add=[],
        key=self.key,
        condition=lambda saved: self.key not in saved.keys(),
    )

    self.exists = True
    G.log(F'Saved {self.key_type}_key: "{self.key}"')
def _generate_hyperparameter_key(self):
    """Build :attr:`hyperparameter_key` from the experiment's model, feature-engineering and
    feature-selection attributes, together with :attr:`cross_experiment_key`. The generated key
    is logged, and its raw `parameters` are sent to the debug log"""
    parameters = {
        "model_initializer": self.model_initializer,
        "model_init_params": self.model_init_params,
        "model_extra_params": self.model_extra_params,
        "feature_engineer": self.feature_engineer,
        "feature_selector": self.feature_selector,
        # FLAG: Should probably add :attr:`target_metric` to key - With option to ignore it?
    }

    self.hyperparameter_key = HyperparameterKeyMaker(parameters, self.cross_experiment_key)
    key_message = "Hyperparameter Key: '{}'".format(self.hyperparameter_key)
    G.log(key_message)
    G.debug("Raw hyperparameters...")
    G.debug(self.hyperparameter_key.parameters)
def on_fold_end(self):
    """Report the rep/fold label, the latest evaluation results and the "folds" elapsed text

    The message goes to `G.log` when `G.Env.verbose` is at least 2 and
    `G.Env.cv_params["n_splits"]` exceeds 1. Otherwise, it goes to `G.debug`. The parent
    class's hook is invoked afterwards"""
    content = format_fold_run(rep=self._rep, fold=self._fold, run="-")
    if not content.endswith(" "):
        content += self.log_separator
    content += format_evaluation(self.last_evaluation_results, float_format=self.float_format)
    if not content.endswith(" "):
        content += self.log_separator
    content += self.__elapsed_helper("folds")

    is_loggable = G.Env.verbose >= 2 and G.Env.cv_params["n_splits"] > 1
    emit = G.log if is_loggable else G.debug
    emit(content, previous_frame=inspect.currentframe().f_back, add_time=False)
    super().on_fold_end()
def data_imputation(self, which_sets=None):
    """Impute the input datasets using `preprocessing_params["imputer"]`

    Parameters
    ----------
    which_sets: Iterable of str, or None, default=None
        Names of the attributes of `self` that hold the datasets to impute. If falsy,
        :attr:`fit_input_sets` is used

    Notes
    -----
    Datasets that are None are skipped. A callable imputer is called with the dataset, and its
    result replaces the dataset. An int or float imputer is passed to the dataset's `fillna`.
    Any other imputer leaves the datasets untouched"""
    imputer = self.preprocessing_params.get('imputer', None)
    which_sets = which_sets or self.fit_input_sets

    for data_key in which_sets:
        data = self.__getattribute__(data_key)
        if data is None:
            continue

        if callable(imputer):  # Apply Function to Impute Data
            # TODO: Send either "self" or all attributes in self as other input to "imputer"
            # TODO: Force callable "imputer" to have **kwargs, or check for the args it expects and send only those
            self.__setattr__(data_key, imputer(data))
        elif isinstance(imputer, (int, float)):  # Fill Null Data With Given Value
            self.__setattr__(data_key, data.fillna(imputer))

    G.log('Completed data_imputation preprocessing')
def on_fold_end(self):
    """Report the rep/fold header, the latest evaluation results and the "folds" elapsed text

    The message goes to `G.log` when `G.Env.verbose` is at least 2 and
    `G.Env.cross_validation_params["n_splits"]` exceeds 1. Otherwise, it goes to `G.debug`.
    The parent class's hook is invoked afterwards"""
    header = "F{}.{} AVG: ".format(self._rep, self._fold)  # TODO: Prepend rep count
    results = format_evaluation(self.last_evaluation_results, float_format=self.float_format)
    content = header + results
    # Only add a separator when the results do not already end with a space
    if not content.endswith(" "):
        content += self.log_separator
    content += self.__elapsed_helper("folds")

    is_loggable = G.Env.verbose >= 2 and G.Env.cross_validation_params["n_splits"] > 1
    emit = G.log if is_loggable else G.debug
    emit(content, previous_frame=inspect.currentframe().f_back, add_time=False)
    super().on_fold_end()
def on_run_start(self):
    """Report the rep/fold/run label (and the seed, when :attr:`current_seed` is truthy)

    The separator is only inserted when both the label and the seed are present. The message
    goes to `G.log` when `G.Env.verbose` is at least 4 and `G.Env.runs` exceeds 1. Otherwise,
    it goes to `G.debug`. The parent class's hook is invoked afterwards"""
    label = format_fold_run(rep=self._rep, fold=self._fold, run=self._run)
    separator = format(self.log_separator if label != "" and self.current_seed else "")
    seed_text = "Seed: {}".format(self.current_seed) if self.current_seed else ""
    content = label + separator + seed_text

    emit = G.log if (G.Env.verbose >= 4 and G.Env.runs > 1) else G.debug
    emit(content, previous_frame=inspect.currentframe().f_back, add_time=True)
    super().on_run_start()