Example #1
    def data_imputation(self, which_sets=None):
        imputer = self.preprocessing_params.get('imputer', None)
        which_sets = which_sets if which_sets else self.fit_input_sets

        for data_key in which_sets:
            data = self.__getattribute__(data_key)

            if data is not None:
                if callable(imputer):  # Apply Function to Impute Data
                    # TODO: Send either "self" or all attributes in self as other input to "imputer"
                    # TODO: Force callable "imputer" to have **kwargs, or check for the args it expects and send only those
                    self.__setattr__(data_key, imputer(data))
                elif isinstance(imputer, (int, float)):  # Fill Null Data With Given Value
                    self.__setattr__(data_key, data.fillna(imputer))

        G.log('Completed data_imputation preprocessing')
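
The dispatch above supports two kinds of imputer: a callable applied to the whole DataFrame, or a numeric value passed to `fillna`. A minimal standalone sketch of that behavior (the `impute` helper and the sample DataFrame are illustrative, not part of the library):

import pandas as pd

def impute(data, imputer):
    """Apply a callable imputer, or fill nulls with a numeric value (mirrors the dispatch above)."""
    if callable(imputer):
        return imputer(data)                     # custom imputation function
    elif isinstance(imputer, (int, float)):
        return data.fillna(imputer)              # constant fill value
    return data                                  # no imputer given: leave data unchanged

df = pd.DataFrame({"a": [1.0, None, 3.0]})
print(impute(df, 0))                             # null replaced with 0
print(impute(df, lambda d: d.fillna(d.mean())))  # null replaced with the column mean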
Example #2
    def fit(self):
        """Train model according to :attr:`extra_params['fit']` (if appropriate) on training data"""
        try:
            self.model_history = self.model.fit(self.train_input, self.train_target)
        except Exception as _ex:
            G.warn(f"KerasModel.fit() failed with Exception: {_ex}\nAttempting standard fit method")
            super().fit()
        finally:
            #################### Record Epochs Elapsed if Model History Has 'epoch' Attribute ####################
            with suppress(AttributeError):
                # self.epochs_elapsed = len(self.model.epoch)
                self.epochs_elapsed = len(self.model_history.epoch)

            #################### Load Model Checkpoint if Possible ####################
            for callback in self.extra_params.get("callbacks", []):
                if callback.__class__.__name__ == "ModelCheckpoint":
                    self.model.model.load_weights(callback.filepath)
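
The checkpoint-reload step above assumes a Keras `ModelCheckpoint` callback was supplied through `extra_params["callbacks"]`. A rough sketch of the underlying Keras pattern it relies on (toy model and random data; not the library's wrapper):

import numpy as np
from tensorflow import keras

x_train = np.random.rand(100, 10)
y_train = np.random.rand(100)

model = keras.Sequential([keras.layers.Dense(1, input_shape=(10,))])
model.compile(optimizer="adam", loss="mse")

checkpoint = keras.callbacks.ModelCheckpoint(
    "best_weights.h5", save_best_only=True, save_weights_only=True)
history = model.fit(x_train, y_train, validation_split=0.2,
                    epochs=5, callbacks=[checkpoint], verbose=0)

print(len(history.epoch))                # epochs actually elapsed
model.load_weights(checkpoint.filepath)  # restore the best checkpointed weights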
Example #3
    def on_fold_end(self):
        content = format_fold_run(rep=self._rep, fold=self._fold, run="-")
        content += self.log_separator if not content.endswith(" ") else ""
        content += format_evaluation(self.last_evaluation_results,
                                     float_format=self.float_format)
        content += self.log_separator if not content.endswith(" ") else ""
        content += self.__elapsed_helper("folds")

        if G.Env.verbose >= 2 and G.Env.cv_params["n_splits"] > 1:
            G.log(content,
                  previous_frame=inspect.currentframe().f_back,
                  add_time=False)
        else:
            G.debug(content,
                    previous_frame=inspect.currentframe().f_back,
                    add_time=False)
        super().on_fold_end()
    def experiment_workflow(self):
        """Define the actual experiment process, including execution, result saving, and cleanup"""
        if self.hyperparameter_key.exists is True:
            _ex = F'{self!r} has already been run'
            if self.do_raise_repeated is True:
                self._clean_up()
                raise RepeatedExperimentError(_ex)
            G.warn(_ex)

        self._initialize_random_seeds()
        self._initial_preprocessing()
        self.execute()

        recorders = RecorderList(file_blacklist=G.Env.file_blacklist)
        recorders.format_result()
        G.log(F'Saving results for Experiment: "{self.experiment_id}"')
        recorders.save_result()
        self._clean_up()
    def on_run_start(self):
        content = format_fold_run(rep=self._rep,
                                  fold=self._fold,
                                  run=self._run)
        content += format(self.log_separator if content != "" and self.current_seed else "")
        content += "Seed: {}".format(self.current_seed) if self.current_seed else ""

        if G.Env.verbose >= 4 and G.Env.runs > 1:
            G.log(content,
                  previous_frame=inspect.currentframe().f_back,
                  add_time=True)
        else:
            G.debug(content,
                    previous_frame=inspect.currentframe().f_back,
                    add_time=True)
        super().on_run_start()
    def _optimization_loop(self, iteration=0):
        """Perform Experiment execution loop while `iteration` < `iterations`. At each iteration, an
        Experiment will be executed, its results will be logged, and it will be compared to the
        current best experiment

        Parameters
        ----------
        iteration: Int, default=0
            The current iteration in the optimization loop"""
        self.logger.print_optimization_header()

        while iteration < self.iterations:
            try:
                self._execute_experiment()
            except RepeatedExperimentError:
                # G.debug_(F'Skipping repeated Experiment: {_ex!s}\n')
                if len(self.similar_experiments) + len(self.tested_keys) >= self.search_space_size:
                    G.log_(f"Hyperparameter search space has been exhausted")
                    break
                self.skipped_iterations += 1
                continue
            except StopIteration:
                if len(self.similar_experiments) + len(self.tested_keys) >= self.search_space_size:
                    G.log_(f"Hyperparameter search space has been exhausted")
                    break
                # G.debug_(f'Re-initializing hyperparameter grid after testing {len(self.tested_keys)} keys')
                self._set_hyperparameter_space()
                continue

            self.logger.print_result(
                self.current_hyperparameters_list,
                self.current_score,
                experiment_id=self.current_experiment.experiment_id,
            )

            if (
                (self.best_experiment is None)  # First evaluation
                or (self.do_maximize and (self.best_score < self.current_score))  # New best max
                or (not self.do_maximize and (self.best_score > self.current_score))  # New best min
            ):
                self.best_experiment = self.current_experiment.experiment_id
                self.best_score = self.current_score

            iteration += 1
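
The best-experiment bookkeeping at the end of the loop reduces to one comparison whose direction depends on `do_maximize`. A standalone sketch of that rule (the function name is illustrative):

def is_new_best(current_score, best_score, do_maximize):
    """Return True if `current_score` should replace `best_score`."""
    if best_score is None:                  # first evaluation always wins
        return True
    if do_maximize:
        return current_score > best_score   # e.g. accuracy, AUC
    return current_score < best_score       # e.g. log loss, RMSE

assert is_new_best(0.91, None, do_maximize=True)
assert is_new_best(0.91, 0.88, do_maximize=True)
assert is_new_best(0.30, 0.35, do_maximize=False)
assert not is_new_best(0.40, 0.35, do_maximize=False)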
    def _update_model_params(self):
        """Update random state of :attr:`model_init_params` according to :attr:`current_seed`"""
        # TODO: Add this to some workflow in Experiment class. For now it is never used, unless the subclass decides to...
        # `model_init_params` initialized to all algorithm hyperparameters - Works even if 'random_state' not explicitly given
        try:
            if 'random_state' in self.model_init_params:
                self.model_init_params['random_state'] = self.current_seed
            elif 'seed' in self.model_init_params:
                self.model_init_params['seed'] = self.current_seed
            else:
                G.log('Model has no random_state/seed parameter to update')
                # FLAG: HIGH PRIORITY BELOW
                # TODO: BELOW IS NOT THE CASE IF MODEL IS NN - SETTING THE GLOBAL RANDOM SEED DOES SOMETHING
                # TODO: If this is logged, there is no reason to execute multiple-run-averaging, so don't
                # TODO: ... Either 1) Set `runs` = 1 (this would mess with the environment key), or...
                # TODO: ... 2) Set the results of all subsequent runs to the results of the first run (this could be difficult)
                # FLAG: HIGH PRIORITY ABOVE
        except Exception as _ex:
            G.log("Failed to update model's random_state: {}".format(repr(_ex)))
Example #8
    def _initial_preprocessing(self):
        """Perform preprocessing steps prior to executing fitting protocol (usually
        cross-validation), consisting of: 1) Split train/holdout data into respective train/holdout
        input and target data attributes, 2) Feature selection on input data sets, 3) Set target
        datasets to target_column contents, 4) Initialize PreprocessingPipeline to perform core
        preprocessing, 5) Set datasets to their (modified) counterparts in PreprocessingPipeline,
        6) Log whether datasets changed"""
        #################### Preprocessing ####################
        # preprocessor = PreprocessingPipelineMixIn(
        #     pipeline=[], preprocessing_params=dict(apply_standard_scale=True), features=self.features,
        #     target_column=self.target_column, train_input_data=self.train_input_data,
        #     train_target_data=self.train_target_data, holdout_input_data=self.holdout_input_data,
        #     holdout_target_data=self.holdout_target_data, test_input_data=self.test_input_data,
        #     fitting_guide=None, fail_gracefully=False, preprocessing_stage='infer'
        # )
        #
        # # TODO: Switch from below direct calls to preprocessor.execute_pipeline() call
        # # TODO: After calling execute_pipeline(), set data attributes to their counterparts in preprocessor class
        # preprocessor.data_imputation()
        # preprocessor.target_data_transformation()
        # preprocessor.data_scaling()
        #
        # for dataset_name in preprocessor.all_input_sets + preprocessor.all_target_sets:
        #     old_val, new_val = getattr(self, dataset_name), getattr(preprocessor, dataset_name)
        #     G.log('Dataset: "{}" {} updated'.format(dataset_name, 'was not' if old_val.equals(new_val) else 'was'))
        #     setattr(self, dataset_name, new_val)

        self.train_input_data = self.train_dataset.copy().loc[:, self.feature_selector]
        self.train_target_data = self.train_dataset.copy().loc[:, self.target_column]

        if isinstance(self.holdout_dataset, pd.DataFrame):
            self.holdout_input_data = self.holdout_dataset.copy().loc[:, self.feature_selector]
            self.holdout_target_data = self.holdout_dataset.copy().loc[:, self.target_column]

        if isinstance(self.test_dataset, pd.DataFrame):
            self.test_input_data = self.test_dataset.copy().loc[:, self.feature_selector]

        G.log("Initial preprocessing stage complete", 4)
    def save_key(self):
        """Create an entry in the dict contained in the file at :attr:`cross_experiment_key.key`,
        whose key is :attr:`key`, and whose value is an empty list if :attr:`exists` is False"""
        if not self.exists:
            if self.cross_experiment_key.exists is False:
                _err = "Cannot save hyperparameter_key: '{}', before cross_experiment_key '{}'"
                raise ValueError(
                    _err.format(self.key, self.cross_experiment_key.key))

            key_path = f"{self.tested_keys_dir}/{self.cross_experiment_key.key}.json"
            add_to_json(key_path, [],
                        key=self.key,
                        condition=lambda _: self.key not in _.keys())

            self.exists = True
            G.log(f'Saved {self.key_type}_key: "{self.key}"', 4)
        else:
            G.log(
                f'{self.key_type}_key "{self.key}" already exists - Skipped saving',
                4)
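
`add_to_json` is a helper from the library's utilities; the snippet only shows its call signature. A rough standalone approximation of the conditional-update pattern it implies - add an empty list under a new key only if that key is absent - might look like the following (hypothetical helper, not the library's implementation):

import json

def add_key_if_absent(file_path, key):
    """Add `key` -> [] to the JSON dict stored at `file_path` if `key` is not already present.

    Assumes the file already exists and contains a JSON object."""
    with open(file_path, "r") as f:
        content = json.load(f)
    if key not in content:
        content[key] = []
        with open(file_path, "w") as f:
            json.dump(content, f)

# Hypothetical usage, mirroring the call above:
# add_key_if_absent(f"{tested_keys_dir}/{cross_experiment_key}.json", hyperparameter_key)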
Example #10
    def save_result(self):
        """Execute :meth:`save_result` for all classes in :attr:`recorders`

        Notes
        -----
        When iterating through :attr:`recorders` and calling :meth:`save_result`, a check is performed for `exit_code`. Child
        classes of :class:`BaseRecorder` are NOT expected to explicitly return a value from their :meth:`save_result`. However, if
        a value is returned and `exit_code` == 'break', the result-saving loop will be broken, and no further results will be
        saved. In practice, this is only done for :meth:`DescriptionRecorder.save_result`, which can prevent any other result
        files from being saved if :func:`DescriptionRecorder.do_full_save` returns False when given the formatted
        :attr:`DescriptionRecorder.result`. This can be useful when there are storage constraints, because it ensures that
        essential data - including keys and the results of the experiment - are saved (to ensure the experiment is not duplicated,
        and to provide some utility to Hyperparameter Optimization algorithms), while extra results like Predictions are not saved."""
        for recorder in self.recorders:
            G.log(F'Saving result file for "{type(recorder).__name__}"')
            exit_code = recorder.save_result()

            if exit_code == 'break':
                break
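
The Notes above describe a small cooperative protocol: recorders normally return nothing, but a 'break' return value stops the saving loop so only the essential files get written. A condensed sketch of that protocol (class names are illustrative stand-ins, not the library's recorders):

class DescriptionLikeRecorder:
    """Stand-in for the essential recorder that may veto further saving."""
    def save_result(self):
        print("saving description / keys")
        do_full_save = False          # e.g. a user-supplied do_full_save(result) returned False
        if not do_full_save:
            return "break"            # signal: skip all remaining recorders

class PredictionLikeRecorder:
    """Stand-in for a non-essential recorder (e.g. predictions)."""
    def save_result(self):
        print("saving predictions")   # never reached when the loop broke earlier

for recorder in [DescriptionLikeRecorder(), PredictionLikeRecorder()]:
    exit_code = recorder.save_result()
    if exit_code == "break":
        break                         # essential data is saved; extras are skipped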
Example #11
    def save_result(self):
        """Save the Experiment description as a .json file, named after :attr:`experiment_id`. If
        :attr:`do_full_save` is a callable and returns False when given the description object, the
        result recording loop will be broken, and the remaining result files will not be saved

        Returns
        -------
        'break'
            This string will be returned if :attr:`do_full_save` is a callable and returns False
            when given the description object. This is the signal for
            :class:`recorders.RecorderList` to stop recording result files"""
        try:
            write_json(f"{self.result_path}/{self.experiment_id}.json", self.result, do_clear=False)
        except FileNotFoundError:
            make_dirs(self.result_path, exist_ok=False)
            write_json(f"{self.result_path}/{self.experiment_id}.json", self.result, do_clear=False)

        if (self.do_full_save is not None) and (not self.do_full_save(self.result)):
            G.warn("Breaking result-saving loop early! Remaining result files will not be saved")
            return "break"
    def cross_validation_workflow(self):
        """Execute workflow for cross-validation process, consisting of the following tasks: 1) Create train and validation split
        indices for all folds, 2) Iterate through folds, performing cv_fold_workflow for each, 3) Average accumulated predictions
        over fold splits, 4) Evaluate final predictions, 5) Format final predictions to prepare for saving"""
        self.on_experiment_start()

        cv_indices = self.folds.split(self.train_input_data, self.train_target_data.iloc[:, 0])
        new_shape = (self.cross_validation_params.get('n_repeats', 1), self.cross_validation_params['n_splits'], 2)
        reshaped_indices = np.reshape(np.array(list(cv_indices)), new_shape)

        for self._rep, repetition_indices in enumerate(reshaped_indices.tolist()):
            self.on_repetition_start()

            for self._fold, (self.train_index, self.validation_index) in enumerate(repetition_indices):
                self.cv_fold_workflow()

            self.on_repetition_end()
        self.on_experiment_end()

        G.log('')
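
The reshape above converts the flat sequence of `(train_index, validation_index)` pairs produced by a scikit-learn splitter into a `(n_repeats, n_splits, 2)` structure that the nested repetition/fold loops can unpack. A self-contained sketch with synthetic data (parameter values are illustrative; `dtype=object` is added for newer NumPy versions):

import numpy as np
from sklearn.model_selection import RepeatedKFold

cv_params = {"n_splits": 3, "n_repeats": 2, "random_state": 32}
folds = RepeatedKFold(**cv_params)

X = np.random.rand(12, 4)
y = np.random.randint(0, 2, 12)

cv_indices = folds.split(X, y)  # yields n_repeats * n_splits (train_index, validation_index) pairs
reshaped = np.reshape(np.array(list(cv_indices), dtype=object),
                      (cv_params.get("n_repeats", 1), cv_params["n_splits"], 2))

for rep, repetition_indices in enumerate(reshaped.tolist()):
    for fold, (train_index, validation_index) in enumerate(repetition_indices):
        print(rep, fold, len(train_index), len(validation_index))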
    def _initial_preprocessing(self):
        """Perform preprocessing steps prior to executing fitting protocol (usually
        cross-validation), consisting of: 1) Split train/holdout data into respective train/holdout
        input and target data attributes, 2) Execute `feature_engineer` to perform "pre_cv"-stage
        preprocessing, 3) Set datasets to their (modified) counterparts in `feature_engineer`"""
        self.train_input_data = self.train_dataset.copy().loc[:, self.feature_selector]
        self.train_target_data = self.train_dataset.copy().loc[:, self.target_column]

        if isinstance(self.holdout_dataset, pd.DataFrame):
            self.holdout_input_data = self.holdout_dataset.copy().loc[:, self.feature_selector]
            self.holdout_target_data = self.holdout_dataset.copy().loc[:, self.target_column]

        if isinstance(self.test_dataset, pd.DataFrame):
            self.test_input_data = self.test_dataset.copy().loc[:, self.feature_selector]

        if self.feature_engineer and callable(self.feature_engineer):
            self.feature_engineer(
                "pre_cv",
                train_inputs=self.train_input_data,
                train_targets=self.train_target_data,
                holdout_inputs=self.holdout_input_data,
                holdout_targets=self.holdout_target_data,
                test_inputs=self.test_input_data,
            )
            self.train_input_data = self.feature_engineer.datasets["train_inputs"]
            self.train_target_data = self.feature_engineer.datasets["train_targets"]
            self.holdout_input_data = self.feature_engineer.datasets["holdout_inputs"]
            self.holdout_target_data = self.feature_engineer.datasets["holdout_targets"]
            self.test_input_data = self.feature_engineer.datasets["test_inputs"]

        G.log("Initial preprocessing stage complete", 4)
Example #14
    def go(self):
        """Begin hyperparameter optimization process after experiment guidelines have been set and
        search dimensions are in place. This process includes the following: setting the
        hyperparameter space; locating similar experiments to be used as learning material for
        :class:`SKOptimizationProtocol` subclasses; and executing :meth:`_optimization_loop`, which
        actually sets off the Experiment execution process"""
        if self.model_initializer is None:
            raise ValueError("Experiment guidelines must be set before starting optimization")

        _reporter_params = dict(dict(do_maximize=self.do_maximize), **self.reporter_parameters)
        self.logger = OptimizationReporter([_.name for _ in self.dimensions], **_reporter_params)

        self.tested_keys = []
        self._set_hyperparameter_space()
        self._find_similar_experiments()

        loop_start_time = datetime.now()
        self._optimization_loop()
        loop_end_time = datetime.now()
        G.log_(f"Optimization loop completed in {loop_end_time - loop_start_time}")
        G.log_(f'Best score was {self.best_score} from Experiment "{self.best_experiment}"')
Example #15
    def find(self):
        """Execute full result-finding workflow"""
        self._get_ids()
        G.debug_(
            f"Experiments matching cross-experiment key/algorithm: {len(self.experiment_ids)}"
        )
        self._get_scored_params()
        self._filter_by_space()
        G.debug_(
            f"Experiments fitting in the given space: {len(self.hyperparameters_and_scores)}"
        )

        if self.module_name == "keras":
            multi_targets = [("model_init_params", "compile_params",
                              "optimizer")]
            if multi_targets[0] in self.space.names():
                self._filter_by_guidelines_multi(multi_targets[0])
            else:
                self._filter_by_guidelines()
        else:
            self._filter_by_guidelines()

        #################### Post-Process Similar Experiments ####################
        self._reinitialize_similar_experiments()
        G.debug_(
            f"Experiments matching current guidelines: {len(self.similar_experiments)}"
        )
    def _get_current_hyperparameters(self):
        """Ask :attr:`optimizer` for the upcoming set of hyperparameters that should be searched,
        then format them to be used in the next Experiment

        Returns
        -------
        current_hyperparameters: Dict
            The next set of hyperparameters that will be searched"""
        _current_hyperparameters = self.optimizer.ask()

        if _current_hyperparameters == self.current_hyperparameters_list:
            new_parameters = self.space.rvs(random_state=None)[0]
            G.debug_("REPEATED  asked={}  new={}".format(_current_hyperparameters, new_parameters))
            _current_hyperparameters = new_parameters

        self.current_hyperparameters_list = _current_hyperparameters

        current_hyperparameters = dict(
            zip(self.space.names(use_location=False), self.current_hyperparameters_list)
        )

        return current_hyperparameters
Example #17
    def on_fold_end(self):
        # G.log('AggregatorEpochsElapsed.on_fold_end()')

        rep_key, fold_key = 'rep_{}'.format(self._rep), 'fold_{}'.format(self._fold)

        #################### Simple Average of Fold's Runs ####################
        try:
            self.stat_aggregates['epochs_elapsed'][fold_key]['simple_average'] = np.average(
                self.stat_aggregates['epochs_elapsed'][fold_key]['run_values']
            )
        except KeyError:
            # self.stat_aggregates does not have an 'epochs_elapsed' key - epochs were never recorded in the first place
            pass
        except TypeError:
            G.warn('\n'.join([
                'TypeError encountered when averaging stat_aggregates[{}][{}]:'.format('epochs_elapsed', fold_key),
                '\tValues: {}'.format(self.stat_aggregates['epochs_elapsed'][fold_key]['run_values']),
                '\tTypes: {}'.format([type(_) for _ in self.stat_aggregates['epochs_elapsed'][fold_key]['run_values']]),
                'If the above values are numbers and you want them averaged, fix me ASAP! If not, ignore me'
            ]))

        super().on_fold_end()
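
The aggregation above averages each fold's per-run epoch counts inside a nested dict, guarding with `KeyError` for experiments where epochs were never recorded. A tiny sketch of that structure (the values are made up; the dict layout mirrors the snippet):

import numpy as np

stat_aggregates = {
    "epochs_elapsed": {
        "fold_0": {"run_values": [12, 15, 14]},  # epochs elapsed on each run of fold 0
    }
}

fold_key = "fold_0"
try:
    runs = stat_aggregates["epochs_elapsed"][fold_key]["run_values"]
    stat_aggregates["epochs_elapsed"][fold_key]["simple_average"] = np.average(runs)
except KeyError:
    pass  # epochs were never recorded for this experiment

print(stat_aggregates["epochs_elapsed"][fold_key]["simple_average"])  # 13.666...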
    def on_exp_start(self):
        """Prepare data prior to executing fitting protocol (cross-validation), by 1) Initializing
        formal :mod:`~hyperparameter_hunter.data.datasets` attributes, 2) Invoking
        `feature_engineer` to perform "pre_cv"-stage preprocessing, and 3) Updating datasets to
        include their (transformed) counterparts in `feature_engineer`"""
        #################### Build Datasets ####################
        data_kwargs = dict(feature_selector=self.feature_selector,
                           target_column=self.target_column)
        self.data_train = TrainDataset(self.train_dataset,
                                       require_data=True,
                                       **data_kwargs)
        # TODO: Might be better to initialize `data_oof` with same data as `data_train`
        self.data_oof = OOFDataset(None, **data_kwargs)
        self.data_holdout = HoldoutDataset(self.holdout_dataset, **data_kwargs)
        self.data_test = TestDataset(self.test_dataset,
                                     feature_selector=self.feature_selector)

        #################### Perform Pre-CV Feature Engineering ####################
        self.feature_engineer(
            "pre_cv",
            train_inputs=deepcopy(self.data_train.input.d),
            train_targets=deepcopy(self.data_train.target.d),
            holdout_inputs=deepcopy(self.data_holdout.input.d),
            holdout_targets=deepcopy(self.data_holdout.target.d),
            test_inputs=deepcopy(self.data_test.input.d),
        )
        self.data_train.input.T.d = self.feature_engineer.datasets["train_inputs"]
        self.data_train.target.T.d = self.feature_engineer.datasets["train_targets"]
        self.data_holdout.input.T.d = self.feature_engineer.datasets["holdout_inputs"]
        self.data_holdout.target.T.d = self.feature_engineer.datasets["holdout_targets"]
        self.data_test.input.T.d = self.feature_engineer.datasets["test_inputs"]

        G.log("Initial preprocessing stage complete", 4)
        super().on_exp_start()
    def _optimization_loop(self, iteration=0):
        """Perform Experiment execution loop while `iteration` < `iterations`. At each iteration, an Experiment will be executed,
        its results will be logged, and it will be compared to the current best experiment

        Parameters
        ----------
        iteration: Int, default=0
            The current iteration in the optimization loop"""
        self.logger.print_optimization_header()

        while iteration < self.iterations:
            try:
                self._execute_experiment()
            except RepeatedExperimentError:
                # G.debug_(F'Skipping repeated Experiment: {_ex!s}\n')
                self.skipped_iterations += 1
                continue
            except StopIteration:
                if len(self.tested_keys) >= self.search_space_size:
                    G.log_(
                        F'Hyperparameter search space has been exhausted after testing {len(self.tested_keys)} keys'
                    )
                    break
                # G.debug_(F'Re-initializing hyperparameter grid after testing {len(self.tested_keys)} keys')
                self._set_hyperparameter_space()
                continue

            # TODO: :attr:`current_hyperparameters_list` only exists in Informed Protocols
            self.logger.print_result(
                self.current_hyperparameters_list,
                self.current_score,
                experiment_id=self.current_experiment.experiment_id)

            if (self.best_experiment is None) or (self.current_score > self.best_score):
                self.best_experiment = self.current_experiment.experiment_id
                self.best_score = self.current_score

            iteration += 1
Example #20
    def cross_validation_workflow(self):
        """Execute workflow for cross-validation process, consisting of the following tasks:
        1) Create train and validation split indices for all folds, 2) Iterate through folds,
        performing `cv_fold_workflow` for each, 3) Average accumulated predictions over fold
        splits, 4) Evaluate final predictions, 5) Format final predictions to prepare for saving"""
        self.on_experiment_start()

        reshaped_indices = get_cv_indices(self.folds, self.cv_params,
                                          self.train_input_data,
                                          self.train_target_data.iloc[:, 0])

        for self._rep, rep_indices in enumerate(reshaped_indices):
            self.on_repetition_start()

            for self._fold, (self.train_index, self.validation_index) in enumerate(rep_indices):
                self.cv_fold_workflow()

            self.on_repetition_end()
        self.on_experiment_end()

        G.log("")
    def cv_fold_workflow(self):
        """Execute workflow for individual fold, consisting of the following tasks: Execute
        overridden :meth:`on_fold_start` tasks, 2) Perform cv_run_workflow for each run, 3) Execute
        overridden :meth:`on_fold_end` tasks"""
        self.on_fold_start()

        if self.feature_engineer and callable(self.feature_engineer):
            self.feature_engineer(
                "intra_cv",
                train_inputs=self.fold_train_input,
                train_targets=self.fold_train_target,
                validation_inputs=self.fold_validation_input,
                validation_targets=self.fold_validation_target,
                holdout_inputs=self.fold_holdout_input,
                holdout_targets=self.fold_holdout_target,
                test_inputs=self.fold_test_input,
            )
            self.fold_train_input = self.feature_engineer.datasets["train_inputs"]
            self.fold_train_target = self.feature_engineer.datasets["train_targets"]
            self.fold_validation_input = self.feature_engineer.datasets["validation_inputs"]
            self.fold_validation_target = self.feature_engineer.datasets["validation_targets"]
            self.fold_holdout_input = self.feature_engineer.datasets["holdout_inputs"]
            self.fold_holdout_target = self.feature_engineer.datasets["holdout_targets"]
            self.fold_test_input = self.feature_engineer.datasets["test_inputs"]

        G.log("Intra-CV preprocessing stage complete", 4)

        for self._run in range(self.experiment_params.get("runs", 1)):
            self.cv_run_workflow()
        self.on_fold_end()
Example #22
    def update_custom_environment_params(self):
        """Try to update null parameters from environment_params_path, or DEFAULT_PARAMS"""
        allowed_parameter_keys = [
            k for k, v in signature(Environment).parameters.items()
            if v.kind == v.KEYWORD_ONLY
        ]
        user_defaults = {}

        try:
            user_defaults = read_json(self.environment_params_path)
        except (TypeError, OSError):
            # If `environment_params_path=None`, no error raised - `user_defaults` continues as {}
            if self.environment_params_path is not None:
                raise

        if not isinstance(user_defaults, dict):
            raise TypeError("environment_params_path must contain a dict, not {}".format(user_defaults))

        #################### Check user_defaults ####################
        for k, v in user_defaults.items():
            if k not in allowed_parameter_keys:
                G.warn(
                    f"Invalid key ({k}) in user Environment parameters: {self.environment_params_path}"
                )
            elif getattr(self, k) is None:
                setattr(self, k, v)
                G.debug(
                    f"Environment.`{k}` set to user default: '{self.environment_params_path}'"
                )

        #################### Check Module Default Environment Arguments ####################
        for k in allowed_parameter_keys:
            if getattr(self, k) is None:
                setattr(self, k, self.DEFAULT_PARAMS.get(k, None))
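
The `allowed_parameter_keys` filter above uses `inspect.signature` to collect the keyword-only parameters of `Environment.__init__`. A self-contained sketch of that introspection, using a toy class in place of the real `Environment` (parameter names are illustrative):

from inspect import signature

class Environment:  # toy stand-in for the real Environment class
    def __init__(self, train_dataset, *, results_path=None, metrics=None, cv_params=None):
        pass

allowed_parameter_keys = [
    k for k, v in signature(Environment).parameters.items()
    if v.kind == v.KEYWORD_ONLY
]
print(allowed_parameter_keys)  # ['results_path', 'metrics', 'cv_params']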
Example #23
    def _create_script_backup(self):
        """Create and save a copy of the script that initialized the Experiment if allowed to, and
        if :attr:`source_script` ends with a ".py" extension"""
        #################### Attempt to Copy Source Script if Allowed ####################
        try:
            if not self.source_script.endswith(".py"):
                G.Env.result_paths["script_backup"] = None

            if G.Env.result_paths["script_backup"] is not None:
                try:
                    self._source_copy_helper()
                except FileNotFoundError:
                    make_dirs(self.result_paths["script_backup"],
                              exist_ok=False)
                    self._source_copy_helper()
                G.log(
                    "Created source backup:  '{}'".format(self.source_script),
                    4)
            else:
                G.log(
                    "Skipped source backup:  '{}'".format(self.source_script),
                    4)
        #################### Exception Handling ####################
        except AttributeError as _ex:
            if G.Env is None:
                raise EnvironmentInactiveError(extra="\n{!s}".format(_ex))
            if not hasattr(G.Env, "result_paths"):
                raise EnvironmentInvalidError(
                    extra=f"G.Env lacks 'result_paths' attr\n{_ex!s}")
            raise
        except KeyError as _ex:
            if "script_backup" not in G.Env.result_paths:
                raise EnvironmentInvalidError(extra=f"G.Env.result_paths lacks 'script_backup' key\n{_ex!s}")
            raise
    def _create_script_backup(self):
        """Create and save a copy of the script that initialized the Experiment"""
        #################### Attempt to Copy Source Script if Allowed ####################
        try:
            if G.Env.result_paths['script_backup'] is not None:
                try:
                    shutil.copyfile(self.source_script, F'{self.result_paths["script_backup"]}/{self.experiment_id}.py')
                except FileNotFoundError:
                    os.makedirs(self.result_paths["script_backup"], exist_ok=False)
                    shutil.copyfile(self.source_script, F'{self.result_paths["script_backup"]}/{self.experiment_id}.py')
                G.log('Created backup of file: "{}"'.format(self.source_script))
            else:
                G.log('Skipped creating backup of file: "{}"'.format(self.source_script))
        #################### Exception Handling ####################
        except AttributeError as _ex:
            if G.Env is None:
                raise EnvironmentInactiveError(extra='\n{!s}'.format(_ex))
            if not hasattr(G.Env, 'result_paths'):
                raise EnvironmentInvalidError(extra='G.Env lacks "result_paths" attribute\n{!s}'.format(_ex))
            raise
        except KeyError as _ex:
            if 'script_backup' not in G.Env.result_paths:
                raise EnvironmentInvalidError(extra='G.Env.result_paths lacks "script_backup" key\n{!s}'.format(_ex))
            raise
    def _ask(self):
        """Suggest next point at which to evaluate the objective

        Returns
        -------
        Some point in :attr:`space`, which is random while less than `n_initial_points` observations
        have been `tell`-ed. After that, `base_estimator` is used to determine the next point

        Notes
        -----
        If the suggested point has already been evaluated, a random point will be returned instead,
        optionally accompanied by a warning message (depending on :attr:`warn_on_re_ask`)"""
        if self._n_initial_points > 0 or self.base_estimator is None:
            # Does not copy `self.rng` in order to keep advancing random state
            return self.space.rvs(random_state=self.rng)[0]
        else:
            if not self.models:
                raise RuntimeError(
                    "Random evaluations exhausted and no model has been fit")

            #################### Check for Repeated Suggestion ####################
            next_x = self._next_x
            # Check distances between `next_x` and all evaluated points
            min_delta_x = min(
                [self.space.distance(next_x, xi) for xi in self.Xi])

            if abs(min_delta_x) <= 1e-8:  # `next_x` has already been evaluated
                if self.warn_on_re_ask:
                    G.warn_("Repeated suggestion: {}".format(next_x))

                # Set `_next_x` to random point, then re-invoke `_ask` to validate new point
                self._next_x = self.space.rvs(random_state=self.rng)[0]
                return self._ask()

            # Return point computed from last call to `tell`
            return next_x
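
The guard above measures the distance between the suggested point and every previously evaluated point, falling back to a random sample when the minimum distance is effectively zero. A simplified one-dimensional sketch (plain floats instead of the optimizer's `Space`; the helper name is illustrative):

import random

def ask(next_x, evaluated_points, sample_space, tol=1e-8):
    """Return `next_x` unless it has effectively already been evaluated; then sample randomly."""
    if evaluated_points:
        min_delta_x = min(abs(next_x - xi) for xi in evaluated_points)
        if min_delta_x <= tol:                    # `next_x` has already been evaluated
            return random.choice(sample_space)    # fall back to a random point
    return next_x

evaluated = [0.1, 0.5, 0.9]
print(ask(0.5, evaluated, sample_space=[0.2, 0.3, 0.7]))   # repeated -> random fallback
print(ask(0.42, evaluated, sample_space=[0.2, 0.3, 0.7]))  # new point -> returned as-is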
    def save_key(self):
        """Create an entry in the dict corresponding to the file at
        :attr:`cross_experiment_key.key`, whose key is :attr:`key`, and whose value is an empty
        list if :attr:`exists` is False"""
        if not self.exists:
            if self.cross_experiment_key.exists is False:
                raise ValueError(
                    'Cannot save hyperparameter_key: "{}", before cross_experiment_key "{}" has been saved'
                    .format(self.key, self.cross_experiment_key.key))

            key_path = f"{self.tested_keys_dir}/{self.cross_experiment_key.key}.json"
            add_to_json(
                file_path=key_path,
                data_to_add=[],
                key=self.key,
                condition=lambda _: self.key not in _.keys(),
            )

            self.exists = True
            G.log(f'Saved {self.key_type}_key: "{self.key}"')
        else:
            G.log(
                f'{self.key_type}_key "{self.key}" already exists - Skipped saving'
            )
    def _generate_hyperparameter_key(self):
        """Set :attr:`hyperparameter_key` to a key to describe the experiment's hyperparameters"""
        parameters = dict(
            model_initializer=self.model_initializer,
            model_init_params=self.model_init_params,
            model_extra_params=self.model_extra_params,
            feature_engineer=self.feature_engineer,
            feature_selector=self.feature_selector,
            # FLAG: Should probably add :attr:`target_metric` to key - With option to ignore it?
        )

        self.hyperparameter_key = HyperparameterKeyMaker(
            parameters, self.cross_experiment_key)
        G.log("Hyperparameter Key:     '{}'".format(self.hyperparameter_key))
        G.debug("Raw hyperparameters...")
        G.debug(self.hyperparameter_key.parameters)
    def experiment_workflow(self):
        """Define the actual experiment process, including execution, result saving, and cleanup"""
        if self.hyperparameter_key.exists is True:
            _ex = f"{self!r} has already been run"
            if self.do_raise_repeated is True:
                self._clean_up()
                raise RepeatedExperimentError(_ex)
            G.debug(_ex)
            G.warn("WARNING: Duplicate experiment!")

        self._initialize_random_seeds()
        self.execute()

        #################### Save Experiment Results ####################
        recorders = RecorderList(file_blacklist=G.Env.file_blacklist,
                                 extra_recorders=G.Env.experiment_recorders)
        recorders.format_result()
        G.log(f"Saving results for Experiment: '{self.experiment_id}'")
        recorders.save_result()
        self._clean_up()
Example #29
    def find(self):
        """Execute full result-finding workflow"""
        self._get_ids()
        G.debug_(
            F'Experiments found with matching cross-experiment key and algorithm: {len(self.experiment_ids)}'
        )
        self._get_scored_params()
        self._filter_by_space()
        G.debug_(
            F'Experiments whose hyperparameters fit in the currently defined space: {len(self.hyperparameters_and_scores)}'
        )

        if self.module_name == 'keras':
            if ('model_init_params', 'compile_params', 'optimizer') in self.hyperparameter_space.get_names():
                self._filter_by_guidelines_multi(('model_init_params', 'compile_params', 'optimizer'))
            else:
                self._filter_by_guidelines()
        else:
            self._filter_by_guidelines()
        G.debug_(
            F'Experiments whose hyperparameters match the current guidelines: {len(self.similar_experiments)}'
        )
    def set_experiment_guidelines(
        self,
        model_initializer,
        model_init_params,
        model_extra_params=None,
        feature_selector=None,
        preprocessing_pipeline=None,
        preprocessing_params=None,
        notes=None,
        do_raise_repeated=True,
    ):
        """Provide the arguments necessary to instantiate :class:`experiments.CrossValidationExperiment`. This method has the same
        signature as :meth:`experiments.BaseExperiment.__init__` except where noted

        Parameters
        ----------
        model_initializer: Class, or functools.partial, or class instance
            The algorithm class being used to initialize a model
        model_init_params: Dict, or object
            The dictionary of arguments given when creating a model instance with `model_initializer` via the `__init__` method
            of :class:`models.Model`. Any kwargs that are considered valid by the `__init__` method of `model_initializer` are
            valid in `model_init_params`
        model_extra_params: Dict, or None, default=None
            A dictionary of extra parameters passed to :class:`models.Model`. This is used to provide parameters to models'
            non-initialization methods (like `fit`, `predict`, `predict_proba`, etc.), and for neural networks
        feature_selector: List of str, callable, or list of booleans, default=None
            The value provided when splitting apart the input data for all provided DataFrames. `feature_selector` is provided as
            the second argument for calls to `pandas.DataFrame.loc` in :meth:`BaseExperiment._initial_preprocessing`. If None,
            `feature_selector` is set to all columns in :attr:`train_dataset`, less :attr:`target_column` and :attr:`id_column`
        preprocessing_pipeline: ...
            ... Experimental...
        preprocessing_params: ...
            ... Experimental...
        notes: String, or None, default=None
            Additional information about the Experiment that will be saved with the Experiment's description result file. This
            serves no purpose other than to facilitate saving Experiment details in a more readable format
        do_raise_repeated: Boolean, default=True
            If True and this Experiment locates a previous Experiment's results with matching Environment and Hyperparameter Keys,
            a RepeatedExperimentError will be raised. Else, a warning will be logged

        Notes
        -----
        The `auto_start` kwarg is not available here because :meth:`BaseOptimizationProtocol._execute_experiment` sets it to False
        in order to check for duplicated keys before running the whole Experiment. This is the most notable difference between
        calling :meth:`set_experiment_guidelines` and instantiating :class:`experiments.CrossValidationExperiment`"""
        self.model_initializer = model_initializer

        self.model_init_params = identify_algorithm_hyperparameters(
            self.model_initializer)
        try:
            self.model_init_params.update(model_init_params)
        except TypeError:
            self.model_init_params.update(dict(build_fn=model_init_params))

        self.model_extra_params = model_extra_params
        self.feature_selector = feature_selector
        self.preprocessing_pipeline = preprocessing_pipeline
        self.preprocessing_params = preprocessing_params
        self.notes = notes
        self.do_raise_repeated = do_raise_repeated

        if self.do_raise_repeated is False:
            G.warn_(
                'WARNING: Setting `do_raise_repeated`=False will allow Experiments to be unnecessarily duplicated'
            )

        self.algorithm_name, self.module_name = identify_algorithm(
            self.model_initializer)
        self._validate_guidelines()

        #################### Deal with Keras ####################
        if self.module_name == 'keras':
            reusable_build_fn, reusable_wrapper_params, dummy_layers, dummy_compile_params = keras_prep_workflow(
                self.model_initializer, self.model_init_params['build_fn'],
                self.model_extra_params, self.source_script)
            self.model_init_params = dict(build_fn=reusable_build_fn)
            self.model_extra_params = reusable_wrapper_params
            self.dummy_layers = dummy_layers
            self.dummy_compile_params = dummy_compile_params
            # FLAG: Deal with capitalization conflicts when comparing similar experiments: `optimizer`='Adam' vs 'adam'

        self.set_dimensions()