Example 1
    def __init__(
        # TODO: Make `model_init_params` an optional kwarg - If not given, algorithm defaults used
        self,
        # model=None,
        # model_initializer=None,
        # model_init_params=None,
        # TODO: Convert below 2 to above 3 lines for `TranslateTrace`
        model_initializer,
        model_init_params,
        model_extra_params=None,
        feature_selector=None,
        preprocessing_pipeline=None,
        preprocessing_params=None,
        notes=None,
        do_raise_repeated=False,
        auto_start=True,
        target_metric=None,
    ):
        # TODO: When `TranslateTrace` added document `model` below with expectation that if `model`
        # TODO: ... given, (`model_initializer`, `model_init_params`) should not be, and vice versa
        # TODO: `model` (Class instance, default=None);
        # TODO: `model_initializer`/`model_init_params` docstring types += "default=None"
        """Base class for :class:`BaseCVExperiment`

        Parameters
        ----------
        model_initializer: Class, or functools.partial, or class instance
            The algorithm class being used to initialize a model
        model_init_params: Dict, or object
            The dictionary of arguments given when creating a model instance with
            `model_initializer` via the `__init__` method of :class:`models.Model`. Any kwargs that
            are considered valid by the `__init__` method of `model_initializer` are valid in
            `model_init_params`
        model_extra_params: Dict, or None, default=None
            A dictionary of extra parameters passed to :class:`models.Model`. This is used to
            provide parameters to models' non-initialization methods (like `fit`, `predict`,
            `predict_proba`, etc.), and for neural networks
        feature_selector: List of str, callable, list of booleans, default=None
            The value provided when splitting apart the input data for all provided DataFrames.
            `feature_selector` is provided as the second argument for calls to
            `pandas.DataFrame.loc` in :meth:`BaseExperiment._initial_preprocessing`. If None,
            `feature_selector` is set to all columns in :attr:`train_dataset`, less
            :attr:`target_column`, and :attr:`id_column`
        preprocessing_pipeline: ...
            ... Experimental...
        preprocessing_params: ...
            ... Experimental...
        notes: String, or None, default=None
            Additional information about the Experiment that will be saved with the Experiment's
            description result file. This serves no purpose other than to facilitate saving
            Experiment details in a more readable format
        do_raise_repeated: Boolean, default=False
            If True and this Experiment locates a previous Experiment's results with matching
            Environment and Hyperparameter Keys, a RepeatedExperimentError will be raised. Else, a
            warning will be logged
        auto_start: Boolean, default=True
            If True, after the Experiment is initialized, it will automatically call
            :meth:`BaseExperiment.preparation_workflow`, followed by
            :meth:`BaseExperiment.experiment_workflow`, effectively completing all essential tasks
            without requiring additional method calls
        target_metric: Tuple, str, default=('oof', <:attr:`environment.Environment.metrics`[0]>)
            A path denoting the metric to be used to compare completed Experiments or to use for
            certain early stopping procedures in some model classes. The first value should be one
            of ['oof', 'holdout', 'in_fold']. The second value should be the name of a metric being
            recorded according to the values supplied in
            :attr:`environment.Environment.metrics_params`. See the documentation for
            :func:`metrics.get_formatted_target_metric` for more info. Any values returned by, or
            used as the `target_metric` input to this function are acceptable values for
            :attr:`BaseExperiment.target_metric`"""
        # self._model_original = model  # TODO: Add for `TranslateTrace`
        self.model_initializer = model_initializer
        self.model_init_params = identify_algorithm_hyperparameters(
            self.model_initializer)
        try:
            self.model_init_params.update(model_init_params)
        except TypeError:
            self.model_init_params.update(dict(build_fn=model_init_params))
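        # Illustrative sketch of the merge above (assumes scikit-learn is installed): given
        # `model_initializer=RandomForestClassifier` and `model_init_params=dict(n_estimators=200)`,
        # `self.model_init_params` now holds every default from `RandomForestClassifier.__init__`,
        # with `n_estimators` overridden to 200. The `TypeError` branch covers the Keras case, in
        # which `model_init_params` is a `build_fn` callable rather than a dict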

        self.model_extra_params = model_extra_params if model_extra_params is not None else {}
        self.feature_selector = feature_selector if feature_selector is not None else []
        self.preprocessing_pipeline = preprocessing_pipeline or {}
        self.preprocessing_params = preprocessing_params if preprocessing_params is not None else {}

        self.notes = notes
        self.do_raise_repeated = do_raise_repeated
        self.auto_start = auto_start
        self.target_metric = target_metric

        #################### Attributes From Active Environment ####################
        G.Env.initialize_reporting()
        self._validate_environment()

        self.train_dataset = G.Env.train_dataset.copy()
        try:
            self.holdout_dataset = G.Env.holdout_dataset.copy()
        except AttributeError:
            self.holdout_dataset = G.Env.holdout_dataset
        try:
            self.test_dataset = G.Env.test_dataset.copy()
        except AttributeError:
            self.test_dataset = G.Env.test_dataset

        self.target_column = G.Env.target_column
        self.id_column = G.Env.id_column
        self.do_predict_proba = G.Env.do_predict_proba
        self.prediction_formatter = G.Env.prediction_formatter
        self.metrics_params = G.Env.metrics_params
        self.experiment_params = G.Env.cross_experiment_params
        self.cv_params = G.Env.cv_params
        self.result_paths = G.Env.result_paths
        self.cross_experiment_key = G.Env.cross_experiment_key

        #################### Instantiate Other Attributes ####################
        self.train_input_data = None
        self.train_target_data = None
        self.holdout_input_data = None
        self.holdout_target_data = None
        self.test_input_data = None

        self.model = None
        self.metrics = None  # Set by :class:`metrics.ScoringMixIn`
        self.stat_aggregates = dict()
        self.result_description = None

        #################### Experiment Identification Attributes ####################
        self.experiment_id = None
        self.hyperparameter_key = None
        self.algorithm_name, self.module_name = identify_algorithm(
            self.model_initializer)

        ScoringMixIn.__init__(
            self, **self.metrics_params if self.metrics_params else {})

        if self.auto_start is True:
            self.preparation_workflow()
            self.experiment_workflow()

    def set_experiment_guidelines(
        self,
        model_initializer,
        model_init_params,
        model_extra_params=None,
        feature_selector=None,
        preprocessing_pipeline=None,
        preprocessing_params=None,
        notes=None,
        do_raise_repeated=True,
    ):
        """Provide the arguments necessary to instantiate :class:`experiments.CrossValidationExperiment`. This method has the same
        signature as :meth:`experiments.BaseExperiment.__init__` except where noted

        Parameters
        ----------
        model_initializer: Class, or functools.partial, or class instance
            The algorithm class being used to initialize a model
        model_init_params: Dict, or object
            The dictionary of arguments given when creating a model instance with `model_initializer` via the `__init__` method
            of :class:`models.Model`. Any kwargs that are considered valid by the `__init__` method of `model_initializer` are
            valid in `model_init_params`
        model_extra_params: Dict, or None, default=None
            A dictionary of extra parameters passed to :class:`models.Model`. This is used to provide parameters to models'
            non-initialization methods (like `fit`, `predict`, `predict_proba`, etc.), and for neural networks
        feature_selector: List of str, callable, list of booleans, default=None
            The value provided when splitting apart the input data for all provided DataFrames. `feature_selector` is provided as
            the second argument for calls to `pandas.DataFrame.loc` in :meth:`BaseExperiment._initial_preprocessing`. If None,
            `feature_selector` is set to all columns in :attr:`train_dataset`, less :attr:`target_column`, and :attr:`id_column`
        preprocessing_pipeline: ...
            ... Experimental...
        preprocessing_params: ...
            ... Experimental...
        notes: String, or None, default=None
            Additional information about the Experiment that will be saved with the Experiment's description result file. This
            serves no purpose other than to facilitate saving Experiment details in a more readable format
        do_raise_repeated: Boolean, default=True
            If True and this Experiment locates a previous Experiment's results with matching Environment and Hyperparameter Keys,
            a RepeatedExperimentError will be raised. Else, a warning will be logged

        Notes
        -----
        The `auto_start` kwarg is not available here because :meth:`BaseOptimizationProtocol._execute_experiment` sets it to False
        in order to check for duplicated keys before running the whole Experiment. This is the most notable difference between
        calling :meth:`set_experiment_guidelines` and instantiating :class:`experiments.CrossValidationExperiment`"""
        self.model_initializer = model_initializer

        self.model_init_params = identify_algorithm_hyperparameters(
            self.model_initializer)
        try:
            self.model_init_params.update(model_init_params)
        except TypeError:
            self.model_init_params.update(dict(build_fn=model_init_params))

        self.model_extra_params = model_extra_params
        self.feature_selector = feature_selector
        self.preprocessing_pipeline = preprocessing_pipeline
        self.preprocessing_params = preprocessing_params
        self.notes = notes
        self.do_raise_repeated = do_raise_repeated

        if self.do_raise_repeated is False:
            G.warn_(
                'WARNING: Setting `do_raise_repeated`=False will allow Experiments to be unnecessarily duplicated'
            )

        self.algorithm_name, self.module_name = identify_algorithm(
            self.model_initializer)
        self._validate_guidelines()

        #################### Deal with Keras ####################
        if self.module_name == 'keras':
            reusable_build_fn, reusable_wrapper_params, dummy_layers, dummy_compile_params = keras_prep_workflow(
                self.model_initializer, self.model_init_params['build_fn'],
                self.model_extra_params, self.source_script)
            self.model_init_params = dict(build_fn=reusable_build_fn)
            self.model_extra_params = reusable_wrapper_params
            self.dummy_layers = dummy_layers
            self.dummy_compile_params = dummy_compile_params
            # FLAG: Deal with capitalization conflicts when comparing similar experiments: `optimizer`='Adam' vs 'adam'
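            # Hedged sketch of the sort of `build_fn` this branch expects (hypothetical model;
            # assumes Keras is installed and the scikit-learn wrapper convention is followed):
            #     def build_fn(input_shape=-1):
            #         model = Sequential([
            #             Dense(100, input_shape=(input_shape,), activation="relu"),
            #             Dense(1, activation="sigmoid"),
            #         ])
            #         model.compile(optimizer="adam", loss="binary_crossentropy")
            #         return model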

        self.set_dimensions()

    def __init__(
        self,
        model_initializer,
        model_init_params=None,
        model_extra_params=None,
        feature_engineer=None,
        feature_selector=None,
        notes=None,
        do_raise_repeated=False,
        auto_start=True,
        target_metric=None,
    ):
        """One-off Experimentation base class

        **Bare-bones Description:** Runs the cross-validation scheme defined by `Environment`,
        during which 1) Datasets are processed according to `feature_engineer`; 2) Models are built
        by instantiating `model_initializer` with `model_init_params`; 3) Models are trained on
        processed data, optionally using parameters from `model_extra_params`; 4) Results are
        logged and recorded for each fitting period; 5) Descriptions, predictions, results (both
        averages and individual periods), etc. are saved.

        **What's the Big Deal?** The most important takeaway from the above description is that
        descriptions/results are THOROUGH and REUSABLE. By thorough, I mean that all of a model's
        hyperparameters are saved, not just the ones given in `model_init_params`. This may sound
        odd, but it's important because it makes results reusable during optimization, when you may
        be using a different set of hyperparameters. It helps with other things like preventing
        duplicate experiments and ensembling, as well. But the big part is that this transforms
        hyperparameter optimization from an isolated, throwaway process we can only afford when an
        ML project is sufficiently "mature" to a process that covers the entire lifespan of a
        project. No Experiment is forgotten or wasted. Optimization is automatically given the data
        it needs to succeed by drawing on all your past Experiments and optimization rounds.

        The Experiment has three primary missions:
        1. Act as a scaffold for organizing ML Experimentation and optimization
        2. Record Experiment descriptions and results
        3. Eliminate lots of repetitive/error-prone boilerplate code

        Providing a scaffold for the entire ML process is critical because without a standardized
        format, everything we do looks different. Without a unified scaffold, development is slower,
        more confusing, and less adaptable. One of the benefits of standardizing the format of ML
        Experimentation is that it enables us to exhaustively record all the important
        characteristics of an Experiment, as well as an assortment of customizable result files -- all
        in a way that allows them to be reused in the future.

        **What About Data/Metrics?** Experiments require an active
        :class:`~hyperparameter_hunter.environment.Environment` in order to function, from which
        the Experiment collects important cross-experiment parameters, such as datasets, metrics,
        cross-validation schemes, and even callbacks to inherit, among many other properties
        documented in :class:`~hyperparameter_hunter.environment.Environment`

        Parameters
        ----------
        model_initializer: Class, or functools.partial, or class instance
            Algorithm class used to initialize a model, such as XGBoost's `XGBRegressor`, or
            SKLearn's `KNeighborsClassifier`; although, there are hundreds of possibilities across
            many different ML libraries. `model_initializer` is expected to define at least `fit`
            and `predict` methods. `model_initializer` will be initialized with `model_init_params`,
            and its "extra" methods (`fit`, `predict`, etc.) will be invoked with parameters in
            `model_extra_params`
        model_init_params: Dict, or object (optional)
            Dictionary of arguments given to create an instance of `model_initializer`. Any kwargs
            that are considered valid by the `__init__` method of `model_initializer` are valid in
            `model_init_params`.

            One of the key features that makes HyperparameterHunter so magical is that **ALL**
            hyperparameters in the signature of `model_initializer` (and their default values) are
            discovered -- whether or not they are explicitly given in `model_init_params`. Not only
            does this make Experiment result descriptions incredibly thorough, it also makes
            optimization smoother, more effective, and far less work for the user. For example, take
            LightGBM's `LGBMRegressor`, with `model_init_params`=`dict(learning_rate=0.2)`.
            HyperparameterHunter recognizes that this differs from the default of 0.1. It also
            recognizes that `LGBMRegressor` is actually initialized with more than a dozen other
            hyperparameters we didn't bother mentioning, and it records their values, too. So if we
            want to optimize `num_leaves` tomorrow, the OptPro doesn't start from scratch. It knows
            that we ran an Experiment that didn't explicitly mention `num_leaves`, but its default
            value was 31, and it uses this information to fuel optimization -- all without us having
            to manually keep track of tons of janky collections of hyperparameters. In fact, we
            really don't need to go out of our way at all. HyperparameterHunter just acts as our
            faithful lab assistant, keeping track of all the stuff we'd rather not worry about
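
            A minimal sketch of the above (assuming LightGBM is installed, an `Environment` is
            already active, and a concrete subclass such as `CVExperiment` is used)::

                experiment = CVExperiment(
                    model_initializer=LGBMRegressor,
                    model_init_params=dict(learning_rate=0.2),
                )
                # The result description also records `num_leaves=31` and every other
                # `LGBMRegressor.__init__` default, even though only `learning_rate` was given
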
        model_extra_params: Dict (optional)
            Dictionary of extra parameters for models' non-initialization methods (like `fit`,
            `predict`, `predict_proba`, etc.), and for neural networks. To specify parameters for
            an extra method, place them in a dict named for the extra method to which the
            parameters should be given. For example, to call `fit` with `early_stopping_rounds`=5,
            use `model_extra_params`=`dict(fit=dict(early_stopping_rounds=5))`.

            For models whose `fit` methods have a kwarg like `eval_set` (such as XGBoost's), one can
            use the `DatasetSentinel` attributes of the current active
            :class:`~hyperparameter_hunter.environment.Environment`, documented under its
            "Attributes" section and under
            :attr:`~hyperparameter_hunter.environment.Environment.train_input`. An example using
            several DatasetSentinels can be found in HyperparameterHunter's
            [XGBoost Classification Example](https://github.com/HunterMcGushion/hyperparameter_hunter/blob/master/examples/xgboost_examples/classification.py)
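
            A hedged illustration of such a `fit` dict (assuming an XGBoost model and an active
            `Environment` instance named `env`, whose `train_input` and `train_target` attributes
            are `DatasetSentinel`s)::

                model_extra_params=dict(
                    fit=dict(
                        eval_set=[(env.train_input, env.train_target)],
                        early_stopping_rounds=5,
                    )
                )
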
        feature_engineer: `FeatureEngineer`, or list (optional)
            Feature engineering/transformation/pre-processing steps to apply to datasets defined in
            :class:`~hyperparameter_hunter.environment.Environment`. If list, will be used to
            initialize :class:`~hyperparameter_hunter.feature_engineering.FeatureEngineer`, and can
            contain any of the following values:

                1. :class:`~hyperparameter_hunter.feature_engineering.EngineerStep` instance
                2. Function input to :class:`~hyperparameter_hunter.feature_engineering.EngineerStep`

            For important information on properly formatting `EngineerStep` functions, please see
            the documentation of :class:`~hyperparameter_hunter.feature_engineering.EngineerStep`.
            OptPros can perform hyperparameter optimization of `feature_engineer` steps. This
            capability adds a third allowed value to the above list and is documented in
            :meth:`~hyperparameter_hunter.optimization.protocol_core.BaseOptPro.forge_experiment`
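
            A minimal sketch (the step function below is hypothetical, and assumes SKLearn's
            `StandardScaler` is available; see the `EngineerStep` documentation for the exact
            input/return conventions)::

                def standard_scale(train_inputs, non_train_inputs):
                    scaler = StandardScaler()
                    train_inputs[train_inputs.columns] = scaler.fit_transform(train_inputs)
                    non_train_inputs[non_train_inputs.columns] = scaler.transform(non_train_inputs)
                    return train_inputs, non_train_inputs

                feature_engineer = FeatureEngineer([standard_scale])
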
        feature_selector: List of str, callable, or list of booleans (optional)
            Column names to include as input data for all provided DataFrames. If None,
            `feature_selector` is set to all columns in :attr:`train_dataset`, less
            :attr:`target_column`, and :attr:`id_column`. `feature_selector` is provided as the
            second argument for calls to `pandas.DataFrame.loc` when constructing datasets
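
            For example, `feature_selector=["feature_0", "feature_1"]` (hypothetical column names)
            restricts the model's input data to just those two columns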
        notes: String (optional)
            Additional information about the Experiment that will be saved with the Experiment's
            description result file. This serves no purpose other than to facilitate saving
            Experiment details in a more readable format
        do_raise_repeated: Boolean, default=False
            If True and this Experiment locates a previous Experiment's results with matching
            Environment and Hyperparameter Keys, a RepeatedExperimentError will be raised. Else, a
            warning will be logged
        auto_start: Boolean, default=True
            If True, after the Experiment is initialized, it will automatically call
            :meth:`BaseExperiment.preparation_workflow`, followed by
            :meth:`BaseExperiment.experiment_workflow`, effectively completing all essential tasks
            without requiring additional method calls
        target_metric: Tuple, str, default=('oof', <:attr:`environment.Environment.metrics`[0]>)
            Path denoting the metric to be used to compare completed Experiments or to use for
            certain early stopping procedures in some model classes. The first value should be one
            of ['oof', 'holdout', 'in_fold']. The second value should be the name of a metric being
            recorded according to the values supplied in
            :attr:`hyperparameter_hunter.environment.Environment.metrics_params`. See the
            documentation for :func:`hyperparameter_hunter.metrics.get_formatted_target_metric` for
            more info. Any values returned by, or used as the `target_metric` input to this function
            are acceptable values for `target_metric`
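
            For example, `target_metric=("holdout", "roc_auc_score")` targets the "roc_auc_score"
            metric evaluated on holdout data (assuming such a metric is among those supplied via
            `metrics_params`)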

        See Also
        --------
        :meth:`hyperparameter_hunter.optimization.protocol_core.BaseOptPro.forge_experiment`
            OptPro method to define hyperparameter search scaffold for building Experiments during
            optimization. This method follows the same format as Experiment initialization, but it
            adds the ability to provide hyperparameter values as ranges to search over, via
            subclasses of :class:`~hyperparameter_hunter.space.dimensions.Dimension`. The other
            notable difference is that `forge_experiment` removes the `auto_start` and
            `target_metric` kwargs, which is described in the `forge_experiment` docstring Notes
        :class:`~hyperparameter_hunter.environment.Environment`
            Provides critical information on how Experiments should be conducted, as well as the
            data to be used by Experiments. An `Environment` must be active before executing any
            Experiment or OptPro
        :func:`~hyperparameter_hunter.callbacks.bases.lambda_callback`
            Enables customization of the Experimentation process and access to all Experiment
            internals through a collection of methods that are invoked at all the important periods
            over an Experiment's lifespan. These can be provided via the `experiment_callbacks`
            kwarg of :class:`~hyperparameter_hunter.environment.Environment`, and the callback
            classes literally get thrown in to the parent classes of the Experiment, so they're
            kind of a big deal"""
        self.model_initializer = model_initializer
        self.model_init_params = identify_algorithm_hyperparameters(
            self.model_initializer)
        model_init_params = model_init_params if model_init_params is not None else {}
        try:
            self.model_init_params.update(model_init_params)
        except TypeError:
            self.model_init_params.update(dict(build_fn=model_init_params))

        self.model_extra_params = model_extra_params if model_extra_params is not None else {}

        self.feature_engineer = feature_engineer
        if not isinstance(self.feature_engineer, FeatureEngineer):
            self.feature_engineer = FeatureEngineer(self.feature_engineer)
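        # A plain list of steps (or None) given for `feature_engineer` is coerced into a
        # `FeatureEngineer` instance here, so downstream code can rely on a uniform interface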

        self.feature_selector = feature_selector if feature_selector is not None else []

        self.notes = notes
        self.do_raise_repeated = do_raise_repeated
        self.auto_start = auto_start
        self.target_metric = target_metric

        #################### Attributes From Active Environment ####################
        G.Env.initialize_reporting()
        self._validate_environment()

        self.train_dataset = G.Env.train_dataset.copy()
        try:
            self.holdout_dataset = G.Env.holdout_dataset.copy()
        except AttributeError:
            self.holdout_dataset = G.Env.holdout_dataset
        try:
            self.test_dataset = G.Env.test_dataset.copy()
        except AttributeError:
            self.test_dataset = G.Env.test_dataset

        self.target_column = G.Env.target_column
        self.id_column = G.Env.id_column
        self.do_predict_proba = G.Env.do_predict_proba
        self.prediction_formatter = G.Env.prediction_formatter
        self.metrics_params = G.Env.metrics_params
        self.experiment_params = G.Env.cross_experiment_params
        self.cv_params = G.Env.cv_params
        self.result_paths = G.Env.result_paths
        self.cross_experiment_key = G.Env.cross_experiment_key

        #################### Dataset Attributes ####################
        self.data_train = None
        self.data_oof = None
        self.data_holdout = None
        self.data_test = None

        #################### Other Attributes ####################
        self.model = None
        self.metrics = None  # Set by :class:`metrics.ScoringMixIn`
        self.stat_aggregates = dict()
        self.result_description = None

        #################### Experiment Identification Attributes ####################
        self.experiment_id = None
        self.hyperparameter_key = None
        self.algorithm_name, self.module_name = identify_algorithm(
            self.model_initializer)

        ScoringMixIn.__init__(
            self, **self.metrics_params if self.metrics_params else {})

        if self.auto_start is True:
            self.preparation_workflow()
            self.experiment_workflow()