def execute():
    env = Environment(
        train_dataset=get_toy_classification_data(),
        root_results_path="HyperparameterHunterAssets",
        metrics_map=["roc_auc_score"],
        cross_validation_type=RepeatedStratifiedKFold,
        cross_validation_params=dict(n_splits=3, n_repeats=2, random_state=32),
        do_full_save=do_full_save,
    )

    experiment_0 = CVExperiment(model_initializer=XGBClassifier, model_init_params=dict(subsample=0.01))
    # Pro Tip: By setting XGBoost's `subsample` ridiculously low, we can get bad scores on purpose

    # Upon completion of this Experiment, we see a warning that not all result files will be saved
    # This is because the final score of the Experiment was below our threshold of 0.75
    # Specifically, we skipped saving prediction files (OOF, holdout, test, or in-fold), and the heartbeat file
    # What still got saved is the Experiment's: key information, leaderboard position, and description file
    # These are saved so the information can be used for future hyperparameter optimization, and to detect repeated Experiments
    # Additionally, the Experiment's script backup is saved, but that's because it's one of the first things that happens
    # For even finer control over what gets saved, use `do_full_save` together with `file_blacklist`

    # Now, let's perform another Experiment that does a bit better than our intentionally miserable one
    experiment_1 = CVExperiment(model_initializer=XGBClassifier, model_init_params=dict(subsample=0.5))
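# Hypothetical sketch of the `do_full_save` callable referenced above (the full example defines it
# before `execute`). It assumes the callable receives the Experiment's result description dict and
# returns a bool telling HyperparameterHunter whether to save the full set of result files; the
# 0.75 threshold mirrors the comments above, but treat the exact dict keys as an assumption.
def do_full_save(experiment_result):
    # Only save everything if the OOF ROC-AUC clears our 0.75 threshold
    return experiment_result["final_evaluations"]["oof"]["roc_auc_score"] > 0.75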
def test_feature_engineer_list_experiment_equality(env_boston, steps_0, steps_1):
    """Test that the `feature_engineer` attribute constructed by
    :class:`~hyperparameter_hunter.experiments.CVExperiment` is the same whether it was given a
    list as input, or a :class:`~hyperparameter_hunter.feature_engineering.FeatureEngineer`"""
    exp_0 = CVExperiment(Ridge, feature_engineer=steps_0)
    exp_1 = CVExperiment(Ridge, feature_engineer=FeatureEngineer(steps_1))
    assert exp_0.feature_engineer == exp_1.feature_engineer

    # Repeat above, but switch which steps are wrapped in `FeatureEngineer`
    exp_2 = CVExperiment(Ridge, feature_engineer=steps_1)
    exp_3 = CVExperiment(Ridge, feature_engineer=FeatureEngineer(steps_0))
    assert exp_2.feature_engineer == exp_3.feature_engineer
def prepped_experiment(request):
    """Build a partially prepared :class:`~hyperparameter_hunter.experiments.CVExperiment`
    instance. Specifically, automatic execution is disabled via `auto_start=False`, then the
    following methods are called:

    1. :meth:`~hyperparameter_hunter.experiments.BaseExperiment.preparation_workflow`,
    2. :meth:`~hyperparameter_hunter.experiments.BaseExperiment._initialize_random_seeds`, and
    3. :meth:`~hyperparameter_hunter.experiments.BaseExperiment.on_exp_start`, which initializes
       the four :mod:`~hyperparameter_hunter.data.datasets` classes, then performs pre-CV
       feature engineering

    Notes
    -----
    Directly calling `on_exp_start` is ok in this test because after calling
    `_initialize_random_seeds`, `BaseExperiment` calls `execute`, which is implemented by
    `BaseCVExperiment` and only calls `cross_validation_workflow`, whose first task is to call
    `on_exp_start`. So nothing gets skipped in between"""
    #################### Build `feature_engineer` ####################
    feature_engineer = FeatureEngineer(steps=request.param)

    #################### Partially Prepare `CVExperiment` ####################
    experiment = CVExperiment(
        model_initializer=AdaBoostClassifier,
        model_init_params=dict(),
        feature_engineer=feature_engineer,
        auto_start=False,
    )
    experiment.preparation_workflow()
    # noinspection PyProtectedMember
    experiment._initialize_random_seeds()
    experiment.on_exp_start()
    return experiment
def test_feature_engineer_list_experiment_inequality(env_boston, steps_0, steps_1):
    """Test that the `feature_engineer` attribute constructed by
    :class:`~hyperparameter_hunter.experiments.CVExperiment` is NOT the same when given a list as
    input vs. a :class:`~hyperparameter_hunter.feature_engineering.FeatureEngineer` when the two
    are actually different. This is an insanity test to make sure that the related test in this
    module, :func:`test_feature_engineer_list_experiment_equality`, is not simply equating
    everything"""
    exp_0 = CVExperiment(Ridge, feature_engineer=steps_0)
    exp_1 = CVExperiment(Ridge, feature_engineer=FeatureEngineer(steps_1))
    assert exp_0.feature_engineer != exp_1.feature_engineer

    # Repeat above, but switch which steps are wrapped in `FeatureEngineer`
    exp_2 = CVExperiment(Ridge, feature_engineer=steps_1)
    exp_3 = CVExperiment(Ridge, feature_engineer=FeatureEngineer(steps_0))
    assert exp_2.feature_engineer != exp_3.feature_engineer
def _execute():
    # To start, take a look at "examples/environment_params.json" - This is the file we're giving our Environment below
    # In this file, we can define a bunch of default Environment parameters that we don't want to always explicitly provide
    # It works really well for things that won't be changing often, like the following:
    # - `root_results_path`, which we probably never want to change, so all our results go to one place;
    # - `target_column`, which will probably be a constant for your data;
    # - `metrics_map`, if you're not using any fancy metrics, and you already know what you want;
    # - `file_blacklist`, if you're angry at me for adding that one result file that's always useless
    # Other parameters, whose default values you may want to change, can simply be overridden when creating the `Environment`
    env = Environment(
        train_dataset=get_breast_cancer_data(),  # If your dataset is a str path, you can even add it to environment_params
        environment_params_path="./environment_params.json",  # Use this file for parameters not explicitly given
        cross_validation_params=dict(
            n_splits=5, shuffle=True, random_state=32
        ),  # Here we decide to override our default values
    )

    print(env.root_results_path)
    print(env.target_column)
    print(env.metrics_map)
    print(env.cross_validation_type)
    print(env.runs)
    print(env.file_blacklist)  # This includes some other values too, but you can ignore them
    # All of the above are from `environment_params_path`
    print(
        env.cross_validation_params
    )  # This is the value we provided above, rather than our `environment_params_path` default

    experiment = CVExperiment(model_initializer=KNeighborsClassifier, model_init_params={})
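# Hypothetical sketch of what "examples/environment_params.json" might contain. The keys are
# inferred from the attributes printed above; the values are illustrative placeholders, not the
# real file's contents.
import json

default_env_params = {
    "root_results_path": "HyperparameterHunterAssets",
    "target_column": "diagnosis",
    "metrics_map": ["roc_auc_score"],
    "cross_validation_type": "StratifiedKFold",
    "runs": 1,
    "file_blacklist": ["script_backup"],
}

with open("./environment_params.json", "w") as f:
    json.dump(default_env_params, f, indent=4)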
def engineer_experiment(request):
    """`CVExperiment` fixture that supports provision of a `feature_engineer` through `request`"""
    feature_engineer = FeatureEngineer(steps=request.param)
    experiment = CVExperiment(
        model_initializer=Ridge, model_init_params=dict(), feature_engineer=feature_engineer
    )
    return experiment
def _execute():
    #################### Environment ####################
    env = Environment(
        train_dataset=get_breast_cancer_data(target="target"),
        root_results_path="HyperparameterHunterAssets",
        metrics_map=["roc_auc_score"],
        cross_validation_type="StratifiedKFold",
        cross_validation_params=dict(n_splits=5, shuffle=True, random_state=32),
    )

    #################### Experimentation ####################
    experiment = CVExperiment(
        model_initializer=KerasClassifier,
        model_init_params=dict(build_fn=_build_fn_experiment),
        model_extra_params=dict(
            callbacks=[ReduceLROnPlateau(patience=5)], batch_size=32, epochs=10, verbose=0
        ),
    )

    #################### Optimization ####################
    optimizer = BayesianOptimization(iterations=10)
    optimizer.set_experiment_guidelines(
        model_initializer=KerasClassifier,
        model_init_params=dict(build_fn=_build_fn_optimization),
        model_extra_params=dict(
            callbacks=[ReduceLROnPlateau(patience=Integer(5, 10))],
            batch_size=Categorical([32, 64], transform="onehot"),
            epochs=10,
            verbose=0,
        ),
    )
    optimizer.go()
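# Hypothetical sketches of the `_build_fn_experiment`/`_build_fn_optimization` helpers used above
# (the full example defines them elsewhere). They assume HyperparameterHunter's Keras convention
# of a `build_fn` that receives `input_shape` and returns a compiled model; layer sizes and the
# search dimensions below are illustrative only.
from keras.layers import Dense, Dropout
from keras.models import Sequential


def _build_fn_experiment(input_shape):
    model = Sequential([
        Dense(100, activation="relu", input_shape=input_shape),
        Dropout(0.5),
        Dense(1, activation="sigmoid"),
    ])
    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
    return model


def _build_fn_optimization(input_shape):
    # Same architecture, but concrete values are swapped for search dimensions to optimize
    model = Sequential([
        Dense(Integer(50, 150), activation="relu", input_shape=input_shape),
        Dropout(0.5),
        Dense(1, activation="sigmoid"),
    ])
    model.compile(
        optimizer=Categorical(["adam", "rmsprop"]), loss="binary_crossentropy", metrics=["accuracy"]
    )
    return model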
def execute():
    #################### Environment ####################
    env = Environment(
        train_dataset=get_boston_data(),
        results_path="HyperparameterHunterAssets",
        holdout_dataset=get_holdout_data,
        target_column="DIS",
        metrics=["r2_score", "median_absolute_error"],
        cv_type="KFold",
        cv_params=dict(n_splits=10, random_state=1),
    )

    #################### CVExperiment ####################
    exp_0 = CVExperiment(
        model_initializer=Ridge,
        model_init_params=dict(),
        feature_engineer=FeatureEngineer([quantile_transform]),
    )

    #################### Optimization ####################
    # `opt_0` recognizes `exp_0`'s `feature_engineer` and its results as valid learning material
    # This is because `opt_0` marks the engineer step functions omitted by `exp_0` as `optional=True`
    opt_0 = DummyOptPro(iterations=10)
    opt_0.forge_experiment(
        model_initializer=Ridge,
        model_init_params=dict(),
        feature_engineer=FeatureEngineer([
            Categorical([quantile_transform, log_transform], optional=True),
            Categorical([standard_scale, standard_scale_BAD], optional=True),
            Categorical([square_sum_feature], optional=True),
        ]),
    )
    opt_0.go()
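# Hypothetical sketches of the engineer step functions referenced above (e.g. `standard_scale`,
# `quantile_transform`). They assume HyperparameterHunter's feature engineering convention: a step
# function's parameter names declare which data it wants, and it returns the transformed data
# (plus, optionally, a fitted transformer used to invert target transformations).
from sklearn.preprocessing import QuantileTransformer, StandardScaler


def standard_scale(train_inputs, non_train_inputs):
    # Fit the scaler on the training inputs, then apply it to all non-train inputs
    scaler = StandardScaler()
    train_inputs[train_inputs.columns] = scaler.fit_transform(train_inputs.values)
    non_train_inputs[non_train_inputs.columns] = scaler.transform(non_train_inputs.values)
    return train_inputs, non_train_inputs


def quantile_transform(train_targets, non_train_targets):
    # Transform the targets; returning `transformer` lets predictions be inverse-transformed
    transformer = QuantileTransformer(output_distribution="normal")
    train_targets[train_targets.columns] = transformer.fit_transform(train_targets.values)
    non_train_targets[non_train_targets.columns] = transformer.transform(non_train_targets.values)
    return train_targets, non_train_targets, transformer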
def execute():
    env = Environment(
        train_dataset=get_breast_cancer_data(),
        results_path="HyperparameterHunterAssets",
        target_column="diagnosis",
        metrics=["roc_auc_score"],
        cv_type="StratifiedKFold",
        cv_params=dict(n_splits=5, shuffle=True, random_state=32),
    )

    experiment = CVExperiment(
        model_initializer=KerasClassifier,
        model_init_params=build_fn,
        model_extra_params=dict(
            callbacks=[
                ModelCheckpoint(
                    filepath=os.path.abspath("foo_checkpoint"), save_best_only=True, verbose=1
                ),
                ReduceLROnPlateau(patience=5),
            ],
            batch_size=32,
            epochs=10,
            verbose=0,
            shuffle=True,
        ),
    )
def execute():
    """This is going to be a very simple example to illustrate what exactly HyperparameterHunter
    does, and how it revolutionizes hyperparameter optimization."""
    # Start by creating an `Environment` - This is where you define how Experiments (and optimization) will be conducted
    env = Environment(
        train_dataset=get_breast_cancer_data(target="target"),
        root_results_path="HyperparameterHunterAssets",
        metrics_map=["roc_auc_score"],
        cross_validation_type="StratifiedKFold",
        cross_validation_params=dict(n_splits=10, shuffle=True, random_state=32),
    )

    # Now, conduct an `Experiment`
    # This tells HyperparameterHunter to use the settings in the active `Environment` to train a model with these hyperparameters
    experiment = CVExperiment(
        model_initializer=XGBClassifier,
        model_init_params=dict(objective="reg:linear", max_depth=3),
    )

    # That's it. No annoying boilerplate code to fit models and record results
    # Now, the `Environment`'s `root_results_path` directory will contain new files describing the Experiment just conducted

    # Time for the fun part. We'll set up some hyperparameter optimization by first defining the `OptimizationProtocol` we want
    optimizer = BayesianOptimization(verbose=1)

    # Now we're going to say which hyperparameters we want to optimize
    # Notice how this looks just like our `experiment` above
    optimizer.set_experiment_guidelines(
        model_initializer=XGBClassifier,
        model_init_params=dict(
            objective="reg:linear",  # We're setting this as a constant guideline - Not one to optimize
            max_depth=Integer(2, 10),  # Instead of using an int like the `experiment` above, we provide a space to search
        ),
    )
    # Notice that our range for `max_depth` includes the `max_depth=3` value we used in our `experiment` earlier

    optimizer.go()  # Now, we go

    # Here we're verifying that the `experiment` we conducted first was found by `optimizer` and used as learning material
    # You can also see via the console that we found `experiment`'s saved files, and used it to start optimization
    assert experiment.experiment_id in [_[2] for _ in optimizer.similar_experiments]

    # Let's save the id of the experiment that was just conducted by `optimizer`
    last_experiment_id = optimizer.current_experiment.experiment_id

    optimizer.go()  # Now, we'll start up `optimizer` again...

    # And we can see that this second optimization round learned from both our first `experiment` and our first optimization round
    assert experiment.experiment_id in [_[2] for _ in optimizer.similar_experiments]
    assert last_experiment_id in [_[2] for _ in optimizer.similar_experiments]
def exp_lambda_cb(lambda_cbs):
    """Return a `CVExperiment` with `lambda_cbs` as `callbacks`

    Parameters
    ----------
    lambda_cbs: `LambdaCallback`, list of `LambdaCallback`, or None
        LambdaCallback values passed to the `CVExperiment`'s `callbacks` kwarg"""
    return CVExperiment(AdaBoostRegressor, callbacks=lambda_cbs)
def test_sentinels_experiment(env_0):
    # noinspection PyUnusedLocal
    experiment = CVExperiment(
        model_initializer=XGBClassifier,
        model_init_params=dict(objective="reg:linear", max_depth=3, subsample=0.5),
        model_extra_params=dict(
            fit=dict(eval_set=get_all_sentinels(env_0), early_stopping_rounds=5, eval_metric="mae")
        ),
    )
def test_do_not_validate(env_boston):
    exp = CVExperiment(
        model_initializer=Ridge,
        model_init_params={},
        feature_engineer=FeatureEngineer([standard_scale], do_validate=False),
    )
    for step in exp.feature_engineer.steps:
        assert step.original_hashes == {}
        assert step.updated_hashes == {}
def test_inverse_type_error(env_boston):
    """Test that an error is raised if an `EngineerStep` function returns an extra value that is
    not a function or class instance. Extra return values are used for inverse transformations"""
    with pytest.raises(TypeError, match="`inversion` must be callable, or class with .*"):
        exp = CVExperiment(
            model_initializer=Ridge,
            model_init_params={},
            feature_engineer=FeatureEngineer([bad_quantile_transform]),
        )
def experiment_fixture(request):
    #################### Build `feature_engineer` ####################
    feature_engineer = FeatureEngineer(steps=request.param)

    #################### Execute `CVExperiment` ####################
    experiment = CVExperiment(
        model_initializer=AdaBoostClassifier,
        model_init_params=dict(),
        feature_engineer=feature_engineer,
    )
    return experiment
def exp_lgb_0():
    return CVExperiment(
        model_initializer=LGBMClassifier,
        model_init_params=dict(
            boosting_type="gbdt",
            num_leaves=5,
            max_depth=5,
            min_child_samples=1,
            subsample=0.5,
            verbose=-1,
        ),
    )
def engineer_experiment(request):
    """`CVExperiment` fixture that supports provision of a `feature_engineer` through `request`

    Parameters
    ----------
    request: Object
        If `request` has a "param" attribute, it must be a list of feature engineering steps to
        provide to :class:`~hyperparameter_hunter.feature_engineering.FeatureEngineer`"""
    feature_engineer = FeatureEngineer(steps=getattr(request, "param", None))
    experiment = CVExperiment(
        model_initializer=SVC, model_init_params=dict(), feature_engineer=feature_engineer
    )
    return experiment
def execute():
    env = Environment(
        train_dataset=get_toy_classification_data(),
        results_path="HyperparameterHunterAssets",
        metrics=["roc_auc_score"],
        cv_type="StratifiedKFold",
        cv_params=dict(n_splits=5, shuffle=True, random_state=32),
    )
    experiment = CVExperiment(
        model_initializer=XGBClassifier,
        model_init_params=dict(
            objective="reg:linear", max_depth=3, n_estimators=100, subsample=0.5
        ),
    )
def _execute():
    env = Environment(
        train_dataset=prep_data(),
        root_results_path="HyperparameterHunterAssets",
        metrics_map=["roc_auc_score"],
        target_column=[f"target_{_}" for _ in range(10)],
        cross_validation_type="StratifiedKFold",
        cross_validation_params=dict(n_splits=10, shuffle=True, random_state=True),
    )
    experiment = CVExperiment(
        model_initializer=KerasClassifier,
        model_init_params=build_fn,
        model_extra_params=dict(batch_size=32, epochs=10, verbose=0, shuffle=True),
    )
def experiment_prep_fixture(request):
    #################### Build `feature_engineer` ####################
    feature_engineer = FeatureEngineer(steps=request.param)

    #################### Partially Prepare `CVExperiment` ####################
    experiment = CVExperiment(
        model_initializer=AdaBoostClassifier,
        model_init_params=dict(),
        feature_engineer=feature_engineer,
        auto_start=False,
    )
    experiment.preparation_workflow()
    # noinspection PyProtectedMember
    experiment._initialize_random_seeds()
    # noinspection PyProtectedMember
    experiment._initial_preprocessing()
    return experiment
def _execute():
    env = Environment(
        train_dataset=prep_data(),
        results_path="HyperparameterHunterAssets",
        metrics=["roc_auc_score"],
        cv_type="StratifiedKFold",
        cv_params=dict(n_splits=3, shuffle=True, random_state=True),
    )
    experiment = CVExperiment(
        model_initializer=KerasClassifier,
        model_init_params=build_fn,
        model_extra_params=dict(batch_size=32, epochs=3, verbose=0, shuffle=True),
    )
def test_categorical_tuple_match(env_digits):
    """Test that optimization of a `Categorical` space whose values are tuples can be performed,
    and that saved results from such a space are correctly identified as similar Experiments"""
    model_extra_params = dict(batch_size=32, epochs=3, verbose=0, shuffle=True)
    exp_0 = CVExperiment(KerasClassifier, build_fn_digits_exp, model_extra_params)

    #################### First OptPro ####################
    opt_0 = BayesianOptPro(iterations=1, random_state=32, n_initial_points=1)
    opt_0.forge_experiment(KerasClassifier, build_fn_digits_opt, model_extra_params)
    opt_0.go()
    assert len(opt_0.similar_experiments) == 1  # Should match `exp_0`

    #################### Second OptPro ####################
    opt_1 = BayesianOptPro(iterations=1, random_state=32, n_initial_points=1)
    opt_1.forge_experiment(KerasClassifier, build_fn_digits_opt, model_extra_params)
    opt_1.go()
    assert len(opt_1.similar_experiments) == 2  # Should match `exp_0` and `opt_0`
def test_optional_step_matching_by_exp(env_boston, es_0, es_1, es_2):
    """Test that the result of an Experiment is correctly matched by an OptPro with
    all-`optional` `EngineerStep` dimensions"""
    feature_engineer = [_ for _ in [es_0, es_1, es_2] if _ is not None]
    exp_0 = CVExperiment(XGBRegressor, feature_engineer=feature_engineer)

    opt_0 = ExtraTreesOptPro(iterations=1, random_state=32)
    opt_0.forge_experiment(
        XGBRegressor,
        feature_engineer=[
            Categorical([es_a], optional=True),
            Categorical([es_b, es_c], optional=True),
            Categorical([es_d, es_e], optional=True),
        ],
    )
    opt_0.get_ready()

    # Assert `opt_0` matched with `exp_0`
    assert len(opt_0.similar_experiments) == 1
def execute(): env = Environment( train_dataset=get_toy_classification_data(), results_path="HyperparameterHunterAssets", # Both `holdout_dataset`, and `train_dataset` can be any of the following: pandas.DataFrame, filepath, or None # If a filepath is provided, it will be passed to :meth:`pandas.read_csv`. # In addition to the above types, `holdout_dataset` can also be provided as a callable (see above :func:`get_holdout_set`) holdout_dataset=get_holdout_set, test_dataset=get_toy_classification_data(), # By default, `holdout_dataset` will be scored with the provided metrics, just like OOF predictions # However, you can provide the additional `metrics_params` kwarg to specify which metrics are calculated for each dataset # See the documentation in :class:`environment.Environment` and :class:`metrics.ScoringMixIn` for more information metrics=["roc_auc_score"], cv_type=StratifiedKFold, cv_params=dict(n_splits=5, shuffle=True, random_state=32), ) experiment = CVExperiment(model_initializer=XGBClassifier, model_init_params=dict(subsample=0.5))
def execute(): env = Environment( train_dataset=get_toy_classification_data(), results_path="HyperparameterHunterAssets", metrics=["roc_auc_score"], cv_type=RepeatedStratifiedKFold, cv_params=dict(n_splits=5, n_repeats=2, random_state=32), runs=2, # Just instantiate `Environment` with your list of callbacks, and go about business as usual experiment_callbacks=[printer_callback(), confusion_matrix_oof()], # In addition to `printer_callback` made above, we're also adding the `confusion_matrix_oof` callback # This, and other callbacks, can be found in `hyperparameter_hunter.callbacks.recipes` ) experiment = CVExperiment( model_initializer=XGBClassifier, model_init_params={}, model_extra_params=dict(fit=dict(verbose=False)), )
def execute():
    train_df, holdout_df = prep_data()

    env = Environment(
        train_dataset=train_df,
        results_path="HyperparameterHunterAssets",
        metrics=["roc_auc_score"],
        target_column=[f"target_{_}" for _ in range(10)],  # 10 classes (one-hot-encoded output)
        holdout_dataset=holdout_df,
        cv_type="StratifiedKFold",
        cv_params=dict(n_splits=3, shuffle=True, random_state=True),
    )

    exp = CVExperiment(KerasClassifier, build_fn_exp, dict(batch_size=64, epochs=10, verbose=1))

    opt = BayesianOptPro(iterations=10, random_state=32)
    opt.forge_experiment(KerasClassifier, build_fn_opt, dict(batch_size=64, epochs=10, verbose=0))
    opt.go()
def test_similar_experiments_unordered():
    """Check that an experiment with a single `EngineerStep` is considered "similar" by an
    Optimization Protocol with two `optional` `EngineerStep`s, where the second step is identical
    to the single step used by the standalone experiment.

    As of v3.0.0alpha2, this is expected to fail because the otherwise identical engineer steps
    occur at different indexes in `FeatureEngineer.steps` for the experiment and the OptPro. The
    experiment has `sqr_sum_feature` at index=0, while the same step in the OptPro is at index=1.
    Note that the step index in the OptPro is still 1 despite the fact that the other step
    immediately preceding it is `optional`"""
    env = Environment(
        train_dataset=get_breast_cancer_data(),
        results_path=assets_dir,
        target_column="diagnosis",
        metrics=["roc_auc_score"],
        cv_type="StratifiedKFold",
        cv_params=dict(n_splits=5, shuffle=True, random_state=32),
    )

    exp = CVExperiment(
        model_initializer=XGBClassifier,
        model_init_params=dict(objective="reg:linear", subsample=0.5, max_depth=3),
        feature_engineer=FeatureEngineer([EngineerStep(sqr_sum_feature)]),
    )

    opt = BayesianOptPro(iterations=1)
    opt.forge_experiment(
        model_initializer=XGBClassifier,
        model_init_params=dict(objective="reg:linear", subsample=0.5, max_depth=3),
        feature_engineer=FeatureEngineer([
            Categorical([standard_scale, normalize, min_max_scale], optional=True),
            Categorical([sqr_sum_feature], optional=True),
        ]),
    )
    opt.go()

    assert exp.experiment_id in [_[2] for _ in opt.similar_experiments]
def test_predictor_holdout_iris():
    G.priority_callbacks = (DummyExperimentPredictorHoldout,)

    #################### Set Up Environment ####################
    env = Environment(
        train_dataset=get_iris_data(),
        results_path=assets_dir,
        holdout_dataset=get_holdout,
        target_column="species",
        metrics=dict(f1=lambda t, p: f1_score(t, p, average="micro"), hamming_loss="hamming_loss"),
        cv_type="StratifiedKFold",
        cv_params=dict(n_splits=5, shuffle=True, random_state=32),
    )

    #################### Perform Experiment ####################
    experiment = CVExperiment(
        model_initializer=XGBClassifier,
        model_init_params=dict(
            objective="multi:softprob",
            max_depth=1,
            n_estimators=300,
            learning_rate=0.02,
            min_child_weight=6,
            gamma=0.07,
            colsample_bytree=0.31,
        ),
        model_extra_params=dict(
            fit=dict(
                eval_set=[
                    (env.train_input, env.train_target),
                    (env.validation_input, env.validation_target),
                ],
                early_stopping_rounds=20,
                eval_metric="merror",
            )
        ),
    )

    G.priority_callbacks = tuple()
def test_predictor_holdout_breast_cancer():
    G.priority_callbacks = (DummyExperimentPredictorHoldout,)

    #################### Set Up Environment ####################
    env = Environment(
        train_dataset=get_breast_cancer_data(),
        results_path=assets_dir,
        holdout_dataset=get_holdout,
        target_column="diagnosis",
        metrics=["roc_auc_score"],
        cv_type="StratifiedKFold",
        cv_params=dict(n_splits=5, shuffle=True, random_state=32),
    )

    #################### Perform Experiment ####################
    experiment = CVExperiment(
        model_initializer=XGBClassifier,
        model_init_params=dict(
            objective="reg:linear",
            max_depth=3,
            n_estimators=100,
            learning_rate=0.02,
            min_child_weight=6,
            gamma=0.07,
            colsample_bytree=0.31,
        ),
        model_extra_params=dict(
            fit=dict(
                eval_set=[
                    (env.train_input, env.train_target),
                    (env.validation_input, env.validation_target),
                ],
                early_stopping_rounds=5,
                eval_metric="mae",
            )
        ),
    )

    G.priority_callbacks = tuple()
def execute():
    #################### Environment ####################
    env = Environment(
        train_dataset=get_iris_data(),
        results_path="HyperparameterHunterAssets",
        target_column="species",
        metrics=["hamming_loss"],
        cv_params=dict(n_splits=5, random_state=32),
    )

    #################### Experiment ####################
    # Just a reference for normal `class_weight` usage outside of optimization
    CVExperiment(RandomForestClassifier, dict(n_estimators=10, class_weight={0: 1, 1: 1, 2: 1}))

    #################### Optimization ####################
    opt = BayesianOptPro(iterations=10, random_state=32)
    opt.forge_experiment(
        model_initializer=RandomForestClassifier,
        model_init_params=dict(
            # Weight values for each class can be optimized with `Categorical`/`Integer`
            class_weight={
                0: Categorical([1, 3]),
                1: Categorical([1, 4]),
                2: Integer(1, 9),  # You can also use `Integer` for low/high ranges
            },
            criterion=Categorical(["gini", "entropy"]),
            n_estimators=Integer(5, 100),
        ),
    )
    opt.go()