def _execute():
    # To start, take a look at "examples/environment_params.json" - This is the file we're giving our Environment below
    # In this file, we can define a bunch of default Environment parameters that we don't want to always explicitly provide
    # It works really well for things that won't be changing often, like the following:
    # - `root_results_path`, which we probably never want to change, so all our results go to one place
    # - `target_column`, which will probably be a constant for your data
    # - `metrics_map`, if you're not using any fancy metrics, and you already know what you want
    # - `file_blacklist`, if you're angry at me for adding that one result file that's always useless
    # - Other parameters whose default values you may want to change
    env = Environment(
        train_dataset=get_breast_cancer_data(),  # If your dataset is a str path, you can even add it to environment_params
        environment_params_path="./environment_params.json",  # Use this file for parameters not explicitly given
        cross_validation_params=dict(
            n_splits=5, shuffle=True, random_state=32
        ),  # Here we decide to override our default values
    )

    print(env.root_results_path)
    print(env.target_column)
    print(env.metrics_map)
    print(env.cross_validation_type)
    print(env.runs)
    print(env.file_blacklist)  # This includes some other values too, but you can ignore them
    # All of the above are from `environment_params_path`

    print(env.cross_validation_params)
    # This is the value we provided above, rather than our `environment_params_path` default

    experiment = CVExperiment(model_initializer=KNeighborsClassifier, model_init_params={})
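# "examples/environment_params.json" itself is not shown in this snippet. The following is a
#   hypothetical sketch of the kind of defaults it could hold - the keys mirror the `Environment`
#   attributes printed above, but the values (and the helper that writes the file) are assumptions,
#   not the original file. `file_blacklist` could be listed here in the same way.
import json

_example_environment_params = {
    "root_results_path": "HyperparameterHunterAssets",
    "target_column": "diagnosis",
    "metrics_map": ["roc_auc_score"],
    "cross_validation_type": "StratifiedKFold",
    "runs": 1,
}


def _write_example_environment_params(path="./environment_params.json"):
    # Dump the example defaults to disk so `environment_params_path` can pick them up
    with open(path, "w") as f:
        json.dump(_example_environment_params, f, indent=4)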
def execute():
    env = Environment(
        train_dataset=get_breast_cancer_data(),
        root_results_path='HyperparameterHunterAssets',
        target_column='diagnosis',
        metrics_map=['roc_auc_score'],
        cross_validation_type='StratifiedKFold',
        cross_validation_params=dict(n_splits=5, shuffle=True, random_state=32),
    )

    experiment = CrossValidationExperiment(
        model_initializer=KerasClassifier,
        model_init_params=build_fn,
        model_extra_params=dict(
            callbacks=[
                ModelCheckpoint(filepath=os.path.abspath('foo_checkpoint'), save_best_only=True, verbose=1),
                ReduceLROnPlateau(patience=5),
            ],
            batch_size=32,
            epochs=10,
            verbose=0,
            shuffle=True,
        ),
    )
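# `build_fn` above (and the `_build_fn_*` helpers used by the Keras snippets that follow) is defined
#   elsewhere in the original examples. Below is a minimal, hypothetical sketch of such a builder for
#   the breast cancer data - the layer sizes and the `input_shape` default are assumptions:
from keras.models import Sequential
from keras.layers import Dense, Dropout


def build_fn(input_shape=(30,)):  # The sklearn breast cancer dataset has 30 feature columns
    model = Sequential([
        Dense(100, activation="relu", input_shape=input_shape),
        Dropout(0.5),
        Dense(1, activation="sigmoid"),
    ])
    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
    return model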
def _execute():
    #################### Environment ####################
    env = Environment(
        train_dataset=get_breast_cancer_data(target="target"),
        root_results_path="HyperparameterHunterAssets",
        metrics_map=["roc_auc_score"],
        cross_validation_type="StratifiedKFold",
        cross_validation_params=dict(n_splits=5, shuffle=True, random_state=32),
    )

    #################### Experimentation ####################
    experiment = CVExperiment(
        model_initializer=KerasClassifier,
        model_init_params=dict(build_fn=_build_fn_experiment),
        model_extra_params=dict(
            callbacks=[ReduceLROnPlateau(patience=5)], batch_size=32, epochs=10, verbose=0
        ),
    )

    #################### Optimization ####################
    optimizer = BayesianOptimization(iterations=10)
    optimizer.set_experiment_guidelines(
        model_initializer=KerasClassifier,
        model_init_params=dict(build_fn=_build_fn_optimization),
        model_extra_params=dict(
            callbacks=[ReduceLROnPlateau(patience=Integer(5, 10))],
            batch_size=Categorical([32, 64], transform="onehot"),
            epochs=10,
            verbose=0,
        ),
    )
    optimizer.go()
def _execute():
    env = Environment(
        train_dataset=get_toy_classification_data(),
        root_results_path='HyperparameterHunterAssets',
        metrics_map=['roc_auc_score'],
        cross_validation_type='StratifiedKFold',
        cross_validation_params=dict(n_splits=5, shuffle=True, random_state=32),
        runs=1,
    )

    optimizer = GradientBoostedRegressionTreeOptimization(
        iterations=10,
        read_experiments=True,
        random_state=None,
    )
    optimizer.set_experiment_guidelines(
        model_initializer=CatBoostClassifier,
        model_init_params=dict(
            iterations=100,
            eval_metric=Categorical(['Logloss', 'Accuracy', 'AUC'], transform='onehot'),
            learning_rate=Real(low=0.0001, high=0.5),
            depth=Integer(4, 7),
            save_snapshot=False,
        ),
    )
    optimizer.go()

    print('')
def execute():
    #################### Environment ####################
    env = Environment(
        train_dataset=get_boston_data(),
        results_path="HyperparameterHunterAssets",
        holdout_dataset=get_holdout_data,
        target_column="DIS",
        metrics=["r2_score", "median_absolute_error"],
        cv_type="KFold",
        cv_params=dict(n_splits=10, random_state=1),
    )

    #################### CVExperiment ####################
    exp_0 = CVExperiment(
        model_initializer=Ridge,
        model_init_params=dict(),
        feature_engineer=FeatureEngineer([quantile_transform]),
    )

    #################### Optimization ####################
    # `opt_0` recognizes `exp_0`'s `feature_engineer` and its results as valid learning material
    # This is because `opt_0` marks the engineer step functions omitted by `exp_0` as `optional=True`
    opt_0 = DummyOptPro(iterations=10)
    opt_0.forge_experiment(
        model_initializer=Ridge,
        model_init_params=dict(),
        feature_engineer=FeatureEngineer([
            Categorical([quantile_transform, log_transform], optional=True),
            Categorical([standard_scale, standard_scale_BAD], optional=True),
            Categorical([square_sum_feature], optional=True),
        ]),
    )
    opt_0.go()
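# The engineer steps named above (`quantile_transform`, `standard_scale`, `log_transform`, etc.) are
#   ordinary functions defined elsewhere in the original example; their parameter names tell
#   HyperparameterHunter which datasets they operate on. A minimal, hypothetical sketch of one such
#   step is below - the body is illustrative, not the original helper:
from sklearn.preprocessing import StandardScaler


def standard_scale(train_inputs, non_train_inputs):
    # Fit the scaler on training inputs only, then apply it to the non-training inputs as well
    scaler = StandardScaler()
    train_inputs[train_inputs.columns] = scaler.fit_transform(train_inputs.values)
    non_train_inputs[non_train_inputs.columns] = scaler.transform(non_train_inputs.values)
    return train_inputs, non_train_inputs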
def _execute():
    env = Environment(
        train_dataset=get_breast_cancer_data(),
        root_results_path='HyperparameterHunterAssets',
        target_column='diagnosis',
        metrics_map=['roc_auc_score'],
        cross_validation_type=StratifiedKFold,
        cross_validation_params=dict(n_splits=10, shuffle=True, random_state=32),
        runs=2,
    )

    optimizer = BayesianOptimization(iterations=100, read_experiments=True, random_state=None)
    optimizer.set_experiment_guidelines(
        model_initializer=XGBClassifier,
        model_init_params=dict(
            max_depth=Integer(2, 20),
            learning_rate=Real(0.0001, 0.5),
            n_estimators=200,
            subsample=0.5,
            booster=Categorical(['gbtree', 'gblinear', 'dart']),
        ),
        model_extra_params=dict(fit=dict(eval_metric=Categorical(['auc', 'rmse', 'mae']))),
    )
    optimizer.go()
def execute():
    env = Environment(
        train_dataset=get_breast_cancer_data(),
        results_path="HyperparameterHunterAssets",
        target_column="diagnosis",
        metrics=["roc_auc_score"],
        cv_type="StratifiedKFold",
        cv_params=dict(n_splits=5, shuffle=True, random_state=32),
    )

    experiment = CVExperiment(
        model_initializer=KerasClassifier,
        model_init_params=build_fn,
        model_extra_params=dict(
            callbacks=[
                ModelCheckpoint(
                    filepath=os.path.abspath("foo_checkpoint"), save_best_only=True, verbose=1
                ),
                ReduceLROnPlateau(patience=5),
            ],
            batch_size=32,
            epochs=10,
            verbose=0,
            shuffle=True,
        ),
    )
def _execute():
    env = Environment(
        train_dataset=get_breast_cancer_data(),
        results_path="HyperparameterHunterAssets",
        target_column="diagnosis",
        metrics=["roc_auc_score"],
        cv_type=StratifiedKFold,
        cv_params=dict(n_splits=10, shuffle=True, random_state=32),
        runs=2,
    )

    optimizer = BayesianOptPro(iterations=10, read_experiments=True, random_state=None)
    optimizer.forge_experiment(
        model_initializer=XGBClassifier,
        model_init_params=dict(
            max_depth=Integer(2, 20),
            learning_rate=Real(0.0001, 0.5),
            n_estimators=200,
            subsample=0.5,
            booster=Categorical(["gbtree", "gblinear", "dart"]),
        ),
        model_extra_params=dict(fit=dict(eval_metric=Categorical(["auc", "rmse", "mae"]))),
    )
    optimizer.go()
def execute():
    env = Environment(
        train_dataset=get_imbalanced_dataset(),
        results_path="HyperparameterHunterAssets",
        target_column="target",
        metrics=["roc_auc_score", "accuracy_score"],
        cv_type="KFold",
        cv_params=dict(n_splits=5, random_state=7),
    )

    # Since this is HyperparameterHunter, after all, we'll throw in some classic hyperparameter
    #   optimization just for fun. If you're like most people and you think it's absurd to test
    #   18 different `imblearn` techniques, feel free to comment out some `EngineerStep`s below
    opt_0 = ET(iterations=20, random_state=32)
    opt_0.forge_experiment(
        model_initializer=XGBClassifier,
        model_init_params=dict(
            max_depth=Integer(2, 20),
            n_estimators=Integer(50, 900),
            learning_rate=Real(0.0001, 0.9),
            subsample=0.5,
            booster=Categorical(["gbtree", "gblinear"]),
        ),
        feature_engineer=FeatureEngineer([
            Categorical(
                [
                    EngineerStep(resample_smote_tomek, stage="intra_cv"),
                    EngineerStep(over_sample_random, stage="intra_cv"),
                    EngineerStep(over_sample_smote, stage="intra_cv"),
                    EngineerStep(under_sample_random, stage="intra_cv"),
                    EngineerStep(under_sample_cluster_centroids, stage="intra_cv"),
                    EngineerStep(under_sample_tomek_links, stage="intra_cv"),
                    #################### GROUP 2 (EXTENDED) ####################
                    EngineerStep(resample_smote_enn, stage="intra_cv"),
                    EngineerStep(over_sample_ADASYN, stage="intra_cv"),
                    EngineerStep(over_sample_BorderlineSMOTE, stage="intra_cv"),
                    EngineerStep(over_sample_SVMSMOTE, stage="intra_cv"),
                    EngineerStep(under_sample_NearMiss, stage="intra_cv"),
                    EngineerStep(under_sample_CondensedNearestNeighbour, stage="intra_cv"),
                    EngineerStep(under_sample_OneSidedSelection, stage="intra_cv"),
                    EngineerStep(under_sample_NeighbourhoodCleaningRule, stage="intra_cv"),
                    EngineerStep(under_sample_EditedNearestNeighbours, stage="intra_cv"),
                    EngineerStep(under_sample_RepeatedEditedNearestNeighbour, stage="intra_cv"),
                    EngineerStep(under_sample_AllKNN, stage="intra_cv"),
                    EngineerStep(under_sample_InstanceHardnessThreshold, stage="intra_cv"),
                ],
                optional=True,
            )
        ]),
    )
    opt_0.go()
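# None of the resampling steps above are defined in this snippet. The sketch below shows what one of
#   them might look like, assuming the usual `train_inputs`/`train_targets` parameter-name convention
#   and `imblearn`'s `fit_resample` API - the body is illustrative, not the original helper:
import pandas as pd
from imblearn.over_sampling import RandomOverSampler


def over_sample_random(train_inputs, train_targets):
    # Over-sample the minority class within the training fold only (hence `stage="intra_cv"` above)
    sampler = RandomOverSampler(random_state=32)
    inputs, targets = sampler.fit_resample(train_inputs, train_targets.values.ravel())
    # Rebuild DataFrames so later steps still see labeled columns
    train_inputs = pd.DataFrame(inputs, columns=train_inputs.columns)
    train_targets = pd.DataFrame(targets, columns=train_targets.columns)
    return train_inputs, train_targets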
def _execute():
    env = Environment(
        train_dataset=get_breast_cancer_data(),
        root_results_path='HyperparameterHunterAssets',
        target_column='diagnosis',
        metrics_map=['roc_auc_score'],
        cross_validation_type=StratifiedKFold,
        cross_validation_params=dict(n_splits=10, shuffle=True, random_state=32),
        runs=1,
    )

    optimizer = RandomForestOptimization(iterations=100, read_experiments=True)
    optimizer.set_experiment_guidelines(
        model_initializer=LGBMClassifier,
        model_init_params=dict(
            boosting_type=Categorical(['gbdt', 'dart']),
            num_leaves=Integer(5, 20),
            max_depth=-1,
            min_child_samples=5,
            subsample=0.5,
        ),
    )
    optimizer.go()
def execute():
    env = Environment(
        train_dataset=get_toy_classification_data(),
        root_results_path="HyperparameterHunterAssets",
        metrics_map=["roc_auc_score"],
        cross_validation_type=RepeatedStratifiedKFold,
        cross_validation_params=dict(n_splits=3, n_repeats=2, random_state=32),
        do_full_save=do_full_save,
    )

    experiment_0 = CVExperiment(model_initializer=XGBClassifier, model_init_params=dict(subsample=0.01))
    # Pro Tip: By setting XGBoost's `subsample` ridiculously low, we can get bad scores on purpose

    # Upon completion of this Experiment, we see a warning that not all result files will be saved
    # This is because the final score of the Experiment was below our threshold of 0.75
    # Specifically, we skipped saving prediction files (OOF, holdout, test, or in-fold), and the heartbeat file
    # What still got saved is the Experiment's key information, leaderboard position, and description file
    # These are saved so the information can be used for future hyperparameter optimization and to detect repeated Experiments
    # Additionally, the Experiment's script backup is saved, but that's because it's one of the first things that happens
    # For even finer control over what gets saved, use `do_full_save` together with `file_blacklist`

    # Now, let's perform another Experiment that does a bit better than our intentionally miserable one
    experiment_1 = CVExperiment(model_initializer=XGBClassifier, model_init_params=dict(subsample=0.5))
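# The `do_full_save` callable given to `Environment` above is not defined in this snippet. A
#   definition consistent with the 0.75 threshold described in the comments (and matching the
#   `do_full_save` used by a fixture further below) would be:
def do_full_save(experiment_result):
    # Only keep the full set of result files when the out-of-fold ROC-AUC clears 0.75
    return experiment_result["final_evaluations"]["oof"]["roc_auc_score"] > 0.75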
def env_3():
    def printer_callback():
        def printer_helper(_rep, _fold, _run, last_evaluation_results):
            print(f"{_rep}.{_fold}.{_run} {last_evaluation_results}")

        return lambda_callback(
            on_experiment_start=printer_helper,
            on_experiment_end=printer_helper,
            on_repetition_start=printer_helper,
            on_repetition_end=printer_helper,
            on_fold_start=printer_helper,
            on_fold_end=printer_helper,
            on_run_start=printer_helper,
            on_run_end=printer_helper,
        )

    return Environment(
        train_dataset=get_toy_classification_data(),
        results_path=assets_dir,
        metrics=["roc_auc_score"],
        holdout_dataset=get_toy_classification_data(),
        cv_type=RepeatedStratifiedKFold,
        cv_params=dict(n_splits=3, n_repeats=2, random_state=32),
        runs=2,
        experiment_callbacks=[
            printer_callback(),
            confusion_matrix_oof(),
            confusion_matrix_holdout(),
        ],
    )
def _execute():
    env = Environment(
        train_dataset=get_toy_classification_data(target='diagnosis'),
        root_results_path='HyperparameterHunterAssets',
        target_column='diagnosis',
        metrics_map=['roc_auc_score'],
        cross_validation_type=RepeatedStratifiedKFold,
        cross_validation_params=dict(n_splits=5, n_repeats=2, random_state=32),
    )

    optimizer = ExtraTreesOptimization(
        iterations=10,
        read_experiments=True,
        random_state=None,
    )
    optimizer.set_experiment_guidelines(
        model_initializer=RGFClassifier,
        model_init_params=dict(
            max_leaf=1000,
            algorithm=Categorical(['RGF', 'RGF_Opt', 'RGF_Sib']),
            l2=Real(0.01, 0.3),
            normalize=Categorical([True, False]),
            learning_rate=Real(0.3, 0.7),
            loss=Categorical(['LS', 'Expo', 'Log', 'Abs']),
        ),
    )
    optimizer.go()
def env_1():
    return Environment(
        train_dataset=get_breast_cancer_data(),
        environment_params_path="examples/advanced_examples/environment_params.json",
        results_path=assets_dir,
        cv_params=dict(n_splits=3, shuffle=True, random_state=32),
    )
def execute():
    """This is going to be a very simple example to illustrate what exactly HyperparameterHunter
    does, and how it revolutionizes hyperparameter optimization."""
    # Start by creating an `Environment` - This is where you define how Experiments (and optimization) will be conducted
    env = Environment(
        train_dataset=get_breast_cancer_data(target="target"),
        root_results_path="HyperparameterHunterAssets",
        metrics_map=["roc_auc_score"],
        cross_validation_type="StratifiedKFold",
        cross_validation_params=dict(n_splits=10, shuffle=True, random_state=32),
    )

    # Now, conduct an `Experiment`
    # This tells HyperparameterHunter to use the settings in the active `Environment` to train a model with these hyperparameters
    experiment = CVExperiment(
        model_initializer=XGBClassifier, model_init_params=dict(objective="reg:linear", max_depth=3)
    )

    # That's it. No annoying boilerplate code to fit models and record results
    # Now, the `Environment`'s `root_results_path` directory will contain new files describing the Experiment just conducted

    # Time for the fun part. We'll set up some hyperparameter optimization by first defining the `OptimizationProtocol` we want
    optimizer = BayesianOptimization(verbose=1)

    # Now we're going to say which hyperparameters we want to optimize
    # Notice how this looks just like our `experiment` above
    optimizer.set_experiment_guidelines(
        model_initializer=XGBClassifier,
        model_init_params=dict(
            objective="reg:linear",  # We're setting this as a constant guideline - Not one to optimize
            max_depth=Integer(2, 10),  # Instead of using an int like the `experiment` above, we provide a space to search
        ),
    )
    # Notice that our range for `max_depth` includes the `max_depth=3` value we used in our `experiment` earlier

    optimizer.go()  # Now, we go

    # Here we're verifying that the `experiment` we conducted first was found by `optimizer` and used as learning material
    # You can also see via the console that we found `experiment`'s saved files, and used them to start optimization
    assert experiment.experiment_id in [_[2] for _ in optimizer.similar_experiments]

    # Let's save the id of the experiment that was just conducted by `optimizer`
    last_experiment_id = optimizer.current_experiment.experiment_id

    optimizer.go()  # Now, we'll start up `optimizer` again...

    # And we can see that this second optimization round learned from both our first `experiment` and our first optimization round
    assert experiment.experiment_id in [_[2] for _ in optimizer.similar_experiments]
    assert last_experiment_id in [_[2] for _ in optimizer.similar_experiments]
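# For reference, the walkthrough above assumes roughly the following imports. The exact module paths
#   are approximate and may differ between HyperparameterHunter versions:
from hyperparameter_hunter import Environment, CVExperiment
from hyperparameter_hunter import BayesianOptimization, Integer
from hyperparameter_hunter.utils.learning_utils import get_breast_cancer_data
from xgboost import XGBClassifier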
def initialization_matching_env():
    return Environment(
        train_dataset=get_breast_cancer_data(target="target"),
        results_path=assets_dir,
        metrics=["roc_auc_score"],
        cv_type="KFold",
        cv_params=dict(n_splits=2, shuffle=True, random_state=32),
    )
def toy_environment_fixture():
    return Environment(
        train_dataset=pima_indians_head,
        holdout_dataset=holdout_first_row,
        metrics=["roc_auc_score"],
        target_column="class",
        cv_params=dict(n_splits=3, shuffle=True, random_state=32),
    )
def env_0():
    return Environment(
        train_dataset=get_toy_classification_data(),
        results_path=assets_dir,
        metrics=["roc_auc_score"],
        cv_type="RepeatedStratifiedKFold",
        cv_params=dict(n_splits=3, n_repeats=2, random_state=32),
    )
def env_0():
    return Environment(
        train_dataset=get_diabetes_data(target="target"),
        results_path=assets_dir,
        metrics=["mean_absolute_error"],
        cv_type="KFold",
        cv_params=dict(n_splits=2, shuffle=True, random_state=32),
    )
def env_digits():
    return Environment(
        train_dataset=get_digits_data(),
        results_path=assets_dir,
        metrics=["roc_auc_score"],
        cv_type="StratifiedKFold",
        cv_params=dict(n_splits=3, shuffle=True, random_state=32),
    )
def env_breast_cancer():
    env = Environment(
        train_dataset=get_breast_cancer_data(target="target"),
        results_path=assets_dir,
        metrics=["roc_auc_score"],
        cv_type="StratifiedKFold",
        cv_params=dict(n_splits=5, shuffle=True, random_state=32),
    )
    return env
def env_0():
    return Environment(
        train_dataset=get_breast_cancer_data(target="target"),
        root_results_path=assets_dir,
        metrics_map=["roc_auc_score"],
        cross_validation_type="StratifiedKFold",
        cross_validation_params=dict(n_splits=3, shuffle=True, random_state=32),
    )
def env_boston():
    return Environment(
        train_dataset=get_boston_data(),
        results_path=assets_dir,
        target_column="DIS",
        metrics=["r2_score"],
        cv_type="KFold",
        cv_params=dict(n_splits=3, random_state=1),
    )
def dataset_recorder_env(request):
    return Environment(
        train_dataset=small_toy_dataset,
        holdout_dataset=getattr(request, "param", None),
        metrics=["accuracy_score"],
        target_column="t",
        cv_params=dict(n_splits=4, shuffle=True, random_state=32),
        experiment_callbacks=[dataset_recorder()],
    )
def boston_env():
    return Environment(
        train_dataset=boston_head,
        holdout_dataset=holdout_last_row,
        target_column="DIS",
        metrics=["r2_score", "median_absolute_error"],
        cv_type="RepeatedKFold",
        cv_params=dict(n_repeats=2, n_splits=3, random_state=1),
        experiment_callbacks=[dataset_recorder()],
    )
def env_5(request):
    return Environment(
        train_dataset=get_breast_cancer_data(),
        results_path=assets_dir,
        target_column="diagnosis",
        metrics=["roc_auc_score"],
        cv_type=StratifiedKFold,
        cv_params=dict(n_splits=3, shuffle=True, random_state=32),
        experiment_recorders=request.param,
    )
def env_boston_regression():
    env = Environment(
        train_dataset=get_boston_data(),
        results_path=assets_dir,
        target_column="DIS",
        metrics=["median_absolute_error"],
        cv_type="KFold",
        cv_params=dict(n_splits=3, random_state=1),
    )
    return env
def env_0():
    def do_full_save(experiment_result):
        return experiment_result["final_evaluations"]["oof"]["roc_auc_score"] > 0.75

    return Environment(
        train_dataset=get_toy_classification_data(),
        results_path=assets_dir,
        metrics=["roc_auc_score"],
        cv_type=RepeatedStratifiedKFold,
        cv_params=dict(n_splits=3, n_repeats=2, random_state=32),
        do_full_save=do_full_save,
    )
def env_boston():
    return Environment(
        train_dataset=get_boston_data(),
        results_path=assets_dir,
        holdout_dataset=get_holdout_data,
        target_column="DIS",
        metrics=["r2_score", "median_absolute_error"],
        cv_type="KFold",
        cv_params=dict(n_splits=3, random_state=1),
        runs=1,
        verbose=1,
    )
def env_0(): """`Environment` fixture that has `holdout_dataset` identical to `train_dataset` and is given `experiment_callbacks` consisting of the `lambda_callback` result of :func:`sentinel_checker`""" return Environment( train_dataset=get_breast_cancer_data(target="target"), results_path=assets_dir, holdout_dataset=get_breast_cancer_data(target="target"), metrics=["roc_auc_score"], cv_type="StratifiedKFold", cv_params=dict(n_splits=2, shuffle=True, random_state=32), experiment_callbacks=[sentinel_checker()], )