def execute():
    """Search XGBoost hyperparameters together with a choice of 18 `imblearn`
    resampling techniques on an imbalanced dataset.

    The `Environment` registers itself globally on construction, so the binding
    below exists only for readability.
    """
    environment = Environment(
        train_dataset=get_imbalanced_dataset(),
        results_path="HyperparameterHunterAssets",
        target_column="target",
        metrics=["roc_auc_score", "accuracy_score"],
        cv_type="KFold",
        cv_params=dict(n_splits=5, random_state=7),
    )

    # Since this is HyperparameterHunter, after all, we'll throw in some classic hyperparameter
    # optimization just for fun. If you're like most people and you think it's absurd to test
    # 18 different `imblearn` techniques, feel free to drop entries from `resamplers` below
    resamplers = [
        resample_smote_tomek,
        over_sample_random,
        over_sample_smote,
        under_sample_random,
        under_sample_cluster_centroids,
        under_sample_tomek_links,
        #################### GROUP 2 (EXTENDED) ####################
        resample_smote_enn,
        over_sample_ADASYN,
        over_sample_BorderlineSMOTE,
        over_sample_SVMSMOTE,
        under_sample_NearMiss,
        under_sample_CondensedNearestNeighbour,
        under_sample_OneSidedSelection,
        under_sample_NeighbourhoodCleaningRule,
        under_sample_EditedNearestNeighbours,
        under_sample_RepeatedEditedNearestNeighbour,
        under_sample_AllKNN,
        under_sample_InstanceHardnessThreshold,
    ]
    # One `optional` Categorical dimension: pick exactly one resampler, or none at all
    resample_choice = Categorical(
        [EngineerStep(fn, stage="intra_cv") for fn in resamplers],
        optional=True,
    )

    optimizer = ET(iterations=20, random_state=32)
    optimizer.forge_experiment(
        model_initializer=XGBClassifier,
        model_init_params=dict(
            max_depth=Integer(2, 20),
            n_estimators=Integer(50, 900),
            learning_rate=Real(0.0001, 0.9),
            subsample=0.5,
            booster=Categorical(["gbtree", "gblinear"]),
        ),
        feature_engineer=FeatureEngineer([resample_choice]),
    )
    optimizer.go()
def _execute():
    """Run a Bayesian search over XGBoost hyperparameters on the breast cancer
    dataset, using the legacy (pre-3.0) HyperparameterHunter API names."""
    environment = Environment(
        train_dataset=get_breast_cancer_data(),
        root_results_path='HyperparameterHunterAssets',
        target_column='diagnosis',
        metrics_map=['roc_auc_score'],
        cross_validation_type=StratifiedKFold,
        cross_validation_params=dict(n_splits=10, shuffle=True, random_state=32),
        runs=2,
    )

    # Mixed space: two continuous-ish dimensions, two fixed values, one categorical
    search_space = dict(
        max_depth=Integer(2, 20),
        learning_rate=Real(0.0001, 0.5),
        n_estimators=200,
        subsample=0.5,
        booster=Categorical(['gbtree', 'gblinear', 'dart']),
    )
    opt = BayesianOptimization(iterations=100, read_experiments=True, random_state=None)
    opt.set_experiment_guidelines(
        model_initializer=XGBClassifier,
        model_init_params=search_space,
        model_extra_params=dict(fit=dict(eval_metric=Categorical(['auc', 'rmse', 'mae']))),
    )
    opt.go()
def _execute():
    """Optimize an RGFClassifier via ExtraTrees-based optimization on toy
    classification data (legacy pre-3.0 HyperparameterHunter API)."""
    environment = Environment(
        train_dataset=get_toy_classification_data(target='diagnosis'),
        root_results_path='HyperparameterHunterAssets',
        target_column='diagnosis',
        metrics_map=['roc_auc_score'],
        cross_validation_type=RepeatedStratifiedKFold,
        cross_validation_params=dict(n_splits=5, n_repeats=2, random_state=32),
    )

    opt = ExtraTreesOptimization(
        iterations=10,
        read_experiments=True,
        random_state=None,
    )
    opt.set_experiment_guidelines(
        model_initializer=RGFClassifier,
        model_init_params=dict(
            max_leaf=1000,
            algorithm=Categorical(['RGF', 'RGF_Opt', 'RGF_Sib']),
            l2=Real(0.01, 0.3),
            normalize=Categorical([True, False]),
            learning_rate=Real(0.3, 0.7),
            loss=Categorical(['LS', 'Expo', 'Log', 'Abs']),
        ),
    )
    opt.go()
def execute():
    """Run a baseline Ridge experiment, then an optimization whose all-`optional`
    feature-engineering space recognizes the baseline's results as learning material."""
    #################### Environment ####################
    # NOTE(review): `holdout_dataset` is passed as a callable, not called —
    # presumably Environment invokes it to carve out the holdout set; confirm
    environment = Environment(
        train_dataset=get_boston_data(),
        results_path="HyperparameterHunterAssets",
        holdout_dataset=get_holdout_data,
        target_column="DIS",
        metrics=["r2_score", "median_absolute_error"],
        cv_type="KFold",
        cv_params=dict(n_splits=10, random_state=1),
    )

    #################### CVExperiment ####################
    baseline = CVExperiment(
        model_initializer=Ridge,
        model_init_params=dict(),
        feature_engineer=FeatureEngineer([quantile_transform]),
    )

    #################### Optimization ####################
    # The OptPro recognizes the baseline's `feature_engineer` and its results as valid
    # learning material, because every engineer step the baseline omitted is `optional=True`
    opt = DummyOptPro(iterations=10)
    opt.forge_experiment(
        model_initializer=Ridge,
        model_init_params=dict(),
        feature_engineer=FeatureEngineer([
            Categorical([quantile_transform, log_transform], optional=True),
            Categorical([standard_scale, standard_scale_BAD], optional=True),
            Categorical([square_sum_feature], optional=True),
        ]),
    )
    opt.go()
def _execute():
    """Bayesian search over XGBoost hyperparameters on breast cancer data,
    using the v3 HyperparameterHunter API (`forge_experiment`)."""
    environment = Environment(
        train_dataset=get_breast_cancer_data(),
        results_path="HyperparameterHunterAssets",
        target_column="diagnosis",
        metrics=["roc_auc_score"],
        cv_type=StratifiedKFold,
        cv_params=dict(n_splits=10, shuffle=True, random_state=32),
        runs=2,
    )

    opt = BayesianOptPro(iterations=10, read_experiments=True, random_state=None)
    opt.forge_experiment(
        model_initializer=XGBClassifier,
        model_init_params=dict(
            max_depth=Integer(2, 20),
            learning_rate=Real(0.0001, 0.5),
            n_estimators=200,
            subsample=0.5,
            booster=Categorical(["gbtree", "gblinear", "dart"]),
        ),
        model_extra_params=dict(
            fit=dict(eval_metric=Categorical(["auc", "rmse", "mae"])),
        ),
    )
    opt.go()
def _build_penta_cat_int(input_shape):
    """Build a Keras binary classifier whose hidden width, initializers,
    activations, and optimizer are all left as search dimensions
    (five `Categorical`s plus one `Integer`)."""
    initializer_names = ["lecun_uniform", "lecun_normal", "glorot_normal"]
    activation_names = ["elu", "selu", "softsign", "relu", "tanh", "sigmoid"]

    # Separate `Categorical` instances per layer so each remains its own dimension
    model = Sequential([
        Dense(
            Integer(50, 100),
            kernel_initializer=Categorical(initializer_names),
            input_shape=input_shape,
            activation=Categorical(activation_names),
        ),
        Dropout(0.5),
        Dense(
            1,
            kernel_initializer=Categorical(initializer_names),
            activation=Categorical(activation_names),
        ),
    ])
    model.compile(
        optimizer=Categorical(
            ["sgd", "rmsprop", "adagrad", "adadelta", "adam", "adamax", "nadam"]
        ),
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )
    return model
def test_reg_engineer_categorical(env_boston_regression, hh_assets, opt_pro):
    """Demonstrate that `BayesianOptPro` breaks with multiple `Categorical`s when
    `FeatureEngineer` is included in the dimensions"""
    optimizer = opt_pro(iterations=3, random_state=32, n_initial_points=1)
    optimizer.forge_experiment(
        model_initializer=AdaBoostRegressor,
        model_init_params=dict(loss=Categorical(["linear", "square", "exponential"])),
        feature_engineer=FeatureEngineer(
            [Categorical([standard_scale, min_max_scale, normalize])]
        ),
    )
    optimizer.go()
def build_fn_digits_opt(input_shape=-1):
    """Build a small Keras CNN over 8x8 digit images, with kernel and pooling
    sizes exposed as `Categorical` search dimensions.

    `input_shape` is accepted for interface compatibility; the reshape below
    fixes the expected flat input length at 64.
    """
    layers = [
        Reshape((8, 8, -1), input_shape=(64,)),
        Conv2D(32, kernel_size=Categorical([(3, 3), (5, 5)]), activation="relu"),
        MaxPooling2D(pool_size=Categorical([(2, 2), (3, 3)])),
        Dropout(0.5),
        Flatten(),
        Dense(1, activation="sigmoid"),
    ]
    model = Sequential(layers)
    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
    return model
def test_reg_engineer_categorical_integer_ok(env_boston_regression, hh_assets, opt_pro):
    """Identical to `test_reg_engineer_categorical`, except `Integer` added to
    demonstrate that all `OptPro`s can optimize with `FeatureEngineer` if space
    is not exclusively `Categorical`"""
    optimizer = opt_pro(iterations=3, random_state=32, n_initial_points=1)
    optimizer.forge_experiment(
        model_initializer=AdaBoostRegressor,
        model_init_params=dict(
            loss=Categorical(["linear", "square", "exponential"]),
            n_estimators=Integer(10, 40),
        ),
        feature_engineer=FeatureEngineer(
            [Categorical([standard_scale, min_max_scale, normalize])]
        ),
    )
    optimizer.go()
def _build_tri_cat_real(input_shape):
    """Build a Keras binary classifier with three `Categorical` dimensions
    (two activations, the optimizer) and one `Real` (dropout rate)."""
    model = Sequential([
        Dense(
            90,
            input_shape=input_shape,
            activation=Categorical(["elu", "selu", "relu"]),
        ),
        Dropout(Real(0.2, 0.7)),
        Dense(
            1,
            activation=Categorical(["selu", "softsign", "relu", "tanh", "sigmoid"]),
        ),
    ])
    model.compile(
        optimizer=Categorical(["adam", "rmsprop"]),
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )
    return model
def _build_fn_regressor(input_shape):
    """Build a Keras regressor whose second layer width, dropout rate, output
    activation, and optimizer are search dimensions."""
    layers = [
        Dense(100, activation="relu", input_shape=input_shape),
        Dense(Integer(40, 60), activation="relu", kernel_initializer="glorot_normal"),
        Dropout(Real(0.2, 0.7)),
        Dense(1, activation=Categorical(["relu", "sigmoid"]), kernel_initializer="orthogonal"),
    ]
    model = Sequential(layers)
    model.compile(
        optimizer=Categorical(["adam", "rmsprop"]),
        loss="mean_absolute_error",
        metrics=["mean_absolute_error"],
    )
    return model
def _execute():
    """Optimize a CatBoostClassifier with gradient-boosted-tree-based optimization
    on toy classification data (legacy pre-3.0 HyperparameterHunter API)."""
    environment = Environment(
        train_dataset=get_toy_classification_data(),
        root_results_path='HyperparameterHunterAssets',
        metrics_map=['roc_auc_score'],
        cross_validation_type='StratifiedKFold',
        cross_validation_params=dict(n_splits=5, shuffle=True, random_state=32),
        runs=1,
    )

    opt = GradientBoostedRegressionTreeOptimization(
        iterations=10,
        read_experiments=True,
        random_state=None,
    )
    opt.set_experiment_guidelines(
        model_initializer=CatBoostClassifier,
        model_init_params=dict(
            iterations=100,
            # `transform='onehot'` because these metric names have no ordinal relationship
            eval_metric=Categorical(['Logloss', 'Accuracy', 'AUC'], transform='onehot'),
            learning_rate=Real(low=0.0001, high=0.5),
            depth=Integer(4, 7),
            save_snapshot=False,
        ),
    )
    opt.go()
    print('')
def _execute():
    """Run a fixed Keras experiment, then a Bayesian optimization over a Keras
    build function (legacy pre-3.0 HyperparameterHunter API)."""
    #################### Environment ####################
    environment = Environment(
        train_dataset=get_breast_cancer_data(target="target"),
        root_results_path="HyperparameterHunterAssets",
        metrics_map=["roc_auc_score"],
        cross_validation_type="StratifiedKFold",
        cross_validation_params=dict(n_splits=5, shuffle=True, random_state=32),
    )

    #################### Experimentation ####################
    baseline = CVExperiment(
        model_initializer=KerasClassifier,
        model_init_params=dict(build_fn=_build_fn_experiment),
        model_extra_params=dict(
            callbacks=[ReduceLROnPlateau(patience=5)],
            batch_size=32,
            epochs=10,
            verbose=0,
        ),
    )

    #################### Optimization ####################
    opt = BayesianOptimization(iterations=10)
    opt.set_experiment_guidelines(
        model_initializer=KerasClassifier,
        model_init_params=dict(build_fn=_build_fn_optimization),
        model_extra_params=dict(
            callbacks=[ReduceLROnPlateau(patience=Integer(5, 10))],
            batch_size=Categorical([32, 64], transform="onehot"),
            epochs=10,
            verbose=0,
        ),
    )
    opt.go()
def _execute():
    """Optimize an LGBMClassifier via random-forest-based optimization on
    breast cancer data (legacy pre-3.0 HyperparameterHunter API)."""
    environment = Environment(
        train_dataset=get_breast_cancer_data(),
        root_results_path='HyperparameterHunterAssets',
        target_column='diagnosis',
        metrics_map=['roc_auc_score'],
        cross_validation_type=StratifiedKFold,
        cross_validation_params=dict(n_splits=10, shuffle=True, random_state=32),
        runs=1,
    )

    opt = RandomForestOptimization(
        iterations=100,
        read_experiments=True,
    )
    opt.set_experiment_guidelines(
        model_initializer=LGBMClassifier,
        model_init_params=dict(
            boosting_type=Categorical(['gbdt', 'dart']),
            num_leaves=Integer(5, 20),
            max_depth=-1,
            min_child_samples=5,
            subsample=0.5,
        ),
    )
    opt.go()
def hyperparameter_space(self, param_space=None):
    """Return the LightGBM search space to optimize over.

    A truthy `param_space` overrides the default space; a falsy one (None or an
    empty dict) falls back to the default defined here.
    """
    default_space = dict(
        is_unbalance=True,
        learning_rate=Real(0.01, 0.3),
        num_boost_round=Categorical(np.arange(50, 500, 20)),
        num_leaves=Categorical(np.arange(31, 256, 4)),
        min_child_weight=Real(0.1, 10),
        colsample_bytree=Real(0.5, 1.),
        subsample=Real(0.5, 1.),
        reg_alpha=Real(0.01, 0.1),
        reg_lambda=Real(0.01, 0.1),
    )
    return param_space if param_space else default_space
class ChoiceMMNormalizeSS:
    """Namespaced `Categorical` dimensions over the min-max/normalize/standard
    scaling trio, in both raw-function and `EngineerStep` form, each with a
    required and an `optional` variant."""

    # Required choice between raw engineer-step functions
    functions = Categorical([min_max_scale, normalize, standard_scale])
    # Required choice between explicit `EngineerStep` wrappers
    engineers = Categorical(
        [
            EngineerStep(min_max_scale),
            EngineerStep(normalize),
            EngineerStep(standard_scale),
        ]
    )
    # Optional variants: the optimizer may also pick "no step at all"
    o_functions = Categorical([min_max_scale, normalize, standard_scale], optional=True)
    o_engineers = Categorical(
        [
            EngineerStep(min_max_scale),
            EngineerStep(normalize),
            EngineerStep(standard_scale),
        ],
        optional=True,
    )
def _build_fn_optimization(input_shape):
    """Build a Keras binary classifier with searchable hidden width, dropout
    rate, output activation, and optimizer."""
    model = Sequential([
        Dense(
            Integer(50, 150),
            kernel_initializer='uniform',
            input_shape=input_shape,
            activation='relu',
        ),
        Dropout(Real(0.2, 0.7)),
        Dense(1, kernel_initializer='uniform', activation=Categorical(['sigmoid', 'relu'])),
    ])
    model.compile(
        optimizer=Categorical(['adam', 'rmsprop']),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return model
def _build_fn_optimization(input_shape):
    """Build a Keras binary classifier with searchable hidden width, dropout
    rate, output activation, and optimizer."""
    layers = [
        Dense(
            Integer(50, 150),
            kernel_initializer="uniform",
            input_shape=input_shape,
            activation="relu",
        ),
        Dropout(Real(0.2, 0.7)),
        Dense(1, kernel_initializer="uniform", activation=Categorical(["sigmoid", "relu"])),
    ]
    model = Sequential(layers)
    model.compile(
        optimizer=Categorical(["adam", "rmsprop"]),
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )
    return model
def _build_fn_categorical_4(input_shape):
    """Build a Keras model whose output-layer initializer is a `Categorical`
    mixing a string name with an initializer instance:
    `Categorical(["glorot_normal", Orthogonal(gain=1)])`."""
    model = Sequential(
        [
            Dense(Integer(50, 100), input_shape=input_shape),
            Dense(1, kernel_initializer=Categorical(["glorot_normal", Orthogonal(gain=1)])),
        ]
    )
    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
    return model
def test_optional_step_matching_by_exp(env_boston, es_0, es_1, es_2):
    """Test that the result of an Experiment is correctly matched by an OptPro
    with all-`optional` `EngineerStep` dimensions"""
    # Drop the `None` placeholders so only the steps actually in play remain
    steps = [step for step in (es_0, es_1, es_2) if step is not None]
    experiment = CVExperiment(XGBRegressor, feature_engineer=steps)

    opt = ExtraTreesOptPro(iterations=1, random_state=32)
    opt.forge_experiment(
        XGBRegressor,
        feature_engineer=[
            Categorical([es_a], optional=True),
            Categorical([es_b, es_c], optional=True),
            Categorical([es_d, es_e], optional=True),
        ],
    )
    opt.get_ready()

    # Assert the OptPro matched with the standalone experiment
    assert len(opt.similar_experiments) == 1
def test_reg_categorical_ok(env_boston_regression, hh_assets, opt_pro):
    """Demonstrate that all `OptPro`s are fine with exclusively-`Categorical`
    space that doesn't include `FeatureEngineer`"""
    optimizer = opt_pro(iterations=3, random_state=32, n_initial_points=1)
    optimizer.forge_experiment(
        model_initializer=AdaBoostRegressor,
        model_init_params=dict(loss=Categorical(["linear", "square", "exponential"])),
    )
    optimizer.go()
def test_reg_categorical_integer_ok(env_boston_regression, hh_assets, opt_pro):
    """Identical to `test_reg_categorical_ok`, except `Integer` added to show cooperation"""
    optimizer = opt_pro(iterations=3, random_state=32, n_initial_points=1)
    optimizer.forge_experiment(
        model_initializer=AdaBoostRegressor,
        model_init_params=dict(
            loss=Categorical(["linear", "square", "exponential"]),
            n_estimators=Integer(10, 40),
        ),
    )
    optimizer.go()
def opt_pro(optimization_protocol):
    """Build, run, and return an OptPro of the given class over an XGBRegressor
    space that includes an `optional` no-op feature-engineering step."""
    optimizer = optimization_protocol(iterations=3, random_state=32, n_initial_points=1)
    optimizer.forge_experiment(
        model_initializer=XGBRegressor,
        model_init_params=dict(
            max_depth=Integer(2, 10),
            n_estimators=Integer(50, 300),
            learning_rate=Real(0.1, 0.9),
            subsample=0.5,
            booster=Categorical(["gbtree", "gblinear"]),
        ),
        model_extra_params=dict(fit=dict(eval_metric=Categorical(["rmse", "mae"]))),
        feature_engineer=FeatureEngineer([Categorical([nothing_transform], optional=True)]),
    )
    optimizer.go()
    return optimizer
def test_reg_engineer(env_boston_regression, hh_assets, opt_pro):
    """Demonstrate problem with `BayesianOptPro` specifically - same
    configuration is fine with all other `OptPro`s"""
    optimizer = opt_pro(iterations=3, random_state=32, n_initial_points=1)
    optimizer.forge_experiment(
        model_initializer=AdaBoostRegressor,
        model_init_params=dict(),
        feature_engineer=FeatureEngineer(
            [Categorical([standard_scale, min_max_scale, normalize])]
        ),
    )
    optimizer.go()
def test_reg_engineer_integer_ok(env_boston_regression, hh_assets, opt_pro):
    """Identical to `test_reg_engineer`, except `Integer` dimension added to show
    that everything is fine now. Problem limited to not only `BayesianOptPro`,
    but also exclusively `Categorical` search spaces"""
    optimizer = opt_pro(iterations=3, random_state=32, n_initial_points=1)
    optimizer.forge_experiment(
        model_initializer=AdaBoostRegressor,
        model_init_params=dict(n_estimators=Integer(10, 40)),
        feature_engineer=FeatureEngineer(
            [Categorical([standard_scale, min_max_scale, normalize])]
        ),
    )
    optimizer.go()
def test_similar_experiments_unordered():
    """Check that an experiment with a single `EngineerStep` is considered
    "similar" by an Optimization Protocol, with two `optional` `EngineerStep`s,
    where the second step is identical to the single step used by the standalone
    experiment.

    As of v3.0.0alpha2, this is expected to fail because the otherwise identical
    engineer steps occur at different indexes in `FeatureEngineer.steps` for the
    experiment and the OptPro. The experiment has `sqr_sum_feature` at index=0,
    while the same step in the OptPro is at index=1. Note that the step index in
    OptPro is still 1 despite the fact that the other step immediately preceding
    it is `optional`."""
    environment = Environment(
        train_dataset=get_breast_cancer_data(),
        results_path=assets_dir,
        target_column="diagnosis",
        metrics=["roc_auc_score"],
        cv_type="StratifiedKFold",
        cv_params=dict(n_splits=5, shuffle=True, random_state=32),
    )

    shared_model_params = dict(objective="reg:linear", subsample=0.5, max_depth=3)

    experiment = CVExperiment(
        model_initializer=XGBClassifier,
        model_init_params=dict(objective="reg:linear", subsample=0.5, max_depth=3),
        feature_engineer=FeatureEngineer([EngineerStep(sqr_sum_feature)]),
    )

    optimizer = BayesianOptPro(iterations=1)
    optimizer.forge_experiment(
        model_initializer=XGBClassifier,
        model_init_params=dict(objective="reg:linear", subsample=0.5, max_depth=3),
        feature_engineer=FeatureEngineer([
            Categorical([standard_scale, normalize, min_max_scale], optional=True),
            Categorical([sqr_sum_feature], optional=True),
        ]),
    )
    optimizer.go()

    # `similar_experiments` entries carry the experiment ID at index 2
    assert experiment.experiment_id in [entry[2] for entry in optimizer.similar_experiments]
def opt_dtc_0():
    """Fixture-style generator: run a short ExtraTrees optimization over a
    DecisionTreeClassifier space, then yield the finished optimizer."""
    opt = ExtraTreesOptimization(iterations=2, random_state=1337)
    opt.set_experiment_guidelines(
        model_initializer=DecisionTreeClassifier,
        model_init_params=dict(
            criterion="gini",
            min_samples_split=Integer(2, 5),
            splitter=Categorical(["best", "random"]),
            min_weight_fraction_leaf=Real(0.0, 0.1),
        ),
    )
    opt.go()
    yield opt
def opt_keras_0():
    """Run a two-iteration dummy search over a Keras classifier, varying callback
    patience and batch size."""
    opt = DummySearch(iterations=2)
    opt.set_experiment_guidelines(
        model_initializer=KerasClassifier,
        model_init_params=dict(build_fn=_build_fn_optimization),
        model_extra_params=dict(
            callbacks=[ReduceLROnPlateau(patience=Integer(5, 10))],
            batch_size=Categorical([32, 64], transform="onehot"),
            epochs=10,
            verbose=0,
        ),
    )
    opt.go()
def do_optimization():
    """Run a five-iteration Bayesian optimization over XGBoost hyperparameters."""
    opt = BayesianOptPro(iterations=5, random_state=1337)
    opt.forge_experiment(
        model_initializer=XGBClassifier,
        model_init_params=dict(
            objective="reg:linear",
            max_depth=Integer(2, 20),
            learning_rate=Real(0.0001, 0.5),
            subsample=0.5,
            booster=Categorical(["gbtree", "dart"]),
        ),
    )
    opt.go()
def opt_regressor():
    """Run a single-iteration dummy optimization over a Keras regressor, varying
    callback patience and batch size."""
    opt = DummyOptPro(iterations=1)
    opt.forge_experiment(
        model_initializer=KerasRegressor,
        model_init_params=_build_fn_regressor,
        model_extra_params=dict(
            callbacks=[ReduceLROnPlateau(patience=Integer(5, 10))],
            batch_size=Categorical([32, 64], transform="onehot"),
            epochs=10,
            verbose=0,
        ),
    )
    opt.go()