Example #1
def env_0():
    """`Environment` fixture that has `holdout_dataset` identical to `train_dataset` and is given
    `experiment_callbacks` consisting of the `lambda_callback` result of :func:`sentinel_checker`"""
    return Environment(
        train_dataset=get_breast_cancer_data(target="target"),
        results_path=assets_dir,
        holdout_dataset=get_breast_cancer_data(target="target"),
        metrics=["roc_auc_score"],
        cv_type="StratifiedKFold",
        cv_params=dict(n_splits=2, shuffle=True, random_state=32),
        experiment_callbacks=[sentinel_checker()],
    )
def _execute():
    env = Environment(
        train_dataset=get_breast_cancer_data(),
        root_results_path='HyperparameterHunterAssets',
        target_column='diagnosis',
        metrics_map=['roc_auc_score'],
        cross_validation_type=StratifiedKFold,
        cross_validation_params=dict(n_splits=10, shuffle=True, random_state=32),
        runs=2,
    )

    optimizer = BayesianOptimization(iterations=100, read_experiments=True, random_state=None)

    optimizer.set_experiment_guidelines(
        model_initializer=XGBClassifier,
        model_init_params=dict(
            max_depth=Integer(2, 20),
            learning_rate=Real(0.0001, 0.5),
            n_estimators=200,
            subsample=0.5,
            booster=Categorical(['gbtree', 'gblinear', 'dart']),
        ),
        model_extra_params=dict(
            fit=dict(
                eval_metric=Categorical(['auc', 'rmse', 'mae'])
            )
        ),
    )

    optimizer.go()
def _execute():
    # To start, take a look at "examples/environment_params.json" - This is the file we're giving our Environment below
    # In this file, we can define a bunch of default Environment parameters that we don't want to always explicitly provide

    # It works really well for things that won't be changing often, like the following:
    # - `root_results_path`, which we probably never want to change, so all our results go to one place;
    # - `target_column`, which will probably be a constant for your data
    # - `metrics_map`, if you're not using any fancy metrics, and you already know what you want
    # - `file_blacklist`, if you're angry at me for adding that one result file that's always useless
    # Other parameters, whose default values you may want to change more often, can still be passed directly to `Environment` to override the file's defaults
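    # A hypothetical sketch of what "environment_params.json" might contain (these values are
    #   illustrative only, not the actual contents of the example file):
    # {
    #     "root_results_path": "HyperparameterHunterAssets",
    #     "target_column": "diagnosis",
    #     "metrics_map": ["roc_auc_score"],
    #     "cross_validation_type": "StratifiedKFold",
    #     "runs": 1,
    #     "file_blacklist": ["script_backup"]
    # }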

    env = Environment(
        train_dataset=get_breast_cancer_data(),  # If your dataset is a str path, you can even add it to environment_params
        environment_params_path="./environment_params.json",  # Use this file for parameters not explicitly given
        cross_validation_params=dict(
            n_splits=5, shuffle=True, random_state=32
        ),  # Here we decide to override our default values
    )

    print(env.root_results_path)
    print(env.target_column)
    print(env.metrics_map)
    print(env.cross_validation_type)
    print(env.runs)
    print(env.file_blacklist)  # This includes some other values too, but you can ignore them
    # All of the above are from `environment_params_path`
    print(
        env.cross_validation_params
    )  # This is the value we provided above, rather than our `environment_params_path` default

    experiment = CVExperiment(model_initializer=KNeighborsClassifier, model_init_params={})
def _execute():
    #################### Environment ####################
    env = Environment(
        train_dataset=get_breast_cancer_data(target="target"),
        root_results_path="HyperparameterHunterAssets",
        metrics_map=["roc_auc_score"],
        cross_validation_type="StratifiedKFold",
        cross_validation_params=dict(n_splits=5, shuffle=True,
                                     random_state=32),
    )

    #################### Experimentation ####################
    experiment = CVExperiment(
        model_initializer=KerasClassifier,
        model_init_params=dict(build_fn=_build_fn_experiment),
        model_extra_params=dict(callbacks=[ReduceLROnPlateau(patience=5)],
                                batch_size=32,
                                epochs=10,
                                verbose=0),
    )

    #################### Optimization ####################
    optimizer = BayesianOptimization(iterations=10)
    optimizer.set_experiment_guidelines(
        model_initializer=KerasClassifier,
        model_init_params=dict(build_fn=_build_fn_optimization),
        model_extra_params=dict(
            callbacks=[ReduceLROnPlateau(patience=Integer(5, 10))],
            batch_size=Categorical([32, 64], transform="onehot"),
            epochs=10,
            verbose=0,
        ),
    )
    optimizer.go()
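The `_build_fn_experiment` and `_build_fn_optimization` helpers referenced above are defined elsewhere in the original example. As a rough sketch (not the original definitions), a Keras `build_fn` for `KerasClassifier` might look like the following, assuming HyperparameterHunter supplies the training data's `input_shape` to it and using illustrative layer sizes:

from keras.layers import Dense, Dropout
from keras.models import Sequential


def _build_fn_experiment(input_shape):
    # Hypothetical build_fn: return a compiled Keras model for KerasClassifier to wrap
    model = Sequential([
        Dense(100, activation="relu", input_shape=input_shape),
        Dropout(0.5),
        Dense(1, activation="sigmoid"),
    ])
    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
    return model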
def execute():
    env = Environment(
        train_dataset=get_breast_cancer_data(),
        results_path="HyperparameterHunterAssets",
        target_column="diagnosis",
        metrics=["roc_auc_score"],
        cv_type="StratifiedKFold",
        cv_params=dict(n_splits=5, shuffle=True, random_state=32),
    )

    experiment = CVExperiment(
        model_initializer=KerasClassifier,
        model_init_params=build_fn,
        model_extra_params=dict(
            callbacks=[
                ModelCheckpoint(
                    filepath=os.path.abspath("foo_checkpoint"), save_best_only=True, verbose=1
                ),
                ReduceLROnPlateau(patience=5),
            ],
            batch_size=32,
            epochs=10,
            verbose=0,
            shuffle=True,
        ),
    )
Example #6
def env_1():
    return Environment(
        train_dataset=get_breast_cancer_data(),
        environment_params_path="examples/advanced_examples/environment_params.json",
        results_path=assets_dir,
        cv_params=dict(n_splits=3, shuffle=True, random_state=32),
    )
def _execute():
    env = Environment(
        train_dataset=get_breast_cancer_data(),
        results_path="HyperparameterHunterAssets",
        target_column="diagnosis",
        metrics=["roc_auc_score"],
        cv_type=StratifiedKFold,
        cv_params=dict(n_splits=10, shuffle=True, random_state=32),
        runs=2,
    )

    optimizer = BayesianOptPro(iterations=10, read_experiments=True, random_state=None)

    optimizer.forge_experiment(
        model_initializer=XGBClassifier,
        model_init_params=dict(
            max_depth=Integer(2, 20),
            learning_rate=Real(0.0001, 0.5),
            n_estimators=200,
            subsample=0.5,
            booster=Categorical(["gbtree", "gblinear", "dart"]),
        ),
        model_extra_params=dict(fit=dict(eval_metric=Categorical(["auc", "rmse", "mae"]))),
    )

    optimizer.go()
Example #8
def expected_sentinels(cv_scheme):
    """Determine expected values of :class`environment.Environment`'s `DatasetSentinel`s given
    train/validation splits created by `cv_scheme`

    Parameters
    ----------
    cv_scheme: Descendant instance of `sklearn.model_selection._split._BaseKFold`
        Cross-validation class instance to produce train/validation data splits via :meth:`split`

    Returns
    -------
    train_sentinels: List
        Tuples of (train_input, train_target) produced by `cv_scheme.split`
    validation_sentinels: List
        Tuples of (validation_input, validation_target) produced by `cv_scheme.split`
    holdout_sentinels: List
        Tuples of (holdout_input, holdout_target) repeated for each fold created by `cv_scheme`"""
    train_sentinels, validation_sentinels, holdout_sentinels = [], [], []

    data = get_breast_cancer_data(target="target")
    target_df = data[["target"]]
    input_df = data.drop(["target"], axis=1)

    # TODO: Need to account for feature engineering here - Probably need to have hardcoded test data rather than calculating splits here
    for train_i, validation_i in cv_scheme.split(input_df, target_df):
        train_sentinels.append(
            (input_df.iloc[train_i, :], target_df.iloc[train_i, :]))
        validation_sentinels.append(
            (input_df.iloc[validation_i, :], target_df.iloc[validation_i, :]))
        holdout_sentinels.append((input_df, target_df))
        # TODO: Need to account for feature engineering here - Probably need to have hardcoded test data rather than calculating splits here

    return train_sentinels, validation_sentinels, holdout_sentinels
def _execute():
    env = Environment(
        train_dataset=get_breast_cancer_data(),
        root_results_path='HyperparameterHunterAssets',
        target_column='diagnosis',
        metrics_map=['roc_auc_score'],
        cross_validation_type=StratifiedKFold,
        cross_validation_params=dict(n_splits=10,
                                     shuffle=True,
                                     random_state=32),
        runs=1,
    )

    optimizer = RandomForestOptimization(
        iterations=100,
        read_experiments=True,
    )
    optimizer.set_experiment_guidelines(
        model_initializer=LGBMClassifier,
        model_init_params=dict(boosting_type=Categorical(['gbdt', 'dart']),
                               num_leaves=Integer(5, 20),
                               max_depth=-1,
                               min_child_samples=5,
                               subsample=0.5),
    )
    optimizer.go()
Example #10
def execute():
    env = Environment(
        train_dataset=get_breast_cancer_data(),
        root_results_path='HyperparameterHunterAssets',
        target_column='diagnosis',
        metrics_map=['roc_auc_score'],
        cross_validation_type='StratifiedKFold',
        cross_validation_params=dict(n_splits=5, shuffle=True,
                                     random_state=32),
    )

    experiment = CrossValidationExperiment(
        model_initializer=KerasClassifier,
        model_init_params=build_fn,
        model_extra_params=dict(
            callbacks=[
                ModelCheckpoint(filepath=os.path.abspath('foo_checkpoint'),
                                save_best_only=True,
                                verbose=1),
                ReduceLROnPlateau(patience=5),
            ],
            batch_size=32,
            epochs=10,
            verbose=0,
            shuffle=True,
        ),
    )
def execute():
    """This is going to be a very simple example to illustrate what exactly HyperparameterHunter does, and how it revolutionizes
    hyperparameter optimization."""

    # Start by creating an `Environment` - This is where you define how Experiments (and optimization) will be conducted
    env = Environment(
        train_dataset=get_breast_cancer_data(target="target"),
        root_results_path="HyperparameterHunterAssets",
        metrics_map=["roc_auc_score"],
        cross_validation_type="StratifiedKFold",
        cross_validation_params=dict(n_splits=10,
                                     shuffle=True,
                                     random_state=32),
    )

    # Now, conduct an `Experiment`
    # This tells HyperparameterHunter to use the settings in the active `Environment` to train a model with these hyperparameters
    experiment = CVExperiment(model_initializer=XGBClassifier,
                              model_init_params=dict(objective="reg:linear",
                                                     max_depth=3))

    # That's it. No annoying boilerplate code to fit models and record results
    # Now, the `Environment`'s `root_results_path` directory will contain new files describing the Experiment just conducted

    # Time for the fun part. We'll set up some hyperparameter optimization by first defining the `OptimizationProtocol` we want
    optimizer = BayesianOptimization(verbose=1)

    # Now we're going to say which hyperparameters we want to optimize.
    # Notice how this looks just like our `experiment` above
    optimizer.set_experiment_guidelines(
        model_initializer=XGBClassifier,
        model_init_params=dict(
            objective="reg:linear",  # We're setting this as a constant guideline - Not one to optimize
            max_depth=Integer(2, 10),  # Instead of using an int like the `experiment` above, we provide a space to search
        ),
    )
    # Notice that our range for `max_depth` includes the `max_depth=3` value we used in our `experiment` earlier

    optimizer.go()  # Now, we go

    assert experiment.experiment_id in [
        _[2] for _ in optimizer.similar_experiments
    ]
    # Here we're verifying that the `experiment` we conducted first was found by `optimizer` and used as learning material
    # You can also see via the console that we found `experiment`'s saved files, and used it to start optimization

    last_experiment_id = optimizer.current_experiment.experiment_id
    # Let's save the id of the experiment that was just conducted by `optimizer`

    optimizer.go()  # Now, we'll start up `optimizer` again...

    # And we can see that this second optimization round learned from both our first `experiment` and our first optimization round
    assert experiment.experiment_id in [
        _[2] for _ in optimizer.similar_experiments
    ]
    assert last_experiment_id in [_[2] for _ in optimizer.similar_experiments]
Example #12
def initialization_matching_env():
    return Environment(
        train_dataset=get_breast_cancer_data(target="target"),
        results_path=assets_dir,
        metrics=["roc_auc_score"],
        cv_type="KFold",
        cv_params=dict(n_splits=2, shuffle=True, random_state=32),
    )
Example #13
def env_0():
    return Environment(
        train_dataset=get_breast_cancer_data(target="target"),
        root_results_path=assets_dir,
        metrics_map=["roc_auc_score"],
        cross_validation_type="StratifiedKFold",
        cross_validation_params=dict(n_splits=3, shuffle=True,
                                     random_state=32),
    )
Example #14
def env_breast_cancer():
    env = Environment(
        train_dataset=get_breast_cancer_data(target="target"),
        results_path=assets_dir,
        metrics=["roc_auc_score"],
        cv_type="StratifiedKFold",
        cv_params=dict(n_splits=5, shuffle=True, random_state=32),
    )
    return env
def env_5(request):
    return Environment(
        train_dataset=get_breast_cancer_data(),
        results_path=assets_dir,
        target_column="diagnosis",
        metrics=["roc_auc_score"],
        cv_type=StratifiedKFold,
        cv_params=dict(n_splits=3, shuffle=True, random_state=32),
        experiment_recorders=request.param,
    )
Example #16
def env_4():
    return Environment(
        train_dataset=get_breast_cancer_data(target="diagnosis"),
        results_path=assets_dir,
        target_column="diagnosis",
        metrics=dict(
            roc_auc="roc_auc_score",
            f1=f1_score,
            f1_micro=lambda y_true, y_pred: f1_score(y_true, y_pred, average="micro"),
            f1_macro=lambda y_true, y_pred: f1_score(y_true, y_pred, average="macro"),
        ),
        cv_type="KFold",
        cv_params=dict(n_splits=2, shuffle=True, random_state=42),
        verbose=1,
    )
Example #17
def execute():
    env = Environment(
        train_dataset=get_breast_cancer_data(),
        root_results_path='HyperparameterHunterAssets',
        target_column='diagnosis',
        metrics_map=['roc_auc_score', 'f1_score'],
        cross_validation_type=KFold,
        cross_validation_params=dict(n_splits=5, shuffle=True,
                                     random_state=32),
        runs=3,
    )

    experiment = CrossValidationExperiment(model_initializer=LGBMClassifier,
                                           model_init_params=dict(
                                               boosting_type='gbdt',
                                               num_leaves=31,
                                               max_depth=-1,
                                               min_child_samples=5,
                                               subsample=0.5))
Example #18
def execute():
    env = Environment(
        train_dataset=get_breast_cancer_data(target='target'),
        root_results_path='HyperparameterHunterAssets',
        metrics_map=['roc_auc_score'],
        cross_validation_type='StratifiedKFold',
        cross_validation_params=dict(n_splits=10,
                                     shuffle=True,
                                     random_state=32),
    )

    experiment = CrossValidationExperiment(
        model_initializer=XGBClassifier,
        model_init_params=dict(objective='reg:linear',
                               max_depth=3,
                               n_estimators=100,
                               subsample=0.5),
        model_extra_params=dict(
            fit=dict(
                eval_set=[
                    (env.train_input, env.train_target),
                    (env.validation_input, env.validation_target),
                ],
                early_stopping_rounds=5,
            )
        ),
    )
def test_similar_experiments_unordered():
    """Check that an experiment with a single `EngineerStep` is considered "similar" by an
    Optimization Protocol, with two `optional` `EngineerStep`s, where the second step is identical
    to the single step used by the standalone experiment. As of v3.0.0alpha2, this is expected to
    fail because the otherwise identical engineer steps occur at different indexes in
    `FeatureEngineer.steps` for the experiment and the OptPro. The experiment has `sqr_sum_feature`
    at index=0, while the same step in the OptPro is at index=1. Note that the step index in OptPro
    is still 1 despite the fact that the other step immediately preceding it is `optional`"""
    env = Environment(
        train_dataset=get_breast_cancer_data(),
        results_path=assets_dir,
        target_column="diagnosis",
        metrics=["roc_auc_score"],
        cv_type="StratifiedKFold",
        cv_params=dict(n_splits=5, shuffle=True, random_state=32),
    )

    exp = CVExperiment(
        model_initializer=XGBClassifier,
        model_init_params=dict(objective="reg:linear",
                               subsample=0.5,
                               max_depth=3),
        feature_engineer=FeatureEngineer([EngineerStep(sqr_sum_feature)]),
    )

    opt = BayesianOptPro(iterations=1)
    opt.forge_experiment(
        model_initializer=XGBClassifier,
        model_init_params=dict(objective="reg:linear",
                               subsample=0.5,
                               max_depth=3),
        feature_engineer=FeatureEngineer([
            Categorical([standard_scale, normalize, min_max_scale],
                        optional=True),
            Categorical([sqr_sum_feature], optional=True),
        ]),
    )
    opt.go()

    assert exp.experiment_id in [_[2] for _ in opt.similar_experiments]
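The step functions above (`sqr_sum_feature`, `standard_scale`, `normalize`, `min_max_scale`) are defined elsewhere in the test suite. As a rough sketch, a step like `sqr_sum_feature` could look like the following, assuming `EngineerStep` functions receive datasets according to their parameter names (the `all_inputs` alias and the feature calculation itself are illustrative):

import numpy as np


def sqr_sum_feature(all_inputs):
    # Illustrative engineer step: add a feature holding the root of each row's sum of squares
    all_inputs["sqr_sum"] = np.sqrt((all_inputs ** 2).sum(axis=1))
    return all_inputs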
Example #20
def _execute():
    env = Environment(
        train_dataset=get_breast_cancer_data(),
        root_results_path='HyperparameterHunterAssets',
        target_column='diagnosis',
        metrics_map=['roc_auc_score'],
        cross_validation_type=StratifiedKFold,
        cross_validation_params=dict(n_splits=5, shuffle=True, random_state=32),
    )

    experiment_0 = CrossValidationExperiment(model_initializer=KNeighborsClassifier, model_init_params={})
    experiment_1 = CrossValidationExperiment(model_initializer=SVC, model_init_params={})
    experiment_2 = CrossValidationExperiment(model_initializer=LinearSVC, model_init_params={})
    experiment_3 = CrossValidationExperiment(model_initializer=NuSVC, model_init_params={})
    experiment_4 = CrossValidationExperiment(model_initializer=DecisionTreeClassifier, model_init_params={})
    experiment_5 = CrossValidationExperiment(model_initializer=RandomForestClassifier, model_init_params={})
    experiment_6 = CrossValidationExperiment(model_initializer=AdaBoostClassifier, model_init_params={})
    experiment_7 = CrossValidationExperiment(model_initializer=GradientBoostingClassifier, model_init_params={})
    experiment_8 = CrossValidationExperiment(model_initializer=GaussianNB, model_init_params={})
    experiment_9 = CrossValidationExperiment(model_initializer=LinearDiscriminantAnalysis, model_init_params={})
    experiment_10 = CrossValidationExperiment(model_initializer=QuadraticDiscriminantAnalysis, model_init_params={})
    experiment_11 = CrossValidationExperiment(model_initializer=MLPClassifier, model_init_params={})
Example #21
def test_predictor_holdout_breast_cancer():
    G.priority_callbacks = (DummyExperimentPredictorHoldout, )

    #################### Set Up Environment ####################
    env = Environment(
        train_dataset=get_breast_cancer_data(),
        results_path=assets_dir,
        holdout_dataset=get_holdout,
        target_column="diagnosis",
        metrics=["roc_auc_score"],
        cv_type="StratifiedKFold",
        cv_params=dict(n_splits=5, shuffle=True, random_state=32),
    )

    #################### Perform Experiment ####################
    experiment = CVExperiment(
        model_initializer=XGBClassifier,
        model_init_params=dict(
            objective="reg:linear",
            max_depth=3,
            n_estimators=100,
            learning_rate=0.02,
            min_child_weight=6,
            gamma=0.07,
            colsample_bytree=0.31,
        ),
        model_extra_params=dict(fit=dict(
            eval_set=[
                (env.train_input, env.train_target),
                (env.validation_input, env.validation_target),
            ],
            early_stopping_rounds=5,
            eval_metric="mae",
        )),
    )

    G.priority_callbacks = tuple()
Example #22
cb_save_all = lambda_callback(on_run_end=save_model)
# Now we have our very own callback ready to go!

# There are two places we can tell HyperparameterHunter about our callback:
#   1. `Environment`, with the `experiment_callbacks` kwarg, or
#   2. `CVExperiment`, with the `callbacks` kwarg
# Using `Environment` will automatically tell all our Experiments about the callbacks, whereas
#   providing `callbacks` directly to `CVExperiment` restricts callbacks to just that Experiment

# We'll give our `callbacks` to `CVExperiment` because we want to try different callbacks later
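# For reference, a hypothetical sketch of the `save_model` function handed to `lambda_callback`
#   above -- `lambda_callback` passes values to our function by matching its parameter names to
#   Experiment attributes, so the signature just names whatever we want to receive:
# def save_model(experiment_id, model, _rep, _fold, _run):
#     ...  # Persist `model` however you like, e.g. keyed by `experiment_id` and rep/fold/run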

##################################################
# Set Up Environment
##################################################
env = Environment(
    train_dataset=get_breast_cancer_data("target"),
    results_path="HyperparameterHunterAssets",
    metrics=["roc_auc_score"],
    cv_type="StratifiedKFold",
    cv_params=dict(n_splits=10, shuffle=True, random_state=32),
    runs=1,
    # experiment_callbacks=[cb_save_all],  # Do this to give callbacks to all Experiments
)

##################################################
# Experiment
##################################################
exp_0 = CVExperiment(
    model_initializer=XGBClassifier,
    model_init_params=dict(objective="reg:linear", max_depth=3),
    callbacks=[cb_save_all],  # Give our custom callback to just this Experiment
)
Example #23
#         max_depth=Integer(low=2, high=20),
#         learning_rate=Real(0.0001, 0.5),
#         n_estimators=200,
#         subsample=0.5,
#         booster=Categorical(['gbtree', 'gblinear', 'dart']),
#     )
# )
# optimizer.go()

from hyperparameter_hunter import Environment, CVExperiment, BayesianOptPro, Integer
from hyperparameter_hunter.utils.learning_utils import get_breast_cancer_data
from xgboost import XGBClassifier

# Start by creating an `Environment` - This is where you define how Experiments (and optimization) will be conducted
env = Environment(
    train_dataset=get_breast_cancer_data(target='target'),
    results_path='HyperparameterHunterAssets',
    metrics=['roc_auc_score'],
    cv_type='StratifiedKFold',
    cv_params=dict(n_splits=10, shuffle=True, random_state=32),
)

# Now, conduct an `Experiment`
# This tells HyperparameterHunter to use the settings in the active `Environment` to train a model with these hyperparameters
experiment = CVExperiment(model_initializer=XGBClassifier,
                          model_init_params=dict(objective='reg:linear',
                                                 max_depth=3))

# That's it. No annoying boilerplate code to fit models and record results
# Now, the `Environment`'s `results_path` directory will contain new files describing the Experiment just conducted
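The imports above also pull in `BayesianOptPro` and `Integer`, even though the visible portion of this example stops after the Experiment. A sketch of how the optimization step could continue (echoing the commented-out block at the top of this example) might look like this:

# Possible continuation (a sketch): search a `max_depth` range that includes the value used
#   by `experiment` above, so its saved result can be reused as learning material
optimizer = BayesianOptPro(iterations=10)
optimizer.forge_experiment(
    model_initializer=XGBClassifier,
    model_init_params=dict(
        objective='reg:linear',  # Constant guideline - not a hyperparameter to search
        max_depth=Integer(2, 10),  # Search a range instead of the fixed value above
    ),
)
optimizer.go()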