def test_SE_retraining_fails_when_param_disabled():
        """Retrain one AutoML instance several times and check its StackedEnsembles.

        The instance comes from ``setup_and_train(False)`` (per the printed
        banner, this is the run with the option named by ``kcvp`` disabled) and
        is then retrained ``total_runs - 1`` more times. The assertions verify
        that the StackedEnsemble entries on the final leaderboard are still the
        ones produced by the first run.
        """
        print("\n=== disabling "+kcvp+" and retraining ===")
        total_runs = 4
        aml = setup_and_train(False)  # first run
        # Snapshot of the leaderboard model names BEFORE any retraining.
        first_models = get_partitioned_model_names(aml.leaderboard)
        # Raises StopIteration if the first run produced no BestOfFamily ensemble.
        first_bof = next(m for m in first_models.se if re.search(r'_BestOfFamily_', m))
        ds = import_dataset()
        # Remaining runs reuse the same AutoML object; the loop index is unused.
        for i in range(total_runs - 1):
            aml.train(y=ds.target, training_frame=ds.train)
        # Snapshot of the leaderboard model names AFTER all runs.
        models = get_partitioned_model_names(aml.leaderboard)
        first_se_all_models = [m for m in first_models.se if re.search(r'_AllModels_', m)]
        se_all_models = [m for m in models.se if re.search(r'_AllModels_', m)]
        se_best_of_family = [m for m in models.se if re.search(r'_BestOfFamily_', m)]
        lb = aml.leaderboard
        # Print the whole leaderboard to help diagnose a failure.
        print(lb.head(lb.nrows))

        # Every ensemble must be either an AllModels or a BestOfFamily one.
        assert len(models.se) == len(se_all_models) + len(se_best_of_family)
        assert len(se_all_models) == len(first_se_all_models), \
            "expecting only the {} first StackedEnsemble_AllModels, but got {}".format(len(first_se_all_models), len(se_all_models))
        assert se_all_models[0] in first_models.se, "first StackedEnsemble_AllModels got replaced by new one"
        if len(se_best_of_family) > 1:
            # Corner case: several BestOfFamily ensembles are tolerated only if
            # their leaderboard rows match the first one's.
            assert first_bof in se_best_of_family, "first StackedEnsemble_BestOfFamily disappeared after multiple runs"
            # NOTE(review): the lambda parameter `id` shadows the builtin of the same name.
            row_of = lambda id: lb[lb['model_id'] == id]
            first_bof_row = row_of(first_bof)
            # Compares columns 1..ncols-1 of each row; column 0 is skipped
            # (presumably the model_id column — TODO confirm).
            assert all(all(row[i] == first_bof_row[i] for i in range(1, lb.ncols)) for row in [row_of(se) for se in se_best_of_family]), \
                "expecting possibly 2+ similar StackedEnsemble_BestOfFamily (corner case), but managed to obtain 2 different ones!"
        else:
            assert len(se_best_of_family) == 1, "expecting only the first StackedEnsemble_BestOfFamily, but got {}".format(len(se_best_of_family))
            assert se_best_of_family[0] == first_bof, "first StackedEnsemble_Best_of_Family got replaced by new one"
def test_AUTO_stopping_metric_with_no_sorting_metric_regression():
    """Regression AutoML run with no explicit sort metric.

    Delegates the leaderboard verification to ``check_leaderboard`` and then
    checks the ``stopping_metric`` property of the base models, treating the
    models whose name contains ``XGBoost_1`` separately from the others.
    """
    print(
        "Check leaderboard with AUTO stopping metric and no sorting metric for regression"
    )
    dataset = import_dataset('regression', split=False)
    excluded = ["DeepLearning", "GLM"]
    automl = H2OAutoML(
        project_name="py_aml_lb_test_auto_stopping_metric_no_sorting_regression",
        exclude_algos=excluded,
        max_models=10,
        nfolds=2,
        stopping_rounds=1,
        stopping_tolerance=0.5,
        seed=automl_seed,
    )
    automl.train(y=dataset.target, training_frame=dataset.train)

    leaderboard_metrics = ["rmse", "mse", "mae", "rmsle", "mean_residual_deviance"]
    check_leaderboard(automl, excluded, leaderboard_metrics, "rmse")

    base_names = get_partitioned_model_names(automl.leaderboard).base
    xgb_first = [name for name in base_names if 'XGBoost_1' in name]
    remaining = [name for name in base_names if name not in xgb_first]
    # Original author's note: if stopping_rounds == 0, the actual value of
    # stopping_metric is set to None.
    check_model_property(xgb_first, 'stopping_metric', True, None)
    check_model_property(remaining, 'stopping_metric', True, "deviance")
def test_algo_parameter_can_be_applied_only_to_a_specific_algo():
    """Check that an algo-prefixed ``algo_parameters`` entry reaches only that algo.

    ``GBM__monotone_constraints`` is passed to AutoML. Every GBM model on the
    leaderboard must then report a constraint of 1 on ``AGE``, whereas every
    XGBoost model must report no monotone constraints at all.
    """
    ds = import_dataset()
    aml = H2OAutoML(
        project_name="py_specific_algo_param",
        algo_parameters=dict(GBM__monotone_constraints=dict(AGE=1)),
        max_models=6,
        seed=1)
    aml.train(y=ds.target, training_frame=ds.train)
    model_names = get_partitioned_model_names(aml.leaderboard).all
    # re.match anchors at the start, so only names beginning with GBM/XGBoost are kept.
    models_supporting_monotone_constraints = [
        n for n in model_names if re.match(r"GBM|XGBoost", n)
    ]
    assert any(m.startswith('GBM') for m in models_supporting_monotone_constraints), \
        "There should be at least one GBM model"
    for m in models_supporting_monotone_constraints:
        # Direct lookup instead of scanning every (name, value) pair of params.
        mc_value = h2o.get_model(m).params['monotone_constraints']['actual']
        if m.startswith('GBM'):
            assert isinstance(mc_value, list)
            age = next((v for v in mc_value if v['key'] == 'AGE'), None)
            assert age is not None
            assert age['value'] == 1.0
        else:
            assert mc_value is None
Esempio n. 4
0
def test_stacked_ensembles_are_trained_with_blending_frame_even_if_nfolds_eq_0():
    """Train with ``nfolds=0`` plus a blending frame and inspect the ensembles.

    More than three StackedEnsembles are expected, each one referencing the
    blending frame and reporting the ``blending`` stacking strategy.
    """
    print(
        "Check that we can disable cross-validation when passing a blending frame and that Stacked Ensembles are trained using this frame."
    )
    max_models = 5
    dataset = import_dataset()
    automl = H2OAutoML(
        project_name="py_aml_blending_frame",
        seed=1,
        max_models=max_models,
        nfolds=0,
    )
    automl.train(
        y=dataset.target,
        training_frame=dataset.train,
        blending_frame=dataset.valid,
        leaderboard_frame=dataset.test,
    )

    ensemble_names = get_partitioned_model_names(automl.leaderboard).se
    assert len(ensemble_names) > 3, \
        "In blending mode, StackedEnsemble should still be trained in spite of nfolds=0."
    for ensemble_name in ensemble_names:
        ensemble = h2o.get_model(ensemble_name)
        blending_param = ensemble.params['blending_frame']['actual']
        assert blending_param['name'] == dataset.valid.frame_id
        output = ensemble._model_json['output']
        assert output['stacking_strategy'] == 'blending'
def test_AUTO_stopping_metric_with_no_sorting_metric_binary():
    """Binary-classification AutoML run with no explicit sort metric.

    Delegates the leaderboard verification to ``check_leaderboard`` and then
    checks the ``stopping_metric`` property of the base models, treating the
    models whose name contains ``XGBoost_1`` separately from the others.
    """
    print(
        "Check leaderboard with AUTO stopping metric and no sorting metric for binary"
    )
    dataset = import_dataset('binary', split=False)
    excluded = ["DeepLearning", "GLM", "StackedEnsemble"]
    automl = H2OAutoML(
        project_name="py_aml_lb_test_auto_stopping_metric_no_sorting_binary",
        seed=automl_seed,
        max_models=10,
        nfolds=2,
        stopping_rounds=1,
        stopping_tolerance=0.5,
        exclude_algos=excluded,
    )
    automl.train(y=dataset.target, training_frame=dataset.train)

    leaderboard_metrics = [
        "auc", "logloss", "aucpr", "mean_per_class_error", "rmse", "mse",
    ]
    check_leaderboard(automl, excluded, leaderboard_metrics, "auc", True)

    base_names = get_partitioned_model_names(automl.leaderboard).base
    xgb_first = [name for name in base_names if 'XGBoost_1' in name]
    remaining = [name for name in base_names if name not in xgb_first]
    # Original author's note: if stopping_rounds == 0, the actual value of
    # stopping_metric is set to None.
    check_model_property(xgb_first, 'stopping_metric', True, None)
    check_model_property(remaining, 'stopping_metric', True, "logloss")
def test_stacked_ensembles_are_trained_after_max_models():
    """Train AutoML capped at 5 models and expect exactly two StackedEnsembles."""
    print("Check that Stacked Ensembles are still trained after max models have been trained")
    dataset = import_dataset()
    automl = H2OAutoML(
        project_name="py_aml_SE_after_max_models",
        seed=1,
        max_models=5,
    )
    automl.train(y=dataset.target, training_frame=dataset.train)

    ensemble_names = get_partitioned_model_names(automl.leaderboard).se
    ensemble_count = len(ensemble_names)
    assert ensemble_count == 2, "StackedEnsemble should still be trained after max models have been reached"
Esempio n. 7
0
def test_automl_stops_after_max_models():
    """Train AutoML with ``max_models=5`` and expect exactly that many base models."""
    print("Check that automl gets interrupted after `max_models`")
    dataset = import_dataset()
    max_models = 5
    automl = H2OAutoML(
        project_name="py_aml_max_models",
        seed=1,
        max_models=max_models,
    )
    automl.train(y=dataset.target, training_frame=dataset.train)

    base_names = get_partitioned_model_names(automl.leaderboard).base
    failure_message = "obtained {} base models when {} are expected".format(len(base_names), max_models)
    assert len(base_names) == max_models, failure_message
 def test_param_disabled():
     """Run ``setup_and_train(False)`` and expect no CV predictions anywhere.

     Checks the in-memory key listing, every base model, and the metalearner
     of every StackedEnsemble.
     """
     print("\n=== disabling "+kcvp+" ===")
     automl = setup_and_train(False)
     partition = get_partitioned_model_names(automl.leaderboard)
     memory_keys = list_keys_in_memory()
     leftover = len(memory_keys['cv_predictions'])
     assert leftover == 0, "{preds} CV predictions were not cleaned from memory".format(preds=leftover)
     for base_name in partition.base:
         assert_cv_predictions_on_model(base_name, False)
     for ensemble_name in partition.se:
         ensemble = h2o.get_model(ensemble_name)
         metalearner = h2o.get_model(ensemble.metalearner().model_id)
         assert not metalearner.cross_validation_predictions()
Esempio n. 9
0
def test_nfolds_eq_0():
    """Train with ``nfolds=0`` and check the first base model reports ``nfolds == 0``."""
    print("Check nfolds = 0 works properly")
    dataset = import_dataset()
    automl = H2OAutoML(
        project_name="py_aml_nfolds0",
        nfolds=0,
        max_models=3,
        seed=1,
    )
    automl.train(y=dataset.target, training_frame=dataset.train)
    base_names = get_partitioned_model_names(automl.leaderboard).base
    first_base = h2o.get_model(base_names[0])
    actual_nfolds = first_base.params['nfolds']['actual']
    assert actual_nfolds == 0
def test_exclude_algos_is_applied_on_top_of_modeling_plan():
    """Check that ``exclude_algos`` filters algos listed in ``modeling_plan``.

    The plan lists DRF, GLM, GBM grids and StackedEnsemble, while GBM and
    StackedEnsemble are excluded: exactly 3 base models and no ensemble are
    expected on the leaderboard.
    """
    ds = import_dataset()
    # Use a project name unique to this test: AutoML runs that share a project
    # name also share a leaderboard, which would break the exact counts below.
    aml = H2OAutoML(project_name="py_modeling_plan_exclude_algos",
                    max_models=5,
                    modeling_plan=['DRF', 'GLM', ('GBM', 'grids'), 'StackedEnsemble'],
                    exclude_algos=['GBM', 'StackedEnsemble'],
                    seed=1)
    aml.train(y=ds.target, training_frame=ds.train)
    models = get_partitioned_model_names(aml.leaderboard)
    assert len(models.base) == 3
    assert len(models.se) == 0
def test_include_algos():
    """Train with ``include_algos=['GBM']``: only GBM base models, no ensemble.

    ``max_models`` is not defined in this function; it is read from the
    enclosing scope.
    """
    print("AutoML trains only models for algos listed in include_algos")
    dataset = import_dataset()
    automl = H2OAutoML(
        project_name="py_include_algos",
        include_algos=['GBM'],
        max_models=max_models,
        seed=1,
    )
    automl.train(
        y=dataset.target,
        training_frame=dataset.train,
        validation_frame=dataset.valid,
    )
    partition = get_partitioned_model_names(automl.leaderboard)
    assert all('GBM' in base_name for base_name in partition.base)
    ensemble_count = len(partition.se)
    assert ensemble_count == 0, "No StackedEnsemble should have been trained if not explicitly included to the existing include_algos"
def test_exclude_algos():
    """Train with DRF and GLM excluded: no such base model, at least one ensemble.

    ``max_models`` is not defined in this function; it is read from the
    enclosing scope.
    """
    print("AutoML doesn't train models for algos listed in exclude_algos")
    dataset = import_dataset()
    automl = H2OAutoML(
        project_name="py_exclude_algos",
        exclude_algos=['DRF', 'GLM'],
        max_models=max_models,
        seed=1,
    )
    automl.train(
        y=dataset.target,
        training_frame=dataset.train,
        validation_frame=dataset.valid,
    )
    partition = get_partitioned_model_names(automl.leaderboard)
    # No base model name may mention either excluded algo.
    assert all(
        'DRF' not in base_name and 'GLM' not in base_name
        for base_name in partition.base
    )
    assert len(partition.se) >= 1
 def test_param_enabled():
     """Run ``setup_and_train(True)`` and expect CV predictions to be kept.

     The number of ``cv_predictions`` keys in memory must equal
     ``len(models.all) * (nfolds + 1)``, and every base model and ensemble
     metalearner is checked through ``assert_cv_predictions_on_model``.
     """
     print("\n=== enabling "+kcvp+" ===")
     automl = setup_and_train(True)
     partition = get_partitioned_model_names(automl.leaderboard)
     memory_keys = list_keys_in_memory()
     found = len(memory_keys['cv_predictions'])
     # One prediction frame per fold, +1 for holdout prediction.
     wanted = len(partition.all) * (nfolds + 1)
     assert found == wanted, "missing CV predictions in memory, got {actual}, expected {expected}".format(actual=found, expected=wanted)
     for base_name in partition.base:
         assert_cv_predictions_on_model(base_name)
     for ensemble_name in partition.se:
         ensemble = h2o.get_model(ensemble_name)
         assert_cv_predictions_on_model(ensemble.metalearner().model_id)
 def test_SE_retraining_works_when_param_enabled():
     """Retrain one AutoML instance and expect the StackedEnsembles to accumulate.

     After the initial ``setup_and_train(True)`` run and ``total_runs - 1``
     further ``train`` calls, the leaderboard must hold at least
     ``total_runs`` ensembles, all of them AllModels or BestOfFamily.
     """
     print("\n=== enabling "+kcvp+" and retraining ===")
     total_runs = 4
     automl = setup_and_train(True)  # first run
     dataset = import_dataset()
     remaining_runs = total_runs - 1
     for _ in range(remaining_runs):
         automl.train(y=dataset.target, training_frame=dataset.train)
     partition = get_partitioned_model_names(automl.leaderboard)
     all_models_ensembles = [se for se in partition.se if re.search(r'_AllModels_', se)]
     best_of_family_ensembles = [se for se in partition.se if re.search(r'_BestOfFamily_', se)]
     assert len(partition.se) == len(all_models_ensembles) + len(best_of_family_ensembles)
     assert len(best_of_family_ensembles) + len(all_models_ensembles) >= total_runs, "some StackedEnsembles are missing"
Esempio n. 15
0
def test_actual_default_input_stopping_rounds():
    """Train on the australia dataset and check ``stopping_rounds`` on base models.

    The verification itself is delegated to ``check_model_property``.
    """
    data_path = pu.locate("smalldata/extdata/australia.csv")
    training_frame = h2o.import_file(path=data_path)
    response = 'runoffnew'
    excluded = ["DeepLearning", "GLM"]
    automl = H2OAutoML(
        project_name="actual_default_input_stopping_rounds",
        exclude_algos=excluded,
        max_models=10,
        seed=automl_seed,
    )
    automl.train(y=response, training_frame=training_frame)

    base_names = get_partitioned_model_names(automl.leaderboard).base
    # Original author's note: when using cv, all cv models are trained with
    # stopping_rounds = 3 (default), but the final model resets stopping_rounds
    # to 0 and uses e.g. average ntrees, iterations...
    check_model_property(base_names, 'stopping_rounds', True, 0, 0, 3)
Esempio n. 16
0
def test_weights_column():
    """Train with a random ``weight`` column and check it is reported by a base model."""
    print("Check weights_column")
    dataset = import_dataset()
    row_count = dataset.train.nrows
    weights_column = "weight"
    # One random weight in [0, 5] per training row.
    random_weights = [uniform(0, 5) for _ in range(row_count)]
    weights_frame = h2o.H2OFrame(random_weights, column_names=[weights_column])
    weighted_train = dataset.train.concat(weights_frame)
    automl = H2OAutoML(project_name="py_aml_weights_column", max_models=3, seed=1)
    automl.train(
        y=dataset.target,
        training_frame=weighted_train,
        weights_column=weights_column,
    )
    base_names = get_partitioned_model_names(automl.leaderboard).base
    first_base = h2o.get_model(base_names[0])
    reported = first_base.params['weights_column']['actual']
    assert reported['column_name'] == weights_column
def test_exploitation_doesnt_impact_max_models():
    """Train with ``exploitation_ratio=.1`` and ``max_models=6``.

    Expects exactly 6 base models, more than 3 StackedEnsembles, and both
    exploitation step markers present in ``training_info``.
    """
    dataset = import_dataset()
    automl = H2OAutoML(
        project_name="py_exploitation_ratio_max_models",
        exploitation_ratio=.1,
        max_models=6,
        seed=1,
        verbosity='debug',
    )
    automl.train(y=dataset.target, training_frame=dataset.train)
    print(automl.leaderboard)
    partition = get_partitioned_model_names(automl.leaderboard)
    assert len(partition.base) == 6
    assert len(partition.se) > 3
    print(automl.training_info)
    for marker in ('start_GBM_lr_annealing', 'start_XGBoost_lr_search'):
        assert marker in automl.training_info
Esempio n. 18
0
def test_nfolds_default_and_fold_assignements_skipped_by_default():
    """Train with ``nfolds=3`` and check fold assignments are not kept by default.

    The first base model must report ``keep_cross_validation_fold_assignment``
    as ``False`` and expose no fold-assignment frame id in its output.
    """
    print("Check that fold assignments were skipped by default and nfolds > 1")
    ds = import_dataset()
    aml = H2OAutoML(
        project_name="py_aml_keep_cross_validation_fold_assignment_0",
        nfolds=3,
        max_models=3,
        seed=1)
    aml.train(y=ds.target, training_frame=ds.train)
    base_models = get_partitioned_model_names(aml.leaderboard).base
    amodel = h2o.get_model(base_models[0])
    # Identity checks for the singletons, consistent with the `is True`
    # checks used on model params elsewhere in this file.
    assert amodel.params['keep_cross_validation_fold_assignment'][
        'actual'] is False
    assert amodel._model_json["output"][
        "cross_validation_fold_assignment_frame_id"] is None
def test_target_encoding_regression():
    """Train a regression AutoML with ``target_encoding`` preprocessing.

    Looks for a ``TargetEncoding_AutoML``-prefixed key in memory and runs
    ``check_mojo_pojo_availability`` on every leaderboard model.
    """
    dataset = import_dataset(mode='regression')
    automl = H2OAutoML(
        project_name="automl_with_te_regression",
        max_models=5,
        preprocessing=['target_encoding'],
        seed=1,
    )
    automl.train(
        y=dataset.target,
        training_frame=dataset.train,
        leaderboard_frame=dataset.test,
    )
    leaderboard = automl.leaderboard
    print(leaderboard)
    # we can't really verify from client if TE was correctly applied... so just using a poor man's check:
    keys_in_memory = h2o.ls().key
    assert any(key.startswith("TargetEncoding_AutoML") for key in keys_in_memory)
    all_names = get_partitioned_model_names(leaderboard).all
    for model_id in all_names:
        check_mojo_pojo_availability(model_id)
 def test_param_disabled():
     """Run ``setup_and_train(False)`` and expect no CV models to be kept.

     Checks the ``kcvm`` property through ``check_model_property``, the
     in-memory key listing, every base model, and every ensemble metalearner.
     """
     print("\n=== disabling "+kcvm+" ===")
     automl = setup_and_train(False)
     partition = get_partitioned_model_names(automl.leaderboard)
     check_model_property(partition.se, kcvm, False)
     check_model_property(partition.base, kcvm, True, False, True)
     memory_keys = list_keys_in_memory()
     total_count = len(memory_keys['models_all'])
     cv_count = len(memory_keys['cv_models'])
     print("total models in memory = {tot}, among which {cv} CV models".format(tot=total_count, cv=cv_count))
     assert total_count > 0, "no models left in memory"
     assert cv_count == 0, "{cv} CV models were not cleaned from memory".format(cv=cv_count)
     for base_name in partition.base:
         base_model = h2o.get_model(base_name)
         assert not base_model.cross_validation_models(), "unexpected cv models for model "+base_name
     for ensemble_name in partition.se:
         ensemble = h2o.get_model(ensemble_name)
         metalearner = h2o.get_model(ensemble.metalearner().model_id)
         assert not metalearner.cross_validation_models(), "unexpected cv models for metalearner of model "+ensemble_name
def test_modeling_plan_using_minimal_syntax():
    """Train with a modeling plan given as bare algo names / (algo, alias) tuples.

    Expects 5 base models covering DRF, XRT, GLM and a GBM grid, plus more
    than 2 ensembles including both BestOfFamily and AllModels.
    """
    dataset = import_dataset()
    plan = ['DRF', 'GLM', ('GBM', 'grids'), 'StackedEnsemble']
    automl = H2OAutoML(
        project_name="py_modeling_plan_minimal_syntax",
        max_models=5,
        modeling_plan=plan,
        seed=1,
    )
    automl.train(y=dataset.target, training_frame=dataset.train)
    partition = get_partitioned_model_names(automl.leaderboard)
    assert len(partition.base) == 5
    assert len(partition.se) > 2
    # Each tag must appear in at least one model name of its group.
    for tag in ('DRF', 'XRT', 'GLM', 'GBM_grid'):
        assert any(tag in name for name in partition.base)
    for tag in ('BestOfFamily', 'AllModels'):
        assert any(tag in name for name in partition.se)
 def test_param_enabled():
     """Run ``setup_and_train(True)`` and expect the CV models to be kept.

     The number of ``cv_models`` keys in memory must equal
     ``len(models.all) * nfolds``, and every base model and ensemble
     metalearner must expose its cross-validation models.
     """
     print("\n=== enabling "+kcvm+" ===")
     automl = setup_and_train(True)
     partition = get_partitioned_model_names(automl.leaderboard)
     check_model_property(partition.se, kcvm, False)
     check_model_property(partition.base, kcvm, True, True, True)
     memory_keys = list_keys_in_memory()
     total_count = len(memory_keys['models_all'])
     cv_count = len(memory_keys['cv_models'])
     print("total models in memory = {tot}, among which {cv} CV models".format(tot=total_count, cv=cv_count))
     assert total_count > 0, "no models left in memory"
     wanted = len(partition.all) * nfolds
     assert cv_count == wanted, "missing CV models in memory, got {actual}, expected {expected}".format(actual=cv_count, expected=wanted)
     for base_name in partition.base:
         base_model = h2o.get_model(base_name)
         assert base_model.cross_validation_models(), "missing cv models for model "+base_name
     for ensemble_name in partition.se:
         ensemble = h2o.get_model(ensemble_name)
         metalearner = h2o.get_model(ensemble.metalearner().model_id)
         assert metalearner.cross_validation_models(), "missing cv models for metalearner of model "+ensemble_name
Esempio n. 23
0
def test_AUTO_stopping_metric_with_custom_sorting_metric_regression():
    """Regression AutoML run sorted by ``rmse``.

    Delegates the leaderboard verification to ``check_leaderboard`` and checks
    that every base model reports ``RMSE`` as its ``stopping_metric``.
    """
    print("Check leaderboard with AUTO stopping metric and rmse sorting metric")
    dataset = import_dataset('regression', split=False)
    excluded = ["DeepLearning", "GLM"]
    automl = H2OAutoML(
        project_name="py_aml_lb_test_auto_stopping_metric_custom_sorting",
        exclude_algos=excluded,
        max_models=10,
        nfolds=2,
        stopping_rounds=1,
        stopping_tolerance=0.5,
        seed=automl_seed,
        sort_metric="rmse",
    )
    automl.train(y=dataset.target, training_frame=dataset.train)

    leaderboard_metrics = ["rmse", "mean_residual_deviance", "mse", "mae", "rmsle"]
    check_leaderboard(automl, excluded, leaderboard_metrics, "rmse")
    base_names = get_partitioned_model_names(automl.leaderboard).base
    check_model_property(base_names, 'stopping_metric', True, "RMSE")
def test_balance_classes():
    """Train with class-balancing options and check the first base model echoes them."""
    print("Check balance_classes & related args work properly")
    dataset = import_dataset()
    automl = H2OAutoML(
        project_name="py_aml_balance_classes_etc",
        exclude_algos=['XGBoost'],  # XGB doesn't support balance_classes
        max_models=3,
        balance_classes=True,
        class_sampling_factors=[0.2, 1.4],
        max_after_balance_size=3.0,
        seed=1,
    )
    automl.train(y=dataset.target, training_frame=dataset.train)
    base_names = get_partitioned_model_names(automl.leaderboard).base
    first_base = h2o.get_model(base_names[0])
    model_params = first_base.params
    assert model_params['balance_classes']['actual'] is True
    assert model_params['max_after_balance_size']['actual'] == 3.0
    assert model_params['class_sampling_factors']['actual'] == [0.2, 1.4]
Esempio n. 25
0
def test_keep_cross_validation_fold_assignment_enabled_with_nfolds_eq_0():
    """Request kept fold assignments with ``nfolds=0`` and expect none to exist.

    Even though ``keep_cross_validation_fold_assignment=True`` is passed, the
    first base model must report the parameter as ``False`` and expose no
    fold-assignment frame id in its output.
    """
    print(
        "Check that fold assignments were skipped when `keep_cross_validation_fold_assignment` = True and nfolds = 0"
    )
    ds = import_dataset()
    aml = H2OAutoML(
        project_name="py_aml_keep_cross_validation_fold_assignment_2",
        nfolds=0,
        max_models=3,
        seed=1,
        keep_cross_validation_fold_assignment=True)
    aml.train(y=ds.target, training_frame=ds.train)
    base_models = get_partitioned_model_names(aml.leaderboard).base
    amodel = h2o.get_model(base_models[0])
    # Identity checks for the singletons, consistent with the `is True`
    # checks used on model params elsewhere in this file.
    assert amodel.params['keep_cross_validation_fold_assignment'][
        'actual'] is False
    assert amodel._model_json["output"][
        "cross_validation_fold_assignment_frame_id"] is None
def test_modeling_plan_using_simplified_syntax():
    """Train with a modeling plan given as (algo, steps-or-alias) tuples.

    Expects 3 base models covering DRF, XRT and a GBM grid, more than 2
    ensembles overall, and more than 2 BestOfFamily ensembles.
    """
    dataset = import_dataset()
    plan = [
        ('DRF', ['XRT', 'def_1']),
        ('GBM', 'grids'),
        ('StackedEnsemble',),
    ]
    automl = H2OAutoML(
        project_name="py_modeling_plan_simple_syntax",
        max_models=3,
        modeling_plan=plan,
        seed=1,
    )
    automl.train(y=dataset.target, training_frame=dataset.train)
    print(automl.leaderboard)
    partition = get_partitioned_model_names(automl.leaderboard)
    assert len(partition.base) == 3
    assert len(partition.se) > 2
    for tag in ('DRF', 'XRT', 'GBM_grid'):
        assert any(tag in name for name in partition.base)
    # Original author's note: we should get a BoF for group1 + one after GBM grid group.
    # NOTE(review): that note describes two BoF ensembles, yet the assertion
    # requires more than two — confirm which one is intended.
    best_of_family = [name for name in partition.se if 'BestOfFamily' in name]
    assert len(best_of_family) > 2
def test_modeling_plan_using_full_syntax():
    """Train with a modeling plan given as explicit dict step definitions.

    Expects 3 base models covering GLM, GBM and a GBM grid, and no ensemble.
    """
    dataset = import_dataset()
    plan = [
        dict(name='GLM', steps=[dict(id='def_1')]),
        dict(name='GBM', alias='grids'),
        # just testing that it is parsed correctly on backend (no model will
        # be built due to the priority group + max_models)
        dict(name='DRF', steps=[dict(id='def_1', group=5, weight=333)]),
        dict(name='GBM', steps=[dict(id="def_1")]),
    ]
    automl = H2OAutoML(
        project_name="py_modeling_plan_full_syntax",
        max_models=3,
        modeling_plan=plan,
        seed=1,
    )
    automl.train(y=dataset.target, training_frame=dataset.train)
    print(automl.leaderboard)
    partition = get_partitioned_model_names(automl.leaderboard)
    assert len(partition.base) == 3
    assert len(partition.se) == 0
    for tag in ('GLM', 'GBM', 'GBM_grid'):
        assert any(tag in name for name in partition.base)
Esempio n. 28
0
def test_fold_column():
    """Train with an explicit 3-value fold column and check it is propagated.

    The first base model and the metalearner of the first ensemble must both
    report the fold column, and the metalearner must expose 3 CV models.
    """
    print("Check fold_column param")
    dataset = import_dataset()
    fold_column = "fold_id"
    row_count = dataset.train.nrows
    # Fold ids cycle 0, 1, 2, 0, 1, 2, ... over the training rows.
    fold_ids = list(islice(cycle(range(3)), 0, row_count))
    folds_frame = h2o.H2OFrame(fold_ids, column_names=[fold_column])
    train_with_folds = dataset.train.concat(folds_frame)
    automl = H2OAutoML(
        project_name="py_aml_fold_column",
        max_models=3,
        seed=1,
        keep_cross_validation_models=True,
    )
    automl.train(
        y=dataset.target,
        training_frame=train_with_folds,
        fold_column=fold_column,
    )
    partition = get_partitioned_model_names(automl.leaderboard)
    first_base = h2o.get_model(partition.base[0])
    assert first_base.params['fold_column']['actual']['column_name'] == fold_column
    first_ensemble = h2o.get_model(partition.se[0])
    metalearner_name = first_ensemble.metalearner()['name']
    metalearner = h2o.get_model(metalearner_name)
    metalearner_fold = metalearner.params['fold_column']['actual']
    assert metalearner_fold['column_name'] == fold_column
    assert len(metalearner.cross_validation_models()) == 3
def test_AUTO_stopping_metric_with_auc_sorting_metric():
    """Binary-classification AutoML run sorted by ``auc``.

    Delegates the leaderboard verification to ``check_leaderboard`` and checks
    that every base model reports ``logloss`` as its ``stopping_metric``.
    """
    print("Check leaderboard with AUTO stopping metric and auc sorting metric")
    dataset = import_dataset('binary', split=False)
    excluded = ["DeepLearning", "GLM", "StackedEnsemble"]
    automl = H2OAutoML(
        project_name="py_aml_lb_test_auto_stopping_metric_auc_sorting",
        seed=automl_seed,
        max_models=10,
        nfolds=2,
        stopping_rounds=1,
        stopping_tolerance=0.5,
        exclude_algos=excluded,
        sort_metric='auc',
    )
    automl.train(y=dataset.target, training_frame=dataset.train)

    leaderboard_metrics = [
        "auc", "logloss", "aucpr", "mean_per_class_error", "rmse", "mse",
    ]
    check_leaderboard(automl, excluded, leaderboard_metrics, "auc", True)
    base_names = get_partitioned_model_names(automl.leaderboard).base
    check_model_property(base_names, 'stopping_metric', True, "logloss")