# Shared imports assumed by all examples in this listing. Helper utilities such
# as import_dataset, get_partitioned_model_names, check_leaderboard,
# check_model_property, check_ignore_cols_automl, list_keys_in_memory and the
# related assertions are assumed to come from the H2O AutoML test-support modules.
import re

import h2o
from h2o.automl import H2OAutoML, get_automl, get_leaderboard
from h2o.exceptions import H2OValueError
from h2o.model import ModelBase


def test_automl_creates_interpretable_SE_with_only_monotonic_models():
    ds = import_dataset()
    aml_mono = H2OAutoML(
        project_name="test_automl_creates_interpretable_se",
        max_models=5,
        include_algos=["GBM", "GLM", "XGBoost", "StackedEnsemble"],
        monotone_constraints=dict(AGE=1,
                                  DPROS=1,
                                  DCAPS=1,
                                  PSA=1,
                                  VOL=1,
                                  GLEASON=1),
        seed=1234)
    aml_mono.train(y=ds.target, training_frame=ds.train)

    model_ids = aml_mono.leaderboard.as_data_frame()["model_id"]

    assert model_ids.apply(lambda model_name: "Monotonic" in model_name).any()

    se_names = model_ids[model_ids.apply(lambda model_name: "Monotonic" in model_name)]
    se_mono = h2o.get_model(se_names.iloc[0])

    assert model_ids.apply(lambda model_name: 'GLM' in model_name).any()
    assert all('GBM' in bm or 'XGBoost' in bm for bm in se_mono.base_models)
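
# Every example in this listing relies on an import_dataset helper from the
# shared test utilities. The sketch below shows roughly what these tests assume
# it returns; the dataset URL, target choices and split ratios are illustrative
# assumptions, not the real utility.
from collections import namedtuple

Dataset = namedtuple("Dataset", ["train", "valid", "test", "target"])


def import_dataset(problem_type='binary', split=True):
    # prostate is a small public H2O test dataset; CAPSULE is a binary column
    df = h2o.import_file("https://h2o-public-test-data.s3.amazonaws.com/smalldata/prostate/prostate.csv")
    target = "AGE" if problem_type == 'regression' else "CAPSULE"
    if problem_type == 'binary':
        df[target] = df[target].asfactor()
    if split:
        train, valid, test = df.split_frame(ratios=[.7, .15], seed=1)
    else:
        train = valid = test = df
    return Dataset(train=train, valid=valid, test=test, target=target)
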
def test_automl_creates_interpretable_SE_iff_monotonic_models_exist():
    ds = import_dataset()
    aml_mono = H2OAutoML(
        project_name="test_automl_creates_interpretable_se",
        max_models=5,
        include_algos=["GBM", "XGBoost", "DRF", "StackedEnsemble"],
        monotone_constraints=dict(AGE=1,
                                  DPROS=1,
                                  DCAPS=1,
                                  PSA=1,
                                  VOL=1,
                                  GLEASON=1),
        seed=1234)
    aml_mono.train(y=ds.target, training_frame=ds.train)

    assert (aml_mono.leaderboard.as_data_frame()["model_id"].apply(
        lambda model_name: "Monotonic" in model_name).any())

    # If we don't have monotonic constraints we shouldn't have monotonically constrained SE
    aml = H2OAutoML(project_name="test_automl_doesnt_create_interpretable_se",
                    max_models=2,
                    include_algos=["GBM", "XGBoost", "StackedEnsemble"],
                    seed=1234)
    aml.train(y=ds.target, training_frame=ds.train)

    assert not (aml.leaderboard.as_data_frame()["model_id"].apply(
        lambda model_name: "Monotonic" in model_name).any())

def test_non_train_params_are_frozen_after_first_train():
    aml = H2OAutoML(max_models=2, nfolds=3, seed=42, keep_cross_validation_predictions=True)
    ds = import_dataset()
    aml.train(y=ds.target, training_frame=ds.train, validation_frame=ds.valid)
    assert aml.leaderboard.nrows == aml.max_models + aml.leaderboard["model_id"].grep("StackedEnsemble").sum()
    assert aml.leaderboard.columns[1] == 'auc'
    
    try:
        aml.nfolds = 0
        assert False, "should have raised"
    except H2OValueError as e:
        assert "Param ``nfolds`` can not be modified after the first call to ``train``." == str(e)
        assert aml.nfolds == 3
        
    try:
        aml.seed = 24
        assert False, "should have raised"
    except H2OValueError as e:
        assert "Param ``seed`` can not be modified after the first call to ``train``." == str(e)
        assert aml.seed == 42

    assert aml.sort_metric == 'AUTO'
    aml.sort_metric = 'logloss'
    aml.train(y=ds.target, training_frame=ds.train, validation_frame=ds.valid)
    print(aml.leaderboard)
    assert aml.leaderboard.nrows == aml.max_models*2 + aml.leaderboard["model_id"].grep("StackedEnsemble").sum()
    assert aml.leaderboard.columns[1] == 'logloss'

def test_modeling_steps():
    ds = import_dataset()
    aml = H2OAutoML(project_name="py_modeling_steps",
                    max_models=5,
                    modeling_plan=['DRF',
                                   dict(name='GBM', steps=[
                                       dict(id='def_3', group=2),
                                       dict(id='grid_1', weight=77)
                                   ]),
                                   ('GLM', 'defaults'),
                                   ('StackedEnsemble', 'defaults')],
                    seed=1)
    aml.train(y=ds.target, training_frame=ds.train)
    print(aml.leaderboard)
    # we should now see the detailed steps sorted in their execution order.
    print(aml.modeling_steps)
    assert aml.modeling_steps == [
        {'name': 'DRF', 'steps': [{'id': 'def_1', 'group': 1, 'weight': 10},
                                  {'id': 'XRT', 'group': 1, 'weight': 10}]},
        {'name': 'GLM', 'steps': [{'id': 'def_1', 'group': 1, 'weight': 10}]},
        {'name': 'StackedEnsemble', 'steps': [{'id': 'best_of_family_1', 'group': 1, 'weight': 10}]},  # no all_1 as XRT is interpreted as not being of the same family as DRF (legacy decision). 
        {'name': 'GBM', 'steps': [{'id': 'def_3', 'group': 2, 'weight': 10},
                                  {'id': 'grid_1', 'group': 2, 'weight': 77}]},  # grids are 2nd group by default
        {'name': 'StackedEnsemble', 'steps': [{'id': 'best_of_family_2', 'group': 2, 'weight': 10}, 
                                              {'id': 'all_2', 'group': 2, 'weight': 10}]}
    ]

    new_aml = H2OAutoML(project_name="py_reinject_modeling_steps",
                        max_models=5,
                        modeling_plan=aml.modeling_steps,
                        seed=1)
    new_aml.train(y=ds.target, training_frame=ds.train)
    print(new_aml.leaderboard)
    assert aml.modeling_steps == new_aml.modeling_steps
def test_columns_not_in_x_and_y_are_ignored():
    ds = import_dataset()
    # Use the same project_name so each run adds its models to the same leaderboard
    aml = H2OAutoML(max_models=2, stopping_rounds=3, stopping_tolerance=0.001, project_name="aml1")

    print("AutoML with x as a str list, train, valid, and test")
    x = ["AGE", "RACE", "DPROS"]
    y = ds.target
    names = ds.train.names
    aml.train(x=x, y=y, training_frame=ds.train, validation_frame=ds.valid, leaderboard_frame=ds.test)
    print("AutoML leaderboard")
    print(aml.leaderboard)
    models = aml.leaderboard["model_id"]
    check_ignore_cols_automl(models, names, x, y)

    print("AutoML with x and y as col indexes, train, valid, and test")
    aml.train(x=[2, 3, 4], y=1, training_frame=ds.train, validation_frame=ds.valid, leaderboard_frame=ds.test)
    print("AutoML leaderboard")
    print(aml.leaderboard)
    models = aml.leaderboard["model_id"]
    check_ignore_cols_automl(models, names, x, y)

    print("AutoML with x as a str list, y as a col index, train, valid, and test")
    aml.train(x=x, y=1, training_frame=ds.train, validation_frame=ds.valid, leaderboard_frame=ds.test)
    print("AutoML leaderboard")
    print(aml.leaderboard)
    models = aml.leaderboard["model_id"]
    check_ignore_cols_automl(models, names, x, y)

    print("AutoML with x as col indexes, y as a str, train, valid, and test")
    aml.train(x=[2, 3, 4], y=y, training_frame=ds.train, validation_frame=ds.valid, leaderboard_frame=ds.test)
    print("AutoML leaderboard")
    print(aml.leaderboard)
    models = aml.leaderboard["model_id"]
    check_ignore_cols_automl(models, names, x, y)
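
# The four runs above delegate the actual verification to a
# check_ignore_cols_automl helper from the shared utilities. This is a hedged
# sketch of the check those calls appear to assume; the real helper may inspect
# the models differently.
def check_ignore_cols_automl(models, names, x, y):
    models = models.as_data_frame()
    for model_id in models['model_id']:
        if 'StackedEnsemble' in model_id:
            continue  # SEs train on base-model predictions, not on the raw columns
        model = h2o.get_model(model_id)
        ignored = set(model.params['ignored_columns']['actual'] or [])
        expected = set(names) - set(x) - {y}
        assert ignored == expected, \
            "unexpected ignored_columns for {}: {}".format(model_id, ignored)
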
def test_get_automl():
    ds = import_dataset()
    aml = H2OAutoML(project_name="test_get_automl", max_models=2, seed=1234)
    aml.train(y=ds.target, training_frame=ds.train)

    get_aml = get_automl(aml.project_name)

    assert aml.project_name == get_aml["project_name"]
    assert aml.leader.model_id == get_aml["leader"].model_id
    assert aml.leaderboard.get_frame_data() == get_aml["leaderboard"].get_frame_data()
    assert aml.event_log.get_frame_data() == get_aml["event_log"].get_frame_data()
    assert aml.training_info == get_aml['training_info']

    # PUBDEV-6599
    assert aml.project_name == get_aml.project_name
    assert aml.leader.model_id == get_aml.leader.model_id
    assert aml.leaderboard.frame_id == get_aml.leaderboard.frame_id
    assert aml.event_log.frame_id == get_aml.event_log.frame_id
    assert aml.training_info == get_aml.training_info

    # Test predictions
    predictions = aml.predict(ds.test)
    predictions_from_output = get_aml.predict(ds.test)
    assert (predictions == predictions_from_output).all()

    # Test get_leaderboard PUBDEV-7454
    assert (get_leaderboard(aml) == get_leaderboard(get_aml)).all()
    assert (get_leaderboard(aml, 'ALL') == get_leaderboard(get_aml, 'ALL')).all()

def test_AUTO_stopping_metric_with_no_sorting_metric_regression():
    print("Check leaderboard with AUTO stopping metric and no sorting metric for regression")
    ds = import_dataset('regression', split=False)
    exclude_algos = ["DeepLearning", "GLM"]
    aml = H2OAutoML(project_name="py_aml_lb_test_auto_stopping_metric_no_sorting_regression",
                    exclude_algos=exclude_algos,
                    max_models=10,
                    nfolds=2,
                    stopping_rounds=1,
                    stopping_tolerance=0.5,
                    seed=automl_seed)
    aml.train(y=ds.target, training_frame=ds.train)

    check_leaderboard(aml, exclude_algos,
                      ["rmse", "mse", "mae", "rmsle", "mean_residual_deviance"],
                      "rmse")
    base = get_partitioned_model_names(aml.leaderboard).base
    first = [m for m in base if 'XGBoost_1' in m]
    others = [m for m in base if m not in first]
    # if stopping_rounds == 0, the actual value of stopping_metric is set to None
    check_model_property(first, 'stopping_metric', True, None)
    check_model_property(others, 'stopping_metric', True, "deviance")
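
# Several tests in this listing use a get_partitioned_model_names helper from
# the shared utilities. A hedged sketch of the structure it is assumed to
# return, with field names matching how the tests use it (.all, .se, .base):
from collections import namedtuple

PartitionedModelNames = namedtuple("PartitionedModelNames", ["all", "se", "base"])


def get_partitioned_model_names(leaderboard):
    all_names = [leaderboard[i, "model_id"] for i in range(leaderboard.nrows)]
    se = [name for name in all_names if "StackedEnsemble" in name]
    base = [name for name in all_names if name not in se]
    return PartitionedModelNames(all=all_names, se=se, base=base)
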
def test_remove_automl_after_individual_manual_deletions():
    ds = import_dataset()
    project_name = 'aml_no_xval_remove_test'
    max_models = 3
    aml = H2OAutoML(project_name=project_name,
                    nfolds=0,
                    max_models=max_models,
                    seed=1)
    aml.train(y=ds.target, training_frame=ds.train, blending_frame=ds.valid)

    keys = list_keys_in_memory()
    # manually remove the first item in each category to verify the robustness of global automl deletion,
    # e.g. that exceptions (if any) are handled correctly when automl tries to remove a base model that was already removed
    for k, v in keys.items():
        if k == 'all':
            continue
        if len(v) > 0:
            h2o.remove(v[0])

    h2o.remove(aml)
    clean = list_keys_in_memory()
    print(clean['all'].values)
    assert aml.key.startswith(project_name)
    assert not contains_leaderboard(aml.key, clean)
    assert not contains_event_log(aml.key, clean)
    assert len(clean['models_base']) == 0
    assert len(clean['cv_models']) == 0
    assert len(clean['models_all']) == 0
    assert len(clean['metrics']) == 0
    assert len(clean['predictions']) == 0
    assert len(clean['automl']) == 0
    for frame in [ds.train, ds.valid, ds.test]:
        assert frame_in_cluster(frame), "frame {} has been removed from cluster".format(frame.frame_id)

def test_AUTO_stopping_metric_with_no_sorting_metric_binary():
    print("Check leaderboard with AUTO stopping metric and no sorting metric for binary")
    ds = import_dataset('binary', split=False)
    exclude_algos = ["DeepLearning", "GLM", "StackedEnsemble"]
    aml = H2OAutoML(project_name="py_aml_lb_test_auto_stopping_metric_no_sorting_binary",
                    seed=automl_seed,
                    max_models=10,
                    nfolds=2,
                    stopping_rounds=1,
                    stopping_tolerance=0.5,
                    exclude_algos=exclude_algos)
    aml.train(y=ds.target, training_frame=ds.train)

    check_leaderboard(aml, exclude_algos,
                      ["auc", "logloss", "aucpr", "mean_per_class_error", "rmse", "mse"],
                      "auc", True)
    base = get_partitioned_model_names(aml.leaderboard).base
    first = [m for m in base if 'XGBoost_1' in m]
    others = [m for m in base if m not in first]
    # if stopping_rounds == 0, the actual value of stopping_metric is set to None
    check_model_property(first, 'stopping_metric', True, None)
    check_model_property(others, 'stopping_metric', True, "logloss")
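
# The two AUTO-stopping-metric tests above delegate to check_leaderboard and
# check_model_property helpers from the shared utilities. These are hedged
# sketches with signatures inferred from the call sites, not the real helpers.
def check_leaderboard(aml, excluded_algos, expected_metrics, sort_metric, sort_decreasing=False):
    lb = aml.leaderboard
    assert lb.columns == ["model_id"] + expected_metrics
    sort_col = lb[sort_metric].as_data_frame()[sort_metric]
    assert (sort_col.is_monotonic_decreasing if sort_decreasing
            else sort_col.is_monotonic_increasing), "leaderboard is not sorted by " + sort_metric
    model_ids = lb.as_data_frame()["model_id"]
    for algo in excluded_algos:
        assert not model_ids.str.contains(algo).any(), algo + " should have been excluded"


def check_model_property(model_names, prop, in_params=True, expected_value=None):
    for name in model_names:
        model = h2o.get_model(name)
        if in_params:
            actual = model.params[prop]['actual']
            assert actual == expected_value, \
                "{}: expected {} == {}, got {}".format(name, prop, expected_value, actual)
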
def test_SE_retraining_fails_when_param_disabled():
    # kcvp (the keep_cross_validation_predictions param name) and setup_and_train
    # are assumed to be provided by the surrounding test module.
    print("\n=== disabling " + kcvp + " and retraining ===")
    total_runs = 4
    aml = setup_and_train(False)  # first run
    first_models = get_partitioned_model_names(aml.leaderboard)
    first_bof = next(m for m in first_models.se if re.search(r'_BestOfFamily_', m))
    ds = import_dataset()
    for i in range(total_runs - 1):
        aml.train(y=ds.target, training_frame=ds.train)
    models = get_partitioned_model_names(aml.leaderboard)
    first_se_all_models = [m for m in first_models.se if re.search(r'_AllModels_', m)]
    se_all_models = [m for m in models.se if re.search(r'_AllModels_', m)]
    se_best_of_family = [m for m in models.se if re.search(r'_BestOfFamily_', m)]
    lb = aml.leaderboard
    print(lb.head(lb.nrows))

    assert len(models.se) == len(se_all_models) + len(se_best_of_family)
    assert len(se_all_models) == len(first_se_all_models), \
        "expecting only the {} first StackedEnsemble_AllModels, but got {}".format(len(first_se_all_models), len(se_all_models))
    assert se_all_models[0] in first_models.se, "first StackedEnsemble_AllModels got replaced by a new one"
    if len(se_best_of_family) > 1:
        assert first_bof in se_best_of_family, "first StackedEnsemble_BestOfFamily disappeared after multiple runs"
        row_of = lambda id: lb[lb['model_id'] == id]
        first_bof_row = row_of(first_bof)
        assert all(all(row[i] == first_bof_row[i] for i in range(1, lb.ncols)) for row in [row_of(se) for se in se_best_of_family]), \
            "expecting possibly 2+ similar StackedEnsemble_BestOfFamily (corner case), but obtained 2 different ones!"
    else:
        assert len(se_best_of_family) == 1, "expecting only the first StackedEnsemble_BestOfFamily, but got {}".format(len(se_best_of_family))
        assert se_best_of_family[0] == first_bof, "first StackedEnsemble_BestOfFamily got replaced by a new one"
def test_workaround_for_distribution():
    try:
        h2o.rapids("(setproperty \"{}\" \"{}\")".format(
            "sys.ai.h2o.automl.algo_parameters.all.enabled", "true"))
        ds = import_dataset('regression')
        aml = H2OAutoML(project_name="py_test",
                        algo_parameters=dict(
                            distribution='poisson',
                            family='poisson',
                        ),
                        exclude_algos=['StackedEnsemble'],
                        max_runtime_secs=60,
                        seed=1)
        aml.train(y=ds.target, training_frame=ds.train)
        model_names = [aml.leaderboard[i, 0] for i in range(aml.leaderboard.nrows)]
        for mn in model_names:
            m = h2o.get_model(mn)
            dist = (m.params['distribution'] if 'distribution' in m.params
                    else m.params['family'] if 'family' in m.params
                    else None)
            print("{}: distribution = {}".format(mn, dist))
    finally:
        # always restore the property, not only on failure
        h2o.rapids("(setproperty \"{}\" \"{}\")".format(
            "sys.ai.h2o.automl.algo_parameters.all.enabled", "false"))

def test_algo_parameter_can_be_applied_only_to_a_specific_algo():
    ds = import_dataset()
    aml = H2OAutoML(project_name="py_specific_algo_param",
                    algo_parameters=dict(GBM__monotone_constraints=dict(AGE=1)),
                    max_models=6,
                    seed=1)
    aml.train(y=ds.target, training_frame=ds.train)
    model_names = get_partitioned_model_names(aml.leaderboard).all
    models_supporting_monotone_constraints = [n for n in model_names if re.match(r"GBM|XGBoost", n)]
    assert next((m for m in models_supporting_monotone_constraints if m.startswith('GBM')), None), \
        "There should be at least one GBM model"
    for m in models_supporting_monotone_constraints:
        model = h2o.get_model(m)
        mc_value = next(v['actual'] for n, v in model.params.items() if n == 'monotone_constraints')
        if m.startswith('GBM'):
            assert isinstance(mc_value, list)
            age = next((v for v in mc_value if v['key'] == 'AGE'), None)
            assert age is not None
            assert age['value'] == 1.0
        else:
            assert mc_value is None
def test_stacked_ensembles_are_trained_with_blending_frame_even_if_nfolds_eq_0():
    print("Check that we can disable cross-validation when passing a blending frame "
          "and that Stacked Ensembles are trained using this frame.")
    max_models = 5
    ds = import_dataset()
    aml = H2OAutoML(project_name="py_aml_blending_frame",
                    seed=1,
                    max_models=max_models,
                    nfolds=0)
    aml.train(y=ds.target,
              training_frame=ds.train,
              blending_frame=ds.valid,
              leaderboard_frame=ds.test)

    se = get_partitioned_model_names(aml.leaderboard).se
    assert len(se) > 3, "In blending mode, StackedEnsembles should still be trained in spite of nfolds=0."
    for m in se:
        model = h2o.get_model(m)
        assert model.params['blending_frame']['actual']['name'] == ds.valid.frame_id
        assert model._model_json['output']['stacking_strategy'] == 'blending'

def test_exploitation_impacts_exploration_duration():
    ds = import_dataset()
    planned_duration = 60
    aml = H2OAutoML(project_name="py_exploitation_ratio_max_runtime",
                    exploitation_ratio=.5,  # excessive ratio on purpose, due to training overheads in multinode
                    exclude_algos=['DeepLearning', 'XGBoost'],  # removing some algos for the same reason
                    max_runtime_secs=planned_duration,
                    seed=1,
                    verbosity='info')
    aml.train(y=ds.target, training_frame=ds.train)
    automl_start = int(aml.training_info['start_epoch'])
    assert 'start_GBM_lr_annealing' in aml.training_info
    # assert 'start_XGBoost_lr_search' in aml.training_info
    first_exploitation_step = 'start_GBM_lr_annealing'
    after_exploitation_step = 'start_completion_GBM_grid_1'
    if first_exploitation_step in aml.training_info and after_exploitation_step in aml.training_info:
        exploitation_start = int(aml.training_info[first_exploitation_step])
        exploration_duration = exploitation_start - automl_start
        after_start = int(aml.training_info[after_exploitation_step])
        exploitation_duration = after_start - exploitation_start
        # the duration ratio itself can't be checked reliably
        assert 0 < exploration_duration < planned_duration
        print(aml.leaderboard)
        print(exploitation_duration)
        print(exploration_duration)
        assert 0 < exploitation_duration < exploration_duration
    else:
        print(aml.leaderboard)
        print("budget time was too small to start and complete exploitation")
def test_no_x_train_and_validation_and_test_sets():
    print("AutoML run with x not provided with train, valid, and test")
    ds = import_dataset()
    aml = H2OAutoML(project_name="py_aml4",
                    stopping_rounds=3,
                    stopping_tolerance=0.001,
                    stopping_metric="AUC",
                    max_models=max_models,
                    seed=1234,
                    nfolds=0)
    aml.train(y=ds.target,
              training_frame=ds.train,
              validation_frame=ds.valid,
              leaderboard_frame=ds.test)
    assert aml.project_name == "py_aml4", "Project name is not set"
    assert aml.stopping_rounds == 3, "stopping_rounds is not set to 3"
    assert aml.stopping_tolerance == 0.001, "stopping_tolerance is not set to 0.001"
    assert aml.stopping_metric == "AUC", "stopping_metric is not set to `AUC`"
    assert aml.max_models == 2, "max_models is not set to 2"
    assert aml.seed == 1234, "seed is not set to `1234`"
    log_df = aml.event_log.as_data_frame()
    warn_messages = log_df[log_df['level'] == 'WARN']['message']
    assert not warn_messages.str.startswith("User specified a validation frame with cross-validation still enabled").any(), \
        "no warning should have been raised as CV was disabled"
    print("Check leaderboard")
    print(aml.leaderboard)

def test_remove_automl_with_xval_when_keeping_all_cv_details():
    ds = import_dataset()
    project_name = 'aml_with_xval_remove_test'
    max_models = 5
    nfolds = 5
    aml = H2OAutoML(project_name=project_name,
                    nfolds=nfolds,
                    max_models=max_models,
                    seed=1,
                    keep_cross_validation_predictions=True,
                    keep_cross_validation_fold_assignment=True,
                    keep_cross_validation_models=True)
    aml.train(y=ds.target, training_frame=ds.train)

    keys = list_keys_in_memory()
    # print(keys['all'].values)
    assert aml.key.startswith(project_name)
    assert contains_leaderboard(aml.key, keys)
    assert contains_event_log(aml.key, keys)
    num_SEs = len(keys['metalearners']) / (nfolds + 1)  # keeping cv models, so metalearners include cv models
    print({k: len(v) for k, v in keys.items()})
    expectations = dict(
        models_base=max_models + num_SEs,
        cv_models=(max_models + num_SEs) * nfolds,  # 1 cv model per fold for all models, incl. SEs
        predictions=(
            len(keys['cv_models'])      # cv predictions
            + len(keys['models_base'])  # cv holdout predictions
        ),
        metrics=(
            len(keys['cv_models']) * 3  # for each cv model: 1 on training frame, 1 on validation frame (=training for cv), 1 on adapted frame (to be removed with PUBDEV-6638)
            + len(keys['models_base'])  # for each model, 1 on training_frame
            + (num_SEs * 1)             # for each SE, 1 on levelone training
            + (1 if any("DeepLearning" in x for x in keys["metrics"]) else 0)  # DeepLearning has 2 training metrics (IDK why)
        ))
    for k, v in expectations.items():
        assert len(keys[k]) == v, "expected {} {}, but got {}".format(v, k, len(keys[k]))

    h2o.remove(aml)
    clean = list_keys_in_memory()
    print(clean['all'].values)
    assert not contains_leaderboard(aml.key, clean)
    assert not contains_event_log(aml.key, clean)
    assert len(clean['models_base']) == 0
    assert len(clean['cv_models']) == 0
    assert len(clean['models_all']) == 0
    assert len(clean['predictions']) == 0
    assert len(clean['metrics']) == 0
    assert len(clean['automl']) == 0
    for frame in [ds.train, ds.valid, ds.test]:
        assert frame_in_cluster(frame), "frame {} has been removed from cluster".format(frame.frame_id)
def test_max_runtime_secs_alone():
    ds = import_dataset()
    aml = H2OAutoML(project_name="py_max_runtime_secs", seed=1, max_runtime_secs=7)
    aml.train(y=ds.target, training_frame=ds.train)
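    # _build_resp (a private attribute) holds the backend's response to the
    # AutoML build request; its stopping_criteria section shows the effective
    # values the server resolved for this run.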
    max_runtime = aml._build_resp['build_control']['stopping_criteria']['max_runtime_secs']
    max_models = aml._build_resp['build_control']['stopping_criteria']['max_models']
    assert max_runtime == 7
    assert max_models == 0
def test_train_returns_leader_model():
    ds = import_dataset()
    aml = H2OAutoML(max_models=3, project_name="py_aml_train_result", seed=42)
    model = aml.train(y=ds.target, training_frame=ds.train)

    assert isinstance(model, ModelBase)
    assert model.key == aml.leader.key
    model.predict(ds.test)

def test_stacked_ensembles_are_trained_after_max_models():
    print("Check that Stacked Ensembles are still trained after max models have been trained")
    ds = import_dataset()
    aml = H2OAutoML(project_name="py_aml_SE_after_max_models", seed=1, max_models=5)
    aml.train(y=ds.target, training_frame=ds.train)

    se = get_partitioned_model_names(aml.leaderboard).se
    assert len(se) == 2, "StackedEnsemble should still be trained after max models have been reached"
def test_no_time_limit_if_max_models_is_provided():
    ds = import_dataset()
    aml = H2OAutoML(project_name="py_no_time_limit", seed=1, max_models=1)
    aml.train(y=ds.target, training_frame=ds.train)
    max_runtime = aml._build_resp['build_control']['stopping_criteria']['max_runtime_secs']
    max_models = aml._build_resp['build_control']['stopping_criteria']['max_models']
    assert max_models == 1, max_models
    assert max_runtime == 0, max_runtime
def test_automl_stops_after_max_models():
    print("Check that automl gets interrupted after `max_models`")
    ds = import_dataset()
    max_models = 5
    aml = H2OAutoML(project_name="py_aml_max_models", seed=1, max_models=max_models)
    aml.train(y=ds.target, training_frame=ds.train)

    base_models = get_partitioned_model_names(aml.leaderboard).base
    assert len(base_models) == max_models, "obtained {} base models when {} are expected".format(len(base_models), max_models)

def test_exploitation_disabled():
    ds = import_dataset()
    aml = H2OAutoML(project_name="py_exploitation_ratio_disabled",
                    exploitation_ratio=0.0,
                    max_models=6,
                    seed=1)
    aml.train(y=ds.target, training_frame=ds.train)
    assert 'start_GBM_lr_annealing' not in aml.training_info
    assert 'start_XGBoost_lr_search' not in aml.training_info
def test_default_automl_with_binary_task():
    ds = import_dataset('binary')
    aml = H2OAutoML(max_models=2, 
                    project_name='aml_binary')

    aml.train(y=ds.target, training_frame=ds.train, validation_frame=ds.valid, leaderboard_frame=ds.test)
    print(aml.leader)
    print(aml.leaderboard)
    assert aml.leaderboard.columns == ["model_id", "auc", "logloss", "aucpr", "mean_per_class_error", "rmse", "mse"]

def test_remove_automl_with_xval():
    ds = import_dataset()
    project_name = 'aml_with_xval_remove_test'
    max_models = 5
    nfolds = 5
    aml = H2OAutoML(project_name=project_name,
                    nfolds=nfolds,
                    max_models=max_models,
                    seed=1)
    aml.train(y=ds.target,
              training_frame=ds.train,
              validation_frame=ds.valid,
              leaderboard_frame=ds.test)

    keys = list_keys_in_memory()
    assert aml.key.startswith(project_name)
    assert contains_leaderboard(aml.key, keys)
    assert contains_event_log(aml.key, keys)
    num_SEs = len(keys['metalearners'])
    print({k: len(v) for k, v in keys.items()})
    expectations = dict(
        models_base=max_models + num_SEs,
        cv_models=0,
        predictions=0,
        metrics=(
            max_models * 3   # for each non-SE model: 1 on training_frame, 1 on validation_frame, 1 on leaderboard_frame
            + (num_SEs * 2)  # for each SE model: 1 on training frame, 1 on leaderboard frame
            + (num_SEs * 2)  # for each SE metalearner: 1+1 on levelone training+validation
            + (1 if any("DeepLearning" in x for x in keys["metrics"]) else 0)  # DeepLearning has 2 training metrics (IDK why)
        ))
    for k, v in expectations.items():
        assert len(keys[k]) == v, "expected {} {}, but got {}".format(v, k, len(keys[k]))

    h2o.remove(aml)
    clean = list_keys_in_memory()
    print(clean['all'].values)
    assert not contains_leaderboard(aml.key, clean)
    assert not contains_event_log(aml.key, clean)
    assert len(clean['models_base']) == 0
    assert len(clean['cv_models']) == 0
    assert len(clean['models_all']) == 0
    assert len(clean['predictions']) == 0
    assert len(clean['metrics']) == 0
    assert len(clean['automl']) == 0
    for frame in [ds.train, ds.valid, ds.test]:
        assert frame_in_cluster(frame), "frame {} has been removed from cluster".format(frame.frame_id)
def test_max_runtime_secs_can_be_set_in_combination_with_max_models_and_max_runtime_wins():
    ds = import_dataset()
    aml = H2OAutoML(project_name="py_all_stopping_constraints", seed=1, max_models=20, max_runtime_secs=12)
    aml.train(y=ds.target, training_frame=ds.train)
    max_runtime = aml._build_resp['build_control']['stopping_criteria']['max_runtime_secs']
    max_models = aml._build_resp['build_control']['stopping_criteria']['max_models']
    assert max_runtime == 12
    assert max_models == 20
    assert aml.leaderboard.nrows < 20
    assert int(aml.training_info['duration_secs']) < 2*max_runtime  # being generous to avoid errors on slow Jenkins

def test_cannot_set_unauthorized_algo_parameter():
    ds = import_dataset()
    aml = H2OAutoML(project_name="py_unauthorized_algo_param",
                    algo_parameters=dict(score_tree_interval=7),
                    max_models=6,
                    seed=1)
    try:
        aml.train(y=ds.target, training_frame=ds.train)
        assert False, "training should have raised an error for the unauthorized parameter"
    except h2o.exceptions.H2OResponseError as e:
        assert "algo_parameters: score_tree_interval" in str(e)

def test_params_can_be_set_as_attributes():
    aml = H2OAutoML()
    aml.max_models = 4
    aml.seed = 42
    aml.exclude_algos = ['StackedEnsemble']

    ds = import_dataset()
    aml.train(y=ds.target, training_frame=ds.train, validation_frame=ds.valid)
    assert aml.leaderboard.nrows == aml.max_models == 4
    assert aml.project_name is not None

def test_warn_on_empty_leaderboard():
    ds = import_dataset()
    aml = H2OAutoML(project_name="test_empty_leaderboard",
                    include_algos=[],
                    seed=1234)
    aml.train(y=ds.target, training_frame=ds.train)
    assert aml.leaderboard.nrow == 0
    warnings = aml.event_log[aml.event_log['level'] == 'WARN', 'message']
    last_warning = warnings[warnings.nrow - 1, :].flatten()
    assert "Empty leaderboard" in last_warning
def test_nfolds_eq_0():
    print("Check nfolds = 0 works properly")
    ds = import_dataset()
    aml = H2OAutoML(project_name="py_aml_nfolds0",
                    nfolds=0,
                    max_models=3,
                    seed=1)
    aml.train(y=ds.target, training_frame=ds.train)
    base_models = get_partitioned_model_names(aml.leaderboard).base
    amodel = h2o.get_model(base_models[0])
    assert amodel.params['nfolds']['actual'] == 0

def test_include_algos():
    print("AutoML trains only models for algos listed in include_algos")
    ds = import_dataset()
    aml = H2OAutoML(project_name="py_include_algos",
                    include_algos=['GBM'],
                    max_models=max_models,
                    seed=1)
    aml.train(y=ds.target, training_frame=ds.train, validation_frame=ds.valid)
    models = get_partitioned_model_names(aml.leaderboard)
    assert all('GBM' in name for name in models.base)
    assert len(models.se) == 0, "no StackedEnsemble should have been trained since it was not explicitly included in include_algos"