import re

import h2o
from h2o.automl import H2OAutoML, get_automl, get_leaderboard
from h2o.exceptions import H2OValueError
from h2o.model import ModelBase


def test_automl_creates_interpretable_SE_with_only_monotonic_models():
    ds = import_dataset()
    aml_mono = H2OAutoML(project_name="test_automl_creates_interpretable_se",
                         max_models=5,
                         include_algos=["GBM", "GLM", "XGBoost", "StackedEnsemble"],
                         monotone_constraints=dict(AGE=1, DPROS=1, DCAPS=1, PSA=1, VOL=1, GLEASON=1),
                         seed=1234)
    aml_mono.train(y=ds.target, training_frame=ds.train)
    leaderboard = aml_mono.leaderboard.as_data_frame()["model_id"]
    assert leaderboard.apply(lambda model_name: "Monotonic" in model_name).any()
    se_name = leaderboard[leaderboard.apply(lambda model_name: "Monotonic" in model_name)]
    se_mono = h2o.get_model(se_name.iloc[0])
    assert leaderboard.apply(lambda model_name: 'GLM' in model_name).any()
    assert all('GBM' in bm or 'XGBoost' in bm for bm in se_mono.base_models)
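

# --- test fixtures (not part of this excerpt) --------------------------------
# The tests reference a few module-level constants and an `import_dataset`
# helper defined elsewhere in the original suite. The definitions below are
# hedged sketches, not verbatim copies: `max_models = 2` is implied by the
# assertion in test_no_x_train_and_validation_and_test_sets, while the values
# of `automl_seed` and `kcvp`, the dataset URL, the target columns and the
# split ratios are assumptions.
max_models = 2
automl_seed = 42  # assumed value; any fixed seed keeps runs reproducible
kcvp = 'keep_cross_validation_predictions'  # inferred from the SE retraining test


def import_dataset(problem_type='binary', split=True):
    import collections
    Dataset = collections.namedtuple('Dataset', ['train', 'valid', 'test', 'target'])
    # the column names used in the tests (AGE, RACE, DPROS, ..., GLEASON) match
    # the classic prostate dataset shipped with h2o-3 (assumption)
    df = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv")
    if problem_type == 'regression':
        target = 'AGE'  # assumed regression target
    else:
        target = 'CAPSULE'
        df[target] = df[target].asfactor()
    if split:
        train, valid, test = df.split_frame(ratios=[.8, .1], seed=1)
    else:
        train, valid, test = df, None, None
    return Dataset(train=train, valid=valid, test=test, target=target)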


def test_automl_creates_interpretable_SE_iff_monotonic_models_exist():
    ds = import_dataset()
    aml_mono = H2OAutoML(project_name="test_automl_creates_interpretable_se",
                         max_models=5,
                         include_algos=["GBM", "XGBoost", "DRF", "StackedEnsemble"],
                         monotone_constraints=dict(AGE=1, DPROS=1, DCAPS=1, PSA=1, VOL=1, GLEASON=1),
                         seed=1234)
    aml_mono.train(y=ds.target, training_frame=ds.train)
    assert aml_mono.leaderboard.as_data_frame()["model_id"].apply(
        lambda model_name: "Monotonic" in model_name).any()

    # Without monotonic constraints we shouldn't get a monotonically constrained SE.
    aml = H2OAutoML(project_name="test_automl_doesnt_create_interpretable_se",
                    max_models=2,
                    include_algos=["GBM", "XGBoost", "StackedEnsemble"],
                    seed=1234)
    aml.train(y=ds.target, training_frame=ds.train)
    assert not aml.leaderboard.as_data_frame()["model_id"].apply(
        lambda model_name: "Monotonic" in model_name).any()


def test_non_train_params_are_frozen_after_first_train():
    aml = H2OAutoML(max_models=2, nfolds=3, seed=42, keep_cross_validation_predictions=True)
    ds = import_dataset()
    aml.train(y=ds.target, training_frame=ds.train, validation_frame=ds.valid)
    assert aml.leaderboard.nrows == aml.max_models + aml.leaderboard["model_id"].grep("StackedEnsemble").sum()
    assert aml.leaderboard.columns[1] == 'auc'
    try:
        aml.nfolds = 0
        assert False, "should have raised"
    except H2OValueError as e:
        assert "Param ``nfolds`` can not be modified after the first call to ``train``." == str(e)
    assert aml.nfolds == 3
    try:
        aml.seed = 24
        assert False, "should have raised"
    except H2OValueError as e:
        assert "Param ``seed`` can not be modified after the first call to ``train``." == str(e)
    assert aml.seed == 42
    assert aml.sort_metric == 'AUTO'
    aml.sort_metric = 'logloss'
    aml.train(y=ds.target, training_frame=ds.train, validation_frame=ds.valid)
    print(aml.leaderboard)
    assert aml.leaderboard.nrows == aml.max_models * 2 + aml.leaderboard["model_id"].grep("StackedEnsemble").sum()
    assert aml.leaderboard.columns[1] == 'logloss'


def test_modeling_steps():
    ds = import_dataset()
    aml = H2OAutoML(project_name="py_modeling_steps",
                    max_models=5,
                    modeling_plan=[
                        'DRF',
                        dict(name='GBM', steps=[
                            dict(id='def_3', group=2),
                            dict(id='grid_1', weight=77)
                        ]),
                        ('GLM', 'defaults'),
                        ('StackedEnsemble', 'defaults')
                    ],
                    seed=1)
    aml.train(y=ds.target, training_frame=ds.train)
    print(aml.leaderboard)
    # we should now see the detailed steps sorted in their execution order
    print(aml.modeling_steps)
    assert aml.modeling_steps == [
        {'name': 'DRF', 'steps': [{'id': 'def_1', 'group': 1, 'weight': 10}, {'id': 'XRT', 'group': 1, 'weight': 10}]},
        {'name': 'GLM', 'steps': [{'id': 'def_1', 'group': 1, 'weight': 10}]},
        # no all_1 as XRT is interpreted as not being of the same family as DRF (legacy decision)
        {'name': 'StackedEnsemble', 'steps': [{'id': 'best_of_family_1', 'group': 1, 'weight': 10}]},
        # grids are in the 2nd group by default
        {'name': 'GBM', 'steps': [{'id': 'def_3', 'group': 2, 'weight': 10}, {'id': 'grid_1', 'group': 2, 'weight': 77}]},
        {'name': 'StackedEnsemble', 'steps': [{'id': 'best_of_family_2', 'group': 2, 'weight': 10}, {'id': 'all_2', 'group': 2, 'weight': 10}]}
    ]
    new_aml = H2OAutoML(project_name="py_reinject_modeling_steps",
                        max_models=5,
                        modeling_plan=aml.modeling_steps,
                        seed=1)
    new_aml.train(y=ds.target, training_frame=ds.train)
    print(new_aml.leaderboard)
    assert aml.modeling_steps == new_aml.modeling_steps


def test_columns_not_in_x_and_y_are_ignored():
    ds = import_dataset()
    # Use the same project_name so we add to the same leaderboard on each run.
    aml = H2OAutoML(max_models=2, stopping_rounds=3, stopping_tolerance=0.001, project_name="aml1")

    print("AutoML with x as a str list, train, valid, and test")
    x = ["AGE", "RACE", "DPROS"]
    y = ds.target
    names = ds.train.names
    aml.train(x=x, y=y, training_frame=ds.train, validation_frame=ds.valid, leaderboard_frame=ds.test)
    print("AutoML leaderboard")
    print(aml.leaderboard)
    models = aml.leaderboard["model_id"]
    check_ignore_cols_automl(models, names, x, y)

    print("AutoML with x and y as col indexes, train, valid, and test")
    aml.train(x=[2, 3, 4], y=1, training_frame=ds.train, validation_frame=ds.valid, leaderboard_frame=ds.test)
    print("AutoML leaderboard")
    print(aml.leaderboard)
    models = aml.leaderboard["model_id"]
    check_ignore_cols_automl(models, names, x, y)

    print("AutoML with x as a str list, y as a col index, train, valid, and test")
    aml.train(x=x, y=1, training_frame=ds.train, validation_frame=ds.valid, leaderboard_frame=ds.test)
    print("AutoML leaderboard")
    print(aml.leaderboard)
    models = aml.leaderboard["model_id"]
    check_ignore_cols_automl(models, names, x, y)

    print("AutoML with x as col indexes, y as a str, train, valid, and test")
    aml.train(x=[2, 3, 4], y=y, training_frame=ds.train, validation_frame=ds.valid, leaderboard_frame=ds.test)
    print("AutoML leaderboard")
    print(aml.leaderboard)
    models = aml.leaderboard["model_id"]
    check_ignore_cols_automl(models, names, x, y)
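

# `check_ignore_cols_automl` (used above) is defined elsewhere in the suite.
# A minimal sketch, assuming it verifies through each model's `ignored_columns`
# param that every non-SE model was trained on exactly the requested predictors:
def check_ignore_cols_automl(models, names, x, y):
    model_ids = models.as_data_frame()['model_id'].tolist()
    for model_id in model_ids:
        if "StackedEnsemble" in model_id:
            continue  # SE inputs are base-model predictions, not raw columns
        ignored = h2o.get_model(model_id).params['ignored_columns']['actual'] or []
        # everything except the predictors and the response should be ignored
        assert set(ignored) == set(names) - set(x) - {y}, \
            "unexpected ignored columns for model " + model_id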


def test_get_automl():
    ds = import_dataset()
    aml = H2OAutoML(project_name="test_get_automl", max_models=2, seed=1234)
    aml.train(y=ds.target, training_frame=ds.train)
    get_aml = get_automl(aml.project_name)
    assert aml.project_name == get_aml["project_name"]
    assert aml.leader.model_id == get_aml["leader"].model_id
    assert aml.leaderboard.get_frame_data() == get_aml["leaderboard"].get_frame_data()
    assert aml.event_log.get_frame_data() == get_aml["event_log"].get_frame_data()
    assert aml.training_info == get_aml['training_info']

    # PUBDEV-6599: the result also supports attribute access
    assert aml.project_name == get_aml.project_name
    assert aml.leader.model_id == get_aml.leader.model_id
    assert aml.leaderboard.frame_id == get_aml.leaderboard.frame_id
    assert aml.event_log.frame_id == get_aml.event_log.frame_id
    assert aml.training_info == get_aml.training_info

    # Test predictions
    predictions = aml.predict(ds.test)
    predictions_from_output = get_aml.predict(ds.test)
    assert (predictions == predictions_from_output).all()

    # Test get_leaderboard (PUBDEV-7454)
    assert (get_leaderboard(aml) == get_leaderboard(get_aml)).all()
    assert (get_leaderboard(aml, 'ALL') == get_leaderboard(get_aml, 'ALL')).all()


def test_AUTO_stopping_metric_with_no_sorting_metric_regression():
    print("Check leaderboard with AUTO stopping metric and no sorting metric for regression")
    ds = import_dataset('regression', split=False)
    exclude_algos = ["DeepLearning", "GLM"]
    aml = H2OAutoML(project_name="py_aml_lb_test_auto_stopping_metric_no_sorting_regression",
                    exclude_algos=exclude_algos,
                    max_models=10, nfolds=2,
                    stopping_rounds=1, stopping_tolerance=0.5,
                    seed=automl_seed)
    aml.train(y=ds.target, training_frame=ds.train)
    check_leaderboard(aml, exclude_algos,
                      ["rmse", "mse", "mae", "rmsle", "mean_residual_deviance"],
                      "rmse")
    base = get_partitioned_model_names(aml.leaderboard).base
    first = [m for m in base if 'XGBoost_1' in m]
    others = [m for m in base if m not in first]
    # if stopping_rounds == 0, the actual value of stopping_metric is set to None
    check_model_property(first, 'stopping_metric', True, None)
    check_model_property(others, 'stopping_metric', True, "deviance")
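

# `check_leaderboard`, `check_model_property` and `get_partitioned_model_names`
# (used by the stopping-metric tests and by several tests below) are shared
# helpers not shown in this excerpt. Plausible sketches, inferred from the
# call sites rather than copied from the original suite:
def check_leaderboard(aml, excluded_algos, expected_metrics, sort_metric, descending=False):
    lb = aml.leaderboard.as_data_frame()
    assert list(lb.columns) == ['model_id'] + expected_metrics
    for algo in excluded_algos:
        assert not lb['model_id'].str.contains(algo).any(), algo + " should have been excluded"
    values = lb[sort_metric].tolist()
    assert values == sorted(values, reverse=descending), "leaderboard is not sorted by " + sort_metric


def check_model_property(model_names, prop_name, present=True, actual_value=None):
    for mn in model_names:
        model = h2o.get_model(mn)
        if present:
            assert prop_name in model.params
            assert model.params[prop_name]['actual'] == actual_value, \
                "unexpected actual value for {} in model {}".format(prop_name, mn)
        else:
            assert prop_name not in model.params


def get_partitioned_model_names(leaderboard):
    import collections
    Models = collections.namedtuple('Models', ['all', 'base', 'se'])
    all_names = [leaderboard[i, 0] for i in range(leaderboard.nrows)]
    se = [m for m in all_names if 'StackedEnsemble' in m]
    return Models(all=all_names, base=[m for m in all_names if m not in se], se=se)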


def test_remove_automl_after_individual_manual_deletions():
    ds = import_dataset()
    project_name = 'aml_no_xval_remove_test'
    max_models = 3
    aml = H2OAutoML(project_name=project_name, nfolds=0, max_models=max_models, seed=1)
    aml.train(y=ds.target, training_frame=ds.train, blending_frame=ds.valid)

    keys = list_keys_in_memory()
    # Manually remove the first item in each category to verify the robustness of
    # global AutoML deletion, e.g. that exceptions (if any) are handled correctly
    # when AutoML tries to remove a base model that was already removed.
    for k, v in keys.items():
        if k == 'all':
            continue
        if len(v) > 0:
            h2o.remove(v[0])

    h2o.remove(aml)
    clean = list_keys_in_memory()
    print(clean['all'].values)
    assert aml.key.startswith(project_name)
    assert not contains_leaderboard(aml.key, clean)
    assert not contains_event_log(aml.key, clean)
    assert len(clean['models_base']) == 0
    assert len(clean['cv_models']) == 0
    assert len(clean['models_all']) == 0
    assert len(clean['metrics']) == 0
    assert len(clean['predictions']) == 0
    assert len(clean['automl']) == 0
    for frame in [ds.train, ds.valid, ds.test]:
        assert frame_in_cluster(frame), "frame {} has been removed from cluster".format(frame.frame_id)
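

# The removal tests above and below rely on key-inspection helpers defined
# elsewhere in the suite. The sketches below are assumptions built on
# `h2o.ls()`, which lists every key in the cluster; the exact regexes and
# key-matching scheme used by the original helpers may differ.
def list_keys_in_memory():
    mem_keys = h2o.ls().key

    def matching(pattern):
        # reset the index so positional access like keys[k][0] keeps working
        return mem_keys[mem_keys.str.contains(pattern, regex=True)].reset_index(drop=True)

    cv_models = matching(r'_cv_')
    models_all = matching(r'^(DRF|DeepLearning|GBM|GLM|StackedEnsemble|XGBoost|XRT)_')
    return dict(
        all=mem_keys,
        automl=matching(r'AutoML'),
        models_all=models_all,
        models_base=models_all[~models_all.isin(cv_models)].reset_index(drop=True),
        cv_models=cv_models,
        metalearners=matching(r'^metalearner'),
        predictions=matching(r'^prediction'),
        metrics=matching(r'^modelmetrics'),
    )


def contains_leaderboard(automl_key, keys):
    # assumption: the leaderboard frame key embeds the automl key
    return any('leaderboard' in k.lower() and automl_key in k for k in keys['all'])


def contains_event_log(automl_key, keys):
    return any('event' in k.lower() and automl_key in k for k in keys['all'])


def frame_in_cluster(frame):
    return frame.frame_id in h2o.ls().key.values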


def test_AUTO_stopping_metric_with_no_sorting_metric_binary():
    print("Check leaderboard with AUTO stopping metric and no sorting metric for binary")
    ds = import_dataset('binary', split=False)
    exclude_algos = ["DeepLearning", "GLM", "StackedEnsemble"]
    aml = H2OAutoML(project_name="py_aml_lb_test_auto_stopping_metric_no_sorting_binary",
                    seed=automl_seed,
                    max_models=10, nfolds=2,
                    stopping_rounds=1, stopping_tolerance=0.5,
                    exclude_algos=exclude_algos)
    aml.train(y=ds.target, training_frame=ds.train)
    check_leaderboard(aml, exclude_algos,
                      ["auc", "logloss", "aucpr", "mean_per_class_error", "rmse", "mse"],
                      "auc", True)
    base = get_partitioned_model_names(aml.leaderboard).base
    first = [m for m in base if 'XGBoost_1' in m]
    others = [m for m in base if m not in first]
    # if stopping_rounds == 0, the actual value of stopping_metric is set to None
    check_model_property(first, 'stopping_metric', True, None)
    check_model_property(others, 'stopping_metric', True, "logloss")


def test_SE_retraining_fails_when_param_disabled():
    print("\n=== disabling " + kcvp + " and retraining ===")
    total_runs = 4
    aml = setup_and_train(False)  # first run
    first_models = get_partitioned_model_names(aml.leaderboard)
    first_bof = next(m for m in first_models.se if re.search(r'_BestOfFamily_', m))
    ds = import_dataset()
    for i in range(total_runs - 1):
        aml.train(y=ds.target, training_frame=ds.train)
    models = get_partitioned_model_names(aml.leaderboard)
    first_se_all_models = [m for m in first_models.se if re.search(r'_AllModels_', m)]
    se_all_models = [m for m in models.se if re.search(r'_AllModels_', m)]
    se_best_of_family = [m for m in models.se if re.search(r'_BestOfFamily_', m)]

    lb = aml.leaderboard
    print(lb.head(lb.nrows))

    assert len(models.se) == len(se_all_models) + len(se_best_of_family)
    assert len(se_all_models) == len(first_se_all_models), \
        "expecting only the {} first StackedEnsemble_AllModels, but got {}".format(len(first_se_all_models), len(se_all_models))
    assert se_all_models[0] in first_models.se, "first StackedEnsemble_AllModels got replaced by new one"
    if len(se_best_of_family) > 1:
        assert first_bof in se_best_of_family, "first StackedEnsemble_BestOfFamily disappeared after multiple runs"
        row_of = lambda id: lb[lb['model_id'] == id]
        first_bof_row = row_of(first_bof)
        assert all(all(row[i] == first_bof_row[i] for i in range(1, lb.ncols)) for row in [row_of(se) for se in se_best_of_family]), \
            "expecting possibly 2+ similar StackedEnsemble_BestOfFamily (corner case), but managed to obtain 2 different ones!"
    else:
        assert len(se_best_of_family) == 1, "expecting only the first StackedEnsemble_BestOfFamily, but got {}".format(len(se_best_of_family))
        assert se_best_of_family[0] == first_bof, "first StackedEnsemble_BestOfFamily got replaced by new one"
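

# `setup_and_train` (used above) is defined elsewhere in the original test
# module. A plausible sketch, assuming its boolean argument toggles the
# `keep_cross_validation_predictions` (kcvp) param that the test disables;
# the project name and model count here are hypothetical:
def setup_and_train(keep_cv_predictions=True):
    ds = import_dataset()
    aml = H2OAutoML(project_name='py_se_retraining',
                    keep_cross_validation_predictions=keep_cv_predictions,
                    max_models=3,
                    seed=1)
    aml.train(y=ds.target, training_frame=ds.train)
    return aml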


def test_workaround_for_distribution():
    try:
        h2o.rapids("(setproperty \"{}\" \"{}\")".format("sys.ai.h2o.automl.algo_parameters.all.enabled", "true"))
        ds = import_dataset('regression')
        aml = H2OAutoML(project_name="py_test",
                        algo_parameters=dict(
                            distribution='poisson',
                            family='poisson',
                        ),
                        exclude_algos=['StackedEnsemble'],
                        max_runtime_secs=60,
                        seed=1)
        aml.train(y=ds.target, training_frame=ds.train)
        model_names = [aml.leaderboard[i, 0] for i in range(aml.leaderboard.nrows)]
        for mn in model_names:
            m = h2o.get_model(mn)
            dist = m.params['distribution'] if 'distribution' in m.params \
                else m.params['family'] if 'family' in m.params \
                else None
            print("{}: distribution = {}".format(mn, dist))
    finally:
        # reset the property regardless of the outcome (a bare `except:` here would
        # leave it enabled on success and silently swallow failures)
        h2o.rapids("(setproperty \"{}\" \"{}\")".format("sys.ai.h2o.automl.algo_parameters.all.enabled", "false"))


def test_algo_parameter_can_be_applied_only_to_a_specific_algo():
    ds = import_dataset()
    aml = H2OAutoML(project_name="py_specific_algo_param",
                    algo_parameters=dict(GBM__monotone_constraints=dict(AGE=1)),
                    max_models=6,
                    seed=1)
    aml.train(y=ds.target, training_frame=ds.train)
    model_names = get_partitioned_model_names(aml.leaderboard).all
    models_supporting_monotone_constraints = [n for n in model_names if re.match(r"GBM|XGBoost", n)]
    assert next((m for m in models_supporting_monotone_constraints if m.startswith('GBM')), None), \
        "There should be at least one GBM model"
    for m in models_supporting_monotone_constraints:
        model = h2o.get_model(m)
        mc_value = next(v['actual'] for n, v in model.params.items() if n == 'monotone_constraints')
        if m.startswith('GBM'):
            assert isinstance(mc_value, list)
            age = next((v for v in mc_value if v['key'] == 'AGE'), None)
            assert age is not None
            assert age['value'] == 1.0
        else:
            assert mc_value is None


def test_stacked_ensembles_are_trained_with_blending_frame_even_if_nfolds_eq_0():
    print("Check that we can disable cross-validation when passing a blending frame "
          "and that Stacked Ensembles are trained using this frame.")
    max_models = 5
    ds = import_dataset()
    aml = H2OAutoML(project_name="py_aml_blending_frame", seed=1, max_models=max_models, nfolds=0)
    aml.train(y=ds.target, training_frame=ds.train, blending_frame=ds.valid, leaderboard_frame=ds.test)

    se = get_partitioned_model_names(aml.leaderboard).se
    assert len(se) > 3, "In blending mode, StackedEnsemble should still be trained in spite of nfolds=0."
    for m in se:
        model = h2o.get_model(m)
        assert model.params['blending_frame']['actual']['name'] == ds.valid.frame_id
        assert model._model_json['output']['stacking_strategy'] == 'blending'


def test_exploitation_impacts_exploration_duration():
    ds = import_dataset()
    planned_duration = 60
    aml = H2OAutoML(project_name="py_exploitation_ratio_max_runtime",
                    exploitation_ratio=.5,  # excessive ratio on purpose, due to training overheads in multinode
                    exclude_algos=['DeepLearning', 'XGBoost'],  # removing some algos for the same reason
                    max_runtime_secs=planned_duration,
                    seed=1,
                    verbosity='info')
    aml.train(y=ds.target, training_frame=ds.train)
    automl_start = int(aml.training_info['start_epoch'])
    assert 'start_GBM_lr_annealing' in aml.training_info
    # assert 'start_XGBoost_lr_search' in aml.training_info
    first_exploitation_step = 'start_GBM_lr_annealing'
    after_exploitation_step = 'start_completion_GBM_grid_1'
    if first_exploitation_step in aml.training_info and after_exploitation_step in aml.training_info:
        exploitation_start = int(aml.training_info[first_exploitation_step])
        exploration_duration = exploitation_start - automl_start
        after_start = int(aml.training_info[after_exploitation_step])
        exploitation_duration = after_start - exploitation_start
        # can't reliably check the duration ratio
        assert 0 < exploration_duration < planned_duration
        print(aml.leaderboard)
        print(exploitation_duration)
        print(exploration_duration)
        assert 0 < exploitation_duration < exploration_duration
    else:
        print(aml.leaderboard)
        print("budget time was too small to start and complete exploitation")


def test_no_x_train_and_validation_and_test_sets():
    print("AutoML run with x not provided with train, valid, and test")
    ds = import_dataset()
    aml = H2OAutoML(project_name="py_aml4",
                    stopping_rounds=3, stopping_tolerance=0.001, stopping_metric="AUC",
                    max_models=max_models,
                    seed=1234,
                    nfolds=0)
    aml.train(y=ds.target, training_frame=ds.train, validation_frame=ds.valid, leaderboard_frame=ds.test)
    assert aml.project_name == "py_aml4", "Project name is not set"
    assert aml.stopping_rounds == 3, "stopping_rounds is not set to 3"
    assert aml.stopping_tolerance == 0.001, "stopping_tolerance is not set to 0.001"
    assert aml.stopping_metric == "AUC", "stopping_metric is not set to `AUC`"
    assert aml.max_models == 2, "max_models is not set to 2"
    assert aml.seed == 1234, "seed is not set to `1234`"

    log_df = aml.event_log.as_data_frame()
    warn_messages = log_df[log_df['level'] == 'WARN']['message']
    assert not warn_messages.str.startswith("User specified a validation frame with cross-validation still enabled").any(), \
        "no warning should have been raised as CV was disabled"
    print("Check leaderboard")
    print(aml.leaderboard)


def test_remove_automl_with_xval_when_keeping_all_cv_details():
    ds = import_dataset()
    project_name = 'aml_with_xval_remove_test'
    max_models = 5
    nfolds = 5
    aml = H2OAutoML(project_name=project_name, nfolds=nfolds, max_models=max_models, seed=1,
                    keep_cross_validation_predictions=True,
                    keep_cross_validation_fold_assignment=True,
                    keep_cross_validation_models=True)
    aml.train(y=ds.target, training_frame=ds.train)

    keys = list_keys_in_memory()
    # print(keys['all'].values)
    assert aml.key.startswith(project_name)
    assert contains_leaderboard(aml.key, keys)
    assert contains_event_log(aml.key, keys)
    # keeping cv models, so metalearners include cv models
    num_SEs = len(keys['metalearners']) / (nfolds + 1)
    print({k: len(v) for k, v in keys.items()})
    expectations = dict(
        models_base=max_models + num_SEs,
        cv_models=(max_models + num_SEs) * nfolds,  # 1 cv model per fold for all models, incl. SEs
        predictions=(
            len(keys['cv_models'])      # cv predictions
            + len(keys['models_base'])  # cv holdout predictions
        ),
        metrics=(
            len(keys['cv_models']) * 3    # for each cv model: 1 on training frame, 1 on validation frame (=training for cv), 1 on adapted frame (to be removed with PUBDEV-6638)
            + len(keys['models_base'])    # for each model: 1 on training_frame
            + (num_SEs * 1)               # for each SE: 1 on levelone training
            + (1 if any("DeepLearning" in x for x in keys["metrics"]) else 0)  # DeepLearning has 2 training metrics (IDK why)
        )
    )
    for k, v in expectations.items():
        assert len(keys[k]) == v, "expected {} {}, but got {}".format(v, k, len(keys[k]))

    h2o.remove(aml)
    clean = list_keys_in_memory()
    print(clean['all'].values)
    assert not contains_leaderboard(aml.key, clean)
    assert not contains_event_log(aml.key, clean)
    assert len(clean['models_base']) == 0
    assert len(clean['cv_models']) == 0
    assert len(clean['models_all']) == 0
    assert len(clean['predictions']) == 0
    assert len(clean['metrics']) == 0
    assert len(clean['automl']) == 0
    for frame in [ds.train, ds.valid, ds.test]:
        assert frame_in_cluster(frame), "frame {} has been removed from cluster".format(frame.frame_id)


def test_max_runtime_secs_alone():
    ds = import_dataset()
    aml = H2OAutoML(project_name="py_max_runtime_secs", seed=1, max_runtime_secs=7)
    aml.train(y=ds.target, training_frame=ds.train)
    max_runtime = aml._build_resp['build_control']['stopping_criteria']['max_runtime_secs']
    max_models = aml._build_resp['build_control']['stopping_criteria']['max_models']
    assert max_runtime == 7
    assert max_models == 0


def test_train_returns_leader_model():
    ds = import_dataset()
    aml = H2OAutoML(max_models=3, project_name="py_aml_train_result", seed=42)
    model = aml.train(y=ds.target, training_frame=ds.train)
    assert isinstance(model, ModelBase)
    assert model.key == aml.leader.key
    model.predict(ds.test)


def test_stacked_ensembles_are_trained_after_max_models():
    print("Check that Stacked Ensembles are still trained after max models have been trained")
    ds = import_dataset()
    aml = H2OAutoML(project_name="py_aml_SE_after_max_models", seed=1, max_models=5)
    aml.train(y=ds.target, training_frame=ds.train)
    se = get_partitioned_model_names(aml.leaderboard).se
    assert len(se) == 2, "StackedEnsemble should still be trained after max models have been reached"


def test_no_time_limit_if_max_models_is_provided():
    ds = import_dataset()
    aml = H2OAutoML(project_name="py_no_time_limit", seed=1, max_models=1)
    aml.train(y=ds.target, training_frame=ds.train)
    max_runtime = aml._build_resp['build_control']['stopping_criteria']['max_runtime_secs']
    max_models = aml._build_resp['build_control']['stopping_criteria']['max_models']
    assert max_models == 1, max_models
    assert max_runtime == 0, max_runtime


def test_automl_stops_after_max_models():
    print("Check that AutoML gets interrupted after `max_models`")
    ds = import_dataset()
    max_models = 5
    aml = H2OAutoML(project_name="py_aml_max_models", seed=1, max_models=max_models)
    aml.train(y=ds.target, training_frame=ds.train)
    base_models = get_partitioned_model_names(aml.leaderboard).base
    assert len(base_models) == max_models, "obtained {} base models when {} are expected".format(len(base_models), max_models)


def test_exploitation_disabled():
    ds = import_dataset()
    aml = H2OAutoML(project_name="py_exploitation_ratio_disabled",
                    exploitation_ratio=.0,
                    max_models=6,
                    seed=1)
    aml.train(y=ds.target, training_frame=ds.train)
    assert 'start_GBM_lr_annealing' not in aml.training_info
    assert 'start_XGBoost_lr_search' not in aml.training_info


def test_default_automl_with_binary_task():
    ds = import_dataset('binary')
    aml = H2OAutoML(max_models=2, project_name='aml_binary')
    aml.train(y=ds.target, training_frame=ds.train, validation_frame=ds.valid, leaderboard_frame=ds.test)
    print(aml.leader)
    print(aml.leaderboard)
    assert aml.leaderboard.columns == ["model_id", "auc", "logloss", "aucpr", "mean_per_class_error", "rmse", "mse"]


def test_remove_automl_with_xval():
    ds = import_dataset()
    project_name = 'aml_with_xval_remove_test'
    max_models = 5
    nfolds = 5
    aml = H2OAutoML(project_name=project_name, nfolds=nfolds, max_models=max_models, seed=1)
    aml.train(y=ds.target, training_frame=ds.train, validation_frame=ds.valid, leaderboard_frame=ds.test)

    keys = list_keys_in_memory()
    assert aml.key.startswith(project_name)
    assert contains_leaderboard(aml.key, keys)
    assert contains_event_log(aml.key, keys)
    num_SEs = len(keys['metalearners'])
    print({k: len(v) for k, v in keys.items()})
    expectations = dict(
        models_base=max_models + num_SEs,
        cv_models=0,
        predictions=0,
        metrics=(
            max_models * 3   # for each non-SE model: 1 on training_frame, 1 on validation_frame, 1 on leaderboard_frame
            + (num_SEs * 2)  # for each SE model: 1 on training frame, 1 on leaderboard frame
            + (num_SEs * 2)  # for each SE metalearner: 1+1 on levelone training+validation
            + (1 if any("DeepLearning" in x for x in keys["metrics"]) else 0)  # DeepLearning has 2 training metrics (IDK why)
        )
    )
    for k, v in expectations.items():
        assert len(keys[k]) == v, "expected {} {}, but got {}".format(v, k, len(keys[k]))

    h2o.remove(aml)
    clean = list_keys_in_memory()
    print(clean['all'].values)
    assert not contains_leaderboard(aml.key, clean)
    assert not contains_event_log(aml.key, clean)
    assert len(clean['models_base']) == 0
    assert len(clean['cv_models']) == 0
    assert len(clean['models_all']) == 0
    assert len(clean['predictions']) == 0
    assert len(clean['metrics']) == 0
    assert len(clean['automl']) == 0
    for frame in [ds.train, ds.valid, ds.test]:
        assert frame_in_cluster(frame), "frame {} has been removed from cluster".format(frame.frame_id)


def test_max_runtime_secs_can_be_set_in_combination_with_max_models_and_max_runtime_wins():
    ds = import_dataset()
    aml = H2OAutoML(project_name="py_all_stopping_constraints", seed=1, max_models=20, max_runtime_secs=12)
    aml.train(y=ds.target, training_frame=ds.train)
    max_runtime = aml._build_resp['build_control']['stopping_criteria']['max_runtime_secs']
    max_models = aml._build_resp['build_control']['stopping_criteria']['max_models']
    assert max_runtime == 12
    assert max_models == 20
    assert aml.leaderboard.nrows < 20
    assert int(aml.training_info['duration_secs']) < 2 * max_runtime  # being generous to avoid errors on slow Jenkins


def test_cannot_set_unauthorized_algo_parameter():
    ds = import_dataset()
    aml = H2OAutoML(project_name="py_unauthorized_algo_param",
                    algo_parameters=dict(score_tree_interval=7),
                    max_models=6,
                    seed=1)
    try:
        aml.train(y=ds.target, training_frame=ds.train)
        assert False, "should have raised"
    except h2o.exceptions.H2OResponseError as e:
        assert "algo_parameters: score_tree_interval" in str(e)


def test_params_can_be_set_as_attributes():
    aml = H2OAutoML()
    aml.max_models = 4
    aml.seed = 42
    aml.exclude_algos = ['StackedEnsemble']
    ds = import_dataset()
    aml.train(y=ds.target, training_frame=ds.train, validation_frame=ds.valid)
    assert aml.leaderboard.nrows == aml.max_models == 4
    assert aml.project_name is not None


def test_warn_on_empty_leaderboard():
    ds = import_dataset()
    aml = H2OAutoML(project_name="test_empty_leaderboard",
                    include_algos=[],
                    seed=1234)
    aml.train(y=ds.target, training_frame=ds.train)
    assert aml.leaderboard.nrow == 0
    warnings = aml.event_log[aml.event_log['level'] == 'WARN', 'message']
    last_warning = warnings[warnings.nrow - 1, :].flatten()
    assert "Empty leaderboard" in last_warning


def test_nfolds_eq_0():
    print("Check nfolds = 0 works properly")
    ds = import_dataset()
    aml = H2OAutoML(project_name="py_aml_nfolds0",
                    nfolds=0,
                    max_models=3,
                    seed=1)
    aml.train(y=ds.target, training_frame=ds.train)
    base_models = get_partitioned_model_names(aml.leaderboard).base
    amodel = h2o.get_model(base_models[0])
    assert amodel.params['nfolds']['actual'] == 0


def test_include_algos():
    print("AutoML trains only models for algos listed in include_algos")
    ds = import_dataset()
    aml = H2OAutoML(project_name="py_include_algos",
                    include_algos=['GBM'],
                    max_models=max_models,
                    seed=1)
    aml.train(y=ds.target, training_frame=ds.train, validation_frame=ds.valid)
    models = get_partitioned_model_names(aml.leaderboard)
    assert all('GBM' in name for name in models.base)
    assert len(models.se) == 0, "No StackedEnsemble should have been trained as it was not explicitly listed in include_algos"