def test_get_automl():
    """get_automl() must return a view equivalent to the trained AutoML instance."""
    train = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    y = 'CAPSULE'
    train[y] = train[y].asfactor()

    aml = H2OAutoML(project_name="test_get_automl", max_models=2, seed=1234)
    aml.train(y=y, training_frame=train)
    reloaded = get_automl(aml.project_name)

    # Dict-style access must mirror the original run.
    assert aml.project_name == reloaded["project_name"]
    assert aml.leader.model_id == reloaded["leader"].model_id
    assert aml.leaderboard.get_frame_data() == reloaded["leaderboard"].get_frame_data()
    assert aml.event_log.get_frame_data() == reloaded["event_log"].get_frame_data()
    assert aml.training_info == reloaded['training_info']

    # PUBDEV-6599: attribute-style access must work as well.
    assert aml.project_name == reloaded.project_name
    assert aml.leader.model_id == reloaded.leader.model_id
    assert aml.leaderboard.frame_id == reloaded.leaderboard.frame_id
    assert aml.event_log.frame_id == reloaded.event_log.frame_id
    assert aml.training_info == reloaded.training_info

    # Test predictions: both handles must score identically.
    predictions = aml.predict(train)
    predictions_from_output = reloaded.predict(train)
    assert (predictions == predictions_from_output).all()

    # Test get_leaderboard PUBDEV-7454
    assert (get_leaderboard(aml) == get_leaderboard(reloaded)).all()
    assert (get_leaderboard(aml, 'ALL') == get_leaderboard(reloaded, 'ALL')).all()
def test_get_automl():
    """Round-trip an AutoML run through get_automl() and verify equivalence."""
    ds = import_dataset()
    aml = H2OAutoML(project_name="test_get_automl", max_models=2, seed=1234)
    aml.train(y=ds.target, training_frame=ds.train)

    reloaded = get_automl(aml.project_name)

    # Dict-style access on the returned object.
    assert aml.project_name == reloaded["project_name"]
    assert aml.leader.model_id == reloaded["leader"].model_id
    assert aml.leaderboard.get_frame_data() == reloaded["leaderboard"].get_frame_data()
    assert aml.event_log.get_frame_data() == reloaded["event_log"].get_frame_data()
    assert aml.training_info == reloaded['training_info']

    # PUBDEV-6599: attribute-style access on the returned object.
    assert aml.project_name == reloaded.project_name
    assert aml.leader.model_id == reloaded.leader.model_id
    assert aml.leaderboard.frame_id == reloaded.leaderboard.frame_id
    assert aml.event_log.frame_id == reloaded.event_log.frame_id
    assert aml.training_info == reloaded.training_info

    # Test predictions on the held-out frame.
    predictions = aml.predict(ds.test)
    predictions_from_output = reloaded.predict(ds.test)
    assert (predictions == predictions_from_output).all()

    # Test get_leaderboard PUBDEV-7454
    assert (get_leaderboard(aml) == get_leaderboard(reloaded)).all()
    assert (get_leaderboard(aml, 'ALL') == get_leaderboard(reloaded, 'ALL')).all()
def test_custom_leaderboard():
    """get_leaderboard() extra_columns handling: [], 'ALL', unknowns, subsets."""
    print("Check custom leaderboard")
    ds = prepare_data('binomial')
    aml = H2OAutoML(project_name="py_aml_custom_lb_test", max_models=5, seed=automl_seed)
    aml.train(y=ds.target, training_frame=ds.train)

    base_cols = ["model_id", "auc", "logloss", "aucpr", "mean_per_class_error", "rmse", "mse"]
    ext_cols = ["training_time_ms", "predict_time_per_row_ms"]

    # Default leaderboard and no-extras calls expose only the metric columns.
    assert aml.leaderboard.names == base_cols
    assert get_leaderboard(aml).names == base_cols
    assert get_leaderboard(aml, extra_columns=[]).names == base_cols
    # 'ALL' appends every known extension; unknown names are ignored.
    assert get_leaderboard(aml, extra_columns='ALL').names == base_cols + ext_cols
    assert get_leaderboard(aml, extra_columns="unknown").names == base_cols
    assert get_leaderboard(aml, extra_columns=["training_time_ms"]).names == base_cols + ["training_time_ms"]
    # Requested extensions keep the caller's ordering.
    both = ["predict_time_per_row_ms", "training_time_ms"]
    assert get_leaderboard(aml, extra_columns=both).names == base_cols + both
    assert get_leaderboard(aml, extra_columns=["unknown", "training_time_ms"]).names == base_cols + ["training_time_ms"]

    lb_ext = get_leaderboard(aml, extra_columns='ALL')
    print(lb_ext)
    assert all(lb_ext[:, 1:].isnumeric()), "metrics and extension columns should all be numeric"
    assert (lb_ext["training_time_ms"].as_data_frame().values >= 0).all()
    assert (lb_ext["predict_time_per_row_ms"].as_data_frame().values > 0).all()
def test_optional_SEs_trained_by_default_when_no_time_limit():
    """With no runtime budget, only the first-group default SE plus optional SEs run."""
    ds = import_dataset()
    aml = H2OAutoML(project_name="py_SEs_with_no_time_limit", seed=1, max_models=3)
    aml.train(y=ds.target, training_frame=ds.train)

    lb = get_leaderboard(aml, ['provider', 'step']).as_data_frame()
    se_steps = lb.query("provider == 'StackedEnsemble'").step.to_list()

    assert len(se_steps) > 1
    assert 'best_of_family_1' in se_steps, "default SE for first group is missing"
    # Every other per-group SE step must have been skipped in favor of optional ones.
    for skipped in ('best_of_family_2', 'all_1', 'all_2'):
        assert skipped not in se_steps, 'all other SEs should be optional ones'
def train(data_path, max_models, model_name):
    """Run H2O AutoML on the dataset at *data_path* and log everything to MLflow.

    :param data_path: path handed to prepare_data() to build train/test frames.
    :param max_models: cap on the number of models AutoML may build.
    :param model_name: registered-model name for the leader in the MLflow registry.
    """
    # BUG FIX: the original ignored its own parameters and read the module-level
    # `args` object (args.data_path / args.model_name), so the function could not
    # be called without that global and the parameters were dead.
    train_data, test_data, train_cols = prepare_data(data_path)
    # NOTE(review): the original dead-assigned train_cols[:-1] here and then
    # immediately overwrote it; the response column is simply "quality".
    target_col = "quality"

    with mlflow.start_run() as run:
        print("run_id:", run.info.run_id)
        model = H2OAutoML(max_models=max_models, max_runtime_secs=300, seed=24, nfolds=6)
        model.train(x=train_cols, y=target_col,
                    training_frame=train_data, validation_frame=test_data)

        mlflow.log_param("max_models", max_models)
        mlflow.log_metric("rmse", model.leader.rmse())

        mlflow.set_tag("mlflow_version", mlflow.__version__)
        mlflow.set_tag("h2o_version", h2o.__version__)
        mlflow.set_tag("model.leader.class", qname(model.leader.__class__))
        mlflow.set_tag("model.leader.estimator_type", model.leader._estimator_type)
        mlflow.set_tag("num_leaderboard_models", model.leaderboard.nrows)

        # Export the extended leaderboard as a CSV artifact.
        lb = get_leaderboard(model, extra_columns='ALL')
        print(lb)
        path = "leaderboard.csv"
        h2o.export_file(lb, path=path, force=True)
        mlflow.log_artifact(path)

        # Also log a pretty-printed text rendering of the leaderboard.
        from tabulate import tabulate
        df = lb.as_data_frame()
        table = tabulate(df, headers="keys", tablefmt="psql", showindex=False)
        path = "leaderboard.txt"
        with open(path, "w") as f:
            f.write(table)
        mlflow.log_artifact(path)

        # Plain list of model ids, one per line, for downstream scripting.
        df = df[["model_id"]]
        with open("models.csv", "w") as f:
            df.to_csv(f, index=False, header=False)
        mlflow.log_artifact("models.csv")

        mlflow.h2o.log_model(model.leader, "h2o-model",
                             registered_model_name=model_name)
def test_smoke_automl():
    """Smoke run: enough models to exercise every AutoML step end to end."""
    # enough models to run every step (all base models, all grids, all SEs...)
    nmodels = 20
    ds = import_dataset()
    aml = H2OAutoML(project_name="py_aml_smoke",
                    max_models=nmodels,
                    nfolds=3,
                    stopping_tolerance=0.5,
                    stopping_rounds=2,
                    seed=42,
                    verbosity='debug')
    leader = aml.train(y=ds.target, training_frame=ds.train)
    assert isinstance(leader, ModelBase)

    lb = get_leaderboard(aml, ['algos', 'provider', 'step', 'group'])
    print(lb)
    # SEs push the leaderboard beyond the base-model budget.
    assert lb.nrows > nmodels
def test_optional_SEs_not_trained_in_reproducible_mode():
    """In sequential reproducible mode only the two final SEs should be built."""
    ds = import_dataset()
    # 2 base models in group 1, 1 in group 2
    aml = H2OAutoML(project_name="py_SEs_reproducible_mode",
                    seed=1,
                    max_runtime_secs=30,
                    max_models=3,
                    include_algos=['StackedEnsemble', 'GLM', 'GBM'])
    aml.train(y=ds.target, training_frame=ds.train)

    lb = get_leaderboard(aml, ['provider', 'step']).as_data_frame()
    print(lb)
    se_steps = lb.query("provider == 'StackedEnsemble'").step.to_list()

    assert len(se_steps) == 2
    # No per-group SEs in sequential reproducible mode.
    for idx, ordinal in enumerate(('first', 'second', 'third'), start=1):
        assert 'best_of_family_%d' % idx not in se_steps, \
            "no SE should be built for %s group (sequential reproducible mode)" % ordinal
    assert 'best_of_family_xglm' in se_steps, "final SE is missing"
    assert 'all_xglm' in se_steps, "final SE is missing"
    assert 'best_of_family_gbm' not in se_steps, 'no optional SE should be trained (sequential reproducible mode)'
def test_custom_leaderboard_as_method():
    """aml.get_leaderboard() must match the module-level get_leaderboard()."""
    ds = import_dataset('binary')
    aml = H2OAutoML(project_name="py_aml_custom_lb_method_test", max_models=5, seed=42)
    aml.train(y=ds.target, training_frame=ds.train)

    expected_default = aml.leaderboard.as_data_frame()
    expected_ext = get_leaderboard(aml, extra_columns='ALL').as_data_frame()

    # Method on the live instance.
    assert_frame_equal(aml.get_leaderboard().as_data_frame(), expected_default)
    assert_frame_equal(aml.get_leaderboard('ALL').as_data_frame(), expected_ext)

    # Method on an instance re-fetched from the backend.
    reloaded = get_automl(aml.project_name)
    assert_frame_equal(reloaded.get_leaderboard().as_data_frame(), expected_default)
    assert_frame_equal(reloaded.get_leaderboard('ALL').as_data_frame(), expected_ext)
def test_optional_SEs_trained_in_non_reproducible_mode():
    """With a time budget but no max_models, optional SEs should be trained.

    Groups are sized so that group 1 has a single base model (no SE possible)
    while groups 2 and 3 each get a best_of_family SE.
    """
    ds = import_dataset()
    # 1 base model in each group: 1, 2, 3
    aml = H2OAutoML(project_name="py_SEs_non_reproducible_mode",
                    seed=1,
                    max_runtime_secs=30,
                    include_algos=['StackedEnsemble', 'GLM', 'DRF'])
    aml.train(y=ds.target, training_frame=ds.train)

    lb = get_leaderboard(aml, ['provider', 'step']).as_data_frame()
    print(lb)
    steps_SE = lb.query("provider == 'StackedEnsemble'").step.to_list()

    assert len(steps_SE) > 2
    # BUG FIX: corrected the typo "nodel" -> "model" in the assertion message.
    assert 'best_of_family_1' not in steps_SE, "no SE should be built for first group (1 base model only)"
    assert 'best_of_family_2' in steps_SE, 'SE best_of_family from group 2 is missing'
    assert 'best_of_family_3' in steps_SE, 'SE best_of_family from group 3 is missing'
    assert 'best_of_family_4' not in steps_SE, 'all other SEs should be optional ones'
    assert 'all_1' not in steps_SE, 'all other SEs should be optional ones'
    assert 'all_2' not in steps_SE, 'all other SEs should be optional ones'
    assert 'all_3' not in steps_SE, 'all other SEs should be optional ones'
    assert 'best_of_family_gbm' in steps_SE, 'optional SE best_of_family should have been trained'
def test_custom_leaderboard():
    """extra_columns handling including the non-numeric 'algo' extension."""
    print("Check custom leaderboard")
    ds = import_dataset('binary')
    aml = H2OAutoML(project_name="py_aml_custom_lb_test", max_models=5, seed=42)
    aml.train(y=ds.target, training_frame=ds.train)

    base_cols = ["model_id", "auc", "logloss", "aucpr", "mean_per_class_error", "rmse", "mse"]

    assert aml.leaderboard.names == base_cols
    assert get_leaderboard(aml).names == base_cols
    assert get_leaderboard(aml, extra_columns=[]).names == base_cols
    # 'ALL' appends every extension, including the categorical 'algo' column.
    assert get_leaderboard(aml, extra_columns='ALL').names == base_cols + [
        "training_time_ms", "predict_time_per_row_ms", "algo"
    ]
    # Unknown extension names are silently dropped.
    assert get_leaderboard(aml, extra_columns="unknown").names == base_cols
    assert get_leaderboard(aml, extra_columns=["training_time_ms"]).names == base_cols + ["training_time_ms"]
    requested = ["predict_time_per_row_ms", "training_time_ms"]
    assert get_leaderboard(aml, extra_columns=requested).names == base_cols + requested
    assert get_leaderboard(aml, extra_columns=["unknown", "training_time_ms"]).names == base_cols + ["training_time_ms"]

    lb_ext = get_leaderboard(aml, extra_columns='ALL')
    print(lb_ext)
    numeric_cols = [c for c in lb_ext.columns if c not in ("model_id", "algo")]
    assert all(lb_ext[:, numeric_cols].isnumeric()), "metrics and extension columns should all be numeric"
    assert (lb_ext["training_time_ms"].as_data_frame().values >= 0).all()
    assert (lb_ext["predict_time_per_row_ms"].as_data_frame().values > 0).all()
    known_algos = ["DRF", "DeepLearning", "GBM", "GLM", "StackedEnsemble", "XGBoost"]
    assert (lb_ext["algo"].as_data_frame().isin(known_algos).all().all())
from h2o.automl import H2OAutoML, get_leaderboard

# NOTE(review): `train` and `test` H2OFrames are assumed to be created earlier
# in this script (not visible in this chunk) — confirm before running standalone.
h2o.init()

x = train.columns
y = "Label"
x.remove(y)

# Run AutoML with up to 20 base models; seed for reproducibility of the run setup.
aml = H2OAutoML(max_models=20, seed=1)
aml.train(x=x, y=y, training_frame=train)

# AutoML Leaderboard
lb = aml.leaderboard
# Optionally add extra model information to the leaderboard
lb = get_leaderboard(aml, extra_columns='ALL')
# Print all rows (instead of default 10 rows)
lb.head(rows=lb.nrows)
# The leader model is stored here
aml.leader
# If you need to generate predictions on a test set, you can make
# predictions directly on the `"H2OAutoML"` object, or on the leader
# model object directly
preds = aml.predict(test)
# or: preds = aml.leader.predict(test)
'MarkDown3', 'MarkDown4', 'MarkDown5', 'CPI', 'Unemployment', 'IsHoliday', 'Dept', 'A', 'B', 'C', 'Day', 'Month', 'Year' ] Y = 'Weekly_Sales' algos = ['DRF', 'XGBoost', 'GBM', 'DeepLearning', 'StackedEnsemble'] #aml = H2OAutoML(max_models=30, max_runtime_secs=300, seed=1) aml = H2OAutoML(max_runtime_secs=300, seed=1, include_algos=algos) h2o_frame = h2o.H2OFrame(train) aml.train(x=X, y=Y, training_frame=h2o_frame) # AutoML Leaderboard lb1 = aml.leaderboard.as_data_frame() # Optionally edd extra model information to the leaderboard lb = get_leaderboard(aml, extra_columns='ALL').as_data_frame() # Print all rows (instead of default 10 rows) lb.as_data_frame().head() # The leader model is stored here aml.leader h2o_frame_test = h2o.H2OFrame(test) preds = aml.predict(h2o_frame_test) perf = aml.leader.model_performance(h2o_frame_test) ################################################# def plot_corr_vars(df):
def view_leaderboard(self, auto_ml):
    """Log the full extended leaderboard and display the leader's performance."""
    board = get_leaderboard(auto_ml, extra_columns='ALL')
    # head(rows=nrows) forces every row to render instead of the default 10.
    logger.info('Leaderboard: \n{}'.format(board.head(rows=board.nrows)))
    h2o_util.show_model_performance(auto_ml.leader)