def test_AUTO_stopping_metric_with_auc_sorting_metric():
    print("Check leaderboard with AUTO stopping metric and auc sorting metric")
    ds = prepare_data('binomial')
    exclude_algos = ["DeepLearning", "GLM", "StackedEnsemble"]
    aml = H2OAutoML(project_name="py_aml_lb_test_auto_stopping_metric_auc_sorting",
                    seed=automl_seed,
                    max_models=10,
                    exclude_algos=exclude_algos,
                    sort_metric='auc')
    aml.train(y=ds.target, training_frame=ds.train)
    check_leaderboard(aml, exclude_algos,
                      ["auc", "logloss", "mean_per_class_error", "rmse", "mse"],
                      "auc", True)
    non_se = get_partitioned_model_names(aml.leaderboard).non_se
    check_model_property(non_se, 'stopping_metric', True, "logloss")
def test_balance_classes():
    print("Check balance_classes & related args work properly")
    ds = import_dataset()
    aml = H2OAutoML(project_name="py_aml_balance_classes_etc",
                    exclude_algos=['XGBoost'],  # XGB doesn't support balance_classes
                    max_models=3,
                    balance_classes=True,
                    class_sampling_factors=[0.2, 1.4],
                    max_after_balance_size=3.0,
                    seed=1)
    aml.train(y=ds['target'], training_frame=ds['train'])
    _, non_se, _ = get_partitioned_model_names(aml.leaderboard)
    amodel = h2o.get_model(non_se[0])
    assert amodel.params['balance_classes']['actual'] == True
    assert amodel.params['max_after_balance_size']['actual'] == 3.0
    assert amodel.params['class_sampling_factors']['actual'] == [0.2, 1.4]
def test_stacked_ensembles_are_trained_after_timeout():
    print("Check that Stacked Ensembles are still trained after timeout")
    max_runtime_secs = 20
    ds = import_dataset()
    aml = H2OAutoML(project_name="py_aml_SE_after_timeout",
                    seed=1,
                    max_runtime_secs=max_runtime_secs,
                    exclude_algos=['XGBoost', 'DeepLearning'])
    start = time.time()
    aml.train(y=ds['target'], training_frame=ds['train'])
    end = time.time()
    assert end - start - max_runtime_secs > 0
    _, _, se = get_partitioned_model_names(aml.leaderboard)
    # We don't need to test that all SEs are built: there may be only one
    # if just one model type was built.
    assert len(se) > 0, "StackedEnsemble should still be trained after timeout"
def fit_model_into_data_by(model_type, features, model_params,
                           X_train, X_test, X_train_selected, X_test_selected,
                           y_train, y_test):
    if features == 'all':
        feature_cols = [col for col in list(X_train.columns)
                        if col not in model_params['TRAIN_TEST_SPLIT']['EXCLUDE_COL']]
        train = X_train
        test = X_test
    elif features == 'selected':
        feature_cols = [col for col in list(X_train_selected.columns)
                        if col not in model_params['TRAIN_TEST_SPLIT']['EXCLUDE_COL']]
        train = X_train_selected
        test = X_test_selected
    target_col = model_params['TRAIN_TEST_SPLIT']['TARGET_COL']
    if model_type == 'h2o':
        h2o.init(ip="127.0.0.1", max_mem_size_GB=2)
        hdf = h2o.H2OFrame(pd.concat([train, y_train], axis=1))
        aml = H2OAutoML(max_models=5, seed=1, max_runtime_secs=432000)
        aml.train(x=feature_cols, y=target_col, training_frame=hdf)
        # Save the leader: h2o.save_model expects a model, not the AutoML object.
        h2o.save_model(model=aml.leader,
                       path=model_params['DIR_NAME_OF_MODEL_PICKLE'] + 'fitted_' + features + '_features_' + model_type + '_model',
                       force=True)
        # joblib.dump(aml,
        #             model_params['DIR_NAME_OF_MODEL_PICKLE'] + 'fitted_' + features + '_features_' + model_type + '_model.pkl')
        y_test_pred = aml.predict(h2o.H2OFrame(test)).as_data_frame()['predict']
    elif model_type == 'rf':
        rf_reg = RandomForestRegressor(n_estimators=1000,
                                       max_depth=10,
                                       max_features='sqrt',
                                       n_jobs=-1,
                                       verbose=1,
                                       random_state=1)
        rf_reg.fit(train, y_train)
        joblib.dump(rf_reg,
                    model_params['DIR_NAME_OF_MODEL_PICKLE'] + 'fitted_' + features + '_features_' + model_type + '_model.pkl')
        y_test_pred = rf_reg.predict(test)
    # scikit-learn metrics expect (y_true, y_pred) in that order;
    # R2 in particular is not symmetric in its arguments.
    print('RMSE: ', np.sqrt(mean_squared_error(y_test, y_test_pred)))
    print('R2: ', r2_score(y_test, y_test_pred))
    return y_test_pred
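# A hypothetical model_params dict illustrating the config shape that
# fit_model_into_data_by expects. The keys match the lookups in the function
# above; the column names and path are made-up placeholders.
example_model_params = {
    'TRAIN_TEST_SPLIT': {
        'EXCLUDE_COL': ['id'],    # columns dropped from the feature list (placeholder)
        'TARGET_COL': 'target',   # name of the label column (placeholder)
    },
    'DIR_NAME_OF_MODEL_PICKLE': './models/',  # prefix for saved model artifacts (placeholder)
}
# Hypothetical call, given pandas train/test splits:
# y_pred = fit_model_into_data_by('rf', 'all', example_model_params,
#                                 X_train, X_test, X_train_sel, X_test_sel,
#                                 y_train, y_test)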
def run_example(self):
    h2o.init()

    # Import a sample binary outcome train/test set into H2O
    train = h2o.import_file("./data/churn-train.csv")
    test = h2o.import_file("./data/churn-test.csv")
    # df = h2o.import_file("./data/churn.csv")
    # train, test = df.split_frame(ratios=[.75])

    # Identify predictors and response
    x = train.columns
    y = "churn_probability"
    x.remove(y)

    # For binary classification, the response should be a factor
    # train[y] = train[y].asfactor()
    # test[y] = test[y].asfactor()

    # Run AutoML for up to 20 seconds, sorting the leaderboard by MAE
    aml = H2OAutoML(max_runtime_secs=20, seed=1, sort_metric="mae")
    aml.train(x=x, y=y, training_frame=train)

    # View the AutoML Leaderboard
    lb = aml.leaderboard
    lb.head(rows=lb.nrows)  # Print all rows instead of the default (10 rows)

    # The leader model is stored here
    print(aml.leader.model_performance(test))

    # To generate predictions on a test set, you can make predictions directly
    # on the `H2OAutoML` object or on the leader model object
    preds = aml.predict(test)  # or: preds = aml.leader.predict(test)
    print(preds)

    resp = [aml, aml.leader, preds.as_data_frame()]
    h2o.shutdown()
    return resp
def test_remove_automl_with_xval_when_keeping_all_cv_details():
    target, train, valid, test = prepare_data()
    project_name = 'aml_with_xval_remove_test'
    max_models = 3
    nfolds = 5
    aml = H2OAutoML(project_name=project_name,
                    nfolds=nfolds,
                    max_models=max_models,
                    seed=1,
                    keep_cross_validation_predictions=True,
                    keep_cross_validation_fold_assignment=True,
                    keep_cross_validation_models=True)
    aml.train(y=target, training_frame=train)

    keys = list_keys_in_memory()
    # print(keys['all'].values)
    assert contains_leaderboard(project_name, keys)
    assert contains_event_log(project_name, keys)
    expectations = dict(
        models_base=max_models + 2,  # 2 SEs
        cv_models=(max_models + 2) * nfolds,  # 1 cv model per fold for all models, incl. SEs
        predictions=(len(keys['cv_models'])      # cv predictions
                     + len(keys['models_base'])  # cv holdout predictions
                     ),
        metrics=(len(keys['cv_models']) * 3      # for each cv model: 1 on training frame, 1 on validation frame (=training for cv), 1 on adapted frame (to be removed with PUBDEV-6638)
                 + len(keys['models_base'])      # for each model, 1 on training_frame
                 + (2 * 1)                       # for each SE, 1 on levelone training
                 )
    )
    for k, v in expectations.items():
        assert len(keys[k]) == v, "expected {} {}, but got {}".format(v, k, len(keys[k]))

    h2o.remove(aml)
    clean = list_keys_in_memory()
    print(clean['all'].values)
    assert not contains_leaderboard(project_name, clean)
    assert not contains_event_log(project_name, clean)
    assert len(clean['models_base']) == 0
    assert len(clean['cv_models']) == 0
    assert len(clean['models_all']) == 0
    assert len(clean['predictions']) == 0
    assert len(clean['metrics']) == 0
    assert len(clean['automl']) == 0
    for frame in [train, valid, test]:
        assert frame_in_cluster(frame), "frame {} has been removed from cluster".format(frame.frame_id)
def test_frames_can_be_passed_as_key():
    print("Check that all AutoML frames can be passed as keys.")
    ds = import_dataset()
    kw_args = [
        dict(training_frame=ds.train.frame_id),
        dict(training_frame=ds.train, validation_frame=ds.valid.frame_id),
        dict(training_frame=ds.train, blending_frame=ds.valid.frame_id),
        dict(training_frame=ds.train, leaderboard_frame=ds.test.frame_id),
    ]
    for kwargs in kw_args:
        aml = H2OAutoML(project_name="py_aml_frames_as_keys", seed=1, max_models=1, nfolds=0)
        aml.train(y=ds.target, **kwargs)
        h2o.remove(aml)
def train_automl(self, train: h2o.H2OFrame, x: List[str], y: str, weight: str) -> H2OGenericEstimator:
    """
    Use AutoML to build a model.

    Args:
        train (h2o dataframe): training data containing columns x, y, and weight
        x (list of str): column names of model features
        y (str): column name of the ground truth
        weight (str): column name of row weights

    Returns:
        H2OGenericEstimator: best model out of the training grid
    """
    aml = H2OAutoML(max_runtime_secs=self.search_time, seed=1)
    aml.train(x=x, y=y, training_frame=train, weights_column=weight)
    best_model = aml.leader
    return best_model
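# A self-contained sketch of the same flow outside the owning class: run AutoML
# with a row-weights column and keep the leader. The CSV path, column names,
# and the 300s budget (standing in for self.search_time) are all assumptions.
import h2o
from h2o.automl import H2OAutoML

h2o.init()
train = h2o.import_file("train.csv")           # placeholder path
aml = H2OAutoML(max_runtime_secs=300, seed=1)  # stands in for self.search_time
aml.train(x=["f1", "f2"], y="label",           # placeholder feature/target columns
          training_frame=train,
          weights_column="w")                  # placeholder weights column
print(aml.leader.model_id)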
def test_optional_SEs_not_trained_in_reproducible_mode():
    ds = import_dataset()
    aml = H2OAutoML(project_name="py_SEs_reproducible_mode",
                    seed=1,
                    max_runtime_secs=30,
                    max_models=3,
                    include_algos=['StackedEnsemble', 'GLM', 'GBM'])  # 2 base models in group 1, 1 in group 2
    aml.train(y=ds.target, training_frame=ds.train)
    lb = get_leaderboard(aml, ['provider', 'step']).as_data_frame()
    print(lb)
    steps_SE = lb.query("provider == 'StackedEnsemble'").step.to_list()
    assert len(steps_SE) == 2
    assert 'best_of_family_1' not in steps_SE, "no SE should be built for first group (sequential reproducible mode)"
    assert 'best_of_family_2' not in steps_SE, "no SE should be built for second group (sequential reproducible mode)"
    assert 'best_of_family_3' not in steps_SE, "no SE should be built for third group (sequential reproducible mode)"
    assert 'best_of_family_xglm' in steps_SE, "final SE is missing"
    assert 'all_xglm' in steps_SE, "final SE is missing"
    assert 'best_of_family_gbm' not in steps_SE, "no optional SE should be trained (sequential reproducible mode)"
def test_leaderboard_for_binary_with_custom_sorting():
    print("Check leaderboard for Binomial sort by logloss")
    ds = import_dataset('binary', split=False)
    exclude_algos = ["GLM", "DeepLearning", "DRF"]
    aml = H2OAutoML(project_name="py_aml_lb_test_custom_binom_sort",
                    seed=automl_seed,
                    max_models=8,
                    nfolds=2,
                    stopping_rounds=1,
                    stopping_tolerance=0.5,
                    exclude_algos=exclude_algos,
                    sort_metric="logloss")
    aml.train(y=ds.target, training_frame=ds.train)
    check_leaderboard(aml, exclude_algos,
                      ["logloss", "auc", "aucpr", "mean_per_class_error", "rmse", "mse"],
                      "logloss")
def test_custom_leaderboard_as_method():
    ds = import_dataset('binary')
    aml = H2OAutoML(project_name="py_aml_custom_lb_method_test", max_models=5, seed=42)
    aml.train(y=ds.target, training_frame=ds.train)
    assert_frame_equal(aml.get_leaderboard().as_data_frame(), aml.leaderboard.as_data_frame())
    lb_ext = get_leaderboard(aml, extra_columns='ALL')
    assert_frame_equal(aml.get_leaderboard('ALL').as_data_frame(), lb_ext.as_data_frame())
    aml2 = get_automl(aml.project_name)
    assert_frame_equal(aml2.get_leaderboard().as_data_frame(), aml.leaderboard.as_data_frame())
    assert_frame_equal(aml2.get_leaderboard('ALL').as_data_frame(), lb_ext.as_data_frame())
def test_leaderboard_for_regression_with_custom_sorting_deviance():
    print("Check leaderboard for Regression sort by deviance")
    ds = import_dataset('regression', split=False)
    exclude_algos = ["GBM", "DeepLearning"]
    aml = H2OAutoML(project_name="py_aml_lb_test_custom_regr_deviance",
                    exclude_algos=exclude_algos,
                    max_models=10,
                    nfolds=2,
                    stopping_rounds=1,
                    stopping_tolerance=0.5,
                    seed=automl_seed,
                    sort_metric="deviance")
    aml.train(y=ds.target, training_frame=ds.train)
    check_leaderboard(aml, exclude_algos,
                      ["mean_residual_deviance", "rmse", "mse", "mae", "rmsle"],
                      "mean_residual_deviance")
def test_no_x_train_set_only():
    print("AutoML run with x not provided and train set only")
    ds = import_dataset()
    aml = H2OAutoML(project_name="py_aml1",
                    stopping_rounds=3,
                    stopping_tolerance=0.001,
                    stopping_metric="AUC",
                    max_models=max_models,
                    seed=1234)
    aml.train(y=ds['target'], training_frame=ds['train'])
    assert aml.project_name == "py_aml1", "Project name is not set"
    assert aml.stopping_rounds == 3, "stopping_rounds is not set to 3"
    assert aml.stopping_tolerance == 0.001, "stopping_tolerance is not set to 0.001"
    assert aml.stopping_metric == "AUC", "stopping_metric is not set to `AUC`"
    assert aml.max_models == 2, "max_models is not set to 2"
    assert aml.seed == 1234, "seed is not set to `1234`"
    print("Check leaderboard")
    print(aml.leaderboard)
def test_keep_cross_validation_fold_assignment_enabled_with_nfolds_eq_0():
    print("Check that fold assignments were skipped when `keep_cross_validation_fold_assignment` = True and nfolds = 0")
    ds = import_dataset()
    aml = H2OAutoML(project_name="py_aml_keep_cross_validation_fold_assignment_2",
                    nfolds=0,
                    max_models=3,
                    seed=1,
                    keep_cross_validation_fold_assignment=True)
    aml.train(y=ds['target'], training_frame=ds['train'])
    _, non_se, _ = get_partitioned_model_names(aml.leaderboard)
    amodel = h2o.get_model(non_se[0])
    assert amodel.params['keep_cross_validation_fold_assignment']['actual'] == False
    assert amodel._model_json["output"]["cross_validation_fold_assignment_frame_id"] is None
def test_frames_cannot_be_passed_as_key():
    print("Check that AutoML frames cannot be passed as keys.")
    ds = import_dataset()
    aml = H2OAutoML(project_name="py_aml_frames_as_keys", seed=1, max_models=3, nfolds=0)
    kw_args = [
        dict(training_frame=ds['train'].frame_id),
        dict(training_frame=ds['train'], validation_frame=ds['valid'].frame_id),
        dict(training_frame=ds['train'], blending_frame=ds['valid'].frame_id),
        dict(training_frame=ds['train'], leaderboard_frame=ds['test'].frame_id),
    ]
    for kwargs in kw_args:
        try:
            aml.train(y=ds['target'], **kwargs)
            assert False, "should have thrown due to wrong frame key"
        except H2OTypeError as e:
            attr = next(k for k, v in kwargs.items() if v is not ds['train'])
            assert "'{}' must be a valid H2OFrame".format(attr) in str(e)
def test_keep_cross_validation_fold_assignment_enabled_with_nfolds_neq_0():
    print("Check that fold assignments were kept when `keep_cross_validation_fold_assignment` = True and nfolds > 1")
    ds = import_dataset()
    aml = H2OAutoML(project_name="py_aml_keep_cross_validation_fold_assignment_1",
                    nfolds=3,
                    max_models=3,
                    seed=1,
                    keep_cross_validation_fold_assignment=True)
    aml.train(y=ds.target, training_frame=ds.train)
    base_models = get_partitioned_model_names(aml.leaderboard).base
    amodel = h2o.get_model(base_models[0])
    assert amodel.params['keep_cross_validation_fold_assignment']['actual'] == True
    assert amodel._model_json["output"]["cross_validation_fold_assignment_frame_id"] is not None
def execute(self, params, **kwargs):
    import h2o
    from h2o.automl import H2OAutoML

    h2o.init()

    train_X_frame = h2o.H2OFrame.from_python(self.marvin_dataset['train_X'])
    test_X_frame = h2o.H2OFrame.from_python(self.marvin_dataset['test_X'])

    x = train_X_frame.columns
    y = 'Species'
    x.remove(y)

    automl = H2OAutoML(max_models=20, seed=1)
    automl.train(x=x, y=y, training_frame=train_X_frame)

    self.marvin_model = automl
def auto_ML(df, n_models, validation_ratio=.5):
    """
    Initialize h2o and a new AutoML object, then train it,
    holding out data according to the given validation ratio.
    """
    h2o.init(ip="localhost", port=54323)
    aml = H2OAutoML(max_models=n_models, seed=1)
    X, y, train, test = split(df, validation_ratio)
    aml.train(x=list(df.loc[:, 'f1':'f20'].columns),
              y='REDSHIFT_SPEC',
              training_frame=train,
              leaderboard_frame=test)
    lb = aml.leaderboard
    print("Leaderboard: ", lb.head(rows=lb.nrows), '\n')
    print("Leader: ", aml.leader, "\n")
    return aml
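# A minimal sketch of the `split` helper that auto_ML assumes, returning the
# feature/target column names plus H2OFrame train/test splits. The column
# layout ('f1'..'f20' features, 'REDSHIFT_SPEC' target) is taken from auto_ML;
# everything else is an assumption, not the original implementation.
def split(df, validation_ratio):
    X = list(df.loc[:, 'f1':'f20'].columns)
    y = 'REDSHIFT_SPEC'
    hf = h2o.H2OFrame(df)  # requires a live h2o connection (auto_ML calls h2o.init first)
    # split_frame takes the training fraction, so invert the validation ratio
    train, test = hf.split_frame(ratios=[1 - validation_ratio], seed=1)
    return X, y, train, test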
def test_event_log():
    ds = import_dataset()
    aml = H2OAutoML(project_name="test_event_log", max_models=2, seed=1234)
    aml.train(y=ds.target, training_frame=ds.train)
    print(aml.event_log)
    assert aml.event_log.columns == ['timestamp', 'level', 'stage', 'message', 'name', 'value']
    assert aml.event_log.nrows > 10
    print(aml.training_info)
    assert int(aml.training_info['stop_epoch']) > int(aml.training_info['start_epoch'])
    # test that stop_epoch is time encoded as unix epoch
    stop_dt = dt.datetime.fromtimestamp(int(aml.training_info['stop_epoch']))
    now = dt.datetime.now()
    assert abs(stop_dt - now) < dt.timedelta(minutes=1)
    assert abs(int(aml.training_info['duration_secs'])
               - (int(aml.training_info['stop_epoch']) - int(aml.training_info['start_epoch']))) <= 1
def test_automl_stops_after_max_models():
    print("Check that automl gets interrupted after `max_models`")
    ds = import_dataset()
    max_models = 5
    aml = H2OAutoML(project_name="py_aml_max_models", seed=1, max_models=max_models)
    aml.train(y=ds['target'], training_frame=ds['train'])

    base_models = [m for m in [aml.leaderboard[i, 0] for i in range(aml.leaderboard.nrows)]
                   if not m.startswith('StackedEnsemble')]
    assert len(base_models) == max_models, \
        "obtained {} base models when {} are expected".format(len(base_models), max_models)
def test_modeling_plan_using_simplified_syntax():
    ds = import_dataset()
    aml = H2OAutoML(project_name="py_modeling_plan_simple_syntax",
                    max_models=3,
                    modeling_plan=[
                        ('DRF', ['XRT', 'def_1']),
                        ('GBM', 'grids'),
                        ('StackedEnsemble', ['best'])
                    ],
                    seed=1)
    aml.train(y=ds.target, training_frame=ds.train)
    _, non_se, se = get_partitioned_model_names(aml.leaderboard)
    assert len(non_se) == 3
    assert len(se) == 1
    assert any('DRF' in name for name in non_se)
    assert any('XRT' in name for name in non_se)
    assert any('GBM_grid' in name for name in non_se)
    assert any('BestOfFamily' in name for name in se)
def test_AUTO_stopping_metric_with_custom_sorting_metric():
    print("Check leaderboard with AUTO stopping metric and rmse sorting metric")
    ds = prepare_data('regression')
    exclude_algos = ["DeepLearning", "GLM"]
    aml = H2OAutoML(project_name="py_aml_lb_test_auto_stopping_metric_custom_sorting",
                    exclude_algos=exclude_algos,
                    max_models=10,
                    seed=automl_seed,
                    sort_metric="rmse")
    aml.train(y=ds.target, training_frame=ds.train)
    check_leaderboard(aml, exclude_algos,
                      ["rmse", "mean_residual_deviance", "mse", "mae", "rmsle"],
                      "rmse")
    non_se = get_partitioned_model_names(aml.leaderboard).non_se
    check_model_property(non_se, 'stopping_metric', True, "RMSE")
def test_max_runtime_secs_can_be_set_in_combination_with_max_models_and_max_runtime_wins():
    ds = import_dataset()
    aml = H2OAutoML(project_name="py_all_stopping_constraints",
                    seed=1,
                    max_models=20,
                    max_runtime_secs=12)
    aml.train(y=ds['target'], training_frame=ds['train'])
    max_runtime = aml._build_resp['build_control']['stopping_criteria']['max_runtime_secs']
    max_models = aml._build_resp['build_control']['stopping_criteria']['max_models']
    assert max_runtime == 12
    assert max_models == 20
    assert aml.leaderboard.nrows < 20
    # being generous to avoid errors on slow Jenkins
    assert int(aml.training_info['duration_secs']) < 2 * max_runtime
def prostate_automl():
    df = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))

    # Split frames; make the splits repeatable to test multiple runs.
    # TODO: note that frames with the following names get created, but some Python binding temp
    # magic gives random names to the frames that are given to AutoML. See PUBDEV-4634.
    fr = df.split_frame(ratios=[.8, .1],
                        destination_frames=["prostate_train", "prostate_valid", "prostate_test"],
                        seed=42)

    # Set up train, validation, and test sets
    train = fr[0]
    valid = fr[1]
    test = fr[2]

    # aml = H2OAutoML(max_models=2, stopping_rounds=3, stopping_tolerance=0.001, project_name='prostate')
    aml = H2OAutoML(max_models=2,
                    stopping_rounds=2,
                    stopping_tolerance=0.05,
                    project_name='prostate',
                    exclude_algos=["GLM", "DeepLearning"])
    # aml = H2OAutoML(max_models=8, stopping_rounds=2, seed=42, project_name='prostate')

    train["CAPSULE"] = train["CAPSULE"].asfactor()
    valid["CAPSULE"] = valid["CAPSULE"].asfactor()
    test["CAPSULE"] = test["CAPSULE"].asfactor()

    print("AutoML (Binomial) run with x not provided with train, valid, and test")
    aml.train(y="CAPSULE",
              training_frame=train,
              validation_frame=valid,
              leaderboard_frame=test)
    print(aml.leader)
    print(aml.leaderboard)
    assert set(aml.leaderboard.columns) == set(["model_id", "auc", "logloss", "aucpr",
                                                "mean_per_class_error", "rmse", "mse"])
def test_early_stopping_args():
    print("Check arguments to H2OAutoML class")
    ds = import_dataset()
    aml = H2OAutoML(project_name="py_aml0",
                    stopping_rounds=3,
                    stopping_tolerance=0.001,
                    stopping_metric="AUC",
                    max_models=max_models,
                    seed=1234,
                    exclude_algos=["DeepLearning"])
    aml.train(y=ds['target'], training_frame=ds['train'])
    assert aml.project_name == "py_aml0", "Project name is not set"
    assert aml.stopping_rounds == 3, "stopping_rounds is not set to 3"
    assert aml.stopping_tolerance == 0.001, "stopping_tolerance is not set to 0.001"
    assert aml.stopping_metric == "AUC", "stopping_metric is not set to `AUC`"
    assert aml.max_models == 2, "max_models is not set to 2"
    assert aml.seed == 1234, "seed is not set to `1234`"
    print("Check leaderboard")
    print(aml.leaderboard)
def test_remove_automl_with_xval():
    ds = import_dataset()
    project_name = 'aml_with_xval_remove_test'
    max_models = 5
    nfolds = 5
    aml = H2OAutoML(project_name=project_name, nfolds=nfolds, max_models=max_models, seed=1)
    aml.train(y=ds.target, training_frame=ds.train,
              validation_frame=ds.valid,
              leaderboard_frame=ds.test)

    keys = list_keys_in_memory()
    assert aml.key.startswith(project_name)
    assert contains_leaderboard(aml.key, keys)
    assert contains_event_log(aml.key, keys)
    num_SEs = len(keys['metalearners'])
    print({k: len(v) for k, v in keys.items()})
    expectations = dict(
        models_base=max_models + num_SEs,
        cv_models=0,
        predictions=0,
        metrics=(max_models * 3   # for each non-SE model: 1 on training_frame, 1 on validation_frame, 1 on leaderboard_frame
                 + (num_SEs * 2)  # for each SE model: 1 on training frame, 1 on leaderboard frame
                 + (num_SEs * 2)  # for each SE metalearner: 1+1 on levelone training+validation
                 + (1 if any(("DeepLearning" in x for x in keys["metrics"])) else 0)  # DeepLearning has 2 training metrics (IDK why)
                 )
    )
    for k, v in expectations.items():
        assert len(keys[k]) == v, "expected {} {}, but got {}".format(v, k, len(keys[k]))

    h2o.remove(aml)
    clean = list_keys_in_memory()
    print(clean['all'].values)
    assert not contains_leaderboard(aml.key, clean)
    assert not contains_event_log(aml.key, clean)
    assert len(clean['models_base']) == 0
    assert len(clean['cv_models']) == 0
    assert len(clean['models_all']) == 0
    assert len(clean['predictions']) == 0
    assert len(clean['metrics']) == 0
    assert len(clean['automl']) == 0
    for frame in [ds.train, ds.valid, ds.test]:
        assert frame_in_cluster(frame), "frame {} has been removed from cluster".format(frame.frame_id)
def test_stacked_ensembles_are_trained_after_max_models():
    print("Check that Stacked Ensembles are still trained after max models have been trained")
    max_models = 5
    ds = import_dataset()
    aml = H2OAutoML(project_name="py_aml_SE_after_max_models", seed=1, max_models=max_models)
    aml.train(y=ds['target'], training_frame=ds['train'])

    stacked_ensembles = [m for m in [aml.leaderboard[i, 0] for i in range(aml.leaderboard.nrows)]
                         if m.startswith('StackedEnsemble')]
    assert len(stacked_ensembles) == 2, \
        "StackedEnsemble should still be trained after max models have been reached"
def automl_pojo():
    fr1 = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    fr1["CAPSULE"] = fr1["CAPSULE"].asfactor()
    aml = H2OAutoML(max_models=2, project_name="py_lb_test_aml1", seed=1234)
    aml.train(y="CAPSULE", training_frame=fr1)

    # download pojo (not supported for StackedEnsemble leaders)
    if aml.leader.algo != "stackedensemble":
        model_zip_path = os.path.join(tempfile.mkdtemp(), 'model.zip')
        time0 = time.time()
        print("\nDownloading POJO @... " + model_zip_path)
        pojo_file = aml.download_pojo(model_zip_path)
        print("    => %s (%d bytes)" % (pojo_file, os.stat(pojo_file).st_size))
        assert os.path.exists(pojo_file)
        print("    Time taken = %.3fs" % (time.time() - time0))
        assert os.path.isfile(model_zip_path)
        os.remove(model_zip_path)
def iris_automl():
    df = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))

    # Split frames
    fr = df.split_frame(ratios=[.8, .1])

    # Set up train, validation, and test sets
    train = fr[0]
    valid = fr[1]
    test = fr[2]

    aml = H2OAutoML(max_runtime_secs=3, stopping_rounds=3, stopping_tolerance=0.001)
    print("AutoML (Multinomial) run with x not provided; uses train, valid, and leaderboard (test) frame")
    aml.train(y="class", training_frame=train, validation_frame=valid, leaderboard_frame=test)
    print(aml.leader)
    print(aml.leaderboard)
    assert set(aml.leaderboard.columns) == set(["model_id", "mean_per_class_error",
                                                "logloss", "rmse", "mse"])
def automl_pojo():
    ds = import_dataset()
    aml = H2OAutoML(max_models=2,
                    project_name="py_lb_test_aml1",
                    exclude_algos=['XGBoost', 'StackedEnsemble'],  # no POJO export for XGB or SE
                    seed=1234)
    aml.train(y=ds.target, training_frame=ds.train)

    # download pojo
    model_zip_path = tempfile.mkdtemp()
    model_zip_file_path = os.path.join(model_zip_path, aml._leader_id + ".java")
    time0 = time.time()
    print("\nDownloading POJO @... " + model_zip_file_path)
    pojo_file = aml.download_pojo(model_zip_path)
    print("    => %s (%d bytes)" % (pojo_file, os.stat(pojo_file).st_size))
    assert os.path.exists(pojo_file)
    print("    Time taken = %.3fs" % (time.time() - time0))
    assert os.path.isfile(model_zip_file_path)
    shutil.rmtree(model_zip_path)