def scale_svd_rf_pipe():
    from h2o.transforms.decomposition import H2OSVD

    print("Importing USArrests.csv data...")
    arrests = h2o.upload_file(pyunit_utils.locate("smalldata/pca_test/USArrests.csv"))

    # build transformation pipeline using sklearn's Pipeline and H2OSVD
    pipe = Pipeline([("standardize", H2OScaler()),
                     ("svd", H2OSVD()),
                     ("rf", H2ORandomForestEstimator())])

    params = {"standardize__center": [True, False],
              "standardize__scale":  [True, False],
              "svd__nv":             [2, 3],
              "rf__ntrees":          randint(50, 60),
              "rf__max_depth":       randint(4, 8),
              "rf__min_rows":        randint(5, 10),
              "svd__transform":      ["none", "standardize"],
              }

    custom_cv = H2OKFold(arrests, n_folds=5, seed=42)
    random_search = RandomizedSearchCV(pipe, params,
                                       n_iter=5,
                                       scoring=make_scorer(h2o_r2_score),
                                       cv=custom_cv,
                                       random_state=42,
                                       n_jobs=1)

    random_search.fit(arrests[1:], arrests[0])
    print(random_search.best_estimator_)
def scale_pca_rf_pipe_new_import():
    from h2o.estimators.pca import H2OPrincipalComponentAnalysisEstimator

    iris = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))

    # build transformation pipeline using sklearn's Pipeline and H2O transforms;
    # init_for_pipeline() wraps the PCA estimator so it exposes fit/transform
    # and can be used as a transformer inside a sklearn Pipeline
    pipe = Pipeline([("standardize", H2OScaler()),
                     ("pca", H2OPrincipalComponentAnalysisEstimator().init_for_pipeline()),
                     ("rf", H2ORandomForestEstimator())])

    params = {"standardize__center": [True, False],   # parameters to test
              "standardize__scale":  [True, False],
              "pca__k":              randint(2, iris[1:].shape[1]),
              "rf__ntrees":          randint(50, 60),
              "rf__max_depth":       randint(4, 8),
              "rf__min_rows":        randint(5, 10),
              "pca__transform":      ["none", "standardize"],
              }

    custom_cv = H2OKFold(iris, n_folds=5, seed=42)
    random_search = RandomizedSearchCV(pipe, params,
                                       n_iter=5,
                                       scoring=make_scorer(h2o_r2_score),
                                       cv=custom_cv,
                                       random_state=42,
                                       n_jobs=1)

    random_search.fit(iris[1:], iris[0])
    print(random_search.best_estimator_)
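# The two functions above rely on imports made at module level in the original
# test file. A sketch of the imports they assume, largely matching the explicit
# in-function imports of scale_pca_rf_pipe() further below (module paths are an
# assumption and may differ between h2o-py versions):
import h2o
from tests import pyunit_utils  # H2O test helper; sys.path setup may vary
from h2o.transforms.preprocessing import H2OScaler
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.cross_validation import H2OKFold
from h2o.model.regression import h2o_r2_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer
from scipy.stats import randint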
def test_h2o_only_pipeline_with_h2o_frames():
    pipeline = Pipeline([
        ('svd', H2OSVD(seed=seed)),
        ('estimator', H2OGradientBoostingClassifier(seed=seed))
    ])
    params = dict(
        svd__nv=[2, 3],
        svd__transform=['DESCALE', 'DEMEAN', 'NONE'],
        estimator__ntrees=[5, 10],
        estimator__max_depth=[1, 2, 3],
        estimator__learn_rate=[0.1, 0.2],
    )
    search = RandomizedSearchCV(
        pipeline, params,
        n_iter=5,
        random_state=seed,
        n_jobs=1,  # fails with parallel jobs
    )

    data = _get_data(format='h2o', n_classes=3)
    assert isinstance(data.X_train, h2o.H2OFrame)

    search.set_params(
        scoring=make_scorer(_h2o_accuracy),
        cv=H2OKFold(data.X_train, n_folds=3, seed=seed),
    )

    search.fit(data.X_train, data.y_train)
    preds = search.predict(data.X_test)
    assert isinstance(preds, h2o.H2OFrame)
    assert preds.dim == [len(data.X_test), 1]
    probs = search.predict_proba(data.X_test)
    assert probs.dim == [len(data.X_test), 3]
    assert np.allclose(np.sum(probs.as_data_frame().values, axis=1), 1.), \
        "`predict_proba` didn't return probabilities"
    score = search.score(data.X_test, data.y_test)
    assert isinstance(score, float)
    skl_score = accuracy_score(data.y_test.as_data_frame().values,
                               preds.as_data_frame().values)
    assert abs(score - skl_score) < 1e-6, "score={}, skl_score={}".format(score, skl_score)
    scores['h2o_only_pipeline_with_h2o_frame'] = score
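# The `_h2o_accuracy` scorer used above is not shown in this fragment. A minimal
# sketch of such a scorer, assuming it simply pulls both H2OFrames back into
# pandas and defers to sklearn's accuracy_score (a hypothetical stand-in, not
# the original helper), could look like this:
from sklearn.metrics import accuracy_score

def _h2o_accuracy(y_actual, y_predicted):
    # Convert both H2OFrames to numpy arrays and score with sklearn.
    return accuracy_score(y_actual.as_data_frame().values,
                          y_predicted.as_data_frame().values)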
def scale_pca_rf_pipe():
    from h2o.transforms.preprocessing import H2OScaler
    from h2o.estimators.pca import H2OPrincipalComponentAnalysisEstimator
    from h2o.estimators.random_forest import H2ORandomForestEstimator
    from h2o.cross_validation import H2OKFold
    from h2o.model.regression import h2o_r2_score
    from sklearn.pipeline import Pipeline
    from sklearn.model_selection import RandomizedSearchCV
    from sklearn.metrics import make_scorer
    from scipy.stats import randint

    iris = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))

    # build transformation pipeline using sklearn's Pipeline and H2O transforms
    pipe = Pipeline([("standardize", H2OScaler()),
                     ("pca", H2OPrincipalComponentAnalysisEstimator()),
                     ("rf", H2ORandomForestEstimator())])

    params = {"standardize__center": [True, False],   # parameters to test
              "standardize__scale":  [True, False],
              "pca__k":              randint(2, iris[1:].shape[1]),
              "rf__ntrees":          randint(50, 60),
              "rf__max_depth":       randint(4, 8),
              "rf__min_rows":        randint(5, 10),
              }

    custom_cv = H2OKFold(iris, n_folds=5, seed=42)
    random_search = RandomizedSearchCV(pipe, params,
                                       n_iter=5,
                                       scoring=make_scorer(h2o_r2_score),
                                       cv=custom_cv,
                                       random_state=42,
                                       n_jobs=1)

    random_search.fit(iris[1:], iris[0])
    print(random_search.best_estimator_)
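# In H2O's pyunit test suite, functions like the ones above are normally hooked
# into the test runner roughly as follows. This is a sketch assuming the standard
# pyunit_utils.standalone_test helper, which connects to an H2O cluster before
# invoking the test function:
if __name__ == "__main__":
    pyunit_utils.standalone_test(scale_pca_rf_pipe)
else:
    scale_pca_rf_pipe()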
("pca", H2OPCA(k=2)), ("gbm", H2OGradientBoostingEstimator(distribution="multinomial"))]) pipeline.fit(iris_df[:4],iris_df[4]) # Random CV using H2O and Scikit-learn from sklearn.grid_search import RandomizedSearchCV from h2o.cross_validation import H2OKFold from h2o.model.regression import h2o_r2_score from sklearn.metrics.scorer import make_scorer params = {"standardize__center": [True, False], # Parameters to test "standardize__scale": [True, False], "pca__k": [2,3], "gbm__ntrees": [10,20], "gbm__max_depth": [1,2,3], "gbm__learn_rate": [0.1,0.2]} custom_cv = H2OKFold(iris_df, n_folds=5, seed=42) pipeline = Pipeline([("standardize", H2OScaler()), ("pca", H2OPCA(k=2)), ("gbm", H2OGradientBoostingEstimator(distribution="gaussian"))]) random_search = RandomizedSearchCV(pipeline, params, n_iter=5, scoring=make_scorer(h2o_r2_score), cv=custom_cv, random_state=42, n_jobs=1) random_search.fit(iris_df[1:], iris_df[0]) print random_search.best_estimator_
                     ('groupby_count', AddGroupByCount())])),
                ('Add_ip_address_num', Pipeline([
                    ('extract', ColumnExtractor(['user_id', 'ip_address'])),
                    ('groupby_count', AddGroupByCount())])),
                ('numerics', Pipeline([
                    ('extract', ColumnExtractor(NUM_FEATS)),
                    ('zero_fill', ZeroFillTransformer()),
                    ('log', Log1pTransformer())]))
            ]))
])

##############################
# Modeling + Tuning
##############################
from h2o.cross_validation import H2OKFold

dataset = pd.concat([X_train, y_train], axis=1)
cv = H2OKFold(dataset, n_folds=5, seed=42)

# H2O approach
pipeline = Pipeline([
    ("H2OCreator", H2OFrameCreator()),
    # ('standardize', H2OScaler()),
    # ('pca', H2OPCA()),
    ('rf', H2ORandomForestEstimator(ntrees=20))
])

# something new to try
# from scipy.stats import randint
# params = {
#     # "standardize__center": [True, False],
#     # "standardize__scale": [True, False],
#     "pca__k": 2,       # randint(2, X_train[1:].shape[1]),
#     "rf__ntrees": 20   # randint(50,60),
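# H2OFrameCreator and the pandas feature-engineering transformers above
# (ColumnExtractor, AddGroupByCount, ZeroFillTransformer, Log1pTransformer) are
# not defined in this fragment. A minimal sketch of the bridge step, assuming its
# only job is to hand the pandas output of the upstream steps to H2O
# (hypothetical implementation, not the original class):
import h2o
from sklearn.base import BaseEstimator, TransformerMixin

class H2OFrameCreator(BaseEstimator, TransformerMixin):
    """Converts a pandas DataFrame into an H2OFrame so that downstream
    H2O estimators (e.g. H2ORandomForestEstimator) can consume it."""

    def fit(self, X, y=None):
        # Stateless transformer: nothing to learn.
        return self

    def transform(self, X):
        return h2o.H2OFrame(X)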