def scale_svd_rf_pipe_new_import():
    from h2o.estimators.svd import H2OSingularValueDecompositionEstimator

    print("Importing USArrests.csv data...")
    arrests = h2o.upload_file(pyunit_utils.locate("smalldata/pca_test/USArrests.csv"))

    print("Compare with SVD")

    # build transformation pipeline using sklearn's Pipeline and H2O estimators
    # without H2OSingularValueDecompositionEstimator.init_for_pipeline() -- it should fail.
    # Note: in a different combination of pipeline steps the SVD algo might not fail; for example,
    # if you comment out the H2ORandomForestEstimator step, fit does not fail because the pipeline
    # never calls _fit_transform_one and therefore never calls
    # H2OSingularValueDecompositionEstimator.transform.
    try:
        pipe = Pipeline([
            ("standardize", H2OScaler()),
            ("svd", H2OSingularValueDecompositionEstimator(nv=3)),
            ("rf", H2ORandomForestEstimator(seed=42, ntrees=50))
        ])
        pipe.fit(arrests[1:], arrests[0])
        assert False, "Pipeline should fail without using H2OSingularValueDecompositionEstimator.init_for_pipeline()"
    except TypeError:
        pass

    # build transformation pipeline using sklearn's Pipeline and H2O estimators
    # with H2OSingularValueDecompositionEstimator.init_for_pipeline()
    pipe = Pipeline([
        ("standardize", H2OScaler()),
        ("svd", H2OSingularValueDecompositionEstimator(nv=3).init_for_pipeline()),
        ("rf", H2ORandomForestEstimator(seed=42, ntrees=50))
    ])
    pipe.fit(arrests[1:], arrests[0])
    print(pipe)
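# How such a pyunit function is typically wired up, as a minimal sketch assuming the standard
# h2o-py test harness (pyunit_utils.standalone_test starts/attaches an H2O cluster before
# running the test function):
if __name__ == "__main__":
    pyunit_utils.standalone_test(scale_svd_rf_pipe_new_import)
else:
    scale_svd_rf_pipe_new_import()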
def scale_svd_rf_pipe():
    from h2o.transforms.decomposition import H2OSVD

    print("Importing USArrests.csv data...")
    arrests = h2o.upload_file(pyunit_utils.locate("smalldata/pca_test/USArrests.csv"))

    # build transformation pipeline using sklearn's Pipeline and H2OSVD
    pipe = Pipeline([("standardize", H2OScaler()),
                     ("svd", H2OSVD()),
                     ("rf", H2ORandomForestEstimator())])

    params = {
        "standardize__center": [True, False],
        "standardize__scale": [True, False],
        "svd__nv": [2, 3],
        "rf__ntrees": randint(50, 60),
        "rf__max_depth": randint(4, 8),
        "rf__min_rows": randint(5, 10),
        "svd__transform": ["none", "standardize"],
    }

    custom_cv = H2OKFold(arrests, n_folds=5, seed=42)
    random_search = RandomizedSearchCV(pipe, params,
                                       n_iter=5,
                                       scoring=make_scorer(h2o_r2_score),
                                       cv=custom_cv,
                                       random_state=42,
                                       n_jobs=1)
    random_search.fit(arrests[1:], arrests[0])
    print(random_search.best_estimator_)
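# Beyond best_estimator_, the fitted search exposes the usual scikit-learn result attributes;
# a minimal sketch (plain sklearn RandomizedSearchCV API, nothing H2O-specific):
print(random_search.best_params_)  # sampled hyperparameters of the winning pipeline
print(random_search.best_score_)   # mean cross-validated r2 of that pipeline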
def scale_pca_rf_pipe_new_import():
    from h2o.estimators.pca import H2OPrincipalComponentAnalysisEstimator

    iris = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))

    # build transformation pipeline using sklearn's Pipeline and H2O transforms
    pipe = Pipeline([
        ("standardize", H2OScaler()),
        ("pca", H2OPrincipalComponentAnalysisEstimator().init_for_pipeline()),
        ("rf", H2ORandomForestEstimator())
    ])

    params = {"standardize__center": [True, False],  # parameters to test
              "standardize__scale": [True, False],
              "pca__k": randint(2, iris[1:].shape[1]),
              "rf__ntrees": randint(50, 60),
              "rf__max_depth": randint(4, 8),
              "rf__min_rows": randint(5, 10),
              "pca__transform": ["none", "standardize"],
              }

    custom_cv = H2OKFold(iris, n_folds=5, seed=42)
    random_search = RandomizedSearchCV(pipe, params,
                                       n_iter=5,
                                       scoring=make_scorer(h2o_r2_score),
                                       cv=custom_cv,
                                       random_state=42,
                                       n_jobs=1)
    random_search.fit(iris[1:], iris[0])
    print(random_search.best_estimator_)
def scale_pca_rf_pipe_new_import():
    from h2o.transforms.preprocessing import H2OScaler
    from h2o.estimators.pca import H2OPrincipalComponentAnalysisEstimator
    from h2o.estimators.random_forest import H2ORandomForestEstimator
    from sklearn.pipeline import Pipeline

    iris = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))

    # build transformation pipeline using sklearn's Pipeline and H2O estimators
    # without H2OPrincipalComponentAnalysisEstimator.init_for_pipeline() -- it should fail.
    # Note: in a different combination of pipeline steps the PCA algo might not fail; for example,
    # if you comment out the H2ORandomForestEstimator step, fit does not fail because the pipeline
    # never calls _fit_transform_one and therefore never calls
    # H2OPrincipalComponentAnalysisEstimator.transform.
    try:
        pipe = Pipeline([("standardize", H2OScaler()),
                         ("pca", H2OPrincipalComponentAnalysisEstimator(k=2)),
                         ("rf", H2ORandomForestEstimator(seed=42, ntrees=5))])
        pipe.fit(iris[:4], iris[4])
        assert False, "Pipeline should fail without using H2OPrincipalComponentAnalysisEstimator.init_for_pipeline()"
    except TypeError:
        pass

    # build transformation pipeline using sklearn's Pipeline and H2O estimators
    # with H2OPrincipalComponentAnalysisEstimator.init_for_pipeline()
    pipe = Pipeline([
        ("standardize", H2OScaler()),
        ("pca", H2OPrincipalComponentAnalysisEstimator(k=2).init_for_pipeline()),
        ("rf", H2ORandomForestEstimator(seed=42, ntrees=5))
    ])
    pipe.fit(iris[:4], iris[4])
    print(pipe)

    # set the H2OPCA transform property
    pca = H2OPrincipalComponentAnalysisEstimator(k=2)
    pca.transform = "standardize"
    pipe = Pipeline([("standardize", H2OScaler()),
                     ("pca", pca.init_for_pipeline()),
                     ("rf", H2ORandomForestEstimator(seed=42, ntrees=5))])
    pipe.fit(iris[:4], iris[4])
    print(pipe)
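# Once fitted, the pipeline behaves like any scikit-learn estimator; a hedged sketch of scoring
# new rows (assumes sklearn's Pipeline.predict chains the H2O transforms into
# H2ORandomForestEstimator.predict and hands back an H2OFrame of predictions):
preds = pipe.predict(iris[:4])
print(preds.head())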
def test_scaler():
    iris = h2o.import_file(pyunit_utils.locate("smalldata/iris/iris.csv"))
    scaler = H2OScaler()
    scaler.fit(iris)
    iris_transformed = scaler.transform(iris)
    # the categorical column is untouched by scaling...
    assert [[u'Iris-setosa', u'Iris-versicolor', u'Iris-virginica']] == iris_transformed["C5"].levels()
    # ...and the numeric columns are centered to (near) zero mean
    assert max(iris_transformed[["C1", "C2", "C3", "C4"]]
               .mean().as_data_frame().transpose()[0].tolist()) < 1e-10
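# A natural companion check, as a hedged sketch (assumes H2OScaler also scales to unit variance
# by default and that H2OFrame.sd() returns per-column standard deviations as a list):
sds = iris_transformed[["C1", "C2", "C3", "C4"]].sd()
assert all(abs(s - 1.0) < 1e-6 for s in sds)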
def scale_svd_rf_pipe():
    from h2o.transforms.decomposition import H2OSVD

    print("Importing USArrests.csv data...")
    arrests = h2o.upload_file(pyunit_utils.locate("smalldata/pca_test/USArrests.csv"))

    # build transformation pipeline using sklearn's Pipeline and H2O transforms
    pipe = Pipeline([
        ("standardize", H2OScaler()),
        ("svd", H2OSVD(nv=3)),
        ("rf", H2ORandomForestEstimator(seed=42, ntrees=50))
    ])
    pipe.fit(arrests[1:], arrests[0])
    print(pipe)
def scale_pca_rf_pipe():
    from h2o.transforms.preprocessing import H2OScaler
    from h2o.transforms.decomposition import H2OPCA
    from h2o.estimators.random_forest import H2ORandomForestEstimator
    from sklearn.pipeline import Pipeline

    iris = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))

    # build transformation pipeline using sklearn's Pipeline and H2O transforms
    pipe = Pipeline([("standardize", H2OScaler()),
                     ("pca", H2OPCA(k=2)),
                     ("rf", H2ORandomForestEstimator(seed=42, ntrees=50))])
    pipe.fit(iris[:4], iris[4])
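# The fitted H2O models stay reachable through sklearn's named_steps mapping; a minimal sketch
# (assumes only the standard sklearn Pipeline API):
rf_model = pipe.named_steps["rf"]
print(rf_model)  # prints the trained random forest's model summary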
def scale_pca_rf_pipe():
    from h2o.transforms.preprocessing import H2OScaler
    from h2o.transforms.decomposition import H2OPCA
    # the import below should work, but it doesn't yet: https://0xdata.atlassian.net/browse/PUBDEV-5236
    # from h2o.estimators.pca import H2OPrincipalComponentAnalysisEstimator as H2OPCA
    from h2o.estimators.random_forest import H2ORandomForestEstimator
    from sklearn.pipeline import Pipeline

    iris = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))

    # build transformation pipeline using sklearn's Pipeline and H2O transforms
    pipe = Pipeline([("standardize", H2OScaler()),
                     ("pca", H2OPCA(k=2)),
                     ("rf", H2ORandomForestEstimator(seed=42, ntrees=50))])
    pipe.fit(iris[:4], iris[4])
def scale_pca_rf_pipe():
    from h2o.transforms.preprocessing import H2OScaler
    from h2o.estimators.pca import H2OPrincipalComponentAnalysisEstimator
    from h2o.estimators.random_forest import H2ORandomForestEstimator
    from sklearn.pipeline import Pipeline
    from sklearn.grid_search import RandomizedSearchCV
    from h2o.cross_validation import H2OKFold
    from h2o.model.regression import h2o_r2_score
    from sklearn.metrics.scorer import make_scorer
    from scipy.stats import randint

    iris = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))

    # build transformation pipeline using sklearn's Pipeline and H2O transforms
    pipe = Pipeline([("standardize", H2OScaler()),
                     ("pca", H2OPrincipalComponentAnalysisEstimator()),
                     ("rf", H2ORandomForestEstimator())])

    params = {
        "standardize__center": [True, False],  # parameters to test
        "standardize__scale": [True, False],
        "pca__k": randint(2, iris[1:].shape[1]),
        "rf__ntrees": randint(50, 60),
        "rf__max_depth": randint(4, 8),
        "rf__min_rows": randint(5, 10),
    }

    custom_cv = H2OKFold(iris, n_folds=5, seed=42)
    random_search = RandomizedSearchCV(pipe, params,
                                       n_iter=5,
                                       scoring=make_scorer(h2o_r2_score),
                                       cv=custom_cv,
                                       random_state=42,
                                       n_jobs=1)
    random_search.fit(iris[1:], iris[0])
    print(random_search.best_estimator_)
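# Note: sklearn.grid_search and sklearn.metrics.scorer were removed in newer scikit-learn
# releases; on a modern install the equivalent imports are:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer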
hyper_parameters = {"ntrees": ntrees_opt, "max_depth":max_depth_opt, "learn_rate":learn_rate_opt} from h2o.grid.grid_search import H2OGridSearch gs = H2OGridSearch(H2OGradientBoostingEstimator(distribution="multinomial"), hyper_params=hyper_parameters) gs.train(x=range(0,iris_df.ncol-1), y=iris_df.ncol-1, training_frame=iris_df, nfold=10) print gs.sort_by('logloss', increasing=True) # Pipeline from h2o.transforms.preprocessing import H2OScaler from sklearn.pipeline import Pipeline # Turn off h2o progress bars h2o.__PROGRESS_BAR__=False h2o.no_progress() # build transformation pipeline using sklearn's Pipeline and H2O transforms pipeline = Pipeline([("standardize", H2OScaler()), ("pca", H2OPCA(k=2)), ("gbm", H2OGradientBoostingEstimator(distribution="multinomial"))]) pipeline.fit(iris_df[:4],iris_df[4]) # Random CV using H2O and Scikit-learn from sklearn.grid_search import RandomizedSearchCV from h2o.cross_validation import H2OKFold from h2o.model.regression import h2o_r2_score from sklearn.metrics.scorer import make_scorer params = {"standardize__center": [True, False], # Parameters to test "standardize__scale": [True, False], "pca__k": [2,3], "gbm__ntrees": [10,20], "gbm__max_depth": [1,2,3], "gbm__learn_rate": [0.1,0.2]}
# gs = H2OGridSearch(H2OGradientBoostingEstimator(distribution="multinomial"),
#                    hyper_params=hyper_parameters)
# gs.train(x=range(0, iris_df.ncol-1), y=iris_df.ncol-1, training_frame=iris_df, nfolds=10)

##
## Pipeline
##
from h2o.transforms.preprocessing import H2OScaler
from h2o.transforms.decomposition import H2OPCA
from sklearn.pipeline import Pipeline

h2o.no_progress()

pipeline = Pipeline([
    ("standardize", H2OScaler()),
    ("pca", H2OPCA(k=2)),
    ("gbm", H2OGradientBoostingEstimator(distribution="multinomial"))
])
print(pipeline.fit(iris_df[:4], iris_df[4]))

##
## Randomized Grid Search
##
from sklearn.grid_search import RandomizedSearchCV
from h2o.cross_validation import H2OKFold
from h2o.model.regression import h2o_r2_score
from sklearn.metrics.scorer import make_scorer

params = {
    "standardize__center": [True, False],  # parameters to test
    "standardize__scale": [True, False],
    "pca__k": [2, 3],
    "gbm__ntrees": [10, 20],
    "gbm__max_depth": [1, 2, 3],
    "gbm__learn_rate": [0.1, 0.2],
}
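# As above, a hedged completion that actually runs the randomized search over this grid
# (mirrors the invocation used earlier in this file):
custom_cv = H2OKFold(iris_df, n_folds=5, seed=42)
random_search = RandomizedSearchCV(pipeline, params, n_iter=5,
                                   scoring=make_scorer(h2o_r2_score),
                                   cv=custom_cv, random_state=42, n_jobs=1)
random_search.fit(iris_df[:4], iris_df[4])
print(random_search.best_params_)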