Example no. 1
def scale_svd_rf_pipe_new_import():
    from h2o.estimators.svd import H2OSingularValueDecompositionEstimator
    print("Importing USArrests.csv data...")
    arrests = h2o.upload_file(pyunit_utils.locate("smalldata/pca_test/USArrests.csv"))

    print("Compare with SVD")

    # build transformation pipeline using sklearn's Pipeline and H2O estimators without H2OSingularValueDecompositionEstimator.init_for_pipeline()
    # it should fail
    # Note: with the SVD algo in a different combination of pipeline steps it might not fail; for example,
    #     if you comment out the line with the H2ORandomForestEstimator step, fit does not fail because the pipeline
    #     never calls _fit_transform_one and therefore never calls H2OSingularValueDecompositionEstimator.transform
    try:
        pipe = Pipeline([
                        ("standardize", H2OScaler()),
                        ("svd", H2OSingularValueDecompositionEstimator(nv=3)),
                        ("rf", H2ORandomForestEstimator(seed=42,ntrees=50))
                        ])

        pipe.fit(arrests[1:], arrests[0])
        assert False, "Pipeline should fail without using H2OSingularValueDecompositionEstimator.init_for_pipeline()"
    except TypeError:
        pass

    # build transformation pipeline using sklearn's Pipeline and H2O estimators with H2OSingularValueDecompositionEstimator.init_for_pipeline()
    pipe = Pipeline([
                    ("standardize", H2OScaler()),
                    ("svd", H2OSingularValueDecompositionEstimator(nv=3).init_for_pipeline()),
                    ("rf", H2ORandomForestEstimator(seed=42,ntrees=50))
                    ])

    pipe.fit(arrests[1:], arrests[0])
    print(pipe)
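
These functions are excerpts from H2O's pyunit test suite and omit the module-level imports of the original files. A minimal sketch of the setup they assume is shown below; the pyunit_utils import path is an assumption based on H2O's test harness layout, where locate() resolves smalldata dataset paths.

# Sketch of the module-level setup assumed by these test excerpts
import h2o
from h2o.transforms.preprocessing import H2OScaler
from h2o.estimators.random_forest import H2ORandomForestEstimator
from sklearn.pipeline import Pipeline
from tests import pyunit_utils  # assumed import path; H2O test helper for locating datasets

h2o.init()  # start or connect to an H2O cluster before uploading frames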
Example no. 2
def scale_svd_rf_pipe():
    from h2o.transforms.decomposition import H2OSVD

    print("Importing USArrests.csv data...")
    arrests = h2o.upload_file(
        pyunit_utils.locate("smalldata/pca_test/USArrests.csv"))

    # build  transformation pipeline using sklearn's Pipeline and H2OSVD
    pipe = Pipeline([("standardize", H2OScaler()), ("svd", H2OSVD()),
                     ("rf", H2ORandomForestEstimator())])

    params = {
        "standardize__center": [True, False],
        "standardize__scale": [True, False],
        "svd__nv": [2, 3],
        "rf__ntrees": randint(50, 60),
        "rf__max_depth": randint(4, 8),
        "rf__min_rows": randint(5, 10),
        "svd__transform": ["none", "standardize"],
    }

    custom_cv = H2OKFold(arrests, n_folds=5, seed=42)
    random_search = RandomizedSearchCV(pipe,
                                       params,
                                       n_iter=5,
                                       scoring=make_scorer(h2o_r2_score),
                                       cv=custom_cv,
                                       random_state=42,
                                       n_jobs=1)

    random_search.fit(arrests[1:], arrests[0])
    print(random_search.best_estimator_)
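
Besides best_estimator_, the fitted RandomizedSearchCV exposes the usual scikit-learn attributes; note that on current scikit-learn versions RandomizedSearchCV is imported from sklearn.model_selection and make_scorer from sklearn.metrics rather than the legacy paths these excerpts assume. A short follow-up sketch:

# Inspect the winning hyperparameter combination and its cross-validated score
print(random_search.best_params_)
print(random_search.best_score_)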
Example no. 3
def scale_pca_rf_pipe_new_import():
  from h2o.estimators.pca import H2OPrincipalComponentAnalysisEstimator
  iris = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))

  # build transformation pipeline using sklearn's Pipeline and H2O transforms
  pipe = Pipeline([
                    ("standardize", H2OScaler()),
                    ("pca", H2OPrincipalComponentAnalysisEstimator().init_for_pipeline()),
                    ("rf", H2ORandomForestEstimator())
                  ])

  params = {"standardize__center":    [True, False],             # Parameters to test
            "standardize__scale":     [True, False],
            "pca__k":                 randint(2, iris[1:].shape[1]),
            "rf__ntrees":             randint(50,60),
            "rf__max_depth":          randint(4,8),
            "rf__min_rows":           randint(5,10),
            "pca__transform":         ["none", "standardize"],
            }

  custom_cv = H2OKFold(iris, n_folds=5, seed=42)
  random_search = RandomizedSearchCV(pipe, 
                                     params,
                                     n_iter=5,
                                     scoring=make_scorer(h2o_r2_score),
                                     cv=custom_cv,
                                     random_state=42,
                                     n_jobs=1)

  random_search.fit(iris[1:],iris[0])

  print(random_search.best_estimator_)
Example no. 4
def scale_pca_rf_pipe_new_import():

    from h2o.transforms.preprocessing import H2OScaler
    from h2o.estimators.pca import H2OPrincipalComponentAnalysisEstimator
    from h2o.estimators.random_forest import H2ORandomForestEstimator
    from sklearn.pipeline import Pipeline

    iris = h2o.import_file(
        path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))

    # build transformation pipeline using sklearn's Pipeline and H2O estimators without H2OPrincipalComponentAnalysisEstimator.init_for_pipeline()
    # it should fail
    # Note: with the PCA algo in a different combination of pipeline steps it might not fail; for example,
    #     if you comment out the line with the H2ORandomForestEstimator step, fit does not fail because the pipeline
    #     never calls _fit_transform_one and therefore never calls H2OPrincipalComponentAnalysisEstimator.transform
    try:
        pipe = Pipeline([("standardize", H2OScaler()),
                         ("pca", H2OPrincipalComponentAnalysisEstimator(k=2)),
                         ("rf", H2ORandomForestEstimator(seed=42, ntrees=5))])
        pipe.fit(iris[:4], iris[4])
        assert False, "Pipeline should fail without using H2OPrincipalComponentAnalysisEstimator.init_for_pipeline()"
    except TypeError:
        pass

    # build transformation pipeline using sklearn's Pipeline and H2O estimators with H2OPrincipalComponentAnalysisEstimator.init_for_pipeline()
    pipe = Pipeline([
        ("standardize", H2OScaler()),
        ("pca",
         H2OPrincipalComponentAnalysisEstimator(k=2).init_for_pipeline()),
        ("rf", H2ORandomForestEstimator(seed=42, ntrees=5))
    ])
    pipe.fit(iris[:4], iris[4])
    print(pipe)

    # set H2OPCA transform property
    pca = H2OPrincipalComponentAnalysisEstimator(k=2)
    pca.transform = "standardize"
    pipe = Pipeline([("standardize", H2OScaler()),
                     ("pca", pca.init_for_pipeline()),
                     ("rf", H2ORandomForestEstimator(seed=42, ntrees=5))])
    pipe.fit(iris[:4], iris[4])
    print(pipe)
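
The last block sets the PCA transform property after construction; the grid-search examples tune the same parameter through the "pca__transform" key. Passing it directly to the constructor should be equivalent, as sketched below (constructor argument assumed, since transform is an ordinary model parameter).

# Same pipeline with transform supplied at construction time (assumed equivalent)
pca = H2OPrincipalComponentAnalysisEstimator(k=2, transform="standardize")
pipe = Pipeline([("standardize", H2OScaler()),
                 ("pca", pca.init_for_pipeline()),
                 ("rf", H2ORandomForestEstimator(seed=42, ntrees=5))])
pipe.fit(iris[:4], iris[4])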
Example no. 5
def test_scaler():
    iris = h2o.import_file(pyunit_utils.locate("smalldata/iris/iris.csv"))

    scaler = H2OScaler()
    scaler.fit(iris)

    iris_transformed = scaler.transform(iris)

    assert [[u'Iris-setosa', u'Iris-versicolor',
             u'Iris-virginica']] == iris_transformed["C5"].levels()
    assert max(iris_transformed[[
        "C1", "C2", "C3", "C4"
    ]].mean().as_data_frame().transpose()[0].tolist()) < 1e-10
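
The scaler's behaviour is controlled by the same center and scale flags that the searches above tune via standardize__center / standardize__scale; a minimal sketch:

# Center the numeric columns but leave their variance untouched
scaler = H2OScaler(center=True, scale=False)
scaler.fit(iris)
iris_centered = scaler.transform(iris)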
Example no. 6
def scale_svd_rf_pipe():
    from h2o.transforms.decomposition import H2OSVD
    print("Importing USArrests.csv data...")
    arrests = h2o.upload_file(pyunit_utils.locate("smalldata/pca_test/USArrests.csv"))

    # build  transformation pipeline using sklearn's Pipeline and H2O transforms
    pipe = Pipeline([
        ("standardize", H2OScaler()),
        ("svd", H2OSVD(nv=3)),
        ("rf", H2ORandomForestEstimator(seed=42,ntrees=50))
    ])

    pipe.fit(arrests[1:], arrests[0])
    print(pipe)
Example no. 7
def scale_pca_rf_pipe():

    from h2o.transforms.preprocessing import H2OScaler
    from h2o.transforms.decomposition import H2OPCA
    from h2o.estimators.random_forest import H2ORandomForestEstimator
    from sklearn.pipeline import Pipeline

    iris = h2o.import_file(
        path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))

    # build transformation pipeline using sklearn's Pipeline and H2O transforms
    pipe = Pipeline([("standardize", H2OScaler()), ("pca", H2OPCA(k=2)),
                     ("rf", H2ORandomForestEstimator(seed=42, ntrees=50))])
    pipe.fit(iris[:4], iris[4])
Example no. 8
def scale_pca_rf_pipe():

    from h2o.transforms.preprocessing import H2OScaler
    from h2o.transforms.decomposition import H2OPCA
    # the import below should work, but it doesn't yet: https://0xdata.atlassian.net/browse/PUBDEV-5236
    #from h2o.estimators.pca import H2OPrincipalComponentAnalysisEstimator as H2OPCA
    from h2o.estimators.random_forest import H2ORandomForestEstimator
    from sklearn.pipeline import Pipeline

    iris = h2o.import_file(
        path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))

    # build transformation pipeline using sklearn's Pipeline and H2O transforms
    pipe = Pipeline([("standardize", H2OScaler()), ("pca", H2OPCA(k=2)),
                     ("rf", H2ORandomForestEstimator(seed=42, ntrees=50))])
    pipe.fit(iris[:4], iris[4])
Example no. 9
def scale_pca_rf_pipe():

    from h2o.transforms.preprocessing import H2OScaler
    from h2o.estimators.pca import H2OPrincipalComponentAnalysisEstimator
    from h2o.estimators.random_forest import H2ORandomForestEstimator
    from sklearn.pipeline import Pipeline
    from sklearn.grid_search import RandomizedSearchCV
    from h2o.cross_validation import H2OKFold
    from h2o.model.regression import h2o_r2_score
    from sklearn.metrics.scorer import make_scorer
    from scipy.stats import randint

    iris = h2o.import_file(
        path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))

    # build  transformation pipeline using sklearn's Pipeline and H2O transforms
    pipe = Pipeline([("standardize", H2OScaler()),
                     ("pca", H2OPrincipalComponentAnalysisEstimator()),
                     ("rf", H2ORandomForestEstimator())])

    params = {
        "standardize__center": [True, False],  # Parameters to test
        "standardize__scale": [True, False],
        "pca__k": randint(2, iris[1:].shape[1]),
        "rf__ntrees": randint(50, 60),
        "rf__max_depth": randint(4, 8),
        "rf__min_rows": randint(5, 10),
    }

    custom_cv = H2OKFold(iris, n_folds=5, seed=42)
    random_search = RandomizedSearchCV(pipe,
                                       params,
                                       n_iter=5,
                                       scoring=make_scorer(h2o_r2_score),
                                       cv=custom_cv,
                                       random_state=42,
                                       n_jobs=1)

    random_search.fit(iris[1:], iris[0])

    print(random_search.best_estimator_)
Example no. 10
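This excerpt references objects defined earlier in the original script (iris_df, the H2OGradientBoostingEstimator import, and the ntrees_opt / max_depth_opt / learn_rate_opt option lists). A sketch of that missing context, with illustrative option values that are assumptions rather than the original ones:

import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator

h2o.init()
iris_df = h2o.import_file("smalldata/iris/iris_wheader.csv")  # dataset path as used in the other examples
ntrees_opt = [5, 10, 15]        # placeholder values, not the original ones
max_depth_opt = [2, 3, 4]       # placeholder values, not the original ones
learn_rate_opt = [0.1, 0.2]     # placeholder values, not the original ones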
hyper_parameters = {"ntrees": ntrees_opt, "max_depth":max_depth_opt, "learn_rate":learn_rate_opt}
from h2o.grid.grid_search import H2OGridSearch
gs = H2OGridSearch(H2OGradientBoostingEstimator(distribution="multinomial"), hyper_params=hyper_parameters)
gs.train(x=list(range(0, iris_df.ncol-1)), y=iris_df.ncol-1, training_frame=iris_df, nfolds=10)
print(gs.sort_by('logloss', increasing=True))

# Pipeline
from h2o.transforms.preprocessing import H2OScaler
from sklearn.pipeline import Pipeline

# Turn off h2o progress bars
h2o.__PROGRESS_BAR__=False
h2o.no_progress()

# build transformation pipeline using sklearn's Pipeline and H2O transforms
pipeline = Pipeline([("standardize", H2OScaler()),
                 ("pca", H2OPCA(k=2)),
                 ("gbm", H2OGradientBoostingEstimator(distribution="multinomial"))])
pipeline.fit(iris_df[:4],iris_df[4])

# Random CV using H2O and Scikit-learn
from sklearn.grid_search import RandomizedSearchCV
from h2o.cross_validation import H2OKFold
from h2o.model.regression import h2o_r2_score
from sklearn.metrics.scorer import make_scorer
params = {"standardize__center":    [True, False],             # Parameters to test
          "standardize__scale":     [True, False],
          "pca__k":                 [2,3],
          "gbm__ntrees":            [10,20],
          "gbm__max_depth":         [1,2,3],
          "gbm__learn_rate":        [0.1,0.2]}
Example no. 11
#gs = H2OGridSearch(H2OGradientBoostingEstimator(distribution="multinomial"), hyper_params=hyper_parameters)
#gs.train(x=range(0, iris_df.ncol-1), y=iris_df.ncol-1, training_frame=iris_df, nfold=10)

##
## Pipeline
##

from h2o.transforms.preprocessing import H2OScaler
from h2o.transforms.decomposition import H2OPCA
from sklearn.pipeline import Pipeline

h2o.no_progress()

pipeline = Pipeline([
    ("standardize", H2OScaler()), ("pca", H2OPCA(k=2)),
    ("gbm", H2OGradientBoostingEstimator(distribution="multinomial"))
])

print(pipeline.fit(iris_df[:4], iris_df[4]))

##
## Randomized Grid Search
##
from sklearn.grid_search import RandomizedSearchCV
from h2o.cross_validation import H2OKFold
from h2o.model.regression import h2o_r2_score
from sklearn.metrics.scorer import make_scorer

params = {
    "standardize__center": [True, False],