Code Example #1
def pca_export():
    print("###### PCA ######")
    frame = h2o.upload_file(pyunit_utils.locate("smalldata/pca_test/USArrests.csv"))
    model = H2OPrincipalComponentAnalysisEstimator(k=3, impute_missing=True)
    model.train(x=list(range(4)), training_frame=frame)
    h2o.download_pojo(model, path=RESULT_DIR)
    expect_error(model.download_mojo, model="PCA", format='MOJO')
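The snippet above is an H2O pyunit test and leans on harness helpers (pyunit_utils, RESULT_DIR, expect_error). A minimal, hedged sketch of the same PCA training outside the harness, with the first USArrests rows inlined so it is self-contained:

import h2o
from h2o.estimators.pca import H2OPrincipalComponentAnalysisEstimator

h2o.init()  # start or connect to a local H2O cluster

# First few USArrests rows (Murder, Assault, UrbanPop, Rape) as a stand-in for the CSV
frame = h2o.H2OFrame({"Murder":   [13.2, 10.0, 8.1, 8.8, 9.0],
                      "Assault":  [236, 263, 294, 190, 276],
                      "UrbanPop": [58, 48, 80, 50, 91],
                      "Rape":     [21.2, 44.5, 31.0, 19.5, 40.6]})

pca = H2OPrincipalComponentAnalysisEstimator(k=3, impute_missing=True)
pca.train(x=frame.names, training_frame=frame)
print(pca.summary())  # importance of components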
Code Example #2
def scale_pca_rf_pipe_new_import():
  # Imports used below; RandomizedSearchCV and make_scorer live in sklearn.model_selection /
  # sklearn.metrics in current scikit-learn (older releases used sklearn.grid_search and
  # sklearn.metrics.scorer, as in a later example).
  from h2o.transforms.preprocessing import H2OScaler
  from h2o.estimators.pca import H2OPrincipalComponentAnalysisEstimator
  from h2o.estimators.random_forest import H2ORandomForestEstimator
  from h2o.cross_validation import H2OKFold
  from h2o.model.regression import h2o_r2_score
  from sklearn.pipeline import Pipeline
  from sklearn.model_selection import RandomizedSearchCV
  from sklearn.metrics import make_scorer
  from scipy.stats import randint

  iris = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))

  # build transformation pipeline using sklearn's Pipeline and H2O transforms
  pipe = Pipeline([
                    ("standardize", H2OScaler()),
                    ("pca", H2OPrincipalComponentAnalysisEstimator().init_for_pipeline()),
                    ("rf", H2ORandomForestEstimator())
                  ])

  params = {"standardize__center":    [True, False],             # Parameters to test
            "standardize__scale":     [True, False],
            "pca__k":                 randint(2, iris[1:].shape[1]),
            "rf__ntrees":             randint(50,60),
            "rf__max_depth":          randint(4,8),
            "rf__min_rows":           randint(5,10),
            "pca__transform":         ["none", "standardize"],
            }

  custom_cv = H2OKFold(iris, n_folds=5, seed=42)
  random_search = RandomizedSearchCV(pipe, 
                                     params,
                                     n_iter=5,
                                     scoring=make_scorer(h2o_r2_score),
                                     cv=custom_cv,
                                     random_state=42,
                                     n_jobs=1)

  random_search.fit(iris[1:],iris[0])

  print(random_search.best_estimator_)
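  # Hedged follow-up: RandomizedSearchCV also records the winning parameter draw and its
  # cross-validated score, which are often handier than the full estimator printout.
  print(random_search.best_params_)  # sampled hyper-parameters of the best pipeline
  print(random_search.best_score_)   # mean cross-validated r2 for that pipeline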
Code Example #3
def kmeans_model(df, xValues):
    
    hf = h2o.H2OFrame(df)
    # Note: the split is computed but unused; both models below train on the full frame hf
    train, valid, test = hf.split_frame(ratios=[.8, .1])
    # kmeans model
    kmeans = H2OKMeansEstimator(k=3, max_iterations=5, seed=10, categorical_encoding="AUTO", max_runtime_secs=10)
    kmeans.train(xValues, training_frame=hf)
    
    # pca model, generate Principal Components for further modelling or plotting
    
    pca = H2OPrincipalComponentAnalysisEstimator(k=4)
#    pca.train(xValues, training_frame= hf)
    pca.train(list(df.columns), training_frame= hf)
    pca_features = pca.predict(hf).as_data_frame()
    pca_metric = pca.summary().as_data_frame()
    
    # model metrics
    cluster_column = kmeans.predict(hf).as_data_frame()
    # The Between Cluster Sum-of-Square Error
    inter_cluster_error = kmeans.betweenss()
    # Within Cluster Sum-of-Square Error
    intra_cluster_error = kmeans.withinss()
    # Centroids
    centroids = kmeans.centers()
    # Size of clusters
    cluster_size = kmeans.size()
    cluster_column.columns = ['cluster']
    frames = [df,cluster_column]
    transformed_data = pd.concat(frames, axis=1)
    
    output = [transformed_data, pca_features, pca_metric, centroids, inter_cluster_error, intra_cluster_error, cluster_size]
    return output
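A hedged usage sketch for kmeans_model, assuming a running H2O cluster and a small all-numeric pandas DataFrame (the column names and values below are arbitrary):

import h2o
import pandas as pd

h2o.init()

demo_df = pd.DataFrame({"x1": [1.0, 1.1, 0.9, 1.2, 7.8, 8.1, 7.9, 8.3],
                        "x2": [2.0, 2.2, 1.9, 2.1, 6.0, 6.2, 5.9, 6.1],
                        "x3": [0.5, 0.4, 0.6, 0.5, 3.5, 3.6, 3.4, 3.7],
                        "x4": [9.0, 9.2, 8.8, 9.1, 1.0, 1.2, 0.9, 1.1]})

(transformed, pca_features, pca_metric, centroids,
 between_ss, within_ss, cluster_sizes) = kmeans_model(demo_df, ["x1", "x2", "x3", "x4"])
print(transformed.head())
print("between-cluster SS:", between_ss, "within-cluster SS:", within_ss)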
Code Example #4
def scale_pca_rf_pipe_new_import():

    from h2o.transforms.preprocessing import H2OScaler
    from h2o.estimators.pca import H2OPrincipalComponentAnalysisEstimator
    from h2o.estimators.random_forest import H2ORandomForestEstimator
    from sklearn.pipeline import Pipeline

    iris = h2o.import_file(
        path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))

    # Build a transformation pipeline using sklearn's Pipeline and H2O estimators without
    # H2OPrincipalComponentAnalysisEstimator.init_for_pipeline(); it should fail.
    # Note: with a different combination of pipeline steps the fit might not fail. For example,
    # if the H2ORandomForestEstimator step is commented out, fit() succeeds because the pipeline
    # never calls _fit_transform_one and therefore never uses H2OPrincipalComponentAnalysisEstimator.transform.
    try:
        pipe = Pipeline([("standardize", H2OScaler()),
                         ("pca", H2OPrincipalComponentAnalysisEstimator(k=2)),
                         ("rf", H2ORandomForestEstimator(seed=42, ntrees=5))])
        pipe.fit(iris[:4], iris[4])
        assert False, "Pipeline should fail without using H2OPrincipalComponentAnalysisEstimator.init_for_pipeline()"
    except TypeError:
        pass

    # build transformation pipeline using sklearn's Pipeline and H2O estimators with H2OPrincipalComponentAnalysisEstimator.init_for_pipeline()
    pipe = Pipeline([
        ("standardize", H2OScaler()),
        ("pca",
         H2OPrincipalComponentAnalysisEstimator(k=2).init_for_pipeline()),
        ("rf", H2ORandomForestEstimator(seed=42, ntrees=5))
    ])
    pipe.fit(iris[:4], iris[4])
    print(pipe)

    # set H2OPCA transform property
    pca = H2OPrincipalComponentAnalysisEstimator(k=2)
    pca.transform = "standardize"
    pipe = Pipeline([("standardize", H2OScaler()),
                     ("pca", pca.init_for_pipeline()),
                     ("rf", H2ORandomForestEstimator(seed=42, ntrees=5))])
    pipe.fit(iris[:4], iris[4])
    print(pipe)
Code Example #5
def scale_pca_rf_pipe():

    from h2o.transforms.preprocessing import H2OScaler
    from h2o.estimators.pca import H2OPrincipalComponentAnalysisEstimator
    from h2o.estimators.random_forest import H2ORandomForestEstimator
    from sklearn.pipeline import Pipeline

    iris = h2o.import_file(
        path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))

    # build transformation pipeline using sklearn's Pipeline and H2O transforms
    pipe = Pipeline([("standardize", H2OScaler()),
                     ("pca", H2OPrincipalComponentAnalysisEstimator(k=2)),
                     ("rf", H2ORandomForestEstimator(seed=42, ntrees=50))])
    pipe.fit(iris[:4], iris[4])
Code Example #6
def test_pca_screeplot():
    import matplotlib
    matplotlib.use("agg")  # select a non-interactive backend before pyplot is imported
    import matplotlib.pyplot as plt
    arrestsH2O = h2o.upload_file(
        pyunit_utils.locate("smalldata/pca_test/USArrests.csv"))
    fitH2O = H2OPrincipalComponentAnalysisEstimator(k=4, transform="DEMEAN")
    fitH2O.train(x=list(range(4)), training_frame=arrestsH2O)

    # The following should not fail
    fitH2O.screeplot()
    fitH2O.screeplot(server=True)
    fitH2O.screeplot(type="lines", server=True)
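    # Hedged follow-up: persist the scree plot, assuming screeplot() draws on the current
    # pyplot figure (the non-interactive "agg" backend above makes this safe headless).
    fitH2O.screeplot(server=True)                 # server=True skips plt.show()
    plt.gcf().savefig("usarrests_screeplot.png")  # output file name is an arbitrary choice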

    # Free the memory
    plt.close("all")
Code Example #7
def scale_pca_rf_pipe():

    from h2o.transforms.preprocessing import H2OScaler
    from h2o.estimators.pca import H2OPrincipalComponentAnalysisEstimator
    from h2o.estimators.random_forest import H2ORandomForestEstimator
    from sklearn.pipeline import Pipeline
    # Note: sklearn.grid_search and sklearn.metrics.scorer only exist in older scikit-learn
    # releases; newer versions provide RandomizedSearchCV in sklearn.model_selection and
    # make_scorer in sklearn.metrics.
    from sklearn.grid_search import RandomizedSearchCV
    from h2o.cross_validation import H2OKFold
    from h2o.model.regression import h2o_r2_score
    from sklearn.metrics.scorer import make_scorer
    from scipy.stats import randint

    iris = h2o.import_file(
        path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))

    # build  transformation pipeline using sklearn's Pipeline and H2O transforms
    pipe = Pipeline([("standardize", H2OScaler()),
                     ("pca", H2OPrincipalComponentAnalysisEstimator()),
                     ("rf", H2ORandomForestEstimator())])

    params = {
        "standardize__center": [True, False],  # Parameters to test
        "standardize__scale": [True, False],
        "pca__k": randint(2, iris[1:].shape[1]),
        "rf__ntrees": randint(50, 60),
        "rf__max_depth": randint(4, 8),
        "rf__min_rows": randint(5, 10),
    }

    custom_cv = H2OKFold(iris, n_folds=5, seed=42)
    random_search = RandomizedSearchCV(pipe,
                                       params,
                                       n_iter=5,
                                       scoring=make_scorer(h2o_r2_score),
                                       cv=custom_cv,
                                       random_state=42,
                                       n_jobs=1)

    random_search.fit(iris[1:], iris[0])

    print(random_search.best_estimator_)
Code Example #8
def build_mojo_pipeline():
    results_dir = pyunit_utils.locate("results")
    iris_csv = pyunit_utils.locate('smalldata/iris/iris_train.csv')
    iris = h2o.import_file(iris_csv)

    pca = H2OPrincipalComponentAnalysisEstimator(k=2)
    pca.train(training_frame=iris)

    principal_components = pca.predict(iris)

    km = H2OKMeansEstimator(k=3)
    km.train(training_frame=principal_components)

    pca_mojo_path = pca.download_mojo(path=results_dir)
    km_mojo_path = km.download_mojo(get_genmodel_jar=True, path=results_dir)

    java_cmd = [
        "java", "-cp",
        os.path.join(results_dir, "h2o-genmodel.jar"),
        "hex.genmodel.tools.BuildPipeline", "--mapping"
    ]
    pca_mojo_name = os.path.basename(pca_mojo_path).split('.')[0]
    for i, pc in enumerate(principal_components.columns):
        mapping = pc + '=' + pca_mojo_name + ':' + str(i)
        java_cmd += [mapping]
    java_cmd += [
        "--output",
        os.path.join(results_dir, "pipe.zip"), "--input", km_mojo_path,
        pca_mojo_path
    ]

    subprocess.Popen(java_cmd, stdout=PIPE, stderr=STDOUT).communicate()

    h2o_preds = km.predict(principal_components)
    mojo_preds_raw = h2o.mojo_predict_csv(input_csv_path=iris_csv,
                                          mojo_zip_path=os.path.join(
                                              results_dir, "pipe.zip"))
    mojo_preds = h2o.H2OFrame([c['cluster'] for c in mojo_preds_raw],
                              column_names=['predict'])

    assert (mojo_preds == h2o_preds).mean()[0, "predict"] == 1
Code Example #9
def pca_arrests():

    print("Importing USArrests.csv data...")
    arrests = h2o.upload_file(
        pyunit_utils.locate("smalldata/pca_test/USArrests.csv"))
    arrests.describe()

    # import from h2o.transforms.decomposition
    for i in range(4):
        print("H2O PCA with " + str(i) + " dimensions:\n")
        print("Using these columns: {0}".format(arrests.names))
        pca_h2o = H2OPCA(k=i + 1)
        pca_h2o.train(x=list(range(4)), training_frame=arrests)
        # TODO: pca_h2o.show()

    # import from h2o.estimators.pca
    for i in range(4):
        print("H2O PCA with " + str(i) + " dimensions:\n")
        print("Using these columns: {0}".format(arrests.names))
        pca_h2o = H2OPrincipalComponentAnalysisEstimator(k=i + 1)
        pca_h2o.train(x=list(range(4)), training_frame=arrests)
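    # Hedged follow-up: the last fitted estimator can also report its loadings and project
    # the original frame onto the retained principal components.
    print(pca_h2o.rotation())           # eigenvector (loading) matrix, one column per PC
    scores = pca_h2o.predict(arrests)   # USArrests rows expressed in PC coordinates
    print(scores.head())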
Code Example #10
def pca_mojo():
    h2o.remove_all()
    NTESTROWS = 200  # number of test dataset rows
    df = pyunit_utils.random_dataset("regression",
                                     ncol_upper=8000,
                                     ncol_lower=5000,
                                     missing_fraction=0.001,
                                     seed=1234)
    train = df[NTESTROWS:, :]
    test = df[:NTESTROWS, :]
    x = df.names
    transform_types = [
        "NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE"
    ]  # pyunit test loop through transform
    for transformN in transform_types:  # compare H2O predict and mojo predict for all dataset transform types
        pcaModel = H2OPrincipalComponentAnalysisEstimator(
            k=3,
            transform=transformN,
            seed=1234,
            impute_missing=True,
            use_all_factor_levels=False)
        pcaModel.train(x=x, training_frame=train)
        pyunit_utils.saveModelMojo(pcaModel)  # save mojo model
        MOJONAME = pyunit_utils.getMojoName(pcaModel._id)
        TMPDIR = os.path.normpath(
            os.path.join(os.path.dirname(os.path.realpath('__file__')), "..",
                         "results", MOJONAME))
        h2o.download_csv(test[x], os.path.join(
            TMPDIR,
            'in.csv'))  # save test file, h2o predict/mojo use same file
        pred_h2o, pred_mojo = pyunit_utils.mojo_predict(
            pcaModel, TMPDIR, MOJONAME)  # save mojo predict

        for col in range(pred_h2o.ncols):
            if pred_h2o[col].isfactor():
                pred_h2o[col] = pred_h2o[col].asnumeric()

        print("Comparing mojo predict and h2o predict...")
        pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 1, tol=1e-10)
Code Example #11
File: R2P_cluster.py  Project: bindu29/example
def kMeans_model(data_temp, Vals, parametersObj, obj_t):

    print('Performing KMeans modelling')
    hf = h2o.H2OFrame(data_temp)

    predictors = list(data_temp.columns)

    # split into train and validation sets
    train, valid = hf.split_frame(ratios=[.8], seed=1234)
    num_data = data_temp.select_dtypes(
        include=['number', 'int', 'float']).copy()
    # try using the `k` parameter:
    # build the model with three clusters
    # initialize the estimator then train the model
    try:
        hf_kmeans = H2OKMeansEstimator(
            max_iterations=int(parametersObj['max_iterations']),
            score_each_iteration=bool(parametersObj['score_each_iteration']),
            ignore_const_cols=bool(parametersObj['ignore_const_cols']),
            k=int(parametersObj['kvalue']),
            max_runtime_secs=int(parametersObj['max_runtime_secs']),
            categorical_encoding=str(parametersObj['categoricalencoding']),
            standardize=bool(parametersObj['standardize']),
            estimate_k=bool(parametersObj['estimate_k']))
        hf_kmeans.train(x=predictors,
                        training_frame=train,
                        validation_frame=valid)

    except Exception:
        hf_kmeans = H2OKMeansEstimator(
            max_iterations=int(parametersObj['max_iterations']),
            score_each_iteration=bool(parametersObj['score_each_iteration']),
            ignore_const_cols=bool(parametersObj['ignore_const_cols']),
            k=int(parametersObj['kvalue']),
            max_runtime_secs=int(parametersObj['max_runtime_secs']),
            categorical_encoding=str(parametersObj['categoricalencoding']),
            standardize=bool(parametersObj['standardize']))
        hf_kmeans.train(x=predictors,
                        training_frame=train,
                        validation_frame=valid)

    clusters = hf_kmeans.predict(hf)
    data_temp['cluster'] = clusters.as_data_frame()
    data_temp['cluster'] = data_temp['cluster'] + 1

    #k = int(parametersObj['kvalue'])
    ab_t = hf_kmeans.centroid_stats()['within_cluster_sum_of_squares']
    within = pd.DataFrame(columns=['withiness'], index=range(len(ab_t)))
    #print(len(within))
    within['withiness'] = hf_kmeans.centroid_stats(
    )['within_cluster_sum_of_squares']
    #print(len(hf_kmeans.centroid_stats()['within_cluster_sum_of_squares']))

    tss = hf_kmeans.totss()
    betweenss = hf_kmeans.betweenss()
    #center = hf_kmeans.centers(train = True)
    size = pd.DataFrame(columns=['cluster_size'], index=range(len(ab_t)))
    size['cluster_size'] = hf_kmeans.centroid_stats()['size']
    ## PCA

    pca = H2OPrincipalComponentAnalysisEstimator(k=2,
                                                 transform="STANDARDIZE",
                                                 pca_method="GLRM",
                                                 use_all_factor_levels=True,
                                                 impute_missing=False,
                                                 max_iterations=300)
    pca.train(training_frame=hf)
    cords = pca.rotation().as_data_frame()
    cords = cords[['pc1', 'pc2']]
    cords.columns = ['PC1', 'PC2']
    print('here')
    num_data = data_temp.select_dtypes(
        include=['int', 'float', 'number']).copy()
    num_data_cols = list(num_data.columns)
    print('here1')

    cat_cols = list(filter(lambda x: x not in num_data_cols,
                           data_temp.columns))
    data_temp1 = data_temp[cat_cols]
    try:
        num_data_cols.remove('cluster')
    except Exception:
        pass  # 'cluster' was not among the numeric columns
    if num_data.shape[1] > 1:
        num_data = num_data[num_data_cols]
        num_data_temp = transformation_inv(num_data, obj_t)
        frames = [num_data_temp, data_temp1, data_temp['cluster']]
        data_temp = pd.concat(frames, axis=1)
        for i in num_data.columns:
            if data_temp[i].dtypes == 'float' or data_temp[i].dtypes == 'int':
                for j in range(len(data_temp[i])):
                    data_temp[i].iloc[j] = round(data_temp[i].iloc[j], 2)
    else:
        frames = [data_temp1, num_data['cluster']]
        data_temp = pd.concat(frames, axis=1)
    print(data_temp.columns)
    print('KMeans done')
    listed = [data_temp, cords, [], within, tss, betweenss, size]
    return listed
Code Example #12
    # region train validate split + select variables
    train_with_target = train_df.cbind(target)
    print("train_with_target: ", train_with_target.shape)

    if para.TRAIN_LABEL == "":
        train_label = train_df.col_names
    else:
        train_label = para.TRAIN_LABEL

    train, test = train_with_target.split_frame(ratios=[1-para.TEST_SIZE])
    # endregion

    # region PCA
    if para.RUN_PCA == 1:
        pca_decomp = H2OPrincipalComponentAnalysisEstimator(k=124,
                                                            transform="Standardize",
                                                            pca_method="Power",
                                                            impute_missing=True)

        pca_decomp.train(training_frame=train_df)
        pred = pca_decomp.predict(train_df)
    # endregion

    # region Model
    print('[{}] Model training starts'.format(util.time_now()))

    # Define model
    model = H2ORandomForestEstimator(ntrees=50, max_depth=20, nfolds=10)

    # Train model
    model.train(x=train_label, y='TARGET', training_frame=train_with_target)
    # endregion
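    # Hedged sketch: the principal-component scores computed in the PCA region above are never
    # fed to a model; if that is the intent, one way to do it (reusing the fragment's own
    # para/pred/target names and the 'TARGET' column) could be:
    if para.RUN_PCA == 1:
        pcs_with_target = pred.cbind(target)
        model_on_pcs = H2ORandomForestEstimator(ntrees=50, max_depth=20, nfolds=10)
        model_on_pcs.train(x=pred.col_names, y='TARGET', training_frame=pcs_with_target)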
Code Example #13
gs.get_grid("logloss", decreasing=False)


#==============================================================================
# integrate with scikit-learn http://scikit-learn.org/stable/modules/pipeline.html
#==============================================================================

from sklearn.pipeline import Pipeline, make_union
from h2o.transforms.preprocessing import H2OScaler
from h2o.estimators.pca import H2OPrincipalComponentAnalysisEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator  # used by the pipeline below


h2o.__PROGRESS_BAR__=True

pipeline = Pipeline([("standardize", H2OScaler()),
                     ("pca", H2OPrincipalComponentAnalysisEstimator(k=2,impute_missing=True)),
                     ("gbm", H2OGradientBoostingEstimator(distribution="multinomial"))])

# The trailing "?" below is IPython/Jupyter introspection syntax (roughly help()); it is not valid plain Python.
pipeline.fit?
H2OPrincipalComponentAnalysisEstimator?

pipeline.fit(X=iris_df[:4], y=iris_df[4])


import sklearn.pipeline
dir(sklearn.pipeline)

sklearn.pipeline.Parallel?
sklearn.pipeline.make_union?

from sklearn.decomposition import PCA, TruncatedSVD
Code Example #14
def test_pca_importance():
    arrestsH2O = h2o.upload_file(
        pyunit_utils.locate("smalldata/pca_test/USArrests.csv"))
    fitH2O = H2OPrincipalComponentAnalysisEstimator(k=4, transform="DEMEAN")
    fitH2O.train(x=list(range(4)), training_frame=arrestsH2O)
    assert fitH2O.varimp()
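    # Hedged follow-up: the assertion only checks that varimp() returns something non-empty;
    # printing it shows what H2O actually reports for a trained PCA model.
    print(fitH2O.varimp())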
Code Example #15
print('XGB ranking')
grid_res = grid.get_grid(sort_by='rmsle', decreasing=True)
print(grid_res)
# Get best XGB model # best so far learn_rate:0.01, ntrees:5000, colsample_bytree:0.81, subsample:0.88999,
# gamma:0.4, max_depth:4, min_child_weight:4
# with RMSLE: 0.12430670916111265
best_mod = grid_res.models[-1]
print('Best XGB:')
print(best_mod.params)


# Tune PCA model
pca_params = {'transform': ['None', 'Standardize', 'Normalize', 'Demean', 'Descale'],
              'k': [i for i in range(1, 10)],
              'max_iterations': [pow(10, i) for i in range(1, 6)]}
pca = H2OPrincipalComponentAnalysisEstimator(seed=seed)
grid = H2OGridSearch(model=pca, hyper_params=pca_params, search_criteria={'strategy': 'Cartesian'})
grid.train(list(set(housing.columns) - {'C1', 'Ids'}), y='SalePrice', training_frame=housing, nfolds=10)
# Get the grid results, sorted
print('PCA ranking')
grid_res = grid.get_grid(sort_by='rmsle', decreasing=True)
print(grid_res)
# Get best PCA model from the grid
best_mod = grid_res.models[-1]
print('Best PCA:')
print(best_mod.params)
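# Hedged follow-up sketch: project the modelling columns onto the selected principal
# components and keep the target alongside them for a downstream model.
pcs = best_mod.predict(housing)                # one column per retained component
housing_pcs = pcs.cbind(housing['SalePrice'])
print(housing_pcs.head())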