Example #1
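# Note: this snippet assumes a running h2o cluster with numpy (np) and the h2o
# estimator classes already imported, and an H2ODeepLearningEstimator trained
# earlier under model_id 'DeepLearn' with keep_cross_validation_predictions=True
# (required of every base model in the stacked ensemble below). `valid` further
# down is assumed to be a held-out H2OFrame.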
DLperf = DeepLearn.model_performance()

RandomForest = H2ORandomForestEstimator(model_id='RandomForest',
                                        ntrees=10,
                                        max_depth=5,
                                        min_rows=10,
                                        seed=1111,
                                        nfolds=5,
                                        binomial_double_trees=True,
                                        keep_cross_validation_predictions=True)
RandomForest.train(x=x, y=y, training_frame=train)
# Eval performance:
RFperf = RandomForest.model_performance()

GradientBoost = H2OGradientBoostingEstimator(model_id = 'GradientBoost',
                                             nfolds=5,
                                             seed=1111,
                                             keep_cross_validation_predictions=True)
GradientBoost.train(x=x, y=y, training_frame=train)
GBperf = GradientBoost.model_performance()


Ensemble = H2OStackedEnsembleEstimator(model_id="Ensemble",
                                       base_models=['DeepLearn', 'RandomForest',
                                                    'GradientBoost'])
Ensemble.train(x=x, y=y, training_frame=train)

Performance = Ensemble.model_performance()


predic = Ensemble.predict(valid).as_data_frame()
yhat = np.array(predic).reshape(-1,1)
Example #2
data_2007['Month'] = data_2007['Month'].asfactor()
data_2007['DayofMonth'] = data_2007['DayofMonth'].asfactor()
data_2007['DayOfWeek'] = data_2007['DayOfWeek'].asfactor()
data_2007['DepDelayed'] = data_2007['DepDelayed'].asfactor()
data_2008['Month'] = data_2008['Month'].asfactor()
data_2008['DayofMonth'] = data_2008['DayofMonth'].asfactor()
data_2008['DayOfWeek'] = data_2008['DayOfWeek'].asfactor()
data_2008['DepDelayed'] = data_2008['DepDelayed'].asfactor()
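# asfactor() converts these numeric columns to categoricals so H2O treats them
# as discrete factor levels rather than continuous values.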

# describe() prints per-column summaries, including missing-value counts
data_2007.describe()

# Create training set and test set with the labels
x_cols = [
    'Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'Distance', 'UniqueCarrier',
    'Origin', 'Dest'
]
y_col = 'DepDelayed'  # train() expects a single column name for y, not a list

# ### Gradient Boosting

gb_model = H2OGradientBoostingEstimator()
gb_model.train(x=x_cols,
               y=y_col,
               training_frame=data_2007,
               validation_frame=data_2008)

# Variable importances of the trained model
gb_model.varimp()
gb_model.download_pojo('/mapr/my.cluster.com/user/*****/airlines')
Example #3
def mojo_predict_csv_test(target_dir):
    mojo_file_name = "prostate_gbm_model.zip"
    mojo_zip_path = os.path.join(target_dir, mojo_file_name)
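    # `pyunit_utils` and the `download_mojo` helper used below come from h2o's
    # test harness and are assumed to be importable in this context.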

    prostate = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))

    r = prostate[0].runif()
    train = prostate[r < 0.70]
    test = prostate[r >= 0.70]

    # Take a single row from the test frame (columns 2 onward)
    pdf = test[1, 2:]
    input_csv = "%s/in.csv" % target_dir
    output_csv = "%s/output.csv" % target_dir
    h2o.export_file(pdf, input_csv)

    # =================================================================
    # Regression
    # =================================================================
    regression_gbm1 = H2OGradientBoostingEstimator(distribution="gaussian")
    regression_gbm1.train(x=[2, 3, 4, 5, 6, 7, 8], y=1, training_frame=train)
    pred_reg = regression_gbm1.predict(pdf)
    contribs_reg = regression_gbm1.predict_contributions(pdf)
    p1 = pred_reg[0, 0]
    print("Regression prediction: " + str(p1))

    download_mojo(regression_gbm1, mojo_zip_path)

    print("\nPerforming Regression Prediction using MOJO @... " + target_dir)
    prediction_result = h2o.mojo_predict_csv(input_csv_path=input_csv,
                                             mojo_zip_path=mojo_zip_path,
                                             output_csv_path=output_csv)
    print("Prediction result: " + str(prediction_result))
    assert p1 == float(
        prediction_result[0]['predict']
    ), "expected predictions to be the same for binary and MOJO model for regression"

    print("\nComparing Regression Contributions using MOJO @... " + target_dir)
    contributions_result = h2o.mojo_predict_csv(input_csv_path=input_csv,
                                                mojo_zip_path=mojo_zip_path,
                                                output_csv_path=output_csv,
                                                predict_contributions=True)
    assert contributions_result is not None
    contributions_pandas = pandas.read_csv(output_csv)
    assert_frame_equal(contribs_reg.as_data_frame(use_pandas=True),
                       contributions_pandas,
                       check_dtype=False)

    # =================================================================
    # Binomial
    # =================================================================
    train[1] = train[1].asfactor()
    bernoulli_gbm1 = H2OGradientBoostingEstimator(distribution="bernoulli")

    bernoulli_gbm1.train(x=[2, 3, 4, 5, 6, 7, 8], y=1, training_frame=train)
    pred_bin = bernoulli_gbm1.predict(pdf)
    contribs_bin = bernoulli_gbm1.predict_contributions(pdf)

    binary_prediction_0 = pred_bin[0, 1]
    binary_prediction_1 = pred_bin[0, 2]
    print("Binomial prediction: p0: " + str(binary_prediction_0))
    print("Binomial prediction: p1: " + str(binary_prediction_1))

    download_mojo(bernoulli_gbm1, mojo_zip_path)

    print("\nPerforming Binomial Prediction using MOJO @... " + target_dir)
    prediction_result = h2o.mojo_predict_csv(input_csv_path=input_csv,
                                             mojo_zip_path=mojo_zip_path,
                                             output_csv_path=output_csv)

    mojo_prediction_0 = float(prediction_result[0]['p0'])
    mojo_prediction_1 = float(prediction_result[0]['p1'])
    print("Binomial prediction: p0: " + str(mojo_prediction_0))
    print("Binomial prediction: p1: " + str(mojo_prediction_1))

    assert abs(
        binary_prediction_0 - mojo_prediction_0
    ) < 1e-15, "expected predictions to be the same for binary and MOJO model for Binomial - p0"
    assert abs(
        binary_prediction_1 - mojo_prediction_1
    ) < 1e-15, "expected predictions to be the same for binary and MOJO model for Binomial - p1"

    print("\nComparing Binary Classification Contributions using MOJO @... " +
          target_dir)
    contributions_bin_result = h2o.mojo_predict_csv(
        input_csv_path=input_csv,
        mojo_zip_path=mojo_zip_path,
        output_csv_path=output_csv,
        predict_contributions=True)
    assert contributions_bin_result is not None
    contributions_bin_pandas = pandas.read_csv(output_csv)
    print(contributions_bin_pandas)
    print(contribs_bin.as_data_frame(use_pandas=True))
    assert_frame_equal(contribs_bin.as_data_frame(use_pandas=True),
                       contributions_bin_pandas,
                       check_dtype=False)

    # =================================================================
    # Multinomial
    # =================================================================
    iris = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris.csv"))

    r = iris[0].runif()
    train = iris[r < 0.90]
    test = iris[r >= 0.90]

    # Take a single row from the test frame (first four columns)
    pdf = test[1, 0:4]
    input_csv = "%s/in-multi.csv" % target_dir
    output_csv = "%s/output.csv" % target_dir
    h2o.export_file(pdf, input_csv)

    multi_gbm = H2OGradientBoostingEstimator()
    multi_gbm.train(x=['C1', 'C2', 'C3', 'C4'], y='C5', training_frame=train)

    pred_multi = multi_gbm.predict(pdf)
    multinomial_prediction_1 = pred_multi[0, 1]
    multinomial_prediction_2 = pred_multi[0, 2]
    multinomial_prediction_3 = pred_multi[0, 3]
    print("Multinomial prediction (Binary): p0: " +
          str(multinomial_prediction_1))
    print("Multinomial prediction (Binary): p1: " +
          str(multinomial_prediction_2))
    print("Multinomial prediction (Binary): p2: " +
          str(multinomial_prediction_3))

    download_mojo(multi_gbm, mojo_zip_path)

    print("\nPerforming Multinomial Prediction using MOJO @... " + target_dir)
    prediction_result = h2o.mojo_predict_csv(input_csv_path=input_csv,
                                             mojo_zip_path=mojo_zip_path,
                                             output_csv_path=output_csv)

    mojo_prediction_1 = float(prediction_result[0]['Iris-setosa'])
    mojo_prediction_2 = float(prediction_result[0]['Iris-versicolor'])
    mojo_prediction_3 = float(prediction_result[0]['Iris-virginica'])
    print("Multinomial prediction (MOJO): p0: " + str(mojo_prediction_1))
    print("Multinomial prediction (MOJO): p1: " + str(mojo_prediction_2))
    print("Multinomial prediction (MOJO): p2: " + str(mojo_prediction_3))

    assert abs(
        multinomial_prediction_1 - mojo_prediction_1
    ) < 1e-15, "expected predictions to be the same for binary and MOJO model for Multinomial - p0"
    assert abs(
        multinomial_prediction_2 - mojo_prediction_2
    ) < 1e-15, "expected predictions to be the same for binary and MOJO model for Multinomial - p1"
    assert abs(
        multinomial_prediction_3 - mojo_prediction_3
    ) < 1e-15, "expected predictions to be the same for binary and MOJO model for Multinomial - p2"
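
# Hypothetical standalone invocation of the test above; assumes a running h2o
# cluster and the pyunit harness (pyunit_utils, download_mojo) on the path.
import tempfile

mojo_predict_csv_test(tempfile.mkdtemp())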
Example #4
# train, test split
train_audio, valid_audio = train_audio.split_frame(ratios=[0.75], seed=1)
#train_audio.shape
no_x_audio = len(train_audio.columns)
x_audio = train_audio.columns[:no_x_audio - 1]
y_audio = train_audio.columns[no_x_audio - 1]
'''
modelling
'''
# model initialization
rf_audio = H2ORandomForestEstimator(seed=12, ntrees=50, max_depth=20,
                                    balance_classes=False, nfolds=5,
                                    stopping_metric='MSE')

gbm_audio = H2OGradientBoostingEstimator(ntrees=50, max_depth=20,
                                         distribution='AUTO', nfolds=5,
                                         stopping_metric='MSE')

# model training
model_audio = rf_audio
model_audio.train(x=x_audio,
                  y=y_audio,
                  training_frame=train_audio,
                  validation_frame=valid_audio)

#model_audio.show()
'''
performance checking
'''
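# `dev_audio` below is assumed to be a held-out H2OFrame prepared earlier in the
# original script.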
dev_pred = model_audio.predict(dev_audio)
#dev_pred2 = model_audio.predict_leaf_node_assignment(dev_audio)
Example #5
def multinomial_auc_prostate_gbm():
    data = h2o.import_file(pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    response_col = "GLEASON"
    data[response_col] = data[response_col].asfactor()
    
    predictors = ["RACE", "AGE", "PSA", "DPROS", "CAPSULE", "VOL", "DCAPS"]
    distribution = "multinomial"

    # train model
    gbm = H2OGradientBoostingEstimator(ntrees=1, max_depth=2, nfolds=3, distribution=distribution, auc_type="WEIGHTED_OVR")
    gbm.train(x=predictors, y=response_col, training_frame=data)

    gbm.show()

    # get result on training data from h2o
    cm = gbm.confusion_matrix(data)
    h2o_auc_table = gbm.multinomial_auc_table(train=True)
    h2o_aucpr_table = gbm.multinomial_aucpr_table(train=True)

    print(cm)
    print(h2o_auc_table.as_data_frame())
    print(h2o_aucpr_table.as_data_frame())
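    # GLEASON has 7 levels here, so the OVR section of the AUC table has 7 per-class
    # rows followed by macro (row 7) and weighted (row 8) aggregates; the OVO section
    # (21 class pairs) ends with macro (row 30) and weighted (row 31). Column 3 holds
    # the metric values, hence the [3][row] indexing below.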

    h2o_ovr_macro_auc = h2o_auc_table[3][7]
    h2o_ovr_weighted_auc = h2o_auc_table[3][8]
    h2o_ovo_macro_auc = h2o_auc_table[3][30]
    h2o_ovo_weighted_auc = h2o_auc_table[3][31]

    h2o_ovr_weighted_aucpr = h2o_aucpr_table[3][8]

    h2o_default_auc = gbm.auc()
    h2o_default_aucpr = gbm.aucpr()

    print("default vs. table AUC "+str(h2o_ovr_weighted_auc)+" "+str(h2o_default_auc))
    print("default vs. table PR AUC "+str(h2o_ovr_weighted_aucpr)+" "+str(h2o_default_aucpr))

    # default should be ovr weighted 
    assert h2o_ovr_weighted_auc == h2o_default_auc, "default vs. table AUC "+str(h2o_ovr_weighted_auc)+" != "+str(h2o_default_auc)
    assert h2o_ovr_weighted_aucpr == h2o_default_aucpr, "default vs. table PR AUC "+str(h2o_ovr_weighted_aucpr)+" != "+str(h2o_default_aucpr)

    # transform data for sklearn
    prediction = gbm.predict(data).as_data_frame().iloc[:,1:]
    actual = data[response_col].as_data_frame().iloc[:, 0].tolist()

    # get result on training data from sklearn
    sklearn_ovr_macro_auc = roc_auc_score(actual, prediction, multi_class="ovr", average='macro')
    sklearn_ovr_weighted_auc = roc_auc_score(actual, prediction, multi_class="ovr", average='weighted')
    sklearn_ovo_macro_auc = roc_auc_score(actual, prediction, multi_class="ovo", average='macro')
    sklearn_ovo_weighted_auc = roc_auc_score(actual, prediction, multi_class="ovo", average='weighted')

    print("sklearn vs. h2o ovr macro:    "+str(sklearn_ovr_macro_auc)+" "+str(h2o_ovr_macro_auc))
    print("sklearn vs. h2o ovr weighted: "+str(sklearn_ovr_weighted_auc)+" "+str(h2o_ovr_weighted_auc))
    print("sklearn vs. h2o ovo macro:    "+str(sklearn_ovo_macro_auc)+" "+str(h2o_ovo_macro_auc))
    print("sklearn vs. h2o ovo weighted: "+str(sklearn_ovo_weighted_auc)+" "+str(h2o_ovo_weighted_auc))

    # compare results h2o vs sklearn
    precision = 1e-7
    assert abs(h2o_ovr_macro_auc - sklearn_ovr_macro_auc) < precision, "sklearn vs. h2o ovr macro: "+str(sklearn_ovr_macro_auc)+" != "+str(h2o_ovr_macro_auc)
    assert abs(h2o_ovr_weighted_auc - sklearn_ovr_weighted_auc) < precision, "sklearn vs. h2o ovr weighted: "+str(sklearn_ovr_weighted_auc)+" != "+str(h2o_ovr_weighted_auc)
    assert abs(h2o_ovo_macro_auc - sklearn_ovo_macro_auc) < precision, "sklearn vs. h2o ovo macro: "+str(sklearn_ovo_macro_auc)+" != "+str(h2o_ovo_macro_auc)
    assert abs(h2o_ovo_weighted_auc - sklearn_ovo_weighted_auc) < precision, "sklearn vs. h2o ovo weighted: "+str(sklearn_ovo_weighted_auc)+" != "+str(h2o_ovo_weighted_auc)

    # set auc_type 
    gbm = H2OGradientBoostingEstimator(ntrees=1, max_depth=2, nfolds=3, distribution=distribution, auc_type="MACRO_OVR")
    gbm.train(x=predictors, y=response_col, training_frame=data, validation_frame=data)

    h2o_auc_table = gbm.multinomial_auc_table(train=True)
    h2o_aucpr_table = gbm.multinomial_aucpr_table(train=True)

    h2o_ovr_macro_auc = h2o_auc_table[3][7]
    h2o_ovr_macro_aucpr = h2o_aucpr_table[3][7]

    h2o_default_auc = gbm.auc()
    h2o_default_aucpr = gbm.aucpr()

    assert abs(h2o_ovr_macro_auc - h2o_default_auc) < precision, "default auc vs. h2o ovr macro auc: "+str(h2o_ovr_macro_auc)+" != "+str(h2o_default_auc)
    assert abs(h2o_ovr_macro_aucpr - h2o_default_aucpr) < precision, "default aucpr vs. h2o ovr macro aucpr: "+str(h2o_ovr_macro_aucpr)+" != "+str(h2o_default_aucpr)

    # test early stopping
    ntrees = 100
    gbm2 = H2OGradientBoostingEstimator(ntrees=ntrees, max_depth=2, nfolds=3, distribution=distribution, score_each_iteration=True, auc_type="MACRO_OVR", stopping_metric="AUC", stopping_rounds=3)
    gbm2.train(x=predictors, y=response_col, training_frame=data, validation_frame=data)
    assert ntrees > gbm2.score_history().shape[0], "Test early stopping: training should stop early."

    # test performance with different auc type
    perf2 = gbm.model_performance(data, auc_type="WEIGHTED_OVO")
    perf2_auc = perf2.auc()
    assert abs(h2o_ovo_weighted_auc - perf2_auc) < precision, "h2o ovo weighted vs. h2o performance ovo weighted: "+str(h2o_ovo_weighted_auc)+" != "+str(perf2_auc)
    
    # test performance with no data when auc_type is set
    ntrees = 2
    gbm3 = H2OGradientBoostingEstimator(ntrees=ntrees, max_depth=2, nfolds=3, distribution=distribution)
    gbm3.train(x=predictors, y=response_col, training_frame=data, validation_frame=data)
    perf3 = gbm3.model_performance(train=True, auc_type="WEIGHTED_OVO")
    perf3_auc = perf3.auc()
    assert perf3_auc == "NaN", "AUC should be \"NaN\" because it is not set in model parameters and test_data is None"
    
    # test aucpr is not in cv summary
    print(gbm._model_json["output"]["cv_scoring_history"][0]._col_header)
    assert not "aucpr" in gbm.cross_validation_metrics_summary()[0], "The aucpr should not be in cross-validation metrics summary."
    assert "pr_auc" in gbm.cross_validation_metrics_summary()[0], "The pr_auc should be in cross-validation metrics summary."
Example #6
# Split frame into two - we use one as the training frame and the second one as the validation frame
splits = crimeWithWeatherHF.split_frame(ratios=[0.8])
train = splits[0]
test = splits[1]

# Prepare column names
predictor_columns = train.drop("Arrest").col_names
response_column = "Arrest"

# Create and train GBM model
from h2o.estimators.gbm import H2OGradientBoostingEstimator

# Prepare model based on the given set of parameters
gbm_model = H2OGradientBoostingEstimator(ntrees=50,
                                         max_depth=3,
                                         learn_rate=0.1,
                                         distribution="bernoulli")

# Train the model
gbm_model.train(x=predictor_columns,
                y=response_column,
                training_frame=train,
                validation_frame=test)

# Create and train deeplearning model
from h2o.estimators.deeplearning import H2ODeepLearningEstimator

# Prepare model based on the given set of parameters
dl_model = H2ODeepLearningEstimator()

# Train the model
dl_model.train(x=predictor_columns,
               y=response_column,
               training_frame=train,
               validation_frame=test)
Example #7
	pipeline = PMMLPipeline([
		("mapper", mapper),
		("uploader", H2OFrameCreator()),
		("classifier", classifier)
	])
	pipeline.fit(audit_X, H2OFrame(audit_y.to_frame(), column_types = ["categorical"]))
	pipeline.verify(audit_X.sample(frac = 0.05, random_state = 13))
	classifier = pipeline._final_estimator
	store_mojo(classifier, name)
	store_pkl(pipeline, name)
	adjusted = pipeline.predict(audit_X)
	adjusted.set_names(["h2o(Adjusted)", "probability(0)", "probability(1)"])
	store_csv(adjusted.as_data_frame(), name)

if "Audit" in datasets and with_h2o:
	build_audit_h2o(H2OGradientBoostingEstimator(distribution = "bernoulli", ntrees = 17), "H2OGradientBoostingAudit")
	build_audit_h2o(H2OGeneralizedLinearEstimator(family = "binomial"), "H2OLogisticRegressionAudit")
	build_audit_h2o(H2ORandomForestEstimator(distribution = "bernoulli", seed = 13), "H2ORandomForestAudit")

audit_dict_X = audit_X.to_dict("records")

def build_audit_dict(classifier, name, with_proba = True):
	pipeline = PMMLPipeline([
		("dict-transformer", DictVectorizer()),
		("classifier", classifier)
	])
	pipeline.fit(audit_dict_X, audit_y)
	store_pkl(pipeline, name)
	adjusted = DataFrame(pipeline.predict(audit_dict_X), columns = ["Adjusted"])
	if with_proba == True:
		adjusted_proba = DataFrame(pipeline.predict_proba(audit_dict_X), columns = ["probability(0)", "probability(1)"])
Example #8
import pandas as pd
import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator

con = h2o.connect(url='http://192.168.5.208:54321/')

csv_data = pd.read_csv('股票数据/处理后数据/processed_601857.csv', encoding='utf8')  # path reads "stock data/processed data/processed_601857.csv"
csv_data['earn'] = csv_data['20_closing_price'] > csv_data['closing_price']*1.2
csv_data_ = h2o.H2OFrame(csv_data)
model = H2OGradientBoostingEstimator(model_id='stock_601857', nfolds=10,
                                     distribution="bernoulli", ntrees=2000, max_depth=10,
                                     learn_rate=0.4, histogram_type="UniformAdaptive",
                                     min_split_improvement=0.000001,
                                     balance_classes=False, seed=52345,
                                     stopping_rounds=5, stopping_metric='AUC', stopping_tolerance=0.001,
                                     col_sample_rate=0.6, col_sample_rate_per_tree=0.6,
                                     col_sample_rate_change_per_level=0.6, sample_rate=0.85, min_rows=100)

training_data, test_data = csv_data_.split_frame(ratios=[0.8], destination_frames=["train_frame", "test_data"])
csv_data.keys()
model.train(x=['closing_price', 'upping_ratio',
       'changing_ratio', 'volume', 'upping_ratio1',
       'upping_ratio2', 'upping_ratio3', 'upping_ratio4', 'upping_ratio5',
       'A_index_closing_price', 'A_index_upping_money', 'A_index_upping_ratio',
       'A_index_volume', 'A_index_volume_money', 'B_index_closing_price',
       'B_index_upping_money', 'B_index_upping_ratio', 'B_index_volume',
       'B_index_volume_money', 'top50_index_closing_price',
       'top50_index_upping_money', 'top50_index_upping_ratio',
       'top50_index_volume', 'top50_index_volume_money',
       'sh_index_closing_price', 'sh_index_upping_money',
Example #9
    cum_list.append(B[k])

train = pd.concat(cum_list)
spm = sp.csr_matrix(train.values)
d = h2o.H2OFrame(spm)
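# `B` and `cum_list` above, and `cat_vars_index` below, are assumed defined earlier
# in the original script; `sp` is scipy.sparse (import scipy.sparse as sp).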
#Turn into categorical
for col in cat_vars_index:
    d[col] = d[col].asfactor()

start = time.time()

#Train base models for stacked ensemble

my_gbm = H2OGradientBoostingEstimator(distribution="bernoulli",
                                      nfolds=10,
                                      ntrees=5,
                                      keep_cross_validation_predictions=True,
                                      seed=1)

my_gbm.train(y=-1, training_frame=d)

my_rf = H2ORandomForestEstimator(nfolds=10,
                                 ntrees=5,
                                 keep_cross_validation_predictions=True,
                                 seed=1)
my_rf.train(y=-1, training_frame=d)

# Train a stacked ensemble using the GBM and RF above
ensemble = H2OStackedEnsembleEstimator(base_models=[my_gbm, my_rf])

ensemble.train(y=-1, training_frame=d)
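
# The snippet stops after training; as a minimal illustration, score the ensemble
# on the training frame itself (a separate test frame would normally be used):
perf = ensemble.model_performance(d)
print(perf.auc())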
Example #10
    def test_gbm_grid_search_over_params(self):
        """
        test_gbm_grid_search_over_params: test for condition 1 and performs the following:
        a. Grab all truly griddable parameters and randomly or manually set the parameter values.
        b. Next, build H2O GBM models using grid search.  Count and make sure models
           are only built for hyper-parameters set to legal values.  No model should be built for bad
           hyper-parameter values; we should instead get a warning/error message printed out.
        c. For each model built using grid search, extract the parameters used in building
           that model and manually build an H2O GBM model.  MSEs are calculated from a test set
           to compare the performance of the grid search model and our manually built model.  If their
           MSEs are close, declare test success.  Otherwise, declare test failure.
        d. Check and make sure the models are built within the max_runtime_secs time limit that was
           set for them as well.  If max_runtime_secs was exceeded, declare test failure.
        """

        print(
            "*******************************************************************************************"
        )
        print("test_gbm_grid_search_over_params for GBM " + self.family)
        h2o.cluster_info()

        try:
            print("Hyper-parameters used here is {0}".format(
                self.final_hyper_params))

            # start grid search
            grid_model = H2OGridSearch(H2OGradientBoostingEstimator(
                distribution=self.family, nfolds=self.nfolds, seed=self.seed),
                                       hyper_params=self.final_hyper_params)
            grid_model.train(x=self.x_indices,
                             y=self.y_index,
                             training_frame=self.training1_data)

            self.correct_model_number = len(
                grid_model)  # store number of models built

            # make sure the correct number of models are built by gridsearch
            if not (self.correct_model_number
                    == self.possible_number_models):  # wrong grid model number
                self.test_failed += 1
                print(
                    "test_gbm_grid_search_over_params for GBM failed: number of models built by gridsearch "
                    "does not equal to all possible combinations of hyper-parameters"
                )
            else:
                # add parameters into params_dict.  Use this to manually build model
                params_dict = dict()
                params_dict["distribution"] = self.family
                params_dict["nfolds"] = self.nfolds
                params_dict["seed"] = self.seed
                total_run_time_limits = 0.0  # calculate upper bound of max_runtime_secs
                true_run_time_limits = 0.0
                manual_run_runtime = 0.0

                # compare MSE performance of model built by gridsearch with manually built model
                for each_model in grid_model:

                    params_list = grid_model.get_hyperparams_dict(
                        each_model._id)
                    params_list.update(params_dict)

                    model_params = dict()

                    # max_runtime_secs must be taken out of the model parameters; it is now set in .train()
                    if "max_runtime_secs" in params_list:
                        model_params["max_runtime_secs"] = params_list[
                            "max_runtime_secs"]
                        max_runtime = params_list["max_runtime_secs"]
                        del params_list["max_runtime_secs"]
                    else:
                        max_runtime = 0

                    if "r2_stopping" in params_list:
                        model_params["r2_stopping"] = params_list[
                            "r2_stopping"]
                        del params_list["r2_stopping"]

                    if "validation_frame" in params_list:
                        model_params["validation_frame"] = params_list[
                            "validation_frame"]
                        del params_list["validation_frame"]

                    if "learn_rate_annealing" in params_list:
                        model_params["learn_rate_annealing"] = params_list[
                            "learn_rate_annealing"]
                        del params_list["learn_rate_annealing"]

                    # make sure manual model was provided the same max_runtime_secs as the grid model
                    each_model_runtime = pyunit_utils.find_grid_runtime(
                        [each_model])

                    manual_model = H2OGradientBoostingEstimator(**params_list)
                    manual_model.train(x=self.x_indices,
                                       y=self.y_index,
                                       training_frame=self.training1_data,
                                       **model_params)

                    # accumulate the time taken to manually build all models
                    model_runtime = pyunit_utils.find_grid_runtime(
                        [manual_model])  # time taken to build this model
                    manual_run_runtime += model_runtime

                    summary_list = manual_model._model_json['output'][
                        'model_summary']
                    tree_num = summary_list.cell_values[0][
                        summary_list.col_header.index('number_of_trees')]

                    if max_runtime > 0:
                        # shortest possible time it takes to build this model
                        if (max_runtime <
                                self.min_runtime_per_tree) or (tree_num <= 1):
                            total_run_time_limits += model_runtime
                        else:
                            total_run_time_limits += max_runtime

                    true_run_time_limits += max_runtime

                    # compute and compare test metrics between the two models
                    test_grid_model_metrics = each_model.model_performance(
                    )._metric_json[self.training_metric]
                    test_manual_model_metrics = manual_model.model_performance(
                    )._metric_json[self.training_metric]

                    # just compare the mse in this case within tolerance:
                    if (each_model_runtime > 0) and \
                            (abs(model_runtime - each_model_runtime)/each_model_runtime < self.allowed_runtime_diff) \
                            and (abs(test_grid_model_metrics - test_manual_model_metrics) > self.allowed_diff):
                        #                        self.test_failed += 1             # count total number of tests that have failed
                        print(
                            "test_gbm_grid_search_over_params for GBM warning: grid search model mdetric ({0}) and "
                            "manually built H2O model metric ({1}) differ too much"
                            "!".format(test_grid_model_metrics,
                                       test_manual_model_metrics))

                total_run_time_limits = max(
                    total_run_time_limits,
                    true_run_time_limits) * (1 + self.extra_time_fraction)

                # make sure the max_runtime_secs is working to restrict model built time
                if not (manual_run_runtime <= total_run_time_limits):
                    self.test_failed += 1
                    print(
                        "test_gbm_grid_search_over_params for GBM failed: time taken to manually build models is {0}."
                        "  Maximum allowed time is {1}".format(
                            manual_run_runtime, total_run_time_limits))
                else:
                    print(
                        "time taken to manually build all models is {0}. Maximum allowed time is "
                        "{1}".format(manual_run_runtime,
                                     total_run_time_limits))

                if self.test_failed == 0:
                    print(
                        "test_gbm_grid_search_over_params for GBM has passed!")
        except:
            if self.possible_number_models > 0:
                print(
                    "test_gbm_grid_search_over_params for GBM failed: exception was thrown for no reason."
                )
                self.test_failed += 1
Example #11
def stackedensemble_metalearner_test():
    """This test checks the following:
    1) That H2OStackedEnsembleEstimator `metalearner_nfolds` works correctly
    2) That H2OStackedEnsembleEstimator `metalearner_nfolds` works in concert with `metalearner_nfolds`
    """

    # Import training set
    train = h2o.import_file(path=pyunit_utils.locate("smalldata/testng/higgs_train_5k.csv"),
                            destination_frame="higgs_train_5k")
    test = h2o.import_file(path=pyunit_utils.locate("smalldata/testng/higgs_test_5k.csv"),
                            destination_frame="higgs_test_5k")

    # Identify predictors and response
    x = train.columns
    y = "response"
    x.remove(y)

    # Convert response to a factor
    train[y] = train[y].asfactor()
    test[y] = test[y].asfactor()

    # Set number of folds for base learners
    nfolds = 3

    # Train and cross-validate a GBM
    my_gbm = H2OGradientBoostingEstimator(distribution="bernoulli",
                                          ntrees=10,
                                          nfolds=nfolds,
                                          fold_assignment="Modulo",
                                          keep_cross_validation_predictions=True,
                                          seed=1)
    my_gbm.train(x=x, y=y, training_frame=train)

    # Train and cross-validate a RF
    my_rf = H2ORandomForestEstimator(ntrees=50,
                                     nfolds=nfolds,
                                     fold_assignment="Modulo",
                                     keep_cross_validation_predictions=True,
                                     seed=1)
    my_rf.train(x=x, y=y, training_frame=train)


    def train_ensemble_using_metalearner(algo, expected_algo):
        print("Training ensemble using {} metalearner.".format(algo))

        meta_params = dict(metalearner_nfolds=3)

        se = H2OStackedEnsembleEstimator(base_models=[my_gbm, my_rf], metalearner_algorithm=algo, **meta_params)
        se.train(x=x, y=y, training_frame=train)
        assert(se.params['metalearner_algorithm']['actual'] == expected_algo)
        if meta_params:
            assert(se.params['metalearner_nfolds']['actual'] == 3)

        meta = h2o.get_model(se.metalearner()['name'])
        assert(meta.algo == expected_algo), "Expected that the metalearner would use {}, but actually used {}.".format(expected_algo, meta.algo)
        if meta_params:
            assert(meta.params['nfolds']['actual'] == 3)

    metalearner_algos = ['AUTO', 'deeplearning', 'drf', 'gbm', 'glm', 'naivebayes', 'xgboost']
    for algo in metalearner_algos:
        expected_algo = 'glm' if algo == 'AUTO' else algo
        train_ensemble_using_metalearner(algo, expected_algo)
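
# Functions like this are normally run through h2o's pyunit harness; the usual
# entry point looks like this:
if __name__ == "__main__":
    pyunit_utils.standalone_test(stackedensemble_metalearner_test)
else:
    stackedensemble_metalearner_test()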
Example #12
    def setup_model(self):
        """
        This function sets up the gridsearch hyper-parameters that will be used later on:

        1. It will first try to grab all the parameters that are griddable and parameters used by GBM.
        2. It will find the intersection of parameters that are both griddable and used by GBM.
        3. There are several extra parameters used by GBM that are denoted as griddable but actually are not.
           These have to be discovered manually and are captured in self.exclude_parameter_lists.
        4. We generate the gridsearch hyper-parameters.  For numerical parameters, we generate them randomly.
           For enums, we include all of them.

        :return: None
        """
        # build bare bone model to get all parameters
        model = H2OGradientBoostingEstimator(distribution=self.family,
                                             seed=self.seed,
                                             nfolds=self.nfolds)
        model.train(x=self.x_indices,
                    y=self.y_index,
                    training_frame=self.training1_data)

        self.model_run_time = pyunit_utils.find_grid_runtime(
            [model])  # find model train time
        print("Time taken to build a base barebone model is {0}".format(
            self.model_run_time))

        summary_list = model._model_json["output"]["model_summary"]
        num_trees = summary_list.cell_values[0][summary_list.col_header.index(
            'number_of_trees')]

        if num_trees == 0:
            self.min_runtime_per_tree = self.model_run_time
        else:
            self.min_runtime_per_tree = self.model_run_time / num_trees

        # grab all griddable parameters and their types
        (self.gridable_parameters, self.gridable_types, self.gridable_defaults) = \
            pyunit_utils.get_gridables(model._model_json["parameters"])

        # randomly generate griddable parameter values, including values outside the legal range
        # (e.g., alpha values outside the legal range of 0 to 1)
        (self.hyper_params, self.gridable_parameters, self.gridable_types, self.gridable_defaults) = \
            pyunit_utils.gen_grid_search(model.full_parameters.keys(), self.hyper_params,
                                         self.exclude_parameter_lists,
                                         self.gridable_parameters, self.gridable_types, self.gridable_defaults,
                                         random.randint(1, self.max_int_number),
                                         self.max_int_val, self.min_int_val,
                                         random.randint(1, self.max_real_number),
                                         self.max_real_val, self.min_real_val)

        # scale the max_runtime_secs parameters
        time_scale = self.time_scale * self.model_run_time
        if "max_runtime_secs" in list(self.hyper_params):
            self.hyper_params["max_runtime_secs"] = [
                time_scale * x for x in self.hyper_params["max_runtime_secs"]
            ]

        # generate a new final_hyper_params which only takes a subset of all griddable parameters while
        # hyper_params take all griddable parameters and generate the grid search hyper-parameters
        [self.possible_number_models, self.final_hyper_params] = \
            pyunit_utils.check_and_count_models(self.hyper_params, self.params_zero_one, self.params_more_than_zero,
                                                self.params_more_than_one, self.params_zero_positive,
                                                self.max_grid_model)

        # must add max_runtime_secs to restrict unit test run time and as a promise to Arno to test for this
        if ("max_runtime_secs" not in list(self.final_hyper_params)) and \
                ("max_runtime_secs" in list(self.hyper_params)):
            self.final_hyper_params["max_runtime_secs"] = self.hyper_params[
                "max_runtime_secs"]
            len_good_time = len(
                [x for x in self.hyper_params["max_runtime_secs"] if (x >= 0)])
            self.possible_number_models = self.possible_number_models * len_good_time

        if "fold_assignment" in list(self.final_hyper_params):
            self.possible_number_models = self.possible_number_models * self.scale_model

        # write out the hyper-parameters used into json files.
        pyunit_utils.write_hyper_parameters_json(self.current_dir,
                                                 self.sandbox_dir,
                                                 self.json_filename,
                                                 self.final_hyper_params)
Example #13
data = recode_cc_data(data)
  
data[y] = data[y].asfactor()
data.describe()


train, test = data.split_frame([0.7], seed=12345)

# summarize split
print('Train data rows = %d, columns = %d' % (train.shape[0], train.shape[1]))
print('Test data rows = %d, columns = %d' % (test.shape[0], test.shape[1]))  

model = H2OGradientBoostingEstimator(ntrees=150,            # maximum 150 trees in GBM
                                     max_depth=4,           # trees can have maximum depth of 4
                                     sample_rate=0.9,       # use 90% of rows in each iteration (tree)
                                     col_sample_rate=0.9,   # use 90% of variables in each iteration (tree)
                                     stopping_rounds=5,     # stop if validation error does not decrease for 5 iterations (trees)
                                     score_tree_interval=1, # for reproducibility, set higher for bigger data
                                     seed=12345)            # random seed for reproducibility

# train a GBM model
model.train(y=y, x=X, training_frame=train, validation_frame=test)

# print AUC
print('GBM Test AUC = %.2f' % model.auc(valid=True))

row = test[test['ID'] == 29116]
row

def generate_local_sample(row, frame, X, N=1000):
Example #14
        epochs=100)

model_dl.train(
        x= features, 
        y="loan_status", 
        training_frame=train_split, 
        validation_frame=valid_split)
model_dl.params
print(model_dl)



#GBM
from h2o.estimators.gbm import H2OGradientBoostingEstimator
model_gbm = H2OGradientBoostingEstimator(distribution='bernoulli',
                                    ntrees=100,
                                    max_depth=4,
                                    learn_rate=0.1)
model_gbm.train(x=features, y="loan_status", training_frame=train_split, validation_frame=valid_split)
print(model_gbm)



#GBM with cross validation
cvmodel = H2OGradientBoostingEstimator(distribution='bernoulli',
                                       ntrees=100,
                                       max_depth=4,
                                       learn_rate=0.1,
                                       nfolds=5)
cvmodel.train(x=features, y="loan_status", training_frame=train)
print(cvmodel)
Example #15
def metric_json_check():
    df = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))

    # Regression metric json
    reg_mod = H2OGradientBoostingEstimator(distribution="gaussian")
    reg_mod.train(x=list(range(3,df.ncol)), y="CAPSULE", training_frame=df)
    reg_met = reg_mod.model_performance()
    reg_metric_json_keys_have = list(reg_met._metric_json.keys())
    reg_metric_json_keys_desired = [u'model_category',
                                    u'description',
                                    u'r2',
                                    u'frame',
                                    u'model_checksum',
                                    u'MSE',
                                    u'RMSE',
                                    u'mae',
                                    u'rmsle',
                                    u'__meta',
                                    u'_exclude_fields',
                                    u'scoring_time',
                                    u'predictions',
                                    u'model',
                                    u'duration_in_ms',
                                    u'frame_checksum',
                                    u'nobs',
                                    u'mean_residual_deviance',
                                    u'custom_metric_name',
                                    u'custom_metric_value']
    reg_metric_diff = list(set(reg_metric_json_keys_have) - set(reg_metric_json_keys_desired))
    assert not reg_metric_diff, "There's a difference between the current ({0}) and the desired ({1}) regression " \
                                "metric json. The difference is {2}".format(reg_metric_json_keys_have,
                                                                            reg_metric_json_keys_desired,
                                                                            reg_metric_diff)
    # Regression metric json (GLM)
    reg_mod = H2OGeneralizedLinearEstimator(family="gaussian")
    reg_mod.train(x=list(range(3,df.ncol)), y="CAPSULE", training_frame=df)
    reg_met = reg_mod.model_performance()
    reg_metric_json_keys_have = list(reg_met._metric_json.keys())
    reg_metric_json_keys_desired = [u'model_category',
                                    u'description',
                                    u'r2',
                                    u'residual_degrees_of_freedom',
                                    u'frame',
                                    u'model_checksum',
                                    u'MSE',
                                    u'RMSE',
                                    u'mae',
                                    u'rmsle',
                                    u'__meta',
                                    u'_exclude_fields',
                                    u'null_deviance',
                                    u'scoring_time',
                                    u'null_degrees_of_freedom',
                                    u'predictions',
                                    u'AIC',
                                    u'model',
                                    u'duration_in_ms',
                                    u'frame_checksum',
                                    u'nobs',
                                    u'residual_deviance',
                                    u'mean_residual_deviance',
                                    u'custom_metric_name',
                                    u'custom_metric_value']
    reg_metric_diff = list(set(reg_metric_json_keys_have) - set(reg_metric_json_keys_desired))
    assert not reg_metric_diff, "There's a difference between the current ({0}) and the desired ({1}) glm-regression " \
                                "metric json. The difference is {2}".format(reg_metric_json_keys_have,
                                                                            reg_metric_json_keys_desired,
                                                                            reg_metric_diff)

    # Binomial metric json
    bin_mod = H2OGradientBoostingEstimator(distribution="bernoulli")
    df["CAPSULE"] = df["CAPSULE"].asfactor()
    bin_mod.train(x=list(range(3,df.ncol)), y="CAPSULE", training_frame=df)
    bin_met = bin_mod.model_performance()
    bin_metric_json_keys_have = list(bin_met._metric_json.keys())
    bin_metric_json_keys_desired = [u'AUC',
                                    u'Gini',
                                    u'model_category',
                                    u'description',
                                    u'mean_per_class_error',
                                    u'r2',
                                    u'frame',
                                    u'model_checksum',
                                    u'MSE',
                                    u'RMSE',
                                    u'__meta',
                                    u'_exclude_fields',
                                    u'gains_lift_table',
                                    u'logloss',
                                    u'scoring_time',
                                    u'thresholds_and_metric_scores',
                                    u'predictions',
                                    u'max_criteria_and_metric_scores',
                                    u'model',
                                    u'duration_in_ms',
                                    u'frame_checksum',
                                    u'nobs',
                                    u'domain',
                                    u'custom_metric_name',
                                    u'custom_metric_value',
                                    u'pr_auc']
    bin_metric_diff = list(set(bin_metric_json_keys_have) - set(bin_metric_json_keys_desired))
    assert not bin_metric_diff, "There's a difference between the current ({0}) and the desired ({1}) binomial " \
                                "metric json. The difference is {2}".format(bin_metric_json_keys_have,
                                                                            bin_metric_json_keys_desired,
                                                                            bin_metric_diff)

    # Binomial metric json (GLM)
    bin_mod = H2OGeneralizedLinearEstimator(family="binomial")
    bin_mod.train(x=list(range(3,df.ncol)), y="CAPSULE", training_frame=df)
    bin_met = bin_mod.model_performance()
    bin_metric_json_keys_have = list(bin_met._metric_json.keys())
    bin_metric_json_keys_desired = [u'frame',
                                    u'residual_deviance',
                                    u'max_criteria_and_metric_scores',
                                    u'MSE',
                                    u'RMSE',
                                    u'frame_checksum',
                                    u'nobs',
                                    u'AIC',
                                    u'logloss',
                                    u'Gini',
                                    u'predictions',
                                    u'AUC',
                                    u'description',
                                    u'mean_per_class_error',
                                    u'model_checksum',
                                    u'duration_in_ms',
                                    u'model_category',
                                    u'gains_lift_table',
                                    u'r2',
                                    u'residual_degrees_of_freedom',
                                    u'__meta',
                                    u'_exclude_fields',
                                    u'null_deviance',
                                    u'scoring_time',
                                    u'null_degrees_of_freedom',
                                    u'model',
                                    u'thresholds_and_metric_scores',
                                    u'domain',
                                    u'custom_metric_name',
                                    u'custom_metric_value',
                                    u'pr_auc']
    bin_metric_diff = list(set(bin_metric_json_keys_have) - set(bin_metric_json_keys_desired))
    assert not bin_metric_diff, "There's a difference between the current ({0}) and the desired ({1}) glm-binomial " \
                                "metric json. The difference is {2}".format(bin_metric_json_keys_have,
                                                                            bin_metric_json_keys_desired,
                                                                            bin_metric_diff)

    # Multinomial metric json
    df = h2o.import_file(path=pyunit_utils.locate("smalldata/airlines/AirlinesTrain.csv.zip"))
    myX = ["Origin", "Dest", "IsDepDelayed", "UniqueCarrier", "Distance", "fDayofMonth", "fDayOfWeek"]
    myY = "fYear"

    mul_mod = H2OGradientBoostingEstimator(distribution="multinomial")
    mul_mod.train(x=myX, y=myY, training_frame=df)
    mul_met = mul_mod.model_performance()
    mul_metric_json_keys_have = list(mul_met._metric_json.keys())
    mul_metric_json_keys_desired = [u'cm',
                                    u'model_category',
                                    u'description',
                                    u'mean_per_class_error',
                                    u'r2',
                                    u'frame',
                                    u'nobs',
                                    u'model_checksum',
                                    u'MSE',
                                    u'RMSE',
                                    u'__meta',
                                    u'_exclude_fields',
                                    u'logloss',
                                    u'scoring_time',
                                    u'predictions',
                                    u'hit_ratio_table',
                                    u'model',
                                    u'duration_in_ms',
                                    u'frame_checksum',
                                    u'custom_metric_name',
                                    u'custom_metric_value']
    mul_metric_diff = list(set(mul_metric_json_keys_have) - set(mul_metric_json_keys_desired))
    assert not mul_metric_diff, "There's a difference between the current ({0}) and the desired ({1}) multinomial " \
                                "metric json. The difference is {2}".format(mul_metric_json_keys_have,
                                                                            mul_metric_json_keys_desired,
                                                                            mul_metric_diff)

    # Clustering metric json
    df = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris.csv"))
    from h2o.estimators.kmeans import H2OKMeansEstimator
    clus_mod = H2OKMeansEstimator(k=3, standardize=False)
    clus_mod.train(x=list(range(4)), training_frame=df)
    clus_met = clus_mod.model_performance()
    clus_metric_json_keys_have = list(clus_met._metric_json.keys())
    clus_metric_json_keys_desired = [u'tot_withinss',
                                     u'model_category',
                                     u'description',
                                     u'frame',
                                     u'model_checksum',
                                     u'MSE',
                                     u'RMSE',
                                     u'__meta',
                                     u'_exclude_fields',
                                     u'scoring_time',
                                     u'betweenss',
                                     u'predictions',
                                     u'totss',
                                     u'model',
                                     u'duration_in_ms',
                                     u'frame_checksum',
                                     u'nobs',
                                     u'centroid_stats',
                                     u'custom_metric_name',
                                     u'custom_metric_value']
    clus_metric_diff = list(set(clus_metric_json_keys_have) - set(clus_metric_json_keys_desired))
    assert not clus_metric_diff, "There's a difference between the current ({0}) and the desired ({1}) clustering " \
                                "metric json. The difference is {2}".format(clus_metric_json_keys_have,
                                                                            clus_metric_json_keys_desired,
                                                                            clus_metric_diff)
Example #16
def algo_max_runtime_secs():
    '''
    This pyunit test is written to ensure that the max_runtime_secs can restrict the model training time for all
    h2o algos.  See PUBDEV-4702.
    '''
    global model_within_max_runtime
    global err_bound
    seed = 12345
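    # `grabRuntimeInfo` and `cleanUp` are helpers defined elsewhere in this pyunit
    # test; `model_within_max_runtime` collects a failure flag for each algo that
    # exceeded its max_runtime_secs budget.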

    # GLRM: it does not make sense to stop in the middle of an iteration
    training1_data = h2o.import_file(
        path=pyunit_utils.locate("smalldata/gridsearch/glrmdata1000x25.csv"))
    x_indices = list(range(training1_data.ncol))
    model = H2OGeneralizedLowRankEstimator(k=10,
                                           loss="Quadratic",
                                           gamma_x=0.3,
                                           gamma_y=0.3,
                                           transform="STANDARDIZE",
                                           seed=seed)
    grabRuntimeInfo(err_bound, 2.0, model, training1_data, x_indices)
    cleanUp([training1_data, model])

    # deeplearning
    training1_data = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/gridsearch/gaussian_training1_set.csv"))
    y_index = training1_data.ncol - 1
    x_indices = list(range(y_index))
    model = H2ODeepLearningEstimator(distribution='gaussian',
                                     seed=seed,
                                     hidden=[10, 10, 10])
    grabRuntimeInfo(err_bound, 2.0, model, training1_data, x_indices, y_index)
    cleanUp([training1_data, model])

    # stacked ensemble: the stacking part is not iterative
    print(
        "******************** Skip testing stack ensemble.  Not an iterative algo."
    )

    # GBM run
    training1_data = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/gridsearch/multinomial_training1_set.csv"))
    y_index = training1_data.ncol - 1
    x_indices = list(range(y_index))
    training1_data[y_index] = training1_data[y_index].round().asfactor()
    model = H2OGradientBoostingEstimator(distribution="multinomial", seed=seed)
    grabRuntimeInfo(err_bound, 2.0, model, training1_data, x_indices, y_index)
    cleanUp([model])

    # GLM run
    model = H2OGeneralizedLinearEstimator(family='multinomial', seed=seed)
    grabRuntimeInfo(err_bound, 2.0, model, training1_data, x_indices, y_index)
    cleanUp([model])

    # Naive Bayes, not iterative
    print(
        "******************** Skip testing Naive Bayes.  Not an iterative algo."
    )

    # random forest
    model = H2ORandomForestEstimator(ntrees=100,
                                     score_tree_interval=0,
                                     seed=seed)
    grabRuntimeInfo(err_bound, 2.0, model, training1_data, x_indices)
    cleanUp([model, training1_data])

    # PCA
    training1_data = h2o.import_file(
        path=pyunit_utils.locate("smalldata/gridsearch/pca1000by25.csv"))
    x_indices = list(range(training1_data.ncol))
    model = H2OPCA(k=10,
                   transform="STANDARDIZE",
                   pca_method="Power",
                   compute_metrics=True,
                   seed=seed)
    grabRuntimeInfo(err_bound * 5, 2, model, training1_data, x_indices)
    cleanUp([training1_data, model])

    # kmeans
    training1_data = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/gridsearch/kmeans_8_centers_3_coords.csv"))
    x_indices = list(range(training1_data.ncol))
    model = H2OKMeansEstimator(k=10, seed=seed)
    grabRuntimeInfo(err_bound * 2, 2.5, model, training1_data, x_indices)
    cleanUp([training1_data, model])

    # word2vec
    train = h2o.import_file(pyunit_utils.locate("bigdata/laptop/text8.gz"),
                            header=1,
                            col_types=["string"])
    used = train[0:170000, 0]
    w2v_model = H2OWord2vecEstimator()
    grabRuntimeInfo(err_bound, 2.0, w2v_model, used, [], 0)
    cleanUp([train, used, w2v_model])

    if sum(model_within_max_runtime) > 0:
        sys.exit(1)
Example #17
# Now, train the GBM model:
from h2o.estimators.gbm import H2OGradientBoostingEstimator

# Load the data and prepare for modeling
airlines_hex = h2o.import_file("http://h2o-public-test-data.s3.amazonaws.com/smalldata/airlines/allyears2k_headers.zip")

# Generate random numbers and create training, validation, testing splits
r = airlines_hex.runif()   # Random UNIForm numbers, one per row
air_train_hex = airlines_hex[r  < 0.6]
air_valid_hex = airlines_hex[(r >= 0.6) & (r < 0.9)]
air_test_hex  = airlines_hex[r  >= 0.9]

myX = ["DayofMonth", "DayOfWeek"]

air_model = H2OGradientBoostingEstimator(
               distribution='bernoulli', ntrees=100,
               max_depth=4, learn_rate=0.1)
air_model.train(x=myX, y="IsDepDelayed",
                training_frame=air_train_hex)
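
# The snippet above never scores the validation/test splits it created; a hedged
# follow-up to evaluate the trained model:
print(air_model.model_performance(air_valid_hex).auc())  # validation AUC
print(air_model.model_performance(air_test_hex).auc())   # held-out test AUC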
Example #18
def cars_checkpoint():

    cars = h2o.upload_file(
        pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    s = cars.runif()
    train = cars[s > .2]
    valid = cars[s <= .2]

    print("\n*** Description (chunk distribution, etc) of training frame:")
    train.describe()
    print("\n*** Description (chunk distribution, etc) of validation frame:")
    valid.describe()

    # choose the type of model-building exercise (regression or classification).
    # 0: regression, 1: binomial, 2: multinomial
    problem = random.sample(list(range(3)), 1)[0]

    # pick the predictors and response column, along with the correct distribution
    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    if problem == 1:
        response_col = "economy_20mpg"
        distribution = "bernoulli"
        train[response_col] = train[response_col].asfactor()
        valid[response_col] = valid[response_col].asfactor()
    elif problem == 2:
        response_col = "cylinders"
        distribution = "multinomial"
        train[response_col] = train[response_col].asfactor()
        valid[response_col] = valid[response_col].asfactor()
    else:
        response_col = "economy"
        distribution = "gaussian"

    print("\n*** Distribution: {0}".format(distribution))
    print("\n*** Response column: {0}".format(response_col))

    # build first model
    ntrees1 = 5
    max_depth1 = random.sample(list(range(2, 6)), 1)[0]
    min_rows1 = random.sample(list(range(10, 16)), 1)[0]
    print("\n*** Building model 1 with the following parameters:")
    print("*** ntrees model 1: {0}".format(ntrees1))
    print("*** max_depth model 1: {0}".format(max_depth1))
    print("*** min_rows model 1: {0}".format(min_rows1))

    from h2o.estimators.gbm import H2OGradientBoostingEstimator
    model1 = H2OGradientBoostingEstimator(ntrees=ntrees1,
                                          max_depth=max_depth1,
                                          min_rows=min_rows1,
                                          score_each_iteration=True,
                                          distribution=distribution)
    model1.train(x=predictors,
                 y=response_col,
                 training_frame=train,
                 validation_frame=valid)

    # save the model, then load the model
    model_path = h2o.save_model(model1, path="delete_model", force=True)
    restored_model = h2o.load_model(model_path)
    shutil.rmtree("delete_model")

    # continue building the model
    ntrees2 = ntrees1 + 5
    max_depth2 = max_depth1
    min_rows2 = min_rows1
    print(
        "\n*** Continuing to build model 1 (now called model 2) with the following parameters:"
    )
    print("*** ntrees model 2: {0}".format(ntrees2))
    print("*** max_depth model 2: {0}".format(max_depth2))
    print("*** min_rows model 2: {0}".format(min_rows2))

    model2 = H2OGradientBoostingEstimator(ntrees=ntrees2,
                                          max_depth=max_depth2,
                                          min_rows=min_rows2,
                                          distribution=distribution,
                                          score_each_iteration=True,
                                          checkpoint=restored_model._id)
    model2.train(x=predictors,
                 y=response_col,
                 training_frame=train,
                 validation_frame=valid)

    # continue building the model, but with different number of trees
    ntrees3 = ntrees2 + 50
    max_depth3 = max_depth1
    min_rows3 = min_rows1
    print(
        "\n*** Continuing to build model 1 (now called model 3) with the following parameters:"
    )
    print("*** ntrees model 3: {0}".format(ntrees3))
    print("*** max_depth model 3: {0}".format(max_depth3))
    print("*** min_rows model 3: {0}".format(min_rows3))

    model3 = H2OGradientBoostingEstimator(ntrees=ntrees3,
                                          max_depth=max_depth3,
                                          min_rows=min_rows3,
                                          distribution=distribution,
                                          score_each_iteration=True,
                                          checkpoint=restored_model._id)
    model3.train(x=predictors,
                 y=response_col,
                 training_frame=train,
                 validation_frame=valid)

    # build the equivalent of model 2 in one shot
    print(
        "\n*** Building the equivalent of model 2 (called model 4) in one shot:"
    )

    model4 = H2OGradientBoostingEstimator(ntrees=ntrees2,
                                          max_depth=max_depth2,
                                          min_rows=min_rows2,
                                          distribution=distribution,
                                          score_each_iteration=True)
    model4.train(x=predictors,
                 y=response_col,
                 training_frame=train,
                 validation_frame=valid)

    print("\n*** Model Summary for model 2:")
    print(model2.summary())
    print("\n*** Model Summary for model 3:")
    print(model3.summary())
    print("\n*** Model Summary for model 4:")
    print(model4.summary())

    print("\n*** Score History for model 2:")
    print(model2.scoring_history())
    print("\n*** Score History for model 3:")
    print(model3.scoring_history())
    print("\n*** Score History for model 4:")
    print(model4.scoring_history())

    # checks
    if problem == 0:
        assert isinstance(model2, type(model4))
        assert model2.mse(valid=True) == model4.mse(
            valid=True
        ), "Expected Model 2 MSE: {0} to be the same as Model 4 MSE: {1}".format(
            model2.mse(valid=True), model4.mse(valid=True))
        #assert model3.mse(valid=True)!=model4.mse(valid=True), "Expected Model 3 MSE: {0} to be different from Model 4 MSE: {1}".format(model3.mse(valid=True), model4.mse(valid=True))

    elif problem == 1:
        assert isinstance(model2, type(model4))
        assert model2.auc(valid=True) == model4.auc(
            valid=True
        ), "Expected Model 2 AUC: {0} to be the same as Model 4 AUC: {1}".format(
            model2.auc(valid=True), model4.auc(valid=True))
        #assert model3.auc(valid=True)!=model4.auc(valid=True), "Expected Model 3 AUC: {0} to be different from Model 4 AUC: {1}".format(model3.auc(valid=True), model4.auc(valid=True))

        assert model2.logloss(valid=True) == model4.logloss(
            valid=True
        ), "Expected Model 2 Log Loss: {0} to be the same as Model 4 Log Loss: {1}".format(
            model2.logloss(valid=True), model4.logloss(valid=True))
        #assert model3.logloss(valid=True)!=model4.logloss(valid=True), "Expected Model 3 Log Loss: {0} to be different from Model 4 Log Loss: {1}".format(model2.logloss(valid=True), model4.logloss(valid=True))

        assert model2.giniCoef(valid=True) == model4.giniCoef(
            valid=True
        ), "Expected Model 2 Gini Coef {0} to be the same as Model 4 Gini Coef: {1}".format(
            model2.giniCoef(valid=True), model4.giniCoef(valid=True))
        #assert model3.giniCoef(valid=True)!=model4.giniCoef(valid=True), "Expected Model 3 Gini Coef: {0} to be different from Model 4 Gini Coef: {1}".format(model2.giniCoef(valid=True), model4.giniCoef(valid=True))

    else:
        assert isinstance(model2, type(model4))
        assert model2.mse(valid=True) == model4.mse(
            valid=True
        ), "Expected Model 2 MSE: {0} to be the same as Model 4 MSE: {1}".format(
            model2.mse(valid=True), model4.mse(valid=True))
        #assert model3.mse(valid=True)!=model4.mse(valid=True), "Expected Model 3 MSE: {0} to be different from Model 4 MSE: {1}".format(model3.mse(valid=True), model4.mse(valid=True))

        assert model2.r2(valid=True) == model4.r2(
            valid=True
        ), "Expected Model 2 R2: {0} to be the same as Model 4 R2: {1}".format(
            model2.r2(valid=True), model4.r2(valid=True))
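# The essence of the test above: with identical data, seed and hyperparameters,
# a GBM continued from a checkpoint to N total trees scores the same as a GBM
# trained to N trees in one shot. A minimal hedged sketch (assumes H2OFrames
# `train` and `valid` plus `predictors`/`response_col` as in the function above):
base = H2OGradientBoostingEstimator(ntrees=5, seed=42)
base.train(x=predictors, y=response_col, training_frame=train, validation_frame=valid)
cont = H2OGradientBoostingEstimator(ntrees=10, seed=42, checkpoint=base.model_id)
cont.train(x=predictors, y=response_col, training_frame=train, validation_frame=valid)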
Beispiel #19
0
def main_Calib(filename, output, mode, alg, basis, order, figure, verbose, offset, qt, pre, split):
    '''
    # main calibration program
    # input: filename: input event file, 'str'
    #        output: output file name as .h5, 'str'
    #        mode: observable to fit, 'PE', 'time' or 'combined'
    #        alg: regression backend, 'sms', 'custom', 'sk' or 'h2o'
    #        basis: angular basis, 'Legendre' or 'Zernike'
    #        order: cut-off order of the basis expansion
    # output: coefficients, std errors and AIC stored in the .h5 file
    '''
    if pre != 'r':
        print('begin reading file', flush=True)
        EventID, ChannelID, Q, PETime, photonTime, PulseTime, dETime, x, y, z = pub.ReadFile(filename)
        VertexTruth = (np.vstack((x, y, z))/1e3).T
        if(offset):
            off = pub.LoadBase(offset)
        else:
            off = np.zeros_like(PMTPos[:,0])
        print('total event: %d' % np.size(np.unique(EventID)), flush=True)
        print('begin processing legendre coeff', flush=True)
        # this part for the same vertex

        tmp = time.time()
        EventNo = np.size(np.unique(EventID))
        PMTNo = np.size(PMTPos[:,0])
        if mode == 'PE':
            PMTPosRep = np.tile(PMTPos, (EventNo,1))
            vertex = np.repeat(VertexTruth, PMTNo, axis=0)
        elif mode == 'time':
            counts = np.bincount(EventID)
            counts = counts[counts!=0]
            PMTPosRep = PMTPos[ChannelID]
            vertex = np.repeat(VertexTruth, counts, axis=0)
        elif mode == 'combined':
            PMTPosRep = np.tile(PMTPos, (EventNo,1))
            vertex = np.repeat(VertexTruth, PMTNo, axis=0)

        if basis == 'Legendre':
            X, cos_theta = pub.LegendreCoeff(PMTPosRep, vertex, order, Legendre=True)
        elif basis == 'Zernike':
            from zernike import RZern
            cos_theta = pub.LegendreCoeff(PMTPosRep, vertex, order, Legendre=False)
            cart = RZern(order)
            nk = cart.nk
            m = cart.mtab
            n = cart.ntab
            rho = np.linalg.norm(vertex, axis=1)/0.65
            theta = np.arccos(cos_theta)
            X = np.zeros((rho.shape[0], nk))

            for i in np.arange(nk):
                if not i % 5:
                    print(f'process {i}-th basis term')
                X[:,i] = cart.Zk(i, rho, theta)
            X = X[:,m>=0]
            print(f'rank: {np.linalg.matrix_rank(X)}')    
        print(f'use {time.time() - tmp} s')

        # which info should be used
        if mode == 'PE':
            y = Q
        elif mode == 'time':
            y = PulseTime 
        elif mode == 'combined':
            # PulseTime = PulseTime - np.min(PulseTime)
            # PulseTime = (PulseTime - np.max(PulseTime)/2)/np.max(PulseTime)*2
            # print(np.min(PulseTime), np.max(PulseTime))
            PulseTime = (PulseTime - np.max(PulseTime)/2)/np.max(PulseTime)*2
            bins = np.arange(-1, 0.05, 0.1)
            N = 10
            # Legendre coeff
            x = pub.legval(bins, np.eye(N).reshape(N, N, 1))
            # 1st basis
            Y = np.tile(x, len(np.unique(EventID))*len(np.unique(ChannelID))).T
            # 2nd basis
            X = np.repeat(X, bins.shape[0], axis=0)
            # output
            y = np.zeros((len(np.unique(EventID)), len(np.unique(ChannelID)), len(bins)))
            '''
            basis = np.zeros((X.shape[0], X.shape[1]*Y.shape[1]))
            for i_index, i in enumerate(np.arange(X.shape[1])):
                for j_index, j in enumerate(np.arange(Y.shape[1])):
                    total_index = i_index*Y.shape[1] + j_index
                    if not total_index % 10:
                        print(total_index)
                    basis[:, total_index] = X[:,i_index]*Y[:,j_index]
            X = basis
            '''
            split_index = np.unique(EventID).shape[0]
            for k_index, k in enumerate(np.unique(EventID)): # event begin with 1
                if k_index > split_index * split:
                    break
                if not k % 100:
                    print(k)
                index = EventID == k
                CID = ChannelID[index]
                Pulse_t = PulseTime[index]
                for i in np.unique(CID): # PMT begin with 0
                    y[k_index, i, 1:], _ = np.histogram(Pulse_t[CID==i], bins=bins)
            y = np.reshape(y,(-1))
        if verbose:
            print(f'the basis shape is {X.shape}, and the dependent variable shape is {y.shape}')
    if pre == 'w':
        if split != 1:
            split_index = int(split * y.shape[0])
            X = X[:split_index]
            Y = Y[:split_index]
            y = y[:split_index]
        import pandas as pd
        import pyarrow as pa
        import pyarrow.parquet as pq
        y = np.atleast_2d(y).T
        #data = np.hstack((X, y, np.ones_like(y)))
        df_X = pd.DataFrame(X)
        X_names = []
        for i in df_X.columns:
            X_names.append('X' + str(i))
        df_X.columns = X_names    
        
        df_Y = pd.DataFrame(Y)
        Y_names = []
        for i in df_Y.columns:
            Y_names.append('Y' + str(i))
        df_Y.columns = Y_names
        
        df_y = pd.DataFrame(y)
        df_y.columns = ['output']
        df = pd.concat([df_X, df_Y, df_y], axis=1)
        table = pa.Table.from_pandas(df)
        
        pq.write_table(table, 'test1.parquet')
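        # note: the hard-coded output name 'test1.parquet' differs from the
        # 'electron-1.parquet' expected by the pre == 'r' branch below; rename
        # or parametrize it if the two stages are meant to chain together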
        return

    if not pre:
        # Regression methods:
        if alg == 'sms':
            import statsmodels.api as sm
            if mode == 'PE':
                # statsmodels GLM has no fit_intercept argument; the intercept
                # is controlled by whether X contains a constant column
                model = sm.GLM(y, X, family=sm.families.Poisson())
                result = model.fit()
                if verbose:
                    print(result.summary())
                AIC = result.aic
                coef_ = result.params
                std = result.bse
                
            elif mode == 'time':
                import pandas as pd
                data = pd.DataFrame(data = np.hstack((X, np.atleast_2d(y).T)))                
                strs = 'y ~ '
                start = data.keys().start
                stop = data.keys().stop
                step = data.keys().step

                cname = []
                cname.append('X0')
                for i in np.arange(start+1, stop, step):
                    if i == start + 1:
                        strs += 'X%d ' % i
                    elif i == stop - step:
                        pass
                    else:
                        strs += ' + X%d ' % i                      

                    if i == stop - step:
                        cname.append('y')
                    else:
                        cname.append('X%d' % i)
                data.columns = cname

                mod = sm.formula.quantreg(strs, data[cname])

                result = mod.fit(q=qt,)
                coef_ = result.params
                AIC = np.zeros_like(coef_)
                std = np.zeros_like(coef_)           
                print('Warning! No AIC and std value')
            elif mode == 'combined':
                # data = pd.DataFrame(data = np.hstack((basis, np.atleast_2d(y).T)))  
                with h5py.File(output,'w') as out:        
                    out.create_dataset('X', data = X)
                    out.create_dataset('Y', data = y)
                print('begin...')
                model = sm.GLM(y, X, family=sm.families.Poisson())
                result = model.fit()
                if verbose:
                    print(result.summary())
                coef_ = result.params
                std = result.bse
                AIC = result.aic
            if verbose:
                print(result.summary())

        elif (alg == 'custom'):
            from scipy.optimize import minimize
            x0 = np.zeros_like(X[0]) # initial value (be careful of Zernike order)
            
            if mode == 'PE':
                x0[0] = 0.8 + np.log(2) # intercept is much more important
                result = minimize(pub.CalibPE, x0=x0, method='SLSQP', args = (y, PMTPos, X))
            elif mode == 'time':
                x0[0] = np.mean(y)
                qt = 0.1
                ts = 2.6
                result = minimize(pub.CalibTime, x0=x0, method='SLSQP', args = (np.hstack((EventID, EventID)), y, X, qt, ts))
            elif mode == 'combined':
                x0 = np.zeros_like(X[0])
                x0[0] = 0.8 + np.log(2) # intercept is much more important
                result = minimize(pub.CalibPE, x0=x0, method='SLSQP', args = (y, PMTPos, X))

            coef_ = np.array(result.x, dtype=float)
            if verbose:
                print(result.message)
            AIC = np.zeros_like(coef_)
            std = np.zeros_like(coef_)

            H = pub.MyHessian(result.x, pub.CalibPE, *(y, PMTPos, X))
            # H = pub.MyHessian(result.x, *(Q, PMTPos, X, pub.CalibTime))
            # std = 1/np.sqrt(-np.diag(np.linalg.pinv(H1)))
            print(coef_)
            # print(std)
            print('Warning! No AIC and std value; std is still being tested')

        elif alg == 'sk':
            from sklearn.linear_model import TweedieRegressor
            alpha = 0.001
            reg = TweedieRegressor(power=1, alpha=alpha, link='log', max_iter=1000, tol=1e-6, fit_intercept=False)
            reg.fit(X, y)

            # just for point data
            # pred = reg.predict(X[0:30,0:cut+1])

            print('coeff:\n', reg.coef_,'\n')

            coef_ = reg.coef_ 

            AIC = np.zeros_like(coef_)
            std = np.zeros_like(coef_)
            print('Warning! No AIC and std value')

        elif alg == 'h2o':
            import h2o
            from h2o.estimators.gbm import H2OGradientBoostingEstimator
            from h2o.estimators.glm import H2OGeneralizedLinearEstimator           
            if mode != 'combined':
                y = np.atleast_2d(y).T
                data = np.hstack((X, y, np.ones_like(y)))

                h2o.init()
                hf = h2o.H2OFrame(data)
                predictors = hf.columns[0:-2]
                response_col = hf.columns[-2]

                if mode == 'PE':
                    #offset_col = hf.columns[-1]
                    glm_model = H2OGeneralizedLinearEstimator(family= "poisson",
                        #offset_column = offset_col, 
                        lambda_ = 0,
                        compute_p_values = True)

                    glm_model.train(predictors, response_col, training_frame=hf)

                    coef_table = glm_model._model_json['output']['coefficients_table']
                    coef_ = glm_model.coef()

                elif mode == 'time':
                    gbm = H2OGradientBoostingEstimator(distribution="quantile", seed = 1234,
                                                      stopping_metric = "mse", stopping_tolerance = 1e-4)
                    gbm.train(x = predictors, y = response_col, training_frame = hf)
                    print(gbm)
                    exit()
            elif mode == 'combined':
                y = np.atleast_2d(y).T
                data = np.hstack((X, Y, y, np.ones_like(y)))

                h2o.init() 
                hf = h2o.H2OFrame(data)
                predictors = hf.columns[0:-2]
                response_col = hf.columns[-2]           

            if mode == 'PE':
                # coefficient table and AIC are only produced by the GLM branch
                coef_ = coef_table['coefficients']
                std = coef_table['std_error']
                AIC = glm_model.aic()
                if verbose:
                    print(coef_)
                    if basis == 'Zernike':
                        print(f'Regression coef shape is {np.array(coef_).shape}, Zernike shape is {nk}')

            h2o.cluster().shutdown()

    elif pre == 'r':
        import h2o
        from h2o.estimators.gbm import H2OGradientBoostingEstimator
        from h2o.estimators.glm import H2OGeneralizedLinearEstimator           
        h2o.init()
        hf = h2o.import_file("electron-1.parquet")
        pairs = []
        # pair each Zernike (X*) basis column with each Legendre (Y*) column,
        # skipping the constant terms
        for i in hf.columns:
            for j in hf.columns:
                if i.startswith('X') and j.startswith('Y'):
                    if (i != 'X0') and (j != 'Y0'):
                        pairs.append((i, j))
        predictors = hf.columns[2:]
        response_col = hf.columns[0]
        
        print(predictors)
        print(response_col)
        print(pairs)
        if mode == 'PE':
            #offset_col = hf.columns[-1]
            glm_model = H2OGeneralizedLinearEstimator(family= "poisson",
                #offset_column = offset_col, 
                lambda_ = 0,
                compute_p_values = True)

            glm_model.train(predictors, response_col, training_frame=hf)
        
        elif mode == 'combined':
            #offset_col = hf.columns[-1]
            glm_model = H2OGeneralizedLinearEstimator(family= "poisson",
                #offset_column = offset_col, 
                interaction_pairs=pairs,
                lambda_ = 0,
                #remove_collinear_columns = True, 
                compute_p_values = True)

            glm_model.train(predictors, response_col, training_frame=hf)
        coef_table = glm_model._model_json['output']['coefficients_table']
        coef_ = coef_table['coefficients']
        std = coef_table['std_error']
        AIC = glm_model.aic()
        print(f'Regression coef is {np.array(coef_)}')
        if (figure=='ON'):
            import matplotlib.pyplot as plt
            L, K = 500, 500
            ddx = np.linspace(-1.0, 1.0, K)
            ddy = np.linspace(-1.0, 1.0, L)
            xv, yv = np.meshgrid(ddx, ddy)
            cart.make_cart_grid(xv, yv)
            # normal scale
            # im = plt.imshow(np.exp(cart.eval_grid(np.array(coef_), matrix=True)), origin='lower', extent=(-1, 1, -1, 1))
            # log scale
            im = plt.imshow(cart.eval_grid(np.array(coef_), matrix=True), origin='lower', extent=(-1, 1, -1, 1))
            plt.colorbar()
            plt.savefig('test.png')
    else:
        print('error: unknown regression algorithm')
            
    with h5py.File(output,'w') as out:        
        out.create_dataset('coeff' + str(order), data = coef_)
        out.create_dataset('std' + str(order), data = std)
        out.create_dataset('AIC' + str(order), data = AIC)
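# The datasets written above can be read back symmetrically; a minimal hedged
# sketch (assuming `output` and `order` match the call that wrote them):
import h5py
with h5py.File(output, 'r') as f:
    coef_ = f['coeff' + str(order)][()]
    std = f['std' + str(order)][()]
    AIC = f['AIC' + str(order)][()]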
def multinomial_auc_prostate_gbm():
    data = h2o.import_file(pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    response_col = "GLEASON"
    data[response_col] = data[response_col].asfactor()
    
    predictors = ["RACE", "AGE", "PSA", "DPROS", "CAPSULE", "VOL", "DCAPS"]
    distribution = "multinomial"

    # train model
    gbm = H2OGradientBoostingEstimator(ntrees=1, max_depth=2, nfolds=3, distribution=distribution, auc_type="WEIGHTED_OVR")
    gbm.train(x=predictors, y=response_col, training_frame=data)

    gbm.show()

    # get result on training data from h2o
    cm = gbm.confusion_matrix(data)
    h2o_auc_table = gbm.multinomial_auc_table(train=True)
    h2o_aucpr_table = gbm.multinomial_aucpr_table(train=True)

    print(cm)
    print(h2o_auc_table.as_data_frame())
    print(h2o_aucpr_table.as_data_frame())

    h2o_ovr_macro_auc = h2o_auc_table[3][7]
    h2o_ovr_weighted_auc = h2o_auc_table[3][8]
    h2o_ovo_macro_auc = h2o_auc_table[3][30]
    h2o_ovo_weighted_auc = h2o_auc_table[3][31]

    h2o_ovr_weighted_aucpr = h2o_aucpr_table[3][8]

    h2o_default_auc = gbm.auc()
    h2o_default_aucpr = gbm.aucpr()

    print("default vs. table AUC "+str(h2o_ovr_weighted_auc)+" "+str(h2o_default_auc))
    print("default vs. table PR AUC "+str(h2o_ovr_weighted_aucpr)+" "+str(h2o_default_aucpr))

    # default should be ovr weighted 
    assert h2o_ovr_weighted_auc == h2o_default_auc, "default vs. table AUC "+str(h2o_ovr_weighted_auc)+" != "+str(h2o_default_auc)
    assert h2o_ovr_weighted_aucpr == h2o_default_aucpr, "default vs. table PR AUC "+str(h2o_ovr_weighted_aucpr)+" != "+str(h2o_default_aucpr)

    # transform data for sklearn
    prediction = gbm.predict(data).as_data_frame().iloc[:,1:]
    actual = data[response_col].as_data_frame().iloc[:, 0].tolist()

    # get result on training data from sklearn
    sklearn_ovr_macro_auc = roc_auc_score(actual, prediction, multi_class="ovr", average='macro')
    sklearn_ovr_weighted_auc = roc_auc_score(actual, prediction, multi_class="ovr", average='weighted')
    sklearn_ovo_macro_auc = roc_auc_score(actual, prediction, multi_class="ovo", average='macro')
    sklearn_ovo_weighted_auc = roc_auc_score(actual, prediction, multi_class="ovo", average='weighted')

    print("sklearn vs. h2o ovr macro:    "+str(sklearn_ovr_macro_auc)+" "+str(h2o_ovr_macro_auc))
    print("sklearn vs. h2o ovr weighted: "+str(sklearn_ovr_weighted_auc)+" "+str(h2o_ovr_weighted_auc))
    print("sklearn vs. h2o ovo macro:    "+str(sklearn_ovo_macro_auc)+" "+str(h2o_ovo_macro_auc))
    print("sklearn vs. h2o ovo weighted: "+str(sklearn_ovo_weighted_auc)+" "+str(h2o_ovo_weighted_auc))

    # compare results h2o vs sklearn
    precision = 1e-7
    assert abs(h2o_ovr_macro_auc - sklearn_ovr_macro_auc) < precision, "sklearn vs. h2o ovr macro: "+str(sklearn_ovr_macro_auc)+" != "+str(h2o_ovr_macro_auc)
    assert abs(h2o_ovr_weighted_auc - sklearn_ovr_weighted_auc) < precision, "sklearn vs. h2o ovr weighted: "+str(sklearn_ovr_weighted_auc)+" != "+str(h2o_ovr_weighted_auc)
    assert abs(h2o_ovo_macro_auc - sklearn_ovo_macro_auc) < precision, "sklearn vs. h2o ovo macro: "+str(sklearn_ovo_macro_auc)+" != "+str(h2o_ovo_macro_auc)
    assert abs(h2o_ovo_weighted_auc - sklearn_ovo_weighted_auc) < precision, "sklearn vs. h2o ovo weighted: "+str(sklearn_ovo_weighted_auc)+" != "+str(h2o_ovo_weighted_auc)

    # set auc_type 
    gbm = H2OGradientBoostingEstimator(ntrees=1, max_depth=2, nfolds=3, distribution=distribution, auc_type="MACRO_OVR")
    gbm.train(x=predictors, y=response_col, training_frame=data, validation_frame=data)

    h2o_auc_table = gbm.multinomial_auc_table(train=True)
    h2o_aucpr_table = gbm.multinomial_aucpr_table(train=True)

    h2o_ovr_macro_auc = h2o_auc_table[3][7]
    h2o_ovr_macro_aucpr = h2o_aucpr_table[3][7]

    h2o_default_auc = gbm.auc()
    h2o_default_aucpr = gbm.aucpr()

    print("default vs. table AUC "+str(h2o_ovr_macro_auc)+" "+str(h2o_default_auc))
    print("default vs. table PR AUC "+str(h2o_ovr_macro_aucpr)+" "+str(h2o_default_aucpr))
Beispiel #21
0
  def check_same(data1, data2, min_rows_scale):
    gbm1_regression = H2OGradientBoostingEstimator(min_rows=5,
                                                   ntrees=5,
                                                   max_depth=5)
    gbm1_regression.train(x=["displacement", "power", "weight", "acceleration", "year"],
                          y="economy",
                          training_frame=data1)

    gbm2_regression = H2OGradientBoostingEstimator(min_rows=5*min_rows_scale,
                                                   ntrees=5,
                                                   max_depth=5)
    gbm2_regression.train(x=["displacement", "power", "weight", "acceleration", "year", "weights"],
                          y="economy",
                          training_frame=data2,
                          weights_column="weights")

    gbm1_binomial = H2OGradientBoostingEstimator(min_rows=5,
                                                 distribution="bernoulli",
                                                 ntrees=5,
                                                 max_depth=5)
    gbm1_binomial.train(x=["displacement", "power", "weight", "acceleration", "year"],
                        y="economy_20mpg",
                        training_frame=data1)
    gbm2_binomial = H2OGradientBoostingEstimator(min_rows=5*min_rows_scale,
                                                 distribution="bernoulli",
                                                 ntrees=5,
                                                 max_depth=5)
    gbm2_binomial.train(x=["displacement", "power", "weight", "acceleration", "year", "weights"],
                        y="economy_20mpg",
                        training_frame=data2,
                        weights_column="weights")

    gbm1_multinomial = H2OGradientBoostingEstimator(min_rows=5,
                                                    distribution="multinomial",
                                                    ntrees=5,
                                                    max_depth=5)
    gbm1_multinomial.train(x=["displacement", "power", "weight", "acceleration", "year"],
                           y="cylinders",
                           training_frame=data1)

    gbm2_multinomial = H2OGradientBoostingEstimator(min_rows=5*min_rows_scale,
                                                    distribution="multinomial",
                                                    ntrees=5,
                                                    max_depth=5)
    gbm2_multinomial.train(x=["displacement", "power", "weight", "acceleration", "year", "weights"],
                           y="cylinders",
                           weights_column="weights", training_frame=data2)
    reg1_mse = gbm1_regression.mse()
    reg2_mse = gbm2_regression.mse()
    bin1_auc = gbm1_binomial.auc()
    bin2_auc = gbm2_binomial.auc()
    mul1_mse = gbm1_multinomial.mse()
    mul2_mse = gbm2_multinomial.mse()

    print("MSE (regresson)   no weights vs. weights: {0}, {1}".format(reg1_mse, reg2_mse))
    print("AUC (binomial)    no weights vs. weights: {0}, {1}".format(bin1_auc, bin2_auc))
    print("MSE (multinomial) no weights vs. weights: {0}, {1}".format(mul1_mse, mul2_mse))

    assert abs(reg1_mse - reg2_mse) < 1e-5 * reg1_mse, "Expected mse's to be the same, but got {0}, and {1}".format(reg1_mse, reg2_mse)
    assert abs(bin1_auc - bin2_auc) < 3e-4 * bin1_auc, "Expected auc's to be the same, but got {0}, and {1}".format(bin1_auc, bin2_auc)
    assert abs(mul1_mse - mul2_mse) < 1e-6 * mul1_mse, "Expected mse's to be the same, but got {0}, and {1}".format(mul1_mse, mul2_mse)
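  # check_same expects data2 to carry the same rows as data1 plus a "weights"
  # column; a hedged sketch of the simplest such construction, using a
  # hypothetical H2OFrame `cars` with the columns referenced above:
  ones = h2o.H2OFrame([[1.0]] * cars.nrow, column_names=["weights"])
  check_same(cars, cars.cbind(ones), min_rows_scale=1)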
Beispiel #22
0
rf_v1._model_json['output']['variable_importances'].as_data_frame()

# In[27]:

perf = rf_v1.model_performance(valid=True)
perf.plot()

# In[69]:

rf_v1.r2()

# ### Gradient Boost

# In[61]:

gbm1 = H2OGradientBoostingEstimator()
gbm1.train(train_X, train_y, training_frame=train, validation_frame=val)

# In[62]:

pred = gbm1.predict(val[:, 1:-1]).as_data_frame().to_numpy()[:, -2:].ravel()
true = pd.get_dummies(
    val[:, -1].as_data_frame().to_numpy().flatten()).values.ravel()
print("AUC Score calculated by sklearn")
roc_auc_score(true, pred)

# In[63]:

gbm1.confusion_matrix(valid=True)

# In[41]:
creditcard_df = h2o.import_file(os.path.realpath("input/creditcard.csv"))

# 60% for training
# 20% for validation (hyper parameter tuning)
# 20% for final testing

#split the data as described above
train, valid, test = creditcard_df.split_frame([0.6, 0.2], seed=1234)

#Prepare predictors and response columns
# last column is Class, our desired response variable
creditcard_X = creditcard_df.col_names[:-1]
creditcard_y = creditcard_df.col_names[-1]

gbm_v1 = H2OGradientBoostingEstimator(model_id="gbm_creditcard_v1",
                                      max_hit_ratio_k=3,
                                      seed=2000000)
gbm_v1.train(creditcard_X,
             creditcard_y,
             training_frame=train,
             validation_frame=valid)

gbm_v1.scoring_history()

gbm_v1.hit_ratio_table(valid=True)

# This default GBM is much worse than our original random forest.
#
#
# The GBM is far from converging, so there are three primary knobs to adjust to get our performance up if we want to keep a similar run time.
#
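# A hedged sketch of turning those knobs (values illustrative, not tuned):
# more trees guarded by early stopping, a lower learning rate, deeper trees.
gbm_v2 = H2OGradientBoostingEstimator(model_id="gbm_creditcard_v2",
                                      ntrees=500,
                                      learn_rate=0.05,
                                      max_depth=6,
                                      stopping_rounds=5,
                                      stopping_metric="AUC",
                                      score_tree_interval=10,
                                      seed=2000000)
gbm_v2.train(creditcard_X,
             creditcard_y,
             training_frame=train,
             validation_frame=valid)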
Beispiel #24
0
#gs = H2OGridSearch(H2OGradientBoostingEstimator(distribution="multinomial"), hyper_params=hyper_parameters)
#gs.train(x=list(range(0, iris_df.ncol-1)), y=iris_df.ncol-1, training_frame=iris_df, nfolds=10)

##
## Pipeline
##

from h2o.transforms.preprocessing import H2OScaler
from h2o.transforms.decomposition import H2OPCA
from sklearn.pipeline import Pipeline

h2o.no_progress()

pipeline = Pipeline([
    ("standardize", H2OScaler()), ("pca", H2OPCA(k=2)),
    ("gbm", H2OGradientBoostingEstimator(distribution="multinomial"))
])

print(pipeline.fit(iris_df[:4], iris_df[4]))

##
## Randomized Grid Search
##
from sklearn.model_selection import RandomizedSearchCV
from h2o.cross_validation import H2OKFold
from h2o.model.regression import h2o_r2_score
from sklearn.metrics import make_scorer

params = {
    "standardize__center": [True, False],
    "standardize__scale": [True, False],
Beispiel #25
0
                             ("uploader", H2OFrameCreator()),
                             ("classifier", classifier)])
    pipeline.fit(audit_X,
                 H2OFrame(audit_y.to_frame(), column_types=["categorical"]))
    pipeline.verify(audit_X.sample(frac=0.05, random_state=13))
    classifier = pipeline._final_estimator
    store_mojo(classifier, name)
    store_pkl(pipeline, name)
    adjusted = pipeline.predict(audit_X)
    adjusted.set_names(["h2o(Adjusted)", "probability(0)", "probability(1)"])
    store_csv(adjusted.as_data_frame(), name)


if "Audit" in datasets and with_h2o:
    build_audit_h2o(
        H2OGradientBoostingEstimator(distribution="bernoulli", ntrees=17),
        "H2OGradientBoostingAudit")
    build_audit_h2o(H2OGeneralizedLinearEstimator(family="binomial"),
                    "H2OLogisticRegressionAudit")
    build_audit_h2o(
        H2ORandomForestEstimator(distribution="bernoulli", seed=13),
        "H2ORandomForestAudit")

audit_dict_X = audit_X.to_dict("records")


def build_audit_dict(classifier, name, with_proba=True):
    pipeline = PMMLPipeline([("dict-transformer", DictVectorizer()),
                             ("classifier", classifier)])
    pipeline.fit(audit_dict_X, audit_y)
    store_pkl(pipeline, name)
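# Mirroring the H2O block above, the dict-based pipeline can be exercised the
# same way; a hedged sketch with a plain sklearn classifier (the import and
# parameters here are illustrative, not from the original script):
from sklearn.tree import DecisionTreeClassifier

if "Audit" in datasets:
    build_audit_dict(DecisionTreeClassifier(min_samples_leaf=5, random_state=13),
                     "DecisionTreeAudit")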
Beispiel #26
0
def stackedensemble_validation_frame_test():
    """This test checks the following:
    1) That passing in a validation_frame to h2o.stackedEnsemble does something (validation metrics exist).
    2) It should hopefully produce a better model (in the metalearning step).
    """

    # Import training set
    df = h2o.import_file(path=pyunit_utils.locate("smalldata/higgs/higgs_train_5k.csv"),
                         destination_frame="higgs_train_5k")
    test = h2o.import_file(path=pyunit_utils.locate("smalldata/higgs/higgs_test_5k.csv"),
                           destination_frame="higgs_test_5k")

    # Identify predictors and response
    x = df.columns
    y = "response"
    x.remove(y)

    # Convert response to a factor
    df[y] = df[y].asfactor()
    test[y] = test[y].asfactor()

    # Split off a validation_frame
    ss = df.split_frame(seed = 1)
    train = ss[0]
    valid = ss[1]

    # Set number of folds
    nfolds = 5

    # Train and cross-validate a GBM
    my_gbm = H2OGradientBoostingEstimator(distribution="bernoulli",
                                          ntrees=10,
                                          nfolds=nfolds,
                                          fold_assignment="Modulo",
                                          keep_cross_validation_predictions=True,
                                          seed=1)
    my_gbm.train(x=x, y=y, training_frame=train)

    # Train and cross-validate a RF
    my_rf = H2ORandomForestEstimator(ntrees=10,
                                     nfolds=nfolds,
                                     fold_assignment="Modulo",
                                     keep_cross_validation_predictions=True,
                                     seed=1)
    my_rf.train(x=x, y=y, training_frame=train)

    # Train a stacked ensemble & check that validation metrics are missing
    stack1 = H2OStackedEnsembleEstimator(base_models=[my_gbm.model_id, my_rf.model_id])
    stack1.train(x=x, y=y, training_frame=train)
    assert(stack1.model_performance(valid=True) is None)

    # Train a stacked ensemble with a validation_frame & check that validation metrics exist & are correct type
    stack2 = H2OStackedEnsembleEstimator(base_models=[my_gbm.model_id, my_rf.model_id])
    stack2.train(x=x, y=y, training_frame=train, validation_frame=valid)
    assert(type(stack2.model_performance(valid=True)) == h2o.model.metrics_base.H2OBinomialModelMetrics)
    assert(type(stack2.auc(valid=True)) == float)


    # Compare test AUC (ensemble with validation_frame should not be worse)
    perf1 = stack1.model_performance(test_data=test)
    perf2 = stack2.model_performance(test_data=test)
    assert perf2.auc() >= perf1.auc()
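    # Print the two test AUCs side by side so the comparison is explicit when
    # the test is run by hand (a small addition, not in the original test):
    print("Test AUC without validation_frame: %s" % perf1.auc())
    print("Test AUC with validation_frame:    %s" % perf2.auc())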
Beispiel #27
0
            #Here we retrain base models before calling stack ensemble
            print("stacked")
            for b_model in list(base):  # iterate over a copy, since we mutate `base`
                if 'GLM' in b_model:  # GLM raises an error when re-trained
                    base.remove(b_model)
                else:
                    m = h2o.get_model(b_model)
                    m.train(y=-1, training_frame=d)

            ensemble = H2OStackedEnsembleEstimator(base_models=base)
            ensemble.train(y=-1, training_frame=d)
            anytime_model = ensemble

        else:
            aml.leader.train(y=-1, training_frame=d)
            anytime_model = aml.leader

# In[27]:

from h2o.estimators.gbm import H2OGradientBoostingEstimator

m = h2o.get_model('GBM_grid__1_AutoML_20200518_140119_model_4')
print(m)

# In[28]:

m_new = H2OGradientBoostingEstimator(checkpoint=m.model_id)
m_new.train(y=-1, training_frame=d)
print(m_new)
def test_h2o_classifier_multi_2class(self):
    gbm = H2OGradientBoostingEstimator(ntrees=7, max_depth=5, distribution="multinomial")
    mojo_path, test_data = _train_classifier(gbm, 2, is_str=True)
    with self.assertRaises(ValueError) as err:
        _convert_mojo(mojo_path)
    self.assertRegex(err.exception.args[0], "not supported")
Beispiel #29
0
def mojo_predict_api_test(sandbox_dir):
    data = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))

    input_csv = "%s/in.csv" % sandbox_dir
    output_csv = "%s/prediction.csv" % sandbox_dir
    h2o.export_file(data[1, 2:], input_csv)

    data[1] = data[1].asfactor()
    model = H2OGradientBoostingEstimator(distribution="bernoulli")
    model.train(x=[2, 3, 4, 5, 6, 7, 8], y=1, training_frame=data)

    # download mojo
    model_zip_path = os.path.join(sandbox_dir, 'model.zip')
    genmodel_path = os.path.join(sandbox_dir, 'h2o-genmodel.jar')
    download_mojo(model, model_zip_path)
    assert os.path.isfile(model_zip_path)
    assert os.path.isfile(genmodel_path)

    # test that we can predict using default paths
    h2o.mojo_predict_csv(input_csv_path=input_csv,
                         mojo_zip_path=model_zip_path,
                         verbose=True)
    h2o.mojo_predict_csv(input_csv_path=input_csv,
                         mojo_zip_path=model_zip_path,
                         genmodel_jar_path=genmodel_path,
                         verbose=True)
    assert os.path.isfile(output_csv)
    os.remove(model_zip_path)
    os.remove(genmodel_path)
    os.remove(output_csv)

    # test that we can predict using custom genmodel path
    other_sandbox_dir = tempfile.mkdtemp()
    try:
        genmodel_path = os.path.join(other_sandbox_dir,
                                     'h2o-genmodel-custom.jar')
        download_mojo(model, model_zip_path, genmodel_path)
        assert os.path.isfile(model_zip_path)
        assert os.path.isfile(genmodel_path)
        try:
            h2o.mojo_predict_csv(input_csv_path=input_csv,
                                 mojo_zip_path=model_zip_path,
                                 verbose=True)
            assert False, "There should be no h2o-genmodel.jar at %s" % sandbox_dir
        except RuntimeError:
            pass
        assert not os.path.isfile(output_csv)
        h2o.mojo_predict_csv(input_csv_path=input_csv,
                             mojo_zip_path=model_zip_path,
                             genmodel_jar_path=genmodel_path,
                             verbose=True)
        assert os.path.isfile(output_csv)
        os.remove(output_csv)

        output_csv = "%s/out.prediction" % other_sandbox_dir

        # test that we can predict using default paths
        h2o.mojo_predict_csv(input_csv_path=input_csv,
                             mojo_zip_path=model_zip_path,
                             genmodel_jar_path=genmodel_path,
                             verbose=True,
                             output_csv_path=output_csv)
        assert os.path.isfile(output_csv)
        os.remove(model_zip_path)
        os.remove(genmodel_path)
        os.remove(output_csv)
    finally:
        shutil.rmtree(other_sandbox_dir)
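# h2o.mojo_predict_csv also returns the predictions it writes, so the output
# file need not be re-read; a hedged sketch reusing the paths from the test
# above (assumes the MOJO zip and genmodel jar still exist on disk):
rows = h2o.mojo_predict_csv(input_csv_path=input_csv,
                            mojo_zip_path=model_zip_path,
                            genmodel_jar_path=genmodel_path)
print(rows[0])  # prediction record for the first input row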
def partial_plot_test_with_user_splits():
    data = h2o.import_file(
        pyunit_utils.locate('smalldata/prostate/prostate_cat_NA.csv'))
    x = data.names
    y = 'CAPSULE'
    x.remove(y)

    # Build a GBM model predicting for response CAPSULE
    gbm_model = H2OGradientBoostingEstimator(ntrees=50,
                                             learn_rate=0.05,
                                             seed=12345)
    gbm_model.train(x=x, y=y, training_frame=data)

    file, filename = tempfile.mkstemp(suffix=".png")
    user_splits = dict()
    user_splits['AGE'] = [
        43.0, 44.89473684210526, 46.78947368421053, 48.68421052631579,
        50.578947368421055, 52.473684210526315, 54.368421052631575,
        56.26315789473684, 58.1578947368421, 60.05263157894737,
        61.94736842105263, 63.84210526315789, 65.73684210526315,
        67.63157894736842, 69.52631578947368, 71.42105263157895,
        73.3157894736842, 75.21052631578948, 77.10526315789474
    ]
    user_splits['RACE'] = ["Black", "White"]
    pdpUserSplit2D = gbm_model.partial_plot(data=data,
                                            server=True,
                                            plot=True,
                                            user_splits=user_splits,
                                            col_pairs_2dpdp=[['AGE', 'PSA'],
                                                             ['AGE', 'RACE']],
                                            save_to_file=filename)
    pdpUserSplit1D2D = gbm_model.partial_plot(data=data,
                                              cols=['AGE', 'RACE', 'DCAPS'],
                                              server=True,
                                              plot=True,
                                              user_splits=user_splits,
                                              col_pairs_2dpdp=[['AGE', 'PSA'],
                                                               ['AGE',
                                                                'RACE']],
                                              save_to_file=filename)
    pdpUserSplit1D = gbm_model.partial_plot(data=data,
                                            cols=['AGE', 'RACE', 'DCAPS'],
                                            server=True,
                                            plot=True,
                                            user_splits=user_splits,
                                            save_to_file=filename)
    if os.path.isfile(filename):
        os.remove(filename)
    # compare results 1D pdp
    for i in range(3):
        pyunit_utils.assert_H2OTwoDimTable_equal_upto(
            pdpUserSplit1D[i],
            pdpUserSplit1D2D[i],
            pdpUserSplit1D[i].col_header,
            tolerance=1e-10)
    # compare results 2D pdp
    pyunit_utils.assert_H2OTwoDimTable_equal_upto(pdpUserSplit2D[0],
                                                  pdpUserSplit1D2D[3],
                                                  pdpUserSplit2D[0].col_header,
                                                  tolerance=1e-10)
    pyunit_utils.assert_H2OTwoDimTable_equal_upto(pdpUserSplit2D[1],
                                                  pdpUserSplit1D2D[4],
                                                  pdpUserSplit2D[1].col_header,
                                                  tolerance=1e-10)
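    # The returned elements are H2OTwoDimTables; converting one to pandas makes
    # it easy to eyeball that the user-supplied AGE grid was honored (a hedged
    # addition; the exact column layout may vary across H2O versions):
    print(pdpUserSplit1D[0].as_data_frame().head())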