def infer_uses_defaults_when_base_model_doesnt_support_distributions_test():
    """Check the distribution picked for the GBM metalearner of a stacked ensemble.

    A tweedie GLM, a tweedie GBM and a DRF (trained without any distribution
    argument) are fitted on the iris training frame with ``petal_wid`` as the
    response. Three ensembles are then built:

    * GLM + GBM             -> metalearner distribution must be "tweedie"
    * GLM + GBM + DRF       -> must be "gaussian"
    * DRF + GLM + GBM       -> must be "gaussian" (base-model order is irrelevant)
    """
    train = h2o.import_file(
        pyunit_utils.locate("smalldata/iris/iris_train.csv"))
    test = h2o.import_file(pyunit_utils.locate("smalldata/iris/iris_test.csv"))
    y_reg = "petal_wid"
    x_reg = train.columns
    x_reg.remove(y_reg)

    # Cross-validation settings shared by every base model.
    cv_settings = dict(nfolds=2,
                       fold_assignment="Modulo",
                       keep_cross_validation_predictions=True)

    glm_reg = H2OGeneralizedLinearEstimator(family="tweedie", **cv_settings)
    glm_reg.train(x=x_reg, y=y_reg, training_frame=train)

    gbm_reg = H2OGradientBoostingEstimator(distribution="tweedie", **cv_settings)
    gbm_reg.train(x=x_reg, y=y_reg, training_frame=train)

    drf_reg = H2ORandomForestEstimator(**cv_settings)
    drf_reg.train(x=x_reg, y=y_reg, training_frame=train)

    def metalearner_distribution(base_models):
        # Train a GBM-metalearner ensemble and report the distribution it used.
        ensemble = H2OStackedEnsembleEstimator(training_frame=train,
                                               validation_frame=test,
                                               base_models=base_models,
                                               metalearner_algorithm="gbm")
        ensemble.train(x_reg, y_reg, train)
        return ensemble.metalearner().actual_params.get("distribution")

    scenarios = [
        ([glm_reg, gbm_reg], "tweedie"),
        ([glm_reg, gbm_reg, drf_reg], "gaussian"),
        ([drf_reg, glm_reg, gbm_reg], "gaussian"),
    ]
    for base_models, expected in scenarios:
        actual = metalearner_distribution(base_models)
        assert actual == expected, \
            "Expected distribution {} but got {}".format(expected, actual)
def plot_test():
    """Exercise ROC and precision-recall plotting of binomial GBM metrics.

    Trains a bernoulli GBM on the airlines data, plots both curve types for
    the training, validation and test metrics, saves each curve to a file
    (removing the file afterwards) and finally checks that ``plot=False``
    returns two sequences of equal length for each curve type.
    """
    airlines = h2o.import_file(
        pyunit_utils.locate("smalldata/airlines/AirlinesTrain.csv.zip"))

    # Random 80/20 train/validation split driven by a uniform column.
    draw = airlines[0].runif()
    air_train = airlines[draw <= 0.8]
    air_valid = airlines[draw > 0.8]

    predictors = [
        "Origin", "Dest", "Distance", "UniqueCarrier", "fMonth", "fDayofMonth",
        "fDayOfWeek"
    ]
    response = "IsDepDelayed"

    air_gbm = H2OGradientBoostingEstimator(distribution="bernoulli",
                                           ntrees=100,
                                           max_depth=3,
                                           learn_rate=0.01)
    air_gbm.train(x=predictors,
                  y=response,
                  training_frame=air_train,
                  validation_frame=air_valid)

    curve_types = ("roc", "pr")

    # Curves on the training metrics
    perf_train = air_gbm.model_performance(train=True)
    for curve in curve_types:
        perf_train.plot(type=curve, server=True)

    # Curves on the validation metrics
    perf_valid = air_gbm.model_performance(valid=True)
    for curve in curve_types:
        perf_valid.plot(type=curve, server=True)

    # Curves on a separately imported test set
    air_test = h2o.import_file(
        pyunit_utils.locate("smalldata/airlines/AirlinesTest.csv.zip"))
    perf_test = air_gbm.model_performance(air_test)
    for curve in curve_types:
        perf_test.plot(type=curve, server=True)

    # Saving to a file: clean up whatever was written after each curve
    fn = "curve_plot.png"
    for curve in curve_types:
        perf_test.plot(type=curve, server=False, save_to_file=fn)
        if os.path.isfile(fn):
            os.remove(fn)

    # With plot=False the raw curve coordinates are returned instead
    fprs, tprs = perf_test.plot(type="roc", server=True, plot=False)
    assert len(fprs) == len(tprs), \
        "Expected fprs and tprs to have the same shape but they are not."
    recalls, precisions = perf_test.plot(type="pr", server=True, plot=False)
    assert len(recalls) == len(precisions), \
        "Expected recall and precision to have the same shape but they are not."
def GBM_impute(data, columnsArray, rm_cols):
    """Impute missing values of ``data`` with H2O GBM models.

    Rows of ``data`` without any missing value are used to train (80%) and
    validate (10%) a GBM for every column that has missing values; the model
    then predicts the missing cells of the incomplete rows.

    Args:
        data: pandas DataFrame to impute.
        columnsArray: pandas DataFrame of column metadata with at least the
            columns ``'columnDisplayName'`` and ``'columnName'``.
            NOTE: rows whose display name is not a column of ``data`` are
            dropped from this frame *in place* and its index is reset.
        rm_cols: container of column names whose metadata rows are excluded
            before the metadata is handed to ``columns_data_type``.

    Returns:
        ``(df, acc)`` where ``df`` is the complete and imputed rows
        concatenated and sorted by index, and ``acc`` is the mean R^2 of all
        models that were trained (NaN when no model was trained).
    """
    # Remove metadata rows that have no matching column in `data`.
    # NOTE(review): the filter reads 'columnDisplayName' but the drop matches
    # on 'columnName', and the regex below is not a character class -- both
    # are kept as in the original; confirm they are intended.
    data_columns = set(data.columns)
    c = [name
         for name in columnsArray['columnDisplayName'].replace(r'!@#$%^&*.', " ", regex=True)
         if name not in data_columns]
    for j in c:
        a = columnsArray.index[columnsArray['columnName'].replace(r'!@#$%^&*.', " ", regex=True) == j].values
        columnsArray.drop(columnsArray.index[a], inplace=True)
        columnsArray.index = range(columnsArray.shape[0])

    # Index label of the first metadata row for every column named in rm_cols.
    removed = set()
    for name in columnsArray['columnName']:
        if name in rm_cols:
            removed.add(columnsArray[columnsArray['columnName'] == name].index[0])
    # Select by label (.loc) in index order; the original fed labels to .iloc
    # in set order, which is only right for a default RangeIndex.
    kept = [idx for idx in columnsArray.index if idx not in removed]
    columnsArray_edit = columnsArray.loc[kept]

    # Complete observations feed the model training.
    data_clean = data.dropna()
    hf = h2o.H2OFrame(data_clean)
    train, valid, _ = hf.split_frame(ratios=[.8, .1])

    # Incomplete observations, selected by label and copied so that the
    # assignments below write into an independent frame rather than a view.
    data_na = data.loc[~data.index.isin(data_clean.index)].copy()

    model_accuracy = []
    print("Number of missing values : " + str(len(data_na)))
    gbm = H2OGradientBoostingEstimator()
    for i in range(len(data_na)):
        row = data_na.iloc[i]
        y_set = set(row.index[row.isna().values])
        if not y_set:
            # Everything in this row was already filled by an earlier model.
            continue

        # Keep DataFrame column order so predictors/targets are deterministic.
        xValues = [col for col in data_na.columns if col not in y_set]
        targets = [col for col in data_na.columns if col in y_set]
        for yValue in targets:
            gbm.train(xValues, yValue, training_frame=train, validation_frame=valid)
            model_accuracy.append(gbm.r2())
            print(yValue)

            test_na, columnsArray_edit = columns_data_type(data_na, columnsArray_edit)
            test_na = test_na.drop(yValue, axis=1)
            predicted = gbm.predict(h2o.H2OFrame(test_na)).as_data_frame()

            # Fill only the cells that are still missing; a single .loc
            # assignment avoids the chained-assignment pitfall of
            # `data_na[yValue].iloc[j] = ...`.
            missing = data_na[yValue].isna().values
            data_na.loc[missing, yValue] = predicted['predict'].values[missing]

    # np.mean([]) would emit a RuntimeWarning; return NaN explicitly instead.
    acc = np.mean(model_accuracy) if model_accuracy else float("nan")
    df = pd.concat([data_clean, data_na], axis=0)
    df.sort_index(axis=0, inplace=True)
    return df, acc
 def test_h2o_classifier_multi_int(self):
     """Convert a GBM classifier trained via ``_train_classifier(gbm, 9, is_str=False)``.

     The MOJO produced by the helper must convert to a non-None ONNX model,
     which is then dumped together with the test data.
     """
     estimator = H2OGradientBoostingEstimator(ntrees=9, max_depth=5)
     mojo_path, test_data = _train_classifier(estimator, 9, is_str=False)
     onnx_model = _convert_mojo(mojo_path)
     self.assertIsNotNone(onnx_model)
     wrapped_mojo = H2OMojoWrapper(mojo_path)
     dump_data_and_model(test_data,
                         wrapped_mojo,
                         onnx_model,
                         basename="H2OClassMultiBin")
Beispiel #5
0
def frameslice_gbm():
    """Train a default GBM on a column slice of the prostate frame.

    Columns 1..8 of the imported frame are kept; within the slice, column 0
    is the response and columns 1..7 are the predictors.
    """
    prostate = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    prostate = prostate[1:9]

    from h2o.estimators.gbm import H2OGradientBoostingEstimator
    model = H2OGradientBoostingEstimator()
    # Pass a concrete list rather than a lazy range object, as the other
    # tests in this file do for index-based predictor lists.
    model.train(x=list(range(1, 8)), y=0, training_frame=prostate)
 def test_h2o_classifier_multi_2class(self):
     """Converting a multinomial GBM from ``_train_classifier(gbm, 2, is_str=True)`` must fail.

     ``_convert_mojo`` is expected to raise ``ValueError`` whose first
     argument matches "not supported".
     """
     gbm = H2OGradientBoostingEstimator(ntrees=7,
                                        max_depth=5,
                                        distribution="multinomial")
     mojo_path, test_data = _train_classifier(gbm, 2, is_str=True)
     with self.assertRaises(ValueError) as err:
         _convert_mojo(mojo_path)
     # assertRegexpMatches is a deprecated alias that was removed in
     # Python 3.12; assertRegex is the supported spelling.
     self.assertRegex(err.exception.args[0], "not supported")
Beispiel #7
0
def checkpoint_new_category_in_response():
    """Continuing a checkpointed GBM with an expanded response domain must fail.

    A first model is trained on the setosa/versicolor subset; a second model
    that checkpoints from it is then trained on the full iris data. The
    second training is expected to raise ``EnvironmentError``.
    """
    sv = h2o.upload_file(
        pyunit_utils.locate("smalldata/iris/setosa_versicolor.csv"))
    iris = h2o.upload_file(pyunit_utils.locate("smalldata/iris/iris.csv"))

    base_model = H2OGradientBoostingEstimator(ntrees=100)
    base_model.train(x=[0, 1, 2, 3], y=4, training_frame=sv)

    # Resuming from the checkpoint on data whose categorical response has
    # additional levels is the failure being provoked.
    try:
        continued = H2OGradientBoostingEstimator(ntrees=200,
                                                 checkpoint=base_model.model_id)
        continued.train(x=[0, 1, 2, 3], y=4, training_frame=iris)
    except EnvironmentError:
        return
    assert False, "Expected continued model-building to fail with new categories introduced in response"
Beispiel #8
0
def iris_gbm_grid():
  """Grid-search GBM over learn_rate x ntrees on iris and verify the grid.

  Checks that every grid model is a GBM estimator, that the grid covers the
  whole hyper-parameter space exactly once, and that the back-end r2 sort
  (descending and ascending) agrees with the locally sorted table.
  """
  train = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))

  # Hyper-parameter space
  ntrees_opts = [1,3]
  learn_rate_opts = [0.1,0.01,.05]
  hyper_parameters = OrderedDict([("learn_rate", learn_rate_opts),
                                  ("ntrees", ntrees_opts)])
  size_of_hyper_space = len(ntrees_opts) * len(learn_rate_opts)
  print("GBM grid with the following hyper_parameters:", hyper_parameters)

  gs = H2OGridSearch(H2OGradientBoostingEstimator, hyper_params=hyper_parameters)
  gs.train(x=list(range(4)), y=4, training_frame=train)
  print("\nsorted by mse: ")
  print(gs.sort_by("mse"))

  for model in gs:
    assert isinstance(model, H2OGradientBoostingEstimator)

  assert len(gs) == size_of_hyper_space

  # Each combination must be used by exactly one model: tick them off.
  total_grid_space = [list(values) for values in itertools.product(*hyper_parameters.values())]
  print(total_grid_space)
  for model in gs.models:
    combo = [model.parms[name]['actual_value'] for name in ('learn_rate', 'ntrees')]
    assert combo in total_grid_space, "combo: " + str(combo) + "; total_grid_space=" + str(total_grid_space)
    total_grid_space.remove(combo)

  # Back-end sorting of model metrics versus the local sort
  locally_sorted = gs.sort_by("r2", H2OGridSearch.DESC)
  remotely_sorted_desc = H2OGridSearch.get_grid(H2OGradientBoostingEstimator(distribution='multinomial'), hyper_parameters, gs.grid_id, sort_by='r2', sort_order='desc')

  assert len(locally_sorted.cell_values) == len(remotely_sorted_desc.model_ids), "Expected locally sorted and remotely sorted grids to have the same number of models"
  local_ids = [row[0] for row in locally_sorted.cell_values]
  for i, (local_id, remote_id) in enumerate(zip(local_ids, remotely_sorted_desc.model_ids)):
    assert local_id == remote_id, "Expected back-end sort by r2 to be the same as locally-sorted: " + str(i)

  remotely_sorted_asc = H2OGridSearch.get_grid(H2OGradientBoostingEstimator(distribution='multinomial'), hyper_parameters, gs.grid_id, sort_by='r2', sort_order='asc')
  for model in remotely_sorted_asc:
    assert isinstance(model, H2OGradientBoostingEstimator)

  assert len(locally_sorted.cell_values) == len(remotely_sorted_asc.model_ids), "Expected locally sorted and remotely sorted grids to have the same number of models"
  # Ascending remote order must be the mirror image of the local descending order.
  for i, (local_id, remote_id) in enumerate(zip(local_ids, reversed(remotely_sorted_asc.model_ids))):
    assert local_id == remote_id, "Expected back-end sort by r2, ascending, to be the reverse as locally-sorted ascending: " + str(i)
 def test_h2o_classifier_bin_int(self):
     """Convert a GBM classifier trained via ``_train_classifier(gbm, 2, is_str=False, force_y_numeric=True)``.

     The resulting MOJO must convert to a non-None ONNX model, which is then
     dumped together with the test data.
     """
     estimator = H2OGradientBoostingEstimator(ntrees=8, max_depth=5)
     mojo_path, test_data = _train_classifier(
         estimator, 2, is_str=False, force_y_numeric=True)
     onnx_model = _convert_mojo(mojo_path)
     self.assertIsNotNone(onnx_model)
     wrapped_mojo = H2OMojoWrapper(mojo_path)
     dump_data_and_model(test_data,
                         wrapped_mojo,
                         onnx_model,
                         basename="H2OClassBinInt")
Beispiel #10
0
def constant_col_gbm():
    """Train a GBM on a constant response with the constant-response check disabled."""
    iris_path = pyunit_utils.locate("smalldata/iris/iris_wheader.csv")
    train = h2o.import_file(path=iris_path)
    train["constantCol"] = 1

    # With check_constant_response=False the training is expected to run
    # through even though the response never varies.
    predictors = list(range(1, 5))
    my_gbm = H2OGradientBoostingEstimator(check_constant_response=False)
    my_gbm.train(x=predictors, y="constantCol", training_frame=train)
def test_reset_threshold():
    """
    Verify that a model's default threshold can be reset.

    After the reset the stored threshold must equal the new value, the helper
    must hand back the previous one, and a row's predicted label may differ
    only when its value in prediction column 2 lies in
    [old threshold, new threshold).
    """
    airlines = h2o.import_file(
        path=pyunit_utils.locate("smalldata/airlines/modified_airlines.csv"))

    # These columns are treated as categorical.
    for column in ("Year", "Month", "DayOfWeek", "Cancelled", "FlightNum"):
        airlines[column] = airlines[column].asfactor()

    response = "IsDepDelayed"
    predictors = [
        "Origin", "Dest", "Year", "UniqueCarrier", "DayOfWeek", "Month",
        "Distance", "FlightNum"
    ]

    train, _valid = airlines.split_frame(ratios=[.8], seed=1234)

    model = H2OGradientBoostingEstimator(seed=1234, ntrees=5)
    model.train(x=predictors, y=response, training_frame=train)
    old_threshold = model._model_json['output']['default_threshold']

    # Predictions before the reset
    preds = model.predict(airlines)

    # Reset, then re-fetch the model so the stored threshold is read back
    new_threshold = 0.6917189903082518
    old_returned = reset_model_threshold(model, new_threshold)
    reset_model = h2o.get_model(model.model_id)
    reset_threshold = reset_model._model_json['output']['default_threshold']

    # Predictions after the reset
    preds_reset = reset_model.predict(airlines)

    assert old_threshold == old_returned
    assert new_threshold == reset_threshold
    assert reset_threshold != old_threshold

    print("old threshold:", old_threshold, "new_threshold:", new_threshold)
    for row in range(airlines.nrow):
        if preds[row, 2] >= old_threshold and preds[row, 2] < new_threshold:
            # Between the two thresholds: the label has to flip.
            assert preds[row, 0] != preds_reset[row, 0]
            continue
        assert preds[row, 0] == preds_reset[row, 0]
def titanic_with_te_kfoldstrategy(frame=None, seeds=None):
    """Return the mean test AUC of a GBM trained on k-fold target-encoded data.

    For every seed the frame is split with ``split_data``, a 5-fold column is
    added to the training part, ``home.dest``, ``cabin`` and ``embarked`` are
    target-encoded (k-fold holdout for train, no holdout for valid/test), a
    GBM is trained with AUC-based early stopping and its AUC on the encoded
    test part is recorded.

    Args:
        frame: frame handed to ``split_data``.
        seeds: non-empty sequence of seeds; one model is trained per seed.

    Returns:
        The average of the per-seed test AUC values.

    Raises:
        ValueError: if ``seeds`` is None or empty (the original failed later
            with an unrelated TypeError / ZeroDivisionError).
    """
    if seeds is None or len(seeds) == 0:
        raise ValueError("seeds must be a non-empty sequence of seeds")

    sum_of_aucs = 0
    for current_seed in seeds:
        ds = split_data(frame, current_seed)

        targetColumnName = "survived"

        foldColumnName = "kfold_column"
        ds['train'][foldColumnName] = ds['train'].kfold_column(
            n_folds=5, seed=current_seed)

        teColumns = ["home.dest", "cabin", "embarked"]
        targetEncoder = TargetEncoder(x=teColumns,
                                      y=targetColumnName,
                                      fold_column=foldColumnName,
                                      blending_avg=True,
                                      inflection_point=3,
                                      smoothing=1)
        targetEncoder.fit(frame=ds['train'])

        encodedTrain = targetEncoder.transform(frame=ds['train'],
                                               holdout_type="kfold",
                                               seed=1234)
        encodedValid = targetEncoder.transform(frame=ds['valid'],
                                               holdout_type="none",
                                               noise=0.0)
        encodedTest = targetEncoder.transform(frame=ds['test'],
                                              holdout_type="none",
                                              noise=0.0)

        myX = [
            "pclass", "sex", "age", "sibsp", "parch", "fare", "cabin_te",
            "embarked_te", "home.dest_te"
        ]
        air_model = H2OGradientBoostingEstimator(
            ntrees=1000,
            learn_rate=0.1,
            score_tree_interval=10,
            stopping_rounds=5,
            stopping_metric="AUC",
            stopping_tolerance=0.001,
            distribution="multinomial",
            # why AUC is different for quasibinomial and multinomial?
            seed=1234)
        air_model.train(x=myX,
                        y=targetColumnName,
                        training_frame=encodedTrain,
                        validation_frame=encodedValid)

        # (The original also materialised the variable-importance table into
        # a local that was never read; that dead work has been removed.)
        my_gbm_metrics = air_model.model_performance(encodedTest)
        auc = my_gbm_metrics.auc()
        sum_of_aucs += auc
        print("AUC with kfold for seed: " + str(current_seed) + " = " +
              str(auc))
    return sum_of_aucs / len(seeds)
Beispiel #13
0
def _run_builder(key, algorithm, training_dataset_key, y, x, model_type):
    """Train an H2O model, package it, and publish the outcome under ``key``.

    The training data is fetched with ``get_dataset_contents``, written to a
    temporary file and imported into H2O. The trained model is downloaded as
    POJO or MOJO (with ``h2o-genmodel.jar``) into the ``tmp`` folder at the
    filesystem root and zipped there under the name ``key``. The result
    (status, description, details, path) is stored as JSON via the client
    returned by ``_get_memcached_client``.

    Args:
        key: cache key for the status record; also the zip file name.
        algorithm: an ``MLAlgorithm`` member (NAIVE_BAYES or
            GRADIENT_BOOSTING_MACHINE).
        training_dataset_key: identifier passed to ``get_dataset_contents``.
        y: response column.
        x: predictor columns, or None to let H2O choose.
        model_type: 'POJO' (case-insensitive) for a POJO download; anything
            else downloads a MOJO.

    Failures while building are not raised: they are logged and recorded as
    a 'FAILED' status. A failure to obtain the cache client propagates.
    """
    # Acquired outside the try block: the failure handler needs the client,
    # and in the original a failure here turned into an UnboundLocalError
    # inside the handler that masked the real error.
    client = _get_memcached_client()
    try:
        contents = get_dataset_contents(training_dataset_key)
        with tempfile.TemporaryDirectory() as tmpdir:
            dataset_path = os.path.join(tmpdir, TRAINING_FILE)
            with open(dataset_path, 'w') as training_file:
                training_file.write(contents)
            h2o.init()
            training_frame = h2o.import_file(dataset_path)

            if algorithm == MLAlgorithm.NAIVE_BAYES:
                # naivebayes expects the prediction response to be categorical
                training_frame[y] = training_frame[y].asfactor()
                estimator = H2ONaiveBayesEstimator()
            elif algorithm == MLAlgorithm.GRADIENT_BOOSTING_MACHINE:
                estimator = H2OGradientBoostingEstimator()
            else:
                # Report an explicit reason instead of the UnboundLocalError
                # the original produced for an unknown algorithm.
                raise ValueError(
                    "Unsupported algorithm: {}".format(algorithm))

            kwargs = {'training_frame': training_frame, 'y': y}
            if x is not None:
                kwargs['x'] = x

            estimator.train(**kwargs)

            temp_folder = os.path.join(os.path.abspath(os.sep), 'tmp')
            if model_type.upper() == 'POJO':
                model_file = estimator.download_pojo(
                    path=temp_folder,
                    get_genmodel_jar=True,
                    genmodel_name='h2o-genmodel.jar')
            else:
                model_file = estimator.download_mojo(
                    path=temp_folder,
                    get_genmodel_jar=True,
                    genmodel_name='h2o-genmodel.jar')

            model_performance = estimator.model_performance()
            details = {'mse': model_performance.mse()}

            # Bundle the model and the genmodel jar into one archive.
            archive_path = os.path.join(temp_folder, key)
            with zipfile.ZipFile(archive_path, 'w') as archive:
                archive.write(model_file, os.path.basename(model_file))
                archive.write(os.path.join(temp_folder, 'h2o-genmodel.jar'),
                              'h2o-genmodel.jar')

            client.set(
                key,
                json.dumps({
                    'status': 'COMPLETE',
                    'description': 'Model has been built',
                    'details': details,
                    'path': model_file
                }))
    except Exception as ex:
        # Log first so the traceback is kept even if the cache write fails.
        logger.exception("Building model failed")
        client.set(key, json.dumps({
            'status': 'FAILED',
            'description': str(ex)
        }))
Beispiel #14
0
def test_gbm_bulk_train():
    """Bulk-train one GBM per RACE segment and compare with individual models.

    The segment with value 0 is expected to fail with a "dataset size is too
    small" error; every other segment model must match a GBM trained with
    the same parameters on just that segment's rows.
    """
    prostate = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    prostate["CAPSULE"] = prostate["CAPSULE"].asfactor()

    segment_col = "RACE"  # one model is built per value of this column
    bad_segment = 0       # too few rows, so no model is produced for it

    segments = prostate[segment_col].unique()
    segments.rename({'C1': segment_col})

    params = dict(min_rows=2, ntrees=4, seed=42)
    bulk_gbm = H2OGradientBoostingEstimator(**params)
    models = bulk_gbm.bulk_train(y="CAPSULE",
                                 ignored_columns=["ID"],
                                 training_frame=prostate,
                                 segments=segments)
    models_list = models.as_frame()

    assert models_list.names == [
        u'RACE', u'Status', u'Model', u'Errors', u'Warnings'
    ]
    assert models_list.nrow == 3

    # The failed segment must carry the expected error message
    expected_error = ('ERRR on field: _min_rows: The dataset size is too small to split for min_rows=2.0: '
                      'must have at least 4.0 (weighted) rows, but have only 3.0.\n')
    bad_rows = models_list[segment_col] == bad_segment
    bad_errors = models_list["Errors"][bad_rows].as_data_frame()
    assert bad_errors["Errors"][0] == expected_error

    # Every successfully built segment model must match a stand-alone model
    mp = models_list.as_data_frame()
    for _, entry in mp.iterrows():
        segment = int(entry[segment_col])
        if segment == bad_segment:
            continue
        bulk_model = h2o.get_model(entry["Model"])
        segment_rows = prostate[prostate[segment_col] == segment]
        single_gbm = H2OGradientBoostingEstimator(**params)
        single_gbm.train(y="CAPSULE",
                         ignored_columns=["ID"],
                         training_frame=segment_rows)
        pyunit_utils.check_models(bulk_model, single_gbm)
Beispiel #15
0
def spaces_in_column_names():
    """Train a one-tree bernoulli GBM on a frame whose column names contain spaces."""
    response = "r e s p o n s e"
    predictors = ["p r e d i c t o r 1","predictor2","p r e d i ctor3","pre d ictor4","predictor5"]

    train_data = h2o.upload_file(path=pyunit_utils.locate("smalldata/jira/spaces_in_column_names.csv"))
    train_data.show()
    train_data.describe()
    train_data[response] = train_data[response].asfactor()

    model = H2OGradientBoostingEstimator(ntrees=1, distribution="bernoulli", min_rows=1)
    model.train(x=predictors, y=response, training_frame=train_data)
    model.show()
def test_stacked_ensemble_is_able_to_use_imported_base_models():
    """A stacked ensemble must accept base models that were saved and re-uploaded.

    Base models and their cross-validation holdout predictions are written to
    a temporary directory, the cluster is wiped with ``h2o.remove_all()``,
    everything is loaded back (holdout frames under their original ids, the
    training data under new frame names) and a second ensemble is built from
    the re-uploaded models.
    """
    import tempfile
    import shutil
    import glob

    def build_ensemble(train_frame, valid_frame, predictors, response, models):
        # Stacked ensemble over the given base models, referenced by model id.
        ensemble = H2OStackedEnsembleEstimator(
            training_frame=train_frame,
            validation_frame=valid_frame,
            base_models=[m.model_id for m in models])
        ensemble.train(x=predictors, y=response, training_frame=train_frame)
        return ensemble

    train = h2o.import_file(pu.locate("smalldata/iris/iris_train.csv"))
    test = h2o.import_file(pu.locate("smalldata/iris/iris_test.csv"))
    y = "species"
    x = train.columns
    x.remove(y)

    cv_settings = dict(nfolds=2,
                       fold_assignment="Modulo",
                       keep_cross_validation_predictions=True)
    gbm = H2OGradientBoostingEstimator(**cv_settings)
    gbm.train(x=x, y=y, training_frame=train)
    drf = H2ORandomForestEstimator(**cv_settings)
    drf.train(x=x, y=y, training_frame=train)

    se = build_ensemble(train, test, x, y, [gbm, drf])
    assert len(se.base_models) == 2

    TMP_DIR = tempfile.mkdtemp()
    try:
        # Persist the models and their holdout predictions
        h2o.save_model(gbm, TMP_DIR + "/gbm.model")
        h2o.save_model(drf, TMP_DIR + "/drf.model")

        gbm_holdout_id = gbm.cross_validation_holdout_predictions().frame_id
        drf_holdout_id = drf.cross_validation_holdout_predictions().frame_id
        h2o.export_file(gbm.cross_validation_holdout_predictions(), TMP_DIR + "/gbm.holdout")
        h2o.export_file(drf.cross_validation_holdout_predictions(), TMP_DIR + "/drf.holdout")

        # Wipe the cluster, then restore the holdout frames under their old ids
        h2o.remove_all()

        h2o.import_file(TMP_DIR + "/gbm.holdout", gbm_holdout_id)
        h2o.import_file(TMP_DIR + "/drf.holdout", drf_holdout_id)

        gbm = h2o.upload_model(glob.glob(TMP_DIR + "/gbm.model/*")[0])
        drf = h2o.upload_model(glob.glob(TMP_DIR + "/drf.model/*")[0])

        # Same data, deliberately imported under different frame names
        train = h2o.import_file(pu.locate("smalldata/iris/iris_train.csv"), "some_other_name_of_training_frame")
        test = h2o.import_file(pu.locate("smalldata/iris/iris_test.csv"), "some_other_name_of_test_frame")
        y = "species"
        x = train.columns
        x.remove(y)

        se_loaded = build_ensemble(train, test, x, y, [gbm, drf])
        assert len(se_loaded.base_models) == 2
    finally:
        shutil.rmtree(TMP_DIR)
def gbm_predict_contributions_sorting_large():
    """Check the shape of sorted contributions predicted on the credit-card-fraud data.

    With ``top_n=-1, bottom_n=0, compare_abs=False`` the contributions frame
    is expected to have 61 columns and 284807 rows.
    """
    fr = h2o.import_file(path=pyunit_utils.locate("bigdata/laptop/creditcardfraud/creditcardfraud.csv"))

    model = H2OGradientBoostingEstimator(ntrees=10, seed=1234)
    model.train(x=list(range(fr.ncol)), y=30, training_frame=fr)

    contributions = model.predict_contributions(fr, top_n=-1, bottom_n=0, compare_abs=False)
    n_rows, n_cols = contributions.shape
    assert_equals(61, n_cols, "Wrong number of columns")
    assert_equals(284807, n_rows, "Wrong number of rows")
Beispiel #18
0
 def create_grid(self):
     """Build an H2O grid search around a GBM estimator.

     The estimator is created from ``self.model_params``; the grid uses
     ``self.hyper_params``, ``self.grid_id`` and ``self.search_params``.
     The grid object is returned as constructed; no ``train`` call is made here.
     """
     return H2OGridSearch(
         model=H2OGradientBoostingEstimator(**self.model_params),
         hyper_params=self.hyper_params,
         grid_id=self.grid_id,
         search_criteria=self.search_params)
def pub_445_long_request_uri():
    """Train a GBM with 784 explicitly listed predictors on MNIST.

    Both the training and the validation frame are imported from the same
    ``train.csv.gz`` file; column 784 is renamed to "label" and used as the
    response.
    """
    source = "bigdata/laptop/mnist/train.csv.gz"
    training = h2o.import_file(path=pyunit_utils.locate(source))
    validation = h2o.import_file(path=pyunit_utils.locate(source))

    for frame in (training, validation):
        frame.set_name(col=784, name="label")

    predictors = list(range(784))
    model = H2OGradientBoostingEstimator(ntrees=2, max_depth=2)
    model.train(x=predictors,
                y="label",
                training_frame=training,
                validation_frame=validation)
def offset_init_train_gbm():
    """Check that ``offset_column`` behaves the same in ``train()`` and the constructor.

    Three otherwise identical single-tree GBMs are trained with the offset
    column given (a) only to ``train``, (b) only to the constructor and
    (c) to both; their predictions on the training frame are compared.
    """
    # Load the cars data and keep only rows with a known response
    cars = h2o.upload_file(
        pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    cars = cars[cars["economy_20mpg"].isna() == 0]
    cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()
    # Constant offset column; sized from the frame (instead of a hard-coded
    # 398) so the cbind below stays valid if the filtered row count changes.
    offset = h2o.H2OFrame([[.5]] * cars.nrow)
    offset.set_names(["x1"])
    cars = cars.cbind(offset)

    # offset_column passed in the train method
    gbm_train = H2OGradientBoostingEstimator(ntrees=1,
                                             max_depth=1,
                                             min_rows=1,
                                             learn_rate=1)
    gbm_train.train(x=list(range(2, 8)),
                    y="economy_20mpg",
                    training_frame=cars,
                    offset_column="x1")
    predictions_train = gbm_train.predict(cars)

    # test offset_column passed in estimator init
    gbm_init = H2OGradientBoostingEstimator(ntrees=1,
                                            max_depth=1,
                                            min_rows=1,
                                            learn_rate=1,
                                            offset_column="x1")
    gbm_init.train(x=list(range(2, 8)), y="economy_20mpg", training_frame=cars)
    predictions_init = gbm_init.predict(cars)

    # when the offset column is set in both places, the value given to train is used
    gbm_init_train = H2OGradientBoostingEstimator(ntrees=1,
                                                  max_depth=1,
                                                  min_rows=1,
                                                  learn_rate=1,
                                                  offset_column="x1")
    gbm_init_train.train(x=list(range(2, 8)),
                         y="economy_20mpg",
                         training_frame=cars,
                         offset_column="x1")
    predictions_init_train = gbm_init_train.predict(cars)

    # NOTE(review): `==` on two H2OFrames presumably yields a frame of
    # element-wise results rather than a bool, so these asserts may pass
    # regardless of content -- confirm and consider an explicit frame
    # comparison helper.
    assert predictions_train == predictions_init, "Expected predictions of a model with offset_column in train method has to be same as predictions of a model with offset_column in constructor."
    assert predictions_train == predictions_init_train, "Expected predictions of a model with offset_column in train method has to be same as predictions of a model with offset_column in both constructor and init."
def algo_pr_auc_test():
    '''
    This pyunit test is written to make sure we can call pr_auc() on all binomial models.

    GBM, GLM, random forest and deep learning classifiers are trained on the
    prostate data and their pr_auc() values are compared loosely (pairwise
    difference below 0.9). Finally pr_auc() is called on a gaussian GLM
    (regression), which is expected to raise.
    '''

    seed = 123456789
    prostate_train = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate_train.csv"))
    prostate_train["CAPSULE"] = prostate_train["CAPSULE"].asfactor()

    # Build H2O GBM classification model:
    gbm_h2o = H2OGradientBoostingEstimator(ntrees=10, learn_rate=0.1, max_depth=4, min_rows=10,
                                           distribution="bernoulli", seed=seed)
    gbm_h2o.train(x=list(range(1,prostate_train.ncol)),y="CAPSULE", training_frame=prostate_train)
    print("***************************   Printing GBM model")
    print(gbm_h2o)
    print("pr_auc for GBM model is {0}".format(gbm_h2o.pr_auc()))

    # Build H2O GLM classification model:
    glm_h2o = H2OGeneralizedLinearEstimator(family='binomial', seed=seed)
    glm_h2o.train(x=list(range(1,prostate_train.ncol)),y="CAPSULE", training_frame=prostate_train)
    print("***************************   Printing GLM model")
    print(glm_h2o)  # glm scoring history does not contain AUC, and hence no pr_auc
    print("pr_auc for GLM model is {0}".format(glm_h2o.pr_auc()))

    # Build H2O random forest classification model:
    rf_h2o = H2ORandomForestEstimator(ntrees=10, score_tree_interval=0)
    rf_h2o.train(x=list(range(1,prostate_train.ncol)),y="CAPSULE", training_frame=prostate_train)
    print("***************************   Printing random forest model")
    print(rf_h2o)
    print("pr_auc for Random Forest model is {0}".format(rf_h2o.pr_auc()))

    # Build H2O deep learning classification model:
    dl_h2o = H2ODeepLearningEstimator(distribution='bernoulli', seed=seed, hidden=[2,2])
    dl_h2o.train(x=list(range(1,prostate_train.ncol)),y="CAPSULE", training_frame=prostate_train)
    print("***************************   Printing deeplearning model")
    print(dl_h2o)
    print("pr_auc for deeplearning model is {0}".format(dl_h2o.pr_auc()))

    assert abs(gbm_h2o.pr_auc()-glm_h2o.pr_auc()) < 0.9, \
        "problem with pr_auc values"

    assert abs(rf_h2o.pr_auc()-dl_h2o.pr_auc()) < 0.9, \
        "problem with pr_auc values"

    assert abs(rf_h2o.pr_auc()-glm_h2o.pr_auc()) < 0.9, \
        "problem with pr_auc values"

    # try to call pr_auc() for regression.  Should encounter error.
    h2o_data = h2o.import_file(path=pyunit_utils.locate("smalldata/prostate/prostate_complete.csv.zip"))
    myY = "GLEASON"
    myX = ["ID","AGE","RACE","CAPSULE","DCAPS","PSA","VOL","DPROS"]
    h2o_model = H2OGeneralizedLinearEstimator(family="gaussian", link="identity",alpha=0.5, Lambda=0)
    h2o_model.train(x=myX, y=myY, training_frame=h2o_data)
    # Only the pr_auc() call sits inside the try block. The original wrapped
    # its own failure assert in a bare `except: pass`, which swallowed the
    # AssertionError, so this check could never fail.
    try:
        print(h2o_model.pr_auc())
    except Exception:
        pass  # expected: pr_auc() is not available for a regression model
    else:
        assert False, "pr_auc() should raise an error for a regression model but did not."
def bernoulli_gbm():
    """Compare an H2O bernoulli GBM against scikit-learn's GBM on prostate data.

    Both models are trained on ``prostate_train.csv`` with the same number of
    trees, learning rate, depth and minimum leaf size, then scored on
    ``prostate_test.csv``. The test passes when the H2O test AUC is at least
    as high as the scikit-learn test AUC.

    Raises:
        AssertionError: if the H2O AUC is lower than the scikit-learn AUC.
    """
    from h2o.estimators.gbm import H2OGradientBoostingEstimator

    train_path = pyunit_utils.locate("smalldata/logreg/prostate_train.csv")
    test_path = pyunit_utils.locate("smalldata/logreg/prostate_test.csv")

    # H2O training frame; only the CAPSULE response is converted to a factor.
    prostate_train = h2o.import_file(path=train_path)
    prostate_train["CAPSULE"] = prostate_train["CAPSULE"].asfactor()

    # Same file as a numpy array for scikit-learn: column 0 is used as the
    # response and the remaining columns as features.
    train_data = np.loadtxt(train_path, delimiter=',', skiprows=1)
    train_response = train_data[:, 0]
    train_features = train_data[:, 1:]

    ntrees = 100
    learning_rate = 0.1
    depth = 5
    min_rows = 10

    # Build H2O GBM classification model.
    gbm_h2o = H2OGradientBoostingEstimator(ntrees=ntrees,
                                           learn_rate=learning_rate,
                                           max_depth=depth,
                                           min_rows=min_rows,
                                           distribution="bernoulli")
    # Materialise the predictor indices as a list: on Python 3 a bare
    # ``range`` object is not a list, and the other tests in this file pass
    # ``list(range(...))`` as well.
    gbm_h2o.train(x=list(range(1, prostate_train.ncol)), y="CAPSULE",
                  training_frame=prostate_train)

    # Build scikit-learn GBM classification model with the same parameters.
    gbm_sci = ensemble.GradientBoostingClassifier(learning_rate=learning_rate,
                                                  n_estimators=ntrees,
                                                  max_depth=depth,
                                                  min_samples_leaf=min_rows,
                                                  max_features=None)
    gbm_sci.fit(train_features, train_response)

    # H2O test frame, again with CAPSULE as a factor.
    prostate_test = h2o.import_file(path=test_path)
    prostate_test["CAPSULE"] = prostate_test["CAPSULE"].asfactor()

    # Test data as a numpy array for the scikit-learn comparison.
    test_data = np.loadtxt(test_path, delimiter=',', skiprows=1)
    test_response = test_data[:, 0]
    test_features = test_data[:, 1:]

    # Score both models on the test data and compare AUCs.
    auc_sci = roc_auc_score(test_response,
                            gbm_sci.predict_proba(test_features)[:, 1])
    auc_h2o = gbm_h2o.model_performance(prostate_test).auc()

    assert auc_h2o >= auc_sci, "h2o (auc) performance degradation, with respect to scikit"
Beispiel #23
0
def mojo_predict_api_test(sandbox_dir):
    """Exercise ``h2o.mojo_predict_csv`` with default and custom paths.

    A bernoulli GBM is trained on the prostate data, its MOJO is downloaded
    via the file's ``download_mojo`` helper, and predictions are produced
    from a CSV exported out of the same frame. The test checks the default
    genmodel/output locations first, then a genmodel jar stored in a
    separate temporary directory, and finally an explicit output CSV path.

    Args:
        sandbox_dir: directory that receives the input CSV, the MOJO zip,
            the default genmodel jar and the default prediction CSV.

    Raises:
        AssertionError: if an expected file is missing, an unexpected file
            exists, or prediction succeeds without a genmodel jar in the
            default location.
    """
    frame = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))

    csv_in = "%s/in.csv" % sandbox_dir
    csv_out = "%s/prediction.csv" % sandbox_dir
    h2o.export_file(frame[1, 2:], csv_in)

    frame[1] = frame[1].asfactor()
    gbm = H2OGradientBoostingEstimator(distribution="bernoulli")
    gbm.train(x=list(range(2, 9)), y=1, training_frame=frame)

    # Download the MOJO; both the zip and the default genmodel jar are
    # expected to land in the sandbox directory.
    mojo_zip = os.path.join(sandbox_dir, 'model.zip')
    default_jar = os.path.join(sandbox_dir, 'h2o-genmodel.jar')
    download_mojo(gbm, mojo_zip)
    for artifact in (mojo_zip, default_jar):
        assert os.path.isfile(artifact)

    # Prediction with default paths, then with the jar path given explicitly.
    h2o.mojo_predict_csv(input_csv_path=csv_in, mojo_zip_path=mojo_zip, verbose=True)
    h2o.mojo_predict_csv(input_csv_path=csv_in, mojo_zip_path=mojo_zip,
                         genmodel_jar_path=default_jar, verbose=True)
    assert os.path.isfile(csv_out)
    for leftover in (mojo_zip, default_jar, csv_out):
        os.remove(leftover)

    # Prediction with a genmodel jar kept outside the sandbox directory.
    scratch_dir = tempfile.mkdtemp()
    try:
        custom_jar = os.path.join(scratch_dir, 'h2o-genmodel-custom.jar')
        download_mojo(gbm, mojo_zip, custom_jar)
        for artifact in (mojo_zip, custom_jar):
            assert os.path.isfile(artifact)

        # Without an explicit jar path the default location is used, which
        # is now empty, so the call is expected to raise RuntimeError.
        try:
            h2o.mojo_predict_csv(input_csv_path=csv_in, mojo_zip_path=mojo_zip, verbose=True)
        except RuntimeError:
            pass
        else:
            assert False, "There should be no h2o-genmodel.jar at %s" % sandbox_dir
        assert not os.path.isfile(csv_out)

        h2o.mojo_predict_csv(input_csv_path=csv_in, mojo_zip_path=mojo_zip,
                             genmodel_jar_path=custom_jar, verbose=True)
        assert os.path.isfile(csv_out)
        os.remove(csv_out)

        # Prediction with a custom output CSV path.
        csv_out = "%s/out.prediction" % scratch_dir
        h2o.mojo_predict_csv(input_csv_path=csv_in, mojo_zip_path=mojo_zip,
                             genmodel_jar_path=custom_jar, verbose=True,
                             output_csv_path=csv_out)
        assert os.path.isfile(csv_out)
        for leftover in (mojo_zip, custom_jar, csv_out):
            os.remove(leftover)
    finally:
        shutil.rmtree(scratch_dir)
 def test_h2o_regressor_unsupported_dists(self):
     """Converting a GBM MOJO built with an unsupported distribution must fail.

     For each of the poisson, gamma and tweedie distributions a small GBM
     is exported through ``_make_mojo`` and passed to ``_convert_mojo``,
     which is expected to raise ``ValueError`` with a message containing
     "not supported".
     """
     diabetes = load_diabetes()
     # Only the training split is needed; the test split is discarded.
     train, _ = _train_test_split_as_frames(diabetes.data, diabetes.target)
     not_supported_dists = ["poisson", "gamma", "tweedie"]
     for dist in not_supported_dists:
         gbm = H2OGradientBoostingEstimator(ntrees=7, max_depth=5, distribution=dist)
         mojo_path = _make_mojo(gbm, train)
         with self.assertRaises(ValueError) as err:
             _convert_mojo(mojo_path)
         # assertRegexpMatches is a deprecated alias that was removed in
         # Python 3.12; assertRegex is the supported spelling.
         self.assertRegex(err.exception.args[0], "not supported")
Beispiel #25
0
def gbm_demo():
    """Train a small GBM on the ``CAPSULE`` column and print its ``type``.

    NOTE(review): ``df`` is not defined inside this function; presumably it
    is a module-level H2OFrame created elsewhere in the file -- confirm.
    """
    from h2o.estimators.gbm import H2OGradientBoostingEstimator

    # Turn column 1 into a categorical column in place.
    df[1] = df[1].asfactor()

    gbm = H2OGradientBoostingEstimator(max_depth=5, ntrees=10)

    # Every column from index 2 onward is used as a predictor.
    predictor_names = df.names[2:]
    gbm.train(training_frame=df, y='CAPSULE', x=predictor_names)

    print('m.type_print:', gbm.type)
Beispiel #26
0
def test_weights_column_not_in_train():
    """Training with a weights column absent from the frame must be rejected.

    The GBM is configured with ``weights_column='foo'`` and trained on the
    prostate frame. The test requires an ``H2OResponseError`` whose text
    mentions the ``_weights_column`` field.
    """
    try:
        frame = h2o.import_file(
            pyunit_utils.locate("smalldata/prostate/prostate.csv"))
        estimator = H2OGradientBoostingEstimator(weights_column='foo', seed=1234)
        estimator.train(training_frame=frame, y=-1)
    except H2OResponseError as err:
        message = str(err)
        assert "ERRR on field: _weights_column" in message, \
            "Model building should fail with this in message."
    else:
        # Reaching this branch means training unexpectedly succeeded.
        assert False, "Model building should fail."
Beispiel #27
0
def test_binomial_response_warning():
    """Check the two-unique-values warning when training a regression GBM.

    A one-tree GBM is trained on the titanic data with the numeric
    ``survived`` column as response, and the captured warnings must include
    the hint about converting the target to categorical. The same check is
    repeated after every 0 in the response has been replaced by -1.

    NOTE(review): the second pass still expects the message that mentions
    "(0/1)" although the response values are then -1/1 -- confirm this is
    intended.
    """
    frame = h2o.import_file(pyunit_utils.locate("smalldata/gbm_test/titanic.csv"))
    response = "survived"
    predictors = ["name", "sex"]

    expected = ('We have detected that your response column has only 2 unique values (0/1). '
                'If you wish to train a binary model instead of a regression model, '
                'convert your target column to categorical before training.')

    def train_and_expect_warning():
        # Train a fresh single-tree GBM and require the warning among those captured.
        with pyunit_utils.catch_warnings() as caught:
            gbm = H2OGradientBoostingEstimator(ntrees=1)
            gbm.train(x=predictors, y=response, training_frame=frame)
            assert pyunit_utils.contains_warning(caught, expected)

    train_and_expect_warning()

    # Remap the zeros in the response to -1 and run the same check again.
    frame[frame[response] == 0, response] = -1
    train_and_expect_warning()
def pubdev_1696():
    """A negative ``nfolds`` value must make GBM training raise an error.

    Trains on the iris data with ``nfolds=-99`` and expects an
    ``EnvironmentError``; the test fails if no error is raised.
    """
    iris = h2o.import_file(pyunit_utils.locate("smalldata/iris/iris.csv"))

    try:
        gbm = H2OGradientBoostingEstimator(nfolds=-99)
        gbm.train(x=[0, 1, 2], y=3, training_frame=iris)
    except EnvironmentError:
        # Expected outcome: the invalid fold count was rejected.
        pass
    else:
        assert False, "expected an error"
Beispiel #29
0
def nfold_predict():
    """Average the predictions of all cross-validation models of a GBM.

    A GBM with 10 folds and 10 trees is trained on the prostate training
    data. Every cross-validation model then scores the full frame, the
    predictions are summed and divided by 10, and the result is shown.
    """
    frame = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate_train.csv"))

    gbm = H2OGradientBoostingEstimator(nfolds=10, ntrees=10)
    gbm.train(x=list(range(2, frame.ncol)), y=1, training_frame=frame)
    fold_models = gbm.get_xval_models()

    # Add a constant "weights" column to the frame before scoring.
    frame["weights"] = 1

    fold_predictions = []
    for fold_model in fold_models:
        fold_predictions.append(fold_model.predict(frame))

    # The divisor 10 matches the nfolds value used above.
    averaged = old_div(sum(fold_predictions), 10)
    averaged.show()
Beispiel #30
0
def test_gbm_train_segments_parallel():
    """Train per-segment GBMs on the prostate data with ``parallelism=2``.

    The frame is segmented by ``RACE`` and the ``ID`` column is ignored.
    The frame listing the resulting segment models must have exactly three
    rows.
    """
    frame = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    frame["CAPSULE"] = frame["CAPSULE"].asfactor()

    gbm = H2OGradientBoostingEstimator(min_rows=2, ntrees=4, seed=42)
    segment_args = dict(
        y="CAPSULE",
        ignored_columns=["ID"],
        training_frame=frame,
        segments=["RACE"],
        parallelism=2,
    )
    segment_models = gbm.train_segments(**segment_args)

    summary = segment_models.as_frame()
    assert summary.nrow == 3
# Inspect the column names of the Spark DataFrame. As a bare expression this
# only displays a value in an interactive session.
# NOTE(review): `df`, `hc`, `column_names` and `GBM` are defined outside this
# fragment -- confirm against the rest of the script.
df.columns

# Optionally print column data types. Note that Spark intelligently
# identifies that the predictor columns are double because I had
# made all of them rdd elements double (above). This saved me from
# having to write really ugly Spark casting code
df.schema.fields

# Convert the Spark DataFrame to something that H2O can ingest
df_h2o = hc.as_h2o_frame(df,"df_h2o")

'''



'''

# Every column except the last is a predictor; the last column is the response.
predictors = column_names[:-1]
response = column_names[-1]

# Two ratios are passed and three resulting frames are indexed below.
ratios = [0.6,0.2]
h2o_frame_splits = df_h2o.split_frame(ratios,seed=12345)
train = h2o_frame_splits[0]
train.frame_id = "Train"
# Note the ordering: validation takes the last split (index 2) and test the
# middle one (index 1).
valid = h2o_frame_splits[2]
valid.frame_id = "Validation"
test = h2o_frame_splits[1]
test.frame_id = "Test"

# Train a multinomial GBM on the training split, validated on the validation split.
# NOTE(review): `GBM` is presumably an alias for H2OGradientBoostingEstimator -- confirm.
model = GBM(ntrees=50,max_depth=6,learn_rate=0.1,distribution="multinomial")
model.train(x=predictors,y=response,training_frame=train,validation_frame=valid)