def _chisq_median_dataset(rows, cols):
  """Draw a synthetic two-class dataset of the given shape.

  Features are independent standard-normal draws from the global NumPy
  random state. A row is labelled +1 when the sum of its squared features
  exceeds the median of a chi-squared distribution with `cols` degrees of
  freedom, and -1 otherwise.

  :param rows: number of observations to draw.
  :param cols: number of features per observation.
  :return: tuple (X, y); X has shape (rows, cols), y holds the +1/-1 labels.
  """
  X = np.random.randn(rows, cols)
  # The threshold is the same for every row, so evaluate it once instead of
  # once per observation.
  threshold = scipy.stats.chi2.ppf(0.5, cols)
  y = np.where(np.multiply(X, X).sum(axis=1) > threshold, 1, -1)
  return X, y


def bernoulli_synthetic_data_gbm_medium():
  """Check that H2O's GBM matches scikit-learn's GBM on synthetic data.

  Trains both implementations with the same settings on 10000 x 10 random
  observations, scores each on 2000 fresh observations, and compares the
  test AUCs.

  :raises AssertionError: if the two AUCs differ by 1e-2 or more.
  """
  # Generate training dataset (adaptation of http://www.stat.missouri.edu/~speckman/stat461/boost.R)
  train_rows = 10000
  train_cols = 10
  X_train, y_train = _chisq_median_dataset(train_rows, train_cols)

  # Train scikit gbm
  # TODO: grid-search
  distribution = "bernoulli"
  ntrees = 150
  min_rows = 1
  max_depth = 2
  learn_rate = .01
  nbins = 20  # only passed to the H2O model below

  gbm_sci = ensemble.GradientBoostingClassifier(learning_rate=learn_rate, n_estimators=ntrees, max_depth=max_depth,
                                                min_samples_leaf=min_rows, max_features=None)
  gbm_sci.fit(X_train, y_train)

  # Generate testing dataset (drawn after fitting, as a separate sample)
  test_rows = 2000
  test_cols = 10
  X_test, y_test = _chisq_median_dataset(test_rows, test_cols)

  # Score (AUC) the scikit gbm model on the test data, using column 1 of the
  # predicted probabilities.
  auc_sci = roc_auc_score(y_test, gbm_sci.predict_proba(X_test)[:, 1])

  # Compare this result to H2O. The response is stacked in front of the
  # features and the rows are transposed into columns; list(...) keeps the
  # argument a concrete list on Python 3, where zip is lazy.
  train_h2o = H2OFrame.fromPython(list(zip(*np.column_stack((y_train, X_train)).tolist())))
  test_h2o = H2OFrame.fromPython(list(zip(*np.column_stack((y_test, X_test)).tolist())))

  gbm_h2o = h2o.gbm(x=train_h2o[1:], y=train_h2o["C1"].asfactor(), distribution=distribution, ntrees=ntrees,
                    min_rows=min_rows, max_depth=max_depth, learn_rate=learn_rate, nbins=nbins)
  gbm_perf = gbm_h2o.model_performance(test_h2o)
  auc_h2o = gbm_perf.auc()

  #Log.info(paste("scikit AUC:", auc_sci, "\tH2O AUC:", auc_h2o))
  assert abs(auc_h2o - auc_sci) < 1e-2, "h2o (auc) performance degradation, with respect to scikit. h2o auc: {0} " \
                                        "scickit auc: {1}".format(auc_h2o, auc_sci)
# Example #2
# 0
def upload_file():
    """Exercise ``h2o.upload_file`` and ``H2OFrame.fromPython``.

    Uploads the prostate CSV, then builds frames from Python lists, tuples,
    plain dicts and ``collections.OrderedDict`` objects, calling
    ``describe()`` on each resulting frame.

    ``print`` is written in its call form with a single argument, which
    behaves the same on Python 2 and is valid syntax on Python 3.
    """
    import collections

    from h2o import H2OFrame

    a = h2o.upload_file(pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    print(a.describe())

    # using lists []
    # zip(*rows) transposes the rows; list(...) keeps the argument a concrete
    # list on Python 3, where zip is lazy.
    py_list_to_h2o = H2OFrame.fromPython(list(zip(*[[0, 1, 2, 3, 4]])))
    print(py_list_to_h2o.describe())

    py_list_to_h2o_2 = H2OFrame.fromPython(list(zip(*[[0, 1, 2, 3], [5, 6, "hi", "dog"]])))
    print(py_list_to_h2o_2.describe())

    # using tuples ()
    py_tuple_to_h2o = H2OFrame.fromPython(list(zip(*[(0, 1, 2, 3, 4)])))
    print(py_tuple_to_h2o.describe())

    py_tuple_to_h2o_2 = H2OFrame.fromPython(list(zip(*((0, 1, 2, 3), (5, 6, "hi", "dog")))))
    print(py_tuple_to_h2o_2.describe())

    # using dicts {} -- one column given as a list, the other as a tuple
    py_dict_to_h2o = H2OFrame.fromPython({"column1": [5, 4, 3, 2, 1],
                                          "column2": (1, 2, 3, 4, 5)})
    py_dict_to_h2o.describe()

    # columns of unequal length (two values vs. one)
    py_dict_to_h2o_2 = H2OFrame.fromPython({"colA": ["bilbo", "baggins"], "colB": ["meow"]})
    print(py_dict_to_h2o_2.describe())

    # using collections.OrderedDict
    d = {"colA": ["bilbo", "baggins"], "colB": ["meow"]}  # still unordered!
    py_ordered_dict_to_h2o = H2OFrame.fromPython(collections.OrderedDict(d))
    py_ordered_dict_to_h2o.describe()

    # make an ordered dictionary!
    d2 = collections.OrderedDict()
    d2["colA"] = ["bilbo", "baggins"]
    d2["colB"] = ["meow"]

    py_ordered_dict_to_h2o_2 = H2OFrame.fromPython(collections.OrderedDict(d2))
    py_ordered_dict_to_h2o_2.describe()