def _chisq_median_labels(X):
    """Label each row of X as +1 or -1 from its sum of squared entries.

    A row is labelled +1 when the sum of its squared entries exceeds the
    median of a chi-squared distribution with one degree of freedom per
    column of X, and -1 otherwise.

    :param X: 2-D numpy array of shape (rows, cols).
    :return: 1-D integer numpy array of length ``rows`` holding +1 / -1.
    """
    # The threshold depends only on the column count, so compute it once
    # instead of once per row.
    threshold = scipy.stats.chi2.ppf(0.5, X.shape[1])
    row_sums = np.sum(np.multiply(X, X), axis=1)
    return np.where(row_sums > threshold, 1, -1)


def bernoulli_synthetic_data_gbm_medium():
    """Compare H2O's GBM with scikit-learn's GBM on synthetic binary data.

    Draws standard-normal training (10000 x 10) and testing (2000 x 10)
    matrices, labels each row with ``_chisq_median_labels``, fits a
    scikit-learn ``GradientBoostingClassifier`` and an H2O GBM with matching
    hyper-parameters, and compares their AUC on the test set.

    :raises AssertionError: if the two AUC values differ by 1e-2 or more.
    """
    # Generate training dataset
    # (adaptation of http://www.stat.missouri.edu/~speckman/stat461/boost.R)
    train_rows = 10000
    train_cols = 10

    # Generate variables V1, ... V10
    X_train = np.random.randn(train_rows, train_cols)
    # y = +1 if sum_i x_{ij}^2 > chisq median on 10 df
    y_train = _chisq_median_labels(X_train)

    # Train scikit gbm
    # TODO: grid-search
    distribution = "bernoulli"
    ntrees = 150
    min_rows = 1
    max_depth = 2
    learn_rate = .01
    nbins = 20

    gbm_sci = ensemble.GradientBoostingClassifier(learning_rate=learn_rate,
                                                  n_estimators=ntrees,
                                                  max_depth=max_depth,
                                                  min_samples_leaf=min_rows,
                                                  max_features=None)
    gbm_sci.fit(X_train, y_train)

    # Generate testing dataset
    test_rows = 2000
    test_cols = 10

    # Generate variables V1, ... V10
    X_test = np.random.randn(test_rows, test_cols)
    # y = +1 if sum_i x_{ij}^2 > chisq median on 10 df
    y_test = _chisq_median_labels(X_test)

    # Score (AUC) the scikit gbm model on the test data; column 1 of
    # predict_proba is the probability of the second class (+1).
    auc_sci = roc_auc_score(y_test, gbm_sci.predict_proba(X_test)[:, 1])

    # Compare this result to H2O. The response is placed in the first column
    # ("C1") and the frame is handed over column-wise via zip(*rows).
    train_h2o = H2OFrame.fromPython(zip(*np.column_stack((y_train, X_train)).tolist()))
    test_h2o = H2OFrame.fromPython(zip(*np.column_stack((y_test, X_test)).tolist()))

    gbm_h2o = h2o.gbm(x=train_h2o[1:],
                      y=train_h2o["C1"].asfactor(),
                      distribution=distribution,
                      ntrees=ntrees,
                      min_rows=min_rows,
                      max_depth=max_depth,
                      learn_rate=learn_rate,
                      nbins=nbins)
    gbm_perf = gbm_h2o.model_performance(test_h2o)
    auc_h2o = gbm_perf.auc()

    assert abs(auc_h2o - auc_sci) < 1e-2, (
        "h2o (auc) performance degradation, with respect to scikit. h2o auc: {0} "
        "scickit auc: {1}".format(auc_h2o, auc_sci))
def upload_file():
    """Upload a CSV and build H2OFrames from several Python containers.

    Uploads ``smalldata/logreg/prostate.csv`` and then exercises
    ``H2OFrame.fromPython`` with lists, tuples, plain dicts and
    ``collections.OrderedDict`` inputs (including mixed numeric/string data
    and columns of unequal length), calling ``describe()`` on each result.
    """
    import collections
    from h2o import H2OFrame

    prostate = h2o.upload_file(pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    print(prostate.describe())

    # using lists []
    single_list = [[0, 1, 2, 3, 4]]
    frame_from_list = H2OFrame.fromPython(zip(*single_list))
    print(frame_from_list.describe())

    mixed_lists = [[0, 1, 2, 3], [5, 6, "hi", "dog"]]
    frame_from_lists = H2OFrame.fromPython(zip(*mixed_lists))
    print(frame_from_lists.describe())

    # using tuples ()
    single_tuple = [(0, 1, 2, 3, 4)]
    frame_from_tuple = H2OFrame.fromPython(zip(*single_tuple))
    print(frame_from_tuple.describe())

    mixed_tuples = ((0, 1, 2, 3), (5, 6, "hi", "dog"))
    frame_from_tuples = H2OFrame.fromPython(zip(*mixed_tuples))
    print(frame_from_tuples.describe())

    # using dicts {}
    numeric_columns = {"column1": [5, 4, 3, 2, 1], "column2": (1, 2, 3, 4, 5)}
    frame_from_dict = H2OFrame.fromPython(numeric_columns)
    frame_from_dict.describe()

    # columns of unequal length
    frame_from_ragged_dict = H2OFrame.fromPython({"colA": ["bilbo", "baggins"], "colB": ["meow"]})
    print(frame_from_ragged_dict.describe())

    # using collections.OrderedDict
    plain = {"colA": ["bilbo", "baggins"], "colB": ["meow"]}
    # wrapping a plain dict: still unordered!
    frame_from_wrapped_dict = H2OFrame.fromPython(collections.OrderedDict(plain))
    frame_from_wrapped_dict.describe()

    # make an ordered dictionary by inserting keys one at a time
    ordered = collections.OrderedDict()
    ordered["colA"] = ["bilbo", "baggins"]
    ordered["colB"] = ["meow"]
    frame_from_ordered_dict = H2OFrame.fromPython(collections.OrderedDict(ordered))
    frame_from_ordered_dict.describe()