def demo_body(go):
    """
    Demo of H2O's Gradient Boosting estimator.

    This demo uploads a dataset to h2o, parses it, and shows a description.
    Then it divides the dataset into training and test sets, builds a GBM
    from the training set, and makes predictions for the test set.
    Finally, default performance metrics are displayed.
    """
    go()
    # Connect to H2O
    h2o.init()

    go()
    # Upload the prostate dataset that comes included in the h2o python package
    prostate = h2o.load_dataset("prostate")

    go()
    # Print a description of the prostate data
    prostate.describe()

    go()
    # Randomly split the dataset into ~70/30 training/test sets
    train, test = prostate.split_frame(ratios=[0.70])

    go()
    # Convert the response columns to factors (for binary classification problems)
    train["CAPSULE"] = train["CAPSULE"].asfactor()
    test["CAPSULE"] = test["CAPSULE"].asfactor()

    go()
    # Build a (classification) GBM
    from h2o.estimators import H2OGradientBoostingEstimator
    prostate_gbm = H2OGradientBoostingEstimator(distribution="bernoulli", ntrees=10,
                                                max_depth=8, min_rows=10, learn_rate=0.2)
    prostate_gbm.train(x=["AGE", "RACE", "PSA", "VOL", "GLEASON"], y="CAPSULE",
                       training_frame=train)

    go()
    # Show the model
    prostate_gbm.show()

    go()
    # Predict on the test set and show the first ten predictions
    predictions = prostate_gbm.predict(test)
    predictions.show()

    go()
    # Fetch a tree, print the number of tree nodes, and show the root node description
    from h2o.tree import H2OTree, H2ONode
    tree = H2OTree(prostate_gbm, 0, "0")
    print(len(tree))
    print(tree.left_children)
    print(tree.right_children)
    tree.root_node.show()

    go()
    # Show default performance metrics
    performance = prostate_gbm.model_performance(test)
    performance.show()
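# Usage note (sketch): demo bodies like the one above are normally driven by
# h2o.demo(), which executes the code one go() checkpoint at a time, echoing
# each step. Assuming this demo is registered under the name "gbm":
import h2o
h2o.demo("gbm")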
def main():
    args = parse_args()
    h2o.init(ip=args.host, port=args.port)

    # Upload the prostate dataset that comes included in the h2o python package
    prostate = h2o.load_dataset("prostate")

    # Print a description of the prostate data
    prostate.describe()

    # Randomly split the dataset into ~70/30 training/test sets
    client.update_task_info({
        'test_train': 0.7,
        'learn_rate': 0.2,
    })
    train, test = prostate.split_frame(ratios=[0.70])

    # Convert the response columns to factors (for binary classification problems)
    train["CAPSULE"] = train["CAPSULE"].asfactor()
    test["CAPSULE"] = test["CAPSULE"].asfactor()

    # Build a (classification) GBM
    from h2o.estimators import H2OGradientBoostingEstimator
    prostate_gbm = H2OGradientBoostingEstimator(distribution="bernoulli", ntrees=10,
                                                max_depth=8, min_rows=10, learn_rate=0.2)
    prostate_gbm.train(x=["AGE", "RACE", "PSA", "VOL", "GLEASON"], y="CAPSULE",
                       training_frame=train)

    # Show the model
    prostate_gbm.show()

    # Predict on the test set and show the first ten predictions
    predictions = prostate_gbm.predict(test)
    predictions.show()

    # Fetch a tree and show the root node description
    from h2o.tree import H2OTree, H2ONode
    tree = H2OTree(prostate_gbm, 0, "0")
    tree.root_node.show()

    # Show default performance metrics
    performance = prostate_gbm.model_performance(test)
    performance.show()
    client.update_task_info({
        'mse': performance.mse(),
        'rmse': performance.rmse(),
        'auc': performance.auc(),
        'gini': performance.gini(),
        'logloss': performance.logloss(),
    })
def h2oload_dataset():
    """
    Python API test: h2o.load_dataset(relative_path)
    """
    try:
        prostate = h2o.load_dataset("prostate")
        assert_is_type(prostate, H2OFrame)
    except Exception as e:
        assert False, "h2o.load_dataset() command is not working: %s" % e
def demo_body(go):
    """
    Demo of H2O's Deep Learning model.

    This demo uploads a dataset to h2o, parses it, and shows a description.
    Then it divides the dataset into training and test sets, builds a deep
    learning model from the training set, and makes predictions for the test
    set. Finally, default performance metrics are displayed.
    """
    go()
    # Connect to H2O
    h2o.init()

    go()
    # Upload the prostate dataset that comes included in the h2o python package
    prostate = h2o.load_dataset("prostate")

    go()
    # Print a description of the prostate data
    prostate.summary()

    go()
    # Randomly split the dataset into ~70/30 training/test sets
    r = prostate[0].runif()
    train = prostate[r < 0.70]
    test = prostate[r >= 0.70]

    go()
    # Convert the response columns to factors (for binary classification problems)
    train["CAPSULE"] = train["CAPSULE"].asfactor()
    test["CAPSULE"] = test["CAPSULE"].asfactor()

    go()
    # Build a (classification) deep learning model
    from h2o.estimators import H2ODeepLearningEstimator
    prostate_dl = H2ODeepLearningEstimator(activation="Tanh", hidden=[10, 10, 10],
                                           epochs=10000)
    prostate_dl.train(x=list(set(prostate.col_names) - {"ID", "CAPSULE"}),
                      y="CAPSULE", training_frame=train)

    go()
    # Show the model
    prostate_dl.show()

    go()
    # Predict on the test set and show the first ten predictions
    predictions = prostate_dl.predict(test)
    predictions.show()

    go()
    # Show default performance metrics
    performance = prostate_dl.model_performance(test)
    performance.show()
def demo_body(go):
    """
    Demo of H2O's Generalized Linear Estimator.

    This demo uploads a dataset to h2o, parses it, and shows a description.
    Then it divides the dataset into training and test sets, builds a GLM
    from the training set, and makes predictions for the test set.
    Finally, default performance metrics are displayed.
    """
    go()
    # Connect to H2O
    h2o.init()

    go()
    # Upload the prostate dataset that comes included in the h2o python package
    prostate = h2o.load_dataset("prostate")

    go()
    # Print a description of the prostate data
    prostate.summary()

    go()
    # Randomly split the dataset into ~70/30 training/test sets
    r = prostate[0].runif()
    train = prostate[r < 0.70]
    test = prostate[r >= 0.70]

    go()
    # Convert the response columns to factors (for binary classification problems)
    train["CAPSULE"] = train["CAPSULE"].asfactor()
    test["CAPSULE"] = test["CAPSULE"].asfactor()

    go()
    # Build a (classification) GLM
    from h2o.estimators import H2OGeneralizedLinearEstimator
    prostate_glm = H2OGeneralizedLinearEstimator(family="binomial", alpha=[0.5])
    prostate_glm.train(x=["AGE", "RACE", "PSA", "VOL", "GLEASON"], y="CAPSULE",
                       training_frame=train)

    go()
    # Show the model
    prostate_glm.show()

    go()
    # Predict on the test set and show the first ten predictions
    predictions = prostate_glm.predict(test)
    predictions.show()

    go()
    # Show default performance metrics
    performance = prostate_glm.model_performance(test)
    performance.show()
def demo_body(go):
    """
    Demo of H2O's Deep Learning model.

    This demo uploads a dataset to h2o, parses it, and shows a description.
    Then it divides the dataset into training and test sets, builds a deep
    learning model from the training set, and makes predictions for the test
    set. Finally, default performance metrics are displayed.
    """
    go()
    # Connect to H2O
    h2o.init()

    go()
    # Upload the prostate dataset that comes included in the h2o python package
    prostate = h2o.load_dataset("prostate")

    go()
    # Print a description of the prostate data
    prostate.describe()

    go()
    # Randomly split the dataset into ~70/30 training/test sets
    train, test = prostate.split_frame(ratios=[0.70])

    go()
    # Convert the response columns to factors (for binary classification problems)
    train["CAPSULE"] = train["CAPSULE"].asfactor()
    test["CAPSULE"] = test["CAPSULE"].asfactor()

    go()
    # Build a (classification) deep learning model
    from h2o.estimators import H2ODeepLearningEstimator
    prostate_dl = H2ODeepLearningEstimator(activation="Tanh", hidden=[10, 10, 10],
                                           epochs=10000)
    prostate_dl.train(x=list(set(prostate.col_names) - {"ID", "CAPSULE"}),
                      y="CAPSULE", training_frame=train)

    go()
    # Show the model
    prostate_dl.show()

    go()
    # Predict on the test set and show the first ten predictions
    predictions = prostate_dl.predict(test)
    predictions.show()

    go()
    # Show default performance metrics
    performance = prostate_dl.model_performance(test)
    performance.show()
def demo_body(go):
    """
    Demo of H2O's Generalized Linear Estimator.

    This demo uploads a dataset to h2o, parses it, and shows a description.
    Then it divides the dataset into training and test sets, builds a GLM
    from the training set, and makes predictions for the test set.
    Finally, default performance metrics are displayed.
    """
    go()
    # Connect to H2O
    h2o.init()

    go()
    # Upload the prostate dataset that comes included in the h2o python package
    prostate = h2o.load_dataset("prostate")

    go()
    # Print a description of the prostate data
    prostate.describe()

    go()
    # Randomly split the dataset into ~70/30 training/test sets
    train, test = prostate.split_frame(ratios=[0.70])

    go()
    # Convert the response columns to factors (for binary classification problems)
    train["CAPSULE"] = train["CAPSULE"].asfactor()
    test["CAPSULE"] = test["CAPSULE"].asfactor()

    go()
    # Build a (classification) GLM
    from h2o.estimators import H2OGeneralizedLinearEstimator
    prostate_glm = H2OGeneralizedLinearEstimator(family="binomial", alpha=[0.5])
    prostate_glm.train(x=["AGE", "RACE", "PSA", "VOL", "GLEASON"], y="CAPSULE",
                       training_frame=train)

    go()
    # Show the model
    prostate_glm.show()

    go()
    # Predict on the test set and show the first ten predictions
    predictions = prostate_glm.predict(test)
    predictions.show()

    go()
    # Show default performance metrics
    performance = prostate_glm.model_performance(test)
    performance.show()
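# Follow-up sketch: a trained H2O GLM exposes its fitted coefficients directly,
# which is a natural next step after the demo above. coef() and coef_norm()
# are standard H2OGeneralizedLinearEstimator methods.
coefficients = prostate_glm.coef()        # coefficients on the original scale
standardized = prostate_glm.coef_norm()   # coefficients on the standardized scale
print(coefficients)
print(standardized)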
import os

import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator


def train_model():
    # start h2o and load data
    h2o.init()
    h2o_df = h2o.load_dataset("iris.csv")

    # train model
    model = H2OGradientBoostingEstimator(ntrees=100, max_depth=4,
                                         learn_rate=0.1, model_id='latest')
    model.train(y="Species", training_frame=h2o_df)

    # save model to MOJO
    modelfile = model.download_mojo(path="models/", get_genmodel_jar=True)
    print("Model saved to " + modelfile)

    # save binary python model to disk
    h2o.save_model(model=model, path=os.getcwd() + '/models/h2o_model', force=True)
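# Round-trip sketch (hypothetical path): h2o.save_model() writes the binary
# model under path/<model_id>, so with model_id='latest' the saved model should
# be reloadable as below. In practice, capture save_model()'s return value
# rather than reconstructing the path by hand.
import os
import h2o

h2o.init()
model = h2o.load_model(os.path.join(os.getcwd(), 'models', 'h2o_model', 'latest'))
preds = model.predict(h2o.load_dataset("iris.csv"))
preds.show()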
import json

import h2o


def main():
    h2o.init()

    # df = h2o.import_file(path="smalldata/logreg/prostate.csv")
    prostate = h2o.load_dataset("prostate")
    prostate.describe()

    train, test = prostate.split_frame(ratios=[0.70])
    train["CAPSULE"] = train["CAPSULE"].asfactor()
    test["CAPSULE"] = test["CAPSULE"].asfactor()

    # Train model
    from h2o.estimators import H2OGeneralizedLinearEstimator
    prostate_glm = H2OGeneralizedLinearEstimator(family="binomial", alpha=[0.5])
    prostate_glm.train(x=["AGE", "RACE", "PSA", "VOL", "GLEASON"], y="CAPSULE",
                       training_frame=train)
    prostate_glm.show()

    predictions = prostate_glm.predict(test)
    predictions.show()

    performance = prostate_glm.model_performance(test)
    performance.show()

    # Export model
    model_path = h2o.save_model(prostate_glm, path="./h2o_model", force=True)
    print(model_path)

    model = prostate_glm
    predictions = model.predict(test)
    predictions.show()
    performance = model.model_performance(test)
    performance.show()

    # Export test data
    df = test.as_data_frame()
    with open("data.json", "w") as f:
        # json.dump(df.to_json(orient='records'), f)
        # json.dump(df.to_json(orient='columns'), f)
        json.dump(df.to_json(orient='index'), f)
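# Round-trip sketch: because the script above json.dump()s the *string* returned
# by df.to_json(), data.json contains a JSON-encoded string rather than a JSON
# object, so reading it back takes two steps.
import io
import json
import pandas as pd

with open("data.json") as f:
    raw = json.load(f)                                    # unwrap the outer JSON string
df = pd.read_json(io.StringIO(raw), orient="index")      # parse with the same orient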
def h2oload_dataset():
    """
    Python API test: h2o.load_dataset(relative_path)
    """
    prostate = h2o.load_dataset("prostate")
    assert_is_type(prostate, H2OFrame)
def main():
    # set random seed
    seed(SEED)

    ####################################################################################################################
    # logging config

    # log file
    logger.setLevel(logging.DEBUG)

    # create console handler and file handler and set level to debug
    sh = logging.StreamHandler()
    sh.setLevel(logging.DEBUG)

    # create formatter
    formatter = logging.Formatter(
        '%(asctime)s.%(msecs)06d: %(levelname)s %(name)s:%(lineno)d:\n%(message)s',
        datefmt='%Y-%m-%d %I:%M:%S')

    # add formatter
    sh.setFormatter(formatter)

    # add handler to logger
    logger.addHandler(sh)

    ####################################################################################################################
    # output directory

    out_dir = 'out-' + PREFIX + '-' + time_stamp
    try:
        if not os.path.exists(out_dir):
            os.mkdir(out_dir)
        log_name = out_dir + os.sep + PREFIX + '-' + time_stamp + '.log'
        fh = logging.FileHandler(log_name)
        fh.setLevel(logging.DEBUG)
        fh.setFormatter(formatter)
        logger.addHandler(fh)
        logger.info('Created output directory: %s' % out_dir)
    except IOError:
        print('Failed to create output directory: %s!' % out_dir)
        sys.exit(-1)

    ####################################################################################################################
    # training data

    iterations = pd.DataFrame(columns=['Main AUC', 'Main AIR', 'Adversary AUC'])
    train_f_name = '/home/patrickh/Workspace/GWU_rml/tests/data/train_simulated_transformed.csv'
    valid_f_name = '/home/patrickh/Workspace/GWU_rml/tests/data/test_simulated_transformed.csv'
    x_names = [
        'binary1', 'binary2', 'cat1_0', 'cat1_1', 'cat1_2', 'cat1_3', 'cat1_4',
        'fried1_std', 'fried2_std', 'fried3_std', 'fried4_std', 'fried5_std'
    ]
    y_name = 'outcome'
    demo_name = 'ctrl_class1'
    protected_level = 0
    control_level = 1

    ####################################################################################################################
    # training

    iter_frame = pd.DataFrame(columns=['Adv. AUC', 'Main AUC', 'Main AIR'])
    logger.info('Training ...')

    # initialize main model
    htrain = h2o.load_dataset(train_f_name)
    htrain[y_name] = htrain[y_name].asfactor()
    print(htrain.head())
    hvalid = h2o.load_dataset(valid_f_name)
    hvalid[y_name] = hvalid[y_name].asfactor()
    print(hvalid.head())
    main_ = model.gbm_grid(x_names, y_name, htrain, hvalid, SEED)
    main_auc = main_.auc(valid=True)
    acc = main_.accuracy(valid=True)
    logger.info('Initial GBM grid search completed with accuracy %.4f at threshold %.4f.'
                % (acc[0][1], acc[0][0]))
    hvalid['pred'] = main_.predict(hvalid)['p1']
    cm_protected = get_confusion_matrix(hvalid.as_data_frame(), y_name, 'pred',
                                        by=demo_name, level=protected_level,
                                        cutoff=acc[0][0])
    logger.info(cm_protected)
    cm_control = get_confusion_matrix(hvalid.as_data_frame(), y_name, 'pred',
                                      by=demo_name, level=control_level,
                                      cutoff=acc[0][0])
    logger.info(cm_control)
    cm_dict = {0: cm_protected, 1: cm_control}
    main_air = get_air(cm_dict, control_level, protected_level)
    logger.info('Initial GBM AIR %.4f.' % main_air)
    iter_frame = iter_frame.append({
        'Adv. AUC': np.nan,
        'Main AUC': main_auc,
        'Main AIR': main_air
    }, ignore_index=True)

    for i in range(0, EPOCHS):

        # train adversary to create weights
        adv_htrain = main_.predict_contributions(htrain)
        adv_htrain['pred'] = main_.predict(htrain)['p1']  # probability of high priced
        adv_htrain['demo'] = htrain[demo_name]
        adv_htrain['demo'] = adv_htrain['demo'].asfactor()
        print(adv_htrain.head())
        adv_hvalid = main_.predict_contributions(hvalid)
        adv_hvalid['pred'] = main_.predict(hvalid)['p1']  # probability of high priced
        adv_hvalid['demo'] = hvalid[demo_name]
        adv_hvalid['demo'] = adv_hvalid['demo'].asfactor()
        print(adv_hvalid.head())
        adversary_ = model.gbm_grid(x_names + ['pred'], 'demo', adv_htrain,
                                    adv_hvalid, SEED)
        adv_auc = adversary_.auc(valid=True)
        logger.info('Epoch %d adversary AUC: %.4f.' % (int(i), adv_auc))

        # re-train main model with weights
        htrain['weight'] = adversary_.predict(adv_htrain)['p0']  # probability of control
        print(htrain.head())
        hvalid['weight'] = adversary_.predict(adv_hvalid)['p0']  # probability of control
        print(hvalid.head())
        main_ = model.gbm_grid(x_names, y_name, htrain, hvalid, SEED, weight='weight')
        main_auc = main_.auc(valid=True)
        acc = main_.accuracy(valid=True)
        logger.info('Epoch %d GBM grid search completed with accuracy %.4f at threshold %.4f.'
                    % (int(i), acc[0][1], acc[0][0]))
        hvalid['pred'] = main_.predict(hvalid)['p1']
        cm_protected = get_confusion_matrix(hvalid.as_data_frame(), y_name, 'pred',
                                            by=demo_name, level=protected_level,
                                            cutoff=acc[0][0])
        logger.info(cm_protected)
        cm_control = get_confusion_matrix(hvalid.as_data_frame(), y_name, 'pred',
                                          by=demo_name, level=control_level,
                                          cutoff=acc[0][0])
        logger.info(cm_control)
        cm_dict = {0: cm_protected, 1: cm_control}
        main_air = get_air(cm_dict, control_level, protected_level)
        logger.info('Epoch %d GBM AIR %.4f.' % (int(i), main_air))
        iter_frame = iter_frame.append({
            'Adv. AUC': adv_auc,
            'Main AUC': main_auc,
            'Main AIR': main_air
        }, ignore_index=True)

    logger.info(iter_frame)
    iter_frame.to_csv(out_dir + os.sep + 'iter.csv')

    ####################################################################################################################
    # end timer

    toc = time.time() - tic
    logger.info('All tasks completed in %.2f s' % toc)
# (fragment: excerpted from a loop over categorical features f; enc is a
# one-hot encoder and X_train/X_test are scipy sparse matrices)
enc.fit(data[f].values.reshape(-1, 1))
X_train = sparse.hstack(
    (X_train, enc.transform(train[f].values.reshape(-1, 1))), 'csr')
X_test = sparse.hstack(
    (X_test, enc.transform(test[f].values.reshape(-1, 1))), 'csr')

print(X_train.shape)
print(X_test.shape)
y_train = target.values
############# feature engineering end

# train = h2o.upload_file(pre_root_path + '/jinnan_round1_train_20181227.csv')  # "./cache_data/train_data_f{}.csv".format(1)
# test = h2o.upload_file(pre_root_path + '/jinnan_round1_testA_20181227.csv')  # "./cache_data/test_data_f{}.csv".format(1)
print('load data to h2o')
# h2o.load_dataset() expects a bundled dataset path; for in-memory scipy sparse
# matrices, H2OFrame is the appropriate constructor (note that the '收率' target
# column must still be attached to train before calling aml.train below).
train = h2o.H2OFrame(X_train)
test = h2o.H2OFrame(X_test)
# all_data = h2o.connect(train, test)
# all_data = X_train + X_test
# all_data = ' '
# These are all the features; see baseline1 for reference
# feature_name = [i for i in all_data.columns if i not in ['样本id', '收率']]
feature_name = mean_columns + numerical_columns
x = feature_name
y = '收率'  # '收率' = yield, the target column

aml = H2OAutoML(max_models=320, seed=2019, max_runtime_secs=12800)
aml.train(x=feature_name, y=y, training_frame=train)
lb = aml.leaderboard
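# Follow-up sketch: after AutoML finishes, the leaderboard ranks every trained
# model and aml.leader is the best one; both are standard H2OAutoML attributes.
print(lb.head(rows=lb.nrows))   # show the full leaderboard, not just the top rows
best = aml.leader
predictions = best.predict(test)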
import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator

h2o.init()

h2o_df = h2o.load_dataset("prostate.csv")
h2o_df["CAPSULE"] = h2o_df["CAPSULE"].asfactor()
model = H2OGradientBoostingEstimator(distribution="bernoulli",
                                     ntrees=100,
                                     max_depth=4,
                                     learn_rate=0.1)
model.train(y="CAPSULE",
            x=["AGE", "RACE", "PSA", "GLEASON"],
            training_frame=h2o_df)
modelfile = model.download_mojo(path="exp", get_genmodel_jar=True)
print("Model saved to " + modelfile)
#!/usr/bin/env python
import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator

h2o.init()

h2o_df = h2o.load_dataset("prostate")
h2o_df["CAPSULE"] = h2o_df["CAPSULE"].asfactor()
model = H2OGradientBoostingEstimator(distribution="bernoulli",
                                     ntrees=100,
                                     max_depth=4,
                                     learn_rate=0.1)
model.train(y="CAPSULE",
            x=["AGE", "RACE", "PSA", "GLEASON"],
            training_frame=h2o_df)
modelfile = model.download_mojo(path="./model", get_genmodel_jar=True)
print("Model saved to " + modelfile)
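# Offline scoring sketch, continuing the script above (the input CSV name is
# hypothetical): together with the h2o-genmodel.jar downloaded alongside the
# MOJO, h2o.mojo_predict_csv() can score a CSV without a running H2O cluster.
preds = h2o.mojo_predict_csv(input_csv_path="new_prostate_rows.csv",
                             mojo_zip_path=modelfile,
                             genmodel_jar_path="./model/h2o-genmodel.jar")
print(preds)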