Example #1
0
    def demo_body(go):
        """
        Demo of H2O's Gradient Boosting estimator.

        This demo uploads a dataset to h2o, parses it, and shows a description.
        Then it divides the dataset into training and test sets, builds a GBM
        from the training set, and makes predictions for the test set.
        It also fetches the first tree of the model and shows its root node.
        Finally, default performance metrics are displayed.

        :param go: zero-argument callable invoked before every step of the demo
            (presumably supplied by the demo runner to pace the steps -- confirm
            against the caller).
        """
        go()
        # Connect to H2O
        h2o.init()

        go()
        # Upload the prostate dataset that comes included in the h2o python package
        prostate = h2o.load_dataset("prostate")

        go()
        # Print a description of the prostate data
        prostate.describe()

        go()
        # Randomly split the dataset into ~70/30, training/test sets
        train, test = prostate.split_frame(ratios=[0.70])

        go()
        # Convert the response column to a factor in both sets (for binary classification problems)
        train["CAPSULE"] = train["CAPSULE"].asfactor()
        test["CAPSULE"] = test["CAPSULE"].asfactor()

        go()
        # Build a (classification) GBM
        from h2o.estimators import H2OGradientBoostingEstimator
        prostate_gbm = H2OGradientBoostingEstimator(distribution="bernoulli", ntrees=10, max_depth=8,
                                                    min_rows=10, learn_rate=0.2)
        prostate_gbm.train(x=["AGE", "RACE", "PSA", "VOL", "GLEASON"],
                           y="CAPSULE", training_frame=train)

        go()
        # Show the model
        prostate_gbm.show()

        go()
        # Predict on the test set and show the predictions
        predictions = prostate_gbm.predict(test)
        predictions.show()

        go()
        # Fetch a tree of the model, evaluate its size and child arrays, show the root node.
        # The bare expressions below are evaluated but their values are not stored.
        # NOTE(review): the arguments 0 and "0" are presumably the tree number and tree class -- confirm.
        from h2o.tree import H2OTree, H2ONode
        tree = H2OTree(prostate_gbm, 0, "0")
        len(tree)
        tree.left_children
        tree.right_children
        tree.root_node.show()

        go()
        # Show default performance metrics
        performance = prostate_gbm.model_performance(test)
        performance.show()
Example #2
0
    def demo_body(go):
        """
        Demo of H2O's Gradient Boosting estimator.

        This demo uploads a dataset to h2o, parses it, and shows a description.
        Then it divides the dataset into training and test sets, builds a GBM
        from the training set, and makes predictions for the test set.
        It also fetches the first tree of the model and shows its root node.
        Finally, default performance metrics are displayed.

        :param go: zero-argument callable invoked before every step of the demo
            (presumably supplied by the demo runner to pace the steps -- confirm
            against the caller).
        """
        go()
        # Connect to H2O
        h2o.init()

        go()
        # Upload the prostate dataset that comes included in the h2o python package
        prostate = h2o.load_dataset("prostate")

        go()
        # Print a description of the prostate data
        prostate.describe()

        go()
        # Randomly split the dataset into ~70/30, training/test sets
        train, test = prostate.split_frame(ratios=[0.70])

        go()
        # Convert the response column to a factor in both sets (for binary classification problems)
        train["CAPSULE"] = train["CAPSULE"].asfactor()
        test["CAPSULE"] = test["CAPSULE"].asfactor()

        go()
        # Build a (classification) GBM
        from h2o.estimators import H2OGradientBoostingEstimator
        prostate_gbm = H2OGradientBoostingEstimator(distribution="bernoulli", ntrees=10, max_depth=8,
                                                    min_rows=10, learn_rate=0.2)
        prostate_gbm.train(x=["AGE", "RACE", "PSA", "VOL", "GLEASON"],
                           y="CAPSULE", training_frame=train)

        go()
        # Show the model
        prostate_gbm.show()

        go()
        # Predict on the test set and show the predictions
        predictions = prostate_gbm.predict(test)
        predictions.show()

        go()
        # Fetch a tree of the model, evaluate its size and child arrays, show the root node.
        # The bare expressions below are evaluated but their values are not stored.
        # NOTE(review): the arguments 0 and "0" are presumably the tree number and tree class -- confirm.
        from h2o.tree import H2OTree, H2ONode
        tree = H2OTree(prostate_gbm, 0, "0")
        len(tree)
        tree.left_children
        tree.right_children
        tree.root_node.show()

        go()
        # Show default performance metrics
        performance = prostate_gbm.model_performance(test)
        performance.show()
Example #3
0
def main():
    """
    Train a GBM classifier on the bundled prostate data and report the results.

    Connects to the H2O instance named by the parsed command-line arguments,
    splits the prostate data ~70/30, fits a bernoulli GBM on the training part,
    shows the model, its test-set predictions and the root node of one tree,
    and finally pushes the test-set metrics through ``client.update_task_info``.
    """
    from h2o.estimators import H2OGradientBoostingEstimator
    from h2o.tree import H2OTree, H2ONode

    cli = parse_args()
    h2o.init(ip=cli.host, port=cli.port)

    # The prostate dataset ships with the h2o python package.
    frame = h2o.load_dataset("prostate")
    frame.describe()

    # Report the settings used below before any modelling starts.
    client.update_task_info(dict(test_train=0.7, learn_rate=0.2))

    # Random ~70/30 split into training and test parts.
    train_part, test_part = frame.split_frame(ratios=[0.70])

    # The response must be a factor for a binary classification problem.
    response = "CAPSULE"
    for part in (train_part, test_part):
        part[response] = part[response].asfactor()

    # Fit the (classification) GBM.
    gbm_settings = {
        "distribution": "bernoulli",
        "ntrees": 10,
        "max_depth": 8,
        "min_rows": 10,
        "learn_rate": 0.2,
    }
    gbm = H2OGradientBoostingEstimator(**gbm_settings)
    gbm.train(x=["AGE", "RACE", "PSA", "VOL", "GLEASON"],
              y=response,
              training_frame=train_part)
    gbm.show()

    # Test-set predictions.
    gbm.predict(test_part).show()

    # Root node description of one tree of the model.
    H2OTree(gbm, 0, "0").root_node.show()

    # Default performance metrics on the test part.
    metrics = gbm.model_performance(test_part)
    metrics.show()

    # Each metric is read through the accessor method of the same name.
    metric_names = ('mse', 'rmse', 'auc', 'gini', 'logloss')
    client.update_task_info(
        {name: getattr(metrics, name)() for name in metric_names})
Example #4
0
def h2oload_dataset():
    """
    Python API test: h2o.load_dataset(relative_path)

    Loads the bundled "prostate" dataset and checks that the result is an
    H2OFrame.

    :raises AssertionError: if loading the dataset or the type check fails.
        The message carries the text of the underlying error.
    """
    try:
        prostate = h2o.load_dataset("prostate")
        assert_is_type(prostate, H2OFrame)
    except Exception as e:
        # Raise explicitly instead of `assert False`: assert statements are
        # stripped under `python -O`, which would turn a failure into a pass.
        raise AssertionError(
            "h2o.load_dataset() command is not working: %s" % (e,))
Example #5
0
    def demo_body(go):
        """
        Demo of H2O's Deep Learning model.

        This demo uploads a dataset to h2o, parses it, and shows a description.
        Then it divides the dataset into training and test sets, builds a
        deep learning model from the training set, and makes predictions for
        the test set.
        Finally, default performance metrics are displayed.

        :param go: zero-argument callable invoked before every step of the demo
            (presumably supplied by the demo runner to pace the steps -- confirm
            against the caller).
        """
        go()
        # Connect to H2O
        h2o.init()

        go()
        # Upload the prostate dataset that comes included in the h2o python package
        prostate = h2o.load_dataset("prostate")

        go()
        # Print a summary of the prostate data
        prostate.summary()

        go()
        # Randomly split the dataset into ~70/30, training/test sets:
        # rows whose value in `r` is below 0.70 go to training, the rest to test
        # (runif() presumably yields one uniform random value per row -- confirm)
        r = prostate[0].runif()
        train = prostate[r < 0.70]
        test = prostate[r >= 0.70]

        go()
        # Convert the response column to a factor in both sets (for binary classification problems)
        train["CAPSULE"] = train["CAPSULE"].asfactor()
        test["CAPSULE"] = test["CAPSULE"].asfactor()

        go()
        # Build a (classification) deep learning model;
        # every column except "ID" and the response "CAPSULE" is used as a predictor
        from h2o.estimators import H2ODeepLearningEstimator
        prostate_dl = H2ODeepLearningEstimator(activation="Tanh",
                                               hidden=[10, 10, 10],
                                               epochs=10000)
        prostate_dl.train(x=list(set(prostate.col_names) - {"ID", "CAPSULE"}),
                          y="CAPSULE",
                          training_frame=train)

        go()
        # Show the model
        prostate_dl.show()

        go()
        # Predict on the test set and show the predictions
        predictions = prostate_dl.predict(test)
        predictions.show()

        go()
        # Show default performance metrics
        performance = prostate_dl.model_performance(test)
        performance.show()
Example #6
0
    def demo_body(go):
        """
        Demo of H2O's Generalized Linear Estimator.

        This demo uploads a dataset to h2o, parses it, and shows a description.
        Then it divides the dataset into training and test sets, builds a GLM
        from the training set, and makes predictions for the test set.
        Finally, default performance metrics are displayed.

        :param go: zero-argument callable invoked before every step of the demo
            (presumably supplied by the demo runner to pace the steps -- confirm
            against the caller).
        """
        go()
        # Connect to H2O
        h2o.init()

        go()
        # Upload the prostate dataset that comes included in the h2o python package
        prostate = h2o.load_dataset("prostate")

        go()
        # Print a summary of the prostate data
        prostate.summary()

        go()
        # Randomly split the dataset into ~70/30, training/test sets:
        # rows whose value in `r` is below 0.70 go to training, the rest to test
        # (runif() presumably yields one uniform random value per row -- confirm)
        r = prostate[0].runif()
        train = prostate[r < 0.70]
        test = prostate[r >= 0.70]

        go()
        # Convert the response column to a factor in both sets (for binary classification problems)
        train["CAPSULE"] = train["CAPSULE"].asfactor()
        test["CAPSULE"] = test["CAPSULE"].asfactor()

        go()
        # Build a (classification) GLM
        from h2o.estimators import H2OGeneralizedLinearEstimator
        prostate_glm = H2OGeneralizedLinearEstimator(family="binomial",
                                                     alpha=[0.5])
        prostate_glm.train(x=["AGE", "RACE", "PSA", "VOL", "GLEASON"],
                           y="CAPSULE",
                           training_frame=train)

        go()
        # Show the model
        prostate_glm.show()

        go()
        # Predict on the test set and show the predictions
        predictions = prostate_glm.predict(test)
        predictions.show()

        go()
        # Show default performance metrics
        performance = prostate_glm.model_performance(test)
        performance.show()
Example #7
0
File: demos.py Project: h2oai/h2o-3
    def demo_body(go):
        """
        Demo of H2O's Deep Learning model.

        This demo uploads a dataset to h2o, parses it, and shows a description.
        Then it divides the dataset into training and test sets, builds a
        deep learning model from the training set, and makes predictions for
        the test set.
        Finally, default performance metrics are displayed.

        :param go: zero-argument callable invoked before every step of the demo
            (presumably supplied by the demo runner to pace the steps -- confirm
            against the caller).
        """
        go()
        # Connect to H2O
        h2o.init()

        go()
        # Upload the prostate dataset that comes included in the h2o python package
        prostate = h2o.load_dataset("prostate")

        go()
        # Print a description of the prostate data
        prostate.describe()

        go()
        # Randomly split the dataset into ~70/30, training/test sets
        train, test = prostate.split_frame(ratios=[0.70])

        go()
        # Convert the response column to a factor in both sets (for binary classification problems)
        train["CAPSULE"] = train["CAPSULE"].asfactor()
        test["CAPSULE"] = test["CAPSULE"].asfactor()

        go()
        # Build a (classification) deep learning model;
        # every column except "ID" and the response "CAPSULE" is used as a predictor
        from h2o.estimators import H2ODeepLearningEstimator
        prostate_dl = H2ODeepLearningEstimator(activation="Tanh", hidden=[10, 10, 10], epochs=10000)
        prostate_dl.train(x=list(set(prostate.col_names) - {"ID", "CAPSULE"}),
                          y="CAPSULE", training_frame=train)

        go()
        # Show the model
        prostate_dl.show()

        go()
        # Predict on the test set and show the predictions
        predictions = prostate_dl.predict(test)
        predictions.show()

        go()
        # Show default performance metrics
        performance = prostate_dl.model_performance(test)
        performance.show()
Example #8
0
File: demos.py Project: h2oai/h2o-3
    def demo_body(go):
        """
        Demo of H2O's Generalized Linear Estimator.

        This demo uploads a dataset to h2o, parses it, and shows a description.
        Then it divides the dataset into training and test sets, builds a GLM
        from the training set, and makes predictions for the test set.
        Finally, default performance metrics are displayed.

        :param go: zero-argument callable invoked before every step of the demo
            (presumably supplied by the demo runner to pace the steps -- confirm
            against the caller).
        """
        go()
        # Connect to H2O
        h2o.init()

        go()
        # Upload the prostate dataset that comes included in the h2o python package
        prostate = h2o.load_dataset("prostate")

        go()
        # Print a description of the prostate data
        prostate.describe()

        go()
        # Randomly split the dataset into ~70/30, training/test sets
        train, test = prostate.split_frame(ratios=[0.70])

        go()
        # Convert the response column to a factor in both sets (for binary classification problems)
        train["CAPSULE"] = train["CAPSULE"].asfactor()
        test["CAPSULE"] = test["CAPSULE"].asfactor()

        go()
        # Build a (classification) GLM
        from h2o.estimators import H2OGeneralizedLinearEstimator
        prostate_glm = H2OGeneralizedLinearEstimator(family="binomial", alpha=[0.5])
        prostate_glm.train(x=["AGE", "RACE", "PSA", "VOL", "GLEASON"],
                           y="CAPSULE", training_frame=train)

        go()
        # Show the model
        prostate_glm.show()

        go()
        # Predict on the test set and show the predictions
        predictions = prostate_glm.predict(test)
        predictions.show()

        go()
        # Show default performance metrics
        performance = prostate_glm.model_performance(test)
        performance.show()
def train_model():
    """
    Train a GBM on the iris data and persist it in two formats.

    Starts H2O, loads "iris.csv", fits a gradient boosting model with
    "Species" as the response, downloads the model as a MOJO (together with
    the genmodel jar) into "models/", and also saves the model with
    ``h2o.save_model`` under ``<cwd>/models/h2o_model``.
    """
    h2o.init()
    iris = h2o.load_dataset("iris.csv")

    # No `x` is passed to train(), only the response column.
    gbm_settings = dict(ntrees=100,
                        max_depth=4,
                        learn_rate=0.1,
                        model_id='latest')
    gbm = H2OGradientBoostingEstimator(**gbm_settings)
    gbm.train(y="Species", training_frame=iris)

    # MOJO export
    mojo_path = gbm.download_mojo(path="models/", get_genmodel_jar=True)
    print("Model saved to " + mojo_path)

    # Model saved through h2o.save_model, overwriting any previous save
    save_dir = os.getcwd() + '/models/h2o_model'
    h2o.save_model(model=gbm, path=save_dir, force=True)
def main():
    """
    Train, evaluate and export a binomial GLM on the bundled prostate data.

    The prostate data is split ~70/30; a GLM is fitted on the training part
    and shown together with its test-set predictions and performance. The
    model is then saved under "./h2o_model", predictions and performance are
    shown a second time, and the test part is written to "data.json".
    """
    h2o.init()

    data = h2o.load_dataset("prostate")
    data.describe()

    train_frame, test_frame = data.split_frame(ratios=[0.70])
    for frame in (train_frame, test_frame):
        frame["CAPSULE"] = frame["CAPSULE"].asfactor()

    # Train model
    from h2o.estimators import H2OGeneralizedLinearEstimator
    glm = H2OGeneralizedLinearEstimator(family="binomial", alpha=[0.5])
    glm.train(x=["AGE", "RACE", "PSA", "VOL", "GLEASON"],
              y="CAPSULE",
              training_frame=train_frame)
    glm.show()

    glm.predict(test_frame).show()
    glm.model_performance(test_frame).show()

    # Export model
    saved_to = h2o.save_model(glm, path="./h2o_model", force=True)
    print(saved_to)

    # Second evaluation pass; it uses the in-memory model, not a reloaded one.
    glm.predict(test_frame).show()
    glm.model_performance(test_frame).show()

    # Export test data, keyed by row index (orient='index').
    # NOTE(review): to_json() already returns a JSON string, so json.dump()
    # writes it as one quoted JSON string (double-encoded). Kept as is because
    # a reader of data.json may depend on it -- confirm and simplify if not.
    test_df = test_frame.as_data_frame()
    with open("data.json", "w") as out:
        json.dump(test_df.to_json(orient='index'), out)
Example #11
0
def h2oload_dataset():
    """
    Python API test: h2o.load_dataset(relative_path)

    Loads the "prostate" dataset and checks that the result is an H2OFrame.
    """
    assert_is_type(h2o.load_dataset("prostate"), H2OFrame)
def _score_main_model(main_model, hvalid, y_name, demo_name, protected_level,
                      control_level, label):
    """
    Log validation accuracy, per-group confusion matrices and AIR of a model.

    As a side effect the model's ``p1`` predictions for ``hvalid`` are stored
    in ``hvalid['pred']``.

    :param main_model: fitted model as returned by ``model.gbm_grid``.
    :param hvalid: validation frame; its ``'pred'`` column is (re)assigned.
    :param y_name: name of the response column.
    :param demo_name: name of the column the confusion matrices are split by.
    :param protected_level: value of ``demo_name`` for the protected group.
    :param control_level: value of ``demo_name`` for the control group.
    :param label: prefix of the log messages, e.g. ``'Initial'`` or ``'Epoch 3'``.
    :returns: tuple ``(validation AUC, AIR)``.
    """
    main_auc = main_model.auc(valid=True)

    # acc[0] is logged as [threshold, accuracy]; the threshold is reused as
    # the cutoff for the confusion matrices below.
    acc = main_model.accuracy(valid=True)
    cutoff = acc[0][0]
    logger.info(
        '%s GBM grid search completed with accuracy %.4f at threshold %.4f.'
        % (label, acc[0][1], cutoff))

    hvalid['pred'] = main_model.predict(hvalid)['p1']

    # One confusion matrix per group, protected group first.
    cm_dict = {}
    for level in (protected_level, control_level):
        cm_dict[level] = get_confusion_matrix(hvalid.as_data_frame(),
                                              y_name,
                                              'pred',
                                              by=demo_name,
                                              level=level,
                                              cutoff=cutoff)
        logger.info(cm_dict[level])

    main_air = get_air(cm_dict, control_level, protected_level)
    logger.info('%s GBM AIR %.4f.' % (label, main_air))

    return main_auc, main_air


def main():
    """
    Run the adversarial re-weighting experiment end to end.

    Steps:
      1. seed the random generator and configure console/file logging;
      2. create a time-stamped output directory;
      3. fit an initial main model with ``model.gbm_grid`` and log its
         validation AUC, accuracy and AIR;
      4. for ``EPOCHS`` rounds, fit an adversary that predicts the group
         column from the main model's contributions and prediction, use the
         adversary's ``p0`` output as row weights, and re-fit the main model;
      5. write the per-round metrics to ``iter.csv`` in the output directory.

    Relies on names defined outside this function (``SEED``, ``PREFIX``,
    ``EPOCHS``, ``time_stamp``, ``tic``, ``logger``, ``model``,
    ``get_confusion_matrix``, ``get_air``). Exits the process with status -1
    if the output directory cannot be set up.
    """
    # set random seed
    seed(SEED)

    ####################################################################################################################
    # logging config

    logger.setLevel(logging.DEBUG)

    # console handler at debug level
    sh = logging.StreamHandler()
    sh.setLevel(logging.DEBUG)

    # formatter shared by the console and file handlers
    formatter = logging.Formatter(
        '%(asctime)s.%(msecs)06d: %(levelname)s %(name)s:%(lineno)d:\n%(message)s',
        datefmt='%Y-%m-%d %I:%M:%S')
    sh.setFormatter(formatter)
    logger.addHandler(sh)

    ####################################################################################################################
    # output directory

    out_dir = 'out-' + PREFIX + '-' + time_stamp

    try:
        # The file handler is only attached when the directory is newly created.
        if not os.path.exists(out_dir):

            os.mkdir(out_dir)

            log_name = os.path.join(out_dir, PREFIX + '-' + time_stamp + '.log')
            fh = logging.FileHandler(log_name)
            fh.setLevel(logging.DEBUG)
            fh.setFormatter(formatter)
            logger.addHandler(fh)

            logger.info('Created output directory: %s' % out_dir)

    except (IOError, OSError):
        # os.mkdir raises OSError, which is distinct from IOError on Python 2.
        print('Failed to create output directory: %s!' % out_dir)
        sys.exit(-1)

    ####################################################################################################################
    # training data

    train_f_name = '/home/patrickh/Workspace/GWU_rml/tests/data/train_simulated_transformed.csv'
    valid_f_name = '/home/patrickh/Workspace/GWU_rml/tests/data/test_simulated_transformed.csv'
    x_names = [
        'binary1', 'binary2', 'cat1_0', 'cat1_1', 'cat1_2', 'cat1_3', 'cat1_4',
        'fried1_std', 'fried2_std', 'fried3_std', 'fried4_std', 'fried5_std'
    ]
    y_name = 'outcome'

    demo_name = 'ctrl_class1'
    protected_level = 0
    control_level = 1

    ####################################################################################################################
    # training

    # Metric rows are collected in a plain list and turned into a DataFrame on
    # demand; DataFrame.append was removed in pandas 2.0.
    iter_columns = ['Adv. AUC', 'Main AUC', 'Main AIR']
    iter_rows = []

    logger.info('Training ...')

    # initialize main model

    htrain = h2o.load_dataset(train_f_name)
    htrain[y_name] = htrain[y_name].asfactor()
    print(htrain.head())
    hvalid = h2o.load_dataset(valid_f_name)
    hvalid[y_name] = hvalid[y_name].asfactor()
    print(hvalid.head())

    main_ = model.gbm_grid(x_names, y_name, htrain, hvalid, SEED)
    main_auc, main_air = _score_main_model(main_, hvalid, y_name, demo_name,
                                           protected_level, control_level,
                                           'Initial')

    # No adversary exists yet for the initial row.
    iter_rows.append({
        'Adv. AUC': np.nan,
        'Main AUC': main_auc,
        'Main AIR': main_air,
    })

    for i in range(0, EPOCHS):

        # train adversary to create weights: its inputs are the main model's
        # contributions plus the main model's 'p1' prediction, its target is
        # the group column

        adv_htrain = main_.predict_contributions(htrain)
        adv_htrain['pred'] = main_.predict(htrain)['p1']
        adv_htrain['demo'] = htrain[demo_name]
        adv_htrain['demo'] = adv_htrain['demo'].asfactor()
        print(adv_htrain.head())

        adv_hvalid = main_.predict_contributions(hvalid)
        adv_hvalid['pred'] = main_.predict(hvalid)['p1']
        adv_hvalid['demo'] = hvalid[demo_name]
        adv_hvalid['demo'] = adv_hvalid['demo'].asfactor()
        print(adv_hvalid.head())

        adversary_ = model.gbm_grid(x_names + ['pred'], 'demo', adv_htrain,
                                    adv_hvalid, SEED)
        adv_auc = adversary_.auc(valid=True)

        logger.info('Epoch %d adversary AUC: %.4f.' % (i, adv_auc))

        # re-train main model with weights
        # NOTE(review): the original comments called the adversary's 'p0'
        # "probability of control", yet control_level is 1 above -- confirm
        # which group 'p0' refers to.

        htrain['weight'] = adversary_.predict(adv_htrain)['p0']
        print(htrain.head())
        hvalid['weight'] = adversary_.predict(adv_hvalid)['p0']
        print(hvalid.head())

        main_ = model.gbm_grid(x_names,
                               y_name,
                               htrain,
                               hvalid,
                               SEED,
                               weight='weight')
        main_auc, main_air = _score_main_model(main_, hvalid, y_name,
                                               demo_name, protected_level,
                                               control_level, 'Epoch %d' % i)

        iter_rows.append({
            'Adv. AUC': adv_auc,
            'Main AUC': main_auc,
            'Main AIR': main_air,
        })

        logger.info(pd.DataFrame(iter_rows, columns=iter_columns))

    iter_frame = pd.DataFrame(iter_rows, columns=iter_columns)
    iter_frame.to_csv(os.path.join(out_dir, 'iter.csv'))

    ####################################################################################################################
    # end timer (`tic` is expected to be set outside this function)

    toc = time.time() - tic
    logger.info('All tasks completed in %.2f s' % toc)
Example #13
0
    enc.fit(data[f].values.reshape(-1, 1))
    X_train = sparse.hstack(
        (X_train, enc.transform(train[f].values.reshape(-1, 1))), 'csr')
    X_test = sparse.hstack(
        (X_test, enc.transform(test[f].values.reshape(-1, 1))), 'csr')
# Shapes of the assembled train/test feature matrices
print(X_train.shape)
print(X_test.shape)

y_train = target.values

############# feature engineering end

# Earlier variant that uploaded the raw CSV files instead:
# train = h2o.upload_file(pre_root_path + '/jinnan_round1_train_20181227.csv')#"./cache_data/train_data_f{}.csv".format(1))
# test = h2o.upload_file(pre_root_path + '/jinnan_round1_testA_20181227.csv')#"./cache_data/test_data_f{}.csv".format(1))
print('load data to h2o')
# NOTE(review): elsewhere h2o.load_dataset is called with a dataset name/path string, while
# X_train / X_test here are the matrices built by sparse.hstack above -- confirm this call works.
train = h2o.load_dataset(X_train)
test = h2o.load_dataset(X_test)

# all_data = h2o.connect(train, test)
# all_data = X_train + X_test

# all_data = ' '  # these are all the features; see baseline1 for reference
# feature_name = [i for i in all_data.columns if i not in ['样本id','收率']]  (i.e. all columns except sample id and yield)
# Predictors: the mean-encoded columns plus the numerical columns
feature_name = mean_columns + numerical_columns
x = feature_name
# Response column name (the string literal means "yield")
y = '收率'

# AutoML run limited to 320 models and 12800 seconds, with a fixed seed
aml = H2OAutoML(max_models=320, seed=2019, max_runtime_secs=12800)
# NOTE(review): y_train is computed above but not attached to `train` in the visible code --
# confirm that the response column is present in the training frame.
aml.train(x=feature_name, y=y, training_frame=train)

# Leaderboard of the models built by AutoML
lb = aml.leaderboard
Example #14
0
import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator
# Train a bernoulli GBM on the prostate data and export it as a MOJO.
h2o.init()

# Load the data and turn the response into a factor for classification.
h2o_df = h2o.load_dataset("prostate.csv")
response = "CAPSULE"
h2o_df[response] = h2o_df[response].asfactor()

# Model settings and predictor columns.
gbm_params = {
    "distribution": "bernoulli",
    "ntrees": 100,
    "max_depth": 4,
    "learn_rate": 0.1,
}
predictors = ["AGE", "RACE", "PSA", "GLEASON"]

model = H2OGradientBoostingEstimator(**gbm_params)
model.train(y=response, x=predictors, training_frame=h2o_df)

# Download the MOJO (and the genmodel jar) into "exp" and report the path.
modefile = model.download_mojo(path="exp", get_genmodel_jar=True)
print("Model saved to " + modefile)
Example #15
0
def h2oload_dataset():
    """
    Python API test: h2o.load_dataset(relative_path)

    Loads the "prostate" dataset and checks that the result is an H2OFrame.
    """
    assert_is_type(h2o.load_dataset("prostate"), H2OFrame)
#!/usr/bin/env python

import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator

# Train a bernoulli GBM on the prostate data and export it as a MOJO.
h2o.init()

# Load the data and turn the response into a factor for classification.
h2o_df = h2o.load_dataset("prostate")
response = "CAPSULE"
h2o_df[response] = h2o_df[response].asfactor()

# Model settings and predictor columns.
gbm_params = {
    "distribution": "bernoulli",
    "ntrees": 100,
    "max_depth": 4,
    "learn_rate": 0.1,
}
predictors = ["AGE", "RACE", "PSA", "GLEASON"]

model = H2OGradientBoostingEstimator(**gbm_params)
model.train(y=response, x=predictors, training_frame=h2o_df)

# Download the MOJO (and the genmodel jar) into "./model" and report the path.
modelfile = model.download_mojo(path="./model", get_genmodel_jar=True)
print("Model saved to " + modelfile)