Beispiel #1
0
def test_glm_multinomial_coeffs():
    """Check the class-named coefficient table of a multinomial GLM.

    Trains a multinomial GLM on the iris training set and then verifies that:

    * every column header of ``coefficients_table_multinomials_with_class_names``
      is one of the expected class-name based headers,
    * every column header of the original ``coefficients_table`` is one of the
      expected index based headers, and
    * the two tables pass ``pyunit_utils.assert_H2OTwoDimTable_equal_upto``
      with a tolerance of 1e-10.
    """
    trainF = h2o.import_file(pyunit_utils.locate("smalldata/iris/iris_train.csv"))
    y = "species"
    x = [0, 1, 2, 3]
    bin_LS = glm(family='multinomial', seed=12345)
    bin_LS.train(x=x, y=y, training_frame=trainF)
    print(bin_LS.summary())

    model_output = bin_LS._model_json["output"]
    coefficient_table_original = model_output["coefficients_table"]
    coefficient_table = model_output["coefficients_table_multinomials_with_class_names"]

    coeffNamesOld = coefficient_table_original.col_header
    coeffNames = coefficient_table.col_header
    validCoefficientNames = [u"names", u"coefs_class_Iris-setosa", u"coefs_class_Iris-versicolor",
                             u"coefs_class_Iris-virginica", u"std_coefs_class_Iris-setosa",
                             u"std_coefs_class_Iris-versicolor", u"std_coefs_class_Iris-virginica"]
    oldCoefficientNames = [u"names", u"coefs_class_0", u"coefs_class_1",
                           u"coefs_class_2", u"std_coefs_class_0",
                           u"std_coefs_class_1", u"std_coefs_class_2"]
    print(coefficient_table)
    print(coefficient_table_original)

    # compare coefficient names: every actual header must be among the expected ones
    assert len(set(coeffNames).intersection(validCoefficientNames)) == len(coeffNames), \
        "Expected coefficient names: {0}.  Actual coefficient names: {1}".format(validCoefficientNames, coeffNames)
    # the old-style headers must be measured against their own table's header
    # count, not against the header of the class-named table
    assert len(set(coeffNamesOld).intersection(oldCoefficientNames)) == len(coeffNamesOld), \
        "Expected original coefficient names: {0}.  Actual original coefficient names: " \
        "{1}".format(oldCoefficientNames, coeffNamesOld)

    # compare table contents to make sure they contain the same values
    pyunit_utils.assert_H2OTwoDimTable_equal_upto(coefficient_table_original, coefficient_table, [u'coefs_class_0'],
                                                  tolerance=1e-10)
Beispiel #2
0
def testFrameTransform():
  """Check that skipping rows with missing values equals dropping them.

  Two cells (row 10 / column 2 and row 20 / column 7) are set to missing in
  the prostate data set.  One ANOVA GLM is trained on that frame with
  ``missing_values_handling="skip"``; a second one is trained on a frame from
  which rows 10 and 20 have been removed.  The model summaries of both models
  are then compared on every summary column.
  """
  train = h2o.import_file(path=pyunit_utils.locate("smalldata/prostate/prostate_complete.csv.zip"))
  y = 'CAPSULE'
  x = ['AGE','VOL','DCAPS']
  train[10,2] = None
  train[20,7] = None
  train[y] = train[y].asfactor()
  # build model choosing skip
  model1 = H2OANOVAGLMEstimator(family='binomial', lambda_=0, missing_values_handling="skip")
  model1.train(x=x, y=y, training_frame=train)
  # build model deleting the two rows with missing values;
  # H2OFrame.drop returns a new frame instead of modifying the frame in place,
  # so the result has to be kept, otherwise model2 sees the very same data
  train = train.drop([10, 20], axis=0)
  model2 = H2OANOVAGLMEstimator(family='binomial', lambda_=0, missing_values_handling="skip")
  model2.train(x=x, y=y, training_frame=train)
  # the two models should be the same, compare the model summaries
  summary1 = model1._model_json['output']['model_summary']
  summary2 = model2._model_json['output']['model_summary']
  pyunit_utils.assert_H2OTwoDimTable_equal_upto(summary1, summary2, summary1.col_header)
def partial_plot_test():
    """Exercise GBM partial dependence plots with weight columns and NAs.

    Two weight columns are appended to the prostate data: a constant one
    ("constWeight", all 3.0) and a pseudo-random integer one ("variWeight",
    seeded with 12345).  The test asserts that the plot with the constant
    weight and ``include_na=True`` matches the plain plot on the plain plot's
    columns, then hands the varying-weight plot to ``compare_weightedStats``.
    """
    frame = h2o.import_file(pyunit_utils.locate('smalldata/prostate/prostate_cat_NA.csv'))
    response = 'CAPSULE'
    predictors = frame.names
    predictors.remove(response)

    # constant weight column
    const_weights = h2o.H2OFrame([3.0]*frame.nrow)
    # varying weight column: one seeded random integer in [0, 5] per row
    row_count = frame.nrow
    random.seed(12345)
    tweight2 = [random.randint(0,5) for _ in range(row_count)]
    vari_weights = h2o.H2OFrame(tweight2)

    frame = frame.cbind(const_weights)
    frame = frame.cbind(vari_weights)
    frame.set_name(frame.ncol-2, "constWeight")
    frame.set_name(frame.ncol-1, "variWeight")

    # GBM model for the CAPSULE response; the predictor list was taken before
    # the weight columns were appended, so they are not used as predictors
    gbm_model = H2OGradientBoostingEstimator(ntrees=50, learn_rate=0.05, seed=12345)
    gbm_model.train(x=predictors, y=response, training_frame=frame)

    # plain pdp: no weight column, include_na left at its default
    pdpOrig = gbm_model.partial_plot(data=frame, cols=['AGE', 'RACE'], server=True, plot=True)
    # pdp with the constant weight column and NA included
    pdpcWNA = gbm_model.partial_plot(data=frame, cols=['AGE', 'RACE'], server=True, plot=True,
                                     weight_column="constWeight", include_na=True)

    # both plots must agree on the columns of the plain plot
    for idx in (0, 1):
        pyunit_utils.assert_H2OTwoDimTable_equal_upto(pdpOrig[idx], pdpcWNA[idx], pdpOrig[idx].col_header,
                                                      tolerance=1e-10)

    # pdp with the varying weight column and NA included
    pdpvWNA = gbm_model.partial_plot(data=frame, cols=['AGE', 'RACE'], server=True, plot=True,
                                     weight_column="variWeight", include_na=True)
    ageList = pyunit_utils.extract_col_value_H2OTwoDimTable(pdpvWNA[0], "age")
    raceList = pyunit_utils.extract_col_value_H2OTwoDimTable(pdpvWNA[1], "race")
    third_race = raceList[2]
    raceList.remove(third_race)
    raceList.append(frame[21,"RACE"]) # replace with NA word
    ageList[-1] = float('nan') # replace nan with proper form for python

    stats_source = 'smalldata/prostate/prostate_cat_NA.csv'
    compare_weightedStats(gbm_model, stats_source, raceList, "RACE", tweight2, pdpvWNA[1], tol=1e-10)
    compare_weightedStats(gbm_model, stats_source, ageList, "AGE", tweight2, pdpvWNA[0], tol=1e-10)
Beispiel #4
0
def partial_plot_test_with_user_splits():
    """Compare a default partial dependence plot with a user-split one.

    A GBM model is trained on the prostate data.  A partial dependence plot
    for AGE, RACE and DCAPS is built once without ``user_splits`` (and saved
    to a temporary PNG, which must be non-empty) and once with explicit splits
    for AGE and RACE.  The resulting tables are then compared on the columns
    of the user-split tables.
    """
    data = h2o.import_file(pyunit_utils.locate('smalldata/prostate/prostate_cat_NA.csv'))
    x = data.names
    y = 'CAPSULE'
    x.remove(y)

    # Build a GBM model predicting for response CAPSULE
    gbm_model = H2OGradientBoostingEstimator(ntrees=50, learn_rate=0.05, seed=12345)
    gbm_model.train(x=x, y=y, training_frame=data)

    user_splits = dict()
    user_splits['AGE'] = [43.0, 44.89473684210526, 46.78947368421053, 48.68421052631579, 50.578947368421055,
                          52.473684210526315, 54.368421052631575, 56.26315789473684, 58.1578947368421,
                          60.05263157894737, 61.94736842105263, 63.84210526315789, 65.73684210526315,
                          67.63157894736842, 69.52631578947368, 71.42105263157895, 73.3157894736842,
                          75.21052631578948, 77.10526315789474]
    user_splits['RACE'] = ["Black"]

    # pdp without user splits, saved to a temporary file.
    # mkstemp hands back an open OS-level descriptor; close it right away so
    # it is not leaked while partial_plot writes to the path.
    fd, filename = tempfile.mkstemp(suffix=".png")
    os.close(fd)
    try:
        pdpOrig = gbm_model.partial_plot(data=data,cols=['AGE', 'RACE', 'DCAPS'],server=True, plot=True, save_to_file=filename)
        assert os.path.getsize(filename) > 0
    finally:
        # always clean up the temporary file, even when the check above fails
        if os.path.isfile(filename):
            os.remove(filename)

    pdpUserSplit = gbm_model.partial_plot(data=data,cols=['AGE', 'RACE', 'DCAPS'],server=True, plot=True,
                                          user_splits=user_splits)

    # compare results
    pyunit_utils.assert_H2OTwoDimTable_equal_upto(pdpUserSplit[0], pdpOrig[0], pdpUserSplit[0].col_header, tolerance=1e-10)
    pyunit_utils.assert_H2OTwoDimTable_equal_upto(pdpUserSplit[1], pdpOrig[1], pdpUserSplit[1].col_header, tolerance=1e-10)
    pyunit_utils.assert_H2OTwoDimTable_equal_upto(pdpOrig[2], pdpUserSplit[2], pdpUserSplit[2].col_header, tolerance=1e-10)
def partial_plot_test_with_user_splits():
    """Compare a default partial dependence plot with a user-split one.

    Trains a GBM model on the prostate data, builds a partial dependence plot
    for AGE, RACE and DCAPS without ``user_splits`` (saving it to a temporary
    PNG that must be non-empty) and a second one with explicit AGE and RACE
    splits, then compares the tables on the user-split tables' columns.
    """
    data = h2o.import_file(pyunit_utils.locate('smalldata/prostate/prostate_cat_NA.csv'))
    x = data.names
    y = 'CAPSULE'
    x.remove(y)

    # Build a GBM model predicting for response CAPSULE
    gbm_model = H2OGradientBoostingEstimator(ntrees=50, learn_rate=0.05, seed=12345)
    gbm_model.train(x=x, y=y, training_frame=data)

    user_splits = dict()
    user_splits['AGE'] = [43.0, 44.89473684210526, 46.78947368421053, 48.68421052631579, 50.578947368421055,
                          52.473684210526315, 54.368421052631575, 56.26315789473684, 58.1578947368421,
                          60.05263157894737, 61.94736842105263, 63.84210526315789, 65.73684210526315,
                          67.63157894736842, 69.52631578947368, 71.42105263157895, 73.3157894736842,
                          75.21052631578948, 77.10526315789474]
    user_splits['RACE'] = ["Black"]

    # pdp without user splits, written to a temporary file.
    # The descriptor returned by mkstemp is closed immediately: only the path
    # is needed, and leaving it open would leak the handle.
    fd, filename = tempfile.mkstemp(suffix=".png")
    os.close(fd)
    try:
        pdpOrig = gbm_model.partial_plot(data=data,cols=['AGE', 'RACE', 'DCAPS'],server=True, plot=True, save_to_file=filename)
        assert os.path.getsize(filename) > 0
    finally:
        # remove the temporary file whether or not the size check passed
        if os.path.isfile(filename):
            os.unlink(filename)

    pdpUserSplit = gbm_model.partial_plot(data=data,cols=['AGE', 'RACE', 'DCAPS'],server=True, plot=True,
                                          user_splits=user_splits)

    # compare results
    pyunit_utils.assert_H2OTwoDimTable_equal_upto(pdpUserSplit[0], pdpOrig[0], pdpUserSplit[0].col_header, tolerance=1e-10)
    pyunit_utils.assert_H2OTwoDimTable_equal_upto(pdpUserSplit[1], pdpOrig[1], pdpUserSplit[1].col_header, tolerance=1e-10)
    pyunit_utils.assert_H2OTwoDimTable_equal_upto(pdpOrig[2], pdpUserSplit[2], pdpUserSplit[2].col_header, tolerance=1e-10)
def partial_plot_test():
    """Check weighted GBM partial dependence plots with and without NAs.

    The airlines data (imported twice, with "NA" treated as missing) is used
    to train a GBM model for the "IsDepDelayed" response.  Weighted partial
    dependence plots for "Input_miss" and "Distance" are built with and
    without ``include_na``; the NA variants must end with a NaN entry, must
    match the non-NA variants on the latter's columns, and are finally handed
    to ``pyunit_utils.compare_weightedStats``.
    """
    WC = "Weight"
    y = "IsDepDelayed"

    data = h2o.import_file(pyunit_utils.locate("smalldata/airlines/AirlinesTrainWgt.csv"), na_strings=["NA"])
    test = h2o.import_file(pyunit_utils.locate("smalldata/airlines/AirlinesTrainWgt.csv"), na_strings=["NA"])

    x = data.names
    # NOTE(review): self-assignment kept from the original; it looks like a
    # no-op (possibly a lost conversion of the response) -- confirm
    data[y] = data[y]
    # the response, the weight column and IsDepDelayed_REC are not predictors
    for excluded in (y, "Weight", "IsDepDelayed_REC"):
        x.remove(excluded)

    # Build a GBM model predicting for response IsDepDelayed
    gbm_model = H2OGradientBoostingEstimator(ntrees=80, learn_rate=0.1, seed=12345)
    gbm_model.train(x=x, y=y, training_frame=data)

    # pdp with weight, include_na left at its default
    pdpw = gbm_model.partial_plot(data=test, cols=["Input_miss", "Distance"], server=True, plot=False,
                                  weight_column=WC)

    # pdp with weight and NA included
    pdpwNA = gbm_model.partial_plot(data=test, cols=["Input_miss", "Distance"], server=True, plot=False,
                                    weight_column=WC, include_na=True)

    # the last value extracted from each NA-enabled table must be NaN
    na_value_lists = []
    for table_idx, table_col in enumerate(("input_miss", "distance")):
        values = pyunit_utils.extract_col_value_H2OTwoDimTable(pdpwNA[table_idx], table_col)
        assert math.isnan(values[-1]), "Expected last element to be nan but is not."
        na_value_lists.append(values)

    # compare pdpw with pdpwNA on the columns of pdpw
    for table_idx in range(2):
        pyunit_utils.assert_H2OTwoDimTable_equal_upto(pdpw[table_idx], pdpwNA[table_idx],
                                                      pdpw[table_idx].col_header, tolerance=1e-10)

    # compare pdpwNA with theoretical results
    for table_idx, frame_col in enumerate(("Input_miss", "Distance")):
        pyunit_utils.compare_weightedStats(gbm_model, test, na_value_lists[table_idx], frame_col,
                                           test[WC].as_data_frame(use_pandas=False, header=False),
                                           pdpwNA[table_idx], tol=1e-10)
Beispiel #7
0
def partial_plot_test_with_user_splits():
    """Check that 1D and 2D partial dependence plots agree across call forms.

    A GBM model is trained on the prostate data and three partial dependence
    plots are requested with the same user splits: 2D pairs only, 1D columns
    plus 2D pairs, and 1D columns only.  The combined result must contain the
    three 1D tables first (indices 0-2) followed by the two 2D tables
    (indices 3-4), each matching the table from the dedicated call.
    """
    # local import: the function only needs os for temp-file housekeeping
    import os

    data = h2o.import_file(
        pyunit_utils.locate('smalldata/prostate/prostate_cat_NA.csv'))
    x = data.names
    y = 'CAPSULE'
    x.remove(y)

    # Build a GBM model predicting for response CAPSULE
    gbm_model = H2OGradientBoostingEstimator(ntrees=50,
                                             learn_rate=0.05,
                                             seed=12345)
    gbm_model.train(x=x, y=y, training_frame=data)

    user_splits = dict()
    user_splits['AGE'] = [
        43.0, 44.89473684210526, 46.78947368421053, 48.68421052631579,
        50.578947368421055, 52.473684210526315, 54.368421052631575,
        56.26315789473684, 58.1578947368421, 60.05263157894737,
        61.94736842105263, 63.84210526315789, 65.73684210526315,
        67.63157894736842, 69.52631578947368, 71.42105263157895,
        73.3157894736842, 75.21052631578948, 77.10526315789474
    ]
    user_splits['RACE'] = ["Black", "White"]

    # mkstemp returns an open OS-level descriptor along with the path; only
    # the path is used, so close the descriptor instead of leaking it
    fd, filename = tempfile.mkstemp(suffix=".png")
    os.close(fd)
    try:
        pdpUserSplit2D = gbm_model.partial_plot(data=data,
                                                server=True,
                                                plot=True,
                                                user_splits=user_splits,
                                                col_pairs_2dpdp=[['AGE', 'PSA'],
                                                                 ['AGE', 'RACE']],
                                                save_to_file=filename)
        pdpUserSplit1D2D = gbm_model.partial_plot(data=data,
                                                  cols=['AGE', 'RACE', 'DCAPS'],
                                                  server=True,
                                                  plot=True,
                                                  user_splits=user_splits,
                                                  col_pairs_2dpdp=[['AGE', 'PSA'],
                                                                   ['AGE',
                                                                    'RACE']],
                                                  save_to_file=filename)
        pdpUserSplit1D = gbm_model.partial_plot(data=data,
                                                cols=['AGE', 'RACE', 'DCAPS'],
                                                server=True,
                                                plot=True,
                                                user_splits=user_splits,
                                                save_to_file=filename)
    finally:
        # the saved plot is not inspected afterwards; do not leave it behind
        if os.path.isfile(filename):
            os.remove(filename)

    # compare results 1D pdp
    for i in range(3):
        pyunit_utils.assert_H2OTwoDimTable_equal_upto(
            pdpUserSplit1D[i],
            pdpUserSplit1D2D[i],
            pdpUserSplit1D[i].col_header,
            tolerance=1e-10)
    # compare results 2D pdp
    pyunit_utils.assert_H2OTwoDimTable_equal_upto(pdpUserSplit2D[0],
                                                  pdpUserSplit1D2D[3],
                                                  pdpUserSplit2D[0].col_header,
                                                  tolerance=1e-10)
    pyunit_utils.assert_H2OTwoDimTable_equal_upto(pdpUserSplit2D[1],
                                                  pdpUserSplit1D2D[4],
                                                  pdpUserSplit2D[1].col_header,
                                                  tolerance=1e-10)
def partial_plot_test():
    """Validate weighted partial dependence plots with NA handling.

    Trains a GBM model for "IsDepDelayed" on the airlines data and builds
    weighted partial dependence plots for "Input_miss" and "Distance", once
    with and once without ``include_na``.  The NA-enabled tables must end in
    NaN, agree with the plain tables on the plain tables' columns, and are
    finally passed to ``pyunit_utils.compare_weightedStats``.
    """
    # Import data set that contains NAs (the same file serves as train and test)
    train_path = pyunit_utils.locate("smalldata/airlines/AirlinesTrainWgt.csv")
    data = h2o.import_file(train_path, na_strings=["NA"])
    test_path = pyunit_utils.locate("smalldata/airlines/AirlinesTrainWgt.csv")
    test = h2o.import_file(test_path, na_strings=["NA"])

    y = "IsDepDelayed"
    x = data.names
    # NOTE(review): self-assignment preserved from the original; appears to
    # be a no-op -- confirm whether a conversion was intended
    data[y] = data[y]
    x.remove(y)
    x.remove("Weight")
    x.remove("IsDepDelayed_REC")
    WC = "Weight"

    # Build a GBM model predicting for response IsDepDelayed
    gbm_model = H2OGradientBoostingEstimator(seed=12345,
                                             learn_rate=0.1,
                                             ntrees=80)
    gbm_model.train(x=x, y=y, training_frame=data)

    pdp_columns = ["Input_miss", "Distance"]

    # pdp with weight, include_na left at its default
    pdpw = gbm_model.partial_plot(data=test,
                                  cols=list(pdp_columns),
                                  server=True,
                                  plot=False,
                                  weight_column=WC)

    # pdp with weight and NA
    pdpwNA = gbm_model.partial_plot(data=test,
                                    cols=list(pdp_columns),
                                    server=True,
                                    plot=False,
                                    weight_column=WC,
                                    include_na=True)

    na_message = "Expected last element to be nan but is not."
    input_miss_table, distance_table = pdpwNA[0], pdpwNA[1]

    input_miss_list = pyunit_utils.extract_col_value_H2OTwoDimTable(
        input_miss_table, "input_miss")
    assert math.isnan(input_miss_list[-1]), na_message
    distance_list = pyunit_utils.extract_col_value_H2OTwoDimTable(
        distance_table, "distance")
    assert math.isnan(distance_list[-1]), na_message

    # compare pdpw with pdpwNA on the columns of pdpw
    check_tables = pyunit_utils.assert_H2OTwoDimTable_equal_upto
    check_tables(pdpw[0], pdpwNA[0], pdpw[0].col_header, tolerance=1e-10)
    check_tables(pdpw[1], pdpwNA[1], pdpw[1].col_header, tolerance=1e-10)

    # compare pdpwNA with theoretical results
    input_miss_weights = test[WC].as_data_frame(use_pandas=False,
                                                header=False)
    pyunit_utils.compare_weightedStats(gbm_model,
                                       test,
                                       input_miss_list,
                                       "Input_miss",
                                       input_miss_weights,
                                       pdpwNA[0],
                                       tol=1e-10)
    distance_weights = test[WC].as_data_frame(use_pandas=False,
                                              header=False)
    pyunit_utils.compare_weightedStats(gbm_model,
                                       test,
                                       distance_list,
                                       "Distance",
                                       distance_weights,
                                       pdpwNA[1],
                                       tol=1e-10)
Beispiel #9
0
def partial_plot_test():
    """Test GBM partial dependence plots with constant and varying weights.

    The prostate data gets two extra columns: "constWeight" (3.0 in every
    row) and "variWeight" (seeded random integers from 0 to 5).  A plot with
    the constant weight and ``include_na=True`` must match the unweighted
    plot on the unweighted plot's columns; the plot with the varying weight
    is checked through ``compare_weightedStats``.
    """
    # Import data set that contains NAs
    csv_path = 'smalldata/prostate/prostate_cat_NA.csv'
    data = h2o.import_file(pyunit_utils.locate(csv_path))
    y = 'CAPSULE'
    x = data.names
    x.remove(y)

    weights = h2o.H2OFrame([3.0] * data.nrow)
    n_rows = data.nrow
    random.seed(12345)
    # one random integer weight per row, drawn in row order after seeding
    tweight2 = [random.randint(0, 5) for _ in range(n_rows)]
    weights2 = h2o.H2OFrame(tweight2)

    data = data.cbind(weights)
    data = data.cbind(weights2)
    # the two appended columns are the last two of the frame
    data.set_name(data.ncol - 2, "constWeight")
    data.set_name(data.ncol - 1, "variWeight")

    # Build a GBM model predicting for response CAPSULE
    gbm_model = H2OGradientBoostingEstimator(seed=12345,
                                             learn_rate=0.05,
                                             ntrees=50)
    gbm_model.train(x=x, y=y, training_frame=data)

    # pdp without weight, include_na left at its default
    pdpOrig = gbm_model.partial_plot(data=data,
                                     cols=['AGE', 'RACE'],
                                     server=True,
                                     plot=True)
    # pdp with constant weight and NA
    pdpcWNA = gbm_model.partial_plot(data=data,
                                     cols=['AGE', 'RACE'],
                                     server=True,
                                     plot=True,
                                     weight_column="constWeight",
                                     include_na=True)

    # compare results table by table
    for plain_table, weighted_table in ((pdpOrig[0], pdpcWNA[0]),
                                        (pdpOrig[1], pdpcWNA[1])):
        pyunit_utils.assert_H2OTwoDimTable_equal_upto(
            plain_table,
            weighted_table,
            plain_table.col_header,
            tolerance=1e-10)

    # pdp with changing weight NA
    pdpvWNA = gbm_model.partial_plot(data=data,
                                     cols=['AGE', 'RACE'],
                                     server=True,
                                     plot=True,
                                     weight_column="variWeight",
                                     include_na=True)
    age_table, race_table = pdpvWNA[0], pdpvWNA[1]
    ageList = pyunit_utils.extract_col_value_H2OTwoDimTable(age_table, "age")
    raceList = pyunit_utils.extract_col_value_H2OTwoDimTable(
        race_table, "race")
    raceList.remove(raceList[2])
    na_word = data[21, "RACE"]
    raceList.append(na_word)  # replace with NA word
    # replace nan with proper form for python
    ageList[-1] = float('nan')

    compare_weightedStats(gbm_model,
                          'smalldata/prostate/prostate_cat_NA.csv',
                          raceList,
                          "RACE",
                          tweight2,
                          pdpvWNA[1],
                          tol=1e-10)
    compare_weightedStats(gbm_model,
                          'smalldata/prostate/prostate_cat_NA.csv',
                          ageList,
                          "AGE",
                          tweight2,
                          pdpvWNA[0],
                          tol=1e-10)