def test_glm_multinomial_coeffs():
    """Verify the multinomial GLM coefficient table that carries class names.

    Trains a multinomial GLM on the iris training set and reads two tables
    from the model JSON output: ``coefficients_table`` (columns suffixed with
    class indices) and ``coefficients_table_multinomials_with_class_names``
    (columns suffixed with class labels).  The test asserts that every column
    header of each table belongs to the expected header list for that table,
    then hands both tables to
    ``pyunit_utils.assert_H2OTwoDimTable_equal_upto`` for a value comparison.
    """
    trainF = h2o.import_file(pyunit_utils.locate("smalldata/iris/iris_train.csv"))
    y = "species"
    x = [0, 1, 2, 3]
    bin_LS = glm(family='multinomial', seed=12345)
    bin_LS.train(x=x, y=y, training_frame=trainF)
    print(bin_LS.summary())

    model_output = bin_LS._model_json["output"]
    coefficient_table_original = model_output["coefficients_table"]
    coefficient_table = model_output["coefficients_table_multinomials_with_class_names"]
    coeffNamesOld = coefficient_table_original.col_header
    coeffNames = coefficient_table.col_header

    validCoefficientNames = [u"names",
                             u"coefs_class_Iris-setosa", u"coefs_class_Iris-versicolor",
                             u"coefs_class_Iris-virginica",
                             u"std_coefs_class_Iris-setosa", u"std_coefs_class_Iris-versicolor",
                             u"std_coefs_class_Iris-virginica"]
    oldCoefficientNames = [u"names",
                           u"coefs_class_0", u"coefs_class_1", u"coefs_class_2",
                           u"std_coefs_class_0", u"std_coefs_class_1", u"std_coefs_class_2"]

    print(coefficient_table)
    print(coefficient_table_original)

    # compare coefficient names: every header must appear in the expected list
    assert len(set(coeffNames).intersection(validCoefficientNames)) == len(coeffNames), \
        "Expected coefficient names: {0}. Actual coefficient names: {1}".format(validCoefficientNames, coeffNames)
    # The original table is measured against its own header count, not the
    # header count of the class-name table.
    assert len(set(coeffNamesOld).intersection(oldCoefficientNames)) == len(coeffNamesOld), \
        "Expected original coefficient names: {0}. Actual original coefficient names: " \
        "{1}".format(oldCoefficientNames, coeffNamesOld)

    # compare table contents to make sure they contain the same values
    pyunit_utils.assert_H2OTwoDimTable_equal_upto(coefficient_table_original, coefficient_table,
                                                  [u'coefs_class_0'], tolerance=1e-10)
def testFrameTransform():
    """Check that ANOVA GLM with ``missing_values_handling="skip"`` matches
    a model trained on a frame from which the NA rows were removed.

    Two cells (row 10 / column index 2 and row 20 / column index 7) are set
    to missing.  The first model is trained on that frame with the "skip"
    policy; the second is trained on a copy of the frame with rows 10 and 20
    dropped.  The ``model_summary`` tables of both models are then compared
    with ``pyunit_utils.assert_H2OTwoDimTable_equal_upto``.
    """
    train = h2o.import_file(path=pyunit_utils.locate("smalldata/prostate/prostate_complete.csv.zip"))
    y = 'CAPSULE'
    x = ['AGE', 'VOL', 'DCAPS']
    # Inject missing values; presumably these column indices fall on
    # predictor columns used below -- verify against the data set layout.
    train[10, 2] = None
    train[20, 7] = None
    train[y] = train[y].asfactor()

    # build model choosing skip
    model1 = H2OANOVAGLMEstimator(family='binomial', lambda_=0, missing_values_handling="skip")
    model1.train(x=x, y=y, training_frame=train)

    # build model deleting the two rows with missing values
    # H2OFrame.drop is not in-place: the returned frame must be kept,
    # otherwise model2 would be trained on exactly the same data as model1.
    train = train.drop([10, 20], axis=0)
    model2 = H2OANOVAGLMEstimator(family='binomial', lambda_=0, missing_values_handling="skip")
    model2.train(x=x, y=y, training_frame=train)

    # the two models should be the same, compare the model summaries
    summary1 = model1._model_json['output']['model_summary']
    summary2 = model2._model_json['output']['model_summary']
    pyunit_utils.assert_H2OTwoDimTable_equal_upto(summary1, summary2, summary1.col_header)
def partial_plot_test():
    """Exercise GBM partial dependence plots with weight columns and NAs.

    A constant weight column and a pseudo-random integer weight column are
    appended to the prostate data set.  The unweighted PDP is compared with
    the constant-weight PDP that includes NAs, and the variable-weight PDP is
    handed to ``compare_weightedStats`` together with the raw weights.
    """
    frame = h2o.import_file(pyunit_utils.locate('smalldata/prostate/prostate_cat_NA.csv'))
    y = 'CAPSULE'
    x = frame.names
    x.remove(y)

    # Two extra columns: a constant weight of 3.0 and seeded random weights in 0..5.
    const_weights = h2o.H2OFrame([3.0] * frame.nrow)
    random.seed(12345)
    row_weights = [random.randint(0, 5) for _ in range(frame.nrow)]
    vari_weights = h2o.H2OFrame(row_weights)
    frame = frame.cbind(const_weights).cbind(vari_weights)
    last_col = frame.ncol - 1
    frame.set_name(last_col - 1, "constWeight")
    frame.set_name(last_col, "variWeight")

    # Build a GBM model predicting for response CAPSULE
    gbm_model = H2OGradientBoostingEstimator(ntrees=50, learn_rate=0.05, seed=12345)
    gbm_model.train(x=x, y=y, training_frame=frame)

    def _pdp(**extra):
        # Every PDP request shares the data, columns and plotting flags.
        return gbm_model.partial_plot(data=frame, cols=['AGE', 'RACE'], server=True, plot=True, **extra)

    # pdp without weight or NA
    pdpOrig = _pdp()
    # pdp with constant weight and NA
    pdpcWNA = _pdp(weight_column="constWeight", include_na=True)

    # compare results for the AGE table (index 0) and the RACE table (index 1)
    for table_idx in range(2):
        pyunit_utils.assert_H2OTwoDimTable_equal_upto(pdpOrig[table_idx], pdpcWNA[table_idx],
                                                      pdpOrig[table_idx].col_header, tolerance=1e-10)

    # pdp with changing weight NA
    pdpvWNA = _pdp(weight_column="variWeight", include_na=True)
    ageList = pyunit_utils.extract_col_value_H2OTwoDimTable(pdpvWNA[0], "age")
    raceList = pyunit_utils.extract_col_value_H2OTwoDimTable(pdpvWNA[1], "race")
    # Remove the value found at position 2 and append the cell at row 21 of
    # RACE instead (presumably the data set's NA representation -- confirm).
    raceList.remove(raceList[2])
    raceList.append(frame[21, "RACE"])
    # replace nan with proper form for python
    ageList[-1] = float('nan')

    checks = (
        (raceList, "RACE", pdpvWNA[1]),
        (ageList, "AGE", pdpvWNA[0]),
    )
    for grid_values, column, pdp_table in checks:
        compare_weightedStats(gbm_model, 'smalldata/prostate/prostate_cat_NA.csv', grid_values, column,
                              row_weights, pdp_table, tol=1e-10)
def partial_plot_test_with_user_splits():
    """Check that PDPs computed with explicit user splits match the default PDPs.

    A GBM is trained on the prostate data set.  The default partial plot for
    AGE, RACE and DCAPS is saved to a temporary PNG (which must be non-empty),
    then the same plot is requested with ``user_splits`` for AGE and RACE and
    the three resulting tables are compared pairwise.
    """
    data = h2o.import_file(pyunit_utils.locate('smalldata/prostate/prostate_cat_NA.csv'))
    x = data.names
    y = 'CAPSULE'
    x.remove(y)

    # Build a GBM model predicting for response CAPSULE
    gbm_model = H2OGradientBoostingEstimator(ntrees=50, learn_rate=0.05, seed=12345)
    gbm_model.train(x=x, y=y, training_frame=data)

    user_splits = {
        'AGE': [43.0, 44.89473684210526, 46.78947368421053, 48.68421052631579, 50.578947368421055,
                52.473684210526315, 54.368421052631575, 56.26315789473684, 58.1578947368421,
                60.05263157894737, 61.94736842105263, 63.84210526315789, 65.73684210526315,
                67.63157894736842, 69.52631578947368, 71.42105263157895, 73.3157894736842,
                75.21052631578948, 77.10526315789474],
        'RACE': ["Black"],
    }

    # pdp without weight or NA
    # mkstemp hands back an open OS-level descriptor; close it immediately so
    # it is not leaked (only the path is needed by partial_plot).
    fd, filename = tempfile.mkstemp(suffix=".png")
    os.close(fd)
    try:
        pdpOrig = gbm_model.partial_plot(data=data, cols=['AGE', 'RACE', 'DCAPS'], server=True, plot=True,
                                         save_to_file=filename)
        assert os.path.getsize(filename) > 0
    finally:
        # Remove the temporary image even when the plot or the assertion fails.
        if os.path.isfile(filename):
            os.remove(filename)

    pdpUserSplit = gbm_model.partial_plot(data=data, cols=['AGE', 'RACE', 'DCAPS'], server=True, plot=True,
                                          user_splits=user_splits)

    # compare results
    pyunit_utils.assert_H2OTwoDimTable_equal_upto(pdpUserSplit[0], pdpOrig[0], pdpUserSplit[0].col_header,
                                                  tolerance=1e-10)
    pyunit_utils.assert_H2OTwoDimTable_equal_upto(pdpUserSplit[1], pdpOrig[1], pdpUserSplit[1].col_header,
                                                  tolerance=1e-10)
    pyunit_utils.assert_H2OTwoDimTable_equal_upto(pdpOrig[2], pdpUserSplit[2], pdpUserSplit[2].col_header,
                                                  tolerance=1e-10)
def partial_plot_test_with_user_splits():
    """Compare default GBM partial dependence tables with user-split ones.

    Trains a GBM on the prostate data set, saves the default partial plot for
    AGE, RACE and DCAPS into a temporary PNG and asserts the file is not
    empty.  The plot is then recomputed with ``user_splits`` supplied for AGE
    and RACE, and each of the three tables is compared against its default
    counterpart.
    """
    data = h2o.import_file(pyunit_utils.locate('smalldata/prostate/prostate_cat_NA.csv'))
    x = data.names
    y = 'CAPSULE'
    x.remove(y)

    # Build a GBM model predicting for response CAPSULE
    gbm_model = H2OGradientBoostingEstimator(ntrees=50, learn_rate=0.05, seed=12345)
    gbm_model.train(x=x, y=y, training_frame=data)

    user_splits = dict()
    user_splits['AGE'] = [43.0, 44.89473684210526, 46.78947368421053, 48.68421052631579,
                          50.578947368421055, 52.473684210526315, 54.368421052631575,
                          56.26315789473684, 58.1578947368421, 60.05263157894737,
                          61.94736842105263, 63.84210526315789, 65.73684210526315,
                          67.63157894736842, 69.52631578947368, 71.42105263157895,
                          73.3157894736842, 75.21052631578948, 77.10526315789474]
    user_splits['RACE'] = ["Black"]

    # pdp without weight or NA
    # Only the path is used below, so the descriptor opened by mkstemp is
    # closed right away instead of being leaked.
    handle, filename = tempfile.mkstemp(suffix=".png")
    os.close(handle)
    try:
        pdpOrig = gbm_model.partial_plot(data=data, cols=['AGE', 'RACE', 'DCAPS'], server=True, plot=True,
                                         save_to_file=filename)
        assert os.path.getsize(filename) > 0
    finally:
        # Always clean up the temporary image, including on failure; the
        # existence check keeps cleanup from masking the original error.
        if os.path.exists(filename):
            os.unlink(filename)

    pdpUserSplit = gbm_model.partial_plot(data=data, cols=['AGE', 'RACE', 'DCAPS'], server=True, plot=True,
                                          user_splits=user_splits)

    # compare results
    pyunit_utils.assert_H2OTwoDimTable_equal_upto(pdpUserSplit[0], pdpOrig[0], pdpUserSplit[0].col_header,
                                                  tolerance=1e-10)
    pyunit_utils.assert_H2OTwoDimTable_equal_upto(pdpUserSplit[1], pdpOrig[1], pdpUserSplit[1].col_header,
                                                  tolerance=1e-10)
    pyunit_utils.assert_H2OTwoDimTable_equal_upto(pdpOrig[2], pdpUserSplit[2], pdpUserSplit[2].col_header,
                                                  tolerance=1e-10)
def partial_plot_test():
    """Exercise weighted GBM partial dependence plots with and without NAs.

    The airlines data set is imported twice (training and test frame).  A GBM
    is trained on every column except the response, the weight column and
    ``IsDepDelayed_REC``.  Weighted PDPs for ``Input_miss`` and ``Distance``
    are computed with and without ``include_na``; the NA variant must end with
    a NaN grid value, must match the non-NA variant on the compared tables,
    and is finally passed to ``pyunit_utils.compare_weightedStats``.
    """
    def _import_airlines():
        # Each call locates and imports the file again.
        return h2o.import_file(pyunit_utils.locate("smalldata/airlines/AirlinesTrainWgt.csv"),
                               na_strings=["NA"])

    # Import data set that contains NAs
    data = _import_airlines()
    test = _import_airlines()

    x = data.names
    y = "IsDepDelayed"
    data[y] = data[y]
    WC = "Weight"
    # Response, weight and the recoded response are not predictors.
    for excluded in (y, WC, "IsDepDelayed_REC"):
        x.remove(excluded)

    # Build a GBM model predicting the response column
    gbm_model = H2OGradientBoostingEstimator(ntrees=80, learn_rate=0.1, seed=12345)
    gbm_model.train(x=x, y=y, training_frame=data)

    pdp_cols = ["Input_miss", "Distance"]
    # pdp with weight and no NA
    pdpw = gbm_model.partial_plot(data=test, cols=list(pdp_cols), server=True, plot=False,
                                  weight_column=WC)
    # pdp with weight and NA
    pdpwNA = gbm_model.partial_plot(data=test, cols=list(pdp_cols), server=True, plot=False,
                                    weight_column=WC, include_na=True)

    # The table column holding the grid values is the lower-cased feature name.
    grid_values = []
    for table_idx, column in enumerate(pdp_cols):
        values = pyunit_utils.extract_col_value_H2OTwoDimTable(pdpwNA[table_idx], column.lower())
        assert math.isnan(values[-1]), "Expected last element to be nan but is not."
        grid_values.append(values)

    # compare pdpw with pdpwNA, they should equal upto NA since the pdpw does not have NAs.
    for table_idx in range(len(pdp_cols)):
        pyunit_utils.assert_H2OTwoDimTable_equal_upto(pdpw[table_idx], pdpwNA[table_idx],
                                                      pdpw[table_idx].col_header, tolerance=1e-10)

    # compare pdpwNA with theoretical results
    for table_idx, column in enumerate(pdp_cols):
        pyunit_utils.compare_weightedStats(gbm_model, test, grid_values[table_idx], column,
                                           test[WC].as_data_frame(use_pandas=False, header=False),
                                           pdpwNA[table_idx], tol=1e-10)
def partial_plot_test_with_user_splits():
    """Check that 1D and 2D PDP tables agree whether requested alone or together.

    A GBM is trained on the prostate data set and three partial plots are
    requested with the same ``user_splits``: 2D pairs only, 1D columns plus 2D
    pairs, and 1D columns only.  The first three tables of the combined result
    are compared against the 1D-only tables, and tables 3 and 4 of the
    combined result against the 2D-only tables.  All plots are written to one
    temporary PNG that is removed afterwards.
    """
    data = h2o.import_file(pyunit_utils.locate('smalldata/prostate/prostate_cat_NA.csv'))
    x = data.names
    y = 'CAPSULE'
    x.remove(y)

    # Build a GBM model predicting for response CAPSULE
    gbm_model = H2OGradientBoostingEstimator(ntrees=50, learn_rate=0.05, seed=12345)
    gbm_model.train(x=x, y=y, training_frame=data)

    user_splits = dict()
    user_splits['AGE'] = [
        43.0, 44.89473684210526, 46.78947368421053, 48.68421052631579,
        50.578947368421055, 52.473684210526315, 54.368421052631575,
        56.26315789473684, 58.1578947368421, 60.05263157894737,
        61.94736842105263, 63.84210526315789, 65.73684210526315,
        67.63157894736842, 69.52631578947368, 71.42105263157895,
        73.3157894736842, 75.21052631578948, 77.10526315789474
    ]
    user_splits['RACE'] = ["Black", "White"]

    # mkstemp returns an open descriptor plus the path; only the path is
    # needed, so close the descriptor at once to avoid leaking it.
    fd, filename = tempfile.mkstemp(suffix=".png")
    os.close(fd)
    try:
        pdpUserSplit2D = gbm_model.partial_plot(data=data,
                                                server=True,
                                                plot=True,
                                                user_splits=user_splits,
                                                col_pairs_2dpdp=[['AGE', 'PSA'], ['AGE', 'RACE']],
                                                save_to_file=filename)
        pdpUserSplit1D2D = gbm_model.partial_plot(data=data,
                                                  cols=['AGE', 'RACE', 'DCAPS'],
                                                  server=True,
                                                  plot=True,
                                                  user_splits=user_splits,
                                                  col_pairs_2dpdp=[['AGE', 'PSA'], ['AGE', 'RACE']],
                                                  save_to_file=filename)
        pdpUserSplit1D = gbm_model.partial_plot(data=data,
                                                cols=['AGE', 'RACE', 'DCAPS'],
                                                server=True,
                                                plot=True,
                                                user_splits=user_splits,
                                                save_to_file=filename)
    finally:
        # The image itself is never inspected; delete it so repeated runs do
        # not accumulate temporary files.
        if os.path.isfile(filename):
            os.remove(filename)

    # compare results 1D pdp
    for i in range(3):
        pyunit_utils.assert_H2OTwoDimTable_equal_upto(
            pdpUserSplit1D[i],
            pdpUserSplit1D2D[i],
            pdpUserSplit1D[i].col_header,
            tolerance=1e-10)

    # compare results 2D pdp
    pyunit_utils.assert_H2OTwoDimTable_equal_upto(pdpUserSplit2D[0],
                                                  pdpUserSplit1D2D[3],
                                                  pdpUserSplit2D[0].col_header,
                                                  tolerance=1e-10)
    pyunit_utils.assert_H2OTwoDimTable_equal_upto(pdpUserSplit2D[1],
                                                  pdpUserSplit1D2D[4],
                                                  pdpUserSplit2D[1].col_header,
                                                  tolerance=1e-10)
def partial_plot_test():
    """Validate weighted partial dependence plots of a GBM on airlines data.

    Steps: import the data set twice, train a GBM on all columns except the
    response, ``Weight`` and ``IsDepDelayed_REC``, compute weighted PDPs for
    ``Input_miss`` and ``Distance`` with and without ``include_na``, assert
    that the NA variant ends with a NaN grid value, compare the two variants
    table by table, and pass the NA variant to
    ``pyunit_utils.compare_weightedStats`` along with the weight column.
    """
    airlines_csv = "smalldata/airlines/AirlinesTrainWgt.csv"

    # Import data set that contains NAs
    data = h2o.import_file(pyunit_utils.locate(airlines_csv), na_strings=["NA"])
    test = h2o.import_file(pyunit_utils.locate(airlines_csv), na_strings=["NA"])

    predictors = data.names
    response = "IsDepDelayed"
    data[response] = data[response]
    predictors.remove(response)
    predictors.remove("Weight")
    predictors.remove("IsDepDelayed_REC")
    WC = "Weight"

    # Build a GBM model predicting the response column
    gbm_model = H2OGradientBoostingEstimator(ntrees=80,
                                             learn_rate=0.1,
                                             seed=12345)
    gbm_model.train(x=predictors, y=response, training_frame=data)

    # Arguments shared by both partial plot requests.
    pdp_args = dict(data=test, server=True, plot=False, weight_column=WC)
    # pdp with weight and no NA
    pdpw = gbm_model.partial_plot(cols=["Input_miss", "Distance"], **pdp_args)
    # pdp with weight and NA
    pdpwNA = gbm_model.partial_plot(cols=["Input_miss", "Distance"],
                                    include_na=True,
                                    **pdp_args)

    nan_message = "Expected last element to be nan but is not."

    input_miss_table = pdpwNA[0]
    input_miss_list = pyunit_utils.extract_col_value_H2OTwoDimTable(
        input_miss_table, "input_miss")
    input_miss_tail = input_miss_list[-1]
    assert math.isnan(input_miss_tail), nan_message

    distance_table = pdpwNA[1]
    distance_list = pyunit_utils.extract_col_value_H2OTwoDimTable(
        distance_table, "distance")
    distance_tail = distance_list[-1]
    assert math.isnan(distance_tail), nan_message

    # compare pdpw with pdpwNA, they should equal upto NA since the pdpw does not have NAs.
    input_miss_plain = pdpw[0]
    pyunit_utils.assert_H2OTwoDimTable_equal_upto(input_miss_plain,
                                                  input_miss_table,
                                                  input_miss_plain.col_header,
                                                  tolerance=1e-10)
    distance_plain = pdpw[1]
    pyunit_utils.assert_H2OTwoDimTable_equal_upto(distance_plain,
                                                  distance_table,
                                                  distance_plain.col_header,
                                                  tolerance=1e-10)

    # compare pdpwNA with theoretical results; the weights are fetched anew
    # for each comparison.
    input_miss_weights = test[WC].as_data_frame(use_pandas=False, header=False)
    pyunit_utils.compare_weightedStats(gbm_model,
                                       test,
                                       input_miss_list,
                                       "Input_miss",
                                       input_miss_weights,
                                       input_miss_table,
                                       tol=1e-10)
    distance_weights = test[WC].as_data_frame(use_pandas=False, header=False)
    pyunit_utils.compare_weightedStats(gbm_model,
                                       test,
                                       distance_list,
                                       "Distance",
                                       distance_weights,
                                       distance_table,
                                       tol=1e-10)
def partial_plot_test():
    """Check GBM partial dependence plots against constant and varying weights.

    The prostate data set gets two appended columns, ``constWeight`` (all 3.0)
    and ``variWeight`` (seeded random integers in 0..5).  The plain PDP for
    AGE and RACE is compared with the constant-weight PDP including NAs; the
    varying-weight PDP is then checked through ``compare_weightedStats``
    using the Python-side copy of the random weights.
    """
    csv_rel_path = 'smalldata/prostate/prostate_cat_NA.csv'
    pdp_columns = ('AGE', 'RACE')

    # Import data set that contains NAs
    data = h2o.import_file(pyunit_utils.locate(csv_rel_path))
    predictors = data.names
    response = 'CAPSULE'
    predictors.remove(response)

    n_rows = data.nrow
    weights = h2o.H2OFrame([3.0] * n_rows)
    # Seed first so the random weights are reproducible between runs.
    random.seed(12345)
    tweight2 = [random.randint(0, 5) for _ in range(n_rows)]
    weights2 = h2o.H2OFrame(tweight2)

    data = data.cbind(weights)
    data = data.cbind(weights2)
    n_cols = data.ncol
    data.set_name(n_cols - 2, "constWeight")
    data.set_name(n_cols - 1, "variWeight")

    # Build a GBM model predicting for response CAPSULE
    gbm_model = H2OGradientBoostingEstimator(ntrees=50,
                                             learn_rate=0.05,
                                             seed=12345)
    gbm_model.train(x=predictors, y=response, training_frame=data)

    # pdp without weight or NA
    pdpOrig = gbm_model.partial_plot(data=data,
                                     cols=list(pdp_columns),
                                     server=True,
                                     plot=True)
    # pdp with constant weight and NA
    pdpcWNA = gbm_model.partial_plot(data=data,
                                     cols=list(pdp_columns),
                                     server=True,
                                     plot=True,
                                     weight_column="constWeight",
                                     include_na=True)

    # compare results, one table per plotted column
    for idx, _ in enumerate(pdp_columns):
        pyunit_utils.assert_H2OTwoDimTable_equal_upto(pdpOrig[idx],
                                                      pdpcWNA[idx],
                                                      pdpOrig[idx].col_header,
                                                      tolerance=1e-10)

    # pdp with changing weight NA
    pdpvWNA = gbm_model.partial_plot(data=data,
                                     cols=list(pdp_columns),
                                     server=True,
                                     plot=True,
                                     weight_column="variWeight",
                                     include_na=True)
    age_table = pdpvWNA[0]
    race_table = pdpvWNA[1]
    ageList = pyunit_utils.extract_col_value_H2OTwoDimTable(age_table, "age")
    raceList = pyunit_utils.extract_col_value_H2OTwoDimTable(race_table, "race")

    # Swap the value found at position 2 for the RACE cell of row 21
    # (presumably the NA marker used by the data set -- confirm).
    unwanted = raceList[2]
    raceList.remove(unwanted)
    raceList.append(data[21, "RACE"])
    # replace nan with proper form for python
    ageList[len(ageList) - 1] = float('nan')

    compare_weightedStats(gbm_model, csv_rel_path, raceList, "RACE",
                          tweight2, race_table, tol=1e-10)
    compare_weightedStats(gbm_model, csv_rel_path, ageList, "AGE",
                          tweight2, age_table, tol=1e-10)