def compare_weightedStats(model, datafile, xlist, xname, weightV, pdpTDTable, tol=1e-6):
    """Compare a partial-dependence table with manually computed weighted stats.

    The reference statistics come from ``manual_partial_dependence`` and are
    indexed positionally: element 0 is checked against the table's
    ``mean_response`` column, element 1 against ``stddev_response`` and
    element 2 against ``std_error_mean_response``.

    :param model: model forwarded to ``manual_partial_dependence``.
    :param datafile: dataset reference forwarded to ``manual_partial_dependence``.
    :param xlist: predictor values forwarded to ``manual_partial_dependence``.
    :param xname: predictor name forwarded to ``manual_partial_dependence``.
    :param weightV: weights forwarded to ``manual_partial_dependence``.
    :param pdpTDTable: H2OTwoDimTable holding the partial-dependence output.
    :param tol: value passed as both the third and fourth positional argument
        of ``pyunit_utils.equal_two_arrays``.
    """
    # Theoretical weighted statistics, compared slot by slot below.
    expected = manual_partial_dependence(model, datafile, xlist, xname, weightV)

    # Pull the three statistic columns out of the table, in comparison order.
    reported = [
        pyunit_utils.extract_col_value_H2OTwoDimTable(pdpTDTable, column)
        for column in ("mean_response", "stddev_response", "std_error_mean_response")
    ]

    # NOTE(review): another definition of this function in the file spells the
    # keyword `throwError`; confirm which one matches
    # pyunit_utils.equal_two_arrays. Presumably it makes a mismatch raise.
    for slot, from_table in enumerate(reported):
        pyunit_utils.equal_two_arrays(expected[slot], from_table, tol, tol, throw_error=True)
def compare_weightedStats(model, datafile, xlist, xname, weightV, pdpTDTable, tol=1e-6):
    """Validate partial-dependence statistics against a manual calculation.

    ``manual_partial_dependence`` supplies the theoretical weighted statistics.
    Its elements 0, 1 and 2 are compared with the ``mean_response``,
    ``stddev_response`` and ``std_error_mean_response`` columns of
    ``pdpTDTable`` respectively.

    :param model: model forwarded to ``manual_partial_dependence``.
    :param datafile: dataset reference forwarded to ``manual_partial_dependence``.
    :param xlist: predictor values forwarded to ``manual_partial_dependence``.
    :param xname: predictor name forwarded to ``manual_partial_dependence``.
    :param weightV: weights forwarded to ``manual_partial_dependence``.
    :param pdpTDTable: H2OTwoDimTable holding the partial-dependence output.
    :param tol: used for both the third and fourth positional argument of
        ``pyunit_utils.equal_two_arrays``.
    """
    theoretical = manual_partial_dependence(model, datafile, xlist, xname, weightV)

    def table_column(header):
        # Read one column of the partial-dependence table.
        return pyunit_utils.extract_col_value_H2OTwoDimTable(pdpTDTable, header)

    table_mean = table_column("mean_response")
    table_std = table_column("stddev_response")
    table_std_err = table_column("std_error_mean_response")

    # NOTE(review): another definition of this function in the file spells the
    # keyword `throw_error`; confirm which one matches
    # pyunit_utils.equal_two_arrays. Presumably it makes a mismatch raise.
    checks = ((0, table_mean), (1, table_std), (2, table_std_err))
    for slot, observed in checks:
        pyunit_utils.equal_two_arrays(theoretical[slot], observed, tol, tol, throwError=True)
def partial_plot_test():
    """Partial-dependence checks with weight columns on the prostate data.

    Steps:
      1. Load the prostate data set, append a constant weight column (3.0) and
         a seeded random-integer weight column (values in 0..5).
      2. Train a GBM on every original column except the response ``CAPSULE``.
      3. Assert that the AGE/RACE partial-dependence tables produced with the
         constant weight and ``include_na=True`` equal the plain tables over
         the plain tables' column headers.
      4. For the varying weight column, compare the reported statistics with
         the manual calculation done by ``compare_weightedStats``.
    """
    csv_path = 'smalldata/prostate/prostate_cat_NA.csv'
    pdp_cols = ('AGE', 'RACE')

    # Import data set that contains NAs
    data = h2o.import_file(pyunit_utils.locate(csv_path))
    y = 'CAPSULE'
    x = data.names
    x.remove(y)

    # Constant weights, then reproducible random integer weights.
    const_weights = h2o.H2OFrame([3.0] * data.nrow)
    random.seed(12345)
    tweight2 = [random.randint(0, 5) for _ in range(data.nrow)]
    vari_weights = h2o.H2OFrame(tweight2)

    data = data.cbind(const_weights).cbind(vari_weights)
    data.set_name(data.ncol - 2, "constWeight")
    data.set_name(data.ncol - 1, "variWeight")

    # Build a GBM model predicting the response CAPSULE
    gbm_model = H2OGradientBoostingEstimator(ntrees=50, learn_rate=0.05, seed=12345)
    gbm_model.train(x=x, y=y, training_frame=data)

    # pdp without weight or NA
    pdpOrig = gbm_model.partial_plot(data=data, cols=list(pdp_cols), server=True, plot=True)
    # pdp with constant weight and NA
    pdpcWNA = gbm_model.partial_plot(data=data, cols=list(pdp_cols), server=True, plot=True,
                                     weight_column="constWeight", include_na=True)

    # compare results: table 0 first, then table 1
    for idx in range(2):
        pyunit_utils.assert_H2OTwoDimTable_equal_upto(
            pdpOrig[idx], pdpcWNA[idx], pdpOrig[idx].col_header, tolerance=1e-10)

    # pdp with changing weight and NA
    pdpvWNA = gbm_model.partial_plot(data=data, cols=list(pdp_cols), server=True, plot=True,
                                     weight_column="variWeight", include_na=True)
    ageList = pyunit_utils.extract_col_value_H2OTwoDimTable(pdpvWNA[0], "age")
    raceList = pyunit_utils.extract_col_value_H2OTwoDimTable(pdpvWNA[1], "race")

    # Drop the first entry equal to raceList[2] and append the RACE value of
    # row 21 (the original comment calls this the "NA word"; presumably that
    # cell is missing in the data).
    raceList.remove(raceList[2])
    raceList.append(data[21, "RACE"])
    # Replace the last age entry with a Python NaN.
    ageList[-1] = float('nan')

    compare_weightedStats(gbm_model, csv_path, raceList, "RACE", tweight2, pdpvWNA[1], tol=1e-10)
    compare_weightedStats(gbm_model, csv_path, ageList, "AGE", tweight2, pdpvWNA[0], tol=1e-10)
def cv_nfolds_sd_check():
    """Check the mean/sd columns of a 4-fold cross-validation metrics summary.

    Trains a bernoulli GBM with ``nfolds=4`` on the prostate data (column 1 as
    a factor response, columns 2..8 as predictors), then hands the ``mean`` and
    ``sd`` columns of the cross-validation metrics summary, together with the
    four per-fold columns, to ``assertMeanSDCalculation``.
    """
    prostate = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    prostate[1] = prostate[1].asfactor()
    prostate.summary()

    prostate_gbm = H2OGradientBoostingEstimator(nfolds=4, distribution="bernoulli")
    prostate_gbm.train(x=list(range(2, 9)), y=1, training_frame=prostate)
    prostate_gbm.show()
    prostate_gbm.model_performance(xval=True)

    def summary_column(header):
        # Read one column of the cross-validation metrics summary table.
        return pyunit_utils.extract_col_value_H2OTwoDimTable(
            prostate_gbm._model_json["output"]["cross_validation_metrics_summary"], header)

    # extract mean and std column calculated by cross-validation metric
    meanCol = summary_column("mean")
    stdCol = summary_column("sd")

    # extract actual values from all folds
    fold_headers = ("cv_1_valid", "cv_2_valid", "cv_3_valid", "cv_4_valid")
    cvVals = [summary_column(header) for header in fold_headers]

    # compare manual mean/std calculation from cross-validation calculation
    assertMeanSDCalculation(meanCol, stdCol, cvVals)
def partial_plot_test():
    """Partial-dependence checks with a weight column on the airlines data.

    The same CSV is imported twice (``data`` for training, ``test`` for the
    partial-dependence calls). A GBM is trained on every column except
    ``IsDepDelayed`` (the response), ``Weight`` and ``IsDepDelayed_REC``.
    The weighted partial-dependence tables for ``Input_miss`` and ``Distance``
    are then computed with and without ``include_na=True`` and checked:

      * the last grid value of each NA-including table must be NaN;
      * both variants must agree over the columns of the non-NA tables;
      * the NA-including tables must match ``pyunit_utils.compare_weightedStats``.
    """
    airlines_path = "smalldata/airlines/AirlinesTrainWgt.csv"
    pdp_columns = ("Input_miss", "Distance")

    # Import data set that contains NAs
    data = h2o.import_file(pyunit_utils.locate(airlines_path), na_strings=["NA"])
    test = h2o.import_file(pyunit_utils.locate(airlines_path), na_strings=["NA"])

    y = "IsDepDelayed"
    WC = "Weight"
    x = data.names
    data[y] = data[y]  # self-assignment kept exactly as in the original test
    for excluded in (y, "Weight", "IsDepDelayed_REC"):
        x.remove(excluded)

    # Build a GBM model predicting the response IsDepDelayed
    gbm_model = H2OGradientBoostingEstimator(ntrees=80, learn_rate=0.1, seed=12345)
    gbm_model.train(x=x, y=y, training_frame=data)

    # pdp with weight and no NA
    pdpw = gbm_model.partial_plot(data=test, cols=list(pdp_columns), server=True, plot=False,
                                  weight_column=WC)
    # pdp with weight and NA
    pdpwNA = gbm_model.partial_plot(data=test, cols=list(pdp_columns), server=True, plot=False,
                                    weight_column=WC, include_na=True)

    input_miss_list = pyunit_utils.extract_col_value_H2OTwoDimTable(pdpwNA[0], "input_miss")
    assert math.isnan(input_miss_list[-1]), "Expected last element to be nan but is not."
    distance_list = pyunit_utils.extract_col_value_H2OTwoDimTable(pdpwNA[1], "distance")
    assert math.isnan(distance_list[-1]), "Expected last element to be nan but is not."

    # compare pdpw with pdpwNA: they should agree up to the NA row, because
    # pdpw does not contain NAs.
    for idx in range(2):
        pyunit_utils.assert_H2OTwoDimTable_equal_upto(
            pdpw[idx], pdpwNA[idx], pdpw[idx].col_header, tolerance=1e-10)

    # compare pdpwNA with theoretical results
    pyunit_utils.compare_weightedStats(
        gbm_model, test, input_miss_list, "Input_miss",
        test[WC].as_data_frame(use_pandas=False, header=False), pdpwNA[0], tol=1e-10)
    pyunit_utils.compare_weightedStats(
        gbm_model, test, distance_list, "Distance",
        test[WC].as_data_frame(use_pandas=False, header=False), pdpwNA[1], tol=1e-10)
def test_anova_table_frame():
    """Check that the ANOVA GLM result frame matches the model summary table.

    Trains a gaussian ``H2OANOVAGLMEstimator`` on the Moore data set with
    predictors ``fcategory`` and ``partner.status`` and response
    ``conformity``. For every column of the frame returned by
    ``model.result()``, each row is compared with the same-named column of the
    model summary: numeric columns within an absolute tolerance of 1e-6, all
    other columns by equality.

    :raises AssertionError: when any cell of the result frame disagrees with
        the model summary.
    """
    train = h2o.import_file(path=pyunit_utils.locate("smalldata/anovaGlm/Moore.csv"))
    y = 'conformity'
    x = ['fcategory', 'partner.status']

    model = H2OANOVAGLMEstimator(family='gaussian', lambda_=0, save_transformed_framekeys=True)
    model.train(x=x, y=y, training_frame=train)
    anova_table = model.result()

    # compare model summary and anova table frame
    model_summary = model._model_json["output"]["model_summary"]
    row_count = anova_table.nrow
    for name in anova_table.names:
        summaryCol = pyunit_utils.extract_col_value_H2OTwoDimTable(model_summary, name)
        frame_col = anova_table[name]
        # The column type does not change from row to row, so query it once per
        # column instead of once per cell.
        is_numeric = frame_col.isnumeric()[0]
        for ind in range(row_count):
            expected = summaryCol[ind]
            # Read the cell once and reuse it for the check and the message.
            actual = frame_col[ind, 0]
            if is_numeric:
                assert abs(expected - actual) < 1e-6, \
                    "expected value: {0}, actual value: {1} and they" \
                    " are different.".format(expected, actual)
            else:
                assert expected == actual, \
                    "expected value: {0}, actual value: {1} and they are" \
                    " different.".format(expected, actual)
def partial_plot_test():
    """Verify weighted partial-dependence output, with and without NA rows.

    Uses the airlines training CSV, loaded once as the training frame and once
    as the frame passed to ``partial_plot``. The GBM response is
    ``IsDepDelayed``; ``Weight`` and ``IsDepDelayed_REC`` are excluded from the
    predictors and ``Weight`` serves as the weight column. The checks are:

      * the last grid entry of each ``include_na=True`` table is NaN;
      * tables with and without NA agree over the non-NA tables' columns
        (tolerance 1e-10);
      * the NA-including tables agree with
        ``pyunit_utils.compare_weightedStats`` (tolerance 1e-10).
    """

    def load_airlines():
        # Import data set that contains NAs
        return h2o.import_file(
            pyunit_utils.locate("smalldata/airlines/AirlinesTrainWgt.csv"),
            na_strings=["NA"])

    data = load_airlines()
    test = load_airlines()

    x = data.names
    y = "IsDepDelayed"
    data[y] = data[y]  # self-assignment kept exactly as in the original test
    x.remove(y)
    x.remove("Weight")
    x.remove("IsDepDelayed_REC")
    WC = "Weight"

    # Build a GBM model predicting the response IsDepDelayed
    gbm_model = H2OGradientBoostingEstimator(ntrees=80, learn_rate=0.1, seed=12345)
    gbm_model.train(x=x, y=y, training_frame=data)

    # pdp with weight and no NA
    pdpw = gbm_model.partial_plot(
        data=test, cols=["Input_miss", "Distance"], server=True, plot=False,
        weight_column=WC)
    # pdp with weight and NA
    pdpwNA = gbm_model.partial_plot(
        data=test, cols=["Input_miss", "Distance"], server=True, plot=False,
        weight_column=WC, include_na=True)

    nan_message = "Expected last element to be nan but is not."
    input_miss_list = pyunit_utils.extract_col_value_H2OTwoDimTable(pdpwNA[0], "input_miss")
    assert math.isnan(input_miss_list[-1]), nan_message
    distance_list = pyunit_utils.extract_col_value_H2OTwoDimTable(pdpwNA[1], "distance")
    assert math.isnan(distance_list[-1]), nan_message

    # compare pdpw with pdpwNA: they should be equal up to the NA row since
    # pdpw does not have NAs.
    for plain, with_na in ((pdpw[0], pdpwNA[0]), (pdpw[1], pdpwNA[1])):
        pyunit_utils.assert_H2OTwoDimTable_equal_upto(
            plain, with_na, plain.col_header, tolerance=1e-10)

    # compare pdpwNA with theoretical results; the weight column is re-read for
    # each call, as in the original.
    pyunit_utils.compare_weightedStats(
        gbm_model, test, input_miss_list, "Input_miss",
        test[WC].as_data_frame(use_pandas=False, header=False),
        pdpwNA[0], tol=1e-10)
    pyunit_utils.compare_weightedStats(
        gbm_model, test, distance_list, "Distance",
        test[WC].as_data_frame(use_pandas=False, header=False),
        pdpwNA[1], tol=1e-10)
def partial_plot_test():
    """Verify partial-dependence tables under constant and varying weights.

    On the prostate data set the test appends two weight columns: a constant
    3.0 (``constWeight``) and seeded random integers in 0..5 (``variWeight``).
    It trains a GBM with response ``CAPSULE`` and then checks that:

      * AGE/RACE tables computed with ``constWeight`` and ``include_na=True``
        equal the unweighted tables over the unweighted tables' columns;
      * tables computed with ``variWeight`` and ``include_na=True`` agree with
        the manual statistics checked by ``compare_weightedStats``.
    """
    prostate_csv = 'smalldata/prostate/prostate_cat_NA.csv'

    # Import data set that contains NAs
    data = h2o.import_file(pyunit_utils.locate(prostate_csv))
    x = data.names
    y = 'CAPSULE'
    x.remove(y)

    row_total = data.nrow
    weights = h2o.H2OFrame([3.0] * row_total)

    # Seeded random integer weights, one per row.
    random.seed(12345)
    tweight2 = [random.randint(0, 5) for _ in range(data.nrow)]
    weights2 = h2o.H2OFrame(tweight2)

    data = data.cbind(weights)
    data = data.cbind(weights2)
    data.set_name(data.ncol - 2, "constWeight")
    data.set_name(data.ncol - 1, "variWeight")

    # Build a GBM model predicting the response CAPSULE
    gbm_model = H2OGradientBoostingEstimator(ntrees=50, learn_rate=0.05, seed=12345)
    gbm_model.train(x=x, y=y, training_frame=data)

    def pdp(**extra):
        # Run partial_plot on AGE and RACE with the arguments shared by all
        # three calls, plus any weight / NA options.
        return gbm_model.partial_plot(data=data, cols=['AGE', 'RACE'], server=True,
                                      plot=True, **extra)

    # pdp without weight or NA
    pdpOrig = pdp()
    # pdp with constant weight and NA
    pdpcWNA = pdp(weight_column="constWeight", include_na=True)

    # compare results
    for plain, weighted in ((pdpOrig[0], pdpcWNA[0]), (pdpOrig[1], pdpcWNA[1])):
        pyunit_utils.assert_H2OTwoDimTable_equal_upto(
            plain, weighted, plain.col_header, tolerance=1e-10)

    # pdp with changing weight and NA
    pdpvWNA = pdp(weight_column="variWeight", include_na=True)
    ageList = pyunit_utils.extract_col_value_H2OTwoDimTable(pdpvWNA[0], "age")
    raceList = pyunit_utils.extract_col_value_H2OTwoDimTable(pdpvWNA[1], "race")

    # Remove the first entry equal to raceList[2], then append the RACE value
    # of row 21 (the original comment calls it the "NA word"; presumably that
    # cell is missing in the data).
    third_race = raceList[2]
    raceList.remove(third_race)
    raceList.append(data[21, "RACE"])
    # Replace the last age entry with a Python NaN.
    ageList[-1] = float('nan')

    for column, grid, table in (("RACE", raceList, pdpvWNA[1]),
                                ("AGE", ageList, pdpvWNA[0])):
        compare_weightedStats(gbm_model, prostate_csv, grid, column, tweight2, table,
                              tol=1e-10)