def comparison_test():
    """Compare H2O XGBoost predictions with native XGBoost on the adult dataset.

    Trains an H2O XGBoost model, converts its parameters and training frame into
    native XGBoost equivalents via the H2O conversion helpers, trains a native
    model, and checks both prediction sets agree within 1e-10.
    Runs only on python3 and only on a single-node cluster.
    """
    # Native XGBoost comparison requires python3.
    if sys.version.startswith("2"):
        print("native XGBoost tests only supported on python3")
        return
    import xgboost as xgb
    assert H2OXGBoostEstimator.available() is True
    ret = h2o.cluster()
    # Results are only comparable on a single-node cluster.
    if len(ret.nodes) == 1:
        data = h2o.import_file(pyunit_utils.locate("smalldata/jira/adult_data_modified.csv"))
        data[14] = data[14].asfactor()  # response column must be categorical for binomial
        myX = list(range(0, 13))  # use column indices
        print(myX)
        y = 'income'
        # CPU backend forced so H2O and native results are comparable.
        h2oParamsD = {"ntrees": 30, "max_depth": 4, "seed": 2, "learn_rate": 0.7,
                      "col_sample_rate_per_tree": 0.9, "min_rows": 5,
                      "score_tree_interval": 30 + 1, "tree_method": "exact", "backend": "cpu"}
        h2oModelD = H2OXGBoostEstimator(**h2oParamsD)
        # gather, print and save performance numbers for h2o model
        h2oModelD.train(x=myX, y=y, training_frame=data)
        h2oPredictD = h2oModelD.predict(data)
        # Convert H2O params/frame to native XGBoost form and train the native model.
        nativeXGBoostParam = h2oModelD.convert_H2OXGBoostParams_2_XGBoostParams()
        nativeXGBoostInput = data.convert_H2OFrame_2_DMatrix(myX, y, h2oModelD)
        nativeModel = xgb.train(params=nativeXGBoostParam[0],
                                dtrain=nativeXGBoostInput,
                                num_boost_round=nativeXGBoostParam[1])
        nativePred = nativeModel.predict(data=nativeXGBoostInput, ntree_limit=nativeXGBoostParam[1])
        # -1 timing arguments: timings are not collected in this variant.
        pyunit_utils.summarizeResult_binomial(h2oPredictD, nativePred, -1, -1, -1, -1, tolerance=1e-10)
    else:
        print("******** Test skipped. This test cannot be performed in multinode environment.")
def comparison_test():
    """Compare H2O XGBoost predictions with native XGBoost on the adult dataset.

    Builds an H2O XGBoost model, mirrors its configuration in native XGBoost
    using the H2O conversion helpers, and asserts both models produce the same
    predictions within 1e-10.  Skipped on multinode clusters because the
    results would not be comparable.
    """
    assert H2OXGBoostEstimator.available() is True
    cluster = h2o.cluster()
    if len(cluster.nodes) != 1:
        print("******** Test skipped. This test cannot be performed in multinode environment.")
        return

    frame = h2o.import_file(pyunit_utils.locate("smalldata/jira/adult_data_modified.csv"))
    frame[14] = frame[14].asfactor()  # response must be categorical for binomial
    predictors = list(range(0, 13))  # use column indices
    print(predictors)
    response = 'income'
    # CPU backend forced so H2O and native results are comparable.
    h2o_params = {"ntrees": 30, "max_depth": 4, "seed": 2, "learn_rate": 0.7,
                  "col_sample_rate_per_tree": 0.9, "min_rows": 5,
                  "score_tree_interval": 30 + 1, "tree_method": "exact",
                  "backend": "cpu"}
    h2o_model = H2OXGBoostEstimator(**h2o_params)
    # gather, print and save performance numbers for h2o model
    h2o_model.train(x=predictors, y=response, training_frame=frame)
    h2o_predictions = h2o_model.predict(frame)

    # Mirror the H2O setup in native XGBoost and score on the same data.
    converted = h2o_model.convert_H2OXGBoostParams_2_XGBoostParams()
    native_dmatrix = frame.convert_H2OFrame_2_DMatrix(predictors, response, h2o_model)
    native_model = xgb.train(params=converted[0],
                             dtrain=native_dmatrix,
                             num_boost_round=converted[1])
    native_predictions = native_model.predict(data=native_dmatrix, ntree_limit=converted[1])
    # -1 timing arguments: timings are not collected in this variant.
    pyunit_utils.summarizeResult_binomial(h2o_predictions, native_predictions,
                                          -1, -1, -1, -1, tolerance=1e-10)
def comparison_test_dense():
    """Compare H2O XGBoost with native XGBoost on a generated mixed-type frame.

    Trains both models with equivalent parameters on the same synthetic frame
    (sparse DMatrix on the native side), then compares predictions along with
    train/score timings via ``summarizeResult_binomial``.

    NOTE(review): despite the "dense" in the name, this variant sets
    "dmatrix_type": "sparse" and uses the sparse DMatrix conversion — confirm
    whether that is intentional.
    """
    assert H2OXGBoostEstimator.available() is True
    runSeed = 1
    testTol = 1e-6
    ntrees = 17
    maxdepth = 5
    # CPU Backend is forced for the results to be comparable
    h2oParamsD = {"ntrees": ntrees, "max_depth": maxdepth, "seed": runSeed, "learn_rate": 0.7,
                  "col_sample_rate_per_tree": 0.9, "min_rows": 5, "score_tree_interval": ntrees + 1,
                  "dmatrix_type": "sparse", "tree_method": "exact", "backend": "cpu"}
    # Native parameters mirroring the H2O ones above.
    nativeParam = {'colsample_bytree': h2oParamsD["col_sample_rate_per_tree"], 'tree_method': 'exact',
                   'seed': h2oParamsD["seed"], 'booster': 'gbtree', 'objective': 'binary:logistic',
                   'lambda': 0.0, 'eta': h2oParamsD["learn_rate"], 'grow_policy': 'depthwise',
                   'alpha': 0.0, 'subsample': 1.0, 'colsample_bylevel': 1.0, 'max_delta_step': 0.0,
                   'min_child_weight': h2oParamsD["min_rows"], 'gamma': 0.0,
                   'max_depth': h2oParamsD["max_depth"]}
    nrows = 10000
    ncols = 10
    factorL = 11       # levels per enum column
    numCols = 5        # numeric columns
    enumCols = ncols - numCols
    responseL = 2      # binomial response
    trainFile = pyunit_utils.genTrainFrame(nrows, numCols, enumCols=enumCols, enumFactors=factorL,
                                           miscfrac=0.5, responseLevel=responseL)
    myX = trainFile.names
    y = 'response'
    myX.remove(y)
    enumCols = myX[0:enumCols]  # leading columns of the generated frame are the enums
    h2oModelD = H2OXGBoostEstimator(**h2oParamsD)
    # gather, print and save performance numbers for h2o model
    h2oModelD.train(x=myX, y=y, training_frame=trainFile)
    h2oTrainTimeD = h2oModelD._model_json["output"]["run_time"]
    time1 = time.time()
    h2oPredictD = h2oModelD.predict(trainFile)
    h2oPredictTimeD = time.time() - time1
    # train the native XGBoost
    # BUGFIX: restart the clock so the native train time does not include the
    # H2O predict measured above.
    time1 = time.time()
    nativeTrain = pyunit_utils.convertH2OFrameToDMatrixSparse(trainFile, y, enumCols=enumCols)
    nativeModel = xgb.train(params=nativeParam, dtrain=nativeTrain, num_boost_round=ntrees)
    nativeTrainTime = time.time() - time1
    time1 = time.time()
    nativePred = nativeModel.predict(data=nativeTrain, ntree_limit=ntrees)
    nativeScoreTime = time.time() - time1
    pyunit_utils.summarizeResult_binomial(h2oPredictD, nativePred, h2oTrainTimeD, nativeTrainTime,
                                          h2oPredictTimeD, nativeScoreTime, tolerance=testTol)
def comparison_test_dense():
    """Compare H2O XGBoost with native XGBoost using a dense DMatrix.

    Generates a mixed numeric/enum frame, trains an H2O model and an equivalent
    native XGBoost model on a dense DMatrix, prints the H2O training
    classification error and the native model dump, and compares predictions
    plus train/score timings.  Skipped on multinode clusters.
    """
    assert H2OXGBoostEstimator.available() is True
    ret = h2o.cluster()
    if len(ret.nodes) == 1:
        runSeed = 1
        dataSeed = 17
        testTol = 1e-6
        ntrees = 17
        maxdepth = 5
        # CPU Backend is forced for the results to be comparable
        h2oParamsD = {"ntrees": ntrees, "max_depth": maxdepth, "seed": runSeed, "learn_rate": 0.7,
                      "col_sample_rate_per_tree": 0.9, "min_rows": 5,
                      "score_tree_interval": ntrees + 1, "dmatrix_type": "dense",
                      "tree_method": "exact", "backend": "cpu"}
        # Native parameters mirroring the H2O ones above.
        nativeParam = {'colsample_bytree': h2oParamsD["col_sample_rate_per_tree"],
                       'tree_method': 'exact', 'seed': h2oParamsD["seed"], 'booster': 'gbtree',
                       'objective': 'binary:logistic', 'lambda': 0.0,
                       'eta': h2oParamsD["learn_rate"], 'grow_policy': 'depthwise', 'alpha': 0.0,
                       'subsample': 1.0, 'colsample_bylevel': 1.0, 'max_delta_step': 0.0,
                       'min_child_weight': h2oParamsD["min_rows"], 'gamma': 0.0,
                       'max_depth': h2oParamsD["max_depth"]}
        nrows = 10000
        ncols = 20
        factorL = 20     # levels per enum column
        numCols = 10     # numeric columns
        enumCols = ncols - numCols
        trainFile = pyunit_utils.genTrainFrame(nrows, numCols, enumCols=enumCols,
                                               enumFactors=factorL, miscfrac=0.01,
                                               randseed=dataSeed)  # load in dataset and add response column
        myX = trainFile.names
        y = 'response'
        myX.remove(y)
        enumCols = myX[0:enumCols]  # leading columns of the generated frame are the enums
        h2oModelD = H2OXGBoostEstimator(**h2oParamsD)
        # gather, print and save performance numbers for h2o model
        h2oModelD.train(x=myX, y=y, training_frame=trainFile)
        h2oTrainTimeD = h2oModelD._model_json["output"]["run_time"]
        t1Array = h2oModelD._model_json["output"]["scoring_history"]
        print("classifier error: {0}".format(
            t1Array._cell_values[len(t1Array._cell_values) - 1][
                t1Array._col_header.index("training_classification_error")]))
        time1 = time.time()
        h2oPredictD = h2oModelD.predict(trainFile)
        h2oPredictTimeD = time.time() - time1
        # train the native XGBoost
        nrounds = ntrees
        # BUGFIX: restart the clock so the native train time does not include
        # the H2O predict measured above.
        time1 = time.time()
        nativeTrain = pyunit_utils.convertH2OFrameToDMatrix(trainFile, y, enumCols=enumCols)
        nativeModel = xgb.train(params=nativeParam, dtrain=nativeTrain, num_boost_round=nrounds)
        modelInfo = nativeModel.get_dump()
        print(modelInfo)
        print("num_boost_round: {1}, Number of trees built: {0}".format(len(modelInfo), nrounds))
        nativeTrainTime = time.time() - time1
        time1 = time.time()
        nativePred = nativeModel.predict(data=nativeTrain, ntree_limit=ntrees)
        nativeScoreTime = time.time() - time1
        pyunit_utils.summarizeResult_binomial(h2oPredictD, nativePred, h2oTrainTimeD,
                                              nativeTrainTime, h2oPredictTimeD, nativeScoreTime,
                                              tolerance=testTol)
    else:
        print("******** Test skipped. This test cannot be performed in multinode environment.")
def comparison_test():
    """Compare H2O XGBoost with native XGBoost on a purely numeric sparse frame.

    Generates a numeric-only frame with 50% missing values, trains an H2O model
    and an equivalent native model on a sparse DMatrix, retraining until the
    native booster contains at least ``ntrees`` trees, then compares
    predictions and timings.  Skipped on multinode clusters.
    """
    assert H2OXGBoostEstimator.available() is True
    ret = h2o.cluster()
    if len(ret.nodes) == 1:
        runSeed = 1
        dataSeed = 17
        ntrees = 17
        # CPU backend is forced for the results to be comparable.
        h2oParamsS = {"ntrees": ntrees, "max_depth": 4, "seed": runSeed, "learn_rate": 0.7,
                      "col_sample_rate_per_tree": 0.9, "min_rows": 5,
                      "score_tree_interval": ntrees + 1, "dmatrix_type": "sparse",
                      "tree_method": "exact", "backend": "cpu"}
        # Native parameters mirroring the H2O ones above.
        nativeParam = {'colsample_bytree': h2oParamsS["col_sample_rate_per_tree"],
                       'tree_method': 'exact', 'seed': h2oParamsS["seed"], 'booster': 'gbtree',
                       'objective': 'binary:logistic', 'lambda': 0.0,
                       'eta': h2oParamsS["learn_rate"], 'grow_policy': 'depthwise', 'alpha': 0.0,
                       'subsample': 1.0, 'colsample_bylevel': 1.0, 'max_delta_step': 0.0,
                       'min_child_weight': h2oParamsS["min_rows"], 'gamma': 0.0,
                       'max_depth': h2oParamsS["max_depth"]}
        nrows = 10000
        ncols = 11
        factorL = 0
        numCols = 11
        enumCols = ncols - numCols  # 0 -> purely numeric frame
        trainFile = pyunit_utils.genTrainFrame(nrows, numCols, enumCols=enumCols,
                                               enumFactors=factorL, miscfrac=0.5,
                                               randseed=dataSeed)
        print(trainFile)
        myX = trainFile.names
        y = 'response'
        # NOTE(review): y is not removed from myX here (unlike sibling tests) —
        # presumably H2O excludes the response from x automatically; confirm.
        h2oModelS = H2OXGBoostEstimator(**h2oParamsS)
        # gather, print and save performance numbers for h2o model
        h2oModelS.train(x=myX, y=y, training_frame=trainFile)
        h2oTrainTimeS = h2oModelS._model_json["output"]["run_time"]
        time1 = time.time()
        h2oPredictS = h2oModelS.predict(trainFile)
        h2oPredictTimeS = time.time() - time1
        # train the native XGBoost
        # BUGFIX: restart the clock so the native train time does not include
        # the H2O predict measured above.
        time1 = time.time()
        nativeTrain = pyunit_utils.convertH2OFrameToDMatrixSparse(trainFile, y, enumCols=[])
        nrounds = ntrees
        nativeModel = xgb.train(params=nativeParam, dtrain=nativeTrain, num_boost_round=nrounds)
        modelsfound = False
        while not modelsfound:  # loop to make sure accurate number of trees are built
            modelInfo = nativeModel.get_dump()
            print(modelInfo)
            print("num_boost_round: {1}, Number of trees built: {0}".format(len(modelInfo), nrounds))
            if len(modelInfo) >= ntrees:
                modelsfound = True
            else:
                nrounds = nrounds + 1
                nativeModel = xgb.train(params=nativeParam, dtrain=nativeTrain,
                                        num_boost_round=nrounds)
        nativeTrainTime = time.time() - time1
        time1 = time.time()
        nativePred = nativeModel.predict(data=nativeTrain, ntree_limit=ntrees)
        nativeScoreTime = time.time() - time1
        print("Comparing H2OXGBoost results with native XGBoost result when DMatrix is set to sparse.....")
        pyunit_utils.summarizeResult_binomial(h2oPredictS, nativePred, h2oTrainTimeS,
                                              nativeTrainTime, h2oPredictTimeS, nativeScoreTime,
                                              tolerance=1e-6)
    else:
        print("******** Test skipped. This test cannot be performed in multinode environment.")
def comparison_test_dense():
    """Compare H2O XGBoost with native XGBoost using a dense DMatrix.

    Generates a mixed numeric/enum frame, trains an H2O model and an equivalent
    native model (dense DMatrix), prints the H2O training classification error
    and the native model dump, then compares predictions and train/score
    timings.  Skipped on multinode clusters.
    """
    assert H2OXGBoostEstimator.available() is True
    ret = h2o.cluster()
    if len(ret.nodes) == 1:
        runSeed = 1
        dataSeed = 17
        testTol = 1e-6
        ntrees = 17
        maxdepth = 5
        # CPU Backend is forced for the results to be comparable
        h2oParamsD = {"ntrees": ntrees, "max_depth": maxdepth, "seed": runSeed,
                      "learn_rate": 0.7, "col_sample_rate_per_tree": 0.9, "min_rows": 5,
                      "score_tree_interval": ntrees + 1, "dmatrix_type": "dense",
                      "tree_method": "exact", "backend": "cpu"}
        # Native parameters mirroring the H2O ones above.
        nativeParam = {'colsample_bytree': h2oParamsD["col_sample_rate_per_tree"],
                       'tree_method': 'exact', 'seed': h2oParamsD["seed"], 'booster': 'gbtree',
                       'objective': 'binary:logistic', 'eta': h2oParamsD["learn_rate"],
                       'grow_policy': 'depthwise', 'alpha': 0.0, 'subsample': 1.0,
                       'colsample_bylevel': 1.0, 'max_delta_step': 0.0,
                       'min_child_weight': h2oParamsD["min_rows"], 'gamma': 0.0,
                       'max_depth': h2oParamsD["max_depth"]}
        nrows = 10000
        ncols = 20
        factorL = 20     # levels per enum column
        numCols = 10     # numeric columns
        enumCols = ncols - numCols
        trainFile = pyunit_utils.genTrainFrame(nrows, numCols, enumCols=enumCols,
                                               enumFactors=factorL, miscfrac=0.01,
                                               randseed=dataSeed)  # load in dataset and add response column
        myX = trainFile.names
        y = 'response'
        myX.remove(y)
        enumCols = myX[0:enumCols]  # leading columns of the generated frame are the enums
        h2oModelD = H2OXGBoostEstimator(**h2oParamsD)
        # gather, print and save performance numbers for h2o model
        h2oModelD.train(x=myX, y=y, training_frame=trainFile)
        h2oTrainTimeD = h2oModelD._model_json["output"]["run_time"]
        t1Array = h2oModelD._model_json["output"]["scoring_history"]
        print("classifier error: {0}".format(
            t1Array._cell_values[len(t1Array._cell_values) - 1][
                t1Array._col_header.index("training_classification_error")]))
        time1 = time.time()
        h2oPredictD = h2oModelD.predict(trainFile)
        h2oPredictTimeD = time.time() - time1
        # train the native XGBoost
        nrounds = ntrees
        # BUGFIX: restart the clock so the native train time does not include
        # the H2O predict measured above.
        time1 = time.time()
        nativeTrain = pyunit_utils.convertH2OFrameToDMatrix(trainFile, y, enumCols=enumCols)
        nativeModel = xgb.train(params=nativeParam, dtrain=nativeTrain, num_boost_round=nrounds)
        modelInfo = nativeModel.get_dump()
        print(modelInfo)
        print("num_boost_round: {1}, Number of trees built: {0}".format(len(modelInfo), nrounds))
        nativeTrainTime = time.time() - time1
        time1 = time.time()
        nativePred = nativeModel.predict(data=nativeTrain, ntree_limit=ntrees)
        nativeScoreTime = time.time() - time1
        pyunit_utils.summarizeResult_binomial(h2oPredictD, nativePred, h2oTrainTimeD,
                                              nativeTrainTime, h2oPredictTimeD, nativeScoreTime,
                                              tolerance=testTol)
    else:
        print("******** Test skipped. This test cannot be performed in multinode environment.")
def comparison_test():
    """Compare H2O XGBoost with native XGBoost on an enum-only sparse frame.

    Generates a frame of only enum predictors, trains an H2O model and an
    equivalent native model on a sparse DMatrix, prints the native model dump,
    and compares predictions and train/score timings.  Skipped on multinode
    clusters.
    """
    assert H2OXGBoostEstimator.available() is True
    ret = h2o.cluster()
    if len(ret.nodes) == 1:
        runSeed = 1
        dataSeed = 17
        ntrees = 17
        # CPU Backend is forced for the results to be comparable
        h2oParamsS = {"ntrees": ntrees, "max_depth": 4, "seed": runSeed, "learn_rate": 0.7,
                      "col_sample_rate_per_tree": 0.9, "min_rows": 5,
                      "score_tree_interval": ntrees + 1, "dmatrix_type": "sparse",
                      "tree_method": "exact", "backend": "cpu"}
        # Native parameters mirroring the H2O ones above.
        nativeParam = {'colsample_bytree': h2oParamsS["col_sample_rate_per_tree"],
                       'tree_method': 'exact', 'seed': h2oParamsS["seed"], 'booster': 'gbtree',
                       'objective': 'binary:logistic', 'eta': h2oParamsS["learn_rate"],
                       'grow_policy': 'depthwise', 'alpha': 0.0, 'subsample': 1.0,
                       'colsample_bylevel': 1.0, 'max_delta_step': 0.0,
                       'min_child_weight': h2oParamsS["min_rows"], 'gamma': 0.0,
                       'max_depth': h2oParamsS["max_depth"]}
        nrows = 10000
        ncols = 10
        factorL = 11
        numCols = 0                 # enum-only frame: no numeric predictors
        enumCols = ncols - numCols
        trainFile = pyunit_utils.genTrainFrame(nrows, 0, enumCols=enumCols, enumFactors=factorL,
                                               miscfrac=0.1,
                                               randseed=dataSeed)  # load in dataset and add response column
        print(trainFile)
        myX = trainFile.names
        y = 'response'
        myX.remove(y)
        # Convert up front: every predictor column is an enum here.
        nativeTrain = pyunit_utils.convertH2OFrameToDMatrixSparse(trainFile, y, enumCols=myX)
        h2oModelS = H2OXGBoostEstimator(**h2oParamsS)
        # gather, print and save performance numbers for h2o model
        h2oModelS.train(x=myX, y=y, training_frame=trainFile)
        h2oTrainTimeS = h2oModelS._model_json["output"]["run_time"]
        time1 = time.time()
        h2oPredictS = h2oModelS.predict(trainFile)
        h2oPredictTimeS = time.time() - time1
        # train the native XGBoost
        nrounds = ntrees
        # BUGFIX: restart the clock so the native train time does not include
        # the H2O predict measured above.
        time1 = time.time()
        nativeModel = xgb.train(params=nativeParam, dtrain=nativeTrain, num_boost_round=nrounds)
        modelInfo = nativeModel.get_dump()
        print(modelInfo)
        print("num_boost_round: {1}, Number of trees built: {0}".format(len(modelInfo), nrounds))
        nativeTrainTime = time.time() - time1
        time1 = time.time()
        nativePred = nativeModel.predict(data=nativeTrain, ntree_limit=ntrees)
        nativeScoreTime = time.time() - time1
        print("Comparing H2OXGBoost results with native XGBoost result when DMatrix is set to sparse.....")
        pyunit_utils.summarizeResult_binomial(h2oPredictS, nativePred, h2oTrainTimeS,
                                              nativeTrainTime, h2oPredictTimeS, nativeScoreTime,
                                              tolerance=1e-6)
    else:
        print("******** Test skipped. This test cannot be performed in multinode environment.")
def comparison_test():
    """Compare H2O XGBoost predictions AND training metrics with native XGBoost.

    Generates an enum-only sparse frame, trains an H2O model and an equivalent
    native model (with auc/aucpr eval metrics on a train watch list), then
    compares predictions (tolerance 1e-6) and the final AUC / PR-AUC metrics
    (tolerance 1e-3).  Runs only on python3 and a single-node cluster.
    """
    # Native XGBoost comparison requires python3.
    if sys.version.startswith("2"):
        print("native XGBoost tests only supported on python3")
        return
    import xgboost as xgb
    assert H2OXGBoostEstimator.available() is True
    ret = h2o.cluster()
    if len(ret.nodes) == 1:
        runSeed = 1
        dataSeed = 17
        ntrees = 17
        # CPU backend is forced for the results to be comparable.
        h2oParamsS = {"ntrees": ntrees, "max_depth": 4, "seed": runSeed, "learn_rate": 0.7,
                      "col_sample_rate_per_tree": 0.9, "min_rows": 5,
                      "score_tree_interval": ntrees + 1, "dmatrix_type": "sparse",
                      "tree_method": "exact", "backend": "cpu"}
        # Native parameters mirroring the H2O ones above, plus the eval metrics
        # gathered during training for the metric comparison below.
        nativeParam = {'colsample_bytree': h2oParamsS["col_sample_rate_per_tree"],
                       'tree_method': 'exact', 'seed': h2oParamsS["seed"], 'booster': 'gbtree',
                       'objective': 'binary:logistic', 'eta': h2oParamsS["learn_rate"],
                       'grow_policy': 'depthwise', 'alpha': 0.0, 'subsample': 1.0,
                       'colsample_bylevel': 1.0, 'max_delta_step': 0.0,
                       'min_child_weight': h2oParamsS["min_rows"], 'gamma': 0.0,
                       'max_depth': h2oParamsS["max_depth"],
                       'eval_metric': ['auc', 'aucpr']}
        nrows = 10000
        ncols = 10
        factorL = 11
        numCols = 0                 # enum-only frame: no numeric predictors
        enumCols = ncols - numCols
        trainFile = pyunit_utils.genTrainFrame(nrows, 0, enumCols=enumCols, enumFactors=factorL,
                                               miscfrac=0.1,
                                               randseed=dataSeed)  # load in dataset and add response column
        print(trainFile)
        myX = trainFile.names
        y = 'response'
        myX.remove(y)
        # Convert up front: every predictor column is an enum here.
        nativeTrain = pyunit_utils.convertH2OFrameToDMatrixSparse(trainFile, y, enumCols=myX)
        h2oModelS = H2OXGBoostEstimator(**h2oParamsS)
        # gather, print and save performance numbers for h2o model
        h2oModelS.train(x=myX, y=y, training_frame=trainFile)
        h2oTrainTimeS = h2oModelS._model_json["output"]["run_time"]
        time1 = time.time()
        h2oPredictS = h2oModelS.predict(trainFile)
        h2oPredictTimeS = time.time() - time1
        # train the native XGBoost
        nrounds = ntrees
        evals_result = {}
        watch_list = [(nativeTrain, 'train')]
        # BUGFIX: restart the clock so the native train time does not include
        # the H2O predict measured above.
        time1 = time.time()
        nativeModel = xgb.train(params=nativeParam, dtrain=nativeTrain, num_boost_round=nrounds,
                                evals=watch_list, verbose_eval=True, evals_result=evals_result)
        modelInfo = nativeModel.get_dump()
        print(modelInfo)
        print("num_boost_round: {1}, Number of trees built: {0}".format(len(modelInfo), nrounds))
        nativeTrainTime = time.time() - time1
        time1 = time.time()
        nativePred = nativeModel.predict(data=nativeTrain, ntree_limit=ntrees)
        nativeScoreTime = time.time() - time1
        print("Comparing H2OXGBoost results with native XGBoost result when DMatrix is set to sparse.....")
        pyunit_utils.summarizeResult_binomial(h2oPredictS, nativePred, h2oTrainTimeS,
                                              nativeTrainTime, h2oPredictTimeS, nativeScoreTime,
                                              tolerance=1e-6)
        print("Comparing H2OXGBoost metrics with native XGBoost metrics when DMatrix is set to sparse.....")
        h2o_metrics = [h2oModelS.training_model_metrics()["AUC"],
                       h2oModelS.training_model_metrics()["pr_auc"]]
        # Final-round metrics from the native training watch list.
        xgboost_metrics = [evals_result['train']['auc'][ntrees - 1],
                           evals_result['train']['aucpr'][ntrees - 1]]
        # TODO: less tolerance ?
        pyunit_utils.summarize_metrics_binomial(h2o_metrics, xgboost_metrics,
                                                ["auc", "aucpr"], tolerance=1e-3)
    else:
        print("******** Test skipped. This test cannot be performed in multinode environment.")
def comparison_test_dense():
    """Compare H2O XGBoost with native XGBoost on a generated mixed-type frame.

    Trains both models with equivalent parameters on the same synthetic frame
    (sparse DMatrix on the native side), retraining the native booster until it
    holds at least ``ntrees`` trees, then compares predictions and timings.
    Runs only on python3 and a single-node cluster.

    NOTE(review): despite the "dense" in the name, this variant sets
    "dmatrix_type": "sparse" and uses the sparse DMatrix conversion — confirm
    whether that is intentional.
    """
    # Native XGBoost comparison requires python3.
    if sys.version.startswith("2"):
        print("native XGBoost tests only supported on python3")
        return
    import xgboost as xgb
    assert H2OXGBoostEstimator.available() is True
    ret = h2o.cluster()
    if len(ret.nodes) == 1:
        runSeed = 1
        dataSeed = 17
        testTol = 1e-6
        ntrees = 17
        maxdepth = 5
        # CPU Backend is forced for the results to be comparable
        h2oParamsD = {"ntrees": ntrees, "max_depth": maxdepth, "seed": runSeed,
                      "learn_rate": 0.7, "col_sample_rate_per_tree": 0.9, "min_rows": 5,
                      "score_tree_interval": ntrees + 1, "dmatrix_type": "sparse",
                      "tree_method": "exact", "backend": "cpu"}
        # Native parameters mirroring the H2O ones above.
        nativeParam = {'colsample_bytree': h2oParamsD["col_sample_rate_per_tree"],
                       'tree_method': 'exact', 'seed': h2oParamsD["seed"], 'booster': 'gbtree',
                       'objective': 'binary:logistic', 'eta': h2oParamsD["learn_rate"],
                       'grow_policy': 'depthwise', 'alpha': 0.0, 'subsample': 1.0,
                       'colsample_bylevel': 1.0, 'max_delta_step': 0.0,
                       'min_child_weight': h2oParamsD["min_rows"], 'gamma': 0.0,
                       'max_depth': h2oParamsD["max_depth"]}
        nrows = 10000
        ncols = 10
        factorL = 11     # levels per enum column
        numCols = 5      # numeric columns
        enumCols = ncols - numCols
        responseL = 2    # binomial response
        trainFile = pyunit_utils.genTrainFrame(nrows, numCols, enumCols=enumCols,
                                               enumFactors=factorL, miscfrac=0.5,
                                               responseLevel=responseL, randseed=dataSeed)
        myX = trainFile.names
        y = 'response'
        myX.remove(y)
        enumCols = myX[0:enumCols]  # leading columns of the generated frame are the enums
        h2oModelD = H2OXGBoostEstimator(**h2oParamsD)
        # gather, print and save performance numbers for h2o model
        h2oModelD.train(x=myX, y=y, training_frame=trainFile)
        h2oTrainTimeD = h2oModelD._model_json["output"]["run_time"]
        time1 = time.time()
        h2oPredictD = h2oModelD.predict(trainFile)
        h2oPredictTimeD = time.time() - time1
        # train the native XGBoost
        # BUGFIX: restart the clock so the native train time does not include
        # the H2O predict measured above.
        time1 = time.time()
        nativeTrain = pyunit_utils.convertH2OFrameToDMatrixSparse(trainFile, y, enumCols=enumCols)
        nrounds = ntrees
        nativeModel = xgb.train(params=nativeParam, dtrain=nativeTrain, num_boost_round=nrounds)
        modelsfound = False
        while not modelsfound:  # loop to make sure accurate number of trees are built
            modelInfo = nativeModel.get_dump()
            print(modelInfo)
            print("num_boost_round: {1}, Number of trees built: {0}".format(len(modelInfo), nrounds))
            if len(modelInfo) >= ntrees:
                modelsfound = True
            else:
                nrounds = nrounds + 1
                nativeModel = xgb.train(params=nativeParam, dtrain=nativeTrain,
                                        num_boost_round=nrounds)
        nativeTrainTime = time.time() - time1
        time1 = time.time()
        nativePred = nativeModel.predict(data=nativeTrain, ntree_limit=ntrees)
        nativeScoreTime = time.time() - time1
        pyunit_utils.summarizeResult_binomial(h2oPredictD, nativePred, h2oTrainTimeD,
                                              nativeTrainTime, h2oPredictTimeD, nativeScoreTime,
                                              tolerance=testTol)
    else:
        print("******** Test skipped. This test cannot be performed in multinode environment.")