def comparison_test_dense():
    """Compare H2OXGBoost and native XGBoost predictions on a dense frame.

    Trains an ``H2OXGBoostEstimator`` (``dmatrix_type="dense"``, CPU backend,
    ``tree_method="exact"``) and a native ``xgb`` booster built from mirrored
    hyperparameters on the same frame from ``pyunit_utils.genTrainFrame``,
    then passes both prediction sets and the collected timings to
    ``pyunit_utils.summarizeResult_multinomial`` with tolerance ``testTol``.

    The native training timer is started immediately before ``xgb.train`` so
    it no longer includes the H2O scoring call or the DMatrix conversion.
    """
    assert H2OXGBoostEstimator.available() is True

    runSeed = 1
    testTol = 1e-6
    ntrees = 10
    maxdepth = 5
    nrows = 10000
    ncols = 10
    factorL = 20
    numCols = 5
    enumCols = ncols - numCols
    responseL = 4

    # CPU Backend is forced for the results to be comparable
    h2oParamsD = {
        "ntrees": ntrees,
        "max_depth": maxdepth,
        "seed": runSeed,
        "learn_rate": 0.7,
        "col_sample_rate_per_tree": 0.9,
        "min_rows": 5,
        "score_tree_interval": ntrees + 1,
        "dmatrix_type": "dense",
        "tree_method": "exact",
        "backend": "cpu",
    }
    # Native parameters are derived from the H2O ones so both stay in sync.
    nativeParam = {
        'colsample_bytree': h2oParamsD["col_sample_rate_per_tree"],
        'tree_method': 'exact',
        'seed': h2oParamsD["seed"],
        'booster': 'gbtree',
        'objective': 'multi:softprob',
        'lambda': 0.0,
        'eta': h2oParamsD["learn_rate"],
        'grow_policy': 'depthwise',
        'alpha': 0.0,
        'subsample': 1.0,
        'colsample_bylevel': 1.0,
        'max_delta_step': 0.0,
        'min_child_weight': h2oParamsD["min_rows"],
        'gamma': 0.0,
        'max_depth': h2oParamsD["max_depth"],
        'num_class': responseL,
    }

    trainFile = pyunit_utils.genTrainFrame(nrows, numCols, enumCols=enumCols, enumFactors=factorL,
                                           responseLevel=responseL, miscfrac=0.01)
    myX = trainFile.names
    y = 'response'
    myX.remove(y)
    # From here on enumCols holds column names (the first `enumCols` predictors),
    # not a count.
    enumCols = myX[0:enumCols]

    h2oModelD = H2OXGBoostEstimator(**h2oParamsD)
    # gather, print and save performance numbers for h2o model
    h2oModelD.train(x=myX, y=y, training_frame=trainFile)
    h2oTrainTimeD = h2oModelD._model_json["output"]["run_time"]
    time1 = time.time()
    h2oPredictD = h2oModelD.predict(trainFile)
    h2oPredictTimeD = time.time() - time1

    # train the native XGBoost
    nativeTrain = pyunit_utils.convertH2OFrameToDMatrix(trainFile, y, enumCols=enumCols)
    # Restart the clock so only xgb.train is measured.
    time1 = time.time()
    # Pass the round count explicitly instead of relying on xgb.train's
    # documented default (10) happening to equal ntrees.
    nativeModel = xgb.train(params=nativeParam, dtrain=nativeTrain, num_boost_round=ntrees)
    nativeTrainTime = time.time() - time1

    time1 = time.time()
    nativePred = nativeModel.predict(data=nativeTrain, ntree_limit=ntrees)
    nativeScoreTime = time.time() - time1

    pyunit_utils.summarizeResult_multinomial(h2oPredictD, nativePred, h2oTrainTimeD, nativeTrainTime,
                                             h2oPredictTimeD, nativeScoreTime, tolerance=testTol)
def comparison_test():
    """Check H2OXGBoost predictions against native XGBoost on one generated frame.

    The native parameters and DMatrix are derived from the trained H2O model
    and the H2OFrame themselves. The test returns early on Python 2 and when
    the H2O cluster has more than one node.
    """
    if sys.version.startswith("2"):
        print("native XGBoost tests only supported on python3")
        return

    import xgboost as xgb

    assert H2OXGBoostEstimator.available() is True

    cluster = h2o.cluster()
    if len(cluster.nodes) != 1:
        print("******** Test skipped. This test cannot be performed in multinode environment.")
        return

    model_seed = 1
    data_seed = 17
    n_trees = 17
    depth = 5
    n_rows = 10000
    n_cols = 12
    n_factors = 20
    n_numeric = 1
    n_enum = n_cols - n_numeric
    n_classes = 4

    # CPU Backend is forced for the results to be comparable
    h2o_params = {
        "ntrees": n_trees,
        "max_depth": depth,
        "seed": model_seed,
        "learn_rate": 0.7,
        "col_sample_rate_per_tree": 0.9,
        "min_rows": 5,
        "score_tree_interval": n_trees + 1,
        "tree_method": "exact",
        "backend": "cpu",
    }

    frame = pyunit_utils.genTrainFrame(n_rows, n_numeric, enumCols=n_enum, enumFactors=n_factors,
                                       responseLevel=n_classes, miscfrac=0.01, randseed=data_seed)
    predictors = frame.names
    response = 'response'
    predictors.remove(response)

    # Suffix every predictor with its position to avoid duplicated column names.
    for position, column in enumerate(predictors):
        predictors[position] = column + "_" + str(position)
    frame.set_names(predictors + [response])

    h2o_model = H2OXGBoostEstimator(**h2o_params)
    # gather, print and save performance numbers for h2o model
    h2o_model.train(x=predictors, y=response, training_frame=frame)
    h2o_pred = h2o_model.predict(frame)

    # derive native XGBoost parameters and DMatrix from the H2OXGBoost model and H2OFrame
    converted = h2o_model.convert_H2OXGBoostParams_2_XGBoostParams()
    native_params = converted[0]
    n_rounds = converted[1]
    native_input = frame.convert_H2OFrame_2_DMatrix(predictors, response, h2o_model)

    native_model = xgb.train(params=native_params, dtrain=native_input, num_boost_round=n_rounds)
    native_pred = native_model.predict(data=native_input, ntree_limit=n_rounds)

    # Timings are not collected in this test, hence the -1 placeholders.
    pyunit_utils.summarizeResult_multinomial(h2o_pred, native_pred, -1, -1, -1, -1, tolerance=1e-6)
def comparison_test():
    """Check H2OXGBoost predictions against native XGBoost (11 of 12 columns numeric).

    The native parameters and DMatrix are derived from the trained H2O model
    and the H2OFrame. The test is skipped with a message when the H2O cluster
    has more than one node.
    """
    assert H2OXGBoostEstimator.available() is True

    cluster = h2o.cluster()
    if len(cluster.nodes) != 1:
        print("******** Test skipped. This test cannot be performed in multinode environment.")
        return

    model_seed = 1
    data_seed = 17
    n_trees = 17
    depth = 5
    n_rows = 10000
    n_cols = 12
    n_factors = 20
    n_numeric = 11
    n_enum = n_cols - n_numeric
    n_classes = 4

    # CPU Backend is forced for the results to be comparable
    h2o_params = {
        "ntrees": n_trees,
        "max_depth": depth,
        "seed": model_seed,
        "learn_rate": 0.7,
        "col_sample_rate_per_tree": 0.9,
        "min_rows": 5,
        "score_tree_interval": n_trees + 1,
        "tree_method": "exact",
        "backend": "cpu",
    }

    frame = pyunit_utils.genTrainFrame(n_rows, n_numeric, enumCols=n_enum, enumFactors=n_factors,
                                       responseLevel=n_classes, miscfrac=0.01, randseed=data_seed)
    predictors = frame.names
    response = 'response'
    predictors.remove(response)

    # Suffix every predictor with its position to avoid duplicated column names.
    for position, column in enumerate(predictors):
        predictors[position] = column + "_" + str(position)
    frame.set_names(predictors + [response])

    h2o_model = H2OXGBoostEstimator(**h2o_params)
    # gather, print and save performance numbers for h2o model
    h2o_model.train(x=predictors, y=response, training_frame=frame)
    h2o_pred = h2o_model.predict(frame)

    # derive native XGBoost parameters and DMatrix from the H2OXGBoost model and H2OFrame
    converted = h2o_model.convert_H2OXGBoostParams_2_XGBoostParams()
    native_params = converted[0]
    n_rounds = converted[1]
    native_input = frame.convert_H2OFrame_2_DMatrix(predictors, response, h2o_model)

    native_model = xgb.train(params=native_params, dtrain=native_input, num_boost_round=n_rounds)
    native_pred = native_model.predict(data=native_input, ntree_limit=n_rounds)

    # Timings are not collected in this test, hence the -1 placeholders.
    pyunit_utils.summarizeResult_multinomial(h2o_pred, native_pred, -1, -1, -1, -1, tolerance=1e-6)
def comparison_test():
    """Compare H2OXGBoost and native XGBoost predictions with a sparse DMatrix.

    Trains an ``H2OXGBoostEstimator`` (``dmatrix_type="sparse"``, CPU backend,
    ``tree_method="exact"``) and a native ``xgb`` booster built from mirrored
    hyperparameters on an all-enum frame from ``pyunit_utils.genTrainFrame``,
    then passes both prediction sets and the collected timings to
    ``pyunit_utils.summarizeResult_multinomial``. The test is skipped with a
    message when the H2O cluster has more than one node.

    The native training timer brackets only ``xgb.train``; it no longer
    includes the H2O scoring call, ``get_dump()`` or the diagnostic prints.
    """
    assert H2OXGBoostEstimator.available() is True

    ret = h2o.cluster()
    if len(ret.nodes) != 1:
        print("******** Test skipped. This test cannot be performed in multinode environment.")
        return

    runSeed = 1
    dataSeed = 17
    ntrees = 17
    responseL = 11

    # CPU Backend is forced for the results to be comparable
    h2oParamsS = {
        "ntrees": ntrees,
        "max_depth": 4,
        "seed": runSeed,
        "learn_rate": 0.7,
        "col_sample_rate_per_tree": 0.9,
        "min_rows": 5,
        "score_tree_interval": ntrees + 1,
        "dmatrix_type": "sparse",
        "tree_method": "exact",
        "backend": "cpu",
    }
    # Native parameters are derived from the H2O ones so both stay in sync.
    nativeParam = {
        'colsample_bytree': h2oParamsS["col_sample_rate_per_tree"],
        'tree_method': 'exact',
        'seed': h2oParamsS["seed"],
        'booster': 'gbtree',
        'objective': 'multi:softprob',
        'eta': h2oParamsS["learn_rate"],
        'grow_policy': 'depthwise',
        'alpha': 0.0,
        'subsample': 1.0,
        'colsample_bylevel': 1.0,
        'max_delta_step': 0.0,
        'min_child_weight': h2oParamsS["min_rows"],
        'gamma': 0.0,
        'max_depth': h2oParamsS["max_depth"],
        'num_class': responseL,
    }

    nrows = 10000
    ncols = 10
    factorL = 11
    numCols = 0  # no numeric columns: every predictor is an enum
    enumCols = ncols - numCols

    trainFile = pyunit_utils.genTrainFrame(nrows, numCols, enumCols=enumCols, enumFactors=factorL,
                                           miscfrac=0.5, responseLevel=responseL, randseed=dataSeed)
    # load in dataset and add response column
    print(trainFile)
    myX = trainFile.names
    y = 'response'
    myX.remove(y)
    nativeTrain = pyunit_utils.convertH2OFrameToDMatrixSparse(trainFile, y, enumCols=myX)

    h2oModelS = H2OXGBoostEstimator(**h2oParamsS)
    # gather, print and save performance numbers for h2o model
    h2oModelS.train(x=myX, y=y, training_frame=trainFile)
    h2oTrainTimeS = h2oModelS._model_json["output"]["run_time"]
    time1 = time.time()
    h2oPredictS = h2oModelS.predict(trainFile)
    h2oPredictTimeS = time.time() - time1

    # train the native XGBoost
    nrounds = ntrees
    # Restart the clock so only xgb.train is measured.
    time1 = time.time()
    nativeModel = xgb.train(params=nativeParam, dtrain=nativeTrain, num_boost_round=nrounds)
    nativeTrainTime = time.time() - time1

    modelInfo = nativeModel.get_dump()
    print(modelInfo)
    print("num_boost_round: {1}, Number of trees built: {0}".format(len(modelInfo), nrounds))

    time1 = time.time()
    nativePred = nativeModel.predict(data=nativeTrain, ntree_limit=ntrees)
    nativeScoreTime = time.time() - time1

    print("Comparing H2OXGBoost results with native XGBoost result when DMatrix is set to sparse.....")
    pyunit_utils.summarizeResult_multinomial(h2oPredictS, nativePred, h2oTrainTimeS, nativeTrainTime,
                                             h2oPredictTimeS, nativeScoreTime, tolerance=1e-6)
def comparison_test_dense():
    """Compare H2OXGBoost and native XGBoost predictions on a dense frame.

    Returns early on Python 2 and is skipped with a message when the H2O
    cluster has more than one node. Otherwise trains an
    ``H2OXGBoostEstimator`` (``dmatrix_type="dense"``, CPU backend,
    ``tree_method="exact"``) and a native ``xgb`` booster built from mirrored
    hyperparameters on the same frame from ``pyunit_utils.genTrainFrame``,
    then passes both prediction sets and the collected timings to
    ``pyunit_utils.summarizeResult_multinomial`` with tolerance ``testTol``.

    The native training timer brackets only ``xgb.train``; it no longer
    includes the H2O scoring call, the DMatrix conversion, ``get_dump()`` or
    the diagnostic prints.
    """
    if sys.version.startswith("2"):
        print("native XGBoost tests only supported on python3")
        return

    import xgboost as xgb

    assert H2OXGBoostEstimator.available() is True

    ret = h2o.cluster()
    if len(ret.nodes) != 1:
        print("******** Test skipped. This test cannot be performed in multinode environment.")
        return

    runSeed = 1
    dataSeed = 17
    testTol = 1e-6
    ntrees = 17
    maxdepth = 5
    nrows = 10000
    ncols = 10
    factorL = 20
    numCols = 5
    enumCols = ncols - numCols
    responseL = 4

    # CPU Backend is forced for the results to be comparable
    h2oParamsD = {
        "ntrees": ntrees,
        "max_depth": maxdepth,
        "seed": runSeed,
        "learn_rate": 0.7,
        "col_sample_rate_per_tree": 0.9,
        "min_rows": 5,
        "score_tree_interval": ntrees + 1,
        "dmatrix_type": "dense",
        "tree_method": "exact",
        "backend": "cpu",
    }
    # Native parameters are derived from the H2O ones so both stay in sync.
    nativeParam = {
        'colsample_bytree': h2oParamsD["col_sample_rate_per_tree"],
        'tree_method': 'exact',
        'seed': h2oParamsD["seed"],
        'booster': 'gbtree',
        'objective': 'multi:softprob',
        'eta': h2oParamsD["learn_rate"],
        'grow_policy': 'depthwise',
        'alpha': 0.0,
        'subsample': 1.0,
        'colsample_bylevel': 1.0,
        'max_delta_step': 0.0,
        'min_child_weight': h2oParamsD["min_rows"],
        'gamma': 0.0,
        'max_depth': h2oParamsD["max_depth"],
        'num_class': responseL,
    }

    trainFile = pyunit_utils.genTrainFrame(nrows, numCols, enumCols=enumCols, enumFactors=factorL,
                                           responseLevel=responseL, miscfrac=0.01, randseed=dataSeed)
    myX = trainFile.names
    y = 'response'
    myX.remove(y)
    # From here on enumCols holds column names (the first `enumCols` predictors),
    # not a count.
    enumCols = myX[0:enumCols]

    h2oModelD = H2OXGBoostEstimator(**h2oParamsD)
    # gather, print and save performance numbers for h2o model
    h2oModelD.train(x=myX, y=y, training_frame=trainFile)
    h2oTrainTimeD = h2oModelD._model_json["output"]["run_time"]
    time1 = time.time()
    h2oPredictD = h2oModelD.predict(trainFile)
    h2oPredictTimeD = time.time() - time1

    # train the native XGBoost
    nativeTrain = pyunit_utils.convertH2OFrameToDMatrix(trainFile, y, enumCols=enumCols)
    nrounds = ntrees
    # Restart the clock so only xgb.train is measured.
    time1 = time.time()
    nativeModel = xgb.train(params=nativeParam, dtrain=nativeTrain, num_boost_round=nrounds)
    nativeTrainTime = time.time() - time1

    modelInfo = nativeModel.get_dump()
    print(modelInfo)
    print("num_boost_round: {1}, Number of trees built: {0}".format(len(modelInfo), nrounds))

    time1 = time.time()
    nativePred = nativeModel.predict(data=nativeTrain, ntree_limit=ntrees)
    nativeScoreTime = time.time() - time1

    pyunit_utils.summarizeResult_multinomial(h2oPredictD, nativePred, h2oTrainTimeD, nativeTrainTime,
                                             h2oPredictTimeD, nativeScoreTime, tolerance=testTol)
def comparison_test():
    """Compare H2OXGBoost and native XGBoost predictions with a sparse DMatrix.

    Trains an ``H2OXGBoostEstimator`` (``dmatrix_type="sparse"``, CPU backend,
    ``tree_method="exact"``) and a native ``xgb`` booster built from mirrored
    hyperparameters on an all-enum frame from ``pyunit_utils.genTrainFrame``,
    then passes both prediction sets and the collected timings to
    ``pyunit_utils.summarizeResult_multinomial``. The test is skipped with a
    message when the H2O cluster has more than one node.

    The native training timer brackets only ``xgb.train``; it no longer
    includes the H2O scoring call, ``get_dump()`` or the diagnostic prints.
    """
    assert H2OXGBoostEstimator.available() is True

    ret = h2o.cluster()
    if len(ret.nodes) != 1:
        print("******** Test skipped. This test cannot be performed in multinode environment.")
        return

    runSeed = 1
    dataSeed = 17
    ntrees = 17
    responseL = 11

    # CPU Backend is forced for the results to be comparable
    h2oParamsS = {
        "ntrees": ntrees,
        "max_depth": 4,
        "seed": runSeed,
        "learn_rate": 0.7,
        "col_sample_rate_per_tree": 0.9,
        "min_rows": 5,
        "score_tree_interval": ntrees + 1,
        "dmatrix_type": "sparse",
        "tree_method": "exact",
        "backend": "cpu",
    }
    # Native parameters are derived from the H2O ones so both stay in sync.
    nativeParam = {
        'colsample_bytree': h2oParamsS["col_sample_rate_per_tree"],
        'tree_method': 'exact',
        'seed': h2oParamsS["seed"],
        'booster': 'gbtree',
        'objective': 'multi:softprob',
        'eta': h2oParamsS["learn_rate"],
        'grow_policy': 'depthwise',
        'alpha': 0.0,
        'subsample': 1.0,
        'colsample_bylevel': 1.0,
        'max_delta_step': 0.0,
        'min_child_weight': h2oParamsS["min_rows"],
        'gamma': 0.0,
        'max_depth': h2oParamsS["max_depth"],
        'num_class': responseL,
    }

    nrows = 10000
    ncols = 10
    factorL = 11
    numCols = 0  # no numeric columns: every predictor is an enum
    enumCols = ncols - numCols

    trainFile = pyunit_utils.genTrainFrame(nrows, numCols, enumCols=enumCols, enumFactors=factorL,
                                           miscfrac=0.5, responseLevel=responseL, randseed=dataSeed)
    # load in dataset and add response column
    print(trainFile)
    myX = trainFile.names
    y = 'response'
    myX.remove(y)
    nativeTrain = pyunit_utils.convertH2OFrameToDMatrixSparse(trainFile, y, enumCols=myX)

    h2oModelS = H2OXGBoostEstimator(**h2oParamsS)
    # gather, print and save performance numbers for h2o model
    h2oModelS.train(x=myX, y=y, training_frame=trainFile)
    h2oTrainTimeS = h2oModelS._model_json["output"]["run_time"]
    time1 = time.time()
    h2oPredictS = h2oModelS.predict(trainFile)
    h2oPredictTimeS = time.time() - time1

    # train the native XGBoost
    nrounds = ntrees
    # Restart the clock so only xgb.train is measured.
    time1 = time.time()
    nativeModel = xgb.train(params=nativeParam, dtrain=nativeTrain, num_boost_round=nrounds)
    nativeTrainTime = time.time() - time1

    modelInfo = nativeModel.get_dump()
    print(modelInfo)
    print("num_boost_round: {1}, Number of trees built: {0}".format(len(modelInfo), nrounds))

    time1 = time.time()
    nativePred = nativeModel.predict(data=nativeTrain, ntree_limit=ntrees)
    nativeScoreTime = time.time() - time1

    print("Comparing H2OXGBoost results with native XGBoost result when DMatrix is set to sparse.....")
    pyunit_utils.summarizeResult_multinomial(h2oPredictS, nativePred, h2oTrainTimeS, nativeTrainTime,
                                             h2oPredictTimeS, nativeScoreTime, tolerance=1e-6)