def comparison_test_dense():
    """Compare H2O XGBoost with native XGBoost on a multinomial problem.

    A synthetic frame with numeric and enum predictors and a 4-level
    response is generated, an H2O XGBoost model and a native XGBoost model
    are trained with matching hyper-parameters, both score the training
    data, and the predictions/timings are handed to
    ``pyunit_utils.summarizeResult_multinomial`` with tolerance ``testTol``.
    """
    assert H2OXGBoostEstimator.available() is True

    runSeed = 1
    testTol = 1e-6
    ntrees = 10
    maxdepth = 5
    nrows = 10000
    ncols = 10
    factorL = 20
    numCols = 5
    enumCols = ncols - numCols  # number of enum predictors, the rest are numeric
    responseL = 4

    # CPU Backend is forced for the results to be comparable
    h2oParamsD = {"ntrees": ntrees, "max_depth": maxdepth, "seed": runSeed, "learn_rate": 0.7,
                  "col_sample_rate_per_tree": 0.9, "min_rows": 5, "score_tree_interval": ntrees + 1,
                  "dmatrix_type": "dense", "tree_method": "exact", "backend": "cpu"}
    # Native parameters are derived from the H2O ones so both models share a configuration.
    nativeParam = {'colsample_bytree': h2oParamsD["col_sample_rate_per_tree"],
                   'tree_method': 'exact',
                   'seed': h2oParamsD["seed"],
                   'booster': 'gbtree',
                   'objective': 'multi:softprob',
                   'lambda': 0.0,
                   'eta': h2oParamsD["learn_rate"],
                   'grow_policy': 'depthwise',
                   'alpha': 0.0,
                   'subsample': 1.0,
                   'colsample_bylevel': 1.0,
                   'max_delta_step': 0.0,
                   'min_child_weight': h2oParamsD["min_rows"],
                   'gamma': 0.0,
                   'max_depth': h2oParamsD["max_depth"],
                   'num_class': responseL}

    trainFile = pyunit_utils.genTrainFrame(nrows, numCols, enumCols=enumCols, enumFactors=factorL,
                                           responseLevel=responseL, miscfrac=0.01)
    myX = trainFile.names
    y = 'response'
    myX.remove(y)
    # From here on enumCols holds the names of the enum columns (the first
    # `enumCols` predictors) instead of their count.
    enumCols = myX[0:enumCols]

    h2oModelD = H2OXGBoostEstimator(**h2oParamsD)
    # gather, print and save performance numbers for h2o model
    h2oModelD.train(x=myX, y=y, training_frame=trainFile)
    # NOTE(review): run_time is taken as reported by H2O; its unit is not visible here.
    h2oTrainTimeD = h2oModelD._model_json["output"]["run_time"]
    time1 = time.time()
    h2oPredictD = h2oModelD.predict(trainFile)
    h2oPredictTimeD = time.time() - time1

    # train the native XGBoost
    nativeTrain = pyunit_utils.convertH2OFrameToDMatrix(trainFile, y, enumCols=enumCols)
    # Restart the timer so the native training time does not include the H2O
    # prediction and the DMatrix conversion performed above.
    time1 = time.time()
    # The number of boosting rounds is passed explicitly so it always matches ntrees.
    nativeModel = xgb.train(params=nativeParam, dtrain=nativeTrain, num_boost_round=ntrees)
    nativeTrainTime = time.time() - time1

    time1 = time.time()
    nativePred = nativeModel.predict(data=nativeTrain, ntree_limit=ntrees)
    nativeScoreTime = time.time() - time1

    pyunit_utils.summarizeResult_multinomial(h2oPredictD, nativePred, h2oTrainTimeD, nativeTrainTime,
                                             h2oPredictTimeD, nativeScoreTime, tolerance=testTol)
def comparison_test_dense():
    """Compare H2O XGBoost with native XGBoost on a binomial problem.

    The comparison only runs when the H2O cluster has exactly one node;
    otherwise a "skipped" message is printed. A seeded synthetic frame is
    generated, both implementations are trained with matching
    hyper-parameters, both score the training data, and the results are
    handed to ``pyunit_utils.summarizeResult_binomial`` with tolerance
    ``testTol``.
    """
    assert H2OXGBoostEstimator.available() is True
    ret = h2o.cluster()
    if len(ret.nodes) != 1:
        print("******** Test skipped.  This test cannot be performed in multinode environment.")
        return

    runSeed = 1
    dataSeed = 17
    testTol = 1e-6
    ntrees = 17
    maxdepth = 5

    # CPU Backend is forced for the results to be comparable
    h2oParamsD = {"ntrees": ntrees, "max_depth": maxdepth, "seed": runSeed, "learn_rate": 0.7,
                  "col_sample_rate_per_tree": 0.9, "min_rows": 5, "score_tree_interval": ntrees + 1,
                  "dmatrix_type": "dense", "tree_method": "exact", "backend": "cpu"}
    # Native parameters are derived from the H2O ones so both models share a configuration.
    nativeParam = {'colsample_bytree': h2oParamsD["col_sample_rate_per_tree"],
                   'tree_method': 'exact',
                   'seed': h2oParamsD["seed"],
                   'booster': 'gbtree',
                   'objective': 'binary:logistic',
                   'lambda': 0.0,
                   'eta': h2oParamsD["learn_rate"],
                   'grow_policy': 'depthwise',
                   'alpha': 0.0,
                   'subsample': 1.0,
                   'colsample_bylevel': 1.0,
                   'max_delta_step': 0.0,
                   'min_child_weight': h2oParamsD["min_rows"],
                   'gamma': 0.0,
                   'max_depth': h2oParamsD["max_depth"]}

    nrows = 10000
    ncols = 20
    factorL = 20
    numCols = 10
    enumCols = ncols - numCols  # number of enum predictors, the rest are numeric

    trainFile = pyunit_utils.genTrainFrame(nrows, numCols, enumCols=enumCols, enumFactors=factorL,
                                           miscfrac=0.01, randseed=dataSeed)
    # separate the predictor names from the response column
    myX = trainFile.names
    y = 'response'
    myX.remove(y)
    # From here on enumCols holds the names of the enum columns (the first
    # `enumCols` predictors) instead of their count.
    enumCols = myX[0:enumCols]

    h2oModelD = H2OXGBoostEstimator(**h2oParamsD)
    # gather, print and save performance numbers for h2o model
    h2oModelD.train(x=myX, y=y, training_frame=trainFile)
    # NOTE(review): run_time is taken as reported by H2O; its unit is not visible here.
    h2oTrainTimeD = h2oModelD._model_json["output"]["run_time"]
    t1Array = h2oModelD._model_json["output"]["scoring_history"]
    # Report the classification error from the last scoring-history row.
    errorColumn = t1Array._col_header.index("training_classification_error")
    print("classifier error: {0}".format(t1Array._cell_values[-1][errorColumn]))
    time1 = time.time()
    h2oPredictD = h2oModelD.predict(trainFile)
    h2oPredictTimeD = time.time() - time1

    # train the native XGBoost
    nrounds = ntrees
    nativeTrain = pyunit_utils.convertH2OFrameToDMatrix(trainFile, y, enumCols=enumCols)
    # Restart the timer so the native training time does not include the H2O
    # prediction and the DMatrix conversion performed above.
    time1 = time.time()
    nativeModel = xgb.train(params=nativeParam, dtrain=nativeTrain, num_boost_round=nrounds)
    # Stop the clock before dumping/printing the model so that diagnostic
    # output is not counted as training time.
    nativeTrainTime = time.time() - time1

    modelInfo = nativeModel.get_dump()
    print(modelInfo)
    print("num_boost_round: {1}, Number of trees built: {0}".format(len(modelInfo), nrounds))

    time1 = time.time()
    nativePred = nativeModel.predict(data=nativeTrain, ntree_limit=ntrees)
    nativeScoreTime = time.time() - time1

    pyunit_utils.summarizeResult_binomial(h2oPredictD, nativePred, h2oTrainTimeD, nativeTrainTime,
                                          h2oPredictTimeD, nativeScoreTime, tolerance=testTol)
def comparison_test_dense():
    """Compare H2O XGBoost with native XGBoost on a binomial problem.

    The comparison only runs when the H2O cluster has exactly one node;
    otherwise a "skipped" message is printed. A seeded synthetic frame is
    generated, both implementations are trained with matching
    hyper-parameters (no explicit ``lambda`` is passed to the native
    booster in this variant), both score the training data, and the
    results are handed to ``pyunit_utils.summarizeResult_binomial`` with
    tolerance ``testTol``.
    """
    assert H2OXGBoostEstimator.available() is True
    ret = h2o.cluster()
    if len(ret.nodes) != 1:
        print(
            "******** Test skipped.  This test cannot be performed in multinode environment."
        )
        return

    runSeed = 1
    dataSeed = 17
    testTol = 1e-6
    ntrees = 17
    maxdepth = 5

    # CPU Backend is forced for the results to be comparable
    h2oParamsD = {
        "ntrees": ntrees,
        "max_depth": maxdepth,
        "seed": runSeed,
        "learn_rate": 0.7,
        "col_sample_rate_per_tree": 0.9,
        "min_rows": 5,
        "score_tree_interval": ntrees + 1,
        "dmatrix_type": "dense",
        "tree_method": "exact",
        "backend": "cpu"
    }
    # Native parameters are derived from the H2O ones so both models share a configuration.
    nativeParam = {
        'colsample_bytree': h2oParamsD["col_sample_rate_per_tree"],
        'tree_method': 'exact',
        'seed': h2oParamsD["seed"],
        'booster': 'gbtree',
        'objective': 'binary:logistic',
        'eta': h2oParamsD["learn_rate"],
        'grow_policy': 'depthwise',
        'alpha': 0.0,
        'subsample': 1.0,
        'colsample_bylevel': 1.0,
        'max_delta_step': 0.0,
        'min_child_weight': h2oParamsD["min_rows"],
        'gamma': 0.0,
        'max_depth': h2oParamsD["max_depth"]
    }

    nrows = 10000
    ncols = 20
    factorL = 20
    numCols = 10
    enumCols = ncols - numCols  # number of enum predictors, the rest are numeric

    trainFile = pyunit_utils.genTrainFrame(
        nrows,
        numCols,
        enumCols=enumCols,
        enumFactors=factorL,
        miscfrac=0.01,
        randseed=dataSeed)
    # separate the predictor names from the response column
    myX = trainFile.names
    y = 'response'
    myX.remove(y)
    # From here on enumCols holds the names of the enum columns (the first
    # `enumCols` predictors) instead of their count.
    enumCols = myX[0:enumCols]

    h2oModelD = H2OXGBoostEstimator(**h2oParamsD)
    # gather, print and save performance numbers for h2o model
    h2oModelD.train(x=myX, y=y, training_frame=trainFile)
    # NOTE(review): run_time is taken as reported by H2O; its unit is not visible here.
    h2oTrainTimeD = h2oModelD._model_json["output"]["run_time"]
    t1Array = h2oModelD._model_json["output"]["scoring_history"]
    # Report the classification error from the last scoring-history row.
    errorColumn = t1Array._col_header.index("training_classification_error")
    print("classifier error: {0}".format(t1Array._cell_values[-1][errorColumn]))
    time1 = time.time()
    h2oPredictD = h2oModelD.predict(trainFile)
    h2oPredictTimeD = time.time() - time1

    # train the native XGBoost
    nrounds = ntrees
    nativeTrain = pyunit_utils.convertH2OFrameToDMatrix(trainFile, y, enumCols=enumCols)
    # Restart the timer so the native training time does not include the H2O
    # prediction and the DMatrix conversion performed above.
    time1 = time.time()
    nativeModel = xgb.train(params=nativeParam, dtrain=nativeTrain, num_boost_round=nrounds)
    # Stop the clock before dumping/printing the model so that diagnostic
    # output is not counted as training time.
    nativeTrainTime = time.time() - time1

    modelInfo = nativeModel.get_dump()
    print(modelInfo)
    print("num_boost_round: {1}, Number of trees built: {0}".format(
        len(modelInfo), nrounds))

    time1 = time.time()
    nativePred = nativeModel.predict(data=nativeTrain, ntree_limit=ntrees)
    nativeScoreTime = time.time() - time1

    pyunit_utils.summarizeResult_binomial(
        h2oPredictD,
        nativePred,
        h2oTrainTimeD,
        nativeTrainTime,
        h2oPredictTimeD,
        nativeScoreTime,
        tolerance=testTol)
def comparison_test_dense():
    """Compare H2O XGBoost with native XGBoost on a randomised binomial problem.

    The model seed and the dataset dimensions (rows, columns, enum factor
    count, numeric/enum split) are drawn at random. Both implementations
    are trained with matching hyper-parameters, both score the training
    data, and the results are handed to
    ``pyunit_utils.summarizeResult_binomial`` with tolerance ``testTol``.
    """
    assert H2OXGBoostEstimator.available() is True

    runSeed = random.randint(1, 1073741824)
    testTol = 1e-6
    ntrees = 10
    maxdepth = 5

    h2oParamsD = {
        "ntrees": ntrees,
        "max_depth": maxdepth,
        "seed": runSeed,
        "learn_rate": 0.7,
        "col_sample_rate_per_tree": 0.9,
        "min_rows": 5,
        "score_tree_interval": ntrees + 1,
        "dmatrix_type": "dense"
    }
    # Native parameters are derived from the H2O ones so both models share a configuration.
    nativeParam = {
        'colsample_bytree': h2oParamsD["col_sample_rate_per_tree"],
        'tree_method': 'auto',
        'seed': h2oParamsD["seed"],
        'booster': 'gbtree',
        'objective': 'binary:logistic',
        'lambda': 0.0,
        'eta': h2oParamsD["learn_rate"],
        'grow_policy': 'depthwise',
        'alpha': 0.0,
        'subsample': 1.0,
        'colsample_bylevel': 1.0,
        'max_delta_step': 0.0,
        'min_child_weight': h2oParamsD["min_rows"],
        'gamma': 0.0,
        'max_depth': h2oParamsD["max_depth"]
    }

    nrows = random.randint(10000, 100000)
    ncols = random.randint(5, 20)
    factorL = random.randint(11, 20)
    numCols = random.randint(1, ncols)
    enumCols = ncols - numCols  # number of enum predictors; may be 0 when numCols == ncols
    # Record the randomly drawn configuration so a failing run can be diagnosed.
    print("runSeed: {0}, nrows: {1}, ncols: {2}, factorL: {3}, numCols: {4}".format(
        runSeed, nrows, ncols, factorL, numCols))

    trainFile = pyunit_utils.genTrainFrame(
        nrows, numCols, enumCols=enumCols, enumFactors=factorL, miscfrac=0.01)
    # separate the predictor names from the response column
    myX = trainFile.names
    y = 'response'
    myX.remove(y)
    # From here on enumCols holds the names of the enum columns (the first
    # `enumCols` predictors) instead of their count.
    enumCols = myX[0:enumCols]

    h2oModelD = H2OXGBoostEstimator(**h2oParamsD)
    # gather, print and save performance numbers for h2o model
    h2oModelD.train(x=myX, y=y, training_frame=trainFile)
    # NOTE(review): run_time is taken as reported by H2O; its unit is not visible here.
    h2oTrainTimeD = h2oModelD._model_json["output"]["run_time"]
    time1 = time.time()
    h2oPredictD = h2oModelD.predict(trainFile)
    h2oPredictTimeD = time.time() - time1

    # train the native XGBoost
    nativeTrain = pyunit_utils.convertH2OFrameToDMatrix(trainFile, y, enumCols=enumCols)
    # Restart the timer so the native training time does not include the H2O
    # prediction and the DMatrix conversion performed above.
    time1 = time.time()
    # The number of boosting rounds is passed explicitly so it always matches ntrees.
    nativeModel = xgb.train(params=nativeParam, dtrain=nativeTrain, num_boost_round=ntrees)
    nativeTrainTime = time.time() - time1

    time1 = time.time()
    nativePred = nativeModel.predict(data=nativeTrain, ntree_limit=ntrees)
    nativeScoreTime = time.time() - time1

    pyunit_utils.summarizeResult_binomial(
        h2oPredictD,
        nativePred,
        h2oTrainTimeD,
        nativeTrainTime,
        h2oPredictTimeD,
        nativeScoreTime,
        tolerance=testTol)
def comparison_test_dense():
    """Compare H2O XGBoost with native XGBoost on a regression problem.

    The comparison only runs when the H2O cluster has exactly one node;
    otherwise a "skipped" message is printed. A seeded synthetic frame is
    generated, its enum response is replaced by a generated numeric one,
    both implementations are trained with matching hyper-parameters, and
    the predictions/timings are handed to
    ``pyunit_utils.summarizeResult_regression`` with tolerance ``testTol``.
    """
    assert H2OXGBoostEstimator.available() is True
    ret = h2o.cluster()
    if len(ret.nodes) != 1:
        print("******** Test skipped.  This test cannot be performed in multinode environment.")
        return

    runSeed = 1
    dataSeed = 17
    testTol = 1e-10
    ntrees = 17
    maxdepth = 5

    # CPU Backend is forced for the results to be comparable
    h2oParamsD = {"ntrees": ntrees, "max_depth": maxdepth, "seed": runSeed, "learn_rate": 0.7,
                  "col_sample_rate_per_tree": 0.9, "min_rows": 5, "score_tree_interval": ntrees + 1,
                  "dmatrix_type": "dense", "tree_method": "exact", "backend": "cpu"}
    # Native parameters are derived from the H2O ones so both models share a configuration.
    nativeParam = {'colsample_bytree': h2oParamsD["col_sample_rate_per_tree"],
                   'tree_method': 'exact',
                   'seed': h2oParamsD["seed"],
                   'booster': 'gbtree',
                   'objective': 'reg:linear',
                   'eta': h2oParamsD["learn_rate"],
                   'grow_policy': 'depthwise',
                   'alpha': 0.0,
                   'subsample': 1.0,
                   'colsample_bylevel': 1.0,
                   'max_delta_step': 0.0,
                   'min_child_weight': h2oParamsD["min_rows"],
                   'gamma': 0.0,
                   'max_depth': h2oParamsD["max_depth"]}

    nrows = 10000
    ncols = 10
    factorL = 20
    numCols = 5
    enumCols = ncols - numCols  # number of enum predictors, the rest are numeric

    trainFile = pyunit_utils.genTrainFrame(nrows, numCols, enumCols=enumCols, enumFactors=factorL,
                                           miscfrac=0.01, randseed=dataSeed)
    # replace the generated response column with a numeric one
    y = 'response'
    trainFile = trainFile.drop(y)  # drop the enum response and generate real values here
    yresp = 0.99 * pyunit_utils.random_dataset_numeric_only(nrows, 1, integerR=1000000, misFrac=0,
                                                            randSeed=dataSeed)
    yresp.set_name(0, y)
    trainFile = trainFile.cbind(yresp)
    myX = trainFile.names
    myX.remove(y)
    # From here on enumCols holds the names of the enum columns (the first
    # `enumCols` predictors) instead of their count.
    enumCols = myX[0:enumCols]

    h2oModelD = H2OXGBoostEstimator(**h2oParamsD)
    # gather, print and save performance numbers for h2o model
    h2oModelD.train(x=myX, y=y, training_frame=trainFile)
    # NOTE(review): run_time is taken as reported by H2O; its unit is not visible here.
    h2oTrainTimeD = h2oModelD._model_json["output"]["run_time"]
    time1 = time.time()
    h2oPredictD = h2oModelD.predict(trainFile)
    h2oPredictTimeD = time.time() - time1

    # train the native XGBoost
    nativeTrain = pyunit_utils.convertH2OFrameToDMatrix(trainFile, y, enumCols=enumCols)
    nrounds = ntrees
    # Restart the timer so the native training time does not include the H2O
    # prediction and the DMatrix conversion performed above.
    time1 = time.time()
    # boost for the same number of rounds as H2O builds trees
    nativeModel = xgb.train(params=nativeParam, dtrain=nativeTrain, num_boost_round=nrounds)
    nativeTrainTime = time.time() - time1

    # create a "test" matrix - it will be identical to "train" matrix but it will not have any cached predictions
    # if we tried to use matrix `nativeTrain` predict(..) will not actually compute anything it will return the
    # cached predictions
    # cached predictions are slightly different from the actual predictions
    nativeTest = pyunit_utils.convertH2OFrameToDMatrix(trainFile, y, enumCols=enumCols)
    time1 = time.time()
    nativePred = nativeModel.predict(data=nativeTest, ntree_limit=ntrees)
    nativeScoreTime = time.time() - time1

    pyunit_utils.summarizeResult_regression(h2oPredictD, nativePred, h2oTrainTimeD, nativeTrainTime,
                                            h2oPredictTimeD, nativeScoreTime, tolerance=testTol)
def comparison_test_dense():
    """Compare H2O XGBoost with native XGBoost on a regression problem.

    A synthetic frame is generated, its enum response is replaced by a
    generated numeric one, both implementations are trained with matching
    hyper-parameters, both score the training data, and the
    predictions/timings are handed to
    ``pyunit_utils.summarizeResult_regression`` with tolerance ``testTol``.
    """
    assert H2OXGBoostEstimator.available() is True

    runSeed = 1
    testTol = 1e-6
    ntrees = 17
    maxdepth = 5

    # CPU Backend is forced for the results to be comparable
    h2oParamsD = {
        "ntrees": ntrees,
        "max_depth": maxdepth,
        "seed": runSeed,
        "learn_rate": 0.7,
        "col_sample_rate_per_tree": 0.9,
        "min_rows": 5,
        "score_tree_interval": ntrees + 1,
        "dmatrix_type": "dense",
        "tree_method": "exact",
        "backend": "cpu"
    }
    # Native parameters are derived from the H2O ones so both models share a configuration.
    nativeParam = {
        'colsample_bytree': h2oParamsD["col_sample_rate_per_tree"],
        'tree_method': 'exact',
        'seed': h2oParamsD["seed"],
        'booster': 'gbtree',
        'objective': 'reg:linear',
        'lambda': 0.0,
        'eta': h2oParamsD["learn_rate"],
        'grow_policy': 'depthwise',
        'alpha': 0.0,
        'subsample': 1.0,
        'colsample_bylevel': 1.0,
        'max_delta_step': 0.0,
        'min_child_weight': h2oParamsD["min_rows"],
        'gamma': 0.0,
        'max_depth': h2oParamsD["max_depth"]
    }

    nrows = 10000
    ncols = 10
    factorL = 20
    numCols = 5
    enumCols = ncols - numCols  # number of enum predictors, the rest are numeric

    trainFile = pyunit_utils.genTrainFrame(
        nrows, numCols, enumCols=enumCols, enumFactors=factorL, miscfrac=0.01)
    # replace the generated response column with a numeric one
    y = 'response'
    trainFile = trainFile.drop(y)  # drop the enum response and generate real values here
    yresp = 0.99 * pyunit_utils.random_dataset_numeric_only(
        nrows, 1, integerR=1000000, misFrac=0)
    yresp.set_name(0, y)
    trainFile = trainFile.cbind(yresp)
    myX = trainFile.names
    myX.remove(y)
    # From here on enumCols holds the names of the enum columns (the first
    # `enumCols` predictors) instead of their count.
    enumCols = myX[0:enumCols]

    h2oModelD = H2OXGBoostEstimator(**h2oParamsD)
    # gather, print and save performance numbers for h2o model
    h2oModelD.train(x=myX, y=y, training_frame=trainFile)
    # NOTE(review): run_time is taken as reported by H2O; its unit is not visible here.
    h2oTrainTimeD = h2oModelD._model_json["output"]["run_time"]
    time1 = time.time()
    h2oPredictD = h2oModelD.predict(trainFile)
    h2oPredictTimeD = time.time() - time1

    # train the native XGBoost
    nativeTrain = pyunit_utils.convertH2OFrameToDMatrix(trainFile, y, enumCols=enumCols)
    # Restart the timer so the native training time does not include the H2O
    # prediction and the DMatrix conversion performed above.
    time1 = time.time()
    nativeModel = xgb.train(params=nativeParam, dtrain=nativeTrain, num_boost_round=ntrees)
    nativeTrainTime = time.time() - time1

    time1 = time.time()
    nativePred = nativeModel.predict(data=nativeTrain, ntree_limit=ntrees)
    nativeScoreTime = time.time() - time1

    pyunit_utils.summarizeResult_regression(
        h2oPredictD,
        nativePred,
        h2oTrainTimeD,
        nativeTrainTime,
        h2oPredictTimeD,
        nativeScoreTime,
        tolerance=testTol)