def generate_trainingFrame():
    """Build a 1M-row training frame of random enum group-by columns
    cbind-ed with random numeric sort columns.

    Returns:
        tuple: (training frame, list of group-by column names,
        list of sort column names).
    """
    nrows = 1000000  # per nidhi request
    # 1-3 enum columns (2-10 factor levels each) to group by.
    group_part = pyunit_utils.random_dataset_enums_only(nrows, randint(1, 3), randint(2, 10))
    # 1-3 numeric columns to sort on.
    sort_part = pyunit_utils.random_dataset_numeric_only(nrows, randint(1, 3), randint(20, 100))
    sort_part.set_names(["sort0", "sort1", "sort2"][:sort_part.ncols])
    group_part.set_names(["GroupByCols0", "GroupByCols1", "GroupByCols2"][:group_part.ncols])
    combined = group_part.cbind(sort_part)  # this will be the training frame
    return combined, group_part.names, sort_part.names
def generate_trainingFrame():
    """Build a 1M-row training frame whose single group-by column is the
    frame subtracted from itself (constant value, ~1% missing), cbind-ed
    with random numeric sort columns.

    Returns:
        tuple: (training frame, list of group-by column names,
        list of sort column names).
    """
    nrows = 1000000  # per Michalk request
    base = pyunit_utils.random_dataset_int_only(nrows, 1, 1, misFrac=0.01)  # one column of value 1
    # Subtracting the frame from itself yields a constant column
    # (presumably zero, with the NAs propagated — verify against helper).
    group_frame = base - base
    sort_frame = pyunit_utils.random_dataset_numeric_only(nrows, randint(1, 3), randint(20, 100))
    sort_frame.set_names(["sort0", "sort1", "sort2"][:sort_frame.ncols])
    group_frame.set_names(["GroupByCols0", "GroupByCols1", "GroupByCols2"][:group_frame.ncols])
    combined = group_frame.cbind(sort_frame)  # this will be the training frame
    return combined, group_frame.names, sort_frame.names
def generate_trainingFrame():
    """Generate a 1M-row training frame: one self-subtracted (constant,
    ~1% NA) group-by column plus 1-3 random numeric sort columns.

    Returns:
        tuple: (training frame, group-by column names, sort column names).
    """
    nrows = 1000000  # per Michalk request
    sort_labels = ["sort0", "sort1", "sort2"]
    group_labels = ["GroupByCols0", "GroupByCols1", "GroupByCols2"]
    one_col = pyunit_utils.random_dataset_int_only(nrows, 1, 1, misFrac=0.01)  # one column of value 1
    grp = one_col - one_col  # constant column; NAs presumably carry through
    srt = pyunit_utils.random_dataset_numeric_only(nrows, randint(1, 3), randint(20, 100))
    srt.set_names(sort_labels[0:srt.ncols])
    grp.set_names(group_labels[0:grp.ncols])
    # cbind produces the final training frame returned to the caller.
    return grp.cbind(srt), grp.names, srt.names
def generate_trainingFrame():
    """Assemble a 1M-row frame for training: random enum group-by columns
    joined (cbind) with random numeric sort columns.

    Returns:
        tuple: (training frame, group-by column names, sort column names).
    """
    nrows = 1000000  # per nidhi request
    n_group_cols = randint(1, 3)
    n_sort_cols = randint(1, 3)
    grouping = pyunit_utils.random_dataset_enums_only(nrows, n_group_cols, randint(2, 10))
    sorting = pyunit_utils.random_dataset_numeric_only(nrows, n_sort_cols, randint(20, 100))
    # Name only as many columns as were actually generated.
    sorting.set_names(["sort0", "sort1", "sort2"][0:sorting.ncols])
    grouping.set_names(["GroupByCols0", "GroupByCols1", "GroupByCols2"][0:grouping.ncols])
    training_frame = grouping.cbind(sorting)  # this will be the training frame
    return training_frame, grouping.names, sorting.names
def comparison_test_dense():
    """Compare H2O XGBoost (sparse DMatrix, CPU backend) with native XGBoost
    on an all-enum frame with 50% missing values; predictions must agree
    within ``testTol``.

    Skipped under python2 (native xgboost test needs python3) and on
    multinode clusters (results would not be comparable).
    """
    if sys.version.startswith("2"):
        print("native XGBoost tests only supported on python3")
        return
    import xgboost as xgb
    assert H2OXGBoostEstimator.available() is True
    ret = h2o.cluster()
    if len(ret.nodes) == 1:
        runSeed = 1
        dataSeed = 17
        testTol = 1e-10
        ntrees = 17
        maxdepth = 5
        # CPU Backend is forced for the results to be comparable
        h2oParamsD = {"ntrees": ntrees, "max_depth": maxdepth, "seed": runSeed,
                      "learn_rate": 0.7, "col_sample_rate_per_tree": 0.9,
                      "min_rows": 5, "score_tree_interval": ntrees + 1,
                      "dmatrix_type": "sparse", "tree_method": "exact",
                      "backend": "cpu"}
        # Native parameters mirror the H2O settings so both models train the
        # same way.
        nativeParam = {'colsample_bytree': h2oParamsD["col_sample_rate_per_tree"],
                       'tree_method': 'exact',
                       'seed': h2oParamsD["seed"],
                       'booster': 'gbtree',
                       'objective': 'reg:linear',
                       'eta': h2oParamsD["learn_rate"],
                       'grow_policy': 'depthwise',
                       'alpha': 0.0,
                       'subsample': 1.0,
                       'colsample_bylevel': 1.0,
                       'max_delta_step': 0.0,
                       'min_child_weight': h2oParamsD["min_rows"],
                       'gamma': 0.0,
                       'max_depth': h2oParamsD["max_depth"]}
        nrows = 10000
        ncols = 10
        factorL = 11
        numCols = 0
        enumCols = ncols - numCols
        responseL = 2
        trainFile = pyunit_utils.genTrainFrame(nrows, numCols, enumCols=enumCols,
                                               enumFactors=factorL, miscfrac=0.5,
                                               responseLevel=responseL, randseed=dataSeed)
        y = 'response'
        trainFile = trainFile.drop(y)  # drop the enum response and generate real values here
        # FIX: use nrows instead of the hard-coded 10000 so the response column
        # always matches the training-frame length (matches sibling tests).
        yresp = 0.99 * pyunit_utils.random_dataset_numeric_only(nrows, 1, integerR=1000000,
                                                                misFrac=0, randSeed=dataSeed)
        yresp.set_name(0, y)
        trainFile = trainFile.cbind(yresp)
        myX = trainFile.names
        myX.remove(y)
        enumCols = myX[0:enumCols]
        h2oModelD = H2OXGBoostEstimator(**h2oParamsD)
        # gather, print and save performance numbers for h2o model
        h2oModelD.train(x=myX, y=y, training_frame=trainFile)
        h2oTrainTimeD = h2oModelD._model_json["output"]["run_time"]
        time1 = time.time()
        h2oPredictD = h2oModelD.predict(trainFile)
        h2oPredictTimeD = time.time() - time1
        # train the native XGBoost
        nativeTrain = pyunit_utils.convertH2OFrameToDMatrixSparse(trainFile, y,
                                                                  enumCols=enumCols)
        nrounds = ntrees
        # FIX: restart the timer here — previously nativeTrainTime also
        # included the h2o prediction and the DMatrix conversion above.
        time1 = time.time()
        nativeModel = xgb.train(params=nativeParam, dtrain=nativeTrain,
                                num_boost_round=nrounds)
        nativeTrainTime = time.time() - time1
        # create a "test" matrix - it will be identical to "train" matrix but it will not have any cached predictions
        # if we tried to use matrix `nativeTrain` predict(..) will not actually compute anything it will return the cached predictions
        # cached predictions are slightly different from the actual predictions
        nativeTest = pyunit_utils.convertH2OFrameToDMatrixSparse(trainFile, y,
                                                                 enumCols=enumCols)
        time1 = time.time()
        nativePred = nativeModel.predict(data=nativeTest, ntree_limit=ntrees)
        nativeScoreTime = time.time() - time1
        pyunit_utils.summarizeResult_regression(h2oPredictD, nativePred, h2oTrainTimeD,
                                                nativeTrainTime, h2oPredictTimeD,
                                                nativeScoreTime, tolerance=testTol)
    else:
        print("******** Test skipped. This test cannot be performed in multinode environment.")
def comparison_test_dense():
    """Compare H2O XGBoost (dense DMatrix, CPU backend) with native XGBoost
    on a mixed numeric/enum frame; predictions must agree within ``testTol``.

    Note: the native model predicts on its own training matrix, so it
    returns cached predictions that can differ slightly from freshly
    computed ones — hence the looser 1e-6 tolerance.
    """
    assert H2OXGBoostEstimator.available() is True
    runSeed = 1
    # FIX: seed the data generation (as the sibling comparison tests do) so
    # the test is reproducible run-to-run.
    dataSeed = 17
    testTol = 1e-6
    ntrees = 10
    maxdepth = 5
    # CPU Backend is forced for the results to be comparable
    h2oParamsD = {
        "ntrees": ntrees,
        "max_depth": maxdepth,
        "seed": runSeed,
        "learn_rate": 0.7,
        "col_sample_rate_per_tree": 0.9,
        "min_rows": 5,
        "score_tree_interval": ntrees + 1,
        "dmatrix_type": "dense",
        "tree_method": "exact",
        "backend": "cpu"
    }
    # Native parameters mirror the H2O settings so both models train the same way.
    nativeParam = {
        'colsample_bytree': h2oParamsD["col_sample_rate_per_tree"],
        'tree_method': 'exact',
        'seed': h2oParamsD["seed"],
        'booster': 'gbtree',
        'objective': 'reg:linear',
        'lambda': 0.0,
        'eta': h2oParamsD["learn_rate"],
        'grow_policy': 'depthwise',
        'alpha': 0.0,
        'subsample': 1.0,
        'colsample_bylevel': 1.0,
        'max_delta_step': 0.0,
        'min_child_weight': h2oParamsD["min_rows"],
        'gamma': 0.0,
        'max_depth': h2oParamsD["max_depth"]
    }
    nrows = 10000
    ncols = 10
    factorL = 20
    numCols = 5
    enumCols = ncols - numCols
    trainFile = pyunit_utils.genTrainFrame(
        nrows, numCols, enumCols=enumCols, enumFactors=factorL,
        miscfrac=0.01, randseed=dataSeed)  # load in dataset and add response column
    y = 'response'
    trainFile = trainFile.drop(y)  # drop the enum response and generate real values here
    yresp = 0.99 * pyunit_utils.random_dataset_numeric_only(
        nrows, 1, integerR=1000000, misFrac=0, randSeed=dataSeed)
    yresp.set_name(0, y)
    trainFile = trainFile.cbind(yresp)
    myX = trainFile.names
    myX.remove(y)
    enumCols = myX[0:enumCols]
    h2oModelD = H2OXGBoostEstimator(**h2oParamsD)
    # gather, print and save performance numbers for h2o model
    h2oModelD.train(x=myX, y=y, training_frame=trainFile)
    h2oTrainTimeD = h2oModelD._model_json["output"]["run_time"]
    time1 = time.time()
    h2oPredictD = h2oModelD.predict(trainFile)
    h2oPredictTimeD = time.time() - time1
    # train the native XGBoost
    nativeTrain = pyunit_utils.convertH2OFrameToDMatrix(trainFile, y,
                                                        enumCols=enumCols)
    # FIX: restart the timer here — previously nativeTrainTime also included
    # the h2o prediction and the DMatrix conversion above.
    time1 = time.time()
    nativeModel = xgb.train(params=nativeParam, dtrain=nativeTrain)
    nativeTrainTime = time.time() - time1
    time1 = time.time()
    nativePred = nativeModel.predict(data=nativeTrain, ntree_limit=ntrees)
    nativeScoreTime = time.time() - time1
    pyunit_utils.summarizeResult_regression(h2oPredictD, nativePred, h2oTrainTimeD,
                                            nativeTrainTime, h2oPredictTimeD,
                                            nativeScoreTime, tolerance=testTol)
def comparison_test_dense():
    """Compare H2O XGBoost (dense DMatrix, CPU backend) with native XGBoost
    on a seeded mixed numeric/enum frame; predictions must agree within
    ``testTol``. Skipped on multinode clusters (results would not be
    comparable).
    """
    assert H2OXGBoostEstimator.available() is True
    ret = h2o.cluster()
    if len(ret.nodes) == 1:
        runSeed = 1
        dataSeed = 17
        testTol = 1e-10
        ntrees = 17
        maxdepth = 5
        # CPU Backend is forced for the results to be comparable
        h2oParamsD = {"ntrees": ntrees, "max_depth": maxdepth, "seed": runSeed,
                      "learn_rate": 0.7, "col_sample_rate_per_tree": 0.9,
                      "min_rows": 5, "score_tree_interval": ntrees + 1,
                      "dmatrix_type": "dense", "tree_method": "exact",
                      "backend": "cpu"}
        # Native parameters mirror the H2O settings so both models train the
        # same way.
        nativeParam = {'colsample_bytree': h2oParamsD["col_sample_rate_per_tree"],
                       'tree_method': 'exact',
                       'seed': h2oParamsD["seed"],
                       'booster': 'gbtree',
                       'objective': 'reg:linear',
                       'eta': h2oParamsD["learn_rate"],
                       'grow_policy': 'depthwise',
                       'alpha': 0.0,
                       'subsample': 1.0,
                       'colsample_bylevel': 1.0,
                       'max_delta_step': 0.0,
                       'min_child_weight': h2oParamsD["min_rows"],
                       'gamma': 0.0,
                       'max_depth': h2oParamsD["max_depth"]}
        nrows = 10000
        ncols = 10
        factorL = 20
        numCols = 5
        enumCols = ncols - numCols
        trainFile = pyunit_utils.genTrainFrame(nrows, numCols, enumCols=enumCols,
                                               enumFactors=factorL, miscfrac=0.01,
                                               randseed=dataSeed)  # load in dataset and add response column
        y = 'response'
        trainFile = trainFile.drop(y)  # drop the enum response and generate real values here
        yresp = 0.99 * pyunit_utils.random_dataset_numeric_only(nrows, 1, integerR=1000000,
                                                                misFrac=0, randSeed=dataSeed)
        yresp.set_name(0, y)
        trainFile = trainFile.cbind(yresp)
        myX = trainFile.names
        myX.remove(y)
        enumCols = myX[0:enumCols]
        h2oModelD = H2OXGBoostEstimator(**h2oParamsD)
        # gather, print and save performance numbers for h2o model
        h2oModelD.train(x=myX, y=y, training_frame=trainFile)
        h2oTrainTimeD = h2oModelD._model_json["output"]["run_time"]
        time1 = time.time()
        h2oPredictD = h2oModelD.predict(trainFile)
        h2oPredictTimeD = time.time() - time1
        # train the native XGBoost
        nativeTrain = pyunit_utils.convertH2OFrameToDMatrix(trainFile, y,
                                                            enumCols=enumCols)
        nrounds = ntrees
        # FIX: restart the timer here — previously nativeTrainTime also
        # included the h2o prediction and the DMatrix conversion above.
        time1 = time.time()
        nativeModel = xgb.train(params=nativeParam, dtrain=nativeTrain,
                                num_boost_round=nrounds)  # need to specify one more to get the right number
        nativeTrainTime = time.time() - time1
        # create a "test" matrix - it will be identical to "train" matrix but it will not have any cached predictions
        # if we tried to use matrix `nativeTrain` predict(..) will not actually compute anything it will return the cached predictions
        # cached predictions are slightly different from the actual predictions
        nativeTest = pyunit_utils.convertH2OFrameToDMatrix(trainFile, y,
                                                           enumCols=enumCols)
        time1 = time.time()
        nativePred = nativeModel.predict(data=nativeTest, ntree_limit=ntrees)
        nativeScoreTime = time.time() - time1
        pyunit_utils.summarizeResult_regression(h2oPredictD, nativePred, h2oTrainTimeD,
                                                nativeTrainTime, h2oPredictTimeD,
                                                nativeScoreTime, tolerance=testTol)
    else:
        print("******** Test skipped. This test cannot be performed in multinode environment.")
def comparison_test_dense():
    """Compare H2O XGBoost (sparse DMatrix, CPU backend) with native XGBoost
    on an all-enum frame with 50% missing values, dumping the native model
    to verify the number of trees built; predictions must agree within
    ``testTol``. Skipped on multinode clusters (results would not be
    comparable).

    Note: the native model predicts on its own training matrix, so it
    returns cached predictions that can differ slightly from freshly
    computed ones — hence the looser 1e-6 tolerance.
    """
    assert H2OXGBoostEstimator.available() is True
    ret = h2o.cluster()
    if len(ret.nodes) == 1:
        runSeed = 1
        dataSeed = 17
        testTol = 1e-6
        ntrees = 17
        maxdepth = 5
        # CPU Backend is forced for the results to be comparable
        h2oParamsD = {"ntrees": ntrees, "max_depth": maxdepth, "seed": runSeed,
                      "learn_rate": 0.7, "col_sample_rate_per_tree": 0.9,
                      "min_rows": 5, "score_tree_interval": ntrees + 1,
                      "dmatrix_type": "sparse", "tree_method": "exact",
                      "backend": "cpu"}
        # Native parameters mirror the H2O settings so both models train the
        # same way.
        nativeParam = {'colsample_bytree': h2oParamsD["col_sample_rate_per_tree"],
                       'tree_method': 'exact',
                       'seed': h2oParamsD["seed"],
                       'booster': 'gbtree',
                       'objective': 'reg:linear',
                       'lambda': 0.0,
                       'eta': h2oParamsD["learn_rate"],
                       'grow_policy': 'depthwise',
                       'alpha': 0.0,
                       'subsample': 1.0,
                       'colsample_bylevel': 1.0,
                       'max_delta_step': 0.0,
                       'min_child_weight': h2oParamsD["min_rows"],
                       'gamma': 0.0,
                       'max_depth': h2oParamsD["max_depth"]}
        nrows = 10000
        ncols = 10
        factorL = 11
        numCols = 0
        enumCols = ncols - numCols
        responseL = 2
        trainFile = pyunit_utils.genTrainFrame(nrows, numCols, enumCols=enumCols,
                                               enumFactors=factorL, miscfrac=0.5,
                                               responseLevel=responseL, randseed=dataSeed)
        y = 'response'
        trainFile = trainFile.drop(y)  # drop the enum response and generate real values here
        yresp = 0.99 * pyunit_utils.random_dataset_numeric_only(10000, 1, integerR=1000000,
                                                                misFrac=0, randSeed=dataSeed)
        yresp.set_name(0, y)
        trainFile = trainFile.cbind(yresp)
        myX = trainFile.names
        myX.remove(y)
        enumCols = myX[0:enumCols]
        h2oModelD = H2OXGBoostEstimator(**h2oParamsD)
        # gather, print and save performance numbers for h2o model
        h2oModelD.train(x=myX, y=y, training_frame=trainFile)
        h2oTrainTimeD = h2oModelD._model_json["output"]["run_time"]
        time1 = time.time()
        h2oPredictD = h2oModelD.predict(trainFile)
        h2oPredictTimeD = time.time() - time1
        # train the native XGBoost
        nativeTrain = pyunit_utils.convertH2OFrameToDMatrixSparse(trainFile, y,
                                                                  enumCols=enumCols)
        nrounds = ntrees
        # FIX: time only the xgb.train call — previously nativeTrainTime also
        # included the h2o prediction, the DMatrix conversion, and the model
        # dump/printing below.
        time1 = time.time()
        nativeModel = xgb.train(params=nativeParam, dtrain=nativeTrain,
                                num_boost_round=nrounds)
        nativeTrainTime = time.time() - time1
        modelInfo = nativeModel.get_dump()
        print(modelInfo)
        print("num_boost_round: {1}, Number of trees built: {0}".format(len(modelInfo), nrounds))
        time1 = time.time()
        nativePred = nativeModel.predict(data=nativeTrain, ntree_limit=ntrees)
        nativeScoreTime = time.time() - time1
        pyunit_utils.summarizeResult_regression(h2oPredictD, nativePred, h2oTrainTimeD,
                                                nativeTrainTime, h2oPredictTimeD,
                                                nativeScoreTime, tolerance=testTol)
    else:
        print("******** Test skipped. This test cannot be performed in multinode environment.")