Code example #1
# Assumed imports (not shown in the original excerpt):
from random import randint
from tests import pyunit_utils
def generate_trainingFrame():
    nrows = 1000000  # per Nidhi's request
    trainGroup = pyunit_utils.random_dataset_enums_only(nrows, randint(1, 3), randint(2, 10))  # categorical group-by columns
    trainEnums = pyunit_utils.random_dataset_numeric_only(nrows, randint(1, 3), randint(20, 100))  # numeric columns to sort
    sortColumnsNames = ["sort0", "sort1", "sort2"]
    trainEnums.set_names(sortColumnsNames[0:trainEnums.ncols])
    groupNames = ["GroupByCols0","GroupByCols1","GroupByCols2"]
    trainGroup.set_names(groupNames[0:trainGroup.ncols])
    finalTrain = trainGroup.cbind(trainEnums) # this will be the training frame
    return finalTrain, trainGroup.names, trainEnums.names
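A minimal driver sketch (not part of the original test; assumes a running h2o cluster and H2OFrame.sort) showing how the returned tuple is typically consumed in a sort test:
# Usage sketch (hypothetical): build the frame, then sort by the generated columns.
train, groupCols, sortCols = generate_trainingFrame()
sortedFrame = train.sort(sortCols)  # H2OFrame.sort orders rows by the named columns
print(sortedFrame.head(rows=5))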
Code example #2
# Assumed imports (not shown in the original excerpt):
from random import randint
from tests import pyunit_utils
def generate_trainingFrame():
    nrows = 1000000  # per Michalk's request
    temp = pyunit_utils.random_dataset_int_only(nrows, 1, 1, misFrac=0.01)  # one column of value 1, ~1% missing
    trainGroup = temp - temp  # all zeros (NAs preserved) -> effectively a single group-by group
    trainEnums = pyunit_utils.random_dataset_numeric_only(nrows, randint(1,3), randint(20, 100))   # columns to sort
    sortColumnsNames = ["sort0", "sort1", "sort2"]
    trainEnums.set_names(sortColumnsNames[0:trainEnums.ncols])
    groupNames = ["GroupByCols0","GroupByCols1","GroupByCols2"]
    trainGroup.set_names(groupNames[0:trainGroup.ncols])
    finalTrain = trainGroup.cbind(trainEnums) # this will be the training frame
    return finalTrain, trainGroup.names, trainEnums.names
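The "temp - temp" trick above deserves a note: subtracting a frame from itself yields a constant zero column, so the group-by sees exactly one group. A tiny sketch (assumes a running h2o cluster):
# Sketch: a frame minus itself is all zeros -> a single group-by group.
import h2o
h2o.init()
temp = h2o.H2OFrame([[3], [7], [2]])
zeros = temp - temp
print(zeros.unique().nrow)  # 1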
Code example #3
# Assumed imports (not shown in the original excerpt):
from random import randint
from tests import pyunit_utils
def generate_trainingFrame():
    nrows = 1000000  # per Michalk's request
    temp = pyunit_utils.random_dataset_int_only(nrows, 1, 1, misFrac=0.01)  # one column of value 1, ~1% missing
    trainGroup = temp - temp  # all zeros (NAs preserved) -> effectively a single group-by group
    trainEnums = pyunit_utils.random_dataset_numeric_only(nrows, randint(1,3), randint(20, 100))   # columns to sort
    sortColumnsNames = ["sort0", "sort1", "sort2"]
    trainEnums.set_names(sortColumnsNames[0:trainEnums.ncols])
    groupNames = ["GroupByCols0","GroupByCols1","GroupByCols2"]
    trainGroup.set_names(groupNames[0:trainGroup.ncols])
    finalTrain = trainGroup.cbind(trainEnums) # this will be the training frame
    return finalTrain, trainGroup.names, trainEnums.names
Code example #4
# Assumed imports (not shown in the original excerpt):
from random import randint
from tests import pyunit_utils
def generate_trainingFrame():
    nrows = 1000000  # per Nidhi's request
    trainGroup = pyunit_utils.random_dataset_enums_only(
        nrows, randint(1, 3), randint(2, 10))
    trainEnums = pyunit_utils.random_dataset_numeric_only(
        nrows, randint(1, 3), randint(20, 100))  # columns to sort
    sortColumnsNames = ["sort0", "sort1", "sort2"]
    trainEnums.set_names(sortColumnsNames[0:trainEnums.ncols])
    groupNames = ["GroupByCols0", "GroupByCols1", "GroupByCols2"]
    trainGroup.set_names(groupNames[0:trainGroup.ncols])
    finalTrain = trainGroup.cbind(
        trainEnums)  # this will be the training frame
    return finalTrain, trainGroup.names, trainEnums.names
Code example #5
# Assumed imports (not shown in the original excerpt):
import sys
import time
import h2o
from tests import pyunit_utils
from h2o.estimators.xgboost import H2OXGBoostEstimator
def comparison_test_dense():
    if sys.version.startswith("2"):
        print("native XGBoost tests only supported on python3")
        return
    import xgboost as xgb
    assert H2OXGBoostEstimator.available() is True
    ret = h2o.cluster()
    if len(ret.nodes) == 1:
        runSeed = 1
        dataSeed = 17
        testTol = 1e-10
        ntrees = 17
        maxdepth = 5
        # CPU Backend is forced for the results to be comparable
        h2oParamsD = {"ntrees":ntrees, "max_depth":maxdepth, "seed":runSeed, "learn_rate":0.7, "col_sample_rate_per_tree" : 0.9,
                     "min_rows" : 5, "score_tree_interval": ntrees+1, "dmatrix_type":"sparse", "tree_method": "exact", "backend":"cpu"}
        nativeParam = {'colsample_bytree': h2oParamsD["col_sample_rate_per_tree"],
                       'tree_method': 'exact',
                       'seed': h2oParamsD["seed"],
                       'booster': 'gbtree',
                       'objective': 'reg:linear',  # deprecated alias of 'reg:squarederror' in newer XGBoost
                       'eta': h2oParamsD["learn_rate"],
                       'grow_policy': 'depthwise',
                       'alpha': 0.0,
                       'subsample': 1.0,
                       'colsample_bylevel': 1.0,
                       'max_delta_step': 0.0,
                       'min_child_weight': h2oParamsD["min_rows"],
                       'gamma': 0.0,
                       'max_depth': h2oParamsD["max_depth"]}

        nrows = 10000
        ncols = 10
        factorL = 11
        numCols = 0
        enumCols = ncols-numCols
        responseL = 2

        trainFile = pyunit_utils.genTrainFrame(nrows, numCols, enumCols=enumCols, enumFactors=factorL, miscfrac=0.5,
                                               responseLevel=responseL, randseed=dataSeed)


        y = 'response'
        trainFile = trainFile.drop(y)   # drop the enum response and generate real values here
        yresp = 0.99 * pyunit_utils.random_dataset_numeric_only(nrows, 1, integerR=1000000, misFrac=0, randSeed=dataSeed)
        yresp.set_name(0, y)
        trainFile = trainFile.cbind(yresp)
        myX = trainFile.names
        myX.remove(y)
        enumCols = myX[0:enumCols]

        h2oModelD = H2OXGBoostEstimator(**h2oParamsD)
        # gather, print and save performance numbers for h2o model
        h2oModelD.train(x=myX, y=y, training_frame=trainFile)
        h2oTrainTimeD = h2oModelD._model_json["output"]["run_time"]
        time1 = time.time()
        h2oPredictD = h2oModelD.predict(trainFile)
        h2oPredictTimeD = time.time()-time1

        # train the native XGBoost
        nativeTrain = pyunit_utils.convertH2OFrameToDMatrixSparse(trainFile, y, enumCols=enumCols)
        nrounds = ntrees
        time1 = time.time()  # reset the timer so native training time measures xgb.train only
        nativeModel = xgb.train(params=nativeParam, dtrain=nativeTrain, num_boost_round=nrounds)
        nativeTrainTime = time.time() - time1
        # create a "test" matrix - it will be identical to "train" matrix but it will not have any cached predictions
        # if we tried to use matrix `nativeTrain` predict(..) will not actually compute anything it will return the cached predictions
        # cached predictions are slightly different from the actual predictions
        nativeTest = pyunit_utils.convertH2OFrameToDMatrixSparse(trainFile, y, enumCols=enumCols)
        time1 = time.time()
        nativePred = nativeModel.predict(data=nativeTest, ntree_limit=ntrees)
        nativeScoreTime = time.time()-time1

        pyunit_utils.summarizeResult_regression(h2oPredictD, nativePred, h2oTrainTimeD, nativeTrainTime, h2oPredictTimeD,
                                                nativeScoreTime, tolerance=testTol)
    else:
        print("********  Test skipped.  This test cannot be performed in multinode environment.")
Code example #6
# Assumed imports (not shown in the original excerpt):
import time
import xgboost as xgb
from tests import pyunit_utils
from h2o.estimators.xgboost import H2OXGBoostEstimator
def comparison_test_dense():
    assert H2OXGBoostEstimator.available() is True

    runSeed = 1
    testTol = 1e-6
    ntrees = 10
    maxdepth = 5
    # CPU Backend is forced for the results to be comparable
    h2oParamsD = {
        "ntrees": ntrees,
        "max_depth": maxdepth,
        "seed": runSeed,
        "learn_rate": 0.7,
        "col_sample_rate_per_tree": 0.9,
        "min_rows": 5,
        "score_tree_interval": ntrees + 1,
        "dmatrix_type": "dense",
        "tree_method": "exact",
        "backend": "cpu"
    }
    nativeParam = {
        'colsample_bytree': h2oParamsD["col_sample_rate_per_tree"],
        'tree_method': 'exact',
        'seed': h2oParamsD["seed"],
        'booster': 'gbtree',
        'objective': 'reg:linear',
        'lambda': 0.0,
        'eta': h2oParamsD["learn_rate"],
        'grow_policy': 'depthwise',
        'alpha': 0.0,
        'subsample': 1.0,
        'colsample_bylevel': 1.0,
        'max_delta_step': 0.0,
        'min_child_weight': h2oParamsD["min_rows"],
        'gamma': 0.0,
        'max_depth': h2oParamsD["max_depth"]
    }

    nrows = 10000
    ncols = 10
    factorL = 20
    numCols = 5
    enumCols = ncols - numCols

    trainFile = pyunit_utils.genTrainFrame(
        nrows, numCols, enumCols=enumCols, enumFactors=factorL,
        miscfrac=0.01)  # load in dataset and add response column
    y = 'response'
    trainFile = trainFile.drop(
        y)  # drop the enum response and generate real values here
    yresp = 0.99 * pyunit_utils.random_dataset_numeric_only(
        nrows, 1, integerR=1000000, misFrac=0)
    yresp.set_name(0, y)
    trainFile = trainFile.cbind(yresp)
    myX = trainFile.names
    myX.remove(y)
    enumCols = myX[0:enumCols]

    h2oModelD = H2OXGBoostEstimator(**h2oParamsD)
    # gather, print and save performance numbers for h2o model
    h2oModelD.train(x=myX, y=y, training_frame=trainFile)
    h2oTrainTimeD = h2oModelD._model_json["output"]["run_time"]
    time1 = time.time()
    h2oPredictD = h2oModelD.predict(trainFile)
    h2oPredictTimeD = time.time() - time1

    # train the native XGBoost
    nativeTrain = pyunit_utils.convertH2OFrameToDMatrix(trainFile,
                                                        y,
                                                        enumCols=enumCols)
    time1 = time.time()  # reset the timer so native training time measures xgb.train only
    nativeModel = xgb.train(params=nativeParam, dtrain=nativeTrain)  # num_boost_round defaults to 10, matching ntrees
    nativeTrainTime = time.time() - time1
    time1 = time.time()
    nativePred = nativeModel.predict(data=nativeTrain, ntree_limit=ntrees)  # may return cached predictions (see the caveat in Code example #5)
    nativeScoreTime = time.time() - time1

    pyunit_utils.summarizeResult_regression(h2oPredictD,
                                            nativePred,
                                            h2oTrainTimeD,
                                            nativeTrainTime,
                                            h2oPredictTimeD,
                                            nativeScoreTime,
                                            tolerance=testTol)
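convertH2OFrameToDMatrix is likewise an H2O test helper; a hedged sketch of the conversion it presumably performs (one-hot encoding of the enum columns via pandas, then a dense xgb.DMatrix; the real implementation may differ in encoding details):
# Hedged sketch of the assumed H2OFrame -> DMatrix conversion.
import pandas as pd
import xgboost as xgb
def frame_to_dmatrix(h2oFrame, y, enumCols):
    pdf = h2oFrame.as_data_frame()               # H2OFrame -> pandas DataFrame
    pdf = pd.get_dummies(pdf, columns=enumCols)  # one-hot encode the enum columns
    label = pdf.pop(y)                           # split off the response
    return xgb.DMatrix(pdf.values, label=label.values)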
Code example #7
# Assumed imports (not shown in the original excerpt):
import time
import h2o
import xgboost as xgb
from tests import pyunit_utils
from h2o.estimators.xgboost import H2OXGBoostEstimator
def comparison_test_dense():
    assert H2OXGBoostEstimator.available() is True
    ret = h2o.cluster()
    if len(ret.nodes) == 1:
        runSeed = 1
        dataSeed = 17
        testTol = 1e-10
        ntrees = 17
        maxdepth = 5
        # CPU Backend is forced for the results to be comparable
        h2oParamsD = {"ntrees":ntrees, "max_depth":maxdepth, "seed":runSeed, "learn_rate":0.7, "col_sample_rate_per_tree" : 0.9,
                      "min_rows" : 5, "score_tree_interval": ntrees+1, "dmatrix_type":"dense","tree_method": "exact", "backend":"cpu"}
        nativeParam = {'colsample_bytree': h2oParamsD["col_sample_rate_per_tree"],
                       'tree_method': 'exact',
                       'seed': h2oParamsD["seed"],
                       'booster': 'gbtree',
                       'objective': 'reg:linear',
                       'eta': h2oParamsD["learn_rate"],
                       'grow_policy': 'depthwise',
                       'alpha': 0.0,
                       'subsample': 1.0,
                       'colsample_bylevel': 1.0,
                       'max_delta_step': 0.0,
                       'min_child_weight': h2oParamsD["min_rows"],
                       'gamma': 0.0,
                       'max_depth': h2oParamsD["max_depth"]}

        nrows = 10000
        ncols = 10
        factorL = 20
        numCols = 5
        enumCols = ncols-numCols

        trainFile = pyunit_utils.genTrainFrame(nrows, numCols, enumCols=enumCols, enumFactors=factorL, miscfrac=0.01,
                                               randseed=dataSeed)     # load in dataset and add response column
        y = 'response'
        trainFile = trainFile.drop(y)   # drop the enum response and generate real values here
        yresp = 0.99 * pyunit_utils.random_dataset_numeric_only(nrows, 1, integerR=1000000, misFrac=0, randSeed=dataSeed)
        yresp.set_name(0, y)
        trainFile = trainFile.cbind(yresp)
        myX = trainFile.names
        myX.remove(y)
        enumCols = myX[0:enumCols]

        h2oModelD = H2OXGBoostEstimator(**h2oParamsD)
        # gather, print and save performance numbers for h2o model
        h2oModelD.train(x=myX, y=y, training_frame=trainFile)
        h2oTrainTimeD = h2oModelD._model_json["output"]["run_time"]
        time1 = time.time()
        h2oPredictD = h2oModelD.predict(trainFile)
        h2oPredictTimeD = time.time()-time1

        # train the native XGBoost
        nativeTrain = pyunit_utils.convertH2OFrameToDMatrix(trainFile, y, enumCols=enumCols)
        nrounds = ntrees
        time1 = time.time()  # reset the timer so native training time measures xgb.train only
        nativeModel = xgb.train(params=nativeParam, dtrain=nativeTrain, num_boost_round=nrounds)  # num_boost_round must be explicit: the default of 10 would not match ntrees
        nativeTrainTime = time.time() - time1
        # create a "test" matrix - it will be identical to "train" matrix but it will not have any cached predictions
        # if we tried to use matrix `nativeTrain` predict(..) will not actually compute anything it will return the cached predictions
        # cached predictions are slightly different from the actual predictions
        nativeTest = pyunit_utils.convertH2OFrameToDMatrix(trainFile, y, enumCols=enumCols)
        time1 = time.time()
        nativePred = nativeModel.predict(data=nativeTest, ntree_limit=ntrees)
        nativeScoreTime = time.time()-time1

        pyunit_utils.summarizeResult_regression(h2oPredictD, nativePred, h2oTrainTimeD, nativeTrainTime, h2oPredictTimeD,
                                                nativeScoreTime, tolerance=testTol)
    else:
        print("********  Test skipped.  This test cannot be performed in multinode environment.")
Code example #8
# Assumed imports (not shown in the original excerpt):
import time
import h2o
import xgboost as xgb
from tests import pyunit_utils
from h2o.estimators.xgboost import H2OXGBoostEstimator
def comparison_test_dense():
    assert H2OXGBoostEstimator.available() is True
    ret = h2o.cluster()
    if len(ret.nodes) == 1:
        runSeed = 1
        dataSeed = 17
        testTol = 1e-6
        ntrees = 17
        maxdepth = 5
        # CPU Backend is forced for the results to be comparable
        h2oParamsD = {"ntrees":ntrees, "max_depth":maxdepth, "seed":runSeed, "learn_rate":0.7, "col_sample_rate_per_tree" : 0.9,
                     "min_rows" : 5, "score_tree_interval": ntrees+1, "dmatrix_type":"sparse", "tree_method": "exact", "backend":"cpu"}
        nativeParam = {'colsample_bytree': h2oParamsD["col_sample_rate_per_tree"],
                       'tree_method': 'exact',
                       'seed': h2oParamsD["seed"],
                       'booster': 'gbtree',
                       'objective': 'reg:linear',
                       'lambda': 0.0,
                       'eta': h2oParamsD["learn_rate"],
                       'grow_policy': 'depthwise',
                       'alpha': 0.0,
                       'subsample': 1.0,
                       'colsample_bylevel': 1.0,
                       'max_delta_step': 0.0,
                       'min_child_weight': h2oParamsD["min_rows"],
                       'gamma': 0.0,
                       'max_depth': h2oParamsD["max_depth"]}

        nrows = 10000
        ncols = 10
        factorL = 11
        numCols = 0
        enumCols = ncols-numCols
        responseL = 2

        trainFile = pyunit_utils.genTrainFrame(nrows, numCols, enumCols=enumCols, enumFactors=factorL, miscfrac=0.5,
                                               responseLevel=responseL, randseed=dataSeed)


        y = 'response'
        trainFile = trainFile.drop(y)   # drop the enum response and generate real values here
        yresp = 0.99 * pyunit_utils.random_dataset_numeric_only(nrows, 1, integerR=1000000, misFrac=0, randSeed=dataSeed)
        yresp.set_name(0, y)
        trainFile = trainFile.cbind(yresp)
        myX = trainFile.names
        myX.remove(y)
        enumCols = myX[0:enumCols]

        h2oModelD = H2OXGBoostEstimator(**h2oParamsD)
        # gather, print and save performance numbers for h2o model
        h2oModelD.train(x=myX, y=y, training_frame=trainFile)
        h2oTrainTimeD = h2oModelD._model_json["output"]["run_time"]
        time1 = time.time()
        h2oPredictD = h2oModelD.predict(trainFile)
        h2oPredictTimeD = time.time()-time1

        # train the native XGBoost
        nativeTrain = pyunit_utils.convertH2OFrameToDMatrixSparse(trainFile, y, enumCols=enumCols)
        nrounds = ntrees
        time1 = time.time()  # reset the timer so native training time measures xgb.train only
        nativeModel = xgb.train(params=nativeParam, dtrain=nativeTrain, num_boost_round=nrounds)
        nativeTrainTime = time.time() - time1
        modelInfo = nativeModel.get_dump()
        print(modelInfo)
        print("num_boost_round: {0}, Number of trees built: {1}".format(nrounds, len(modelInfo)))
        time1 = time.time()
        nativePred = nativeModel.predict(data=nativeTrain, ntree_limit=ntrees)
        nativeScoreTime = time.time()-time1

        pyunit_utils.summarizeResult_regression(h2oPredictD, nativePred, h2oTrainTimeD, nativeTrainTime, h2oPredictTimeD,
                                                nativeScoreTime, tolerance=testTol)
    else:
        print("********  Test skipped.  This test cannot be performed in multinode environment.")