Example #1
def daal_model(df):
    # Split the frame 80/10/10 into train, test, and prediction windows
    TRAIN_SIZE, TEST_SIZE, PRED_SIZE = int(args.row_count * 0.8), int(
        args.row_count * 0.1), int(args.row_count * 0.1)
    train_df, test_df, pred_df = df.iloc[:TRAIN_SIZE], df.iloc[
        TRAIN_SIZE:TRAIN_SIZE + TEST_SIZE], df.iloc[-PRED_SIZE:].copy()
    train_x, train_y = train_df[args.src_cols], train_df[args.tgt_col]
    test_x, test_y = test_df[args.src_cols], test_df[args.tgt_col]
    pred_x = pred_df[args.src_cols]

    # Grid-search the daal4py GBT hyperparameters, scoring by mean absolute
    # percentage error (MAPE) on the test window
    best_score = float("inf")
    best_params = None
    params_grid = ParameterGrid(args.daal_params_grid)
    for params in params_grid:
        model = d4p.gbt_regression_training(**params)
        train_result = model.compute(train_x, train_y)
        predict_alg = d4p.gbt_regression_prediction()
        pred = predict_alg.compute(test_x, train_result.model).prediction
        # prediction is (n, 1); flatten it so the division broadcasts row-wise
        score = np.mean(np.abs(pred.ravel() - test_y.values) / test_y.values)
        if score < best_score:
            best_score = score
            best_params = params

    # Refit the best configuration on train + test, then score the hold-out
    best_model = d4p.gbt_regression_training(**best_params)
    best_train_result = best_model.compute(
        df.iloc[:TRAIN_SIZE + TEST_SIZE][args.src_cols],
        df.iloc[:TRAIN_SIZE + TEST_SIZE][args.tgt_col])
    best_predict_alg = d4p.gbt_regression_prediction()
    pred_df['pred'] = best_predict_alg.compute(
        pred_x, best_train_result.model).prediction.ravel()
    return pred_df
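The snippet assumes a module-level `args` object and the imports used above. A minimal sketch of that setup (the column names, row count, and grid values here are illustrative, not from the original):

import argparse
import daal4py as d4p
import numpy as np
import pandas as pd
from sklearn.model_selection import ParameterGrid

args = argparse.Namespace(
    row_count=1000,
    src_cols=['f0', 'f1', 'f2'],
    tgt_col='target',
    daal_params_grid={'maxIterations': [50, 100], 'maxTreeDepth': [4, 6]},
)
df = pd.DataFrame(np.random.rand(args.row_count, 4),
                  columns=args.src_cols + [args.tgt_col])
pred_df = daal_model(df)  # last 10% of rows, with a 'pred' column added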
Example #2
    def train_daal(pd_df):
        # Note: this is a nested helper; `self` below is captured from the
        # enclosing method's scope rather than passed as a parameter
        import daal4py

        dxgb_daal_params = {
            "fptype": "float",
            "maxIterations": 100,
            "maxTreeDepth": 8,
            "minSplitLoss": 0.1,
            "shrinkage": 0.1,
            "observationsPerTreeFraction": 1,
            "lambda_": 1,
            "minObservationsInLeafNode": 1,
            "maxBins": 256,
            "featuresPerNode": 0,
            "minBinSize": 5,
            "memorySavingMode": False,
        }

        t0 = default_timer()
        y = np.ascontiguousarray(pd_df["delinquency_12"],
                                 dtype=np.float32).reshape(len(pd_df), 1)
        x = np.ascontiguousarray(pd_df.drop(["delinquency_12"], axis=1),
                                 dtype=np.float32)
        t1 = default_timer()
        self.t_dmatrix = t1 - t0
        # print("Convert x,y from 64 to 32:", t1-t0)

        train_algo = daal4py.gbt_regression_training(**dxgb_daal_params)
        t0 = default_timer()
        train_result = train_algo.compute(x, y)
        self.t_train = default_timer() - t0
        # print("TRAINING TIME:", default_timer()-t0)
        return train_result
Example #3
def main():
    maxIterations = 40

    # input data file
    infile = "./data/batch/df_regression_train.csv"
    testfile = "./data/batch/df_regression_test.csv"

    # Configure a training object
    train_algo = d4p.gbt_regression_training(maxIterations=maxIterations)

    # Read training data: 13 features per observation
    data = read_csv(infile, range(13))
    deps = read_csv(infile, range(13, 14))
    train_result = train_algo.compute(data, deps)

    # Now let's do some prediction
    predict_algo = d4p.gbt_regression_prediction()
    # read test data (with same #features)
    pdata = read_csv(testfile, range(13))
    ptdata = read_csv(testfile, range(13, 14))
    # now predict using the model from the training above
    predict_result = predict_algo.compute(pdata, train_result.model)

    # The result provides one prediction per test observation
    assert predict_result.prediction.shape == (pdata.shape[0], 1)

    return (train_result, predict_result, ptdata)
Example #4
def compute(train_indep_data, train_dep_data, test_indep_data, maxIterations):
    # Configure a training object
    train_algo = d4p.gbt_regression_training(maxIterations=maxIterations)
    train_result = train_algo.compute(train_indep_data, train_dep_data)
    # Now let's do some prediction
    predict_algo = d4p.gbt_regression_prediction()
    # now predict using the model from the training above
    return predict_algo.compute(test_indep_data, train_result.model)
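For instance, the helper above can be exercised end to end with synthetic arrays (a sketch; the shapes and iteration count are arbitrary):

import daal4py as d4p
import numpy as np

train_x = np.random.rand(1000, 13).astype(np.float32)
train_y = np.random.rand(1000, 1).astype(np.float32)
test_x = np.random.rand(100, 13).astype(np.float32)

predict_result = compute(train_x, train_y, test_x, maxIterations=40)
assert predict_result.prediction.shape == (100, 1)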
Example #5
    def fit(self, X, y):
        # Check the algorithm parameters
        self._check_params()

        # Check that X and y have correct shape
        X, y = check_X_y(X, y, y_numeric=True, dtype=[np.single, np.double])

        # Convert to 2d array
        y_ = y.reshape((-1, 1))

        self.n_features_ = X.shape[1]

        # Get random seed
        rs_ = check_random_state(self.random_state)
        seed_ = rs_.randint(0, np.iinfo('i').max)

        # Define type of data
        fptype = getFPType(X)

        # Fit the model
        train_algo = d4p.gbt_regression_training(
            fptype=fptype,
            splitMethod=self.split_method,
            maxIterations=self.max_iterations,
            maxTreeDepth=self.max_tree_depth,
            shrinkage=self.shrinkage,
            minSplitLoss=self.min_split_loss,
            lambda_=self.reg_lambda,
            observationsPerTreeFraction=self.observations_per_tree_fraction,
            featuresPerNode=self.features_per_node,
            minObservationsInLeafNode=self.min_observations_in_leaf_node,
            memorySavingMode=self.memory_saving_mode,
            maxBins=self.max_bins,
            minBinSize=self.min_bin_size,
            engine=d4p.engines_mcg59(seed=seed_))
        train_result = train_algo.compute(X, y_)

        # Store the model
        self.daal_model_ = train_result.model

        # Return the regressor
        return self
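The example stores the trained model but does not show scoring. A hypothetical companion predict method, assuming the same getFPType helper and sklearn's check_array validator used alongside fit, might look like:

    def predict(self, X):
        # Validate input the same way fit() does
        X = check_array(X, dtype=[np.single, np.double])
        fptype = getFPType(X)
        # Score with the daal4py model stored by fit()
        predict_algo = d4p.gbt_regression_prediction(fptype=fptype)
        return predict_algo.compute(X, self.daal_model_).prediction.ravel()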
Example #6
def main(readcsv=read_csv, method='defaultDense'):
    maxIterations = 200

    # input data file
    infile = "./data/batch/df_regression_train.csv"
    testfile = "./data/batch/df_regression_test.csv"

    # Configure a training object
    train_algo = d4p.gbt_regression_training(maxIterations=maxIterations)

    # Read training data: 13 features per observation
    data = readcsv(infile, range(13), t=np.float32)
    deps = readcsv(infile, range(13, 14), t=np.float32)
    train_result = train_algo.compute(data, deps)

    # Now let's do some prediction
    predict_algo = d4p.gbt_regression_prediction()
    # read test data (with same #features)
    pdata = readcsv(testfile, range(13), t=np.float32)
    # now predict using the model from the training above
    predict_result = predict_algo.compute(pdata, train_result.model)

    # Load the reference targets to check the predictions against
    ptdata = np.loadtxt(testfile,
                        usecols=range(13, 14),
                        delimiter=',',
                        ndmin=2,
                        dtype=np.float32)
    # ptdata = np.loadtxt('../tests/unittest_data/gradient_boosted_regression_batch.csv',
    #                     delimiter=',', ndmin=2, dtype=np.float32)
    if hasattr(ptdata, 'toarray'):
        # make the assertion below work with scipy's csr_matrix
        ptdata = ptdata.toarray()
    assert np.square(predict_result.prediction - ptdata).mean() < 1e-2, \
        np.square(predict_result.prediction - ptdata).mean()

    return (train_result, predict_result, ptdata)
Example #7
def train_daal(pd_df):
    import daal4py
    import numpy as np

    dxgb_daal_params = {
        'fptype': 'float',
        'maxIterations': 100,
        'maxTreeDepth': 8,
        'minSplitLoss': 0.1,
        'shrinkage': 0.1,
        'observationsPerTreeFraction': 1,
        'lambda_': 1,
        'minObservationsInLeafNode': 1,
        'maxBins': 256,
        'featuresPerNode': 0,
        'minBinSize': 5,
        'memorySavingMode': False,
    }

    # Convert the frame to contiguous float32 arrays for daal4py
    y = np.ascontiguousarray(pd_df["delinquency_12"],
                             dtype=np.float32).reshape(len(pd_df), 1)
    x = np.ascontiguousarray(pd_df.drop(["delinquency_12"], axis=1),
                             dtype=np.float32)

    train_algo = daal4py.gbt_regression_training(**dxgb_daal_params)
    train_result = train_algo.compute(x, y)
    return train_result
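A matching inference step for the model returned above could look like this (a sketch that reuses the training frame's feature columns):

import daal4py
import numpy as np

train_result = train_daal(pd_df)
features = np.ascontiguousarray(pd_df.drop(["delinquency_12"], axis=1),
                                dtype=np.float32)
predict_algo = daal4py.gbt_regression_prediction(fptype="float")
preds = predict_algo.compute(features, train_result.model).prediction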
Example #8
from timeit import default_timer as timer

import daal4py as d4p
import numpy as np
import pandas as pd

import common

NUM_LOOPS = 100
PARAMS = {'nIterations': 10, 'method': 'defaultDense', 'fptype': 'double'}

gbt = d4p.gbt_regression_training(maxIterations=200)
MODEL = gbt.compute(pd.DataFrame(common.X, dtype=np.float32),
                    pd.DataFrame(common.y, dtype=np.float32)).model


def run_inference(num_observations: int = 1000):
    """Run xgboost for specified number of observations"""
    # Load data
    test_df = common.get_test_data(num_observations)
    data = pd.DataFrame(test_df, dtype=np.float32)
    predictor = d4p.gbt_regression_prediction(**PARAMS)
    num_rows = len(test_df)

    run_times = []
    inference_times = []
    for _ in range(NUM_LOOPS):

        start_time = timer()
        predictor.compute(data, MODEL)
        end_time = timer()

        # Record total call time and amortized per-row inference time
        run_times.append(end_time - start_time)
        inference_times.append((end_time - start_time) / num_rows)