def daal_model(df):
    """Grid-search a daal4py GBT regressor and predict the tail rows of ``df``.

    The row budget ``args.row_count`` is split 80/10/10: the first 80% of
    rows train each candidate, the following 10% score it, and the last 10%
    of ``df`` are the rows to predict. Every parameter combination in
    ``args.daal_params_grid`` is scored by the mean of
    ``|prediction - target| / target`` on the scoring rows; the best
    combination is then refit on the training and scoring rows together.

    Args:
        df: DataFrame containing the ``args.src_cols`` feature columns and
            the ``args.tgt_col`` target column.

    Returns:
        A copy of the last ``int(args.row_count * 0.1)`` rows of ``df`` with
        an added ``'pred'`` column holding the model's predictions.

    Raises:
        ValueError: if no parameter combination yields a usable score
            (empty grid, or every score is NaN/inf).
    """
    train_size = int(args.row_count * 0.8)
    test_size = int(args.row_count * 0.1)
    pred_size = int(args.row_count * 0.1)
    fit_end = train_size + test_size

    train_df = df.iloc[:train_size]
    test_df = df.iloc[train_size:fit_end]
    # Copy explicitly: a 'pred' column is added below, and writing into a
    # bare slice of ``df`` triggers pandas' chained-assignment warning.
    pred_df = df.iloc[-pred_size:].copy()

    train_x, train_y = train_df[args.src_cols], train_df[args.tgt_col]
    test_x = test_df[args.src_cols]
    test_y = test_df[args.tgt_col].values
    pred_x = pred_df[args.src_cols]

    best_score = float("inf")
    best_params = None
    for params in ParameterGrid(args.daal_params_grid):
        model = d4p.gbt_regression_training(**params)
        train_result = model.compute(train_x, train_y)
        predict_alg = d4p.gbt_regression_prediction()
        pred = predict_alg.compute(test_x, train_result.model).prediction
        # The prediction comes back as a column (n, 1). Flatten it so the
        # subtraction is element-wise; otherwise it broadcasts against the
        # (n,) target into an (n, n) matrix and the mean is meaningless.
        pred = np.asarray(pred).ravel()
        score = np.mean(np.abs(pred - test_y) / test_y)
        if score < best_score:
            best_score = score
            best_params = params

    if best_params is None:
        raise ValueError(
            "no parameter combination in args.daal_params_grid produced a "
            "finite validation score")

    # Refit the winning configuration on every row that is not being predicted.
    fit_df = df.iloc[:fit_end]
    best_model = d4p.gbt_regression_training(**best_params)
    best_train_result = best_model.compute(
        fit_df[args.src_cols], fit_df[args.tgt_col])

    best_predict_alg = d4p.gbt_regression_prediction()
    prediction = best_predict_alg.compute(
        pred_x, best_train_result.model).prediction
    pred_df['pred'] = np.asarray(prediction).ravel()
    return pred_df
def train_daal(pd_df):
    """Train a daal4py GBT regression model on ``pd_df`` and record timings.

    The ``"delinquency_12"`` column is the target; every other column is a
    feature. Both are converted to contiguous float32 arrays before training.
    The conversion time is stored in ``self.t_dmatrix`` and the training time
    in ``self.t_train``.

    NOTE(review): ``self`` is not a parameter of this function; presumably
    it is captured from an enclosing method's scope -- confirm.

    Returns:
        The result object of ``gbt_regression_training(...).compute``.
    """
    import daal4py

    target_col = "delinquency_12"
    gbt_params = dict(
        fptype="float",
        maxIterations=100,
        maxTreeDepth=8,
        minSplitLoss=0.1,
        shrinkage=0.1,
        observationsPerTreeFraction=1,
        lambda_=1,
        minObservationsInLeafNode=1,
        maxBins=256,
        featuresPerNode=0,
        minBinSize=5,
        memorySavingMode=False,
    )

    # Time the float32 conversion of target and features.
    convert_start = default_timer()
    target = np.ascontiguousarray(pd_df[target_col], dtype=np.float32)
    y = target.reshape(len(pd_df), 1)
    features = pd_df.drop([target_col], axis=1)
    x = np.ascontiguousarray(features, dtype=np.float32)
    convert_end = default_timer()
    self.t_dmatrix = convert_end - convert_start

    trainer = daal4py.gbt_regression_training(**gbt_params)

    # Time only the training call itself.
    train_start = default_timer()
    result = trainer.compute(x, y)
    self.t_train = default_timer() - train_start
    return result
def main():
    """Train a GBT regressor on the batch training CSV and predict the test CSV.

    Columns 0-12 of each file are read as features and column 13 as the
    dependent variable. The model is trained with 40 boosting iterations.

    Returns:
        A tuple ``(train_result, predict_result, ptdata)`` where ``ptdata``
        is the dependent column read from the test file.
    """
    train_csv = "./data/batch/df_regression_train.csv"
    test_csv = "./data/batch/df_regression_test.csv"
    feature_cols = range(13)
    target_cols = range(13, 14)

    # Set up the trainer before any data is loaded.
    trainer = d4p.gbt_regression_training(maxIterations=40)

    # Training set: 13 feature columns plus one dependent column.
    train_features = read_csv(train_csv, feature_cols)
    train_targets = read_csv(train_csv, target_cols)
    train_result = trainer.compute(train_features, train_targets)

    predictor = d4p.gbt_regression_prediction()

    # Test set uses the same column layout as the training set.
    pdata = read_csv(test_csv, feature_cols)
    ptdata = read_csv(test_csv, target_cols)

    predict_result = predictor.compute(pdata, train_result.model)

    # One predicted value per test observation.
    expected_shape = (pdata.shape[0], 1)
    assert predict_result.prediction.shape == expected_shape

    return (train_result, predict_result, ptdata)
def compute(train_indep_data, train_dep_data, test_indep_data, maxIterations):
    """Train a GBT regression model and return its prediction on the test data.

    Args:
        train_indep_data: Independent (feature) data used for training.
        train_dep_data: Dependent (target) data used for training.
        test_indep_data: Independent data to predict on.
        maxIterations: Forwarded to ``gbt_regression_training`` as
            ``maxIterations``.

    Returns:
        The result object of ``gbt_regression_prediction().compute``.
    """
    trainer = d4p.gbt_regression_training(maxIterations=maxIterations)
    trained = trainer.compute(train_indep_data, train_dep_data)

    # Predict with the model produced by the training step above.
    predictor = d4p.gbt_regression_prediction()
    return predictor.compute(test_indep_data, trained.model)
def fit(self, X, y):
    """Fit the gradient-boosted-tree regression model to ``X`` and ``y``.

    Validates the estimator parameters and the input arrays, trains a
    daal4py GBT regression model seeded from ``self.random_state``, and
    stores the result on the estimator.

    Sets:
        n_features_: number of columns in the validated ``X``.
        daal_model_: the trained daal4py model.

    Returns:
        ``self``, the fitted estimator.
    """
    self._check_params()

    # Validate inputs; only single/double precision data is accepted.
    X, y = check_X_y(X, y, y_numeric=True, dtype=[np.single, np.double])
    target_column = y.reshape((-1, 1))  # daal4py is given a 2-D target
    self.n_features_ = X.shape[1]

    # Draw an integer seed for the daal4py engine from the random state.
    random_state = check_random_state(self.random_state)
    seed = random_state.randint(0, np.iinfo('i').max)

    # Floating-point type is derived from the validated input data.
    fptype = getFPType(X)

    training_options = dict(
        fptype=fptype,
        splitMethod=self.split_method,
        maxIterations=self.max_iterations,
        maxTreeDepth=self.max_tree_depth,
        shrinkage=self.shrinkage,
        minSplitLoss=self.min_split_loss,
        lambda_=self.reg_lambda,
        observationsPerTreeFraction=self.observations_per_tree_fraction,
        featuresPerNode=self.features_per_node,
        minObservationsInLeafNode=self.min_observations_in_leaf_node,
        memorySavingMode=self.memory_saving_mode,
        maxBins=self.max_bins,
        minBinSize=self.min_bin_size,
        engine=d4p.engines_mcg59(seed=seed),
    )
    trainer = d4p.gbt_regression_training(**training_options)

    # Keep only the trained model on the estimator.
    self.daal_model_ = trainer.compute(X, target_column).model
    return self
def main(readcsv=read_csv, method='defaultDense'):
    """Train a GBT regressor on the batch CSVs using a pluggable CSV reader.

    Columns 0-12 of each file are read as float32 features and column 13 as
    the dependent variable. The model is trained with 200 boosting
    iterations.

    Args:
        readcsv: Callable used to load the feature/target columns; called as
            ``readcsv(path, columns, t=np.float32)``.
        method: Not used by this function body.

    Returns:
        A tuple ``(train_result, predict_result, ptdata)`` where ``ptdata``
        is the test file's dependent column loaded with ``np.loadtxt``.
    """
    train_csv = "./data/batch/df_regression_train.csv"
    test_csv = "./data/batch/df_regression_test.csv"

    # Set up the trainer before any data is loaded.
    trainer = d4p.gbt_regression_training(maxIterations=200)

    # Training set: 13 feature columns plus one dependent column.
    train_features = readcsv(train_csv, range(13), t=np.float32)
    train_targets = readcsv(train_csv, range(13, 14), t=np.float32)
    train_result = trainer.compute(train_features, train_targets)

    predictor = d4p.gbt_regression_prediction()

    # Test features use the same column layout as the training features.
    pdata = readcsv(test_csv, range(13), t=np.float32)
    predict_result = predictor.compute(pdata, train_result.model)

    # Ground-truth values for the test set, always loaded through NumPy.
    ptdata = np.loadtxt(testfile := test_csv, usecols=range(13, 14),
                        delimiter=',', ndmin=2, dtype=np.float32) \
        if False else np.loadtxt(test_csv, usecols=range(13, 14),
                                 delimiter=',', ndmin=2, dtype=np.float32)

    # Densify sparse input so the arithmetic in the check below would work
    # with scipy's csr_matrix.
    if hasattr(ptdata, 'toarray'):
        ptdata = ptdata.toarray()

    # The leading ``True or`` short-circuits this check, so the mean squared
    # error is never evaluated and the assertion cannot fail.
    assert True or \
        np.square(predict_result.prediction - ptdata).mean() < 1e-2, \
        np.square(predict_result.prediction - ptdata).mean()

    return (train_result, predict_result, ptdata)
def train_daal(pd_df):
    """Train a daal4py GBT regression model on ``pd_df``.

    The ``"delinquency_12"`` column is the target; every other column is a
    feature. Both are converted to contiguous float32 arrays before training.

    Returns:
        The result object of ``gbt_regression_training(...).compute``.
    """
    target_col = "delinquency_12"

    # Target as an (n_rows, 1) float32 column.
    target = np.ascontiguousarray(pd_df[target_col], dtype=np.float32)
    y = target.reshape(len(pd_df), 1)

    # Everything except the target column is a feature.
    features = pd_df.drop([target_col], axis=1)
    x = np.ascontiguousarray(features, dtype=np.float32)

    trainer = daal4py.gbt_regression_training(
        fptype='float',
        maxIterations=100,
        maxTreeDepth=8,
        minSplitLoss=0.1,
        shrinkage=0.1,
        observationsPerTreeFraction=1,
        lambda_=1,
        minObservationsInLeafNode=1,
        maxBins=256,
        featuresPerNode=0,
        minBinSize=5,
        memorySavingMode=False,
    )
    return trainer.compute(x, y)
from timeit import default_timer as timer import daal4py as d4p import numpy as np import pandas as pd import common NUM_LOOPS = 100 PARAMS = {'nIterations': 10, 'method': 'defaultDense', 'fptype': 'double'} gbt = d4p.gbt_regression_training(maxIterations=200) MODEL = gbt.compute(pd.DataFrame(common.X, dtype=np.float32), pd.DataFrame(common.y, dtype=np.float32)).model def run_inference(num_observations: int = 1000): """Run xgboost for specified number of observations""" # Load data test_df = common.get_test_data(num_observations) data = pd.DataFrame(test_df, dtype=np.float32) predictor = d4p.gbt_regression_prediction(**PARAMS) num_rows = len(test_df) run_times = [] inference_times = [] for _ in range(NUM_LOOPS): start_time = timer() predictor.compute(data, MODEL) end_time = timer()