def test_non_contig(self):
    """Linear regression must predict identically for non-contiguous
    views and for C-contiguous copies of the same data."""
    from numpy.random import rand

    n_rows, n_feat, n_resp = 10007, 1017, 77

    # Trimming one row and one column off a larger array gives views that
    # are not C-contiguous (asserted below).
    train_view = rand(n_rows + 1, n_feat + 1)[:n_rows, :n_feat]
    pred_view = rand(n_rows + 1, n_feat + 1)[:n_rows, :n_feat]
    resp_view = rand(n_rows + 1, n_resp + 1)[:n_rows, :n_resp]
    views = (train_view, pred_view, resp_view)

    dense_copies = tuple(np.ascontiguousarray(view) for view in views)
    train_dense, pred_dense, resp_dense = dense_copies

    # Sanity-check the fixtures: same values, different memory layout.
    self.assertTrue(not any(view.flags['C_CONTIGUOUS'] for view in views))
    self.assertTrue(all(dense.flags['C_CONTIGUOUS'] for dense in dense_copies))
    self.assertTrue(
        all(np.allclose(dense, view)
            for dense, view in zip(dense_copies, views)))

    def fit_predict(train_x, train_y, new_x):
        # Train a fresh model, then predict with a fresh prediction object.
        fitted = d4p.linear_regression_training().compute(train_x, train_y)
        return d4p.linear_regression_prediction().compute(new_x,
                                                          fitted.model)

    from_dense = fit_predict(train_dense, resp_dense, pred_dense)
    from_views = fit_predict(train_view, resp_view, pred_view)

    self.assertTrue(np.allclose(from_views.prediction,
                                from_dense.prediction))
def _daal4py_fit(self, X, y_):
    """Fit the linear model with daal4py and store the result on ``self``.

    Training is attempted with the 'defaultDense' method first and, if
    that raises ``RuntimeError``, once more with 'qrDense'.

    On success sets ``daal_model_``, ``intercept_`` and ``coef_`` and
    returns ``self``. Returns ``None`` when both attempts raise
    ``RuntimeError``.
    """
    y = make2d(y_)
    X_fptype = getFPType(X)

    def _train(method):
        # Build a training algorithm for the given method and run it.
        algorithm = daal4py.linear_regression_training(
            fptype=X_fptype,
            interceptFlag=bool(self.fit_intercept),
            method=method)
        return algorithm.compute(X, y)

    try:
        training_result = _train('defaultDense')
    except RuntimeError:
        # Normal system is not invertible, try QR
        try:
            training_result = _train('qrDense')
        except RuntimeError:
            return None

    fitted_model = training_result.model
    self.daal_model_ = fitted_model

    # Column 0 of Beta is stored as the intercept, the remaining columns
    # as the coefficients.
    beta = fitted_model.Beta
    self.intercept_ = beta[:, 0].copy(order='C')
    self.coef_ = beta[:, 1:].copy(order='C')

    # A single target that was passed as a 1-D array gets 1-D coefficients
    # and a scalar intercept.
    single_target = self.coef_.shape[0] == 1 and y_.ndim == 1
    if single_target:
        self.coef_ = np.ravel(self.coef_)
        self.intercept_ = self.intercept_[0]

    return self
def verify_on_linear_regression(self, X, Y):
    """Assert that training on ``X``/``Y`` as given yields the same Beta
    as training on contiguous copies (with ``Y`` reshaped to a column)."""

    def train(features, responses):
        # Both runs use the same configuration: intercept on, double.
        trainer = d4p.linear_regression_training(interceptFlag=True,
                                                 fptype='double')
        return trainer.compute(features, responses)

    result_as_given = train(X, Y)

    contiguous_X = np.ascontiguousarray(X)
    column_Y = np.ascontiguousarray(Y).reshape((len(Y), 1))
    result_contiguous = train(contiguous_X, column_Y)

    same_beta = np.allclose(result_as_given.model.Beta,
                            result_contiguous.model.Beta)
    self.assertTrue(same_beta)
def run_inference(num_observations: int = 1000):
    """Time daal4py linear regression training on a data sample.

    Despite the function name, each of the ``NUM_LOOPS`` iterations
    measures constructing a ``linear_regression_training`` object and
    calling ``compute`` on the sampled frames; no prediction is run.

    Args:
        num_observations: sample size passed to
            ``common.get_test_data_df`` for both the feature and the
            target frames.

    Returns:
        Whatever ``common.calculate_stats`` returns for the list of
        per-row timings.
    """
    # Load data
    train_x_df = common.get_test_data_df(X=common.X_df, size=num_observations)
    train_y_df = common.get_test_data_df(X=common.y_df, size=num_observations)
    num_rows = len(train_x_df)

    print("_______________________________________")
    print("Total Number of Rows", num_rows)

    inference_times = []
    for _ in range(NUM_LOOPS):
        start_time = timer()
        d4p.linear_regression_training().compute(train_x_df, train_y_df)
        total_time = timer() - start_time

        # NOTE(review): 10e6 is 1e7, not 1e6. If microseconds per row were
        # intended this is off by a factor of ten; the factor is kept so
        # results stay comparable with earlier runs -- confirm the unit.
        inference_times.append(total_time * (10e6) / num_rows)

    return_elem = common.calculate_stats(inference_times)
    print(num_observations, ", ", return_elem)
    return return_elem
def main(readcsv=read_csv, method='defaultDense'):
    """Train and apply a batch linear regression on the example CSV files.

    Args:
        readcsv: callable invoked as ``readcsv(path, columns)`` that
            returns the selected columns of a CSV file.
        method: computation method forwarded to
            ``d4p.linear_regression_training``. It used to be accepted
            but ignored; the default keeps the previous behaviour.

    Returns:
        Tuple ``(train_result, predict_result, ptdata)`` where ``ptdata``
        holds columns 10-11 of the test file.
    """
    infile = "./data/batch/linear_regression_train.csv"
    testfile = "./data/batch/linear_regression_test.csv"

    # Configure a Linear regression training object
    train_algo = d4p.linear_regression_training(interceptFlag=True,
                                                 method=method)

    # Read data. Let's have 10 independent,
    # and 2 dependent variables (for each observation)
    indep_data = readcsv(infile, range(10))
    dep_data = readcsv(infile, range(10, 12))

    # Now train/compute, the result provides the model for prediction
    train_result = train_algo.compute(indep_data, dep_data)

    # Now let's do some prediction
    predict_algo = d4p.linear_regression_prediction()
    # read test data (with same #features)
    pdata = readcsv(testfile, range(10))
    ptdata = readcsv(testfile, range(10, 12))

    # now predict using the model from the training above
    predict_result = predict_algo.compute(pdata, train_result.model)

    # Sanity check of the example: one prediction row per test row and one
    # column per dependent variable.
    assert predict_result.prediction.shape == (pdata.shape[0],
                                               dep_data.shape[1])

    return (train_result, predict_result, ptdata)
def linearRegression(self, Data_Path, test_data_path, target, n):
    '''
    daal4py Linear Regression SPMD Mode

    Each process trains on its own file ``Data_Path + <rank + 1> + ".csv"``
    using distributed QR training. Prediction on ``test_data_path`` runs
    on rank 0 only, so only rank 0 records the prediction latency and the
    MSE / R2 metrics; every other rank returns after finalizing.

    Args:
        Data_Path: path prefix of the per-process training CSV files.
        test_data_path: path of the test CSV file.
        target: name of the target column in both files.
        n: thread count passed to ``d4p.daalinit``.
    '''
    # Initialize SPMD mode
    d4p.daalinit(nthreads=n)

    # training setup
    train_path = Data_Path + str(d4p.my_procid() + 1) + ".csv"
    data = pd.read_csv(train_path)
    X = data.drop(columns=target)
    y = data[target]
    train_algo = d4p.linear_regression_training(method='qrDense',
                                                 distributed=True)

    self.logger.info('Training the Linear Regression in pydaal SPMD Mode')
    start = time.time()
    train_result = train_algo.compute(X, y)
    self.latency['Parallel_LinearRegression_Pydaal_Time'] = \
        time.time() - start

    # test file setup
    test = pd.read_csv(test_data_path)
    y_test = test[target]
    X_test = test.drop(target, axis=1)

    # Stays None on every rank except 0, which is the only one to predict.
    predict_result = None
    if d4p.my_procid() == 0:
        predict_algo = d4p.linear_regression_prediction()
        # now predict using the model from the training above
        predict_result = predict_algo.compute(X_test, train_result.model)
        self.latency[
            "Overall Parallel Linear Regression Prediction SPMD Time"] = \
            time.time() - start

    # Every rank must leave SPMD mode, not just the predicting one.
    d4p.daalfini()
    self.logger.info('Completed Linear Regression in pydaal SPMD Mode')

    if predict_result is None:
        # No prediction was made here; reading predict_result below used
        # to fail with a NameError on these ranks.
        return

    # Compute metrics
    mse = mean_squared_error(y_test, predict_result.prediction)
    r2score = r2_score(y_test, predict_result.prediction)

    # Store the time taken and model metrics
    self.metrics['MSE_Parallel_LinearRegression_Pydaal'] = mse
    self.metrics['r2score_Parallel_LinearRegression_Pydaal'] = r2score
    return
def compute(train_indep_data, train_dep_data, test_indep_data):
    """Train a linear regression (with intercept) and predict on test data.

    Returns a tuple ``(prediction_result, training_result)``.
    """
    # Train first; the training result carries the model used below.
    training_result = d4p.linear_regression_training(
        interceptFlag=True).compute(train_indep_data, train_dep_data)

    # Predict on the test features with the freshly trained model.
    prediction_result = d4p.linear_regression_prediction().compute(
        test_indep_data, training_result.model)

    return prediction_result, training_result
def _daal4py_fit(self, X, y):
    """Fit the linear model with daal4py and store the result on ``self``.

    Sets ``daal_model_``, ``intercept_`` and ``coef_`` and returns
    ``self``. ``coef_`` is flattened to 1-D (and ``intercept_`` reduced to
    a scalar) only when ``y`` was passed as a 1-D array; a 2-D ``y`` keeps
    2-D coefficients even if it has a single column.
    """
    # Record the caller's target dimensionality before make2d() hides it.
    # Previously any single-target fit was flattened, including a 2-D
    # (n_samples, 1) target.
    y_was_1d = np.ndim(y) == 1

    y = make2d(y)
    X_fptype = getFPType(X)

    lr_algorithm = daal4py.linear_regression_training(
        fptype=X_fptype,
        interceptFlag=bool(self.fit_intercept),
        method='defaultDense')
    lr_res = lr_algorithm.compute(X, y)

    lr_model = lr_res.model
    self.daal_model_ = lr_model

    # Column 0 of Beta is stored as the intercept, the remaining columns
    # as the coefficients.
    coefs = lr_model.Beta
    self.intercept_ = coefs[:, 0].copy(order='C')
    self.coef_ = coefs[:, 1:].copy(order='C')

    if self.coef_.shape[0] == 1 and y_was_1d:
        self.coef_ = np.ravel(self.coef_)
        self.intercept_ = self.intercept_[0]

    return self
def main(readcsv=read_csv, method='defaultDense'):
    """Train a streaming linear regression chunk by chunk, then predict.

    Args:
        readcsv: callable invoked as
            ``readcsv(path, columns, lines_to_skip, max_lines)`` for the
            training chunks and as ``readcsv(path, columns)`` for the
            test file.
        method: computation method forwarded to
            ``d4p.linear_regression_training``. It used to be accepted
            but ignored; the default keeps the previous behaviour.

    Returns:
        Tuple ``(train_result, predict_result, ptdata)`` where ``ptdata``
        holds columns 10-11 of the test file.

    Raises:
        Exception: whatever ``readcsv`` raises if the very first chunk
            cannot be read (e.g. the training file is missing).
    """
    infile = "./data/batch/linear_regression_train.csv"
    testfile = "./data/batch/linear_regression_test.csv"

    # Configure a Linear regression training object for streaming
    train_algo = d4p.linear_regression_training(interceptFlag=True,
                                                 streaming=True,
                                                 method=method)

    chunk_size = 250
    lines_read = 0
    # read and feed chunk by chunk
    while True:
        # Read data in chunks
        # Let's have 10 independent, and 2 dependent variables
        # (for each observation)
        try:
            indep_data = readcsv(infile, range(10), lines_read, chunk_size)
            dep_data = readcsv(infile, range(10, 12), lines_read, chunk_size)
        except Exception:
            # A reader error after at least one chunk is taken as end of
            # data. An error on the first chunk is a real failure and must
            # not be swallowed. KeyboardInterrupt/SystemExit now propagate.
            if lines_read == 0:
                raise
            break
        # Now feed chunk
        train_algo.compute(indep_data, dep_data)
        lines_read += indep_data.shape[0]

    # All chunks are done, now finalize the computation
    train_result = train_algo.finalize()

    # Now let's do some prediction
    predict_algo = d4p.linear_regression_prediction()
    # read test data (with same #features)
    pdata = readcsv(testfile, range(10))
    ptdata = readcsv(testfile, range(10, 12))

    # now predict using the model from the training above
    predict_result = predict_algo.compute(pdata, train_result.model)

    # Sanity check of the example: one prediction row per test row and one
    # column per dependent variable.
    assert predict_result.prediction.shape == (pdata.shape[0],
                                               dep_data.shape[1])

    return (train_result, predict_result, ptdata)
def linearRegression(self, X_train, X_test, y_train, y_test, target):
    '''
    Batch/serial linear regression with daal4py.

    Trains with the 'qrDense' method, predicts on ``X_test`` and records
    the training latency, the overall latency, the MSE and the R2 score
    on ``self.latency`` / ``self.metrics``. ``target`` is not used here.
    '''
    trainer = d4p.linear_regression_training(method='qrDense')
    self.logger.info(
        'Training the Linear Regression in pydaal Batch/Serial Mode')

    started_at = time.time()

    # Training: the result carries the model needed for prediction.
    training_result = trainer.compute(X_train, y_train)
    self.latency["Serial Linear Regression Batch Time"] = \
        time.time() - started_at

    # Prediction; the overall latency covers training plus prediction.
    predictor = d4p.linear_regression_prediction()
    prediction_result = predictor.compute(X_test, training_result.model)
    y_pred = prediction_result.prediction
    self.latency[
        'Overall Serial Linear Regression Prediction Batch Time'] = \
        time.time() - started_at

    self.logger.info(
        'Completed Linear Regression in pydaal Batch/Serial Mode')

    # Compute both metrics before storing either of them.
    mse_value = mean_squared_error(y_test, y_pred)
    r2_value = r2_score(y_test, y_pred)

    self.metrics['MSE_serial_linear_regression_pydaal'] = mse_value
    self.metrics['r2_score_serial_linear_regression_pydaal'] = r2_value
def lr_train(N, D):
    """Train a QR linear regression (with intercept) on random data.

    Draws an ``(N, D)`` feature array and an ``(N, 2)`` target array with
    ``np.random.ranf`` and returns the daal4py training result.
    """
    features = np.random.ranf((N, D))
    targets = np.random.ranf((N, 2))
    trainer = daal4py.linear_regression_training(interceptFlag=True,
                                                 method='qrDense')
    return trainer.compute(features, targets)
y = data.target[np.newaxis].T  # house price

# splitting the data for training and testing, with a 25% test dataset size
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
                                                    random_state=1693)

# ## Training and Saving the Model
# Let's **train our model** and look at the model's features!

# In[4]:

# training the model for prediction
train_result = d4p.linear_regression_training().compute(X_train, y_train)

# To **get training model information** and **save it to a file**:

# In[5]:

# retrieving and printing training model
model = train_result.model
print("Here's our model:\n\n\n", model, "\n")

model_filename = './models/linear_regression_batch.sav'

# saving model to a file; the with-block closes the handle right after the
# dump instead of leaving it open until garbage collection
with open(model_filename, "wb") as model_file:
    pickle.dump(model, model_file)

# Now let's **load up the model** and look at one of the model's features.
def test_fit(X, y):
    """Train a linear regression on ``X``/``y`` and return the result.

    The floating-point type is derived from ``X``; the method and the
    intercept flag come from the module-level ``params``.
    """
    training_options = dict(
        fptype=getFPType(X),
        method=params.method,
        interceptFlag=params.fit_intercept,
    )
    trainer = linear_regression_training(**training_options)
    return trainer.compute(X, y)
import daal4py as d4p
from timeit import default_timer as timer
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd
import common

# Number of timed repetitions per benchmark call.
NUM_LOOPS = 100

# Module-level setup: runs once at import time.
d4p.daalinit()
print("Computing for Linear Regression With Daal")
# MODEL is the training algorithm object; train_result is what its
# compute() call on the full common.X_df / common.y_df frames returns.
MODEL = d4p.linear_regression_training()
train_result = MODEL.compute(common.X_df, common.y_df)


def run_inference(num_observations: int = 1000):
    """Run the timing loop for the specified number of observations.

    NOTE(review): the original docstring said "xgboost", but this module
    trains a daal4py linear regression. The loop body is cut off in this
    excerpt, so what is timed per iteration cannot be confirmed here.
    """
    # Load data
    test_df = common.get_test_data_df(X=common.X_df, size=num_observations)
    num_rows = len(test_df)
    ######################
    print("_______________________________________")
    print("Total Number of Rows", num_rows)
    run_times = []
    inference_times = []
    for _ in range(NUM_LOOPS):
        # Start of one timed repetition (remainder not visible here).
        start_time = timer()
# each process gets its own data
infile = "./data/linear_regression_train_" + str(d4p.my_procid() + 1) + ".csv"

# read data: parse the CSV once and split it, rather than reading the same
# file twice
train_df = pd.read_csv(infile)
indep_data = train_df.drop(["target"], axis=1)  # house characteristics
dep_data = train_df["target"]  # house price

# ## Training and Saving the Model
# Time to **train our model** and look at the model's features!

# In[16]:

# training the model for prediction
train_result = d4p.linear_regression_training(distributed=True).compute(
    indep_data, dep_data)

# To **get training model information** and **save it to a file**:

# In[17]:

# retrieving and printing training model
model = train_result.model
print("Here's our model:\n\n\n", model, "\n")

# one model file per process, numbered from 1
model_filename = './models/daal4py_Distributed_LinearRegression_' + str(
    d4p.my_procid() + 1) + '.sav'

# saving model to a file
joblib.dump(model, model_filename)
for n in range(REP): t1 = timeit.default_timer() r = func(*args, **keyArgs) t2 = timeit.default_timer() times.append(t2-t1) print(min(times)) return r return st_func p = args.size[0] n = args.size[1] X = rand(p,n) Xp = rand(p,n) y = rand(p,n) regr_train = linear_regression_training() regr_predict = linear_regression_prediction() @st_time def test_fit(X,y): regr_train.compute(X, y) @st_time def test_predict(X, m): regr_predict.compute(X, m) print (','.join([args.batchID, args.arch, args.prefix, "Linear.fit", coreString(args.num_threads), "Double", "%sx%s" % (p,n)]), end=',') test_fit(X, y) res = regr_train.compute(X, y) print (','.join([args.batchID, args.arch, args.prefix, "Linear.prediction", coreString(args.num_threads), "Double", "%sx%s" % (p,n)]), end=',') test_predict(Xp, res.model)
# run like this: # mpirun -n 4 python ./linreg_spmd.py import daal4py as d4p from numpy import loadtxt, allclose if __name__ == "__main__": # Initialize SPMD mode d4p.daalinit() # Each process gets its own data infile = "./data/distributed/linear_regression_train_" + str( d4p.my_procid() + 1) + ".csv" # Configure a Linear regression training object train_algo = d4p.linear_regression_training(distributed=True) # Read data. Let's have 10 independent, and 2 dependent variables (for each observation) indep_data = loadtxt(infile, delimiter=',', usecols=range(10)) dep_data = loadtxt(infile, delimiter=',', usecols=range(10, 12)) # Now train/compute, the result provides the model for prediction train_result = train_algo.compute(indep_data, dep_data) # Now let's do some prediction # It run only on a single node if d4p.my_procid() == 0: predict_algo = d4p.linear_regression_prediction() # read test data (with same #features) pdata = loadtxt("./data/distributed/linear_regression_test.csv", delimiter=',', usecols=range(10))