def predict_price(total_amount, trip_distance, passenger_count):
    # Create a dataframe out of the three columns and pass it to
    # dask-xgboost, to predict distributed
    X = dd.concat([total_amount, trip_distance, passenger_count],
                  axis=1).astype("float64")
    return dask_xgboost.predict(client, bst, X)
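# Hedged usage sketch for predict_price: it assumes a module-level Dask
# `client` and a trained booster `bst` already exist. The taxi-style CSV
# path and column names below are illustrative, not from the original.
import dask.dataframe as dd

df = dd.read_csv("nyc-taxi/*.csv")  # hypothetical input path
fare_pred = predict_price(df["total_amount"],
                          df["trip_distance"],
                          df["passenger_count"])
print(fare_pred.head())  # dask_xgboost.predict returns a lazy dask Series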
def test_numpy(c, s, a, b):
    dX = da.from_array(X, chunks=(2, 2))
    dy = da.from_array(y, chunks=(2,))
    dbst = yield dxgb.train(c, param, dX, dy)
    dbst = yield dxgb.train(c, param, dX, dy)  # we can do this twice
    predictions = dxgb.predict(c, dbst, dX)
    assert isinstance(predictions, da.Array)
    predictions = yield c.compute(predictions)
    _test_container(dbst, predictions, np.array)
def test_numpy(c, s, a, b):
    xgb.rabit.init()  # workaround for "Doing rabit call after Finalize"
    dX = da.from_array(X, chunks=(2, 2))
    dy = da.from_array(y, chunks=(2,))
    dbst = yield dxgb.train(c, param, dX, dy)
    dbst = yield dxgb.train(c, param, dX, dy)  # we can do this twice
    predictions = dxgb.predict(c, dbst, dX)
    assert isinstance(predictions, da.Array)
    predictions = yield c.compute(predictions)
    _test_container(dbst, predictions, np.array)
def test_sparse(c, s, a, b):
    dX = da.from_array(X, chunks=(2, 2)).map_blocks(sparse.COO)
    dy = da.from_array(y, chunks=(2,))
    dbst = yield dxgb.train(c, param, dX, dy)
    dbst = yield dxgb.train(c, param, dX, dy)  # we can do this twice
    predictions = dxgb.predict(c, dbst, dX)
    assert isinstance(predictions, da.Array)
    predictions_result = yield c.compute(predictions)
    _test_container(dbst, predictions_result, sparse.COO)
def test_sparse(c, s, a, b):
    xgb.rabit.init()  # workaround for "Doing rabit call after Finalize"
    dX = da.from_array(X, chunks=(2, 2)).map_blocks(scipy.sparse.csr_matrix)
    dy = da.from_array(y, chunks=(2,))
    dbst = yield dxgb.train(c, param, dX, dy)
    dbst = yield dxgb.train(c, param, dX, dy)  # we can do this twice
    predictions = dxgb.predict(c, dbst, dX)
    assert isinstance(predictions, da.Array)
    predictions_result = yield c.compute(predictions)
    _test_container(dbst, predictions_result, scipy.sparse.csr_matrix)
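# A minimal sketch of the scaffolding the tests above assume: dask's
# gen_cluster decorator supplies (c, s, a, b) -- client, scheduler, and
# two workers -- and the test module defines small fixtures such as X,
# y, param, and _test_container. All names below are illustrative.
import numpy as np
import xgboost as xgb
import dask.array as da
import dask_xgboost as dxgb
from distributed.utils_test import gen_cluster

X = np.random.random((10, 2))               # toy features (assumed)
y = (X.sum(axis=1) > 1).astype(np.float64)  # toy binary labels (assumed)
param = {"objective": "binary:logistic"}

def _test_container(bst, predictions, container):
    # illustrative stand-in: check a booster trained and predictions
    # came back with one value per sample
    assert isinstance(bst, xgb.Booster)
    assert len(predictions) == len(y)

@gen_cluster(client=True)
def test_example(c, s, a, b):
    dX = da.from_array(X, chunks=(2, 2))
    dy = da.from_array(y, chunks=(2,))
    dbst = yield dxgb.train(c, param, dX, dy)
    predictions = yield c.compute(dxgb.predict(c, dbst, dX))
    _test_container(dbst, predictions, np.array)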
def test_daskxgboost(startDaskClient):
    client = startDaskClient
    import dask.dataframe as dd
    df = dd.read_csv('...')  # use dask.dataframe to load and
    df_train = False         # preprocess data
    labels_train = False
    import dask_xgboost as dxgb
    params = {'objective': 'binary:logistic'}  # use normal xgboost params
    bst = dxgb.train(client, params, df_train, labels_train)
    predictions = dxgb.predict(client, bst, data_test)
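# The template above leaves df_train/labels_train as placeholders. A
# self-contained sketch of the same flow, using synthetic data from
# dask_ml (an assumption; the original reads a real CSV):
from dask.distributed import Client
from dask_ml.datasets import make_classification
from dask_ml.model_selection import train_test_split
import dask_xgboost as dxgb

client = Client()  # local cluster; or reuse the startDaskClient fixture
X, y = make_classification(n_samples=10_000, chunks=1_000)
X_train, X_test, y_train, y_test = train_test_split(X, y)

params = {'objective': 'binary:logistic'}
bst = dxgb.train(client, params, X_train, y_train)
predictions = dxgb.predict(client, bst, X_test)  # lazy dask array
print(predictions.compute()[:5])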
def daskml_regressor(client, data_train, data_test, labels_train, labels_test):
    print("\n\n***** Dask ml XGBoost *****")
    start = time.time()
    from config import param_grid_xgboost
    bst = dxgb.train(client, param_grid_xgboost, data_train, labels_train)
    pdxgb_train_time = str(time.time() - start)
    predictions = dxgb.predict(client, bst, data_test).persist()
    auc = roc_auc_score(labels_test.compute(), predictions.compute())
    print("ROC AUC:", auc)  # roc_auc_score, not plain accuracy
    print("- Done")
    return [0, pdxgb_train_time, auc]
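# Illustrative contents of config.param_grid_xgboost (assumed; the real
# config module is not shown in this excerpt). Despite the "grid" in the
# name, dxgb.train expects a plain xgboost params dict:
param_grid_xgboost = {
    'objective': 'binary:logistic',
    'max_depth': 6,
    'eta': 0.1,
    'subsample': 0.8,
}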
def main():
    object = ps.preprocess()
    X_train, X_test, y_train, y_test = object.cleaning()
    params = {'objective': 'binary:logistic', 'max_depth': 8, 'eta': 0.01,
              'subsample': 0.5, 'min_child_weight': 1}
    print("Start training dxgb")
    cluster = LocalCluster(n_workers=8, threads_per_worker=1)
    client = Client(cluster)
    start_time = time.time()
    bst = dxgb.train(client, params, X_train, y_train)
    end_time = time.time()
    print("time difference in dXGB is %d seconds" % (end_time - start_time))
    predictions = dxgb.predict(client, bst, X_test)
    # ROC AUC observed on one run: 0.6968888393419537
    print("ROC AUC score is:")
    print(roc_auc_score(y_test.compute(), predictions.compute()))
    client.shutdown()
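# Equivalent setup using context managers, which guarantees the cluster
# is torn down even if training raises. A stylistic sketch, not from the
# original script; params/X_train/y_train as above:
from dask.distributed import Client, LocalCluster

with LocalCluster(n_workers=8, threads_per_worker=1) as cluster, \
        Client(cluster) as client:
    bst = dxgb.train(client, params, X_train, y_train)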
def test_basic(c, s, a, b):
    dtrain = xgb.DMatrix(df, label=labels)
    bst = xgb.train(param, dtrain)

    ddf = dd.from_pandas(df, npartitions=4)
    dlabels = dd.from_pandas(labels, npartitions=4)
    dbst = yield dxgb.train(c, param, ddf, dlabels)
    dbst = yield dxgb.train(c, param, ddf, dlabels)  # we can do this twice

    result = bst.predict(dtrain)
    dresult = dbst.predict(dtrain)

    correct = (result > 0.5) == labels
    dcorrect = (dresult > 0.5) == labels
    assert dcorrect.sum() >= correct.sum()

    predictions = dxgb.predict(c, dbst, ddf)
    assert isinstance(predictions, dd.Series)
    predictions = yield c.compute(predictions)._result()
    assert isinstance(predictions, pd.Series)

    assert ((predictions > 0.5) != labels).sum() < 2
def test_numpy(c, s, a, b):
    dtrain = xgb.DMatrix(X, label=y)
    bst = xgb.train(param, dtrain)

    dX = da.from_array(X, chunks=(2, 2))
    dy = da.from_array(y, chunks=(2,))
    dbst = yield dxgb._train(c, param, dX, dy)
    dbst = yield dxgb._train(c, param, dX, dy)  # we can do this twice

    result = bst.predict(dtrain)
    dresult = dbst.predict(dtrain)

    correct = (result > 0.5) == y
    dcorrect = (dresult > 0.5) == y
    assert dcorrect.sum() >= correct.sum()

    predictions = dxgb.predict(c, dbst, dX)
    assert isinstance(predictions, da.Array)
    predictions = yield c.compute(predictions)._result()
    assert isinstance(predictions, np.ndarray)

    assert ((predictions > 0.5) != y).sum() < 2
# In[6]:

params = {
    'objective': 'reg:squarederror',
    'n_estimators': 100000,
    'max_depth': 4,
    'eta': 0.01,
    'subsample': 0.5,
    'min_child_weight': 0.5
}
bst = dask_xgboost.train(client, params, X_train, y_train, num_boost_round=100)

# In[7]:

y_hat = dask_xgboost.predict(client, bst, X_test).persist()
y_hat

# In[8]:

r = r2_score(y_test.compute(), y_hat.compute())
mae = mean_absolute_error(y_test.compute(), y_hat.compute())
mse = mean_squared_error(y_test.compute(), y_hat.compute())
print("R^2:", r)
print("MAE:", mae)
print("MSE:", mse)

# In[9]:

from dask_ml.datasets import make_classification
for col in vars_cat:
    dx_all[col] = preprocessing.LabelEncoder().fit_transform(dx_all[col])

X_all = dx_all[vars_cat + vars_num].to_dask_array(lengths=True)
y_all = da.where(
    (dx_all["dep_delayed_15min"] == "Y").to_dask_array(lengths=True), 1, 0)

X_train = X_all[0:d_train.shape[0], ]
y_train = y_all[0:d_train.shape[0]]
X_test = X_all[d_train.shape[0]:(d_train.shape[0] + d_test.shape[0]), ]
y_test = y_all[d_train.shape[0]:(d_train.shape[0] + d_test.shape[0])]

X_train = X_train.persist()  # persist() returns a new collection; rebind it
y_train = y_train.persist()  # otherwise the cached data is released
client.has_what()

param = {'objective': 'binary:logistic', 'tree_method': 'hist',
         'max_depth': 10, 'eta': 0.1}
%time md = dxgb.train(client, param, X_train, y_train, num_boost_round = 100)

y_pred = dxgb.predict(client, md, X_test)
y_pred_loc = y_pred.compute()
y_test_loc = y_test.compute()
print(metrics.roc_auc_score(y_test_loc, y_pred_loc))

## m5.4xlarge 16c (8+8HT)
## Wall time: 34.3 s
## 0.7928378346764724
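# A hedged alternative for the encoding loop above: dask_ml ships a
# dask-aware LabelEncoder that avoids pulling each column to the client
# the way the scikit-learn encoder does. Sketch only; dask inputs may
# need known categories, hence the as_known() call.
from dask_ml.preprocessing import LabelEncoder

for col in vars_cat:
    known = dx_all[col].astype('category').cat.as_known()
    dx_all[col] = LabelEncoder().fit_transform(known)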
def main():
    print("Setting up data directory")
    print("-------------------------")
    # flights(args.url)
    columns = ['Year', 'Month', 'DayOfWeek', 'Distance', 'DepDelay', 'Origin']
    data_dir = 'data'
    target = 'DepDelay'
    log = ''
    results = {}
    df = get_df(columns).dropna()

    is_dask = True
    client = None
    if is_dask:
        client = Client(n_workers=20, threads_per_worker=20, memory_limit='1GB')

    model = GradientBoostingRegressor(random_state=18)
    params = {'max_depth': [2, 3], 'n_estimators': [1, 2, 3]}
    X_train, X_test, y_train, y_test = get_data(df.copy(), target,
                                                is_dask=False, chunksize=200)
    results = dict()
    clf_name = type(model).__name__
    clf_cv = GridSearchCV(model, param_grid=params,
                          cv=KFold(n_splits=10, shuffle=True, random_state=18),
                          scoring='neg_mean_squared_error')
    # KFold rather than StratifiedKFold: the target here is continuous,
    # and stratification only supports classification targets.
    with joblib.parallel_backend("dask" if is_dask else 'loky'):
        start = time.time()
        clf_cv.fit(X_train, y_train)
        end = time.time()

    y_predict_train = clf_cv.best_estimator_.predict(X_train)
    y_predict_test = clf_cv.best_estimator_.predict(X_test)
    train_error = mean_squared_error(y_train, y_predict_train)
    test_error = mean_squared_error(y_test, y_predict_test)
    best_params = clf_cv.best_params_
    results['Scikit GradientBoosting'] = {
        'train_error': train_error,
        'test_error': test_error,
        'time': end - start
    }
    log += 'Scikit GradientBoosting train_error: %.2f, test_error: %.2f, took: %.2f\n' % (
        train_error, test_error, end - start)

    is_dask = True
    X_train, X_test, y_train, y_test = get_data(df.copy(), target,
                                                is_dask=is_dask, chunksize=200)
    params = {
        'objective': 'reg:squarederror',
        'max_depth': 3,
        'eta': 0.01,
        'subsample': 0.5,
        'min_child_weight': 0.2
    }
    start = time.time()
    bst = dask_xgboost.train(client, params, X_train, y_train, num_boost_round=10)
    end = time.time()

    y_train_pred = dask_xgboost.predict(client, bst, X_train).persist()
    y_test_pred = dask_xgboost.predict(client, bst, X_test).persist()
    y_train, y_train_pred = dask.compute(y_train, y_train_pred)
    y_test, y_test_pred = dask.compute(y_test, y_test_pred)

    train_error = mean_squared_error(y_train, y_train_pred)
    test_error = mean_squared_error(y_test, y_test_pred)
    log += 'Dask XGBoost train_error: %.2f, test_error: %.2f, took: %.2f' % (
        train_error, test_error, end - start)
    results['Dask XGBoost'] = {
        'train_error': train_error,
        'test_error': test_error,
        'time': end - start
    }

    with open('results.txt', 'w') as outfile:
        json.dump(results, outfile)
    print('Finished!')
def task(df, ram_to_use, is_dask):
    client = None
    if is_dask:
        client = Client(threads_per_worker=10, n_workers=10,
                        memory_limit=''.join([str(ram_to_use), 'GB']))

    models = [
        Ridge(random_state=42),
        GradientBoostingRegressor(random_state=42),
    ][:1 if is_dask else 2]
    params = [
        {
            "alpha": [0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0],
        },
        {
            'max_depth': [2, 3, 4, 6],
            'n_estimators': [2, 3, 4, 5],
        },
    ][:1 if is_dask else 2]

    X_train, X_test, y_train, y_test = get_dask_data(
        df.copy(), 'DepDelay') if is_dask else get_normal_data(
        df.copy(), 'DepDelay')

    for model, param in zip(models, params):
        t_start = time.time()
        results, _, _ = run_single_model(model, param, X_train, X_test,
                                         y_train, y_test, is_dask=is_dask)
        model_name = type(model).__name__
        train_error, test_error = results[model_name]['metric']['mean_squared_error']
        t_end = time.time()
        time_took = round(t_end - t_start, 3)

        dict_saver = {}
        dict_saver.update({'model_name': model_name + ('_dask' if is_dask else '')})
        dict_saver.update({'train_error(MSE)': train_error})
        dict_saver.update({'test_error(MSE)': test_error})
        dict_saver.update({'time': time_took})
        save_to_file(file_to_save_path, dict_saver)
        print(model_name, ':\t took ->', time_took,
              '\t with error (train, test)', (train_error, test_error))

    if is_dask:
        params = {
            'objective': 'reg:squarederror',
            'max_depth': 4,
            'eta': 0.01,
            'subsample': 0.5,
            'min_child_weight': 0.5
        }
        t_start = time.time()
        bst = dask_xgboost.train(client, params, X_train, y_train, num_boost_round=10)
        t_end = time.time()
        time_took = round(t_end - t_start, 3)

        y_train_hat = dask_xgboost.predict(client, bst, X_train).persist()
        y_test_hat = dask_xgboost.predict(client, bst, X_test).persist()
        y_train, y_train_hat = dask.compute(y_train, y_train_hat)
        y_test, y_test_hat = dask.compute(y_test, y_test_hat)
        train_error = mean_squared_error(y_train, y_train_hat)
        test_error = mean_squared_error(y_test, y_test_hat)

        dict_saver = {}
        dict_saver.update({'model_name': 'Dask XGBoost' + '_dask'})
        dict_saver.update({'train_error(MSE)': train_error})
        dict_saver.update({'test_error(MSE)': test_error})
        dict_saver.update({'time': time_took})
        save_to_file(file_to_save_path, dict_saver)
        print('Dask XGBoost', ':\t took ->', time_took,
              '\t with error (train, test)', (train_error, test_error))