Example #1
    gpu_dfs = [
        dask.delayed(xgb.DMatrix)(gpu_df[1], gpu_df[0]) for gpu_df in gpu_dfs
    ]
    gpu_dfs = [gpu_df.persist() for gpu_df in gpu_dfs]
    gc.collect()
    wait(gpu_dfs)

    end = time.time()
    print("****Data Convertion done. Time used: ", end - start)

    # #### Train the Gradient Boosted Decision Tree with a single call to
    # ```python
    # dask_xgboost.train(client, params, data, labels, num_boost_round=dxgb_gpu_params['nround'])
    # ```

    # In[ ]:

    start = time.time()
    print("starting training----")

    # %%time
    labels = None
    bst = dxgb_gpu.train(client,
                         dxgb_gpu_params,
                         gpu_dfs,
                         labels,
                         num_boost_round=dxgb_gpu_params['nround'])

    end = time.time()
    print("****Training done. Time used: ", end - start)
Example #2
    def process(self, inputs):
        import gc  # python standard lib garbage collector
        import xgboost as xgb
        from dask.delayed import delayed
        from dask.distributed import (wait, get_worker)
        import dask_xgboost as dxgb_gpu

        logmgr = MortgagePluginsLoggerMgr()
        logger = logmgr.get_logger()

        filter_dask_logger = self.conf.get('filter_dask_logger')

        client = self.conf['client']

        client.run(init_workers_logger)

        dxgb_gpu_params = self.conf['dxgb_gpu_params']
        delete_dataframes = self.conf.get('delete_dataframes')
        create_dmatrix_serially = self.conf.get('create_dmatrix_serially')

        mortgage_feat_df_delinq_df_pandas_futures = inputs[0]

        def make_xgb_dmatrix(mortgage_feat_df_delinq_df_pandas_tuple,
                             delete_dataframes=None):
            worker = get_worker()

            logname = 'make_xgb_dmatrix'
            logmgr = MortgagePluginsLoggerMgr(worker, logname)
            logger = logmgr.get_logger()

            logger.info('CREATING DMATRIX ON WORKER {}'.format(worker.name))
            (mortgage_feat_df, delinq_df) = \
                mortgage_feat_df_delinq_df_pandas_tuple
            dmat = xgb.DMatrix(mortgage_feat_df, delinq_df)

            if delete_dataframes:
                del mortgage_feat_df
                del delinq_df
                # del mortgage_feat_df_delinq_df_pandas_tuple
                gc.collect()

            logmgr.cleanup()

            return dmat

        dmatrix_delayed_list = []
        nworkers = len(mortgage_feat_df_delinq_df_pandas_futures)

        if create_dmatrix_serially:
            logger.info(
                'CREATING DMATRIX SERIALLY ACROSS {} WORKERS'.format(nworkers))
        else:
            logger.info(
                'CREATING DMATRIX IN PARALLEL ACROSS {} WORKERS'.format(
                    nworkers))

        for ifut in mortgage_feat_df_delinq_df_pandas_futures:
            dmat_delayed = delayed(make_xgb_dmatrix)(ifut, delete_dataframes)
            dmat_delayed_persist = dmat_delayed.persist()

            if create_dmatrix_serially:
                # TODO: For multinode efficiency need to poll the futures
                #     such that only doing serial dmatrix creation on the
                #     same node, but across nodes should be in parallel.
                wait(dmat_delayed_persist)

            dmatrix_delayed_list.append(dmat_delayed_persist)

        wait(dmatrix_delayed_list)

        if filter_dask_logger:
            wlogs = client.get_worker_logs()
            print_distributed_dask_hijacked_logs(wlogs, logger,
                                                 ('make_xgb_dmatrix', ))

        client.run(restore_workers_logger)

        logger.info('JUST AFTER DMATRIX')
        print_ram_usage()

        logger.info('RUNNING XGBOOST TRAINING USING DASK-XGBOOST')
        labels = None
        bst = dxgb_gpu.train(client,
                             dxgb_gpu_params,
                             dmatrix_delayed_list,
                             labels,
                             num_boost_round=dxgb_gpu_params['nround'])

        logmgr.cleanup()

        return bst
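
# Hedged sketch (not from the original): the self.conf keys this process()
# method reads, inferred solely from the accesses above; values are
# illustrative.
#
# conf = {
#     'client': client,                    # dask.distributed.Client
#     'dxgb_gpu_params': {'nround': 100},  # XGBoost params; 'nround' is required
#     'filter_dask_logger': False,         # optional
#     'delete_dataframes': True,           # optional: free inputs after DMatrix build
#     'create_dmatrix_serially': False,    # optional: build one DMatrix at a time
# }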
Example #3
# In[5]:
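
# Hedged setup sketch (assumed, not shown in the original): the cells below
# expect a running dask client and dask collections X and y, e.g. via dask_ml:
#
# from dask.distributed import Client
# from dask_ml.datasets import make_regression
# from dask_ml.model_selection import train_test_split
# client = Client()
# X, y = make_regression(n_samples=100_000, chunks=10_000)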

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

# In[6]:

params = {
    'objective': 'reg:squarederror',
    # 'n_estimators' is a scikit-learn-style key; the low-level train() call
    # below ignores it, and num_boost_round controls the number of trees
    'n_estimators': 100000,
    'max_depth': 4,
    'eta': 0.01,
    'subsample': 0.5,
    'min_child_weight': 0.5
}

bst = dask_xgboost.train(client, params, X_train, y_train, num_boost_round=100)

# In[7]:

y_hat = dask_xgboost.predict(client, bst, X_test).persist()
y_hat

# In[8]:

# compute each dask object once instead of re-computing per metric
y_test_local = y_test.compute()
y_hat_local = y_hat.compute()
r2 = r2_score(y_test_local, y_hat_local)
mae = mean_absolute_error(y_test_local, y_hat_local)
mse = mean_squared_error(y_test_local, y_hat_local)
print("R^2:", r2)
print("MAE:", mae)
print("MSE:", mse)
Example #4
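# Hedged setup note (assumed, not shown in the original): this snippet expects
# dask dataframes d_train and d_test already concatenated into dx_all, the
# column lists vars_cat and vars_num, a running dask client, and imports such
# as `import dask.array as da`, `from sklearn import preprocessing, metrics`,
# and `import dask_xgboost as dxgb`.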
for col in vars_cat:
    dx_all[col] = preprocessing.LabelEncoder().fit_transform(dx_all[col])

X_all = dx_all[vars_cat + vars_num].to_dask_array(lengths=True)
y_all = da.where((dx_all["dep_delayed_15min"] == "Y").to_dask_array(lengths=True), 1, 0)

X_train = X_all[0:d_train.shape[0], :]
y_train = y_all[0:d_train.shape[0]]
X_test = X_all[d_train.shape[0]:(d_train.shape[0] + d_test.shape[0]), :]
y_test = y_all[d_train.shape[0]:(d_train.shape[0] + d_test.shape[0])]

X_train = X_train.persist()  # persist() returns a new collection; assign it
y_train = y_train.persist()

client.has_what()  # inspect which workers hold the persisted chunks


param = {'objective': 'binary:logistic', 'tree_method': 'hist', 'max_depth': 10, 'eta': 0.1}
%time md = dxgb.train(client, param, X_train, y_train, num_boost_round = 100)


y_pred = dxgb.predict(client, md, X_test)
y_pred_loc = y_pred.compute()
y_test_loc = y_test.compute()
print(metrics.roc_auc_score(y_test_loc, y_pred_loc))


## m5.4xlarge 16c (8+8HT)
## Wall time: 34.3 s
## 0.7928378346764724
Example #5
def main():
    print("Setting up data directory")
    print("-------------------------")

    #flights(args.url)
    columns = ['Year', 'Month', 'DayOfWeek', 'Distance', 'DepDelay', 'Origin']
    data_dir = 'data'
    target = 'DepDelay'
    log = ''
    results = {}

    df = get_df(columns).dropna()
    is_dask = True

    client = None
    if is_dask:
        client = Client(n_workers=20,
                        threads_per_worker=20,
                        memory_limit='1GB')

    model = GradientBoostingRegressor(random_state=18)
    params = {'max_depth': [2, 3], 'n_estimators': [1, 2, 3]}
    X_train, X_test, y_train, y_test = get_data(df.copy(),
                                                target,
                                                is_dask=False,  # in-memory data for the scikit-learn run
                                                chunksize=200)
    clf_name = type(model).__name__

    clf_cv = GridSearchCV(model,
                          param_grid=params,
                          # KFold (sklearn.model_selection) rather than
                          # StratifiedKFold: the target DepDelay is continuous,
                          # so stratified splitting would raise an error here
                          cv=KFold(n_splits=10,
                                   shuffle=True,
                                   random_state=18),
                          scoring='neg_mean_squared_error')

    with joblib.parallel_backend("dask" if is_dask else 'loky'):
        start = time.time()
        clf_cv.fit(X_train, y_train)
        end = time.time()

    y_predict_train = clf_cv.best_estimator_.predict(X_train)
    y_predict_test = clf_cv.best_estimator_.predict(X_test)

    train_error = mean_squared_error(y_train, y_predict_train)
    test_error = mean_squared_error(y_test, y_predict_test)
    best_params = clf_cv.best_params_

    results['Scikit GradientBoosting'] = {
        'train_error': train_error,
        'test_error': test_error,
        'time': end - start
    }
    log += 'Scikit GradientBoosting train_error: %.2f, test_error: %.2f, took: %.2f\n' % (
        train_error, test_error, end - start)

    is_dask = True
    X_train, X_test, y_train, y_test = get_data(df.copy(),
                                                target,
                                                is_dask=is_dask,
                                                chunksize=200)
    params = {
        'objective': 'reg:squarederror',
        'max_depth': 3,
        'eta': 0.01,
        'subsample': 0.5,
        'min_child_weight': 0.2
    }

    start = time.time()
    bst = dask_xgboost.train(client,
                             params,
                             X_train,
                             y_train,
                             num_boost_round=10)
    end = time.time()

    y_train_pred = dask_xgboost.predict(client, bst, X_train).persist()
    y_test_pred = dask_xgboost.predict(client, bst, X_test).persist()

    y_train, y_train_pred = dask.compute(y_train, y_train_pred)
    y_test, y_test_pred = dask.compute(y_test, y_test_pred)

    train_error = mean_squared_error(y_train, y_train_pred)
    test_error = mean_squared_error(y_test, y_test_pred)

    log += 'Dask XGBoost train_error: %.2f, test_error: %.2f, took: %.2f' % (
        train_error, test_error, end - start)
    results['Dask XGBoost'] = {
        'train_error': train_error,
        'test_error': test_error,
        'time': end - start
    }

    with open('results.txt', 'w') as outfile:
        json.dump(results, outfile)

    print('Finished!')
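
# Standard entry point (not part of the original excerpt) so the script runs
# main() when executed directly:
if __name__ == '__main__':
    main()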
Example #6
def task(df, ram_to_use, is_dask):
    client = None
    if is_dask:
        client = Client(threads_per_worker=10,
                        n_workers=10,
                        memory_limit=''.join([str(ram_to_use), 'GB']))

    # when running with dask, only Ridge is benchmarked in this loop
    # (slice [:1]); otherwise both Ridge and GradientBoostingRegressor run
    models = [
        Ridge(random_state=42),
        GradientBoostingRegressor(random_state=42),
    ][:1 if is_dask else 2]

    params = [
        {
            "alpha": [0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0],
        },
        {
            'max_depth': [2, 3, 4, 6],
            'n_estimators': [2, 3, 4, 5],
        },
    ][:1 if is_dask else 2]

    X_train, X_test, y_train, y_test = get_dask_data(
        df.copy(), 'DepDelay') if is_dask else get_normal_data(
            df.copy(), 'DepDelay')

    for model, param in zip(models, params):
        t_start = time.time()
        results, _, _ = run_single_model(model,
                                         param,
                                         X_train,
                                         X_test,
                                         y_train,
                                         y_test,
                                         is_dask=is_dask)
        model_name = type(model).__name__
        train_error, test_error = results[model_name]['metric'][
            'mean_squared_error']
        t_end = time.time()
        time_took = round(t_end - t_start, 3)

        dict_saver = {}
        dict_saver.update(
            {'model_name': model_name + ('_dask' if is_dask else '')})
        dict_saver.update({'train_error(MSE)': train_error})
        dict_saver.update({'test_error(MSE)': test_error})
        dict_saver.update({'time': time_took})
        save_to_file(file_to_save_path, dict_saver)

        print(model_name, ':\t took ->', time_took,
              '\t with error (train, test)', (train_error, test_error))

    if is_dask:
        params = {
            'objective': 'reg:squarederror',
            'max_depth': 4,
            'eta': 0.01,
            'subsample': 0.5,
            'min_child_weight': 0.5
        }

        t_start = time.time()
        bst = dask_xgboost.train(client,
                                 params,
                                 X_train,
                                 y_train,
                                 num_boost_round=10)
        t_end = time.time()
        time_took = round(t_end - t_start, 3)

        y_train_hat = dask_xgboost.predict(client, bst, X_train).persist()
        y_test_hat = dask_xgboost.predict(client, bst, X_test).persist()

        y_train, y_train_hat = dask.compute(y_train, y_train_hat)
        y_test, y_test_hat = dask.compute(y_test, y_test_hat)

        train_error = mean_squared_error(y_train, y_train_hat)
        test_error = mean_squared_error(y_test, y_test_hat)

        dict_saver = {}
        dict_saver.update({'model_name': 'Dask XGBoost_dask'})
        dict_saver.update({'train_error(MSE)': train_error})
        dict_saver.update({'test_error(MSE)': test_error})
        dict_saver.update({'time': time_took})
        save_to_file(file_to_save_path, dict_saver)

        print('Dask XGBoost', ':\t took ->', time_took,
              '\t with error (train, test)', (train_error, test_error))
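
# Hedged usage sketch (not in the original): how task() might be invoked.
# `df` is assumed to be a pandas DataFrame of the flights data and ram_to_use
# the per-worker memory budget in GB.
#
# task(df, ram_to_use=1, is_dask=False)  # scikit-learn / loky path
# task(df, ram_to_use=1, is_dask=True)   # dask path (Ridge + dask_xgboost)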