Example #1
def main(client):
    m = 100000
    n = 100
    X, y = make_regression(n_samples=m,
                           n_features=n,
                           chunks=200,
                           random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    dtrain = DaskDMatrix(client, X_train, y_train)
    dtest = DaskDMatrix(client, X_test, y_test)

    output = xgb.dask.train(
        client,
        {
            "verbosity": 1,
            "tree_method": "hist",
            "objective": "reg:squarederror",
            "eval_metric": "rmse",
            "max_depth": 6,
            "learning_rate": 1.0,
        },
        dtrain,
        num_boost_round=1000,
        evals=[(dtrain, "train"), (dtest, "test")],
        callbacks=[
            CustomEarlyStopping(validation_set="test",
                                target_metric="rmse",
                                maximize=False,
                                seed=0)
        ],
    )
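Note: make_regression taking a chunks argument is presumably dask_ml.datasets.make_regression, and CustomEarlyStopping is a user-defined callback that is not shown here. A minimal sketch of such a callback, assuming only the constructor arguments used above (the real implementation may differ):

import xgboost as xgb

class CustomEarlyStopping(xgb.callback.TrainingCallback):
    # Hypothetical reconstruction: stop as soon as the target metric on the
    # chosen validation set stops improving.
    def __init__(self, validation_set, target_metric, maximize, seed):
        self.validation_set = validation_set
        self.target_metric = target_metric
        self.maximize = maximize
        self.seed = seed
        self.best = None

    def after_iteration(self, model, epoch, evals_log):
        score = float(evals_log[self.validation_set][self.target_metric][-1])
        improved = (self.best is None
                    or (score > self.best if self.maximize else score < self.best))
        if improved:
            self.best = score
            return False  # False means: keep training
        return True  # True means: stop training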
Example #2
def using_dask_matrix(client: Client, X, y):
    # DaskDMatrix acts like a normal DMatrix; it works as a proxy for the
    # local DMatrix objects scattered across the workers.
    dtrain = DaskDMatrix(client, X, y)

    # Use the train method from xgboost.dask instead of the one from
    # xgboost.  This distributed version of train returns a dictionary
    # containing the resulting booster and the evaluation history obtained
    # from the evaluation metrics.
    output = xgb.dask.train(
        client,
        {
            'verbosity': 2,
            # The key line enabling GPU training
            'tree_method': 'gpu_hist'
        },
        dtrain,
        num_boost_round=4,
        evals=[(dtrain, 'train')])
    bst = output['booster']
    history = output['history']

    # You can pass the `output` dict (or just the booster) directly into `predict` too.
    prediction = xgb.dask.predict(client, bst, dtrain)
    print('Evaluation history:', history)
    return prediction
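A sketch of a driver for using_dask_matrix; since the parameters request gpu_hist, it assumes dask_cuda is installed and a GPU is available (shapes and chunk sizes are arbitrary):

import dask.array as da
from dask.distributed import Client
from dask_cuda import LocalCUDACluster

if __name__ == '__main__':
    with LocalCUDACluster(n_workers=1) as cluster:
        with Client(cluster) as client:
            X = da.random.random(size=(100000, 20), chunks=1000)
            y = da.random.random(size=(100000,), chunks=1000)
            print(using_dask_matrix(client, X, y))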
Example #3
def test_from_dask_dataframe():
    with LocalCluster(n_workers=5) as cluster:
        with Client(cluster) as client:
            X, y = generate_array()

            X = dd.from_dask_array(X)
            y = dd.from_dask_array(y)

            dtrain = DaskDMatrix(client, X, y)
            booster = xgb.dask.train(client, {}, dtrain,
                                     num_boost_round=2)['booster']

            prediction = xgb.dask.predict(client, model=booster, data=dtrain)

            assert prediction.ndim == 1
            assert isinstance(prediction, da.Array)
            assert prediction.shape[0] == kRows

            with pytest.raises(ValueError):
                # evals_result is not supported in the dask interface.
                xgb.dask.train(client, {},
                               dtrain,
                               num_boost_round=2,
                               evals_result={})
            # force prediction to be computed
            prediction = prediction.compute()
Example #4
def main(client):
    # generate some random data for demonstration
    m = 100000
    n = 100
    X = da.random.random(size=(m, n), chunks=100)
    y = da.random.random(size=(m, ), chunks=100)

    # DaskDMatrix acts like a normal DMatrix; it works as a proxy for the
    # local DMatrix objects scattered across the workers.
    dtrain = DaskDMatrix(client, X, y)

    # Use the train method from xgboost.dask instead of the one from
    # xgboost.  This distributed version of train returns a dictionary
    # containing the resulting booster and the evaluation history obtained
    # from the evaluation metrics.
    output = xgb.dask.train(client, {
        'verbosity': 1,
        'nthread': 1,
        'tree_method': 'hist'
    },
                            dtrain,
                            num_boost_round=4,
                            evals=[(dtrain, 'train')])
    bst = output['booster']
    history = output['history']

    # You can pass the `output` dict (or just the booster) directly into `predict` too.
    prediction = xgb.dask.predict(client, bst, dtrain)
    print('Evaluation history:', history)
    return prediction
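main expects a live Dask client; the upstream CPU training demo this appears to be taken from drives it roughly as follows (worker counts are illustrative):

from dask.distributed import Client, LocalCluster

if __name__ == '__main__':
    with LocalCluster(n_workers=4, threads_per_worker=1) as cluster:
        with Client(cluster) as client:
            main(client)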
Example #5
def main(client):
    n = 100
    m = 100000
    partition_size = 1000
    X = da.random.random(size=(m, n), chunks=partition_size)
    y = da.random.random(size=(m,), chunks=partition_size)

    # DaskDMatrix acts like a normal DMatrix; it works as a proxy for the
    # local DMatrix objects scattered across the workers.
    dtrain = DaskDMatrix(client, X, y)

    # Use the train method from xgboost.dask instead of the one from
    # xgboost.  This distributed version of train returns a dictionary
    # containing the resulting booster and the evaluation history obtained
    # from the evaluation metrics.
    output = xgb.dask.train(client, {
        'verbosity': 2,
        'nthread': 1,
        'tree_method': 'gpu_hist'
    },
                            dtrain,
                            num_boost_round=4,
                            evals=[(dtrain, 'train')])
    bst = output['booster']
    history = output['history']

    prediction = xgb.dask.predict(client, bst, dtrain)
    print('Evaluation history:', history)
    return prediction
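Because this variant requests gpu_hist, the cluster must place one worker per GPU; a sketch of a matching entry point using dask_cuda (assuming at least one CUDA device):

from dask.distributed import Client
from dask_cuda import LocalCUDACluster

if __name__ == '__main__':
    # LocalCUDACluster starts one worker per visible GPU by default.
    with LocalCUDACluster() as cluster:
        with Client(cluster) as client:
            main(client)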
Example #6
def test_predict():
    with LocalCluster(n_workers=kWorkers) as cluster:
        with Client(cluster) as client:
            X, y = generate_array()
            dtrain = DaskDMatrix(client, X, y)
            booster = xgb.dask.train(client, {}, dtrain,
                                     num_boost_round=2)['booster']

            pred = xgb.dask.predict(client, model=booster, data=dtrain)
            assert pred.ndim == 1
            assert pred.shape[0] == kRows

            margin = xgb.dask.predict(client,
                                      model=booster,
                                      data=dtrain,
                                      output_margin=True)
            assert margin.ndim == 1
            assert margin.shape[0] == kRows

            shap = xgb.dask.predict(client,
                                    model=booster,
                                    data=dtrain,
                                    pred_contribs=True)
            assert shap.ndim == 2
            assert shap.shape[0] == kRows
            assert shap.shape[1] == kCols + 1
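These tests depend on module-level helpers and constants (generate_array, kRows, kCols, kWorkers) that are not shown; a plausible minimal stand-in, consistent with how they are used above:

import dask.array as da

kRows = 1000
kCols = 10
kWorkers = 5

def generate_array():
    partition_size = 20
    X = da.random.random(size=(kRows, kCols), chunks=(partition_size, kCols))
    y = da.random.random(size=(kRows,), chunks=partition_size)
    return X, y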
Example #7
def test_from_dask_dataframe() -> None:
    with LocalCluster(n_workers=kWorkers) as cluster:
        with Client(cluster) as client:
            X, y, _ = generate_array()

            X = dd.from_dask_array(X)
            y = dd.from_dask_array(y)

            dtrain = DaskDMatrix(client, X, y)
            booster = xgb.dask.train(client, {}, dtrain, num_boost_round=2)['booster']

            prediction = xgb.dask.predict(client, model=booster, data=dtrain)

            assert prediction.ndim == 1
            assert isinstance(prediction, da.Array)
            assert prediction.shape[0] == kRows

            with pytest.raises(TypeError):
                # evals_result is not supported in the dask interface.
                xgb.dask.train(  # type:ignore
                    client, {}, dtrain, num_boost_round=2, evals_result={})
            # force prediction to be computed
            from_dmatrix = prediction.compute()

            prediction = xgb.dask.predict(client, model=booster, data=X)
            from_df = prediction.compute()

            assert isinstance(prediction, dd.Series)
            assert np.all(prediction.compute().values == from_dmatrix)
            assert np.all(from_dmatrix == from_df.to_numpy())

            series_predictions = xgb.dask.inplace_predict(client, booster, X)
            assert isinstance(series_predictions, dd.Series)
            np.testing.assert_allclose(series_predictions.compute().values,
                                       from_dmatrix)
Example #8
def test_from_dask_array():
    with LocalCluster(n_workers=kWorkers, threads_per_worker=5) as cluster:
        with Client(cluster) as client:
            X, y = generate_array()
            dtrain = DaskDMatrix(client, X, y)
            # result is {'booster': Booster, 'history': {...}}
            result = xgb.dask.train(client, {}, dtrain)

            prediction = xgb.dask.predict(client, result, dtrain)
            assert prediction.shape[0] == kRows

            assert isinstance(prediction, da.Array)
            # force prediction to be computed
            prediction = prediction.compute()

            booster = result['booster']
            single_node_predt = booster.predict(xgb.DMatrix(X.compute()))
            np.testing.assert_allclose(prediction, single_node_predt)

            config = json.loads(booster.save_config())
            assert int(config['learner']['generic_param']['nthread']) == 5

            from_arr = xgb.dask.predict(client, model=booster, data=X)

            assert isinstance(from_arr, da.Array)
            assert np.all(single_node_predt == from_arr.compute())
Example #9
def test_from_dask_array(client):
    X, y = generate_array()
    dtrain = DaskDMatrix(client, X, y)
    # result is {'booster': Booster, 'history': {...}}
    result = xgb.dask.train(client, {}, dtrain)

    prediction = xgb.dask.predict(client, result, dtrain)

    assert isinstance(prediction, da.Array)
Example #10
def load_higgs_for_dask(client, X_t, X_v, y_t, y_v):
    '''
    :param client: the Dask client (typically backed by GPU workers)
    :param X_t: training set
    :param X_v: validation set
    :param y_t: training-set labels
    :param y_v: validation-set labels
    :return: DaskDMatrix objects built from dask.dataframe data
    '''
    import dask.dataframe as dd
    # 1. Create Dask DataFrames from the pandas DataFrames.
    ddf_higgs_train = dd.from_pandas(X_t, npartitions=8)
    ddf_higgs_test = dd.from_pandas(X_v, npartitions=8)
    ddf_y_train = dd.from_pandas(y_t, npartitions=8)
    ddf_y_test = dd.from_pandas(y_v, npartitions=8)
    # 2. Create Dask DMatrix Object using dask dataframes
    ddtrain = DaskDMatrix(client, ddf_higgs_train, ddf_y_train)
    ddtest = DaskDMatrix(client, ddf_higgs_test, ddf_y_test)

    return ddtrain, ddtest
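A usage sketch for load_higgs_for_dask with synthetic pandas data (the shapes, parameters, and single-GPU cluster are assumptions; the real HIGGS data has 28 features):

import numpy as np
import pandas as pd
import xgboost as xgb
from dask.distributed import Client
from dask_cuda import LocalCUDACluster

rng = np.random.default_rng(0)
X_t = pd.DataFrame(rng.random((800, 28)))
y_t = pd.Series(rng.integers(0, 2, 800))
X_v = pd.DataFrame(rng.random((200, 28)))
y_v = pd.Series(rng.integers(0, 2, 200))

with LocalCUDACluster() as cluster, Client(cluster) as client:
    ddtrain, ddtest = load_higgs_for_dask(client, X_t, X_v, y_t, y_v)
    output = xgb.dask.train(client,
                            {'objective': 'binary:logistic',
                             'tree_method': 'gpu_hist'},
                            ddtrain,
                            num_boost_round=10,
                            evals=[(ddtest, 'valid')])
    print(output['history'])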
Example #11
def test_from_dask_array(client):
    X, y = generate_array()
    dtrain = DaskDMatrix(client, X, y)
    # result is {'booster': Booster, 'history': {...}}
    result = xgb.dask.train(client, {}, dtrain)

    prediction = xgb.dask.predict(client, result, dtrain)
    assert prediction.shape[0] == kRows

    assert isinstance(prediction, da.Array)

    prediction = prediction.compute()  # force prediction to be computed
Example #12
def main(client):
    # Load an example survival data from CSV into a Dask data frame.
    # The Veterans' Administration Lung Cancer Trial
    # The Statistical Analysis of Failure Time Data by Kalbfleisch J. and Prentice R (1980)
    CURRENT_DIR = os.path.dirname(__file__)
    df = dd.read_csv(
        os.path.join(CURRENT_DIR, os.pardir, 'data',
                     'veterans_lung_cancer.csv'))

    # DaskDMatrix acts like a normal DMatrix; it works as a proxy for the
    # local DMatrix objects scattered across the workers.
    # For AFT survival, you need to extract the lower and upper bounds of
    # the label and pass them as arguments to DaskDMatrix.
    y_lower_bound = df['Survival_label_lower_bound']
    y_upper_bound = df['Survival_label_upper_bound']
    X = df.drop(['Survival_label_lower_bound', 'Survival_label_upper_bound'],
                axis=1)
    dtrain = DaskDMatrix(client,
                         X,
                         label_lower_bound=y_lower_bound,
                         label_upper_bound=y_upper_bound)

    # Use the train method from xgboost.dask instead of the one from
    # xgboost.  This distributed version of train returns a dictionary
    # containing the resulting booster and the evaluation history obtained
    # from the evaluation metrics.
    params = {
        'verbosity': 1,
        'objective': 'survival:aft',
        'eval_metric': 'aft-nloglik',
        'learning_rate': 0.05,
        'aft_loss_distribution_scale': 1.20,
        'aft_loss_distribution': 'normal',
        'max_depth': 6,
        'lambda': 0.01,
        'alpha': 0.02
    }
    output = xgb.dask.train(client,
                            params,
                            dtrain,
                            num_boost_round=100,
                            evals=[(dtrain, 'train')])
    bst = output['booster']
    history = output['history']

    # You can pass the `output` dict (or just the booster) directly into `predict` too.
    prediction = xgb.dask.predict(client, bst, dtrain)
    print('Evaluation history: ', history)

    # Uncomment the following line to save the model to the disk
    # bst.save_model('survival_model.json')

    return prediction
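If the commented-out save is enabled, the model can later be reloaded on a single node with the standard Booster API:

import xgboost as xgb

bst = xgb.Booster()
bst.load_model('survival_model.json')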
Example #13
def train(seed, epochs, n_gpus, dataset):
    with LocalCUDACluster(n_workers=n_gpus, threads_per_worker=4) as cluster:
        with Client(cluster) as client:
            # Fetch dataset using sklearn
            if dataset == 'boston':
                dataset = load_boston()
                param = {}
            elif dataset == 'covertype':
                dataset = fetch_covtype()
                param = {
                    'objective': 'multi:softmax',
                    'num_class': 8
                    # 'single_precision_histogram': True
                }

            param['verbosity'] = 2
            param['tree_method'] = 'gpu_hist'

            # Rechunking is required for the covertype dataset
            X = da.from_array(dataset.data, chunks=1000)
            y = da.from_array(dataset.target, chunks=1000)

            # Create 0.75/0.25 train/test split
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.25, train_size=0.75, random_state=0)

            dtrain = DaskDMatrix(client, X_train, y_train)
            dtest = DaskDMatrix(client, X_test, y_test)

            random_seed(seed, param)

            gpu_runtime = time.time()
            model_training_results = xgb.dask.train(client,
                                                    param,
                                                    dtrain,
                                                    num_boost_round=epochs,
                                                    evals=[(dtest, 'test')])

            print(model_training_results)
            print(f'GPU Run Time: {time.time() - gpu_runtime} seconds')
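random_seed is a helper from the surrounding benchmark script and is not shown; judging from the call site it presumably just fixes xgboost's RNG, roughly:

def random_seed(seed, param):
    # Hypothetical reconstruction: store the seed in the parameter dict.
    param['seed'] = seed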
Example #14
def main(client, train_dir, model_file, fs, do_wait=False):

    colnames = ['label'] + ['feature-%02d' % i for i in range(1, 29)]
    df = dd.read_csv(train_dir, header=None, names=colnames)
    X = df[df.columns.difference(['label'])]
    y = df['label']
    print("[INFO]: ------ CSV files are read")

    if do_wait is True:
        df = df.persist()
        X = X.persist()
        wait(df)
        wait(X)
        print("[INFO]: ------ Long waited but the data is ready now")

    start_time = time.time()
    dtrain = DaskDMatrix(client, X, y)
    print("[INFO]: ------ QuantileDMatrix is formed in {} seconds ---".format(
        (time.time() - start_time)))

    del df
    del X
    del y

    start_time = time.time()
    output = xgb.dask.train(client, {
        'verbosity': 2,
        'learning_rate': 0.1,
        'max_depth': 8,
        'objective': 'reg:squarederror',
        'subsample': 0.5,
        'gamma': 0.9,
        'tree_method': 'hist',
    },
                            dtrain,
                            num_boost_round=100,
                            evals=[(dtrain, 'train')],
                            # verbose_eval is an argument of train(), not a
                            # booster parameter, so it is passed here.
                            verbose_eval=True)
    print("[INFO]: ------ Training is completed in {} seconds ---".format(
        (time.time() - start_time)))

    history = output['history']
    print('[INFO]: ------ Training evaluation history:', history)

    output['booster'].save_model('/tmp/tmp.model')
    fs.put('/tmp/tmp.model', model_file)
    print("[INFO]: ------ Model saved here:{}".format(model_file))
Example #15
def test_from_dask_dataframe(client):
    X, y = generate_array()

    X = dd.from_dask_array(X)
    y = dd.from_dask_array(y)

    dtrain = DaskDMatrix(client, X, y)
    booster = xgb.dask.train(client, {}, dtrain, num_boost_round=2)['booster']

    prediction = xgb.dask.predict(client, model=booster, data=dtrain)

    assert isinstance(prediction, da.Array)
    assert prediction.shape[0] == kRows, prediction

    with pytest.raises(ValueError):
        # evals_result is not supported in the dask interface.
        xgb.dask.train(client, {}, dtrain, num_boost_round=2, evals_result={})
Example #16
    def test_global_config(self, client: "Client") -> None:
        X, y, _ = generate_array()
        xgb.config.set_config(verbosity=0)
        dtrain = DaskDMatrix(client, X, y)
        before_fname = './before_training-test_global_config'
        after_fname = './after_training-test_global_config'

        class TestCallback(xgb.callback.TrainingCallback):
            def write_file(self, fname: str) -> None:
                with open(fname, 'w') as fd:
                    fd.write(str(xgb.config.get_config()['verbosity']))

            def before_training(self, model: xgb.Booster) -> xgb.Booster:
                self.write_file(before_fname)
                assert xgb.config.get_config()['verbosity'] == 0
                return model

            def after_training(self, model: xgb.Booster) -> xgb.Booster:
                assert xgb.config.get_config()['verbosity'] == 0
                return model

            def before_iteration(self, model: xgb.Booster, epoch: int,
                                 evals_log: Dict) -> bool:
                assert xgb.config.get_config()['verbosity'] == 0
                return False

            def after_iteration(self, model: xgb.Booster, epoch: int,
                                evals_log: Dict) -> bool:
                self.write_file(after_fname)
                assert xgb.config.get_config()['verbosity'] == 0
                return False

        xgb.dask.train(client, {},
                       dtrain,
                       num_boost_round=4,
                       callbacks=[TestCallback()])['booster']

        with open(before_fname, 'r') as before, open(after_fname,
                                                     'r') as after:
            assert before.read() == '0'
            assert after.read() == '0'

        os.remove(before_fname)
        os.remove(after_fname)
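The test pins verbosity process-wide with xgb.config.set_config; xgboost also provides a scoped alternative, config_context, which restores the previous setting on exit:

import xgboost as xgb

with xgb.config_context(verbosity=0):
    assert xgb.config.get_config()['verbosity'] == 0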
Example #17
def test_predict_with_meta(client):
    X, y, w = generate_array(with_weights=True)
    partition_size = 20
    margin = da.random.random(size=kRows, chunks=partition_size) + 1e4

    dtrain = DaskDMatrix(client, X, y, weight=w, base_margin=margin)
    booster = xgb.dask.train(
        client, {}, dtrain, num_boost_round=4)['booster']

    prediction = xgb.dask.predict(client, model=booster, data=dtrain)
    assert prediction.ndim == 1
    assert prediction.shape[0] == kRows

    prediction = client.compute(prediction).result()
    assert np.all(prediction > 1e3)

    m = xgb.DMatrix(X.compute())
    m.set_info(label=y.compute(), weight=w.compute(), base_margin=margin.compute())
    single = booster.predict(m)  # Make sure the ordering is correct.
    assert np.all(prediction == single)
Example #18
def main(client):
    n = 100
    m = 100000
    partition_size = 1000
    X = da.random.random(size=(m, n), chunks=partition_size)
    y = da.random.random(size=(m,), chunks=partition_size)

    dtrain = DaskDMatrix(client, X, y)

    output = xgb.dask.train(client,
                            {'verbosity': 2,
                             'nthread': 1,
                             'tree_method': 'hist'},
                            dtrain,
                            num_boost_round=4, evals=[(dtrain, 'train')])
    bst = output['booster']
    history = output['history']

    prediction = xgb.dask.predict(client, bst, dtrain)
    print('Evaluation history:', history)
    return prediction
Example #19
def test_xgboost_covtype_multi_gpu():
    import time

    import numpy as np
    import xgboost as xgb
    from dask import array as da
    from dask.distributed import Client
    from dask_cuda import LocalCUDACluster
    from sklearn.model_selection import train_test_split
    from xgboost.dask import DaskDMatrix

    # Fetch dataset using sklearn
    cov = fetch_data()
    X = cov.data
    y = cov.target

    print(X.shape, y.shape)

    # Create 0.75/0.25 train/test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, train_size=0.75,
                                                        random_state=42)

    # Specify sufficient boosting iterations to reach a minimum
    num_round = 10

    # Leave most parameters as default
    param = {'objective': 'multi:softmax',  # Specify multiclass classification
             'num_class': 8,  # Number of possible output classes
             'tree_method': 'gpu_hist',  # Use GPU accelerated algorithm
             }

    from h2o4gpu.util.gpu import device_count
    n_gpus, devices = device_count(-1)

    with LocalCUDACluster(n_workers=n_gpus, threads_per_worker=1) as cluster:
        with Client(cluster) as client:
            # Convert input data from numpy to XGBoost format
            partition_size = 100000

            # remove when https://github.com/dmlc/xgboost/issues/4987 is fixed
            dask_X_train = da.from_array(X_train, partition_size)
            dask_X_train = dask_X_train.persist()
            client.rebalance(dask_X_train)
            dask_label_train = da.from_array(y_train, partition_size)
            dask_label_train = dask_label_train.persist()
            client.rebalance(dask_label_train)

            dtrain = DaskDMatrix(
                client=client, data=dask_X_train, label=dask_label_train)

            dask_X_test = da.from_array(X_test, partition_size)
            dask_X_test = dask_X_test.persist()
            client.rebalance(dask_X_test)
            dask_label_test = da.from_array(y_test, partition_size)
            dask_label_test = dask_label_test.persist()
            client.rebalance(dask_label_test)

            dtest = DaskDMatrix(
                client=client, data=dask_X_test, label=dask_label_test)

            tmp = time.time()
            # Train model
            xgb.dask.train(client, param, dtrain, num_boost_round=num_round, evals=[
                (dtest, 'test')])
            print("GPU Training Time: %s seconds" % (str(time.time() - tmp)))

            # TODO: https://github.com/dmlc/xgboost/issues/4518
            dtrain = xgb.DMatrix(X_train, label=y_train, nthread=-1)
            dtest = xgb.DMatrix(X_test, label=y_test, nthread=-1)
            # Repeat for CPU algorithm
            tmp = time.time()
            param['tree_method'] = 'hist'
            cpu_res = {}
            xgb.train(param, dtrain, num_round, evals=[
                (dtest, 'test')], evals_result=cpu_res)
            print("CPU Training Time: %s seconds" % (str(time.time() - tmp)))
Example #20
def train_xgboost_gpu(X,
                      y,
                      data_chunksize=None,
                      n_gpus=None,
                      n_threads_per_gpu=1,
                      params=None,
                      xgboost_model=None,
                      gpu_cluster=None,
                      client=None):
    '''
    Trains an XGBoost model on the GPU.

    :param X: a 2D matrix object, either a numpy ndarray or a pandas DataFrame;
    :param y: a 1D array of one of the following types: numpy ndarray, pandas Series or pandas DataFrame;

    :param data_chunksize: number of rows per partition of the input data (both X and y) when splitting
        it among multiple GPU devices. The default value None splits evenly among devices;
    :param n_gpus: number of GPUs to be used. The default value None selects all available devices;
    :param n_threads_per_gpu: number of threads per GPU;
    :param params: xgboost training params as a python dict, refer to
        https://xgboost.readthedocs.io/en/latest/parameter.html
    :param xgboost_model: xgbooster object to continue training from; it may be either a regular XGBoost
        model or a dask xgboost dict;
    :param gpu_cluster: an existing dask cluster object to use. This param should be used if you call this
        method many times in quick succession. Note that this function doesn't close an externally
        created cluster;
    :param client: an existing dask client object to use. This param should be used if you call this
        method many times in quick succession. Note that this function doesn't close an externally
        created client.

    :return:
    A dictionary containing 2 keys:
        * 'booster': maps to an XGBoost model
        * 'history': maps to another dict with the history of the training process, as in the
            following example: {'train': {'logloss': ['0.48253', '0.35953']}, 'eval': {'logloss': ['0.480385', '0.357756']}}
    '''

    if gpu_cluster is None:
        local_gpus = LocalCUDACluster(n_workers=n_gpus,
                                      threads_per_worker=n_threads_per_gpu)
    else:
        local_gpus = gpu_cluster
    if client is None:
        local_dask_client = Client(local_gpus)
    else:
        local_dask_client = client

    if data_chunksize is None:
        data_chunksize = X.shape[0] // len(local_gpus.cuda_visible_devices)
    if params is None:
        params = {
            'learning_rate': 0.3,
            'max_depth': 8,
            'objective': 'reg:squarederror',
            'verbosity': 0,
            'tree_method': 'gpu_hist'
        }

    if isinstance(X, pd.DataFrame):
        X = from_pandas(X, chunksize=data_chunksize)
    else:
        X = from_array(X, chunksize=data_chunksize)
    if isinstance(y, pd.DataFrame):
        y = from_pandas(y, chunksize=data_chunksize)
    else:
        y = from_array(y, chunksize=data_chunksize)
    dtrain = DaskDMatrix(local_dask_client, X, y)

    if isinstance(xgboost_model, dict):
        xgboost_model = xgboost_model['booster']

    xgb_model = dask_xgboost_train(local_dask_client,
                                   params,
                                   dtrain,
                                   num_boost_round=100,
                                   evals=[(dtrain, 'train')],
                                   xgb_model=xgboost_model)

    if client is None:
        local_dask_client.close()
    if gpu_cluster is None:
        local_gpus.close()

    return xgb_model
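A usage sketch for train_xgboost_gpu with synthetic numpy data (the shapes and single-GPU setting are assumptions):

import numpy as np

X = np.random.random((10000, 16))
y = np.random.random(10000)
result = train_xgboost_gpu(X, y, n_gpus=1)
print(result['history'])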