# Imports used by the snippets below.
import time

import dask_cudf
import cupy as cp
import xgboost as xgb
from xgboost import dask as dxgb
from dask.distributed import Client, wait
from dask_cuda import LocalCUDACluster


def using_quantile_device_dmatrix(client: Client, train_dir, model_file, fs, do_wait=False):
    '''`DaskDeviceQuantileDMatrix` is a data type specialized for the `gpu_hist`
    tree method that reduces memory overhead.  When training on a GPU pipeline,
    it is preferred over `DaskDMatrix`.

    .. versionadded:: 1.2.0

    '''
    colnames = ['label'] + ['feature-%02d' % i for i in range(1, 29)]
    df = dask_cudf.read_csv(train_dir, header=None, names=colnames, chunksize=None)
    X = df[df.columns.difference(['label'])]
    y = df['label']
    print("[INFO]: ------ CSV files are read from " + train_dir)

    if do_wait is True:
        df = df.persist()
        X = X.persist()
        wait(df)
        wait(X)
        print("[INFO]: ------ Long wait, but the data is ready now")

    # `DaskDeviceQuantileDMatrix` is used instead of `DaskDMatrix`; be careful
    # that it can not be used for anything other than training.
    start_time = time.time()
    dtrain = dxgb.DaskDeviceQuantileDMatrix(client, X, y)
    print("[INFO]: ------ QuantileDMatrix is formed in {} seconds ---".format(time.time() - start_time))

    del df
    del X
    del y

    start_time = time.time()
    output = xgb.dask.train(client,
                            {
                                'verbosity': 2,
                                'learning_rate': 0.1,
                                'max_depth': 8,
                                'objective': 'reg:squarederror',
                                'subsample': 0.5,
                                'gamma': 0.9,
                                'verbose_eval': True,
                                'tree_method': 'gpu_hist',
                                # 'nthread': 1,
                            },
                            dtrain,
                            num_boost_round=100,
                            evals=[(dtrain, 'train')])
    print("[INFO]: ------ Training is completed in {} seconds ---".format(time.time() - start_time))

    history = output['history']
    print('[INFO]: ------ Training evaluation history:', history)

    output['booster'].save_model('/tmp/tmp.model')
    fs.put('/tmp/tmp.model', model_file)
    print("[INFO]: ------ Model saved here: {}".format(model_file))
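

# A minimal, hypothetical driver sketch for the CSV example above. The local CUDA
# cluster, the `fsspec` local filesystem, and the placeholder paths are assumptions
# and not part of the original example.
def run_csv_example():
    import fsspec

    with LocalCUDACluster() as cluster:
        with Client(cluster) as client:
            fs = fsspec.filesystem('file')
            using_quantile_device_dmatrix(
                client,
                train_dir='/path/to/train/*.csv',
                model_file='/path/to/output/xgboost.model',
                fs=fs,
                do_wait=True)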
def test_empty_partition(self, local_cuda_cluster: LocalCUDACluster) -> None:
    import dask_cudf
    import cudf
    import cupy

    with Client(local_cuda_cluster) as client:
        mult = 100
        df = cudf.DataFrame({
            "a": [1, 2, 3, 4, 5.1] * mult,
            "b": [10, 15, 29.3, 30, 31] * mult,
            "y": [10, 20, 30, 40., 50] * mult,
        })
        parameters = {"tree_method": "gpu_hist", "debug_synchronize": True}

        empty = df.iloc[:0]
        ddf = dask_cudf.concat(
            [dask_cudf.from_cudf(empty, npartitions=1)]
            + [dask_cudf.from_cudf(df, npartitions=3)]
            + [dask_cudf.from_cudf(df, npartitions=3)])
        X = ddf[ddf.columns.difference(["y"])]
        y = ddf[["y"]]
        dtrain = dxgb.DaskDeviceQuantileDMatrix(client, X, y)
        bst_empty = xgb.dask.train(client, parameters, dtrain,
                                   evals=[(dtrain, "train")])
        predt_empty = dxgb.predict(client, bst_empty, X).compute().values

        ddf = dask_cudf.concat(
            [dask_cudf.from_cudf(df, npartitions=3)]
            + [dask_cudf.from_cudf(df, npartitions=3)])
        X = ddf[ddf.columns.difference(["y"])]
        y = ddf[["y"]]
        dtrain = dxgb.DaskDeviceQuantileDMatrix(client, X, y)
        bst = xgb.dask.train(client, parameters, dtrain,
                             evals=[(dtrain, "train")])
        predt = dxgb.predict(client, bst, X).compute().values

        cupy.testing.assert_allclose(predt, predt_empty)
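

# A minimal sketch of the `local_cuda_cluster` fixture the test above expects.
# In the real test suite the fixture is provided elsewhere (e.g. a conftest), so
# this exact definition is an assumption.
import pytest


@pytest.fixture
def local_cuda_cluster():
    with LocalCUDACluster() as cluster:
        yield cluster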
def using_quantile_device_dmatrix(client: Client, X, y):
    '''`DaskDeviceQuantileDMatrix` is a data type specialized for the `gpu_hist`
    tree method that reduces memory overhead.  When training on a GPU pipeline,
    it is preferred over `DaskDMatrix`.

    .. versionadded:: 1.2.0

    '''
    # Input must be on GPU for `DaskDeviceQuantileDMatrix`.
    X = X.map_blocks(cp.array)
    y = y.map_blocks(cp.array)

    # `DaskDeviceQuantileDMatrix` is used instead of `DaskDMatrix`; be careful
    # that it can not be used for anything other than training.
    dtrain = dxgb.DaskDeviceQuantileDMatrix(client, X, y)
    output = xgb.dask.train(client,
                            {'verbosity': 2, 'tree_method': 'gpu_hist'},
                            dtrain,
                            num_boost_round=4)
    prediction = xgb.dask.predict(client, output, X)
    return prediction
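

# A minimal, hypothetical driver sketch for the Dask array variant above. The
# local CUDA cluster and the random input data are assumptions and not part of
# the original example.
if __name__ == '__main__':
    from dask import array as da

    with LocalCUDACluster() as cluster:
        with Client(cluster) as client:
            m, n = 100_000, 20
            # Generate CPU-backed dask arrays; the function moves them to GPU
            # via `map_blocks(cp.array)` before constructing the DMatrix.
            X = da.random.random(size=(m, n), chunks=10_000)
            y = da.random.random(size=(m,), chunks=10_000)
            print(using_quantile_device_dmatrix(client, X, y).compute())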