Beispiel #1
0
def using_quantile_device_dmatrix(client: Client, train_dir, model_file, fs, do_wait=False):
    '''Train a `gpu_hist` regressor from CSV files and upload the saved model.

    `DaskDeviceQuantileDMatrix` is a data type specialized for `gpu_hist`, tree
    method that reduces memory overhead.  When training on GPU pipeline, it's
    preferred over `DaskDMatrix`.

    .. versionadded:: 1.2.0

    Parameters
    ----------
    client
        Dask client handed to the DMatrix constructor and to training.
    train_dir
        Location passed to ``dask_cudf.read_csv``.  Files are read without a
        header row; the first column is named ``label`` and the following 28
        columns ``feature-01`` .. ``feature-28``.
    model_file
        Destination passed as the second argument of ``fs.put``.
    fs
        Object exposing ``put(local_path, remote_path)``.
        NOTE(review): presumably an fsspec-style filesystem -- confirm
        against the caller.
    do_wait
        When truthy, persist the frame and the feature columns and block
        until they are ready before the DMatrix is built.
    '''
    # Function-scope imports keep this block self-contained.
    import os
    import tempfile

    colnames = ['label'] + ['feature-%02d' % i for i in range(1, 29)]
    df = dask_cudf.read_csv(train_dir, header=None, names=colnames, chunksize=None)
    X = df[df.columns.difference(['label'])]
    y = df['label']
    # `format` (rather than `+`) also works when `train_dir` is not a `str`,
    # and the message now has a space before the path.
    print("[INFO]: ------ CSV files are read from {}".format(train_dir))

    if do_wait:
        df = df.persist()
        X = X.persist()
        wait(df)
        wait(X)
        print("[INFO]: ------ Long waited but the data is ready now")

    # `DaskDeviceQuantileDMatrix` is used instead of `DaskDMatrix`, be careful
    # that it can not be used for anything else than training.
    start_time = time.time()
    dtrain = dxgb.DaskDeviceQuantileDMatrix(client, X, y)
    print("[INFO]: ------ QuantileDMatrix is formed in {} seconds ---".format((time.time() - start_time)))

    # Drop the local references to the input collections; only `dtrain` is
    # needed from here on.
    del df
    del X
    del y

    # Booster parameters only.  `verbose_eval` is an option of the training
    # call, not of the booster, so it is passed as a keyword below.
    params = {
        'verbosity': 2,
        'learning_rate': 0.1,
        'max_depth': 8,
        'objective': 'reg:squarederror',
        'subsample': 0.5,
        'gamma': 0.9,
        'tree_method': 'gpu_hist',
    }

    start_time = time.time()
    output = xgb.dask.train(client,
                            params,
                            dtrain,
                            num_boost_round=100,
                            evals=[(dtrain, 'train')],
                            verbose_eval=True)
    print("[INFO]: ------ Training is completed in {} seconds ---".format((time.time() - start_time)))

    history = output['history']
    print('[INFO]: ------ Training evaluation history:', history)

    # Stage the model in a private temporary directory so concurrent runs do
    # not overwrite each other's file; the directory is removed afterwards.
    # The `tmp.model` file name is kept as before.
    with tempfile.TemporaryDirectory() as tmp_dir:
        local_model = os.path.join(tmp_dir, 'tmp.model')
        output['booster'].save_model(local_model)
        fs.put(local_model, model_file)
    print("[INFO]: ------ Model saved here:{}".format( model_file))
    def test_empty_partition(self,
                             local_cuda_cluster: LocalCUDACluster) -> None:
        """Check that an empty leading partition does not change predictions.

        A model is trained once on data whose first partition is empty and
        once on the same data without that partition; the predictions of both
        models on their own training features must be close.
        """
        import dask_cudf
        import cudf
        import cupy

        with Client(local_cuda_cluster) as client:
            mult = 100
            columns = {
                "a": [1, 2, 3, 4, 5.1] * mult,
                "b": [10, 15, 29.3, 30, 31] * mult,
                "y": [10, 20, 30, 40., 50] * mult,
            }
            df = cudf.DataFrame(columns)
            parameters = {"tree_method": "gpu_hist", "debug_synchronize": True}

            def fit_and_predict(pieces):
                # Concatenate the pieces, train on them and predict on the
                # same feature columns.
                frame = dask_cudf.concat(pieces)
                features = frame[frame.columns.difference(["y"])]
                target = frame[["y"]]
                dmatrix = dxgb.DaskDeviceQuantileDMatrix(client, features, target)
                model = xgb.dask.train(client,
                                       parameters,
                                       dmatrix,
                                       evals=[(dmatrix, "train")])
                return dxgb.predict(client, model, features).compute().values

            # First run: a zero-row slice of `df` forms the leading partition.
            empty = df.iloc[:0]
            predt_empty = fit_and_predict([
                dask_cudf.from_cudf(empty, npartitions=1),
                dask_cudf.from_cudf(df, npartitions=3),
                dask_cudf.from_cudf(df, npartitions=3),
            ])

            # Second run: the same data without the empty partition.
            predt = fit_and_predict([
                dask_cudf.from_cudf(df, npartitions=3),
                dask_cudf.from_cudf(df, npartitions=3),
            ])

            cupy.testing.assert_allclose(predt, predt_empty)
def using_quantile_device_dmatrix(client: Client, X, y):
    '''Train with `DaskDeviceQuantileDMatrix` and predict on the training input.

    `DaskDeviceQuantileDMatrix` is the data container specialized for the
    `gpu_hist` tree method; it lowers memory overhead and is the preferred
    choice over `DaskDMatrix` when the training pipeline runs on GPU.

    .. versionadded:: 1.2.0

    Returns the result of `xgb.dask.predict` on the GPU copy of `X`.
    '''
    # `DaskDeviceQuantileDMatrix` needs its input on the GPU, so every block
    # is converted with `cp.array` first.
    gpu_features = X.map_blocks(cp.array)
    gpu_labels = y.map_blocks(cp.array)

    booster_params = {'verbosity': 2, 'tree_method': 'gpu_hist'}

    # This matrix type is meant for training only; prediction below is run
    # on the feature collection itself, not on the matrix.
    quantile_dmatrix = dxgb.DaskDeviceQuantileDMatrix(
        client, gpu_features, gpu_labels)
    training_result = xgb.dask.train(
        client, booster_params, quantile_dmatrix, num_boost_round=4)

    return xgb.dask.predict(client, training_result, gpu_features)