Exemple #1
0
def null_dataframe_masks(
    draw,
    strategy: Optional[SearchStrategy],
    nullable_columns: Dict[str, bool],
):
    """Strategy that masks a random subset of values in a pandas DataFrame.

    :param draw: callable used to draw a value from a strategy; it is called
        once on ``strategy`` and once on the boolean mask strategy.
    :param strategy: hypothesis strategy producing the DataFrame to mask. It
        is drawn from unconditionally.
    :param nullable_columns: dictionary where keys are column names and
        values indicate whether that column is nullable.
    :returns: the drawn DataFrame after ``DataFrame.mask`` has been applied
        with the drawn boolean mask.
    """
    frame = draw(strategy)
    n_rows = frame.shape[0]

    def _mask_column(name, nullable):
        # non-nullable columns can only ever produce False, i.e. never masked
        return pdst.column(
            name=name,
            elements=st.booleans() if nullable else st.just(False),
            dtype=bool,
            fill=st.just(False),
        )

    mask_columns = [
        _mask_column(name, nullable)
        for name, nullable in nullable_columns.items()
    ]
    # the mask has exactly as many rows as the drawn frame
    null_mask = draw(
        pdst.data_frames(
            columns=mask_columns,
            index=pdst.range_indexes(min_size=n_rows, max_size=n_rows),
        )
    )
    # reject draws in which not a single value would be masked
    hypothesis.assume(null_mask.any(axis=None))
    return frame.mask(null_mask)
Exemple #2
0
def series_strategy(
    pandas_dtype: PandasDtype,
    strategy: Optional[SearchStrategy] = None,
    *,
    checks: Optional[Sequence] = None,
    nullable: Optional[bool] = False,
    allow_duplicates: Optional[bool] = True,
    name: Optional[str] = None,
    size: Optional[int] = None,
):
    """Strategy to generate a pandas Series.

    :param pandas_dtype: :class:`pandera.dtypes.PandasDtype` instance.
    :param strategy: an optional hypothesis strategy. If specified, the
        pandas dtype strategy will be chained onto this strategy.
    :param checks: sequence of :class:`~pandera.checks.Check` s to constrain
        the values of the data in the column/index.
    :param nullable: whether or not generated Series contains null values.
    :param allow_duplicates: whether or not generated Series contains
        duplicates.
    :param name: name of the Series.
    :param size: number of elements in the Series. If ``None``, the length is
        unbounded above, and empty Series are filtered out.
    :returns: ``hypothesis`` strategy.
    """
    elements = field_element_strategy(pandas_dtype, strategy, checks=checks)
    strategy = (
        pdst.series(
            elements=elements,
            dtype=pandas_dtype.numpy_dtype,
            index=pdst.range_indexes(
                min_size=0 if size is None else size, max_size=size
            ),
            unique=not allow_duplicates,
        )
        # empty Series are never yielded, even when ``size`` is unspecified
        .filter(lambda x: x.shape[0] > 0)
        .map(lambda x: x.rename(name))
        .map(lambda x: x.astype(pandas_dtype.str_alias))
    )
    if nullable:
        strategy = null_field_masks(strategy)

    def undefined_check_strategy(strategy, check):
        """Strategy for checks with undefined strategies."""
        # The first fragment ends with a space so the two sentences are not
        # glued together ("strategy.Falling") in the emitted warning.
        warnings.warn(
            "Vectorized check doesn't have a defined strategy. "
            "Falling back to filtering drawn values based on the check "
            "definition. This can considerably slow down data-generation."
        )

        def _check_fn(series):
            return check(series).check_passed

        return strategy.filter(_check_fn)

    # Only vectorized (non element-wise) checks lacking a ``strategy``
    # attribute are enforced here, by filtering whole Series.
    for check in checks if checks is not None else []:
        if not hasattr(check, "strategy") and not check.element_wise:
            strategy = undefined_check_strategy(strategy, check)

    return strategy
Exemple #3
0
def two_equal_size_series(draw):
    """Draw two non-empty float64 Series that have the same length.

    The second Series is generated with its index pinned to the length of
    the first one. Drawing both independently and rejecting mismatched
    lengths discards every pair whose lengths differ, which wastes most of
    the generated examples.

    :param draw: callable used to draw a value from a strategy.
    :returns: tuple ``(s1, s2)`` with ``len(s1) == len(s2) >= 1``.
    """
    first = draw(
        series(
            dtype=np.float64,
            elements=float_strategy,
            index=range_indexes(min_size=1),
        )
    )
    length = len(first)
    second = draw(
        series(
            dtype=np.float64,
            elements=float_strategy,
            # same length by construction, so no rejection is needed
            index=range_indexes(min_size=length, max_size=length),
        )
    )
    return first, second
Exemple #4
0
def test_uniqueness_does_not_affect_other_rows_2():
    """Column 'A' may repeat a value even though column 'B' is unique."""
    column_specs = [
        pdst.column('A', dtype=int, unique=False),
        pdst.column('B', dtype=int, unique=True),
    ]
    row_values = st.tuples(st.integers(0, 10), st.integers(0, 10))
    # exactly two rows per frame
    frames = pdst.data_frames(
        column_specs,
        rows=row_values,
        index=pdst.range_indexes(2, 2),
    )

    def first_two_a_values_match(frame):
        return frame['A'][0] == frame['A'][1]

    find_any(frames, first_two_a_values_match)
Exemple #5
0
def categoricaldf_strategy():
    """Return a strategy for DataFrames with 'names' and 'numbers' columns.

    'names' values are sampled from ``names`` and 'numbers' values from
    ``range(3)``; each frame has between 1 and 20 rows.
    """
    name_column = column("names", st.sampled_from(names))
    number_column = column("numbers", st.sampled_from(range(3)))
    row_index = range_indexes(min_size=1, max_size=20)
    return data_frames(columns=[name_column, number_column], index=row_index)
def test_uniqueness_does_not_affect_other_rows_2():
    """Uniqueness of column 'B' must not force column 'A' to be unique."""
    a_column = pdst.column('A', dtype=int, unique=False)
    b_column = pdst.column('B', dtype=int, unique=True)
    two_row_frames = pdst.data_frames(
        [a_column, b_column],
        rows=st.tuples(st.integers(0, 10), st.integers(0, 10)),
        index=pdst.range_indexes(2, 2),
    )
    # an example must exist in which both rows share the same 'A' value
    find_any(two_row_frames, lambda frame: frame['A'][0] == frame['A'][1])
def cmatrix_dataframes():
    """Return a DataFrame strategy with s, alf/bet (X and Y) and R columns.

    Frames have between 2 and ``MAX_NRES`` rows.
    """
    return data_frames(
        columns=[
            s_column(),
            alf_column("X"),
            alf_column("Y"),
            bet_column("X"),
            bet_column("Y"),
            generic_column("R"),
        ],
        index=range_indexes(min_size=2, max_size=MAX_NRES),
    )
Exemple #8
0
def nulldf_strategy():
    """Return a strategy for DataFrames whose columns "2" and "3" are all NaN.

    Column "1" holds arbitrary floats, including NaN and infinities. Each
    frame has between 3 and 20 rows.
    """
    any_float = st.floats(allow_nan=True, allow_infinity=True)
    frame_columns = [
        column("1", any_float),
        column("2", st.sampled_from([np.nan])),
        column("3", st.sampled_from([np.nan])),
    ]
    row_index = range_indexes(min_size=3, max_size=20)
    return data_frames(columns=frame_columns, index=row_index)
def test_uniqueness_does_not_affect_other_rows_2():
    """Boolean column "A" may repeat a value while int column "B" is unique."""
    column_specs = [
        pdst.column("A", dtype=bool, unique=False),
        pdst.column("B", dtype=int, unique=True),
    ]
    row_values = st.tuples(st.booleans(), st.integers(0, 10))
    frames = pdst.data_frames(
        column_specs,
        rows=row_values,
        index=pdst.range_indexes(2, 2),
    )

    def both_rows_share_a(frame):
        return frame["A"][0] == frame["A"][1]

    find_any(frames, both_rows_share_a)
def full_dataframes():
    """Return a DataFrame strategy with s, bet/mu/d (X and Y) and K columns.

    The generic columns are K0L through K3SL. Frames have between 2 and
    ``MAX_NRES`` rows.
    """
    strength_names = (
        "K0L", "K0SL",
        "K1L", "K1SL",
        "K2L", "K2SL",
        "K3L", "K3SL",
    )
    frame_columns = [
        s_column(),
        bet_column("X"),
        bet_column("Y"),
        mu_column("X"),
        mu_column("Y"),
        d_column("X"),
        d_column("Y"),
    ]
    frame_columns.extend(generic_column(name) for name in strength_names)
    return data_frames(
        columns=frame_columns,
        index=range_indexes(min_size=2, max_size=MAX_NRES),
    )
Exemple #11
0
def multiindex_strategy(
    pandera_dtype: Optional[DataType] = None,
    strategy: Optional[SearchStrategy] = None,
    *,
    indexes: Optional[List] = None,
    size: Optional[int] = None,
):
    """Strategy to generate a pandas MultiIndex object.

    :param pandera_dtype: :class:`pandera.dtypes.DataType` instance. It is
        not read by this function.
    :param strategy: must be left unspecified; this is a base strategy and
        cannot be chained onto a parent strategy.
    :param indexes: a list of :class:`~pandera.schema_components.Index`
        objects.
    :param size: number of entries in the MultiIndex. If ``None``, the
        length is unbounded above.
    :returns: ``hypothesis`` strategy.
    :raises BaseStrategyOnlyError: if ``strategy`` is specified.
    """
    # pylint: disable=unnecessary-lambda
    if strategy:
        raise BaseStrategyOnlyError(
            "The dataframe strategy is a base strategy. You cannot specify "
            "the strategy argument to chain it to a parent strategy."
        )
    indexes = [] if indexes is None else indexes
    # Levels are keyed by name, or by position when the level is unnamed.
    index_dtypes = {
        index.name if index.name is not None else i: str(index.dtype)
        for i, index in enumerate(indexes)
    }
    nullable_index = {
        index.name if index.name is not None else i: index.nullable
        for i, index in enumerate(indexes)
    }
    strategy = pdst.data_frames(
        [index.strategy_component() for index in indexes],
        index=pdst.range_indexes(
            min_size=0 if size is None else size, max_size=size
        ),
    ).map(lambda x: x.astype(index_dtypes))

    def _stringify_level(df, level):
        """Return a copy of ``df`` with column ``level`` mapped through str."""
        # Item assignment is used instead of ``df.assign(**{level: ...})``
        # because keyword names must be strings, and unnamed levels are
        # keyed by their integer position.
        converted = df.copy()
        converted[level] = converted[level].map(str)
        return converted

    # this is a hack to convert np.str_ data values into native python str.
    for name, dtype in index_dtypes.items():
        if dtype in {"object", "str"} or dtype.startswith("string"):
            # ``name`` is bound as a default argument: the mapped function
            # only runs when a value is drawn, after this loop has finished.
            # A plain closure would see the last ``name`` for every level.
            strategy = strategy.map(
                lambda df, name=name: _stringify_level(df, name)
            )

    if any(nullable_index.values()):
        strategy = null_dataframe_masks(strategy, nullable_index)
    return strategy.map(pd.MultiIndex.from_frame)
Exemple #12
0
def gen_columns_and_subset(draw, elements=names):
    """Draw a DataFrame together with a non-empty subset of its column names.

    Column positions for the subset are drawn with replacement, so the
    returned set can hold fewer names than the number of positions drawn.

    :param draw: callable used to draw a value from a strategy.
    :param elements: strategy used both for the column names and for the
        cell values of the DataFrame.
    :returns: tuple ``(dataframe, columns_to_keep)``; the DataFrame has at
        least 5 rows.
    """
    column_names = draw(lists(elements, min_size=1, unique=True))
    keep_count = draw(integers(min_value=1, max_value=len(column_names)))
    last_position = len(column_names) - 1

    columns_to_keep = set()
    for _ in range(keep_count):
        position = draw(integers(min_value=0, max_value=last_position))
        columns_to_keep.add(column_names[position])

    # Build the DataFrame from the drawn column names, one column per name.
    frame = draw(
        hpd.data_frames(
            hpd.columns(column_names, elements=elements),
            index=hpd.range_indexes(min_size=5),
        )
    )
    return frame, columns_to_keep
def dataframe(draw):
    """Draw a DataFrame with 1 to 20 uniquely named columns.

    Each column's dtype is sampled from float, int and str, and each column
    name is a text or integer value. The frame has at least one row.

    :param draw: callable used to draw a value from a strategy.
    """
    n_cols = draw(integers(min_value=1, max_value=20))
    # both lists are forced to hold exactly one entry per column
    exact_size = {"min_size": n_cols, "max_size": n_cols}
    dtypes = draw(lists(sampled_from([float, int, str]), **exact_size))
    colnames = draw(lists(text() | integers(), unique=True, **exact_size))

    frame_columns = [
        column(name=colname, dtype=coltype)
        for colname, coltype in zip(colnames, dtypes)
    ]
    return draw(
        data_frames(columns=frame_columns, index=range_indexes(min_size=1))
    )
Exemple #14
0
def multiindex_strategy(
    pandas_dtype: Optional[PandasDtype] = None,
    strategy: Optional[SearchStrategy] = None,
    *,
    indexes: Optional[List] = None,
    size: Optional[int] = None,
):
    """Strategy to generate a pandas MultiIndex object.

    :param pandas_dtype: :class:`pandera.dtypes.PandasDtype` instance. It is
        not read by this function.
    :param strategy: must be left unspecified; this is a base strategy and
        cannot be chained onto a parent strategy.
    :param indexes: a list of :class:`~pandera.schema_components.Index`
        objects.
    :param size: number of entries in the MultiIndex. If ``None``, the
        length is unbounded above.
    :returns: ``hypothesis`` strategy.
    :raises BaseStrategyOnlyError: if ``strategy`` is specified.
    """
    if strategy:
        raise BaseStrategyOnlyError(
            "The dataframe strategy is a base strategy. You cannot specify "
            "the strategy argument to chain it to a parent strategy."
        )
    index_list = indexes if indexes is not None else []

    # Levels are keyed by name, or by position when the level is unnamed.
    index_dtypes = {}
    nullable_index = {}
    for position, index in enumerate(index_list):
        key = position if index.name is None else index.name
        index_dtypes[key] = index.dtype
        nullable_index[key] = index.nullable

    def _cast_levels(frame):
        return frame.astype(index_dtypes)

    row_index = pdst.range_indexes(
        min_size=size if size is not None else 0, max_size=size
    )
    level_columns = [index.strategy_component() for index in index_list]
    strategy = pdst.data_frames(level_columns, index=row_index).map(
        _cast_levels
    )

    # nulls are only injected when at least one level allows them
    if any(nullable_index.values()):
        strategy = null_dataframe_masks(strategy, nullable_index)
    return strategy.map(pd.MultiIndex.from_frame)
Exemple #15
0
def df_strategy():
    """
    Return a hypothesis strategy that generates a five-column dataframe.

    Treat it like a fixture, but do not pass it as a fixture into a test
    function. Instead::

        @given(df=df_strategy())
        def test_function(df):
            # test goes here

    Each generated dataframe has between 1 and 20 rows.
    """
    frame_columns = [
        column("a", elements=st.integers()),
        column("Bell__Chart", elements=st.floats()),
        column("decorated-elephant", elements=st.integers()),
        column("animals@#$%^", elements=st.text()),
        column("cities", st.text()),
    ]
    row_index = range_indexes(min_size=1, max_size=20)
    return data_frames(columns=frame_columns, index=row_index)
def test_arbitrary_range_index(i, j, data):
    """Draw a range index for arbitrary size bounds without raising.

    ``j`` may be ``None``; otherwise the two bounds are put in ascending
    order before being passed to ``range_indexes``.
    """
    if j is not None and j < i:
        i, j = j, i
    data.draw(pdst.range_indexes(i, j))
class TestGPUPredict:
    def test_predict(self):
        """Check that GPU and CPU predictors produce matching margins.

        For several random dataset shapes, a depth-1 ``gpu_hist`` model is
        trained once per predictor, and the raw margins on the train,
        validation and test matrices are compared.
        """
        iterations = 10
        np.random.seed(1)

        def random_dmatrix(n_rows, n_cols):
            # standard-normal features with alternating 0/1 labels
            features = np.random.randn(n_rows, n_cols)
            return xgb.DMatrix(features, label=[0, 1] * (n_rows // 2))

        # This test passes for tree_method=gpu_hist and tree_method=exact. but
        # for `hist` and `approx` the floating point error accumulates faster
        # and fails even tol is set to 1e-4.  For `hist`, the mismatching rate
        # with 5000 rows is 0.04.
        for num_rows in (10, 1000, 5000):
            for num_cols in (10, 50, 500):
                dtrain = random_dmatrix(num_rows, num_cols)
                dval = random_dmatrix(num_rows, num_cols)
                dtest = random_dmatrix(num_rows, num_cols)
                watchlist = [(dtrain, 'train'), (dval, 'validation')]
                res = {}
                param = {
                    "objective": "binary:logistic",
                    "predictor": "gpu_predictor",
                    'eval_metric': 'logloss',
                    'tree_method': 'gpu_hist',
                    'max_depth': 1
                }
                bst = xgb.train(param,
                                dtrain,
                                iterations,
                                evals=watchlist,
                                evals_result=res)
                assert self.non_increasing(res["train"]["logloss"])

                # predictions are taken in the order train, test, validation
                matrices = (dtrain, dtest, dval)
                gpu_margins = [
                    bst.predict(matrix, output_margin=True)
                    for matrix in matrices
                ]

                param["predictor"] = "cpu_predictor"
                bst_cpu = xgb.train(param, dtrain, iterations, evals=watchlist)
                cpu_margins = [
                    bst_cpu.predict(matrix, output_margin=True)
                    for matrix in matrices
                ]

                # compared in the order train, validation, test
                for position in (0, 2, 1):
                    np.testing.assert_allclose(cpu_margins[position],
                                               gpu_margins[position],
                                               rtol=1e-6)

    def non_increasing(self, L):
        """Return True if no element exceeds its predecessor by 0.001 or more."""
        for previous, current in zip(L, L[1:]):
            if not (current - previous) < 0.001:
                return False
        return True

    # Test case for a bug where multiple batch predictions made on a
    # test set produce incorrect results
    @pytest.mark.skipif(**tm.no_sklearn())
    def test_multi_predict(self):
        """Check that repeated GPU predictions on one test set are stable.

        Two models are trained on the same regression data, one per
        predictor. The GPU model predicts the test set twice; both results
        must agree with each other and with the CPU model's prediction.
        """
        from sklearn.datasets import make_regression
        from sklearn.model_selection import train_test_split

        n = 1000
        X, y = make_regression(n, random_state=rng)
        X_train, X_test, y_train, _ = train_test_split(X, y, random_state=123)
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dtest = xgb.DMatrix(X_test)

        params = {"tree_method": "gpu_hist", "predictor": "gpu_predictor"}
        bst_gpu_predict = xgb.train(params, dtrain)

        params["predictor"] = "cpu_predictor"
        bst_cpu_predict = xgb.train(params, dtrain)

        first_gpu_predt = bst_gpu_predict.predict(dtest)
        second_gpu_predt = bst_gpu_predict.predict(dtest)
        cpu_predt = bst_cpu_predict.predict(dtest)

        assert np.allclose(first_gpu_predt, second_gpu_predt)
        assert np.allclose(first_gpu_predt, cpu_predt)

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_sklearn(self):
        """Check that XGBRegressor scores match between CPU and GPU predictors.

        The same random linear-regression data is fitted once per predictor,
        CPU first, and the train and test scores are compared.
        """
        n_samples, n_features = 15000, 14
        tr_size = 2500
        X = np.random.rand(n_samples, n_features)
        y = 200 * np.matmul(X, np.arange(-3, -3 + n_features))
        X_train, y_train = X[:tr_size, :], y[:tr_size]
        X_test, y_test = X[tr_size:, :], y[tr_size:]

        params = {
            'tree_method': 'gpu_hist',
            'predictor': 'cpu_predictor',
            'n_jobs': -1,
            'seed': 123
        }

        scores = {}
        for predictor in ('cpu_predictor', 'gpu_predictor'):
            params['predictor'] = predictor
            model = xgb.XGBRegressor(**params).fit(X_train, y_train)
            scores[predictor] = (model.score(X_train, y_train),
                                 model.score(X_test, y_test))

        cpu_train_score, cpu_test_score = scores['cpu_predictor']
        gpu_train_score, gpu_test_score = scores['gpu_predictor']
        assert np.allclose(cpu_train_score, gpu_train_score)
        assert np.allclose(cpu_test_score, gpu_test_score)

    def run_inplace_base_margin(self, booster, dtrain, X, base_margin):
        """Assert in-place and DMatrix predictions agree under a base margin.

        ``base_margin`` is stored on ``dtrain`` and also passed directly to
        ``inplace_predict``; the two predictions must be close.
        """
        import cupy as cp
        dtrain.set_info(base_margin=base_margin)
        inplace_predt = booster.inplace_predict(data=X,
                                                base_margin=base_margin)
        dmatrix_predt = booster.predict(dtrain)
        cp.testing.assert_allclose(inplace_predt, dmatrix_predt)

    @pytest.mark.skipif(**tm.no_cupy())
    def test_inplace_predict_cupy(self):
        """Check in-place prediction on cupy arrays against DMatrix prediction.

        Covers a dense array with a custom ``missing`` marker, repeated
        threaded prediction, prediction with a base margin, and a wide
        dataset compared between the GPU and CPU predictors.
        """
        import cupy as cp
        cp.cuda.runtime.setDevice(0)
        rows = 1000
        cols = 10
        missing = 11  # set to integer for testing

        # The same seeded generator is installed as cupy's global random
        # state and also used directly further down.
        cp_rng = cp.random.RandomState(1994)
        cp.random.set_random_state(cp_rng)

        X = cp.random.randn(rows, cols)
        # every 4th column is overwritten with the missing marker
        missing_idx = [i for i in range(0, cols, 4)]
        X[:, missing_idx] = missing  # set to be missing
        y = cp.random.randn(rows)

        # NOTE(review): this training matrix is built without ``missing``,
        # unlike the test matrix below - confirm that this is intended.
        dtrain = xgb.DMatrix(X, y)

        booster = xgb.train({'tree_method': 'gpu_hist'},
                            dtrain,
                            num_boost_round=10)

        # First 10 rows: the in-place path and the DMatrix path must agree
        # when both are given the same missing marker.
        test = xgb.DMatrix(X[:10, ...], missing=missing)
        predt_from_array = booster.inplace_predict(X[:10, ...],
                                                   missing=missing)
        predt_from_dmatrix = booster.predict(test)

        cp.testing.assert_allclose(predt_from_array, predt_from_dmatrix)

        def predict_dense(x):
            # True when in-place prediction equals DMatrix prediction exactly
            inplace_predt = booster.inplace_predict(x)
            d = xgb.DMatrix(x)
            copied_predt = cp.array(booster.predict(d))
            return cp.all(copied_predt == inplace_predt)

        # Don't do this on Windows, see issue #5793
        if sys.platform.startswith("win"):
            pytest.skip(
                'Multi-threaded in-place prediction with cuPy is not working on Windows'
            )
        # the threaded check is repeated 10 times
        for i in range(10):
            run_threaded_predict(X, rows, predict_dense)

        base_margin = cp_rng.randn(rows)
        self.run_inplace_base_margin(booster, dtrain, X, base_margin)

        # Create a wide dataset
        X = cp_rng.randn(100, 10000)
        y = cp_rng.randn(100)

        # every 16th column is overwritten with the missing marker
        missing_idx = [i for i in range(0, X.shape[1], 16)]
        X[:, missing_idx] = missing
        reg = xgb.XGBRegressor(tree_method="gpu_hist",
                               n_estimators=8,
                               missing=missing)
        reg.fit(X, y)

        # same fitted model, predicted before and after switching predictor
        gpu_predt = reg.predict(X)
        reg.set_params(predictor="cpu_predictor")
        cpu_predt = reg.predict(X)
        np.testing.assert_allclose(gpu_predt, cpu_predt, atol=1e-6)

    @pytest.mark.skipif(**tm.no_cupy())
    @pytest.mark.skipif(**tm.no_cudf())
    def test_inplace_predict_cudf(self):
        """Check in-place prediction on cudf data against DMatrix prediction.

        Covers a single-shot comparison, repeated threaded prediction on
        both the frame and its ``.values``, and prediction with a base
        margin.
        """
        import cupy as cp
        import cudf
        import pandas as pd
        rows = 1000
        cols = 10
        rng = np.random.RandomState(1994)
        cp.cuda.runtime.setDevice(0)
        # Data is generated with numpy, wrapped in a pandas DataFrame and
        # then converted to cudf.
        X = rng.randn(rows, cols)
        X = pd.DataFrame(X)
        y = rng.randn(rows)
        X = cudf.from_pandas(X)

        dtrain = xgb.DMatrix(X, y)

        booster = xgb.train({'tree_method': 'gpu_hist'},
                            dtrain,
                            num_boost_round=10)
        test = xgb.DMatrix(X)
        predt_from_array = booster.inplace_predict(X)
        predt_from_dmatrix = booster.predict(test)

        cp.testing.assert_allclose(predt_from_array, predt_from_dmatrix)

        def predict_df(x):
            # Asserts that in-place prediction on ``x.values`` equals the
            # DMatrix prediction, then returns whether in-place prediction
            # on ``x`` itself does as well.
            # column major array
            inplace_predt = booster.inplace_predict(x.values)
            d = xgb.DMatrix(x)
            copied_predt = cp.array(booster.predict(d))
            assert cp.all(copied_predt == inplace_predt)

            inplace_predt = booster.inplace_predict(x)
            return cp.all(copied_predt == inplace_predt)

        # the threaded check is repeated 10 times
        for i in range(10):
            run_threaded_predict(X, rows, predict_df)

        base_margin = cudf.Series(rng.randn(rows))
        self.run_inplace_base_margin(booster, dtrain, X, base_margin)

    @given(strategies.integers(1, 10), tm.dataset_strategy,
           shap_parameter_strategy)
    @settings(deadline=None, print_blob=True)
    def test_shap(self, num_rounds, dataset, param):
        """Check that GPU SHAP contributions sum to the predicted margin.

        Datasets whose name ends in "-l1" are skipped. The contributions
        are summed over the last axis and compared with the margin using
        relative and absolute tolerances of 1e-3.
        """
        # not supported by the exact tree method
        if dataset.name.endswith("-l1"):
            return
        param.update({"predictor": "gpu_predictor", "gpu_id": 0})
        param = dataset.set_params(param)
        dmat = dataset.get_dmat()
        bst = xgb.train(param, dmat, num_rounds)
        test_dmat = xgb.DMatrix(dataset.X, dataset.y, dataset.w,
                                dataset.margin)
        shap = bst.predict(test_dmat, pred_contribs=True)
        margin = bst.predict(test_dmat, output_margin=True)
        assume(len(dataset.y) > 0)
        contribution_totals = np.sum(shap, axis=shap.ndim - 1)
        assert np.allclose(contribution_totals, margin, rtol=1e-3, atol=1e-3)

    @given(strategies.integers(1, 10), tm.dataset_strategy,
           shap_parameter_strategy)
    @settings(deadline=None, max_examples=20, print_blob=True)
    def test_shap_interactions(self, num_rounds, dataset, param):
        """Check that GPU SHAP interaction values sum to the predicted margin.

        Datasets whose name ends in "-l1" are skipped. The interaction
        values are summed over the last two axes and compared with the
        margin using relative and absolute tolerances of 1e-3.
        """
        # not supported by the exact tree method
        if dataset.name.endswith("-l1"):
            return
        param.update({"predictor": "gpu_predictor", "gpu_id": 0})
        param = dataset.set_params(param)
        dmat = dataset.get_dmat()
        bst = xgb.train(param, dmat, num_rounds)
        test_dmat = xgb.DMatrix(dataset.X, dataset.y, dataset.w,
                                dataset.margin)
        shap = bst.predict(test_dmat, pred_interactions=True)
        margin = bst.predict(test_dmat, output_margin=True)
        assume(len(dataset.y) > 0)
        last_two_axes = (shap.ndim - 1, shap.ndim - 2)
        interaction_totals = np.sum(shap, axis=last_two_axes)
        assert np.allclose(interaction_totals, margin, rtol=1e-3, atol=1e-3)

    def test_shap_categorical(self):
        """Check SHAP contributions sum to the margin on categorical data.

        The check is run with the GPU predictor first and then with the CPU
        predictor, on the same trained booster.
        """
        X, y = tm.make_categorical(100, 20, 7, False)
        Xy = xgb.DMatrix(X, y, enable_categorical=True)
        booster = xgb.train({"tree_method": "gpu_hist"},
                            Xy,
                            num_boost_round=10)

        for predictor in ("gpu_predictor", "cpu_predictor"):
            booster.set_param({"predictor": predictor})
            shap = booster.predict(Xy, pred_contribs=True)
            margin = booster.predict(Xy, output_margin=True)
            contribution_totals = np.sum(shap, axis=shap.ndim - 1)
            np.testing.assert_allclose(contribution_totals,
                                       margin,
                                       rtol=1e-3)

    def test_predict_leaf_basic(self):
        """Check that GPU and CPU predictors yield the same leaf output."""
        leaves = {
            predictor: run_predict_leaf(predictor)
            for predictor in ('gpu_predictor', 'cpu_predictor')
        }
        np.testing.assert_equal(leaves['gpu_predictor'],
                                leaves['cpu_predictor'])

    def run_predict_leaf_booster(self, param, num_rounds, dataset):
        """Train a booster and assert CPU and GPU leaf predictions match.

        One booster is trained on ``dataset``; leaf indices are predicted
        with the CPU predictor first and then with the GPU predictor.
        """
        param = dataset.set_params(param)
        eval_dmat = dataset.get_dmat()
        booster = xgb.train(param,
                            dtrain=dataset.get_dmat(),
                            num_boost_round=num_rounds)

        leaves = {}
        for predictor in ('cpu_predictor', 'gpu_predictor'):
            booster.set_param({'predictor': predictor})
            leaves[predictor] = booster.predict(eval_dmat, pred_leaf=True)

        np.testing.assert_equal(leaves['cpu_predictor'],
                                leaves['gpu_predictor'])

    @given(predict_parameter_strategy, tm.dataset_strategy)
    @settings(deadline=None, print_blob=True)
    def test_predict_leaf_gbtree(self, param, dataset):
        """Run the CPU/GPU leaf-prediction check for the gbtree booster."""
        param.update({'booster': 'gbtree', 'tree_method': 'gpu_hist'})
        self.run_predict_leaf_booster(param, 10, dataset)

    @given(predict_parameter_strategy, tm.dataset_strategy)
    @settings(deadline=None, print_blob=True)
    def test_predict_leaf_dart(self, param, dataset):
        """Run the CPU/GPU leaf-prediction check for the dart booster."""
        param.update({'booster': 'dart', 'tree_method': 'gpu_hist'})
        self.run_predict_leaf_booster(param, 10, dataset)

    @pytest.mark.skipif(**tm.no_sklearn())
    @pytest.mark.skipif(**tm.no_pandas())
    @given(df=data_frames([
        column('x0', elements=strategies.integers(min_value=0, max_value=3)),
        column('x1', elements=strategies.integers(min_value=0, max_value=5))
    ],
                          index=range_indexes(min_size=20, max_size=50)))
    @settings(deadline=None, print_blob=True)
    def test_predict_categorical_split(self, df):
        """Check GPU prediction RMSE matches the RMSE logged during training.

        Both columns of ``df`` are converted to categorical features, and
        the target is a fixed linear function of the two columns' values.
        """
        from sklearn.metrics import mean_squared_error

        df = df.astype('category')
        x0 = df['x0'].to_numpy()
        x1 = df['x1'].to_numpy()
        y = (x0 * 10 - 20) + (x1 - 2)
        dtrain = xgb.DMatrix(df, label=y, enable_categorical=True)

        params = {
            'tree_method': 'gpu_hist',
            'predictor': 'gpu_predictor',
            'max_depth': 3,
            'learning_rate': 1.0,
            'base_score': 0.0,
            'eval_metric': 'rmse'
        }

        eval_history = {}
        bst = xgb.train(params,
                        dtrain,
                        num_boost_round=5,
                        evals=[(dtrain, 'train')],
                        verbose_eval=False,
                        evals_result=eval_history)

        predicted = bst.predict(dtrain)
        rmse = mean_squared_error(y_true=y, y_pred=predicted, squared=False)
        # the final logged training RMSE is the reference value
        logged_rmse = eval_history['train']['rmse'][-1]
        np.testing.assert_almost_equal(rmse, logged_rmse, decimal=5)

    @pytest.mark.skipif(**tm.no_cupy())
    @pytest.mark.parametrize("n_classes", [2, 3])
    def test_predict_dart(self, n_classes):
        """Check dart predictions agree across prediction paths.

        In-place and DMatrix predictions are compared under the default
        predictor, the CPU predictor and the GPU predictor, for binary and
        multi-class objectives.
        """
        from sklearn.datasets import make_classification
        import cupy as cp
        n_samples = 1000
        X_, y_ = make_classification(n_samples=n_samples,
                                     n_informative=5,
                                     n_classes=n_classes)
        X, y = cp.array(X_), cp.array(y_)

        Xy = xgb.DMatrix(X, y)
        params = {
            "tree_method": "gpu_hist",
            "booster": "dart",
            "rate_drop": 0.5,
        }
        if n_classes == 2:
            params["objective"] = "binary:logistic"
        else:
            params["objective"] = "multi:softprob"
            params["num_class"] = n_classes

        booster = xgb.train(params, Xy, num_boost_round=32)
        # predictor=auto
        auto_inplace = booster.inplace_predict(X)
        auto_copied = booster.predict(Xy)
        cpu_inplace = booster.inplace_predict(X_)
        booster.set_param({"predictor": "cpu_predictor"})
        cpu_copied = booster.predict(Xy)

        auto_copied = cp.array(auto_copied)
        for candidate in (cpu_inplace, cpu_copied, auto_inplace):
            cp.testing.assert_allclose(candidate, auto_copied, atol=1e-6)

        booster.set_param({"predictor": "gpu_predictor"})
        gpu_inplace = booster.inplace_predict(X)
        gpu_copied = cp.array(booster.predict(Xy))
        cp.testing.assert_allclose(gpu_inplace, gpu_copied, atol=1e-6)

    @pytest.mark.skipif(**tm.no_cupy())
    def test_dtypes(self):
        """Check in-place prediction across cupy array dtypes.

        Integer, floating and boolean inputs must give the same prediction
        as the original array; complex inputs must raise ``ValueError``.
        """
        import cupy as cp
        rows = 1000
        cols = 10
        rng = cp.random.RandomState(1994)
        orig = rng.randint(low=0, high=127,
                           size=rows * cols).reshape(rows, cols)
        y = rng.randint(low=0, high=127, size=rows)
        dtrain = xgb.DMatrix(orig, label=y)
        booster = xgb.train({"tree_method": "gpu_hist"}, dtrain)

        def assert_same_prediction(source, dtypes, expected):
            # every conversion of ``source`` must predict like ``expected``
            for dtype in dtypes:
                converted = cp.array(source, dtype=dtype)
                predt = booster.inplace_predict(converted)
                cp.testing.assert_allclose(predt, expected)

        predt_orig = booster.inplace_predict(orig)
        # all primitive types in numpy
        numeric_dtypes = [
            cp.signedinteger,
            cp.byte,
            cp.short,
            cp.intc,
            cp.int_,
            cp.longlong,
            cp.unsignedinteger,
            cp.ubyte,
            cp.ushort,
            cp.uintc,
            cp.uint,
            cp.ulonglong,
            cp.floating,
            cp.half,
            cp.single,
            cp.double,
        ]
        assert_same_prediction(orig, numeric_dtypes, predt_orig)

        # boolean
        orig = cp.random.binomial(1, 0.5, size=rows * cols).reshape(rows, cols)
        predt_orig = booster.inplace_predict(orig)
        assert_same_prediction(orig, [cp.bool8, cp.bool_], predt_orig)

        # unsupported types
        for dtype in [cp.complex64, cp.complex128]:
            X = cp.array(orig, dtype=dtype)
            with pytest.raises(ValueError):
                booster.inplace_predict(X)
Exemple #18
0
    scaler = MinMaxScaler()
    scaler.fit(array)

    assert (scaler.transform(array).min(axis=0) >= 0).all()
    assert (scaler.transform(array).max(axis=0) <= 1).all()
    np.testing.assert_allclose(scaler.fit(array).transform(array), scaler.fit_transform(array))
    np.testing.assert_allclose(array, scaler.inv_transform(scaler.transform(array)))


@given(
    series(
        unique=True,
        elements=st.floats(
            max_value=1e8, min_value=-1e8, allow_nan=False, allow_infinity=False
        ),
        index=range_indexes(min_size=2)
    )
)
def test_minmax_scaler_series(series):
    """Check MinMaxScaler on a Series.

    The scaled values must lie in [0, 1], ``fit`` followed by ``transform``
    must equal ``fit_transform``, and ``inv_transform`` must recover the
    input.
    """
    scaler = MinMaxScaler()
    scaler.fit(series)

    lowest = scaler.transform(series).min()
    assert lowest >= 0
    highest = scaler.transform(series).max()
    assert highest <= 1

    fit_then_transform = scaler.fit(series).transform(series)
    fit_transform = scaler.fit_transform(series)
    np.testing.assert_allclose(fit_then_transform, fit_transform)

    round_trip = scaler.inv_transform(scaler.transform(series))
    np.testing.assert_allclose(series, round_trip, rtol=1e-06)


@given(
    data_frames(
    assert len(ix) <= 2
    assert len(set(ix)) == len(ix)


# Strategy for index sizes: integers from 0 through 2**63 - 1, i.e. sizes
# that fit into an int64 without overflow.
range_sizes = st.integers(0, 2**63 - 1)


@given(range_sizes, range_sizes | st.none(), st.data())
def test_arbitrary_range_index(i, j, data):
    """Draw a range index for arbitrary size bounds without raising.

    ``j`` may be ``None``; otherwise the two bounds are swapped into
    ascending order before being passed to ``range_indexes``.
    """
    if j is not None and j < i:
        i, j = j, i
    data.draw(pdst.range_indexes(i, j))


@given(pdst.range_indexes())
def test_basic_range_indexes(ix):
    """Check that the generated index is a pandas ``RangeIndex``."""
    expected_type = pandas.RangeIndex
    assert isinstance(ix, expected_type)


@settings(suppress_health_check=[HealthCheck.too_slow])
@given(st.data())
def test_generate_arbitrary_indices(data):
    min_size = data.draw(st.integers(0, 10), 'min_size')
    max_size = data.draw(st.none() | st.integers(min_size, min_size + 10),
                         'max_size')
    unique = data.draw(st.booleans(), 'unique')
    dtype = data.draw(npst.scalar_dtypes(), 'dtype')
    assume(supported_by_pandas(dtype))

    # Pandas bug: https://github.com/pandas-dev/pandas/pull/14916 until 0.20;
Exemple #20
0
@fixture()
def empty_dataframe(columns=('int_value', 'float_value', 'bool_value', 'str_value'),
                    dtypes=('int32', 'float32', 'bool', 'object'),
                    index=None):
    """Build a DataFrame that has typed columns but no values.

    :param columns: column names, one per entry of ``dtypes``.
    :param dtypes: dtype for each column, in the same order as ``columns``.
    :param index: optional index passed to the DataFrame constructor.
    :returns: DataFrame with one empty Series of the given dtype per column.
    """
    # Defaults are tuples rather than lists so that no mutable object is
    # shared between calls.
    assert len(columns) == len(dtypes)
    df = pd.DataFrame(index=index)
    for c, d in zip(columns, dtypes):
        df[c] = pd.Series(dtype=d)
    return df


@fixture()
def fixed_dataframe():
    """Return a fixed two-row DataFrame with int, float and string columns."""
    column_values = {
        'int': [0, 1],
        'float': [10., 20.],
        'string': ['aa', 'bb'],
    }
    return pd.DataFrame(column_values)


# Frames with one to five rows and int, float and bool columns.
dataframe = data_frames(
    index=range_indexes(min_size=1, max_size=5),
    columns=[
        column('int_value', dtype=int),
        column('float_val', dtype=float),
        column('bool_value', dtype=bool),
    ],
)

# Same layout as `dataframe` but without the bool column.
dataframe_diff = data_frames(
    index=range_indexes(min_size=1, max_size=5),
    columns=[
        column('int_value', dtype=int),
        column('float_val', dtype=float),
    ],
)

# One text column holding ASCII-letter strings of 10 to 32 characters.
strings_dataframe = data_frames(
    index=range_indexes(min_size=1, max_size=5),
    columns=[
        column(
            'str_val',
            elements=text(alphabet=string.ascii_letters, min_size=10, max_size=32),
        ),
    ],
)


def test_load_dst(KrMC_kdst):
    """Load the first KrMC kdst entry and compare it with its reference frame."""
    kdst = KrMC_kdst[0]
    loaded = load_dst(*kdst.file_info)
    assert_dataframes_close(loaded, kdst.true, False, rtol=1e-5)

import hypothesis.extra.pandas as pdst
from hypothesis import given, assume
from tests.common.debug import find_any
from tests.pandas.helpers import supported_by_pandas


@given(st.data())
def test_can_create_a_series_of_any_dtype(data):
    """A series drawn for a pandas-supported dtype must carry that dtype."""
    drawn = data.draw(npst.scalar_dtypes())
    dtype = np.dtype(drawn)
    # Discard dtypes the helper reports as unsupported by pandas.
    assume(supported_by_pandas(dtype))
    generated = data.draw(pdst.series(dtype=dtype))
    # Compare against the dtype pandas itself assigns to an empty Series.
    expected_dtype = pandas.Series([], dtype=dtype).dtype
    assert generated.dtype == expected_dtype


@given(
    pdst.series(
        dtype=float,
        index=pdst.range_indexes(min_size=2, max_size=5),
    )
)
def test_series_respects_size_bounds(s):
    """The series length must stay within the index size bounds (2 to 5)."""
    length = len(s)
    assert 2 <= length <= 5


def test_can_fill_series():
    """A NaN fill value must be able to appear even when elements exclude NaN."""

    def contains_nan(values):
        return np.isnan(values).any()

    nan_backed = pdst.series(
        elements=st.floats(allow_nan=False),
        fill=st.just(float('nan')),
    )
    find_any(nan_backed, contains_nan)


@given(pdst.series(dtype=int))
def test_can_generate_integral_series(s):
    """A series requested with dtype=int must have numpy's int dtype."""
    expected_dtype = np.dtype(int)
    assert s.dtype == expected_dtype
Exemple #22
0
def model():
    """Load the stored model from disk without compiling it."""
    model_path = 'data/06_models/model.pb'
    return load_model(model_path, compile=False)


@pytest.fixture(scope='session')
def tokenizer():
    """Session-scoped fixture returning the unpickled tokenizer."""
    # NOTE(review): pickle.load executes arbitrary code; this presumably
    # reads a trusted project artifact - confirm.
    with open('data/06_models/tokenizer.pkl', 'rb') as tokenizer_handle:
        loaded_tokenizer = pickle.load(tokenizer_handle)
    return loaded_tokenizer


@given(
    data_frames(
        index=range_indexes(min_size=10, max_size=10),
        columns=[
            column(
                col_pass,
                dtype=str,
                elements=strategies.text(
                    min_size=3,
                    max_size=max_length,
                    alphabet=list('abcdef0123456789 '),
                ),
            ),
        ],
    ),
)
@settings(deadline=None)
def test_predict(model, tokenizer, test):
Exemple #23
0
    assert len(ix) <= 2
    assert len(set(ix)) == len(ix)


# Non-negative sizes small enough to be stored in an int64 without overflow.
range_sizes = st.integers(min_value=0, max_value=2 ** 63 - 1)


@given(range_sizes, range_sizes | st.none(), st.data())
def test_arbitrary_range_index(i, j, data):
    """Draw a range index for arbitrary size bounds; `j` may be None."""
    # When both bounds are given, pass the smaller one first.
    lower, upper = (i, j) if j is None else sorted((i, j))
    data.draw(pdst.range_indexes(lower, upper))


@given(pdst.range_indexes())
def test_basic_range_indexes(ix):
    """A value drawn from range_indexes() must be a pandas RangeIndex."""
    expected_type = pandas.RangeIndex
    assert isinstance(ix, expected_type)


@given(st.data())
def test_generate_arbitrary_indices(data):
    min_size = data.draw(st.integers(0, 10), 'min_size')
    max_size = data.draw(
        st.none() | st.integers(min_size, min_size + 10), 'max_size')
    unique = data.draw(st.booleans(), 'unique')
    dtype = data.draw(npst.scalar_dtypes(), 'dtype')
    assume(supported_by_pandas(dtype))

    # Pandas bug: https://github.com/pandas-dev/pandas/pull/14916 until 0.20;
    # then int64 indexes are inferred from uint64 values.
from tests.common.debug import find_any
from tests.pandas.helpers import supported_by_pandas


@given(st.data())
def test_can_create_a_series_of_any_dtype(data):
    """A series drawn for a pandas-supported dtype must carry that dtype."""
    drawn = data.draw(npst.scalar_dtypes())
    dtype = np.dtype(drawn)
    # Discard dtypes the helper reports as unsupported by pandas.
    assume(supported_by_pandas(dtype))
    # Draw through the raw conjecture data to work around a pandas bug in
    # repr. See https://github.com/pandas-dev/pandas/issues/27484
    generated = data.conjecture_data.draw(pdst.series(dtype=dtype))
    expected_dtype = pandas.Series([], dtype=dtype).dtype
    assert generated.dtype == expected_dtype


@given(
    pdst.series(
        dtype=float,
        index=pdst.range_indexes(min_size=2, max_size=5),
    )
)
def test_series_respects_size_bounds(s):
    """The series length must stay within the index size bounds (2 to 5)."""
    length = len(s)
    assert 2 <= length <= 5


def test_can_fill_series():
    """A NaN fill value must be able to appear even when elements exclude NaN."""

    def contains_nan(values):
        return np.isnan(values).any()

    nan_backed = pdst.series(
        elements=st.floats(allow_nan=False),
        fill=st.just(np.nan),
    )
    find_any(nan_backed, contains_nan)


@given(pdst.series(dtype=int))
def test_can_generate_integral_series(s):
    """A series requested with dtype=int must have numpy's int dtype."""
    expected_dtype = np.dtype(int)
    assert s.dtype == expected_dtype

Exemple #25
0
#   - (some) booleans

# Largest magnitude used for the generated integers and floats.
MAX_VAL = 2**31 - 1

# Strategies
# Text restricted to code points 32..127; empty strings are allowed.
strat_text = st.text(
    alphabet=st.characters(min_codepoint=32, max_codepoint=127),
    min_size=0,
)
strat_ints = st.integers(min_value=-MAX_VAL, max_value=MAX_VAL)
# Finite floats only: NaN and infinities are excluded.
strat_floats = st.floats(
    min_value=-MAX_VAL,
    max_value=MAX_VAL,
    allow_nan=False,
    allow_infinity=False,
)
strat_dates = st.dates()

# Every generated frame has at least one row.
strat_df_index = hpd.range_indexes(min_size=1)

# One column per element strategy defined above, plus a boolean column.
df_hypo_mixed = hpd.data_frames(
    columns=[
        hpd.column(name="col1_text", elements=strat_text),
        hpd.column(name="col2_ints", elements=strat_ints),
        hpd.column(name="col3_floats", elements=strat_floats),
        hpd.column(name="col4_dates", elements=strat_dates),
        hpd.column(name="col4_bools", elements=st.booleans()),
    ],
    index=strat_df_index,
)

# Five columns that all draw from the text strategy.
df_hypo_text = hpd.data_frames(
    columns=hpd.columns(5, elements=strat_text),
    index=strat_df_index,
)
df_hypo_ints = hpd.data_frames(columns=hpd.columns(5, elements=strat_ints),
Exemple #26
0
from tests.common.debug import minimal, find_any
from tests.pandas.helpers import supported_by_pandas


@given(
    pdst.data_frames(
        [pdst.column('a', dtype=int), pdst.column('b', dtype=float)]
    )
)
def test_can_have_columns_of_distinct_types(df):
    """Each column must keep the dtype it was declared with."""
    for name, python_type in (('a', int), ('b', float)):
        assert df[name].dtype == np.dtype(python_type)


@given(
    pdst.data_frames(
        [pdst.column(dtype=int)],
        index=pdst.range_indexes(min_size=1, max_size=5),
    )
)
def test_respects_size_bounds(df):
    """The row count must stay within the index size bounds (1 to 5)."""
    row_count = len(df)
    assert 1 <= row_count <= 5


@given(pdst.data_frames(pdst.columns(['A', 'B'], dtype=float)))
def test_can_specify_just_column_names(df):
    """Columns declared by name only must be accessible under those names."""
    for name in ('A', 'B'):
        # Plain lookup: fails if the column does not exist.
        df[name]


@given(pdst.data_frames(pdst.columns(2, dtype=float)))
def test_can_specify_just_column_count(df):
    """Columns declared by count only must be accessible as 0 and 1."""
    for position in (0, 1):
        # Plain lookup: fails if the column does not exist.
        df[position]
import hypothesis.extra.pandas as pdst
import hypothesis.strategies as st
from hypothesis import assume, given
from tests.common.debug import find_any
from tests.pandas.helpers import supported_by_pandas


@given(st.data())
def test_can_create_a_series_of_any_dtype(data):
    """A series drawn for a pandas-supported dtype must carry that dtype."""
    drawn = data.draw(npst.scalar_dtypes())
    dtype = np.dtype(drawn)
    # Discard dtypes the helper reports as unsupported by pandas.
    assume(supported_by_pandas(dtype))
    generated = data.draw(pdst.series(dtype=dtype))
    # Compare against the dtype pandas itself assigns to an empty Series.
    expected_dtype = pandas.Series([], dtype=dtype).dtype
    assert generated.dtype == expected_dtype


@given(
    pdst.series(
        dtype=float,
        index=pdst.range_indexes(min_size=2, max_size=5),
    )
)
def test_series_respects_size_bounds(s):
    """The series length must stay within the index size bounds (2 to 5)."""
    length = len(s)
    assert 2 <= length <= 5


def test_can_fill_series():
    """A NaN fill value must be able to appear even when elements exclude NaN."""

    def contains_nan(values):
        return np.isnan(values).any()

    element_strategy = st.floats(allow_nan=False)
    nan_backed = pdst.series(elements=element_strategy, fill=st.just(float("nan")))
    find_any(nan_backed, contains_nan)


@given(pdst.series(dtype=int))
def test_can_generate_integral_series(s):
    """A series requested with dtype=int must have numpy's int dtype."""
    expected_dtype = np.dtype(int)
    assert s.dtype == expected_dtype
import hypothesis.extra.pandas as pdst
import hypothesis.strategies as st
from hypothesis import HealthCheck, given, reject, settings
from tests.common.debug import find_any
from tests.pandas.helpers import supported_by_pandas


@given(
    pdst.data_frames(
        [
            pdst.column("a", dtype=int),
            pdst.column("b", dtype=float),
        ]
    )
)
def test_can_have_columns_of_distinct_types(df):
    """Each column must keep the dtype it was declared with."""
    for name, python_type in (("a", int), ("b", float)):
        assert df[name].dtype == np.dtype(python_type)


@given(
    pdst.data_frames(
        [pdst.column(dtype=int)],
        index=pdst.range_indexes(min_size=1, max_size=5),
    )
)
def test_respects_size_bounds(df):
    """The row count must stay within the index size bounds (1 to 5)."""
    row_count = len(df)
    assert 1 <= row_count <= 5


@given(pdst.data_frames(pdst.columns(["A", "B"], dtype=float)))
def test_can_specify_just_column_names(df):
    """Columns declared by name only must be accessible under those names."""
    for name in ("A", "B"):
        # Plain lookup: fails if the column does not exist.
        df[name]


@given(pdst.data_frames(pdst.columns(2, dtype=float)))
def test_can_specify_just_column_count(df):
    """Columns declared by count only must be accessible as 0 and 1.

    Two columns are requested, so both positions are looked up; a missing
    column makes the lookup fail.
    """
    df[0]
    df[1]
import hypothesis.extra.pandas as pdst
import hypothesis.strategies as st
from hypothesis import HealthCheck, given, reject, settings
from tests.common.debug import find_any
from tests.pandas.helpers import supported_by_pandas


@given(
    pdst.data_frames(
        [
            pdst.column("a", dtype=int),
            pdst.column("b", dtype=float),
        ]
    )
)
def test_can_have_columns_of_distinct_types(df):
    """Each column must keep the dtype it was declared with."""
    for name, python_type in (("a", int), ("b", float)):
        assert df[name].dtype == np.dtype(python_type)


@given(
    pdst.data_frames(
        [pdst.column(dtype=int)],
        index=pdst.range_indexes(min_size=1, max_size=5),
    )
)
def test_respects_size_bounds(df):
    """The row count must stay within the index size bounds (1 to 5)."""
    row_count = len(df)
    assert 1 <= row_count <= 5


@given(pdst.data_frames(pdst.columns(["A", "B"], dtype=float)))
def test_can_specify_just_column_names(df):
    """Columns declared by name only must be accessible under those names."""
    for name in ("A", "B"):
        # Plain lookup: fails if the column does not exist.
        df[name]


@given(pdst.data_frames(pdst.columns(2, dtype=float)))
def test_can_specify_just_column_count(df):
    """Columns declared by count only must be accessible as 0 and 1.

    Two columns are requested, so both positions are looked up; a missing
    column makes the lookup fail.
    """
    df[0]
    df[1]
Exemple #30
0
def test_arbitrary_range_index(i, j, data):
    """Draw a range index for arbitrary size bounds; `j` may be None."""
    # When both bounds are given, pass the smaller one first.
    lower, upper = (i, j) if j is None else sorted((i, j))
    data.draw(pdst.range_indexes(lower, upper))
Exemple #31
0
from .. reco.deconv_functions import richardson_lucy
from .. reco.deconv_functions import InterpolationMethod

from .. core.core_functions   import in_range
from .. core.core_functions   import shift_to_bin_centers
from .. core.testing_utils    import assert_dataframes_close

from ..   io.dst_io           import load_dst

from scipy.stats              import multivariate_normal


@given(
    data_frames(
        columns=[
            column('A', dtype=float, elements=floats(1, 1e3)),
            column('B', dtype=float, elements=floats(1, 1e3)),
            column('C', dtype=float, elements=floats(1, 1e3)),
        ],
        index=range_indexes(min_size=2, max_size=10),
    )
)
def test_cut_and_redistribute_df(df):
    """Compare cut_and_redistribute_df with a hand-written cut and rescale.

    Rows are kept where 'A' exceeds its rounded mean; 'B' and 'C' of the
    kept rows are rescaled so that their column sums equal the sums over
    the full frame.
    """
    cut_var = 'A'
    redist_var = ['B', 'C']
    cut_val = round(df[cut_var].mean(), 3)
    cut_condition = f'{cut_var} > {cut_val:.3f}'

    df_cut = cut_and_redistribute_df(cut_condition, redist_var)(df)

    # Reference result computed directly with pandas.
    keep_rows = df[cut_var].values > cut_val
    df_cut_manual = df.loc[keep_rows, :].copy()
    sums_before_cut = df.loc[:, redist_var].sum()
    sums_after_cut = df_cut_manual.loc[:, redist_var].sum()
    df_cut_manual.loc[:, redist_var] = (
        df_cut_manual.loc[:, redist_var] * sums_before_cut / sums_after_cut
    )
    assert_dataframes_close(df_cut, df_cut_manual)


def test_drop_isolated_sensors():
    size          = 20
    dist          = [10.1, 10.1]
Exemple #32
0
from hypothesis.strategies import text


@fixture()
def empty_dataframe(
        columns=('int_value', 'float_value', 'bool_value', 'str_value'),
        dtypes=('int32', 'float32', 'bool', 'object'),
        index=None):
    """Build a DataFrame that has typed columns but no values.

    :param columns: column names, one per entry of ``dtypes``.
    :param dtypes: dtype for each column, in the same order as ``columns``.
    :param index: optional index passed to the DataFrame constructor.
    :returns: DataFrame with one empty Series of the given dtype per column.
    """
    # Defaults are tuples rather than lists so that no mutable object is
    # shared between calls.
    assert len(columns) == len(dtypes)
    df = pd.DataFrame(index=index)
    for c, d in zip(columns, dtypes):
        df[c] = pd.Series(dtype=d)
    return df


# Frames with one to five rows and int, float and bool columns.
dataframe = data_frames(
    index=range_indexes(min_size=1, max_size=5),
    columns=[
        column('int_value', dtype=int),
        column('float_val', dtype=float),
        column('bool_value', dtype=bool),
    ],
)

# Same layout as `dataframe` but without the bool column.
dataframe_diff = data_frames(
    index=range_indexes(min_size=1, max_size=5),
    columns=[
        column('int_value', dtype=int),
        column('float_val', dtype=float),
    ],
)

strings_dataframe = data_frames(index=range_indexes(min_size=1, max_size=5),
                                columns=[
                                    column('str_val',
                                           elements=text(
 e(pdst.data_frames, pdst.columns(1)),
 e(pdst.data_frames, pdst.columns(1, dtype=float, fill=1)),
 e(pdst.data_frames, pdst.columns(1, dtype=float, elements=1)),
 e(pdst.data_frames, pdst.columns(1, fill=1, dtype=float)),
 e(pdst.data_frames, pdst.columns(['A', 'A'], dtype=float)),
 e(pdst.data_frames, pdst.columns(1, elements=st.none(), dtype=int)),
 e(pdst.data_frames, 1),
 e(pdst.data_frames, [1]),
 e(pdst.data_frames, pdst.columns(1, dtype='category')),
 e(pdst.data_frames,
   pdst.columns(['A'], dtype=bool),
   rows=st.tuples(st.booleans(), st.booleans())),
 e(pdst.data_frames,
   pdst.columns(1, elements=st.booleans()),
   rows=st.tuples(st.booleans())),
 e(pdst.data_frames, rows=st.integers(), index=pdst.range_indexes(0, 0)),
 e(pdst.data_frames, rows=st.integers(), index=pdst.range_indexes(1, 1)),
 e(pdst.data_frames, pdst.columns(1, dtype=int), rows=st.integers()),
 e(pdst.indexes),
 e(pdst.indexes, dtype='category'),
 e(pdst.indexes, dtype='not a dtype'),
 e(pdst.indexes, elements='not a strategy'),
 e(pdst.indexes, elements=st.text(), dtype=float),
 e(pdst.indexes, elements=st.none(), dtype=int),
 e(pdst.indexes, dtype=int, max_size=0, min_size=1),
 e(pdst.indexes, dtype=int, unique='true'),
 e(pdst.indexes, dtype=int, min_size='0'),
 e(pdst.indexes, dtype=int, max_size='1'),
 e(pdst.range_indexes, 1, 0),
 e(pdst.range_indexes, min_size='0'),
 e(pdst.range_indexes, max_size='1'),
    not_has_all_quotechars,
    strat_dates,
    strat_floats,
    strat_ints,
    strat_text,
)
from benchmarks.read_sql.read_sql import read_sql

# Frame strategy with text, integer, float and date columns and at least
# one row.
hypo_df = hpd.data_frames(
    index=hpd.range_indexes(min_size=1),
    columns=[
        hpd.column(name="col1_text", elements=strat_text),
        hpd.column(name="col2_ints", elements=strat_ints),
        hpd.column(name="col3_floats", elements=strat_floats),
        hpd.column(name="col4_dates", elements=strat_dates),
    ],
)


class TestReadSqlBasic:
    """
    For all tests, the 'actual' is retrieved using the built-in pandas methods, to compare to the
    'expected' which used bcpandas.

    Because dtypes change when reading from text files, ignoring dtypes checks. TODO how to really fix this
    """

    table_name = "lotr_readsql"
    view_name = f"v_{table_name}"

    @given(df=hypo_df)
from hypothesis import given
from hypothesis.extra.pandas import columns, data_frames, range_indexes
import hypothesis.strategies as st
import pandas as pd

from analyse_weather import get_data, hottest_summer


@given(
    data_frames(
        index=range_indexes(min_size=1),
        columns=columns(
            ['JUN', 'JUL', 'AUG'],
            elements=st.floats(allow_nan=True),
        ),
    )
)
def test_hottest_summer_auto(df):
    """hottest_summer must not return a null value for a generated frame.

    The frame has JUN/JUL/AUG float columns (NaN allowed) and at least
    one row.
    """
    result = hottest_summer(df)
    assert not pd.isnull(result)


# Below is another example, this time using fixtures, for the same function:
import pytest
from pandas import DataFrame


@pytest.fixture
def full_dataset():
    """Fixture providing whatever get_data() returns."""
    dataset = get_data()
    return dataset

 e(pdst.data_frames, pdst.columns(1)),
 e(pdst.data_frames, pdst.columns(1, dtype=float, fill=1)),
 e(pdst.data_frames, pdst.columns(1, dtype=float, elements=1)),
 e(pdst.data_frames, pdst.columns(1, fill=1, dtype=float)),
 e(pdst.data_frames, pdst.columns(['A', 'A'], dtype=float)),
 e(pdst.data_frames, pdst.columns(1, elements=st.none(), dtype=int)),
 e(pdst.data_frames, 1),
 e(pdst.data_frames, [1]),
 e(pdst.data_frames, pdst.columns(1, dtype='category')),
 e(pdst.data_frames,
     pdst.columns(['A'], dtype=bool),
     rows=st.tuples(st.booleans(), st.booleans())),
 e(pdst.data_frames,
     pdst.columns(1, elements=st.booleans()),
     rows=st.tuples(st.booleans())),
 e(pdst.data_frames, rows=st.integers(), index=pdst.range_indexes(0, 0)),
 e(pdst.data_frames, rows=st.integers(), index=pdst.range_indexes(1, 1)),
 e(pdst.data_frames, pdst.columns(1, dtype=int), rows=st.integers()),
 e(pdst.indexes),
 e(pdst.indexes, dtype='category'),
 e(pdst.indexes, dtype='not a dtype'),
 e(pdst.indexes, elements='not a strategy'),
 e(pdst.indexes, elements=st.text(), dtype=float),
 e(pdst.indexes, elements=st.none(), dtype=int),
 e(pdst.indexes, dtype=int, max_size=0, min_size=1),
 e(pdst.indexes, dtype=int, unique='true'),
 e(pdst.range_indexes, 1, 0),
 e(pdst.series),
 e(pdst.series, dtype='not a dtype'),
 e(pdst.series, elements='not a strategy'),
 e(pdst.series, elements=st.none(), dtype=int),
Exemple #37
0
class TestGPUPredict:
    def test_predict(self):
        """Check that GPU and CPU predictors give matching margins.

        For several data shapes, a depth-1 gpu_hist model is trained with
        each predictor setting, and the margin predictions on the train,
        validation and test matrices are compared.
        """
        iterations = 10
        np.random.seed(1)
        # This test passes for tree_method=gpu_hist and tree_method=exact,
        # but for `hist` and `approx` the floating point error accumulates
        # faster and the test fails even when the tolerance is set to 1e-4.
        # For `hist`, the mismatching rate with 5000 rows is 0.04.
        for num_rows in [10, 1000, 5000]:
            for num_cols in [10, 50, 500]:
                labels = [0, 1] * (num_rows // 2)
                # Random draws happen in train, validation, test order so
                # the seeded data stays the same.
                dtrain, dval, dtest = (
                    xgb.DMatrix(np.random.randn(num_rows, num_cols),
                                label=labels)
                    for _ in range(3)
                )
                watchlist = [(dtrain, 'train'), (dval, 'validation')]
                param = {
                    "objective": "binary:logistic",
                    "predictor": "gpu_predictor",
                    'eval_metric': 'logloss',
                    'tree_method': 'gpu_hist',
                    'max_depth': 1
                }
                res = {}
                bst = xgb.train(param,
                                dtrain,
                                iterations,
                                evals=watchlist,
                                evals_result=res)
                assert self.non_increasing(res["train"]["logloss"])

                matrices = {'train': dtrain, 'test': dtest, 'val': dval}
                gpu_margins = {
                    key: bst.predict(dmat, output_margin=True)
                    for key, dmat in matrices.items()
                }

                # Retrain with the CPU predictor on the same data.
                param["predictor"] = "cpu_predictor"
                bst_cpu = xgb.train(param, dtrain, iterations, evals=watchlist)
                cpu_margins = {
                    key: bst_cpu.predict(dmat, output_margin=True)
                    for key, dmat in matrices.items()
                }

                for key in ('train', 'val', 'test'):
                    np.testing.assert_allclose(cpu_margins[key],
                                               gpu_margins[key],
                                               rtol=1e-6)

    def non_increasing(self, L):
        """Return True if no element exceeds its predecessor by 0.001 or more."""
        for previous, current in zip(L, L[1:]):
            if not (current - previous < 0.001):
                return False
        return True

    # Test case for a bug where multiple batch predictions made on a
    # test set produce incorrect results
    @pytest.mark.skipif(**tm.no_sklearn())
    def test_multi_predict(self):
        """Repeated GPU predictions must agree with each other and with CPU."""
        from sklearn.datasets import make_regression
        from sklearn.model_selection import train_test_split

        n = 1000
        X, y = make_regression(n, random_state=rng)
        X_train, X_test, y_train, _ = train_test_split(X, y, random_state=123)
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dtest = xgb.DMatrix(X_test)

        # Train one booster per predictor setting on the same data.
        params = {"tree_method": "gpu_hist", "predictor": "gpu_predictor"}
        bst_gpu_predict = xgb.train(params, dtrain)

        params['predictor'] = "cpu_predictor"
        bst_cpu_predict = xgb.train(params, dtrain)

        # Predict twice with the GPU booster on the same test matrix.
        first_gpu = bst_gpu_predict.predict(dtest)
        second_gpu = bst_gpu_predict.predict(dtest)
        cpu_predict = bst_cpu_predict.predict(dtest)

        assert np.allclose(first_gpu, second_gpu)
        assert np.allclose(first_gpu, cpu_predict)

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_sklearn(self):
        """XGBRegressor scores must match between CPU and GPU predictors."""
        n_samples, n_features = 15000, 14
        tr_size = 2500
        X = np.random.rand(n_samples, n_features)
        y = 200 * np.matmul(X, np.arange(-3, -3 + n_features))
        # The first `tr_size` rows are for training, the rest for testing.
        X_train, X_test = X[:tr_size, :], X[tr_size:, :]
        y_train, y_test = y[:tr_size], y[tr_size:]

        # First with cpu_predictor
        params = {
            'tree_method': 'gpu_hist',
            'predictor': 'cpu_predictor',
            'n_jobs': -1,
            'seed': 123
        }
        cpu_model = xgb.XGBRegressor(**params).fit(X_train, y_train)
        cpu_train_score = cpu_model.score(X_train, y_train)
        cpu_test_score = cpu_model.score(X_test, y_test)

        # Now with gpu_predictor
        params['predictor'] = 'gpu_predictor'
        gpu_model = xgb.XGBRegressor(**params).fit(X_train, y_train)
        gpu_train_score = gpu_model.score(X_train, y_train)
        gpu_test_score = gpu_model.score(X_test, y_test)

        assert np.allclose(cpu_train_score, gpu_train_score)
        assert np.allclose(cpu_test_score, gpu_test_score)

    @pytest.mark.skipif(**tm.no_cupy())
    def test_inplace_predict_cupy(self):
        """In-place prediction on cupy arrays must match DMatrix prediction."""
        import cupy as cp
        cp.cuda.runtime.setDevice(0)
        rows, cols = 1000, 10
        cp.random.set_random_state(cp.random.RandomState(1994))
        X = cp.random.randn(rows, cols)
        y = cp.random.randn(rows)

        booster = xgb.train({'tree_method': 'gpu_hist'},
                            xgb.DMatrix(X, y),
                            num_boost_round=10)

        # Compare both prediction paths on the first ten rows.
        head = X[:10, ...]
        test = xgb.DMatrix(head)
        predt_from_array = booster.inplace_predict(head)
        predt_from_dmatrix = booster.predict(test)
        cp.testing.assert_allclose(predt_from_array, predt_from_dmatrix)

        def predict_dense(x):
            # True when in-place and DMatrix predictions are exactly equal.
            inplace_predt = booster.inplace_predict(x)
            copied_predt = cp.array(booster.predict(xgb.DMatrix(x)))
            return cp.all(copied_predt == inplace_predt)

        # Don't do this on Windows, see issue #5793
        if sys.platform.startswith("win"):
            pytest.skip(
                'Multi-threaded in-place prediction with cuPy is not working on Windows'
            )
        for _ in range(10):
            run_threaded_predict(X, rows, predict_dense)

    @pytest.mark.skipif(**tm.no_cudf())
    def test_inplace_predict_cudf(self):
        """In-place prediction on a cudf frame must match DMatrix prediction."""
        import cupy as cp
        import cudf
        import pandas as pd
        rows, cols = 1000, 10
        rng = np.random.RandomState(1994)
        cp.cuda.runtime.setDevice(0)
        # Features are drawn before labels so the seeded stream is consumed
        # in the same order.
        features = pd.DataFrame(rng.randn(rows, cols))
        y = rng.randn(rows)
        X = cudf.from_pandas(features)

        booster = xgb.train({'tree_method': 'gpu_hist'},
                            xgb.DMatrix(X, y),
                            num_boost_round=10)
        test = xgb.DMatrix(X)
        predt_from_array = booster.inplace_predict(X)
        predt_from_dmatrix = booster.predict(test)
        cp.testing.assert_allclose(predt_from_array, predt_from_dmatrix)

        def predict_df(x):
            # True when in-place and DMatrix predictions are exactly equal.
            inplace_predt = booster.inplace_predict(x)
            copied_predt = cp.array(booster.predict(xgb.DMatrix(x)))
            return cp.all(copied_predt == inplace_predt)

        for _ in range(10):
            run_threaded_predict(X, rows, predict_df)

    @given(
        strategies.integers(1, 10),
        tm.dataset_strategy,
        shap_parameter_strategy,
    )
    @settings(deadline=None)
    def test_shap(self, num_rounds, dataset, param):
        """Contributions summed over the last axis must equal the margin."""
        param.update({"predictor": "gpu_predictor", "gpu_id": 0})
        param = dataset.set_params(param)
        bst = xgb.train(param, dataset.get_dmat(), num_rounds)
        test_dmat = xgb.DMatrix(
            dataset.X, dataset.y, dataset.w, dataset.margin
        )
        shap = bst.predict(test_dmat, pred_contribs=True)
        margin = bst.predict(test_dmat, output_margin=True)
        # Examples with no labels are discarded.
        assume(len(dataset.y) > 0)
        contrib_total = np.sum(shap, axis=shap.ndim - 1)
        assert np.allclose(contrib_total, margin, rtol=1e-3, atol=1e-3)

    @given(
        strategies.integers(1, 10),
        tm.dataset_strategy,
        shap_parameter_strategy,
    )
    @settings(deadline=None, max_examples=20)
    def test_shap_interactions(self, num_rounds, dataset, param):
        """Interactions summed over the last two axes must equal the margin."""
        param.update({"predictor": "gpu_predictor", "gpu_id": 0})
        param = dataset.set_params(param)
        bst = xgb.train(param, dataset.get_dmat(), num_rounds)
        test_dmat = xgb.DMatrix(
            dataset.X, dataset.y, dataset.w, dataset.margin
        )
        shap = bst.predict(test_dmat, pred_interactions=True)
        margin = bst.predict(test_dmat, output_margin=True)
        # Examples with no labels are discarded.
        assume(len(dataset.y) > 0)
        last_two_axes = (shap.ndim - 1, shap.ndim - 2)
        interaction_total = np.sum(shap, axis=last_two_axes)
        assert np.allclose(interaction_total, margin, rtol=1e-3, atol=1e-3)

    def test_predict_leaf_basic(self):
        """run_predict_leaf must give equal results for GPU and CPU predictors."""
        # GPU is evaluated first, then CPU.
        gpu_leaf, cpu_leaf = (
            run_predict_leaf(predictor)
            for predictor in ('gpu_predictor', 'cpu_predictor')
        )
        np.testing.assert_equal(gpu_leaf, cpu_leaf)

    def run_predict_leaf_booster(self, param, num_rounds, dataset):
        """Train a booster and compare leaf predictions across predictors.

        :param param: training parameters, passed through dataset.set_params.
        :param num_rounds: number of boosting rounds.
        :param dataset: object providing set_params() and get_dmat().
        """
        param = dataset.set_params(param)
        m = dataset.get_dmat()
        booster = xgb.train(param,
                            dtrain=dataset.get_dmat(),
                            num_boost_round=num_rounds)

        # Predict leaves on the same matrix, first with CPU, then with GPU.
        leaves = {}
        for predictor in ('cpu_predictor', 'gpu_predictor'):
            booster.set_param({'predictor': predictor})
            leaves[predictor] = booster.predict(m, pred_leaf=True)

        np.testing.assert_equal(leaves['cpu_predictor'],
                                leaves['gpu_predictor'])

    @given(predict_parameter_strategy, tm.dataset_strategy)
    @settings(deadline=None)
    def test_predict_leaf_gbtree(self, param, dataset):
        """Leaf prediction check for the gbtree booster with gpu_hist."""
        param.update({'booster': 'gbtree', 'tree_method': 'gpu_hist'})
        self.run_predict_leaf_booster(param, 10, dataset)

    @given(predict_parameter_strategy, tm.dataset_strategy)
    @settings(deadline=None)
    def test_predict_leaf_dart(self, param, dataset):
        """Leaf prediction check for the dart booster with gpu_hist."""
        param.update({'booster': 'dart', 'tree_method': 'gpu_hist'})
        self.run_predict_leaf_booster(param, 10, dataset)

    @pytest.mark.skipif(**tm.no_sklearn())
    @pytest.mark.skipif(**tm.no_pandas())
    @given(df=data_frames(
        [
            column('x0', elements=strategies.integers(min_value=0, max_value=3)),
            column('x1', elements=strategies.integers(min_value=0, max_value=5)),
        ],
        index=range_indexes(min_size=20, max_size=50),
    ))
    @settings(deadline=None)
    def test_predict_categorical_split(self, df):
        """Predicted RMSE on categorical data must match the last eval RMSE.

        :param df: generated frame with integer columns 'x0' (0..3) and
            'x1' (0..5) and 20 to 50 rows; converted to category dtype
            before training.
        """
        from sklearn.metrics import mean_squared_error

        df = df.astype('category')
        x0, x1 = df['x0'].to_numpy(), df['x1'].to_numpy()
        y = (x0 * 10 - 20) + (x1 - 2)
        dtrain = xgb.DMatrix(df, label=y, enable_categorical=True)

        params = {
            'tree_method': 'gpu_hist',
            'predictor': 'gpu_predictor',
            'max_depth': 3,
            'learning_rate': 1.0,
            'base_score': 0.0,
            'eval_metric': 'rmse'
        }

        eval_history = {}
        bst = xgb.train(params,
                        dtrain,
                        num_boost_round=5,
                        evals=[(dtrain, 'train')],
                        verbose_eval=False,
                        evals_result=eval_history)

        pred = bst.predict(dtrain)
        # Take the square root explicitly instead of passing `squared=False`,
        # which newer scikit-learn releases deprecate and then remove.
        rmse = np.sqrt(mean_squared_error(y_true=y, y_pred=pred))
        np.testing.assert_almost_equal(rmse,
                                       eval_history['train']['rmse'][-1],
                                       decimal=5)
from tests.common.debug import minimal, find_any
from tests.pandas.helpers import supported_by_pandas


@given(
    pdst.data_frames(
        [pdst.column('a', dtype=int), pdst.column('b', dtype=float)]
    )
)
def test_can_have_columns_of_distinct_types(df):
    """Each column must keep the dtype it was declared with."""
    for name, python_type in (('a', int), ('b', float)):
        assert df[name].dtype == np.dtype(python_type)


@given(
    pdst.data_frames(
        [pdst.column(dtype=int)],
        index=pdst.range_indexes(min_size=1, max_size=5),
    )
)
def test_respects_size_bounds(df):
    """The row count must stay within the index size bounds (1 to 5)."""
    row_count = len(df)
    assert 1 <= row_count <= 5


@given(pdst.data_frames(pdst.columns(['A', 'B'], dtype=float)))
def test_can_specify_just_column_names(df):
    """Columns declared by name only must be accessible under those names."""
    for name in ('A', 'B'):
        # Plain lookup: fails if the column does not exist.
        df[name]


@given(pdst.data_frames(pdst.columns(2, dtype=float)))
def test_can_specify_just_column_count(df):
    """Columns declared by count only must be accessible as 0 and 1."""
    for position in (0, 1):
        # Plain lookup: fails if the column does not exist.
        df[position]