def null_dataframe_masks(
    draw,
    strategy: Optional[SearchStrategy],
    nullable_columns: Dict[str, bool],
):
    """Strategy for masking values in a pandas DataFrame.

    :param strategy: an optional hypothesis strategy. If specified, the
        pandas dtype strategy will be chained onto this strategy.
    :param nullable_columns: dictionary where keys are column names and
        values indicate whether that column is nullable.
    """
    frame = draw(strategy)
    n_rows = frame.shape[0]
    # One boolean mask column per data column; a non-nullable column can
    # only ever draw False, so it is never masked.
    mask_columns = [
        pdst.column(
            name=col_name,
            elements=st.booleans() if is_nullable else st.just(False),
            dtype=bool,
            fill=st.just(False),
        )
        for col_name, is_nullable in nullable_columns.items()
    ]
    null_mask = draw(
        pdst.data_frames(
            columns=mask_columns,
            index=pdst.range_indexes(min_size=n_rows, max_size=n_rows),
        )
    )
    # only keep examples where at least one value is actually masked
    hypothesis.assume(null_mask.any(axis=None))
    return frame.mask(null_mask)
def series_strategy(
    pandas_dtype: PandasDtype,
    strategy: Optional[SearchStrategy] = None,
    *,
    checks: Optional[Sequence] = None,
    nullable: Optional[bool] = False,
    allow_duplicates: Optional[bool] = True,
    name: Optional[str] = None,
    size: Optional[int] = None,
):
    """Strategy to generate a pandas Series.

    :param pandas_dtype: :class:`pandera.dtypes.PandasDtype` instance.
    :param strategy: an optional hypothesis strategy. If specified, the
        pandas dtype strategy will be chained onto this strategy.
    :param checks: sequence of :class:`~pandera.checks.Check` s to constrain
        the values of the data in the column/index.
    :param nullable: whether or not generated Series contains null values.
    :param allow_duplicates: whether or not generated Series contains
        duplicates.
    :param name: name of the Series.
    :param size: number of elements in the Series.
    :returns: ``hypothesis`` strategy.
    """
    elements = field_element_strategy(pandas_dtype, strategy, checks=checks)
    strategy = (
        pdst.series(
            elements=elements,
            dtype=pandas_dtype.numpy_dtype,
            index=pdst.range_indexes(
                min_size=0 if size is None else size, max_size=size
            ),
            unique=not allow_duplicates,
        )
        # drop zero-length examples: an empty Series cannot exercise checks
        .filter(lambda x: x.shape[0] > 0)
        .map(lambda x: x.rename(name))
        .map(lambda x: x.astype(pandas_dtype.str_alias))
    )
    if nullable:
        strategy = null_field_masks(strategy)

    def undefined_check_strategy(strategy, check):
        """Strategy for checks with undefined strategies."""
        # FIX: the original message fragments were concatenated without a
        # separating space, so the warning read "...strategy.Falling back...".
        warnings.warn(
            "Vectorized check doesn't have a defined strategy. "
            "Falling back to filtering drawn values based on the check "
            "definition. This can considerably slow down data-generation."
        )

        def _check_fn(series):
            return check(series).check_passed

        return strategy.filter(_check_fn)

    # checks without a registered strategy fall back to filtering, which is
    # slow but always correct
    for check in checks if checks is not None else []:
        if not hasattr(check, "strategy") and not check.element_wise:
            strategy = undefined_check_strategy(strategy, check)

    return strategy
def two_equal_size_series(draw):
    """Draw a pair of float Series and keep only equal-length pairs."""
    strat = series(
        dtype=np.float64,
        elements=float_strategy,
        index=range_indexes(min_size=1),
    )
    first = draw(strat)
    second = draw(strat)
    # discard examples where the two independent draws differ in length
    assume(len(first) == len(second))
    return first, second
def test_uniqueness_does_not_affect_other_rows_2():
    """A duplicate in non-unique column 'A' must still be reachable even
    though column 'B' is declared unique."""
    frames = pdst.data_frames(
        [
            pdst.column('A', dtype=int, unique=False),
            pdst.column('B', dtype=int, unique=True),
        ],
        rows=st.tuples(st.integers(0, 10), st.integers(0, 10)),
        index=pdst.range_indexes(2, 2),
    )
    find_any(frames, lambda x: x['A'][0] == x['A'][1])
def categoricaldf_strategy():
    """Strategy producing a small dataframe with categorical-like columns."""
    name_col = column("names", st.sampled_from(names))
    number_col = column("numbers", st.sampled_from(range(3)))
    return data_frames(
        columns=[name_col, number_col],
        index=range_indexes(min_size=1, max_size=20),
    )
def test_uniqueness_does_not_affect_other_rows_2():
    """Column B's unique=True must not force uniqueness onto column A."""
    has_duplicate_in_a = lambda x: x['A'][0] == x['A'][1]
    strategy = pdst.data_frames(
        [
            pdst.column('A', dtype=int, unique=False),
            pdst.column('B', dtype=int, unique=True),
        ],
        rows=st.tuples(st.integers(0, 10), st.integers(0, 10)),
        index=pdst.range_indexes(2, 2),
    )
    find_any(strategy, has_duplicate_in_a)
def cmatrix_dataframes():
    """Strategy for dataframes with alfa/beta optics columns plus 'R'."""
    cols = [s_column()]
    cols += [alf_column(axis) for axis in ("X", "Y")]
    cols += [bet_column(axis) for axis in ("X", "Y")]
    cols.append(generic_column("R"))
    return data_frames(
        columns=cols,
        index=range_indexes(min_size=2, max_size=MAX_NRES),
    )
def nulldf_strategy():
    """Strategy for a dataframe where columns '2' and '3' are all-NaN and
    column '1' may contain NaN/inf."""
    float_col = column("1", st.floats(allow_nan=True, allow_infinity=True))
    nan_only = st.sampled_from([np.nan])
    return data_frames(
        columns=[float_col, column("2", nan_only), column("3", nan_only)],
        index=range_indexes(min_size=3, max_size=20),
    )
def test_uniqueness_does_not_affect_other_rows_2():
    """Bool column 'A' (unique=False) can still repeat a value across rows
    when sibling column 'B' is unique."""
    strategy = pdst.data_frames(
        [
            pdst.column("A", dtype=bool, unique=False),
            pdst.column("B", dtype=int, unique=True),
        ],
        rows=st.tuples(st.booleans(), st.integers(0, 10)),
        index=pdst.range_indexes(2, 2),
    )
    find_any(strategy, lambda x: x["A"][0] == x["A"][1])
def full_dataframes():
    """Strategy for dataframes carrying beta/mu/dispersion columns for both
    planes plus the K0L..K3SL strength columns."""
    cols = [s_column()]
    for make_column in (bet_column, mu_column, d_column):
        cols += [make_column("X"), make_column("Y")]
    for strength in ("K0L", "K0SL", "K1L", "K1SL",
                     "K2L", "K2SL", "K3L", "K3SL"):
        cols.append(generic_column(strength))
    return data_frames(
        columns=cols,
        index=range_indexes(min_size=2, max_size=MAX_NRES),
    )
def multiindex_strategy(
    pandera_dtype: Optional[DataType] = None,
    strategy: Optional[SearchStrategy] = None,
    *,
    indexes: Optional[List] = None,
    size: Optional[int] = None,
):
    """Strategy to generate a pandas MultiIndex object.

    :param pandera_dtype: :class:`pandera.dtypes.DataType` instance.
    :param strategy: an optional hypothesis strategy. If specified, the
        pandas dtype strategy will be chained onto this strategy.
    :param indexes: a list of :class:`~pandera.schema_components.Index`
        objects.
    :param size: number of elements in the Series.
    :returns: ``hypothesis`` strategy.
    """
    # pylint: disable=unnecessary-lambda
    if strategy:
        raise BaseStrategyOnlyError(
            "The dataframe strategy is a base strategy. You cannot specify "
            "the strategy argument to chain it to a parent strategy."
        )
    indexes = [] if indexes is None else indexes
    # column label is the index name when present, otherwise the position
    index_dtypes = {
        index.name if index.name is not None else i: str(index.dtype)
        for i, index in enumerate(indexes)
    }
    nullable_index = {
        index.name if index.name is not None else i: index.nullable
        for i, index in enumerate(indexes)
    }
    # generate the MultiIndex levels as dataframe columns, converted via
    # pd.MultiIndex.from_frame at the end
    strategy = pdst.data_frames(
        [index.strategy_component() for index in indexes],
        index=pdst.range_indexes(
            min_size=0 if size is None else size, max_size=size
        ),
    ).map(lambda x: x.astype(index_dtypes))
    # this is a hack to convert np.str_ data values into native python str.
    for name, dtype in index_dtypes.items():
        if dtype in {"object", "str"} or dtype.startswith("string"):
            # BUG FIX: bind `name` as a default argument. The lambda runs
            # lazily when examples are drawn (after this loop has finished),
            # so a plain closure captured only the *last* string column,
            # converting the wrong column and skipping earlier ones.
            strategy = strategy.map(
                lambda df, name=name: df.assign(**{name: df[name].map(str)})
            )
    if any(nullable_index.values()):
        strategy = null_dataframe_masks(strategy, nullable_index)
    return strategy.map(pd.MultiIndex.from_frame)
def gen_columns_and_subset(draw, elements=names):
    """Draw a dataframe plus a non-empty subset of its column names.

    The subset is built by sampling positions with replacement, so it may
    contain fewer names than the drawn target count.
    """
    column_names = draw(lists(elements, min_size=1, unique=True))
    num_columns_to_keep = draw(
        integers(min_value=1, max_value=len(column_names)))
    columns_to_keep = set()
    for _ in range(num_columns_to_keep):
        position = draw(
            integers(min_value=0, max_value=len(column_names) - 1))
        columns_to_keep.add(column_names[position])
    # With column data and 'keeper' columns selected, utilize draw to return
    # a hypothesis DataFrame column strategies defined.
    frame = draw(
        hpd.data_frames(hpd.columns(column_names, elements=elements),
                        index=hpd.range_indexes(min_size=5)))
    return frame, columns_to_keep
def dataframe(draw):
    """Draw a dataframe with 1-20 columns of mixed float/int/str dtypes and
    unique text-or-int column labels."""
    width = draw(integers(min_value=1, max_value=20))
    col_dtypes = draw(
        lists(sampled_from([float, int, str]),
              min_size=width, max_size=width))
    col_labels = draw(
        lists(text() | integers(),
              min_size=width, max_size=width, unique=True))
    specs = [
        column(name=label, dtype=kind)
        for kind, label in zip(col_dtypes, col_labels)
    ]
    return draw(
        data_frames(columns=specs, index=range_indexes(min_size=1)))
def multiindex_strategy( pandas_dtype: Optional[PandasDtype] = None, strategy: Optional[SearchStrategy] = None, *, indexes: Optional[List] = None, size: Optional[int] = None, ): """Strategy to generate a pandas MultiIndex object. :param pandas_dtype: :class:`pandera.dtypes.PandasDtype` instance. :param strategy: an optional hypothesis strategy. If specified, the pandas dtype strategy will be chained onto this strategy. :param indexes: a list of :class:`~pandera.schema_components.Inded` objects. :param size: number of elements in the Series. :returns: ``hypothesis`` strategy. """ # pylint: disable=unnecessary-lambda if strategy: raise BaseStrategyOnlyError( "The dataframe strategy is a base strategy. You cannot specify " "the strategy argument to chain it to a parent strategy." ) indexes = [] if indexes is None else indexes index_dtypes = { index.name if index.name is not None else i: index.dtype for i, index in enumerate(indexes) } nullable_index = { index.name if index.name is not None else i: index.nullable for i, index in enumerate(indexes) } strategy = pdst.data_frames( [index.strategy_component() for index in indexes], index=pdst.range_indexes( min_size=0 if size is None else size, max_size=size ), ).map(lambda x: x.astype(index_dtypes)) if any(nullable_index.values()): strategy = null_dataframe_masks(strategy, nullable_index) return strategy.map(pd.MultiIndex.from_frame)
def df_strategy():
    """
    A convenience function for generating a dataframe as a hypothesis
    strategy.

    Should be treated like a fixture, but should not be passed as a fixture
    into a test function. Instead::

        @given(df=dataframe())
        def test_function(df):
            # test goes here
    """
    col_specs = [
        column("a", elements=st.integers()),
        column("Bell__Chart", elements=st.floats()),
        column("decorated-elephant", elements=st.integers()),
        column("animals@#$%^", elements=st.text()),
        column("cities", elements=st.text()),
    ]
    return data_frames(
        columns=col_specs,
        index=range_indexes(min_size=1, max_size=20),
    )
def test_arbitrary_range_index(i, j, data):
    """range_indexes must accept any ordered (min, max) pair, or max=None."""
    if j is None:
        data.draw(pdst.range_indexes(i, j))
    else:
        lo, hi = sorted((i, j))
        data.draw(pdst.range_indexes(lo, hi))
class TestGPUPredict:
    """Tests comparing xgboost's GPU predictor against the CPU predictor.

    Requires a CUDA-capable environment; several tests are skipped when
    cupy / cudf / sklearn are unavailable.
    """

    def test_predict(self):
        # Train with gpu_hist, then check GPU and CPU predictions agree on
        # train/val/test margins across a grid of matrix sizes.
        iterations = 10
        np.random.seed(1)
        test_num_rows = [10, 1000, 5000]
        test_num_cols = [10, 50, 500]
        # This test passes for tree_method=gpu_hist and tree_method=exact. but
        # for `hist` and `approx` the floating point error accumulates faster
        # and fails even tol is set to 1e-4. For `hist`, the mismatching rate
        # with 5000 rows is 0.04.
        for num_rows in test_num_rows:
            for num_cols in test_num_cols:
                dtrain = xgb.DMatrix(np.random.randn(num_rows, num_cols),
                                     label=[0, 1] * int(num_rows / 2))
                dval = xgb.DMatrix(np.random.randn(num_rows, num_cols),
                                   label=[0, 1] * int(num_rows / 2))
                dtest = xgb.DMatrix(np.random.randn(num_rows, num_cols),
                                    label=[0, 1] * int(num_rows / 2))
                watchlist = [(dtrain, 'train'), (dval, 'validation')]
                res = {}
                param = {
                    "objective": "binary:logistic",
                    "predictor": "gpu_predictor",
                    'eval_metric': 'logloss',
                    'tree_method': 'gpu_hist',
                    'max_depth': 1
                }
                bst = xgb.train(param, dtrain, iterations, evals=watchlist,
                                evals_result=res)
                assert self.non_increasing(res["train"]["logloss"])
                gpu_pred_train = bst.predict(dtrain, output_margin=True)
                gpu_pred_test = bst.predict(dtest, output_margin=True)
                gpu_pred_val = bst.predict(dval, output_margin=True)

                param["predictor"] = "cpu_predictor"
                bst_cpu = xgb.train(param, dtrain, iterations,
                                    evals=watchlist)
                cpu_pred_train = bst_cpu.predict(dtrain, output_margin=True)
                cpu_pred_test = bst_cpu.predict(dtest, output_margin=True)
                cpu_pred_val = bst_cpu.predict(dval, output_margin=True)

                np.testing.assert_allclose(cpu_pred_train, gpu_pred_train,
                                           rtol=1e-6)
                np.testing.assert_allclose(cpu_pred_val, gpu_pred_val,
                                           rtol=1e-6)
                np.testing.assert_allclose(cpu_pred_test, gpu_pred_test,
                                           rtol=1e-6)

    def non_increasing(self, L):
        # "non-increasing" up to a small tolerance: each step may rise by at
        # most 0.001
        return all((y - x) < 0.001 for x, y in zip(L, L[1:]))

    # Test case for a bug where multiple batch predictions made on a
    # test set produce incorrect results
    @pytest.mark.skipif(**tm.no_sklearn())
    def test_multi_predict(self):
        from sklearn.datasets import make_regression
        from sklearn.model_selection import train_test_split

        n = 1000
        X, y = make_regression(n, random_state=rng)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, random_state=123)
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dtest = xgb.DMatrix(X_test)
        params = {}
        params["tree_method"] = "gpu_hist"

        params['predictor'] = "gpu_predictor"
        bst_gpu_predict = xgb.train(params, dtrain)

        params['predictor'] = "cpu_predictor"
        bst_cpu_predict = xgb.train(params, dtrain)

        # two consecutive GPU predictions on the same DMatrix must agree
        predict0 = bst_gpu_predict.predict(dtest)
        predict1 = bst_gpu_predict.predict(dtest)
        cpu_predict = bst_cpu_predict.predict(dtest)

        assert np.allclose(predict0, predict1)
        assert np.allclose(predict0, cpu_predict)

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_sklearn(self):
        # NOTE(review): `m` is reused below as the fitted regressor,
        # shadowing the row count — intentional but easy to misread.
        m, n = 15000, 14
        tr_size = 2500
        X = np.random.rand(m, n)
        y = 200 * np.matmul(X, np.arange(-3, -3 + n))
        X_train, y_train = X[:tr_size, :], y[:tr_size]
        X_test, y_test = X[tr_size:, :], y[tr_size:]

        # First with cpu_predictor
        params = {
            'tree_method': 'gpu_hist',
            'predictor': 'cpu_predictor',
            'n_jobs': -1,
            'seed': 123
        }
        m = xgb.XGBRegressor(**params).fit(X_train, y_train)
        cpu_train_score = m.score(X_train, y_train)
        cpu_test_score = m.score(X_test, y_test)

        # Now with gpu_predictor
        params['predictor'] = 'gpu_predictor'
        m = xgb.XGBRegressor(**params).fit(X_train, y_train)
        gpu_train_score = m.score(X_train, y_train)
        gpu_test_score = m.score(X_test, y_test)

        assert np.allclose(cpu_train_score, gpu_train_score)
        assert np.allclose(cpu_test_score, gpu_test_score)

    def run_inplace_base_margin(self, booster, dtrain, X, base_margin):
        # in-place prediction with base_margin must match DMatrix prediction
        import cupy as cp
        dtrain.set_info(base_margin=base_margin)
        from_inplace = booster.inplace_predict(data=X,
                                               base_margin=base_margin)
        from_dmatrix = booster.predict(dtrain)
        cp.testing.assert_allclose(from_inplace, from_dmatrix)

    @pytest.mark.skipif(**tm.no_cupy())
    def test_inplace_predict_cupy(self):
        import cupy as cp
        cp.cuda.runtime.setDevice(0)
        rows = 1000
        cols = 10
        missing = 11  # set to integer for testing
        cp_rng = cp.random.RandomState(1994)
        cp.random.set_random_state(cp_rng)
        X = cp.random.randn(rows, cols)
        missing_idx = [i for i in range(0, cols, 4)]
        X[:, missing_idx] = missing  # set to be missing
        y = cp.random.randn(rows)
        dtrain = xgb.DMatrix(X, y)

        booster = xgb.train({'tree_method': 'gpu_hist'}, dtrain,
                            num_boost_round=10)
        test = xgb.DMatrix(X[:10, ...], missing=missing)
        predt_from_array = booster.inplace_predict(X[:10, ...],
                                                   missing=missing)
        predt_from_dmatrix = booster.predict(test)

        cp.testing.assert_allclose(predt_from_array, predt_from_dmatrix)

        def predict_dense(x):
            inplace_predt = booster.inplace_predict(x)
            d = xgb.DMatrix(x)
            copied_predt = cp.array(booster.predict(d))
            return cp.all(copied_predt == inplace_predt)

        # Don't do this on Windows, see issue #5793
        if sys.platform.startswith("win"):
            pytest.skip(
                'Multi-threaded in-place prediction with cuPy is not working on Windows'
            )
        for i in range(10):
            run_threaded_predict(X, rows, predict_dense)

        base_margin = cp_rng.randn(rows)
        self.run_inplace_base_margin(booster, dtrain, X, base_margin)

        # Create a wide dataset
        X = cp_rng.randn(100, 10000)
        y = cp_rng.randn(100)

        missing_idx = [i for i in range(0, X.shape[1], 16)]
        X[:, missing_idx] = missing
        reg = xgb.XGBRegressor(tree_method="gpu_hist", n_estimators=8,
                               missing=missing)
        reg.fit(X, y)

        gpu_predt = reg.predict(X)
        reg.set_params(predictor="cpu_predictor")
        cpu_predt = reg.predict(X)
        np.testing.assert_allclose(gpu_predt, cpu_predt, atol=1e-6)

    @pytest.mark.skipif(**tm.no_cupy())
    @pytest.mark.skipif(**tm.no_cudf())
    def test_inplace_predict_cudf(self):
        import cupy as cp
        import cudf
        import pandas as pd
        rows = 1000
        cols = 10
        rng = np.random.RandomState(1994)
        cp.cuda.runtime.setDevice(0)
        X = rng.randn(rows, cols)
        X = pd.DataFrame(X)
        y = rng.randn(rows)
        X = cudf.from_pandas(X)
        dtrain = xgb.DMatrix(X, y)

        booster = xgb.train({'tree_method': 'gpu_hist'}, dtrain,
                            num_boost_round=10)
        test = xgb.DMatrix(X)
        predt_from_array = booster.inplace_predict(X)
        predt_from_dmatrix = booster.predict(test)

        cp.testing.assert_allclose(predt_from_array, predt_from_dmatrix)

        def predict_df(x):
            # column major array
            inplace_predt = booster.inplace_predict(x.values)
            d = xgb.DMatrix(x)
            copied_predt = cp.array(booster.predict(d))
            assert cp.all(copied_predt == inplace_predt)

            inplace_predt = booster.inplace_predict(x)
            return cp.all(copied_predt == inplace_predt)

        for i in range(10):
            run_threaded_predict(X, rows, predict_df)

        base_margin = cudf.Series(rng.randn(rows))
        self.run_inplace_base_margin(booster, dtrain, X, base_margin)

    @given(strategies.integers(1, 10), tm.dataset_strategy,
           shap_parameter_strategy)
    @settings(deadline=None, print_blob=True)
    def test_shap(self, num_rounds, dataset, param):
        # SHAP contributions must sum to the margin prediction
        if dataset.name.endswith("-l1"):  # not supported by the exact tree method
            return
        param.update({"predictor": "gpu_predictor", "gpu_id": 0})
        param = dataset.set_params(param)
        dmat = dataset.get_dmat()
        bst = xgb.train(param, dmat, num_rounds)
        test_dmat = xgb.DMatrix(dataset.X, dataset.y, dataset.w,
                                dataset.margin)
        shap = bst.predict(test_dmat, pred_contribs=True)
        margin = bst.predict(test_dmat, output_margin=True)
        assume(len(dataset.y) > 0)
        assert np.allclose(np.sum(shap, axis=len(shap.shape) - 1),
                           margin, 1e-3, 1e-3)

    @given(strategies.integers(1, 10), tm.dataset_strategy,
           shap_parameter_strategy)
    @settings(deadline=None, max_examples=20, print_blob=True)
    def test_shap_interactions(self, num_rounds, dataset, param):
        # SHAP interaction values must sum (over the last two axes) to the
        # margin prediction
        if dataset.name.endswith("-l1"):  # not supported by the exact tree method
            return
        param.update({"predictor": "gpu_predictor", "gpu_id": 0})
        param = dataset.set_params(param)
        dmat = dataset.get_dmat()
        bst = xgb.train(param, dmat, num_rounds)
        test_dmat = xgb.DMatrix(dataset.X, dataset.y, dataset.w,
                                dataset.margin)
        shap = bst.predict(test_dmat, pred_interactions=True)
        margin = bst.predict(test_dmat, output_margin=True)
        assume(len(dataset.y) > 0)
        assert np.allclose(
            np.sum(shap, axis=(len(shap.shape) - 1, len(shap.shape) - 2)),
            margin, 1e-3, 1e-3)

    def test_shap_categorical(self):
        # SHAP sum-to-margin must hold for categorical data on both predictors
        X, y = tm.make_categorical(100, 20, 7, False)
        Xy = xgb.DMatrix(X, y, enable_categorical=True)
        booster = xgb.train({"tree_method": "gpu_hist"}, Xy,
                            num_boost_round=10)

        booster.set_param({"predictor": "gpu_predictor"})
        shap = booster.predict(Xy, pred_contribs=True)
        margin = booster.predict(Xy, output_margin=True)
        np.testing.assert_allclose(
            np.sum(shap, axis=len(shap.shape) - 1), margin, rtol=1e-3)

        booster.set_param({"predictor": "cpu_predictor"})
        shap = booster.predict(Xy, pred_contribs=True)
        margin = booster.predict(Xy, output_margin=True)
        np.testing.assert_allclose(
            np.sum(shap, axis=len(shap.shape) - 1), margin, rtol=1e-3)

    def test_predict_leaf_basic(self):
        gpu_leaf = run_predict_leaf('gpu_predictor')
        cpu_leaf = run_predict_leaf('cpu_predictor')
        np.testing.assert_equal(gpu_leaf, cpu_leaf)

    def run_predict_leaf_booster(self, param, num_rounds, dataset):
        # leaf indices must be identical between CPU and GPU predictors
        param = dataset.set_params(param)
        m = dataset.get_dmat()
        booster = xgb.train(param, dtrain=dataset.get_dmat(),
                            num_boost_round=num_rounds)
        booster.set_param({'predictor': 'cpu_predictor'})
        cpu_leaf = booster.predict(m, pred_leaf=True)

        booster.set_param({'predictor': 'gpu_predictor'})
        gpu_leaf = booster.predict(m, pred_leaf=True)

        np.testing.assert_equal(cpu_leaf, gpu_leaf)

    @given(predict_parameter_strategy, tm.dataset_strategy)
    @settings(deadline=None, print_blob=True)
    def test_predict_leaf_gbtree(self, param, dataset):
        param['booster'] = 'gbtree'
        param['tree_method'] = 'gpu_hist'
        self.run_predict_leaf_booster(param, 10, dataset)

    @given(predict_parameter_strategy, tm.dataset_strategy)
    @settings(deadline=None, print_blob=True)
    def test_predict_leaf_dart(self, param, dataset):
        param['booster'] = 'dart'
        param['tree_method'] = 'gpu_hist'
        self.run_predict_leaf_booster(param, 10, dataset)

    @pytest.mark.skipif(**tm.no_sklearn())
    @pytest.mark.skipif(**tm.no_pandas())
    @given(df=data_frames([
        column('x0', elements=strategies.integers(min_value=0, max_value=3)),
        column('x1', elements=strategies.integers(min_value=0, max_value=5))
    ], index=range_indexes(min_size=20, max_size=50)))
    @settings(deadline=None, print_blob=True)
    def test_predict_categorical_split(self, df):
        # Training RMSE reported by xgboost must match sklearn's RMSE of the
        # final predictions.
        from sklearn.metrics import mean_squared_error

        df = df.astype('category')
        x0, x1 = df['x0'].to_numpy(), df['x1'].to_numpy()
        y = (x0 * 10 - 20) + (x1 - 2)
        dtrain = xgb.DMatrix(df, label=y, enable_categorical=True)
        params = {
            'tree_method': 'gpu_hist', 'predictor': 'gpu_predictor',
            'max_depth': 3, 'learning_rate': 1.0, 'base_score': 0.0,
            'eval_metric': 'rmse'
        }

        eval_history = {}
        bst = xgb.train(params, dtrain, num_boost_round=5,
                        evals=[(dtrain, 'train')], verbose_eval=False,
                        evals_result=eval_history)

        pred = bst.predict(dtrain)
        rmse = mean_squared_error(y_true=y, y_pred=pred, squared=False)
        np.testing.assert_almost_equal(
            rmse, eval_history['train']['rmse'][-1], decimal=5)

    @pytest.mark.skipif(**tm.no_cupy())
    @pytest.mark.parametrize("n_classes", [2, 3])
    def test_predict_dart(self, n_classes):
        # dart predictions must agree across inplace/DMatrix and CPU/GPU paths
        from sklearn.datasets import make_classification
        import cupy as cp
        n_samples = 1000
        X_, y_ = make_classification(n_samples=n_samples, n_informative=5,
                                     n_classes=n_classes)
        X, y = cp.array(X_), cp.array(y_)
        Xy = xgb.DMatrix(X, y)
        if n_classes == 2:
            params = {
                "tree_method": "gpu_hist", "booster": "dart",
                "rate_drop": 0.5, "objective": "binary:logistic"
            }
        else:
            params = {
                "tree_method": "gpu_hist", "booster": "dart",
                "rate_drop": 0.5, "objective": "multi:softprob",
                "num_class": n_classes
            }

        booster = xgb.train(params, Xy, num_boost_round=32)
        # predictor=auto
        inplace = booster.inplace_predict(X)
        copied = booster.predict(Xy)

        cpu_inplace = booster.inplace_predict(X_)
        booster.set_param({"predictor": "cpu_predictor"})
        cpu_copied = booster.predict(Xy)

        copied = cp.array(copied)
        cp.testing.assert_allclose(cpu_inplace, copied, atol=1e-6)
        cp.testing.assert_allclose(cpu_copied, copied, atol=1e-6)
        cp.testing.assert_allclose(inplace, copied, atol=1e-6)

        booster.set_param({"predictor": "gpu_predictor"})
        inplace = booster.inplace_predict(X)
        copied = booster.predict(Xy)

        copied = cp.array(copied)
        cp.testing.assert_allclose(inplace, copied, atol=1e-6)

    @pytest.mark.skipif(**tm.no_cupy())
    def test_dtypes(self):
        # inplace_predict must give identical results for every supported
        # cupy dtype, and raise ValueError for complex dtypes
        import cupy as cp
        rows = 1000
        cols = 10
        rng = cp.random.RandomState(1994)
        orig = rng.randint(low=0, high=127,
                           size=rows * cols).reshape(rows, cols)
        y = rng.randint(low=0, high=127, size=rows)
        dtrain = xgb.DMatrix(orig, label=y)
        booster = xgb.train({"tree_method": "gpu_hist"}, dtrain)

        predt_orig = booster.inplace_predict(orig)
        # all primitive types in numpy
        for dtype in [
            cp.signedinteger,
            cp.byte,
            cp.short,
            cp.intc,
            cp.int_,
            cp.longlong,
            cp.unsignedinteger,
            cp.ubyte,
            cp.ushort,
            cp.uintc,
            cp.uint,
            cp.ulonglong,
            cp.floating,
            cp.half,
            cp.single,
            cp.double,
        ]:
            X = cp.array(orig, dtype=dtype)
            predt = booster.inplace_predict(X)
            cp.testing.assert_allclose(predt, predt_orig)

        # boolean
        orig = cp.random.binomial(1, 0.5,
                                  size=rows * cols).reshape(rows, cols)
        predt_orig = booster.inplace_predict(orig)
        for dtype in [cp.bool8, cp.bool_]:
            X = cp.array(orig, dtype=dtype)
            predt = booster.inplace_predict(X)
            cp.testing.assert_allclose(predt, predt_orig)

        # unsupported types
        for dtype in [
            cp.complex64,
            cp.complex128,
        ]:
            X = cp.array(orig, dtype=dtype)
            with pytest.raises(ValueError):
                booster.inplace_predict(X)
scaler = MinMaxScaler() scaler.fit(array) assert (scaler.transform(array).min(axis=0) >= 0).all() assert (scaler.transform(array).max(axis=0) <= 1).all() np.testing.assert_allclose(scaler.fit(array).transform(array), scaler.fit_transform(array)) np.testing.assert_allclose(array, scaler.inv_transform(scaler.transform(array))) @given( series( unique=True, elements=st.floats( max_value=1e8, min_value=-1e8, allow_nan=False, allow_infinity=False ), index=range_indexes(min_size=2) ) ) def test_minmax_scaler_series(series): scaler = MinMaxScaler() scaler.fit(series) assert scaler.transform(series).min() >= 0 assert scaler.transform(series).max() <= 1 np.testing.assert_allclose(scaler.fit(series).transform(series), scaler.fit_transform(series)) np.testing.assert_allclose(series, scaler.inv_transform(scaler.transform(series)), rtol=1e-06) @given( data_frames(
assert len(ix) <= 2 assert len(set(ix)) == len(ix) # Sizes that fit into an int64 without overflow range_sizes = st.integers(0, 2**63 - 1) @given(range_sizes, range_sizes | st.none(), st.data()) def test_arbitrary_range_index(i, j, data): if j is not None: i, j = sorted((i, j)) data.draw(pdst.range_indexes(i, j)) @given(pdst.range_indexes()) def test_basic_range_indexes(ix): assert isinstance(ix, pandas.RangeIndex) @settings(suppress_health_check=[HealthCheck.too_slow]) @given(st.data()) def test_generate_arbitrary_indices(data): min_size = data.draw(st.integers(0, 10), 'min_size') max_size = data.draw(st.none() | st.integers(min_size, min_size + 10), 'max_size') unique = data.draw(st.booleans(), 'unique') dtype = data.draw(npst.scalar_dtypes(), 'dtype') assume(supported_by_pandas(dtype)) # Pandas bug: https://github.com/pandas-dev/pandas/pull/14916 until 0.20;
@fixture()
def empty_dataframe(columns=['int_value', 'float_value', 'bool_value', 'str_value'],
                    dtypes=['int32', 'float32', 'bool', 'object'],
                    index=None):
    """Fixture: an empty dataframe whose columns carry the given dtypes.

    The list defaults are never mutated, so sharing them across calls is
    safe here.
    """
    assert len(columns) == len(dtypes)
    df = pd.DataFrame(index=index)
    for c, d in zip(columns, dtypes):
        df[c] = pd.Series(dtype=d)
    return df


@fixture()
def fixed_dataframe():
    """Fixture: a small, fully deterministic dataframe."""
    return pd.DataFrame({'int': [0, 1],
                         'float': [10., 20.],
                         'string': ['aa', 'bb']})


# Hypothesis strategies for 1-5 row dataframes: `dataframe` has int/float/bool
# columns, `dataframe_diff` drops the bool column, `strings_dataframe` has a
# single ascii-text column.
dataframe = data_frames(index=range_indexes(min_size=1, max_size=5),
                        columns=[column('int_value', dtype=int),
                                 column('float_val', dtype=float),
                                 column('bool_value', dtype=bool)])

dataframe_diff = data_frames(index=range_indexes(min_size=1, max_size=5),
                             columns=[column('int_value', dtype=int),
                                      column('float_val', dtype=float)])

strings_dataframe = data_frames(index=range_indexes(min_size=1, max_size=5),
                                columns=[column('str_val',
                                                elements=text(alphabet=string.ascii_letters,
                                                              min_size=10,
                                                              max_size=32))])


def test_load_dst(KrMC_kdst):
    # round-trip: reading the dst file must reproduce the reference frame
    df_read = load_dst(*KrMC_kdst[0].file_info)
    assert_dataframes_close(df_read, KrMC_kdst[0].true, False, rtol=1e-5)
import hypothesis.extra.pandas as pdst
from hypothesis import given, assume

from tests.common.debug import find_any
from tests.pandas.helpers import supported_by_pandas


@given(st.data())
def test_can_create_a_series_of_any_dtype(data):
    # Draw an arbitrary numpy dtype, skipping ones pandas cannot represent.
    dtype = np.dtype(data.draw(npst.scalar_dtypes()))
    assume(supported_by_pandas(dtype))
    series = data.draw(pdst.series(dtype=dtype))
    # Compare against the dtype pandas itself infers for an empty Series.
    assert series.dtype == pandas.Series([], dtype=dtype).dtype


@given(pdst.series(
    dtype=float, index=pdst.range_indexes(min_size=2, max_size=5)))
def test_series_respects_size_bounds(s):
    assert 2 <= len(s) <= 5


def test_can_fill_series():
    # `fill` supplies NaN for positions not drawn from `elements`, so NaNs
    # can appear even though `elements` itself forbids them.
    nan_backed = pdst.series(
        elements=st.floats(allow_nan=False), fill=st.just(float('nan')))
    find_any(
        nan_backed, lambda x: np.isnan(x).any()
    )


@given(pdst.series(dtype=int))
def test_can_generate_integral_series(s):
    assert s.dtype == np.dtype(int)
def model(): return load_model( 'data/06_models/model.pb', compile=False, ) @pytest.fixture(scope='session') def tokenizer(): with open('data/06_models/tokenizer.pkl', 'rb') as tokenizer_handle: return pickle.load(tokenizer_handle) @given( data_frames( index=range_indexes(min_size=10, max_size=10), columns=[ column( col_pass, dtype=str, elements=strategies.text( min_size=3, max_size=max_length, alphabet=list('abcdef0123456789 '), ), ), ], ), ) @settings(deadline=None) def test_predict(model, tokenizer, test):
assert len(ix) <= 2 assert len(set(ix)) == len(ix) # Sizes that fit into an int64 without overflow range_sizes = st.integers(0, 2 ** 63 - 1) @given(range_sizes, range_sizes | st.none(), st.data()) def test_arbitrary_range_index(i, j, data): if j is not None: i, j = sorted((i, j)) data.draw(pdst.range_indexes(i, j)) @given(pdst.range_indexes()) def test_basic_range_indexes(ix): assert isinstance(ix, pandas.RangeIndex) @given(st.data()) def test_generate_arbitrary_indices(data): min_size = data.draw(st.integers(0, 10), 'min_size') max_size = data.draw( st.none() | st.integers(min_size, min_size + 10), 'max_size') unique = data.draw(st.booleans(), 'unique') dtype = data.draw(npst.scalar_dtypes(), 'dtype') assume(supported_by_pandas(dtype)) # Pandas bug: https://github.com/pandas-dev/pandas/pull/14916 until 0.20; # then int64 indexes are inferred from uint64 values.
from tests.common.debug import find_any
from tests.pandas.helpers import supported_by_pandas


@given(st.data())
def test_can_create_a_series_of_any_dtype(data):
    # Draw an arbitrary numpy dtype, skipping ones pandas cannot represent.
    dtype = np.dtype(data.draw(npst.scalar_dtypes()))
    assume(supported_by_pandas(dtype))
    # Use raw data to work around pandas bug in repr. See
    # https://github.com/pandas-dev/pandas/issues/27484
    series = data.conjecture_data.draw(pdst.series(dtype=dtype))
    # Compare against the dtype pandas itself infers for an empty Series.
    assert series.dtype == pandas.Series([], dtype=dtype).dtype


@given(
    pdst.series(dtype=float,
                index=pdst.range_indexes(min_size=2, max_size=5)))
def test_series_respects_size_bounds(s):
    assert 2 <= len(s) <= 5


def test_can_fill_series():
    # `fill` supplies NaN for positions not drawn from `elements`, so NaNs
    # can appear even though `elements` itself forbids them.
    nan_backed = pdst.series(elements=st.floats(allow_nan=False),
                             fill=st.just(np.nan))
    find_any(nan_backed, lambda x: np.isnan(x).any())


@given(pdst.series(dtype=int))
def test_can_generate_integral_series(s):
    assert s.dtype == np.dtype(int)
# - (some) booleans MAX_VAL = 2**31 - 1 # Strategies strat_text = st.text(alphabet=st.characters(min_codepoint=32, max_codepoint=127), min_size=0) strat_ints = st.integers(min_value=-MAX_VAL, max_value=MAX_VAL) strat_floats = st.floats(min_value=-MAX_VAL, max_value=MAX_VAL, allow_nan=False, allow_infinity=False) strat_dates = st.dates() strat_df_index = hpd.range_indexes(min_size=1) df_hypo_mixed = hpd.data_frames( columns=[ hpd.column(name="col1_text", elements=strat_text), hpd.column(name="col2_ints", elements=strat_ints), hpd.column(name="col3_floats", elements=strat_floats), hpd.column(name="col4_dates", elements=strat_dates), hpd.column(name="col4_bools", elements=st.booleans()), ], index=strat_df_index, ) df_hypo_text = hpd.data_frames(columns=hpd.columns(5, elements=strat_text), index=strat_df_index) df_hypo_ints = hpd.data_frames(columns=hpd.columns(5, elements=strat_ints),
from tests.common.debug import minimal, find_any
from tests.pandas.helpers import supported_by_pandas


@given(pdst.data_frames([
    pdst.column('a', dtype=int),
    pdst.column('b', dtype=float),
]))
def test_can_have_columns_of_distinct_types(df):
    assert df['a'].dtype == np.dtype(int)
    assert df['b'].dtype == np.dtype(float)


@given(pdst.data_frames(
    [pdst.column(dtype=int)],
    index=pdst.range_indexes(min_size=1, max_size=5)))
def test_respects_size_bounds(df):
    assert 1 <= len(df) <= 5


@given(pdst.data_frames(pdst.columns(['A', 'B'], dtype=float)))
def test_can_specify_just_column_names(df):
    # accessing the columns is the assertion: a KeyError means failure
    df['A']
    df['B']


@given(pdst.data_frames(pdst.columns(2, dtype=float)))
def test_can_specify_just_column_count(df):
    # integer column labels are generated when only a count is given
    df[0]
    df[1]
import hypothesis.extra.pandas as pdst
import hypothesis.strategies as st
from hypothesis import assume, given

from tests.common.debug import find_any
from tests.pandas.helpers import supported_by_pandas


@given(st.data())
def test_can_create_a_series_of_any_dtype(data):
    # Draw an arbitrary numpy dtype, skipping ones pandas cannot represent.
    dtype = np.dtype(data.draw(npst.scalar_dtypes()))
    assume(supported_by_pandas(dtype))
    series = data.draw(pdst.series(dtype=dtype))
    # Compare against the dtype pandas itself infers for an empty Series.
    assert series.dtype == pandas.Series([], dtype=dtype).dtype


@given(pdst.series(dtype=float,
                   index=pdst.range_indexes(min_size=2, max_size=5)))
def test_series_respects_size_bounds(s):
    assert 2 <= len(s) <= 5


def test_can_fill_series():
    # `fill` supplies NaN for positions not drawn from `elements`, so NaNs
    # can appear even though `elements` itself forbids them.
    nan_backed = pdst.series(
        elements=st.floats(allow_nan=False), fill=st.just(float("nan"))
    )
    find_any(nan_backed, lambda x: np.isnan(x).any())


@given(pdst.series(dtype=int))
def test_can_generate_integral_series(s):
    assert s.dtype == np.dtype(int)
import hypothesis.extra.pandas as pdst
import hypothesis.strategies as st
from hypothesis import HealthCheck, given, reject, settings

from tests.common.debug import find_any
from tests.pandas.helpers import supported_by_pandas


@given(pdst.data_frames([pdst.column("a", dtype=int), pdst.column("b", dtype=float)]))
def test_can_have_columns_of_distinct_types(df):
    assert df["a"].dtype == np.dtype(int)
    assert df["b"].dtype == np.dtype(float)


@given(
    pdst.data_frames(
        [pdst.column(dtype=int)], index=pdst.range_indexes(min_size=1, max_size=5)
    )
)
def test_respects_size_bounds(df):
    assert 1 <= len(df) <= 5


@given(pdst.data_frames(pdst.columns(["A", "B"], dtype=float)))
def test_can_specify_just_column_names(df):
    # accessing the columns is the assertion: a KeyError means failure
    df["A"]
    df["B"]


@given(pdst.data_frames(pdst.columns(2, dtype=float)))
def test_can_specify_just_column_count(df):
    # integer column labels are generated when only a count is given
    df[0]
import hypothesis.extra.pandas as pdst
import hypothesis.strategies as st
from hypothesis import HealthCheck, given, reject, settings
from tests.common.debug import find_any
from tests.pandas.helpers import supported_by_pandas


@given(pdst.data_frames([pdst.column("a", dtype=int),
                         pdst.column("b", dtype=float)]))
def test_can_have_columns_of_distinct_types(df):
    """Each generated column keeps the dtype declared for it."""
    assert df["a"].dtype == np.dtype(int)
    assert df["b"].dtype == np.dtype(float)


@given(
    pdst.data_frames(
        [pdst.column(dtype=int)],
        index=pdst.range_indexes(min_size=1, max_size=5)
    )
)
def test_respects_size_bounds(df):
    """range_indexes(min_size, max_size) bounds the generated row count."""
    assert 1 <= len(df) <= 5


@given(pdst.data_frames(pdst.columns(["A", "B"], dtype=float)))
def test_can_specify_just_column_names(df):
    """Columns declared by name are present; lookup must not raise."""
    df["A"]
    df["B"]


@given(pdst.data_frames(pdst.columns(2, dtype=float)))
def test_can_specify_just_column_count(df):
    """Columns declared by count get integer labels starting at 0."""
    # NOTE(review): this chunk appears truncated here — the parallel
    # snippet elsewhere in this file also checks df[1]. Confirm upstream.
    df[0]
def test_arbitrary_range_index(i, j, data):
    """range_indexes accepts any two sizes once they are put in order.

    When ``j`` is None the upper bound is left open; otherwise the pair is
    sorted so min_size <= max_size before building the strategy.
    """
    bounds = tuple(sorted((i, j))) if j is not None else (i, j)
    data.draw(pdst.range_indexes(*bounds))
from .. reco.deconv_functions import richardson_lucy
from .. reco.deconv_functions import InterpolationMethod
from .. core.core_functions import in_range
from .. core.core_functions import shift_to_bin_centers
from .. core.testing_utils import assert_dataframes_close
from .. io.dst_io import load_dst

from scipy.stats import multivariate_normal


@given(data_frames(columns=[column('A', dtype=float, elements=floats(1, 1e3)),
                            column('B', dtype=float, elements=floats(1, 1e3)),
                            column('C', dtype=float, elements=floats(1, 1e3))],
                   index=range_indexes(min_size=2, max_size=10)))
def test_cut_and_redistribute_df(df):
    """Cutting rows on column A must redistribute B/C so their totals are kept.

    Compares the function under test against a manual reference computed
    with plain pandas operations.
    """
    cut_var = 'A'
    redist_var = ['B', 'C']
    # Cut at the (rounded) column mean so the condition keeps roughly half
    # the rows for any drawn frame.
    cut_val = round(df[cut_var].mean(), 3)
    cut_condition = f'{cut_var} > {cut_val:.3f}'
    cut_function = cut_and_redistribute_df(cut_condition, redist_var)
    df_cut = cut_function(df)
    # Manual reference: apply the same cut, then rescale the redistributed
    # columns so their sums match the pre-cut sums.
    df_cut_manual = df.loc[df[cut_var].values > cut_val, :].copy()
    df_cut_manual.loc[:, redist_var] = df_cut_manual.loc[:, redist_var] * df.loc[:, redist_var].sum() / df_cut_manual.loc[:, redist_var].sum()
    assert_dataframes_close(df_cut, df_cut_manual)


def test_drop_isolated_sensors():
    # NOTE(review): this definition is truncated in the visible chunk;
    # only the setup constants are present here.
    size = 20
    dist = [10.1, 10.1]
from hypothesis.strategies import text


@fixture()
def empty_dataframe(
        columns=['int_value', 'float_value', 'bool_value', 'str_value'],
        dtypes=['int32', 'float32', 'bool', 'object'],
        index=None):
    """Build an empty DataFrame whose columns carry the requested dtypes.

    NOTE(review): the mutable list defaults are shared across calls;
    harmless here because the function never mutates them.
    """
    assert len(columns) == len(dtypes)
    df = pd.DataFrame(index=index)
    for c, d in zip(columns, dtypes):
        # One empty, dtype-carrying Series per requested column.
        df[c] = pd.Series(dtype=d)
    return df


# Strategy: small (1-5 row) frames with int/float/bool columns.
dataframe = data_frames(index=range_indexes(min_size=1, max_size=5),
                        columns=[
                            column('int_value', dtype=int),
                            column('float_val', dtype=float),
                            column('bool_value', dtype=bool)
                        ])

# Same shape minus the bool column.
dataframe_diff = data_frames(
    index=range_indexes(min_size=1, max_size=5),
    columns=[column('int_value', dtype=int),
             column('float_val', dtype=float)])

# NOTE(review): the expression below is cut off mid-call in the visible
# chunk; the remainder of the text(...) arguments is outside this view.
strings_dataframe = data_frames(index=range_indexes(min_size=1, max_size=5),
                                columns=[
                                    column('str_val', elements=text(
e(pdst.data_frames, pdst.columns(1)), e(pdst.data_frames, pdst.columns(1, dtype=float, fill=1)), e(pdst.data_frames, pdst.columns(1, dtype=float, elements=1)), e(pdst.data_frames, pdst.columns(1, fill=1, dtype=float)), e(pdst.data_frames, pdst.columns(['A', 'A'], dtype=float)), e(pdst.data_frames, pdst.columns(1, elements=st.none(), dtype=int)), e(pdst.data_frames, 1), e(pdst.data_frames, [1]), e(pdst.data_frames, pdst.columns(1, dtype='category')), e(pdst.data_frames, pdst.columns(['A'], dtype=bool), rows=st.tuples(st.booleans(), st.booleans())), e(pdst.data_frames, pdst.columns(1, elements=st.booleans()), rows=st.tuples(st.booleans())), e(pdst.data_frames, rows=st.integers(), index=pdst.range_indexes(0, 0)), e(pdst.data_frames, rows=st.integers(), index=pdst.range_indexes(1, 1)), e(pdst.data_frames, pdst.columns(1, dtype=int), rows=st.integers()), e(pdst.indexes), e(pdst.indexes, dtype='category'), e(pdst.indexes, dtype='not a dtype'), e(pdst.indexes, elements='not a strategy'), e(pdst.indexes, elements=st.text(), dtype=float), e(pdst.indexes, elements=st.none(), dtype=int), e(pdst.indexes, dtype=int, max_size=0, min_size=1), e(pdst.indexes, dtype=int, unique='true'), e(pdst.indexes, dtype=int, min_size='0'), e(pdst.indexes, dtype=int, max_size='1'), e(pdst.range_indexes, 1, 0), e(pdst.range_indexes, min_size='0'), e(pdst.range_indexes, max_size='1'),
    not_has_all_quotechars,
    strat_dates,
    strat_floats,
    strat_ints,
    strat_text,
)

from benchmarks.read_sql.read_sql import read_sql

# Shared hypothesis strategy: frames with one text, int, float and date
# column each, and at least one row.
hypo_df = hpd.data_frames(
    columns=[
        hpd.column(name="col1_text", elements=strat_text),
        hpd.column(name="col2_ints", elements=strat_ints),
        hpd.column(name="col3_floats", elements=strat_floats),
        hpd.column(name="col4_dates", elements=strat_dates),
    ],
    index=hpd.range_indexes(min_size=1),
)


class TestReadSqlBasic:
    """
    For all tests, the 'actual' is retrieved using the built-in pandas methods,
    to compare to the 'expected' which used bcpandas.

    Because dtypes change when reading from text files, ignoring dtypes checks.
    TODO how to really fix this
    """

    # Table/view the round-trip tests read from and write to.
    table_name = "lotr_readsql"
    view_name = f"v_{table_name}"

    # NOTE(review): the class is truncated in the visible chunk; the method
    # under this decorator is outside this view.
    @given(df=hypo_df)
from hypothesis import given
from hypothesis.extra.pandas import columns, data_frames, range_indexes
import hypothesis.strategies as st
import pandas as pd

from analyse_weather import get_data, hottest_summer


@given(
    data_frames(
        columns=columns(['JUN', 'JUL', 'AUG'],
                        elements=st.floats(allow_nan=True)),
        index=range_indexes(min_size=1),
    )
)
def test_hottest_summer_auto(df):
    """hottest_summer returns a non-null value even when the generated
    monthly temperatures contain NaNs."""
    result = hottest_summer(df)
    assert not pd.isnull(result)


# Below is another example, using fixtures, for the same function:
import pytest
from pandas import DataFrame


@pytest.fixture
def full_dataset():
    """Real dataset obtained via get_data, for non-property-based tests."""
    return get_data()
e(pdst.data_frames, pdst.columns(1)), e(pdst.data_frames, pdst.columns(1, dtype=float, fill=1)), e(pdst.data_frames, pdst.columns(1, dtype=float, elements=1)), e(pdst.data_frames, pdst.columns(1, fill=1, dtype=float)), e(pdst.data_frames, pdst.columns(['A', 'A'], dtype=float)), e(pdst.data_frames, pdst.columns(1, elements=st.none(), dtype=int)), e(pdst.data_frames, 1), e(pdst.data_frames, [1]), e(pdst.data_frames, pdst.columns(1, dtype='category')), e(pdst.data_frames, pdst.columns(['A'], dtype=bool), rows=st.tuples(st.booleans(), st.booleans())), e(pdst.data_frames, pdst.columns(1, elements=st.booleans()), rows=st.tuples(st.booleans())), e(pdst.data_frames, rows=st.integers(), index=pdst.range_indexes(0, 0)), e(pdst.data_frames, rows=st.integers(), index=pdst.range_indexes(1, 1)), e(pdst.data_frames, pdst.columns(1, dtype=int), rows=st.integers()), e(pdst.indexes), e(pdst.indexes, dtype='category'), e(pdst.indexes, dtype='not a dtype'), e(pdst.indexes, elements='not a strategy'), e(pdst.indexes, elements=st.text(), dtype=float), e(pdst.indexes, elements=st.none(), dtype=int), e(pdst.indexes, dtype=int, max_size=0, min_size=1), e(pdst.indexes, dtype=int, unique='true'), e(pdst.range_indexes, 1, 0), e(pdst.series), e(pdst.series, dtype='not a dtype'), e(pdst.series, elements='not a strategy'), e(pdst.series, elements=st.none(), dtype=int),
class TestGPUPredict:
    """Checks that xgboost's gpu_predictor agrees with cpu_predictor across
    DMatrix prediction, in-place prediction (cupy/cudf), SHAP values, leaf
    prediction, and categorical splits."""

    def test_predict(self):
        """CPU and GPU margin predictions must agree on random binary data."""
        iterations = 10
        np.random.seed(1)
        test_num_rows = [10, 1000, 5000]
        test_num_cols = [10, 50, 500]
        # This test passes for tree_method=gpu_hist and tree_method=exact. but
        # for `hist` and `approx` the floating point error accumulates faster
        # and fails even tol is set to 1e-4. For `hist`, the mismatching rate
        # with 5000 rows is 0.04.
        for num_rows in test_num_rows:
            for num_cols in test_num_cols:
                # Alternating 0/1 labels; assumes even num_rows.
                dtrain = xgb.DMatrix(np.random.randn(num_rows, num_cols),
                                     label=[0, 1] * int(num_rows / 2))
                dval = xgb.DMatrix(np.random.randn(num_rows, num_cols),
                                   label=[0, 1] * int(num_rows / 2))
                dtest = xgb.DMatrix(np.random.randn(num_rows, num_cols),
                                    label=[0, 1] * int(num_rows / 2))
                watchlist = [(dtrain, 'train'), (dval, 'validation')]
                res = {}
                param = {
                    "objective": "binary:logistic",
                    "predictor": "gpu_predictor",
                    'eval_metric': 'logloss',
                    'tree_method': 'gpu_hist',
                    'max_depth': 1
                }
                bst = xgb.train(param, dtrain, iterations, evals=watchlist,
                                evals_result=res)
                # Training loss must (approximately) decrease monotonically.
                assert self.non_increasing(res["train"]["logloss"])
                gpu_pred_train = bst.predict(dtrain, output_margin=True)
                gpu_pred_test = bst.predict(dtest, output_margin=True)
                gpu_pred_val = bst.predict(dval, output_margin=True)
                # Retrain with the CPU predictor and compare margins.
                param["predictor"] = "cpu_predictor"
                bst_cpu = xgb.train(param, dtrain, iterations, evals=watchlist)
                cpu_pred_train = bst_cpu.predict(dtrain, output_margin=True)
                cpu_pred_test = bst_cpu.predict(dtest, output_margin=True)
                cpu_pred_val = bst_cpu.predict(dval, output_margin=True)
                np.testing.assert_allclose(cpu_pred_train, gpu_pred_train,
                                           rtol=1e-6)
                np.testing.assert_allclose(cpu_pred_val, gpu_pred_val,
                                           rtol=1e-6)
                np.testing.assert_allclose(cpu_pred_test, gpu_pred_test,
                                           rtol=1e-6)

    def non_increasing(self, L):
        # True when each step decreases or grows by less than the 1e-3 slack.
        return all((y - x) < 0.001 for x, y in zip(L, L[1:]))

    # Test case for a bug where multiple batch predictions made on a
    # test set produce incorrect results
    @pytest.mark.skipif(**tm.no_sklearn())
    def test_multi_predict(self):
        """Repeated GPU predictions on the same DMatrix must be identical and
        match the CPU prediction."""
        from sklearn.datasets import make_regression
        from sklearn.model_selection import train_test_split

        n = 1000
        # NOTE(review): `rng` is defined outside the visible chunk —
        # presumably a module-level RandomState; confirm upstream.
        X, y = make_regression(n, random_state=rng)
        X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                            random_state=123)
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dtest = xgb.DMatrix(X_test)
        params = {}
        params["tree_method"] = "gpu_hist"
        params['predictor'] = "gpu_predictor"
        bst_gpu_predict = xgb.train(params, dtrain)
        params['predictor'] = "cpu_predictor"
        bst_cpu_predict = xgb.train(params, dtrain)
        predict0 = bst_gpu_predict.predict(dtest)
        predict1 = bst_gpu_predict.predict(dtest)
        cpu_predict = bst_cpu_predict.predict(dtest)
        assert np.allclose(predict0, predict1)
        assert np.allclose(predict0, cpu_predict)

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_sklearn(self):
        """The sklearn wrapper must score identically with either predictor."""
        m, n = 15000, 14
        tr_size = 2500
        X = np.random.rand(m, n)
        # Linear target so the model can fit it closely.
        y = 200 * np.matmul(X, np.arange(-3, -3 + n))
        X_train, y_train = X[:tr_size, :], y[:tr_size]
        X_test, y_test = X[tr_size:, :], y[tr_size:]

        # First with cpu_predictor
        params = {'tree_method': 'gpu_hist',
                  'predictor': 'cpu_predictor',
                  'n_jobs': -1,
                  'seed': 123}
        m = xgb.XGBRegressor(**params).fit(X_train, y_train)
        cpu_train_score = m.score(X_train, y_train)
        cpu_test_score = m.score(X_test, y_test)

        # Now with gpu_predictor
        params['predictor'] = 'gpu_predictor'
        m = xgb.XGBRegressor(**params).fit(X_train, y_train)
        gpu_train_score = m.score(X_train, y_train)
        gpu_test_score = m.score(X_test, y_test)
        assert np.allclose(cpu_train_score, gpu_train_score)
        assert np.allclose(cpu_test_score, gpu_test_score)

    @pytest.mark.skipif(**tm.no_cupy())
    def test_inplace_predict_cupy(self):
        """inplace_predict on a cupy array must match DMatrix prediction,
        including under multi-threaded access."""
        import cupy as cp
        cp.cuda.runtime.setDevice(0)
        rows = 1000
        cols = 10
        cp_rng = cp.random.RandomState(1994)
        cp.random.set_random_state(cp_rng)
        X = cp.random.randn(rows, cols)
        y = cp.random.randn(rows)
        dtrain = xgb.DMatrix(X, y)
        booster = xgb.train({'tree_method': 'gpu_hist'}, dtrain,
                            num_boost_round=10)
        test = xgb.DMatrix(X[:10, ...])
        predt_from_array = booster.inplace_predict(X[:10, ...])
        predt_from_dmatrix = booster.predict(test)
        cp.testing.assert_allclose(predt_from_array, predt_from_dmatrix)

        def predict_dense(x):
            # Compare in-place prediction against the DMatrix path.
            inplace_predt = booster.inplace_predict(x)
            d = xgb.DMatrix(x)
            copied_predt = cp.array(booster.predict(d))
            return cp.all(copied_predt == inplace_predt)

        # Don't do this on Windows, see issue #5793
        if sys.platform.startswith("win"):
            pytest.skip(
                'Multi-threaded in-place prediction with cuPy is not working on Windows'
            )
        for i in range(10):
            run_threaded_predict(X, rows, predict_dense)

    @pytest.mark.skipif(**tm.no_cudf())
    def test_inplace_predict_cudf(self):
        """inplace_predict on a cudf DataFrame must match DMatrix prediction."""
        import cupy as cp
        import cudf
        import pandas as pd
        rows = 1000
        cols = 10
        rng = np.random.RandomState(1994)
        cp.cuda.runtime.setDevice(0)
        X = rng.randn(rows, cols)
        X = pd.DataFrame(X)
        y = rng.randn(rows)
        # Round-trip through pandas into a GPU dataframe.
        X = cudf.from_pandas(X)
        dtrain = xgb.DMatrix(X, y)
        booster = xgb.train({'tree_method': 'gpu_hist'}, dtrain,
                            num_boost_round=10)
        test = xgb.DMatrix(X)
        predt_from_array = booster.inplace_predict(X)
        predt_from_dmatrix = booster.predict(test)
        cp.testing.assert_allclose(predt_from_array, predt_from_dmatrix)

        def predict_df(x):
            # Compare in-place prediction against the DMatrix path.
            inplace_predt = booster.inplace_predict(x)
            d = xgb.DMatrix(x)
            copied_predt = cp.array(booster.predict(d))
            return cp.all(copied_predt == inplace_predt)

        for i in range(10):
            run_threaded_predict(X, rows, predict_df)

    @given(strategies.integers(1, 10), tm.dataset_strategy,
           shap_parameter_strategy)
    @settings(deadline=None)
    def test_shap(self, num_rounds, dataset, param):
        """SHAP contributions must sum (over features) to the margin output."""
        param.update({"predictor": "gpu_predictor", "gpu_id": 0})
        param = dataset.set_params(param)
        dmat = dataset.get_dmat()
        bst = xgb.train(param, dmat, num_rounds)
        test_dmat = xgb.DMatrix(dataset.X, dataset.y, dataset.w,
                                dataset.margin)
        shap = bst.predict(test_dmat, pred_contribs=True)
        margin = bst.predict(test_dmat, output_margin=True)
        assume(len(dataset.y) > 0)
        # Sum over the last (feature) axis reconstructs the margin.
        assert np.allclose(np.sum(shap, axis=len(shap.shape) - 1), margin,
                           1e-3, 1e-3)

    @given(strategies.integers(1, 10), tm.dataset_strategy,
           shap_parameter_strategy)
    @settings(deadline=None, max_examples=20)
    def test_shap_interactions(self, num_rounds, dataset, param):
        """SHAP interaction values must sum (over both feature axes) to the
        margin output."""
        param.update({"predictor": "gpu_predictor", "gpu_id": 0})
        param = dataset.set_params(param)
        dmat = dataset.get_dmat()
        bst = xgb.train(param, dmat, num_rounds)
        test_dmat = xgb.DMatrix(dataset.X, dataset.y, dataset.w,
                                dataset.margin)
        shap = bst.predict(test_dmat, pred_interactions=True)
        margin = bst.predict(test_dmat, output_margin=True)
        assume(len(dataset.y) > 0)
        assert np.allclose(
            np.sum(shap, axis=(len(shap.shape) - 1, len(shap.shape) - 2)),
            margin, 1e-3, 1e-3)

    def test_predict_leaf_basic(self):
        """Leaf indices must match between GPU and CPU predictors."""
        gpu_leaf = run_predict_leaf('gpu_predictor')
        cpu_leaf = run_predict_leaf('cpu_predictor')
        np.testing.assert_equal(gpu_leaf, cpu_leaf)

    def run_predict_leaf_booster(self, param, num_rounds, dataset):
        """Train once, then compare pred_leaf output across both predictors."""
        param = dataset.set_params(param)
        m = dataset.get_dmat()
        booster = xgb.train(param, dtrain=dataset.get_dmat(),
                            num_boost_round=num_rounds)
        booster.set_param({'predictor': 'cpu_predictor'})
        cpu_leaf = booster.predict(m, pred_leaf=True)
        booster.set_param({'predictor': 'gpu_predictor'})
        gpu_leaf = booster.predict(m, pred_leaf=True)
        np.testing.assert_equal(cpu_leaf, gpu_leaf)

    @given(predict_parameter_strategy, tm.dataset_strategy)
    @settings(deadline=None)
    def test_predict_leaf_gbtree(self, param, dataset):
        param['booster'] = 'gbtree'
        param['tree_method'] = 'gpu_hist'
        self.run_predict_leaf_booster(param, 10, dataset)

    @given(predict_parameter_strategy, tm.dataset_strategy)
    @settings(deadline=None)
    def test_predict_leaf_dart(self, param, dataset):
        param['booster'] = 'dart'
        param['tree_method'] = 'gpu_hist'
        self.run_predict_leaf_booster(param, 10, dataset)

    @pytest.mark.skipif(**tm.no_sklearn())
    @pytest.mark.skipif(**tm.no_pandas())
    @given(df=data_frames([
        column('x0', elements=strategies.integers(min_value=0, max_value=3)),
        column('x1', elements=strategies.integers(min_value=0, max_value=5))
    ], index=range_indexes(min_size=20, max_size=50)))
    @settings(deadline=None)
    def test_predict_categorical_split(self, df):
        """Training RMSE reported by evals_result must match the RMSE
        recomputed from gpu_predictor predictions on categorical data."""
        from sklearn.metrics import mean_squared_error

        df = df.astype('category')
        x0, x1 = df['x0'].to_numpy(), df['x1'].to_numpy()
        # Linear target in the (integer-coded) categories.
        y = (x0 * 10 - 20) + (x1 - 2)
        dtrain = xgb.DMatrix(df, label=y, enable_categorical=True)
        params = {
            'tree_method': 'gpu_hist',
            'predictor': 'gpu_predictor',
            'max_depth': 3,
            'learning_rate': 1.0,
            'base_score': 0.0,
            'eval_metric': 'rmse'
        }
        eval_history = {}
        bst = xgb.train(params, dtrain, num_boost_round=5,
                        evals=[(dtrain, 'train')], verbose_eval=False,
                        evals_result=eval_history)
        pred = bst.predict(dtrain)
        rmse = mean_squared_error(y_true=y, y_pred=pred, squared=False)
        np.testing.assert_almost_equal(rmse,
                                       eval_history['train']['rmse'][-1],
                                       decimal=5)
from tests.common.debug import minimal, find_any
from tests.pandas.helpers import supported_by_pandas


@given(pdst.data_frames([
    pdst.column('a', dtype=int),
    pdst.column('b', dtype=float),
]))
def test_can_have_columns_of_distinct_types(df):
    """Each generated column keeps the dtype declared for it."""
    assert df['a'].dtype == np.dtype(int)
    assert df['b'].dtype == np.dtype(float)


@given(pdst.data_frames(
    [pdst.column(dtype=int)],
    index=pdst.range_indexes(min_size=1, max_size=5)))
def test_respects_size_bounds(df):
    """range_indexes(min_size, max_size) bounds the generated row count."""
    assert 1 <= len(df) <= 5


@given(pdst.data_frames(pdst.columns(['A', 'B'], dtype=float)))
def test_can_specify_just_column_names(df):
    """Columns declared by name are present; lookup must not raise."""
    df['A']
    df['B']


@given(pdst.data_frames(pdst.columns(2, dtype=float)))
def test_can_specify_just_column_count(df):
    """Columns declared by count get integer labels starting at 0."""
    df[0]
    df[1]