Exemple #1
0
def categoricaldf_strategy():
    """Build a strategy for small data frames of sampled values.

    The generated frames have a ``names`` column sampled from the
    module-level ``names`` and a ``numbers`` column sampled from
    ``range(3)``, on a range index of 1 to 20 rows.
    """
    name_column = column("names", st.sampled_from(names))
    number_column = column("numbers", st.sampled_from(range(3)))
    row_index = range_indexes(min_size=1, max_size=20)
    return data_frames(columns=[name_column, number_column], index=row_index)
def test_uniqueness_does_not_affect_other_rows_2():
    """A unique column 'B' must not prevent duplicates in column 'A'."""
    column_specs = [
        pdst.column('A', dtype=int, unique=False),
        pdst.column('B', dtype=int, unique=True),
    ]
    row_values = st.tuples(st.integers(0, 10), st.integers(0, 10))
    # Exactly two rows, so positions 0 and 1 always exist.
    two_row_frames = pdst.data_frames(
        column_specs,
        rows=row_values,
        index=pdst.range_indexes(2, 2),
    )
    find_any(two_row_frames, lambda frame: frame['A'][0] == frame['A'][1])
Exemple #3
0
def test_uniqueness_does_not_affect_other_rows_2():
    """A unique column 'B' must not prevent duplicates in column 'A'."""
    column_specs = [
        pdst.column('A', dtype=int, unique=False),
        pdst.column('B', dtype=int, unique=True),
    ]
    row_values = st.tuples(st.integers(0, 10), st.integers(0, 10))
    # Exactly two rows, so positions 0 and 1 always exist.
    two_row_frames = pdst.data_frames(
        column_specs,
        rows=row_values,
        index=pdst.range_indexes(2, 2),
    )
    find_any(two_row_frames, lambda frame: frame['A'][0] == frame['A'][1])
Exemple #4
0
def nulldf_strategy():
    """Build a strategy for data frames dominated by missing values.

    Column "1" draws arbitrary floats (NaN and infinities allowed), while
    columns "2" and "3" are always NaN.  Frames have 3 to 20 rows on a
    range index.
    """
    float_column = column("1", st.floats(allow_nan=True, allow_infinity=True))
    nan_columns = [
        column(label, st.sampled_from([np.nan])) for label in ("2", "3")
    ]
    return data_frames(
        columns=[float_column, *nan_columns],
        index=range_indexes(min_size=3, max_size=20),
    )
def datasets(draw: Callable[[st.SearchStrategy], Any]) -> pd.DataFrame:
    """Generate datasets of MOER values.

    :param draw: draw callable used to pull a value out of a strategy
        (presumably supplied by ``hypothesis.strategies.composite`` -- the
        decorator is not visible here; confirm).
    :returns: a data frame with a ``timestamp`` column and a float ``MOER``
        column, where every timestamp has been passed through
        ``padded_strftime``.
    """
    timestamp_column = hpd.column("timestamp", st.datetimes())
    moer_column = hpd.column("MOER", st.floats())
    frame = draw(hpd.data_frames((timestamp_column, moer_column)))

    # Replace the raw datetimes in the first column with their formatted form.
    frame.iloc[:, 0] = frame.iloc[:, 0].apply(padded_strftime)
    # The debugging ``print(frame)`` was removed: it ran for every drawn
    # example and flooded the test output.
    return frame
def test_uniqueness_does_not_affect_other_rows_2():
    """A unique column "B" must not prevent duplicates in column "A"."""
    column_specs = [
        pdst.column("A", dtype=bool, unique=False),
        pdst.column("B", dtype=int, unique=True),
    ]
    row_values = st.tuples(st.booleans(), st.integers(0, 10))
    # Exactly two rows, so positions 0 and 1 always exist.
    two_row_frames = pdst.data_frames(
        column_specs,
        rows=row_values,
        index=pdst.range_indexes(2, 2),
    )
    find_any(two_row_frames, lambda frame: frame["A"][0] == frame["A"][1])
Exemple #7
0
class TestMain(unittest.TestCase):
    """Property test for ``sum`` with ``pandas.read_csv`` patched out."""

    @given(df=data_frames([
        column('1', dtype='float'),
        column('2', dtype='float'),
    ]))
    @patch("pandas.read_csv")
    def test_sum_basic(self, read_csv_mock: Mock, df):
        """The result has three columns and a float column named '3'."""
        # Feed the generated frame to the code under test instead of a file.
        read_csv_mock.return_value = df

        summed = sum()

        read_csv_mock.assert_called_once()
        column_count = len(summed.columns)
        self.assertEqual(column_count, 3)
        self.assertEqual(summed['3'].dtype, 'float')
	def test_pandas_vertex_creation_noproperty(self):
		"""Add vertices from a generated frame, passing no vertex properties."""
		src_column = column(name='src', elements=st.sampled_from(names), unique=True)
		age_column = column(
			name='age',
			elements=st.integers(min_value=20, max_value=30),
			unique=False,
		)
		# A single concrete example frame is drawn from the strategy.
		dataframe = data_frames(columns=[src_column, age_column]).example()

		factory = TinkerFactory()
		g = factory.addV_from_pandas(dataframe, src='src', v_properties=[])
def hypot_df_generator():
    """Build a data-frame strategy from the distinct values seen in ``raw_()``.

    Each listed question column is sampled from the unique values that the
    same column holds in the frame returned by ``raw_()``.
    """
    source = raw_()
    question_names = ['Q1', 'Q2', 'Q3', 'Q4', 'Q5', 'Q6', 'Q8', 'Q9']
    question_columns = [
        column(question, elements=strategies.sampled_from(source[question].unique()))
        for question in question_names
    ]
    return data_frames(columns=question_columns)
Exemple #10
0
def column_strategy(
    pandas_dtype: PandasDtype,
    strategy: Optional[SearchStrategy] = None,
    *,
    checks: Optional[Sequence] = None,
    allow_duplicates: Optional[bool] = True,
    name: Optional[str] = None,
):
    # pylint: disable=line-too-long
    """Describe a single DataFrame column for hypothesis.

    :param pandas_dtype: :class:`pandera.dtypes.PandasDtype` instance.
    :param strategy: optional hypothesis strategy; when given, the pandas
        dtype strategy is chained onto it.
    :param checks: sequence of :class:`~pandera.checks.Check` s that
        constrain the values of the data in the column/index.
    :param allow_duplicates: whether the generated Series may contain
        duplicates.
    :param name: name of the Series.
    :returns: a `column <https://hypothesis.readthedocs.io/en/latest/numpy.html#hypothesis.extra.pandas.column>`_ object.
    """
    verify_pandas_dtype(pandas_dtype, schema_type="column", name=name)
    column_kwargs = {
        "name": name,
        "elements": field_element_strategy(
            pandas_dtype, strategy, checks=checks
        ),
        "dtype": pandas_dtype.numpy_dtype,
        # hypothesis expresses the constraint the other way round.
        "unique": not allow_duplicates,
    }
    return pdst.column(**column_kwargs)
Exemple #11
0
def null_dataframe_masks(
    draw,
    strategy: Optional[SearchStrategy],
    nullable_columns: Dict[str, bool],
):
    """Strategy for masking values in a pandas DataFrame.

    :param draw: callable used to draw a value from a strategy.
    :param strategy: hypothesis strategy the unmasked DataFrame is drawn
        from.
    :param nullable_columns: dictionary where keys are column names and
        values indicate whether that column is nullable.
    :returns: the drawn DataFrame with the cells selected by a drawn boolean
        mask replaced via ``DataFrame.mask``.
    """
    val = draw(strategy)
    n_rows = val.shape[0]

    # Only nullable columns may draw True in the mask; the rest stay False.
    mask_columns = [
        pdst.column(
            name=col_name,
            elements=st.booleans() if is_nullable else st.just(False),
            dtype=bool,
            fill=st.just(False),
        )
        for col_name, is_nullable in nullable_columns.items()
    ]
    # The mask must have exactly as many rows as the drawn frame.
    same_length_index = pdst.range_indexes(min_size=n_rows, max_size=n_rows)
    null_mask = draw(
        pdst.data_frames(columns=mask_columns, index=same_length_index)
    )
    # assume that there is at least one masked value
    hypothesis.assume(null_mask.any(axis=None))
    return val.mask(null_mask)
def s_column():
    """Return the "S" column spec: finite float64 values in [0, 27000]."""
    s_values = floats(
        min_value=0,
        max_value=27000,
        allow_infinity=False,
        allow_nan=False,
    )
    return column("S", elements=s_values, dtype=np.float64)
def column_by_dtypes(
        dtype_group: Optional[str] = "RiptableNumeric") -> List[pdst.column]:
    """Return one column per distinct dtype in a dtype group.

    Used to generate primitive data types wrapped in columns for DataFrame
    strategies.

    :param dtype_group: key into the module-level ``dtypes_by_group``
        mapping.
    :returns: a list of ``pdst.column`` objects, each named ``str(dtype)``
        and typed ``np.dtype(dtype)``.
    :raises KeyError: presumably, if ``dtype_group`` is not a key of
        ``dtypes_by_group`` (assumes a plain mapping -- confirm).
    """
    # Deduplicate with dict.fromkeys instead of set(): a set iterates in an
    # arbitrary order that can differ between interpreter runs, which would
    # make the generated column order non-reproducible.
    distinct_dtypes = dict.fromkeys(dtypes_by_group[dtype_group])
    return [
        pdst.column(str(dtype), dtype=np.dtype(dtype))
        for dtype in distinct_dtypes
    ]
Exemple #14
0
def test_dayofweek_unknown(data):
    """``ei.day_of_week`` yields 'Unknown' for every row of both frames."""
    # HOUR in 0..24 with DAY_WEEK fixed at 9.
    day_nine_frames = data_frames(columns=[
        column(name='HOUR',
               elements=st.integers(min_value=0, max_value=24),
               unique=False),
        column(name='DAY_WEEK', elements=st.just(9), unique=False),
    ])
    data1 = data.draw(day_nine_frames)

    # HOUR fixed at 99 with DAY_WEEK either 2 or 6.
    hour_ninety_nine_frames = data_frames(columns=[
        column(name='HOUR', elements=st.just(99), unique=False),
        column(name='DAY_WEEK',
               elements=st.one_of(st.just(2), st.just(6)),
               unique=False),
    ])
    data2 = data.draw(hour_ninety_nine_frames)

    for frame in (data1, data2):
        for label in ei.day_of_week(frame):
            assert label == 'Unknown'
	def test_pandas_vertex_creation_noproperty(self):
		"""Add vertices from a generated frame, keeping only 'age' as a property."""
		names = ['andre', 'renan', 'diego', 'caio', 'victor', 'bruno']
		languages = ['python', 'R', 'java']

		src_column = column(name='src', elements=st.sampled_from(names), unique=True)
		age_column = column(
			name='age',
			elements=st.integers(min_value=20, max_value=30),
			unique=False,
		)
		lang_column = column(name='lang', elements=st.sampled_from(languages), unique=False)
		# A single concrete example frame is drawn from the strategy.
		dataframe = data_frames(columns=[src_column, age_column, lang_column]).example()

		factory = TinkerFactory()
		g = factory.addV_from_pandas(dataframe, src='src', v_properties=['age'])
Exemple #16
0
def df_strategy(allow_nan=True, allow_infinity=True):
    """
    Build a strategy for three-column data frames.

    The frames have float columns ``item`` and ``att2`` and an object
    column ``att1`` holding printable text of at most 5 characters.
    ``allow_nan`` and ``allow_infinity`` are forwarded to both float
    columns, so they control whether NaN / infinite values may appear.
    """
    frame_columns = [
        column(name="item", dtype=float),
        column(name="att1", dtype="object"),
        column(name="att2", dtype=float),
    ]
    item_values = st.floats(allow_nan=allow_nan, allow_infinity=allow_infinity)
    att1_values = st.text(printable, max_size=5)
    att2_values = st.floats(allow_nan=allow_nan, allow_infinity=allow_infinity)

    return data_frames(
        columns=frame_columns,
        rows=st.tuples(item_values, att1_values, att2_values),
    )
class Test(unittest.TestCase):
    """Hypothesis-driven smoke tests for the ``Math`` and ``transforms`` helpers."""

    # Fibonacci sequence on the integers 0 through 10.
    @given(ST.integers(0, 10))
    def test_00_fib_first_10(self, n):
        Math.fib(n)

    # Fibonacci sequence on negative numbers, which breaks one of the
    # function's assumptions (no negative values).
    @given(ST.integers(max_value=-1))
    def test_01_fib_negative_values(self, n):
        Math.fib(n)

    # BetaCoefficient on arbitrary float lists.
    @given(n=ST.integers(), Y=ST.lists(ST.floats()), X1=ST.lists(ST.floats()))
    def test_03_BetaCoefficient(self, n, Y, X1):
        coefficient = Math.GetBetaCoefficient(Y, X1)
        print(coefficient)

    # Unbounded plane, exercised with 1000 different examples.
    @settings(max_examples=1000)
    @given(
        coord1=ST.tuples(ST.floats(), ST.floats()),
        coord2=ST.tuples(ST.floats(), ST.floats()),
    )
    def test_04_EuclideanDistance_unbound(self, coord1, coord2):
        Math.EuclideanDistance(coord1, coord2)

    # Simple pandas transpose.
    @given(data_frames([column('a', dtype=int), column('b', dtype=int)]))
    def test_05_transpose(self, df):
        transforms.transpose(df)

    # Creation of a geographic distance matrix: a higher-order function
    # that builds on the euclidean distance.
    @given(data_frames([
        column('lat', dtype=float),
        column('lon', dtype=float),
    ]))
    def test_06_DistanceMatrixGeneration(self, df):
        # Give each row a sequential store id starting at 0.
        df['store_id'] = list(range(len(df)))
        transforms.CreateDistanceMatrix(df)
def column_arrays(draw) -> List[Union[np.ndarray, rt.FastArray]]:
    """Return a one-element list holding a column built from a drawn array.

    The array is drawn from ``generate_array`` with an int-or-float dtype
    and without invalid values; the column is named ``name(arr)``.
    """
    # todo add strategy to generate FastArray to the return list
    array_strategy = generate_array(
        shape=ndarray_shape_strategy(),
        dtype=ints_or_floats_dtypes(),
        include_invalid=False,
    )
    arr = draw(array_strategy)
    # f_arr = rt.FastArray(arr)
    array_column = pdst.column(name(arr), elements=arr)
    return [array_column]
Exemple #19
0
def df_strategy():
    """
    Return a hypothesis strategy that generates small test data frames.

    Treat it like a fixture, but do not pass it into a test function as a
    fixture. Instead::

        @given(df=df_strategy())
        def test_function(df):
            # test goes here
    """
    # Column names deliberately mix cases, underscores, dashes and symbols.
    frame_columns = [
        column("a", elements=st.integers()),
        column("Bell__Chart", elements=st.floats()),
        column("decorated-elephant", elements=st.integers()),
        column("animals@#$%^", elements=st.text()),
        column("cities", st.text()),
    ]
    row_index = range_indexes(min_size=1, max_size=20)
    return data_frames(columns=frame_columns, index=row_index)
Exemple #20
0
def test_weekday(data):
    """``ei.day_of_week`` yields 'Weekday' for every row of all three frames."""
    # Weekday, day=3,4,5
    midweek_frames = data_frames(columns=[
        column(name='HOUR',
               elements=st.one_of(st.integers(min_value=0, max_value=24),
                                  st.just(99)),
               unique=False),
        column(name='DAY_WEEK',
               elements=st.integers(min_value=3, max_value=5),
               unique=False),
    ])
    data1 = data.draw(midweek_frames)

    # Weekday, day=6, hr=0-17, 24
    day_six_frames = data_frames(columns=[
        column(name='HOUR',
               elements=st.one_of(st.integers(min_value=0, max_value=17),
                                  st.just(24)),
               unique=False),
        column(name='DAY_WEEK', elements=st.just(6), unique=False),
    ])
    data2 = data.draw(day_six_frames)

    # Weekday, day=2, hr=6-23
    day_two_frames = data_frames(columns=[
        column(name='HOUR',
               elements=st.integers(min_value=6, max_value=23),
               unique=False),
        column(name='DAY_WEEK', elements=st.just(2), unique=False),
    ])
    data3 = data.draw(day_two_frames)

    for frame in (data1, data2, data3):
        for label in ei.day_of_week(frame):
            assert label == 'Weekday'
Exemple #21
0
def test_expected_failure_from_omitted_object_dtype(dtype):
    """Columns of sets work with ``dtype=object`` and raise a hint otherwise."""
    # See https://github.com/HypothesisWorks/hypothesis/issues/3133
    set_column = pdst.column(
        elements=st.sets(st.text(), min_size=1), dtype=dtype
    )

    @given(pdst.data_frames(columns=[set_column]))
    def works_with_object_dtype(df):
        pass

    if dtype is not object:
        # The only other value this test expects is an omitted dtype.
        assert dtype is None
        with pytest.raises(ValueError, match="Maybe passing dtype=object would help"):
            works_with_object_dtype()
        return

    works_with_object_dtype()
Exemple #22
0
class TestExogenous:
    """Property tests for the ``Exogenous`` transformer."""

    @given(giotto_time_series(min_length=2))
    def test_exogenous_single_column(self, time_series: pd.DataFrame):
        """Apart from column names, a single-column frame is returned unchanged."""
        result = Exogenous().fit_transform(time_series)
        result.columns = ["time_series"]
        assert_frame_equal(result, time_series, check_names=False)

    @given(data_frames([column("A", dtype=int), column("B", dtype=float)]))
    def test_multiple_columns(self, time_series: pd.DataFrame):
        """Apart from column names, a two-column frame is returned unchanged."""
        result = Exogenous().fit_transform(time_series)
        result.columns = ["A", "B"]
        assert_frame_equal(result, time_series, check_names=False)

    @given(giotto_time_series(min_length=2))
    def test_naming(self, time_series: pd.DataFrame):
        """Each output column is the input name suffixed with ``__Exogenous``."""
        result = Exogenous().fit_transform(time_series)
        expected_columns = []
        for column_name in time_series.columns:
            expected_columns.append(f"{column_name}__Exogenous")
        assert expected_columns == list(result.columns)
def dataframe(draw):
    """Draw a data frame with 1 to 20 columns of float, int or str dtype.

    Column names are unique and are either text or integers.
    """
    n_cols = draw(integers(min_value=1, max_value=20))
    # Both lists are forced to exactly n_cols entries so they pair up 1:1.
    exact_size = {"min_size": n_cols, "max_size": n_cols}
    dtypes = draw(lists(sampled_from([float, int, str]), **exact_size))
    colnames = draw(lists(text() | integers(), unique=True, **exact_size))

    frame_columns = [
        column(name=col_name, dtype=col_dtype)
        for col_dtype, col_name in zip(dtypes, colnames)
    ]
    return draw(data_frames(columns=frame_columns))
Exemple #24
0
def column_strategy(draw):
    """Draw a ``pdst.column`` with randomised name, dtype, elements, unique and fill."""
    name = draw(st.none() | st.text())
    dtype = draw(npst.scalar_dtypes().filter(supported_by_pandas))

    # The second boolean is only drawn when pass_dtype is true; the order of
    # draws is kept exactly as before.
    pass_dtype = not draw(st.booleans())
    pass_elements = (not draw(st.booleans())) if pass_dtype else True
    elements = npst.from_dtype(dtype) if pass_elements else None

    unique = draw(st.booleans())
    fill = st.nothing() if draw(st.booleans()) else None

    column_kwargs = dict(
        name=name, dtype=dtype, unique=unique, fill=fill, elements=elements
    )
    return pdst.column(**column_kwargs)
def column_strategy(draw):
    """Draw a ``pdst.column`` with randomised name, dtype, elements, unique and fill."""
    name = draw(st.none() | st.text())
    dtype = draw(npst.scalar_dtypes().filter(supported_by_pandas))

    # The second boolean is only drawn when pass_dtype is true; the order of
    # draws is kept exactly as before.
    pass_dtype = not draw(st.booleans())
    pass_elements = (not draw(st.booleans())) if pass_dtype else True
    elements = npst.from_dtype(dtype) if pass_elements else None

    unique = draw(st.booleans())
    fill = st.nothing() if draw(st.booleans()) else None

    column_kwargs = dict(
        name=name, dtype=dtype, unique=unique, fill=fill, elements=elements
    )
    return pdst.column(**column_kwargs)
Exemple #26
0
def test_can_minimize_based_on_two_columns_independently(
        disable_fill, non_standard_index):
    """The minimal frame with a true value in A, B and C has a single row."""
    columns = []
    for name in ["A", "B", "C"]:
        # Filling is switched off only for the columns named in disable_fill.
        fill = st.nothing() if name in disable_fill else None
        columns.append(pdst.column(name, dtype=bool, fill=fill))

    index = pdst.indexes(dtype=int) if non_standard_index else None
    smallest = minimal(
        pdst.data_frames(columns, index=index),
        lambda x: x["A"].any() and x["B"].any() and x["C"].any(),
        random=Random(0),
    )

    assert len(smallest["A"]) == 1
    assert smallest["A"][0] == 1
    assert smallest["B"][0] == 1
    assert smallest["C"][0] == 1
Exemple #27
0
def dataframe(draw):
    """Draw a data frame with 1 to 20 columns of NumPy float, integer or unicode dtype.

    Column names are unique and are either text or integers.
    """
    n_cols = draw(integers(min_value=1, max_value=20))
    # Both lists are forced to exactly n_cols entries so they pair up 1:1.
    exact_size = {"min_size": n_cols, "max_size": n_cols}
    any_dtype = one_of(
        np_strategies.floating_dtypes(),
        np_strategies.integer_dtypes(),
        np_strategies.unicode_string_dtypes(),
    )
    dtypes = draw(lists(any_dtype, **exact_size))
    colnames = draw(lists(text() | integers(), unique=True, **exact_size))

    frame_columns = [
        column(name=col_name, dtype=col_dtype)
        for col_dtype, col_name in zip(dtypes, colnames)
    ]
    return draw(data_frames(columns=frame_columns))
def dataframe_and_clusters(draw, length=None):
    """Draw a data frame together with one cluster label per row.

    The frame has 1 to 20 uniquely named columns of float, int or str
    dtype; labels are integers in 0..3.  ``length`` is accepted but not used
    by this implementation.

    :returns: a ``(df, cluster_labels)`` tuple.
    """
    n_cols = draw(integers(min_value=1, max_value=20))
    # Both lists are forced to exactly n_cols entries so they pair up 1:1.
    exact_size = {"min_size": n_cols, "max_size": n_cols}
    dtypes = draw(lists(sampled_from([float, int, str]), **exact_size))
    colnames = draw(lists(text() | integers(), unique=True, **exact_size))

    frame_columns = [
        column(name=col_name, dtype=col_dtype)
        for col_dtype, col_name in zip(dtypes, colnames)
    ]
    df = draw(data_frames(columns=frame_columns))

    n_rows = len(df)
    label_strategy = lists(
        integers(min_value=0, max_value=3),
        min_size=n_rows,
        max_size=n_rows,
    )
    cluster_labels = draw(label_strategy)
    return df, cluster_labels
Exemple #29
0
def test_can_minimize_based_on_two_columns_independently(
    disable_fill, non_standard_index
):
    """The minimal frame with a true value in A, B and C has a single row."""
    columns = []
    for name in ['A', 'B', 'C']:
        # Filling is switched off only for the columns named in disable_fill.
        fill = st.nothing() if name in disable_fill else None
        columns.append(pdst.column(name, dtype=bool, fill=fill))

    index = pdst.indexes(dtype=int) if non_standard_index else None
    smallest = minimal(
        pdst.data_frames(columns, index=index),
        lambda x: x['A'].any() and x['B'].any() and x['C'].any(),
        random=Random(0),
    )

    assert len(smallest['A']) == 1
    assert smallest['A'][0] == 1
    assert smallest['B'][0] == 1
    assert smallest['C'][0] == 1
#
# END HEADER

from __future__ import absolute_import, division, print_function

import numpy as np

import hypothesis.extra.numpy as npst
import hypothesis.extra.pandas as pdst
import hypothesis.strategies as st
from hypothesis import HealthCheck, given, reject, settings
from tests.common.debug import find_any
from tests.pandas.helpers import supported_by_pandas


@given(
    pdst.data_frames(
        [
            pdst.column("a", dtype=int),
            pdst.column("b", dtype=float),
        ]
    )
)
def test_can_have_columns_of_distinct_types(df):
    """Each column keeps the dtype it was declared with."""
    expected_dtypes = (("a", np.dtype(int)), ("b", np.dtype(float)))
    for col_name, expected in expected_dtypes:
        assert df[col_name].dtype == expected


@given(
    pdst.data_frames(
        [pdst.column(dtype=int)],
        index=pdst.range_indexes(min_size=1, max_size=5),
    )
)
def test_respects_size_bounds(df):
    """The frame length stays within the bounds given to the index strategy."""
    n_rows = len(df)
    assert n_rows >= 1
    assert n_rows <= 5


@given(pdst.data_frames(pdst.columns(["A", "B"], dtype=float)))
import pytest

from siuba import _, mutate, group_by, summarize, filter
import siuba.sql.dply
from siuba.dply import vector as v
from datetime import timedelta

from hypothesis import given, settings, example
from hypothesis.strategies import text, floats, integers
from hypothesis.extra.pandas import data_frames, column, indexes

from .helpers import assert_equal_query, data_frame, backend_sql
from pandas.testing import assert_frame_equal

# Strategy for the data frames used by the tests in this module:
#   * 'x' -- unique values that are either 32-bit-width floats or integers
#   * 'g' -- str-typed text of at most one character
# The index holds floats or integers and has at most 10 entries.
DATA_SPEC = data_frames([
    column('x', elements=floats(width=32) | integers(), unique=True),
    column('g', dtype=str, elements=text(max_size=1))
],
                        index=indexes(elements=floats() | integers(),
                                      max_size=10))

OMNIBUS_VECTOR_FUNCS = [
    #cumall, cumany, cummean,
    #desc,
    v.dense_rank(_.x, na_option="keep"),
    #v.percent_rank(_.x),
    v.min_rank(_.x, na_option="keep"),
    v.cume_dist(_.x, na_option="keep"),
    v.row_number(_.x),
    #ntile,
    v.between(_.x, 2, 5, default=False),
Exemple #32
0
def generic_column_pos(name):
    """Return a float64 column spec named ``name`` with finite values >= 0."""
    non_negative = floats(min_value=0, allow_infinity=False, allow_nan=False)
    return column(name, elements=non_negative, dtype=np.float64)
Exemple #33
0
def s_column():
    """Return the "S" column spec: finite float64 values in [0, 27000]."""
    s_values = floats(
        min_value=0,
        max_value=27000,
        allow_infinity=False,
        allow_nan=False,
    )
    return column("S", elements=s_values, dtype=np.float64)
Exemple #34
0
from __future__ import division, print_function, absolute_import

import numpy as np
import pytest

import hypothesis.strategies as st
import hypothesis.extra.numpy as npst
import hypothesis.extra.pandas as pdst
from hypothesis import HealthCheck, given, reject, settings
from hypothesis.types import RandomWithSeed as Random
from tests.common.debug import minimal, find_any
from tests.pandas.helpers import supported_by_pandas


@given(
    pdst.data_frames(
        [pdst.column('a', dtype=int), pdst.column('b', dtype=float)]
    )
)
def test_can_have_columns_of_distinct_types(df):
    """Each column keeps the dtype it was declared with."""
    expected_dtypes = (('a', np.dtype(int)), ('b', np.dtype(float)))
    for col_name, expected in expected_dtypes:
        assert df[col_name].dtype == expected


@given(
    pdst.data_frames(
        [pdst.column(dtype=int)],
        index=pdst.range_indexes(min_size=1, max_size=5),
    )
)
def test_respects_size_bounds(df):
    """The frame length stays within the bounds given to the index strategy."""
    n_rows = len(df)
    assert n_rows >= 1
    assert n_rows <= 5


@given(pdst.data_frames(pdst.columns(['A', 'B'], dtype=float)))
Exemple #35
0
class TestGPUPredict:
    """Tests that compare predictions made with ``gpu_predictor`` against
    ``cpu_predictor`` (and in-place prediction against DMatrix prediction)
    for boosters trained with ``tree_method='gpu_hist'``.
    """

    def test_predict(self):
        """Margin predictions from GPU- and CPU-predictor models must agree.

        For several (rows, cols) shapes of random normal data, a model is
        trained once per predictor and the train/validation/test margins
        are compared with ``rtol=1e-6``.
        """
        iterations = 10
        np.random.seed(1)
        test_num_rows = [10, 1000, 5000]
        test_num_cols = [10, 50, 500]
        # This test passes for tree_method=gpu_hist and tree_method=exact. but
        # for `hist` and `approx` the floating point error accumulates faster
        # and fails even tol is set to 1e-4.  For `hist`, the mismatching rate
        # with 5000 rows is 0.04.
        for num_rows in test_num_rows:
            for num_cols in test_num_cols:
                # Labels alternate 0, 1 so each data set is balanced.
                dtrain = xgb.DMatrix(np.random.randn(num_rows, num_cols),
                                     label=[0, 1] * int(num_rows / 2))
                dval = xgb.DMatrix(np.random.randn(num_rows, num_cols),
                                   label=[0, 1] * int(num_rows / 2))
                dtest = xgb.DMatrix(np.random.randn(num_rows, num_cols),
                                    label=[0, 1] * int(num_rows / 2))
                watchlist = [(dtrain, 'train'), (dval, 'validation')]
                res = {}
                param = {
                    "objective": "binary:logistic",
                    "predictor": "gpu_predictor",
                    'eval_metric': 'logloss',
                    'tree_method': 'gpu_hist',
                    'max_depth': 1
                }
                bst = xgb.train(param,
                                dtrain,
                                iterations,
                                evals=watchlist,
                                evals_result=res)
                assert self.non_increasing(res["train"]["logloss"])
                gpu_pred_train = bst.predict(dtrain, output_margin=True)
                gpu_pred_test = bst.predict(dtest, output_margin=True)
                gpu_pred_val = bst.predict(dval, output_margin=True)

                # Retrain with the same parameters but the CPU predictor.
                param["predictor"] = "cpu_predictor"
                bst_cpu = xgb.train(param, dtrain, iterations, evals=watchlist)
                cpu_pred_train = bst_cpu.predict(dtrain, output_margin=True)
                cpu_pred_test = bst_cpu.predict(dtest, output_margin=True)
                cpu_pred_val = bst_cpu.predict(dval, output_margin=True)

                np.testing.assert_allclose(cpu_pred_train,
                                           gpu_pred_train,
                                           rtol=1e-6)
                np.testing.assert_allclose(cpu_pred_val,
                                           gpu_pred_val,
                                           rtol=1e-6)
                np.testing.assert_allclose(cpu_pred_test,
                                           gpu_pred_test,
                                           rtol=1e-6)

    def non_increasing(self, L):
        """Return True if no element of ``L`` exceeds its predecessor by 0.001 or more."""
        return all((y - x) < 0.001 for x, y in zip(L, L[1:]))

    # Test case for a bug where multiple batch predictions made on a
    # test set produce incorrect results
    @pytest.mark.skipif(**tm.no_sklearn())
    def test_multi_predict(self):
        """Repeated GPU predictions must match each other and the CPU prediction."""
        from sklearn.datasets import make_regression
        from sklearn.model_selection import train_test_split

        n = 1000
        # NOTE(review): `rng` is not defined in this method; presumably a
        # module-level random state -- confirm.
        X, y = make_regression(n, random_state=rng)
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            random_state=123)
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dtest = xgb.DMatrix(X_test)

        params = {}
        params["tree_method"] = "gpu_hist"

        params['predictor'] = "gpu_predictor"
        bst_gpu_predict = xgb.train(params, dtrain)

        params['predictor'] = "cpu_predictor"
        bst_cpu_predict = xgb.train(params, dtrain)

        # Predict twice with the same GPU booster to detect state carried
        # over between calls.
        predict0 = bst_gpu_predict.predict(dtest)
        predict1 = bst_gpu_predict.predict(dtest)
        cpu_predict = bst_cpu_predict.predict(dtest)

        assert np.allclose(predict0, predict1)
        assert np.allclose(predict0, cpu_predict)

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_sklearn(self):
        """``XGBRegressor`` scores must match between the CPU and GPU predictors."""
        m, n = 15000, 14
        tr_size = 2500
        X = np.random.rand(m, n)
        y = 200 * np.matmul(X, np.arange(-3, -3 + n))
        X_train, y_train = X[:tr_size, :], y[:tr_size]
        X_test, y_test = X[tr_size:, :], y[tr_size:]

        # First with cpu_predictor
        params = {
            'tree_method': 'gpu_hist',
            'predictor': 'cpu_predictor',
            'n_jobs': -1,
            'seed': 123
        }
        # Note: `m` is rebound here from the row count to the fitted model.
        m = xgb.XGBRegressor(**params).fit(X_train, y_train)
        cpu_train_score = m.score(X_train, y_train)
        cpu_test_score = m.score(X_test, y_test)

        # Now with gpu_predictor
        params['predictor'] = 'gpu_predictor'

        m = xgb.XGBRegressor(**params).fit(X_train, y_train)
        gpu_train_score = m.score(X_train, y_train)
        gpu_test_score = m.score(X_test, y_test)

        assert np.allclose(cpu_train_score, gpu_train_score)
        assert np.allclose(cpu_test_score, gpu_test_score)

    @pytest.mark.skipif(**tm.no_cupy())
    def test_inplace_predict_cupy(self):
        """In-place prediction on cupy arrays must match DMatrix prediction."""
        import cupy as cp
        cp.cuda.runtime.setDevice(0)
        rows = 1000
        cols = 10
        cp_rng = cp.random.RandomState(1994)
        cp.random.set_random_state(cp_rng)
        X = cp.random.randn(rows, cols)
        y = cp.random.randn(rows)

        dtrain = xgb.DMatrix(X, y)

        booster = xgb.train({'tree_method': 'gpu_hist'},
                            dtrain,
                            num_boost_round=10)
        # Compare both prediction paths on the first 10 rows.
        test = xgb.DMatrix(X[:10, ...])
        predt_from_array = booster.inplace_predict(X[:10, ...])
        predt_from_dmatrix = booster.predict(test)

        cp.testing.assert_allclose(predt_from_array, predt_from_dmatrix)

        def predict_dense(x):
            # True when in-place and DMatrix predictions are exactly equal.
            inplace_predt = booster.inplace_predict(x)
            d = xgb.DMatrix(x)
            copied_predt = cp.array(booster.predict(d))
            return cp.all(copied_predt == inplace_predt)

        # Don't do this on Windows, see issue #5793
        if sys.platform.startswith("win"):
            pytest.skip(
                'Multi-threaded in-place prediction with cuPy is not working on Windows'
            )
        for i in range(10):
            run_threaded_predict(X, rows, predict_dense)

    @pytest.mark.skipif(**tm.no_cudf())
    def test_inplace_predict_cudf(self):
        """In-place prediction on a cudf DataFrame must match DMatrix prediction."""
        import cupy as cp
        import cudf
        import pandas as pd
        rows = 1000
        cols = 10
        rng = np.random.RandomState(1994)
        cp.cuda.runtime.setDevice(0)
        X = rng.randn(rows, cols)
        X = pd.DataFrame(X)
        y = rng.randn(rows)
        # Convert the pandas frame to cudf before building the DMatrix.
        X = cudf.from_pandas(X)

        dtrain = xgb.DMatrix(X, y)

        booster = xgb.train({'tree_method': 'gpu_hist'},
                            dtrain,
                            num_boost_round=10)
        test = xgb.DMatrix(X)
        predt_from_array = booster.inplace_predict(X)
        predt_from_dmatrix = booster.predict(test)

        cp.testing.assert_allclose(predt_from_array, predt_from_dmatrix)

        def predict_df(x):
            # True when in-place and DMatrix predictions are exactly equal.
            inplace_predt = booster.inplace_predict(x)
            d = xgb.DMatrix(x)
            copied_predt = cp.array(booster.predict(d))
            return cp.all(copied_predt == inplace_predt)

        for i in range(10):
            run_threaded_predict(X, rows, predict_df)

    @given(strategies.integers(1, 10), tm.dataset_strategy,
           shap_parameter_strategy)
    @settings(deadline=None)
    def test_shap(self, num_rounds, dataset, param):
        """Contributions summed over the last axis must equal the margin."""
        param.update({"predictor": "gpu_predictor", "gpu_id": 0})
        param = dataset.set_params(param)
        dmat = dataset.get_dmat()
        bst = xgb.train(param, dmat, num_rounds)
        test_dmat = xgb.DMatrix(dataset.X, dataset.y, dataset.w,
                                dataset.margin)
        shap = bst.predict(test_dmat, pred_contribs=True)
        margin = bst.predict(test_dmat, output_margin=True)
        # Examples with an empty label vector are rejected; note that this
        # check runs only after training and prediction have been done.
        assume(len(dataset.y) > 0)
        # Positional 1e-3, 1e-3 are np.allclose's rtol and atol.
        assert np.allclose(np.sum(shap, axis=len(shap.shape) - 1), margin,
                           1e-3, 1e-3)

    @given(strategies.integers(1, 10), tm.dataset_strategy,
           shap_parameter_strategy)
    @settings(deadline=None, max_examples=20)
    def test_shap_interactions(self, num_rounds, dataset, param):
        """Interactions summed over the last two axes must equal the margin."""
        param.update({"predictor": "gpu_predictor", "gpu_id": 0})
        param = dataset.set_params(param)
        dmat = dataset.get_dmat()
        bst = xgb.train(param, dmat, num_rounds)
        test_dmat = xgb.DMatrix(dataset.X, dataset.y, dataset.w,
                                dataset.margin)
        shap = bst.predict(test_dmat, pred_interactions=True)
        margin = bst.predict(test_dmat, output_margin=True)
        # Examples with an empty label vector are rejected; note that this
        # check runs only after training and prediction have been done.
        assume(len(dataset.y) > 0)
        # Positional 1e-3, 1e-3 are np.allclose's rtol and atol.
        assert np.allclose(
            np.sum(shap, axis=(len(shap.shape) - 1, len(shap.shape) - 2)),
            margin, 1e-3, 1e-3)

    def test_predict_leaf_basic(self):
        """Leaf predictions from ``run_predict_leaf`` must be equal for both predictors."""
        gpu_leaf = run_predict_leaf('gpu_predictor')
        cpu_leaf = run_predict_leaf('cpu_predictor')
        np.testing.assert_equal(gpu_leaf, cpu_leaf)

    def run_predict_leaf_booster(self, param, num_rounds, dataset):
        """Train one booster and check leaf predictions are equal across predictors.

        Helper for the ``test_predict_leaf_*`` tests: the same booster is
        switched between ``cpu_predictor`` and ``gpu_predictor`` via
        ``set_param`` and its ``pred_leaf`` output is compared.
        """
        param = dataset.set_params(param)
        m = dataset.get_dmat()
        booster = xgb.train(param,
                            dtrain=dataset.get_dmat(),
                            num_boost_round=num_rounds)
        booster.set_param({'predictor': 'cpu_predictor'})
        cpu_leaf = booster.predict(m, pred_leaf=True)

        booster.set_param({'predictor': 'gpu_predictor'})
        gpu_leaf = booster.predict(m, pred_leaf=True)

        np.testing.assert_equal(cpu_leaf, gpu_leaf)

    @given(predict_parameter_strategy, tm.dataset_strategy)
    @settings(deadline=None)
    def test_predict_leaf_gbtree(self, param, dataset):
        """Leaf-prediction consistency check with the ``gbtree`` booster, 10 rounds."""
        param['booster'] = 'gbtree'
        param['tree_method'] = 'gpu_hist'
        self.run_predict_leaf_booster(param, 10, dataset)

    @given(predict_parameter_strategy, tm.dataset_strategy)
    @settings(deadline=None)
    def test_predict_leaf_dart(self, param, dataset):
        """Leaf-prediction consistency check with the ``dart`` booster, 10 rounds."""
        param['booster'] = 'dart'
        param['tree_method'] = 'gpu_hist'
        self.run_predict_leaf_booster(param, 10, dataset)

    @pytest.mark.skipif(**tm.no_sklearn())
    @pytest.mark.skipif(**tm.no_pandas())
    @given(df=data_frames([
        column('x0', elements=strategies.integers(min_value=0, max_value=3)),
        column('x1', elements=strategies.integers(min_value=0, max_value=5))
    ],
                          index=range_indexes(min_size=20, max_size=50)))
    @settings(deadline=None)
    def test_predict_categorical_split(self, df):
        """RMSE recomputed from predictions must match the last recorded training RMSE.

        The generated integer columns are converted to categorical dtype
        and the DMatrix is built with ``enable_categorical=True``.
        """
        from sklearn.metrics import mean_squared_error

        df = df.astype('category')
        x0, x1 = df['x0'].to_numpy(), df['x1'].to_numpy()
        # The target is a fixed function of the two category codes.
        y = (x0 * 10 - 20) + (x1 - 2)
        dtrain = xgb.DMatrix(df, label=y, enable_categorical=True)

        params = {
            'tree_method': 'gpu_hist',
            'predictor': 'gpu_predictor',
            'max_depth': 3,
            'learning_rate': 1.0,
            'base_score': 0.0,
            'eval_metric': 'rmse'
        }

        eval_history = {}
        bst = xgb.train(params,
                        dtrain,
                        num_boost_round=5,
                        evals=[(dtrain, 'train')],
                        verbose_eval=False,
                        evals_result=eval_history)

        pred = bst.predict(dtrain)
        # squared=False makes mean_squared_error return the RMSE.
        # NOTE(review): the `squared` argument is deprecated in newer
        # scikit-learn releases -- confirm against the pinned version.
        rmse = mean_squared_error(y_true=y, y_pred=pred, squared=False)
        np.testing.assert_almost_equal(rmse,
                                       eval_history['train']['rmse'][-1],
                                       decimal=5)
Exemple #36
0
# Element strategies shared by the data-frame strategies below.
strands = st.sampled_from("+ -".split())
single_strand = st.sampled_from(["+"])
# NOTE(review): the alphabet below has no "w" -- confirm this is intentional.
names = st.text("abcdefghijklmnopqrstuvxyz", min_size=1)
scores = st.integers(min_value=0, max_value=256)

# Container types a value may be wrapped in.
datatype = st.sampled_from([pd.Series, np.array, list])

# "chr1" .. "chr22" plus "chrX", "chrY" and "chrM".
chromosomes = st.sampled_from(
    ["chr{}".format(str(e)) for e in list(range(1, 23)) + "X Y M".split()])
chromosomes_small = st.sampled_from(["chr1"])
cs = st.one_of(chromosomes, chromosomes_small)

# Run-length data: at least one row on a unique int64 index.
runlengths = data_frames(
    index=indexes(dtype=np.int64, min_size=1, unique=True),
    columns=[
        column("Runs", st.integers(min_value=1, max_value=int(1e7))),
        # The values are bounded because R S4vectors translates too-big floats
        # into inf, which is unequal to eg -1.79769e+308, so the tests fail.
        # (The column currently draws bounded integers.)
        column("Values", st.integers(min_value=-int(1e7), max_value=int(1e7)))
    ])

# Interval-style frames that may be empty (min_size=0); `lengths` and
# `small_lengths` are not defined in this part of the file.
better_dfs_no_min = data_frames(
    index=indexes(dtype=np.int64, min_size=0, unique=True, elements=lengths),
    columns=[
        column("Chromosome", cs),
        column("Start", elements=lengths),
        column("End", elements=small_lengths),
        # column("Name", elements=names),
        # column("Score", elements=scores),
        column("Strand", strands)
    ])
Exemple #37
0
def alf_column(plane):
    """Return the float64 column spec named ``"ALF" + plane``.

    Elements come from ``floats`` with NaN and infinities disallowed.
    """
    finite_floats = floats(allow_nan=False, allow_infinity=False)
    return column("ALF" + plane, elements=finite_floats, dtype=np.float64)
Exemple #38
0
def bet_column(plane):
    """Return the float64 column spec named ``"BET" + plane``.

    Elements come from ``floats`` with a lower bound of 1e-7 and with NaN and
    infinities disallowed.
    """
    bounded_floats = floats(
        min_value=1e-7,
        allow_nan=False,
        allow_infinity=False,
    )
    return column("BET" + plane, elements=bounded_floats, dtype=np.float64)
    text(), primitive_strategy) | lists(primitive_strategy)

# Recursively nested containers: starting from `container_strategy`, each
# level wraps the previous one in a list or in a dict keyed by text.
nested_strategy = recursive(
    container_strategy,
    lambda children: one_of(
        lists(children),
        dictionaries(text(), children),
    ),
)

# Arrays whose dtype comes from `guaranteed_dtypes` and whose shape comes from
# `array_shapes()`.
numpy_strategy = arrays(guaranteed_dtypes, array_shapes())

# A pandas Series of int, float or str dtype.
pandas_series = one_of(
    series(dtype=int),
    series(dtype=float),
    series(dtype=str),
)

# A pandas DataFrame that is either three columns of one dtype (int, float or
# str) or one column each of str, float and int.
pandas_dfs = one_of(
    data_frames(columns(3, dtype=int)),
    data_frames(columns(3, dtype=float)),
    data_frames(columns(3, dtype=str)),
    data_frames([
        column(dtype=str),
        column(dtype=float),
        column(dtype=int),
    ]),
)

# Any supported input: a list of primitives, a numpy array or a pandas Series.
# Data frames (`pandas_dfs`) are currently left out of the union.
possible_input_data = (
    lists(primitive_strategy)
    | numpy_strategy
    | pandas_series
    # | pandas_dfs
)

# Fixed fixtures built from the integers 0..19.

# 20 x 20 frame in which every row is 0, 1, ..., 19.
TEST_DF = pd.DataFrame(np.tile(np.arange(20), (20, 1)))

# Series holding 0..19.
TEST_SERIES = pd.Series(np.arange(0, 20, 1))

# One-dimensional array holding 0..19.
TEST_ARRAY = np.arange(0, 20, 1)
from __future__ import division, print_function, absolute_import

import numpy as np
import pytest

import hypothesis.strategies as st
import hypothesis.extra.numpy as npst
import hypothesis.extra.pandas as pdst
from hypothesis import HealthCheck, given, reject, settings
from hypothesis.types import RandomWithSeed as Random
from tests.common.debug import minimal, find_any
from tests.pandas.helpers import supported_by_pandas


@given(pdst.data_frames(columns=[
    pdst.column('a', dtype=int),
    pdst.column('b', dtype=float),
]))
def test_can_have_columns_of_distinct_types(df):
    """Each generated column keeps the dtype requested for it."""
    for label, requested in (('a', int), ('b', float)):
        assert df[label].dtype == np.dtype(requested)


@given(pdst.data_frames(
    columns=[pdst.column(dtype=int)],
    index=pdst.range_indexes(min_size=1, max_size=5),
))
def test_respects_size_bounds(df):
    """The frame length stays within the index strategy's size bounds."""
    n_rows = len(df)
    assert 1 <= n_rows and n_rows <= 5


@given(pdst.data_frames(pdst.columns(['A', 'B'], dtype=float)))