コード例 #1
0
    def test_diff(self):
        """Check that grouped ``rolling_diff`` has NaNs in the same places as pandas' grouped ``diff``.

        Only the NaN / non-NaN layout of each result column is compared; the
        differenced values themselves are not checked by this test.
        """
        import pandas as pd

        # Five random columns of length 10 plus a random three-valued key column.
        dataset = Dataset({'col_' + str(idx): np.random.rand(10) for idx in range(5)})
        dataset.keycol = np.random.choice(['a', 'b', 'c'], 10)
        frame = pd.DataFrame(dataset.asdict())

        rt_diffs = dataset.gb('keycol').rolling_diff()
        pd_diffs = frame.groupby('keycol').diff()

        for name, rt_col in rt_diffs.items():
            pd_col = pd_diffs[name]

            # NaN positions must line up between the two libraries.
            pd_nan_mask = isnan(pd_col)
            nan_layout_matches = np.all(isnan(rt_col) == pd_nan_mask)
            self.assertTrue(bool(nan_layout_matches), msg=f'{rt_col} {pd_col}')

            # The positions holding valid (non-NaN) values must line up as well.
            pd_valid_mask = isnotnan(pd_col)
            rt_valid_mask = isnotnan(rt_col)
            valid_layout_matches = np.all(pd_valid_mask == rt_valid_mask)
            self.assertTrue(bool(valid_layout_matches))
コード例 #2
0
    def test_allnans(self, arg):
        """Check that ``rt.nanmin`` emits a RuntimeWarning for ``arg`` and returns NaN."""
        # rt.nanmin is expected to raise a RuntimeWarning when the input
        # holds nothing but NaNs **on the specified axis**.
        with pytest.warns(RuntimeWarning):
            reduced = rt.nanmin(arg)

        # For a scalar or 1D array (or a collection converted to one) the
        # reduction should be a single NaN. For higher-rank arrays the result
        # should be an array with one dimension collapsed, holding a NaN
        # wherever the selected axis contained only NaNs.
        # TODO: Need to fix this to assert correctly for when rt.nanmin called with a higher-rank array.
        reduced_is_nan = rt.isnan(reduced)
        assert reduced_is_nan
コード例 #3
0
    def test_cumcount_vs_gb(self):
        """Compare ``cumcount`` from a groupby with ``cumcount`` from a Categorical built on the same keys.

        Also checks that ``Categorical.cumcount(filter=...)`` gives non-NaN
        values where the filter is set and NaN values everywhere else.
        """
        row_count = 50
        keys = np.random.choice(['a', 'b', 'c', 'd', 'e'], row_count)
        dataset = Dataset({'keycol': keys, 'col1': arange(row_count), 'col2': arange(row_count)})
        groupby_counts = dataset.gb('keycol').cumcount()

        key_cat = Categorical(dataset.keycol)
        cat_counts = key_cat.cumcount()

        # Both routes should give the same counts, so the differences sum to zero.
        # NOTE(review): offsetting positive/negative differences would also sum
        # to zero -- confirm whether an element-wise comparison is wanted here.
        difference = groupby_counts - cat_counts
        assert sum(difference) == 0

        # Mask built from `index % 2` (presumably True at the odd row indices).
        odd_rows = logical(arange(row_count) % 2)
        filtered_counts = key_cat.cumcount(filter=odd_rows)
        assert bool(np.all(isnotnan(filtered_counts[odd_rows])))
        assert bool(np.all(isnan(filtered_counts[~odd_rows])))
コード例 #4
0
    def test_roundtrip_rt_pa_rt(self, rt_cat: rt.Categorical, output_writable: bool, have_nulls: bool) -> None:
        """Round-trip an rt.Categorical through pyarrow (``to_arrow`` then ``from_arrow``) and compare it to the input.

        When ``have_nulls`` is true, a filter is applied to the input first so
        the pyarrow array is expected to contain nulls. ``output_writable`` is
        forwarded to ``from_arrow`` as its ``writable`` argument.
        """
        mode_enum = rt.rt_enum.CategoryMode

        original_shape = rt_cat.shape
        if have_nulls:
            # riptable filters take a *valid* mask, i.e. False marks an element as null/NA.
            # Every element whose index is 1 modulo 3 is masked out here.
            positions = np.arange(len(rt_cat))
            valid_mask = positions % 3 != 1
            rt_cat = rt_cat.filter(valid_mask)
            assert rt_cat.shape == original_shape

            # As of riptable 1.1.0, isfiltered() doesn't work as expected for
            # Dictionary/IntEnum-mode Categoricals, so for those modes the NaN
            # check on the underlying array is used instead.
            if rt_cat.category_mode in (mode_enum.Dictionary, mode_enum.IntEnum):
                filtered_flags = rt.isnan(rt_cat._fa)
            else:
                filtered_flags = rt_cat.isfiltered()
            filtered_total = filtered_flags.sum()
            assert filtered_total == (len(rt_cat) - valid_mask.sum())

        arrow_array = rt_cat.to_arrow()

        # The pyarrow array must have the right length, be dictionary-encoded,
        # and carry at least as many categories as the input.
        assert len(rt_cat) == len(arrow_array)
        assert pat.is_dictionary(arrow_array.type)
        arrow_category_count = len(arrow_array.dictionary)
        input_category_count = len(next(iter(rt_cat.category_dict.values())))
        assert arrow_category_count >= input_category_count, \
            "The number of categories in the pyarrow array's dictionary is smaller than the number of categories in the input Categorical."

        if have_nulls:
            valid_total = valid_mask.sum()
            assert valid_total > 0
            assert (len(rt_cat) - valid_total) == arrow_array.null_count

        # TEMP: Some cases are marked XFAIL here because of issues in Categorical:
        #         * A pre-filtered (i.e. filtered at construction time) Dictionary- or IntEnum-mode
        #           Categorical cannot be created.
        #         * Filtering a Dictionary- or IntEnum-mode Categorical drops unused categories,
        #           which differs from the behavior of the other Categorical modes.
        #         * MultiKey Categoricals can't be created from an explicit list of category arrays
        #           plus an index array, unlike the other Categorical modes.
        current_mode = rt_cat.category_mode
        is_multikey = current_mode == mode_enum.MultiKey
        if is_multikey or (have_nulls and current_mode == mode_enum.Dictionary):
            pytest.xfail("Expected failure due to issues with the Categorical constructor and/or filtering.")

        roundtripped = rt.Categorical.from_arrow(arrow_array, zero_copy_only=False, writable=output_writable)

        # relaxed_cat_check <==> rt_cat.ordered: when the categories are ordered we expect them to keep
        # their positions through the round trip, so each maps to the same integer before and after.
        # Multi-key cats always seem to be ordered, even when created with ordered=False.
        # TODO: Remove CategoryMode.Dictionary from the relaxed_cat_check here -- it's failing because our
        #       encoding in pyarrow doesn't currently preserve unused entries from the name <-> code
        #       mapping. Once that's fixed we should be able to use the stronger equality check.
        use_relaxed_check = (
            rt_cat.ordered
            or rt_cat.category_mode == mode_enum.MultiKey
            or rt_cat.category_mode == mode_enum.Dictionary
        )
        assert_array_or_cat_equal(rt_cat, roundtripped, relaxed_cat_check=use_relaxed_check)