def test_diff(self):
    """Compare NaN placement of groupby ``rolling_diff`` with pandas ``diff``.

    A Dataset of five random float columns plus a random three-valued key
    column is built, mirrored into a pandas DataFrame, and differenced per
    group by both libraries. For every column of the riptable result the
    NaN / non-NaN pattern must match the pandas column of the same name.

    NOTE(review): only the NaN pattern is checked here; the differenced
    values themselves are never compared -- confirm whether that is intended.
    """
    import pandas as pd

    random_columns = {'col_' + str(i): np.random.rand(10) for i in range(5)}
    ds = Dataset(random_columns)
    ds.keycol = np.random.choice(['a', 'b', 'c'], 10)
    df = pd.DataFrame(ds.asdict())

    rt_result = ds.gb('keycol').rolling_diff()
    pd_result = df.groupby('keycol').diff()

    for name, rt_col in rt_result.items():
        pd_col = pd_result[name]

        # NaNs must appear at exactly the same positions in both results.
        pd_nan_mask = isnan(pd_col)
        nan_masks_agree = np.all(isnan(rt_col) == pd_nan_mask)
        self.assertTrue(bool(nan_masks_agree), msg=f'{rt_col} {pd_col}')

        # Same check expressed through the complementary (non-NaN) masks.
        pd_valid_mask = isnotnan(pd_col)
        rt_valid_mask = isnotnan(rt_col)
        valid_masks_agree = np.all(pd_valid_mask == rt_valid_mask)
        self.assertTrue(bool(valid_masks_agree))
def test_allnans(self, arg):
    """Check that ``rt.nanmin`` warns and yields NaN for the supplied input.

    ``arg`` is provided by the caller (presumably a pytest parametrization
    of all-NaN inputs -- the decorator is outside this view, so confirm).
    """
    # rt.nanmin is expected to emit a RuntimeWarning when its input
    # consists entirely of NaNs **on the specified axis**.
    with pytest.warns(RuntimeWarning):
        nanmin_result = rt.nanmin(arg)

    # For a scalar or 1D array (or a collection converted to one) the result
    # should be a single NaN. For higher-rank arrays the result should be an
    # array with one dimension collapsed, holding a NaN wherever the selected
    # axis contained only NaNs.
    # TODO: Fix this assertion so it is also correct when rt.nanmin is called
    #       with a higher-rank array.
    assert rt.isnan(nanmin_result)
def test_cumcount_vs_gb(self):
    """Check ``cumcount`` agreement between a groupby and a Categorical.

    The same 50-element random key column is grouped once through
    ``Dataset.gb(...)`` and once through ``Categorical(...)``; the two
    ``cumcount`` results are subtracted and the summed difference must be
    zero. A second call passes a ``filter`` mask and checks that the result
    is non-NaN where the mask is set and NaN where it is not.

    NOTE(review): a summed difference of zero would also be produced by
    offsetting positive/negative differences -- confirm this is acceptable.
    """
    keys = np.random.choice(['a', 'b', 'c', 'd', 'e'], 50)
    ds = Dataset({'keycol': keys, 'col1': arange(50), 'col2': arange(50)})
    from_groupby = ds.gb('keycol').cumcount()

    cat = Categorical(ds.keycol)
    from_categorical = cat.cumcount()

    difference = from_groupby - from_categorical
    assert sum(difference) == 0

    # Mask built from ``arange(50) % 2``, i.e. nonzero at the odd positions.
    keep = logical(arange(50) % 2)
    drop = ~keep
    filtered_counts = cat.cumcount(filter=keep)
    assert bool(np.all(isnotnan(filtered_counts[keep])))
    assert bool(np.all(isnan(filtered_counts[drop])))
def test_roundtrip_rt_pa_rt(self, rt_cat: rt.Categorical, output_writable: bool, have_nulls: bool) -> None:
    """Round-trip an rt.Categorical through ``to_arrow`` / ``from_arrow``.

    Args:
        rt_cat: The Categorical to convert; filtered first when ``have_nulls``.
        output_writable: Forwarded as ``writable`` to ``Categorical.from_arrow``.
        have_nulls: When true, every element whose index satisfies
            ``index % 3 == 1`` is filtered out before the conversion.
    """
    category_modes = rt.rt_enum.CategoryMode
    shape_before_filter = rt_cat.shape

    if have_nulls:
        # riptable's filtering/masking takes a *valid* mask: False marks null/NA.
        positions = np.arange(len(rt_cat))
        valid_mask = positions % 3 != 1
        rt_cat = rt_cat.filter(valid_mask)
        assert rt_cat.shape == shape_before_filter

        # As of riptable 1.1.0, isfiltered() doesn't behave as expected for
        # Dictionary/IntEnum-mode Categoricals, so inspect the underlying array.
        if rt_cat.category_mode in (category_modes.Dictionary, category_modes.IntEnum):
            filtered_flags = rt.isnan(rt_cat._fa)
        else:
            filtered_flags = rt_cat.isfiltered()
        filtered_element_count = filtered_flags.sum()
        assert filtered_element_count == (len(rt_cat) - valid_mask.sum())

    result_pa_arr = rt_cat.to_arrow()

    # The pyarrow array must have the right length, be dictionary-typed, and
    # carry at least as many categories as the source Categorical.
    assert len(rt_cat) == len(result_pa_arr)
    assert pat.is_dictionary(result_pa_arr.type)
    arrow_category_count = len(result_pa_arr.dictionary)
    rt_category_count = len(next(iter(rt_cat.category_dict.values())))
    assert arrow_category_count >= rt_category_count, \
        "The number of categories in the pyarrow array's dictionary is smaller than the number of categories in the input Categorical."

    if have_nulls:
        assert valid_mask.sum() > 0
        assert (len(rt_cat) - valid_mask.sum()) == result_pa_arr.null_count

    # TEMP: Some cases are marked XFAIL here because of issues in Categorical:
    #   * A pre-filtered (i.e. filtered at construction time) Dictionary- or
    #     IntEnum-mode Categorical cannot be created.
    #   * Filtering a Dictionary- or IntEnum-mode Categorical drops unused
    #     categories, unlike the other Categorical modes.
    #   * MultiKey Categoricals can't be created from an explicit list of
    #     category arrays + an index array, as the other modes allow.
    mode = rt_cat.category_mode
    is_multikey = mode == category_modes.MultiKey
    is_dictionary = mode == category_modes.Dictionary
    if is_multikey or (have_nulls and is_dictionary):
        pytest.xfail("Expected failure due to issues with the Categorical constructor and/or filtering.")

    result_cat = rt.Categorical.from_arrow(result_pa_arr, zero_copy_only=False, writable=output_writable)

    # relaxed_cat_check <==> rt_cat.ordered: ordered categories are expected to
    # keep their positions across the round trip, so they should map to the
    # same integer codes before and after.
    # Multi-key cats always seem to be ordered, even when created with ordered=False.
    # TODO: Remove CategoryMode.Dictionary from the relaxed check -- it fails
    #       because the pyarrow encoding doesn't currently preserve unused
    #       entries of the name <-> code mapping. Once that's fixed, the
    #       stronger equality check should be usable.
    use_relaxed_check = rt_cat.ordered or is_multikey or is_dictionary
    assert_array_or_cat_equal(rt_cat, result_cat, relaxed_cat_check=use_relaxed_check)