def test_cat_base_index_0(self):
    """Categorical.str.removetrailing strips trailing whitespace from category labels when base_index=0."""
    codes = np.tile([0, 1], 100)
    cat = rt.Categorical(codes, ['abc ', 'bcd'], base_index=0)

    trimmed = cat.str.removetrailing()

    # 'abc ' loses its trailing space; categories stay 4-byte bytestrings ('S4').
    expected_categories = np.asarray(['abc', 'bcd']).astype('S4')
    expected = rt.Categorical(codes, expected_categories, base_index=0)
    assert_array_or_cat_equal(trimmed, expected)
def test_fill_forward(self):
    """
    Test that Categorical.fill_forward fills values forward *per group*.
    """
    values = rt.FA([1.0, 4.0, 9.0, 16.0, np.nan, np.nan])
    groups = rt.Categorical(['A', 'B', 'A', 'B', 'A', 'B'])

    filled = groups.fill_forward(values)

    # The result of this function should be a Dataset.
    assert isinstance(filled, rt.Dataset)

    # The dataset should have the same number of rows as the data arrays
    # we operated on (an invariant of apply_nonreduce/"scan"/"prefix sum").
    assert filled.shape[0] == len(values)

    # The dataset should have (N+M) columns, where N is the number
    # of keys within the Categorical and M is the number of columns
    # we performed the operation on.
    assert filled.shape[1] == len(groups.category_dict) + 1

    # Check the resulting data; each NaN picks up the last valid value
    # seen in its own group ('A' -> 9.0, 'B' -> 16.0).
    assert_array_equal(filled[0], rt.FA([1.0, 4.0, 9.0, 16.0, 9.0, 16.0]))
def test_single_col_categoricals(self, func, func_name: str, data_dtype):
    """Compare a riptable Categorical groupby reduction against the pandas equivalent."""
    values = [0, 1, 1, 2, 2, 2, 3, 3, 3, 4]
    bin_ids = ['a', 'b', 'c', 'd', 'e']
    # Fixed sample originally produced by:
    #   np.random.rand(10) + np.random.randint(0, 10, size=10)
    data = rt.FastArray(
        np.array([
            7.19200901, 0.14907245, 2.28258397, 5.07872708, 0.76125165,
            1.32797916, 3.40280423, 4.48942476, 6.98713656, 4.39541456,
        ]),
        dtype=data_dtype,
    )

    pd_grouped = pd.DataFrame({'vs': data, 'ks': values}).groupby(by='ks')
    rt_grouped = rt.Categorical(values=values, categories=bin_ids, base_index=0)

    pd_out = self.groupby_func(pd_grouped, func, None)
    rt_out = self.groupby_func(rt_grouped, func, data)

    # 'count' results are keyed by a 'Count' column; other reductions are positional.
    col_index = 'Count' if func_name == 'count' else 0
    assert_array_almost_equal(rt_out[col_index], pd_out['vs'].values, decimal=3)
class TestPyarrowConvertDataset:
    """Round-trip tests for converting an rt.Dataset to a pyarrow.Table and back."""

    @pytest.mark.parametrize(('rt_dset',), [
        pytest.param(rt.Dataset({}), id='empty'),
        pytest.param(rt.Dataset({
            'ink_capacity': rt.FA([15, 10, 15, 25, 10, 15, 25, 15]),
            'purchase_date': rt.Date(['2019-06-19', '2019-06-19', '2020-01-15', '2020-05-22', '2020-02-10', '2020-02-10', '2020-03-17', '2020-03-17']),
            'country_code': rt.Categorical(
                # Country codes -- adapted from TestCategorical.test_hstack_fails_for_different_mode_cats.
                [36, 36, 344, 840, 840, 124, 36, 484],
                {
                    'IRL': 372, 'USA': 840, 'AUS': 36, 'HKG': 344, 'JPN': 392,
                    'MEX': 484, 'KHM': 116, 'THA': 764, 'JAM': 388, 'ARM': 51
                },
                ordered=True)
        }))
    ])
    def test_roundtrip_rt_pa_rt(self, rt_dset: rt.Dataset) -> None:
        """Test round-tripping from rt.Dataset to pyarrow.Table and back."""
        result_pa_tbl = rt_dset.to_arrow()
        result_rt_dset = rt.Dataset.from_arrow(result_pa_tbl, zero_copy_only=False)
        # Column names and their order must survive the round trip.
        assert rt_dset.keys() == result_rt_dset.keys()
        for col_name in rt_dset.keys():
            # relaxed_cat_check=True, because we're not trying to test specific details of Categorical conversion
            # here, we're more interested in the dataset-level stuff.
            assert_array_or_cat_equal(rt_dset[col_name], result_rt_dset[col_name], relaxed_cat_check=True)
def test_unordered_categorical_disallowed(self):
    """Test which verifies rt.nanmin raises an exception if called with an unordered Categorical."""
    states = ["PA", "NY", "NY", "AL", "LA", "PA", "CA", "IL", "IL", "FL", "FL", "LA"]
    unordered_cat = rt.Categorical(states, ordered=False)
    assert not unordered_cat.ordered

    # The minimum of an unordered Categorical is undefined, so this must raise.
    with pytest.raises(ValueError):
        rt.nanmin(unordered_cat)
def test_aggs_sum_symb_0_10_ncols_7(self):
    """Compare Categorical.sum against pandas groupby-sum on identical data (7 cols, 10% symbols)."""
    fixture = categorical_base(7, 0.10, "sum")
    cat = rt.Categorical(
        values=fixture.bin_ids,
        categories=fixture.keys,
        base_index=default_base_index,
    )
    rt_result = cat.sum(rt.Dataset(fixture.data))

    pd_result = pd.DataFrame(fixture.data).groupby(fixture.bin_ids).sum()

    # Compare column by column, ignoring NaN cells on both sides.
    for col in fixture.data:
        safe_assert(remove_nan(pd_result[col]), remove_nan(rt_result[col]))
def test_aggs_mean_symb_0_40_ncols_6(self):
    """Compare Categorical.mean against pandas groupby-mean on identical data (6 cols, 40% symbols)."""
    fixture = categorical_base(6, 0.40, "mean")
    cat = rt.Categorical(
        values=fixture.bin_ids,
        categories=fixture.keys,
        base_index=default_base_index,
    )
    rt_result = cat.mean(rt.Dataset(fixture.data))

    pd_result = pd.DataFrame(fixture.data).groupby(fixture.bin_ids).mean()

    # Compare column by column, ignoring NaN cells on both sides.
    for col in fixture.data:
        safe_assert(remove_nan(pd_result[col]), remove_nan(rt_result[col]))
def test_aggs_var_symb_0_25_ncols_5(self):
    """Compare Categorical.var against pandas groupby-var on identical data (5 cols, 25% symbols)."""
    fixture = categorical_base(5, 0.25, "var")
    cat = rt.Categorical(
        values=fixture.bin_ids,
        categories=fixture.keys,
        base_index=default_base_index,
    )
    rt_result = cat.var(rt.Dataset(fixture.data))

    pd_result = pd.DataFrame(fixture.data).groupby(fixture.bin_ids).var()

    # Compare column by column, ignoring NaN cells on both sides.
    for col in fixture.data:
        safe_assert(remove_nan(pd_result[col]), remove_nan(rt_result[col]))
def test_alignmk(self):
    """alignmk produces the same alignment for Categorical keys and char-array keys."""
    left = rt.Dataset()
    left['Time'] = [0, 1, 4, 6, 8, 9, 11, 16, 19, 30]
    left['Px'] = [10, 12, 15, 11, 10, 9, 13, 7, 9, 10]

    right = rt.Dataset()
    right['Time'] = [0, 0, 5, 7, 8, 10, 12, 15, 17, 20]
    right['Vols'] = [20, 21, 22, 23, 24, 25, 26, 27, 28, 29]

    expected = rt.FastArray([0, 0, 0, 2, 4, 4, 4, 6, 8, 8])

    # Categorical keys
    left['Ticker'] = rt.Categorical(['Test'] * 10)
    right['Ticker'] = rt.Categorical(['Test', 'Blah'] * 5)
    result = alignmk(left.Ticker, right.Ticker, left.Time, right.Time)
    assert_array_equal(result, expected)

    # char array keys
    left['Ticker'] = rt.FastArray(['Test'] * 10)
    right['Ticker'] = rt.FastArray(['Test', 'Blah'] * 5)
    result = alignmk(left.Ticker, right.Ticker, left.Time, right.Time)
    assert_array_equal(result, expected)
def test_ordered_categorical_returns_scalar(self):
    """
    Test which verifies rt.nanmin returns a scalar (Python object or numpy scalar)
    representing the min Category given an ordered Categorical.
    """
    # Create an ordered Categorical (aka 'ordinal').
    cat = rt.Categorical(
        ["PA", "NY", "", "NY", "AL", "LA", "PA", "", "CA", "IL", "IL", "FL", "FL", "LA"],
        ordered=True)
    assert cat.ordered

    result = rt.nanmin(cat)

    # The result should either be a Python string, a numpy string scalar, or a Categorical scalar (if we implement one).
    is_py_str = isinstance(result, (bytes, str))
    # BUGFIX: was `np.str`, a deprecated alias of builtin `str` that was removed in
    # NumPy 1.24 (deprecated since 1.20) and raised AttributeError on import-time use.
    # `np.str_` is the actual numpy string scalar type intended here.
    is_np_scalar = isinstance(result, np.str_)
    is_rt_cat = isinstance(result, rt.Categorical)
    assert is_py_str or is_np_scalar or is_rt_cat

    # Check the result is correct.
    assert result == "PA"
def test_groupby_categorical_sort(self):
    """
    Test that groupby on a categorical sorts the dataset correctly
    """
    cats = ['z', 'y', 'x', 'w', 'a', 'b', 'c', 'd']
    vals = [0, 1, 2, 3, 4, 5, 6, 7]
    expected = dict(zip(cats, vals))

    ds = rt.Dataset()
    ds["Cat"] = rt.Categorical([cats[i % len(cats)] for i in range(100)])
    # two identical columns
    ds["Value1"] = [vals[i % len(cats)] for i in range(100)]
    ds["Value2"] = [vals[i % len(cats)] for i in range(100)]

    grp = ds.groupby("Cat").mean()
    grp["Expected"] = [expected[label] for label in grp.Cat.astype('U')]

    # Each group's mean equals its constant value, so any nonzero difference
    # indicates the grouped rows were misaligned with their categories.
    total_error = rt.sum(rt.abs(grp.Expected - grp.Value1))
    total_error += rt.sum(rt.abs(grp.Expected - grp.Value2))
    assert total_error <= 1e-9
def get_all_categorical_data() -> List[rt.Categorical]:
    """Returns a list of all the Categorical test data of all supported CategoryModes."""
    return list(map(rt.Categorical, get_categorical_data_factory_method()))
def test_accum_table(self):
    """
    End-to-end test of rt.AccumTable: building inner tables with totals,
    handling blank rows/columns, margin-column ordering, and footer rows.
    """
    # Create the test data
    def unpivot(frame):
        # Melt a (date x variable) frame into long format:
        # one row per (date, variable, value) triple.
        N, K = frame.shape
        data = {
            'value': frame.values.ravel('F'),
            'variable': np.asarray(frame.columns).repeat(N),
            'date': np.tile(np.asarray(frame.index), K),
        }
        return pd.DataFrame(data, columns=['date', 'variable', 'value'])

    np.random.seed(1234)
    df = unpivot(pd.concat([tm.makeTimeDataFrame(), tm.makeTimeDataFrame()]))
    ds = dataset_from_pandas_df(df)
    # Keep just the 'YYYY-MM-DD' prefix of the ISO timestamp string.
    ds.date = DateTimeNano(ds.date, from_tz='NYC').to_iso()
    ds.date = rt.FastArray([d[:10] for d in ds.date])
    ds.variable = rt.Categorical(ds.variable)
    ds.date = rt.Categorical(ds.date)
    at = rt.AccumTable(ds.date, ds.variable)

    # Add and view inner tables with totals
    at['Sum'] = at.sum(ds.value)
    self.assertEqual(at['Sum'].shape, (3, 7))
    assert_array_almost_equal(at['Sum']['A'], np.array([0.47, -0.79, 1.72]), decimal=2)

    vw = at.gen('Sum')
    self.assertEqual(vw.shape, (3, 7))
    assert_array_almost_equal(vw['A'], np.array([0.47, -0.79, 1.72]), decimal=2)
    assert_array_almost_equal(vw['Sum'], np.array([-0.10, -5.02, 5.37]), decimal=2)
    assert_array_almost_equal(vw.footer_get_values(columns=['Sum'])['Sum'], np.array([0.25]), decimal=2)

    at['Mean'] = at.mean(ds.value)
    self.assertEqual(at['Mean'].shape, (3, 7))
    assert_array_almost_equal(at['Mean']['A'], np.array([0.24, -0.39, 0.86]), decimal=2)

    at['Half'] = at['Mean'] / at['Sum']
    self.assertEqual(at['Half'].shape, (3, 7))
    assert_array_almost_equal(at['Half']['A'], np.array([0.5, 0.5, 0.5]), decimal=2)

    # Add and view inner tables with blanks
    at['Blanks'] = at['Sum'].copy()
    at['Blanks']['C'] = 0.0
    for col in at['Blanks'][:, 1:]:
        at['Blanks'][col][2] = np.nan

    vw = at.gen('Blanks')
    self.assertEqual(vw.shape, (2, 9))
    assert_array_almost_equal(vw['A'], np.array([0.47, -0.79]), decimal=2)
    assert_array_almost_equal(vw['Blanks'], np.array([-0.10, -5.02]), decimal=2)
    self.assertAlmostEqual(vw.footer_get_dict()['Blanks']['Blanks'], 0.245, places=2)

    # With remove_blanks=False, the NaN row stays in the view.
    vw = at.gen('Blanks', remove_blanks=False)
    self.assertEqual(vw.shape, (3, 10))
    assert_array_almost_equal(vw['A'], np.array([0.47, -0.79, np.nan]), decimal=2)
    assert_array_almost_equal(vw['Blanks'], np.array([-0.10, -5.02, np.nan]), decimal=2)

    # Test division with zeros and nans
    at['Bad'] = at['Blanks'] / at['Half']
    self.assertEqual(at['Blanks'].shape, (3, 7))
    vw = at.gen('Bad')
    self.assertEqual(vw.shape, (2, 10))
    vw = at.gen('Blanks')
    self.assertEqual(vw.shape, (2, 10))
    vw = at.gen('Half')
    self.assertEqual(vw.shape, (3, 11))

    # Set margin columns to the right
    at.set_margin_columns(['Blanks', 'Mean'])
    vw = at.gen('Half')
    self.assertEqual(vw.shape, (3, 9))
    self.assertEqual(vw.keys()[6], 'Half')
    self.assertEqual(vw.keys()[7], 'Blanks')
    self.assertEqual(vw.keys()[8], 'Mean')
    self.assertEqual(list(vw.footer_get_dict().keys()), ['Half', 'Sum', 'Mean', 'Blanks', 'Bad'])

    vw = at.gen()
    self.assertEqual(vw.keys()[6], 'Half')

    vw = at.gen('Sum')
    self.assertEqual(vw.keys()[6], 'Sum')
    self.assertEqual(vw.keys()[7], 'Blanks')
    self.assertEqual(vw.keys()[8], 'Mean')
    self.assertEqual(list(vw.footer_get_dict().keys()), ['Sum', 'Mean', 'Half', 'Blanks', 'Bad'])

    # Set footer rows at the bottom
    at.set_footer_rows(['Mean'])
    vw = at.gen('Half')
    self.assertEqual(vw.shape, (3, 9))
    self.assertEqual(vw.keys()[6], 'Half')
    self.assertEqual(vw.keys()[7], 'Blanks')
    self.assertEqual(vw.keys()[8], 'Mean')
    self.assertEqual(list(vw.footer_get_dict().keys()), ['Half', 'Mean'])

    vw = at.gen('Sum')
    self.assertEqual(vw.keys()[6], 'Sum')
    self.assertEqual(vw.keys()[7], 'Blanks')
    self.assertEqual(vw.keys()[8], 'Mean')
    self.assertEqual(list(vw.footer_get_dict().keys()), ['Sum', 'Mean'])

    # Access view Dataset elements
    vw = at.gen('Sum')
    assert_array_equal(
        vw.date,
        rt.FastArray(['2000-01-03', '2000-01-04', '2000-01-05']))
    assert_array_almost_equal(vw['Sum'], np.array([-0.10, -5.02, 5.37]), decimal=2)
    assert_almost_equal(vw[vw.date == '2000-01-03', 'A'][0], 0.47355353, decimal=2)
    assert_almost_equal(
        list(vw.footer_get_values('Sum', columns=['A']).values())[0],
        1.409830,
        decimal=2,
    )
class TestPyarrowConvertCategorical:
    """Conversion tests between rt.Categorical and pyarrow dictionary-encoded arrays."""

    @pytest.mark.parametrize(('rt_cat',), [
        # TODO: Add test cases for CategoryMode.IntEnum; at present, it appears IntEnum support is broken, can't seem to create a Categorical in that mode.
        # pytest.param(rt.Categorical([]), id='empty', marks=pytest.mark.skip(reason="rt.Categorical does not support creation from an empty list/array.")),
        pytest.param(rt.Categorical(['red', 'red', 'green', 'blue', 'green', 'red', 'blue'], ordered=False), id='CategoryMode.StringArray'),
        pytest.param(rt.Categorical(['red', 'red', 'green', 'blue', 'green', 'red', 'blue'], ordered=True), id='CategoryMode.StringArray--ordered'),
        pytest.param(rt.Categorical(['red', 'red', 'green', 'blue', 'green', 'red', 'blue'], dtype=np.int8, ordered=False), id='CategoryMode.StringArray;int8;ordered=False'),
        pytest.param(rt.Categorical(['red', 'red', 'green', 'blue', 'green', 'red', 'blue'], dtype=np.int8, ordered=True), id='CategoryMode.StringArray;int8;ordered=True'),
        pytest.param(rt.Categorical([f"x{i}" for i in range(0, 127)], dtype=np.int8), id="max number of categories for a signed int backing array without causing overflow"),
        # N.B. The test cases below for Categorical[Date] require pyarrow 5.0.0 or higher; dictionary-encoded date32() arrays didn't work before then.
        pytest.param(
            rt.Categorical(
                rt.Date(['2019-06-19', '2019-06-19', '2020-01-15', '2020-05-22', '2020-02-10', '2020-02-10', '2020-03-17', '2020-03-17']),
                ordered=False),
            id="Categorical[Date];ordered=False"),
        pytest.param(
            rt.Categorical(
                rt.Date(['2019-06-19', '2019-06-19', '2020-01-15', '2020-05-22', '2020-02-10', '2020-02-10', '2020-03-17', '2020-03-17']),
                ordered=True),
            id="Categorical[Date];ordered=True"
        ),
        pytest.param(rt.Categorical(
            # Country codes -- adapted from TestCategorical.test_hstack_fails_for_different_mode_cats.
            [36, 36, 344, 840, 840, 372, 840, 372, 840, 124, 840, 124, 36, 484],
            {
                'IRL': 372, 'USA': 840, 'AUS': 36, 'HKG': 344, 'JPN': 392,
                'MEX': 484, 'KHM': 116, 'THA': 764, 'JAM': 388, 'ARM': 51
            },
            ordered=False
        ), id="CategoryMode.Dictionary;ordered=False;Unicode"),
        pytest.param(rt.Categorical(
            # Country codes -- adapted from TestCategorical.test_hstack_fails_for_different_mode_cats.
            [36, 36, 344, 840, 840, 372, 840, 372, 840, 124, 840, 124, 36, 484],
            {
                'IRL': 372, 'USA': 840, 'AUS': 36, 'HKG': 344, 'JPN': 392,
                'MEX': 484, 'KHM': 116, 'THA': 764, 'JAM': 388, 'ARM': 51
            },
            ordered=True
        ), id="CategoryMode.Dictionary;ordered=True;Unicode"),
        pytest.param(rt.Categorical(
            # Country codes -- adapted from TestCategorical.test_hstack_fails_for_different_mode_cats.
            [36, 36, 344, 840, 840, 372, 840, 372, 840, 124, 840, 124, 36, 484],
            {
                b'IRL': 372, b'USA': 840, b'AUS': 36, b'HKG': 344, b'JPN': 392,
                b'MEX': 484, b'KHM': 116, b'THA': 764, b'JAM': 388, b'ARM': 51
            },
            ordered=False
        ), id="CategoryMode.Dictionary;ordered=False;ASCII"),
        pytest.param(rt.Categorical(
            # Country codes -- adapted from TestCategorical.test_hstack_fails_for_different_mode_cats.
            [36, 36, 344, 840, 840, 372, 840, 372, 840, 124, 840, 124, 36, 484],
            {
                b'IRL': 372, b'USA': 840, b'AUS': 36, b'HKG': 344, b'JPN': 392,
                b'MEX': 484, b'KHM': 116, b'THA': 764, b'JAM': 388, b'ARM': 51
            },
            ordered=True
        ), id="CategoryMode.Dictionary;ordered=True;ASCII"),
        pytest.param(rt.Categorical(
            [
                rt.FastArray(['Cyan', 'Magenta', 'Yellow', 'Black', 'Magenta', 'Cyan', 'Black', 'Yellow']).set_name('InkColor'),
                rt.Date(['2019-06-19', '2019-06-19', '2020-01-15', '2020-05-22', '2020-02-10', '2020-02-10', '2020-03-17', '2020-03-17']).set_name('CartridgeInstallDate')
            ]
        ), id="CategoryMode.MultiKey")
    ])
    @pytest.mark.parametrize('output_writable', [False, True])
    @pytest.mark.parametrize('have_nulls', [False, True])
    def test_roundtrip_rt_pa_rt(self, rt_cat: rt.Categorical, output_writable: bool, have_nulls: bool) -> None:
        """Test round-tripping from rt.Categorical to pyarrow.Array/pyarrow.Table and back."""
        orig_cat_shape = rt_cat.shape
        if have_nulls:
            # riptable's filtering/masking uses a valid mask (where False means null/NA).
            indices = np.arange(len(rt_cat))
            valid_mask = indices % 3 != 1
            rt_cat = rt_cat.filter(valid_mask)
            # Filtering marks elements as null/NA but does not change the shape.
            assert rt_cat.shape == orig_cat_shape

            # isfiltered() doesn't work as expected for Dictionary/IntEnum-mode Categorical as of riptable 1.1.0.
            filtered_element_count = (rt.isnan(rt_cat._fa) if rt_cat.category_mode in (rt.rt_enum.CategoryMode.Dictionary, rt.rt_enum.CategoryMode.IntEnum) else rt_cat.isfiltered()).sum()
            assert filtered_element_count == (len(rt_cat) - valid_mask.sum())

        result_pa_arr = rt_cat.to_arrow()

        # Verify the pyarrow array has the correct length, number of categories, etc.
        assert len(rt_cat) == len(result_pa_arr)
        assert pat.is_dictionary(result_pa_arr.type)
        assert len(result_pa_arr.dictionary) >= len(next(iter(rt_cat.category_dict.values()))), \
            "The number of categories in the pyarrow array's dictionary is smaller than the number of categories in the input Categorical."
        if have_nulls:
            assert valid_mask.sum() > 0
            assert (len(rt_cat) - valid_mask.sum()) == result_pa_arr.null_count

        # TEMP: Certain cases are marked as XFAIL here due to issues in Categorical.
        #   * Cannot create a pre-filtered (i.e. filtered at construction time) Dictionary- or IntEnum-mode Categorical.
        #   * Filtering a Dictionary- or IntEnum-mode Categorical causes unused categories to be dropped,
        #     which is not the same behavior as for other Categorical modes.
        #   * MultiKey Categoricals can't be created with an explicit list of category arrays + an index array,
        #     like what is supported for other Categorical modes.
        if rt_cat.category_mode == rt.rt_enum.CategoryMode.MultiKey or (have_nulls and rt_cat.category_mode == rt.rt_enum.CategoryMode.Dictionary):
            pytest.xfail("Expected failure due to issues with the Categorical constructor and/or filtering.")

        result_cat = rt.Categorical.from_arrow(result_pa_arr, zero_copy_only=False, writable=output_writable)

        # relaxed_cat_check <==> rt_cat.ordered, because if the categories are ordered, we expect them to be
        # in the same position after being roundtripped, so they should be mapped to the same integer before/after.
        # multi-key cats always seem to be ordered, even if ordered=False is specified when creating them.
        # TODO: Remove CategoryMode.Dictionary from the relaxed_cat_check here -- it's failing because our encoding in
        #       pyarrow doesn't currenly preserve unused entries from the name <-> code mapping. Once that's fixed
        #       we should be able to use the stronger equality check.
        assert_array_or_cat_equal(rt_cat, result_cat, relaxed_cat_check=rt_cat.ordered or rt_cat.category_mode == rt.rt_enum.CategoryMode.MultiKey or rt_cat.category_mode == rt.rt_enum.CategoryMode.Dictionary)

    @pytest.mark.parametrize(('num_cats', 'dtype'), [
        pytest.param(127, np.uint8),
        pytest.param(128, np.uint8),
        pytest.param(129, np.uint8),
        pytest.param(32769, np.uint16)
    ])
    @pytest.mark.parametrize('ordered', [False, True])
    @pytest.mark.parametrize('output_writable', [False, True])
    @pytest.mark.parametrize('have_nulls', [False, True])
    def test_pa_to_rt_unsigned(self, num_cats, dtype, ordered: bool, output_writable: bool, have_nulls: bool) -> None:
        """Test converting a pyarrow dictionary array with an unsigned-integer index dtype to rt.Categorical."""
        # Create a numpy array containing `num_cats` distinct strings.
        cat_labels = np.array([f"x{i}" for i in range(0, num_cats)])
        indices = np.arange(num_cats, dtype=dtype)

        # Create the pyarrow dict-encoded array.
        if have_nulls:
            # pyarrow uses an INvalid mask (where True means null/NA).
            invalid_mask = indices % 7 == 3
            pa_indices = pa.array(indices, mask=invalid_mask)
            pa_arr = pa.DictionaryArray.from_arrays(pa_indices, cat_labels, ordered=ordered)
        else:
            pa_arr = pa.DictionaryArray.from_arrays(indices, cat_labels, ordered=ordered)
        assert len(pa_arr.dictionary) == num_cats

        # Create the Categorical from the pyarrow array.
        result_cat = rt.Categorical.from_arrow(pa_arr, zero_copy_only=False, writable=output_writable)

        if have_nulls:
            # Null entries in the pyarrow array should come back as filtered elements.
            result_invalid_mask = result_cat.isfiltered()
            assert_array_equal(result_invalid_mask, invalid_mask)