def test_categorical_ctor(value_strategy, category_mode, data):
    """Exercise several ``Categorical`` construction paths on a drawn Categorical.

    A Categorical is drawn from ``CategoricalStrategy`` and then rebuilt three
    ways -- from its values and categories, from another Categorical, and with
    the ``_from_categorical`` keyword. ``_check_categorical`` is asserted on
    every instance produced along the way.
    """
    is_ordered: bool = data.draw(booleans())
    strategy = CategoricalStrategy(
        value_strategy, category_mode=category_mode, ordered=is_ordered
    )
    drawn: Categorical = data.draw(strategy)
    assert _check_categorical(drawn)

    # Rebuild from the drawn Categorical's expanded values and its categories.
    expanded = drawn.expand_array
    cats = drawn._categories
    if category_mode == CategoryMode.Dictionary:
        # Dictionary-mode Categoricals are rebuilt from the category mapping instead.
        cats = drawn.category_mapping
    rebuilt = Categorical(expanded, categories=cats, ordered=is_ordered)
    assert _check_categorical(rebuilt)

    # Construct a Categorical directly from another Categorical.
    copied = Categorical(rebuilt)
    assert _check_categorical(copied)

    # Per the original author's note, _from_categorical is a fast path that skips
    # internal routine checks, sorting, and making values unique, yet should give
    # a result identical to the original Categorical.
    wrapped_categories = drawn._categories_wrap
    fast_path = Categorical(
        expanded,
        categories=cats,
        _from_categorical=wrapped_categories,
        ordered=is_ordered,
    )
    assert _check_categorical(fast_path)
Exemple #2
0
    def test_groupby_ops_multikey_dict(self):
        """Run the shared groupby checks on a multikey Categorical built from a dict.

        Also builds a multikey Categorical from a list of arrays with conflicting
        names and checks the number of key columns and the resulting column names.
        """
        mk_dict = {'string1': str_fa, 'string2': str_fa}
        mk_gb = Dataset({
            'string1': str_fa.copy(),
            'string2': str_fa.copy(),
            'ints': int_fa,
            'floats': flt_fa,
            'tens': tens,
        }).gbu(['string1', 'string2'])
        c = Categorical(mk_dict)
        self.funnel_all_tests(c, mk_gb, "multikey dictionary", sorted=False)

        # setitem hits comparison functions - need to rewrite these tests after comparison behavior change
        # self.mk_set_item(mk_dict, constructor_name="multikey dictionary")

        # conflicting names: two arrays share the name 'strings'
        x = str_fa.copy()
        y = str_fa.copy()
        z = str_fa.copy()
        x.set_name('strings')
        y.set_name('strings')
        z.set_name('strings1')
        c = Categorical([x, y, z])

        ncols = c._categories_wrap.ncols
        assert ncols == 3, (
            f"incorrect number of columns for multikey from list. {ncols} vs. 3"
        )

        # 04/25/2019 all default column names now happen in grouping object
        actual_names = list(c.categories().keys())
        expected_names = ['strings', GROUPBY_KEY_PREFIX + '_c1', 'strings1']
        # Report the list actually asserted against; the old message hard-coded
        # a stale expectation ('strings2') that no longer matched the check.
        assert actual_names == expected_names, (
            f"column names did not match for multikey from list. {actual_names} vs. {expected_names}"
        )
Exemple #3
0
    def test_pre_filter(self):
        """Check that a Categorical built with ``filter=`` keeps no ``_filter`` and sums to 150."""
        c = Categorical(str_fa, filter=even_filter)
        # Identity test: `== None` would dispatch to any overloaded __eq__ (an
        # array-like would compare elementwise instead of yielding a single bool).
        assert c._filter is None

        result = c.sum(ds_nums)
        one_fifty = sum(result.tens)
        assert one_fifty == 150
Exemple #4
0
    def simple_string_set_item(self, *args, **kwargs):
        '''
        Helper: overwrite the 'b' entries of a Categorical with 'a' and check the result.

        Positional and keyword arguments are forwarded to ``Categorical``. The
        required ``constructor_name`` keyword is removed first and is only used
        to label assertion failures.

        Original author's note (SJK, 9/24/2018): this test needs to be updated
        with different data that reflects the new comparison behavior.
        '''
        source = kwargs.pop('constructor_name')

        if 'categories' in kwargs:
            # Hand the constructor a copy of the categories, not the caller's object.
            kwargs['categories'] = kwargs['categories'].copy()

        c = Categorical(*args, **kwargs)
        replacements = (b'a', 'a')

        # index by string (bytes and str spellings of both key and value)
        set_items = [(key, val) for key in (b'b', 'b') for val in replacements]
        # index by bool array
        # boolean arrays can no longer be generated with these comparisons SJK 9/24/2018
        set_items += [(c == key, val) for key in (b'b', 'b') for val in replacements]
        # integer index; the same index list is exercised three times over
        set_items += [
            ([5, 9, 16, 18, 21], val) for _ in range(3) for val in replacements
        ]
        # Disabled in the original and still not exercised: integer replacement
        # values (e.g. (b'b', 1)) and masks built from an integer (c == 2).

        # this test needs to get reworked
        # no longer produces the correct result for all types of categoricals because of == comparison behavior
        for items in set_items:
            target, replacement = items
            c = Categorical(*args, **kwargs)
            goal = c == ['a', 'b']
            c[target] = replacement
            result = c == replacement

            all_set = np.sum(goal == result)
            assert all_set == 30, (
                f"did not set c[{items[0]}] to {items[1]} for categorical from {source}"
            )

            none_left = np.sum(c == 'b')
            assert none_left == 0, (
                f"did not set c[{items[0]}] to {items[1]} for categorical from {source}"
            )
Exemple #5
0
    def test_single_key_string_count(self):
        """Check ``count()`` on single-key Categoricals built three different ways.

        The same expected counts are asserted for a Categorical made from the
        string array, from codes with sorted categories, and from codes with
        unsorted categories; the last one's key column is also compared.
        """
        correct_counts = FastArray([4, 5, 9, 6, 6])

        # for sorting/count bug fix 8/21/2018
        from_strings = Categorical(str_fa)
        assert bool(np.all(from_strings.count().Count == correct_counts))

        from_codes = Categorical(sorted_codes, complete_unique_cats, base_index=0)
        assert bool(np.all(from_codes.count().Count == correct_counts))

        from_codes_unsorted = Categorical(
            sorted_codes, unsorted_unique_cats, base_index=0
        )
        assert bool(np.all(from_codes_unsorted.count().Count == correct_counts))

        # 8/24/2018 SJK - default name for groupby key columns might change, so selected this by index
        # also, in most cases (save intenum/dict) categorical groupby no longer returns a categorical
        result_keys = from_codes_unsorted.count()[1]
        keys_match = bool(np.all(result_keys == unsorted_unique_cats))
        assert keys_match, f"Result: {result_keys} Expected: {unsorted_unique_cats}"
Exemple #6
0
 def test_multikey_count(self):
     """Check ``count()`` on a four-key Categorical against hard-coded counts."""
     key_arrays = [str_fa.copy(), int_fa.copy(), str_fa.copy(), int_fa.copy()]
     result_counts = Categorical(key_arrays).count().Count
     correct_counts = FastArray([6, 5, 1, 2, 3, 2, 2, 4, 2, 2, 1])
     counts_agree = bool(np.all(result_counts == correct_counts))
     assert counts_agree, (
         f"Incorrect result for multikey count for 4 keys. {result_counts} vs. {correct_counts}"
     )
Exemple #7
0
    def mk_set_item(self, *args, **kwargs):
        """Helper: overwrite the ('b', 'b') entries of a multikey Categorical and check the result.

        Positional and keyword arguments are forwarded to ``Categorical``. The
        required ``constructor_name`` keyword is removed first and is only used
        to label assertion failures.

        Raises
        ------
        KeyError
            If ``constructor_name`` is not supplied.
        """
        source = kwargs.pop('constructor_name')

        if 'categories' in kwargs:
            # Hand the constructor a copy of the categories, not the caller's
            # object. (A leftover debug print used to live here; removed so the
            # helper is silent like simple_string_set_item.)
            kwargs['categories'] = kwargs['categories'].copy()

        # Retained from the original: the disabled boolean-mask cases below
        # were built from this instance (e.g. c == ('b', 'b')).
        c = Categorical(*args, **kwargs)

        replacements = ((b'a', b'a'), ('a', 'a'), 5)
        # index by string tuple (bytes and str spellings)
        set_items = [
            (key, val)
            for key in ((b'b', b'b'), ('b', 'b'))
            for val in replacements
        ]
        # index by bool array: disabled in the original, e.g.
        #   (c == (b'b', b'b'), (b'a', b'a')), (c == ('b', 'b'), 5), (c == 4, 5)
        # integer index; the same index list is exercised three times over
        set_items += [
            ([5, 9, 16, 18, 21], val) for _ in range(3) for val in replacements
        ]

        for target, replacement in set_items:
            c = Categorical(*args, **kwargs)
            goal = mask_or([c == ('a', 'a'), c == ('b', 'b')])
            c[target] = replacement
            result = c == replacement

            all_set = np.sum(goal == result)
            assert all_set == 30, (
                f"did not set c[{target}] to {replacement} for categorical from {source}"
            )

            none_left = np.sum(c == ('b', 'b'))
            assert none_left == 0, (
                f"did not set c[{target}] to {replacement} for categorical from {source}"
            )
    def test_gb_categoricals(self):
        """Group a Dataset by enum-built and list-built Categoricals and check the sums.

        Covers grouping by each Categorical alone and by both together in
        either key order, comparing the summed ``nums`` column to hard-coded
        expected arrays.
        """
        codes = [1, 44, 44, 133, 75, 75, 75, 1]
        stringlist = ['a', 'b', 'c', 'd', 'e', 'e', 'f', 'g']
        enum_cat = Categorical(codes, LikertDecision, sort_gb=True)
        list_cat = Categorical(stringlist)
        base_cols = {'nums': np.arange(8)}

        # from enum only
        enum_cols = {**base_cols, 'cat_from_enum': enum_cat}
        enum_result = Dataset(enum_cols).gb('cat_from_enum').sum()
        correct = FastArray([3, 15, 3, 7], dtype=np.int64)
        self.assertTrue(
            self.array_equal(correct, enum_result.nums),
            msg=f"Incorrect sum when grouping by enum categorical.\nExpected {correct}\nActual {enum_result.nums}",
        )

        # from list only
        list_cols = {**base_cols, 'cat_from_list': list_cat}
        list_result = Dataset(list_cols).gb('cat_from_list').sum()
        correct = FastArray([0, 1, 2, 3, 9, 6, 7], dtype=np.int64)
        self.assertTrue(
            self.array_equal(correct, list_result.nums),
            msg=f"Incorrect sum when grouping by list categorical.",
        )

        # both categoricals in one dataset
        both_cols = {**enum_cols, 'cat_from_list': list_cat}
        ds_both = Dataset(both_cols)

        # by enum, list
        enum_then_list = ds_both.gb(['cat_from_enum', 'cat_from_list']).sum()
        correct = FastArray([0, 7, 1, 2, 9, 6, 3], dtype=np.int64)
        self.assertTrue(
            self.array_equal(correct, enum_then_list.nums),
            msg=f"Incorrect sum when grouping by enum, list categoricals.",
        )

        # by list, enum
        list_then_enum = ds_both.gb(['cat_from_list', 'cat_from_enum']).sum()
        correct = FastArray([0, 1, 2, 3, 9, 6, 7], dtype=np.int64)
        self.assertTrue(
            self.array_equal(correct, list_then_enum.nums),
            msg=f"Incorrect sum when grouping by list, enum categoricals.",
        )
Exemple #9
0
    def test_cumcount_vs_gb(self):
        """Compare ``Categorical.cumcount`` with the Dataset groupby ``cumcount``.

        Also checks that with a filter the result is non-NaN on the selected
        rows and NaN on the others.
        """
        keys = np.random.choice(['a', 'b', 'c', 'd', 'e'], 50)
        ds = Dataset({'keycol': keys, 'col1': arange(50), 'col2': arange(50)})
        from_groupby = ds.gb('keycol').cumcount()

        cat = Categorical(ds.keycol)
        from_cat = cat.cumcount()

        # The two results should agree, so their difference sums to zero.
        assert sum(from_groupby - from_cat) == 0

        # True at odd row positions.
        odd_rows = logical(arange(50) % 2)
        filtered = cat.cumcount(filter=odd_rows)
        assert bool(np.all(isnotnan(filtered[odd_rows])))
        assert bool(np.all(isnan(filtered[~odd_rows])))
Exemple #10
0
    def test_empty_category(self):
        """Check what each reduction returns for a category that has no rows.

        The values are drawn from 'a', 'b', 'd', 'e' only, so category 'c'
        (index 2 of the explicit category list) never occurs. For every listed
        reduction, the ``floats`` result at index 2 must equal the expected
        value, or be NaN where NaN is expected.
        """
        # 5/16/2019 invalid category must be in uniques
        # c = Categorical(str_fa_with_invalid, complete_unique_cats, invalid='invalid')
        # can test empty bin like this, the third result will be empty
        c = Categorical(np.random.choice(['a', 'b', 'd', 'e'], 30),
                        ['a', 'b', 'c', 'd', 'e'])
        empty_result = [
            ('sum', 0.0),
            ('mean', np.nan),
            ('min', np.nan),
            ('max', np.nan),
            ('var', np.nan),
            ('std', np.nan),
            ('nansum', 0.0),
            ('nanmean', np.nan),
            ('nanmin', np.nan),
            ('nanmax', np.nan),
            ('nanvar', np.nan),
            ('nanstd', np.nan),
        ]

        for func_name, expected in empty_result:
            result = getattr(c, func_name)(ds_nums).floats[2]

            if np.isnan(expected):
                # NaN is the only value that compares unequal to itself.
                assert result != result, (
                    f"Did not product correct result for empty category after {func_name} operation."
                )
            else:
                assert result == expected, (
                    f"Did not product correct result for empty category after {func_name} operation."
                )
Exemple #11
0
    def test_gb_labels_enum(self):
        """Check that Categorical and Dataset groupby counts yield the same key labels.

        Intent per the original comment: enum groupby keys should be displayed
        as strings, not integer codes.
        """
        name_to_code = {'a': 30, 'b': 20, 'c': 10}
        c = Categorical([10, 10, 10, 20, 30, 20, 10, 20, 20], name_to_code)

        # Key labels from the Categorical's own count().
        cat_counts = c.count()
        c_labels = cat_counts[cat_counts.label_get_names()][0]

        # Key labels from grouping a Dataset by the same Categorical.
        grouped_counts = Dataset({'catcol': c, 'data': arange(9)}).gbu('catcol').count()
        ds_labels = grouped_counts[grouped_counts.label_get_names()][0]

        assert c_labels.dtype.char == ds_labels.dtype.char
        assert bool(np.all(c_labels == ds_labels))
Exemple #12
0
 def test_as_matrix_metadata(self):
     """Check the matrix and per-column metadata returned by ``dataset_as_matrix``.

     Verifies the matrix type and shape, the recorded dtype and
     ``is_categorical`` flag of each column, the numeric values of columns
     0 and 2, and that column A's ``category_values`` map back to ``ds.A``.
     """
     error_tol = 0.00001
     ds = Dataset({
         'A': ['EXCH1', 'EXCH2', 'EXCH1', 'EXCH3', 'EXCH3'],
         'B': [-1.6, 2.7, 4.6, 5.7, 8.9],
         'C': Categorical([0, 0, 1, 0, 2], ['CPTYA', 'CPTYB', 'CPTYC']),
     })
     X, X_data = dataset_as_matrix(ds)
     self.assertIsInstance(X, numpy.ndarray)

     # Same shape as the dataset (the column-count check may be relaxed later).
     for axis in (0, 1):
         self.assertEqual(X.shape[axis], ds.shape[axis])

     for col in ('A', 'B', 'C'):
         self.assertEqual(X_data[col]['dtype'], getattr(ds, col).dtype)
     for col, expect_categorical in (('A', False), ('B', False), ('C', True)):
         self.assertEqual(X_data[col]['is_categorical'], expect_categorical)

     a_codes = numpy.array([0., 1., 0., 2., 2.])
     a_close = numpy.abs(X[:, 0] - a_codes) < error_tol
     self.assertTrue(a_close.all(), msg=f"got {X[:, 0]}")

     c_codes = numpy.array([0, 0, 1, 0, 2])
     c_close = numpy.abs(X[:, 2] - c_codes) < error_tol
     self.assertTrue(c_close.all(), msg=f"got {X[:, 2]}")

     # Indexing the recorded category values by the codes must reproduce column A.
     restored = X_data['A']['category_values'][numpy.array([0, 1, 0, 2, 2])]
     self.assertTrue(
         (restored == ds.A).all(),
         msg=f"X_data {X_data['A']['category_values'][numpy.array([0, 1, 0, 2, 2])]}\nds.A {ds.A}",
     )
Exemple #13
0
 def test_groupby_ops_mapping(self):
     """Run the shared groupby checks on a Categorical built from codes plus a name->int dict."""
     # Turn the enum's members into a plain {name: int} mapping.
     name_to_code = {
         name: int(member) for name, member in str_enum.__members__.items()
     }
     c = Categorical(sorted_codes, name_to_code)
     self.funnel_all_tests(c, gbu, "index + mapping dictionary", sorted=False)
Exemple #14
0
    def test_groupby_ops_user_codes_base_0(self):
        """Run the shared groupby checks on base-0 Categoricals built from codes + categories.

        The same checks are run for ``Categorical(..., base_index=0)`` and for
        ``CatZero``.
        """
        label = "index + categories + base_index 0"

        explicit_base = Categorical(
            sorted_codes.copy(), categories=complete_unique_cats, base_index=0
        )
        self.funnel_all_tests(explicit_base, gb, label)

        via_catzero = CatZero(sorted_codes.copy(), categories=complete_unique_cats)
        self.funnel_all_tests(via_catzero, gb, label)
    def test_categorical_dict_key_completion(self):
        """Check that completing ``cat['`` in IPython offers every key of ``decision_dict``."""
        shell = get_ipython()
        complete = shell.Completer.complete

        # Expose the Categorical to the completer under the name "cat".
        shell.user_ns["cat"] = Categorical(CODES, decision_dict)
        _, matches = complete(line_buffer="cat['")
        for key in decision_dict:
            self.assertIn(key, matches)
Exemple #16
0
 def test_total_sizes_with_categorical(self):
     """Check ``Struct.total_sizes`` when the same Categorical is stored under two names.

     With ``st.d`` assigned from ``st.c``, the physical size is expected to be
     half the logical size and at least the bytes of the Categorical's array
     plus its category array.
     """
     st = Struct({'c': Categorical(['aa', 'bb', 'cc', 'dd'])})
     st.d = st.c
     physical_size, logical_size = st.total_sizes
     self.assertEqual(physical_size, logical_size // 2)

     min_physical = np.asarray(st.c).nbytes + st.c.category_array.nbytes
     self.assertGreaterEqual(physical_size, min_physical)
Exemple #17
0
    def test_specify_gb_data(self):
        """Smoke test: ``Categorical.sum`` accepts its data in several container forms.

        Nothing is asserted about the results; the test only requires that
        each call completes.
        """
        str_col = ['a', 'a', 'b', 'c', 'a']
        num_col = [10, 10, 20, 30, 10]
        col1 = np.arange(5)
        col2 = np.arange(5)
        small_ds = Dataset({
            'str_col': str_col,
            'num_col': num_col,
            'col1': col1,
            'col2': col2,
        })
        ds_to_operate_on = small_ds[['col1', 'col2']]

        c = Categorical(str_col)

        # One argument holding the data: dataset, list, tuple, dict.
        single_argument_forms = (
            ds_to_operate_on,
            [col1, col2],
            (col1, col2),
            {'a': col1, 'b': col2},
        )
        for gb_data in single_argument_forms:
            c.sum(gb_data)

        # Multiple positional arrays.
        c.sum(col1, col2)
Exemple #18
0
    def test_groupby_ops_string_list_cats(self):
        """Run the groupby and set-item checks on a Categorical built from strings + categories."""
        label = "string list + categories"

        c = Categorical(str_fa, complete_unique_cats)
        self.funnel_all_tests(c, gb, label)

        self.simple_string_set_item(
            str_fa, categories=complete_unique_cats, constructor_name=label
        )
Exemple #19
0
    def test_as_categorical(self):
        """Check that ``as_categorical()`` on a groupby sums like the groupby itself.

        The sums from a Categorical built directly on the key column and from
        the groupby converted with ``as_categorical()`` are compared, column by
        column, to the groupby's own ``sum()``.
        """
        ds = Dataset({
            'keycol1': np.random.choice(['a', 'b', 'c'], 30),
            'keycol2': np.random.choice(['a', 'b', 'c'], 30),
            'data': np.random.rand(30),
        })

        grouped = ds.gbu('keycol1')
        direct_cat = Categorical(ds.keycol1, ordered=False, sort_gb=False)
        converted_cat = grouped.as_categorical()

        expected = grouped.sum()
        direct_sum = direct_cat.sum(ds.data)
        converted_sum = converted_cat.sum(ds.data)

        for name, col in expected.items():
            assert bool(np.all(direct_sum[name] == col))
            assert bool(np.all(converted_sum[name] == col))
    def test_roundtrip_rt_pa_rt(self, rt_cat: rt.Categorical, output_writable: bool, have_nulls: bool) -> None:
        """Round-trip an rt.Categorical through pyarrow and check it comes back equal.

        When ``have_nulls`` is set, every element whose index is 1 modulo 3 is
        filtered out first, and the pyarrow null count is checked against the mask.
        """
        modes = rt.rt_enum.CategoryMode
        orig_cat_shape = rt_cat.shape

        if have_nulls:
            # riptable's filtering/masking uses a valid mask (where False means null/NA).
            valid_mask = np.arange(len(rt_cat)) % 3 != 1
            rt_cat = rt_cat.filter(valid_mask)
            assert rt_cat.shape == orig_cat_shape

            # isfiltered() doesn't work as expected for Dictionary/IntEnum-mode Categorical as of riptable 1.1.0.
            if rt_cat.category_mode in (modes.Dictionary, modes.IntEnum):
                filtered_flags = rt.isnan(rt_cat._fa)
            else:
                filtered_flags = rt_cat.isfiltered()
            filtered_element_count = filtered_flags.sum()
            assert filtered_element_count == (len(rt_cat) - valid_mask.sum())

        result_pa_arr = rt_cat.to_arrow()

        # Verify the pyarrow array has the correct length, number of categories, etc.
        assert len(rt_cat) == len(result_pa_arr)
        assert pat.is_dictionary(result_pa_arr.type)
        dictionary_size = len(result_pa_arr.dictionary)
        first_category_array = next(iter(rt_cat.category_dict.values()))
        assert dictionary_size >= len(first_category_array), \
            "The number of categories in the pyarrow array's dictionary is smaller than the number of categories in the input Categorical."

        if have_nulls:
            null_count_expected = len(rt_cat) - valid_mask.sum()
            assert valid_mask.sum() > 0
            assert null_count_expected == result_pa_arr.null_count

        # TEMP: Certain cases are marked as XFAIL here due to issues in Categorical.
        #         * Cannot create a pre-filtered (i.e. filtered at construction time) Dictionary- or IntEnum-mode Categorical.
        #         * Filtering a Dictionary- or IntEnum-mode Categorical causes unused categories to be dropped,
        #           which is not the same behavior as for other Categorical modes.
        #         * MultiKey Categoricals can't be created with an explicit list of category arrays + an index array,
        #           like what is supported for other Categorical modes.
        is_multikey = rt_cat.category_mode == modes.MultiKey
        if is_multikey or (have_nulls and rt_cat.category_mode == modes.Dictionary):
            pytest.xfail("Expected failure due to issues with the Categorical constructor and/or filtering.")

        result_cat = rt.Categorical.from_arrow(
            result_pa_arr, zero_copy_only=False, writable=output_writable
        )

        # relaxed_cat_check <==> rt_cat.ordered, because if the categories are ordered, we expect them to be
        # in the same position after being roundtripped, so they should be mapped to the same integer before/after.
        # multi-key cats always seem to be ordered, even if ordered=False is specified when creating them.
        # TODO: Remove CategoryMode.Dictionary from the relaxed_cat_check here -- it's failing because our encoding in
        #       pyarrow doesn't currently preserve unused entries from the name <-> code mapping. Once that's fixed
        #       we should be able to use the stronger equality check.
        use_relaxed_check = (
            rt_cat.ordered
            or rt_cat.category_mode == modes.MultiKey
            or rt_cat.category_mode == modes.Dictionary
        )
        assert_array_or_cat_equal(rt_cat, result_cat, relaxed_cat_check=use_relaxed_check)
    def test_categorical_numeric_array_key_completion(self):
        """Check that completing ``cat['`` offers each numeric value as a string."""
        shell = get_ipython()
        complete = shell.Completer.complete

        lst = [1, 44, 44, 133, 75]  # type: List[int]
        shell.user_ns["cat"] = Categorical(FastArray(lst))
        _, matches = complete(line_buffer="cat['")
        # The completer is expected to offer the string form of every value.
        for number in lst:
            self.assertIn(str(number), matches)
Exemple #22
0
 def do_draw(self, data):
     """Draw values from ``self.value_strategy`` and build a Categorical from them.

     Categories are passed to the constructor when either:
     - ``self.with_categories`` is set (StringArray mode), or
     - ``self.category_mode`` designates a Dictionary Categorical.

     Raises
     ------
     ValueError
         If ``self.category_mode`` is neither StringArray nor Dictionary.
     """
     mode = self.category_mode

     if mode == CategoryMode.StringArray:
         values = [str(v) for v in data.draw(self.value_strategy)]
         # Explicit categories are the unique values, in set-iteration order.
         categories = [str(v) for v in set(values)] if self.with_categories else None
         return Categorical(values, categories=categories, ordered=self.ordered)

     if mode == CategoryMode.Dictionary:
         values = data.draw(self.value_strategy)
         category_dict = self._construct_dict(data, values)
         return Categorical(values, categories=category_dict, ordered=self.ordered)

     raise ValueError(
         f"{self._CN}.do_draw: unhandled category mode {self.category_mode}\n\t{self}"
     )
Exemple #23
0
    def test_groupby_ops_user_codes_base_1(self):
        """Run the groupby and set-item checks on a base-1 Categorical built from codes + categories."""
        label = "index + categories + base_index 1"

        # The codes are shifted by one for the base-1 Categorical.
        c = Categorical(sorted_codes + 1, complete_unique_cats, base_index=1)
        self.funnel_all_tests(c, gb, label)

        # NOTE(review): the helper receives the unshifted codes (no "+ 1") while
        # still using base_index=1 -- confirm this difference is intended.
        self.simple_string_set_item(
            sorted_codes.copy(),
            categories=complete_unique_cats,
            base_index=1,
            constructor_name=label,
        )
Exemple #24
0
 def test_groupby_ops_multikey_list(self):
     """Run the shared groupby checks on a multikey Categorical built from a list of arrays."""
     key_arrays = [str_fa.copy(), str_fa.copy()]

     # Reference groupby over the same two string keys.
     reference_columns = {
         'string1': str_fa.copy(),
         'string2': str_fa.copy(),
         'ints': int_fa,
         'floats': flt_fa,
         'tens': tens,
     }
     mk_gb = Dataset(reference_columns).gbu(['string1', 'string2'])

     c = Categorical(key_arrays)
     self.funnel_all_tests(c, mk_gb, "multikey list", sorted=False)
 def test_categorical_string_array_key_completion(self):
     """Check that completing ``cat['`` offers every string the Categorical was built from."""
     shell = get_ipython()
     complete = shell.Completer.complete

     lst = ['a', 'b', 'c', 'c', 'd', 'a', 'b']  # type: List[str]
     cat = Categorical(FastArray(lst), ordered=True, base_index=1, filter=None)
     # Expose the Categorical to the completer under the name "cat".
     shell.user_ns["cat"] = cat

     _, matches = complete(line_buffer="cat['")
     for label in lst:
         self.assertIn(label, matches)
    def test_categorical_multi_key_completion(self):
        """Check that completing ``cat['`` offers every string from both input lists."""
        shell = get_ipython()
        complete = shell.Completer.complete

        # note - 'e' is not in first list
        lst1 = ['b', 'a', 'a', 'c', 'a', 'b']  # type: List[str]
        lst2 = ['b', 'a', 'c', 'e']  # type: List[str]
        shell.user_ns["cat"] = Categorical(lst1, lst2, sort_gb=True)

        _, matches = complete(line_buffer="cat['")
        # Entries of the first list are checked first, then those of the second.
        for label in lst1 + lst2:
            self.assertIn(label, matches)
    def test_projections(self):
        """Compare multikey GroupBy, multikey Categorical and pandas groupby on seeded random data.

        Builds a 1,000,000-row trade dataset from a seeded NumPy RNG, groups it
        by four keys with both ``Dataset.groupby`` and a multikey
        ``Categorical``, checks the shapes of ``sum()``/``count()``, and then
        projects per-group ``Size`` sums back onto the rows three ways
        (GroupBy, Categorical, pandas), asserting the projections agree.

        The expected shapes are hard-coded, so they depend on the seed and on
        the exact order of the ``np.random`` calls below.
        """
        num_rows_trade = 1_000_000
        num_symbols = 450
        Trade_Dates = [
            '20180602', '20180603', '20180604', '20180605', '20180606'
        ]
        Exchanges = np.array(['EXCH1', 'EXCH2', 'EXCH3'])
        # Fixed seed: every random draw below is reproducible.
        np.random.seed(1234)
        ds = Dataset({
            'SymbolID':
            np.random.randint(0, num_symbols, size=num_rows_trade),
            'Exchange':
            Exchanges[np.random.randint(0,
                                        Exchanges.shape[0],
                                        size=num_rows_trade)],
            # Row i gets date number int(i * n_dates / n_rows), i.e. consecutive
            # equal-sized runs of rows share a date.
            'Trade_Date': [
                Trade_Dates[int(i * len(Trade_Dates) / num_rows_trade)]
                for i in range(num_rows_trade)
            ],
            # Time restarts from 0 at the start of each date's run of rows.
            'Time': [
                int(i % (num_rows_trade / len(Trade_Dates)))
                for i in range(num_rows_trade)
            ],
            'Price':
            100 * (1.0 + 0.0005 * np.random.randn(num_rows_trade)),
            'Size':
            10 *
            np.array(1 + 30 * np.random.rand(num_rows_trade), dtype=np.int64),
        })
        num_rows_quote = 1_000_000
        # ds2 is not referenced again in this test, but building it consumes
        # random draws that come before the ones used for `threshold`.
        ds2 = Dataset({
            'SymbolID':
            np.random.randint(0, num_symbols, size=num_rows_quote),
            'Exchange':
            Exchanges[np.random.randint(0,
                                        Exchanges.shape[0],
                                        size=num_rows_quote)],
            'Trade_Date': [
                Trade_Dates[int(i * len(Trade_Dates) / num_rows_quote)]
                for i in range(num_rows_quote)
            ],
            'Time': [
                int(i % (num_rows_quote / len(Trade_Dates)))
                for i in range(num_rows_quote)
            ],
            'Bid':
            100 * (1.0 - 0.001 + 0.0005 * np.random.randn(num_rows_quote)),
            'Ask':
            100 * (1.0 + 0.001 + 0.0005 * np.random.randn(num_rows_quote)),
        })
        # Boolean column: True where a uniform draw falls below 0.75.
        threshold = Dataset(
            {'Is_Below_Thresdhold': np.random.rand(num_rows_quote) < 0.75})
        # Bucket Time into integer bins of width 2500.
        trade_time = Dataset({'time_2500': (ds.Time / 2500).astype(int)})
        trades = Dataset({}).concat_columns([ds, trade_time], do_copy=False)

        # Create GroupBy and corresponding Categorical
        trade_gb = trades.groupby(
            ['SymbolID', 'Exchange', 'Trade_Date', 'time_2500'])
        trade_cat = Categorical(
            [ds.SymbolID, ds.Exchange, ds.Trade_Date, trade_time.time_2500])

        # Call sum() and count()
        self.assertEqual(trade_gb.sum().shape, (455654, 7))
        self.assertEqual(trade_cat.sum(ds).shape, (455654, 7))
        self.assertEqual(trade_gb.count().shape, (455654, 5))
        # 8/24/2018 SJK - multikey categorical groupby now returns multiple columns for groupby keys
        self.assertEqual(trade_cat.count().shape, (455654, 5))
        # Mean group count times the number of groups should recover the row count.
        b1 = trade_gb.count().Count.mean()
        b1c = trade_cat.count().Count.mean()
        b2 = trade_gb.count().shape[0]
        self.assertAlmostEqual(ds.shape[0], b1 * b2, places=5)
        self.assertAlmostEqual(ds.shape[0], b1c * b2, places=5)

        # Create ds augmented with filtered ID
        trade_ds = Dataset({'ID': trade_gb.grouping.ikey})
        # NOTE(review): multiplying by the boolean column presumably zeroes the
        # rows where the flag is False -- confirm against Dataset.__mul__.
        trade_ds_below_threshold = ds * threshold.Is_Below_Thresdhold
        trade_ds_below_thresholdb = Dataset.concat_columns(
            [trade_ds_below_threshold, trade_ds], do_copy=False)

        # Create trade_ds size projection using GroupBy
        trade_gb_id = trade_ds_below_thresholdb.groupby('ID')
        trade_sizes_ds = trade_gb_id['Size'].sum()
        # ID - 1 is used as a row index into the per-group sums
        # (presumably because ikey is 1-based -- TODO confirm).
        trade_size_ds = trade_sizes_ds.Size[trade_ds_below_thresholdb.ID - 1]
        self.assertEqual(trade_size_ds.shape[0], ds.shape[0])

        # Create trade_ds size projection using Categorical
        trade_sizes_cat_ds = trade_cat.sum(trade_ds_below_thresholdb.Size)
        # Same projection, indexing with the Categorical's own codes minus one.
        trade_size_cat_ds = trade_sizes_cat_ds.Size[trade_cat - 1]
        self.assertArrayAlmostEqual(trade_size_ds, trade_size_cat_ds, places=6)

        # Create trade_ds size projection using Pandas groupby
        ptrade_ds_below_thresholdb = dataset_as_pandas_df(
            trade_ds_below_thresholdb)
        ptrade_gb_id = ptrade_ds_below_thresholdb.groupby('ID')
        trade_sizes_pd_ds = ptrade_gb_id.sum()
        # ngroup() gives each row's group number, used to index the per-group sums.
        trade_size_pd_ds = trade_sizes_pd_ds.Size.values[ptrade_gb_id.ngroup()]
        self.assertArrayAlmostEqual(trade_size_ds, trade_size_pd_ds, places=6)
    np.int8,
    np.uint8,
    np.int16,
    np.uint16,
    np.int32,
    np.uint32,
    np.int64,
    np.uint64,
    np.float32,
    np.float64,
]
# String dtypes exercised in addition to the numeric `arr_types`.
arr_types_string = [np.bytes_, np.str_]
# np.bool_ instead of np.bool: the bare alias was deprecated in NumPy 1.20 and
# removed in 1.24, so `dtype=np.bool` raises AttributeError on those releases.
test_data = {'bool': np.array([True, False, True, False, True], dtype=np.bool_)}
# One column per dtype, keyed by the dtype's name, all built from `num_list`.
for dt in arr_types + arr_types_string:
    test_data[dt.__name__] = np.array(num_list, dtype=dt)
test_data['categorical'] = Categorical([str(i) for i in num_list])
all_headers = list(test_data)
ds = Dataset(test_data)
gb_funcs = ['sum', 'mean', 'first', 'last', 'median', 'min', 'max', 'var']
gb_nan_funcs = ['nansum', 'nanmean', 'nanmedian',
                'nanvar']  #'rolling', 'cumsum', 'nth'


class Groupby_Test(unittest.TestCase):
    def test_math_ops_same_return(self):
        result_dict = {
            'sum': [5, 10],
            'nansum': [5, 10],
            'median': [2.5, 3],
            # TODO: add support for min / max on strings
            'min': [1, 2],
Exemple #29
0
def categorical_stringarray(
    draw,
    max_length: int,
    max_categories: int,
    *,
    endianness: str = '=',
    min_str_len: int = 1,
    max_str_len: int = 16,
    unicode: Optional[bool] = None,
    ordered: Optional[bool] = None,
) -> Categorical:
    """
    Strategy body that draws a Categorical with string category labels.

    An array of unique labels and an integer fancy index into it are drawn.
    The Categorical is then built either from the index plus the explicit
    label array, or from the labels expanded through the index; which of the
    two is itself drawn.

    Parameters
    ----------
    draw
        Draw function used to pull values from other strategies.
        NOTE(review): presumably supplied by ``hypothesis.strategies.composite``;
        the decorator is not visible in this excerpt.
    max_length : int
        Upper bound on the side of the 1-D shape drawn for the fancy index.
    max_categories : int
        Upper bound on the side of the 1-D shape drawn for the unique-label array.
    endianness : str
        Passed through to the string dtype strategy.
    min_str_len : int
        Minimum label length; passed to the dtype and label strategies.
    max_str_len : int
        Maximum label length; passed to the dtype and label strategies.
    unicode : bool, optional
        Whether labels are Unicode (True) or byte strings (False).
        When None, a boolean is drawn.
    ordered : bool, optional
        Value for the Categorical's ``ordered`` argument.
        When None, a boolean is drawn.

    Examples
    --------
    Illustrative only, assuming the function is wrapped as a Hypothesis strategy:

    >>> categorical_stringarray(max_length=10, max_categories=5).example()  # doctest: +SKIP

    Notes
    -----
    TODO: Make sure to include the case where we have category values (in the underlying integer array)
          past the end of the categories array. (Or is that only for a Dictionary mode categorical?)
          To clarify -- this is the behavior where, when we print the Categorical, we get entries like <!456>.

    TODO: Also exercise (in one way or another) the following arguments to the Categorical constructor:
        * base_index
            Add an optional boolean parameter. When None, draw a boolean to fill it in.
            When the bool is false, call rt.Cat() with base_index=0.
            When True, call rt.Cat() with base_index=1.
        * dtype
            Call the ctor with dtype=None or a signed integer dtype that's either the min size given the
            number of categories or any larger signed integer dtype.
            E.g. if len(categories) == 1000, draw from { None, np.int16, np.int32, np.int64 }
        * filter
            Add an optional boolean param to the strategy which defaults to None, in which case we'll fill it
            by drawing a boolean.
            When the bool is false we call rt.Cat() with filter=None.
            When True, we create a boolean array the same length as our values or fancy index and pass that
            as the filter.

    TODO: Support slicing/strides on the values/categories arrays passed to the Categorical constructor.

    TODO: When creating the fancy index array and we've drawn 'explicit_categories=True', allow the fancy index
          to be created with any applicable integer type (signed or unsigned) whose range is large enough to
          index into the categories array. (Or, should we just allow _any_ integer dtype, even if too small?
          We wouldn't be able to index categories past the range of the dtype, but maybe that's an interesting
          thing to test? Especially around cases like having auto_add=True.)
    """
    # Draw the construction mode first: it is one of the values most likely to
    # affect the Categorical's behavior, and shrinking (in some cases) works
    # better when such values are drawn early.
    explicit_categories: bool = draw(st.booleans())
    event(
        'Categorical created from unique category array and fancy index.'
        if explicit_categories
        else 'Categorical created from non-unique array of strings.'
    )

    # Settle on Unicode vs. byte strings, then draw a matching string dtype.
    if unicode is None:
        is_unicode: bool = draw(st.booleans())
    else:
        is_unicode = unicode
    dtype_strategy_factory = unicode_string_dtypes if is_unicode else byte_string_dtypes
    labels_dtype = draw(
        dtype_strategy_factory(endianness=endianness, min_len=min_str_len, max_len=max_str_len)
    )

    # Draw the 1-D array of unique category labels.
    unique_labels = draw(
        arrays(
            dtype=labels_dtype,
            shape=array_shapes(max_dims=1, max_side=max_categories),
            elements=category_labels(min_str_len, max_str_len, unicode=is_unicode),
            unique=True,
        )
    )

    # Draw an integer fancy index into the unique labels; applying it expands
    # them into an array where each label may occur zero or more times.
    fancy_index = draw(
        integer_array_indices(
            shape=unique_labels.shape,
            result_shape=array_shapes(max_dims=1, max_side=max_length),
        )
    )

    # Pin down a concrete 'ordered' value before constructing the Categorical.
    if ordered is None:
        is_ordered = draw(st.booleans())
    else:
        is_ordered = ordered

    if not explicit_categories:
        # Expand the labels through the index and build from the resulting values.
        expanded_values = unique_labels[fancy_index]
        return Categorical(expanded_values, ordered=is_ordered, unicode=is_unicode)

    # Pass the fancy index and the unique labels separately.
    return Categorical(fancy_index, categories=unique_labels, ordered=is_ordered, unicode=is_unicode)
Exemple #30
0
def categorical_dictmode(
    draw,
    max_length: int,
    max_categories: int,
    *,
    endianness: str = '=',
    min_str_len: int = 1,
    max_str_len: int = 16,
    unicode: Optional[bool] = None,
    ordered: Optional[bool] = None,
) -> Categorical:
    """
    Strategy for creating Dictionary-mode Categoricals.

    This strategy currently only covers creating `Categorical` instances with
    string-typed category labels.

    The Categorical is built from an integer array of category values (codes) plus a
    ``{label: code}`` dictionary passed as ``categories``.

    Parameters
    ----------
    draw
        Callable used to draw a value from a strategy.
        NOTE(review): presumably supplied by ``hypothesis.strategies.composite`` -- the decorator
        is not visible next to this definition; confirm.
    max_length : int
        Upper bound on the length of the (1-D) array of category values given to the Categorical.
    max_categories : int
        Upper bound on the number of unique categories (dictionary entries). Also determines the
        integer dtype used for the category values.
    endianness : str
        Forwarded to the string-dtype strategies used for the category labels.
    min_str_len : int
        Minimum label length; forwarded to the string-dtype and label strategies.
    max_str_len : int
        Maximum label length; forwarded to the string-dtype and label strategies.
    unicode : bool, optional
        Whether labels are unicode (True) or byte/ascii (False) strings.
        When None, a boolean is drawn.
    ordered : bool, optional
        Value of the ``ordered`` argument given to the Categorical.
        When None, a boolean is drawn.

    Returns
    -------
    Categorical

    Examples
    --------
    >>> categorical_dictmode(10_000, 1_000, max_str_len=20).example()
    0, 0, 0, 0, 0

    Notes
    -----
    TODO: Make sure to include the case where we have category values (in the underlying integer array)
          past the end of the categories array. (Or is that only for a Dictionary mode categorical?)
          To clarify -- this is the behavior where, when we print the Categorical, we get entries like <!456>.

    TODO: Also exercise (in one way or another) the following arguments to the Categorical constructor:
        * base_index
            Add an optional boolean parameter. When None, draw a boolean to fill it in.
            When the bool is false, call rt.Cat() with base_index=0.
            When True, call rt.Cat() with base_index=1.
        * dtype
            Call the ctor with dtype=None or a signed integer dtype that's either the min size given the
            number of categories or any larger signed integer dtype.
            E.g. if len(categories) == 1000, draw from { None, np.int16, np.int32, np.int64 }
        * filter
            Add an optional boolean param to the strategy which defaults to None, in which case we'll fill it by drawing a boolean.
            When the bool is false we call rt.Cat() with filter=None.
            When True, we create a boolean array the same length as our values or fancy index and pass that as the filter.

    TODO: Support slicing/strides on the values/categories arrays passed to the Categorical constructor.

    TODO: Does a Dictionary-mode Categorical allow any other types (e.g. rt.Date) to be used for the category labels?
        If so, these should also be covered by this strategy (though changes will be needed to allow a variety of
        types to be used for category labels).

    TODO: Any possible issues (that we might want to exercise in this strategy) between the string used when displaying
        the invalid category (e.g. 'Inv') and category labels? What happens if we have a category label using the same string?
    """
    # Draw a boolean indicating whether we'll use a signed or unsigned integer dtype.
    use_signed_integer_dtype: bool = draw(st.booleans())

    # If using a signed integer dtype, draw another boolean indicating whether we'll
    # generate negative category values.
    allow_negative_category_values: bool = draw(st.booleans()) if use_signed_integer_dtype else False
    if use_signed_integer_dtype:
        if allow_negative_category_values:
            event('Categorical may have a mix of negative, zero, and positive category values.')
        else:
            event('Categorical has only non-negative category values.')

    # If the 'unicode' flag is not set, draw a boolean to fill it in.
    # This is the single source of truth for the label kind: it is drawn exactly once so that the
    # event reported here always matches the labels and the Categorical created below.
    is_unicode: bool = draw(st.booleans()) if unicode is None else unicode
    event(f'Category labels are {"unicode" if is_unicode else "ascii"} strings.')

    # If the 'ordered' flag is not set, draw a boolean for it now so we have a concrete value
    # to use when creating the categorical.
    is_ordered = draw(st.booleans()) if ordered is None else ordered
    event(f'ordered = {is_ordered}')

    # Draw the dtype for the category values.
    # TODO: Draw a signed or unsigned integer dtype here which is at least as large as needed, but perhaps larger
    #       than needed.
    #       For now, we just use the smallest dtype large enough to fit the max number of categories; but allowing for
    #       larger (randomly-selected) dtypes later will help ensure we test cases where there are non-consecutive
    #       category values even when the max_categories value is near the max value of a dtype.
    #
    # np.min_scalar_type() yields an *unsigned* dtype for a non-negative scalar, so to honor
    # 'use_signed_integer_dtype' we ask for the smallest dtype able to hold -max_categories instead.
    # That signed dtype has at least 'max_categories' non-negative values, so there is always room
    # for 'max_categories' unique codes even when negative values are disallowed.
    values_dtype = np.min_scalar_type(-max_categories if use_signed_integer_dtype else max_categories)

    # Create the strategy for the category values (integer values representing the categories).
    values_dtype_info = np.iinfo(values_dtype)
    values_strat =\
        st.integers(
            min_value=(values_dtype_info.min if allow_negative_category_values else 0),
            max_value=values_dtype_info.max)

    # Create an array of unique category values/codes.
    cats_shapes = array_shapes(max_dims=1, max_side=max_categories)
    unique_cat_values = draw(arrays(dtype=values_dtype, shape=cats_shapes, elements=values_strat, unique=True))

    # Draw the string dtype for the labels based on whether we want a byte (ascii) string or Unicode.
    if is_unicode:
        labels_dtype = draw(unicode_string_dtypes(endianness=endianness, min_len=min_str_len, max_len=max_str_len))
    else:
        labels_dtype = draw(byte_string_dtypes(endianness=endianness, min_len=min_str_len, max_len=max_str_len))

    # Create an array of unique category labels; this must be the same shape as the unique category values array.
    category_label_strat = category_labels(min_str_len, max_str_len, unicode=is_unicode)
    unique_labels =\
        draw(arrays(dtype=labels_dtype, shape=unique_cat_values.shape, elements=category_label_strat, unique=True))

    # TODO: Draw a slice (or None) that we'll apply to both arrays of uniques (the labels and values)
    #   before using them to create the category dictionary.
    #   This allows us to cover cases where a category value isn't in the dictionary.

    # Combine the unique category labels and values to create a dictionary (label -> value).
    category_dict = dict(zip(unique_labels, unique_cat_values))

    # Use integer_array_indices to create a fancy index into the array of unique values.
    # Apply it to expand the array of unique values into an array where those values may occur zero or more times.
    fancy_index_shapes = array_shapes(max_dims=1, max_side=max_length)
    fancy_index = draw(integer_array_indices(shape=unique_cat_values.shape, result_shape=fancy_index_shapes))

    # Apply the fancy index to the array of unique category values to produce an
    # array where each category appears zero or more times; then create the Categorical from that.
    cat_values = unique_cat_values[fancy_index]
    return Categorical(cat_values, categories=category_dict, ordered=is_ordered, unicode=is_unicode)