Beispiel #1
0
    def test_groupby_ops_multikey_dict(self):
        """Exercise a multikey Categorical built from a dict, then check default key naming.

        The first half funnels a dict-constructed multikey Categorical through the shared
        groupby comparisons against an equivalent ``Dataset.gbu`` grouping. The second half
        builds a multikey Categorical from three arrays, two of which share a name, and
        checks the number of key columns and the names given to them.
        """
        mk_dict = {'string1': str_fa, 'string2': str_fa}
        mk_gb = Dataset({
            'string1': str_fa.copy(),
            'string2': str_fa.copy(),
            'ints': int_fa,
            'floats': flt_fa,
            'tens': tens,
        }).gbu(['string1', 'string2'])
        c = Categorical(mk_dict)
        self.funnel_all_tests(c, mk_gb, "multikey dictionary", sorted=False)

        # setitem hits comparison functions - need to rewrite these tests after comparison behavior change
        # self.mk_set_item(mk_dict, constructor_name="multikey dictionary")

        # conflicting names: x and y deliberately share the name 'strings'
        x = str_fa.copy()
        y = str_fa.copy()
        z = str_fa.copy()
        x.set_name('strings')
        y.set_name('strings')
        z.set_name('strings1')
        c = Categorical([x, y, z])

        ncols = c._categories_wrap.ncols
        assert ncols == 3, \
            f"incorrect number of columns for multikey from list. {ncols} vs. 3"

        # 04/25/2019 all default column names now happen in grouping object
        expected_names = ['strings', GROUPBY_KEY_PREFIX + '_c1', 'strings1']
        actual_names = list(c.categories().keys())
        # Report the names actually asserted against, rather than a stale hard-coded list.
        assert actual_names == expected_names, \
            f"column names did not match for multikey from list. {actual_names} vs. {expected_names}"
def test_categorical_ctor(value_strategy, category_mode, data):
    """Round-trip a drawn Categorical through the constructor paths and validate each result.

    A Categorical is drawn from ``CategoricalStrategy`` and then rebuilt three ways:
    from its expanded values plus categories, from another Categorical, and through the
    ``_from_categorical`` fast path. Every instance must satisfy ``_check_categorical``.
    """
    is_ordered: bool = data.draw(booleans())
    strategy = CategoricalStrategy(value_strategy,
                                   category_mode=category_mode,
                                   ordered=is_ordered)
    original: Categorical = data.draw(strategy)
    assert _check_categorical(original)

    # Rebuild from the original's values and categories.
    expanded = original.expand_array
    cats = original._categories
    if category_mode == CategoryMode.Dictionary:
        # Dictionary Categoricals are rebuilt from the original's category mapping instead.
        cats = original.category_mapping
    rebuilt = Categorical(expanded, categories=cats, ordered=is_ordered)
    assert _check_categorical(rebuilt)

    # Build a Categorical directly from another Categorical.
    copied = Categorical(rebuilt)
    assert _check_categorical(copied)

    # _from_categorical is a fast path that skips internal routine checks, sorting, and
    # uniquifying of values; the result should still be identical to the original.
    wrapped_categories = original._categories_wrap
    fast_path = Categorical(
        expanded,
        categories=cats,
        _from_categorical=wrapped_categories,
        ordered=is_ordered,
    )
    assert _check_categorical(fast_path)
Beispiel #3
0
    def simple_string_set_item(self, *args, **kwargs):
        '''
        Helper: overwrite every 'b' entry of a freshly built Categorical with 'a' and
        check that no 'b' remains.

        ``constructor_name`` (required keyword) is only used to label failure messages;
        all remaining arguments are forwarded to the ``Categorical`` constructor.

        This test needs to be updated with different data that reflects the new comparison behavior.
        SJK: 9/24/2018
        '''
        source = kwargs.pop('constructor_name')

        if 'categories' in kwargs:
            # work on a copy so the caller's categories are not shared with the Categorical
            kwargs['categories'] = kwargs['categories'].copy()

        c = Categorical(*args, **kwargs)

        # Integer replacement values (e.g. 1) and integer comparisons (c == 2) used to be
        # part of this list; they stay disabled until the test is reworked.
        # index by string
        by_label = [
            (b'b', b'a'),
            (b'b', 'a'),
            ('b', b'a'),
            ('b', 'a'),
        ]
        # index by bool array
        # boolean arrays can no longer be generated with these comparisons SJK 9/24/2018
        by_mask = [
            (c == b'b', b'a'),
            (c == b'b', 'a'),
            (c == 'b', b'a'),
            (c == 'b', 'a'),
        ]
        # integer index: the same positions are exercised three times with each value
        by_position = [([5, 9, 16, 18, 21], new_value)
                       for _ in range(3)
                       for new_value in (b'a', 'a')]

        # this test needs to get reworked
        # no longer produces the correct result for all types of categoricals because of == comparison behavior
        for index, new_value in by_label + by_mask + by_position:
            c = Categorical(*args, **kwargs)
            goal = c == ['a', 'b']
            c[index] = new_value
            result = c == new_value
            all_set = np.sum(goal == result)
            assert all_set == 30, \
                f"did not set c[{index}] to {new_value} for categorical from {source}"

            none_left = np.sum(c == 'b')
            assert none_left == 0, \
                f"did not set c[{index}] to {new_value} for categorical from {source}"
Beispiel #4
0
    def test_single_key_string_count(self):
        """Check ``count()`` for single-key Categoricals built from strings and from codes."""
        expected = FastArray([4, 5, 9, 6, 6])

        # for sorting/count bug fix 8/21/2018
        from_strings = Categorical(str_fa)
        assert bool(np.all(from_strings.count().Count == expected))

        from_sorted_cats = Categorical(sorted_codes, complete_unique_cats, base_index=0)
        assert bool(np.all(from_sorted_cats.count().Count == expected))

        from_unsorted_cats = Categorical(sorted_codes, unsorted_unique_cats, base_index=0)
        assert bool(np.all(from_unsorted_cats.count().Count == expected))

        # 8/24/2018 SJK - default name for groupby key columns might change, so selected this by index
        # also, in most cases (save intenum/dict) categorical groupby no longer returns a categorical
        result_keys = from_unsorted_cats.count()[1]
        keys_match = bool(np.all(result_keys == unsorted_unique_cats))
        assert keys_match, f"Result: {result_keys} Expected: {unsorted_unique_cats}"
Beispiel #5
0
    def mk_set_item(self, *args, **kwargs):
        """Helper: overwrite every ('b', 'b') entry of a multikey Categorical and check none remain.

        ``constructor_name`` (required keyword) is only used to label failure messages;
        all remaining arguments are forwarded to the ``Categorical`` constructor.
        """
        source = kwargs.pop('constructor_name')

        if 'categories' in kwargs:
            print('copying categories')
            kwargs['categories'] = kwargs['categories'].copy()

        c = Categorical(*args, **kwargs)

        # Each target is overwritten with a bytes tuple, a str tuple, and the integer 5.
        replacements = ((b'a', b'a'), ('a', 'a'), 5)
        # index by string
        by_label = [(key, new_value)
                    for key in ((b'b', b'b'), ('b', 'b'))
                    for new_value in replacements]
        # index by bool array: the mask-based cases (c == ('b', 'b'), c == 4, ...) are disabled
        # integer index: the same positions are exercised three times with each replacement
        by_position = [([5, 9, 16, 18, 21], new_value)
                       for _ in range(3)
                       for new_value in replacements]

        for index, new_value in by_label + by_position:
            c = Categorical(*args, **kwargs)
            goal = mask_or([c == ('a', 'a'), c == ('b', 'b')])
            c[index] = new_value
            result = c == new_value
            all_set = np.sum(goal == result)
            assert all_set == 30, \
                f"did not set c[{index}] to {new_value} for categorical from {source}"

            none_left = np.sum(c == ('b', 'b'))
            assert none_left == 0, \
                f"did not set c[{index}] to {new_value} for categorical from {source}"
    def test_gb_categoricals(self):
        """Group a Dataset by an enum Categorical, a list Categorical, and both (in either order)."""
        codes = [1, 44, 44, 133, 75, 75, 75, 1]
        stringlist = ['a', 'b', 'c', 'd', 'e', 'e', 'f', 'g']
        by_enum = Categorical(codes, LikertDecision, sort_gb=True)
        by_list = Categorical(stringlist)
        base_cols = {'nums': np.arange(8)}

        # from enum only
        enum_cols = {**base_cols, 'cat_from_enum': by_enum}
        enum_result = Dataset(enum_cols).gb('cat_from_enum').sum()
        correct = FastArray([3, 15, 3, 7], dtype=np.int64)
        self.assertTrue(
            self.array_equal(correct, enum_result.nums),
            msg=f"Incorrect sum when grouping by enum categorical.\nExpected {correct}\nActual {enum_result.nums}",
        )

        # from list only
        list_cols = {**base_cols, 'cat_from_list': by_list}
        list_result = Dataset(list_cols).gb('cat_from_list').sum()
        correct = FastArray([0, 1, 2, 3, 9, 6, 7], dtype=np.int64)
        self.assertTrue(
            self.array_equal(correct, list_result.nums),
            msg=f"Incorrect sum when grouping by list categorical.",
        )

        # both categoricals side by side in one dataset
        ds_both = Dataset({**enum_cols, 'cat_from_list': by_list})

        # by enum, list
        enum_then_list = ds_both.gb(['cat_from_enum', 'cat_from_list']).sum().nums
        correct = FastArray([0, 7, 1, 2, 9, 6, 3], dtype=np.int64)
        self.assertTrue(
            self.array_equal(correct, enum_then_list),
            msg=f"Incorrect sum when grouping by enum, list categoricals.",
        )

        # by list, enum
        list_then_enum = ds_both.gb(['cat_from_list', 'cat_from_enum']).sum().nums
        correct = FastArray([0, 1, 2, 3, 9, 6, 7], dtype=np.int64)
        self.assertTrue(
            self.array_equal(correct, list_then_enum),
            msg=f"Incorrect sum when grouping by list, enum categoricals.",
        )
Beispiel #7
0
    def test_empty_category(self):
        """Check what each reduction returns for a category that has no rows.

        The values are drawn only from 'a', 'b', 'd', 'e' while the categories also
        list 'c', so the third bin (index 2) is empty. ``sum``/``nansum`` are expected
        to give 0.0 for it; every other listed reduction is expected to give NaN.
        """
        # 5/16/2019 invalid category must be in uniques
        # c = Categorical(str_fa_with_invalid, complete_unique_cats, invalid='invalid')
        # can test empty bin like this, the third result will be empty
        c = Categorical(np.random.choice(['a', 'b', 'd', 'e'], 30),
                        ['a', 'b', 'c', 'd', 'e'])
        empty_result = [
            ('sum', 0.0),
            ('mean', np.nan),
            ('min', np.nan),
            ('max', np.nan),
            ('var', np.nan),
            ('std', np.nan),
            ('nansum', 0.0),
            ('nanmean', np.nan),
            ('nanmin', np.nan),
            ('nanmax', np.nan),
            ('nanvar', np.nan),
            ('nanstd', np.nan),
        ]

        for func_name, expected in empty_result:
            func = getattr(c, func_name)
            result = func(ds_nums).floats[2]

            if np.isnan(expected):
                # NaN never compares equal to itself, so self-inequality is the NaN check.
                assert result != result, \
                    f"Did not produce correct result for empty category after {func_name} operation."
            else:
                assert result == expected, \
                    f"Did not produce correct result for empty category after {func_name} operation."
Beispiel #8
0
    def test_pre_filter(self):
        """Check that a filter passed at construction is not kept and is reflected in sums.

        After constructing with ``filter=even_filter`` the ``_filter`` attribute must be
        ``None``, and the grouped sum of the ``tens`` column must total 150.
        """
        c = Categorical(str_fa, filter=even_filter)
        # Identity check: `== None` on an array-like would compare elementwise.
        assert c._filter is None

        result = c.sum(ds_nums)
        one_fifty = sum(result.tens)
        assert one_fifty == 150
Beispiel #9
0
    def test_specify_gb_data(self):
        """Smoke test: ``Categorical.sum`` accepts its data in several container shapes.

        Nothing is asserted about the results; the test passes when no call raises.
        """
        str_col = ['a', 'a', 'b', 'c', 'a']
        num_col = [10, 10, 20, 30, 10]
        col1 = np.arange(5)
        col2 = np.arange(5)
        small_ds = Dataset({
            'str_col': str_col,
            'num_col': num_col,
            'col1': col1,
            'col2': col2,
        })
        ds_to_operate_on = small_ds[['col1', 'col2']]

        cat = Categorical(str_col)

        # Each entry is the positional-argument tuple for one call to cat.sum().
        call_shapes = [
            (ds_to_operate_on,),            # dataset
            ([col1, col2],),                # list
            ((col1, col2),),                # tuple
            ({'a': col1, 'b': col2},),      # dict
            (col1, col2),                   # multiple positional arrays
        ]
        for call_args in call_shapes:
            cat.sum(*call_args)
Beispiel #10
0
 def test_as_matrix_metadata(self):
     """Check the matrix and per-column metadata returned by ``dataset_as_matrix``."""
     error_tol = 0.00001
     ds = Dataset({
         'A': ['EXCH1', 'EXCH2', 'EXCH1', 'EXCH3', 'EXCH3'],
         'B': [-1.6, 2.7, 4.6, 5.7, 8.9],
         'C': Categorical([0, 0, 1, 0, 2], ['CPTYA', 'CPTYB', 'CPTYC']),
     })
     X, X_data = dataset_as_matrix(ds)

     # The matrix is a plain ndarray with the dataset's shape.
     self.assertIsInstance(X, numpy.ndarray)
     self.assertEqual(X.shape[0], ds.shape[0])
     self.assertEqual(X.shape[1], ds.shape[1])  # we may break this later

     # Metadata keeps each column's dtype ...
     self.assertEqual(X_data['A']['dtype'], ds.A.dtype)
     self.assertEqual(X_data['B']['dtype'], ds.B.dtype)
     self.assertEqual(X_data['C']['dtype'], ds.C.dtype)
     # ... and flags only the Categorical column as categorical.
     for col_name, expected_flag in (('A', False), ('B', False), ('C', True)):
         self.assertEqual(X_data[col_name]['is_categorical'], expected_flag)

     # Column 0 (strings) and column 2 (Categorical) must come back as these codes.
     a_codes_close = numpy.abs(X[:, 0] - numpy.array([0., 1., 0., 2., 2.])) < error_tol
     self.assertTrue(a_codes_close.all(), msg=f"got {X[:, 0]}")
     c_codes_close = numpy.abs(X[:, 2] - numpy.array([0, 0, 1, 0, 2])) < error_tol
     self.assertTrue(c_codes_close.all(), msg=f"got {X[:, 2]}")

     # Indexing the stored category values by the codes must reproduce column A.
     a_values = X_data['A']['category_values']
     self.assertTrue(
         (a_values[numpy.array([0, 1, 0, 2, 2])] == ds.A).all(),
         msg=f"X_data {X_data['A']['category_values'][numpy.array([0, 1, 0, 2, 2])]}\nds.A {ds.A}"
     )
Beispiel #11
0
 def test_multikey_count(self):
     """Check group counts for a four-key Categorical built from repeated string/int keys."""
     keys = [str_fa.copy(), int_fa.copy(), str_fa.copy(), int_fa.copy()]
     result_counts = Categorical(keys).count().Count
     correct_counts = FastArray([6, 5, 1, 2, 3, 2, 2, 4, 2, 2, 1])
     assert bool(np.all(result_counts == correct_counts)), \
         f"Incorrect result for multikey count for 4 keys. {result_counts} vs. {correct_counts}"
Beispiel #12
0
 def test_groupby_ops_mapping(self):
     """Run the shared groupby comparisons on a Categorical built from codes plus a name->int mapping."""
     # Convert the enum's members into a plain {name: int} dictionary.
     mapping = {name: int(member) for name, member in str_enum.__members__.items()}
     cat = Categorical(sorted_codes, mapping)
     self.funnel_all_tests(cat, gbu, "index + mapping dictionary", sorted=False)
Beispiel #13
0
    def test_groupby_ops_user_codes_base_0(self):
        """Run the shared groupby comparisons on base-0 Categoricals built via ``Categorical`` and ``CatZero``."""
        label = "index + categories + base_index 0"

        explicit_base = Categorical(sorted_codes.copy(),
                                    categories=complete_unique_cats,
                                    base_index=0)
        self.funnel_all_tests(explicit_base, gb, label)

        via_catzero = CatZero(sorted_codes.copy(), categories=complete_unique_cats)
        self.funnel_all_tests(via_catzero, gb, label)
Beispiel #14
0
 def test_total_sizes_with_categorical(self):
     """Check ``Struct.total_sizes`` when the same Categorical is stored under two names."""
     st = Struct({'c': Categorical(['aa', 'bb', 'cc', 'dd'])})
     st.d = st.c  # second reference to the same Categorical
     physical, logical = st.total_sizes
     # The asserted relationship: logical size is twice the physical size.
     self.assertEqual(physical, logical // 2)
     # Physical size must cover at least the index array plus the category array.
     minimum_bytes = np.asarray(st.c).nbytes + st.c.category_array.nbytes
     self.assertGreaterEqual(physical, minimum_bytes)
    def test_categorical_dict_key_completion(self):
        """Check that IPython key completion offers every key of a dictionary-built Categorical."""
        shell = get_ipython()
        run_completion = shell.Completer.complete

        shell.user_ns["cat"] = Categorical(CODES, decision_dict)
        _, matches = run_completion(line_buffer="cat['")
        for key in decision_dict:
            self.assertIn(key, matches)
Beispiel #16
0
    def test_groupby_ops_string_list_cats(self):
        """Run the groupby comparisons and the set-item helper for strings plus explicit categories."""
        label = "string list + categories"

        cat = Categorical(str_fa, complete_unique_cats)
        self.funnel_all_tests(cat, gb, label)

        self.simple_string_set_item(str_fa,
                                    categories=complete_unique_cats,
                                    constructor_name=label)
Beispiel #17
0
    def test_groupby_ops_user_codes_base_1(self):
        """Run the groupby comparisons and the set-item helper for base-1 codes plus categories."""
        label = "index + categories + base_index 1"

        # Codes are shifted by one to match base_index=1.
        cat = Categorical(sorted_codes + 1, complete_unique_cats, base_index=1)
        self.funnel_all_tests(cat, gb, label)

        self.simple_string_set_item(sorted_codes.copy(),
                                    categories=complete_unique_cats,
                                    base_index=1,
                                    constructor_name=label)
Beispiel #18
0
 def do_draw(self, data):
     """Draw values from ``self.value_strategy`` and wrap them in a Categorical.

     StringArray mode stringifies the drawn values and passes explicit categories only
     when ``self.with_categories`` is set. Dictionary mode always passes a category
     dictionary built by ``self._construct_dict``. Any other mode raises ``ValueError``.
     """
     mode = self.category_mode

     if mode == CategoryMode.StringArray:
         values = [str(v) for v in data.draw(self.value_strategy)]
         categories = None
         if self.with_categories:
             categories = [str(v) for v in set(values)]
         return Categorical(values, categories=categories, ordered=self.ordered)

     if mode == CategoryMode.Dictionary:
         values = data.draw(self.value_strategy)
         category_dict = self._construct_dict(data, values)
         return Categorical(values, categories=category_dict, ordered=self.ordered)

     raise ValueError(
         f"{self._CN}.do_draw: unhandled category mode {self.category_mode}\n\t{self}"
     )
    def test_categorical_numeric_array_key_completion(self):
        """Check that IPython key completion offers each numeric category as a string."""
        shell = get_ipython()
        run_completion = shell.Completer.complete

        lst = [1, 44, 44, 133, 75]  # type: List[int]
        shell.user_ns["cat"] = Categorical(FastArray(lst))
        _, matches = run_completion(line_buffer="cat['")
        # Completions are text, so compare against the stringified numbers.
        for number in lst:
            self.assertIn(str(number), matches)
Beispiel #20
0
 def test_groupby_ops_multikey_list(self):
     """Run the shared groupby comparisons on a two-key Categorical built from a list of arrays."""
     # Reference grouping: the same two string keys grouped through a Dataset.
     reference_ds = Dataset({
         'string1': str_fa.copy(),
         'string2': str_fa.copy(),
         'ints': int_fa,
         'floats': flt_fa,
         'tens': tens,
     })
     # The key list is built first, then the reference grouping, then the Categorical.
     mk_list = [str_fa.copy(), str_fa.copy()]
     mk_gb = reference_ds.gbu(['string1', 'string2'])
     cat = Categorical(mk_list)
     self.funnel_all_tests(cat, mk_gb, "multikey list", sorted=False)
 def test_categorical_string_array_key_completion(self):
     """Check that IPython key completion offers every string of a string-array Categorical."""
     shell = get_ipython()
     run_completion = shell.Completer.complete
     lst = ['a', 'b', 'c', 'c', 'd', 'a', 'b']  # type: List[str]
     cat = Categorical(FastArray(lst), ordered=True, base_index=1, filter=None)
     shell.user_ns["cat"] = cat
     _, matches = run_completion(line_buffer="cat['")
     for label in lst:
         self.assertIn(label, matches)
    def test_categorical_multi_key_completion(self):
        """Check that IPython key completion offers every entry of both input lists."""
        shell = get_ipython()
        run_completion = shell.Completer.complete

        # note - 'e' is not in first list
        lst1 = ['b', 'a', 'a', 'c', 'a', 'b']  # type: List[str]
        lst2 = ['b', 'a', 'c', 'e']  # type: List[str]
        shell.user_ns["cat"] = Categorical(lst1, lst2, sort_gb=True)
        _, matches = run_completion(line_buffer="cat['")
        # Entries of the first list are checked first, then those of the second.
        for label in lst1 + lst2:
            self.assertIn(label, matches)
Beispiel #23
0
    def test_cumcount_vs_gb(self):
        """Compare ``Categorical.cumcount`` with ``GroupBy.cumcount`` and check filtered output."""
        keys = np.random.choice(['a', 'b', 'c', 'd', 'e'], 50)
        ds = Dataset({'keycol': keys, 'col1': arange(50), 'col2': arange(50)})
        gb_result = ds.gb('keycol').cumcount()

        cat = Categorical(ds.keycol)
        cat_result = cat.cumcount()

        # The two cumulative counts must not differ in total.
        assert sum(gb_result - cat_result) == 0

        # Mask that is True on odd row positions.
        odd_rows = logical(arange(50) % 2)
        filtered = cat.cumcount(filter=odd_rows)
        # Rows kept by the filter get a value; filtered-out rows must be NaN.
        assert bool(np.all(isnotnan(filtered[odd_rows])))
        assert bool(np.all(isnan(filtered[~odd_rows])))
Beispiel #24
0
    def test_gb_labels_enum(self):
        """Check that Categorical and Dataset groupby produce matching key labels for a mapped Categorical."""
        # make sure enum groupby keys are displayed as string,  not integer code
        mapping = {'a': 30, 'b': 20, 'c': 10}
        cat = Categorical([10, 10, 10, 20, 30, 20, 10, 20, 20], mapping)
        cat_counts = cat.count()
        cat_labels = cat_counts[cat_counts.label_get_names()][0]

        ds = Dataset({'catcol': cat, 'data': arange(9)})
        ds_counts = ds.gbu('catcol').count()
        ds_labels = ds_counts[ds_counts.label_get_names()][0]

        # Same dtype kind and same label values on both paths.
        assert cat_labels.dtype.char == ds_labels.dtype.char
        assert bool(np.all(cat_labels == ds_labels))
Beispiel #25
0
    def test_as_categorical(self):
        """Check that ``GroupBy.as_categorical`` sums like the GroupBy and like a direct Categorical."""
        ds = Dataset({
            'keycol1': np.random.choice(['a', 'b', 'c'], 30),
            'keycol2': np.random.choice(['a', 'b', 'c'], 30),
            'data': np.random.rand(30),
        })

        grouped = ds.gbu('keycol1')
        direct_cat = Categorical(ds.keycol1, ordered=False, sort_gb=False)
        converted_cat = grouped.as_categorical()

        grouped_sum = grouped.sum()
        direct_sum = direct_cat.sum(ds.data)
        converted_sum = converted_cat.sum(ds.data)

        # Every column of the GroupBy result must match on both Categorical paths.
        for name, expected_col in grouped_sum.items():
            assert bool(np.all(direct_sum[name] == expected_col))
            assert bool(np.all(converted_sum[name] == expected_col))
Beispiel #26
0
    def test_projections(self):
        """Compare GroupBy, multikey Categorical, and pandas groupby on a large seeded dataset.

        Two 1,000,000-row datasets (trades and quotes) are generated after seeding
        NumPy's global RNG with 1234. Trades are grouped by four keys through both
        ``Dataset.groupby`` and a multikey ``Categorical``; the test checks the
        shapes of ``sum()``/``count()`` and that the per-row projection of the
        grouped ``Size`` sum agrees across GroupBy, Categorical, and pandas.

        NOTE: the hard-coded shapes below depend on the seed and on the order of
        the ``np.random`` calls, so the statements must not be reordered.
        """
        num_rows_trade = 1_000_000
        num_symbols = 450
        Trade_Dates = [
            '20180602', '20180603', '20180604', '20180605', '20180606'
        ]
        Exchanges = np.array(['EXCH1', 'EXCH2', 'EXCH3'])
        # Fixed seed: every random draw below is reproducible.
        np.random.seed(1234)
        # Trade dataset: rows are split into equal consecutive runs per trade date,
        # and 'Time' restarts from 0 within each run.
        ds = Dataset({
            'SymbolID':
            np.random.randint(0, num_symbols, size=num_rows_trade),
            'Exchange':
            Exchanges[np.random.randint(0,
                                        Exchanges.shape[0],
                                        size=num_rows_trade)],
            'Trade_Date': [
                Trade_Dates[int(i * len(Trade_Dates) / num_rows_trade)]
                for i in range(num_rows_trade)
            ],
            'Time': [
                int(i % (num_rows_trade / len(Trade_Dates)))
                for i in range(num_rows_trade)
            ],
            'Price':
            100 * (1.0 + 0.0005 * np.random.randn(num_rows_trade)),
            'Size':
            10 *
            np.array(1 + 30 * np.random.rand(num_rows_trade), dtype=np.int64),
        })
        num_rows_quote = 1_000_000
        # Quote dataset: ds2 is not referenced again in this test, but building it
        # consumes random numbers and so affects the draws that follow.
        ds2 = Dataset({
            'SymbolID':
            np.random.randint(0, num_symbols, size=num_rows_quote),
            'Exchange':
            Exchanges[np.random.randint(0,
                                        Exchanges.shape[0],
                                        size=num_rows_quote)],
            'Trade_Date': [
                Trade_Dates[int(i * len(Trade_Dates) / num_rows_quote)]
                for i in range(num_rows_quote)
            ],
            'Time': [
                int(i % (num_rows_quote / len(Trade_Dates)))
                for i in range(num_rows_quote)
            ],
            'Bid':
            100 * (1.0 - 0.001 + 0.0005 * np.random.randn(num_rows_quote)),
            'Ask':
            100 * (1.0 + 0.001 + 0.0005 * np.random.randn(num_rows_quote)),
        })
        # Boolean column, True for roughly 75% of rows (the column name's spelling is
        # part of the runtime key and is left as-is).
        threshold = Dataset(
            {'Is_Below_Thresdhold': np.random.rand(num_rows_quote) < 0.75})
        # Bucket each trade's time into 2500-unit bins.
        trade_time = Dataset({'time_2500': (ds.Time / 2500).astype(int)})
        trades = Dataset({}).concat_columns([ds, trade_time], do_copy=False)

        # Create GroupBy and corresponding Categorical
        trade_gb = trades.groupby(
            ['SymbolID', 'Exchange', 'Trade_Date', 'time_2500'])
        trade_cat = Categorical(
            [ds.SymbolID, ds.Exchange, ds.Trade_Date, trade_time.time_2500])

        # Call sum() and count()
        self.assertEqual(trade_gb.sum().shape, (455654, 7))
        self.assertEqual(trade_cat.sum(ds).shape, (455654, 7))
        self.assertEqual(trade_gb.count().shape, (455654, 5))
        # 8/24/2018 SJK - multikey categorical groupby now returns multiple columns for groupby keys
        self.assertEqual(trade_cat.count().shape, (455654, 5))
        # mean group count * number of groups must recover the total row count
        b1 = trade_gb.count().Count.mean()
        b1c = trade_cat.count().Count.mean()
        b2 = trade_gb.count().shape[0]
        self.assertAlmostEqual(ds.shape[0], b1 * b2, places=5)
        self.assertAlmostEqual(ds.shape[0], b1c * b2, places=5)

        # Create ds augmented with filtered ID
        trade_ds = Dataset({'ID': trade_gb.grouping.ikey})
        # NOTE(review): multiplying by the boolean column presumably zeroes the rows
        # where the flag is False - confirm against Dataset arithmetic semantics.
        trade_ds_below_threshold = ds * threshold.Is_Below_Thresdhold
        trade_ds_below_thresholdb = Dataset.concat_columns(
            [trade_ds_below_threshold, trade_ds], do_copy=False)

        # Create trade_ds size projection using GroupBy
        trade_gb_id = trade_ds_below_thresholdb.groupby('ID')
        trade_sizes_ds = trade_gb_id['Size'].sum()
        # NOTE(review): 'ID - 1' presumably converts 1-based group ids into 0-based
        # positions in the summed result - TODO confirm.
        trade_size_ds = trade_sizes_ds.Size[trade_ds_below_thresholdb.ID - 1]
        self.assertEqual(trade_size_ds.shape[0], ds.shape[0])

        # Create trade_ds size projection using Categorical
        trade_sizes_cat_ds = trade_cat.sum(trade_ds_below_thresholdb.Size)
        trade_size_cat_ds = trade_sizes_cat_ds.Size[trade_cat - 1]
        self.assertArrayAlmostEqual(trade_size_ds, trade_size_cat_ds, places=6)

        # Create trade_ds size projection using Pandas groupby
        ptrade_ds_below_thresholdb = dataset_as_pandas_df(
            trade_ds_below_thresholdb)
        ptrade_gb_id = ptrade_ds_below_thresholdb.groupby('ID')
        trade_sizes_pd_ds = ptrade_gb_id.sum()
        # ngroup() gives each row's group number, used here to index the summed sizes.
        trade_size_pd_ds = trade_sizes_pd_ds.Size.values[ptrade_gb_id.ngroup()]
        self.assertArrayAlmostEqual(trade_size_ds, trade_size_pd_ds, places=6)
Beispiel #27
0
    np.int8,
    np.uint8,
    np.int16,
    np.uint16,
    np.int32,
    np.uint32,
    np.int64,
    np.uint64,
    np.float32,
    np.float64,
]
arr_types_string = [np.bytes_, np.str_]
# Module-level fixture: one column per dtype, keyed by the dtype's name.
# The builtin `bool` is used as the dtype because the `np.bool` alias was deprecated
# in NumPy 1.20 and removed in 1.24; `bool` works on every NumPy version.
test_data = {'bool': np.array([True, False, True, False, True], dtype=bool)}
for dt in arr_types + arr_types_string:
    test_data[dt.__name__] = np.array(num_list, dtype=dt)
test_data['categorical'] = Categorical([str(i) for i in num_list])
all_headers = list(test_data.keys())
ds = Dataset(test_data)
# Names of the groupby reductions exercised by the tests.
gb_funcs = ['sum', 'mean', 'first', 'last', 'median', 'min', 'max', 'var']
gb_nan_funcs = ['nansum', 'nanmean', 'nanmedian',
                'nanvar']  #'rolling', 'cumsum', 'nth'


class Groupby_Test(unittest.TestCase):
    def test_math_ops_same_return(self):
        result_dict = {
            'sum': [5, 10],
            'nansum': [5, 10],
            'median': [2.5, 3],
            # TODO: add support for min / max on strings
            'min': [1, 2],
Beispiel #28
0
def categorical_stringarray(
    draw,
    max_length: int,
    max_categories: int,
    *,
    endianness: str = '=',
    min_str_len: int = 1,
    max_str_len: int = 16,
    unicode: Optional[bool] = None,
    ordered: Optional[bool] = None,
) -> Categorical:
    """
    Strategy for creating StringArray-mode Categoricals.

    An array of unique category labels is drawn first, followed by an integer index
    into it. The Categorical is then built either from the index plus the unique
    labels, or from the labels expanded through the index.

    Parameters
    ----------
    draw
        Callable used to draw values from other strategies.
    max_length : int
        Upper bound on the length of the drawn index array.
    max_categories : int
        Upper bound on the number of unique category labels.
    endianness : str
        Forwarded to the string dtype strategy.
    min_str_len : int
        Minimum label length, forwarded to the dtype and label strategies.
    max_str_len : int
        Maximum label length, forwarded to the dtype and label strategies.
    unicode : bool, optional
        Use Unicode labels when True, byte-string labels when False. Drawn when None.
    ordered : bool, optional
        Value of the Categorical's ``ordered`` argument. Drawn when None.

    Returns
    -------
    Categorical

    Notes
    -----
    TODO: Make sure to include the case where we have category values (in the underlying integer array)
          past the end of the categories array. (Or is that only for a Dictionary mode categorical?)
          To clarify -- this is the behavior where, when we print the Categorical, we get entries like <!456>.

    TODO: Also exercise (in one way or another) the following arguments to the Categorical constructor:
        * base_index
            Add an optional boolean parameter. When None, draw a boolean to fill it in.
            When the bool is false, call rt.Cat() with base_index=0.
            When True, call rt.Cat() with base_index=1.
        * dtype
            Call the ctor with dtype=None or a signed integer dtype that's either the min size given the
            number of categories or any larger signed integer dtype.
            E.g. if len(categories) == 1000, draw from { None, np.int16, np.int32, np.int64 }
        * filter
            Add an optional boolean param to the strategy which defaults to None, in which case we'll fill it by drawing a boolean.
            When the bool is false we call rt.Cat() with filter=None.
            When True, we create a boolean array the same length as our values or fancy index and pass that as the filter.

    TODO: Support slicing/strides on the values/categories arrays passed to the Categorical constructor.

    TODO: When creating the fancy index array and we've drawn 'explicit_categories=True', allow the fancy index to be created
          with any applicable integer type (signed or unsigned) whose range is large enough to index into the categories array.
          (Or, should we just allow _any_ integer dtype, even if too small? We wouldn't be able to index categories past the
          range of the dtype, but maybe that's an interesting thing to test? Especially around cases like having auto_add=True.)
    """
    # Decide first how the data reaches the Categorical constructor: it is one of the
    # choices most likely to affect behavior, and shrinking (in some cases) works better
    # when such values are drawn early.
    explicit_categories: bool = draw(st.booleans())
    event(
        'Categorical created from unique category array and fancy index.'
        if explicit_categories else
        'Categorical created from non-unique array of strings.'
    )

    # Settle on byte (ascii) vs. Unicode labels, then draw a matching string dtype.
    if unicode is None:
        is_unicode: bool = draw(st.booleans())
    else:
        is_unicode = unicode
    string_dtypes = unicode_string_dtypes if is_unicode else byte_string_dtypes
    labels_dtype = draw(string_dtypes(endianness=endianness, min_len=min_str_len, max_len=max_str_len))

    # Draw the 1-D array of unique category labels.
    unique_labels = draw(
        arrays(
            dtype=labels_dtype,
            shape=array_shapes(max_dims=1, max_side=max_categories),
            elements=category_labels(min_str_len, max_str_len, unicode=is_unicode),
            unique=True,
        )
    )

    # Draw an integer (fancy) index into the unique labels via integer_array_indices;
    # applying it lets each label occur zero or more times.
    fancy_index = draw(
        integer_array_indices(
            shape=unique_labels.shape,
            result_shape=array_shapes(max_dims=1, max_side=max_length),
        )
    )

    # Pin down a concrete 'ordered' value before building the Categorical.
    if ordered is None:
        is_ordered = draw(st.booleans())
    else:
        is_ordered = ordered

    if not explicit_categories:
        # Expand the unique labels through the index and build from the expanded values.
        values = unique_labels[fancy_index]
        return Categorical(values, ordered=is_ordered, unicode=is_unicode)

    # Otherwise pass the index and the unique labels to the constructor separately.
    return Categorical(fancy_index, categories=unique_labels, ordered=is_ordered, unicode=is_unicode)
def test_falsifying_categorical_ctor(data):
    """Build a Categorical from ``data``; the test passes when construction does not raise."""
    _ = Categorical(data)
Beispiel #30
0
def categorical_dictmode(
    draw,
    max_length: int,
    max_categories: int,
    *,
    endianness: str = '=',
    min_str_len: int = 1,
    max_str_len: int = 16,
    unicode: Optional[bool] = None,
    ordered: Optional[bool] = None,
) -> Categorical:
    """
    Strategy for creating Dictionary-mode Categoricals.

    This strategy currently only covers creating `Categorical` instances with
    string-typed category labels.

    The Categorical is built from a ``{label: integer value}`` dictionary of unique
    labels and unique integer values, plus an array of values produced by indexing
    the unique values with a drawn integer (fancy) index.

    Parameters
    ----------
    draw
        Callable used to draw a concrete value from a strategy.
        NOTE(review): presumably injected by a ``hypothesis.strategies.composite``
        decorator that is not visible in this chunk -- confirm.
    max_length : int
        Maximum side length of the 1-D fancy index, i.e. the upper bound on the
        number of elements in the resulting Categorical.
    max_categories : int
        Maximum side length of the 1-D arrays of unique category values and labels.
        Also determines the integer dtype used for the category values.
    endianness : str
        Passed through to the string-dtype strategies used for the category labels.
    min_str_len : int
        Minimum label length; passed to the string-dtype and label strategies.
    max_str_len : int
        Maximum label length; passed to the string-dtype and label strategies.
    unicode : bool, optional
        Whether the labels are unicode (True) or byte/ascii (False) strings.
        When None, a boolean is drawn.
    ordered : bool, optional
        Value for the ``ordered`` argument of the `Categorical` constructor.
        When None, a boolean is drawn.

    Returns
    -------
    Categorical
        Created as ``Categorical(values, categories=<dict>, ordered=..., unicode=...)``.

    Examples
    --------
    >>> categorical_dictmode(10_000, 1_000, max_str_len=20).example()
    0, 0, 0, 0, 0

    Notes
    -----
    TODO: Make sure to include the case where we have category values (in the underlying integer array)
          past the end of the categories array. (Or is that only for a Dictionary mode categorical?)
          To clarify -- this is the behavior where, when we print the Categorical, we get entries like <!456>.

    TODO: Also exercise (in one way or another) the following arguments to the Categorical constructor:
        * base_index
            Add an optional boolean parameter. When None, draw a boolean to fill it in.
            When the bool is false, call rt.Cat() with base_index=0.
            When True, call rt.Cat() with base_index=1.
        * dtype
            Call the ctor with dtype=None or a signed integer dtype that's either the min size given the
            number of categories or any larger signed integer dtype.
            E.g. if len(categories) == 1000, draw from { None, np.int16, np.int32, np.int64 }
        * filter
            Add an optional boolean param to the strategy which defaults to None, in which case we'll fill it by drawing a boolean.
            When the bool is false we call rt.Cat() with filter=None.
            When True, we create a boolean array the same length as our values or fancy index and pass that as the filter.

    TODO: Support slicing/strides on the values/categories arrays passed to the Categorical constructor.

    TODO: Does a Dictionary-mode Categorical allow any other types (e.g. rt.Date) to be used for the category labels?
        If so, these should also be covered by this strategy (though changes will be needed to allow a variety of
        types to be used for category labels).

    TODO: Any possible issues (that we might want to exercise in this strategy) between the string used when displaying
        the invalid category (e.g. 'Inv') and category labels? What happens if we have a category label using the same string?
    """
    # Draw a boolean indicating whether we'll use a signed or unsigned integer dtype.
    use_signed_integer_dtype: bool = draw(st.booleans())

    # If using a signed integer dtype, draw another boolean indicating whether we'll
    # generate negative category values.
    allow_negative_category_values: bool = draw(st.booleans()) if use_signed_integer_dtype else False
    if use_signed_integer_dtype:
        if allow_negative_category_values:
            event('Categorical may have a mix of negative, zero, and positive category values.')
        else:
            event('Categorical has only non-negative category values.')

    # If the 'unicode' flag is not set, draw a boolean to fill it in.
    # This is drawn exactly once and reused for the label dtype, the label strategy and the
    # Categorical constructor, so that the event recorded here always matches what is generated.
    is_unicode: bool = draw(st.booleans()) if unicode is None else unicode
    event(f'Category labels are {"unicode" if is_unicode else "ascii"} strings.')

    # If the 'ordered' flag is not set, draw a boolean for it now so we have a concrete value
    # to use when creating the categorical.
    is_ordered = draw(st.booleans()) if ordered is None else ordered
    event(f'ordered = {is_ordered}')

    # Draw the dtype for the category values.
    # TODO: Draw a signed or unsigned integer dtype here which is at least as large as needed, but perhaps larger
    #       than needed.
    #       For now, we just use the smallest dtype large enough to fit the max number of categories; but allowing for
    #       larger (randomly-selected) dtypes later will help ensure we test cases where there are non-consecutive
    #       category values even when the max_categories value is near the max value of a dtype.
    # NOTE(review): np.min_scalar_type() yields an unsigned dtype for a non-negative argument, so
    #       'use_signed_integer_dtype' / 'allow_negative_category_values' currently only affect the
    #       events recorded above, not the generated values -- confirm whether a signed dtype was intended.
    values_dtype = np.min_scalar_type(max_categories)

    # Create the strategy for the category values (integer values representing the categories).
    values_dtype_info = np.iinfo(values_dtype)
    min_cat_value = values_dtype_info.min if allow_negative_category_values else 0
    values_strat = st.integers(min_value=min_cat_value, max_value=values_dtype_info.max)

    # Create an array of unique category values/codes.
    cats_shapes = array_shapes(max_dims=1, max_side=max_categories)
    unique_cat_values = draw(arrays(dtype=values_dtype, shape=cats_shapes, elements=values_strat, unique=True))

    # Draw the string dtype for the labels based on whether we want a byte (ascii) string or Unicode.
    if is_unicode:
        labels_dtype = draw(unicode_string_dtypes(endianness=endianness, min_len=min_str_len, max_len=max_str_len))
    else:
        labels_dtype = draw(byte_string_dtypes(endianness=endianness, min_len=min_str_len, max_len=max_str_len))

    # Create an array of unique category labels; this must be the same shape as the unique category values array.
    category_label_strat = category_labels(min_str_len, max_str_len, unicode=is_unicode)
    unique_labels =\
        draw(arrays(dtype=labels_dtype, shape=unique_cat_values.shape, elements=category_label_strat, unique=True))

    # TODO: Draw a slice (or None) that we'll apply to both arrays of uniques (the labels and values)
    #   before using them to create the category dictionary.
    #   This allows us to cover cases where a category value isn't in the dictionary.

    # Combine the unique category labels and values to create a dictionary.
    category_dict = dict(zip(unique_labels, unique_cat_values))

    # Use integer_array_indices to create a fancy index into the array of unique values.
    # Apply it to expand the array of unique values into an array where those values may occur zero or more times.
    fancy_index_shapes = array_shapes(max_dims=1, max_side=max_length)
    fancy_index = draw(integer_array_indices(shape=unique_cat_values.shape, result_shape=fancy_index_shapes))

    # Apply the fancy index to the array of unique category values to produce an
    # array where each category appears zero or more times; then create the Categorical from that.
    cat_values = unique_cat_values[fancy_index]
    return Categorical(cat_values, categories=category_dict, ordered=is_ordered, unicode=is_unicode)