コード例 #1
0
    def test_gb_categoricals(self):
        codes = [1, 44, 44, 133, 75, 75, 75, 1]
        stringlist = ['a', 'b', 'c', 'd', 'e', 'e', 'f', 'g']
        c1 = Categorical(codes, LikertDecision, sort_gb=True)
        c2 = Categorical(stringlist)
        d = {'nums': np.arange(8)}

        # from enum only
        d_enum = d.copy()
        d_enum['cat_from_enum'] = c1
        ds_enum = Dataset(d_enum)
        enum_result = ds_enum.gb('cat_from_enum').sum()
        correct = FastArray([3, 15, 3, 7], dtype=np.int64)
        self.assertTrue(
            self.array_equal(correct, enum_result.nums),
            msg=
            f"Incorrect sum when grouping by enum categorical.\nExpected {correct}\nActual {enum_result.nums}",
        )

        # from list only
        d_list = d.copy()
        d_list['cat_from_list'] = c2
        ds_list = Dataset(d_list)
        list_result = ds_list.gb('cat_from_list').sum()
        correct = FastArray([0, 1, 2, 3, 9, 6, 7], dtype=np.int64)
        self.assertTrue(
            self.array_equal(correct, list_result.nums),
            msg=f"Incorrect sum when grouping by list categorical.",
        )

        d_both = d_enum.copy()
        d_both['cat_from_list'] = c2
        ds_both = Dataset(d_both)

        # by enum, list
        result = ds_both.gb(['cat_from_enum', 'cat_from_list']).sum()
        num_result = result.nums
        correct = FastArray([0, 7, 1, 2, 9, 6, 3], dtype=np.int64)
        self.assertTrue(
            self.array_equal(correct, num_result),
            msg=f"Incorrect sum when grouping by enum, list categoricals.",
        )

        # by list, enum
        result = ds_both.gb(['cat_from_list', 'cat_from_enum']).sum()
        num_result = result.nums
        correct = FastArray([0, 1, 2, 3, 9, 6, 7], dtype=np.int64)
        self.assertTrue(
            self.array_equal(correct, num_result),
            msg=f"Incorrect sum when grouping by list, enum categoricals.",
        )
コード例 #2
0
    def test_nan_funcs(self):
        GroupBy.include_keys = True
        nan_dict = {
            'f32': np.arange(5, dtype=np.float32),
            'f64': np.arange(5, dtype=np.float64),
        }
        nan_dict['f32'][0] = np.nan
        nan_dict['f64'][0] = np.nan
        nan_ds = Dataset(nan_dict)
        # TODO: add when more nan functions have been implemented
        correct = np.array([1.0, 2.0, 3.0, 4.0, 0.0])
        nan_funcs = ['nansum']
        col_names = ['f32', 'f64']

        for c_name in col_names:
            gb = nan_ds.gb(c_name)
            for f_name in nan_funcs:
                call = getattr(gb, f_name)
                result_ds = call()
                for result_name in col_names:
                    column = getattr(result_ds, result_name)
                    # skip groupby column
                    if c_name != result_name:
                        is_correct = bool(np.all(column == correct))
                        self.assertTrue(
                            is_correct,
                            msg=
                            f"Incorrect result for groupby {c_name} in column {result_name} for {f_name} operation.",
                        )
        GroupBy.include_keys = False
コード例 #3
0
    def test_shift(self):
        import pandas as pd

        ds = Dataset({'col_' + str(i): np.random.rand(30) for i in range(5)})
        ds.keycol = np.random.choice(['a', 'b', 'c'], 30)
        df = pd.DataFrame(ds.asdict())

        rt_result = ds.gb('keycol').shift(periods=-2).trim()
        pd_result = df.groupby('keycol').shift(periods=-2).dropna(axis='rows')

        for k, v in rt_result.items():
            self.assertTrue(bool(np.all(v == pd_result[k])))

        rt_result = ds.gb('keycol').shift(periods=3).trim()
        pd_result = df.groupby('keycol').shift(periods=3).dropna(axis='rows')

        for k, v in rt_result.items():
            self.assertTrue(bool(np.all(v == pd_result[k])))
コード例 #4
0
    def test_pad(self):
        arrsize = 100
        numrows = 20
        ds = Dataset({'time': arange(arrsize * 1.0)})
        ds.data = np.random.randint(numrows, size=arrsize)
        ds.data2 = np.random.randint(numrows, size=arrsize)
        symbols = ['ZYGO', 'YHOO', 'FB', 'GOOG', 'IBM']
        ds.symbol2 = Cat(1 + ds.data, list('ABCDEFGHIJKLMNOPQRST'))
        ds.symbol = Cat(1 + arange(arrsize) % len(symbols), symbols)
        ds.time[[3, 4, 7]] = nan
        newds = ds.gb('symbol').pad()
        self.assertTrue(newds.time[7] == 2.00)
        self.assertTrue(newds.time[3] != newds.time[3])
        newds = ds.gb('symbol').backfill()
        self.assertTrue(newds.time[7] == 12.00)

        # see if we can pull a group
        newds = ds.gb('symbol').get_group('YHOO')
        self.assertTrue(np.all(newds.symbol == 'YHOO'))
コード例 #5
0
    def test_iter(self):
        correct_keys = FastArray(['e', 'd', 'b', 'c', 'a'])
        correct_idx = [[0, 1, 4, 7], [2, 9], [3], [5, 6], [8]]
        str_arr = FastArray(['e', 'e', 'd', 'b', 'e', 'c', 'c', 'e', 'a', 'd'])

        gb = Dataset({'keycol': str_arr, 'idxcol': arange(10)})
        gb = gb.gb('keycol')
        for i, tup in enumerate(gb):
            self.assertEqual(tup[0], correct_keys[i])
            self.assertTrue(bool(np.all(tup[1].idxcol == correct_idx[i])))
コード例 #6
0
    def test_transform(self):

        arrsize = 200
        numrows = 7

        ds = Dataset({'time': arange(arrsize * 1.0)})
        ds.data = np.random.randint(numrows, size=arrsize)
        ds.data2 = np.random.randint(numrows, size=arrsize)
        symbols = ['AAPL', 'AMZN', 'FB', 'GOOG', 'IBM']
        ds.symbol = Cat(1 + arange(arrsize) % len(symbols), symbols)
        newds = ds.gb('symbol')['data'].sum(transform=True)

        # removed from test since gbkeys not returned in transform
        # self.assertTrue(np.all(newds.symbol == ds.symbol))
        catds = ds.symbol.sum(ds.data, transform=True)
        self.assertTrue(np.all(newds[0] == catds[0]))
        # test showfilter
        catds = ds.symbol.sum(ds.data, showfilter=True, transform=True)
        self.assertTrue(np.all(newds[0] == catds[0]))

        # test diff
        result1 = ds.gb('symbol').apply_nonreduce(TypeRegister.FastArray.diff)
        result2 = ds.gb('symbol').diff()
        self.assertTrue(result1.equals(result2))
コード例 #7
0
    def test_cumcount_vs_gb(self):
        arr = np.random.choice(['a', 'b', 'c', 'd', 'e'], 50)
        ds = Dataset({'keycol': arr, 'col1': arange(50), 'col2': arange(50)})
        gb_result = ds.gb('keycol').cumcount()

        c = Categorical(ds.keycol)
        c_result = c.cumcount()

        rdiff = gb_result - c_result
        assert sum(rdiff) == 0

        f = logical(arange(50) % 2)
        c_result = c.cumcount(filter=f)
        assert bool(np.all(isnotnan(c_result[f])))
        assert bool(np.all(isnan(c_result[~f])))
コード例 #8
0
    def test_diff(self):
        import pandas as pd

        ds = Dataset({'col_' + str(i): np.random.rand(10) for i in range(5)})
        ds.keycol = np.random.choice(['a', 'b', 'c'], 10)
        df = pd.DataFrame(ds.asdict())

        rt_result = ds.gb('keycol').rolling_diff()
        pd_result = df.groupby('keycol').diff()

        for k, v in rt_result.items():
            pdc = pd_result[k]
            pdcnan = isnan(pdc)
            self.assertTrue(bool(np.all(isnan(v) == pdcnan)), msg=f'{v} {pdc}')

            masked_valid_pd = isnotnan(pdc)
            masked_valid_rt = isnotnan(v)

            self.assertTrue(bool(np.all(masked_valid_pd == masked_valid_rt)))
コード例 #9
0
    49.55,
    65.82,
    85.28,
    61.68,
    72.85,
    91.71,
    61.12,
])
tens = FastArray([10] * 30)
ds = Dataset({
    'strings': str_fa.copy(),
    'ints': int_fa,
    'floats': flt_fa,
    'tens': tens
})
gb = ds.gb('strings')
ds_nums = Dataset({'ints': int_fa, 'floats': flt_fa, 'tens': tens})
data_to_compare = ['ints', 'floats', 'tens']
gbu = ds.gbu('strings')

gb_funcs_L1 = [
    'sum',
    'mean',
    'min',
    'max',
    'var',
    'std',
    'nansum',
    'nanmean',
    'nanmin',
    'nanmax',