コード例 #1
0
    def test_specify_gb_data(self):
        str_col = ['a', 'a', 'b', 'c', 'a']
        num_col = [10, 10, 20, 30, 10]
        col1 = np.arange(5)
        col2 = np.arange(5)
        small_ds = Dataset({
            'str_col': str_col,
            'num_col': num_col,
            'col1': col1,
            'col2': col2
        })
        ds_to_operate_on = small_ds[['col1', 'col2']]

        c = Categorical(str_col)

        # dataset
        d = c.sum(ds_to_operate_on)

        # single
        # list
        d = c.sum([col1, col2])

        # tuple
        d = c.sum((col1, col2))

        # dict
        d = c.sum({'a': col1, 'b': col2})

        # multiple
        d = c.sum(col1, col2)
コード例 #2
0
    def test_pre_filter(self):
        c = Categorical(str_fa, filter=even_filter)
        assert c._filter == None

        result = c.sum(ds_nums)
        one_fifty = sum(result.tens)
        assert one_fifty == 150
コード例 #3
0
    def test_as_categorical(self):
        ds = Dataset({
            'keycol1': np.random.choice(['a', 'b', 'c'], 30),
            'keycol2': np.random.choice(['a', 'b', 'c'], 30),
            'data': np.random.rand(30),
        })

        gbu = ds.gbu('keycol1')
        c = Categorical(ds.keycol1, ordered=False, sort_gb=False)
        cgbu = gbu.as_categorical()

        gbu_result = gbu.sum()
        c_result = c.sum(ds.data)
        cgbu_result = cgbu.sum(ds.data)

        for name, col in gbu_result.items():
            assert bool(np.all(c_result[name] == col))
            assert bool(np.all(cgbu_result[name] == col))
コード例 #4
0
    def test_projections(self):
        num_rows_trade = 1_000_000
        num_symbols = 450
        Trade_Dates = [
            '20180602', '20180603', '20180604', '20180605', '20180606'
        ]
        Exchanges = np.array(['EXCH1', 'EXCH2', 'EXCH3'])
        np.random.seed(1234)
        ds = Dataset({
            'SymbolID':
            np.random.randint(0, num_symbols, size=num_rows_trade),
            'Exchange':
            Exchanges[np.random.randint(0,
                                        Exchanges.shape[0],
                                        size=num_rows_trade)],
            'Trade_Date': [
                Trade_Dates[int(i * len(Trade_Dates) / num_rows_trade)]
                for i in range(num_rows_trade)
            ],
            'Time': [
                int(i % (num_rows_trade / len(Trade_Dates)))
                for i in range(num_rows_trade)
            ],
            'Price':
            100 * (1.0 + 0.0005 * np.random.randn(num_rows_trade)),
            'Size':
            10 *
            np.array(1 + 30 * np.random.rand(num_rows_trade), dtype=np.int64),
        })
        num_rows_quote = 1_000_000
        ds2 = Dataset({
            'SymbolID':
            np.random.randint(0, num_symbols, size=num_rows_quote),
            'Exchange':
            Exchanges[np.random.randint(0,
                                        Exchanges.shape[0],
                                        size=num_rows_quote)],
            'Trade_Date': [
                Trade_Dates[int(i * len(Trade_Dates) / num_rows_quote)]
                for i in range(num_rows_quote)
            ],
            'Time': [
                int(i % (num_rows_quote / len(Trade_Dates)))
                for i in range(num_rows_quote)
            ],
            'Bid':
            100 * (1.0 - 0.001 + 0.0005 * np.random.randn(num_rows_quote)),
            'Ask':
            100 * (1.0 + 0.001 + 0.0005 * np.random.randn(num_rows_quote)),
        })
        threshold = Dataset(
            {'Is_Below_Thresdhold': np.random.rand(num_rows_quote) < 0.75})
        trade_time = Dataset({'time_2500': (ds.Time / 2500).astype(int)})
        trades = Dataset({}).concat_columns([ds, trade_time], do_copy=False)

        # Create GroupBy and corresponding Categorical
        trade_gb = trades.groupby(
            ['SymbolID', 'Exchange', 'Trade_Date', 'time_2500'])
        trade_cat = Categorical(
            [ds.SymbolID, ds.Exchange, ds.Trade_Date, trade_time.time_2500])

        # Call sum() and count()
        self.assertEqual(trade_gb.sum().shape, (455654, 7))
        self.assertEqual(trade_cat.sum(ds).shape, (455654, 7))
        self.assertEqual(trade_gb.count().shape, (455654, 5))
        # 8/24/2018 SJK - multikey categorical groupby now returns multiple columns for groupby keys
        self.assertEqual(trade_cat.count().shape, (455654, 5))
        b1 = trade_gb.count().Count.mean()
        b1c = trade_cat.count().Count.mean()
        b2 = trade_gb.count().shape[0]
        self.assertAlmostEqual(ds.shape[0], b1 * b2, places=5)
        self.assertAlmostEqual(ds.shape[0], b1c * b2, places=5)

        # Create ds augmented with filtered ID
        trade_ds = Dataset({'ID': trade_gb.grouping.ikey})
        trade_ds_below_threshold = ds * threshold.Is_Below_Thresdhold
        trade_ds_below_thresholdb = Dataset.concat_columns(
            [trade_ds_below_threshold, trade_ds], do_copy=False)

        # Create trade_ds size projection using GroupBy
        trade_gb_id = trade_ds_below_thresholdb.groupby('ID')
        trade_sizes_ds = trade_gb_id['Size'].sum()
        trade_size_ds = trade_sizes_ds.Size[trade_ds_below_thresholdb.ID - 1]
        self.assertEqual(trade_size_ds.shape[0], ds.shape[0])

        # Create trade_ds size projection using Categorical
        trade_sizes_cat_ds = trade_cat.sum(trade_ds_below_thresholdb.Size)
        trade_size_cat_ds = trade_sizes_cat_ds.Size[trade_cat - 1]
        self.assertArrayAlmostEqual(trade_size_ds, trade_size_cat_ds, places=6)

        # Create trade_ds size projection using Pandas groupby
        ptrade_ds_below_thresholdb = dataset_as_pandas_df(
            trade_ds_below_thresholdb)
        ptrade_gb_id = ptrade_ds_below_thresholdb.groupby('ID')
        trade_sizes_pd_ds = ptrade_gb_id.sum()
        trade_size_pd_ds = trade_sizes_pd_ds.Size.values[ptrade_gb_id.ngroup()]
        self.assertArrayAlmostEqual(trade_size_ds, trade_size_pd_ds, places=6)