Example #1
    def test_ops(self):
        # Build a Dataset with an integer grouping key and numeric columns.
        ds = Dataset({
            'test': arange(300000) % 3,
            'test2': arange(300000.0),
            'test2i': arange(300000),
            'test3': arange(300000) % 3,
        })
        gb = ds.groupby('test')
        # Per-group means of the float and integer columns should agree.
        result = gb.mean()
        self.assertTrue(result.test2[0] == result.test2i[0])
        self.assertTrue(result.test2[1] == result.test2i[1])
        self.assertTrue(result.test3[1] == 1.0)
        # Other reductions should run without raising.
        result = gb.median()
        result = gb.trimbr()
        result = gb.nanmedian()
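The test above assumes riptable's top-level exports (Dataset, arange, and so on) are already in scope. Below is a minimal standalone sketch of the same grouped-reduction pattern, on small illustrative data instead of the test's 300,000-row arrays; the import path is an assumption, not part of the original test.

# Hedged sketch: grouped reductions with riptable (import path assumed).
from riptable import Dataset, arange

ds = Dataset({
    'key': arange(6) % 3,    # grouping column: 0, 1, 2, 0, 1, 2
    'val': arange(6.0),      # numeric column to reduce
})
gb = ds.groupby('key')
print(gb.mean())             # one row per key with the per-group mean of 'val'
print(gb.nanmedian())        # NaN-aware median, as exercised in the test above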
Example #2
    def test_reductions(self):
        message_types = [
            'CREATE',
            'RUN',
            'CREATE',
            'RUN',
            'RUN',
            'RUN',
            'RUN',
            'CANCEL',
            'RUN',
            'RUN',
            'RUN',
            'CANCEL',
        ]
        order_ids = [1, 1, 2, 1, 2, 2, 1, 2, 2, 2, 2, 1]
        seconds = [50, 70, 72, 75, 90, 88, 95, 97, 98, 115, 116, 120]
        shares = [0, 200, 0, 500, 100, 400, 100, 0, 300, 150, 150, 0]
        d2 = dict(
            message_type=message_types,
            order_id=order_ids,
            second=seconds,
            shares=shares,
        )
        dat = Dataset(d2)
        dat = dat[['order_id', 'message_type', 'second', 'shares']]

        # Numeric reduction
        dsr = dat.groupby('order_id').sum()
        self.assertEqual(dsr.shape, (2, 3))
        self.assertArrayEqual(dsr.order_id, [1, 2])
        self.assertArrayEqual(dsr.second, [410, 676])
        self.assertArrayEqual(dsr.shares, [800, 1100])

        # Numeric reduction with all columns returned
        dsr = dat.groupby('order_id', return_all=True).sum()
        self.assertEqual(dsr.shape, (2, 4))
        self.assertEqual(dsr.keys()[1], 'message_type')

        # Order-based reduction
        dsr = dat.groupby('order_id').first()
        self.assertEqual(dsr.shape, (2, 4))
        self.assertArrayEqual(dsr.order_id, [1, 2])
        self.assertArrayEqual(dsr.message_type, ['CREATE', 'CREATE'])
        self.assertArrayEqual(dsr.second, [50, 72])
        self.assertArrayEqual(dsr.shares, [0, 0])

        # Order-based reduction, which returns all columns regardless
        dsr = dat.groupby('order_id', return_all=True).first()
        self.assertEqual(dsr.shape, (2, 4))

        # Order-based reduction with multiple keys
        dsr = dat.groupby(['order_id', 'message_type']).first()
        self.assertEqual(dsr.shape, (6, 4))
        self.assertArrayEqual(dsr.order_id, [1, 1, 1, 2, 2, 2])
        self.assertArrayEqual(
            dsr.message_type,
            ['CANCEL', 'CREATE', 'RUN', 'CANCEL', 'CREATE', 'RUN'])
        self.assertArrayEqual(dsr.second, [120, 50, 70, 97, 72, 90])
        self.assertArrayEqual(dsr.shares, [0, 0, 200, 0, 0, 100])

        # On a subset of columns
        gb = dat.groupby('order_id')
        dsr = gb['shares'].sum()
        self.assertEqual(dsr.shape, (2, 2))
        self.assertArrayEqual(dsr.shares, [800, 1100])

        # Accumulating function
        dsr = dat.groupby('order_id').cumsum()
        self.assertEqual(dsr.shape, (12, 2))
        self.assertArrayEqual(
            dsr.shares,
            [0, 200, 0, 700, 100, 500, 800, 500, 800, 950, 1100, 800])

        # return_all has no effect with accumulating functions
        # 8/23/2018 SJK - changed behavior so return all shows the keys
        dsr = dat.groupby('order_id', return_all=True).cumsum()
        self.assertEqual(dsr.shape, (12, 3))

        # Add cum_shares back to a dataset
        dat['cum_shares'] = dat.groupby('order_id').shares.cumsum().shares
        self.assertEqual(dat.shape, (12, 5))
        self.assertArrayEqual(dat.cum_shares, gb.shares.cumsum().shares)

        # On a subset of columns
        dsr = dat.groupby('order_id')[['shares', 'second']].cumsum()
        self.assertEqual(dsr.shape, (12, 2))
        self.assertArrayEqual(
            dsr.shares,
            [0, 200, 0, 700, 100, 500, 800, 500, 800, 950, 1100, 800])
        self.assertArrayEqual(
            dsr.second,
            [50, 120, 72, 195, 162, 250, 290, 347, 445, 560, 676, 410])

        # On a subset of columns with a filter
        f = FastArray([
            True,
            False,
            True,
            False,
            True,
            False,
            True,
            False,
            True,
            False,
            True,
            False,
        ])
        dsr = dat.groupby('order_id')[['shares', 'second']].cumsum(filter=f)
        self.assertEqual(dsr.shape, (12, 2))
        self.assertArrayEqual(
            dsr.shares, [0, 0, 0, 0, 100, 100, 100, 100, 400, 400, 550, 100])
        self.assertArrayEqual(
            dsr.second,
            [50, 50, 72, 50, 162, 162, 145, 162, 260, 260, 376, 145])

        # On shares and second with filter at groupby construction
        dsr = dat.groupby('order_id', filter=f)[['shares', 'second']].cumsum()
        inv = INVALID_DICT[dsr.shares[0].dtype.num]
        self.assertEqual(dsr.shape, (12, 2))
        self.assertArrayEqual(
            dsr.shares,
            [0, inv, 0, inv, 100, inv, 100, inv, 400, inv, 550, inv])
        self.assertArrayEqual(
            dsr.second,
            [50, inv, 72, inv, 162, inv, 145, inv, 260, inv, 376, inv])

        # Using agg function
        dsr = gb[['second', 'shares']].agg(['sum', 'mean'])
        self.assertEqual(dsr.shape, (2, 2))
        self.assertArrayEqual(dsr.Sum.second, [410, 676])
        self.assertArrayEqual(dsr.Sum.shares, [800, 1100])
        self.assertArrayAlmostEqual(dsr.Mean.second, [82.00, 96.57], places=2)
        self.assertArrayAlmostEqual(dsr.Mean.shares, [160.00, 157.14],
                                    places=2)

        # Check for issue when bracket indexing on groupby
        with open(os.devnull, 'w') as devnull:
            print(gb, file=devnull)
        dsr = gb[['second', 'shares']].agg(['sum', 'mean'])

        # Using different functions on different columns
        dsr = gb.agg({'second': 'sum', 'shares': ['max', 'mean']})
        self.assertEqual(dsr.shape, (2, 3))
        self.assertArrayEqual(dsr.Sum.second, [410, 676])
        self.assertArrayEqual(dsr.Max.shares, [500, 400])
        self.assertArrayAlmostEqual(dsr.Mean.shares, [160.00, 157.14],
                                    places=2)

        # Using numpy functions
        dsr = gb.agg({'second': np.sum, 'shares': [np.max, np.mean]})
        self.assertEqual(dsr.shape, (2, 3))
        self.assertArrayEqual(dsr.Sum.second, [410, 676])
        self.assertArrayEqual(dsr.Max.shares, [500, 400])
        self.assertArrayAlmostEqual(dsr.Mean.shares, [160.00, 157.14],
                                    places=2)

        # Alternate way to add to multiset
        gb = dat.groupby('order_id')
        ms = gb[['shares']].agg(['max', 'mean'])
        ms.Sum = gb[['second']].sum()
        self.assertEqual(ms.shape, (2, 3))
        self.assertArrayEqual(ms.Sum.second, [410, 676])
        self.assertArrayEqual(ms.Max.shares, [500, 400])
        self.assertArrayAlmostEqual(ms.Mean.shares, [160.00, 157.14], places=2)
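The reductions above follow a few recurring patterns: column selection with gb[...], per-column aggregation via agg, and per-group accumulation with an optional boolean filter. The sketch below reproduces those patterns on toy data; it assumes the same agg and cumsum(filter=...) signatures the test itself exercises.

# Hedged sketch: agg and filtered cumsum on toy data (signatures assumed
# to match the ones exercised in the test above).
from riptable import Dataset, FastArray

dat = Dataset({
    'order_id': [1, 1, 2, 2, 2],
    'second':   [50, 70, 72, 90, 88],
    'shares':   [0, 200, 100, 400, 300],
})
gb = dat.groupby('order_id')

# Same function list for every selected column.
ms = gb[['second', 'shares']].agg(['sum', 'mean'])
print(ms.Sum.shares)     # per-group sums of 'shares'

# Different functions per column.
ms2 = gb.agg({'second': 'sum', 'shares': ['max', 'mean']})
print(ms2.Max.shares)

# Per-group accumulation, optionally restricted by a boolean filter.
f = FastArray([True, False, True, True, False])
print(gb[['shares']].cumsum(filter=f))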
Example #3
    def test_projections(self):
        num_rows_trade = 1_000_000
        num_symbols = 450
        Trade_Dates = [
            '20180602', '20180603', '20180604', '20180605', '20180606'
        ]
        Exchanges = np.array(['EXCH1', 'EXCH2', 'EXCH3'])
        np.random.seed(1234)
        ds = Dataset({
            'SymbolID': np.random.randint(0, num_symbols, size=num_rows_trade),
            'Exchange': Exchanges[
                np.random.randint(0, Exchanges.shape[0], size=num_rows_trade)],
            'Trade_Date': [
                Trade_Dates[int(i * len(Trade_Dates) / num_rows_trade)]
                for i in range(num_rows_trade)
            ],
            'Time': [
                int(i % (num_rows_trade / len(Trade_Dates)))
                for i in range(num_rows_trade)
            ],
            'Price': 100 * (1.0 + 0.0005 * np.random.randn(num_rows_trade)),
            'Size': 10 * np.array(
                1 + 30 * np.random.rand(num_rows_trade), dtype=np.int64),
        })
        num_rows_quote = 1_000_000
        ds2 = Dataset({
            'SymbolID': np.random.randint(0, num_symbols, size=num_rows_quote),
            'Exchange': Exchanges[
                np.random.randint(0, Exchanges.shape[0], size=num_rows_quote)],
            'Trade_Date': [
                Trade_Dates[int(i * len(Trade_Dates) / num_rows_quote)]
                for i in range(num_rows_quote)
            ],
            'Time': [
                int(i % (num_rows_quote / len(Trade_Dates)))
                for i in range(num_rows_quote)
            ],
            'Bid': 100 * (1.0 - 0.001 + 0.0005 * np.random.randn(num_rows_quote)),
            'Ask': 100 * (1.0 + 0.001 + 0.0005 * np.random.randn(num_rows_quote)),
        })
        threshold = Dataset(
            {'Is_Below_Threshold': np.random.rand(num_rows_quote) < 0.75})
        trade_time = Dataset({'time_2500': (ds.Time / 2500).astype(int)})
        trades = Dataset({}).concat_columns([ds, trade_time], do_copy=False)

        # Create GroupBy and corresponding Categorical
        trade_gb = trades.groupby(
            ['SymbolID', 'Exchange', 'Trade_Date', 'time_2500'])
        trade_cat = Categorical(
            [ds.SymbolID, ds.Exchange, ds.Trade_Date, trade_time.time_2500])

        # Call sum() and count()
        self.assertEqual(trade_gb.sum().shape, (455654, 7))
        self.assertEqual(trade_cat.sum(ds).shape, (455654, 7))
        self.assertEqual(trade_gb.count().shape, (455654, 5))
        # 8/24/2018 SJK - multikey categorical groupby now returns multiple columns for groupby keys
        self.assertEqual(trade_cat.count().shape, (455654, 5))
        b1 = trade_gb.count().Count.mean()
        b1c = trade_cat.count().Count.mean()
        b2 = trade_gb.count().shape[0]
        self.assertAlmostEqual(ds.shape[0], b1 * b2, places=5)
        self.assertAlmostEqual(ds.shape[0], b1c * b2, places=5)

        # Create ds augmented with filtered ID
        trade_ds = Dataset({'ID': trade_gb.grouping.ikey})
        trade_ds_below_threshold = ds * threshold.Is_Below_Threshold
        trade_ds_below_thresholdb = Dataset.concat_columns(
            [trade_ds_below_threshold, trade_ds], do_copy=False)

        # Create trade_ds size projection using GroupBy
        trade_gb_id = trade_ds_below_thresholdb.groupby('ID')
        trade_sizes_ds = trade_gb_id['Size'].sum()
        trade_size_ds = trade_sizes_ds.Size[trade_ds_below_thresholdb.ID - 1]
        self.assertEqual(trade_size_ds.shape[0], ds.shape[0])

        # Create trade_ds size projection using Categorical
        trade_sizes_cat_ds = trade_cat.sum(trade_ds_below_thresholdb.Size)
        trade_size_cat_ds = trade_sizes_cat_ds.Size[trade_cat - 1]
        self.assertArrayAlmostEqual(trade_size_ds, trade_size_cat_ds, places=6)

        # Create trade_ds size projection using Pandas groupby
        ptrade_ds_below_thresholdb = dataset_as_pandas_df(
            trade_ds_below_thresholdb)
        ptrade_gb_id = ptrade_ds_below_thresholdb.groupby('ID')
        trade_sizes_pd_ds = ptrade_gb_id.sum()
        trade_size_pd_ds = trade_sizes_pd_ds.Size.values[ptrade_gb_id.ngroup()]
        self.assertArrayAlmostEqual(trade_size_ds, trade_size_pd_ds, places=6)
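The projection test drives the same multikey reduction two ways: through Dataset.groupby and through a Categorical built from the key columns. The compact sketch below shows that equivalence on toy data; it assumes a Categorical reduction accepts the array to reduce as an argument, as trade_cat.sum(ds) does above.

# Hedged sketch: GroupBy vs. Categorical driving the same reduction
# (Categorical reduction signature assumed from trade_cat.sum(ds) above).
import numpy as np
from riptable import Dataset, Categorical

ds = Dataset({
    'sym':  ['A', 'B', 'A', 'B', 'A'],
    'size': np.array([10, 20, 30, 40, 50]),
})

print(ds.groupby('sym').sum())   # GroupBy reduction
cat = Categorical(ds.sym)
print(cat.sum(ds.size))          # same per-group sums driven by a Categorical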
Example #4
 def test_flatten(self):
     accuracy = 7
     message_types = [
         'NEW',
         'EXECUTE',
         'NEW',
         'EXECUTE',
         'EXECUTE',
         'EXECUTE',
         'EXECUTE',
         'CANCEL',
         'EXECUTE',
         'EXECUTE',
         'EXECUTE',
         'CANCEL',
     ]
     order_ids = [1, 1, 2, 1, 2, 2, 1, 2, 2, 2, 2, 1]
     milliseconds = [50, 70, 72, 75, 90, 88, 95, 97, 98, 115, 116, 120]
     shares = [0, 200, 0, 500, 100, 400, 100, 0, 300, 150, 150, 0]
     dat = Dataset(
         dict(
             message_type=message_types,
             order_id=order_ids,
             millisecond=milliseconds,
             shares=shares,
         ))
     dat = dat[['order_id', 'message_type', 'millisecond', 'shares']]
     gb = dat.groupby('order_id')
     ms1 = gb[['millisecond', 'shares']].aggregate(['sum', 'mean'])
     # Flatten horizontally
     f1 = ms1.flatten()
     self.assertEqual(
         f1.keys(),
         [
             'order_id',
             'Sum_millisecond',
             'Sum_shares',
             'Mean_millisecond',
             'Mean_shares',
         ],
     )
     self.assertTrue((f1.Sum_millisecond == [410, 676]).all(axis=None))
     # Flatten vertically
     f2 = ms1.flatten(horizontal=False)
     self.assertEqual(f2.keys(),
                      ['order_id', 'Column', 'millisecond', 'shares'])
     self.almost_eq(f2.millisecond, [410.0, 676.0, 82.0, 96.57142857],
                    accuracy)
     ms2 = gb[['millisecond', 'shares']].aggregate(['min', 'max'])
     # Flatten multiset containing multisets
     ms3 = Multiset({'ms1': ms1, 'ms2': ms2})
     f3 = ms3.flatten()
     self.assertEqual(
         f3.keys(),
         [
             'order_id',
             'ms1_Sum_millisecond',
             'ms1_Sum_shares',
             'ms1_Mean_millisecond',
             'ms1_Mean_shares',
             'ms2_Min_millisecond',
             'ms2_Min_shares',
             'ms2_Max_millisecond',
             'ms2_Max_shares',
         ],
     )
     self.assertTrue((f3.ms1_Sum_millisecond == [410, 676]).all(axis=None))
     f3v = ms3.flatten(horizontal=False)
     self.assertTrue((f3v.Column == [
         b'ms1_Sum',
         b'ms1_Sum',
         b'ms1_Mean',
         b'ms1_Mean',
         b'ms2_Min',
         b'ms2_Min',
         b'ms2_Max',
         b'ms2_Max',
     ]).all(axis=None))
     # Flatten multiset containing multisets and a dataset
     ds = gb[['millisecond', 'shares']].std()
     ms4 = Multiset({'ms1': ms1, 'ms2': ms2, 'Std': ds})
     f4 = ms4.flatten()
     f4.label_remove()
     self.assertEqual(
         f4.keys(),
         [
             'order_id',
             'ms1_Sum_millisecond',
             'ms1_Sum_shares',
             'ms1_Mean_millisecond',
             'ms1_Mean_shares',
             'ms2_Min_millisecond',
             'ms2_Min_shares',
             'ms2_Max_millisecond',
             'ms2_Max_shares',
             'Std_millisecond',
             'Std_shares',
         ],
     )
     self.assertTrue((f4.ms1_Sum_millisecond == [410, 676]).all(axis=None))
     f4 = ms4.flatten(horizontal=False)
     self.assertAlmostEqual(f4[9, 'millisecond'], 15.4903964, places=5)
     self.assertAlmostEqual(f4[9, 'shares'], 148.4042099, places=5)
     # Flatten multiset containing multisets and a dataset with non-matching column
     ds1 = gb[['shares']].std()
     ms5 = Multiset({'ms1': ms1, 'ms2': ms2, 'Std': ds1})
     f5 = ms5.flatten()
     f5.label_remove()
     self.assertEqual(
         f5.keys(),
         [
             'order_id',
             'ms1_Sum_millisecond',
             'ms1_Sum_shares',
             'ms1_Mean_millisecond',
             'ms1_Mean_shares',
             'ms2_Min_millisecond',
             'ms2_Min_shares',
             'ms2_Max_millisecond',
             'ms2_Max_shares',
             'Std_shares',
         ],
     )
     self.assertTrue((f5.ms1_Sum_millisecond == [410, 676]).all(axis=None))
     with self.assertRaises(ValueError):
         ms5.flatten(horizontal=False)
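Multiset.flatten collapses a nested result into a single Dataset, either horizontally (one column per source column, prefixed with the aggregation name) or vertically (one row per key and function). A small sketch of both directions, assuming the same behavior exercised above:

# Hedged sketch: horizontal vs. vertical flattening of a small multiset.
from riptable import Dataset

dat = Dataset({
    'order_id': [1, 1, 2, 2],
    'shares':   [100, 200, 300, 400],
})
ms = dat.groupby('order_id')[['shares']].aggregate(['sum', 'mean'])

flat_h = ms.flatten()                   # columns like Sum_shares, Mean_shares
flat_v = ms.flatten(horizontal=False)   # one row per (order_id, function) pair
print(flat_h.keys())
print(flat_v.keys())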