def test_ops(self):
    ds = Dataset(
        {
            'test': arange(300000) % 3,
            'test2': arange(300000.0),
            'test2i': arange(300000),
            'test3': arange(300000) % 3,
        }
    )
    gb = ds.groupby('test')
    result = gb.mean()
    self.assertTrue(result.test2[0] == result.test2i[0])
    self.assertTrue(result.test2[1] == result.test2i[1])
    self.assertTrue(result.test3[1] == 1.0)
    result = gb.median()
    result = gb.trimbr()
    result = gb.nanmedian()
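
# Illustrative sketch (not part of the test suite): a plain-numpy cross-check of the
# per-group mean asserted in test_ops above. Only numpy is assumed; the riptable
# Dataset/groupby calls it mirrors are the ones exercised in that test.
def _example_groupby_mean_crosscheck():
    import numpy as np

    keys = np.arange(300000) % 3      # same grouping key as the 'test' column
    values = np.arange(300000.0)      # same values as the 'test2' column
    # Per-group mean computed directly with boolean masks.
    group_means = {k: values[keys == k].mean() for k in np.unique(keys)}
    # gb.mean() reduces each group the same way, which is why the float and
    # integer columns ('test2' and 'test2i') agree group by group in test_ops.
    return group_means
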
def test_reductions(self):
    message_types = [
        'CREATE', 'RUN', 'CREATE', 'RUN', 'RUN', 'RUN',
        'RUN', 'CANCEL', 'RUN', 'RUN', 'RUN', 'CANCEL',
    ]
    order_ids = [1, 1, 2, 1, 2, 2, 1, 2, 2, 2, 2, 1]
    seconds = [50, 70, 72, 75, 90, 88, 95, 97, 98, 115, 116, 120]
    shares = [0, 200, 0, 500, 100, 400, 100, 0, 300, 150, 150, 0]
    d2 = dict(
        message_type=message_types,
        order_id=order_ids,
        second=seconds,
        shares=shares,
    )
    dat = Dataset(d2)
    dat = dat[['order_id', 'message_type', 'second', 'shares']]

    # Numeric reduction
    dsr = dat.groupby('order_id').sum()
    self.assertEqual(dsr.shape, (2, 3))
    self.assertArrayEqual(dsr.order_id, [1, 2])
    self.assertArrayEqual(dsr.second, [410, 676])
    self.assertArrayEqual(dsr.shares, [800, 1100])

    # Numeric reduction with all columns returned
    dsr = dat.groupby('order_id', return_all=True).sum()
    self.assertEqual(dsr.shape, (2, 4))
    self.assertEqual(dsr.keys()[1], 'message_type')

    # Order-based reduction
    dsr = dat.groupby('order_id').first()
    self.assertEqual(dsr.shape, (2, 4))
    self.assertArrayEqual(dsr.order_id, [1, 2])
    self.assertArrayEqual(dsr.message_type, ['CREATE', 'CREATE'])
    self.assertArrayEqual(dsr.second, [50, 72])
    self.assertArrayEqual(dsr.shares, [0, 0])

    # Order-based reduction, which returns all columns regardless
    dsr = dat.groupby('order_id', return_all=True).first()
    self.assertEqual(dsr.shape, (2, 4))

    # Order-based reduction with multiple keys
    dsr = dat.groupby(['order_id', 'message_type']).first()
    self.assertEqual(dsr.shape, (6, 4))
    self.assertArrayEqual(dsr.order_id, [1, 1, 1, 2, 2, 2])
    self.assertArrayEqual(
        dsr.message_type,
        ['CANCEL', 'CREATE', 'RUN', 'CANCEL', 'CREATE', 'RUN'])
    self.assertArrayEqual(dsr.second, [120, 50, 70, 97, 72, 90])
    self.assertArrayEqual(dsr.shares, [0, 0, 200, 0, 0, 100])

    # On a subset of columns
    gb = dat.groupby('order_id')
    dsr = gb['shares'].sum()
    self.assertEqual(dsr.shape, (2, 2))
    self.assertArrayEqual(dsr.shares, [800, 1100])

    # Accumulating function
    dsr = dat.groupby('order_id').cumsum()
    self.assertEqual(dsr.shape, (12, 2))
    self.assertArrayEqual(
        dsr.shares,
        [0, 200, 0, 700, 100, 500, 800, 500, 800, 950, 1100, 800])

    # return_all has no effect with accumulating functions
    # 8/23/2018 SJK - changed behavior so return all shows the keys
    dsr = dat.groupby('order_id', return_all=True).cumsum()
    self.assertEqual(dsr.shape, (12, 3))

    # Add cum_shares back to a dataset
    dat['cum_shares'] = dat.groupby('order_id').shares.cumsum().shares
    self.assertEqual(dat.shape, (12, 5))
    self.assertArrayEqual(dat.cum_shares, gb.shares.cumsum().shares)

    # On a subset of columns
    dsr = dat.groupby('order_id')[['shares', 'second']].cumsum()
    self.assertEqual(dsr.shape, (12, 2))
    self.assertArrayEqual(
        dsr.shares,
        [0, 200, 0, 700, 100, 500, 800, 500, 800, 950, 1100, 800])
    self.assertArrayEqual(
        dsr.second,
        [50, 120, 72, 195, 162, 250, 290, 347, 445, 560, 676, 410])

    # On a subset of columns with a filter
    f = FastArray([
        True, False, True, False, True, False,
        True, False, True, False, True, False,
    ])
    dsr = dat.groupby('order_id')[['shares', 'second']].cumsum(filter=f)
    self.assertEqual(dsr.shape, (12, 2))
    self.assertArrayEqual(
        dsr.shares,
        [0, 0, 0, 0, 100, 100, 100, 100, 400, 400, 550, 100])
    self.assertArrayEqual(
        dsr.second,
        [50, 50, 72, 50, 162, 162, 145, 162, 260, 260, 376, 145])

    # On shares and second with filter at groupby construction
    dsr = dat.groupby('order_id', filter=f)[['shares', 'second']].cumsum()
    inv = INVALID_DICT[dsr.shares[0].dtype.num]
    self.assertEqual(dsr.shape, (12, 2))
    self.assertArrayEqual(
        dsr.shares,
        [0, inv, 0, inv, 100, inv, 100, inv, 400, inv, 550, inv])
    self.assertArrayEqual(
        dsr.second,
        [50, inv, 72, inv, 162, inv, 145, inv, 260, inv, 376, inv])

    # Using agg function
    dsr = gb[['second', 'shares']].agg(['sum', 'mean'])
    self.assertEqual(dsr.shape, (2, 2))
    self.assertArrayEqual(dsr.Sum.second, [410, 676])
    self.assertArrayEqual(dsr.Sum.shares, [800, 1100])
    self.assertArrayAlmostEqual(dsr.Mean.second, [82.00, 96.57], places=2)
    self.assertArrayAlmostEqual(dsr.Mean.shares, [160.00, 157.14], places=2)

    # Check for issue when bracket indexing on groupby
    with open(os.devnull, 'w') as devnull:
        print(gb, file=devnull)
    dsr = gb[['second', 'shares']].agg(['sum', 'mean'])

    # Using different functions on different columns
    dsr = gb.agg({'second': 'sum', 'shares': ['max', 'mean']})
    self.assertEqual(dsr.shape, (2, 3))
    self.assertArrayEqual(dsr.Sum.second, [410, 676])
    self.assertArrayEqual(dsr.Max.shares, [500, 400])
    self.assertArrayAlmostEqual(dsr.Mean.shares, [160.00, 157.14], places=2)

    # Using numpy functions
    dsr = gb.agg({'second': np.sum, 'shares': [np.max, np.mean]})
    self.assertEqual(dsr.shape, (2, 3))
    self.assertArrayEqual(dsr.Sum.second, [410, 676])
    self.assertArrayEqual(dsr.Max.shares, [500, 400])
    self.assertArrayAlmostEqual(dsr.Mean.shares, [160.00, 157.14], places=2)

    # Alternate way to add to multiset
    gb = dat.groupby('order_id')
    ms = gb[['shares']].agg(['max', 'mean'])
    ms.Sum = gb[['second']].sum()
    self.assertEqual(ms.shape, (2, 3))
    self.assertArrayEqual(ms.Sum.second, [410, 676])
    self.assertArrayEqual(ms.Max.shares, [500, 400])
    self.assertArrayAlmostEqual(ms.Mean.shares, [160.00, 157.14], places=2)
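
# Illustrative sketch (not part of the test suite): the per-group sum and running total
# asserted in test_reductions above, expressed with pandas for cross-reference. Assumes
# pandas is available (the projections test below already converts to pandas).
def _example_reduction_crosscheck():
    import pandas as pd

    df = pd.DataFrame({
        'order_id': [1, 1, 2, 1, 2, 2, 1, 2, 2, 2, 2, 1],
        'second':   [50, 70, 72, 75, 90, 88, 95, 97, 98, 115, 116, 120],
        'shares':   [0, 200, 0, 500, 100, 400, 100, 0, 300, 150, 150, 0],
    })
    # Per-group sums match the [410, 676] / [800, 1100] expectations above.
    sums = df.groupby('order_id')[['second', 'shares']].sum()
    assert list(sums.loc[1]) == [410, 800]
    assert list(sums.loc[2]) == [676, 1100]
    # Per-group running totals, returned in the original row order, as riptable's
    # cumsum does above.
    running = df.groupby('order_id')['shares'].cumsum()
    assert list(running) == [0, 200, 0, 700, 100, 500, 800, 500, 800, 950, 1100, 800]
    return sums
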
def test_projections(self):
    num_rows_trade = 1_000_000
    num_symbols = 450
    Trade_Dates = ['20180602', '20180603', '20180604', '20180605', '20180606']
    Exchanges = np.array(['EXCH1', 'EXCH2', 'EXCH3'])
    np.random.seed(1234)
    ds = Dataset({
        'SymbolID': np.random.randint(0, num_symbols, size=num_rows_trade),
        'Exchange': Exchanges[np.random.randint(0, Exchanges.shape[0], size=num_rows_trade)],
        'Trade_Date': [
            Trade_Dates[int(i * len(Trade_Dates) / num_rows_trade)]
            for i in range(num_rows_trade)
        ],
        'Time': [
            int(i % (num_rows_trade / len(Trade_Dates)))
            for i in range(num_rows_trade)
        ],
        'Price': 100 * (1.0 + 0.0005 * np.random.randn(num_rows_trade)),
        'Size': 10 * np.array(1 + 30 * np.random.rand(num_rows_trade), dtype=np.int64),
    })
    num_rows_quote = 1_000_000
    ds2 = Dataset({
        'SymbolID': np.random.randint(0, num_symbols, size=num_rows_quote),
        'Exchange': Exchanges[np.random.randint(0, Exchanges.shape[0], size=num_rows_quote)],
        'Trade_Date': [
            Trade_Dates[int(i * len(Trade_Dates) / num_rows_quote)]
            for i in range(num_rows_quote)
        ],
        'Time': [
            int(i % (num_rows_quote / len(Trade_Dates)))
            for i in range(num_rows_quote)
        ],
        'Bid': 100 * (1.0 - 0.001 + 0.0005 * np.random.randn(num_rows_quote)),
        'Ask': 100 * (1.0 + 0.001 + 0.0005 * np.random.randn(num_rows_quote)),
    })
    threshold = Dataset(
        {'Is_Below_Threshold': np.random.rand(num_rows_quote) < 0.75})
    trade_time = Dataset({'time_2500': (ds.Time / 2500).astype(int)})
    trades = Dataset({}).concat_columns([ds, trade_time], do_copy=False)

    # Create GroupBy and corresponding Categorical
    trade_gb = trades.groupby(
        ['SymbolID', 'Exchange', 'Trade_Date', 'time_2500'])
    trade_cat = Categorical(
        [ds.SymbolID, ds.Exchange, ds.Trade_Date, trade_time.time_2500])

    # Call sum() and count()
    self.assertEqual(trade_gb.sum().shape, (455654, 7))
    self.assertEqual(trade_cat.sum(ds).shape, (455654, 7))
    self.assertEqual(trade_gb.count().shape, (455654, 5))
    # 8/24/2018 SJK - multikey categorical groupby now returns multiple columns for groupby keys
    self.assertEqual(trade_cat.count().shape, (455654, 5))
    b1 = trade_gb.count().Count.mean()
    b1c = trade_cat.count().Count.mean()
    b2 = trade_gb.count().shape[0]
    self.assertAlmostEqual(ds.shape[0], b1 * b2, places=5)
    self.assertAlmostEqual(ds.shape[0], b1c * b2, places=5)

    # Create ds augmented with filtered ID
    trade_ds = Dataset({'ID': trade_gb.grouping.ikey})
    trade_ds_below_threshold = ds * threshold.Is_Below_Threshold
    trade_ds_below_thresholdb = Dataset.concat_columns(
        [trade_ds_below_threshold, trade_ds], do_copy=False)

    # Create trade_ds size projection using GroupBy
    trade_gb_id = trade_ds_below_thresholdb.groupby('ID')
    trade_sizes_ds = trade_gb_id['Size'].sum()
    trade_size_ds = trade_sizes_ds.Size[trade_ds_below_thresholdb.ID - 1]
    self.assertEqual(trade_size_ds.shape[0], ds.shape[0])

    # Create trade_ds size projection using Categorical
    trade_sizes_cat_ds = trade_cat.sum(trade_ds_below_thresholdb.Size)
    trade_size_cat_ds = trade_sizes_cat_ds.Size[trade_cat - 1]
    self.assertArrayAlmostEqual(trade_size_ds, trade_size_cat_ds, places=6)

    # Create trade_ds size projection using Pandas groupby
    ptrade_ds_below_thresholdb = dataset_as_pandas_df(trade_ds_below_thresholdb)
    ptrade_gb_id = ptrade_ds_below_thresholdb.groupby('ID')
    trade_sizes_pd_ds = ptrade_gb_id.sum()
    trade_size_pd_ds = trade_sizes_pd_ds.Size.values[ptrade_gb_id.ngroup()]
    self.assertArrayAlmostEqual(trade_size_ds, trade_size_pd_ds, places=6)
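
# Illustrative sketch (not part of the test suite): the per-row "projection" idea from
# test_projections, reduced to plain numpy on hypothetical data. Sizes are summed once
# per group and then broadcast back to every row via an integer group id, which is what
# the `trade_sizes_ds.Size[trade_ds_below_thresholdb.ID - 1]` indexing above does with
# riptable's 1-based ikey.
def _example_projection_crosscheck():
    import numpy as np

    group_id = np.array([0, 1, 0, 2, 1, 0])   # 0-based group id per row (made-up data)
    size = np.array([10, 20, 30, 40, 50, 60])

    # Sum sizes within each group...
    group_sums = np.zeros(group_id.max() + 1, dtype=size.dtype)
    np.add.at(group_sums, group_id, size)
    # ...then project each group's total back onto its rows by fancy indexing.
    per_row_total = group_sums[group_id]
    assert list(per_row_total) == [100, 70, 100, 40, 70, 100]
    return per_row_total
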
def test_flatten(self):
    accuracy = 7
    message_types = [
        'NEW', 'EXECUTE', 'NEW', 'EXECUTE', 'EXECUTE', 'EXECUTE',
        'EXECUTE', 'CANCEL', 'EXECUTE', 'EXECUTE', 'EXECUTE', 'CANCEL',
    ]
    order_ids = [1, 1, 2, 1, 2, 2, 1, 2, 2, 2, 2, 1]
    milliseconds = [50, 70, 72, 75, 90, 88, 95, 97, 98, 115, 116, 120]
    shares = [0, 200, 0, 500, 100, 400, 100, 0, 300, 150, 150, 0]
    dat = Dataset(
        dict(
            message_type=message_types,
            order_id=order_ids,
            millisecond=milliseconds,
            shares=shares,
        ))
    dat = dat[['order_id', 'message_type', 'millisecond', 'shares']]
    gb = dat.groupby('order_id')
    ms1 = gb[['millisecond', 'shares']].aggregate(['sum', 'mean'])

    # Flatten horizontally
    f1 = ms1.flatten()
    self.assertEqual(
        f1.keys(),
        [
            'order_id',
            'Sum_millisecond',
            'Sum_shares',
            'Mean_millisecond',
            'Mean_shares',
        ],
    )
    self.assertTrue((f1.Sum_millisecond == [410, 676]).all(axis=None))

    # Flatten vertically
    f2 = ms1.flatten(horizontal=False)
    self.assertEqual(f2.keys(), ['order_id', 'Column', 'millisecond', 'shares'])
    self.almost_eq(f2.millisecond, [410.0, 676.0, 82.0, 96.57142857], accuracy)

    ms2 = gb[['millisecond', 'shares']].aggregate(['min', 'max'])

    # Flatten multiset containing multisets
    ms3 = Multiset({'ms1': ms1, 'ms2': ms2})
    f3 = ms3.flatten()
    self.assertEqual(
        f3.keys(),
        [
            'order_id',
            'ms1_Sum_millisecond',
            'ms1_Sum_shares',
            'ms1_Mean_millisecond',
            'ms1_Mean_shares',
            'ms2_Min_millisecond',
            'ms2_Min_shares',
            'ms2_Max_millisecond',
            'ms2_Max_shares',
        ],
    )
    self.assertTrue((f3.ms1_Sum_millisecond == [410, 676]).all(axis=None))
    f3v = ms3.flatten(horizontal=False)
    self.assertTrue((f3v.Column == [
        b'ms1_Sum', b'ms1_Sum', b'ms1_Mean', b'ms1_Mean',
        b'ms2_Min', b'ms2_Min', b'ms2_Max', b'ms2_Max',
    ]).all(axis=None))

    # Flatten multiset containing multisets and a dataset
    ds = gb[['millisecond', 'shares']].std()
    ms4 = Multiset({'ms1': ms1, 'ms2': ms2, 'Std': ds})
    f4 = ms4.flatten()
    f4.label_remove()
    self.assertEqual(
        f4.keys(),
        [
            'order_id',
            'ms1_Sum_millisecond',
            'ms1_Sum_shares',
            'ms1_Mean_millisecond',
            'ms1_Mean_shares',
            'ms2_Min_millisecond',
            'ms2_Min_shares',
            'ms2_Max_millisecond',
            'ms2_Max_shares',
            'Std_millisecond',
            'Std_shares',
        ],
    )
    self.assertTrue((f4.ms1_Sum_millisecond == [410, 676]).all(axis=None))
    f4 = ms4.flatten(horizontal=False)
    self.assertAlmostEqual(f4[9, 'millisecond'], 15.4903964, places=5)
    self.assertAlmostEqual(f4[9, 'shares'], 148.4042099, places=5)

    # Flatten multiset containing multisets and a dataset with non-matching column
    ds1 = gb[['shares']].std()
    ms5 = Multiset({'ms1': ms1, 'ms2': ms2, 'Std': ds1})
    f5 = ms5.flatten()
    f5.label_remove()
    self.assertEqual(
        f5.keys(),
        [
            'order_id',
            'ms1_Sum_millisecond',
            'ms1_Sum_shares',
            'ms1_Mean_millisecond',
            'ms1_Mean_shares',
            'ms2_Min_millisecond',
            'ms2_Min_shares',
            'ms2_Max_millisecond',
            'ms2_Max_shares',
            'Std_shares',
        ],
    )
    self.assertTrue((f5.ms1_Sum_millisecond == [410, 676]).all(axis=None))
    with self.assertRaises(ValueError):
        ms5.flatten(horizontal=False)
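
# Illustrative sketch (not part of the test suite): the column-naming convention that the
# horizontal flatten() assertions above rely on, mimicked with plain dicts. This is a
# hypothetical helper; Multiset.flatten() itself is what test_flatten exercises.
def _example_flatten_naming():
    nested = {
        'Sum': {'millisecond': [410, 676], 'shares': [800, 1100]},
        'Mean': {'millisecond': [82.0, 96.57], 'shares': [160.0, 157.14]},
    }
    # Horizontal flatten prefixes each inner column with the name of the dataset it
    # came from, e.g. 'Sum_millisecond'; nested multisets stack prefixes the same way
    # ('ms1_Sum_millisecond').
    flat = {f'{outer}_{col}': vals
            for outer, inner in nested.items()
            for col, vals in inner.items()}
    assert list(flat) == ['Sum_millisecond', 'Sum_shares', 'Mean_millisecond', 'Mean_shares']
    return flat
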