def test_specify_gb_data(self):
    """Smoke-test the ways data can be handed to ``Categorical.sum``.

    The same two integer columns are passed as a Dataset slice, a list,
    a tuple, a dict, and finally as separate positional arguments.
    Nothing is asserted about the results; each call merely has to
    complete without raising.
    """
    keys = ['a', 'a', 'b', 'c', 'a']
    values = [10, 10, 20, 30, 10]
    first = np.arange(5)
    second = np.arange(5)

    columns = {
        'str_col': keys,
        'num_col': values,
        'col1': first,
        'col2': second,
    }
    full_ds = Dataset(columns)
    data_ds = full_ds[['col1', 'col2']]
    cat = Categorical(keys)

    # Container forms, in order: dataset, list, tuple, dict.
    # (The "single" array form has no call in this test.)
    container_inputs = (
        data_ds,
        [first, second],
        (first, second),
        {'a': first, 'b': second},
    )
    for payload in container_inputs:
        result = cat.sum(payload)

    # multiple positional arrays
    result = cat.sum(first, second)
def test_pre_filter(self):
    """Build a Categorical with a ``filter`` argument and check the outcome.

    Two things are verified:

    * the private ``_filter`` attribute is ``None`` after construction;
    * summing ``ds_nums`` through the categorical yields a ``tens``
      column whose values add up to 150.

    ``str_fa``, ``even_filter`` and ``ds_nums`` are fixtures defined
    outside this method.
    """
    c = Categorical(str_fa, filter=even_filter)

    # Identity comparison: ``== None`` defers to the operand's __eq__,
    # which for array-like objects may compare elementwise instead of
    # answering "is this None?".
    assert c._filter is None

    result = c.sum(ds_nums)
    one_fifty = sum(result.tens)
    assert one_fifty == 150
def test_as_categorical(self):
    """Compare ``gbu().sum()`` with two Categorical-based sums.

    A 30-row Dataset with two random key columns and one random data
    column is grouped by ``keycol1`` three ways: through ``ds.gbu``,
    through a directly constructed unordered/unsorted ``Categorical``,
    and through ``gbu.as_categorical()``. Every column of the ``gbu``
    result must match the same-named column of both Categorical results.
    """
    # Dict entries are evaluated in order, so the random draws happen as
    # keycol1, keycol2, data.
    columns = {
        'keycol1': np.random.choice(['a', 'b', 'c'], 30),
        'keycol2': np.random.choice(['a', 'b', 'c'], 30),
        'data': np.random.rand(30),
    }
    ds = Dataset(columns)

    grouped = ds.gbu('keycol1')
    direct_cat = Categorical(ds.keycol1, ordered=False, sort_gb=False)
    converted_cat = grouped.as_categorical()

    expected = grouped.sum()
    from_direct = direct_cat.sum(ds.data)
    from_converted = converted_cat.sum(ds.data)

    for name, col in expected.items():
        for candidate in (from_direct, from_converted):
            assert bool(np.all(candidate[name] == col))
def test_projections(self):
    """Check GroupBy and multikey Categorical agree on a large seeded dataset.

    Builds a one-million-row synthetic trade Dataset (seeded with 1234),
    groups it by four keys through both ``groupby`` and a multikey
    ``Categorical``, and asserts that:

    * ``sum()`` / ``count()`` results have the same hard-coded shapes;
    * mean group count times number of groups equals the row count;
    * a per-row "projection" of the summed ``Size`` column is the same
      whether computed via GroupBy, Categorical, or pandas ``groupby``.

    NOTE(review): the expected shapes are literals tied to the random
    stream produced after ``np.random.seed(1234)``; changing the order or
    number of ``np.random`` calls below will likely invalidate them.
    """
    num_rows_trade = 1_000_000
    num_symbols = 450
    Trade_Dates = [
        '20180602', '20180603', '20180604', '20180605', '20180606'
    ]
    Exchanges = np.array(['EXCH1', 'EXCH2', 'EXCH3'])
    # Fix the global NumPy random stream so the hard-coded shapes below
    # are reproducible.
    np.random.seed(1234)
    ds = Dataset({
        'SymbolID':
        np.random.randint(0, num_symbols, size=num_rows_trade),
        'Exchange':
        Exchanges[np.random.randint(0,
                                    Exchanges.shape[0],
                                    size=num_rows_trade)],
        # Rows are split into len(Trade_Dates) equal consecutive runs,
        # one per date.
        'Trade_Date': [
            Trade_Dates[int(i * len(Trade_Dates) / num_rows_trade)]
            for i in range(num_rows_trade)
        ],
        # Row index modulo the per-date run length, i.e. a counter that
        # restarts at 0 for each date.
        'Time': [
            int(i % (num_rows_trade / len(Trade_Dates)))
            for i in range(num_rows_trade)
        ],
        'Price':
        100 * (1.0 + 0.0005 * np.random.randn(num_rows_trade)),
        'Size':
        10 * np.array(1 + 30 * np.random.rand(num_rows_trade), dtype=np.int64),
    })
    num_rows_quote = 1_000_000
    # ds2 is not referenced again in this test. Its construction still
    # draws from np.random, so it sits between ds and `threshold` in the
    # random stream.
    ds2 = Dataset({
        'SymbolID':
        np.random.randint(0, num_symbols, size=num_rows_quote),
        'Exchange':
        Exchanges[np.random.randint(0,
                                    Exchanges.shape[0],
                                    size=num_rows_quote)],
        'Trade_Date': [
            Trade_Dates[int(i * len(Trade_Dates) / num_rows_quote)]
            for i in range(num_rows_quote)
        ],
        'Time': [
            int(i % (num_rows_quote / len(Trade_Dates)))
            for i in range(num_rows_quote)
        ],
        'Bid':
        100 * (1.0 - 0.001 + 0.0005 * np.random.randn(num_rows_quote)),
        'Ask':
        100 * (1.0 + 0.001 + 0.0005 * np.random.randn(num_rows_quote)),
    })
    # Boolean mask column; the name is spelled 'Thresdhold' (sic) and is
    # referenced with that same spelling further down.
    threshold = Dataset(
        {'Is_Below_Thresdhold': np.random.rand(num_rows_quote) < 0.75})
    # Bucket Time into integer bins of width 2500.
    trade_time = Dataset({'time_2500': (ds.Time / 2500).astype(int)})
    trades = Dataset({}).concat_columns([ds, trade_time], do_copy=False)

    # Create GroupBy and corresponding Categorical
    trade_gb = trades.groupby(
        ['SymbolID', 'Exchange', 'Trade_Date', 'time_2500'])
    trade_cat = Categorical(
        [ds.SymbolID, ds.Exchange, ds.Trade_Date, trade_time.time_2500])

    # Call sum() and count()
    self.assertEqual(trade_gb.sum().shape, (455654, 7))
    self.assertEqual(trade_cat.sum(ds).shape, (455654, 7))
    self.assertEqual(trade_gb.count().shape, (455654, 5))
    # 8/24/2018 SJK - multikey categorical groupby now returns multiple columns for groupby keys
    self.assertEqual(trade_cat.count().shape, (455654, 5))
    # Mean of the per-group counts times the number of groups must give
    # back the total number of rows.
    b1 = trade_gb.count().Count.mean()
    b1c = trade_cat.count().Count.mean()
    b2 = trade_gb.count().shape[0]
    self.assertAlmostEqual(ds.shape[0], b1 * b2, places=5)
    self.assertAlmostEqual(ds.shape[0], b1c * b2, places=5)

    # Create ds augmented with filtered ID
    trade_ds = Dataset({'ID': trade_gb.grouping.ikey})
    # NOTE(review): multiplies the whole Dataset by the boolean mask;
    # presumably this zeroes rows where the mask is False -- confirm how
    # non-numeric columns are treated.
    trade_ds_below_threshold = ds * threshold.Is_Below_Thresdhold
    trade_ds_below_thresholdb = Dataset.concat_columns(
        [trade_ds_below_threshold, trade_ds], do_copy=False)

    # Create trade_ds size projection using GroupBy
    trade_gb_id = trade_ds_below_thresholdb.groupby('ID')
    trade_sizes_ds = trade_gb_id['Size'].sum()
    # ID - 1 is used as a positional index into the per-group sums;
    # presumably ikey is 1-based -- TODO confirm.
    trade_size_ds = trade_sizes_ds.Size[trade_ds_below_thresholdb.ID - 1]
    self.assertEqual(trade_size_ds.shape[0], ds.shape[0])

    # Create trade_ds size projection using Categorical
    trade_sizes_cat_ds = trade_cat.sum(trade_ds_below_thresholdb.Size)
    trade_size_cat_ds = trade_sizes_cat_ds.Size[trade_cat - 1]
    # assertArrayAlmostEqual is not defined in this method; it is
    # expected to come from the enclosing test class.
    self.assertArrayAlmostEqual(trade_size_ds, trade_size_cat_ds, places=6)

    # Create trade_ds size projection using Pandas groupby
    ptrade_ds_below_thresholdb = dataset_as_pandas_df(
        trade_ds_below_thresholdb)
    ptrade_gb_id = ptrade_ds_below_thresholdb.groupby('ID')
    trade_sizes_pd_ds = ptrade_gb_id.sum()
    # ngroup() supplies each row's group number, used here to index the
    # per-group sums back out to one value per row.
    trade_size_pd_ds = trade_sizes_pd_ds.Size.values[ptrade_gb_id.ngroup()]
    self.assertArrayAlmostEqual(trade_size_ds, trade_size_pd_ds, places=6)