def test_tick_bars(self):
    """
    Test the tick bars implementation.

    Checks that:
      * different batch sizes produce identical bars,
      * the ``to_csv`` output round-trips through ``pd.read_csv``,
      * the first bar's OHLC values match known reference values.
    """
    threshold = 10

    db1 = ds.get_tick_bars(self.path, threshold=threshold, batch_size=1000, verbose=False)
    db2 = ds.get_tick_bars(self.path, threshold=threshold, batch_size=50, verbose=False)
    db3 = ds.get_tick_bars(self.path, threshold=threshold, batch_size=10, verbose=False)
    ds.get_tick_bars(self.path, threshold=threshold, batch_size=50, verbose=False,
                     to_csv=True, output_path='test.csv')

    # try/finally so the generated csv is removed even when an assertion
    # fails; os.remove still raises (failing the test) if the file was
    # never generated.
    try:
        db4 = pd.read_csv('test.csv')

        # Assert diff batch sizes have same number of bars
        self.assertTrue(db1.shape == db2.shape)
        self.assertTrue(db1.shape == db3.shape)
        self.assertTrue(db4.shape == db1.shape)

        # Assert same values
        self.assertTrue(np.all(db1.values == db2.values))
        self.assertTrue(np.all(db1.values == db3.values))
        self.assertTrue(np.all(db4.values == db1.values))

        # Assert OHLC is correct
        self.assertTrue(db1.loc[0, 'open'] == 1205)
        self.assertTrue(db1.loc[0, 'high'] == 1904.75)
        self.assertTrue(db1.loc[0, 'low'] == 1005.0)
        self.assertTrue(db1.loc[0, 'close'] == 1304.50)
    finally:
        # delete generated csv file (if it wasn't generated test would fail)
        os.remove('test.csv')
def _generateTickBars(self, endDate, threshold):
    """
    Build tick-bar representations for every asset in the portfolio and
    store them in ``self.ALTERNATIVE_BARS`` keyed by asset symbol.

    Reads raw bid/ask tick CSVs, computes a mid price, samples tick bars
    via ``standard_data_structures.get_tick_bars`` and appends log returns.
    """
    self.ALTERNATIVE_BARS = {}
    home = os.path.expandvars('${HOME}')

    for asset in self.PORTFOLIO._portfolioDict:
        # Tick bars need raw ticks in the CSV — no other aggregation.
        # The timestamp must stay a regular column: using it as the
        # index makes the bar generator error out.
        csv_path = (f'{home}/Desktop/quant-research-env/RegimeAnalysisContentSeries/'
                    f'Data/Data_Ticks/{asset}_BID_ASK_{endDate}.csv')
        ticks = pd.read_csv(csv_path)

        # Mid price = (bid + ask) / 2, rounded to 5 decimals.
        ticks[f'{asset}_mid_price'] = round(
            (ticks[f'{asset}_bid_price'] + ticks[f'{asset}_ask_price']) / 2, 5)

        # Keep only the columns the bar generator expects: timestamp,
        # price and a size column.
        ticks = ticks[[f'{asset}_timestamp',
                       f'{asset}_mid_price',
                       f'{asset}_ask_size']]
        print(ticks.head())

        # Sample the tick bars.
        bars = standard_data_structures.get_tick_bars(
            ticks, threshold=threshold, batch_size=100000, verbose=False)

        # Log returns on bar closes; the first row is NaN and is dropped.
        bars['Returns'] = np.log(bars.close / bars.close.shift(1))
        bars.dropna(how='any', inplace=True)

        print(f'TICK BARS for: {asset} >> Shape: {bars.shape}')
        print(bars.head())

        self.ALTERNATIVE_BARS[asset] = bars
def test_tick_bars(self):
    """
    Test the tick bars implementation.

    Checks that:
      * different batch sizes produce identical bars,
      * the ``to_csv`` output round-trips through ``pd.read_csv``,
      * the first bar's OHLC values match known reference values,
      * a constant dynamic threshold (pd.Series) reproduces the scalar
        threshold result, and varying thresholds change the bar count.
    """
    threshold = 10

    # Creating a dynamic threshold
    data = pd.read_csv(self.path)
    data.index = pd.to_datetime(data['Date and Time'])
    data = data.drop('Date and Time', axis=1)
    t_constant = pd.Series([10], index=[data.index[0]])
    t_dynamic = pd.Series([2, 5, 10],
                          index=[data.index[0], data.index[40], data.index[80]])
    t_low = pd.Series([2], index=[data.index[0]])

    db1 = ds.get_tick_bars(self.path, threshold=threshold, batch_size=1000, verbose=False)
    db2 = ds.get_tick_bars(self.path, threshold=threshold, batch_size=50, verbose=False)
    db3 = ds.get_tick_bars(self.path, threshold=threshold, batch_size=10, verbose=False)
    ds.get_tick_bars(self.path, threshold=threshold, batch_size=50, verbose=False,
                     to_csv=True, output_path='test.csv')

    # try/finally so the generated csv is removed even when an assertion
    # fails; os.remove still raises (failing the test) if the file was
    # never generated.
    try:
        db4 = pd.read_csv('test.csv', parse_dates=[0])

        # Assert diff batch sizes have same number of bars
        self.assertTrue(db1.shape == db2.shape)
        self.assertTrue(db1.shape == db3.shape)
        self.assertTrue(db4.shape == db1.shape)

        # Assert same values
        self.assertTrue(np.all(db1.values == db2.values))
        self.assertTrue(np.all(db1.values == db3.values))
        self.assertTrue(np.all(db4.values == db1.values))

        # Assert OHLC is correct
        self.assertTrue(db1.loc[0, 'open'] == 1205)
        self.assertTrue(db1.loc[0, 'high'] == 1904.75)
        self.assertTrue(db1.loc[0, 'low'] == 1005.0)
        self.assertTrue(db1.loc[0, 'close'] == 1304.50)

        # Testing dynamic threshold size
        df_constant = ds.get_tick_bars(self.path, threshold=t_constant,
                                       batch_size=1000, verbose=False)
        df_dynamic = ds.get_tick_bars(self.path, threshold=t_dynamic,
                                      batch_size=1000, verbose=False)
        df_low = ds.get_tick_bars(self.path, threshold=t_low,
                                  batch_size=1000, verbose=False)

        # Assert that constant size outputs the same result
        self.assertTrue(df_constant.shape == db1.shape)
        self.assertTrue(np.all(df_constant.values == db1.values))

        # Assert sizes of different thresolds
        self.assertTrue(df_dynamic.shape == (28, 10))
        self.assertTrue(df_low.shape == (50, 10))
    finally:
        # delete generated csv file (if it wasn't generated test would fail)
        os.remove('test.csv')
def _generateDARWINTickBars(self, threshold):
    """
    Generate tick-bar representations for each DARWIN asset in the
    portfolio and store them in ``self.ALTERNATIVE_BARS`` keyed by symbol.

    Reads each asset's quote CSV, synthesizes a fake volume column
    (quotes carry no volume), samples tick bars and appends log returns.
    """
    # Generate tick bar representations:
    self.ALTERNATIVE_BARS = {}
    homeStr = os.path.expandvars('${HOME}')

    # Loop for all the assets:
    for eachAssetName in self.PORTFOLIO._portfolioDict:
        # Tick Bars > We need to have ticks in the CSV no other form of
        # aggregation. The timestamp doesn't need to be the index > if it
        # is an index it gives an error.
        READ_PATH = (f'{homeStr}/Desktop/quant-research-env/DARWINStrategyContentSeries/'
                     f'Data/{eachAssetName}_former_Quotes.csv')

        # Read the data. NOTE: `infer_datetime_format` was dropped — it is
        # deprecated since pandas 2.0 (strict inference is the default now)
        # and parsing results are unchanged.
        bars = pd.read_csv(READ_PATH, index_col=0, parse_dates=True)

        # Generate the fake volume column and take the index out:
        bars['volume_fake'] = bars.quote * 100
        bars.reset_index(inplace=True)
        print(bars.head())

        # Get the suitable columns:
        bars = bars[['timestamp', 'quote', 'volume_fake']]

        # Generate the tick bars.
        bars = standard_data_structures.get_tick_bars(
            bars, threshold=threshold, batch_size=100000, verbose=False)

        # Get log returns for these bars; the first row is NaN and dropped.
        bars['Returns'] = np.log(bars.close / bars.close.shift(1))
        bars.dropna(how='any', inplace=True)

        print(f'TICK BARS for: {eachAssetName} >> Shape: {bars.shape}')
        print(bars.head())

        # Add them to the dict based on their symbol:
        self.ALTERNATIVE_BARS[eachAssetName] = bars
def test_tick_bars_add_features(self):
    """
    Tests the additional features functionality with tick bars.
    """
    # Arrange: two reciprocal features computed over each bar's raw ticks.
    threshold = 10
    extra_features = [
        BarFeature(
            name='high_over_low',
            function=lambda df: df['Price'].max() / df['Price'].min()
        ),
        BarFeature(
            name='low_over_high',
            function=lambda df: df['Price'].min() / df['Price'].max()
        ),
    ]

    # Act
    bars = ds.get_tick_bars(self.path, threshold=threshold, batch_size=1000,
                            verbose=False, additional_features=extra_features)

    # Assert: each feature column must equal the ratio of the bar's
    # high/low columns, since both derive from the same tick prices.
    self.assertTrue(np.all(bars['high_over_low'] == bars['high'] / bars['low']))
    self.assertTrue(np.all(bars['low_over_high'] == bars['low'] / bars['high']))
# --- Time bars: fixed 15-minute calendar buckets over the raw ticks. ---
# NOTE(review): assumes `data` (defined elsewhere) has columns
# date_time / price / volume / dollar_value — confirm against the loader.
grouper = data.groupby(pd.Grouper(key="date_time", freq="15min"))
time_bars = grouper.aggregate(
    open=pd.NamedAgg(column="price", aggfunc="first"),
    close=pd.NamedAgg(column="price", aggfunc="last"),
    cum_buy_volume=pd.NamedAgg(column="volume", aggfunc="sum"),
    cum_dollar_value=pd.NamedAgg(column="dollar_value", aggfunc="sum"),
)
# Drop empty buckets instead of forward-filling closes:
time_bars = time_bars[time_bars["cum_buy_volume"] > 0].copy()
# NO!: time_bars["close"] = time_bars["close"].fillna(method="ffill")
time_bars["returns"] = np.log(time_bars["close"]).diff()
# ~0.34%
print(f"Date bars {len(time_bars)}: {100*len(time_bars) / len(data)}%")

# --- Tick Bars: one bar per fixed number of ticks. ---
tick_bars = standard_data_structures.get_tick_bars(data, threshold=10000)  # ticks
tick_bars["returns"] = np.log(tick_bars["close"]).diff()
print(f"Tick bars {len(tick_bars)}: {100*len(tick_bars) / len(data)}%")

# --- Dollar Bars: one bar per fixed dollar value traded. ---
# Threshold = 1/50 of the avg daily dollar value.
dollar_threshold = (data.groupby(pd.Grouper(
    key="date_time", freq="1D")).sum()["dollar_value"].mean())
dollar_threshold /= 50
dollar_threshold = int(dollar_threshold)
dollar_bars = standard_data_structures.get_dollar_bars(
    data, threshold=dollar_threshold, verbose=True)
dollar_bars["returns"] = np.log(dollar_bars["close"]).diff()
print(f"Dollar bars {len(dollar_bars)}: {100*len(dollar_bars) / len(data)}%")