def test_df_as_input(self):
    """
    Checks that dollar bars built from a csv path, from a csv dump on disk,
    and from an in-memory DataFrame all produce identical output.
    """
    threshold = 100000

    # Same tick data loaded into a DataFrame with parsed timestamps.
    frame = pd.read_csv(self.path)
    frame['Date and Time'] = pd.to_datetime(frame['Date and Time'])

    bars_from_path = ds.get_dollar_bars(self.path, threshold=threshold, batch_size=1000, verbose=False)

    # Round-trip through a csv dump on disk.
    ds.get_dollar_bars(self.path, threshold=threshold, batch_size=50, verbose=False,
                       to_csv=True, output_path='test.csv')
    bars_from_csv = pd.read_csv('test.csv')
    bars_from_csv['date_time'] = pd.to_datetime(bars_from_csv.date_time)

    bars_from_frame = ds.get_dollar_bars(frame, threshold=threshold, batch_size=10, verbose=False)

    # Same number of bars regardless of input form / batch size.
    self.assertTrue(bars_from_path.shape == bars_from_csv.shape)
    self.assertTrue(bars_from_path.shape == bars_from_frame.shape)

    # Identical cell values across all three constructions.
    self.assertTrue(np.all(bars_from_path.values == bars_from_csv.values))
    self.assertTrue(np.all(bars_from_path.values == bars_from_frame.values))
def test_dollar_bars(self):
    """
    Dollar bars: identical results across batch sizes and a csv round-trip,
    plus a spot check of the first bar's OHLC values.
    """
    threshold = 100000

    db1 = ds.get_dollar_bars(self.path, threshold=threshold, batch_size=1000, verbose=False)
    db2 = ds.get_dollar_bars(self.path, threshold=threshold, batch_size=50, verbose=False)
    db3 = ds.get_dollar_bars(self.path, threshold=threshold, batch_size=10, verbose=False)
    ds.get_dollar_bars(self.path, threshold=threshold, batch_size=50, verbose=False,
                       to_csv=True, output_path='test.csv')
    db4 = pd.read_csv('test.csv')

    # Same bar count and identical values regardless of batch size / csv round-trip.
    for candidate in (db2, db3, db4):
        self.assertTrue(candidate.shape == db1.shape)
        self.assertTrue(np.all(candidate.values == db1.values))

    # First bar OHLC spot check against known fixture values.
    expected_ohlc = {'open': 1205, 'high': 1904.75, 'low': 1005.0, 'close': 1304.5}
    for column, expected in expected_ohlc.items():
        self.assertTrue(db1.loc[0, column] == expected)

    # Clean up; a missing file here means generation failed.
    os.remove('test.csv')
def test_multiple_csv_file_input(self):
    """
    Splitting the tick data across two csv files must yield the same dollar
    bars as a single file, regardless of batch size.
    """
    threshold = 100000

    # Split the fixture into two halves and write them to temporary csv files.
    ticks = pd.read_csv(self.path)
    split_at = int(np.round(len(ticks) / 2))
    first_half = ticks.iloc[:split_at]
    second_half = ticks.iloc[split_at:]

    tick1 = "tick_data_1.csv"
    tick2 = "tick_data_2.csv"
    first_half.to_csv(tick1, index=False)
    second_half.to_csv(tick2, index=False)
    file_paths = [tick1, tick2]

    db1 = ds.get_dollar_bars(file_paths, threshold=threshold, batch_size=1000, verbose=False)
    db2 = ds.get_dollar_bars(file_paths, threshold=threshold, batch_size=50, verbose=False)
    db3 = ds.get_dollar_bars(file_paths, threshold=threshold, batch_size=10, verbose=False)
    ds.get_dollar_bars(self.path, threshold=threshold, batch_size=50, verbose=False,
                       to_csv=True, output_path='test.csv')
    db4 = pd.read_csv('test.csv', parse_dates=[0])

    # Same bar count and identical values for every construction path.
    for candidate in (db2, db3, db4):
        self.assertTrue(candidate.shape == db1.shape)
        self.assertTrue(np.all(candidate.values == db1.values))

    # First bar OHLC spot check against known fixture values.
    self.assertTrue(db1.loc[0, 'open'] == 1205)
    self.assertTrue(db1.loc[0, 'high'] == 1904.75)
    self.assertTrue(db1.loc[0, 'low'] == 1005.0)
    self.assertTrue(db1.loc[0, 'close'] == 1304.50)

    # Clean up; a missing file here means generation failed.
    for temp_file in (tick1, tick2, "test.csv"):
        os.remove(temp_file)
def test_list_as_run_input(self):
    """
    Feeding StandardBars.run a plain python list of ticks must reproduce the
    bars built by get_dollar_bars from the csv path.
    """
    threshold = 100000

    ticks = pd.read_csv(self.path)
    ticks['Date and Time'] = pd.to_datetime(ticks['Date and Time'])

    db1 = ds.get_dollar_bars(self.path, threshold=threshold, batch_size=1000, verbose=False)
    ds.get_dollar_bars(self.path, threshold=threshold, batch_size=50, verbose=False,
                       to_csv=True, output_path='test.csv')
    db2 = pd.read_csv('test.csv')
    db2['date_time'] = pd.to_datetime(db2.date_time)

    # Drive the bar generator directly with a list of tick rows.
    bars = ds.StandardBars(metric='cum_dollar_value', threshold=threshold)
    cols = ['date_time', 'tick_num', 'open', 'high', 'low', 'close', 'volume',
            'cum_buy_volume', 'cum_ticks', 'cum_dollar_value']
    db3 = pd.DataFrame(bars.run(ticks.values.tolist()), columns=cols)

    # Same bar count regardless of input form.
    self.assertTrue(db1.shape == db2.shape)
    self.assertTrue(db1.shape == db3.shape)

    # Identical cell values across all three constructions.
    self.assertTrue(np.all(db1.values == db2.values))
    self.assertTrue(np.all(db1.values == db3.values))
def test_wrong_input_value_error_raise(self):
    """
    get_dollar_bars must raise ValueError when given neither a pd.DataFrame
    nor a path to a csv file.
    """
    self.assertRaises(ValueError, ds.get_dollar_bars, None,
                      threshold=20, batch_size=1000, verbose=False)
def _generateDollarBars(self, endDate, threshold):
    """
    Build dollar bars from raw tick csv files for every asset in the
    portfolio and store them in self.ALTERNATIVE_BARS keyed by asset name.
    """
    self.ALTERNATIVE_BARS = {}
    home = os.path.expandvars('${HOME}')

    for asset in self.PORTFOLIO._portfolioDict:
        # Dollar bars require raw ticks in the csv (no prior aggregation),
        # and the timestamp must stay a regular column, not the index.
        read_path = (f'{home}/Desktop/quant-research-env/RegimeAnalysisContentSeries'
                     f'/Data/Data_Ticks/{asset}_BID_ASK_{endDate}.csv')

        ticks = pd.read_csv(read_path)

        # Mid price from bid/ask, rounded to 5 decimals.
        ticks[f'{asset}_mid_price'] = round(
            (ticks[f'{asset}_bid_price'] + ticks[f'{asset}_ask_price']) / 2, 5)

        # Keep only the columns the bar builder needs.
        ticks = ticks[[f'{asset}_timestamp', f'{asset}_mid_price', f'{asset}_ask_size']]
        print(ticks.head())

        # Aggregate ticks into dollar bars.
        bars = standard_data_structures.get_dollar_bars(
            ticks, threshold=threshold, batch_size=100000, verbose=True)

        # Log returns of bar closes; drop the leading NaN row.
        bars['Returns'] = np.log(bars.close / bars.close.shift(1))
        bars.dropna(how='any', inplace=True)

        print(f'DOLLAR BARS for: {asset} >> Shape: {bars.shape}')
        print(bars.head())

        # Store per-asset result.
        self.ALTERNATIVE_BARS[asset] = bars
def convert_dol_bar(self):
    """
    Convert self.df into dollar bars and write them to self.outFile.
    Bails out with a message when no threshold has been configured.
    """
    if not self.threshold:
        print("threshold not defined")
        return

    bars = ds.get_dollar_bars(self.df, threshold=self.threshold,
                              batch_size=1000, verbose=False)
    bars.to_csv(self.outFile)
def test_dollar_bars(self):
    """
    Dollar bars with both scalar and dynamic (pd.Series) thresholds:
    batch-size invariance, csv round-trip, first-bar OHLC values, and
    bar counts for different dynamic threshold levels.
    """
    threshold = 100000

    # Build dynamic-threshold Series indexed by tick timestamps.
    ticks = pd.read_csv(self.path)
    ticks.index = pd.to_datetime(ticks['Date and Time'])
    ticks = ticks.drop('Date and Time', axis=1)
    t_constant = pd.Series([100000], index=[ticks.index[0]])
    t_dynamic = pd.Series([10000, 20000, 50000],
                          index=[ticks.index[0], ticks.index[40], ticks.index[80]])
    t_low = pd.Series([1000], index=[ticks.index[0]])

    # Bars from the scalar threshold at several batch sizes.
    db1 = ds.get_dollar_bars(self.path, threshold=threshold, batch_size=1000, verbose=False)
    db2 = ds.get_dollar_bars(self.path, threshold=threshold, batch_size=50, verbose=False)
    db3 = ds.get_dollar_bars(self.path, threshold=threshold, batch_size=10, verbose=False)
    ds.get_dollar_bars(self.path, threshold=threshold, batch_size=50, verbose=False,
                       to_csv=True, output_path='test.csv')
    db4 = pd.read_csv('test.csv', parse_dates=[0])

    # Same bar count and identical values regardless of batch size / csv round-trip.
    for candidate in (db2, db3, db4):
        self.assertTrue(candidate.shape == db1.shape)
        self.assertTrue(np.all(candidate.values == db1.values))

    # First bar OHLC spot check against known fixture values.
    self.assertTrue(db1.loc[0, 'open'] == 1205)
    self.assertTrue(db1.loc[0, 'high'] == 1904.75)
    self.assertTrue(db1.loc[0, 'low'] == 1005.0)
    self.assertTrue(db1.loc[0, 'close'] == 1304.5)

    # Bars from dynamic (Series) thresholds.
    df_constant = ds.get_dollar_bars(self.path, threshold=t_constant, batch_size=1000, verbose=False)
    df_dynamic = ds.get_dollar_bars(self.path, threshold=t_dynamic, batch_size=1000, verbose=False)
    df_low = ds.get_dollar_bars(self.path, threshold=t_low, batch_size=1000, verbose=False)

    # A constant Series threshold must reproduce the scalar result exactly.
    self.assertTrue(df_constant.shape == db1.shape)
    self.assertTrue(np.all(df_constant.values == db1.values))

    # Different threshold levels yield the expected bar counts.
    self.assertTrue(df_dynamic.shape == (14, 10))
    self.assertTrue(df_low.shape == (99, 10))

    # Clean up; a missing file here means generation failed.
    os.remove('test.csv')
def _generateDARWINDollarBars(self, threshold):
    """
    Build dollar bars from DARWIN quote csv files for every asset in the
    portfolio and store them in self.ALTERNATIVE_BARS keyed by asset name.

    :param threshold: dollar-value threshold passed to get_dollar_bars.
    """
    self.ALTERNATIVE_BARS = {}
    homeStr = os.path.expandvars('${HOME}')

    # Loop for all the assets:
    for eachAssetName in self.PORTFOLIO._portfolioDict:
        # Dollar bars require raw ticks in the csv (no prior aggregation),
        # and the timestamp must end up as a regular column, not the index.
        READ_PATH = f'{homeStr}/Desktop/quant-research-env/DARWINStrategyContentSeries/Data/{eachAssetName}_former_Quotes.csv'

        # Read once with the timestamp as a parsed index.
        # (The original code read the same file twice and discarded the
        # first, plain read — that redundant read is removed here.)
        bars = pd.read_csv(READ_PATH, index_col=0, parse_dates=True,
                           infer_datetime_format=True)

        # Quotes carry no volume; synthesize one so the dollar-bar routine
        # has a volume column, then restore the timestamp as a column.
        bars['volume_fake'] = bars.quote * 100
        bars.reset_index(inplace=True)
        print(bars.head())

        # Keep only the columns the bar builder needs.
        bars = bars[['timestamp', 'quote', 'volume_fake']]

        # Aggregate quotes into dollar bars.
        bars = standard_data_structures.get_dollar_bars(
            bars, threshold=threshold, batch_size=100000, verbose=True)

        # Log returns of bar closes; drop the leading NaN row.
        bars['Returns'] = np.log(bars.close / bars.close.shift(1))
        bars.dropna(how='any', inplace=True)

        print(f'DOLLAR BARS for: {eachAssetName} >> Shape: {bars.shape}')
        print(bars.head())

        # Store per-asset result.
        self.ALTERNATIVE_BARS[eachAssetName] = bars
def test_dollar_bars_add_features(self):
    """
    Additional BarFeature columns computed per bar must equal the
    corresponding ratios of the bar's high/low columns.
    """
    # Arrange: two per-bar features over the raw Price series.
    threshold = 100000
    features = [
        BarFeature(
            name='high_over_low',
            function=lambda df: df['Price'].max() / df['Price'].min()
        ),
        BarFeature(
            name='low_over_high',
            function=lambda df: df['Price'].min() / df['Price'].max()
        ),
    ]

    # Act
    bars = ds.get_dollar_bars(self.path, threshold=threshold, batch_size=1000,
                              verbose=False, additional_features=features)

    # Assert: feature columns match the high/low ratios of each bar.
    self.assertTrue(np.all(bars['high_over_low'] == bars['high'] / bars['low']))
    self.assertTrue(np.all(bars['low_over_high'] == bars['low'] / bars['high']))
def make_dollar_bar(self):
    """
    Download the raw dataset, aggregate it into candles, and convert the
    candles into dollar bars using the configured threshold.

    Reference: https://kabukimining.hateblo.jp/entry/FinanceDollarBar
    """
    self.download_dataset()

    raw_df = pd.read_csv(self.filepath)
    print("data_df shape", raw_df.shape)
    print("data_df columns", raw_df.columns)
    print("data_df index", raw_df.index)

    candle_df = self.make_candles(raw_df, self.symbol)
    print("data_df head\n", candle_df.head())

    dollar = standard_data_structures.get_dollar_bars(
        candle_df, threshold=self.threshold, batch_size=1000000, verbose=True)
    print("dollar", dollar.shape)
    print("dollar head\n", dollar.head())
print(f"Date bars {len(time_bars)}: {100*len(time_bars) / len(data)}%") # Tick Bars tick_bars = standard_data_structures.get_tick_bars(data, threshold=10000) # ticks tick_bars["returns"] = np.log(tick_bars["close"]).diff() print(f"Tick bars {len(tick_bars)}: {100*len(tick_bars) / len(data)}%") # Dollar Bars # 1/50 of the avg daily dollar value dollar_threshold = (data.groupby(pd.Grouper( key="date_time", freq="1D")).sum()["dollar_value"].mean()) dollar_threshold /= 50 dollar_threshold = int(dollar_threshold) dollar_bars = standard_data_structures.get_dollar_bars( data, threshold=dollar_threshold, verbose=True) dollar_bars["returns"] = np.log(dollar_bars["close"]).diff() print(f"Dollar bars {len(dollar_bars)}: {100*len(dollar_bars) / len(data)}%") fig, ax = plt.subplots(nrows=3, ncols=3) for i, j in enumerate( zip(("time bars", "tick bars", "dollar bars"), (time_bars, tick_bars, dollar_bars))): l, df = j[0], j[1] df["cum_dollar_value"].hist(bins=100, ax=ax[i, 0], label=f"{l} $value") df["cum_buy_volume"].hist(bins=100, ax=ax[i, 1], label=f"{l} volume") rets = df["returns"].dropna() plot_acf(rets, lags=10, zero=False, ax=ax[i, 2], label=f"{l} autocorr") ax[i, 0].legend() ax[i, 1].legend() ax[i, 2].legend()