def test_df_as_input(self):
    """
    Checks that dollar bars built from a csv path, from a csv dump on disk,
    and from an in-memory DataFrame all produce identical output.
    """
    threshold = 100000

    # Same tick data loaded into a DataFrame with parsed timestamps.
    frame = pd.read_csv(self.path)
    frame['Date and Time'] = pd.to_datetime(frame['Date and Time'])

    bars_from_path = ds.get_dollar_bars(self.path, threshold=threshold, batch_size=1000, verbose=False)

    # Round-trip through a csv dump on disk.
    ds.get_dollar_bars(self.path, threshold=threshold, batch_size=50, verbose=False,
                       to_csv=True, output_path='test.csv')
    bars_from_csv = pd.read_csv('test.csv')
    bars_from_csv['date_time'] = pd.to_datetime(bars_from_csv.date_time)

    bars_from_frame = ds.get_dollar_bars(frame, threshold=threshold, batch_size=10, verbose=False)

    # Same number of bars regardless of input form / batch size.
    self.assertTrue(bars_from_path.shape == bars_from_csv.shape)
    self.assertTrue(bars_from_path.shape == bars_from_frame.shape)

    # Identical cell values across all three constructions.
    self.assertTrue(np.all(bars_from_path.values == bars_from_csv.values))
    self.assertTrue(np.all(bars_from_path.values == bars_from_frame.values))
def test_dollar_bars(self):
    """
    Dollar bars: identical results across batch sizes and a csv round-trip,
    plus a spot check of the first bar's OHLC values.
    """
    threshold = 100000

    db1 = ds.get_dollar_bars(self.path, threshold=threshold, batch_size=1000, verbose=False)
    db2 = ds.get_dollar_bars(self.path, threshold=threshold, batch_size=50, verbose=False)
    db3 = ds.get_dollar_bars(self.path, threshold=threshold, batch_size=10, verbose=False)
    ds.get_dollar_bars(self.path, threshold=threshold, batch_size=50, verbose=False,
                       to_csv=True, output_path='test.csv')
    db4 = pd.read_csv('test.csv')

    # Same bar count and identical values regardless of batch size / csv round-trip.
    for candidate in (db2, db3, db4):
        self.assertTrue(candidate.shape == db1.shape)
        self.assertTrue(np.all(candidate.values == db1.values))

    # First bar OHLC spot check against known fixture values.
    expected_ohlc = {'open': 1205, 'high': 1904.75, 'low': 1005.0, 'close': 1304.5}
    for column, expected in expected_ohlc.items():
        self.assertTrue(db1.loc[0, column] == expected)

    # Clean up; a missing file here means generation failed.
    os.remove('test.csv')
def test_multiple_csv_file_input(self):
    """
    Splitting the tick data across two csv files must yield the same dollar
    bars as a single file, regardless of batch size.
    """
    threshold = 100000

    # Split the fixture into two halves and write them to temporary csv files.
    ticks = pd.read_csv(self.path)
    split_at = int(np.round(len(ticks) / 2))
    first_half = ticks.iloc[:split_at]
    second_half = ticks.iloc[split_at:]

    tick1 = "tick_data_1.csv"
    tick2 = "tick_data_2.csv"
    first_half.to_csv(tick1, index=False)
    second_half.to_csv(tick2, index=False)
    file_paths = [tick1, tick2]

    db1 = ds.get_dollar_bars(file_paths, threshold=threshold, batch_size=1000, verbose=False)
    db2 = ds.get_dollar_bars(file_paths, threshold=threshold, batch_size=50, verbose=False)
    db3 = ds.get_dollar_bars(file_paths, threshold=threshold, batch_size=10, verbose=False)
    ds.get_dollar_bars(self.path, threshold=threshold, batch_size=50, verbose=False,
                       to_csv=True, output_path='test.csv')
    db4 = pd.read_csv('test.csv', parse_dates=[0])

    # Same bar count and identical values for every construction path.
    for candidate in (db2, db3, db4):
        self.assertTrue(candidate.shape == db1.shape)
        self.assertTrue(np.all(candidate.values == db1.values))

    # First bar OHLC spot check against known fixture values.
    self.assertTrue(db1.loc[0, 'open'] == 1205)
    self.assertTrue(db1.loc[0, 'high'] == 1904.75)
    self.assertTrue(db1.loc[0, 'low'] == 1005.0)
    self.assertTrue(db1.loc[0, 'close'] == 1304.50)

    # Clean up; a missing file here means generation failed.
    for temp_file in (tick1, tick2, "test.csv"):
        os.remove(temp_file)
def test_list_as_run_input(self):
    """
    Feeding StandardBars.run a plain python list of ticks must reproduce the
    bars built by get_dollar_bars from the csv path.
    """
    threshold = 100000

    ticks = pd.read_csv(self.path)
    ticks['Date and Time'] = pd.to_datetime(ticks['Date and Time'])

    db1 = ds.get_dollar_bars(self.path, threshold=threshold, batch_size=1000, verbose=False)
    ds.get_dollar_bars(self.path, threshold=threshold, batch_size=50, verbose=False,
                       to_csv=True, output_path='test.csv')
    db2 = pd.read_csv('test.csv')
    db2['date_time'] = pd.to_datetime(db2.date_time)

    # Drive the bar generator directly with a list of tick rows.
    bars = ds.StandardBars(metric='cum_dollar_value', threshold=threshold)
    cols = ['date_time', 'tick_num', 'open', 'high', 'low', 'close', 'volume',
            'cum_buy_volume', 'cum_ticks', 'cum_dollar_value']
    db3 = pd.DataFrame(bars.run(ticks.values.tolist()), columns=cols)

    # Same bar count regardless of input form.
    self.assertTrue(db1.shape == db2.shape)
    self.assertTrue(db1.shape == db3.shape)

    # Identical cell values across all three constructions.
    self.assertTrue(np.all(db1.values == db2.values))
    self.assertTrue(np.all(db1.values == db3.values))
def test_wrong_input_value_error_raise(self):
    """
    get_dollar_bars must raise ValueError when given neither a pd.DataFrame
    nor a path to a csv file.
    """
    self.assertRaises(ValueError, ds.get_dollar_bars, None,
                      threshold=20, batch_size=1000, verbose=False)
def _generateDollarBars(self, endDate, threshold):
    """
    Build dollar bars from raw tick csv files for every asset in the
    portfolio and store them in self.ALTERNATIVE_BARS keyed by asset name.
    """
    self.ALTERNATIVE_BARS = {}
    home = os.path.expandvars('${HOME}')

    for asset in self.PORTFOLIO._portfolioDict:
        # Dollar bars require raw ticks in the csv (no prior aggregation),
        # and the timestamp must stay a regular column, not the index.
        read_path = (f'{home}/Desktop/quant-research-env/RegimeAnalysisContentSeries'
                     f'/Data/Data_Ticks/{asset}_BID_ASK_{endDate}.csv')

        ticks = pd.read_csv(read_path)

        # Mid price from bid/ask, rounded to 5 decimals.
        ticks[f'{asset}_mid_price'] = round(
            (ticks[f'{asset}_bid_price'] + ticks[f'{asset}_ask_price']) / 2, 5)

        # Keep only the columns the bar builder needs.
        ticks = ticks[[f'{asset}_timestamp', f'{asset}_mid_price', f'{asset}_ask_size']]
        print(ticks.head())

        # Aggregate ticks into dollar bars.
        bars = standard_data_structures.get_dollar_bars(
            ticks, threshold=threshold, batch_size=100000, verbose=True)

        # Log returns of bar closes; drop the leading NaN row.
        bars['Returns'] = np.log(bars.close / bars.close.shift(1))
        bars.dropna(how='any', inplace=True)

        print(f'DOLLAR BARS for: {asset} >> Shape: {bars.shape}')
        print(bars.head())

        # Store per-asset result.
        self.ALTERNATIVE_BARS[asset] = bars
def convert_dol_bar(self):
    """
    Convert self.df into dollar bars and write them to self.outFile.
    Bails out with a message when no threshold has been configured.
    """
    if not self.threshold:
        print("threshold not defined")
        return

    bars = ds.get_dollar_bars(self.df, threshold=self.threshold,
                              batch_size=1000, verbose=False)
    bars.to_csv(self.outFile)
def test_dollar_bars(self):
    """
    Dollar bars with both scalar and dynamic (pd.Series) thresholds:
    batch-size invariance, csv round-trip, first-bar OHLC values, and
    bar counts for different dynamic threshold levels.
    """
    threshold = 100000

    # Build dynamic-threshold Series indexed by tick timestamps.
    ticks = pd.read_csv(self.path)
    ticks.index = pd.to_datetime(ticks['Date and Time'])
    ticks = ticks.drop('Date and Time', axis=1)
    t_constant = pd.Series([100000], index=[ticks.index[0]])
    t_dynamic = pd.Series([10000, 20000, 50000],
                          index=[ticks.index[0], ticks.index[40], ticks.index[80]])
    t_low = pd.Series([1000], index=[ticks.index[0]])

    # Bars from the scalar threshold at several batch sizes.
    db1 = ds.get_dollar_bars(self.path, threshold=threshold, batch_size=1000, verbose=False)
    db2 = ds.get_dollar_bars(self.path, threshold=threshold, batch_size=50, verbose=False)
    db3 = ds.get_dollar_bars(self.path, threshold=threshold, batch_size=10, verbose=False)
    ds.get_dollar_bars(self.path, threshold=threshold, batch_size=50, verbose=False,
                       to_csv=True, output_path='test.csv')
    db4 = pd.read_csv('test.csv', parse_dates=[0])

    # Same bar count and identical values regardless of batch size / csv round-trip.
    for candidate in (db2, db3, db4):
        self.assertTrue(candidate.shape == db1.shape)
        self.assertTrue(np.all(candidate.values == db1.values))

    # First bar OHLC spot check against known fixture values.
    self.assertTrue(db1.loc[0, 'open'] == 1205)
    self.assertTrue(db1.loc[0, 'high'] == 1904.75)
    self.assertTrue(db1.loc[0, 'low'] == 1005.0)
    self.assertTrue(db1.loc[0, 'close'] == 1304.5)

    # Bars from dynamic (Series) thresholds.
    df_constant = ds.get_dollar_bars(self.path, threshold=t_constant, batch_size=1000, verbose=False)
    df_dynamic = ds.get_dollar_bars(self.path, threshold=t_dynamic, batch_size=1000, verbose=False)
    df_low = ds.get_dollar_bars(self.path, threshold=t_low, batch_size=1000, verbose=False)

    # A constant Series threshold must reproduce the scalar result exactly.
    self.assertTrue(df_constant.shape == db1.shape)
    self.assertTrue(np.all(df_constant.values == db1.values))

    # Different threshold levels yield the expected bar counts.
    self.assertTrue(df_dynamic.shape == (14, 10))
    self.assertTrue(df_low.shape == (99, 10))

    # Clean up; a missing file here means generation failed.
    os.remove('test.csv')
def _generateDARWINDollarBars(self, threshold):
    """
    Build dollar bars from DARWIN quote csv files for every asset in the
    portfolio and store them in self.ALTERNATIVE_BARS keyed by asset name.

    :param threshold: dollar-value threshold passed to get_dollar_bars.
    """
    self.ALTERNATIVE_BARS = {}
    homeStr = os.path.expandvars('${HOME}')

    # Loop for all the assets:
    for eachAssetName in self.PORTFOLIO._portfolioDict:
        # Dollar bars require raw ticks in the csv (no prior aggregation),
        # and the timestamp must end up as a regular column, not the index.
        READ_PATH = f'{homeStr}/Desktop/quant-research-env/DARWINStrategyContentSeries/Data/{eachAssetName}_former_Quotes.csv'

        # Read once with the timestamp as a parsed index.
        # (The original code read the same file twice and discarded the
        # first, plain read — that redundant read is removed here.)
        bars = pd.read_csv(READ_PATH, index_col=0, parse_dates=True,
                           infer_datetime_format=True)

        # Quotes carry no volume; synthesize one so the dollar-bar routine
        # has a volume column, then restore the timestamp as a column.
        bars['volume_fake'] = bars.quote * 100
        bars.reset_index(inplace=True)
        print(bars.head())

        # Keep only the columns the bar builder needs.
        bars = bars[['timestamp', 'quote', 'volume_fake']]

        # Aggregate quotes into dollar bars.
        bars = standard_data_structures.get_dollar_bars(
            bars, threshold=threshold, batch_size=100000, verbose=True)

        # Log returns of bar closes; drop the leading NaN row.
        bars['Returns'] = np.log(bars.close / bars.close.shift(1))
        bars.dropna(how='any', inplace=True)

        print(f'DOLLAR BARS for: {eachAssetName} >> Shape: {bars.shape}')
        print(bars.head())

        # Store per-asset result.
        self.ALTERNATIVE_BARS[eachAssetName] = bars
def test_dollar_bars_add_features(self):
    """
    Additional BarFeature columns computed per bar must equal the
    corresponding ratios of the bar's high/low columns.
    """
    # Arrange: two per-bar features over the raw Price series.
    threshold = 100000
    features = [
        BarFeature(
            name='high_over_low',
            function=lambda df: df['Price'].max() / df['Price'].min()
        ),
        BarFeature(
            name='low_over_high',
            function=lambda df: df['Price'].min() / df['Price'].max()
        ),
    ]

    # Act
    bars = ds.get_dollar_bars(self.path, threshold=threshold, batch_size=1000,
                              verbose=False, additional_features=features)

    # Assert: feature columns match the high/low ratios of each bar.
    self.assertTrue(np.all(bars['high_over_low'] == bars['high'] / bars['low']))
    self.assertTrue(np.all(bars['low_over_high'] == bars['low'] / bars['high']))
def make_dollar_bar(self):
    """
    Download the raw dataset, aggregate it into candles, and convert the
    candles into dollar bars using the configured threshold.

    Reference: https://kabukimining.hateblo.jp/entry/FinanceDollarBar
    """
    self.download_dataset()

    raw_df = pd.read_csv(self.filepath)
    print("data_df shape", raw_df.shape)
    print("data_df columns", raw_df.columns)
    print("data_df index", raw_df.index)

    candle_df = self.make_candles(raw_df, self.symbol)
    print("data_df head\n", candle_df.head())

    dollar = standard_data_structures.get_dollar_bars(
        candle_df, threshold=self.threshold, batch_size=1000000, verbose=True)
    print("dollar", dollar.shape)
    print("dollar head\n", dollar.head())
print(f"Date bars {len(time_bars)}: {100*len(time_bars) / len(data)}%") # Tick Bars tick_bars = standard_data_structures.get_tick_bars(data, threshold=10000) # ticks tick_bars["returns"] = np.log(tick_bars["close"]).diff() print(f"Tick bars {len(tick_bars)}: {100*len(tick_bars) / len(data)}%") # Dollar Bars # 1/50 of the avg daily dollar value dollar_threshold = (data.groupby(pd.Grouper( key="date_time", freq="1D")).sum()["dollar_value"].mean()) dollar_threshold /= 50 dollar_threshold = int(dollar_threshold) dollar_bars = standard_data_structures.get_dollar_bars( data, threshold=dollar_threshold, verbose=True) dollar_bars["returns"] = np.log(dollar_bars["close"]).diff() print(f"Dollar bars {len(dollar_bars)}: {100*len(dollar_bars) / len(data)}%") fig, ax = plt.subplots(nrows=3, ncols=3) for i, j in enumerate( zip(("time bars", "tick bars", "dollar bars"), (time_bars, tick_bars, dollar_bars))): l, df = j[0], j[1] df["cum_dollar_value"].hist(bins=100, ax=ax[i, 0], label=f"{l} $value") df["cum_buy_volume"].hist(bins=100, ax=ax[i, 1], label=f"{l} volume") rets = df["returns"].dropna() plot_acf(rets, lags=10, zero=False, ax=ax[i, 2], label=f"{l} autocorr") ax[i, 0].legend() ax[i, 1].legend() ax[i, 2].legend()