def test_tick_bars(self):
    """
    Test the tick bars implementation.

    Checks that:
      * different batch sizes produce identical bars,
      * the ``to_csv`` output round-trips through ``pd.read_csv``,
      * the first bar's OHLC values match known reference values.
    """
    threshold = 10

    db1 = ds.get_tick_bars(self.path, threshold=threshold, batch_size=1000, verbose=False)
    db2 = ds.get_tick_bars(self.path, threshold=threshold, batch_size=50, verbose=False)
    db3 = ds.get_tick_bars(self.path, threshold=threshold, batch_size=10, verbose=False)
    ds.get_tick_bars(self.path, threshold=threshold, batch_size=50, verbose=False,
                     to_csv=True, output_path='test.csv')

    # try/finally so the generated csv is removed even when an assertion
    # fails; os.remove still raises (failing the test) if the file was
    # never generated.
    try:
        db4 = pd.read_csv('test.csv')

        # Assert diff batch sizes have same number of bars
        self.assertTrue(db1.shape == db2.shape)
        self.assertTrue(db1.shape == db3.shape)
        self.assertTrue(db4.shape == db1.shape)

        # Assert same values
        self.assertTrue(np.all(db1.values == db2.values))
        self.assertTrue(np.all(db1.values == db3.values))
        self.assertTrue(np.all(db4.values == db1.values))

        # Assert OHLC is correct
        self.assertTrue(db1.loc[0, 'open'] == 1205)
        self.assertTrue(db1.loc[0, 'high'] == 1904.75)
        self.assertTrue(db1.loc[0, 'low'] == 1005.0)
        self.assertTrue(db1.loc[0, 'close'] == 1304.50)
    finally:
        # delete generated csv file (if it wasn't generated test would fail)
        os.remove('test.csv')
def _generateTickBars(self, endDate, threshold):
    """
    Build tick-bar representations for every asset in the portfolio and
    store them in ``self.ALTERNATIVE_BARS`` keyed by asset symbol.

    Reads raw bid/ask tick CSVs, computes a mid price, samples tick bars
    via ``standard_data_structures.get_tick_bars`` and appends log returns.
    """
    self.ALTERNATIVE_BARS = {}
    home = os.path.expandvars('${HOME}')

    for asset in self.PORTFOLIO._portfolioDict:
        # Tick bars need raw ticks in the CSV — no other aggregation.
        # The timestamp must stay a regular column: using it as the
        # index makes the bar generator error out.
        csv_path = (f'{home}/Desktop/quant-research-env/RegimeAnalysisContentSeries/'
                    f'Data/Data_Ticks/{asset}_BID_ASK_{endDate}.csv')
        ticks = pd.read_csv(csv_path)

        # Mid price = (bid + ask) / 2, rounded to 5 decimals.
        ticks[f'{asset}_mid_price'] = round(
            (ticks[f'{asset}_bid_price'] + ticks[f'{asset}_ask_price']) / 2, 5)

        # Keep only the columns the bar generator expects: timestamp,
        # price and a size column.
        ticks = ticks[[f'{asset}_timestamp',
                       f'{asset}_mid_price',
                       f'{asset}_ask_size']]
        print(ticks.head())

        # Sample the tick bars.
        bars = standard_data_structures.get_tick_bars(
            ticks, threshold=threshold, batch_size=100000, verbose=False)

        # Log returns on bar closes; the first row is NaN and is dropped.
        bars['Returns'] = np.log(bars.close / bars.close.shift(1))
        bars.dropna(how='any', inplace=True)

        print(f'TICK BARS for: {asset} >> Shape: {bars.shape}')
        print(bars.head())

        self.ALTERNATIVE_BARS[asset] = bars
def test_tick_bars(self):
    """
    Test the tick bars implementation.

    Checks that:
      * different batch sizes produce identical bars,
      * the ``to_csv`` output round-trips through ``pd.read_csv``,
      * the first bar's OHLC values match known reference values,
      * a constant dynamic threshold (pd.Series) reproduces the scalar
        threshold result, and varying thresholds change the bar count.
    """
    threshold = 10

    # Creating a dynamic threshold
    data = pd.read_csv(self.path)
    data.index = pd.to_datetime(data['Date and Time'])
    data = data.drop('Date and Time', axis=1)
    t_constant = pd.Series([10], index=[data.index[0]])
    t_dynamic = pd.Series([2, 5, 10],
                          index=[data.index[0], data.index[40], data.index[80]])
    t_low = pd.Series([2], index=[data.index[0]])

    db1 = ds.get_tick_bars(self.path, threshold=threshold, batch_size=1000, verbose=False)
    db2 = ds.get_tick_bars(self.path, threshold=threshold, batch_size=50, verbose=False)
    db3 = ds.get_tick_bars(self.path, threshold=threshold, batch_size=10, verbose=False)
    ds.get_tick_bars(self.path, threshold=threshold, batch_size=50, verbose=False,
                     to_csv=True, output_path='test.csv')

    # try/finally so the generated csv is removed even when an assertion
    # fails; os.remove still raises (failing the test) if the file was
    # never generated.
    try:
        db4 = pd.read_csv('test.csv', parse_dates=[0])

        # Assert diff batch sizes have same number of bars
        self.assertTrue(db1.shape == db2.shape)
        self.assertTrue(db1.shape == db3.shape)
        self.assertTrue(db4.shape == db1.shape)

        # Assert same values
        self.assertTrue(np.all(db1.values == db2.values))
        self.assertTrue(np.all(db1.values == db3.values))
        self.assertTrue(np.all(db4.values == db1.values))

        # Assert OHLC is correct
        self.assertTrue(db1.loc[0, 'open'] == 1205)
        self.assertTrue(db1.loc[0, 'high'] == 1904.75)
        self.assertTrue(db1.loc[0, 'low'] == 1005.0)
        self.assertTrue(db1.loc[0, 'close'] == 1304.50)

        # Testing dynamic threshold size
        df_constant = ds.get_tick_bars(self.path, threshold=t_constant,
                                       batch_size=1000, verbose=False)
        df_dynamic = ds.get_tick_bars(self.path, threshold=t_dynamic,
                                      batch_size=1000, verbose=False)
        df_low = ds.get_tick_bars(self.path, threshold=t_low,
                                  batch_size=1000, verbose=False)

        # Assert that constant size outputs the same result
        self.assertTrue(df_constant.shape == db1.shape)
        self.assertTrue(np.all(df_constant.values == db1.values))

        # Assert sizes of different thresolds
        self.assertTrue(df_dynamic.shape == (28, 10))
        self.assertTrue(df_low.shape == (50, 10))
    finally:
        # delete generated csv file (if it wasn't generated test would fail)
        os.remove('test.csv')
def _generateDARWINTickBars(self, threshold):
    """
    Generate tick-bar representations for each DARWIN asset in the
    portfolio and store them in ``self.ALTERNATIVE_BARS`` keyed by symbol.

    Reads each asset's quote CSV, synthesizes a fake volume column
    (quotes carry no volume), samples tick bars and appends log returns.
    """
    # Generate tick bar representations:
    self.ALTERNATIVE_BARS = {}
    homeStr = os.path.expandvars('${HOME}')

    # Loop for all the assets:
    for eachAssetName in self.PORTFOLIO._portfolioDict:
        # Tick Bars > We need to have ticks in the CSV no other form of
        # aggregation. The timestamp doesn't need to be the index > if it
        # is an index it gives an error.
        READ_PATH = (f'{homeStr}/Desktop/quant-research-env/DARWINStrategyContentSeries/'
                     f'Data/{eachAssetName}_former_Quotes.csv')

        # Read the data. NOTE: `infer_datetime_format` was dropped — it is
        # deprecated since pandas 2.0 (strict inference is the default now)
        # and parsing results are unchanged.
        bars = pd.read_csv(READ_PATH, index_col=0, parse_dates=True)

        # Generate the fake volume column and take the index out:
        bars['volume_fake'] = bars.quote * 100
        bars.reset_index(inplace=True)
        print(bars.head())

        # Get the suitable columns:
        bars = bars[['timestamp', 'quote', 'volume_fake']]

        # Generate the tick bars.
        bars = standard_data_structures.get_tick_bars(
            bars, threshold=threshold, batch_size=100000, verbose=False)

        # Get log returns for these bars; the first row is NaN and dropped.
        bars['Returns'] = np.log(bars.close / bars.close.shift(1))
        bars.dropna(how='any', inplace=True)

        print(f'TICK BARS for: {eachAssetName} >> Shape: {bars.shape}')
        print(bars.head())

        # Add them to the dict based on their symbol:
        self.ALTERNATIVE_BARS[eachAssetName] = bars
def test_tick_bars_add_features(self):
    """
    Tests the additional features functionality with tick bars.
    """
    # Arrange: two reciprocal features computed over each bar's raw ticks.
    threshold = 10
    extra_features = [
        BarFeature(
            name='high_over_low',
            function=lambda df: df['Price'].max() / df['Price'].min()
        ),
        BarFeature(
            name='low_over_high',
            function=lambda df: df['Price'].min() / df['Price'].max()
        ),
    ]

    # Act
    bars = ds.get_tick_bars(self.path, threshold=threshold, batch_size=1000,
                            verbose=False, additional_features=extra_features)

    # Assert: each feature column must equal the ratio of the bar's
    # high/low columns, since both derive from the same tick prices.
    self.assertTrue(np.all(bars['high_over_low'] == bars['high'] / bars['low']))
    self.assertTrue(np.all(bars['low_over_high'] == bars['low'] / bars['high']))
# --- Time bars: fixed 15-minute calendar buckets over the raw ticks. ---
# NOTE(review): assumes `data` (defined elsewhere) has columns
# date_time / price / volume / dollar_value — confirm against the loader.
grouper = data.groupby(pd.Grouper(key="date_time", freq="15min"))
time_bars = grouper.aggregate(
    open=pd.NamedAgg(column="price", aggfunc="first"),
    close=pd.NamedAgg(column="price", aggfunc="last"),
    cum_buy_volume=pd.NamedAgg(column="volume", aggfunc="sum"),
    cum_dollar_value=pd.NamedAgg(column="dollar_value", aggfunc="sum"),
)
# Drop empty buckets instead of forward-filling closes:
time_bars = time_bars[time_bars["cum_buy_volume"] > 0].copy()
# NO!: time_bars["close"] = time_bars["close"].fillna(method="ffill")
time_bars["returns"] = np.log(time_bars["close"]).diff()
# ~0.34%
print(f"Date bars {len(time_bars)}: {100*len(time_bars) / len(data)}%")

# --- Tick Bars: one bar per fixed number of ticks. ---
tick_bars = standard_data_structures.get_tick_bars(data, threshold=10000)  # ticks
tick_bars["returns"] = np.log(tick_bars["close"]).diff()
print(f"Tick bars {len(tick_bars)}: {100*len(tick_bars) / len(data)}%")

# --- Dollar Bars: one bar per fixed dollar value traded. ---
# Threshold = 1/50 of the avg daily dollar value.
dollar_threshold = (data.groupby(pd.Grouper(
    key="date_time", freq="1D")).sum()["dollar_value"].mean())
dollar_threshold /= 50
dollar_threshold = int(dollar_threshold)
dollar_bars = standard_data_structures.get_dollar_bars(
    data, threshold=dollar_threshold, verbose=True)
dollar_bars["returns"] = np.log(dollar_bars["close"]).diff()
print(f"Dollar bars {len(dollar_bars)}: {100*len(dollar_bars) / len(data)}%")