def get_exchange_history_window(self, exchange_name, assets, end_dt,
                                bar_count, frequency, field,
                                data_frequency, ffill=True):
    """
    Build a resampled price history window from an exchange bundle.

    Parameters
    ----------
    exchange_name: str
        Key used to look up the bundle in ``self.exchange_bundles``.
    assets: list[TradingPair]
    end_dt: datetime
    bar_count: int
    frequency: str
    field: str
    data_frequency: str
    ffill: bool
        Accepted for interface compatibility; not read by this method.

    Returns
    -------
    DataFrame
        The output of ``resample_history_df`` for the loaded window.

    """
    # TODO: verify that the exchange supports the timeframe
    exchange_bundle = self.exchange_bundles[exchange_name]  # type: ExchangeBundle

    freq, candle_size, unit, adj_data_frequency = get_frequency(
        frequency, data_frequency, supported_freqs=['T', 'D']
    )
    # Number of bars to load at the adjusted data frequency.
    total_bars = candle_size * bar_count

    # A minute-mode request that resolves to daily data is aligned to
    # the start of the day.
    wants_daily_from_minute = (
        data_frequency == 'minute' and adj_data_frequency == 'daily'
    )
    if wants_daily_from_minute:
        end_dt = end_dt.floor('1D')

    window = exchange_bundle.get_history_window_series_and_load(
        assets=assets,
        end_dt=end_dt,
        bar_count=total_bars,
        field=field,
        data_frequency=adj_data_frequency,
        algo_end_dt=self._last_available_session,
    )

    window_start = get_start_dt(end_dt, total_bars, adj_data_frequency)
    return resample_history_df(
        pd.DataFrame(window), freq, field, window_start
    )
def test_ingest_candles(self):
    """
    Fetch minute candles from the exchange, ingest them into the
    exchange bundle, then read the same window back and print it.

    This is a smoke test: it makes no assertions beyond the 'raise'
    behaviors requested from ``ingest_df``.
    """
    exchange_name = 'bitfinex'
    data_frequency = 'minute'
    ohlcv_fields = ['open', 'high', 'low', 'close', 'volume']

    exchange = get_exchange(exchange_name)
    bundle = ExchangeBundle(exchange)
    assets = [exchange.get_asset('iot_btc')]

    end_dt = pd.to_datetime('2017-10-20', utc=True)
    bar_count = 100
    start_dt = get_start_dt(end_dt, bar_count, data_frequency)

    candles = exchange.get_candles(
        assets=assets,
        start_dt=start_dt,
        end_dt=end_dt,
        bar_count=bar_count,
        freq='1T',
    )
    writer = bundle.get_writer(start_dt, end_dt, data_frequency)

    for asset in assets:
        asset_candles = candles[asset]
        dates = [candle['last_traded'] for candle in asset_candles]
        values = {
            name: [candle[name] for candle in asset_candles]
            for name in ohlcv_fields
        }

        periods = bundle.get_calendar_periods_range(
            start_dt, end_dt, data_frequency
        )
        # Re-index on the calendar periods and forward-fill the gaps.
        ohlcv = pd.DataFrame(values, index=dates)
        ohlcv = ohlcv.loc[periods].fillna(method='ffill')

        # TODO: why do I get an extra bar?
        bundle.ingest_df(
            ohlcv_df=ohlcv,
            data_frequency=data_frequency,
            asset=asset,
            writer=writer,
            empty_rows_behavior='raise',
            duplicates_behavior='raise',
        )

    # Read back with a fresh reader so the ingested data is visible.
    bundle_series = bundle.get_history_window_series(
        assets=assets,
        end_dt=end_dt,
        bar_count=bar_count,
        field='close',
        data_frequency=data_frequency,
        reset_reader=True,
    )
    df = pd.DataFrame(bundle_series)
    print('\n' + df_to_string(df))
def test_validate_data(self):
    """
    Print bundle close prices next to the closes served by the exchange.

    The bundle window requested is ``bar_count * 5`` bars while the
    exchange candles cover ``bar_count`` bars; the two are left-joined
    on the exchange series' index. No assertions are made.
    """
    exchange_name = 'bitfinex'
    data_frequency = 'minute'
    bar_count = 60

    exchange = get_exchange(exchange_name)
    exchange_bundle = ExchangeBundle(exchange)
    assets = [exchange.get_asset('iot_btc')]
    end_dt = pd.to_datetime('2017-9-2 1:00', utc=True)

    bundle_series = exchange_bundle.get_history_window_series(
        assets=assets,
        end_dt=end_dt,
        bar_count=bar_count * 5,
        field='close',
        data_frequency='minute',
    )
    candles = exchange.get_candles(
        assets=assets,
        end_dt=end_dt,
        bar_count=bar_count,
        freq='1T',
    )
    start_dt = get_start_dt(end_dt, bar_count, data_frequency)

    frames = []
    for asset in assets:
        bundle_prices = bundle_series[asset]
        bundle_df = pd.DataFrame(
            data={'bundle_price': bundle_prices},
            index=bundle_prices.index,
        )

        exchange_series = exchange.get_series_from_candles(
            candles=candles[asset],
            start_dt=start_dt,
            end_dt=end_dt,
            data_frequency=data_frequency,
            field='close',
        )
        exchange_df = pd.DataFrame(
            data={'exchange_price': exchange_series},
            index=exchange_series.index,
        )

        # Key every row by (asset symbol, timestamp) so frames for
        # several assets can be stacked.
        joined = exchange_df.join(bundle_df, how='left')
        joined['last_traded'] = joined.index
        joined['asset'] = asset.symbol
        joined = joined.set_index(['asset', 'last_traded'])
        frames.append(joined)

    df = pd.concat(frames)
    print('\n' + df_to_string(df))
def get_history_window_with_bundle(self, assets, end_dt, bar_count,
                                   frequency, field, data_frequency=None,
                                   ffill=True, force_auto_ingest=False):
    """
    Public API method that returns a dataframe containing the requested
    history window. Data is read from the bundle first; bars that the
    bundle does not contain up to ``end_dt`` are fetched from the
    exchange and appended.

    Parameters
    ----------
    assets : list[TradingPair]
        The assets whose data is desired.

    end_dt: datetime
        The date of the last bar.

    bar_count: int
        The number of bars desired.

    frequency: string
        "1d" or "1m"

    field: string
        The desired field of the asset.

    data_frequency: string
        The frequency of the data to query; i.e. whether the data is
        'daily' or 'minute' bars.

    # TODO: fill how?
    ffill: boolean
        Forward-fill missing values. Not read by this method at present.

    force_auto_ingest: boolean
        Forwarded to the bundle's ``get_history_window_series_and_load``.

    Returns
    -------
    DataFrame
        A dataframe containing the requested data, with rows holding
        any NaN dropped.

    """
    # TODO: this function needs some work,
    # we're currently using it just for benchmark data
    freq, candle_size, unit, data_frequency = get_frequency(
        frequency, data_frequency, supported_freqs=['T', 'D'])
    adj_bar_count = candle_size * bar_count

    try:
        loaded = self.bundle.get_history_window_series_and_load(
            assets=assets,
            end_dt=end_dt,
            bar_count=adj_bar_count,
            field=field,
            data_frequency=data_frequency,
            force_auto_ingest=force_auto_ingest)

    except (PricingDataNotLoadedError, NoDataAvailableOnExchange):
        loaded = dict()

    # The bundle may hand back a DataFrame or a dict. Work on a plain
    # dict of per-asset Series: assigning a longer Series into a
    # DataFrame column would re-align it to the existing index and
    # silently drop the trailing bars added below. Empty series are
    # treated as missing so that `.index[-1]` cannot fail.
    series = dict(
        (key, loaded[key]) for key in loaded if len(loaded[key]) > 0
    )

    for asset in assets:
        if asset not in series or series[asset].index[-1] < end_dt:
            # Adding bars too recent to be contained in the consolidated
            # exchanges bundles. We go directly against the exchange
            # to retrieve the candles.
            if asset in series:
                bundled = series[asset]
                trailing_dt = bundled.index[-1] + get_delta(1, data_frequency)
                # Most recent bundled value, used to seed the fill of
                # the trailing candles.
                last_value = bundled.iloc[-1]
            else:
                bundled = None
                trailing_dt = get_start_dt(
                    end_dt, adj_bar_count, data_frequency)
                last_value = np.nan

            # The get_history method supports multiple asset
            # Use the original frequency to let each api optimize
            # the size of result sets
            trailing_bars = get_periods(trailing_dt, end_dt, freq)
            candles = self.get_candles(
                freq=freq,
                assets=asset,
                end_dt=end_dt,
                bar_count=trailing_bars if trailing_bars < 500 else 500,
            )

            # Create a series with the common data_frequency, ffill
            # missing values
            candle_series = self.get_series_from_candles(
                candles=candles,
                start_dt=trailing_dt,
                end_dt=end_dt,
                data_frequency=data_frequency,
                field=field,
                previous_value=last_value)

            if bundled is None:
                series[asset] = candle_series
            else:
                # Concatenation returns a new object; store it back so
                # the trailing bars are part of the result.
                series[asset] = pd.concat([bundled, candle_series])

    df = resample_history_df(pd.DataFrame(series), freq, field)
    # TODO: consider this more carefully
    df.dropna(inplace=True)

    return df
def get_history_window_series(self, assets, end_dt, bar_count, field,
                              data_frequency, reset_reader=False):
    """
    Read a window of one field from the bundle reader, per asset.

    Parameters
    ----------
    assets: list[TradingPair]
    end_dt: pd.Timestamp
    bar_count: int
    field: str
    data_frequency: str
    reset_reader: bool
        When True, the cached reader is discarded and re-created before
        reading.

    Returns
    -------
    dict[TradingPair, pd.Series]
        One series per asset, indexed by the calendar periods of the
        window.

    Raises
    ------
    PricingDataNotLoadedError
        If no reader is available, or an asset's range is not in the
        bundle.
    DataCorruptionError
        If the reader returns no arrays for an asset.
    PricingDataValueError
        If the values read cannot be aligned with the calendar periods.

    """
    start_dt = get_start_dt(end_dt, bar_count, data_frequency, False)
    start_dt, _ = self.get_adj_dates(
        start_dt, end_dt, assets, data_frequency
    )

    # This is an attempt to resolve some caching with the reader
    # when auto-ingesting data.
    # TODO: needs more work
    reader = self.get_reader(data_frequency)
    # Only an existing reader can be evicted. Without this guard a
    # missing reader raised AttributeError on `reader._rootdir` instead
    # of the PricingDataNotLoadedError that callers catch.
    if reset_reader and reader is not None:
        del self._readers[reader._rootdir]
        reader = self.get_reader(data_frequency)

    if reader is None:
        symbols = [asset.symbol for asset in assets]
        raise PricingDataNotLoadedError(
            field=field,
            first_trading_day=min([asset.start_date for asset in assets]),
            exchange=self.exchange_name,
            symbols=symbols,
            symbol_list=','.join(symbols),
            data_frequency=data_frequency,
            start_dt=start_dt,
            end_dt=end_dt
        )

    series = dict()
    for asset in assets:
        # NOTE(review): this passes the whole `assets` list rather than
        # just the current asset, and the read below uses `start_dt`
        # rather than `asset_start_dt` -- confirm this is intended.
        asset_start_dt, _ = self.get_adj_dates(
            start_dt, end_dt, assets, data_frequency
        )
        in_bundle = range_in_bundle(
            asset, asset_start_dt, end_dt, reader
        )
        if not in_bundle:
            raise PricingDataNotLoadedError(
                field=field,
                first_trading_day=asset.start_date,
                exchange=self.exchange_name,
                symbols=asset.symbol,
                symbol_list=asset.symbol,
                data_frequency=data_frequency,
                start_dt=asset_start_dt,
                end_dt=end_dt
            )

        periods = self.get_calendar_periods_range(
            asset_start_dt, end_dt, data_frequency
        )
        # This does not behave well when requesting multiple assets
        # when the start or end date of one asset is outside of the range
        # looking at the logic in load_raw_arrays(), we are not achieving
        # any performance gain by requesting multiple sids at once. It's
        # looping through the sids and making separate requests anyway.
        arrays = reader.load_raw_arrays(
            sids=[asset.sid],
            fields=[field],
            start_dt=start_dt,
            end_dt=end_dt
        )
        if len(arrays) == 0:
            raise DataCorruptionError(
                exchange=self.exchange_name,
                symbols=asset.symbol,
                start_dt=asset_start_dt,
                end_dt=end_dt
            )

        # One field and one sid were requested: take the single column
        # of the first returned array.
        field_values = arrays[0][:, 0]

        try:
            # Fails with ValueError when the number of values differs
            # from the number of calendar periods.
            series[asset] = pd.Series(field_values, index=periods)
        except ValueError as e:
            raise PricingDataValueError(
                exchange=asset.exchange,
                symbol=asset.symbol,
                start_dt=asset_start_dt,
                end_dt=end_dt,
                error=e
            )

    return series
def get_history_window_series_and_load(self, assets, end_dt, bar_count,
                                       field, data_frequency,
                                       algo_end_dt=None,
                                       force_auto_ingest=False):
    """
    Retrieve price data history, ingest missing data.

    Parameters
    ----------
    assets: list[TradingPair]
    end_dt: pd.Timestamp
    bar_count: int
    field: str
    data_frequency: str
    algo_end_dt: pd.Timestamp
        Used as the end date of the ingestion when data is missing.
    force_auto_ingest: bool
        Ingest missing data even when ``AUTO_INGEST`` is falsy.

    Returns
    -------
    DataFrame
        One column per asset. Every path returns a DataFrame; the
        post-ingestion path previously returned a raw dict.

    Raises
    ------
    PricingDataNotLoadedError
        If the data is missing and auto-ingestion is not enabled, or if
        it is still missing after ingestion.

    """
    try:
        series = self.get_history_window_series(
            assets=assets,
            end_dt=end_dt,
            bar_count=bar_count,
            field=field,
            data_frequency=data_frequency,
        )

    except PricingDataNotLoadedError:
        if not (AUTO_INGEST or force_auto_ingest):
            # Ingestion is disabled: surface the error to the caller.
            raise

        start_dt = get_start_dt(end_dt, bar_count, data_frequency)
        log.info(
            'pricing data for {symbol} not found in range '
            '{start} to {end}, updating the bundles.'.format(
                symbol=[asset.symbol for asset in assets],
                start=start_dt,
                end=end_dt
            )
        )
        self.ingest_assets(
            assets=assets,
            start_dt=start_dt,
            end_dt=algo_end_dt,  # TODO: apply trailing bars
            data_frequency=data_frequency,
            show_progress=True,
            show_breakdown=True
        )
        # Re-read with a fresh reader so the newly ingested data is
        # visible.
        series = self.get_history_window_series(
            assets=assets,
            end_dt=end_dt,
            bar_count=bar_count,
            field=field,
            data_frequency=data_frequency,
            reset_reader=True,
        )

    return pd.DataFrame(series)
def get_exchange_history_window(self, exchange_name, assets, end_dt,
                                bar_count, frequency, field,
                                data_frequency, ffill=True):
    """
    Build a resampled price history window from an exchange bundle.

    In minute mode the window ends one minute before ``end_dt``, and
    daily candles are built from minute bars.

    Parameters
    ----------
    exchange_name: str
        Key used to look up the bundle in ``self.exchange_bundles``.
    assets: list[TradingPair]
    end_dt: datetime
    bar_count: int
    frequency: str
    field: str
    data_frequency: str
    ffill: bool
        Accepted for interface compatibility; not read by this method.

    Returns
    -------
    DataFrame
        The output of ``resample_history_df`` for the loaded window.

    """
    # TODO: verify that the exchange supports the timeframe
    exchange_bundle = self.exchange_bundles[exchange_name]  # type: ExchangeBundle

    freq, candle_size, unit, adj_data_frequency = get_frequency(
        frequency, data_frequency, supported_freqs=['T', 'D'])
    total_bars = candle_size * bar_count

    # Daily mode reads up to end_dt itself.
    series_end_dt = end_dt
    if data_frequency == "minute":
        # for minute frequency always request data until the
        # current minute (do not include the current minute)
        series_end_dt = end_dt - datetime.timedelta(minutes=1)

        # read the minute bundles for daily frequency to
        # support last partial candle
        # TODO: optimize this by applying this logic only for the last day
        if adj_data_frequency == 'daily':
            adj_data_frequency = 'minute'
            # 1440 minute bars per daily bar.
            total_bars = total_bars * 1440

    window = exchange_bundle.get_history_window_series_and_load(
        assets=assets,
        end_dt=series_end_dt,
        bar_count=total_bars,
        field=field,
        data_frequency=adj_data_frequency,
        algo_end_dt=self._last_available_session,
    )

    window_start = get_start_dt(
        series_end_dt, total_bars, adj_data_frequency, False
    )
    return resample_history_df(
        pd.DataFrame(window), freq, field, window_start
    )
def get_history_window_with_bundle(self, assets, end_dt, bar_count,
                                   frequency, field, data_frequency=None,
                                   ffill=True, force_auto_ingest=False):
    """
    Public API method that returns a dataframe containing the requested
    history window. Data is read from the bundle first; bars that the
    bundle does not contain up to ``end_dt`` are fetched from the
    exchange and appended.

    Parameters
    ----------
    assets : list[TradingPair]
        The assets whose data is desired.

    end_dt: datetime
        The date of the last bar.

    bar_count: int
        The number of bars desired.

    frequency: string
        "1d" or "1m"

    field: string
        The desired field of the asset.

    data_frequency: string
        The frequency of the data to query; i.e. whether the data is
        'daily' or 'minute' bars.

    # TODO: fill how?
    ffill: boolean
        Forward-fill missing values. Not read by this method at present.

    force_auto_ingest: boolean
        Forwarded to the bundle's ``get_history_window_series_and_load``.

    Returns
    -------
    DataFrame
        A dataframe containing the requested data, with rows holding
        any NaN dropped.

    """
    # TODO: this function needs some work,
    # we're currently using it just for benchmark data
    freq, candle_size, unit, data_frequency = get_frequency(
        frequency, data_frequency
    )
    adj_bar_count = candle_size * bar_count

    try:
        bundle_data = self.bundle.get_history_window_series_and_load(
            assets=assets,
            end_dt=end_dt,
            bar_count=adj_bar_count,
            field=field,
            data_frequency=data_frequency,
            force_auto_ingest=force_auto_ingest
        )

    except (PricingDataNotLoadedError, NoDataAvailableOnExchange):
        bundle_data = dict()

    # The bundle may hand back a DataFrame or a dict. Work on a plain
    # dict of per-asset Series: assigning a longer Series into a
    # DataFrame column would re-align it to the existing index and
    # silently drop the trailing bars added below. Empty series are
    # treated as missing so that `.index[-1]` cannot fail.
    series = dict()
    for key in bundle_data:
        if len(bundle_data[key]) > 0:
            series[key] = bundle_data[key]

    for asset in assets:
        if asset not in series or series[asset].index[-1] < end_dt:
            # Adding bars too recent to be contained in the consolidated
            # exchanges bundles. We go directly against the exchange
            # to retrieve the candles.
            existing = series.get(asset)

            if existing is None:
                trailing_dt = get_start_dt(
                    end_dt, adj_bar_count, data_frequency
                )
                last_value = np.nan
            else:
                trailing_dt = \
                    existing.index[-1] + get_delta(1, data_frequency)
                # Most recent bundled value (previously `.iloc(0)`,
                # which returned an indexer object, not a value).
                last_value = existing.iloc[-1]

            # The get_history method supports multiple asset
            # Use the original frequency to let each api optimize
            # the size of result sets
            trailing_bars = get_periods(
                trailing_dt, end_dt, freq
            )
            candles = self.get_candles(
                freq=freq,
                assets=asset,
                end_dt=end_dt,
                bar_count=trailing_bars if trailing_bars < 500 else 500,
            )

            # Create a series with the common data_frequency, ffill
            # missing values
            candle_series = self.get_series_from_candles(
                candles=candles,
                start_dt=trailing_dt,
                end_dt=end_dt,
                data_frequency=data_frequency,
                field=field,
                previous_value=last_value
            )

            if existing is None:
                series[asset] = candle_series
            else:
                # The combined series must be stored back; the original
                # `append` call discarded its result.
                series[asset] = pd.concat([existing, candle_series])

    df = resample_history_df(pd.DataFrame(series), freq, field)
    # TODO: consider this more carefully
    df.dropna(inplace=True)

    return df