def get_history_window_series(self, assets, end_dt, bar_count, field,
                              data_frequency, trailing_bar_count=None,
                              reset_reader=False):
    """
    Read a window of `field` values for each asset from the exchange
    bundle.

    Parameters
    ----------
    assets: list[TradingPair]
        The assets whose data is desired.
    end_dt: pd.Timestamp
        The date of the last bar.
    bar_count: int
        The number of bars desired.
    field: str
        The desired field of the asset.
    data_frequency: str
        'daily' or 'minute'.
    trailing_bar_count: int, optional
        Extend `end_dt` forward by this many bars.
    reset_reader: bool
        Discard the cached bundle reader and re-create it.

    Returns
    -------
    dict[TradingPair, pd.Series]
        One value series per asset, indexed by calendar periods.

    Raises
    ------
    PricingDataNotLoadedError
        If no reader is available or an asset is not in the bundle.
    DataCorruptionError
        If the reader returns no arrays for an asset.
    PricingDataValueError
        If the values cannot be aligned with the calendar periods.
    """
    start_dt = get_start_dt(end_dt, bar_count, data_frequency, False)
    start_dt, _ = self.get_adj_dates(
        start_dt, end_dt, assets, data_frequency
    )

    if trailing_bar_count:
        delta = get_delta(trailing_bar_count, data_frequency)
        end_dt += delta

    # This is an attempt to resolve some caching with the reader
    # when auto-ingesting data.
    # TODO: needs more work
    reader = self.get_reader(data_frequency)
    if reset_reader:
        del self._readers[reader._rootdir]
        reader = self.get_reader(data_frequency)

    if reader is None:
        symbols = [asset.symbol for asset in assets]
        raise PricingDataNotLoadedError(
            field=field,
            first_trading_day=min([asset.start_date for asset in assets]),
            exchange=self.exchange_name,
            symbols=symbols,
            symbol_list=','.join(symbols),
            data_frequency=data_frequency,
            start_dt=start_dt,
            end_dt=end_dt
        )

    series = dict()
    for asset in assets:
        # Adjust the window start per asset: a pair listed after the
        # requested start should not invalidate the whole request.
        # FIX: the original passed the full `assets` list here, which
        # made this call loop-invariant and ignored the individual
        # asset's own start date.
        asset_start_dt, _ = self.get_adj_dates(
            start_dt, end_dt, [asset], data_frequency
        )

        in_bundle = range_in_bundle(asset, asset_start_dt, end_dt, reader)
        if not in_bundle:
            raise PricingDataNotLoadedError(
                field=field,
                first_trading_day=asset.start_date,
                exchange=self.exchange_name,
                symbols=asset.symbol,
                symbol_list=asset.symbol,
                data_frequency=data_frequency,
                start_dt=asset_start_dt,
                end_dt=end_dt
            )

        periods = self.get_calendar_periods_range(
            asset_start_dt, end_dt, data_frequency
        )

        # This does not behave well when requesting multiple assets
        # when the start or end date of one asset is outside of the range
        # looking at the logic in load_raw_arrays(), we are not achieving
        # any performance gain by requesting multiple sids at once. It's
        # looping through the sids and making separate requests anyway.
        # FIX: load from `asset_start_dt` (not the un-adjusted
        # `start_dt`) so the array length matches `periods` above.
        arrays = reader.load_raw_arrays(
            sids=[asset.sid],
            fields=[field],
            start_dt=asset_start_dt,
            end_dt=end_dt
        )
        if len(arrays) == 0:
            raise DataCorruptionError(
                exchange=self.exchange_name,
                symbols=asset.symbol,
                start_dt=asset_start_dt,
                end_dt=end_dt
            )

        field_values = arrays[0][:, 0]
        try:
            value_series = pd.Series(field_values, index=periods)
            series[asset] = value_series
        except ValueError as e:
            raise PricingDataValueError(
                exchange=asset.exchange,
                symbol=asset.symbol,
                start_dt=asset_start_dt,
                end_dt=end_dt,
                error=e
            )

    return series
def get_history_window(self, assets, end_dt, bar_count, frequency, field,
                       data_frequency=None, ffill=True):
    """
    Public API method that returns a dataframe containing the requested
    history window.  Data is fully adjusted.

    Parameters
    ----------
    assets : list of catalyst.data.Asset objects
        The assets whose data is desired.
    end_dt: not applicable to cryptocurrencies
    bar_count: int
        The number of bars desired.
    frequency: string
        "1d" or "1m"
    field: string
        The desired field of the asset.
    data_frequency: string
        The frequency of the data to query; i.e. whether the data is
        'daily' or 'minute' bars.
    # TODO: fill how?
    ffill: boolean
        Forward-fill missing values. Only has effect if field
        is 'price'.

    Returns
    -------
    A dataframe containing the requested data.
    """
    freq_match = re.match(r'([0-9].*)(m|M|d|D)', frequency, re.M | re.I)
    if freq_match:
        candle_size = int(freq_match.group(1))
        unit = freq_match.group(2)
    else:
        raise InvalidHistoryFrequencyError(frequency)

    # Reconcile the requested candle unit with the data frequency.
    if unit.lower() == 'd':
        if data_frequency == 'minute':
            data_frequency = 'daily'
    elif unit.lower() == 'm':
        if data_frequency == 'daily':
            data_frequency = 'minute'
    else:
        raise InvalidHistoryFrequencyError(frequency)

    # Request enough raw bars to aggregate into `bar_count` candles.
    adj_bar_count = candle_size * bar_count

    try:
        series = self.bundle.get_history_window_series_and_load(
            assets=assets,
            end_dt=end_dt,
            bar_count=adj_bar_count,
            field=field,
            data_frequency=data_frequency
        )
    except PricingDataNotLoadedError:
        series = dict()

    for asset in assets:
        if asset not in series or series[asset].index[-1] < end_dt:
            # Adding bars too recent to be contained in the consolidated
            # exchanges bundles. We go directly against the exchange
            # to retrieve the candles.
            start_dt = get_start_dt(end_dt, adj_bar_count, data_frequency)
            trailing_dt = \
                series[asset].index[-1] + get_delta(1, data_frequency) \
                if asset in series else start_dt
            trailing_bar_count = \
                get_periods(trailing_dt, end_dt, data_frequency)

            # The get_history method supports multiple asset
            candles = self.get_candles(
                data_frequency=data_frequency,
                assets=asset,
                bar_count=trailing_bar_count,
                end_dt=end_dt
            )

            # FIX: the original used `series[asset].iloc(0)`, which
            # returns the indexer object itself, not a value. The last
            # known value seeds the forward-fill of the new candles.
            last_value = series[asset].iloc[-1] if asset in series \
                else np.nan

            candle_series = self.get_series_from_candles(
                candles=candles,
                start_dt=trailing_dt,
                end_dt=end_dt,
                field=field,
                previous_value=last_value
            )

            if asset in series:
                # FIX: Series.append returns a new Series; the original
                # discarded the result, dropping the fresh candles.
                series[asset] = series[asset].append(candle_series)
            else:
                series[asset] = candle_series

    df = pd.DataFrame(series)

    if candle_size > 1:
        # Aggregate raw bars into candles of the requested size.
        if field == 'open':
            agg = 'first'
        elif field == 'high':
            agg = 'max'
        elif field == 'low':
            agg = 'min'
        elif field == 'close':
            agg = 'last'
        elif field == 'volume':
            agg = 'sum'
        else:
            raise ValueError('Invalid field.')
        df = df.resample('{}T'.format(candle_size)).agg(agg)

    return df
def get_history_window_with_bundle(self,
                                   assets,
                                   end_dt,
                                   bar_count,
                                   frequency,
                                   field,
                                   data_frequency=None,
                                   ffill=True,
                                   force_auto_ingest=False):
    """
    Public API method that returns a dataframe containing the requested
    history window.  Data is fully adjusted.

    Parameters
    ----------
    assets : list[TradingPair]
        The assets whose data is desired.
    end_dt: datetime
        The date of the last bar.
    bar_count: int
        The number of bars desired.
    frequency: string
        "1d" or "1m"
    field: string
        The desired field of the asset.
    data_frequency: string
        The frequency of the data to query; i.e. whether the data is
        'daily' or 'minute' bars.
    # TODO: fill how?
    ffill: boolean
        Forward-fill missing values. Only has effect if field
        is 'price'.

    Returns
    -------
    DataFrame
        A dataframe containing the requested data.
    """
    freq, candle_size, unit, data_frequency = get_frequency(
        frequency, data_frequency
    )
    # Request enough raw bars to aggregate into `bar_count` candles.
    adj_bar_count = candle_size * bar_count

    try:
        series = self.bundle.get_history_window_series_and_load(
            assets=assets,
            end_dt=end_dt,
            bar_count=adj_bar_count,
            field=field,
            data_frequency=data_frequency,
            force_auto_ingest=force_auto_ingest
        )
    except (PricingDataNotLoadedError, NoDataAvailableOnExchange):
        series = dict()

    for asset in assets:
        if asset not in series or series[asset].index[-1] < end_dt:
            # Adding bars too recent to be contained in the consolidated
            # exchanges bundles. We go directly against the exchange
            # to retrieve the candles.
            start_dt = get_start_dt(end_dt, adj_bar_count, data_frequency)
            trailing_dt = \
                series[asset].index[-1] + get_delta(1, data_frequency) \
                if asset in series else start_dt

            # The get_history method supports multiple asset
            # Use the original frequency to let each api optimize
            # the size of result sets
            trailing_bar_count = get_periods(trailing_dt, end_dt, freq)
            candles = self.get_candles(
                freq=freq,
                assets=asset,
                bar_count=trailing_bar_count,
                start_dt=start_dt,
                end_dt=end_dt
            )

            # FIX: the original used `series[asset].iloc(0)`, which
            # returns the indexer object itself, not a value. The last
            # known value seeds the forward-fill of the new candles.
            last_value = series[asset].iloc[-1] if asset in series \
                else np.nan

            # Create a series with the common data_frequency, ffill
            # missing values
            candle_series = self.get_series_from_candles(
                candles=candles,
                start_dt=trailing_dt,
                end_dt=end_dt,
                data_frequency=data_frequency,
                field=field,
                previous_value=last_value
            )

            if asset in series:
                # FIX: Series.append returns a new Series; the original
                # discarded the result, dropping the fresh candles.
                series[asset] = series[asset].append(candle_series)
            else:
                series[asset] = candle_series

    df = resample_history_df(pd.DataFrame(series), freq, field)
    # TODO: consider this more carefully
    df.dropna(inplace=True)

    return df
def get_history_window(self, assets, end_dt, bar_count, frequency, field,
                       data_frequency=None, is_current=False):
    """
    Public API method that returns a dataframe containing the requested
    history window.  Data is fully adjusted.

    Parameters
    ----------
    assets : list[TradingPair]
        The assets whose data is desired.
    end_dt: datetime
        The date of the last bar
    bar_count: int
        The number of bars desired.
    frequency: string
        "1d" or "1m"
    field: string
        The desired field of the asset.
    data_frequency: string
        The frequency of the data to query; i.e. whether the data is
        'daily' or 'minute' bars.
    is_current: bool
        Skip date filters when current data is requested
        (last few bars until now).

    Notes
    -----
    Catalysts requires an end data with bar count both CCXT wants
    a start data with bar count. Since we have to make calculations
    here, we ensure that the last candle match the end_dt parameter.

    Returns
    -------
    DataFrame
        A dataframe containing the requested data.
    """
    freq, candle_size, unit, data_frequency = get_frequency(
        frequency, data_frequency
    )
    adj_bar_count = candle_size * bar_count
    start_dt = get_start_dt(end_dt, adj_bar_count, data_frequency)

    # Drop the date filters entirely when current data is requested.
    fetch_start = None if is_current else start_dt
    fetch_end = None if is_current else end_dt

    # The get_history method supports multiple asset
    candles = self.get_candles(
        freq=freq,
        assets=assets,
        bar_count=bar_count,
        start_dt=fetch_start,
        end_dt=fetch_end,
    )

    series = dict()
    for asset, asset_candles in candles.items():
        # NOTE(review): `frequency` (e.g. "15m") is passed as
        # `data_frequency` here, unlike the sibling methods which pass
        # the resolved 'daily'/'minute' value — confirm this is intended.
        value_series = self.get_series_from_candles(
            candles=asset_candles,
            start_dt=start_dt,
            end_dt=end_dt,
            data_frequency=frequency,
            field=field,
        )

        if end_dt is not None:
            # The last candle must reach at least one candle before
            # end_dt; otherwise the exchange returned stale data.
            adj_end_dt = end_dt - get_delta(candle_size, data_frequency)
            last_traded = value_series.index[-1]
            if last_traded < adj_end_dt:
                raise LastCandleTooEarlyError(
                    last_traded=last_traded,
                    end_dt=adj_end_dt,
                    exchange=self.name,
                )

        series[asset] = value_series

    df = pd.DataFrame(series)
    df.dropna(inplace=True)

    return df
def ingest_df(self, ohlcv_df, data_frequency, asset, writer,
              empty_rows_behavior='strip'):
    """
    Ingest a DataFrame of OHLCV data for a given market.

    Parameters
    ----------
    ohlcv_df: DataFrame
        The OHLCV bars to ingest, indexed by timestamp.
    data_frequency: str
        'daily' or 'minute'.
    asset: TradingPair
        The market the bars belong to.
    writer:
        The bundle writer passed through to `self._write`.
    empty_rows_behavior: str
        How to treat rows containing NaN values:
        'ignore' writes them as-is, 'warn' logs the gap ranges,
        'raise' raises EmptyValuesInBundleError, anything else
        ('strip', the default) drops them.
    """
    # FIX: the original compared with `is not 'ignore'`, which tests
    # object identity rather than string equality.
    if empty_rows_behavior != 'ignore':
        # Rows where any OHLCV column is NaN.
        nan_rows = ohlcv_df[ohlcv_df.isnull().T.any().T].index

        if len(nan_rows) > 0:
            # Collapse consecutive NaN timestamps into range boundaries
            # so the log / error message stays readable.
            dates = []
            previous_date = None
            for row_date in nan_rows.values:
                row_date = pd.to_datetime(row_date)
                if previous_date is None:
                    dates.append(row_date)
                else:
                    seq_date = previous_date + get_delta(1, data_frequency)
                    if row_date > seq_date:
                        # Gap in the NaN run: close the previous range
                        # and open a new one.
                        dates.append(previous_date)
                        dates.append(row_date)
                previous_date = row_date
            dates.append(pd.to_datetime(nan_rows.values[-1]))

            name = '{} from {} to {}'.format(
                asset.symbol, ohlcv_df.index[0], ohlcv_df.index[-1]
            )
            if empty_rows_behavior == 'warn':
                log.warn(
                    '\n{name} with end minute {end_minute} has empty rows '
                    'in ranges: {dates}'.format(
                        name=name,
                        end_minute=asset.end_minute,
                        dates=dates
                    )
                )
            elif empty_rows_behavior == 'raise':
                raise EmptyValuesInBundleError(
                    name=name,
                    end_minute=asset.end_minute,
                    dates=dates
                )
            else:
                # 'strip' (default): silently drop the NaN rows.
                ohlcv_df.dropna(inplace=True)

    data = []
    if not ohlcv_df.empty:
        ohlcv_df.sort_index(inplace=True)
        data.append((asset.sid, ohlcv_df))

    self._write(data, writer, data_frequency)
def ingest_ctable(self, asset, data_frequency, period, start_dt, end_dt,
                  writer, empty_rows_behavior='strip', cleanup=False):
    """
    Merge a ctable bundle chunk into the main bundle for the exchange.

    :param asset: TradingPair
    :param data_frequency: str
    :param period: str
    :param start_dt: datetime
        The first bar of the chunk to merge.
    :param end_dt: datetime
        The last bar of the chunk to merge.
    :param writer:
    :param empty_rows_behavior: str
        Ensure that the bundle does not have any missing data:
        'ignore' writes NaN rows as-is, 'warn' logs the gap ranges,
        'raise' raises EmptyValuesInBundleError, anything else
        ('strip', the default) drops them.
    :param cleanup: bool
        Remove the temp bundle directory after ingestion.
    :return: path: str
        The path of the (possibly removed) temp bundle chunk.
    """
    path = get_bcolz_chunk(
        exchange_name=self.exchange.name,
        symbol=asset.symbol,
        data_frequency=data_frequency,
        period=period
    )

    reader = self.get_reader(data_frequency, path=path)
    if reader is None:
        raise TempBundleNotFoundError(path=path)

    arrays = reader.load_raw_arrays(
        sids=[asset.sid],
        fields=['open', 'high', 'low', 'close', 'volume'],
        start_dt=start_dt,
        end_dt=end_dt
    )
    if not arrays:
        # Nothing to merge; leave the temp chunk in place.
        return path

    periods = self.get_calendar_periods_range(
        start_dt, end_dt, data_frequency
    )
    df = get_df_from_arrays(arrays, periods)

    # FIX: the original compared with `is not 'ignore'`, which tests
    # object identity rather than string equality.
    if empty_rows_behavior != 'ignore':
        # Rows where any OHLCV column is NaN.
        nan_rows = df[df.isnull().T.any().T].index

        if len(nan_rows) > 0:
            # Collapse consecutive NaN timestamps into range boundaries
            # so the log / error message stays readable.
            dates = []
            previous_date = None
            for row_date in nan_rows.values:
                row_date = pd.to_datetime(row_date)
                if previous_date is None:
                    dates.append(row_date)
                else:
                    seq_date = previous_date + get_delta(1, data_frequency)
                    if row_date > seq_date:
                        # Gap in the NaN run: close the previous range
                        # and open a new one.
                        dates.append(previous_date)
                        dates.append(row_date)
                previous_date = row_date
            dates.append(pd.to_datetime(nan_rows.values[-1]))

            name = path.split('/')[-1]
            if empty_rows_behavior == 'warn':
                log.warn(
                    '\n{name} with end minute {end_minute} has empty rows '
                    'in ranges: {dates}'.format(
                        name=name,
                        end_minute=asset.end_minute,
                        dates=dates
                    )
                )
            elif empty_rows_behavior == 'raise':
                raise EmptyValuesInBundleError(
                    name=name,
                    end_minute=asset.end_minute,
                    dates=dates
                )
            else:
                # 'strip' (default): silently drop the NaN rows.
                df.dropna(inplace=True)

    data = []
    if not df.empty:
        df.sort_index(inplace=True)
        data.append((asset.sid, df))

    self._write(data, writer, data_frequency)

    if cleanup:
        log.debug('removing bundle folder following '
                  'ingestion: {}'.format(path))
        shutil.rmtree(path)

    return path