Ejemplo n.º 1
0
    def get_history_window_series(self,
                                  assets,
                                  end_dt,
                                  bar_count,
                                  field,
                                  data_frequency,
                                  trailing_bar_count=None,
                                  reset_reader=False):
        start_dt = get_start_dt(end_dt, bar_count, data_frequency, False)
        start_dt, _ = self.get_adj_dates(start_dt, end_dt, assets,
                                         data_frequency)

        if trailing_bar_count:
            delta = get_delta(trailing_bar_count, data_frequency)
            end_dt += delta

        # This is an attempt to resolve some caching with the reader
        # when auto-ingesting data.
        # TODO: needs more work
        reader = self.get_reader(data_frequency)
        if reset_reader:
            del self._readers[reader._rootdir]
            reader = self.get_reader(data_frequency)

        if reader is None:
            symbols = [asset.symbol for asset in assets]
            raise PricingDataNotLoadedError(
                field=field,
                first_trading_day=min([asset.start_date for asset in assets]),
                exchange=self.exchange_name,
                symbols=symbols,
                symbol_list=','.join(symbols),
                data_frequency=data_frequency,
                start_dt=start_dt,
                end_dt=end_dt)

        series = dict()
        for asset in assets:
            asset_start_dt, _ = self.get_adj_dates(start_dt, end_dt, assets,
                                                   data_frequency)
            in_bundle = range_in_bundle(asset, asset_start_dt, end_dt, reader)
            if not in_bundle:
                raise PricingDataNotLoadedError(
                    field=field,
                    first_trading_day=asset.start_date,
                    exchange=self.exchange_name,
                    symbols=asset.symbol,
                    symbol_list=asset.symbol,
                    data_frequency=data_frequency,
                    start_dt=asset_start_dt,
                    end_dt=end_dt)

            periods = self.get_calendar_periods_range(asset_start_dt, end_dt,
                                                      data_frequency)
            # This does not behave well when requesting multiple assets
            # when the start or end date of one asset is outside of the range
            # looking at the logic in load_raw_arrays(), we are not achieving
            # any performance gain by requesting multiple sids at once. It's
            # looping through the sids and making separate requests anyway.
            arrays = reader.load_raw_arrays(sids=[asset.sid],
                                            fields=[field],
                                            start_dt=start_dt,
                                            end_dt=end_dt)
            if len(arrays) == 0:
                raise DataCorruptionError(exchange=self.exchange_name,
                                          symbols=asset.symbol,
                                          start_dt=asset_start_dt,
                                          end_dt=end_dt)

            field_values = arrays[0][:, 0]

            try:
                value_series = pd.Series(field_values, index=periods)
                series[asset] = value_series
            except ValueError as e:
                raise PricingDataValueError(exchange=asset.exchange,
                                            symbol=asset.symbol,
                                            start_dt=asset_start_dt,
                                            end_dt=end_dt,
                                            error=e)

        return series
Ejemplo n.º 2
0
    def get_history_window(self,
                           assets,
                           end_dt,
                           bar_count,
                           frequency,
                           field,
                           data_frequency=None,
                           ffill=True):
        """
        Public API method that returns a dataframe containing the requested
        history window.  Data is fully adjusted.

        Parameters
        ----------
        assets : list of catalyst.data.Asset objects
            The assets whose data is desired.

        end_dt: not applicable to cryptocurrencies

        bar_count: int
            The number of bars desired.

        frequency: string
            "1d" or "1m"

        field: string
            The desired field of the asset.

        data_frequency: string
            The frequency of the data to query; i.e. whether the data is
            'daily' or 'minute' bars.

        # TODO: fill how?
        ffill: boolean
            Forward-fill missing values. Only has effect if field
            is 'price'.

        Returns
        -------
        A dataframe containing the requested data.
        """

        freq_match = re.match(r'([0-9].*)(m|M|d|D)', frequency, re.M | re.I)
        if freq_match:
            candle_size = int(freq_match.group(1))
            unit = freq_match.group(2)

        else:
            raise InvalidHistoryFrequencyError(frequency)

        if unit.lower() == 'd':
            if data_frequency == 'minute':
                data_frequency = 'daily'

        elif unit.lower() == 'm':
            if data_frequency == 'daily':
                data_frequency = 'minute'

        else:
            raise InvalidHistoryFrequencyError(frequency)

        adj_bar_count = candle_size * bar_count
        try:
            series = self.bundle.get_history_window_series_and_load(
                assets=assets,
                end_dt=end_dt,
                bar_count=adj_bar_count,
                field=field,
                data_frequency=data_frequency)
        except PricingDataNotLoadedError:
            series = dict()

        for asset in assets:
            if asset not in series or series[asset].index[-1] < end_dt:
                # Adding bars too recent to be contained in the consolidated
                # exchanges bundles. We go directly against the exchange
                # to retrieve the candles.
                start_dt = get_start_dt(end_dt, adj_bar_count, data_frequency)
                trailing_dt = \
                    series[asset].index[-1] + get_delta(1, data_frequency) \
                        if asset in series else start_dt

                trailing_bar_count = \
                    get_periods(trailing_dt, end_dt, data_frequency)

                # The get_history method supports multiple asset
                candles = self.get_candles(data_frequency=data_frequency,
                                           assets=asset,
                                           bar_count=trailing_bar_count,
                                           end_dt=end_dt)

                last_value = series[asset].iloc(0) if asset in series \
                    else np.nan

                candle_series = self.get_series_from_candles(
                    candles=candles,
                    start_dt=trailing_dt,
                    end_dt=end_dt,
                    field=field,
                    previous_value=last_value)

                if asset in series:
                    series[asset].append(candle_series)

                else:
                    series[asset] = candle_series

        df = pd.DataFrame(series)

        if candle_size > 1:
            if field == 'open':
                agg = 'first'
            elif field == 'high':
                agg = 'max'
            elif field == 'low':
                agg = 'min'
            elif field == 'close':
                agg = 'last'
            elif field == 'volume':
                agg = 'sum'
            else:
                raise ValueError('Invalid field.')

            df = df.resample('{}T'.format(candle_size)).agg(agg)

        return df
Ejemplo n.º 3
0
    def get_history_window_with_bundle(self,
                                       assets,
                                       end_dt,
                                       bar_count,
                                       frequency,
                                       field,
                                       data_frequency=None,
                                       ffill=True,
                                       force_auto_ingest=False):
        """
        Public API method that returns a dataframe containing the requested
        history window.  Data is fully adjusted.

        Parameters
        ----------
        assets : list[TradingPair]
            The assets whose data is desired.

        end_dt: datetime
            The date of the last bar.

        bar_count: int
            The number of bars desired.

        frequency: string
            "1d" or "1m"

        field: string
            The desired field of the asset.

        data_frequency: string
            The frequency of the data to query; i.e. whether the data is
            'daily' or 'minute' bars.

        # TODO: fill how?
        ffill: boolean
            Forward-fill missing values. Only has effect if field
            is 'price'.

        Returns
        -------
        DataFrame
            A dataframe containing the requested data.

        """
        freq, candle_size, unit, data_frequency = get_frequency(
            frequency, data_frequency)
        adj_bar_count = candle_size * bar_count

        try:
            series = self.bundle.get_history_window_series_and_load(
                assets=assets,
                end_dt=end_dt,
                bar_count=adj_bar_count,
                field=field,
                data_frequency=data_frequency,
                force_auto_ingest=force_auto_ingest)

        except (PricingDataNotLoadedError, NoDataAvailableOnExchange):
            series = dict()

        for asset in assets:
            if asset not in series or series[asset].index[-1] < end_dt:
                # Adding bars too recent to be contained in the consolidated
                # exchanges bundles. We go directly against the exchange
                # to retrieve the candles.
                start_dt = get_start_dt(end_dt, adj_bar_count, data_frequency)
                trailing_dt = \
                    series[asset].index[-1] + get_delta(1, data_frequency) \
                        if asset in series else start_dt

                # The get_history method supports multiple asset
                # Use the original frequency to let each api optimize
                # the size of result sets
                trailing_bar_count = get_periods(trailing_dt, end_dt, freq)
                candles = self.get_candles(freq=freq,
                                           assets=asset,
                                           bar_count=trailing_bar_count,
                                           start_dt=start_dt,
                                           end_dt=end_dt)

                last_value = series[asset].iloc(0) if asset in series \
                    else np.nan

                # Create a series with the common data_frequency, ffill
                # missing values
                candle_series = self.get_series_from_candles(
                    candles=candles,
                    start_dt=trailing_dt,
                    end_dt=end_dt,
                    data_frequency=data_frequency,
                    field=field,
                    previous_value=last_value)

                if asset in series:
                    series[asset].append(candle_series)

                else:
                    series[asset] = candle_series

        df = resample_history_df(pd.DataFrame(series), freq, field)
        # TODO: consider this more carefully
        df.dropna(inplace=True)

        return df
Ejemplo n.º 4
0
    def get_history_window(self,
                           assets,
                           end_dt,
                           bar_count,
                           frequency,
                           field,
                           data_frequency=None,
                           is_current=False):
        """
        Public API method that returns a dataframe containing the requested
        history window.  Data is fully adjusted.

        Parameters
        ----------
        assets : list[TradingPair]
            The assets whose data is desired.

        end_dt: datetime
            The date of the last bar

        bar_count: int
            The number of bars desired.

        frequency: string
            "1d" or "1m"

        field: string
            The desired field of the asset.

        data_frequency: string
            The frequency of the data to query; i.e. whether the data is
            'daily' or 'minute' bars.

        is_current: bool
            Skip date filters when current data is requested (last few bars
            until now).

        Notes
        -----
        Catalysts requires an end data with bar count both CCXT wants a
        start data with bar count. Since we have to make calculations here,
        we ensure that the last candle match the end_dt parameter.

        Returns
        -------
        DataFrame
            A dataframe containing the requested data.

        """
        freq, candle_size, unit, data_frequency = get_frequency(
            frequency, data_frequency)
        adj_bar_count = candle_size * bar_count

        start_dt = get_start_dt(end_dt, adj_bar_count, data_frequency)

        # The get_history method supports multiple asset
        candles = self.get_candles(
            freq=freq,
            assets=assets,
            bar_count=bar_count,
            start_dt=start_dt if not is_current else None,
            end_dt=end_dt if not is_current else None,
        )

        series = dict()
        for asset in candles:
            asset_series = self.get_series_from_candles(
                candles=candles[asset],
                start_dt=start_dt,
                end_dt=end_dt,
                data_frequency=frequency,
                field=field,
            )
            if end_dt is not None:
                delta = get_delta(candle_size, data_frequency)
                adj_end_dt = end_dt - delta
                last_traded = asset_series.index[-1]

                if last_traded < adj_end_dt:
                    raise LastCandleTooEarlyError(
                        last_traded=last_traded,
                        end_dt=adj_end_dt,
                        exchange=self.name,
                    )
            series[asset] = asset_series

        df = pd.DataFrame(series)
        df.dropna(inplace=True)

        return df
Ejemplo n.º 5
0
    def ingest_df(self,
                  ohlcv_df,
                  data_frequency,
                  asset,
                  writer,
                  empty_rows_behavior='strip'):
        """
        Ingest a DataFrame of OHLCV data for a given market.

        Parameters
        ----------
        ohlcv_df: DataFrame
        data_frequency: str
        asset: TradingPair
        writer:
        empty_rows_behavior: str

        """
        if empty_rows_behavior is not 'ignore':
            nan_rows = ohlcv_df[ohlcv_df.isnull().T.any().T].index

            if len(nan_rows) > 0:
                dates = []
                previous_date = None
                for row_date in nan_rows.values:
                    row_date = pd.to_datetime(row_date)

                    if previous_date is None:
                        dates.append(row_date)

                    else:
                        seq_date = previous_date + get_delta(1, data_frequency)

                        if row_date > seq_date:
                            dates.append(previous_date)
                            dates.append(row_date)

                    previous_date = row_date

                dates.append(pd.to_datetime(nan_rows.values[-1]))

                name = '{} from {} to {}'.format(asset.symbol,
                                                 ohlcv_df.index[0],
                                                 ohlcv_df.index[-1])
                if empty_rows_behavior == 'warn':
                    log.warn(
                        '\n{name} with end minute {end_minute} has empty rows '
                        'in ranges: {dates}'.format(
                            name=name,
                            end_minute=asset.end_minute,
                            dates=dates))

                elif empty_rows_behavior == 'raise':
                    raise EmptyValuesInBundleError(name=name,
                                                   end_minute=asset.end_minute,
                                                   dates=dates)
                else:
                    ohlcv_df.dropna(inplace=True)

        data = []
        if not ohlcv_df.empty:
            ohlcv_df.sort_index(inplace=True)
            data.append((asset.sid, ohlcv_df))

        self._write(data, writer, data_frequency)
Ejemplo n.º 6
0
    def ingest_ctable(self,
                      asset,
                      data_frequency,
                      period,
                      start_dt,
                      end_dt,
                      writer,
                      empty_rows_behavior='strip',
                      cleanup=False):
        """
        Merge a ctable bundle chunk into the main bundle for the exchange.

        :param asset: TradingPair
        :param data_frequency: str
        :param period: str
        :param writer:
        :param empty_rows_behavior: str
            Ensure that the bundle does not have any missing data.

        :param cleanup: bool
            Remove the temp bundle directory after ingestion.

        :return:
        """

        path = get_bcolz_chunk(exchange_name=self.exchange.name,
                               symbol=asset.symbol,
                               data_frequency=data_frequency,
                               period=period)

        reader = self.get_reader(data_frequency, path=path)
        if reader is None:
            raise TempBundleNotFoundError(path=path)

        arrays = reader.load_raw_arrays(
            sids=[asset.sid],
            fields=['open', 'high', 'low', 'close', 'volume'],
            start_dt=start_dt,
            end_dt=end_dt)

        if not arrays:
            return path

        periods = self.get_calendar_periods_range(start_dt, end_dt,
                                                  data_frequency)

        df = get_df_from_arrays(arrays, periods)

        if empty_rows_behavior is not 'ignore':
            nan_rows = df[df.isnull().T.any().T].index

            if len(nan_rows) > 0:
                dates = []
                previous_date = None
                for row_date in nan_rows.values:
                    row_date = pd.to_datetime(row_date)

                    if previous_date is None:
                        dates.append(row_date)

                    else:
                        seq_date = previous_date + get_delta(1, data_frequency)

                        if row_date > seq_date:
                            dates.append(previous_date)
                            dates.append(row_date)

                    previous_date = row_date

                dates.append(pd.to_datetime(nan_rows.values[-1]))

                name = path.split('/')[-1]
                if empty_rows_behavior == 'warn':
                    log.warn(
                        '\n{name} with end minute {end_minute} has empty rows '
                        'in ranges: {dates}'.format(
                            name=name,
                            end_minute=asset.end_minute,
                            dates=dates))

                elif empty_rows_behavior == 'raise':
                    raise EmptyValuesInBundleError(name=name,
                                                   end_minute=asset.end_minute,
                                                   dates=dates)
                else:
                    df.dropna(inplace=True)

        data = []
        if not df.empty:
            df.sort_index(inplace=True)
            data.append((asset.sid, df))
        self._write(data, writer, data_frequency)

        if cleanup:
            log.debug('removing bundle folder following '
                      'ingestion: {}'.format(path))
            shutil.rmtree(path)

        return path