Example #1
0
    def filter_existing_assets(self, assets, start_dt, end_dt, data_frequency):
        """
        Select the assets whose data is missing from the bundle.

        For each asset, check whether price data exists for the full
        [start_dt, end_dt] range of the chunk. Assets with complete
        coverage are filtered out; the rest must be ingested.

        Parameters
        ----------
        assets: list[TradingPair]
            The assets in scope.
        start_dt: pd.Timestamp
            The chunk start date.
        end_dt: pd.Timestamp
            The chunk end date.
        data_frequency: str
            The bar frequency ('minute' or 'daily').

        Returns
        -------
        list[TradingPair]
            The assets missing from the bundle.
        """
        reader = self.get_reader(data_frequency)
        # Keep only the assets whose range is not fully in the bundle.
        return [
            asset for asset in assets
            if not range_in_bundle(asset, start_dt, end_dt, reader)
        ]
Example #2
0
    def get_history_window_series(self,
                                  assets,
                                  end_dt,
                                  bar_count,
                                  field,
                                  data_frequency,
                                  reset_reader=False):
        """
        Load a window of `bar_count` bars ending at `end_dt` for each asset.

        Parameters
        ----------
        assets: list[TradingPair]
            The assets in scope.
        end_dt: pd.Timestamp
            The last bar of the window.
        bar_count: int
            The number of bars in the window.
        field: str
            The OHLCV field to load (e.g. 'close').
        data_frequency: str
            The bar frequency ('minute' or 'daily').
        reset_reader: bool
            If True, discard the cached reader and re-open it before use.

        Returns
        -------
        dict[TradingPair, pd.Series]
            One value series per asset, indexed by calendar periods.

        Raises
        ------
        PricingDataNotLoadedError
            If no reader is available, or an asset's adjusted range is not
            in the bundle.
        DataCorruptionError
            If the reader returns no arrays for an asset.
        PricingDataValueError
            If the loaded values cannot be aligned with the period index.
        """
        start_dt = get_start_dt(end_dt, bar_count, data_frequency, False)
        start_dt, _ = self.get_adj_dates(
            start_dt, end_dt, assets, data_frequency
        )

        # This is an attempt to resolve some caching with the reader
        # when auto-ingesting data.
        # TODO: needs more work
        reader = self.get_reader(data_frequency)
        if reset_reader:
            del self._readers[reader._rootdir]
            reader = self.get_reader(data_frequency)

        if reader is None:
            symbols = [asset.symbol for asset in assets]
            raise PricingDataNotLoadedError(
                field=field,
                first_trading_day=min(asset.start_date for asset in assets),
                exchange=self.exchange_name,
                symbols=symbols,
                symbol_list=','.join(symbols),
                data_frequency=data_frequency,
                start_dt=start_dt,
                end_dt=end_dt
            )

        series = dict()
        for asset in assets:
            # Fix: adjust the window per-asset. The original passed the
            # whole `assets` list here, so `asset_start_dt` was identical
            # on every iteration and never reflected this asset's bounds.
            asset_start_dt, _ = self.get_adj_dates(
                start_dt, end_dt, [asset], data_frequency
            )
            in_bundle = range_in_bundle(
                asset, asset_start_dt, end_dt, reader
            )
            if not in_bundle:
                raise PricingDataNotLoadedError(
                    field=field,
                    first_trading_day=asset.start_date,
                    exchange=self.exchange_name,
                    symbols=asset.symbol,
                    symbol_list=asset.symbol,
                    data_frequency=data_frequency,
                    start_dt=asset_start_dt,
                    end_dt=end_dt
                )

            periods = self.get_calendar_periods_range(
                asset_start_dt, end_dt, data_frequency
            )
            # This does not behave well when requesting multiple assets
            # when the start or end date of one asset is outside of the range
            # looking at the logic in load_raw_arrays(), we are not achieving
            # any performance gain by requesting multiple sids at once. It's
            # looping through the sids and making separate requests anyway.
            arrays = reader.load_raw_arrays(
                sids=[asset.sid],
                fields=[field],
                # Fix: read from the per-asset adjusted start so the number
                # of values matches the `periods` index built above; the
                # original used the global `start_dt`, which could produce
                # a length mismatch (surfacing as the ValueError below).
                start_dt=asset_start_dt,
                end_dt=end_dt
            )
            if len(arrays) == 0:
                raise DataCorruptionError(
                    exchange=self.exchange_name,
                    symbols=asset.symbol,
                    start_dt=asset_start_dt,
                    end_dt=end_dt
                )

            # First (and only) requested field of the only requested sid.
            field_values = arrays[0][:, 0]

            try:
                series[asset] = pd.Series(field_values, index=periods)
            except ValueError as e:
                raise PricingDataValueError(
                    exchange=asset.exchange,
                    symbol=asset.symbol,
                    start_dt=asset_start_dt,
                    end_dt=end_dt,
                    error=e
                )

        return series
Example #3
0
    def prepare_chunks(self, assets, data_frequency, start_dt, end_dt):
        """
        Split a price data request into chunks corresponding to individual
        bundles.

        Parameters
        ----------
        assets: list[TradingPair]
        data_frequency: str
            The bar frequency ('minute' or 'daily').
        start_dt: pd.Timestamp
        end_dt: pd.Timestamp

        Returns
        -------
        dict[TradingPair, list[dict[str, Object]]]
            For each asset, the list of chunks (asset/period dicts) whose
            data is not yet in the bundle and must be ingested.
        """
        # Minute data is bundled per month, daily data per year.
        get_start_end = get_month_start_end \
            if data_frequency == 'minute' else get_year_start_end

        # Get a reader for the main bundle to verify if data exists
        reader = self.get_reader(data_frequency)

        chunks = dict()
        for asset in assets:
            try:
                # Checking if the asset has price data in the specified
                # date range
                adj_start, adj_end = self.get_adj_dates(
                    start_dt, end_dt, [asset], data_frequency
                )

            except NoDataAvailableOnExchange as e:
                # If not, we continue to the next asset
                log.debug('skipping {}: {}'.format(asset.symbol, e))
                continue

            # One entry per bundle period: month starts ('MS') for minute
            # data, year starts ('AS') for daily data.
            dates = pd.date_range(
                start=get_period_label(adj_start, data_frequency),
                end=get_period_label(adj_end, data_frequency),
                freq='MS' if data_frequency == 'minute' else 'AS',
                tz=UTC
            )

            # Clamping the first and last dates of the range to avoid
            # going over the asset's trading bounds.
            # NOTE(review): this mutates the DatetimeIndex's underlying
            # array in place — confirm the installed pandas version still
            # permits writing to `.values`.
            dates.values[0] = adj_start
            dates.values[-1] = adj_end

            chunks[asset] = []
            for index, dt in enumerate(dates):
                # Only the first/last periods get trimmed to the asset's
                # actual bounds; interior periods cover the full bundle.
                period_start, period_end = get_start_end(
                    dt=dt,
                    first_day=dt if index == 0 else None,
                    last_day=dt if index == len(dates) - 1 else None
                )

                # Currencies don't always start trading at midnight.
                # Checking the last minute of the day instead.
                range_start = period_start.replace(hour=23, minute=59) \
                    if data_frequency == 'minute' else period_start

                # Checking if the data already exists in the bundle
                # for the date range of the chunk. If not, we create
                # a chunk for ingestion.
                has_data = range_in_bundle(
                    asset, range_start, period_end, reader
                )
                if not has_data:
                    period = get_period_label(dt, data_frequency)
                    chunk = dict(
                        asset=asset,
                        period=period,
                    )
                    chunks[asset].append(chunk)

            # Sorting the chunks by period in ascending (oldest-first)
            # order. NOTE(review): the original comment said "to ingest
            # most recent data first", but no reverse=True is passed —
            # confirm which order ingestion actually expects.
            chunks[asset].sort(
                key=lambda chunk: pd.to_datetime(chunk['period'])
            )

        return chunks