def test_optionally(self):
        error = TypeError("arg must be int")

        def preprocessor(func, argname, arg):
            if not isinstance(arg, int):
                raise error
            return arg

        @preprocess(a=optionally(preprocessor))
        def f(a):
            return a

        assert f(1) == 1
        assert f(None) is None

        with pytest.raises(TypeError, match=str(error)):
            f("a")
Example 2
    def test_optionally(self):
        error = TypeError('arg must be int')

        def preprocessor(func, argname, arg):
            if not isinstance(arg, int):
                raise error
            return arg

        @preprocess(a=optionally(preprocessor))
        def f(a):
            return a

        self.assertIs(f(1), 1)
        self.assertIsNone(f(None))

        with self.assertRaises(TypeError) as e:
            f('a')
        self.assertIs(e.exception, error)
Example 3
def _make_bundle_core():
    """Create a family of data bundle functions that read from the same
    bundle mapping.

    Returns
    -------
    bundles : mappingproxy
        The mapping of bundles to bundle payloads.
    register : callable
        The function which registers new bundles in the ``bundles`` mapping.
    unregister : callable
        The function which deregisters bundles from the ``bundles`` mapping.
    ingest : callable
        The function which downloads and writes data for a given data bundle.
    load : callable
        The function which loads the ingested bundles back into memory.
    clean : callable
        The function which cleans up data written with ``ingest``.
    """
    _bundles = {}  # the registered bundles
    # Expose _bundles through a proxy so that users cannot mutate this
    # accidentally. Users may go through `register` to update this which will
    # warn when trampling another bundle.
    bundles = mappingproxy(_bundles)

    @curry
    def register(name,
                 f,
                 calendar='NYSE',
                 start_session=None,
                 end_session=None,
                 minutes_per_day=390,
                 create_writers=True):
        """Register a data bundle ingest function.

        Parameters
        ----------
        name : str
            The name of the bundle.
        f : callable
            The ingest function. This function will be passed:

              environ : mapping
                  The environment this is being run with.
              asset_db_writer : AssetDBWriter
                  The asset db writer to write into.
              minute_bar_writer : BcolzMinuteBarWriter
                  The minute bar writer to write into.
              daily_bar_writer : BcolzDailyBarWriter
                  The daily bar writer to write into.
              adjustment_writer : SQLiteAdjustmentWriter
                  The adjustment db writer to write into.
              calendar : zipline.utils.calendars.TradingCalendar
                  The trading calendar to ingest for.
              start_session : pd.Timestamp
                  The first session of data to ingest.
              end_session : pd.Timestamp
                  The last session of data to ingest.
              cache : DataFrameCache
                  A mapping object to temporarily store dataframes.
                  This should be used to cache intermediates in case the load
                  fails. This will be automatically cleaned up after a
                  successful load.
              show_progress : bool
                  Show the progress for the current load where possible.
        calendar : zipline.utils.calendars.TradingCalendar or str, optional
            The trading calendar to align the data to, or the name of a trading
            calendar. This defaults to 'NYSE', in which case we use the NYSE
            calendar.
        start_session : pd.Timestamp, optional
            The first session for which we want data. If not provided,
            or if the date lies outside the range supported by the
            calendar, the first_session of the calendar is used.
        end_session : pd.Timestamp, optional
            The last session for which we want data. If not provided,
            or if the date lies outside the range supported by the
            calendar, the last_session of the calendar is used.
        minutes_per_day : int, optional
            The number of minutes in each normal trading day.
        create_writers : bool, optional
            Should the ingest machinery create the writers for the ingest
            function. This can be disabled as an optimization for cases where
            they are not needed, like the ``quantopian-quandl`` bundle.

        Notes
        -----
        This function may be used as a decorator, for example:

        .. code-block:: python

           @register('quandl')
           def quandl_ingest_function(...):
               ...

        See Also
        --------
        zipline.data.bundles.bundles
        """
        if name in bundles:
            warnings.warn(
                'Overwriting bundle with name %r' % name,
                stacklevel=3,
            )

        if isinstance(calendar, str):
            calendar = get_calendar(calendar)

        # If the start and end sessions are not provided or lie outside
        # the bounds of the calendar being used, set them to the first
        # and last sessions of the calendar.
        if start_session is None or start_session < calendar.first_session:
            start_session = calendar.first_session
        if end_session is None or end_session > calendar.last_session:
            end_session = calendar.last_session

        _bundles[name] = _BundlePayload(
            calendar,
            start_session,
            end_session,
            minutes_per_day,
            f,
            create_writers,
        )
        return f

    def unregister(name):
        """Unregister a bundle.

        Parameters
        ----------
        name : str
            The name of the bundle to unregister.

        Raises
        ------
        UnknownBundle
            Raised when no bundle has been registered with the given name.

        See Also
        --------
        zipline.data.bundles.bundles
        """
        try:
            del _bundles[name]
        except KeyError:
            raise UnknownBundle(name)

    def ingest(name, environ=os.environ, timestamp=None, show_progress=False):
        """Ingest data for a given bundle.

        Parameters
        ----------
        name : str
            The name of the bundle.
        environ : mapping, optional
            The environment variables. By default this is os.environ.
        timestamp : datetime, optional
            The timestamp to use for the load.
            By default this is the current time.
        show_progress : bool, optional
            Tell the ingest function to display the progress where possible.
        """
        try:
            bundle = bundles[name]
        except KeyError:
            raise UnknownBundle(name)

        if timestamp is None:
            timestamp = pd.Timestamp.utcnow()
        timestamp = timestamp.tz_convert('utc').tz_localize(None)
        timestr = to_bundle_ingest_dirname(timestamp)
        cachepath = cache_path(name, environ=environ)
        pth.ensure_directory(pth.data_path([name, timestr], environ=environ))
        pth.ensure_directory(cachepath)
        with dataframe_cache(cachepath, clean_on_failure=False) as cache, \
                ExitStack() as stack:
            # we use `clean_on_failure=False` so that we don't purge the
            # cache directory if the load fails in the middle
            if bundle.create_writers:
                wd = stack.enter_context(
                    working_dir(pth.data_path([], environ=environ)))
                daily_bars_path = wd.ensure_dir(*daily_equity_relative(
                    name,
                    timestr,
                    environ=environ,
                ))
                daily_bar_writer = BcolzDailyBarWriter(
                    daily_bars_path,
                    bundle.calendar,
                    bundle.start_session,
                    bundle.end_session,
                )
                # Do an empty write to ensure that the daily ctables exist
                # when we create the SQLiteAdjustmentWriter below. The
                # SQLiteAdjustmentWriter needs to open the daily ctables so
                # that it can compute the adjustment ratios for the dividends.

                daily_bar_writer.write(())
                minute_bar_writer = BcolzMinuteBarWriter(
                    wd.ensure_dir(*minute_equity_relative(
                        name, timestr, environ=environ)),
                    bundle.calendar,
                    bundle.start_session,
                    bundle.end_session,
                    minutes_per_day=bundle.minutes_per_day,
                )
                asset_db_writer = AssetDBWriter(
                    wd.getpath(*asset_db_relative(
                        name,
                        timestr,
                        environ=environ,
                    )))

                adjustment_db_writer = stack.enter_context(
                    SQLiteAdjustmentWriter(
                        wd.getpath(*adjustment_db_relative(
                            name, timestr, environ=environ)),
                        BcolzDailyBarReader(daily_bars_path),
                        bundle.calendar.all_sessions,
                        overwrite=True,
                    ))
            else:
                daily_bar_writer = None
                minute_bar_writer = None
                asset_db_writer = None
                adjustment_db_writer = None
            bundle.ingest(
                environ,
                asset_db_writer,
                minute_bar_writer,
                daily_bar_writer,
                adjustment_db_writer,
                bundle.calendar,
                bundle.start_session,
                bundle.end_session,
                cache,
                show_progress,
                pth.data_path([name, timestr], environ=environ),
            )

    def most_recent_data(bundle_name, timestamp, environ=None):
        """Get the path to the most recent data after ``date``for the
        given bundle.

        Parameters
        ----------
        bundle_name : str
            The name of the bundle to lookup.
        timestamp : datetime
            The timestamp to begin searching on or before.
        environ : dict, optional
            An environment dict to forward to zipline_root.
        """
        if bundle_name not in bundles:
            raise UnknownBundle(bundle_name)

        try:
            candidates = os.listdir(
                pth.data_path([bundle_name], environ=environ), )
            return pth.data_path(
                [
                    bundle_name,
                    max(
                        filter(complement(pth.hidden), candidates),
                        key=from_bundle_ingest_dirname,
                    )
                ],
                environ=environ,
            )
        except (ValueError, OSError) as e:
            if getattr(e, 'errno', errno.ENOENT) != errno.ENOENT:
                raise
            raise ValueError(
                'no data for bundle {bundle!r} on or before {timestamp}\n'
                'maybe you need to run: $ zipline ingest -b {bundle}'.format(
                    bundle=bundle_name,
                    timestamp=timestamp,
                ), )

    def load(name, environ=os.environ, timestamp=None):
        """Loads a previously ingested bundle.

        Parameters
        ----------
        name : str
            The name of the bundle.
        environ : mapping, optional
            The environment variables. Defaults to ``os.environ``.
        timestamp : datetime, optional
            The timestamp of the data to lookup.
            Defaults to the current time.

        Returns
        -------
        bundle_data : BundleData
            The raw data readers for this bundle.
        """
        if timestamp is None:
            timestamp = pd.Timestamp.utcnow()
        timestr = most_recent_data(name, timestamp, environ=environ)
        return BundleData(
            asset_finder=AssetFinder(
                asset_db_path(name, timestr, environ=environ), ),
            equity_minute_bar_reader=BcolzMinuteBarReader(
                minute_equity_path(name, timestr, environ=environ), ),
            equity_daily_bar_reader=BcolzDailyBarReader(
                daily_equity_path(name, timestr, environ=environ), ),
            adjustment_reader=SQLiteAdjustmentReader(
                adjustment_db_path(name, timestr, environ=environ), ),
        )

    @preprocess(
        before=optionally(ensure_timestamp),
        after=optionally(ensure_timestamp),
    )
    def clean(name,
              before=None,
              after=None,
              keep_last=None,
              environ=os.environ):
        """Clean up data that was created with ``ingest`` or
        ``$ python -m zipline ingest``

        Parameters
        ----------
        name : str
            The name of the bundle to remove data for.
        before : datetime, optional
            Remove data ingested before this date.
            This argument is mutually exclusive with: keep_last
        after : datetime, optional
            Remove data ingested after this date.
            This argument is mutually exclusive with: keep_last
        keep_last : int, optional
            Remove all but the last ``keep_last`` ingestions.
            This argument is mutually exclusive with:
              before
              after
        environ : mapping, optional
            The environment variables. Defaults to ``os.environ``.

        Returns
        -------
        cleaned : set[str]
            The names of the runs that were removed.

        Raises
        ------
        BadClean
            Raised when ``before`` and/or ``after`` are passed with
            ``keep_last``. This is a subclass of ``ValueError``.
        """
        try:
            all_runs = sorted(
                filter(
                    complement(pth.hidden),
                    os.listdir(pth.data_path([name], environ=environ)),
                ),
                key=from_bundle_ingest_dirname,
            )
        except OSError as e:
            if e.errno != errno.ENOENT:
                raise
            raise UnknownBundle(name)
        if ((before is not None or after is not None)
                and keep_last is not None):
            raise BadClean(before, after, keep_last)

        if keep_last is None:

            def should_clean(name):
                dt = from_bundle_ingest_dirname(name)
                return ((before is not None and dt < before)
                        or (after is not None and dt > after))

        elif keep_last >= 0:
            last_n_dts = set(take(keep_last, reversed(all_runs)))

            def should_clean(name):
                return name not in last_n_dts
        else:
            raise BadClean(before, after, keep_last)

        cleaned = set()
        for run in all_runs:
            if should_clean(run):
                path = pth.data_path([name, run], environ=environ)
                shutil.rmtree(path)
                cleaned.add(path)

        return cleaned

    return BundleCore(bundles, register, unregister, ingest, load, clean)
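
A hedged usage sketch of the functions returned here, as they are typically exposed at module level (the bundle name and the ``csv_to_bundle`` ingest function are hypothetical; the import assumes the standard ``zipline.data.bundles`` aliases):

from zipline.data.bundles import register, ingest, load, clean

@register('my-csv-bundle', calendar='NYSE')
def csv_to_bundle(environ, asset_db_writer, minute_bar_writer,
                  daily_bar_writer, adjustment_writer, calendar,
                  start_session, end_session, cache, show_progress,
                  output_dir):
    # Hypothetical body: write pricing and asset metadata with the writers.
    pass

ingest('my-csv-bundle', show_progress=True)  # write a new timestamped ingestion
bundle_data = load('my-csv-bundle')          # readers for the most recent ingestion
clean('my-csv-bundle', keep_last=1)          # keep only the newest ingestion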
Example 4
def _make_bundle_core():
    """Create a family of data bundle functions that read from the same
    bundle mapping.

    Returns
    -------
    bundles : mappingproxy
        The mapping of bundles to bundle payloads.
    register : callable
        The function which registers new bundles in the ``bundles`` mapping.
    unregister : callable
        The function which deregisters bundles from the ``bundles`` mapping.
    ingest : callable
        The function which downloads and writes data for a given data bundle.
    load : callable
        The function which loads the ingested bundles back into memory.
    clean : callable
        The function which cleans up data written with ``ingest``.
    """
    _bundles = {}  # the registered bundles
    # Expose _bundles through a proxy so that users cannot mutate this
    # accidentally. Users may go through `register` to update this which will
    # warn when trampling another bundle.
    bundles = mappingproxy(_bundles)

    @curry
    def register(name,
                 f,
                 calendar=trading_days,
                 opens=open_and_closes['market_open'],
                 closes=open_and_closes['market_close'],
                 minutes_per_day=390,
                 create_writers=True):
        """Register a data bundle ingest function.

        Parameters
        ----------
        name : str
            The name of the bundle.
        f : callable
            The ingest function. This function will be passed:

              environ : mapping
                  The environment this is being run with.
              asset_db_writer : AssetDBWriter
                  The asset db writer to write into.
              minute_bar_writer : BcolzMinuteBarWriter
                  The minute bar writer to write into.
              daily_bar_writer : BcolzDailyBarWriter
                  The daily bar writer to write into.
              adjustment_writer : SQLiteAdjustmentWriter
                  The adjustment db writer to write into.
              calendar : pd.DatetimeIndex
                  The trading calendar to ingest for.
              cache : DataFrameCache
                  A mapping object to temporarily store dataframes.
                  This should be used to cache intermediates in case the load
                  fails. This will be automatically cleaned up after a
                  successful load.
              show_progress : bool
                  Show the progress for the current load where possible.
        calendar : pd.DatetimeIndex, optional
            The exchange calendar to align the data to. This defaults to the
            NYSE calendar.
        opens : pd.DatetimeIndex, optional
            The minute when the market opens each day. This defaults to the
            NYSE calendar's open times.
        closes : pd.DatetimeIndex, optional
            The minute when the market closes each day. This defaults to the
            NYSE calendar's close times.
        minutes_per_day : int, optional
            The number of minutes in each normal trading day.
        create_writers : bool, optional
            Should the ingest machinery create the writers for the ingest
            function. This can be disabled as an optimization for cases where
            they are not needed, like the ``quantopian-quandl`` bundle.

        Notes
        -----
        This function may be used as a decorator, for example:

        .. code-block:: python

           @register('quandl')
           def quandl_ingest_function(...):
               ...

        See Also
        --------
        zipline.data.bundles.bundles
        """
        if name in bundles:
            warnings.warn(
                'Overwriting bundle with name %r' % name,
                stacklevel=3,
            )
        _bundles[name] = _BundlePayload(
            calendar,
            opens,
            closes,
            minutes_per_day,
            f,
            create_writers,
        )
        return f

    def unregister(name):
        """Unregister a bundle.

        Parameters
        ----------
        name : str
            The name of the bundle to unregister.

        Raises
        ------
        UnknownBundle
            Raised when no bundle has been registered with the given name.

        See Also
        --------
        zipline.data.bundles.bundles
        """
        try:
            del _bundles[name]
        except KeyError:
            raise UnknownBundle(name)

    def ingest(name,
               environ=os.environ,
               timestamp=None,
               show_progress=False):
        """Ingest data for a given bundle.

        Parameters
        ----------
        name : str
            The name of the bundle.
        environ : mapping, optional
            The environment variables. By default this is os.environ.
        timestamp : datetime, optional
            The timestamp to use for the load.
            By default this is the current time.
        show_progress : bool, optional
            Tell the ingest function to display the progress where possible.
        """
        try:
            bundle = bundles[name]
        except KeyError:
            raise UnknownBundle(name)

        if timestamp is None:
            timestamp = pd.Timestamp.utcnow()
        timestamp = timestamp.tz_convert('utc').tz_localize(None)
        timestr = to_bundle_ingest_dirname(timestamp)
        cachepath = cache_path(name, environ=environ)
        pth.ensure_directory(pth.data_path([name, timestr], environ=environ))
        pth.ensure_directory(cachepath)
        with dataframe_cache(cachepath, clean_on_failure=False) as cache, \
                ExitStack() as stack:
            # we use `clean_on_failure=False` so that we don't purge the
            # cache directory if the load fails in the middle
            if bundle.create_writers:
                wd = stack.enter_context(working_dir(
                    pth.data_path([], environ=environ))
                )
                daily_bars_path = wd.ensure_dir(
                    *daily_equity_relative(
                        name, timestr, environ=environ,
                    )
                )
                daily_bar_writer = BcolzDailyBarWriter(
                    daily_bars_path,
                    nyse_cal,
                    bundle.calendar[0],
                    bundle.calendar[-1]
                )
                # Do an empty write to ensure that the daily ctables exist
                # when we create the SQLiteAdjustmentWriter below. The
                # SQLiteAdjustmentWriter needs to open the daily ctables so
                # that it can compute the adjustment ratios for the dividends.

                daily_bar_writer.write(())
                minute_bar_writer = BcolzMinuteBarWriter(
                    bundle.calendar[0],
                    wd.ensure_dir(*minute_equity_relative(
                        name, timestr, environ=environ)
                    ),
                    bundle.opens,
                    bundle.closes,
                    minutes_per_day=bundle.minutes_per_day,
                )
                asset_db_writer = AssetDBWriter(
                    wd.getpath(*asset_db_relative(
                        name, timestr, environ=environ,
                    ))
                )

                adjustment_db_writer = stack.enter_context(
                    SQLiteAdjustmentWriter(
                        wd.getpath(*adjustment_db_relative(
                            name, timestr, environ=environ)),
                        BcolzDailyBarReader(daily_bars_path),
                        bundle.calendar,
                        overwrite=True,
                    )
                )
            else:
                daily_bar_writer = None
                minute_bar_writer = None
                asset_db_writer = None
                adjustment_db_writer = None
            bundle.ingest(
                environ,
                asset_db_writer,
                minute_bar_writer,
                daily_bar_writer,
                adjustment_db_writer,
                bundle.calendar,
                cache,
                show_progress,
                pth.data_path([name, timestr], environ=environ),
            )

    def most_recent_data(bundle_name, timestamp, environ=None):
        """Get the path to the most recent data after ``date``for the
        given bundle.

        Parameters
        ----------
        bundle_name : str
            The name of the bundle to lookup.
        timestamp : datetime
            The timestamp to begin searching on or before.
        environ : dict, optional
            An environment dict to forward to zipline_root.
        """
        if bundle_name not in bundles:
            raise UnknownBundle(bundle_name)

        try:
            candidates = os.listdir(
                pth.data_path([bundle_name], environ=environ),
            )
            return pth.data_path(
                [bundle_name,
                 max(
                     filter(complement(pth.hidden), candidates),
                     key=from_bundle_ingest_dirname,
                 )],
                environ=environ,
            )
        except (ValueError, OSError) as e:
            if getattr(e, 'errno', errno.ENOENT) != errno.ENOENT:
                raise
            raise ValueError(
                'no data for bundle {bundle!r} on or before {timestamp}\n'
                'maybe you need to run: $ zipline ingest -b {bundle}'.format(
                    bundle=bundle_name,
                    timestamp=timestamp,
                ),
            )

    def load(name, environ=os.environ, timestamp=None):
        """Loads a previously ingested bundle.

        Parameters
        ----------
        name : str
            The name of the bundle.
        environ : mapping, optional
            The environment variables. Defaults to ``os.environ``.
        timestamp : datetime, optional
            The timestamp of the data to lookup.
            Defaults to the current time.

        Returns
        -------
        bundle_data : BundleData
            The raw data readers for this bundle.
        """
        if timestamp is None:
            timestamp = pd.Timestamp.utcnow()
        timestr = most_recent_data(name, timestamp, environ=environ)
        return BundleData(
            asset_finder=AssetFinder(
                asset_db_path(name, timestr, environ=environ),
            ),
            equity_minute_bar_reader=BcolzMinuteBarReader(
                minute_equity_path(name, timestr,  environ=environ),
            ),
            equity_daily_bar_reader=BcolzDailyBarReader(
                daily_equity_path(name, timestr, environ=environ),
            ),
            adjustment_reader=SQLiteAdjustmentReader(
                adjustment_db_path(name, timestr, environ=environ),
            ),
        )

    @preprocess(
        before=optionally(ensure_timestamp),
        after=optionally(ensure_timestamp),
    )
    def clean(name,
              before=None,
              after=None,
              keep_last=None,
              environ=os.environ):
        """Clean up data that was created with ``ingest`` or
        ``$ python -m zipline ingest``

        Parameters
        ----------
        name : str
            The name of the bundle to remove data for.
        before : datetime, optional
            Remove data ingested before this date.
            This argument is mutually exclusive with: keep_last
        after : datetime, optional
            Remove data ingested after this date.
            This argument is mutually exclusive with: keep_last
        keep_last : int, optional
            Remove all but the last ``keep_last`` ingestions.
            This argument is mutually exclusive with:
              before
              after
        environ : mapping, optional
            The environment variables. Defaults to ``os.environ``.

        Returns
        -------
        cleaned : set[str]
            The names of the runs that were removed.

        Raises
        ------
        BadClean
            Raised when ``before`` and/or ``after`` are passed with
            ``keep_last``. This is a subclass of ``ValueError``.
        """
        try:
            all_runs = sorted(
                filter(
                    complement(pth.hidden),
                    os.listdir(pth.data_path([name], environ=environ)),
                ),
                key=from_bundle_ingest_dirname,
            )
        except OSError as e:
            if e.errno != errno.ENOENT:
                raise
            raise UnknownBundle(name)
        if ((before is not None or after is not None) and
                keep_last is not None):
            raise BadClean(before, after, keep_last)

        if keep_last is None:
            def should_clean(name):
                dt = from_bundle_ingest_dirname(name)
                return (
                    (before is not None and dt < before) or
                    (after is not None and dt > after)
                )

        else:
            last_n_dts = set(all_runs[-keep_last:])

            def should_clean(name):
                return name not in last_n_dts

        cleaned = set()
        for run in all_runs:
            if should_clean(run):
                path = pth.data_path([name, run], environ=environ)
                shutil.rmtree(path)
                cleaned.add(path)

        return cleaned

    return BundleCore(bundles, register, unregister, ingest, load, clean)
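
For illustration, a short sketch of ``clean``'s mutually exclusive modes (bundle name hypothetical; the ``ensure_timestamp`` preprocessors above coerce the date arguments):

import pandas as pd
from zipline.data.bundles import clean

# Keep only the two most recent ingestions of the bundle.
clean('my-csv-bundle', keep_last=2)

# Or remove ingestions made before one date or after another; passing
# either of these together with keep_last raises BadClean.
clean('my-csv-bundle',
      before=pd.Timestamp('2016-01-01'),
      after=pd.Timestamp('2017-01-01'))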
Example 5
class BlazeEventsLoader(PipelineLoader):
    """An abstract pipeline loader for the events datasets that loads
    data from a blaze expression.

    Parameters
    ----------
    expr : Expr
        The expression representing the data to load.
    resources : dict, optional
        Mapping from the atomic terms of ``expr`` to actual data resources.
    odo_kwargs : dict, optional
        Extra keyword arguments to pass to odo when executing the expression.
    data_query_time : time, optional
        The time to use for the data query cutoff.
    data_query_tz : tzinfo or str
        The timezone to use for the data query cutoff.
    dataset : DataSet
        The DataSet object for which this loader loads data.

    Notes
    -----
    The expression should have a tabular dshape of::

       Dim * {{
           {SID_FIELD_NAME}: int64,
           {TS_FIELD_NAME}: datetime,
       }}

    And other dataset-specific fields, where each row of the table is a
    record including the sid to identify the company, the timestamp when we
    learned about the announcement, and the date when the earnings will be
    announced.

    If the '{TS_FIELD_NAME}' field is not included it is assumed that we
    start the backtest with knowledge of all announcements.
    """

    @preprocess(data_query_tz=optionally(ensure_timezone))
    def __init__(self,
                 expr,
                 resources=None,
                 odo_kwargs=None,
                 data_query_time=None,
                 data_query_tz=None,
                 dataset=None):
        dshape = expr.dshape

        if not istabular(dshape):
            raise ValueError(
                'expression dshape must be tabular, got: %s' % dshape,
            )

        expected_fields = self._expected_fields
        self._expr = bind_expression_to_resources(
            expr[list(expected_fields)],
            resources,
        )
        self._odo_kwargs = odo_kwargs if odo_kwargs is not None else {}
        self._dataset = dataset
        check_data_query_args(data_query_time, data_query_tz)
        self._data_query_time = data_query_time
        self._data_query_tz = data_query_tz

    @abc.abstractproperty
    def concrete_loader(self):
        raise NotImplementedError('concrete_loader')

    def load_adjusted_array(self, columns, dates, assets, mask):
        data_query_time = self._data_query_time
        data_query_tz = self._data_query_tz
        lower_dt, upper_dt = normalize_data_query_bounds(
            dates[0],
            dates[-1],
            data_query_time,
            data_query_tz,
        )

        raw = ffill_query_in_range(
            self._expr,
            lower_dt,
            upper_dt,
            self._odo_kwargs,
        )
        sids = raw.loc[:, SID_FIELD_NAME]
        raw.drop(
            sids[~sids.isin(assets)].index,
            inplace=True
        )
        if data_query_time is not None:
            normalize_timestamp_to_query_time(
                raw,
                data_query_time,
                data_query_tz,
                inplace=True,
                ts_field=TS_FIELD_NAME,
            )
        gb = raw.groupby(SID_FIELD_NAME)
        return self.concrete_loader(
            dates,
            self.prepare_data(raw, gb),
            dataset=self._dataset,
        ).load_adjusted_array(columns, dates, assets, mask)

    def prepare_data(self, raw, gb):
        return {sid: raw.loc[group].drop(SID_FIELD_NAME, axis=1) for sid, group
                in gb.groups.items()}
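
A hedged sketch of how a concrete subclass plugs into this abstract loader: it declares the fields the bound expression must provide (consumed via ``self._expected_fields`` in ``__init__``) and the in-memory loader used by ``load_adjusted_array``. The ``EarningsCalendarLoader`` class and the ``announcement_date`` field are assumptions for illustration; the field-name constants are the ones used in the module above.

class BlazeEarningsCalendarLoader(BlazeEventsLoader):
    # Fields the bound blaze expression is expected to expose (illustrative).
    _expected_fields = frozenset({
        SID_FIELD_NAME,
        TS_FIELD_NAME,
        'announcement_date',
    })

    @property
    def concrete_loader(self):
        # Hypothetical non-blaze loader that consumes the {sid: frame}
        # mapping produced by prepare_data.
        return EarningsCalendarLoader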
Example 6
File: core.py Project: yu68/zipline
class BlazeLoader(dict):
    """A PipelineLoader for datasets constructed with ``from_blaze``.

    Parameters
    ----------
    dsmap : mapping, optional
        An initial mapping of datasets to ``ExprData`` objects.
        NOTE: Further mutations to this map will not be reflected by this
        object.
    data_query_time : time, optional
        The time to use for the data query cutoff.
    data_query_tz : tzinfo or str, optional
        The timezone to use for the data query cutoff.
    pool : Pool, optional
        The pool to use to run blaze queries concurrently. This object must
        support ``imap_unordered``, ``apply`` and ``apply_async`` methods.

    Attributes
    ----------
    pool : Pool
        The pool to use to run blaze queries concurrently. This object must
        support ``imap_unordered``, ``apply`` and ``apply_async`` methods.
        It is possible to change the pool after the loader has been
        constructed. This allows us to set a new pool for the ``global_loader``
        like: ``global_loader.pool = multiprocessing.Pool(4)``.

    See Also
    --------
    :class:`zipline.utils.pool.SequentialPool`
    :class:`multiprocessing.Pool`
    """
    @preprocess(data_query_tz=optionally(ensure_timezone))
    def __init__(self,
                 dsmap=None,
                 data_query_time=None,
                 data_query_tz=None,
                 pool=SequentialPool()):
        self.update(dsmap or {})
        check_data_query_args(data_query_time, data_query_tz)
        self._data_query_time = data_query_time
        self._data_query_tz = data_query_tz

        # explicitly public
        self.pool = pool

    @classmethod
    @memoize(cache=WeakKeyDictionary())
    def global_instance(cls):
        return cls()

    def __hash__(self):
        return id(self)

    def __call__(self, column):
        if column.dataset in self:
            return self
        raise KeyError(column)

    def __repr__(self):
        return '<%s: %s>' % (
            type(self).__name__,
            super(BlazeLoader, self).__repr__(),
        )

    def load_adjusted_array(self, columns, dates, assets, mask):
        return merge(
            self.pool.imap_unordered(
                partial(self._load_dataset, dates, assets, mask),
                itervalues(groupby(getdataset, columns)),
            ), )

    def _load_dataset(self, dates, assets, mask, columns):
        try:
            (dataset, ) = set(map(getdataset, columns))
        except ValueError:
            raise AssertionError('all columns must come from the same dataset')

        expr, deltas, checkpoints, odo_kwargs, apply_deltas_adjustments = self[
            dataset]
        have_sids = (dataset.ndim == 2)
        asset_idx = pd.Series(index=assets, data=np.arange(len(assets)))
        assets = list(map(int, assets))  # coerce from numpy.int64
        added_query_fields = {AD_FIELD_NAME, TS_FIELD_NAME
                              } | ({SID_FIELD_NAME} if have_sids else set())
        requested_columns = set(map(getname, columns))
        colnames = sorted(added_query_fields | requested_columns)

        data_query_time = self._data_query_time
        data_query_tz = self._data_query_tz
        lower_dt, upper_dt = normalize_data_query_bounds(
            dates[0],
            dates[-1],
            data_query_time,
            data_query_tz,
        )

        def collect_expr(e, lower):
            """Materialize the expression as a dataframe.

            Parameters
            ----------
            e : Expr
                The baseline or deltas expression.
            lower : datetime
                The lower time bound to query.

            Returns
            -------
            result : pd.DataFrame
                The resulting dataframe.

            Notes
            -----
            This can return more data than needed. The in memory reindex will
            handle this.
            """
            predicate = e[TS_FIELD_NAME] <= upper_dt
            if lower is not None:
                predicate &= e[TS_FIELD_NAME] >= lower

            return odo(e[predicate][colnames], pd.DataFrame, **odo_kwargs)

        lower, materialized_checkpoints = get_materialized_checkpoints(
            checkpoints, colnames, lower_dt, odo_kwargs)

        materialized_expr = self.pool.apply_async(collect_expr, (expr, lower))
        materialized_deltas = (self.pool.apply(collect_expr,
                                               (deltas, lower)) if deltas
                               is not None else pd.DataFrame(columns=colnames))

        if materialized_checkpoints is not None:
            materialized_expr = pd.concat(
                (
                    materialized_checkpoints,
                    materialized_expr.get(),
                ),
                ignore_index=True,
                copy=False,
            )

        # It's not guaranteed that assets returned by the engine will contain
        # all sids from the deltas table; filter out such mismatches here.
        if not materialized_deltas.empty and have_sids:
            materialized_deltas = materialized_deltas[
                materialized_deltas[SID_FIELD_NAME].isin(assets)]

        if data_query_time is not None:
            for m in (materialized_expr, materialized_deltas):
                m.loc[:, TS_FIELD_NAME] = m.loc[:, TS_FIELD_NAME].astype(
                    'datetime64[ns]')

                normalize_timestamp_to_query_time(
                    m,
                    data_query_time,
                    data_query_tz,
                    inplace=True,
                    ts_field=TS_FIELD_NAME,
                )

        # Inline the deltas that changed our most recently known value.
        # Also, we reindex by the dates to create a dense representation of
        # the data.
        sparse_output, non_novel_deltas = overwrite_novel_deltas(
            materialized_expr,
            materialized_deltas,
            dates,
        )
        if AD_FIELD_NAME not in requested_columns:
            sparse_output.drop(AD_FIELD_NAME, axis=1, inplace=True)

        sparse_deltas = last_in_date_group(non_novel_deltas,
                                           dates,
                                           assets,
                                           reindex=False,
                                           have_sids=have_sids)
        dense_output = last_in_date_group(sparse_output,
                                          dates,
                                          assets,
                                          reindex=True,
                                          have_sids=have_sids)
        ffill_across_cols(dense_output, columns,
                          {c.name: c.name
                           for c in columns})

        # By default, no non-novel deltas are applied.
        def no_adjustments_from_deltas(*args):
            return {}

        adjustments_from_deltas = no_adjustments_from_deltas
        if have_sids:
            if apply_deltas_adjustments:
                adjustments_from_deltas = adjustments_from_deltas_with_sids
            column_view = identity
        else:
            # If we do not have sids, use the column view to make a single
            # column vector which is unassociated with any assets.
            column_view = op.itemgetter(np.s_[:, np.newaxis])
            if apply_deltas_adjustments:
                adjustments_from_deltas = adjustments_from_deltas_no_sids
            mask = np.full(
                shape=(len(mask), 1),
                fill_value=True,
                dtype=bool_dtype,
            )

        return {
            column: AdjustedArray(
                column_view(
                    dense_output[column.name].values.astype(column.dtype), ),
                mask,
                adjustments_from_deltas(
                    dates,
                    sparse_output[TS_FIELD_NAME].values,
                    column_idx,
                    column.name,
                    asset_idx,
                    sparse_deltas,
                ),
                column.missing_value,
            )
            for column_idx, column in enumerate(columns)
        }
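
A short usage sketch of the ``pool`` attribute described in the class docstring (the datasets that would be registered via ``from_blaze`` are omitted):

import multiprocessing

loader = BlazeLoader()                 # uses the SequentialPool by default
loader.pool = multiprocessing.Pool(4)  # run per-dataset blaze queries concurrently

# The memoized singleton referenced by the docstring's ``global_loader`` note:
shared_loader = BlazeLoader.global_instance()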
Example 7
class BlazeLoader(dict):
    """A PipelineLoader for datasets constructed with ``from_blaze``.

    Parameters
    ----------
    dsmap : mapping, optional
        An initial mapping of datasets to ``ExprData`` objects.
        NOTE: Further mutations to this map will not be reflected by this
        object.
    data_query_time : time, optional
        The time to use for the data query cutoff.
    data_query_tz : tzinfo or str
        The timezone to use for the data query cutoff.
    """
    @preprocess(data_query_tz=optionally(ensure_timezone))
    def __init__(self, dsmap=None, data_query_time=None, data_query_tz=None):
        self.update(dsmap or {})
        check_data_query_args(data_query_time, data_query_tz)
        self._data_query_time = data_query_time
        self._data_query_tz = data_query_tz

    @classmethod
    @memoize(cache=WeakKeyDictionary())
    def global_instance(cls):
        return cls()

    def __hash__(self):
        return id(self)

    def __call__(self, column):
        if column.dataset in self:
            return self
        raise KeyError(column)

    def __repr__(self):
        return '<%s: %s>' % (
            type(self).__name__,
            super(BlazeLoader, self).__repr__(),
        )

    def load_adjusted_array(self, columns, dates, assets, mask):
        return dict(
            concat(
                map(partial(self._load_dataset, dates, assets, mask),
                    itervalues(groupby(getdataset, columns)))))

    def _load_dataset(self, dates, assets, mask, columns):
        try:
            (dataset, ) = set(map(getdataset, columns))
        except ValueError:
            raise AssertionError('all columns must come from the same dataset')

        expr, deltas, checkpoints, odo_kwargs = self[dataset]
        have_sids = SID_FIELD_NAME in expr.fields
        asset_idx = pd.Series(index=assets, data=np.arange(len(assets)))
        assets = list(map(int, assets))  # coerce from numpy.int64
        added_query_fields = [AD_FIELD_NAME, TS_FIELD_NAME
                              ] + ([SID_FIELD_NAME] if have_sids else [])
        colnames = added_query_fields + list(map(getname, columns))

        data_query_time = self._data_query_time
        data_query_tz = self._data_query_tz
        lower_dt, upper_dt = normalize_data_query_bounds(
            dates[0],
            dates[-1],
            data_query_time,
            data_query_tz,
        )

        def collect_expr(e, lower):
            """Materialize the expression as a dataframe.

            Parameters
            ----------
            e : Expr
                The baseline or deltas expression.
            lower : datetime
                The lower time bound to query.

            Returns
            -------
            result : pd.DataFrame
                The resulting dataframe.

            Notes
            -----
            This can return more data than needed. The in memory reindex will
            handle this.
            """
            predicate = e[TS_FIELD_NAME] <= upper_dt
            if lower is not None:
                predicate &= e[TS_FIELD_NAME] >= lower

            return odo(e[predicate][colnames], pd.DataFrame, **odo_kwargs)

        if checkpoints is not None:
            ts = checkpoints[TS_FIELD_NAME]
            checkpoints_ts = odo(ts[ts <= lower_dt].max(), pd.Timestamp)
            if pd.isnull(checkpoints_ts):
                materialized_checkpoints = pd.DataFrame(columns=colnames)
                lower = None
            else:
                materialized_checkpoints = odo(
                    checkpoints[ts == checkpoints_ts][colnames], pd.DataFrame,
                    **odo_kwargs)
                lower = checkpoints_ts
        else:
            materialized_checkpoints = pd.DataFrame(columns=colnames)
            lower = None

        materialized_expr = collect_expr(expr, lower)
        if materialized_checkpoints is not None:
            materialized_expr = pd.concat(
                (
                    materialized_checkpoints,
                    materialized_expr,
                ),
                ignore_index=True,
                copy=False,
            )
        materialized_deltas = (collect_expr(deltas, lower) if deltas
                               is not None else pd.DataFrame(columns=colnames))

        # It's not guaranteed that assets returned by the engine will contain
        # all sids from the deltas table; filter out such mismatches here.
        if not materialized_deltas.empty and have_sids:
            materialized_deltas = materialized_deltas[
                materialized_deltas[SID_FIELD_NAME].isin(assets)]

        if data_query_time is not None:
            for m in (materialized_expr, materialized_deltas):
                m.loc[:, TS_FIELD_NAME] = m.loc[:, TS_FIELD_NAME].astype(
                    'datetime64[ns]')

                normalize_timestamp_to_query_time(
                    m,
                    data_query_time,
                    data_query_tz,
                    inplace=True,
                    ts_field=TS_FIELD_NAME,
                )

        # Inline the deltas that changed our most recently known value.
        # Also, we reindex by the dates to create a dense representation of
        # the data.
        sparse_output, non_novel_deltas = overwrite_novel_deltas(
            materialized_expr,
            materialized_deltas,
            dates,
        )
        sparse_output.drop(AD_FIELD_NAME, axis=1, inplace=True)

        def last_in_date_group(df, reindex, have_sids=have_sids):
            idx = dates[dates.searchsorted(
                df[TS_FIELD_NAME].values.astype('datetime64[D]'))]
            if have_sids:
                idx = [idx, SID_FIELD_NAME]

            last_in_group = df.drop(TS_FIELD_NAME, axis=1).groupby(
                idx,
                sort=False,
            ).last()

            if have_sids:
                last_in_group = last_in_group.unstack()

            if reindex:
                if have_sids:
                    cols = last_in_group.columns
                    last_in_group = last_in_group.reindex(
                        index=dates,
                        columns=pd.MultiIndex.from_product(
                            (cols.levels[0], assets),
                            names=cols.names,
                        ),
                    )
                else:
                    last_in_group = last_in_group.reindex(dates)

            return last_in_group

        sparse_deltas = last_in_date_group(non_novel_deltas, reindex=False)
        dense_output = last_in_date_group(sparse_output, reindex=True)
        dense_output.ffill(inplace=True)

        # Fill in missing values specified by each column. This is made
        # significantly more complex by the fact that we need to work around
        # two pandas issues:

        # 1) When we have sids, if there are no records for a given sid for any
        #    dates, pandas will generate a column full of NaNs for that sid.
        #    This means that some of the columns in `dense_output` are now
        #    float instead of the intended dtype, so we have to coerce back to
        #    our expected type and convert NaNs into the desired missing value.

        # 2) DataFrame.ffill assumes that receiving None as a fill-value means
        #    that no value was passed.  Consequently, there's no way to tell
        #    pandas to replace NaNs in an object column with None using fillna,
        #    so we have to roll our own instead using df.where.
        for column in columns:
            # Special logic for strings since `fillna` doesn't work if the
            # missing value is `None`.
            if column.dtype == categorical_dtype:
                dense_output[column.name] = dense_output[column.name].where(
                    pd.notnull(dense_output[column.name]),
                    column.missing_value)
            else:
                # We need to execute `fillna` before `astype` in case the
                # column contains NaNs and needs to be cast to bool or int.
                # This is so that the NaNs are replaced first, since pandas
                # can't convert NaNs for those types.
                dense_output[column.name] = dense_output[column.name].fillna(
                    column.missing_value).astype(column.dtype)

        if have_sids:
            adjustments_from_deltas = adjustments_from_deltas_with_sids
            column_view = identity
        else:
            # We use the column view to make an array per asset.
            column_view = compose(
                # We need to copy this because we need a concrete ndarray.
                # The `repeat_last_axis` call will give us a fancy strided
                # array which uses a buffer to represent `len(assets)` columns.
                # The engine puts nans at the indices for which we do not have
                # sid information so that the nan-aware reductions still work.
                # A future change to the engine would be to add first-class
                # support for macroeconomic datasets.
                copy,
                partial(repeat_last_axis, count=len(assets)),
            )
            adjustments_from_deltas = adjustments_from_deltas_no_sids

        for column_idx, column in enumerate(columns):
            column_name = column.name
            yield column, AdjustedArray(
                column_view(
                    dense_output[column_name].values.astype(column.dtype), ),
                mask,
                adjustments_from_deltas(
                    dates,
                    sparse_output[TS_FIELD_NAME].values,
                    column_idx,
                    column_name,
                    asset_idx,
                    sparse_deltas,
                ),
                column.missing_value,
            )
Example 8
class BlazeLoader(object):
    """A PipelineLoader for datasets constructed with ``from_blaze``.

    Parameters
    ----------
    dsmap : mapping, optional
        An initial mapping of datasets to ``ExprData`` objects.
        NOTE: Further mutations to this map will not be reflected by this
        object.
    data_query_time : time, optional
        The time to use for the data query cutoff.
    data_query_tz : tzinfo or str, optional
        The timezone to use for the data query cutoff.
    pool : Pool, optional
        The pool to use to run blaze queries concurrently. This object must
        support ``imap_unordered``, ``apply`` and ``apply_async`` methods.

    Attributes
    ----------
    pool : Pool
        The pool to use to run blaze queries concurrently. This object must
        support ``imap_unordered``, ``apply`` and ``apply_async`` methods.
        It is possible to change the pool after the loader has been
        constructed. This allows us to set a new pool for the ``global_loader``
        like: ``global_loader.pool = multiprocessing.Pool(4)``.

    See Also
    --------
    :class:`zipline.utils.pool.SequentialPool`
    :class:`multiprocessing.Pool`
    """
    @preprocess(data_query_tz=optionally(ensure_timezone))
    def __init__(self,
                 dsmap=None,
                 data_query_time=None,
                 data_query_tz=None,
                 pool=SequentialPool()):
        check_data_query_args(data_query_time, data_query_tz)
        self._data_query_time = data_query_time
        self._data_query_tz = data_query_tz

        # explicitly public
        self.pool = pool

        self._table_expressions = (dsmap or {}).copy()

    @classmethod
    @memoize(cache=WeakKeyDictionary())
    def global_instance(cls):
        return cls()

    def __hash__(self):
        return id(self)

    def __contains__(self, column):
        return column in self._table_expressions

    def __getitem__(self, column):
        return self._table_expressions[column]

    def __iter__(self):
        return iter(self._table_expressions)

    def __len__(self):
        return len(self._table_expressions)

    def __call__(self, column):
        if column in self:
            return self
        raise KeyError(column)

    def register_dataset(self,
                         dataset,
                         expr,
                         deltas=None,
                         checkpoints=None,
                         odo_kwargs=None):
        """Explicitly map a datset to a collection of blaze expressions.

        Parameters
        ----------
        dataset : DataSet
            The pipeline dataset to map to the given expressions.
        expr : Expr
            The baseline values.
        deltas : Expr, optional
            The deltas for the data.
        checkpoints : Expr, optional
            The forward fill checkpoints for the data.
        odo_kwargs : dict, optional
            The keyword arguments to forward to the odo calls internally.

        See Also
        --------
        :func:`zipline.pipeline.loaders.blaze.from_blaze`
        """
        expr_data = ExprData(
            expr,
            deltas,
            checkpoints,
            odo_kwargs,
        )
        for column in dataset.columns:
            self._table_expressions[column] = expr_data

    def register_column(self,
                        column,
                        expr,
                        deltas=None,
                        checkpoints=None,
                        odo_kwargs=None):
        """Explicitly map a single bound column to a collection of blaze
        expressions. The expressions need to have ``timestamp`` and ``as_of``
        columns.

        Parameters
        ----------
        column : BoundColumn
            The pipeline dataset to map to the given expressions.
        expr : Expr
            The baseline values.
        deltas : Expr, optional
            The deltas for the data.
        checkpoints : Expr, optional
            The forward fill checkpoints for the data.
        odo_kwargs : dict, optional
            The keyword arguments to forward to the odo calls internally.

        See Also
        --------
        :func:`zipline.pipeline.loaders.blaze.from_blaze`
        """
        self._table_expressions[column] = ExprData(
            expr,
            deltas,
            checkpoints,
            odo_kwargs,
        )

    def load_adjusted_array(self, columns, dates, assets, mask):
        return merge(
            self.pool.imap_unordered(
                partial(self._load_dataset, dates, assets, mask),
                itervalues(groupby(getitem(self._table_expressions), columns)),
            ), )

    def _load_dataset(self, dates, assets, mask, columns):
        try:
            (expr_data, ) = {self._table_expressions[c] for c in columns}
        except ValueError:
            raise AssertionError(
                'all columns must share the same expression data', )

        expr, deltas, checkpoints, odo_kwargs = expr_data

        have_sids = (first(columns).dataset.ndim == 2)
        added_query_fields = {AD_FIELD_NAME, TS_FIELD_NAME
                              } | ({SID_FIELD_NAME} if have_sids else set())
        requested_columns = set(map(getname, columns))
        colnames = sorted(added_query_fields | requested_columns)

        data_query_time = self._data_query_time
        data_query_tz = self._data_query_tz
        lower_dt, upper_dt = normalize_data_query_bounds(
            dates[0],
            dates[-1],
            data_query_time,
            data_query_tz,
        )

        def collect_expr(e, lower):
            """Materialize the expression as a dataframe.

            Parameters
            ----------
            e : Expr
                The baseline or deltas expression.
            lower : datetime
                The lower time bound to query.

            Returns
            -------
            result : pd.DataFrame
                The resulting dataframe.

            Notes
            -----
            This can return more data than needed. The in memory reindex will
            handle this.
            """
            predicate = e[TS_FIELD_NAME] < upper_dt
            if lower is not None:
                predicate &= e[TS_FIELD_NAME] >= lower

            return odo(e[predicate][colnames], pd.DataFrame, **odo_kwargs)

        lower, materialized_checkpoints = get_materialized_checkpoints(
            checkpoints, colnames, lower_dt, odo_kwargs)

        materialized_expr_deferred = self.pool.apply_async(
            collect_expr,
            (expr, lower),
        )
        materialized_deltas = (self.pool.apply(collect_expr, (deltas, lower))
                               if deltas is not None else None)

        all_rows = pd.concat(
            filter(
                lambda df: df is not None,
                (
                    materialized_checkpoints,
                    materialized_expr_deferred.get(),
                    materialized_deltas,
                ),
            ),
            ignore_index=True,
            copy=False,
        )

        all_rows[TS_FIELD_NAME] = all_rows[TS_FIELD_NAME].astype(
            'datetime64[ns]',
        )
        all_rows.sort_values([TS_FIELD_NAME, AD_FIELD_NAME], inplace=True)

        if have_sids:
            return adjusted_arrays_from_rows_with_assets(
                dates,
                data_query_time,
                data_query_tz,
                assets,
                mask,
                columns,
                all_rows,
            )
        else:
            return adjusted_arrays_from_rows_without_assets(
                dates,
                data_query_time,
                data_query_tz,
                None,
                columns,
                all_rows,
            )
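
# A minimal sketch of the kind of baseline frame ``collect_expr`` materializes
# above (hedged: the 'asof_date' / 'timestamp' / 'sid' names mirror the
# AD/TS/SID field constants used by this loader; 'value' is a hypothetical
# requested column).
import pandas as pd

baseline = pd.DataFrame({
    'sid': [1, 1, 2],
    'asof_date': pd.to_datetime(['2014-01-02', '2014-01-03', '2014-01-02']),
    'timestamp': pd.to_datetime(['2014-01-02', '2014-01-03', '2014-01-02']),
    'value': [1.0, 2.0, 3.0],
})

# ``_load_dataset`` concatenates frames like this with any checkpoint and
# delta frames, sorts them by ['timestamp', 'asof_date'], and hands the
# result to the adjusted_arrays_from_rows_* helpers.
baseline = baseline.sort_values(['timestamp', 'asof_date']).reset_index(drop=True)
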
Example 10
class BlazeLoader(dict):
    """A PipelineLoader for datasets constructed with ``from_blaze``.

    Parameters
    ----------
    dsmap : mapping, optional
        An initial mapping of datasets to ``ExprData`` objects.
        NOTE: Further mutations to this map will not be reflected by this
        object.
    data_query_time : time, optional
        The time to use for the data query cutoff.
    data_query_tz : tzinfo or str
        The timezone to use for the data query cutoff.
    """
    @preprocess(data_query_tz=optionally(ensure_timezone))
    def __init__(self,
                 dsmap=None,
                 data_query_time=None,
                 data_query_tz=None):
        self.update(dsmap or {})
        check_data_query_args(data_query_time, data_query_tz)
        self._data_query_time = data_query_time
        self._data_query_tz = data_query_tz

    @classmethod
    @memoize(cache=WeakKeyDictionary())
    def global_instance(cls):
        return cls()

    def __hash__(self):
        return id(self)

    def __call__(self, column):
        if column.dataset in self:
            return self
        raise KeyError(column)

    def load_adjusted_array(self, columns, dates, assets, mask):
        return dict(
            concat(map(
                partial(self._load_dataset, dates, assets, mask),
                itervalues(groupby(getdataset, columns))
            ))
        )

    def _load_dataset(self, dates, assets, mask, columns):
        try:
            (dataset,) = set(map(getdataset, columns))
        except ValueError:
            raise AssertionError('all columns must come from the same dataset')

        expr, deltas, odo_kwargs = self[dataset]
        have_sids = SID_FIELD_NAME in expr.fields
        asset_idx = pd.Series(index=assets, data=np.arange(len(assets)))
        assets = list(map(int, assets))  # coerce from numpy.int64
        added_query_fields = [AD_FIELD_NAME, TS_FIELD_NAME] + (
            [SID_FIELD_NAME] if have_sids else []
        )

        data_query_time = self._data_query_time
        data_query_tz = self._data_query_tz
        lower_dt, upper_dt = normalize_data_query_bounds(
            dates[0],
            dates[-1],
            data_query_time,
            data_query_tz,
        )

        def where(e):
            """Create the query to run against the resources.

            Parameters
            ----------
            e : Expr
                The baseline or deltas expression.

            Returns
            -------
            q : Expr
                The query to run.
            """
            def lower_for_col(column):
                pred = e[TS_FIELD_NAME] <= lower_dt
                colname = column.name
                schema = e[colname].schema.measure
                if isinstance(schema, Option):
                    pred &= e[colname].notnull()
                    schema = schema.ty
                if schema in floating:
                    pred &= ~e[colname].isnan()

                filtered = e[pred]
                lower = filtered[TS_FIELD_NAME].max()
                if have_sids:
                    # If we have sids, then we need to take the earliest of the
                    # greatest date that has a non-null value by sid.
                    lower = bz.by(
                        filtered[SID_FIELD_NAME],
                        timestamp=lower,
                    ).timestamp.min()
                return lower

            lower = odo(
                reduce(
                    bz.least,
                    map(lower_for_col, columns),
                ),
                pd.Timestamp,
                **odo_kwargs
            )
            if lower is pd.NaT:
                lower = lower_dt
            return e[
                (e[TS_FIELD_NAME] >= lower) &
                (e[TS_FIELD_NAME] <= upper_dt)
            ][added_query_fields + list(map(getname, columns))]

        def collect_expr(e):
            """Execute and merge all of the per-column subqueries.

            Parameters
            ----------
            e : Expr
                The baseline or deltas expression.

            Returns
            -------
            result : pd.DataFrame
                The resulting dataframe.

            Notes
            -----
            This can return more data than needed. The in memory reindex will
            handle this.
            """
            df = odo(where(e), pd.DataFrame, **odo_kwargs)
            df.sort_values(TS_FIELD_NAME, inplace=True)  # sort for the groupby later
            return df

        materialized_expr = collect_expr(expr)
        materialized_deltas = (
            collect_expr(deltas)
            if deltas is not None else
            pd.DataFrame(
                columns=added_query_fields + list(map(getname, columns)),
            )
        )

        # It's not guaranteed that assets returned by the engine will contain
        # all sids from the deltas table; filter out such mismatches here.
        if not materialized_deltas.empty and have_sids:
            materialized_deltas = materialized_deltas[
                materialized_deltas[SID_FIELD_NAME].isin(assets)
            ]

        if data_query_time is not None:
            for m in (materialized_expr, materialized_deltas):
                m.loc[:, TS_FIELD_NAME] = m.loc[
                    :, TS_FIELD_NAME
                ].astype('datetime64[ns]')

                normalize_timestamp_to_query_time(
                    m,
                    data_query_time,
                    data_query_tz,
                    inplace=True,
                    ts_field=TS_FIELD_NAME,
                )

        # Inline the deltas that changed our most recently known value.
        # Also, we reindex by the dates to create a dense representation of
        # the data.
        sparse_output, non_novel_deltas = overwrite_novel_deltas(
            materialized_expr,
            materialized_deltas,
            dates,
        )
        sparse_output.drop(AD_FIELD_NAME, axis=1, inplace=True)

        def last_in_date_group(df, reindex, have_sids=have_sids):
            idx = dates[dates.searchsorted(
                df[TS_FIELD_NAME].values.astype('datetime64[D]')
            )]
            if have_sids:
                idx = [idx, SID_FIELD_NAME]

            last_in_group = df.drop(TS_FIELD_NAME, axis=1).groupby(
                idx,
                sort=False,
            ).last()

            if have_sids:
                last_in_group = last_in_group.unstack()

            if reindex:
                if have_sids:
                    cols = last_in_group.columns
                    last_in_group = last_in_group.reindex(
                        index=dates,
                        columns=pd.MultiIndex.from_product(
                            (cols.levels[0], assets),
                            names=cols.names,
                        ),
                    )
                else:
                    last_in_group = last_in_group.reindex(dates)

            return last_in_group

        sparse_deltas = last_in_date_group(non_novel_deltas, reindex=False)
        dense_output = last_in_date_group(sparse_output, reindex=True)
        dense_output.ffill(inplace=True)

        if have_sids:
            adjustments_from_deltas = adjustments_from_deltas_with_sids
            column_view = identity
        else:
            # We use the column view to make an array per asset.
            column_view = compose(
                # We need to copy this because we need a concrete ndarray.
                # The `repeat_last_axis` call will give us a fancy strided
                # array which uses a buffer to represent `len(assets)` columns.
                # The engine puts nans at the indices for which we do not have
                # sid information so that the nan-aware reductions still work.
                # A future change to the engine would be to add first class
                # support for macroeconomic datasets.
                copy,
                partial(repeat_last_axis, count=len(assets)),
            )
            adjustments_from_deltas = adjustments_from_deltas_no_sids

        for column_idx, column in enumerate(columns):
            column_name = column.name
            yield column, AdjustedArray(
                column_view(
                    dense_output[column_name].values.astype(column.dtype),
                ),
                mask,
                adjustments_from_deltas(
                    dates,
                    sparse_output[TS_FIELD_NAME].values,
                    column_idx,
                    column_name,
                    asset_idx,
                    sparse_deltas,
                ),
                column.missing_value,
            )
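
# A simplified pandas illustration of the ``last_in_date_group`` +
# forward-fill step above: take the last observation per (day, sid), pivot
# sids to columns, reindex onto the full ``dates`` index, and forward-fill.
# Toy data with hypothetical column names; the real code also merges deltas.
import pandas as pd

dates = pd.date_range('2014-01-02', periods=4)
raw = pd.DataFrame({
    'timestamp': pd.to_datetime(['2014-01-02', '2014-01-02', '2014-01-03']),
    'sid': [1, 1, 2],
    'value': [10.0, 11.0, 20.0],
})

dense = (
    raw.groupby([raw['timestamp'].dt.normalize(), 'sid'])['value']
    .last()       # last value observed per (day, sid)
    .unstack()    # one column per sid
    .reindex(dates)
    .ffill()      # carry the last known value forward
)
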
Example 11
class BlazeEstimatesLoader(implements(PipelineLoader)):
    """An abstract pipeline loader for the estimates datasets that loads
    data from a blaze expression.

    Parameters
    ----------
    expr : Expr
        The expression representing the data to load.
    columns : dict[str -> str]
        A dict mapping BoundColumn names to the associated names in `expr`.
    resources : dict, optional
        Mapping from the loadable terms of ``expr`` to actual data resources.
    odo_kwargs : dict, optional
        Extra keyword arguments to pass to odo when executing the expression.
    data_query_time : time, optional
        The time to use for the data query cutoff.
    data_query_tz : tzinfo or str
        The timezone to use for the data query cutoff.
    checkpoints : Expr, optional
        The expression representing checkpointed data to be used for faster
        forward-filling of data from `expr`.

    Notes
    -----
    The expression should have a tabular dshape of::

       Dim * {{
           {SID_FIELD_NAME}: int64,
           {TS_FIELD_NAME}: datetime,
           {FISCAL_YEAR_FIELD_NAME}: float64,
           {FISCAL_QUARTER_FIELD_NAME}: float64,
           {EVENT_DATE_FIELD_NAME}: datetime,
       }}

    And other dataset-specific fields, where each row of the table is a
    record including the sid to identify the company, the timestamp where we
    learned about the announcement, and the date of the event.

    If the '{TS_FIELD_NAME}' field is not included, it is assumed that we
    start the backtest with knowledge of all announcements.
    """
    __doc__ = __doc__.format(
        SID_FIELD_NAME=SID_FIELD_NAME,
        TS_FIELD_NAME=TS_FIELD_NAME,
        FISCAL_YEAR_FIELD_NAME=FISCAL_YEAR_FIELD_NAME,
        FISCAL_QUARTER_FIELD_NAME=FISCAL_QUARTER_FIELD_NAME,
        EVENT_DATE_FIELD_NAME=EVENT_DATE_FIELD_NAME,
    )

    @preprocess(data_query_tz=optionally(ensure_timezone))
    def __init__(self,
                 expr,
                 columns,
                 resources=None,
                 odo_kwargs=None,
                 data_query_time=None,
                 data_query_tz=None,
                 checkpoints=None):

        dshape = expr.dshape
        if not istabular(dshape):
            raise ValueError(
                'expression dshape must be tabular, got: %s' % dshape,
            )

        required_cols = list(required_estimates_fields(columns))
        self._expr = bind_expression_to_resources(
            expr[required_cols],
            resources,
        )
        self._columns = columns
        self._odo_kwargs = odo_kwargs if odo_kwargs is not None else {}
        check_data_query_args(data_query_time, data_query_tz)
        self._data_query_time = data_query_time
        self._data_query_tz = data_query_tz
        self._checkpoints = checkpoints

    def load_adjusted_array(self, domain, columns, dates, sids, mask):
        # Only load requested columns.
        requested_column_names = [
            self._columns[column.name] for column in columns
        ]

        raw = load_raw_data(
            sids,
            dates,
            self._data_query_time,
            self._data_query_tz,
            self._expr[sorted(metadata_columns.union(requested_column_names))],
            self._odo_kwargs,
            checkpoints=self._checkpoints,
        )

        return self.loader(
            raw,
            {column.name: self._columns[column.name]
             for column in columns},
        ).load_adjusted_array(
            domain,
            columns,
            dates,
            sids,
            mask,
        )
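
# A minimal sketch of an expression with the tabular shape this loader
# documents above (assuming blaze is importable; ``estimate`` stands in for a
# dataset-specific column and the other field names follow the docstring).
import blaze as bz
import pandas as pd

estimates_frame = pd.DataFrame({
    'sid': [1, 1],
    'timestamp': pd.to_datetime(['2014-01-05', '2014-04-05']),
    'fiscal_year': [2014.0, 2014.0],
    'fiscal_quarter': [1.0, 2.0],
    'event_date': pd.to_datetime(['2014-01-20', '2014-04-20']),
    'estimate': [1.10, 1.25],
})
expr = bz.data(estimates_frame)
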
Example 12
class BlazeEventsLoader(PipelineLoader):
    """An abstract pipeline loader for the events datasets that loads
    data from a blaze expression.

    Parameters
    ----------
    expr : Expr
        The expression representing the data to load.
    next_value_columns : dict[BoundColumn -> raw column name]
        A dict mapping 'next' BoundColumns to their column names in `expr`.
    previous_value_columns : dict[BoundColumn -> raw column name]
        A dict mapping 'previous' BoundColumns to their column names in `expr`.
    resources : dict, optional
        Mapping from the loadable terms of ``expr`` to actual data resources.
    odo_kwargs : dict, optional
        Extra keyword arguments to pass to odo when executing the expression.
    data_query_time : time, optional
        The time to use for the data query cutoff.
    data_query_tz : tzinfo or str
        The timezone to use for the data query cutoff.

    Notes
    -----
    The expression should have a tabular dshape of::

       Dim * {{
           {SID_FIELD_NAME}: int64,
           {TS_FIELD_NAME}: datetime,
           {EVENT_DATE_FIELD_NAME}: datetime,
       }}

    And other dataset-specific fields, where each row of the table is a
    record including the sid to identify the company, the timestamp where we
    learned about the announcement, and the event date.

    If the '{TS_FIELD_NAME}' field is not included, it is assumed that we
    start the backtest with knowledge of all announcements.
    """

    __doc__ = __doc__.format(SID_FIELD_NAME=SID_FIELD_NAME,
                             TS_FIELD_NAME=TS_FIELD_NAME,
                             EVENT_DATE_FIELD_NAME=EVENT_DATE_FIELD_NAME)

    @preprocess(data_query_tz=optionally(ensure_timezone))
    def __init__(self,
                 expr,
                 next_value_columns,
                 previous_value_columns,
                 resources=None,
                 odo_kwargs=None,
                 data_query_time=None,
                 data_query_tz=None):

        dshape = expr.dshape
        if not istabular(dshape):
            raise ValueError(
                'expression dshape must be tabular, got: %s' % dshape,
            )

        required_cols = list(
            required_event_fields(next_value_columns, previous_value_columns))
        self._expr = bind_expression_to_resources(
            expr[required_cols],
            resources,
        )
        self._next_value_columns = next_value_columns
        self._previous_value_columns = previous_value_columns
        self._odo_kwargs = odo_kwargs if odo_kwargs is not None else {}
        check_data_query_args(data_query_time, data_query_tz)
        self._data_query_time = data_query_time
        self._data_query_tz = data_query_tz

    def load_adjusted_array(self, columns, dates, assets, mask):
        raw = load_raw_data(assets, dates, self._data_query_time,
                            self._data_query_tz, self._expr, self._odo_kwargs)

        return EventsLoader(
            events=raw,
            next_value_columns=self._next_value_columns,
            previous_value_columns=self._previous_value_columns,
        ).load_adjusted_array(
            columns,
            dates,
            assets,
            mask,
        )
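
# A hedged usage sketch (hypothetical names): an events dataset exposing
# ``next_event_date`` / ``previous_event_date`` BoundColumns could be wired
# to the same raw ``event_date`` field of ``expr`` like so:
#
#     loader = BlazeEventsLoader(
#         expr,
#         next_value_columns={MyEvents.next_event_date: 'event_date'},
#         previous_value_columns={MyEvents.previous_event_date: 'event_date'},
#     )
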
Example 13
class BlazeEarningsCalendarLoader(PipelineLoader):
    """A pipeline loader for the ``EarningsCalendar`` dataset that loads
    data from a blaze expression.

    Parameters
    ----------
    expr : Expr
        The expression representing the data to load.
    resources : dict, optional
        Mapping from the atomic terms of ``expr`` to actual data resources.
    odo_kwargs : dict, optional
        Extra keyword arguments to pass to odo when executing the expression.
    data_query_time : time, optional
        The time to use for the data query cutoff.
    data_query_tz : tzinfo or str
        The timezone to use for the data query cutoff.

    Notes
    -----
    The expression should have a tabular dshape of::

       Dim * {{
           {SID_FIELD_NAME}: int64,
           {TS_FIELD_NAME}: datetime,
           {ANNOUNCEMENT_FIELD_NAME}: ?datetime,
       }}

    Where each row of the table is a record including the sid to identify the
    company, the timestamp where we learned about the announcement, and the
    date when the earnings will be announced.

    If the '{TS_FIELD_NAME}' field is not included, it is assumed that we
    start the backtest with knowledge of all announcements.
    """
    __doc__ = __doc__.format(
        TS_FIELD_NAME=TS_FIELD_NAME,
        SID_FIELD_NAME=SID_FIELD_NAME,
        ANNOUNCEMENT_FIELD_NAME=ANNOUNCEMENT_FIELD_NAME,
    )

    _expected_fields = frozenset({
        TS_FIELD_NAME,
        SID_FIELD_NAME,
        ANNOUNCEMENT_FIELD_NAME,
    })

    @preprocess(data_query_tz=optionally(ensure_timezone))
    def __init__(self,
                 expr,
                 resources=None,
                 odo_kwargs=None,
                 data_query_time=None,
                 data_query_tz=None,
                 dataset=EarningsCalendar):
        dshape = expr.dshape

        if not istabular(dshape):
            raise ValueError(
                'expression dshape must be tabular, got: %s' % dshape,
            )

        expected_fields = self._expected_fields
        self._expr = bind_expression_to_resources(
            expr[list(expected_fields)],
            resources,
        )
        self._odo_kwargs = odo_kwargs if odo_kwargs is not None else {}
        self._dataset = dataset
        check_data_query_args(data_query_time, data_query_tz)
        self._data_query_time = data_query_time
        self._data_query_tz = data_query_tz

    def load_adjusted_array(self, columns, dates, assets, mask):
        data_query_time = self._data_query_time
        data_query_tz = self._data_query_tz
        lower_dt, upper_dt = normalize_data_query_bounds(
            dates[0],
            dates[-1],
            data_query_time,
            data_query_tz,
        )

        raw = ffill_query_in_range(
            self._expr,
            lower_dt,
            upper_dt,
            self._odo_kwargs,
        )
        sids = raw.loc[:, SID_FIELD_NAME]
        raw.drop(
            sids[~sids.isin(assets)].index,
            inplace=True
        )
        if data_query_time is not None:
            normalize_timestamp_to_query_time(
                raw,
                data_query_time,
                data_query_tz,
                inplace=True,
                ts_field=TS_FIELD_NAME,
            )

        gb = raw.groupby(SID_FIELD_NAME)

        def mkseries(idx, raw_loc=raw.loc):
            vs = raw_loc[
                idx, [TS_FIELD_NAME, ANNOUNCEMENT_FIELD_NAME]
            ].values
            return pd.Series(
                index=pd.DatetimeIndex(vs[:, 0]),
                data=vs[:, 1],
            )

        return EarningsCalendarLoader(
            dates,
            valmap(mkseries, gb.groups),
            dataset=self._dataset,
        ).load_adjusted_array(columns, dates, assets, mask)
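
# A simplified pandas rendering of the ``mkseries``/``valmap`` step above:
# group the raw frame by sid and, for each sid, build a Series of
# announcement dates indexed by the timestamp at which each announcement
# became known (toy data; real field names come from the TS/SID/ANNOUNCEMENT
# constants).
import pandas as pd

raw = pd.DataFrame({
    'sid': [1, 1, 2],
    'timestamp': pd.to_datetime(['2014-01-02', '2014-02-02', '2014-01-03']),
    'announcement_date': pd.to_datetime(
        ['2014-01-20', '2014-02-20', '2014-01-25']),
})

announcements_by_sid = {
    sid: pd.Series(
        data=group['announcement_date'].values,
        index=pd.DatetimeIndex(group['timestamp']),
    )
    for sid, group in raw.groupby('sid')
}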