Example #1
    def test_register_non_decorator(self):
        ayy_lmao_set = set()

        def ayy_lmao():
            return ayy_lmao_set

        self.register('ayy-lmao', ayy_lmao)

        expected_metrics_sets = mappingproxy({'ayy-lmao': ayy_lmao})
        assert_equal(self.metrics_sets, expected_metrics_sets)
        assert_is(self.load('ayy-lmao'), ayy_lmao_set)

        def other():  # pragma: no cover
            raise AssertionError('dead')

        msg = "metrics set 'ayy-lmao' is already registered"
        with assert_raises_str(ValueError, msg):
            self.register('ayy-lmao', other)

        # ensure that the failed registration didn't break the previously
        # registered set
        assert_equal(self.metrics_sets, expected_metrics_sets)
        assert_is(self.load('ayy-lmao'), ayy_lmao_set)

        self.unregister('ayy-lmao')
        assert_equal(self.metrics_sets, mappingproxy({}))

        msg = "no metrics set registered as 'ayy-lmao', options are: []"
        with assert_raises_str(ValueError, msg):
            self.load('ayy-lmao')

        msg = "metrics set 'ayy-lmao' was not already registered"
        with assert_raises_str(ValueError, msg):
            self.unregister('ayy-lmao')
Example #3
    def test_register_non_decorator(self, metrics):
        ayy_lmao_set = set()

        def ayy_lmao():
            return ayy_lmao_set

        metrics.register("ayy-lmao", ayy_lmao)

        expected_metrics_sets = mappingproxy({"ayy-lmao": ayy_lmao})
        assert metrics.metrics_sets == expected_metrics_sets
        assert metrics.load("ayy-lmao") is ayy_lmao_set

        def other():  # pragma: no cover
            raise AssertionError("dead")

        msg = "metrics set 'ayy-lmao' is already registered"
        with pytest.raises(ValueError, match=msg):
            metrics.register("ayy-lmao", other)

        # ensure that the failed registration didn't break the previously
        # registered set
        assert metrics.metrics_sets == expected_metrics_sets
        assert metrics.load("ayy-lmao") is ayy_lmao_set

        metrics.unregister("ayy-lmao")
        assert metrics.metrics_sets == mappingproxy({})

        msg = "no metrics set registered as 'ayy-lmao', options are: []"
        with pytest.raises(ValueError, match=re.escape(msg)):
            metrics.load("ayy-lmao")

        msg = "metrics set 'ayy-lmao' was not already registered"
        with pytest.raises(ValueError, match=msg):
            metrics.unregister("ayy-lmao")
Example #5
    def init_instance_fixtures(self):
        super(MetricsSetCoreTestCase, self).init_instance_fixtures()

        self.metrics_sets, self.register, self.unregister, self.load = (
            _make_metrics_set_core()
        )

        # make sure this starts empty
        assert_equal(self.metrics_sets, mappingproxy({}))
Example #6
    def __init__(self, interface):
        """
        Parameters
        ----------
        interface : type
            The abstract base class to manage
        """
        self.interface = interface
        self._classes = {}
        self.classes = mappingproxy(self._classes)
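
Several of these examples expose a private dict through mappingproxy so callers get a read-only view while the owner keeps write access. A minimal standalone sketch of that behavior (the names here are illustrative, not taken from the examples above):

from types import MappingProxyType as mappingproxy

_registry = {}                 # private, mutable storage
registry = mappingproxy(_registry)

_registry['a'] = 1             # writes go through the private dict
assert registry['a'] == 1      # ...and are visible through the proxy

try:
    registry['b'] = 2          # the proxy itself rejects mutation
except TypeError:
    pass                       # 'mappingproxy' object does not support item assignment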
Example #7
@pytest.fixture
def metrics():
    MetricsCoreSet = namedtuple(
        "MetricsCoreSet",
        [
            "metrics_sets",
            "register",
            "unregister",
            "load",
        ],
    )
    metrics_set_core = MetricsCoreSet(*_make_metrics_set_core())
    # make sure this starts empty
    assert metrics_set_core.metrics_sets == mappingproxy({})
    yield metrics_set_core
Example #8
def _make_bundle_core():
    """Create a family of data bundle functions that read from the same
    bundle mapping.

    Returns
    -------
    bundles : mappingproxy
        The mapping of bundles to bundle payloads.
    register : callable
        The function which registers new bundles in the ``bundles`` mapping.
    unregister : callable
        The function which deregisters bundles from the ``bundles`` mapping.
    ingest : callable
        The function which downloads and writes data for a given data bundle.
    load : callable
        The function which loads the ingested bundles back into memory.
    clean : callable
        The function which cleans up data written with ``ingest``.
    """
    _bundles = {}  # the registered bundles
    # Expose _bundles through a proxy so that users cannot mutate this
    # accidentally. Users may go through `register` to update this which will
    # warn when trampling another bundle.
    bundles = mappingproxy(_bundles)

    @curry
    def register(name,
                 f,
                 calendar='NYSE',
                 start_session=None,
                 end_session=None,
                 minutes_per_day=390,
                 create_writers=True):
        """Register a data bundle ingest function.

        Parameters
        ----------
        name : str
            The name of the bundle.
        f : callable
            The ingest function. This function will be passed:

              environ : mapping
                  The environment this is being run with.
              asset_db_writer : AssetDBWriter
                  The asset db writer to write into.
              minute_bar_writer : BcolzMinuteBarWriter
                  The minute bar writer to write into.
              daily_bar_writer : BcolzDailyBarWriter
                  The daily bar writer to write into.
              adjustment_writer : SQLiteAdjustmentWriter
                  The adjustment db writer to write into.
              calendar : zipline.utils.calendars.TradingCalendar
                  The trading calendar to ingest for.
              start_session : pd.Timestamp
                  The first session of data to ingest.
              end_session : pd.Timestamp
                  The last session of data to ingest.
              cache : DataFrameCache
                  A mapping object to temporarily store dataframes.
                  This should be used to cache intermediates in case the load
                  fails. This will be automatically cleaned up after a
                  successful load.
              show_progress : bool
                  Show the progress for the current load where possible.
        calendar : zipline.utils.calendars.TradingCalendar or str, optional
            The trading calendar to align the data to, or the name of a trading
            calendar. This defaults to 'NYSE', in which case we use the NYSE
            calendar.
        start_session : pd.Timestamp, optional
            The first session for which we want data. If not provided,
            or if the date lies outside the range supported by the
            calendar, the first_session of the calendar is used.
        end_session : pd.Timestamp, optional
            The last session for which we want data. If not provided,
            or if the date lies outside the range supported by the
            calendar, the last_session of the calendar is used.
        minutes_per_day : int, optional
            The number of minutes in each normal trading day.
        create_writers : bool, optional
            Should the ingest machinery create the writers for the ingest
            function. This can be disabled as an optimization for cases where
            they are not needed, like the ``quantopian-quandl`` bundle.

        Notes
        -----
        This function may be used as a decorator, for example:

        .. code-block:: python

           @register('quandl')
           def quandl_ingest_function(...):
               ...

        See Also
        --------
        zipline.data.bundles.bundles
        """
        if name in bundles:
            warnings.warn(
                'Overwriting bundle with name %r' % name,
                stacklevel=3,
            )

        if isinstance(calendar, str):
            calendar = get_calendar(calendar)

        # If the start and end sessions are not provided or lie outside
        # the bounds of the calendar being used, set them to the first
        # and last sessions of the calendar.
        if start_session is None or start_session < calendar.first_session:
            start_session = calendar.first_session
        if end_session is None or end_session > calendar.last_session:
            end_session = calendar.last_session

        _bundles[name] = _BundlePayload(
            calendar,
            start_session,
            end_session,
            minutes_per_day,
            f,
            create_writers,
        )
        return f

    def unregister(name):
        """Unregister a bundle.

        Parameters
        ----------
        name : str
            The name of the bundle to unregister.

        Raises
        ------
        UnknownBundle
            Raised when no bundle has been registered with the given name.

        See Also
        --------
        zipline.data.bundles.bundles
        """
        try:
            del _bundles[name]
        except KeyError:
            raise UnknownBundle(name)

    def ingest(name, environ=os.environ, timestamp=None, show_progress=False):
        """Ingest data for a given bundle.

        Parameters
        ----------
        name : str
            The name of the bundle.
        environ : mapping, optional
            The environment variables. By default this is os.environ.
        timestamp : datetime, optional
            The timestamp to use for the load.
            By default this is the current time.
        show_progress : bool, optional
            Tell the ingest function to display the progress where possible.
        """
        try:
            bundle = bundles[name]
        except KeyError:
            raise UnknownBundle(name)

        if timestamp is None:
            timestamp = pd.Timestamp.utcnow()
        timestamp = timestamp.tz_convert('utc').tz_localize(None)
        timestr = to_bundle_ingest_dirname(timestamp)
        cachepath = cache_path(name, environ=environ)
        pth.ensure_directory(pth.data_path([name, timestr], environ=environ))
        pth.ensure_directory(cachepath)
        with dataframe_cache(cachepath, clean_on_failure=False) as cache, \
                ExitStack() as stack:
            # we use `clean_on_failure=False` so that we don't purge the
            # cache directory if the load fails in the middle
            if bundle.create_writers:
                wd = stack.enter_context(
                    working_dir(pth.data_path([], environ=environ)))
                daily_bars_path = wd.ensure_dir(*daily_equity_relative(
                    name,
                    timestr,
                    environ=environ,
                ))
                daily_bar_writer = BcolzDailyBarWriter(
                    daily_bars_path,
                    bundle.calendar,
                    bundle.start_session,
                    bundle.end_session,
                )
                # Do an empty write to ensure that the daily ctables exist
                # when we create the SQLiteAdjustmentWriter below. The
                # SQLiteAdjustmentWriter needs to open the daily ctables so
                # that it can compute the adjustment ratios for the dividends.

                daily_bar_writer.write(())
                minute_bar_writer = BcolzMinuteBarWriter(
                    wd.ensure_dir(*minute_equity_relative(
                        name, timestr, environ=environ)),
                    bundle.calendar,
                    bundle.start_session,
                    bundle.end_session,
                    minutes_per_day=bundle.minutes_per_day,
                )
                asset_db_writer = AssetDBWriter(
                    wd.getpath(*asset_db_relative(
                        name,
                        timestr,
                        environ=environ,
                    )))

                adjustment_db_writer = stack.enter_context(
                    SQLiteAdjustmentWriter(
                        wd.getpath(*adjustment_db_relative(
                            name, timestr, environ=environ)),
                        BcolzDailyBarReader(daily_bars_path),
                        bundle.calendar.all_sessions,
                        overwrite=True,
                    ))
            else:
                daily_bar_writer = None
                minute_bar_writer = None
                asset_db_writer = None
                adjustment_db_writer = None
            bundle.ingest(
                environ,
                asset_db_writer,
                minute_bar_writer,
                daily_bar_writer,
                adjustment_db_writer,
                bundle.calendar,
                bundle.start_session,
                bundle.end_session,
                cache,
                show_progress,
                pth.data_path([name, timestr], environ=environ),
            )

    def most_recent_data(bundle_name, timestamp, environ=None):
        """Get the path to the most recent data after ``date``for the
        given bundle.

        Parameters
        ----------
        bundle_name : str
            The name of the bundle to lookup.
        timestamp : datetime
            The timestamp to begin searching on or before.
        environ : dict, optional
            An environment dict to forward to zipline_root.
        """
        if bundle_name not in bundles:
            raise UnknownBundle(bundle_name)

        try:
            candidates = os.listdir(
                pth.data_path([bundle_name], environ=environ), )
            return pth.data_path(
                [
                    bundle_name,
                    max(
                        filter(complement(pth.hidden), candidates),
                        key=from_bundle_ingest_dirname,
                    )
                ],
                environ=environ,
            )
        except (ValueError, OSError) as e:
            if getattr(e, 'errno', errno.ENOENT) != errno.ENOENT:
                raise
            raise ValueError(
                'no data for bundle {bundle!r} on or before {timestamp}\n'
                'maybe you need to run: $ zipline ingest -b {bundle}'.format(
                    bundle=bundle_name,
                    timestamp=timestamp,
                ), )

    def load(name, environ=os.environ, timestamp=None):
        """Loads a previously ingested bundle.

        Parameters
        ----------
        name : str
            The name of the bundle.
        environ : mapping, optional
            The environment variables. Defaults to os.environ.
        timestamp : datetime, optional
            The timestamp of the data to lookup.
            Defaults to the current time.

        Returns
        -------
        bundle_data : BundleData
            The raw data readers for this bundle.
        """
        if timestamp is None:
            timestamp = pd.Timestamp.utcnow()
        timestr = most_recent_data(name, timestamp, environ=environ)
        return BundleData(
            asset_finder=AssetFinder(
                asset_db_path(name, timestr, environ=environ), ),
            equity_minute_bar_reader=BcolzMinuteBarReader(
                minute_equity_path(name, timestr, environ=environ), ),
            equity_daily_bar_reader=BcolzDailyBarReader(
                daily_equity_path(name, timestr, environ=environ), ),
            adjustment_reader=SQLiteAdjustmentReader(
                adjustment_db_path(name, timestr, environ=environ), ),
        )

    @preprocess(
        before=optionally(ensure_timestamp),
        after=optionally(ensure_timestamp),
    )
    def clean(name,
              before=None,
              after=None,
              keep_last=None,
              environ=os.environ):
        """Clean up data that was created with ``ingest`` or
        ``$ python -m zipline ingest``

        Parameters
        ----------
        name : str
            The name of the bundle to remove data for.
        before : datetime, optional
            Remove data ingested before this date.
            This argument is mutually exclusive with: keep_last
        after : datetime, optional
            Remove data ingested after this date.
            This argument is mutually exclusive with: keep_last
        keep_last : int, optional
            Remove all but the last ``keep_last`` ingestions.
            This argument is mutually exclusive with:
              before
              after
        environ : mapping, optional
            The environment variables. Defaults to os.environ.

        Returns
        -------
        cleaned : set[str]
            The names of the runs that were removed.

        Raises
        ------
        BadClean
            Raised when ``before`` and/or ``after`` are passed with
            ``keep_last``. This is a subclass of ``ValueError``.
        """
        try:
            all_runs = sorted(
                filter(
                    complement(pth.hidden),
                    os.listdir(pth.data_path([name], environ=environ)),
                ),
                key=from_bundle_ingest_dirname,
            )
        except OSError as e:
            if e.errno != errno.ENOENT:
                raise
            raise UnknownBundle(name)
        if ((before is not None or after is not None)
                and keep_last is not None):
            raise BadClean(before, after, keep_last)

        if keep_last is None:

            def should_clean(name):
                dt = from_bundle_ingest_dirname(name)
                return ((before is not None and dt < before)
                        or (after is not None and dt > after))

        elif keep_last >= 0:
            last_n_dts = set(take(keep_last, reversed(all_runs)))

            def should_clean(name):
                return name not in last_n_dts
        else:
            raise BadClean(before, after, keep_last)

        cleaned = set()
        for run in all_runs:
            if should_clean(run):
                path = pth.data_path([name, run], environ=environ)
                shutil.rmtree(path)
                cleaned.add(path)

        return cleaned

    return BundleCore(bundles, register, unregister, ingest, load, clean)
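
Because ``register`` is wrapped in ``@curry``, it can be called directly with a name and an ingest function, or applied as a decorator with the name alone, as the Notes section above describes. A hedged sketch against the core returned above (the bundle names and the no-op ingest bodies are illustrative):

bundles, register, unregister, ingest, load, clean = _make_bundle_core()

# direct registration
def my_ingest(environ, asset_db_writer, minute_bar_writer, daily_bar_writer,
              adjustment_writer, calendar, start_session, end_session,
              cache, show_progress, output_dir):
    pass  # write data using the provided writers

register('my-bundle', my_ingest)

# equivalent decorator form: curry holds the name until the function arrives
@register('my-other-bundle')
def my_other_ingest(*args):
    pass

assert set(bundles) == {'my-bundle', 'my-other-bundle'}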
Example #9
class BcolzMinuteBarReader(MinuteBarReader):
    """
    Reader for data written by BcolzMinuteBarWriter

    Parameters
    ----------
    rootdir : string
        The root directory containing the metadata and asset bcolz
        directories.

    See Also
    --------
    zipline.data.minute_bars.BcolzMinuteBarWriter
    """
    FIELDS = ('open', 'high', 'low', 'close', 'volume')
    DEFAULT_MINUTELY_SID_CACHE_SIZES = {
        'close': 3000,
        'open': 1550,
        'high': 1550,
        'low': 1550,
        'volume': 1550,
    }
    assert set(FIELDS) == set(DEFAULT_MINUTELY_SID_CACHE_SIZES), \
        "FIELDS should match DEFAULT_MINUTELY_SID_CACHE_SIZES keys"

    # Wrap the defaults in proxy so that we don't accidentally mutate them in
    # place in the constructor. If a user wants to change the defaults, they
    # can do so by mutating DEFAULT_MINUTELY_SID_CACHE_SIZES.
    _default_proxy = mappingproxy(DEFAULT_MINUTELY_SID_CACHE_SIZES)

    def __init__(self, rootdir, sid_cache_sizes=_default_proxy):

        self._rootdir = rootdir

        metadata = self._get_metadata()

        self._start_session = metadata.start_session
        self._end_session = metadata.end_session

        self.calendar = metadata.calendar
        slicer = self.calendar.schedule.index.slice_indexer(
            self._start_session,
            self._end_session,
        )
        self._schedule = self.calendar.schedule[slicer]
        self._market_opens = self._schedule.market_open
        self._market_open_values = self._market_opens.values.astype(
            'datetime64[m]').astype(np.int64)
        self._market_closes = self._schedule.market_close
        self._market_close_values = self._market_closes.values.astype(
            'datetime64[m]').astype(np.int64)

        self._default_ohlc_inverse = 1.0 / metadata.default_ohlc_ratio
        ohlc_ratios = metadata.ohlc_ratios_per_sid
        if ohlc_ratios:
            self._ohlc_inverses_per_sid = (valmap(lambda x: 1.0 / x,
                                                  ohlc_ratios))
        else:
            self._ohlc_inverses_per_sid = None

        self._minutes_per_day = metadata.minutes_per_day

        self._carrays = {
            field: LRU(sid_cache_sizes[field])
            for field in self.FIELDS
        }

        self._last_get_value_dt_position = None
        self._last_get_value_dt_value = None

        # This is to avoid any bad data or other performance-killing situation
        # where there is a consecutive streak of 0 (no volume) minutes starting
        # at an asset's start date.
        # E.g. if asset 1 started on 2015-01-03 but its first trade was at
        # 2015-01-06 10:31 AM US/Eastern, this dict would store {1: 23675971},
        # which is the minute epoch of that date.
        self._known_zero_volume_dict = {}

    def _get_metadata(self):
        return BcolzMinuteBarMetadata.read(self._rootdir)

    @property
    def trading_calendar(self):
        return self.calendar

    @lazyval
    def last_available_dt(self):
        _, close = self.calendar.open_and_close_for_session(self._end_session)
        return close

    @property
    def first_trading_day(self):
        return self._start_session

    def _ohlc_ratio_inverse_for_sid(self, sid):
        if self._ohlc_inverses_per_sid is not None:
            try:
                return self._ohlc_inverses_per_sid[sid]
            except KeyError:
                pass

        # If we cannot get a sid-specific OHLC inverse for this sid,
        # fall back to the default.
        return self._default_ohlc_inverse

    def _minutes_to_exclude(self):
        """
        Calculate the minutes which should be excluded when a window
        occurs on days which had an early close, i.e. days where the close
        based on the regular period of minutes per day and the market close
        do not match.

        Returns
        -------
        List of (market_open, early_close) pd.Timestamp pairs representing
        the minute ranges to exclude because of early closes.
        """
        market_opens = self._market_opens.values.astype('datetime64[m]')
        market_closes = self._market_closes.values.astype('datetime64[m]')
        minutes_per_day = (market_closes - market_opens).astype(np.int64)
        early_indices = np.where(
            minutes_per_day != self._minutes_per_day - 1)[0]
        early_opens = self._market_opens[early_indices]
        early_closes = self._market_closes[early_indices]
        minutes = [
            (market_open, early_close)
            for market_open, early_close in zip(early_opens, early_closes)
        ]
        return minutes

    @lazyval
    def _minute_exclusion_tree(self):
        """
        Build an interval tree keyed by the start and end of each range
        of positions that should be dropped from windows. (These are the minutes
        between an early close and the minute which would be the close based
        on the regular period if there were no early close.)
        The value of each node is the same start and end position stored as
        a tuple.

        The data is stored this way to support quickly answering the question:
        does a given start and end position overlap any of the exclusion spans?

        Returns
        -------
        IntervalTree containing nodes which represent the minutes to exclude
        because of early closes.
        """
        itree = IntervalTree()
        for market_open, early_close in self._minutes_to_exclude():
            start_pos = self._find_position_of_minute(early_close) + 1
            end_pos = (self._find_position_of_minute(market_open) +
                       self._minutes_per_day - 1)
            data = (start_pos, end_pos)
            itree[start_pos:end_pos + 1] = data
        return itree

    def _exclusion_indices_for_range(self, start_idx, end_idx):
        """
        Returns
        -------
        List of tuples of (start, stop) which represent the ranges of minutes
        which should be excluded when a market minute window is requested.
        """
        itree = self._minute_exclusion_tree
        if itree.overlaps(start_idx, end_idx):
            ranges = []
            intervals = itree[start_idx:end_idx]
            for interval in intervals:
                ranges.append(interval.data)
            return sorted(ranges)
        else:
            return None

    def _get_carray_path(self, sid, field):
        sid_subdir = _sid_subdir_path(sid)
        # carrays are subdirectories of the sid's rootdir
        return os.path.join(self._rootdir, sid_subdir, field)

    def _open_minute_file(self, field, sid):
        sid = int(sid)

        try:
            carray = self._carrays[field][sid]
        except KeyError:
            try:
                carray = self._carrays[field][sid] = bcolz.carray(
                    rootdir=self._get_carray_path(sid, field),
                    mode='r',
                )
            except IOError:
                raise NoDataForSid('No minute data for sid {}.'.format(sid))

        return carray

    def table_len(self, sid):
        """Returns the length of the underlying table for this sid."""
        return len(self._open_minute_file('close', sid))

    def get_sid_attr(self, sid, name):
        sid_subdir = _sid_subdir_path(sid)
        sid_path = os.path.join(self._rootdir, sid_subdir)
        attrs = bcolz.attrs.attrs(sid_path, 'r')
        try:
            return attrs[name]
        except KeyError:
            return None

    def get_value(self, sid, dt, field):
        """
        Retrieve the pricing info for the given sid, dt, and field.

        Parameters
        ----------
        sid : int
            Asset identifier.
        dt : datetime-like
            The datetime at which the trade occurred.
        field : string
            The type of pricing data to retrieve.
            ('open', 'high', 'low', 'close', 'volume')

        Returns
        -------
        out : float | int
            The market data for the given sid, dt, and field coordinates.

        For OHLC:
            Returns a float if a trade occurred at the given dt.
            If no trade occurred, a np.nan is returned.

        For volume:
            Returns the integer value of the volume.
            (A volume of 0 signifies no trades for the given dt.)
        """
        if self._last_get_value_dt_value == dt.value:
            minute_pos = self._last_get_value_dt_position
        else:
            try:
                minute_pos = self._find_position_of_minute(dt)
            except ValueError:
                raise NoDataOnDate()

            self._last_get_value_dt_value = dt.value
            self._last_get_value_dt_position = minute_pos

        try:
            value = self._open_minute_file(field, sid)[minute_pos]
        except IndexError:
            value = 0
        if value == 0:
            if field == 'volume':
                return 0
            else:
                return np.nan

        if field != 'volume':
            value *= self._ohlc_ratio_inverse_for_sid(sid)
        return value

    def get_last_traded_dt(self, asset, dt):
        minute_pos = self._find_last_traded_position(asset, dt)
        if minute_pos == -1:
            return pd.NaT
        return self._pos_to_minute(minute_pos)

    def _find_last_traded_position(self, asset, dt):
        volumes = self._open_minute_file('volume', asset)
        start_date_minute = asset.start_date.value / NANOS_IN_MINUTE
        dt_minute = dt.value / NANOS_IN_MINUTE

        try:
            # if we know of a dt before which this asset has no volume,
            # don't look before that dt
            earliest_dt_to_search = self._known_zero_volume_dict[asset.sid]
        except KeyError:
            earliest_dt_to_search = start_date_minute

        if dt_minute < earliest_dt_to_search:
            return -1

        pos = find_last_traded_position_internal(
            self._market_open_values,
            self._market_close_values,
            dt_minute,
            earliest_dt_to_search,
            volumes,
            self._minutes_per_day,
        )

        if pos == -1:
            # if we didn't find any volume before this dt, save it to avoid
            # work in the future.
            try:
                self._known_zero_volume_dict[asset.sid] = max(
                    dt_minute, self._known_zero_volume_dict[asset.sid])
            except KeyError:
                self._known_zero_volume_dict[asset.sid] = dt_minute

        return pos

    def _pos_to_minute(self, pos):
        minute_epoch = minute_value(self._market_open_values, pos,
                                    self._minutes_per_day)

        return pd.Timestamp(minute_epoch, tz='UTC', unit="m")

    def _find_position_of_minute(self, minute_dt):
        """
        Internal method that returns the position of the given minute in the
        list of every trading minute since market open of the first trading
        day. Adjusts non market minutes to the last close.

        E.g. this method would return 1 for 2002-01-02 9:32 AM Eastern, if
        2002-01-02 is the first trading day of the dataset.

        Parameters
        ----------
        minute_dt: pd.Timestamp
            The minute whose position should be calculated.

        Returns
        -------
        int: The position of the given minute in the list of all trading
        minutes since market open on the first trading day.
        """
        return find_position_of_minute(
            self._market_open_values,
            self._market_close_values,
            minute_dt.value / NANOS_IN_MINUTE,
            self._minutes_per_day,
            False,
        )

    def load_raw_arrays(self, fields, start_dt, end_dt, sids):
        """
        Parameters
        ----------
        fields : list of str
           'open', 'high', 'low', 'close', or 'volume'
        start_dt: Timestamp
           Beginning of the window range.
        end_dt: Timestamp
           End of the window range.
        sids : list of int
           The asset identifiers in the window.

        Returns
        -------
        list of np.ndarray
            A list with an entry per field of ndarrays with shape
            (minutes in range, sids) with a dtype of float64, containing the
            values for the respective field over start and end dt range.
        """
        start_idx = self._find_position_of_minute(start_dt)
        end_idx = self._find_position_of_minute(end_dt)

        num_minutes = (end_idx - start_idx + 1)

        results = []

        indices_to_exclude = self._exclusion_indices_for_range(
            start_idx, end_idx)
        if indices_to_exclude is not None:
            for excl_start, excl_stop in indices_to_exclude:
                length = excl_stop - excl_start + 1
                num_minutes -= length

        shape = num_minutes, len(sids)

        for field in fields:
            if field != 'volume':
                out = np.full(shape, np.nan)
            else:
                out = np.zeros(shape, dtype=np.uint32)

            for i, sid in enumerate(sids):
                carray = self._open_minute_file(field, sid)
                values = carray[start_idx:end_idx + 1]
                if indices_to_exclude is not None:
                    for excl_start, excl_stop in indices_to_exclude[::-1]:
                        excl_slice = np.s_[excl_start - start_idx:excl_stop -
                                           start_idx + 1]
                        values = np.delete(values, excl_slice)

                where = values != 0
                # first slice down to len(where) because we might not have
                # written data for all the minutes requested
                if field != 'volume':
                    out[:len(where),
                        i][where] = (values[where] *
                                     self._ohlc_ratio_inverse_for_sid(sid))
                else:
                    out[:len(where), i][where] = values[where]

            results.append(out)
        return results
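
A short, hedged usage sketch for this reader, assuming ``rootdir`` points at minute data previously written by BcolzMinuteBarWriter (the path, sid, and timestamps below are illustrative):

import pandas as pd

reader = BcolzMinuteBarReader('/path/to/minute/data')   # hypothetical rootdir

dt = pd.Timestamp('2016-01-05 14:31', tz='UTC')         # a market minute
close = reader.get_value(1, dt, 'close')                # float, np.nan if no trade
volume = reader.get_value(1, dt, 'volume')              # int, 0 means no trades

# one float64 array per field, shaped (minutes in range, len(sids))
opens, = reader.load_raw_arrays(
    fields=['open'],
    start_dt=dt,
    end_dt=dt + pd.Timedelta(minutes=9),
    sids=[1],
)
assert opens.shape == (10, 1)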
Example #10
def _make_bundle_core():
    """Create a family of data bundle functions that read from the same
    bundle mapping.

    Returns
    -------
    bundles : mappingproxy
        The mapping of bundles to bundle payloads.
    register : callable
        The function which registers new bundles in the ``bundles`` mapping.
    unregister : callable
        The function which deregisters bundles from the ``bundles`` mapping.
    ingest : callable
        The function which downloads and writes data for a given data bundle.
    load : callable
        The function which loads the ingested bundles back into memory.
    clean : callable
        The function which cleans up data written with ``ingest``.
    """
    _bundles = {}  # the registered bundles
    # Expose _bundles through a proxy so that users cannot mutate this
    # accidentally. Users may go through `register` to update this which will
    # warn when trampling another bundle.
    bundles = mappingproxy(_bundles)

    @curry
    def register(name,
                 f,
                 calendar=trading_days,
                 opens=open_and_closes['market_open'],
                 closes=open_and_closes['market_close'],
                 minutes_per_day=390,
                 create_writers=True):
        """Register a data bundle ingest function.

        Parameters
        ----------
        name : str
            The name of the bundle.
        f : callable
            The ingest function. This function will be passed:

              environ : mapping
                  The environment this is being run with.
              asset_db_writer : AssetDBWriter
                  The asset db writer to write into.
              minute_bar_writer : BcolzMinuteBarWriter
                  The minute bar writer to write into.
              daily_bar_writer : BcolzDailyBarWriter
                  The daily bar writer to write into.
              adjustment_writer : SQLiteAdjustmentWriter
                  The adjustment db writer to write into.
              calendar : pd.DatetimeIndex
                  The trading calendar to ingest for.
              cache : DataFrameCache
                  A mapping object to temporarily store dataframes.
                  This should be used to cache intermediates in case the load
                  fails. This will be automatically cleaned up after a
                  successful load.
              show_progress : bool
                  Show the progress for the current load where possible.
        calendar : pd.DatetimeIndex, optional
            The exchange calendar to align the data to. This defaults to the
            NYSE calendar.
        opens : pd.DatetimeIndex, optional
            The minute when the market opens each day. This defaults to the
            NYSE calendar.
        closes : pd.DatetimeIndex, optional
            The minute when the market closes each day. This defaults to the
            NYSE calendar.
        minutes_per_day : int, optional
            The number of minutes in each normal trading day.
        create_writers : bool, optional
            Should the ingest machinery create the writers for the ingest
            function. This can be disabled as an optimization for cases where
            they are not needed, like the ``quantopian-quandl`` bundle.

        Notes
        -----
        This function may be used as a decorator, for example:

        .. code-block:: python

           @register('quandl')
           def quandl_ingest_function(...):
               ...

        See Also
        --------
        zipline.data.bundles.bundles
        """
        if name in bundles:
            warnings.warn(
                'Overwriting bundle with name %r' % name,
                stacklevel=3,
            )
        _bundles[name] = _BundlePayload(
            calendar,
            opens,
            closes,
            minutes_per_day,
            f,
            create_writers,
        )
        return f

    def unregister(name):
        """Unregister a bundle.

        Parameters
        ----------
        name : str
            The name of the bundle to unregister.

        Raises
        ------
        UnknownBundle
            Raised when no bundle has been registered with the given name.

        See Also
        --------
        zipline.data.bundles.bundles
        """
        try:
            del _bundles[name]
        except KeyError:
            raise UnknownBundle(name)

    def ingest(name,
               environ=os.environ,
               timestamp=None,
               show_progress=False):
        """Ingest data for a given bundle.

        Parameters
        ----------
        name : str
            The name of the bundle.
        environ : mapping, optional
            The environment variables. By default this is os.environ.
        timestamp : datetime, optional
            The timestamp to use for the load.
            By default this is the current time.
        show_progress : bool, optional
            Tell the ingest function to display the progress where possible.
        """
        try:
            bundle = bundles[name]
        except KeyError:
            raise UnknownBundle(name)

        if timestamp is None:
            timestamp = pd.Timestamp.utcnow()
        timestamp = timestamp.tz_convert('utc').tz_localize(None)
        timestr = to_bundle_ingest_dirname(timestamp)
        cachepath = cache_path(name, environ=environ)
        pth.ensure_directory(pth.data_path([name, timestr], environ=environ))
        pth.ensure_directory(cachepath)
        with dataframe_cache(cachepath, clean_on_failure=False) as cache, \
                ExitStack() as stack:
            # we use `clean_on_failure=False` so that we don't purge the
            # cache directory if the load fails in the middle
            if bundle.create_writers:
                wd = stack.enter_context(working_dir(
                    pth.data_path([], environ=environ))
                )
                daily_bars_path = wd.ensure_dir(
                    *daily_equity_relative(
                        name, timestr, environ=environ,
                    )
                )
                daily_bar_writer = BcolzDailyBarWriter(
                    daily_bars_path,
                    nyse_cal,
                    bundle.calendar[0],
                    bundle.calendar[-1]
                )
                # Do an empty write to ensure that the daily ctables exist
                # when we create the SQLiteAdjustmentWriter below. The
                # SQLiteAdjustmentWriter needs to open the daily ctables so
                # that it can compute the adjustment ratios for the dividends.

                daily_bar_writer.write(())
                minute_bar_writer = BcolzMinuteBarWriter(
                    bundle.calendar[0],
                    wd.ensure_dir(*minute_equity_relative(
                        name, timestr, environ=environ)
                    ),
                    bundle.opens,
                    bundle.closes,
                    minutes_per_day=bundle.minutes_per_day,
                )
                asset_db_writer = AssetDBWriter(
                    wd.getpath(*asset_db_relative(
                        name, timestr, environ=environ,
                    ))
                )

                adjustment_db_writer = stack.enter_context(
                    SQLiteAdjustmentWriter(
                        wd.getpath(*adjustment_db_relative(
                            name, timestr, environ=environ)),
                        BcolzDailyBarReader(daily_bars_path),
                        bundle.calendar,
                        overwrite=True,
                    )
                )
            else:
                daily_bar_writer = None
                minute_bar_writer = None
                asset_db_writer = None
                adjustment_db_writer = None
            bundle.ingest(
                environ,
                asset_db_writer,
                minute_bar_writer,
                daily_bar_writer,
                adjustment_db_writer,
                bundle.calendar,
                cache,
                show_progress,
                pth.data_path([name, timestr], environ=environ),
            )

    def most_recent_data(bundle_name, timestamp, environ=None):
        """Get the path to the most recent data after ``date``for the
        given bundle.

        Parameters
        ----------
        bundle_name : str
            The name of the bundle to lookup.
        timestamp : datetime
            The timestamp to begin searching on or before.
        environ : dict, optional
            An environment dict to forward to zipline_root.
        """
        if bundle_name not in bundles:
            raise UnknownBundle(bundle_name)

        try:
            candidates = os.listdir(
                pth.data_path([bundle_name], environ=environ),
            )
            return pth.data_path(
                [bundle_name,
                 max(
                     filter(complement(pth.hidden), candidates),
                     key=from_bundle_ingest_dirname,
                 )],
                environ=environ,
            )
        except (ValueError, OSError) as e:
            if getattr(e, 'errno', errno.ENOENT) != errno.ENOENT:
                raise
            raise ValueError(
                'no data for bundle {bundle!r} on or before {timestamp}\n'
                'maybe you need to run: $ zipline ingest -b {bundle}'.format(
                    bundle=bundle_name,
                    timestamp=timestamp,
                ),
            )

    def load(name, environ=os.environ, timestamp=None):
        """Loads a previously ingested bundle.

        Parameters
        ----------
        name : str
            The name of the bundle.
        environ : mapping, optional
            The environment variables. Defaults to os.environ.
        timestamp : datetime, optional
            The timestamp of the data to lookup.
            Defaults to the current time.

        Returns
        -------
        bundle_data : BundleData
            The raw data readers for this bundle.
        """
        if timestamp is None:
            timestamp = pd.Timestamp.utcnow()
        timestr = most_recent_data(name, timestamp, environ=environ)
        return BundleData(
            asset_finder=AssetFinder(
                asset_db_path(name, timestr, environ=environ),
            ),
            equity_minute_bar_reader=BcolzMinuteBarReader(
                minute_equity_path(name, timestr, environ=environ),
            ),
            equity_daily_bar_reader=BcolzDailyBarReader(
                daily_equity_path(name, timestr, environ=environ),
            ),
            adjustment_reader=SQLiteAdjustmentReader(
                adjustment_db_path(name, timestr, environ=environ),
            ),
        )

    @preprocess(
        before=optionally(ensure_timestamp),
        after=optionally(ensure_timestamp),
    )
    def clean(name,
              before=None,
              after=None,
              keep_last=None,
              environ=os.environ):
        """Clean up data that was created with ``ingest`` or
        ``$ python -m zipline ingest``

        Parameters
        ----------
        name : str
            The name of the bundle to remove data for.
        before : datetime, optional
            Remove data ingested before this date.
            This argument is mutually exclusive with: keep_last
        after : datetime, optional
            Remove data ingested after this date.
            This argument is mutually exclusive with: keep_last
        keep_last : int, optional
            Remove all but the last ``keep_last`` ingestions.
            This argument is mutually exclusive with:
              before
              after
        environ : mapping, optional
            The environment variables. Defaults to os.environ.

        Returns
        -------
        cleaned : set[str]
            The names of the runs that were removed.

        Raises
        ------
        BadClean
            Raised when ``before`` and/or ``after`` are passed with
            ``keep_last``. This is a subclass of ``ValueError``.
        """
        try:
            all_runs = sorted(
                filter(
                    complement(pth.hidden),
                    os.listdir(pth.data_path([name], environ=environ)),
                ),
                key=from_bundle_ingest_dirname,
            )
        except OSError as e:
            if e.errno != errno.ENOENT:
                raise
            raise UnknownBundle(name)
        if ((before is not None or after is not None) and
                keep_last is not None):
            raise BadClean(before, after, keep_last)

        if keep_last is None:
            def should_clean(name):
                dt = from_bundle_ingest_dirname(name)
                return (
                    (before is not None and dt < before) or
                    (after is not None and dt > after)
                )

        else:
            last_n_dts = set(all_runs[-keep_last:])

            def should_clean(name):
                return name not in last_n_dts

        cleaned = set()
        for run in all_runs:
            if should_clean(run):
                path = pth.data_path([name, run], environ=environ)
                shutil.rmtree(path)
                cleaned.add(path)

        return cleaned

    return BundleCore(bundles, register, unregister, ingest, load, clean)
Example #11
def _make_metrics_set_core():
    """Create a family of metrics sets functions that read from the same
    metrics set mapping.

    Returns
    -------
    metrics_sets : mappingproxy
        The mapping of metrics sets to load functions.
    register : callable
        The function which registers new metrics sets in the ``metrics_sets``
        mapping.
    unregister : callable
        The function which deregisters metrics sets from the ``metrics_sets``
        mapping.
    load : callable
        The function which loads the ingested metrics sets back into memory.
    """
    _metrics_sets = {}
    # Expose _metrics_sets through a proxy so that users cannot mutate this
    # accidentally. Users may go through `register` to update this which will
    # warn when trampling another metrics set.
    metrics_sets = mappingproxy(_metrics_sets)

    def register(name, function=None):
        """Register a new metrics set.

        Parameters
        ----------
        name : str
            The name of the metrics set
        function : callable
            The callable which produces the metrics set.

        Notes
        -----
        This may be used as a decorator if only ``name`` is passed.

        See Also
        --------
        zipline.finance.metrics.get_metrics_set
        zipline.finance.metrics.unregister_metrics_set
        """
        if function is None:
            # allow as decorator with just name.
            return partial(register, name)

        if name in _metrics_sets:
            raise ValueError("metrics set %r is already registered" % name)

        _metrics_sets[name] = function

        return function

    def unregister(name):
        """Unregister an existing metrics set.

        Parameters
        ----------
        name : str
            The name of the metrics set.

        Raises
        ------
        ValueError
            Raised when no metrics set was registered as ``name``.

        See Also
        --------
        zipline.finance.metrics.register_metrics_set
        """
        try:
            del _metrics_sets[name]
        except KeyError:
            raise ValueError(
                "metrics set %r was not already registered" % name, )

    def load(name):
        """Return an instance of the metrics set registered with the given name.

        Returns
        -------
        metrics : set[Metric]
            A new instance of the metrics set.

        Raises
        ------
        ValueError
            Raised when no metrics set is registered to ``name``
        """
        try:
            function = _metrics_sets[name]
        except KeyError:
            raise ValueError(
                "no metrics set registered as %r, options are: %r" % (
                    name,
                    sorted(_metrics_sets),
                ), )

        return function()

    return metrics_sets, register, unregister, load
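
Since ``register`` returns ``partial(register, name)`` when called with only a name, it works both as a plain function and as a decorator. A small sketch using the core returned above (the metrics-set name is illustrative):

metrics_sets, register, unregister, load = _make_metrics_set_core()

@register('default')
def default_metrics():
    return set()   # a real metrics set would return a set of Metric objects

assert load('default') == set()   # load calls the registered function
unregister('default')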
Example #12
def _make_bundle_core():
    """Create a family of data bundle functions that read from the same
    bundle mapping.

    Returns
    -------
    bundles : mappingproxy
        The mapping of bundles to bundle payloads.
    register : callable
        The function which registers new bundles in the ``bundles`` mapping.
    unregister : callable
        The function which deregisters bundles from the ``bundles`` mapping.
    ingest : callable
        The function which downloads and writes data for a given data bundle.
    load : callable
        The function which loads the ingested bundles back into memory.
    """
    _bundles = {}  # the registered bundles
    # Expose _bundles through a proxy so that users cannot mutate this
    # accidentally. Users may go through `register` to update this which will
    # warn when trampling another bundle.
    bundles = mappingproxy(_bundles)

    @curry
    def register(name,
                 f,
                 calendar_name='NYSE',
                 start_session=None,
                 end_session=None,
                 minutes_per_day=390,
                 create_writers=True):
        """Register a data bundle ingest function.

        Parameters
        ----------
        name : str
            The name of the bundle.
        f : callable
            The ingest function. This function will be passed:

              environ : mapping
                  The environment this is being run with.
              asset_db_writer : AssetDBWriter
                  The asset db writer to write into.
              minute_bar_writer : BcolzMinuteBarWriter
                  The minute bar writer to write into.
              daily_bar_writer : BcolzDailyBarWriter
                  The daily bar writer to write into.
              adjustment_writer : SQLiteAdjustmentWriter
                  The adjustment db writer to write into.
              calendar : trading_calendars.TradingCalendar
                  The trading calendar to ingest for.
              start_session : pd.Timestamp
                  The first session of data to ingest.
              end_session : pd.Timestamp
                  The last session of data to ingest.
              cache : DataFrameCache
                  A mapping object to temporarily store dataframes.
                  This should be used to cache intermediates in case the load
                  fails. This will be automatically cleaned up after a
                  successful load.
        calendar_name : str, optional
            The name of a calendar used to align bundle data.
            Default is 'NYSE'.
        start_session : pd.Timestamp, optional
            The first session for which we want data. If not provided,
            or if the date lies outside the range supported by the
            calendar, the first_session of the calendar is used.
        end_session : pd.Timestamp, optional
            The last session for which we want data. If not provided,
            or if the date lies outside the range supported by the
            calendar, the last_session of the calendar is used.
        minutes_per_day : int, optional
            The number of minutes in each normal trading day.
        create_writers : bool, optional
            Should the ingest machinery create the writers for the ingest
            function. This can be disabled as an optimization for cases where
            they are not needed.

        Notes
        -----
        This function may be used as a decorator, for example:

        .. code-block:: python

           @register('quandl')
           def quandl_ingest_function(...):
               ...

        See Also
        --------
        zipline.data.bundles.bundles
        """
        if name in bundles:
            warnings.warn(
                'Overwriting bundle with name %r' % name,
                stacklevel=3,
            )

        # NOTE: We don't eagerly compute calendar values here because
        # `register` is called at module scope in zipline, and creating a
        # calendar currently takes between 0.5 and 1 seconds, which causes a
        # noticeable delay on the zipline CLI.
        _bundles[name] = RegisteredBundle(
            calendar_name=calendar_name,
            start_session=start_session,
            end_session=end_session,
            minutes_per_day=minutes_per_day,
            ingest=f,
            create_writers=create_writers,
        )
        return f

    def unregister(name):
        """Unregister a bundle.

        Parameters
        ----------
        name : str
            The name of the bundle to unregister.

        Raises
        ------
        UnknownBundle
            Raised when no bundle has been registered with the given name.

        See Also
        --------
        zipline.data.bundles.bundles
        """
        try:
            del _bundles[name]
        except KeyError:
            raise UnknownBundle(name)

    def ingest(name, environ=os.environ, timestamp=None):
        """Ingest data for a given bundle.

        Parameters
        ----------
        name : str
            The name of the bundle.
        environ : mapping, optional
            The environment variables. By default this is os.environ.
        timestamp : datetime, optional
            The timestamp to use for the load.
            By default this is the current time.
        """
        try:
            bundle = bundles[name]
        except KeyError:
            raise UnknownBundle(name)

        calendar = get_calendar(bundle.calendar_name)

        start_session = bundle.start_session
        end_session = bundle.end_session

        if start_session is None or start_session < calendar.first_session:
            start_session = calendar.first_session

        if end_session is None or end_session > calendar.last_session:
            end_session = calendar.last_session

        if timestamp is None:
            timestamp = pd.Timestamp.utcnow()
        timestamp = timestamp.tz_convert('utc').tz_localize(None)

        timestr = to_bundle_ingest_dirname(timestamp)
        cachepath = cache_path(name, environ=environ)
        pth.ensure_directory(pth.data_path([name, timestr], environ=environ))
        pth.ensure_directory(cachepath)
        with dataframe_cache(cachepath, clean_on_failure=False) as cache, \
                ExitStack() as stack:
            # we use `clean_on_failure=False` so that we don't purge the
            # cache directory if the load fails in the middle
            if bundle.create_writers:
                wd = stack.enter_context(
                    working_dir(pth.data_path([], environ=environ)))
                daily_bars_path = wd.ensure_dir(
                    *daily_equity_relative(name, timestr))
                daily_bar_writer = BcolzDailyBarWriter(
                    daily_bars_path,
                    calendar,
                    start_session,
                    end_session,
                )
                # Do an empty write to ensure that the daily ctables exist
                # when we create the SQLiteAdjustmentWriter below. The
                # SQLiteAdjustmentWriter needs to open the daily ctables so
                # that it can compute the adjustment ratios for the dividends.

                daily_bar_writer.write(())
                minute_bar_writer = BcolzMinuteBarWriter(
                    wd.ensure_dir(*minute_equity_relative(name, timestr)),
                    calendar,
                    start_session,
                    end_session,
                    minutes_per_day=bundle.minutes_per_day,
                )
                assets_db_path = wd.getpath(*asset_db_relative(name, timestr))
                asset_db_writer = AssetDBWriter(assets_db_path)

                adjustment_db_writer = stack.enter_context(
                    SQLiteAdjustmentWriter(
                        wd.getpath(*adjustment_db_relative(name, timestr)),
                        BcolzDailyBarReader(daily_bars_path),
                        overwrite=True,
                    ))
            else:
                daily_bar_writer = None
                minute_bar_writer = None
                asset_db_writer = None
                adjustment_db_writer = None

            bundle.ingest(
                environ,
                asset_db_writer,
                minute_bar_writer,
                daily_bar_writer,
                adjustment_db_writer,
                calendar,
                start_session,
                end_session,
                cache,
                pth.data_path([name, timestr], environ=environ),
            )

    def most_recent_data(bundle_name, timestamp, environ=None):
        """Get the path to the most recent data after ``date``for the
        given bundle.

        Parameters
        ----------
        bundle_name : str
            The name of the bundle to lookup.
        timestamp : datetime
            The timestamp to begin searching on or before.
        environ : dict, optional
            An environment dict to forward to zipline_root.
        """
        if bundle_name not in bundles:
            raise UnknownBundle(bundle_name)

        try:
            candidates = os.listdir(
                pth.data_path([bundle_name], environ=environ), )
            return pth.data_path(
                [
                    bundle_name,
                    max(
                        filter(complement(pth.hidden), candidates),
                        key=from_bundle_ingest_dirname,
                    )
                ],
                environ=environ,
            )
        except (ValueError, OSError) as e:
            if getattr(e, 'errno', errno.ENOENT) != errno.ENOENT:
                raise
            raise ValueError(
                'no data for bundle {bundle!r} on or before {timestamp}\n'
                'maybe you need to run: $ zipline ingest -b {bundle}'.format(
                    bundle=bundle_name,
                    timestamp=timestamp,
                ), )

    def load(name,
             environ=os.environ,
             timestamp=None,
             daily_bar_reader_kwargs={},
             minute_bar_reader_kwargs={}):
        """Loads a previously ingested bundle.

        Parameters
        ----------
        name : str
            The name of the bundle.
        environ : mapping, optional
            The environment variables. Defaults to os.environ.
        timestamp : datetime, optional
            The timestamp of the data to lookup.
            Defaults to the current time.
        daily_bar_reader_kwargs : dict, optional
            Extra keyword arguments forwarded to ``BcolzDailyBarReader``.
        minute_bar_reader_kwargs : dict, optional
            Extra keyword arguments forwarded to ``BcolzMinuteBarReader``.

        Returns
        -------
        bundle_data : BundleData
            The raw data readers for this bundle.
        """
        if timestamp is None:
            timestamp = pd.Timestamp.utcnow()
        timestr = most_recent_data(name, timestamp, environ=environ)
        return BundleData(
            asset_finder=AssetFinder(
                asset_db_path(name, timestr, environ=environ), ),
            equity_minute_bar_reader=BcolzMinuteBarReader(
                minute_equity_path(name, timestr, environ=environ),
                **minute_bar_reader_kwargs),
            equity_daily_bar_reader=BcolzDailyBarReader(
                daily_equity_path(name, timestr, environ=environ),
                **daily_bar_reader_kwargs),
            adjustment_reader=SQLiteAdjustmentReader(
                adjustment_db_path(name, timestr, environ=environ), ),
        )

    return BundleCore(bundles, register, unregister, ingest, load)
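
The extra ``daily_bar_reader_kwargs`` and ``minute_bar_reader_kwargs`` mappings in this variant are forwarded to the bar readers. A hedged sketch, assuming a bundle named 'quandl' was registered with this core and previously ingested:

bundles, register, unregister, ingest, load = _make_bundle_core()

data = load(
    'quandl',
    minute_bar_reader_kwargs={
        # sid_cache_sizes is the BcolzMinuteBarReader parameter shown above
        'sid_cache_sizes': {'open': 100, 'high': 100, 'low': 100,
                            'close': 100, 'volume': 100},
    },
)
data.equity_daily_bar_reader   # BcolzDailyBarReader
data.adjustment_reader         # SQLiteAdjustmentReader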