def test_optionally(self): error = TypeError("arg must be int") def preprocessor(func, argname, arg): if not isinstance(arg, int): raise error return arg @preprocess(a=optionally(preprocessor)) def f(a): return a assert f(1) == 1 assert f(None) is None with pytest.raises(TypeError, match=str(error)): f("a")
def test_optionally(self):
    error = TypeError('arg must be int')

    def preprocessor(func, argname, arg):
        if not isinstance(arg, int):
            raise error
        return arg

    @preprocess(a=optionally(preprocessor))
    def f(a):
        return a

    self.assertIs(f(1), 1)
    self.assertIsNone(f(None))

    with self.assertRaises(TypeError) as e:
        f('a')
    self.assertIs(e.exception, error)
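# For context, a minimal sketch of how a helper like ``optionally`` can wrap a
# preprocessor so that ``None`` passes through untouched, matching the
# behaviour exercised by the tests above. This is an illustrative
# re-implementation, not necessarily zipline's actual code.
def optionally(preprocessor):
    def wrapper(func, argname, arg):
        # Short-circuit on None; otherwise defer to the wrapped preprocessor.
        return arg if arg is None else preprocessor(func, argname, arg)
    return wrapper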
def _make_bundle_core():
    """Create a family of data bundle functions that read from the same
    bundle mapping.

    Returns
    -------
    bundles : mappingproxy
        The mapping of bundles to bundle payloads.
    register : callable
        The function which registers new bundles in the ``bundles`` mapping.
    unregister : callable
        The function which deregisters bundles from the ``bundles`` mapping.
    ingest : callable
        The function which downloads and writes data for a given data bundle.
    load : callable
        The function which loads the ingested bundles back into memory.
    clean : callable
        The function which cleans up data written with ``ingest``.
    """
    _bundles = {}  # the registered bundles
    # Expose _bundles through a proxy so that users cannot mutate this
    # accidentally. Users may go through `register` to update this which will
    # warn when trampling another bundle.
    bundles = mappingproxy(_bundles)

    @curry
    def register(name,
                 f,
                 calendar='NYSE',
                 start_session=None,
                 end_session=None,
                 minutes_per_day=390,
                 create_writers=True):
        """Register a data bundle ingest function.

        Parameters
        ----------
        name : str
            The name of the bundle.
        f : callable
            The ingest function. This function will be passed:

            environ : mapping
                The environment this is being run with.
            asset_db_writer : AssetDBWriter
                The asset db writer to write into.
            minute_bar_writer : BcolzMinuteBarWriter
                The minute bar writer to write into.
            daily_bar_writer : BcolzDailyBarWriter
                The daily bar writer to write into.
            adjustment_writer : SQLiteAdjustmentWriter
                The adjustment db writer to write into.
            calendar : zipline.utils.calendars.TradingCalendar
                The trading calendar to ingest for.
            start_session : pd.Timestamp
                The first session of data to ingest.
            end_session : pd.Timestamp
                The last session of data to ingest.
            cache : DataFrameCache
                A mapping object to temporarily store dataframes.
                This should be used to cache intermediates in case the load
                fails. This will be automatically cleaned up after a
                successful load.
            show_progress : bool
                Show the progress for the current load where possible.
        calendar : zipline.utils.calendars.TradingCalendar or str, optional
            The trading calendar to align the data to, or the name of a
            trading calendar. This defaults to 'NYSE', in which case we use
            the NYSE calendar.
        start_session : pd.Timestamp, optional
            The first session for which we want data. If not provided,
            or if the date lies outside the range supported by the
            calendar, the first_session of the calendar is used.
        end_session : pd.Timestamp, optional
            The last session for which we want data. If not provided,
            or if the date lies outside the range supported by the
            calendar, the last_session of the calendar is used.
        minutes_per_day : int, optional
            The number of minutes in each normal trading day.
        create_writers : bool, optional
            Should the ingest machinery create the writers for the ingest
            function. This can be disabled as an optimization for cases where
            they are not needed, like the ``quantopian-quandl`` bundle.

        Notes
        -----
        This function may be used as a decorator, for example:

        .. code-block:: python

           @register('quandl')
           def quandl_ingest_function(...):
               ...

        See Also
        --------
        zipline.data.bundles.bundles
        """
        if name in bundles:
            warnings.warn(
                'Overwriting bundle with name %r' % name,
                stacklevel=3,
            )

        if isinstance(calendar, str):
            calendar = get_calendar(calendar)

        # If the start and end sessions are not provided or lie outside
        # the bounds of the calendar being used, set them to the first
        # and last sessions of the calendar.
        if start_session is None or start_session < calendar.first_session:
            start_session = calendar.first_session

        if end_session is None or end_session > calendar.last_session:
            end_session = calendar.last_session

        _bundles[name] = _BundlePayload(
            calendar,
            start_session,
            end_session,
            minutes_per_day,
            f,
            create_writers,
        )
        return f

    def unregister(name):
        """Unregister a bundle.

        Parameters
        ----------
        name : str
            The name of the bundle to unregister.

        Raises
        ------
        UnknownBundle
            Raised when no bundle has been registered with the given name.

        See Also
        --------
        zipline.data.bundles.bundles
        """
        try:
            del _bundles[name]
        except KeyError:
            raise UnknownBundle(name)

    def ingest(name, environ=os.environ, timestamp=None, show_progress=False):
        """Ingest data for a given bundle.

        Parameters
        ----------
        name : str
            The name of the bundle.
        environ : mapping, optional
            The environment variables. By default this is os.environ.
        timestamp : datetime, optional
            The timestamp to use for the load. By default this is the current
            time.
        show_progress : bool, optional
            Tell the ingest function to display the progress where possible.
        """
        try:
            bundle = bundles[name]
        except KeyError:
            raise UnknownBundle(name)

        if timestamp is None:
            timestamp = pd.Timestamp.utcnow()
        timestamp = timestamp.tz_convert('utc').tz_localize(None)

        timestr = to_bundle_ingest_dirname(timestamp)
        cachepath = cache_path(name, environ=environ)
        pth.ensure_directory(pth.data_path([name, timestr], environ=environ))
        pth.ensure_directory(cachepath)
        with dataframe_cache(cachepath, clean_on_failure=False) as cache, \
                ExitStack() as stack:
            # we use `clean_on_failure=False` so that we don't purge the
            # cache directory if the load fails in the middle
            if bundle.create_writers:
                wd = stack.enter_context(
                    working_dir(pth.data_path([], environ=environ)))
                daily_bars_path = wd.ensure_dir(*daily_equity_relative(
                    name, timestr, environ=environ,
                ))
                daily_bar_writer = BcolzDailyBarWriter(
                    daily_bars_path,
                    bundle.calendar,
                    bundle.start_session,
                    bundle.end_session,
                )
                # Do an empty write to ensure that the daily ctables exist
                # when we create the SQLiteAdjustmentWriter below. The
                # SQLiteAdjustmentWriter needs to open the daily ctables so
                # that it can compute the adjustment ratios for the dividends.
                daily_bar_writer.write(())
                minute_bar_writer = BcolzMinuteBarWriter(
                    wd.ensure_dir(*minute_equity_relative(
                        name, timestr, environ=environ)),
                    bundle.calendar,
                    bundle.start_session,
                    bundle.end_session,
                    minutes_per_day=bundle.minutes_per_day,
                )
                asset_db_writer = AssetDBWriter(
                    wd.getpath(*asset_db_relative(
                        name, timestr, environ=environ,
                    )))

                adjustment_db_writer = stack.enter_context(
                    SQLiteAdjustmentWriter(
                        wd.getpath(*adjustment_db_relative(
                            name, timestr, environ=environ)),
                        BcolzDailyBarReader(daily_bars_path),
                        bundle.calendar.all_sessions,
                        overwrite=True,
                    ))
            else:
                daily_bar_writer = None
                minute_bar_writer = None
                asset_db_writer = None
                adjustment_db_writer = None

            bundle.ingest(
                environ,
                asset_db_writer,
                minute_bar_writer,
                daily_bar_writer,
                adjustment_db_writer,
                bundle.calendar,
                bundle.start_session,
                bundle.end_session,
                cache,
                show_progress,
                pth.data_path([name, timestr], environ=environ),
            )

    def most_recent_data(bundle_name, timestamp, environ=None):
        """Get the path to the most recent data on or before ``timestamp``
        for the given bundle.

        Parameters
        ----------
        bundle_name : str
            The name of the bundle to lookup.
        timestamp : datetime
            The timestamp to begin searching on or before.
        environ : dict, optional
            An environment dict to forward to zipline_root.
        """
        if bundle_name not in bundles:
            raise UnknownBundle(bundle_name)

        try:
            candidates = os.listdir(
                pth.data_path([bundle_name], environ=environ),
            )
            return pth.data_path(
                [
                    bundle_name,
                    max(
                        filter(complement(pth.hidden), candidates),
                        key=from_bundle_ingest_dirname,
                    ),
                ],
                environ=environ,
            )
        except (ValueError, OSError) as e:
            if getattr(e, 'errno', errno.ENOENT) != errno.ENOENT:
                raise
            raise ValueError(
                'no data for bundle {bundle!r} on or before {timestamp}\n'
                'maybe you need to run: $ zipline ingest -b {bundle}'.format(
                    bundle=bundle_name,
                    timestamp=timestamp,
                ),
            )

    def load(name, environ=os.environ, timestamp=None):
        """Loads a previously ingested bundle.

        Parameters
        ----------
        name : str
            The name of the bundle.
        environ : mapping, optional
            The environment variables. Defaults to os.environ.
        timestamp : datetime, optional
            The timestamp of the data to lookup. Defaults to the current time.

        Returns
        -------
        bundle_data : BundleData
            The raw data readers for this bundle.
        """
        if timestamp is None:
            timestamp = pd.Timestamp.utcnow()
        timestr = most_recent_data(name, timestamp, environ=environ)
        return BundleData(
            asset_finder=AssetFinder(
                asset_db_path(name, timestr, environ=environ),
            ),
            equity_minute_bar_reader=BcolzMinuteBarReader(
                minute_equity_path(name, timestr, environ=environ),
            ),
            equity_daily_bar_reader=BcolzDailyBarReader(
                daily_equity_path(name, timestr, environ=environ),
            ),
            adjustment_reader=SQLiteAdjustmentReader(
                adjustment_db_path(name, timestr, environ=environ),
            ),
        )

    @preprocess(
        before=optionally(ensure_timestamp),
        after=optionally(ensure_timestamp),
    )
    def clean(name, before=None, after=None, keep_last=None,
              environ=os.environ):
        """Clean up data that was created with ``ingest`` or
        ``$ python -m zipline ingest``

        Parameters
        ----------
        name : str
            The name of the bundle to remove data for.
        before : datetime, optional
            Remove data ingested before this date.
            This argument is mutually exclusive with: keep_last
        after : datetime, optional
            Remove data ingested after this date.
            This argument is mutually exclusive with: keep_last
        keep_last : int, optional
            Remove all but the last ``keep_last`` ingestions.
            This argument is mutually exclusive with:
              before
              after
        environ : mapping, optional
            The environment variables. Defaults to os.environ.

        Returns
        -------
        cleaned : set[str]
            The names of the runs that were removed.

        Raises
        ------
        BadClean
            Raised when ``before`` and/or ``after`` are passed with
            ``keep_last``. This is a subclass of ``ValueError``.
        """
        try:
            all_runs = sorted(
                filter(
                    complement(pth.hidden),
                    os.listdir(pth.data_path([name], environ=environ)),
                ),
                key=from_bundle_ingest_dirname,
            )
        except OSError as e:
            if e.errno != errno.ENOENT:
                raise
            raise UnknownBundle(name)

        if ((before is not None or after is not None) and
                keep_last is not None):
            raise BadClean(before, after, keep_last)

        if keep_last is None:
            def should_clean(name):
                dt = from_bundle_ingest_dirname(name)
                return ((before is not None and dt < before) or
                        (after is not None and dt > after))

        elif keep_last >= 0:
            last_n_dts = set(take(keep_last, reversed(all_runs)))

            def should_clean(name):
                return name not in last_n_dts
        else:
            raise BadClean(before, after, keep_last)

        cleaned = set()
        for run in all_runs:
            if should_clean(run):
                path = pth.data_path([name, run], environ=environ)
                shutil.rmtree(path)
                cleaned.add(path)

        return cleaned

    return BundleCore(bundles, register, unregister, ingest, load, clean)
def _make_bundle_core(): """Create a family of data bundle functions that read from the same bundle mapping. Returns ------- bundles : mappingproxy The mapping of bundles to bundle payloads. register : callable The function which registers new bundles in the ``bundles`` mapping. unregister : callable The function which deregisters bundles from the ``bundles`` mapping. ingest : callable The function which downloads and write data for a given data bundle. load : callable The function which loads the ingested bundles back into memory. clean : callable The function which cleans up data written with ``ingest``. """ _bundles = {} # the registered bundles # Expose _bundles through a proxy so that users cannot mutate this # accidentally. Users may go through `register` to update this which will # warn when trampling another bundle. bundles = mappingproxy(_bundles) @curry def register(name, f, calendar=trading_days, opens=open_and_closes['market_open'], closes=open_and_closes['market_close'], minutes_per_day=390, create_writers=True): """Register a data bundle ingest function. Parameters ---------- name : str The name of the bundle. f : callable The ingest function. This function will be passed: environ : mapping The environment this is being run with. asset_db_writer : AssetDBWriter The asset db writer to write into. minute_bar_writer : BcolzMinuteBarWriter The minute bar writer to write into. daily_bar_writer : BcolzDailyBarWriter The daily bar writer to write into. adjustment_writer : SQLiteAdjustmentWriter The adjustment db writer to write into. calendar : pd.DatetimeIndex The trading calendar to ingest for. cache : DataFrameCache A mapping object to temporarily store dataframes. This should be used to cache intermediates in case the load fails. This will be automatically cleaned up after a successful load. show_progress : bool Show the progress for the current load where possible. calendar : pd.DatetimeIndex, optional The exchange calendar to align the data to. This defaults to the NYSE calendar. market_open : pd.DatetimeIndex, optional The minute when the market opens each day. This defaults to the NYSE calendar. market_close : pd.DatetimeIndex, optional The minute when the market closes each day. This defaults to the NYSE calendar. minutes_per_day : int, optional The number of minutes in each normal trading day. create_writers : bool, optional Should the ingest machinery create the writers for the ingest function. This can be disabled as an optimization for cases where they are not needed, like the ``quantopian-quandl`` bundle. Notes ----- This function my be used as a decorator, for example: .. code-block:: python @register('quandl') def quandl_ingest_function(...): ... See Also -------- zipline.data.bundles.bundles """ if name in bundles: warnings.warn( 'Overwriting bundle with name %r' % name, stacklevel=3, ) _bundles[name] = _BundlePayload( calendar, opens, closes, minutes_per_day, f, create_writers, ) return f def unregister(name): """Unregister a bundle. Parameters ---------- name : str The name of the bundle to unregister. Raises ------ UnknownBundle Raised when no bundle has been registered with the given name. See Also -------- zipline.data.bundles.bundles """ try: del _bundles[name] except KeyError: raise UnknownBundle(name) def ingest(name, environ=os.environ, timestamp=None, show_progress=False): """Ingest data for a given bundle. Parameters ---------- name : str The name of the bundle. environ : mapping, optional The environment variables. By default this is os.environ. 
timestamp : datetime, optional The timestamp to use for the load. By default this is the current time. show_progress : bool, optional Tell the ingest function to display the progress where possible. """ try: bundle = bundles[name] except KeyError: raise UnknownBundle(name) if timestamp is None: timestamp = pd.Timestamp.utcnow() timestamp = timestamp.tz_convert('utc').tz_localize(None) timestr = to_bundle_ingest_dirname(timestamp) cachepath = cache_path(name, environ=environ) pth.ensure_directory(pth.data_path([name, timestr], environ=environ)) pth.ensure_directory(cachepath) with dataframe_cache(cachepath, clean_on_failure=False) as cache, \ ExitStack() as stack: # we use `cleanup_on_failure=False` so that we don't purge the # cache directory if the load fails in the middle if bundle.create_writers: wd = stack.enter_context(working_dir( pth.data_path([], environ=environ)) ) daily_bars_path = wd.ensure_dir( *daily_equity_relative( name, timestr, environ=environ, ) ) daily_bar_writer = BcolzDailyBarWriter( daily_bars_path, nyse_cal, bundle.calendar[0], bundle.calendar[-1] ) # Do an empty write to ensure that the daily ctables exist # when we create the SQLiteAdjustmentWriter below. The # SQLiteAdjustmentWriter needs to open the daily ctables so # that it can compute the adjustment ratios for the dividends. daily_bar_writer.write(()) minute_bar_writer = BcolzMinuteBarWriter( bundle.calendar[0], wd.ensure_dir(*minute_equity_relative( name, timestr, environ=environ) ), bundle.opens, bundle.closes, minutes_per_day=bundle.minutes_per_day, ) asset_db_writer = AssetDBWriter( wd.getpath(*asset_db_relative( name, timestr, environ=environ, )) ) adjustment_db_writer = stack.enter_context( SQLiteAdjustmentWriter( wd.getpath(*adjustment_db_relative( name, timestr, environ=environ)), BcolzDailyBarReader(daily_bars_path), bundle.calendar, overwrite=True, ) ) else: daily_bar_writer = None minute_bar_writer = None asset_db_writer = None adjustment_db_writer = None bundle.ingest( environ, asset_db_writer, minute_bar_writer, daily_bar_writer, adjustment_db_writer, bundle.calendar, cache, show_progress, pth.data_path([name, timestr], environ=environ), ) def most_recent_data(bundle_name, timestamp, environ=None): """Get the path to the most recent data after ``date``for the given bundle. Parameters ---------- bundle_name : str The name of the bundle to lookup. timestamp : datetime The timestamp to begin searching on or before. environ : dict, optional An environment dict to forward to zipline_root. """ if bundle_name not in bundles: raise UnknownBundle(bundle_name) try: candidates = os.listdir( pth.data_path([bundle_name], environ=environ), ) return pth.data_path( [bundle_name, max( filter(complement(pth.hidden), candidates), key=from_bundle_ingest_dirname, )], environ=environ, ) except (ValueError, OSError) as e: if getattr(e, 'errno', errno.ENOENT) != errno.ENOENT: raise raise ValueError( 'no data for bundle {bundle!r} on or before {timestamp}\n' 'maybe you need to run: $ zipline ingest -b {bundle}'.format( bundle=bundle_name, timestamp=timestamp, ), ) def load(name, environ=os.environ, timestamp=None): """Loads a previously ingested bundle. Parameters ---------- name : str The name of the bundle. environ : mapping, optional The environment variables. Defaults of os.environ. timestamp : datetime, optional The timestamp of the data to lookup. Defaults to the current time. Returns ------- bundle_data : BundleData The raw data readers for this bundle. 
""" if timestamp is None: timestamp = pd.Timestamp.utcnow() timestr = most_recent_data(name, timestamp, environ=environ) return BundleData( asset_finder=AssetFinder( asset_db_path(name, timestr, environ=environ), ), equity_minute_bar_reader=BcolzMinuteBarReader( minute_equity_path(name, timestr, environ=environ), ), equity_daily_bar_reader=BcolzDailyBarReader( daily_equity_path(name, timestr, environ=environ), ), adjustment_reader=SQLiteAdjustmentReader( adjustment_db_path(name, timestr, environ=environ), ), ) @preprocess( before=optionally(ensure_timestamp), after=optionally(ensure_timestamp), ) def clean(name, before=None, after=None, keep_last=None, environ=os.environ): """Clean up data that was created with ``ingest`` or ``$ python -m zipline ingest`` Parameters ---------- name : str The name of the bundle to remove data for. before : datetime, optional Remove data ingested before this date. This argument is mutually exclusive with: keep_last after : datetime, optional Remove data ingested after this date. This argument is mutually exclusive with: keep_last keep_last : int, optional Remove all but the last ``keep_last`` ingestions. This argument is mutually exclusive with: before after environ : mapping, optional The environment variables. Defaults of os.environ. Returns ------- cleaned : set[str] The names of the runs that were removed. Raises ------ BadClean Raised when ``before`` and or ``after`` are passed with ``keep_last``. This is a subclass of ``ValueError``. """ try: all_runs = sorted( filter( complement(pth.hidden), os.listdir(pth.data_path([name], environ=environ)), ), key=from_bundle_ingest_dirname, ) except OSError as e: if e.errno != errno.ENOENT: raise raise UnknownBundle(name) if ((before is not None or after is not None) and keep_last is not None): raise BadClean(before, after, keep_last) if keep_last is None: def should_clean(name): dt = from_bundle_ingest_dirname(name) return ( (before is not None and dt < before) or (after is not None and dt > after) ) else: last_n_dts = set(all_runs[-keep_last:]) def should_clean(name): return name not in last_n_dts cleaned = set() for run in all_runs: if should_clean(run): path = pth.data_path([name, run], environ=environ) shutil.rmtree(path) cleaned.add(path) return cleaned return BundleCore(bundles, register, unregister, ingest, load, clean)
class BlazeEventsLoader(PipelineLoader):
    """An abstract pipeline loader for the events datasets that loads
    data from a blaze expression.

    Parameters
    ----------
    expr : Expr
        The expression representing the data to load.
    resources : dict, optional
        Mapping from the atomic terms of ``expr`` to actual data resources.
    odo_kwargs : dict, optional
        Extra keyword arguments to pass to odo when executing the expression.
    data_query_time : time, optional
        The time to use for the data query cutoff.
    data_query_tz : tzinfo or str
        The timezone to use for the data query cutoff.
    dataset : DataSet
        The DataSet object for which this loader loads data.

    Notes
    -----
    The expression should have a tabular dshape of::

       Dim * {{
           {SID_FIELD_NAME}: int64,
           {TS_FIELD_NAME}: datetime,
       }}

    And other dataset-specific fields, where each row of the table is a
    record including the sid to identify the company, the timestamp where we
    learned about the announcement, and the date when the earnings will be
    announced.

    If the '{TS_FIELD_NAME}' field is not included it is assumed that we
    start the backtest with knowledge of all announcements.
    """

    @preprocess(data_query_tz=optionally(ensure_timezone))
    def __init__(self,
                 expr,
                 resources=None,
                 odo_kwargs=None,
                 data_query_time=None,
                 data_query_tz=None,
                 dataset=None):
        dshape = expr.dshape
        if not istabular(dshape):
            raise ValueError(
                'expression dshape must be tabular, got: %s' % dshape,
            )

        expected_fields = self._expected_fields
        self._expr = bind_expression_to_resources(
            expr[list(expected_fields)],
            resources,
        )
        self._odo_kwargs = odo_kwargs if odo_kwargs is not None else {}
        self._dataset = dataset
        check_data_query_args(data_query_time, data_query_tz)
        self._data_query_time = data_query_time
        self._data_query_tz = data_query_tz

    @abc.abstractproperty
    def concrete_loader(self):
        raise NotImplementedError('concrete_loader')

    def load_adjusted_array(self, columns, dates, assets, mask):
        data_query_time = self._data_query_time
        data_query_tz = self._data_query_tz
        lower_dt, upper_dt = normalize_data_query_bounds(
            dates[0],
            dates[-1],
            data_query_time,
            data_query_tz,
        )

        raw = ffill_query_in_range(
            self._expr,
            lower_dt,
            upper_dt,
            self._odo_kwargs,
        )
        sids = raw.loc[:, SID_FIELD_NAME]
        raw.drop(
            sids[~sids.isin(assets)].index,
            inplace=True,
        )
        if data_query_time is not None:
            normalize_timestamp_to_query_time(
                raw,
                data_query_time,
                data_query_tz,
                inplace=True,
                ts_field=TS_FIELD_NAME,
            )

        gb = raw.groupby(SID_FIELD_NAME)
        return self.concrete_loader(
            dates,
            self.prepare_data(raw, gb),
            dataset=self._dataset,
        ).load_adjusted_array(columns, dates, assets, mask)

    def prepare_data(self, raw, gb):
        return {sid: raw.loc[group].drop(SID_FIELD_NAME, axis=1)
                for sid, group in gb.groups.items()}
class BlazeLoader(dict): """A PipelineLoader for datasets constructed with ``from_blaze``. Parameters ---------- dsmap : mapping, optional An initial mapping of datasets to ``ExprData`` objects. NOTE: Further mutations to this map will not be reflected by this object. data_query_time : time, optional The time to use for the data query cutoff. data_query_tz : tzinfo or str, optional The timezeone to use for the data query cutoff. pool : Pool, optional The pool to use to run blaze queries concurrently. This object must support ``imap_unordered``, ``apply`` and ``apply_async`` methods. Attributes ---------- pool : Pool The pool to use to run blaze queries concurrently. This object must support ``imap_unordered``, ``apply`` and ``apply_async`` methods. It is possible to change the pool after the loader has been constructed. This allows us to set a new pool for the ``global_loader`` like: ``global_loader.pool = multiprocessing.Pool(4)``. See Also -------- :class:`zipline.utils.pool.SequentialPool` :class:`multiprocessing.Pool` """ @preprocess(data_query_tz=optionally(ensure_timezone)) def __init__(self, dsmap=None, data_query_time=None, data_query_tz=None, pool=SequentialPool()): self.update(dsmap or {}) check_data_query_args(data_query_time, data_query_tz) self._data_query_time = data_query_time self._data_query_tz = data_query_tz # explicitly public self.pool = pool @classmethod @memoize(cache=WeakKeyDictionary()) def global_instance(cls): return cls() def __hash__(self): return id(self) def __call__(self, column): if column.dataset in self: return self raise KeyError(column) def __repr__(self): return '<%s: %s>' % ( type(self).__name__, super(BlazeLoader, self).__repr__(), ) def load_adjusted_array(self, columns, dates, assets, mask): return merge( self.pool.imap_unordered( partial(self._load_dataset, dates, assets, mask), itervalues(groupby(getdataset, columns)), ), ) def _load_dataset(self, dates, assets, mask, columns): try: (dataset, ) = set(map(getdataset, columns)) except ValueError: raise AssertionError('all columns must come from the same dataset') expr, deltas, checkpoints, odo_kwargs, apply_deltas_adjustments = self[ dataset] have_sids = (dataset.ndim == 2) asset_idx = pd.Series(index=assets, data=np.arange(len(assets))) assets = list(map(int, assets)) # coerce from numpy.int64 added_query_fields = {AD_FIELD_NAME, TS_FIELD_NAME } | ({SID_FIELD_NAME} if have_sids else set()) requested_columns = set(map(getname, columns)) colnames = sorted(added_query_fields | requested_columns) data_query_time = self._data_query_time data_query_tz = self._data_query_tz lower_dt, upper_dt = normalize_data_query_bounds( dates[0], dates[-1], data_query_time, data_query_tz, ) def collect_expr(e, lower): """Materialize the expression as a dataframe. Parameters ---------- e : Expr The baseline or deltas expression. lower : datetime The lower time bound to query. Returns ------- result : pd.DataFrame The resulting dataframe. Notes ----- This can return more data than needed. The in memory reindex will handle this. 
""" predicate = e[TS_FIELD_NAME] <= upper_dt if lower is not None: predicate &= e[TS_FIELD_NAME] >= lower return odo(e[predicate][colnames], pd.DataFrame, **odo_kwargs) lower, materialized_checkpoints = get_materialized_checkpoints( checkpoints, colnames, lower_dt, odo_kwargs) materialized_expr = self.pool.apply_async(collect_expr, (expr, lower)) materialized_deltas = (self.pool.apply(collect_expr, (deltas, lower)) if deltas is not None else pd.DataFrame(columns=colnames)) if materialized_checkpoints is not None: materialized_expr = pd.concat( ( materialized_checkpoints, materialized_expr.get(), ), ignore_index=True, copy=False, ) # It's not guaranteed that assets returned by the engine will contain # all sids from the deltas table; filter out such mismatches here. if not materialized_deltas.empty and have_sids: materialized_deltas = materialized_deltas[ materialized_deltas[SID_FIELD_NAME].isin(assets)] if data_query_time is not None: for m in (materialized_expr, materialized_deltas): m.loc[:, TS_FIELD_NAME] = m.loc[:, TS_FIELD_NAME].astype( 'datetime64[ns]') normalize_timestamp_to_query_time( m, data_query_time, data_query_tz, inplace=True, ts_field=TS_FIELD_NAME, ) # Inline the deltas that changed our most recently known value. # Also, we reindex by the dates to create a dense representation of # the data. sparse_output, non_novel_deltas = overwrite_novel_deltas( materialized_expr, materialized_deltas, dates, ) if AD_FIELD_NAME not in requested_columns: sparse_output.drop(AD_FIELD_NAME, axis=1, inplace=True) sparse_deltas = last_in_date_group(non_novel_deltas, dates, assets, reindex=False, have_sids=have_sids) dense_output = last_in_date_group(sparse_output, dates, assets, reindex=True, have_sids=have_sids) ffill_across_cols(dense_output, columns, {c.name: c.name for c in columns}) # By default, no non-novel deltas are applied. def no_adjustments_from_deltas(*args): return {} adjustments_from_deltas = no_adjustments_from_deltas if have_sids: if apply_deltas_adjustments: adjustments_from_deltas = adjustments_from_deltas_with_sids column_view = identity else: # If we do not have sids, use the column view to make a single # column vector which is unassociated with any assets. column_view = op.itemgetter(np.s_[:, np.newaxis]) if apply_deltas_adjustments: adjustments_from_deltas = adjustments_from_deltas_no_sids mask = np.full( shape=(len(mask), 1), fill_value=True, dtype=bool_dtype, ) return { column: AdjustedArray( column_view( dense_output[column.name].values.astype(column.dtype), ), mask, adjustments_from_deltas( dates, sparse_output[TS_FIELD_NAME].values, column_idx, column.name, asset_idx, sparse_deltas, ), column.missing_value, ) for column_idx, column in enumerate(columns) }
class BlazeLoader(dict): """A PipelineLoader for datasets constructed with ``from_blaze``. Parameters ---------- dsmap : mapping, optional An initial mapping of datasets to ``ExprData`` objects. NOTE: Further mutations to this map will not be reflected by this object. data_query_time : time, optional The time to use for the data query cutoff. data_query_tz : tzinfo or str The timezeone to use for the data query cutoff. """ @preprocess(data_query_tz=optionally(ensure_timezone)) def __init__(self, dsmap=None, data_query_time=None, data_query_tz=None): self.update(dsmap or {}) check_data_query_args(data_query_time, data_query_tz) self._data_query_time = data_query_time self._data_query_tz = data_query_tz @classmethod @memoize(cache=WeakKeyDictionary()) def global_instance(cls): return cls() def __hash__(self): return id(self) def __call__(self, column): if column.dataset in self: return self raise KeyError(column) def __repr__(self): return '<%s: %s>' % ( type(self).__name__, super(BlazeLoader, self).__repr__(), ) def load_adjusted_array(self, columns, dates, assets, mask): return dict( concat( map(partial(self._load_dataset, dates, assets, mask), itervalues(groupby(getdataset, columns))))) def _load_dataset(self, dates, assets, mask, columns): try: (dataset, ) = set(map(getdataset, columns)) except ValueError: raise AssertionError('all columns must come from the same dataset') expr, deltas, checkpoints, odo_kwargs = self[dataset] have_sids = SID_FIELD_NAME in expr.fields asset_idx = pd.Series(index=assets, data=np.arange(len(assets))) assets = list(map(int, assets)) # coerce from numpy.int64 added_query_fields = [AD_FIELD_NAME, TS_FIELD_NAME ] + ([SID_FIELD_NAME] if have_sids else []) colnames = added_query_fields + list(map(getname, columns)) data_query_time = self._data_query_time data_query_tz = self._data_query_tz lower_dt, upper_dt = normalize_data_query_bounds( dates[0], dates[-1], data_query_time, data_query_tz, ) def collect_expr(e, lower): """Materialize the expression as a dataframe. Parameters ---------- e : Expr The baseline or deltas expression. lower : datetime The lower time bound to query. Returns ------- result : pd.DataFrame The resulting dataframe. Notes ----- This can return more data than needed. The in memory reindex will handle this. """ predicate = e[TS_FIELD_NAME] <= upper_dt if lower is not None: predicate &= e[TS_FIELD_NAME] >= lower return odo(e[predicate][colnames], pd.DataFrame, **odo_kwargs) if checkpoints is not None: ts = checkpoints[TS_FIELD_NAME] checkpoints_ts = odo(ts[ts <= lower_dt].max(), pd.Timestamp) if pd.isnull(checkpoints_ts): materialized_checkpoints = pd.DataFrame(columns=colnames) lower = None else: materialized_checkpoints = odo( checkpoints[ts == checkpoints_ts][colnames], pd.DataFrame, **odo_kwargs) lower = checkpoints_ts else: materialized_checkpoints = pd.DataFrame(columns=colnames) lower = None materialized_expr = collect_expr(expr, lower) if materialized_checkpoints is not None: materialized_expr = pd.concat( ( materialized_checkpoints, materialized_expr, ), ignore_index=True, copy=False, ) materialized_deltas = (collect_expr(deltas, lower) if deltas is not None else pd.DataFrame(columns=colnames)) # It's not guaranteed that assets returned by the engine will contain # all sids from the deltas table; filter out such mismatches here. 
if not materialized_deltas.empty and have_sids: materialized_deltas = materialized_deltas[ materialized_deltas[SID_FIELD_NAME].isin(assets)] if data_query_time is not None: for m in (materialized_expr, materialized_deltas): m.loc[:, TS_FIELD_NAME] = m.loc[:, TS_FIELD_NAME].astype( 'datetime64[ns]') normalize_timestamp_to_query_time( m, data_query_time, data_query_tz, inplace=True, ts_field=TS_FIELD_NAME, ) # Inline the deltas that changed our most recently known value. # Also, we reindex by the dates to create a dense representation of # the data. sparse_output, non_novel_deltas = overwrite_novel_deltas( materialized_expr, materialized_deltas, dates, ) sparse_output.drop(AD_FIELD_NAME, axis=1, inplace=True) def last_in_date_group(df, reindex, have_sids=have_sids): idx = dates[dates.searchsorted( df[TS_FIELD_NAME].values.astype('datetime64[D]'))] if have_sids: idx = [idx, SID_FIELD_NAME] last_in_group = df.drop(TS_FIELD_NAME, axis=1).groupby( idx, sort=False, ).last() if have_sids: last_in_group = last_in_group.unstack() if reindex: if have_sids: cols = last_in_group.columns last_in_group = last_in_group.reindex( index=dates, columns=pd.MultiIndex.from_product( (cols.levels[0], assets), names=cols.names, ), ) else: last_in_group = last_in_group.reindex(dates) return last_in_group sparse_deltas = last_in_date_group(non_novel_deltas, reindex=False) dense_output = last_in_date_group(sparse_output, reindex=True) dense_output.ffill(inplace=True) # Fill in missing values specified by each column. This is made # significantly more complex by the fact that we need to work around # two pandas issues: # 1) When we have sids, if there are no records for a given sid for any # dates, pandas will generate a column full of NaNs for that sid. # This means that some of the columns in `dense_output` are now # float instead of the intended dtype, so we have to coerce back to # our expected type and convert NaNs into the desired missing value. # 2) DataFrame.ffill assumes that receiving None as a fill-value means # that no value was passed. Consequently, there's no way to tell # pandas to replace NaNs in an object column with None using fillna, # so we have to roll our own instead using df.where. for column in columns: # Special logic for strings since `fillna` doesn't work if the # missing value is `None`. if column.dtype == categorical_dtype: dense_output[column.name] = dense_output[column.name].where( pd.notnull(dense_output[column.name]), column.missing_value) else: # We need to execute `fillna` before `astype` in case the # column contains NaNs and needs to be cast to bool or int. # This is so that the NaNs are replaced first, since pandas # can't convert NaNs for those types. dense_output[column.name] = dense_output[column.name].fillna( column.missing_value).astype(column.dtype) if have_sids: adjustments_from_deltas = adjustments_from_deltas_with_sids column_view = identity else: # We use the column view to make an array per asset. column_view = compose( # We need to copy this because we need a concrete ndarray. # The `repeat_last_axis` call will give us a fancy strided # array which uses a buffer to represent `len(assets)` columns. # The engine puts nans at the indicies for which we do not have # sid information so that the nan-aware reductions still work. # A future change to the engine would be to add first class # support for macro econimic datasets. 
copy, partial(repeat_last_axis, count=len(assets)), ) adjustments_from_deltas = adjustments_from_deltas_no_sids for column_idx, column in enumerate(columns): column_name = column.name yield column, AdjustedArray( column_view( dense_output[column_name].values.astype(column.dtype), ), mask, adjustments_from_deltas( dates, sparse_output[TS_FIELD_NAME].values, column_idx, column_name, asset_idx, sparse_deltas, ), column.missing_value, )
class BlazeLoader(object): """A PipelineLoader for datasets constructed with ``from_blaze``. Parameters ---------- dsmap : mapping, optional An initial mapping of datasets to ``ExprData`` objects. NOTE: Further mutations to this map will not be reflected by this object. data_query_time : time, optional The time to use for the data query cutoff. data_query_tz : tzinfo or str, optional The timezeone to use for the data query cutoff. pool : Pool, optional The pool to use to run blaze queries concurrently. This object must support ``imap_unordered``, ``apply`` and ``apply_async`` methods. Attributes ---------- pool : Pool The pool to use to run blaze queries concurrently. This object must support ``imap_unordered``, ``apply`` and ``apply_async`` methods. It is possible to change the pool after the loader has been constructed. This allows us to set a new pool for the ``global_loader`` like: ``global_loader.pool = multiprocessing.Pool(4)``. See Also -------- :class:`zipline.utils.pool.SequentialPool` :class:`multiprocessing.Pool` """ @preprocess(data_query_tz=optionally(ensure_timezone)) def __init__(self, dsmap=None, data_query_time=None, data_query_tz=None, pool=SequentialPool()): check_data_query_args(data_query_time, data_query_tz) self._data_query_time = data_query_time self._data_query_tz = data_query_tz # explicitly public self.pool = pool self._table_expressions = (dsmap or {}).copy() @classmethod @memoize(cache=WeakKeyDictionary()) def global_instance(cls): return cls() def __hash__(self): return id(self) def __contains__(self, column): return column in self._table_expressions def __getitem__(self, column): return self._table_expressions[column] def __iter__(self): return iter(self._table_expressions) def __len__(self): return len(self._table_expressions) def __call__(self, column): if column in self: return self raise KeyError(column) def register_dataset(self, dataset, expr, deltas=None, checkpoints=None, odo_kwargs=None): """Explicitly map a datset to a collection of blaze expressions. Parameters ---------- dataset : DataSet The pipeline dataset to map to the given expressions. expr : Expr The baseline values. deltas : Expr, optional The deltas for the data. checkpoints : Expr, optional The forward fill checkpoints for the data. odo_kwargs : dict, optional The keyword arguments to forward to the odo calls internally. See Also -------- :func:`zipline.pipeline.loaders.blaze.from_blaze` """ expr_data = ExprData( expr, deltas, checkpoints, odo_kwargs, ) for column in dataset.columns: self._table_expressions[column] = expr_data def register_column(self, column, expr, deltas=None, checkpoints=None, odo_kwargs=None): """Explicitly map a single bound column to a collection of blaze expressions. The expressions need to have ``timestamp`` and ``as_of`` columns. Parameters ---------- column : BoundColumn The pipeline dataset to map to the given expressions. expr : Expr The baseline values. deltas : Expr, optional The deltas for the data. checkpoints : Expr, optional The forward fill checkpoints for the data. odo_kwargs : dict, optional The keyword arguments to forward to the odo calls internally. 
See Also -------- :func:`zipline.pipeline.loaders.blaze.from_blaze` """ self._table_expressions[column] = ExprData( expr, deltas, checkpoints, odo_kwargs, ) def load_adjusted_array(self, columns, dates, assets, mask): return merge( self.pool.imap_unordered( partial(self._load_dataset, dates, assets, mask), itervalues(groupby(getitem(self._table_expressions), columns)), ), ) def _load_dataset(self, dates, assets, mask, columns): try: (expr_data, ) = {self._table_expressions[c] for c in columns} except ValueError: raise AssertionError( 'all columns must share the same expression data', ) expr, deltas, checkpoints, odo_kwargs = expr_data have_sids = (first(columns).dataset.ndim == 2) added_query_fields = {AD_FIELD_NAME, TS_FIELD_NAME } | ({SID_FIELD_NAME} if have_sids else set()) requested_columns = set(map(getname, columns)) colnames = sorted(added_query_fields | requested_columns) data_query_time = self._data_query_time data_query_tz = self._data_query_tz lower_dt, upper_dt = normalize_data_query_bounds( dates[0], dates[-1], data_query_time, data_query_tz, ) def collect_expr(e, lower): """Materialize the expression as a dataframe. Parameters ---------- e : Expr The baseline or deltas expression. lower : datetime The lower time bound to query. Returns ------- result : pd.DataFrame The resulting dataframe. Notes ----- This can return more data than needed. The in memory reindex will handle this. """ predicate = e[TS_FIELD_NAME] < upper_dt if lower is not None: predicate &= e[TS_FIELD_NAME] >= lower return odo(e[predicate][colnames], pd.DataFrame, **odo_kwargs) lower, materialized_checkpoints = get_materialized_checkpoints( checkpoints, colnames, lower_dt, odo_kwargs) materialized_expr_deferred = self.pool.apply_async( collect_expr, (expr, lower), ) materialized_deltas = (self.pool.apply(collect_expr, (deltas, lower)) if deltas is not None else None) all_rows = pd.concat( filter( lambda df: df is not None, ( materialized_checkpoints, materialized_expr_deferred.get(), materialized_deltas, ), ), ignore_index=True, copy=False, ) all_rows[TS_FIELD_NAME] = all_rows[TS_FIELD_NAME].astype( 'datetime64[ns]', ) all_rows.sort_values([TS_FIELD_NAME, AD_FIELD_NAME], inplace=True) if have_sids: return adjusted_arrays_from_rows_with_assets( dates, data_query_time, data_query_tz, assets, mask, columns, all_rows, ) else: return adjusted_arrays_from_rows_without_assets( dates, data_query_time, data_query_tz, None, columns, all_rows, )
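# Hedged sketch of wiring a pipeline DataSet to blaze expressions with
# ``register_dataset``. ``MyDataSet``, the column names, and the use of
# ``bz.data`` over an in-memory dataframe are illustrative assumptions, not
# part of the loader above.
import blaze as bz
import numpy as np
import pandas as pd

from zipline.pipeline.data import Column, DataSet


class MyDataSet(DataSet):
    # A single float column; real datasets would declare one Column per field.
    value = Column(np.float64)


baseline = bz.data(pd.DataFrame({
    'sid': [1, 1],
    'asof_date': pd.to_datetime(['2014-01-01', '2014-01-02']),
    'timestamp': pd.to_datetime(['2014-01-01', '2014-01-02']),
    'value': [1.0, 2.0],
}))

loader = BlazeLoader()
loader.register_dataset(MyDataSet, baseline)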
class BlazeLoader(dict): """A PipelineLoader for datasets constructed with ``from_blaze``. Parameters ---------- dsmap : mapping, optional An initial mapping of datasets to ``ExprData`` objects. NOTE: Further mutations to this map will not be reflected by this object. data_query_time : time, optional The time to use for the data query cutoff. data_query_tz : tzinfo or str The timezeone to use for the data query cutoff. """ @preprocess(data_query_tz=optionally(ensure_timezone)) def __init__(self, dsmap=None, data_query_time=None, data_query_tz=None): self.update(dsmap or {}) check_data_query_args(data_query_time, data_query_tz) self._data_query_time = data_query_time self._data_query_tz = data_query_tz @classmethod @memoize(cache=WeakKeyDictionary()) def global_instance(cls): return cls() def __hash__(self): return id(self) def __call__(self, column): if column.dataset in self: return self raise KeyError(column) def load_adjusted_array(self, columns, dates, assets, mask): return dict( concat(map( partial(self._load_dataset, dates, assets, mask), itervalues(groupby(getdataset, columns)) )) ) def _load_dataset(self, dates, assets, mask, columns): try: (dataset,) = set(map(getdataset, columns)) except ValueError: raise AssertionError('all columns must come from the same dataset') expr, deltas, odo_kwargs = self[dataset] have_sids = SID_FIELD_NAME in expr.fields asset_idx = pd.Series(index=assets, data=np.arange(len(assets))) assets = list(map(int, assets)) # coerce from numpy.int64 added_query_fields = [AD_FIELD_NAME, TS_FIELD_NAME] + ( [SID_FIELD_NAME] if have_sids else [] ) data_query_time = self._data_query_time data_query_tz = self._data_query_tz lower_dt, upper_dt = normalize_data_query_bounds( dates[0], dates[-1], data_query_time, data_query_tz, ) def where(e): """Create the query to run against the resources. Parameters ---------- e : Expr The baseline or deltas expression. Returns ------- q : Expr The query to run. """ def lower_for_col(column): pred = e[TS_FIELD_NAME] <= lower_dt colname = column.name schema = e[colname].schema.measure if isinstance(schema, Option): pred &= e[colname].notnull() schema = schema.ty if schema in floating: pred &= ~e[colname].isnan() filtered = e[pred] lower = filtered[TS_FIELD_NAME].max() if have_sids: # If we have sids, then we need to take the earliest of the # greatest date that has a non-null value by sid. lower = bz.by( filtered[SID_FIELD_NAME], timestamp=lower, ).timestamp.min() return lower lower = odo( reduce( bz.least, map(lower_for_col, columns), ), pd.Timestamp, **odo_kwargs ) if lower is pd.NaT: lower = lower_dt return e[ (e[TS_FIELD_NAME] >= lower) & (e[TS_FIELD_NAME] <= upper_dt) ][added_query_fields + list(map(getname, columns))] def collect_expr(e): """Execute and merge all of the per-column subqueries. Parameters ---------- e : Expr The baseline or deltas expression. Returns ------- result : pd.DataFrame The resulting dataframe. Notes ----- This can return more data than needed. The in memory reindex will handle this. """ df = odo(where(e), pd.DataFrame, **odo_kwargs) df.sort(TS_FIELD_NAME, inplace=True) # sort for the groupby later return df materialized_expr = collect_expr(expr) materialized_deltas = ( collect_expr(deltas) if deltas is not None else pd.DataFrame( columns=added_query_fields + list(map(getname, columns)), ) ) # It's not guaranteed that assets returned by the engine will contain # all sids from the deltas table; filter out such mismatches here. 
if not materialized_deltas.empty and have_sids: materialized_deltas = materialized_deltas[ materialized_deltas[SID_FIELD_NAME].isin(assets) ] if data_query_time is not None: for m in (materialized_expr, materialized_deltas): m.loc[:, TS_FIELD_NAME] = m.loc[ :, TS_FIELD_NAME ].astype('datetime64[ns]') normalize_timestamp_to_query_time( m, data_query_time, data_query_tz, inplace=True, ts_field=TS_FIELD_NAME, ) # Inline the deltas that changed our most recently known value. # Also, we reindex by the dates to create a dense representation of # the data. sparse_output, non_novel_deltas = overwrite_novel_deltas( materialized_expr, materialized_deltas, dates, ) sparse_output.drop(AD_FIELD_NAME, axis=1, inplace=True) def last_in_date_group(df, reindex, have_sids=have_sids): idx = dates[dates.searchsorted( df[TS_FIELD_NAME].values.astype('datetime64[D]') )] if have_sids: idx = [idx, SID_FIELD_NAME] last_in_group = df.drop(TS_FIELD_NAME, axis=1).groupby( idx, sort=False, ).last() if have_sids: last_in_group = last_in_group.unstack() if reindex: if have_sids: cols = last_in_group.columns last_in_group = last_in_group.reindex( index=dates, columns=pd.MultiIndex.from_product( (cols.levels[0], assets), names=cols.names, ), ) else: last_in_group = last_in_group.reindex(dates) return last_in_group sparse_deltas = last_in_date_group(non_novel_deltas, reindex=False) dense_output = last_in_date_group(sparse_output, reindex=True) dense_output.ffill(inplace=True) if have_sids: adjustments_from_deltas = adjustments_from_deltas_with_sids column_view = identity else: # We use the column view to make an array per asset. column_view = compose( # We need to copy this because we need a concrete ndarray. # The `repeat_last_axis` call will give us a fancy strided # array which uses a buffer to represent `len(assets)` columns. # The engine puts nans at the indicies for which we do not have # sid information so that the nan-aware reductions still work. # A future change to the engine would be to add first class # support for macro econimic datasets. copy, partial(repeat_last_axis, count=len(assets)), ) adjustments_from_deltas = adjustments_from_deltas_no_sids for column_idx, column in enumerate(columns): column_name = column.name yield column, AdjustedArray( column_view( dense_output[column_name].values.astype(column.dtype), ), mask, adjustments_from_deltas( dates, sparse_output[TS_FIELD_NAME].values, column_idx, column_name, asset_idx, sparse_deltas, ), column.missing_value, )
class BlazeEstimatesLoader(implements(PipelineLoader)):
    """An abstract pipeline loader for the estimates datasets that loads
    data from a blaze expression.

    Parameters
    ----------
    expr : Expr
        The expression representing the data to load.
    columns : dict[str -> str]
        A dict mapping BoundColumn names to the associated names in `expr`.
    resources : dict, optional
        Mapping from the loadable terms of ``expr`` to actual data resources.
    odo_kwargs : dict, optional
        Extra keyword arguments to pass to odo when executing the expression.
    data_query_time : time, optional
        The time to use for the data query cutoff.
    data_query_tz : tzinfo or str
        The timezone to use for the data query cutoff.
    checkpoints : Expr, optional
        The expression representing checkpointed data to be used for faster
        forward-filling of data from `expr`.

    Notes
    -----
    The expression should have a tabular dshape of::

       Dim * {{
           {SID_FIELD_NAME}: int64,
           {TS_FIELD_NAME}: datetime,
           {FISCAL_YEAR_FIELD_NAME}: float64,
           {FISCAL_QUARTER_FIELD_NAME}: float64,
           {EVENT_DATE_FIELD_NAME}: datetime,
       }}

    And other dataset-specific fields, where each row of the table is a
    record including the sid to identify the company, the timestamp where we
    learned about the announcement, and the date of the event.

    If the '{TS_FIELD_NAME}' field is not included it is assumed that we
    start the backtest with knowledge of all announcements.
    """
    __doc__ = __doc__.format(
        SID_FIELD_NAME=SID_FIELD_NAME,
        TS_FIELD_NAME=TS_FIELD_NAME,
        FISCAL_YEAR_FIELD_NAME=FISCAL_YEAR_FIELD_NAME,
        FISCAL_QUARTER_FIELD_NAME=FISCAL_QUARTER_FIELD_NAME,
        EVENT_DATE_FIELD_NAME=EVENT_DATE_FIELD_NAME,
    )

    @preprocess(data_query_tz=optionally(ensure_timezone))
    def __init__(self,
                 expr,
                 columns,
                 resources=None,
                 odo_kwargs=None,
                 data_query_time=None,
                 data_query_tz=None,
                 checkpoints=None):
        dshape = expr.dshape
        if not istabular(dshape):
            raise ValueError(
                'expression dshape must be tabular, got: %s' % dshape,
            )

        required_cols = list(required_estimates_fields(columns))
        self._expr = bind_expression_to_resources(
            expr[required_cols],
            resources,
        )
        self._columns = columns
        self._odo_kwargs = odo_kwargs if odo_kwargs is not None else {}
        check_data_query_args(data_query_time, data_query_tz)
        self._data_query_time = data_query_time
        self._data_query_tz = data_query_tz
        self._checkpoints = checkpoints

    def load_adjusted_array(self, domain, columns, dates, sids, mask):
        # Only load requested columns.
        requested_column_names = [
            self._columns[column.name] for column in columns
        ]

        raw = load_raw_data(
            sids,
            dates,
            self._data_query_time,
            self._data_query_tz,
            self._expr[sorted(metadata_columns.union(requested_column_names))],
            self._odo_kwargs,
            checkpoints=self._checkpoints,
        )

        return self.loader(
            raw,
            {column.name: self._columns[column.name] for column in columns},
        ).load_adjusted_array(
            domain,
            columns,
            dates,
            sids,
            mask,
        )
class BlazeEventsLoader(PipelineLoader):
    """An abstract pipeline loader for the events datasets that loads
    data from a blaze expression.

    Parameters
    ----------
    expr : Expr
        The expression representing the data to load.
    next_value_columns : dict[BoundColumn -> raw column name]
        A dict mapping 'next' BoundColumns to their column names in `expr`.
    previous_value_columns : dict[BoundColumn -> raw column name]
        A dict mapping 'previous' BoundColumns to their column names in
        `expr`.
    resources : dict, optional
        Mapping from the loadable terms of ``expr`` to actual data resources.
    odo_kwargs : dict, optional
        Extra keyword arguments to pass to odo when executing the expression.
    data_query_time : time, optional
        The time to use for the data query cutoff.
    data_query_tz : tzinfo or str
        The timezone to use for the data query cutoff.

    Notes
    -----
    The expression should have a tabular dshape of::

       Dim * {{
           {SID_FIELD_NAME}: int64,
           {TS_FIELD_NAME}: datetime,
           {EVENT_DATE_FIELD_NAME}: datetime,
       }}

    And other dataset-specific fields, where each row of the table is a
    record including the sid to identify the company, the timestamp where we
    learned about the announcement, and the event date.

    If the '{TS_FIELD_NAME}' field is not included it is assumed that we
    start the backtest with knowledge of all announcements.
    """
    __doc__ = __doc__.format(SID_FIELD_NAME=SID_FIELD_NAME,
                             TS_FIELD_NAME=TS_FIELD_NAME,
                             EVENT_DATE_FIELD_NAME=EVENT_DATE_FIELD_NAME)

    @preprocess(data_query_tz=optionally(ensure_timezone))
    def __init__(self,
                 expr,
                 next_value_columns,
                 previous_value_columns,
                 resources=None,
                 odo_kwargs=None,
                 data_query_time=None,
                 data_query_tz=None):
        dshape = expr.dshape
        if not istabular(dshape):
            raise ValueError(
                'expression dshape must be tabular, got: %s' % dshape,
            )

        required_cols = list(
            required_event_fields(next_value_columns, previous_value_columns)
        )
        self._expr = bind_expression_to_resources(
            expr[required_cols],
            resources,
        )
        self._next_value_columns = next_value_columns
        self._previous_value_columns = previous_value_columns
        self._odo_kwargs = odo_kwargs if odo_kwargs is not None else {}
        check_data_query_args(data_query_time, data_query_tz)
        self._data_query_time = data_query_time
        self._data_query_tz = data_query_tz

    def load_adjusted_array(self, columns, dates, assets, mask):
        raw = load_raw_data(assets,
                            dates,
                            self._data_query_time,
                            self._data_query_tz,
                            self._expr,
                            self._odo_kwargs)

        return EventsLoader(
            events=raw,
            next_value_columns=self._next_value_columns,
            previous_value_columns=self._previous_value_columns,
        ).load_adjusted_array(
            columns,
            dates,
            assets,
            mask,
        )
class BlazeEarningsCalendarLoader(PipelineLoader):
    """A pipeline loader for the ``EarningsCalendar`` dataset that loads
    data from a blaze expression.

    Parameters
    ----------
    expr : Expr
        The expression representing the data to load.
    resources : dict, optional
        Mapping from the atomic terms of ``expr`` to actual data resources.
    odo_kwargs : dict, optional
        Extra keyword arguments to pass to odo when executing the expression.
    data_query_time : time, optional
        The time to use for the data query cutoff.
    data_query_tz : tzinfo or str
        The timezone to use for the data query cutoff.

    Notes
    -----
    The expression should have a tabular dshape of::

       Dim * {{
           {SID_FIELD_NAME}: int64,
           {TS_FIELD_NAME}: datetime,
           {ANNOUNCEMENT_FIELD_NAME}: ?datetime,
       }}

    Where each row of the table is a record including the sid to identify the
    company, the timestamp where we learned about the announcement, and the
    date when the earnings will be announced.

    If the '{TS_FIELD_NAME}' field is not included it is assumed that we
    start the backtest with knowledge of all announcements.
    """
    __doc__ = __doc__.format(
        TS_FIELD_NAME=TS_FIELD_NAME,
        SID_FIELD_NAME=SID_FIELD_NAME,
        ANNOUNCEMENT_FIELD_NAME=ANNOUNCEMENT_FIELD_NAME,
    )

    _expected_fields = frozenset({
        TS_FIELD_NAME,
        SID_FIELD_NAME,
        ANNOUNCEMENT_FIELD_NAME,
    })

    @preprocess(data_query_tz=optionally(ensure_timezone))
    def __init__(self,
                 expr,
                 resources=None,
                 odo_kwargs=None,
                 data_query_time=None,
                 data_query_tz=None,
                 dataset=EarningsCalendar):
        dshape = expr.dshape
        if not istabular(dshape):
            raise ValueError(
                'expression dshape must be tabular, got: %s' % dshape,
            )

        expected_fields = self._expected_fields
        self._expr = bind_expression_to_resources(
            expr[list(expected_fields)],
            resources,
        )
        self._odo_kwargs = odo_kwargs if odo_kwargs is not None else {}
        self._dataset = dataset
        check_data_query_args(data_query_time, data_query_tz)
        self._data_query_time = data_query_time
        self._data_query_tz = data_query_tz

    def load_adjusted_array(self, columns, dates, assets, mask):
        data_query_time = self._data_query_time
        data_query_tz = self._data_query_tz
        lower_dt, upper_dt = normalize_data_query_bounds(
            dates[0],
            dates[-1],
            data_query_time,
            data_query_tz,
        )

        raw = ffill_query_in_range(
            self._expr,
            lower_dt,
            upper_dt,
            self._odo_kwargs,
        )
        sids = raw.loc[:, SID_FIELD_NAME]
        raw.drop(
            sids[~sids.isin(assets)].index,
            inplace=True,
        )
        if data_query_time is not None:
            normalize_timestamp_to_query_time(
                raw,
                data_query_time,
                data_query_tz,
                inplace=True,
                ts_field=TS_FIELD_NAME,
            )

        gb = raw.groupby(SID_FIELD_NAME)

        def mkseries(idx, raw_loc=raw.loc):
            vs = raw_loc[
                idx, [TS_FIELD_NAME, ANNOUNCEMENT_FIELD_NAME]
            ].values
            return pd.Series(
                index=pd.DatetimeIndex(vs[:, 0]),
                data=vs[:, 1],
            )

        return EarningsCalendarLoader(
            dates,
            valmap(mkseries, gb.groups),
            dataset=self._dataset,
        ).load_adjusted_array(columns, dates, assets, mask)
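# Hedged example of an expression with the tabular dshape described in the
# docstring above, built from an in-memory dataframe. The literal column names
# stand in for SID_FIELD_NAME, TS_FIELD_NAME, and ANNOUNCEMENT_FIELD_NAME and
# are assumptions for illustration.
import blaze as bz
import pandas as pd

events = bz.data(pd.DataFrame({
    'sid': [24, 24],
    'timestamp': pd.to_datetime(['2014-01-01', '2014-01-15']),
    'announcement_date': pd.to_datetime(['2014-01-20', '2014-04-20']),
}))
loader = BlazeEarningsCalendarLoader(events)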