def test_optionally(self):
    error = TypeError('arg must be int')

    def preprocessor(func, argname, arg):
        if not isinstance(arg, int):
            raise error
        return arg

    @preprocess(a=optionally(preprocessor))
    def f(a):
        return a

    self.assertIs(f(1), 1)
    self.assertIsNone(f(None))

    with self.assertRaises(TypeError) as e:
        f('a')
    self.assertIs(e.exception, error)
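
# A minimal sketch (an assumption, not the library's implementation) of the
# contract exercised by ``test_optionally`` above: ``optionally`` wraps a
# preprocessor so that a ``None`` argument bypasses it, while any other value
# is still passed through the wrapped preprocessor.
def _optionally_sketch(preprocessor):
    def wrapper(func, argname, arg):
        # Pass ``None`` through untouched; otherwise defer to the preprocessor.
        return arg if arg is None else preprocessor(func, argname, arg)
    return wrapper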
def _make_bundle_core():
    """Create a family of data bundle functions that read from the same
    bundle mapping.

    Returns
    -------
    bundles : mappingproxy
        The mapping of bundles to bundle payloads.
    register : callable
        The function which registers new bundles in the ``bundles`` mapping.
    unregister : callable
        The function which deregisters bundles from the ``bundles`` mapping.
    ingest : callable
        The function which downloads and writes data for a given data bundle.
    load : callable
        The function which loads the ingested bundles back into memory.
    clean : callable
        The function which cleans up data written with ``ingest``.
    """
    _bundles = {}  # the registered bundles
    # Expose _bundles through a proxy so that users cannot mutate this
    # accidentally. Users may go through `register` to update this which will
    # warn when trampling another bundle.
    bundles = mappingproxy(_bundles)

    @curry
    def register(name,
                 f,
                 calendar_name='NYSE',
                 start_session=None,
                 end_session=None,
                 minutes_per_day=390,
                 create_writers=True):
        """Register a data bundle ingest function.

        Parameters
        ----------
        name : str
            The name of the bundle.
        f : callable
            The ingest function. This function will be passed:

              environ : mapping
                  The environment this is being run with.
              asset_db_writer : AssetDBWriter
                  The asset db writer to write into.
              minute_bar_writer : BcolzMinuteBarWriter
                  The minute bar writer to write into.
              daily_bar_writer : BcolzDailyBarWriter
                  The daily bar writer to write into.
              adjustment_writer : SQLiteAdjustmentWriter
                  The adjustment db writer to write into.
              calendar : gateway.utils.calendars.TradingCalendar
                  The trading calendar to ingest for.
              start_session : pd.Timestamp
                  The first session of data to ingest.
              end_session : pd.Timestamp
                  The last session of data to ingest.
              cache : DataFrameCache
                  A mapping object to temporarily store dataframes.
                  This should be used to cache intermediates in case the load
                  fails. This will be automatically cleaned up after a
                  successful load.
              show_progress : bool
                  Show the progress for the current load where possible.
        calendar_name : str, optional
            The name of a calendar used to align bundle data.
            Default is 'NYSE'.
        start_session : pd.Timestamp, optional
            The first session for which we want data. If not provided,
            or if the date lies outside the range supported by the
            calendar, the first_session of the calendar is used.
        end_session : pd.Timestamp, optional
            The last session for which we want data. If not provided,
            or if the date lies outside the range supported by the
            calendar, the last_session of the calendar is used.
        minutes_per_day : int, optional
            The number of minutes in each normal trading day.
        create_writers : bool, optional
            Should the ingest machinery create the writers for the ingest
            function. This can be disabled as an optimization for cases
            where they are not needed, like the ``quantopian-quandl``
            bundle.

        Notes
        -----
        This function may be used as a decorator, for example:

        .. code-block:: python

           @register('quandl')
           def quandl_ingest_function(...):
               ...

        See Also
        --------
        gateway.data.bundles.bundles
        """
        if name in bundles:
            warnings.warn(
                'Overwriting bundle with name %r' % name,
                stacklevel=3,
            )

        # NOTE: We don't eagerly compute calendar values here because
        # `register` is called at module scope in gateway, and creating a
        # calendar currently takes between 0.5 and 1 seconds, which causes a
        # noticeable delay on the gateway CLI.
        _bundles[name] = RegisteredBundle(
            calendar_name=calendar_name,
            start_session=start_session,
            end_session=end_session,
            minutes_per_day=minutes_per_day,
            ingest=f,
            create_writers=create_writers,
        )
        return f

    def unregister(name):
        """Unregister a bundle.

        Parameters
        ----------
        name : str
            The name of the bundle to unregister.

        Raises
        ------
        UnknownBundle
            Raised when no bundle has been registered with the given name.

        See Also
        --------
        gateway.data.bundles.bundles
        """
        try:
            del _bundles[name]
        except KeyError:
            raise UnknownBundle(name)

    def ingest(name,
               environ=os.environ,
               timestamp=None,
               assets_versions=(),
               show_progress=False):
        """Ingest data for a given bundle.

        Parameters
        ----------
        name : str
            The name of the bundle.
        environ : mapping, optional
            The environment variables. By default this is os.environ.
        timestamp : datetime, optional
            The timestamp to use for the load. By default this is the
            current time.
        assets_versions : Iterable[int], optional
            Versions of the assets db to which to downgrade.
        show_progress : bool, optional
            Tell the ingest function to display the progress where possible.
        """
        try:
            bundle = bundles[name]
        except KeyError:
            raise UnknownBundle(name)

        calendar = get_calendar(bundle.calendar_name)

        start_session = bundle.start_session
        end_session = bundle.end_session

        if start_session is None or start_session < calendar.first_session:
            start_session = calendar.first_session

        if end_session is None or end_session > calendar.last_session:
            end_session = calendar.last_session

        if timestamp is None:
            timestamp = pd.Timestamp.utcnow()
        timestamp = timestamp.tz_convert('utc').tz_localize(None)

        timestr = to_bundle_ingest_dirname(timestamp)
        cachepath = cache_path(name, environ=environ)
        pth.ensure_directory(pth.data_path([name, timestr], environ=environ))
        pth.ensure_directory(cachepath)
        with dataframe_cache(cachepath, clean_on_failure=False) as cache, \
                ExitStack() as stack:
            # we use `cleanup_on_failure=False` so that we don't purge the
            # cache directory if the load fails in the middle
            if bundle.create_writers:
                wd = stack.enter_context(working_dir(
                    pth.data_path([], environ=environ))
                )
                daily_bars_path = wd.ensure_dir(
                    *daily_equity_relative(
                        name, timestr, environ=environ,
                    )
                )
                daily_bar_writer = BcolzDailyBarWriter(
                    daily_bars_path,
                    calendar,
                    start_session,
                    end_session,
                )
                # Do an empty write to ensure that the daily ctables exist
                # when we create the SQLiteAdjustmentWriter below. The
                # SQLiteAdjustmentWriter needs to open the daily ctables so
                # that it can compute the adjustment ratios for the dividends.
                daily_bar_writer.write(())
                minute_bar_writer = BcolzMinuteBarWriter(
                    wd.ensure_dir(*minute_equity_relative(
                        name, timestr, environ=environ)
                    ),
                    calendar,
                    start_session,
                    end_session,
                    minutes_per_day=bundle.minutes_per_day,
                )
                assets_db_path = wd.getpath(*asset_db_relative(
                    name, timestr, environ=environ,
                ))
                asset_db_writer = AssetDBWriter(assets_db_path)

                adjustment_db_writer = stack.enter_context(
                    SQLiteAdjustmentWriter(
                        wd.getpath(*adjustment_db_relative(
                            name, timestr, environ=environ)),
                        BcolzDailyBarReader(daily_bars_path),
                        calendar.all_sessions,
                        overwrite=True,
                    )
                )
            else:
                daily_bar_writer = None
                minute_bar_writer = None
                asset_db_writer = None
                adjustment_db_writer = None
                if assets_versions:
                    raise ValueError('Need to ingest a bundle that creates '
                                     'writers in order to downgrade the '
                                     'assets db.')
            bundle.ingest(
                environ,
                asset_db_writer,
                minute_bar_writer,
                daily_bar_writer,
                adjustment_db_writer,
                calendar,
                start_session,
                end_session,
                cache,
                show_progress,
                pth.data_path([name, timestr], environ=environ),
            )

            for version in sorted(set(assets_versions), reverse=True):
                version_path = wd.getpath(*asset_db_relative(
                    name, timestr, environ=environ, db_version=version,
                ))
                with working_file(version_path) as wf:
                    shutil.copy2(assets_db_path, wf.path)
                    downgrade(wf.path, version)

    def most_recent_data(bundle_name, timestamp, environ=None):
        """Get the path to the most recent data on or before ``timestamp``
        for the given bundle.

        Parameters
        ----------
        bundle_name : str
            The name of the bundle to lookup.
        timestamp : datetime
            The timestamp to begin searching on or before.
        environ : dict, optional
            An environment dict to forward to gateway_root.
        """
        if bundle_name not in bundles:
            raise UnknownBundle(bundle_name)

        try:
            candidates = os.listdir(
                pth.data_path([bundle_name], environ=environ),
            )
            return pth.data_path(
                [bundle_name,
                 max(
                     filter(complement(pth.hidden), candidates),
                     key=from_bundle_ingest_dirname,
                 )],
                environ=environ,
            )
        except (ValueError, OSError) as e:
            if getattr(e, 'errno', errno.ENOENT) != errno.ENOENT:
                raise
            raise ValueError(
                'no data for bundle {bundle!r} on or before {timestamp}\n'
                'maybe you need to run: $ gateway ingest -b {bundle}'.format(
                    bundle=bundle_name,
                    timestamp=timestamp,
                ),
            )

    def load(name, environ=os.environ, timestamp=None):
        """Loads a previously ingested bundle.

        Parameters
        ----------
        name : str
            The name of the bundle.
        environ : mapping, optional
            The environment variables. Defaults to os.environ.
        timestamp : datetime, optional
            The timestamp of the data to lookup. Defaults to the current
            time.

        Returns
        -------
        bundle_data : BundleData
            The raw data readers for this bundle.
        """
        if timestamp is None:
            timestamp = pd.Timestamp.utcnow()
        timestr = most_recent_data(name, timestamp, environ=environ)
        return BundleData(
            asset_finder=AssetFinder(
                asset_db_path(name, timestr, environ=environ),
            ),
            equity_minute_bar_reader=BcolzMinuteBarReader(
                minute_equity_path(name, timestr, environ=environ),
            ),
            equity_daily_bar_reader=BcolzDailyBarReader(
                daily_equity_path(name, timestr, environ=environ),
            ),
            adjustment_reader=SQLiteAdjustmentReader(
                adjustment_db_path(name, timestr, environ=environ),
            ),
        )

    @preprocess(
        before=optionally(ensure_timestamp),
        after=optionally(ensure_timestamp),
    )
    def clean(name, before=None, after=None, keep_last=None,
              environ=os.environ):
        """Clean up data that was created with ``ingest`` or
        ``$ python -m gateway ingest``

        Parameters
        ----------
        name : str
            The name of the bundle to remove data for.
        before : datetime, optional
            Remove data ingested before this date.
            This argument is mutually exclusive with: keep_last
        after : datetime, optional
            Remove data ingested after this date.
            This argument is mutually exclusive with: keep_last
        keep_last : int, optional
            Remove all but the last ``keep_last`` ingestions.
            This argument is mutually exclusive with: before, after
        environ : mapping, optional
            The environment variables. Defaults to os.environ.

        Returns
        -------
        cleaned : set[str]
            The names of the runs that were removed.

        Raises
        ------
        BadClean
            Raised when ``before`` and/or ``after`` are passed with
            ``keep_last``. This is a subclass of ``ValueError``.
        """
        try:
            all_runs = sorted(
                filter(
                    complement(pth.hidden),
                    os.listdir(pth.data_path([name], environ=environ)),
                ),
                key=from_bundle_ingest_dirname,
            )
        except OSError as e:
            if e.errno != errno.ENOENT:
                raise
            raise UnknownBundle(name)

        if ((before is not None or after is not None) and
                keep_last is not None):
            raise BadClean(before, after, keep_last)

        if keep_last is None:
            def should_clean(name):
                dt = from_bundle_ingest_dirname(name)
                return (
                    (before is not None and dt < before) or
                    (after is not None and dt > after)
                )

        elif keep_last >= 0:
            last_n_dts = set(take(keep_last, reversed(all_runs)))

            def should_clean(name):
                return name not in last_n_dts
        else:
            raise BadClean(before, after, keep_last)

        cleaned = set()
        for run in all_runs:
            if should_clean(run):
                path = pth.data_path([name, run], environ=environ)
                shutil.rmtree(path)
                cleaned.add(path)

        return cleaned

    return BundleCore(bundles, register, unregister, ingest, load, clean)
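
# --- Hedged usage sketch (illustrative only; not part of the bundle API) ---
# Shows how the callables returned by ``_make_bundle_core`` fit together:
# register an ingest function, ingest it, then load the most recent run.
# ``example-bundle``, ``_example_bundle_roundtrip`` and ``example_ingest``
# are made-up names, and running this requires a working calendar and data
# directory for the current environment.
def _example_bundle_roundtrip():
    bundles, register, unregister, ingest, load, clean = _make_bundle_core()

    @register('example-bundle', create_writers=False)
    def example_ingest(environ,
                       asset_db_writer,
                       minute_bar_writer,
                       daily_bar_writer,
                       adjustment_writer,
                       calendar,
                       start_session,
                       end_session,
                       cache,
                       show_progress,
                       output_dir):
        # A real bundle would write data through the writers here; with
        # ``create_writers=False`` the writer arguments are all ``None``.
        pass

    ingest('example-bundle', show_progress=True)
    return load('example-bundle')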
class BlazeLoader(object):
    """A PipelineLoader for datasets constructed with ``from_blaze``.

    Parameters
    ----------
    dsmap : mapping, optional
        An initial mapping of datasets to ``ExprData`` objects.
        NOTE: Further mutations to this map will not be reflected by this
        object.
    data_query_time : time, optional
        The time to use for the data query cutoff.
    data_query_tz : tzinfo or str, optional
        The timezone to use for the data query cutoff.
    pool : Pool, optional
        The pool to use to run blaze queries concurrently. This object must
        support ``imap_unordered``, ``apply`` and ``apply_async`` methods.

    Attributes
    ----------
    pool : Pool
        The pool to use to run blaze queries concurrently. This object must
        support ``imap_unordered``, ``apply`` and ``apply_async`` methods.
        It is possible to change the pool after the loader has been
        constructed. This allows us to set a new pool for the
        ``global_loader`` like:
        ``global_loader.pool = multiprocessing.Pool(4)``.

    See Also
    --------
    :class:`gateway.utils.pool.SequentialPool`
    :class:`multiprocessing.Pool`
    """
    @preprocess(data_query_tz=optionally(ensure_timezone))
    def __init__(self,
                 dsmap=None,
                 data_query_time=None,
                 data_query_tz=None,
                 pool=SequentialPool()):
        check_data_query_args(data_query_time, data_query_tz)
        self._data_query_time = data_query_time
        self._data_query_tz = data_query_tz

        # explicitly public
        self.pool = pool

        self._table_expressions = (dsmap or {}).copy()

    @classmethod
    @memoize(cache=WeakKeyDictionary())
    def global_instance(cls):
        return cls()

    def __hash__(self):
        return id(self)

    def __contains__(self, column):
        return column in self._table_expressions

    def __getitem__(self, column):
        return self._table_expressions[column]

    def __iter__(self):
        return iter(self._table_expressions)

    def __len__(self):
        return len(self._table_expressions)

    def __call__(self, column):
        if column in self:
            return self
        raise KeyError(column)

    def register_dataset(self,
                         dataset,
                         expr,
                         deltas=None,
                         checkpoints=None,
                         odo_kwargs=None):
        """Explicitly map a dataset to a collection of blaze expressions.

        Parameters
        ----------
        dataset : DataSet
            The pipeline dataset to map to the given expressions.
        expr : Expr
            The baseline values.
        deltas : Expr, optional
            The deltas for the data.
        checkpoints : Expr, optional
            The forward fill checkpoints for the data.
        odo_kwargs : dict, optional
            The keyword arguments to forward to the odo calls internally.

        See Also
        --------
        :func:`gateway.pipeline.loaders.blaze.from_blaze`
        """
        expr_data = ExprData(
            expr,
            deltas,
            checkpoints,
            odo_kwargs,
        )
        for column in dataset.columns:
            self._table_expressions[column] = expr_data

    def register_column(self,
                        column,
                        expr,
                        deltas=None,
                        checkpoints=None,
                        odo_kwargs=None):
        """Explicitly map a single bound column to a collection of blaze
        expressions. The expressions need to have ``timestamp`` and ``as_of``
        columns.

        Parameters
        ----------
        column : BoundColumn
            The pipeline dataset to map to the given expressions.
        expr : Expr
            The baseline values.
        deltas : Expr, optional
            The deltas for the data.
        checkpoints : Expr, optional
            The forward fill checkpoints for the data.
        odo_kwargs : dict, optional
            The keyword arguments to forward to the odo calls internally.
        See Also
        --------
        :func:`gateway.pipeline.loaders.blaze.from_blaze`
        """
        self._table_expressions[column] = ExprData(
            expr,
            deltas,
            checkpoints,
            odo_kwargs,
        )

    def load_adjusted_array(self, columns, dates, assets, mask):
        return merge(
            self.pool.imap_unordered(
                partial(self._load_dataset, dates, assets, mask),
                itervalues(groupby(getitem(self._table_expressions),
                                   columns)),
            ),
        )

    def _load_dataset(self, dates, assets, mask, columns):
        try:
            (expr_data,) = {self._table_expressions[c] for c in columns}
        except ValueError:
            raise AssertionError(
                'all columns must share the same expression data',
            )

        expr, deltas, checkpoints, odo_kwargs = expr_data

        have_sids = (first(columns).dataset.ndim == 2)
        added_query_fields = {AD_FIELD_NAME, TS_FIELD_NAME} | (
            {SID_FIELD_NAME} if have_sids else set()
        )
        requested_columns = set(map(getname, columns))
        colnames = sorted(added_query_fields | requested_columns)

        data_query_time = self._data_query_time
        data_query_tz = self._data_query_tz
        lower_dt, upper_dt = normalize_data_query_bounds(
            dates[0],
            dates[-1],
            data_query_time,
            data_query_tz,
        )

        def collect_expr(e, lower):
            """Materialize the expression as a dataframe.

            Parameters
            ----------
            e : Expr
                The baseline or deltas expression.
            lower : datetime
                The lower time bound to query.

            Returns
            -------
            result : pd.DataFrame
                The resulting dataframe.

            Notes
            -----
            This can return more data than needed. The in memory reindex will
            handle this.
            """
            predicate = e[TS_FIELD_NAME] < upper_dt
            if lower is not None:
                predicate &= e[TS_FIELD_NAME] >= lower

            return odo(e[predicate][colnames], pd.DataFrame, **odo_kwargs)

        lower, materialized_checkpoints = get_materialized_checkpoints(
            checkpoints, colnames, lower_dt, odo_kwargs)

        materialized_expr_deferred = self.pool.apply_async(
            collect_expr,
            (expr, lower),
        )
        materialized_deltas = (
            self.pool.apply(collect_expr, (deltas, lower))
            if deltas is not None else
            None
        )

        all_rows = pd.concat(
            filter(
                lambda df: df is not None,
                (
                    materialized_checkpoints,
                    materialized_expr_deferred.get(),
                    materialized_deltas,
                ),
            ),
            ignore_index=True,
            copy=False,
        )

        all_rows[TS_FIELD_NAME] = all_rows[TS_FIELD_NAME].astype(
            'datetime64[ns]',
        )
        all_rows.sort_values([TS_FIELD_NAME, AD_FIELD_NAME], inplace=True)

        if have_sids:
            return adjusted_arrays_from_rows_with_assets(
                dates,
                data_query_time,
                data_query_tz,
                assets,
                columns,
                all_rows,
            )
        else:
            return adjusted_arrays_from_rows_without_assets(
                dates,
                data_query_time,
                data_query_tz,
                columns,
                all_rows,
            )
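
# --- Hedged usage sketch (illustrative only; not part of this module) ---
# Shows how a BlazeLoader might be wired up for a pipeline DataSet by hand
# rather than through ``from_blaze``. ``my_dataset`` and ``raw_frame`` are
# placeholders supplied by the caller; the raw data is expected to carry the
# ``SID_FIELD_NAME``, ``AD_FIELD_NAME`` and ``TS_FIELD_NAME`` columns that
# ``_load_dataset`` queries, plus one column per bound column of the dataset.
def _example_blaze_loader(my_dataset, raw_frame):
    import blaze as bz

    expr = bz.data(raw_frame)  # let blaze discover the dshape from the data
    loader = BlazeLoader()
    loader.register_dataset(my_dataset, expr)
    return loader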
class BlazeEventsLoader(PipelineLoader):
    """An abstract pipeline loader for the events datasets that loads data
    from a blaze expression.

    Parameters
    ----------
    expr : Expr
        The expression representing the data to load.
    next_value_columns : dict[BoundColumn -> raw column name]
        A dict mapping 'next' BoundColumns to their column names in `expr`.
    previous_value_columns : dict[BoundColumn -> raw column name]
        A dict mapping 'previous' BoundColumns to their column names in
        `expr`.
    resources : dict, optional
        Mapping from the loadable terms of ``expr`` to actual data resources.
    odo_kwargs : dict, optional
        Extra keyword arguments to pass to odo when executing the expression.
    data_query_time : time, optional
        The time to use for the data query cutoff.
    data_query_tz : tzinfo or str, optional
        The timezone to use for the data query cutoff.

    Notes
    -----
    The expression should have a tabular dshape of::

       Dim * {{
           {SID_FIELD_NAME}: int64,
           {TS_FIELD_NAME}: datetime,
           {EVENT_DATE_FIELD_NAME}: datetime,
       }}

    And other dataset-specific fields, where each row of the table is a
    record including the sid to identify the company, the timestamp where we
    learned about the announcement, and the event date.

    If the '{TS_FIELD_NAME}' field is not included it is assumed that we
    start the backtest with knowledge of all announcements.
    """
    __doc__ = __doc__.format(SID_FIELD_NAME=SID_FIELD_NAME,
                             TS_FIELD_NAME=TS_FIELD_NAME,
                             EVENT_DATE_FIELD_NAME=EVENT_DATE_FIELD_NAME)

    @preprocess(data_query_tz=optionally(ensure_timezone))
    def __init__(self,
                 expr,
                 next_value_columns,
                 previous_value_columns,
                 resources=None,
                 odo_kwargs=None,
                 data_query_time=None,
                 data_query_tz=None):

        dshape = expr.dshape
        if not istabular(dshape):
            raise ValueError(
                'expression dshape must be tabular, got: %s' % dshape,
            )

        required_cols = list(
            required_event_fields(next_value_columns,
                                  previous_value_columns)
        )
        self._expr = bind_expression_to_resources(
            expr[required_cols],
            resources,
        )
        self._next_value_columns = next_value_columns
        self._previous_value_columns = previous_value_columns

        self._odo_kwargs = odo_kwargs if odo_kwargs is not None else {}
        check_data_query_args(data_query_time, data_query_tz)
        self._data_query_time = data_query_time
        self._data_query_tz = data_query_tz

    def load_adjusted_array(self, columns, dates, assets, mask):
        raw = load_raw_data(assets,
                            dates,
                            self._data_query_time,
                            self._data_query_tz,
                            self._expr,
                            self._odo_kwargs)

        return EventsLoader(
            events=raw,
            next_value_columns=self._next_value_columns,
            previous_value_columns=self._previous_value_columns,
        ).load_adjusted_array(
            columns,
            dates,
            assets,
            mask,
        )
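
# --- Hedged sketch (illustrative only) ---
# Builds a symbolic blaze expression with the tabular dshape described in
# BlazeEventsLoader's docstring, plus one made-up dataset-specific field
# (``value``). A concrete subclass or caller would bind this symbol to real
# data through the ``resources`` argument.
def _example_events_expression():
    import blaze as bz

    return bz.symbol(
        'events',
        'var * {%s: int64, %s: datetime, %s: datetime, value: float64}' % (
            SID_FIELD_NAME,
            TS_FIELD_NAME,
            EVENT_DATE_FIELD_NAME,
        ),
    )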
class BlazeEstimatesLoader(PipelineLoader):
    """An abstract pipeline loader for the estimates datasets that loads
    data from a blaze expression.

    Parameters
    ----------
    expr : Expr
        The expression representing the data to load.
    columns : dict[str -> str]
        A dict mapping BoundColumn names to the associated names in `expr`.
    resources : dict, optional
        Mapping from the loadable terms of ``expr`` to actual data resources.
    odo_kwargs : dict, optional
        Extra keyword arguments to pass to odo when executing the expression.
    data_query_time : time, optional
        The time to use for the data query cutoff.
    data_query_tz : tzinfo or str, optional
        The timezone to use for the data query cutoff.
    checkpoints : Expr, optional
        The expression representing checkpointed data to be used for faster
        forward-filling of data from `expr`.

    Notes
    -----
    The expression should have a tabular dshape of::

       Dim * {{
           {SID_FIELD_NAME}: int64,
           {TS_FIELD_NAME}: datetime,
           {FISCAL_YEAR_FIELD_NAME}: float64,
           {FISCAL_QUARTER_FIELD_NAME}: float64,
           {EVENT_DATE_FIELD_NAME}: datetime,
       }}

    And other dataset-specific fields, where each row of the table is a
    record including the sid to identify the company, the timestamp where we
    learned about the announcement, and the date of the event.

    If the '{TS_FIELD_NAME}' field is not included it is assumed that we
    start the backtest with knowledge of all announcements.
    """
    __doc__ = __doc__.format(
        SID_FIELD_NAME=SID_FIELD_NAME,
        TS_FIELD_NAME=TS_FIELD_NAME,
        FISCAL_YEAR_FIELD_NAME=FISCAL_YEAR_FIELD_NAME,
        FISCAL_QUARTER_FIELD_NAME=FISCAL_QUARTER_FIELD_NAME,
        EVENT_DATE_FIELD_NAME=EVENT_DATE_FIELD_NAME,
    )

    @preprocess(data_query_tz=optionally(ensure_timezone))
    def __init__(self,
                 expr,
                 columns,
                 resources=None,
                 odo_kwargs=None,
                 data_query_time=None,
                 data_query_tz=None,
                 checkpoints=None):

        dshape = expr.dshape
        if not istabular(dshape):
            raise ValueError(
                'expression dshape must be tabular, got: %s' % dshape,
            )

        required_cols = list(required_estimates_fields(columns))
        self._expr = bind_expression_to_resources(
            expr[required_cols],
            resources,
        )
        self._columns = columns
        self._odo_kwargs = odo_kwargs if odo_kwargs is not None else {}
        check_data_query_args(data_query_time, data_query_tz)
        self._data_query_time = data_query_time
        self._data_query_tz = data_query_tz
        self._checkpoints = checkpoints

    def load_adjusted_array(self, columns, dates, assets, mask):
        # Only load requested columns.
        requested_column_names = [
            self._columns[column.name] for column in columns
        ]

        raw = load_raw_data(
            assets,
            dates,
            self._data_query_time,
            self._data_query_tz,
            self._expr[sorted(metadata_columns.union(requested_column_names))],
            self._odo_kwargs,
            checkpoints=self._checkpoints,
        )

        return self.loader(
            raw,
            {column.name: self._columns[column.name] for column in columns},
        ).load_adjusted_array(
            columns,
            dates,
            assets,
            mask,
        )
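
# --- Hedged sketch (illustrative only) ---
# Builds a symbolic blaze expression with the tabular dshape described in
# BlazeEstimatesLoader's docstring, plus one made-up estimate field. A
# concrete subclass would pair this with a ``columns`` mapping such as
# ``{'estimate': 'estimate'}`` and real resources.
def _example_estimates_expression():
    import blaze as bz

    return bz.symbol(
        'estimates',
        'var * {%s: int64, %s: datetime, %s: float64, %s: float64, '
        '%s: datetime, estimate: float64}' % (
            SID_FIELD_NAME,
            TS_FIELD_NAME,
            FISCAL_YEAR_FIELD_NAME,
            FISCAL_QUARTER_FIELD_NAME,
            EVENT_DATE_FIELD_NAME,
        ),
    )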