def test_optionally(self):
    """Check that ``optionally`` lets ``None`` bypass the wrapped preprocessor.

    Integers pass through the preprocessor unchanged, ``None`` is returned
    as-is even though the preprocessor would reject it, and any other value
    surfaces the exact exception object raised by the preprocessor.
    """
    expected_error = TypeError('arg must be int')

    def require_int(func, argname, arg):
        # Accept ints untouched; anything else raises the shared error
        # instance so the test can check identity, not just the type.
        if isinstance(arg, int):
            return arg
        raise expected_error

    @preprocess(a=optionally(require_int))
    def passthrough(a):
        return a

    self.assertIs(passthrough(1), 1)
    self.assertIsNone(passthrough(None))

    with self.assertRaises(TypeError) as ctx:
        passthrough('a')
    self.assertIs(ctx.exception, expected_error)
def _make_bundle_core():
    """Create a family of data bundle functions that read from the same
    bundle mapping.

    Returns
    -------
    bundles : mappingproxy
        The mapping of bundles to bundle payloads.
    register_bundle : callable
        The function which instantiates a bundle class and registers it in
        the ``bundles`` mapping.
    register : callable
        The function which registers new bundles in the ``bundles`` mapping.
    unregister : callable
        The function which deregisters bundles from the ``bundles`` mapping.
    ingest : callable
        The function which downloads and writes data for a given data bundle.
    load : callable
        The function which loads the ingested bundles back into memory.
    clean : callable
        The function which cleans up data written with ``ingest``.
    """
    _bundles = {}  # the registered bundles
    # Expose _bundles through a proxy so that users cannot mutate this
    # accidentally. Users may go through `register` to update this which will
    # warn when trampling another bundle.
    bundles = mappingproxy(_bundles)

    def register_bundle(bundle_cls,
                        asset_filter=None,
                        start_session=None,
                        end_session=None,
                        create_writers=True):
        """Instantiate ``bundle_cls`` and register its ingest function.

        Parameters
        ----------
        bundle_cls : callable
            Called as ``bundle_cls(asset_filter=asset_filter)``. The result
            must expose ``name``, ``ingest``, ``calendar_name``,
            ``minutes_per_day`` and ``five_minutes_per_day``.
        asset_filter : optional
            Forwarded to ``bundle_cls``.
        start_session : pd.Timestamp, optional
            Forwarded to ``register``.
        end_session : pd.Timestamp, optional
            Forwarded to ``register``.
        create_writers : bool, optional
            Forwarded to ``register``.

        Returns
        -------
        f : callable
            The registered ingest function (the return value of
            ``register``).
        """
        bundle = bundle_cls(asset_filter=asset_filter)
        return register(
            bundle.name,
            bundle.ingest,
            calendar_name=bundle.calendar_name,
            minutes_per_day=bundle.minutes_per_day,
            five_minutes_per_day=bundle.five_minutes_per_day,
            start_session=start_session,
            end_session=end_session,
            create_writers=create_writers,
        )

    @curry
    def register(name,
                 f,
                 calendar_name='OPEN',
                 start_session=None,
                 end_session=None,
                 minutes_per_day=1440,
                 five_minutes_per_day=288,
                 create_writers=True):
        """Register a data bundle ingest function.

        Parameters
        ----------
        name : str
            The name of the bundle.
        f : callable
            The ingest function. This function will be passed, positionally
            and in this order:

              environ : mapping
                  The environment this is being run with.
              asset_db_writer : AssetDBWriter
                  The asset db writer to write into.
              minute_bar_writer : BcolzMinuteBarWriter
                  The minute bar writer to write into.
              five_minute_bar_writer : BcolzFiveMinuteBarWriter
                  The five minute bar writer to write into.
              daily_bar_writer : BcolzDailyBarWriter
                  The daily bar writer to write into.
              adjustment_writer : SQLiteAdjustmentWriter
                  The adjustment db writer to write into.
              calendar : catalyst.utils.calendars.TradingCalendar
                  The trading calendar to ingest for.
              start_session : pd.Timestamp
                  The first session of data to ingest.
              end_session : pd.Timestamp
                  The last session of data to ingest.
              cache : DataFrameCache
                  A mapping object to temporarily store dataframes.
                  This should be used to cache intermediates in case the load
                  fails. This will be automatically cleaned up after a
                  successful load.
              show_progress : bool
                  Show the progress for the current load where possible.
              is_compile : bool
                  The ``is_compile`` flag given to ``ingest``, forwarded
                  unchanged.
              output_dir : str
                  The data directory created for this ingestion.

            All writers are ``None`` when ``create_writers`` is False.
        calendar_name : str, optional
            The name of a calendar used to align bundle data.
            Default is 'OPEN'.
        start_session : pd.Timestamp, optional
            The first session for which we want data. If not provided,
            or if the date lies outside the range supported by the
            calendar, the first_session of the calendar is used.
        end_session : pd.Timestamp, optional
            The last session for which we want data. If not provided,
            or if the date lies outside the range supported by the
            calendar, the last_session of the calendar is used.
        minutes_per_day : int, optional
            The number of minutes in each normal trading day.
        five_minutes_per_day : int, optional
            Forwarded to the ``BcolzFiveMinuteBarWriter`` as
            ``five_minutes_per_day``.
        create_writers : bool, optional
            Should the ingest machinery create the writers for the ingest
            function. This can be disabled as an optimization for cases where
            they are not needed, like the ``quantopian-quandl`` bundle.

        Returns
        -------
        f : callable
            The ingest function, unchanged, so that ``register`` can be used
            as a decorator.

        Notes
        -----
        This function may be used as a decorator, for example:

        .. code-block:: python

           @register('quandl')
           def quandl_ingest_function(...):
               ...

        See Also
        --------
        catalyst.data.bundles.bundles
        """
        if name in bundles:
            warnings.warn(
                'Overwriting bundle with name %r' % name,
                stacklevel=3,
            )

        # NOTE: We don't eagerly compute calendar values here because
        # `register` is called at module scope in catalyst, and creating a
        # calendar currently takes between 0.5 and 1 seconds, which causes a
        # noticeable delay on the catalyst CLI.
        _bundles[name] = RegisteredBundle(
            calendar_name=calendar_name,
            start_session=start_session,
            end_session=end_session,
            minutes_per_day=minutes_per_day,
            five_minutes_per_day=five_minutes_per_day,
            ingest=f,
            create_writers=create_writers,
        )
        return f

    def unregister(name):
        """Unregister a bundle.

        Parameters
        ----------
        name : str
            The name of the bundle to unregister.

        Raises
        ------
        UnknownBundle
            Raised when no bundle has been registered with the given name.

        See Also
        --------
        catalyst.data.bundles.bundles
        """
        try:
            del _bundles[name]
        except KeyError:
            raise UnknownBundle(name)

    def ingest(name,
               environ=os.environ,
               timestamp=None,
               assets_versions=(),
               show_progress=False,
               is_compile=False):
        """Ingest data for a given bundle.

        Parameters
        ----------
        name : str
            The name of the bundle.
        environ : mapping, optional
            The environment variables. By default this is os.environ.
        timestamp : datetime, optional
            The timestamp to use for the load.
            By default this is the current time.
        assets_versions : Iterable[int], optional
            Versions of the assets db to which to downgrade.
        show_progress : bool, optional
            Tell the ingest function to display the progress where possible.
        is_compile : bool, optional
            Forwarded unchanged to the bundle's ingest function.

        Raises
        ------
        UnknownBundle
            Raised when no bundle has been registered with the given name.
        ValueError
            Raised when ``assets_versions`` is non-empty but the bundle was
            registered with ``create_writers=False``.
        """
        try:
            bundle = bundles[name]
        except KeyError:
            raise UnknownBundle(name)

        # Materialize the requested versions once. ``assets_versions`` is
        # documented as an arbitrary iterable; a one-shot iterator is always
        # truthy, so testing it directly would misreport an empty iterator
        # as "downgrades requested".
        versions = sorted(set(assets_versions), reverse=True)

        calendar = get_calendar(bundle.calendar_name)

        start_session = bundle.start_session
        end_session = bundle.end_session

        # Clamp the requested range to what the calendar supports.
        if start_session is None or start_session < calendar.first_session:
            start_session = calendar.first_session

        if end_session is None or end_session > calendar.last_session:
            end_session = calendar.last_session

        if timestamp is None:
            timestamp = pd.Timestamp.utcnow()
        timestamp = timestamp.tz_convert('utc').tz_localize(None)

        timestr = to_bundle_ingest_dirname(timestamp)
        cachepath = cache_path(name, environ=environ)
        pth.ensure_directory(pth.data_path([name, timestr], environ=environ))
        pth.ensure_directory(cachepath)
        with dataframe_cache(cachepath, clean_on_failure=False) as cache, \
                ExitStack() as stack:
            # we use `clean_on_failure=False` so that we don't purge the
            # cache directory if the load fails in the middle

            if bundle.create_writers:
                wd = stack.enter_context(working_dir(
                    pth.data_path([], environ=environ))
                )
                daily_bars_path = wd.ensure_dir(
                    *daily_relative(
                        name, timestr, environ=environ,
                    )
                )
                daily_bar_writer = BcolzDailyBarWriter(
                    daily_bars_path,
                    calendar,
                    start_session,
                    end_session,
                )
                # Do an empty write to ensure that the daily ctables exist
                # when we create the SQLiteAdjustmentWriter below. The
                # SQLiteAdjustmentWriter needs to open the daily ctables so
                # that it can compute the adjustment ratios for the dividends.
                daily_bar_writer.write(())

                five_minute_bar_writer = BcolzFiveMinuteBarWriter(
                    wd.ensure_dir(*five_minute_relative(
                        name, timestr, environ=environ)
                    ),
                    calendar,
                    start_session,
                    end_session,
                    five_minutes_per_day=bundle.five_minutes_per_day,
                )
                minute_bar_writer = BcolzMinuteBarWriter(
                    wd.ensure_dir(*minute_relative(
                        name, timestr, environ=environ)
                    ),
                    calendar,
                    start_session,
                    end_session,
                    minutes_per_day=bundle.minutes_per_day,
                )
                assets_db_path = wd.getpath(*asset_db_relative(
                    name, timestr, environ=environ,
                ))
                asset_db_writer = AssetDBWriter(assets_db_path)

                adjustment_db_writer = stack.enter_context(
                    SQLiteAdjustmentWriter(
                        wd.getpath(*adjustment_db_relative(
                            name, timestr, environ=environ)),
                        BcolzDailyBarReader(daily_bars_path),
                        calendar.all_sessions,
                        overwrite=True,
                    )
                )
            else:
                daily_bar_writer = None
                five_minute_bar_writer = None
                minute_bar_writer = None
                asset_db_writer = None
                adjustment_db_writer = None
                if versions:
                    raise ValueError('Need to ingest a bundle that creates '
                                     'writers in order to downgrade the assets'
                                     ' db.')
            bundle.ingest(
                environ,
                asset_db_writer,
                minute_bar_writer,
                five_minute_bar_writer,
                daily_bar_writer,
                adjustment_db_writer,
                calendar,
                start_session,
                end_session,
                cache,
                show_progress,
                is_compile,
                pth.data_path([name, timestr], environ=environ),
            )

            # ``wd`` and ``assets_db_path`` only exist when writers were
            # created; the ValueError above guarantees ``versions`` is empty
            # otherwise, so this loop never touches undefined names.
            for version in versions:
                version_path = wd.getpath(*asset_db_relative(
                    name, timestr, environ=environ, db_version=version,
                ))
                with working_file(version_path) as wf:
                    shutil.copy2(assets_db_path, wf.path)
                    downgrade(wf.path, version)

    def most_recent_data(bundle_name, timestamp, environ=None):
        """Get the path to the most recent ingestion for the given bundle.

        Parameters
        ----------
        bundle_name : str
            The name of the bundle to lookup.
        timestamp : datetime
            The timestamp reported in the error message when no data is
            found. NOTE: it is not used to filter the candidates here; the
            newest ingestion directory is returned regardless of its value.
        environ : dict, optional
            An environment dict to forward to catalyst_root.

        Returns
        -------
        path : str
            The data path of the newest non-hidden ingestion directory.

        Raises
        ------
        UnknownBundle
            Raised when no bundle has been registered with the given name.
        ValueError
            Raised when no ingestion can be found for the bundle.
        """
        if bundle_name not in bundles:
            raise UnknownBundle(bundle_name)

        try:
            candidates = os.listdir(
                pth.data_path([bundle_name], environ=environ),
            )
            return pth.data_path(
                [bundle_name,
                 max(
                     filter(complement(pth.hidden), candidates),
                     key=from_bundle_ingest_dirname,
                 )],
                environ=environ,
            )
        except (ValueError, OSError) as e:
            # A ValueError carries no ``errno`` (e.g. ``max`` of an empty
            # sequence), so it falls through to the friendly message below.
            # Any OSError other than "no such directory" is re-raised as-is.
            if getattr(e, 'errno', errno.ENOENT) != errno.ENOENT:
                raise
            raise ValueError(
                'no data for bundle {bundle!r} on or before {timestamp}\n'
                'maybe you need to run: $ catalyst ingest -b {bundle}'.format(
                    bundle=bundle_name,
                    timestamp=timestamp,
                ),
            )

    def load(name, environ=os.environ, timestamp=None):
        """Loads a previously ingested bundle.

        Parameters
        ----------
        name : str
            The name of the bundle.
        environ : mapping, optional
            The environment variables. Defaults to os.environ.
        timestamp : datetime, optional
            The timestamp of the data to lookup.
            Defaults to the current time.

        Returns
        -------
        bundle_data : BundleData
            The raw data readers for this bundle.
        """
        if timestamp is None:
            timestamp = pd.Timestamp.utcnow()
        timestr = most_recent_data(name, timestamp, environ=environ)
        return BundleData(
            asset_finder=AssetFinder(
                asset_db_path(name, timestr, environ=environ),
            ),
            minute_bar_reader=BcolzMinuteBarReader(
                minute_path(name, timestr, environ=environ),
            ),
            five_minute_bar_reader=BcolzFiveMinuteBarReader(
                five_minute_path(name, timestr, environ=environ),
            ),
            daily_bar_reader=BcolzDailyBarReader(
                daily_path(name, timestr, environ=environ),
            ),
            adjustment_reader=SQLiteAdjustmentReader(
                adjustment_db_path(name, timestr, environ=environ),
            ),
        )

    @preprocess(
        before=optionally(ensure_timestamp),
        after=optionally(ensure_timestamp),
    )
    def clean(name,
              before=None,
              after=None,
              keep_last=None,
              environ=os.environ):
        """Clean up data that was created with ``ingest`` or
        ``$ python -m catalyst ingest``

        Parameters
        ----------
        name : str
            The name of the bundle to remove data for.
        before : datetime, optional
            Remove data ingested before this date.
            This argument is mutually exclusive with: keep_last
        after : datetime, optional
            Remove data ingested after this date.
            This argument is mutually exclusive with: keep_last
        keep_last : int, optional
            Remove all but the last ``keep_last`` ingestions.
            This argument is mutually exclusive with:
              before
              after
        environ : mapping, optional
            The environment variables. Defaults to os.environ.

        Returns
        -------
        cleaned : set[str]
            The data paths of the runs that were removed.

        Raises
        ------
        BadClean
            Raised when ``before`` and or ``after`` are passed with
            ``keep_last``, or when ``keep_last`` is negative. This is a
            subclass of ``ValueError``.
        UnknownBundle
            Raised when the bundle's data directory does not exist.
        """
        try:
            all_runs = sorted(
                filter(
                    complement(pth.hidden),
                    os.listdir(pth.data_path([name], environ=environ)),
                ),
                key=from_bundle_ingest_dirname,
            )
        except OSError as e:
            if e.errno != errno.ENOENT:
                raise
            raise UnknownBundle(name)

        if ((before is not None or after is not None) and
                keep_last is not None):
            raise BadClean(before, after, keep_last)

        if keep_last is None:
            def should_clean(run_name):
                dt = from_bundle_ingest_dirname(run_name)
                return (
                    (before is not None and dt < before) or
                    (after is not None and dt > after)
                )

        elif keep_last >= 0:
            # ``all_runs`` is sorted oldest-first, so the runs to keep are
            # the first ``keep_last`` of the reversed sequence.
            last_n_dts = set(take(keep_last, reversed(all_runs)))

            def should_clean(run_name):
                return run_name not in last_n_dts
        else:
            raise BadClean(before, after, keep_last)

        cleaned = set()
        for run in all_runs:
            if should_clean(run):
                path = pth.data_path([name, run], environ=environ)
                shutil.rmtree(path)
                cleaned.add(path)

        return cleaned

    return BundleCore(
        bundles,
        register_bundle,
        register,
        unregister,
        ingest,
        load,
        clean,
    )
class BlazeEstimatesLoader(PipelineLoader):
    """An abstract pipeline loader for the estimates datasets that loads
    data from a blaze expression.

    Parameters
    ----------
    expr : Expr
        The expression representing the data to load.
    columns : dict[str -> str]
        A dict mapping BoundColumn names to the associated names in `expr`.
    resources : dict, optional
        Mapping from the loadable terms of ``expr`` to actual data resources.
    odo_kwargs : dict, optional
        Extra keyword arguments to pass to odo when executing the expression.
    data_query_time : time, optional
        The time to use for the data query cutoff.
    data_query_tz : tzinfo or str
        The timezone to use for the data query cutoff.
    checkpoints : Expr, optional
        The expression representing checkpointed data to be used for faster
        forward-filling of data from `expr`.

    Notes
    -----
    The expression should have a tabular dshape of::

       Dim * {{
           {SID_FIELD_NAME}: int64,
           {TS_FIELD_NAME}: datetime,
           {FISCAL_YEAR_FIELD_NAME}: float64,
           {FISCAL_QUARTER_FIELD_NAME}: float64,
           {EVENT_DATE_FIELD_NAME}: datetime,
       }}

    And other dataset-specific fields, where each row of the table is a
    record including the sid to identify the company, the timestamp where we
    learned about the announcement, and the date of the event.

    If the '{TS_FIELD_NAME}' field is not included it is assumed that we
    start the backtest with knowledge of all announcements.

    ``load_adjusted_array`` delegates to ``self.loader``, which is not
    defined in this class.
    """
    # Substitute the real field names into the docstring template above.
    __doc__ = __doc__.format(
        SID_FIELD_NAME=SID_FIELD_NAME,
        TS_FIELD_NAME=TS_FIELD_NAME,
        FISCAL_YEAR_FIELD_NAME=FISCAL_YEAR_FIELD_NAME,
        FISCAL_QUARTER_FIELD_NAME=FISCAL_QUARTER_FIELD_NAME,
        EVENT_DATE_FIELD_NAME=EVENT_DATE_FIELD_NAME,
    )

    @preprocess(data_query_tz=optionally(ensure_timezone))
    def __init__(self,
                 expr,
                 columns,
                 resources=None,
                 odo_kwargs=None,
                 data_query_time=None,
                 data_query_tz=None,
                 checkpoints=None):
        expr_dshape = expr.dshape
        if not istabular(expr_dshape):
            raise ValueError(
                'expression dshape must be tabular, got: %s' % expr_dshape,
            )

        # Narrow the expression to the fields the estimates machinery needs
        # before binding it to concrete resources.
        needed_fields = list(required_estimates_fields(columns))
        narrowed_expr = expr[needed_fields]
        self._expr = bind_expression_to_resources(narrowed_expr, resources)

        self._columns = columns
        if odo_kwargs is None:
            odo_kwargs = {}
        self._odo_kwargs = odo_kwargs

        check_data_query_args(data_query_time, data_query_tz)
        self._data_query_time = data_query_time
        self._data_query_tz = data_query_tz
        self._checkpoints = checkpoints

    def load_adjusted_array(self, columns, dates, assets, mask):
        """Load raw data for ``columns`` and delegate to ``self.loader``.

        Only the metadata columns plus the expression fields backing the
        requested columns are queried.
        """
        # Map each requested column name to its field name in the expression.
        name_map = {
            column.name: self._columns[column.name] for column in columns
        }
        query_fields = sorted(metadata_columns.union(name_map.values()))

        raw = load_raw_data(
            assets,
            dates,
            self._data_query_time,
            self._data_query_tz,
            self._expr[query_fields],
            self._odo_kwargs,
            checkpoints=self._checkpoints,
        )

        delegate = self.loader(raw, name_map)
        return delegate.load_adjusted_array(columns, dates, assets, mask)

    @property
    def columns(self):
        """The column-name mapping passed to the constructor."""
        return self._columns
class BlazeLoader(dict):
    """A PipelineLoader for datasets constructed with ``from_blaze``.

    Parameters
    ----------
    dsmap : mapping, optional
        An initial mapping of datasets to ``ExprData`` objects.
        NOTE: Further mutations to this map will not be reflected by this
        object.
    data_query_time : time, optional
        The time to use for the data query cutoff.
    data_query_tz : tzinfo or str, optional
        The timezone to use for the data query cutoff.
    pool : Pool, optional
        The pool to use to run blaze queries concurrently. This object must
        support ``imap_unordered``, ``apply`` and ``apply_async`` methods.

    Attributes
    ----------
    pool : Pool
        The pool to use to run blaze queries concurrently. This object must
        support ``imap_unordered``, ``apply`` and ``apply_async`` methods.
        It is possible to change the pool after the loader has been
        constructed. This allows us to set a new pool for the
        ``global_loader`` like: ``global_loader.pool = multiprocessing.Pool(4)``.

    See Also
    --------
    :class:`catalyst.utils.pool.SequentialPool`
    :class:`multiprocessing.Pool`
    """
    @preprocess(data_query_tz=optionally(ensure_timezone))
    def __init__(self,
                 dsmap=None,
                 data_query_time=None,
                 data_query_tz=None,
                 pool=SequentialPool()):
        # Copy the initial mapping into this dict; later changes to
        # ``dsmap`` are intentionally not tracked.
        self.update(dsmap or {})

        check_data_query_args(data_query_time, data_query_tz)
        self._data_query_time = data_query_time
        self._data_query_tz = data_query_tz

        # explicitly public
        self.pool = pool

    @classmethod
    @memoize(cache=WeakKeyDictionary())
    def global_instance(cls):
        """Return a memoized, argument-free instance of ``cls``."""
        return cls()

    def __hash__(self):
        # dict subclasses are unhashable by default; hash by identity.
        return id(self)

    def __call__(self, column):
        """Return this loader if ``column``'s dataset is registered.

        Raises
        ------
        KeyError
            Raised when ``column.dataset`` is not a key of this loader.
        """
        if column.dataset in self:
            return self
        raise KeyError(column)

    def __repr__(self):
        return '<%s: %s>' % (
            type(self).__name__,
            super(BlazeLoader, self).__repr__(),
        )

    def load_adjusted_array(self, columns, dates, assets, mask):
        """Load ``columns`` by dataset through the pool and merge the results.

        Columns are grouped by dataset; each group is handed to
        ``_load_dataset`` via ``pool.imap_unordered`` and the per-dataset
        dictionaries are merged into one.
        """
        return merge(
            self.pool.imap_unordered(
                partial(self._load_dataset, dates, assets, mask),
                itervalues(groupby(getdataset, columns)),
            ),
        )

    def _load_dataset(self, dates, assets, mask, columns):
        """Load all ``columns`` of a single dataset.

        Parameters
        ----------
        dates, assets, mask
            The labels and mask forwarded from ``load_adjusted_array``.
        columns : iterable
            Columns that must all belong to the same dataset.

        Returns
        -------
        arrays : dict
            Mapping from each column to its ``AdjustedArray``.

        Raises
        ------
        AssertionError
            Raised when ``columns`` span more than one dataset (or none).
        """
        try:
            (dataset,) = set(map(getdataset, columns))
        except ValueError:
            raise AssertionError('all columns must come from the same dataset')

        expr, deltas, checkpoints, odo_kwargs, apply_deltas_adjustments = self[
            dataset
        ]
        have_sids = (dataset.ndim == 2)
        asset_idx = pd.Series(index=assets, data=np.arange(len(assets)))
        assets = list(map(int, assets))  # coerce from numpy.int64
        added_query_fields = {AD_FIELD_NAME, TS_FIELD_NAME} | (
            {SID_FIELD_NAME} if have_sids else set()
        )
        requested_columns = set(map(getname, columns))
        colnames = sorted(added_query_fields | requested_columns)

        data_query_time = self._data_query_time
        data_query_tz = self._data_query_tz
        lower_dt, upper_dt = normalize_data_query_bounds(
            dates[0],
            dates[-1],
            data_query_time,
            data_query_tz,
        )

        def collect_expr(e, lower):
            """Materialize the expression as a dataframe.

            Parameters
            ----------
            e : Expr
                The baseline or deltas expression.
            lower : datetime
                The lower time bound to query.

            Returns
            -------
            result : pd.DataFrame
                The resulting dataframe.

            Notes
            -----
            This can return more data than needed. The in memory reindex will
            handle this.
            """
            predicate = e[TS_FIELD_NAME] <= upper_dt
            if lower is not None:
                predicate &= e[TS_FIELD_NAME] >= lower

            return odo(e[predicate][colnames], pd.DataFrame, **odo_kwargs)

        lower, materialized_checkpoints = get_materialized_checkpoints(
            checkpoints, colnames, lower_dt, odo_kwargs
        )

        # Kick off the baseline query first so it can overlap with the
        # (blocking) deltas query below.
        deferred_expr = self.pool.apply_async(collect_expr, (expr, lower))
        materialized_deltas = (
            self.pool.apply(collect_expr, (deltas, lower))
            if deltas is not None else
            pd.DataFrame(columns=colnames)
        )

        # Always resolve the deferred result. Previously ``.get()`` was only
        # called inside the checkpoints branch, so with no checkpoints frame
        # the async handle itself would have been used as a DataFrame below.
        materialized_expr = deferred_expr.get()
        if materialized_checkpoints is not None:
            materialized_expr = pd.concat(
                (
                    materialized_checkpoints,
                    materialized_expr,
                ),
                ignore_index=True,
                copy=False,
            )

        # It's not guaranteed that assets returned by the engine will contain
        # all sids from the deltas table; filter out such mismatches here.
        if not materialized_deltas.empty and have_sids:
            materialized_deltas = materialized_deltas[
                materialized_deltas[SID_FIELD_NAME].isin(assets)
            ]

        if data_query_time is not None:
            for m in (materialized_expr, materialized_deltas):
                m.loc[:, TS_FIELD_NAME] = m.loc[
                    :, TS_FIELD_NAME
                ].astype('datetime64[ns]')

                normalize_timestamp_to_query_time(
                    m,
                    data_query_time,
                    data_query_tz,
                    inplace=True,
                    ts_field=TS_FIELD_NAME,
                )

        # Inline the deltas that changed our most recently known value.
        # Also, we reindex by the dates to create a dense representation of
        # the data.
        sparse_output, non_novel_deltas = overwrite_novel_deltas(
            materialized_expr,
            materialized_deltas,
            dates,
        )
        # If we ever have cases where we find out about multiple asof_dates'
        # data on the same TS, we want to make sure that last_in_date_group
        # selects the correct last asof_date's value.
        sparse_output.sort_values(AD_FIELD_NAME, inplace=True)
        non_novel_deltas.sort_values(AD_FIELD_NAME, inplace=True)
        if AD_FIELD_NAME not in requested_columns:
            sparse_output.drop(AD_FIELD_NAME, axis=1, inplace=True)

        sparse_deltas = last_in_date_group(non_novel_deltas,
                                           dates,
                                           assets,
                                           reindex=False,
                                           have_sids=have_sids)
        dense_output = last_in_date_group(sparse_output,
                                          dates,
                                          assets,
                                          reindex=True,
                                          have_sids=have_sids)
        ffill_across_cols(dense_output, columns, {c.name: c.name
                                                  for c in columns})

        # By default, no non-novel deltas are applied.
        def no_adjustments_from_deltas(*args):
            return {}

        adjustments_from_deltas = no_adjustments_from_deltas
        if have_sids:
            if apply_deltas_adjustments:
                adjustments_from_deltas = adjustments_from_deltas_with_sids
            column_view = identity
        else:
            # If we do not have sids, use the column view to make a single
            # column vector which is unassociated with any assets.
            column_view = op.itemgetter(np.s_[:, np.newaxis])
            if apply_deltas_adjustments:
                adjustments_from_deltas = adjustments_from_deltas_no_sids
            mask = np.full(
                shape=(len(mask), 1), fill_value=True, dtype=bool_dtype,
            )

        return {
            column: AdjustedArray(
                column_view(
                    dense_output[column.name].values.astype(column.dtype),
                ),
                mask,
                adjustments_from_deltas(
                    dates,
                    sparse_output[TS_FIELD_NAME].values,
                    column_idx,
                    column.name,
                    asset_idx,
                    sparse_deltas,
                ),
                column.missing_value,
            )
            for column_idx, column in enumerate(columns)
        }
def _make_bundle_core():
    """Create a family of data bundle functions that read from the same
    bundle mapping.

    Returns
    -------
    bundles : mappingproxy
        The mapping of bundles to bundle payloads.
    register_bundle : callable
        The function which instantiates a bundle class and registers it in
        the ``bundles`` mapping.
    register : callable
        The function which registers new bundles in the ``bundles`` mapping.
    unregister : callable
        The function which deregisters bundles from the ``bundles`` mapping.
    ingest : callable
        The function which downloads and writes data for a given data bundle.
    load : callable
        The function which loads the ingested bundles back into memory.
    clean : callable
        The function which cleans up data written with ``ingest``.
    """
    _bundles = {}  # the registered bundles
    # Expose _bundles through a proxy so that users cannot mutate this
    # accidentally. Users may go through `register` to update this which will
    # warn when trampling another bundle.
    bundles = mappingproxy(_bundles)

    def register_bundle(bundle_cls,
                        asset_filter=None,
                        start_session=None,
                        end_session=None,
                        create_writers=True):
        """Instantiate ``bundle_cls`` and register its ingest function.

        Parameters
        ----------
        bundle_cls : callable
            Called as ``bundle_cls(asset_filter=asset_filter)``. The result
            must expose ``name``, ``ingest``, ``calendar_name`` and
            ``minutes_per_day``.
        asset_filter : optional
            Forwarded to ``bundle_cls``.
        start_session : pd.Timestamp, optional
            Forwarded to ``register``.
        end_session : pd.Timestamp, optional
            Forwarded to ``register``.
        create_writers : bool, optional
            Forwarded to ``register``.

        Returns
        -------
        f : callable
            The registered ingest function (the return value of
            ``register``).
        """
        bundle = bundle_cls(asset_filter=asset_filter)
        return register(
            bundle.name,
            bundle.ingest,
            calendar_name=bundle.calendar_name,
            minutes_per_day=bundle.minutes_per_day,
            start_session=start_session,
            end_session=end_session,
            create_writers=create_writers,
        )

    @curry
    def register(name,
                 f,
                 calendar_name='OPEN',
                 start_session=None,
                 end_session=None,
                 minutes_per_day=1440,
                 create_writers=True):
        """Register a data bundle ingest function.

        Parameters
        ----------
        name : str
            The name of the bundle.
        f : callable
            The ingest function. This function will be passed, positionally
            and in this order:

              environ : mapping
                  The environment this is being run with.
              asset_db_writer : AssetDBWriter
                  The asset db writer to write into.
              minute_bar_writer : BcolzMinuteBarWriter
                  The minute bar writer to write into.
              daily_bar_writer : BcolzDailyBarWriter
                  The daily bar writer to write into.
              adjustment_writer : SQLiteAdjustmentWriter
                  The adjustment db writer to write into.
              calendar : catalyst.utils.calendars.TradingCalendar
                  The trading calendar to ingest for.
              start_session : pd.Timestamp
                  The first session of data to ingest.
              end_session : pd.Timestamp
                  The last session of data to ingest.
              cache : DataFrameCache
                  A mapping object to temporarily store dataframes.
                  This should be used to cache intermediates in case the load
                  fails. This will be automatically cleaned up after a
                  successful load.
              show_progress : bool
                  Show the progress for the current load where possible.
              is_compile : bool
                  The ``is_compile`` flag given to ``ingest``, forwarded
                  unchanged.
              output_dir : str
                  The data directory created for this ingestion.

            All writers are ``None`` when ``create_writers`` is False.
        calendar_name : str, optional
            The name of a calendar used to align bundle data.
            Default is 'OPEN'.
        start_session : pd.Timestamp, optional
            The first session for which we want data. If not provided,
            or if the date lies outside the range supported by the
            calendar, the first_session of the calendar is used.
        end_session : pd.Timestamp, optional
            The last session for which we want data. If not provided,
            or if the date lies outside the range supported by the
            calendar, the last_session of the calendar is used.
        minutes_per_day : int, optional
            The number of minutes in each normal trading day.
        create_writers : bool, optional
            Should the ingest machinery create the writers for the ingest
            function. This can be disabled as an optimization for cases where
            they are not needed, like the ``quantopian-quandl`` bundle.

        Returns
        -------
        f : callable
            The ingest function, unchanged, so that ``register`` can be used
            as a decorator.

        Notes
        -----
        This function may be used as a decorator, for example:

        .. code-block:: python

           @register('quandl')
           def quandl_ingest_function(...):
               ...

        See Also
        --------
        catalyst.data.bundles.bundles
        """
        if name in bundles:
            warnings.warn(
                'Overwriting bundle with name %r' % name,
                stacklevel=3,
            )

        # NOTE: We don't eagerly compute calendar values here because
        # `register` is called at module scope in catalyst, and creating a
        # calendar currently takes between 0.5 and 1 seconds, which causes a
        # noticeable delay on the catalyst CLI.
        _bundles[name] = RegisteredBundle(
            calendar_name=calendar_name,
            start_session=start_session,
            end_session=end_session,
            minutes_per_day=minutes_per_day,
            ingest=f,
            create_writers=create_writers,
        )
        return f

    def unregister(name):
        """Unregister a bundle.

        Parameters
        ----------
        name : str
            The name of the bundle to unregister.

        Raises
        ------
        UnknownBundle
            Raised when no bundle has been registered with the given name.

        See Also
        --------
        catalyst.data.bundles.bundles
        """
        try:
            del _bundles[name]
        except KeyError:
            raise UnknownBundle(name)

    def ingest(name,
               environ=os.environ,
               timestamp=None,
               assets_versions=(),
               show_progress=False,
               is_compile=False):
        """Ingest data for a given bundle.

        Parameters
        ----------
        name : str
            The name of the bundle.
        environ : mapping, optional
            The environment variables. By default this is os.environ.
        timestamp : datetime, optional
            The timestamp to use for the load.
            By default this is the current time.
        assets_versions : Iterable[int], optional
            Versions of the assets db to which to downgrade.
        show_progress : bool, optional
            Tell the ingest function to display the progress where possible.
        is_compile : bool, optional
            Forwarded unchanged to the bundle's ingest function.

        Raises
        ------
        UnknownBundle
            Raised when no bundle has been registered with the given name.
        ValueError
            Raised when ``assets_versions`` is non-empty but the bundle was
            registered with ``create_writers=False``.
        """
        try:
            bundle = bundles[name]
        except KeyError:
            raise UnknownBundle(name)

        # Materialize the requested versions once. ``assets_versions`` is
        # documented as an arbitrary iterable; a one-shot iterator is always
        # truthy, so testing it directly would misreport an empty iterator
        # as "downgrades requested".
        versions = sorted(set(assets_versions), reverse=True)

        calendar = get_calendar(bundle.calendar_name)

        start_session = bundle.start_session
        end_session = bundle.end_session

        # Clamp the requested range to what the calendar supports.
        if start_session is None or start_session < calendar.first_session:
            start_session = calendar.first_session

        if end_session is None or end_session > calendar.last_session:
            end_session = calendar.last_session

        if timestamp is None:
            timestamp = pd.Timestamp.utcnow()
        timestamp = timestamp.tz_convert('utc').tz_localize(None)

        timestr = to_bundle_ingest_dirname(timestamp)
        cachepath = cache_path(name, environ=environ)
        pth.ensure_directory(pth.data_path([name, timestr], environ=environ))
        pth.ensure_directory(cachepath)
        with dataframe_cache(cachepath, clean_on_failure=False) as cache, \
                ExitStack() as stack:
            # we use `clean_on_failure=False` so that we don't purge the
            # cache directory if the load fails in the middle

            if bundle.create_writers:
                wd = stack.enter_context(working_dir(
                    pth.data_path([], environ=environ))
                )
                daily_bars_path = wd.ensure_dir(
                    *daily_relative(
                        name, timestr, environ=environ,
                    )
                )
                daily_bar_writer = BcolzDailyBarWriter(
                    daily_bars_path,
                    calendar,
                    start_session,
                    end_session,
                )
                # Do an empty write to ensure that the daily ctables exist
                # when we create the SQLiteAdjustmentWriter below. The
                # SQLiteAdjustmentWriter needs to open the daily ctables so
                # that it can compute the adjustment ratios for the dividends.
                daily_bar_writer.write(())

                minute_bar_writer = BcolzMinuteBarWriter(
                    wd.ensure_dir(*minute_relative(
                        name, timestr, environ=environ)
                    ),
                    calendar,
                    start_session,
                    end_session,
                    minutes_per_day=bundle.minutes_per_day,
                )
                assets_db_path = wd.getpath(*asset_db_relative(
                    name, timestr, environ=environ,
                ))
                asset_db_writer = AssetDBWriter(assets_db_path)

                adjustment_db_writer = stack.enter_context(
                    SQLiteAdjustmentWriter(
                        wd.getpath(*adjustment_db_relative(
                            name, timestr, environ=environ)),
                        BcolzDailyBarReader(daily_bars_path),
                        calendar.all_sessions,
                        overwrite=True,
                    )
                )
            else:
                daily_bar_writer = None
                minute_bar_writer = None
                asset_db_writer = None
                adjustment_db_writer = None
                if versions:
                    raise ValueError('Need to ingest a bundle that creates '
                                     'writers in order to downgrade the assets'
                                     ' db.')
            bundle.ingest(
                environ,
                asset_db_writer,
                minute_bar_writer,
                daily_bar_writer,
                adjustment_db_writer,
                calendar,
                start_session,
                end_session,
                cache,
                show_progress,
                is_compile,
                pth.data_path([name, timestr], environ=environ),
            )

            # ``wd`` and ``assets_db_path`` only exist when writers were
            # created; the ValueError above guarantees ``versions`` is empty
            # otherwise, so this loop never touches undefined names.
            for version in versions:
                version_path = wd.getpath(*asset_db_relative(
                    name, timestr, environ=environ, db_version=version,
                ))
                with working_file(version_path) as wf:
                    shutil.copy2(assets_db_path, wf.path)
                    downgrade(wf.path, version)

    def most_recent_data(bundle_name, timestamp, environ=None):
        """Get the path to the most recent ingestion for the given bundle.

        Parameters
        ----------
        bundle_name : str
            The name of the bundle to lookup.
        timestamp : datetime
            The timestamp reported in the error message when no data is
            found. NOTE: it is not used to filter the candidates here; the
            newest ingestion directory is returned regardless of its value.
        environ : dict, optional
            An environment dict to forward to catalyst_root.

        Returns
        -------
        path : str
            The data path of the newest non-hidden ingestion directory.

        Raises
        ------
        UnknownBundle
            Raised when no bundle has been registered with the given name.
        ValueError
            Raised when no ingestion can be found for the bundle.
        """
        if bundle_name not in bundles:
            raise UnknownBundle(bundle_name)

        try:
            candidates = os.listdir(
                pth.data_path([bundle_name], environ=environ),
            )
            return pth.data_path(
                [bundle_name,
                 max(
                     filter(complement(pth.hidden), candidates),
                     key=from_bundle_ingest_dirname,
                 )],
                environ=environ,
            )
        except (ValueError, OSError) as e:
            # A ValueError carries no ``errno`` (e.g. ``max`` of an empty
            # sequence), so it falls through to the friendly message below.
            # Any OSError other than "no such directory" is re-raised as-is.
            if getattr(e, 'errno', errno.ENOENT) != errno.ENOENT:
                raise
            raise ValueError(
                'no data for bundle {bundle!r} on or before {timestamp}\n'
                'maybe you need to run: $ catalyst ingest -b {bundle}'.format(
                    bundle=bundle_name,
                    timestamp=timestamp,
                ),
            )

    def load(name, environ=os.environ, timestamp=None):
        """Loads a previously ingested bundle.

        Parameters
        ----------
        name : str
            The name of the bundle.
        environ : mapping, optional
            The environment variables. Defaults to os.environ.
        timestamp : datetime, optional
            The timestamp of the data to lookup.
            Defaults to the current time.

        Returns
        -------
        bundle_data : BundleData
            The raw data readers for this bundle.
        """
        if timestamp is None:
            timestamp = pd.Timestamp.utcnow()
        timestr = most_recent_data(name, timestamp, environ=environ)
        return BundleData(
            asset_finder=AssetFinder(
                asset_db_path(name, timestr, environ=environ),
            ),
            minute_bar_reader=BcolzMinuteBarReader(
                minute_path(name, timestr, environ=environ),
            ),
            daily_bar_reader=BcolzDailyBarReader(
                daily_path(name, timestr, environ=environ),
            ),
            adjustment_reader=SQLiteAdjustmentReader(
                adjustment_db_path(name, timestr, environ=environ),
            ),
        )

    @preprocess(
        before=optionally(ensure_timestamp),
        after=optionally(ensure_timestamp),
    )
    def clean(name,
              before=None,
              after=None,
              keep_last=None,
              environ=os.environ):
        """Clean up data that was created with ``ingest`` or
        ``$ python -m catalyst ingest``

        Parameters
        ----------
        name : str
            The name of the bundle to remove data for.
        before : datetime, optional
            Remove data ingested before this date.
            This argument is mutually exclusive with: keep_last
        after : datetime, optional
            Remove data ingested after this date.
            This argument is mutually exclusive with: keep_last
        keep_last : int, optional
            Remove all but the last ``keep_last`` ingestions.
            This argument is mutually exclusive with:
              before
              after
        environ : mapping, optional
            The environment variables. Defaults to os.environ.

        Returns
        -------
        cleaned : set[str]
            The data paths of the runs that were removed.

        Raises
        ------
        BadClean
            Raised when ``before`` and or ``after`` are passed with
            ``keep_last``, or when ``keep_last`` is negative. This is a
            subclass of ``ValueError``.
        UnknownBundle
            Raised when the bundle's data directory does not exist.
        """
        try:
            all_runs = sorted(
                filter(
                    complement(pth.hidden),
                    os.listdir(pth.data_path([name], environ=environ)),
                ),
                key=from_bundle_ingest_dirname,
            )
        except OSError as e:
            if e.errno != errno.ENOENT:
                raise
            raise UnknownBundle(name)

        if ((before is not None or after is not None) and
                keep_last is not None):
            raise BadClean(before, after, keep_last)

        if keep_last is None:
            def should_clean(run_name):
                dt = from_bundle_ingest_dirname(run_name)
                return (
                    (before is not None and dt < before) or
                    (after is not None and dt > after)
                )

        elif keep_last >= 0:
            # ``all_runs`` is sorted oldest-first, so the runs to keep are
            # the first ``keep_last`` of the reversed sequence.
            last_n_dts = set(take(keep_last, reversed(all_runs)))

            def should_clean(run_name):
                return run_name not in last_n_dts
        else:
            raise BadClean(before, after, keep_last)

        cleaned = set()
        for run in all_runs:
            if should_clean(run):
                path = pth.data_path([name, run], environ=environ)
                shutil.rmtree(path)
                cleaned.add(path)

        return cleaned

    return BundleCore(
        bundles,
        register_bundle,
        register,
        unregister,
        ingest,
        load,
        clean,
    )