from uuid import uuid4

import numpy as np
from numpy import float64, ix_
import pandas as pd

# NOTE: ``as_column``, ``explode``, and ``AdjustedArray`` are project helpers
# used below; their import paths are omitted here rather than guessed.


def make_trade_data_for_asset_info(dates,
                                   asset_info,
                                   price_start,
                                   price_step_by_date,
                                   price_step_by_sid,
                                   volume_start,
                                   volume_step_by_date,
                                   volume_step_by_sid,
                                   frequency,
                                   writer=None):
    """
    Convert the asset info dataframe into a dataframe of trade data for each
    sid, and write to the writer if provided. Write NaNs for locations where
    assets did not exist. Return a dict of the dataframes, keyed by sid.
    """
    trade_data = {}
    sids = asset_info.index

    price_sid_deltas = np.arange(len(sids), dtype=float64) * price_step_by_sid
    price_date_deltas = (np.arange(len(dates), dtype=float64) *
                         price_step_by_date)
    prices = (price_sid_deltas + as_column(price_date_deltas)) + price_start

    volume_sid_deltas = np.arange(len(sids)) * volume_step_by_sid
    volume_date_deltas = np.arange(len(dates)) * volume_step_by_date
    volumes = volume_sid_deltas + as_column(volume_date_deltas) + volume_start

    for j, sid in enumerate(sids):
        start_date, end_date = asset_info.loc[sid, ['start_date', 'end_date']]
        # Normalize here so that we still generate non-NaN values on the
        # minutes for an asset's last trading day.
        for i, date in enumerate(dates.normalize()):
            if not (start_date <= date <= end_date):
                prices[i, j] = 0
                volumes[i, j] = 0

        df = pd.DataFrame(
            {
                "open": prices[:, j],
                "high": prices[:, j],
                "low": prices[:, j],
                "close": prices[:, j],
                "volume": volumes[:, j],
            },
            index=dates,
        )

        if writer:
            writer.write_sid(sid, df)

        trade_data[sid] = df

    return trade_data
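# A minimal sketch (not part of the library) of the broadcasting pattern used
# above: ``as_column`` is assumed to reshape a 1-D array into an (N, 1)
# column, so adding it to a 1-D row of per-sid deltas produces the full
# (n_dates, n_sids) grid of prices in one vectorized step.
def _sketch_price_grid_broadcast():
    import numpy as np

    def as_column_sketch(a):
        # Hypothetical stand-in for the project's ``as_column`` helper.
        return a[:, np.newaxis]

    sid_deltas = np.arange(3, dtype=np.float64) * 10.0    # one step per sid
    date_deltas = np.arange(4, dtype=np.float64) * 1.0    # one step per date
    grid = sid_deltas + as_column_sketch(date_deltas) + 100.0
    # grid.shape == (4, 3); grid[i, j] == 100 + date_deltas[i] + sid_deltas[j]
    return grid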
def load_adjusted_array(self, columns, dates, assets, mask):
    """
    Load data from our stored baseline.
    """
    column = self.column
    if len(columns) != 1:
        raise ValueError(
            "Can't load multiple columns with DataFrameLoader"
        )
    elif columns[0] != column:
        raise ValueError("Can't load unknown column %s" % columns[0])

    date_indexer = self.dates.get_indexer(dates)
    assets_indexer = self.assets.get_indexer(assets)

    # Boolean arrays with True on matched entries.
    good_dates = (date_indexer != -1)
    good_assets = (assets_indexer != -1)

    return {
        column: AdjustedArray(
            # Pull out requested columns/rows from our baseline data.
            data=self.baseline[ix_(date_indexer, assets_indexer)],
            # Mask out requested columns/rows that didn't match.
            mask=(good_assets & as_column(good_dates)) & mask,
            adjustments=self.format_adjustments(dates, assets),
            missing_value=column.missing_value,
        ),
    }
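# A minimal sketch (illustrative only) of the masking logic above:
# ``Index.get_indexer`` returns -1 for labels it does not contain, so the
# "good" boolean arrays mark real matches, and broadcasting the date column
# against the asset row yields a 2-D validity mask for the requested window.
def _sketch_indexer_mask():
    import numpy as np
    import pandas as pd

    baseline_dates = pd.date_range('2014-01-01', periods=3)
    requested = pd.DatetimeIndex(['2014-01-02', '2014-01-05'])  # second date missing

    date_indexer = baseline_dates.get_indexer(requested)   # array([1, -1])
    good_dates = (date_indexer != -1)                       # array([True, False])
    good_assets = np.array([True, True, False])             # hypothetical asset matches

    # Same shape as the (dates x assets) output window; rows for missing
    # dates and columns for missing assets are all False.
    return good_assets & good_dates[:, np.newaxis]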
def lifetimes(self, dates, include_start_date):
    """
    Compute a DataFrame representing asset lifetimes for the specified date
    range.

    Parameters
    ----------
    dates : pd.DatetimeIndex
        The dates for which to compute lifetimes.
    include_start_date : bool
        Whether or not to count the asset as alive on its start_date.

        This is useful in a backtesting context where `lifetimes` is being
        used to signify "do I have data for this asset as of the morning of
        this date?"  For many financial metrics (e.g. daily close), data
        isn't available for an asset until the end of the asset's first day.

    Returns
    -------
    lifetimes : pd.DataFrame
        A frame of dtype bool with `dates` as index and an Int64Index of
        assets as columns.  The value at `lifetimes.loc[date, asset]` will
        be True iff `asset` existed on `date`.  If `include_start_date` is
        False, then `lifetimes.loc[date, asset]` will be False when
        date == asset.start_date.

    See Also
    --------
    numpy.putmask
    catalyst.pipeline.engine.SimplePipelineEngine._compute_root_mask
    """
    # This is a less than ideal place to do this, because if someone adds
    # assets to the finder after we've touched lifetimes we won't have
    # those new assets available.  Mutability is not my favorite
    # programming feature.
    if self._asset_lifetimes is None:
        self._asset_lifetimes = self._compute_asset_lifetimes()
    lifetimes = self._asset_lifetimes

    raw_dates = as_column(dates.asi8)
    if include_start_date:
        mask = lifetimes.start <= raw_dates
    else:
        mask = lifetimes.start < raw_dates
    mask &= (raw_dates <= lifetimes.end)

    return pd.DataFrame(mask, index=dates, columns=lifetimes.sid)
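# A minimal sketch (not the library implementation) of the lifetimes mask
# built above: ``DatetimeIndex.asi8`` gives int64 nanosecond timestamps, and
# comparing them as a column against per-asset start/end arrays broadcasts to
# an (n_dates, n_assets) boolean frame.
def _sketch_lifetimes_mask():
    import numpy as np
    import pandas as pd

    dates = pd.date_range('2014-01-01', periods=4)
    starts = pd.to_datetime(['2014-01-01', '2014-01-03']).asi8   # hypothetical per-asset starts
    ends = pd.to_datetime(['2014-01-02', '2014-01-10']).asi8     # hypothetical per-asset ends

    raw_dates = dates.asi8[:, np.newaxis]                        # (4, 1) column of int64 dates
    alive = (starts <= raw_dates) & (raw_dates <= ends)          # (4, 2) bool grid
    return pd.DataFrame(alive, index=dates, columns=[1, 2])      # sids as columns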
def run_pipeline(self, pipeline, start_date, end_date):
    """
    Compute a pipeline.

    The algorithm implemented here can be broken down into the following
    stages:

    0. Build a dependency graph of all terms in `pipeline`.  Topologically
       sort the graph to determine an order in which we can compute the
       terms.

    1. Ask our AssetFinder for a "lifetimes matrix", which should contain,
       for each date between start_date and end_date, a boolean value for
       each known asset indicating whether the asset existed on that date.

    2. Compute each term in the dependency order determined in (0), caching
       the results in a dictionary so that they can be fed into future
       terms.

    3. For each date, determine the number of assets passing
       pipeline.screen.  The sum, N, of all these values is the total
       number of rows in our output frame, so we pre-allocate an output
       array of length N for each factor in `terms`.

    4. Fill in the arrays allocated in (3) by copying computed values from
       our output cache into the corresponding rows.

    5. Stick the values computed in (4) into a DataFrame and return it.

    Step 0 is performed by ``Pipeline.to_graph``.
    Step 1 is performed in ``SimplePipelineEngine._compute_root_mask``.
    Step 2 is performed in ``SimplePipelineEngine.compute_chunk``.
    Steps 3, 4, and 5 are performed in ``SimplePipelineEngine._to_narrow``.

    Parameters
    ----------
    pipeline : catalyst.pipeline.Pipeline
        The pipeline to run.
    start_date : pd.Timestamp
        Start date of the computed matrix.
    end_date : pd.Timestamp
        End date of the computed matrix.

    Returns
    -------
    result : pd.DataFrame
        A frame of computed results.

        The ``result`` columns correspond to the entries of
        `pipeline.columns`, which should be a dictionary mapping strings to
        instances of :class:`catalyst.pipeline.term.Term`.

        For each date between ``start_date`` and ``end_date``, ``result``
        will contain a row for each asset that passed `pipeline.screen`.
        A screen of ``None`` indicates that a row should be returned for
        each asset that existed each day.

    See Also
    --------
    :meth:`catalyst.pipeline.engine.PipelineEngine.run_pipeline`
    :meth:`catalyst.pipeline.engine.PipelineEngine.run_chunked_pipeline`
    """
    if end_date < start_date:
        raise ValueError(
            "start_date must be before or equal to end_date\n"
            "start_date=%s, end_date=%s" % (start_date, end_date)
        )

    screen_name = uuid4().hex
    graph = pipeline.to_execution_plan(
        screen_name,
        self._root_mask_term,
        self._calendar,
        start_date,
        end_date,
    )
    extra_rows = graph.extra_rows[self._root_mask_term]
    root_mask = self._compute_root_mask(start_date, end_date, extra_rows)
    dates, assets, root_mask_values = explode(root_mask)

    initial_workspace = self._populate_initial_workspace(
        {
            self._root_mask_term: root_mask_values,
            self._root_mask_dates_term: as_column(dates.values),
        },
        self._root_mask_term,
        graph,
        dates,
        assets,
    )

    results = self.compute_chunk(
        graph,
        dates,
        assets,
        initial_workspace,
    )

    return self._to_narrow(
        graph.outputs,
        results,
        results.pop(screen_name),
        dates[extra_rows:],
        assets,
    )
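# A minimal sketch (illustrative only) of the ``extra_rows`` bookkeeping used
# above: terms with lookback windows need rows before ``start_date``, so the
# root mask is computed over ``extra_rows`` extra leading dates and the final
# output is trimmed back to the requested range via ``dates[extra_rows:]``.
def _sketch_extra_rows_trim():
    import pandas as pd

    all_dates = pd.date_range('2016-01-01', periods=10)  # hypothetical calendar slice
    extra_rows = 3                                        # e.g. a 4-day window needs 3 prior rows
    requested_dates = all_dates[extra_rows:]              # what the caller actually receives
    assert len(requested_dates) == len(all_dates) - extra_rows
    return requested_dates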