def _compute_root_mask(self, start_date, end_date, extra_rows): """ Compute a lifetimes matrix from our AssetFinder, then drop columns that didn't exist at all during the query dates. Parameters ---------- start_date : pd.Timestamp Base start date for the matrix. end_date : pd.Timestamp End date for the matrix. extra_rows : int Number of extra rows to compute before `start_date`. Extra rows are needed by terms like moving averages that require a trailing window of data. Returns ------- lifetimes : pd.DataFrame Frame of dtype `bool` containing dates from `extra_rows` days before `start_date`, continuing through to `end_date`. The returned frame contains as columns all assets in our AssetFinder that existed for at least one day between `start_date` and `end_date`. """ calendar = self._calendar finder = self._finder start_idx, end_idx = self._calendar.slice_locs(start_date, end_date) if start_idx < extra_rows: raise NoFurtherDataError.from_lookback_window( initial_message="Insufficient data to compute Pipeline:", first_date=calendar[0], lookback_start=start_date, lookback_length=extra_rows, ) # Build lifetimes matrix reaching back to `extra_rows` days before # `start_date.` lifetimes = finder.lifetimes( calendar[start_idx - extra_rows:end_idx], include_start_date=False ) assert lifetimes.index[extra_rows] == start_date assert lifetimes.index[-1] == end_date if not lifetimes.columns.unique: columns = lifetimes.columns duplicated = columns[columns.duplicated()].unique() raise AssertionError("Duplicated sids: %d" % duplicated) # Filter out columns that didn't exist between the requested start and # end dates. existed = lifetimes.iloc[extra_rows:].any() ret = lifetimes.loc[:, existed] shape = ret.shape assert shape[0] * shape[1] != 0, 'root mask cannot be empty' return ret
def _compute_root_mask(self, start_date, end_date, extra_rows): """ Compute a lifetimes matrix from our AssetFinder, then drop columns that didn't exist at all during the query dates. Parameters ---------- start_date : pd.Timestamp Base start date for the matrix. end_date : pd.Timestamp End date for the matrix. extra_rows : int Number of extra rows to compute before `start_date`. Extra rows are needed by terms like moving averages that require a trailing window of data. Returns ------- lifetimes : pd.DataFrame Frame of dtype `bool` containing dates from `extra_rows` days before `start_date`, continuing through to `end_date`. The returned frame contains as columns all assets in our AssetFinder that existed for at least one day between `start_date` and `end_date`. """ calendar = self._calendar finder = self._finder start_idx, end_idx = self._calendar.slice_locs(start_date, end_date) if start_idx < extra_rows: raise NoFurtherDataError.from_lookback_window( initial_message="Insufficient data to compute Pipeline:", first_date=calendar[0], lookback_start=start_date, lookback_length=extra_rows, ) # Build lifetimes matrix reaching back to `extra_rows` days before # `start_date.` lifetimes = finder.lifetimes( calendar[start_idx - extra_rows:end_idx], include_start_date=False ) if lifetimes.index[extra_rows] != start_date: raise ValueError( 'The first date of the lifetimes matrix does not match the' ' start date of the pipeline. Did you forget to align the' ' start_date to the trading calendar?' ) if lifetimes.index[-1] != end_date: raise ValueError( 'The last date of the lifetimes matrix does not match the' ' start date of the pipeline. Did you forget to align the' ' end_date to the trading calendar?' 
) if not lifetimes.columns.unique: columns = lifetimes.columns duplicated = columns[columns.duplicated()].unique() raise AssertionError("Duplicated sids: %d" % duplicated) # Filter out columns that didn't exist from the farthest look back # window through the end of the requested dates. existed = lifetimes.any() ret = lifetimes.loc[:, existed] shape = ret.shape if shape[0] * shape[1] == 0: raise ValueError( "Found only empty asset-days between {} and {}.\n" "This probably means that either your asset db is out of date" " or that you're trying to run a Pipeline during a period with" " no market days.".format(start_date, end_date), ) return ret
def _compute_root_mask(self, domain, start_date, end_date, extra_rows): """ Compute a lifetimes matrix from our AssetFinder, then drop columns that didn't exist at all during the query dates. Parameters ---------- domain : zipline.pipeline.domain.Domain Domain for which we're computing a pipeline. start_date : pd.Timestamp Base start date for the matrix. end_date : pd.Timestamp End date for the matrix. extra_rows : int Number of extra rows to compute before `start_date`. Extra rows are needed by terms like moving averages that require a trailing window of data. Returns ------- lifetimes : pd.DataFrame Frame of dtype `bool` containing dates from `extra_rows` days before `start_date`, continuing through to `end_date`. The returned frame contains as columns all assets in our AssetFinder that existed for at least one day between `start_date` and `end_date`. """ sessions = domain.all_sessions() if start_date not in sessions: raise ValueError( "Pipeline start date ({}) is not a trading session for " "domain {}.".format(start_date, domain)) elif end_date not in sessions: raise ValueError( "Pipeline end date {} is not a trading session for " "domain {}.".format(end_date, domain)) start_idx, end_idx = sessions.slice_locs(start_date, end_date) if start_idx < extra_rows: raise NoFurtherDataError.from_lookback_window( initial_message="Insufficient data to compute Pipeline:", first_date=sessions[0], lookback_start=start_date, lookback_length=extra_rows, ) # NOTE: This logic should probably be delegated to the domain once we # start adding more complex domains. 
# # Build lifetimes matrix reaching back to `extra_rows` days before # `start_date.` finder = self._finder lifetimes = finder.lifetimes( sessions[start_idx - extra_rows:end_idx], include_start_date=False, country_codes=(domain.country_code, ), ) if not lifetimes.columns.unique: columns = lifetimes.columns duplicated = columns[columns.duplicated()].unique() raise AssertionError("Duplicated sids: %d" % duplicated) # Filter out columns that didn't exist from the farthest look back # window through the end of the requested dates. existed = lifetimes.any() ret = lifetimes.loc[:, existed] num_assets = ret.shape[1] if num_assets == 0: raise ValueError( "Failed to find any assets with country_code {!r} that traded " "between {} and {}.\n" "This probably means that your asset db is old or that it has " "incorrect country/exchange metadata.".format( domain.country_code, start_date, end_date, )) return ret
def compute_extra_rows(self,
                       all_dates,
                       start_date,
                       end_date,
                       min_extra_rows):
    """
    Ensure that min_extra_rows pushes us back to a computation date.

    Parameters
    ----------
    all_dates : pd.DatetimeIndex
        The trading sessions against which ``self`` will be computed.
    start_date : pd.Timestamp
        The first date for which final output is requested.
    end_date : pd.Timestamp
        The last date for which final output is requested.  Not read by
        this method.
    min_extra_rows : int
        The minimum number of extra rows required of ``self``, as
        determined by other terms that depend on ``self``.

    Returns
    -------
    extra_rows : int
        The number of extra rows to compute.  This will be the minimum
        number of rows required to make our computed start_date fall on a
        recomputation date.

    Raises
    ------
    NoFurtherDataError
        If stepping back ``min_extra_rows`` from ``start_date`` would land
        before the first entry of ``all_dates``.
    ValueError
        If looking up ``start_date`` in ``all_dates`` raises ``KeyError``.
    """
    try:
        requested_pos = all_dates.get_loc(start_date)
        current_start_pos = requested_pos - min_extra_rows
        if current_start_pos < 0:
            raise NoFurtherDataError.from_lookback_window(
                initial_message="Insufficient data to compute Pipeline:",
                first_date=all_dates[0],
                lookback_start=start_date,
                lookback_length=min_extra_rows,
            )
    except KeyError:
        before, after = nearest_unequal_elements(all_dates, start_date)
        template = (
            "Pipeline start_date {start_date} is not in calendar.\n"
            "Latest date before start_date is {before}.\n"
            "Earliest date after start_date is {after}."
        )
        raise ValueError(
            template.format(
                start_date=start_date,
                before=before,
                after=after,
            )
        )

    # Every date up to and including the current starting position is a
    # possible place to begin computing.
    # TODO: Consider bounding this below by self.window_length
    candidates = all_dates[:current_start_pos + 1]

    # Among those, take the most recent one that begins a new period at
    # our frequency.
    choices = select_sampling_indices(candidates, self._frequency)
    latest_choice = choices[-1]
    new_start_date = candidates[latest_choice]

    # The gap between the old and new start positions is the number of
    # additional rows needed on top of the caller's minimum.
    new_start_pos = all_dates.get_loc(new_start_date)
    assert new_start_pos <= current_start_pos, \
        "Computed negative extra rows!"

    additional_rows = current_start_pos - new_start_pos
    return min_extra_rows + additional_rows
def compute_extra_rows(self, all_dates, start_date, end_date, min_extra_rows):
    """
    Ensure that min_extra_rows pushes us back to a computation date.

    Parameters
    ----------
    all_dates : pd.DatetimeIndex
        The trading sessions against which ``self`` will be computed.
    start_date : pd.Timestamp
        The first date for which final output is requested.
    end_date : pd.Timestamp
        The last date for which final output is requested.  Not read by
        this method.
    min_extra_rows : int
        The minimum number of extra rows required of ``self``, as
        determined by other terms that depend on ``self``.

    Returns
    -------
    extra_rows : int
        The number of extra rows to compute.  This will be the minimum
        number of rows required to make our computed start_date fall on a
        recomputation date.

    Raises
    ------
    NoFurtherDataError
        If stepping back ``min_extra_rows`` from ``start_date`` would land
        before the first entry of ``all_dates``.
    ValueError
        If looking up ``start_date`` in ``all_dates`` raises ``KeyError``.
    """
    try:
        current_start_pos = all_dates.get_loc(start_date) - min_extra_rows
        out_of_data = current_start_pos < 0
        if out_of_data:
            lookback_error = NoFurtherDataError.from_lookback_window(
                initial_message="Insufficient data to compute Pipeline:",
                first_date=all_dates[0],
                lookback_start=start_date,
                lookback_length=min_extra_rows,
            )
            raise lookback_error
    except KeyError:
        neighbors = nearest_unequal_elements(all_dates, start_date)
        before, after = neighbors
        raise ValueError(
            "Pipeline start_date {start_date} is not in calendar.\n"
            "Latest date before start_date is {before}.\n"
            "Earliest date after start_date is {after}.".format(
                start_date=start_date,
                before=before,
                after=after,
            ))

    # Candidate start dates: everything on or before the current starting
    # position.
    # TODO: Consider bounding this below by self.window_length
    stop = current_start_pos + 1
    candidates = all_dates[:stop]

    # The last sampling index marks the first date of the period that
    # contains the current start position; begin there.
    choices = select_sampling_indices(candidates, self._frequency)
    new_start_date = candidates[choices[-1]]

    # Widen the caller's minimum by however far the start moved back.
    new_start_pos = all_dates.get_loc(new_start_date)
    assert new_start_pos <= current_start_pos, \
        "Computed negative extra rows!"

    moved_back_by = current_start_pos - new_start_pos
    return min_extra_rows + moved_back_by
def _compute_root_mask(self, domain, start_date, end_date, extra_rows): """ Compute a lifetimes matrix from our AssetFinder, then drop columns that didn't exist at all during the query dates. Parameters ---------- domain : zipline.pipeline.domain.Domain Domain for which we're computing a pipeline. start_date : pd.Timestamp Base start date for the matrix. end_date : pd.Timestamp End date for the matrix. extra_rows : int Number of extra rows to compute before `start_date`. Extra rows are needed by terms like moving averages that require a trailing window of data. Returns ------- lifetimes : pd.DataFrame Frame of dtype `bool` containing dates from `extra_rows` days before `start_date`, continuing through to `end_date`. The returned frame contains as columns all assets in our AssetFinder that existed for at least one day between `start_date` and `end_date`. """ sessions = domain.all_sessions() if start_date not in sessions: raise ValueError( "Pipeline start date ({}) is not a trading session for " "domain {}.".format(start_date, domain) ) elif end_date not in sessions: raise ValueError( "Pipeline end date {} is not a trading session for " "domain {}.".format(end_date, domain) ) start_idx, end_idx = sessions.slice_locs(start_date, end_date) if start_idx < extra_rows: raise NoFurtherDataError.from_lookback_window( initial_message="Insufficient data to compute Pipeline:", first_date=sessions[0], lookback_start=start_date, lookback_length=extra_rows, ) # NOTE: This logic should probably be delegated to the domain once we # start adding more complex domains. 
# # Build lifetimes matrix reaching back to `extra_rows` days before # `start_date.` finder = self._finder lifetimes = finder.lifetimes( sessions[start_idx - extra_rows:end_idx], include_start_date=False, country_codes=(domain.country_code,), ) if not lifetimes.columns.unique: columns = lifetimes.columns duplicated = columns[columns.duplicated()].unique() raise AssertionError("Duplicated sids: %d" % duplicated) # Filter out columns that didn't exist from the farthest look back # window through the end of the requested dates. existed = lifetimes.any() ret = lifetimes.loc[:, existed] num_assets = ret.shape[1] if num_assets == 0: raise ValueError( "Failed to find any assets with country_code {!r} that traded " "between {} and {}.\n" "This probably means that your asset db is old or that it has " "incorrect country/exchange metadata.".format( domain.country_code, start_date, end_date, ) ) return ret