Example #1
    def _compute_root_mask(self, start_date, end_date, extra_rows):
        """
        Compute a lifetimes matrix from our AssetFinder, then drop columns that
        didn't exist at all during the query dates.

        Parameters
        ----------
        start_date : pd.Timestamp
            Base start date for the matrix.
        end_date : pd.Timestamp
            End date for the matrix.
        extra_rows : int
            Number of extra rows to compute before `start_date`.
            Extra rows are needed by terms like moving averages that require a
            trailing window of data.

        Returns
        -------
        lifetimes : pd.DataFrame
            Frame of dtype `bool` containing dates from `extra_rows` days
            before `start_date`, continuing through to `end_date`.  The
            returned frame contains as columns all assets in our AssetFinder
            that existed for at least one day between `start_date` and
            `end_date`.
        """
        calendar = self._calendar
        finder = self._finder
        start_idx, end_idx = calendar.slice_locs(start_date, end_date)
        if start_idx < extra_rows:
            raise NoFurtherDataError.from_lookback_window(
                initial_message="Insufficient data to compute Pipeline:",
                first_date=calendar[0],
                lookback_start=start_date,
                lookback_length=extra_rows,
            )

        # Build lifetimes matrix reaching back to `extra_rows` days before
        # `start_date`.
        lifetimes = finder.lifetimes(
            calendar[start_idx - extra_rows:end_idx],
            include_start_date=False
        )

        assert lifetimes.index[extra_rows] == start_date
        assert lifetimes.index[-1] == end_date
        if not lifetimes.columns.is_unique:
            columns = lifetimes.columns
            duplicated = columns[columns.duplicated()].unique()
            raise AssertionError("Duplicated sids: %s" % duplicated)

        # Filter out columns that didn't exist between the requested start and
        # end dates.
        existed = lifetimes.iloc[extra_rows:].any()
        ret = lifetimes.loc[:, existed]
        shape = ret.shape
        assert shape[0] * shape[1] != 0, 'root mask cannot be empty'
        return ret
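
The filtering at the end is the part worth internalizing: the first `extra_rows` rows exist only to warm up trailing windows, so whether a column survives is decided solely by the rows at and after `start_date`. A minimal sketch of just that step in plain pandas, using a toy calendar and made-up sids in place of real AssetFinder output:

import pandas as pd

calendar = pd.date_range("2014-01-01", periods=6, freq="B")
extra_rows = 2  # warm-up rows before start_date

# Toy lifetimes matrix: True where a sid existed on a given session.
lifetimes = pd.DataFrame(
    {
        1: [True, True, False, False, False, False],   # gone before start_date
        2: [False, True, True, True, True, True],      # alive in query window
    },
    index=calendar,
)

# A sid survives only if it existed on at least one session at or after
# row `extra_rows`, i.e. at or after start_date.
existed = lifetimes.iloc[extra_rows:].any()
root_mask = lifetimes.loc[:, existed]
print(list(root_mask.columns))  # [2] -- sid 1 is dropped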
Example #2
from zipline.errors import NoFurtherDataError


def _shift_dates(dates, start_date, end_date, shift):
    """
    Shift `start_date` and `end_date` back by `shift` sessions on the
    `dates` calendar.
    """
    try:
        start = dates.get_loc(start_date)
    except KeyError:
        if start_date < dates[0]:
            raise NoFurtherDataError(
                msg=(
                    "Pipeline Query requested data starting on {query_start}, "
                    "but first known date is {calendar_start}"
                ).format(
                    query_start=str(start_date),
                    calendar_start=str(dates[0]),
                )
            )
        else:
            raise ValueError("Query start %s not in calendar" % start_date)

    # Make sure that shifting doesn't push us out of the calendar.
    if start < shift:
        raise NoFurtherDataError(
            msg=(
                "Pipeline Query requested data from {shift}"
                " days before {query_start}, but first known date is only "
                "{start} days earlier."
            ).format(shift=shift, query_start=start_date, start=start),
        )

    try:
        end = dates.get_loc(end_date)
    except KeyError:
        if end_date > dates[-1]:
            raise NoFurtherDataError(
                msg=(
                    "Pipeline Query requesting data up to {query_end}, "
                    "but last known date is {calendar_end}"
                ).format(
                    query_end=end_date,
                    calendar_end=dates[-1],
                )
            )
        else:
            raise ValueError("Query end %s not in calendar" % end_date)
    return dates[start - shift], dates[end - shift]
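
Away from the error paths, the function reduces to positional arithmetic on the calendar: resolve both endpoints to integer locations, then step each one back by `shift` sessions. A toy illustration of that arithmetic (made-up dates, error handling omitted):

import pandas as pd

dates = pd.date_range("2014-01-01", periods=10, freq="B")
start_date, end_date = dates[4], dates[7]
shift = 2

# Resolve calendar positions, then step both endpoints back by `shift`.
start = dates.get_loc(start_date)
end = dates.get_loc(end_date)
shifted_start, shifted_end = dates[start - shift], dates[end - shift]
print(shifted_start == dates[2], shifted_end == dates[5])  # True True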
Example #3
    def compute_extra_rows(self, all_dates, start_date, end_date,
                           min_extra_rows):
        """
        Ensure that min_extra_rows pushes us back to a computation date.

        Parameters
        ----------
        all_dates : pd.DatetimeIndex
            The trading sessions against which ``self`` will be computed.
        start_date : pd.Timestamp
            The first date for which final output is requested.
        end_date : pd.Timestamp
            The last date for which final output is requested.
        min_extra_rows : int
            The minimum number of extra rows required of ``self``, as
            determined by other terms that depend on ``self``.

        Returns
        -------
        extra_rows : int
            The number of extra rows to compute.  This will be the minimum
            number of rows required to make our computed start_date fall on a
            recomputation date.
        """
        try:
            current_start_pos = all_dates.get_loc(start_date) - min_extra_rows
            if current_start_pos < 0:
                raise NoFurtherDataError.from_lookback_window(
                    initial_message="Insufficient data to compute Pipeline:",
                    first_date=all_dates[0],
                    lookback_start=start_date,
                    lookback_length=min_extra_rows,
                )
        except KeyError:
            before, after = nearest_unequal_elements(all_dates, start_date)
            raise ValueError(
                "Pipeline start_date {start_date} is not in calendar.\n"
                "Latest date before start_date is {before}.\n"
                "Earliest date after start_date is {after}.".format(
                    start_date=start_date,
                    before=before,
                    after=after,
                ))

        # Our possible target dates are all the dates on or before the current
        # starting position.
        # TODO: Consider bounding this below by self.window_length
        candidates = all_dates[:current_start_pos + 1]

        # Choose the latest date in the candidates that is the start of a new
        # period at our frequency.
        choices = select_sampling_indices(candidates, self._frequency)

        # If we have choices, the last choice is the first date of the
        # period containing the current start date.  Choose it.
        new_start_date = candidates[choices[-1]]

        # Add the difference between the new and old start dates to get the
        # number of rows for the new start_date.
        new_start_pos = all_dates.get_loc(new_start_date)
        assert new_start_pos <= current_start_pos, \
            "Computed negative extra rows!"

        return min_extra_rows + (current_start_pos - new_start_pos)
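
The arithmetic is easier to see with numbers: if `start_date` resolves to position 30 and `min_extra_rows` is 5, then `current_start_pos` is 25; if the latest recomputation date at or before that position sits at position 21, the method returns 5 + (25 - 21) = 9. A sketch under one stated assumption: month starts stand in for `select_sampling_indices`, which is not the real helper:

import numpy as np
import pandas as pd

all_dates = pd.date_range("2014-01-02", periods=60, freq="B")
start_date = all_dates[30]
min_extra_rows = 5

current_start_pos = all_dates.get_loc(start_date) - min_extra_rows
candidates = all_dates[:current_start_pos + 1]

# Assumed stand-in for select_sampling_indices: positions in `candidates`
# where a new calendar month begins.
choices = np.flatnonzero(np.diff(candidates.month, prepend=-1) != 0)

# The last choice is the first session of the period containing the
# current start; extra rows grow by the distance back to it.
new_start_pos = choices[-1]
extra_rows = min_extra_rows + (current_start_pos - new_start_pos)
print(extra_rows)  # >= min_extra_rows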