Example #1
    def calibrate(
        self,
        data: Union[np.ndarray, pd.DataFrame],
        response: Union[np.ndarray, pd.Series],
        alpha: Union[int, float] = 0.95,
    ) -> None:
        """Method to calibrate conformal intervals that will be applied
        to new instances when calling predict_with_interval.

        Checks the type of the data argument and then calls the parent
        class calibrate method.

        Parameters
        ----------
        data : np.ndarray or pd.DataFrame
            Dataset to calibrate baselines on.

        response : np.ndarray or pd.Series
            The associated response values for every record in data.

        alpha : int or float, default = 0.95
            Confidence level for the interval.

        """

        check_type(data, [np.ndarray, pd.DataFrame], "data")

        super().calibrate(data=data, alpha=alpha, response=response)
Example #2
    def test_non_list_exception(self):
        """Test an exception is raised if expected_types is not a list."""

        with pytest.raises(TypeError,
                           match=re.escape("expected_types must be a list")):

            checks.check_type(1, 1, "1")
Example #3
    def calibrate(
        self,
        data: Any,
        response: Union[np.ndarray, pd.Series],
        alpha: Union[int, float] = 0.95,
    ) -> None:
        """Method to calibrate conformal intervals that will be applied
        to new instances when calling predict_with_interval.

        Method calls _calibrate_interval to set the default (fixed width)
        interval.

        Parameters
        ----------
        data : Any
            Dataset to calibrate baselines on.

        response : np.ndarray or pd.Series
            The associated response values for every record in data.

        alpha : int or float, default = 0.95
            Confidence level for the interval.

        """

        check_type(alpha, [int, float], "alpha")
        check_type(response, [np.ndarray, pd.Series], "response")

        if not 0 <= alpha <= 1:

            raise ValueError("alpha must be in range [0, 1]")

        self._calibrate_interval(data=data, alpha=alpha, response=response)
Example #4
    def _generate_leaf_node_predictions(self, data: xgb.DMatrix) -> np.ndarray:
        """Method to generate leaf node predictions from the xgboost model.

        Method calls xgb.Booster.predict with pred_leaf = True and
        ntree_limit = model's best_iteration + 1.

        If the output of predict is not a 2d matrix the output is shaped to
        be 2d.

        Parameters
        ----------
        data : xgb.DMatrix
            Data to generate predictions on.

        """

        check_type(data, [xgb.DMatrix], "data")

        # matrix of (nsample, ntrees) with each record giving
        # the leaf node of each sample in each tree
        leaf_node_predictions = self.model.predict(
            data=data,
            pred_leaf=True,
            ntree_limit=self.model.best_iteration + 1)

        # if the output is a 1d array reshape it to be a 2d array
        # with a single column
        if len(leaf_node_predictions.shape) == 1:

            leaf_node_predictions = leaf_node_predictions.reshape(
                (data.num_row(), 1))

        return leaf_node_predictions
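
For context, a minimal sketch of the pred_leaf behaviour relied on above, using an invented toy model rather than anything from the source:

    import numpy as np
    import xgboost as xgb

    # toy data and model, purely illustrative
    X = np.random.rand(5, 3)
    dtrain = xgb.DMatrix(X, label=np.random.rand(5))
    booster = xgb.train({"objective": "reg:squarederror"}, dtrain,
                        num_boost_round=3)

    # with pred_leaf=True, predict returns the index of the leaf each
    # sample lands in, one column per tree
    leaves = booster.predict(dtrain, pred_leaf=True)
    print(leaves.shape)  # (5, 3): one leaf index per sample per tree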
Example #5
    def test_type_exception_raised(self, obj, expected_types, obj_name,
                                   exception_text):
        """Test an exception is raised if obj is not of the correct type(s)."""

        with pytest.raises(TypeError, match=re.escape(exception_text)):

            checks.check_type(obj, expected_types, obj_name)
Example #6
    def _generate_leaf_node_predictions(
            self, data: Union[np.ndarray, pd.DataFrame]) -> np.ndarray:
        """Method to generate leaf node predictions from the xgboost model.

        Method calls the underlying model's apply method with ntree_limit =
        model's best_iteration + 1.

        If the output of apply is not a 2d matrix the output is shaped to
        be 2d.

        Parameters
        ----------
        data : np.ndarray or pd.DataFrame
            Data to generate predictions on.

        """

        check_type(data, [np.ndarray, pd.DataFrame], "data")

        # matrix of (nsample, ntrees) with each record giving
        # the leaf node of each sample in each tree
        leaf_node_predictions = self.model.apply(
            X=data, ntree_limit=self.model.best_iteration + 1)

        # if the output is a 1d array reshape it to be a 2d array
        # with a single column
        if len(leaf_node_predictions.shape) == 1:

            leaf_node_predictions = leaf_node_predictions.reshape(
                (data.shape[0], 1))

        return leaf_node_predictions
Example #7
    def test_non_type_exception(self):
        """Test an exception is raised if not all of expected_types elements are types."""

        with pytest.raises(
                TypeError,
                match=re.escape(
                    "all elements in expected_types must be types")):

            checks.check_type(1, [int, 1], "1")
Example #8
def check_response_within_interval(
    response: Union[np.ndarray, pd.Series],
    lower_interval: Optional[Union[np.ndarray, pd.Series]] = None,
    upper_interval: Optional[Union[np.ndarray, pd.Series]] = None,
    intervals_with_predictions: Optional[np.ndarray] = None,
) -> pd.Series:
    """Function to check the number of times a response lies within
    a prediction interval.

    Either both lower_interval and upper_interval or intervals_with_predictions
    must be specified.

    The function returns the proportion of the response that lies between
    the intervals.

    Parameters
    ----------
    response : np.ndarray, pd.Series
        Response or actual values corresponding to each row in the passed
        intervals.

    lower_interval : np.ndarray, pd.Series or None, default = None
        Lower intervals, if None then lower interval will be taken from the
        first column in intervals_with_predictions.

    upper_interval : np.ndarray, pd.Series or None, default = None
        Upper intervals, if None then upper interval will be taken from the
        third column in intervals_with_predictions.

    intervals_with_predictions : np.ndarray or None, default = None
        Lower intervals and upper intervals combined in a single np array.
        The array must have 3 columns. The lower interval is assumed to be
        the first column and the upper interval is assumed to be the third
        column.

    Returns
    -------
    results : pd.Series
        Proportion of response values that fall inside (True) and outside
        (False) the intervals.

    """

    lower_interval, upper_interval = gather_intervals(
        lower_interval=lower_interval,
        upper_interval=upper_interval,
        intervals_with_predictions=intervals_with_predictions,
    )

    check_type(response, [np.ndarray, pd.Series], "response")

    if not response.shape[0] == lower_interval.shape[0]:
        raise ValueError(
            "response and intervals have different numbers of rows")

    response_within_interval = (response >= lower_interval) & (response <=
                                                               upper_interval)

    results = pd.Series(
        response_within_interval).value_counts() / response.shape[0]

    return results
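
A toy usage sketch (values invented here) showing the proportions the function returns:

    import numpy as np

    response = np.array([1.0, 2.0, 3.0, 4.0])
    lower = np.array([0.5, 2.5, 2.0, 3.5])
    upper = np.array([1.5, 3.5, 4.0, 4.5])

    # 3 of the 4 response values fall inside their intervals
    coverage = check_response_within_interval(
        response=response, lower_interval=lower, upper_interval=upper)
    # True     0.75
    # False    0.25
    # dtype: float64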
Example #9
    def __init__(self, model: xgb.Booster) -> None:

        check_type(model, [xgb.Booster], "model")

        self.SUPPORTED_OBJECTIVES = SUPPORTED_OBJECTIVES_ABS_ERROR

        check_objective_supported(model, self.SUPPORTED_OBJECTIVES)

        self.model = model

        super().__init__()
Example #10
def create_interval_buckets(intervals_with_predictions: pd.DataFrame,
                            cut_function: str = "qcut",
                            **kwargs) -> pd.DataFrame:
    """Function to create a new column in a DataFrame that buckets all rows
    on the widthof the intervals in the DataFrame.

    Parameters
    ----------
    intervals_with_predictions : pd.DataFrame
        Data to add the interval width bucket column to. Must have columns
        called "upper" and "lower" that give the limits of the intervals
        for each row.

    cut_function : str, default = "qcut"
        Type of bucketing to use, must be either "cut" or "qcut". Decides
        which pandas cut function is used.

    **kwargs : any
        Arbitrary keyword arguments to pass on to the selected pandas cut
        function.

    Returns
    -------
    intervals_with_predictions : pd.DataFrame
        Input data with a new column called "interval_width_bucket" that
        splits the data on the width of the intervals in the data (defined
        by the "lower" and "upper" columns).

    """

    check_type(intervals_with_predictions, [pd.DataFrame],
               "intervals_with_predictions")

    check_type(cut_function, [str], "cut_function")

    if cut_function not in ["qcut", "cut"]:

        raise ValueError("cut_function must be either qcut or cut")

    interval_width = (intervals_with_predictions["upper"] -
                      intervals_with_predictions["lower"])

    if cut_function == "qcut":

        intervals_with_predictions["interval_width_bucket"] = pd.qcut(
            x=interval_width, **kwargs)

    else:

        intervals_with_predictions["interval_width_bucket"] = pd.cut(
            x=interval_width, **kwargs)

    return intervals_with_predictions
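
A short usage sketch with invented values; keyword arguments such as q are forwarded to pd.qcut:

    import pandas as pd

    df = pd.DataFrame({
        "lower": [0.0, 1.0, 2.0, 3.0],
        "upper": [1.0, 3.0, 6.0, 10.0],
    })

    # interval widths are 1, 2, 4 and 7; q=2 splits them into two
    # equally sized quantile buckets
    bucketed = create_interval_buckets(df, cut_function="qcut", q=2)
    print(bucketed["interval_width_bucket"].nunique())  # 2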
Example #11
    def __init__(self, model: Union[xgb.XGBRegressor,
                                    xgb.XGBClassifier]) -> None:

        check_type(model, [xgb.XGBRegressor, xgb.XGBClassifier], "model")

        self.SUPPORTED_OBJECTIVES = SUPPORTED_OBJECTIVES_ABS_ERROR

        check_objective_supported(model.get_booster(),
                                  self.SUPPORTED_OBJECTIVES)

        self.model = model

        LeafNodeScaledConformalPredictor.__init__(self)
Example #12
    def __init__(self, model: Union[xgb.XGBRegressor,
                                    xgb.XGBClassifier]) -> None:

        super().__init__()

        check_type(model, [xgb.XGBRegressor, xgb.XGBClassifier], "model")

        self.SUPPORTED_OBJECTIVES = SUPPORTED_OBJECTIVES_ABS_ERROR

        check_objective_supported(model.get_booster(),
                                  self.SUPPORTED_OBJECTIVES)

        self.model = model
Example #13
def prepare_prediction_interval_df(
        intervals_with_predictions: np.ndarray,
        response: Union[np.ndarray, pd.Series]) -> pd.DataFrame:
    """Put response column and n x 3 array into a pd.DataFrame with columns;
    "lower", "prediction", "upper" and "response".

    Parameters
    ----------
    intervals_with_predictions : np.ndarray
        n by 3 array containing lower interval values, predictions and upper
        interval values. These will be added to the output in columns;
        "lower", "prediction" and "upper".

    response : pd.Series or np.ndarray
        Response column to be added to output, in "response" column. Must have
        the same number of rows as intervals_with_predictions.

    Returns
    -------
    df : pd.DataFrame
        4 column pd.DataFrame containing values passed in intervals_with_predictions
        and response with columns; "lower", "prediction", "upper" and "response".

    """

    check_type(intervals_with_predictions, [np.ndarray],
               "intervals_with_predictions")

    check_type(response, [np.ndarray, pd.Series], "response")

    if intervals_with_predictions.shape[1] != 3:

        raise ValueError("intervals_with_predictions must have 3 columns")

    if intervals_with_predictions.shape[0] != response.shape[0]:

        raise ValueError(
            "intervals_with_predictions and response have different numbers of rows"
        )

    df = pd.DataFrame(intervals_with_predictions,
                      columns=["lower", "prediction", "upper"])

    if isinstance(response, pd.Series):

        df["response"] = response.values

    else:

        df["response"] = response

    return df
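
A minimal usage sketch with invented values:

    import numpy as np
    import pandas as pd

    intervals = np.array([
        [0.5, 1.0, 1.5],
        [1.5, 2.0, 2.5],
    ])
    response = pd.Series([1.1, 2.4])

    df = prepare_prediction_interval_df(intervals, response)
    #    lower  prediction  upper  response
    # 0    0.5         1.0    1.5       1.1
    # 1    1.5         2.0    2.5       2.4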
Example #14
    def predict_with_interval(
            self, data: Union[np.ndarray, pd.DataFrame]) -> np.ndarray:
        """Method to generate predictions on data with conformal intervals.

        This method runs the underlying model's predict method twice, once to
        generate predictions and once to produce the leaf node indexes.

        Each prediction is produced with an associated conformal interval.
        The default interval is of a fixed width and this is scaled
        differently for each row. Scaling is done, for a given row, by
        counting the number of times each leaf node, visited to make the
        prediction, was visited in the calibration dataset. The counts of
        leaf node visits in the calibration data are set by the
        _calibrate_leaf_node_counts method.

        The scaling factors, generated by _calculate_scaling_factors, are
        multiplied by the baseline_interval value. The scaled nonconformity
        function implements the inverse and divides the absolute error
        by the scaling factors.

        Parameters
        ----------
        data : np.ndarray or pd.DataFrame
            Data to generate predictions with conformal intervals on.

        Returns
        -------
        predictions_with_interval : np.ndarray
            Array of predictions with intervals for each row in data.
            Output array will have 3 columns where the first is the
            lower interval, second are the predictions and the third
            is the upper interval.

        """

        check_type(data, [np.ndarray, pd.DataFrame], "data")

        check_attribute(
            self,
            "leaf_node_counts",
            "XGBSklearnLeafNodeScaledConformalPredictor does not have leaf_node_counts"
            " attribute, run calibrate first.",
        )

        predictions_with_interval = super().predict_with_interval(data=data)

        return predictions_with_interval
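
The exact formula lives in _calculate_scaling_factors, but the counting idea described above can be sketched with invented numbers, assuming the scaling factor is the reciprocal of the total calibration visit count:

    # illustrative numbers only; frequently visited (well supported)
    # leaves give a smaller scaling factor and so a narrower interval
    leaf_node_counts = [{1: 50, 2: 10}, {3: 40, 4: 20}]  # visits per tree
    row_leaves = [1, 4]  # leaf this row reaches in each tree

    total_visits = sum(
        tree_counts[leaf]
        for tree_counts, leaf in zip(leaf_node_counts, row_leaves)
    )  # 50 + 20 = 70

    baseline_interval = 2.0
    half_width = baseline_interval * (1 / total_visits)  # 2 / 70 ~= 0.029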
Example #15
    def _generate_predictions(self, data: xgb.DMatrix) -> np.ndarray:
        """Method to generate predictions from the xgboost model.

        Calls predict method on the model attribute with
        ntree_limit = model's best_iteration + 1.

        Parameters
        ----------
        data : xgb.DMatrix
            Data to generate predictions on.

        """

        check_type(data, [xgb.DMatrix], "data")

        predictions = self.model.predict(
            data, ntree_limit=self.model.best_iteration + 1)

        return predictions
Example #16
    def calibrate(
        self,
        data: xgb.DMatrix,
        response: Optional[Union[np.ndarray, pd.Series]] = None,
        alpha: Union[int, float] = 0.95,
    ) -> None:
        """Method to calibrate conformal intervals that will allow
        prediction intervals that vary by row.

        Method calls _calibrate_leaf_node_counts to record the number
        of times each leaf node is visited across the whole of the
        passed data.

        Method calls _calibrate_interval to set the default interval that
        will be scaled using the inverse of the nonconformity function
        when making predictions. This allows intervals to vary by instance.

        Parameters
        ----------
        data : xgb.DMatrix
            Dataset to use to set baselines.

        alpha : int or float, default = 0.95
            Confidence level for the interval.

        response : np.ndarray, pd.Series or None, default = None
            The response values for the records in data. If passed as
            None then the _calibrate_interval function will attempt to extract
            the response from the data argument with get_label.

        """

        check_type(data, [xgb.DMatrix], "data")

        if response is None:

            # only to stop mypy complaining about get_label method
            data = cast(xgb.DMatrix, data)

            response = data.get_label()

        super().calibrate(data=data, response=response, alpha=alpha)
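
A usage sketch (the data and the predictor instance are invented here) showing the get_label fallback:

    import numpy as np
    import xgboost as xgb

    # hypothetical calibration data; storing the response as the DMatrix
    # label lets calibrate recover it via get_label
    X_cal = np.random.rand(100, 4)
    y_cal = np.random.rand(100)
    dmatrix = xgb.DMatrix(X_cal, label=y_cal)

    # predictor is assumed to be an instance of this class, wrapping a
    # fitted xgb.Booster
    predictor.calibrate(data=dmatrix, alpha=0.9)  # response read from label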
Example #17
def gather_intervals(
    lower_interval: Optional[Union[np.ndarray, pd.Series]] = None,
    upper_interval: Optional[Union[np.ndarray, pd.Series]] = None,
    intervals_with_predictions: Optional[np.ndarray] = None,
) -> Tuple[Union[np.ndarray, pd.Series], Union[np.ndarray, pd.Series]]:
    """Function to perform checks on passed intervals and return lower and upper
    intervals separately if they are passed combined in intervals_with_predictions.
    """

    if intervals_with_predictions is None and (lower_interval is None
                                               or upper_interval is None):

        raise ValueError(
            "either lower_interval and upper_interval or intervals_with_predictions "
            "must be specified but both are None")

    if intervals_with_predictions is not None and (
            lower_interval is not None or upper_interval is not None):

        raise ValueError(
            "either lower_interval and upper_interval or intervals_with_predictions "
            "must be specified but both are specified")

    # if intervals_with_predictions is passed, split out the first and third columns
    # into lower_interval and upper_interval
    if intervals_with_predictions is not None:

        check_type(intervals_with_predictions, [np.ndarray],
                   "intervals_with_predictions")

        if not intervals_with_predictions.shape[1] == 3:
            raise ValueError(
                "expecting intervals_with_predictions to have 3 columns")

        lower_interval_return = intervals_with_predictions[:, 0]
        upper_interval_return = intervals_with_predictions[:, 2]

    else:

        lower_interval_return = lower_interval
        upper_interval_return = upper_interval

    check_type(lower_interval_return, [np.ndarray, pd.Series],
               "lower_interval_return")
    check_type(upper_interval_return, [np.ndarray, pd.Series],
               "upper_interval_return")

    if lower_interval_return.shape[0] != upper_interval_return.shape[0]:

        raise ValueError(
            "lower_interval_return and upper_interval_return have different shapes"
        )

    return lower_interval_return, upper_interval_return
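
A minimal usage sketch with invented values, passing the combined form:

    import numpy as np

    combined = np.array([
        [0.5, 1.0, 1.5],
        [1.5, 2.0, 2.5],
    ])

    # lower intervals come from the first column, upper from the third
    lower, upper = gather_intervals(intervals_with_predictions=combined)
    print(lower)  # [0.5 1.5]
    print(upper)  # [1.5 2.5]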
Example #18
    def calibrate(
        self,
        data: Any,
        response: Union[np.ndarray, pd.Series],
        alpha: Union[int, float] = 0.95,
    ) -> None:
        """Method to calibrate conformal intervals that will allow
        prediction intervals that vary by row.

        Method calls _calibrate_leaf_node_counts to record the number
        of times each leaf node is visited across the whole of the
        passed data.

        Method calls _calibrate_interval to set the default interval that
        will be scaled using the inverse of the nonconformity function
        when making predictions. This allows intervals to vary by instance.

        Parameters
        ----------
        data : Any
            Dataset to use to set baselines.

        response : np.ndarray or pd.Series
            The response values for the records in data.

        alpha : int or float, default = 0.95
            Confidence level for the interval.

        """

        check_type(response, [pd.Series, np.ndarray], "response")
        check_type(alpha, [int, float], "alpha")

        if not 0 <= alpha <= 1:

            raise ValueError("alpha must be in range [0, 1]")

        self._calibrate_leaf_node_counts(data=data)
        self._calibrate_interval(data=data, alpha=alpha, response=response)
Example #19
    def calibrate(
        self,
        data: Union[np.ndarray, pd.DataFrame],
        response: Union[np.ndarray, pd.Series],
        alpha: Union[int, float] = 0.95,
    ) -> None:
        """Method to calibrate conformal intervals that will allow
        prediction intervals that vary by row.

        Method calls the LeafNodeScaledConformalPredictor.calibrate
        method directly, skipping the parent class calibrate method.

        The grandparent calibrate method calls _calibrate_leaf_node_counts
        to record the number of times each leaf node is visited across
        the whole of the passed data.

        The grandparent calibrate method calls _calibrate_interval to set
        the default interval that will be scaled using the inverse of the
        nonconformity function when making predictions. This allows
        intervals to vary by instance.

        Parameters
        ----------
        data : np.ndarray or pd.DataFrame
            Dataset to use to set baselines.

        alpha : int or float, default = 0.95
            Confidence level for the interval.

        response : np.ndarray or pd.Series
            The response values for the records in data.

        """

        check_type(data, [np.ndarray, pd.DataFrame], "data")

        super().calibrate(data=data, response=response, alpha=alpha)
Example #20
def check_objective_supported(booster: xgb.Booster,
                              supported_objectives: List[str]) -> None:
    """Function to check that the booster objective parameter is in the
    supported_objectives list and raise an exception if not.
    """

    check_type(booster, [xgb.Booster], "booster")
    check_type(supported_objectives, [list], "supported_objectives")

    for i, objective in enumerate(supported_objectives):

        check_type(objective, [str], f"supported_objectives[{i}]")

    booster_config = json.loads(booster.save_config())

    booster_objective = booster_config["learner"]["objective"]["name"]

    check_allowed_value(booster_objective, supported_objectives,
                        "booster objective not supported")