def calibrate(
    self,
    data: Union[np.ndarray, pd.DataFrame],
    response: Union[np.ndarray, pd.Series],
    alpha: Union[int, float] = 0.95,
) -> None:
    """Calibrate the conformal intervals used by predict_with_interval.

    The type of data is validated here and every argument is then handed
    on, unchanged, to the parent class calibrate method.

    Parameters
    ----------
    data : np.ndarray or pd.DataFrame
        Dataset to calibrate baselines on.

    response : np.ndarray or pd.Series
        The associated response values for every record in data.

    alpha : int or float, default = 0.95
        Confidence level for the interval.

    """
    check_type(data, [np.ndarray, pd.DataFrame], "data")

    # response and alpha are not checked here, they are passed straight on
    # to the parent implementation
    super().calibrate(data=data, response=response, alpha=alpha)
def test_non_list_exception(self):
    """Check a TypeError is raised when expected_types is not a list."""
    # escape the message so it is matched literally rather than as a regex
    expected_message = re.escape("expected_types must be a list")

    with pytest.raises(TypeError, match=expected_message):
        checks.check_type(1, 1, "1")
def calibrate(
    self,
    data: Any,
    response: Union[np.ndarray, pd.Series],
    alpha: Union[int, float] = 0.95,
) -> None:
    """Calibrate the conformal intervals used by predict_with_interval.

    After validating alpha and response the work is delegated to
    _calibrate_interval, which sets the default (fixed width) interval.

    Parameters
    ----------
    data : any
        Dataset to calibrate baselines on. Not type checked in this method.

    response : np.ndarray or pd.Series
        The associated response values for every record in data.

    alpha : int or float, default = 0.95
        Confidence level for the interval. Must lie in the range [0, 1].

    Raises
    ------
    ValueError
        If alpha is outside of the range [0, 1].

    """
    check_type(alpha, [int, float], "alpha")
    check_type(response, [np.ndarray, pd.Series], "response")

    alpha_in_range = 0 <= alpha <= 1
    if not alpha_in_range:
        raise ValueError("alpha must be in range [0 ,1]")

    self._calibrate_interval(data=data, alpha=alpha, response=response)
def _generate_leaf_node_predictions(self, data: xgb.DMatrix) -> np.ndarray:
    """Generate leaf node predictions from the xgboost model.

    Calls the predict method of the model attribute with pred_leaf = True
    and ntree_limit = model's best_iteration + 1. A 1d result is reshaped
    into a single column 2d array before being returned.

    Parameters
    ----------
    data : xgb.DMatrix
        Data to generate predictions on.

    Returns
    -------
    leaf_node_predictions : np.ndarray
        Output of the model's predict call, reshaped to
        (data.num_row(), 1) if predict returned a 1d array.

    """
    check_type(data, [xgb.DMatrix], "data")

    tree_limit = self.model.best_iteration + 1

    # each record gives the leaf node that sample landed in, per tree
    leaf_nodes = self.model.predict(
        data=data,
        pred_leaf=True,
        ntree_limit=tree_limit,
    )

    if leaf_nodes.ndim != 1:
        return leaf_nodes

    # 1d output is turned into a single column so callers always get 2d
    return leaf_nodes.reshape((data.num_row(), 1))
def test_type_exception_raised(self, obj, expected_types, obj_name,
                               exception_text):
    """Check a TypeError is raised when obj is not one of expected_types."""
    # escape the message so it is matched literally rather than as a regex
    expected_message = re.escape(exception_text)

    with pytest.raises(TypeError, match=expected_message):
        checks.check_type(obj, expected_types, obj_name)
def _generate_leaf_node_predictions(
        self, data: Union[np.ndarray, pd.DataFrame]) -> np.ndarray:
    """Generate leaf node predictions from the xgboost model.

    Calls the apply method of the model attribute with
    ntree_limit = model's best_iteration + 1. A 1d result is reshaped
    into a single column 2d array before being returned.

    Parameters
    ----------
    data : np.ndarray or pd.DataFrame
        Data to generate predictions on.

    Returns
    -------
    leaf_node_predictions : np.ndarray
        Output of the model's apply call, reshaped to (data.shape[0], 1)
        if apply returned a 1d array.

    """
    check_type(data, [np.ndarray, pd.DataFrame], "data")

    tree_limit = self.model.best_iteration + 1

    # each record gives the leaf node that sample landed in, per tree
    leaf_nodes = self.model.apply(X=data, ntree_limit=tree_limit)

    if leaf_nodes.ndim != 1:
        return leaf_nodes

    # 1d output is turned into a single column so callers always get 2d
    return leaf_nodes.reshape((data.shape[0], 1))
def test_non_type_exception(self):
    """Check a TypeError is raised when expected_types holds a non-type."""
    # escape the message so it is matched literally rather than as a regex
    expected_message = re.escape(
        "all elements in expected_types must be types")

    with pytest.raises(TypeError, match=expected_message):
        checks.check_type(1, [int, 1], "1")
def check_response_within_interval(
    response: Union[np.ndarray, pd.Series],
    lower_interval: Optional[Union[np.ndarray, pd.Series]] = None,
    upper_interval: Optional[Union[np.ndarray, pd.Series]] = None,
    intervals_with_predictions: Optional[np.ndarray] = None,
) -> pd.Series:
    """Calculate how often a response lies within a prediction interval.

    The intervals are resolved with gather_intervals, so either both
    lower_interval and upper_interval, or intervals_with_predictions,
    must be specified. A row counts as within the interval when
    lower <= response <= upper.

    Parameters
    ----------
    response : np.ndarray or pd.Series
        Response or actual values corresponding to each row in the passed
        intervals.

    lower_interval : np.ndarray, pd.Series or None, default = None
        Lower intervals, if None then the lower interval is taken from the
        first column of intervals_with_predictions.

    upper_interval : np.ndarray, pd.Series or None, default = None
        Upper intervals, if None then the upper interval is taken from the
        third column of intervals_with_predictions.

    intervals_with_predictions : np.ndarray or None, default = None
        Lower intervals and upper intervals combined in a single array.
        The array must have 3 columns; the first is used as the lower
        interval and the third as the upper interval.

    Returns
    -------
    results : pd.Series
        Value counts of the within-interval flag divided by the number of
        rows in response, i.e. the proportion of rows for each flag value
        that occurs.

    Raises
    ------
    ValueError
        If response and the intervals have different numbers of rows.

    """
    lower, upper = gather_intervals(
        lower_interval=lower_interval,
        upper_interval=upper_interval,
        intervals_with_predictions=intervals_with_predictions,
    )

    check_type(response, [np.ndarray, pd.Series], "response")

    n_rows = response.shape[0]

    if n_rows != lower.shape[0]:
        raise ValueError(
            "response and intervals have different numbers of rows")

    not_below_lower = response >= lower
    not_above_upper = response <= upper
    within_interval = not_below_lower & not_above_upper

    return pd.Series(within_interval).value_counts() / n_rows
def __init__(self, model: xgb.Booster) -> None:
    """Store the model after checking its type and objective.

    Parameters
    ----------
    model : xgb.Booster
        Model to wrap. Its objective is checked against
        SUPPORTED_OBJECTIVES_ABS_ERROR with check_objective_supported.

    """
    check_type(model, [xgb.Booster], "model")

    supported = SUPPORTED_OBJECTIVES_ABS_ERROR
    self.SUPPORTED_OBJECTIVES = supported

    check_objective_supported(model, supported)

    self.model = model

    # the parent initialiser runs only once the model attribute is set
    super().__init__()
def create_interval_buckets(intervals_with_predictions: pd.DataFrame,
                            cut_function: str = "qcut",
                            **kwargs) -> pd.DataFrame:
    """Add a column to a DataFrame that buckets rows on interval width.

    The width of each interval is calculated as "upper" minus "lower" and
    then bucketed with either pd.qcut or pd.cut. The new column is added
    to the passed DataFrame itself, which is also returned.

    Parameters
    ----------
    intervals_with_predictions : pd.DataFrame
        Data to add the bucket column to. Must have columns called "upper"
        and "lower" that give the limits of the interval for each row.

    cut_function : str, default = "qcut"
        Type of bucketing to use, must be either "cut" or "qcut". Decides
        which pandas cut function is used.

    **kwargs : any
        Arbitrary keyword arguments to pass onto the pandas cut function.

    Returns
    -------
    intervals_with_predictions : pd.DataFrame
        Input data with a new column called "interval_width_bucket" that
        splits the data on the width of the intervals.

    Raises
    ------
    ValueError
        If cut_function is not "qcut" or "cut".

    """
    check_type(intervals_with_predictions, [pd.DataFrame],
               "intervals_with_predictions")
    check_type(cut_function, [str], "cut_function")

    bucketing_functions = {"qcut": pd.qcut, "cut": pd.cut}

    if cut_function not in bucketing_functions:
        raise ValueError("cut_function must be either qcut or cut")

    widths = (intervals_with_predictions["upper"] -
              intervals_with_predictions["lower"])

    bucket_widths = bucketing_functions[cut_function]

    # note, this writes the new column onto the DataFrame that was passed in
    intervals_with_predictions["interval_width_bucket"] = bucket_widths(
        x=widths, **kwargs)

    return intervals_with_predictions
def __init__(self, model: Union[xgb.XGBRegressor,
                                xgb.XGBClassifier]) -> None:
    """Store the model after checking its type and objective.

    Parameters
    ----------
    model : xgb.XGBRegressor or xgb.XGBClassifier
        Model to wrap. The objective of the underlying booster (from
        get_booster) is checked against SUPPORTED_OBJECTIVES_ABS_ERROR
        with check_objective_supported.

    """
    check_type(model, [xgb.XGBRegressor, xgb.XGBClassifier], "model")

    supported = SUPPORTED_OBJECTIVES_ABS_ERROR
    self.SUPPORTED_OBJECTIVES = supported

    booster = model.get_booster()
    check_objective_supported(booster, supported)

    self.model = model

    # LeafNodeScaledConformalPredictor's initialiser is called explicitly
    # rather than going through super()
    LeafNodeScaledConformalPredictor.__init__(self)
def __init__(self, model: Union[xgb.XGBRegressor,
                                xgb.XGBClassifier]) -> None:
    """Store the model after checking its type and objective.

    The parent initialiser is run first, before any checks on model.

    Parameters
    ----------
    model : xgb.XGBRegressor or xgb.XGBClassifier
        Model to wrap. The objective of the underlying booster (from
        get_booster) is checked against SUPPORTED_OBJECTIVES_ABS_ERROR
        with check_objective_supported.

    """
    super().__init__()

    check_type(model, [xgb.XGBRegressor, xgb.XGBClassifier], "model")

    supported = SUPPORTED_OBJECTIVES_ABS_ERROR
    self.SUPPORTED_OBJECTIVES = supported

    booster = model.get_booster()
    check_objective_supported(booster, supported)

    self.model = model
def prepare_prediction_interval_df(
        intervals_with_predictions: np.ndarray,
        response: Union[np.ndarray, pd.Series]) -> pd.DataFrame:
    """Put response column and n x 3 array into a pd.DataFrame with columns;
    "lower", "prediction", "upper" and "response".

    Parameters
    ----------
    intervals_with_predictions : np.ndarray
        n by 3 array containing lower interval values, predictions and
        upper interval values. The columns will be added to output in
        columns; "lower", "prediction" and "upper".

    response : np.ndarray or pd.Series
        Response column to be added to output, in "response" column. Must
        have the same number of rows as intervals_with_predictions.

    Returns
    -------
    df : pd.DataFrame
        4 column pd.DataFrame containing values passed in
        intervals_with_predictions and response with columns; "lower",
        "prediction", "upper" and "response".

    Raises
    ------
    ValueError
        If intervals_with_predictions is not 2 dimensional with 3 columns,
        or if it has a different number of rows to response.

    """
    check_type(intervals_with_predictions, [np.ndarray],
               "intervals_with_predictions")
    check_type(response, [np.ndarray, pd.Series], "response")

    # checking ndim first means a 1d array gives the ValueError below
    # rather than an IndexError from shape[1]
    if (intervals_with_predictions.ndim != 2
            or intervals_with_predictions.shape[1] != 3):
        raise ValueError("intervals_with_predictions must have 3 columns")

    if intervals_with_predictions.shape[0] != response.shape[0]:
        raise ValueError(
            "intervals_with_predictions and response have different numbers of rows"
        )

    df = pd.DataFrame(intervals_with_predictions,
                      columns=["lower", "prediction", "upper"])

    # assigning a pd.Series aligns on index, which would introduce nulls if
    # the index of response is not 0..n-1, so assign the raw values instead.
    # isinstance is used so subclasses of pd.Series are handled the same way
    if isinstance(response, pd.Series):
        df["response"] = response.values
    else:
        df["response"] = response

    return df
def predict_with_interval(
        self, data: Union[np.ndarray, pd.DataFrame]) -> np.ndarray:
    """Generate predictions on data with conformal intervals.

    The type of data is validated and the object is checked for the
    leaf_node_counts attribute (i.e. that calibrate has been run) before
    the parent class predict_with_interval method is called to produce
    the predictions and intervals.

    Parameters
    ----------
    data : np.ndarray or pd.DataFrame
        Data to generate predictions with conformal intervals on.

    Returns
    -------
    predictions_with_interval : np.ndarray
        Array of predictions with intervals for each row in data. Output
        array will have 3 columns where the first is the lower interval,
        second are the predictions and the third is the upper interval.

    """
    check_type(data, [np.ndarray, pd.DataFrame], "data")

    not_calibrated_message = (
        "XGBSklearnLeafNodeScaledConformalPredictor does not have leaf_node_counts"
        " attribute, run calibrate first.")
    check_attribute(self, "leaf_node_counts", not_calibrated_message)

    return super().predict_with_interval(data=data)
def _generate_predictions(self, data: xgb.DMatrix) -> np.ndarray:
    """Generate predictions from the xgboost model.

    Calls the predict method of the model attribute with
    ntree_limit = model's best_iteration + 1.

    Parameters
    ----------
    data : xgb.DMatrix
        Data to generate predictions on.

    Returns
    -------
    predictions : np.ndarray
        Output of the model's predict call.

    """
    check_type(data, [xgb.DMatrix], "data")

    tree_limit = self.model.best_iteration + 1

    return self.model.predict(data, ntree_limit=tree_limit)
def calibrate(
    self,
    data: xgb.DMatrix,
    response: Optional[Union[np.ndarray, pd.Series]] = None,
    alpha: Union[int, float] = 0.95,
) -> None:
    """Calibrate conformal intervals that can vary by row.

    The type of data is validated, the response is taken from data with
    get_label if it was not passed, and the parent class calibrate method
    is then called to do the calibration.

    Parameters
    ----------
    data : xgb.DMatrix
        Dataset to use to set baselines.

    response : np.ndarray, pd.Series or None, default = None
        The response values for the records in data. If None, the labels
        stored on data (data.get_label()) are used instead.

    alpha : int or float, default = 0.95
        Confidence level for the interval.

    """
    check_type(data, [xgb.DMatrix], "data")

    if response is None:
        # the cast does nothing at runtime, it only stops mypy complaining
        # about the get_label method
        response = cast(xgb.DMatrix, data).get_label()

    super().calibrate(data=data, response=response, alpha=alpha)
def gather_intervals(
    lower_interval: Optional[Union[np.ndarray, pd.Series]] = None,
    upper_interval: Optional[Union[np.ndarray, pd.Series]] = None,
    intervals_with_predictions: Optional[np.ndarray] = None,
) -> Tuple[Union[np.ndarray, pd.Series], Union[np.ndarray, pd.Series]]:
    """Function to perform checks on passed intervals and return lower and
    upper intervals separately if they are passed combined in
    intervals_with_predictions.

    Exactly one of the two ways of passing intervals must be used; either
    both lower_interval and upper_interval, or intervals_with_predictions.

    Parameters
    ----------
    lower_interval : np.ndarray, pd.Series or None, default = None
        Lower intervals. Must be None if intervals_with_predictions is
        passed.

    upper_interval : np.ndarray, pd.Series or None, default = None
        Upper intervals. Must be None if intervals_with_predictions is
        passed.

    intervals_with_predictions : np.ndarray or None, default = None
        2d array with 3 columns. The first column is returned as the lower
        interval and the third column as the upper interval.

    Returns
    -------
    lower_interval_return : np.ndarray or pd.Series
        Lower intervals.

    upper_interval_return : np.ndarray or pd.Series
        Upper intervals.

    Raises
    ------
    ValueError
        If neither way of passing intervals is fully specified, if both
        are (partly) specified, if intervals_with_predictions is not 2
        dimensional with 3 columns, or if the lower and upper intervals
        have different numbers of rows.

    """
    if intervals_with_predictions is None:

        if lower_interval is None or upper_interval is None:
            raise ValueError(
                "either lower_interval and upper_interval or intervals_with_predictions must "
                "be specified but both are None")

        lower_interval_return = lower_interval
        upper_interval_return = upper_interval

    else:

        if lower_interval is not None or upper_interval is not None:
            raise ValueError(
                "either lower_interval and upper_interval or intervals_with_predictions must "
                "be specified but both are specified")

        check_type(intervals_with_predictions, [np.ndarray],
                   "intervals_with_predictions")

        # checking ndim first means a 1d array gives the ValueError below
        # rather than an IndexError from shape[1]
        if (intervals_with_predictions.ndim != 2
                or intervals_with_predictions.shape[1] != 3):
            raise ValueError(
                "expecting intervals_with_predictions to have 3 columns")

        # split out the first and third columns, the middle column holds
        # the predictions and is not needed here
        lower_interval_return = intervals_with_predictions[:, 0]
        upper_interval_return = intervals_with_predictions[:, 2]

    check_type(lower_interval_return, [np.ndarray, pd.Series],
               "lower_interval_return")
    check_type(upper_interval_return, [np.ndarray, pd.Series],
               "upper_interval_return")

    if lower_interval_return.shape[0] != upper_interval_return.shape[0]:
        raise ValueError(
            "lower_interval_return and upper_interval_return have different shapes"
        )

    return lower_interval_return, upper_interval_return
def calibrate(
    self,
    data: Any,
    response: Union[np.ndarray, pd.Series],
    alpha: Union[int, float] = 0.95,
) -> None:
    """Calibrate conformal intervals that can vary by row.

    After validating response and alpha, _calibrate_leaf_node_counts is
    called with data and then _calibrate_interval is called with data,
    alpha and response.

    Parameters
    ----------
    data : any
        Dataset to use to set baselines. Not type checked in this method.

    response : np.ndarray or pd.Series
        The response values for the records in data.

    alpha : int or float, default = 0.95
        Confidence level for the interval. Must lie in the range [0, 1].

    Raises
    ------
    ValueError
        If alpha is outside of the range [0, 1].

    """
    check_type(response, [pd.Series, np.ndarray], "response")
    check_type(alpha, [int, float], "alpha")

    alpha_in_range = 0 <= alpha <= 1
    if not alpha_in_range:
        raise ValueError("alpha must be in range [0 ,1]")

    # leaf node counts are calibrated before the interval
    self._calibrate_leaf_node_counts(data=data)
    self._calibrate_interval(data=data, alpha=alpha, response=response)
def calibrate(
    self,
    data: Union[np.ndarray, pd.DataFrame],
    response: Union[np.ndarray, pd.Series],
    alpha: Union[int, float] = 0.95,
) -> None:
    """Calibrate conformal intervals that can vary by row.

    The type of data is validated here and every argument is then handed
    on, unchanged, to the calibrate method resolved through super().

    Parameters
    ----------
    data : np.ndarray or pd.DataFrame
        Dataset to use to set baselines.

    response : np.ndarray or pd.Series
        The response values for the records in data.

    alpha : int or float, default = 0.95
        Confidence level for the interval.

    """
    check_type(data, [np.ndarray, pd.DataFrame], "data")

    # response and alpha are not checked here, they are passed straight on
    super().calibrate(data=data, alpha=alpha, response=response)
def check_objective_supported(booster: xgb.Booster,
                              supported_objectives: List[str]) -> None:
    """Check that the objective of booster is in supported_objectives.

    The objective name is read from the json config of the booster
    (save_config) under learner -> objective -> name and then checked
    with check_allowed_value.

    Parameters
    ----------
    booster : xgb.Booster
        Model to check the objective of.

    supported_objectives : list of str
        Objective names that are allowed.

    """
    check_type(booster, [xgb.Booster], "booster")
    check_type(supported_objectives, [list], "supported_objectives")

    for position, supported in enumerate(supported_objectives):
        check_type(supported, [str], f"supported_objectives[{position}]")

    config = json.loads(booster.save_config())
    objective_name = config["learner"]["objective"]["name"]

    check_allowed_value(objective_name, supported_objectives,
                        "booster objective not supported")