def predict(self, X: Union[np.ndarray, None] = None) -> np.ndarray:
    """Predict responses with the fitted linear model.

    Args:
        X (Union[np.ndarray, None], optional): Data matrix to predict for. When
            None, the data stored on the model (``self._X``) is used instead.
            Defaults to None.

    Returns:
        np.ndarray: The output of ``mat_mul_inter`` applied to the chosen data
            and ``self.coefficients``.
    """
    # Fall back to the stored dataset when no new data is supplied.
    data = self._X if X is None else X
    return mat_mul_inter(data, self.coefficients)
def loss_sharp(
    alpha: np.ndarray,
    X: np.ndarray,
    Y: np.ndarray,
    epsilon: float,
    lambda1: float = 0,
    lambda2: float = 0,
    weight: Optional[np.ndarray] = None,
) -> float:
    """Exact version of the loss, using a hard threshold on the squared residuals.

    Only items whose squared residual is at most ``epsilon**2`` contribute to
    the sum. Optional L1 and L2 penalties on ``alpha`` are added afterwards.

    Args:
        alpha (np.ndarray): Linear model coefficients.
        X (np.ndarray): Data matrix.
        Y (np.ndarray): Response vector.
        epsilon (float): Error tolerance (squared internally).
        lambda1 (float, optional): L1 penalty factor, applied only if > 0. Defaults to 0.
        lambda2 (float, optional): L2 penalty factor, applied only if > 0. Defaults to 0.
        weight (Optional[np.ndarray], optional): Per-item weights, or None for
            the unweighted loss. Defaults to None.

    Returns:
        float: The loss value.
    """
    eps_sq = epsilon * epsilon
    sq_residuals = (Y - mat_mul_inter(X, alpha)) ** 2
    # Boolean mask of the items that lie within the error tolerance.
    inside = sq_residuals <= eps_sq
    if weight is None:
        n_items = len(Y)
        total = np.sum(sq_residuals[inside] - (eps_sq * n_items)) / n_items
    else:
        weight_sum = np.sum(weight)
        total = (
            np.sum((sq_residuals[inside] - (eps_sq * weight_sum)) * weight[inside])
            / weight_sum
        )
    if lambda1 > 0:
        total += lambda1 * np.sum(np.abs(alpha))
    if lambda2 > 0:
        total += lambda2 * np.sum(alpha * alpha)
    return total
def predict(self, X: Union[np.ndarray, None] = None) -> np.ndarray:
    """Predict outcomes with the approximating linear model.

    Args:
        X (Union[np.ndarray, None], optional): Data matrix to predict for. When
            None, the data stored on the object (``self._X``) is used instead.
            Defaults to None.

    Returns:
        np.ndarray: Prediction vector. If ``self._logit`` is set, the linear
            output is passed through ``sigmoid`` before being returned.
    """
    data = self._X if X is None else X
    prediction = mat_mul_inter(data, self.coefficients)
    # The sigmoid is only applied when the object is flagged as logit.
    return sigmoid(prediction) if self._logit else prediction
def test_model_scaling():
    """Check that ``unscale_model`` is consistent with robust normalisation.

    For several dataset sizes, a random model defined on the normalised data is
    converted with ``unscale_model``. Its predictions on the original data,
    rescaled with ``scale_same``, must match the predictions of the random
    model on the normalised data.
    """
    print("Testing model scaling")
    for dims in (4, 6, 8):
        X, Y, _ = data_create2(dims * 30, dims)
        X_scaled, x_center, x_scale = normalise_robust(X)
        _, y_center, y_scale = normalise_robust(Y)
        # Coefficient vectors of length dims and dims + 1
        # (presumably without and with an intercept term).
        for n_coef in (dims, dims + 1):
            scaled_model = np.random.normal(size=n_coef)
            original_model = unscale_model(
                scaled_model, x_center, x_scale, y_center, y_scale
            )
            Z1 = mat_mul_inter(X, original_model)
            Z2 = mat_mul_inter(X_scaled, scaled_model)
            Z3 = scale_same(Z1, y_center, y_scale)
            assert np.allclose(Z2, Z3), f"Max Diff {np.max(np.abs(Z2 - Z3))}"
def subset(
    self,
    X: Union[np.ndarray, None] = None,
    Y: Union[np.ndarray, None] = None,
) -> np.ndarray:
    """Return the subset (of non-outliers) used for the robust regression model.

    Args:
        X (Union[np.ndarray, None], optional): Data matrix, or None for using
            the fitted dataset. Defaults to None.
        Y (Union[np.ndarray, None], optional): Response vector, or None for
            using the fitted dataset. Defaults to None.

    Returns:
        np.ndarray: Boolean mask that is True where the squared residual is
            strictly below ``self.scaled_epsilon**2``.
    """
    # Both arguments are required; if either is missing the stored data is used for both.
    if X is None or Y is None:
        X, Y = self._X, self._Y
    residuals = mat_mul_inter(X, self.coefficients) - Y
    return residuals**2 < self.scaled_epsilon**2
def subset(
    self,
    X: Union[np.ndarray, None] = None,
    Y: Union[np.ndarray, None] = None,
) -> np.ndarray:
    """Return the subset / neighbourhood used for the approximation (explanation).

    Args:
        X (Union[np.ndarray, None], optional): Data matrix, or None for using
            the fitted dataset. Defaults to None.
        Y (Union[np.ndarray, None], optional): Response vector, or None for
            using the fitted dataset. Defaults to None.

    Returns:
        np.ndarray: Boolean mask that is True where the squared residual is
            strictly below ``self.scaled_epsilon**2``.
    """
    # Both arguments are required; if either is missing the stored data is used for both.
    if X is None or Y is None:
        X, Y = self._X, self._Y
    # When the object is flagged as logit, the responses are transformed with
    # limited_logit before being compared against the linear output.
    target = limited_logit(Y) if self._logit else Y
    residuals = mat_mul_inter(X, self.coefficients) - target
    return residuals**2 < self.scaled_epsilon**2
def plot_2d(
    X: np.ndarray,
    Y: np.ndarray,
    model: np.ndarray,
    epsilon: float,
    x: Optional[np.ndarray] = None,
    y: Optional[float] = None,
    logit: bool = False,
    title: str = "SLISE for Robust Regression",
    label_x: str = "x",
    label_y: str = "y",
    decimals: int = 3,
    fig: Optional[Figure] = None,
):
    """Plot the regression/explanation in a 2D scatter plot with a line for the
    regression model (and the explained item marked).

    The input is validated before any figure or axes are created, so a failing
    call neither leaves a stray pyplot figure open nor adds axes to `fig`.

    Args:
        X (np.ndarray): Data matrix.
        Y (np.ndarray): Response vector.
        model (np.ndarray): Linear model.
        epsilon (float): Error tolerance.
        x (Optional[np.ndarray], optional): Explained item. Defaults to None.
        y (Optional[float], optional): Explained outcome. Defaults to None.
        logit (bool, optional): Should Y be logit-transformed. Defaults to False.
        title (str, optional): Plot title. Defaults to "SLISE for Robust Regression".
        label_x (str, optional): X-axis label. Defaults to "x".
        label_y (str, optional): Y-axis label. Defaults to "y".
        decimals (int, optional): Number of decimals when writing numbers. Defaults to 3.
        fig (Optional[Figure], optional): Pyplot figure to plot on, if None then
            a new plot is created and shown. Defaults to None.

    Raises:
        SliseException: If the data has too many dimensions.
    """
    # Validate first: the plot only makes sense when X holds one value per response.
    if X.size != Y.size:
        raise SliseException(
            f"Can only plot 1D data, |Y| = {Y.size} != {X.size} = |X|")
    if fig is None:
        plot = True
        fig, ax = plt.subplots()
    else:
        ax = fig.subplots()
        plot = False
    x_limits = extended_limits(X, 0.03, 20 if logit else 2)
    y_limits = mat_mul_inter(x_limits[:, None], model)
    # Shaded band of width epsilon around the model line.
    if logit:
        ax.fill_between(
            x_limits,
            sigmoid(y_limits + epsilon),
            sigmoid(y_limits - epsilon),
            color=SLISE_PURPLE + "33",
            label="Subset",
        )
        y_limits = sigmoid(y_limits)
    else:
        ax.fill_between(
            x_limits,
            y_limits + epsilon,
            y_limits - epsilon,
            color=SLISE_PURPLE + "33",
            label="Subset",
        )
    ax.plot(X.ravel(), Y, "o", color="black", label="Dataset")
    if x is not None and y is not None:
        # With an explained item, the item is orange and the model line purple.
        ax.plot(x_limits, y_limits, "-", color=SLISE_PURPLE, label="Model")
        ax.plot(x, y, "o", color=SLISE_ORANGE, label="Explained Item")
    else:
        ax.plot(x_limits, y_limits, "-", color=SLISE_ORANGE, label="Model")
    if isinstance(model, float) or len(model) == 1:
        # Extract the single coefficient explicitly: calling float() directly on
        # a one-element array is deprecated in NumPy.
        coefficient = float(np.ravel(model)[0])
        formula = f"{coefficient:.{decimals}f} * {label_x}"
    elif np.abs(model[0]) > 1e-8:
        sign = "-" if model[1] < 0.0 else "+"
        formula = f"{model[0]:.{decimals}f} {sign} {abs(model[1]):.{decimals}f} $\\cdot$ {label_x}"
    else:
        # The first coefficient is (numerically) zero, so only the second is shown.
        formula = f"{model[1]:.{decimals}f} * {label_x}"
    if logit:
        formula = f"$\\sigma$({formula})"
    ax.legend()
    ax.set_xlabel(label_x)
    ax.set_ylabel(label_y)
    ax.set_title(f"{title}: {label_y} = {formula}")
    fig.tight_layout()
    # Only show the figure when it was created here; a caller-supplied figure is left to the caller.
    if plot:
        plt.show()
def test_slise_reg():
    """Fit SLISE regressions with several configurations and sanity-check them.

    Four fits are run on the same dataset: normalised, unnormalised,
    unregularised and weighted. Each must have a non-positive score and a
    subset fraction within the expected bounds. For the normalised fit, the
    predictions in the original and in the scaled space must also agree.
    """

    def check_fit(fitted, min_fraction):
        # Shared assertions on the loss value and the size of the selected subset.
        assert (
            fitted.score() <= 0
        ), f"SLISE loss should be negative ({fitted.score():.2f}, {fitted.subset().mean():.2f})"
        assert 1.0 >= fitted.subset().mean() > min_fraction

    print("Testing slise regression")
    X, Y, _ = data_create2(40, 5)
    weights = np.random.uniform(size=40) + 0.5

    # Normalised fit: additionally compare predictions across the two scales.
    reg1 = regression(
        X,
        Y,
        epsilon=0.1,
        lambda1=1e-4,
        lambda2=1e-4,
        intercept=True,
        normalise=True,
    )
    reg1.print()
    pred_original = mat_mul_inter(X, reg1.coefficients)
    # Result unused; the call is retained so the scaler is still exercised on the raw responses.
    reg1._scale.scale_y(Y)
    pred_scaled = mat_mul_inter(reg1._scale.scale_x(X), reg1._alpha)
    pred_original_scaled = reg1._scale.scale_y(pred_original)
    assert np.allclose(
        pred_original_scaled,
        pred_scaled,
    ), f"The predicted Y's are not the same {np.max(np.abs(pred_scaled - pred_original_scaled))}"
    check_fit(reg1, 0.75)

    # Unnormalised fit with regularisation.
    reg2 = regression(
        X,
        Y,
        epsilon=0.1,
        lambda1=1e-4,
        lambda2=1e-4,
        intercept=True,
        normalise=False,
    )
    reg2.print()
    check_fit(reg2, 0.5)

    # Unnormalised fit without regularisation.
    reg3 = regression(
        X,
        Y,
        epsilon=0.1,
        lambda1=0,
        lambda2=0,
        intercept=True,
        normalise=False,
    )
    reg3.print()
    check_fit(reg3, 0.5)

    # Unnormalised, regularised fit with per-item weights.
    reg4 = regression(
        X,
        Y,
        epsilon=0.1,
        lambda1=1e-4,
        lambda2=1e-4,
        intercept=True,
        normalise=False,
        weight=weights,
    )
    reg4.print()
    check_fit(reg4, 0.4)