def test_ridge_sample_weights():
    rng = np.random.RandomState(0)

    for solver in ("cholesky", ):
        for n_samples, n_features in ((6, 5), (5, 10)):
            for alpha in (1.0, 1e-2):
                y = rng.randn(n_samples)
                X = rng.randn(n_samples, n_features)
                sample_weight = 1 + rng.rand(n_samples)

                coefs = ridge_regression(X, y,
                                         alpha=alpha,
                                         sample_weight=sample_weight,
                                         solver=solver)

                # Sample weight can be implemented via a simple rescaling
                # for the square loss.
                coefs2 = ridge_regression(
                    X * np.sqrt(sample_weight)[:, np.newaxis],
                    y * np.sqrt(sample_weight),
                    alpha=alpha, solver=solver)
                assert_array_almost_equal(coefs, coefs2)

                # Test for fit_intercept = True
                est = Ridge(alpha=alpha, solver=solver)
                est.fit(X, y, sample_weight=sample_weight)

                # Check using Newton's Method
                # Quadratic function should be solved in a single step.
                # Initialize
                sample_weight = np.sqrt(sample_weight)
                X_weighted = sample_weight[:, np.newaxis] * (
                    np.column_stack((np.ones(n_samples), X)))
                y_weighted = y * sample_weight

                # Gradient is (X*coef-y)*X + alpha*coef_[1:]
                # Remove coef since it is initialized to zero.
                grad = -np.dot(y_weighted, X_weighted)

                # Hessian is (X.T*X) + alpha*I except that the first
                # diagonal element should be zero, since there is no
                # penalization of intercept.
                diag = alpha * np.ones(n_features + 1)
                diag[0] = 0.
                hess = np.dot(X_weighted.T, X_weighted)
                hess.flat[::n_features + 2] += diag
                coef_ = -np.dot(linalg.inv(hess), grad)
                assert_almost_equal(coef_[0], est.intercept_)
                assert_array_almost_equal(coef_[1:], est.coef_)
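# A standalone sketch (not from the original test file) of the closed form
# that the Newton step above reaches in one iteration: for weighted ridge
# with an unpenalized intercept, the normal equations are
# (Xw.T Xw + diag(0, alpha, ..., alpha)) beta = Xw.T yw, where Xw and yw
# are the data rescaled by sqrt(sample_weight).
import numpy as np
from numpy import linalg

rng = np.random.RandomState(0)
n_samples, n_features, alpha = 6, 5, 1.0
X = rng.randn(n_samples, n_features)
y = rng.randn(n_samples)
sw = 1 + rng.rand(n_samples)

Xw = np.sqrt(sw)[:, np.newaxis] * np.column_stack((np.ones(n_samples), X))
yw = np.sqrt(sw) * y
penalty = alpha * np.ones(n_features + 1)
penalty[0] = 0.  # do not penalize the intercept
beta = linalg.solve(Xw.T @ Xw + np.diag(penalty), Xw.T @ yw)
intercept, coef = beta[0], beta[1:]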
def test_ridge_regression_dtype_stability(solver, seed):
    random_state = np.random.RandomState(seed)
    n_samples, n_features = 6, 5
    X = random_state.randn(n_samples, n_features)
    coef = random_state.randn(n_features)
    y = np.dot(X, coef) + 0.01 * random_state.randn(n_samples)
    alpha = 1.0
    results = dict()
    # XXX: Sparse CG seems to be far less numerically stable than the
    # others, maybe we should not enable float32 for this one.
    atol = 1e-3 if solver == "sparse_cg" else 1e-5

    for current_dtype in (np.float32, np.float64):
        results[current_dtype] = ridge_regression(X.astype(current_dtype),
                                                  y.astype(current_dtype),
                                                  alpha=alpha,
                                                  solver=solver,
                                                  random_state=random_state,
                                                  sample_weight=None,
                                                  max_iter=500,
                                                  tol=1e-10,
                                                  return_n_iter=False,
                                                  return_intercept=False)

    assert results[np.float32].dtype == np.float32
    assert results[np.float64].dtype == np.float64
    assert_allclose(results[np.float32], results[np.float64], atol=atol)
def test_ridge_regression_dtype_stability(solver):
    random_state = np.random.RandomState(0)
    n_samples, n_features = 6, 5
    X = random_state.randn(n_samples, n_features)
    coef = random_state.randn(n_features)
    y = np.dot(X, coef) + 0.01 * random_state.randn(n_samples)
    alpha = 1.0
    rtol = 1e-2 if os.name == 'nt' and _IS_32BIT else 1e-5

    results = dict()
    for current_dtype in (np.float32, np.float64):
        results[current_dtype] = ridge_regression(X.astype(current_dtype),
                                                  y.astype(current_dtype),
                                                  alpha=alpha,
                                                  solver=solver,
                                                  random_state=random_state,
                                                  sample_weight=None,
                                                  max_iter=500,
                                                  tol=1e-10,
                                                  return_n_iter=False,
                                                  return_intercept=False)

    assert results[np.float32].dtype == np.float32
    assert results[np.float64].dtype == np.float64
    assert_allclose(results[np.float32], results[np.float64], rtol=rtol)
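# A minimal illustration of the property asserted by the two tests above
# (a sketch, assuming scikit-learn's public
# sklearn.linear_model.ridge_regression): the returned coefficients keep
# the dtype of the input arrays.
import numpy as np
from sklearn.linear_model import ridge_regression

X32 = np.eye(4, dtype=np.float32)
y32 = np.ones(4, dtype=np.float32)
coef32 = ridge_regression(X32, y32, alpha=1.0, solver='cholesky')
assert coef32.dtype == np.float32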
def test_ridge_sample_weights():
    rng = np.random.RandomState(0)
    alpha = 1.0

    for solver in ("sparse_cg", "dense_cholesky", "lsqr"):
        for n_samples, n_features in ((6, 5), (5, 10)):
            y = rng.randn(n_samples)
            X = rng.randn(n_samples, n_features)
            sample_weight = 1 + rng.rand(n_samples)
            coefs = ridge_regression(X, y, alpha, sample_weight,
                                     solver=solver)
            # Sample weight can be implemented via a simple rescaling
            # for the square loss
            coefs2 = ridge_regression(
                X * np.sqrt(sample_weight)[:, np.newaxis],
                y * np.sqrt(sample_weight),
                alpha, solver=solver)
            assert_array_almost_equal(coefs, coefs2)
def test_deprecation_warning_dense_cholesky():
    """Tests that a DeprecationWarning is raised at instantiation of the
    estimators and when ridge_regression is called."""
    warning_class = DeprecationWarning
    warning_message = ("The name 'dense_cholesky' is deprecated."
                       " Using 'cholesky' instead")
    func1 = lambda: Ridge(solver='dense_cholesky')
    func2 = lambda: RidgeClassifier(solver='dense_cholesky')
    X = np.ones([3, 2])
    y = np.zeros(3)
    func3 = lambda: ridge_regression(X, y, alpha=1,
                                     solver='dense_cholesky')
    for func in [func1, func2, func3]:
        assert_warns_message(warning_class, warning_message, func)
def test_ridge_regression_check_arguments_validity(return_intercept,
                                                   sample_weight, arr_type,
                                                   solver):
    """Check that all combinations of arguments give valid estimations."""
    # test excludes 'svd' solver because it raises an exception for sparse
    # inputs
    rng = check_random_state(42)
    X = rng.rand(1000, 3)
    true_coefs = [1, 2, 0.1]
    y = np.dot(X, true_coefs)
    true_intercept = 0.
    if return_intercept:
        true_intercept = 10000.
    y += true_intercept
    X_testing = arr_type(X)

    alpha, atol, tol = 1e-3, 1e-4, 1e-6

    if solver not in ['sag', 'auto'] and return_intercept:
        assert_raises_regex(ValueError,
                            "In Ridge, only 'sag' solver",
                            ridge_regression, X_testing, y,
                            alpha=alpha,
                            solver=solver,
                            sample_weight=sample_weight,
                            return_intercept=return_intercept,
                            tol=tol)
        return

    out = ridge_regression(X_testing, y,
                           alpha=alpha,
                           solver=solver,
                           sample_weight=sample_weight,
                           return_intercept=return_intercept,
                           tol=tol)

    if return_intercept:
        coef, intercept = out
        assert_allclose(coef, true_coefs, rtol=0, atol=atol)
        assert_allclose(intercept, true_intercept, rtol=0, atol=atol)
    else:
        assert_allclose(out, true_coefs, rtol=0, atol=atol)
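# A quick illustration of the constraint exercised above (a sketch,
# assuming the error message used by the scikit-learn version these tests
# target): solvers other than 'sag'/'auto' reject return_intercept=True.
import numpy as np
import pytest
from sklearn.linear_model import ridge_regression

X = np.eye(3)
y = np.ones(3)
with pytest.raises(ValueError, match="In Ridge, only 'sag' solver"):
    ridge_regression(X, y, alpha=1., solver='svd', return_intercept=True)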
def func():
    # `wrong_solver` is supplied by the enclosing test's scope
    # (an invalid solver name).
    X = np.eye(3)
    y = np.ones(3)
    ridge_regression(X, y, alpha=1., solver=wrong_solver)
def solT(X, y):
    return ridge_regression(X, y, alpha=0., solver="cholesky").T
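# With alpha=0 the ridge problem reduces to ordinary least squares, so on
# well-conditioned data solT should agree with a direct least-squares
# solve (a sketch, assuming ridge_regression accepts alpha=0; solT is the
# helper defined above):
import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(10, 3)
y = rng.randn(10)
np.testing.assert_allclose(solT(X, y).ravel(),
                           np.linalg.lstsq(X, y, rcond=None)[0])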
def _fit(self, X, y, sample_weight=None, incremental=False):
    """Fit the model to the data X and target y."""
    # Validate input params
    if self.n_hidden <= 0:
        raise ValueError("n_hidden must be > 0, got %s." % self.n_hidden)
    if self.C <= 0.0:
        raise ValueError("C must be > 0, got %s." % self.C)
    if self.activation not in ACTIVATIONS:
        raise ValueError("The activation %s is not supported. Supported "
                         "activations are %s." % (self.activation,
                                                  ACTIVATIONS))

    # Initialize public attributes
    if not hasattr(self, 'classes_'):
        self.classes_ = None
    if not hasattr(self, 'coef_hidden_'):
        self.coef_hidden_ = None

    # Initialize private attributes
    if not hasattr(self, '_HT_H_accumulated'):
        self._HT_H_accumulated = None

    X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'],
                     dtype=np.float64, order="C", multi_output=True)

    # This outputs a warning when a 1d array is expected
    if y.ndim == 2 and y.shape[1] == 1:
        y = column_or_1d(y, warn=True)

    # Classification
    if isinstance(self, ClassifierMixin):
        self.label_binarizer_.fit(y)

        if self.classes_ is None or not incremental:
            self.classes_ = self.label_binarizer_.classes_
            # if sample_weight is None:
            #     sample_weight = compute_sample_weight(self.class_weight,
            #                                           self.classes_, y)
        else:
            classes = self.label_binarizer_.classes_
            if not np.all(np.in1d(classes, self.classes_)):
                raise ValueError("`y` has classes not in `self.classes_`."
                                 " `self.classes_` has %s. 'y' has %s." %
                                 (self.classes_, classes))

        y = self.label_binarizer_.transform(y)

    # Ensure y is 2D
    if y.ndim == 1:
        y = np.reshape(y, (-1, 1))

    n_samples, n_features = X.shape
    self.n_outputs_ = y.shape[1]

    # Step (1/2): Compute the hidden layer coefficients
    if (self.coef_hidden_ is None or
            (not incremental and not self.warm_start)):
        # Randomize and scale the input-to-hidden coefficients
        self._init_weights(n_features)

    # Step (2/2): Compute hidden-to-output coefficients
    if self.batch_size is None:
        # Run the least-square algorithm on the whole dataset
        batch_size = n_samples
    else:
        # Run the recursive least-square algorithm on mini-batches
        batch_size = self.batch_size

    batches = gen_batches(n_samples, batch_size)

    # (First time call) Run the least-square algorithm on batch 0
    if not incremental or self._HT_H_accumulated is None:
        batch_slice = next(batches)
        H_batch = self._compute_hidden_activations(X[batch_slice])

        # Get sample weights for the batch
        if sample_weight is None:
            sw = None
        else:
            sw = sample_weight[batch_slice]

        # beta_{0} = inv(H_{0}^T H_{0} + (1. / C) * I) * H_{0}.T y_{0}
        self.coef_output_ = ridge_regression(H_batch, y[batch_slice],
                                             1. / self.C,
                                             sample_weight=sw).T

        # Initialize K if this is batch based or partial_fit
        if self.batch_size is not None or incremental:
            # K_{0} = H_{0}^T * W * H_{0}
            weighted_H_batch = _multiply_weights(H_batch, sw)
            self._HT_H_accumulated = safe_sparse_dot(H_batch.T,
                                                     weighted_H_batch)

        if self.verbose:
            y_scores = self._decision_scores(X[batch_slice])

            if self.batch_size is None:
                verbose_string = "Training mean squared error ="
            else:
                verbose_string = "Batch 0, Training mean squared error ="

            print("%s %f" % (verbose_string,
                             mean_squared_error(y[batch_slice], y_scores,
                                                sample_weight=sw)))

    # Run the least-square algorithm on batch 1, 2, ..., n
    for batch, batch_slice in enumerate(batches):
        # Compute hidden activations H_{i} for batch i
        H_batch = self._compute_hidden_activations(X[batch_slice])

        # Get sample weights (sw) for the batch
        if sample_weight is None:
            sw = None
        else:
            sw = sample_weight[batch_slice]

        weighted_H_batch = _multiply_weights(H_batch, sw)

        # Update K_{i+1} by H_{i}^T * W * H_{i}
        self._HT_H_accumulated += safe_sparse_dot(H_batch.T,
                                                  weighted_H_batch)

        # Update beta_{i+1} by
        # K_{i+1}^{-1} * H_{i+1}^T * W * (y_{i+1} - H_{i+1} * beta_{i})
        y_batch = y[batch_slice] - safe_sparse_dot(H_batch,
                                                   self.coef_output_)
        weighted_y_batch = _multiply_weights(y_batch, sw)
        Hy_batch = safe_sparse_dot(H_batch.T, weighted_y_batch)

        # Update hidden-to-output coefficients
        regularized_HT_H = self._HT_H_accumulated.copy()
        regularized_HT_H.flat[::self.n_hidden + 1] += 1. / self.C

        # It is safe to use linalg.solve (instead of linalg.lstsq,
        # which is slow) since it is highly unlikely that
        # regularized_HT_H is singular, thanks to the random projection
        # of the first layer and the 'C' regularization not being
        # dangerously large.
        self.coef_output_ += linalg.solve(regularized_HT_H, Hy_batch,
                                          sym_pos=True,
                                          overwrite_a=True,
                                          overwrite_b=True)

        if self.verbose:
            y_scores = self._decision_scores(X[batch_slice])
            print("Batch %d, Training mean squared error = %f" %
                  (batch + 1, mean_squared_error(y[batch_slice], y_scores,
                                                 sample_weight=sw)))
    return self
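# A standalone sketch of the recursive least-squares identity used in the
# batched path above (names and shapes are illustrative, not from the
# original module): updating beta with the regularized accumulated K
# reproduces the full-batch ridge solution exactly.
import numpy as np
from numpy.linalg import solve

rng = np.random.RandomState(0)
H = rng.randn(20, 5)                     # hidden activations
y = rng.randn(20, 1)
C = 10.0
ridge_eye = np.eye(5) / C

# Full-batch solution: beta = inv(H^T H + I/C) H^T y
beta_full = solve(H.T @ H + ridge_eye, H.T @ y)

# Two-batch recursive update, mirroring the loop above
H0, H1 = H[:10], H[10:]
y0, y1 = y[:10], y[10:]
K = H0.T @ H0                            # K_0
beta = solve(K + ridge_eye, H0.T @ y0)   # beta_0
K += H1.T @ H1                           # K_1 = K_0 + H_1^T H_1
beta += solve(K + ridge_eye, H1.T @ (y1 - H1 @ beta))
np.testing.assert_allclose(beta, beta_full)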
def _fit(self, X, y, sample_weight=None, incremental=False):
    """Fit the model to the data X and target y."""
    # Validate input params
    if self.n_hidden <= 0:
        raise ValueError("n_hidden must be > 0, got %s." % self.n_hidden)
    if self.C <= 0.0:
        raise ValueError("C must be > 0, got %s." % self.C)
    if self.activation not in ACTIVATIONS:
        raise ValueError("The activation %s is not supported. Supported "
                         "activations are %s." % (self.activation,
                                                  ACTIVATIONS))

    # Initialize public attributes
    if not hasattr(self, 'classes_'):
        self.classes_ = None
    if not hasattr(self, 'coef_hidden_'):
        self.coef_hidden_ = None

    # Initialize private attributes
    if not hasattr(self, '_HT_H_accumulated'):
        self._HT_H_accumulated = None

    X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'],
                     dtype=np.float64, order="C", multi_output=True)

    # This outputs a warning when a 1d array is expected
    if y.ndim == 2 and y.shape[1] == 1:
        y = column_or_1d(y, warn=True)

    # Classification
    if isinstance(self, ClassifierMixin):
        self.label_binarizer_.fit(y)

        if self.classes_ is None or not incremental:
            self.classes_ = self.label_binarizer_.classes_
            if sample_weight is None:
                sample_weight = compute_sample_weight(self.class_weight,
                                                      self.classes_, y)
        else:
            classes = self.label_binarizer_.classes_
            if not np.all(np.in1d(classes, self.classes_)):
                raise ValueError("`y` has classes not in `self.classes_`."
                                 " `self.classes_` has %s. 'y' has %s." %
                                 (self.classes_, classes))

        y = self.label_binarizer_.transform(y)

    # Ensure y is 2D
    if y.ndim == 1:
        y = np.reshape(y, (-1, 1))

    n_samples, n_features = X.shape
    self.n_outputs_ = y.shape[1]

    # Step (1/2): Compute the hidden layer coefficients
    if (self.coef_hidden_ is None or
            (not incremental and not self.warm_start)):
        # Randomize and scale the input-to-hidden coefficients
        self._init_weights(n_features)

    # Step (2/2): Compute hidden-to-output coefficients
    if self.batch_size is None:
        # Run the least-square algorithm on the whole dataset
        batch_size = n_samples
    else:
        # Run the recursive least-square algorithm on mini-batches
        batch_size = self.batch_size

    batches = gen_batches(n_samples, batch_size)

    # (First time call) Run the least-square algorithm on batch 0
    if not incremental or self._HT_H_accumulated is None:
        batch_slice = next(batches)
        H_batch = self._compute_hidden_activations(X[batch_slice])

        # Get sample weights for the batch
        if sample_weight is None:
            sw = None
        else:
            sw = sample_weight[batch_slice]

        # beta_{0} = inv(H_{0}^T H_{0} + (1. / C) * I) * H_{0}.T y_{0}
        self.coef_output_ = ridge_regression(H_batch, y[batch_slice],
                                             1. / self.C,
                                             sample_weight=sw).T

        # Initialize K if this is batch based or partial_fit
        if self.batch_size is not None or incremental:
            # K_{0} = H_{0}^T * W * H_{0}
            weighted_H_batch = _multiply_weights(H_batch, sw)
            self._HT_H_accumulated = safe_sparse_dot(H_batch.T,
                                                     weighted_H_batch)

        if self.verbose:
            y_scores = self._decision_scores(X[batch_slice])

            if self.batch_size is None:
                verbose_string = "Training mean squared error ="
            else:
                verbose_string = "Batch 0, Training mean squared error ="

            print("%s %f" % (verbose_string,
                             mean_squared_error(y[batch_slice], y_scores,
                                                sample_weight=sw)))

    # Run the least-square algorithm on batch 1, 2, ..., n
    for batch, batch_slice in enumerate(batches):
        # Compute hidden activations H_{i} for batch i
        H_batch = self._compute_hidden_activations(X[batch_slice])

        # Get sample weights (sw) for the batch
        if sample_weight is None:
            sw = None
        else:
            sw = sample_weight[batch_slice]

        weighted_H_batch = _multiply_weights(H_batch, sw)

        # Update K_{i+1} by H_{i}^T * W * H_{i}
        self._HT_H_accumulated += safe_sparse_dot(H_batch.T,
                                                  weighted_H_batch)

        # Update beta_{i+1} by
        # K_{i+1}^{-1} * H_{i+1}^T * W * (y_{i+1} - H_{i+1} * beta_{i})
        y_batch = y[batch_slice] - safe_sparse_dot(H_batch,
                                                   self.coef_output_)
        weighted_y_batch = _multiply_weights(y_batch, sw)
        Hy_batch = safe_sparse_dot(H_batch.T, weighted_y_batch)

        # Update hidden-to-output coefficients
        regularized_HT_H = self._HT_H_accumulated.copy()
        regularized_HT_H.flat[::self.n_hidden + 1] += 1. / self.C

        # It is safe to use linalg.solve (instead of linalg.lstsq,
        # which is slow) since it is highly unlikely that
        # regularized_HT_H is singular, thanks to the random projection
        # of the first layer and the 'C' regularization not being
        # dangerously large.
        self.coef_output_ += linalg.solve(regularized_HT_H, Hy_batch,
                                          sym_pos=True,
                                          overwrite_a=True,
                                          overwrite_b=True)

        if self.verbose:
            y_scores = self._decision_scores(X[batch_slice])
            print("Batch %d, Training mean squared error = %f" %
                  (batch + 1, mean_squared_error(y[batch_slice], y_scores,
                                                 sample_weight=sw)))
    return self
def solT(X, y):
    return ridge_regression(X, y, alpha=0.).T