def test_csr_sparse_center_data(): # Test output format of sparse_center_data, when input is csr X, y = make_regression() X[X < 2.5] = 0.0 csr = sparse.csr_matrix(X) csr_, y, _, _, _ = sparse_center_data(csr, y, True) assert_equal(csr_.getformat(), 'csr')
def _preprocess_data(X, y, fit_intercept, normalize, copy=True, return_mean=False): if not return_mean: return center_data(X, y, fit_intercept, normalize, copy=False) else: return sparse_center_data(X, y, fit_intercept, normalize)
def test_sparse_center_data(): n_samples = 200 n_features = 2 rng = check_random_state(0) # random_state not supported yet in sparse.rand X = sparse.rand(n_samples, n_features, density=.5) # , random_state=rng X = X.tolil() y = rng.rand(n_samples) XA = X.toarray() # XXX: currently scaled to variance=n_samples expected_X_std = np.std(XA, axis=0) * np.sqrt(X.shape[0]) Xt, yt, X_mean, y_mean, X_std = sparse_center_data(X, y, fit_intercept=False, normalize=False) assert_array_almost_equal(X_mean, np.zeros(n_features)) assert_array_almost_equal(y_mean, 0) assert_array_almost_equal(X_std, np.ones(n_features)) assert_array_almost_equal(Xt.A, XA) assert_array_almost_equal(yt, y) Xt, yt, X_mean, y_mean, X_std = sparse_center_data(X, y, fit_intercept=True, normalize=False) assert_array_almost_equal(X_mean, np.mean(XA, axis=0)) assert_array_almost_equal(y_mean, np.mean(y, axis=0)) assert_array_almost_equal(X_std, np.ones(n_features)) assert_array_almost_equal(Xt.A, XA) assert_array_almost_equal(yt, y - np.mean(y, axis=0)) Xt, yt, X_mean, y_mean, X_std = sparse_center_data(X, y, fit_intercept=True, normalize=True) assert_array_almost_equal(X_mean, np.mean(XA, axis=0)) assert_array_almost_equal(y_mean, np.mean(y, axis=0)) assert_array_almost_equal(X_std, expected_X_std) assert_array_almost_equal(Xt.A, XA / expected_X_std) assert_array_almost_equal(yt, y - np.mean(y, axis=0))
def _sparse_fit(self, X, y, Xy=None, coef_init=None): if X.shape[0] != y.shape[0]: raise ValueError("X and y have incompatible shapes.\n" + "Note: Sparse matrices cannot be indexed w/" + "boolean masks (use `indices=True` in CV).") # NOTE: we are explicitly not centering the data the naive way to # avoid breaking the sparsity of X X_data, y, X_mean, y_mean, X_std = sparse_center_data( X, y, self.fit_intercept, self.normalize) if y.ndim == 1: y = y[:, np.newaxis] n_samples, n_features = X.shape[0], X.shape[1] n_targets = y.shape[1] coef_ = self._init_coef(coef_init, n_features, n_targets) dual_gap_ = np.empty(n_targets) eps_ = np.empty(n_targets) l1_reg = self.alpha * self.l1_ratio* n_samples l2_reg =0.0 #self.alpha * (1.0 - self.l1_ratio) * n_samples for k in xrange(n_targets): coef_[k, :], dual_gap_[k], eps_[k] = \ cd_fast.sparse_enet_coordinate_descent( coef_[k, :], l1_reg, l2_reg, X_data, X.indices, X.indptr, y[:, k], X_mean / X_std, self.max_iter, self.tol, True) if dual_gap_[k] > eps_[k]: warnings.warn('Objective did not converge for ' + 'target %d, you might want' % k + ' to increase the number of iterations') self.coef_, self.dual_gap_, self.eps_ = (np.squeeze(a) for a in (coef_, dual_gap_, eps_)) self._set_intercept(X_mean, y_mean, X_std) # return self for chaining fit and predict calls return self
def test_deprecation_center_data(): n_samples = 200 n_features = 2 w = 1.0 + rng.rand(n_samples) X = rng.rand(n_samples, n_features) y = rng.rand(n_samples) param_grid = product([True, False], [True, False], [True, False], [None, w]) for (fit_intercept, normalize, copy, sample_weight) in param_grid: XX = X.copy() # such that we can try copy=False as well X1, y1, X1_mean, X1_var, y1_mean = \ center_data(XX, y, fit_intercept=fit_intercept, normalize=normalize, copy=copy, sample_weight=sample_weight) XX = X.copy() X2, y2, X2_mean, X2_var, y2_mean = \ _preprocess_data(XX, y, fit_intercept=fit_intercept, normalize=normalize, copy=copy, sample_weight=sample_weight) assert_array_almost_equal(X1, X2) assert_array_almost_equal(y1, y2) assert_array_almost_equal(X1_mean, X2_mean) assert_array_almost_equal(X1_var, X2_var) assert_array_almost_equal(y1_mean, y2_mean) # Sparse cases X = sparse.csr_matrix(X) for (fit_intercept, normalize, copy, sample_weight) in param_grid: X1, y1, X1_mean, X1_var, y1_mean = \ center_data(X, y, fit_intercept=fit_intercept, normalize=normalize, copy=copy, sample_weight=sample_weight) X2, y2, X2_mean, X2_var, y2_mean = \ _preprocess_data(X, y, fit_intercept=fit_intercept, normalize=normalize, copy=copy, sample_weight=sample_weight, return_mean=False) assert_array_almost_equal(X1.toarray(), X2.toarray()) assert_array_almost_equal(y1, y2) assert_array_almost_equal(X1_mean, X2_mean) assert_array_almost_equal(X1_var, X2_var) assert_array_almost_equal(y1_mean, y2_mean) for (fit_intercept, normalize) in product([True, False], [True, False]): X1, y1, X1_mean, X1_var, y1_mean = \ sparse_center_data(X, y, fit_intercept=fit_intercept, normalize=normalize) X2, y2, X2_mean, X2_var, y2_mean = \ _preprocess_data(X, y, fit_intercept=fit_intercept, normalize=normalize, return_mean=True) assert_array_almost_equal(X1.toarray(), X2.toarray()) assert_array_almost_equal(y1, y2) assert_array_almost_equal(X1_mean, X2_mean) assert_array_almost_equal(X1_var, X2_var) assert_array_almost_equal(y1_mean, y2_mean)