def test_preprocess_data_weighted():
    """Check the offsets and norms returned by _preprocess_data with weights.

    Both ``normalize=False`` and ``normalize=True`` are exercised with
    ``fit_intercept=True``; the expected means are sample-weighted averages.
    """
    n_samples, n_features = 200, 2
    X = rng.rand(n_samples, n_features)
    y = rng.rand(n_samples)
    sample_weight = rng.rand(n_samples)

    expected_X_mean = np.average(X, axis=0, weights=sample_weight)
    expected_y_mean = np.average(y, axis=0, weights=sample_weight)

    # XXX: if normalize=True, should we expect a weighted standard deviation?
    #      Currently not weighted, but calculated with respect to weighted mean
    squared_deviation = (X - expected_X_mean) ** 2
    expected_X_norm = (
        np.sqrt(X.shape[0]) * np.mean(squared_deviation, axis=0) ** .5
    )

    # Without normalization the reported norm is all ones, so dividing the
    # centered data by it leaves the expectation unchanged.
    scenarios = (
        (False, np.ones(n_features)),
        (True, expected_X_norm),
    )
    for normalize, wanted_norm in scenarios:
        Xt, yt, X_mean, y_mean, X_norm = _preprocess_data(
            X, y,
            fit_intercept=True,
            normalize=normalize,
            sample_weight=sample_weight,
        )
        assert_array_almost_equal(X_mean, expected_X_mean)
        assert_array_almost_equal(y_mean, expected_y_mean)
        assert_array_almost_equal(X_norm, wanted_norm)
        assert_array_almost_equal(Xt, (X - expected_X_mean) / wanted_norm)
        assert_array_almost_equal(yt, y - expected_y_mean)
def test_preprocess_data_multioutput():
    """Check y centering by _preprocess_data for a 2-D target.

    The same checks run once with a dense X and once with a CSC copy of it.
    """
    n_samples, n_features, n_outputs = 200, 3, 2
    X = rng.rand(n_samples, n_features)
    y = rng.rand(n_samples, n_outputs)
    expected_y_mean = np.mean(y, axis=0)

    for features in (X, sparse.csc_matrix(X)):
        # No intercept: y is returned untouched and its offset is zero.
        _, yt, _, y_mean, _ = _preprocess_data(
            features, y, fit_intercept=False, normalize=False
        )
        assert_array_almost_equal(y_mean, np.zeros(n_outputs))
        assert_array_almost_equal(yt, y)

        # With an intercept, y is centered whether or not X is normalized.
        for normalize in (False, True):
            _, yt, _, y_mean, _ = _preprocess_data(
                features, y, fit_intercept=True, normalize=normalize
            )
            assert_array_almost_equal(y_mean, expected_y_mean)
            assert_array_almost_equal(yt, y - y_mean)
def test_preprocess_data():
    """Check _preprocess_data on dense, unweighted input.

    Three scenarios are covered: no intercept, intercept only, and
    intercept plus normalization.
    """
    n_samples, n_features = 200, 2
    X = rng.rand(n_samples, n_features)
    y = rng.rand(n_samples)

    expected_X_mean = np.mean(X, axis=0)
    expected_X_norm = np.std(X, axis=0) * np.sqrt(X.shape[0])
    expected_y_mean = np.mean(y, axis=0)

    zero_offset = np.zeros(n_features)
    unit_norm = np.ones(n_features)

    # (fit_intercept, normalize, X offset, y offset, X norm) per scenario.
    scenarios = [
        (False, False, zero_offset, 0, unit_norm),
        (True, False, expected_X_mean, expected_y_mean, unit_norm),
        (True, True, expected_X_mean, expected_y_mean, expected_X_norm),
    ]
    for fit_intercept, normalize, want_X_mean, want_y_mean, want_norm in scenarios:
        Xt, yt, X_mean, y_mean, X_norm = _preprocess_data(
            X, y, fit_intercept=fit_intercept, normalize=normalize
        )
        assert_array_almost_equal(X_mean, want_X_mean)
        assert_array_almost_equal(y_mean, want_y_mean)
        assert_array_almost_equal(X_norm, want_norm)
        # A zero offset / unit norm leaves the data as-is, so one formula
        # covers every scenario.
        assert_array_almost_equal(Xt, (X - want_X_mean) / want_norm)
        assert_array_almost_equal(yt, y - want_y_mean)
def fit(self, X, y, sample_weight=None):
    """Fit the model on ``X`` and ``y``.

    The inputs are validated with ``check_X_y``, centered/scaled with
    ``_preprocess_data`` according to ``self.fit_intercept`` and
    ``self.normalize``, optionally rescaled by ``sample_weight``, and then
    passed to ``fracridge`` with ``self.fracs``.

    Parameters
    ----------
    X : array-like
        Training data, validated by ``check_X_y``.
    y : array-like
        Target values; multi-output targets are accepted
        (``multi_output=True``).
    sample_weight : array-like, default=None
        Optional per-sample weights. When given, they are validated with
        ``_check_sample_weight`` and applied through ``_rescale_data``.

    Returns
    -------
    self
        The estimator, with ``coef_``, ``alpha_`` and ``is_fitted_`` set and
        the intercept set through ``self._set_intercept``.
    """
    X, y = check_X_y(X, y, y_numeric=True, multi_output=True)

    if sample_weight is not None:
        sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)

    X, y, X_offset, y_offset, X_scale = _preprocess_data(
        X,
        y,
        fit_intercept=self.fit_intercept,
        normalize=self.normalize,
        copy=self.copy_X,
        sample_weight=sample_weight,
        return_mean=True,
    )

    if sample_weight is not None:
        # Sample weight can be implemented via a simple rescaling.
        # Index the result instead of unpacking it so that extra return
        # values (beyond the rescaled X and y) do not break the fit.
        rescaled = _rescale_data(X, y, sample_weight)
        X, y = rescaled[0], rescaled[1]

    coef, alpha = fracridge(X, y, fracs=self.fracs)
    self.alpha_ = alpha
    self.coef_ = coef
    self._set_intercept(X_offset, y_offset, X_scale)

    # Flag the estimator as fitted only once every step above succeeded, so
    # a failed fit does not leave it marked as fitted without coefficients.
    self.is_fitted_ = True
    return self
def test_csr_preprocess_data():
    """Check that _preprocess_data keeps a CSR input in CSR format."""
    dense, target = make_regression()
    # Zero out every entry below 2.5 before converting to CSR.
    dense[dense < 2.5] = 0.0
    csr_input = sparse.csr_matrix(dense)

    processed, target, _, _, _ = _preprocess_data(csr_input, target, True)

    assert processed.getformat() == "csr"
def _validate_input(self, X, y, sample_weight=None):
    """
    Validate and preprocess the inputs of the estimator.

    ``X`` and ``y`` are checked with ``check_X_y`` and run through
    ``_preprocess_data``; when ``sample_weight`` is given, it is validated
    and applied to both through ``_rescale_data``.

    Returns
    -------
    tuple
        ``(X, y, X_offset, y_offset, X_scale)``.
    """
    X, y = check_X_y(X, y, y_numeric=True, multi_output=True)

    if sample_weight is not None:
        sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)

    preprocessed = _preprocess_data(
        X,
        y,
        fit_intercept=self.fit_intercept,
        normalize=self.normalize,
        copy=self.copy_X,
        sample_weight=sample_weight,
        check_input=True,
    )
    X, y, X_offset, y_offset, X_scale = preprocessed

    if sample_weight is None:
        return X, y, X_offset, y_offset, X_scale

    # Sample weight can be implemented via a simple rescaling.
    # Only the first two items of the result (X and y) are used.
    rescaled = _rescale_data(X, y, sample_weight)
    return rescaled[0], rescaled[1], X_offset, y_offset, X_scale
def test_sparse_preprocess_data_with_return_mean():
    """Check _preprocess_data(..., return_mean=True) on sparse (LIL) input.

    The offsets are expected to be reported, but the sparse X itself is
    expected to be returned uncentered (only scaled when ``normalize=True``).
    """
    n_samples = 200
    n_features = 2
    # random_state not supported yet in sparse.rand
    X = sparse.rand(n_samples, n_features, density=0.5)  # , random_state=rng
    X = X.tolil()
    y = rng.rand(n_samples)
    XA = X.toarray()
    expected_X_scale = np.std(XA, axis=0) * np.sqrt(X.shape[0])

    no_offset = np.zeros(n_features)
    unit_scale = np.ones(n_features)
    XA_mean = np.mean(XA, axis=0)
    y_avg = np.mean(y, axis=0)

    # (fit_intercept, normalize, X offset, y offset, X scale) per scenario.
    scenarios = [
        (False, False, no_offset, 0, unit_scale),
        (True, False, XA_mean, y_avg, unit_scale),
        (True, True, XA_mean, y_avg, expected_X_scale),
    ]
    for fit_intercept, normalize, want_X_mean, want_y_mean, want_scale in scenarios:
        Xt, yt, X_mean, y_mean, X_scale = _preprocess_data(
            X,
            y,
            fit_intercept=fit_intercept,
            normalize=normalize,
            return_mean=True,
        )
        assert_array_almost_equal(X_mean, want_X_mean)
        assert_array_almost_equal(y_mean, want_y_mean)
        assert_array_almost_equal(X_scale, want_scale)
        # Sparse X is never centered; compare through the explicit
        # ``toarray()`` conversion rather than the ``.A`` shorthand.
        assert_array_almost_equal(Xt.toarray(), XA / want_scale)
        assert_array_almost_equal(yt, y - want_y_mean)
def _alpha_max_grp(X, y, groups, center=False, normalize=False):
    """Compute ``alpha_max`` for grouped features. Debug use only.

    This function is costly because it copies X (``copy=True`` in the
    preprocessing call), so it should only be used for debugging.
    """
    n_features = X.shape[1]
    grp_ptr, grp_indices = _grp_converter(groups, n_features)

    X, y, X_offset, _, X_scale = _preprocess_data(
        X, y, center, normalize, copy=True
    )
    X_mean = X_offset / X_scale
    X_dense, X_data, X_indices, X_indptr = _sparse_and_dense(X)

    n_groups = len(grp_ptr) - 1
    dual_norm = dnorm_grp(
        sparse.issparse(X),
        y,
        grp_ptr,
        grp_indices,
        X_dense,
        X_data,
        X_indices,
        X_indptr,
        X_mean,
        n_groups,
        np.zeros(1, dtype=np.int32),
        X_mean.any(),
    )
    return dual_norm / len(y)
def test_sparse_preprocess_data_offsets(global_random_seed):
    """Check the offsets and scales _preprocess_data reports for sparse X.

    The sparse X itself is expected to come back uncentered (only scaled
    when ``normalize=True``), while ``y`` is centered when an intercept is
    fitted.
    """
    rng = np.random.RandomState(global_random_seed)
    n_samples = 200
    n_features = 2
    X = sparse.rand(n_samples, n_features, density=0.5, random_state=rng)
    X = X.tolil()
    y = rng.rand(n_samples)
    XA = X.toarray()
    expected_X_scale = np.std(XA, axis=0) * np.sqrt(X.shape[0])

    # No intercept, no normalization: everything is passed through.
    Xt, yt, X_mean, y_mean, X_scale = _preprocess_data(
        X, y, fit_intercept=False, normalize=False
    )
    assert_array_almost_equal(X_mean, np.zeros(n_features))
    assert_array_almost_equal(y_mean, 0)
    assert_array_almost_equal(X_scale, np.ones(n_features))
    # Use the explicit ``toarray()`` conversion rather than the ``.A``
    # shorthand throughout.
    assert_array_almost_equal(Xt.toarray(), XA)
    assert_array_almost_equal(yt, y)

    # Intercept only: offsets are reported, sparse X is left uncentered.
    Xt, yt, X_mean, y_mean, X_scale = _preprocess_data(
        X, y, fit_intercept=True, normalize=False
    )
    assert_array_almost_equal(X_mean, np.mean(XA, axis=0))
    assert_array_almost_equal(y_mean, np.mean(y, axis=0))
    assert_array_almost_equal(X_scale, np.ones(n_features))
    assert_array_almost_equal(Xt.toarray(), XA)
    assert_array_almost_equal(yt, y - np.mean(y, axis=0))

    # Intercept and normalization: X is scaled but still not centered.
    Xt, yt, X_mean, y_mean, X_scale = _preprocess_data(
        X, y, fit_intercept=True, normalize=True
    )
    assert_array_almost_equal(X_mean, np.mean(XA, axis=0))
    assert_array_almost_equal(y_mean, np.mean(y, axis=0))
    assert_array_almost_equal(X_scale, expected_X_scale)
    assert_array_almost_equal(Xt.toarray(), XA / expected_X_scale)
    assert_array_almost_equal(yt, y - np.mean(y, axis=0))
def test_preprocess_data(global_random_seed):
    """Check _preprocess_data on dense, unweighted, seeded random input."""
    rng = np.random.RandomState(global_random_seed)
    n_samples, n_features = 200, 2
    X = rng.rand(n_samples, n_features)
    y = rng.rand(n_samples)

    expected_X_mean = np.mean(X, axis=0)
    expected_X_scale = np.std(X, axis=0) * np.sqrt(X.shape[0])
    expected_y_mean = np.mean(y, axis=0)

    def check(fit_intercept, normalize, want_X_mean, want_y_mean, want_scale):
        # Run one scenario and compare every returned value.
        Xt, yt, X_mean, y_mean, X_scale = _preprocess_data(
            X, y, fit_intercept=fit_intercept, normalize=normalize
        )
        assert_array_almost_equal(X_mean, want_X_mean)
        assert_array_almost_equal(y_mean, want_y_mean)
        assert_array_almost_equal(X_scale, want_scale)
        # A zero offset / unit scale leaves the data unchanged, so the same
        # expression is valid for all scenarios.
        assert_array_almost_equal(Xt, (X - want_X_mean) / want_scale)
        assert_array_almost_equal(yt, y - want_y_mean)

    ones = np.ones(n_features)
    check(False, False, np.zeros(n_features), 0, ones)
    check(True, False, expected_X_mean, expected_y_mean, ones)
    check(True, True, expected_X_mean, expected_y_mean, expected_X_scale)
def fit(self, X, y):
    """Fit MultiTaskLasso model with Celer"""
    # X and y are validated separately: multi_output=True cannot be used
    # because that would allow y to be csr.
    X_checks = {
        "dtype": [np.float64, np.float32],
        "order": 'F',
        "copy": self.copy_X and self.fit_intercept,
    }
    y_checks = {"ensure_2d": False, "order": 'F'}
    X, y = self._validate_data(
        X, y, validate_separately=(X_checks, y_checks)
    )
    y = y.astype(X.dtype)

    if y.ndim == 1:
        raise ValueError("For mono-task outputs, use Lasso")

    n_samples = X.shape[0]
    if n_samples != y.shape[0]:
        raise ValueError(
            "X and y have inconsistent dimensions (%d != %d)"
            % (n_samples, y.shape[0]))

    X, y, X_offset, y_offset, X_scale = _preprocess_data(
        X, y, self.fit_intercept, self.normalize, copy=False
    )

    # Start from scratch unless a warm start is requested and previous
    # coefficients exist.
    if not (self.warm_start and hasattr(self, "coef_")):
        self.coef_ = None

    path_output = mtl_path(
        X,
        y,
        alphas=[self.alpha],
        coef_init=self.coef_,
        max_iter=self.max_iter,
        max_epochs=self.max_epochs,
        p0=self.p0,
        verbose=self.verbose,
        tol=self.tol,
        prune=self.prune,
    )
    _, coefs, dual_gaps = path_output

    # Read both results before storing either one on the estimator.
    fitted_coef = coefs[..., 0]
    last_gap = dual_gaps[-1]
    self.coef_ = fitted_coef
    self.dual_gap_ = last_gap
    self.n_iter_ = len(dual_gaps)

    self._set_intercept(X_offset, y_offset, X_scale)
    return self
def test_preprocess_copy_data_no_checks(is_sparse, to_copy):
    """Check that ``copy`` controls memory sharing when input checks are off.

    With ``copy=to_copy`` and ``check_input=False``, the returned X must
    share memory with the input exactly when no copy was requested.
    """
    X, y = make_regression()
    X[X < 2.5] = 0.0

    if is_sparse:
        X = sparse.csr_matrix(X)

    X_, y_, _, _, _ = _preprocess_data(
        X, y, True, copy=to_copy, check_input=False
    )

    # For sparse input, compare the underlying ``data`` buffers.
    if is_sparse:
        shares_memory = np.may_share_memory(X_.data, X.data)
    else:
        shares_memory = np.may_share_memory(X_, X)

    if to_copy:
        assert not shares_memory
    else:
        assert shares_memory
def test_preprocess_data_weighted(is_sparse):
    """Check weighted centering and scaling done by _preprocess_data.

    The data contains a constant non-zero and a constant zero feature, which
    are expected to be left unscaled. The normalized output is also compared
    against ``StandardScaler`` fitted with the same sample weights.
    """
    n_samples, n_features = 200, 4

    # Generate random data with 50% of zero values to make sure
    # that the sparse variant of this test is actually sparse. This also
    # shifts the mean value for each columns in X further away from
    # zero.
    X = rng.rand(n_samples, n_features)
    X[X < 0.5] = 0.0

    # Scale the first feature of X to be 10 larger than the other to
    # better check the impact of feature scaling.
    X[:, 0] *= 10

    # Constant non-zero feature.
    X[:, 2] = 1.0

    # Constant zero feature (non-materialized in the sparse case)
    X[:, 3] = 0.0

    y = rng.rand(n_samples)
    sample_weight = rng.rand(n_samples)

    expected_X_mean = np.average(X, axis=0, weights=sample_weight)
    expected_y_mean = np.average(y, axis=0, weights=sample_weight)

    # Weighted variance around the weighted mean computed just above.
    weighted_var = np.average(
        (X - expected_X_mean) ** 2, weights=sample_weight, axis=0
    )
    constant_mask = weighted_var < 10 * np.finfo(X.dtype).eps
    assert_array_equal(constant_mask, [0, 0, 1, 1])

    weight_norm = np.sqrt(sample_weight.sum())
    expected_X_scale = np.sqrt(weighted_var) * weight_norm
    # near constant features should not be scaled
    expected_X_scale[constant_mask] = 1

    if is_sparse:
        X = sparse.csr_matrix(X)

    def preprocess(normalize):
        # Same call for both scenarios; only ``normalize`` differs.
        return _preprocess_data(
            X,
            y,
            fit_intercept=True,
            normalize=normalize,
            sample_weight=sample_weight,
            return_mean=True,
        )

    # normalize is False
    Xt, yt, X_mean, y_mean, X_scale = preprocess(False)
    assert_array_almost_equal(X_mean, expected_X_mean)
    assert_array_almost_equal(y_mean, expected_y_mean)
    assert_array_almost_equal(X_scale, np.ones(n_features))
    if not is_sparse:
        assert_array_almost_equal(Xt, X - expected_X_mean)
    else:
        assert_array_almost_equal(Xt.toarray(), X.toarray())
    assert_array_almost_equal(yt, y - expected_y_mean)

    # normalize is True
    Xt, yt, X_mean, y_mean, X_scale = preprocess(True)
    assert_array_almost_equal(X_mean, expected_X_mean)
    assert_array_almost_equal(y_mean, expected_y_mean)
    assert_array_almost_equal(X_scale, expected_X_scale)
    if not is_sparse:
        assert_array_almost_equal(
            Xt, (X - expected_X_mean) / expected_X_scale
        )
    else:
        # X is not centered
        assert_array_almost_equal(
            Xt.toarray(), X.toarray() / expected_X_scale
        )

    # _preprocess_data with normalize=True scales the data by the feature-wise
    # euclidean norms while StandardScaler scales the data by the feature-wise
    # standard deviations.
    # The two are equivalent up to a ratio of np.sqrt(n_samples) if unweighted
    # or np.sqrt(sample_weight.sum()) if weighted.
    if not is_sparse:
        scaler = StandardScaler(with_mean=True).fit(
            X, sample_weight=sample_weight
        )
        assert_array_almost_equal(scaler.mean_, X_mean)
        assert_array_almost_equal(scaler.transform(X) / weight_norm, Xt)
    else:
        scaler = StandardScaler(with_mean=False).fit(
            X, sample_weight=sample_weight
        )
        scaled = scaler.transform(X).toarray()
        Xt_dense = Xt.toarray()

        # Non-constant features are scaled similarly with np.sqrt(n_samples)
        assert_array_almost_equal(
            scaled[:, :2] / weight_norm, Xt_dense[:, :2]
        )

        # Constant features go through un-scaled.
        assert_array_almost_equal(scaled[:, 2:], Xt_dense[:, 2:])

    assert_array_almost_equal(yt, y - expected_y_mean)
def _alpha_grid(
    X,
    y,
    Xy=None,
    groups=None,
    scale_l2_by="group_length",
    l1_ratio=1.0,
    fit_intercept=True,
    eps=1e-3,
    n_alphas=100,
    normalize=False,
    copy_X=True,
    model=SGL,
):
    """Compute the grid of alpha values for elastic net parameter search.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        Training data. Pass directly as Fortran-contiguous data to avoid
        unnecessary memory duplication

    y : ndarray of shape (n_samples,)
        Target values

    Xy : array-like of shape (n_features,), default=None
        Xy = np.dot(X.T, y) that can be precomputed. If supplying ``Xy``,
        prevent train/test leakage by ensuring the ``Xy`` is precomputed
        using only training data.

    groups : list of numpy.ndarray
        list of arrays of non-overlapping indices for each group. For
        example, if nine features are grouped into equal contiguous groups of
        three, then groups would be ``[array([0, 1, 2]), array([3, 4, 5]),
        array([6, 7, 8])]``. If the feature matrix contains a bias or
        intercept feature, do not include it as a group. If None, all
        features will belong to one group.

    scale_l2_by : ["group_length", None], default="group_length"
        Scaling technique for the group-wise L2 penalty.
        By default, ``scale_l2_by="group_length"`` and the L2 penalty is
        scaled by the square root of the group length so that each variable
        has the same effect on the penalty. This may not be appropriate for
        one-hot encoded features and ``scale_l2_by=None`` would be more
        appropriate for that case. ``scale_l2_by=None`` will also reproduce
        ElasticNet results when all features belong to one group.

    l1_ratio : float, default=1.0
        The elastic net mixing parameter, with ``0 < l1_ratio <= 1``.
        For ``l1_ratio = 0`` the penalty is an L2 penalty. (currently not
        supported) For ``l1_ratio = 1`` it is an L1 penalty. For
        ``0 < l1_ratio < 1``, the penalty is a combination of L1 and L2.

    eps : float, default=1e-3
        Length of the path. ``eps=1e-3`` means that
        ``alpha_min / alpha_max = 1e-3``

    n_alphas : int, default=100
        Number of alphas along the regularization path

    fit_intercept : bool, default=True
        Whether to fit an intercept or not

    normalize : bool, default=False
        This parameter is ignored when ``fit_intercept`` is set to False.
        If True, the regressors X will be normalized before regression by
        subtracting the mean and dividing by the l2-norm.
        If you wish to standardize, please use
        :class:`sklearn.preprocessing.StandardScaler` before calling ``fit``
        on an estimator with ``normalize=False``.

    copy_X : bool, default=True
        If ``True``, X will be copied; else, it may be overwritten.

    model : class, default=SGL
        The estimator class that will be used to confirm that alpha_max sets
        all coef values to zero. The default value of ``model=SGL`` is
        appropriate for regression while ``model=LogisticSGL`` is appropriate
        for classification.

    Returns
    -------
    alphas : ndarray
        When ``l1_ratio == 1.0``, whatever ``_lasso_alpha_grid`` returns.
        Otherwise ``n_alphas`` values, log-spaced from ``alpha_max`` down to
        ``alpha_max * eps`` (decreasing order), or an array filled with
        ``np.finfo(float).resolution`` when ``alpha_max`` does not exceed
        that resolution.

    Raises
    ------
    ValueError
        If ``l1_ratio != 1.0`` and ``scale_l2_by`` is neither
        ``"group_length"`` nor ``None``.
    """
    # The pure lasso case is delegated entirely to the dedicated helper.
    if l1_ratio == 1.0:
        return _lasso_alpha_grid(
            X=X,
            y=y,
            Xy=Xy,
            l1_ratio=l1_ratio,
            fit_intercept=fit_intercept,
            eps=eps,
            n_alphas=n_alphas,
            normalize=normalize,
            copy_X=copy_X,
        )

    n_samples = len(y)
    if Xy is None:
        # Xy was not precomputed: validate and preprocess X, then build it.
        X = check_array(X, accept_sparse=False, copy=(copy_X and fit_intercept))
        X, y, _, _, _ = _preprocess_data(X, y, fit_intercept, normalize, copy=False)
        Xy = safe_sparse_dot(X.T, y, dense_output=True)

    # Promote a 1-D Xy to a single-column 2-D array.
    if Xy.ndim == 1:
        Xy = Xy[:, np.newaxis]

    groups = check_groups(groups, X, allow_overlap=False, fit_intercept=False)

    if scale_l2_by not in ["group_length", None]:
        raise ValueError("scale_l2_by must be 'group_length' or None; "
                         "got {0}".format(scale_l2_by))

    # When l1_ratio < 1 (i.e. not the lasso), then for each group, the
    # smallest alpha for which coef_ = 0 minimizes the objective will be
    # achieved when
    #
    # || S(Xy / n_samples, l1_ratio * alpha) ||_2 == sqrt(p_l) * (1 - l1_ratio) * alpha
    #
    # where S() is the element-wise soft-thresholding operator and p_l is
    # the group size (or 1 if ``scale_l2_by is None``)
    def beta_zero_root(alpha, group):
        # Left-hand side minus right-hand side of the equation above; its
        # root in ``alpha`` is the per-group threshold.
        soft = _soft_threshold(Xy[group] / n_samples, l1_ratio * alpha)
        scale = np.sqrt(group.size) if scale_l2_by == "group_length" else 1
        return np.linalg.norm(soft) - (1 - l1_ratio) * alpha * scale

    # We use the brentq method to find the root, which requires a bracket
    # within which to find the root. We know that ``beta_zero_root`` will
    # be positive when alpha=0. In order to ensure that the upper limit
    # brackets the root, we increase the upper limit until
    # ``beta_zero_root`` returns a negative number for all groups
    def bracket_too_low(alpha):
        # True while at least one group still has a positive residual.
        return any([beta_zero_root(alpha, group=grp) > 0 for grp in groups])

    upper_bracket_lim = 1e1
    while bracket_too_low(upper_bracket_lim):
        upper_bracket_lim *= 10

    # One root per group, all searched within the same bracket.
    min_alphas = np.array([
        root_scalar(
            partial(beta_zero_root, group=grp),
            bracket=[0, upper_bracket_lim],
            method="brentq",
        ).root for grp in groups
    ])

    # Take the largest per-group root and add a 20% margin.
    alpha_max = np.max(min_alphas) * 1.2

    # Test feature sparsity just to make sure we're on the right side of the root
    while (  # pragma: no cover
        model(
            groups=groups,
            alpha=alpha_max,
            l1_ratio=l1_ratio,
            fit_intercept=fit_intercept,
            scale_l2_by=scale_l2_by,
        ).fit(X, y).chosen_features_.size > 0):
        alpha_max *= 1.2  # pragma: no cover

    # Degenerate case: alpha_max is too small for a log-spaced grid, so
    # return a constant grid at the float resolution.
    if alpha_max <= np.finfo(float).resolution:
        alphas = np.empty(n_alphas)
        alphas.fill(np.finfo(float).resolution)
        return alphas

    # Log-spaced grid, reversed so that the largest alpha comes first.
    return np.logspace(np.log10(alpha_max * eps), np.log10(alpha_max), num=n_alphas)[::-1]
def ridge_regression(
    X_train,
    X_test,
    y_train,
    y_test,
    svd_solve=False,
    lambdas=[1e2],
    return_preds=True,
    return_model=False,
    clip_bounds=None,
    intercept=False,
    allow_linalg_warning_instances=False,
):
    """Train ridge regression model for a series of regularization parameters.
    Optionally clip the predictions to bounds. Used as the default solve_function
    argument for single_solve() and kfold_solve() below.

    Parameters
    ----------
        X_{train,test} : :class:`numpy.ndarray`
            Features for training/test data (n_obs_{train,test} X n_ftrs 2darray).
        y_{train,test} : :class:`numpy.ndarray`
            Labels for training/test data (n_obs_{train,test} X n_outcomes 2darray).
        svd_solve : bool, optional
            If true, uses SVD to compute w^*, otherwise does matrix inverse for each
            lambda.
        lambdas : list of floats, optional
            Regularization values to sweep over.
        return_preds : bool, optional
            Whether to return predictions for training and test sets.
        return_model : bool, optional
            Whether to return the trained weights that define the ridge regression
            model.
        clip_bounds : array-like, optional
            If None, do not clip predictions. If not None, must be an array of
            dimension ``n_outcomes X 2``. If any of the elements of the array are
            None, ignore that bound (e.g. if a row of the array is [None, 10], apply
            an upper bound of 10 but no lower bound).
        intercept : bool, optional
            Whether to add an unregulated intercept (or, equivalently, center the X and
            Y data).
        allow_linalg_warning_instances : bool, optional
            If False (default), track for which hyperparameters did ``scipy.linalg``
            raise an ill-conditioned matrix error, which could lead to poor performance.
            This is used to discard these models in a cross-validation context. If True,
            allow these models to be included in the hyperparameter grid search. Note
            that these errors will not occur when using ``cupy.linalg`` (i.e. if a GPU
            is detected), so the default setting may give differing results across
            platforms.

    Returns
    -------
    dict of :class:`numpy.ndarray`
        The results dictionary will always include the following key/value pairs:
            ``metrics_{test,train}`` : array of dimension n_outcomes X n_lambdas
                Each element is a dictionary of {Out-of,In}-sample model performance
                metrics for each lambda

        If ``return_preds``, the following arrays will be appended in order:
            ``y_pred_{test,train}`` : array of dimension n_outcomes X n_lambdas
                Each element is itself a 1darray of {Out-of,In}-sample predictions for
                each lambda. Each 1darray contains n_obs_{test,train} values

        if return_model, the following array will be appended:
            ``models`` : array of dimension n_outcomes X n_lambdas:
                Each element is itself a 1darray of model weights for each lambda. Each
                1darray contains n_ftrs values

    Raises
    ------
    Exception
        In the non-SVD path, if solving for a lambda emits more than one
        warning, or a single warning whose category is not ``LinAlgWarning``.
    """
    # NOTE(review): ``lambdas`` has a mutable default; it is only read here
    # (``len`` / ``enumerate``), never mutated.
    # NOTE(review): ``xp``, ``linalg``, ``linalg_solve_kwargs``, ``asnumpy``,
    # ``GPU`` and ``DEBUG`` are not defined in this block; presumably they are
    # module-level and select the CPU or GPU backend — confirm.

    # get dimensions needed to shape arrays
    n_ftrs, n_outcomes, n_obs_train, n_obs_test = get_dim_lengths(
        X_train, y_train, y_test)
    n_lambdas = len(lambdas)

    # center data if needed
    X_train, y_train, X_offset, y_offset, _ = _preprocess_data(X_train,
                                                               y_train,
                                                               intercept,
                                                               normalize=False)

    # set up the data structures for reporting results
    results_dict = _initialize_results_arrays((n_outcomes, n_lambdas),
                                              return_preds, return_model)

    t1 = time.time()

    # send to GPU if available
    X_train = xp.asarray(X_train)
    y_train = xp.asarray(y_train)

    if DEBUG:
        if GPU:
            print(
                f"Time to transfer X_train and y_train to GPU: {time.time() - t1}"
            )
        # restart the timer so it measures the precomputation below
        t1 = time.time()

    # precomputing large matrices to avoid redundant computation
    if svd_solve:
        # precompute the SVD
        U, s, Vh = linalg.svd(X_train, full_matrices=False)
        V = Vh.T
        UT_dot_y_train = U.T.dot(y_train)
    else:
        XtX = X_train.T.dot(X_train)
        XtY = X_train.T.dot(y_train)

    if DEBUG:
        t2 = time.time()
        print("Time to create XtX matrix:", t2 - t1)

    # iterate over the lambda regularization values
    training_time = 0
    pred_time = 0
    for lx, lambdan in enumerate(lambdas):
        if DEBUG:
            t3 = time.time()

        # train model
        if svd_solve:
            s_lambda = s / (s**2 + lambdan * xp.ones_like(s))
            model = (V * s_lambda).dot(UT_dot_y_train)
            # the SVD path does not track solver warnings
            lambda_warning = None
        else:
            with warnings.catch_warnings(record=True) as w:
                # bind warnings to the value of w
                warnings.simplefilter("always")
                lambda_warning = False
                model = linalg.solve(
                    XtX + lambdan * xp.eye(n_ftrs, dtype=np.float64),
                    XtY,
                    **linalg_solve_kwargs,
                )

                # if there is a warning
                if len(w) > 1:
                    for this_w in w:
                        print(this_w.message)
                    # more than one warning is bad
                    raise Exception(
                        "warning/exception other than LinAlgWarning")
                if len(w) > 0:
                    # if it is a linalg warning
                    if w[0].category == LinAlgWarning:
                        print("linalg warning on lambda={0}: ".format(lambdan),
                              end="")
                        # linalg warning
                        if not allow_linalg_warning_instances:
                            print(
                                "we will discard this model upon model selection"
                            )
                            lambda_warning = True
                        else:
                            lambda_warning = None
                            print(
                                "we will allow this model upon model selection"
                            )
                    else:
                        raise Exception(
                            "warning/exception other than LinAlgWarning")

        if DEBUG:
            t4 = time.time()
            training_time += t4 - t3
            print(f"Training time for lambda {lambdan}: {t4 - t3}")

        #####################
        # compute predictions
        #####################

        # send to gpu if available
        # (y_test is converted back with ``asnumpy`` at the end of each
        # iteration, so the conversion is repeated inside the loop)
        X_test = xp.asarray(X_test)
        y_test = xp.asarray(y_test)
        y_offset = xp.asarray(y_offset)
        X_offset = xp.asarray(X_offset)

        if DEBUG:
            t5 = time.time()

        # train
        # X_train was already centered above, so only the y offset is added
        pred_train = X_train.dot(model) + y_offset
        pred_train = y_to_matrix(pred_train)

        # test
        # X_test was not centered, so the X offset is removed explicitly
        pred_test = X_test.dot(model) - X_offset.dot(model) + y_offset
        pred_test = y_to_matrix(pred_test)

        # clip if needed
        if clip_bounds is not None:
            for ix, i in enumerate(clip_bounds):
                # only apply if both bounds aren't None for this outcome
                # NOTE(review): ``i == None`` looks like an intentional
                # element-wise comparison on an array row (``.all()`` is
                # called on the result) — confirm before changing to ``is``.
                if not (i == None).all():
                    pred_train[:, ix] = xp.clip(pred_train[:, ix], *i)
                    pred_test[:, ix] = xp.clip(pred_test[:, ix], *i)

        if DEBUG:
            t6 = time.time()
            pred_time += t6 - t5

        # bring back to cpu if needed
        pred_train, pred_test = asnumpy(pred_train), asnumpy(pred_test)
        y_train, y_test, model = (
            y_to_matrix(asnumpy(y_train)),
            y_to_matrix(asnumpy(y_test)),
            y_to_matrix(asnumpy(model)),
        )

        # create tuple of lambda index to match argument structure
        # of _fill_results_arrays function
        hp_tuple = (lx, )

        # Transpose model results so that n_outcomes is first dimension
        # so that _fill_results_array can handle it
        model = model.T

        # populate results dict with results from this lambda
        results_dict = _fill_results_arrays(
            y_train,
            y_test,
            pred_train,
            pred_test,
            model,
            hp_tuple,
            results_dict,
            hp_warning=lambda_warning,
        )
    if DEBUG:
        print("Training time:", training_time)
        print("Prediction time:", pred_time)
        print("Total time:", time.time() - t1)
    return results_dict
def test_dtype_preprocess_data():
    """Check the output dtypes of _preprocess_data for float32/float64 input.

    Every output is expected to follow the dtype of X, including when X and
    y have different dtypes, and the inputs must keep their own dtype.
    """
    n_samples, n_features = 200, 2
    X = rng.rand(n_samples, n_features)
    y = rng.rand(n_samples)

    X_32 = np.asarray(X, dtype=np.float32)
    y_32 = np.asarray(y, dtype=np.float32)
    X_64 = np.asarray(X, dtype=np.float64)
    y_64 = np.asarray(y, dtype=np.float64)

    for fit_intercept in [True, False]:
        for normalize in [True, False]:

            def run(X_in, y_in):
                # Returns (Xt, yt, X_mean, y_mean, X_scale).
                return _preprocess_data(
                    X_in,
                    y_in,
                    fit_intercept=fit_intercept,
                    normalize=normalize,
                    return_mean=True,
                )

            out_32 = run(X_32, y_32)
            out_64 = run(X_64, y_64)
            out_3264 = run(X_32, y_64)
            out_6432 = run(X_64, y_32)

            # Each call must return exactly five values, all sharing the
            # dtype of the X that was passed in.
            dtype_expectations = (
                (out_32, np.float32),
                (out_64, np.float64),
                (out_3264, np.float32),
                (out_6432, np.float64),
            )
            for outputs, wanted_dtype in dtype_expectations:
                Xt, yt, X_mean, y_mean, X_scale = outputs
                for value in (Xt, yt, X_mean, y_mean, X_scale):
                    assert value.dtype == wanted_dtype

            # The inputs themselves must keep their dtype.
            assert X_32.dtype == np.float32
            assert y_32.dtype == np.float32
            assert X_64.dtype == np.float64
            assert y_64.dtype == np.float64

            # Both precisions must agree on the values.
            for value_32, value_64 in zip(out_32, out_64):
                assert_array_almost_equal(value_32, value_64)
def test_preprocess_data_weighted(is_sparse):
    """Check weighted centering and scaling done by _preprocess_data.

    The normalized output is also compared against ``StandardScaler`` fitted
    with the same sample weights, up to a factor of ``np.sqrt(n_samples)``.
    """
    n_samples, n_features = 200, 4

    # Generate random data with 50% of zero values to make sure
    # that the sparse variant of this test is actually sparse. This also
    # shifts the mean value for each columns in X further away from
    # zero.
    X = rng.rand(n_samples, n_features)
    X[X < 0.5] = 0.

    # Scale the first feature of X to be 10 larger than the other to
    # better check the impact of feature scaling.
    X[:, 0] *= 10

    # Constant non-zero feature: this edge-case is currently not handled
    # correctly for sparse data, see:
    # https://github.com/scikit-learn/scikit-learn/issues/19450
    # X[:, 2] = 1.

    # Constant zero feature (non-materialized in the sparse case)
    X[:, 3] = 0.

    y = rng.rand(n_samples)
    sample_weight = rng.rand(n_samples)

    expected_X_mean = np.average(X, axis=0, weights=sample_weight)
    expected_y_mean = np.average(y, axis=0, weights=sample_weight)

    # Weighted variance around the weighted mean computed just above.
    weighted_var = np.average(
        (X - expected_X_mean) ** 2, weights=sample_weight, axis=0
    )
    root_n = np.sqrt(n_samples)
    expected_X_scale = np.sqrt(weighted_var) * root_n

    # near constant features should not be scaled
    near_constant = expected_X_scale < 10 * np.finfo(np.float64).eps
    expected_X_scale[near_constant] = 1

    if is_sparse:
        X = sparse.csr_matrix(X)

    def preprocess(normalize):
        # Same call for both scenarios; only ``normalize`` differs.
        return _preprocess_data(
            X,
            y,
            fit_intercept=True,
            normalize=normalize,
            sample_weight=sample_weight,
            return_mean=True,
        )

    # normalize is False
    Xt, yt, X_mean, y_mean, X_scale = preprocess(False)
    assert_array_almost_equal(X_mean, expected_X_mean)
    assert_array_almost_equal(y_mean, expected_y_mean)
    assert_array_almost_equal(X_scale, np.ones(n_features))
    if not is_sparse:
        assert_array_almost_equal(Xt, X - expected_X_mean)
    else:
        assert_array_almost_equal(Xt.toarray(), X.toarray())
    assert_array_almost_equal(yt, y - expected_y_mean)

    # normalize is True
    Xt, yt, X_mean, y_mean, X_scale = preprocess(True)
    assert_array_almost_equal(X_mean, expected_X_mean)
    assert_array_almost_equal(y_mean, expected_y_mean)
    assert_array_almost_equal(X_scale, expected_X_scale)
    if not is_sparse:
        assert_array_almost_equal(
            Xt, (X - expected_X_mean) / expected_X_scale
        )
    else:
        # X is not centered
        assert_array_almost_equal(
            Xt.toarray(), X.toarray() / expected_X_scale
        )

    # _preprocess_data with normalize=True scales the data by the feature-wise
    # euclidean norms while StandardScaler scales the data by the feature-wise
    # standard deviations.
    # The two are equivalent up to a ratio of np.sqrt(n_samples)
    if not is_sparse:
        scaler = StandardScaler(with_mean=True).fit(
            X, sample_weight=sample_weight
        )
        assert_array_almost_equal(scaler.mean_, X_mean)
        assert_array_almost_equal(scaler.transform(X) / root_n, Xt)
    else:
        scaler = StandardScaler(with_mean=False).fit(
            X, sample_weight=sample_weight
        )
        assert_array_almost_equal(
            scaler.transform(X).toarray() / root_n, Xt.toarray()
        )

    assert_array_almost_equal(yt, y - expected_y_mean)