def test_preprocess_data_weighted():
    n_samples = 200
    n_features = 2
    X = rng.rand(n_samples, n_features)
    y = rng.rand(n_samples)
    sample_weight = rng.rand(n_samples)
    expected_X_mean = np.average(X, axis=0, weights=sample_weight)
    expected_y_mean = np.average(y, axis=0, weights=sample_weight)

    # XXX: if normalize=True, should we expect a weighted standard deviation?
    #      Currently not weighted, but calculated with respect to the weighted mean.
    expected_X_norm = (np.sqrt(X.shape[0]) *
                       np.mean((X - expected_X_mean) ** 2, axis=0) ** .5)

    Xt, yt, X_mean, y_mean, X_norm = \
        _preprocess_data(X, y, fit_intercept=True, normalize=False,
                         sample_weight=sample_weight)
    assert_array_almost_equal(X_mean, expected_X_mean)
    assert_array_almost_equal(y_mean, expected_y_mean)
    assert_array_almost_equal(X_norm, np.ones(n_features))
    assert_array_almost_equal(Xt, X - expected_X_mean)
    assert_array_almost_equal(yt, y - expected_y_mean)

    Xt, yt, X_mean, y_mean, X_norm = \
        _preprocess_data(X, y, fit_intercept=True, normalize=True,
                         sample_weight=sample_weight)
    assert_array_almost_equal(X_mean, expected_X_mean)
    assert_array_almost_equal(y_mean, expected_y_mean)
    assert_array_almost_equal(X_norm, expected_X_norm)
    assert_array_almost_equal(Xt, (X - expected_X_mean) / expected_X_norm)
    assert_array_almost_equal(yt, y - expected_y_mean)
def test_preprocess_data():
    n_samples = 200
    n_features = 2
    X = rng.rand(n_samples, n_features)
    y = rng.rand(n_samples)
    expected_X_mean = np.mean(X, axis=0)
    expected_X_norm = np.std(X, axis=0) * np.sqrt(X.shape[0])
    expected_y_mean = np.mean(y, axis=0)

    Xt, yt, X_mean, y_mean, X_norm = \
        _preprocess_data(X, y, fit_intercept=False, normalize=False)
    assert_array_almost_equal(X_mean, np.zeros(n_features))
    assert_array_almost_equal(y_mean, 0)
    assert_array_almost_equal(X_norm, np.ones(n_features))
    assert_array_almost_equal(Xt, X)
    assert_array_almost_equal(yt, y)

    Xt, yt, X_mean, y_mean, X_norm = \
        _preprocess_data(X, y, fit_intercept=True, normalize=False)
    assert_array_almost_equal(X_mean, expected_X_mean)
    assert_array_almost_equal(y_mean, expected_y_mean)
    assert_array_almost_equal(X_norm, np.ones(n_features))
    assert_array_almost_equal(Xt, X - expected_X_mean)
    assert_array_almost_equal(yt, y - expected_y_mean)

    Xt, yt, X_mean, y_mean, X_norm = \
        _preprocess_data(X, y, fit_intercept=True, normalize=True)
    assert_array_almost_equal(X_mean, expected_X_mean)
    assert_array_almost_equal(y_mean, expected_y_mean)
    assert_array_almost_equal(X_norm, expected_X_norm)
    assert_array_almost_equal(Xt, (X - expected_X_mean) / expected_X_norm)
    assert_array_almost_equal(yt, y - expected_y_mean)
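# Note on the expected scale in the test above: std(X) * sqrt(n) is exactly
# the column-wise Euclidean norm of the centered data, which is what the
# normalize=True path divides by. A minimal, illustrative check of that
# identity (not part of the test suite; names here are local to the sketch):
import numpy as np

_demo_rng = np.random.RandomState(0)
X_demo = _demo_rng.rand(200, 2)
Xc_demo = X_demo - X_demo.mean(axis=0)
assert np.allclose(np.std(X_demo, axis=0) * np.sqrt(X_demo.shape[0]),
                   np.linalg.norm(Xc_demo, axis=0))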
def test_preprocess_data_multioutput():
    n_samples = 200
    n_features = 3
    n_outputs = 2
    X = rng.rand(n_samples, n_features)
    y = rng.rand(n_samples, n_outputs)
    expected_y_mean = np.mean(y, axis=0)

    args = [X, sparse.csc_matrix(X)]
    for X in args:
        _, yt, _, y_mean, _ = _preprocess_data(X, y, fit_intercept=False,
                                               normalize=False)
        assert_array_almost_equal(y_mean, np.zeros(n_outputs))
        assert_array_almost_equal(yt, y)

        _, yt, _, y_mean, _ = _preprocess_data(X, y, fit_intercept=True,
                                               normalize=False)
        assert_array_almost_equal(y_mean, expected_y_mean)
        assert_array_almost_equal(yt, y - y_mean)

        _, yt, _, y_mean, _ = _preprocess_data(X, y, fit_intercept=True,
                                               normalize=True)
        assert_array_almost_equal(y_mean, expected_y_mean)
        assert_array_almost_equal(yt, y - y_mean)
def preprocess_data(self, X, y):
    # ensure that we fit the intercept by hand, but normalize if desired
    return _preprocess_data(X, y, fit_intercept=False,
                            normalize=self.normalize, copy=self.copy_X)
def test_csr_preprocess_data():
    # Test the output format of _preprocess_data when the input is CSR
    X, y = make_regression()
    X[X < 2.5] = 0.0
    csr = sparse.csr_matrix(X)
    csr_, y, _, _, _ = _preprocess_data(csr, y, True)
    assert_equal(csr_.getformat(), 'csr')
def test_randomized_logistic_sparse():
    # Check randomized sparse logistic regression on sparse data
    iris = load_iris()
    X = iris.data[:, [0, 2]]
    y = iris.target
    X = X[y != 2]
    y = y[y != 2]

    # center here because sparse matrices are usually not centered
    # labels should not be centered
    X, _, _, _, _ = _preprocess_data(X, y, True, True)

    X_sp = sparse.csr_matrix(X)
    F, _ = f_classif(X, y)

    scaling = 0.3
    clf = RandomizedLogisticRegression(verbose=False, C=1., random_state=42,
                                       scaling=scaling, n_resampling=50,
                                       tol=1e-3)
    feature_scores = clf.fit(X, y).scores_

    clf = RandomizedLogisticRegression(verbose=False, C=1., random_state=42,
                                       scaling=scaling, n_resampling=50,
                                       tol=1e-3)
    feature_scores_sp = clf.fit(X_sp, y).scores_

    assert_array_equal(feature_scores, feature_scores_sp)
def _preprocess_data(self, X, y, fit_intercept, normalize=False):
    """Center the data in X but not in y."""
    X, _, X_offset, _, X_scale = _preprocess_data(X, y, fit_intercept,
                                                  normalize=normalize)
    return X, y, X_offset, y, X_scale
def test_sparse_preprocess_data_with_return_mean():
    n_samples = 200
    n_features = 2
    # random_state not supported yet in sparse.rand
    X = sparse.rand(n_samples, n_features, density=.5)  # , random_state=rng
    X = X.tolil()
    y = rng.rand(n_samples)
    XA = X.toarray()
    expected_X_norm = np.std(XA, axis=0) * np.sqrt(X.shape[0])

    Xt, yt, X_mean, y_mean, X_norm = \
        _preprocess_data(X, y, fit_intercept=False, normalize=False,
                         return_mean=True)
    assert_array_almost_equal(X_mean, np.zeros(n_features))
    assert_array_almost_equal(y_mean, 0)
    assert_array_almost_equal(X_norm, np.ones(n_features))
    assert_array_almost_equal(Xt.A, XA)
    assert_array_almost_equal(yt, y)

    Xt, yt, X_mean, y_mean, X_norm = \
        _preprocess_data(X, y, fit_intercept=True, normalize=False,
                         return_mean=True)
    assert_array_almost_equal(X_mean, np.mean(XA, axis=0))
    assert_array_almost_equal(y_mean, np.mean(y, axis=0))
    assert_array_almost_equal(X_norm, np.ones(n_features))
    assert_array_almost_equal(Xt.A, XA)
    assert_array_almost_equal(yt, y - np.mean(y, axis=0))

    Xt, yt, X_mean, y_mean, X_norm = \
        _preprocess_data(X, y, fit_intercept=True, normalize=True,
                         return_mean=True)
    assert_array_almost_equal(X_mean, np.mean(XA, axis=0))
    assert_array_almost_equal(y_mean, np.mean(y, axis=0))
    assert_array_almost_equal(X_norm, expected_X_norm)
    assert_array_almost_equal(Xt.A, XA / expected_X_norm)
    assert_array_almost_equal(yt, y - np.mean(y, axis=0))
# parametrized over both flags so every copy/sparsity combination is exercised
@pytest.mark.parametrize('is_sparse', [True, False])
@pytest.mark.parametrize('to_copy', [True, False])
def test_preprocess_copy_data_no_checks(is_sparse, to_copy):
    X, y = make_regression()
    X[X < 2.5] = 0.0

    if is_sparse:
        X = sparse.csr_matrix(X)

    X_, y_, _, _, _ = _preprocess_data(X, y, True,
                                       copy=to_copy, check_input=False)

    if to_copy and is_sparse:
        assert not np.may_share_memory(X_.data, X.data)
    elif to_copy:
        assert not np.may_share_memory(X_, X)
    elif is_sparse:
        assert np.may_share_memory(X_.data, X.data)
    else:
        assert np.may_share_memory(X_, X)
def test_dtype_preprocess_data():
    n_samples = 200
    n_features = 2
    X = rng.rand(n_samples, n_features)
    y = rng.rand(n_samples)

    X_32 = np.asarray(X, dtype=np.float32)
    y_32 = np.asarray(y, dtype=np.float32)
    X_64 = np.asarray(X, dtype=np.float64)
    y_64 = np.asarray(y, dtype=np.float64)

    for fit_intercept in [True, False]:
        for normalize in [True, False]:

            Xt_32, yt_32, X_mean_32, y_mean_32, X_norm_32 = _preprocess_data(
                X_32, y_32, fit_intercept=fit_intercept, normalize=normalize,
                return_mean=True)

            Xt_64, yt_64, X_mean_64, y_mean_64, X_norm_64 = _preprocess_data(
                X_64, y_64, fit_intercept=fit_intercept, normalize=normalize,
                return_mean=True)

            Xt_3264, yt_3264, X_mean_3264, y_mean_3264, X_norm_3264 = (
                _preprocess_data(X_32, y_64, fit_intercept=fit_intercept,
                                 normalize=normalize, return_mean=True))

            Xt_6432, yt_6432, X_mean_6432, y_mean_6432, X_norm_6432 = (
                _preprocess_data(X_64, y_32, fit_intercept=fit_intercept,
                                 normalize=normalize, return_mean=True))

            assert_equal(Xt_32.dtype, np.float32)
            assert_equal(yt_32.dtype, np.float32)
            assert_equal(X_mean_32.dtype, np.float32)
            assert_equal(y_mean_32.dtype, np.float32)
            assert_equal(X_norm_32.dtype, np.float32)

            assert_equal(Xt_64.dtype, np.float64)
            assert_equal(yt_64.dtype, np.float64)
            assert_equal(X_mean_64.dtype, np.float64)
            assert_equal(y_mean_64.dtype, np.float64)
            assert_equal(X_norm_64.dtype, np.float64)

            assert_equal(Xt_3264.dtype, np.float32)
            assert_equal(yt_3264.dtype, np.float32)
            assert_equal(X_mean_3264.dtype, np.float32)
            assert_equal(y_mean_3264.dtype, np.float32)
            assert_equal(X_norm_3264.dtype, np.float32)

            assert_equal(Xt_6432.dtype, np.float64)
            assert_equal(yt_6432.dtype, np.float64)
            assert_equal(X_mean_6432.dtype, np.float64)
            assert_equal(y_mean_6432.dtype, np.float64)
            assert_equal(X_norm_6432.dtype, np.float64)

            assert_equal(X_32.dtype, np.float32)
            assert_equal(y_32.dtype, np.float32)
            assert_equal(X_64.dtype, np.float64)
            assert_equal(y_64.dtype, np.float64)

            assert_array_almost_equal(Xt_32, Xt_64)
            assert_array_almost_equal(yt_32, yt_64)
            assert_array_almost_equal(X_mean_32, X_mean_64)
            assert_array_almost_equal(y_mean_32, y_mean_64)
            assert_array_almost_equal(X_norm_32, X_norm_64)
def _alpha_grid(X, y, Xy=None, l1_ratio=1.0, fit_intercept=True,
                eps=1e-3, n_alphas=100, normalize=False, copy_X=True):
    """Compute the grid of alpha values for elastic net parameter search.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        Training data. Pass directly as Fortran-contiguous data to avoid
        unnecessary memory duplication.

    y : ndarray, shape (n_samples,)
        Target values.

    Xy : array-like, optional
        Xy = np.dot(X.T, y) that can be precomputed.

    l1_ratio : float
        The elastic net mixing parameter, with ``0 < l1_ratio <= 1``.
        For ``l1_ratio = 0`` the penalty is an L2 penalty (currently not
        supported). For ``l1_ratio = 1`` it is an L1 penalty. For
        ``0 < l1_ratio < 1``, the penalty is a combination of L1 and L2.

    eps : float, optional
        Length of the path. ``eps=1e-3`` means that
        ``alpha_min / alpha_max = 1e-3``.

    n_alphas : int, optional
        Number of alphas along the regularization path.

    fit_intercept : boolean, default True
        Whether to fit an intercept or not.

    normalize : boolean, optional, default False
        If ``True``, the regressors X will be normalized before regression.
        This parameter is ignored when ``fit_intercept`` is set to ``False``.
        When the regressors are normalized, note that this makes the
        hyperparameters learnt more robust and almost independent of the
        number of samples. The same property is not valid for standardized
        data. However, if you wish to standardize, please use
        :class:`preprocessing.StandardScaler` before calling ``fit`` on an
        estimator with ``normalize=False``.

    copy_X : boolean, optional, default True
        If ``True``, X will be copied; else, it may be overwritten.
    """
    if l1_ratio == 0:
        raise ValueError("Automatic alpha grid generation is not supported "
                         "for l1_ratio=0. Please supply a grid by providing "
                         "your estimator with the appropriate `alphas=` "
                         "argument.")
    n_samples = len(y)

    sparse_center = False
    if Xy is None:
        X_sparse = sparse.isspmatrix(X)
        sparse_center = X_sparse and (fit_intercept or normalize)
        X = check_array(X, 'csc',
                        copy=(copy_X and fit_intercept and not X_sparse))
        if not X_sparse:
            # X can be touched inplace thanks to the above line
            X, y, _, _, _ = _preprocess_data(X, y, fit_intercept,
                                             normalize, copy=False)
        Xy = safe_sparse_dot(X.T, y, dense_output=True)

        if sparse_center:
            # Workaround to find alpha_max for sparse matrices,
            # since we should not destroy the sparsity of such matrices.
            _, _, X_offset, _, X_scale = _preprocess_data(
                X, y, fit_intercept, normalize, return_mean=True)
            mean_dot = X_offset * np.sum(y)

    if Xy.ndim == 1:
        Xy = Xy[:, np.newaxis]

    if sparse_center:
        if fit_intercept:
            Xy -= mean_dot[:, np.newaxis]
        if normalize:
            Xy /= X_scale[:, np.newaxis]

    alpha_max = (np.sqrt(np.sum(Xy ** 2, axis=1)).max() /
                 (n_samples * l1_ratio))

    if alpha_max <= np.finfo(float).resolution:
        alphas = np.empty(n_alphas)
        alphas.fill(np.finfo(float).resolution)
        return alphas

    return np.logspace(np.log10(alpha_max * eps), np.log10(alpha_max),
                       num=n_alphas)[::-1]
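# For a dense, single-target problem the grid above reduces to the closed
# form alpha_max = max_j |x_j^T y| / (n_samples * l1_ratio) on centered data,
# followed by a log-spaced grid from alpha_max down to eps * alpha_max
# (largest first). A minimal sketch of that reduction; shapes and names are
# illustrative only:
import numpy as np

_demo_rng = np.random.RandomState(0)
X_demo = _demo_rng.rand(50, 4)
y_demo = _demo_rng.rand(50)

# center, as _preprocess_data would with fit_intercept=True, normalize=False
Xc_demo = X_demo - X_demo.mean(axis=0)
yc_demo = y_demo - y_demo.mean()

l1_ratio_demo = 1.0
alpha_max_demo = np.max(np.abs(Xc_demo.T @ yc_demo)) / \
    (Xc_demo.shape[0] * l1_ratio_demo)

eps_demo, n_alphas_demo = 1e-3, 100
alphas_demo = np.logspace(np.log10(alpha_max_demo * eps_demo),
                          np.log10(alpha_max_demo),
                          num=n_alphas_demo)[::-1]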
def fit(self, X, y, seed=None, verbose=False, sample_weight=None):
    """Fit data according to the UoI-Lasso algorithm.

    Relevant information (fits, residuals, model performance) is stored
    within the object; the fitted estimator itself is returned.

    Parameters
    ----------
    X : np array (2d)
        The design matrix, containing the predictors. Its shape is assumed
        to be (number of samples, number of features).

    y : np array (1d)
        The vector of dependent variables. Its length is assumed to be
        (number of samples,).

    seed : int
        A seed for the random number generator. This number is relevant
        for choosing bootstraps and dividing the data into training and
        test sets.

    verbose : boolean
        A boolean switch indicating whether the fitting should print out
        its progress.

    sample_weight : np array (1d), optional
        Individual weights for each sample.
    """
    # initialize the seed, if it's provided
    if seed is not None:
        np.random.seed(seed)

    # start taken from sklearn.LinearModels.base.LinearRegression
    X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'],
                     y_numeric=True, multi_output=True)

    # preprocess data through centering and normalization
    X, y, X_offset, y_offset, X_scale = _preprocess_data(
        X, y, fit_intercept=self.fit_intercept, normalize=self.normalize,
        copy=self.copy_X)

    if sample_weight is not None and np.atleast_1d(sample_weight).ndim > 1:
        raise ValueError("Sample weights must be 1D array or scalar")

    if sample_weight is not None:
        # Sample weight can be implemented via a simple rescaling.
        X, y = _rescale_data(X, y, sample_weight)

    # extract model dimensions from the design matrix
    self.n_samples_, self.n_features_ = X.shape

    if verbose:
        print('(1) Loaded data.\n %s samples with %s features.'
              % (self.n_samples_, self.n_features_))

    # perform an initial coarse sweep over the lambda parameters
    # to zero in on the relevant regularization region
    if self.n_lambdas == 1:
        lambda_coarse = np.array([1.0])
    else:
        lambda_coarse = np.logspace(-3., 3., self.n_lambdas,
                                    dtype=np.float64)

    # run the coarse lasso sweep
    estimates_coarse, scores_coarse = \
        self.lasso_sweep(
            X, y, lambda_coarse, self.train_frac_sel, self.n_boots_coarse,
            self.use_admm, desc='coarse lasso sweep', verbose=verbose
        )

    # deduce the index that maximizes the explained variance over bootstraps
    lambda_max_idx = np.argmax(np.mean(scores_coarse, axis=0))
    # obtain the lambda which maximizes the explained variance over bootstraps
    lambda_max = lambda_coarse[lambda_max_idx]
    # in our dense sweep, we'll explore lambda values which encompass a
    # range that's one order of magnitude less than lambda_max itself
    d_lambda = 10 ** (np.floor(np.log10(lambda_max)) - 1)

    # now that we've narrowed down the regularization parameters,
    # we'll run a dense sweep which begins the model selection module of UoI

    #######################
    ### Model Selection ###
    #######################
    if verbose:
        print('(2) Beginning model selection. Exploring penalty region '
              'centered at %d.' % lambda_max)

    # create the final lambda set based on the coarse sweep
    if self.n_lambdas == 1:
        lambdas = np.array([lambda_max])
    else:
        lambdas = np.linspace(lambda_max - 5 * d_lambda,
                              lambda_max + 5 * d_lambda,
                              self.n_lambdas, dtype=np.float64)

    # run the lasso sweep with the new lambda set
    estimates_dense, scores_dense = \
        self.lasso_sweep(
            X, y, lambdas, self.train_frac_sel, self.n_boots_sel,
            self.use_admm, desc='fine lasso sweep', verbose=verbose
        )

    # choose the selection fraction threshold values to use
    selection_frac_thresholds = np.linspace(self.selection_thres_min,
                                            self.selection_thres_max,
                                            self.n_selection_thres)
    # calculate the actual number of thresholds, dropping any repetitions
    selection_thresholds = np.sort(np.unique(
        (self.n_boots_sel * selection_frac_thresholds).astype('int')))

    # create the support matrix
    self.supports_ = np.zeros(
        (self.n_selection_thres, self.n_lambdas, self.n_features_),
        dtype=bool)

    # iterate over each stability selection threshold
    for thres_idx, threshold in enumerate(selection_thresholds):
        # calculate the support given the specific selection threshold
        self.supports_[thres_idx, :] = np.count_nonzero(
            estimates_dense, axis=0) >= threshold

    # reshape the support matrix so that the first axis consists of all
    # combinations of hyperparameters
    self.supports_ = np.reshape(
        self.supports_,
        (self.n_selection_thres * self.n_lambdas, self.n_features_))

    ########################
    ### Model Estimation ###
    ########################
    # we'll use the supports obtained in the selection module to calculate
    # bagged OLS estimates over bootstraps
    if verbose:
        print('(3) Model selection complete. Beginning model estimation, '
              'with %s bootstraps.' % self.n_boots_est)

    # create or overwrite arrays to collect final results
    self.coef_ = np.zeros(self.n_features_, dtype=np.float32)
    self.scores_ = np.zeros(1, dtype=np.float32)
    # determine how many samples will be used for overall training
    train_split = int(round(self.train_frac_overall * self.n_samples_))
    # determine how many samples will be used for training within a bootstrap
    boot_train_split = int(round(self.train_frac_est * train_split))

    # set up data arrays
    estimates = np.zeros(
        (self.n_boots_est, self.n_lambdas, self.n_features_),
        dtype=np.float32)
    scores = np.zeros((self.n_boots_est, self.n_lambdas), dtype=np.float32)

    # either we plan on using a test set, or we'll use the entire dataset
    # for training
    if self.train_frac_overall < 1:
        # generate indices for the global training and testing blocks
        indices = np.random.permutation(self.n_samples_)
        train, test = np.split(indices, [train_split])
        # compile the training and test sets
        X_train = X[train]
        y_train = y[train]
        X_test = X[test]
        y_test = y[test]
    else:
        X_train = X
        y_train = y

    # iterate over bootstrap samples
    for bootstrap in trange(self.n_boots_est, desc='Model Estimation',
                            disable=not verbose):
        # extract the bootstrap indices, keeping a fraction of the data
        # available for testing
        bootstrap_indices = np.random.permutation(train_split)
        train_boot, test_boot = np.split(bootstrap_indices,
                                         [boot_train_split])
        # iterate over the regularization parameters
        for lamb_idx, lamb in enumerate(lambdas):
            support = self.supports_[lamb_idx]
            if np.any(support):
                # fit OLS using the supports from the selection module
                X_boot = X_train[train_boot]
                y_boot = y_train[train_boot]
                ols = lm.LinearRegression()
                ols.fit(X_boot[:, support], y_boot - y_boot.mean())
                # store the fitted coefficients
                estimates[bootstrap, lamb_idx, support] = ols.coef_

                # calculate and store the performance on the test set
                y_hat_boot = np.dot(X_train[test_boot],
                                    estimates[bootstrap, lamb_idx, :])
                y_true_boot = y_train[test_boot] - y_train[test_boot].mean()
                # calculate the sum of squared residuals
                rss = np.sum((y_hat_boot - y_true_boot) ** 2)
                # score the fit with the requested estimation score
                if self.estimation_score == 'r2':
                    scores[bootstrap, lamb_idx] = r2_score(y_true_boot,
                                                           y_hat_boot)
                elif self.estimation_score == 'BIC':
                    n_selected_features = np.count_nonzero(support)
                    scores[bootstrap, lamb_idx] = -utils.BIC(
                        n_features=n_selected_features,
                        n_samples=boot_train_split,
                        rss=rss)
            else:
                # if no variables were selected, print a message;
                # we'll leave the scores array unchanged, so any support
                # with no selection will be assigned a score of 0.
                print('No variables selected in the support for '
                      'lambda = %d.' % lamb)

    if verbose:
        print('(4) Bagging estimates, using bagging option %s.'
              % self.bagging_options)

    if self.bagging_options == 1:
        # bagging option 1: for each bootstrap sample, find the
        # regularization parameter that gave the best results
        lambda_max_idx = np.argmax(scores, axis=1)
        # extract the estimates over bootstraps from the model with the
        # best lambda
        best_estimates = estimates[np.arange(self.n_boots_est),
                                   lambda_max_idx, :]
        # take the median across estimates for the final, bagged estimate
        self.coef_ = np.median(best_estimates, axis=0)
    elif self.bagging_options == 2:
        # bagging option 2: average estimates across bootstraps, and then
        # find the regularization parameter that gives the best results
        mean_scores = np.mean(scores, axis=0)
        lambda_max_idx = np.argmax(mean_scores)
        self.coef_ = np.median(estimates[:, lambda_max_idx, :], 0)
    else:
        raise ValueError('Bagging option %d is not available.'
                         % self.bagging_options)

    # if we extracted a test set, evaluate the model
    if self.train_frac_overall < 1:
        # finally, see how the bagged estimates perform on the test set
        y_hat = np.dot(X_test, self.coef_)
        y_true = y_test - y_test.mean()
        # calculate and store the performance of the final UoI_Lasso
        # estimator on the test set
        self.scores_ = r2_score(y_true, y_hat)
    else:
        self.scores_ = None

    if verbose:
        print("---> UoI Lasso complete.")

    if y.ndim == 1:
        self.coef_ = np.ravel(self.coef_)

    self._set_intercept(X_offset, y_offset, X_scale)

    return self
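# utils.BIC above is called with (n_features, n_samples, rss). A sketch of
# the usual Gaussian least-squares form it presumably implements is given
# below; this is an assumption made for illustration, not the package's
# actual definition.
import numpy as np

def _bic_gaussian_sketch(n_features, n_samples, rss):
    # BIC = n * log(RSS / n) + k * log(n) under a Gaussian likelihood
    return n_samples * np.log(rss / n_samples) + \
        n_features * np.log(n_samples)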
def fit(self, X, y, init=None):
    """Fit the Poisson GLM with coordinate descent.

    Parameters
    ----------
    X : nd-array, shape (n_samples, n_features)
        The design matrix.

    y : nd-array, shape (n_samples,)
        Response vector. Will be cast to X's dtype if necessary.
        Currently, this implementation does not handle multiple response
        variables.

    init : nd-array, shape (n_features,)
        Initialization for the parameters.
    """
    self.n_samples, self.n_features = X.shape

    # initialization
    if self.warm_start and hasattr(self, 'coef_'):
        coef = self.coef_
    else:
        coef = np.zeros(shape=(self.n_features))
        if init is not None:
            coef = init
    intercept = 0

    # we will handle the intercept by hand: only preprocess the design
    # matrix
    X, _, X_offset, _, X_scale = _preprocess_data(X, y, fit_intercept=False,
                                                  normalize=self.normalize)
    # all features are initially active
    active_idx = np.arange(self.n_features)
    coef_update = np.zeros(coef.shape)

    # perform coordinate descent updates
    for iteration in range(self.max_iter):
        # linearize the log-likelihood
        w, z = self.adjusted_response(X, y, coef, intercept)
        # perform an update of coordinate descent
        coef_update, intercept = self.cd_sweep(coef=coef,
                                               intercept=intercept,
                                               X=X, w=w, z=z,
                                               active_idx=active_idx)
        # check convergence
        if np.max(np.abs(coef_update - coef)) < self.tol:
            break
        coef = coef_update
        # update the active features
        active_idx = np.argwhere(coef != 0).ravel()

    self.intercept_ = intercept
    self.coef_ = coef_update / X_scale
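# The adjusted_response helper called above is not shown here. For a Poisson
# GLM with a log link, the standard IRLS linearization would look roughly
# like the sketch below; the function body is an assumption made for
# illustration, not the class's actual implementation.
import numpy as np

def _adjusted_response_log_link(X, y, coef, intercept):
    eta = X @ coef + intercept    # linear predictor
    mu = np.exp(eta)              # mean under the log link
    w = mu                        # working weights
    z = eta + (y - mu) / mu       # working (adjusted) response
    return w, z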
def fit(self, X, y, groups=None, seed=None, verbose=False,
        sample_weight=None, option=True):
    """Fit data according to the UoI-Lasso algorithm.

    Relevant information (fits, residuals, model performance) is stored
    within the object; the fitted estimator itself is returned.

    Parameters
    ----------
    X : np array (2d)
        The design matrix, containing the predictors. Its shape is assumed
        to be (number of samples, number of features).

    y : np array (1d)
        The vector of dependent variables. Its length is assumed to be
        (number of samples,).

    groups : np array (1d), optional
        Group labels used to level the train/test splits across groups.

    seed : int
        A seed for the random number generator. This number is relevant
        for choosing bootstraps and dividing the data into training and
        test sets.

    verbose : boolean
        A boolean switch indicating whether the fitting should print out
        its progress.

    sample_weight : np array (1d), optional
        Individual weights for each sample.

    option : boolean, default True
        Not referenced in this implementation.
    """
    # initialize the seed, if it's provided
    if seed is not None:
        np.random.seed(seed)

    X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'],
                     y_numeric=True, multi_output=True)

    # preprocess data through centering and normalization
    X, y, X_offset, y_offset, X_scale = _preprocess_data(
        X, y, fit_intercept=self.fit_intercept, normalize=self.normalize,
        copy=self.copy_X)

    if sample_weight is not None and np.atleast_1d(sample_weight).ndim > 1:
        raise ValueError("Sample weights must be 1D array or scalar")

    if sample_weight is not None:
        # Sample weight can be implemented via a simple rescaling.
        X, y = _rescale_data(X, y, sample_weight)

    # extract model dimensions from the design matrix
    self.n_samples_, self.n_features_ = X.shape
    # create or overwrite arrays to collect final results
    self.coef_ = np.zeros(self.n_features_, dtype=np.float32)

    # group leveling
    if groups is None:
        self.groups_ = np.ones(self.n_samples_)
    else:
        self.groups_ = np.array(groups)

    if verbose:
        print('(1) Loaded data.\n %s samples with %s features.'
              % (self.n_samples_, self.n_features_))

    self.lambdas = _alpha_grid(X=X, y=y, l1_ratio=1.0,
                               fit_intercept=self.fit_intercept,
                               eps=1e-3, n_alphas=self.n_lambdas,
                               normalize=self.normalize)

    # sweep over the grid of regularization strengths
    estimates_selection, _ = \
        self.lasso_sweep(
            X, y, self.lambdas, self.train_frac_sel, self.n_boots_sel,
            self.use_admm, desc='fine lasso sweep', verbose=verbose
        )

    # perform the intersection step
    self.intersection(estimates_selection)

    ########################
    ### Model Estimation ###
    ########################
    # we'll use the supports obtained in the selection module to calculate
    # bagged OLS estimates over bootstraps
    if verbose:
        print('(3) Beginning model estimation, with %s bootstraps.'
              % self.n_boots_est)

    # compute the number of samples per bootstrap
    n_samples_bootstrap = int(round(self.train_frac_est * self.n_samples_))

    # set up data arrays
    estimates = np.zeros(
        (self.n_boots_est, self.n_lambdas, self.n_features_),
        dtype=np.float32)
    scores = np.zeros((self.n_boots_est, self.n_lambdas), dtype=np.float32)

    # iterate over bootstrap samples
    for bootstrap in trange(self.n_boots_est, desc='Model Estimation',
                            disable=not verbose):
        # extract the bootstrap indices, keeping a fraction of the data
        # available for testing
        train_idx, test_idx = utils.leveled_randomized_ids(
            self.groups_, self.train_frac_est)

        # iterate over the regularization parameters
        for lamb_idx, lamb in enumerate(self.lambdas):
            # extract the current support set
            support = self.supports_[lamb_idx]

            # extract response vectors
            y_train = y[train_idx]
            y_test = y[test_idx]

            # if nothing was selected, we won't bother running OLS
            if np.any(support):
                # get design matrices
                X_train = X[train_idx][:, support]
                X_test = X[test_idx][:, support]

                # compute the OLS estimate
                ols = lm.LinearRegression()
                ols.fit(X_train, y_train)

                # store the fitted coefficients
                estimates[bootstrap, lamb_idx, support] = ols.coef_

                # calculate the estimation score
                if self.estimation_score == 'r2':
                    scores[bootstrap, lamb_idx] = ols.score(X_test, y_test)
                elif self.estimation_score == 'BIC':
                    y_pred = ols.predict(X_test)
                    n_features = np.count_nonzero(support)
                    scores[bootstrap, lamb_idx] = -utils.BIC(
                        y_true=y_test, y_pred=y_pred, n_features=n_features)
                elif self.estimation_score == 'AIC':
                    y_pred = ols.predict(X_test)
                    n_features = np.count_nonzero(support)
                    scores[bootstrap, lamb_idx] = -utils.AIC(
                        y_true=y_test, y_pred=y_pred, n_features=n_features)
                elif self.estimation_score == 'AICc':
                    y_pred = ols.predict(X_test)
                    n_features = np.count_nonzero(support)
                    scores[bootstrap, lamb_idx] = -utils.AICc(
                        y_true=y_test, y_pred=y_pred, n_features=n_features)
                else:
                    raise ValueError(str(self.estimation_score) +
                                     ' is not a valid option.')
            else:
                # score a constant (all-zero) prediction on the test set
                if self.estimation_score == 'r2':
                    scores[bootstrap, lamb_idx] = r2_score(
                        y_true=y_test, y_pred=np.zeros(y_test.size))
                elif self.estimation_score == 'BIC':
                    n_features = 0
                    scores[bootstrap, lamb_idx] = -utils.BIC(
                        y_true=y_test,
                        y_pred=np.zeros(y_test.size),
                        n_features=n_features)
                elif self.estimation_score == 'AIC':
                    n_features = 0
                    scores[bootstrap, lamb_idx] = -utils.AIC(
                        y_true=y_test,
                        y_pred=np.zeros(y_test.size),
                        n_features=n_features)
                elif self.estimation_score == 'AICc':
                    n_features = 0
                    scores[bootstrap, lamb_idx] = -utils.AICc(
                        y_true=y_test,
                        y_pred=np.zeros(y_test.size),
                        n_features=n_features)
                else:
                    raise ValueError(str(self.estimation_score) +
                                     ' is not a valid option.')

    if verbose:
        print('(4) Bagging estimates, using bagging option %s.'
              % self.bagging_options)

    if self.bagging_options == 1:
        # bagging option 1: for each bootstrap sample, find the
        # regularization parameter that gave the best results
        self.lambda_max_idx = np.argmax(scores, axis=1)
        # extract the estimates over bootstraps from the model with the
        # best lambda
        best_estimates = estimates[np.arange(self.n_boots_est),
                                   self.lambda_max_idx, :]
        # take the median across estimates for the final, bagged estimate
        self.coef_ = np.median(best_estimates, axis=0)
    elif self.bagging_options == 2:
        # bagging option 2: average estimates across bootstraps, and then
        # find the regularization parameter that gives the best results
        mean_scores = np.mean(scores, axis=0)
        self.lambda_max_idx = np.argmax(mean_scores)
        self.coef_ = np.median(estimates[:, self.lambda_max_idx, :], 0)
    else:
        raise ValueError('Bagging option %d is not available.'
                         % self.bagging_options)

    if verbose:
        print("---> UoI Lasso complete.")

    self._set_intercept(X_offset, y_offset, X_scale)

    return self
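# The two bagging options above reduce the (n_boots_est, n_lambdas,
# n_features) estimates array in different orders. A minimal NumPy sketch of
# the same reductions on toy data; shapes and names are illustrative only.
import numpy as np

_demo_rng = np.random.RandomState(0)
estimates_demo = _demo_rng.rand(4, 3, 5)   # (n_boots_est, n_lambdas, n_features)
scores_demo = _demo_rng.rand(4, 3)         # (n_boots_est, n_lambdas)

# option 1: best lambda per bootstrap, then median across bootstraps
best_per_boot = np.argmax(scores_demo, axis=1)
coef_opt1 = np.median(estimates_demo[np.arange(4), best_per_boot, :], axis=0)

# option 2: lambda with the best mean score, then median across bootstraps
best_overall = np.argmax(scores_demo.mean(axis=0))
coef_opt2 = np.median(estimates_demo[:, best_overall, :], axis=0)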
def test_deprecation_center_data():
    n_samples = 200
    n_features = 2

    w = 1.0 + rng.rand(n_samples)
    X = rng.rand(n_samples, n_features)
    y = rng.rand(n_samples)

    # materialize the grid so it can be iterated over more than once
    param_grid = list(product([True, False], [True, False], [True, False],
                              [None, w]))

    for (fit_intercept, normalize, copy, sample_weight) in param_grid:

        XX = X.copy()  # such that we can try copy=False as well

        X1, y1, X1_mean, X1_var, y1_mean = \
            center_data(XX, y, fit_intercept=fit_intercept,
                        normalize=normalize, copy=copy,
                        sample_weight=sample_weight)

        XX = X.copy()

        X2, y2, X2_mean, X2_var, y2_mean = \
            _preprocess_data(XX, y, fit_intercept=fit_intercept,
                             normalize=normalize, copy=copy,
                             sample_weight=sample_weight)

        assert_array_almost_equal(X1, X2)
        assert_array_almost_equal(y1, y2)
        assert_array_almost_equal(X1_mean, X2_mean)
        assert_array_almost_equal(X1_var, X2_var)
        assert_array_almost_equal(y1_mean, y2_mean)

    # Sparse cases
    X = sparse.csr_matrix(X)

    for (fit_intercept, normalize, copy, sample_weight) in param_grid:

        X1, y1, X1_mean, X1_var, y1_mean = \
            center_data(X, y, fit_intercept=fit_intercept,
                        normalize=normalize, copy=copy,
                        sample_weight=sample_weight)

        X2, y2, X2_mean, X2_var, y2_mean = \
            _preprocess_data(X, y, fit_intercept=fit_intercept,
                             normalize=normalize, copy=copy,
                             sample_weight=sample_weight, return_mean=False)

        assert_array_almost_equal(X1.toarray(), X2.toarray())
        assert_array_almost_equal(y1, y2)
        assert_array_almost_equal(X1_mean, X2_mean)
        assert_array_almost_equal(X1_var, X2_var)
        assert_array_almost_equal(y1_mean, y2_mean)

    for (fit_intercept, normalize) in product([True, False], [True, False]):

        X1, y1, X1_mean, X1_var, y1_mean = \
            sparse_center_data(X, y, fit_intercept=fit_intercept,
                               normalize=normalize)

        X2, y2, X2_mean, X2_var, y2_mean = \
            _preprocess_data(X, y, fit_intercept=fit_intercept,
                             normalize=normalize, return_mean=True)

        assert_array_almost_equal(X1.toarray(), X2.toarray())
        assert_array_almost_equal(y1, y2)
        assert_array_almost_equal(X1_mean, X2_mean)
        assert_array_almost_equal(X1_var, X2_var)
        assert_array_almost_equal(y1_mean, y2_mean)
def preprocess_data(self, X, y):
    return _preprocess_data(X, y, fit_intercept=self.fit_intercept,
                            normalize=self.normalize, copy=self.copy_X)
def fit(self, T, y):
    """Fit a sparse non-negative solution of the linear system T w = y.

    Ref: Sparse Non-Negative Solution of a Linear System of Equations is
    Unique.

    T: (N x L) design matrix (dictionary).
    y: (N x 1) target vector.
    max_iter: the maximum number of iterations. If
        requested_intermediate_solutions_sizes is None, return the
        max_iter-sparse solution.
    requested_intermediate_solutions_sizes: a list of the sizes of the other
        intermediate solutions to return in addition to the max_iter one
        (they are returned in a list with the same indexes).

    Returns the list of intermediate solutions. If the perfect solution is
    found before the end, the list may not be full.
    """
    # center the data the same way sklearn's linear models do
    T, y, T_offset, y_offset, T_scale = _preprocess_data(
        T, y, fit_intercept=True, normalize=False,
        copy=False, return_mean=True, check_input=True)

    iter_intermediate_solutions_sizes = iter(
        self.requested_intermediate_solutions_sizes)

    lst_intermediate_solutions = []
    bool_arr_selected_indexes = np.zeros(T.shape[1], dtype=bool)
    residual = y
    i = 0
    next_solution = next(iter_intermediate_solutions_sizes, None)
    while i < self.max_iter and next_solution is not None \
            and not np.isclose(np.linalg.norm(residual), 0):
        # if logger is not None: logger.debug("iter {}".format(i))
        # compute all correlations between atoms and the residual
        dot_products = T.T @ residual

        idx_max_dot_product = np.argmax(dot_products)
        # only positively correlated atoms can be taken
        if dot_products[idx_max_dot_product] <= 0:
            self._logger.warning("No other atom is positively correlated "
                                 "with the residual. Ending prematurely "
                                 "with {} atoms.".format(i + 1))
            break

        # select the atom with maximum correlation with the residual
        bool_arr_selected_indexes[idx_max_dot_product] = True

        tmp_T = T[:, bool_arr_selected_indexes]
        sol = nnls(tmp_T, y)[0]  # non-negative least squares
        residual = y - tmp_T @ sol

        int_used_atoms = np.sum(sol.astype(bool))
        if int_used_atoms != i + 1:
            self._logger.warning("Atom found but not used. {} < {}"
                                 .format(int_used_atoms, i + 1))

        if i + 1 == next_solution:
            final_vec = np.zeros(T.shape[1])
            # the solution is zero everywhere except on the selected indices
            final_vec[bool_arr_selected_indexes] = sol
            lst_intermediate_solutions.append(final_vec)
            next_solution = next(iter_intermediate_solutions_sizes, None)

        i += 1

    if len(lst_intermediate_solutions) == 0 and \
            np.isclose(np.linalg.norm(residual), 0):
        final_vec = np.zeros(T.shape[1])
        # the solution is zero everywhere except on the selected indices
        final_vec[bool_arr_selected_indexes] = sol
        lst_intermediate_solutions.append(final_vec)

    nb_missing_solutions = \
        len(self.requested_intermediate_solutions_sizes) - \
        len(lst_intermediate_solutions)

    if nb_missing_solutions > 0:
        if self.fill_with_final_solution:
            self._logger.warning(
                "nn_omp ended prematurely and found fewer solutions than "
                "expected: expected {}, found {}. Filling with the last "
                "solution.".format(
                    len(self.requested_intermediate_solutions_sizes),
                    len(lst_intermediate_solutions)))
            lst_intermediate_solutions.extend(
                [deepcopy(lst_intermediate_solutions[-1])
                 for _ in range(nb_missing_solutions)])
        else:
            self._logger.warning(
                "nn_omp ended prematurely and found fewer solutions than "
                "expected: expected {}, found {}.".format(
                    len(self.requested_intermediate_solutions_sizes),
                    len(lst_intermediate_solutions)))

    self.lst_intermediate_solutions = lst_intermediate_solutions
    self._set_intercept(T_offset, y_offset, T_scale)
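# A minimal, self-contained sketch of the greedy step used in the fit above:
# pick the atom most positively correlated with the residual, then re-solve a
# non-negative least squares problem on the selected atoms. Shapes and names
# are illustrative only.
import numpy as np
from scipy.optimize import nnls

_demo_rng = np.random.RandomState(0)
T_demo = np.abs(_demo_rng.rand(100, 20))          # non-negative dictionary
w_true_demo = np.zeros(20)
w_true_demo[[2, 7, 11]] = [1.0, 0.5, 2.0]         # sparse non-negative target
y_demo = T_demo @ w_true_demo

selected_demo = np.zeros(T_demo.shape[1], dtype=bool)
residual_demo = y_demo
for _ in range(3):
    # atom most positively correlated with the current residual
    selected_demo[np.argmax(T_demo.T @ residual_demo)] = True
    sol_demo, _ = nnls(T_demo[:, selected_demo], y_demo)
    residual_demo = y_demo - T_demo[:, selected_demo] @ sol_demo

print(np.flatnonzero(selected_demo))              # indices of the selected atoms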