def fit(self, x, y, sample_weight=None):
    x, y = check_X_y(x, y, accept_sparse=[], y_numeric=True,
                     multi_output=False)
    x, y, X_offset, y_offset, X_scale = self._preprocess_data(
        x, y, fit_intercept=self.fit_intercept, normalize=self.normalize,
        copy=self.copy_X, sample_weight=sample_weight)
    if sample_weight is not None:
        x, y = _rescale_data(x, y, sample_weight)
    self.coef_ = sparse_group_lasso(x, y, self.alpha, self.rho, self.groups,
                                    max_iter=self.max_iter, rtol=self.tol)
    self._set_intercept(X_offset, y_offset, X_scale)
    return self
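# For reference, solvers with the signature sparse_group_lasso(X, y, alpha,
# rho, groups, ...) conventionally minimize the sparse group lasso objective
# (an assumption here, since the solver itself is not shown):
#
#     0.5 * ||y - X w||_2^2
#     + alpha * rho * ||w||_1
#     + alpha * (1 - rho) * sum_g ||w_g||_2
#
# where the groups g partition the features, rho mixes the lasso and group
# lasso penalties, and w_g is the coefficient sub-vector of group g.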
def fit(self, x_, y, sample_weight=None):
    n_samples, n_features = x_.shape
    X, y = check_X_y(x_, y, accept_sparse=[], y_numeric=True,
                     multi_output=False)
    x, y, X_offset, y_offset, X_scale = self._preprocess_data(
        x_, y, fit_intercept=self.fit_intercept, normalize=self.normalize,
        copy=self.copy_X, sample_weight=None)
    if sample_weight is not None:
        # Sample weight can be implemented via a simple rescaling.
        x, y = _rescale_data(x, y, sample_weight)
    coefs, intercept = fit_with_noise(x, y, self.sigma, self.alpha, self.n)
    self.intercept_ = intercept
    self.coef_ = coefs
    self._set_intercept(X_offset, y_offset, X_scale)
    return self
def nonnegative_regression(X, y, sample_weight=None):
    r"""Solve the nonnegative least squares estimate regression problem.

    Solves :math:`\underset{x}{\text{argmin}} \| Ax - b \|_2^2` subject to
    :math:`x \geq 0` using `scipy.optimize.nnls
    <https://docs.scipy.org/doc/scipy/reference/
    generated/scipy.optimize.nnls.html>`_

    Parameters
    ----------
    X : array, shape = (n_samples, n_features)
        Training data.

    y : array, shape = (n_samples,) or (n_samples, n_targets)
        Target values.

    sample_weight : float or array-like, shape (n_samples,), optional
        (default = None)
        Individual weights for each sample.

    Returns
    -------
    coef : array, shape = (n_features,) or (n_features, n_targets)
        Weight vector(s).

    res : float
        The residual, :math:`\| Ax - y \|_2`.
    """
    # TODO accept_sparse=['csr', 'csc', 'coo']? check sopt.nnls
    # TODO order='F'?
    X = check_array(X)
    y = check_array(y, ensure_2d=False)
    check_consistent_length(X, y)

    n_samples, n_features = X.shape

    ravel = False
    if y.ndim == 1:
        y = y.reshape(-1, 1)
        ravel = True

    n_samples_, n_targets = y.shape

    if n_samples != n_samples_:
        raise ValueError("Number of samples in X and y does not correspond:"
                         " %d != %d" % (n_samples, n_samples_))

    has_sw = sample_weight is not None
    if has_sw:
        if np.atleast_1d(sample_weight).ndim > 1:
            raise ValueError("Sample weights must be 1D array or scalar")
        X, y = _rescale_data(X, y, sample_weight)

    coef, res = _solve_nnls(X, y)

    if ravel:
        # When y was passed as 1d-array, we flatten the coefficients
        coef = coef.ravel()

    return coef, res
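# A usage sketch for nonnegative_regression on synthetic data (the arrays
# and weights here are hypothetical). Because scipy.optimize.nnls enforces
# coef >= 0, any negative ground-truth weight is driven toward zero.
import numpy as np

rng = np.random.RandomState(0)
X_demo = rng.rand(50, 3)
w_true = np.array([1.5, 0.0, 2.0])
y_demo = X_demo.dot(w_true) + 0.01 * rng.randn(50)

coef, res = nonnegative_regression(X_demo, y_demo)
assert np.all(coef >= 0)  # the nonnegativity constraint always holds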
def fit(self, x_, y, sample_weight=None):
    x_, y = check_X_y(x_, y, accept_sparse=[], y_numeric=True,
                      multi_output=False)
    x, y, X_offset, y_offset, X_scale = self._preprocess_data(
        x_, y, fit_intercept=self.fit_intercept, normalize=self.normalize,
        copy=self.copy_X, sample_weight=sample_weight)
    if sample_weight is not None:
        x, y = _rescale_data(x, y, sample_weight)

    self.iters = 0
    self.ind_ = np.ones(x.shape[1], dtype=bool)  # initial guess
    if self.threshold > 0:
        self._reduce(x, y)
    else:
        self.coef_ = self._regress(x[:, self.ind_], y, self.alpha)
    if self.unbias and self.alpha >= 0:
        self._unbias(x, y)

    self._set_intercept(X_offset, y_offset, X_scale)
    if self.threshold_intercept and abs(self.intercept_) < self.threshold:
        self.intercept_ = 0
    return self
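# _reduce and _regress are not shown above. In sequentially thresholded
# least-squares schemes with this interface (a hypothetical reconstruction,
# not the actual implementation), _reduce typically alternates a regression
# on the active features with hard thresholding of small coefficients until
# the support stabilizes:
import numpy as np

def stlsq_reduce(x, y, threshold, alpha, max_iter, regress):
    """Alternate `regress(x_active, y, alpha)` with hard thresholding
    (assumed behaviour of self._reduce)."""
    ind = np.ones(x.shape[1], dtype=bool)
    coef = np.zeros(x.shape[1])
    for _ in range(max_iter):
        if not ind.any():
            break  # every feature was thresholded away
        coef[:] = 0.0
        coef[ind] = regress(x[:, ind], y, alpha)
        new_ind = np.abs(coef) >= threshold
        if np.array_equal(new_ind, ind):
            break  # support has converged
        ind = new_ind
    coef[~ind] = 0.0
    return coef, ind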
def test_rescale_data():
    n_samples = 200
    n_features = 2
    sample_weight = 1.0 + rng.rand(n_samples)
    X = rng.rand(n_samples, n_features)
    y = rng.rand(n_samples)
    rescaled_X, rescaled_y = _rescale_data(X, y, sample_weight)
    rescaled_X2 = X * np.sqrt(sample_weight)[:, np.newaxis]
    rescaled_y2 = y * np.sqrt(sample_weight)
    assert_array_almost_equal(rescaled_X, rescaled_X2)
    assert_array_almost_equal(rescaled_y, rescaled_y2)
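# The test above pins down the contract of the private helper _rescale_data:
# X and y are each multiplied row-wise by sqrt(sample_weight), so that
# ordinary least squares on the rescaled data solves the weighted problem.
# A minimal dense-only sketch (scikit-learn's helper also handles sparse X
# and scalar weights):
import numpy as np

def rescale_data_dense(X, y, sample_weight):
    """Multiply rows of X and entries of y by sqrt(sample_weight)."""
    sw = np.sqrt(np.asarray(sample_weight, dtype=float))
    X_rescaled = X * sw[:, np.newaxis]
    y_rescaled = y * sw if y.ndim == 1 else y * sw[:, np.newaxis]
    return X_rescaled, y_rescaled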
def fit_linear_nnls(self, X, y, sample_weight=None):
    if not isinstance(self.model, LinearRegression):
        raise ValueError('Model is not LinearRegression; cannot run '
                         'fit_linear_nnls')
    n_jobs_ = self.model.n_jobs
    self.model.coef_ = []
    X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'],
                     y_numeric=True, multi_output=True)

    if sample_weight is not None and np.atleast_1d(sample_weight).ndim > 1:
        raise ValueError("Sample weights must be 1D array or scalar")

    X, y, X_offset, y_offset, X_scale = self.model._preprocess_data(
        X, y, fit_intercept=self.model.fit_intercept,
        normalize=self.model.normalize, copy=self.model.copy_X,
        sample_weight=sample_weight)

    if sample_weight is not None:
        # Sample weight can be implemented via a simple rescaling.
        X, y = _rescale_data(X, y, sample_weight)

    if sp.issparse(X):
        if y.ndim < 2:
            # out = sparse_lsqr(X, y)
            out = lsq_linear(X, y, bounds=(0, np.inf))
            # lsq_linear returns an OptimizeResult: the solution is in
            # out.x and the objective value in out.cost.
            self.model.coef_ = out.x
            self.model._residues = out.cost
        else:
            # lsq_linear cannot handle y with shape (M, K); solve one
            # target at a time.
            outs = Parallel(n_jobs=n_jobs_)(
                delayed(lsq_linear)(X, y[:, j].ravel(), bounds=(0, np.inf))
                for j in range(y.shape[1]))
            self.model.coef_ = np.vstack([out.x for out in outs])
            self.model._residues = np.vstack([out.cost for out in outs])
    else:
        # out also carries cost, fun, optimality, active_mask, nit,
        # status, message and success if they are needed later.
        out = lsq_linear(X, y, bounds=(0, np.inf))
        self.model.coef_ = out.x
    self.model.coef_ = self.model.coef_.T
    if y.ndim == 1:
        self.model.coef_ = np.ravel(self.model.coef_)
    self.model._set_intercept(X_offset, y_offset, X_scale)
    return self.model
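# For reference, a minimal stand-alone call of the scipy.optimize.lsq_linear
# API used above: with bounds=(0, np.inf) it solves the same nonnegative
# least-squares problem as scipy.optimize.nnls but also accepts sparse
# matrices. The returned OptimizeResult carries the solution in .x and the
# objective value 0.5 * ||Ax - b||^2 in .cost.
import numpy as np
from scipy.optimize import lsq_linear

A = np.array([[1.0, 0.0], [1.0, 1.0], [0.0, 2.0]])
b = np.array([1.0, 2.0, 3.0])
out = lsq_linear(A, b, bounds=(0, np.inf))
print(out.x, out.cost)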
def fit(self, X, y, sample_weight=None):
    X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'],
                     y_numeric=True, multi_output=True)
    if self.copy_X:
        if sp.issparse(X):
            X = X.copy()
        else:
            X = X.copy(order='K')
    if self.normalize:
        X = normalize(X)
    if self.fit_intercept:
        X = np.column_stack((np.ones(X.shape[0]), X))
    if sample_weight is not None:
        if np.atleast_1d(sample_weight).ndim > 1:
            raise ValueError("Sample weights must be 1D array or scalar")
        X, y = _rescale_data(X, y, sample_weight)

    optim_option = self.optim_args.copy()
    optim_option['fun'] = lambda x: self._l1_loss(betas=x, X=X, y=y)
    optim_option['jac'] = lambda x: self._jac(betas=x, X=X, y=y)
    optim_option['method'] = self.optim_method
    if 'x0' not in optim_option:
        optim_option['x0'] = self._x0_ls_guess(X, y)

    self.res = minimize(**optim_option)
    if self.fit_intercept:
        self.coef_ = self.res.x[1:]
        self.intercept_ = self.res.x[0]
    else:
        self.coef_ = self.res.x
        self.intercept_ = 0.0
    return self
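# _l1_loss and _jac are not shown above. A plausible pair under the
# assumption that the class fits least absolute deviations: the loss is
# sum(|y - X b|) and its subgradient is -X.T @ sign(y - X b), which is the
# form gradient-based scipy.optimize.minimize methods expect.
import numpy as np

def l1_loss(betas, X, y):
    """Least-absolute-deviation loss (assumed form of self._l1_loss)."""
    return np.sum(np.abs(y - X.dot(betas)))

def l1_jac(betas, X, y):
    """Subgradient of the L1 loss (assumed form of self._jac)."""
    return -X.T.dot(np.sign(y - X.dot(betas)))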
def fit(self, X, y, sample_weight=None):
    """
    Fit linear model.

    Parameters
    ----------
    X : numpy array [n_samples, n_features]
        Training data
    y : numpy array of shape [n_samples]
        Target values
    sample_weight : numpy array of shape [n_samples]
        Individual weights for each sample

    Returns
    -------
    self : returns an instance of self.
    """
    X, y = check_X_y(X, y, y_numeric=True, multi_output=True)

    if (sample_weight is not None) and \
            np.atleast_1d(sample_weight).ndim > 1:
        raise ValueError("Sample weights must be 1D array or scalar")

    X, y, X_offset, y_offset, X_scale = self._preprocess_data(
        X, y, fit_intercept=self.fit_intercept, normalize=self.normalize,
        copy=self.copy_X, sample_weight=sample_weight)

    if sample_weight is not None:
        # Sample weight can be implemented via a simple rescaling.
        X, y = _rescale_data(X, y, sample_weight)

    self.coef_, self.residues_ = optimize.nnls(X, y)
    self._set_intercept(X_offset, y_offset, X_scale)
    return self
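# The _preprocess_data / _set_intercept pair above implements intercept
# fitting by centering: X and y are shifted by their (possibly weighted)
# means before the solve, and the intercept of the uncentered problem is
# recovered afterwards. A dense sketch of that recovery step, mirroring
# what LinearModel._set_intercept does when fit_intercept is True:
import numpy as np

def recover_intercept(coef, X_offset, y_offset, X_scale):
    """Undo centering/scaling and return (rescaled_coef, intercept)."""
    coef = coef / X_scale
    intercept = y_offset - np.dot(X_offset, coef.T)
    return coef, intercept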
def _fit_linear(self, X, y, sample_weight=None):
    """
    Fit linear model.

    Parameters
    ----------
    X : numpy array or sparse matrix of shape [n_samples, n_features]
        Training data

    y : numpy array of shape [n_samples, n_targets]
        Target values

    sample_weight : numpy array of shape [n_samples]
        Individual weights for each sample

        .. versionadded:: 0.17
           parameter *sample_weight* support to LinearRegression.

    Returns
    -------
    self : returns an instance of self.
    """
    n_jobs_ = self.n_jobs
    X, y = _daal_check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'],
                           y_numeric=True, multi_output=True)

    dtype = get_dtype(X)
    self.sample_weight_ = sample_weight
    self.fit_shape_good_for_daal_ = bool(
        X.shape[0] > X.shape[1] + int(self.fit_intercept))

    if self.fit_shape_good_for_daal_ and \
            not sp.issparse(X) and \
            (dtype == np.float64 or dtype == np.float32) and \
            sample_weight is None:
        logging.info("sklearn.linear_model.LinearRegression."
                     "fit: " + get_patch_message("daal"))
        res = _daal4py_fit(self, X, y)
        if res is not None:
            return res
        logging.info("sklearn.linear_model.LinearRegression."
                     "fit: " + get_patch_message("sklearn_after_daal"))
    else:
        logging.info("sklearn.linear_model.LinearRegression."
                     "fit: " + get_patch_message("sklearn"))

    if sample_weight is not None and np.atleast_1d(sample_weight).ndim > 1:
        raise ValueError("Sample weights must be 1D array or scalar")

    X, y, X_offset, y_offset, X_scale = self._preprocess_data(
        X, y, fit_intercept=self.fit_intercept, normalize=self.normalize,
        copy=self.copy_X, sample_weight=sample_weight)

    if sample_weight is not None:
        # Sample weight can be implemented via a simple rescaling.
        X, y = _rescale_data(X, y, sample_weight)

    if sp.issparse(X):
        if y.ndim < 2:
            out = sparse_lsqr(X, y)
            self.coef_ = out[0]
            self._residues = out[3]
        else:
            # sparse_lsqr cannot handle y with shape (M, K); solve one
            # target at a time.
            outs = Parallel(n_jobs=n_jobs_)(
                delayed(sparse_lsqr)(X, y[:, j].ravel())
                for j in range(y.shape[1]))
            self.coef_ = np.vstack([out[0] for out in outs])
            self._residues = np.vstack([out[3] for out in outs])
    else:
        self.coef_, self._residues, self.rank_, self.singular_ = \
            linalg.lstsq(X, y)
        self.coef_ = self.coef_.T

    if y.ndim == 1:
        self.coef_ = np.ravel(self.coef_)
    self._set_intercept(X_offset, y_offset, X_scale)
    return self
def nonnegative_ridge_regression(X, y, alpha, sample_weight=None,
                                 solver='SLSQP', **solver_kwargs):
    r"""Solve the nonnegative least squares estimate ridge regression problem.

    Solves

    .. math::
        \underset{x}{\text{argmin}} \| Ax - b \|_2^2 + \alpha^2 \| x \|_2^2
        \quad \text{s.t.} \quad x \geq 0

    We can write this as the quadratic programming (QP) problem:

    .. math::
        \underset{x}{\text{argmin}} \; x^TQx + c^Tx
        \quad \text{s.t.} \quad x \geq 0

    where

    .. math::
        Q = A^TA + \alpha I \quad \text{and} \quad c = -2A^Ty

    Parameters
    ----------
    X : array, shape = (n_samples, n_features)
        Training data.

    y : array, shape = (n_samples,) or (n_samples, n_targets)
        Target values.

    alpha : float or array with shape = (n_features,)
        Regularization strength; must be a positive float. Improves the
        conditioning of the problem and reduces the variance of the
        estimates. Larger values specify stronger regularization.

    sample_weight : float or array-like, shape (n_samples,), optional
        (default = None)
        Individual weights for each sample.

    solver : string, optional (default = 'SLSQP')
        Solver with which to solve the QP. Must be one that supports bounds
        (i.e. 'L-BFGS-B', 'TNC', 'SLSQP').

    **solver_kwargs
        See `scipy.optimize.minimize <https://docs.scipy.org/doc/scipy/
        reference/generated/scipy.optimize.minimize.html>`_ for valid
        keyword arguments

    Returns
    -------
    coef : array, shape = (n_features,) or (n_features, n_targets)
        Weight vector(s).

    res : float
        The residual, :math:`\| Qx - c \|_2`

    Notes
    -----
    - This is an experimental function.
    - If one wishes to perform Lasso or Elastic-Net regression, see
      `sklearn.linear_model.lasso_path <http://scikit-learn.org/stable/
      modules/generated/sklearn.linear_model.lasso_path.html>`_ or
      `sklearn.linear_model.enet_path <http://scikit-learn.org/stable/
      modules/generated/sklearn.linear_model.enet_path.html>`_, and pass
      the parameters `fit_intercept=False, positive=True`

    See Also
    --------
    nonnegative_regression
    """
    if solver not in ('L-BFGS-B', 'TNC', 'SLSQP'):
        raise ValueError('solver must be one of L-BFGS-B, TNC, SLSQP, '
                         'not %s' % solver)

    # TODO accept_sparse=['csr', 'csc', 'coo']? check sopt.nnls
    # TODO order='F'?
    X = check_array(X)
    y = check_array(y, ensure_2d=False)
    check_consistent_length(X, y)

    n_samples, n_features = X.shape

    ravel = False
    if y.ndim == 1:
        y = y.reshape(-1, 1)
        ravel = True

    n_samples_, n_targets = y.shape

    if n_samples != n_samples_:
        raise ValueError("Number of samples in X and y does not correspond:"
                         " %d != %d" % (n_samples, n_samples_))

    has_sw = sample_weight is not None
    if has_sw:
        if np.atleast_1d(sample_weight).ndim > 1:
            raise ValueError("Sample weights must be 1D array or scalar")
        X, y = _rescale_data(X, y, sample_weight)

    # there should be either 1 or n_features penalties
    alpha = np.asarray(alpha, dtype=X.dtype).ravel()
    if alpha.size not in [1, n_features]:
        raise ValueError("Number of features and number of L2 penalties "
                         "do not correspond: %d != %d"
                         % (n_features, alpha.size))

    # NOTE: different from sklearn.linear_model.ridge
    if alpha.size == 1 and n_features > 1:
        alpha = np.repeat(alpha, n_features)

    coef, res = _solve_ridge_nnls(X, y, alpha, solver, **solver_kwargs)

    if ravel:
        # When y was passed as 1d-array, we flatten the coefficients
        coef = coef.ravel()

    return coef, res
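# The same constrained objective can be solved without a general QP solver
# by augmenting the least-squares system: for scalar alpha,
# ||Ax - y||_2^2 + alpha * ||x||_2^2 == ||[A; sqrt(alpha) I] x - [y; 0]||_2^2,
# so scipy.optimize.nnls on the stacked system yields the nonnegative ridge
# solution. A minimal single-target sketch (not the module's
# _solve_ridge_nnls):
import numpy as np
from scipy import optimize

def nonnegative_ridge_via_nnls(X, y, alpha):
    """Nonnegative ridge for 1-D y via the augmented-system trick."""
    n_features = X.shape[1]
    X_aug = np.vstack([X, np.sqrt(alpha) * np.eye(n_features)])
    y_aug = np.concatenate([y, np.zeros(n_features)])
    coef, res = optimize.nnls(X_aug, y_aug)
    return coef, res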
def fit(self, X, y, sample_weight=None):
    """Fit Ridge regression model

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape = [n_samples, n_features]
        Training data

    y : array-like, shape = [n_samples] or [n_samples, n_targets]
        Target values

    sample_weight : float or array-like of shape [n_samples]
        Sample weight

    Returns
    -------
    self : Returns self.
    """
    X, y = check_X_y(X, y, ['csr', 'csc', 'coo'], dtype=np.float64,
                     multi_output=True, y_numeric=True)
    n_samples, n_features = X.shape

    if hasattr(LinearModel, '_preprocess_data'):
        # Scikit-learn 0.18 and up
        X, y, X_offset, y_offset, X_scale = LinearModel._preprocess_data(
            X, y, self.fit_intercept, self.normalize, self.copy_X,
            sample_weight=sample_weight)
    else:
        X, y, X_offset, y_offset, X_scale = LinearModel._center_data(
            X, y, self.fit_intercept, self.normalize, self.copy_X,
            sample_weight=sample_weight)

    gcv_mode = self.gcv_mode
    with_sw = len(np.shape(sample_weight))

    if gcv_mode is None or gcv_mode == 'auto':
        if sparse.issparse(X) or n_features > n_samples or with_sw:
            gcv_mode = 'eigen'
        else:
            gcv_mode = 'svd'
    elif gcv_mode == "svd" and with_sw:
        # FIXME non-uniform sample weights not yet supported
        warnings.warn("non-uniform sample weights unsupported for svd, "
                      "forcing usage of eigen")
        gcv_mode = 'eigen'

    if gcv_mode == 'eigen':
        _pre_compute = self._pre_compute
        _errors = self._errors
        _values = self._values
    elif gcv_mode == 'svd':
        # assert n_samples >= n_features
        _pre_compute = self._pre_compute_svd
        _errors = self._errors_svd
        _values = self._values_svd
    else:
        raise ValueError('bad gcv_mode "%s"' % gcv_mode)

    if sample_weight is not None:
        X, y = _rescale_data(X, y, sample_weight)

    # Ensure that y is a 2D array: n_samples x n_targets
    flat_y = y.ndim == 1
    if flat_y:
        y = np.atleast_2d(y).T
    n_targets = y.shape[1]

    centered_kernel = not sparse.issparse(X) and self.fit_intercept

    v, Q, QT_y = _pre_compute(X, y, centered_kernel)
    cv_values = np.zeros((n_samples, n_targets, len(self.alphas)))
    C = []

    scorer = check_scoring(self, scoring=self.scoring, allow_none=True)
    error = scorer is None

    for i, alpha in enumerate(self.alphas):
        if error:
            out, c = _errors(alpha, y, v, Q, QT_y)
        else:
            out, c = _values(alpha, y, v, Q, QT_y)
        cv_values[:, :, i] = out
        C.append(c)

    if self.store_cv_values:
        self.cv_values_ = cv_values

    if error:
        if self.alpha_per_target:
            # Find the best alpha for each target
            best = cv_values.mean(axis=0).argmin(axis=1)
        else:
            # Find the best alpha overall
            best = np.mean(cv_values.reshape(-1, len(self.alphas)),
                           axis=0).argmin()
    else:
        # The scorer wants an object that will make the predictions, but
        # they are already computed efficiently by RidgeGCV. This
        # identity_estimator will just return them
        def identity_estimator():
            pass

        identity_estimator.decision_function = lambda y_predict: y_predict
        identity_estimator.predict = lambda y_predict: y_predict

        # scorer signature is (estimator, X, y_true): the precomputed
        # predictions are passed as X and echoed back by predict().
        if self.alpha_per_target:
            out = [
                scorer(identity_estimator, cv_values[:, i, j], target)
                for j in range(len(self.alphas))
                for i, target in enumerate(y.T)
            ]
            # out is ordered alpha-major; reshape to (n_alphas, n_targets)
            # and pick the best alpha per target.
            best = np.argmax(np.reshape(out, (len(self.alphas), -1)),
                             axis=0)
        else:
            out = [
                scorer(identity_estimator, cv_values[:, :, i].ravel(),
                       y.ravel())
                for i in range(len(self.alphas))
            ]
            best = np.argmax(out)

    self.alpha_ = self.alphas[best]

    if self.alpha_per_target:
        self.dual_coef_ = np.vstack(
            [C[j][:, i] for i, j in enumerate(best)]).T
    else:
        self.dual_coef_ = C[best]
    self.coef_ = safe_sparse_dot(self.dual_coef_.T, X)

    # If the original y was flat, remove some dimensions to match
    if flat_y:
        if self.store_cv_values:
            self.cv_values_ = cv_values.reshape(n_samples,
                                                len(self.alphas))
        self.coef_ = self.coef_.ravel()

    self._set_intercept(X_offset, y_offset, X_scale)

    return self
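# Why the identity_estimator trick works: a scikit-learn scorer has the
# signature scorer(estimator, X, y_true) and calls estimator.predict(X).
# Here the leave-one-out predictions are already stored in cv_values, so an
# object whose predict() echoes its input lets any scorer consume them.
# A self-contained illustration with r2_score standing in for the metric:
import numpy as np
from sklearn.metrics import r2_score

def identity_estimator():
    pass

identity_estimator.predict = lambda y_predict: y_predict

y_true = np.array([1.0, 2.0, 3.0])
y_loo_pred = np.array([1.1, 1.9, 3.2])  # precomputed predictions
score = r2_score(y_true, identity_estimator.predict(y_loo_pred))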
def fit(self, X, y, seed=None, verbose=False, sample_weight=None):
    """Fit data according to the UoI-Lasso algorithm.

    Relevant information (fits, residuals, model performance) is stored
    within the object. Thus, nothing is returned by this function.

    Parameters
    ----------
    X : np array (2d)
        the design matrix, containing the predictors.
        its shape is assumed to be (number of samples, number of features).

    y : np array (1d)
        the vector of dependent variables.
        its length is assumed to be (number of samples,).

    seed : int
        a seed for the random number generator. this number is relevant
        for choosing the bootstraps and dividing the data into training
        and test sets.

    verbose : boolean
        a boolean switch indicating whether the fitting should print out
        its progress.
    """
    # initialize the seed, if it's provided
    if seed is not None:
        np.random.seed(seed)

    # start taken from sklearn.LinearModels.base.LinearRegression
    X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'],
                     y_numeric=True, multi_output=True)

    # preprocess data through centering and normalization
    X, y, X_offset, y_offset, X_scale = _preprocess_data(
        X, y, fit_intercept=self.fit_intercept, normalize=self.normalize,
        copy=self.copy_X)

    if sample_weight is not None and np.atleast_1d(sample_weight).ndim > 1:
        raise ValueError("Sample weights must be 1D array or scalar")

    if sample_weight is not None:
        # Sample weight can be implemented via a simple rescaling.
        X, y = _rescale_data(X, y, sample_weight)

    # extract model dimensions from design matrix
    self.n_samples_, self.n_features_ = X.shape

    if verbose:
        print('(1) Loaded data.\n %s samples with %s features.'
              % (self.n_samples_, self.n_features_))

    # perform an initial coarse sweep over the lambda parameters;
    # this is to zero in on the relevant regularization region.
    if self.n_lambdas == 1:
        lambda_coarse = np.array([1.0])
    else:
        lambda_coarse = np.logspace(-3., 3., self.n_lambdas,
                                    dtype=np.float64)

    # run the coarse lasso sweep
    estimates_coarse, scores_coarse = \
        self.lasso_sweep(
            X, y, lambda_coarse, self.train_frac_sel, self.n_boots_coarse,
            self.use_admm, desc='coarse lasso sweep', verbose=verbose
        )

    # deduce the index that maximizes the explained variance over bootstraps
    lambda_max_idx = np.argmax(np.mean(scores_coarse, axis=0))
    # obtain the lambda that maximizes the explained variance over bootstraps
    lambda_max = lambda_coarse[lambda_max_idx]
    # in our dense sweep, we'll explore lambda values that span a range one
    # order of magnitude smaller than lambda_max itself
    d_lambda = 10**(np.floor(np.log10(lambda_max)) - 1)

    # now that we've narrowed down the regularization parameters,
    # we'll run a dense sweep, which begins the model selection module of UoI

    #######################
    ### Model Selection ###
    #######################
    if verbose:
        print('(2) Beginning model selection. Exploring penalty region '
              'centered at %s.' % lambda_max)

    # create the final lambda set based on the coarse sweep
    if self.n_lambdas == 1:
        lambdas = np.array([lambda_max])
    else:
        lambdas = np.linspace(lambda_max - 5 * d_lambda,
                              lambda_max + 5 * d_lambda,
                              self.n_lambdas, dtype=np.float64)

    # run the lasso sweep with the new lambda set
    estimates_dense, scores_dense = \
        self.lasso_sweep(
            X, y, lambdas, self.train_frac_sel, self.n_boots_sel,
            self.use_admm, desc='fine lasso sweep', verbose=verbose
        )

    # choose selection fraction threshold values to use
    selection_frac_thresholds = np.linspace(self.selection_thres_min,
                                            self.selection_thres_max,
                                            self.n_selection_thres)
    # calculate the actual number of thresholds, but delete any repetitions
    selection_thresholds = np.sort(np.unique(
        (self.n_boots_sel * selection_frac_thresholds).astype('int')))

    # create support matrix
    self.supports_ = np.zeros(
        (self.n_selection_thres, self.n_lambdas, self.n_features_),
        dtype=bool)

    # iterate over each stability selection threshold
    for thres_idx, threshold in enumerate(selection_thresholds):
        # calculate the support given the specific selection threshold
        self.supports_[thres_idx, :] = np.count_nonzero(
            estimates_dense, axis=0) >= threshold

    # reshape the support matrix so that the first axis consists of all
    # combinations of hyperparameters
    self.supports_ = np.reshape(
        self.supports_,
        (self.n_selection_thres * self.n_lambdas, self.n_features_))

    ########################
    ### Model Estimation ###
    ########################
    # we'll use the supports obtained in the selection module to calculate
    # bagged OLS estimates over bootstraps
    if verbose:
        print('(3) Model selection complete. Beginning model estimation, '
              'with %s bootstraps.' % self.n_boots_est)

    # create or overwrite arrays to collect final results
    self.coef_ = np.zeros(self.n_features_, dtype=np.float32)
    self.scores_ = np.zeros(1, dtype=np.float32)
    # determine how many samples will be used for overall training
    train_split = int(round(self.train_frac_overall * self.n_samples_))
    # determine how many samples will be used for training within a bootstrap
    boot_train_split = int(round(self.train_frac_est * train_split))

    # set up data arrays
    estimates = np.zeros(
        (self.n_boots_est, self.n_lambdas, self.n_features_),
        dtype=np.float32)
    scores = np.zeros((self.n_boots_est, self.n_lambdas), dtype=np.float32)

    # either we plan on using a test set, or we'll use the entire dataset
    # for training
    if self.train_frac_overall < 1:
        # generate indices for the global training and testing blocks
        indices = np.random.permutation(self.n_samples_)
        train, test = np.split(indices, [train_split])
        # compile the training and test sets
        X_train = X[train]
        y_train = y[train]
        X_test = X[test]
        y_test = y[test]
    else:
        X_train = X
        y_train = y

    # iterate over bootstrap samples
    for bootstrap in trange(self.n_boots_est, desc='Model Estimation',
                            disable=not verbose):
        # extract the bootstrap indices, keeping a fraction of the data
        # available for testing
        bootstrap_indices = np.random.permutation(train_split)
        train_boot, test_boot = np.split(bootstrap_indices,
                                         [boot_train_split])
        # iterate over the regularization parameters
        for lamb_idx, lamb in enumerate(lambdas):
            support = self.supports_[lamb_idx]
            if np.any(support):
                # fit OLS using the supports from the selection module
                X_boot = X_train[train_boot]
                y_boot = y_train[train_boot]
                ols = lm.LinearRegression()
                ols.fit(X_boot[:, support], y_boot - y_boot.mean())
                # store the fitted coefficients
                estimates[bootstrap, lamb_idx, support] = ols.coef_
                # calculate and store the performance on the test set
                y_hat_boot = np.dot(X_train[test_boot],
                                    estimates[bootstrap, lamb_idx, :])
                y_true_boot = y_train[test_boot] - y_train[test_boot].mean()
                # calculate the sum of squared residuals
                rss = np.sum((y_hat_boot - y_true_boot)**2)
                # calculate the estimation score
                if self.estimation_score == 'r2':
                    scores[bootstrap, lamb_idx] = r2_score(y_true_boot,
                                                           y_hat_boot)
                elif self.estimation_score == 'BIC':
                    n_selected_features = np.count_nonzero(support)
                    scores[bootstrap, lamb_idx] = -utils.BIC(
                        n_features=n_selected_features,
                        n_samples=boot_train_split,
                        rss=rss)
            else:
                # if no variables were selected, print a message;
                # we'll leave the scores array unchanged, so any support
                # with no selection will be assigned a score of 0.
                print('No variables selected in the support for '
                      'lambda = %s.' % lamb)

    if verbose:
        print('(4) Bagging estimates, using bagging option %s.'
              % self.bagging_options)

    if self.bagging_options == 1:
        # bagging option 1: for each bootstrap sample, find the
        # regularization parameter that gave the best results
        lambda_max_idx = np.argmax(scores, axis=1)
        # extract the estimates over bootstraps from the model with the
        # best lambda
        best_estimates = estimates[np.arange(self.n_boots_est),
                                   lambda_max_idx, :]
        # take the median across estimates for the final, bagged estimate
        self.coef_ = np.median(best_estimates, axis=0)
    elif self.bagging_options == 2:
        # bagging option 2: average the estimates across bootstraps, and
        # then find the regularization parameter that gives the best results
        mean_scores = np.mean(scores, axis=0)
        lambda_max_idx = np.argmax(mean_scores)
        self.coef_ = np.median(estimates[:, lambda_max_idx, :], 0)
    else:
        raise ValueError('Bagging option %d is not available.'
                         % self.bagging_options)

    # if we extracted a test set, evaluate the model
    if self.train_frac_overall < 1:
        # finally, see how the bagged estimates perform on the test set
        y_hat = np.dot(X_test, self.coef_)
        y_true = y_test - y_test.mean()
        # calculate and store the performance of the final UoI_Lasso
        # estimator on the test set
        self.scores_ = r2_score(y_true, y_hat)
    else:
        self.scores_ = None

    if verbose:
        print("---> UoI Lasso complete.")

    if y.ndim == 1:
        self.coef_ = np.ravel(self.coef_)
    self._set_intercept(X_offset, y_offset, X_scale)

    return self
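# A sketch of the BIC helper assumed by the scoring branch above (utils.BIC
# itself is not shown). For Gaussian residuals, up to an additive constant,
#     BIC = n_samples * log(rss / n_samples) + n_features * log(n_samples),
# and lower is better, which is why the code negates it to get a score.
import numpy as np

def bic_from_rss(n_features, n_samples, rss):
    """Bayesian information criterion from a residual sum of squares
    (assumed form of utils.BIC)."""
    return (n_samples * np.log(rss / n_samples)
            + n_features * np.log(n_samples))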
def fit(self, X, y, groups=None, seed=None, verbose=False,
        sample_weight=None, option=True):
    """Fit data according to the UoI-Lasso algorithm.

    Relevant information (fits, residuals, model performance) is stored
    within the object. Thus, nothing is returned by this function.

    Parameters
    ----------
    X : np array (2d)
        the design matrix, containing the predictors.
        its shape is assumed to be (number of samples, number of features).

    y : np array (1d)
        the vector of dependent variables.
        its length is assumed to be (number of samples,).

    groups : array-like, optional
        group labels for the samples, used for leveled train/test splits.

    seed : int
        a seed for the random number generator. this number is relevant
        for choosing the bootstraps and dividing the data into training
        and test sets.

    verbose : boolean
        a boolean switch indicating whether the fitting should print out
        its progress.
    """
    # initialize the seed, if it's provided
    if seed is not None:
        np.random.seed(seed)

    X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'],
                     y_numeric=True, multi_output=True)

    # preprocess data through centering and normalization
    X, y, X_offset, y_offset, X_scale = _preprocess_data(
        X, y, fit_intercept=self.fit_intercept, normalize=self.normalize,
        copy=self.copy_X)

    if sample_weight is not None and np.atleast_1d(sample_weight).ndim > 1:
        raise ValueError("Sample weights must be 1D array or scalar")

    if sample_weight is not None:
        # Sample weight can be implemented via a simple rescaling.
        X, y = _rescale_data(X, y, sample_weight)

    # extract model dimensions from design matrix
    self.n_samples_, self.n_features_ = X.shape
    # create or overwrite arrays to collect final results
    self.coef_ = np.zeros(self.n_features_, dtype=np.float32)

    # group leveling
    if groups is None:
        self.groups_ = np.ones(self.n_samples_)
    else:
        self.groups_ = np.array(groups)

    if verbose:
        print('(1) Loaded data.\n %s samples with %s features.'
              % (self.n_samples_, self.n_features_))

    self.lambdas = _alpha_grid(X=X, y=y, l1_ratio=1.0,
                               fit_intercept=self.fit_intercept,
                               eps=1e-3, n_alphas=self.n_lambdas,
                               normalize=self.normalize)

    # sweep over the grid of regularization strengths
    estimates_selection, _ = \
        self.lasso_sweep(
            X, y, self.lambdas, self.train_frac_sel, self.n_boots_sel,
            self.use_admm, desc='fine lasso sweep', verbose=verbose
        )

    # perform the intersection step
    self.intersection(estimates_selection)

    ########################
    ### Model Estimation ###
    ########################
    # we'll use the supports obtained in the selection module to calculate
    # bagged OLS estimates over bootstraps
    if verbose:
        print('(3) Beginning model estimation, with %s bootstraps.'
              % self.n_boots_est)

    # compute the number of samples per bootstrap
    n_samples_bootstrap = int(round(self.train_frac_est * self.n_samples_))

    # set up data arrays
    estimates = np.zeros(
        (self.n_boots_est, self.n_lambdas, self.n_features_),
        dtype=np.float32)
    scores = np.zeros((self.n_boots_est, self.n_lambdas), dtype=np.float32)

    # iterate over bootstrap samples
    for bootstrap in trange(self.n_boots_est, desc='Model Estimation',
                            disable=not verbose):
        # extract the bootstrap indices, keeping a fraction of the data
        # available for testing
        train_idx, test_idx = utils.leveled_randomized_ids(
            self.groups_, self.train_frac_est)

        # iterate over the regularization parameters
        for lamb_idx, lamb in enumerate(self.lambdas):
            # extract the current support set
            support = self.supports_[lamb_idx]

            # extract response vectors
            y_train = y[train_idx]
            y_test = y[test_idx]

            # if nothing was selected, we won't bother running OLS
            if np.any(support):
                # get design matrices
                X_train = X[train_idx][:, support]
                X_test = X[test_idx][:, support]

                # compute the OLS estimate
                ols = lm.LinearRegression()
                ols.fit(X_train, y_train)

                # store the fitted coefficients
                estimates[bootstrap, lamb_idx, support] = ols.coef_

                # calculate the estimation score
                if self.estimation_score == 'r2':
                    scores[bootstrap, lamb_idx] = ols.score(X_test, y_test)
                elif self.estimation_score == 'BIC':
                    y_pred = ols.predict(X_test)
                    n_features = np.count_nonzero(support)
                    scores[bootstrap, lamb_idx] = -utils.BIC(
                        y_true=y_test, y_pred=y_pred, n_features=n_features)
                elif self.estimation_score == 'AIC':
                    y_pred = ols.predict(X_test)
                    n_features = np.count_nonzero(support)
                    scores[bootstrap, lamb_idx] = -utils.AIC(
                        y_true=y_test, y_pred=y_pred, n_features=n_features)
                elif self.estimation_score == 'AICc':
                    y_pred = ols.predict(X_test)
                    n_features = np.count_nonzero(support)
                    scores[bootstrap, lamb_idx] = -utils.AICc(
                        y_true=y_test, y_pred=y_pred, n_features=n_features)
                else:
                    raise ValueError(
                        str(self.estimation_score) +
                        ' is not a valid option.')
            else:
                if self.estimation_score == 'r2':
                    scores[bootstrap, lamb_idx] = r2_score(
                        y_true=y_test, y_pred=np.zeros(y_test.size))
                elif self.estimation_score == 'BIC':
                    n_features = 0
                    scores[bootstrap, lamb_idx] = -utils.BIC(
                        y_true=y_test,
                        y_pred=np.zeros(y_test.size),
                        n_features=n_features)
                elif self.estimation_score == 'AIC':
                    n_features = 0
                    scores[bootstrap, lamb_idx] = -utils.AIC(
                        y_true=y_test,
                        y_pred=np.zeros(y_test.size),
                        n_features=n_features)
                elif self.estimation_score == 'AICc':
                    n_features = 0
                    scores[bootstrap, lamb_idx] = -utils.AICc(
                        y_true=y_test,
                        y_pred=np.zeros(y_test.size),
                        n_features=n_features)
                else:
                    raise ValueError(
                        str(self.estimation_score) +
                        ' is not a valid option.')

    if verbose:
        print('(4) Bagging estimates, using bagging option %s.'
              % self.bagging_options)

    if self.bagging_options == 1:
        # bagging option 1: for each bootstrap sample, find the
        # regularization parameter that gave the best results
        self.lambda_max_idx = np.argmax(scores, axis=1)
        # extract the estimates over bootstraps from the model with the
        # best lambda
        best_estimates = estimates[np.arange(self.n_boots_est),
                                   self.lambda_max_idx, :]
        # take the median across estimates for the final, bagged estimate
        self.coef_ = np.median(best_estimates, axis=0)
    elif self.bagging_options == 2:
        # bagging option 2: average the estimates across bootstraps, and
        # then find the regularization parameter that gives the best results
        mean_scores = np.mean(scores, axis=0)
        self.lambda_max_idx = np.argmax(mean_scores)
        self.coef_ = np.median(estimates[:, self.lambda_max_idx, :], 0)
    else:
        raise ValueError('Bagging option %d is not available.'
                         % self.bagging_options)

    if verbose:
        print("---> UoI Lasso complete.")

    self._set_intercept(X_offset, y_offset, X_scale)

    return self
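# Sketches of the prediction-based criteria assumed above (the utils module
# is not shown). All are based on the Gaussian log-likelihood up to an
# additive constant, so lower is better and the code negates them to turn
# them into scores.
import numpy as np

def aic(y_true, y_pred, n_features):
    """Akaike information criterion (assumed form of utils.AIC)."""
    n = y_true.size
    rss = np.sum((y_true - y_pred) ** 2)
    return n * np.log(rss / n) + 2 * n_features

def bic(y_true, y_pred, n_features):
    """Bayesian information criterion (assumed form of utils.BIC)."""
    n = y_true.size
    rss = np.sum((y_true - y_pred) ** 2)
    return n * np.log(rss / n) + n_features * np.log(n)

def aicc(y_true, y_pred, n_features):
    """Small-sample corrected AIC; requires n > n_features + 1."""
    n = y_true.size
    return (aic(y_true, y_pred, n_features)
            + 2 * n_features * (n_features + 1) / (n - n_features - 1))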