Example #1
    def fit(self, x, y, sample_weight=None):
        x, y = check_X_y(x,
                         y,
                         accept_sparse=[],
                         y_numeric=True,
                         multi_output=False)

        x, y, X_offset, y_offset, X_scale = self._preprocess_data(
            x,
            y,
            fit_intercept=self.fit_intercept,
            normalize=self.normalize,
            copy=self.copy_X,
            sample_weight=sample_weight)

        if sample_weight is not None:
            x, y = _rescale_data(x, y, sample_weight)

        self.coef_ = sparse_group_lasso(x,
                                        y,
                                        self.alpha,
                                        self.rho,
                                        self.groups,
                                        max_iter=self.max_iter,
                                        rtol=self.tol)

        self._set_intercept(X_offset, y_offset, X_scale)
        return self
Example #2
    def fit(self, x_, y, sample_weight=None):
        n_samples, n_features = x_.shape

        x_, y = check_X_y(x_,
                          y,
                          accept_sparse=[],
                          y_numeric=True,
                          multi_output=False)

        x, y, X_offset, y_offset, X_scale = self._preprocess_data(
            x_,
            y,
            fit_intercept=self.fit_intercept,
            normalize=self.normalize,
            copy=self.copy_X,
            sample_weight=None)

        if sample_weight is not None:
            # Sample weight can be implemented via a simple rescaling.
            x, y = _rescale_data(x, y, sample_weight)

        coefs, intercept = fit_with_noise(x, y, self.sigma, self.alpha, self.n)
        self.intercept_ = intercept
        self.coef_ = coefs
        self._set_intercept(X_offset, y_offset, X_scale)
        return self
Example #3
def nonnegative_regression(X, y, sample_weight=None):
    r"""Solve the nonnegative least squares estimate regression problem.

    Solves :math:`\underset{x}{\text{argmin}} \| Ax - y \|_2^2` subject to :math:`x \geq 0`
    using `scipy.optimize.nnls <https://docs.scipy.org/doc/scipy/reference/
    generated/scipy.optimize.nnls.html>`_

    Parameters
    ----------
    X : array, shape = (n_samples, n_features)
        Training data.

    y : array, shape = (n_samples,) or (n_samples, n_targets)
        Target values.

    sample_weight : float or array-like, shape (n_samples,), optional (default = None)
        Individual weights for each sample.

    Returns
    -------
    coef : array, shape = (n_features,) or (n_targets, n_features)
        Weight vector(s).

    res : float
        The residual, :math:`\| Ax - y \|_2`.
    """
    # TODO accept_sparse=['csr', 'csc', 'coo']? check sopt.nnls
    # TODO order='F'?
    X = check_array(X)
    y = check_array(y, ensure_2d=False)
    check_consistent_length(X, y)

    n_samples, n_features = X.shape

    ravel = False
    if y.ndim == 1:
        y = y.reshape(-1, 1)
        ravel = True

    n_samples_, n_targets = y.shape

    if n_samples != n_samples_:
        raise ValueError("Number of samples in X and y does not correspond:"
                         " %d != %d" % (n_samples, n_samples_))

    has_sw = sample_weight is not None

    if has_sw:
        if np.atleast_1d(sample_weight).ndim > 1:
            raise ValueError("Sample weights must be 1D array or scalar")

        X, y = _rescale_data(X, y, sample_weight)

    coef, res = _solve_nnls(X, y)

    if ravel:
        # When y was passed as 1d-array, we flatten the coefficients
        coef = coef.ravel()

    return coef, res
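
A minimal usage sketch for the helper above; the data, the weights, and the assumption that nonnegative_regression is importable in this scope are illustrative, not part of the original source:

import numpy as np

rng = np.random.RandomState(0)
X = np.abs(rng.rand(50, 3))
true_coef = np.array([1.5, 0.0, 2.0])        # nonnegative ground truth
y = X.dot(true_coef) + 0.01 * rng.randn(50)
weights = 1.0 + rng.rand(50)                 # per-sample weights

# Weighted samples are folded in via the same sqrt(weight) rescaling used by _rescale_data.
coef, res = nonnegative_regression(X, y, sample_weight=weights)
assert np.all(coef >= 0)                     # NNLS keeps coefficients nonnegative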
Example #4
    def fit(self, x_, y, sample_weight=None):
        x_, y = check_X_y(x_,
                          y,
                          accept_sparse=[],
                          y_numeric=True,
                          multi_output=False)

        x, y, X_offset, y_offset, X_scale = self._preprocess_data(
            x_,
            y,
            fit_intercept=self.fit_intercept,
            normalize=self.normalize,
            copy=self.copy_X,
            sample_weight=sample_weight)

        if sample_weight is not None:
            x, y = _rescale_data(x, y, sample_weight)

        self.iters = 0
        self.ind_ = np.ones(x.shape[1], dtype=bool)  # initial guess
        if self.threshold > 0:
            self._reduce(x, y)
        else:
            self.coef_ = self._regress(x[:, self.ind_], y, self.alpha)

        if self.unbias and self.alpha >= 0:
            self._unbias(x, y)

        self._set_intercept(X_offset, y_offset, X_scale)
        if self.threshold_intercept and abs(self.intercept_) < self.threshold:
            self.intercept_ = 0
        return self
Example #5
def test_rescale_data():
    rng = np.random.RandomState(0)  # defined at module level in the original test file
    n_samples = 200
    n_features = 2

    sample_weight = 1.0 + rng.rand(n_samples)
    X = rng.rand(n_samples, n_features)
    y = rng.rand(n_samples)
    rescaled_X, rescaled_y = _rescale_data(X, y, sample_weight)
    rescaled_X2 = X * np.sqrt(sample_weight)[:, np.newaxis]
    rescaled_y2 = y * np.sqrt(sample_weight)
    assert_array_almost_equal(rescaled_X, rescaled_X2)
    assert_array_almost_equal(rescaled_y, rescaled_y2)
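
The reason a sqrt(sample_weight) rescaling is all these fit methods need: minimizing sum_i w_i * (y_i - x_i . beta)^2 is the same problem as an ordinary least-squares fit on sqrt(w_i) * x_i and sqrt(w_i) * y_i, which is exactly what _rescale_data builds and what the test above checks against. A self-contained numerical check of that equivalence (plain numpy only, not the scikit-learn helper):

import numpy as np

rng = np.random.RandomState(0)
X = rng.rand(200, 2)
y = rng.rand(200)
w = 1.0 + rng.rand(200)

# Weighted least squares via the normal equations: (X^T W X) beta = X^T W y
beta_wls = np.linalg.solve(X.T @ (w[:, None] * X), X.T @ (w * y))

# Ordinary least squares on the sqrt(w)-rescaled data
Xs = X * np.sqrt(w)[:, None]
ys = y * np.sqrt(w)
beta_ols, *_ = np.linalg.lstsq(Xs, ys, rcond=None)

np.testing.assert_allclose(beta_wls, beta_ols)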
Example #7
    def fit_linear_nnls(self, X, y, sample_weight=None):
        if not isinstance(self.model, LinearRegression):
            raise ValueError(
                'Model is not LinearRegression, cannot call fit for linear NNLS'
            )
        n_jobs_ = self.model.n_jobs
        self.model.coef_ = []
        X, y = check_X_y(X,
                         y,
                         accept_sparse=['csr', 'csc', 'coo'],
                         y_numeric=True,
                         multi_output=True)

        if sample_weight is not None and np.atleast_1d(sample_weight).ndim > 1:
            raise ValueError("Sample weights must be 1D array or scalar")

        X, y, X_offset, y_offset, X_scale = self.model._preprocess_data(
            X,
            y,
            fit_intercept=self.model.fit_intercept,
            normalize=self.model.normalize,
            copy=self.model.copy_X,
            sample_weight=sample_weight)

        if sample_weight is not None:
            # Sample weight can be implemented via a simple rescaling.
            X, y = _rescale_data(X, y, sample_weight)

        if sp.issparse(X):
            if y.ndim < 2:
                # out = sparse_lsqr(X, y)
                # lsq_linear returns an OptimizeResult: .x is the solution, .cost the objective
                out = lsq_linear(X, y, bounds=(0, np.inf))
                self.model.coef_ = out.x
                self.model._residues = out.cost
            else:
                # sparse_lstsq cannot handle y with shape (M, K)
                outs = Parallel(n_jobs=n_jobs_)(
                    delayed(lsq_linear)(X, y[:, j].ravel(), bounds=(0, np.inf))
                    for j in range(y.shape[1]))
                self.model.coef_ = np.vstack([out.x for out in outs])
                self.model._residues = np.vstack([out.cost for out in outs])
        else:
            # self.model.coef_, self.model.cost_, self.model.fun_, self.model.optimality_, self.model.active_mask_,
            # self.model.nit_, self.model.status_, self.model.message_, self.model.success_\
            out = lsq_linear(X, y, bounds=(0, np.inf))
            self.model.coef_ = out.x
            self.model.coef_ = self.model.coef_.T

        if y.ndim == 1:
            self.model.coef_ = np.ravel(self.model.coef_)
        self.model._set_intercept(X_offset, y_offset, X_scale)
        return self.model
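
Note that scipy.optimize.lsq_linear returns an OptimizeResult, so the solution and objective are read from named attributes (out.x, out.cost) rather than tuple positions; that is what the branches above rely on. A small standalone sketch of the bounded solve, with made-up data:

import numpy as np
from scipy.optimize import lsq_linear

rng = np.random.RandomState(0)
A = rng.rand(30, 4)
b = rng.rand(30)

out = lsq_linear(A, b, bounds=(0, np.inf))
coef = out.x      # nonnegative least-squares solution
cost = out.cost   # 0.5 * ||A @ coef - b||^2
assert np.all(coef >= 0)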
Example #8
    def fit(self, X, y, sample_weight=None):

        X, y = check_X_y(X,
                         y,
                         accept_sparse=['csr', 'csc', 'coo'],
                         y_numeric=True,
                         multi_output=True)

        if self.copy_X:
            if sp.issparse(X):
                X = X.copy()
            else:
                X = X.copy(order='K')

        if self.normalize:
            X = normalize(X)

        if self.fit_intercept:
            X = np.column_stack((np.ones(X.shape[0]), X))

        if sample_weight is not None:
            if np.atleast_1d(sample_weight).ndim > 1:
                raise ValueError("Sample weights must be 1D array or scalar")

            X, y = _rescale_data(X, y, sample_weight)

        optim_option = self.optim_args.copy()
        optim_option['fun'] = lambda x: self._l1_loss(betas=x, X=X, y=y)
        optim_option['jac'] = lambda x: self._jac(betas=x, X=X, y=y)
        optim_option['method'] = self.optim_method

        if 'x0' not in optim_option:
            optim_option['x0'] = self._x0_ls_guess(X, y)

        self.res = minimize(**optim_option)

        if self.fit_intercept:
            self.coef_ = self.res.x[1:]
            self.intercept_ = self.res.x[0]
        else:
            self.coef_ = self.res.x
            self.intercept_ = 0.0

        return self
Example #9
    def fit(self, X, y, sample_weight=None):
        """
        Fit linear model.
        Parameters
        ----------
        X : numpy array [n_samples, n_features]
            Training data
        y : numpy array of shape [n_samples]
            Target values
        sample_weight : numpy array of shape [n_samples]
            Individual weights for each sample
        Returns
        -------
        self : returns an instance of self.
        """
        X, y = check_X_y(X, y, y_numeric=True, multi_output=True)

        if (sample_weight is not None) and \
           np.atleast_1d(sample_weight).ndim > 1:
            raise ValueError("Sample weights must be 1D array or scalar")

        X, y, X_offset, y_offset, X_scale = self._preprocess_data(
            X,
            y,
            fit_intercept=self.fit_intercept,
            normalize=self.normalize,
            copy=self.copy_X,
            sample_weight=sample_weight)

        if sample_weight is not None:
            # Sample weight can be implemented via a simple rescaling.
            X, y = _rescale_data(X, y, sample_weight)

        self.coef_, self.residues_ = optimize.nnls(X, y)
        self._set_intercept(X_offset, y_offset, X_scale)

        return self
Example #10
def _fit_linear(self, X, y, sample_weight=None):
    """
    Fit linear model.

    Parameters
    ----------
    X : numpy array or sparse matrix of shape [n_samples,n_features]
        Training data

    y : numpy array of shape [n_samples, n_targets]
        Target values

    sample_weight : numpy array of shape [n_samples]
        Individual weights for each sample

        .. versionadded:: 0.17
           parameter *sample_weight* support to LinearRegression.

    Returns
    -------
    self : returns an instance of self.
    """

    n_jobs_ = self.n_jobs
    X, y = _daal_check_X_y(X,
                           y,
                           accept_sparse=['csr', 'csc', 'coo'],
                           y_numeric=True,
                           multi_output=True)

    dtype = get_dtype(X)

    self.sample_weight_ = sample_weight
    self.fit_shape_good_for_daal_ = bool(
        X.shape[0] > X.shape[1] + int(self.fit_intercept))

    if self.fit_shape_good_for_daal_ and \
            not sp.issparse(X) and \
            (dtype == np.float64 or dtype == np.float32) and \
            sample_weight is None:
        logging.info("sklearn.linar_model.LinearRegression."
                     "fit: " + get_patch_message("daal"))
        res = _daal4py_fit(self, X, y)
        if res is not None:
            return res
        logging.info("sklearn.linar_model.LinearRegression."
                     "fit: " + get_patch_message("sklearn_after_daal"))
    else:
        logging.info("sklearn.linar_model.LinearRegression."
                     "fit: " + get_patch_message("sklearn"))

    if sample_weight is not None and np.atleast_1d(sample_weight).ndim > 1:
        raise ValueError("Sample weights must be 1D array or scalar")

    X, y, X_offset, y_offset, X_scale = self._preprocess_data(
        X,
        y,
        fit_intercept=self.fit_intercept,
        normalize=self.normalize,
        copy=self.copy_X,
        sample_weight=sample_weight)

    if sample_weight is not None:
        # Sample weight can be implemented via a simple rescaling.
        X, y = _rescale_data(X, y, sample_weight)

    if sp.issparse(X):
        if y.ndim < 2:
            out = sparse_lsqr(X, y)
            self.coef_ = out[0]
            self._residues = out[3]
        else:
            # sparse_lstsq cannot handle y with shape (M, K)
            outs = Parallel(n_jobs=n_jobs_)(
                delayed(sparse_lsqr)(X, y[:, j].ravel())
                for j in range(y.shape[1]))
            self.coef_ = np.vstack([out[0] for out in outs])
            self._residues = np.vstack([out[3] for out in outs])
    else:
        self.coef_, self._residues, self.rank_, self.singular_ = \
            linalg.lstsq(X, y)
        self.coef_ = self.coef_.T

    if y.ndim == 1:
        self.coef_ = np.ravel(self.coef_)
    self._set_intercept(X_offset, y_offset, X_scale)
    return self
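
For reference, scipy.sparse.linalg.lsqr (which scikit-learn's sparse_lsqr is essentially an alias for) returns a plain tuple: index 0 is the solution and index 3 is the residual norm ||Ax - b||, which is what the sparse branch above stores in coef_ and _residues. A minimal sketch with made-up data:

import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import lsqr

rng = np.random.RandomState(0)
A = csr_matrix(rng.rand(30, 4))
b = rng.rand(30)

out = lsqr(A, b)
x = out[0]        # least-squares solution
r1norm = out[3]   # residual norm ||A @ x - b||
assert np.isclose(r1norm, np.linalg.norm(A @ x - b))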
Example #11
def nonnegative_ridge_regression(X,
                                 y,
                                 alpha,
                                 sample_weight=None,
                                 solver='SLSQP',
                                 **solver_kwargs):
    r"""Solve the nonnegative least squares estimate ridge regression problem.

    Solves

    .. math::
        \underset{x}{\text{argmin}} \| Ax - y \|_2^2 + \alpha \| x \|_2^2
        \quad \text{s.t.} \quad x \geq 0

    We can write this as the quadratic programming (QP) problem:

    .. math::

        \underset{x}{\text{argmin}} x^TQx + c^Tx \quad \text{s.t.} \quad x \geq 0

    where

    .. math::

        Q = A^TA + \alpha I \quad \text{and} \quad c = -2A^Ty

    Parameters
    ----------
    X : array, shape = (n_samples, n_features)
        Training data.

    y : array, shape = (n_samples,) or (n_samples, n_targets)
        Target values.

    alpha : float or array with shape = (n_features,)
        Regularization strength; must be a positive float. Improves the
        conditioning of the problem and reduces the variance of the estimates.
        Larger values specify stronger regularization.

    sample_weight : float or array-like, shape (n_samples,), optional (default = None)
        Individual weights for each sample.

    solver : string, optional (default = 'SLSQP')
        Solver with which to solve the QP. Must be one that supports bounds
        (i.e. 'L-BFGS-B', 'TNC', 'SLSQP').

    **solver_kwargs
        See `scipy.optimize.minimize <https://docs.scipy.org/doc/scipy/
        reference/generated/scipy.optimize.minimize.html>`_
        for valid keyword arguments

    Returns
    -------
    coef : array, shape = (n_features,) or (n_features, n_targets)
        Weight vector(s).

    res : float
        The residual, :math:`\| Qx - c \|_2`

    Notes
    -----
    - This is an experimental function.
    - If one wishes to perform Lasso or Elastic-Net regression, see
      `sklearn.linear_model.lasso_path <http://scikit-learn.org/stable/modules/
      generated/sklearn.linear_model.lasso_path.html>`_ or
      `sklearn.linear_model.enet_path <http://scikit-learn.org/stable/
      modules/generated/sklearn.linear_model.enet_path.html>`_,
      and pass the parameters `fit_intercept=False, positive=True`


    See Also
    --------
    nonnegative_regression
    """
    if solver not in ('L-BFGS-B', 'TNC', 'SLSQP'):
        raise ValueError('solver must be one of L-BFGS-B, TNC, SLSQP, '
                         'not %s' % solver)

    # TODO accept_sparse=['csr', 'csc', 'coo']? check sopt.nnls
    # TODO order='F'?
    X = check_array(X)
    y = check_array(y, ensure_2d=False)
    check_consistent_length(X, y)

    n_samples, n_features = X.shape

    ravel = False
    if y.ndim == 1:
        y = y.reshape(-1, 1)
        ravel = True

    n_samples_, n_targets = y.shape

    if n_samples != n_samples_:
        raise ValueError("Number of samples in X and y does not correspond:"
                         " %d != %d" % (n_samples, n_samples_))

    has_sw = sample_weight is not None

    if has_sw:
        if np.atleast_1d(sample_weight).ndim > 1:
            raise ValueError("Sample weights must be 1D array or scalar")

        X, y = _rescale_data(X, y, sample_weight)

    # there should be either 1 or n_features penalties
    alpha = np.asarray(alpha, dtype=X.dtype).ravel()
    if alpha.size not in [1, n_features]:
        raise ValueError("Number of features and number of L2 penalties "
                         "do not correspond: %d != %d" %
                         (alpha.size, n_features))

    # NOTE: different from sklearn.linear_model.ridge
    if alpha.size == 1 and n_features > 1:
        alpha = np.repeat(alpha, n_features)

    coef, res = _solve_ridge_nnls(X, y, alpha, solver, **solver_kwargs)

    if ravel:
        # When y was passed as 1d-array, we flatten the coefficients
        coef = coef.ravel()

    return coef, res
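
A quick numerical check of the QP rewrite in the docstring above; this only verifies the algebra under the convention Q = A^T A + alpha * I, c = -2 A^T y, and does not call the package's _solve_ridge_nnls:

import numpy as np
from scipy.optimize import minimize

rng = np.random.RandomState(0)
A = rng.rand(40, 3)
y = rng.rand(40)
alpha = 0.5

Q = A.T @ A + alpha * np.eye(3)
c = -2.0 * A.T @ y

# The ridge objective and its QP form differ only by the constant ||y||^2,
# so both have the same nonnegative minimizer.
ridge_obj = lambda x: np.sum((A @ x - y) ** 2) + alpha * np.sum(x ** 2)
ridge_grad = lambda x: 2.0 * A.T @ (A @ x - y) + 2.0 * alpha * x
qp_obj = lambda x: x @ Q @ x + c @ x
qp_grad = lambda x: 2.0 * Q @ x + c

bounds = [(0, None)] * 3
x0 = np.zeros(3)
opts = {'ftol': 1e-12, 'gtol': 1e-10}
sol_ridge = minimize(ridge_obj, x0, jac=ridge_grad, method='L-BFGS-B',
                     bounds=bounds, options=opts).x
sol_qp = minimize(qp_obj, x0, jac=qp_grad, method='L-BFGS-B',
                  bounds=bounds, options=opts).x
np.testing.assert_allclose(sol_ridge, sol_qp, atol=1e-6)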
Example #12
    def fit(self, X, y, sample_weight=None):
        """Fit Ridge regression model

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training data

        y : array-like, shape = [n_samples] or [n_samples, n_targets]
            Target values

        sample_weight : float or array-like of shape [n_samples]
            Sample weight

        Returns
        -------
        self : Returns self.
        """
        X, y = check_X_y(X,
                         y, ['csr', 'csc', 'coo'],
                         dtype=np.float64,
                         multi_output=True,
                         y_numeric=True)
        n_samples, n_features = X.shape

        if hasattr(LinearModel, '_preprocess_data'):
            # Scikit-learn 0.18 and up
            X, y, X_offset, y_offset, X_scale = LinearModel._preprocess_data(
                X,
                y,
                self.fit_intercept,
                self.normalize,
                self.copy_X,
                sample_weight=sample_weight)
        else:
            X, y, X_offset, y_offset, X_scale = LinearModel._center_data(
                X,
                y,
                self.fit_intercept,
                self.normalize,
                self.copy_X,
                sample_weight=sample_weight)

        gcv_mode = self.gcv_mode
        with_sw = len(np.shape(sample_weight))

        if gcv_mode is None or gcv_mode == 'auto':
            if sparse.issparse(X) or n_features > n_samples or with_sw:
                gcv_mode = 'eigen'
            else:
                gcv_mode = 'svd'
        elif gcv_mode == "svd" and with_sw:
            # FIXME non-uniform sample weights not yet supported
            warnings.warn("non-uniform sample weights unsupported for svd, "
                          "forcing usage of eigen")
            gcv_mode = 'eigen'

        if gcv_mode == 'eigen':
            _pre_compute = self._pre_compute
            _errors = self._errors
            _values = self._values
        elif gcv_mode == 'svd':
            # assert n_samples >= n_features
            _pre_compute = self._pre_compute_svd
            _errors = self._errors_svd
            _values = self._values_svd
        else:
            raise ValueError('bad gcv_mode "%s"' % gcv_mode)

        if sample_weight is not None:
            X, y = _rescale_data(X, y, sample_weight)

        # Ensure that y is a 2D array: n_samples x n_targets
        flat_y = y.ndim == 1
        if flat_y:
            y = np.atleast_2d(y).T
        n_targets = y.shape[1]

        centered_kernel = not sparse.issparse(X) and self.fit_intercept
        v, Q, QT_y = _pre_compute(X, y, centered_kernel)
        cv_values = np.zeros((n_samples, n_targets, len(self.alphas)))
        C = []

        scorer = check_scoring(self, scoring=self.scoring, allow_none=True)
        error = scorer is None

        for i, alpha in enumerate(self.alphas):
            if error:
                out, c = _errors(alpha, y, v, Q, QT_y)
            else:
                out, c = _values(alpha, y, v, Q, QT_y)
            cv_values[:, :, i] = out
            C.append(c)

        if self.store_cv_values:
            self.cv_values_ = cv_values

        if error:
            if self.alpha_per_target:
                # Find the best alpha for each target
                best = cv_values.mean(axis=0).argmin(axis=1)
            else:
                # Find the best alpha overall
                best = np.mean(cv_values.reshape(-1, len(self.alphas)),
                               axis=0).argmin()
        else:
            # The scorer wants an object that will make the predictions, but
            # they are already computed efficiently by RidgeGCV. This
            # identity_estimator will just return them
            def identity_estimator():
                pass

            identity_estimator.decision_function = lambda y_predict: y_predict
            identity_estimator.predict = lambda y_predict: y_predict

            if self.alpha_per_target:
                out = [
                    scorer(identity_estimator, cv_values[:, i, j], target)
                    for j in range(len(self.alphas))
                    for i, target in enumerate(y.T)
                ]
                # scores are laid out alpha-major; reshape to (n_alphas, n_targets)
                # and pick the best alpha for each target
                best = np.argmax(np.reshape(out, (len(self.alphas), -1)), axis=0)
            else:
                out = [
                    scorer(identity_estimator, cv_values[:, :, i].ravel(),
                           y.ravel())
                    for i in range(len(self.alphas))
                ]
                best = np.argmax(out)

        self.alpha_ = self.alphas[best]

        if self.alpha_per_target:
            self.dual_coef_ = np.vstack(
                [C[j][:, i] for i, j in enumerate(best)]).T
        else:
            self.dual_coef_ = C[best]

        self.coef_ = safe_sparse_dot(self.dual_coef_.T, X)

        # If the original y was flat, remove some dimensions to match
        if flat_y:
            if self.store_cv_values:
                self.cv_values_ = cv_values.reshape(n_samples,
                                                    len(self.alphas))
            self.coef_ = self.coef_.ravel()

        self._set_intercept(X_offset, y_offset, X_scale)

        return self
Example #13
    def fit(self, X, y, seed=None, verbose=False, sample_weight=None):
        """Fit data according to the UoI-Lasso algorithm.
		Relevant information (fits, residuals, model performance) is stored within object.
		Thus, nothing is returned by this function.

		Parameters
		----------
		X : np array (2d)
			the design matrix, containing the predictors.
			its shape is assumed to be (number of samples, number of features).

		y : np array (1d)
			the vector of dependent variables.
			its length is assumed to be (number of samples,).

		seed : int
			a seed for the random number generator. this number is relevant
			for the choosing bootstraps and dividing the data into training and test sets.

		verbose : boolean
			a boolean switch indicating whether the fitting should print out its progress.
		"""
        # initialize the seed, if it's provided
        if seed is not None:
            np.random.seed(seed)

        # start taken from sklearn.LinearModels.base.LinearRegression
        X, y = check_X_y(X,
                         y,
                         accept_sparse=['csr', 'csc', 'coo'],
                         y_numeric=True,
                         multi_output=True)

        # preprocess data through centering and normalization
        X, y, X_offset, y_offset, X_scale = _preprocess_data(
            X,
            y,
            fit_intercept=self.fit_intercept,
            normalize=self.normalize,
            copy=self.copy_X)

        if sample_weight is not None and np.atleast_1d(sample_weight).ndim > 1:
            raise ValueError("Sample weights must be 1D array or scalar")

        if sample_weight is not None:
            # Sample weight can be implemented via a simple rescaling.
            X, y = _rescale_data(X, y, sample_weight)

        # extract model dimensions from design matrix
        self.n_samples_, self.n_features_ = X.shape

        if verbose:
            print('(1) Loaded data.\n %s samples with %s features.' %
                  (self.n_samples_, self.n_features_))

        # perform an initial coarse sweep over the lambda parameters
        # this is to zero-in on the relevant regularization region.
        if self.n_lambdas == 1:
            lambda_coarse = np.array([1.0])
        else:
            lambda_coarse = np.logspace(-3.,
                                        3.,
                                        self.n_lambdas,
                                        dtype=np.float64)
        # run the coarse lasso sweep
        estimates_coarse, scores_coarse = self.lasso_sweep(
            X, y, lambda_coarse, self.train_frac_sel, self.n_boots_coarse,
            self.use_admm, desc='coarse lasso sweep', verbose=verbose)
        # deduce the index which maximizes the explained variance over bootstraps
        lambda_max_idx = np.argmax(np.mean(scores_coarse, axis=0))
        # obtain the lambda which maximizes the explained variance over bootstraps
        lambda_max = lambda_coarse[lambda_max_idx]
        # in our dense sweep, we'll explore lambda values which encompass a
        # range that's one order of magnitude less than lambda_max itself
        d_lambda = 10**(np.floor(np.log10(lambda_max)) - 1)

        # now that we've narrowed down the regularization parameters,
        # we'll run a dense sweep which begins the model selection module of UoI

        #######################
        ### Model Selection ###
        #######################
        if verbose:
            print(
                '(2) Beginning model selection. Exploring penalty region centered at %s.'
                % lambda_max)

        # create the final lambda set based on the coarse sweep
        if self.n_lambdas == 1:
            lambdas = np.array([lambda_max])
        else:
            lambdas = np.linspace(lambda_max - 5 * d_lambda,
                                  lambda_max + 5 * d_lambda,
                                  self.n_lambdas,
                                  dtype=np.float64)
        # run the lasso sweep with new lambda set
        estimates_dense, scores_dense = self.lasso_sweep(
            X, y, lambdas, self.train_frac_sel, self.n_boots_sel,
            self.use_admm, desc='fine lasso sweep', verbose=verbose)
        # choose selection fraction threshold values to use
        selection_frac_thresholds = np.linspace(self.selection_thres_min,
                                                self.selection_thres_max,
                                                self.n_selection_thres)
        # calculate the actual number of thresholds, but delete any repetitions
        selection_thresholds = np.sort(
            np.unique(
                (self.n_boots_sel * selection_frac_thresholds).astype('int')))
        # create support matrix
        self.supports_ = np.zeros(
            (self.n_selection_thres, self.n_lambdas, self.n_features_),
            dtype=bool)
        # iterate over each stability selection threshold
        for thres_idx, threshold in enumerate(selection_thresholds):
            # calculate the support given the specific selection threshold
            self.supports_[thres_idx, :] = np.count_nonzero(
                estimates_dense, axis=0) >= threshold
        # reshape support matrix so that first axis consists of all combinations of hyperparameters
        self.supports_ = np.reshape(
            self.supports_,
            (self.n_selection_thres * self.n_lambdas, self.n_features_))

        ########################
        ### Model Estimation ###
        ########################
        # we'll use the supports obtained in the selection module to calculate
        # bagged OLS estimates over bootstraps

        if verbose:
            print(
                '(3) Model selection complete. Beginning model estimation, with %s bootstraps.'
                % self.n_boots_est)

        # create or overwrite arrays to collect final results
        self.coef_ = np.zeros(self.n_features_, dtype=np.float32)
        self.scores_ = np.zeros(1, dtype=np.float32)
        # determine how many samples will be used for overall training
        train_split = int(round(self.train_frac_overall * self.n_samples_))
        # determine how many samples will be used for training within a bootstrap
        boot_train_split = int(round(self.train_frac_est * train_split))

        # set up data arrays
        estimates = np.zeros(
            (self.n_boots_est, self.n_lambdas, self.n_features_),
            dtype=np.float32)
        scores = np.zeros((self.n_boots_est, self.n_lambdas), dtype=np.float32)
        # either we plan on using a test set, or we'll use the entire dataset for training
        if self.train_frac_overall < 1:
            # generate indices for the global training and testing blocks
            indices = np.random.permutation(self.n_samples_)
            train, test = np.split(indices, [train_split])
            # compile the training and test sets
            X_train = X[train]
            y_train = y[train]
            X_test = X[test]
            y_test = y[test]
        else:
            X_train = X
            y_train = y

        # iterate over bootstrap samples
        for bootstrap in trange(self.n_boots_est,
                                desc='Model Estimation',
                                disable=not verbose):
            # extract the bootstrap indices, keeping a fraction of the data
            # available for testing
            bootstrap_indices = np.random.permutation(train_split)
            train_boot, test_boot = np.split(bootstrap_indices,
                                             [boot_train_split])
            # iterate over the regularization parameters
            for lamb_idx, lamb in enumerate(lambdas):
                support = self.supports_[lamb_idx]
                if np.any(support):
                    # fit OLS using the supports from selection module
                    X_boot = X_train[train_boot]
                    y_boot = y_train[train_boot]
                    ols = lm.LinearRegression()
                    ols.fit(X_boot[:, support], y_boot - y_boot.mean())
                    # store the fitted coefficients
                    estimates[bootstrap, lamb_idx, support] = ols.coef_
                    # calculate and store the performance on the test set
                    y_hat_boot = np.dot(X_train[test_boot],
                                        estimates[bootstrap, lamb_idx, :])
                    y_true_boot = y_train[test_boot] - y_train[test_boot].mean()
                    # calculate sum of squared residuals
                    rss = np.sum((y_hat_boot - y_true_boot)**2)
                    # calculate BIC as our scoring function
                    if self.estimation_score == 'r2':
                        scores[bootstrap,
                               lamb_idx] = r2_score(y_true_boot, y_hat_boot)
                    elif self.estimation_score == 'BIC':
                        n_selected_features = np.count_nonzero(support)
                        scores[bootstrap, lamb_idx] = -utils.BIC(
                            n_features=n_selected_features,
                            n_samples=boot_train_split,
                            rss=rss)
                else:
                    # if no variables were selected, throw a message
                    # we'll leave the scores array unchanged, so any support
                    # with no selection will be assigned a score of 0.
                    print(
                        'No variables selected in the support for lambda = %s.'
                        % lamb)

        if verbose:
            print('(4) Bagging estimates, using bagging option %s.' %
                  self.bagging_options)

        if self.bagging_options == 1:
            # bagging option 1: for each bootstrap sample, find the regularization parameter that gave the best results
            lambda_max_idx = np.argmax(scores, axis=1)
            # extract the estimates over bootstraps from the model with best lambda
            best_estimates = estimates[np.arange(self.n_boots_est),
                                       lambda_max_idx, :]
            # take the median across estimates for the final, bagged estimate
            self.coef_ = np.median(best_estimates, axis=0)
        elif self.bagging_options == 2:
            # bagging option 2: average estimates across bootstraps, and then find the regularization parameter that gives the best results
            mean_scores = np.mean(scores, axis=0)
            lambda_max_idx = np.argmax(mean_scores)
            self.coef_ = np.median(estimates[:, lambda_max_idx, :], 0)
        else:
            raise ValueError('Bagging option %d is not available.' %
                             self.bagging_options)
        # if we extracted a test set, evaluate the model
        if self.train_frac_overall < 1:
            # finally, see how the bagged estimates perform on the test set
            y_hat = np.dot(X_test, self.coef_)
            y_true = y_test - y_test.mean()
            # calculate and store performance of the final UoI_Lasso estimator over test set
            self.scores_ = r2_score(y_true, y_hat)
        else:
            self.scores_ = None

        if verbose:
            print("---> UoI Lasso complete.")

        if y.ndim == 1:
            self.coef_ = np.ravel(self.coef_)
        self._set_intercept(X_offset, y_offset, X_scale)

        return self
Example #14
    def fit(self,
            X,
            y,
            groups=None,
            seed=None,
            verbose=False,
            sample_weight=None,
            option=True):
        """Fit data according to the UoI-Lasso algorithm.
		Relevant information (fits, residuals, model performance) is stored within object.
		Thus, nothing is returned by this function.

		Parameters
		----------
		X : np array (2d)
			the design matrix, containing the predictors.
			its shape is assumed to be (number of samples, number of features).

		y : np array (1d)
			the vector of dependent variables.
			its length is assumed to be (number of samples,).

		seed : int
			a seed for the random number generator. this number is relevant
			for the choosing bootstraps and dividing the data into training and test sets.

		verbose : boolean
			a boolean switch indicating whether the fitting should print out its progress.
		"""
        # initialize the seed, if it's provided

        if seed is not None:
            np.random.seed(seed)

        X, y = check_X_y(X,
                         y,
                         accept_sparse=['csr', 'csc', 'coo'],
                         y_numeric=True,
                         multi_output=True)

        # preprocess data through centering and normalization
        X, y, X_offset, y_offset, X_scale = _preprocess_data(
            X,
            y,
            fit_intercept=self.fit_intercept,
            normalize=self.normalize,
            copy=self.copy_X)

        if sample_weight is not None and np.atleast_1d(sample_weight).ndim > 1:
            raise ValueError("Sample weights must be 1D array or scalar")

        if sample_weight is not None:
            # Sample weight can be implemented via a simple rescaling.
            X, y = _rescale_data(X, y, sample_weight)

        # extract model dimensions from design matrix
        self.n_samples_, self.n_features_ = X.shape
        # create or overwrite arrays to collect final results
        self.coef_ = np.zeros(self.n_features_, dtype=np.float32)

        # group leveling
        if groups is None:
            self.groups_ = np.ones(self.n_samples_)
        else:
            self.groups_ = np.array(groups)

        if verbose:
            print('(1) Loaded data.\n %s samples with %s features.' %
                  (self.n_samples_, self.n_features_))

        self.lambdas = _alpha_grid(X=X,
                                   y=y,
                                   l1_ratio=1.0,
                                   fit_intercept=self.fit_intercept,
                                   eps=1e-3,
                                   n_alphas=self.n_lambdas,
                                   normalize=self.normalize)

        # sweep over the grid of regularization strengths
        estimates_selection, _ = self.lasso_sweep(
            X, y, self.lambdas, self.train_frac_sel, self.n_boots_sel,
            self.use_admm, desc='fine lasso sweep', verbose=verbose)

        # perform the intersection step
        self.intersection(estimates_selection)

        ########################
        ### Model Estimation ###
        ########################
        # we'll use the supports obtained in the selection module to calculate
        # bagged OLS estimates over bootstraps

        if verbose:
            print('(3) Beginning model estimation, with %s bootstraps.' %
                  self.n_boots_est)

        # compute number of samples per bootstrap
        n_samples_bootstrap = int(round(self.train_frac_est * self.n_samples_))

        # set up data arrays
        estimates = np.zeros(
            (self.n_boots_est, self.n_lambdas, self.n_features_),
            dtype=np.float32)
        scores = np.zeros((self.n_boots_est, self.n_lambdas), dtype=np.float32)

        # iterate over bootstrap samples
        for bootstrap in trange(self.n_boots_est,
                                desc='Model Estimation',
                                disable=not verbose):

            # extract the bootstrap indices, keeping a fraction of the data available for testing
            train_idx, test_idx = utils.leveled_randomized_ids(
                self.groups_, self.train_frac_est)

            # iterate over the regularization parameters
            for lamb_idx, lamb in enumerate(self.lambdas):
                # extract current support set
                support = self.supports_[lamb_idx]

                # extract response vectors
                y_train = y[train_idx]
                y_test = y[test_idx]

                # if nothing was selected, we won't bother running OLS
                if np.any(support):
                    # get design matrices
                    X_train = X[train_idx][:, support]
                    X_test = X[test_idx][:, support]

                    # compute ols estimate
                    ols = lm.LinearRegression()
                    ols.fit(X_train, y_train)

                    # store the fitted coefficients
                    estimates[bootstrap, lamb_idx, support] = ols.coef_

                    # calculate estimation score
                    if self.estimation_score == 'r2':
                        scores[bootstrap, lamb_idx] = ols.score(X_test, y_test)
                    elif self.estimation_score == 'BIC':
                        y_pred = ols.predict(X_test)
                        n_features = np.count_nonzero(support)
                        scores[bootstrap,
                               lamb_idx] = -utils.BIC(y_true=y_test,
                                                      y_pred=y_pred,
                                                      n_features=n_features)
                    elif self.estimation_score == 'AIC':
                        y_pred = ols.predict(X_test)
                        n_features = np.count_nonzero(support)
                        scores[bootstrap,
                               lamb_idx] = -utils.AIC(y_true=y_test,
                                                      y_pred=y_pred,
                                                      n_features=n_features)
                    elif self.estimation_score == 'AICc':
                        y_pred = ols.predict(X_test)
                        n_features = np.count_nonzero(support)
                        scores[bootstrap,
                               lamb_idx] = -utils.AICc(y_true=y_test,
                                                       y_pred=y_pred,
                                                       n_features=n_features)
                    else:
                        raise ValueError(
                            str(self.estimation_score) +
                            ' is not a valid option.')
                else:
                    if self.estimation_score == 'r2':
                        scores[bootstrap, lamb_idx] = r2_score(
                            y_true=y_test, y_pred=np.zeros(y_test.size))
                    elif self.estimation_score == 'BIC':
                        n_features = 0
                        scores[bootstrap, lamb_idx] = -utils.BIC(
                            y_true=y_test,
                            y_pred=np.zeros(y_test.size),
                            n_features=n_features)
                    elif self.estimation_score == 'AIC':
                        n_features = 0
                        scores[bootstrap, lamb_idx] = -utils.AIC(
                            y_true=y_test,
                            y_pred=np.zeros(y_test.size),
                            n_features=n_features)
                    elif self.estimation_score == 'AICc':
                        n_features = 0
                        scores[bootstrap, lamb_idx] = -utils.AICc(
                            y_true=y_test,
                            y_pred=np.zeros(y_test.size),
                            n_features=n_features)
                    else:
                        raise ValueError(
                            str(self.estimation_score) +
                            ' is not a valid option.')

        if verbose:
            print('(4) Bagging estimates, using bagging option %s.' %
                  self.bagging_options)

        # bagging option 1:
        #	for each bootstrap sample, find the regularization parameter that gave the best results
        if self.bagging_options == 1:
            self.lambda_max_idx = np.argmax(scores, axis=1)
            # extract the estimates over bootstraps from the model with best lambda
            best_estimates = estimates[np.arange(self.n_boots_est),
                                       self.lambda_max_idx, :]
            # take the median across estimates for the final, bagged estimate
            self.coef_ = np.median(best_estimates, axis=0)

        # bagging option 2:
        #	average estimates across bootstraps, and then find the regularization parameter that gives the best results
        elif self.bagging_options == 2:
            mean_scores = np.mean(scores, axis=0)
            self.lambda_max_idx = np.argmax(mean_scores)
            self.coef_ = np.median(estimates[:, self.lambda_max_idx, :], 0)

        else:
            raise ValueError('Bagging option %d is not available.' %
                             self.bagging_options)

        if verbose:
            print("---> UoI Lasso complete.")

        self._set_intercept(X_offset, y_offset, X_scale)

        return self