Code example #1
def test_preprocess_data_weighted():
    n_samples = 200
    n_features = 2
    X = rng.rand(n_samples, n_features)
    y = rng.rand(n_samples)
    sample_weight = rng.rand(n_samples)
    expected_X_mean = np.average(X, axis=0, weights=sample_weight)
    expected_y_mean = np.average(y, axis=0, weights=sample_weight)

    # XXX: if normalize=True, should we expect a weighted standard deviation?
    #      Currently not weighted, but calculated with respect to weighted mean
    expected_X_norm = (np.sqrt(X.shape[0]) * np.mean(
        (X - expected_X_mean)**2, axis=0)**.5)

    Xt, yt, X_mean, y_mean, X_norm = \
        _preprocess_data(X, y, fit_intercept=True, normalize=False,
                         sample_weight=sample_weight)
    assert_array_almost_equal(X_mean, expected_X_mean)
    assert_array_almost_equal(y_mean, expected_y_mean)
    assert_array_almost_equal(X_norm, np.ones(n_features))
    assert_array_almost_equal(Xt, X - expected_X_mean)
    assert_array_almost_equal(yt, y - expected_y_mean)

    Xt, yt, X_mean, y_mean, X_norm = \
        _preprocess_data(X, y, fit_intercept=True, normalize=True,
                         sample_weight=sample_weight)
    assert_array_almost_equal(X_mean, expected_X_mean)
    assert_array_almost_equal(y_mean, expected_y_mean)
    assert_array_almost_equal(X_norm, expected_X_norm)
    assert_array_almost_equal(Xt, (X - expected_X_mean) / expected_X_norm)
    assert_array_almost_equal(yt, y - expected_y_mean)
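
Note: the snippets in this listing are excerpted from larger test and library modules, so they rely on module-level imports and fixtures (such as the shared `rng` generator) that are not reproduced here. A minimal sketch of the setup they assume follows; the private import path of _preprocess_data has moved between scikit-learn releases (sklearn.linear_model.base vs sklearn.linear_model._base), so treat the paths below as assumptions to adapt to the installed version.

# Assumed common setup for the test snippets in this listing (not part of the
# original sources); adjust the private import path to your scikit-learn version.
import numpy as np
from scipy import sparse
from numpy.testing import assert_array_almost_equal, assert_array_equal

from sklearn.datasets import make_regression
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model._base import _preprocess_data, _rescale_data

rng = np.random.RandomState(0)  # module-level generator referenced as `rng` below
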
Code example #2
def test_preprocess_data_multioutput():
    n_samples = 200
    n_features = 3
    n_outputs = 2
    X = rng.rand(n_samples, n_features)
    y = rng.rand(n_samples, n_outputs)
    expected_y_mean = np.mean(y, axis=0)

    args = [X, sparse.csc_matrix(X)]
    for X in args:
        _, yt, _, y_mean, _ = _preprocess_data(X,
                                               y,
                                               fit_intercept=False,
                                               normalize=False)
        assert_array_almost_equal(y_mean, np.zeros(n_outputs))
        assert_array_almost_equal(yt, y)

        _, yt, _, y_mean, _ = _preprocess_data(X,
                                               y,
                                               fit_intercept=True,
                                               normalize=False)
        assert_array_almost_equal(y_mean, expected_y_mean)
        assert_array_almost_equal(yt, y - y_mean)

        _, yt, _, y_mean, _ = _preprocess_data(X,
                                               y,
                                               fit_intercept=True,
                                               normalize=True)
        assert_array_almost_equal(y_mean, expected_y_mean)
        assert_array_almost_equal(yt, y - y_mean)
Code example #3
def test_preprocess_data():
    n_samples = 200
    n_features = 2
    X = rng.rand(n_samples, n_features)
    y = rng.rand(n_samples)
    expected_X_mean = np.mean(X, axis=0)
    expected_X_norm = np.std(X, axis=0) * np.sqrt(X.shape[0])
    expected_y_mean = np.mean(y, axis=0)

    Xt, yt, X_mean, y_mean, X_norm = \
        _preprocess_data(X, y, fit_intercept=False, normalize=False)
    assert_array_almost_equal(X_mean, np.zeros(n_features))
    assert_array_almost_equal(y_mean, 0)
    assert_array_almost_equal(X_norm, np.ones(n_features))
    assert_array_almost_equal(Xt, X)
    assert_array_almost_equal(yt, y)

    Xt, yt, X_mean, y_mean, X_norm = \
        _preprocess_data(X, y, fit_intercept=True, normalize=False)
    assert_array_almost_equal(X_mean, expected_X_mean)
    assert_array_almost_equal(y_mean, expected_y_mean)
    assert_array_almost_equal(X_norm, np.ones(n_features))
    assert_array_almost_equal(Xt, X - expected_X_mean)
    assert_array_almost_equal(yt, y - expected_y_mean)

    Xt, yt, X_mean, y_mean, X_norm = \
        _preprocess_data(X, y, fit_intercept=True, normalize=True)
    assert_array_almost_equal(X_mean, expected_X_mean)
    assert_array_almost_equal(y_mean, expected_y_mean)
    assert_array_almost_equal(X_norm, expected_X_norm)
    assert_array_almost_equal(Xt, (X - expected_X_mean) / expected_X_norm)
    assert_array_almost_equal(yt, y - expected_y_mean)
Code example #4
    def fit(self, X, y, sample_weight=None):
        X, y = check_X_y(X, y, y_numeric=True, multi_output=True)

        if sample_weight is not None:
            sample_weight = _check_sample_weight(sample_weight,
                                                 X,
                                                 dtype=X.dtype)

        X, y, X_offset, y_offset, X_scale = _preprocess_data(
            X,
            y,
            fit_intercept=self.fit_intercept,
            normalize=self.normalize,
            copy=self.copy_X,
            sample_weight=sample_weight,
            return_mean=True)

        if sample_weight is not None:
            # Sample weight can be implemented via a simple rescaling.
            X, y = _rescale_data(X, y, sample_weight)

        self.is_fitted_ = True
        coef, alpha = fracridge(X, y, fracs=self.fracs)
        self.alpha_ = alpha
        self.coef_ = coef
        self._set_intercept(X_offset, y_offset, X_scale)
        return self
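
The comment above notes that sample weights can be folded in via a simple rescaling. A minimal dense-only sketch of that idea (an illustration, not scikit-learn's _rescale_data, which also handles sparse inputs): multiplying every row of X and y by the square root of its weight turns the weighted least-squares objective sum_i w_i * (y_i - x_i @ beta)**2 into an ordinary least-squares problem on the rescaled data.

import numpy as np

def rescale_data_dense(X, y, sample_weight):
    # Scale each sample (row) by sqrt(w_i) so that OLS on the rescaled data
    # minimizes the weighted least-squares objective on the original data.
    sw = np.sqrt(sample_weight)
    return X * sw[:, np.newaxis], y * sw

rng = np.random.RandomState(0)
X, y, w = rng.rand(20, 3), rng.rand(20), rng.rand(20)
Xw, yw = rescale_data_dense(X, y, w)
beta = np.linalg.lstsq(Xw, yw, rcond=None)[0]  # weighted least-squares solution
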
Code example #5
def test_csr_preprocess_data():
    # Test output format of _preprocess_data, when input is csr
    X, y = make_regression()
    X[X < 2.5] = 0.0
    csr = sparse.csr_matrix(X)
    csr_, y, _, _, _ = _preprocess_data(csr, y, True)
    assert csr_.getformat() == "csr"
Code example #6
File: fracridge.py  Project: nrdg/fracridge
    def _validate_input(self, X, y, sample_weight=None):
        """
        Helper function to validate the inputs
        """
        X, y = check_X_y(X, y, y_numeric=True, multi_output=True)

        if sample_weight is not None:
            sample_weight = _check_sample_weight(sample_weight,
                                                 X,
                                                 dtype=X.dtype)

        X, y, X_offset, y_offset, X_scale = _preprocess_data(
            X,
            y,
            fit_intercept=self.fit_intercept,
            normalize=self.normalize,
            copy=self.copy_X,
            sample_weight=sample_weight,
            check_input=True)

        if sample_weight is not None:
            # Sample weight can be implemented via a simple rescaling.
            outs = _rescale_data(X, y, sample_weight)
            X, y = outs[0], outs[1]

        return X, y, X_offset, y_offset, X_scale
Code example #7
def test_sparse_preprocess_data_with_return_mean():
    n_samples = 200
    n_features = 2
    # random_state not supported yet in sparse.rand
    X = sparse.rand(n_samples, n_features, density=0.5)  # , random_state=rng
    X = X.tolil()
    y = rng.rand(n_samples)
    XA = X.toarray()
    expected_X_scale = np.std(XA, axis=0) * np.sqrt(X.shape[0])

    Xt, yt, X_mean, y_mean, X_scale = _preprocess_data(X,
                                                       y,
                                                       fit_intercept=False,
                                                       normalize=False,
                                                       return_mean=True)
    assert_array_almost_equal(X_mean, np.zeros(n_features))
    assert_array_almost_equal(y_mean, 0)
    assert_array_almost_equal(X_scale, np.ones(n_features))
    assert_array_almost_equal(Xt.A, XA)
    assert_array_almost_equal(yt, y)

    Xt, yt, X_mean, y_mean, X_scale = _preprocess_data(X,
                                                       y,
                                                       fit_intercept=True,
                                                       normalize=False,
                                                       return_mean=True)
    assert_array_almost_equal(X_mean, np.mean(XA, axis=0))
    assert_array_almost_equal(y_mean, np.mean(y, axis=0))
    assert_array_almost_equal(X_scale, np.ones(n_features))
    assert_array_almost_equal(Xt.A, XA)
    assert_array_almost_equal(yt, y - np.mean(y, axis=0))

    Xt, yt, X_mean, y_mean, X_scale = _preprocess_data(X,
                                                       y,
                                                       fit_intercept=True,
                                                       normalize=True,
                                                       return_mean=True)
    assert_array_almost_equal(X_mean, np.mean(XA, axis=0))
    assert_array_almost_equal(y_mean, np.mean(y, axis=0))
    assert_array_almost_equal(X_scale, expected_X_scale)
    assert_array_almost_equal(Xt.A, XA / expected_X_scale)
    assert_array_almost_equal(yt, y - np.mean(y, axis=0))
Code example #8
def _alpha_max_grp(X, y, groups, center=False, normalize=False):
    """This costly function (copies X) should only be used for debug."""
    grp_ptr, grp_indices = _grp_converter(groups, X.shape[1])
    X, y, X_offset, _, X_scale = _preprocess_data(
        X, y, center, normalize, copy=True)

    X_mean = X_offset / X_scale
    X_dense, X_data, X_indices, X_indptr = _sparse_and_dense(X)
    alpha_max = dnorm_grp(
        sparse.issparse(X), y, grp_ptr, grp_indices, X_dense, X_data,
        X_indices, X_indptr, X_mean, len(grp_ptr) - 1,
        np.zeros(1, dtype=np.int32), X_mean.any()) / len(y)

    return alpha_max
Code example #9
def test_sparse_preprocess_data_offsets(global_random_seed):
    rng = np.random.RandomState(global_random_seed)
    n_samples = 200
    n_features = 2
    X = sparse.rand(n_samples, n_features, density=0.5, random_state=rng)
    X = X.tolil()
    y = rng.rand(n_samples)
    XA = X.toarray()
    expected_X_scale = np.std(XA, axis=0) * np.sqrt(X.shape[0])

    Xt, yt, X_mean, y_mean, X_scale = _preprocess_data(
        X, y, fit_intercept=False, normalize=False
    )
    assert_array_almost_equal(X_mean, np.zeros(n_features))
    assert_array_almost_equal(y_mean, 0)
    assert_array_almost_equal(X_scale, np.ones(n_features))
    assert_array_almost_equal(Xt.A, XA)
    assert_array_almost_equal(yt, y)

    Xt, yt, X_mean, y_mean, X_scale = _preprocess_data(
        X, y, fit_intercept=True, normalize=False
    )
    assert_array_almost_equal(X_mean, np.mean(XA, axis=0))
    assert_array_almost_equal(y_mean, np.mean(y, axis=0))
    assert_array_almost_equal(X_scale, np.ones(n_features))
    assert_array_almost_equal(Xt.A, XA)
    assert_array_almost_equal(yt, y - np.mean(y, axis=0))

    Xt, yt, X_mean, y_mean, X_scale = _preprocess_data(
        X, y, fit_intercept=True, normalize=True
    )
    assert_array_almost_equal(X_mean, np.mean(XA, axis=0))
    assert_array_almost_equal(y_mean, np.mean(y, axis=0))
    assert_array_almost_equal(X_scale, expected_X_scale)
    assert_array_almost_equal(Xt.A, XA / expected_X_scale)
    assert_array_almost_equal(yt, y - np.mean(y, axis=0))
Code example #10
def test_preprocess_data(global_random_seed):
    rng = np.random.RandomState(global_random_seed)
    n_samples = 200
    n_features = 2
    X = rng.rand(n_samples, n_features)
    y = rng.rand(n_samples)
    expected_X_mean = np.mean(X, axis=0)
    expected_X_scale = np.std(X, axis=0) * np.sqrt(X.shape[0])
    expected_y_mean = np.mean(y, axis=0)

    Xt, yt, X_mean, y_mean, X_scale = _preprocess_data(
        X, y, fit_intercept=False, normalize=False
    )
    assert_array_almost_equal(X_mean, np.zeros(n_features))
    assert_array_almost_equal(y_mean, 0)
    assert_array_almost_equal(X_scale, np.ones(n_features))
    assert_array_almost_equal(Xt, X)
    assert_array_almost_equal(yt, y)

    Xt, yt, X_mean, y_mean, X_scale = _preprocess_data(
        X, y, fit_intercept=True, normalize=False
    )
    assert_array_almost_equal(X_mean, expected_X_mean)
    assert_array_almost_equal(y_mean, expected_y_mean)
    assert_array_almost_equal(X_scale, np.ones(n_features))
    assert_array_almost_equal(Xt, X - expected_X_mean)
    assert_array_almost_equal(yt, y - expected_y_mean)

    Xt, yt, X_mean, y_mean, X_scale = _preprocess_data(
        X, y, fit_intercept=True, normalize=True
    )
    assert_array_almost_equal(X_mean, expected_X_mean)
    assert_array_almost_equal(y_mean, expected_y_mean)
    assert_array_almost_equal(X_scale, expected_X_scale)
    assert_array_almost_equal(Xt, (X - expected_X_mean) / expected_X_scale)
    assert_array_almost_equal(yt, y - expected_y_mean)
Code example #11
File: dropin_sklearn.py  Project: mindis/celer
    def fit(self, X, y):
        """Fit MultiTaskLasso model with Celer"""
        # Need to validate separately here.
        # We can't pass multi_output=True because that would allow y to be csr.
        check_X_params = dict(dtype=[np.float64, np.float32],
                              order='F',
                              copy=self.copy_X and self.fit_intercept)
        check_y_params = dict(ensure_2d=False, order='F')
        X, y = self._validate_data(X,
                                   y,
                                   validate_separately=(check_X_params,
                                                        check_y_params))
        y = y.astype(X.dtype)

        if y.ndim == 1:
            raise ValueError("For mono-task outputs, use Lasso")

        n_samples = X.shape[0]

        if n_samples != y.shape[0]:
            raise ValueError(
                "X and y have inconsistent dimensions (%d != %d)" %
                (n_samples, y.shape[0]))

        X, y, X_offset, y_offset, X_scale = _preprocess_data(
            X, y, self.fit_intercept, self.normalize, copy=False)

        if not self.warm_start or not hasattr(self, "coef_"):
            self.coef_ = None

        _, coefs, dual_gaps = mtl_path(X,
                                       y,
                                       alphas=[self.alpha],
                                       coef_init=self.coef_,
                                       max_iter=self.max_iter,
                                       max_epochs=self.max_epochs,
                                       p0=self.p0,
                                       verbose=self.verbose,
                                       tol=self.tol,
                                       prune=self.prune)

        self.coef_, self.dual_gap_ = coefs[..., 0], dual_gaps[-1]
        self.n_iter_ = len(dual_gaps)
        self._set_intercept(X_offset, y_offset, X_scale)

        return self
Code example #12
def test_preprocess_copy_data_no_checks(is_sparse, to_copy):
    X, y = make_regression()
    X[X < 2.5] = 0.0

    if is_sparse:
        X = sparse.csr_matrix(X)

    X_, y_, _, _, _ = _preprocess_data(X, y, True, copy=to_copy, check_input=False)

    if to_copy and is_sparse:
        assert not np.may_share_memory(X_.data, X.data)
    elif to_copy:
        assert not np.may_share_memory(X_, X)
    elif is_sparse:
        assert np.may_share_memory(X_.data, X.data)
    else:
        assert np.may_share_memory(X_, X)
Code example #13
def test_preprocess_data_weighted(is_sparse):
    n_samples = 200
    n_features = 4
    # Generate random data with 50% of zero values to make sure
    # that the sparse variant of this test is actually sparse. This also
    # shifts the mean value for each column in X further away from
    # zero.
    X = rng.rand(n_samples, n_features)
    X[X < 0.5] = 0.0

    # Scale the first feature of X to be 10 times larger than the others to
    # better check the impact of feature scaling.
    X[:, 0] *= 10

    # Constant non-zero feature.
    X[:, 2] = 1.0

    # Constant zero feature (non-materialized in the sparse case)
    X[:, 3] = 0.0
    y = rng.rand(n_samples)

    sample_weight = rng.rand(n_samples)
    expected_X_mean = np.average(X, axis=0, weights=sample_weight)
    expected_y_mean = np.average(y, axis=0, weights=sample_weight)

    X_sample_weight_avg = np.average(X, weights=sample_weight, axis=0)
    X_sample_weight_var = np.average((X - X_sample_weight_avg)**2,
                                     weights=sample_weight,
                                     axis=0)
    constant_mask = X_sample_weight_var < 10 * np.finfo(X.dtype).eps
    assert_array_equal(constant_mask, [0, 0, 1, 1])
    expected_X_scale = np.sqrt(X_sample_weight_var) * np.sqrt(
        sample_weight.sum())

    # near constant features should not be scaled
    expected_X_scale[constant_mask] = 1

    if is_sparse:
        X = sparse.csr_matrix(X)

    # normalize is False
    Xt, yt, X_mean, y_mean, X_scale = _preprocess_data(
        X,
        y,
        fit_intercept=True,
        normalize=False,
        sample_weight=sample_weight,
        return_mean=True,
    )
    assert_array_almost_equal(X_mean, expected_X_mean)
    assert_array_almost_equal(y_mean, expected_y_mean)
    assert_array_almost_equal(X_scale, np.ones(n_features))
    if is_sparse:
        assert_array_almost_equal(Xt.toarray(), X.toarray())
    else:
        assert_array_almost_equal(Xt, X - expected_X_mean)
    assert_array_almost_equal(yt, y - expected_y_mean)

    # normalize is True
    Xt, yt, X_mean, y_mean, X_scale = _preprocess_data(
        X,
        y,
        fit_intercept=True,
        normalize=True,
        sample_weight=sample_weight,
        return_mean=True,
    )

    assert_array_almost_equal(X_mean, expected_X_mean)
    assert_array_almost_equal(y_mean, expected_y_mean)
    assert_array_almost_equal(X_scale, expected_X_scale)

    if is_sparse:
        # X is not centered
        assert_array_almost_equal(Xt.toarray(), X.toarray() / expected_X_scale)
    else:
        assert_array_almost_equal(Xt, (X - expected_X_mean) / expected_X_scale)

    # _preprocess_data with normalize=True scales the data by the feature-wise
    # euclidean norms while StandardScaler scales the data by the feature-wise
    # standard deviations.
    # The two are equivalent up to a ratio of np.sqrt(n_samples) if unweighted
    # or np.sqrt(sample_weight.sum()) if weighted.
    if is_sparse:
        scaler = StandardScaler(with_mean=False).fit(
            X, sample_weight=sample_weight)

        # Non-constant features are scaled similarly, up to np.sqrt(sample_weight.sum())
        assert_array_almost_equal(
            scaler.transform(X).toarray()[:, :2] /
            np.sqrt(sample_weight.sum()),
            Xt.toarray()[:, :2],
        )

        # Constant features go through un-scaled.
        assert_array_almost_equal(
            scaler.transform(X).toarray()[:, 2:],
            Xt.toarray()[:, 2:])
    else:
        scaler = StandardScaler(with_mean=True).fit(
            X, sample_weight=sample_weight)
        assert_array_almost_equal(scaler.mean_, X_mean)
        assert_array_almost_equal(
            scaler.transform(X) / np.sqrt(sample_weight.sum()),
            Xt,
        )
    assert_array_almost_equal(yt, y - expected_y_mean)
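
The comment block above states that _preprocess_data(normalize=True) and StandardScaler agree up to a factor of np.sqrt(sample_weight.sum()) in the weighted case. A small self-contained check of that relationship, using the same weighted mean and variance formulas as the test (a sketch under those assumptions, not scikit-learn internals):

import numpy as np

rng = np.random.RandomState(0)
X = rng.rand(50, 3)
w = rng.rand(50)

mean_w = np.average(X, axis=0, weights=w)
std_w = np.sqrt(np.average((X - mean_w) ** 2, axis=0, weights=w))

# Feature-wise scale the test above expects from _preprocess_data(normalize=True).
preprocess_scale = std_w * np.sqrt(w.sum())

# Dividing a StandardScaler-style transform by sqrt(w.sum()) reproduces the
# _preprocess_data-style transform for non-constant features.
assert np.allclose((X - mean_w) / std_w / np.sqrt(w.sum()),
                   (X - mean_w) / preprocess_scale)
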
Code example #14
File: sgl.py  Project: richford/groupyr
def _alpha_grid(
    X,
    y,
    Xy=None,
    groups=None,
    scale_l2_by="group_length",
    l1_ratio=1.0,
    fit_intercept=True,
    eps=1e-3,
    n_alphas=100,
    normalize=False,
    copy_X=True,
    model=SGL,
):
    """Compute the grid of alpha values for elastic net parameter search.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        Training data. Pass directly as Fortran-contiguous data to avoid
        unnecessary memory duplication

    y : ndarray of shape (n_samples,)
        Target values

    Xy : array-like of shape (n_features,), default=None
        Xy = np.dot(X.T, y) that can be precomputed. If supplying ``Xy``,
        prevent train/test leakage by ensuring that ``Xy`` is precomputed
        using only training data.

    groups : list of numpy.ndarray
        list of arrays of non-overlapping indices for each group. For
        example, if nine features are grouped into equal contiguous groups of
        three, then groups would be ``[array([0, 1, 2]), array([3, 4, 5]),
        array([6, 7, 8])]``. If the feature matrix contains a bias or
        intercept feature, do not include it as a group. If None, all
        features will belong to one group.

    scale_l2_by : ["group_length", None], default="group_length"
        Scaling technique for the group-wise L2 penalty.
        By default, ``scale_l2_by="group_length"`` and the L2 penalty is
        scaled by the square root of the group length so that each variable
        has the same effect on the penalty. This may not be appropriate for
        one-hot encoded features and ``scale_l2_by=None`` would be more
        appropriate for that case. ``scale_l2_by=None`` will also reproduce
        ElasticNet results when all features belong to one group.

    l1_ratio : float, default=1.0
        The elastic net mixing parameter, with ``0 < l1_ratio <= 1``.
        For ``l1_ratio = 0`` the penalty is an L2 penalty (currently not
        supported). For ``l1_ratio = 1`` it is an L1 penalty. For
        ``0 < l1_ratio < 1``, the penalty is a combination of L1 and L2.

    eps : float, default=1e-3
        Length of the path. ``eps=1e-3`` means that
        ``alpha_min / alpha_max = 1e-3``

    n_alphas : int, default=100
        Number of alphas along the regularization path

    fit_intercept : bool, default=True
        Whether to fit an intercept or not

    normalize : bool, default=False
        This parameter is ignored when ``fit_intercept`` is set to False.
        If True, the regressors X will be normalized before regression by
        subtracting the mean and dividing by the l2-norm.
        If you wish to standardize, please use
        :class:`sklearn.preprocessing.StandardScaler` before calling ``fit``
        on an estimator with ``normalize=False``.

    copy_X : bool, default=True
        If ``True``, X will be copied; else, it may be overwritten.

    model : class, default=SGL
        The estimator class that will be used to confirm that alpha_max sets
        all coef values to zero. The default value of ``model=SGL`` is
        appropriate for regression while ``model=LogisticSGL`` is appropriate
        for classification.
    """
    if l1_ratio == 1.0:
        return _lasso_alpha_grid(
            X=X,
            y=y,
            Xy=Xy,
            l1_ratio=l1_ratio,
            fit_intercept=fit_intercept,
            eps=eps,
            n_alphas=n_alphas,
            normalize=normalize,
            copy_X=copy_X,
        )

    n_samples = len(y)
    if Xy is None:
        X = check_array(X,
                        accept_sparse=False,
                        copy=(copy_X and fit_intercept))
        X, y, _, _, _ = _preprocess_data(X,
                                         y,
                                         fit_intercept,
                                         normalize,
                                         copy=False)
        Xy = safe_sparse_dot(X.T, y, dense_output=True)

    if Xy.ndim == 1:
        Xy = Xy[:, np.newaxis]

    groups = check_groups(groups, X, allow_overlap=False, fit_intercept=False)

    if scale_l2_by not in ["group_length", None]:
        raise ValueError("scale_l2_by must be 'group_length' or None; "
                         "got {0}".format(scale_l2_by))

    # When l1_ratio < 1 (i.e. not the lasso), then for each group, the
    # smallest alpha for which coef_ = 0 minimizes the objective will be
    # achieved when
    #
    # || S(Xy / n_samples, l1_ratio * alpha) ||_2 == sqrt(p_l) * (1 - l1_ratio) * alpha
    #
    # where S() is the element-wise soft-thresholding operator and p_l is
    # the group size (or 1 if ``scale_l2_by is None``)
    def beta_zero_root(alpha, group):
        soft = _soft_threshold(Xy[group] / n_samples, l1_ratio * alpha)
        scale = np.sqrt(group.size) if scale_l2_by == "group_length" else 1
        return np.linalg.norm(soft) - (1 - l1_ratio) * alpha * scale

    # We use the brentq method to find the root, which requires a bracket
    # within which to find the root. We know that ``beta_zero_root`` will
    # be positive when alpha=0. In order to ensure that the upper limit
    # brackets the root, we increase the upper limit until
    # ``beta_zero_root`` returns a negative number for all groups
    def bracket_too_low(alpha):
        return any([beta_zero_root(alpha, group=grp) > 0 for grp in groups])

    upper_bracket_lim = 1e1
    while bracket_too_low(upper_bracket_lim):
        upper_bracket_lim *= 10

    min_alphas = np.array([
        root_scalar(
            partial(beta_zero_root, group=grp),
            bracket=[0, upper_bracket_lim],
            method="brentq",
        ).root for grp in groups
    ])

    alpha_max = np.max(min_alphas) * 1.2

    # Test feature sparsity just to make sure we're on the right side of the root
    while (  # pragma: no cover
            model(
                groups=groups,
                alpha=alpha_max,
                l1_ratio=l1_ratio,
                fit_intercept=fit_intercept,
                scale_l2_by=scale_l2_by,
            ).fit(X, y).chosen_features_.size > 0):
        alpha_max *= 1.2  # pragma: no cover

    if alpha_max <= np.finfo(float).resolution:
        alphas = np.empty(n_alphas)
        alphas.fill(np.finfo(float).resolution)
        return alphas

    return np.logspace(np.log10(alpha_max * eps),
                       np.log10(alpha_max),
                       num=n_alphas)[::-1]
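
The alpha-grid derivation above relies on the element-wise soft-thresholding operator S(z, t) = sign(z) * max(|z| - t, 0). groupyr's private _soft_threshold helper is not shown here; the following minimal version is assumed to behave equivalently for the purposes of the formula in the comments.

import numpy as np

def soft_threshold(z, threshold):
    # Shrink every entry of z towards zero by `threshold`, clipping entries
    # whose magnitude is below `threshold` to exactly zero.
    return np.sign(z) * np.maximum(np.abs(z) - threshold, 0.0)

# soft_threshold(np.array([-3.0, -0.5, 0.2, 2.0]), 1.0) -> array([-2., 0., 0., 1.])
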
Code example #15
def ridge_regression(
    X_train,
    X_test,
    y_train,
    y_test,
    svd_solve=False,
    lambdas=[1e2],
    return_preds=True,
    return_model=False,
    clip_bounds=None,
    intercept=False,
    allow_linalg_warning_instances=False,
):
    """Train ridge regression model for a series of regularization parameters.
    Optionally clip the predictions to bounds. Used as the default solve_function
    argument for single_solve() and kfold_solve() below.

    Parameters
    ----------
        X_{train,test} : :class:`numpy.ndarray`
            Features for training/test data (n_obs_{train,test} X n_ftrs 2darray).
        y_{train,test} : :class:`numpy.ndarray`
            Labels for training/test data (n_obs_{train,test} X n_outcomes 2darray).
        svd_solve : bool, optional
            If true, uses SVD to compute w^*, otherwise does matrix inverse for each
            lambda.
        lambdas : list of floats, optional
            Regularization values to sweep over.
        return_preds : bool, optional
            Whether to return predictions for training and test sets.
        return_model : bool, optional
            Whether to return the trained weights that define the ridge regression
            model.
        clip_bounds : array-like, optional
            If None, do not clip predictions. If not None, must be an array of
            dimension ``n_outcomes X 2``. If any of the elements of the array are None,
            ignore that bound (e.g. if a row of the array is [None, 10], apply an upper
            bound of 10 but no lower bound).
        intercept : bool, optional
            Whether to add an unregularized intercept (or, equivalently, center the X and
            Y data).
        allow_linalg_warning_instances : bool, optional
            If False (default), track the hyperparameters for which ``scipy.linalg``
            raised an ill-conditioned matrix warning, which could lead to poor
            performance. This is used to discard those models in a cross-validation
            context. If True,
            allow these models to be included in the hyperparameter grid search. Note
            that these errors will not occur when using ``cupy.linalg`` (i.e. if a GPU
            is detected), so the default setting may give differing results across
            platforms.

    Returns
    -------
    dict of :class:`numpy.ndarray`
        The results dictionary will always include the following key/value pairs:
            ``metrics_{test,train}`` : array of dimension n_outcomes X n_lambdas
                Each element is a dictionary of {Out-of,In}-sample model performance
                metrics for each lambda

        If ``return_preds``, the following arrays will be appended in order:
            ``y_pred_{test,train}`` : array of dimension n_outcomes X n_lambdas
                Each element is itself a 1darray of {Out-of,In}-sample predictions for
                each lambda. Each 1darray contains n_obs_{test,train} values

        If ``return_model``, the following array will be appended:
            ``models`` : array of dimension n_outcomes X n_lambdas:
                Each element is itself a 1darray of model weights for each lambda. Each
                1darray contains n_ftrs values
    """

    # get dimensions needed to shape arrays
    n_ftrs, n_outcomes, n_obs_train, n_obs_test = get_dim_lengths(
        X_train, y_train, y_test)
    n_lambdas = len(lambdas)

    # center data if needed
    X_train, y_train, X_offset, y_offset, _ = _preprocess_data(X_train,
                                                               y_train,
                                                               intercept,
                                                               normalize=False)

    # set up the data structures for reporting results
    results_dict = _initialize_results_arrays((n_outcomes, n_lambdas),
                                              return_preds, return_model)

    t1 = time.time()

    # send to GPU if available
    X_train = xp.asarray(X_train)
    y_train = xp.asarray(y_train)

    if DEBUG:
        if GPU:
            print(
                f"Time to transfer X_train and y_train to GPU: {time.time() - t1}"
            )
        t1 = time.time()

    # precomputing large matrices to avoid redundant computation
    if svd_solve:
        # precompute the SVD
        U, s, Vh = linalg.svd(X_train, full_matrices=False)
        V = Vh.T
        UT_dot_y_train = U.T.dot(y_train)
    else:
        XtX = X_train.T.dot(X_train)
        XtY = X_train.T.dot(y_train)

    if DEBUG:
        t2 = time.time()
        print("Time to create XtX matrix:", t2 - t1)

    # iterate over the lambda regularization values
    training_time = 0
    pred_time = 0
    for lx, lambdan in enumerate(lambdas):
        if DEBUG:
            t3 = time.time()

        # train model
        if svd_solve:
            s_lambda = s / (s**2 + lambdan * xp.ones_like(s))
            model = (V * s_lambda).dot(UT_dot_y_train)
            lambda_warning = None
        else:
            with warnings.catch_warnings(record=True) as w:
                # bind warnings to the value of w
                warnings.simplefilter("always")
                lambda_warning = False
                model = linalg.solve(
                    XtX + lambdan * xp.eye(n_ftrs, dtype=np.float64),
                    XtY,
                    **linalg_solve_kwargs,
                )

                # if there is a warning
                if len(w) > 1:
                    for this_w in w:
                        print(this_w.message)
                    # more than one warning is bad
                    raise Exception(
                        "warning/exception other than LinAlgWarning")
                if len(w) > 0:
                    # if it is a linalg warning
                    if w[0].category == LinAlgWarning:
                        print("linalg warning on lambda={0}: ".format(lambdan),
                              end="")
                        # linalg warning
                        if not allow_linalg_warning_instances:
                            print(
                                "we will discard this model upon model selection"
                            )
                            lambda_warning = True
                        else:
                            lambda_warning = None
                            print(
                                "we will allow this model upon model selection"
                            )
                    else:
                        raise Exception(
                            "warning/exception other than LinAlgWarning")

        if DEBUG:
            t4 = time.time()
            training_time += t4 - t3
            print(f"Training time for lambda {lambdan}: {t4 - t3}")

        #####################
        # compute predictions
        #####################

        # send to gpu if available
        X_test = xp.asarray(X_test)
        y_test = xp.asarray(y_test)
        y_offset = xp.asarray(y_offset)
        X_offset = xp.asarray(X_offset)

        if DEBUG:
            t5 = time.time()

        # train
        pred_train = X_train.dot(model) + y_offset
        pred_train = y_to_matrix(pred_train)

        # test
        pred_test = X_test.dot(model) - X_offset.dot(model) + y_offset
        pred_test = y_to_matrix(pred_test)

        # clip if needed
        if clip_bounds is not None:
            for ix, i in enumerate(clip_bounds):
                # skip clipping only when both bounds are None for this outcome
                if not (i == None).all():
                    pred_train[:, ix] = xp.clip(pred_train[:, ix], *i)
                    pred_test[:, ix] = xp.clip(pred_test[:, ix], *i)

        if DEBUG:
            t6 = time.time()
            pred_time += t6 - t5

        # bring back to cpu if needed
        pred_train, pred_test = asnumpy(pred_train), asnumpy(pred_test)
        y_train, y_test, model = (
            y_to_matrix(asnumpy(y_train)),
            y_to_matrix(asnumpy(y_test)),
            y_to_matrix(asnumpy(model)),
        )

        # create tuple of lambda index to match argument structure
        # of _fill_results_arrays function
        hp_tuple = (lx, )

        # Transpose model results so that n_outcomes is first dimension
        # so that _fill_results_array can handle it
        model = model.T

        # populate results dict with results from this lambda
        results_dict = _fill_results_arrays(
            y_train,
            y_test,
            pred_train,
            pred_test,
            model,
            hp_tuple,
            results_dict,
            hp_warning=lambda_warning,
        )
    if DEBUG:
        print("Training time:", training_time)
        print("Prediction time:", pred_time)
        print("Total time:", time.time() - t1)
    return results_dict
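
A minimal NumPy-only sketch of the two solve paths used above (assuming xp is plain NumPy): the direct solve of (X^T X + lambda * I) w = X^T y and the SVD shortcut w = V @ diag(s / (s**2 + lambda)) @ U^T y give the same ridge coefficients, which is why the function can switch between them via svd_solve.

import numpy as np

rng = np.random.RandomState(0)
X, y = rng.rand(30, 5), rng.rand(30)
lam = 10.0

# Direct path: solve the regularized normal equations.
w_solve = np.linalg.solve(X.T @ X + lam * np.eye(X.shape[1]), X.T @ y)

# SVD path: reuse the thin SVD of X for every lambda.
U, s, Vh = np.linalg.svd(X, full_matrices=False)
w_svd = (Vh.T * (s / (s ** 2 + lam))) @ (U.T @ y)

assert np.allclose(w_solve, w_svd)
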
Code example #16
def test_dtype_preprocess_data():
    n_samples = 200
    n_features = 2
    X = rng.rand(n_samples, n_features)
    y = rng.rand(n_samples)

    X_32 = np.asarray(X, dtype=np.float32)
    y_32 = np.asarray(y, dtype=np.float32)
    X_64 = np.asarray(X, dtype=np.float64)
    y_64 = np.asarray(y, dtype=np.float64)

    for fit_intercept in [True, False]:
        for normalize in [True, False]:

            Xt_32, yt_32, X_mean_32, y_mean_32, X_scale_32 = _preprocess_data(
                X_32,
                y_32,
                fit_intercept=fit_intercept,
                normalize=normalize,
                return_mean=True,
            )

            Xt_64, yt_64, X_mean_64, y_mean_64, X_scale_64 = _preprocess_data(
                X_64,
                y_64,
                fit_intercept=fit_intercept,
                normalize=normalize,
                return_mean=True,
            )

            Xt_3264, yt_3264, X_mean_3264, y_mean_3264, X_scale_3264 = _preprocess_data(
                X_32,
                y_64,
                fit_intercept=fit_intercept,
                normalize=normalize,
                return_mean=True,
            )

            Xt_6432, yt_6432, X_mean_6432, y_mean_6432, X_scale_6432 = _preprocess_data(
                X_64,
                y_32,
                fit_intercept=fit_intercept,
                normalize=normalize,
                return_mean=True,
            )

            assert Xt_32.dtype == np.float32
            assert yt_32.dtype == np.float32
            assert X_mean_32.dtype == np.float32
            assert y_mean_32.dtype == np.float32
            assert X_scale_32.dtype == np.float32

            assert Xt_64.dtype == np.float64
            assert yt_64.dtype == np.float64
            assert X_mean_64.dtype == np.float64
            assert y_mean_64.dtype == np.float64
            assert X_scale_64.dtype == np.float64

            assert Xt_3264.dtype == np.float32
            assert yt_3264.dtype == np.float32
            assert X_mean_3264.dtype == np.float32
            assert y_mean_3264.dtype == np.float32
            assert X_scale_3264.dtype == np.float32

            assert Xt_6432.dtype == np.float64
            assert yt_6432.dtype == np.float64
            assert X_mean_6432.dtype == np.float64
            assert y_mean_6432.dtype == np.float64
            assert X_scale_6432.dtype == np.float64

            assert X_32.dtype == np.float32
            assert y_32.dtype == np.float32
            assert X_64.dtype == np.float64
            assert y_64.dtype == np.float64

            assert_array_almost_equal(Xt_32, Xt_64)
            assert_array_almost_equal(yt_32, yt_64)
            assert_array_almost_equal(X_mean_32, X_mean_64)
            assert_array_almost_equal(y_mean_32, y_mean_64)
            assert_array_almost_equal(X_scale_32, X_scale_64)
Code example #17
def test_preprocess_data_weighted(is_sparse):
    n_samples = 200
    n_features = 4
    # Generate random data with 50% of zero values to make sure
    # that the sparse variant of this test is actually sparse. This also
    # shifts the mean value for each column in X further away from
    # zero.
    X = rng.rand(n_samples, n_features)
    X[X < 0.5] = 0.

    # Scale the first feature of X to be 10 times larger than the others to
    # better check the impact of feature scaling.
    X[:, 0] *= 10

    # Constant non-zero feature: this edge-case is currently not handled
    # correctly for sparse data, see:
    # https://github.com/scikit-learn/scikit-learn/issues/19450
    # X[:, 2] = 1.

    # Constant zero feature (non-materialized in the sparse case)
    X[:, 3] = 0.
    y = rng.rand(n_samples)

    sample_weight = rng.rand(n_samples)
    expected_X_mean = np.average(X, axis=0, weights=sample_weight)
    expected_y_mean = np.average(y, axis=0, weights=sample_weight)

    X_sample_weight_avg = np.average(X, weights=sample_weight, axis=0)
    X_sample_weight_var = np.average((X - X_sample_weight_avg)**2,
                                     weights=sample_weight,
                                     axis=0)
    expected_X_scale = np.sqrt(X_sample_weight_var) * np.sqrt(n_samples)

    # near constant features should not be scaled
    expected_X_scale[expected_X_scale < 10 * np.finfo(np.float64).eps] = 1

    if is_sparse:
        X = sparse.csr_matrix(X)

    # normalize is False
    Xt, yt, X_mean, y_mean, X_scale = \
        _preprocess_data(X, y, fit_intercept=True, normalize=False,
                         sample_weight=sample_weight, return_mean=True)
    assert_array_almost_equal(X_mean, expected_X_mean)
    assert_array_almost_equal(y_mean, expected_y_mean)
    assert_array_almost_equal(X_scale, np.ones(n_features))
    if is_sparse:
        assert_array_almost_equal(Xt.toarray(), X.toarray())
    else:
        assert_array_almost_equal(Xt, X - expected_X_mean)
    assert_array_almost_equal(yt, y - expected_y_mean)

    # normalize is True
    Xt, yt, X_mean, y_mean, X_scale = \
        _preprocess_data(X, y, fit_intercept=True, normalize=True,
                         sample_weight=sample_weight, return_mean=True)

    assert_array_almost_equal(X_mean, expected_X_mean)
    assert_array_almost_equal(y_mean, expected_y_mean)
    assert_array_almost_equal(X_scale, expected_X_scale)

    if is_sparse:
        # X is not centered
        assert_array_almost_equal(Xt.toarray(), X.toarray() / expected_X_scale)
    else:
        assert_array_almost_equal(Xt, (X - expected_X_mean) / expected_X_scale)

    # _preprocess_data with normalize=True scales the data by the feature-wise
    # euclidean norms while StandardScaler scales the data by the feature-wise
    # standard deviations.
    # The two are equivalent up to a ratio of np.sqrt(n_samples)
    if is_sparse:
        scaler = StandardScaler(with_mean=False).fit(
            X, sample_weight=sample_weight)

        assert_array_almost_equal(
            scaler.transform(X).toarray() / np.sqrt(n_samples), Xt.toarray())
    else:
        scaler = StandardScaler(with_mean=True).fit(
            X, sample_weight=sample_weight)
        assert_array_almost_equal(scaler.mean_, X_mean)
        assert_array_almost_equal(scaler.transform(X) / np.sqrt(n_samples), Xt)
    assert_array_almost_equal(yt, y - expected_y_mean)