Ejemplo n.º 1
0
def _cholesky_omp(X, y, n_nonzero_coefs, tol=None, copy_X=True):
    """Orthogonal Matching Pursuit step using the Cholesky decomposition.

Parameters:
-----------
X: array, shape = (n_samples, n_features)
Input dictionary. Columns are assumed to have unit norm.

y: array, shape = (n_samples,)
Input targets

n_nonzero_coefs: int
Targeted number of non-zero elements

tol: float
Targeted squared error, if not None overrides n_nonzero_coefs.

copy_X: bool, optional
Whether the design matrix X must be copied by the algorithm. A false
value is only helpful if X is already Fortran-ordered, otherwise a
copy is made anyway.

Returns:
--------
gamma: array, shape = (n_nonzero_coefs,)
Non-zero elements of the solution

idx: array, shape = (n_nonzero_coefs,)
Indices of the positions of the elements in gamma within the solution
vector

"""
    if copy_X:
        X = X.copy('F')
    else: # even if we are allowed to overwrite, still copy it if bad order
        X = np.asfortranarray(X)

    min_float = np.finfo(X.dtype).eps
    nrm2, swap = linalg.get_blas_funcs(('nrm2', 'swap'), (X,))
    potrs, = get_lapack_funcs(('potrs',), (X,))

    alpha = np.dot(X.T, y)
    residual = y
    n_active = 0
    indices = range(X.shape[1]) # keeping track of swapping

    #max_features = X.shape[1] if tol is not None else n_nonzero_coefs
    # Nic: tol not None should not overide n_nonzero_coefs, but act together
    max_features = n_nonzero_coefs
    
    L = np.empty((max_features, max_features), dtype=X.dtype)
    L[0, 0] = 1.

    while True:
        lam = np.argmax(np.abs(np.dot(X.T, residual)))
        if lam < n_active or alpha[lam] ** 2 < min_float:
            # atom already selected or inner product too small
            warn(premature)
            break
        if n_active > 0:
            # Updates the Cholesky decomposition of X' X
            L[n_active, :n_active] = np.dot(X[:, :n_active].T, X[:, lam])
            solve_triangular(L[:n_active, :n_active], L[n_active, :n_active])
            v = nrm2(L[n_active, :n_active]) ** 2
            if 1 - v <= min_float: # selected atoms are dependent
                warn(premature)
                break
            L[n_active, n_active] = np.sqrt(1 - v)
        X.T[n_active], X.T[lam] = swap(X.T[n_active], X.T[lam])
        alpha[n_active], alpha[lam] = alpha[lam], alpha[n_active]
        indices[n_active], indices[lam] = indices[lam], indices[n_active]
        n_active += 1
        # solves LL'x = y as a composition of two triangular systems
        gamma, _ = potrs(L[:n_active, :n_active], alpha[:n_active], lower=True,
                         overwrite_b=False)

        residual = y - np.dot(X[:, :n_active], gamma)
        if tol is not None and nrm2(residual) ** 2 <= tol:
            break
        #elif n_active == max_features:
        # Nic: tol not None should not overide n_nonzero_coefs, but act together
        if n_active == max_features:
            break

    return gamma, indices[:n_active]
Ejemplo n.º 2
0
def _gram_omp(Gram, Xy, n_nonzero_coefs, tol_0=None, tol=None,
              copy_Gram=True, copy_Xy=True):
    """Orthogonal Matching Pursuit step on a precomputed Gram matrix.

This function uses the the Cholesky decomposition method.

Parameters:
-----------
Gram: array, shape = (n_features, n_features)
Gram matrix of the input data matrix

Xy: array, shape = (n_features,)
Input targets

n_nonzero_coefs: int
Targeted number of non-zero elements

tol_0: float
Squared norm of y, required if tol is not None.

tol: float
Targeted squared error, if not None overrides n_nonzero_coefs.

copy_Gram: bool, optional
Whether the gram matrix must be copied by the algorithm. A false
value is only helpful if it is already Fortran-ordered, otherwise a
copy is made anyway.

copy_Xy: bool, optional
Whether the covariance vector Xy must be copied by the algorithm.
If False, it may be overwritten.

Returns:
--------
gamma: array, shape = (n_nonzero_coefs,)
Non-zero elements of the solution

idx: array, shape = (n_nonzero_coefs,)
Indices of the positions of the elements in gamma within the solution
vector

"""
    Gram = Gram.copy('F') if copy_Gram else np.asfortranarray(Gram)

    if copy_Xy:
        Xy = Xy.copy()

    min_float = np.finfo(Gram.dtype).eps
    nrm2, swap = linalg.get_blas_funcs(('nrm2', 'swap'), (Gram,))
    potrs, = get_lapack_funcs(('potrs',), (Gram,))

    indices = range(len(Gram)) # keeping track of swapping
    alpha = Xy
    tol_curr = tol_0
    delta = 0
    n_active = 0

    max_features = len(Gram) if tol is not None else n_nonzero_coefs
    L = np.empty((max_features, max_features), dtype=Gram.dtype)
    L[0, 0] = 1.

    while True:
        lam = np.argmax(np.abs(alpha))
        if lam < n_active or alpha[lam] ** 2 < min_float:
            # selected same atom twice, or inner product too small
            warn(premature)
            break
        if n_active > 0:
            L[n_active, :n_active] = Gram[lam, :n_active]
            solve_triangular(L[:n_active, :n_active], L[n_active, :n_active])
            v = nrm2(L[n_active, :n_active]) ** 2
            if 1 - v <= min_float: # selected atoms are dependent
                warn(premature)
                break
            L[n_active, n_active] = np.sqrt(1 - v)
        Gram[n_active], Gram[lam] = swap(Gram[n_active], Gram[lam])
        Gram.T[n_active], Gram.T[lam] = swap(Gram.T[n_active], Gram.T[lam])
        indices[n_active], indices[lam] = indices[lam], indices[n_active]
        Xy[n_active], Xy[lam] = Xy[lam], Xy[n_active]
        n_active += 1
        # solves LL'x = y as a composition of two triangular systems
        gamma, _ = potrs(L[:n_active, :n_active], Xy[:n_active], lower=True,
                         overwrite_b=False)

        beta = np.dot(Gram[:, :n_active], gamma)
        alpha = Xy - beta
        if tol is not None:
            tol_curr += delta
            delta = np.inner(gamma, beta[:n_active])
            tol_curr -= delta
            if tol_curr <= tol:
                break
        elif n_active == max_features:
            break

    return gamma, indices[:n_active]
Ejemplo n.º 3
0
def _cholesky_omp(X, y, n_nonzero_coefs, tol=None, copy_X=True):
    """Orthogonal Matching Pursuit step using the Cholesky decomposition.

Parameters:
-----------
X: array, shape = (n_samples, n_features)
Input dictionary. Columns are assumed to have unit norm.

y: array, shape = (n_samples,)
Input targets

n_nonzero_coefs: int
Targeted number of non-zero elements

tol: float
Targeted squared error, if not None overrides n_nonzero_coefs.

copy_X: bool, optional
Whether the design matrix X must be copied by the algorithm. A false
value is only helpful if X is already Fortran-ordered, otherwise a
copy is made anyway.

Returns:
--------
gamma: array, shape = (n_nonzero_coefs,)
Non-zero elements of the solution

idx: array, shape = (n_nonzero_coefs,)
Indices of the positions of the elements in gamma within the solution
vector

"""
    if copy_X:
        X = X.copy('F')
    else:  # even if we are allowed to overwrite, still copy it if bad order
        X = np.asfortranarray(X)

    min_float = np.finfo(X.dtype).eps
    nrm2, swap = linalg.get_blas_funcs(('nrm2', 'swap'), (X, ))
    potrs, = get_lapack_funcs(('potrs', ), (X, ))

    alpha = np.dot(X.T, y)
    residual = y
    n_active = 0
    indices = range(X.shape[1])  # keeping track of swapping

    #max_features = X.shape[1] if tol is not None else n_nonzero_coefs
    # Nic: tol not None should not overide n_nonzero_coefs, but act together
    max_features = n_nonzero_coefs

    L = np.empty((max_features, max_features), dtype=X.dtype)
    L[0, 0] = 1.

    while True:
        lam = np.argmax(np.abs(np.dot(X.T, residual)))
        if lam < n_active or alpha[lam]**2 < min_float:
            # atom already selected or inner product too small
            warn(premature)
            break
        if n_active > 0:
            # Updates the Cholesky decomposition of X' X
            L[n_active, :n_active] = np.dot(X[:, :n_active].T, X[:, lam])
            solve_triangular(L[:n_active, :n_active], L[n_active, :n_active])
            v = nrm2(L[n_active, :n_active])**2
            if 1 - v <= min_float:  # selected atoms are dependent
                warn(premature)
                break
            L[n_active, n_active] = np.sqrt(1 - v)
        X.T[n_active], X.T[lam] = swap(X.T[n_active], X.T[lam])
        alpha[n_active], alpha[lam] = alpha[lam], alpha[n_active]
        indices[n_active], indices[lam] = indices[lam], indices[n_active]
        n_active += 1
        # solves LL'x = y as a composition of two triangular systems
        gamma, _ = potrs(L[:n_active, :n_active],
                         alpha[:n_active],
                         lower=True,
                         overwrite_b=False)

        residual = y - np.dot(X[:, :n_active], gamma)
        if tol is not None and nrm2(residual)**2 <= tol:
            break
        #elif n_active == max_features:
        # Nic: tol not None should not overide n_nonzero_coefs, but act together
        if n_active == max_features:
            break

    return gamma, indices[:n_active]
Ejemplo n.º 4
0
def lars_path(X, y, Xy=None, Gram=None, max_iter=500,
              alpha_min=0, method='lar', copy_X=True,
              eps=np.finfo(np.float).eps,
              copy_Gram=True, verbose=False, return_path=True,
              group_ids=None, positive=False):
    """Compute Least Angle Regression and Lasso path

    The optimization objective for Lasso is::

    (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1

    Parameters
    -----------
    X: array, shape: (n_samples, n_features)
        Input data

    y: array, shape: (n_samples)
        Input targets

    max_iter: integer, optional
        Maximum number of iterations to perform, set to infinity for no limit.

    Gram: None, 'auto', array, shape: (n_features, n_features), optional
        Precomputed Gram matrix (X' * X), if 'auto', the Gram
        matrix is precomputed from the given X, if there are more samples
        than features

    alpha_min: float, optional
        Minimum correlation along the path. It corresponds to the
        regularization parameter alpha parameter in the Lasso.

    method: {'lar', 'lasso'}
        Specifies the returned model. Select 'lar' for Least Angle
        Regression, 'lasso' for the Lasso.

    eps: float, optional
        The machine-precision regularization in the computation of the
        Cholesky diagonal factors. Increase this for very ill-conditioned
        systems.

    copy_X: bool
        If False, X is overwritten.

    copy_Gram: bool
        If False, Gram is overwritten.

    Returns
    --------
    alphas: array, shape: (max_features + 1,)
        Maximum of covariances (in absolute value) at each iteration.

    active: array, shape (max_features,)
        Indices of active variables at the end of the path.

    coefs: array, shape (n_features, max_features + 1)
        Coefficients along the path

    See also
    --------
    lasso_path
    LassoLars
    Lars
    LassoLarsCV
    LarsCV
    sklearn.decomposition.sparse_encode

    Notes
    ------
    * http://en.wikipedia.org/wiki/Least-angle_regression

    * http://en.wikipedia.org/wiki/Lasso_(statistics)#LASSO_method
    """
    if group_ids is not None:
        group_ids = np.array(group_ids).copy()
        max_iter = len(np.unique(group_ids))

    n_features = X.shape[1]
    n_samples = y.size
    max_features = min(max_iter, n_features)

    if return_path:
        coefs = np.zeros((max_features + 1, n_features))
        alphas = np.zeros(max_features + 1)
    else:
        coef, prev_coef = np.zeros(n_features), np.zeros(n_features)
        alpha, prev_alpha = np.array([0.]), np.array([0.])  # better ideas?

    n_iter, n_active = 0, 0
    active, indices = list(), np.arange(n_features)
    # holds the sign of covariance
    sign_active = np.empty(max_features, dtype=np.int8)
    drop = False

    # will hold the cholesky factorization. Only lower part is
    # referenced.
    L = np.empty((max_features, max_features), dtype=X.dtype)
    swap, nrm2 = linalg.get_blas_funcs(('swap', 'nrm2'), (X,))
    solve_cholesky, = get_lapack_funcs(('potrs',), (X,))

    if Gram is None:
        if copy_X:
            # force copy. setting the array to be fortran-ordered
            # speeds up the calculation of the (partial) Gram matrix
            # and allows to easily swap columns
            X = X.copy('F')
    elif Gram == 'auto':
        Gram = None
        if X.shape[0] > X.shape[1]:
            Gram = np.dot(X.T, X)
    elif copy_Gram:
            Gram = Gram.copy()

    if Xy is None:
        Cov = np.dot(X.T, y)
    else:
        Cov = Xy.copy()

    if verbose:
        if verbose > 1:
            print "Step\t\tAdded\t\tDropped\t\tActive set size\t\tC"
        else:
            sys.stdout.write('.')
            sys.stdout.flush()

    tiny = np.finfo(np.float).tiny  # to avoid division by 0 warning
    tiny32 = np.finfo(np.float32).tiny  # to avoid division by 0 warning

    if group_ids is not None:
        selected_group = list()

    while True:
        if Cov.size:
            if group_ids is None:
                if positive:
                    C_idx = np.argmax(Cov)
                else:
                    C_idx = np.argmax(np.abs(Cov))
                C_ = Cov[C_idx]
                if C_ <= 0 and positive:
                    break
                C = np.fabs(C_)
            else:
                if positive:
                    tmp = Cov
                else:
                    tmp = np.abs(Cov)
                already_selected = np.zeros(len(tmp), dtype=np.bool)
                for gid in selected_group:
                    already_selected[group_ids == gid] = True
                tmp[already_selected] = 0.
                C_idx = np.argmax(tmp)
                C_ = Cov[C_idx]
                if C_ <= 0 and positive:
                    break
                C = np.fabs(C_)
                selected_group.append(group_ids[C_idx])
        else:
            C = 0.

        if return_path:
            alpha = alphas[n_iter, np.newaxis]
            coef = coefs[n_iter]
            prev_alpha = alphas[n_iter - 1, np.newaxis]
            prev_coef = coefs[n_iter - 1]

        alpha[0] = C / n_samples
        if alpha[0] < alpha_min:  # early stopping
            # interpolation factor 0 <= ss < 1
            if n_iter > 0:
                # In the first iteration, all alphas are zero, the formula
                # below would make ss a NaN
                ss = (prev_alpha[0] - alpha_min) / (prev_alpha[0] - alpha[0])
                coef[:] = prev_coef + ss * (coef - prev_coef)
            alpha[0] = alpha_min
            if return_path:
                coefs[n_iter] = coef
            break

        if n_iter >= max_iter or n_active >= n_features:
            break

        if not drop:

            ##########################################################
            # Append x_j to the Cholesky factorization of (Xa * Xa') #
            #                                                        #
            #            ( L   0 )                                   #
            #     L  ->  (       )  , where L * w = Xa' x_j          #
            #            ( w   z )    and z = ||x_j||                #
            #                                                        #
            ##########################################################

            sign_active[n_active] = np.sign(C_)
            m, n = n_active, C_idx + n_active

            Cov[C_idx], Cov[0] = swap(Cov[C_idx], Cov[0])
            indices[n], indices[m] = indices[m], indices[n]
            Cov = Cov[1:]  # remove Cov[0]

            if group_ids is not None:
                group_ids[C_idx], group_ids[0] = group_ids[0], group_ids[C_idx]
                group_ids = group_ids[1:]  # remove group_ids[0]

            if Gram is None:
                X.T[n], X.T[m] = swap(X.T[n], X.T[m])
                c = nrm2(X.T[n_active]) ** 2
                L[n_active, :n_active] = \
                    np.dot(X.T[n_active], X.T[:n_active].T)
            else:
                # swap does only work inplace if matrix is fortran
                # contiguous ...
                Gram[m], Gram[n] = swap(Gram[m], Gram[n])
                Gram[:, m], Gram[:, n] = swap(Gram[:, m], Gram[:, n])
                c = Gram[n_active, n_active]
                L[n_active, :n_active] = Gram[n_active, :n_active]

            # Update the cholesky decomposition for the Gram matrix
            arrayfuncs.solve_triangular(L[:n_active, :n_active],
                                        L[n_active, :n_active])
            v = np.dot(L[n_active, :n_active], L[n_active, :n_active])
            diag = max(np.sqrt(np.abs(c - v)), eps)
            L[n_active, n_active] = diag

            active.append(indices[n_active])
            n_active += 1

            if verbose > 1:
                print "%s\t\t%s\t\t%s\t\t%s\t\t%s" % (n_iter, active[-1], '',
                                                            n_active, C)
        # least squares solution
        least_squares, info = solve_cholesky(L[:n_active, :n_active],
                               sign_active[:n_active], lower=True)

        # is this really needed ?
        AA = 1. / np.sqrt(np.sum(least_squares * sign_active[:n_active]))

        if not np.isfinite(AA):
            # L is too ill-conditionned
            i = 0
            L_ = L[:n_active, :n_active].copy()
            while not np.isfinite(AA):
                L_.flat[::n_active + 1] += (2 ** i) * eps
                least_squares, info = solve_cholesky(L_,
                                    sign_active[:n_active], lower=True)
                tmp = max(np.sum(least_squares * sign_active[:n_active]), eps)
                AA = 1. / np.sqrt(tmp)
                i += 1
        least_squares *= AA

        if Gram is None:
            # equiangular direction of variables in the active set
            eq_dir = np.dot(X.T[:n_active].T, least_squares)
            # correlation between each unactive variables and
            # eqiangular vector
            corr_eq_dir = np.dot(X.T[n_active:], eq_dir)
        else:
            # if huge number of features, this takes 50% of time, I
            # think could be avoided if we just update it using an
            # orthogonal (QR) decomposition of X
            corr_eq_dir = np.dot(Gram[:n_active, n_active:].T,
                                 least_squares)


        if group_ids is not None:
            mask = np.ones(group_ids.shape,dtype=bool)
            for g in selected_group:
                mask = mask & (group_ids!=g)

            arg = ((C - Cov) / (AA - corr_eq_dir + tiny))[mask]
            g1 = arrayfuncs.min_pos(arg)
            arg = ((C + Cov) / (AA + corr_eq_dir + tiny))[mask]
            g2 = arrayfuncs.min_pos(arg)
        else:
            g1 = arrayfuncs.min_pos((C - Cov) / (AA - corr_eq_dir + tiny))
            g2 = arrayfuncs.min_pos((C + Cov) / (AA + corr_eq_dir + tiny))
        
        if positive: 
            gamma_ = min(g1, C / AA)
        else:
            gamma_ = min(g1, g2, C / AA)

        # TODO: better names for these variables: z
        drop = False
        z = -coef[active] / (least_squares + tiny32)
        z_pos = arrayfuncs.min_pos(z)
        if z_pos < gamma_:
            # some coefficients have changed sign
            idx = np.where(z == z_pos)[0]

            # update the sign, important for LAR
            sign_active[idx] = -sign_active[idx]

            if method == 'lasso':
                gamma_ = z_pos
            drop = True

        n_iter += 1

        if return_path:
            if n_iter >= coefs.shape[0]:
                del coef, alpha, prev_alpha, prev_coef
                # resize the coefs and alphas array
                add_features = 2 * max(1, (max_features - n_active))
                coefs.resize((n_iter + add_features, n_features))
                alphas.resize(n_iter + add_features)
            coef = coefs[n_iter]
            prev_coef = coefs[n_iter - 1]
            alpha = alphas[n_iter, np.newaxis]
            prev_alpha = alphas[n_iter - 1, np.newaxis]
        else:
            # mimic the effect of incrementing n_iter on the array references
            prev_coef = coef
            prev_alpha[0] = alpha[0]
            coef = np.zeros_like(coef)

        coef[active] = prev_coef[active] + gamma_ * least_squares
        
        
        # update correlations
        Cov -= gamma_ * corr_eq_dir

        # See if any coefficient has changed sign
        if drop and method == 'lasso':

            arrayfuncs.cholesky_delete(L[:n_active, :n_active], idx)

            n_active -= 1
            m, n = idx, n_active
            drop_idx = active.pop(idx)

            if group_ids is not None:
                selected_group.remove(group_ids[idx])
                group_ids = np.r_[idx,group_ids]  # remove group_ids[0]
            
            if Gram is None:
                # propagate dropped variable
                for i in range(idx, n_active):
                    X.T[i], X.T[i + 1] = swap(X.T[i], X.T[i + 1])
                    indices[i], indices[i + 1] = \
                            indices[i + 1], indices[i]  # yeah this is stupid

                # TODO: this could be updated
                residual = y - np.dot(X[:, :n_active], coef[active])
                temp = np.dot(X.T[n_active], residual)

                Cov = np.r_[temp, Cov]
            else:
                for i in range(idx, n_active):
                    indices[i], indices[i + 1] = \
                                indices[i + 1], indices[i]
                    Gram[i], Gram[i + 1] = swap(Gram[i], Gram[i + 1])
                    Gram[:, i], Gram[:, i + 1] = swap(Gram[:, i],
                                                      Gram[:, i + 1])

                # Cov_n = Cov_j + x_j * X + increment(betas) TODO:
                # will this still work with multiple drops ?

                # recompute covariance. Probably could be done better
                # wrong as Xy is not swapped with the rest of variables

                # TODO: this could be updated
                residual = y - np.dot(X, coef)
                temp = np.dot(X.T[drop_idx], residual)
                Cov = np.r_[temp, Cov]

            sign_active = np.delete(sign_active, idx)
            sign_active = np.append(sign_active, 0.)  # just to maintain size
            if verbose > 1:
                print "%s\t\t%s\t\t%s\t\t%s\t\t%s" % (n_iter, '', drop_idx,
                                                      n_active, abs(temp))

    if return_path:
        # resize coefs in case of early stop
        alphas = alphas[:n_iter + 1]
        coefs = coefs[:n_iter + 1]

        return alphas, active, coefs.T
    else:
        return alpha, active, coef
Ejemplo n.º 5
0
def _gram_omp(Gram,
              Xy,
              n_nonzero_coefs,
              tol_0=None,
              tol=None,
              copy_Gram=True,
              copy_Xy=True):
    """Orthogonal Matching Pursuit step on a precomputed Gram matrix.

This function uses the the Cholesky decomposition method.

Parameters:
-----------
Gram: array, shape = (n_features, n_features)
Gram matrix of the input data matrix

Xy: array, shape = (n_features,)
Input targets

n_nonzero_coefs: int
Targeted number of non-zero elements

tol_0: float
Squared norm of y, required if tol is not None.

tol: float
Targeted squared error, if not None overrides n_nonzero_coefs.

copy_Gram: bool, optional
Whether the gram matrix must be copied by the algorithm. A false
value is only helpful if it is already Fortran-ordered, otherwise a
copy is made anyway.

copy_Xy: bool, optional
Whether the covariance vector Xy must be copied by the algorithm.
If False, it may be overwritten.

Returns:
--------
gamma: array, shape = (n_nonzero_coefs,)
Non-zero elements of the solution

idx: array, shape = (n_nonzero_coefs,)
Indices of the positions of the elements in gamma within the solution
vector

"""
    Gram = Gram.copy('F') if copy_Gram else np.asfortranarray(Gram)

    if copy_Xy:
        Xy = Xy.copy()

    min_float = np.finfo(Gram.dtype).eps
    nrm2, swap = linalg.get_blas_funcs(('nrm2', 'swap'), (Gram, ))
    potrs, = get_lapack_funcs(('potrs', ), (Gram, ))

    indices = range(len(Gram))  # keeping track of swapping
    alpha = Xy
    tol_curr = tol_0
    delta = 0
    n_active = 0

    max_features = len(Gram) if tol is not None else n_nonzero_coefs
    L = np.empty((max_features, max_features), dtype=Gram.dtype)
    L[0, 0] = 1.

    while True:
        lam = np.argmax(np.abs(alpha))
        if lam < n_active or alpha[lam]**2 < min_float:
            # selected same atom twice, or inner product too small
            warn(premature)
            break
        if n_active > 0:
            L[n_active, :n_active] = Gram[lam, :n_active]
            solve_triangular(L[:n_active, :n_active], L[n_active, :n_active])
            v = nrm2(L[n_active, :n_active])**2
            if 1 - v <= min_float:  # selected atoms are dependent
                warn(premature)
                break
            L[n_active, n_active] = np.sqrt(1 - v)
        Gram[n_active], Gram[lam] = swap(Gram[n_active], Gram[lam])
        Gram.T[n_active], Gram.T[lam] = swap(Gram.T[n_active], Gram.T[lam])
        indices[n_active], indices[lam] = indices[lam], indices[n_active]
        Xy[n_active], Xy[lam] = Xy[lam], Xy[n_active]
        n_active += 1
        # solves LL'x = y as a composition of two triangular systems
        gamma, _ = potrs(L[:n_active, :n_active],
                         Xy[:n_active],
                         lower=True,
                         overwrite_b=False)

        beta = np.dot(Gram[:, :n_active], gamma)
        alpha = Xy - beta
        if tol is not None:
            tol_curr += delta
            delta = np.inner(gamma, beta[:n_active])
            tol_curr -= delta
            if tol_curr <= tol:
                break
        elif n_active == max_features:
            break

    return gamma, indices[:n_active]