Example #1
def eRPCA(m,
          n,
          u,
          v,
          vecM,
          vecEpsilon,
          maxRank,
          lam=None,
          mu=None,
          rho=None,
          epsilon1=None,
          epsilon2=None,
          truncateK=0,
          maxIteration=1000,
          verbose=True,
          hasWeave=True):
    """This is an optimized code based on:

    Paffenroth, R., Du Toit, P., Nong, R., Scharf, L., Jayasumana,
    A. P., & Bandara, V. (2013). Space-time signal processing for
    distributed pattern detection in sensor networks. Selected Topics
    in Signal Processing, IEEE Journal of, 7(1), 38-49

    and

    Paffenroth, R. C., Nong, R., & Du Toit, P. C. (2013,
    September). On covariance structure in noisy, big data. In SPIE
    Optical Engineering+ Applications
    (pp. 88570E-88570E). International Society for Optics and
    Photonics.

    Args:

       m, n: the full size of the input matrix M.

       u, v, vecM: the samples of M as indices and values of a sparse
                    matrix.  All are one dimensional arrays.

       vecEpsilon: the pointwise error bounds.

       maxRank: the maximum rank of M to consider for completion.
                 (note, Lin-Chen-Ma have a way to predict this, which
                 we are not using here)

       lam: the value of the coupling constant between L and S

       mu: the initial value for the augmented Lagrangian
            parameter.  (optional, defaults to value from
            Lin-Chen-Ma)

       rho: the growth factor for the augmented Lagrangian
             parameter.  (optional, defaults to value from
             Lin-Chen-Ma)

       epsilon1: the first error criterion that controls for the
                  error in the constraint.  (The idea for this is from
                  Lin-Chen-Ma)

       epsilon2: the second error criterion that controls for the
                  convergence of the method.  (The idea for this is from
                  Lin-Chen-Ma)

       truncateK: how many singular values to ignore on the high end.
                  As opposed to 'maxRank', which ignores small singular
                  values, truncateK ignores the largest singular values.
                  (optional, defaults to 0)

       maxIteration: the maximum number of iterations to
                      use. (optional, defaults to 1000)

       verbose: print out the convergence history. (optional,
                defaults to True)

    Returns:

       U,E,VT: the SVD of the recovered low rank matrix.

       S: the recovered sparse component, returned as a sparse matrix.
    """
    assert len(u.shape) == 1, 'u must be one dimensional'
    assert len(v.shape) == 1, 'v must be one dimensional'
    assert len(vecM.shape) == 1, 'vecM must be one dimensional'
    assert 0 <= np.max(u) < m, 'An entry in u is invalid'
    assert 0 <= np.max(v) < n, 'An entry in v is invalid'

    # The minimum value of the observed entries of M
    minM = np.min(vecM)

    if epsilon1 is None:
        # The default value for epsilon1 is from the bottom of page
        # 12 in Lin-Chen-Ma.
        # FIXME: Are these good for eRPCA?
        epsilon1 = 1e-5
    if epsilon2 is None:
        # The default value for epsilon2 is from the bottom of page
        # 12 in Lin-Chen-Ma.
        # FIXME: Are these good for eRPCA?
        epsilon2 = 1e-4

    # We want to keep around a sparse matrix version of vecM, but we need
    # to be careful about 0 values in vecM, we don't want them to get
    # discarded when we convert to a sparse matrix!  In particular, we
    # are using the sparse matrix in a slightly odd way.  We intend
    # that vecM stores both 0 and non-zero values, and that the entries
    # of vecM which are not stored are *unknown* (and not necessarily 0).
    # Therefore, we process the input vecM to make all 0 entries "small"
    # numbers relative to its smallest value.
    for i in range(len(vecM)):
        if vecM[i] == 0:
            vecM[i] = minM * epsilon1
        if vecEpsilon[i] == 0:
            vecEpsilon[i] = minM * epsilon1

    # Create the required sparse matrices.  Note, u, v, vecM might have
    # repeats, and that is ok since sp.coo_matrix handles that case,
    # and we don't actually use vecM after here.
    M = sp.csc_matrix(sp.coo_matrix((vecM, (u, v)), shape=[m, n]))
    Epsilon = sp.csc_matrix(sp.coo_matrix((vecEpsilon, (u, v)), shape=[m, n]))

    # The SVD of the low rank part of the answer.
    U = np.matrix(np.zeros([m, maxRank]))
    E = np.zeros([maxRank])
    VT = np.matrix(np.zeros([maxRank, n]))

    # Compute the largest singular value of M (assuming the
    # unobserved entries are 0).  I am not convinced this is
    # principled, but I believe it is what they do in the paper.
    dummy, E0, dummy = sparseSVDUpdate(M, U[:, 0], np.array([E[0]]), VT[0, :])

    if mu is None:
        # The default value for mu_0 is from the bottom of page
        # 12 in Lin-Chen-Ma.  I believe they use the
        # spectral norm of M (the largest singular value), where
        # the unobserved entries are assumed to be 0.
        # FIXME:  I am not sure this is principled.  I mean, why is 0 special?
        # I am pretty sure that I can break this with an improperly scaled M.
        # FIXME: Are these good for eRPCA?
        mu = 1. / E0[0]
    if rho is None:
        # The default value for rho is from the bottom of page
        # 12 in Lin-Chen-Ma, and depends on rho_s, the fraction
        # of entries that are observed.
        # FIXME: Are these good for eRPCA?
        rho_s = len(vecM) / float(m * n)
        rho = 1.2172 + 1.8588 * rho_s
    if lam is None:
        # FIXME:  Double check this against Candes et al.,
        # "Robust Principal Component Analysis?", where
        # lambda = 1/sqrt(max(m, n)) is the suggested default.
        lam = 1. / np.sqrt(np.max([m, n]))

    # The sparse Lagrange multipliers
    Y = M * 0.0

    # The sparse matrix S
    S = M * 0.0

    # The projection of L onto Omega.  This is not required
    # but is convenient to have.
    LOmega = M * 0.0

    # We keep the previous answer to check convergence
    LS0 = S + LOmega

    # We also want this to check convergence
    partialFrobeniusM = sparseFrobeniusNorm(M)

    iteration = 0

    while True:
        # Break if we use too many iterations
        iteration += 1
        if iteration > maxIteration:
            break

        # This is the mathematical content of the algorithm
        ###################################################

        # # DEBUG ############################
        # print 'lagrangian before min L with S fixed',
        # vecLagrangianValue0 = vecLagrangian(E, S, M, LOmega,
        #                                     Epsilon.data, Y, mu, lam,
        #                                     truncateK=truncateK)
        # # DEBUG ############################

        # Minimize the Lagrangian with respect to L with S fixed
        [U, E, VT] = minNucPlusFrob(Y / mu + M - LOmega - S,
                                    U,
                                    E,
                                    VT,
                                    mu,
                                    truncateK=truncateK)

        # If the smallest singular value we compute is too large
        # then we might have maxRank too small (and run into error problems).
        # We check that here.
        if (E[0] > epsilon2) and (E[-1] / E[0] > epsilon2):
            print('Smallest singular value may be too big, consider')
            print('increasing maxRank.  This will make the solver slower,')
            print('but improve convergence')

        # Compute the projection of L onto Omega
        LOmega = projSVD(U, E, VT, u, v)

        # FIXME  I need a good test here.  Because of my approximations
        #        I suspect that the Lagrangian can go up based upon
        #        the above minimization.
        # # DEBUG ############################
        # print 'lagrangian before min S with L fixed',
        # vecLagrangianValue1 = vecLagrangian(E, S, M, LOmega,
        #                                     Epsilon.data, Y, mu, lam,
        #                                     truncateK=truncateK)
        # assert vecLagrangianValue1 <= vecLagrangianValue0, \
        #    'Lagrangian went up!'
        # # DEBUG ############################

        # Minimize the Lagrangian with respect to S with L fixed
        S.data = minShrink1Plus2Norm(Y.data / mu + M.data - LOmega.data,
                                     Epsilon.data, lam, mu)

        # # DEBUG ############################
        # print 'lagrangian before Y update',
        # vecLagrangianValue2 = vecLagrangian(E, S, M, LOmega,
        #                                     Epsilon.data, Y, mu, lam,
        #                                     truncateK=truncateK)
        # assert vecLagrangianValue2 <= vecLagrangianValue1, \
        #    'Lagrangian went up!'
        # # DEBUG ############################

        # Update the Lagrange multiplier
        Y.data = Y.data + mu * (M.data - S.data - LOmega.data)

        ###################################################
        # If the method is converging well then increase mu to focus
        # more on the constraint.  The original test would be
        #   mu*np.linalg.norm(LS0-LOmega-S, ord=2)/partialFrobeniusM < epsilon2
        # Again, I don't know how to compute the spectral
        # norm of a partially observed matrix, so I replace it with the
        # Frobenius norm on the observed entries.
        # FIXME: Attempt to justify later.
        if mu * sparseFrobeniusNorm(LS0 - LOmega -
                                    S) / partialFrobeniusM < epsilon2:
            mu = rho * mu

        # stopping criterion from page 12 of Lin, Chen, and Ma.
        # criterion1 = np.linalg.norm(D-A_1-E_1,
        #                             ord='fro')/np.linalg.norm(D, ord='fro')
        # criterion1 = np.linalg.norm(D-A_1-(POA_1 - A_1),
        #                             ord='fro')/np.linalg.norm(D, ord='fro')
        # criterion1 = np.linalg.norm(D-POA_1,
        #                             ord='fro')/np.linalg.norm(D, ord='fro')
        # FIXME: I may need to justify the change from the full
        #        Frobenius norm to the partial one.
        criterion1 = sparseFrobeniusNorm(M - LOmega - S) / partialFrobeniusM
        # criterion2 = np.min([mu_0, np.sqrt(mu_0)])*\
        #              np.linalg.norm(E_1-E_0,
        #                             ord='fro')/np.linalg.norm(D, ord='fro')
        # This is the one place where I depart from Lin-Chen-Ma.  The
        # stopping criterion there (the right-hand equation) uses A
        # and POA.  As I want the algorithm to be fast I ignore the A
        # part, since that would be O(mn)
        # FIXME:  Need to justify
        # FIXME: I may need to justify the change from the full
        #         Frobenius norm to the partial one.
        criterion2 = (np.min([mu, np.sqrt(mu)]) *
                      sparseFrobeniusNorm(LS0 - LOmega - S) /
                      partialFrobeniusM)

        if verbose:
            if iteration == 1:
                print()
                print('criterion1 is the constraint')
                print('criterion2 is the solution')
                print('iteration criterion1 epsilon1 ', end='')
                print('criterion2 epsilon2 rho      mu')
            if iteration % 10 == 0:
                print('%9d %10.2e %8.2e ' % (iteration, criterion1, epsilon1),
                      end='')
                print('%10.2e %8.2e %8.2e %8.2e' %
                      (criterion2, epsilon2, rho, mu))

        # If both error criteria are satisfied stop the algorithm
        if criterion1 < epsilon1 and criterion2 < epsilon2:
            if verbose:
                print('%9d %10.2e %8.2e ' % (iteration, criterion1, epsilon1),
                      end='')
                print('%10.2e %8.2e %8.2e %8.2e' %
                      (criterion2, epsilon2, rho, mu))
            break

        # Keep around the old answer for convergence testing.
        LS0 = LOmega + S

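    # Finally, soft-threshold the entries of S by the pointwise error
    # bounds, so deviations no larger than Epsilon are zeroed out of S.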
    S.data = shrink(Epsilon.data, S.data)

    return [U, E, VT, S]
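
A minimal usage sketch for eRPCA (an illustration only; it assumes eRPCA and
its helper routines are importable from this module, and it fabricates a tiny
synthetic low-rank problem):

import numpy as np

# Build a small rank-1 matrix and observe a random subset of its entries.
rng = np.random.RandomState(0)
m, n = 20, 20
x = rng.rand(m, 1)
Ltrue = x @ x.T
obsU, obsV = np.nonzero(rng.rand(m, n) < 0.5)
vecM = Ltrue[obsU, obsV]
vecEpsilon = np.full(len(vecM), 1e-3)  # pointwise error bounds

# Recover the low rank part (U, E, VT) and the sparse part S.
U, E, VT, S = eRPCA(m, n, obsU, obsV, vecM, vecEpsilon,
                    maxRank=5, verbose=False)
Lrecovered = np.asarray(U) @ np.diag(E) @ np.asarray(VT)
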
Example #2
def NDR(m,
        n,
        u,
        v,
        vecMD,
        vecEpsilon,
        maxRank,
        truncateK=0,
        mu=None,
        rho=None,
        epsilon1=None,
        epsilon2=None,
        tau=10.0,
        maxIteration=1000,
        verbose=True,
        hasWeave=True,
        grammianGuess=None,
        debug=False):
    """This is an optimized code based on:

    Paffenroth, R., Du Toit, P., Nong, R., Scharf, L., Jayasumana,
    A. P., & Bandara, V. (2013). Space-time signal processing for
    distributed pattern detection in sensor networks. Selected Topics
    in Signal Processing, IEEE Journal of, 7(1), 38-49

    and

    Paffenroth, R. C., Nong, R., & Du Toit, P. C. (2013,
    September). On covariance structure in noisy, big data. In SPIE
    Optical Engineering+ Applications
    (pp. 88570E-88570E). International Society for Optics and
    Photonics.

    Args:

       m, n: the full size of the input distance matrix MD.

       u, v, vecMD: the samples of MD as indices and values of a sparse
                    matrix.  All are one dimensional arrays.

       vecEpsilon: the pointwise error bounds for the distances

       maxRank: the maximum rank of MD to consider for completion.
                 (note, Lin-Chen-Ma have a way to predict this, which
                 we are not using here)

       truncateK: how many singular values to ignore on the high end.
                  As opposed to 'maxRank', which ignores small singular
                  values, truncateK ignores the largest singular values.

       mu: the initial value for the augmented Lagrangian
            parameter.  (optional, defaults to value from
            Lin-Chen-Ma)

       rho: the growth factor for the augmented Lagrangian
             parameter.  (optional, defaults to value from
             Lin-Chen-Ma)

       epsilon1: the first error criterion that controls for the
                  error in the constraint.  (The idea for this is from
                  Lin-Chen-Ma)

       epsilon2: the second error criterion that controls for the
                  convergence of the method.  (The idea for this is from
                  Lin-Chen-Ma)

       tau: the Lipschitz constant for the APG solver.

       maxIteration: the maximum number of iterations to
                      use. (optional, defaults to 1000)

       verbose: print out the convergence history. (optional,
                defaults to True)

       grammianGuess: an initial guess for the solver, given as a dict
               with keys 'U', 'E', and 'VT'.  Since the problem is
               convex this shouldn't affect the final answer, but it
               may help convergence.

       debug: turn on debugging output.

    Returns:

       U, E, VT: the SVD of the recovered low rank Grammian matrix.

       iteration: the number of iterations the solver used.
    """
    assert len(u.shape) == 1, 'u must be one dimensional'
    assert len(v.shape) == 1, 'v must be one dimensional'
    assert len(vecMD.shape) == 1, 'vecMD must be one dimensional'
    assert 0 <= np.max(u) < m, 'An entry in u is invalid'
    assert 0 <= np.max(v) < n, 'An entry in v is invalid'

    # The minimum value of the observed entries of MD
    minMD = np.min(vecMD)

    if epsilon1 is None:
        # The default value for epsilon1 is from the bottom of page
        # 12 in Lin-Chen-Ma.
        # FIXME: Are these good for NDR?
        epsilon1 = 1e-5
    if epsilon2 is None:
        # The default value for epsilon2 is from the bottom of page
        # 12 in Lin-Chen-Ma.
        # FIXME: Are these good for NDR?
        epsilon2 = 1e-4

    # We want to keep around a sparse matrix version of vecMD, but we need
    # to be careful about 0 values in vecMD, we don't want them to get
    # discarded when we convert to a sparse matrix!  In particular, we
    # are using the sparse matrix in a slightly odd way.  We intend
    # that vecMD stores both 0 and non-zero values, and that the entries
    # of vecMD which are not stored are *unknown* (and not necessarily 0).
    # Therefore, we process the input vecMD to make all 0 entries "small"
    # numbers relative to its smallest value.
    # Note, for distance matrices this is less of an issue, but we
    # do it regardless to keep things consistent and safe.
    for i in range(len(vecMD)):
        if vecMD[i] == 0:
            vecMD[i] = minMD * epsilon1
        if vecEpsilon[i] == 0:
            vecEpsilon[i] = minMD * epsilon1

    # Here we keep the observed values and error bounds as plain vectors;
    # a sparse version of MD is only built below (MDsparse) to estimate
    # its largest singular value.  Note, u, v, vecMD might have repeats,
    # and that is ok since sp.coo_matrix handles that case.
    MD = vecMD
    Epsilon = vecEpsilon

    # The SVD of the low rank part of the answer.
    if grammianGuess is None:
        UG = np.matrix(np.zeros([m, maxRank]))
        EG = np.zeros([maxRank])
        VGT = np.matrix(np.zeros([maxRank, n]))
    else:
        UG = grammianGuess['U']
        EG = grammianGuess['E']
        VGT = grammianGuess['VT']

    # Compute the largest singular value of MD (assuming the
    # unobserved entries are 0).  I am not convinced this is
    # principled, but I believe it is what they do in the paper.
    MDsparse = csr_matrix(coo_matrix((MD, (u, v)), shape=(m, n)))
    dummy, ED0, dummy = sparseSVDUpdate(MDsparse, UG[:, 0], np.array([EG[0]]),
                                        VGT[0, :])

    if mu is None:
        # The default value for mu_0 is from the bottom of page
        # 12 in Lin-Chen-Ma.  I believe they use the
        # spectral norm of MD (the largest singular value), where
        # the unobserved entries are assumed to be 0.
        # FIXME:  I am not sure this is principled.  I mean, why is 0 special?
        # I am pretty sure that I can break this with an improperly scaled MD.
        # FIXME: Are these good for NDR?
        mu = 1. / ED0[0]
    if rho is None:
        # The default value for rho is from the bottom of page
        # 12 in Lin-Chen-Ma, and depends on rho_s, the fraction
        # of entries that are observed.
        # FIXME: Are these good for NDR?
        rho_s = len(vecMD) / float(m * n)
        rho = 1.2172 + 1.8588 * rho_s

    # The sparse Lagrange multipliers
    Yt = MD * 0.0
    Yb = MD * 0.0

    # The sparse matrix SD
    SD = MD * 0.0

    # The mapping of the Grammian LG to a distance matrix LD, and
    # then projected onto Omega.  This is not required
    # but is convenient to have.  We set it to be what we get
    # from the initial guess.
    fProjSVD = projSVDToDist
    LDOmega = fProjSVD(UG, EG, VGT, u, v, returnVec=True)

    # We keep the previous answer to check convergence
    # LDOmega0 = LDOmega.copy()

    # We also keep around a copy of the SVD of L, in case we want to
    # reset it.
    UG0 = UG.copy()
    EG0 = EG.copy()
    VGT0 = VGT.copy()

    # We also want this to check convergence
    partialFrobeniusMD = np.linalg.norm(MD)

    iteration = 0

    while True:
        # Break if we use too many iterations
        iteration += 1
        if iteration > maxIteration:
            break

        # This is the mathematical content of the algorithm
        ###################################################

        ####################################
        # DEBUG ############################
        vecLagrangianValue0 = vecLagrangian(EG,
                                            SD,
                                            MD,
                                            LDOmega,
                                            Epsilon,
                                            Yt,
                                            Yb,
                                            mu,
                                            truncateK,
                                            debug=debug)
        # DEBUG ############################
        ####################################

        ####################################
        # TODO #############################
        ####################################
        # This should be something like the APG
        # algorithm on page 12 of papers/AccelProxForNucNorm.pdf.
        # You need equation (7) on page 3 of papers/AccelProxForNucNorm.pdf.
        # and the definition of the adjoint of the Grammian
        # to distance linear operator on page 5 of
        # papers/EDMhandbook.pdf
        ####################################
        # TODO #############################
        ####################################

        # Minimize the Lagrangian with respect to L with S fixed
        # NOTE: I think that getting the sign wrong here causes the
        # Lagrange multiplier to make the solution worse!
        # X = -(-Yb/mu - MD + SD)
        X = Yb / mu + MD - SD

        [UG, EG, VGT] = minAPGFast(
            m,
            n,
            KFast,
            KAdjointFast,
            X,
            mu,
            tau=tau,
            truncateK=truncateK,
            debug=debug,
            guess={
                'U': UG0,
                'E': EG0,
                'VT': VGT0,
                'u': u,
                'v': v
            },
            # FIXME:  do something rational here.
            maxIter=3)

        # Compute the projection of L onto Omega
        LDOmega = fProjSVD(UG, EG, VGT, u, v, returnVec=True)

        ####################################
        # DEBUG ############################
        vecLagrangianValue1 = vecLagrangian(EG,
                                            SD,
                                            MD,
                                            LDOmega,
                                            Epsilon,
                                            Yt,
                                            Yb,
                                            mu,
                                            truncateK,
                                            debug=debug)
        if vecLagrangianValue1 > vecLagrangianValue0 + 1e-7:
            if verbose:
                print('Lagrangian went up after L minimization!')
                print('before', vecLagrangianValue0)
                print('after', vecLagrangianValue1)
                print('before-after',
                      vecLagrangianValue0 - vecLagrangianValue1)
                print('Perhaps you need to make tau bigger?')
            assert False, 'Lagrangian went up after L minimization!'
        # DEBUG ############################
        ####################################

        # Minimize the Lagrangian with respect to S with L fixed
        SD = minNDRSD(MD - LDOmega, Yt, Yb, Epsilon, mu, debug=debug, guess=SD)

        ####################################
        # DEBUG ############################
        vecLagrangianValue2 = vecLagrangian(EG, SD, MD, LDOmega, Epsilon, Yt,
                                            Yb, mu, truncateK)
        if vecLagrangianValue2 > vecLagrangianValue1 + 1e-7:
            if verbose:
                print('Lagrangian went up after S minimization!')
                print('before', vecLagrangianValue1)
                print('after', vecLagrangianValue2)
            assert False, 'Lagrangian went up after S minimization!'
        # DEBUG ############################
        ####################################

        # Update the Lagrange multipliers
        Yt = Yt + mu * np.abs(shrink(Epsilon, SD))
        Yb = Yb + mu * (MD - LDOmega - SD)

        ###################################################
        # If the method is converging well then increase mu to focus
        # more on the constraint.  The original test (commented out
        # below) used mu times the change in the iterate, measured in
        # the Frobenius norm on the observed entries (I don't know how
        # to compute the spectral norm of a partially observed matrix).
        # Here we instead grow mu when the trailing singular values
        # are already small.
        # FIXME: Attempt to justify later.
        # tmp = np.linalg.norm(LDOmega0-LDOmega)
        # if mu*tmp/partialFrobeniusMD < epsilon2:
        #     mu = rho*mu
        if np.sum(EG[truncateK:]) < epsilon2:
            mu = rho * mu

        # stopping criterion from page 12 of Lin, Chen, and Ma.
        # criterion1 = np.linalg.norm(D-A_1-E_1,
        #                             ord='fro')/np.linalg.norm(D, ord='fro')
        # criterion1 = np.linalg.norm(D-A_1-(POA_1 - A_1),
        #                             ord='fro')/np.linalg.norm(D, ord='fro')
        # criterion1 = np.linalg.norm(D-POA_1,
        #                             ord='fro')/np.linalg.norm(D, ord='fro')
        # FIXME: I may need to justify the change from the full
        #        Frobenius norm to the partial one.
        tmp1 = np.linalg.norm(MD - LDOmega - SD)
        tmp2 = np.linalg.norm(shrink(Epsilon, SD), 1)
        criterion1 = (tmp1 + tmp2) / partialFrobeniusMD
        # criterion2 = np.min([mu_0, np.sqrt(mu_0)])*\
        #              np.linalg.norm(E_1-E_0,
        #                             ord='fro')/np.linalg.norm(D, ord='fro')
        # This is the one place where I depart from Lin-Chen-Ma.  The
        # stopping criterion there (the right-hand equation) uses A
        # and POA.  As I want the algorithm to be fast I ignore the A
        # part, since that would be O(mn)
        # FIXME:  Need to justify
        # FIXME: I may need to justify the change from the full
        #         Frobenius norm to the partial one.
        # criterion2 = (np.min([mu, np.sqrt(mu)]) *
        #               (np.linalg.norm(LDOmega0-LDOmega))/partialFrobeniusMD)
        criterion2 = np.sum(EG[truncateK:])

        if verbose:
            if iteration == 1:
                print()
                print('criterion1 is the constraint')
                print('criterion2 is the solution')
                print('iteration criterion1 epsilon1 ', end=' ')
                print('criterion2 epsilon2 rho      mu       objective')
            if iteration % 10 == 0:
                print('%9d %10.2e %8.2e ' %
                      (iteration, criterion1, epsilon1),
                      end=' ')
                print('%10.2e %8.2e %8.2e %8.2e ' %
                      (criterion2, epsilon2, rho, mu),
                      end=' ')
                print('%9.2e' % np.sum(EG[truncateK:]))

        # If both error criteria are satisfied stop the algorithm
        if criterion1 < epsilon1 and criterion2 < epsilon2:
            if verbose:
                print('%9d %10.2e %8.2e ' % (iteration, criterion1, epsilon1),
                      end=' ')
                print('%10.2e %8.2e %8.2e %8.2e ' %
                      (criterion2, epsilon2, rho, mu),
                      end=' ')
                print('%9.2e' % np.sum(EG[truncateK:]))
            break

        # Keep around the old answer for convergence testing.
        # LDOmega0 = LDOmega.copy()
        UG0 = UG.copy()
        EG0 = EG.copy()
        VGT0 = VGT.copy()

    return [UG, EG, VGT, iteration]
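
A minimal usage sketch for NDR (an illustration only; it assumes NDR and its
helper routines are importable from this module, and that vecMD holds squared
pairwise distances of an unknown point configuration):

import numpy as np
from scipy.spatial.distance import cdist

# A small planar point cloud and its squared Euclidean distance matrix.
rng = np.random.RandomState(0)
points = rng.rand(15, 2)
D = cdist(points, points, 'sqeuclidean')
m = n = D.shape[0]

# Observe a random subset of the distances, with loose error bounds.
obsU, obsV = np.nonzero(rng.rand(m, n) < 0.7)
vecMD = D[obsU, obsV]
vecEpsilon = np.full(len(vecMD), 1e-3)

UG, EG, VGT, its = NDR(m, n, obsU, obsV, vecMD, vecEpsilon,
                       maxRank=4, verbose=False)
G = np.asarray(UG) @ np.diag(EG) @ np.asarray(VGT)  # recovered Grammian
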
Example #3
def MC(m,
       n,
       u,
       v,
       d,
       maxRank,
       mu_0=None,
       rho=None,
       epsilon1=None,
       epsilon2=None,
       maxIteration=100,
       verbose=True,
       hasWeave=True):
    """ This is an optimized code from:
    "The Augmented Lagrange Multipler Method for Exact Recovery
     of Corrupted Low-Rank Matrices"
    by Zhouchen Lin, Minming Chen, and Yi Ma
    http://arxiv.org/abs/1009.5055

    Args:

        m, n: the full size of D.

        u, v, d: the samples of D as indices and values of a sparse matrix.
            All are one dimensional arrays.

        maxRank: the maximum rank of D to consider for completion.
          (note, Lin-Chen-Ma have a way to predict this,
           which we are not using here)

        mu_0: the initial value for the augmented Lagrangian parameter.
          (optional, defaults to value from
               Lin-Chen-Ma)

        rho: the growth factor for the augmented Lagrangian parameter.
          (optional, defaults to value from Lin-Chen-Ma)

        epsilon1: the first error criterion that controls for the error in
          the constraint.  (optional, defaults to value from Lin-Chen-Ma)

        epsilon2: the second error criterion that controls for the convergence
          of the method. (optional, defaults to value from Lin-Chen-Ma)

        maxIteration: the maximum number of iterations to use.
          (optional, defaults to 100)

        verbose: print out the convergence history.
          (optional, defaults to True)

    Returns:

        U, S, VT: the SVD of the recovered low rank matrix A, so that
          A = U*S*VT and A+E = D for some error term E.
          (Note, Lin-Chen-Ma also return E, but this implementation
           returns only the SVD of A.)
    """
    assert len(u.shape) == 1, 'u must be one dimensional'
    assert len(v.shape) == 1, 'v must be one dimensional'
    assert len(d.shape) == 1, 'd must be one dimensional'
    assert 0 <= np.max(u) < m, 'An entry in u is invalid'
    assert 0 <= np.max(v) < n, 'An entry in v is invalid'

    if epsilon1 is None:
        # The default value for epsilon1 is from the bottom of page
        # 12 in Lin-Chen-Ma.
        epsilon1 = 1e-7
    if epsilon2 is None:
        # The default value for epsilon2 is from the bottom of page
        # 12 in Lin-Chen-Ma.
        epsilon2 = 1e-6

    # The minimum value of the observed entries of D
    minD = np.min(d)

    # We want to keep around a sparse matrix version of D, but we need to be
    # careful about 0 values in d, we don't want them to get discarded when we
    # convert to a sparse matrix! In particular, we are using the sparse matrix
    # in a slightly odd way.  We intend that D stores both 0 and non-zero
    # values, and that the entries of D which are not stored are *unknown* (and
    # not necessarily 0).  Therefore, we process the input d to make all 0
    # entries "small" numbers relative to its smallest value.
    for i in range(len(d)):
        if d[i] == 0:
            d[i] = minD * epsilon1

    # Create the required sparse matrices.  Note, u,v,d might have
    # repeats, and that is ok since the sp.coo_matrix handles
    # that case, and we don't actually use d after here.
    D = sp.csc_matrix(sp.coo_matrix((d, (u, v)), shape=[m, n]))

    # The Frobenius norm of the observed entries of D.  This is
    # just the 2-norm of the *vector* of entries.
    partialFrobeniusD = sparseFrobeniusNorm(D)

    # The SVD of the answer A
    U = np.matrix(np.zeros([m, maxRank]))
    S = np.zeros([maxRank])
    VT = np.matrix(np.zeros([maxRank, n]))

    # Compute the largest singular value of D (assuming the unobserved entries
    # are 0).  I am not convinced this is principled, but I believe it is what
    # they do in the paper.
    dummy, S0, dummy = sparseSVDUpdate(D, U[:, 0], np.array([S[0]]), VT[0, :])

    if mu_0 is None:
        # The default value for mu_0 is from the bottom of page
        # 12 in Lin-Chen-Ma.  I believe they use the
        # spectral norm of D (the largest singular value), where
        # the unobserved entries are assumed to be 0.
        # FIXME:  I am not sure this is principled.  I mean, why is 0 special?
        # I am pretty sure that I can break this with an improperly scaled D.
        mu_0 = 1. / S0[0]
    if rho is None:
        # The default value for rho is from the bottom of page
        # 12 in Lin-Chen-Ma, and depends on rho_s, the fraction
        # of entries that are observed.
        rho_s = len(d) / float(m * n)
        rho = 1.2172 + 1.8588 * rho_s

    # The sparse Lagrange multipliers
    Y_0 = D * 0.0

    # The projection of A onto Omega.  This is not required
    # but is convenient to have.
    POA_0 = D * 0.0
    POA_1 = D * 0.0

    iteration = 0
    while True:
        # Break if we use too many iterations
        iteration += 1
        if iteration > maxIteration:
            break

        # This is the mathematical content of the algorithm
        ###################################################
        # Note, full_matrices=False is needed so the shapes match for
        # non-square matrices.
        # We know that E_0 = POA_0 - A_0 = POA_0 - U_0*S_0*VT_0
        # So,
        # [U,S,VT] = np.linalg.svd(D-E_0+Y_0/mu_0, full_matrices=False)
        # can be rewritten as
        # [U,S,VT] = np.linalg.svd(D-(POA_0 - U_0*S_0*VT_0)+Y_0/mu_0,
        #                          full_matrices=False)
        # Combining sparse terms we get
        # [U,S,VT] = np.linalg.svd( (D-POA_0+Y_0/mu_0) + U_0*S_0*VT_0,
        #                          full_matrices=False)

        [U, S, VT] = minNucPlusFrob(D - POA_0 + Y_0 / mu_0, U, S, VT, mu_0)

        # and we compute the projection of A onto Omega
        # Note, making the temp array and then creating the sparse
        # matrix all at once is *much* faster.
        POA_1 = projSVD(U, S, VT, u, v)

        # POATmp = np.zeros([len(d)])
        # # FIXME:  Needs to be numba
        # for i in range(len(d)):
        #     POATmp[i] = U[u[i], :] * np.diag(S) * VT[:, v[i]]
        # POA_1 = sp.csc_matrix(sp.coo_matrix((POATmp, (u, v)), shape=[m, n]))

        # Update the Lagrange multiplier
        # We have that
        # E_1 = POA_1 - A_1 = POA_1 - U_1*S_1*VT_1
        # So we can plug into
        # Y_1 = Y_0 + mu_0*(D-A_1-E_1)
        # to get
        # Y_1 = Y_0 + mu_0*(D-A_1-(POA_1 - A_1))
        # so
        # Y_1 = Y_0 + mu_0*(D-POA_1)

        Y_1 = Y_0 + mu_0 * (D - POA_1)
        ###################################################

        # If the method is converging well then increase mu_0 to focus
        # more on the constraint.  The original test would be
        #   mu_0*np.linalg.norm(POA_1-POA_0, ord=2)/partialFrobeniusD < epsilon2
        # Again, I don't know how to compute the spectral
        # norm of a partially observed matrix, so I replace it with the
        # Frobenius norm on the observed entries.
        # FIXME: Attempt to justify later.
        if (mu_0 * sparseFrobeniusNorm(POA_1 - POA_0) / partialFrobeniusD <
                epsilon2):
            mu_0 = rho * mu_0

        # stopping criterion from page 12 of Lin, Chen, and Ma.
        # criterion1 = np.linalg.norm(D-A_1-E_1, ord='fro')
        #   /np.linalg.norm(D, ord='fro')
        # criterion1 = np.linalg.norm(D-A_1-(POA_1 - A_1), ord='fro')
        #   /np.linalg.norm(D, ord='fro')
        # criterion1 = np.linalg.norm(D-POA_1, ord='fro')
        #   /np.linalg.norm(D, ord='fro')
        # FIXME:  I may need to justify the change from the full Frobenius
        #         norm to the partial one.
        criterion1 = sparseFrobeniusNorm(D - POA_1) / partialFrobeniusD
        # criterion2 = np.min([mu_0,np.sqrt(mu_0)])
        #   *np.linalg.norm(E_1-E_0, ord='fro')/np.linalg.norm(D, ord='fro')
        # This is the one place where I depart from Lin-Chen-Ma.  The stopping
        # criterion there (the right-hand equation) uses A and POA.  As I want
        # the algorithm to be fast I ignore the A part, since that would be
        # O(mn)
        # FIXME:  Need to justify
        # FIXME:  I may need to justify the change from the full Frobenius
        #         norm to the partial one.
        criterion2 = np.min([mu_0, np.sqrt(mu_0)]) * \
            sparseFrobeniusNorm(POA_1 - POA_0) / partialFrobeniusD

        if verbose:
            if iteration == 1:
                print(("iteration criterion1 epsilon1 " +
                       "criterion2 epsilon2 rho      mu"))
            if iteration % 10 == 0:
                print(('%9d %10.2e %8.2e %10.2e %8.2e %8.2e %8.2e' %
                       (iteration, criterion1, epsilon1, criterion2, epsilon2,
                        rho, mu_0)))

        # If both error criteria are satisfied stop the algorithm
        if criterion1 < epsilon1 and criterion2 < epsilon2:
            break

        Y_0 = Y_1.copy()
        POA_0 = POA_1.copy()

    return [U, S, VT]
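
A minimal usage sketch for MC (an illustration only; it assumes MC is
importable from this module):

import numpy as np

# A small rank-2 matrix, observed on roughly 60% of its entries.
rng = np.random.RandomState(0)
m, n = 30, 20
Dtrue = rng.rand(m, 2) @ rng.rand(2, n)
obsU, obsV = np.nonzero(rng.rand(m, n) < 0.6)
d = Dtrue[obsU, obsV]

U, S, VT = MC(m, n, obsU, obsV, d, maxRank=5, verbose=False)
Arecovered = np.asarray(U) @ np.diag(S) @ np.asarray(VT)
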
Example #4
def minAPGFast(m,
               n,
               A,
               AT,
               b,
               mu,
               guess,
               truncateK=0,
               tau=10.0,
               debug=False,
               maxIter=1):
    """Compute a fast *single interation* of the minimization using the
    APG algorithm from:

    An accelerated proximal gradient algorithm for nuclear
    norm regularized linear least squares problems
    Kim-Chuan Toh and Sangwoon Yun

    This computes the minimum of the following objective.

    .. math::

        \| L \|_* + \mu / 2 \| A(L) - b \|_2^2

    Note, the :math:`\mu` here is :math:`\frac{1}{\mu}` in
    the above paper.

    Args:
        m, n:  The dimension of the output.

        A: A linear mapping from a matrix to a vector

        AT: The adjoint mapping of A

        b: A vector in the proximal function

        mu: The value of :math:`\mu`.

        guess: An initial guess for the minimization.  It is also
               used for debugging to make sure the value of the
               objective is smaller.

        truncateK: Ignore the first truncateK singular values to use
                   the truncated nuclear norm

        tau: The Lipschitz constant for the problem of interest.  This
             needs to be estimated numerically unless it is known analytically.

        maxIter: The number of iterations to run the solver.  One is
             sometimes enough.

        debug:  Run the algorithm in debugging mode, with additional
                output and slower run-time.

    Returns:
        The SVD of :math:`L` that achieves the minimum.

    """
    assert len(b.shape) == 1, 'b must be a vector'

    # The SVD of the L we guess
    U = np.matrix(guess['U'])
    E = guess['E']
    VT = np.matrix(guess['VT'])
    # The indices in Omega at which L is observed
    u = guess['u']
    v = guess['v']

    assert len(u) == len(v) == len(b), 'length of u, v, and b must be the same'

    # To make it consistent with the APG paper we make
    # the following transformation.
    muP = 1. / mu

    ####################################
    # DEBUG ############################
    if debug and (guess is not None):
        Um = np.matrix(U)
        Em = np.matrix(np.diag(E))
        VTm = np.matrix(VT)
        before = objective(Um * Em * VTm,
                           A,
                           U,
                           E,
                           VT,
                           u,
                           v,
                           b,
                           mu,
                           truncateK=truncateK)
        print('minAPGFast before', before)
    # DEBUG ############################
    ####################################

    # Note, this is not the same algorithm as in minAPG.py.
    # This algorithm does not use the same normalization constants!
    # In fact, this algorithm does not have the support of theory
    # as in minAPG.py!
    # This code requires a rank one update to be the same as
    # minAPG.py.
    for i in range(maxIter):
        # Note, this is sparse since the AT will return a
        # sparse matrix!
        tmpOmega = -(1. / tau) * AT(A(U, E, VT, u, v) - b, u, v, m, n)
        # So, this will be a sparseSVDUpdate of
        # Y - (1./tau)*AT(A(Y)-b)
        # which, in the notation here, is
        # U*E*VT + tmpOmega
        # and we rearrange terms to get what we expect
        # for sparseSVDUpdate
        # sparseSVDUpdate(tmpOmega, U, E, VT)

        # NOTE:  tmpOmega is a sparse matrix!  This is ok,
        # since we only use it for sparseSVDUpdate, which expects
        # a sparse matrix as its first argument.
        Unew, Enew, VTnew = sparseSVDUpdate(tmpOmega, U, E, VT)
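        # Proximal step: soft-threshold the trailing singular values by
        # muP/tau, which accounts for the (truncated) nuclear norm term.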
        Enew[truncateK:] = shrink(muP / tau, Enew[truncateK:])
        U = Unew
        E = Enew
        VT = VTnew

    ####################################
    # DEBUG ############################
    if debug and (guess is not None):
        Um = np.matrix(Unew)
        Em = np.matrix(np.diag(Enew))
        VTm = np.matrix(VTnew)
        after = objective(Um * Em * VTm,
                          A,
                          Unew,
                          Enew,
                          VTnew,
                          u,
                          v,
                          b,
                          mu,
                          truncateK=truncateK,
                          debug=True)
        print('minAPGFast after', after)

        # Compute the slow solution
        print(
            'Using slow solver to help debug.  This can be very slow on large problems.'
        )
        from dimredu.lib.minAPG import minAPG as minAPGSlow
        from dimredu.lib.EDM import K, KAdjoint

        def ASlow(X, u=u, v=v):
            n = len(u)
            output = np.zeros([n])
            Tmp = np.array(K(X))
            for i in range(n):
                output[i] = Tmp[u[i], v[i]]
            return output

        def ATSlow(x, u=u, v=v, m=m, n=n):
            Tmp = np.matrix(np.zeros([m, n]))
            for i in range(len(u)):
                Tmp[u[i], v[i]] = x[i]
            return KAdjoint(Tmp, symmetric=False)

        [Uexact, Eexact, VTexact] = minAPGSlow(m,
                                               n,
                                               ASlow,
                                               ATSlow,
                                               b,
                                               mu,
                                               maxIter=maxIter,
                                               tau=tau,
                                               guess=guess)
        Umexact = np.matrix(Uexact)
        Emexact = np.matrix(np.diag(Eexact))
        VTmexact = np.matrix(VTexact)
        slowAfter = objective(Umexact * Emexact * VTmexact,
                              A,
                              Uexact,
                              Eexact,
                              VTexact,
                              u,
                              v,
                              b,
                              mu,
                              truncateK=truncateK,
                              debug=True)
        print('minAPGSlow after', slowAfter)
        ###

        for i in range(5):
            perturb = np.random.random(size=[m, n]) * 1e-5
            Ltmp = Um * Em * VTm + perturb
            Utmp, Etmp, VTtmp = np.linalg.svd(Ltmp)
            pObj = objective(Ltmp,
                             A,
                             Utmp,
                             Etmp,
                             VTtmp,
                             u,
                             v,
                             b,
                             mu,
                             truncateK=truncateK)
            print(pObj, end=' ')
            if after < pObj:
                print('bigger :-)')
            else:
                print('smaller :-(')
                assert False, 'Only local minimum'
        assert before / after + 1e-7 >= 1., 'minAPGFast went up!'
    # DEBUG ############################
    ####################################

    return [Unew, Enew, VTnew]
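
For reference, here is a dense, self-contained sketch of the proximal-gradient
step performed in the loop above (numpy only; as an illustration it uses a
simple "observe entries on Omega" operator for A and AT, rather than the
Grammian-to-distance map that NDR passes in):

import numpy as np

def denseProxGradStep(L, obsU, obsV, b, mu, tau):
    """One step of L <- prox((1/(mu*tau)) ||.||_*)(L - (1/tau)*AT(A(L) - b))."""
    m, n = L.shape
    residual = L[obsU, obsV] - b               # A(L) - b on the observed entries
    G = np.zeros((m, n))
    G[obsU, obsV] = residual                   # AT applied to the residual
    Y = L - (1.0 / tau) * G                    # gradient step on the smooth term
    U, s, VT = np.linalg.svd(Y, full_matrices=False)
    s = np.maximum(s - (1.0 / mu) / tau, 0.0)  # singular value soft-thresholding
    return U @ np.diag(s) @ VT

rng = np.random.RandomState(0)
m, n = 8, 6
obsU, obsV = np.nonzero(rng.rand(m, n) < 0.5)
b = rng.rand(len(obsU))
L1 = denseProxGradStep(np.zeros((m, n)), obsU, obsV, b, mu=5.0, tau=10.0)
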
Example #5
def minNucPlusFrob(X, U, E, VT, mu, truncateK=0, debug=False, guess=None):
    """Compute a fast minimization of nuclear norm plus Frobenius norm.

    This computes the minimum of the following objective.

    .. math::

        \| L \|_* + \mu / 2 \| L - (X + U*E*V^T) \|_F^2

    Args:
        X: A sparse array.

        U, E, VT: The SVD of a low rank matrix.

        mu: The value of :math:`\mu`.

        truncateK: Ignore the first truncateK singular values to use
                   the truncated nuclear norm

        debug:  Run the algorithm in debugging mode, with additional
                output and slower run-time.

        guess: An initial guess for the minimization.  In this case
                the minimization is in closed form, so it is merely
                used for debugging to see if the value of the
                objective is reduced.

    Returns:
        The SVD of :math:`L` that achieves the minimum.
    """
    assert X.shape[0] == U.shape[0], 'First dim of L and U must match'
    assert X.shape[1] == VT.shape[1], 'Last dim of L and VT must match'

    ####################################
    # DEBUG ############################
    if debug and (guess is not None):
        Um = np.matrix(guess['U'])
        Em = np.matrix(np.diag(guess['E']))
        VTm = np.matrix(guess['VT'])
        before = objective(Um * Em * VTm, X, U, E, VT, mu, truncateK=truncateK)
    # DEBUG ############################
    ####################################

    [Unew, Enew, VTnew] = sparseSVDUpdate(X, U, E, VT)

    # Don't shrink the first truncateK singular
    # values; this implements the truncated
    # nuclear norm.
    Enew[truncateK:] = shrink(1. / mu, Enew[truncateK:])

    ####################################
    # DEBUG ############################
    if debug and (guess is not None):
        Um = np.matrix(Unew)
        Em = np.matrix(np.diag(Enew))
        VTm = np.matrix(VTnew)
        after = objective(Um * Em * VTm, X, U, E, VT, mu, truncateK=truncateK)
        assert before / after + 1e-7 >= 1., 'minNucPlusFrob went up!'
    # DEBUG ############################
    ####################################

    return [Unew, Enew, VTnew]
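
For reference, a dense numpy sketch of the closed form this routine exploits:
the minimizer of ||L||_* + (mu/2)*||L - Z||_F^2 is obtained by soft-thresholding
the singular values of Z by 1/mu (here Z plays the role of X + U*E*VT above):

import numpy as np

def denseNucPlusFrob(Z, mu):
    """Return the SVD of argmin_L ||L||_* + (mu/2)*||L - Z||_F^2."""
    U, s, VT = np.linalg.svd(Z, full_matrices=False)
    sShrunk = np.maximum(s - 1.0 / mu, 0.0)   # soft-threshold the spectrum
    return U, sShrunk, VT

rng = np.random.RandomState(0)
Z = rng.rand(8, 6)
mu = 2.0
U, E, VT = denseNucPlusFrob(Z, mu)
L = U @ np.diag(E) @ VT

def denseObjective(A):
    return np.linalg.norm(A, 'nuc') + mu / 2 * np.linalg.norm(A - Z, 'fro')**2

# The closed-form answer should not be beaten by nearby perturbations.
for _ in range(5):
    assert denseObjective(L) <= denseObjective(L + 1e-3 * rng.randn(8, 6)) + 1e-12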