def qr_destroy(la):
    """
    Return QR decomposition of `la[0]`, destroying the content of `la`.

    The matrix is removed from the list `la`, and LAPACK is allowed to
    overwrite its buffer, so neither `la` nor the matrix should be used
    after this call.

    Using this function should be less memory intense than calling
    `scipy.linalg.qr(la[0])`, because the memory used in `la[0]` is
    reclaimed earlier.

    Returns the pair `(q, r)`: `q` is the Fortran-contiguous array built by
    LAPACK `orgqr`, `r` is the upper triangle of the `geqrf` output.
    """
    matrix = numpy.asfortranarray(la[0])
    del la[0], la  # drop the list entry and our handle on the list itself
    num_rows, num_cols = matrix.shape

    # q, r = QR(matrix); code hacked out of scipy.linalg.qr
    logger.debug("computing QR of %s dense matrix" % str(matrix.shape))
    (factorize,) = get_lapack_funcs(('geqrf',), (matrix,))
    # lwork=-1 only asks LAPACK for the workspace size; the second call does the work
    _, _, workspace, _ = factorize(matrix, lwork=-1, overwrite_a=True)
    packed, tau, _, status = factorize(matrix, lwork=workspace[0], overwrite_a=True)
    del matrix  # free up mem
    assert status >= 0

    upper = triu(packed[:num_cols, :num_cols])
    if num_rows < num_cols:  # rare case, #features < #topics
        packed = packed[:, :num_rows]  # retains fortran order

    (build_q,) = get_lapack_funcs(('orgqr',), (packed,))
    _, workspace, _ = build_q(packed, tau, lwork=-1, overwrite_a=True)
    q, _, status = build_q(packed, tau, lwork=workspace[0], overwrite_a=True)
    assert status >= 0, "qr failed"
    assert q.flags.f_contiguous
    return q, upper
def qr_destroy(la):
    """
    Compute a QR decomposition of `la[0]`, consuming the input.

    `la[0]` is deleted from the list and its buffer may be overwritten by
    LAPACK (`overwrite_a=True`), so the caller must not reuse it.

    Returns `(q, r)`, where `q` comes from LAPACK `orgqr` (asserted to be
    Fortran-contiguous) and `r` is the upper triangle of the `geqrf` result.
    """
    dense = numpy.asfortranarray(la[0])
    del la[0], la  # forget the list entry and the list reference
    rows, cols = dense.shape

    # perform q, r = QR(component); code hacked out of scipy.linalg.qr
    logger.debug("computing QR of %s dense matrix" % str(dense.shape))
    lapack_geqrf = get_lapack_funcs(('geqrf',), (dense,))[0]
    # first call (lwork=-1) is a workspace-size query, second call factorizes
    optimal = lapack_geqrf(dense, lwork=-1, overwrite_a=True)[2]
    reflectors, tau, _, info = lapack_geqrf(dense, lwork=optimal[0], overwrite_a=True)
    del dense  # free up mem
    assert info >= 0

    r = triu(reflectors[:cols, :cols])
    if rows < cols:
        # rare case, #features < #topics
        reflectors = reflectors[:, :rows]  # retains fortran order

    lapack_orgqr = get_lapack_funcs(('orgqr',), (reflectors,))[0]
    optimal = lapack_orgqr(reflectors, tau, lwork=-1, overwrite_a=True)[1]
    q, _, info = lapack_orgqr(reflectors, tau, lwork=optimal[0], overwrite_a=True)
    assert info >= 0, "qr failed"
    assert q.flags.f_contiguous
    return q, r
def qr_destroy(la):
    """
    QR-decompose the matrix stored in `la[0]` and destroy the input.

    The matrix is taken out of the list `la`; LAPACK is then free to reuse
    its memory, which is why the content of `la` is lost to the caller.

    Returns a tuple `(q, r)`: `q` is generated by LAPACK `orgqr` and is
    Fortran-contiguous, `r` is the upper-triangular part of the `geqrf`
    output.
    """
    src = numpy.asfortranarray(la[0])
    del la[0], la  # release the list slot and the list reference
    height, width = src.shape

    # perform q, r = QR(component); code hacked out of scipy.linalg.qr
    logger.debug("computing QR of %s dense matrix" % str(src.shape))
    (qr_factor,) = get_lapack_funcs(('geqrf',), (src,))
    # query the workspace size (lwork=-1), then run the factorization proper
    query = qr_factor(src, lwork=-1, overwrite_a=True)
    compact, tau, _, info = qr_factor(src, lwork=query[2][0], overwrite_a=True)
    del src, query  # free up mem
    assert info >= 0

    r = triu(compact[:width, :width])
    if height < width:  # rare case, #features < #topics
        compact = compact[:, :height]  # retains fortran order

    (expand_q,) = get_lapack_funcs(('orgqr',), (compact,))
    query = expand_q(compact, tau, lwork=-1, overwrite_a=True)
    q, _, info = expand_q(compact, tau, lwork=query[1][0], overwrite_a=True)
    assert info >= 0, "qr failed"
    assert q.flags.f_contiguous
    return q, r
def qr_decomposition(la):
    """
    Reduced QR decomposition.

    Factorizes the matrix stored in `la[0]`. The matrix is removed from the
    list `la`, and LAPACK may overwrite its buffer (`overwrite_a=True`), so
    the caller must not rely on the input afterwards.

    A progress banner is printed to standard output.

    Returns the pair `(q, r)`: `q` is built by LAPACK `orgqr` and is
    Fortran-contiguous, `r` is the upper triangle of the `geqrf` output.
    """
    # Single-string call: prints the same text under Python 2 and Python 3
    # (the former `print a, b` statement is a SyntaxError on Python 3).
    print("+" * 100 + " Performing reduced QR decomposition ...")
    a = numpy.asfortranarray(la[0])
    del la[0], la  # drop the list entry and our reference to the list
    m, n = a.shape

    geqrf, = get_lapack_funcs(('geqrf',), (a,))
    # lwork=-1 is a workspace-size query; the second call factorizes
    qr, tau, work, info = geqrf(a, lwork=-1, overwrite_a=True)
    qr, tau, work, info = geqrf(a, lwork=work[0], overwrite_a=True)
    del a  # free the input as early as possible
    assert info >= 0

    r = triu(qr[:n, :n])
    if m < n:
        # fewer rows than columns: orgqr can only build an m x m factor
        qr = qr[:, :m]

    gorgqr, = get_lapack_funcs(('orgqr',), (qr,))
    q, work, info = gorgqr(qr, tau, lwork=-1, overwrite_a=True)
    q, work, info = gorgqr(qr, tau, lwork=work[0], overwrite_a=True)
    assert info >= 0, "qr failed"
    assert q.flags.f_contiguous
    return q, r
def qr_decomposition(la):
    """
    Reduced QR decomposition.

    The matrix to factorize is `la[0]`. It is deleted from the list `la`
    and handed to LAPACK with `overwrite_a=True`, so its content is
    destroyed in the process.

    A progress banner is printed to standard output.

    Returns `(q, r)`, where `q` is the Fortran-contiguous matrix produced by
    LAPACK `orgqr` and `r` is the upper triangle of the `geqrf` result.
    """
    # One pre-joined string keeps the output identical on Python 2 and 3;
    # the old `print a, b` statement does not parse on Python 3.
    banner = "+" * 100 + " Performing reduced QR decomposition ..."
    print(banner)

    a = numpy.asfortranarray(la[0])
    del la[0], la  # forget the caller's list entry and the list itself
    m, n = a.shape

    geqrf, = get_lapack_funcs(('geqrf',), (a,))
    # workspace query first (lwork=-1), then the real factorization
    qr, tau, work, info = geqrf(a, lwork=-1, overwrite_a=True)
    qr, tau, work, info = geqrf(a, lwork=work[0], overwrite_a=True)
    del a  # release the input before building q
    assert info >= 0

    r = triu(qr[:n, :n])
    if m < n:
        # more columns than rows: keep only the leading m reflector columns
        qr = qr[:, :m]

    gorgqr, = get_lapack_funcs(('orgqr',), (qr,))
    q, work, info = gorgqr(qr, tau, lwork=-1, overwrite_a=True)
    q, work, info = gorgqr(qr, tau, lwork=work[0], overwrite_a=True)
    assert info >= 0, "qr failed"
    assert q.flags.f_contiguous
    return q, r
def triu_indices(n, k=0):
    """
    Return the indices of the upper triangle of an (n, n) array.

    `k` is the diagonal offset passed on to `triu` (0 keeps the main
    diagonal). The result is what `numpy.where` returns for the non-zero
    positions of the triangular mask: a tuple of index arrays.
    """
    upper_mask = triu(numpy.ones((n, n), int), k) != 0
    return numpy.where(upper_mask)
def merge(self, other, decay=1.0):
    """
    Merge this Projection with another.

    Content of `other` is destroyed in the process, so pass this function a
    copy if you need it further.

    This is the optimized merge described in algorithm 5.

    Nothing happens when `other.u` is None. When this projection is still
    empty (`self.u is None`), it simply takes copies of `other.u` and
    `other.s`. Otherwise `self.u` and `self.s` are replaced by the merged
    basis and spectrum, truncated by `clipSpectrum(..., self.k)`.

    `decay` multiplies `self.s` before the two spectra are combined.

    Raises ValueError when `self.m` and `other.m` differ.
    """
    if other.u is None:
        # the other projection is empty => do nothing
        return
    if self.u is None:
        # we are empty => result of merge is the other projection, whatever it is
        self.u = other.u.copy()
        self.s = other.s.copy()
        return
    if self.m != other.m:
        raise ValueError("vector space mismatch: update has %s features, expected %s" % (other.m, self.m))
    logger.info("merging projections: %s + %s", str(self.u.shape), str(other.u.shape))
    m, n1, n2 = self.u.shape[0], self.u.shape[1], other.u.shape[1]

    # TODO Maybe keep the bases as elementary reflectors, without
    # forming explicit matrices with gorgqr.
    # The only operation we ever need is basis^T*basis and basis*component.
    # But how to do that in numpy? And is it fast(er)?

    # find component of u2 orthogonal to u1
    # IMPORTANT: keep matrices in suitable order for matrix products; failing to do so gives 8x lower performance :(
    self.u = numpy.asfortranarray(self.u)  # does nothing if input already fortran-order array
    other.u = numpy.asfortranarray(other.u)
    gemm, = get_blas_funcs(('gemm',), (self.u,))
    logger.debug("constructing orthogonal component")
    c = gemm(1.0, self.u, other.u, trans_a=True)
    # other.u <- other.u - self.u * c; the result is written into other.u (overwrite_c)
    gemm(-1.0, self.u, c, beta=1.0, c=other.u, overwrite_c=True)

    # perform q, r = QR(component); code hacked out of scipy.linalg.qr
    logger.debug("computing QR of %s dense matrix", str(other.u.shape))
    geqrf, = get_lapack_funcs(('geqrf',), (other.u,))
    # NOTE: original author's remark -- sometimes segfaults with overwrite_a=True...
    qr, tau, work, info = geqrf(other.u, lwork=-1, overwrite_a=True)  # workspace-size query
    qr, tau, work, info = geqrf(other.u, lwork=work[0], overwrite_a=True)
    del other.u
    assert info >= 0
    r = triu(qr[:n2, :n2])
    if m < n2:  # rare case...
        qr = qr[:, :m]  # retains fortran order
    gorgqr, = get_lapack_funcs(('orgqr',), (qr,))
    q, work, info = gorgqr(qr, tau, lwork=-1, overwrite_a=True)
    q, work, info = gorgqr(qr, tau, lwork=work[0], overwrite_a=True)
    assert info >= 0, "qr failed"
    assert q.flags.f_contiguous

    # find rotation that diagonalizes r
    # `r` is cut out of the m-row `qr`, so it has min(m, n2) rows; the zero
    # block next to it must have the same height or bmat cannot stack them
    # when m < n2.
    k = numpy.bmat([
        [numpy.diag(decay * self.s), c * other.s],
        [matutils.pad(numpy.matrix([]).reshape(0, 0), min(m, n2), n1), r * other.s],
    ])
    logger.debug("computing SVD of %s dense matrix", str(k.shape))
    # TODO *ugly overkill*!! only need first self.k SVD factors... but there is
    # no LAPACK wrapper for partial svd/eigendecomp in numpy :(
    u_k, s_k, _ = numpy.linalg.svd(k, full_matrices=False)

    k = clipSpectrum(s_k, self.k)
    u_k, s_k = u_k[:, :k], s_k[:k]

    # update & rotate current basis U
    logger.debug("updating orthonormal basis U")
    # TODO temporarily creates an extra (m,k) dense array in memory. find a way to avoid this!
    self.u = gemm(1.0, self.u, u_k[:n1])
    gemm(1.0, q, u_k[n1:], beta=1.0, c=self.u, overwrite_c=True)  # u = [u,u']*u_k
    self.s = s_k
def merge(self, other, decay=1.0):
    """
    Merge this Projection with another.

    Content of `other` is destroyed in the process, so pass this function a
    copy if you need it further.

    This is the optimized merge described in algorithm 5.

    Nothing happens when `other.u` is None. When this projection is still
    empty (`self.u is None`), it is initialized from `other`: either by
    copying `other.u`/`other.s`, or, when `other.s` is None, by running an
    SVD over the sparse matrix in `other.u`. Otherwise `self.u` and `self.s`
    are replaced by the merged basis and spectrum, truncated by
    `clipSpectrum(..., self.k)`.

    `decay` multiplies `self.s` before the two spectra are combined.

    Raises ValueError when `self.m` and `other.m` differ, and ImportError
    when the sparse SVD path is taken but `sparsesvd` cannot be imported.
    """
    if other.u is None:
        # the other projection is empty => do nothing
        return
    if self.u is None:
        # we are empty => result of merge is the other projection, whatever it is
        if other.s is None:
            # other.u contains a direct document chunk, not svd => perform svd
            docs = other.u
            assert scipy.sparse.issparse(docs)
            if self.m * self.k < 10000:
                # SVDLIBC gives spurious results for small matrices.. run full
                # LAPACK on them instead
                logger.info("computing dense SVD of %s matrix" % str(docs.shape))
                u, s, vt = numpy.linalg.svd(docs.todense(), full_matrices=False)
            else:
                try:
                    import sparsesvd
                except ImportError:
                    raise ImportError(
                        "for LSA, the `sparsesvd` module is needed but not found; run `easy_install sparsesvd`"
                    )
                logger.info("computing sparse SVD of %s matrix" % str(docs.shape))
                # ask for a few extra factors, because for some reason SVDLIBC
                # sometimes returns fewer factors than requested
                ut, s, vt = sparsesvd.sparsesvd(
                    docs, self.k + 30
                )
                u = ut.T
                del ut
            del vt  # the right singular vectors are not used
            k = clipSpectrum(s, self.k)
            self.u = u[:, :k].copy('F')  # keep the basis in fortran order
            self.s = s[:k]
        else:
            self.u = other.u.copy('F')
            self.s = other.s.copy()
        return
    if self.m != other.m:
        raise ValueError(
            "vector space mismatch: update has %s features, expected %s" %
            (other.m, self.m))
    logger.info("merging projections: %s + %s" % (str(self.u.shape),
                                                  str(other.u.shape)))
    m, n1, n2 = self.u.shape[0], self.u.shape[1], other.u.shape[1]
    if other.s is None:
        # other.u is a raw (sparse) document chunk: densify it and treat its
        # singular values as all ones
        other.u = other.u.todense()
        other.s = 1.0  # broadcasting will promote this to eye(n2) where needed

    # TODO Maybe keep the bases as elementary reflectors, without
    # forming explicit matrices with gorgqr.
    # The only operation we ever need is basis^T*basis and basis*component.
    # But how to do that in numpy? And is it fast(er)?

    # find component of u2 orthogonal to u1
    # IMPORTANT: keep matrices in suitable order for matrix products; failing to do so gives 8x lower performance :(
    self.u = numpy.asfortranarray(
        self.u)  # does nothing if input already fortran-order array
    other.u = numpy.asfortranarray(other.u)
    gemm, = get_blas_funcs(('gemm', ), (self.u, ))
    logger.debug("constructing orthogonal component")
    c = gemm(1.0, self.u, other.u, trans_a=True)
    # other.u <- other.u - self.u * c; the result is written into other.u (overwrite_c)
    gemm(-1.0, self.u, c, beta=1.0, c=other.u, overwrite_c=True)

    # perform q, r = QR(component); code hacked out of scipy.linalg.qr
    logger.debug("computing QR of %s dense matrix" % str(other.u.shape))
    geqrf, = get_lapack_funcs(('geqrf', ), (other.u, ))
    # first call (lwork=-1) is a workspace-size query, the second one factorizes
    qr, tau, work, info = geqrf(
        other.u, lwork=-1,
        overwrite_a=True)  # sometimes segfaults with overwrite_a=True...?
    qr, tau, work, info = geqrf(
        other.u, lwork=work[0],
        overwrite_a=True)  # sometimes segfaults with overwrite_a=True...?
    del other.u
    assert info >= 0
    r = triu(qr[:n2, :n2])
    if m < n2:  # rare case, #features < #topics
        qr = qr[:, :m]  # retains fortran order
    gorgqr, = get_lapack_funcs(('orgqr', ), (qr, ))
    q, work, info = gorgqr(qr, tau, lwork=-1, overwrite_a=True)
    q, work, info = gorgqr(qr, tau, lwork=work[0], overwrite_a=True)
    assert info >= 0, "qr failed"
    assert q.flags.f_contiguous

    # find rotation that diagonalizes r
    # the zero block is min(m, n2) rows high so that it lines up with `r`,
    # which is cut out of the m-row `qr`
    k = numpy.bmat([[numpy.diag(decay * self.s), c * other.s], [
        matutils.pad(
            numpy.matrix([]).reshape(0, 0), min(m, n2), n1), r * other.s
    ]])
    logger.debug("computing SVD of %s dense matrix" % str(k.shape))
    # TODO *ugly overkill*!! only need first self.k SVD factors... but there is
    # no LAPACK wrapper for partial svd/eigendecomp in numpy :(
    u_k, s_k, _ = numpy.linalg.svd(
        k, full_matrices=False
    )

    # from here on `k` is the number of factors to keep, no longer the matrix
    k = clipSpectrum(s_k, self.k)
    u_k, s_k = u_k[:, :k], s_k[:k]

    # update & rotate current basis U
    logger.debug("updating orthonormal basis U")
    # TODO temporarily creates an extra (m,k) dense array in memory. find a way to avoid this!
    self.u = gemm(
        1.0, self.u, u_k[:n1]
    )
    gemm(1.0, q, u_k[n1:], beta=1.0, c=self.u,
         overwrite_c=True)  # u = [u,u']*u_k
    self.s = s_k
def merge(self, other, decay=1.0):
    """
    Merge this Projection with another.

    Content of `other` is destroyed in the process, so pass this function a
    copy if you need it further.

    This is the optimized merge described in algorithm 5.

    If `other.u` is None the call is a no-op. If this projection is empty
    (`self.u is None`) it takes copies of `other.u` and `other.s`. In every
    other case `self.u` and `self.s` are overwritten with the merged basis
    and spectrum, cut to the size chosen by `clipSpectrum(..., self.k)`.

    `decay` is the factor applied to `self.s` before merging.

    Raises ValueError if the feature counts `self.m` and `other.m` differ.
    """
    if other.u is None:
        # the other projection is empty => do nothing
        return
    if self.u is None:
        # we are empty => result of merge is the other projection, whatever it is
        self.u = other.u.copy()
        self.s = other.s.copy()
        return
    if self.m != other.m:
        raise ValueError(
            "vector space mismatch: update has %s features, expected %s" % (other.m, self.m))
    logger.info("merging projections: %s + %s", str(self.u.shape), str(other.u.shape))
    m, n1, n2 = self.u.shape[0], self.u.shape[1], other.u.shape[1]

    # TODO Maybe keep the bases as elementary reflectors, without
    # forming explicit matrices with gorgqr.
    # The only operation we ever need is basis^T*basis and basis*component.
    # But how to do that in numpy? And is it fast(er)?

    # find component of u2 orthogonal to u1
    # IMPORTANT: keep matrices in suitable order for matrix products; failing to do so gives 8x lower performance :(
    self.u = numpy.asfortranarray(self.u)  # does nothing if input already fortran-order array
    other.u = numpy.asfortranarray(other.u)
    gemm, = get_blas_funcs(('gemm',), (self.u,))
    logger.debug("constructing orthogonal component")
    c = gemm(1.0, self.u, other.u, trans_a=True)
    # subtract the part of other.u that lies in span(self.u); written into other.u
    gemm(-1.0, self.u, c, beta=1.0, c=other.u, overwrite_c=True)

    # perform q, r = QR(component); code hacked out of scipy.linalg.qr
    logger.debug("computing QR of %s dense matrix", str(other.u.shape))
    geqrf, = get_lapack_funcs(('geqrf',), (other.u,))
    # NOTE: original author's remark -- sometimes segfaults with overwrite_a=True...
    qr, tau, work, info = geqrf(other.u, lwork=-1, overwrite_a=True)  # workspace-size query
    qr, tau, work, info = geqrf(other.u, lwork=work[0], overwrite_a=True)
    del other.u
    assert info >= 0
    r = triu(qr[:n2, :n2])
    if m < n2:  # rare case...
        qr = qr[:, :m]  # retains fortran order
    gorgqr, = get_lapack_funcs(('orgqr',), (qr,))
    q, work, info = gorgqr(qr, tau, lwork=-1, overwrite_a=True)
    q, work, info = gorgqr(qr, tau, lwork=work[0], overwrite_a=True)
    assert info >= 0, "qr failed"
    assert q.flags.f_contiguous

    # find rotation that diagonalizes r
    # The lower-left zero block must be as tall as `r`. Since `r` is sliced
    # from the m-row `qr`, that is min(m, n2) rows, not n2 -- otherwise the
    # blocks do not line up in the rare m < n2 case.
    zero_block = matutils.pad(numpy.matrix([]).reshape(0, 0), min(m, n2), n1)
    k = numpy.bmat([
        [numpy.diag(decay * self.s), c * other.s],
        [zero_block, r * other.s],
    ])
    logger.debug("computing SVD of %s dense matrix", str(k.shape))
    # TODO *ugly overkill*!! only need first self.k SVD factors... but there is
    # no LAPACK wrapper for partial svd/eigendecomp in numpy :(
    u_k, s_k, _ = numpy.linalg.svd(k, full_matrices=False)

    k = clipSpectrum(s_k, self.k)
    u_k, s_k = u_k[:, :k], s_k[:k]

    # update & rotate current basis U
    logger.debug("updating orthonormal basis U")
    # TODO temporarily creates an extra (m,k) dense array in memory. find a way to avoid this!
    self.u = gemm(1.0, self.u, u_k[:n1])
    gemm(1.0, q, u_k[n1:], beta=1.0, c=self.u, overwrite_c=True)  # u = [u,u']*u_k
    self.s = s_k
def merge(self, other, decay=1.0):
    """
    Merge this Projection with another.

    Content of `other` is destroyed in the process, so pass this function a
    copy if you need it further.

    This is the optimized merge described in algorithm 5.

    Nothing happens when `other.u` is None. When this projection is still
    empty (`self.u is None`), it is initialized from `other`: either by
    copying `other.u`/`other.s`, or, when `other.s` is None, by running an
    SVD over the sparse matrix in `other.u`. Otherwise `self.u` and `self.s`
    are replaced by the merged basis and spectrum, truncated by
    `clipSpectrum(..., self.k)`.

    `decay` multiplies `self.s` before the two spectra are combined.

    Raises ValueError when `self.m` and `other.m` differ, and ImportError
    when the sparse SVD path is taken but `sparsesvd` cannot be imported.
    """
    if other.u is None:
        # the other projection is empty => do nothing
        return
    if self.u is None:
        # we are empty => result of merge is the other projection, whatever it is
        if other.s is None:
            # other.u contains a direct document chunk, not svd => perform svd
            docs = other.u
            assert scipy.sparse.issparse(docs)
            if self.m * self.k < 10000:
                # SVDLIBC gives spurious results for small matrices.. run full
                # LAPACK on them instead
                logger.info("computing dense SVD of %s matrix", str(docs.shape))
                u, s, vt = numpy.linalg.svd(docs.todense(), full_matrices=False)
            else:
                try:
                    import sparsesvd
                except ImportError:
                    raise ImportError("for LSA, the `sparsesvd` module is needed but not found; run `easy_install sparsesvd`")
                logger.info("computing sparse SVD of %s matrix", str(docs.shape))
                # ask for a few extra factors, because for some reason SVDLIBC
                # sometimes returns fewer factors than requested
                ut, s, vt = sparsesvd.sparsesvd(docs, self.k + 30)
                u = ut.T
                del ut
            del vt  # the right singular vectors are not used
            k = clipSpectrum(s, self.k)
            self.u = u[:, :k].copy('F')  # keep the basis in fortran order
            self.s = s[:k]
        else:
            self.u = other.u.copy('F')
            self.s = other.s.copy()
        return
    if self.m != other.m:
        raise ValueError("vector space mismatch: update has %s features, expected %s" % (other.m, self.m))
    logger.info("merging projections: %s + %s", str(self.u.shape), str(other.u.shape))
    m, n1, n2 = self.u.shape[0], self.u.shape[1], other.u.shape[1]
    if other.s is None:
        # other.u is a raw (sparse) document chunk: densify it and treat its
        # singular values as all ones
        other.u = other.u.todense()
        other.s = 1.0  # broadcasting will promote this to eye(n2) where needed

    # TODO Maybe keep the bases as elementary reflectors, without
    # forming explicit matrices with gorgqr.
    # The only operation we ever need is basis^T*basis and basis*component.
    # But how to do that in numpy? And is it fast(er)?

    # find component of u2 orthogonal to u1
    # IMPORTANT: keep matrices in suitable order for matrix products; failing to do so gives 8x lower performance :(
    self.u = numpy.asfortranarray(self.u)  # does nothing if input already fortran-order array
    other.u = numpy.asfortranarray(other.u)
    gemm, = get_blas_funcs(('gemm',), (self.u,))
    logger.debug("constructing orthogonal component")
    c = gemm(1.0, self.u, other.u, trans_a=True)
    # other.u <- other.u - self.u * c; the result is written into other.u (overwrite_c)
    gemm(-1.0, self.u, c, beta=1.0, c=other.u, overwrite_c=True)

    # perform q, r = QR(component); code hacked out of scipy.linalg.qr
    logger.debug("computing QR of %s dense matrix", str(other.u.shape))
    geqrf, = get_lapack_funcs(('geqrf',), (other.u,))
    # NOTE: original author's remark -- sometimes segfaults with overwrite_a=True...?
    qr, tau, work, info = geqrf(other.u, lwork=-1, overwrite_a=True)  # workspace-size query
    qr, tau, work, info = geqrf(other.u, lwork=work[0], overwrite_a=True)
    del other.u
    assert info >= 0
    r = triu(qr[:n2, :n2])
    if m < n2:  # rare case, #features < #topics
        qr = qr[:, :m]  # retains fortran order
    gorgqr, = get_lapack_funcs(('orgqr',), (qr,))
    q, work, info = gorgqr(qr, tau, lwork=-1, overwrite_a=True)
    q, work, info = gorgqr(qr, tau, lwork=work[0], overwrite_a=True)
    assert info >= 0, "qr failed"
    assert q.flags.f_contiguous

    # find rotation that diagonalizes r
    # the zero block is min(m, n2) rows high so that it lines up with `r`,
    # which is cut out of the m-row `qr`
    k = numpy.bmat([
        [numpy.diag(decay * self.s), c * other.s],
        [matutils.pad(numpy.matrix([]).reshape(0, 0), min(m, n2), n1), r * other.s],
    ])
    logger.debug("computing SVD of %s dense matrix", str(k.shape))
    try:
        # in numpy < 1.1.0, running SVD sometimes results in "LinAlgError: SVD did not converge'.
        # for these early versions of numpy, catch the error and try to compute
        # SVD again, but over k*k^T.
        # see http://www.mail-archive.com/[email protected]/msg07224.html and
        # bug ticket http://projects.scipy.org/numpy/ticket/706
        # TODO *ugly overkill*!! only need first self.k SVD factors... but there is
        # no LAPACK wrapper for partial svd/eigendecomp in numpy :(
        u_k, s_k, _ = numpy.linalg.svd(k, full_matrices=False)
    except numpy.linalg.LinAlgError:
        # report through the module logger, like every other message in this method
        logger.error("SVD(A) failed; trying SVD(A * A^T)")
        u_k, s_k, _ = numpy.linalg.svd(numpy.dot(k, k.T), full_matrices=False)  # if this fails too, give up
        s_k = numpy.sqrt(s_k)  # singular values of k*k^T are the squares of those of k

    # from here on `k` is the number of factors to keep, no longer the matrix
    k = clipSpectrum(s_k, self.k)
    u_k, s_k = u_k[:, :k], s_k[:k]

    # update & rotate current basis U
    logger.debug("updating orthonormal basis U")
    # TODO temporarily creates an extra (m,k) dense array in memory. find a way to avoid this!
    self.u = gemm(1.0, self.u, u_k[:n1])
    gemm(1.0, q, u_k[n1:], beta=1.0, c=self.u, overwrite_c=True)  # u = [u,u']*u_k
    self.s = s_k