def ricipolla(self, k, m):
    """
    Compress the current state from ``m`` entries down to the first ``k``.

    The method works purely through side effects on ``self``:

    * ``alpha[0:k]`` receives ``eval[0:k]``;
    * ``beta[0:k]`` receives ``beta[m - 1] * evect[0:k, m - 1]``;
    * a temporary ``class4vect(k, dim)`` container is filled through
      ``class4vect.mat_mult`` with ``evect[0:k, 0:m]`` and ``q``; each of
      its ``k`` entries is normalised and copied into ``q[i]``;
    * ``q[m]`` is copied into ``q[k]``;
    * ``omega`` is updated from the products with ``evect``.

    NOTE(review): presumably the restart step of a Lanczos iteration
    (keeping ``k`` converged pairs) -- confirm against the solver loop.
    """
    kept = slice(0, k)

    self.alpha[kept] = self.eval[kept]
    self.beta[kept] = self.beta[m - 1] * self.evect[kept, m - 1]

    vector_class = self.class4vect
    combined = vector_class(k, self.dim)
    vector_class.mat_mult(combined, self.evect[kept, 0:m], self.q)
    for position in range(k):
        combined[position].normalizzaauto()
        vector_class.copy_to_a_from_b(self.q[position], combined[position])
    # Drop the temporary container before going on
    combined = None

    vector_class.copy_to_a_from_b(self.q[k], self.q[m])

    rotated = dotblas.dot(self.omega[0:m, 0:m].copy(),
                          numpy.transpose(self.evect))
    for position in range(k):
        coupling = rotated[position, k]
        # Keep omega symmetric: write the same value on both sides
        self.omega[position, k] = coupling
        self.omega[k, position] = coupling
    rotated = dotblas.dot(self.evect, rotated)
    self.omega[kept, kept] = rotated[kept, kept]
def Moltiplica(self, res, v):
    """
    Add the product of the stored matrices with ``v`` to ``res``.

    ``self.mR`` is a sequence of matrices.  When it has exactly one entry,
    ``mR[0] . v.vr`` is added to ``res.vr``; otherwise
    ``mR[0] . (mR[1] . v.vr)`` is added.  If ``self.shift`` differs from
    zero, ``res.add_from_vect_with_fact(v, self.shift)`` is called
    afterwards.  Nothing is returned: ``res`` is updated in place.
    """
    factors = self.mR
    if len(factors) != 1:
        # Two-factor form: apply the inner matrix first
        product = dotblas.dot(factors[0], dotblas.dot(factors[1], v.vr))
    else:
        product = dotblas.dot(factors[0], v.vr)
    res.vr[:] += product

    if self.shift == 0.0:
        return
    res.add_from_vect_with_fact(v, self.shift)
def Moltiplica(self, res, v):
    """
    Accumulate ``M . v`` into ``res``, where ``M`` is built from ``self.mR``.

    With a single matrix in ``self.mR`` the contribution is
    ``mR[0] . v.vr``; with more than one it is ``mR[0] . (mR[1] . v.vr)``.
    The contribution is added in place to ``res.vr``.  When ``self.shift``
    is not zero, ``res.add_from_vect_with_fact(v, self.shift)`` is invoked
    after the accumulation.  The method returns nothing.
    """
    matrices = self.mR
    outer = matrices[0]
    if len(matrices) == 1:
        contribution = dotblas.dot(outer, v.vr)
    else:
        # The second matrix acts on the vector before the first one
        inner_result = dotblas.dot(matrices[1], v.vr)
        contribution = dotblas.dot(outer, inner_result)
    res.vr[:] += contribution

    shift = self.shift
    if shift != 0.0:
        res.add_from_vect_with_fact(v, shift)
def ricipolla(self, k, m):
    """
    Reduce the working state from ``m`` entries to its first ``k`` entries.

    Side effects on ``self`` (nothing is returned):

    * ``alpha[0:k]`` is overwritten with ``eval[0:k]``;
    * ``beta[0:k]`` is overwritten with ``beta[m - 1] * evect[0:k, m - 1]``;
    * a temporary ``class4vect(k, dim)`` object is filled by
      ``class4vect.mat_mult`` from ``evect[0:k, 0:m]`` and ``q``, each of
      its ``k`` entries is normalised and then copied into ``q[i]``;
    * ``q[k]`` receives a copy of ``q[m]``;
    * row/column ``k`` and the leading ``k x k`` corner of ``omega`` are
      rebuilt from products with ``evect``.

    NOTE(review): looks like the restart step of a Lanczos solver --
    confirm against the caller.
    """
    head = slice(0, k)

    self.alpha[head] = self.eval[head]
    last_beta = self.beta[m - 1]
    self.beta[head] = last_beta * self.evect[head, m - 1]

    new_vectors = self.class4vect(k, self.dim)
    self.class4vect.mat_mult(new_vectors, self.evect[head, 0:m], self.q)
    for idx in range(k):
        new_vectors[idx].normalizzaauto()
        self.class4vect.copy_to_a_from_b(self.q[idx], new_vectors[idx])
    # Release the temporary storage
    new_vectors = None

    self.class4vect.copy_to_a_from_b(self.q[k], self.q[m])

    omega_block = self.omega[0:m, 0:m].copy()
    omega_block = dotblas.dot(omega_block, numpy.transpose(self.evect))
    for idx in range(k):
        value = omega_block[idx, k]
        # Same value on both sides of the diagonal
        self.omega[idx, k] = value
        self.omega[k, idx] = value
    omega_block = dotblas.dot(self.evect, omega_block)
    self.omega[head, head] = omega_block[head, head]
a = images[i] #a.shape = r, c print("Eigenvalue %d = %f" % (i, eigenvalues[i])) fname = "Image%02d.edf" % (i+10) if os.path.exists(fname): os.remove(fname) edf = EdfFile.EdfFile(fname,'wb') edf.WriteImage({},a) edf = None else: stack = EDFStack.EDFStack(inputfile, imagestack=False, dtype=numpy.float64) r, c, nChannels = stack.data.shape if 0: stack.data.shape = r * c, nChannels t0 = time.time() covMatrix0 = dotblas.dot(stack.data.T, stack.data) print("Standard Elapsed = ", time.time() - t0) print("Standard Shape = ", covMatrix0.shape) t0 = time.time() stack.data.shape = r , c, nChannels covMatrix1, sumSpectrum, nPixels = getCovarianceMatrix(stack, index=-1, dtype='float64', force=True) print("Dynamic Elapsed = ", time.time() - t0) print("Dynamic Shape = ", covMatrix1.shape) print(covMatrix0.max(), covMatrix0.min(), "Reference = ", covMatrix0[1300, 1350:1360]) print(covMatrix1.max(), covMatrix1.min(), "Calculated = ", covMatrix1[1300, 1350:1360]) delta = covMatrix1-covMatrix0 maxDiff = delta.max() print("Max diff = ", maxDiff)
def mat_mult(self, evect, q):
    """
    Store ``evect . q.vr`` in the leading rows of ``self.vr``.

    ``evect`` is converted to ``self.tipo`` before the product.  Only the
    first ``evect.shape[1]`` rows of ``q.vr`` take part in the product and
    only the first ``evect.shape[0]`` rows of ``self.vr`` are overwritten.
    Nothing is returned.
    """
    n_out = evect.shape[0]
    n_in = evect.shape[1]
    coefficients = evect.astype(self.tipo)
    self.vr[:n_out] = dotblas.dot(coefficients, q.vr[:n_in])
def lanczosPCA(stack, ncomponents=10, binning=None, **kw):
    """
    Principal component analysis using the Lanczos eigen-solver.

    :param stack: numpy array of shape (rows, columns, channels) or
        (rows, channels), or an object exposing ``info`` and ``data``
        attributes whose ``data`` is such an array.
    :param ncomponents: number of principal components to calculate.
    :param binning: number of consecutive channels summed together before
        the analysis (None is the same as 1).  The number of channels has
        to be a multiple of it, otherwise ``numpy.reshape`` raises.
    :param kw: accepted for interface compatibility, not used here.
    :returns: tuple ``(images, eigenvalues, vectors)`` where ``images`` has
        shape (ncomponents, rows, columns) and ``vectors`` has shape
        (ncomponents, channels after binning).
    :raises TypeError: if the data are not a numpy array.
    :raises ValueError: if more components than channels are requested.

    NOTE: a 3D input array is reshaped in place to (rows * columns,
    channels) and is left with that shape.  The mean is subtracted in place
    during the calculation and added back before returning.
    """
    if DEBUG:
        print("lanczosPCA")
    if binning is None:
        binning = 1

    if hasattr(stack, "info") and hasattr(stack, "data"):
        data = stack.data
    else:
        data = stack

    if not isinstance(data, numpy.ndarray):
        raise TypeError("lanczosPCA is only supported when using numpy arrays")

    # "single": build the covariance matrix once (one product per iteration)
    # "double": keep data.T and data (two products per iteration)
    wrapmatrix = "single"

    dtype = numpy.float64

    if wrapmatrix == "double":
        data = data.astype(dtype)

    if len(data.shape) == 3:
        r, c, N = data.shape
        data.shape = r * c, N
    else:
        r, N = data.shape
        c = 1

    npixels = r * c

    if binning > 1:
        # data.shape may fail with non-contiguous arrays, use reshape.
        # Floor division keeps the dimensions integer: a true division
        # yields floats that numpy.reshape and numpy.zeros reject.
        data = numpy.reshape(data,
                             [data.shape[0], data.shape[1] // binning, binning])
        data = numpy.sum(data, axis=-1)
        N //= binning

    if ncomponents > N:
        raise ValueError("Number of components too high.")

    avg = numpy.sum(data, 0) / (1.0 * npixels)
    numpy.subtract(data, avg, data)

    Lanczos.LanczosNumericMatrix.tipo = dtype
    Lanczos.LanczosNumericVector.tipo = dtype

    if wrapmatrix == "single":
        SM = [dotblas.dot(data.T, data).astype(dtype)]
        SM = Lanczos.LanczosNumericMatrix(SM)
    else:
        SM = Lanczos.LanczosNumericMatrix([data.T.astype(dtype),
                                           data.astype(dtype)])

    eigenvalues, eigenvectors = Lanczos.solveEigenSystem(SM,
                                                         ncomponents,
                                                         shift=0.0,
                                                         tol=1.0e-15)
    SM = None
    # undo the in-place mean subtraction
    numpy.add(data, avg, data)

    images = numpy.zeros((ncomponents, npixels), data.dtype)
    vectors = numpy.zeros((ncomponents, N), dtype)
    for i in range(ncomponents):
        vectors[i, :] = eigenvectors[i].vr
        images[i, :] = dotblas.dot(data,
                                   (eigenvectors[i].vr).astype(data.dtype))
    data = None
    images.shape = ncomponents, r, c
    return images, eigenvalues, vectors
def multipleArrayPCA(stackList, ncomponents=10, binning=None, **kw):
    """
    Given a list of arrays, calculate the requested principal components from
    the matrix resulting from their column concatenation. Therefore, all the
    input arrays must have the same number of rows.

    :param stackList: list of numpy arrays, or of objects exposing ``info``
        and ``data`` attributes (their ``data`` is used).  Arrays are 3D
        (rows, columns, channels) or 2D (rows, channels).
    :param ncomponents: number of principal components to calculate.
    :param binning: accepted for interface compatibility, not used here.
    :param kw: accepted for interface compatibility, not used here.
    :returns: tuple ``(images, eigenvalues, eigenvectors)``; ``images`` has
        shape (ncomponents, rows, columns) (columns is 1 for 2D input) and
        ``eigenvectors`` has shape (ncomponents, total number of channels).
    :raises TypeError: if the first entry does not hold a numpy array.

    The input arrays are flattened and mean-subtracted in place while the
    calculation runs.  Their shape and values are restored before returning,
    also when the calculation raises.
    """
    # Work on the underlying arrays of every entry, not only the first one
    arrays = []
    for stack in stackList:
        if hasattr(stack, "info") and hasattr(stack, "data"):
            arrays.append(stack.data)
        else:
            arrays.append(stack)

    data = arrays[0]
    if not isinstance(data, numpy.ndarray):
        raise TypeError("multipleArrayPCA is only supported when using numpy arrays")

    if len(data.shape) == 3:
        r, c = data.shape[:2]
        npixels = r * c
    else:
        c = None
        r = data.shape[0]
        npixels = r

    shapeList = []
    avgList = []
    try:
        # reshape and subtract mean to all the input data
        for array in arrays:
            shapeList.append(array.shape)
            array.shape = npixels, -1
            avg = numpy.sum(array, 0) / (1.0 * npixels)
            numpy.subtract(array, avg, array)
            avgList.append(avg)

        # first column of each array inside the concatenated matrix
        offsets = [0]
        for shape in shapeList:
            offsets.append(offsets[-1] + shape[-1])
        eigenvectorLength = offsets[-1]

        # create the needed storage space for the covariance matrix
        covMatrix = numpy.zeros((eigenvectorLength, eigenvectorLength),
                                numpy.float32)
        for i in range(len(arrays)):
            rows = slice(offsets[i], offsets[i + 1])
            for j in range(i, len(arrays)):
                cols = slice(offsets[j], offsets[j + 1])
                covMatrix[rows, cols] = dotblas.dot(arrays[i].T, arrays[j])
                if i != j:
                    # the matrix is symmetric: mirror the stored block
                    covMatrix[cols, rows] = covMatrix[rows, cols].T

        # I have the covariance matrix, calculate the eigenvectors and
        # eigenvalues
        covMatrix = Lanczos.LanczosNumericMatrix([covMatrix])
        eigenvalues, evectors = Lanczos.solveEigenSystem(covMatrix,
                                                         ncomponents,
                                                         shift=0.0,
                                                         tol=1.0e-15)
        covMatrix = None

        images = numpy.zeros((ncomponents, npixels), numpy.float32)
        eigenvectors = numpy.zeros((ncomponents, eigenvectorLength),
                                   numpy.float32)
        for i in range(ncomponents):
            eigenvectors[i, :] = evectors[i].vr
            for j, array in enumerate(arrays):
                images[i, :] += dotblas.dot(
                    array, eigenvectors[i, offsets[j]:offsets[j + 1]])
    finally:
        # restore shapes and values of whatever was modified
        for i, shape in enumerate(shapeList):
            if i < len(avgList):
                numpy.add(arrays[i], avgList[i], arrays[i])
            arrays[i].shape = shape

    if c is None:
        images.shape = ncomponents, r, 1
    else:
        images.shape = ncomponents, r, c

    return images, eigenvalues, eigenvectors
def lanczosPCA2(stack, ncomponents=10, binning=None, **kw):
    """
    This is a fast method, but it may lose information.

    Groups of neighbouring pixels are summed before the Lanczos solver is
    run on the reduced data; the full data set is then projected on the
    ``ncomponents + 5`` vectors obtained and a small eigen-problem is
    solved with ``numpy.linalg.eigh``.

    :param stack: numpy array of shape (rows, columns, channels), or an
        object exposing ``info`` and ``data`` attributes holding one.
    :param ncomponents: number of principal components to return.
    :param binning: accepted for interface compatibility, not used; the
        pixel binning is chosen from the number of pixels.
    :param kw: accepted for interface compatibility, not used here.
    :returns: tuple ``(images, evals, vectors)``; ``images`` has shape
        (ncomponents, rows, columns), ``evals`` are the eigenvalues returned
        by ``numpy.linalg.eigh`` (ascending order) and ``vectors`` has shape
        (ncomponents, channels).
    :raises TypeError: if the data are not a numpy array.

    NOTE: the input array is reshaped in place to (rows * columns, channels)
    and is left with that shape.
    """
    if hasattr(stack, "info") and hasattr(stack, "data"):
        data = stack.data
    else:
        data = stack

    # check we have received a numpy.ndarray and not an HDF5 group
    # or other type of dynamically loaded data
    if not isinstance(data, numpy.ndarray):
        raise TypeError(\
            "lanczosPCA2 is only supported when using numpy arrays")
    r, c, N = data.shape

    npixels = r * c  # number of pixels
    data.shape = r * c, N

    # Single chain: a separate leading "if" made the value 2 unreachable
    if npixels < 2000:
        BINNING = 2
    elif npixels < 5000:
        BINNING = 4
    elif npixels < 10000:
        BINNING = 8
    elif npixels < 20000:
        BINNING = 10
    elif npixels < 30000:
        BINNING = 15
    elif npixels < 60000:
        BINNING = 20
    else:
        BINNING = 30

    dataorig = data
    # Pixels that do not fill a complete group are left out of the sum.
    # numpy.reshape works on a new view, so dataorig keeps its 2D shape
    # (an in-place shape assignment would also reshape dataorig whenever
    # npixels is a multiple of BINNING).
    nGroups = npixels // BINNING
    data = numpy.reshape(data[0:BINNING * nGroups, :], (nGroups, BINNING, N))
    data = numpy.swapaxes(data, 1, 2)
    data = numpy.sum(data, axis=-1)

    tipo = numpy.float64
    neig = ncomponents + 5

    # "doppia": it does not create the covariance matrix but performs two
    #           multiplications
    # "singola": it creates the covariance matrix but performs only one
    #            multiplication
    rappmatrix = "singola"

    # compute the mean
    mediadata = numpy.sum(data, axis=0) / numpy.array([len(data)], data.dtype)

    numpy.subtract(data, mediadata, data)

    Lanczos.LanczosNumericMatrix.tipo = tipo
    Lanczos.LanczosNumericVector.tipo = tipo

    if rappmatrix == "singola":
        SM = [dotblas.dot(data.T, data).astype(tipo)]
        SM = Lanczos.LanczosNumericMatrix(SM)
    else:
        SM = Lanczos.LanczosNumericMatrix(
            [data.T.astype(tipo), data.astype(tipo)])

    # calculate eigenvalues and eigenvectors
    ev, eve = Lanczos.solveEigenSystem(SM, neig, shift=0.0, tol=1.0e-7)
    SM = None
    data = None

    # project the complete (not binned) data on the vectors found
    newmat = numpy.zeros((r * c, neig), numpy.float64)
    for i in range(neig):
        newmat[:, i] = dotblas.dot(dataorig,
                                   (eve[i].vr).astype(dataorig.dtype))

    newcov = dotblas.dot(newmat.T, newmat)
    evals, evects = numpy.linalg.eigh(newcov)

    nuovispettri = dotblas.dot(evects, eve.vr[:neig])
    images = numpy.zeros((ncomponents, npixels), tipo)
    vectors = numpy.zeros((ncomponents, N), tipo)
    for i in range(ncomponents):
        # eigh sorts the eigenvalues in ascending order: start from the end
        vectors[i, :] = nuovispettri[-1 - i, :]
        images[i, :] = dotblas.dot(newmat,
                                   evects[-1 - i].astype(dataorig.dtype))
    images.shape = ncomponents, r, c
    return images, evals, vectors
def multipleArrayPCA(stackList0, ncomponents=10, binning=None, legacy=True, **kw):
    """
    Given a list of arrays, calculate the requested principal components from
    the matrix resulting from their column concatenation. Therefore, all the
    input arrays must have the same number of rows.

    :param stackList0: list of numpy arrays, or of objects exposing ``info``
        and ``data`` attributes (their ``data`` is used).  Arrays are 3D
        (rows, columns, channels) or 2D (rows, channels).
    :param ncomponents: number of principal components to calculate.
    :param binning: accepted for interface compatibility, not used here.
    :param legacy: if True return a tuple, otherwise a dictionary.
    :param kw: accepted for interface compatibility, not used here.
    :returns: ``(images, eigenvalues, eigenvectors)`` if ``legacy`` is True,
        else a dictionary with the keys "scores", "eigenvalues",
        "eigenvectors", "average", "pixels" and "variance".  ``images`` has
        shape (ncomponents, rows, columns) (columns is 1 for 2D input).
    :raises TypeError: if the first entry does not hold a numpy array.

    The input arrays are flattened and mean-subtracted in place while the
    calculation runs.  Their shape and values are restored before returning,
    also when the calculation raises (for instance when more components
    than available eigenvectors are requested).
    """
    stackList = []
    for stack in stackList0:
        if hasattr(stack, "info") and hasattr(stack, "data"):
            stackList.append(stack.data)
        else:
            stackList.append(stack)

    data = stackList[0]
    if not isinstance(data, numpy.ndarray):
        raise TypeError(\
            "multipleArrayPCA is only supported when using numpy arrays")

    if len(data.shape) == 3:
        r, c = data.shape[:2]
        npixels = r * c
    else:
        c = None
        r = data.shape[0]
        npixels = r

    shapeList = []
    avgList = []
    try:
        #reshape and subtract mean to all the input data
        for array in stackList:
            shapeList.append(array.shape)
            array.shape = npixels, -1
            avg = numpy.sum(array, 0) / (1.0 * npixels)
            numpy.subtract(array, avg, array)
            avgList.append(avg)

        #first column of each array inside the concatenated matrix
        offsets = [0]
        for shape in shapeList:
            offsets.append(offsets[-1] + shape[-1])
        eigenvectorLength = offsets[-1]

        #create the needed storage space for the covariance matrix
        covMatrix = numpy.zeros((eigenvectorLength, eigenvectorLength),
                                numpy.float32)
        for i in range(len(stackList)):
            rows = slice(offsets[i], offsets[i + 1])
            for j in range(i, len(stackList)):
                cols = slice(offsets[j], offsets[j + 1])
                covMatrix[rows, cols] = dotblas.dot(stackList[i].T,
                                                    stackList[j])
                if i != j:
                    #the matrix is symmetric: mirror the stored block
                    covMatrix[cols, rows] = covMatrix[rows, cols].T

        #I have the covariance matrix, calculate the eigenvectors and
        #eigenvalues
        totalVariance = numpy.diag(covMatrix).sum()
        evalues, evectors = numpy.linalg.eigh(covMatrix)
        covMatrix = None
        print("Total Variance = ", totalVariance.sum())

        images = numpy.zeros((ncomponents, npixels), numpy.float32)
        eigenvectors = numpy.zeros((ncomponents, eigenvectorLength),
                                   numpy.float32)
        eigenvalues = numpy.zeros((ncomponents, ), numpy.float32)

        #largest eigenvalues first
        ranked = sorted(((value, idx) for idx, value in enumerate(evalues)),
                        reverse=True)
        totalExplainedVariance = 0.0
        for i0 in range(ncomponents):
            i = ranked[i0][1]
            eigenvalues[i0] = evalues[i]
            partialExplainedVariance = 100. * evalues[i] / \
                                       totalVariance
            print("PC%02d Explained variance %.5f %% " %\
                  (i0 + 1, partialExplainedVariance))
            totalExplainedVariance += partialExplainedVariance
            #eigh returns the eigenvectors as columns
            eigenvectors[i0, :] = evectors[:, i]
        print("Total explained variance = %.2f %% " % totalExplainedVariance)

        #project the centered data on the selected eigenvectors
        for i in range(ncomponents):
            for j, array in enumerate(stackList):
                images[i, :] += dotblas.dot(
                    array, eigenvectors[i, offsets[j]:offsets[j + 1]])
    finally:
        #restore shapes and values of whatever was modified
        for i, shape in enumerate(shapeList):
            if i < len(avgList):
                numpy.add(stackList[i], avgList[i], stackList[i])
            stackList[i].shape = shape

    if c is None:
        images.shape = ncomponents, r, 1
    else:
        images.shape = ncomponents, r, c

    if legacy:
        return images, eigenvalues, eigenvectors
    else:
        return {"scores": images,
                "eigenvalues": eigenvalues,
                "eigenvectors": eigenvectors,
                "average": avgList,
                "pixels": npixels,
                "variance": totalVariance}
def multipleArrayPCA(stackList0, ncomponents=10, binning=None, legacy=True, scale=False, **kw):
    """
    Given a list of arrays, calculate the requested principal components from
    the matrix resulting from their column concatenation. Therefore, all the
    input arrays must have the same number of rows.

    :param stackList0: list of numpy arrays, or of objects exposing ``info``
        and ``data`` attributes (their ``data`` is used).  Arrays are 3D
        (rows, columns, channels) or 2D (rows, channels).
    :param ncomponents: number of principal components to calculate.
    :param binning: accepted for interface compatibility, not used here.
    :param legacy: if True return a tuple, otherwise a dictionary.
    :param scale: if True, rows and columns of the covariance matrix are
        divided by the standard deviations (correlation matrix) before the
        diagonalization.
    :param kw: accepted for interface compatibility, not used here.
    :returns: ``(images, eigenvalues, eigenvectors)`` if ``legacy`` is True,
        else a dictionary with the keys "scores", "eigenvalues",
        "eigenvectors", "average", "pixels" and "variance" (the sum of all
        the eigenvalues).  ``images`` has shape (ncomponents, rows, columns)
        (columns is 1 for 2D input).
    :raises TypeError: if the first entry does not hold a numpy array.

    The input arrays are flattened and mean-subtracted in place while the
    calculation runs.  Their shape and values are restored before returning,
    also when the calculation raises (for instance when more components
    than available eigenvectors are requested).
    """
    stackList = []
    for stack in stackList0:
        if hasattr(stack, "info") and hasattr(stack, "data"):
            stackList.append(stack.data)
        else:
            stackList.append(stack)

    data = stackList[0]
    if not isinstance(data, numpy.ndarray):
        raise TypeError(\
            "multipleArrayPCA is only supported when using numpy arrays")

    if len(data.shape) == 3:
        r, c = data.shape[:2]
        npixels = r * c
    else:
        c = None
        r = data.shape[0]
        npixels = r

    shapeList = []
    avgList = []
    try:
        #reshape and subtract mean to all the input data
        for array in stackList:
            shapeList.append(array.shape)
            array.shape = npixels, -1
            avg = numpy.sum(array, 0) / (1.0 * npixels)
            numpy.subtract(array, avg, array)
            avgList.append(avg)

        #first column of each array inside the concatenated matrix
        offsets = [0]
        for shape in shapeList:
            offsets.append(offsets[-1] + shape[-1])
        eigenvectorLength = offsets[-1]

        #create the needed storage space for the covariance matrix
        covMatrix = numpy.zeros((eigenvectorLength, eigenvectorLength),
                                numpy.float32)
        for i in range(len(stackList)):
            rows = slice(offsets[i], offsets[i + 1])
            for j in range(i, len(stackList)):
                cols = slice(offsets[j], offsets[j + 1])
                covMatrix[rows, cols] = \
                    dotblas.dot(stackList[i].T, stackList[j])/(npixels-1)
                if i != j:
                    #the matrix is symmetric: mirror the stored block
                    covMatrix[cols, rows] = covMatrix[rows, cols].T

        #I have the covariance matrix, calculate the eigenvectors and
        #eigenvalues
        totalVariance = numpy.array(numpy.diag(covMatrix), copy=True)
        # use the correlation matrix if required
        if scale:
            #option to normalize to unit standard deviation
            for i in range(covMatrix.shape[0]):
                if totalVariance[i] > 0:
                    standardDeviation = numpy.sqrt(totalVariance[i])
                    covMatrix[i, :] /= standardDeviation
                    covMatrix[:, i] /= standardDeviation

        totalVariance = numpy.diag(covMatrix).sum()
        evalues, evectors = numpy.linalg.eigh(covMatrix)
        covMatrix = None
        _logger.info("Total Variance = %s", totalVariance)

        # The total variance should also be the sum of all the eigenvalues
        calculatedTotalVariance = evalues.sum()
        if abs(totalVariance - calculatedTotalVariance) > \
               (0.0001 * calculatedTotalVariance):
            _logger.warning("Discrepancy on total variance")
            _logger.warning("Variance from matrix = %s", totalVariance)
            _logger.warning("Variance from sum of eigenvalues = %s",
                            calculatedTotalVariance)

        images = numpy.zeros((ncomponents, npixels), numpy.float32)
        eigenvectors = numpy.zeros((ncomponents, eigenvectorLength),
                                   numpy.float32)
        eigenvalues = numpy.zeros((ncomponents, ), numpy.float32)

        #largest eigenvalues first
        ranked = sorted(((value, idx) for idx, value in enumerate(evalues)),
                        reverse=True)
        totalExplainedVariance = 0.0
        for i0 in range(ncomponents):
            i = ranked[i0][1]
            eigenvalues[i0] = evalues[i]
            partialExplainedVariance = 100. * evalues[i] / \
                                       calculatedTotalVariance
            _logger.info("PC%02d Explained variance %.5f %% ",
                         i0 + 1, partialExplainedVariance)
            totalExplainedVariance += partialExplainedVariance
            #eigh returns the eigenvectors as columns
            eigenvectors[i0, :] = evectors[:, i]
        _logger.info("Total explained variance = %.2f %% ",
                     totalExplainedVariance)

        # figure out if eigenvectors are to be multiplied by -1
        for i0 in range(ncomponents):
            if eigenvectors[i0].sum() < 0.0:
                _logger.info("PC%02d multiplied by -1", i0)
                eigenvectors[i0] *= -1

        #project the centered data on the selected eigenvectors
        for i in range(ncomponents):
            for j, array in enumerate(stackList):
                images[i, :] += dotblas.dot(
                    array, eigenvectors[i, offsets[j]:offsets[j + 1]])
    finally:
        #restore shapes and values of whatever was modified
        for i, shape in enumerate(shapeList):
            if i < len(avgList):
                numpy.add(stackList[i], avgList[i], stackList[i])
            stackList[i].shape = shape

    if c is None:
        images.shape = ncomponents, r, 1
    else:
        images.shape = ncomponents, r, c

    if legacy:
        return images, eigenvalues, eigenvectors
    else:
        return {"scores": images,
                "eigenvalues": eigenvalues,
                "eigenvectors": eigenvectors,
                "average": avgList,
                "pixels": npixels,
                "variance": calculatedTotalVariance}
def multipleArrayPCA(stackList0, ncomponents=10, binning=None, legacy=True, **kw):
    """
    Given a list of arrays, calculate the requested principal components from
    the matrix resulting from their column concatenation. Therefore, all the
    input arrays must have the same number of rows.

    :param stackList0: list of numpy arrays, or of objects exposing ``info``
        and ``data`` attributes (their ``data`` is used).  Arrays are 3D
        (rows, columns, channels) or 2D (rows, channels).
    :param ncomponents: number of principal components to calculate.
    :param binning: accepted for interface compatibility, not used here.
    :param legacy: if True return a tuple, otherwise a dictionary.
    :param kw: accepted for interface compatibility, not used here.
    :returns: ``(images, eigenvalues, eigenvectors)`` if ``legacy`` is True,
        else a dictionary with the keys "scores", "eigenvalues",
        "eigenvectors", "average", "pixels" and "variance".  ``images`` has
        shape (ncomponents, rows, columns) (columns is 1 for 2D input).
    :raises TypeError: if the first entry does not hold a numpy array.

    The input arrays are flattened and mean-subtracted in place while the
    calculation runs.  Their shape and values are restored before returning,
    also when the calculation raises (for instance when more components
    than available eigenvectors are requested).
    """
    stackList = []
    for stack in stackList0:
        if hasattr(stack, "info") and hasattr(stack, "data"):
            stackList.append(stack.data)
        else:
            stackList.append(stack)

    data = stackList[0]
    if not isinstance(data, numpy.ndarray):
        raise TypeError(\
            "multipleArrayPCA is only supported when using numpy arrays")

    if len(data.shape) == 3:
        r, c = data.shape[:2]
        npixels = r * c
    else:
        c = None
        r = data.shape[0]
        npixels = r

    shapeList = []
    avgList = []
    try:
        #reshape and subtract mean to all the input data
        for array in stackList:
            shapeList.append(array.shape)
            array.shape = npixels, -1
            avg = numpy.sum(array, 0) / (1.0 * npixels)
            numpy.subtract(array, avg, array)
            avgList.append(avg)

        #column range of every array inside the concatenated matrix
        columnRanges = []
        eigenvectorLength = 0
        for shape in shapeList:
            columnRanges.append(slice(eigenvectorLength,
                                      eigenvectorLength + shape[-1]))
            eigenvectorLength += shape[-1]

        #create the needed storage space for the covariance matrix
        covMatrix = numpy.zeros((eigenvectorLength, eigenvectorLength),
                                numpy.float32)
        for i, rows in enumerate(columnRanges):
            for j in range(i, len(stackList)):
                cols = columnRanges[j]
                covMatrix[rows, cols] = dotblas.dot(stackList[i].T,
                                                    stackList[j])
                if i != j:
                    #the matrix is symmetric: mirror the stored block
                    covMatrix[cols, rows] = covMatrix[rows, cols].T

        #I have the covariance matrix, calculate the eigenvectors and
        #eigenvalues
        totalVariance = numpy.diag(covMatrix).sum()
        evalues, evectors = numpy.linalg.eigh(covMatrix)
        covMatrix = None
        print("Total Variance = ", totalVariance.sum())

        images = numpy.zeros((ncomponents, npixels), numpy.float32)
        eigenvectors = numpy.zeros((ncomponents, eigenvectorLength),
                                   numpy.float32)
        eigenvalues = numpy.zeros((ncomponents,), numpy.float32)

        #largest eigenvalues first
        ranked = sorted(((value, idx) for idx, value in enumerate(evalues)),
                        reverse=True)
        totalExplainedVariance = 0.0
        for i0 in range(ncomponents):
            i = ranked[i0][1]
            eigenvalues[i0] = evalues[i]
            partialExplainedVariance = 100. * evalues[i] / \
                                       totalVariance
            print("PC%02d Explained variance %.5f %% " %\
                  (i0 + 1, partialExplainedVariance))
            totalExplainedVariance += partialExplainedVariance
            #eigh returns the eigenvectors as columns
            eigenvectors[i0, :] = evectors[:, i]
        print("Total explained variance = %.2f %% " % totalExplainedVariance)

        #project the centered data on the selected eigenvectors
        for i in range(ncomponents):
            for array, cols in zip(stackList, columnRanges):
                images[i, :] += dotblas.dot(array, eigenvectors[i, cols])
    finally:
        #restore shapes and values of whatever was modified
        for i, shape in enumerate(shapeList):
            if i < len(avgList):
                numpy.add(stackList[i], avgList[i], stackList[i])
            stackList[i].shape = shape

    if c is None:
        images.shape = ncomponents, r, 1
    else:
        images.shape = ncomponents, r, c

    if legacy:
        return images, eigenvalues, eigenvectors
    else:
        return {"scores": images,
                "eigenvalues": eigenvalues,
                "eigenvectors": eigenvectors,
                "average": avgList,
                "pixels": npixels,
                "variance": totalVariance}
print("Eigenvalue %d = %f" % (i, eigenvalues[i])) fname = "Image%02d.edf" % (i + 10) if os.path.exists(fname): os.remove(fname) edf = EdfFile.EdfFile(fname, 'wb') edf.WriteImage({}, a) edf = None else: stack = EDFStack.EDFStack(inputfile, imagestack=False, dtype=numpy.float64) r, c, nChannels = stack.data.shape if 0: stack.data.shape = r * c, nChannels t0 = time.time() covMatrix0 = dotblas.dot(stack.data.T, stack.data) print("Standard Elapsed = ", time.time() - t0) print("Standard Shape = ", covMatrix0.shape) t0 = time.time() stack.data.shape = r, c, nChannels covMatrix1, sumSpectrum, nPixels = getCovarianceMatrix( stack, index=-1, dtype='float64', force=True) print("Dynamic Elapsed = ", time.time() - t0) print("Dynamic Shape = ", covMatrix1.shape) print(covMatrix0.max(), covMatrix0.min(), "Reference = ", covMatrix0[1300, 1350:1360]) print(covMatrix1.max(), covMatrix1.min(), "Calculated = ", covMatrix1[1300, 1350:1360]) delta = covMatrix1 - covMatrix0 maxDiff = delta.max() print("Max diff = ", maxDiff)
def getCovarianceMatrix(stack,
                        index=-1,
                        binning=None,
                        dtype=numpy.float64,
                        force=True,
                        center=True,
                        weights=None,
                        spatial_mask=None):
    """
    Calculate the covariance matrix of a stack along its "1D" dimension.

    :param stack: array-like object (numpy array or dynamically loaded
        dataset supporting slicing and ``shape``) or an object exposing
        ``info`` and ``data`` attributes holding one.
    :param index: position of the dimension spanning the covariance matrix
        (channels if last, images if first).  One of 0, -1 or the index of
        the last dimension.
    :param binning: sampling step along that dimension: one element out of
        ``binning`` is used (None is the same as 1).
    :param dtype: type of the covariance matrix allocated by the chunked
        calculation.
    :param force: if True (default) the chunked, memory-saving calculation
        is always used.  If False and the data are a numpy array, a single
        matrix product is used instead.
    :param center: if True the average is subtracted (covariance matrix),
        otherwise only the normalized product is returned.
    :param weights: optional weights multiplying each element of the 1D
        dimension.  Default is all ones.
    :param spatial_mask: optional mask of the pixels to be considered.
        Pixels where the mask is lower than 1 are set to zero and the sum
        of the mask is used as number of pixels.  It allows to calculate
        the covariance matrix of a subset or to deal with non finite data
        (NaN, +inf, -inf, ...).  The calling program should set the mask.
    :returns: tuple ``(covMatrix, average, usedPixels)`` where ``average``
        is the sum along the pixels divided by ``usedPixels``.
    :raises IndexError: if ``index`` is not one of the supported values.
    :raises ValueError: in the chunked calculation with ``index`` 0 when the
        binning is not handled.

    NOTE(review): with ``index`` 0 the chunked calculation divides by
    ``usedPixels`` while every other path divides by ``usedPixels - 1``.
    NOTE(review): the chunked calculation with ``index`` 0 and
    ``binning > 1`` mixes image indices and matrix indices and looks
    unreliable -- to be confirmed before relying on it.
    """
    #recover the actual data to work with
    if hasattr(stack, "info") and hasattr(stack, "data"):
        #we are dealing with a PyMca data object
        data = stack.data
    else:
        data = stack

    oldShape = data.shape
    if index not in [0, -1, len(oldShape) - 1]:
        data = None
        raise IndexError("1D index must be one of 0, -1 or %d" %
                         (len(oldShape) - 1))

    if index < 0:
        actualIndex = len(oldShape) + index
    else:
        actualIndex = index

    #the number of spatial pixels
    nPixels = 1
    for i in range(len(oldShape)):
        if i != actualIndex:
            nPixels *= oldShape[i]

    #the starting number of channels or of images
    N = oldShape[actualIndex]

    # our binning (better said sampling) is spectral, in order not to
    # affect the spatial resolution
    if binning is None:
        binning = 1

    if weights is None:
        # numpy.float was just an alias of the builtin float (float64) and
        # is not available anymore in recent numpy versions
        weights = numpy.ones(N, numpy.float64)

    if spatial_mask is not None:
        cleanMask = spatial_mask[:].reshape(nPixels)
        usedPixels = cleanMask.sum()
        # badMask is only used for indexing: it has to be boolean, an
        # integer array would select the elements 0 and 1 instead
        badMask = numpy.array(spatial_mask < 1, dtype=bool)
        badMask.shape = nPixels
    else:
        cleanMask = None
        usedPixels = nPixels

    nChannels = int(N / binning)

    cleanWeights = weights[::binning]

    #end of checking part
    eigenvectorLength = nChannels

    if (not force) and isinstance(data, numpy.ndarray):
        if DEBUG:
            print("Memory consuming calculation")
        #make a direct calculation (memory consuming)
        #take a view to the data
        dataView = data[:]
        if index in [0]:
            #reshape the view to allow the matrix multiplication
            dataView.shape = -1, nPixels
            cleanWeights.shape = -1, 1
            dataView = dataView[::binning] * cleanWeights
            if cleanMask is not None:
                dataView[:, badMask] = 0
            sumSpectrum = dataView.sum(axis=1, dtype=numpy.float64)
            #and return the standard covariance matrix as a matrix product
            covMatrix = dotblas.dot(dataView, dataView.T)\
                / float(usedPixels - 1)
        else:
            #the last index
            dataView.shape = nPixels, -1
            cleanWeights.shape = 1, -1
            dataView = dataView[:, ::binning] * cleanWeights
            if cleanMask is not None:
                cleanMask.shape = -1
                dataView[badMask] = 0
            sumSpectrum = dataView.sum(axis=0, dtype=numpy.float64)
            #and return the standard covariance matrix as a matrix product
            covMatrix = dotblas.dot(dataView.T, dataView)\
                / float(usedPixels - 1)
        if center:
            averageMatrix = numpy.outer(sumSpectrum, sumSpectrum)\
                / (usedPixels * (usedPixels - 1))
            covMatrix -= averageMatrix
            averageMatrix = None
        return covMatrix, sumSpectrum / usedPixels, usedPixels

    #we are dealing with dynamically loaded data
    if DEBUG:
        print("DYNAMICALLY LOADED DATA")
    #create the needed storage space for the covariance matrix
    try:
        covMatrix = numpy.zeros((eigenvectorLength, eigenvectorLength),
                                dtype=dtype)
        sumSpectrum = numpy.zeros((eigenvectorLength,), numpy.float64)
    except:
        #make sure no reference to the original input data is kept
        #before propagating the error
        cleanWeights = None
        covMatrix = None
        averageMatrix = None
        data = None
        raise

    #workaround a problem with h5py
    try:
        if actualIndex in [0]:
            testException = data[0:1]
        else:
            if len(data.shape) == 2:
                testException = data[0:1, -1]
            elif len(data.shape) == 3:
                testException = data[0:1, 0:1, -1]
    except AttributeError:
        txt = "%s" % type(data)
        if 'h5py' in txt:
            print("Implementing h5py workaround")
            import h5py
            data = h5py.Dataset(data.id)
        else:
            raise

    if actualIndex in [0]:
        #divider is used to decide the fraction of images to keep in memory
        #in order to limit file access on dynamically loaded data.
        #Since two chunks of the same size are used, the amount of memory
        #needed is twice the data size divided by the divider.
        #For instance, divider = 10 implies the data to be read 5.5 times
        #from disk while having a memory footprint of about one fifth of
        #the dataset size.
        step = 0
        divider = 10
        while step < 1:
            step = int(oldShape[index] / divider)
            divider -= 2
            if divider <= 0:
                step = oldShape[index]
                break
        if DEBUG:
            print("Reading chunks of %d images" % step)
        nImagesRead = 0
        if (binning == 1) and oldShape[index] >= step:
            chunk1 = numpy.zeros((step, nPixels), numpy.float64)
            chunk2 = numpy.zeros((nPixels, step), numpy.float64)
            if spatial_mask is not None:
                badMask.shape = -1
                cleanMask.shape = -1
            i = 0
            while i < N:
                iToRead = min(step, N - i)
                #get step images for the first chunk
                chunk1[0:iToRead] = data[i:i + iToRead].reshape(iToRead, -1)
                if spatial_mask is not None:
                    chunk1[0:iToRead, badMask] = 0
                sumSpectrum[i:i + iToRead] = chunk1[0:iToRead].sum(axis=1)
                if center:
                    average = sumSpectrum[i:i + iToRead] / usedPixels
                    average.shape = iToRead, 1
                    chunk1[0:iToRead] -= average
                if spatial_mask is not None:
                    #the subtraction of the average modified the bad pixels
                    chunk1[0:iToRead, badMask] = 0
                nImagesRead += iToRead
                j = 0
                while j <= i:
                    #get step images for the second chunk
                    if j == i:
                        #diagonal block: reuse what is already in memory
                        jToRead = iToRead
                        chunk2[:, 0:jToRead] = chunk1[0:jToRead, :].T
                    else:
                        #get step images for the second chunk
                        jToRead = min(step, nChannels - j)
                        chunk2[:, 0:jToRead] =\
                            data[j:(j + jToRead)].reshape(jToRead, -1).T
                        if spatial_mask is not None:
                            chunk2[badMask, 0:jToRead] = 0
                        nImagesRead += jToRead
                        if center:
                            average = \
                                chunk2[:, 0:jToRead].sum(axis=0) / usedPixels
                            average.shape = 1, jToRead
                            chunk2[:, 0:jToRead] -= average
                        if spatial_mask is not None:
                            chunk2[badMask, 0:jToRead] = 0

                    #dot product
                    if (iToRead != step) or (jToRead != step):
                        covMatrix[i: (i + iToRead), j: (j + jToRead)] =\
                            dotblas.dot(chunk1[:iToRead, :nPixels],
                                        chunk2[:nPixels, :jToRead])
                    else:
                        covMatrix[i: (i + iToRead), j: (j + jToRead)] =\
                            dotblas.dot(chunk1, chunk2)

                    if i != j:
                        #the matrix is symmetric
                        covMatrix[j: (j + jToRead), i: (i + iToRead)] =\
                            covMatrix[i: (i + iToRead), j: (j + jToRead)].T

                    #increment j
                    j += jToRead
                i += iToRead
            chunk1 = None
            chunk2 = None
            if DEBUG:
                print("totalImages Read = ", nImagesRead)
        elif (binning > 1) and (oldShape[index] >= step):
            # NOTE(review): this branch is kept as it was; see the
            # docstring about its index handling.
            chunk1 = numpy.zeros((step, nPixels), numpy.float64)
            chunk2 = numpy.zeros((nPixels, step), numpy.float64)
            #one by one reading till we fill the chunks
            imagesToRead = numpy.arange(0, oldShape[index], binning)
            i = int(imagesToRead[weights > 0][0])
            spectrumIndex = 0
            nImagesRead = 0
            while i < N:
                #fill chunk1
                jj = 0
                for iToRead in range(0,
                                     int(min(step * binning, N - i)),
                                     binning):
                    chunk1[jj] = data[i + iToRead].reshape(1, -1) * \
                                 weights[i + iToRead]
                    jj += 1
                sumSpectrum[spectrumIndex:(spectrumIndex + jj)] = \
                    chunk1[0:jj].sum(axis=1)
                if center:
                    average = \
                        sumSpectrum[spectrumIndex:(spectrumIndex + jj)] / nPixels
                    average.shape = jj, 1
                    chunk1[0:jj] -= average
                nImagesRead += jj
                iToRead = jj
                j = 0
                while j <= i:
                    #get step images for the second chunk
                    if j == i:
                        jToRead = iToRead
                        chunk2[:, 0:jToRead] = chunk1[0:jToRead, :].T
                    else:
                        #get step images for the second chunk
                        jj = 0
                        for jToRead in range(0,
                                             int(min(step * binning, N - j)),
                                             binning):
                            chunk2[:, jj] =\
                                data[j + jToRead].reshape(1, -1)\
                                * weights[j + jToRead]
                            jj += 1
                        nImagesRead += jj
                        if center:
                            average = chunk2[:, 0:jj].sum(axis=0) / nPixels
                            average.shape = 1, jj
                            chunk2 -= average
                        jToRead = jj
                    #dot product
                    if (iToRead != step) or (jToRead != step):
                        covMatrix[i:(i + iToRead), j:(j + jToRead)] =\
                            dotblas.dot(chunk1[:iToRead, :nPixels],
                                        chunk2[:nPixels, :jToRead])
                    else:
                        covMatrix[i:(i + iToRead), j:(j + jToRead)] =\
                            dotblas.dot(chunk1, chunk2)

                    if i != j:
                        covMatrix[j:(j + jToRead), i:(i + iToRead)] =\
                            covMatrix[i:(i + iToRead), j:(j + jToRead)].T

                    #increment j
                    j += jToRead * step
                i += iToRead * step
            chunk1 = None
            chunk2 = None
        else:
            raise ValueError("Unhandled case")

        #should one divide by N or by N-1 ??  if we use images, we
        #assume the observables are the images, not the spectra!!!
        #so, covMatrix /= nChannels is wrong and one has to use:
        covMatrix /= usedPixels
    else:
        #the data are already arranged as (nPixels, nChannels) and we
        #basically have to return data.T * data to get back the covariance
        #matrix as (nChannels, nChannels)

        #if someone had the bad idea to store the data in HDF5 with a chunk
        #size based on the pixels and not on the spectra a loop based on
        #reading spectrum per spectrum can be very slow
        step = 0
        divider = 10
        while step < 1:
            step = int(nPixels / divider)
            divider -= 1
            if divider <= 0:
                step = nPixels
                break
        #the value calculated above is not used: everything is read at once
        step = nPixels
        if DEBUG:
            print("Reading chunks of %d spectra" % step)

        cleanWeights.shape = 1, -1
        if len(data.shape) == 2:
            if cleanMask is not None:
                badMask.shape = -1
            tmpData = numpy.zeros((step, nChannels), numpy.float64)
            k = 0
            while k < nPixels:
                kToRead = min(step, nPixels - k)
                tmpData[0:kToRead] = data[k: k + kToRead, ::binning]
                if cleanMask is not None:
                    tmpData[badMask[k: k + kToRead]] = 0
                a = tmpData[0:kToRead] * cleanWeights
                sumSpectrum += a.sum(axis=0)
                covMatrix += dotblas.dot(a.T, a)
                a = None
                k += kToRead
            tmpData = None
        elif len(data.shape) == 3:
            if oldShape[0] == 1:
                #close to the previous case
                tmpData = numpy.zeros((step, nChannels), numpy.float64)
                if cleanMask is not None:
                    badMask.shape = data.shape[0], data.shape[1]
                for i in range(oldShape[0]):
                    k = 0
                    while k < oldShape[1]:
                        kToRead = min(step, oldShape[1] - k)
                        tmpData[0:kToRead] = data[i, k:k + kToRead, ::binning]\
                                             * cleanWeights
                        if cleanMask is not None:
                            tmpData[0:kToRead][badMask[i, k: k + kToRead]] = 0
                        a = tmpData[0:kToRead]
                        sumSpectrum += a.sum(axis=0)
                        covMatrix += dotblas.dot(a.T, a)
                        a = None
                        k += kToRead
                tmpData = None
            elif oldShape[1] == 1:
                #almost identical to the previous case
                tmpData = numpy.zeros((step, nChannels), numpy.float64)
                if cleanMask is not None:
                    badMask.shape = data.shape[0], data.shape[1]
                for i in range(oldShape[1]):
                    k = 0
                    while k < oldShape[0]:
                        kToRead = min(step, oldShape[0] - k)
                        tmpData[0:kToRead] = data[k: k + kToRead, i, ::binning]\
                                             * cleanWeights
                        if cleanMask is not None:
                            tmpData[0:kToRead][badMask[k: k + kToRead, i]] = 0
                        a = tmpData[0:kToRead]
                        sumSpectrum += a.sum(axis=0)
                        covMatrix += dotblas.dot(a.T, a)
                        a = None
                        k += kToRead
                tmpData = None
            elif oldShape[0] < 21:
                #few rows: read (part of) one row at a time
                if step > oldShape[1]:
                    step = oldShape[1]
                tmpData = numpy.zeros((step, nChannels), numpy.float64)
                if cleanMask is not None:
                    badMask.shape = data.shape[0], data.shape[1]
                for i in range(oldShape[0]):
                    k = 0
                    while k < oldShape[1]:
                        kToRead = min(step, oldShape[1] - k)
                        tmpData[0:kToRead] = data[i, k: k + kToRead, ::binning]\
                                             * cleanWeights
                        if cleanMask is not None:
                            tmpData[0:kToRead][badMask[i, k: k + kToRead]] = 0
                        a = tmpData[0:kToRead]
                        sumSpectrum += a.sum(axis=0)
                        covMatrix += dotblas.dot(a.T, a)
                        a = None
                        k += kToRead
                tmpData = None
            else:
                #I should choose the sizes in terms of the size
                #of the dataset
                if oldShape[0] < 41:
                    deltaRow = 4
                elif oldShape[0] < 101:
                    deltaRow = 10
                else:
                    #take pieces of one tenth
                    deltaRow = int(oldShape[0] / 10)
                deltaCol = oldShape[1]
                tmpData = numpy.zeros((deltaRow, deltaCol, nChannels),
                                      numpy.float64)
                if cleanMask is not None:
                    badMask.shape = data.shape[0], data.shape[1]
                i = 0
                while i < oldShape[0]:
                    iToRead = min(deltaRow, oldShape[0] - i)
                    kToRead = iToRead * oldShape[1]
                    tmpData[:iToRead] = data[i:(i + iToRead), :, ::binning]
                    if cleanMask is not None:
                        #restrict to the rows just read: the mask block has
                        #iToRead rows, also in a last, incomplete, chunk
                        tmpData[:iToRead][badMask[i:(i + iToRead), :]] = 0
                    a = tmpData[:iToRead]
                    a.shape = kToRead, nChannels
                    a *= cleanWeights
                    sumSpectrum += a.sum(axis=0)
                    covMatrix += dotblas.dot(a.T, a)
                    a = None
                    i += iToRead
        #should one divide by N or by N-1 ??
        covMatrix /= usedPixels - 1
        if center:
            #the n-1 appears again here
            averageMatrix = numpy.outer(sumSpectrum, sumSpectrum)\
                            / (usedPixels * (usedPixels - 1))
            covMatrix -= averageMatrix
            averageMatrix = None
    return covMatrix, sumSpectrum / usedPixels, usedPixels
def multipleArrayPCA(stackList, ncomponents=10, binning=None, **kw):
    """
    Given a list of arrays, calculate the requested principal components from
    the matrix resulting from their column concatenation. Therefore, all the
    input arrays must have the same number of rows.

    The input arrays are modified in place while the calculation runs: each
    one is reshaped to (npixels, -1) and its column mean is subtracted. The
    mean is added back and the original shape restored before leaving the
    function, also when the calculation raises.

    :param stackList: List of arrays. Only the first item is checked to be a
        numpy array (or an object with "info" and "data" attributes whose
        data is a numpy array); its shape defines the number of pixels.
    :param ncomponents: Number of components to calculate (default 10).
    :param binning: Not used by this function.
    :param kw: Not used by this function.
    :returns: images, eigenvalues, eigenvectors. images has shape
        (ncomponents, r, c) for 3D input and (ncomponents, r, 1) otherwise.
        eigenvectors has shape (ncomponents, total number of columns).
    :raises TypeError: If the first item is not backed by a numpy array.
    """
    stack = stackList[0]
    if hasattr(stack, "info") and hasattr(stack, "data"):
        data = stack.data
    else:
        data = stack
    if not isinstance(data, numpy.ndarray):
        raise TypeError(
            "multipleArrayPCA is only supported when using numpy arrays")

    if len(data.shape) == 3:
        r, c = data.shape[:2]
        npixels = r * c
    else:
        c = None
        r = data.shape[0]
        npixels = r

    nStacks = len(stackList)
    shapeList = []
    avgList = []
    try:
        # reshape and subtract mean to all the input data
        for item in stackList:
            shape = item.shape
            item.shape = npixels, -1
            # only register the shape once the array has really been reshaped
            shapeList.append(shape)
            avg = numpy.sum(item, 0) / (1.0 * npixels)
            numpy.subtract(item, avg, item)
            avgList.append(avg)

        # offsets[i]:offsets[i + 1] are the rows/columns of array i in the
        # covariance matrix of the concatenated data
        offsets = [0]
        for shape in shapeList:
            offsets.append(offsets[-1] + shape[-1])
        eigenvectorLength = offsets[-1]

        # create the needed storage space for the covariance matrix
        covMatrix = numpy.zeros((eigenvectorLength, eigenvectorLength),
                                numpy.float32)
        for i in range(nStacks):
            rows = slice(offsets[i], offsets[i + 1])
            for j in range(i, nStacks):
                cols = slice(offsets[j], offsets[j + 1])
                covMatrix[rows, cols] = dotblas.dot(stackList[i].T,
                                                    stackList[j])
                if i != j:
                    # the matrix is symmetric: mirror the stored block
                    covMatrix[cols, rows] = covMatrix[rows, cols].T

        # I have the covariance matrix, calculate the eigenvectors and
        # eigenvalues
        covMatrix = Lanczos.LanczosNumericMatrix([covMatrix])
        eigenvalues, evectors = Lanczos.solveEigenSystem(covMatrix,
                                                         ncomponents,
                                                         shift=0.0,
                                                         tol=1.0e-15)
        covMatrix = None

        images = numpy.zeros((ncomponents, npixels), numpy.float32)
        eigenvectors = numpy.zeros((ncomponents, eigenvectorLength),
                                   numpy.float32)
        for i in range(ncomponents):
            eigenvectors[i, :] = evectors[i].vr
            for j in range(nStacks):
                images[i, :] += dotblas.dot(
                    stackList[j],
                    eigenvectors[i, offsets[j]:offsets[j + 1]])
    finally:
        # restore shapes and values of whatever was modified
        for i, shape in enumerate(shapeList):
            if i < len(avgList):
                numpy.add(stackList[i], avgList[i], stackList[i])
            stackList[i].shape = shape

    if c is None:
        images.shape = ncomponents, r, 1
    else:
        images.shape = ncomponents, r, c

    return images, eigenvalues, eigenvectors
def lanczosPCA2(stack, ncomponents=10, binning=None, **kw):
    """
    This is a fast method, but it may lose information.

    The pixels are first summed in groups of BINNING consecutive pixels (the
    group size is chosen from the number of pixels). The eigenvectors of the
    covariance matrix of the binned, mean-subtracted data are obtained through
    the Lanczos module (ncomponents + 5 of them are requested). The original
    data are then projected onto those vectors and a small eigenvalue problem
    is solved with numpy.linalg.eigh in the projected space.

    The input array is reshaped in place to (r * c, N) and is left with that
    shape on return.

    :param stack: 3D numpy array of shape (r, c, N), or an object with
        "info" and "data" attributes whose data is such an array.
    :param ncomponents: Number of components to return (default 10).
    :param binning: Not used by this function; the pixel binning is chosen
        internally.
    :param kw: Not used by this function.
    :returns: images with shape (ncomponents, r, c), the eigenvalues returned
        by numpy.linalg.eigh (ncomponents + 5 values, in ascending order) and
        the vectors with shape (ncomponents, N).
    :raises TypeError: If the data are not a numpy array.
    """
    if hasattr(stack, "info") and hasattr(stack, "data"):
        data = stack.data
    else:
        data = stack

    # check we have received a numpy.ndarray and not an HDF5 group
    # or other type of dynamically loaded data
    if not isinstance(data, numpy.ndarray):
        raise TypeError("lanczosPCA2 is only supported when using numpy arrays")
    r, c, N = data.shape

    npixels = r * c  # number of pixels
    data.shape = npixels, N

    # a single if/elif chain: the smallest datasets get the smallest binning
    if npixels < 2000:
        BINNING = 2
    elif npixels < 5000:
        BINNING = 4
    elif npixels < 10000:
        BINNING = 8
    elif npixels < 20000:
        BINNING = 10
    elif npixels < 30000:
        BINNING = 15
    elif npixels < 60000:
        BINNING = 20
    else:
        BINNING = 30

    dataorig = data
    # pixels beyond the last complete group do not enter the binned data
    usedPixels = BINNING * (npixels // BINNING)
    # reshape a slice, not dataorig itself, so that dataorig keeps its
    # (npixels, N) shape for the projection made below
    data = dataorig[:usedPixels].reshape(usedPixels // BINNING, BINNING, N)
    data = numpy.swapaxes(data, 1, 2)
    data = numpy.sum(data, axis=-1)

    tipo = numpy.float64
    neig = ncomponents + 5

    # "doppia" does not create the covariance matrix but performs two
    # multiplications; "singola" creates the covariance matrix but performs
    # only one multiplication
    rappmatrix = "singola"

    # calculate the mean
    mediadata = numpy.sum(data, axis=0) / numpy.array([len(data)], data.dtype)

    numpy.subtract(data, mediadata, data)

    Lanczos.LanczosNumericMatrix.tipo = tipo
    Lanczos.LanczosNumericVector.tipo = tipo

    if rappmatrix == "singola":
        SM = [dotblas.dot(data.T, data).astype(tipo)]
        SM = Lanczos.LanczosNumericMatrix(SM)
    else:
        SM = Lanczos.LanczosNumericMatrix([data.T.astype(tipo),
                                           data.astype(tipo)])

    # calculate eigenvalues and eigenvectors
    ev, eve = Lanczos.solveEigenSystem(SM, neig, shift=0.0, tol=1.0e-7)
    SM = None

    newmat = numpy.zeros((r * c, neig), numpy.float64)

    data = data.astype(tipo)

    # numpy in-place addition to make sure no intermediate copies are made
    numpy.add(data, mediadata, data)

    # project the original (unbinned) data onto the Lanczos vectors
    for i in range(neig):
        newmat[:, i] = dotblas.dot(dataorig,
                                   (eve[i].vr).astype(dataorig.dtype))

    newcov = dotblas.dot(newmat.T, newmat)
    evals, evects = numpy.linalg.eigh(newcov)

    # NOTE(review): numpy.linalg.eigh returns the eigenvectors as columns of
    # evects, while rows of evects are used below - confirm this is intended.
    nuovispettri = dotblas.dot(evects, eve.vr[:neig])
    images = numpy.zeros((ncomponents, npixels), data.dtype)
    vectors = numpy.zeros((ncomponents, N), tipo)
    for i in range(ncomponents):
        # eigh sorts in ascending order: start from the end
        vectors[i, :] = nuovispettri[-1 - i, :]
        images[i, :] = dotblas.dot(newmat,
                                   evects[-1 - i].astype(dataorig.dtype))
    images.shape = ncomponents, r, c
    return images, evals, vectors
def expectationMaximizationPCA(stack, ncomponents=10, binning=None, **kw):
    """
    This is a fast method when the number of components is small.

    Each component is obtained by iterating from a random starting vector
    (at most 7 iterations, stopping earlier when the norm changes by less
    than 1 %) and it is then subtracted from a working copy of the data
    before searching the next one.

    3D input is reshaped in place to (r * c, N) and left with that shape.
    The mean spectrum is subtracted in place and added back before returning.

    :param stack: 2D (r, N) or 3D (r, c, N) array, or an object with "info"
        and "data" attributes holding such an array.
    :param ncomponents: Number of components to calculate (default 10).
    :param binning: Number of consecutive channels to be summed together
        (default 1). The number of channels has to be a multiple of it.
    :param kw: Not used by this function.
    :returns: images with shape (ncomponents, r, c), eigenvalues with shape
        (ncomponents,) and eigenvectors with shape (ncomponents, N).
    :raises ValueError: If more components than channels are requested.
    """
    if DEBUG:
        print("expectationMaximizationPCA")
    # This part is common to all ...
    if binning is None:
        binning = 1

    if hasattr(stack, "info") and hasattr(stack, "data"):
        data = stack.data
    else:
        data = stack
    if len(data.shape) == 3:
        r, c, N = data.shape
        data.shape = r * c, N
    else:
        r, N = data.shape
        c = 1

    if binning > 1:
        # integer division: the results are used as array dimensions
        data = numpy.reshape(data,
                             [data.shape[0],
                              data.shape[1] // binning,
                              binning])
        data = numpy.sum(data, axis=-1)
        N //= binning
    if ncomponents > N:
        raise ValueError("Number of components too high.")
    # end of common part

    avg = numpy.sum(data, axis=0, dtype=numpy.float64) / (1.0 * r * c)
    numpy.subtract(data, avg, data)
    # working copy from which the found components are removed
    dataw = data * 1
    images = numpy.zeros((ncomponents, r * c), data.dtype)
    eigenvalues = numpy.zeros((ncomponents,), data.dtype)
    eigenvectors = numpy.zeros((ncomponents, N), data.dtype)
    for i in range(ncomponents):
        # generate a random vector
        p = numpy.random.random(N)
        # 10 iterations seems to be fairly accurate, but it is
        # slow when reaching "noise" components.
        # A variation threshold of 1 % seems to be acceptable.
        tmod_old = 0
        tmod = 0.02
        j = 0
        max_iter = 7
        while ((abs(tmod - tmod_old) / tmod) > 0.01) and (j < max_iter):
            tmod_old = tmod
            t = 0.0
            for k in range(r * c):
                t += dotblas.dot(dataw[k, :], p.T) * dataw[k, :]
            tmod = numpy.sqrt(numpy.sum(t * t))
            p = t / tmod
            j += 1
        eigenvectors[i, :] = p
        # subtract the found component from the dataset
        for k in range(r * c):
            dataw[k, :] -= dotblas.dot(dataw[k, :], p.T) * p

    # calculate eigenvalues via the Rayleigh Quotients:
    # eigenvalue = \
    # (Eigenvector.T * Covariance * EigenVector)/ (Eigenvector.T * Eigenvector)
    for i in range(ncomponents):
        tmp = dotblas.dot(data, eigenvectors[i, :].T)
        eigenvalues[i] = \
            dotblas.dot(tmp.T, tmp) / dotblas.dot(eigenvectors[i, :].T,
                                                  eigenvectors[i, :])

    # Generate the eigenimages
    for i0 in range(ncomponents):
        images[i0, :] = dotblas.dot(data, eigenvectors[i0, :])

    # restore the original data
    numpy.add(data, avg, data)

    # reshape the images
    images.shape = ncomponents, r, c
    return images, eigenvalues, eigenvectors
def expectationMaximizationPCA(stack, ncomponents=10, binning=None, **kw):
    """
    This is a fast method when the number of components is small.

    Each component is obtained by iterating from a random starting vector
    (at most 7 iterations, stopping earlier when the norm changes by less
    than 1 %) and it is then subtracted from a working copy of the data
    before searching the next one.

    3D input is reshaped in place to (r * c, N) and left with that shape.
    The mean spectrum is subtracted in place and added back before returning.

    :param stack: 2D (r, N) or 3D (r, c, N) array, or an object with "info"
        and "data" attributes holding such an array.
    :param ncomponents: Number of components to calculate (default 10).
    :param binning: Number of consecutive channels to be summed together
        (default 1). The number of channels has to be a multiple of it.
    :param kw: Not used by this function.
    :returns: images with shape (ncomponents, r, c), eigenvalues with shape
        (ncomponents,) and eigenvectors with shape (ncomponents, N).
    :raises ValueError: If more components than channels are requested.
    """
    if DEBUG:
        print("expectationMaximizationPCA")
    # This part is common to all ...
    if binning is None:
        binning = 1

    if hasattr(stack, "info") and hasattr(stack, "data"):
        data = stack.data
    else:
        data = stack
    if len(data.shape) == 3:
        r, c, N = data.shape
        data.shape = r * c, N
    else:
        r, N = data.shape
        c = 1

    if binning > 1:
        # integer division: the results are used as array dimensions
        data = numpy.reshape(data,
                             [data.shape[0],
                              data.shape[1] // binning,
                              binning])
        data = numpy.sum(data, axis=-1)
        N //= binning
    if ncomponents > N:
        raise ValueError("Number of components too high.")
    # end of common part

    avg = numpy.sum(data, axis=0, dtype=numpy.float64) / (1.0 * r * c)
    numpy.subtract(data, avg, data)
    # working copy from which the found components are removed
    dataw = data * 1
    images = numpy.zeros((ncomponents, r * c), data.dtype)
    eigenvalues = numpy.zeros((ncomponents,), data.dtype)
    eigenvectors = numpy.zeros((ncomponents, N), data.dtype)
    for i in range(ncomponents):
        # generate a random vector
        p = numpy.random.random(N)
        # 10 iterations seems to be fairly accurate, but it is
        # slow when reaching "noise" components.
        # A variation threshold of 1 % seems to be acceptable.
        tmod_old = 0
        tmod = 0.02
        j = 0
        max_iter = 7
        while ((abs(tmod - tmod_old) / tmod) > 0.01) and (j < max_iter):
            tmod_old = tmod
            t = 0.0
            for k in range(r * c):
                t += dotblas.dot(dataw[k, :], p.T) * dataw[k, :]
            tmod = numpy.sqrt(numpy.sum(t * t))
            p = t / tmod
            j += 1
        eigenvectors[i, :] = p
        # subtract the found component from the dataset
        for k in range(r * c):
            dataw[k, :] -= dotblas.dot(dataw[k, :], p.T) * p

    # calculate eigenvalues via the Rayleigh Quotients:
    # eigenvalue = \
    # (Eigenvector.T * Covariance * EigenVector)/ (Eigenvector.T * Eigenvector)
    for i in range(ncomponents):
        tmp = dotblas.dot(data, eigenvectors[i, :].T)
        eigenvalues[i] = \
            dotblas.dot(tmp.T, tmp) / dotblas.dot(eigenvectors[i, :].T,
                                                  eigenvectors[i, :])

    # Generate the eigenimages
    for i0 in range(ncomponents):
        images[i0, :] = dotblas.dot(data, eigenvectors[i0, :])

    # restore the original data
    numpy.add(data, avg, data)

    # reshape the images
    images.shape = ncomponents, r, c
    return images, eigenvalues, eigenvectors
def lanczosPCA(stack, ncomponents=10, binning=None, **kw):
    """
    Principal components of the data obtained by passing the matrix product
    data.T * data of the mean-subtracted data to the Lanczos module.

    3D input is reshaped in place to (r * c, N) and left with that shape.
    The mean spectrum is subtracted in place and added back before the
    images are calculated.

    :param stack: 2D (r, N) or 3D (r, c, N) numpy array, or an object with
        "info" and "data" attributes whose data is such an array.
    :param ncomponents: Number of components to calculate (default 10).
    :param binning: Number of consecutive channels to be summed together
        (default 1). The number of channels has to be a multiple of it.
    :param kw: Not used by this function.
    :returns: images with shape (ncomponents, r, c), the eigenvalues as
        returned by Lanczos.solveEigenSystem and the vectors with shape
        (ncomponents, N).
    :raises TypeError: If the data are not a numpy array.
    :raises ValueError: If more components than channels are requested.
    """
    if DEBUG:
        print("lanczosPCA")
    if binning is None:
        binning = 1

    if hasattr(stack, "info") and hasattr(stack, "data"):
        data = stack.data
    else:
        data = stack

    if not isinstance(data, numpy.ndarray):
        raise TypeError(
            "lanczosPCA is only supported when using numpy arrays")

    # "double" wraps data.T and data without creating the covariance matrix,
    # "single" wraps the already multiplied matrix
    wrapmatrix = "single"

    dtype = numpy.float64

    if wrapmatrix == "double":
        data = data.astype(dtype)

    if len(data.shape) == 3:
        r, c, N = data.shape
        data.shape = r * c, N
    else:
        r, N = data.shape
        c = 1

    npixels = r * c

    if binning > 1:
        # data.shape may fail with non-contiguous arrays: use reshape.
        # Integer division: the results are used as array dimensions.
        data = numpy.reshape(data,
                             [data.shape[0],
                              data.shape[1] // binning,
                              binning])
        data = numpy.sum(data, axis=-1)
        N //= binning

    if ncomponents > N:
        raise ValueError("Number of components too high.")

    avg = numpy.sum(data, 0) / (1.0 * npixels)
    numpy.subtract(data, avg, data)

    Lanczos.LanczosNumericMatrix.tipo = dtype
    Lanczos.LanczosNumericVector.tipo = dtype

    if wrapmatrix == "single":
        SM = [dotblas.dot(data.T, data).astype(dtype)]
        SM = Lanczos.LanczosNumericMatrix(SM)
    else:
        SM = Lanczos.LanczosNumericMatrix(
            [data.T.astype(dtype), data.astype(dtype)])

    eigenvalues, eigenvectors = Lanczos.solveEigenSystem(SM,
                                                         ncomponents,
                                                         shift=0.0,
                                                         tol=1.0e-15)
    SM = None
    # the images are calculated from the data with the mean added back
    numpy.add(data, avg, data)

    images = numpy.zeros((ncomponents, npixels), data.dtype)
    vectors = numpy.zeros((ncomponents, N), dtype)
    for i in range(ncomponents):
        vectors[i, :] = eigenvectors[i].vr
        images[i, :] = dotblas.dot(data,
                                   (eigenvectors[i].vr).astype(data.dtype))
    data = None
    images.shape = ncomponents, r, c

    return images, eigenvalues, vectors
def getCovarianceMatrix(stack,
                        index=None,
                        binning=None,
                        dtype=numpy.float64,
                        force=True,
                        center=True,
                        weights=None,
                        spatial_mask=None):
    """
    Calculate the covariance matrix of input data (stack) array. The input
    array is to be understood as a set of observables (spectra) taken at
    different instances (for instance spatial coordinates).

    :param stack: Array of data. Dimension greater than one.
    :type stack: Numpy ndarray, or an object with "info" and "data"
        attributes.
    :param index: Integer specifying the array dimension containing the
        "observables". Only the first (index = 0) or the last dimension
        (index = -1 or index = (ndimensions - 1)) supported. When None, it is
        taken from stack.info["McaWindex"] if available, -1 otherwise.
    :type index: Integer (default is -1 to indicate it is the last dimension
        of input array)
    :param binning: Current implementation corresponds to a sampling of the
        spectral data and not to an actual binning. This may change in future
        versions.
    :type binning: Positive integer (default 1)
    :param dtype: Keyword indicating the data type of the returned covariance
        matrix.
    :type dtype: A valid numpy data type (default numpy.float64)
    :param force: Indicate how to calculate the covariance matrix:

        - False : Perform the product data.T * data in one call
        - True  : Perform the product data.T * data progressively (smaller
          memory footprint)

    :type force: Boolean (default True)
    :param center: Indicate if the mean is to be subtracted from the
        observables.
    :type center: Boolean (default True)
    :param weights: Weight to be applied to each observable. It can therefore
        be used as a spectral mask setting the weight to 0 on the values to
        ignore.
    :type weights: Numpy ndarray of same size as the observables or None
        (default).
    :param spatial_mask: Array of size n where n is the number of measurement
        instances. In mapping experiments, n would be equal to the number of
        pixels. Pixels with a mask value below 1 are set to 0.
    :type spatial_mask: Numpy array of unsigned bytes (numpy.uint8) or None
        (default).
    :returns: The covMatrix, the average spectrum and the number of used
        pixels.
    :raises IndexError: If index is neither the first nor the last dimension.
    :raises ValueError: If the chunked calculation along the first dimension
        cannot handle the given combination of shape and binning.
    """
    # the 1D mask = weights should correspond to the values, before or after
    # sampling?  it could be handled as weights to be applied to the
    # spectra. That would allow two uses, as mask and as weights, at
    # the cost of a multiplication.

    # the spatial_mask accounts for pixels to be considered. It allows
    # to calculate the covariance matrix of a subset or to deal with
    # non finite data (NaN, +inf, -inf, ...). The calling program
    # should set the mask.

    # recover the actual data to work with
    if hasattr(stack, "info") and hasattr(stack, "data"):
        # we are dealing with a PyMca data object
        data = stack.data
        if index is None:
            index = stack.info.get("McaWindex", -1)
    else:
        data = stack
    if index is None:
        index = -1

    oldShape = data.shape
    if index not in [0, -1, len(oldShape) - 1]:
        data = None
        raise IndexError("1D index must be one of 0, -1 or %d" %
                         (len(oldShape) - 1))

    if index < 0:
        actualIndex = len(oldShape) + index
    else:
        actualIndex = index

    # the number of spatial pixels
    nPixels = 1
    for i in range(len(oldShape)):
        if i != actualIndex:
            nPixels *= oldShape[i]

    # the starting number of channels or of images
    N = oldShape[actualIndex]

    # our binning (better said sampling) is spectral, in order not to
    # affect the spatial resolution
    if binning is None:
        binning = 1

    if spatial_mask is not None:
        cleanMask = spatial_mask[:].reshape(nPixels)
        usedPixels = cleanMask.sum()
        # badMask is used to index the arrays: it has to be boolean, an
        # integer array of 0 and 1 would select the items 0 and 1 instead
        badMask = numpy.array(spatial_mask < 1, dtype=bool)
        badMask.shape = nPixels
    else:
        cleanMask = None
        usedPixels = nPixels

    nChannels = int(N / binning)

    if weights is None:
        weights = numpy.ones(N, numpy.float64)

    if weights.size == nChannels:
        # binning was taken into account
        cleanWeights = weights[:]
    else:
        cleanWeights = weights[::binning]

    # end of checking part
    eigenvectorLength = nChannels

    if (not force) and isinstance(data, numpy.ndarray):
        if DEBUG:
            print("Memory consuming calculation")
        # make a direct calculation (memory consuming)
        # take a view to the data
        dataView = data[:]
        if index in [0]:
            # reshape the view to allow the matrix multiplication
            dataView.shape = -1, nPixels
            cleanWeights.shape = -1, 1
            dataView = dataView[::binning] * cleanWeights
            if cleanMask is not None:
                dataView[:, badMask] = 0
            sumSpectrum = dataView.sum(axis=1, dtype=numpy.float64)
            # and return the standard covariance matrix as a matrix product
            covMatrix = dotblas.dot(dataView, dataView.T)\
                        / float(usedPixels - 1)
        else:
            # the last index
            dataView.shape = nPixels, -1
            cleanWeights.shape = 1, -1
            dataView = dataView[:, ::binning] * cleanWeights
            if cleanMask is not None:
                cleanMask.shape = -1
                dataView[badMask] = 0
            sumSpectrum = dataView.sum(axis=0, dtype=numpy.float64)
            # and return the standard covariance matrix as a matrix product
            covMatrix = dotblas.dot(dataView.T, dataView)\
                        / float(usedPixels - 1)
        if center:
            averageMatrix = numpy.outer(sumSpectrum, sumSpectrum)\
                            / (usedPixels * (usedPixels - 1))
            covMatrix -= averageMatrix
            averageMatrix = None
        return covMatrix, sumSpectrum / usedPixels, usedPixels

    # we are dealing with dynamically loaded data
    if DEBUG:
        print("DYNAMICALLY LOADED DATA")
    # create the needed storage space for the covariance matrix
    try:
        covMatrix = numpy.zeros((eigenvectorLength, eigenvectorLength),
                                dtype=dtype)
        sumSpectrum = numpy.zeros((eigenvectorLength, ), numpy.float64)
    except Exception:
        # make sure no reference to the original input data is kept
        cleanWeights = None
        covMatrix = None
        averageMatrix = None
        data = None
        raise

    # workaround a problem with h5py
    try:
        if actualIndex in [0]:
            testException = data[0:1]
        else:
            if len(data.shape) == 2:
                testException = data[0:1, -1]
            elif len(data.shape) == 3:
                testException = data[0:1, 0:1, -1]
    except AttributeError:
        txt = "%s" % type(data)
        if 'h5py' in txt:
            print("Implementing h5py workaround")
            import h5py
            data = h5py.Dataset(data.id)
        else:
            raise

    if actualIndex in [0]:
        # divider is used to decide the fraction of images to keep in memory
        # in order to limit file access on dynamically loaded data.
        # Since two chunks of the same size are used, the amount of memory
        # needed is twice the data size divided by the divider.
        # For instance, divider = 10 implies the data to be read 5.5 times
        # from disk while having a memory footprint of about one fifth of
        # the dataset size.
        step = 0
        divider = 10
        while step < 1:
            step = int(oldShape[index] / divider)
            divider -= 2
            if divider <= 0:
                step = oldShape[index]
                break
        if DEBUG:
            print("Reading chunks of %d images" % step)
        nImagesRead = 0
        if (binning == 1) and oldShape[index] >= step:
            chunk1 = numpy.zeros((step, nPixels), numpy.float64)
            chunk2 = numpy.zeros((nPixels, step), numpy.float64)
            if spatial_mask is not None:
                badMask.shape = -1
                cleanMask.shape = -1
            i = 0
            while i < N:
                iToRead = min(step, N - i)
                # get step images for the first chunk
                chunk1[0:iToRead] = data[i:i + iToRead].reshape(iToRead, -1)
                if spatial_mask is not None:
                    chunk1[0:iToRead, badMask] = 0
                sumSpectrum[i:i + iToRead] = chunk1[0:iToRead].sum(axis=1)
                if center:
                    average = sumSpectrum[i:i + iToRead] / usedPixels
                    average.shape = iToRead, 1
                    chunk1[0:iToRead] -= average
                if spatial_mask is not None:
                    # the subtraction made the masked pixels different from 0
                    chunk1[0:iToRead, badMask] = 0
                nImagesRead += iToRead
                j = 0
                while j <= i:
                    # get step images for the second chunk
                    if j == i:
                        jToRead = iToRead
                        chunk2[:, 0:jToRead] = chunk1[0:jToRead, :].T
                    else:
                        jToRead = min(step, nChannels - j)
                        chunk2[:, 0:jToRead] =\
                            data[j:(j + jToRead)].reshape(jToRead, -1).T
                        if spatial_mask is not None:
                            chunk2[badMask, 0:jToRead] = 0
                        nImagesRead += jToRead
                        if center:
                            average = \
                                chunk2[:, 0:jToRead].sum(axis=0) / usedPixels
                            average.shape = 1, jToRead
                            chunk2[:, 0:jToRead] -= average
                            if spatial_mask is not None:
                                chunk2[badMask, 0:jToRead] = 0

                    # dot product
                    if (iToRead != step) or (jToRead != step):
                        covMatrix[i: (i + iToRead), j: (j + jToRead)] =\
                            dotblas.dot(chunk1[:iToRead, :nPixels],
                                        chunk2[:nPixels, :jToRead])
                    else:
                        covMatrix[i: (i + iToRead), j: (j + jToRead)] =\
                            dotblas.dot(chunk1, chunk2)

                    if i != j:
                        # the matrix is symmetric
                        covMatrix[j: (j + jToRead), i: (i + iToRead)] =\
                            covMatrix[i: (i + iToRead), j: (j + jToRead)].T

                    # increment j
                    j += jToRead
                i += iToRead
            chunk1 = None
            chunk2 = None
            if DEBUG:
                print("totalImages Read = ", nImagesRead)
        elif (binning > 1) and (oldShape[index] >= step):
            # NOTE(review): this branch is kept as it was. spectrumIndex is
            # never advanced, i and j advance by (images read) * step and the
            # spatial mask is not applied - confirm the intended behavior
            # before relying on it.
            chunk1 = numpy.zeros((step, nPixels), numpy.float64)
            chunk2 = numpy.zeros((nPixels, step), numpy.float64)
            # one by one reading till we fill the chunks
            imagesToRead = numpy.arange(0, oldShape[index], binning)
            i = int(imagesToRead[weights > 0][0])
            spectrumIndex = 0
            nImagesRead = 0
            while i < N:
                # fill chunk1
                jj = 0
                for iToRead in range(0,
                                     int(min(step * binning, N - i)),
                                     binning):
                    chunk1[jj] = data[i + iToRead].reshape(1, -1) * \
                                 weights[i + iToRead]
                    jj += 1
                sumSpectrum[spectrumIndex:(spectrumIndex + jj)] = \
                    chunk1[0:jj].sum(axis=1)
                if center:
                    average = \
                        sumSpectrum[spectrumIndex:(spectrumIndex + jj)] / nPixels
                    average.shape = jj, 1
                    chunk1[0:jj] -= average
                nImagesRead += jj
                iToRead = jj
                j = 0
                while j <= i:
                    # get step images for the second chunk
                    if j == i:
                        jToRead = iToRead
                        chunk2[:, 0:jToRead] = chunk1[0:jToRead, :].T
                    else:
                        jj = 0
                        for jToRead in range(0,
                                             int(min(step * binning, N - j)),
                                             binning):
                            chunk2[:, jj] =\
                                data[j + jToRead].reshape(1, -1)\
                                * weights[j + jToRead]
                            jj += 1
                        nImagesRead += jj
                        if center:
                            average = chunk2[:, 0:jj].sum(axis=0) / nPixels
                            average.shape = 1, jj
                            chunk2 -= average
                        jToRead = jj
                    # dot product
                    if (iToRead != step) or (jToRead != step):
                        covMatrix[i:(i + iToRead), j:(j + jToRead)] =\
                            dotblas.dot(chunk1[:iToRead, :nPixels],
                                        chunk2[:nPixels, :jToRead])
                    else:
                        covMatrix[i:(i + iToRead), j:(j + jToRead)] =\
                            dotblas.dot(chunk1, chunk2)

                    if i != j:
                        covMatrix[j:(j + jToRead), i:(i + iToRead)] =\
                            covMatrix[i:(i + iToRead), j:(j + jToRead)].T

                    # increment j
                    j += jToRead * step
                i += iToRead * step
            chunk1 = None
            chunk2 = None
        else:
            raise ValueError("PCATools.getCovarianceMatrix: Unhandled case")

        # should one divide by N or by N-1 ??  if we use images, we
        # assume the observables are the images, not the spectra!!!
        # so, covMatrix /= nChannels is wrong and one has to use:
        covMatrix /= usedPixels
    else:
        # the data are already arranged as (nPixels, nChannels) and we
        # basically have to return data.T * data to get back the covariance
        # matrix as (nChannels, nChannels)

        # if someone had the bad idea to store the data in HDF5 with a chunk
        # size based on the pixels and not on the spectra a loop based on
        # reading spectrum per spectrum can be very slow
        step = 0
        divider = 10
        while step < 1:
            step = int(nPixels / divider)
            divider -= 1
            if divider <= 0:
                step = nPixels
                break
        # the value calculated above is overridden by the number of pixels
        step = nPixels
        if DEBUG:
            print("Reading chunks of %d spectra" % step)

        cleanWeights.shape = 1, -1
        if len(data.shape) == 2:
            if cleanMask is not None:
                badMask.shape = -1
            tmpData = numpy.zeros((step, nChannels), numpy.float64)
            k = 0
            while k < nPixels:
                kToRead = min(step, nPixels - k)
                tmpData[0:kToRead] = data[k:k + kToRead, ::binning]
                if cleanMask is not None:
                    tmpData[0:kToRead][badMask[k:k + kToRead]] = 0
                a = tmpData[0:kToRead] * cleanWeights
                sumSpectrum += a.sum(axis=0)
                covMatrix += dotblas.dot(a.T, a)
                a = None
                k += kToRead
            tmpData = None
        elif len(data.shape) == 3:
            if oldShape[0] == 1:
                # close to the previous case
                tmpData = numpy.zeros((step, nChannels), numpy.float64)
                if cleanMask is not None:
                    badMask.shape = data.shape[0], data.shape[1]
                for i in range(oldShape[0]):
                    k = 0
                    while k < oldShape[1]:
                        kToRead = min(step, oldShape[1] - k)
                        tmpData[0:kToRead] = data[i, k:k + kToRead, ::binning]\
                                             * cleanWeights
                        if cleanMask is not None:
                            tmpData[0:kToRead][badMask[i, k:k + kToRead]] = 0
                        a = tmpData[0:kToRead]
                        sumSpectrum += a.sum(axis=0)
                        covMatrix += dotblas.dot(a.T, a)
                        a = None
                        k += kToRead
                tmpData = None
            elif oldShape[1] == 1:
                # almost identical to the previous case
                tmpData = numpy.zeros((step, nChannels), numpy.float64)
                if cleanMask is not None:
                    badMask.shape = data.shape[0], data.shape[1]
                for i in range(oldShape[1]):
                    k = 0
                    while k < oldShape[0]:
                        kToRead = min(step, oldShape[0] - k)
                        tmpData[0:kToRead] = data[k: k + kToRead, i, ::binning]\
                                             * cleanWeights
                        if cleanMask is not None:
                            tmpData[0:kToRead][badMask[k:k + kToRead, i]] = 0
                        a = tmpData[0:kToRead]
                        sumSpectrum += a.sum(axis=0)
                        covMatrix += dotblas.dot(a.T, a)
                        a = None
                        k += kToRead
                tmpData = None
            elif oldShape[0] < 21:
                if step > oldShape[1]:
                    step = oldShape[1]
                tmpData = numpy.zeros((step, nChannels), numpy.float64)
                if cleanMask is not None:
                    badMask.shape = data.shape[0], data.shape[1]
                for i in range(oldShape[0]):
                    k = 0
                    while k < oldShape[1]:
                        kToRead = min(step, oldShape[1] - k)
                        tmpData[0:kToRead] = data[i, k: k + kToRead, ::binning]\
                                             * cleanWeights
                        if cleanMask is not None:
                            tmpData[0:kToRead][badMask[i, k:k + kToRead]] = 0
                        a = tmpData[0:kToRead]
                        sumSpectrum += a.sum(axis=0)
                        covMatrix += dotblas.dot(a.T, a)
                        a = None
                        k += kToRead
                tmpData = None
            else:
                # I should choose the sizes in terms of the size
                # of the dataset
                if oldShape[0] < 41:
                    deltaRow = 4
                elif oldShape[0] < 101:
                    deltaRow = 10
                else:
                    # take pieces of one tenth
                    deltaRow = int(oldShape[0] / 10)
                deltaCol = oldShape[1]
                tmpData = numpy.zeros((deltaRow, deltaCol, nChannels),
                                      numpy.float64)
                if cleanMask is not None:
                    badMask.shape = data.shape[0], data.shape[1]
                i = 0
                while i < oldShape[0]:
                    iToRead = min(deltaRow, oldShape[0] - i)
                    kToRead = iToRead * oldShape[1]
                    tmpData[:iToRead] = data[i:(i + iToRead), :, ::binning]
                    if cleanMask is not None:
                        # restrict to the rows just read, so that the mask
                        # also matches on the last (shorter) block of rows
                        tmpData[:iToRead][badMask[i:(i + iToRead), :]] = 0
                    a = tmpData[:iToRead]
                    a.shape = kToRead, nChannels
                    a *= cleanWeights
                    sumSpectrum += a.sum(axis=0)
                    covMatrix += dotblas.dot(a.T, a)
                    a = None
                    i += iToRead
        # should one divide by N or by N-1 ??
        covMatrix /= usedPixels - 1
        if center:
            # the n-1 appears again here
            averageMatrix = numpy.outer(sumSpectrum, sumSpectrum)\
                            / (usedPixels * (usedPixels - 1))
            covMatrix -= averageMatrix
            averageMatrix = None
    return covMatrix, sumSpectrum / usedPixels, usedPixels
def mat_mult(self, evect, q):
    """
    Fill the first evect.shape[0] entries of self.vr with the matrix product
    of evect and the first evect.shape[1] entries of q.vr.

    :param evect: 2D array of coefficients; it is converted to self.tipo
        before the multiplication.
    :param q: Object whose vr attribute supplies the right hand side of the
        product.
    """
    coefficients = evect.astype(self.tipo)
    nInput = evect.shape[1]
    product = dotblas.dot(coefficients, q.vr[:nInput])
    nOutput = evect.shape[0]
    self.vr[:nOutput] = product