def cluster(self, graph):
    """
    Take a graph and cluster using the method in "On spectral clustering:
    analysis and an algorithm" by Ng et al., 2001.

    :param graph: the graph to cluster
    :type graph: :class:`apgl.graph.AbstractMatrixGraph`

    :returns: An array of size graph.getNumVertices() of cluster membership
    """
    L = graph.normalisedLaplacianSym()

    # NOTE(review): assumes normalisedLaplacianSym() returns a dense symmetric
    # matrix, as its name indicates. For a symmetric input eigh is the
    # appropriate solver: unlike eig it always yields real eigenvalues and
    # eigenvectors, so round-off cannot leak complex values into k-means.
    omega, Q = numpy.linalg.eigh(L)
    inds = numpy.argsort(omega)

    # Keep the eigenvectors of the self.k smallest eigenvalues. First
    # normalise rows, then columns
    standardiser = Standardiser()
    V = standardiser.normaliseArray(Q[:, inds[0:self.k]].T).T
    V = vq.whiten(V)

    # Using kmeans2 here seems to result in a high variance
    # in the quality of clustering. Therefore stick to kmeans
    centroids, distortion = vq.kmeans(V, self.k, iter=self.numIterKmeans)
    clusters, distortion = vq.vq(V, centroids)

    return clusters
def matrixSimilarity(self, V1, V2):
    """
    Build a vertex similarity matrix C whose (i, j) entry is the matching
    score between row i of V1 and row j of V2, where larger is a better match.

    The two inputs are stacked and normalised together with one Standardiser,
    pairwise distances are obtained from Util.distanceMatrix, and the
    distances are mapped linearly so the largest distance scores 0 and the
    smallest scores 1. When all distances are equal every score is 1.
    """
    numFirst = V1.shape[0]

    stacked = Standardiser().normaliseArray(numpy.r_[V1, V2])
    V1 = stacked[0:numFirst, :]
    V2 = stacked[numFirst:, :]

    # Turn distances into similarities
    D = Util.distanceMatrix(V1, V2)
    maxD = numpy.max(D)
    minD = numpy.min(D)
    spread = maxD - minD

    if spread == 0:
        # All distances are equal, so no pair is a better match than another
        return numpy.ones((V1.shape[0], V2.shape[0]))

    return (maxD - D) / spread
def matrixSimilarity(self, V1, V2):
    """
    Build a vertex similarity matrix C whose (i, j) entry is the matching
    score between row i of V1 and row j of V2, where larger is a better match.

    The two inputs are stacked and normalised together with one Standardiser,
    pairwise distances are obtained from Util.distanceMatrix, and the
    distances are mapped linearly so the largest distance scores 0 and the
    smallest scores 1. When all distances are equal every score is 1.
    """
    splitAt = V1.shape[0]

    normalised = Standardiser().normaliseArray(numpy.r_[V1, V2])
    V1 = normalised[0:splitAt, :]
    V2 = normalised[splitAt:, :]

    # Turn distances into similarities
    D = Util.distanceMatrix(V1, V2)
    maxD = numpy.max(D)
    minD = numpy.min(D)
    distRange = maxD - minD

    if distRange == 0:
        # All distances are equal, so no pair is a better match than another
        return numpy.ones((V1.shape[0], V2.shape[0]))

    return (maxD - D) / distRange
def testScaleArray(self):
    """
    Check that scaleArray gives every column of a random matrix a minimum of
    -1 and a maximum of +1, and that reusing the same Standardiser on an
    extended matrix scales the shared rows identically.
    """
    tol = 10 ** -6
    numExamples, numFeatures = 10, 3
    X = numpy.random.rand(numExamples, numFeatures)

    Xs = Standardiser().scaleArray(X)
    ones = numpy.ones(X.shape[1])
    colMins = numpy.amin(Xs, 0)
    colMaxs = numpy.amax(Xs, 0)

    self.assertTrue(numpy.linalg.norm(colMins + ones) <= tol)
    self.assertTrue(numpy.linalg.norm(colMaxs - ones) <= tol)

    # Now test standardisation on another matrix: the first call fixes the
    # scaling, the second call reuses it on a matrix with one extra row
    X = numpy.array([[2, 1], [-1, -2], [0.6, 0.3]])
    preprocessor = Standardiser()
    Xs = preprocessor.scaleArray(X)

    X2 = numpy.array([[2, 1], [-1, -2], [0.6, 0.3], [4, 2]])
    Xs2 = preprocessor.scaleArray(X2)

    difference = numpy.linalg.norm(Xs2[0:3, :] - Xs)
    self.assertTrue(difference < tol)
def testUnstandardiseArray(self):
    """
    Check that unstandardiseArray undoes standardiseArray on a random matrix.
    """
    tol = 10 ** -6
    numExamples, numFeatures = 10, 3
    preprocessor = Standardiser()

    # Round trip an everyday matrix through the preprocessor
    X = numpy.random.rand(numExamples, numFeatures)
    standardised = preprocessor.standardiseArray(X)
    recovered = preprocessor.unstandardiseArray(standardised)

    self.assertTrue(numpy.linalg.norm(recovered - X) < tol)
def testLearningRate(self):
    """
    Check that learner.learningRate returns the same beta values as a direct
    computation: a linear regression of log penalties against log training
    set size over the different fold counts.
    """
    numExamples = 100
    trainX, trainY = data.make_regression(numExamples)
    trainX = Standardiser().normaliseArray(trainX)
    trainY = Standardiser().normaliseArray(trainY)

    learner = DecisionTreeLearner(pruneType="CART", maxDepth=20, minSplit=1)

    foldsSet = numpy.arange(2, 7, 2)

    # The numpy.int / numpy.float aliases no longer exist in current NumPy;
    # the builtin types are exactly what they used to refer to.
    gammas = numpy.array(numpy.round(2**numpy.arange(1, 8, 1) - 1), dtype=int)
    paramDict = {}
    paramDict["setGamma"] = gammas

    betaGrid = learner.learningRate(trainX, trainY, foldsSet, paramDict)

    # Compute beta more directly
    numParams = gammas.shape[0]
    sampleSize = trainX.shape[0]
    sampleMethod = Sampling.crossValidation
    Cvs = numpy.array([1])
    penalties = numpy.zeros((foldsSet.shape[0], numParams))
    betas = numpy.zeros(gammas.shape[0])

    for k, folds in enumerate(foldsSet):
        logging.debug("Folds " + str(folds))

        idx = sampleMethod(folds, trainX.shape[0])

        # Now try penalisation
        resultsList = learner.parallelPen(trainX, trainY, idx, paramDict, Cvs)
        bestLearner, trainErrors, currentPenalties = resultsList[0]
        penalties[k, :] = currentPenalties

    for i in range(gammas.shape[0]):
        # Only finite, strictly positive penalties can be log-transformed
        inds = numpy.logical_and(numpy.isfinite(penalties[:, i]), penalties[:, i] > 0)
        tempPenalties = penalties[:, i][inds]
        tempfoldsSet = numpy.array(foldsSet, float)[inds]

        # A slope needs at least two points
        if tempPenalties.shape[0] > 1:
            x = numpy.log((tempfoldsSet - 1) / tempfoldsSet * sampleSize)
            y = numpy.log(tempPenalties) + numpy.log(tempfoldsSet)

            clf = linear_model.LinearRegression()
            clf.fit(numpy.array([x]).T, y)
            betas[i] = clf.coef_[0]

    betas = -betas

    nptst.assert_array_equal(betaGrid, betas)
def testLearnModel(self):
    """
    Fit KernelShiftRegression with a linear kernel on noiseless linear data
    and check the training error is small for: a single label vector, a
    label vector shifted by a constant, a multi-label matrix, and a
    multi-label matrix shifted by a vector. For the shifted cases the
    returned offset b must be close to the applied shift.
    """
    numExamples, numFeatures = 50, 200
    preprocessor = Standardiser()
    X = preprocessor.standardiseArray(numpy.random.randn(numExamples, numFeatures))

    weights = numpy.random.rand(numFeatures)
    y = numpy.dot(X, weights)

    kernel = LinearKernel()

    # Plain linear targets with light regularisation
    predictor = KernelShiftRegression(kernel, 0.0001)
    alpha, b = predictor.learnModel(X, y)
    predY = predictor.predict(X)
    self.assertTrue(Evaluator.rootMeanSqError(y, predY) < 0.05)

    # Try increasing y
    y = y + 5
    lmbda = 0.2
    predictor = KernelShiftRegression(kernel, lmbda)
    alpha, b = predictor.learnModel(X, y)
    predY = predictor.predict(X)
    self.assertTrue(numpy.abs(b - 5) < 0.1)
    self.assertTrue(Evaluator.rootMeanSqError(y, predY) < 0.1)

    # Try making prediction for multilabel Y
    weightMatrix = numpy.random.rand(numFeatures, numFeatures)
    Y = numpy.dot(X, weightMatrix)
    predictor = KernelShiftRegression(kernel, lmbda)
    alpha, b = predictor.learnModel(X, Y)
    predY = predictor.predict(X)
    self.assertTrue(Evaluator.rootMeanSqError(Y, predY) < 0.1)

    # Now, shift the data
    shift = numpy.random.rand(numFeatures)
    Y = Y + shift
    predictor = KernelShiftRegression(kernel, lmbda)
    alpha, b = predictor.learnModel(X, Y)
    predY = predictor.predict(X)
    self.assertTrue(numpy.linalg.norm(b - shift) < 0.1)
    self.assertTrue(Evaluator.rootMeanSqError(Y, predY) < 0.1)
def testRecursiveSetPrune(self):
    """
    Check that recursiveSetPrune stores, for every vertex, the squared error
    of the vertex value on the test examples routed to it, and that the
    test indices held by the leaves together cover every test example.
    """
    numExamples = 1000
    X, y = data.make_regression(numExamples)

    y = Standardiser().normaliseArray(y)

    # numpy.round returns a float, which is not a valid slice bound
    numTrain = int(numpy.round(numExamples * 0.66))

    trainX = X[0:numTrain, :]
    trainY = y[0:numTrain]
    testX = X[numTrain:, :]
    testY = y[numTrain:]

    learner = DecisionTreeLearner()
    learner.learnModel(trainX, trainY)

    # Seed the root with all test indices and propagate them down the tree
    rootId = (0,)
    learner.tree.getVertex(rootId).setTestInds(numpy.arange(testX.shape[0]))
    learner.recursiveSetPrune(testX, testY, rootId)

    for vertexId in learner.tree.getAllVertexIds():
        vertex = learner.tree.getVertex(vertexId)
        tempY = testY[vertex.getTestInds()]
        predY = numpy.ones(tempY.shape[0]) * vertex.getValue()
        error = numpy.sum((tempY - predY)**2)
        self.assertAlmostEqual(error, vertex.getTestError())

    # Check leaf indices form all indices
    inds = numpy.array([])
    for vertexId in learner.tree.leaves():
        inds = numpy.union1d(inds, learner.tree.getVertex(vertexId).getTestInds())

    nptst.assert_array_equal(inds, numpy.arange(testY.shape[0]))
def testStandardiseArray(self):
    """
    Check that standardiseArray yields entries summing to zero with a total
    squared sum equal to the number of features, and that reusing the
    preprocessor on a slice of the matrix reproduces the matching rows.
    """
    numExamples = 10
    numFeatures = 3

    preprocessor = Standardiser()

    # Test an everyday matrix
    X = numpy.random.rand(numExamples, numFeatures)
    Xs = preprocessor.standardiseArray(X)

    # assertAlmostEqual replaces the removed assertAlmostEquals alias
    self.assertAlmostEqual(numpy.sum(Xs), 0, places=3)
    self.assertAlmostEqual(numpy.sum(Xs * Xs), numFeatures, places=3)

    # Now, test on a portion of a matrix
    Xss = preprocessor.standardiseArray(X[1:5, :])
    self.assertTrue((Xss == Xs[1:5, :]).all())
def testPredict2(self):
    """
    Train TreeRank on the Gauss2D dataset for a range of maximum depths and
    compare the train and test AUC with stored reference values.
    """
    # Test on Gauss2D dataset
    dataDir = PathDefaults.getDataDir()

    fileName = dataDir + "Gauss2D_learn.csv"
    XY = numpy.loadtxt(fileName, skiprows=1, usecols=(1, 2, 3), delimiter=",")
    X = XY[:, 0:2]
    y = XY[:, 2]

    fileName = dataDir + "Gauss2D_test.csv"
    testXY = numpy.loadtxt(fileName, skiprows=1, usecols=(1, 2, 3), delimiter=",")
    testX = testXY[:, 0:2]
    testY = testXY[:, 2]

    X = Standardiser().standardiseArray(X)
    testX = Standardiser().standardiseArray(testX)

    # range(3, 10) gives 7 depths, so the last of the 8 reference values in
    # each array is never compared.
    maxDepths = range(3, 10)
    trainAucs = numpy.array([
        0.7194734, 0.7284824, 0.7332185, 0.7348198, 0.7366152, 0.7367508,
        0.7367508, 0.7367508
    ])
    testAucs = numpy.array([
        0.6789078, 0.6844632, 0.6867918, 0.6873420, 0.6874820, 0.6874400,
        0.6874400, 0.6874400
    ])

    # The results are approximately the same, but not exactly
    for i, maxDepth in enumerate(maxDepths):
        treeRank = TreeRank(self.leafRanklearner)
        treeRank.setMaxDepth(maxDepth)
        treeRank.learnModel(X, y)
        trainScores = treeRank.predict(X)
        testScores = treeRank.predict(testX)

        self.assertAlmostEqual(Evaluator.auc(trainScores, y), trainAucs[i], 2)
        self.assertAlmostEqual(Evaluator.auc(testScores, testY), testAucs[i], 1)
def testCentreArray(self):
    """
    Check that centreArray subtracts the stored centre vector so the
    entries sum to zero, and that reusing the preprocessor on the first
    rows reproduces the matching centred rows.
    """
    numExamples = 10
    numFeatures = 3

    preprocessor = Standardiser()

    # Test an everyday matrix
    X = numpy.random.rand(numExamples, numFeatures)
    Xc = preprocessor.centreArray(X)
    centreV = preprocessor.getCentreVector()

    # assertAlmostEqual replaces the removed assertAlmostEquals alias
    self.assertAlmostEqual(numpy.sum(Xc), 0, places=3)
    self.assertTrue((X - centreV == Xc).all())

    # Now take out 3 rows of X, centre them and compare to the centred X
    Xs = X[0:3, :]
    Xsc = preprocessor.centreArray(Xs)
    self.assertTrue((Xsc == Xc[0:3, :]).all())
def testNormaliseArray(self):
    """
    Check that normaliseArray gives every column unit squared norm, that it
    equals division by the stored norm vector, that an all-zero column
    contributes nothing, and that reusing the preprocessor on the first rows
    reproduces the matching normalised rows.
    """
    numExamples = 10
    numFeatures = 3

    preprocessor = Standardiser()

    # Test an everyday matrix
    X = numpy.random.rand(numExamples, numFeatures)
    Xn = preprocessor.normaliseArray(X)
    normV = preprocessor.getNormVector()

    # assertAlmostEqual replaces the removed assertAlmostEquals alias
    self.assertAlmostEqual(numpy.sum(Xn * Xn), numFeatures, places=3)

    for columnNorm in numpy.sum(Xn * Xn, 0):
        self.assertAlmostEqual(columnNorm, 1, places=3)

    self.assertTrue((X / normV == Xn).all())

    # Zero one column
    preprocessor = Standardiser()
    X[:, 1] = 0
    Xn = preprocessor.normaliseArray(X)
    normV = preprocessor.getNormVector()
    self.assertAlmostEqual(numpy.sum(Xn * Xn), numFeatures - 1, places=3)
    self.assertTrue((X / normV == Xn).all())

    # Now take out 3 rows of X, normalise and compare to normalised X
    Xs = X[0:3, :]
    Xsn = preprocessor.normaliseArray(Xs)
    self.assertTrue((Xsn == Xn[0:3, :]).all())
def testPredict2(self):
    """
    Train TreeRankForest on the Gauss2D dataset (labels mapped from {0, 1}
    to {-1, +1} via y*2 - 1) for a range of maximum depths and compare the
    train and test AUC with stored reference values.
    """
    # Test on Gauss2D dataset
    dataDir = PathDefaults.getDataDir()

    fileName = dataDir + "Gauss2D_learn.csv"
    XY = numpy.loadtxt(fileName, skiprows=1, usecols=(1, 2, 3), delimiter=",")
    X = XY[:, 0:2]
    y = XY[:, 2]
    y = y*2 - 1

    fileName = dataDir + "Gauss2D_test.csv"
    testXY = numpy.loadtxt(fileName, skiprows=1, usecols=(1, 2, 3), delimiter=",")
    testX = testXY[:, 0:2]
    testY = testXY[:, 2]
    testY = testY*2 - 1

    X = Standardiser().standardiseArray(X)
    testX = Standardiser().standardiseArray(testX)

    numTrees = 5
    minSplit = 50
    # range(3, 10) gives 7 depths, so the last of the 8 reference values in
    # each array is never compared.
    maxDepths = range(3, 10)
    trainAucs = numpy.array([0.7252582, 0.7323278, 0.7350289, 0.7372529,
                             0.7399985, 0.7382176, 0.7395104, 0.7386347])
    testAucs = numpy.array([0.6806122, 0.6851614, 0.6886183, 0.6904147,
                            0.6897266, 0.6874600, 0.6875980, 0.6878801])

    # The results are approximately the same, but not exactly
    for i, maxDepth in enumerate(maxDepths):
        treeRankForest = TreeRankForest(self.leafRanklearner)
        treeRankForest.setMaxDepth(maxDepth)
        treeRankForest.setMinSplit(minSplit)
        treeRankForest.setNumTrees(numTrees)
        treeRankForest.learnModel(X, y)
        trainScores = treeRankForest.predict(X)
        testScores = treeRankForest.predict(testX)

        print(Evaluator.auc(trainScores, y), Evaluator.auc(testScores, testY))

        self.assertAlmostEqual(Evaluator.auc(trainScores, y), trainAucs[i], 1)
        self.assertAlmostEqual(Evaluator.auc(testScores, testY), testAucs[i], 1)
def testClassify(self):
    """
    Check that KernelRidgeRegression.classify only produces the labels
    -1 and +1 when trained on random standardised data with sign labels.
    """
    numExamples, numFeatures = 10, 20
    X = numpy.random.randn(numExamples, numFeatures)
    y = numpy.sign(numpy.random.randn(numExamples))
    logging.debug(y)

    X = Standardiser().standardiseArray(X)

    predictor = KernelRidgeRegression(LinearKernel(), 1.0)
    predictor.learnModel(X, y)
    classY, predY = predictor.classify(X)

    isPositive = classY == 1
    isNegative = classY == -1
    self.assertTrue(numpy.logical_or(isPositive, isNegative).all())
def testLearnModel2(self):
    """
    Fit KernelShiftRegression with a linear kernel on random data and log
    the norm of the labels and of the training residual. Nothing is
    asserted; the test only checks that learning and prediction run.
    """
    numExamples, numFeatures = 200, 100
    X = numpy.random.randn(numExamples, numFeatures)
    y = numpy.random.randn(numExamples)

    X = Standardiser().standardiseArray(X)

    # Try using a low-rank matrix
    predictor = KernelShiftRegression(LinearKernel(), 0.001)
    alpha, b = predictor.learnModel(X, y)
    predY = predictor.predict(X)

    labelNorm = numpy.linalg.norm(y)
    residualNorm = numpy.linalg.norm(predY - y)
    logging.debug(labelNorm)
    logging.debug(residualNorm)
def setUp(self):
    """
    Build a reproducible binary classification fixture: self.X is a
    standardised random matrix and self.y holds integer labels given by the
    sign of a random linear score centred on its mean.
    """
    numpy.random.seed(21)
    # Turn floating point warnings into exceptions for the tests
    numpy.seterr(all="raise")
    numExamples = 100
    numFeatures = 10

    self.X = numpy.random.rand(numExamples, numFeatures)
    c = numpy.random.rand(numFeatures)

    # Labels are computed from the raw features, before standardisation.
    # The builtin int replaces the numpy.int alias, which no longer exists
    # in current NumPy.
    scores = self.X.dot(c)
    self.y = numpy.array(numpy.sign(scores - numpy.mean(scores)), int)

    logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
    self.X = Standardiser().standardiseArray(self.X)
def testSetSvmType(self):
    """
    Check that LibSVM can be switched between "Epsilon_SVR" and "C_SVC":
    the best regression error over a grid of C and epsilon must beat the
    all-zeros prediction, and the classification error must lie in [0, 1].
    The test returns silently when sklearn is not installed.
    """
    try:
        import sklearn
    except ImportError:
        return

    numExamples = 100
    numFeatures = 10
    X = numpy.random.randn(numExamples, numFeatures)
    X = Standardiser().standardiseArray(X)
    c = numpy.random.randn(numFeatures)

    y = numpy.dot(X, numpy.array([c]).T).ravel() + 1
    y2 = numpy.array(y > 0, numpy.int32) * 2 - 1

    svm = LibSVM()

    svm.setSvmType("Epsilon_SVR")

    self.assertEqual(svm.getType(), "Epsilon_SVR")

    # Try to get a good error. The builtin float replaces the numpy.float
    # alias, which no longer exists in current NumPy.
    Cs = 2**numpy.arange(-6, 4, dtype=float)
    epsilons = 2**numpy.arange(-6, 4, dtype=float)

    bestError = 10

    for C in Cs:
        for epsilon in epsilons:
            svm.setEpsilon(epsilon)
            svm.setC(C)
            svm.learnModel(X, y)
            yp = svm.predict(X)

            error = Evaluator.rootMeanSqError(y, yp)
            if error < bestError:
                bestError = error

    self.assertTrue(
        bestError < Evaluator.rootMeanSqError(y, numpy.zeros(y.shape[0])))

    svm.setSvmType("C_SVC")
    svm.learnModel(X, y2)
    yp2 = svm.predict(X)

    self.assertTrue(0 <= Evaluator.binaryError(y2, yp2) <= 1)
def testCARTPrune(self):
    """
    Check CART pruning in DecisionTreeLearner: the tree never has more
    vertices than gamma, a gamma of 1000 predicts the same as no pruning,
    and a gamma of 1 leaves a single vertex.
    """
    numExamples = 500
    X, y = data.make_regression(numExamples)

    y = Standardiser().standardiseArray(y)

    # numpy.round returns a float, which is not a valid slice bound
    numTrain = int(numpy.round(numExamples * 0.33))
    numValid = int(numpy.round(numExamples * 0.33))

    trainX = X[0:numTrain, :]
    trainY = y[0:numTrain]
    validX = X[numTrain:numTrain+numValid, :]
    validY = y[numTrain:numTrain+numValid]
    testX = X[numTrain+numValid:, :]
    testY = y[numTrain+numValid:]

    learner = DecisionTreeLearner(pruneType="none", maxDepth=10, minSplit=2)
    learner.learnModel(trainX, trainY)

    learner = DecisionTreeLearner(pruneType="CART", maxDepth=10, minSplit=2, gamma=1000)
    learner.learnModel(trainX, trainY)
    self.assertTrue(learner.tree.getNumVertices() <= 1000)
    predY = learner.predict(trainX)

    learner.setGamma(200)
    learner.learnModel(trainX, trainY)
    self.assertTrue(learner.tree.getNumVertices() <= 200)

    learner.setGamma(100)
    learner.learnModel(trainX, trainY)
    self.assertTrue(learner.tree.getNumVertices() <= 100)

    learner = DecisionTreeLearner(pruneType="none", maxDepth=10, minSplit=2)
    learner.learnModel(trainX, trainY)
    predY2 = learner.predict(trainX)

    # predY came from the CART learner with gamma=1000; it is expected to
    # match the unpruned tree exactly
    nptst.assert_array_equal(predY, predY2)

    # Full pruning
    learner = DecisionTreeLearner(pruneType="CART", maxDepth=3, gamma=1)
    learner.learnModel(trainX, trainY)
    self.assertEqual(learner.tree.getNumVertices(), 1)
def testCvPrune(self):
    """
    Check cvPrune in DecisionTreeLearner: a gamma of 1000 leaves the tree
    unchanged, a gamma of 100 yields a subtree of the unpruned tree, and
    relearning followed by repPrune on validation data does not increase
    the test error.
    """
    numExamples = 500
    X, y = data.make_regression(numExamples)

    y = Standardiser().standardiseArray(y)

    # numpy.round returns a float, which is not a valid slice bound
    numTrain = int(numpy.round(numExamples * 0.33))
    numValid = int(numpy.round(numExamples * 0.33))

    trainX = X[0:numTrain, :]
    trainY = y[0:numTrain]
    validX = X[numTrain:numTrain+numValid, :]
    validY = y[numTrain:numTrain+numValid]
    testX = X[numTrain+numValid:, :]
    testY = y[numTrain+numValid:]

    learner = DecisionTreeLearner()
    learner.learnModel(trainX, trainY)
    error1 = Evaluator.rootMeanSqError(learner.predict(testX), testY)

    unprunedTree = learner.tree.copy()
    learner.setGamma(1000)
    learner.cvPrune(trainX, trainY)

    self.assertEqual(unprunedTree.getNumVertices(), learner.tree.getNumVertices())
    learner.setGamma(100)
    learner.cvPrune(trainX, trainY)

    # Test if pruned tree is subtree of current:
    for vertexId in learner.tree.getAllVertexIds():
        self.assertTrue(vertexId in unprunedTree.getAllVertexIds())

    # The error should be better after pruning
    learner.learnModel(trainX, trainY)
    learner.repPrune(validX, validY)

    error2 = Evaluator.rootMeanSqError(learner.predict(testX), testY)

    self.assertTrue(error1 >= error2)
def testModelSelect(self):
    """
    Run parallelModelSelect for a REP-CV and a CART pruned DecisionTreeLearner
    on noisy regression data and print the test error of each best tree.

    The comparison against an SVR sits after an early return and is
    therefore not executed.
    """
    numExamples = 200
    X, y = data.make_regression(numExamples, noise=0.5)
    X = Standardiser().standardiseArray(X)
    y = Standardiser().standardiseArray(y)

    trainX = X[0:100, :]
    trainY = y[0:100]
    testX = X[100:, :]
    testY = y[100:]

    learner = DecisionTreeLearner(maxDepth=20, minSplit=10, pruneType="REP-CV")
    learner.setPruneCV(8)

    paramDict = {}
    paramDict["setGamma"] = numpy.linspace(0.0, 1.0, 10)
    # The builtin int replaces the numpy.int alias, which no longer exists
    # in current NumPy.
    paramDict["setPruneCV"] = numpy.arange(6, 11, 2, dtype=int)

    folds = 5
    idx = Sampling.crossValidation(folds, trainX.shape[0])
    bestTree, cvGrid = learner.parallelModelSelect(trainX, trainY, idx, paramDict)

    predY = bestTree.predict(testX)
    error = Evaluator.rootMeanSqError(testY, predY)
    print(error)

    learner = DecisionTreeLearner(maxDepth=20, minSplit=5, pruneType="CART")

    paramDict = {}
    paramDict["setGamma"] = numpy.linspace(0.0, 1.0, 50)

    folds = 5
    idx = Sampling.crossValidation(folds, trainX.shape[0])
    bestTree, cvGrid = learner.parallelModelSelect(trainX, trainY, idx, paramDict)

    predY = bestTree.predict(testX)
    error = Evaluator.rootMeanSqError(testY, predY)
    print(error)

    # NOTE(review): this return disables the SVM comparison below; it looks
    # deliberate, so the code is kept in place and stays unreachable.
    return

    # Let's compare to the SVM
    learner2 = LibSVM(kernel='gaussian', type="Epsilon_SVR")

    paramDict = {}
    paramDict["setC"] = 2.0**numpy.arange(-10, 14, 2, dtype=float)
    paramDict["setGamma"] = 2.0**numpy.arange(-10, 4, 2, dtype=float)
    paramDict["setEpsilon"] = learner2.getEpsilons()

    idx = Sampling.crossValidation(folds, trainX.shape[0])
    bestSVM, cvGrid = learner2.parallelModelSelect(trainX, trainY, idx, paramDict)

    predY = bestSVM.predict(testX)
    error = Evaluator.rootMeanSqError(testY, predY)
    print(error)
def testLearnModel(self):
    """
    Compare KernelRidgeRegression with a linear kernel against closed-form
    ridge regression: the dual coefficients must match
    inv(K + lmbda*I) y and the predictions must match the primal solution,
    for two regularisation values and for a separate test set. Finally
    check the shape of the coefficients for a multi-label target.
    """
    numExamples, numFeatures = 50, 200
    X = numpy.random.randn(numExamples, numFeatures)
    y = numpy.random.randn(numExamples)

    X = Standardiser().standardiseArray(X)

    tol = 10**-3
    kernel = LinearKernel()

    def closedForm(evalX, lmbda):
        # Dual coefficients from the kernel matrix, predictions from the
        # primal ridge solution evaluated on evalX
        K = numpy.dot(X, X.T)
        dualCoeffs = numpy.dot(
            numpy.linalg.inv(K + lmbda * numpy.eye(numExamples)), y)
        primalInv = numpy.linalg.inv(
            numpy.dot(X.T, X) + lmbda * numpy.eye(numFeatures))
        predictions = evalX.dot(primalInv).dot(X.T).dot(y)
        return dualCoeffs, predictions

    # Compare Linear kernel with linear ridge regression
    for lmbda in (0.1, 0.5):
        predictor = KernelRidgeRegression(kernel, lmbda)
        alpha = predictor.learnModel(X, y)
        predY = predictor.predict(X)

        alpha2, predY2 = closedForm(X, lmbda)

        self.assertTrue(numpy.linalg.norm(alpha - alpha2) < tol)
        self.assertTrue(numpy.linalg.norm(predY - predY2) < tol)

    # Now test on an alternative test set (lmbda is still 0.5 here)
    numTestExamples = 50
    testX = numpy.random.randn(numTestExamples, numFeatures)
    predictor = KernelRidgeRegression(kernel, lmbda)
    alpha = predictor.learnModel(X, y)
    predY = predictor.predict(testX)

    alpha2, predY2 = closedForm(testX, lmbda)

    self.assertTrue(numpy.linalg.norm(alpha - alpha2) < tol)
    self.assertTrue(numpy.linalg.norm(predY - predY2) < tol)

    # Use the method against a multi-label example
    Y = numpy.random.randn(numExamples, numFeatures)
    alpha = predictor.learnModel(X, Y)

    self.assertTrue(alpha.shape == (numExamples, numFeatures))
def clusterFromIterator(self, graphListIterator, verbose=False):
    """
    Find a set of clusters for the graphs given by the iterator. If verbose
    is true then each iteration is timed and bounded, and the results are
    returned as lists.

    The difference between a weight matrix and the previous one should be
    positive.

    :param graphListIterator: an iterable of weight matrices, one per graph

    :param verbose: whether to also return timings and eigenvector quality
    :type verbose: :class:`bool`

    :returns: The list of cluster assignments, one entry per graph. If
        verbose is true, a tuple of that list, an array with one row per
        graph holding (decomposition time, k-means time), and a dict with
        keys "boundList" and "sinThetaList".

    :raises ValueError: if self.alg is not a recognised algorithm name
    """
    clustersList = []
    decompositionTimeList = []
    kMeansTimeList = []
    boundList = []
    sinThetaList = []
    i = 0

    for subW in graphListIterator:
        if __debug__:
            Parameter.checkSymmetric(subW)

        if self.logStep and i % self.logStep == 0:
            logging.debug("Graph index: " + str(i))
        logging.debug("Clustering graph of size " + str(subW.shape))

        # NOTE(review): ABBA is not computed for "efficientNystrom", yet the
        # computeSinTheta branch below reads it - confirm that combination
        # is never used.
        if self.alg != "efficientNystrom":
            ABBA = GraphUtils.shiftLaplacian(subW)

        # --- Eigen value decomposition ---
        startTime = time.time()
        if self.alg == "IASC":
            if i % self.T != 0:
                # Approximate update from the previous eigen-decomposition
                omega, Q = self.approxUpdateEig(subW, ABBA, omega, Q)

                if self.computeBound:
                    inds = numpy.flipud(numpy.argsort(omega))
                    Q = Q[:, inds]
                    omega = omega[inds]
                    bounds = self.pertBound(omega, Q, omegaKbot, AKbot, self.k2)

                    # Now use accurate values of norm of R and delta
                    rank = Util.rank(ABBA.todense())
                    gamma, U = scipy.sparse.linalg.eigsh(
                        ABBA, rank - 1, which="LM", ncv=ABBA.shape[0])
                    bounds2 = self.realBound(omega, Q, gamma, AKbot, self.k2)
                    boundList.append(
                        [bounds[0], bounds[1], bounds2[0], bounds2[1]])
            else:
                # Every self.T graphs (including the first) recompute exactly
                logging.debug("Computing exact eigenvectors")
                self.storeInformation(subW, ABBA)

                if self.computeBound:
                    rank = Util.rank(ABBA.todense())
                    omega, Q = scipy.sparse.linalg.eigsh(
                        ABBA, rank - 1, which="LM", ncv=ABBA.shape[0])
                    inds = numpy.flipud(numpy.argsort(omega))
                    # Keep the part of the spectrum below the top k2 for
                    # the bounds computed on later approximate updates
                    omegaKbot = omega[inds[self.k2:]]
                    QKbot = Q[:, inds[self.k2:]]
                    AKbot = (QKbot * omegaKbot).dot(QKbot.T)

                    boundList.append([0] * 4)
                else:
                    omega, Q = scipy.sparse.linalg.eigsh(
                        ABBA, min(self.k2, ABBA.shape[0] - 1), which="LM",
                        ncv=min(10 * self.k2, ABBA.shape[0]))
        elif self.alg == "nystrom":
            omega, Q = Nystrom.eigpsd(ABBA, self.k3)
        elif self.alg == "exact":
            omega, Q = scipy.sparse.linalg.eigsh(
                ABBA, min(self.k1, ABBA.shape[0] - 1), which="LM",
                ncv=min(15 * self.k1, ABBA.shape[0]))
        elif self.alg == "efficientNystrom":
            omega, Q = EfficientNystrom.eigWeight(subW, self.k2, self.k1)
        elif self.alg == "randomisedSvd":
            Q, omega, R = RandomisedSVD.svd(ABBA, self.k4)
        else:
            raise ValueError("Invalid Algorithm: " + str(self.alg))

        if self.computeSinTheta:
            # Compare the approximate top-k1 eigenvectors with the exact
            # eigenvectors outside the top k1
            omegaExact, QExact = scipy.linalg.eigh(ABBA.todense())
            inds = numpy.flipud(numpy.argsort(omegaExact))
            QExactKbot = QExact[:, inds[self.k1:]]
            inds = numpy.flipud(numpy.argsort(omega))
            QApproxK = Q[:, inds[:self.k1]]
            sinThetaList.append(
                scipy.linalg.norm(QExactKbot.T.dot(QApproxK)))

        decompositionTimeList.append(time.time() - startTime)

        if self.alg == "IASC":
            self.storeInformation(subW, ABBA)

        # --- Kmeans ---
        startTime = time.time()
        inds = numpy.flipud(numpy.argsort(omega))

        standardiser = Standardiser()
        topVectors = Q[:, inds[0:self.k1]].real.T
        # For some very strange reason we get an overflow when computing the
        # norm of the rows of Q even though its elements are bounded by 1.
        # We'll ignore it for now
        try:
            V = standardiser.normaliseArray(topVectors).T
        except FloatingPointError as e:
            logging.warning("FloatingPointError: " + str(e))
            # Actually ignore the error: redo the normalisation with
            # floating point errors suppressed. Otherwise V would be unbound
            # on the first graph, or still hold the previous graph's
            # vectors on later ones.
            with numpy.errstate(all="ignore"):
                V = Standardiser().normaliseArray(topVectors).T
        V = VqUtils.whiten(V)

        if i == 0:
            centroids, distortion = vq.kmeans(V, self.k1, iter=self.nb_iter_kmeans)
        else:
            # Warm start k-means from the previous clustering
            centroids = self.findCentroids(V, clusters[:subW.shape[0]])
            if centroids.shape[0] < self.k1:
                # Top up with randomly chosen rows of V
                nb_missing_centroids = self.k1 - centroids.shape[0]
                random_centroids = V[numpy.random.randint(
                    0, V.shape[0], nb_missing_centroids), :]
                centroids = numpy.vstack((centroids, random_centroids))
            centroids, distortion = vq.kmeans(V, centroids)  # iter can only be 1
        clusters, distortion = vq.vq(V, centroids)
        kMeansTimeList.append(time.time() - startTime)

        clustersList.append(clusters)

        if ProfileUtils.memory() > 10**9:
            ProfileUtils.memDisplay(locals())

        i += 1

    if verbose:
        eigenQuality = {
            "boundList": boundList,
            "sinThetaList": sinThetaList
        }
        return clustersList, numpy.array(
            (decompositionTimeList, kMeansTimeList)).T, eigenQuality
    else:
        return clustersList
def clusterFromIterator(self, graphListIterator, verbose=False):
    """
    Find a set of clusters for the graphs given by the iterator. If verbose
    is true then each iteration is timed and bounded, and the results are
    returned as lists.

    The difference between a weight matrix and the previous one should be
    positive.

    :param graphListIterator: an iterable of weight matrices, one per graph

    :param verbose: whether to also return timings and eigenvector quality
    :type verbose: :class:`bool`

    :returns: The list of cluster assignments, one entry per graph. If
        verbose is true, a tuple of that list, an array with one row per
        graph holding (decomposition time, k-means time), and a dict with
        keys "boundList" and "sinThetaList".

    :raises ValueError: if self.alg is not a recognised algorithm name
    """
    clustersList = []
    decompositionTimeList = []
    kMeansTimeList = []
    boundList = []
    sinThetaList = []
    i = 0

    for subW in graphListIterator:
        if __debug__:
            Parameter.checkSymmetric(subW)

        if self.logStep and i % self.logStep == 0:
            logging.debug("Graph index: " + str(i))
        logging.debug("Clustering graph of size " + str(subW.shape))

        # NOTE(review): ABBA is not computed for "efficientNystrom", yet the
        # computeSinTheta branch below reads it - confirm that combination
        # is never used.
        if self.alg != "efficientNystrom":
            ABBA = GraphUtils.shiftLaplacian(subW)

        # --- Eigen value decomposition ---
        startTime = time.time()
        if self.alg == "IASC":
            if i % self.T != 0:
                # Approximate update from the previous eigen-decomposition
                omega, Q = self.approxUpdateEig(subW, ABBA, omega, Q)

                if self.computeBound:
                    inds = numpy.flipud(numpy.argsort(omega))
                    Q = Q[:, inds]
                    omega = omega[inds]
                    bounds = self.pertBound(omega, Q, omegaKbot, AKbot, self.k2)

                    # Now use accurate values of norm of R and delta
                    rank = Util.rank(ABBA.todense())
                    gamma, U = scipy.sparse.linalg.eigsh(
                        ABBA, rank-1, which="LM", ncv=ABBA.shape[0])
                    bounds2 = self.realBound(omega, Q, gamma, AKbot, self.k2)
                    boundList.append([bounds[0], bounds[1], bounds2[0], bounds2[1]])
            else:
                # Every self.T graphs (including the first) recompute exactly
                logging.debug("Computing exact eigenvectors")
                self.storeInformation(subW, ABBA)

                if self.computeBound:
                    rank = Util.rank(ABBA.todense())
                    omega, Q = scipy.sparse.linalg.eigsh(
                        ABBA, rank-1, which="LM", ncv=ABBA.shape[0])
                    inds = numpy.flipud(numpy.argsort(omega))
                    # Keep the part of the spectrum below the top k2 for
                    # the bounds computed on later approximate updates
                    omegaKbot = omega[inds[self.k2:]]
                    QKbot = Q[:, inds[self.k2:]]
                    AKbot = (QKbot*omegaKbot).dot(QKbot.T)

                    boundList.append([0]*4)
                else:
                    omega, Q = scipy.sparse.linalg.eigsh(
                        ABBA, min(self.k2, ABBA.shape[0]-1), which="LM",
                        ncv=min(10*self.k2, ABBA.shape[0]))
        elif self.alg == "nystrom":
            omega, Q = Nystrom.eigpsd(ABBA, self.k3)
        elif self.alg == "exact":
            omega, Q = scipy.sparse.linalg.eigsh(
                ABBA, min(self.k1, ABBA.shape[0]-1), which="LM",
                ncv=min(15*self.k1, ABBA.shape[0]))
        elif self.alg == "efficientNystrom":
            omega, Q = EfficientNystrom.eigWeight(subW, self.k2, self.k1)
        elif self.alg == "randomisedSvd":
            Q, omega, R = RandomisedSVD.svd(ABBA, self.k4)
        else:
            raise ValueError("Invalid Algorithm: " + str(self.alg))

        if self.computeSinTheta:
            # Compare the approximate top-k1 eigenvectors with the exact
            # eigenvectors outside the top k1
            omegaExact, QExact = scipy.linalg.eigh(ABBA.todense())
            inds = numpy.flipud(numpy.argsort(omegaExact))
            QExactKbot = QExact[:, inds[self.k1:]]
            inds = numpy.flipud(numpy.argsort(omega))
            QApproxK = Q[:, inds[:self.k1]]
            sinThetaList.append(scipy.linalg.norm(QExactKbot.T.dot(QApproxK)))

        decompositionTimeList.append(time.time()-startTime)

        if self.alg == "IASC":
            self.storeInformation(subW, ABBA)

        # --- Kmeans ---
        startTime = time.time()
        inds = numpy.flipud(numpy.argsort(omega))

        standardiser = Standardiser()
        leadingVectors = Q[:, inds[0:self.k1]].real.T
        # For some very strange reason we get an overflow when computing the
        # norm of the rows of Q even though its elements are bounded by 1.
        # We'll ignore it for now
        try:
            V = standardiser.normaliseArray(leadingVectors).T
        except FloatingPointError as e:
            logging.warning("FloatingPointError: " + str(e))
            # Actually ignore the error: redo the normalisation with
            # floating point errors suppressed. Otherwise V would be unbound
            # on the first graph, or still hold the previous graph's
            # vectors on later ones.
            with numpy.errstate(all="ignore"):
                V = Standardiser().normaliseArray(leadingVectors).T
        V = VqUtils.whiten(V)

        if i == 0:
            centroids, distortion = vq.kmeans(V, self.k1, iter=self.nb_iter_kmeans)
        else:
            # Warm start k-means from the previous clustering
            centroids = self.findCentroids(V, clusters[:subW.shape[0]])
            if centroids.shape[0] < self.k1:
                # Top up with randomly chosen rows of V
                nb_missing_centroids = self.k1 - centroids.shape[0]
                random_centroids = V[numpy.random.randint(0, V.shape[0], nb_missing_centroids), :]
                centroids = numpy.vstack((centroids, random_centroids))
            centroids, distortion = vq.kmeans(V, centroids)  # iter can only be 1
        clusters, distortion = vq.vq(V, centroids)
        kMeansTimeList.append(time.time()-startTime)

        clustersList.append(clusters)

        if ProfileUtils.memory() > 10**9:
            ProfileUtils.memDisplay(locals())

        i += 1

    if verbose:
        eigenQuality = {"boundList" : boundList, "sinThetaList" : sinThetaList}
        return clustersList, numpy.array((decompositionTimeList, kMeansTimeList)).T, eigenQuality
    else:
        return clustersList