def testNormalisedLaplacianRw(self):
    numVertices = 10
    numFeatures = 0
    vList = VertexList(numVertices, numFeatures)
    graph = SparseGraph(vList)

    ell = 2
    m = 2
    generator = BarabasiAlbertGenerator(ell, m)
    graph = generator.generate(graph)

    W = graph.getSparseWeightMatrix()
    L = GraphUtils.normalisedLaplacianRw(W)
    L2 = graph.normalisedLaplacianRw()

    tol = 10**-6
    self.assertTrue(numpy.linalg.norm(L - L2) < tol)

    #Test zero rows/cols
    W = scipy.sparse.csr_matrix((5, 5))
    W[1, 0] = 1
    W[0, 1] = 1
    L = GraphUtils.normalisedLaplacianRw(W)

    for i in range(2, 5):
        self.assertEquals(L[i, i], 0)
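# A dense reference sketch of the random-walk normalised Laplacian checked
# above, assuming the standard definition L_rw = I - D^-1 W, with rows of
# isolated vertices zeroed as the zero rows/cols case of the test expects.
# normalisedLaplacianRwDense is a hypothetical helper, not the library code.
import numpy

def normalisedLaplacianRwDense(W):
    W = numpy.asarray(W)
    d = W.sum(1)
    L = numpy.eye(W.shape[0])
    for i in range(W.shape[0]):
        if d[i] != 0:
            L[i, :] -= W[i, :]/d[i]
        else:
            L[i, i] = 0  # isolated vertex: diagonal entry is 0, as asserted above
    return L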
def recordResults(self, clusterList, timeList, fileName):
    """
    Save results for a particular clustering
    """
    iterator = self.getIterator()
    measures = []
    graphInfo = []
    logging.debug("Computing cluster measures")

    for i in range(len(clusterList)):
        Util.printIteration(i, self.logStep, len(clusterList))
        W = next(iterator)
        #G = networkx.Graph(W)
        #Store modularity, k-way normalised cut, and cluster size
        currentMeasures = [GraphUtils.modularity(W, clusterList[i]),
                           GraphUtils.kwayNormalisedCut(W, clusterList[i]),
                           len(numpy.unique(clusterList[i]))]
        measures.append(currentMeasures)

        # graph size
        currentGraphInfo = [W.shape[0]]
        graphInfo.append(currentGraphInfo)
        # nb connected components
        #graphInfo[i, 1] = networkx.number_connected_components(G)

    measures = numpy.array(measures)
    graphInfo = numpy.array(graphInfo)

    numpy.savez(fileName, measures, timeList, graphInfo)
    logging.debug("Saved file as " + fileName)
def testRandIndex(self):
    clustering1 = numpy.array([1, 1, 1, 2, 2, 2])
    clustering2 = numpy.array([2, 2, 2, 1, 1, 1])
    self.assertEquals(GraphUtils.randIndex(clustering1, clustering2), 0.0)

    clustering2 = numpy.array([2, 2, 2, 1, 1, 2])
    self.assertEquals(GraphUtils.randIndex(clustering1, clustering2), 1/3.0)

    clustering2 = numpy.array([1, 2, 2, 1, 1, 2])
    self.assertEquals(GraphUtils.randIndex(clustering1, clustering2), 16/30.0)
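# A minimal reference sketch of the index used above, assuming it is the
# fraction of vertex pairs on which the two clusterings disagree (one
# clustering puts the pair together, the other separates it); this matches
# all three expected values in the test. pairDisagreementIndex is a
# hypothetical helper, not part of GraphUtils.
import itertools

import numpy

def pairDisagreementIndex(clustering1, clustering2):
    n = clustering1.shape[0]
    disagreements = 0
    for i, j in itertools.combinations(range(n), 2):
        sameIn1 = clustering1[i] == clustering1[j]
        sameIn2 = clustering2[i] == clustering2[j]
        if sameIn1 != sameIn2:
            disagreements += 1
    return disagreements/float(n*(n - 1)/2.0)

# pairDisagreementIndex(numpy.array([1, 1, 1, 2, 2, 2]),
#                       numpy.array([2, 2, 2, 1, 1, 2]))  # -> 1/3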
def testVertexLabelPairs(self):
    numVertices = 6
    numFeatures = 1
    vList = VertexList(numVertices, numFeatures)
    vList.setVertices(numpy.array([numpy.arange(0, 6)]).T)

    graph = DenseGraph(vList, True)
    graph.addEdge(0, 1, 0.1)
    graph.addEdge(1, 3, 0.1)
    graph.addEdge(0, 2, 0.2)
    graph.addEdge(2, 3, 0.5)
    graph.addEdge(0, 4, 0.1)
    graph.addEdge(3, 4, 0.1)

    tol = 10**-6
    edges = graph.getAllEdges()

    X = GraphUtils.vertexLabelPairs(graph, edges)
    self.assertTrue(numpy.linalg.norm(X - edges) < tol)

    X = GraphUtils.vertexLabelPairs(graph, edges[[5, 2, 1], :])
    self.assertTrue(numpy.linalg.norm(X - edges[[5, 2, 1], :]) < tol)

    #Try a graph with more vertex features
    numVertices = 6
    numFeatures = 2
    vList = VertexList(numVertices, numFeatures)
    vList.setVertices(numpy.random.randn(numVertices, numFeatures))

    graph = DenseGraph(vList, True)
    graph.addEdge(0, 1, 0.1)
    graph.addEdge(1, 3, 0.1)

    edges = graph.getAllEdges()
    X = GraphUtils.vertexLabelPairs(graph, edges)
    self.assertTrue(numpy.linalg.norm(X[0, 0:numFeatures] - vList.getVertex(1)) < tol)
    self.assertTrue(numpy.linalg.norm(X[0, numFeatures:numFeatures*2] - vList.getVertex(0)) < tol)
    self.assertTrue(numpy.linalg.norm(X[1, 0:numFeatures] - vList.getVertex(3)) < tol)
    self.assertTrue(numpy.linalg.norm(X[1, numFeatures:numFeatures*2] - vList.getVertex(1)) < tol)

    #Try directed graphs
    graph = DenseGraph(vList, False)
    graph.addEdge(0, 1, 0.1)
    graph.addEdge(1, 3, 0.1)

    edges = graph.getAllEdges()
    X = GraphUtils.vertexLabelPairs(graph, edges)
    self.assertTrue(numpy.linalg.norm(X[0, 0:numFeatures] - vList.getVertex(0)) < tol)
    self.assertTrue(numpy.linalg.norm(X[0, numFeatures:numFeatures*2] - vList.getVertex(1)) < tol)
    self.assertTrue(numpy.linalg.norm(X[1, 0:numFeatures] - vList.getVertex(1)) < tol)
    self.assertTrue(numpy.linalg.norm(X[1, numFeatures:numFeatures*2] - vList.getVertex(3)) < tol)
def learnModel(self, graph):
    """
    Learn a prediction model based on considering all ego-alter pairs.

    :param graph: The input graph to learn from.
    :type graph: class:`apgl.graph.AbstractSingleGraph`
    """
    logging.info("Learning model on graph of size " + str(graph.getNumVertices()))
    logging.info("Regressor: " + str(self.predictor))

    edges = graph.getAllEdges()

    if graph.isUndirected():
        edges2 = numpy.c_[edges[:, 1], edges[:, 0]]
        edges = numpy.r_[edges, edges2]

    X = GraphUtils.vertexLabelPairs(graph, edges)
    y = graph.getEdgeValues(edges)

    #Now we need to solve a least squares problem to find the regressor of X onto y
    logging.info("Number of vertex pairs " + str(X.shape))
    gc.collect()
    self.predictor.learnModel(X, y)
def vectorStatistics(self, graph, treeStats=False, eigenStats=True):
    """
    Find a series of statistics for the given input graph which can be
    represented as vector values.
    """
    Parameter.checkClass(graph, AbstractMatrixGraph)
    Parameter.checkBoolean(treeStats)
    statsDict = {}

    statsDict["inDegreeDist"] = graph.inDegreeDistribution()
    statsDict["outDegreeDist"] = graph.degreeDistribution()
    logging.debug("Computing hop counts")
    P = graph.findAllDistances(False)
    statsDict["hopCount"] = graph.hopCount(P)
    logging.debug("Computing triangle count")
    if graph.getNumVertices() != 0:
        statsDict["triangleDist"] = numpy.bincount(graph.triangleSequence())
    else:
        statsDict["triangleDist"] = numpy.array([])

    #Get the distribution of component sizes
    logging.debug("Finding distribution of component sizes")
    if graph.isUndirected():
        components = graph.findConnectedComponents()
        if len(components) != 0:
            statsDict["componentsDist"] = numpy.bincount(numpy.array([len(c) for c in components], numpy.int64))

    #Make sure weight matrix is symmetric
    if graph.getNumVertices() != 0 and eigenStats:
        logging.debug("Computing eigenvalues/vectors")
        W = graph.getWeightMatrix()
        W = (W + W.T)/2
        eigenDistribution, V = numpy.linalg.eig(W)
        i = numpy.argmax(eigenDistribution)
        statsDict["maxEigVector"] = V[:, i]
        statsDict["eigenDist"] = numpy.flipud(numpy.sort(eigenDistribution[eigenDistribution > 0]))
        gc.collect()
    else:
        statsDict["maxEigVector"] = numpy.array([])
        statsDict["eigenDist"] = numpy.array([])

    if treeStats:
        logging.debug("Computing statistics on trees")
        trees = graph.findTrees()
        statsDict["treeSizesDist"] = numpy.bincount([len(x) for x in trees])
        treeDepths = [GraphUtils.treeDepth(graph.subgraph(x)) for x in trees]
        statsDict["treeDepthsDist"] = numpy.bincount(treeDepths)

    return statsDict
def testTreeDepth(self):
    numVertices = 4
    numFeatures = 1
    vList = VertexList(numVertices, numFeatures)

    graph = SparseGraph(vList, False)
    graph.addEdge(0, 1)
    graph.addEdge(0, 2)
    graph.addEdge(2, 3)
    self.assertEquals(GraphUtils.treeDepth(graph), 2)

    numVertices = 5
    vList = VertexList(numVertices, numFeatures)
    graph = SparseGraph(vList, False)
    graph.addEdge(0, 1)
    graph.addEdge(0, 2)
    graph.addEdge(2, 3)
    graph.addEdge(3, 4)
    self.assertEquals(GraphUtils.treeDepth(graph), 3)
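# A minimal sketch of the tree depth computed above, assuming depth means the
# number of edges on the longest root-to-leaf path and that vertex 0 is the
# root, as in the trees built in the test. treeDepthFromEdges is a
# hypothetical helper, not part of GraphUtils.
import collections

def treeDepthFromEdges(edgeList, root=0):
    children = collections.defaultdict(list)
    for u, v in edgeList:
        children[u].append(v)

    depth = 0
    queue = collections.deque([(root, 0)])
    while queue:
        vertex, d = queue.popleft()
        depth = max(depth, d)
        for child in children[vertex]:
            queue.append((child, d + 1))
    return depth

# treeDepthFromEdges([(0, 1), (0, 2), (2, 3)])          # -> 2
# treeDepthFromEdges([(0, 1), (0, 2), (2, 3), (3, 4)])  # -> 3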
def learnModel(self, graph):
    """
    Take the set of pairs of edges and also non-edges and learn when an edge
    occurs.
    """
    Parameter.checkInt(self.windowSize, 1, graph.getNumVertices())
    self.graph = graph

    X, y = GraphUtils.vertexLabelExamples(graph)

    X = self.preprocessor.learn(X)
    self.learningAlg.learnModel(X, y)
def predictEdges(self, graph, edges):
    """
    Make predictions over the given edges of the given graph.

    :param edges: A numpy array consisting of the edges to make predictions over.
    """
    Parameter.checkInt(graph.getVertexList().getNumFeatures(), 1, float('inf'))
    logging.info("Making predictions over " + str(edges.shape[0]) + " edges")

    X = GraphUtils.vertexLabelPairs(graph, edges)
    predY = self.predictor.predict(X)
    return predY
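# A hedged end-to-end usage sketch for the learnModel/predictEdges pair above,
# assuming `learner` is an instance of the surrounding predictor class and
# that vertices carry feature vectors; the concrete class name is elided in
# these snippets, so this is illustrative only:
#
# vList = VertexList(10, 3)
# vList.setVertices(numpy.random.randn(10, 3))
# graph = DenseGraph(vList, True)
# graph.addEdge(0, 1, 0.5)
# graph.addEdge(1, 2, 0.2)
# learner.learnModel(graph)                         # fit on observed edges
# predY = learner.predictEdges(graph, graph.getAllEdges())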
def testModularity(self):
    numVertices = 6
    graph = SparseGraph(GeneralVertexList(numVertices))

    graph.addEdge(0, 0)
    graph.addEdge(1, 1)
    graph.addEdge(2, 2)
    graph.addEdge(0, 1)
    graph.addEdge(0, 2)
    graph.addEdge(2, 1)

    graph.addEdge(3, 4, 2)
    graph.addEdge(3, 5, 2)
    graph.addEdge(4, 5, 2)
    graph.addEdge(3, 3, 2)
    graph.addEdge(4, 4, 2)
    graph.addEdge(5, 5, 2)

    W = graph.getWeightMatrix()
    clustering = numpy.array([0, 0, 0, 1, 1, 1])

    #This is the same as the igraph result
    Q = GraphUtils.modularity(W, clustering)
    self.assertEquals(Q, 4.0/9.0)

    Ws = scipy.sparse.csr_matrix(W)
    Q = GraphUtils.modularity(Ws, clustering)
    self.assertEquals(Q, 4.0/9.0)

    W = numpy.ones((numVertices, numVertices))
    Q = GraphUtils.modularity(W, clustering)
    self.assertEquals(Q, 0.0)

    Ws = scipy.sparse.csr_matrix(W)
    Q = GraphUtils.modularity(Ws, clustering)
    self.assertEquals(Q, 0.0)
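# A dense reference sketch of the modularity values asserted above, assuming
# the standard Newman definition Q = (1/2m) sum_ij (W_ij - d_i d_j/2m) delta(c_i, c_j),
# with W.sum() playing the role of 2m and self-loop weights appearing once on
# the diagonal. modularityDense is a hypothetical helper; under these
# assumptions it gives 4/9 on the first test graph and 0 on the uniform one.
import numpy

def modularityDense(W, clustering):
    degrees = W.sum(0)
    total = float(W.sum())  # 2m for a symmetric weight matrix
    Q = 0.0
    for c in numpy.unique(clustering):
        inds = numpy.where(clustering == c)[0]
        Q += W[numpy.ix_(inds, inds)].sum() - degrees[inds].sum()**2/total
    return Q/total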
def testModularityMatrix(self):
    W = scipy.sparse.csr_matrix((5, 5))
    W[1, 0] = 1
    W[0, 1] = 1
    W[2, 3] = 1
    W[3, 2] = 1

    B = GraphUtils.modularityMatrix(W)

    B2 = numpy.zeros((5, 5))
    d = numpy.array(W.sum(0).ravel()).ravel()
    m = W.getnnz()/2

    for i in range(5):
        for j in range(5):
            #Note the parentheses: the modularity matrix is B = W - d d^T/(2m)
            B2[i, j] = W[i, j] - d[i]*d[j]/(2*m)
            self.assertEquals(B2[i, j], B[i, j])
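# Equivalently, the expected matrix can be formed densely in one step,
# assuming W is symmetric so that W.sum() equals 2m:
#
# Wd = numpy.asarray(W.todense())
# d = Wd.sum(0)
# B2 = Wd - numpy.outer(d, d)/Wd.sum()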
def testKwayNormalisedCut(self):
    numVertices = 6
    graph = SparseGraph(GeneralVertexList(numVertices))

    graph.addEdge(0, 1)
    graph.addEdge(0, 2)
    graph.addEdge(2, 1)

    graph.addEdge(3, 4)
    graph.addEdge(3, 5)
    graph.addEdge(5, 4)

    W = graph.getWeightMatrix()
    clustering = numpy.array([0, 0, 0, 1, 1, 1])

    self.assertEquals(GraphUtils.kwayNormalisedCut(W, clustering), 0.0)

    #Try sparse W
    Ws = scipy.sparse.csr_matrix(W)
    self.assertEquals(GraphUtils.kwayNormalisedCut(Ws, clustering), 0.0)

    graph.addEdge(2, 3)
    W = graph.getWeightMatrix()
    self.assertEquals(GraphUtils.kwayNormalisedCut(W, clustering), 1.0/7)

    Ws = scipy.sparse.csr_matrix(W)
    self.assertEquals(GraphUtils.kwayNormalisedCut(Ws, clustering), 1.0/7)

    clustering = numpy.array([0, 0, 0, 1, 1, 2])
    self.assertEquals(GraphUtils.kwayNormalisedCut(W, clustering), 61.0/105)
    self.assertEquals(GraphUtils.kwayNormalisedCut(Ws, clustering), 61.0/105)

    #Test two vertices without any edges
    W = numpy.zeros((2, 2))
    clustering = numpy.array([0, 1])
    self.assertEquals(GraphUtils.kwayNormalisedCut(W, clustering), 0.0)

    Ws = scipy.sparse.csr_matrix(W)
    self.assertEquals(GraphUtils.kwayNormalisedCut(Ws, clustering), 0.0)
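# A dense reference sketch consistent with the values asserted above, assuming
# the k-way normalised cut is the mean over clusters of cut(C, V\C)/vol(C),
# with zero-volume clusters contributing zero (as in the edgeless case).
# kwayNcutDense is a hypothetical helper, not the library implementation.
import numpy

def kwayNcutDense(W, clustering):
    labels = numpy.unique(clustering)
    total = 0.0
    for c in labels:
        inMask = clustering == c
        cut = W[numpy.ix_(inMask, ~inMask)].sum()  # weight leaving the cluster
        vol = W[inMask, :].sum()                   # total degree of the cluster
        if vol != 0:
            total += cut/vol
    return total/len(labels)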
def testShiftLaplacian(self):
    numVertices = 10
    numFeatures = 0
    vList = VertexList(numVertices, numFeatures)
    graph = SparseGraph(vList)

    ell = 2
    m = 2
    generator = BarabasiAlbertGenerator(ell, m)
    graph = generator.generate(graph)

    W = graph.getSparseWeightMatrix()
    L = GraphUtils.shiftLaplacian(W)
    L2 = 2*numpy.eye(numVertices) - graph.normalisedLaplacianSym()

    tol = 10**-6
    self.assertTrue(numpy.linalg.norm(L - L2) < tol)
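# The shifted Laplacian tested above satisfies 2I - L_sym = I + D^-1/2 W D^-1/2,
# so the matrix is positive semidefinite and its largest eigenvalues correspond
# to the smallest eigenvalues of L_sym. A dense sketch, assuming no isolated
# vertices (to avoid division by zero):
#
# Wd = numpy.asarray(W.todense())
# Dinv = numpy.diag(1/numpy.sqrt(Wd.sum(0)))
# L2 = numpy.eye(Wd.shape[0]) + Dinv.dot(Wd).dot(Dinv)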
#Plot bound as Nystrom cols change
W = next(iterator)
nystromNs = numpy.arange(200, 1000, 50)

#Same plots with Fowlkes dataset
#There is no eigengap in this case so the bound does poorly
W = scipy.sparse.csr_matrix(createDataset(sigma=1.5))
nystromNs = numpy.arange(20, 151, 10)
k = 2

errors = numpy.zeros((len(nystromNs), numMethods))
innerProds = numpy.zeros((len(nystromNs), numMethods))

L = GraphUtils.shiftLaplacian(W)
L2 = GraphUtils.normalisedLaplacianSym(W)
print(L2.todense())

#Find connected components
graph = SparseGraph(GeneralVertexList(W.shape[0]))
graph.setWeightMatrix(W)
components = graph.findConnectedComponents()
print(len(components))

#Compute exact eigenvalues
omega, Q = numpy.linalg.eigh(L.todense())
inds = numpy.flipud(numpy.argsort(omega))
omega, Q = omega[inds], Q[:, inds]
omegak, Qk = omega[0:k], Q[:, 0:k]
if saveResults:
    errors = numpy.zeros((numGraphs, numRepetitions))
    allBoundLists = numpy.zeros((numRepetitions, numGraphs, 5))

    for r in range(numRepetitions):
        iterator = BoundGraphIterator(numGraphs=numGraphs)

        clusterer = IterativeSpectralClustering(k1, k2, T=100, computeBound=True, alg="IASC")
        clusterer.nb_iter_kmeans = 20
        logging.debug("Starting clustering")
        clusterList, timeList, boundList = clusterer.clusterFromIterator(iterator, verbose=True)
        allBoundLists[r, :, :] = numpy.array(boundList)

        for i in range(len(clusterList)):
            errors[i, r] = GraphUtils.randIndex(clusterList[i], iterator.realClustering)

    print(allBoundLists.mean(0))

    numpy.save(fileName, allBoundLists)
    logging.debug("Saved results as " + fileName)
else:
    allBoundLists = numpy.load(fileName)

boundList = allBoundLists.mean(0)
stdBoundList = allBoundLists.std(0)
stdBoundList[:, 0] = boundList[:, 0]

plotStyles1 = ['k-', 'k--', 'k-.', 'k:', 'b--', 'b-.', 'g-', 'g--', 'g-.', 'r-', 'r--', 'r-.']

print(boundList)
print(stdBoundList)
def scalarStatistics(self, graph, slowStats=True, treeStats=False):
    """
    Find a series of statistics for the given input graph which can be
    represented as scalar values. Return results as a vector.
    """
    #This method is a bit of a mess
    Parameter.checkClass(graph, AbstractSingleGraph)
    Parameter.checkBoolean(slowStats)
    Parameter.checkBoolean(treeStats)

    statsArray = numpy.ones(self.numStats)*-1
    statsArray[self.numVerticesIndex] = graph.getNumVertices()
    statsArray[self.numEdgesIndex] = graph.getNumEdges()
    statsArray[self.numDirEdgesIndex] = graph.getNumDirEdges()
    statsArray[self.densityIndex] = graph.density()

    if graph.isUndirected():
        logging.debug("Finding connected components")
        subComponents = graph.findConnectedComponents()
        logging.debug("Done")
        statsArray[self.numComponentsIndex] = len(subComponents)

        nonSingletonSubComponents = [c for c in subComponents if len(c) > 1]
        statsArray[self.numNonSingletonComponentsIndex] = len(nonSingletonSubComponents)

        triOrMoreSubComponents = [c for c in subComponents if len(c) > 2]
        statsArray[self.numTriOrMoreComponentsIndex] = len(triOrMoreSubComponents)

        logging.debug("Studying max component")
        if len(subComponents) != 0:
            maxCompGraph = graph.subgraph(list(subComponents[0]))
            statsArray[self.maxComponentSizeIndex] = len(subComponents[0])
            if len(subComponents) >= 2:
                statsArray[self.secondComponentSizeIndex] = len(subComponents[1])
            statsArray[self.maxComponentEdgesIndex] = maxCompGraph.getNumEdges()
            statsArray[self.meanComponentSizeIndex] = sum([len(x) for x in subComponents])/float(statsArray[self.numComponentsIndex])
            statsArray[self.maxCompMeanDegreeIndex] = numpy.mean(maxCompGraph.outDegreeSequence())
        else:
            statsArray[self.maxComponentSizeIndex] = 0
            statsArray[self.maxComponentEdgesIndex] = 0
            statsArray[self.meanComponentSizeIndex] = 0
            statsArray[self.geodesicDistMaxCompIndex] = 0

    if graph.getNumVertices() != 0:
        statsArray[self.meanDegreeIndex] = numpy.mean(graph.outDegreeSequence())
    else:
        statsArray[self.meanDegreeIndex] = 0

    if slowStats:
        if self.useFloydWarshall:
            logging.debug("Running Floyd-Warshall")
            P = graph.floydWarshall(False)
        else:
            logging.debug("Running Dijkstra's algorithm")
            P = graph.findAllDistances(False)

        statsArray[self.diameterIndex] = graph.diameter(P=P)
        statsArray[self.effectiveDiameterIndex] = graph.effectiveDiameter(self.q, P=P)
        statsArray[self.powerLawIndex] = graph.fitPowerLaw()[0]
        statsArray[self.geodesicDistanceIndex] = graph.geodesicDistance(P=P)
        statsArray[self.harmonicGeoDistanceIndex] = graph.harmonicGeodesicDistance(P=P)

        if graph.isUndirected() and len(subComponents) != 0:
            statsArray[self.geodesicDistMaxCompIndex] = graph.geodesicDistance(P=P, vertexInds=list(subComponents[0]))

    if treeStats:
        logging.debug("Computing statistics on trees")
        trees = graph.findTrees()
        statsArray[self.numTreesIndex] = len(trees)

        nonSingletonTrees = [c for c in trees if len(c) > 1]
        statsArray[self.numNonSingletonTreesIndex] = len(nonSingletonTrees)

        statsArray[self.meanTreeSizeIndex] = numpy.mean([len(x) for x in trees])
        treeDepths = [GraphUtils.treeDepth(graph.subgraph(list(x))) for x in trees]
        statsArray[self.meanTreeDepthIndex] = numpy.mean(treeDepths)

        if len(trees) != 0:
            maxTreeGraph = graph.subgraph(trees[0])
            statsArray[self.maxTreeSizeIndex] = len(trees[0])
            statsArray[self.maxTreeDepthIndex] = GraphUtils.treeDepth(maxTreeGraph)

            if len(trees) >= 2:
                secondTreeGraph = graph.subgraph(trees[1])
                statsArray[self.secondTreeSizeIndex] = len(trees[1])
                statsArray[self.secondTreeDepthIndex] = GraphUtils.treeDepth(secondTreeGraph)

    return statsArray
vals = line.split()
node1Inds.append(indexer.append(vals[0]))
node2Inds.append(indexer.append(vals[1]))

node1Inds = numpy.array(node1Inds)
node2Inds = numpy.array(node2Inds)

m = len(indexer.getIdDict())

A = numpy.zeros((m, m))
A[node1Inds, node2Inds] = 1
A = (A + A.T)/2
A = scipy.sparse.csr_matrix(A)

L = GraphUtils.normalisedLaplacianSym(A)
Ls.append(L)

u, V = scipy.sparse.linalg.eigs(L, k=m-2, which="SM")
u = u.real
inds = numpy.argsort(u)
u = u[inds]
V = V[:, inds]
us.append(u)

k0 = numpy.where(u > 0.01)[0][0]
k = numpy.argmax(numpy.diff(u[k0:]))
ks.append(k)
def clusterFromIterator(self, graphListIterator, verbose=False):
    """
    Find a set of clusters for the graphs given by the iterator. If verbose
    is true then each iteration is timed and bounded, and the results are
    returned as lists. The difference between a weight matrix and the
    previous one should be positive.
    """
    clustersList = []
    decompositionTimeList = []
    kMeansTimeList = []
    boundList = []
    sinThetaList = []
    i = 0

    for subW in graphListIterator:
        if __debug__:
            Parameter.checkSymmetric(subW)

        if self.logStep and i % self.logStep == 0:
            logging.debug("Graph index: " + str(i))
        logging.debug("Clustering graph of size " + str(subW.shape))
        if self.alg != "efficientNystrom":
            ABBA = GraphUtils.shiftLaplacian(subW)

        # --- Eigen value decomposition ---
        startTime = time.time()
        if self.alg == "IASC":
            if i % self.T != 0:
                omega, Q = self.approxUpdateEig(subW, ABBA, omega, Q)

                if self.computeBound:
                    inds = numpy.flipud(numpy.argsort(omega))
                    Q = Q[:, inds]
                    omega = omega[inds]
                    bounds = self.pertBound(omega, Q, omegaKbot, AKbot, self.k2)
                    #boundList.append([i, bounds[0], bounds[1]])

                    #Now use accurate values of norm of R and delta
                    rank = Util.rank(ABBA.todense())
                    gamma, U = scipy.sparse.linalg.eigsh(ABBA, rank-1, which="LM", ncv=ABBA.shape[0])
                    #logging.debug("gamma=" + str(gamma))
                    bounds2 = self.realBound(omega, Q, gamma, AKbot, self.k2)
                    boundList.append([bounds[0], bounds[1], bounds2[0], bounds2[1]])
            else:
                logging.debug("Computing exact eigenvectors")
                self.storeInformation(subW, ABBA)

                if self.computeBound:
                    #omega, Q = scipy.sparse.linalg.eigsh(ABBA, min(self.k2*2, ABBA.shape[0]-1), which="LM", ncv=min(10*self.k2, ABBA.shape[0]))
                    rank = Util.rank(ABBA.todense())
                    omega, Q = scipy.sparse.linalg.eigsh(ABBA, rank-1, which="LM", ncv=ABBA.shape[0])

                    inds = numpy.flipud(numpy.argsort(omega))
                    omegaKbot = omega[inds[self.k2:]]
                    QKbot = Q[:, inds[self.k2:]]
                    AKbot = (QKbot*omegaKbot).dot(QKbot.T)

                    omegaSort = numpy.flipud(numpy.sort(omega))
                    boundList.append([0]*4)
                else:
                    omega, Q = scipy.sparse.linalg.eigsh(ABBA, min(self.k2, ABBA.shape[0]-1), which="LM", ncv=min(10*self.k2, ABBA.shape[0]))
        elif self.alg == "nystrom":
            omega, Q = Nystrom.eigpsd(ABBA, self.k3)
        elif self.alg == "exact":
            omega, Q = scipy.sparse.linalg.eigsh(ABBA, min(self.k1, ABBA.shape[0]-1), which="LM", ncv=min(15*self.k1, ABBA.shape[0]))
        elif self.alg == "efficientNystrom":
            omega, Q = EfficientNystrom.eigWeight(subW, self.k2, self.k1)
        elif self.alg == "randomisedSvd":
            Q, omega, R = RandomisedSVD.svd(ABBA, self.k4)
        else:
            raise ValueError("Invalid Algorithm: " + str(self.alg))

        if self.computeSinTheta:
            omegaExact, QExact = scipy.linalg.eigh(ABBA.todense())
            inds = numpy.flipud(numpy.argsort(omegaExact))
            QExactKbot = QExact[:, inds[self.k1:]]
            inds = numpy.flipud(numpy.argsort(omega))
            QApproxK = Q[:, inds[:self.k1]]
            sinThetaList.append(scipy.linalg.norm(QExactKbot.T.dot(QApproxK)))

        decompositionTimeList.append(time.time() - startTime)

        if self.alg == "IASC":
            self.storeInformation(subW, ABBA)

        # --- Kmeans ---
        startTime = time.time()
        inds = numpy.flipud(numpy.argsort(omega))

        standardiser = Standardiser()
        #For some very strange reason we get an overflow when computing the
        #norm of the rows of Q even though its elements are bounded by 1.
        #We'll ignore it for now
        try:
            V = standardiser.normaliseArray(Q[:, inds[0:self.k1]].real.T).T
        except FloatingPointError as e:
            logging.warn("FloatingPointError: " + str(e))
        V = VqUtils.whiten(V)

        if i == 0:
            centroids, distortion = vq.kmeans(V, self.k1, iter=self.nb_iter_kmeans)
        else:
            centroids = self.findCentroids(V, clusters[:subW.shape[0]])
            if centroids.shape[0] < self.k1:
                nb_missing_centroids = self.k1 - centroids.shape[0]
                random_centroids = V[numpy.random.randint(0, V.shape[0], nb_missing_centroids), :]
                centroids = numpy.vstack((centroids, random_centroids))
            centroids, distortion = vq.kmeans(V, centroids)  #iter can only be 1
        clusters, distortion = vq.vq(V, centroids)

        kMeansTimeList.append(time.time() - startTime)
        clustersList.append(clusters)

        #logging.debug("subW.shape: " + str(subW.shape))
        #logging.debug("len(clusters): " + str(len(clusters)))
        #from sandbox.util.ProfileUtils import ProfileUtils
        #logging.debug("Total memory usage: " + str(ProfileUtils.memory()/10**6) + "MB")
        if ProfileUtils.memory() > 10**9:
            ProfileUtils.memDisplay(locals())

        i += 1

    if verbose:
        eigenQuality = {"boundList": boundList, "sinThetaList": sinThetaList}
        return clustersList, numpy.array((decompositionTimeList, kMeansTimeList)).T, eigenQuality
    else:
        return clustersList
p = 0.05
pClust = 0.3

W = numpy.ones((numVertices, numVertices))*p
for i in range(numClusters):
    W[endClusterSize*i:endClusterSize*(i+1), endClusterSize*i:endClusterSize*(i+1)] = pClust

P = numpy.random.rand(numVertices, numVertices)
W = numpy.array(P < W, numpy.float64)
upTriInds = numpy.triu_indices(numVertices)
W[upTriInds] = 0
W = W + W.T

graph = SparseGraph(vList)
graph.setWeightMatrix(W)

L = GraphUtils.shiftLaplacian(scipy.sparse.csr_matrix(W))
u, V = numpy.linalg.eig(L.todense())
print(V.shape)
print(numpy.linalg.cond(V))

# run with exact eigenvalue decomposition
logging.info("Running exact method")
graphIterator = IncreasingSubgraphListIterator(graph, subgraphIndicesList)

"""
for W in graphIterator:
    graph = SparseGraph(GeneralVertexList(W.shape[0]))
    graph.setWeightMatrixSparse(W)
    components = graph.findConnectedComponents()
    print(graph)
"""
numRepetitions = 20
#numRepetitions = 1
saveResults = False

resultsDir = PathDefaults.getOutputDir() + "cluster/"
fileName = resultsDir + "ErrorBoundNystrom.npy"

if saveResults:
    for r in range(numRepetitions):
        i = 0
        iterator = BoundGraphIterator(changeEdges=50, numGraphs=numGraphs, numClusterVertices=numClusterVertices, numClusters=k, p=0.1)

        for W in iterator:
            print("i=" + str(i))
            L = GraphUtils.shiftLaplacian(W)

            if i == 0:
                initialL = L
                initialOmega, initialQ = numpy.linalg.eigh(L.todense())
                inds = numpy.flipud(numpy.argsort(initialOmega))
                initialOmega, initialQ = initialOmega[inds], initialQ[:, inds]
                #Fix for weird error in EigenAdd2 later on
                initialQ = numpy.array(initialQ)
                initialQk = initialQ[:, 0:k]

                # for IASC
                lastL = initialL
                lastOmegas = [initialOmega]*len(IASCL)
                lastQs = [initialQ]*len(IASCL)

            #Compute exact eigenvalues