def testNormaliseArray(self):
    """
    Exercise Standardiser.normaliseArray on a random matrix, on the same
    matrix with one column zeroed, and on a subset of its rows.
    """
    numExamples = 10
    numFeatures = 3

    preprocessor = Standardiser()

    #Test an everyday matrix
    X = numpy.random.rand(numExamples, numFeatures)
    Xn = preprocessor.normaliseArray(X)
    normV = preprocessor.getNormVector()
    #assertAlmostEqual is used because the assertAlmostEquals alias is
    #deprecated and no longer exists in recent versions of unittest
    self.assertAlmostEqual(numpy.sum(Xn*Xn), numFeatures, places=3)

    #Each column of the normalised matrix should have a squared sum of 1
    norms = numpy.sum(Xn*Xn, 0)

    for colNorm in norms:
        self.assertAlmostEqual(colNorm, 1, places=3)

    #The normalised matrix must be exactly X divided by the norm vector
    self.assertTrue((X/normV == Xn).all())

    #Zero one column
    preprocessor = Standardiser()
    X[:, 1] = 0
    Xn = preprocessor.normaliseArray(X)
    normV = preprocessor.getNormVector()
    #The zeroed column contributes nothing, so one unit of squared sum is lost
    self.assertAlmostEqual(numpy.sum(Xn*Xn), numFeatures-1, places=3)
    self.assertTrue((X/normV == Xn).all())

    #Now take out 3 rows of X, normalise and compare to normalised X
    #NOTE(review): this reuses the preprocessor from the previous step and
    #expects the same scaling as before -- presumably the norm vector is
    #remembered between calls; confirm against Standardiser
    Xs = X[0:3, :]
    Xsn = preprocessor.normaliseArray(Xs)
    self.assertTrue((Xsn == Xn[0:3, :]).all())
def matrixSimilarity(self, V1, V2):
    """
    Compute a vertex similarity matrix C, such that the ijth entry is the
    matching score between V1_i and V2_j, where larger is a better match.

    The rows of V1 and V2 are stacked and normalised together with a single
    Standardiser, then split back apart. The pairwise distances D returned
    by Util.distanceMatrix are rescaled as (max(D) - D)/(max(D) - min(D)).
    If every distance is identical a matrix of ones is returned instead.
    """
    numFirst = V1.shape[0]

    #Normalise both vertex sets jointly so they share the same scaling
    stacked = numpy.r_[V1, V2]
    scaler = Standardiser()
    stacked = scaler.normaliseArray(stacked)

    first = stacked[0:numFirst, :]
    second = stacked[numFirst:, :]

    distances = Util.distanceMatrix(first, second)
    largest = numpy.max(distances)
    spread = largest - numpy.min(distances)

    #All distances equal: the rescaling would divide by zero
    if spread == 0:
        return numpy.ones((first.shape[0], second.shape[0]))

    return (largest - distances)/spread
def cluster(self, graph):
    """
    Take a graph and cluster it using the method in "On spectral clustering:
    analysis and an algorithm" by Ng et al., 2001.

    :param graph: the graph to cluster
    :type graph: :class:`apgl.graph.AbstractMatrixGraph`

    :returns: An array of size graph.getNumVertices() of cluster membership
    """
    laplacian = graph.normalisedLaplacianSym()
    eigenvalues, eigenvectors = numpy.linalg.eig(laplacian)

    #Pick the eigenvectors belonging to the self.k smallest eigenvalues
    order = numpy.argsort(eigenvalues)
    scaler = Standardiser()
    selected = eigenvectors[:, order[0:self.k]]

    #First normalise rows (by working on the transpose), then columns
    rowNormalised = scaler.normaliseArray(selected.T).T
    features = vq.whiten(rowNormalised)

    #kmeans2 seemed to give a high variance in the quality of the
    #clustering, so plain kmeans is used; its distortion value is not needed
    codebook, _ = vq.kmeans(features, self.k, iter=self.numIterKmeans)
    membership, _ = vq.vq(features, codebook)

    return membership
def clusterFromIterator(self, graphListIterator, verbose=False):
    """
    Find a set of clusters for the graphs given by the iterator.

    For each weight matrix an eigen-decomposition is computed with the
    method named by self.alg ("IASC", "nystrom", "exact", "efficientNystrom"
    or "randomisedSvd"). The eigenvectors of the self.k1 largest eigenvalues
    are normalised and whitened, and k-means assigns every row to a cluster.
    From the second graph onwards k-means is seeded with centroids derived
    from the previous clustering.

    The difference between a weight matrix and the previous one should be
    positive.

    :param graphListIterator: an iterable of weight matrices, one per graph
    :param verbose: if True also return timings and bounds

    :returns: the list of cluster membership arrays, one per graph. If
        verbose is True, a tuple of that list, an array whose two columns
        hold the decomposition and k-means times per graph, and the list of
        bounds (only filled for "IASC" when self.computeBound is set).

    :raises ValueError: if self.alg is not a recognised algorithm name
    """
    clustersList = []
    decompositionTimeList = []
    kMeansTimeList = []
    boundList = []

    for i, subW in enumerate(graphListIterator):
        if __debug__:
            Parameter.checkSymmetric(subW)

        if self.logStep and i % self.logStep == 0:
            logging.debug("Graph index: " + str(i))
        logging.debug("Clustering graph of size " + str(subW.shape))

        #efficientNystrom works on subW directly, all others on ABBA
        if self.alg != "efficientNystrom":
            ABBA = GraphUtils.shiftLaplacian(subW)

        # --- Eigen value decomposition ---
        startTime = time.time()
        if self.alg == "IASC":
            if i % self.T != 0:
                #Approximate update from the previous omega and Q
                omega, Q = self.approxUpdateEig(subW, ABBA, omega, Q)

                if self.computeBound:
                    inds = numpy.flipud(numpy.argsort(omega))
                    Q = Q[:, inds]
                    omega = omega[inds]
                    bounds = self.pertBound(omega, Q, omegaKbot, AKbot, self.k2)

                    #Now use accurate values of norm of R and delta
                    rank = Util.rank(ABBA.todense())
                    gamma, U = scipy.sparse.linalg.eigsh(ABBA, rank-1, which="LM", ncv=ABBA.shape[0])
                    bounds2 = self.realBound(omega, Q, gamma, AKbot, self.k2)
                    boundList.append([i, bounds[0], bounds[1], bounds2[0], bounds2[1]])
            else:
                #Every self.T graphs (including the first) recompute exactly
                logging.debug("Computing exact eigenvectors")
                self.storeInformation(subW, ABBA)

                if self.computeBound:
                    rank = Util.rank(ABBA.todense())
                    omega, Q = scipy.sparse.linalg.eigsh(ABBA, rank-1, which="LM", ncv=ABBA.shape[0])
                    inds = numpy.flipud(numpy.argsort(omega))
                    #Keep everything beyond the top self.k2 eigenpairs for
                    #the bounds computed on the following approximate steps
                    omegaKbot = omega[inds[self.k2:]]
                    QKbot = Q[:, inds[self.k2:]]
                    AKbot = (QKbot*omegaKbot).dot(QKbot.T)
                else:
                    omega, Q = scipy.sparse.linalg.eigsh(ABBA, min(self.k2, ABBA.shape[0]-1), which="LM", ncv=min(10*self.k2, ABBA.shape[0]))
        elif self.alg == "nystrom":
            omega, Q = Nystrom.eigpsd(ABBA, self.k3)
        elif self.alg == "exact":
            omega, Q = scipy.sparse.linalg.eigsh(ABBA, min(self.k1, ABBA.shape[0]-1), which="LM", ncv=min(15*self.k1, ABBA.shape[0]))
        elif self.alg == "efficientNystrom":
            omega, Q = EfficientNystrom.eigWeight(subW, self.k2, self.k1)
        elif self.alg == "randomisedSvd":
            Q, omega, R = RandomisedSVD.svd(ABBA, self.k4)
        else:
            raise ValueError("Invalid Algorithm: " + str(self.alg))

        decompositionTimeList.append(time.time()-startTime)

        if self.alg == "IASC":
            self.storeInformation(subW, ABBA)

        # --- Kmeans ---
        startTime = time.time()
        #Indices of the eigenvalues from largest to smallest
        inds = numpy.flipud(numpy.argsort(omega))

        standardiser = Standardiser()
        #For some very strange reason we get an overflow when computing the
        #norm of the rows of Q even though its elements are bounded by 1.
        #We'll ignore it for now
        #NOTE(review): when the error is raised V is not reassigned, so the V
        #of the previous graph is reused (and V is unbound if this happens on
        #the first graph) -- confirm this is the intended fallback
        try:
            V = standardiser.normaliseArray(Q[:, inds[0:self.k1]].real.T).T
        except FloatingPointError as e:
            logging.warning("FloatingPointError: " + str(e))
        V = VqUtils.whiten(V)

        if i == 0:
            centroids, distortion = vq.kmeans(V, self.k1, iter=self.nb_iter_kmeans)
        else:
            #Seed k-means from the previous clustering of the shared vertices
            centroids = self.findCentroids(V, clusters[:subW.shape[0]])
            if centroids.shape[0] < self.k1:
                #Top up with randomly chosen rows of V to reach self.k1
                nb_missing_centroids = self.k1 - centroids.shape[0]
                random_centroids = V[numpy.random.randint(0, V.shape[0], nb_missing_centroids), :]
                centroids = numpy.vstack((centroids, random_centroids))
            centroids, distortion = vq.kmeans(V, centroids) #iter can only be 1
        clusters, distortion = vq.vq(V, centroids)
        kMeansTimeList.append(time.time()-startTime)

        clustersList.append(clusters)

        #Dump the memory use of the local variables once usage is large
        if ProfileUtils.memory() > 10**9:
            ProfileUtils.memDisplay(locals())

    if verbose:
        return clustersList, numpy.array((decompositionTimeList, kMeansTimeList)).T, boundList
    else:
        return clustersList
class EgoNetworkSimulator(AbstractDiffusionSimulator):
    """
    A class which combines Ego network prediction with simulating information
    transmission within a simulated social network.
    """
    #Class-level defaults; __init__ overrides preprocessor, predictor and graph
    preprocessor = None
    examplesList = None
    predictor = None
    graph = None
    edgeWeight = 1

    def __init__(self, graph, predictor):
        """
        Store the graph and the predictor, then normalise the vertices of the
        graph with a new Standardiser which is kept as the preprocessor.

        Note that the vertices of the input graph are modified in place.

        :param graph: the graph whose vertices are normalised and simulated on
        :param predictor: the edge label predictor used by the other methods
        """
        self.graph = graph
        self.predictor = predictor
        self.errorMethod = Evaluator.balancedError

        #Note: We modify the vertices of the input graph!!!!
        logging.warning("About to modify (normalise) the vertices of the graph.")
        self.preprocessor = Standardiser()
        V = graph.getVertexList().getVertices(graph.getAllVertexIds())
        V = self.preprocessor.normaliseArray(V)
        graph.getVertexList().setVertices(V)

    def getPreprocessor(self):
        """
        Returns the preprocessor
        """
        return self.preprocessor

    def sampleEdges(self, sampleSize):
        """
        This function exists so that we can sample the same examples used in
        model selection and exclude them when running evaluateClassifier.

        :param sampleSize: the number of edges to draw at random
        :returns: a SparseGraph over the same vertex list holding the sampled
            edges and their values
        """
        edges = self.graph.getAllEdges()
        #The first sampleSize entries of a random permutation of edge indices
        trainInds = numpy.random.permutation(edges.shape[0])[0:sampleSize]
        trainEdges = edges[trainInds, :]

        trainGraph = SparseGraph(self.graph.getVertexList(), self.graph.isUndirected())
        trainGraph.addEdges(trainEdges, self.graph.getEdgeValues(trainEdges))

        logging.info("Randomly sampled " + str(sampleSize) + " edges")

        return trainGraph

    def modelSelection(self, paramList, paramFunc, folds, errorFunc, sampleSize):
        """
        Perform model selection using an edge label predictor.

        Cross validation is run on the whole graph; sampleSize is only used
        for the parameter checks.

        :returns: a tuple of the parameters with the lowest mean error,
            paramFunc, and that lowest mean error
        """
        Parameter.checkInt(folds, 0, sampleSize)
        Parameter.checkInt(sampleSize, 0, self.graph.getNumEdges())

        trainGraph = self.graph

        #Perform model selection
        meanErrs, stdErrs = self.predictor.cvModelSelection(trainGraph, paramList, paramFunc, folds, errorFunc)
        #Locate the best parameter set once rather than on every use
        bestInd = numpy.argmin(meanErrs)
        logging.info("Model selection errors:" + str(meanErrs))
        logging.info("Model selection stds:" + str(stdErrs))
        logging.info("Model selection best parameters:" + str(paramList[bestInd]))

        return paramList[bestInd], paramFunc, meanErrs[bestInd]

    def evaluateClassifier(self, params, paramFuncs, folds, errorFunc, sampleSize, invert=True):
        """
        Evaluate the predictor with the given parameters on a random sample
        of sampleSize edges, using cross validation.

        NOTE(review): the invert argument is accepted but not used in this
        method -- confirm whether excluding the model selection sample is
        still expected.

        :returns: the result of the predictor's cvError on the sampled graph
        """
        Parameter.checkInt(folds, 0, sampleSize)
        Parameter.checkInt(sampleSize, 0, self.graph.getNumEdges())

        trainGraph = self.sampleEdges(sampleSize)

        return self.predictor.cvError(trainGraph, params, paramFuncs, folds, errorFunc)

    def trainClassifier(self, params, paramFuncs, sampleSize):
        """
        Apply each parameter through its setter function, then train the
        predictor on a random sample of sampleSize edges.

        :param params: the parameter values
        :param paramFuncs: callables such that paramFuncs[j] accepts params[j]
        :returns: the trained predictor
        """
        for j, param in enumerate(params):
            paramFuncs[j](param)

        trainGraph = self.sampleEdges(sampleSize)
        self.predictor.learnModel(trainGraph)

        return self.predictor

    def runSimulation(self, maxIterations):
        """
        Advance the graph for maxIterations steps with an EgoSimulator,
        logging the total information and the distribution of alter ages and
        genders after every step.

        :param maxIterations: the number of simulation steps to run
        :returns: a tuple of an array of length maxIterations+1 with the total
            information before the first step and after each step, and the
            transmissions reported by the simulator
        """
        Parameter.checkInt(maxIterations, 1, float('inf'))

        #Notice that the data is preprocessed in the same way as the survey data
        egoSimulator = EgoSimulator(self.graph, self.predictor, self.preprocessor)

        totalInfo = numpy.zeros(maxIterations+1)
        totalInfo[0] = EgoUtils.getTotalInformation(self.graph)
        logging.info("Total number of people with information: " + str(totalInfo[0]))

        logging.info("--- Simulation Started ---")

        for i in range(0, maxIterations):
            logging.info("--- Iteration " + str(i) + " ---")

            self.graph = egoSimulator.advanceGraph()
            totalInfo[i+1] = EgoUtils.getTotalInformation(self.graph)
            logging.info("Total number of people with information: " + str(totalInfo[i+1]))

            #Compute distribution of ages etc. in alters
            alterIndices = egoSimulator.getAlters(i)
            alterAges = numpy.zeros(len(alterIndices))
            alterGenders = numpy.zeros(len(alterIndices))

            #NOTE(review): self.egoQuestionIds is not assigned anywhere in
            #this class -- presumably provided by the base class or caller
            for j, alterIndex in enumerate(alterIndices):
                currentVertex = self.graph.getVertex(alterIndex)
                alterAges[j] = currentVertex[self.egoQuestionIds.index(("Q5X", 0))]
                alterGenders[j] = currentVertex[self.egoQuestionIds.index(("Q4", 0))]

            (freqs, items) = Util.histogram(alterAges)
            logging.info("Distribution of ages " + str(freqs) + " " + str(items))
            (freqs, items) = Util.histogram(alterGenders)
            logging.info("Distribution of genders " + str(freqs) + " " + str(items))

        logging.info("--- Simulation Finished ---")

        return totalInfo, egoSimulator.getTransmissions()

    def getVertexFeatureDistribution(self, fIndex, vIndices=None):
        """
        Delegate to the graph's getVertexFeatureDistribution.
        """
        return self.graph.getVertexFeatureDistribution(fIndex, vIndices)

    def getPreProcessor(self):
        """
        Returns the preprocessor (same result as getPreprocessor).
        """
        return self.preprocessor

    def getClassifier(self):
        """
        Returns the predictor
        """
        return self.predictor