Example #1
def testNormaliseArray(self):
    numExamples = 10
    numFeatures = 3

    preprocessor = Standardiser()

    #Test an everyday matrix
    X = numpy.random.rand(numExamples, numFeatures)
    Xn = preprocessor.normaliseArray(X)
    normV = preprocessor.getNormVector()
    self.assertAlmostEqual(numpy.sum(Xn*Xn), numFeatures, places=3)

    norms = numpy.sum(Xn*Xn, 0)

    for i in range(0, norms.shape[0]):
        self.assertAlmostEqual(norms[i], 1, places=3)

    self.assertTrue((X/normV == Xn).all())

    #Zero one column
    preprocessor = Standardiser()
    X[:, 1] = 0
    Xn = preprocessor.normaliseArray(X)
    normV = preprocessor.getNormVector()
    self.assertAlmostEqual(numpy.sum(Xn*Xn), numFeatures-1, places=3)
    self.assertTrue((X/normV == Xn).all())

    #Now take out 3 rows of X, normalise and compare to normalised X
    Xs = X[0:3, :]
    Xsn = preprocessor.normaliseArray(Xs)
    self.assertTrue((Xsn == Xn[0:3, :]).all())
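For reference, a minimal sketch of the column normalisation the test above asserts, in plain NumPy. The Standardiser internals are not shown in this snippet, so normalise_columns below is an assumed stand-in: each column of X is scaled to unit Euclidean norm, which is exactly what the norms-sum-to-one assertions check.

import numpy

def normalise_columns(X):
    #Hypothetical stand-in for Standardiser.normaliseArray: scale each
    #column of X to unit Euclidean norm. Zero columns are left as zero,
    #matching the numFeatures-1 assertion in the test above.
    normV = numpy.sqrt(numpy.sum(X**2, 0))
    normV = numpy.where(normV != 0, normV, 1)
    return X/normV, normV

X = numpy.random.rand(10, 3)
Xn, normV = normalise_columns(X)
print(numpy.sum(Xn*Xn, 0))  #approximately [1. 1. 1.]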
Example #2
def matrixSimilarity(self, V1, V2):
    """
    Compute a vertex similarity matrix C, such that the ij-th entry is the
    matching score between V1_i and V2_j, where larger is a better match.
    """
    X = numpy.r_[V1, V2]
    standardiser = Standardiser()
    X = standardiser.normaliseArray(X)

    V1 = X[0:V1.shape[0], :]
    V2 = X[V1.shape[0]:, :]

    #Extend arrays with zeros to make them the same size
    #if V1.shape[0] < V2.shape[0]:
    #    V1 = Util.extendArray(V1, V2.shape, numpy.min(V1))
    #elif V2.shape[0] < V1.shape[0]:
    #    V2 = Util.extendArray(V2, V1.shape, numpy.min(V2))

    #Compute C from the pairwise distances between vertices, rescaled
    #affinely into [0, 1] so that larger entries mean closer matches
    D = Util.distanceMatrix(V1, V2)
    maxD = numpy.max(D)
    minD = numpy.min(D)
    if (maxD-minD) != 0:
        C = (maxD - D)/(maxD-minD)
    else:
        C = numpy.ones((V1.shape[0], V2.shape[0]))

    return C
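As a self-contained illustration of the distance-to-similarity rescaling above, here is a sketch that stands scipy.spatial.distance.cdist in for Util.distanceMatrix (an assumption; the apgl helper is not shown here). Distances are mapped affinely into [0, 1], so the closest pair of vertices scores 1 and the farthest scores 0.

import numpy
from scipy.spatial.distance import cdist

def similarity_from_distance(V1, V2):
    #Pairwise Euclidean distances, then an affine rescaling into [0, 1]
    D = cdist(V1, V2)
    maxD, minD = numpy.max(D), numpy.min(D)
    if maxD - minD != 0:
        return (maxD - D)/(maxD - minD)
    #All distances equal: every pair is an equally good match
    return numpy.ones((V1.shape[0], V2.shape[0]))

V1 = numpy.random.rand(4, 2)
V2 = numpy.random.rand(5, 2)
C = similarity_from_distance(V1, V2)  #shape (4, 5), entries in [0, 1]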
Example #3
    def cluster(self, graph):
        """
        Take a graph and cluster using the method in "On spectral clustering:
        Analysis and an algorithm" by Ng et al., 2001.

        :param graph: the graph to cluster
        :type graph: :class:`apgl.graph.AbstractMatrixGraph`

        :returns:  An array of size graph.getNumVertices() of cluster membership 
        """
        L = graph.normalisedLaplacianSym()

        omega, Q = numpy.linalg.eig(L)
        inds = numpy.argsort(omega)

        #Normalise the rows of the eigenvector matrix, then whiten the columns
        standardiser = Standardiser()
        V = standardiser.normaliseArray(Q[:, inds[0:self.k]].T).T
        V = vq.whiten(V)
        #Using kmeans2 here seems to result in a high variance
        #in the quality of clustering. Therefore stick to kmeans
        centroids, clusters = vq.kmeans(V, self.k, iter=self.numIterKmeans)
        clusters, distortion = vq.vq(V, centroids)

        return clusters
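For comparison, a minimal end-to-end sketch of the same Ng et al. pipeline on a plain symmetric adjacency matrix A, using only numpy and scipy.cluster.vq. The row normalisation is written out directly instead of going through Standardiser, and the Laplacian is the standard L = I - D^{-1/2} A D^{-1/2}, which is assumed to match graph.normalisedLaplacianSym().

import numpy
from scipy.cluster import vq

def spectral_cluster(A, k):
    #Symmetric normalised Laplacian L = I - D^{-1/2} A D^{-1/2}
    d = numpy.sum(A, 1)
    dInvSqrt = numpy.zeros_like(d, dtype=float)
    dInvSqrt[d > 0] = 1/numpy.sqrt(d[d > 0])
    L = numpy.eye(A.shape[0]) - dInvSqrt[:, None]*A*dInvSqrt[None, :]

    #Eigenvectors of the k smallest eigenvalues (eigh, since L is symmetric)
    omega, Q = numpy.linalg.eigh(L)
    V = Q[:, numpy.argsort(omega)[0:k]]

    #Normalise each row to unit length, then whiten, as in cluster() above
    rowNorms = numpy.sqrt(numpy.sum(V**2, 1))
    V = V/numpy.where(rowNorms > 0, rowNorms, 1)[:, None]
    V = vq.whiten(V)
    centroids, distortion = vq.kmeans(V, k)
    clusters, distortion = vq.vq(V, centroids)
    return clusters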
Example #4
    def clusterFromIterator(self, graphListIterator, verbose=False):
        """
        Find a set of clusters for the graphs given by the iterator. If verbose
        is true then each iteration is timed, bounds are computed, and the
        results are returned as lists.
        
        The difference between a weight matrix and the previous one should be
        positive.
        """
        clustersList = []
        decompositionTimeList = [] 
        kMeansTimeList = [] 
        boundList = []
        i = 0

        for subW in graphListIterator:
            if __debug__:
                Parameter.checkSymmetric(subW)

            if self.logStep and i % self.logStep == 0:
                logging.debug("Graph index: " + str(i))
            logging.debug("Clustering graph of size " + str(subW.shape))
            if self.alg != "efficientNystrom":
                ABBA = GraphUtils.shiftLaplacian(subW)

            # --- Eigen value decomposition ---
            startTime = time.time()
            if self.alg == "IASC":
                if i % self.T != 0:
                    omega, Q = self.approxUpdateEig(subW, ABBA, omega, Q)   
                    
                    if self.computeBound:
                        inds = numpy.flipud(numpy.argsort(omega))
                        Q = Q[:, inds]
                        omega = omega[inds]
                        bounds = self.pertBound(omega, Q, omegaKbot, AKbot, self.k2)
                        #boundList.append([i, bounds[0], bounds[1]])
                        
                        #Now use accurate values of norm of R and delta   
                        rank = Util.rank(ABBA.todense())
                        gamma, U = scipy.sparse.linalg.eigsh(ABBA, rank-1, which="LM", ncv = ABBA.shape[0])
                        #logging.debug("gamma=" + str(gamma))
                        bounds2 = self.realBound(omega, Q, gamma, AKbot, self.k2)                  
                        boundList.append([i, bounds[0], bounds[1], bounds2[0], bounds2[1]])      
                else: 
                    logging.debug("Computing exact eigenvectors")
                    self.storeInformation(subW, ABBA)

                    if self.computeBound: 
                        #omega, Q = scipy.sparse.linalg.eigsh(ABBA, min(self.k2*2, ABBA.shape[0]-1), which="LM", ncv = min(10*self.k2, ABBA.shape[0]))
                        rank = Util.rank(ABBA.todense())
                        omega, Q = scipy.sparse.linalg.eigsh(ABBA, rank-1, which="LM", ncv = ABBA.shape[0])
                        inds = numpy.flipud(numpy.argsort(omega))
                        omegaKbot = omega[inds[self.k2:]]  
                        QKbot = Q[:, inds[self.k2:]] 
                        AKbot = (QKbot*omegaKbot).dot(QKbot.T)
                        
                        omegaSort = numpy.flipud(numpy.sort(omega))
                    else: 
                        omega, Q = scipy.sparse.linalg.eigsh(ABBA, min(self.k2, ABBA.shape[0]-1), which="LM", ncv = min(10*self.k2, ABBA.shape[0]))
                            
            elif self.alg == "nystrom":
                omega, Q = Nystrom.eigpsd(ABBA, self.k3)
            elif self.alg == "exact": 
                omega, Q = scipy.sparse.linalg.eigsh(ABBA, min(self.k1, ABBA.shape[0]-1), which="LM", ncv = min(15*self.k1, ABBA.shape[0]))
            elif self.alg == "efficientNystrom":
                omega, Q = EfficientNystrom.eigWeight(subW, self.k2, self.k1)
            elif self.alg == "randomisedSvd": 
                Q, omega, R = RandomisedSVD.svd(ABBA, self.k4)
            else:
                raise ValueError("Invalid Algorithm: " + str(self.alg))

            decompositionTimeList.append(time.time()-startTime)                  
                  
            if self.alg == "IASC":
                self.storeInformation(subW, ABBA)
            
            # --- Kmeans ---
            startTime = time.time()
            inds = numpy.flipud(numpy.argsort(omega))

            standardiser = Standardiser()
            #For some very strange reason we get an overflow when computing the
            #norm of the rows of Q even though its elements are bounded by 1.
            #We'll ignore it for now
            try:
                V = standardiser.normaliseArray(Q[:, inds[0:self.k1]].real.T).T
            except FloatingPointError as e:
                logging.warn("FloatingPointError: " + str(e))
            V = VqUtils.whiten(V)
            if i == 0:
                centroids, distortion = vq.kmeans(V, self.k1, iter=self.nb_iter_kmeans)
            else:
                centroids = self.findCentroids(V, clusters[:subW.shape[0]])
                if centroids.shape[0] < self.k1:
                    nb_missing_centroids = self.k1 - centroids.shape[0]
                    random_centroids = V[numpy.random.randint(0, V.shape[0], nb_missing_centroids),:]
                    centroids = numpy.vstack((centroids, random_centroids))
                centroids, distortion = vq.kmeans(V, centroids) #iter can only be 1
            clusters, distortion = vq.vq(V, centroids)
            kMeansTimeList.append(time.time()-startTime)

            clustersList.append(clusters)

            #logging.debug("subW.shape: " + str(subW.shape))
            #logging.debug("len(clusters): " + str(len(clusters)))
            #from apgl.util.ProfileUtils import ProfileUtils
            #logging.debug("Total memory usage: " + str(ProfileUtils.memory()/10**6) + "MB")
            if ProfileUtils.memory() > 10**9:
                ProfileUtils.memDisplay(locals())

            i += 1

        if verbose:
            return clustersList, numpy.array((decompositionTimeList, kMeansTimeList)).T, boundList
        else:
            return clustersList
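The k-means warm start used above (re-deriving centroids from the previous graph's clustering, topping up with random rows of V when clusters have emptied, then passing them to vq.kmeans as an initial guess) can be isolated into a small sketch. warm_start_kmeans is an illustrative name, findCentroids is assumed to average the rows belonging to each cluster, and the previous assignment is assumed to cover every row of V.

import numpy
from scipy.cluster import vq

def warm_start_kmeans(V, prevClusters, k):
    #Recompute centroids from the previous assignment; clusters that
    #have become empty simply drop out of the list
    prevClusters = numpy.asarray(prevClusters)
    labels = numpy.unique(prevClusters)
    centroids = numpy.array([V[prevClusters == c].mean(0) for c in labels])

    #Top up with randomly chosen rows of V if fewer than k centroids survive
    if centroids.shape[0] < k:
        extra = V[numpy.random.randint(0, V.shape[0], k - centroids.shape[0]), :]
        centroids = numpy.vstack((centroids, extra))

    #An ndarray initial guess makes vq.kmeans refine these centroids
    #instead of starting from random ones
    centroids, distortion = vq.kmeans(V, centroids)
    return vq.vq(V, centroids)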
Example #5
class EgoNetworkSimulator(AbstractDiffusionSimulator):
    """
    A class which combines ego network prediction with the simulation of
    information transmission within a simulated social network.
    """
    def __init__(self, graph, predictor):
        """
        Create the class by reading a graph with labelled edges. Instantiate the predictor
        and create a preprocessor to standardise examples to have zero mean and unit variance.
        """
        self.graph = graph
        self.predictor = predictor
        self.errorMethod = Evaluator.balancedError

        #Note: we modify the vertices of the input graph!
        logging.warning("About to modify (normalise) the vertices of the graph.")
        self.preprocessor = Standardiser()
        V = graph.getVertexList().getVertices(graph.getAllVertexIds())
        V = self.preprocessor.normaliseArray(V)
        graph.getVertexList().setVertices(V)

    def getPreprocessor(self):
        """
        Returns the preprocessor
        """
        return self.preprocessor

    def sampleEdges(self, sampleSize):
        """
        This function exists so that we can sample the same examples used in model
        selection and exclude them when running evaluateClassifier.
        """
        edges = self.graph.getAllEdges()
        trainInds = numpy.random.permutation(edges.shape[0])[0:sampleSize]
        trainEdges = edges[trainInds, :]

        trainGraph = SparseGraph(self.graph.getVertexList(), self.graph.isUndirected())
        trainGraph.addEdges(trainEdges, self.graph.getEdgeValues(trainEdges))

        logging.info("Randomly sampled " + str(sampleSize) + " edges")

        return trainGraph

    def modelSelection(self, paramList, paramFunc, folds, errorFunc, sampleSize):
        """
        Perform model selection using an edge label predictor. 
        """
        Parameter.checkInt(folds, 0, sampleSize)
        Parameter.checkInt(sampleSize, 0, self.graph.getNumEdges()) 

        #trainGraph = self.sampleEdges(sampleSize)
        trainGraph = self.graph

        #Perform model selection
        meanErrs, stdErrs = self.predictor.cvModelSelection(trainGraph, paramList, paramFunc, folds, errorFunc)
        logging.info("Model selection errors:" + str(meanErrs))
        logging.info("Model selection stds:" + str(stdErrs))
        logging.info("Model selection best parameters:" + str(paramList[numpy.argmin(meanErrs)]))

        return paramList[numpy.argmin(meanErrs)], paramFunc, meanErrs[numpy.argmin(meanErrs)] 

    def evaluateClassifier(self, params, paramFuncs, folds, errorFunc, sampleSize, invert=True):
        """
        Evaluate the predictor with the given parameters. Often model selection is done
        before this step, and in that case invert=True uses a sample excluding the edges
        used for model selection.

        Return a set of errors, one for each fold.
        """
        Parameter.checkInt(folds, 0, sampleSize)
        Parameter.checkInt(sampleSize, 0, self.graph.getNumEdges())

        trainGraph = self.sampleEdges(sampleSize)

        return self.predictor.cvError(trainGraph, params, paramFuncs, folds, errorFunc)

    def trainClassifier(self, params, paramFuncs, sampleSize):
        """
        Set the given parameters on the predictor, then train it on a sampled subgraph.
        """
        for j in range(len(params)):
            paramFuncs[j](params[j])

        trainGraph = self.sampleEdges(sampleSize)
        self.predictor.learnModel(trainGraph)

        return self.predictor

    def runSimulation(self, maxIterations):
        Parameter.checkInt(maxIterations, 1, float('inf'))

        #Notice that the data is preprocessed in the same way as the survey data
        egoSimulator = EgoSimulator(self.graph, self.predictor, self.preprocessor)

        totalInfo = numpy.zeros(maxIterations+1)
        totalInfo[0] = EgoUtils.getTotalInformation(self.graph)
        logging.info("Total number of people with information: " + str(totalInfo[0]))

        logging.info("--- Simulation Started ---")

        for i in range(0, maxIterations):
            logging.info("--- Iteration " + str(i) + " ---")

            self.graph = egoSimulator.advanceGraph()
            totalInfo[i+1] = EgoUtils.getTotalInformation(self.graph)
            logging.info("Total number of people with information: " + str(totalInfo[i+1]))

            #Compute distribution of ages etc. in alters
            alterIndices = egoSimulator.getAlters(i)
            alterAges = numpy.zeros(len(alterIndices))
            alterGenders = numpy.zeros(len(alterIndices))

            for j in range(0, len(alterIndices)):
                currentVertex = self.graph.getVertex(alterIndices[j])
                alterAges[j] = currentVertex[self.egoQuestionIds.index(("Q5X", 0))]
                alterGenders[j] = currentVertex[self.egoQuestionIds.index(("Q4", 0))]

            (freqs, items) = Util.histogram(alterAges)
            logging.info("Distribution of ages " + str(freqs) + " " + str(items))
            (freqs, items) = Util.histogram(alterGenders)
            logging.info("Distribution of genders " + str(freqs) + " " + str(items))

        logging.info("--- Simulation Finished ---")

        return totalInfo, egoSimulator.getTransmissions()

    def getVertexFeatureDistribution(self, fIndex, vIndices=None):
        return self.graph.getVertexFeatureDistribution(fIndex, vIndices)

    def getPreProcessor(self):
        return self.preprocessor

    def getClassifier(self):
        return self.predictor

    preprocessor = None
    examplesList = None
    predictor = None
    graph = None
    edgeWeight = 1
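The sampling pattern in sampleEdges above (truncating a random permutation of the row indices, i.e. sampling edges without replacement) is generic enough to be worth a standalone sketch in plain numpy; sample_rows is an illustrative name, not part of the apgl API.

import numpy

def sample_rows(edges, sampleSize):
    #Sample sampleSize rows of an edge array without replacement by
    #truncating a random permutation of the row indices
    inds = numpy.random.permutation(edges.shape[0])[0:sampleSize]
    return edges[inds, :]

edges = numpy.array([[0, 1], [1, 2], [2, 3], [3, 0], [0, 2]])
print(sample_rows(edges, 3))  #3 distinct edges in random order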