def sequenceVectorStats(self, graph, subgraphIndices, treeStats=False, eigenStats=True):
        """
        Compute the vector statistics of a sequence of subgraphs of one graph.

        :param graph: the parent graph; must be an AbstractMatrixGraph.
        :param subgraphIndices: a list whose elements are lists of vertex
            indices, each one defining a subgraph of ``graph``.
        :param treeStats: boolean flag forwarded to vectorStatistics.
        :param eigenStats: flag forwarded to vectorStatistics.
        :returns: a list holding the result of vectorStatistics for each
            subgraph, in the order given by ``subgraphIndices``.
        """
        Parameter.checkClass(graph, AbstractMatrixGraph)
        for vertexInds in subgraphIndices:
            Parameter.checkList(vertexInds, Parameter.checkInt, [0, graph.getNumVertices()])
        Parameter.checkBoolean(treeStats)

        numGraphs = len(subgraphIndices)
        statsDictList = []

        for position, vertexInds in enumerate(subgraphIndices):
            Util.printIteration(position, self.vectorPrintStep, numGraphs)
            statsDict = self.vectorStatistics(graph.subgraph(vertexInds), treeStats, eigenStats)
            statsDictList.append(statsDict)

        return statsDictList
Example #2
0
    def predictEdges(self, vertexIndices):
        r"""
        Score candidate edges for each vertex in vertexIndices. The score between
        vertices x and y is the sum over their common neighbours z of
        1/log(deg(z)), where deg(z) is the number of nonzero entries in row z of
        the weight matrix. Common neighbours with log(deg(z)) == 0 are skipped.

        :param vertexIndices: a 1D array of the vertex indices to predict for.
        :returns: a pair (P, S), each with one row per entry of vertexIndices and
            self.windowSize columns, filled row by row from
            self.indicesFromScores(vertex, scores).
        """
        numVertices = self.graph.getNumVertices()
        Parameter.checkInt(self.windowSize, 1, numVertices)
        logging.info("Running predictEdges in " + str(self.__class__.__name__))

        numQueries = vertexIndices.shape[0]
        P = numpy.zeros((numQueries, self.windowSize))
        S = numpy.zeros((numQueries, self.windowSize))
        W = self.graph.getWeightMatrix()

        # log(deg(k)) depends only on W, so each value is computed at most once
        # instead of once per (query vertex, candidate vertex, neighbour) triple.
        logDegrees = {}

        for i in range(numQueries):
            Util.printIteration(i, self.printStep, numQueries)
            scores = numpy.zeros(numVertices)
            queryRow = W[vertexIndices[i], :]

            for j in range(numVertices):
                # Vertices adjacent to both the query vertex and vertex j
                commonNeighbours = numpy.nonzero(queryRow * W[j, :])[0]

                for k in commonNeighbours:
                    q = logDegrees.get(k)
                    if q is None:
                        q = numpy.log(numpy.nonzero(W[k, :])[0].shape[0])
                        logDegrees[k] = q

                    # deg(k) == 1 gives log == 0, which cannot be inverted
                    if q != 0:
                        scores[j] = scores[j] + 1/q

            P[i, :], S[i, :] = self.indicesFromScores(vertexIndices[i], scores)

        return P, S
    def sequenceScalarStats(self, graph, subgraphIndices, slowStats=True, treeStats=False):
        """
        Evaluate the scalar statistics of each subgraph of ``graph`` in turn.

        :param graph: the parent graph; must be an AbstractMatrixGraph.
        :param subgraphIndices: a list whose elements are lists of vertex
            indices, each one defining a subgraph of ``graph``.
        :param slowStats: boolean flag forwarded to scalarStatistics.
        :param treeStats: boolean flag forwarded to scalarStatistics.
        :returns: a matrix with len(subgraphIndices) rows and self.numStats
            columns; row i holds the statistics of the ith subgraph.
        """
        Parameter.checkClass(graph, AbstractMatrixGraph)
        for vertexInds in subgraphIndices:
            Parameter.checkList(vertexInds, Parameter.checkInt, [0, graph.getNumVertices()])
        Parameter.checkBoolean(slowStats)
        Parameter.checkBoolean(treeStats)

        numGraphs = len(subgraphIndices)
        statsMatrix = numpy.zeros((numGraphs, self.numStats))

        for row, vertexInds in enumerate(subgraphIndices):
            Util.printIteration(row, self.printStep, numGraphs)
            currentSubgraph = graph.subgraph(vertexInds)
            statsMatrix[row, :] = self.scalarStatistics(currentSubgraph, slowStats, treeStats)

        return statsMatrix
Example #4
0
    def readAuthorsAndDocuments(self, useAbstract=True): 
        """
        Parse self.dataFilename and collect, for every record, its authors,
        document text and citation count.

        Fields are recognised by their line prefix: "#*" (title), "#@" (comma
        separated authors), "#!" (abstract) and "#citation" (citation count).
        A record is emitted each time a blank line is read, so a trailing
        record that is not followed by a blank line is not returned.

        :param useAbstract: if True the document text is the title, a space and
            the abstract; otherwise it is the title only.
        :returns: a tuple (authorList, documentList, citationList) with one
            entry per record. Each author entry is a set of stripped, non-empty
            names, or an empty list if the record had no author line.
        """
        logging.debug("About to read file " + self.dataFilename)

        # Compile once (raw strings, so the regex escapes are explicit)
        titlePattern = re.compile(r"#\*(.*)")
        authorsPattern = re.compile(r"#@(.*)")
        abstractPattern = re.compile(r"#!(.*)")
        citationPattern = re.compile(r"#citation(.*)")

        authorList = []
        citationList = []
        documentList = []

        lastAbstract = ""
        lastTitle = ""
        lastAuthors = []
        lastCitationNo = 0

        # The context manager closes the file even if parsing raises
        with open(self.dataFilename) as inFile:
            for i, line in enumerate(inFile):
                Util.printIteration(i, self.stepSize, self.numLines)

                if line == "\n":
                    # A blank line terminates the current record
                    if useAbstract:
                        document = lastTitle + " " + lastAbstract
                    else:
                        document = lastTitle
                    documentList.append(document)
                    authorList.append(lastAuthors)
                    citationList.append(lastCitationNo)

                    lastAbstract = ""
                    lastTitle = ""
                    lastAuthors = []
                    lastCitationNo = 0
                    continue

                title = titlePattern.findall(line)
                if title and title[0]:
                    lastTitle = title[0]

                abstract = abstractPattern.findall(line)
                if abstract and abstract[0]:
                    lastAbstract = abstract[0]

                citationNo = citationPattern.findall(line)
                if citationNo and citationNo[0]:
                    lastCitationNo = int(citationNo[0])

                currentAuthors = authorsPattern.findall(line)
                if currentAuthors:
                    names = set(x.strip() for x in currentAuthors[0].split(","))
                    names.discard("")
                    lastAuthors = names

        logging.debug("Finished reading " + str(len(documentList)) + " articles")

        return authorList, documentList, citationList
Example #5
0
 def cleanXML(self):
     """
     Write a cleaned copy of self.xmlFileName to self.xmlCleanFilename, unless
     the cleaned file already exists.

     Each line has its HTML entities unescaped and its ampersands re-escaped
     as "&amp;". Any <title> or <ee> element whose text contains "<" or ">"
     is replaced with a default placeholder element.
     """
     if os.path.exists(self.xmlCleanFilename):
         logging.debug("File already generated: " + self.xmlCleanFilename)
         return

     logging.debug("Cleaning XML")
     h = HTMLParser.HTMLParser()
     titlePattern = re.compile(r"<title>.*[\<\>].*</title>")
     eePattern = re.compile(r"<ee>.*[\<\>].*</ee>")

     # Context managers close both files even if a line fails to process
     with open(self.xmlFileName) as inFile, open(self.xmlCleanFilename, "w") as outFile:
         for i, line in enumerate(inFile):
             Util.printIteration(i, self.stepSize, self.numLines)
             # The replacement used to map "&" to itself (a no-op), leaving
             # bare ampersands after unescaping; re-escape them instead.
             # NOTE(review): assumes the intended replacement was "&amp;" - confirm.
             outLine = h.unescape(line).replace("&", "&amp;")
             outLine = titlePattern.sub("<title>Default Title</title>", outLine)
             outLine = eePattern.sub("<ee>Default text</ee>", outLine)
             outFile.write(outLine)

     logging.debug("All done")
    def __updateEigenSystem(self, lmbda, Q, deltaW, W):
        """
        Update the eigensystem (lmbda, Q) one edge change at a time.

        deltaW holds the change in edge weights relative to the current weight
        matrix W. Every nonzero entry deltaW[i, j] with i < j is passed to
        self.incrementEigenSystem, after which W[i, j] and W[j, i] are
        incremented in place by that entry.

        :returns: the updated pair (lmbda, Q).
        """
        changeInds = deltaW.nonzero()
        rowInds = changeInds[0]
        colInds = changeInds[1]
        numChanges = rowInds.shape[0]

        for s in range(numChanges):
            Util.printIteration(s, 10, numChanges)
            i = rowInds[s]
            j = colInds[s]

            # Each symmetric change is handled once, via its i < j entry
            if i < j:
                assert deltaW[i, j] != 0

                # The increment is computed against W before this change is applied
                lmbda, Q = self.incrementEigenSystem(lmbda, Q, W, i, j, deltaW[i, j])
                W[i, j] += deltaW[i, j]
                W[j, i] += deltaW[i, j]

        return lmbda, Q 
Example #7
0
    def processRatings(self): 
        """
        Convert the per-movie rating files into flat arrays and save them for
        faster access. Nothing is done if both self.ratingFileName and
        self.custDictFileName already exist.

        For movies self.startMovieID..self.endMovieID the file
        "mv_<7-digit id>.txt" is read from the netflix/training_set directory.
        Its first line is discarded and every other line is parsed as
        "customerId,rating,YYYY-MM-DD". Customer ids are mapped to consecutive
        indices in order of first appearance. The arrays (movie index,
        customer index, rating, timestamp) are written with numpy.savez and
        the customer id -> index dict is pickled.
        """
        if os.path.exists(self.ratingFileName) and os.path.exists(self.custDictFileName):
            logging.debug("Ratings file " + str(self.ratingFileName) + " already processed")
            return

        dataDir = PathDefaults.getDataDir() + "netflix/training_set/"

        logging.debug("Processing ratings given in " + dataDir)

        custIdDict = {}

        movieIds = array.array("I")
        custIds = array.array("I")
        ratings = array.array("B")
        dates = array.array("L")
        j = 0

        for i in range(self.startMovieID, self.endMovieID+1):
            Util.printIteration(i-1, 1, self.endMovieID-1)

            # Close each movie file as soon as it is read; one is opened per movie
            with open(dataDir + "mv_" + str(i).zfill(7) + ".txt") as ratingsFile:
                # The first line is not a rating row
                ratingsFile.readline()

                for line in ratingsFile:
                    vals = line.split(",")

                    custId = int(vals[0])

                    if custId not in custIdDict:
                        # First time this customer is seen: assign the next index
                        custIdDict[custId] = j
                        custInd = j
                        j += 1
                    else:
                        custInd = custIdDict[custId]

                    rating = int(vals[1])
                    t = datetime.strptime(vals[2].strip(), "%Y-%m-%d")

                    movieIds.append(i-1)
                    custIds.append(custInd)
                    ratings.append(rating)
                    dates.append(int(time.mktime(t.timetuple())))

        movieIds = numpy.array(movieIds, numpy.uint32)
        custIds = numpy.array(custIds, numpy.uint32)
        ratings = numpy.array(ratings, numpy.uint8)
        dates = numpy.array(dates, numpy.uint32)

        assert ratings.shape[0] == self.numRatings

        numpy.savez(self.ratingFileName, movieIds, custIds, ratings, dates)
        logging.debug("Saved ratings file as " + self.ratingFileName)

        with open(self.custDictFileName, 'wb') as custDictFile:
            pickle.dump(custIdDict, custDictFile)
        logging.debug("Saved custIdDict as " + self.custDictFileName)
Example #8
0
    def processProbe(self): 
        """
        Go through the probe set and label the corresponding ratings in the full
        dataset as test. Nothing is done if self.isTrainRatingsFileName exists.

        The probe file consists of lines containing ":" (a movie id followed by
        a colon), each followed by lines holding one customer id. For each such
        (movie, customer) pair the matching rating is marked False in a boolean
        array, which is then saved with numpy.savez.
        """
        if os.path.exists(self.isTrainRatingsFileName):
            logging.debug("Train/test indicators file " + str(self.isTrainRatingsFileName) + " already processed")
            return

        # The dict is written in binary mode by processRatings, so read it the same way
        with open(self.custDictFileName, "rb") as custDictFile:
            custIdDict = pickle.load(custDictFile)

        dataArr = numpy.load(self.ratingFileName)
        movieInds, custInds, ratings = dataArr["arr_0"], dataArr["arr_1"], dataArr["arr_2"]
        logging.debug("Number of ratings: " + str(ratings.shape[0]+1))
        del ratings
        logging.debug("Training data loaded")

        # The builtin bool replaces the removed numpy.bool alias
        isTrainRating = numpy.ones(movieInds.shape[0], bool)
        i = 0

        #First figure out the movie boundaries 
        movieBoundaries = numpy.nonzero(numpy.diff(movieInds) != 0)[0] + 1
        movieBoundaries = numpy.insert(movieBoundaries, 0, 0)
        movieBoundaries = numpy.append(movieBoundaries, movieInds.shape[0])

        assert movieBoundaries.shape[0] == self.numMovies+1
        assert movieBoundaries[-1] == movieInds.shape[0]

        with open(self.probeFileName) as probeFile:
            for line in probeFile:
                if line.find(":") != -1:
                    Util.printIteration(i, 10, self.numProbeMovies)
                    # Drop the trailing ":" and newline to get the movie id
                    movieId = line[0:-2]
                    movieInd = int(movieId)-1

                    startInd = movieBoundaries[movieInd]
                    endInd = movieBoundaries[movieInd+1]
                    #All the customers that rated movie movieInd
                    tempCustInds = custInds[startInd:endInd]
                    sortedInds = numpy.argsort(tempCustInds)
                    # Sorted once per movie and reused for every customer line below
                    sortedCustInds = tempCustInds[sortedInds]

                    assert (movieInds[startInd:endInd] == movieInd).all()

                    i += 1
                else:
                    custId = int(line.strip())
                    custInd = custIdDict[custId]

                    # Binary search for the customer within the current movie's ratings
                    offset = numpy.searchsorted(sortedCustInds, custInd)
                    ratingInd = startInd + sortedInds[offset]
                    isTrainRating[ratingInd] = 0

                    assert custInds[ratingInd] == custInd

        assert i == self.numProbeMovies
        assert numpy.logical_not(isTrainRating).sum() == self.numProbeRatings

        numpy.savez(self.isTrainRatingsFileName, isTrainRating)
        logging.debug("Saved file as " + self.isTrainRatingsFileName)
Example #9
0
    def supervisedMC23(lists, itemList, topQList, verbose=False): 
        """
        A supervised version of MC2 of our own invention. The idea is to find a
        linear combination of transition matrices to fit a given one.

        The transition matrices returned by RankAggregator.MC2 are flattened into
        the columns of Q, and a quadratic program finds weights alpha (each
        alpha >= 0, summing to 1) minimising 0.5*alpha'Q'Q alpha - (Q's)'alpha,
        where s is the flattened transition matrix generated from topQList. The
        weighted sum of the matrices, divided by the number of lists, is then
        passed to RankAggregator.computeOutputList.

        NOTE(review): an earlier description mentioned fitting the stationary
        distribution; nothing in the optimisation below uses it - confirm the
        intended objective.

        :param lists: the rankings to aggregate.
        :param itemList: the list of all items.
        :param topQList: the ranking used to generate the target transition matrix.
        :param verbose: if True also return the list of transition matrices.
        :returns: (outputList, scores), plus PList when verbose is True.
        """
        ell = len(lists)
        n = len(itemList)
        outputList, scores, PList = RankAggregator.MC2(lists, itemList, verbose=True)

        # Q was previously used without being created, which raised NameError
        Q = cvxopt.spmatrix([], [], [], (n*n, len(lists)))

        for i, P in enumerate(PList):
            Q[:, i] = cvxopt.matrix(numpy.array(P.todense()).ravel())

        QQ = Q.T * Q

        Py = RankAggregator.generateTransitionMatrix(topQList, itemList)
        s = numpy.array(Py.todense()).ravel()
        s = cvxopt.matrix(s)

        # Inequality constraints: -alpha <= 0
        G = cvxopt.spdiag((-numpy.ones(ell)).tolist())
        h = cvxopt.matrix(numpy.zeros(ell))

        # Equality constraint: the weights sum to one
        A = cvxopt.matrix(numpy.ones(ell), (1, ell))
        b = cvxopt.matrix(numpy.ones(1))

        q = -Q.T * s

        sol = cvxopt.solvers.qp(QQ, q, G, h, A, b)

        alpha = numpy.array(sol['x'])

        #Combine the matrices 
        P = numpy.zeros((n, n))

        for j, Pj in enumerate(PList):
            Util.printIteration(j, 1, ell)
            P += alpha[j] * numpy.array(Pj.todense())

        P /= ell

        outputList, scores = RankAggregator.computeOutputList(P, itemList)

        if verbose:
            return outputList, scores, PList
        else:
            return outputList, scores
    def learnModel(self, graph):
        """
        Fit the model by treating every ego network as independent.

        For each vertex (ego) with at least one neighbour, the examples are the
        feature vectors of its neighbours and the labels are the values of the
        edges from the ego to those neighbours. self.alterRegressor is trained
        on these and the weight vector it returns is recorded together with the
        ego's own feature vector. Finally self.egoRegressor is trained to map
        ego features onto the recorded weight vectors.

        :param graph: The input graph to learn from.
        :type graph: class:`apgl.graph.AbstractSingleGraph`

        :returns: the matrix of weight vectors, one row per ego that has neighbours.
        """
        numVertices = graph.getNumVertices()
        logging.info("Learning model on graph of size " + str(numVertices))
        logging.info("EgoLearner: " + str(self.egoRegressor))
        logging.info("AlterLearner: " + str(self.alterRegressor))

        vertexList = graph.getVertexList()
        V = vertexList.getVertices(list(numpy.arange(0, numVertices)))
        numFeatures = vertexList.getNumFeatures()
        W = numpy.zeros((0, numFeatures))
        Xe = numpy.zeros((0, numFeatures))
        printStep = numpy.floor(numVertices/10)

        for ego in range(numVertices):
            Util.printIteration(ego, printStep, numVertices)
            neighbours = graph.neighbours(ego)

            # Isolated vertices contribute no training example
            if neighbours.shape[0] == 0:
                continue

            X = V[neighbours, :]
            y = numpy.ones(X.shape[0])
            for position, alter in enumerate(neighbours):
                y[position] = graph.getEdge(ego, alter)

            w = self.alterRegressor.learnModel(X, y)

            # Append one row for this ego to both matrices
            W = numpy.concatenate((W, numpy.array([w])), axis=0)
            Xe = numpy.concatenate((Xe, numpy.array([V[ego, :]])), axis=0)

        #Now regress the ego features Xe onto the weight vectors W
        logging.info("Finding regression matrix onto weights using matrix of size " + str(Xe.shape))
        gc.collect()
        self.egoRegressor.learnModel(Xe, W)

        return W 
Example #11
0
    def evaluateCvOuter(self, X, Y, folds, leafRank):
        """
        Cross validate the learner over stratified folds, recording the AUC and
        ROC curve on the training and test part of every fold.

        :param X: the examples, as a numpy array with one row per example.
        :param Y: the labels, as a 1D numpy array.
        :param folds: the number of folds (an integer of at least 2).
        :param leafRank: passed to self.setLeafRank before training.
        :returns: a tuple (bestParams, allMetrics, bestMetaDicts) in which
            bestParams is an empty list, allMetrics is the list
            [train AUCs, train ROCs, test AUCs, test ROCs] and bestMetaDicts
            holds one empty dict per fold.
        """
        Parameter.checkClass(X, numpy.ndarray)
        Parameter.checkClass(Y, numpy.ndarray)
        Parameter.checkInt(folds, 2, float('inf'))
        if Y.ndim != 1:
            raise ValueError("Expecting Y to be 1D")

        indexList = cross_val.StratifiedKFold(Y, folds)
        self.setLeafRank(leafRank)

        bestParams = []
        bestMetaDicts = []
        bestTrainROCs = []
        bestTestROCs = []
        bestTrainAUCs = numpy.zeros(folds)
        bestTestAUCs = numpy.zeros(folds)

        for foldInd, (trainInds, testInds) in enumerate(indexList):
            Util.printIteration(foldInd, 1, folds)
            trainX = X[trainInds, :]
            trainY = Y[trainInds]
            testX = X[testInds, :]
            testY = Y[testInds]

            logging.debug("Distribution of labels in train: " + str(numpy.bincount(trainY)))
            logging.debug("Distribution of labels in test: " + str(numpy.bincount(testY)))

            self.learnModel(trainX, trainY)
            predTrainY = self.predict(trainX)
            predTestY = self.predict(testX)
            bestTrainAUCs[foldInd] = Evaluator.auc(predTrainY, trainY)
            bestTestAUCs[foldInd] = Evaluator.auc(predTestY, testY)

            #Keep the ROC curves of this fold
            bestTrainROCs.append(Evaluator.roc(trainY, predTrainY))
            bestTestROCs.append(Evaluator.roc(testY, predTestY))

            bestMetaDicts.append({})

        logging.debug("Mean test AUC = " + str(numpy.mean(bestTestAUCs)))
        logging.debug("Std test AUC = " + str(numpy.std(bestTestAUCs)))

        return (bestParams, [bestTrainAUCs, bestTrainROCs, bestTestAUCs, bestTestROCs], bestMetaDicts)
Example #12
0
    def supervisedMC22(lists, itemList, topQList, verbose=False): 
        """
        Supervised variant of MC2: find a combination of the MC2 transition
        matrices that fits the transition matrix generated from topQList.

        Each transition matrix is flattened into a column of Q and a quadratic
        program is solved for weights that are nonnegative and sum to one. The
        weighted sum of the matrices, divided by the number of lists, is given
        to RankAggregator.computeOutputList.

        :param lists: the rankings to aggregate.
        :param itemList: the list of all items.
        :param topQList: the ranking used to generate the target transition matrix.
        :param verbose: if True also return the list of transition matrices.
        :returns: (outputList, scores), plus PList when verbose is True.
        """
        numLists = len(lists)
        numItems = len(itemList)
        _, _, PList = RankAggregator.MC2(lists, itemList, verbose=True)

        # One column of Q per flattened transition matrix
        Q = cvxopt.spmatrix([], [], [], (numItems*numItems, numLists))
        for col, transition in enumerate(PList):
            flatTransition = numpy.array(transition.todense()).ravel()
            Q[:, col] = cvxopt.matrix(flatTransition)

        QQ = Q.T * Q

        # Flattened target matrix
        targetMatrix = RankAggregator.generateTransitionMatrix(topQList, itemList)
        s = cvxopt.matrix(numpy.array(targetMatrix.todense()).ravel())

        # Constraints: every weight is nonnegative and the weights sum to one
        G = cvxopt.spdiag([-1.0] * numLists)
        h = cvxopt.matrix(numpy.zeros(numLists))
        A = cvxopt.matrix(numpy.ones(numLists), (1, numLists))
        b = cvxopt.matrix(numpy.ones(1))

        q = -Q.T * s

        solution = cvxopt.solvers.qp(QQ, q, G, h, A, b)
        alpha = numpy.array(solution['x'])

        # Weighted combination of the transition matrices
        combined = numpy.zeros((numItems, numItems))
        for j, Pj in enumerate(PList):
            Util.printIteration(j, 1, numLists)
            combined += alpha[j] * numpy.array(Pj.todense())
        combined /= numLists

        outputList, scores = RankAggregator.computeOutputList(combined, itemList)

        if not verbose:
            return outputList, scores
        return outputList, scores, PList
Example #13
0
    def generate_data_file(dir, nb_user=None):
        # Generates the data files for decreasing numbers of users: each missing
        # file is produced by filtering the lines of the file for
        # current_nb_user+1 (see the loop below).
        # NOTE(review): the next statement appears garbled in this copy (text is
        # missing between the two string literals, and the indentation of the
        # following line implies a lost enclosing block) - restore it from the
        # original source before relying on this function.
        logging.debug("nb_user: "******"creating file " + str(f_data_name))
            shutil.copy(BemolData.get_file_name(dir, None), f_data_name)

        # other files to generate
        nb_user_to_generate = []
        current_nb_user = BemolData.get_nb_user_to_read(nb_user)
        logging.debug("current_nb_user before while: " + str(current_nb_user))
        # !!!!! security failure TOCTTOU
        # Collect every user count whose file does not exist yet, stopping at
        # the first count for which a file is already present.
        while (not os.path.exists(BemolData.get_file_name(dir, current_nb_user))):
            logging.debug("current_nb_user in while: " + str(current_nb_user))
            nb_user_to_generate.append(current_nb_user)
            current_nb_user = BemolData.get_nb_user_to_read(current_nb_user+1)
        # Reverse so that the files are generated starting from the last count collected
        nb_user_to_generate.reverse()

    
        # generate other files
        for current_nb_user in nb_user_to_generate:
            # read data
            f_existing_data_name = BemolData.get_file_name(dir, current_nb_user+1)
            f_to_create_data_name = BemolData.get_file_name(dir, current_nb_user)
            logging.info("creating file " + f_to_create_data_name)
            dict_user = MyDictionary()
            try:
                # NOTE(review): neither gzip file is closed in the visible code - confirm
                f_existing_data = gzip.open(f_existing_data_name, 'rb')
                f_to_create_data = gzip.open(f_to_create_data_name, 'wb')

                i = 0
                i_max = BemolData.get_nb_line(f_existing_data_name)
                for line in f_existing_data:
                    Util.printIteration(i, 1000, i_max); i += 1
                    # Each line is expected to hold four whitespace separated integers
                    m = re.match("(\d+)\s(\d+)\s(\d+)\s(\d+)", line)
                    # Keep the line only if the index of its first field is below current_nb_user
                    if dict_user.index(int(m.group(1))) < current_nb_user:
                        f_to_create_data.write(line)
            except IOError as error:
                # NOTE(review): error.filename is compared with the file object
                # f_existing_data rather than the path f_existing_data_name - confirm intent
                if error.filename == f_existing_data:
                    raise RGIOError(error, RGIOError.indent() + 'it disappeared in the meanwhile')
                else:
                    raise error
    def sequenceScalarStats(self, graph, subgraphIndices):
        """
        Evaluate the scalar statistics of each subgraph of ``graph`` in turn.

        :param graph: the parent graph.
        :param subgraphIndices: a list whose elements are lists of vertex
            indices, each one defining a subgraph of ``graph``.
        :returns: a matrix with len(subgraphIndices) rows and self.numStats
            columns; row i holds the statistics of the ith subgraph.
        """
        numGraphs = len(subgraphIndices)
        statsMatrix = numpy.zeros((numGraphs, self.numStats))

        for row, vertexInds in enumerate(subgraphIndices):
            Util.printIteration(row, self.printStep, numGraphs)
            logging.debug("Subgraph size: " + str(len(vertexInds)))
            currentSubgraph = graph.subgraph(vertexInds)
            statsMatrix[row, :] = self.scalarStatistics(currentSubgraph)

        return statsMatrix
Example #15
0
    def learnModel(self, graph):
        """
        Fit the model from the edges of the input graph, which is also stored
        as self.graph.

        For each vertex (ego) with at least one neighbour, the examples are the
        feature vectors of its neighbours (label 1) followed by those of
        randomly chosen non-neighbours (label -1), with at most as many
        non-neighbours as neighbours. self.alterRegressor is trained on these
        and the weight vector it returns is recorded together with the ego's
        own feature vector. Finally self.egoRegressor is trained to map ego
        features onto the recorded weight vectors.

        :param graph: The input graph to learn from.
        :type graph: class:`apgl.graph.AbstractSingleGraph`
        """
        numVertices = graph.getNumVertices()
        Parameter.checkInt(self.windowSize, 1, numVertices)
        self.graph = graph
        logging.info("Learning model on graph of size " + str(numVertices))

        vertexList = graph.getVertexList()
        allIndices = numpy.arange(0, numVertices)
        V = vertexList.getVertices(allIndices)
        numFeatures = vertexList.getNumFeatures()
        W = numpy.zeros((0, numFeatures))
        Xe = numpy.zeros((0, numFeatures))
        printStep = numpy.floor(numVertices/10)

        for ego in range(numVertices):
            Util.printIteration(ego, printStep, numVertices)
            neighbours = graph.neighbours(ego)
            numNeighbours = neighbours.shape[0]

            # Isolated vertices contribute no training example
            if numNeighbours == 0:
                continue

            # Sample non-neighbours without replacement
            compNeighbours = numpy.setdiff1d(allIndices, neighbours)
            perm = numpy.random.permutation(compNeighbours.shape[0])[0:numNeighbours]
            negativeVertices = V[compNeighbours[perm], :]

            X = numpy.concatenate((V[neighbours, :], negativeVertices), axis=0)
            y = numpy.ones(X.shape[0])
            y[numNeighbours:] = -1

            w = self.alterRegressor.learnModel(X, y)

            # Append one row for this ego to both matrices
            W = numpy.concatenate((W, numpy.array([w])), axis=0)
            Xe = numpy.concatenate((Xe, numpy.array([V[ego, :]])), axis=0)

        #Now regress the ego features Xe onto the weight vectors W
        self.egoRegressor.learnModel(Xe, W)
Example #16
0
    def maxProductPaths(self):
        """
        Compute, for all pairs of vertices, the maximum product of edge weights
        along a path between them, with a modified Floyd-Warshall algorithm.

        :returns: A matrix P whose ijth entry corresponds to the maximal product of edge weights between them.
        """
        numVertices = self.vList.getNumVertices()
        stepSize = min(100, numVertices-1)
        P = self.getWeightMatrix().copy()

        for k in range(numVertices):
            Util.printIteration(k, stepSize, numVertices)
            # Products of the best paths i -> k and k -> j found so far
            viaK = numpy.outer(P[:, k], P[k, :])
            P = numpy.maximum(P, viaK)

        return P
 def __getArcString(self, graph):
     """
     Build the text describing the arcs of the graph, one line per
     (vertex, neighbour) pair in the form
     "<index1> <index2> <edge value> c <default colour>".

     :param graph: the graph whose vertices, neighbours and edges are read.
     :returns: the concatenation of all arc lines (empty if there are none).
     """
     # Collect the lines and join once instead of growing a string repeatedly
     arcLines = []

     for ind, vertex1 in enumerate(graph.getAllVertexIds()):
         Util.printIteration(ind, self.printStep, graph.getNumVertices())

         neighbours = graph.neighbours(vertex1)
         pajekIndex1 = self.vertexIdDict[vertex1]

         for vertex2 in neighbours:
             pajekIndex2 = self.vertexIdDict[vertex2]
             arcLines.append(
                 str(pajekIndex1) + " " + str(pajekIndex2) + " " + str(graph.getEdge(vertex1, vertex2))
                 + " c " + self.colours[self.defaultColour] + "\n")

     return "".join(arcLines)
Example #18
0
 def coauthorsGraphFromAuthors(self, relevantExperts): 
     """
     Take a set of relevant authors and return the graph of coauthorships
     between them found in self.dataFilename.

     Every line with the "#@" prefix is read as a comma separated author
     list; each pair of its authors that are both in relevantExperts adds one
     edge. Parallel edges are merged by summing their weights.

     :param relevantExperts: the collection of author names to keep.
     :returns: a pair (graph, authorIndexer) where graph is an igraph.Graph
         with edge attributes "weight" and "invWeight" (1/weight), and
         authorIndexer holds the relevant experts, appended in iteration order.
     """
     authorIndexer = IdIndexer()
     author1Inds = array.array("i")
     author2Inds = array.array("i")

     for relevantExpert in relevantExperts:
         authorIndexer.append(relevantExpert)

     authorsPattern = re.compile("#@(.*)")

     # The context manager closes the data file even if parsing raises
     with open(self.dataFilename) as dataFile:
         for i, line in enumerate(dataFile):
             Util.printIteration(i, self.stepSize, self.numLines)
             authors = authorsPattern.findall(line)

             if not authors:
                 continue

             authors = set([x.strip() for x in authors[0].split(",")])

             # Skip publications with no relevant author at all
             if len(authors.intersection(relevantExperts)) == 0:
                 continue

             for author1, author2 in itertools.combinations(authors, 2):
                 if author1 in relevantExperts and author2 in relevantExperts:
                     author1Ind = authorIndexer.append(author1)
                     author2Ind = authorIndexer.append(author2)

                     author1Inds.append(author1Ind)
                     author2Inds.append(author2Ind)

     logging.debug("Found " + str(len(authorIndexer.getIdDict())) + " coauthors")

     #Coauthor graph is undirected 
     # The builtin int replaces the removed numpy.int alias
     author1Inds = numpy.array(author1Inds, int)
     author2Inds = numpy.array(author2Inds, int)
     edges = numpy.c_[author1Inds, author2Inds]

     graph = igraph.Graph()
     graph.add_vertices(len(authorIndexer.getIdDict()))
     graph.add_edges(edges)
     graph.es["weight"] = numpy.ones(graph.ecount())
     # Merge parallel edges, summing their weights
     graph.simplify(combine_edges=sum)
     graph.es["invWeight"] = 1.0/(numpy.array(graph.es["weight"]))

     return graph, authorIndexer
 def __getEdgeString(self, graph):
     """
     Build the text describing the edges of the graph, one line per
     (vertex, neighbour) pair in the form
     "<index1> <index2> <weight> w <size> c <colour>", where the weight, size
     and colour come from self.getEdgeWeight, self.getEdgeSize and
     self.getEdgeColour.

     :param graph: the graph whose vertices and neighbours are read.
     :returns: the concatenation of all edge lines (empty if there are none).
     """
     # Collect the lines and join once instead of growing a string repeatedly
     edgeLines = []

     for ind, vertex1 in enumerate(graph.getAllVertexIds()):
         Util.printIteration(ind, self.printStep, graph.getNumVertices())
         neighbours = graph.neighbours(vertex1)
         pajekIndex1 = self.vertexIdDict[vertex1]

         for vertex2 in neighbours:
             pajekIndex2 = self.vertexIdDict[vertex2]
             colour = self.getEdgeColour(vertex1, vertex2, graph)
             edgeLines.append(
                 str(pajekIndex1) + " " + str(pajekIndex2) + " " + str(self.getEdgeWeight(vertex1, vertex2, graph))
                 + " w " + str(self.getEdgeSize(vertex1, vertex2, graph))
                 + " c " + colour + "\n")

     return "".join(edgeLines)
Example #20
0
    def generateReceivers(self, egoAlterArray, realAltersArray, alterFieldIndices):
        """ 
        Takes in a row for each ego with up to self.numPossibleAlters alters on
        each line, each alter occupying self.partialAlterFields consecutive
        columns, and picks a matching real alter for each of them.

        An alter slot whose first field is -1 or 0 ends the scan of that ego.
        Otherwise the candidates are the rows of realAltersArray whose columns
        alterFieldIndices equal the corresponding fields of the slot, and one
        candidate (if any) is chosen at random.

        :param egoAlterArray: one row per ego holding the partial alter fields.
        :param realAltersArray: one row per real alter.
        :param alterFieldIndices: a list of indices in realAltersArray that
            match those present in egoAlterArray.
        :returns: a tuple (generatedAltersArray, egoIndices, alterIndices) with
            one entry per chosen receiver: the chosen row of realAltersArray,
            the index of its ego and the index of the chosen row.
        """
        numEgos = egoAlterArray.shape[0]
        maxAlters = numEgos * self.numPossibleAlters
        numRealAlters = realAltersArray.shape[0]

        generatedAltersArray = numpy.zeros((maxAlters, realAltersArray.shape[1]))
        # The builtin int replaces the removed numpy.int alias
        egoIndices = numpy.zeros(maxAlters, int)
        alterIndices = numpy.zeros(maxAlters, numpy.int32)
        receiverIndex = 0

        logging.info("Generating receivers for " + str(numEgos) + " egos")

        for i in range(numEgos):
            Util.printIteration(i, self.printIterationStep, numEgos)

            for j in range(self.numPossibleAlters):
                fieldOffset = j*self.partialAlterFields
                firstField = egoAlterArray[i, fieldOffset]

                # A first field of -1 or 0 ends the scan of this ego's slots
                if firstField == -1 or firstField == 0:
                    break

                # Narrow the candidates down one matching field at a time
                candidateAlters = numpy.arange(numRealAlters)

                for k, fieldIndex in enumerate(alterFieldIndices):
                    subset = numpy.nonzero(realAltersArray[:, fieldIndex] == egoAlterArray[i, fieldOffset+k])[0]
                    candidateAlters = numpy.intersect1d(candidateAlters, subset)

                if candidateAlters.shape[0] != 0:
                    # NOTE(review): assumes rand.randint excludes the upper bound
                    # (as numpy.random.randint does) - confirm what rand refers to
                    alterIndices[receiverIndex] = candidateAlters[rand.randint(0, candidateAlters.shape[0])]
                    egoIndices[receiverIndex] = i

                    chosenAlter = realAltersArray[alterIndices[receiverIndex], :]
                    generatedAltersArray[receiverIndex, :] = chosenAlter
                    receiverIndex = receiverIndex + 1

        # Trim the preallocated arrays to the number of receivers actually chosen
        generatedAltersArray = generatedAltersArray[0:receiverIndex, :]
        egoIndices = egoIndices[0:receiverIndex]
        alterIndices = alterIndices[0:receiverIndex]
        logging.info("Done - chose " + str(receiverIndex) + " receivers")

        return (generatedAltersArray, egoIndices, alterIndices)
Example #21
0
    def sequenceVectorStats(self, graph, subgraphIndices, treeStats=False, eigenStats=True):
        """
        Compute vector statistics for a sequence of subgraphs of one graph.

        :param graph: The parent graph; checked to be an AbstractMatrixGraph.

        :param subgraphIndices: A sequence of vertex index lists, one per subgraph.

        :param treeStats: Forwarded to vectorStatistics; checked to be a boolean.

        :param eigenStats: Forwarded to vectorStatistics.

        :return: A list with the result of vectorStatistics for each subgraph, in order.
        """
        Parameter.checkClass(graph, AbstractMatrixGraph)
        for vertexInds in subgraphIndices:
            Parameter.checkList(vertexInds, Parameter.checkInt, [0, graph.getNumVertices()])
        Parameter.checkBoolean(treeStats)

        total = len(subgraphIndices)
        results = []

        for step in range(total):
            Util.printIteration(step, self.vectorPrintStep, total)
            current = graph.subgraph(subgraphIndices[step])
            stats = self.vectorStatistics(current, treeStats, eigenStats)
            results.append(stats)

        return results
Example #22
0
    def __getArcString(self, graph):
        """
        Build the arc lines of a Pajek file for the given graph.

        One line is produced per (vertex, neighbour) pair, in the form
        "<from> <to> <edge value> c <colour>", where the endpoints are the
        Pajek indices stored in self.vertexIdDict, the edge value comes from
        graph.getEdge and the colour is self.colours[self.defaultColour].

        :param graph: The graph whose arcs are written.

        :return: All arc lines concatenated into a single string.
        """
        # Collect the lines and join once instead of growing a string with
        # repeated concatenation inside the nested loop.
        arcLines = []

        for ind, vertex1 in enumerate(graph.getAllVertexIds()):
            Util.printIteration(ind, self.printStep, graph.getNumVertices())

            neighbours = graph.neighbours(vertex1)
            pajekIndex1 = self.vertexIdDict[vertex1]

            for vertex2 in neighbours:
                pajekIndex2 = self.vertexIdDict[vertex2]
                arcLines.append(
                    str(pajekIndex1) + " " + str(pajekIndex2) + " "
                    + str(graph.getEdge(vertex1, vertex2))
                    + " c " + self.colours[self.defaultColour] + "\n")

        return "".join(arcLines)
Example #23
0
    def sequenceClustering(self, graph, subgraphIndices, clusterFunc, maxComponent=True):
        """
        Cluster each subgraph in a sequence of subgraphs of the given graph.

        :param graph: The parent graph from which the subgraphs are extracted.

        :param subgraphIndices: A sequence of vertex index collections, one per subgraph.

        :param clusterFunc: A callable applied to each (possibly reduced) subgraph.

        :param maxComponent: If true, only the last component returned by
            findConnectedComponents() on each subgraph is clustered.

        :return: A list with the result of clusterFunc for each subgraph, in order.
        """
        total = len(subgraphIndices)
        results = []

        for step in range(total):
            Util.printIteration(step, self.vectorPrintStep, total)
            current = graph.subgraph(subgraphIndices[step])

            if maxComponent:
                # Restrict to the final entry of the component list.
                components = current.findConnectedComponents()
                current = current.subgraph(components[-1])

            results.append(clusterFunc(current))

        return results
        
Example #24
0
    def learnModel(self, X, y):
        """
        Learn a model for a set of examples given as the rows of the matrix X,
        with corresponding labels given in the elements of 1D array y. One
        TreeRank model is trained per tree on a random sample of the examples
        and the trained models are stored in self.forestList.

        :param X: A matrix with examples as rows
        :type X: :class:`ndarray`

        :param y: A vector of labels
        :type y: :class:`ndarray`

        :raises ValueError: if y does not contain exactly the two labels -1 and +1.
        """
        Parameter.checkClass(X, numpy.ndarray)
        Parameter.checkClass(y, numpy.ndarray)
        Parameter.checkArray(X)
        Parameter.checkArray(y)

        labels = numpy.unique(y)
        if labels.shape[0] != 2:
            raise ValueError("Can only accept binary labelled data")
        if (labels != numpy.array([-1, 1])).any():
            raise ValueError("Labels must be -1/+1: " + str(labels))

        numExamples = X.shape[0]
        # numpy.round returns a float, but the value is used below as an array
        # size and as a slice bound, both of which must be integers.
        numSampledExamples = int(numpy.round(self.sampleSize*numExamples))
        forestList = []

        for i in range(self.numTrees):
            Util.printIteration(i, 1, self.numTrees, "Tree: ")
            if self.sampleReplace:
                # Sample with replacement.
                inds = numpy.random.randint(0, numExamples, numSampledExamples)
            else:
                # Sample without replacement: prefix of a random permutation.
                inds = numpy.random.permutation(numExamples)[0:numSampledExamples]

            treeRank = TreeRank(self.leafRanklearner)
            treeRank.setMaxDepth(self.maxDepth)
            treeRank.setMinSplit(self.minSplit)
            treeRank.setFeatureSize(self.featureSize)
            treeRank.setBestResponse(self.bestResponse)
            treeRank.learnModel(X[inds, :], y[inds])
            forestList.append(treeRank)

        self.forestList = forestList
Example #25
0
    def sequenceScalarStats(self, graph, subgraphIndices, slowStats=True, treeStats=False):
        """
        Compute scalar statistics for a sequence of subgraphs of one graph.

        :param graph: The parent graph; checked to be an AbstractMatrixGraph.

        :param subgraphIndices: A sequence of vertex index lists, one per subgraph.

        :param slowStats: Forwarded to scalarStatistics; checked to be a boolean.

        :param treeStats: Forwarded to scalarStatistics; checked to be a boolean.

        :return: A matrix with one row of self.numStats statistics per subgraph.
        """
        Parameter.checkClass(graph, AbstractMatrixGraph)
        for vertexInds in subgraphIndices:
            Parameter.checkList(vertexInds, Parameter.checkInt, [0, graph.getNumVertices()])
        Parameter.checkBoolean(slowStats)
        Parameter.checkBoolean(treeStats)

        total = len(subgraphIndices)
        results = numpy.zeros((total, self.numStats))

        for row in range(total):
            Util.printIteration(row, self.printStep, total)
            logging.debug("Subgraph size: " + str(len(subgraphIndices[row])))
            current = graph.subgraph(subgraphIndices[row])
            rowStats = self.scalarStatistics(current, slowStats, treeStats)
            results[row, :] = rowStats

        return results
Example #26
0
def computeLearningRates(datasetNames, numProcesses, fileNameSuffix, learnerName, sampleSizes, foldsSet):
    """
    Compute a grid of learning rates for each dataset and save it to an .npz file.

    For every dataset the results are written to
    outputDir + name + fileNameSuffix + ".npz". A dataset is skipped when that
    file already exists or is currently locked. The lock taken for a dataset is
    always released, even if loading, learning or saving raises.

    :param datasetNames: A sequence of entries whose element 0 is the dataset
        name and element 1 the number of realisations.

    :param numProcesses: Forwarded to getSetup.

    :param fileNameSuffix: Appended to the dataset name to form the output file name.

    :param learnerName: Forwarded to getSetup to select the learner.

    :param sampleSizes: A 1D array of training sample sizes.

    :param foldsSet: Forwarded to learner.learningRate.
    """
    dataDir = PathDefaults.getDataDir() + "modelPenalisation/"
    outputDir = PathDefaults.getOutputDir() + "modelPenalisation/"

    learner, loadMethod, dataDir, outputDir, paramDict = getSetup(learnerName, dataDir, outputDir, numProcesses)

    for datasetEntry in datasetNames:
        datasetName = datasetEntry[0]
        logging.debug("Learning using dataset " + datasetName)
        outfileName = outputDir + datasetName + fileNameSuffix

        fileLock = FileLock(outfileName + ".npz")
        if fileLock.isLocked() or fileLock.fileExists():
            continue

        fileLock.lock()
        try:
            numRealisations = datasetEntry[1]
            # Result shape: realisations x sample sizes x learner parameter grid.
            gridShape = [numRealisations, sampleSizes.shape[0]]
            gridShape.extend(list(learner.gridShape(paramDict)))
            gridShape = tuple(gridShape)

            betaGrids = numpy.zeros(gridShape)

            for k in range(sampleSizes.shape[0]):
                sampleSize = sampleSizes[k]

                logging.debug("Using sample size " + str(sampleSize))
                for j in range(numRealisations):
                    Util.printIteration(j, 1, numRealisations, "Realisation: ")
                    trainX, trainY, testX, testY = loadMethod(dataDir, datasetName, j)

                    # NOTE(review): the generator is reseeded for every
                    # realisation, so the same permutation is drawn whenever
                    # trainX has the same number of rows - confirm this is intended.
                    numpy.random.seed(21)
                    trainInds = numpy.random.permutation(trainX.shape[0])[0:sampleSize]
                    validX = trainX[trainInds, :]
                    validY = trainY[trainInds]

                    betaGrids[j, k, :] = learner.learningRate(validX, validY, foldsSet, paramDict)

            numpy.savez(outfileName, betaGrids)
            logging.debug("Saved results as file " + outfileName + ".npz")
        finally:
            # Release the lock on failure too, so a crashed run does not leave
            # the dataset locked.
            fileLock.unlock()
Example #27
0
    def __getEdgeString(self, graph):
        """
        Build the edge lines of a Pajek file for the given graph.

        One line is produced per (vertex, neighbour) pair, in the form
        "<from> <to> <weight> w <size> c <colour>", where the endpoints are the
        Pajek indices stored in self.vertexIdDict and the remaining values come
        from getEdgeWeight, getEdgeSize and getEdgeColour respectively.

        :param graph: The graph whose edges are written.

        :return: All edge lines concatenated into a single string.
        """
        # Collect the lines and join once instead of growing a string with
        # repeated concatenation inside the nested loop.
        edgeLines = []

        for ind, vertex1 in enumerate(graph.getAllVertexIds()):
            Util.printIteration(ind, self.printStep, graph.getNumVertices())
            neighbours = graph.neighbours(vertex1)
            pajekIndex1 = self.vertexIdDict[vertex1]

            for vertex2 in neighbours:
                pajekIndex2 = self.vertexIdDict[vertex2]
                colour = self.getEdgeColour(vertex1, vertex2, graph)
                edgeLines.append(
                    str(pajekIndex1) + " " + str(pajekIndex2) + " "
                    + str(self.getEdgeWeight(vertex1, vertex2, graph))
                    + " w " + str(self.getEdgeSize(vertex1, vertex2, graph))
                    + " c " + colour + "\n")

        return "".join(edgeLines)
Example #28
0
    def maxInfluence(self, P, k):
        """
        The matrix P is one representing the quality of information reaching each
        node from certain input nodes. The i,jth entry is the quality of information
        reaching vertex j from i. Returns the k nodes of maximal influence using
        a greedy method.

        In each of the k rounds every unselected vertex is scored by the sum of
        the element-wise maximum of its row of P and the best activations found
        so far; the vertex with the largest strictly improving score is selected.
        If no vertex improves the total, an arbitrary unselected vertex is chosen.

        Each round scans up to n candidates at O(n) work each, so the overall
        cost is O(k n^2) for n vertices.

        :param P: A square array of information qualities.

        :param k: The number of vertices to select, between 0 and P.shape[0].

        :return: The list of selected vertex indices, in selection order.
        """
        Parameter.checkInt(k, 0, P.shape[0])

        numVertices = P.shape[0]
        bestActivations = numpy.zeros(numVertices)
        bestTotalActivation = 0

        selectedIndices = []
        unselectedIndices = set(range(0, numVertices))
        stepSize = 50

        for i in range(0, k):
            Util.printIteration(i, stepSize, k)
            # Reset per round so that a round without any improvement is
            # detected explicitly instead of reusing a stale (or unbound) index.
            bestIndex = None

            for j in unselectedIndices:
                currentActivation = numpy.sum(numpy.maximum(P[j, :], bestActivations))

                if currentActivation > bestTotalActivation:
                    bestIndex = j
                    bestTotalActivation = currentActivation

            if bestIndex is None:
                # No candidate raises the total: take any remaining vertex and
                # leave the activations unchanged.
                bestIndex = unselectedIndices.copy().pop()
            else:
                bestActivations = numpy.maximum(P[bestIndex, :], bestActivations)

            selectedIndices.append(bestIndex)
            unselectedIndices.remove(bestIndex)

        return selectedIndices
Example #29
0
    def writeToFile(self, fileName, graph):
        """
        Write the graph to a Pajek file named fileName + ".net".

        The vertex section is written first and, as a side effect, fills
        self.vertexIdDict with the 1-based Pajek index of every vertex id. The
        edges follow as an "*Edges" section when graph.isUndirected() is true
        and as an "*Arcs" section otherwise.

        :param fileName: The output path without the ".net" extension.

        :param graph: The graph to write.
        """
        fileName = fileName + ".net"

        numVertices = graph.getNumVertices()
        pajekIndex = 1

        # The context manager closes the file even if a write or a graph
        # accessor raises part-way through.
        with open(fileName, 'w') as f:
            f.write("*Vertices " + str(numVertices) + "\n")
            logging.info('Writing to Pajek file: ' + fileName)
            logging.info('Writing vertices')

            for i in graph.getAllVertexIds():
                # NOTE(review): the vertex id itself is used as the progress
                # counter - presumably ids are 0..numVertices-1; confirm.
                Util.printIteration(i, self.printStep, graph.getNumVertices())
                self.vertexIdDict[i] = pajekIndex
                vertexSize = self.getVertexSize(i, graph)
                vertexColour = self.getVertexColour(i, graph)

                vertexString = str(pajekIndex) + ' "' + str(pajekIndex) + '" '
                vertexString = vertexString + "0.0 0.0 0.0 "
                vertexString = vertexString + "x_fact " + str(vertexSize) + " "
                vertexString = vertexString + "y_fact " + str(vertexSize) + " "
                vertexString = vertexString + "ic " + vertexColour + " "
                vertexString = vertexString + "bc " + vertexColour + " \n"
                f.write(vertexString)

                pajekIndex += 1

            logging.info('Writing edges')
            if graph.isUndirected():
                f.write("*Edges\n")
                f.write(self.__getEdgeString(graph))
            else:
                f.write("*Arcs\n")
                f.write(self.__getArcString(graph))

        logging.info("Finished, wrote " + str(numVertices) + " vertices & " +
                     str(graph.getNumEdges()) + " edges.")
    def sequenceClustering(self,
                           graph,
                           subgraphIndices,
                           clusterFunc,
                           maxComponent=True):
        """
        Apply a clustering function to each subgraph in a sequence of
        subgraphs of the given graph.

        :param graph: The parent graph from which the subgraphs are extracted.

        :param subgraphIndices: A sequence of vertex index collections, one per subgraph.

        :param clusterFunc: A callable applied to each (possibly reduced) subgraph.

        :param maxComponent: If true, only the last component returned by
            findConnectedComponents() on each subgraph is clustered.

        :return: A list with the result of clusterFunc for each subgraph, in order.
        """
        count = len(subgraphIndices)
        clusterings = []

        for position in range(count):
            Util.printIteration(position, self.vectorPrintStep, count)
            piece = graph.subgraph(subgraphIndices[position])

            if maxComponent:
                # Restrict to the final entry of the component list.
                componentList = piece.findConnectedComponents()
                piece = piece.subgraph(componentList[-1])

            clusterings.append(clusterFunc(piece))

        return clusterings
    def writeToFile(self, fileName, graph):
        """
        Write the graph to a Pajek file named fileName + ".net".

        The vertex section is written first and, as a side effect, fills
        self.vertexIdDict with the 1-based Pajek index of every vertex id. The
        edges follow as an "*Edges" section when graph.isUndirected() is true
        and as an "*Arcs" section otherwise.

        :param fileName: The output path without the ".net" extension.

        :param graph: The graph to write.
        """
        fileName = fileName + ".net"

        numVertices = graph.getNumVertices()
        pajekIndex = 1

        # The context manager closes the file even if a write or a graph
        # accessor raises part-way through.
        with open(fileName, 'w') as f:
            f.write("*Vertices " + str(numVertices) + "\n")
            logging.info('Writing to Pajek file: ' + fileName)
            logging.info('Writing vertices')

            for i in graph.getAllVertexIds():
                # NOTE(review): the vertex id itself is used as the progress
                # counter - presumably ids are 0..numVertices-1; confirm.
                Util.printIteration(i, self.printStep, graph.getNumVertices())
                self.vertexIdDict[i] = pajekIndex
                vertexSize = self.getVertexSize(i, graph)
                vertexColour = self.getVertexColour(i, graph)

                vertexString = str(pajekIndex) + ' "' + str(pajekIndex) + '" '
                vertexString = vertexString + "0.0 0.0 0.0 "
                vertexString = vertexString + "x_fact " + str(vertexSize) + " "
                vertexString = vertexString + "y_fact " + str(vertexSize) + " "
                vertexString = vertexString + "ic " + vertexColour + " "
                vertexString = vertexString + "bc " + vertexColour + " \n"
                f.write(vertexString)

                pajekIndex += 1

            logging.info('Writing edges')
            if graph.isUndirected():
                f.write("*Edges\n")
                f.write(self.__getEdgeString(graph))
            else:
                f.write("*Arcs\n")
                f.write(self.__getArcString(graph))

        logging.info("Finished, wrote " + str(numVertices) + " vertices & " + str(graph.getNumEdges()) + " edges.")
    def predictEdges(self, vertexIndices):
        r"""
        This makes a prediction for a series of edges using the Jaccard index.
        Returns a matrix whose rows are a ranked list of vertices of length windowSize.

        The score of a pair (x, y) is |n(x) \cap n(y)| / |n(x) \cup n(y)|. The
        union size is taken as the number of non-zero entries in the sum of the
        two rows of the weight matrix and the intersection size as the number of
        non-zero entries in their element-wise product. A pair whose union is
        empty scores 0.

        :param vertexIndices: An array of the vertex indices to predict edges for.

        :return: A pair (P, S) with one row per entry of vertexIndices, filled
            from the two values returned by self.indicesFromScores.
        """
        logging.info("Running predictEdges in " + str(self.__class__.__name__))
        printStep = 50

        numQueries = vertexIndices.shape[0]
        numVertices = self.graph.getNumVertices()

        P = numpy.zeros((numQueries, self.windowSize))
        S = numpy.zeros((numQueries, self.windowSize))
        W = self.graph.getWeightMatrix()

        for i in range(numQueries):
            Util.printIteration(i, printStep, numQueries)
            scores = numpy.zeros(numVertices)
            # The query vertex's row does not change within the inner loop.
            queryRow = W[vertexIndices[i], :]

            for j in range(0, numVertices):
                otherRow = W[j, :]
                unionSize = numpy.nonzero(queryRow + otherRow)[0].shape[0]

                # Leave the score at 0 when neither vertex has any neighbours.
                if unionSize != 0:
                    scores[j] = numpy.nonzero(queryRow * otherRow)[0].shape[0]/float(unionSize)

            P[i, :], S[i, :] = self.indicesFromScores(vertexIndices[i], scores)

        return P, S
Example #33
0
    def MC2(lists, itemList, alpha=None, verbose=False):
        """
        Perform weighted rank aggregation using MC2 as given in Rank Aggregation Methods
        for the Web, Dwork et al. The weighting vector is given by alpha.

        :param lists: A list of lists. Each sublist is an ordered set of a subset of the items from itemList

        :param itemList: A list of all possible items

        :param alpha: One weight per list; defaults to uniform weights 1/len(lists)

        :param verbose: If true, also return the list of per-list transition matrices

        :return: (outputList, scores), or (outputList, scores, PList) when verbose is true
        """

        n = len(itemList)
        ell = len(lists)

        # Use an identity test: "alpha == None" is evaluated element-wise when
        # alpha is a numpy array, which makes the truth test raise.
        if alpha is None:
            alpha = numpy.ones(ell)/ell

        P = numpy.zeros((n, n))
        PList = []

        logging.debug("Computing permutation matrices")
        for j, lst in enumerate(lists):
            Util.printIteration(j, 1, ell)
            Pj = RankAggregator.generateTransitionMatrix(lst, itemList)

            # Accumulate the weighted sum of the per-list transition matrices.
            P = P + alpha[j] * Pj
            PList.append(Pj)

        P /= ell
        logging.debug("Done")

        outputList, scores = RankAggregator.computeOutputList(P, itemList)

        if verbose:
            return outputList, scores, PList
        else:
            return outputList, scores
Example #34
0
 def matchExperts(self):
     """
     Find authors in the cleaned XML file that closely match known experts and
     write them, sorted and one per line, to self.expertMatchesFilename.

     Nothing is written if that file already exists. Each line of
     self.xmlCleanFilename is searched for an <author> element; the author is
     kept when difflib finds a close match among the remaining experts, and
     the matched expert is then removed so that it is not matched again. The
     scan stops early once every expert has been matched.
     """
     expertsSet = self.loadExperts(self.expertsFileName)

     if os.path.exists(self.expertMatchesFilename):
         logging.debug("File already generated: " + self.expertMatchesFilename)
         return

     expertMatches = set([])

     # Context managers close both files even if matching or writing raises.
     with open(self.xmlCleanFilename) as inFile:
         for i, line in enumerate(inFile):
             Util.printIteration(i, self.stepSize, self.numLines)
             if i % self.stepSize == 0:
                 logging.debug(expertMatches)

             author = re.findall("<author>(.*)</author>", line)
             if len(author) != 0:
                 possibleMatches = difflib.get_close_matches(author[0], expertsSet, cutoff=self.matchCutoff)
                 if len(possibleMatches) != 0:
                     expertMatches.add(author[0])
                     # Remove the best match so each expert is matched at most once.
                     expertsSet.remove(possibleMatches[0])

                     if len(expertsSet) == 0:
                         logging.debug("Found all experts, breaking")
                         break

     with open(self.expertMatchesFilename, "w") as expertMatchesFile:
         for expert in sorted(expertMatches):
             expertMatchesFile.write(expert + "\n")

     logging.debug("All done")
def plotMaxTreesStats():
    """
    Track the first two trees returned by sGraph.findTrees() over the sequence
    of subgraphs in subgraphIndicesList, save their sizes and depths, and plot
    the orientation counts of their vertices.

    For each subgraph, the part of each tree present in it is found by a depth
    first search from the tree's root (the vertex with zero in-degree) when
    that root is in the subgraph; otherwise an empty subgraph is used. Tree
    depths and sizes are saved to resultsDir + "treeSizesDepths.npz" and the
    figure to figureDir + "MaxTreeOrientGender.eps". The module-level plotInd
    is incremented.
    """
    global plotInd

    biSums1 = []
    heteroSums1 = []
    biSums2 = []
    heteroSums2 = []

    treeDepth1 = []
    treeSize1 = []
    treeDepth2 = []
    treeSize2 = []

    logging.info("Finding trees")
    trees = sGraph.findTrees()

    maxTree = sGraph.subgraph(trees[0])
    secondTree = sGraph.subgraph(trees[1])

    # The root of each tree is its vertex with no incoming edges.
    maxRootIndex = trees[0][numpy.nonzero(sGraph.inDegreeSequence()[trees[0]] == 0)[0]]
    secondRootIndex = trees[1][numpy.nonzero(sGraph.inDegreeSequence()[trees[1]] == 0)[0]]

    for j in range(len(subgraphIndicesList)):
        Util.printIteration(j, 1, len(subgraphIndicesList))
        subgraphIndices = numpy.array(subgraphIndicesList[j])

        # Position of each root inside the current subgraph, if present.
        currentMaxRootIndex = numpy.nonzero(subgraphIndices == maxRootIndex)[0]
        currentSecondRootIndex = numpy.nonzero(subgraphIndices == secondRootIndex)[0]
        subgraph = sGraph.subgraph(subgraphIndices)

        if currentMaxRootIndex.shape[0] == 1:
            maxTree = subgraph.subgraph(subgraph.depthFirstSearch(currentMaxRootIndex[0]))
        else:
            maxTree = subgraph.subgraph(numpy.array([]))

        if currentSecondRootIndex.shape[0] == 1:
            secondTree = subgraph.subgraph(subgraph.depthFirstSearch(currentSecondRootIndex[0]))
        else:
            secondTree = subgraph.subgraph(numpy.array([]))

        subgraphVertexArray = maxTree.getVertexList().getVertices()
        subgraphVertexArray2 = secondTree.getVertexList().getVertices()
        # Count vertices with orientation value 0 and 1 in each tree
        heteroSums1.append(numpy.sum(subgraphVertexArray[:, orientationIndex]==0))
        biSums1.append(numpy.sum(subgraphVertexArray[:, orientationIndex]==1))

        heteroSums2.append(numpy.sum(subgraphVertexArray2[:, orientationIndex]==0))
        biSums2.append(numpy.sum(subgraphVertexArray2[:, orientationIndex]==1))

        treeDepth1.append(GraphUtils.treeDepth(maxTree))
        treeSize1.append(maxTree.getNumVertices())
        treeDepth2.append(GraphUtils.treeDepth(secondTree))
        treeSize2.append(secondTree.getNumVertices())

    resultsFilename = resultsDir + "treeSizesDepths.npz"
    # numpy.savez writes binary data, so the file must be opened in binary
    # mode; the context manager also makes sure it is flushed and closed.
    with open(resultsFilename, 'wb') as resultsFile:
        numpy.savez(resultsFile, treeDepth1, treeSize1, treeDepth2, treeSize2)

    plt.figure(plotInd)
    plt.plot(absDayList, heteroSums1, plotStyles3[0], absDayList, biSums1, plotStyles3[1], absDayList, heteroSums2, plotStyles3[2], absDayList, biSums2, plotStyles3[3])
    plt.xticks(locs, labels)
    plt.xlabel("Year")
    plt.ylabel("Detections")
    plt.legend(("Max tree heterosexual", "Max tree MSM", "2nd tree heterosexual", "2nd tree MSM"), loc="upper left")
    plt.savefig(figureDir + "MaxTreeOrientGender.eps")
    plotInd += 1
def plotTreeStats():
    logging.info("Computing tree stats")
    resultsFileName = resultsDir + "InfectGrowthTreeStats.pkl"

    if saveResults:
        statsDictList = []

        for j in range(len(subgraphIndicesList2)):
            Util.printIteration(j, 1, len(subgraphIndicesList2))
            subgraphIndices = subgraphIndicesList2[j]
            subgraph = sGraph.subgraph(subgraphIndices)
            logging.info("Finding trees")
            trees = subgraph.findTrees()
            logging.info("Computing tree statistics")
            statsDict = {}

            locationEntropy = []
            orientEntropy = []
            detectionRanges = []

            for i in range(len(trees)):
                if len(trees[i]) > 1:
                    treeGraph = subgraph.subgraph(trees[i])
                    vertexArray = treeGraph.getVertexList().getVertices(list(range(treeGraph.getNumVertices())))
                    
                    locationEntropy.append(Util.entropy(vertexArray[:, locationIndex]))
                    orientEntropy.append(Util.entropy(vertexArray[:, orientationIndex]))
                    
                    detections = vertexArray[:, detectionIndex]
                    detectionRanges.append(numpy.max(detections) - numpy.min(detections))

            statsDict["locationEnt"] = numpy.array(locationEntropy)
            statsDict["orientEnt"] = numpy.array(orientEntropy)
            statsDict["detectRanges"] = numpy.array(detectionRanges)
            statsDictList.append(statsDict)

        Util.savePickle(statsDictList, resultsFileName, True)
    else:
        statsDictList = Util.loadPickle(resultsFileName)
        
        locBins = numpy.arange(0, 2.4, 0.2)
        detectBins = numpy.arange(0, 6500, 500)
        locationEntDists = []
        orientEntDists = []
        detectionDists = [] 

        for j in range(0, len(dayList2)):
            dateStr = (str(DateUtils.getDateStrFromDay(dayList2[j], startYear)))
            logging.info(dateStr)
            statsDict = statsDictList[j]
            plotInd2 = plotInd

            locationEntDists.append(statsDict["locationEnt"])
            orientEntDists.append(statsDict["orientEnt"])
            detectionDists.append(statsDict["detectRanges"])

        #for j in range(len(orientEntDists)):
        #    print(numpy.sum(numpy.histogram(orientEntDists[j])[0]))
        #    print(numpy.histogram(orientEntDists[j])[0]/float(orientEntDists[j].shape[0]))

        dateStrs = [DateUtils.getDateStrFromDay(dayList2[i], startYear) for i in range(1, len(dayList2))]

        plt.figure(plotInd2)
        histOut = plt.hist(locationEntDists, locBins, normed=True)
        plt.xlabel("Location Entropy")
        plt.ylabel("Probability Density")
        plt.savefig(figureDir + "LocationEnt" +  ".eps")
        #plt.legend()
        plotInd2 += 1

        plt.figure(plotInd2)
        histOut = plt.hist(orientEntDists, normed=True)
        plt.xlabel("Orientation Entropy")
        plt.ylabel("Probability Density")
        plt.savefig(figureDir + "OrientEnt" +  ".eps")
        #plt.legend()
        plotInd2 += 1

        plt.figure(plotInd2)
        histOut = plt.hist(detectionDists, detectBins, normed=True)
        plt.xlabel("Detection Range (days)")
        plt.ylabel("Probability Density")
        plt.savefig(figureDir + "DetectionRanges" +  ".eps")
        #plt.legend()
        plotInd2 += 1