예제 #1
0
    def readAuthorsAndDocuments(self, useAbstract=True):
        """
        Parse the data file into per-article authors, documents and citation counts.

        The file named by ``self.dataFilename`` is read line by line. Fields are
        recognised by their prefix: ``#*`` (title), ``#@`` (comma separated
        authors), ``#!`` (abstract) and ``#citation`` (citation count). A blank
        line ends the current article and resets the per-article state. An
        article that is still pending when the file ends (no trailing blank
        line) is emitted as well instead of being dropped.

        :param useAbstract: if True a document is the title, a space and the abstract; otherwise it is the title only.
        :type useAbstract: :class:`bool`

        :returns: The tuple (authorList, documentList, citationList). Each list has one entry per article: the set of author names (an empty list if the article had no author line), the document string and the integer citation count.
        """
        logging.debug("About to read file " + self.dataFilename)
        authorList = []
        citationList = []
        documentList = []

        def appendArticle(title, abstract, authors, citationNo):
            #Store one finished article in the three output lists
            if useAbstract:
                document = title + " " + abstract
            else:
                document = title
            documentList.append(document)
            authorList.append(authors)
            citationList.append(citationNo)

        #Raw strings so that the escaped * reaches the regex engine unchanged
        titleRe = re.compile(r"#\*(.*)")
        authorsRe = re.compile(r"#@(.*)")
        abstractRe = re.compile(r"#!(.*)")
        citationRe = re.compile(r"#citation(.*)")

        lastAbstract = ""
        lastTitle = ""
        lastAuthors = []
        lastCitationNo = 0

        #The with-block closes the file even if a malformed line raises
        with open(self.dataFilename) as inFile:
            for i, line in enumerate(inFile):
                Util.printIteration(i, self.stepSize, self.numLines)

                if line == "\n":
                    #A blank line terminates the current article
                    appendArticle(lastTitle, lastAbstract, lastAuthors, lastCitationNo)
                    lastAbstract = ""
                    lastTitle = ""
                    lastAuthors = []
                    lastCitationNo = 0
                    continue

                #Each field is tested independently, as a line is not assumed to hold only one
                title = titleRe.search(line)
                if title is not None and len(title.group(1)) != 0:
                    lastTitle = title.group(1)

                abstract = abstractRe.search(line)
                if abstract is not None and len(abstract.group(1)) != 0:
                    lastAbstract = abstract.group(1)

                citationNo = citationRe.search(line)
                if citationNo is not None and len(citationNo.group(1)) != 0:
                    lastCitationNo = int(citationNo.group(1))

                authors = authorsRe.search(line)
                if authors is not None:
                    #Strip whitespace around names and ignore empty entries
                    names = set(x.strip() for x in authors.group(1).split(","))
                    names.discard("")
                    lastAuthors = names

        #Emit a trailing article that was not followed by a blank line
        if lastTitle or lastAbstract or lastAuthors or lastCitationNo != 0:
            appendArticle(lastTitle, lastAbstract, lastAuthors, lastCitationNo)

        logging.debug("Finished reading " + str(len(documentList)) + " articles")

        return authorList, documentList, citationList
    def setVertices(self, vertices):
        """
        Set the vertices to the given list of vertices.

        This is an abstract placeholder: the body only calls ``Util.abstract()``.

        :param vertices: a set of vertices of the same shape as this object.
        """
        #NOTE(review): Util.abstract() presumably signals "not implemented" - confirm
        Util.abstract()
    def getAllEdges(self):
        """
        Return an array of edges with each row representing an edge.

        This is an abstract placeholder: the body only calls ``Util.abstract()``.

        :returns:  A numpy array of all edges in this graph. 
        """
        #NOTE(review): Util.abstract() presumably signals "not implemented" - confirm
        Util.abstract()
def saveStats(args):
    """
    Run one simulation and pickle its statistics unless the results already exist.

    The results file is ``outputDir + "SimStats" + str(i) + ".pkl"``. If that
    file can be opened nothing is done. Otherwise the model is simulated with
    ``thetaArray[i]``, statistics are generated and the tuple (times,
    vertexArray, removedGraphStats, dists, graphDists, labelDists) is saved
    with ``Util.savePickle``.

    :param args: the tuple (i, theta, startDate, endDate, recordStep).
    """
    #NOTE(review): theta is unpacked but the simulation uses the module-level
    #thetaArray[i] instead - confirm that this is intended
    i, theta, startDate, endDate, recordStep = args

    resultsFileName = outputDir + "SimStats" + str(i) + ".pkl"

    try:
        with open(resultsFileName):
            #Results were already computed for this index
            return
    except IOError:
        pass

    #Match on every vertex feature except the four excluded below. The builtin
    #bool replaces the numpy.bool alias, which newer NumPy releases removed
    featureInds = numpy.ones(targetGraph.vlist.getNumFeatures(), bool)
    featureInds[HIVVertices.dobIndex] = False
    featureInds[HIVVertices.infectionTimeIndex] = False
    featureInds[HIVVertices.hiddenDegreeIndex] = False
    featureInds[HIVVertices.stateIndex] = False
    #Convert the boolean mask into an array of selected feature indices
    featureInds = numpy.arange(featureInds.shape[0])[featureInds]

    matcher = GraphMatch("PATH", alpha=0.5, featureInds=featureInds, useWeightM=False)
    graphMetrics = HIVGraphMetrics2(targetGraph, 1.0, matcher, float(endDate))

    times, infectedIndices, removedIndices, graph = HIVModelUtils.simulate(thetaArray[i], startDate, endDate, recordStep, M, graphMetrics)
    times, vertexArray, removedGraphStats = HIVModelUtils.generateStatistics(graph, startDate, endDate, recordStep)

    stats = times, vertexArray, removedGraphStats, graphMetrics.dists, graphMetrics.graphDists, graphMetrics.labelDists

    Util.savePickle(stats, resultsFileName)
예제 #5
0
    def predictEdges(self, vertexIndices):
        """
        Rank candidate vertices for each of the given vertices.

        The score between a vertex x and a candidate y is the sum, over every
        common neighbour z of x and y (a vertex with non-zero weight to both),
        of 1/log(d(z)), where d(z) is the number of non-zero entries in row z
        of the weight matrix. Neighbours for which log(d(z)) is zero contribute
        nothing.

        :param vertexIndices: an array of the vertex indices to predict edges for.

        :returns: The pair (P, S) of arrays with one row per entry of vertexIndices and self.windowSize columns, filled from self.indicesFromScores (presumably ranked vertices and their scores - confirm against that method).
        """

        Parameter.checkInt(self.windowSize, 1, self.graph.getNumVertices())
        logging.info("Running predictEdges in " + str(self.__class__.__name__))

        numVertices = self.graph.getNumVertices()
        P = numpy.zeros((vertexIndices.shape[0], self.windowSize))
        S = numpy.zeros((vertexIndices.shape[0], self.windowSize))
        W = self.graph.getWeightMatrix()

        #Contribution 1/log(d(k)) of each neighbour k, computed on first use
        #only so that the degree of a vertex is not recounted for every pair
        neighbourWeights = {}

        for i in range(vertexIndices.shape[0]):
            Util.printIteration(i, self.printStep, vertexIndices.shape[0])
            scores = numpy.zeros(numVertices)
            sourceRow = W[vertexIndices[i], :]

            for j in range(0, numVertices):
                commonNeighbours = numpy.nonzero(sourceRow * W[j, :])[0]

                for k in commonNeighbours:
                    if k not in neighbourWeights:
                        q = numpy.log(numpy.nonzero(W[k, :])[0].shape[0])
                        #log(1) == 0 would divide by zero, so such neighbours add nothing
                        neighbourWeights[k] = 1/q if q != 0 else 0.0
                    scores[j] = scores[j] + neighbourWeights[k]

            P[i, :], S[i, :] = self.indicesFromScores(vertexIndices[i], scores)

        return P, S
예제 #6
0
    def complement(self):
        """
        Returns a graph with identical vertices (same reference) to the current one, but with the
        complement of the set of edges. Edges that do not exist have weight 1.

        This is an abstract placeholder: the body only calls ``Util.abstract()``.
        """

        #NOTE(review): Util.abstract() presumably signals "not implemented" - confirm
        Util.abstract()
    def __updateEigenSystem(self, lmbda, Q, deltaW, W):
        """
        Apply the weight changes in deltaW to the eigensystem one edge at a time.

        Every non-zero entry (i, j) of deltaW with i < j is passed to
        self.incrementEigenSystem together with the current W, after which
        W[i, j] and W[j, i] are both increased by deltaW[i, j]. Entries with
        i >= j are skipped. W is modified in place.

        :param lmbda: the current eigenvalues.

        :param Q: the current eigenvectors.

        :param deltaW: the matrix of weight changes relative to W.

        :param W: the current weight matrix.

        :returns: The updated pair (lmbda, Q).
        """
        rowInds, colInds = deltaW.nonzero()
        numChanges = rowInds.shape[0]

        for s, (i, j) in enumerate(zip(rowInds, colInds)):
            Util.printIteration(s, 10, numChanges)

            #Handle each symmetric pair once, via its entry with i < j
            if i < j:
                assert deltaW[i, j] != 0

                #W must be updated after each increment so that the next one sees it
                lmbda, Q = self.incrementEigenSystem(lmbda, Q, W, i, j, deltaW[i, j])
                W[i, j] += deltaW[i, j]
                W[j, i] += deltaW[i, j]

        return lmbda, Q
    def getAllEdges(self):
        """
        Return an array of edges with each row representing an edge.

        This is an abstract placeholder: the body only calls ``Util.abstract()``.

        :returns:  A numpy array of all edges in this graph. 
        """
        #NOTE(review): Util.abstract() presumably signals "not implemented" - confirm
        Util.abstract()
    def getVertices(self, vertexIndices):
        """
        Returns a list of vertices specified by vertexIndices.

        This is an abstract placeholder: the body only calls ``Util.abstract()``.

        :param vertexIndices: a list of vertex indices.
        """
        #NOTE(review): Util.abstract() presumably signals "not implemented" - confirm
        Util.abstract()
    def setVertices(self, vertices):
        """
        Set the vertices to the given list of vertices.

        This is an abstract placeholder: the body only calls ``Util.abstract()``.

        :param vertices: a set of vertices of the same shape as this object.
        """
        #NOTE(review): Util.abstract() presumably signals "not implemented" - confirm
        Util.abstract()
예제 #11
0
    def saveResult(self, X, Y, learner, fileName):
        """
        Compute and save one cross validation result unless it exists or is locked.

        A lock file (fileName with its extension replaced by "lock") is created
        before the computation and removed after the result is saved. If either
        the result file or the lock file is already present nothing is computed.
        Any error is logged and re-raised; the lock file is then left in place.

        :param X: the examples passed to learner.evaluateCvOuter.

        :param Y: the labels passed to learner.evaluateCvOuter.

        :param learner: an object providing evaluateCvOuter(X, Y, folds).

        :param fileName: the name of the results file.
        """
        lockFileName = fileName.rpartition(".")[0] + ".lock"
        gc.collect()

        if os.path.isfile(fileName) or os.path.isfile(lockFileName):
            logging.debug("File exists, or is locked: " + fileName)
            return

        try:
            #An empty file is enough to mark the result as being computed
            with open(lockFileName, 'w'):
                pass
            logging.debug("Created lock file " + lockFileName)

            logging.debug("Computing file " + fileName)
            logging.debug(learner)
            outerResults = learner.evaluateCvOuter(X, Y, self.folds)
            (bestParams, allMetrics, bestMetaDicts) = outerResults
            cvResults = {
                "bestParams": bestParams,
                "allMetrics": allMetrics,
                "metaDicts": bestMetaDicts,
            }
            Util.savePickle(cvResults, fileName)

            os.remove(lockFileName)
            logging.debug("Deleted lock file " + lockFileName)
        except:
            logging.debug("Caught an error in the code ... skipping")
            raise
예제 #12
0
 def cleanXML(self):
     """
     Write a cleaned copy of the original XML file unless it already exists.

     Each line of self.xmlFileName is HTML-unescaped, every "&" is then
     replaced by "&amp;", and any <title> or <ee> element whose text contains
     "<" or ">" is replaced by a default element. The result is written to
     self.xmlCleanFilename.
     """
     if os.path.exists(self.xmlCleanFilename):
         logging.debug("File already generated: " + self.xmlCleanFilename)
         return

     logging.debug("Cleaning XML")
     h = HTMLParser.HTMLParser()

     #Compiled once, with raw strings so the patterns contain no invalid escapes
     titleRe = re.compile(r"<title>.*[<>].*</title>")
     eeRe = re.compile(r"<ee>.*[<>].*</ee>")

     #The with-block closes both files even when a line fails to convert
     with open(self.xmlFileName) as inFile, open(self.xmlCleanFilename, "w") as outFile:
         for i, line in enumerate(inFile):
             Util.printIteration(i, self.stepSize, self.numLines)
             outLine = h.unescape(line).replace("&", "&amp;")
             outLine = titleRe.sub("<title>Default Title</title>", outLine)
             outLine = eeRe.sub("<ee>Default text</ee>", outLine)
             outFile.write(outLine)

     logging.debug("All done")
    def sequenceVectorStats(self,
                            graph,
                            subgraphIndices,
                            treeStats=False,
                            eigenStats=True):
        """
        Compute vector statistics for a sequence of subgraphs of a graph.

        :param graph: the graph, an AbstractMatrixGraph, the subgraphs are taken from.

        :param subgraphIndices: a list in which each element is a list of vertex indices defining one subgraph.

        :param treeStats: passed on to self.vectorStatistics.
        :type treeStats: :class:`bool`

        :param eigenStats: passed on to self.vectorStatistics.

        :returns: A list with the result of self.vectorStatistics for each subgraph, in order.
        """
        Parameter.checkClass(graph, AbstractMatrixGraph)
        for inds in subgraphIndices:
            Parameter.checkList(inds, Parameter.checkInt,
                                [0, graph.getNumVertices()])
        Parameter.checkBoolean(treeStats)

        total = len(subgraphIndices)
        results = []

        for position, inds in enumerate(subgraphIndices):
            Util.printIteration(position, self.vectorPrintStep, total)
            current = graph.subgraph(inds)
            stats = self.vectorStatistics(current, treeStats, eigenStats)
            results.append(stats)

        return results
예제 #14
0
    def simulateModel(theta):
        """
        Simulate the epidemic model with the given parameters and return its objective.

        The simulation starts from the subgraph of targetGraph given by
        targetGraph.removedIndsAt(startDate), extended to M vertices, and runs
        from startDate to endDate while graphMetrics compares it to targetGraph.

        :param theta: the model parameters passed to model.setParams.

        :returns: The value of model.objective() after the simulation.
        """
        logging.debug("theta=" + str(theta))

        #We start with the observed graph at the start date
        graph = targetGraph.subgraph(targetGraph.removedIndsAt(startDate))
        graph.addVertices(M-graph.size)

        p = Util.powerLawProbs(alpha, zeroVal)
        hiddenDegSeq = Util.randomChoice(p, graph.getNumVertices())

        #Match on every vertex feature except the four excluded below. The builtin
        #bool replaces the numpy.bool alias, which newer NumPy releases removed
        featureInds = numpy.ones(graph.vlist.getNumFeatures(), bool)
        featureInds[HIVVertices.dobIndex] = False
        featureInds[HIVVertices.infectionTimeIndex] = False
        featureInds[HIVVertices.hiddenDegreeIndex] = False
        featureInds[HIVVertices.stateIndex] = False
        #Convert the boolean mask into an array of selected feature indices
        featureInds = numpy.arange(featureInds.shape[0])[featureInds]
        matcher = GraphMatch(matchAlg, alpha=matchAlpha, featureInds=featureInds, useWeightM=False)
        graphMetrics = HIVGraphMetrics2(targetGraph, breakSize, matcher, float(endDate))

        recordStep = (endDate-startDate)/float(numRecordSteps)
        rates = HIVRates(graph, hiddenDegSeq)
        model = HIVEpidemicModel(graph, rates, T=float(endDate), T0=float(startDate), metrics=graphMetrics)
        model.setRecordStep(recordStep)
        model.setParams(theta)

        model.simulate()

        objective = model.objective()
        return objective
    def sequenceScalarStats(self,
                            graph,
                            subgraphIndices,
                            slowStats=True,
                            treeStats=False):
        """
        Compute scalar statistics for a sequence of subgraphs of a graph.

        :param graph: the graph, an AbstractMatrixGraph, the subgraphs are taken from.

        :param subgraphIndices: a list in which each element is a list of vertex indices defining one subgraph.

        :param slowStats: passed on to self.scalarStatistics.
        :type slowStats: :class:`bool`

        :param treeStats: passed on to self.scalarStatistics.
        :type treeStats: :class:`bool`

        :returns: An array with one row per subgraph and self.numStats columns holding the result of self.scalarStatistics.
        """
        Parameter.checkClass(graph, AbstractMatrixGraph)
        for inds in subgraphIndices:
            Parameter.checkList(inds, Parameter.checkInt,
                                [0, graph.getNumVertices()])
        Parameter.checkBoolean(slowStats)
        Parameter.checkBoolean(treeStats)

        total = len(subgraphIndices)
        statsMatrix = numpy.zeros((total, self.numStats))

        for row, inds in enumerate(subgraphIndices):
            Util.printIteration(row, self.printStep, total)
            current = graph.subgraph(inds)
            rowStats = self.scalarStatistics(current, slowStats, treeStats)
            statsMatrix[row, :] = rowStats

        return statsMatrix
    def getVertices(self, vertexIndices):
        """
        Returns a list of vertices specified by vertexIndices.

        This is an abstract placeholder: the body only calls ``Util.abstract()``.

        :param vertexIndices: a list of vertex indices.
        """
        #NOTE(review): Util.abstract() presumably signals "not implemented" - confirm
        Util.abstract()
예제 #17
0
    def getWeightMatrix(self):
        """
        Returns a numpy array of the weight matrix of this graph.

        This is an abstract placeholder: the body only calls ``Util.abstract()``.

        :returns:  The weight matrix of this graph. 
        """
        #NOTE(review): Util.abstract() presumably signals "not implemented" - confirm
        Util.abstract()
예제 #18
0
    def processRatings(self):
        """
        Convert the per-movie rating files into arrays and save them for faster access.

        Nothing is done when both self.ratingFileName and self.custDictFileName
        exist. Otherwise the files "mv_<7 digit id>.txt" for the movie ids
        self.startMovieID to self.endMovieID (inclusive) are read. The first
        line of each file is skipped and every other line is parsed as
        "customerId,rating,YYYY-MM-DD". Customer ids are mapped to consecutive
        indices in order of first appearance. The movie indices (id - 1),
        customer indices, ratings and dates (seconds from time.mktime) are
        saved with numpy.savez and the customer id dictionary is pickled.
        """
        if os.path.exists(self.ratingFileName) and os.path.exists(self.custDictFileName):
            logging.debug("Ratings file " + str(self.ratingFileName) + " already processed")
            return

        dataDir = PathDefaults.getDataDir() + "netflix/training_set/"

        logging.debug("Processing ratings given in " + dataDir)

        #Maps a customer id to its index; membership is tested on the dict directly
        custIdDict = {}

        #Compact typed arrays keep the memory use down while accumulating
        movieIds = array.array("I")
        custIds = array.array("I")
        ratings = array.array("B")
        dates = array.array("L")

        for i in range(self.startMovieID, self.endMovieID+1):
            Util.printIteration(i-1, 1, self.endMovieID-1)

            #The with-block closes each movie file so handles do not pile up
            with open(dataDir + "mv_" + str(i).zfill(7) + ".txt") as ratingsFile:
                #Skip the first line, presumably the "<movieId>:" header - confirm
                ratingsFile.readline()

                for line in ratingsFile:
                    vals = line.split(",")

                    custId = int(vals[0])
                    #New customers get the next free index, i.e. the current dict size
                    custInd = custIdDict.setdefault(custId, len(custIdDict))

                    rating = int(vals[1])
                    t = datetime.strptime(vals[2].strip(), "%Y-%m-%d")

                    movieIds.append(i-1)
                    custIds.append(custInd)
                    ratings.append(rating)
                    dates.append(int(time.mktime(t.timetuple())))

        movieIds = numpy.array(movieIds, numpy.uint32)
        custIds = numpy.array(custIds, numpy.uint32)
        ratings = numpy.array(ratings, numpy.uint8)
        dates = numpy.array(dates, numpy.uint32)

        assert ratings.shape[0] == self.numRatings

        numpy.savez(self.ratingFileName, movieIds, custIds, ratings, dates)
        logging.debug("Saved ratings file as " + self.ratingFileName)

        with open(self.custDictFileName, 'wb') as custDictFile:
            pickle.dump(custIdDict, custDictFile)
        logging.debug("Saved custIdDict as " + self.custDictFileName)
    def load(filename):
        """
        Load this object from filename.

        This is an abstract placeholder: the body only calls ``Util.abstract()``.

        :param filename: The name of the file to load.
        :type filename: :class:`str`
        """
        #NOTE(review): there is no self parameter, so this is presumably declared
        #as a static method outside this view - confirm
        Util.abstract()
    def getVertex(self, index):
        """
        Returns the value of a vertex.

        This is an abstract placeholder: the body only calls ``Util.abstract()``.

        :param index: the index of the vertex.
        :type index: :class:`int`
        """
        #NOTE(review): Util.abstract() presumably signals "not implemented" - confirm
        Util.abstract()
    def clearVertex(self, index):
        """
        Sets a vertex to None

        This is an abstract placeholder: the body only calls ``Util.abstract()``.

        :param index: the index of the vertex to clear.
        :type index: :class:`int`
        """
        #NOTE(review): Util.abstract() presumably signals "not implemented" - confirm
        Util.abstract()
    def save(self, filename):
        """
        Save this object to filename.nvl.

        This is an abstract placeholder: the body only calls ``Util.abstract()``.

        :param filename: The name of the file to save.
        :type filename: :class:`str`
        """
        #NOTE(review): Util.abstract() presumably signals "not implemented" - confirm
        Util.abstract()
    def getAllVertexIds(self):
        """
        Return a list of all indices of the vertices

        This is an abstract placeholder: the body only calls ``Util.abstract()``.

        :returns:  A numpy array of all the vertex indices in this graph. 
        """

        #NOTE(review): Util.abstract() presumably signals "not implemented" - confirm
        Util.abstract()
    def load(filename):
        """
        Load this object from filename.

        This is an abstract placeholder: the body only calls ``Util.abstract()``.

        :param filename: The name of the file to load.
        :type filename: :class:`str`
        """
        #NOTE(review): there is no self parameter, so this is presumably declared
        #as a static method outside this view - confirm
        Util.abstract()
    def removeEdge(self, vertexIndex1, vertexIndex2, edgeTypeIndex):
        """
        Remove an edge between two vertices.

        This is an abstract placeholder: the body only calls ``Util.abstract()``.

        @param vertexIndex1: The index of the first vertex.
        @param vertexIndex2: The index of the second vertex.
        @param edgeTypeIndex: presumably selects which type of edge to remove - confirm against implementations.
        """
        #NOTE(review): Util.abstract() presumably signals "not implemented" - confirm
        Util.abstract()
    def getAllVertexIds(self):
        """
        Return a list of all indices of the vertices

        This is an abstract placeholder: the body only calls ``Util.abstract()``.

        :returns:  A numpy array of all the vertex indices in this graph. 
        """

        #NOTE(review): Util.abstract() presumably signals "not implemented" - confirm
        Util.abstract()
예제 #27
0
    def predict(self, X):
        """
        Make a prediction for a set of examples given as the rows of the matrix X.

        This is an abstract placeholder: the body only calls ``Util.abstract()``.

        :param X: A matrix with examples as rows
        :type X: :class:`ndarray`
        """
        #NOTE(review): Util.abstract() presumably signals "not implemented" - confirm
        Util.abstract()
    def clearVertex(self, index):
        """
        Sets a vertex to None

        This is an abstract placeholder: the body only calls ``Util.abstract()``.

        :param index: the index of the vertex to clear.
        :type index: :class:`int`
        """
        #NOTE(review): Util.abstract() presumably signals "not implemented" - confirm
        Util.abstract()
예제 #29
0
    def removeEdge(self, vertexIndex1, vertexIndex2, edgeTypeIndex):
        """
        Remove an edge between two vertices.

        This is an abstract placeholder: the body only calls ``Util.abstract()``.

        @param vertexIndex1: The index of the first vertex.
        @param vertexIndex2: The index of the second vertex.
        @param edgeTypeIndex: presumably selects which type of edge to remove - confirm against implementations.
        """
        #NOTE(review): Util.abstract() presumably signals "not implemented" - confirm
        Util.abstract()
    def getVertex(self, index):
        """
        Returns the value of a vertex.

        This is an abstract placeholder: the body only calls ``Util.abstract()``.

        :param index: the index of the vertex.
        :type index: :class:`int`
        """
        #NOTE(review): Util.abstract() presumably signals "not implemented" - confirm
        Util.abstract()
    def save(self, filename):
        """
        Save this object to filename.nvl.

        This is an abstract placeholder: the body only calls ``Util.abstract()``.

        :param filename: The name of the file to save.
        :type filename: :class:`str`
        """
        #NOTE(review): Util.abstract() presumably signals "not implemented" - confirm
        Util.abstract()
    def testExpandIntArray(self):
        """
        Check Util.expandIntArray on a small array and on an empty array.

        For v = [1, 3, 2, 4] the expected result repeats each index i exactly
        v[i] times. The builtin int is used as the dtype because the numpy.int
        alias was removed from newer NumPy releases.
        """
        v = numpy.array([1, 3, 2, 4], int)
        w = Util.expandIntArray(v)

        self.assertTrue((w == numpy.array([0,1,1,1,2,2,3,3,3,3], int)).all())

        #An empty input must give an empty output
        v = numpy.array([], int)
        w = Util.expandIntArray(v)
        self.assertTrue((w == numpy.array([], int)).all())
예제 #33
0
    def evaluateLearn(X, y, idx, learnModel, predict, metricMethod, progress=True):
        """
        Score a learning algorithm on each of the given training/test splits.

        For every split the model is fitted with learnModel on the training
        rows, predictions are made with predict on the test rows, and
        metricMethod(predictedY, realY) gives the score of that split.

        :param X: A matrix with examples as rows 
        :type X: :class:`ndarray`

        :param y: A vector of labels 
        :type y: :class:`ndarray`

        :param idx: A list of training/test splits, each a pair (trainIndices, testIndices)
        :type idx: :class:`list`

        :param learnModel: A function such that learnModel(X, y) finds a mapping from X to y 
        :type learnModel: :class:`function`

        :param predict: A function such that predict(X) makes predictions for X
        :type predict: :class:`function`

        :param metricMethod: A function such that metricMethod(predY, testY) returns the quality of predicted labels predY
        :type metricMethod: :class:`function`

        :param progress: If True the iteration is reported with Util.printConciseIteration
        :type progress: :class:`bool`

        :returns: An array with the metric of each split, in the order of idx.
        """
        Parameter.checkClass(X, numpy.ndarray)
        Parameter.checkArray(X, softCheck=True)
        Parameter.checkInt(X.shape[0], 1, float('inf'))
        Parameter.checkClass(y, numpy.ndarray)
        Parameter.checkArray(y, softCheck=True)

        if y.ndim != 1:
            raise ValueError("Dimention of y must be 1")

        numSplits = len(idx)
        metrics = numpy.zeros(numSplits)
        logging.debug("EvaluateLearn: Using " + str(len(idx)) + " splits on " + str(X.shape[0]) + " examples")

        for splitNo, (idxtr, idxts) in enumerate(idx):
            if progress:
                Util.printConciseIteration(splitNo, 1, len(idx))

            trainX = X[idxtr, :]
            testX = X[idxts, :]
            trainY = y[idxtr]
            testY = y[idxts]

            learnModel(trainX, trainY)
            predY = predict(testX)
            #Free the memory used while fitting before the metric is computed
            gc.collect()

            metrics[splitNo] = metricMethod(predY, testY)

        return metrics
예제 #34
0
    def eigenAdd(omega, Q, Y, k):
        """
        Perform an eigen update of the form A^*A + Y^*Y in which Y is a low-rank matrix
        and A^*A = Q Omega Q^*. We use the rank-k approximation of A:  Q_k Omega_k Q_k^*
        and then approximate [A^*A_k Y^*Y]_k.

        Eigenvalues of magnitude at most EigenUpdater.tol are discarded, together
        with their eigenvectors, before the update.

        :param omega: a 1-d array of eigenvalues.

        :param Q: the matrix whose columns are the corresponding eigenvectors.

        :param Y: the low-rank matrix.

        :param k: the number of eigenpairs to return.

        :returns: The pair (pi, V) with the k eigenvalues of largest magnitude of the updated system and the corresponding eigenvectors as the columns of V.
        """
        #logging.debug("< eigenAdd >")
        Parameter.checkInt(k, 0, omega.shape[0])
        #if not numpy.isrealobj(omega) or not numpy.isrealobj(Q):
        #    raise ValueError("Eigenvalues and eigenvectors must be real")
        if omega.ndim != 1:
            raise ValueError("omega must be 1-d array")
        if omega.shape[0] != Q.shape[1]:
            raise ValueError("Must have same number of eigenvalues and eigenvectors")

        if __debug__:
            Parameter.checkOrthogonal(Q, tol=EigenUpdater.tol, softCheck=True, arrayInfo="input Q in eigenAdd()")

        #Taking the abs of the eigenvalues is correct
        inds = numpy.flipud(numpy.argsort(numpy.abs(omega)))

        #The tolerance mask must be evaluated in sorted order: a mask in the
        #original order would pick the wrong entries of inds for unsorted omega
        omega, Q = Util.indEig(omega, Q, inds[numpy.abs(omega[inds])>EigenUpdater.tol])
        Omega = numpy.diag(omega)

        YY = Y.conj().T.dot(Y)
        QQ = Q.dot(Q.conj().T)
        #Part of Y outside the span of the columns of Q
        Ybar = Y - Y.dot(QQ)

        Pbar, sigmaBar, Qbar = numpy.linalg.svd(Ybar, full_matrices=False)
        inds = numpy.flipud(numpy.argsort(numpy.abs(sigmaBar)))
        inds = inds[numpy.abs(sigmaBar[inds])>EigenUpdater.tol]
        Pbar, sigmaBar, Qbar = Util.indSvd(Pbar, sigmaBar, Qbar, inds)
        
        SigmaBar = numpy.diag(sigmaBar)
        #Recompute Qbar from Ybar and Pbar and normalise its columns
        Qbar = Ybar.T.dot(Pbar)
        Qbar = Qbar.dot(numpy.diag(numpy.diag(Qbar.T.dot(Qbar))**-0.5))

        r = sigmaBar.shape[0]

        YQ = Y.dot(Q)
        Zeros = numpy.zeros((r, omega.shape[0]))
        D = numpy.c_[Q, Qbar]

        YYQQ = YY.dot(QQ)
        Z = D.conj().T.dot(YYQQ + YYQQ.conj().T).dot(D)
        F = numpy.c_[numpy.r_[Omega - YQ.conj().T.dot(YQ), Zeros], numpy.r_[Zeros.T, SigmaBar.conj().dot(SigmaBar)]]
        F = F + Z 

        pi, H = scipy.linalg.eigh(F)
        inds = numpy.flipud(numpy.argsort(numpy.abs(pi)))

        #Keep the k eigenpairs of largest magnitude
        H = H[:, inds[0:k]]
        pi = pi[inds[0:k]]

        V = D.dot(H)
        #logging.debug("</ eigenAdd >")
        return pi, V
예제 #35
0
    def eigpsd(X, n):
        """
        Find the eigenvalues and eigenvectors of a positive semi-definite symmetric matrix.
        The input matrix X can be a numpy array or a scipy sparse matrix. In the case that
        the selected columns are all the columns of X we convert to an ndarray and
        compute the eigen-decomposition directly.

        :param X: The matrix to find the eigenvalues of.
        :type X: :class:`ndarray`

        :param n: If n is an int (Python or numpy integer), then it is the number of columns to sample otherwise n is an array of column indices.

        :return lmbda: The set of eigenvalues 
        :return V: The matrix of eigenvectors as a ndarray

        :raises ValueError: if n is neither an integer nor an ndarray.
        """
        #isinstance also accepts numpy integers, e.g. values taken from an array or a shape
        if isinstance(n, (int, numpy.integer)):
            n = min(n, X.shape[0])
            inds = numpy.sort(numpy.random.permutation(X.shape[0])[0:n])
        elif isinstance(n, numpy.ndarray):
            inds = n 
        else: 
            raise ValueError("Invalid n value: " + str(n))
            
        invInds = numpy.setdiff1d(numpy.arange(X.shape[0]), inds)

        #When every column is selected the exact decomposition is used
        sortedInds = numpy.sort(inds)
        if sortedInds.shape[0] == X.shape[0] and (sortedInds == numpy.arange(X.shape[0])).all():
            if scipy.sparse.issparse(X):
                X = numpy.array(X.todense())
            lmbda, V = Util.safeEigh(X)
            return lmbda, V

        #A holds the selected rows and columns, B the selected rows of the remaining columns
        tmp = X[inds, :] 
        A = tmp[:, inds]
        B = tmp[:, invInds]

        if scipy.sparse.issparse(X): 
            A = numpy.array(A.todense())
            BB = numpy.array((B*B.T).todense())
        else:
            BB = B.dot(B.T)
        
        #Following line is very slow 
        #Am12 = scipy.linalg.sqrtm(numpy.linalg.pinv(A)) 
        Am12 = Util.matrixPowerh(A, -0.5)
        S = A + Am12.dot(BB).dot(Am12)
        #Symmetrise to remove rounding asymmetry
        S = (S.T + S)/2

        lmbda, U = Util.safeEigh(S)

        #Scale by lmbda**-0.5, treating eigenvalues below the tolerance as zero
        tol = 10**-10
        lmbdaN = lmbda.copy()
        lmbdaN[numpy.abs(lmbda) < tol] = 0
        lmbdaN[numpy.abs(lmbda) > tol] = lmbdaN[numpy.abs(lmbda) > tol]**-0.5
        
        V = X[:, inds].dot(Am12.dot(U)*lmbdaN)
        
        return lmbda, V
예제 #36
0
    def processProbe(self):
        """
        Go through the probe set and label the corresponding ratings in the full
        dataset as test.

        Nothing is done when self.isTrainRatingsFileName exists. Otherwise the
        customer dictionary and rating arrays are loaded, and the probe file is
        read: a line containing ":" starts a new movie (the text before the
        colon is the movie id), every other line is a customer id whose rating
        of the current movie is marked as a test rating. The boolean indicator
        array (True for training ratings) is saved with numpy.savez.
        """
        if os.path.exists(self.isTrainRatingsFileName):
            logging.debug("Train/test indicators file " + str(self.isTrainRatingsFileName) + " already processed")
            return

        #Pickles are binary, so the file is opened in binary mode and closed afterwards.
        #NOTE(review): pickle must only be used on trusted files such as this cache
        with open(self.custDictFileName, "rb") as custDictFile:
            custIdDict = pickle.load(custDictFile)
        dataArr = numpy.load(self.ratingFileName)
        movieInds, custInds, ratings, dates = dataArr["arr_0"], dataArr["arr_1"], dataArr["arr_2"], dataArr["arr_3"]
        logging.debug("Number of ratings: " + str(ratings.shape[0]))
        #Only the movie and customer indices are needed below
        del ratings, dates
        logging.debug("Training data loaded")

        #The builtin bool replaces the numpy.bool alias, which newer NumPy releases removed
        isTrainRating = numpy.ones(movieInds.shape[0], bool)
        i = 0

        #First figure out the movie boundaries: movie m occupies the
        #half-open range movieBoundaries[m]:movieBoundaries[m+1]
        movieBoundaries = numpy.nonzero(numpy.diff(movieInds) != 0)[0] + 1
        movieBoundaries = numpy.insert(movieBoundaries, 0, 0)
        movieBoundaries = numpy.append(movieBoundaries, movieInds.shape[0])

        assert movieBoundaries.shape[0] == self.numMovies+1
        assert movieBoundaries[-1] == movieInds.shape[0]

        with open(self.probeFileName) as probeFile:
            for line in probeFile:
                if ":" in line:
                    Util.printIteration(i, 10, self.numProbeMovies)
                    #Take the text before the colon, whatever the line ending is
                    movieInd = int(line.split(":")[0])-1

                    startInd = movieBoundaries[movieInd]
                    endInd = movieBoundaries[movieInd+1]
                    #All the customers that rated movie movieInd, sorted once
                    #per movie so that each customer lookup is a binary search
                    tempCustInds = custInds[startInd:endInd]
                    sortedInds = numpy.argsort(tempCustInds)
                    sortedCustInds = tempCustInds[sortedInds]

                    assert (movieInds[startInd:endInd] == movieInd).all()

                    i += 1
                else:
                    custId = int(line.strip())
                    custInd = custIdDict[custId]

                    offset = numpy.searchsorted(sortedCustInds, custInd)
                    isTrainRating[startInd + sortedInds[offset]] = 0

                    assert custInds[startInd + sortedInds[offset]] == custInd

        assert i == self.numProbeMovies
        assert numpy.logical_not(isTrainRating).sum() == self.numProbeRatings

        numpy.savez(self.isTrainRatingsFileName, isTrainRating)
        logging.debug("Saved file as " + self.isTrainRatingsFileName)
    def setVertex(self, index, value):
        """
        Set a vertex to the corresponding value.

        This is an abstract placeholder: the body only calls ``Util.abstract()``.

        :param index: the index of the vertex to assign a value.
        :type index: :class:`int`

        :param value: the value to assign to the vertex.
        """
        #NOTE(review): Util.abstract() presumably signals "not implemented" - confirm
        Util.abstract()
예제 #38
0
    def addEdge(self, vertexIndex1, vertexIndex2, edgeTypeIndex, edge):
        """
        Add an edge to the graph between two vertices.

        This is an abstract placeholder: the body only calls ``Util.abstract()``.

        @param vertexIndex1: The index of the first vertex.
        @param vertexIndex2: The index of the second vertex.
        @param edgeTypeIndex: presumably selects which type of edge to add - confirm against implementations.
        @param edge: The value to assign to the edge.
        """

        #NOTE(review): Util.abstract() presumably signals "not implemented" - confirm
        Util.abstract()
    def addEdge(self, vertexIndex1, vertexIndex2, edgeTypeIndex, edge):
        """
        Add an edge to the graph between two vertices.

        This is an abstract placeholder: the body only calls ``Util.abstract()``.

        @param vertexIndex1: The index of the first vertex.
        @param vertexIndex2: The index of the second vertex.
        @param edgeTypeIndex: presumably selects which type of edge to add - confirm against implementations.
        @param edge: The value to assign to the edge.
        """

        #NOTE(review): Util.abstract() presumably signals "not implemented" - confirm
        Util.abstract()
    def testEntropy(self):
        """
        Check Util.entropy on a balanced two-valued array and on constant arrays.

        assertEqual is used because the assertEquals alias is deprecated and
        was removed from unittest in Python 3.12.
        """
        #Two equally likely values are expected to give an entropy of 1
        v = numpy.array([0, 0, 0, 1, 1, 1])

        self.assertEqual(Util.entropy(v), 1)

        #A constant array is expected to give an entropy of 0, whatever the value
        v = numpy.array([0, 0, 0])
        self.assertEqual(Util.entropy(v), 0)

        v = numpy.array([1, 1, 1])
        self.assertEqual(Util.entropy(v), 0)
    def setVertex(self, index, value):
        """
        Set a vertex to the corresponding value.

        This is an abstract placeholder: the body only calls ``Util.abstract()``.

        :param index: the index of the vertex to assign a value.
        :type index: :class:`int`

        :param value: the value to assign to the vertex.
        """
        #NOTE(review): Util.abstract() presumably signals "not implemented" - confirm
        Util.abstract()
    def testMatrixPowerh(self):
        """
        Check Util.matrixPowerh against direct computations for the powers
        0.5, -1, 1, 2 and -2 on a random matrix of the form A.T A, then for
        the powers -1, -0.5, 0.5, 1 and 2 on a low rank matrix, where the
        pseudo-inverse takes the place of the inverse.
        """
        tol = 10**-6

        def assertClose(expected, actual):
            #Two matrices are close when the norm of their difference is below tol
            self.assertTrue(numpy.linalg.norm(expected - actual) < tol)

        A = numpy.random.rand(10, 10)
        A = A.T.dot(A)
        A2 = A.dot(A)

        lmbda, V = scipy.linalg.eig(A)

        A12 = Util.matrixPowerh(A, 0.5)
        invA = numpy.linalg.inv(A)

        assertClose(A12.dot(A12), A)
        assertClose(invA, Util.matrixPowerh(A, -1))
        assertClose(A, Util.matrixPowerh(A, 1))
        assertClose(A2, Util.matrixPowerh(A, 2))
        assertClose(invA.dot(invA), Util.matrixPowerh(A, -2))

        #Now lets test on a low rank matrix: zero all eigenvalues after the first five
        lmbda[5:] = 0
        A = V.dot(numpy.diag(lmbda)).dot(numpy.linalg.inv(V))
        A2 = A.dot(A)
        A12 = Util.matrixPowerh(A, 0.5)
        Am12 = Util.matrixPowerh(A, -0.5)
        pinvA = numpy.linalg.pinv(A)

        assertClose(pinvA, Util.matrixPowerh(A, -1))
        assertClose(pinvA, Am12.dot(Am12))
        assertClose(A12.dot(A12), A)
        assertClose(A, Util.matrixPowerh(A, 1))
        assertClose(A2, Util.matrixPowerh(A, 2))
예제 #43
0
    def distance2(self, graph1, graph2, permutation):
        """
        Compute a graph distance between two graphs given a permutation vector.

        With P the permutation matrix that has P[i, permutation[i]] = 1, the
        distance is
        (1-alpha) ||W1 - P W2 P.T||^2_F / (||W1||^2_F + ||W2||^2_F)
        + alpha ||V1 - P.T V2||^2_F / (||V1||^2_F + ||V2||^2_F),
        where a term is left unnormalised if its denominator is zero. W1, W2
        are the weight matrices if self.useWeightM is set and the adjacency
        matrices otherwise, and V1, V2 are the vertex arrays. The smaller
        matrix of each pair is extended with Util.extendArray to the shape of
        the larger one.

        :param graph1: A graph object 

        :param graph2: The second graph object to match 

        :param permutation: An array of permutation indices matching the first to second graph 
        :type permutation: `numpy.ndarray`

        :returns: The distance between the two graphs under the permutation.
        """
        if self.useWeightM:
            W1, W2 = graph1.getWeightMatrix(), graph2.getWeightMatrix()
        else:
            W1, W2 = graph1.adjacencyMatrix(), graph2.adjacencyMatrix()

        #Bring both edge matrices to a common size
        if W1.shape[0] < W2.shape[0]:
            W1 = Util.extendArray(W1, W2.shape)
        elif W2.shape[0] < W1.shape[0]:
            W2 = Util.extendArray(W2, W1.shape)

        size = W1.shape[0]
        P = numpy.zeros((size, size))
        P[numpy.arange(size), permutation] = 1
        edgeDist = numpy.linalg.norm(W1 - P.dot(W2).dot(P.T))**2

        #Now compute the vertex similarities distance
        V1 = graph1.getVertexList().getVertices()
        V2 = graph2.getVertexList().getVertices()

        if V1.shape[0] < V2.shape[0]:
            V1 = Util.extendArray(V1, V2.shape)
        elif V2.shape[0] < V1.shape[0]:
            V2 = Util.extendArray(V2, V1.shape)

        vertexDist = numpy.sum((V1 - P.T.dot(V2))**2)

        edgeNorm = (W1**2).sum() + (W2**2).sum()
        vertexNorm = (V1**2).sum() + (V2**2).sum()

        #Normalise each term unless both of its matrices are entirely zero
        if edgeNorm != 0:
            edgeDist = edgeDist/edgeNorm
        if vertexNorm != 0:
            vertexDist = vertexDist/vertexNorm

        return (1-self.alpha)*edgeDist + self.alpha*vertexDist
    def save(self, filename):
        """
        Store the vertex array self.V in a file named ``filename`` followed by
        this object's extension (self.ext). The array is written with
        Util.savePickle, passing overwrite=True.

        :param filename: The name of the file to save to, without extension.
        :type filename: :class:`str`

        :returns: The name of the saved file including extension.
        """
        outputFilename = filename + self.ext
        Util.savePickle(self.V, outputFilename, overwrite=True)
        return outputFilename
    def removeEdge(self, vertexIndex1, vertexIndex2):
        """
        Remove an edge between two vertices.

        This implementation does nothing except call Util.abstract(), so it
        looks like an interface placeholder that concrete graph classes are
        expected to override (NOTE(review): confirm what Util.abstract does).

        :param vertexIndex1: The index of the first vertex.
        :type vertexIndex1: :class:`int`

        :param vertexIndex2: The index of the second vertex.
        :type vertexIndex2: :class:`int`
        """
        # No edge is touched here; all behaviour is delegated to Util.abstract()
        Util.abstract()
    def testMode(self):
        """
        Check Util.mode on small integer arrays: a unique most frequent value,
        two cases where several values tie, and a single-element array.
        """
        # assertEqual replaces the deprecated assertEquals alias, which newer
        # unittest versions no longer provide.
        cases = [
            # 3 occurs most often
            ([1,1,1,2,2,3,3,3,3,3,5,5], 3),
            # 1 and 3 both occur three times; 1 is the expected answer
            ([1,1,1,2,2,3,3,3,5,5], 1),
            # every value occurs once; 1 is the expected answer
            ([1,2,3,4], 1),
            ([0], 0),
        ]

        for values, expectedMode in cases:
            x = numpy.array(values)
            self.assertEqual(Util.mode(x), expectedMode)
 def testCumMin(self):
     """
     Check Util.cumMin on a mixed, a strictly decreasing and an increasing
     input vector.
     """
     mixed = numpy.array([5, 6, 4, 5, 1])
     decreasing = numpy.array([5, 4, 3, 2, 1])
     increasing = numpy.array([1, 2, 3])

     # Each pair is (input, expected output). For the decreasing vector the
     # expected output is the input array itself.
     cases = [
         (mixed, numpy.array([5, 5, 4, 4, 1])),
         (decreasing, decreasing),
         (increasing, numpy.ones(3)),
     ]

     for v, expected in cases:
         u = Util.cumMin(v)
         nptst.assert_array_equal(u, expected)
    def testRank(self):
        """
        Check Util.rank on random matrices of several shapes, and on the Gram
        matrix X X^T of a tall random matrix.
        """
        # assertEqual replaces the deprecated assertEquals alias, which newer
        # unittest versions no longer provide.
        X = numpy.random.rand(10, 1)
        self.assertEqual(Util.rank(X), 1)

        X = numpy.random.rand(10, 12)
        self.assertEqual(Util.rank(X), 10)

        X = numpy.random.rand(31, 12)
        self.assertEqual(Util.rank(X), 12)

        # K = X X^T is 31 x 31 but has the same rank as X. The assertion
        # previously re-checked X and never used K.
        K = numpy.dot(X, X.T)
        self.assertEqual(Util.rank(K), 12)
예제 #49
0
    def load(cls, filename):
        """
        Load the graph object from the corresponding file. Data is loaded in a zip
        format as created using save().

        The archive ``filename + '.zip'`` is extracted into a fresh temporary
        directory, the graph is rebuilt from the extracted files, and the
        temporary directory is removed again whether or not loading succeeds.
        The working directory is changed while the files are read and is
        always restored before returning.

        :param filename: The name of the file to load, without the .zip extension.
        :type filename: :class:`str`

        :returns: A graph corresponding to the one saved in filename.
        """
        Parameter.checkClass(filename, str)
        import shutil
        import zipfile

        (path, filename) = os.path.split(filename)
        if path == "":
            path = "./"

        tempPath = tempfile.mkdtemp()
        originalPath = os.getcwd()

        try:
            # The context manager closes the archive even if extraction fails
            with zipfile.ZipFile(os.path.join(path, filename + '.zip'), 'r') as myzip:
                myzip.extractall(tempPath)

            # The loaders below are given bare file names, so read them from
            # inside the extraction directory.
            os.chdir(tempPath)

            # NOTE(review): the metadata is read with Util.loadPickle; if that
            # wraps pickle, only archives from trusted sources should be loaded.
            try:
                W = cls.loadMatrix(cls._wFilename)
                metaDict = Util.loadPickle(cls._metaFilename)
                vList = globals()[metaDict["vListType"]].load(cls._verticesFilename)
                undirected = metaDict["undirected"]

            except IOError:
                #Deal with legacy files, which are named after the archive
                W = cls.loadMatrix(filename + cls._matExt)
                vList = VertexList.load(filename)
                undirected = Util.loadPickle(filename + cls._boolExt)

            graph = cls(vList, undirected)
            graph.W = W
        finally:
            # Leave the temporary directory before deleting it, and delete it
            # on failure too so that extracted files are never left behind.
            os.chdir(originalPath)
            shutil.rmtree(tempPath, ignore_errors=True)

        return graph
    def getFeatureDistribution(self, fIndex, vIndices=None):
        """
        Returns a tuple (frequencies, items) about a particular feature given
        by fIndex. This method is deprecated.

        :param fIndex: The column of the vertex array self.V to summarise. It is validated by Parameter.checkIndex with 0 and self.getNumFeatures() as bounds.

        :param vIndices: Optional row indices restricting which vertices are included; all rows are used when this is None.

        :returns: The pair (freqs, items) produced by Util.histogram for the selected values.
        """
        Parameter.checkIndex(fIndex, 0, self.getNumFeatures())

        # Use an identity test: "== None" on a numpy index array is evaluated
        # elementwise and cannot be used as an if-condition.
        if vIndices is None:
            featureValues = self.V[:, fIndex]
        else:
            featureValues = self.V[vIndices, fIndex]

        (freqs, items) = Util.histogram(featureValues)

        return (freqs, items)
예제 #51
0
    def save(self, filename):
        """
        Save the graph object to the corresponding filename under the .zip extension. The
        adjacency matrix is stored in matrix market format and the AbstractVertexList
        decides how to store the vertex labels.

        The component files are written into a temporary directory, zipped
        there, and the archive is then moved to its destination. The temporary
        directory is removed and the working directory restored whether or not
        saving succeeds.

        :param filename: The name of the file to save, without the .zip extension.
        :type filename: :class:`str`

        :returns: The name of the saved zip file.
        """
        Parameter.checkClass(filename, str)
        import zipfile

        (path, filename) = os.path.split(filename)
        if path == "":
            path = "./"

        zipName = filename + '.zip'
        savedFilename = path + "/" + zipName
        # Resolve the destination while still in the caller's working
        # directory. After the chdir below, a relative path would point inside
        # the temporary directory and the archive would never leave it.
        destination = os.path.abspath(savedFilename)

        tempPath = tempfile.mkdtemp()

        originalPath = os.getcwd()
        try:
            # The writers are given bare file names, so work inside tempPath
            os.chdir(tempPath)

            self.saveMatrix(self.W, self._wFilename)
            vListFilename = self.vList.save(self._verticesFilename)

            metaDict = {}
            metaDict["version"] = apgl.__version__
            metaDict["undirected"] = self.undirected
            metaDict["vListType"] = self.vList.__class__.__name__
            Util.savePickle(metaDict, self._metaFilename)

            # The context manager closes the archive even if a write fails
            with zipfile.ZipFile(zipName, 'w') as myzip:
                myzip.write(self._wFilename)
                myzip.write(vListFilename)
                myzip.write(self._metaFilename)

            shutil.move(zipName, destination)
        finally:
            # Leave the temporary directory before deleting it; removing the
            # whole tree also cleans up the intermediate files on failure.
            os.chdir(originalPath)
            shutil.rmtree(tempPath, ignore_errors=True)

        return savedFilename
    def addEdge(self, vertexIndex1, vertexIndex2, edgeValue):
        """
        Add a non-zero edge between two vertices.

        This implementation does nothing except call Util.abstract(), so it
        looks like an interface placeholder that concrete graph classes are
        expected to override (NOTE(review): confirm what Util.abstract does).

        :param vertexIndex1: The index of the first vertex.
        :type vertexIndex1: :class:`int`

        :param vertexIndex2: The index of the second vertex.
        :type vertexIndex2: :class:`int`

        :param edgeValue: The value to assign to the edge.
        """

        # No edge is stored here; all behaviour is delegated to Util.abstract()
        Util.abstract()
예제 #53
0
    def generate(self, graph, requireEmpty=True):
        '''
        Create a Configuration Model graph. Note that the degree sequence(s) given
        in the constructor cannot be guaranteed. The algorithm randomly selects
        two free "spokes" and then tries to connect them. If two vertices are
        already connected the corresponding spokes are not used again. In the case
        that requireEmpty is False then a non-empty graph can be used and the given
        degree sequence(s) is(are) the difference(s) in degrees between the output graph and
        input one.

        :param graph: a graph to populate with edges
        :type graph: :class:`apgl.graph.AbstractMatrixGraph`

        :param requireEmpty: if this is set to true then we require an empty graph.
        :type requireEmpty: :class:`bool`

        :returns: The modified input graph.

        :raises ValueError: if requireEmpty is set and the graph already has edges, if the number of vertices differs from the length of the out-degree sequence, or if an in-degree sequence is set and the graph is undirected.
        '''
        Parameter.checkClass(graph, AbstractMatrixGraph)
        if requireEmpty and graph.getNumEdges() != 0:
            raise ValueError("Graph must have no edges")
        if graph.getNumVertices() != self.outDegSequence.shape[0]:
            raise ValueError(
                "Graph must have same number of vertices as degree sequence")

        # Identity tests are required: "!= None" / "== None" on a numpy degree
        # sequence compares elementwise and cannot be used as a condition.
        inDegSequence = self.getInDegSequence()
        if inDegSequence is not None and graph.isUndirected():
            raise ValueError(
                "In-degree sequence must be used in conjunction with directed graphs"
            )

        if inDegSequence is None:
            expandedInds = Util.expandIntArray(self.outDegSequence)
            numpy.random.shuffle(expandedInds)
            # Join consecutive pairs of shuffled spokes; with an odd number of
            # spokes the last one has no partner and is skipped.
            for i in range(0, len(expandedInds) - 1, 2):
                graph.addEdge(expandedInds[i], expandedInds[i + 1])
        else:
            expandedOutInds = Util.expandIntArray(self.outDegSequence)
            expandedInInds = Util.expandIntArray(self.inDegSequence)
            numpy.random.shuffle(expandedOutInds)
            numpy.random.shuffle(expandedInInds)

            # Pair out-spokes with in-spokes until the shorter list runs out
            numPairs = min(expandedOutInds.shape[0], expandedInInds.shape[0])
            for i in range(numPairs):
                graph.addEdge(expandedOutInds[i], expandedInInds[i])

        return graph
    def testRandom2Choice(self):
        """
        Check that the empirical frequencies of the outcomes 0 and 1 returned by
        Util.random2Choice match the requested probabilities to one decimal
        place, for a matrix of probabilities and for a single vector.
        """
        # assertAlmostEqual replaces the deprecated assertAlmostEquals alias,
        # which newer unittest versions no longer provide.
        n = 1000
        V = numpy.array([[0.3, 0.7], [0.5, 0.5]])

        J = Util.random2Choice(V, n)
        for row in range(V.shape[0]):
            for outcome in range(V.shape[1]):
                observed = numpy.sum(J[row, :]==outcome)/float(n)
                self.assertAlmostEqual(observed, V[row, outcome], places=1)

        #Now use a vector of probabilities
        v = numpy.array([0.3, 0.7])
        j = Util.random2Choice(v, n)
        for outcome in range(v.shape[0]):
            observed = numpy.sum(j==outcome)/float(n)
            self.assertAlmostEqual(observed, v[outcome], places=1)
예제 #55
0
    def maxProductPaths(self):
        """
        Find the maximum product paths between all pairs of vertices using
        a modified version of the Floyd-Warshall algorithm.

        Starting from a copy of the weight matrix, each vertex k is considered
        in turn as an intermediate point: entry ij is replaced by the product
        P[i, k]*P[k, j] whenever that product is larger.

        :returns: A matrix P whose ijth entry corresponds to the maximal product of edge weights between them.
        """
        numVertices = self.vList.getNumVertices()
        P = self.getWeightMatrix().copy()
        # Keep the progress step at least 1: numVertices-1 is 0 for a graph
        # with a single vertex, and a zero step would presumably break a
        # modulus-based progress printer (confirm in Util.printIteration).
        stepSize = max(1, min(100, numVertices-1))

        for k in range(0, numVertices):
            Util.printIteration(k, stepSize, numVertices)
            # Products of the best i -> k and k -> j values, for all i, j at once
            viaK = numpy.outer(P[:, k], P[k, :])
            P = numpy.maximum(P, viaK)

        return P