def toDictGraph(self):
    """
    Build a DictGraph holding the edges and edge values of this graph. The
    new graph is created with this graph's ``undirected`` flag. Vertex labels
    are currently not copied across.

    :return graph: A DictGraph object.
    """
    allEdges = self.getAllEdges()
    edgeValues = self.getEdgeValues(allEdges)

    dictGraph = DictGraph(self.undirected)
    dictGraph.addEdges(allEdges, edgeValues)
    return dictGraph
def testAddEdges(self):
    """
    Check DictGraph.addEdges on a default-constructed graph and then on a
    graph constructed with False (directed), first with no edge values and
    then with explicit edge values.
    """
    edgeList = [(1, 2), (2, 1), (5, 2), (8, 8)]
    edgeValues = [1, 2, 3, 4]

    def checkEdges(graph, expected):
        #expected is a list of ((vertex1, vertex2), value) pairs
        for (vertex1, vertex2), value in expected:
            self.assertEqual(graph.getEdge(vertex1, vertex2), value)

    #Default graph: (1, 2) and (2, 1) count as a single edge
    graph = DictGraph()
    graph.addEdges(edgeList)
    self.assertEqual(graph.getNumEdges(), 3)
    checkEdges(graph, [((1, 2), 1), ((5, 2), 1), ((2, 1), 1), ((8, 8), 1)])

    #The value given for (2, 1) overwrites the one given for (1, 2)
    graph.addEdges(edgeList, edgeValues)
    checkEdges(graph, [((1, 2), 2), ((5, 2), 3), ((2, 1), 2), ((8, 8), 4)])

    #Now test directed graphs
    graph = DictGraph(False)
    graph.addEdges(edgeList)
    self.assertEqual(graph.getNumEdges(), 4)
    checkEdges(graph, [((1, 2), 1), ((5, 2), 1), ((2, 1), 1), ((8, 8), 1)])

    #Here (1, 2) and (2, 1) are distinct edges and keep their own values
    graph.addEdges(edgeList, edgeValues)
    checkEdges(graph, [((1, 2), 1), ((5, 2), 3), ((2, 1), 2), ((8, 8), 4)])
def __init__(self, minGraphSize=500, maxGraphSize=None, dayStep=30):
    """
    Read the Cit-HepTh edge and date files from the "cluster/" data
    directory, label each vertex with a number of days since 1990-01-01, and
    store in self.graph a symmetrised SparseGraph restricted to the first
    connected component returned by findConnectedComponents (the original
    code treats this as the largest one).

    :param minGraphSize: Stored unchanged as self.minGraphSize.

    :param maxGraphSize: Stored unchanged as self.maxGraphSize.

    :param dayStep: Stored unchanged as self.dayStep.
    """
    dataDir = PathDefaults.getDataDir() + "cluster/"
    edgesFilename = dataDir + "Cit-HepTh.txt"
    dateFilename = dataDir + "Cit-HepTh-dates.txt"

    #IDs can start with zero, so they are kept as stripped strings
    edges = []
    with open(edgesFilename, 'r') as edgesFile:
        #The first four lines are not parsed as edges
        for _ in range(4):
            edgesFile.readline()

        for line in edgesFile:
            (vertex1, _, vertex2) = line.partition("\t")
            edges.append([vertex1.strip(), vertex2.strip()])

    logging.info("Loaded edge file " + str(edgesFilename) + " with " + str(len(edges)) + " edges")

    #Keep an edge graph
    graph = DictGraph(False)
    graph.addEdges(edges)
    logging.info("Created directed citation graph with " + str(graph.getNumEdges()) + " edges and " + str(graph.getNumVertices()) + " vertices")

    #Label each vertex found in the date file with the number of days between
    #startDate and its date. We keep all papers with a date and those directly
    #cited by them.
    startDate = datetime.date(1990, 1, 1)
    numLines = 0
    subgraphIds = []

    with open(dateFilename, 'r') as dateFile:
        #The first line is not parsed as a date entry
        dateFile.readline()

        for line in dateFile:
            (vertexId, _, dateStr) = line.partition("\t")
            vertexId = vertexId.strip()
            inputDate = datetime.datetime.strptime(dateStr.strip(), "%Y-%m-%d").date()

            if graph.vertexExists(vertexId):
                tDelta = inputDate - startDate
                graph.vertices[vertexId] = tDelta.days
                subgraphIds.append(vertexId)

                #If a paper cites another, it must have been written before
                #the citing paper - enforce this rule.
                for neighbour in graph.neighbours(vertexId):
                    neighbourDays = graph.getVertex(neighbour)

                    if neighbourDays is None:
                        graph.setVertex(neighbour, tDelta.days)
                        subgraphIds.append(neighbour)
                    elif tDelta.days < neighbourDays:
                        graph.setVertex(neighbour, tDelta.days)

            numLines += 1

    subgraphIds = set(subgraphIds)
    graph = graph.subgraph(list(subgraphIds))
    logging.debug(graph)
    logging.info("Loaded date file " + str(dateFilename) + " with " + str(len(subgraphIds)) + " dates and " + str(numLines) + " lines")

    #Symmetrise the weights so the graph can be treated as undirected
    W = graph.getSparseWeightMatrix()
    W = W + W.T

    vList = VertexList(W.shape[0], 1)
    vList.setVertices(numpy.array([graph.getVertices(graph.getAllVertexIds())]).T)

    #Note: we have 16 self edges and some two-way citations so this graph has fewer edges than the directed one
    self.graph = SparseGraph(vList, W=W)
    logging.debug(self.graph)

    #Now pick the max component
    components = self.graph.findConnectedComponents()
    self.graph = self.graph.subgraph(components[0])

    logging.debug("Largest component graph: " + str(self.graph))

    self.minGraphSize = minGraphSize
    self.maxGraphSize = maxGraphSize
    self.dayStep = dayStep
def testGetIterator(self):
    """
    Check the matrices produced by CitationIterGenerator.getIterator: each
    one is symmetric and contains the previous one as its leading block.
    Then recompute the vertex set directly from the data files and compare
    the number of vertices and of non-zero weights with the final matrix.
    """
    generator = CitationIterGenerator()
    iterator = generator.getIterator()

    #next() works for both Python 2 and Python 3 iterators
    lastW = next(iterator)

    for W in iterator:
        self.assertTrue((W-W.T).getnnz() == 0)
        self.assertTrue((lastW - W[0:lastW.shape[0], 0:lastW.shape[0]]).getnnz() == 0)
        lastW = W

    numVertices = W.shape[0]

    #Now compute the vertexIds manually:
    dataDir = PathDefaults.getDataDir() + "cluster/"
    edgesFilename = dataDir + "Cit-HepTh.txt"
    dateFilename = dataDir + "Cit-HepTh-dates.txt"

    #We can't load in numbers using numpy since some may start with zero,
    #so "1" is prefixed to each ID before converting it to an integer
    edges = []
    with open(edgesFilename, 'r') as edgesFile:
        #The first four lines are not parsed as edges
        for _ in range(4):
            edgesFile.readline()

        for line in edgesFile:
            (vertex1, _, vertex2) = line.partition("\t")
            edges.append([int("1" + vertex1.strip()), int("1" + vertex2.strip())])

    #numpy.int was only an alias of the builtin int
    edges = numpy.array(edges, int)

    #Check file read correctly
    self.assertTrue((edges[0, :] == numpy.array([11001, 19304045])).all())
    self.assertTrue((edges[1, :] == numpy.array([11001, 19308122])).all())
    self.assertTrue((edges[9, :] == numpy.array([11001, 19503124])).all())

    vertexIds1 = numpy.unique(edges)
    logging.info("Number of graph vertices: " + str(vertexIds1.shape[0]))

    vertexIds2 = []
    with open(dateFilename, 'r') as dateFile:
        #The first line is not parsed as a date entry
        dateFile.readline()

        for line in dateFile:
            (vertexId, _, _) = line.partition("\t")
            vertexIds2.append(int("1" + vertexId.strip()))

    #Check file read correctly
    vertexIds2 = numpy.array(vertexIds2, int)
    self.assertTrue((vertexIds2[0:10] == numpy.array([19203201, 19203202, 19203203, 19203204, 19203205, 19203206, 19203207, 19203208, 19203209, 19203210], int)).all())
    vertexIds2 = numpy.unique(numpy.array(vertexIds2, int))

    graph = DictGraph(False)
    graph.addEdges(edges)

    #Find the set of vertices with known citation
    vertices = []
    vertexId2Set = set(vertexIds2.tolist())

    for i in graph.getAllVertexIds():
        Util.printIteration(i, 50000, edges.shape[0])
        if i in vertexId2Set:
            vertices.append(i)
            vertices.extend(graph.neighbours(i))

    logging.debug("Number of final vertices: " + str(numVertices))
    numVertices2 = numpy.unique(numpy.array(vertices)).shape[0]
    self.assertEqual(numVertices, numVertices2)

    #Now compare the weight matrices using the undirected graph
    #Note the order of vertices is different from the iterator
    graph = DictGraph()
    graph.addEdges(edges)
    subgraph = graph.subgraph(numpy.unique(numpy.array(vertices)))
    W2 = subgraph.getSparseWeightMatrix()

    self.assertEqual(W.getnnz(), W2.getnnz())