def match(self, graph1, graph2):
    """
    Take two graphs and match them. The two graphs must be AbstractMatrixGraphs
    with VertexLists representing the vertices, by running the external GraphM
    matching executable over temporary files.

    :param graph1: A graph object
    :param graph2: The second graph object to match

    :return permutation: A vector of indices representing the matching of elements of graph1 to graph2
    :return distance: The graph distance list [graphDistance, fDistance, fDistanceExact]
    :return time: The matching time reported by the graphm executable
    """
    # Deal with the case where at least one graph is empty
    if graph1.size == 0 and graph2.size == 0:
        # Nothing to match: empty permutation, zero distances.
        # numpy.int was removed in NumPy 1.24; it was an alias for builtin int.
        permutation = numpy.array([], int)
        distanceVector = [0, 0, 0]
        time = 0
        return permutation, distanceVector, time
    elif graph1.size == 0 or graph2.size == 0:
        # Replace the empty graph with an edgeless graph of matching size so
        # both inputs to graphm have the same number of vertices
        if graph1.size == 0:
            graph1 = SparseGraph(VertexList(graph2.size, graph2.getVertexList().getNumFeatures()))
        else:
            graph2 = SparseGraph(VertexList(graph1.size, graph1.getVertexList().getNumFeatures()))

    # Create 5 temporary files: config, both weight matrices, similarities, output
    numTempFiles = 5
    tempFileNameList = []
    for i in range(numTempFiles):
        fileObj = tempfile.NamedTemporaryFile(delete=False)
        tempFileNameList.append(fileObj.name)
        fileObj.close()

    configFileName = tempFileNameList[0]
    graph1FileName = tempFileNameList[1]
    graph2FileName = tempFileNameList[2]
    similaritiesFileName = tempFileNameList[3]
    outputFileName = tempFileNameList[4]

    if self.useWeightM:
        W1 = graph1.getWeightMatrix()
        W2 = graph2.getWeightMatrix()
    else:
        W1 = graph1.adjacencyMatrix()
        W2 = graph2.adjacencyMatrix()

    numpy.savetxt(graph1FileName, W1, fmt="%.5f")
    numpy.savetxt(graph2FileName, W2, fmt="%.5f")

    # Compute matrix of vertex similarities
    C = self.vertexSimilarities(graph1, graph2)
    numpy.savetxt(similaritiesFileName, C, fmt="%.5f")

    # Write config file (each entry carries a graphm type suffix: s/d/i/c)
    configStr = "graph_1=" + graph1FileName + " s\n"
    configStr += "graph_2=" + graph2FileName + " s\n"
    configStr += "C_matrix=" + similaritiesFileName + " s\n"
    configStr += "algo=" + self.algorithm + " s\n"
    configStr += "algo_init_sol=" + self.init + " s\n"
    configStr += "alpha_ldh=" + str(self.alpha) + " d\n"
    configStr += "cdesc_matrix=A c\n"
    configStr += "cscore_matrix=A c\n"
    configStr += "hungarian_max=10000 d\n"
    configStr += "algo_fw_xeps=0.01 d\n"
    configStr += "algo_fw_feps=0.01 d\n"
    configStr += "dummy_nodes=0 i\n"
    configStr += "dummy_nodes_fill=" + str(self.rho) + " d\n"
    configStr += "dummy_nodes_c_coef=" + str(self.gamma) + " d\n"
    configStr += "qcvqcc_lambda_M=" + str(self.lambdaM) + " d\n"
    configStr += "qcvqcc_lambda_min=1e-3 d\n"
    configStr += "blast_match=0 i\n"
    configStr += "blast_match_proj=0 i\n"
    configStr += "exp_out_file=" + outputFileName + " s\n"
    configStr += "exp_out_format=Compact Permutation s\n"
    configStr += "verbose_mode=0 i\n"
    configStr += "verbose_file=cout s\n"

    with open(configFileName, "w") as configFile:
        configFile.write(configStr)

    # Run the graphm executable, silencing its console output
    home = expanduser("~")
    argList = [home + "/.local/bin/graphm", configFileName]
    with open(os.devnull, "w") as fnull:
        subprocess.call(argList, stdout=fnull, stderr=fnull)

    # Parse the output file: 4 header lines, then distances/time, 2 separator
    # lines, then one 1-based permutation index per remaining line
    with open(outputFileName, "r") as outputFile:
        for _ in range(4):
            outputFile.readline()
        graphDistance = float(outputFile.readline().split()[2])
        fDistance = float(outputFile.readline().split()[2])
        fDistanceExact = float(outputFile.readline().split()[2])
        time = float(outputFile.readline().split()[1])
        outputFile.readline()
        outputFile.readline()

        permutation = numpy.zeros(max(graph1.getNumVertices(), graph2.getNumVertices()), int)
        for i, line in enumerate(outputFile):
            # Convert 1-based graphm indices to 0-based
            permutation[i] = int(line.strip()) - 1

    # Delete all temporary files
    for tempFileName in tempFileNameList:
        os.remove(tempFileName)

    distanceVector = [graphDistance, fDistance, fDistanceExact]
    return permutation, distanceVector, time
class CitationIterGenerator(object):
    """
    A class to load the high energy physics citation data and generate an
    iterator over growing subgraphs of it. The dataset is found in
    http://snap.stanford.edu/data/cit-HepTh.html
    """
    def __init__(self, minGraphSize=500, maxGraphSize=None, dayStep=30):
        """
        Load the citation edge list and upload dates, build the undirected
        citation graph restricted to its largest connected component.

        :param minGraphSize: Minimum number of vertices for a subgraph to be emitted
        :param maxGraphSize: Stop growing subgraphs beyond this size (None for no limit)
        :param dayStep: Number of days between successive subgraph snapshots
        """
        dataDir = PathDefaults.getDataDir() + "cluster/"
        edgesFilename = dataDir + "Cit-HepTh.txt"
        dateFilename = dataDir + "Cit-HepTh-dates.txt"

        # Note the IDs are integers but can start with zero so we prefix "1" to each ID
        edges = []
        with open(edgesFilename, 'r') as edgesFile:
            # Skip the 4 comment/header lines at the top of the SNAP file
            for _ in range(4):
                edgesFile.readline()
            for line in edgesFile:
                (vertex1, sep, vertex2) = line.partition("\t")
                edges.append([vertex1.strip(), vertex2.strip()])

        logging.info("Loaded edge file " + str(edgesFilename) + " with " + str(len(edges)) + " edges")

        # Keep an edge graph
        graph = DictGraph(False)
        graph.addEdges(edges)
        logging.info("Created directed citation graph with " + str(graph.getNumEdges()) + " edges and " + str(graph.getNumVertices()) + " vertices")

        # Read in the dates articles appear in a dict which used the year and month
        # as the key and the value is a list of vertex ids. For each month we include
        # all papers uploaded that month and those directed cited by those uploads.
        startDate = datetime.date(1990, 1, 1)

        numLines = 0
        subgraphIds = []

        with open(dateFilename, 'r') as dateFile:
            dateFile.readline()
            for line in dateFile:
                (id, sep, date) = line.partition("\t")
                id = id.strip()
                date = date.strip()
                inputDate = datetime.datetime.strptime(date.strip(), "%Y-%m-%d").date()

                if graph.vertexExists(id):
                    tDelta = inputDate - startDate
                    # Store the upload day offset as the vertex value
                    graph.vertices[id] = tDelta.days
                    subgraphIds.append(id)

                    # If a paper cites another, it must have been written before
                    # the citing paper - enforce this rule.
                    for neighbour in graph.neighbours(id):
                        if graph.getVertex(neighbour) is None:
                            graph.setVertex(neighbour, tDelta.days)
                            subgraphIds.append(neighbour)
                        elif tDelta.days < graph.getVertex(neighbour):
                            graph.setVertex(neighbour, tDelta.days)

                numLines += 1

        subgraphIds = set(subgraphIds)
        graph = graph.subgraph(list(subgraphIds))
        logging.debug(graph)
        logging.info("Loaded date file " + str(dateFilename) + " with " + str(len(subgraphIds)) + " dates and " + str(numLines) + " lines")

        # Symmetrise to obtain an undirected weight matrix
        W = graph.getSparseWeightMatrix()
        W = W + W.T

        vList = VertexList(W.shape[0], 1)
        vList.setVertices(numpy.array([graph.getVertices(graph.getAllVertexIds())]).T)

        # Note: we have 16 self edges and some two-way citations so this graph
        # has fewer edges than the directed one
        self.graph = SparseGraph(vList, W=W)
        logging.debug(self.graph)

        # Now pick the max component
        components = self.graph.findConnectedComponents()
        self.graph = self.graph.subgraph(components[0])
        logging.debug("Largest component graph: " + str(self.graph))

        self.minGraphSize = minGraphSize
        self.maxGraphSize = maxGraphSize
        self.dayStep = dayStep

    def getIterator(self):
        """
        Return an iterator which outputs the citation graph for each month.
        Note that the graphs are undirected but we make them directed.
        """
        vertexArray = self.graph.getVertexList().getVertices()
        dates = vertexArray[:, 0]
        firstVertex = numpy.argmin(dates)

        # Fix: range() is immutable in Python 3, so materialise it as a list
        # before appending the final day
        dayList = list(range(int(numpy.min(dates)), int(numpy.max(dates)), self.dayStep))
        dayList.append(numpy.max(dates))

        subgraphIndicesList = []

        # Generate subgraph indices list: for each cutoff day, take all vertices
        # dated on or before it, restricted to the component containing the
        # earliest vertex
        for i in dayList:
            subgraphIndices = numpy.nonzero(dates <= i)[0]
            # Check subgraphIndices are sorted
            subgraphIndices = numpy.sort(subgraphIndices)
            currentSubgraph = self.graph.subgraph(subgraphIndices)
            compIndices = currentSubgraph.depthFirstSearch(list(subgraphIndices).index(firstVertex))
            subgraphIndices = subgraphIndices[compIndices]

            if self.maxGraphSize is not None and subgraphIndices.shape[0] > self.maxGraphSize:
                break

            print(subgraphIndices.shape[0])

            if subgraphIndices.shape[0] >= self.minGraphSize:
                subgraphIndicesList.append(subgraphIndices)

        iterator = IncreasingSubgraphListIterator(self.graph, subgraphIndicesList)
        return iterator
def match(self, graph1, graph2):
    """
    Take two graphs and match them. The two graphs must be AbstractMatrixGraphs
    with VertexLists representing the vertices, by running the external GraphM
    matching executable over temporary files.

    :param graph1: A graph object
    :param graph2: The second graph object to match

    :return permutation: A vector of indices representing the matching of elements of graph1 to graph2
    :return distance: The graph distance list [graphDistance, fDistance, fDistanceExact]
    :return time: The matching time reported by the graphm executable
    """
    # Deal with the case where at least one graph is empty
    if graph1.size == 0 and graph2.size == 0:
        # Nothing to match: empty permutation, zero distances.
        # numpy.int was removed in NumPy 1.24; it was an alias for builtin int.
        permutation = numpy.array([], int)
        distanceVector = [0, 0, 0]
        time = 0
        return permutation, distanceVector, time
    elif graph1.size == 0 or graph2.size == 0:
        # Replace the empty graph with an edgeless graph of matching size so
        # both inputs to graphm have the same number of vertices
        if graph1.size == 0:
            graph1 = SparseGraph(VertexList(graph2.size, graph2.getVertexList().getNumFeatures()))
        else:
            graph2 = SparseGraph(VertexList(graph1.size, graph1.getVertexList().getNumFeatures()))

    # Create 5 temporary files: config, both weight matrices, similarities, output
    numTempFiles = 5
    tempFileNameList = []
    for i in range(numTempFiles):
        fileObj = tempfile.NamedTemporaryFile(delete=False)
        tempFileNameList.append(fileObj.name)
        fileObj.close()

    configFileName = tempFileNameList[0]
    graph1FileName = tempFileNameList[1]
    graph2FileName = tempFileNameList[2]
    similaritiesFileName = tempFileNameList[3]
    outputFileName = tempFileNameList[4]

    if self.useWeightM:
        W1 = graph1.getWeightMatrix()
        W2 = graph2.getWeightMatrix()
    else:
        W1 = graph1.adjacencyMatrix()
        W2 = graph2.adjacencyMatrix()

    numpy.savetxt(graph1FileName, W1, fmt='%.5f')
    numpy.savetxt(graph2FileName, W2, fmt='%.5f')

    # Compute matrix of vertex similarities
    C = self.vertexSimilarities(graph1, graph2)
    numpy.savetxt(similaritiesFileName, C, fmt='%.5f')

    # Write config file (each entry carries a graphm type suffix: s/d/i/c)
    configStr = "graph_1=" + graph1FileName + " s\n"
    configStr += "graph_2=" + graph2FileName + " s\n"
    configStr += "C_matrix=" + similaritiesFileName + " s\n"
    configStr += "algo=" + self.algorithm + " s\n"
    configStr += "algo_init_sol=" + self.init + " s\n"
    configStr += "alpha_ldh=" + str(self.alpha) + " d\n"
    configStr += "cdesc_matrix=A c\n"
    configStr += "cscore_matrix=A c\n"
    configStr += "hungarian_max=10000 d\n"
    configStr += "algo_fw_xeps=0.01 d\n"
    configStr += "algo_fw_feps=0.01 d\n"
    configStr += "dummy_nodes=0 i\n"
    configStr += "dummy_nodes_fill=" + str(self.rho) + " d\n"
    configStr += "dummy_nodes_c_coef=" + str(self.gamma) + " d\n"
    configStr += "qcvqcc_lambda_M=" + str(self.lambdaM) + " d\n"
    configStr += "qcvqcc_lambda_min=1e-3 d\n"
    configStr += "blast_match=0 i\n"
    configStr += "blast_match_proj=0 i\n"
    configStr += "exp_out_file=" + outputFileName + " s\n"
    configStr += "exp_out_format=Compact Permutation s\n"
    configStr += "verbose_mode=0 i\n"
    configStr += "verbose_file=cout s\n"

    with open(configFileName, 'w') as configFile:
        configFile.write(configStr)

    # Run the graphm executable, silencing its console output
    home = expanduser("~")
    argList = [home + "/.local/bin/graphm", configFileName]
    with open(os.devnull, 'w') as fnull:
        subprocess.call(argList, stdout=fnull, stderr=fnull)

    # Parse the output file: 4 header lines, then distances/time, 2 separator
    # lines, then one 1-based permutation index per remaining line
    with open(outputFileName, 'r') as outputFile:
        for _ in range(4):
            outputFile.readline()
        graphDistance = float(outputFile.readline().split()[2])
        fDistance = float(outputFile.readline().split()[2])
        fDistanceExact = float(outputFile.readline().split()[2])
        time = float(outputFile.readline().split()[1])
        outputFile.readline()
        outputFile.readline()

        permutation = numpy.zeros(max(graph1.getNumVertices(), graph2.getNumVertices()), int)
        for i, line in enumerate(outputFile):
            # Convert 1-based graphm indices to 0-based
            permutation[i] = int(line.strip()) - 1

    # Delete all temporary files
    for tempFileName in tempFileNameList:
        os.remove(tempFileName)

    distanceVector = [graphDistance, fDistance, fDistanceExact]
    return permutation, distanceVector, time