Example #1
    def match(self, graph1, graph2):
        """
        Take two graphs and match them. The two graphs must be AbstractMatrixGraphs
        with VertexLists representing the vertices. The matching is delegated to the
        external GraphM executable, which is driven via temporary files written to disk.
        
        :param graph1: A graph object 
        
        :param graph2: The second graph object to match 
        
        :return permutation: A vector of indices representing the matching of elements of graph1 to graph2

        :return distanceVector: The graph distance list [graphDistance, fDistance, fDistanceExact]

        :return time: The time taken by the matching, as reported by GraphM
        """
        # Deal with the case where at least one graph is empty
        if graph1.size == 0 and graph2.size == 0:
            permutation = numpy.array([], numpy.int_)
            distanceVector = [0, 0, 0]
            time = 0
            return permutation, distanceVector, time
        elif graph1.size == 0 or graph2.size == 0:
            if graph1.size == 0:
                graph1 = SparseGraph(VertexList(graph2.size, graph2.getVertexList().getNumFeatures()))
            else:
                graph2 = SparseGraph(VertexList(graph1.size, graph1.getVertexList().getNumFeatures()))

        numTempFiles = 5
        tempFileNameList = []

        for i in range(numTempFiles):
            fileObj = tempfile.NamedTemporaryFile(delete=False)
            tempFileNameList.append(fileObj.name)
            fileObj.close()

        configFileName = tempFileNameList[0]
        graph1FileName = tempFileNameList[1]
        graph2FileName = tempFileNameList[2]
        similaritiesFileName = tempFileNameList[3]
        outputFileName = tempFileNameList[4]

        if self.useWeightM:
            W1 = graph1.getWeightMatrix()
            W2 = graph2.getWeightMatrix()
        else:
            W1 = graph1.adjacencyMatrix()
            W2 = graph2.adjacencyMatrix()

        numpy.savetxt(graph1FileName, W1, fmt="%.5f")
        numpy.savetxt(graph2FileName, W2, fmt="%.5f")

        # Compute matrix similarities
        C = self.vertexSimilarities(graph1, graph2)
        numpy.savetxt(similaritiesFileName, C, fmt="%.5f")

        # Write config file
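        # GraphM is driven by a plain-text config of "key=value <type>" lines;
        # the trailing letter appears to give the value type (s=string,
        # d=double, i=integer, c=char).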
        configFile = open(configFileName, "w")

        configStr = "graph_1=" + graph1FileName + " s\n"
        configStr += "graph_2=" + graph2FileName + " s\n"
        configStr += "C_matrix=" + similaritiesFileName + " s\n"
        configStr += "algo=" + self.algorithm + " s\n"
        configStr += "algo_init_sol=" + self.init + " s\n"
        configStr += "alpha_ldh=" + str(self.alpha) + " d\n"
        configStr += "cdesc_matrix=A c\n"
        configStr += "cscore_matrix=A c\n"
        configStr += "hungarian_max=10000 d\n"
        configStr += "algo_fw_xeps=0.01 d\n"
        configStr += "algo_fw_feps=0.01 d\n"
        configStr += "dummy_nodes=0 i\n"
        configStr += "dummy_nodes_fill=" + str(self.rho) + " d\n"
        configStr += "dummy_nodes_c_coef=" + str(self.gamma) + " d\n"
        configStr += "qcvqcc_lambda_M=" + str(self.lambdaM) + " d\n"
        configStr += "qcvqcc_lambda_min=1e-3 d\n"
        configStr += "blast_match=0 i\n"
        configStr += "blast_match_proj=0 i\n"
        configStr += "exp_out_file=" + outputFileName + " s\n"
        configStr += "exp_out_format=Compact Permutation s\n"
        configStr += "verbose_mode=0 i\n"
        configStr += "verbose_file=cout s\n"

        configFile.write(configStr)
        configFile.close()

        fnull = open(os.devnull, "w")

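        # Run the external graphm executable, assumed to be installed under
        # ~/.local/bin; its console output is discarded via os.devnull.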
        home = expanduser("~")
        argList = [home + "/.local/bin/graphm", configFileName]
        subprocess.call(argList, stdout=fnull, stderr=fnull)

        fnull.close()

        # Next: parse the GraphM output file
        outputFile = open(outputFileName, "r")

        # Skip the header lines before the distance values
        line = outputFile.readline()
        line = outputFile.readline()
        line = outputFile.readline()
        line = outputFile.readline()

        graphDistance = float(outputFile.readline().split()[2])
        fDistance = float(outputFile.readline().split()[2])
        fDistanceExact = float(outputFile.readline().split()[2])
        time = float(outputFile.readline().split()[1])

        line = outputFile.readline()
        line = outputFile.readline()

        permutation = numpy.zeros(max(graph1.getNumVertices(), graph2.getNumVertices()), numpy.int_)

        i = 0
        for line in outputFile:
            permutation[i] = int(line.strip()) - 1
            i += 1

        # Delete files
        os.remove(graph1FileName)
        os.remove(graph2FileName)
        os.remove(similaritiesFileName)
        os.remove(configFileName)
        os.remove(outputFileName)

        distanceVector = [graphDistance, fDistance, fDistanceExact]
        return permutation, distanceVector, time
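
The following is a minimal usage sketch for the match method above. The enclosing class is not shown in this listing, so the GraphMatch name, its import path and its constructor arguments are assumptions; the sketch also assumes apgl's SparseGraph and VertexList are available and that the graphm executable is installed at ~/.local/bin/graphm.

from apgl.graph import SparseGraph, VertexList
from sandbox.misc.GraphMatch import GraphMatch  # assumed import path for the class above

# Two small undirected graphs with a few edges each; the vertex values are
# unused by the structural part of the matching.
numVertices = 5
numFeatures = 1
graph1 = SparseGraph(VertexList(numVertices, numFeatures))
graph2 = SparseGraph(VertexList(numVertices, numFeatures))
graph1.addEdge(0, 1)
graph1.addEdge(1, 2)
graph2.addEdge(0, 2)
graph2.addEdge(2, 3)

# Hypothetical constructor arguments - adjust to the real signature.
matcher = GraphMatch(algorithm="PATH", alpha=0.5)
permutation, distanceVector, time = matcher.match(graph1, graph2)
print(permutation)      # graph2 index matched to each graph1 vertex
print(distanceVector)   # [graphDistance, fDistance, fDistanceExact]
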
class CitationIterGenerator(object):
    """
    A class to load the high energy physics data and generate an iterator. The 
    dataset is found in http://snap.stanford.edu/data/cit-HepTh.html
    """
    def __init__(self, minGraphSize=500, maxGraphSize=None, dayStep=30):
        
        dataDir = PathDefaults.getDataDir() + "cluster/"
        edgesFilename = dataDir + "Cit-HepTh.txt"
        dateFilename = dataDir + "Cit-HepTh-dates.txt"

        #Note: the IDs are integers but can start with zero, so we keep them as strings 
        edges = []
        file = open(edgesFilename, 'r')
        # Skip the header lines at the top of the edge file
        file.readline()
        file.readline()
        file.readline()
        file.readline()

        for line in file:
            (vertex1, sep, vertex2) = line.partition("\t")
            vertex1 = vertex1.strip()
            vertex2 = vertex2.strip()
            edges.append([vertex1, vertex2])
            
            #if vertex1 == vertex2: 
            #    print(vertex1)

        file.close()

        logging.info("Loaded edge file " + str(edgesFilename) + " with " + str(len(edges)) + " edges")

        #Keep an edge graph 
        graph = DictGraph(False)
        graph.addEdges(edges)
        logging.info("Created directed citation graph with " + str(graph.getNumEdges()) + " edges and " + str(graph.getNumVertices()) + " vertices")

        #Read the date each article appeared and store it on the corresponding vertex
        #as the number of days since startDate. Papers cited by a dated paper are also
        #included, dated no later than the citing paper. 
        startDate = datetime.date(1990, 1, 1)

        file = open(dateFilename, 'r')
        file.readline()
        numLines = 0 
        subgraphIds = []

        for line in file:
            (id, sep, date) = line.partition("\t")
            id = id.strip()
            date = date.strip()
            

            inputDate = datetime.datetime.strptime(date.strip(), "%Y-%m-%d")
            inputDate = inputDate.date()

            if graph.vertexExists(id):
                tDelta = inputDate - startDate
                            
                graph.vertices[id] = tDelta.days 
                subgraphIds.append(id)
                
                #A cited paper must have been written no later than the citing
                #paper - enforce this rule on the neighbours' dates. 
                for neighbour in graph.neighbours(id): 
                    if graph.getVertex(neighbour) is None: 
                        graph.setVertex(neighbour, tDelta.days) 
                        subgraphIds.append(neighbour)
                    elif tDelta.days < graph.getVertex(neighbour): 
                        graph.setVertex(neighbour, tDelta.days) 
                        
            numLines += 1 
            
        file.close()
        
        subgraphIds = set(subgraphIds)
        graph = graph.subgraph(list(subgraphIds))
        logging.debug(graph)
        logging.info("Loaded date file " + str(dateFilename) + " with " + str(len(subgraphIds)) + " dates and " + str(numLines) + " lines")

        W = graph.getSparseWeightMatrix()
        W = W + W.T
        
        vList = VertexList(W.shape[0], 1)
        vList.setVertices(numpy.array([graph.getVertices(graph.getAllVertexIds())]).T)
        
        #Note: we have 16 self edges and some two-way citations so this graph has fewer edges than the directed one 
        self.graph = SparseGraph(vList, W=W)
        logging.debug(self.graph)
        
        #Now pick the max component 
        components = self.graph.findConnectedComponents()
        self.graph = self.graph.subgraph(components[0])
        
        logging.debug("Largest component graph: " + str(self.graph))
        
        self.minGraphSize = minGraphSize
        self.maxGraphSize = maxGraphSize 
        self.dayStep = dayStep 
        
    def getIterator(self):
        """
        Return an iterator which outputs the citation graph at each time step of
        dayStep days. Note that the original citation graph is directed but the
        generated graphs are undirected.
        """
        vertexArray = self.graph.getVertexList().getVertices()
        dates = vertexArray[:, 0]
        firstVertex = numpy.argmin(dates)
        
        dayList = list(range(int(numpy.min(dates)), int(numpy.max(dates)), self.dayStep))
        dayList.append(numpy.max(dates))
        
        subgraphIndicesList = []

        #Generate subgraph indices list 
        for i in dayList:
            subgraphIndices = numpy.nonzero(dates <= i)[0]
            
            #Ensure subgraphIndices are sorted 
            subgraphIndices = numpy.sort(subgraphIndices)
            currentSubgraph = self.graph.subgraph(subgraphIndices)
            compIndices = currentSubgraph.depthFirstSearch(list(subgraphIndices).index(firstVertex))
            subgraphIndices =  subgraphIndices[compIndices]        
            
            if self.maxGraphSize is not None and subgraphIndices.shape[0] > self.maxGraphSize: 
                break 
            
            logging.debug("Subgraph size: " + str(subgraphIndices.shape[0]))
            
            if subgraphIndices.shape[0] >= self.minGraphSize: 
                subgraphIndicesList.append(subgraphIndices)
                
        iterator = IncreasingSubgraphListIterator(self.graph, subgraphIndicesList)
        return iterator 
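
A short usage sketch for CitationIterGenerator, assuming the Cit-HepTh.txt and Cit-HepTh-dates.txt files from SNAP are present under PathDefaults.getDataDir() + "cluster/" as the constructor expects. The logging configuration is only there so the info and debug messages above are visible.

import logging
import sys

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

generator = CitationIterGenerator(minGraphSize=500, maxGraphSize=None, dayStep=30)
iterator = generator.getIterator()

# Each item is the citation graph restricted to papers dated up to the current
# time step, so the graphs grow as the iterator advances.
for graph in iterator:
    print(graph.getNumVertices(), graph.getNumEdges())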