Ejemplo n.º 1
0
    def test_make_graph_from_points_knn_and_mst(self):
        """Check the "knn+mst" graph built from 7 planar points for both symmetrizations."""
        points = np.asarray(
            ((0, 0), (0, 1), (1, 0), (0, 3), (0, 4), (1, 3), (2, 3)))
        diag = np.sqrt(2)

        # (symmetrization, expected sources, expected targets, expected weights)
        cases = (
            ("max",
             (0, 0, 1, 3, 3, 4, 5, 3, 1),
             (1, 2, 2, 5, 4, 5, 6, 6, 3),
             (1, 1, diag, 1, 1, diag, 1, 2, 2)),
            ("min",
             (0, 0, 1, 3, 3, 5, 1),
             (1, 2, 2, 5, 4, 6, 3),
             (1, 1, diag, 1, 1, 1, 2)),
        )

        for strategy, ref_sources, ref_targets, ref_weights in cases:
            g, ew = hg.make_graph_from_points(points,
                                              graph_type="knn+mst",
                                              symmetrization=strategy,
                                              n_neighbors=2)

            expected_graph = hg.UndirectedGraph(7)
            expected_graph.add_edges(ref_sources, ref_targets)

            self.assertTrue(TestAlgorithmGraphCore.graph_equal(
                g, ew, expected_graph, ref_weights))
Ejemplo n.º 2
0
    def test_add_vertex(self):
        """Vertices added one by one receive consecutive indices starting at 0."""
        growing = hg.UndirectedGraph()
        self.assertEqual(growing.num_vertices(), 0)
        for expected_index in (0, 1):
            self.assertEqual(growing.add_vertex(), expected_index)
            self.assertEqual(growing.num_vertices(), expected_index + 1)

        # the constructor argument gives the initial number of vertices
        sized = hg.UndirectedGraph(3)
        self.assertEqual(sized.num_vertices(), 3)
Ejemplo n.º 3
0
    def test_add_edges(self):
        """Adding edges in bulk yields the same edges as adding them one at a time."""
        edge_sources = (0, 0)
        edge_targets = (1, 2)

        one_by_one = hg.UndirectedGraph(3)
        for source, target in zip(edge_sources, edge_targets):
            one_by_one.add_edge(source, target)

        bulk = hg.UndirectedGraph(3)
        bulk.add_edges(edge_sources, edge_targets)

        self.assertEqual(bulk.num_edges(), 2)

        for index in range(bulk.num_edges()):
            self.assertTrue(
                one_by_one.edge_from_index(index) == bulk.edge_from_index(index))
Ejemplo n.º 4
0
    def test_graphWrite(self):
        """Save a 15-vertex chain graph with save_graph_pink and compare it with the file named by graph_file."""
        global graph_file
        out_path = "testWriteGraphPink.graph"
        silent_remove(out_path)

        # chain graph: vertex i is linked to vertex i + 1
        chain = hg.UndirectedGraph(15)
        for vertex in range(14):
            chain.add_edge(vertex, vertex + 1)

        hg.save_graph_pink(
            out_path,
            chain,
            np.arange(1, 16),
            np.asarray((3, 0, 0, 1, 3, 0, 1, 0, 2, 0, 1, 0, 3, 0)),
            (3, 5))

        self.assertTrue(os.path.exists(out_path))

        with open(out_path, 'r') as written:
            content = written.read()

        silent_remove(out_path)

        with open(graph_file, 'r') as reference:
            expected_content = reference.read()

        self.assertTrue(content == expected_content)

        # saving must also work when only the graph is provided
        hg.save_graph_pink(out_path, chain)
        self.assertTrue(os.path.exists(out_path))
        silent_remove(out_path)
Ejemplo n.º 5
0
    def test_adjacency_matrix_2_undirected_graph_non_edge_values(self):
        """Build a graph from a dense adjacency matrix in which -1 marks missing edges."""
        matrix = np.asarray(((-1, 1, 2, 3, 4),
                             (1, -1, 5, -1, -1),
                             (2, 5, -1, 6, 7),
                             (3, -1, 6, -1, -1),
                             (4, -1, 7, -1, -1)))
        graph, edge_weights = hg.adjacency_matrix_2_undirected_graph(matrix, -1)

        expected_graph = hg.UndirectedGraph(5)
        for source, target in ((0, 1), (0, 2), (0, 3), (0, 4),
                               (1, 2), (2, 3), (2, 4)):
            expected_graph.add_edge(source, target)

        expected_weights = np.asarray((1, 2, 3, 4, 5, 6, 7))

        self.assertTrue(np.all(edge_weights == expected_weights))
        self.assertEqual(graph.num_vertices(), expected_graph.num_vertices())
        self.assertEqual(graph.num_edges(), expected_graph.num_edges())

        for actual_edge, expected_edge in zip(graph.edges(), expected_graph.edges()):
            self.assertTrue(actual_edge == expected_edge)
Ejemplo n.º 6
0
    def test_binary_partition_tree_average_linkage2(self):
        """Average-linkage binary partition tree on a 10-vertex, 21-edge weighted graph."""
        edge_sources = (0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 4, 4, 5, 5, 7, 7)
        edge_targets = (3, 6, 4, 2, 5, 3, 6, 9, 7, 3, 8, 5, 9, 4, 6, 9, 7, 8, 6, 9, 8)
        weights = np.asarray(
            (0.87580029, 0.60123697, 0.79924759, 0.74221387, 0.75418382,
             0.66159356, 1.31856839, 0.76080612, 1.08881471, 0.98557615,
             0.61454158, 0.50913424, 0.63556478, 0.64684775, 1.14865302,
             0.81741018, 2.1591071, 0.60563004, 2.06636665, 1.35617725,
             0.83085949),
            dtype=np.float64)

        g = hg.UndirectedGraph(10)
        g.add_edges(edge_sources, edge_targets)

        tree, altitudes = hg.binary_partition_tree_average_linkage(g, weights)

        ref_parents = np.asarray((11, 14, 10, 13, 15, 10, 11, 18, 12, 13,
                                  12, 17, 16, 14, 15, 16, 17, 18, 18),
                                 dtype=np.int64)
        ref_altitudes = np.asarray(
            (0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.509134, 0.601237,
             0.610086, 0.635565, 0.661594, 0.732129, 0.810695, 1.241727,
             1.35874),
            dtype=np.float64)

        self.assertTrue(np.all(ref_parents == tree.parents()))
        self.assertTrue(np.allclose(ref_altitudes, altitudes))
Ejemplo n.º 7
0
 def test_add_vertices(self):
     """Adding vertices in bulk increases the vertex count by the requested amount."""
     graph = hg.UndirectedGraph()
     expected_count = 0
     self.assertEqual(graph.num_vertices(), expected_count)
     for increment in (3, 2):
         graph.add_vertices(increment)
         expected_count += increment
         self.assertEqual(graph.num_vertices(), expected_count)
Ejemplo n.º 8
0
    def test_make_graph_from_points_complete(self):
        """The "complete" graph on three points links every pair of points."""
        points = np.asarray(((0, 0), (0, 1), (1, 0)))
        g, ew = hg.make_graph_from_points(points, graph_type="complete")

        expected_graph = hg.UndirectedGraph(3)
        expected_graph.add_edges((0, 0, 1), (1, 2, 2))
        # two unit-length sides and one diagonal
        expected_weights = (1, 1, np.sqrt(2))

        self.assertTrue(
            TestAlgorithmGraphCore.graph_equal(g, ew, expected_graph, expected_weights))
Ejemplo n.º 9
0
    def test_subgraph_spanning(self):
        """Extract a subgraph of a 6-vertex chain with spanning=False and check its vertex map."""
        chain = hg.UndirectedGraph(6)
        chain.add_edges(np.arange(5), np.arange(1, 6))
        selected = np.asarray((4, 0, 3))
        sub, vertex_map = hg.subgraph(
            chain, selected, spanning=False, return_vertex_map=True)

        # vertex 2 is not an extremity of any selected edge, hence 5 vertices
        self.assertEqual(sub.num_vertices(), 5)
        self.assertEqual(sub.num_edges(), len(selected))
        sub_sources, sub_targets = sub.edge_list()
        self.assertTrue(np.all(vertex_map == (0, 1, 3, 4, 5)))
        # mapped back to the input graph, the edges come in the order of 'selected'
        self.assertTrue(np.all(vertex_map[sub_sources] == (4, 0, 3)))
        self.assertTrue(np.all(vertex_map[sub_targets] == (5, 1, 4)))
Ejemplo n.º 10
0
    def test_binary_partition_tree_ward_linkage(self):
        """Ward-linkage binary partition tree: explicit sizes, default sizes, and altitude_correction="none"."""

        def check(result, parents, heights):
            # compare a (tree, altitudes) result with the expected parents and altitudes
            tree, altitudes = result
            ref_parents = np.asarray(parents, dtype=np.int64)
            ref_altitudes = np.asarray(heights, dtype=np.float64)
            self.assertTrue(np.all(ref_parents == tree.parents()))
            self.assertTrue(np.allclose(ref_altitudes, altitudes))

        # 2d centroids with explicit vertex sizes
        g5 = hg.UndirectedGraph(5)
        g5.add_edges((0, 0, 0, 1, 2, 2, 3), (1, 2, 3, 2, 3, 4, 4))
        centroids_2d = np.asarray(
            ((0, 0), (1, 1), (1, 3), (-3, 4), (-1, 5)))
        sizes = np.asarray((1, 1, 1, 2, 1))
        check(hg.binary_partition_tree_ward_linkage(g5, centroids_2d, sizes),
              (5, 5, 7, 6, 6, 7, 8, 8, 8),
              (0., 0., 0., 0., 0., 1., 3.333333, 4.333333, 27.))

        # 1d centroids, vertex sizes left to their default
        g3 = hg.UndirectedGraph(3)
        g3.add_edges((0, 1), (2, 2))
        centroids_1d = np.asarray(((0, ), (1, ), (5, )))
        check(hg.binary_partition_tree_ward_linkage(g3, centroids_1d),
              (4, 3, 3, 4, 4),
              (0., 0., 0., 8, 8))

        # same input without altitude correction
        check(hg.binary_partition_tree_ward_linkage(
                  g3, centroids_1d, altitude_correction="none"),
              (4, 3, 3, 4, 4),
              (0., 0., 0., 8, 6))
Ejemplo n.º 11
0
    def test_undirected_graph_2_adjacency_matrix(self):
        """Adjacency matrices of a graph (dense output) and of a tree (sparse output, with and without weights)."""
        g = hg.UndirectedGraph(5)
        for source, target in ((0, 1), (0, 2), (0, 3), (0, 4),
                               (1, 2), (2, 3), (2, 4)):
            g.add_edge(source, target)
        weights = np.asarray((1, 2, 3, 4, 5, 6, 7))

        # dense output with -1 for missing edges
        dense = hg.undirected_graph_2_adjacency_matrix(
            g, weights, non_edge_value=-1, sparse=False)
        expected_dense = np.asarray(((-1, 1, 2, 3, 4),
                                     (1, -1, 5, -1, -1),
                                     (2, 5, -1, 6, 7),
                                     (3, -1, 6, -1, -1),
                                     (4, -1, 7, -1, -1)))
        self.assertTrue(np.all(expected_dense == dense))
        self.assertIsInstance(dense, np.ndarray)

        # a tree given with explicit edge weights: the result is sparse
        weighted_tree = hg.Tree(np.asarray((5, 5, 6, 6, 6, 7, 7, 7)))
        tree_weights = np.asarray((1, 2, 3, 4, 5, 6, 7))
        weighted = hg.undirected_graph_2_adjacency_matrix(weighted_tree, tree_weights)
        expected_weighted = np.asarray(((0, 0, 0, 0, 0, 1, 0, 0),
                                        (0, 0, 0, 0, 0, 2, 0, 0),
                                        (0, 0, 0, 0, 0, 0, 3, 0),
                                        (0, 0, 0, 0, 0, 0, 4, 0),
                                        (0, 0, 0, 0, 0, 0, 5, 0),
                                        (1, 2, 0, 0, 0, 0, 0, 6),
                                        (0, 0, 3, 4, 5, 0, 0, 7),
                                        (0, 0, 0, 0, 0, 6, 7, 0)))
        self.assertTrue(np.all(expected_weighted == weighted))
        self.assertTrue(sp.issparse(weighted))

        # the same tree without edge weights: every edge is expected to be marked with 1
        t = hg.Tree(np.asarray((5, 5, 6, 6, 6, 7, 7, 7)))
        unweighted = hg.undirected_graph_2_adjacency_matrix(t)
        expected_unweighted = np.asarray(((0, 0, 0, 0, 0, 1, 0, 0),
                                          (0, 0, 0, 0, 0, 1, 0, 0),
                                          (0, 0, 0, 0, 0, 0, 1, 0),
                                          (0, 0, 0, 0, 0, 0, 1, 0),
                                          (0, 0, 0, 0, 0, 0, 1, 0),
                                          (1, 1, 0, 0, 0, 0, 0, 1),
                                          (0, 0, 1, 1, 1, 0, 0, 1),
                                          (0, 0, 0, 0, 0, 1, 1, 0)))
        self.assertTrue(np.all(expected_unweighted == unweighted))
        self.assertTrue(sp.issparse(unweighted))

        # a non-zero non_edge_value combined with sparse=True must be rejected
        with self.assertRaises(Exception):
            hg.undirected_graph_2_adjacency_matrix(t, non_edge_value=-1, sparse=True)
Ejemplo n.º 12
0
    def _get_associated_mst(tree, altitudes):
        """
        Build an edge weighted undirected graph that is a valid edge mst for the given tree.

        One edge is created per non-leaf node of the tree, linking the representative leaves of its
        children 0 and 1; the weight of this edge is the altitude of the node.

        :param tree: input tree (children 0 and 1 of every non-leaf node are read)
        :param altitudes: array of node altitudes, indexed by tree node
        :return: a pair (UndirectedGraph, float32 ndarray of edge weights)
        """
        n_leaves = tree.num_leaves()
        # leaves start with their own index; values for the other nodes come from the `first` accumulator
        representative = hg.accumulate_sequential(
            tree, np.arange(n_leaves), hg.Accumulators.first)

        mst = hg.UndirectedGraph(n_leaves)
        weights = np.zeros((n_leaves - 1,), np.float32)
        for node in tree.leaves_to_root_iterator(include_leaves=False):
            first_child = tree.child(0, node)
            second_child = tree.child(1, node)
            mst.add_edge(representative[first_child], representative[second_child])
            # non-leaf node `node` is stored at position `node - n_leaves`
            weights[node - n_leaves] = altitudes[node]

        return mst, weights
Ejemplo n.º 13
0
    def test_add_edge(self):
        """Each call to add_edge adds exactly one edge, parallel edges included."""
        g = hg.UndirectedGraph(3)
        self.assertEqual(g.num_edges(), 0)

        # {0, 1} is inserted three times (once reversed): parallel edges are allowed
        insertions = ((0, 1), (0, 1), (1, 0), (0, 2))
        for expected_count, (source, target) in enumerate(insertions, start=1):
            g.add_edge(source, target)
            self.assertEqual(g.num_edges(), expected_count)
Ejemplo n.º 14
0
    def test_binary_partition_tree_exponential_linkage(self):
        """Exponential-linkage binary partition tree called with parameter -1 and explicit edge weight weights."""
        g = hg.UndirectedGraph(5)
        g.add_edges(np.asarray((0, 0, 1, 2, 2, 3)),
                    np.asarray((1, 2, 4, 3, 4, 4)))

        values = np.asarray((1, 3, 5, 2, 4, 6), dtype=np.float64)
        value_weights = np.asarray((2, 2, 1, 3, 3, 1), dtype=np.float64)

        tree, altitudes = hg.binary_partition_tree_exponential_linkage(
            g, values, -1, value_weights)

        expected_parents = np.asarray((5, 5, 6, 6, 8, 7, 7, 8, 8))
        expected_altitudes = np.asarray((0., 0., 0., 0., 0., 1., 2., 3., 4.182275))

        self.assertTrue(np.all(tree.parents() == expected_parents))
        self.assertTrue(np.allclose(altitudes, expected_altitudes))
Ejemplo n.º 15
0
    def test_adjacency_matrix_2_undirected_graph(self):
        """A 2x2 float64 adjacency matrix gives a single edge whose weight keeps the float64 dtype."""
        matrix = np.asarray(((0, 0.1), (0.1, 0)), dtype=np.float64)
        graph, edge_weights = hg.adjacency_matrix_2_undirected_graph(matrix)

        expected_graph = hg.UndirectedGraph(2)
        expected_graph.add_edge(0, 1)
        expected_weights = np.asarray((0.1, ))

        self.assertTrue(edge_weights.dtype == np.float64)
        self.assertTrue(np.all(edge_weights == expected_weights))
        self.assertEqual(graph.num_vertices(), expected_graph.num_vertices())
        self.assertEqual(graph.num_edges(), expected_graph.num_edges())

        for actual_edge, expected_edge in zip(graph.edges(), expected_graph.edges()):
            self.assertTrue(actual_edge == expected_edge)
Ejemplo n.º 16
0
def adjacency_matrix_2_undirected_graph(adjacency_matrix, non_edge_value=0):
    """
    Undirected edge-weighted graph corresponding to an adjacency matrix.

    Adjacency matrix entries which are equal to :attr:`non_edge_value` are not considered to be part of the graph.

    Only the upper triangular part of the matrix (diagonal included) is read: one edge is created
    for each entry of this part which differs from :attr:`non_edge_value`.

    :param adjacency_matrix: Input adjacency matrix (A 2d symmetric square matrix), either a dense array
        or a Scipy sparse matrix
    :param non_edge_value: Value used to represent non existing edges in the adjacency matrix
        (must be 0 if :attr:`adjacency_matrix` is a Scipy sparse matrix)
    :return: a pair (UndirectedGraph, ndarray) representing the graph and its edge_weights (Concept :class:`~higra.CptEdgeWeightedGraph`)
    :raises ValueError: if :attr:`adjacency_matrix` is not a 2d square matrix, or if it is a Scipy sparse matrix
        and :attr:`non_edge_value` is not 0
    """
    if adjacency_matrix.ndim != 2 or adjacency_matrix.shape[
            0] != adjacency_matrix.shape[1]:
        raise ValueError("'adjacency_matrix' must be a 2d square matrix.")

    # Scipy is optional: without it the input is always handled as a dense array
    try:
        import scipy.sparse as sp
    except ImportError:
        sp = None

    if sp is not None and sp.issparse(adjacency_matrix):
        if non_edge_value != 0:
            raise ValueError(
                "'non_edge_value' must be equal to 0 is 'adjacency_matrix' is a Scipy sparse matrix."
            )
        # keep the upper triangle only so that each undirected edge is reported once
        adjacency_matrix = sp.triu(adjacency_matrix)
        sources, targets, edge_weights = sp.find(adjacency_matrix)
    else:
        # work on a copy: the strictly lower triangle is overwritten below
        adjacency_matrix = adjacency_matrix.copy()
        # builtin bool instead of the np.bool alias, which is missing from NumPy 1.24-1.26
        strictly_lower = np.tri(*adjacency_matrix.shape, k=-1, dtype=bool)
        adjacency_matrix[strictly_lower] = non_edge_value
        if non_edge_value != 0:
            mask = adjacency_matrix != non_edge_value
        else:
            # with a zero non-edge value the matrix itself acts as the mask
            mask = adjacency_matrix
        sources, targets = np.nonzero(mask)
        edge_weights = adjacency_matrix[sources, targets]

    graph = hg.UndirectedGraph(adjacency_matrix.shape[0])
    graph.add_edges(sources, targets)

    return graph, edge_weights
Ejemplo n.º 17
0
    def test_minimum_spanning_forest(self):
        """minimum_spanning_tree on two disjoint triangles keeps two edges per triangle."""
        g = hg.UndirectedGraph(6)
        g.add_edges((0, 0, 1, 3, 3, 4), (1, 2, 2, 4, 5, 5))
        weights = np.asarray((0, 1, 2, 3, 4, 5))

        forest = hg.minimum_spanning_tree(g, weights)
        edge_map = hg.CptMinimumSpanningTree.get_edge_map(forest)

        self.assertEqual(forest.num_vertices(), 6)
        self.assertEqual(forest.num_edges(), 4)

        forest_sources, forest_targets = forest.edge_list()

        self.assertTrue(np.all(forest_sources == (0, 0, 3, 3)))
        self.assertTrue(np.all(forest_targets == (1, 2, 4, 5)))

        # indices, in the input graph, of the edges kept in the forest
        self.assertTrue(np.all(edge_map == (0, 1, 3, 4)))
Ejemplo n.º 18
0
 def test_graph():
     """Return a 4-vertex graph where vertices 0, 1 and 2 form a triangle and vertex 3 has no edge."""
     triangle = hg.UndirectedGraph(4)
     for source, target in ((0, 1), (1, 2), (0, 2)):
         triangle.add_edge(source, target)
     return triangle
Ejemplo n.º 19
0
def make_graph_from_points(X, graph_type="knn+mst", symmetrization="max", **kwargs):
    r"""
    Creates a graph from vertex coordinates.

    The argument :attr:`graph_type` selects the graph creation methods. Possible values are:

        - ``"complete"``: creates the complete graph
        - ``"knn"``: creates a :math:`k`-nearest neighbor graph, the parameter :math:`k` can be controlled
          with the extra parameter 'n_neighbors' (default value 5).
          The resulting graph may have several connected components.
        - ``"knn+mst"`` (default): creates a :math:`k`-nearest neighbor graph and add the edges of an mst of the complete graph.
          This method ensures that the resulting graph is connected.
          The parameter :math:`k` can be controlled with the extra parameter 'n_neighbors' (default value 5).
        - ``"delaunay"``: creates a graph corresponding to the Delaunay triangulation of the points
          (only works in low dimensions).
        - ``"mst"``: creates a graph containing only the edges of an mst of the complete graph.

    The weight of an edge :math:`\{x,y\}` is equal to the Euclidean distance between
    :math:`x` and :math:`y`: :math:`w(\{x,y\})=\|X[x, :] - X[y, :]\|`.

    :math:`K`-nearest neighbor based graphs are naturally directed, the argument :attr:`symmetrization` enables to chose a
    symmetrization strategy. Possible values are:

        - ``"min"``: an edge :math:`\{x,y\}` is created if both arcs :math:`(x,y)` and :math:`(y,x)` exist.
          Its weight is given by the minimum weight of the two arcs.
        - ``"max"``: an edge :math:`\{x,y\}` is created if there is any of the two arcs :math:`(x,y)` and :math:`(y,x)` exists.
          Its weight is given by the weight of the existing arcs (if both arcs exists they necessarily have the same weight).

    This method is not suited for large set of points.

    :param X: A 2d array of vertex coordinates
    :param graph_type: ``"complete"``, ``"knn"``, ``"knn+mst"`` (default), ``"delaunay"``, or ``"mst"``
    :param symmetrization: ``"min"`` or ``"max"`` (only used by the ``"knn"`` and ``"knn+mst"`` graph types)
    :param kwargs: extra args depends of chosen graph type: 'n_neighbors' (default 5) and 'mode'
        (default ``'distance'``) are forwarded to ``sklearn.neighbors.kneighbors_graph``
    :return: a graph and its edge weights
    :raises RuntimeError: if scipy or sklearn cannot be imported
    :raises ValueError: if :attr:`graph_type` is unknown, or if :attr:`symmetrization` is unknown and a
        :math:`k`-nearest neighbor based graph is requested
    """
    try:
        from scipy.spatial.distance import pdist, squareform, euclidean
        from sklearn.neighbors import kneighbors_graph
        from scipy.sparse.csgraph import minimum_spanning_tree
        from scipy.spatial import Delaunay
    except ImportError as import_error:
        # only a failed import is translated; any other error propagates unchanged
        raise RuntimeError("scipy and sklearn required.") from import_error

    n_neighbors = kwargs.get('n_neighbors', 5)
    mode = kwargs.get('mode', 'distance')

    def symmetrization_fun(A):
        # turn the directed (sparse) knn matrix into a symmetric one
        if symmetrization == "min":
            return A.minimum(A.T)
        elif symmetrization == "max":
            return A.maximum(A.T)
        else:
            raise ValueError("Unknown symmetrization: " + str(symmetrization))

    if graph_type == "complete":
        d = pdist(X)
        A = squareform(d)
        g, edge_weights = hg.adjacency_matrix_2_undirected_graph(A)
    elif graph_type == "knn":
        A = kneighbors_graph(X, n_neighbors=n_neighbors, mode=mode)
        A = symmetrization_fun(A)
        g, edge_weights = hg.adjacency_matrix_2_undirected_graph(A)
    elif graph_type == "knn+mst":
        A = kneighbors_graph(X, n_neighbors=n_neighbors, mode=mode)
        A = symmetrization_fun(A)
        D = squareform(pdist(X))
        MST = minimum_spanning_tree(D)
        # the mst is returned as a directed (triangular) matrix: symmetrize it
        MST = MST + MST.T
        # union of the knn graph and of the mst edges
        A = A.maximum(MST)
        g, edge_weights = hg.adjacency_matrix_2_undirected_graph(A)
    elif graph_type == "delaunay":
        g = hg.UndirectedGraph(X.shape[0])
        edge_weights = []

        # NOTE: no 'QJ' (joggle) option is given to Qhull, so coplanar points may be left out of the
        # triangulation; such points are only reported by the warning below.
        triangulation = Delaunay(X)
        if triangulation.coplanar.size != 0:
            print("Warning coplanar points detected!")

        # the neighbours of vertex k are indices[indptr[k]:indptr[k + 1]]
        indptr, indices = triangulation.vertex_neighbor_vertices

        for k in range(X.shape[0]):
            for n in indices[indptr[k]:indptr[k + 1]]:
                # each undirected edge is listed from both extremities: keep it once
                if n > k:
                    d = euclidean(X[k, :], X[n, :])
                    g.add_edge(k, n)
                    edge_weights.append(d)

        edge_weights = np.asarray(edge_weights, dtype=np.float64)
    elif graph_type == "mst":
        D = squareform(pdist(X))
        MST = minimum_spanning_tree(D).toarray()
        MST = MST + MST.T
        g, edge_weights = hg.adjacency_matrix_2_undirected_graph(MST)
    else:
        raise ValueError("Unknown graph_type: " + str(graph_type))

    return g, edge_weights
Ejemplo n.º 20
0
def main(params):
    """
    Evaluate the dendrogram purity of HAC trees built on joint entity/mention k-nn graphs.

    Steps:

        1. load the processed entity dictionary and mention data from pickle caches, or process the
           dataset and write these caches;
        2. keep only the entities that are the first gold label of a mention and stack them with the mentions;
        3. embed entities and mentions (or load 'embed_data.t7'), search nearest neighbours and build one
           similarity graph per evaluated k (cached in 'graphs.pickle');
        4. for each linkage function, build a tree with ``get_hac_tree`` on each graph, compute
           ``hg.dendrogram_purity`` and write all results to a timestamped json file in the output directory.

    NOTE: the caches are read with ``pickle.load`` / ``torch.load``, which can execute arbitrary code:
    the configured directories must only contain trusted files.

    :param params: dictionary of options. Keys read here: "output_path", "pickle_src_path",
        "embed_data_path", "knn", "use_types", "data_split", "data_path", "max_context_length",
        "max_cand_length", "context_key", "silent", "debug", "exact_knn", "embed_batch_size",
        "normalize_embeds", "only_embed_and_build", "linkage". The whole dictionary is also given
        to ``BiEncoderRanker``.
    :return: None (the process exits after the graph construction if params["only_embed_and_build"] is set)
    """
    output_path = params["output_path"]
    if not os.path.exists(output_path):
        os.makedirs(output_path)
    logger = utils.get_logger(params["output_path"], 'log')

    # Cache directories default to the output directory when missing or not configured
    pickle_src_path = params["pickle_src_path"]
    if pickle_src_path is None or not os.path.exists(pickle_src_path):
        pickle_src_path = output_path

    embed_data_path = params["embed_data_path"]
    if embed_data_path is None or not os.path.exists(embed_data_path):
        embed_data_path = output_path

    # Init model
    reranker = BiEncoderRanker(params)
    reranker.model.eval()
    tokenizer = reranker.tokenizer
    n_gpu = reranker.n_gpu

    knn = params["knn"]  # Use as the max-knn value for the graph construction
    use_types = params["use_types"]
    data_split = params["data_split"]  # Default = "test"

    # Load test data
    entity_dictionary_loaded = False
    test_dictionary_pkl_path = os.path.join(pickle_src_path,
                                            'test_dictionary.pickle')
    test_tensor_data_pkl_path = os.path.join(pickle_src_path,
                                             'test_tensor_data.pickle')
    test_mention_data_pkl_path = os.path.join(pickle_src_path,
                                              'test_mention_data.pickle')
    if os.path.isfile(test_dictionary_pkl_path):
        print("Loading stored processed entity dictionary...")
        with open(test_dictionary_pkl_path, 'rb') as read_handle:
            test_dictionary = pickle.load(read_handle)
        entity_dictionary_loaded = True
    # NOTE(review): if the tensor/mention caches exist but the dictionary cache does not,
    # 'test_dictionary' is never assigned and the entity selection below fails - confirm this cannot happen.
    if os.path.isfile(test_tensor_data_pkl_path) and os.path.isfile(
            test_mention_data_pkl_path):
        print("Loading stored processed test data...")
        with open(test_tensor_data_pkl_path, 'rb') as read_handle:
            test_tensor_data = pickle.load(read_handle)
        with open(test_mention_data_pkl_path, 'rb') as read_handle:
            mention_data = pickle.load(read_handle)
    else:
        test_samples = utils.read_dataset(data_split, params["data_path"])
        if not entity_dictionary_loaded:
            with open(os.path.join(params["data_path"], 'dictionary.pickle'),
                      'rb') as read_handle:
                test_dictionary = pickle.load(read_handle)

        # Check if dataset has multiple ground-truth labels
        mult_labels = "labels" in test_samples[0].keys()
        # Filter samples without gold entities
        test_samples = list(
            filter(
                lambda sample: (len(sample["labels"]) > 0) if mult_labels else
                (sample["label"] is not None), test_samples))
        logger.info("Read %d test samples." % len(test_samples))

        mention_data, test_dictionary, test_tensor_data = data_process.process_mention_data(
            test_samples,
            test_dictionary,
            tokenizer,
            params["max_context_length"],
            params["max_cand_length"],
            multi_label_key="labels" if mult_labels else None,
            context_key=params["context_key"],
            silent=params["silent"],
            logger=logger,
            debug=params["debug"],
            knn=knn,
            dictionary_processed=entity_dictionary_loaded)
        print("Saving processed test data...")
        if not entity_dictionary_loaded:
            with open(test_dictionary_pkl_path, 'wb') as write_handle:
                pickle.dump(test_dictionary,
                            write_handle,
                            protocol=pickle.HIGHEST_PROTOCOL)
        with open(test_tensor_data_pkl_path, 'wb') as write_handle:
            pickle.dump(test_tensor_data,
                        write_handle,
                        protocol=pickle.HIGHEST_PROTOCOL)
        with open(test_mention_data_pkl_path, 'wb') as write_handle:
            pickle.dump(mention_data,
                        write_handle,
                        protocol=pickle.HIGHEST_PROTOCOL)

    # Reducing the entity dictionary to only the ground truth of the mention queries
    # Combining the entities and mentions into one structure for joint embedding and indexing
    new_ents = {}  # dictionary index of an entity -> its position in new_ents_arr
    new_ents_arr = []
    men_labels = []
    for men in mention_data:
        ent = men['label_idxs'][0]
        if ent not in new_ents:
            new_ents[ent] = len(new_ents_arr)
            new_ents_arr.append(ent)
        men_labels.append(new_ents[ent])
    ent_labels = list(range(len(new_ents_arr)))
    new_ent_vecs = torch.tensor(
        [test_dictionary[x]['ids'] for x in new_ents_arr])
    new_ent_types = [{"type": test_dictionary[x]['type']} for x in new_ents_arr]
    test_men_vecs = test_tensor_data[:][0]

    n_mentions = len(test_tensor_data)
    n_entities = len(new_ent_vecs)
    n_embeds = n_mentions + n_entities
    # Entities come first, then mentions; a mention carries the label of its first gold entity
    leaf_labels = np.array(ent_labels + men_labels, dtype=int)
    all_vecs = torch.cat((new_ent_vecs, test_men_vecs))
    all_types = new_ent_types + mention_data  # Array of dicts containing key "type" for selected ents and all mentions

    # Values of k to run the evaluation against: 25, 50, 100, ... up to knn, or the single exact value
    # (log2 is exact for powers of two, unlike log(x, 2) which may round just below the integer)
    knn_vals = [25 * 2**i for i in range(int(math.log2(knn / 25)) + 1)
                ] if params["exact_knn"] is None else [params["exact_knn"]]
    # Store the maximum evaluation k
    max_knn = knn_vals[-1]

    time_start = time.time()
    # Stays at 0 when the graphs are loaded from the cache (no neighbour search is run)
    knn_fetch_time = 0.

    # Check if graphs are already built
    graph_path = os.path.join(output_path, 'graphs.pickle')
    if os.path.isfile(graph_path):
        print("Loading stored joint graphs...")
        with open(graph_path, 'rb') as read_handle:
            joint_graphs = pickle.load(read_handle)
    else:
        # Check and load stored embedding data
        embed_data_path = os.path.join(embed_data_path, 'embed_data.t7')
        embed_data = None
        if os.path.isfile(embed_data_path):
            embed_data = torch.load(embed_data_path)

        if embed_data is not None:
            logger.info('Loading stored embeddings')
            embeds = embed_data['embeds']
        else:
            logger.info("Embedding data")
            # Entities go through the candidate encoder, mentions through the context encoder
            dict_embeds = data_process.embed_and_index(
                reranker,
                all_vecs[:n_entities],
                encoder_type='candidate',
                only_embed=True,
                n_gpu=n_gpu,
                batch_size=params['embed_batch_size'])
            men_embeds = data_process.embed_and_index(
                reranker,
                all_vecs[n_entities:],
                encoder_type='context',
                only_embed=True,
                n_gpu=n_gpu,
                batch_size=params['embed_batch_size'])
            embeds = np.concatenate((dict_embeds, men_embeds), axis=0)
        if use_types:
            if embed_data is not None and 'idxs_by_type' in embed_data:
                idxs_by_type = embed_data['idxs_by_type']
            else:
                idxs_by_type = data_process.get_idxs_by_type(all_types)

        # Save computed embedding data if not loaded from disk
        if embed_data is None:
            embed_data = {}
            embed_data['embeds'] = embeds
            if use_types:
                embed_data['idxs_by_type'] = idxs_by_type
            # NOTE: Cannot pickle faiss index because it is a SwigPyObject
            torch.save(embed_data,
                       embed_data_path,
                       pickle_protocol=pickle.HIGHEST_PROTOCOL)

        # Build faiss search index (only once, after the optional normalization)
        if params["normalize_embeds"]:
            # NOTE(review): axis=0 normalizes along the first axis, not each embedding vector - confirm intended
            embeds = normalize(embeds, axis=0)
        logger.info("Building KNN index...")
        if use_types:
            search_indexes = data_process.get_index_from_embeds(
                embeds, corpus_idxs=idxs_by_type, force_exact_search=True)
        else:
            search_index = data_process.get_index_from_embeds(
                embeds, force_exact_search=True)

        logger.info("Starting KNN search...")
        # max_knn + 1 neighbours are requested because a node is usually its own nearest neighbour
        if not use_types:
            faiss_dists, faiss_idxs = search_index.search(embeds, max_knn + 1)
        else:
            query_len = n_embeds
            faiss_idxs = np.zeros((query_len, max_knn + 1))
            faiss_dists = np.zeros((query_len, max_knn + 1), dtype=float)
            for entity_type in search_indexes:
                embeds_by_type = embeds[idxs_by_type[entity_type]]
                nn_dists_by_type, nn_idxs_by_type = search_indexes[
                    entity_type].search(embeds_by_type, max_knn + 1)
                for i, idx in enumerate(idxs_by_type[entity_type]):
                    faiss_idxs[idx] = nn_idxs_by_type[i]
                    faiss_dists[idx] = nn_dists_by_type[i]
        logger.info("Search finished")

        logger.info('Building graphs')
        # Edges are gathered in plain lists (keyed on k, the number of nearest nodes retrieved) and
        # converted to arrays once at the end: growing arrays with np.append copies them at every step
        edge_buffers = {k: {'rows': [], 'cols': [], 'data': []} for k in knn_vals}
        # Find the most similar nodes for each mention and node in the set (minus self)
        for idx in trange(n_embeds):
            # Compute adjacent node edge weight
            if idx != 0:
                adj_idx = idx - 1
                adj_data = float(embeds[adj_idx] @ embeds[idx])
            nn_idxs = faiss_idxs[idx]
            nn_scores = faiss_dists[idx]
            # Filter candidates to remove mention query and keep only the top k candidates
            filter_mask = nn_idxs != idx
            nn_idxs = nn_idxs[filter_mask][:max_knn]
            nn_scores = nn_scores[filter_mask][:max_knn]
            # Add edges to the graphs
            for k, edges in edge_buffers.items():
                # Add edge to adjacent node to force the graph to be connected
                if idx != 0:
                    edges['rows'].append(adj_idx)
                    edges['cols'].append(idx)
                    edges['data'].append(adj_data)
                # Add mention-mention edges
                edges['rows'].extend([idx] * k)
                edges['cols'].extend(nn_idxs[:k].tolist())
                edges['data'].extend(nn_scores[:k].tolist())

        # Graphs storing mention-mention and mention-entity similarity score edges, keyed on k
        joint_graphs = {}
        for k in knn_vals:
            joint_graphs[k] = {
                'rows': np.asarray(edge_buffers[k]['rows'], dtype=np.float64),
                'cols': np.asarray(edge_buffers[k]['cols'], dtype=np.float64),
                'data': np.asarray(edge_buffers[k]['data'], dtype=np.float64),
                'shape': (n_embeds, n_embeds)
            }

        knn_fetch_time = time.time() - time_start
        # Pickle the graphs
        print("Saving joint graphs...")
        with open(graph_path, 'wb') as write_handle:
            pickle.dump(joint_graphs,
                        write_handle,
                        protocol=pickle.HIGHEST_PROTOCOL)

        if params['only_embed_and_build']:
            logger.info(f"Saved embedding data at: {embed_data_path}")
            logger.info(f"Saved graphs at: {graph_path}")
            exit()

    results = {
        'n_leaves': n_embeds,
        'n_entities': n_entities,
        'n_mentions': n_mentions
    }

    graph_processing_time = time.time()
    n_graphs_processed = 0.
    linkage_fns = ["single", "complete", "average"] if params["linkage"] is None \
        else [params["linkage"]]  # Different HAC linkage functions to run the analyses over

    for fn in linkage_fns:
        logger.info(f"Linkage function: {fn}")
        purities = []
        fn_result = {}
        for k in joint_graphs:
            graph = hg.UndirectedGraph(n_embeds)
            graph.add_edges(joint_graphs[k]['rows'], joint_graphs[k]['cols'])
            weights = -joint_graphs[k][
                'data']  # Since Higra expects weights as distances, not similarity
            tree = get_hac_tree(graph, weights, linkage=fn)
            purity = hg.dendrogram_purity(tree, leaf_labels)
            fn_result[f"purity@{k}nn"] = purity
            logger.info(f"purity@{k}nn = {purity}")
            purities.append(purity)
            n_graphs_processed += 1
        fn_result["average"] = round(np.mean(purities), 4)
        logger.info(f"average = {fn_result['average']}")
        results[fn] = fn_result

    # All durations below are reported in minutes
    avg_graph_processing_time = (time.time() -
                                 graph_processing_time) / n_graphs_processed
    avg_per_graph_time = (knn_fetch_time + avg_graph_processing_time) / 60
    execution_time = (time.time() - time_start) / 60

    # Store results in a file suffixed with the current time (whole seconds since the epoch)
    output_file_name = os.path.join(output_path,
                                    f"results_{int(time.time())}")

    logger.info(f"Results: \n {results}")
    with open(f'{output_file_name}.json', 'w') as f:
        json.dump(results, f, indent=2)
        print(f"\nResults saved at: {output_file_name}.json")

    logger.info("\nThe avg. per graph evaluation time is {} minutes\n".format(
        avg_per_graph_time))
    logger.info(
        "\nThe total evaluation took {} minutes\n".format(execution_time))
Ejemplo n.º 21
0
def subgraph(graph, edge_indices, spanning=True, return_vertex_map=False):
    r"""
    Extract a subgraph of the input graph. Let :math:`G=(V,E)` be the graph :attr:`graph` and let :math:`E^*`
    be a subset of :math:`E`. The subgraph of :math:`G` induced by :math:`E^*` is equal to:

    - :math:`(V, E^*)` if :attr:`spanning` is ``True``; and
    - :math:`(\bigcup E^*, E^*)` otherwise (the set of vertices of the subgraph is equal to the set of vertices present at
      an extremity of an edge in :math:`E^*`).

    The array :attr:`edge_indices` contains the indices of the edges in the set :math:`E^*`. The edges in the subgraph
    are in the same order as the edges in the array :attr:`edge_indices`.

    If :attr:`spanning` is ``False``, the subgraph may contain fewer vertices than the input graph. In such case, the
    optional array result :math:`vertex\_map` (returned if :attr:`return_vertex_map` is ``True``) indicates for each
    vertex :math:`i` of the subgraph, its corresponding index in the input graph.

    :Example:

        >>> # linear graph with 6 vertices
        >>> graph = hg.UndirectedGraph(6)
        >>> graph.add_edges(np.arange(5), np.arange(1, 6))
        >>>
        >>> # select edges (4, 5), (0, 1), and (3, 4), note that vertex 2 is not in any edge
        >>> edge_indices = np.asarray((4, 0, 3))
        >>> subgraph, vertex_map = hg.subgraph(graph, edge_indices, spanning=False, return_vertex_map=True)
        >>>
        >>> subgraph.num_vertices()
        5
        >>> vertex_map
        [0 1 3 4 5]
        >>> subgraph.edge_list()
        ([3 0 2], [4 1 3])

    :param graph: input graph.
    :param edge_indices: an array (or any sequence convertible with ``np.asarray``) of edge indices of the input graph.
    :param spanning: if ``True``, the subgraph has the same vertex set as the input graph.
    :param return_vertex_map: if ``True``, also returns an array mapping each vertex of the subgraph to its
           corresponding vertex in the input graph.
    :return: a subgraph and, if :attr:`return_vertex_map` is ``True``, a vertex map
    """
    # Normalise the selection so that lists and tuples behave like ndarrays.
    edge_indices = np.asarray(edge_indices)
    if edge_indices.size == 0 and edge_indices.dtype.kind not in "iub":
        # An empty Python sequence converts to a float array, which numpy
        # rejects as an index; an empty integer index selects nothing.
        edge_indices = edge_indices.astype(np.intp)

    all_sources, all_targets = graph.edge_list()
    sources = all_sources[edge_indices]
    targets = all_targets[edge_indices]

    if spanning:
        # Same vertex set as the input graph: the vertex map is the identity.
        result = hg.UndirectedGraph(graph.num_vertices())
        result.add_edges(sources, targets)
        if return_vertex_map:
            vertex_map = np.arange(graph.num_vertices())
    else:
        # Keep only the vertices touched by a selected edge. np.unique gives
        # the sorted surviving vertices (vertex_map) and, through the inverse,
        # the new index of every edge extremity.
        extremities = np.concatenate((sources, targets))
        vertex_map, inverse = np.unique(extremities, return_inverse=True)

        # Split on the number of edges actually selected rather than on
        # edge_indices.size, which differs for boolean masks.
        num_selected = sources.size
        result = hg.UndirectedGraph(vertex_map.size)
        result.add_edges(inverse[:num_selected], inverse[num_selected:])

    if return_vertex_map:
        return result, vertex_map
    return result
Ejemplo n.º 22
0
def __reduce_ctr(num_vertices, sources, targets):
    """
    Build an undirected graph from a vertex count and an edge list.

    NOTE(review): the name suggests this is the reconstruction callable used
    when pickling graphs (``__reduce__``) -- confirm against the caller.

    :param num_vertices: number of vertices of the graph to create.
    :param sources: source vertex of each edge.
    :param targets: target vertex of each edge.
    :return: a new ``hg.UndirectedGraph`` holding the given edges
    """
    rebuilt = hg.UndirectedGraph(num_vertices)
    rebuilt.add_edges(sources, targets)
    return rebuilt
Ejemplo n.º 23
0
def make_graph_from_points(X, graph_type="knn+mst", **kwargs):
    r"""
    Creates a graph from vertex coordinates.

    Possible graph creation methods are:

        - 'complete': creates the complete graph
        - 'knn': creates a :math:`k`-nearest neighbor graph, the parameter :math:`k` can be controlled
          with the extra parameter 'n_neighbors' (default value 5).
          The resulting graph may have several connected components.
        - 'knn+mst' (default): creates a :math:`k`-nearest neighbor graph and add the edges of an mst of the complete graph.
          This method ensures that the resulting graph is connected.
          The parameter :math:`k` can be controlled with the extra parameter 'n_neighbors' (default value 5).
        - 'mst': creates a minimum spanning tree of the complete graph.
        - 'delaunay': creates a graph corresponding to the Delaunay triangulation of the points
          (only works in low dimensions).

    With the default settings, the weight of an edge :math:`\{x,y\}` is equal to the Euclidean distance between
    :math:`x` and :math:`y`: :math:`w(\{x,y\})=\|X[x, :] - X[y, :]\|`.

    This method is not suited for large set of points.

    Recognised extra arguments:

        - 'n_neighbors' (default 5): number of neighbors for the 'knn' and 'knn+mst' graph types.
        - 'mode' (default 'distance'): forwarded as ``mode`` to ``sklearn.neighbors.kneighbors_graph``
          for the 'knn' and 'knn+mst' graph types.

    :param X: A 2d array of vertex coordinates
    :param graph_type: 'complete', 'knn', 'knn+mst' (default), 'mst', or 'delaunay'
    :param kwargs: extra args depends of chosen graph type
    :return: a graph and its edge weights
    :raises RuntimeError: if scipy or sklearn cannot be imported
    :raises ValueError: if :attr:`graph_type` is not one of the supported values
    """
    try:
        from scipy.spatial.distance import pdist, squareform, euclidean
        from sklearn.neighbors import kneighbors_graph
        from scipy.sparse.csgraph import minimum_spanning_tree
        from scipy.spatial import Delaunay
    except ImportError as err:
        # Only a missing dependency is translated; anything else (including
        # KeyboardInterrupt) must propagate untouched.
        raise RuntimeError("scipy and sklearn required.") from err

    n_neighbors = kwargs.get('n_neighbors', 5)
    mode = kwargs.get('mode', 'distance')

    def knn_adjacency():
        # `mode` is keyword-only in current scikit-learn releases.
        return kneighbors_graph(X, n_neighbors, mode=mode).toarray()

    def symmetric_mst_adjacency():
        # minimum_spanning_tree returns one orientation of each tree edge;
        # adding the transpose yields a symmetric adjacency matrix.
        mst = minimum_spanning_tree(squareform(pdist(X))).toarray()
        return mst + mst.T

    if graph_type == "complete":
        adjacency = squareform(pdist(X))
        g, edge_weights = hg.adjacency_matrix_2_undirected_graph(adjacency)
    elif graph_type == "knn":
        g, edge_weights = hg.adjacency_matrix_2_undirected_graph(knn_adjacency())
    elif graph_type == "knn+mst":
        # Union of the knn edges and the mst edges (entry-wise maximum).
        adjacency = np.maximum(knn_adjacency(), symmetric_mst_adjacency())
        g, edge_weights = hg.adjacency_matrix_2_undirected_graph(adjacency)
    elif graph_type == "delaunay":
        points = np.asarray(X)
        num_points = points.shape[0]
        g = hg.UndirectedGraph(num_points)
        edge_weights = []

        # NOTE: no 'QJ' qhull option is passed here, so coplanar points are
        # only reported, not forced into the triangulation.
        triangulation = Delaunay(points)
        if triangulation.coplanar.size != 0:
            print("Warning coplanar points detected!")

        # CSR-like layout: the neighbours of vertex k are
        # indices[indptr[k]:indptr[k + 1]].
        indptr, indices = triangulation.vertex_neighbor_vertices

        for k in range(num_points):
            for n in indices[indptr[k]:indptr[k + 1]]:
                # Each undirected edge is listed from both extremities;
                # keep it once, from the smaller vertex index.
                if n > k:
                    g.add_edge(k, n)
                    edge_weights.append(euclidean(points[k, :], points[n, :]))

        edge_weights = np.asarray(edge_weights, dtype=np.float64)
    elif graph_type == "mst":
        g, edge_weights = hg.adjacency_matrix_2_undirected_graph(symmetric_mst_adjacency())
    else:
        raise ValueError("Unknown graph_type: " + str(graph_type))

    return g, edge_weights