def test_make_graph_from_points_knn_and_mst(self):
    """Check the "knn+mst" point graph for both symmetrization modes."""
    points = np.asarray(
        ((0, 0), (0, 1), (1, 0), (0, 3), (0, 4), (1, 3), (2, 3)))
    diagonal = np.sqrt(2)

    # One row per case: (symmetrization, sources, targets, weights).
    cases = (
        ("max",
         (0, 0, 1, 3, 3, 4, 5, 3, 1),
         (1, 2, 2, 5, 4, 5, 6, 6, 3),
         (1, 1, diagonal, 1, 1, diagonal, 1, 2, 2)),
        ("min",
         (0, 0, 1, 3, 3, 5, 1),
         (1, 2, 2, 5, 4, 6, 3),
         (1, 1, diagonal, 1, 1, 1, 2)),
    )

    for mode, ref_sources, ref_targets, ref_weights in cases:
        graph, weights = hg.make_graph_from_points(
            points,
            graph_type="knn+mst",
            symmetrization=mode,
            n_neighbors=2)

        expected = hg.UndirectedGraph(7)
        expected.add_edges(ref_sources, ref_targets)

        same = TestAlgorithmGraphCore.graph_equal(
            graph, weights, expected, ref_weights)
        self.assertTrue(same)
def test_add_vertex(self):
    """add_vertex returns the new vertex index and grows the graph by one."""
    graph = hg.UndirectedGraph()
    self.assertTrue(graph.num_vertices() == 0)

    for expected_index in (0, 1):
        self.assertTrue(graph.add_vertex() == expected_index)
        self.assertTrue(graph.num_vertices() == expected_index + 1)

    # A graph can also be created with a given number of vertices.
    presized = hg.UndirectedGraph(3)
    self.assertTrue(presized.num_vertices() == 3)
def test_add_edges(self):
    """Bulk add_edges must produce the same edges as repeated add_edge."""
    one_by_one = hg.UndirectedGraph(3)
    for source, target in ((0, 1), (0, 2)):
        one_by_one.add_edge(source, target)

    bulk = hg.UndirectedGraph(3)
    bulk.add_edges((0, 0), (1, 2))

    self.assertTrue(bulk.num_edges() == 2)
    for index in range(bulk.num_edges()):
        reference_edge = one_by_one.edge_from_index(index)
        self.assertTrue(reference_edge == bulk.edge_from_index(index))
def test_graphWrite(self):
    """Write a 15-vertex chain with save_graph_pink and compare to the reference file."""
    global graph_file
    out_path = "testWriteGraphPink.graph"
    # Start from a clean state in case a previous run left the file behind.
    silent_remove(out_path)

    num_vertices = 15
    chain = hg.UndirectedGraph(num_vertices)
    for vertex in range(num_vertices - 1):
        chain.add_edge(vertex, vertex + 1)

    vertex_weights = np.arange(1, 16)
    edges_weights = np.asarray((3, 0, 0, 1, 3, 0, 1, 0, 2, 0, 1, 0, 3, 0))
    shape = (3, 5)

    hg.save_graph_pink(out_path, chain, vertex_weights, edges_weights, shape)
    self.assertTrue(os.path.exists(out_path))

    with open(out_path, 'r') as written:
        data = written.read()
    silent_remove(out_path)

    with open(graph_file, 'r') as reference:
        data_ref = reference.read()
    self.assertTrue(data == data_ref)

    # Test default attributes
    hg.save_graph_pink(out_path, chain)
    self.assertTrue(os.path.exists(out_path))
    silent_remove(out_path)
def test_adjacency_matrix_2_undirected_graph_non_edge_values(self):
    """Convert an adjacency matrix that uses -1 as its non-edge marker."""
    matrix = np.asarray((
        (-1, 1, 2, 3, 4),
        (1, -1, 5, -1, -1),
        (2, 5, -1, 6, 7),
        (3, -1, 6, -1, -1),
        (4, -1, 7, -1, -1),
    ))
    graph, edge_weights = hg.adjacency_matrix_2_undirected_graph(matrix, -1)

    expected_graph = hg.UndirectedGraph(5)
    expected_edges = ((0, 1), (0, 2), (0, 3), (0, 4), (1, 2), (2, 3), (2, 4))
    for source, target in expected_edges:
        expected_graph.add_edge(source, target)
    expected_weights = np.asarray((1, 2, 3, 4, 5, 6, 7))

    self.assertTrue(np.all(edge_weights == expected_weights))
    self.assertTrue(graph.num_vertices() == expected_graph.num_vertices())
    self.assertTrue(graph.num_edges() == expected_graph.num_edges())
    for actual, expected in zip(graph.edges(), expected_graph.edges()):
        self.assertTrue(actual == expected)
def test_binary_partition_tree_average_linkage2(self):
    """Average-linkage binary partition tree on a 10-vertex, 21-edge graph."""
    edge_sources = (0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 4, 4, 5, 5,
                    7, 7)
    edge_targets = (3, 6, 4, 2, 5, 3, 6, 9, 7, 3, 8, 5, 9, 4, 6, 9, 7, 8, 6,
                    9, 8)
    graph = hg.UndirectedGraph(10)
    graph.add_edges(edge_sources, edge_targets)

    edge_values = np.asarray(
        (0.87580029, 0.60123697, 0.79924759, 0.74221387, 0.75418382,
         0.66159356, 1.31856839, 0.76080612, 1.08881471, 0.98557615,
         0.61454158, 0.50913424, 0.63556478, 0.64684775, 1.14865302,
         0.81741018, 2.1591071, 0.60563004, 2.06636665, 1.35617725,
         0.83085949),
        dtype=np.float64)

    tree, altitudes = hg.binary_partition_tree_average_linkage(
        graph, edge_values)

    ref_parents = np.asarray(
        (11, 14, 10, 13, 15, 10, 11, 18, 12, 13, 12, 17, 16, 14, 15, 16, 17,
         18, 18),
        dtype=np.int64)
    ref_altitudes = np.asarray(
        (0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0.509134, 0.601237, 0.610086, 0.635565, 0.661594, 0.732129,
         0.810695, 1.241727, 1.35874),
        dtype=np.float64)

    self.assertTrue(np.all(ref_parents == tree.parents()))
    self.assertTrue(np.allclose(ref_altitudes, altitudes))
def test_add_vertices(self):
    """add_vertices(n) grows the vertex count by n each time."""
    graph = hg.UndirectedGraph()
    self.assertTrue(graph.num_vertices() == 0)

    expected_total = 0
    for batch in (3, 2):
        graph.add_vertices(batch)
        expected_total += batch
        self.assertTrue(graph.num_vertices() == expected_total)
def test_make_graph_from_points_complete(self):
    """Complete graph over three points forming a right-angled triangle."""
    points = np.asarray(((0, 0), (0, 1), (1, 0)))
    graph, weights = hg.make_graph_from_points(points, graph_type="complete")

    expected = hg.UndirectedGraph(3)
    expected.add_edges((0, 0, 1), (1, 2, 2))
    expected_weights = (1, 1, np.sqrt(2))

    same = TestAlgorithmGraphCore.graph_equal(
        graph, weights, expected, expected_weights)
    self.assertTrue(same)
def test_subgraph_spanning(self):
    """Extract a subgraph (spanning=False) of a 6-vertex chain, with its vertex map."""
    chain = hg.UndirectedGraph(6)
    chain.add_edges(np.arange(5), np.arange(1, 6))

    selected = np.asarray((4, 0, 3))
    sub, vertex_map = hg.subgraph(
        chain, selected, spanning=False, return_vertex_map=True)

    self.assertTrue(sub.num_vertices() == 5)
    self.assertTrue(sub.num_edges() == len(selected))

    sub_sources, sub_targets = sub.edge_list()
    self.assertTrue(np.all(vertex_map == (0, 1, 3, 4, 5)))
    # Mapped back through vertex_map, the edges are those of the input graph.
    self.assertTrue(np.all(vertex_map[sub_sources] == (4, 0, 3)))
    self.assertTrue(np.all(vertex_map[sub_targets] == (5, 1, 4)))
def test_binary_partition_tree_ward_linkage(self):
    """Ward-linkage binary partition tree, with and without altitude correction."""

    def check(result, ref_parents, ref_altitudes):
        # Compare a (tree, altitudes) result with the reference arrays.
        tree, altitudes = result
        expected_parents = np.asarray(ref_parents, dtype=np.int64)
        expected_altitudes = np.asarray(ref_altitudes, dtype=np.float64)
        self.assertTrue(np.all(expected_parents == tree.parents()))
        self.assertTrue(np.allclose(expected_altitudes, altitudes))

    # Case 1: 2d centroids with explicit vertex sizes.
    graph = hg.UndirectedGraph(5)
    graph.add_edges((0, 0, 0, 1, 2, 2, 3), (1, 2, 3, 2, 3, 4, 4))
    centroids = np.asarray(((0, 0), (1, 1), (1, 3), (-3, 4), (-1, 5)))
    sizes = np.asarray((1, 1, 1, 2, 1))
    check(
        hg.binary_partition_tree_ward_linkage(graph, centroids, sizes),
        (5, 5, 7, 6, 6, 7, 8, 8, 8),
        (0., 0., 0., 0., 0., 1., 3.333333, 4.333333, 27.))

    # Case 2: 1d centroids, no vertex sizes given.
    graph = hg.UndirectedGraph(3)
    graph.add_edges((0, 1), (2, 2))
    centroids = np.asarray(((0, ), (1, ), (5, )))
    check(
        hg.binary_partition_tree_ward_linkage(graph, centroids),
        (4, 3, 3, 4, 4),
        (0., 0., 0., 8, 8))

    # Case 3: same input, altitude correction disabled.
    check(
        hg.binary_partition_tree_ward_linkage(
            graph, centroids, altitude_correction="none"),
        (4, 3, 3, 4, 4),
        (0., 0., 0., 8, 6))
def test_undirected_graph_2_adjacency_matrix(self):
    """Adjacency matrix export: dense with a custom non-edge value, sparse by default."""
    weights = np.asarray((1, 2, 3, 4, 5, 6, 7))

    # Dense output with -1 marking missing edges.
    graph = hg.UndirectedGraph(5)
    for source, target in ((0, 1), (0, 2), (0, 3), (0, 4),
                           (1, 2), (2, 3), (2, 4)):
        graph.add_edge(source, target)
    dense = hg.undirected_graph_2_adjacency_matrix(
        graph, weights, non_edge_value=-1, sparse=False)
    expected_dense = np.asarray((
        (-1, 1, 2, 3, 4),
        (1, -1, 5, -1, -1),
        (2, 5, -1, 6, 7),
        (3, -1, 6, -1, -1),
        (4, -1, 7, -1, -1),
    ))
    self.assertTrue(np.all(expected_dense == dense))
    self.assertTrue(isinstance(dense, np.ndarray))

    # Tree input with explicit edge weights: the result is sparse.
    tree_parents = (5, 5, 6, 6, 6, 7, 7, 7)
    weighted_tree = hg.Tree(np.asarray(tree_parents))
    weights = np.asarray((1, 2, 3, 4, 5, 6, 7))
    weighted = hg.undirected_graph_2_adjacency_matrix(weighted_tree, weights)
    expected_weighted = np.asarray((
        (0, 0, 0, 0, 0, 1, 0, 0),
        (0, 0, 0, 0, 0, 2, 0, 0),
        (0, 0, 0, 0, 0, 0, 3, 0),
        (0, 0, 0, 0, 0, 0, 4, 0),
        (0, 0, 0, 0, 0, 0, 5, 0),
        (1, 2, 0, 0, 0, 0, 0, 6),
        (0, 0, 3, 4, 5, 0, 0, 7),
        (0, 0, 0, 0, 0, 6, 7, 0),
    ))
    self.assertTrue(np.all(expected_weighted == weighted))
    self.assertTrue(sp.issparse(weighted))

    # Tree input without weights: every edge is expected to have value 1.
    plain_tree = hg.Tree(np.asarray(tree_parents))
    unweighted = hg.undirected_graph_2_adjacency_matrix(plain_tree)
    expected_unweighted = np.asarray((
        (0, 0, 0, 0, 0, 1, 0, 0),
        (0, 0, 0, 0, 0, 1, 0, 0),
        (0, 0, 0, 0, 0, 0, 1, 0),
        (0, 0, 0, 0, 0, 0, 1, 0),
        (0, 0, 0, 0, 0, 0, 1, 0),
        (1, 1, 0, 0, 0, 0, 0, 1),
        (0, 0, 1, 1, 1, 0, 0, 1),
        (0, 0, 0, 0, 0, 1, 1, 0),
    ))
    self.assertTrue(np.all(expected_unweighted == unweighted))
    self.assertTrue(sp.issparse(unweighted))

    # A non-zero non_edge_value together with sparse=True must be rejected.
    with self.assertRaises(Exception):
        hg.undirected_graph_2_adjacency_matrix(
            plain_tree, non_edge_value=-1, sparse=True)
def _get_associated_mst(tree, altitudes):
    """
    Create a valid edge mst for the given tree (returns an edge weighted undirected graph)

    One edge is added per internal node of the tree, linking the values
    accumulated (with ``hg.Accumulators.first``) on its children 0 and 1, and
    weighted by the altitude of that node. Only children 0 and 1 are read, so
    the tree is presumably binary -- TODO confirm with callers.

    :param tree: input tree
    :param altitudes: array indexed by tree node
    :return: a pair (UndirectedGraph, ndarray of float32 edge weights)
    """
    num_leaves = tree.num_leaves()
    # For each node, one leaf index propagated up from the leaves.
    representative = hg.accumulate_sequential(
        tree, np.arange(num_leaves), hg.Accumulators.first)

    mst = hg.UndirectedGraph(num_leaves)
    mst_weights = np.zeros((num_leaves - 1,), np.float32)

    for node in tree.leaves_to_root_iterator(include_leaves=False):
        first_leaf = representative[tree.child(0, node)]
        second_leaf = representative[tree.child(1, node)]
        mst.add_edge(first_leaf, second_leaf)
        # Internal nodes are numbered after the leaves, hence the offset.
        mst_weights[node - num_leaves] = altitudes[node]

    return mst, mst_weights
def test_add_edge(self):
    """Every add_edge call adds one edge, including parallel and reversed ones."""
    graph = hg.UndirectedGraph(3)
    self.assertTrue(graph.num_edges() == 0)

    # (0, 1) is inserted twice and then reversed: parallel edges are allowed.
    insertions = ((0, 1), (0, 1), (1, 0), (0, 2))
    for expected_count, (source, target) in enumerate(insertions, start=1):
        graph.add_edge(source, target)
        self.assertTrue(graph.num_edges() == expected_count)
def test_binary_partition_tree_exponential_linkage(self):
    """Exponential-linkage binary partition tree with parameter -1 and weighted edges."""
    graph = hg.UndirectedGraph(5)
    graph.add_edges(np.asarray((0, 0, 1, 2, 2, 3)),
                    np.asarray((1, 2, 4, 3, 4, 4)))

    values = np.asarray((1, 3, 5, 2, 4, 6), dtype=np.float64)
    value_weights = np.asarray((2, 2, 1, 3, 3, 1), dtype=np.float64)

    tree, altitudes = hg.binary_partition_tree_exponential_linkage(
        graph, values, -1, value_weights)

    expected_parents = np.asarray((5, 5, 6, 6, 8, 7, 7, 8, 8))
    expected_altitudes = np.asarray(
        (0., 0., 0., 0., 0., 1., 2., 3., 4.182275))

    self.assertTrue(np.all(tree.parents() == expected_parents))
    self.assertTrue(np.allclose(altitudes, expected_altitudes))
def test_adjacency_matrix_2_undirected_graph(self):
    """A 2x2 float matrix gives a single edge and keeps the float64 weight dtype."""
    matrix = np.asarray(((0, 0.1), (0.1, 0)), dtype=np.float64)
    graph, edge_weights = hg.adjacency_matrix_2_undirected_graph(matrix)

    expected_graph = hg.UndirectedGraph(2)
    expected_graph.add_edge(0, 1)
    expected_weights = np.asarray((0.1, ))

    self.assertTrue(edge_weights.dtype == np.float64)
    self.assertTrue(np.all(edge_weights == expected_weights))
    self.assertTrue(graph.num_vertices() == expected_graph.num_vertices())
    self.assertTrue(graph.num_edges() == expected_graph.num_edges())
    for actual, expected in zip(graph.edges(), expected_graph.edges()):
        self.assertTrue(actual == expected)
def adjacency_matrix_2_undirected_graph(adjacency_matrix, non_edge_value=0):
    """
    Undirected edge-weighted graph corresponding to an adjacency matrix.

    Adjacency matrix entries which are equal to :attr:`non_edge_value` are not considered to be part of the graph.
    Only the upper triangle of the matrix (diagonal included) is read.

    :param adjacency_matrix: Input adjacency matrix (A 2d symmetric square matrix, dense or Scipy sparse)
    :param non_edge_value: Value used to represent non existing edges in the adjacency matrix
        (must be 0 for a Scipy sparse matrix)
    :return: a pair (UndirectedGraph, ndarray) representing the graph and its edge_weights (Concept :class:`~higra.CptEdgeWeightedGraph`)
    :raises ValueError: if the matrix is not 2d and square, or if a non zero
        :attr:`non_edge_value` is given with a Scipy sparse matrix
    """
    if adjacency_matrix.ndim != 2 or \
            adjacency_matrix.shape[0] != adjacency_matrix.shape[1]:
        raise ValueError("'adjacency_matrix' must be a 2d square matrix.")

    # scipy is optional: without it the input is treated as a dense array.
    try:
        import scipy.sparse as sp
        scipy_available = True
    except ImportError:
        scipy_available = False

    if scipy_available and sp.issparse(adjacency_matrix):
        if non_edge_value != 0:
            raise ValueError(
                "'non_edge_value' must be equal to 0 if 'adjacency_matrix' is a Scipy sparse matrix."
            )
        adjacency_matrix = sp.triu(adjacency_matrix)
        sources, targets, edge_weights = sp.find(adjacency_matrix)
    else:
        # Work on a copy: the strict lower triangle is overwritten so that
        # each undirected edge is reported only once.
        adjacency_matrix = adjacency_matrix.copy()
        # The builtin bool is used because the np.bool alias was removed
        # from NumPy.
        lower_triangle = np.tri(*adjacency_matrix.shape, k=-1, dtype=bool)
        adjacency_matrix[lower_triangle] = non_edge_value
        if non_edge_value != 0:
            mask = adjacency_matrix != non_edge_value
        else:
            mask = adjacency_matrix
        sources, targets = np.nonzero(mask)
        edge_weights = adjacency_matrix[sources, targets]

    graph = hg.UndirectedGraph(adjacency_matrix.shape[0])
    graph.add_edges(sources, targets)
    return graph, edge_weights
def test_minimum_spanning_forest(self):
    """minimum_spanning_tree on a graph made of two triangles (0-1-2 and 3-4-5)."""
    graph = hg.UndirectedGraph(6)
    graph.add_edges((0, 0, 1, 3, 3, 4), (1, 2, 2, 4, 5, 5))
    weights = np.asarray((0, 1, 2, 3, 4, 5))

    forest = hg.minimum_spanning_tree(graph, weights)
    edge_map = hg.CptMinimumSpanningTree.get_edge_map(forest)

    self.assertTrue(forest.num_vertices() == 6)
    self.assertTrue(forest.num_edges() == 4)

    expected_sources = (0, 0, 3, 3)
    expected_targets = (1, 2, 4, 5)
    forest_sources, forest_targets = forest.edge_list()
    self.assertTrue(np.all(forest_sources == expected_sources))
    self.assertTrue(np.all(forest_targets == expected_targets))

    # Indices, in the input graph, of the edges kept in the result.
    self.assertTrue(np.all(edge_map == (0, 1, 3, 4)))
def test_graph():
    """Build a 4-vertex graph with a triangle on vertices 0, 1, 2 (vertex 3 has no edge)."""
    graph = hg.UndirectedGraph(4)
    for source, target in ((0, 1), (1, 2), (0, 2)):
        graph.add_edge(source, target)
    return graph
def make_graph_from_points(X, graph_type="knn+mst", symmetrization="max", **kwargs):
    r"""
    Creates a graph from vertex coordinates.

    The argument :attr:`graph_type` selects the graph creation methods. Possible values are:

    - ``"complete"``: creates the complete graph
    - ``"knn"``: creates a :math:`k`-nearest neighbor graph, the parameter :math:`k` can be controlled
      with the extra parameter 'n_neighbors' (default value 5).
      The resulting graph may have several connected components.
    - ``"knn+mst"`` (default): creates a :math:`k`-nearest neighbor graph and add the edges of an mst of the complete graph.
      This method ensures that the resulting graph is connected.
      The parameter :math:`k` can be controlled with the extra parameter 'n_neighbors' (default value 5).
    - ``"delaunay"``: creates a graph corresponding to the Delaunay triangulation of the points
      (only works in low dimensions).
    - ``"mst"``: creates a graph containing only the edges of an mst of the complete graph.

    The weight of an edge :math:`\{x,y\}` is equal to the Euclidean distance between
    :math:`x` and :math:`y`: :math:`w(\{x,y\})=\|X[x, :] - X[y, :]\|`.

    :math:`K`-nearest neighbor based graphs are naturally directed, the argument :attr:`symmetrization` enables to chose a
    symmetrization strategy. Possible values are:

    - ``"min"``: an edge :math:`\{x,y\}` is created if there both arcs :math:`(x,y)` and :math:`(y,x)` exist.
      Its weight is given by the minimum weight of the two arcs.
    - ``"max"``: an edge :math:`\{x,y\}` is created if there is any of the two arcs :math:`(x,y)` and :math:`(y,x)` exists.
      Its weight is given by the weight of the existing arcs (if both arcs exists they necessarily have the same weight).

    This method is not suited for large set of points.

    :param X: A 2d array of vertex coordinates
    :param graph_type: ``"complete"``, ``"knn"``, ``"knn+mst"`` (default), ``"delaunay"``, or ``"mst"``
    :param symmetrization: ``"min"`` or ``"max"`` (only used by the k-nearest neighbor based graph types)
    :param kwargs: extra args depends of chosen graph type: 'n_neighbors' (default 5) and
        'mode' (default 'distance') are forwarded to ``sklearn.neighbors.kneighbors_graph``
    :return: a graph and its edge weights
    :raises RuntimeError: if scipy or sklearn cannot be imported
    :raises ValueError: if :attr:`graph_type` is unknown, or if :attr:`symmetrization` is unknown
        and a k-nearest neighbor based graph type is requested
    """
    try:
        from scipy.spatial.distance import pdist, squareform, euclidean
        from sklearn.neighbors import kneighbors_graph
        from scipy.sparse.csgraph import minimum_spanning_tree
        from scipy.spatial import Delaunay
    except ImportError as err:
        raise RuntimeError("scipy and sklearn required.") from err

    n_neighbors = kwargs.get('n_neighbors', 5)
    mode = kwargs.get('mode', 'distance')

    def symmetrization_fun(A):
        # A is the (sparse) directed k-nearest neighbor matrix.
        if symmetrization == "min":
            return A.minimum(A.T)
        elif symmetrization == "max":
            return A.maximum(A.T)
        else:
            raise ValueError("Unknown symmetrization: " + str(symmetrization))

    if graph_type == "complete":
        d = pdist(X)
        A = squareform(d)
        g, edge_weights = hg.adjacency_matrix_2_undirected_graph(A)
    elif graph_type == "knn":
        A = kneighbors_graph(X, n_neighbors=n_neighbors, mode=mode)
        A = symmetrization_fun(A)
        g, edge_weights = hg.adjacency_matrix_2_undirected_graph(A)
    elif graph_type == "knn+mst":
        A = kneighbors_graph(X, n_neighbors=n_neighbors, mode=mode)
        A = symmetrization_fun(A)
        D = squareform(pdist(X))
        MST = minimum_spanning_tree(D)
        # The mst is returned as a one-sided matrix: make it symmetric before merging.
        MST = MST + MST.T
        A = A.maximum(MST)
        g, edge_weights = hg.adjacency_matrix_2_undirected_graph(A)
    elif graph_type == "delaunay":
        g = hg.UndirectedGraph(X.shape[0])
        edge_weights = []

        # NOTE: no 'QJ' Qhull option is passed here, so coplanar points may
        # be left out of the triangulation; they are only reported below.
        tmp = Delaunay(X)
        nbp = X.shape[0]
        if tmp.coplanar.size != 0:
            print("Warning coplanar points detected!")

        # Neighbors of vertex k are indices[indptr[k]:indptr[k + 1]].
        indptr, indices = tmp.vertex_neighbor_vertices

        for k in range(nbp):
            neighbours = indices[indptr[k]:indptr[k + 1]]
            for n in neighbours:
                # Each undirected edge is seen from both extremities: keep it once.
                if n > k:
                    d = euclidean(X[k, :], X[n, :])
                    g.add_edge(k, n)
                    edge_weights.append(d)

        edge_weights = np.asarray(edge_weights, dtype=np.float64)
    elif graph_type == "mst":
        D = squareform(pdist(X))
        MST = minimum_spanning_tree(D).toarray()
        MST = MST + MST.T
        g, edge_weights = hg.adjacency_matrix_2_undirected_graph(MST)
    else:
        raise ValueError("Unknown graph_type: " + str(graph_type))

    return g, edge_weights
def main(params):
    """
    Build joint entity/mention k-nearest-neighbor graphs from bi-encoder
    embeddings and evaluate hierarchical clusterings of them with dendrogram purity.

    Steps:

    1. load (or process and cache as pickles) the dictionary and the mention data;
    2. keep only the gold entity of each mention and stack entities and mentions;
    3. load ``graphs.pickle`` from the output directory if present, otherwise
       embed everything, run the nearest-neighbor search and build one graph per value of k;
    4. for every linkage function and every graph, build a HAC tree and log
       its dendrogram purity;
    5. write the results to a timestamped JSON file in the output directory.

    :param params: dict of options. Keys read here: ``output_path``,
        ``pickle_src_path``, ``embed_data_path``, ``knn``, ``use_types``,
        ``data_split``, ``data_path``, ``max_context_length``,
        ``max_cand_length``, ``context_key``, ``silent``, ``debug``,
        ``exact_knn``, ``embed_batch_size``, ``normalize_embeds``,
        ``only_embed_and_build``, ``linkage``. The whole dict is also passed
        to ``BiEncoderRanker``.
    :return: None. The process exits early when ``only_embed_and_build`` is
        set and the graphs have just been built.
    """
    output_path = params["output_path"]
    if not os.path.exists(output_path):
        os.makedirs(output_path)
    logger = utils.get_logger(params["output_path"], 'log')

    # Both cache locations fall back to the output directory.
    pickle_src_path = params["pickle_src_path"]
    if pickle_src_path is None or not os.path.exists(pickle_src_path):
        pickle_src_path = output_path

    embed_data_path = params["embed_data_path"]
    if embed_data_path is None or not os.path.exists(embed_data_path):
        embed_data_path = output_path

    # Init model
    reranker = BiEncoderRanker(params)
    reranker.model.eval()
    tokenizer = reranker.tokenizer
    n_gpu = reranker.n_gpu

    knn = params["knn"]  # Use as the max-knn value for the graph construction
    use_types = params["use_types"]
    # within_doc = params["within_doc"]
    data_split = params["data_split"]  # Default = "test"

    # Load test data
    # NOTE(security): the cached files are read with pickle.load; only point
    # 'pickle_src_path', 'data_path' and 'output_path' at trusted locations.
    entity_dictionary_loaded = False
    test_dictionary_pkl_path = os.path.join(pickle_src_path,
                                            'test_dictionary.pickle')
    test_tensor_data_pkl_path = os.path.join(pickle_src_path,
                                             'test_tensor_data.pickle')
    test_mention_data_pkl_path = os.path.join(pickle_src_path,
                                              'test_mention_data.pickle')
    # if params['transductive']:
    #     train_tensor_data_pkl_path = os.path.join(pickle_src_path, 'train_tensor_data.pickle')
    #     train_mention_data_pkl_path = os.path.join(pickle_src_path, 'train_mention_data.pickle')
    if os.path.isfile(test_dictionary_pkl_path):
        print("Loading stored processed entity dictionary...")
        with open(test_dictionary_pkl_path, 'rb') as read_handle:
            test_dictionary = pickle.load(read_handle)
        entity_dictionary_loaded = True
    if os.path.isfile(test_tensor_data_pkl_path) and os.path.isfile(
            test_mention_data_pkl_path):
        print("Loading stored processed test data...")
        with open(test_tensor_data_pkl_path, 'rb') as read_handle:
            test_tensor_data = pickle.load(read_handle)
        with open(test_mention_data_pkl_path, 'rb') as read_handle:
            mention_data = pickle.load(read_handle)
    else:
        test_samples = utils.read_dataset(data_split, params["data_path"])
        if not entity_dictionary_loaded:
            with open(os.path.join(params["data_path"], 'dictionary.pickle'),
                      'rb') as read_handle:
                test_dictionary = pickle.load(read_handle)

        # Check if dataset has multiple ground-truth labels
        mult_labels = "labels" in test_samples[0].keys()
        # Filter samples without gold entities
        test_samples = list(
            filter(
                lambda sample: (len(sample["labels"]) > 0)
                if mult_labels else (sample["label"] is not None),
                test_samples))
        logger.info("Read %d test samples." % len(test_samples))

        mention_data, test_dictionary, test_tensor_data = data_process.process_mention_data(
            test_samples,
            test_dictionary,
            tokenizer,
            params["max_context_length"],
            params["max_cand_length"],
            multi_label_key="labels" if mult_labels else None,
            context_key=params["context_key"],
            silent=params["silent"],
            logger=logger,
            debug=params["debug"],
            knn=knn,
            dictionary_processed=entity_dictionary_loaded)
        print("Saving processed test data...")
        if not entity_dictionary_loaded:
            with open(test_dictionary_pkl_path, 'wb') as write_handle:
                pickle.dump(test_dictionary,
                            write_handle,
                            protocol=pickle.HIGHEST_PROTOCOL)
        with open(test_tensor_data_pkl_path, 'wb') as write_handle:
            pickle.dump(test_tensor_data,
                        write_handle,
                        protocol=pickle.HIGHEST_PROTOCOL)
        with open(test_mention_data_pkl_path, 'wb') as write_handle:
            pickle.dump(mention_data,
                        write_handle,
                        protocol=pickle.HIGHEST_PROTOCOL)

    # Reducing the entity dictionary to only the ground truth of the mention queries
    # Combining the entities and mentions into one structure for joint embedding and indexing
    new_ents = {}  # dictionary index of an entity -> its compact index
    new_ents_arr = []  # compact index -> dictionary index
    men_labels = []
    for men in mention_data:
        ent = men['label_idxs'][0]
        if ent not in new_ents:
            new_ents[ent] = len(new_ents_arr)
            new_ents_arr.append(ent)
        men_labels.append(new_ents[ent])
    ent_labels = list(range(len(new_ents_arr)))
    new_ent_vecs = torch.tensor(
        [test_dictionary[x]['ids'] for x in new_ents_arr])
    new_ent_types = [{"type": test_dictionary[x]['type']}
                     for x in new_ents_arr]
    test_men_vecs = test_tensor_data[:][0]

    n_mentions = len(test_tensor_data)
    n_entities = len(new_ent_vecs)
    n_embeds = n_mentions + n_entities
    # Entities come first, mentions second, in every joint structure below.
    leaf_labels = np.array(ent_labels + men_labels, dtype=int)
    all_vecs = torch.cat((new_ent_vecs, test_men_vecs))
    all_types = new_ent_types + mention_data  # Array of dicts containing key "type" for selected ents and all mentions

    # Values of k to run the evaluation against
    knn_vals = [25 * 2**i for i in range(int(math.log(knn / 25, 2)) + 1)
                ] if params["exact_knn"] is None else [params["exact_knn"]]
    # Store the maximum evaluation k
    max_knn = knn_vals[-1]

    time_start = time.time()

    # Check if graphs are already built
    graph_path = os.path.join(output_path, 'graphs.pickle')
    if os.path.isfile(graph_path):
        print("Loading stored joint graphs...")
        with open(graph_path, 'rb') as read_handle:
            joint_graphs = pickle.load(read_handle)
        # Keep the timing variable defined on this path too: it is read when
        # the per-graph evaluation time is reported at the end.
        knn_fetch_time = time.time() - time_start
    else:
        # Initialize graphs to store mention-mention and mention-entity similarity score edges;
        # Keyed on k, the number of nearest mentions retrieved
        joint_graphs = {}
        for k in knn_vals:
            joint_graphs[k] = {
                'rows': np.array([]),
                'cols': np.array([]),
                'data': np.array([]),
                'shape': (n_embeds, n_embeds)
            }

        # Check and load stored embedding data
        embed_data_path = os.path.join(embed_data_path, 'embed_data.t7')
        embed_data = None
        if os.path.isfile(embed_data_path):
            embed_data = torch.load(embed_data_path)

        if use_types:
            if embed_data is not None:
                logger.info('Loading stored embeddings')
                embeds = embed_data['embeds']
                if 'idxs_by_type' in embed_data:
                    idxs_by_type = embed_data['idxs_by_type']
                else:
                    idxs_by_type = data_process.get_idxs_by_type(all_types)
            else:
                logger.info("Embedding data")
                dict_embeds = data_process.embed_and_index(
                    reranker,
                    all_vecs[:n_entities],
                    encoder_type='candidate',
                    only_embed=True,
                    n_gpu=n_gpu,
                    batch_size=params['embed_batch_size'])
                men_embeds = data_process.embed_and_index(
                    reranker,
                    all_vecs[n_entities:],
                    encoder_type='context',
                    only_embed=True,
                    n_gpu=n_gpu,
                    batch_size=params['embed_batch_size'])
                embeds = np.concatenate((dict_embeds, men_embeds), axis=0)
                idxs_by_type = data_process.get_idxs_by_type(all_types)
            search_indexes = data_process.get_index_from_embeds(
                embeds, corpus_idxs=idxs_by_type, force_exact_search=True)
        else:
            if embed_data is not None:
                logger.info('Loading stored embeddings')
                embeds = embed_data['embeds']
            else:
                logger.info("Embedding data")
                dict_embeds = data_process.embed_and_index(
                    reranker,
                    all_vecs[:n_entities],
                    encoder_type='candidate',
                    only_embed=True,
                    n_gpu=n_gpu,
                    batch_size=params['embed_batch_size'])
                men_embeds = data_process.embed_and_index(
                    reranker,
                    all_vecs[n_entities:],
                    encoder_type='context',
                    only_embed=True,
                    n_gpu=n_gpu,
                    batch_size=params['embed_batch_size'])
                embeds = np.concatenate((dict_embeds, men_embeds), axis=0)
            search_index = data_process.get_index_from_embeds(
                embeds, force_exact_search=True)

        # Save computed embedding data if not loaded from disk
        if embed_data is None:
            embed_data = {}
            embed_data['embeds'] = embeds
            if use_types:
                embed_data['idxs_by_type'] = idxs_by_type
            # NOTE: Cannot pickle faiss index because it is a SwigPyObject
            torch.save(embed_data,
                       embed_data_path,
                       pickle_protocol=pickle.HIGHEST_PROTOCOL)

        # Build faiss search index
        if params["normalize_embeds"]:
            embeds = normalize(embeds, axis=0)
        logger.info("Building KNN index...")
        if use_types:
            search_indexes = data_process.get_index_from_embeds(
                embeds, corpus_idxs=idxs_by_type, force_exact_search=True)
        else:
            search_index = data_process.get_index_from_embeds(
                embeds, force_exact_search=True)

        logger.info("Starting KNN search...")
        # One extra neighbor is requested because the query itself is filtered out later.
        if not use_types:
            faiss_dists, faiss_idxs = search_index.search(embeds, max_knn + 1)
        else:
            query_len = n_embeds
            faiss_idxs = np.zeros((query_len, max_knn + 1))
            faiss_dists = np.zeros((query_len, max_knn + 1), dtype=float)
            for entity_type in search_indexes:
                embeds_by_type = embeds[idxs_by_type[entity_type]]
                nn_dists_by_type, nn_idxs_by_type = search_indexes[
                    entity_type].search(embeds_by_type, max_knn + 1)
                for i, idx in enumerate(idxs_by_type[entity_type]):
                    faiss_idxs[idx] = nn_idxs_by_type[i]
                    faiss_dists[idx] = nn_dists_by_type[i]
        logger.info("Search finished")

        logger.info('Building graphs')
        # Find the most similar nodes for each mention and node in the set (minus self)
        for idx in trange(n_embeds):
            # Compute adjacent node edge weight
            if idx != 0:
                adj_idx = idx - 1
                adj_data = embeds[adj_idx] @ embeds[idx]
            nn_idxs = faiss_idxs[idx]
            nn_scores = faiss_dists[idx]
            # Filter candidates to remove mention query and keep only the top k candidates
            filter_mask = nn_idxs != idx
            nn_idxs, nn_scores = nn_idxs[filter_mask][:max_knn], nn_scores[
                filter_mask][:max_knn]
            # Add edges to the graphs
            for k in joint_graphs:
                # Bind the graph for this k BEFORE writing to it: the
                # connectivity edge below must go to the graph of the
                # current k, not to the one left over from the previous
                # loop iteration.
                joint_graph = joint_graphs[k]
                # Add edge to adjacent node to force the graph to be connected
                if idx != 0:
                    joint_graph['rows'] = np.append(joint_graph['rows'],
                                                    adj_idx)
                    joint_graph['cols'] = np.append(joint_graph['cols'], idx)
                    joint_graph['data'] = np.append(joint_graph['data'],
                                                    adj_data)
                # Add mention-mention edges
                joint_graph['rows'] = np.append(joint_graph['rows'],
                                                [idx] * k)
                joint_graph['cols'] = np.append(joint_graph['cols'],
                                                nn_idxs[:k])
                joint_graph['data'] = np.append(joint_graph['data'],
                                                nn_scores[:k])

        knn_fetch_time = time.time() - time_start
        # Pickle the graphs
        print("Saving joint graphs...")
        with open(graph_path, 'wb') as write_handle:
            pickle.dump(joint_graphs,
                        write_handle,
                        protocol=pickle.HIGHEST_PROTOCOL)

        if params['only_embed_and_build']:
            logger.info(f"Saved embedding data at: {embed_data_path}")
            logger.info(f"Saved graphs at: {graph_path}")
            exit()

    results = {
        'n_leaves': n_embeds,
        'n_entities': n_entities,
        'n_mentions': n_mentions
    }

    graph_processing_time = time.time()
    n_graphs_processed = 0.
    linkage_fns = ["single", "complete", "average"] if params["linkage"] is None \
        else [params["linkage"]]  # Different HAC linkage functions to run the analyses over

    for fn in linkage_fns:
        logger.info(f"Linkage function: {fn}")
        purities = []
        fn_result = {}
        for k in joint_graphs:
            graph = hg.UndirectedGraph(n_embeds)
            graph.add_edges(joint_graphs[k]['rows'], joint_graphs[k]['cols'])
            weights = -joint_graphs[k]['data']  # Since Higra expects weights as distances, not similarity

            tree = get_hac_tree(graph, weights, linkage=fn)
            purity = hg.dendrogram_purity(tree, leaf_labels)
            fn_result[f"purity@{k}nn"] = purity
            logger.info(f"purity@{k}nn = {purity}")
            purities.append(purity)
            n_graphs_processed += 1
        fn_result["average"] = round(np.mean(purities), 4)
        logger.info(f"average = {fn_result['average']}")
        results[fn] = fn_result

    avg_graph_processing_time = (time.time() -
                                 graph_processing_time) / n_graphs_processed
    avg_per_graph_time = (knn_fetch_time + avg_graph_processing_time) / 60
    execution_time = (time.time() - time_start) / 60

    # Store results
    output_file_name = os.path.join(
        output_path,
        f"results_{__import__('calendar').timegm(__import__('time').gmtime())}"
    )

    logger.info(f"Results: \n {results}")
    with open(f'{output_file_name}.json', 'w') as f:
        json.dump(results, f, indent=2)
        print(f"\nResults saved at: {output_file_name}.json")

    logger.info("\nThe avg. per graph evaluation time is {} minutes\n".format(
        avg_per_graph_time))
    logger.info(
        "\nThe total evaluation took {} minutes\n".format(execution_time))
def subgraph(graph, edge_indices, spanning=True, return_vertex_map=False):
    r"""
    Extract a subgraph of the input graph. Let :math:`G=(V,E)` be the graph :attr:`graph` and let :math:`E^*`
    be a subset of :math:`E`. The subgraph of :math:`G` induced by :math:`E^*` is equal to:

    - :math:`(V, E^*)` is :attr:`spanning` is ``True``; and
    - :math:`(\bigcup E^*, E^*)` otherwise (the set of vertices of the subgraph is equal to the set of vertices present at
      an extremity of an edge in :math:`E^*`).

    The array :attr:`edge_indices` contains the indices of the edges in the set :math:`E^*`. The edges in the subgraph
    are in the same order as the edges in the array :attr:`edge_indices`.

    If :attr:`spanning` is ``False``, the subgraph may contain less vertices than the input graph. In such case,
    the optional array result :math:`vertex\_map` (returned if :attr:`return_vertex_map` is ``True``) indicates for each
    vertex :math:`i` of the subgraph, its corresponding index in the input graph.

    :Example:

    >>> # linear graph with 6 vertices
    >>> graph = hg.UndirectedGraph(6)
    >>> graph.add_edges(np.arange(5), np.arange(1, 6))
    >>>
    >>> # select edges (4, 5), (0, 1), and (3, 4), note that vertex 2 is not in any edge
    >>> edge_indices = np.asarray((4, 0, 3))
    >>> subgraph, vertex_map = hg.subgraph(graph, edge_indices, spanning=False, return_vertex_map=True)
    >>>
    >>> subgraph.num_vertices()
    5
    >>> vertex_map
    [0 1 3 4 5]
    >>> subgraph.edge_list()
    ([3 0 2], [4 1 3])

    :param graph: input graph.
    :param edge_indices: an array of edge indices of the input graph (anything accepted as an index
        by the arrays returned by ``graph.edge_list()``).
    :param spanning: if ``True``, the subgraph has the same vertex set as the input graph.
    :param return_vertex_map: if ``True``, also returns an array mapping each vertex of the current to its corresponding
        vertex in the input graph.
    :return: a subgraph and, if :attr:`return_vertex_map` is ``True``, a vertex map
    """
    sources, targets = graph.edge_list()
    sources = sources[edge_indices]
    targets = targets[edge_indices]

    if spanning:
        result = hg.UndirectedGraph(graph.num_vertices())
        result.add_edges(sources, targets)
        if return_vertex_map:
            # Same vertex set as the input graph: the map is the identity.
            vertex_map = np.arange(graph.num_vertices())
    else:
        # Renumber the vertices actually used: 'vertex_map' holds the sorted
        # original indices, 'inverse' the new index of each extremity.
        all_vertices = np.concatenate((sources, targets))
        vertex_map, inverse = np.unique(all_vertices, return_inverse=True)
        # Split on the number of selected edges (not on edge_indices.size) so
        # that lists and boolean masks are handled like integer index arrays.
        num_selected = sources.size
        result = hg.UndirectedGraph(vertex_map.size)
        result.add_edges(inverse[:num_selected], inverse[num_selected:])

    if return_vertex_map:
        return result, vertex_map
    else:
        return result
def __reduce_ctr(num_vertices, sources, targets):
    """
    Build an undirected graph from a vertex count and an edge list.

    Judging by its name, presumably the reconstruction function used when
    pickling graphs -- TODO confirm where it is registered.

    :param num_vertices: number of vertices of the graph
    :param sources: source vertex of each edge
    :param targets: target vertex of each edge
    :return: the new UndirectedGraph
    """
    rebuilt = hg.UndirectedGraph(num_vertices)
    rebuilt.add_edges(sources, targets)
    return rebuilt
def make_graph_from_points(X, graph_type="knn+mst", **kwargs):
    r"""
    Creates a graph from vertex coordinates.

    Possible graph creation methods are:

    - 'complete': creates the complete graph
    - 'knn': creates a :math:`k`-nearest neighbor graph, the parameter :math:`k` can be controlled
      with the extra parameter 'n_neighbors' (default value 5).
      The resulting graph may have several connected components.
    - 'knn+mst' (default): creates a :math:`k`-nearest neighbor graph and add the edges of an mst of the complete graph.
      This method ensures that the resulting graph is connected.
      The parameter :math:`k` can be controlled with the extra parameter 'n_neighbors' (default value 5).
    - 'delaunay': creates a graph corresponding to the Delaunay triangulation of the points
      (only works in low dimensions).
    - 'mst': creates a graph containing only the edges of an mst of the complete graph.

    The weight of an edge :math:`\{x,y\}` is equal to the Euclidean distance between
    :math:`x` and :math:`y`: :math:`w(\{x,y\})=\|X[x, :] - X[y, :]\|`.

    :math:`K`-nearest neighbor relations are directed; an edge :math:`\{x,y\}` is created as soon as one of the
    two arcs :math:`(x,y)` and :math:`(y,x)` exists.

    This method is not suited for large set of points.

    :param X: A 2d array of vertex coordinates
    :param graph_type: 'complete', 'knn', 'knn+mst' (default), 'delaunay', or 'mst'
    :param kwargs: extra args depends of chosen graph type: 'n_neighbors' (default 5) and
        'mode' (default 'distance') are forwarded to ``sklearn.neighbors.kneighbors_graph``
    :return: a graph and its edge weights
    :raises RuntimeError: if scipy or sklearn cannot be imported
    :raises ValueError: if :attr:`graph_type` is unknown
    """
    try:
        from scipy.spatial.distance import pdist, squareform, euclidean
        from sklearn.neighbors import kneighbors_graph
        from scipy.sparse.csgraph import minimum_spanning_tree
        from scipy.spatial import Delaunay
    except ImportError as err:
        raise RuntimeError("scipy and sklearn required.") from err

    n_neighbors = kwargs.get('n_neighbors', 5)
    mode = kwargs.get('mode', 'distance')

    def knn_matrix():
        # 'mode' is keyword-only in current scikit-learn, so both options
        # are passed by name.
        A = kneighbors_graph(X, n_neighbors=n_neighbors, mode=mode).toarray()
        # The k-nn relation is not symmetric, and the conversion to an
        # undirected graph only reads the upper triangle: without this step
        # an arc (y, x) with y > x and no reverse arc would be lost.
        return np.maximum(A, A.T)

    if graph_type == "complete":
        d = pdist(X)
        A = squareform(d)
        g, edge_weights = hg.adjacency_matrix_2_undirected_graph(A)
    elif graph_type == "knn":
        A = knn_matrix()
        g, edge_weights = hg.adjacency_matrix_2_undirected_graph(A)
    elif graph_type == "knn+mst":
        A = knn_matrix()
        D = squareform(pdist(X))
        MST = minimum_spanning_tree(D).toarray()
        # The mst is returned as a one-sided matrix: make it symmetric before merging.
        MST = MST + MST.T
        A = np.maximum(A, MST)
        g, edge_weights = hg.adjacency_matrix_2_undirected_graph(A)
    elif graph_type == "delaunay":
        g = hg.UndirectedGraph(X.shape[0])
        edge_weights = []

        # NOTE: no 'QJ' Qhull option is passed here, so coplanar points may
        # be left out of the triangulation; they are only reported below.
        tmp = Delaunay(X)
        nbp = X.shape[0]
        if tmp.coplanar.size != 0:
            print("Warning coplanar points detected!")

        # Neighbors of vertex k are indices[indptr[k]:indptr[k + 1]].
        indptr, indices = tmp.vertex_neighbor_vertices

        for k in range(nbp):
            neighbours = indices[indptr[k]:indptr[k + 1]]
            for n in neighbours:
                # Each undirected edge is seen from both extremities: keep it once.
                if n > k:
                    d = euclidean(X[k, :], X[n, :])
                    g.add_edge(k, n)
                    edge_weights.append(d)

        edge_weights = np.asarray(edge_weights, dtype=np.float64)
    elif graph_type == "mst":
        D = squareform(pdist(X))
        MST = minimum_spanning_tree(D).toarray()
        MST = MST + MST.T
        g, edge_weights = hg.adjacency_matrix_2_undirected_graph(MST)
    else:
        raise ValueError("Unknown graph_type: " + str(graph_type))

    return g, edge_weights