def fit(self, adjacency: Union[sparse.csr_matrix, np.ndarray]) -> 'Harmonic':
    """Compute the harmonic centrality of every node.

    The score of a node is the sum, over all other nodes, of the inverse of
    the corresponding entry of the matrix returned by ``shortest_path``.

    Parameters
    ----------
    adjacency :
        Adjacency matrix of the graph.

    Returns
    -------
    self: :class:`Harmonic`

    Raises
    ------
    ValueError
        If the adjacency matrix is not square.
    """
    adjacency = check_format(adjacency)
    n_nodes = adjacency.shape[0]
    if not is_square(adjacency):
        raise ValueError(
            "The adjacency is not square. Please use 'bipartite2undirected' or "
            "'bipartite2directed'.")

    sources = np.arange(n_nodes)
    distances = shortest_path(adjacency, n_jobs=self.n_jobs, indices=sources)

    # The diagonal is forced to 1 before the inversion (presumably to avoid
    # dividing by a zero self-distance) and zeroed afterwards so that a node
    # does not contribute to its own score.
    np.fill_diagonal(distances, 1)
    inverse_distances = 1 / distances
    np.fill_diagonal(inverse_distances, 0)

    self.scores_ = inverse_distances.dot(np.ones(n_nodes))
    return self
def co_neighbors_graph(adjacency: Union[sparse.csr_matrix, np.ndarray], normalized: bool = True, method='knn',
                       n_neighbors: int = 5, embedding_dimension: int = 8) -> sparse.csr_matrix:
    """Build the co-neighborhood graph :math:`\\tilde{A} = AF^{-1}A^T` of a graph.

    Parameters
    ----------
    adjacency:
        Adjacency of the input graph.
    normalized:
        If ``True``, F is the diagonal in-degree matrix :math:`F = \\text{diag}(A^T1)`.
        Otherwise, F is the identity matrix.
    method:
        Either ``'exact'`` or ``'knn'``. With ``'exact'`` the result is obtained by matrix
        multiplication; the output can be much denser than the input and exhaust memory.
        With ``'knn'`` the co-neighborhood is approximated by a nearest-neighbor search in a
        spectral embedding of the rows.
    n_neighbors:
        Number of neighbors for the KNN search. Only used if ``method='knn'``.
    embedding_dimension:
        Dimension of the embedding space. Only used if ``method='knn'``.

    Returns
    -------
    adjacency_: sparse.csr_matrix
        Adjacency of the co-neighborhood.

    Raises
    ------
    ValueError
        If ``method`` is neither ``'exact'`` nor ``'knn'``.
    """
    adjacency = check_format(adjacency)

    if method == 'exact':
        transposed = adjacency.T
        forward = transition_matrix(transposed) if normalized else transposed
        return adjacency.dot(forward)

    if method == 'knn':
        # Only the column weights and the scaling depend on the normalization.
        if normalized:
            col_weights, scaling = 'degree', 'divide'
        else:
            col_weights, scaling = 'uniform', None
        bispectral = BiSpectral(embedding_dimension, weights='degree', col_weights=col_weights, scaling=scaling)
        bispectral.fit(adjacency)

        knn = KNeighborsTransformer(n_neighbors, undirected=True)
        knn.fit(bispectral.row_embedding_)
        return knn.adjacency_

    raise ValueError('method must be "exact" or "knn".')
def fit(self, biadjacency: Union[sparse.csr_matrix, np.ndarray]) -> 'BiParis':
    """Run the Paris algorithm on the undirected graph built from a biadjacency matrix.

    The algorithm is applied to
    :math:`A = \\begin{bmatrix} 0 & B \\\\ B^T & 0 \\end{bmatrix}`
    where :math:`B` is the input treated as a biadjacency matrix.

    Parameters
    ----------
    biadjacency:
        Biadjacency matrix of the graph.

    Returns
    -------
    self: :class:`BiParis`
    """
    # The inner estimator inherits this object's configuration.
    paris = Paris(engine=self.engine, weights=self.weights, reorder=self.reorder)

    biadjacency = check_format(biadjacency)
    undirected = bipartite2undirected(biadjacency)
    paris.fit(undirected)

    self.dendrogram_ = paris.dendrogram_
    return self
def fit(self, adjacency: Union[sparse.csr_matrix, np.ndarray],
        personalization: Optional[Union[dict, np.ndarray]] = None) -> 'PageRank':
    """Compute standard PageRank with restart.

    Parameters
    ----------
    adjacency :
        Adjacency matrix.
    personalization :
        If ``None``, the uniform distribution is used.
        Otherwise, a non-negative, non-zero vector or a dictionary must be provided.

    Returns
    -------
    self: :class:`PageRank`

    Raises
    ------
    ValueError
        If the adjacency matrix is not square.
    """
    adjacency = check_format(adjacency)
    if not is_square(adjacency):
        raise ValueError("The adjacency is not square. See BiPageRank.")

    surfer = RandomSurferOperator(adjacency, self.damping_factor, personalization, False)
    self.scores_ = surfer.solve(self.solver, self.n_iter)
    return self
def fit(self, biadjacency: Union[sparse.csr_matrix, np.ndarray]) -> 'BiLouvain':
    """Run the directed Louvain algorithm on a bipartite graph.

    The algorithm is applied to
    :math:`A = \\begin{bmatrix} 0 & B \\\\ 0 & 0 \\end{bmatrix}`
    where :math:`B` is the input treated as a biadjacency matrix.

    Parameters
    ----------
    biadjacency:
        Biadjacency matrix of the graph.

    Returns
    -------
    self: :class:`BiLouvain`
    """
    # The inner estimator inherits this object's configuration.
    louvain = Louvain(algorithm=self.algorithm, agg_tol=self.agg_tol, max_agg_iter=self.max_agg_iter,
                      shuffle_nodes=self.shuffle_nodes, sorted_cluster=self.sorted_cluster,
                      random_state=self.random_state, verbose=self.log.verbose)

    biadjacency = check_format(biadjacency)
    n_rows = biadjacency.shape[0]
    louvain.fit(bipartite2directed(biadjacency))

    # The first ``n_rows`` labels are exposed as row labels, the rest as
    # column labels.
    all_labels = louvain.labels_
    self.row_labels_ = all_labels[:n_rows]
    self.col_labels_ = all_labels[n_rows:]
    self.labels_ = all_labels
    self.iteration_count_ = louvain.iteration_count_
    self.aggregate_graph_ = louvain.aggregate_graph_
    return self
def fit(self, adjacency: Union[sparse.csr_matrix, np.ndarray]) -> 'SpectralClustering':
    """Embed the graph with :class:`Spectral`, then cluster the embedding with k-means.

    Parameters
    ----------
    adjacency:
        Adjacency matrix of the graph.

    Returns
    -------
    self: :class:`SpectralClustering`

    Raises
    ------
    ValueError
        If the adjacency matrix is not symmetric.
    """
    adjacency = check_format(adjacency)
    if not is_symmetric(adjacency):
        raise ValueError('The adjacency is not symmetric.')

    embedding = Spectral(self.embedding_dimension).fit(adjacency).embedding_

    if self.l2normalization:
        row_norms = np.linalg.norm(embedding, axis=1)
        # Null rows are divided by 1, i.e. left untouched.
        row_norms[row_norms == 0.] = 1
        embedding /= row_norms[:, np.newaxis]

    kmeans = KMeans(self.n_clusters)
    kmeans.fit(embedding)
    self.labels_ = kmeans.labels_
    return self
def fit(self, adjacency: Union[sparse.csr_matrix, np.ndarray], node_weights=None,
        randomized_decomposition: bool = True) -> 'SpectralEmbedding':
    """Fit the spectral embedding from an adjacency matrix.

    Parameters
    ----------
    adjacency : array-like, shape = (n, n)
        Adjacency matrix of the graph.
    node_weights : {``'uniform'``, ``'degree'``, array of length n_nodes with positive entries}
        Node weights. If ``None``, ``self.node_weights`` is used.
    randomized_decomposition: bool (default=True)
        Whether to use a randomized (and faster) decomposition method or the standard scipy one.

    Returns
    -------
    self: :class:`SpectralEmbedding`

    Raises
    ------
    ValueError
        If the adjacency matrix is not square, not symmetric, or if the graph is not connected.
    """
    adjacency = check_format(adjacency)
    n_nodes, _ = adjacency.shape

    if not check_square(adjacency):
        raise ValueError("The adjacency matrix must be a square matrix.")
    if connected_components(adjacency, directed=False)[0] > 1:
        raise ValueError("The graph must be connected.")
    if not check_symmetry(adjacency):
        raise ValueError("The adjacency matrix is not symmetric.")

    # Standard Laplacian L = D - A.
    degrees = adjacency.dot(np.ones(n_nodes))
    laplacian = sparse.diags(degrees, format='csr') - adjacency

    # Normalization by node weights: W^{-1/2} L W^{-1/2}.
    if node_weights is None:
        node_weights = self.node_weights
    weights = check_weights(node_weights, adjacency)
    inv_sqrt_weights = sparse.diags(np.sqrt(weights), format='csr')
    inv_sqrt_weights.data = 1 / inv_sqrt_weights.data
    laplacian = inv_sqrt_weights.dot(laplacian.dot(inv_sqrt_weights))

    # Spectral decomposition; one extra component is requested because the
    # first one is dropped below.
    n_components = min(self.embedding_dimension + 1, n_nodes - 1)
    decompose = randomized_eig if randomized_decomposition else eigsh
    eigenvalues, eigenvectors = decompose(laplacian, n_components, which='SM')

    self.eigenvalues_ = eigenvalues[1:]
    self.embedding_ = np.array(inv_sqrt_weights.dot(eigenvectors[:, 1:]))
    return self
def largest_connected_component(adjacency: Union[sparse.csr_matrix, np.ndarray], return_labels: bool = False):
    """Extract the largest connected component of a graph.

    Non-square inputs are treated as biadjacency matrices of undirected bipartite graphs.

    Parameters
    ----------
    adjacency
        Adjacency or biadjacency matrix of the graph.
    return_labels: bool
        Whether to return the indices of the new nodes in the original graph.

    Returns
    -------
    new_adjacency: sparse.csr_matrix
        Adjacency or biadjacency matrix of the largest connected component.
    indices: array or tuple of array
        Indices of the nodes in the original graph. For biadjacency matrices,
        ``indices[0]`` corresponds to the rows and ``indices[1]`` to the columns.
        Only returned if ``return_labels`` is ``True``.
    """
    adjacency = check_format(adjacency)
    n_samples, _ = adjacency.shape

    bipartite: bool = not is_square(adjacency)
    if bipartite:
        # Rows and columns become the nodes of one square graph, rows first.
        full_adjacency = sparse.bmat([[None, adjacency], [adjacency.T, None]], format='csr')
    else:
        full_adjacency = adjacency

    _, labels = connected_components(full_adjacency)
    unique_labels, counts = np.unique(labels, return_counts=True)
    largest = unique_labels[np.argmax(counts)]
    members = np.where(labels == largest)[0]

    if bipartite:
        # ``members`` is sorted: entries below ``n_samples`` are rows, the
        # others are columns shifted by ``n_samples``.
        split = np.searchsorted(members, n_samples)
        samples_ix = members[:split]
        features_ix = members[split:] - n_samples
    else:
        samples_ix, features_ix = members, members

    row_slice = adjacency[samples_ix, :]
    new_adjacency = row_slice.tocsc()[:, features_ix].tocsr()

    if not return_labels:
        return new_adjacency
    if bipartite:
        return new_adjacency, (samples_ix, features_ix)
    return new_adjacency, samples_ix
def fit(self, adjacency: Union[sparse.csr_matrix, np.ndarray]) -> 'HITS':
    """Compute the HITS scores with a spectral (SVD) method.

    Hub scores are taken from the first left singular vector and authority
    scores from the first right singular vector of the adjacency matrix.

    Parameters
    ----------
    adjacency :
        Adjacency or biadjacency matrix of the graph.

    Returns
    -------
    self: :class:`HITS`

    Raises
    ------
    ValueError
        If ``self.mode`` is neither ``'hubs'`` nor ``'authorities'``.
    """
    # Validate the configuration first: an invalid mode must not cost a full
    # SVD, nor replace ``self.solver``, before being reported.
    mode = self.mode
    if mode not in ('hubs', 'authorities'):
        raise ValueError('Mode should be "hubs" or "authorities".')

    adjacency = check_format(adjacency)

    if self.solver == 'auto':
        solver = auto_solver(adjacency.nnz)
        if solver == 'lanczos':
            self.solver: SVDSolver = LanczosSVD()
        else:
            self.solver: SVDSolver = HalkoSVD()

    self.solver.fit(adjacency, 1)
    hubs: np.ndarray = self.solver.left_singular_vectors_.reshape(-1)
    authorities: np.ndarray = self.solver.right_singular_vectors_.reshape(-1)

    # Singular vectors are defined up to sign: keep the orientation in which
    # strictly more entries are positive, then clip the remaining negative
    # entries to zero.
    if (hubs > 0).sum() <= (hubs < 0).sum():
        hubs = -hubs
    hubs = np.clip(hubs, a_min=0., a_max=None)

    if (authorities > 0).sum() <= (authorities < 0).sum():
        authorities = -authorities
    authorities = np.clip(authorities, a_min=0., a_max=None)

    if mode == 'hubs':
        self.scores_ = hubs
        self.col_scores_ = authorities
    else:
        self.scores_ = authorities
        self.col_scores_ = hubs
    return self
def fit(self, biadjacency: Union[sparse.csr_matrix, np.ndarray]) -> 'BiSpectralClustering':
    """Embed the graph with :class:`BiSpectral`, then cluster the embedding with k-means.

    With ``self.co_clustering`` the joint embedding (``embedding_``) is
    clustered and both row and column labels are set; otherwise only the row
    embedding is clustered.

    Parameters
    ----------
    biadjacency:
        Biadjacency matrix of the graph.

    Returns
    -------
    self: :class:`BiSpectralClustering`
    """
    biadjacency = check_format(biadjacency)
    n_rows, _ = biadjacency.shape

    bispectral = BiSpectral(self.embedding_dimension).fit(biadjacency)
    embedding = bispectral.embedding_ if self.co_clustering else bispectral.row_embedding_

    if self.l2normalization:
        row_norms = np.linalg.norm(embedding, axis=1)
        # Null rows are divided by 1, i.e. left untouched.
        row_norms[row_norms == 0.] = 1
        embedding /= row_norms[:, np.newaxis]

    kmeans = KMeans(self.n_clusters)
    kmeans.fit(embedding)
    labels = kmeans.labels_

    if self.co_clustering:
        # The first ``n_rows`` labels belong to rows, the rest to columns.
        self.row_labels_ = labels[:n_rows]
        self.col_labels_ = labels[n_rows:]
    else:
        self.row_labels_ = labels
    self.labels_ = labels
    return self
def fit(self, adjacency: Union[sparse.csr_matrix, np.ndarray]) -> 'Closeness':
    """Compute the closeness centrality of every node of a connected graph.

    The score of a node is ``((n - 1) * nb_samples / n)`` divided by the sum
    of the ``shortest_path`` entries between the sampled nodes and that node.
    With ``method='exact'`` every node is a sample; with
    ``method='approximate'`` a random subset of ``log(n) / tol**2`` nodes
    (at least 1, at most ``n``) is drawn without replacement.

    Parameters
    ----------
    adjacency :
        Adjacency matrix of the graph.

    Returns
    -------
    self: :class:`Closeness`

    Raises
    ------
    ValueError
        If the adjacency matrix is not square, if the graph is not connected,
        or if ``self.method`` is neither ``'exact'`` nor ``'approximate'``.
    """
    adjacency = check_format(adjacency)
    n = adjacency.shape[0]
    if not is_square(adjacency):
        raise ValueError(
            "The adjacency is not square. Please use 'bipartite2undirected' or "
            "'bipartite2directed'.")
    if not is_connected(adjacency):
        raise ValueError("The graph must be connected.")

    if self.method == 'exact':
        nb_samples = n
        indices = np.arange(n)
    elif self.method == 'approximate':
        # ``int(log(n) / tol**2)`` truncates to 0 for a large tolerance,
        # which would sample no source at all and yield 0/0 scores; always
        # keep at least one sample.
        nb_samples = min(max(int(log(n) / self.tol ** 2), 1), n)
        indices = np.random.choice(np.arange(n), nb_samples, replace=False)
    else:
        raise ValueError(
            "Method should be either 'exact' or 'approximate'.")

    paths = shortest_path(adjacency, n_jobs=self.n_jobs, indices=indices)

    self.scores_ = (
        (n - 1) * nb_samples / n) / paths.T.dot(np.ones(nb_samples))
    return self
def dasgupta_score(adjacency: sparse.csr_matrix, dendrogram: np.ndarray, weights: str = 'uniform') -> float:
    """Dasgupta's score of a hierarchy, defined as 1 - Dasgupta's cost.

    The higher the score, the better.

    Parameters
    ----------
    adjacency :
        Adjacency matrix of the graph.
    dendrogram :
        Dendrogram.
    weights :
        Weights of nodes.
        ``'degree'`` or ``'uniform'`` (default).

    Returns
    -------
    score : float
        Dasgupta's score of the hierarchy, normalized to get a value between 0 and 1.

    Raises
    ------
    ValueError
        If the adjacency matrix is not square or the graph has fewer than two nodes.

    References
    ----------
    Dasgupta, S. (2016). A cost function for similarity-based hierarchical clustering.
    Proceedings of ACM symposium on Theory of Computing.
    """
    adjacency = check_format(adjacency)
    if not is_square(adjacency):
        raise ValueError('The adjacency matrix is not square.')

    n = adjacency.shape[0]
    if n <= 1:
        raise ValueError('The graph must contain at least two nodes.')

    out_weights = check_probs(weights, adjacency)
    in_weights = check_probs(weights, adjacency.T)
    graph = AggregateGraph(adjacency + adjacency.T, out_weights, in_weights)

    n_merges = n - 1
    height = np.zeros(n_merges)
    edge_sampling = np.zeros(n_merges)
    cluster_weight = np.zeros(n_merges)

    for t in range(n_merges):
        merge = dendrogram[t]
        node1 = int(merge[0])
        node2 = int(merge[1])

        # When a child is itself a merge made at the same height, its edge
        # mass is moved to the current merge. Only the first matching child
        # (node1 before node2) is handled.
        for child in (node1, node2):
            if child >= n and height[child - n] == merge[2]:
                edge_sampling[t] = edge_sampling[child - n]
                edge_sampling[child - n] = 0
                break

        height[t] = merge[2]
        if node2 in graph.neighbors[node1]:
            edge_sampling[t] += graph.neighbors[node1][node2]
        cluster_weight[t] = graph.cluster_out_weights[node1] + graph.cluster_out_weights[node2] \
            + graph.cluster_in_weights[node1] + graph.cluster_in_weights[node2]
        graph.merge(node1, node2)

    cost: float = edge_sampling.dot(cluster_weight) / 2
    return 1 - cost
def fit(self, adjacency: Union[sparse.csr_matrix, np.ndarray]) -> 'Paris':
    """Agglomerative clustering using the nearest neighbor chain.

    The input is symmetrized (``adjacency + adjacency.T``) and clustered with
    the engine selected by ``self.engine`` (``'python'`` or ``'numba'``).
    The result is stored in ``self.dendrogram_``, optionally passed through
    ``reorder_dendrogram`` when ``self.reorder`` is set.

    Parameters
    ----------
    adjacency :
        Adjacency matrix of the graph.

    Returns
    -------
    self: :class:`Paris`

    Raises
    ------
    ValueError
        If the adjacency matrix is not square, if the graph has fewer than
        two nodes, or if ``self.engine`` is unknown.
    """
    adjacency = check_format(adjacency)
    if not is_square(adjacency):
        raise ValueError(
            'The adjacency matrix is not square. Use BiParis() instead.')
    n = adjacency.shape[0]
    sym_adjacency = adjacency + adjacency.T

    weights = self.weights
    out_weights = check_probs(weights, adjacency)
    in_weights = check_probs(weights, adjacency.T)

    if n <= 1:
        raise ValueError('The graph must contain at least two nodes.')

    if self.engine == 'python':
        aggregate_graph = AggregateGraph(sym_adjacency, out_weights, in_weights)

        # Clusters found without any neighbor, stored as (node, size).
        connected_components = []
        # One row per merge: [node, nearest neighbor, height, merged size].
        dendrogram = []

        while len(aggregate_graph.cluster_sizes) > 0:
            # Start a new chain from the first remaining cluster yielded by
            # the iteration.
            node = None
            for node in aggregate_graph.cluster_sizes:
                break
            chain = [node]
            while chain:
                node = chain.pop()
                if aggregate_graph.neighbors[node]:
                    # Find the most similar neighbor; ties are broken in
                    # favor of the smallest node index.
                    max_sim = -float("inf")
                    nearest_neighbor = None
                    for neighbor in aggregate_graph.neighbors[node]:
                        sim = aggregate_graph.similarity(node, neighbor)
                        if sim > max_sim:
                            nearest_neighbor = neighbor
                            max_sim = sim
                        elif sim == max_sim:
                            nearest_neighbor = min(neighbor, nearest_neighbor)
                    if chain:
                        nearest_neighbor_last = chain.pop()
                        if nearest_neighbor_last == nearest_neighbor:
                            # The nearest neighbor is the previous element of
                            # the chain: merge the pair. The merge height is
                            # the inverse of the similarity.
                            dendrogram.append([
                                node, nearest_neighbor, 1. / max_sim,
                                aggregate_graph.cluster_sizes[node] +
                                aggregate_graph.cluster_sizes[nearest_neighbor]
                            ])
                            aggregate_graph.merge(node, nearest_neighbor)
                        else:
                            # Otherwise restore the chain and extend it.
                            chain.append(nearest_neighbor_last)
                            chain.append(node)
                            chain.append(nearest_neighbor)
                    else:
                        chain.append(node)
                        chain.append(nearest_neighbor)
                else:
                    # No neighbor left: record the cluster and remove it
                    # from the clusters still to process.
                    connected_components.append(
                        (node, aggregate_graph.cluster_sizes[node]))
                    del aggregate_graph.cluster_sizes[node]

        # Join the recorded clusters one after the other at infinite height,
        # numbering each new cluster with ``next_cluster``.
        node, cluster_size = connected_components.pop()
        for next_node, next_cluster_size in connected_components:
            cluster_size += next_cluster_size
            dendrogram.append(
                [node, next_node, float("inf"), cluster_size])
            node = aggregate_graph.next_cluster
            aggregate_graph.next_cluster += 1

        dendrogram = np.array(dendrogram)
        if self.reorder:
            dendrogram = reorder_dendrogram(dendrogram)

        self.dendrogram_ = dendrogram
        return self

    elif self.engine == 'numba':
        # ``fit_core`` receives the node count as int32 and the raw CSR
        # arrays of the symmetrized adjacency.
        n = np.int32(adjacency.shape[0])
        indices, indptr, data = sym_adjacency.indices, sym_adjacency.indptr, sym_adjacency.data

        dendrogram = fit_core(n, out_weights, in_weights, data, indices, indptr)
        dendrogram = np.array(dendrogram)
        if self.reorder:
            dendrogram = reorder_dendrogram(dendrogram)

        self.dendrogram_ = dendrogram
        return self

    else:
        raise ValueError('Unknown engine.')
def fit(self, adjacency: Union[sparse.csr_matrix, np.ndarray]) -> 'BiSpectral':
    """Compute the generalized SVD of the adjacency matrix.

    The matrix is optionally regularized with a low-rank term, normalized on
    both sides by the pseudo-inverse square roots of the row and column
    weights, and decomposed by the SVD solver. The leading singular triplet
    is discarded and the remaining vectors are rescaled per ``self.scaling``.

    Parameters
    ----------
    adjacency: array-like, shape = (n1, n2)
        Adjacency matrix, where n1 = n2 is the number of nodes for a standard graph,
        n1, n2 are the number of nodes in each part for a bipartite graph.

    Returns
    -------
    self: :class:`BiSpectral`
    """
    adjacency = check_format(adjacency).asfptype()
    n_row, n_col = adjacency.shape

    if self.solver == 'auto':
        solver = auto_solver(adjacency.nnz)
        if solver == 'lanczos':
            self.solver: SVDSolver = LanczosSVD()
        else:
            self.solver: SVDSolver = HalkoSVD()

    total_weight = adjacency.dot(np.ones(n_col)).sum()
    regularization = self.regularization
    if regularization:
        if self.relative_regularization:
            regularization = regularization * total_weight / (n_row * n_col)
        # Sparse matrix plus the rank-one term regularization * 1 1^T.
        adjacency = SparseLR(adjacency, [(regularization * np.ones(n_row), np.ones(n_col))])

    row_weights = check_weights(self.weights, adjacency)
    col_weights = check_weights(self.col_weights, adjacency.T)
    diag_row = diag_pinv(np.sqrt(row_weights))
    diag_col = diag_pinv(np.sqrt(col_weights))
    normalized_adj = safe_sparse_dot(diag_row, safe_sparse_dot(adjacency, diag_col))

    # SVD: one extra component is requested because the leading one is
    # dropped afterwards.
    max_components = min(n_row, n_col) - 1
    if self.embedding_dimension >= max_components:
        n_components = max_components
        warnings.warn(Warning("The dimension of the embedding must be less than the number of rows "
                              "and the number of columns. Changed accordingly."))
    else:
        n_components = self.embedding_dimension + 1
    self.solver.fit(normalized_adj, n_components)

    # Sort by decreasing singular value and skip the first one.
    order = np.argsort(-self.solver.singular_values_)
    kept = order[1:]
    self.singular_values_ = self.solver.singular_values_[kept]
    self.row_embedding_ = diag_row.dot(self.solver.left_singular_vectors_[:, kept])
    self.col_embedding_ = diag_col.dot(self.solver.right_singular_vectors_[:, kept])

    scaling = self.scaling
    if scaling == 'multiply':
        self.row_embedding_ *= np.sqrt(self.singular_values_)
        self.col_embedding_ *= np.sqrt(self.singular_values_)
    elif scaling == 'divide':
        energy_levels: np.ndarray = np.sqrt(1 - np.clip(self.singular_values_, 0, 1) ** 2)
        # Pseudo-inverse: null energy levels stay null.
        energy_levels[energy_levels > 0] = 1 / energy_levels[energy_levels > 0]
        self.row_embedding_ *= energy_levels
        self.col_embedding_ *= energy_levels
    elif scaling == 'barycenter':
        # Only the row embedding is rescaled in this mode.
        self.row_embedding_ *= self.singular_values_
    elif scaling:
        warnings.warn(Warning("The scaling must be 'multiply' or 'divide' or 'barycenter'. No scaling done."))

    self.embedding_ = np.vstack((self.row_embedding_, self.col_embedding_))
    return self
def cosine_modularity(adjacency, embedding: np.ndarray, col_embedding=None, resolution=1., weights='degree',
                      return_all: bool = False):
    """Quality metric of an embedding :math:`x` defined by:

    :math:`Q = \\sum_{ij}\\left(\\dfrac{A_{ij}}{w} - \\gamma \\dfrac{w_iw'_j}{w^2}\\right)
    \\left(\\dfrac{1 + \\pi(x_i)^T\\pi(x_j)}{2}\\right)`

    where :math:`\\pi(x_i)` is the projection of :math:`x_i` onto the unit-sphere.

    For bipartite graphs with column embedding :math:`y`, the metric is

    :math:`Q = \\sum_{ij}\\left(\\dfrac{B_{ij}}{w} - \\gamma \\dfrac{w_iw'_j}{w^2}\\right)
    \\left(\\dfrac{1 + \\pi(x_i)^T\\pi(y_j)}{2}\\right)`

    This metric is normalized to lie between -1 and 1 (for :math:`\\gamma = 1`).

    The input embeddings are never modified: normalization is done on copies.
    Vectors of null norm are left as null vectors.

    Parameters
    ----------
    adjacency: sparse.csr_matrix or np.ndarray
        Adjacency matrix of the graph.
    embedding: np.ndarray
        Embedding of the nodes.
    col_embedding: None or np.ndarray
        For biadjacency matrices, embedding of the columns.
    resolution: float
        Resolution parameter.
    weights: ``'degree'`` or ``'uniform'``
        Weights of the nodes.
    return_all: bool, default = ``False``
        Whether to return (fit, div, :math:`Q`) or :math:`Q`.

    Returns
    -------
    modularity : float
    fit: float, optional
    diversity: float, optional

    Raises
    ------
    ValueError
        If ``col_embedding`` is ``None`` and the adjacency matrix is not square.
    """
    def project_on_sphere(vectors) -> np.ndarray:
        """Return a float copy of ``vectors`` with each non-null row scaled to unit norm."""
        projected = np.array(vectors, dtype=float)
        norms = np.linalg.norm(projected, axis=1)
        nonzero = norms > 0
        # Divide the selected rows by their own norms only; dividing by the
        # full norm vector breaks as soon as one row is null.
        projected[nonzero] /= norms[nonzero][:, np.newaxis]
        return projected

    adjacency = check_format(adjacency)
    total_weight: float = adjacency.data.sum()

    if col_embedding is None:
        if not is_square(adjacency):
            raise ValueError('col_embedding cannot be None for non-square adjacency matrices.')
        col_embedding = embedding

    norm_row_emb = project_on_sphere(embedding)
    norm_col_emb = project_on_sphere(col_embedding)

    row_probs = check_probs(weights, adjacency)
    col_probs = check_probs(weights, adjacency.T)

    fit: float = 0.5 * (1 + (np.multiply(norm_row_emb, adjacency.dot(norm_col_emb))).sum() / total_weight)
    # The diversity term uses the projected vectors, as in the formula above.
    div: float = 0.5 * (1 + (norm_row_emb.T.dot(row_probs)).dot(norm_col_emb.T.dot(col_probs)))

    if return_all:
        return fit, div, fit - resolution * div
    else:
        return fit - resolution * div
def fit(self, adjacency: Union[sparse.csr_matrix, np.ndarray], randomized_decomposition: bool = True,
        n_iter='auto', power_iteration_normalizer: Union[str, None] = 'auto', random_state=None) -> 'GSVDEmbedding':
    """Fit the embedding from the SVD of the degree-normalized adjacency matrix.

    Parameters
    ----------
    adjacency: array-like, shape = (n, m)
        Adjacency matrix, where n = m is the number of nodes for a standard directed or undirected graph,
        n is the cardinal of V1 and m is the cardinal of V2 for a bipartite graph.
    randomized_decomposition:
        Whether to use a randomized (and faster) svd method or the standard scipy one.
    n_iter: int or ``'auto'`` (default is ``'auto'``)
        See :meth:`sknetwork.embedding.randomized_range_finder`
    power_iteration_normalizer: ``'auto'`` (default), ``'QR'``, ``'LU'``, ``None``
        See :meth:`sknetwork.embedding.randomized_range_finder`
    random_state: int, RandomState instance or ``None``, optional (default= ``None``)
        See :meth:`sknetwork.embedding.randomized_range_finder`

    Returns
    -------
    self: :class:`GSVDEmbedding`
    """
    adjacency = check_format(adjacency)
    n_nodes, m_nodes = adjacency.shape
    total_weight = adjacency.data.sum()

    out_degrees = adjacency.dot(np.ones(m_nodes))
    in_degrees = adjacency.T.dot(np.ones(n_nodes))

    # Pseudo-inverse square-root out-degree matrix.
    inv_sqrt_out = sparse.diags(np.sqrt(out_degrees), shape=(n_nodes, n_nodes), format='csr')
    inv_sqrt_out.data = 1 / inv_sqrt_out.data
    # Pseudo-inverse square-root in-degree matrix.
    inv_sqrt_in = sparse.diags(np.sqrt(in_degrees), shape=(m_nodes, m_nodes), format='csr')
    inv_sqrt_in.data = 1 / inv_sqrt_in.data

    laplacian = inv_sqrt_out.dot(adjacency.dot(inv_sqrt_in))

    if randomized_decomposition:
        u, sigma, vt = randomized_svd(laplacian, self.embedding_dimension, n_iter=n_iter,
                                      power_iteration_normalizer=power_iteration_normalizer,
                                      random_state=random_state)
    else:
        u, sigma, vt = linalg.svds(laplacian, self.embedding_dimension)

    self.singular_values_ = sigma
    scale = np.sqrt(total_weight)
    self.embedding_ = scale * inv_sqrt_out.dot(u) * sigma
    self.features_ = scale * inv_sqrt_in.dot(vt.T)

    # Shift the center of mass: subtract from every row the degree-weighted
    # sum of the rows divided by the total weight.
    row_center = self.embedding_.T.dot(out_degrees)[:, np.newaxis].T
    self.embedding_ -= np.ones((n_nodes, 1)).dot(row_center) / total_weight
    col_center = self.features_.T.dot(in_degrees)[:, np.newaxis].T
    self.features_ -= np.ones((m_nodes, 1)).dot(col_center) / total_weight

    return self
def fit(self, adjacency: Union[sparse.csr_matrix, np.ndarray]) -> 'Spectral':
    """Fit the spectral embedding from a symmetric adjacency matrix.

    Depending on ``self.normalized_laplacian``, the embedding comes either
    from the largest eigenvalues of the normalized adjacency or from the
    smallest eigenvalues of the Laplacian. In both cases the first
    eigenvector is skipped. The results are stored in ``self.embedding_``,
    ``self.eigenvalues_`` and ``self.regularization_``.

    Parameters
    ----------
    adjacency :
        Adjacency matrix of the graph (symmetric matrix).

    Returns
    -------
    self: :class:`Spectral`

    Raises
    ------
    ValueError
        If the adjacency matrix is not square or not symmetric.
    NotImplementedError
        If the Halko solver is used with ``normalized_laplacian`` disabled.
    """
    adjacency = check_format(adjacency).asfptype()

    if not is_square(adjacency):
        raise ValueError(
            'The adjacency matrix is not square. See BiSpectral.')

    if not is_symmetric(adjacency):
        raise ValueError(
            'The adjacency matrix is not symmetric.'
            'Either convert it to a symmetric matrix or use BiSpectral.')

    n = adjacency.shape[0]

    # Resolve the 'auto' solver from the number of stored entries; the
    # chosen solver replaces ``self.solver``.
    if self.solver == 'auto':
        solver = auto_solver(adjacency.nnz)
        if solver == 'lanczos':
            self.solver: EigSolver = LanczosEig()
        else:
            self.solver: EigSolver = HalkoEig()

    # One extra component is requested since the first one is dropped below.
    if self.embedding_dimension > n - 2:
        warnings.warn(
            Warning(
                "The dimension of the embedding must be less than the number of nodes - 1."
            ))
        n_components = n - 2
    else:
        n_components = self.embedding_dimension + 1

    if (self.regularization is None
            or self.regularization == 0.) and not is_connected(adjacency):
        warnings.warn(
            Warning(
                "The graph is not connected and low-rank regularization is not active."
                "This can cause errors in the computation of the embedding."
            ))

    if isinstance(self.solver, HalkoEig) and not self.normalized_laplacian:
        raise NotImplementedError(
            "Halko solver is not yet compatible with regular Laplacian."
            "Call 'fit' with 'normalized_laplacian' = True or force lanczos solver."
        )

    # Node weights: row sums of the adjacency, increased by
    # ``regularization * n`` when regularization is active.
    weights = adjacency.dot(np.ones(n))
    regularization = self.regularization
    if regularization:
        if self.relative_regularization:
            regularization = regularization * weights.sum() / n**2
        weights += regularization * n

    if self.normalized_laplacian:
        # Finding the largest eigenvalues of the normalized adjacency is easier for the solver than finding the
        # smallest eigenvalues of the normalized laplacian.
        normalizing_matrix = diag_pinv(np.sqrt(weights))

        if regularization:
            norm_adjacency = NormalizedAdjacencyOperator(
                adjacency, regularization)
        else:
            norm_adjacency = normalizing_matrix.dot(
                adjacency.dot(normalizing_matrix))

        self.solver.which = 'LA'
        self.solver.fit(matrix=norm_adjacency, n_components=n_components)
        eigenvalues = 1 - self.solver.eigenvalues_
        # eigenvalues of the Laplacian in increasing order
        index = np.argsort(eigenvalues)
        # skip first eigenvalue
        eigenvalues = eigenvalues[index][1:]
        # keep only positive eigenvectors of the normalized adjacency matrix
        # (columns whose Laplacian eigenvalue is >= 1 - tol are zeroed)
        eigenvectors = self.solver.eigenvectors_[:, index][:, 1:] * (
            eigenvalues < 1 - self.tol)
        embedding = np.array(normalizing_matrix.dot(eigenvectors))

    else:
        if regularization:
            laplacian = LaplacianOperator(adjacency, regularization)
        else:
            weight_matrix = sparse.diags(weights, format='csr')
            laplacian = weight_matrix - adjacency

        self.solver.which = 'SM'
        self.solver.fit(matrix=laplacian, n_components=n_components)
        # The solver output is used as returned, minus its first component.
        eigenvalues = self.solver.eigenvalues_[1:]
        embedding = self.solver.eigenvectors_[:, 1:]

    if self.scaling:
        if self.scaling == 'multiply':
            # Clip eigenvalues at 1 so the square root below stays real.
            eigenvalues = np.minimum(eigenvalues, 1)
            embedding *= np.sqrt(1 - eigenvalues)
        elif self.scaling == 'divide':
            # Pseudo-inverse: non-positive eigenvalues get a zero factor.
            inv_eigenvalues = np.zeros_like(eigenvalues)
            index = np.where(eigenvalues > 0)[0]
            inv_eigenvalues[index] = 1 / eigenvalues[index]
            embedding *= np.sqrt(inv_eigenvalues)
        else:
            warnings.warn(
                Warning(
                    "The scaling must be 'multiply' or 'divide'. No scaling done."
                ))

    self.embedding_ = embedding
    self.eigenvalues_ = eigenvalues
    self.regularization_ = regularization

    return self
def tree_sampling_divergence(adjacency: sparse.csr_matrix, dendrogram: np.ndarray, weights: str = 'degree',
                             normalized: bool = True) -> float:
    """Tree sampling divergence of a hierarchy (quality metric).

    The higher the score, the better. The input adjacency matrix is left
    unchanged.

    Parameters
    ----------
    adjacency :
        Adjacency matrix of the graph.
    dendrogram :
        Dendrogram.
    weights :
        Weights of nodes.
        ``'degree'`` (default) or ``'uniform'``.
    normalized:
        If ``True``, normalized by the mutual information of the graph.
        When the mutual information is not positive, the score is returned
        without normalization.

    Returns
    -------
    score : float
        The tree sampling divergence of the hierarchy.
        If normalized, returns a value between 0 and 1.

    Raises
    ------
    ValueError
        If the adjacency matrix is not square, if the graph has fewer than
        two nodes, or if its total weight is not positive.

    References
    ----------
    Charpentier, B. & Bonald, T. (2019).
    `Tree Sampling Divergence: An Information-Theoretic Metric for Hierarchical Graph Clustering.
    <https://hal.telecom-paristech.fr/hal-02144394/document>`_
    Proceedings of IJCAI.
    """
    adjacency = check_format(adjacency)
    if not is_square(adjacency):
        raise ValueError('The adjacency matrix is not square.')

    n = adjacency.shape[0]
    if n <= 1:
        raise ValueError('The graph must contain at least two nodes.')

    total_weight = adjacency.data.sum()
    if total_weight <= 0:
        raise ValueError('The graph must contain at least one edge.')

    # Normalize a private copy: ``check_format`` may hand back the caller's
    # own matrix object, and rebinding its ``data`` would silently rescale
    # the caller's graph.
    adjacency = adjacency.copy()
    adjacency.data = adjacency.data / total_weight

    out_weights = check_probs(weights, adjacency)
    in_weights = check_probs(weights, adjacency.T)

    aggregate_graph = AggregateGraph(adjacency + adjacency.T, out_weights, in_weights)

    height = np.zeros(n - 1)
    edge_sampling = np.zeros(n - 1)
    node_sampling = np.zeros(n - 1)
    for t in range(n - 1):
        node1 = int(dendrogram[t][0])
        node2 = int(dendrogram[t][1])
        # When a child is itself a merge made at the same height, its
        # accumulated masses are carried over to the current merge.
        if node1 >= n and height[node1 - n] == dendrogram[t][2]:
            edge_sampling[t] = edge_sampling[node1 - n]
            edge_sampling[node1 - n] = 0
            node_sampling[t] = node_sampling[node1 - n]
        elif node2 >= n and height[node2 - n] == dendrogram[t][2]:
            edge_sampling[t] = edge_sampling[node2 - n]
            edge_sampling[node2 - n] = 0
            node_sampling[t] = node_sampling[node2 - n]
        if node2 in aggregate_graph.neighbors[node1]:
            edge_sampling[t] += aggregate_graph.neighbors[node1][node2]
        node_sampling[t] += aggregate_graph.cluster_out_weights[node1] * aggregate_graph.cluster_in_weights[node2] + \
            aggregate_graph.cluster_out_weights[node2] * aggregate_graph.cluster_in_weights[node1]
        height[t] = dendrogram[t][2]
        aggregate_graph.merge(node1, node2)

    # Only merges carrying edge mass contribute (0 * log 0 is taken as 0).
    index = np.where(edge_sampling)[0]
    score = edge_sampling[index].dot(np.log(edge_sampling[index] / node_sampling[index]))

    if normalized:
        inv_out_weights = sparse.diags(out_weights, shape=(n, n), format='csr')
        inv_out_weights.data = 1 / inv_out_weights.data
        inv_in_weights = sparse.diags(in_weights, shape=(n, n), format='csr')
        inv_in_weights.data = 1 / inv_in_weights.data
        sampling_ratio = inv_out_weights.dot(adjacency.dot(inv_in_weights))

        # Same product with unit diagonals, so that its stored entries are
        # laid out like those of ``sampling_ratio``.
        inv_out_weights.data = np.ones(len(inv_out_weights.data))
        inv_in_weights.data = np.ones(len(inv_in_weights.data))
        edge_sampling = inv_out_weights.dot(adjacency.dot(inv_in_weights))

        mutual_information = edge_sampling.data.dot(np.log(sampling_ratio.data))
        # A null mutual information would turn the score into 0/0 (nan).
        if mutual_information > 0:
            score /= mutual_information

    return score
def fit(self, adjacency: Union[sparse.csr_matrix, np.ndarray]) -> 'Louvain':
    """Cluster the graph by repeated optimization and aggregation.

    ``self.algorithm`` is fitted on the current graph; while its score
    exceeds ``self.agg_tol`` the graph is aggregated according to the labels
    found and the process is repeated, up to ``self.max_agg_iter`` rounds or
    until a single node remains.

    Parameters
    ----------
    adjacency :
        Adjacency matrix of the graph.

    Returns
    -------
    self: :class:`Louvain`

    Raises
    ------
    ValueError
        If the adjacency matrix is not square.
    """
    adjacency = check_format(adjacency)
    if not is_square(adjacency):
        raise ValueError('The adjacency matrix is not square. Use BiLouvain() instead.')
    n = adjacency.shape[0]

    nodes = np.arange(n)
    if self.shuffle_nodes:
        nodes = self.random_state.permutation(nodes)
        adjacency = adjacency[nodes, :].tocsc()[:, nodes].tocsr()

    # The degree weights must be derived from the adjacency in the node order
    # actually given to the graph. Computing them before the shuffle paired
    # each shuffled node with the weight of another node.
    out_weights = check_probs('degree', adjacency)
    in_weights = check_probs('degree', adjacency.T)

    graph = AggregateGraph(adjacency, out_weights, in_weights)

    # membership[i, c] links node i (in shuffled order) to its cluster c.
    membership = sparse.identity(n, format='csr')
    increase = True
    iteration_count = 0
    self.log.print("Starting with", graph.n_nodes, "nodes.")
    while increase:
        iteration_count += 1

        self.algorithm.fit(graph)

        if self.algorithm.score_ <= self.agg_tol:
            increase = False
        else:
            agg_membership = membership_matrix(self.algorithm.labels_)
            membership = membership.dot(agg_membership)
            graph.aggregate(agg_membership)

            if graph.n_nodes == 1:
                break
        self.log.print("Iteration", iteration_count, "completed with", graph.n_nodes, "clusters and ",
                       self.algorithm.score_, "increment.")
        if iteration_count == self.max_agg_iter:
            break

    if self.sorted_cluster:
        labels = reindex_clusters(membership.indices)
    else:
        labels = membership.indices

    if self.shuffle_nodes:
        # Undo the permutation so that labels follow the input node order.
        reverse = np.empty(nodes.size, nodes.dtype)
        reverse[nodes] = np.arange(nodes.size)
        labels = labels[reverse]

    self.labels_ = labels
    self.iteration_count_ = iteration_count
    self.aggregate_graph_ = graph.norm_adjacency * adjacency.data.sum()
    return self