def __init__(
    self,
    profile: sps.csr_matrix,
    embedding: np.ndarray,
    search_config: Optional[MLPSearchConfig] = None,
):
    # Cast both inputs to float32 and split them with the same fixed seed,
    # so each profile row stays paired with its embedding row.
    (
        profile_train,
        profile_test,
        embedding_train,
        embedding_test,
    ) = train_test_split(
        profile.astype(np.float32),
        embedding.astype(np.float32),
        random_state=42,
    )
    self.profile_train = profile_train
    self.profile_test = profile_test
    # Embeddings are moved to JAX arrays for the MLP training loop.
    self.embedding_train = jnp.asarray(embedding_train, dtype=jnp.float32)
    self.embedding_test = jnp.asarray(embedding_test, dtype=jnp.float32)
    self.search_config = MLPSearchConfig() if search_config is None else search_config
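# A self-contained sketch of the split pattern used above (not from the
# original source; assumes scikit-learn's train_test_split): a sparse profile
# and a dense embedding are cast to float32 and split in one call, which keeps
# the train/test rows of the two arrays aligned.
import numpy as np
import scipy.sparse as sps
from sklearn.model_selection import train_test_split

profile = sps.random(100, 50, density=0.05, format="csr", random_state=0)
embedding = np.random.default_rng(0).normal(size=(100, 16))
p_train, p_test, e_train, e_test = train_test_split(
    profile.astype(np.float32), embedding.astype(np.float32), random_state=42
)
assert p_train.shape[0] == e_train.shape[0]  # rows remain paired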
import json
from typing import List, Union

from scipy.sparse import csr_matrix


def saveSparseMatrix(file: str, matrix: csr_matrix, colnames: Union[List[str], None] = None) -> None:
    """
    Save a sparse matrix to a JSON file.

    Args:
        file: file name to store results, full or relative path
        matrix (M,N): sparse matrix of size (M,N)
        colnames, optional (1,N): column names

    Returns:
        None
    """
    json_content = dict()
    if colnames is not None:
        json_content["features"] = colnames
    json_content["size"] = matrix.shape
    # Convert to DOK format once so nonzero positions and values stay paired.
    dok = matrix.astype(float).todok()
    json_content["positions"] = [tuple(map(int, key)) for key in dok.keys()]
    json_content["counts"] = [float(val) for val in dok.values()]
    with open(file, "w") as f:
        json.dump(json_content, f, indent=4)
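# Minimal usage sketch for saveSparseMatrix; the file name "example.json" and
# the column names are illustrative, not from the original source.
import numpy as np
from scipy.sparse import csr_matrix

demo = csr_matrix(np.array([[0.0, 2.0], [1.0, 0.0]]))
saveSparseMatrix("example.json", demo, colnames=["a", "b"])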
def precompute_best_item_indices(self, URM: sps.csr_matrix):
    URM = URM.copy()
    # Optionally reweight interactions before computing item-item similarity.
    if self.feature_weighting == "BM25":
        URM = URM.astype(np.float32)
        URM = okapi_BM_25(URM)
        URM = check_matrix(URM, 'csr')
    elif self.feature_weighting == "TF-IDF":
        URM = URM.astype(np.float32)
        URM = TF_IDF(URM)
        URM = check_matrix(URM, 'csr')

    similarity = Compute_Similarity(
        URM,
        shrink=self.shrink,
        topK=self.topK,
        normalize=self.normalize,
        similarity="cosine",
    )
    similarity_matrix = similarity.compute_similarity()
    # Sort each row in descending similarity so the best items come first.
    self.sorted_indices = np.array(
        np.argsort(-similarity_matrix.todense(), axis=1))
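# Self-contained sketch of the final step above. Compute_Similarity and the
# weighting helpers are project-internal, so scikit-learn's cosine_similarity
# stands in here: each row of sorted_indices lists item indices from most to
# least similar.
import numpy as np
import scipy.sparse as sps
from sklearn.metrics.pairwise import cosine_similarity

URM = sps.random(20, 8, density=0.3, format="csr", random_state=0)
similarity = cosine_similarity(URM.T)             # item-item similarity (8 x 8)
sorted_indices = np.argsort(-similarity, axis=1)  # best items first per row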
def calculate_diffusion_map(
    W: csr_matrix,
    n_components: int,
    solver: str,
    max_t: int,
    n_jobs: int,
    random_state: int,
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    assert issparse(W)

    nc, labels = connected_components(W, directed=True, connection="strong")
    logger.info("Calculating connected components is done.")
    assert nc == 1

    # Use double precision to guarantee reproducibility.
    W_norm, diag, diag_half = calculate_normalized_affinity(W.astype(np.float64))
    logger.info("Calculating normalized affinity matrix is done.")

    n_jobs = eff_n_jobs(n_jobs)
    with threadpool_limits(limits=n_jobs):
        if solver == "eigsh":
            np.random.seed(random_state)
            v0 = np.random.uniform(-1.0, 1.0, W_norm.shape[0])
            Lambda, U = eigsh(W_norm, k=n_components, v0=v0)
            Lambda = Lambda[::-1]
            U = U[:, ::-1]
        else:
            assert solver == "randomized"
            U, S, VT = randomized_svd(
                W_norm, n_components=n_components, random_state=random_state
            )
            signs = np.sign((U * VT.transpose()).sum(axis=0))  # get eigenvalue signs
            Lambda = signs * S  # get eigenvalues

    # Remove the first eigenvalue and eigenvector.
    Lambda = Lambda[1:]
    U = U[:, 1:]
    Phi = U / diag_half[:, np.newaxis]

    if max_t == -1:
        Lambda_new = Lambda / (1.0 - Lambda)
    else:
        # Find the knee point of the von Neumann entropy curve.
        x = np.array(range(1, max_t + 1), dtype=float)
        y = np.array([calc_von_neumann_entropy(Lambda, t) for t in x])
        t = x[find_knee_point(x, y)]
        logger.info("Detected knee point at t = {:.0f}.".format(t))
        # U_df = U * Lambda  # symmetric diffusion component
        Lambda_new = Lambda * ((1.0 - Lambda ** t) / (1.0 - Lambda))
    Phi_pt = Phi * Lambda_new  # asym pseudo component

    return Phi_pt, Lambda, Phi  # , U_df, W_norm
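# Hedged, self-contained illustration of the "eigsh" branch above (toy data,
# not the original pipeline): a fixed random v0 makes the Lanczos solver
# deterministic, and eigsh returns eigenvalues in ascending order, hence the
# [::-1] reversal to put the largest first.
import numpy as np
from scipy.sparse import random as sparse_random
from scipy.sparse.linalg import eigsh

A = sparse_random(50, 50, density=0.1, random_state=0)
W_norm = (A + A.T) * 0.5                      # symmetrize for eigsh
np.random.seed(0)
v0 = np.random.uniform(-1.0, 1.0, W_norm.shape[0])
Lambda, U = eigsh(W_norm, k=5, v0=v0)
Lambda, U = Lambda[::-1], U[:, ::-1]          # largest eigenvalues first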
def _secondary_outputs(self, input_matrix: sparse.csr_matrix):
    """Compute different variables from labels_."""
    if self.return_membership or self.return_aggregate:
        if np.issubdtype(input_matrix.data.dtype, np.bool_):
            input_matrix = input_matrix.astype(float)
        if not self.bipartite:
            membership = membership_matrix(self.labels_)
            if self.return_membership:
                self.membership_ = normalize(input_matrix.dot(membership))
            if self.return_aggregate:
                self.aggregate_ = sparse.csr_matrix(
                    membership.T.dot(input_matrix.dot(membership)))
        else:
            # Bipartite case: build row and column memberships over a shared
            # label space.
            if self.labels_col_ is None:
                n_labels = max(self.labels_) + 1
                membership_row = membership_matrix(self.labels_, n_labels=n_labels)
                membership_col = normalize(input_matrix.T.dot(membership_row))
            else:
                n_labels = max(max(self.labels_row_), max(self.labels_col_)) + 1
                membership_row = membership_matrix(self.labels_row_, n_labels=n_labels)
                membership_col = membership_matrix(self.labels_col_, n_labels=n_labels)
            if self.return_membership:
                self.membership_row_ = normalize(input_matrix.dot(membership_col))
                self.membership_col_ = normalize(input_matrix.T.dot(membership_row))
                self.membership_ = self.membership_row_
            if self.return_aggregate:
                aggregate_ = sparse.csr_matrix(membership_row.T.dot(input_matrix))
                aggregate_ = aggregate_.dot(membership_col)
                self.aggregate_ = aggregate_
    return self
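# Minimal sketch of the membership step above. membership_matrix and normalize
# are sknetwork internals, so this hand-rolls the same one-hot idea: row i of
# `membership` is the indicator of node i's cluster, and
# membership.T @ input_matrix @ membership aggregates the graph by cluster.
import numpy as np
from scipy import sparse

labels = np.array([0, 1, 0, 2])
n, k = len(labels), labels.max() + 1
membership = sparse.csr_matrix(
    (np.ones(n), (np.arange(n), labels)), shape=(n, k)
)
input_matrix = sparse.random(4, 4, density=0.5, format="csr", random_state=0)
aggregate = membership.T @ input_matrix @ membership  # k x k cluster graph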
def tree_sampling_divergence(adjacency: sparse.csr_matrix, dendrogram: np.ndarray, weights: str = 'degree',
                             normalized: bool = True) -> float:
    """Tree sampling divergence of a hierarchy (quality metric).

    * Graphs
    * Digraphs

    Parameters
    ----------
    adjacency :
        Adjacency matrix of the graph.
    dendrogram :
        Dendrogram.
    weights :
        Weights of nodes.
        ``'degree'`` (default) or ``'uniform'``.
    normalized :
        If ``True``, normalized score (between 0 and 1).

    Returns
    -------
    score : float
        Score.

    Example
    -------
    >>> from sknetwork.hierarchy import tree_sampling_divergence, Paris
    >>> from sknetwork.data import house
    >>> paris = Paris()
    >>> adjacency = house()
    >>> dendrogram = paris.fit_transform(adjacency)
    >>> score = tree_sampling_divergence(adjacency, dendrogram)
    >>> np.round(score, 2)
    0.52

    References
    ----------
    Charpentier, B. & Bonald, T. (2019).
    `Tree Sampling Divergence: An Information-Theoretic Metric for Hierarchical Graph Clustering.
    <https://hal.telecom-paristech.fr/hal-02144394/document>`_
    Proceedings of IJCAI.
    """
    adjacency = check_format(adjacency)
    check_square(adjacency)
    check_min_nnz(adjacency.nnz, 1)
    adjacency = adjacency.astype(float)
    n = adjacency.shape[0]
    check_min_size(n, 2)

    adjacency.data /= adjacency.data.sum()
    aggregate_graph, height, cluster_weight, edge_sampling, weights_row, weights_col = _instanciate_vars(
        adjacency, weights)
    node_sampling = np.zeros(n - 1)

    for t in range(n - 1):
        i = int(dendrogram[t][0])
        j = int(dendrogram[t][1])
        # If a child is an internal node merged at the same height, transfer
        # its sampling mass to the current merge.
        if i >= n and height[i - n] == dendrogram[t][2]:
            edge_sampling[t] = edge_sampling[i - n]
            edge_sampling[i - n] = 0
            node_sampling[t] = node_sampling[i - n]
        elif j >= n and height[j - n] == dendrogram[t][2]:
            edge_sampling[t] = edge_sampling[j - n]
            edge_sampling[j - n] = 0
            node_sampling[t] = node_sampling[j - n]
        if j in aggregate_graph.neighbors[i]:
            edge_sampling[t] += aggregate_graph.neighbors[i][j]
        node_sampling[t] += aggregate_graph.cluster_out_weights[i] * aggregate_graph.cluster_in_weights[j] + \
            aggregate_graph.cluster_out_weights[j] * aggregate_graph.cluster_in_weights[i]
        height[t] = dendrogram[t][2]
        aggregate_graph.merge(i, j)

    index = np.where(edge_sampling)[0]
    score = edge_sampling[index].dot(np.log(edge_sampling[index] / node_sampling[index]))

    if normalized:
        inv_out_weights = sparse.diags(weights_row, shape=(n, n), format='csr')
        inv_out_weights.data = 1 / inv_out_weights.data
        inv_in_weights = sparse.diags(weights_col, shape=(n, n), format='csr')
        inv_in_weights.data = 1 / inv_in_weights.data
        sampling_ratio = inv_out_weights.dot(adjacency.dot(inv_in_weights))
        inv_out_weights.data = np.ones(len(inv_out_weights.data))
        inv_in_weights.data = np.ones(len(inv_in_weights.data))
        edge_sampling = inv_out_weights.dot(adjacency.dot(inv_in_weights))
        mutual_information = edge_sampling.data.dot(np.log(sampling_ratio.data))
        score /= mutual_information
    return score
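# Sketch of the diagonal-rescaling trick used in the normalization step above
# (toy weights, not from the original source): a sparse.diags matrix with its
# data inverted left/right-multiplies the adjacency, dividing entry (i, j) by
# w_row[i] * w_col[j].
import numpy as np
from scipy import sparse

adjacency = sparse.random(5, 5, density=0.4, format="csr", random_state=0)
w_row = np.full(5, 2.0)
w_col = np.full(5, 4.0)
inv_row = sparse.diags(1 / w_row, format="csr")
inv_col = sparse.diags(1 / w_col, format="csr")
ratio = inv_row @ adjacency @ inv_col  # every entry divided by 2 * 4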
def get_score(self, profile: sps.csr_matrix) -> DenseScoreArray:
    user_embedding: DenseMatrix = self.mlp.predict(
        profile.astype(np.float32).toarray())
    return self.cf_rec.get_score_from_user_embedding(
        user_embedding).astype(np.float64)
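# Hedged sketch of the scoring flow above; self.mlp and self.cf_rec are
# project objects, so scikit-learn's MLPRegressor stands in for the MLP and a
# random item-embedding matrix for the CF model. All data here is synthetic.
import numpy as np
import scipy.sparse as sps
from sklearn.neural_network import MLPRegressor

rng = np.random.default_rng(0)
profile = sps.random(50, 30, density=0.1, format="csr", random_state=0)
user_emb = rng.normal(size=(50, 8))
mlp = MLPRegressor(hidden_layer_sizes=(16,), max_iter=200).fit(
    profile.astype(np.float32).toarray(), user_emb)
item_emb = rng.normal(size=(20, 8))
scores = mlp.predict(profile[:2].astype(np.float32).toarray()) @ item_emb.T
scores = scores.astype(np.float64)  # shape (2, 20): one score per item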
def tree_sampling_divergence(adjacency: sparse.csr_matrix, dendrogram: np.ndarray, weights: str = 'degree',
                             normalized: bool = True) -> float:
    """Tree sampling divergence of a hierarchy (quality metric).

    Parameters
    ----------
    adjacency :
        Adjacency matrix of the graph.
    dendrogram :
        Dendrogram.
    weights :
        Weights of nodes.
        ``'degree'`` (default) or ``'uniform'``.
    normalized :
        If ``True``, normalized score (between 0 and 1).

    Returns
    -------
    score : float
        Score.

    Example
    -------
    >>> from sknetwork.hierarchy import tree_sampling_divergence, Paris
    >>> from sknetwork.data import house
    >>> paris = Paris()
    >>> adjacency = house()
    >>> dendrogram = paris.fit_transform(adjacency)
    >>> score = tree_sampling_divergence(adjacency, dendrogram)
    >>> np.round(score, 2)
    0.05

    References
    ----------
    Charpentier, B. & Bonald, T. (2019).
    `Tree Sampling Divergence: An Information-Theoretic Metric for Hierarchical Graph Clustering.
    <https://hal.telecom-paristech.fr/hal-02144394/document>`_
    Proceedings of IJCAI.
    """
    adjacency = check_format(adjacency)
    check_square(adjacency)
    check_min_nnz(adjacency.nnz, 1)
    adjacency = adjacency.astype(float)
    n = adjacency.shape[0]
    check_min_size(n, 2)

    adjacency.data /= adjacency.data.sum()
    edge_sampling, node_sampling, _ = get_sampling_distributions(
        adjacency, dendrogram, weights)

    index = np.where(edge_sampling)[0]
    score = edge_sampling[index].dot(np.log(edge_sampling[index] / node_sampling[index]))

    if normalized:
        weights_row = get_probs(weights, adjacency)
        weights_col = get_probs(weights, adjacency.T)
        inv_out_weights = sparse.diags(weights_row, shape=(n, n), format='csr')
        inv_out_weights.data = 1 / inv_out_weights.data
        inv_in_weights = sparse.diags(weights_col, shape=(n, n), format='csr')
        inv_in_weights.data = 1 / inv_in_weights.data
        sampling_ratio = inv_out_weights.dot(adjacency.dot(inv_in_weights))
        inv_out_weights.data = np.ones(len(inv_out_weights.data))
        inv_in_weights.data = np.ones(len(inv_in_weights.data))
        edge_sampling = inv_out_weights.dot(adjacency.dot(inv_in_weights))
        mutual_information = edge_sampling.data.dot(np.log(sampling_ratio.data))
        if mutual_information > 0:
            score /= mutual_information
    return score