def logisticClassProbability(X: spa.csr_matrix, b: np.ndarray, W: np.ndarray):
    '''Return class probabilities given data and parameters.'''
    logits = X.dot(W.T) + b
    # Subtract the row-wise max before exponentiating for numerical stability.
    elogits = np.exp(logits - logits.max(axis=1)[:, np.newaxis])
    elogits_sum = elogits.sum(axis=1)
    class_probs = elogits / elogits_sum[:, np.newaxis]
    return class_probs
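# A minimal usage sketch for logisticClassProbability, with the imports the
# snippet assumes (`spa` for scipy.sparse, `np` for numpy); the data is made up.
import numpy as np
import scipy.sparse as spa

X_demo = spa.csr_matrix(np.array([[1.0, 0.0], [0.0, 2.0]]))
W_demo = np.array([[0.5, -0.5], [-0.5, 0.5]])  # one row of weights per class
b_demo = np.array([0.1, -0.1])
probs = logisticClassProbability(X_demo, b_demo, W_demo)
assert np.allclose(probs.sum(axis=1), 1.0)  # each row is a valid distribution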
def make_weights(distribution: str, adjacency: sparse.csr_matrix) -> np.ndarray:
    """Array of weights from a matrix and a desired distribution.

    Parameters
    ----------
    distribution:
        Distribution for node sampling. Only ``'degree'`` or ``'uniform'`` are accepted.
    adjacency:
        Adjacency matrix of the graph.

    Returns
    -------
    node_weights_vec: np.ndarray
        Weights of nodes.
    """
    n = adjacency.shape[0]
    distribution = distribution.lower()
    if distribution == 'degree':
        node_weights_vec = adjacency.dot(np.ones(adjacency.shape[1]))
    elif distribution == 'uniform':
        node_weights_vec = np.ones(n)
    else:
        raise ValueError('Unknown distribution of node weights.')
    return node_weights_vec
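# Quick check of make_weights on a 3-node path graph; the imports the snippet
# assumes are spelled out here.
import numpy as np
from scipy import sparse

adjacency_demo = sparse.csr_matrix(np.array([[0, 1, 0],
                                             [1, 0, 1],
                                             [0, 1, 0]], dtype=float))
print(make_weights('degree', adjacency_demo))   # [1. 2. 1.]
print(make_weights('uniform', adjacency_demo))  # [1. 1. 1.]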
def __init__(self, adjacency: sparse.csr_matrix, damping_factor: float = 0.85,
             personalization=None, fb_mode: bool = False, verbose: bool = False):
    VerboseMixin.__init__(self, verbose)
    n1, n2 = adjacency.shape
    restart_prob: np.ndarray = restart_probability(n1, personalization)
    if fb_mode:
        # Forward-backward mode: extend the restart vector and make the
        # bipartite adjacency undirected.
        restart_prob = np.hstack((restart_prob, np.zeros(n2)))
        adjacency = bipartite2undirected(adjacency)
    LinearOperator.__init__(self, shape=adjacency.shape, dtype=float)
    n = adjacency.shape[0]
    out_degrees = adjacency.dot(np.ones(n))
    damping_matrix = damping_factor * sparse.eye(n, format='csr')
    if fb_mode:
        damping_matrix.data[n1:] = 1
    self.a = (damping_matrix.dot(transition_matrix(adjacency))).T.tocsr()
    self.b = (np.ones(n) - damping_factor * out_degrees.astype(bool)) * restart_prob
def left_sparse_dot(self, matrix: sparse.csr_matrix):
    """Left dot product with a sparse matrix.

    Parameters
    ----------
    matrix:
        Sparse matrix to multiply on the left.

    Returns
    -------
    SparseLR object
    """
    return SparseLR(matrix.dot(self.sparse_mat),
                    [(matrix.dot(x), y) for (x, y) in self.low_rank_tuples])
def similarity_from_sparse(matrix_a: sparse.csr_matrix, matrix_b: sparse.csr_matrix):
    # Pairwise similarity |a . b| / (|a|^2 + |b|^2 - |a . b|), row by row.
    intersection = matrix_a.dot(matrix_b.transpose()).toarray()
    norm_1 = np.array(matrix_a.multiply(matrix_a).sum(axis=1))
    norm_2 = np.array(matrix_b.multiply(matrix_b).sum(axis=1))
    union = norm_1 + norm_2.T - intersection
    return intersection / union
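# For 0/1 data the function above reduces to the Jaccard index: `intersection`
# counts shared items and the squared row sums are plain row sums. A small
# illustrative check (imports as assumed by the snippet):
rows_demo = sparse.csr_matrix(np.array([[1, 1, 0],
                                        [0, 1, 1]], dtype=float))
sim = similarity_from_sparse(rows_demo, rows_demo)
# Diagonal entries are 1.0; the off-diagonal pair shares 1 item out of 3
# in the union, giving 1/3.
print(np.round(sim, 3))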
def augmentURM(cls, URM_train: csr_matrix, W_sparse: csr_matrix,
               threshold_interactions: int, threshold_similarity: float):
    """
    Augmentation of the URM train.

    :param threshold_interactions: minimum number of common interactions
        required to insert a new row in the URM train
    :param threshold_similarity: threshold on the similarity; the similarity
        matrix W_sparse is used for this purpose
    :param W_sparse: similarity matrix
    :param URM_train: URM train that will be augmented
    :return: a csr_matrix with augmented interactions according to the thresholds
    """
    print("Augmenting URM")
    URM_train = URM_train.copy()

    # Count common interactions between pairs of users
    count_W_sparse = URM_train.dot(URM_train.transpose())

    # Select candidate pairs
    print("Selecting new candidates")
    users = np.arange(URM_train.shape[0])
    new_rows_list = []
    for i in range(users.size):
        if i % 5000 == 0:
            print("{} done out of {}".format(i, users.size))
        candidates = count_W_sparse[i].indices  # candidate users
        data = count_W_sparse[i].data  # interaction counts for the candidates
        for j, candidate in enumerate(candidates):
            if candidate > i and data[j] > threshold_interactions \
                    and W_sparse[i, candidate] > threshold_similarity:
                new_rows_list.append([i, candidate])
    print("Candidate list size: {}".format(len(new_rows_list)))

    # Create the new matrix
    print("Creating new URM...", end="")
    new_URM = None
    for candidate in new_rows_list:
        new_row = URM_train[[candidate[0], candidate[1]]].sum(axis=0)
        new_row = csr_matrix(new_row)
        new_row.data[new_row.data > 1] = 1
        if new_URM is None:
            new_URM = new_row
        else:
            new_URM = vstack([new_URM, new_row], format="csr")
    if new_URM is None:
        new_URM = URM_train
    else:
        new_URM = vstack([URM_train, new_URM], format="csr")
    print("Done")
    return new_URM
def gradient(self, beta: np.ndarray, X: csr_matrix, y: np.ndarray,
             l2reg: float) -> np.ndarray:
    m = X.shape[0]
    z = X.dot(beta) - y
    grad = X.T.dot(z)
    # L2 regularization term (the intercept beta[0] is not penalized)
    grad += l2reg * np.append(0, beta[1:])
    grad /= m
    return grad
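# A short gradient-descent sketch around the method above. `model` stands in
# for an instance of the (unshown) class defining `gradient`; the demo data
# and step size are illustrative.
X_demo = csr_matrix(np.array([[1.0, 2.0], [1.0, 3.0], [1.0, 4.0]]))
y_demo = np.array([2.0, 3.0, 4.0])  # y equals the second feature
beta = np.zeros(2)
for _ in range(2000):
    beta -= 0.1 * model.gradient(beta, X_demo, y_demo, l2reg=0.0)
# beta should now be close to [0, 1]: zero intercept, unit slope.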
def _precompute_representation(
        features: sp.csr_matrix,
        feature_embeddings: np.ndarray,
        feature_biases: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    """
    :param features: csr_matrix [n_objects, n_features]
    :param feature_embeddings: np.ndarray(float) [n_features, no_component]
    :param feature_biases: np.ndarray(float) [n_features]
    :return: tuple of
        - representation np.ndarray(float) [n_objects, no_component]
        - representation_bias np.ndarray(float) [n_objects]
    """
    representation = features.dot(feature_embeddings)
    representation_bias = features.dot(feature_biases)
    return representation, representation_bias
def gradient(self, beta: np.ndarray, X: csr_matrix, y: np.ndarray,
             is_win: np.ndarray, f, sigma: float, l2reg: float) -> np.ndarray:
    z = (X.dot(beta) - y) / sigma
    # Inverse Mills ratio, computed in log space for numerical stability:
    # z_lose = -(norm.pdf(z) / norm.cdf(z))
    z_lose = -np.exp(norm.logpdf(z) - norm.logcdf(z))
    z = f(z, z_lose, is_win)
    grad = X.T.dot(z) / sigma
    # L2 regularization term (the intercept beta[0] is not penalized)
    grad += l2reg * np.append(0, beta[1:])
    return grad
def loss_function(self, beta: np.ndarray, X: csr_matrix, y: np.ndarray,
                  l2reg: float) -> float:
    m = X.shape[0]
    # squared loss
    z = X.dot(beta) - y
    # loss = np.sum(-norm.logpdf(z))
    loss = np.sum(z ** 2)
    # L2 regularization term (the intercept beta[0] is not penalized)
    loss += l2reg * np.sum(beta[1:] ** 2)
    loss /= (2 * m)
    return loss
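# The `gradient` method earlier in this section is the exact derivative of
# this loss; a finite-difference spot check makes the pairing explicit
# (reusing the illustrative `model`, `X_demo` and `y_demo` from above).
eps_fd = 1e-6
beta0 = np.array([0.3, -0.2])
bump = np.array([eps_fd, 0.0])
num_grad = (model.loss_function(beta0 + bump, X_demo, y_demo, 0.1)
            - model.loss_function(beta0 - bump, X_demo, y_demo, 0.1)) / (2 * eps_fd)
ana_grad = model.gradient(beta0, X_demo, y_demo, 0.1)[0]
assert abs(num_grad - ana_grad) < 1e-5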
def _secondary_outputs(self, input_matrix: sparse.csr_matrix):
    """Compute different variables from labels_."""
    if self.return_membership or self.return_aggregate:
        if np.issubdtype(input_matrix.data.dtype, np.bool_):
            input_matrix = input_matrix.astype(float)
        if not self.bipartite:
            membership = membership_matrix(self.labels_)
            if self.return_membership:
                self.membership_ = normalize(input_matrix.dot(membership))
            if self.return_aggregate:
                self.aggregate_ = sparse.csr_matrix(
                    membership.T.dot(input_matrix.dot(membership)))
        else:
            if self.labels_col_ is None:
                n_labels = max(self.labels_) + 1
                membership_row = membership_matrix(self.labels_, n_labels=n_labels)
                membership_col = normalize(input_matrix.T.dot(membership_row))
            else:
                n_labels = max(max(self.labels_row_), max(self.labels_col_)) + 1
                membership_row = membership_matrix(self.labels_row_, n_labels=n_labels)
                membership_col = membership_matrix(self.labels_col_, n_labels=n_labels)
            if self.return_membership:
                self.membership_row_ = normalize(input_matrix.dot(membership_col))
                self.membership_col_ = normalize(input_matrix.T.dot(membership_row))
                self.membership_ = self.membership_row_
            if self.return_aggregate:
                aggregate_ = sparse.csr_matrix(membership_row.T.dot(input_matrix))
                aggregate_ = aggregate_.dot(membership_col)
                self.aggregate_ = aggregate_
    return self
def loss_function(beta: np.ndarray, X: csr_matrix, y: np.ndarray,
                  is_win: np.ndarray, f, sigma: float, l2reg: float) -> float:
    z = (X.dot(beta) - y) / sigma
    # loss for win bids
    z_win = -norm.logpdf(z)
    # z_win = -(np.log(1 / np.sqrt(2 * np.pi)) - z ** 2 / 2)
    # loss for lose bids
    z_lose = -norm.logcdf(z)
    loss = np.sum(f(z_win, z_lose, is_win))
    # L2 regularization term (the intercept beta[0] is not penalized)
    loss += l2reg * np.sum(beta[1:] ** 2) / 2
    return loss
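# Sketch of minimizing the censored loss above with scipy. `select_branch` is
# an illustrative choice for the `f` argument (per-sample win/lose selection),
# and the `_demo` data is made up; `norm` is the scipy.stats object the
# snippet already assumes.
from scipy.optimize import minimize
from scipy.stats import norm

def select_branch(z_win, z_lose, is_win):
    return np.where(is_win, z_win, z_lose)

X_demo = csr_matrix(np.array([[1.0, 2.0], [1.0, 3.0], [1.0, 4.0]]))
y_demo = np.array([2.0, 3.0, 4.0])
is_win_demo = np.array([True, False, True])
res = minimize(loss_function, x0=np.zeros(2),
               args=(X_demo, y_demo, is_win_demo, select_branch, 1.0, 0.1),
               method='L-BFGS-B')
print(res.x)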
def __init__(self, adjacency: sparse.csr_matrix, damping_factor: float,
             border: np.ndarray = None):
    super(DirichletOperator, self).__init__(shape=adjacency.shape, dtype=float)
    n = adjacency.shape[0]
    out_nodes = adjacency.dot(np.ones(n)).astype(bool)
    if border is None:
        border = np.zeros(n, dtype=bool)
    # Restrict the random walk to the interior (non-border) nodes.
    interior: sparse.csr_matrix = sparse.diags(~border, shape=(n, n),
                                               format='csr', dtype=float)
    self.a = damping_factor * interior.dot(normalize(adjacency))
    self.b = interior.dot(np.ones(n) - damping_factor * out_nodes) / n
def predict(self, x: spa.csr_matrix):
    '''Return a matrix with the predicted ratings.

    Applies logs to avoid underflow and to take into account that the
    probability of appearance of a word increases if the same word has
    already appeared before. Uses only matrix operations.'''
    if not self.is_trained:
        raise RuntimeError(
            'The Classifier has not been trained. Please use '
            'train(train_data: spa.csr_matrix, scores: np.ndarray, Laplace_alpha) '
            'to train the Classifier.')
    x.data = np.log(x.data + 1)
    self.log_cond_prob_matrix = spa.hstack(self.log_cond_prob_trans)
    log_freq = np.log(self.fractions)
    pre_final_result = x.dot(self.log_cond_prob_matrix) + log_freq
    final_prediction = (pre_final_result.argmax(axis=1) + 1).transpose()
    # Undo the in-place log transform on the input data.
    x.data = np.exp(x.data) - 1
    return final_prediction
def newAugmentUMR(cls, URM_train: csr_matrix, W_sparse: csr_matrix,
                  threshold_interactions: int, threshold_similarity: float):
    print("New Augmenting URM")

    # Vectorized candidate selection: pairs of users with enough common
    # interactions and high enough similarity.
    count_W_sparse = URM_train.dot(URM_train.transpose())
    count_mask: csr_matrix = count_W_sparse > threshold_interactions
    sim_mask: csr_matrix = W_sparse > threshold_similarity
    mask = count_mask.multiply(sim_mask)
    mask = triu(mask)
    mask = mask.tocoo()

    row_user = mask.row
    col_user = mask.col
    # Discard self-pairs.
    new_mask = row_user != col_user
    row_user = row_user[new_mask]
    col_user = col_user[new_mask]
    new_users = np.array([row_user, col_user])
    new_users = np.transpose(new_users)
    new_rows_list: list = new_users.tolist()
    print("Candidate list size: {}".format(len(new_rows_list)))

    # Create the new matrix
    print("Creating new URM...", end="")
    new_URM = None
    for candidate in new_rows_list:
        new_row = URM_train[[candidate[0], candidate[1]]].sum(axis=0)
        new_row = csr_matrix(new_row)
        new_row.data[new_row.data > 1] = 1
        if new_URM is None:
            new_URM = new_row
        else:
            new_URM = vstack([new_URM, new_row], format="csr")
    if new_URM is None:
        new_URM = URM_train
    else:
        new_URM = vstack([URM_train, new_URM], format="csr")
    print("Done")
    return new_URM
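# Hedged smoke test for the vectorized augmentation; `Augmenter` is a
# hypothetical stand-in for the class defining newAugmentUMR, and the
# snippet's imports (csr_matrix, triu, vstack, numpy) are assumed.
URM_demo = csr_matrix(np.array([[1, 1, 0, 0],
                                [1, 1, 0, 0],
                                [0, 0, 1, 0]], dtype=float))
W_demo = csr_matrix(np.ones((3, 3)))
augmented = Augmenter.newAugmentUMR(URM_demo, W_demo,
                                    threshold_interactions=1,
                                    threshold_similarity=0.5)
# Users 0 and 1 share two interactions (> 1) and similarity 1 (> 0.5), so
# their merged, binarized profile [1, 1, 0, 0] is appended as a new row.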
def calc_objective_per_iter(w_i, feature_mat: sparse.csr_matrix, empirical_counts,
                            num_h, true_tags, alpha):
    """
    Calculate the max entropy likelihood for an iterative optimization method.

    :param w_i: weights vector in iteration i
    :param feature_mat: sparse feature matrix, one row per (history, tag) pair
    :param empirical_counts: pre-computed empirical counts
    :param num_h: number of histories in the training data
    :param true_tags: index of the true tag for each history
    :param alpha: the regularization coefficient

    The function returns the Max Entropy likelihood (objective) and the
    objective gradient, both negated for use with a minimizer.
    """
    scores = feature_mat.dot(w_i)
    scores = scores.reshape((num_h, -1))
    exp_scores = np.exp(scores)
    sum_exp = np.sum(exp_scores, axis=1)
    probs = exp_scores / sum_exp.reshape((num_h, 1))
    expected_counts = feature_mat.transpose().dot(probs.reshape(-1)).reshape(-1)
    likelihood = np.sum(scores[np.arange(num_h), true_tags] - np.log(sum_exp)
                        - (alpha / 2) * (w_i ** 2))
    grad = empirical_counts - expected_counts - alpha * w_i
    return (-1) * likelihood, (-1) * grad
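# Because the function returns both the negated likelihood and its gradient,
# it plugs straight into scipy's L-BFGS wrapper. Everything suffixed `_demo`
# is a hypothetical placeholder for the feature-extraction outputs.
from scipy.optimize import fmin_l_bfgs_b

w_opt, f_opt, info = fmin_l_bfgs_b(
    func=calc_objective_per_iter,
    x0=np.zeros(n_features_demo),
    args=(feature_mat_demo, empirical_counts_demo, num_h_demo,
          true_tags_demo, 0.1))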
def augment_with_item_similarity_best_scores(urm: sps.csr_matrix, similarity, topK,
                                             value=0.5, remove_seen=True, users=None):
    # Create a copy of the urm
    augmented_urm = urm.tolil(copy=True).astype(float)

    # Compute the score matrix
    score_matrix = urm.dot(similarity).astype(float)

    # Remove items that have already been seen
    if remove_seen:
        indices_seen = urm.nonzero()
        score_matrix[indices_seen] = float("-inf")

    # Keep only the rows of the requested users
    if users is not None:
        score_matrix = score_matrix[users]

    # Find the topK generated interactions
    top_indices = score_matrix.data.argpartition(-topK)[-topK:]
    max_k = score_matrix.data[top_indices].min()
    rows, cols, values = sps.find(score_matrix)
    if users is not None:
        # Map row indices of the filtered matrix back to user ids.
        rows = np.asarray(users)[rows]
    user_item = [(user, item) for user, item, data in zip(rows, cols, values)
                 if data >= max_k]

    # Insert the best items in the urm matrix
    for user, item in user_item:
        augmented_urm[user, item] += value

    # Return the augmented urm
    return augmented_urm.tocsr()
def tree_sampling_divergence(adjacency: sparse.csr_matrix, dendrogram: np.ndarray,
                             weights: str = 'degree', normalized: bool = True) -> float:
    """Tree sampling divergence of a hierarchy (quality metric).

    Parameters
    ----------
    adjacency :
        Adjacency matrix of the graph.
    dendrogram :
        Dendrogram.
    weights :
        Weights of nodes. ``'degree'`` (default) or ``'uniform'``.
    normalized :
        If ``True``, normalized score (between 0 and 1).

    Returns
    -------
    score : float
        Score.

    Example
    -------
    >>> from sknetwork.hierarchy import tree_sampling_divergence, Paris
    >>> from sknetwork.data import house
    >>> paris = Paris()
    >>> adjacency = house()
    >>> dendrogram = paris.fit_transform(adjacency)
    >>> score = tree_sampling_divergence(adjacency, dendrogram)
    >>> np.round(score, 2)
    0.05

    References
    ----------
    Charpentier, B. & Bonald, T. (2019).
    `Tree Sampling Divergence: An Information-Theoretic Metric for Hierarchical Graph Clustering.
    <https://hal.telecom-paristech.fr/hal-02144394/document>`_
    Proceedings of IJCAI.
    """
    adjacency = check_format(adjacency)
    check_square(adjacency)
    check_min_nnz(adjacency.nnz, 1)
    adjacency = adjacency.astype(float)
    n = adjacency.shape[0]
    check_min_size(n, 2)

    adjacency.data /= adjacency.data.sum()
    edge_sampling, node_sampling, _ = get_sampling_distributions(
        adjacency, dendrogram, weights)
    index = np.where(edge_sampling)[0]
    score = edge_sampling[index].dot(
        np.log(edge_sampling[index] / node_sampling[index]))

    if normalized:
        weights_row = get_probs(weights, adjacency)
        weights_col = get_probs(weights, adjacency.T)
        inv_out_weights = sparse.diags(weights_row, shape=(n, n), format='csr')
        inv_out_weights.data = 1 / inv_out_weights.data
        inv_in_weights = sparse.diags(weights_col, shape=(n, n), format='csr')
        inv_in_weights.data = 1 / inv_in_weights.data
        sampling_ratio = inv_out_weights.dot(adjacency.dot(inv_in_weights))
        inv_out_weights.data = np.ones(len(inv_out_weights.data))
        inv_in_weights.data = np.ones(len(inv_in_weights.data))
        edge_sampling = inv_out_weights.dot(adjacency.dot(inv_in_weights))
        mutual_information = edge_sampling.data.dot(np.log(sampling_ratio.data))
        if mutual_information > 0:
            score /= mutual_information
    return score
def tree_sampling_divergence(adjacency: sparse.csr_matrix, dendrogram: np.ndarray,
                             weights: str = 'degree', normalized: bool = True) -> float:
    """Tree sampling divergence of a hierarchy (quality metric).

    * Graphs
    * Digraphs

    Parameters
    ----------
    adjacency :
        Adjacency matrix of the graph.
    dendrogram :
        Dendrogram.
    weights :
        Weights of nodes. ``'degree'`` (default) or ``'uniform'``.
    normalized :
        If ``True``, normalized score (between 0 and 1).

    Returns
    -------
    score : float
        Score.

    Example
    -------
    >>> from sknetwork.hierarchy import tree_sampling_divergence, Paris
    >>> from sknetwork.data import house
    >>> paris = Paris()
    >>> adjacency = house()
    >>> dendrogram = paris.fit_transform(adjacency)
    >>> score = tree_sampling_divergence(adjacency, dendrogram)
    >>> np.round(score, 2)
    0.52

    References
    ----------
    Charpentier, B. & Bonald, T. (2019).
    `Tree Sampling Divergence: An Information-Theoretic Metric for Hierarchical Graph Clustering.
    <https://hal.telecom-paristech.fr/hal-02144394/document>`_
    Proceedings of IJCAI.
    """
    adjacency = check_format(adjacency)
    check_square(adjacency)
    check_min_nnz(adjacency.nnz, 1)
    adjacency = adjacency.astype(float)
    n = adjacency.shape[0]
    check_min_size(n, 2)

    adjacency.data /= adjacency.data.sum()
    aggregate_graph, height, cluster_weight, edge_sampling, weights_row, weights_col = \
        _instanciate_vars(adjacency, weights)
    node_sampling = np.zeros(n - 1)

    for t in range(n - 1):
        i = int(dendrogram[t][0])
        j = int(dendrogram[t][1])
        if i >= n and height[i - n] == dendrogram[t][2]:
            edge_sampling[t] = edge_sampling[i - n]
            edge_sampling[i - n] = 0
            node_sampling[t] = node_sampling[i - n]
        elif j >= n and height[j - n] == dendrogram[t][2]:
            edge_sampling[t] = edge_sampling[j - n]
            edge_sampling[j - n] = 0
            node_sampling[t] = node_sampling[j - n]
        if j in aggregate_graph.neighbors[i]:
            edge_sampling[t] += aggregate_graph.neighbors[i][j]
        node_sampling[t] += aggregate_graph.cluster_out_weights[i] * aggregate_graph.cluster_in_weights[j] + \
            aggregate_graph.cluster_out_weights[j] * aggregate_graph.cluster_in_weights[i]
        height[t] = dendrogram[t][2]
        aggregate_graph.merge(i, j)

    index = np.where(edge_sampling)[0]
    score = edge_sampling[index].dot(
        np.log(edge_sampling[index] / node_sampling[index]))

    if normalized:
        inv_out_weights = sparse.diags(weights_row, shape=(n, n), format='csr')
        inv_out_weights.data = 1 / inv_out_weights.data
        inv_in_weights = sparse.diags(weights_col, shape=(n, n), format='csr')
        inv_in_weights.data = 1 / inv_in_weights.data
        sampling_ratio = inv_out_weights.dot(adjacency.dot(inv_in_weights))
        inv_out_weights.data = np.ones(len(inv_out_weights.data))
        inv_in_weights.data = np.ones(len(inv_in_weights.data))
        edge_sampling = inv_out_weights.dot(adjacency.dot(inv_in_weights))
        mutual_information = edge_sampling.data.dot(np.log(sampling_ratio.data))
        score /= mutual_information
    return score
def predict(self, X: csr_matrix):
    return X.dot(self.beta)
def predict(self, X: csr_matrix, wr: np.ndarray):
    # Weighted blend of the two fitted coefficient vectors.
    return wr * X.dot(self.beta_lm) + (1 - wr) * X.dot(self.beta_clm)
def svg_graph(adjacency: sparse.csr_matrix, position: Optional[np.ndarray] = None,
              names: Optional[np.ndarray] = None,
              labels: Optional[Union[dict, np.ndarray]] = None,
              scores: Optional[np.ndarray] = None,
              seeds: Union[list, dict] = None,
              width: float = 400, height: float = 300,
              margin: float = 20, margin_text: float = 3, scale: float = 1,
              node_order: Optional[np.ndarray] = None,
              node_size: float = 7, node_size_min: float = 1,
              node_size_max: float = 20,
              display_node_weight: bool = False,
              node_weights: Optional[np.ndarray] = None,
              node_width: float = 1, node_width_max: float = 3,
              node_color: str = 'gray',
              display_edges: bool = True,
              edge_width: float = 1, edge_width_min: float = 0.5,
              edge_width_max: float = 20, edge_weight: bool = True,
              edge_color: Optional[str] = None,
              font_size: int = 12, directed: bool = False,
              filename: Optional[str] = None) -> str:
    """Return SVG image of a graph.

    Parameters
    ----------
    adjacency :
        Adjacency matrix of the graph.
    position :
        Positions of the nodes.
    names :
        Names of the nodes.
    labels :
        Labels of the nodes (negative values mean no label).
    scores :
        Scores of the nodes (measure of importance).
    seeds :
        Nodes to be highlighted (if dict, only keys are considered).
    width :
        Width of the image.
    height :
        Height of the image.
    margin :
        Margin of the image.
    margin_text :
        Margin between node and text.
    scale :
        Multiplicative factor on the dimensions of the image.
    node_order :
        Order in which nodes are displayed.
    node_size :
        Size of nodes.
    node_size_min :
        Minimum size of a node.
    node_size_max :
        Maximum size of a node.
    node_width :
        Width of node circle.
    node_width_max :
        Maximum width of node circle.
    node_color :
        Default color of nodes (svg color).
    display_node_weight :
        Display node weights by node size.
    node_weights :
        Node weights (used only if **display_node_weight** is ``True``).
    display_edges :
        If ``True``, display edges.
    edge_width :
        Width of edges.
    edge_width_min :
        Minimum width of edges.
    edge_width_max :
        Maximum width of edges.
    edge_weight :
        Display edge weights with edge widths.
    edge_color :
        Default color of edges (svg color).
    font_size :
        Font size.
    directed :
        If ``True``, considers the graph as directed.
    filename :
        Filename for saving image (optional).

    Returns
    -------
    image : str
        SVG image.
    Example
    -------
    >>> from sknetwork.data import karate_club
    >>> graph = karate_club(True)
    >>> adjacency = graph.adjacency
    >>> position = graph.position
    >>> from sknetwork.visualization import svg_graph
    >>> image = svg_graph(adjacency, position)
    >>> image[1:4]
    'svg'
    """
    n = adjacency.shape[0]

    # node order
    if node_order is None:
        node_order = np.arange(n)

    # position
    if position is None:
        spring = Spring()
        position = spring.fit_transform(adjacency)

    # colors
    colors = get_colors(n, labels, scores, node_color)
    if edge_color is None:
        if names is None:
            edge_color = 'black'
        else:
            edge_color = 'gray'

    # node sizes
    if node_weights is None:
        node_weights = adjacency.dot(np.ones(n))
    node_sizes = get_node_sizes(node_weights, node_size, node_size_min,
                                node_size_max, display_node_weight)

    # node widths
    node_widths = get_node_widths(n, seeds, node_width, node_width_max)

    # edge widths
    adjacency_ = sparse.coo_matrix(adjacency)
    edge_widths = get_edge_widths(adjacency_.data, edge_width, edge_width_min,
                                  edge_width_max, edge_weight)

    # rescaling
    position, width, height = rescale(position, width, height, margin,
                                      node_size_max, display_node_weight)
    if names is not None:
        text_length = np.max(np.array([len(str(name)) for name in names]))
        width += text_length * font_size * .5

    # scaling
    position *= scale
    height *= scale
    width *= scale

    svg = """<svg width="{}" height="{}" xmlns="http://www.w3.org/2000/svg">""".format(width, height)
    if directed:
        svg += """<defs><marker id="arrow" markerWidth="10" markerHeight="10" refX="9" refY="3" orient="auto" >"""
        svg += """<path d="M0,0 L0,6 L9,3 z" fill="{}"/></marker></defs>""".format(edge_color)

    # edges
    if display_edges:
        n_edges = len(adjacency_.row)
        for ix in range(n_edges):
            i = adjacency_.row[ix]
            j = adjacency_.col[ix]
            if directed:
                svg += svg_edge_directed(pos_1=position[i], pos_2=position[j],
                                         stroke_width=edge_widths[ix],
                                         stroke_color=edge_color,
                                         node_size=node_sizes[j])
            else:
                svg += svg_edge(pos_1=position[i], pos_2=position[j],
                                stroke_width=edge_widths[ix],
                                stroke_color=edge_color)

    # nodes
    for i in node_order:
        svg += svg_node(position[i], node_sizes[i], colors[i], node_widths[i])

    # text
    if names is not None:
        for i in range(n):
            svg += svg_text(position[i] + node_sizes[i] + (margin_text, 0),
                            names[i], font_size)
    svg += """</svg>"""

    if filename is not None:
        with open(filename + '.svg', 'w') as f:
            f.write(svg)

    return svg
def compute_theta(beliefs: csr_matrix, degrees: csr_matrix):
    # compute_theta(beliefs, degrees)[t, 0] returns theta of community t
    return degrees.dot(beliefs)
def fit(self, adjacency: sparse.csr_matrix,
        node_weights: Union[str, np.ndarray] = 'degree', reorder: bool = True):
    """
    Agglomerative clustering using the nearest neighbor chain.

    Parameters
    ----------
    adjacency :
        Adjacency matrix of the graph to cluster.
    node_weights :
        Node weights used in the linkage.
    reorder :
        If True, reorder the dendrogram in increasing order of heights.

    Returns
    -------
    self: :class:`Paris`
    """
    if type(adjacency) != sparse.csr_matrix:
        raise TypeError('The adjacency matrix must be in a scipy compressed '
                        'sparse row (csr) format.')
    if adjacency.shape[0] != adjacency.shape[1]:
        raise ValueError('The adjacency matrix must be square.')
    if adjacency.shape[0] <= 1:
        raise ValueError('The graph must contain at least two nodes.')
    if (adjacency != adjacency.T).nnz != 0:
        raise ValueError('The graph cannot be directed. '
                         'Please fit a symmetric adjacency matrix.')

    n_nodes = adjacency.shape[0]
    if type(node_weights) == np.ndarray:
        if len(node_weights) != n_nodes:
            raise ValueError('The number of node weights must match the number of nodes.')
        else:
            node_probs = node_weights
    elif type(node_weights) == str:
        if node_weights == 'degree':
            node_probs = adjacency.dot(np.ones(n_nodes))
        elif node_weights == 'uniform':
            node_probs = np.ones(n_nodes)
        else:
            raise ValueError('Unknown distribution of node weights.')
    else:
        raise TypeError('Node weights must be a known distribution '
                        '("degree" or "uniform" string) or a custom NumPy array.')

    if np.any(node_probs <= 0):
        raise ValueError('All node weights must be positive.')
    else:
        node_probs = node_probs / np.sum(node_probs)

    aggregate_graph = AggregateGraph(adjacency, node_probs)

    connected_components = []
    dendrogram = []

    while len(aggregate_graph.cluster_sizes) > 0:
        # Pick an arbitrary remaining cluster to start the chain.
        node = None
        for node in aggregate_graph.cluster_sizes:
            break
        chain = [node]
        while chain:
            node = chain.pop()
            if aggregate_graph.graph[node]:
                max_sim = -float("inf")
                nearest_neighbor = None
                for neighbor in aggregate_graph.graph[node]:
                    sim = aggregate_graph.graph[node][neighbor] / aggregate_graph.cluster_probs[node] / \
                        aggregate_graph.cluster_probs[neighbor]
                    if sim > max_sim:
                        nearest_neighbor = neighbor
                        max_sim = sim
                    elif sim == max_sim:
                        nearest_neighbor = min(neighbor, nearest_neighbor)
                if chain:
                    nearest_neighbor_last = chain.pop()
                    if nearest_neighbor_last == nearest_neighbor:
                        # Reciprocal nearest neighbors: merge the two clusters.
                        dendrogram.append([node, nearest_neighbor, 1. / max_sim,
                                           aggregate_graph.cluster_sizes[node]
                                           + aggregate_graph.cluster_sizes[nearest_neighbor]])
                        aggregate_graph.merge(node, nearest_neighbor)
                    else:
                        chain.append(nearest_neighbor_last)
                        chain.append(node)
                        chain.append(nearest_neighbor)
                else:
                    chain.append(node)
                    chain.append(nearest_neighbor)
            else:
                connected_components.append((node, aggregate_graph.cluster_sizes[node]))
                del aggregate_graph.cluster_sizes[node]

    node, cluster_size = connected_components.pop()
    for next_node, next_cluster_size in connected_components:
        cluster_size += next_cluster_size
        dendrogram.append([node, next_node, float("inf"), cluster_size])
        node = aggregate_graph.next_cluster
        aggregate_graph.next_cluster += 1

    dendrogram = np.array(dendrogram)
    if reorder:
        dendrogram = reorder_dendrogram(dendrogram)

    self.dendrogram_ = dendrogram
    return self
def compare_news_vector_with_(arr, vec: csr_matrix):
    # One minus the dot product (cosine distance when both vectors are
    # L2-normalized).
    return 1 - vec.dot(arr)
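# Small sketch: with unit-norm inputs, the value above is the cosine distance.
# Imports and names below are illustrative.
import numpy as np
from scipy.sparse import csr_matrix

news_vec = csr_matrix(np.array([[0.6, 0.8]]))  # unit norm
other = np.array([1.0, 0.0])                   # unit norm
print(compare_news_vector_with_(other, news_vec))  # [0.4]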
def left_sparse_dot(self, matrix: sparse.csr_matrix):
    """Left dot product with a sparse matrix."""
    return SparseLR(matrix.dot(self.sparse_mat),
                    [(matrix.dot(x), y) for (x, y) in self.low_rank_tuples])
def svg_bigraph(biadjacency: sparse.csr_matrix,
                names_row: Optional[np.ndarray] = None,
                names_col: Optional[np.ndarray] = None,
                labels_row: Optional[Union[dict, np.ndarray]] = None,
                labels_col: Optional[Union[dict, np.ndarray]] = None,
                scores_row: Optional[Union[dict, np.ndarray]] = None,
                scores_col: Optional[Union[dict, np.ndarray]] = None,
                membership_row: Optional[sparse.csr_matrix] = None,
                membership_col: Optional[sparse.csr_matrix] = None,
                seeds_row: Union[list, dict] = None,
                seeds_col: Union[list, dict] = None,
                position_row: Optional[np.ndarray] = None,
                position_col: Optional[np.ndarray] = None,
                reorder: bool = True,
                width: Optional[float] = 400,
                height: Optional[float] = 300,
                margin: float = 20, margin_text: float = 3, scale: float = 1,
                node_size: float = 7, node_size_min: float = 1,
                node_size_max: float = 20,
                display_node_weight: bool = False,
                node_weights_row: Optional[np.ndarray] = None,
                node_weights_col: Optional[np.ndarray] = None,
                node_width: float = 1, node_width_max: float = 3,
                color_row: str = 'gray', color_col: str = 'gray',
                label_colors: Optional[Iterable] = None,
                display_edges: bool = True,
                edge_labels: Optional[list] = None,
                edge_width: float = 1, edge_width_min: float = 0.5,
                edge_width_max: float = 10, edge_color: str = 'black',
                display_edge_weight: bool = True,
                font_size: int = 12,
                filename: Optional[str] = None) -> str:
    """Return SVG image of a bigraph.

    Parameters
    ----------
    biadjacency :
        Biadjacency matrix of the graph.
    names_row :
        Names of the rows.
    names_col :
        Names of the columns.
    labels_row :
        Labels of the rows (negative values mean no label).
    labels_col :
        Labels of the columns (negative values mean no label).
    scores_row :
        Scores of the rows (measure of importance).
    scores_col :
        Scores of the columns (measure of importance).
    membership_row :
        Membership of the rows (label distribution).
    membership_col :
        Membership of the columns (label distribution).
    seeds_row :
        Rows to be highlighted (if dict, only keys are considered).
    seeds_col :
        Columns to be highlighted (if dict, only keys are considered).
    position_row :
        Positions of the rows.
    position_col :
        Positions of the columns.
    reorder :
        Use clustering to order nodes.
    width :
        Width of the image.
    height :
        Height of the image.
    margin :
        Margin of the image.
    margin_text :
        Margin between node and text.
    scale :
        Multiplicative factor on the dimensions of the image.
    node_size :
        Size of nodes.
    node_size_min :
        Minimum size of nodes.
    node_size_max :
        Maximum size of nodes.
    display_node_weight :
        If ``True``, display node weights through node size.
    node_weights_row :
        Weights of rows (used only if **display_node_weight** is ``True``).
    node_weights_col :
        Weights of columns (used only if **display_node_weight** is ``True``).
    node_width :
        Width of node circle.
    node_width_max :
        Maximum width of node circle.
    color_row :
        Default color of rows (svg color).
    color_col :
        Default color of cols (svg color).
    label_colors :
        Colors of the labels (svg color).
    display_edges :
        If ``True``, display edges.
    edge_labels :
        Labels of the edges, as a list of tuples (source, destination, label).
    edge_width :
        Width of edges.
    edge_width_min :
        Minimum width of edges.
    edge_width_max :
        Maximum width of edges.
    display_edge_weight :
        If ``True``, display edge weights through edge widths.
    edge_color :
        Default color of edges (svg color).
    font_size :
        Font size.
    filename :
        Filename for saving image (optional).

    Returns
    -------
    image : str
        SVG image.
    Example
    -------
    >>> from sknetwork.data import movie_actor
    >>> biadjacency = movie_actor()
    >>> from sknetwork.visualization import svg_bigraph
    >>> image = svg_bigraph(biadjacency)
    >>> image[1:4]
    'svg'
    """
    n_row, n_col = biadjacency.shape

    # node positions
    if position_row is None or position_col is None:
        position_row = np.zeros((n_row, 2))
        position_col = np.ones((n_col, 2))
        if reorder:
            bilouvain = BiLouvain()
            bilouvain.fit(biadjacency)
            index_row = np.argsort(bilouvain.labels_row_)
            index_col = np.argsort(bilouvain.labels_col_)
        else:
            index_row = np.arange(n_row)
            index_col = np.arange(n_col)
        position_row[index_row, 1] = np.arange(n_row)
        position_col[index_col, 1] = np.arange(n_col) + .5 * (n_row - n_col)
    position = np.vstack((position_row, position_col))

    # node colors
    colors_row = get_node_colors(n_row, labels_row, scores_row, membership_row,
                                 color_row, label_colors)
    colors_col = get_node_colors(n_col, labels_col, scores_col, membership_col,
                                 color_col, label_colors)

    # node sizes
    if node_weights_row is None:
        node_weights_row = biadjacency.dot(np.ones(n_col))
    if node_weights_col is None:
        node_weights_col = biadjacency.T.dot(np.ones(n_row))
    node_sizes_row, node_sizes_col = get_node_sizes_bipartite(
        node_weights_row, node_weights_col,
        node_size, node_size_min, node_size_max, display_node_weight)

    # node widths
    node_widths_row = get_node_widths(n_row, seeds_row, node_width, node_width_max)
    node_widths_col = get_node_widths(n_col, seeds_col, node_width, node_width_max)

    # rescaling
    if not width and not height:
        raise ValueError("You must specify either the width or the height of the image.")
    position, width, height = rescale(position, width, height, margin, node_size,
                                      node_size_max, display_node_weight)

    # node names
    if names_row is not None:
        text_length = np.max(np.array([len(str(name)) for name in names_row]))
        position[:, 0] += text_length * font_size * .5
        width += text_length * font_size * .5
    if names_col is not None:
        text_length = np.max(np.array([len(str(name)) for name in names_col]))
        width += text_length * font_size * .5

    # scaling
    position *= scale
    height *= scale
    width *= scale
    position_row = position[:n_row]
    position_col = position[n_row:]

    svg = """<svg width="{}" height="{}" xmlns="http://www.w3.org/2000/svg">\n""".format(width, height)

    # edges
    if display_edges:
        biadjacency_coo = sparse.coo_matrix(biadjacency)
        if edge_color is None:
            if names_row is None and names_col is None:
                edge_color = 'black'
            else:
                edge_color = 'gray'
        edge_colors, edge_order, edge_colors_residual = get_edge_colors(
            biadjacency, edge_labels, edge_color, label_colors)
        edge_widths = get_edge_widths(biadjacency_coo, edge_width, edge_width_min,
                                      edge_width_max, display_edge_weight)
        for ix in edge_order:
            i = biadjacency_coo.row[ix]
            j = biadjacency_coo.col[ix]
            color = edge_colors[ix]
            svg += svg_edge(pos_1=position_row[i], pos_2=position_col[j],
                            edge_width=edge_widths[ix], edge_color=color)
        for i, j, color in edge_colors_residual:
            svg += svg_edge(pos_1=position_row[i], pos_2=position_col[j],
                            edge_width=edge_width, edge_color=color)

    # nodes
    for i in range(n_row):
        if membership_row is None:
            svg += svg_node(position_row[i], node_sizes_row[i], colors_row[i],
                            node_widths_row[i])
        else:
            if membership_row[i].nnz == 1:
                index = membership_row[i].indices[0]
                svg += svg_node(position_row[i], node_sizes_row[i],
                                colors_row[index], node_widths_row[i])
            else:
                svg += svg_pie_chart_node(position_row[i], node_sizes_row[i],
                                          membership_row[i].todense(), colors_row,
                                          node_widths_row[i])
    for i in range(n_col):
        if membership_col is None:
            svg += svg_node(position_col[i], node_sizes_col[i], colors_col[i],
                            node_widths_col[i])
        else:
            if membership_col[i].nnz == 1:
                index = membership_col[i].indices[0]
                svg += svg_node(position_col[i], node_sizes_col[i],
                                colors_col[index], node_widths_col[i])
            else:
                svg += svg_pie_chart_node(position_col[i], node_sizes_col[i],
                                          membership_col[i].todense(), colors_col,
                                          node_widths_col[i])

    # text
    if names_row is not None:
        for i in range(n_row):
            svg += svg_text(position_row[i] - (margin_text + node_sizes_row[i], 0),
                            names_row[i], font_size, True)
    if names_col is not None:
        for i in range(n_col):
            svg += svg_text(position_col[i] + (margin_text + node_sizes_col[i], 0),
                            names_col[i], font_size)
    svg += """</svg>\n"""

    if filename is not None:
        with open(filename + '.svg', 'w') as f:
            f.write(svg)

    return svg
def left_sparse_dot(self, matrix: sparse.csr_matrix):
    """Left dot product with a sparse matrix"""
    self.backward = matrix.dot(self.backward)
    return self
def pcg(A: sp.csr_matrix, b: np.ndarray, tol: float = 1e-5, maxiter: int = 100,
        M1: sp.csr_matrix = None, M2: sp.csr_matrix = None,
        x0: np.ndarray = None) -> (np.ndarray, int, int):
    """
    PCG   Preconditioned Conjugate Gradients Method.

    X = PCG(A,B) attempts to solve the system of linear equations A*X=B for
    X. The N-by-N coefficient matrix A must be symmetric and positive
    definite and the right hand side column vector B must have length N.

    X = PCG(AFUN,B) accepts a function handle AFUN instead of the matrix A.
    AFUN(X) accepts a vector input X and returns the matrix-vector product
    A*X. In all of the following syntaxes, you can replace A by AFUN.

    X = PCG(A,B,TOL) specifies the tolerance of the method. If TOL is []
    then PCG uses the default, 1e-6.

    X = PCG(A,B,TOL,MAXIT) specifies the maximum number of iterations. If
    MAXIT is [] then PCG uses the default, min(N,20).

    X = PCG(A,B,TOL,MAXIT,M) and X = PCG(A,B,TOL,MAXIT,M1,M2) use symmetric
    positive definite preconditioner M or M=M1*M2 and effectively solve the
    system inv(M)*A*X = inv(M)*B for X. If M is [] then a preconditioner is
    not applied. M may be a function handle MFUN returning M\\X.

    X = PCG(A,B,TOL,MAXIT,M1,M2,X0) specifies the initial guess. If X0 is
    [] then PCG uses the default, an all zero vector.

    [X,FLAG] = PCG(A,B,...) also returns a convergence FLAG:
      0 PCG converged to the desired tolerance TOL within MAXIT iterations
      1 PCG iterated MAXIT times but did not converge.
      2 preconditioner M was ill-conditioned.
      3 PCG stagnated (two consecutive iterates were the same).
      4 one of the scalar quantities calculated during PCG became too small
        or too large to continue computing.

    [X,FLAG,RELRES] = PCG(A,B,...) also returns the relative residual
    NORM(B-A*X)/NORM(B). If FLAG is 0, then RELRES <= TOL.

    [X,FLAG,RELRES,ITER] = PCG(A,B,...) also returns the iteration number
    at which X was computed: 0 <= ITER <= MAXIT.

    [X,FLAG,RELRES,ITER,RESVEC] = PCG(A,B,...) also returns a vector of the
    estimated residual norms at each iteration including NORM(B-A*X0).

    Example:
      n1 = 21; A = gallery('moler',n1); b1 = A*ones(n1,1);
      tol = 1e-6; maxit = 15; M = diag([10:-1:1 1 1:10]);
      [x1,flag1,rr1,it1,rv1] = pcg(A,b1,tol,maxit,M);
    Or use this parameterized matrix-vector product function:
      afun = @(x,n)gallery('moler',n)*x;
      n2 = 21; b2 = afun(ones(n2,1),n2);
      [x2,flag2,rr2,it2,rv2] = pcg(@(x)afun(x,n2),b2,tol,maxit,M);

    Class support for inputs A,B,M1,M2,X0 and the output of AFUN:
      float: double

    See also BICG, BICGSTAB, BICGSTABL, CGS, GMRES, LSQR, MINRES, QMR,
    SYMMLQ, TFQMR, ICHOL, FUNCTION_HANDLE.

    Copyright 1984-2013 The MathWorks, Inc.

    Note: this Python port returns only the tuple (x, flag, iter).
""" n = A.shape[0] n2b = np.linalg.norm(b) if n2b == 0: return np.zeros((n)), 0, 0 if x0 == None: x = np.zeros((n)) else: x = x0 flag = 1 it = 0 xmin = x # Iterate which has minimal residual so far imin = 0 # Iteration at which xmin was computed tolb = tol * n2b # Relative tolerance r = b - A.dot(x.transpose()) normr = np.linalg.norm(r) # Norm of residual normr_act = normr eps = 2.2204e-16 if normr <= tolb: return x, 0, 0 normrmin = normr rho = 1 stag = 0 moresteps = 0 maxmsteps = min(n // 50, 5, n - maxiter) maxstagsteps = 3 ii = 0 while ii < maxiter: if M1 != None: y = spsolve(M1, r) else: y = r if M2 != None: z = spsolve(M2, y) else: z = y rho1 = rho rho = r.dot(z) if (rho == 0) or np.isinf(rho): flag = 4 break if (ii == 0): p = z else: beta = rho / rho1 if ((beta == 0) or np.isinf(beta)): flag = 4 break p = z + beta * p q = A.dot(p) pq = p.dot(q) if (pq <= 0) or np.isinf(pq): flag = 4 break else: alpha = rho / pq # Check for stagnation of the method if norm(p) * abs(alpha) < eps * norm(x): stag = stag + 1 else: stag = 0 x = x + alpha * p # form new itate r = r - alpha * q normr = norm(r) normr_act = normr # check for convergence if normr <= tolb or stag >= maxstagsteps or moresteps: r = b - A.dot(x) normr_act = norm(r) if normr_act <= tolb: flag = 0 it = ii break else: if stag >= maxstagsteps and moresteps == 0: stag = 0 moresteps = moresteps + 1 if moresteps >= maxmsteps: flag = 3 it = ii break if normr_act < normrmin: # update minimal norm quantities normrmin = normr_act xmin = x imin = ii if stag >= maxstagsteps: flag = 3 break ii += 1 # returned solution is first with minimal residual if flag: r_comp = b - A.dot(xmin) if norm(r_comp) <= normr_act: x = xmin it = imin else: it = ii return x, flag, it
def dasgupta_cost(adjacency: sparse.csr_matrix, dendrogram: np.ndarray,
                  node_weights: Union[str, np.ndarray] = 'uniform',
                  normalized: bool = True) -> float:
    """Dasgupta's cost of a hierarchy (cost metric).

    Parameters
    ----------
    adjacency :
        Adjacency matrix of the graph.
    dendrogram :
        Each row contains the two merged nodes, the height in the dendrogram,
        and the size of the corresponding cluster.
    node_weights :
        Vector of node weights. Default = 'uniform', weight 1 for each node.
    normalized :
        If True, normalized by the number of nodes of the graph.

    Returns
    -------
    cost : float
        Dasgupta's cost of the hierarchy.
        Normalized by the number of nodes to get a value between 0 and 1.

    References
    ----------
    S. Dasgupta. A cost function for similarity-based hierarchical clustering.
    In Proceedings of ACM Symposium on Theory of Computing, 2016.
    """
    if type(adjacency) != sparse.csr_matrix:
        raise TypeError('The adjacency matrix must be in a scipy compressed '
                        'sparse row (csr) format.')
    # check that the graph is not directed
    if adjacency.shape[0] != adjacency.shape[1]:
        raise ValueError('The adjacency matrix must be square.')
    if (adjacency != adjacency.T).nnz != 0:
        raise ValueError('The graph cannot be directed. '
                         'Please fit a symmetric adjacency matrix.')

    n_nodes = adjacency.shape[0]
    if type(node_weights) == np.ndarray:
        if len(node_weights) != n_nodes:
            raise ValueError('The number of node weights must match the number of nodes.')
        else:
            node_weights_vec = node_weights
    elif type(node_weights) == str:
        if node_weights == 'degree':
            node_weights_vec = adjacency.dot(np.ones(n_nodes))
        elif node_weights == 'uniform':
            node_weights_vec = np.ones(n_nodes)
        else:
            raise ValueError('Unknown distribution of node weights.')
    else:
        raise TypeError('Node weights must be a known distribution '
                        '("degree" or "uniform" string) or a custom NumPy array.')

    if np.any(node_weights_vec <= 0):
        raise ValueError('All node weights must be positive.')
    else:
        node_weights_vec = node_weights_vec / np.sum(node_weights_vec)

    aggregate_graph = AggregateGraph(adjacency, node_weights_vec)

    height = np.zeros(n_nodes - 1)
    edge_sampling = np.zeros(n_nodes - 1)
    cluster_weight = np.zeros(n_nodes - 1)
    for t in range(n_nodes - 1):
        node1 = int(dendrogram[t][0])
        node2 = int(dendrogram[t][1])
        if node1 >= n_nodes and height[node1 - n_nodes] == dendrogram[t][2]:
            edge_sampling[t] = edge_sampling[node1 - n_nodes]
            edge_sampling[node1 - n_nodes] = 0
        elif node2 >= n_nodes and height[node2 - n_nodes] == dendrogram[t][2]:
            edge_sampling[t] = edge_sampling[node2 - n_nodes]
            edge_sampling[node2 - n_nodes] = 0
        height[t] = dendrogram[t][2]
        edge_sampling[t] += 2 * aggregate_graph.graph[node1][node2]
        cluster_weight[t] = aggregate_graph.cluster_probs[node1] \
            + aggregate_graph.cluster_probs[node2]
        aggregate_graph.merge(node1, node2)

    cost: float = (edge_sampling * cluster_weight).sum()
    if not normalized:
        cost *= node_weights_vec.sum()
    return cost
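# Hedged usage mirroring the tree_sampling_divergence docstring examples
# earlier in this section; assumes the same sknetwork-style helpers
# (Paris, house) are importable in this codebase.
from sknetwork.hierarchy import Paris
from sknetwork.data import house

adjacency_demo = house()
dendrogram_demo = Paris().fit_transform(adjacency_demo)
cost_demo = dasgupta_cost(adjacency_demo, dendrogram_demo)
print(round(cost_demo, 2))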