Example #1
def logisticClassProbability(X: spa.csr_matrix, b: np.ndarray, W: np.ndarray):
    '''Return class probabilities given data and parameters.'''
    logits = X.dot(W.T) + b
    elogits = np.exp(logits - logits.max(axis=1)[:, np.newaxis])  # subtract the row-wise max for numerical stability
    elogits_sum = elogits.sum(axis=1)
    class_probs = elogits / elogits_sum[:, np.newaxis]
    return class_probs
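A quick sanity check for this softmax is that every row of the output sums to 1. A minimal sketch, assuming numpy and scipy.sparse are imported as np and spa as in the signature:

import numpy as np
import scipy.sparse as spa

rng = np.random.default_rng(0)
X = spa.random(5, 3, density=0.5, format='csr', random_state=0)  # 5 samples, 3 features
W = rng.normal(size=(4, 3))  # 4 classes
b = rng.normal(size=4)
probs = logisticClassProbability(X, b, W)
assert np.allclose(probs.sum(axis=1), 1.0)  # each row is a distribution over classes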
Example #2
def make_weights(distribution: str,
                 adjacency: sparse.csr_matrix) -> np.ndarray:
    """Array of weights from a matrix and a desired distribution.

   Parameters
   ----------
   distribution:
       Distribution for node sampling. Only ``'degree'`` or ``'uniform'`` are accepted.
   adjacency:
       The adjacency matrix of the neighbors.

   Returns
   -------
   node_weights: np.ndarray
       Weights of nodes.
    """
    n = adjacency.shape[0]
    distribution = distribution.lower()
    if distribution == 'degree':
        node_weights_vec = adjacency.dot(np.ones(adjacency.shape[1]))
    elif distribution == 'uniform':
        node_weights_vec = np.ones(n)
    else:
        raise ValueError('Unknown distribution of node weights.')
    return node_weights_vec
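A minimal usage sketch on a 3-node path graph (degrees 1, 2, 1), assuming scipy.sparse is imported as sparse:

import numpy as np
from scipy import sparse

adjacency = sparse.csr_matrix(np.array([[0, 1, 0],
                                        [1, 0, 1],
                                        [0, 1, 0]]))
print(make_weights('degree', adjacency))   # [1. 2. 1.]
print(make_weights('uniform', adjacency))  # [1. 1. 1.]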
Example #3
    def __init__(self,
                 adjacency: sparse.csr_matrix,
                 damping_factor: float = 0.85,
                 personalization=None,
                 fb_mode: bool = False,
                 verbose: bool = False):
        VerboseMixin.__init__(self, verbose)

        n1, n2 = adjacency.shape
        restart_prob: np.ndarray = restart_probability(n1, personalization)

        if fb_mode:
            restart_prob = np.hstack((restart_prob, np.zeros(n2)))
            adjacency = bipartite2undirected(adjacency)

        LinearOperator.__init__(self, shape=adjacency.shape, dtype=float)
        n = adjacency.shape[0]
        out_degrees = adjacency.dot(np.ones(n))

        damping_matrix = damping_factor * sparse.eye(n, format='csr')
        if fb_mode:
            damping_matrix.data[n1:] = 1

        self.a = (damping_matrix.dot(transition_matrix(adjacency))).T.tocsr()
        self.b = (np.ones(n) -
                  damping_factor * out_degrees.astype(bool)) * restart_prob
Example #4
    def left_sparse_dot(self, matrix: sparse.csr_matrix):
        """Left dot product with a sparse matrix

        Parameters
        ----------
        matrix:
            Matrix

        Returns
        -------
        SparseLR object

        """
        return SparseLR(matrix.dot(self.sparse_mat),
                        [(matrix.dot(x), y)
                         for (x, y) in self.low_rank_tuples])
Example #5
 def similarity_from_sparse(matrix_a: sparse.csr_matrix,
                            matrix_b: sparse.csr_matrix):
     intersection = matrix_a.dot(matrix_b.transpose()).toarray()
     norm_1 = np.array(matrix_a.multiply(matrix_a).sum(axis=1))
     norm_2 = np.array(matrix_b.multiply(matrix_b).sum(axis=1))
     union = norm_1 + norm_2.T - intersection
     return intersection / union
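For binary rows this computes the Jaccard index: the intersection is the dot product, the union is |a| + |b| minus the intersection. A worked sketch:

import numpy as np
from scipy import sparse

a = sparse.csr_matrix(np.array([[1, 1, 0, 0]]))
b = sparse.csr_matrix(np.array([[0, 1, 1, 0]]))
# intersection = 1, union = 2 + 2 - 1 = 3, so the similarity is 1/3
print(similarity_from_sparse(a, b))  # [[0.33333333]]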
Example #6
    def augmentURM(cls, URM_train: csr_matrix, W_sparse: csr_matrix,
                   threshold_interactions: int, threshold_similarity: float):
        """
        Augmentation of the URM train.

        :param threshold_interactions: here a threshold on the similarity is considered.
        Similarity matrix W_sparse will be considered for this purpose
        :param threshold_similarity: threshold used to insert a new row.
        In this case it is specified as the minimum number of interactions required to insert a new
        row in the URM train
        :param W_sparse: similarity matrix
        :param URM_train: URM train that will be augmented
        :return: a csr_matrix with augmented interactions according to the threshold
        """
        print("Augmenting URM")
        URM_train = URM_train.copy()

        # Count similarity
        count_W_sparse = URM_train.dot(URM_train.transpose())

        # Selecting new
        print("Selecting new candidates")
        users = np.arange(URM_train.shape[0])
        new_rows_list = []
        for i in range(0, users.size):
            if i % 5000 == 0:
                print("{} of {} done".format(i, users.size))
            candidates = count_W_sparse[i].indices  # candidate users
            data = count_W_sparse[i].data  # common-interaction counts for the candidates

            for j, candidate in enumerate(candidates):
                if (candidate > i
                        and data[j] > threshold_interactions
                        and W_sparse[i, candidate] > threshold_similarity):
                    new_rows_list.append([i, candidate])

        print("Candidate list size: {}".format(len(new_rows_list)))

        # Creating the new matrix
        print("Creating new URM...", end="")
        new_URM = None
        for candidate in new_rows_list:
            new_row = URM_train[[candidate[0], candidate[1]]].sum(axis=0)
            new_row = csr_matrix(new_row)
            new_row.data[new_row.data > 1] = 1

            if new_URM is None:
                new_URM = new_row
            else:
                new_URM = vstack([new_URM, new_row], format="csr")

        if new_URM is None:
            new_URM = URM_train
        else:
            new_URM = vstack([URM_train, new_URM], format="csr")

        print("Done")

        return new_URM
Example #7
 def gradient(self, beta: np.ndarray, X: csr_matrix, y: np.ndarray,
              l2reg: float) -> np.ndarray:
     m = X.shape[0]
     z = X.dot(beta) - y
     grad = X.T.dot(z)
     # L2 regularization term
     grad += l2reg * np.append(0, beta[1:])
     grad /= m
     return grad
Example #8
def _precompute_representation(
        features: sp.csr_matrix, feature_embeddings: np.ndarray,
        feature_biases: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    """
    :param: features           csr_matrix         [n_objects, n_features]
    :param: feature_embeddings np.ndarray(float)  [n_features, no_component]
    :param: feature_biases     np.ndarray(float)  [n_features]

    :return:
    TODO:
    tuple of
    - representation    np.ndarray(float)  [n_objects, no_component+1]
    - bias repr
    """

    representation = features.dot(feature_embeddings)
    representation_bias = features.dot(feature_biases)
    return representation, representation_bias
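A shape-check sketch matching the code above (the returned representation has no_component columns; the original docstring's no_component+1 presumably refers to a later concatenation of the bias):

import numpy as np
import scipy.sparse as sp

features = sp.random(4, 6, density=0.5, format='csr', random_state=0)  # [n_objects, n_features]
feature_embeddings = np.random.rand(6, 3)  # [n_features, no_component]
feature_biases = np.random.rand(6)         # [n_features]
rep, rep_bias = _precompute_representation(features, feature_embeddings, feature_biases)
print(rep.shape, rep_bias.shape)  # (4, 3) (4,)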
Example #9
 def gradient(self, beta: np.ndarray, X: csr_matrix, y: np.ndarray,
              is_win: np.ndarray, f, sigma: float, l2reg: float) -> float:
     z = (X.dot(beta) - y) / sigma
     z_lose = -(np.exp(norm.logpdf(z) - norm.logcdf(z)))
     #z_lose = -(norm.pdf(z) / norm.cdf(z))
     z = f(z, z_lose, is_win)
     grad = X.T.dot(z) / sigma
     # L2 regularization term
     grad += l2reg * np.append(0, beta[1:])
     return grad
Example #10
 def loss_function(self, beta: np.ndarray, X: csr_matrix, y: np.ndarray,
                   l2reg: float) -> float:
     m = X.shape[0]
     # squared loss
     z = X.dot(beta) - y
     #loss = sum(-norm.logpdf(z))
     loss = sum(z**2)
     # L2 regularization term
     loss += l2reg * sum(beta[1:]**2)
     loss /= (2 * m)
     return loss
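This loss pairs with the gradient of Example #7: the loss is (||X.dot(beta) - y||^2 + l2reg * ||beta[1:]||^2) / (2m), and that gradient is its exact derivative. A finite-difference check, treating both methods as plain functions (self is unused, so None is passed for it):

import numpy as np
import scipy.sparse as sp

rng = np.random.default_rng(0)
X = sp.random(20, 5, density=0.5, format='csr', random_state=0)
y = rng.normal(size=20)
beta = rng.normal(size=5)
eps = 1e-6
num_grad = np.array([(loss_function(None, beta + eps * e, X, y, 0.1) -
                      loss_function(None, beta - eps * e, X, y, 0.1)) / (2 * eps)
                     for e in np.eye(5)])
assert np.allclose(num_grad, gradient(None, beta, X, y, 0.1), atol=1e-6)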
Example #11
    def _secondary_outputs(self, input_matrix: sparse.csr_matrix):
        """Compute different variables from labels_."""
        if self.return_membership or self.return_aggregate:
            if np.issubdtype(input_matrix.data.dtype, np.bool_):
                input_matrix = input_matrix.astype(float)
            if not self.bipartite:
                membership = membership_matrix(self.labels_)
                if self.return_membership:
                    self.membership_ = normalize(input_matrix.dot(membership))
                if self.return_aggregate:
                    self.aggregate_ = sparse.csr_matrix(
                        membership.T.dot(input_matrix.dot(membership)))
            else:
                if self.labels_col_ is None:
                    n_labels = max(self.labels_) + 1
                    membership_row = membership_matrix(self.labels_,
                                                       n_labels=n_labels)
                    membership_col = normalize(
                        input_matrix.T.dot(membership_row))
                else:
                    n_labels = max(max(self.labels_row_), max(
                        self.labels_col_)) + 1
                    membership_row = membership_matrix(self.labels_row_,
                                                       n_labels=n_labels)
                    membership_col = membership_matrix(self.labels_col_,
                                                       n_labels=n_labels)
                if self.return_membership:
                    self.membership_row_ = normalize(
                        input_matrix.dot(membership_col))
                    self.membership_col_ = normalize(
                        input_matrix.T.dot(membership_row))
                    self.membership_ = self.membership_row_
                if self.return_aggregate:
                    aggregate_ = sparse.csr_matrix(
                        membership_row.T.dot(input_matrix))
                    aggregate_ = aggregate_.dot(membership_col)
                    self.aggregate_ = aggregate_

        return self
Example #12
 def loss_function(
         beta: np.ndarray, X: csr_matrix, y: np.ndarray,
         is_win: np.ndarray, f, sigma: float, l2reg: float) -> float:
     z = (X.dot(beta) - y) / sigma
     # loss for win bids
     z_win = -norm.logpdf(z)
     #z_win = -(np.log(1/np.sqrt(2*np.pi)) - z**2/2)
     # loss for lose bids
     z_lose = -norm.logcdf(z)
     loss = sum(f(z_win, z_lose, is_win))
     # L2 regularization term
     loss += l2reg * sum(beta[1:] ** 2) / 2
     return loss
Example #13
 def __init__(self,
              adjacency: sparse.csr_matrix,
              damping_factor: float,
              border: np.ndarray = None):
     super(DirichletOperator, self).__init__(shape=adjacency.shape,
                                             dtype=float)
     n = adjacency.shape[0]
     out_nodes = adjacency.dot(np.ones(n)).astype(bool)
     if border is None:
         border = np.zeros(n, dtype=bool)
     interior: sparse.csr_matrix = sparse.diags(~border,
                                                shape=(n, n),
                                                format='csr',
                                                dtype=float)
     self.a = damping_factor * interior.dot(normalize(adjacency))
     self.b = interior.dot(np.ones(n) - damping_factor * out_nodes) / n
Example #14
 def predict(self, x: spa.csr_matrix):
     '''Return a matrix with the predicted ratings. Applies logs to
     avoid underflow and to take into account that the probability of
     appearance of a word increases if the same word has already appeared
     before. Uses only matrix operations.'''
      if not self.is_trained:
         return ('''The Classifier has not been trained. Please use
                 train(train_data: spa.csr_matrix, scores: np.ndarray,
                 Laplace_alpha) to train the Classifier.''')
     else:
         x.data = np.log(x.data + 1)
         self.log_cond_prob_matrix = spa.hstack(self.log_cond_prob_trans)
         log_freq = np.log(self.fractions)
         pre_final_result = x.dot(self.log_cond_prob_matrix) + log_freq
         final_prediction = (pre_final_result.argmax(axis=1) +
                             1).transpose()
         x.data = np.exp(x.data) - 1
         return final_prediction
Example #15
    def newAugmentUMR(cls, URM_train: csr_matrix, W_sparse: csr_matrix,
                      threshold_interactions: int,
                      threshold_similarity: float):
        print("New Augmenting URM")
        count_W_sparse = URM_train.dot(URM_train.transpose())
        count_mask: csr_matrix = count_W_sparse > threshold_interactions
        sim_mask: csr_matrix = W_sparse > threshold_similarity
        mask = count_mask.multiply(sim_mask)
        mask = triu(mask)
        mask = mask.tocoo()

        row_user = mask.row
        col_user = mask.col

        new_mask = row_user != col_user
        row_user = row_user[new_mask]
        col_user = col_user[new_mask]
        new_users = np.array([row_user, col_user])
        new_users = np.transpose(new_users)
        new_rows_list: list = new_users.tolist()

        print("Candidate list size: {}".format(len(new_rows_list)))

        # Creating the new matrix
        print("Creating new URM...", end="")
        new_URM = None
        for candidate in new_rows_list:
            new_row = URM_train[[candidate[0], candidate[1]]].sum(axis=0)
            new_row = csr_matrix(new_row)
            new_row.data[new_row.data > 1] = 1

            if new_URM is None:
                new_URM = new_row
            else:
                new_URM = vstack([new_URM, new_row], format="csr")

        if new_URM is None:
            new_URM = URM_train
        else:
            new_URM = vstack([URM_train, new_URM], format="csr")

        print("Done")

        return new_URM
Example #16
def calc_objective_per_iter(w_i, feature_mat: sparse.csr_matrix, empirical_counts, num_h, true_tags, alpha):
    """
        Calculate max entropy likelihood for an iterative optimization method
        :param alpha: the regularization coefficient
        :param num_h: number of histories in the training data
        :param empirical_counts: pre computed empirical_counts
        :param w_i: weights vector in iteration i


            The function returns the Max Entropy likelihood (objective) and the objective gradient
    """
    scores = feature_mat.dot(w_i)
    scores = scores.reshape((num_h, -1))
    exp_scores = np.exp(scores)
    sum_exp = np.sum(exp_scores, axis=1)
    probs = exp_scores/sum_exp.reshape((num_h, 1))
    expected_counts = feature_mat.transpose().dot(probs.reshape(-1)).reshape(-1)
    likelihood = np.sum(scores[np.arange(num_h), true_tags] - np.log(sum_exp) - (alpha/2) * (w_i**2))
    grad = empirical_counts - expected_counts - alpha*w_i
    return (-1) * likelihood, (-1) * grad
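The negated likelihood and gradient returned here are in the form expected by the func argument of scipy.optimize.fmin_l_bfgs_b. A hedged sketch, with feature_mat, empirical_counts, num_h and true_tags assumed to be prepared elsewhere:

import numpy as np
from scipy.optimize import fmin_l_bfgs_b

w0 = np.zeros(feature_mat.shape[1])  # initial weights; remaining arguments assumed precomputed
w_opt, obj, info = fmin_l_bfgs_b(func=calc_objective_per_iter, x0=w0,
                                 args=(feature_mat, empirical_counts, num_h, true_tags, 0.1),
                                 maxiter=200)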
Example #17
def augment_with_item_similarity_best_scores(urm: sps.csr_matrix,
                                             similarity,
                                             topK,
                                             value=0.5,
                                             remove_seen=True,
                                             users=None):
    # Create a copy of the urm
    augmented_urm = urm.tolil(copy=True).astype(float)

    # Compute the score matrix
    score_matrix = urm.dot(similarity).astype(float)

    # Remove items that has already been seen
    if remove_seen:
        indices_seen = urm.nonzero()
        score_matrix[indices_seen] = float("-inf")

    # Filtering the data that are not in the users list
    if users is not None:
        score_matrix = score_matrix[users]

    # Find the topK generated interactions
    top_indices = score_matrix.data.argpartition(-topK)[-topK:]
    max_k = score_matrix.data[top_indices].min()
    x = sps.find(score_matrix)
    print(x)
    print(len(x))
    print(len(x[0]))
    user_item_data = zip(x[0], x[1], x[2])
    user_item = [(user, item) for user, item, data in user_item_data
                 if data >= max_k]

    # Insert the best items in the urm matrix
    for user, item in user_item:
        augmented_urm[user, item] += value

    # Return the augmented urm
    return augmented_urm.tocsr()
Example #18
def tree_sampling_divergence(adjacency: sparse.csr_matrix,
                             dendrogram: np.ndarray,
                             weights: str = 'degree',
                             normalized: bool = True) -> float:
    """Tree sampling divergence of a hierarchy (quality metric).

    Parameters
    ----------
    adjacency :
        Adjacency matrix of the graph.
    dendrogram :
        Dendrogram.
    weights :
        Weights of nodes.
        ``'degree'`` (default) or ``'uniform'``.
    normalized :
        If ``True``, normalized score (between 0 and 1).

    Returns
    -------
    score : float
        Score.

    Example
    -------
    >>> from sknetwork.hierarchy import tree_sampling_divergence, Paris
    >>> from sknetwork.data import house
    >>> paris = Paris()
    >>> adjacency = house()
    >>> dendrogram = paris.fit_transform(adjacency)
    >>> score = tree_sampling_divergence(adjacency, dendrogram)
    >>> np.round(score, 2)
    0.05

    References
    ----------
    Charpentier, B. & Bonald, T. (2019).
    `Tree Sampling Divergence: An Information-Theoretic Metric for
    Hierarchical Graph Clustering.
    <https://hal.telecom-paristech.fr/hal-02144394/document>`_
    Proceedings of IJCAI.
    """
    adjacency = check_format(adjacency)
    check_square(adjacency)
    check_min_nnz(adjacency.nnz, 1)
    adjacency = adjacency.astype(float)
    n = adjacency.shape[0]
    check_min_size(n, 2)

    adjacency.data /= adjacency.data.sum()
    edge_sampling, node_sampling, _ = get_sampling_distributions(
        adjacency, dendrogram, weights)

    index = np.where(edge_sampling)[0]
    score = edge_sampling[index].dot(
        np.log(edge_sampling[index] / node_sampling[index]))
    if normalized:
        weights_row = get_probs(weights, adjacency)
        weights_col = get_probs(weights, adjacency.T)
        inv_out_weights = sparse.diags(weights_row, shape=(n, n), format='csr')
        inv_out_weights.data = 1 / inv_out_weights.data
        inv_in_weights = sparse.diags(weights_col, shape=(n, n), format='csr')
        inv_in_weights.data = 1 / inv_in_weights.data
        sampling_ratio = inv_out_weights.dot(adjacency.dot(inv_in_weights))
        inv_out_weights.data = np.ones(len(inv_out_weights.data))
        inv_in_weights.data = np.ones(len(inv_in_weights.data))
        edge_sampling = inv_out_weights.dot(adjacency.dot(inv_in_weights))
        mutual_information = edge_sampling.data.dot(np.log(
            sampling_ratio.data))
        if mutual_information > 0:
            score /= mutual_information
    return score
Example #19
def tree_sampling_divergence(adjacency: sparse.csr_matrix,
                             dendrogram: np.ndarray,
                             weights: str = 'degree',
                             normalized: bool = True) -> float:
    """Tree sampling divergence of a hierarchy (quality metric).

    * Graphs
    * Digraphs

    Parameters
    ----------
    adjacency :
        Adjacency matrix of the graph.
    dendrogram :
        Dendrogram.
    weights :
        Weights of nodes.
        ``'degree'`` (default) or ``'uniform'``.
    normalized :
        If ``True``, normalized score (between 0 and 1).

    Returns
    -------
    score : float
        Score.

    Example
    -------
    >>> from sknetwork.hierarchy import tree_sampling_divergence, Paris
    >>> from sknetwork.data import house
    >>> paris = Paris()
    >>> adjacency = house()
    >>> dendrogram = paris.fit_transform(adjacency)
    >>> score = tree_sampling_divergence(adjacency, dendrogram)
    >>> np.round(score, 2)
    0.52

    References
    ----------
    Charpentier, B. & Bonald, T. (2019).
    `Tree Sampling Divergence: An Information-Theoretic Metric for
    Hierarchical Graph Clustering.
    <https://hal.telecom-paristech.fr/hal-02144394/document>`_
    Proceedings of IJCAI.
    """
    adjacency = check_format(adjacency)
    check_square(adjacency)
    check_min_nnz(adjacency.nnz, 1)
    adjacency = adjacency.astype(float)
    n = adjacency.shape[0]
    check_min_size(n, 2)

    adjacency.data /= adjacency.data.sum()

    aggregate_graph, height, cluster_weight, edge_sampling, weights_row, weights_col = _instanciate_vars(
        adjacency, weights)
    node_sampling = np.zeros(n - 1)

    for t in range(n - 1):
        i = int(dendrogram[t][0])
        j = int(dendrogram[t][1])
        if i >= n and height[i - n] == dendrogram[t][2]:
            edge_sampling[t] = edge_sampling[i - n]
            edge_sampling[i - n] = 0
            node_sampling[t] = node_sampling[i - n]
        elif j >= n and height[j - n] == dendrogram[t][2]:
            edge_sampling[t] = edge_sampling[j - n]
            edge_sampling[j - n] = 0
            node_sampling[t] = node_sampling[j - n]
        if j in aggregate_graph.neighbors[i]:
            edge_sampling[t] += aggregate_graph.neighbors[i][j]
        node_sampling[t] += aggregate_graph.cluster_out_weights[i] * aggregate_graph.cluster_in_weights[j] + \
            aggregate_graph.cluster_out_weights[j] * aggregate_graph.cluster_in_weights[i]
        height[t] = dendrogram[t][2]
        aggregate_graph.merge(i, j)

    index = np.where(edge_sampling)[0]
    score = edge_sampling[index].dot(
        np.log(edge_sampling[index] / node_sampling[index]))
    if normalized:
        inv_out_weights = sparse.diags(weights_row, shape=(n, n), format='csr')
        inv_out_weights.data = 1 / inv_out_weights.data
        inv_in_weights = sparse.diags(weights_col, shape=(n, n), format='csr')
        inv_in_weights.data = 1 / inv_in_weights.data
        sampling_ratio = inv_out_weights.dot(adjacency.dot(inv_in_weights))
        inv_out_weights.data = np.ones(len(inv_out_weights.data))
        inv_in_weights.data = np.ones(len(inv_in_weights.data))
        edge_sampling = inv_out_weights.dot(adjacency.dot(inv_in_weights))
        mutual_information = edge_sampling.data.dot(np.log(
            sampling_ratio.data))
        score /= mutual_information
    return score
Example #20
 def predict(self, X: csr_matrix):
     return X.dot(self.beta)
Example #21
 def predict(self, X: csr_matrix, wr: np.ndarray):
     return wr * X.dot(self.beta_lm) + (1 - wr) * X.dot(self.beta_clm)
Example #22
def svg_graph(adjacency: sparse.csr_matrix, position: Optional[np.ndarray] = None, names: Optional[np.ndarray] = None,
              labels: Optional[Union[dict, np.ndarray]] = None, scores: Optional[np.ndarray] = None,
              seeds: Union[list, dict] = None, width: float = 400, height: float = 300,
              margin: float = 20, margin_text: float = 3, scale: float = 1, node_order: Optional[np.ndarray] = None,
              node_size: float = 7, node_size_min: float = 1, node_size_max: float = 20,
              display_node_weight: bool = False, node_weights: Optional[np.ndarray] = None, node_width: float = 1,
              node_width_max: float = 3, node_color: str = 'gray',
              display_edges: bool = True, edge_width: float = 1, edge_width_min: float = 0.5,
              edge_width_max: float = 20, edge_weight: bool = True, edge_color: Optional[str] = None,
              font_size: int = 12, directed: bool = False, filename: Optional[str] = None) -> str:
    """Return SVG image of a graph.

    Parameters
    ----------
    adjacency :
        Adjacency matrix of the graph.
    position :
        Positions of the nodes.
    names :
        Names of the nodes.
    labels :
        Labels of the nodes (negative values mean no label).
    scores :
        Scores of the nodes (measure of importance).
    seeds :
        Nodes to be highlighted (if dict, only keys are considered).
    width :
        Width of the image.
    height :
        Height of the image.
    margin :
        Margin of the image.
    margin_text :
        Margin between node and text.
    scale :
        Multiplicative factor on the dimensions of the image.
    node_order :
        Order in which nodes are displayed.
    node_size :
        Size of nodes.
    node_size_min :
        Minimum size of a node.
    node_size_max:
        Maximum size of a node.
    node_width :
        Width of node circle.
    node_width_max :
        Maximum width of node circle.
    node_color :
        Default color of nodes (svg color).
    display_node_weight :
        Display node weights by node size.
    node_weights :
        Node weights (used only if **display_node_weight** is ``True``).
    display_edges :
        If ``True``, display edges.
    edge_width :
        Width of edges.
    edge_width_min :
        Minimum width of edges.
    edge_width_max :
        Maximum width of edges.
    edge_weight :
        Display edge weights with edge widths.
    edge_color :
        Default color of edges (svg color).
    font_size :
        Font size.
    directed :
        If ``True``, considers the graph as directed.
    filename :
        Filename for saving image (optional).

    Returns
    -------
    image : str
        SVG image.

    Example
    -------
    >>> from sknetwork.data import karate_club
    >>> graph = karate_club(True)
    >>> adjacency = graph.adjacency
    >>> position = graph.position
    >>> from sknetwork.visualization import svg_graph
    >>> image = svg_graph(adjacency, position)
    >>> image[1:4]
    'svg'
    """
    n = adjacency.shape[0]

    # node order
    if node_order is None:
        node_order = np.arange(n)

    # position
    if position is None:
        spring = Spring()
        position = spring.fit_transform(adjacency)

    # colors
    colors = get_colors(n, labels, scores, node_color)
    if edge_color is None:
        if names is None:
            edge_color = 'black'
        else:
            edge_color = 'gray'

    # node sizes
    if node_weights is None:
        node_weights = adjacency.dot(np.ones(n))
    node_sizes = get_node_sizes(node_weights, node_size, node_size_min, node_size_max, display_node_weight)

    # node widths
    node_widths = get_node_widths(n, seeds, node_width, node_width_max)

    # edge widths
    adjacency_ = sparse.coo_matrix(adjacency)
    edge_widths = get_edge_widths(adjacency_.data, edge_width, edge_width_min, edge_width_max, edge_weight)

    # rescaling
    position, width, height = rescale(position, width, height, margin, node_size_max, display_node_weight)

    if names is not None:
        text_length = np.max(np.array([len(str(name)) for name in names]))
        width += text_length * font_size * .5

    # scaling
    position *= scale
    height *= scale
    width *= scale

    svg = """<svg width="{}" height="{}" xmlns="http://www.w3.org/2000/svg">""".format(width, height)
    if directed:
        svg += """<defs><marker id="arrow" markerWidth="10" markerHeight="10" refX="9" refY="3" orient="auto" >"""
        svg += """<path d="M0,0 L0,6 L9,3 z" fill="{}"/></marker></defs>""".format(edge_color)

    # edges
    if display_edges:
        n_edges = len(adjacency_.row)
        for ix in range(n_edges):
            i = adjacency_.row[ix]
            j = adjacency_.col[ix]
            if directed:
                svg += svg_edge_directed(pos_1=position[i], pos_2=position[j], stroke_width=edge_widths[ix],
                                         stroke_color=edge_color, node_size=node_sizes[j])
            else:
                svg += svg_edge(pos_1=position[i], pos_2=position[j], stroke_width=edge_widths[ix], stroke_color=edge_color)

    # nodes
    for i in node_order:
        svg += svg_node(position[i], node_sizes[i], colors[i], node_widths[i])

    # text
    if names is not None:
        for i in range(n):
            svg += svg_text(position[i] + node_sizes[i] + (margin_text, 0), names[i], font_size)
    svg += """</svg>"""

    if filename is not None:
        with open(filename + '.svg', 'w') as f:
            f.write(svg)

    return svg
Example #23
def compute_theta(beliefs: csr_matrix, degrees: csr_matrix):
    # compute_theta(beliefs, degrees)[t,0] returns theta of community t
    return degrees.dot(beliefs)
Example #24
    def fit(self,
            adjacency: sparse.csr_matrix,
            node_weights: Union[str, np.ndarray] = 'degree',
            reorder: bool = True):
        """
        Agglomerative clustering using the nearest neighbor chain.

        Parameters
        ----------
        adjacency :
            Adjacency matrix of the graph to cluster.
        node_weights :
            Node weights used in the linkage.
        reorder :
            If True, reorder the dendrogram in increasing order of heights.

        Returns
        -------
        self: :class:`Paris`
        """
        if type(adjacency) != sparse.csr_matrix:
            raise TypeError(
                'The adjacency matrix must be in a scipy compressed sparse row (csr) format.'
            )
        if adjacency.shape[0] != adjacency.shape[1]:
            raise ValueError('The adjacency matrix must be square.')
        if adjacency.shape[0] <= 1:
            raise ValueError('The graph must contain at least two nodes.')
        if (adjacency != adjacency.T).nnz != 0:
            raise ValueError(
                'The graph cannot be directed. Please fit a symmetric adjacency matrix.'
            )

        n_nodes = adjacency.shape[0]

        if type(node_weights) == np.ndarray:
            if len(node_weights) != n_nodes:
                raise ValueError(
                    'The number of node weights must match the number of nodes.'
                )
            else:
                node_probs = node_weights
        elif type(node_weights) == str:
            if node_weights == 'degree':
                node_probs = adjacency.dot(np.ones(n_nodes))
            elif node_weights == 'uniform':
                node_probs = np.ones(n_nodes)
            else:
                raise ValueError('Unknown distribution of node weights.')
        else:
            raise TypeError(
                'Node weights must be a known distribution ("degree" or "uniform" string) or a custom NumPy array.'
            )

        if np.any(node_probs <= 0):
            raise ValueError('All node weights must be positive.')
        else:
            node_probs = node_probs / np.sum(node_probs)

        aggregate_graph = AggregateGraph(adjacency, node_probs)

        connected_components = []
        dendrogram = []

        while len(aggregate_graph.cluster_sizes) > 0:
            node = None
            for node in aggregate_graph.cluster_sizes:
                break
            chain = [node]
            while chain:
                node = chain.pop()
                if aggregate_graph.graph[node]:
                    max_sim = -float("inf")
                    nearest_neighbor = None
                    for neighbor in aggregate_graph.graph[node]:
                        sim = aggregate_graph.graph[node][neighbor] / aggregate_graph.cluster_probs[node] / \
                              aggregate_graph.cluster_probs[neighbor]
                        if sim > max_sim:
                            nearest_neighbor = neighbor
                            max_sim = sim
                        elif sim == max_sim:
                            nearest_neighbor = min(neighbor, nearest_neighbor)
                    if chain:
                        nearest_neighbor_last = chain.pop()
                        if nearest_neighbor_last == nearest_neighbor:
                            dendrogram.append([
                                node, nearest_neighbor, 1. / max_sim,
                                aggregate_graph.cluster_sizes[node] +
                                aggregate_graph.cluster_sizes[nearest_neighbor]
                            ])
                            aggregate_graph.merge(node, nearest_neighbor)
                        else:
                            chain.append(nearest_neighbor_last)
                            chain.append(node)
                            chain.append(nearest_neighbor)
                    else:
                        chain.append(node)
                        chain.append(nearest_neighbor)
                else:
                    connected_components.append(
                        (node, aggregate_graph.cluster_sizes[node]))
                    del aggregate_graph.cluster_sizes[node]

        node, cluster_size = connected_components.pop()
        for next_node, next_cluster_size in connected_components:
            cluster_size += next_cluster_size
            dendrogram.append([node, next_node, float("inf"), cluster_size])
            node = aggregate_graph.next_cluster
            aggregate_graph.next_cluster += 1

        dendrogram = np.array(dendrogram)
        if reorder:
            dendrogram = reorder_dendrogram(dendrogram)

        self.dendrogram_ = dendrogram

        return self
Example #25
def compare_news_vector_with_(arr, vec: csr_matrix):
    return 1 - vec.dot(arr)
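If the rows of vec and the vector arr are L2-normalized, vec.dot(arr) is the cosine similarity, so this returns the cosine distance. A minimal sketch:

import numpy as np
from scipy.sparse import csr_matrix

vec = csr_matrix(np.array([[0.6, 0.8]]))  # one unit-norm row
arr = np.array([0.6, 0.8])                # the same unit-norm vector
print(compare_news_vector_with_(arr, vec))  # [0.] -- zero distance to itself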
Example #26
 def left_sparse_dot(self, matrix: sparse.csr_matrix):
     """Left dot product with a sparse matrix."""
     return SparseLR(matrix.dot(self.sparse_mat),
                     [(matrix.dot(x), y)
                      for (x, y) in self.low_rank_tuples])
Example #27
def svg_bigraph(biadjacency: sparse.csr_matrix,
                names_row: Optional[np.ndarray] = None, names_col: Optional[np.ndarray] = None,
                labels_row: Optional[Union[dict, np.ndarray]] = None,
                labels_col: Optional[Union[dict, np.ndarray]] = None,
                scores_row: Optional[Union[dict, np.ndarray]] = None,
                scores_col: Optional[Union[dict, np.ndarray]] = None,
                membership_row: Optional[sparse.csr_matrix] = None,
                membership_col: Optional[sparse.csr_matrix] = None,
                seeds_row: Union[list, dict] = None, seeds_col: Union[list, dict] = None,
                position_row: Optional[np.ndarray] = None, position_col: Optional[np.ndarray] = None,
                reorder: bool = True, width: Optional[float] = 400,
                height: Optional[float] = 300, margin: float = 20, margin_text: float = 3, scale: float = 1,
                node_size: float = 7, node_size_min: float = 1, node_size_max: float = 20,
                display_node_weight: bool = False,
                node_weights_row: Optional[np.ndarray] = None, node_weights_col: Optional[np.ndarray] = None,
                node_width: float = 1, node_width_max: float = 3,
                color_row: str = 'gray', color_col: str = 'gray', label_colors: Optional[Iterable] = None,
                display_edges: bool = True, edge_labels: Optional[list] = None, edge_width: float = 1,
                edge_width_min: float = 0.5, edge_width_max: float = 10, edge_color: str = 'black',
                display_edge_weight: bool = True,
                font_size: int = 12, filename: Optional[str] = None) -> str:
    """Return SVG image of a bigraph.

    Parameters
    ----------
    biadjacency :
        Biadjacency matrix of the graph.
    names_row :
        Names of the rows.
    names_col :
        Names of the columns.
    labels_row :
        Labels of the rows (negative values mean no label).
    labels_col :
        Labels of the columns (negative values mean no label).
    scores_row :
        Scores of the rows (measure of importance).
    scores_col :
        Scores of the columns (measure of importance).
    membership_row :
        Membership of the rows (label distribution).
    membership_col :
        Membership of the columns (label distribution).
    seeds_row :
        Rows to be highlighted (if dict, only keys are considered).
    seeds_col :
        Columns to be highlighted (if dict, only keys are considered).
    position_row :
        Positions of the rows.
    position_col :
        Positions of the columns.
    reorder :
        Use clustering to order nodes.
    width :
        Width of the image.
    height :
        Height of the image.
    margin :
        Margin of the image.
    margin_text :
        Margin between node and text.
    scale :
        Multiplicative factor on the dimensions of the image.
    node_size :
        Size of nodes.
    node_size_min :
        Minimum size of nodes.
    node_size_max :
        Maximum size of nodes.
    display_node_weight :
        If ``True``, display node weights through node size.
    node_weights_row :
        Weights of rows (used only if **display_node_weight** is ``True``).
    node_weights_col :
        Weights of columns (used only if **display_node_weight** is ``True``).
    node_width :
        Width of node circle.
    node_width_max :
        Maximum width of node circle.
    color_row :
        Default color of rows (svg color).
    color_col :
        Default color of cols (svg color).
    label_colors :
        Colors of the labels (svg color).
    display_edges :
        If ``True``, display edges.
    edge_labels :
        Labels of the edges, as a list of tuples (source, destination, label)
    edge_width :
        Width of edges.
    edge_width_min :
        Minimum width of edges.
    edge_width_max :
        Maximum width of edges.
    display_edge_weight :
        If ``True``, display edge weights through edge widths.
    edge_color :
        Default color of edges (svg color).
    font_size :
        Font size.
    filename :
        Filename for saving image (optional).

    Returns
    -------
    image : str
        SVG image.

    Example
    -------
    >>> from sknetwork.data import movie_actor
    >>> biadjacency = movie_actor()
    >>> from sknetwork.visualization import svg_bigraph
    >>> image = svg_bigraph(biadjacency)
    >>> image[1:4]
    'svg'
    """
    n_row, n_col = biadjacency.shape

    # node positions
    if position_row is None or position_col is None:
        position_row = np.zeros((n_row, 2))
        position_col = np.ones((n_col, 2))
        if reorder:
            bilouvain = BiLouvain()
            bilouvain.fit(biadjacency)
            index_row = np.argsort(bilouvain.labels_row_)
            index_col = np.argsort(bilouvain.labels_col_)
        else:
            index_row = np.arange(n_row)
            index_col = np.arange(n_col)
        position_row[index_row, 1] = np.arange(n_row)
        position_col[index_col, 1] = np.arange(n_col) + .5 * (n_row - n_col)
    position = np.vstack((position_row, position_col))

    # node colors
    colors_row = get_node_colors(n_row, labels_row, scores_row, membership_row, color_row, label_colors)
    colors_col = get_node_colors(n_col, labels_col, scores_col, membership_col, color_col, label_colors)

    # node sizes
    if node_weights_row is None:
        node_weights_row = biadjacency.dot(np.ones(n_col))
    if node_weights_col is None:
        node_weights_col = biadjacency.T.dot(np.ones(n_row))
    node_sizes_row, node_sizes_col = get_node_sizes_bipartite(node_weights_row, node_weights_col,
                                                              node_size, node_size_min, node_size_max,
                                                              display_node_weight)

    # node widths
    node_widths_row = get_node_widths(n_row, seeds_row, node_width, node_width_max)
    node_widths_col = get_node_widths(n_col, seeds_col, node_width, node_width_max)

    # rescaling
    if not width and not height:
        raise ValueError("You must specify either the width or the height of the image.")
    position, width, height = rescale(position, width, height, margin, node_size, node_size_max, display_node_weight)

    # node names
    if names_row is not None:
        text_length = np.max(np.array([len(str(name)) for name in names_row]))
        position[:, 0] += text_length * font_size * .5
        width += text_length * font_size * .5
    if names_col is not None:
        text_length = np.max(np.array([len(str(name)) for name in names_col]))
        width += text_length * font_size * .5

    # scaling
    position *= scale
    height *= scale
    width *= scale
    position_row = position[:n_row]
    position_col = position[n_row:]

    svg = """<svg width="{}" height="{}"  xmlns="http://www.w3.org/2000/svg">\n""".format(width, height)

    # edges
    if display_edges:
        biadjacency_coo = sparse.coo_matrix(biadjacency)

        if edge_color is None:
            if names_row is None and names_col is None:
                edge_color = 'black'
            else:
                edge_color = 'gray'

        edge_colors, edge_order, edge_colors_residual = get_edge_colors(biadjacency, edge_labels, edge_color,
                                                                        label_colors)
        edge_widths = get_edge_widths(biadjacency_coo, edge_width, edge_width_min, edge_width_max, display_edge_weight)

        for ix in edge_order:
            i = biadjacency_coo.row[ix]
            j = biadjacency_coo.col[ix]
            color = edge_colors[ix]
            svg += svg_edge(pos_1=position_row[i], pos_2=position_col[j], edge_width=edge_widths[ix], edge_color=color)

        for i, j, color in edge_colors_residual:
            svg += svg_edge(pos_1=position_row[i], pos_2=position_col[j], edge_width=edge_width, edge_color=color)

    # nodes
    for i in range(n_row):
        if membership_row is None:
            svg += svg_node(position_row[i], node_sizes_row[i], colors_row[i], node_widths_row[i])
        else:
            if membership_row[i].nnz == 1:
                index = membership_row[i].indices[0]
                svg += svg_node(position_row[i], node_sizes_row[i], colors_row[index], node_widths_row[i])
            else:
                svg += svg_pie_chart_node(position_row[i], node_sizes_row[i], membership_row[i].todense(),
                                          colors_row, node_widths_row[i])

    for i in range(n_col):
        if membership_col is None:
            svg += svg_node(position_col[i], node_sizes_col[i], colors_col[i], node_widths_col[i])
        else:
            if membership_col[i].nnz == 1:
                index = membership_col[i].indices[0]
                svg += svg_node(position_col[i], node_sizes_col[i], colors_col[index], node_widths_col[i])
            else:
                svg += svg_pie_chart_node(position_col[i], node_sizes_col[i], membership_col[i].todense(),
                                          colors_col, node_widths_col[i])
    # text
    if names_row is not None:
        for i in range(n_row):
            svg += svg_text(position_row[i] - (margin_text + node_sizes_row[i], 0), names_row[i], font_size, True)
    if names_col is not None:
        for i in range(n_col):
            svg += svg_text(position_col[i] + (margin_text + node_sizes_col[i], 0), names_col[i], font_size)
    svg += """</svg>\n"""

    if filename is not None:
        with open(filename + '.svg', 'w') as f:
            f.write(svg)

    return svg
Example #28
 def left_sparse_dot(self, matrix: sparse.csr_matrix):
     """Left dot product with a sparse matrix"""
     self.backward = matrix.dot(self.backward)
     return self
Example #29
def pcg(A: sp.csr_matrix,
        b: np.ndarray,
        tol: float = 1e-5,
        maxiter: int = 100,
        M1: sp.csr_matrix = None,
        M2: sp.csr_matrix = None,
        x0: np.ndarray = None) -> (np.ndarray, int, int):
    """
    PCG   Preconditioned Conjugate Gradients Method.
       X = PCG(A,B) attempts to solve the system of linear equations A*X=B for
       X. The N-by-N coefficient matrix A must be symmetric and positive
       definite and the right hand side column vector B must have length N.

       X = PCG(AFUN,B) accepts a function handle AFUN instead of the matrix A.
       AFUN(X) accepts a vector input X and returns the matrix-vector product
       A*X. In all of the following syntaxes, you can replace A by AFUN.

       X = PCG(A,B,TOL) specifies the tolerance of the method. If TOL is []
       then PCG uses the default, 1e-6.

       X = PCG(A,B,TOL,MAXIT) specifies the maximum number of iterations. If
       MAXIT is [] then PCG uses the default, min(N,20).

       X = PCG(A,B,TOL,MAXIT,M) and X = PCG(A,B,TOL,MAXIT,M1,M2) use symmetric
       positive definite preconditioner M or M=M1*M2 and effectively solve the
       system inv(M)*A*X = inv(M)*B for X. If M is [] then a preconditioner
       is not applied. M may be a function handle MFUN returning M\X.

       X = PCG(A,B,TOL,MAXIT,M1,M2,X0) specifies the initial guess. If X0 is
       [] then PCG uses the default, an all zero vector.

       [X,FLAG] = PCG(A,B,...) also returns a convergence FLAG:
        0 PCG converged to the desired tolerance TOL within MAXIT iterations
        1 PCG iterated MAXIT times but did not converge.
        2 preconditioner M was ill-conditioned.
        3 PCG stagnated (two consecutive iterates were the same).
        4 one of the scalar quantities calculated during PCG became too
          small or too large to continue computing.

       [X,FLAG,RELRES] = PCG(A,B,...) also returns the relative residual
       NORM(B-A*X)/NORM(B). If FLAG is 0, then RELRES <= TOL.

       [X,FLAG,RELRES,ITER] = PCG(A,B,...) also returns the iteration number
       at which X was computed: 0 <= ITER <= MAXIT.

       [X,FLAG,RELRES,ITER,RESVEC] = PCG(A,B,...) also returns a vector of the
       estimated residual norms at each iteration including NORM(B-A*X0).

       Example:
          n1 = 21; A = gallery('moler',n1);  b1 = A*ones(n1,1);
          tol = 1e-6;  maxit = 15;  M = diag([10:-1:1 1 1:10]);
          [x1,flag1,rr1,it1,rv1] = pcg(A,b1,tol,maxit,M);
       Or use this parameterized matrix-vector product function:
          afun = @(x,n)gallery('moler',n)*x;
          n2 = 21; b2 = afun(ones(n2,1),n2);
          [x2,flag2,rr2,it2,rv2] = pcg(@(x)afun(x,n2),b2,tol,maxit,M);

       Class support for inputs A,B,M1,M2,X0 and the output of AFUN:
          float: double

       See also BICG, BICGSTAB, BICGSTABL, CGS, GMRES, LSQR, MINRES, QMR,
       SYMMLQ, TFQMR, ICHOL, FUNCTION_HANDLE.

       Copyright 1984-2013 The MathWorks, Inc.
       """
    n = A.shape[0]
    n2b = np.linalg.norm(b)
    if n2b == 0:
        return np.zeros((n)), 0, 0
    if x0 is None:
        x = np.zeros((n))
    else:
        x = x0
    flag = 1
    it = 0
    xmin = x  # Iterate which has minimal residual so far
    imin = 0  # Iteration at which xmin was computed
    tolb = tol * n2b  # Relative tolerance
    r = b - A.dot(x)  # x is 1-D, so the transpose was a no-op
    normr = np.linalg.norm(r)  # Norm of residual
    normr_act = normr
    eps = 2.2204e-16

    if normr <= tolb:
        return x, 0, 0

    normrmin = normr
    rho = 1
    stag = 0
    moresteps = 0
    maxmsteps = min(n // 50, 5, n - maxiter)
    maxstagsteps = 3
    ii = 0
    while ii < maxiter:
        if M1 is not None:
            y = spsolve(M1, r)
        else:
            y = r

        if M2 is not None:
            z = spsolve(M2, y)
        else:
            z = y
        rho1 = rho
        rho = r.dot(z)
        if (rho == 0) or np.isinf(rho):
            flag = 4
            break
        if (ii == 0):
            p = z
        else:
            beta = rho / rho1
            if ((beta == 0) or np.isinf(beta)):
                flag = 4
                break
            p = z + beta * p
        q = A.dot(p)
        pq = p.dot(q)
        if (pq <= 0) or np.isinf(pq):
            flag = 4
            break
        else:
            alpha = rho / pq
        # Check for stagnation of the method
        if np.linalg.norm(p) * abs(alpha) < eps * np.linalg.norm(x):
            stag = stag + 1
        else:
            stag = 0

        x = x + alpha * p  # form new iterate
        r = r - alpha * q
        normr = np.linalg.norm(r)
        normr_act = normr
        # check for convergence
        if normr <= tolb or stag >= maxstagsteps or moresteps:
            r = b - A.dot(x)
            normr_act = np.linalg.norm(r)
            if normr_act <= tolb:
                flag = 0
                it = ii
                break
            else:
                if stag >= maxstagsteps and moresteps == 0:
                    stag = 0
                moresteps = moresteps + 1
                if moresteps >= maxmsteps:
                    flag = 3
                    it = ii
                    break
        if normr_act < normrmin:  # update minimal norm quantities
            normrmin = normr_act
            xmin = x
            imin = ii
        if stag >= maxstagsteps:
            flag = 3
            break
        ii += 1
    # returned solution is first with minimal residual
    if flag:
        r_comp = b - A.dot(xmin)
        if np.linalg.norm(r_comp) <= normr_act:
            x = xmin
            it = imin
        else:
            it = ii
    return x, flag, it
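A usage sketch on a small symmetric positive definite system, with scipy.sparse.linalg.cg as a reference solver (assumes the `is None` checks above):

import numpy as np
import scipy.sparse as sp
from scipy.sparse.linalg import cg

n = 50
A = sp.diags([-1.0, 2.5, -1.0], [-1, 0, 1], shape=(n, n), format='csr')  # SPD tridiagonal
b = A.dot(np.ones(n))  # exact solution is the all-ones vector

x, flag, it = pcg(A, b, tol=1e-8, maxiter=200)
x_ref, _ = cg(A, b, atol=1e-10)
print(flag, np.allclose(x, x_ref, atol=1e-6))  # expect: 0 True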
Example #30
def compare_news_vector_with_(arr, vec: csr_matrix):
    return 1 - vec.dot(arr)
Example #31
def dasgupta_cost(adjacency: sparse.csr_matrix,
                  dendrogram: np.ndarray,
                  node_weights: Union[str, np.ndarray] = 'uniform',
                  normalized: bool = True) -> float:
    """Dasgupta's cost of a hierarchy (cost metric)

     Parameters
     ----------
     adjacency :
        Adjacency matrix of the graph.
     dendrogram :
        Each row contains the two merged nodes, the height in the dendrogram, and the size of the corresponding cluster
     node_weights :
        Vector of node weights. Default = 'uniform', weight 1 for each node.
     normalized :
        If ``True``, normalized by the number of nodes of the graph.

     Returns
     -------
     cost : float
         Dasgupta's cost of the hierarchy.
         Normalized by the number of nodes to get a value between 0 and 1.

     References
     ----------
     S. Dasgupta. A cost function for similarity-based hierarchical clustering.
     In Proceedings of ACM symposium on Theory of Computing, 2016.

    """

    if type(adjacency) != sparse.csr_matrix:
        raise TypeError(
            'The adjacency matrix must be in a scipy compressed sparse row (csr) format.'
        )
    # check that the graph is not directed
    if adjacency.shape[0] != adjacency.shape[1]:
        raise ValueError('The adjacency matrix must be square.')
    if (adjacency != adjacency.T).nnz != 0:
        raise ValueError(
            'The graph cannot be directed. Please fit a symmetric adjacency matrix.'
        )

    n_nodes = adjacency.shape[0]

    if type(node_weights) == np.ndarray:
        if len(node_weights) != n_nodes:
            raise ValueError(
                'The number of node weights must match the number of nodes.')
        else:
            node_weights_vec = node_weights
    elif type(node_weights) == str:
        if node_weights == 'degree':
            node_weights_vec = adjacency.dot(np.ones(n_nodes))
        elif node_weights == 'uniform':
            node_weights_vec = np.ones(n_nodes)
        else:
            raise ValueError('Unknown distribution of node weights.')
    else:
        raise TypeError(
            'Node weights must be a known distribution ("degree" or "uniform" string) or a custom NumPy array.'
        )

    if np.any(node_weights_vec <= 0):
        raise ValueError('All node weights must be positive.')
    else:
        node_weights_vec = node_weights_vec / np.sum(node_weights_vec)

    aggregate_graph = AggregateGraph(adjacency, node_weights_vec)

    height = np.zeros(n_nodes - 1)
    edge_sampling = np.zeros(n_nodes - 1)
    cluster_weight = np.zeros(n_nodes - 1)
    for t in range(n_nodes - 1):
        node1 = int(dendrogram[t][0])
        node2 = int(dendrogram[t][1])
        if node1 >= n_nodes and height[node1 - n_nodes] == dendrogram[t][2]:
            edge_sampling[t] = edge_sampling[node1 - n_nodes]
            edge_sampling[node1 - n_nodes] = 0
        elif node2 >= n_nodes and height[node2 - n_nodes] == dendrogram[t][2]:
            edge_sampling[t] = edge_sampling[node2 - n_nodes]
            edge_sampling[node2 - n_nodes] = 0
        height[t] = dendrogram[t][2]
        edge_sampling[t] += 2 * aggregate_graph.graph[node1][node2]
        cluster_weight[t] = aggregate_graph.cluster_probs[
            node1] + aggregate_graph.cluster_probs[node2]
        aggregate_graph.merge(node1, node2)

    cost: float = (edge_sampling * cluster_weight).sum()
    if not normalized:
        cost *= node_weights_vec.sum()
    return cost
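A usage sketch mirroring the tree_sampling_divergence doctests above, assuming the same sknetwork helpers are importable:

from sknetwork.hierarchy import Paris
from sknetwork.data import house

adjacency = house()
dendrogram = Paris().fit_transform(adjacency)
cost = dasgupta_cost(adjacency, dendrogram)  # lower values indicate a better hierarchy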