Esempio n. 1
0
 def predict(self, x: spa.csr_matrix):
     '''Return a matrix with the predicted rating of every row of ``x``.

     The stored counts are log-damped (``log(1 + count)``) so that repeated
     occurrences of the same word weigh less than first occurrences, and
     all probabilities are combined in log space to avoid underflow.
     Uses only matrix operations.

     Parameters
     ----------
     x :
         Sparse matrix of counts, one row per sample. It is NOT modified:
         the log transform is applied to a private copy.

     Returns
     -------
     A matrix with a single row holding, for each row of ``x``, the
     1-based index of the highest-scoring column of the score matrix.
     If the classifier has not been trained, an explanatory message
     string is returned instead (kept for backward compatibility).

     Side effects
     ------------
     Sets ``self.log_cond_prob_matrix`` to the horizontal stack of
     ``self.log_cond_prob_trans``.
     '''
     if not self.is_trained:
         return ('''The Classifier has not been trained. Please use
                 train(train_data: spa.csr_matrix, scores: np.ndarray,
                 Laplace_alpha) to train the Classifier.''')

     # Transform a copy rather than overwriting ``x.data`` and undoing it
     # afterwards with exp(...) - 1: that round trip loses precision,
     # turns integer counts into floats, and leaves the caller's matrix
     # corrupted if anything raises in between.
     log_counts = x.copy()
     log_counts.data = np.log1p(log_counts.data)

     self.log_cond_prob_matrix = spa.hstack(self.log_cond_prob_trans)
     log_freq = np.log(self.fractions)
     scores = log_counts.dot(self.log_cond_prob_matrix) + log_freq
     # argmax is 0-based; the returned ratings are 1-based.
     return (scores.argmax(axis=1) + 1).transpose()
Esempio n. 2
0
 def weight_matrix(self, dataMatrix: sps.csr_matrix, feature_data):
     """Scale the stored values of ``dataMatrix`` by ``1 / log1p(feature_data)``.

     ``feature_data`` is multiplied element-wise against ``dataMatrix.data``,
     so it must broadcast against that array. The matrix is updated in
     place (its ``data`` array is replaced) and the same object is returned.
     """
     inverse_log_weight = 1 / np.log1p(feature_data)
     rescaled = dataMatrix.data * inverse_log_weight
     dataMatrix.data = rescaled
     return dataMatrix
Esempio n. 3
0
 def weight_matrix(self, dataMatrix: sps.csr_matrix, feature_data):
     """Scale the stored values of ``dataMatrix`` by log-damped feature weights.

     Each weight greater than 1 is replaced by its natural logarithm;
     weights less than or equal to 1 are used unchanged. The resulting
     weights are multiplied element-wise against ``dataMatrix.data``, so
     ``feature_data`` must broadcast against that array.

     ``feature_data`` itself is left untouched. Writing the logarithms
     back into the caller's array would apply the log twice on a second
     call and, for integer arrays, would truncate the logs to integers.

     Parameters
     ----------
     dataMatrix :
         Sparse matrix whose ``data`` array is replaced by the weighted
         values.
     feature_data :
         Array-like of raw weights.

     Returns
     -------
     The same ``dataMatrix`` object, for chaining.
     """
     # np.array copies by default, so the caller's array is never aliased.
     weights = np.array(feature_data)
     if not np.issubdtype(weights.dtype, np.inexact):
         # Integer (or bool) input: promote so the logarithms are not truncated.
         weights = weights.astype(np.float64)

     damp = weights > 1
     weights[damp] = np.log(weights[damp])

     dataMatrix.data = dataMatrix.data * weights
     return dataMatrix
Esempio n. 4
0
 def weight_matrix(self, dataMatrix: sps.csr_matrix, feature_data):
     """Multiply the stored values of ``dataMatrix`` by ``feature_data``.

     The product is element-wise against ``dataMatrix.data``; the matrix is
     updated in place (its ``data`` array is replaced) and returned.
     """
     weighted_values = dataMatrix.data * feature_data
     dataMatrix.data = weighted_values
     return dataMatrix
Esempio n. 5
0
    def _reweight_values(self,
                         doc_term_matrix: sp.csr_matrix) -> sp.csr_matrix:
        """
        Apply the weighting scheme configured on this instance to a doc-term
        matrix, in three stages: a local term-frequency transform chosen by
        ``tf_type``, an optional column-wise idf scaling, and optional
        row-wise document-length scaling and normalization.

        Args:
            doc_term_matrix: Sparse doc-term matrix. Its stored values may
                be overwritten in place by the local transform.

        Returns:
            Reweighted doc-term matrix.

        Raises:
            ValueError: If ``tf_type`` is not one of "binary", "bm25",
                "sqrt", "log" or "linear".
        """
        tf_type = self.tf_type

        # stage 1: local component, applied to the stored term frequencies
        if tf_type == "linear":
            pass  # raw term frequencies are used unchanged
        elif tf_type == "binary":
            doc_term_matrix.data.fill(1)
        elif tf_type == "sqrt":
            # written back into the same buffer
            np.sqrt(doc_term_matrix.data,
                    out=doc_term_matrix.data,
                    casting="unsafe")
        elif tf_type == "log":
            np.log(doc_term_matrix.data,
                   out=doc_term_matrix.data,
                   casting="unsafe")
            doc_term_matrix.data += 1.0
        elif tf_type == "bm25":
            if self.dl_type:
                # length-aware variant: each entry is damped by its own
                # row's document length relative to the average length
                doc_lengths = get_doc_lengths(doc_term_matrix,
                                              type_=self.dl_type)
                length_norm = (1 - BM25_B) + (
                    BM25_B * (doc_lengths / self._avg_doc_length))
                # COO form exposes a row index for every stored entry
                coo = doc_term_matrix.tocoo(copy=False)
                tfs = coo.data
                coo.data = (tfs * (BM25_K1 + 1.0) /
                            (tfs + (BM25_K1 * length_norm[coo.row])))
                doc_term_matrix = coo.tocsr(copy=False)
            else:
                tfs = doc_term_matrix.data
                doc_term_matrix.data = (tfs * (BM25_K1 + 1.0) /
                                        (BM25_K1 + tfs))
        else:
            # defensive: tf_type is expected to be validated before this point
            raise ValueError(
                errors.value_invalid_msg(
                    "tf_type", self.tf_type,
                    {"binary", "bm25", "sqrt", "log", "linear"}))

        # stage 2: global component (idfs), column-wise
        if self.idf_type:
            doc_term_matrix = doc_term_matrix * self._idf_diag

        # stage 3: row-wise scaling; bm25 has already folded the document
        # length into its tf transform, so it is skipped here
        if self.dl_type and tf_type != "bm25":
            n_docs = doc_term_matrix.shape[0]
            doc_lengths = get_doc_lengths(doc_term_matrix,
                                          type_=self.dl_type)
            inv_length_diag = sp.spdiags(1.0 / doc_lengths,
                                         diags=0,
                                         m=n_docs,
                                         n=n_docs,
                                         format="csr")
            doc_term_matrix = inv_length_diag * doc_term_matrix
        if self.norm is not None:
            doc_term_matrix = normalize_mat(doc_term_matrix,
                                            norm=self.norm,
                                            axis=1,
                                            copy=False)

        return doc_term_matrix
Esempio n. 6
0
 def _eliminate_subzeroes(self, m: sparse.csr_matrix, epsilon):
     """Remove from ``m`` every stored entry whose magnitude is below ``epsilon``.

     Such entries are first set to exactly zero and then dropped from the
     sparse structure. ``m`` is modified in place; nothing is returned.
     """
     negligible = np.abs(m.data) < epsilon
     m.data = np.where(negligible, 0, m.data)
     # physically remove the entries that were just zeroed
     m.eliminate_zeros()
Esempio n. 7
0
def tree_sampling_divergence(adjacency: sparse.csr_matrix,
                             dendrogram: np.ndarray,
                             weights: str = 'degree',
                             normalized: bool = True) -> float:
    """Tree sampling divergence of a hierarchy (quality metric).

    * Graphs
    * Digraphs

    Parameters
    ----------
    adjacency :
        Adjacency matrix of the graph.
    dendrogram :
        Dendrogram. Row ``t`` is read as ``(child, child, height, ...)``:
        columns 0 and 1 are the indices of the two merged clusters and
        column 2 is the height of the merge.
    weights :
        Weights of nodes.
        ``'degree'`` (default) or ``'uniform'``.
    normalized :
        If ``True``, normalized score (between 0 and 1). The normalization
        is skipped when the normalizing constant is not positive, so that
        the score is never divided by zero.

    Returns
    -------
    score : float
        Score.

    Raises
    ------
    ValueError
        If the graph has fewer than two nodes or a non-positive total edge
        weight.

    Example
    -------
    >>> from sknetwork.hierarchy import tree_sampling_divergence, Paris
    >>> from sknetwork.data import house
    >>> paris = Paris()
    >>> adjacency = house()
    >>> dendrogram = paris.fit_transform(adjacency)
    >>> score = tree_sampling_divergence(adjacency, dendrogram)
    >>> np.round(score, 2)
    0.52

    References
    ----------
    Charpentier, B. & Bonald, T. (2019).
    `Tree Sampling Divergence: An Information-Theoretic Metric for
    Hierarchical Graph Clustering.
    <https://hal.telecom-paristech.fr/hal-02144394/document>`_
    Proceedings of IJCAI.
    """
    adjacency = check_format(adjacency)
    check_square(adjacency)

    n = adjacency.shape[0]
    if n <= 1:
        raise ValueError('The graph must contain at least two nodes.')

    total_weight = adjacency.data.sum()
    if total_weight <= 0:
        raise ValueError('The graph must contain at least one edge.')

    # Rescale the edge weights so that they sum to 1.
    # NOTE(review): this rebinds ``.data`` on whatever ``check_format``
    # returned; whether that is the caller's object should be confirmed.
    adjacency.data = adjacency.data / total_weight

    aggregate_graph, height, cluster_weight, edge_sampling, weights_row, weights_col = _instanciate_vars(
        adjacency, weights)
    node_sampling = np.zeros(n - 1)

    # Replay the n - 1 merges of the dendrogram on the aggregate graph.
    for t in range(n - 1):
        i = int(dendrogram[t][0])
        j = int(dendrogram[t][1])
        # Indices >= n denote clusters created by an earlier merge (row
        # index - n). When such a child was merged at exactly the current
        # height, its accumulated samplings are carried over to this merge
        # and its own edge sampling is zeroed, so that it is excluded by
        # the ``np.where`` filter below.
        if i >= n and height[i - n] == dendrogram[t][2]:
            edge_sampling[t] = edge_sampling[i - n]
            edge_sampling[i - n] = 0
            node_sampling[t] = node_sampling[i - n]
        elif j >= n and height[j - n] == dendrogram[t][2]:
            edge_sampling[t] = edge_sampling[j - n]
            edge_sampling[j - n] = 0
            node_sampling[t] = node_sampling[j - n]
        if j in aggregate_graph.neighbors[i]:
            edge_sampling[t] += aggregate_graph.neighbors[i][j]
        node_sampling[t] += aggregate_graph.cluster_out_weights[i] * aggregate_graph.cluster_in_weights[j] + \
            aggregate_graph.cluster_out_weights[j] * aggregate_graph.cluster_in_weights[i]
        height[t] = dendrogram[t][2]
        aggregate_graph.merge(i, j)

    # Only merges with a non-zero edge sampling contribute to the sum.
    index = np.where(edge_sampling)[0]
    score = edge_sampling[index].dot(
        np.log(edge_sampling[index] / node_sampling[index]))
    if normalized:
        inv_out_weights = sparse.diags(weights_row, shape=(n, n), format='csr')
        inv_out_weights.data = 1 / inv_out_weights.data
        inv_in_weights = sparse.diags(weights_col, shape=(n, n), format='csr')
        inv_in_weights.data = 1 / inv_in_weights.data
        # Each edge weight divided by the product of its end nodes' weights.
        sampling_ratio = inv_out_weights.dot(adjacency.dot(inv_in_weights))
        # Repeat the same two products with unit diagonals; presumably this
        # is done so that the edge weights come out in the same storage
        # order as ``sampling_ratio.data`` for the dot product below.
        inv_out_weights.data = np.ones(len(inv_out_weights.data))
        inv_in_weights.data = np.ones(len(inv_in_weights.data))
        edge_sampling = inv_out_weights.dot(adjacency.dot(inv_in_weights))
        mutual_information = edge_sampling.data.dot(np.log(
            sampling_ratio.data))
        # Guard the division: a zero normalizing constant would otherwise
        # turn the score into nan/inf.
        if mutual_information > 0:
            score /= mutual_information
    return score
Esempio n. 8
0
def tree_sampling_divergence(adjacency: sparse.csr_matrix, dendrogram: np.ndarray, weights: str = 'degree',
                             normalized: bool = True) -> float:
    """
    Tree sampling divergence of a hierarchy (quality metric).

    The higher the score, the better.

    Parameters
    ----------
    adjacency :
        Adjacency matrix of the graph.
    dendrogram :
        Dendrogram. Row ``t`` is read as ``(child, child, height, ...)``:
        columns 0 and 1 are the indices of the two merged clusters and
        column 2 is the height of the merge.
    weights :
        Weights of nodes.
        ``'degree'`` (default) or ``'uniform'``.
    normalized:
        If ``True``, normalized by the mutual information of the graph.
        The normalization is skipped when that mutual information is not
        positive, so that the score is never divided by zero.

    Returns
    -------
    score : float
        The tree sampling divergence of the hierarchy.
        If normalized, returns a value between 0 and 1.

    Raises
    ------
    ValueError
        If the adjacency matrix is not square, or the graph has fewer than
        two nodes or a non-positive total edge weight.

    References
    ----------
    Charpentier, B. & Bonald, T. (2019).
    `Tree Sampling Divergence: An Information-Theoretic Metric for
    Hierarchical Graph Clustering.
    <https://hal.telecom-paristech.fr/hal-02144394/document>`_
    Proceedings of IJCAI.

    """
    adjacency = check_format(adjacency)
    if not is_square(adjacency):
        raise ValueError('The adjacency matrix is not square.')

    n = adjacency.shape[0]
    if n <= 1:
        raise ValueError('The graph must contain at least two nodes.')

    total_weight = adjacency.data.sum()
    if total_weight <= 0:
        raise ValueError('The graph must contain at least one edge.')

    # Rescale the edge weights so that they sum to 1.
    # NOTE(review): this rebinds ``.data`` on whatever ``check_format``
    # returned; whether that is the caller's object should be confirmed.
    adjacency.data = adjacency.data / total_weight

    out_weights = check_probs(weights, adjacency)
    in_weights = check_probs(weights, adjacency.T)

    # The aggregate graph is built on the symmetrized adjacency.
    aggregate_graph = AggregateGraph(adjacency + adjacency.T, out_weights, in_weights)

    height = np.zeros(n - 1)
    edge_sampling = np.zeros(n - 1)
    node_sampling = np.zeros(n - 1)
    # Replay the n - 1 merges of the dendrogram on the aggregate graph.
    for t in range(n - 1):
        node1 = int(dendrogram[t][0])
        node2 = int(dendrogram[t][1])
        # Indices >= n denote clusters created by an earlier merge (row
        # index - n). When such a child was merged at exactly the current
        # height, its accumulated samplings are carried over to this merge
        # and its own edge sampling is zeroed, so that it is excluded by
        # the ``np.where`` filter below.
        if node1 >= n and height[node1 - n] == dendrogram[t][2]:
            edge_sampling[t] = edge_sampling[node1 - n]
            edge_sampling[node1 - n] = 0
            node_sampling[t] = node_sampling[node1 - n]
        elif node2 >= n and height[node2 - n] == dendrogram[t][2]:
            edge_sampling[t] = edge_sampling[node2 - n]
            edge_sampling[node2 - n] = 0
            node_sampling[t] = node_sampling[node2 - n]
        if node2 in aggregate_graph.neighbors[node1]:
            edge_sampling[t] += aggregate_graph.neighbors[node1][node2]
        node_sampling[t] += aggregate_graph.cluster_out_weights[node1] * aggregate_graph.cluster_in_weights[node2] + \
            aggregate_graph.cluster_out_weights[node2] * aggregate_graph.cluster_in_weights[node1]
        height[t] = dendrogram[t][2]
        aggregate_graph.merge(node1, node2)

    # Only merges with a non-zero edge sampling contribute to the sum.
    index = np.where(edge_sampling)[0]
    score = edge_sampling[index].dot(np.log(edge_sampling[index] / node_sampling[index]))
    if normalized:
        inv_out_weights = sparse.diags(out_weights, shape=(n, n), format='csr')
        inv_out_weights.data = 1 / inv_out_weights.data
        inv_in_weights = sparse.diags(in_weights, shape=(n, n), format='csr')
        inv_in_weights.data = 1 / inv_in_weights.data
        # Each edge weight divided by the product of its end nodes' weights.
        sampling_ratio = inv_out_weights.dot(adjacency.dot(inv_in_weights))
        # Repeat the same two products with unit diagonals; presumably this
        # is done so that the edge weights come out in the same storage
        # order as ``sampling_ratio.data`` for the dot product below.
        inv_out_weights.data = np.ones(len(inv_out_weights.data))
        inv_in_weights.data = np.ones(len(inv_in_weights.data))
        edge_sampling = inv_out_weights.dot(adjacency.dot(inv_in_weights))
        mutual_information = edge_sampling.data.dot(np.log(sampling_ratio.data))
        # Guard the division: a zero mutual information would otherwise
        # turn the score into nan/inf.
        if mutual_information > 0:
            score /= mutual_information
    return score