Ejemplo n.º 1
0
def dasgupta_cost(tree, edge_weights, leaf_graph):
    r"""
    Dasgupta's cost is an unsupervised measure of the quality of a hierarchical clustering of an edge weighted graph.

    Let :math:`T` be a tree representing a hierarchical clustering of the graph :math:`G=(V, E)`.
    Let :math:`w` be a dissimilarity function on the edges :math:`E` of the graph.

    The Dasgupta's cost is defined as:

    .. math::

        dasgupta(T, V, E, w) = \sum_{\{x,y\}\in E} \frac{area(lca_T(x,y))}{w(\{x,y\})}

    :See:

        S. Dasgupta. "`A cost function for similarity-based hierarchical clustering <https://arxiv.org/pdf/1510.05043.pdf>`_ ."
        In Proc. STOC, pages 118–127, Cambridge, MA, USA, 2016

    :Complexity:

    The runtime complexity is :math:`\mathcal{O}(n\log(n) + m)` with :math:`n` the number of nodes in :math:`T` and
    :math:`m` the number of edges in :math:`E`.

    :param tree: Input tree
    :param edge_weights: Edge weights on the leaf graph (dissimilarities); used as the divisor of the
        per-edge term, so a zero weight yields a division by zero
    :param leaf_graph: Leaf graph of the input tree (deduced from :class:`~higra.CptHierarchy`)
    :return: a real number
    """
    # Per-node area of the tree, computed with respect to the leaf graph.
    area = hg.attribute_area(tree, leaf_graph=leaf_graph)

    # One lowest-common-ancestor node index per edge of the leaf graph.
    lca = hg.make_lca_fast(tree).lca(leaf_graph)

    # Sum over the edges of area(lca(edge)) / w(edge).
    return np.sum(area[lca] / edge_weights)
Ejemplo n.º 2
0
def dendrogram_purity_naif(tree, leaf_labels):
    """
    Naive, pair-enumerating computation of a dendrogram purity score.

    For every unordered pair of distinct leaves that share the same label, the
    lowest common ancestor of the pair is looked up and the purity of that node
    for the shared label is accumulated. The result is the accumulated purity
    divided by the number of such pairs.

    Labels carried by fewer than two leaves contribute no pair and are skipped.
    If no label is shared by at least two leaves the final division is done
    with a zero pair count.

    :param tree: input tree
    :param leaf_labels: one integer label per leaf of ``tree``; it is used both as
        a fancy index and in an element-wise comparison (presumably a 1d numpy
        integer array -- confirm against callers)
    :return: accumulated purity divided by the number of same-label leaf pairs
    """
    from itertools import combinations

    lca_solver = hg.make_lca_fast(tree)
    node_area = hg.attribute_area(tree)
    num_leaves = tree.num_leaves()

    # One-hot encoding of the leaf labels: one row per leaf, one column per label.
    one_hot = np.zeros((num_leaves, np.max(leaf_labels) + 1), dtype=np.int64)
    one_hot[np.arange(num_leaves), leaf_labels] = 1

    # Sum-accumulate the one-hot rows along the tree (presumably yielding, for each
    # node, a histogram of the labels below it), then normalise by the node area.
    histograms = hg.accumulate_sequential(tree, one_hot, hg.Accumulators.sum)
    purity = histograms / node_area[:, None]

    num_pairs = 0
    purity_sum = 0
    for label in set(leaf_labels):
        members = (leaf_labels == label).nonzero()[0]
        if len(members) < 2:
            # A single leaf cannot form a pair.
            continue

        pair_array = np.asarray(list(combinations(members, 2)), dtype=np.int64)
        num_pairs += len(pair_array)

        ancestors = lca_solver.lca(pair_array[:, 0], pair_array[:, 1])
        purity_sum += np.sum(purity[ancestors, label])

    return purity_sum / num_pairs
Ejemplo n.º 3
0
def loss_triplet(graph, edge_weights, ultrametric, hierarchy, triplets,
                 margin):
    r"""
    Triplet loss regularization with triplets :math:`\mathcal{T}`:

    .. math::

        loss = \frac{1}{|\mathcal{T}|} \sum_{(ref, pos, neg)\in \mathcal{T}} \max(0, d(ref, pos) - d(ref, neg) + margin)

    where :math:`d(x, y)` is the altitude, in the given hierarchy, of the lowest common ancestor of
    :math:`x` and :math:`y`. Note that the hinge terms are averaged (``mean``), not summed.

    :param graph: input graph (``higra.UndirectedGraph``); not used by this function
    :param edge_weights: edge weights of the input graph (``torch.Tensor``); not used by this function
    :param ultrametric: ultrametric on the input graph (``torch.Tensor``); not used by this function,
        the distances are read from the altitudes of ``hierarchy``
    :param hierarchy: tuple ``(tree, altitudes)`` corresponding to the result of ``higra.bpt_canonical``
        on the input edge weighted graph. It is unpacked unconditionally, so it must be provided.
    :param triplets: structure of the form ``(pairs, (pos, neg))``: ``pairs`` is unpacked as the arguments
        of the lowest common ancestor query, and ``pos`` and ``neg`` are indices into the resulting
        per-pair distances selecting respectively the positive and the negative pair of each triplet
    :param margin: value added to the difference between positive and negative distances before the hinge
    :return: loss value as a pytorch scalar
    """
    tree, altitudes = hierarchy
    lcaf = hg.make_lca_fast(tree)

    pairs, (pos, neg) = triplets

    # Distance of every pair: altitude of the lowest common ancestor of its two vertices.
    pairs_distances = altitudes[lcaf.lca(*pairs)]

    # Hinge on (positive distance - negative distance + margin), one term per triplet.
    triplet_loss = tc.relu(pairs_distances[pos] - pairs_distances[neg] +
                           margin)

    return triplet_loss.mean()
Ejemplo n.º 4
0
def attribute_lca_map(tree, leaf_graph):
    r"""
    Lowest common ancestor of `i` and `j` for each edge :math:`(i, j)` of the leaf graph of the given tree.

    Complexity: :math:`\mathcal{O}(n\log(n)) + \mathcal{O}(m)` where :math:`n` is the number of nodes in `tree` and
    :math:`m` is the number of edges in :attr:`leaf_graph`.

    :param tree: input tree (Concept :class:`~higra.CptHierarchy`)
    :param leaf_graph: graph on the leaves of the input tree (deduced from :class:`~higra.CptHierarchy` on `tree`)
    :return: a 1d array
    """
    # Build the LCA query structure for the tree, then query it once for all
    # the edges of the leaf graph.
    return hg.make_lca_fast(tree).lca(leaf_graph)