Ejemplo n.º 1
0
def test_simple_example():
    """Check that the two PPR back ends give the same element-centric score.

    Two small overlapping clusterings are compared with ``sim.element_sim``
    once with ``ppr_implementation="prpack"`` and once with
    ``"power_iteration"``; the scores must agree to three significant digits.
    """
    memberships_a = {0: [0, 1], 1: [1, 2], 2: [1, 3], 3: [0], 4: [2], 5: [1]}
    memberships_b = {0: [0], 1: [1], 2: [1], 3: [0, 3], 4: [2, 4], 5: [2]}

    c1 = Clustering(elm2clu_dict=memberships_a)
    c2 = Clustering(elm2clu_dict=memberships_b)

    # Everything except the PPR back end is identical between the two calls.
    shared_options = dict(alpha=0.9, r=1.0, r2=None, rescale_path_type="max")

    scores = {}
    for backend in ("prpack", "power_iteration"):
        scores[backend] = sim.element_sim(
            c1, c2, ppr_implementation=backend, **shared_options)

    assert_approx_equal(
        scores["prpack"], scores["power_iteration"], significant=3)
Ejemplo n.º 2
0
def test_real_example_on_overlapping_community():
    """Compare both PPR back ends on clusterings loaded from JSON files.

    The ground-truth and detected community memberships are read from two
    JSON files in the current working directory, turned into clusterings,
    and scored with ``sim.element_sim`` using the "prpack" and the
    "power_iteration" implementations. The elapsed time of each call is
    printed and the two scores must agree to three significant digits.
    """
    # Context managers guarantee the file handles are closed even when
    # json.load raises.
    with open("ground_truth_community_Philosophy.json", "r") as handle:
        ground_truth_community = json.load(handle)
    with open("detected_community_Philosophy.json", "r") as handle:
        detected_community = json.load(handle)

    c1 = Clustering(elm2clu_dict=ground_truth_community)
    c2 = Clustering(elm2clu_dict=detected_community)

    start = time.time()
    sim_ppr_pack = sim.element_sim(
        c1,
        c2,
        alpha=0.9,
        r=1.0,
        r2=None,
        rescale_path_type="max",
        ppr_implementation="prpack",
    )
    end = time.time()
    print("prpack elapsed time: {}s".format(end - start))

    start = time.time()
    sim_ppr_power_iteration = sim.element_sim(
        c1,
        c2,
        alpha=0.9,
        r=1.0,
        r2=None,
        rescale_path_type="max",
        ppr_implementation="power_iteration",
    )
    end = time.time()
    print("power iteration elapsed time: {}s".format(end - start))

    assert_approx_equal(sim_ppr_pack, sim_ppr_power_iteration, significant=3)
Ejemplo n.º 3
0
def generate_random_partition_num(n_elements, n_clusters):
    """Return a clustering built from ``_random_partition_num_iterator``.

    :param n_elements: forwarded to ``_random_partition_num_iterator``.
    :param n_clusters: forwarded to ``_random_partition_num_iterator``.

    :returns: A new ``Clustering`` populated from the generated cluster list.
    """
    clusters = _random_partition_num_iterator(n_elements, n_clusters)

    result = Clustering()
    result.from_cluster_list(clusters)
    return result
Ejemplo n.º 4
0
def shuffle_memberships_pa(clustering, n_steps=1, constant_num_clusters=True):
    """
        This function creates a new clustering by shuffling the element
        memberships from the original clustering according to the preferential
        attachment model.

        See :cite:`Gates2017impact` for a detailed explanation of the preferential
        attachment model.

        At every step a source cluster and a destination cluster are drawn
        with probability proportional to their current size, and one element
        picked uniformly from the source is moved to the destination (a step
        is a no-op when both draws select the same cluster).

        :param Clustering clustering: The original clustering.

        :param int n_steps: optional (default 1)
            The number of times to run the preferential attachment algorithm.

        :param Boolean constant_num_clusters: optional (default True)
            Reject a shuffling move if it leaves a cluster with no elements.
            Set to True to keep the number of clusters constant.

        :returns:
            The new clustering with shuffled memberships.

        >>> import clusim.clugen as clugen
        >>> from clusim.clustering import print_clustering
        >>> orig_clu = clugen.make_random_clustering(n_elements=9, n_clusters=3,
                                              random_model='num')
        >>> print_clustering(orig_clu)
        >>> shuffle_clu = clugen.shuffle_memberships_pa(orig_clu, n_steps=10,
                                                 constant_num_clusters=True)
        >>> print_clustering(shuffle_clu)
    """
    n_elements_norm = 1./float(clustering.n_elements)

    cluster_names = range(clustering.n_clusters)
    cluster_list = clustering.to_cluster_list()

    # Exact integer sizes are the single source of truth. Probabilities are
    # re-derived from them at every step instead of being updated in place,
    # so repeated +/- of a float can never leave an emptied cluster with a
    # tiny positive (or negative) probability that would make
    # np.random.choice fail.
    cluster_sizes = np.array([len(members) for members in cluster_list],
                             dtype=int)

    for _ in range(n_steps):
        cluster_size_prob = cluster_sizes * n_elements_norm

        from_cluster = np.random.choice(cluster_names, p=cluster_size_prob)
        # A source cluster with a single element may only be drained when the
        # caller allows the number of clusters to change.
        if cluster_sizes[from_cluster] > 1 or not constant_num_clusters:

            exchanged_element = np.random.choice(cluster_list[from_cluster], 1,
                                                 replace=False)[0]
            new_cluster = np.random.choice(cluster_names, p=cluster_size_prob)

            if new_cluster != from_cluster:
                cluster_list[from_cluster].remove(exchanged_element)
                cluster_sizes[from_cluster] -= 1

                cluster_list[new_cluster].append(exchanged_element)
                cluster_sizes[new_cluster] += 1

    new_clustering = Clustering()
    new_clustering.from_cluster_list(cluster_list)

    return new_clustering
Ejemplo n.º 5
0
def generate_random_partition_perm(clu_size_seq):
    """Return a random clustering whose cluster sizes follow ``clu_size_seq``.

    The elements ``0 .. sum(clu_size_seq) - 1`` are randomly permuted and the
    permutation is cut into consecutive slices, one per entry of
    ``clu_size_seq``, each slice having the requested size.

    :param clu_size_seq: sequence of cluster sizes.

    :returns: A new ``Clustering`` built from the list of slices.
    """
    total = sum(clu_size_seq)
    shuffled = np.random.permutation(np.arange(total))

    # bounds[k]:bounds[k + 1] delimits the slice belonging to cluster k.
    bounds = np.hstack([[0], np.cumsum(clu_size_seq)])

    cluster_list = []
    for lower, upper in zip(bounds[:-1], bounds[1:]):
        cluster_list.append(shuffled[lower:upper])

    result = Clustering()
    result.from_cluster_list(cluster_list)
    return result
Ejemplo n.º 6
0
def paint_similarity_trace(b, oks, output=None, figsize=(3, 3), dpi=200,
                           **kwargs):
    """Plot the element-centric similarity of each trace entry against ``b``.

    For every value ``g`` in ``oks.trace_mb`` a clustering is built from
    ``g[1]`` and compared, via ``sim.element_sim``, with the clustering built
    from the membership list ``b``. The resulting series is drawn as a line
    plot.

    :param b: membership list of the reference clustering.
    :param oks: object exposing a ``trace_mb`` mapping.
    :param output: optional path; when given, the figure is saved there.
    :param figsize: figure size passed to ``plt.subplots``.
    :param dpi: resolution used only when saving; the figure itself is always
        created at 300 dpi.
    :param kwargs: accepted but not used.
    """
    reference = Clustering()
    fig, ax = plt.subplots(figsize=figsize, dpi=300)
    reference.from_membership_list(b)

    similarities = []
    for entry in oks.trace_mb.values():
        snapshot = Clustering()
        snapshot.from_membership_list(entry[1])
        similarities.append(sim.element_sim(reference, snapshot))

    ax.autoscale()
    ax.margins(0.1)
    plt.xlabel("steps")
    plt.ylabel("Element-centric similarity")
    plt.yticks(np.linspace(0, 1, 5))
    ax.tick_params(direction="in")
    plt.plot(similarities)

    if output is None:
        return
    plt.savefig(output, dpi=dpi, transparent=True)
Ejemplo n.º 7
0
def test_model_example():
    """Check expected Rand index and expected MI under every random model.

    For each model in ``sim.available_random_models`` the expected Rand index
    and the expected mutual information (log base 2) of two four-element
    clusterings are compared with pre-computed reference values.
    """
    c1_elm2clu_dict = {0: [0], 1: [1], 2: [1], 3: [0]}
    c2_elm2clu_dict = {0: [0], 1: [1], 2: [1], 3: [1]}

    c1 = Clustering(elm2clu_dict=c1_elm2clu_dict)
    c2 = Clustering(elm2clu_dict=c2_elm2clu_dict)

    known_rand_values = {
        'perm': 0.5,
        'perm1': 0.5,
        'num': 0.510204081632653,
        'num1': 0.5,
        'all': 0.555555555555556,
        'all1': 0.5
    }

    known_mi_values = {
        'perm': 0.311278124459133,
        'perm1': 0.311278124459133,
        'num': 0.309927805548467,
        'num1': 0.301825892084476,
        'all': 0.611635721962606,
        'all1': 0.419448541053684
    }

    # The third argument of assert_approx_equal (numpy.testing semantics) is
    # a number of significant digits, not an absolute tolerance. Passing
    # 10**(-10) there made the check accept almost any value, so request ten
    # significant digits explicitly.
    n_significant = 10

    for rdm in sim.available_random_models:
        exp_rand_value = sim.expected_rand_index(n_elements=c1.n_elements,
                                                 n_clusters1=c1.n_clusters,
                                                 n_clusters2=c2.n_clusters,
                                                 clu_size_seq1=c1.clu_size_seq,
                                                 clu_size_seq2=c2.clu_size_seq,
                                                 random_model=rdm)
        assert_approx_equal(
            exp_rand_value, known_rand_values[rdm],
            significant=n_significant,
            err_msg="Expected Rand Index with {} Random Model does not match."
            "{} != {}".format(rdm, exp_rand_value, known_rand_values[rdm]))

        exp_mi_value = float(
            sim.expected_mi(n_elements=c1.n_elements,
                            n_clusters1=c1.n_clusters,
                            n_clusters2=c2.n_clusters,
                            clu_size_seq1=c1.clu_size_seq,
                            clu_size_seq2=c2.clu_size_seq,
                            random_model=rdm,
                            logbase=2.))
        assert_approx_equal(
            exp_mi_value, known_mi_values[rdm],
            significant=n_significant,
            err_msg="Expected MI with {} Random Model does not match."
            "{} != {}".format(rdm, exp_mi_value, known_mi_values[rdm]))
Ejemplo n.º 8
0
def test_elementsim_example():
    """Check per-element similarity scores against published values.

    An overlapping clustering is compared with a hierarchical clustering
    (memberships plus a DAG) using ``sim.element_sim_elscore``; the score of
    each of the seven elements must match the reference list.
    """

    # taken from Fig 3 of Gates et al (2018) Scientific Reports

    # overlapping clustering
    c1_elm2clu_dict = {
        0: [0],
        1: [0],
        2: [0],
        3: [3],
        4: ['.3'],
        5: ['.3', '.9'],
        6: ['.9']
    }

    # hierarchical clustering
    c2_elm2clu_dict = {
        0: [1],
        1: [1],
        2: [2],
        3: [5],
        4: [5],
        5: [6, 8],
        6: [9]
    }
    c2_dag = DAG()
    c2_dag.add_edges_from([(0, 1), (0, 2), (3, 4), (4, 5), (4, 6), (3, 7),
                           (7, 8), (7, 9)])

    c1 = Clustering(elm2clu_dict=c1_elm2clu_dict)
    c2 = Clustering(elm2clu_dict=c2_elm2clu_dict, hier_graph=c2_dag)

    known_elsim = [
        0.92875658, 0.92875658, 0.85751315, 0.25717544, 0.74282456, 0.82083876,
        0.80767074
    ]

    elsim, ellabels = sim.element_sim_elscore(c1,
                                              c2,
                                              alpha=0.9,
                                              r=1.,
                                              r2=None,
                                              rescale_path_type='max')

    # assert_approx_equal's third argument (numpy.testing semantics) counts
    # significant digits; the former 10**(-10) made the check vacuous. The
    # reference values above are quoted to eight decimals, so seven
    # significant digits is the tightest comparison they support.
    n_significant = 7

    for i, expected in enumerate(known_elsim):
        assert_approx_equal(
            elsim[i], expected,
            significant=n_significant,
            err_msg="Element-centric similarity for element %s does not match. %s != %s"
            % (i, elsim[i], expected))
Ejemplo n.º 9
0
def test_comparison_example():
    """Check pair co-occurrence counts and every available similarity measure.

    Two six-element clusterings are compared. The four pairwise
    co-occurrence counts are checked first, then each measure named in
    ``sim.available_similarity_measures`` is called on the pair and compared
    with its pre-computed reference value.
    """
    c1_elm2clu_dict = {0: [0], 1: [1], 2: [1], 3: [0], 4: [2], 5: [1]}
    c2_elm2clu_dict = {0: [0], 1: [1], 2: [1], 3: [0], 4: [2], 5: [2]}

    c1 = Clustering(elm2clu_dict=c1_elm2clu_dict)
    c2 = Clustering(elm2clu_dict=c2_elm2clu_dict)

    N11, N10, N01, N00 = sim.count_pairwise_cooccurence(c1, c2)

    assert N11 == 2, "Element Co-occurance counts for N11 does not match. %s != %s" % (
        N11, 2)
    assert N10 == 2, "Element Co-occurance counts for N10 does not match. %s != %s" % (
        N10, 2)
    assert N01 == 1, "Element Co-occurance counts for N01 does not match. %s != %s" % (
        N01, 1)
    assert N00 == 10, "Element Co-occurance counts for N00 does not match. %s != %s" % (
        N00, 10)

    known_sim_values = {
        'jaccard_index': 0.4,
        'rand_index': 0.8,
        'fowlkes_mallows_index': 0.5773502691896258,
        'rogers_tanimoto_index': 2. / 3.,
        'southwood_index': 2. / 3.,
        'czekanowski_index': 0.5714285714285714,
        'dice_index': 0.5714285714285714,
        'sorensen_index': 0.5714285714285714,
        'pearson_correlation': 0.011363636363636364,
        'classification_error': 0.16666666666666674,
        'purity_index': 0.8333333333333333,
        'fmeasure': 0.5714285714285714,
        'nmi': 0.7396673768007593,
        'vi': 0.792481250360578,
        'geometric_accuracy': 0.8333333333333334,
        'overlap_quality': 0.0,
        'onmi': 0.7449589906475155,
        'omega_index': 0.44444444444444453
    }

    for simfunc in sim.available_similarity_measures:
        # Look the measure up by name instead of eval-ing a built string:
        # same call, no code execution from string concatenation.
        simvalue = getattr(sim, simfunc)(c1, c2)
        assert simvalue == known_sim_values[
            simfunc], "Similarity Measure %s does not match. %s != %s" % (
                simfunc, simvalue, known_sim_values[simfunc])
Ejemplo n.º 10
0
def calc_nonoverlap_nmi(pred_membership, gt_membership):
    """Return the NMI (``norm_type='sum'``) between two membership lists.

    :param pred_membership: membership list of the predicted clustering.
    :param gt_membership: membership list of the ground-truth clustering.

    :returns: The value of ``clusim.sim.nmi`` for the two clusterings.
    """
    from clusim.clustering import Clustering
    import clusim.sim as sim

    def _from_memberships(membership):
        # Wrap a membership list in a clusim Clustering.
        clu = Clustering()
        clu.from_membership_list(membership)
        return clu

    predicted = _from_memberships(pred_membership)
    truth = _from_memberships(gt_membership)
    return sim.nmi(predicted, truth, norm_type='sum')
Ejemplo n.º 11
0
def generate_random_partition_all(n_elements, tol=1.0e-15):
    """
        This function creates a random clustering according to the 'All'
        random model by uniformly selecting a clustering from the ensemble of all
        clusterings with n_elements.

        A number of colors K is drawn from the weights
        ``u**n_elements / (bell(n_elements) * e * u!)`` (accumulated until
        their sum reaches ``1 - tol``), then every element is assigned one of
        the K colors uniformly at random. The weights are cached in
        ``all_partition_weight_dict`` under the key ``(n_elements, tol)``.

        :param int n_elements:
            The number of elements

        :param float tol: (optional)
            The tolerance used by the algorithm to approximate the probability distribution

        :returns: The randomly generated clustering.

        >>> import clusim.clugen as clugen
        >>> from clusim.clustering import print_clustering
        >>> clu = clugen.generate_random_partition_all(n_elements = 9)
        >>> print_clustering(clu)
    """

    cache_key = (n_elements, tol)
    if cache_key in all_partition_weight_dict:
        weights = all_partition_weight_dict[cache_key]
    else:
        weights = []
        # Running total: same left-to-right sum as sum(weights), without
        # re-adding every weight on each iteration.
        total = 0
        u = 1
        b = mpmath.bell(n_elements)
        while total < 1.0 - tol:
            weight = mpmath.power(u, n_elements)/(b * mpmath.e * mpmath.factorial(u))
            weights.append(weight)
            total += weight
            u += 1
        all_partition_weight_dict[cache_key] = weights

    # The truncated series sums to slightly less than one; renormalise so
    # np.random.choice accepts the probabilities whatever `tol` was used.
    probabilities = np.array([float(weight) for weight in weights])
    probabilities /= probabilities.sum()

    K = np.random.choice(np.arange(1, len(weights) + 1), p=probabilities)
    colors = np.random.randint(K, size=n_elements)

    new_clustering = Clustering()
    new_clustering.from_membership_list(colors)
    return new_clustering
Ejemplo n.º 12
0
def paint_mds(oks, figsize=(20, 20)):
    """Draw a 2-D MDS embedding of the clusterings stored in ``oks.trace_mb``.

    A square dissimilarity matrix (``1 - sim.element_sim``) is filled for the
    entries of ``oks.trace_mb`` addressed by the 2-combinations of
    ``1 .. int(sqrt(len(trace_mb)))``, embedded with metric MDS on the
    precomputed dissimilarities, and the keys of ``trace_mb`` are drawn as
    text labels at the embedded positions.

    :param oks: object exposing a ``trace_mb`` mapping.
    :param figsize: size of the figure created for the embedding.
    """
    n_traces = len(oks.trace_mb.keys())
    side = int(n_traces**0.5)
    X = np.zeros([n_traces, n_traces])

    def _clustering_for(key):
        # Build the clustering for one trace entry.
        membership = oks.trace_mb[key]
        clu = Clustering()
        clu.from_membership_list(membership)
        return clu

    for row, key_row in enumerate(combinations(range(1, side + 1), 2)):
        clu_row = _clustering_for(key_row)
        for col, key_col in enumerate(combinations(range(1, side + 1), 2)):
            clu_col = _clustering_for(key_col)

            # Write the entry and its mirror so the matrix stays symmetric.
            X[row][col] = 1 - sim.element_sim(clu_row, clu_col)
            X[col][row] = X[row][col]

    def _plot_embedding(X, title=None):
        # Rescale every coordinate column to [0, 1] before placing labels.
        lower, upper = np.min(X, 0), np.max(X, 0)
        X = (X - lower) / (upper - lower)

        plt.figure(figsize=figsize)
        labels = [str(key) for key in oks.trace_mb.keys()]
        label_color = plt.cm.Set1(1 / 10.)
        for i in range(X.shape[0]):
            plt.text(X[i, 0],
                     X[i, 1],
                     labels[i],
                     color=label_color,
                     fontdict={
                         'weight': 'bold',
                         'size': 12
                     })
        plt.xticks([])
        plt.yticks([])
        if title is not None:
            plt.title(title)

    embedder = manifold.MDS(n_components=2,
                            n_init=10,
                            max_iter=10000,
                            dissimilarity="precomputed")
    _plot_embedding(embedder.fit_transform(X))
def compare_scores(nexperiment, true_clusters, true_labels, predicted_clusters,
                   predicted_labels):
    """Print a table of clustering-comparison scores for one experiment.

    Label-based scores are computed from ``true_labels`` and
    ``predicted_labels``; cluster-list-based scores (the ``score`` module and
    the ``sim`` measures) are computed from ``true_clusters`` and
    ``predicted_clusters``. Every value is rounded to three decimals when
    printed.

    :param nexperiment: identifier printed in the report header.
    :param true_clusters: reference clustering as a cluster list.
    :param true_labels: reference clustering as a label sequence.
    :param predicted_clusters: predicted clustering as a cluster list.
    :param predicted_labels: predicted clustering as a label sequence.
    """
    label_pair = (true_labels, predicted_labels)

    mp_score = score.calculate_mp_score(true_clusters, predicted_clusters)
    nmi = normalized_mutual_info_score(*label_pair,
                                       average_method='arithmetic')
    anmi = adjusted_mutual_info_score(*label_pair)
    completeness = completeness_score(*label_pair)
    v_measure = v_measure_score(*label_pair)
    rand = adjusted_rand_score(*label_pair)
    fms = fowlkes_mallows_score(*label_pair)

    T = Clustering()
    C = Clustering()
    T.from_cluster_list(true_clusters)
    C.from_cluster_list(predicted_clusters)

    jaccard_index = sim.jaccard_index(T, C)
    nmi2 = sim.nmi(T, C)
    fmeasure = sim.fmeasure(T, C)
    element_sim = sim.element_sim(T, C)
    ri = sim.rand_index(T, C)

    # (label, value) pairs in the order they appear in the report.
    report = [
        ("Weigthed Similarity: ", mp_score),
        ("NMI: ", nmi),
        ("AMI: ", anmi),
        ("NMI2: ", nmi2),
        ("RI: ", ri),
        ("Completeness: ", completeness),
        ("V-Measure: ", v_measure),
        ("Adjusted Rand: ", rand),
        ("Fowlkes Mallows: ", fms),
        ("Jaccard Index: ", jaccard_index),
        ("F-Measure: ", fmeasure),
        ("Element-centric: ", element_sim),
    ]

    print("------------------")
    print("Example ", nexperiment)
    for label, value in report:
        print(label, round(value, 3))
    print()
Ejemplo n.º 14
0
def make_equal_clustering(n_elements, n_clusters):
    """
    This function creates a clustering with equally sized clusters by
    assigning element ``el`` to cluster ``el % n_clusters``.
    If n_elements % n_clusters != 0, cluster sizes will differ by one
    element.

    :param int n_elements:
        The number of elements

    :param int n_clusters:
        The number of clusters

    :returns:
        The new clustering with equally sized clusters.

    >>> import clusim.clugen as clugen
    >>> from clusim.clustering import print_clustering
    >>> clu = clugen.make_equal_clustering(n_elements = 9, n_clusters = 3)
    >>> print_clustering(clu)
    """
    memberships = {}
    for element in range(n_elements):
        # Round-robin assignment keeps the cluster sizes within one of each other.
        memberships[element] = [element % n_clusters]
    return Clustering(memberships)
Ejemplo n.º 15
0
def shuffle_memberships(clustering, percent=1.0):
    """
    This function creates a new clustering by shuffling the element
    memberships from the original clustering.

    A random subset of the elements is selected and the memberships of those
    elements are permuted among themselves; all other elements keep their
    memberships.

    :param Clustering clustering: The original clustering.

    :param float percent: optional (default 1.0)
        The fractional percentage (between 0.0 and 1.0) of the elements to
        shuffle. The number of shuffled elements is
        ``int(percent * clustering.n_elements)``.

    :returns: The new clustering. It is a ``HierClustering`` (with a deep
        copy of the hierarchy graph) when the original is hierarchical.

    >>> import clusim.clugen as clugen
    >>> from clusim.clustering import print_clustering
    >>> orig_clu = clugen.make_random_clustering(n_elements = 9, n_clusters = 3,
                                          random_model = 'num')
    >>> print_clustering(orig_clu)
    >>> shuffle_clu = clugen.shuffle_memberships(orig_clu, percent = 0.5)
    >>> print_clustering(shuffle_clu)
    """
    el_to_shuffle = np.random.choice(clustering.elements,
                                     int(percent * clustering.n_elements),
                                     replace=False)
    shuffled_el = np.random.permutation(el_to_shuffle)
    # Maps each selected element to the element whose membership it receives.
    newkeys = dict(zip(el_to_shuffle, shuffled_el))

    new_elm2clu_dict = copy.deepcopy(clustering.elm2clu_dict)
    for el in shuffled_el:
        # Copy the membership container: assigning it directly would make the
        # new clustering share a mutable object with the original one,
        # defeating the deepcopy above.
        new_elm2clu_dict[el] = copy.copy(clustering.elm2clu_dict[newkeys[el]])

    if clustering.is_hierarchical:
        new_clustering = HierClustering(elm2clu_dict=new_elm2clu_dict,
                                        hier_graph=copy.deepcopy(clustering.hiergraph))
    else:
        new_clustering = Clustering(elm2clu_dict=new_elm2clu_dict)
    return new_clustering
def clustering_with_clusim(dis):
    """Build a clustering from an average-linkage tree of ``dis``.

    ``dis`` is converted to an array, passed through ``squareform``, linked
    with the "average" method, and the linkage matrix is handed to
    ``Clustering.from_scipy_linkage`` with ``dist_rescaled=True``.

    :param dis: distance data accepted by ``np.array`` and ``squareform``.

    :returns: Whatever ``Clustering().from_scipy_linkage(...)`` returns.
    """
    condensed = squareform(np.array(dis))
    tree = linkage(condensed, "average")
    return Clustering().from_scipy_linkage(tree, dist_rescaled=True)