# Consolidated imports for the functions below.
import copy
import json
import time
from itertools import combinations

import matplotlib.pyplot as plt
import mpmath
import numpy as np
from numpy.testing import assert_approx_equal
from scipy.cluster.hierarchy import linkage
from scipy.spatial.distance import squareform
from sklearn import manifold
from sklearn.metrics import (adjusted_mutual_info_score, adjusted_rand_score,
                             completeness_score, fowlkes_mallows_score,
                             normalized_mutual_info_score, v_measure_score)

import clusim.sim as sim
from clusim.clustering import Clustering, HierClustering
from clusim.dag import DAG


def test_simple_example():
    # Two small overlapping clusterings: each element maps to the list of
    # cluster labels it belongs to.
    c1_elm2clu_dict = {0: [0, 1], 1: [1, 2], 2: [1, 3], 3: [0], 4: [2], 5: [1]}
    c2_elm2clu_dict = {0: [0], 1: [1], 2: [1], 3: [0, 3], 4: [2, 4], 5: [2]}

    c1 = Clustering(elm2clu_dict=c1_elm2clu_dict)
    c2 = Clustering(elm2clu_dict=c2_elm2clu_dict)

    # The two personalized-PageRank backends should agree to ~3 significant
    # digits.
    sim_ppr_pack = sim.element_sim(
        c1, c2, alpha=0.9, r=1.0, r2=None, rescale_path_type="max",
        ppr_implementation="prpack",
    )
    sim_ppr_power_iteration = sim.element_sim(
        c1, c2, alpha=0.9, r=1.0, r2=None, rescale_path_type="max",
        ppr_implementation="power_iteration",
    )
    assert_approx_equal(sim_ppr_pack, sim_ppr_power_iteration, significant=3)

def test_real_example_on_overlapping_community():
    # Ground-truth and detected overlapping communities for the Philosophy
    # example, stored as elm2clu_dict JSON files.
    with open("ground_truth_community_Philosophy.json", "r") as f:
        ground_truth_community = json.load(f)
    with open("detected_community_Philosophy.json", "r") as f:
        detected_community = json.load(f)

    c1 = Clustering(elm2clu_dict=ground_truth_community)
    c2 = Clustering(elm2clu_dict=detected_community)

    start = time.time()
    sim_ppr_pack = sim.element_sim(
        c1, c2, alpha=0.9, r=1.0, r2=None, rescale_path_type="max",
        ppr_implementation="prpack",
    )
    end = time.time()
    print("prpack elapsed time: {}s".format(end - start))

    start = time.time()
    sim_ppr_power_iteration = sim.element_sim(
        c1, c2, alpha=0.9, r=1.0, r2=None, rescale_path_type="max",
        ppr_implementation="power_iteration",
    )
    end = time.time()
    print("power iteration elapsed time: {}s".format(end - start))

    assert_approx_equal(sim_ppr_pack, sim_ppr_power_iteration, significant=3)

def generate_random_partition_num(n_elements, n_clusters):
    """Draw a random partition of n_elements into exactly n_clusters clusters
    under the 'Num' random model (fixed number of clusters).

    Relies on the module-private helper _random_partition_num_iterator,
    defined elsewhere in this module.
    """
    clu_list = _random_partition_num_iterator(n_elements, n_clusters)
    new_clustering = Clustering()
    new_clustering.from_cluster_list(clu_list)
    return new_clustering
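
# A minimal usage sketch for generate_random_partition_num (illustrative
# only; it assumes the private helper above is available, as in
# clusim.clugen). Wrapped in a function so it does not run on import.
def _example_random_partition_num():
    clu = generate_random_partition_num(n_elements=9, n_clusters=3)
    # The draw fixes the number of clusters but not their sizes.
    print(clu.n_clusters)    # 3
    print(clu.clu_size_seq)  # e.g. [4, 2, 3]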

def shuffle_memberships_pa(clustering, n_steps=1, constant_num_clusters=True):
    """
    This function creates a new clustering by shuffling the element
    memberships from the original clustering according to the preferential
    attachment model. See :cite:`Gates2017impact` for a detailed explanation
    of the preferential attachment model.

    :param Clustering clustering: The original clustering.

    :param int n_steps: optional (default 1)
        The number of times to run the preferential attachment algorithm.

    :param Boolean constant_num_clusters: optional (default True)
        Reject a shuffling move if it would leave a cluster with no elements.
        Set to True to keep the number of clusters constant.

    :returns: The new clustering with shuffled memberships.

    >>> import clusim.clugen as clugen
    >>> from clusim.clustering import print_clustering
    >>> orig_clu = clugen.make_random_clustering(n_elements=9, n_clusters=3, random_model='num')
    >>> print_clustering(orig_clu)
    >>> shuffle_clu = clugen.shuffle_memberships_pa(orig_clu, n_steps=10, constant_num_clusters=True)
    >>> print_clustering(shuffle_clu)
    """
    n_elements_norm = 1. / float(clustering.n_elements)
    Nclusters = clustering.n_clusters
    cluster_list = clustering.to_cluster_list()
    # Cluster-size distribution; used both as the selection probability and
    # to track sizes as elements move between clusters.
    cluster_size_prob = np.array(list(map(len, cluster_list))) * n_elements_norm
    clusternames = range(Nclusters)

    for istep in range(n_steps):
        from_cluster = np.random.choice(clusternames, p=cluster_size_prob)
        # Only move an element out if the source cluster keeps at least one
        # element, unless the number of clusters is allowed to change.
        if (cluster_size_prob[from_cluster] > 1.5 * n_elements_norm
                or not constant_num_clusters):
            exchanged_element = np.random.choice(cluster_list[from_cluster],
                                                 1, replace=False)[0]
            new_cluster = np.random.choice(clusternames, p=cluster_size_prob)
            if new_cluster != from_cluster:
                cluster_list[from_cluster].remove(exchanged_element)
                cluster_size_prob[from_cluster] -= n_elements_norm
                cluster_list[new_cluster].append(exchanged_element)
                cluster_size_prob[new_cluster] += n_elements_norm

    new_clustering = Clustering()
    new_clustering.from_cluster_list(cluster_list)
    return new_clustering

def generate_random_partition_perm(clu_size_seq):
    """Draw a random partition with the fixed cluster-size sequence
    clu_size_seq (the 'Perm' random model): the elements are permuted
    uniformly at random and then sliced into clusters of the given sizes."""
    n_elements = sum(clu_size_seq)
    n_clusters = len(clu_size_seq)

    elm_list = np.random.permutation(np.arange(n_elements))
    # Cumulative offsets delimiting each cluster's slice of the permutation.
    clu_idx = np.hstack([[0], np.cumsum(clu_size_seq)])
    cluster_list = [elm_list[clu_idx[iclus]:clu_idx[iclus + 1]]
                    for iclus in range(n_clusters)]

    new_clustering = Clustering()
    new_clustering.from_cluster_list(cluster_list)
    return new_clustering
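
# A minimal usage sketch for generate_random_partition_perm: the 'Perm'
# model keeps the cluster-size sequence fixed and only shuffles which
# elements land in which cluster. Illustrative only.
def _example_random_partition_perm():
    clu = generate_random_partition_perm(clu_size_seq=[3, 2, 4])
    print(clu.n_elements)    # 9
    print(clu.clu_size_seq)  # sizes 3, 2, 4 (order may depend on labels)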

def paint_similarity_trace(b, oks, output=None, figsize=(3, 3), dpi=200, **kwargs):
    # Trace the element-centric similarity between a reference partition b
    # and each partition recorded in oks.trace_mb.
    clu_base = Clustering()
    clu_base.from_membership_list(b)

    fig, ax = plt.subplots(figsize=figsize, dpi=dpi)
    e_sim_list = []
    for g in oks.trace_mb.values():
        clu = Clustering()
        clu.from_membership_list(g[1])
        e_sim_list += [sim.element_sim(clu_base, clu)]

    ax.autoscale()
    ax.margins(0.1)
    plt.xlabel("steps")
    plt.ylabel("Element-centric similarity")
    plt.yticks(np.linspace(0, 1, 5))
    ax.tick_params(direction="in")
    plt.plot(e_sim_list)
    if output is not None:
        plt.savefig(output, dpi=dpi, transparent=True)

def test_model_example():
    c1_elm2clu_dict = {0: [0], 1: [1], 2: [1], 3: [0]}
    c2_elm2clu_dict = {0: [0], 1: [1], 2: [1], 3: [1]}
    c1 = Clustering(elm2clu_dict=c1_elm2clu_dict)
    c2 = Clustering(elm2clu_dict=c2_elm2clu_dict)

    # Reference values of the expected Rand index and expected MI under each
    # supported random model.
    known_rand_values = {
        'perm': 0.5,
        'perm1': 0.5,
        'num': 0.510204081632653,
        'num1': 0.5,
        'all': 0.555555555555556,
        'all1': 0.5
    }
    known_mi_values = {
        'perm': 0.311278124459133,
        'perm1': 0.311278124459133,
        'num': 0.309927805548467,
        'num1': 0.301825892084476,
        'all': 0.611635721962606,
        'all1': 0.419448541053684
    }

    for rdm in sim.available_random_models:
        exp_rand_value = sim.expected_rand_index(n_elements=c1.n_elements,
                                                 n_clusters1=c1.n_clusters,
                                                 n_clusters2=c2.n_clusters,
                                                 clu_size_seq1=c1.clu_size_seq,
                                                 clu_size_seq2=c2.clu_size_seq,
                                                 random_model=rdm)
        assert_approx_equal(
            exp_rand_value, known_rand_values[rdm], significant=10,
            err_msg="Expected Rand Index with {} Random Model does not match. "
                    "{} != {}".format(rdm, exp_rand_value,
                                      known_rand_values[rdm]))

        exp_mi_value = float(
            sim.expected_mi(n_elements=c1.n_elements,
                            n_clusters1=c1.n_clusters,
                            n_clusters2=c2.n_clusters,
                            clu_size_seq1=c1.clu_size_seq,
                            clu_size_seq2=c2.clu_size_seq,
                            random_model=rdm,
                            logbase=2.))
        assert_approx_equal(
            exp_mi_value, known_mi_values[rdm], significant=10,
            err_msg="Expected MI with {} Random Model does not match. "
                    "{} != {}".format(rdm, exp_mi_value, known_mi_values[rdm]))

def test_elementsim_example():
    # Taken from Fig. 3 of Gates et al. (2018) Scientific Reports.

    # overlapping clustering
    c1_elm2clu_dict = {
        0: [0], 1: [0], 2: [0], 3: [3],
        4: ['.3'], 5: ['.3', '.9'], 6: ['.9']
    }
    # hierarchical clustering
    c2_elm2clu_dict = {0: [1], 1: [1], 2: [2], 3: [5], 4: [5], 5: [6, 8], 6: [9]}
    c2_dag = DAG()
    c2_dag.add_edges_from([(0, 1), (0, 2), (3, 4), (4, 5), (4, 6), (3, 7),
                           (7, 8), (7, 9)])

    c1 = Clustering(elm2clu_dict=c1_elm2clu_dict)
    c2 = Clustering(elm2clu_dict=c2_elm2clu_dict, hier_graph=c2_dag)

    known_elsim = [0.92875658, 0.92875658, 0.85751315, 0.25717544,
                   0.74282456, 0.82083876, 0.80767074]

    elsim, ellabels = sim.element_sim_elscore(c1, c2, alpha=0.9, r=1.,
                                              r2=None, rescale_path_type='max')
    for i in range(7):
        assert_approx_equal(
            elsim[i], known_elsim[i], significant=7,
            err_msg="Element-centric similarity for element %s does not "
                    "match. %s != %s" % (i, elsim[i], known_elsim[i]))

def test_comparison_example():
    c1_elm2clu_dict = {0: [0], 1: [1], 2: [1], 3: [0], 4: [2], 5: [1]}
    c2_elm2clu_dict = {0: [0], 1: [1], 2: [1], 3: [0], 4: [2], 5: [2]}
    c1 = Clustering(elm2clu_dict=c1_elm2clu_dict)
    c2 = Clustering(elm2clu_dict=c2_elm2clu_dict)

    # Pairwise co-occurrence counts: pairs clustered together in both (N11),
    # only in c1 (N10), only in c2 (N01), or in neither (N00).
    N11, N10, N01, N00 = sim.count_pairwise_cooccurence(c1, c2)

    assert N11 == 2, "Element co-occurrence count N11 does not match. %s != %s" % (N11, 2)
    assert N10 == 2, "Element co-occurrence count N10 does not match. %s != %s" % (N10, 2)
    assert N01 == 1, "Element co-occurrence count N01 does not match. %s != %s" % (N01, 1)
    assert N00 == 10, "Element co-occurrence count N00 does not match. %s != %s" % (N00, 10)

    known_sim_values = {
        'jaccard_index': 0.4,
        'rand_index': 0.8,
        'fowlkes_mallows_index': 0.5773502691896258,
        'rogers_tanimoto_index': 2. / 3.,
        'southwood_index': 2. / 3.,
        'czekanowski_index': 0.5714285714285714,
        'dice_index': 0.5714285714285714,
        'sorensen_index': 0.5714285714285714,
        'pearson_correlation': 0.011363636363636364,
        'classification_error': 0.16666666666666674,
        'purity_index': 0.8333333333333333,
        'fmeasure': 0.5714285714285714,
        'nmi': 0.7396673768007593,
        'vi': 0.792481250360578,
        'geometric_accuracy': 0.8333333333333334,
        'overlap_quality': 0.0,
        'onmi': 0.7449589906475155,
        'omega_index': 0.44444444444444453
    }

    for simfunc in sim.available_similarity_measures:
        # Look the measure up by name instead of using eval().
        simvalue = getattr(sim, simfunc)(c1, c2)
        assert simvalue == known_sim_values[simfunc], \
            "Similarity measure %s does not match. %s != %s" % (
                simfunc, simvalue, known_sim_values[simfunc])
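
# Worked check of the pair counts above: 6 elements give C(6, 2) = 15 pairs,
# and indeed N11 + N10 + N01 + N00 = 2 + 2 + 1 + 10 = 15. Two of the
# reference values follow directly from the counts:
#   jaccard_index = N11 / (N11 + N10 + N01) = 2 / 5   = 0.4
#   rand_index    = (N11 + N00) / 15        = 12 / 15 = 0.8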

def calc_nonoverlap_nmi(pred_membership, gt_membership):
    # Normalized mutual information between two non-overlapping clusterings
    # given as membership lists, normalized by the sum of the entropies.
    pred = Clustering()
    pred.from_membership_list(pred_membership)
    gt = Clustering()
    gt.from_membership_list(gt_membership)
    return sim.nmi(pred, gt, norm_type='sum')
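
# A quick sketch of calc_nonoverlap_nmi on made-up membership lists, where
# element i belongs to the cluster labelled pred[i] / gt[i]. NMI is symmetric
# and lies in [0, 1], with 1 for identical partitions.
def _example_nonoverlap_nmi():
    pred = [0, 0, 1, 1, 2, 2]
    gt = [0, 0, 1, 1, 1, 2]
    print(calc_nonoverlap_nmi(pred, gt))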

# Module-level cache of weight distributions, keyed by (n_elements, tol).
all_partition_weight_dict = {}


def generate_random_partition_all(n_elements, tol=1.0e-15):
    """
    This function creates a random clustering according to the 'All' random
    model by uniformly selecting a clustering from the ensemble of all
    clusterings with n_elements.

    :param int n_elements: The number of elements

    :param float tol: (optional)
        The tolerance used by the algorithm to approximate the probability
        distribution.

    :returns: The randomly generated clustering.

    >>> import clusim.clugen as clugen
    >>> from clusim.clustering import print_clustering
    >>> clu = clugen.generate_random_partition_all(n_elements=9)
    >>> print_clustering(clu)
    """
    if (n_elements, tol) in all_partition_weight_dict:
        weights = all_partition_weight_dict[(n_elements, tol)]
    else:
        # Accumulate terms of the distribution over the number of colors K
        # until the remaining tail mass falls below tol.
        weights = []
        u = 1
        b = mpmath.bell(n_elements)
        while sum(weights) < 1.0 - tol:
            weights.append(mpmath.power(u, n_elements)
                           / (b * mpmath.e * mpmath.factorial(u)))
            u += 1
        all_partition_weight_dict[(n_elements, tol)] = weights

    K = np.random.choice(np.arange(1, len(weights) + 1), p=weights)
    colors = np.random.randint(K, size=n_elements)
    new_clustering = Clustering()
    new_clustering.from_membership_list(colors)
    return new_clustering
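
# The weights built above are the summands of Dobinski's formula,
#   B_n = (1/e) * sum_{u >= 0} u**n / u!,
# so dividing each term by B_n * e yields a probability distribution over
# the number of colors K. A quick self-contained sanity check (n chosen
# arbitrarily):
def _check_all_model_weights(n=9, n_terms=40):
    b = mpmath.bell(n)
    weights = [mpmath.power(u, n) / (b * mpmath.e * mpmath.factorial(u))
               for u in range(1, n_terms + 1)]
    print(float(sum(weights)))  # ~1.0, up to the truncated tail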

def paint_mds(oks, figsize=(20, 20)):
    # Pairwise element-centric distances between all partitions stored in
    # oks.trace_mb, embedded in 2D with metric MDS.
    l2 = len(oks.trace_mb)
    l = int(l2**0.5)
    X = np.zeros([l2, l2])

    for idx_1, pair_1 in enumerate(combinations(range(1, l + 1), 2)):
        b = oks.trace_mb[pair_1]
        clu_1 = Clustering()
        clu_1.from_membership_list(b)
        for idx_2, pair_2 in enumerate(combinations(range(1, l + 1), 2)):
            b = oks.trace_mb[pair_2]
            clu_2 = Clustering()
            clu_2.from_membership_list(b)
            # Dissimilarity = 1 - element-centric similarity (symmetric).
            X[idx_1][idx_2] = 1 - sim.element_sim(clu_1, clu_2)
            X[idx_2][idx_1] = X[idx_1][idx_2]

    def _plot_embedding(X, title=None):
        # Rescale the embedding to the unit square before plotting.
        x_min, x_max = np.min(X, 0), np.max(X, 0)
        X = (X - x_min) / (x_max - x_min)

        plt.figure(figsize=figsize)
        for i in range(X.shape[0]):
            plt.text(X[i, 0], X[i, 1],
                     str(list(oks.trace_mb.keys())[i]),
                     color=plt.cm.Set1(1 / 10.),
                     fontdict={'weight': 'bold', 'size': 12})
        plt.xticks([])
        plt.yticks([])
        if title is not None:
            plt.title(title)

    clf = manifold.MDS(n_components=2, n_init=10, max_iter=10000,
                       dissimilarity="precomputed")
    X_mds = clf.fit_transform(X)
    _plot_embedding(X_mds)

def compare_scores(nexperiment, true_clusters, true_labels, predicted_clusters,
                   predicted_labels):
    # `score.calculate_mp_score` comes from the surrounding project; the
    # remaining measures come from scikit-learn and clusim.
    mp_score = score.calculate_mp_score(true_clusters, predicted_clusters)
    nmi = normalized_mutual_info_score(true_labels, predicted_labels,
                                       average_method='arithmetic')
    anmi = adjusted_mutual_info_score(true_labels, predicted_labels)
    completeness = completeness_score(true_labels, predicted_labels)
    v_measure = v_measure_score(true_labels, predicted_labels)
    rand = adjusted_rand_score(true_labels, predicted_labels)
    fms = fowlkes_mallows_score(true_labels, predicted_labels)

    T = Clustering()
    C = Clustering()
    T.from_cluster_list(true_clusters)
    C.from_cluster_list(predicted_clusters)
    jaccard_index = sim.jaccard_index(T, C)
    nmi2 = sim.nmi(T, C)
    fmeasure = sim.fmeasure(T, C)
    element_sim = sim.element_sim(T, C)
    ri = sim.rand_index(T, C)

    print("------------------")
    print("Example ", nexperiment)
    print("Weighted Similarity: ", round(mp_score, 3))
    print("NMI: ", round(nmi, 3))
    print("AMI: ", round(anmi, 3))
    print("NMI2: ", round(nmi2, 3))
    print("RI: ", round(ri, 3))
    print("Completeness: ", round(completeness, 3))
    print("V-Measure: ", round(v_measure, 3))
    print("Adjusted Rand: ", round(rand, 3))
    print("Fowlkes Mallows: ", round(fms, 3))
    print("Jaccard Index: ", round(jaccard_index, 3))
    print("F-Measure: ", round(fmeasure, 3))
    print("Element-centric: ", round(element_sim, 3))
    print()

def make_equal_clustering(n_elements, n_clusters):
    """
    This function creates a clustering with (nearly) equally sized clusters.
    If n_elements % n_clusters != 0, cluster sizes will differ by one element.

    :param int n_elements: The number of elements

    :param int n_clusters: The number of clusters

    :returns: The new clustering with equally sized clusters.

    >>> import clusim.clugen as clugen
    >>> from clusim.clustering import print_clustering
    >>> clu = clugen.make_equal_clustering(n_elements=9, n_clusters=3)
    >>> print_clustering(clu)
    """
    # Assign element el to cluster el mod n_clusters (round-robin).
    new_elm2clu_dict = {el: [el % n_clusters] for el in range(n_elements)}
    new_clustering = Clustering(new_elm2clu_dict)
    return new_clustering

def shuffle_memberships(clustering, percent=1.0):
    """
    This function creates a new clustering by shuffling the element
    memberships from the original clustering.

    :param Clustering clustering: The original clustering.

    :param float percent: optional (default 1.0)
        The fractional percentage (between 0.0 and 1.0) of the elements to
        shuffle.

    :returns: The new clustering.

    >>> import clusim.clugen as clugen
    >>> from clusim.clustering import print_clustering
    >>> orig_clu = clugen.make_random_clustering(n_elements=9, n_clusters=3, random_model='num')
    >>> print_clustering(orig_clu)
    >>> shuffle_clu = clugen.shuffle_memberships(orig_clu, percent=0.5)
    >>> print_clustering(shuffle_clu)
    """
    el_to_shuffle = np.random.choice(clustering.elements,
                                     int(percent * clustering.n_elements),
                                     replace=False)
    shuffled_el = np.random.permutation(el_to_shuffle)
    newkeys = dict(zip(el_to_shuffle, shuffled_el))

    new_elm2clu_dict = copy.deepcopy(clustering.elm2clu_dict)
    for el in shuffled_el:
        new_elm2clu_dict[el] = clustering.elm2clu_dict[newkeys[el]]

    if clustering.is_hierarchical:
        new_clustering = HierClustering(
            elm2clu_dict=new_elm2clu_dict,
            hier_graph=copy.deepcopy(clustering.hiergraph))
    else:
        new_clustering = Clustering(elm2clu_dict=new_elm2clu_dict)
    return new_clustering

def clustering_with_clusim(dis):
    # Condense the square distance matrix, run average linkage on it, and
    # load the resulting dendrogram into a clusim Clustering.
    mat = np.array(dis)
    dists = squareform(mat)
    linkage_matrix = linkage(dists, "average")
    c = Clustering().from_scipy_linkage(linkage_matrix, dist_rescaled=True)
    return c
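
# A minimal usage sketch for clustering_with_clusim with a hypothetical 4x4
# symmetric distance matrix (zero diagonal, as squareform() requires).
def _example_clustering_with_clusim():
    dis = [[0.0, 0.5, 0.9, 0.8],
           [0.5, 0.0, 0.7, 0.6],
           [0.9, 0.7, 0.0, 0.4],
           [0.8, 0.6, 0.4, 0.0]]
    c = clustering_with_clusim(dis)
    print(c.n_elements)  # 4 leaves in the dendrogram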