def test_mt_metrics():
    """Check MUC scores against Table 1 in Vilain et al. (1995).

    Every case builds a ``ClusteringMetrics`` object from a pair of
    partitions and compares the first two values returned by
    ``muc_scores()`` with the expected pair, to four decimal places.
    """
    cases = [
        # row 1
        ([["A", "B", "C", "D"]], [["A", "B"], ["C", "D"]], [1.0, 0.6667]),
        # row 2
        ([["A", "B"], ["C", "D"]], [["A", "B", "C", "D"]], [0.6667, 1.0]),
        # row 3
        ([["A", "B", "C", "D"]], [["A", "B", "C", "D"]], [1.0, 1.0]),
        # row 4 is exactly the same as row 1
        # row 5 -- the singleton cluster is spelled as a one-element list,
        # like every other cluster, rather than relying on a bare string
        # being iterated character by character.
        ([["A", "B", "C"]], [["A", "C"], ["B"]], [1.0, 0.5]),
    ]
    for p1, p2, expected in cases:
        cm = ClusteringMetrics.from_partitions(p1, p2)
        assert_array_almost_equal(cm.muc_scores()[:2], expected, 4)
def test_split_join():
    """Check Mirkin, VI and split-join distances on two clustering pairs.

    Example given in http://stats.stackexchange.com/a/25001/37267

    Within each pair below, one clustering can be obtained from the other by
    moving only two points: {9, 10} for the first pair (C1 and C2) and
    {11, 12} for the second pair (C3 and C4). The split-join distance is
    therefore the same for both pairs.

    The Mirkin and VI distances are larger for the first pair. This is not a
    fault of these measures: the clusterings in C3 and C4 do appear to
    capture more information than those in C1 and C2, and so their
    similarities should be greater.
    """
    cases = [
        # (C1, C2, expected Mirkin mismatch, expected VI distance)
        (
            [set(range(1, 9)), set(range(9, 17))],
            [set(range(1, 11)), set(range(11, 17))],
            56,
            0.594,
        ),
        # (C3, C4, expected Mirkin mismatch, expected VI distance)
        (
            [set(range(1, 5)), set(range(5, 11)), set(range(11, 17))],
            [set(range(1, 5)), set(range(5, 13)), set(range(13, 17))],
            40,
            0.520,
        ),
    ]
    for left, right, mirkin, vi in cases:
        cm = ClusteringMetrics.from_partitions(left, right)
        assert_equal(cm.mirkin_mismatch_coeff(normalize=False), mirkin)
        assert_almost_equal(cm.vi_distance(normalize=False), vi, 3)
        # Both pairs differ by two moved points, hence the same distance.
        assert_equal(cm.split_join_distance(normalize=False), 4)
def test_bc_metrics():
    """Check B-cubed and MUC scores on Bagga & Baldwin (1998) examples.

    Covers Examples 1 and 2, listed in Figure 9 of the paper. The first
    partition is shared by both examples; only the second partition changes.
    The first two values of ``bc_metrics()`` differ between the examples,
    while the first two values of ``muc_scores()`` are expected to be the
    same in both.
    """
    p1 = [list("12345"), list("67"), list("89ABC")]
    examples = [
        # (second partition, expected first two B-cubed values)
        ([list("12345"), list("6789ABC")], [0.76, 1.0]),
        ([list("1234589ABC"), list("67")], [0.58, 1.0]),
    ]
    for p2, expected_bc in examples:
        cm = ClusteringMetrics.from_partitions(p1, p2)
        assert_array_almost_equal(cm.bc_metrics()[:2], expected_bc, 2)
        assert_array_almost_equal(cm.muc_scores()[:2], [0.9, 1.0], 4)
def test_perfect():
    """Check that two identical one-cluster partitions score 1.0 everywhere.

    The scalar metrics are each expected to equal 1.0, and the metrics that
    return three values are each expected to equal (1.0, 1.0, 1.0), all to
    four decimal places.
    """
    p1 = [list("ABC")]
    p2 = [list("ABC")]
    cm = ClusteringMetrics.from_partitions(p1, p2)

    # Methods are looked up by name one at a time so they are resolved and
    # checked in the listed order.
    scalar_metrics = (
        "assignment_score",
        "vi_similarity",
        "split_join_similarity",
        "talburt_wang_index",
    )
    for name in scalar_metrics:
        assert_almost_equal(getattr(cm, name)(), 1.0, 4)

    all_ones = (1.0, 1.0, 1.0)
    for name in ("entropy_scores", "bc_metrics", "muc_scores"):
        assert_array_almost_equal(getattr(cm, name)(), all_ones, 4)