Example #1
0
def test_split_join():
    """Check split-join distance alongside the Mirkin and VI distances

    The example is taken from
    http://stats.stackexchange.com/a/25001/37267

    In each of the two pairs of clusterings below, one clustering can be
    turned into the other by moving only two points: {9, 10} for the first
    pair (C1, C2) and {11, 12} for the second pair (C3, C4). The split-join
    distance is therefore the same for both pairs.

    The Mirkin and VI distances are larger for the first pair. This is not a
    fault of these measures: the clusterings C3 and C4 do appear to capture
    more information than C1 and C2, and so their similarity should be
    greater.
    """

    cases = [
        # (first clustering, second clustering, Mirkin, VI, split-join)
        ([{1, 2, 3, 4, 5, 6, 7, 8}, {9, 10, 11, 12, 13, 14, 15, 16}],
         [{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {11, 12, 13, 14, 15, 16}],
         56, 0.594, 4),
        ([{1, 2, 3, 4}, {5, 6, 7, 8, 9, 10}, {11, 12, 13, 14, 15, 16}],
         [{1, 2, 3, 4}, {5, 6, 7, 8, 9, 10, 11, 12}, {13, 14, 15, 16}],
         40, 0.520, 4),
    ]

    for first, second, mirkin, vi, split_join in cases:
        cm = ClusteringMetrics.from_partitions(first, second)
        # all three distances are requested unnormalized
        assert_equal(cm.mirkin_mismatch_coeff(normalize=False), mirkin)
        assert_almost_equal(cm.vi_distance(normalize=False), vi, 3)
        assert_equal(cm.split_join_distance(normalize=False), split_join)
Example #2
0
def test_mt_metrics():
    """MUC scores for the rows of Table 1 in Vilain et al. (1995)

    For every row a contingency object is built from the two partitions and
    the first two values returned by ``muc_scores()`` are compared with the
    expected pair to 4 decimal places.
    """

    table_rows = [
        # row 1
        (["A B C D".split()],
         ["A B".split(), "C D".split()],
         [1.0, 0.6667]),
        # row 2
        (["A B".split(), "C D".split()],
         ["A B C D".split()],
         [0.6667, 1.0]),
        # row 3
        (["A B C D".split()],
         ["A B C D".split()],
         [1.0, 1.0]),
        # row 4 is exactly the same as row 1
        # row 5
        # NOTE(review): the second cluster is the bare string "B" rather than
        # a list; presumably it is iterated as the single item 'B' -- confirm
        (["A B C".split()],
         ["A C".split(), "B"],
         [1.0, 0.5]),
    ]

    for first, second, expected in table_rows:
        cm = ClusteringMetrics.from_partitions(first, second)
        assert_array_almost_equal(cm.muc_scores()[:2], expected, 4)
Example #3
0
def test_bc_metrics():
    """Examples 1 and 2, listing in Figure 9, Bagga & Baldwin (1998)

    The same first partition is paired with two different second partitions.
    For each pairing the first two values of ``bc_metrics()`` and of
    ``muc_scores()`` are checked; the expected MUC pair is the same in both
    examples while the expected ``bc_metrics()`` pair differs.
    """
    base = ["1 2 3 4 5".split(), "6 7".split(), "8 9 A B C".split()]
    expected_muc = [0.9, 1.0]

    examples = [
        (["1 2 3 4 5".split(), "6 7 8 9 A B C".split()], [0.76, 1.0]),
        (["1 2 3 4 5 8 9 A B C".split(), "6 7".split()], [0.58, 1.0]),
    ]

    for variant, expected_bc in examples:
        cm = ClusteringMetrics.from_partitions(base, variant)
        assert_array_almost_equal(cm.bc_metrics()[:2], expected_bc, 2)
        assert_array_almost_equal(cm.muc_scores()[:2], expected_muc, 4)
Example #4
0
def test_RxC_metrics():
    """Alternative implementations should coincide for RxC matrices

    Runs 100 trials. Each trial draws two random label vectors of length 20
    with labels in [0, 5) and compares this module's scores with the
    corresponding ``sklearn_*`` reference functions.
    """
    # (implementation under test, reference implementation)
    label_scorers = (
        (mutual_info_score, sklearn_mi),                   # mutual information
        (adjusted_mutual_info_score, sklearn_ami),         # adjusted MI
        (adjusted_rand_score, sklearn_ari),                # adjusted Rand index
    )

    for _ in xrange(100):
        ltrue = np.random.randint(low=0, high=5, size=(20,))
        lpred = np.random.randint(low=0, high=5, size=(20,))
        cm = ClusteringMetrics.from_labels(ltrue, lpred)

        # homogeneity, completeness, V-measure
        v_from_vi = cm.vi_similarity_m3()
        reference_hcv = sklearn_hcv(ltrue, lpred)
        hcv = cm.entropy_scores()
        assert_array_almost_equal(hcv, reference_hcv)
        # the third entropy score must equal the M3-adjusted VI similarity
        assert_array_almost_equal(hcv[2], v_from_vi)

        for scorer, reference_scorer in label_scorers:
            expected = reference_scorer(ltrue, lpred)
            actual = scorer(ltrue, lpred)
            assert_array_almost_equal(actual, expected)
Example #5
0
    def test_emi_matlab(self):
        """Compare AMI/EMI values with reference MATLAB code

        http://www.mathworks.com/matlabcentral/fileexchange/33144-the-adjusted-mutual-information

        The adjusted mutual information of the two labelings below must match
        the hard-coded reference value to 12 places, and ``emi_fortran`` and
        ``emi_cython`` must agree with each other to 10 places when given the
        same row and column totals.
        """

        ltrue = "11 11 11 11 11 11 11 10 10 10 10 13 13 13 13 13 13 13 13 13 12 \
        12 12 12 12 15 15 15 15 15 15 15 14 14 14 14 14 17 17 17 17 16 16 16 16 \
        16 16 19 19 19 19 19 19 19 18 18 18 18 18 18 18 20 20 20 20 20 20 1 1 1 \
        1 3 3 2 2 2 5 5 5 4 4 4 4 7 7 7 7 7 7 7 7 7 6 6 6 9 9 9 8 8".split()

        lpred = "1 19 19 13 2 20 20 8 12 5 17 10 10 13 15 20 20 6 9 8 9 10 15 \
        14 8 11 11 10 13 17 19 5 9 1 2 20 15 19 19 12 14 1 18 18 3 2 5 8 8 7 17 \
        17 17 16 11 11 14 17 16 6 8 13 17 1 3 7 9 9 1 5 18 13 17 13 12 20 11 4 \
        14 19 15 13 5 13 12 16 4 4 7 6 6 8 2 16 16 18 3 7 1 10".split()

        cm = ClusteringMetrics.from_labels(ltrue, lpred)
        self.assertAlmostEqual(
            0.0352424389209073, cm.adjusted_mutual_info(), 12)

        # NOTE(review): passing .values() straight to np.asarray assumes it
        # returns a list (Python 2 dict behaviour) -- confirm before porting
        row_marginals, col_marginals = (
            np.asarray(totals.values(), dtype=np.int64)
            for totals in (cm.row_totals, cm.col_totals))

        emi_from_fortran = emi_fortran(row_marginals, col_marginals)
        emi_from_cython = emi_cython(row_marginals, col_marginals)

        self.assertAlmostEqual(emi_from_fortran, emi_from_cython, 10)
Example #6
0
def test_RxC_general():
    """General contingency-table methods

    Runs 100 trials on random label vectors (length in [4, 100), number of
    distinct labels bounded by a random value in [2, 100)) and checks that:

    * ``assignment_score(model=None)`` agrees with ``assignment_score_slow``
      both with and without row/column padding;
    * for every null model, the values of ``expected(model=...)`` sum to the
      grand total of the table;
    * for every null model, calling a score with ``model=...`` gives the same
      result as the first item returned by ``adjust_to_null`` for that score.
    """
    for _ in xrange(100):
        n_points = np.random.randint(4, 100)
        label_shape = (n_points,)

        # the upper label bound is drawn right before each label vector
        bound_a = np.random.randint(low=2, high=100)
        labels_a = np.random.randint(low=0, high=bound_a, size=label_shape)
        bound_b = np.random.randint(low=2, high=100)
        labels_b = np.random.randint(low=0, high=bound_b, size=label_shape)

        cm = ClusteringMetrics.from_labels(labels_a, labels_b)

        for pad in (False, True):
            assert_almost_equal(
                cm.assignment_score(model=None),
                assignment_score_slow(cm, rpad=pad, cpad=pad))

        for model in ['m1', 'm2r', 'm2c', 'm3']:

            grand_total = cm.grand_total
            expected_total = sum(cm.expected(model=model).itervalues())
            assert_almost_equal(grand_total, expected_total)

            for scorer in (cm.assignment_score, cm.split_join_similarity):
                assert_almost_equal(
                    scorer(model=model),
                    cm.adjust_to_null(scorer, model=model)[0])
Example #7
0
def test_perfect():
    """Two identical single-cluster partitions must score 1.0 everywhere

    Scalar metrics are compared with 1.0 and the three-valued metrics with
    (1.0, 1.0, 1.0), all to 4 decimal places.
    """
    first = [['A', 'B', 'C']]
    second = [['A', 'B', 'C']]
    cm = ClusteringMetrics.from_partitions(first, second)

    scalar_metrics = ('assignment_score', 'vi_similarity',
                      'split_join_similarity', 'talburt_wang_index')
    for metric_name in scalar_metrics:
        assert_almost_equal(getattr(cm, metric_name)(), 1.0, 4)

    triple_metrics = ('entropy_scores', 'bc_metrics', 'muc_scores')
    for metric_name in triple_metrics:
        assert_array_almost_equal(getattr(cm, metric_name)(), (1.0,) * 3, 4)
Example #8
0
def test_m1():
    """M1 model

    Builds a 2x2 and an 8x8 table in which every cell equals 10 and checks
    that each ``*_m1`` score is (almost) 0.0 for both tables.
    """
    t2, t8 = (ClusteringMetrics(rows=10 * np.ones((side, side), dtype=int))
              for side in (2, 8))

    m1_scores = ('vi_similarity_m1',
                 'split_join_similarity_m1',
                 'assignment_score_m1')
    for score_name in m1_scores:
        for table in (t2, t8):
            assert_almost_equal(0.0, getattr(table, score_name)())
Example #9
0
def add_incidence_metrics(args, clusters, pairs):
    """Add metrics based on incidence matrix of classes and clusters

    Does nothing unless at least one of ``utils.INCIDENCE_METRICS`` appears
    in ``args.metrics``. Otherwise the clusters are converted to label
    vectors (honouring ``args.double_negs`` and ``args.join_negs``), a
    ``ClusteringMetrics`` object is built from them, and ``append_scores`` is
    called once for the requested pairwise metrics and once for the requested
    contingency metrics.

    :param args: namespace-like object with ``metrics``, ``double_negs`` and
        ``join_negs`` attributes
    :param clusters: passed through to ``clusters_to_labels``
    :param pairs: passed through to ``append_scores``
        (presumably the collection that receives the scores -- confirm)
    """
    requested = args.metrics
    if not (set(utils.INCIDENCE_METRICS) & set(requested)):
        return

    # imported lazily, only when an incidence metric was actually requested
    from lsh_hdc.metrics import ClusteringMetrics

    label_vectors = clusters_to_labels(
        clusters,
        double_negs=bool(args.double_negs),
        join_negs=bool(args.join_negs)
    )
    cm = ClusteringMetrics.from_labels(*label_vectors)

    for family in (utils.PAIRWISE_METRICS, utils.CONTINGENCY_METRICS):
        append_scores(cm, pairs, set(family) & set(requested))
Example #10
0
def test_adjusted_mutual_info_score():
    """Check MI, EMI and AMI against known values, in both argument orders

    Every quantity is computed once as (a, b) and once as (b, a); both
    results must match the same reference value.
    """
    # Compute the Adjusted Mutual Information and test against known values
    labels_a = np.array([1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3])
    labels_b = np.array([1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 3, 1, 3, 3, 3, 2, 2])

    # Mutual information
    mi_1 = mutual_info_score(labels_a, labels_b)
    assert_almost_equal(mi_1, 0.41022, 5)
    mi_2 = mutual_info_score(labels_b, labels_a)
    assert_almost_equal(mi_2, 0.41022, 5)

    # Expected mutual information
    cm = ClusteringMetrics.from_labels(labels_a, labels_b)
    row_totals = np.fromiter(cm.iter_row_totals(), dtype=np.int64)
    col_totals = np.fromiter(cm.iter_col_totals(), dtype=np.int64)
    # the raw EMI values are divided by the grand total before comparison
    emi_1a = emi_cython(row_totals, col_totals) / cm.grand_total
    emi_1b = emi_fortran(row_totals, col_totals) / cm.grand_total
    assert_almost_equal(emi_1a, 0.15042, 5)
    assert_almost_equal(emi_1b, 0.15042, 5)
    emi_2a = emi_cython(col_totals, row_totals) / cm.grand_total
    emi_2b = emi_fortran(col_totals, row_totals) / cm.grand_total
    assert_almost_equal(emi_2a, 0.15042, 5)
    assert_almost_equal(emi_2b, 0.15042, 5)

    # Adjusted mutual information (1)
    ami_1 = adjusted_mutual_info_score(labels_a, labels_b)
    assert_almost_equal(ami_1, 0.27502, 5)
    # swapped arguments: the previous version repeated (labels_a, labels_b)
    # here, so the reversed order was never exercised
    ami_2 = adjusted_mutual_info_score(labels_b, labels_a)
    assert_almost_equal(ami_2, 0.27502, 5)

    # Adjusted mutual information (2)
    ami_1 = adjusted_mutual_info_score([1, 1, 2, 2], [2, 2, 3, 3])
    assert_equal(ami_1, 1.0)
    ami_2 = adjusted_mutual_info_score([2, 2, 3, 3], [1, 1, 2, 2])
    assert_equal(ami_2, 1.0)

    # Test AMI with a very large array
    a110 = np.array([list(labels_a) * 110]).flatten()
    b110 = np.array([list(labels_b) * 110]).flatten()
    ami = adjusted_mutual_info_score(a110, b110)
    assert_almost_equal(ami, 0.37, 2)  # not accurate to more than 2 places
Example #11
0
def test_IR_example():
    """Test example from IR book by Manning et al.

    The example gives 3 clusters and 17 points total. It is described on
    http://nlp.stanford.edu/IR-book/html/htmledition/evaluation-of-clustering-1.html

    All reference values are compared to 6 decimal places.
    """
    ltrue = (0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2)
    lpred = (0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 2, 2, 1, 2, 2, 2)
    cm = ClusteringMetrics.from_labels(ltrue, lpred)

    # test perfect variants
    row_variant = cm.row_diag()
    col_variant = cm.col_diag()
    perfect_checks = (
        (row_variant, {}),
        (col_variant, {}),
        (col_variant, {'discrete': True}),
        (row_variant, {}),
        (row_variant, {'discrete': True}),
    )
    for table, extra in perfect_checks:
        assert_almost_equal(
            table.assignment_score(model='m3', **extra), 1.0, 6)

    # test that no redraws happen by default
    first_call = cm.assignment_score(model='m3')
    second_call = cm.assignment_score(model='m3')
    assert_almost_equal(first_call, second_call, 6)

    null_table = cm.expected(discrete=False)
    assert_almost_equal(null_table.assignment_score(model='m3'), 0.0, 6)

    # test that H1 results in greater score than H0
    null_table = cm.expected(discrete=True)
    assert_greater(
        cm.assignment_score(model='m3'),
        null_table.assignment_score(model='m3'))

    # test entropy metrics
    homogeneity, completeness, v_measure = cm.entropy_scores()
    assert_almost_equal(homogeneity, 0.371468, 6)
    assert_almost_equal(completeness, 0.357908, 6)
    assert_almost_equal(v_measure, 0.364562, 6)

    vi_by_model = (
        (None, 0.517754),
        ('m1', 0.378167),
        ('m2r', 0.365605),
        ('m2c', 0.377165),
        ('m3', 0.364562),
    )
    for model, expected in vi_by_model:
        assert_almost_equal(cm.vi_similarity(model=model), expected, 6)

    assert_almost_equal(cm.mirkin_match_coeff(), 0.695502, 6)
    assert_almost_equal(cm.rand_index(), 0.676471, 6)
    assert_almost_equal(cm.fowlkes_mallows(), 0.476731, 6)
    for model, expected in ((None, 0.705882), ('m3', 0.554974)):
        assert_almost_equal(cm.assignment_score(model=model), expected, 6)

    assert_almost_equal(cm.chisq_score(), 11.900000, 6)
    assert_almost_equal(cm.g_score(), 13.325845, 6)

    # test metrics that are based on pairwise co-association matrix
    conf = cm.pairwise
    pairwise_expectations = (
        ('chisq_score', 8.063241),
        ('g_score', 7.804221),
        ('jaccard_coeff', 0.312500),
        ('ochiai_coeff', 0.476731),
        ('dice_coeff', 0.476190),
        ('sokal_sneath_coeff', 0.185185),
        ('kappa', 0.242915),
        ('accuracy', 0.676471),
        ('precision', 0.500000),
        ('recall', 0.454545),
    )
    for metric_name, expected in pairwise_expectations:
        assert_almost_equal(getattr(conf, metric_name)(), expected, 6)

    # the method must agree with the stand-alone helper on the raw labels
    standalone_tw = _talburt_wang_index(ltrue, lpred)
    method_tw = cm.talburt_wang_index()
    assert_almost_equal(standalone_tw, method_tw, 6)
Example #12
0
# Reuse the cached (ltrue, lpred) pair when PATH exists; otherwise draw random
# label vectors and cache them at PATH.
if os.path.exists(PATH):
    print("Loading data from %s" % PATH)
    # NOTE: unpickling can execute arbitrary code -- PATH must only ever
    # point to a file written by this script.
    # Pickle data is binary, so the file is opened in binary mode.
    with open(PATH, 'rb') as fh:
        ltrue, lpred = pickle.load(fh)
else:
    shape = (ARGS.num_samples,)
    ltrue = np.random.randint(low=0, high=ARGS.max_classes, size=shape)
    lpred = np.random.randint(low=0, high=ARGS.max_clusters, size=shape)
    print("Saving generated data to %s" % PATH)
    # HIGHEST_PROTOCOL is a binary protocol: text mode ('w') can corrupt the
    # stream on platforms that translate line endings
    with open(PATH, 'wb') as fh:
        pickle.dump((ltrue, lpred), fh, protocol=pickle.HIGHEST_PROTOCOL)


# Pick the callable to run (`method`) and the statement string that invokes
# it (`line`) according to --implementation.
if ARGS.implementation == 'oo':
    # object-oriented API: the labels are bound into the ClusteringMetrics
    # instance, so the selected method is called without arguments
    from lsh_hdc.metrics import ClusteringMetrics
    cm = ClusteringMetrics.from_labels(ltrue, lpred)
    method = getattr(cm, METHODS[ARGS.method][1])
    line = "method()"
elif ARGS.implementation == 'sklearn':
    import sklearn.metrics.cluster as module
    method = getattr(module, METHODS[ARGS.method][0])
    line = "method(ltrue, lpred)"
elif ARGS.implementation == 'proposed':
    import lsh_hdc.metrics as module
    method = getattr(module, METHODS[ARGS.method][0])
    line = "method(ltrue, lpred)"
else:
    # ArgumentError takes (argument, message); with a single argument the
    # constructor itself fails with TypeError and the message is lost
    raise argparse.ArgumentError(None, 'Unknown value for --implementation')


print("Sanity check:")
Example #13
0
 def matrix_from_labels(*args):
     """Build a ``ClusteringMetrics`` object from a pair of label vectors

     Exactly two positional arguments are expected; any other count makes
     the tuple unpacking below raise ``ValueError``.
     """
     first_labels, second_labels = args
     return ClusteringMetrics.from_labels(first_labels, second_labels)