예제 #1
0
"""

from __future__ import (
    absolute_import,
    division,
    print_function,
    unicode_literals,
)

import unittest
from math import isnan, sqrt

from abydos.stats import ConfusionTable


# Shared fixtures for the tests in this module.  Cells are spelled out by
# keyword (true/false positives/negatives) so each value is self-describing.

# Every cell holds exactly one observation.
UNIT_TABLE = ConfusionTable(tp=1, tn=1, fp=1, fn=1)
# Every cell is empty.
NULL_TABLE = ConfusionTable(tp=0, tn=0, fp=0, fn=0)
# Each cell holds a different value, so the cells cannot be confused.
SCALE_TABLE = ConfusionTable(tp=1, tn=2, fp=3, fn=4)
# https://en.wikipedia.org/wiki/Confusion_matrix#Table_of_confusion
CATSNDOGS_TABLE = ConfusionTable(tp=5, tn=17, fp=2, fn=3)
# https://en.wikipedia.org/wiki/Sensitivity_and_specificity#Worked_example
WORKED_EG_TABLE = ConfusionTable(tp=20, tn=1820, fp=180, fn=10)
# Only false results: no true positives and no true negatives.
VERY_POOR_TABLE = ConfusionTable(tp=0, tn=0, fp=200, fn=200)

ALL_TABLES = (
    UNIT_TABLE,
    NULL_TABLE,
    SCALE_TABLE,
    CATSNDOGS_TABLE,
    WORKED_EG_TABLE,
    VERY_POOR_TABLE,
예제 #2
0
    def test_constructors(self):
        """Check ConfusionTable construction and equality semantics.

        Covers positional, tuple, list, and dict construction, as well as
        ``==`` against tables, tuples, lists, dicts, and an unrelated type.
        """
        # Factories rather than shared objects, so that every constructor
        # call and comparison receives a freshly built argument.
        make_cells = (
            lambda: (1, 2, 3, 4),
            lambda: [1, 2, 3, 4],
            lambda: {'tp': 1, 'tn': 2, 'fp': 3, 'fn': 4},
        )

        # Leaving trailing arguments off is expected to equal passing zeros.
        for zeros in ((), (0,), (0, 0), (0, 0, 0), (0, 0, 0, 0)):
            self.assertEqual(ConfusionTable(), ConfusionTable(*zeros))
        # One non-zero cell, in any position, is expected to break equality.
        for cells in ((1,), (0, 1), (0, 0, 1), (0, 0, 0, 1)):
            self.assertNotEqual(ConfusionTable(), ConfusionTable(*cells))

        # Four ints: also checks that __eq__ compares by value.
        self.assertEqual(SCALE_TABLE, ConfusionTable(1, 2, 3, 4))
        # Tuple, list, then dict as the first argument; any further
        # positional arguments are expected not to affect the result.
        for make in make_cells:
            self.assertEqual(SCALE_TABLE, ConfusionTable(make()))
            self.assertEqual(SCALE_TABLE, ConfusionTable(make(), 5, 6, 7))
        # An empty dict, or one with none of the tp/tn/fp/fn keys, is
        # expected to give the all-zero table.
        for mapping in ({}, {'pt': 1, 'nt': 2, 'pf': 3, 'nf': 4}):
            self.assertEqual(NULL_TABLE, ConfusionTable(mapping))

        # __eq__ against the very same object, then against the equivalent
        # tuple, list, and dict; CATSNDOGS_TABLE must match none of them.
        self.assertTrue(SCALE_TABLE == SCALE_TABLE)
        self.assertFalse(CATSNDOGS_TABLE == SCALE_TABLE)
        for make in make_cells:
            self.assertTrue(SCALE_TABLE == make())
            self.assertFalse(CATSNDOGS_TABLE == make())
        # __eq__ with something that is not a table, tuple, list, or dict.
        self.assertFalse(SCALE_TABLE == 5)

        # A tuple with too few elements is rejected with AttributeError.
        with self.assertRaises(AttributeError):
            ConfusionTable((1, 2))
예제 #3
0
    def test_token_distance(self):
        """Test abydos.distance._TokenDistance members.

        The shared token-based behaviour is exercised through the Jaccard,
        SokalMichener, and AverageLinkage measures: first via constructor
        options (alphabet, qval, tokenizer, intersection_type, normalizer),
        then by calling the private set-operation helpers directly after a
        single ``_tokenize`` call.
        """
        # Constructor options: alphabet given as an int, as a string of
        # symbols, or as None, combined with soft intersection, qval, and a
        # custom tokenizer.
        self.assertAlmostEqual(
            Jaccard(intersection_type='soft', alphabet=24).sim(
                'ATCAACGAGT', 'AACGATTAG'
            ),
            0.68,
        )
        self.assertAlmostEqual(
            Jaccard(qval=1, alphabet='CGAT').sim('ATCAACGAGT', 'AACGATTAG'),
            0.9,
        )
        self.assertAlmostEqual(
            Jaccard(tokenizer=QSkipgrams(qval=3), alphabet='CGAT').sim(
                'ATCAACGAGT', 'AACGATTAG'
            ),
            0.6372795969773299,
        )
        self.assertAlmostEqual(
            Jaccard(alphabet=None).sim('synonym', 'antonym'),
            0.3333333333333333,
        )
        self.assertAlmostEqual(
            Jaccard(tokenizer=QSkipgrams(qval=3)).sim('synonym', 'antonym'),
            0.34146341463414637,
        )

        # Counter objects are passed to sim() in place of strings.
        src_ctr = Counter({'a': 5, 'b': 2, 'c': 10})
        tar_ctr = Counter({'a': 2, 'c': 1, 'd': 3, 'e': 12})
        self.assertAlmostEqual(Jaccard().sim(src_ctr, tar_ctr), 0.09375)

        # One assertion per normalizer value.  'base case' is expected to
        # give the same result as the default constructor (last assertion
        # of this group).
        self.assertAlmostEqual(
            SokalMichener(normalizer='proportional').sim('synonym', 'antonym'),
            0.984777917351113,
        )
        self.assertAlmostEqual(
            SokalMichener(normalizer='log').sim('synonym', 'antonym'),
            1.2385752469545532,
        )
        self.assertAlmostEqual(
            SokalMichener(normalizer='exp', alphabet=0).sim(
                'synonym', 'antonym'
            ),
            3.221246147982545e18,
        )
        self.assertAlmostEqual(
            SokalMichener(normalizer='laplace').sim('synonym', 'antonym'),
            0.98856416772554,
        )
        self.assertAlmostEqual(
            SokalMichener(normalizer='inverse').sim('synonym', 'antonym'),
            197.95790155440417,
        )
        self.assertAlmostEqual(
            SokalMichener(normalizer='complement').sim('synonym', 'antonym'),
            1.0204081632653061,
        )
        self.assertAlmostEqual(
            SokalMichener(normalizer='base case').sim('synonym', 'antonym'),
            0.9897959183673469,
        )
        self.assertAlmostEqual(
            SokalMichener().sim('synonym', 'antonym'), 0.9897959183673469
        )

        # Tokenize one pair, then inspect the private helpers.  Every check
        # down to the confusion-table assertion depends on this call.
        sm = SokalMichener()
        sm._tokenize('synonym', 'antonym')  # noqa: SF01

        # Expected (source, target) token Counters: bigrams including the
        # '$' and '#' boundary symbols.
        self.assertEqual(
            sm._get_tokens(),  # noqa: SF01
            (
                Counter(
                    {
                        '$s': 1,
                        'sy': 1,
                        'yn': 1,
                        'no': 1,
                        'on': 1,
                        'ny': 1,
                        'ym': 1,
                        'm#': 1,
                    }
                ),
                Counter(
                    {
                        '$a': 1,
                        'an': 1,
                        'nt': 1,
                        'to': 1,
                        'on': 1,
                        'ny': 1,
                        'ym': 1,
                        'm#': 1,
                    }
                ),
            ),
        )
        self.assertEqual(sm._src_card(), 8)  # noqa: SF01
        self.assertEqual(sm._tar_card(), 8)  # noqa: SF01
        # Tokens that occur in only one of the two Counters above.
        self.assertEqual(
            sm._symmetric_difference(),  # noqa: SF01
            Counter(
                {
                    '$s': 1,
                    'sy': 1,
                    'yn': 1,
                    'no': 1,
                    '$a': 1,
                    'an': 1,
                    'nt': 1,
                    'to': 1,
                }
            ),
        )
        self.assertEqual(sm._symmetric_difference_card(), 8)  # noqa: SF01
        self.assertEqual(sm._total_complement_card(), 772)  # noqa: SF01
        self.assertEqual(sm._population_card(), 788)  # noqa: SF01
        # Every token that occurs in either Counter.
        self.assertEqual(
            sm._union(),  # noqa: SF01
            Counter(
                {
                    '$s': 1,
                    'sy': 1,
                    'yn': 1,
                    'no': 1,
                    'on': 1,
                    'ny': 1,
                    'ym': 1,
                    'm#': 1,
                    '$a': 1,
                    'an': 1,
                    'nt': 1,
                    'to': 1,
                }
            ),
        )
        self.assertEqual(sm._union_card(), 12)  # noqa: SF01
        # The expected difference is signed: 1 for source-only tokens,
        # 0 for shared tokens, and -1 for target-only tokens.
        self.assertEqual(
            sm._difference(),  # noqa: SF01
            Counter(
                {
                    '$s': 1,
                    'sy': 1,
                    'yn': 1,
                    'no': 1,
                    'on': 0,
                    'ny': 0,
                    'ym': 0,
                    'm#': 0,
                    '$a': -1,
                    'an': -1,
                    'nt': -1,
                    'to': -1,
                }
            ),
        )
        # Tokens shared by both Counters.
        self.assertEqual(
            sm._intersection(),  # noqa: SF01
            Counter({'on': 1, 'ny': 1, 'ym': 1, 'm#': 1}),
        )
        # tp matches the intersection size (4) and tn matches the total
        # complement cardinality (772) asserted above.
        self.assertEqual(
            sm._get_confusion_table(),  # noqa: SF01
            ConfusionTable(tp=4, tn=772, fp=4, fn=4),
        )

        # Alphabet supplied as a Counter, together with qval=1.
        sm = SokalMichener(
            alphabet=Counter({'C': 20, 'G': 20, 'A': 20, 'T': 20}), qval=1
        )
        sm._tokenize('ATCAACGAGT', 'AACGATTAG')  # noqa: SF01
        self.assertEqual(sm._total_complement_card(), 61)  # noqa: SF01

        # 'linkage' intersection with internal_assignment_problem=True; the
        # same instance is reused for both comparisons.
        jac = Jaccard(
            intersection_type='linkage', internal_assignment_problem=True
        )
        self.assertAlmostEqual(jac.sim('abandonned', 'abandoned'), 1.0)
        self.assertAlmostEqual(
            jac.sim('abundacies', 'abundances'), 0.6296296296296297
        )

        # Some additional constructors needed to complete test coverage
        # (qval given as a range; alphabet given as a string or a list;
        # whitespace and character tokenizers).
        self.assertAlmostEqual(
            Jaccard(alphabet=None, qval=range(2, 4)).sim('abc', 'abcd'),
            0.42857142857142855,
        )
        self.assertAlmostEqual(
            AverageLinkage(qval=range(2, 4)).sim('abc', 'abcd'),
            0.22558922558922556,
        )
        self.assertAlmostEqual(
            Jaccard(alphabet='abcdefghijklmnop', qval=range(2, 4)).sim(
                'abc', 'abcd'
            ),
            0.42857142857142855,
        )
        self.assertAlmostEqual(
            Jaccard(
                alphabet='abcdefghijklmnop', tokenizer=WhitespaceTokenizer()
            ).sim('abc', 'abcd'),
            0.0,
        )
        self.assertAlmostEqual(
            Jaccard(alphabet=list('abcdefghijklmnop')).sim('abc', 'abcd'), 0.5
        )
        self.assertAlmostEqual(
            Jaccard(tokenizer=CharacterTokenizer()).sim('abc', 'abcd'), 0.75
        )