def test__tokenizer(self):
        """Test abydos.tokenizer._Tokenizer.

        Also exercises the Counter-style arithmetic of QGrams results and the
        'set', 'SSK', log1p and 'entropy' scalers.
        """
        # Each of these inputs is expected back as one token counted once.
        for text in ('', 'a', 'NELSON', 'NEILSEN'):
            observed = _Tokenizer().tokenize(text).get_counter()
            self.assertEqual(observed, Counter({text: 1}))

        total = _Tokenizer().tokenize('NEILSEN').count()
        self.assertEqual(total, 1)
        distinct = _Tokenizer().tokenize('NEILSEN').count_unique()
        self.assertEqual(distinct, 1)

        tweet = 'Good to be home for a night'
        tweet_counter = _Tokenizer().tokenize(tweet).get_counter()
        self.assertEqual(tweet_counter, Counter({tweet: 1}))

        nelson = QGrams().tokenize('NELSON')
        neilsen = QGrams().tokenize('NEILSEN')

        # The list view keeps the order in which the q-grams were produced;
        # the set view holds the same (all distinct) tokens.
        nelson_grams = ['$N', 'NE', 'EL', 'LS', 'SO', 'ON', 'N#']
        self.assertEqual(nelson.get_set(), set(nelson_grams))
        self.assertEqual(nelson.get_list(), nelson_grams)

        # The repr check is only made on Python 3.6+.
        if sys.version_info >= (3, 6):
            expected_repr = (
                "QGrams({'$N': 1, 'NE': 1, 'EL': 1, 'LS': 1, 'SO': 1, "
                "'ON': 1, 'N#': 1})"
            )
            self.assertEqual(repr(nelson), expected_repr)

        # Tokenizer results support Counter-like &, + and -.
        in_both = Counter({'$N': 1, 'NE': 1, 'LS': 1, 'N#': 1})
        self.assertEqual(nelson & neilsen, in_both)

        combined = Counter({
            '$N': 2, 'NE': 2, 'EL': 1, 'LS': 2, 'SO': 1, 'ON': 1,
            'N#': 2, 'EI': 1, 'IL': 1, 'SE': 1, 'EN': 1,
        })
        self.assertEqual(nelson + neilsen, combined)

        nelson_only = Counter({'EL': 1, 'SO': 1, 'ON': 1})
        self.assertEqual(nelson - neilsen, nelson_only)

        doubled = QGrams(scaler='set').tokenize('NELSONNELSON')
        self.assertEqual(doubled.count(), 8)

        ssk_total = QSkipgrams(scaler='SSK').tokenize('NELSON').count()
        self.assertAlmostEqual(ssk_total, 18.66784401)

        nelson_log = QSkipgrams(qval=3, scaler=log1p).tokenize('NELSON')

        # The reference table only ever uses these four weights
        # (approximately ln 2, ln 3, ln 5 and ln 9).
        ln2 = 0.6931471805599453
        ln3 = 1.0986122886681096
        ln5 = 1.6094379124341003
        ln9 = 2.1972245773362196
        gold_standard = Counter({
            '$$N': ln3, '$$E': ln2, '$$L': ln2, '$$S': ln2, '$$O': ln2,
            '$$#': ln3,
            '$NE': ln3, '$NL': ln3, '$NS': ln3, '$NO': ln3, '$NN': ln3,
            '$N#': ln9,
            '$EL': ln3, '$ES': ln3, '$EO': ln3, '$EN': ln3, '$E#': ln5,
            '$LS': ln3, '$LO': ln3, '$LN': ln3, '$L#': ln5,
            '$SO': ln3, '$SN': ln3, '$S#': ln5,
            '$ON': ln3, '$O#': ln5,
            '$##': ln3,
            'NEL': ln2, 'NES': ln2, 'NEO': ln2, 'NEN': ln2, 'NE#': ln3,
            'NLS': ln2, 'NLO': ln2, 'NLN': ln2, 'NL#': ln3,
            'NSO': ln2, 'NSN': ln2, 'NS#': ln3,
            'NON': ln2, 'NO#': ln3,
            'NN#': ln3,
            'N##': ln3,
            'ELS': ln2, 'ELO': ln2, 'ELN': ln2, 'EL#': ln3,
            'ESO': ln2, 'ESN': ln2, 'ES#': ln3,
            'EON': ln2, 'EO#': ln3,
            'EN#': ln3,
            'E##': ln2,
            'LSO': ln2, 'LSN': ln2, 'LS#': ln3,
            'LON': ln2, 'LO#': ln3,
            'LN#': ln3,
            'L##': ln2,
            'SON': ln2, 'SO#': ln3,
            'SN#': ln3,
            'S##': ln2,
            'ON#': ln3,
            'O##': ln2,
        })

        # Only keys produced by the tokenizer are compared; a produced key
        # that is absent from the table is looked up as 0 by Counter.
        observed_log = nelson_log.get_counter()
        for gram in observed_log:
            self.assertAlmostEqual(observed_log[gram], gold_standard[gram])

        entropy_total = QSkipgrams(scaler='entropy').tokenize('NELSON').count()
        self.assertAlmostEqual(entropy_total, 4.6644977792)
예제 #2
0
    def test_tversky_sim(self):
        """Test abydos.distance.Tversky.sim.

        Covers the class-level measures, alpha/beta validation, asymmetric
        weights, the bias parameter, pre-tokenized input, whitespace
        tokenization and the sim_tversky wrapper.
        """
        nelson_neilsen = 4 / 11

        # The default measure and the explicit 2-gram measure are held to
        # the same expectations.
        for measure in (self.cmp, self.cmp_q2):
            self.assertEqual(measure.sim('', ''), 1)
            self.assertEqual(measure.sim('nelson', ''), 0)
            self.assertEqual(measure.sim('', 'neilsen'), 0)
            self.assertAlmostEqual(
                measure.sim('nelson', 'neilsen'), nelson_neilsen
            )

        # test valid alpha & beta
        for alpha, beta in ((-1.0, -1.0), (-1.0, 0.0), (0.0, -1.0)):
            # Construct outside the context manager: only the sim() call is
            # expected to raise.
            invalid = Tversky(alpha=alpha, beta=beta)
            with self.assertRaises(ValueError):
                invalid.sim('abcd', 'dcba')

        # test empty QGrams
        seven_grams = Tversky(tokenizer=QGrams(7, start_stop=''))
        self.assertAlmostEqual(seven_grams.sim('nelson', 'neilsen'), 0.0)

        # test unequal alpha & beta
        for alpha, beta, expected in (
            (2.0, 1.0, 3 / 11),
            (1.0, 2.0, 3 / 10),
            (2.0, 2.0, 3 / 13),
        ):
            weighted = Tversky(alpha=alpha, beta=beta, tokenizer=QGrams(2))
            self.assertAlmostEqual(weighted.sim('niall', 'neal'), expected)

        # test bias parameter
        for alpha, beta, expected in (
            (1.0, 1.0, 7 / 11),
            (2.0, 1.0, 7 / 9),
            (1.0, 2.0, 7 / 15),
            (2.0, 2.0, 7 / 11),
        ):
            biased = Tversky(
                alpha=alpha, beta=beta, bias=0.5, tokenizer=QGrams(2)
            )
            self.assertAlmostEqual(biased.sim('niall', 'neal'), expected)

        # supplied q-gram tests
        def grams(word):
            return QGrams().tokenize(word).get_counter()

        self.assertEqual(self.cmp.sim(grams(''), grams('')), 1)
        self.assertEqual(self.cmp.sim(grams('nelson'), grams('')), 0)
        self.assertEqual(self.cmp.sim(grams(''), grams('neilsen')), 0)
        self.assertAlmostEqual(
            self.cmp.sim(grams('nelson'), grams('neilsen')), nelson_neilsen
        )

        # non-q-gram tests
        self.assertEqual(self.cmp_ws.sim('', ''), 1)
        self.assertEqual(self.cmp_ws.sim('the quick', ''), 0)
        self.assertEqual(self.cmp_ws.sim('', 'the quick'), 0)
        for src, tar in ((NONQ_FROM, NONQ_TO), (NONQ_TO, NONQ_FROM)):
            self.assertAlmostEqual(self.cmp_ws.sim(src, tar), 1 / 3)

        # Test wrapper
        self.assertAlmostEqual(
            sim_tversky('nelson', 'neilsen'), nelson_neilsen
        )
예제 #3
0
    VCClusterTokenizer,
    WhitespaceTokenizer,
    WordpunctTokenizer,
)

from nltk import TweetTokenizer

from . import EXTREME_TEST, _corpus_file, _fuzz, _random_char

# Short name -> bound ``tokenize`` method of one instance of each tokenizer
# (default arguments, except NLTKTokenizer which wraps a TweetTokenizer).
algorithms = {
    name: tokenizer.tokenize
    for name, tokenizer in (
        ('corvcluster', COrVClusterTokenizer()),
        ('cvcluster', CVClusterTokenizer()),
        ('character', CharacterTokenizer()),
        ('legalipy', LegaliPyTokenizer()),
        ('nltk', NLTKTokenizer(nltk_tokenizer=TweetTokenizer())),
        ('qgrams', QGrams()),
        ('qskipgrams', QSkipgrams()),
        ('regexp', RegexpTokenizer()),
        ('saps', SAPSTokenizer()),
        ('sonoripy', SonoriPyTokenizer()),
        ('vccluster', VCClusterTokenizer()),
        ('whitespace', WhitespaceTokenizer()),
        ('wordpunct', WordpunctTokenizer()),
    )
}


class BigListOfNaughtyStringsTestCases(unittest.TestCase):
    """Test each tokenizer against the BLNS set.

    Here, we test each algorithm against each string, but we only care that it
    does not result in an exception.
예제 #4
0
class CosineSimilarityTestCases(unittest.TestCase):
    """Test cosine similarity functions.

    abydos.distance.Cosine
    """

    # Measures under test: default tokenizer, explicit 2-grams, and
    # whitespace-delimited tokens.
    cmp = Cosine()
    cmp_q2 = Cosine(tokenizer=QGrams(2))
    cmp_ws = Cosine(tokenizer=WhitespaceTokenizer())

    @staticmethod
    def _grams(word):
        """Return the counter of default QGrams tokens for word."""
        return QGrams().tokenize(word).get_counter()

    def test_cosine_sim(self):
        """Test abydos.distance.Cosine.sim."""
        name_sim = 4 / math.sqrt(7 * 8)
        phrase_sim = 4 / math.sqrt(9 * 7)

        for measure in (self.cmp, self.cmp_q2):
            self.assertEqual(measure.sim('', ''), 1)
            self.assertEqual(measure.sim('nelson', ''), 0)
            self.assertEqual(measure.sim('', 'neilsen'), 0)
            self.assertAlmostEqual(
                measure.sim('nelson', 'neilsen'), name_sim
            )

        # supplied q-gram tests
        grams = self._grams
        self.assertEqual(self.cmp.sim(grams(''), grams('')), 1)
        self.assertEqual(self.cmp.sim(grams('nelson'), grams('')), 0)
        self.assertEqual(self.cmp.sim(grams(''), grams('neilsen')), 0)
        self.assertAlmostEqual(
            self.cmp.sim(grams('nelson'), grams('neilsen')), name_sim
        )

        # non-q-gram tests
        self.assertEqual(self.cmp_ws.sim('', ''), 1)
        self.assertEqual(self.cmp_ws.sim('the quick', ''), 0)
        self.assertEqual(self.cmp_ws.sim('', 'the quick'), 0)
        for src, tar in ((NONQ_FROM, NONQ_TO), (NONQ_TO, NONQ_FROM)):
            self.assertAlmostEqual(self.cmp_ws.sim(src, tar), phrase_sim)

        self.assertEqual(self.cmp_q2.sim('eh', 'a'), 0.0)

        # Test wrapper
        self.assertAlmostEqual(sim_cosine('nelson', 'neilsen'), name_sim)

    def test_cosine_dist(self):
        """Test abydos.distance.Cosine.dist."""
        name_dist = 1 - (4 / math.sqrt(7 * 8))
        phrase_dist = 1 - 4 / math.sqrt(9 * 7)

        for measure in (self.cmp, self.cmp_q2):
            self.assertEqual(measure.dist('', ''), 0)
            self.assertEqual(measure.dist('nelson', ''), 1)
            self.assertEqual(measure.dist('', 'neilsen'), 1)
            self.assertAlmostEqual(
                measure.dist('nelson', 'neilsen'), name_dist
            )

        # supplied q-gram tests
        grams = self._grams
        self.assertEqual(self.cmp.dist(grams(''), grams('')), 0)
        self.assertEqual(self.cmp.dist(grams('nelson'), grams('')), 1)
        self.assertEqual(self.cmp.dist(grams(''), grams('neilsen')), 1)
        self.assertAlmostEqual(
            self.cmp.dist(grams('nelson'), grams('neilsen')), name_dist
        )

        # non-q-gram tests
        self.assertEqual(self.cmp_ws.dist('', ''), 0)
        self.assertEqual(self.cmp_ws.dist('the quick', ''), 1)
        self.assertEqual(self.cmp_ws.dist('', 'the quick'), 1)
        for src, tar in ((NONQ_FROM, NONQ_TO), (NONQ_TO, NONQ_FROM)):
            self.assertAlmostEqual(self.cmp_ws.dist(src, tar), phrase_dist)

        # Test wrapper
        self.assertAlmostEqual(dist_cosine('nelson', 'neilsen'), name_dist)
class FuzzyWuzzyTokenSetTestCases(unittest.TestCase):
    """Test FuzzyWuzzyTokenSet functions.

    abydos.distance.FuzzyWuzzyTokenSet
    """

    # Default measure and one using an explicit 2-gram tokenizer.
    cmp = FuzzyWuzzyTokenSet()
    cmp_q2 = FuzzyWuzzyTokenSet(tokenizer=QGrams(qval=2))

    def test_fuzzywuzzy_token_set_sim(self):
        """Test abydos.distance.FuzzyWuzzyTokenSet.sim."""
        # Base cases (compared exactly)
        for src, tar, expected in (
            ('', '', 1.0),
            ('a', '', 1.0),
            ('', 'a', 1.0),
            ('abc', '', 1.0),
            ('', 'abc', 1.0),
            ('abc', 'abc', 1.0),
            ('abcd', 'efgh', 0.3333333333333333),
        ):
            self.assertEqual(self.cmp.sim(src, tar), expected)

        for src, tar, expected in (
            ('Nigel', 'Niall', 0.6666666667),
            ('Niall', 'Nigel', 0.6666666667),
            ('Colin', 'Coiln', 0.8333333333),
            ('Coiln', 'Colin', 0.8333333333),
            ('ATCAACGAGT', 'AACGATTAG', 0.6666666667),
        ):
            self.assertAlmostEqual(self.cmp.sim(src, tar), expected)

        # tests from blog
        mariners_score = self.cmp.sim(
            'mariners vs angels',
            'los angeles angels of anaheim at seattle mariners',
        )
        self.assertEqual(mariners_score, 0.9411764705882353)
        self.assertEqual(self.cmp.sim('Sirhan, Sirhan', 'Sirhan'), 1.0)

        # q2 tokenizer
        for src, tar, expected in (
            ('ATCAACGAGT', 'AACGATTAG', 0.84),
            ('YANKEES', 'NEW YORK YANKEES', 0.9545454545454546),
            ('NEW YORK METS', 'NEW YORK YANKEES', 0.8450704225352113),
            (
                'New York Mets vs Atlanta Braves',
                'Atlanta Braves vs New York Mets',
                0.9782608695652174,
            ),
        ):
            self.assertAlmostEqual(self.cmp_q2.sim(src, tar), expected)

    def test_fuzzywuzzy_token_set_dist(self):
        """Test abydos.distance.FuzzyWuzzyTokenSet.dist."""
        # Base cases (compared exactly)
        for src, tar, expected in (
            ('', '', 0.0),
            ('a', '', 0.0),
            ('', 'a', 0.0),
            ('abc', '', 0.0),
            ('', 'abc', 0.0),
            ('abc', 'abc', 0.0),
            ('abcd', 'efgh', 0.6666666666666667),
        ):
            self.assertEqual(self.cmp.dist(src, tar), expected)

        for src, tar, expected in (
            ('Nigel', 'Niall', 0.3333333333),
            ('Niall', 'Nigel', 0.3333333333),
            ('Colin', 'Coiln', 0.1666666667),
            ('Coiln', 'Colin', 0.1666666667),
            ('ATCAACGAGT', 'AACGATTAG', 0.3333333333),
        ):
            self.assertAlmostEqual(self.cmp.dist(src, tar), expected)
class Sift4ExtendedTestCases(unittest.TestCase):
    """Test Sift4Extended functions.

    abydos.distance.Sift4Extended
    """

    # Short alias so the keyword argument below fits on one line.
    ltamc = Sift4Extended.longer_transpositions_are_more_costly

    # Default configuration.
    cmp = Sift4Extended()
    # Every evaluator hook supplied explicitly, on 2-gram tokens.
    cmp_kwargs = Sift4Extended(
        tokenizer=QGrams(qval=2),
        token_matcher=Sift4Extended.sift4_token_matcher,
        matching_evaluator=Sift4Extended.sift4_matching_evaluator,
        local_length_evaluator=Sift4Extended.reward_length_evaluator,
        transposition_cost_evaluator=ltamc,
        transpositions_evaluator=lambda lcss, trans: lcss - trans,
    )
    # Only the local length evaluator replaced.
    cmp_kwargs2 = Sift4Extended(
        local_length_evaluator=Sift4Extended.reward_length_evaluator_exp
    )
    # max_distance set to 3.
    cmp_md = Sift4Extended(max_distance=3)

    def test_sift4_extended_dist_abs(self):
        """Test abydos.distance.Sift4Extended.dist_abs."""
        # Base cases
        base_pairs = (
            ('', ''),
            ('a', ''),
            ('', 'a'),
            ('abc', ''),
            ('', 'abc'),
            ('abc', 'abc'),
            ('abcd', 'efgh'),
        )
        word_pairs = (
            ('Nigel', 'Niall'),
            ('Niall', 'Nigel'),
            ('Colin', 'Coiln'),
            ('Coiln', 'Colin'),
            ('ATCAACGAGT', 'AACGATTAG'),
        )

        # Expected distances are listed in the same order as the pairs
        # above; base cases are compared exactly, word pairs approximately.
        for measure, base_expected, word_expected in (
            (self.cmp, (0, 1, 1, 3, 3, 0, 4), (2, 2, 1, 1, 4)),
            (self.cmp_kwargs, (0, 2, 2, 4, 4, -1, -2), (1, 1, 1, 1, 2)),
        ):
            for (src, tar), expected in zip(base_pairs, base_expected):
                self.assertEqual(measure.dist_abs(src, tar), expected)
            for (src, tar), expected in zip(word_pairs, word_expected):
                self.assertAlmostEqual(measure.dist_abs(src, tar), expected)

        # cmp_kwargs2 is only checked on the last two base pairs.
        for (src, tar), expected in zip(base_pairs[-2:], (0, 8)):
            self.assertEqual(self.cmp_kwargs2.dist_abs(src, tar), expected)
        for (src, tar), expected in zip(word_pairs, (7, 7, 6, 6, 25)):
            self.assertAlmostEqual(
                self.cmp_kwargs2.dist_abs(src, tar), expected
            )

        # coverage completion
        for measure, src, tar in (
            (self.cmp_kwargs, 'beaurocracy', 'bureaucracy'),
            (self.cmp_md, 'beaurocratically', 'bureaucracy'),
            (self.cmp_md, 'bureaucracy', 'bureaucracy'),
        ):
            self.assertAlmostEqual(measure.dist_abs(src, tar), 3)
예제 #7
0
class MinkowskiTestCases(unittest.TestCase):
    """Test Minkowski functions.

    abydos.distance.Minkowski
    """

    # Measures under test: default, explicit 2-grams, pval=0 on 1-grams,
    # and whitespace-delimited tokens.
    cmp = Minkowski()
    cmp_q2 = Minkowski(tokenizer=QGrams(2))
    cmp_q1p0 = Minkowski(pval=0, tokenizer=QGrams(1))
    cmp_ws = Minkowski(tokenizer=WhitespaceTokenizer())

    @staticmethod
    def _grams(word):
        """Return the counter of default QGrams tokens for word."""
        return QGrams().tokenize(word).get_counter()

    def test_minkowski_dist_abs(self):
        """Test abydos.distance.Minkowski.dist_abs."""
        name_cases = (
            ('', '', 0),
            ('nelson', '', 7),
            ('', 'neilsen', 8),
            ('nelson', 'neilsen', 7),
        )
        for measure in (self.cmp, self.cmp_q2):
            for src, tar, expected in name_cases:
                self.assertEqual(measure.dist_abs(src, tar), expected)

        # supplied q-gram tests
        for src, tar, expected in name_cases:
            self.assertEqual(
                self.cmp.dist_abs(self._grams(src), self._grams(tar)),
                expected,
            )

        # non-q-gram tests
        for src, tar, expected in (
            ('', '', 0),
            ('the quick', '', 2),
            ('', 'the quick', 2),
            (NONQ_FROM, NONQ_TO, 8),
            (NONQ_TO, NONQ_FROM, 8),
        ):
            self.assertEqual(self.cmp_ws.dist_abs(src, tar), expected)

        # test l_0 "norm"
        for src, tar, expected in (
            ('', '', 0),
            ('a', '', 1),
            ('a', 'b', 2),
            ('ab', 'b', 1),
            ('aab', 'b', 1),
        ):
            self.assertEqual(self.cmp_q1p0.dist_abs(src, tar), expected)
        for src, tar, expected in (
            ('', '', 0),
            ('a', '', 1),
            ('a', 'b', 1),
            ('ab', 'b', 1 / 2),
            ('aab', 'b', 1 / 2),
            ('aaab', 'b', 1 / 2),
            ('aaab', 'ab', 1 / 2),
        ):
            self.assertEqual(
                self.cmp_q1p0.dist_abs(src, tar, normalized=True), expected
            )

        # test with alphabet (a fresh measure is built for every assertion)
        sized = Minkowski(tokenizer=QGrams(1), alphabet=26)
        self.assertEqual(sized.dist_abs('ab', 'b'), 1)
        for alphabet in (26, 'abcdefghijklmnopqrstuvwxyz'):
            sized = Minkowski(tokenizer=QGrams(1), alphabet=alphabet)
            self.assertEqual(
                sized.dist_abs('ab', 'b', normalized=True), 1 / 26
            )

        unbounded = Minkowski(pval=float('inf'))
        self.assertEqual(unbounded.dist_abs('nelsonian', 'neilsen'), 1.0)

        # Test wrapper
        self.assertAlmostEqual(minkowski('nelson', 'neilsen'), 7)

    def test_minkowski_sim(self):
        """Test abydos.distance.Minkowski.sim."""
        name_sim = 8 / 15

        for measure in (self.cmp, self.cmp_q2):
            self.assertEqual(measure.sim('', ''), 1)
            self.assertEqual(measure.sim('nelson', ''), 0)
            self.assertEqual(measure.sim('', 'neilsen'), 0)
            self.assertAlmostEqual(
                measure.sim('nelson', 'neilsen'), name_sim
            )

        # supplied q-gram tests
        grams = self._grams
        self.assertEqual(self.cmp.sim(grams(''), grams('')), 1)
        self.assertEqual(self.cmp.sim(grams('nelson'), grams('')), 0)
        self.assertEqual(self.cmp.sim(grams(''), grams('neilsen')), 0)
        self.assertAlmostEqual(
            self.cmp.sim(grams('nelson'), grams('neilsen')), name_sim
        )

        # non-q-gram tests
        self.assertEqual(self.cmp_ws.sim('', ''), 1)
        self.assertEqual(self.cmp_ws.sim('the quick', ''), 0)
        self.assertEqual(self.cmp_ws.sim('', 'the quick'), 0)
        for src, tar in ((NONQ_FROM, NONQ_TO), (NONQ_TO, NONQ_FROM)):
            self.assertAlmostEqual(self.cmp_ws.sim(src, tar), 1 / 2)

        # Test wrapper
        self.assertAlmostEqual(sim_minkowski('nelson', 'neilsen'), name_sim)

    def test_minkowski_dist(self):
        """Test abydos.distance.Minkowski.dist."""
        name_dist = 7 / 15

        for measure in (self.cmp, self.cmp_q2):
            self.assertEqual(measure.dist('', ''), 0)
            self.assertEqual(measure.dist('nelson', ''), 1)
            self.assertEqual(measure.dist('', 'neilsen'), 1)
            self.assertAlmostEqual(
                measure.dist('nelson', 'neilsen'), name_dist
            )

        # supplied q-gram tests
        grams = self._grams
        self.assertEqual(self.cmp.dist(grams(''), grams('')), 0)
        self.assertEqual(self.cmp.dist(grams('nelson'), grams('')), 1)
        self.assertEqual(self.cmp.dist(grams(''), grams('neilsen')), 1)
        self.assertAlmostEqual(
            self.cmp.dist(grams('nelson'), grams('neilsen')), name_dist
        )

        # non-q-gram tests
        self.assertEqual(self.cmp_ws.dist('', ''), 0)
        self.assertEqual(self.cmp_ws.dist('the quick', ''), 1)
        self.assertEqual(self.cmp_ws.dist('', 'the quick'), 1)
        for src, tar in ((NONQ_FROM, NONQ_TO), (NONQ_TO, NONQ_FROM)):
            self.assertAlmostEqual(self.cmp_ws.dist(src, tar), 1 / 2)

        # Test wrapper
        self.assertAlmostEqual(
            dist_minkowski('nelson', 'neilsen'), name_dist
        )
예제 #8
0
    def test_qgrams(self):
        """Test abydos.tokenizer.QGrams.

        Covers single q values, sequences of q and skip values (expected to
        give the same tokens whatever order the values are listed in), the
        counter/set/repr views of the result, and rejection of ``qval=0``.
        """

        def grams(text, *args, **kwargs):
            """Return the sorted tokens QGrams(*args, **kwargs) yields."""
            return sorted(QGrams(*args, **kwargs).tokenize(text).get_list())

        # Degenerate inputs: empty string, single character, negative q.
        self.assertEqual(grams(''), [])
        self.assertEqual(grams('a', 2), ['$a', 'a#'])
        self.assertEqual(grams('NELSON', -1), [])

        self.assertEqual(
            grams('NELSON', 3),
            sorted(['$$N', '$NE', 'NEL', 'ELS', 'LSO', 'SON', 'ON#', 'N##']),
        )
        self.assertEqual(
            grams('NELSON', 7),
            sorted([
                '$$$$$$N',
                '$$$$$NE',
                '$$$$NEL',
                '$$$NELS',
                '$$NELSO',
                '$NELSON',
                'ELSON##',
                'LSON###',
                'N######',
                'NELSON#',
                'ON#####',
                'SON####',
            ]),
        )

        # http://www.sound-ex.com/alternative_qgram.htm
        self.assertEqual(
            grams('NELSON'),
            sorted(['$N', 'NE', 'EL', 'LS', 'SO', 'ON', 'N#']),
        )
        self.assertEqual(
            grams('NEILSEN'),
            sorted(['$N', 'NE', 'EI', 'IL', 'LS', 'SE', 'EN', 'N#']),
        )
        self.assertEqual(
            grams('NELSON', start_stop=''),
            sorted(['NE', 'EL', 'LS', 'SO', 'ON']),
        )
        self.assertEqual(
            grams('NEILSEN', start_stop=''),
            sorted(['NE', 'EI', 'IL', 'LS', 'SE', 'EN']),
        )

        # qval given as a sequence: (1, 2), (2, 1) and range(3) are all
        # expected to produce the same tokens.
        mixed_q = sorted([
            '$N', 'E', 'EL', 'L', 'LS', 'N', 'N',
            'N#', 'NE', 'O', 'ON', 'S', 'SO',
        ])
        for qval in ((1, 2), (2, 1), range(3)):
            self.assertEqual(grams('NELSON', qval=qval), mixed_q)
        self.assertEqual(QGrams(qval=(1, 2)).tokenize('NELSON').count(), 13)

        # skip given as a sequence: ascending, descending and range(3) are
        # all expected to produce the same tokens.  (The ascending case
        # replaces an assertion that previously repeated the descending one
        # verbatim.)
        mixed_skip = sorted([
            '$E', '$L', '$N', 'EL', 'EO', 'ES', 'LN',
            'LO', 'LS', 'N', 'N', 'N#', 'NE', 'NL',
            'NS', 'O', 'O#', 'ON', 'S#', 'SN', 'SO',
        ])
        for skip in ((0, 1, 2), (2, 1, 0), range(3)):
            self.assertEqual(grams('NELSON', skip=skip), mixed_skip)
        self.assertEqual(QGrams(skip=(0, 1, 2)).tokenize('NELSON').count(), 21)

        # Counter, repr and set views of a 1-gram tokenization.
        self.assertEqual(
            QGrams(qval=1).tokenize('COLIN').get_counter(),
            Counter({'C': 1, 'O': 1, 'L': 1, 'I': 1, 'N': 1}),
        )
        # q longer than the unpadded string yields no tokens at all.
        self.assertEqual(
            QGrams(qval=10, start_stop='').tokenize('COLIN').get_counter(),
            Counter({}),
        )
        # The repr check is only made on Python 3.6+.
        if sys.version_info >= (3, 6):
            self.assertEqual(
                repr(QGrams(qval=1).tokenize('COLIN')),
                "QGrams({'C': 1, 'O': 1, 'L': 1, 'I': 1, 'N': 1})",
            )
        self.assertEqual(
            QGrams(qval=1).tokenize('COLIN').get_set(),
            {'C', 'O', 'L', 'I', 'N'},
        )

        # Test exception
        self.assertRaises(ValueError, QGrams, 0)
예제 #9
0
class ChebyshevTestCases(unittest.TestCase):
    """Exercise the Chebyshev distance measure.

    abydos.distance.Chebyshev
    """

    cmp = Chebyshev()
    cmp_q2 = Chebyshev(tokenizer=QGrams(2))
    cmp_ws = Chebyshev(tokenizer=WhitespaceTokenizer())

    def test_chebyshev_dist_abs(self):
        """Check dist_abs on strings, supplied Counters and word tokens."""
        exact = self.assertEqual
        approx = self.assertAlmostEqual

        def grams(word):
            # A fresh default-QGrams Counter is built for every argument.
            return QGrams().tokenize(word).get_counter()

        # String input through the default and the QGrams(2) comparators.
        string_cases = (
            (exact, self.cmp, '', '', 0),
            (exact, self.cmp, 'nelson', '', 1),
            (exact, self.cmp, '', 'neilsen', 1),
            (exact, self.cmp, 'nelson', 'neilsen', 1),
            (exact, self.cmp_q2, '', '', 0),
            (exact, self.cmp_q2, 'nelson', '', 1),
            (exact, self.cmp_q2, '', 'neilsen', 1),
            (approx, self.cmp_q2, 'nelson', 'neilsen', 1),
        )
        for check, measure, src, tar, expected in string_cases:
            check(measure.dist_abs(src, tar), expected)

        # Pre-tokenized q-gram Counters handed straight to the comparator.
        counter_cases = (
            (exact, '', '', 0),
            (exact, 'nelson', '', 1),
            (exact, '', 'neilsen', 1),
            (approx, 'nelson', 'neilsen', 1),
        )
        for check, src, tar, expected in counter_cases:
            check(self.cmp.dist_abs(grams(src), grams(tar)), expected)

        # Whitespace tokenization instead of q-grams.
        word_cases = (
            (exact, '', '', 0),
            (exact, 'the quick', '', 1),
            (exact, '', 'the quick', 1),
            (approx, NONQ_FROM, NONQ_TO, 1),
            (approx, NONQ_TO, NONQ_FROM, 1),
        )
        for check, src, tar, expected in word_cases:
            check(self.cmp_ws.dist_abs(src, tar), expected)

        # Module-level wrapper function.
        approx(chebyshev('nelson', 'neilsen', 2), 1)

    def test_chebyshev_dist(self):
        """Calling Chebyshev.dist must raise NotImplementedError."""
        with self.assertRaises(NotImplementedError):
            self.cmp.dist()

    def test_chebyshev_sim(self):
        """Calling Chebyshev.sim must raise NotImplementedError."""
        with self.assertRaises(NotImplementedError):
            self.cmp.sim()
예제 #10
0
    def test_qgrams_intersections(self):
        """Check the sorted `&` intersection of two QGrams tokenizations."""

        def shared(src, tar, **options):
            # Tokenize both words with identically configured, separate
            # QGrams instances and return their sorted intersection.
            left = QGrams(**options).tokenize(src)
            right = QGrams(**options).tokenize(tar)
            return sorted(left & right)

        # Default QGrams configuration.
        default_cases = (
            ('NELSON', '', []),
            ('', 'NEILSEN', []),
            ('NELSON', 'NEILSEN', ['$N', 'NE', 'LS', 'N#']),
            ('NELSON', 'NOSLEN', ['$N', 'N#']),
            ('NAIL', 'LIAN', []),
        )
        for src, tar, expected in default_cases:
            self.assertEqual(shared(src, tar), sorted(expected))

        # Same words with start_stop set to the empty string.
        bare_cases = (
            ('NELSON', 'NEILSEN', ['NE', 'LS']),
            ('NELSON', 'NOSLEN', []),
            ('NAIL', 'LIAN', []),
        )
        for src, tar, expected in bare_cases:
            self.assertEqual(
                shared(src, tar, start_stop=''), sorted(expected)
            )
예제 #11
0
    def test_qgrams_counts(self):
        """Check QGrams token counts, scalers and Counter arithmetic."""
        dna = 'ACAACACCTAG'

        # Tokenizing the empty string is expected to yield no tokens.
        self.assertEqual(QGrams().tokenize('').count(), 0)
        self.assertEqual(len(QGrams().tokenize('').get_list()), 0)

        # (constructor options, word, expected number of tokens)
        sizes = (
            ({}, 'NEILSEN', 8),
            ({}, 'NELSON', 7),
            ({'start_stop': ''}, 'NEILSEN', 6),
            ({'start_stop': ''}, 'NELSON', 5),
        )
        for options, word, expected in sizes:
            self.assertEqual(
                QGrams(**options).tokenize(word).count(), expected
            )
        for options, word, expected in sizes:
            self.assertEqual(
                len(QGrams(**options).tokenize(word).get_list()), expected
            )

        # With scaler='set' every distinct q-gram is expected exactly once.
        distinct = ('$A', 'AC', 'CA', 'AA', 'CC', 'CT', 'TA', 'AG', 'G#')
        self.assertEqual(
            QGrams(scaler='set').tokenize(dna).get_counter(),
            Counter(dict.fromkeys(distinct, 1)),
        )

        # Expected values under the log1p scaler; the literals are
        # approximately ln 2, ln 3 and ln 4.
        ln2 = 0.6931471805599453
        ln3 = 1.0986122886681096
        ln4 = 1.3862943611198906
        gold_standard = Counter({
            '$A': ln2,
            'AC': ln4,
            'CA': ln3,
            'AA': ln2,
            'CC': ln2,
            'CT': ln2,
            'TA': ln2,
            'AG': ln2,
            'G#': ln2,
        })
        test_counter = QGrams(scaler=log1p).tokenize(dna).get_counter()
        for key, value in test_counter.items():
            self.assertAlmostEqual(value, gold_standard[key])

        self.assertEqual(
            QGrams(scaler=log1p).tokenize(dna).count_unique(), 9
        )

        # Subtraction and addition of two tokenizations.
        tokens1 = QGrams().tokenize(dna)
        tokens2 = QGrams().tokenize('GAAGATAC')
        expected_difference = Counter(
            {'$A': 1, 'AC': 2, 'CA': 2, 'CC': 1, 'CT': 1, 'G#': 1}
        )
        self.assertEqual(tokens1 - tokens2, expected_difference)

        expected_sum = Counter({
            '$A': 1,
            'AC': 4,
            'CA': 2,
            'AA': 2,
            'CC': 1,
            'CT': 1,
            'TA': 2,
            'AG': 2,
            'G#': 1,
            '$G': 1,
            'GA': 2,
            'AT': 1,
            'C#': 1,
        })
        self.assertEqual(tokens1 + tokens2, expected_sum)
예제 #12
0
class SAPSTestCases(unittest.TestCase):
    """Exercise the SAPS similarity measure.

    abydos.distance.SAPS
    """

    cmp = SAPS()
    cmp_q2 = SAPS(tokenizer=QGrams(2))

    def test_saps_sim(self):
        """Check SAPS.sim against fixed expected values."""
        # Base cases, compared exactly.
        base_cases = (
            ('', '', 0.0),
            ('a', '', 0.0),
            ('', 'a', 0.0),
            ('abc', '', 0.0),
            ('', 'abc', 0.0),
            ('abc', 'abc', 1.0),
            ('abcd', 'efgh', 0.0),
        )
        for src, tar, expected in base_cases:
            self.assertEqual(self.cmp.sim(src, tar), expected)

        # Compared approximately.
        close_cases = (
            (self.cmp, 'Nigel', 'Niall', 0.0666666667),
            (self.cmp, 'Niall', 'Nigel', 0.0666666667),
            (self.cmp, 'Colin', 'Coiln', 0.0666666667),
            (self.cmp, 'Coiln', 'Colin', 0.0666666667),
            (self.cmp, 'ATCAACGAGT', 'AACGATTAG', 0.4333333333),
            # Coverage: QGrams(2) tokenizer.
            (self.cmp_q2, 'Stevenson', 'Stinson', 0.3857142857),
            # Example from paper.
            (self.cmp, 'Stevenson', 'Stinson', 0.551724138),
        )
        for measure, src, tar, expected in close_cases:
            self.assertAlmostEqual(measure.sim(src, tar), expected)

    def test_saps_dist(self):
        """Check SAPS.dist against fixed expected values."""
        # Base cases, compared exactly.
        base_cases = (
            ('', '', 1.0),
            ('a', '', 1.0),
            ('', 'a', 1.0),
            ('abc', '', 1.0),
            ('', 'abc', 1.0),
            ('abc', 'abc', 0.0),
            ('abcd', 'efgh', 1.0),
        )
        for src, tar, expected in base_cases:
            self.assertEqual(self.cmp.dist(src, tar), expected)

        # Compared approximately.
        close_cases = (
            (self.cmp, 'Nigel', 'Niall', 0.9333333333),
            (self.cmp, 'Niall', 'Nigel', 0.9333333333),
            (self.cmp, 'Colin', 'Coiln', 0.9333333333),
            (self.cmp, 'Coiln', 'Colin', 0.9333333333),
            (self.cmp, 'ATCAACGAGT', 'AACGATTAG', 0.5666666667),
            # Coverage: QGrams(2) tokenizer.
            (self.cmp_q2, 'Stevenson', 'Stinson', 0.614285714),
            # Example from paper.
            (self.cmp, 'Stevenson', 'Stinson', 0.448275862),
        )
        for measure, src, tar, expected in close_cases:
            self.assertAlmostEqual(measure.dist(src, tar), expected)

    def test_saps_sim_score(self):
        """Check SAPS.sim_score against fixed expected values."""
        # Base cases, compared exactly.
        base_cases = (
            ('', '', 0),
            ('a', '', -3),
            ('', 'a', -3),
            ('abc', '', -7),
            ('', 'abc', -7),
            ('abc', 'abc', 13),
            ('abcd', 'efgh', -7),
        )
        for src, tar, expected in base_cases:
            self.assertEqual(self.cmp.sim_score(src, tar), expected)

        # Compared approximately.
        close_cases = (
            ('Nigel', 'Niall', 1),
            ('Niall', 'Nigel', 1),
            ('Colin', 'Coiln', 1),
            ('Coiln', 'Colin', 1),
            ('ATCAACGAGT', 'AACGATTAG', 13),
        )
        for src, tar, expected in close_cases:
            self.assertAlmostEqual(self.cmp.sim_score(src, tar), expected)

        # Coverage (QGrams(2) tokenizer), then the example from the paper.
        final_cases = (
            (self.cmp_q2, 27),
            (self.cmp, 16),
        )
        for measure, expected in final_cases:
            self.assertEqual(
                measure.sim_score('Stevenson', 'Stinson'), expected
            )
예제 #13
0
    def test__tokenizer(self):
        """Check _Tokenizer basics plus QGrams/QSkipgrams scalers."""
        # The base tokenizer is expected to return its input as one token.
        for text in ('', 'a', 'NELSON', 'NEILSEN'):
            self.assertEqual(
                _Tokenizer().tokenize(text).get_counter(), Counter({text: 1})
            )
        self.assertEqual(_Tokenizer().tokenize('NEILSEN').count(), 1)
        self.assertEqual(_Tokenizer().tokenize('NEILSEN').count_unique(), 1)

        tweet = 'Good to be home for a night'
        self.assertEqual(
            _Tokenizer().tokenize(tweet).get_counter(), Counter({tweet: 1})
        )

        # Set, list and repr views of a default QGrams tokenization.
        nelson = QGrams().tokenize('NELSON')
        neilsen = QGrams().tokenize('NEILSEN')
        ordered = ['$N', 'NE', 'EL', 'LS', 'SO', 'ON', 'N#']
        self.assertEqual(
            nelson.get_set(), {'$N', 'EL', 'LS', 'N#', 'NE', 'ON', 'SO'}
        )
        self.assertEqual(nelson.get_list(), ordered)
        # The repr comparison only runs on Python 3.6+ (presumably because
        # it relies on dict ordering -- confirm).
        if sys.version_info >= (3, 6):
            expected_repr = (
                "QGrams({'$N': 1, 'NE': 1, 'EL': 1, 'LS': 1, "
                "'SO': 1, 'ON': 1, 'N#': 1})"
            )
            self.assertEqual(repr(nelson), expected_repr)

        # Intersection, sum and difference of two tokenizations.
        self.assertEqual(
            nelson & neilsen, Counter({'$N': 1, 'NE': 1, 'LS': 1, 'N#': 1})
        )
        expected_sum = Counter({
            '$N': 2,
            'NE': 2,
            'EL': 1,
            'LS': 2,
            'SO': 1,
            'ON': 1,
            'N#': 2,
            'EI': 1,
            'IL': 1,
            'SE': 1,
            'EN': 1,
        })
        self.assertEqual(nelson + neilsen, expected_sum)
        self.assertEqual(
            nelson - neilsen, Counter({'EL': 1, 'SO': 1, 'ON': 1})
        )

        # 'set' scaler on a doubled word.
        nelsonnelson = QGrams(scaler='set').tokenize('NELSONNELSON')
        self.assertEqual(nelsonnelson.count(), 8)

        # 'SSK' scaler on skip-grams.
        nelson_ssk = QSkipgrams(scaler='SSK').tokenize('NELSON')
        self.assertAlmostEqual(nelson_ssk.count(), 18.66784401)

        # log1p scaler on 3-skip-grams. The four distinct expected values
        # are approximately ln 2, ln 3, ln 5 and ln 9.
        ln2 = 0.6931471805599453
        ln3 = 1.0986122886681096
        ln5 = 1.6094379124341003
        ln9 = 2.1972245773362196
        nelson_log = QSkipgrams(qval=3, scaler=log1p).tokenize('NELSON')
        gold_standard = Counter({
            '$$N': ln3,
            '$$E': ln2,
            '$$L': ln2,
            '$$S': ln2,
            '$$O': ln2,
            '$$#': ln3,
            '$NE': ln3,
            '$NL': ln3,
            '$NS': ln3,
            '$NO': ln3,
            '$NN': ln3,
            '$N#': ln9,
            '$EL': ln3,
            '$ES': ln3,
            '$EO': ln3,
            '$EN': ln3,
            '$E#': ln5,
            '$LS': ln3,
            '$LO': ln3,
            '$LN': ln3,
            '$L#': ln5,
            '$SO': ln3,
            '$SN': ln3,
            '$S#': ln5,
            '$ON': ln3,
            '$O#': ln5,
            '$##': ln3,
            'NEL': ln2,
            'NES': ln2,
            'NEO': ln2,
            'NEN': ln2,
            'NE#': ln3,
            'NLS': ln2,
            'NLO': ln2,
            'NLN': ln2,
            'NL#': ln3,
            'NSO': ln2,
            'NSN': ln2,
            'NS#': ln3,
            'NON': ln2,
            'NO#': ln3,
            'NN#': ln3,
            'N##': ln3,
            'ELS': ln2,
            'ELO': ln2,
            'ELN': ln2,
            'EL#': ln3,
            'ESO': ln2,
            'ESN': ln2,
            'ES#': ln3,
            'EON': ln2,
            'EO#': ln3,
            'EN#': ln3,
            'E##': ln2,
            'LSO': ln2,
            'LSN': ln2,
            'LS#': ln3,
            'LON': ln2,
            'LO#': ln3,
            'LN#': ln3,
            'L##': ln2,
            'SON': ln2,
            'SO#': ln3,
            'SN#': ln3,
            'S##': ln2,
            'ON#': ln3,
            'O##': ln2,
        })
        test_counter = nelson_log.get_counter()
        for key, value in test_counter.items():
            self.assertAlmostEqual(value, gold_standard[key])
예제 #14
0
    def test_minkowski_dist_abs(self):
        """Check Minkowski.dist_abs across tokenizers, norms and alphabets."""
        latin = 'abcdefghijklmnopqrstuvwxyz'

        def grams(word):
            # A fresh default-QGrams Counter is built for every argument.
            return QGrams().tokenize(word).get_counter()

        name_cases = (
            ('', '', 0),
            ('nelson', '', 7),
            ('', 'neilsen', 8),
            ('nelson', 'neilsen', 7),
        )

        # String input through the default and the q2 comparators.
        for measure in (self.cmp, self.cmp_q2):
            for src, tar, expected in name_cases:
                self.assertEqual(measure.dist_abs(src, tar), expected)

        # Supplied q-gram Counters.
        for src, tar, expected in name_cases:
            self.assertEqual(
                self.cmp.dist_abs(grams(src), grams(tar)), expected
            )

        # Non-q-gram tokenization.
        word_cases = (
            ('', '', 0),
            ('the quick', '', 2),
            ('', 'the quick', 2),
            (NONQ_FROM, NONQ_TO, 8),
            (NONQ_TO, NONQ_FROM, 8),
        )
        for src, tar, expected in word_cases:
            self.assertEqual(self.cmp_ws.dist_abs(src, tar), expected)

        # l_0 "norm", raw and then normalized.
        raw_cases = (
            ('', '', 0),
            ('a', '', 1),
            ('a', 'b', 2),
            ('ab', 'b', 1),
            ('aab', 'b', 1),
        )
        for src, tar, expected in raw_cases:
            self.assertEqual(self.cmp_q1p0.dist_abs(src, tar), expected)

        normalized_cases = (
            ('', '', 0),
            ('a', '', 1),
            ('a', 'b', 1),
            ('ab', 'b', 1 / 2),
            ('aab', 'b', 1 / 2),
            ('aaab', 'b', 1 / 2),
            ('aaab', 'ab', 1 / 2),
        )
        for src, tar, expected in normalized_cases:
            self.assertEqual(
                self.cmp_q1p0.dist_abs(src, tar, normalized=True), expected
            )

        # Explicit alphabet, given as a size and as a string of symbols.
        sized = Minkowski(tokenizer=QGrams(1), alphabet=26)
        self.assertEqual(sized.dist_abs('ab', 'b'), 1)

        sized = Minkowski(tokenizer=QGrams(1), alphabet=26)
        self.assertEqual(sized.dist_abs('ab', 'b', normalized=True), 1 / 26)

        spelled = Minkowski(tokenizer=QGrams(1), alphabet=latin)
        self.assertEqual(
            spelled.dist_abs('ab', 'b', normalized=True), 1 / 26
        )

        # Infinite pval.
        unbounded = Minkowski(pval=float('inf'))
        self.assertEqual(unbounded.dist_abs('nelsonian', 'neilsen'), 1.0)

        # Module-level wrapper function.
        self.assertAlmostEqual(minkowski('nelson', 'neilsen'), 7)
예제 #15
0
class DiceTestCases(unittest.TestCase):
    """Exercise the Dice similarity and distance.

    abydos.distance.Dice
    """

    cmp = Dice()
    cmp_q2 = Dice(tokenizer=QGrams(2))
    cmp_ws = Dice(tokenizer=WhitespaceTokenizer())

    def test_dice_sim(self):
        """Check Dice.sim on strings, supplied Counters and word tokens."""
        exact = self.assertEqual
        approx = self.assertAlmostEqual

        def grams(word):
            # A fresh default-QGrams Counter is built for every argument.
            return QGrams().tokenize(word).get_counter()

        name_cases = (
            (exact, '', '', 1),
            (exact, 'nelson', '', 0),
            (exact, '', 'neilsen', 0),
            (approx, 'nelson', 'neilsen', 8 / 15),
        )

        # String input through the default and the QGrams(2) comparators.
        for measure in (self.cmp, self.cmp_q2):
            for check, src, tar, expected in name_cases:
                check(measure.sim(src, tar), expected)

        # Supplied q-gram Counters.
        for check, src, tar, expected in name_cases:
            check(self.cmp.sim(grams(src), grams(tar)), expected)

        # Non-q-gram tokenization.
        word_cases = (
            (exact, '', '', 1),
            (exact, 'the quick', '', 0),
            (exact, '', 'the quick', 0),
            (approx, NONQ_FROM, NONQ_TO, 1 / 2),
            (approx, NONQ_TO, NONQ_FROM, 1 / 2),
        )
        for check, src, tar, expected in word_cases:
            check(self.cmp_ws.sim(src, tar), expected)

        # Module-level wrapper function.
        approx(sim_dice('nelson', 'neilsen'), 8 / 15)

    def test_dice_dist(self):
        """Check Dice.dist on strings, supplied Counters and word tokens."""
        exact = self.assertEqual
        approx = self.assertAlmostEqual

        def grams(word):
            # A fresh default-QGrams Counter is built for every argument.
            return QGrams().tokenize(word).get_counter()

        name_cases = (
            (exact, '', '', 0),
            (exact, 'nelson', '', 1),
            (exact, '', 'neilsen', 1),
            (approx, 'nelson', 'neilsen', 7 / 15),
        )

        # String input through the default and the QGrams(2) comparators.
        for measure in (self.cmp, self.cmp_q2):
            for check, src, tar, expected in name_cases:
                check(measure.dist(src, tar), expected)

        # Supplied q-gram Counters.
        for check, src, tar, expected in name_cases:
            check(self.cmp.dist(grams(src), grams(tar)), expected)

        # Non-q-gram tokenization.
        word_cases = (
            (exact, '', '', 0),
            (exact, 'the quick', '', 1),
            (exact, '', 'the quick', 1),
            (approx, NONQ_FROM, NONQ_TO, 1 / 2),
            (approx, NONQ_TO, NONQ_FROM, 1 / 2),
        )
        for check, src, tar, expected in word_cases:
            check(self.cmp_ws.dist(src, tar), expected)

        # Module-level wrapper function.
        approx(dist_dice('nelson', 'neilsen'), 7 / 15)
예제 #16
0
class FuzzyWuzzyTokenSortTestCases(unittest.TestCase):
    """Exercise the FuzzyWuzzyTokenSort similarity and distance.

    abydos.distance.FuzzyWuzzyTokenSort
    """

    cmp = FuzzyWuzzyTokenSort()
    cmp_q2 = FuzzyWuzzyTokenSort(tokenizer=QGrams(qval=2))

    def test_fuzzywuzzy_token_sort_sim(self):
        """Check FuzzyWuzzyTokenSort.sim against fixed expected values."""
        mets_first = 'New York Mets vs Atlanta Braves'
        braves_first = 'Atlanta Braves vs New York Mets'

        # Base cases, compared exactly.
        base_cases = (
            ('', '', 1.0),
            ('a', '', 0.0),
            ('', 'a', 0.0),
            ('abc', '', 0.0),
            ('', 'abc', 0.0),
            ('abc', 'abc', 1.0),
            ('abcd', 'efgh', 0.0),
        )
        for src, tar, expected in base_cases:
            self.assertEqual(self.cmp.sim(src, tar), expected)

        # Compared approximately.
        close_cases = (
            ('Nigel', 'Niall', 0.6),
            ('Niall', 'Nigel', 0.6),
            ('Colin', 'Coiln', 0.8),
            ('Coiln', 'Colin', 0.8),
            ('ATCAACGAGT', 'AACGATTAG', 0.6315789474),
        )
        for src, tar, expected in close_cases:
            self.assertAlmostEqual(self.cmp.sim(src, tar), expected)

        # Test from blog: the same words in a different order.
        self.assertEqual(self.cmp.sim(mets_first, braves_first), 1.0)

        # q2 tokenizer.
        q2_cases = (
            ('ATCAACGAGT', 'AACGATTAG', 0.8524590163934426),
            ('YANKEES', 'NEW YORK YANKEES', 0.6027397260273972),
            ('NEW YORK METS', 'NEW YORK YANKEES', 0.7692307692307693),
            (mets_first, braves_first, 0.9578947368421052),
        )
        for src, tar, expected in q2_cases:
            self.assertAlmostEqual(self.cmp_q2.sim(src, tar), expected)

    def test_fuzzywuzzy_token_sort_dist(self):
        """Check FuzzyWuzzyTokenSort.dist against fixed expected values."""
        # Base cases, compared exactly.
        base_cases = (
            ('', '', 0.0),
            ('a', '', 1.0),
            ('', 'a', 1.0),
            ('abc', '', 1.0),
            ('', 'abc', 1.0),
            ('abc', 'abc', 0.0),
            ('abcd', 'efgh', 1.0),
        )
        for src, tar, expected in base_cases:
            self.assertEqual(self.cmp.dist(src, tar), expected)

        # Compared approximately.
        close_cases = (
            ('Nigel', 'Niall', 0.4),
            ('Niall', 'Nigel', 0.4),
            ('Colin', 'Coiln', 0.2),
            ('Coiln', 'Colin', 0.2),
            ('ATCAACGAGT', 'AACGATTAG', 0.3684210526),
        )
        for src, tar, expected in close_cases:
            self.assertAlmostEqual(self.cmp.dist(src, tar), expected)
예제 #17
0
class OverlapTestCases(unittest.TestCase):
    """Exercise the Overlap similarity and distance.

    abydos.distance.Overlap
    """

    cmp = Overlap()
    cmp_q2 = Overlap(tokenizer=QGrams(2))
    cmp_ws = Overlap(tokenizer=WhitespaceTokenizer())

    def test_overlap_sim(self):
        """Check Overlap.sim on strings, supplied Counters and word tokens."""
        exact = self.assertEqual
        approx = self.assertAlmostEqual

        def grams(word):
            # A fresh default-QGrams Counter is built for every argument.
            return QGrams().tokenize(word).get_counter()

        name_cases = (
            (exact, '', '', 1),
            (exact, 'nelson', '', 0),
            (exact, '', 'neilsen', 0),
            (approx, 'nelson', 'neilsen', 4 / 7),
        )

        # String input through the default and the QGrams(2) comparators.
        for measure in (self.cmp, self.cmp_q2):
            for check, src, tar, expected in name_cases:
                check(measure.sim(src, tar), expected)

        # Supplied q-gram Counters.
        for check, src, tar, expected in name_cases:
            check(self.cmp.sim(grams(src), grams(tar)), expected)

        # Non-q-gram tokenization.
        word_cases = (
            (exact, '', '', 1),
            (exact, 'the quick', '', 0),
            (exact, '', 'the quick', 0),
            (approx, NONQ_FROM, NONQ_TO, 4 / 7),
            (approx, NONQ_TO, NONQ_FROM, 4 / 7),
        )
        for check, src, tar, expected in word_cases:
            check(self.cmp_ws.sim(src, tar), expected)

    def test_overlap_dist(self):
        """Check Overlap.dist on strings, supplied Counters and word tokens."""
        exact = self.assertEqual
        approx = self.assertAlmostEqual

        def grams(word):
            # A fresh default-QGrams Counter is built for every argument.
            return QGrams().tokenize(word).get_counter()

        name_cases = (
            (exact, '', '', 0),
            (exact, 'nelson', '', 1),
            (exact, '', 'neilsen', 1),
            (approx, 'nelson', 'neilsen', 3 / 7),
        )

        # String input through the default and the QGrams(2) comparators.
        for measure in (self.cmp, self.cmp_q2):
            for check, src, tar, expected in name_cases:
                check(measure.dist(src, tar), expected)

        # Supplied q-gram Counters.
        for check, src, tar, expected in name_cases:
            check(self.cmp.dist(grams(src), grams(tar)), expected)

        # Non-q-gram tokenization.
        word_cases = (
            (exact, '', '', 0),
            (exact, 'the quick', '', 1),
            (exact, '', 'the quick', 1),
            (approx, NONQ_FROM, NONQ_TO, 3 / 7),
            (approx, NONQ_TO, NONQ_FROM, 3 / 7),
        )
        for check, src, tar, expected in word_cases:
            check(self.cmp_ws.dist(src, tar), expected)
예제 #18
0
class EuclideanTestCases(unittest.TestCase):
    """Test the Euclidean measure and its wrapper functions.

    Targets abydos.distance.Euclidean through three instances: the
    default one, one built with QGrams(2), and one built with
    WhitespaceTokenizer().
    """

    cmp = Euclidean()
    cmp_q2 = Euclidean(tokenizer=QGrams(2))
    cmp_ws = Euclidean(tokenizer=WhitespaceTokenizer())

    @staticmethod
    def _counter(word):
        """Tokenize word with a default QGrams and return its counter."""
        return QGrams().tokenize(word).get_counter()

    def test_euclidean_dist_abs(self):
        """Test abydos.distance.Euclidean.dist_abs."""
        root7 = 7**0.5
        root8 = 8**0.5
        edge_cases = (
            ('', '', 0),
            ('nelson', '', root7),
            ('', 'neilsen', root8),
        )

        # raw strings: edge cases are compared exactly, the
        # nelson/neilsen pair approximately
        for measure in (self.cmp, self.cmp_q2):
            for src, tar, expected in edge_cases:
                self.assertEqual(measure.dist_abs(src, tar), expected)
            self.assertAlmostEqual(
                measure.dist_abs('nelson', 'neilsen'), root7)

        # supplied q-gram tests
        for src, tar, expected in edge_cases:
            self.assertEqual(
                self.cmp.dist_abs(self._counter(src), self._counter(tar)),
                expected,
            )
        self.assertAlmostEqual(
            self.cmp.dist_abs(
                self._counter('nelson'), self._counter('neilsen')),
            root7,
        )

        # non-q-gram tests
        root2 = 2**0.5
        for src, tar, expected in (
                ('', '', 0),
                ('the quick', '', root2),
                ('', 'the quick', root2),
        ):
            self.assertEqual(self.cmp_ws.dist_abs(src, tar), expected)
        for src, tar in ((NONQ_FROM, NONQ_TO), (NONQ_TO, NONQ_FROM)):
            self.assertAlmostEqual(self.cmp_ws.dist_abs(src, tar), root8)

        # Test wrapper
        self.assertAlmostEqual(euclidean('nelson', 'neilsen'), root7)

    def test_euclidean_sim(self):
        """Test abydos.distance.Euclidean.sim."""
        qgram_sim = 1 - 7**0.5 / 23**0.5
        edge_cases = (
            ('', '', 1),
            ('nelson', '', 0),
            ('', 'neilsen', 0),
        )

        # raw strings, default tokenizer first and QGrams(2) second
        for measure in (self.cmp, self.cmp_q2):
            for src, tar, expected in edge_cases:
                self.assertEqual(measure.sim(src, tar), expected)
            self.assertAlmostEqual(
                measure.sim('nelson', 'neilsen'), qgram_sim)

        # supplied q-gram tests
        for src, tar, expected in edge_cases:
            self.assertEqual(
                self.cmp.sim(self._counter(src), self._counter(tar)),
                expected,
            )
        self.assertAlmostEqual(
            self.cmp.sim(self._counter('nelson'), self._counter('neilsen')),
            qgram_sim,
        )

        # non-q-gram tests
        word_sim = 1 - 8**0.5 / 24**0.5
        for src, tar, expected in (
                ('', '', 1),
                ('the quick', '', 0),
                ('', 'the quick', 0),
        ):
            self.assertEqual(self.cmp_ws.sim(src, tar), expected)
        for src, tar in ((NONQ_FROM, NONQ_TO), (NONQ_TO, NONQ_FROM)):
            self.assertAlmostEqual(self.cmp_ws.sim(src, tar), word_sim)

        # Test wrapper
        self.assertAlmostEqual(sim_euclidean('nelson', 'neilsen'), qgram_sim)

    def test_euclidean_dist(self):
        """Test abydos.distance.Euclidean.dist."""
        qgram_dist = 7**0.5 / 23**0.5
        edge_cases = (
            ('', '', 0),
            ('nelson', '', 1),
            ('', 'neilsen', 1),
        )

        # raw strings, default tokenizer first and QGrams(2) second
        for measure in (self.cmp, self.cmp_q2):
            for src, tar, expected in edge_cases:
                self.assertEqual(measure.dist(src, tar), expected)
            self.assertAlmostEqual(
                measure.dist('nelson', 'neilsen'), qgram_dist)

        # supplied q-gram tests
        for src, tar, expected in edge_cases:
            self.assertEqual(
                self.cmp.dist(self._counter(src), self._counter(tar)),
                expected,
            )
        self.assertAlmostEqual(
            self.cmp.dist(self._counter('nelson'), self._counter('neilsen')),
            qgram_dist,
        )

        # non-q-gram tests
        word_dist = 8**0.5 / 24**0.5
        for src, tar, expected in (
                ('', '', 0),
                ('the quick', '', 1),
                ('', 'the quick', 1),
        ):
            self.assertEqual(self.cmp_ws.dist(src, tar), expected)
        for src, tar in ((NONQ_FROM, NONQ_TO), (NONQ_TO, NONQ_FROM)):
            self.assertAlmostEqual(self.cmp_ws.dist(src, tar), word_dist)

        # Test wrapper
        self.assertAlmostEqual(
            dist_euclidean('nelson', 'neilsen'), qgram_dist)
예제 #19
0
class ManhattanTestCases(unittest.TestCase):
    """Test the Manhattan measure and its wrapper functions.

    Targets abydos.distance.Manhattan through three instances: the
    default one, one built with QGrams(2), and one built with
    WhitespaceTokenizer().
    """

    cmp = Manhattan()
    cmp_q2 = Manhattan(tokenizer=QGrams(2))
    cmp_ws = Manhattan(tokenizer=WhitespaceTokenizer())

    @staticmethod
    def _counter(word):
        """Tokenize word with a default QGrams and return its counter."""
        return QGrams().tokenize(word).get_counter()

    def test_manhattan_dist_abs(self):
        """Test abydos.distance.Manhattan.dist_abs."""
        edge_cases = (
            ('', '', 0),
            ('nelson', '', 7),
            ('', 'neilsen', 8),
        )

        # raw strings: edge cases are compared exactly, the
        # nelson/neilsen pair approximately
        for measure in (self.cmp, self.cmp_q2):
            for src, tar, expected in edge_cases:
                self.assertEqual(measure.dist_abs(src, tar), expected)
            self.assertAlmostEqual(measure.dist_abs('nelson', 'neilsen'), 7)

        # supplied q-gram tests
        for src, tar, expected in edge_cases:
            self.assertEqual(
                self.cmp.dist_abs(self._counter(src), self._counter(tar)),
                expected,
            )
        self.assertAlmostEqual(
            self.cmp.dist_abs(
                self._counter('nelson'), self._counter('neilsen')),
            7,
        )

        # non-q-gram tests
        for src, tar, expected in (
                ('', '', 0),
                ('the quick', '', 2),
                ('', 'the quick', 2),
        ):
            self.assertEqual(self.cmp_ws.dist_abs(src, tar), expected)
        for src, tar in ((NONQ_FROM, NONQ_TO), (NONQ_TO, NONQ_FROM)):
            self.assertAlmostEqual(self.cmp_ws.dist_abs(src, tar), 8)

        # Test wrapper
        self.assertAlmostEqual(manhattan('nelson', 'neilsen'), 7)

    def test_manhattan_sim(self):
        """Test abydos.distance.Manhattan.sim."""
        qgram_sim = 8 / 15
        edge_cases = (
            ('', '', 1),
            ('nelson', '', 0),
            ('', 'neilsen', 0),
        )

        # raw strings, default tokenizer first and QGrams(2) second
        for measure in (self.cmp, self.cmp_q2):
            for src, tar, expected in edge_cases:
                self.assertEqual(measure.sim(src, tar), expected)
            self.assertAlmostEqual(
                measure.sim('nelson', 'neilsen'), qgram_sim)

        # supplied q-gram tests
        for src, tar, expected in edge_cases:
            self.assertEqual(
                self.cmp.sim(self._counter(src), self._counter(tar)),
                expected,
            )
        self.assertAlmostEqual(
            self.cmp.sim(self._counter('nelson'), self._counter('neilsen')),
            qgram_sim,
        )

        # non-q-gram tests
        word_sim = 1 / 2
        for src, tar, expected in (
                ('', '', 1),
                ('the quick', '', 0),
                ('', 'the quick', 0),
        ):
            self.assertEqual(self.cmp_ws.sim(src, tar), expected)
        for src, tar in ((NONQ_FROM, NONQ_TO), (NONQ_TO, NONQ_FROM)):
            self.assertAlmostEqual(self.cmp_ws.sim(src, tar), word_sim)

        # Test wrapper
        self.assertAlmostEqual(sim_manhattan('nelson', 'neilsen'), qgram_sim)

    def test_manhattan_dist(self):
        """Test abydos.distance.Manhattan.dist."""
        qgram_dist = 7 / 15
        edge_cases = (
            ('', '', 0),
            ('nelson', '', 1),
            ('', 'neilsen', 1),
        )

        # raw strings, default tokenizer first and QGrams(2) second
        for measure in (self.cmp, self.cmp_q2):
            for src, tar, expected in edge_cases:
                self.assertEqual(measure.dist(src, tar), expected)
            self.assertAlmostEqual(
                measure.dist('nelson', 'neilsen'), qgram_dist)

        # supplied q-gram tests
        for src, tar, expected in edge_cases:
            self.assertEqual(
                self.cmp.dist(self._counter(src), self._counter(tar)),
                expected,
            )
        self.assertAlmostEqual(
            self.cmp.dist(self._counter('nelson'), self._counter('neilsen')),
            qgram_dist,
        )

        # non-q-gram tests
        word_dist = 1 / 2
        for src, tar, expected in (
                ('', '', 0),
                ('the quick', '', 1),
                ('', 'the quick', 1),
        ):
            self.assertEqual(self.cmp_ws.dist(src, tar), expected)
        for src, tar in ((NONQ_FROM, NONQ_TO), (NONQ_TO, NONQ_FROM)):
            self.assertAlmostEqual(self.cmp_ws.dist(src, tar), word_dist)

        # Test wrapper
        self.assertAlmostEqual(
            dist_manhattan('nelson', 'neilsen'), qgram_dist)
예제 #20
0
class CompleteLinkageTestCases(unittest.TestCase):
    """Test CompleteLinkage functions.

    abydos.distance.CompleteLinkage
    """

    cmp = CompleteLinkage()
    cmp_q4 = CompleteLinkage(tokenizer=QGrams(qval=4, start_stop=''))
    cmp_q4_jw = CompleteLinkage(tokenizer=QGrams(qval=4, start_stop=''),
                                metric=JaroWinkler())

    def test_complete_linkage_dist(self):
        """Test abydos.distance.CompleteLinkage.dist.

        Exact checks on the base cases with self.cmp, approximate checks
        on a handful of name and sequence pairs, then one pair each for
        the self.cmp_q4 and self.cmp_q4_jw instances.
        """
        # Base cases
        base_cases = (
            ('', '', 0.0),
            ('a', '', 0.0),
            ('', 'a', 0.0),
            ('abc', '', 0.0),
            ('', 'abc', 0.0),
            ('abc', 'abc', 1.0),
            ('abcd', 'efgh', 1.0),
        )
        for src, tar, expected in base_cases:
            self.assertEqual(self.cmp.dist(src, tar), expected)

        # every pair below is expected to come out at (almost) 1.0
        word_pairs = (
            ('Nigel', 'Niall'),
            ('Niall', 'Nigel'),
            ('Colin', 'Coiln'),
            ('Coiln', 'Colin'),
            ('ATCAACGAGT', 'AACGATTAG'),
        )
        for src, tar in word_pairs:
            self.assertAlmostEqual(self.cmp.dist(src, tar), 1.0)

        q4_result = self.cmp_q4.dist('AAAT', 'AATT')
        self.assertEqual(q4_result, 0.25)
        q4_jw_result = self.cmp_q4_jw.dist('AAAT', 'AATT')
        self.assertAlmostEqual(q4_jw_result, 0.133333333333)

    def test_complete_linkage_sim(self):
        """Test abydos.distance.CompleteLinkage.sim.

        Exact checks on the base cases with self.cmp, followed by
        approximate checks on a handful of name and sequence pairs.
        """
        # Base cases
        base_cases = (
            ('', '', 1.0),
            ('a', '', 1.0),
            ('', 'a', 1.0),
            ('abc', '', 1.0),
            ('', 'abc', 1.0),
            ('abc', 'abc', 0.0),
            ('abcd', 'efgh', 0.0),
        )
        for src, tar, expected in base_cases:
            self.assertEqual(self.cmp.sim(src, tar), expected)

        # every pair below is expected to come out at (almost) 0.0
        word_pairs = (
            ('Nigel', 'Niall'),
            ('Niall', 'Nigel'),
            ('Colin', 'Coiln'),
            ('Coiln', 'Colin'),
            ('ATCAACGAGT', 'AACGATTAG'),
        )
        for src, tar in word_pairs:
            self.assertAlmostEqual(self.cmp.sim(src, tar), 0.0)

    def test_complete_linkage_dist_abs(self):
        """Test abydos.distance.CompleteLinkage.dist_abs."""
        # Base cases
        self.assertEqual(self.cmp.dist_abs('', ''), float('-inf'))
        self.assertEqual(self.cmp.dist_abs('a', ''), float('-inf'))
        self.assertEqual(self.cmp.dist_abs('', 'a'), float('-inf'))
        self.assertEqual(self.cmp.dist_abs('abc', ''), float('-inf'))
        self.assertEqual(self.cmp.dist_abs('', 'abc'), float('-inf'))
        self.assertEqual(self.cmp.dist_abs('abc', 'abc'), 2)
        self.assertEqual(self.cmp.dist_abs('abcd', 'efgh'), 2)

        self.assertAlmostEqual(self.cmp.dist_abs('Nigel', 'Niall'), 2)
        self.assertAlmostEqual(self.cmp.dist_abs('Niall', 'Nigel'), 2)
        self.assertAlmostEqual(self.cmp.dist_abs('Colin', 'Coiln'), 2)
        self.assertAlmostEqual(self.cmp.dist_abs('Coiln', 'Colin'), 2)
        self.assertAlmostEqual(self.cmp.dist_abs('ATCAACGAGT', 'AACGATTAG'), 2)