def test_character_tokenizer(self):
        """Test abydos.tokenizer.CharacterTokenizer token lists.

        Each input string is tokenized with a fresh CharacterTokenizer and
        the sorted result of get_list() is compared with the expected
        sorted list of tokens.
        """
        cases = (
            ('', []),
            ('a', ['a']),
            ('NELSON', sorted(['N', 'E', 'L', 'S', 'O', 'N'])),
        )
        for text, expected in cases:
            tokens = CharacterTokenizer().tokenize(text).get_list()
            self.assertEqual(sorted(tokens), expected)
    def test_character_tokenizer_counts(self):
        """Test abydos.tokenizer.CharacterTokenizer counts.

        Both count() and the length of get_list() are checked for the
        empty string and for two sample words.
        """

        def tokenized(text):
            # A new tokenizer is built for every check, as each assertion
            # works on its own instance.
            return CharacterTokenizer().tokenize(text)

        self.assertEqual(tokenized('').count(), 0)
        self.assertEqual(len(tokenized('').get_list()), 0)

        expected_sizes = (('NEILSEN', 7), ('NELSON', 6))

        for text, size in expected_sizes:
            self.assertEqual(tokenized(text).count(), size)

        for text, size in expected_sizes:
            self.assertEqual(len(tokenized(text).get_list()), size)
 def test_character_tokenizer_intersections(self):
     """Test abydos.tokenizer.CharacterTokenizer intersections.

     For each pair of strings, both are tokenized with their own
     CharacterTokenizer, the two results are combined with ``&``, and the
     sorted outcome is compared with the expected sorted tokens.
     """
     cases = (
         ('NELSON', '', []),
         ('', 'NEILSEN', []),
         ('NELSON', 'NEILSEN', sorted(['N', 'E', 'L', 'S'])),
         ('NAIL', 'LIAN', sorted(['N', 'A', 'I', 'L'])),
     )
     for left, right, expected in cases:
         common = (
             CharacterTokenizer().tokenize(left)
             & CharacterTokenizer().tokenize(right)
         )
         self.assertEqual(sorted(common), expected)
Beispiel #4
0
    def test_token_distance(self):
        """Test abydos.distance._TokenDistance members.

        The checks go through Jaccard, SokalMichener and AverageLinkage:
        first similarity values under various constructor arguments, then
        the private token-set helpers of a SokalMichener instance after it
        has tokenized 'synonym' and 'antonym'.
        """
        dna_src, dna_tar = 'ATCAACGAGT', 'AACGATTAG'
        word_src, word_tar = 'synonym', 'antonym'

        # Jaccard with different intersection types, alphabets & tokenizers
        soft_jaccard = Jaccard(intersection_type='soft', alphabet=24)
        self.assertAlmostEqual(soft_jaccard.sim(dna_src, dna_tar), 0.68)

        unigram_jaccard = Jaccard(qval=1, alphabet='CGAT')
        self.assertAlmostEqual(unigram_jaccard.sim(dna_src, dna_tar), 0.9)

        skipgram_dna_jaccard = Jaccard(
            tokenizer=QSkipgrams(qval=3), alphabet='CGAT'
        )
        self.assertAlmostEqual(
            skipgram_dna_jaccard.sim(dna_src, dna_tar), 0.6372795969773299
        )

        self.assertAlmostEqual(
            Jaccard(alphabet=None).sim(word_src, word_tar),
            0.3333333333333333,
        )

        skipgram_jaccard = Jaccard(tokenizer=QSkipgrams(qval=3))
        self.assertAlmostEqual(
            skipgram_jaccard.sim(word_src, word_tar), 0.34146341463414637
        )

        # Counter objects passed to sim() in place of strings
        src_ctr = Counter(a=5, b=2, c=10)
        tar_ctr = Counter(a=2, c=1, d=3, e=12)
        self.assertAlmostEqual(Jaccard().sim(src_ctr, tar_ctr), 0.09375)

        # SokalMichener under each normalizer; the last entry uses the
        # constructor defaults.
        normalizer_cases = (
            ({'normalizer': 'proportional'}, 0.984777917351113),
            ({'normalizer': 'log'}, 1.2385752469545532),
            ({'normalizer': 'exp', 'alphabet': 0}, 3.221246147982545e18),
            ({'normalizer': 'laplace'}, 0.98856416772554),
            ({'normalizer': 'inverse'}, 197.95790155440417),
            ({'normalizer': 'complement'}, 1.0204081632653061),
            ({'normalizer': 'base case'}, 0.9897959183673469),
            ({}, 0.9897959183673469),
        )
        for kwargs, expected in normalizer_cases:
            self.assertAlmostEqual(
                SokalMichener(**kwargs).sim(word_src, word_tar), expected
            )

        # Private token-set helpers after tokenizing the word pair
        measure = SokalMichener()
        measure._tokenize(word_src, word_tar)  # noqa: SF01

        # Expected tokens, grouped by which side they occur on
        src_only = ['$s', 'sy', 'yn', 'no']
        tar_only = ['$a', 'an', 'nt', 'to']
        shared = ['on', 'ny', 'ym', 'm#']

        self.assertEqual(
            measure._get_tokens(),  # noqa: SF01
            (Counter(src_only + shared), Counter(tar_only + shared)),
        )
        self.assertEqual(measure._src_card(), 8)  # noqa: SF01
        self.assertEqual(measure._tar_card(), 8)  # noqa: SF01
        self.assertEqual(
            measure._symmetric_difference(),  # noqa: SF01
            Counter(src_only + tar_only),
        )
        self.assertEqual(
            measure._symmetric_difference_card(), 8  # noqa: SF01
        )
        self.assertEqual(
            measure._total_complement_card(), 772  # noqa: SF01
        )
        self.assertEqual(measure._population_card(), 788)  # noqa: SF01
        self.assertEqual(
            measure._union(),  # noqa: SF01
            Counter(src_only + shared + tar_only),
        )
        self.assertEqual(measure._union_card(), 12)  # noqa: SF01

        # The expected difference keeps explicit zero and negative entries,
        # so it is built from a mapping rather than by counting.
        signed_counts = dict.fromkeys(src_only, 1)
        signed_counts.update(dict.fromkeys(shared, 0))
        signed_counts.update(dict.fromkeys(tar_only, -1))
        self.assertEqual(
            measure._difference(),  # noqa: SF01
            Counter(signed_counts),
        )
        self.assertEqual(
            measure._intersection(),  # noqa: SF01
            Counter(shared),
        )
        self.assertEqual(
            measure._get_confusion_table(),  # noqa: SF01
            ConfusionTable(tp=4, tn=772, fp=4, fn=4),
        )

        # Counter supplied as the alphabet
        base_counts = Counter(dict.fromkeys('CGAT', 20))
        dna_measure = SokalMichener(alphabet=base_counts, qval=1)
        dna_measure._tokenize(dna_src, dna_tar)  # noqa: SF01
        self.assertEqual(
            dna_measure._total_complement_card(), 61  # noqa: SF01
        )

        # Linkage intersection with the internal assignment problem enabled
        linkage_jaccard = Jaccard(
            intersection_type='linkage', internal_assignment_problem=True
        )
        for src, tar, expected in (
            ('abandonned', 'abandoned', 1.0),
            ('abundacies', 'abundances', 0.6296296296296297),
        ):
            self.assertAlmostEqual(linkage_jaccard.sim(src, tar), expected)

        # Some additional constructors needed to complete test coverage
        letters = 'abcdefghijklmnop'

        def sim_abc(metric):
            return metric.sim('abc', 'abcd')

        self.assertAlmostEqual(
            sim_abc(Jaccard(alphabet=None, qval=range(2, 4))),
            0.42857142857142855,
        )
        self.assertAlmostEqual(
            sim_abc(AverageLinkage(qval=range(2, 4))), 0.22558922558922556
        )
        self.assertAlmostEqual(
            sim_abc(Jaccard(alphabet=letters, qval=range(2, 4))),
            0.42857142857142855,
        )
        self.assertAlmostEqual(
            sim_abc(
                Jaccard(alphabet=letters, tokenizer=WhitespaceTokenizer())
            ),
            0.0,
        )
        self.assertAlmostEqual(sim_abc(Jaccard(alphabet=list(letters))), 0.5)
        self.assertAlmostEqual(
            sim_abc(Jaccard(tokenizer=CharacterTokenizer())), 0.75
        )
Beispiel #5
0
    RegexpTokenizer,
    SAPSTokenizer,
    SonoriPyTokenizer,
    VCClusterTokenizer,
    WhitespaceTokenizer,
    WordpunctTokenizer,
)

from nltk import TweetTokenizer

from . import EXTREME_TEST, _corpus_file, _fuzz, _random_char

# Map each tokenizer's short name to the ``tokenize`` attribute of a freshly
# constructed instance of that tokenizer.
algorithms = {
    name: tokenizer.tokenize
    for name, tokenizer in (
        ('corvcluster', COrVClusterTokenizer()),
        ('cvcluster', CVClusterTokenizer()),
        ('character', CharacterTokenizer()),
        ('legalipy', LegaliPyTokenizer()),
        ('nltk', NLTKTokenizer(nltk_tokenizer=TweetTokenizer())),
        ('qgrams', QGrams()),
        ('qskipgrams', QSkipgrams()),
        ('regexp', RegexpTokenizer()),
        ('saps', SAPSTokenizer()),
        ('sonoripy', SonoriPyTokenizer()),
        ('vccluster', VCClusterTokenizer()),
        ('whitespace', WhitespaceTokenizer()),
        ('wordpunct', WordpunctTokenizer()),
    )
}


class BigListOfNaughtyStringsTestCases(unittest.TestCase):
    """Test each tokenizer against the BLNS set.