Beispiel #1
0
class BLEUTestCases(unittest.TestCase):
    """Exercise the BLEU measure.

    Covers abydos.distance.BLEU.sim and abydos.distance.BLEU.dist.
    """

    # Default-configured measure shared by both tests.
    cmp = BLEU()
    # Measure built from two tokenizers with explicit weights.
    cmp_skip_saps = BLEU(
        tokenizers=[QSkipgrams(), SAPSTokenizer()], weights=[0.33, 0.67]
    )

    def test_bleu_sim(self):
        """Check BLEU.sim against exact base cases and rounded values."""
        # Base cases, compared exactly
        exact_cases = (
            ('', '', 0.0),
            ('a', '', 0.0),
            ('', 'a', 0.0),
            ('abc', '', 0.0),
            ('', 'abc', 0.0),
            ('abc', 'abc', 1.0),
            ('abcd', 'efgh', 0.0),
        )
        for src, tar, expected in exact_cases:
            self.assertEqual(self.cmp.sim(src, tar), expected)

        # Name pairs are checked in both argument orders
        approx_cases = (
            ('Nigel', 'Niall', 0.6223329773),
            ('Niall', 'Nigel', 0.6223329773),
            ('Colin', 'Coiln', 0.7071067812),
            ('Coiln', 'Colin', 0.7071067812),
            ('ATCAACGAGT', 'AACGATTAG', 0.5119598032),
        )
        for src, tar, expected in approx_cases:
            self.assertAlmostEqual(self.cmp.sim(src, tar), expected)

        # Two-tokenizer, weighted configuration
        weighted_sim = self.cmp_skip_saps.sim('Nigel', 'Niall')
        self.assertAlmostEqual(weighted_sim, 0.7828303104)

    def test_bleu_dist(self):
        """Check BLEU.dist against exact base cases and rounded values."""
        # Base cases, compared exactly
        exact_cases = (
            ('', '', 1.0),
            ('a', '', 1.0),
            ('', 'a', 1.0),
            ('abc', '', 1.0),
            ('', 'abc', 1.0),
            ('abc', 'abc', 0.0),
            ('abcd', 'efgh', 1.0),
        )
        for src, tar, expected in exact_cases:
            self.assertEqual(self.cmp.dist(src, tar), expected)

        # Name pairs are checked in both argument orders
        approx_cases = (
            ('Nigel', 'Niall', 0.3776670227),
            ('Niall', 'Nigel', 0.3776670227),
            ('Colin', 'Coiln', 0.2928932188),
            ('Coiln', 'Colin', 0.2928932188),
            ('ATCAACGAGT', 'AACGATTAG', 0.4880401968),
        )
        for src, tar, expected in approx_cases:
            self.assertAlmostEqual(self.cmp.dist(src, tar), expected)
    def test_pearson_chi_squared_sim_score(self):
        """Check abydos.distance.PearsonChiSquared.sim_score values."""
        # The four name pairs share one expected value per configuration
        name_pairs = (
            ('Nigel', 'Niall'),
            ('Niall', 'Nigel'),
            ('Colin', 'Coiln'),
            ('Coiln', 'Colin'),
        )

        # Base cases
        base_cases = (
            ('', '', 784.0),
            ('a', '', 0.0),
            ('', 'a', 0.0),
            ('abc', '', 0.0),
            ('', 'abc', 0.0),
            ('abc', 'abc', 784.0),
            ('abcd', 'efgh', 0.032298410951138765),
        )
        for src, tar, expected in base_cases:
            self.assertEqual(self.cmp.sim_score(src, tar), expected)

        for src, tar in name_pairs:
            self.assertAlmostEqual(
                self.cmp.sim_score(src, tar), 192.9885210909
            )
        self.assertAlmostEqual(
            self.cmp.sim_score('ATCAACGAGT', 'AACGATTAG'), 344.5438630111
        )

        # Tests with alphabet=0 (no d factor)
        no_d_base_cases = (
            ('', '', 0.0),
            ('a', '', 0.0),
            ('', 'a', 0.0),
            ('abc', '', 0.0),
            ('', 'abc', 0.0),
            ('abc', 'abc', 4.0),
            ('abcd', 'efgh', 10.0),
        )
        for src, tar, expected in no_d_base_cases:
            self.assertEqual(self.cmp_no_d.sim_score(src, tar), expected)

        for src, tar in name_pairs:
            self.assertAlmostEqual(self.cmp_no_d.sim_score(src, tar), 2.25)
        self.assertAlmostEqual(
            self.cmp_no_d.sim_score('ATCAACGAGT', 'AACGATTAG'), 1.5272727273
        )

        # Skipgram tokenizer with the 'SSK' scaler and alphabet=0
        ssk_tokenizer = QSkipgrams(qval=2, scaler='SSK')
        ssk_measure = PearsonChiSquared(alphabet=0, tokenizer=ssk_tokenizer)
        self.assertEqual(ssk_measure.sim_score('a', 'eh'), 0.0)
    def test_koppen_i_corr(self):
        """Check abydos.distance.KoppenI.corr values."""
        # The four name pairs share one expected value per configuration
        name_pairs = (
            ('Nigel', 'Niall'),
            ('Niall', 'Nigel'),
            ('Colin', 'Coiln'),
            ('Coiln', 'Colin'),
        )

        # Base cases
        base_cases = (
            ('', '', 1.0),
            ('a', '', -0.0012771392081735637),
            ('', 'a', -0.0012771392081735637),
            ('abc', '', -0.002557544757033164),
            ('', 'abc', -0.002557544757033164),
            ('abc', 'abc', 1.0),
            ('abcd', 'efgh', -0.006418485237483896),
        )
        for src, tar, expected in base_cases:
            self.assertEqual(self.cmp.corr(src, tar), expected)

        for src, tar in name_pairs:
            self.assertAlmostEqual(self.cmp.corr(src, tar), 0.4942159383)
        self.assertAlmostEqual(
            self.cmp.corr('ATCAACGAGT', 'AACGATTAG'), 0.6591251885
        )

        # Tests with alphabet=0 (no d factor)
        no_d_base_cases = (
            ('', '', 1.0),
            ('a', '', -1.0),
            ('', 'a', -1.0),
            ('abc', '', -1.0),
            ('', 'abc', -1.0),
            ('abc', 'abc', 1.0),
            ('abcd', 'efgh', -1.0),
        )
        for src, tar, expected in no_d_base_cases:
            self.assertEqual(self.cmp_no_d.corr(src, tar), expected)

        for src, tar in name_pairs:
            self.assertAlmostEqual(self.cmp_no_d.corr(src, tar), -1.0)
        self.assertAlmostEqual(
            self.cmp_no_d.corr('ATCAACGAGT', 'AACGATTAG'), -1.0
        )

        # Skipgram tokenizer with the 'SSK' scaler and alphabet=0
        ssk_tokenizer = QSkipgrams(qval=2, scaler='SSK')
        ssk_measure = KoppenI(alphabet=0, tokenizer=ssk_tokenizer)
        self.assertEqual(ssk_measure.corr('eh', 'a'), 0.0)
    def test_scott_pi_corr(self):
        """Check abydos.distance.ScottPi.corr values."""
        # The four name pairs share one expected value per configuration
        name_pairs = (
            ('Nigel', 'Niall'),
            ('Niall', 'Nigel'),
            ('Colin', 'Coiln'),
            ('Coiln', 'Colin'),
        )

        # Base cases
        base_cases = (
            ('', '', 1.0),
            ('a', '', -0.0012771392081137526),
            ('', 'a', -0.0012771392081137526),
            ('abc', '', -0.0025575447570442954),
            ('', 'abc', -0.0025575447570442954),
            ('abc', 'abc', 1.0),
            ('abcd', 'efgh', -0.006418485237489689),
        )
        for src, tar, expected in base_cases:
            self.assertEqual(self.cmp.corr(src, tar), expected)

        for src, tar in name_pairs:
            self.assertAlmostEqual(self.cmp.corr(src, tar), 0.4961439589)
        self.assertAlmostEqual(
            self.cmp.corr('ATCAACGAGT', 'AACGATTAG'), 0.6621417798
        )

        # Tests with alphabet=0 (no d factor)
        no_d_base_cases = (
            ('', '', 1.0),
            ('a', '', -1.0),
            ('', 'a', -1.0),
            ('abc', '', -1.0),
            ('', 'abc', -1.0),
            ('abc', 'abc', 1.0),
            ('abcd', 'efgh', -1.0),
        )
        for src, tar, expected in no_d_base_cases:
            self.assertEqual(self.cmp_no_d.corr(src, tar), expected)

        for src, tar in name_pairs:
            self.assertAlmostEqual(self.cmp_no_d.corr(src, tar), -0.5)
        self.assertAlmostEqual(
            self.cmp_no_d.corr('ATCAACGAGT', 'AACGATTAG'), -0.3333333333
        )

        # Skipgram tokenizer with the 'SSK' scaler and alphabet=0
        ssk_tokenizer = QSkipgrams(qval=2, scaler='SSK')
        ssk_measure = ScottPi(alphabet=0, tokenizer=ssk_tokenizer)
        self.assertEqual(ssk_measure.corr('eh', 'a'), 0.0)
Beispiel #5
0
    def test_digby_corr(self):
        """Check abydos.distance.Digby.corr values."""
        # The four name pairs share one expected value per configuration
        name_pairs = (
            ('Nigel', 'Niall'),
            ('Niall', 'Nigel'),
            ('Colin', 'Coiln'),
            ('Coiln', 'Colin'),
        )
        # Both configurations expect the same results for the base cases
        base_cases = (
            ('', '', 1.0),
            ('a', '', -1.0),
            ('', 'a', -1.0),
            ('abc', '', -1.0),
            ('', 'abc', -1.0),
            ('abc', 'abc', 1.0),
            ('abcd', 'efgh', -1.0),
        )

        # Base cases
        for src, tar, expected in base_cases:
            self.assertEqual(self.cmp.corr(src, tar), expected)

        for src, tar in name_pairs:
            self.assertAlmostEqual(self.cmp.corr(src, tar), 0.9694362533)
        self.assertAlmostEqual(
            self.cmp.corr('ATCAACGAGT', 'AACGATTAG'), 0.9797093576
        )

        # Tests with alphabet=0 (no d factor)
        for src, tar, expected in base_cases:
            self.assertEqual(self.cmp_no_d.corr(src, tar), expected)

        for src, tar in name_pairs:
            self.assertAlmostEqual(self.cmp_no_d.corr(src, tar), -1.0)
        self.assertAlmostEqual(
            self.cmp_no_d.corr('ATCAACGAGT', 'AACGATTAG'), -1.0
        )

        # Skipgram tokenizer with the 'SSK' scaler and alphabet=0
        ssk_tokenizer = QSkipgrams(qval=2, scaler='SSK')
        ssk_measure = Digby(alphabet=0, tokenizer=ssk_tokenizer)
        self.assertEqual(ssk_measure.corr('a', 'eh'), 0.0)
Beispiel #6
0
    def test_baulieu_xi_dist(self):
        """Check abydos.distance.BaulieuXI.dist values."""
        # Base cases
        base_cases = (
            ('', '', 0.0),
            ('a', '', 0.002551020408163265),
            ('', 'a', 0.002551020408163265),
            ('abc', '', 0.00510204081632653),
            ('', 'abc', 0.00510204081632653),
            ('abc', 'abc', 0.0),
            ('abcd', 'efgh', 0.012755102040816327),
        )
        for src, tar, expected in base_cases:
            self.assertEqual(self.cmp.dist(src, tar), expected)

        # The four name pairs share one expected value
        name_pairs = (
            ('Nigel', 'Niall'),
            ('Niall', 'Nigel'),
            ('Colin', 'Coiln'),
            ('Coiln', 'Colin'),
        )
        for src, tar in name_pairs:
            self.assertAlmostEqual(self.cmp.dist(src, tar), 0.0076824584)
        self.assertAlmostEqual(
            self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.009009009
        )

        # Skipgram tokenizer with the 'SSK' scaler and alphabet=None
        ssk_tokenizer = QSkipgrams(qval=2, scaler='SSK')
        ssk_measure = BaulieuXI(alphabet=None, tokenizer=ssk_tokenizer)
        self.assertEqual(ssk_measure.dist('a', 'eh'), 0.0)
Beispiel #7
0
    def test_corpus(self):
        """Check the nested document/sentence/word lists built by Corpus.

        Covers abydos.corpus.Corpus with default and custom splitters,
        character filtering, stop words and a custom word tokenizer.
        """
        # base cases
        self.assertEqual(Corpus().corpus, [])
        for blank in ('', ' ', '\n', ' \n', ' \n '):
            self.assertEqual(Corpus(blank).corpus, [])

        default_cases = (
            # one document/one sentence
            ('a', [[['a']]]),
            ('ab ab', [[['ab', 'ab']]]),
            ('abc def ghi', [[['abc', 'def', 'ghi']]]),
            # multiple documents (one sentence each)
            ('abc\n\ndef ghi', [[['abc']], [['def', 'ghi']]]),
            ('abc\n\ndef ghi\n\n', [[['abc']], [['def', 'ghi']]]),
            ('\n\nabc\r\n\ndef ghi\n\n', [[['abc']], [['def', 'ghi']]]),
            # one document (multiple sentences each)
            ('abc\n def ghi', [[['abc'], ['def', 'ghi']]]),
            ('abc\n def ghi\n', [[['abc'], ['def', 'ghi']]]),
            ('\nabc\n def ghi\n', [[['abc'], ['def', 'ghi']]]),
            # multiple documents (multiple sentences each)
            (
                'abc\n abc def\n\n\ndef ghi\n jkl\n',
                [[['abc'], ['abc', 'def']], [['def', 'ghi'], ['jkl']]],
            ),
        )
        for text, expected in default_cases:
            self.assertEqual(Corpus(text).corpus, expected)

        # sentence(s) with ignorables
        filtered_cases = (
            ('abc\nd-ef ghi\n', [[['abc'], ['def', 'ghi']]]),
            ('abc\n\n\nd-ef ghi\n\n\n', [[['abc']], [['def', 'ghi']]]),
            (
                '\n\nabc\r\n\ndef ghi.\n\na b c d e f g.\n\n\n',
                [
                    [['abc']],
                    [['def', 'ghi']],
                    [['a', 'b', 'c', 'd', 'e', 'f', 'g']],
                ],
            ),
        )
        for text, expected in filtered_cases:
            self.assertEqual(Corpus(text, filter_chars='.-').corpus, expected)

        # sentences with stopword removal
        no_articles = Corpus(
            'The quick brown fox jumped over the lazy dog',
            stop_words=('The', 'the'),
        )
        self.assertEqual(
            no_articles.corpus,
            [[['quick', 'brown', 'fox', 'jumped', 'over', 'lazy', 'dog']]],
        )
        no_a = Corpus('a ab abc def', stop_words=('A', 'a'))
        self.assertEqual(no_a.corpus, [[['ab', 'abc', 'def']]])

        # alternate document divider
        at_docs = Corpus(
            'The quick brown@ fox jumped over@the lazy dog', doc_split='@'
        )
        self.assertEqual(
            at_docs.corpus,
            [
                [['The', 'quick', 'brown']],
                [['fox', 'jumped', 'over']],
                [['the', 'lazy', 'dog']],
            ],
        )

        # alternate sentence divider
        dollar_sents = Corpus(
            'The quick brown$ fox jumped over$the lazy dog', sent_split='$'
        )
        self.assertEqual(
            dollar_sents.corpus,
            [[
                ['The', 'quick', 'brown'],
                ['fox', 'jumped', 'over'],
                ['the', 'lazy', 'dog'],
            ]],
        )
        both_dividers = Corpus(
            'The quick brown$ fox jumped over@the lazy dog',
            doc_split='@',
            sent_split='$',
        )
        self.assertEqual(
            both_dividers.corpus,
            [
                [['The', 'quick', 'brown'], ['fox', 'jumped', 'over']],
                [['the', 'lazy', 'dog']],
            ],
        )
        marker_sents = Corpus(
            '<BOS> The quick brown <EOS>'
            '<BOS> fox jumped over the lazy dog <EOS>',
            sent_split='<BOS>',
            stop_words=['<EOS>'],
        )
        self.assertEqual(
            marker_sents.corpus,
            [[
                ['The', 'quick', 'brown'],
                ['fox', 'jumped', 'over', 'the', 'lazy', 'dog'],
            ]],
        )

        # custom word tokenizer: skipgrams of length 3, no start/stop symbols
        trigram_skipper = QSkipgrams(qval=3, start_stop='')
        expected_skipgrams = [
            'qui', 'quc', 'quk', 'qic', 'qik',
            'qck', 'uic', 'uik', 'uck', 'ick',
        ]
        self.assertEqual(
            Corpus('quick', word_tokenizer=trigram_skipper).corpus,
            [[expected_skipgrams]],
        )
Beispiel #8
0
    def test_token_distance(self):
        """Test abydos.distance._TokenDistance members.

        The shared token-distance machinery is exercised through two concrete
        measures, Jaccard and SokalMichener: first via assorted constructor
        options, then by inspecting the internal (underscore-prefixed) helper
        methods directly after tokenizing a known pair of strings.
        """
        # Jaccard with different intersection types, alphabets and tokenizers
        self.assertAlmostEqual(
            Jaccard(intersection_type='soft', alphabet=24).sim(
                'ATCAACGAGT', 'AACGATTAG'
            ),
            0.68,
        )
        self.assertAlmostEqual(
            Jaccard(qval=1, alphabet='CGAT').sim('ATCAACGAGT', 'AACGATTAG'),
            0.9,
        )
        self.assertAlmostEqual(
            Jaccard(tokenizer=QSkipgrams(qval=3), alphabet='CGAT').sim(
                'ATCAACGAGT', 'AACGATTAG'
            ),
            0.6372795969773299,
        )
        self.assertAlmostEqual(
            Jaccard(alphabet=None).sim('synonym', 'antonym'),
            0.3333333333333333,
        )
        self.assertAlmostEqual(
            Jaccard(tokenizer=QSkipgrams(qval=3)).sim('synonym', 'antonym'),
            0.34146341463414637,
        )

        # Counter objects are passed to sim() in place of strings
        src_ctr = Counter({'a': 5, 'b': 2, 'c': 10})
        tar_ctr = Counter({'a': 2, 'c': 1, 'd': 3, 'e': 12})
        self.assertAlmostEqual(Jaccard().sim(src_ctr, tar_ctr), 0.09375)

        # SokalMichener under each normalizer option, then the default
        self.assertAlmostEqual(
            SokalMichener(normalizer='proportional').sim('synonym', 'antonym'),
            0.984777917351113,
        )
        self.assertAlmostEqual(
            SokalMichener(normalizer='log').sim('synonym', 'antonym'),
            1.2385752469545532,
        )
        self.assertAlmostEqual(
            SokalMichener(normalizer='exp', alphabet=0).sim(
                'synonym', 'antonym'
            ),
            3.221246147982545e18,
        )
        self.assertAlmostEqual(
            SokalMichener(normalizer='laplace').sim('synonym', 'antonym'),
            0.98856416772554,
        )
        self.assertAlmostEqual(
            SokalMichener(normalizer='inverse').sim('synonym', 'antonym'),
            197.95790155440417,
        )
        self.assertAlmostEqual(
            SokalMichener(normalizer='complement').sim('synonym', 'antonym'),
            1.0204081632653061,
        )
        self.assertAlmostEqual(
            SokalMichener(normalizer='base case').sim('synonym', 'antonym'),
            0.9897959183673469,
        )
        self.assertAlmostEqual(
            SokalMichener().sim('synonym', 'antonym'), 0.9897959183673469
        )

        # Tokenize one pair, then check each internal helper against the
        # token multisets that pair produces. The checks below all read the
        # state left behind by this _tokenize call.
        sm = SokalMichener()
        sm._tokenize('synonym', 'antonym')  # noqa: SF01

        self.assertEqual(
            sm._get_tokens(),  # noqa: SF01
            (
                Counter(
                    {
                        '$s': 1,
                        'sy': 1,
                        'yn': 1,
                        'no': 1,
                        'on': 1,
                        'ny': 1,
                        'ym': 1,
                        'm#': 1,
                    }
                ),
                Counter(
                    {
                        '$a': 1,
                        'an': 1,
                        'nt': 1,
                        'to': 1,
                        'on': 1,
                        'ny': 1,
                        'ym': 1,
                        'm#': 1,
                    }
                ),
            ),
        )
        self.assertEqual(sm._src_card(), 8)  # noqa: SF01
        self.assertEqual(sm._tar_card(), 8)  # noqa: SF01
        self.assertEqual(
            sm._symmetric_difference(),  # noqa: SF01
            Counter(
                {
                    '$s': 1,
                    'sy': 1,
                    'yn': 1,
                    'no': 1,
                    '$a': 1,
                    'an': 1,
                    'nt': 1,
                    'to': 1,
                }
            ),
        )
        self.assertEqual(sm._symmetric_difference_card(), 8)  # noqa: SF01
        self.assertEqual(sm._total_complement_card(), 772)  # noqa: SF01
        self.assertEqual(sm._population_card(), 788)  # noqa: SF01
        self.assertEqual(
            sm._union(),  # noqa: SF01
            Counter(
                {
                    '$s': 1,
                    'sy': 1,
                    'yn': 1,
                    'no': 1,
                    'on': 1,
                    'ny': 1,
                    'ym': 1,
                    'm#': 1,
                    '$a': 1,
                    'an': 1,
                    'nt': 1,
                    'to': 1,
                }
            ),
        )
        self.assertEqual(sm._union_card(), 12)  # noqa: SF01
        # The expected difference keeps zero and negative counts explicitly
        self.assertEqual(
            sm._difference(),  # noqa: SF01
            Counter(
                {
                    '$s': 1,
                    'sy': 1,
                    'yn': 1,
                    'no': 1,
                    'on': 0,
                    'ny': 0,
                    'ym': 0,
                    'm#': 0,
                    '$a': -1,
                    'an': -1,
                    'nt': -1,
                    'to': -1,
                }
            ),
        )
        self.assertEqual(
            sm._intersection(),  # noqa: SF01
            Counter({'on': 1, 'ny': 1, 'ym': 1, 'm#': 1}),
        )
        self.assertEqual(
            sm._get_confusion_table(),  # noqa: SF01
            ConfusionTable(tp=4, tn=772, fp=4, fn=4),
        )

        # Alphabet supplied as a Counter, with single-character tokens
        sm = SokalMichener(
            alphabet=Counter({'C': 20, 'G': 20, 'A': 20, 'T': 20}), qval=1
        )
        sm._tokenize('ATCAACGAGT', 'AACGATTAG')  # noqa: SF01
        self.assertEqual(sm._total_complement_card(), 61)  # noqa: SF01

        # 'linkage' intersection type with internal_assignment_problem=True
        jac = Jaccard(
            intersection_type='linkage', internal_assignment_problem=True
        )
        self.assertAlmostEqual(jac.sim('abandonned', 'abandoned'), 1.0)
        self.assertAlmostEqual(
            jac.sim('abundacies', 'abundances'), 0.6296296296296297
        )

        # Some additional constructors needed to complete test coverage
        self.assertAlmostEqual(
            Jaccard(alphabet=None, qval=range(2, 4)).sim('abc', 'abcd'),
            0.42857142857142855,
        )
        self.assertAlmostEqual(
            AverageLinkage(qval=range(2, 4)).sim('abc', 'abcd'),
            0.22558922558922556,
        )
        self.assertAlmostEqual(
            Jaccard(alphabet='abcdefghijklmnop', qval=range(2, 4)).sim(
                'abc', 'abcd'
            ),
            0.42857142857142855,
        )
        self.assertAlmostEqual(
            Jaccard(
                alphabet='abcdefghijklmnop', tokenizer=WhitespaceTokenizer()
            ).sim('abc', 'abcd'),
            0.0,
        )
        self.assertAlmostEqual(
            Jaccard(alphabet=list('abcdefghijklmnop')).sim('abc', 'abcd'), 0.5
        )
        self.assertAlmostEqual(
            Jaccard(tokenizer=CharacterTokenizer()).sim('abc', 'abcd'), 0.75
        )
class UnigramCorpusTestCases(unittest.TestCase):
    """Test abydos.corpus.UnigramCorpus.

    The corpora below are built once, at class-definition time, and shared by
    all test methods.
    """

    # Corpus loaded from a single n-gram file.
    simple_corpus = UnigramCorpus()
    simple_corpus.gng_importer(_corpus_file('simple-ngrams.txt'))

    # The same file imported twice into one corpus.
    double_corpus = UnigramCorpus()
    double_corpus.gng_importer(_corpus_file('simple-ngrams.txt'))
    double_corpus.gng_importer(_corpus_file('simple-ngrams.txt'))

    # NOTE: the backslash line continuations sit inside the string literal, so
    # the leading indentation of each continued line is part of the text.
    sotu2015_sample = "Mr. Speaker, Mr. Vice President, Members of Congress,\
    my fellow Americans:\n\nWe are 15 years into this new century.\n Fifteen\
    years that dawned with terror touching our shores; that unfolded with a\
    new generation fighting two long and costly wars; that saw a vicious\
    recession spread across our nation and the world.\n It has been, and still\
    is, a hard time for many.\n\nBut tonight, we turn the page.\n Tonight,\
    after a breakthrough year for America, our economy is growing and creating\
    jobs at the fastest pace since 1999.\n Our unemployment rate is now lower\
    than it was before the financial crisis.\n More of our kids are graduating\
    than ever before.\n More of our people are insured than ever before.\n And\
    we are as free from the grip of foreign oil as we've been in almost 30\
    years.\n\nTonight, for the first time since 9/11, our combat mission in\
    Afghanistan is over.\n Six years ago, nearly 180,000 American troops\
    served in Iraq and Afghanistan.\n Today, fewer than 15,000 remain.\n And\
    we salute the courage and sacrifice of every man and woman in this 9/11\
    Generation who has served to keep us safe.\n We are humbled and grateful\
    for your service.\n\nAmerica, for all that we have endured; for all the\
    grit and hard work required to come back; for all the tasks that lie\
    ahead, know this: The shadow of crisis has passed, and the State of the\
    Union is strong.\n\nAt this moment -- with a growing economy, shrinking\
    deficits, bustling industry, booming energy production -- we have risen\
    from recession freer to write our own future than any other nation on\
    Earth.\n It's now up to us to choose who we want to be over the next 15\
    years and for decades to come.\n\nWill we accept an economy where only a\
    few of us do spectacularly well?\n Or will we commit ourselves to an\
    economy that generates rising incomes and chances for everyone who makes\
    the effort?\n\nWill we approach the world fearful and reactive, dragged\
    into costly conflicts that strain our military and set back our\
    standing?\n Or will we lead wisely, using all elements of our power to\
    defeat new threats and protect our planet?\n\nWill we allow ourselves to\
    be sorted into factions and turned against one another?\n Or will we\
    recapture the sense of common purpose that has always propelled America\
    forward?\n\nIn two weeks, I will send this Congress a budget filled with\
    ideas that are practical, not partisan.\n And in the months ahead, I'll\
    crisscross the country making a case for those ideas.\n So tonight, I want\
    to focus less on a checklist of proposals, and focus more on the values at\
    stake in the choices before us."

    # Corpus built directly from the raw text above.
    sotu2015_corpus = UnigramCorpus(sotu2015_sample)

    # Empty corpora with a word transform / custom tokenizer; they are filled
    # in by test_unigram_corpus_gng_importer.
    sdx_corpus = UnigramCorpus(word_transform=Soundex().encode)

    qsg_corpus = UnigramCorpus(
        word_tokenizer=QSkipgrams(qval=3, start_stop=''))

    # Corpus loaded from the part-of-speech variant of the n-gram file.
    pos_corpus = UnigramCorpus()
    pos_corpus.gng_importer(_corpus_file('simple-ngrams-pos.txt'))

    def test_unigram_corpus_init(self):
        """Test abydos.corpus.UnigramCorpus.__init__."""
        self.assertIsInstance(UnigramCorpus(), UnigramCorpus)
        self.assertIsInstance(self.sotu2015_corpus, UnigramCorpus)

    def test_unigram_corpus_gng_importer(self):
        """Test abydos.corpus.UnigramCorpus.gng_importer.

        Note that this imports into the class-level ``sdx_corpus`` and
        ``qsg_corpus`` objects.
        """
        self.assertIsInstance(self.simple_corpus, UnigramCorpus)
        self.assertIsInstance(self.simple_corpus.corpus, defaultdict)

        # skip tests of UnigramCorpus on Python < 3.6 (lack ordered dict)
        if sys.version_info < (3, 6):
            return

        # Resolve the data file through the same helper the class-level
        # fixtures use, rather than a hard-coded relative path.
        ngram_path = _corpus_file('simple-ngrams.txt')

        self.sdx_corpus.gng_importer(ngram_path)
        self.assertEqual(
            list(self.sdx_corpus.corpus.items()),
            [
                ('T000', (20, 20)),
                ('Q200', (2, 2)),
                ('B650', (3, 3)),
                ('F200', (1, 1)),
                ('J513', (4, 4)),
                ('O160', (6, 6)),
                ('L200', (1, 1)),
                ('D200', (5, 5)),
                ('T220', (2, 2)),
                ('Q216', (1, 1)),
                ('B651', (1, 1)),
                ('F251', (1, 1)),
                ('O163', (3, 3)),
                ('T420', (2, 2)),
                ('L232', (1, 1)),
            ],
        )

        self.qsg_corpus.gng_importer(ngram_path)
        # Only every second entry of the first thirty is compared.
        self.assertEqual(
            list(self.qsg_corpus.corpus.items())[:30:2],
            [
                ('the', (27, 27)),
                ('quc', (5, 5)),
                ('qic', (5, 5)),
                ('qck', (5, 5)),
                ('uik', (5, 5)),
                ('ick', (5, 5)),
                ('brw', (5, 5)),
                ('bow', (5, 5)),
                ('bwn', (5, 5)),
                ('ron', (5, 5)),
                ('own', (5, 5)),
                ('jum', (5, 5)),
                ('jue', (6, 5)),
                ('jmp', (5, 5)),
                ('jmd', (5, 5)),
            ],
        )

        # No stored term may contain an underscore.
        for term in self.pos_corpus.corpus:
            self.assertNotIn('_', term)

    def test_unigram_corpus_save_load_corpus(self):
        """Test abydos.corpus.UnigramCorpus.save_corpus & .load_corpus."""
        handle, path = tempfile.mkstemp('.dat')
        # mkstemp returns an already-open descriptor; the corpus is written
        # and read through the path, so the descriptor is released right away.
        os.close(handle)
        try:
            self.sotu2015_corpus.save_corpus(path)
            self.sotu2015_corpus.load_corpus(path)
            self.assertGreater(os.stat(path).st_size, 0)
        finally:
            # Remove the temporary file even when a step above fails.
            os.remove(path)

    def test_unigram_corpus_idf(self):
        """Test abydos.corpus.UnigramCorpus.idf."""
        # string-style tests
        self.assertAlmostEqual(self.simple_corpus.idf('the'), 0.69314718056)
        self.assertAlmostEqual(self.simple_corpus.idf('quick'), 2.3978952728)
        # a term not asserted anywhere else in these tests; expected idf is inf
        self.assertAlmostEqual(self.simple_corpus.idf('trolley'), float('inf'))
    def test_qskipgrams(self):
        """Test abydos.tokenizer.QSkipgrams."""
        self.assertEqual(sorted(QSkipgrams().tokenize('').get_list()), [])
        self.assertEqual(
            sorted(QSkipgrams(start_stop='').tokenize('a').get_list()), [])
        self.assertEqual(sorted(QSkipgrams().tokenize('a').get_list()),
                         ['$#', '$a', 'a#'])
        self.assertEqual(
            sorted(QSkipgrams().tokenize('ab').get_list()),
            sorted(['$a', '$b', '$#', 'ab', 'a#', 'b#']),
        )

        self.assertEqual(
            sorted(QSkipgrams().tokenize('NELSON').get_list()),
            sorted([
                '$N',
                '$E',
                '$L',
                '$S',
                '$O',
                '$N',
                '$#',
                'NE',
                'NL',
                'NS',
                'NO',
                'NN',
                'N#',
                'EL',
                'ES',
                'EO',
                'EN',
                'E#',
                'LS',
                'LO',
                'LN',
                'L#',
                'SO',
                'SN',
                'S#',
                'ON',
                'O#',
                'N#',
            ]),
        )
        self.assertEqual(
            sorted(QSkipgrams().tokenize('NEILSEN').get_list()),
            sorted([
                '$N',
                '$E',
                '$I',
                '$L',
                '$S',
                '$E',
                '$N',
                '$#',
                'NE',
                'NI',
                'NL',
                'NS',
                'NE',
                'NN',
                'N#',
                'EI',
                'EL',
                'ES',
                'EE',
                'EN',
                'E#',
                'IL',
                'IS',
                'IE',
                'IN',
                'I#',
                'LS',
                'LE',
                'LN',
                'L#',
                'SE',
                'SN',
                'S#',
                'EN',
                'E#',
                'N#',
            ]),
        )

        self.assertEqual(
            sorted(QSkipgrams(qval=1).tokenize('NEILSEN').get_list()),
            sorted(['N', 'E', 'I', 'L', 'S', 'E', 'N']),
        )

        test_counter = (QSkipgrams(
            qval=(2, ), scaler='SSK').tokenize('NEILSEN').get_counter())
        gold_counter = Counter({
            '$N': 1.2404672100000003,
            '$E': 1.2072969000000002,
            '$I': 0.6561,
            '$L': 0.5904900000000001,
            '$S': 0.531441,
            '$#': 0.3874204890000001,
            'NE': 1.341441,
            'NI': 0.7290000000000001,
            'NL': 0.6561,
            'NS': 0.5904900000000001,
            'NN': 0.4782969000000001,
            'N#': 1.2404672100000003,
            'EI': 0.81,
            'EL': 0.7290000000000001,
            'ES': 0.6561,
            'EE': 0.5904900000000001,
            'EN': 1.341441,
            'E#': 1.2072969000000002,
            'IL': 0.81,
            'IS': 0.7290000000000001,
            'IE': 0.6561,
            'IN': 0.5904900000000001,
            'I#': 0.531441,
            'LS': 0.81,
            'LE': 0.7290000000000001,
            'LN': 0.6561,
            'L#': 0.5904900000000001,
            'SE': 0.81,
            'SN': 0.7290000000000001,
            'S#': 0.6561,
        })
        for key in gold_counter.keys():
            self.assertAlmostEqual(gold_counter[key], test_counter[key])

        test_counter = (QSkipgrams(
            qval=(4, 6, 5, 1, 0),
            scaler='SSK').tokenize('NIALL').get_counter())
        gold_counter = Counter({
            '$$$N': 0.531441,
            '$$$I': 0.4782969000000001,
            '$$$A': 0.4304672100000001,
            '$$$L': 0.7360989291000002,
            '$$$#': 0.8504267154039002,
            '$$NI': 1.4880348000000003,
            '$$NA': 1.3392313200000003,
            '$$NL': 2.2900855572000007,
            '$$N#': 2.645772003478801,
            '$$IA': 1.3392313200000003,
            '$$IL': 2.2900855572000007,
            '$$I#': 2.645772003478801,
            '$$AL': 2.2900855572000007,
            '$$A#': 2.645772003478801,
            '$$LL': 1.0847773692000002,
            '$$L#': 5.291544006957601,
            '$$##': 2.460275073345601,
            '$NIA': 1.4402051100000002,
            '$NIL': 2.462750738100001,
            '$NI#': 2.845254813264901,
            '$NAL': 2.462750738100001,
            '$NA#': 2.845254813264901,
            '$NLL': 1.1665661391000004,
            '$NL#': 5.690509626529802,
            '$N##': 2.645772003478801,
            '$IAL': 2.462750738100001,
            '$IA#': 2.845254813264901,
            '$ILL': 1.1665661391000004,
            '$IL#': 5.690509626529802,
            '$I##': 2.645772003478801,
            '$ALL': 1.1665661391000004,
            '$AL#': 5.690509626529802,
            '$A##': 2.645772003478801,
            '$LL#': 2.845254813264901,
            '$L##': 5.291544006957601,
            '$###': 0.8504267154039002,
            'NIAL': 1.0097379000000002,
            'NIA#': 1.1665661391000002,
            'NILL': 0.4782969000000001,
            'NIL#': 2.3331322782000004,
            'NI##': 1.0847773692000002,
            'NALL': 0.4782969000000001,
            'NAL#': 2.3331322782000004,
            'NA##': 1.0847773692000002,
            'NLL#': 1.1665661391000002,
            'NL##': 2.1695547384000005,
            'N###': 0.3486784401000001,
            'IALL': 0.531441,
            'IAL#': 2.5923691980000005,
            'IA##': 1.2053081880000003,
            'ILL#': 1.2961845990000003,
            'IL##': 2.4106163760000006,
            'I###': 0.3874204890000001,
            'ALL#': 1.4402051100000004,
            'AL##': 2.6784626400000007,
            'A###': 0.4304672100000001,
            'LL##': 1.4880348000000003,
            'L###': 1.0097379000000002,
            '$$$$$N': 0.3486784401000001,
            '$$$$$I': 0.31381059609000006,
            '$$$$$A': 0.2824295364810001,
            '$$$$$L': 0.48295450738251017,
            '$$$$$#': 0.8431447750407974,
            '$$$$NI': 1.6039208244600003,
            '$$$$NA': 1.4435287420140006,
            '$$$$NL': 2.468434148843941,
            '$$$$N#': 4.309406627986299,
            '$$$$IA': 1.4435287420140006,
            '$$$$IL': 2.468434148843941,
            '$$$$I#': 4.309406627986299,
            '$$$$AL': 2.468434148843941,
            '$$$$A#': 4.309406627986299,
            '$$$$LL': 1.1692582810313406,
            '$$$$L#': 8.618813255972597,
            '$$$$##': 7.715070145397851,
            '$$$NIA': 2.984687447256001,
            '$$$NIL': 5.103815534807762,
            '$$$NI#': 8.910270709073119,
            '$$$NAL': 5.103815534807762,
            '$$$NA#': 8.910270709073119,
            '$$$NLL': 2.417596832277361,
            '$$$NL#': 17.82054141814625,
            '$$$N##': 15.951932474542438,
            '$$$IAL': 5.103815534807762,
            '$$$IA#': 8.910270709073119,
            '$$$ILL': 2.417596832277361,
            '$$$IL#': 17.82054141814625,
            '$$$I##': 15.951932474542438,
            '$$$ALL': 2.417596832277361,
            '$$$AL#': 17.82054141814625,
            '$$$A##': 15.951932474542438,
            '$$$LL#': 8.910270709073119,
            '$$$L##': 31.903864949084834,
            '$$$###': 15.08638445665049,
            '$$NIAL': 5.396635688803742,
            '$$NIA#': 9.42147782919388,
            '$$NILL': 2.556301115749141,
            '$$NIL#': 18.84295565838777,
            '$$NI##': 16.867139400002937,
            '$$NALL': 2.556301115749141,
            '$$NAL#': 18.84295565838777,
            '$$NA##': 16.867139400002937,
            '$$NLL#': 9.42147782919388,
            '$$NL##': 33.73427880000585,
            '$$N###': 15.951932474542435,
            '$$IALL': 2.556301115749141,
            '$$IAL#': 18.84295565838777,
            '$$IA##': 16.867139400002937,
            '$$ILL#': 9.42147782919388,
            '$$IL##': 33.73427880000585,
            '$$I###': 15.951932474542435,
            '$$ALL#': 9.42147782919388,
            '$$AL##': 33.73427880000585,
            '$$A###': 15.951932474542435,
            '$$LL##': 16.867139400002937,
            '$$L###': 31.903864949084824,
            '$$####': 7.715070145397851,
            '$NIALL': 1.4278730800535104,
            '$NIAL#': 10.525109490228838,
            '$NIA##': 9.421477829193876,
            '$NILL#': 5.262554745114417,
            '$NIL##': 18.842955658387766,
            '$NI###': 8.910270709073117,
            '$NALL#': 5.262554745114417,
            '$NAL##': 18.842955658387766,
            '$NA###': 8.910270709073117,
            '$NLL##': 9.421477829193876,
            '$NL###': 17.820541418146256,
            '$N####': 4.309406627986299,
            '$IALL#': 5.262554745114417,
            '$IAL##': 18.842955658387766,
            '$IA###': 8.910270709073117,
            '$ILL##': 9.421477829193876,
            '$IL###': 17.820541418146256,
            '$I####': 4.309406627986299,
            '$ALL##': 9.421477829193876,
            '$AL###': 17.820541418146256,
            '$A####': 4.309406627986299,
            '$LL###': 8.910270709073117,
            '$L####': 8.618813255972595,
            '$#####': 0.8431447750407974,
            'NIALL#': 1.4278730800535104,
            'NIAL##': 5.112602231498281,
            'NIA###': 2.417596832277361,
            'NILL##': 2.556301115749141,
            'NIL###': 4.835193664554721,
            'NI####': 1.1692582810313406,
            'NALL##': 2.556301115749141,
            'NAL###': 4.835193664554721,
            'NA####': 1.1692582810313406,
            'NLL###': 2.417596832277361,
            'NL####': 2.338516562062681,
            'N#####': 0.2287679245496101,
            'IALL##': 2.8403345730546006,
            'IAL###': 5.3724374050608015,
            'IA####': 1.2991758678126004,
            'ILL###': 2.6862187025304003,
            'IL####': 2.5983517356252004,
            'I#####': 0.2541865828329001,
            'ALL###': 2.984687447256001,
            'AL####': 2.887057484028001,
            'A#####': 0.2824295364810001,
            'LL####': 1.6039208244600003,
            'L#####': 0.6624890361900002,
            '$$$$N': 0.4304672100000001,
            '$$$$I': 0.3874204890000001,
            '$$$$A': 0.3486784401000001,
            '$$$$L': 0.5962401325710002,
            '$$$$#': 0.8741476583623434,
            '$$$NI': 1.5927286770000002,
            '$$$NA': 1.4334558093000005,
            '$$$NL': 2.4512094339030006,
            '$$$N#': 3.59371815104519,
            '$$$IA': 1.4334558093000005,
            '$$$IL': 2.4512094339030006,
            '$$$I#': 3.59371815104519,
            '$$$AL': 2.4512094339030006,
            '$$$A#': 3.59371815104519,
            '$$$LL': 1.1610992055330005,
            '$$$L#': 7.187436302090378,
            '$$$##': 4.91876456439945,
            '$$NIA': 2.2513435083000006,
            '$$NIL': 3.849797399193001,
            '$$NI#': 5.644187966956859,
            '$$NAL': 3.849797399193001,
            '$$NA#': 5.644187966956859,
            '$$NLL': 1.8235882417230007,
            '$$NL#': 11.28837593391372,
            '$$N##': 7.725266868411147,
            '$$IAL': 3.849797399193001,
            '$$IA#': 5.644187966956859,
            '$$ILL': 1.8235882417230007,
            '$$IL#': 11.28837593391372,
            '$$I##': 7.725266868411147,
            '$$ALL': 1.8235882417230007,
            '$$AL#': 11.28837593391372,
            '$$A##': 7.725266868411147,
            '$$LL#': 5.644187966956859,
            '$$L##': 15.4505337368223,
            '$$###': 4.918764564399449,
            '$NIAL': 2.812715796861001,
            '$NIA#': 4.123722629777913,
            '$NILL': 1.3323390616710005,
            '$NIL#': 8.247445259555828,
            '$NI##': 5.644187966956858,
            '$NALL': 1.3323390616710005,
            '$NAL#': 8.247445259555828,
            '$NA##': 5.644187966956858,
            '$NLL#': 4.123722629777913,
            '$NL##': 11.288375933913724,
            '$N###': 3.593718151045189,
            '$IALL': 1.3323390616710005,
            '$IAL#': 8.247445259555828,
            '$IA##': 5.644187966956858,
            '$ILL#': 4.123722629777913,
            '$IL##': 11.288375933913724,
            '$I###': 3.593718151045189,
            '$ALL#': 4.123722629777913,
            '$AL##': 11.288375933913724,
            '$A###': 3.593718151045189,
            '$LL##': 5.644187966956858,
            '$L###': 7.187436302090377,
            '$####': 0.8741476583623434,
            'NIALL': 0.4304672100000001,
            'NIAL#': 2.664678123342001,
            'NIA##': 1.8235882417230007,
            'NILL#': 1.3323390616710005,
            'NIL##': 3.6471764834460014,
            'NI###': 1.1610992055330005,
            'NALL#': 1.3323390616710005,
            'NAL##': 3.6471764834460014,
            'NA###': 1.1610992055330005,
            'NLL##': 1.8235882417230007,
            'NL###': 2.322198411066001,
            'N####': 0.2824295364810001,
            'IALL#': 1.4803767351900001,
            'IAL##': 4.0524183149400015,
            'IA###': 1.2901102283700003,
            'ILL##': 2.0262091574700007,
            'IL###': 2.5802204567400007,
            'I####': 0.31381059609000006,
            'ALL##': 2.2513435083000006,
            'AL###': 2.8669116186000005,
            'A####': 0.3486784401000001,
            'LL###': 1.5927286770000004,
            'L####': 0.8178876990000001,
            'N': 1.0,
            'I': 1.0,
            'A': 1.0,
            'L': 2.0,
        })
        for key in gold_counter.keys():
            self.assertAlmostEqual(gold_counter[key], test_counter[key])

        self.assertEqual(
            QSkipgrams(qval=(2, 3),
                       scaler='length').tokenize('NIALL').get_counter(),
            Counter({
                '$N': 2,
                '$I': 2,
                '$A': 2,
                '$L': 4,
                '$#': 2,
                'NI': 2,
                'NA': 2,
                'NL': 4,
                'N#': 2,
                'IA': 2,
                'IL': 4,
                'I#': 2,
                'AL': 4,
                'A#': 2,
                'LL': 2,
                'L#': 4,
                '$$N': 3,
                '$$I': 3,
                '$$A': 3,
                '$$L': 6,
                '$$#': 6,
                '$NI': 6,
                '$NA': 6,
                '$NL': 12,
                '$N#': 12,
                '$IA': 6,
                '$IL': 12,
                '$I#': 12,
                '$AL': 12,
                '$A#': 12,
                '$LL': 6,
                '$L#': 24,
                '$##': 6,
                'NIA': 3,
                'NIL': 6,
                'NI#': 6,
                'NAL': 6,
                'NA#': 6,
                'NLL': 3,
                'NL#': 12,
                'N##': 3,
                'IAL': 6,
                'IA#': 6,
                'ILL': 3,
                'IL#': 12,
                'I##': 3,
                'ALL': 3,
                'AL#': 12,
                'A##': 3,
                'LL#': 6,
                'L##': 6,
            }),
        )
        test_counter = (QSkipgrams(
            qval=(2, 3), scaler='length-log').tokenize('NIALL').get_counter())
        gold_counter = Counter({
            '$N': 1.0986122886681096,
            '$I': 1.0986122886681096,
            '$A': 1.0986122886681096,
            '$L': 2.197224577336219,
            '$#': 1.0986122886681096,
            'NI': 1.0986122886681096,
            'NA': 1.0986122886681096,
            'NL': 2.197224577336219,
            'N#': 1.0986122886681096,
            'IA': 1.0986122886681096,
            'IL': 2.197224577336219,
            'I#': 1.0986122886681096,
            'AL': 2.197224577336219,
            'A#': 1.0986122886681096,
            'LL': 1.0986122886681096,
            'L#': 2.197224577336219,
            '$$N': 1.3862943611198906,
            '$$I': 1.3862943611198906,
            '$$A': 1.3862943611198906,
            '$$L': 2.772588722239781,
            '$$#': 2.772588722239781,
            '$NI': 2.772588722239781,
            '$NA': 2.772588722239781,
            '$NL': 5.545177444479562,
            '$N#': 5.545177444479562,
            '$IA': 2.772588722239781,
            '$IL': 5.545177444479562,
            '$I#': 5.545177444479562,
            '$AL': 5.545177444479562,
            '$A#': 5.545177444479562,
            '$LL': 2.772588722239781,
            '$L#': 11.090354888959125,
            '$##': 2.772588722239781,
            'NIA': 1.3862943611198906,
            'NIL': 2.772588722239781,
            'NI#': 2.772588722239781,
            'NAL': 2.772588722239781,
            'NA#': 2.772588722239781,
            'NLL': 1.3862943611198906,
            'NL#': 5.545177444479562,
            'N##': 1.3862943611198906,
            'IAL': 2.772588722239781,
            'IA#': 2.772588722239781,
            'ILL': 1.3862943611198906,
            'IL#': 5.545177444479562,
            'I##': 1.3862943611198906,
            'ALL': 1.3862943611198906,
            'AL#': 5.545177444479562,
            'A##': 1.3862943611198906,
            'LL#': 2.772588722239781,
            'L##': 2.772588722239781,
        })
        for key in gold_counter.keys():
            self.assertAlmostEqual(gold_counter[key], test_counter[key])

        test_counter = (QSkipgrams(
            qval=(2, 3), scaler='length-exp').tokenize('NIALL').get_counter())
        gold_counter = Counter({
            '$N': 7.38905609893065,
            '$I': 7.38905609893065,
            '$A': 7.38905609893065,
            '$L': 14.7781121978613,
            '$#': 7.38905609893065,
            'NI': 7.38905609893065,
            'NA': 7.38905609893065,
            'NL': 14.7781121978613,
            'N#': 7.38905609893065,
            'IA': 7.38905609893065,
            'IL': 14.7781121978613,
            'I#': 7.38905609893065,
            'AL': 14.7781121978613,
            'A#': 7.38905609893065,
            'LL': 7.38905609893065,
            'L#': 14.7781121978613,
            '$$N': 20.085536923187668,
            '$$I': 20.085536923187668,
            '$$A': 20.085536923187668,
            '$$L': 40.171073846375336,
            '$$#': 40.171073846375336,
            '$NI': 40.171073846375336,
            '$NA': 40.171073846375336,
            '$NL': 80.34214769275067,
            '$N#': 80.34214769275067,
            '$IA': 40.171073846375336,
            '$IL': 80.34214769275067,
            '$I#': 80.34214769275067,
            '$AL': 80.34214769275067,
            '$A#': 80.34214769275067,
            '$LL': 40.171073846375336,
            '$L#': 160.68429538550137,
            '$##': 40.171073846375336,
            'NIA': 20.085536923187668,
            'NIL': 40.171073846375336,
            'NI#': 40.171073846375336,
            'NAL': 40.171073846375336,
            'NA#': 40.171073846375336,
            'NLL': 20.085536923187668,
            'NL#': 80.34214769275067,
            'N##': 20.085536923187668,
            'IAL': 40.171073846375336,
            'IA#': 40.171073846375336,
            'ILL': 20.085536923187668,
            'IL#': 80.34214769275067,
            'I##': 20.085536923187668,
            'ALL': 20.085536923187668,
            'AL#': 80.34214769275067,
            'A##': 20.085536923187668,
            'LL#': 40.171073846375336,
            'L##': 40.171073846375336,
        })
        for key in gold_counter.keys():
            self.assertAlmostEqual(gold_counter[key], test_counter[key])
    def test__tokenizer(self):
        """Test abydos.tokenizer._Tokenizer.

        Checks the base tokenizer on plain strings, the multiset operators
        on tokenized QGrams, and the totals/weights produced by several
        scaler options.
        """
        # Each of these inputs is expected back as one token with count 1.
        for text in ('', 'a', 'NELSON', 'NEILSEN'):
            self.assertEqual(
                _Tokenizer().tokenize(text).get_counter(),
                Counter({text: 1}),
            )
        self.assertEqual(_Tokenizer().tokenize('NEILSEN').count(), 1)
        self.assertEqual(_Tokenizer().tokenize('NEILSEN').count_unique(), 1)

        # Embedded spaces do not change that expectation.
        sentence = 'Good to be home for a night'
        self.assertEqual(
            _Tokenizer().tokenize(sentence).get_counter(),
            Counter({sentence: 1}),
        )

        nelson_grams = QGrams().tokenize('NELSON')
        neilsen_grams = QGrams().tokenize('NEILSEN')
        grams_in_order = ['$N', 'NE', 'EL', 'LS', 'SO', 'ON', 'N#']
        self.assertEqual(nelson_grams.get_set(), set(grams_in_order))
        self.assertEqual(nelson_grams.get_list(), grams_in_order)
        # NOTE(review): the repr check is limited to 3.6+, presumably
        # because the key order in the repr is not stable before that.
        if sys.version_info >= (3, 6):
            self.assertEqual(
                repr(nelson_grams),
                "QGrams({'$N': 1, 'NE': 1, 'EL': 1, 'LS': 1, 'SO': 1, "
                "'ON': 1, 'N#': 1})",
            )

        # Intersection, sum and difference of the two tokenizations.
        common = Counter({'$N': 1, 'NE': 1, 'LS': 1, 'N#': 1})
        self.assertEqual(nelson_grams & neilsen_grams, common)
        combined = Counter({
            '$N': 2, 'NE': 2, 'EL': 1, 'LS': 2, 'SO': 1, 'ON': 1,
            'N#': 2, 'EI': 1, 'IL': 1, 'SE': 1, 'EN': 1,
        })
        self.assertEqual(nelson_grams + neilsen_grams, combined)
        only_nelson = Counter({'EL': 1, 'SO': 1, 'ON': 1})
        self.assertEqual(nelson_grams - neilsen_grams, only_nelson)

        doubled = QGrams(scaler='set').tokenize('NELSONNELSON')
        self.assertEqual(doubled.count(), 8)

        ssk_weighted = QSkipgrams(scaler='SSK').tokenize('NELSON')
        self.assertAlmostEqual(ssk_weighted.count(), 18.66784401)

        log_scaled = QSkipgrams(qval=3, scaler=log1p).tokenize('NELSON')
        # The expected weights take only four distinct values; numerically
        # they equal log1p(1), log1p(2), log1p(4) and log1p(8).
        lp1 = 0.6931471805599453
        lp2 = 1.0986122886681096
        lp4 = 1.6094379124341003
        lp8 = 2.1972245773362196
        expected = Counter({
            '$$N': lp2,
            '$$E': lp1,
            '$$L': lp1,
            '$$S': lp1,
            '$$O': lp1,
            '$$#': lp2,
            '$NE': lp2,
            '$NL': lp2,
            '$NS': lp2,
            '$NO': lp2,
            '$NN': lp2,
            '$N#': lp8,
            '$EL': lp2,
            '$ES': lp2,
            '$EO': lp2,
            '$EN': lp2,
            '$E#': lp4,
            '$LS': lp2,
            '$LO': lp2,
            '$LN': lp2,
            '$L#': lp4,
            '$SO': lp2,
            '$SN': lp2,
            '$S#': lp4,
            '$ON': lp2,
            '$O#': lp4,
            '$##': lp2,
            'NEL': lp1,
            'NES': lp1,
            'NEO': lp1,
            'NEN': lp1,
            'NE#': lp2,
            'NLS': lp1,
            'NLO': lp1,
            'NLN': lp1,
            'NL#': lp2,
            'NSO': lp1,
            'NSN': lp1,
            'NS#': lp2,
            'NON': lp1,
            'NO#': lp2,
            'NN#': lp2,
            'N##': lp2,
            'ELS': lp1,
            'ELO': lp1,
            'ELN': lp1,
            'EL#': lp2,
            'ESO': lp1,
            'ESN': lp1,
            'ES#': lp2,
            'EON': lp1,
            'EO#': lp2,
            'EN#': lp2,
            'E##': lp1,
            'LSO': lp1,
            'LSN': lp1,
            'LS#': lp2,
            'LON': lp1,
            'LO#': lp2,
            'LN#': lp2,
            'L##': lp1,
            'SON': lp1,
            'SO#': lp2,
            'SN#': lp2,
            'S##': lp1,
            'ON#': lp2,
            'O##': lp1,
        })
        observed = log_scaled.get_counter()
        # Every produced token must match its expected weight; a token
        # absent from ``expected`` reads as 0 there and so fails.
        for token in observed:
            self.assertAlmostEqual(observed[token], expected[token])

        entropy_scaled = QSkipgrams(scaler='entropy').tokenize('NELSON')
        self.assertAlmostEqual(entropy_scaled.count(), 4.6644977792)
Beispiel #12
0
    WhitespaceTokenizer,
    WordpunctTokenizer,
)

from nltk import TweetTokenizer

from . import EXTREME_TEST, _corpus_file, _fuzz, _random_char

# Short algorithm name -> bound ``tokenize`` method of a freshly built
# tokenizer. All are default-constructed except the NLTK wrapper, which is
# given nltk's TweetTokenizer.
algorithms = {
    name: tokenizer.tokenize
    for name, tokenizer in (
        ('corvcluster', COrVClusterTokenizer()),
        ('cvcluster', CVClusterTokenizer()),
        ('character', CharacterTokenizer()),
        ('legalipy', LegaliPyTokenizer()),
        ('nltk', NLTKTokenizer(nltk_tokenizer=TweetTokenizer())),
        ('qgrams', QGrams()),
        ('qskipgrams', QSkipgrams()),
        ('regexp', RegexpTokenizer()),
        ('saps', SAPSTokenizer()),
        ('sonoripy', SonoriPyTokenizer()),
        ('vccluster', VCClusterTokenizer()),
        ('whitespace', WhitespaceTokenizer()),
        ('wordpunct', WordpunctTokenizer()),
    )
}


class BigListOfNaughtyStringsTestCases(unittest.TestCase):
    """Test each tokenizer against the BLNS set.

    Here, we test each algorithm against each string, but we only care that it
    does not result in an exception.
    def test__tokenizer(self):
        """Test abydos.tokenizer._Tokenizer."""
        self.assertEqual(
            _Tokenizer().tokenize('').get_counter(), Counter({'': 1})
        )
        self.assertEqual(
            _Tokenizer().tokenize('a').get_counter(), Counter({'a': 1})
        )

        self.assertEqual(
            _Tokenizer().tokenize('NELSON').get_counter(),
            Counter({'NELSON': 1}),
        )
        self.assertEqual(
            _Tokenizer().tokenize('NEILSEN').get_counter(),
            Counter({'NEILSEN': 1}),
        )
        self.assertEqual(_Tokenizer().tokenize('NEILSEN').count(), 1)
        self.assertEqual(_Tokenizer().tokenize('NEILSEN').count_unique(), 1)

        tweet = 'Good to be home for a night'
        self.assertEqual(
            _Tokenizer().tokenize(tweet).get_counter(),
            Counter({'Good to be home for a night': 1}),
        )

        nelson = QGrams().tokenize('NELSON')
        neilsen = QGrams().tokenize('NEILSEN')
        self.assertEqual(
            nelson.get_set(), {'$N', 'EL', 'LS', 'N#', 'NE', 'ON', 'SO'}
        )
        self.assertEqual(
            nelson.get_list(), ['$N', 'NE', 'EL', 'LS', 'SO', 'ON', 'N#']
        )
        if sys.version_info >= (3, 6):
            self.assertEqual(
                repr(nelson),
                "QGrams({'$N': 1, 'NE': 1, 'EL': 1, 'LS': 1, 'SO': 1, 'ON': 1, \
'N#': 1})",
            )
        self.assertEqual(
            nelson & neilsen, Counter({'$N': 1, 'NE': 1, 'LS': 1, 'N#': 1})
        )
        self.assertEqual(
            nelson + neilsen,
            Counter(
                {
                    '$N': 2,
                    'NE': 2,
                    'EL': 1,
                    'LS': 2,
                    'SO': 1,
                    'ON': 1,
                    'N#': 2,
                    'EI': 1,
                    'IL': 1,
                    'SE': 1,
                    'EN': 1,
                }
            ),
        )
        self.assertEqual(
            nelson - neilsen, Counter({'EL': 1, 'SO': 1, 'ON': 1})
        )

        nelsonnelson = QGrams(scaler='set').tokenize('NELSONNELSON')
        self.assertEqual(nelsonnelson.count(), 8)

        nelson_ssk = QSkipgrams(scaler='SSK').tokenize('NELSON')
        self.assertAlmostEqual(nelson_ssk.count(), 18.66784401)

        nelson_log = QSkipgrams(qval=3, scaler=log1p).tokenize('NELSON')
        gold_standard = Counter(
            {
                '$$N': 1.0986122886681096,
                '$$E': 0.6931471805599453,
                '$$L': 0.6931471805599453,
                '$$S': 0.6931471805599453,
                '$$O': 0.6931471805599453,
                '$$#': 1.0986122886681096,
                '$NE': 1.0986122886681096,
                '$NL': 1.0986122886681096,
                '$NS': 1.0986122886681096,
                '$NO': 1.0986122886681096,
                '$NN': 1.0986122886681096,
                '$N#': 2.1972245773362196,
                '$EL': 1.0986122886681096,
                '$ES': 1.0986122886681096,
                '$EO': 1.0986122886681096,
                '$EN': 1.0986122886681096,
                '$E#': 1.6094379124341003,
                '$LS': 1.0986122886681096,
                '$LO': 1.0986122886681096,
                '$LN': 1.0986122886681096,
                '$L#': 1.6094379124341003,
                '$SO': 1.0986122886681096,
                '$SN': 1.0986122886681096,
                '$S#': 1.6094379124341003,
                '$ON': 1.0986122886681096,
                '$O#': 1.6094379124341003,
                '$##': 1.0986122886681096,
                'NEL': 0.6931471805599453,
                'NES': 0.6931471805599453,
                'NEO': 0.6931471805599453,
                'NEN': 0.6931471805599453,
                'NE#': 1.0986122886681096,
                'NLS': 0.6931471805599453,
                'NLO': 0.6931471805599453,
                'NLN': 0.6931471805599453,
                'NL#': 1.0986122886681096,
                'NSO': 0.6931471805599453,
                'NSN': 0.6931471805599453,
                'NS#': 1.0986122886681096,
                'NON': 0.6931471805599453,
                'NO#': 1.0986122886681096,
                'NN#': 1.0986122886681096,
                'N##': 1.0986122886681096,
                'ELS': 0.6931471805599453,
                'ELO': 0.6931471805599453,
                'ELN': 0.6931471805599453,
                'EL#': 1.0986122886681096,
                'ESO': 0.6931471805599453,
                'ESN': 0.6931471805599453,
                'ES#': 1.0986122886681096,
                'EON': 0.6931471805599453,
                'EO#': 1.0986122886681096,
                'EN#': 1.0986122886681096,
                'E##': 0.6931471805599453,
                'LSO': 0.6931471805599453,
                'LSN': 0.6931471805599453,
                'LS#': 1.0986122886681096,
                'LON': 0.6931471805599453,
                'LO#': 1.0986122886681096,
                'LN#': 1.0986122886681096,
                'L##': 0.6931471805599453,
                'SON': 0.6931471805599453,
                'SO#': 1.0986122886681096,
                'SN#': 1.0986122886681096,
                'S##': 0.6931471805599453,
                'ON#': 1.0986122886681096,
                'O##': 0.6931471805599453,
            }
        )
        test_counter = nelson_log.get_counter()
        for key in test_counter:
            self.assertAlmostEqual(test_counter[key], gold_standard[key])