Example 1
    def test_saps_tokenizer(self):
        """Test abydos.tokenizer.SAPSTokenizer."""
        self.assertEqual(sorted(SAPSTokenizer().tokenize('').get_list()), [])
        self.assertEqual(
            sorted(SAPSTokenizer().tokenize('a').get_list()), ['a']
        )

        tok = SAPSTokenizer()

        self.assertEqual(
            sorted(tok.tokenize('nelson').get_list()), sorted(['nel', 'son'])
        )
        self.assertEqual(
            sorted(tok.tokenize('neilson').get_list()), sorted(['neil', 'son'])
        )
        self.assertEqual(
            sorted(tok.tokenize('peninsular').get_list()),
            sorted(['pe', 'nin', 'su', 'lar']),
        )
        self.assertEqual(
            sorted(tok.tokenize('spectacular').get_list()),
            sorted(['s', 'pec', 'ta', 'cu', 'lar']),
        )
        self.assertEqual(
            sorted(tok.tokenize('sufficiently').get_list()),
            sorted(['suf', 'fi', 'cien', 't', 'ly']),
        )
        self.assertEqual(
            sorted(tok.tokenize('yachting').get_list()),
            sorted(['yac', 'h', 'tin', 'g']),
        )
        self.assertEqual(
            sorted(tok.tokenize('caterpillars').get_list()),
            sorted(['ca', 'ter', 'pil', 'lar', 's']),
        )
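A minimal usage sketch of the same calls outside the test harness, relying only on the tokenize()/get_list() chain exercised above (the assertions compare sorted token lists, so sorted output is printed here as well):

from abydos.tokenizer import SAPSTokenizer

# tokenize() returns an object exposing get_list(), so the SAPS syllable
# tokens can be read back directly; sorted() mirrors the comparisons made
# in the assertions above.
tok = SAPSTokenizer()
for word in ('nelson', 'peninsular', 'caterpillars'):
    print(word, sorted(tok.tokenize(word).get_list()))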
Example 2
class BLEUTestCases(unittest.TestCase):
    """Test BLEU functions.

    abydos.distance.BLEU
    """

    cmp = BLEU()
    cmp_skip_saps = BLEU(
        tokenizers=[QSkipgrams(), SAPSTokenizer()], weights=[0.33, 0.67]
    )

    def test_bleu_sim(self):
        """Test abydos.distance.BLEU.sim."""
        # Base cases
        self.assertEqual(self.cmp.sim('', ''), 0.0)
        self.assertEqual(self.cmp.sim('a', ''), 0.0)
        self.assertEqual(self.cmp.sim('', 'a'), 0.0)
        self.assertEqual(self.cmp.sim('abc', ''), 0.0)
        self.assertEqual(self.cmp.sim('', 'abc'), 0.0)
        self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0)
        self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0)

        self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.6223329773)
        self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.6223329773)
        self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.7071067812)
        self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.7071067812)
        self.assertAlmostEqual(
            self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.5119598032
        )

        self.assertAlmostEqual(
            self.cmp_skip_saps.sim('Nigel', 'Niall'), 0.7828303104
        )

    def test_bleu_dist(self):
        """Test abydos.distance.BLEU.dist."""
        # Base cases
        self.assertEqual(self.cmp.dist('', ''), 1.0)
        self.assertEqual(self.cmp.dist('a', ''), 1.0)
        self.assertEqual(self.cmp.dist('', 'a'), 1.0)
        self.assertEqual(self.cmp.dist('abc', ''), 1.0)
        self.assertEqual(self.cmp.dist('', 'abc'), 1.0)
        self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0)
        self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0)

        self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.3776670227)
        self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.3776670227)
        self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.2928932188)
        self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.2928932188)
        self.assertAlmostEqual(
            self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.4880401968
        )
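The paired assertions show that dist() is simply the complement of sim() (for example, 0.6223329773 + 0.3776670227 = 1.0). A minimal sketch of the same comparisons, assuming abydos is installed:

from abydos.distance import BLEU
from abydos.tokenizer import QSkipgrams, SAPSTokenizer

cmp = BLEU()
cmp_skip_saps = BLEU(
    tokenizers=[QSkipgrams(), SAPSTokenizer()], weights=[0.33, 0.67]
)

print(cmp.sim('Nigel', 'Niall'))            # ~0.6223329773
print(cmp.dist('Nigel', 'Niall'))           # ~0.3776670227 (= 1 - sim)
print(cmp_skip_saps.sim('Nigel', 'Niall'))  # ~0.7828303104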
Example 3
    def test_bag_sim(self):
        """Test abydos.distance.Bag.sim."""
        self.assertEqual(self.cmp.sim('', ''), 1)
        self.assertEqual(self.cmp.sim('nelson', ''), 0)
        self.assertEqual(self.cmp.sim('', 'neilsen'), 0)
        self.assertEqual(self.cmp.sim('ab', 'a'), 0.5)
        self.assertEqual(self.cmp.sim('ab', 'c'), 0)
        self.assertAlmostEqual(self.cmp.sim('nelson', 'neilsen'), 5 / 7)
        self.assertAlmostEqual(self.cmp.sim('neilsen', 'nelson'), 5 / 7)
        self.assertAlmostEqual(self.cmp.sim('niall', 'neal'), 3 / 5)
        self.assertAlmostEqual(self.cmp.sim('aluminum', 'Catalan'), 3 / 8)
        self.assertEqual(self.cmp.sim('abcdefg', 'hijklm'), 0)
        self.assertEqual(self.cmp.sim('abcdefg', 'hijklmno'), 0)

        self.assertEqual(Bag(tokenizer=SAPSTokenizer()).sim('DNA', 'RNA'), 0.5)
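A minimal sketch of the same Bag comparisons; self.cmp is assumed here to be a plain Bag() instance, since the test fixture is not shown in this excerpt:

from abydos.distance import Bag
from abydos.tokenizer import SAPSTokenizer

cmp = Bag()  # assumed equivalent to the test's self.cmp

print(cmp.sim('nelson', 'neilsen'))  # ~0.714286 (5 / 7)
print(cmp.sim('niall', 'neal'))      # 0.6 (3 / 5)

# A Bag measure built over SAPS syllable tokens, as in the last assertion:
print(Bag(tokenizer=SAPSTokenizer()).sim('DNA', 'RNA'))  # 0.5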
Example 4
import unittest

from abydos.tokenizer import (
    COrVClusterTokenizer,
    CVClusterTokenizer,
    CharacterTokenizer,
    LegaliPyTokenizer,
    NLTKTokenizer,
    QGrams,
    QSkipgrams,
    RegexpTokenizer,
    SAPSTokenizer,
    SonoriPyTokenizer,
    VCClusterTokenizer,
    WhitespaceTokenizer,
    WordpunctTokenizer,
)

from nltk import TweetTokenizer

from . import EXTREME_TEST, _corpus_file, _fuzz, _random_char

algorithms = {
    'corvcluster': COrVClusterTokenizer().tokenize,
    'cvcluster': CVClusterTokenizer().tokenize,
    'character': CharacterTokenizer().tokenize,
    'legalipy': LegaliPyTokenizer().tokenize,
    'nltk': NLTKTokenizer(nltk_tokenizer=TweetTokenizer()).tokenize,
    'qgrams': QGrams().tokenize,
    'qskipgrams': QSkipgrams().tokenize,
    'regexp': RegexpTokenizer().tokenize,
    'saps': SAPSTokenizer().tokenize,
    'sonoripy': SonoriPyTokenizer().tokenize,
    'vccluster': VCClusterTokenizer().tokenize,
    'whitespace': WhitespaceTokenizer().tokenize,
    'wordpunct': WordpunctTokenizer().tokenize,
}
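The dict above binds a short name to each tokenizer's tokenize method so the test classes below can loop over all algorithms uniformly. The following illustrative sketch, not part of the original module, shows one way such a loop would run; it assumes each tokenize() call returns an object exposing get_list(), as in the SAPSTokenizer example above:

sample = 'caterpillars'
for name, tokenize in algorithms.items():
    try:
        tokens = sorted(tokenize(sample).get_list())
    except Exception as exc:  # the tests below only care about unexpected exceptions
        print(name, 'raised', repr(exc))
    else:
        print(name, tokens)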


class BigListOfNaughtyStringsTestCases(unittest.TestCase):
    """Test each tokenizer against the BLNS set.

    Here, we test each algorithm against each string, but we only care that it
    does not result in an exception.

    While not actually a fuzz test, this does serve the purpose of looking for
    errors resulting from unanticipated input.