def test_saps_tokenizer(self):
    """Test abydos.tokenizer.SAPSTokenizer."""
    self.assertEqual(sorted(SAPSTokenizer().tokenize('').get_list()), [])
    self.assertEqual(
        sorted(SAPSTokenizer().tokenize('a').get_list()), ['a']
    )

    tok = SAPSTokenizer()

    self.assertEqual(
        sorted(tok.tokenize('nelson').get_list()), sorted(['nel', 'son'])
    )
    self.assertEqual(
        sorted(tok.tokenize('neilson').get_list()),
        sorted(['neil', 'son']),
    )
    self.assertEqual(
        sorted(tok.tokenize('peninsular').get_list()),
        sorted(['pe', 'nin', 'su', 'lar']),
    )
    self.assertEqual(
        sorted(tok.tokenize('spectacular').get_list()),
        sorted(['s', 'pec', 'ta', 'cu', 'lar']),
    )
    self.assertEqual(
        sorted(tok.tokenize('sufficiently').get_list()),
        sorted(['suf', 'fi', 'cien', 't', 'ly']),
    )
    self.assertEqual(
        sorted(tok.tokenize('yachting').get_list()),
        sorted(['yac', 'h', 'tin', 'g']),
    )
    self.assertEqual(
        sorted(tok.tokenize('caterpillars').get_list()),
        sorted(['ca', 'ter', 'pil', 'lar', 's']),
    )
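
# Note: SAPS (Syllable Alignment Pattern Searching) segments a word into
# syllable-like units, which is why the expected tokens above are
# variable-length chunks rather than fixed-width q-grams. Consonants the
# algorithm splits off on their own (e.g. the 's' in 'spectacular' or the
# 'h' in 'yachting') appear as one-character tokens.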
class BLEUTestCases(unittest.TestCase):
    """Test BLEU functions.

    abydos.distance.BLEU
    """

    cmp = BLEU()
    cmp_skip_saps = BLEU(
        tokenizers=[QSkipgrams(), SAPSTokenizer()], weights=[0.33, 0.67]
    )

    def test_bleu_sim(self):
        """Test abydos.distance.BLEU.sim."""
        # Base cases
        self.assertEqual(self.cmp.sim('', ''), 0.0)
        self.assertEqual(self.cmp.sim('a', ''), 0.0)
        self.assertEqual(self.cmp.sim('', 'a'), 0.0)
        self.assertEqual(self.cmp.sim('abc', ''), 0.0)
        self.assertEqual(self.cmp.sim('', 'abc'), 0.0)
        self.assertEqual(self.cmp.sim('abc', 'abc'), 1.0)
        self.assertEqual(self.cmp.sim('abcd', 'efgh'), 0.0)

        self.assertAlmostEqual(self.cmp.sim('Nigel', 'Niall'), 0.6223329773)
        self.assertAlmostEqual(self.cmp.sim('Niall', 'Nigel'), 0.6223329773)
        self.assertAlmostEqual(self.cmp.sim('Colin', 'Coiln'), 0.7071067812)
        self.assertAlmostEqual(self.cmp.sim('Coiln', 'Colin'), 0.7071067812)
        self.assertAlmostEqual(
            self.cmp.sim('ATCAACGAGT', 'AACGATTAG'), 0.5119598032
        )

        self.assertAlmostEqual(
            self.cmp_skip_saps.sim('Nigel', 'Niall'), 0.7828303104
        )

    def test_bleu_dist(self):
        """Test abydos.distance.BLEU.dist."""
        # Base cases
        self.assertEqual(self.cmp.dist('', ''), 1.0)
        self.assertEqual(self.cmp.dist('a', ''), 1.0)
        self.assertEqual(self.cmp.dist('', 'a'), 1.0)
        self.assertEqual(self.cmp.dist('abc', ''), 1.0)
        self.assertEqual(self.cmp.dist('', 'abc'), 1.0)
        self.assertEqual(self.cmp.dist('abc', 'abc'), 0.0)
        self.assertEqual(self.cmp.dist('abcd', 'efgh'), 1.0)

        self.assertAlmostEqual(self.cmp.dist('Nigel', 'Niall'), 0.3776670227)
        self.assertAlmostEqual(self.cmp.dist('Niall', 'Nigel'), 0.3776670227)
        self.assertAlmostEqual(self.cmp.dist('Colin', 'Coiln'), 0.2928932188)
        self.assertAlmostEqual(self.cmp.dist('Coiln', 'Colin'), 0.2928932188)
        self.assertAlmostEqual(
            self.cmp.dist('ATCAACGAGT', 'AACGATTAG'), 0.4880401968
        )
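
# For orientation: classic BLEU (Papineni et al., 2002) scores a pair as
# a brevity penalty times the weighted geometric mean of n-gram
# precisions, and that mean is 0.0 whenever any precision is 0.0 (hence
# the 0.0 base cases above); the constructor used for cmp_skip_saps shows
# that abydos generalizes the n-gram levels to arbitrary tokenizers. A
# runnable sketch of just the combining step, under those assumptions
# (the helper name is ours, not abydos's API):
from math import exp, log


def _weighted_geo_mean_sketch(precisions, weights):
    """Weighted geometric mean; zero if any precision is zero."""
    if min(precisions) <= 0.0:
        return 0.0
    return exp(sum(w * log(p) for w, p in zip(weights, precisions)))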
def test_bag_sim(self):
    """Test abydos.distance.Bag.sim."""
    self.assertEqual(self.cmp.sim('', ''), 1)
    self.assertEqual(self.cmp.sim('nelson', ''), 0)
    self.assertEqual(self.cmp.sim('', 'neilsen'), 0)
    self.assertEqual(self.cmp.sim('ab', 'a'), 0.5)
    self.assertEqual(self.cmp.sim('ab', 'c'), 0)
    self.assertAlmostEqual(self.cmp.sim('nelson', 'neilsen'), 5 / 7)
    self.assertAlmostEqual(self.cmp.sim('neilsen', 'nelson'), 5 / 7)
    self.assertAlmostEqual(self.cmp.sim('niall', 'neal'), 3 / 5)
    self.assertAlmostEqual(self.cmp.sim('aluminum', 'Catalan'), 3 / 8)
    self.assertEqual(self.cmp.sim('abcdefg', 'hijklm'), 0)
    self.assertEqual(self.cmp.sim('abcdefg', 'hijklmno'), 0)
    self.assertEqual(Bag(tokenizer=SAPSTokenizer()).sim('DNA', 'RNA'), 0.5)
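
# For intuition, bag similarity can be modeled with collections.Counter:
# the distance is the larger of the two multiset differences, and the
# similarity normalizes that by the longer input. This sketch (our own
# helper, assuming character tokens rather than a pluggable tokenizer)
# reproduces the plain-Bag expectations above, e.g. 5 / 7 for 'nelson'
# vs. 'neilsen':
from collections import Counter


def _bag_sim_sketch(src, tar):
    """Illustrative bag similarity over character multisets."""
    if src == tar:
        return 1.0
    if not src or not tar:
        return 0.0
    src_bag, tar_bag = Counter(src), Counter(tar)
    dist = max(
        sum((src_bag - tar_bag).values()),
        sum((tar_bag - src_bag).values()),
    )
    return 1 - dist / max(len(src), len(tar))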
)

from nltk import TweetTokenizer

from . import EXTREME_TEST, _corpus_file, _fuzz, _random_char

algorithms = {
    'corvcluster': COrVClusterTokenizer().tokenize,
    'cvcluster': CVClusterTokenizer().tokenize,
    'character': CharacterTokenizer().tokenize,
    'legalipy': LegaliPyTokenizer().tokenize,
    'nltk': NLTKTokenizer(nltk_tokenizer=TweetTokenizer()).tokenize,
    'qgrams': QGrams().tokenize,
    'qskipgrams': QSkipgrams().tokenize,
    'regexp': RegexpTokenizer().tokenize,
    'saps': SAPSTokenizer().tokenize,
    'sonoripy': SonoriPyTokenizer().tokenize,
    'vccluster': VCClusterTokenizer().tokenize,
    'whitespace': WhitespaceTokenizer().tokenize,
    'wordpunct': WordpunctTokenizer().tokenize,
}


class BigListOfNaughtyStringsTestCases(unittest.TestCase):
    """Test each tokenizer against the BLNS set.

    Here, we test each algorithm against each string, but we only care
    that it does not result in an exception. While not actually a fuzz
    test, this does serve the purpose of looking for errors resulting
    from unanticipated input.
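
    A minimal sketch of the pattern (illustrative only; ``blns`` stands
    in for the loaded corpus and is not a name defined here):

        for algo in algorithms.values():
            for naughty_string in blns:
                algo(naughty_string)  # passes as long as nothing raises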