Example #1
0
class TokenizerTestCase(unittest.TestCase):
    """ Tests for tokenizer.py """
    maxDiff = None  # for printing large output

    def setUp(self):
        self.t = Tokenizer(_test_path('test.prf'))

    def test_errors(self):
        t = Tokenizer(_test_path('test.prf'), errors_replace=lambda c: '<{0}>'.format(c))
        self.assertEqual(t('habe'), '<i> a b <e>')

        with self.assertRaises(ValueError):
            t('habe', form='xyz')

        with self.assertRaises(ValueError):
            t('habe', errors='strict')

        self.assertEqual(t('habe', errors='ignore'), 'a b')

    def test_boundaries(self):
        self.assertEqual(self.t('aa aa', separator=' _ '), ' b _  b')

    def test_normalization(self):
        t = Tokenizer()
        s = 'n\u0303a'
        self.assertEqual(t(s), 'n\u0303 a')
        self.assertEqual(t('\xf1a'), 'n\u0303 a')
        self.assertEqual(t(s, form='NFC'), '\xf1 a')

    def test_ipa(self):
        t = Tokenizer()
        self.assertEqual(t('\u02b0ello', ipa=True), '\u02b0e l l o')

    def test_tokenize_with_profile(self):
        self.assertEqual(self.t('aa'), ' b')

    def test_tokenize_with_profile_from_object(self):
        prf = Profile(dict(Grapheme='aa', mapping='xy'), dict(Grapheme='b', mapping='z'))
        self.assertEqual(Tokenizer(profile=prf)('aab', column='mapping'), 'xy z')

    def test_tokenize_without_profile(self):
        self.assertEqual(Tokenizer()('aa', form='NFC'), 'a a')

    def test_printTree(self):
        stream = StringIO()
        self.t.op.tree.printTree(self.t.op.tree.root, stream=stream)
        stream.seek(0)
        self.assertIn('a* -- a*', stream.read().split('\n'))
        printMultigraphs(self.t.op.tree.root, '', '')
        printMultigraphs(self.t.op.tree.root, 'abcd', '')

    def test_characters(self):
        t = Tokenizer()
        result = t.characters("ĉháɾã̌ctʼɛ↗ʐː| k͡p")
        self.assertEqual(result, "c ̂ h a ́ ɾ a ̃ ̌ c t ʼ ɛ ↗ ʐ ː | # k ͡ p")

    def test_grapheme_clusters(self):
        t = Tokenizer()
        result = t.grapheme_clusters("ĉháɾã̌ctʼɛ↗ʐː| k͡p")
        self.assertEqual(result, "ĉ h á ɾ ã̌ c t ʼ ɛ ↗ ʐ ː | # k͡ p")

    def test_graphemes(self):
        t = Tokenizer()
        self.assertEqual(t.graphemes("aabchonn-ih"), "a a b c h o n n - i h")
        self.assertEqual(self.t.graphemes("aabchonn-ih"), "aa b ch on n - ih")

    def test_transform1(self):
        self.assertEqual(self.t.transform("aabchonn-ih"), "aa b ch on n - ih")

        with self.assertRaises(ValueError):
            Tokenizer().transform('abc')

        with self.assertRaises(ValueError):
            self.assertEqual(self.t.transform("aabchonn-ih", 'xx'), "aa b ch on n - ih")

    def test_transform2(self):
        result = self.t.transform("aabchonn-ih", "IPA")
        self.assertEqual(result, "aː b tʃ õ n í")

    def test_transform3(self):
        result = self.t.transform("aabchonn-ih", "XSAMPA")
        self.assertEqual(result, "a: b tS o~ n i_H")

    def test_rules(self):
        self.assertEqual(Tokenizer().rules('abc'), 'abc')
        result = self.t.rules("aabchonn-ih")
        self.assertEqual(result, "  ii-ii")

    def test_transform_rules(self):
        result = self.t.transform_rules("aabchonn-ih")
        self.assertEqual(result, " b b ii - ii")

    def test_find_missing_characters(self):
        result = self.t.find_missing_characters("aa b ch on n - ih x y z")
        self.assertEqual(result, "aa b ch on n - ih \ufffd \ufffd \ufffd")

        t = Tokenizer(_test_path('test.prf'), errors_replace=lambda c: '?')
        result = t.find_missing_characters("aa b ch on n - ih x y z")
        self.assertEqual(result, "aa b ch on n - ih ? ? ?")
Example #2
0
 def test_graphemes(self):
     t = Tokenizer()
     self.assertEqual(t.graphemes("aabchonn-ih"), "a a b c h o n n - i h")
     self.assertEqual(self.t.graphemes("aabchonn-ih"), "aa b ch on n - ih")