Esempio n. 1
0
    def test_glyphs(self):
        """Test the correct detection and rendering of glyphs.

        The characters in the resulting token should be the characters that
        are the content of the g tag.

        Two documents are tokenized. Every token whose literal form ('lit')
        contains a TEI <g> element is looked up in a table of expected
        renderings and its text ('t') must match:

        * self.testdoc: each expected literal must be seen exactly once; a
          literal that is missing from the table (or seen again after it was
          consumed) fails as "not covered by testdata".
        * self.testdoc_special: each expected literal must be seen exactly
          'occurrence' times.
        """
        testdata = {'յեգի<g xmlns="http://www.tei-c.org/ns/1.0" ref="#&#x57A;&#x57F;"/>ոս': 'յեգիոս',
                    'յ<g xmlns="http://www.tei-c.org/ns/1.0" ref="&#x561;&#x577;&#x56D;&#x561;&#x580;&#x570;">աշխար</g>հն': 'յաշխարհն',
                    '<g xmlns="http://www.tei-c.org/ns/1.0" ref="">աշխարհ</g>ին': 'աշխարհին',
                    'ար<g xmlns="http://www.tei-c.org/ns/1.0" ref="">ա</g>պ<lb xmlns="http://www.tei-c.org/ns/1.0" xml:id="l101276841" n="14"/>կաց': 'արապկաց',
                    '<g xmlns="http://www.tei-c.org/ns/1.0" ref="">աշխարհ</g>ն': 'աշխարհն'}

        testdata_special = {'յեգի<g xmlns="http://www.tei-c.org/ns/1.0" ref="#ptlig">պտ</g>ոս': {'token': 'յեգիպտոս', 'occurrence': 1},
                            'յ<g xmlns="http://www.tei-c.org/ns/1.0" ref="#asxarh">աշխար</g>հն': {'token': 'յաշխարհն', 'occurrence': 1},
                            '<g xmlns="http://www.tei-c.org/ns/1.0" ref="#asxarh">աշխարհ</g>ին': {'token': 'աշխարհին', 'occurrence': 2},
                            'ար<g xmlns="http://www.tei-c.org/ns/1.0" ref="#avar">ա</g>պ<lb xmlns="http://www.tei-c.org/ns/1.0" xml:id="l101276841" n="14"/>կաց': {'token': 'արապկաց', 'occurrence': 1},
                            '<g xmlns="http://www.tei-c.org/ns/1.0" ref="#asxarh">աշխարհ</g>ն': {'token': 'աշխարհն', 'occurrence': 1}}

        # Substring that identifies a token containing a glyph element.
        glyph_marker = '<g xmlns="http://www.tei-c.org/ns/1.0" ref="'
        not_covered = "Error in rendering glyphs (input data not covered by testdata)"
        wrong_rendering = "Error in rendering glyphs"

        tokens = wordtokenize.from_etree(self.testdoc)
        # Find the tokens that have our substitution
        for t in tokens:
            lit = t['lit']
            if glyph_marker not in lit:
                continue
            self.assertIn(lit, testdata, not_covered)
            # assertEqual (rather than assertTrue(a == b)) reports both values.
            self.assertEqual(t['t'], testdata[lit], wrong_rendering)
            # Consume the entry so that leftovers reveal tokens never seen.
            del testdata[lit]
        # Comparing against {} makes the failure output list the missing tokens.
        self.assertEqual(testdata, {}, "Did not find any test token")

        tokens = wordtokenize.from_etree(self.testdoc_special)
        # Find the tokens that have our substitution
        for t in tokens:
            lit = t['lit']
            if glyph_marker not in lit:
                continue
            self.assertIn(lit, testdata_special, not_covered)
            expected = testdata_special[lit]
            self.assertEqual(t['t'], expected['token'], wrong_rendering)
            # Count down the remaining expected sightings of this literal and
            # drop the entry once all of them have been observed.
            expected['occurrence'] -= 1
            if expected['occurrence'] == 0:
                del testdata_special[lit]
        self.assertEqual(testdata_special, {}, "Did not find any test token")
Esempio n. 2
0
 def test_substitution_layer(self):
     """Test that the first_layer option works correctly.

     Tokenizes self.testdoc with first_layer=True, locates the token whose
     literal form ('lit') carries the <del> element, and checks that its
     text ('t') includes the deleted character. Fails if no such token is
     produced at all.
     """
     target_lit = 'դե<del xmlns="http://www.tei-c.org/ns/1.0">ղ</del>ևս'
     tokens = wordtokenize.from_etree(self.testdoc, first_layer=True)
     # Find the token that has our substitution
     for t in tokens:
         if t['lit'] == target_lit:
             self.assertEqual(t['t'], 'դեղևս')
             break
     else:
         # The loop ran to completion without hitting the target token.
         self.fail("Did not find the testing token")
Esempio n. 3
0
 def test_substitution(self):
     """Test that the correct words are picked out of a subst tag.

     Tokenizes self.testdoc with default options, locates the token whose
     literal form ('lit') carries the <add> element, and checks that its
     text ('t') includes the added character. Fails if no such token is
     produced at all.
     """
     target_lit = 'դե<add xmlns="http://www.tei-c.org/ns/1.0">ռ</add>ևս'
     tokens = wordtokenize.from_etree(self.testdoc)
     # Find the token that has our substitution
     for t in tokens:
         if t['lit'] == target_lit:
             self.assertEqual(t['t'], 'դեռևս')
             break
     else:
         # The loop ran to completion without hitting the target token.
         self.fail("Did not find the testing token")