Beispiel #1
0
 def test_UnicodeTag(self):
     """Check that a unicode language tag works for raw requests and Dict.

     The tag is first handed to the broker's low-level dictionary request
     (the returned handle is released again straight away) and then to the
     ``Dict`` constructor; both results must be truthy.
     """
     tag = raw_unicode("en_US")
     handle = self.broker._request_dict_data(tag)
     self.assertTrue(handle)
     # Release the raw handle through the low-level binding.
     _e.broker_free_dict(self.broker._this, handle)
     wrapped = Dict(raw_unicode("en_US"))
     self.assertTrue(wrapped)
Beispiel #2
0
 def test_UnicodeTag(self):
     """Check that a unicode language tag works for raw requests and Dict.

     The tag is first handed to the broker's low-level dictionary request
     (the returned handle is released again straight away) and then to the
     ``Dict`` constructor; both results must be truthy.
     """
     tag = raw_unicode("en_US")
     handle = self.broker._request_dict_data(tag)
     self.assertTrue(handle)
     # Give the raw handle back to the broker.
     self.broker._free_dict_data(handle)
     wrapped = Dict(raw_unicode("en_US"))
     self.assertTrue(wrapped)
Beispiel #3
0
    def test_finnish_text(self):
        """Check the tokens and offsets produced for a Finnish paragraph.

        The text is run through ``tokenize_en`` and every (word, offset)
        pair is compared against a hand-written expectation.  The sample
        is rich in non-ASCII characters and also contains a digit, a
        hyphenated compound, quoted words and an in-word apostrophe.
        """
        inputT = raw_unicode('T\\xe4m\\xe4 on kappale. Eip\\xe4 ole kovin 2 nen, mutta tarkoitus on n\\xe4ytt\\xe4\\xe4 miten sanastaja \\ntoimii useiden-erilaisten sanarypp\\xe4iden kimpussa.\\nPit\\xe4\\xe4p\\xe4 viel\\xe4 \'tarkistaa\' sanat jotka "lainausmerkeiss\\xe4". Heittomerkki ja vaa\'an.\\nUlkomaisia sanoja s\\xfcss, spa\\xdf.')
        outputT = [
            (raw_unicode('T\\xe4m\\xe4'), 0),
            (raw_unicode('on'), 5),
            (raw_unicode('kappale'), 8),
            (raw_unicode('Eip\\xe4'), 17),
            (raw_unicode('ole'), 22),
            (raw_unicode('kovin'), 26),
            (raw_unicode('nen'), 34),
            (raw_unicode('mutta'), 39),
            (raw_unicode('tarkoitus'), 45),
            (raw_unicode('on'), 55),
            (raw_unicode('n\\xe4ytt\\xe4\\xe4'), 58),
            (raw_unicode('miten'), 66),
            (raw_unicode('sanastaja'), 72),
            (raw_unicode('toimii'), 83),
            (raw_unicode('useiden'), 90),
            (raw_unicode('erilaisten'), 98),
            (raw_unicode('sanarypp\\xe4iden'), 109),
            (raw_unicode('kimpussa'), 123),
            (raw_unicode('Pit\\xe4\\xe4p\\xe4'), 133),
            (raw_unicode('viel\\xe4'), 141),
            (raw_unicode('tarkistaa'), 148),
            (raw_unicode('sanat'), 159),
            (raw_unicode('jotka'), 165),
            (raw_unicode('lainausmerkeiss\\xe4'), 172),
            (raw_unicode('Heittomerkki'), 191),
            (raw_unicode('ja'), 204),
            (raw_unicode("vaa'an"), 207),
            (raw_unicode('Ulkomaisia'), 215),
            (raw_unicode('sanoja'), 226),
            (raw_unicode('s\\xfcss'), 233),
            (raw_unicode('spa\\xdf'), 239),
        ]
        # Materialise the tokens first: zip() stops at the shorter sequence,
        # so without the length check below a tokenizer that yields too few
        # (or too many) tokens would still pass.
        tokens = list(tokenize_en(inputT))
        for (itmO, itmV) in zip(outputT, tokens):
            self.assertEqual(itmO, itmV)
        self.assertEqual(len(outputT), len(tokens))
Beispiel #4
0
    def test_finnish_text(self):
        """Check the tokens and offsets produced for a Finnish paragraph.

        The text is run through ``tokenize_en`` and every (word, offset)
        pair is compared against a hand-written expectation.  The sample
        is rich in non-ASCII characters and also contains a digit, a
        hyphenated compound, quoted words and an in-word apostrophe.
        """
        inputT = raw_unicode('T\\xe4m\\xe4 on kappale. Eip\\xe4 ole kovin 2 nen, mutta tarkoitus on n\\xe4ytt\\xe4\\xe4 miten sanastaja \\ntoimii useiden-erilaisten sanarypp\\xe4iden kimpussa.\\nPit\\xe4\\xe4p\\xe4 viel\\xe4 \'tarkistaa\' sanat jotka "lainausmerkeiss\\xe4". Heittomerkki ja vaa\'an.\\nUlkomaisia sanoja s\\xfcss, spa\\xdf.')
        outputT = [
            (raw_unicode('T\\xe4m\\xe4'), 0),
            (raw_unicode('on'), 5),
            (raw_unicode('kappale'), 8),
            (raw_unicode('Eip\\xe4'), 17),
            (raw_unicode('ole'), 22),
            (raw_unicode('kovin'), 26),
            (raw_unicode('nen'), 34),
            (raw_unicode('mutta'), 39),
            (raw_unicode('tarkoitus'), 45),
            (raw_unicode('on'), 55),
            (raw_unicode('n\\xe4ytt\\xe4\\xe4'), 58),
            (raw_unicode('miten'), 66),
            (raw_unicode('sanastaja'), 72),
            (raw_unicode('toimii'), 83),
            (raw_unicode('useiden'), 90),
            (raw_unicode('erilaisten'), 98),
            (raw_unicode('sanarypp\\xe4iden'), 109),
            (raw_unicode('kimpussa'), 123),
            (raw_unicode('Pit\\xe4\\xe4p\\xe4'), 133),
            (raw_unicode('viel\\xe4'), 141),
            (raw_unicode('tarkistaa'), 148),
            (raw_unicode('sanat'), 159),
            (raw_unicode('jotka'), 165),
            (raw_unicode('lainausmerkeiss\\xe4'), 172),
            (raw_unicode('Heittomerkki'), 191),
            (raw_unicode('ja'), 204),
            (raw_unicode("vaa'an"), 207),
            (raw_unicode('Ulkomaisia'), 215),
            (raw_unicode('sanoja'), 226),
            (raw_unicode('s\\xfcss'), 233),
            (raw_unicode('spa\\xdf'), 239),
        ]
        # Materialise the tokens first: zip() stops at the shorter sequence,
        # so without the length check below a tokenizer that yields too few
        # (or too many) tokens would still pass.
        tokens = list(tokenize_en(inputT))
        for (itmO, itmV) in zip(outputT, tokens):
            self.assertEqual(itmO, itmV)
        self.assertEqual(len(outputT), len(tokens))
Beispiel #5
0
 def test_unicodeCombining(self):
     """Tokenize text in which some accents are written as combining marks.

     The sample uses the combining characters U+0308 and U+0301 next to
     precomposed accented letters.  The expected words come from splitting
     the text on spaces, with the trailing comma removed from the ninth
     word.  Each token must equal the expected word, and its offset must
     point at that word inside the text.
     """
     text = raw_unicode(r"Ik ben gei\u0308nteresseerd in de co\u00F6rdinatie van mijn knie\u00EBn, maar kan niet e\u0301e\u0301n \u00E0 twee enqu\u00EAtes vinden die recht doet aan mijn carri\u00E8re op Cura\u00E7ao")
     expected = text.split(" ")
     # Word 8 is followed by a comma in the text; drop that last character.
     expected[8] = expected[8][:-1]
     # Materialise the tokens: zip() stops at the shorter sequence, so the
     # length check below is what catches missing or extra tokens.
     tokens = list(tokenize_en(text))
     for (word, token) in zip(expected, tokens):
         self.assertEqual(word, token[0])
         self.assertTrue(text[token[1]:].startswith(word))
     self.assertEqual(len(expected), len(tokens))
Beispiel #6
0
 def test_unicodeCombining(self):
     """Tokenize text in which some accents are written as combining marks.

     The sample uses the combining characters U+0308 and U+0301 next to
     precomposed accented letters.  The expected words come from splitting
     the text on spaces, with the trailing comma removed from the ninth
     word.  Each token must equal the expected word, and its offset must
     point at that word inside the text.
     """
     text = raw_unicode(r"Ik ben gei\u0308nteresseerd in de co\u00F6rdinatie van mijn knie\u00EBn, maar kan niet e\u0301e\u0301n \u00E0 twee enqu\u00EAtes vinden die recht doet aan mijn carri\u00E8re op Cura\u00E7ao")
     expected = text.split(" ")
     # Word 8 is followed by a comma in the text; drop that last character.
     expected[8] = expected[8][:-1]
     # Materialise the tokens: zip() stops at the shorter sequence, so the
     # length check below is what catches missing or extra tokens.
     tokens = list(tokenize_en(text))
     for (word, token) in zip(expected, tokens):
         self.assertEqual(word, token[0])
         self.assertTrue(text[token[1]:].startswith(word))
     self.assertEqual(len(expected), len(tokens))
Beispiel #7
0
 def test_unicodeBasic(self):
     """Tokenize a unicode string containing precomposed accented letters.

     The expected words come from splitting the text on spaces, with the
     trailing comma removed from the ninth word.  Each token must equal
     the expected word, and its offset must point at that word inside the
     text.
     """
     text = raw_unicode(r"Ik ben ge\u00EFnteresseerd in de co\u00F6rdinatie van mijn knie\u00EBn, maar kan niet \u00E9\u00E9n \u00E0 twee enqu\u00EAtes vinden die recht doet aan mijn carri\u00E8re op Cura\u00E7ao")
     expected = text.split(" ")
     # Word 8 is followed by a comma in the text; drop that last character.
     expected[8] = expected[8][:-1]
     # Materialise the tokens: zip() stops at the shorter sequence, so the
     # length check below is what catches missing or extra tokens.
     tokens = list(tokenize_en(text))
     for (word, token) in zip(expected, tokens):
         self.assertEqual(word, token[0])
         self.assertTrue(text[token[1]:].startswith(word))
     self.assertEqual(len(expected), len(tokens))
Beispiel #8
0
 def test_bug2785373(self):
     """Regression check for bug #2785373.

     Tokenizes the same sentence once as a plain string literal and once
     as a ``raw_unicode`` value.  The tokens are discarded, so the test
     only requires that iteration finishes without raising.
     """
     plain_text = "So, one dey when I wes 17, I left."
     for _token in tokenize_en(plain_text):
         pass
     unicode_text = raw_unicode("So, one dey when I wes 17, I left.")
     for _token in tokenize_en(unicode_text):
         pass
Beispiel #9
0
 def test_unicode1(self):
     """Exercise check() and suggest() with a unicode word.

     The word must be of type ``unicode``, must be rejected by ``check``,
     and every suggestion returned for it must also be ``unicode``.
     """
     # TODO: find something that actually returns suggestions
     word = raw_unicode(r"he\u2149lo")
     self.assertTrue(type(word) is unicode)
     self.assertFalse(self.dict.check(word))
     suggestions = self.dict.suggest(word)
     for suggestion in suggestions:
         self.assertTrue(type(suggestion) is unicode)
Beispiel #10
0
 def test_unicodeBasic(self):
     """Tokenize a unicode string containing precomposed accented letters.

     The expected words come from splitting the text on spaces, with the
     trailing comma removed from the ninth word.  Each token must equal
     the expected word, and its offset must point at that word inside the
     text.
     """
     text = raw_unicode(r"Ik ben ge\u00EFnteresseerd in de co\u00F6rdinatie van mijn knie\u00EBn, maar kan niet \u00E9\u00E9n \u00E0 twee enqu\u00EAtes vinden die recht doet aan mijn carri\u00E8re op Cura\u00E7ao")
     expected = text.split(" ")
     # Word 8 is followed by a comma in the text; drop that last character.
     expected[8] = expected[8][:-1]
     # Materialise the tokens: zip() stops at the shorter sequence, so the
     # length check below is what catches missing or extra tokens.
     tokens = list(tokenize_en(text))
     for (word, token) in zip(expected, tokens):
         self.assertEqual(word, token[0])
         self.assertTrue(text[token[1]:].startswith(word))
     self.assertEqual(len(expected), len(tokens))
Beispiel #11
0
 def test_bug2785373(self):
     """Regression check for bug #2785373.

     Tokenizes the same sentence once as a plain string literal and once
     as a ``raw_unicode`` value.  The tokens are discarded, so the test
     only requires that iteration finishes without raising.
     """
     plain_text = "So, one dey when I wes 17, I left."
     for _token in tokenize_en(plain_text):
         pass
     unicode_text = raw_unicode("So, one dey when I wes 17, I left.")
     for _token in tokenize_en(unicode_text):
         pass
Beispiel #12
0
 def test_unicode1(self):
     """Exercise check() and suggest() with a unicode word.

     The word must be of type ``unicode``, must be rejected by ``check``,
     and every suggestion returned for it must also be ``unicode``.
     """
     # TODO: find something that actually returns suggestions
     word = raw_unicode(r"he\u2149lo")
     self.assertTrue(type(word) is unicode)
     self.assertFalse(self.dict.check(word))
     suggestions = self.dict.suggest(word)
     for suggestion in suggestions:
         self.assertTrue(type(suggestion) is unicode)
Beispiel #13
0
    def test_finnish_text(self):
        """Check the tokens and offsets produced for a Finnish paragraph.

        The text is run through ``tokenize_en`` and every (word, offset)
        pair is compared against a hand-written expectation.  The sample
        is rich in non-ASCII characters and also contains a digit, a
        hyphenated compound, quoted words and an in-word apostrophe.
        """
        inputT = raw_unicode(
            "T\\xe4m\\xe4 on kappale. Eip\\xe4 ole kovin 2 nen, mutta tarkoitus on n\\xe4ytt\\xe4\\xe4 miten sanastaja \\ntoimii useiden-erilaisten sanarypp\\xe4iden kimpussa.\\nPit\\xe4\\xe4p\\xe4 viel\\xe4 'tarkistaa' sanat jotka \"lainausmerkeiss\\xe4\". Heittomerkki ja vaa'an.\\nUlkomaisia sanoja s\\xfcss, spa\\xdf."
        )
        outputT = [
            (raw_unicode("T\\xe4m\\xe4"), 0),
            (raw_unicode("on"), 5),
            (raw_unicode("kappale"), 8),
            (raw_unicode("Eip\\xe4"), 17),
            (raw_unicode("ole"), 22),
            (raw_unicode("kovin"), 26),
            (raw_unicode("nen"), 34),
            (raw_unicode("mutta"), 39),
            (raw_unicode("tarkoitus"), 45),
            (raw_unicode("on"), 55),
            (raw_unicode("n\\xe4ytt\\xe4\\xe4"), 58),
            (raw_unicode("miten"), 66),
            (raw_unicode("sanastaja"), 72),
            (raw_unicode("toimii"), 83),
            (raw_unicode("useiden"), 90),
            (raw_unicode("erilaisten"), 98),
            (raw_unicode("sanarypp\\xe4iden"), 109),
            (raw_unicode("kimpussa"), 123),
            (raw_unicode("Pit\\xe4\\xe4p\\xe4"), 133),
            (raw_unicode("viel\\xe4"), 141),
            (raw_unicode("tarkistaa"), 148),
            (raw_unicode("sanat"), 159),
            (raw_unicode("jotka"), 165),
            (raw_unicode("lainausmerkeiss\\xe4"), 172),
            (raw_unicode("Heittomerkki"), 191),
            (raw_unicode("ja"), 204),
            (raw_unicode("vaa'an"), 207),
            (raw_unicode("Ulkomaisia"), 215),
            (raw_unicode("sanoja"), 226),
            (raw_unicode("s\\xfcss"), 233),
            (raw_unicode("spa\\xdf"), 239),
        ]
        # Materialise the tokens first: zip() stops at the shorter sequence,
        # so without the length check below a tokenizer that yields too few
        # (or too many) tokens would still pass.
        tokens = list(tokenize_en(inputT))
        for (itmO, itmV) in zip(outputT, tokens):
            self.assertEqual(itmO, itmV)
        self.assertEqual(len(outputT), len(tokens))
Beispiel #14
0
 def test_UnicodeInstallPath(self):
     """Install into a directory with non-ASCII characters, then run tests.

     Sets ``self._insDir`` to a unicode directory name before calling
     ``self.install()`` and ``self.runtests()``.
     """
     unicode_dir = raw_unicode(r'test_\xe5\xe4\xf6_ing')
     self._insDir = unicode_dir
     self.install()
     self.runtests()
Beispiel #15
0
 def test_UnicodeCharsInPath(self):
     """Request a PWL dictionary whose file name has non-ASCII characters.

     The dictionary returned by ``request_pwl_dict`` must be truthy.
     """
     self._fileName = raw_unicode(r"test_\xe5\xe4\xf6_ing")
     # NOTE(review): self._path() is defined outside this snippet and
     # presumably builds the path from self._fileName — confirm.
     pwl_path = self._path()
     pwl_dict = request_pwl_dict(pwl_path)
     self.assertTrue(pwl_dict)
Beispiel #16
0
 def test_UnicodeCharsInPath(self):
     """Request a PWL dictionary whose file name has non-ASCII characters.

     The dictionary returned by ``request_pwl_dict`` must be truthy.
     """
     self._fileName = raw_unicode(r"test_\xe5\xe4\xf6_ing")
     # NOTE(review): self._path() is defined outside this snippet and
     # presumably builds the path from self._fileName — confirm.
     pwl_path = self._path()
     pwl_dict = request_pwl_dict(pwl_path)
     self.assertTrue(pwl_dict)
Beispiel #17
0
 def test_UnicodeInstallPath(self):
     """Install into a directory with non-ASCII characters, then run tests.

     Sets ``self._insDir`` to a unicode directory name before calling
     ``self.install()`` and ``self.runtests()``.
     """
     unicode_dir = raw_unicode(r'test_\xe5\xe4\xf6_ing')
     self._insDir = unicode_dir
     self.install()
     self.runtests()