def test_UnicodeTag(self):
    """Test that unicode language tags are accepted.

    Exercises both the low-level broker request path and the high-level
    Dict constructor with a unicode tag.
    """
    d1 = self.broker._request_dict_data(raw_unicode("en_US"))
    self.assertTrue(d1)
    # Free through the broker's own helper (as the sibling variant of this
    # test does) rather than calling _e.broker_free_dict directly, so the
    # broker's internal bookkeeping stays in sync with the underlying C
    # object and it does not attempt to free it again on teardown.
    self.broker._free_dict_data(d1)
    d1 = Dict(raw_unicode("en_US"))
    self.assertTrue(d1)
def test_UnicodeTag(self):
    """Test that unicode language tags are accepted."""
    tag = raw_unicode("en_US")
    # Low-level broker request must succeed for a unicode tag.
    dict_data = self.broker._request_dict_data(tag)
    self.assertTrue(dict_data)
    self.broker._free_dict_data(dict_data)
    # The high-level Dict constructor must accept the same unicode tag.
    dict_obj = Dict(tag)
    self.assertTrue(dict_obj)
def test_finnish_text(self):
    """Test tokenizing some Finnish text.

    This really should work since there are no special rules to apply,
    just lots of non-ascii characters.
    """
    text = raw_unicode("T\\xe4m\\xe4 on kappale. Eip\\xe4 ole kovin 2 nen, mutta tarkoitus on n\\xe4ytt\\xe4\\xe4 miten sanastaja \\ntoimii useiden-erilaisten sanarypp\\xe4iden kimpussa.\\nPit\\xe4\\xe4p\\xe4 viel\\xe4 'tarkistaa' sanat jotka \"lainausmerkeiss\\xe4\". Heittomerkki ja vaa'an.\\nUlkomaisia sanoja s\\xfcss, spa\\xdf.")
    # Expected (word, offset) pairs, in order of appearance.
    expected = [
        (raw_unicode("T\\xe4m\\xe4"), 0),
        (raw_unicode("on"), 5),
        (raw_unicode("kappale"), 8),
        (raw_unicode("Eip\\xe4"), 17),
        (raw_unicode("ole"), 22),
        (raw_unicode("kovin"), 26),
        (raw_unicode("nen"), 34),
        (raw_unicode("mutta"), 39),
        (raw_unicode("tarkoitus"), 45),
        (raw_unicode("on"), 55),
        (raw_unicode("n\\xe4ytt\\xe4\\xe4"), 58),
        (raw_unicode("miten"), 66),
        (raw_unicode("sanastaja"), 72),
        (raw_unicode("toimii"), 83),
        (raw_unicode("useiden"), 90),
        (raw_unicode("erilaisten"), 98),
        (raw_unicode("sanarypp\\xe4iden"), 109),
        (raw_unicode("kimpussa"), 123),
        (raw_unicode("Pit\\xe4\\xe4p\\xe4"), 133),
        (raw_unicode("viel\\xe4"), 141),
        (raw_unicode("tarkistaa"), 148),
        (raw_unicode("sanat"), 159),
        (raw_unicode("jotka"), 165),
        (raw_unicode("lainausmerkeiss\\xe4"), 172),
        (raw_unicode("Heittomerkki"), 191),
        (raw_unicode("ja"), 204),
        (raw_unicode("vaa'an"), 207),
        (raw_unicode("Ulkomaisia"), 215),
        (raw_unicode("sanoja"), 226),
        (raw_unicode("s\\xfcss"), 233),
        (raw_unicode("spa\\xdf"), 239),
    ]
    for want, got in zip(expected, tokenize_en(text)):
        self.assertEqual(want, got)
def test_finnish_text(self):
    """Test tokenizing some Finnish text.

    This really should work since there are no special rules to apply,
    just lots of non-ascii characters.
    """
    sample = raw_unicode("T\\xe4m\\xe4 on kappale. Eip\\xe4 ole kovin 2 nen, mutta tarkoitus on n\\xe4ytt\\xe4\\xe4 miten sanastaja \\ntoimii useiden-erilaisten sanarypp\\xe4iden kimpussa.\\nPit\\xe4\\xe4p\\xe4 viel\\xe4 'tarkistaa' sanat jotka \"lainausmerkeiss\\xe4\". Heittomerkki ja vaa'an.\\nUlkomaisia sanoja s\\xfcss, spa\\xdf.")
    # Each entry is the expected token paired with its offset in `sample`.
    expected_tokens = [
        (raw_unicode("T\\xe4m\\xe4"), 0),
        (raw_unicode("on"), 5),
        (raw_unicode("kappale"), 8),
        (raw_unicode("Eip\\xe4"), 17),
        (raw_unicode("ole"), 22),
        (raw_unicode("kovin"), 26),
        (raw_unicode("nen"), 34),
        (raw_unicode("mutta"), 39),
        (raw_unicode("tarkoitus"), 45),
        (raw_unicode("on"), 55),
        (raw_unicode("n\\xe4ytt\\xe4\\xe4"), 58),
        (raw_unicode("miten"), 66),
        (raw_unicode("sanastaja"), 72),
        (raw_unicode("toimii"), 83),
        (raw_unicode("useiden"), 90),
        (raw_unicode("erilaisten"), 98),
        (raw_unicode("sanarypp\\xe4iden"), 109),
        (raw_unicode("kimpussa"), 123),
        (raw_unicode("Pit\\xe4\\xe4p\\xe4"), 133),
        (raw_unicode("viel\\xe4"), 141),
        (raw_unicode("tarkistaa"), 148),
        (raw_unicode("sanat"), 159),
        (raw_unicode("jotka"), 165),
        (raw_unicode("lainausmerkeiss\\xe4"), 172),
        (raw_unicode("Heittomerkki"), 191),
        (raw_unicode("ja"), 204),
        (raw_unicode("vaa'an"), 207),
        (raw_unicode("Ulkomaisia"), 215),
        (raw_unicode("sanoja"), 226),
        (raw_unicode("s\\xfcss"), 233),
        (raw_unicode("spa\\xdf"), 239),
    ]
    tokens = tokenize_en(sample)
    for expected_item, actual_item in zip(expected_tokens, tokens):
        self.assertEqual(expected_item, actual_item)
def test_unicodeCombining(self):
    """Test tokenization with unicode combining symbols."""
    text = raw_unicode(r"Ik ben gei\u0308nteresseerd in de co\u00F6rdinatie van mijn knie\u00EBn, maar kan niet e\u0301e\u0301n \u00E0 twee enqu\u00EAtes vinden die recht doet aan mijn carri\u00E8re op Cura\u00E7ao")
    words = text.split(" ")
    # Word 8 ("knie\u00ebn,") carries a trailing comma in the text; the
    # tokenizer should strip it, so drop it from the expectation too.
    words[8] = words[8][0:-1]
    for word, (token, offset) in zip(words, tokenize_en(text)):
        self.assertEqual(word, token)
        # The reported offset must point at the word within the text.
        self.assertTrue(text[offset:].startswith(word))
def test_unicodeCombining(self):
    """Test tokenization with unicode combining symbols."""
    sentence = raw_unicode(r"Ik ben gei\u0308nteresseerd in de co\u00F6rdinatie van mijn knie\u00EBn, maar kan niet e\u0301e\u0301n \u00E0 twee enqu\u00EAtes vinden die recht doet aan mijn carri\u00E8re op Cura\u00E7ao")
    expected = sentence.split(" ")
    # Strip the trailing comma from the ninth word ("knie\u00ebn,") since
    # the tokenizer does not include punctuation in tokens.
    expected[8] = expected[8][0:-1]
    for want, found in zip(expected, tokenize_en(sentence)):
        self.assertEqual(want, found[0])
        # Offset must locate the token inside the original sentence.
        self.assertTrue(sentence[found[1]:].startswith(want))
def test_unicodeBasic(self):
    """Test tokenization of a basic unicode string."""
    text = raw_unicode(r"Ik ben ge\u00EFnteresseerd in de co\u00F6rdinatie van mijn knie\u00EBn, maar kan niet \u00E9\u00E9n \u00E0 twee enqu\u00EAtes vinden die recht doet aan mijn carri\u00E8re op Cura\u00E7ao")
    words = text.split(" ")
    # Word 8 ("knie\u00ebn,") carries a trailing comma in the text; the
    # tokenizer should strip it, so drop it from the expectation too.
    words[8] = words[8][0:-1]
    for word, (token, offset) in zip(words, tokenize_en(text)):
        self.assertEqual(word, token)
        # The reported offset must point at the word within the text.
        self.assertTrue(text[offset:].startswith(word))
def test_bug2785373(self):
    """Testcases for bug #2785373"""
    # Smoke test: tokenizing this sentence (as a native str, then as
    # unicode) must iterate to completion without raising.
    for _ in tokenize_en("So, one dey when I wes 17, I left."):
        pass
    for _ in tokenize_en(raw_unicode("So, one dey when I wes 17, I left.")):
        pass
def test_unicode1(self):
    """Test checking/suggesting for unicode strings"""
    # TODO: find something that actually returns suggestions
    word = raw_unicode(r"he\u2149lo")
    # Deliberate exact-type check (not isinstance): results must be the
    # `unicode` text type itself.
    self.assertTrue(type(word) is unicode)
    self.assertFalse(self.dict.check(word))
    for suggestion in self.dict.suggest(word):
        self.assertTrue(type(suggestion) is unicode)
def test_unicodeBasic(self):
    """Test tokenization of a basic unicode string."""
    sentence = raw_unicode(r"Ik ben ge\u00EFnteresseerd in de co\u00F6rdinatie van mijn knie\u00EBn, maar kan niet \u00E9\u00E9n \u00E0 twee enqu\u00EAtes vinden die recht doet aan mijn carri\u00E8re op Cura\u00E7ao")
    expected = sentence.split(" ")
    # Strip the trailing comma from the ninth word ("knie\u00ebn,") since
    # the tokenizer does not include punctuation in tokens.
    expected[8] = expected[8][0:-1]
    for want, found in zip(expected, tokenize_en(sentence)):
        self.assertEqual(want, found[0])
        # Offset must locate the token inside the original sentence.
        self.assertTrue(sentence[found[1]:].startswith(want))
def test_bug2785373(self):
    """Testcases for bug #2785373"""
    sentence = "So, one dey when I wes 17, I left."
    # Smoke test: both the native-str and unicode forms must tokenize
    # to completion without raising.
    for _ in tokenize_en(sentence):
        pass
    for _ in tokenize_en(raw_unicode(sentence)):
        pass
def test_unicode1(self):
    """Test checking/suggesting for unicode strings"""
    # TODO: find something that actually returns suggestions
    misspelled = raw_unicode(r"he\u2149lo")
    # Exact-type check is intentional: check/suggest must round-trip
    # the `unicode` text type.
    self.assertTrue(type(misspelled) is unicode)
    self.assertFalse(self.dict.check(misspelled))
    for candidate in self.dict.suggest(misspelled):
        self.assertTrue(type(candidate) is unicode)
def test_finnish_text(self):
    """Test tokenizing some Finnish text.

    This really should work since there are no special rules to apply,
    just lots of non-ascii characters.
    """
    paragraph = raw_unicode(
        "T\\xe4m\\xe4 on kappale. Eip\\xe4 ole kovin 2 nen, mutta tarkoitus on n\\xe4ytt\\xe4\\xe4 miten sanastaja \\ntoimii useiden-erilaisten sanarypp\\xe4iden kimpussa.\\nPit\\xe4\\xe4p\\xe4 viel\\xe4 'tarkistaa' sanat jotka \"lainausmerkeiss\\xe4\". Heittomerkki ja vaa'an.\\nUlkomaisia sanoja s\\xfcss, spa\\xdf."
    )
    # Expected (token, offset) pairs in appearance order.
    expectations = [
        (raw_unicode("T\\xe4m\\xe4"), 0),
        (raw_unicode("on"), 5),
        (raw_unicode("kappale"), 8),
        (raw_unicode("Eip\\xe4"), 17),
        (raw_unicode("ole"), 22),
        (raw_unicode("kovin"), 26),
        (raw_unicode("nen"), 34),
        (raw_unicode("mutta"), 39),
        (raw_unicode("tarkoitus"), 45),
        (raw_unicode("on"), 55),
        (raw_unicode("n\\xe4ytt\\xe4\\xe4"), 58),
        (raw_unicode("miten"), 66),
        (raw_unicode("sanastaja"), 72),
        (raw_unicode("toimii"), 83),
        (raw_unicode("useiden"), 90),
        (raw_unicode("erilaisten"), 98),
        (raw_unicode("sanarypp\\xe4iden"), 109),
        (raw_unicode("kimpussa"), 123),
        (raw_unicode("Pit\\xe4\\xe4p\\xe4"), 133),
        (raw_unicode("viel\\xe4"), 141),
        (raw_unicode("tarkistaa"), 148),
        (raw_unicode("sanat"), 159),
        (raw_unicode("jotka"), 165),
        (raw_unicode("lainausmerkeiss\\xe4"), 172),
        (raw_unicode("Heittomerkki"), 191),
        (raw_unicode("ja"), 204),
        (raw_unicode("vaa'an"), 207),
        (raw_unicode("Ulkomaisia"), 215),
        (raw_unicode("sanoja"), 226),
        (raw_unicode("s\\xfcss"), 233),
        (raw_unicode("spa\\xdf"), 239),
    ]
    for wanted, produced in zip(expectations, tokenize_en(paragraph)):
        self.assertEqual(wanted, produced)
def test_UnicodeInstallPath(self):
    """Test installation in a path containing unicode chars."""
    # Install into a directory whose name contains non-ascii characters.
    self._insDir = raw_unicode(r"test_\xe5\xe4\xf6_ing")
    self.install()
    self.runtests()
def test_UnicodeCharsInPath(self):
    """Test that unicode chars in PWL paths are accepted."""
    # Use a PWL file name containing non-ascii characters.
    self._fileName = raw_unicode(r"test_\xe5\xe4\xf6_ing")
    pwl = request_pwl_dict(self._path())
    self.assertTrue(pwl)
def test_UnicodeCharsInPath(self):
    """Test that unicode chars in PWL paths are accepted."""
    # The personal-word-list file name contains non-ascii characters.
    self._fileName = raw_unicode(r"test_\xe5\xe4\xf6_ing")
    dictionary = request_pwl_dict(self._path())
    self.assertTrue(dictionary)
def test_UnicodeInstallPath(self):
    """Test installation in a path containing unicode chars."""
    # Target directory name includes non-ascii characters; install and
    # run the suite from there.
    self._insDir = raw_unicode(r"test_\xe5\xe4\xf6_ing")
    self.install()
    self.runtests()