class Unihandecoder(object):
    """Normalize input text and hand it to a language-specific decoder.

    ``lang`` selects the backend: ``"ja"`` -> ``Jadecoder()``, ``"kr"`` ->
    ``Krdecoder()``, ``"vn"`` -> ``Vndecoder()``; any other value (including
    the default ``"zh"``) -> ``Unidecoder(lang)``.
    """

    # Encoding tried first when the input arrives as an encoded byte string.
    preferred_encoding = None
    # Backend object; only its ``decode(text)`` method is called here.
    decoder = None

    def __init__(self, lang="zh", encoding='utf-8'):
        """Pick the backend for ``lang`` and remember ``encoding``.

        Args:
            lang: Language code used to choose the backend decoder.
            encoding: Encoding tried first for byte-string input.
        """
        self.preferred_encoding = encoding
        if lang == "ja":
            self.decoder = Jadecoder()
        elif lang == "kr":
            self.decoder = Krdecoder()
        elif lang == "vn":
            self.decoder = Vndecoder()
        else:  # "zh" and every other language code
            self.decoder = Unidecoder(lang)

    def _decode_bytes(self, raw):
        """Decode ``raw`` with the preferred encoding, else lenient UTF-8."""
        try:
            return raw.decode(self.preferred_encoding)
        except Exception:
            # Best effort: a wrong or unknown preferred encoding must not
            # abort the conversion.  ``Exception`` (rather than a bare
            # ``except``) lets KeyboardInterrupt/SystemExit propagate.
            return raw.decode('utf-8', 'replace')

    def _text_filter(self, text):
        """Return ``text`` as NFC-normalized unicode.

        On Python 2 any non-``unicode`` input is first coerced with
        ``unicode()`` and, if that fails, decoded as a byte string.  On
        Python 3 ``bytes``/``bytearray`` input is decoded as a byte string
        and anything else is passed to the normalizer unchanged.
        """
        if PY2:
            if not isinstance(text, unicode):
                try:
                    text = unicode(text)
                except Exception:  # pragma: no cover
                    text = self._decode_bytes(text)
        elif isinstance(text, (bytes, bytearray)):
            # Without this branch byte strings reach unicodedata.normalize()
            # undecoded and raise TypeError on Python 3.
            text = self._decode_bytes(text)
        # Normalize first so composed and decomposed spellings of the same
        # character reach the decoder in one canonical (NFC) form.
        return unicodedata.normalize('NFC', text)

    def decode(self, text):
        """Filter ``text`` and return the backend decoder's result for it."""
        return self.decoder.decode(self._text_filter(text))
class Unihandecoder(object):
    """Normalize input text and hand it to a language-specific decoder.

    ``lang`` selects the backend: ``"ja"`` -> ``Jadecoder()``, ``"kr"`` ->
    ``Krdecoder()``, ``"vn"`` -> ``Vndecoder()``; any other value (including
    the default ``"zh"``) -> ``Unidecoder()``.
    """

    # Encoding tried first when the input arrives as an encoded byte string.
    preferred_encoding = None
    # Backend object; only its ``decode(text)`` method is called here.
    decoder = None

    def __init__(self, lang="zh", encoding='utf-8'):
        """Pick the backend for ``lang`` and remember ``encoding``.

        Args:
            lang: Language code used to choose the backend decoder.
            encoding: Encoding tried first for byte-string input.
        """
        self.preferred_encoding = encoding
        if lang == "ja":
            self.decoder = Jadecoder()
        elif lang == "kr":
            self.decoder = Krdecoder()
        elif lang == "vn":
            self.decoder = Vndecoder()
        else:  # "zh" and every other language code
            self.decoder = Unidecoder()

    def _decode_bytes(self, raw):
        """Decode ``raw`` with the preferred encoding, else lenient UTF-8."""
        try:
            return raw.decode(self.preferred_encoding)
        except Exception:
            # Best effort: a wrong or unknown preferred encoding must not
            # abort the conversion.  ``Exception`` (rather than a bare
            # ``except``) lets KeyboardInterrupt/SystemExit propagate.
            return raw.decode('utf-8', 'replace')

    def decode(self, text):
        """Return the backend decoder's result for NFC-normalized ``text``.

        On Python 2 any non-``unicode`` input is first coerced with
        ``unicode()`` and, if that fails, decoded as a byte string.  On
        Python 3 ``bytes``/``bytearray`` input is decoded as a byte string
        and anything else is passed to the normalizer unchanged.
        """
        # Only the name lookup is guarded: a failure inside the conversion
        # itself must not be mistaken for "running on Python 3".
        try:
            text_type = unicode  # Python 2
        except NameError:  # Python 3: ``str`` is already unicode
            text_type = None

        if text_type is not None:
            if not isinstance(text, text_type):
                try:
                    text = text_type(text)
                except Exception:
                    text = self._decode_bytes(text)
        elif isinstance(text, (bytes, bytearray)):
            # Without this branch byte strings reach unicodedata.normalize()
            # undecoded and raise TypeError on Python 3.
            text = self._decode_bytes(text)

        # Normalize first so composed and decomposed spellings of the same
        # character reach the decoder in one canonical (NFC) form.
        normalized = unicodedata.normalize('NFC', text)
        return self.decoder.decode(normalized)
class Unihandecoder(object):
    """Normalize input text and hand it to a language-specific decoder.

    ``lang`` selects the backend: ``"ja"`` -> ``Jadecoder()``, ``"kr"`` ->
    ``Krdecoder()``, ``"vn"`` -> ``Vndecoder()``; any other value (including
    the default ``"zh"``) -> ``Unidecoder(lang)``.
    """

    # Encoding tried first when the input arrives as an encoded byte string.
    preferred_encoding = None
    # Backend object; only its ``decode(text)`` method is called here.
    decoder = None

    def __init__(self, lang="zh", encoding='utf-8'):
        """Pick the backend for ``lang`` and remember ``encoding``.

        Args:
            lang: Language code used to choose the backend decoder.
            encoding: Encoding tried first for byte-string input.
        """
        self.preferred_encoding = encoding
        if lang == "ja":
            self.decoder = Jadecoder()
        elif lang == "kr":
            self.decoder = Krdecoder()
        elif lang == "vn":
            self.decoder = Vndecoder()
        else:  # "zh" and every other language code
            self.decoder = Unidecoder(lang)

    def _decode_bytes(self, raw):
        """Decode ``raw`` with the preferred encoding, else lenient UTF-8."""
        try:
            return raw.decode(self.preferred_encoding)
        except Exception:
            # Best effort: a wrong or unknown preferred encoding must not
            # abort the conversion.  ``Exception`` (rather than a bare
            # ``except``) lets KeyboardInterrupt/SystemExit propagate.
            return raw.decode('utf-8', 'replace')

    def _text_filter(self, text):
        """Return ``text`` as NFC-normalized unicode.

        On Python 2 any non-``unicode`` input is first coerced with
        ``unicode()`` and, if that fails, decoded as a byte string.  On
        Python 3 ``bytes``/``bytearray`` input is decoded as a byte string
        and anything else is passed to the normalizer unchanged.
        """
        if PY2:
            if not isinstance(text, unicode):
                try:
                    text = unicode(text)
                except Exception:  # pragma: no cover
                    text = self._decode_bytes(text)
        elif isinstance(text, (bytes, bytearray)):
            # Without this branch byte strings reach unicodedata.normalize()
            # undecoded and raise TypeError on Python 3.
            text = self._decode_bytes(text)
        # Normalize first so composed and decomposed spellings of the same
        # character reach the decoder in one canonical (NFC) form.
        return unicodedata.normalize('NFC', text)

    def decode(self, text):
        """Filter ``text`` and return the backend decoder's result for it."""
        return self.decoder.decode(self._text_filter(text))
class TestUnidecoder(unittest.TestCase):
    """Checks on a ``Unidecoder`` built for the "zh" language code."""

    def setUp(self):
        # Every test gets its own decoder instance.
        self.decoder = Unidecoder("zh")

    def test_code_group(self):
        # U+1234 is expected to map to the group name "x12".
        group = self.decoder.code_group(u"\u1234")
        self.assertEqual(group, "x12")

    def test_grouped_point(self):
        # U+1234 is expected to map to position 0x34 within its group.
        point = self.decoder.grouped_point(u"\u1234")
        self.assertEqual(point, 0x34)

    def test_decode(self):
        # A plain "a" is expected to come back unchanged.
        decoded = self.decoder.decode("a")
        self.assertEqual(decoded, "a")

    def test_replace_point(self):
        # A plain "a" is expected to be replaced by itself.
        replaced = self.decoder.replace_point("a")
        self.assertEqual(replaced, "a")
class TestUnidecoder(unittest.TestCase):
    """Checks on a ``Unidecoder`` built for the 'zh' language code."""

    def setUp(self):
        # Every test gets its own decoder instance.
        decoder = Unidecoder('zh')
        self.decoder = decoder

    def test_code_group(self):
        # U+1234 is expected to map to the group name 'x12'.
        actual = self.decoder.code_group("\u1234")
        expected = 'x12'
        self.assertEqual(actual, expected)

    def test_grouped_point(self):
        # U+1234 is expected to map to position 0x34 within its group.
        actual = self.decoder.grouped_point("\u1234")
        expected = 0x34
        self.assertEqual(actual, expected)

    def test_decode(self):
        # A plain "a" is expected to come back unchanged.
        actual = self.decoder.decode("a")
        expected = "a"
        self.assertEqual(actual, expected)

    def test_replace_point(self):
        # A plain 'a' is expected to be replaced by itself.
        actual = self.decoder.replace_point('a')
        expected = 'a'
        self.assertEqual(actual, expected)