def test_filter_transliterate_replace_codepoint_length(self):
    """
    Check that a single character is replaced regardless of how large
    its codepoint is (one to six significant hex digits).
    """
    def check(codepoint):
        # the character between "x" and "z" is expected to become "a"
        # (presumably via the map used by filter_transliterate -- confirm there)
        self.filter_transliterate(
            [u"x" + gf.safe_unichr(codepoint) + u"z"],
            [u"xaz"]
        )

    for codepoint in (0x0008, 0x0088, 0x0888, 0x8888):
        check(codepoint)

    # NOTE Python 2 narrow builds cannot handle codepoints above 0x10000 correctly,
    #      so the remaining codepoints are only checked on other builds
    if not gf.is_py2_narrow_build():
        for codepoint in (0x88888, 0x108888):
            check(codepoint)
def _build_map(self):
    """
    Load the transliteration rules from the file at ``self.file_path``.

    ``self.trans_map`` is reset to an empty dictionary, then every
    meaningful line of the file (read as UTF-8) is handed to
    ``self._process_map_rule``.
    """
    if gf.is_py2_narrow_build():
        self.log_warn(
            u"Running on a Python 2 narrow build: be aware that Unicode chars above 0x10000 cannot be replaced correctly."
        )
    self.trans_map = {}
    with io.open(self.file_path, "r", encoding="utf-8") as map_file:
        # tabs are treated exactly like spaces
        text = map_file.read().replace(u"\t", u" ")
    for raw_line in text.splitlines():
        # a line is a comment only if "#" is its very first character
        # (the check happens before stripping)
        if raw_line.startswith(u"#"):
            continue
        rule = raw_line.strip()
        # skip lines that are blank once stripped
        if rule:
            self._process_map_rule(rule)
def transliterate(self, string):
    """
    Return ``string`` with every character that is a key of
    ``self.trans_map`` replaced by its mapped value.

    Characters that are not keys of the map are copied unchanged.

    :param string: the Unicode string to be transliterated
    :returns: the transliterated Unicode string
    """
    #
    # NOTE on Python 2 narrow builds,
    #      iterating over the string is not 100% correct
    #      because a Unicode character above 0x10000
    #      is "split" into two characters,
    #      and hence it cannot be found as a key of the map
    #
    if gf.is_py2_narrow_build():
        self.log_warn(
            u"Running on a Python 2 narrow build: be aware that Unicode chars above 0x10000 cannot be replaced correctly."
        )
    trans_map = self.trans_map
    # only a missing key falls back to the original character;
    # any other error (e.g. a missing map) is no longer silently swallowed
    return u"".join(trans_map.get(char, char) for char in string)