def getDecompositionData(u, missingMarks):
    """Resolve the Unicode decomposition of codepoint ``u`` to mapped glyphs.

    Relies on ``umap``, ``SKIP_MARKS_FINAL``, ``MARK_GLYPH_CODEPOINT_RANGE``
    and ``unicodeIntToHexstr`` from the surrounding scope.

    Args:
        u: Integer codepoint to decompose (converted with ``unichr``).
        missingMarks: Collection extended in place (via ``+=``) with the hex
            string of every decomposition part that is not in ``umap`` but
            lies in ``MARK_GLYPH_CODEPOINT_RANGE``.

    Returns:
        ``0`` when no usable decomposition exists, otherwise the tuple
        ``(umap[u], [umap[part] for each part], first_part_codepoint)``.
    """
    try:
        decomposition = unicodedata.decomposition(unichr(u))

        # Too short to hold a decomposition, or a tagged mapping such as
        # "<compat> ..." which is not handled here.
        if len(decomposition) <= 1 or decomposition[:1] == "<":
            return 0

        codepoints = [int(token, 16) for token in decomposition.split()]
        mappedCount = 0
        for codepoint in codepoints:
            # A mark listed in SKIP_MARKS_FINAL vetoes the whole
            # decomposition.
            if codepoint in SKIP_MARKS_FINAL:
                return 0
            if codepoint in umap:
                mappedCount += 1
            elif codepoint in MARK_GLYPH_CODEPOINT_RANGE:
                # Membership in SKIP_MARKS_FINAL was already ruled out above.
                missingMarks += [unicodeIntToHexstr(codepoint)]

        # Every part must be mapped. The "> 1" requirement could be dropped
        # to allow references to same-shape glyphs.
        if mappedCount > 1 and mappedCount == len(codepoints):
            # The third element is the codepoint to check next.
            return (
                umap[u],
                [umap[codepoint] for codepoint in codepoints],
                codepoints[0],
            )
    except ValueError:
        # Raised by unichr() for an out-of-range codepoint or by int() for a
        # token that is not hexadecimal.
        return 0
    return 0
def remove_accent(character: str) -> str:
    """Return the base character of ``character``'s canonical decomposition.

    Only one level of decomposition is applied: the first codepoint of
    ``unicodedata.decomposition(character)`` is returned, so a character
    whose base is itself composed (e.g. U+01D8 -> U+00FC) keeps the
    remaining accent.

    Characters without a decomposition are returned unchanged. So are
    characters with a *compatibility* decomposition (the mapping starts with
    a tag such as ``<compat>``, ``<fraction>`` or ``<noBreak>``): those tags
    are not hexadecimal codepoints and the mapping does not describe an
    accented base letter.

    Args:
        character: A single-character string.

    Returns:
        The base character, or ``character`` itself when there is nothing to
        strip.

    Raises:
        TypeError: If ``character`` is not exactly one character long
            (raised by ``unicodedata.decomposition``).
    """
    decomposed = unicodedata.decomposition(character)
    # A leading "<tag>" marks a compatibility mapping; parsing it as hex
    # would raise ValueError, so leave such characters untouched.
    if not decomposed or decomposed.startswith("<"):
        return character
    base_code = decomposed.split(" ", 1)[0]
    return chr(int(base_code, 16))
def parse_chars(characters, decompose=True, retainDecomposed=False):
    """
    Collect the unique characters needed to represent ``characters``.

    With ``decompose`` enabled (the default) every character that has a
    Unicode decomposition is replaced by its parts; set ``retainDecomposed``
    to also keep the original composed character. Characters matched by the
    regex ``\\s`` and empty strings are dropped from the result.

    With ``decompose`` disabled the input is not decomposed and the result of
    ``character_list_from_string(..., False)`` is returned as is.

    Any exception raised while parsing is logged and whatever was collected
    before the failure is returned.

    Use this on all orthography base/auxiliary data.
    """
    unique_chars = []

    try:
        joined = "".join(character_list_from_string(characters))

        if not decompose:
            # No decomposition wanted: hand back the plain list of unique
            # strings.
            return character_list_from_string(joined, False)

        # Decompositions may carry formatting tags instead of hex codepoints,
        # presumably layout hints depending on the glyph context, e.g.
        # <isolated> <compat> <super> <vertical> <final> <medial> <initial>
        # <sub> <fraction> <font> <wide> <narrow>. They are matched here and
        # ignored below.
        tag_pattern = re.compile(r"^<\w+\>$")

        from_decomposition = []
        for char in joined:
            # Either "" or a space separated string of zero-filled unicode
            # hex values like "0075 0308".
            parts = unicodedata2.decomposition(char)

            # Keep the character itself if it cannot be decomposed, or if
            # decomposable characters are wanted too (e.g. when pruning and
            # saving the DB).
            if parts == "" or retainDecomposed:
                unique_chars.append(char)

            if parts == "":
                continue

            for token in parts.split(" "):
                if tag_pattern.match(token):
                    continue
                try:
                    from_decomposition.append(chr(int(token, 16)))
                except Exception as e:
                    log.error("Error getting glyph from decomposition "
                              "part '%s' of '%s' (decomposition '%s'):"
                              " %s" % (token, char, parts, e))

        # Characters obtained from decomposition go to the end, sorted so
        # that letters come first, then marks, then anything else.
        from_decomposition = sort_by_character_type(from_decomposition)
        unique_chars = list_unique(unique_chars + from_decomposition)
    except Exception as e:
        log.error("Error parsing characters '%s': %s" % (characters, e))

    non_blank = [
        u for u in unique_chars
        if not re.match(r"\s", u) and len(u) != 0
    ]
    return list_unique(non_blank)