Example #1
0
	def getDecompositionData(u,missingMarks):
		"""Look up the glyphs that make up the decomposition of codepoint ``u``.

		Defined inside the enclosing function so that it can use ``umap``
		(and the other names of that scope) directly.
		NOTE(review): ``umap`` appears to map codepoints to glyph names --
		confirm against the enclosing function.

		``u`` is an integer codepoint. ``missingMarks`` is extended in place
		with ``unicodeIntToHexstr(cp)`` for every component ``cp`` that is
		absent from ``umap`` but lies in ``MARK_GLYPH_CODEPOINT_RANGE``;
		entries added before an early return stay in the list.

		Returns the tuple ``(umap[u], [umap[cp] for each component], first
		component)`` when the decomposition has more than one component and
		every component is in ``umap``; the last item is the codepoint to
		check next. Returns ``0`` in every other case: no decomposition, a
		decomposition starting with ``<``, a component listed in
		``SKIP_MARKS_FINAL``, an unmapped component, a single-component
		decomposition, or a ``ValueError`` raised along the way.
		"""
		try:
			decomposition = unicodedata.decomposition(unichr(u))
			# Nothing to do for characters without a decomposition, nor for
			# decompositions that start with a "<...>" tag.
			if len(decomposition) <= 1 or decomposition[:1] == "<":
				return 0
			components = [int(part, 16) for part in decomposition.split()]
			mapped = 0
			for cp in components:
				# A mark from SKIP_MARKS_FINAL means we don't want to do any
				# decomposition at all.
				if cp in SKIP_MARKS_FINAL:
					return 0
				if cp in umap:
					mapped += 1
				elif cp in MARK_GLYPH_CODEPOINT_RANGE:
					missingMarks += [unicodeIntToHexstr(cp)]
			# The "mapped > 1" condition may go for the sake of allowing
			# reference to same-shape glyphs.
			if mapped == len(components) and mapped > 1:
				glyphs = [umap[cp] for cp in components]
				return umap[u], glyphs, components[0]
		except ValueError:
			return 0
		return 0
Example #2
0
def remove_accent(character: str) -> str:
    """Return the base character of an accented ``character``.

    Only one level of canonical decomposition is applied: the result is the
    first codepoint of ``unicodedata.decomposition(character)``, which may
    itself still be decomposable (U+1EA5 yields U+00E2, not ``a``).

    The character is returned unchanged when it has no decomposition, or when
    its decomposition is a compatibility mapping. Those start with a tag such
    as ``<compat>`` or ``<fraction>`` instead of a hex codepoint; they do not
    describe an accent, and parsing the tag as hex would raise ``ValueError``.

    Args:
        character: A single-character string.

    Returns:
        The base character, or ``character`` itself if there is nothing to
        strip.

    Raises:
        TypeError: If ``character`` is not a single-character string (raised
            by ``unicodedata.decomposition``).
    """
    decomposed = unicodedata.decomposition(character)
    if not decomposed or decomposed.startswith("<"):
        return character

    # Canonical mappings are space-separated hex codepoints, base first.
    base_code = decomposed.split(" ")[0]
    return chr(int(base_code, 16))
Example #3
0
def parse_chars(characters, decompose=True, retainDecomposed=False):
    """
    Collect the unique characters needed to cover a string of characters.

    The input is first normalised with ``character_list_from_string``. With
    ``decompose=False`` the result of
    ``character_list_from_string(<joined input>, False)`` is returned
    directly, without any further filtering.

    Otherwise each character is looked up with ``unicodedata2.decomposition``:

    - characters without a decomposition are kept as they are;
    - characters with a decomposition are replaced by their parts, and are
      additionally kept themselves when ``retainDecomposed`` is true (e.g.
      when pruning and saving the DB);
    - the parts gathered from decompositions are ordered with
      ``sort_by_character_type`` and appended after the kept characters.

    The returned list is de-duplicated with ``list_unique`` and excludes
    empty strings and entries that begin with a character matching ``\\s``.

    Errors are logged rather than raised; whatever had been collected up to
    that point is still filtered and returned.

    NOTE(review): the original documentation states that standard whitespace
    (space, line break) is removed while special whitespace (non-breaking
    space, en space, ...) is kept as part of the charset. Whether that holds
    depends on ``character_list_from_string`` and on what ``\\s`` matches --
    confirm.

    Use this on all orthography base/auxiliary data.
    """
    unique_chars = []
    try:
        unique_strings = "".join(character_list_from_string(characters))

        if not decompose:
            # No decomposition wanted: hand back the characters as a list of
            # unique strings (as produced by the helper).
            return character_list_from_string(unique_strings, False)

        # Decomposition strings can contain bracketed tags next to the hex
        # codepoints, presumably layout hints based on the glyph context,
        # e.g. <isolated> <compat> <super> <vertical> <final> <medial>
        # <initial> <sub> <fraction> <font> <wide> <narrow>.
        # They are matched here so that they can be ignored.
        tag_pattern = re.compile(r"^<\w+\>$")
        from_decomposition = []

        for char in unique_strings:
            # Either "" or a space separated string of zero-filled unicode
            # hex values like "0075 0308".
            decomposition = unicodedata2.decomposition(char)

            if decomposition == "" or retainDecomposed:
                unique_chars.append(char)

            if decomposition == "":
                continue

            for part in decomposition.split(" "):
                if tag_pattern.match(part):
                    continue
                try:
                    from_decomposition.append(chr(int(part, 16)))
                except Exception as err:
                    log.error("Error getting glyph from decomposition "
                              "part '%s' of '%s' (decomposition '%s'):"
                              " %s" % (part, char, decomposition, err))

        # Parts retrieved from decompositions go to the end, sorted so that
        # letters come first, then marks, then anything else.
        from_decomposition = sort_by_character_type(from_decomposition)
        unique_chars = list_unique(unique_chars + from_decomposition)
    except Exception as err:
        log.error("Error parsing characters '%s': %s" % (characters, err))

    kept = []
    for entry in unique_chars:
        if not re.match(r"\s", entry) and len(entry) != 0:
            kept.append(entry)
    return list_unique(kept)