def normalize_unicode_buffer(self):
    """Decompose the buffer's codepoints, then recompose adjacent pairs.

    Every codepoint in ``self.buffer.items`` is expanded to its NFD form.
    The resulting sequence is then walked left to right: a codepoint whose
    General_Category starts with "M" (a mark) is emitted unchanged; any
    other codepoint is merged with its successor whenever the two
    characters have a single-character NFC composition. The final string
    is written back with ``self.buffer.store_unicode``.
    """
    decomposed = []
    for item in self.buffer.items:
        decomposed.extend(
            ord(ch) for ch in unicodedata.normalize("NFD", chr(item.codepoint))
        )

    # Now recompose
    pieces = []
    ix = 0
    while ix < len(decomposed):
        a = decomposed[ix]
        if ix + 1 == len(decomposed):
            # Last codepoint: nothing left to combine it with.
            pieces.append(chr(a))
            break
        if ucd_data(a)["General_Category"][0] == "M":
            # Never start a composition from a mark.
            pieces.append(chr(a))
            ix += 1
            continue
        composed = unicodedata.normalize("NFC", chr(a) + chr(decomposed[ix + 1]))
        if len(composed) == 1:
            # The pair composes: replace it with the composite and retry the
            # same slot so the composite can absorb further marks.
            decomposed[ix] = ord(composed)
            del decomposed[ix + 1]
            continue
        pieces.append(chr(a))
        ix += 1
    self.buffer.store_unicode("".join(pieces))
def substitute_default(self):
    """Assign an Arabic joining form to every buffer item and set feature masks.

    After running the parent implementation, each item gets an
    ``arabic_joining`` value computed by walking ``state_table`` with the
    item's Unicode Joining_Type. Items without a Joining_Type are treated as
    transparent ("T") when they are non-spacing marks, enclosing marks or
    format characters, and as non-joining ("U") otherwise. Transparent items
    are skipped and keep the joining value "NONE".

    Finally, for every feature in ``arabic_features`` that the plan's font
    features contain, each item's ``feature_masks[feature]`` is set to
    ``True`` when the item's joining form differs from that feature name.
    """
    super().substitute_default()
    state = 0
    prev_item = None
    for item in self.buffer.items:
        item.arabic_joining = "NONE"
        ucd = ucd_data(item.codepoint)
        joining = ucd.get("Joining_Type")
        if not joining:
            # "Me" (enclosing mark) is the real General_Category value; the
            # previous spelling "Em" could never match.
            if ucd.get("General_Category") in ("Mn", "Cf", "Me"):
                joining = "T"
            else:
                joining = "U"
        if joining == "T":
            continue
        if joining == "C":
            joining = "D"  # Mongolian
        joining_group = ucd.get("Joining_Group")
        if joining_group == "ALAPH":
            joining = "ALAPH"
        if joining_group == "DALATH RISH":
            joining = "DALATH_RISH"
        # Each table entry yields the form for the previous item, the form
        # for this item, and the next state.
        prev, this, state = state_table[state][jts[joining]]
        if prev_item:
            prev_item.arabic_joining = prev
        item.arabic_joining = this
        prev_item = item

    if self.buffer.script == "Mongolian":
        self.mongolian_variation_selectors()
    self.plan.msg(
        "Assigned Arabic joining",
        self.buffer,
        serialize_options=["arabic_joining"],
    )

    for f in arabic_features:
        if f not in self.plan.fontfeatures.features:
            continue
        for item in self.buffer.items:
            item.feature_masks[f] = item.arabic_joining != f
def set_matra_position(item):
    """Refine ``item.syllabic_position`` for a matra based on its script.

    The current position selects one of the per-script lookup tables
    (``matra_pos_left`` / ``_right`` / ``_top`` / ``_bottom``); the item's
    position is replaced by the table entry for its script, falling back to
    the table's "Default" entry. Telugu and Kannada right-side matras are
    special-cased by codepoint range instead of using the table. Items in
    any other position are left untouched.
    """
    script = ucd_data(item.codepoint)["Script"]
    codepoint = item.codepoint
    current = item.syllabic_position

    if current == IndicPosition.PRE_C:
        table = matra_pos_left
    elif current == IndicPosition.POST_C:
        table = matra_pos_right
        if script == "Telugu":
            item.syllabic_position = (
                IndicPosition.BEFORE_SUB
                if codepoint <= 0x0C42
                else IndicPosition.AFTER_SUB
            )
            return
        if script == "Kannada":
            # Codepoints inside U+0CC3..U+0CD6 go after the subjoined forms.
            item.syllabic_position = (
                IndicPosition.AFTER_SUB
                if 0x0CC3 <= codepoint <= 0x0CD6
                else IndicPosition.BEFORE_SUB
            )
            return
    elif current == IndicPosition.ABOVE_C:
        table = matra_pos_top
    elif current == IndicPosition.BELOW_C:
        table = matra_pos_bottom
    else:
        return

    fallback = table["Default"]
    item.syllabic_position = table.get(script, fallback)
def assign_category(self, item):
    """Populate the syllabic and positional categories of ``item``.

    The Unicode Indic_Syllabic_Category is translated through
    ``syllabic_category_map`` (unmapped values become "X"; a missing
    property is looked up as "Other"). The Indic_Positional_Category is
    stored as-is (default "x") and converted to a syllabic position.
    ``self.reassign_category`` is then called for further adjustment.
    """
    # Base behavior is Indic
    properties = ucd_data(item.codepoint)

    raw_syllabic = properties.get("Indic_Syllabic_Category", "Other")
    item.syllabic_category = syllabic_category_map.get(raw_syllabic, "X")

    item.positional_category = properties.get("Indic_Positional_Category", "x")
    item.syllabic_position = IndicPositionalCategory2IndicPosition(
        item.positional_category
    )

    self.reassign_category(item)
def guess_segment_properties(self):
    """Fill in ``self.script`` and ``self.direction`` when they are unset.

    The script is taken from the first item whose Unicode Script property is
    not "Common", "Unknown" or "Inherited". The direction is then derived
    from the script via ``_script_direction``.
    """
    # Guess segment properties
    for entry in self.items:
        if self.script:
            # Already known (or just found): nothing more to inspect.
            break
        candidate = ucd_data(entry.codepoint)["Script"]
        if candidate in ["Common", "Unknown", "Inherited"]:
            continue
        self.script = candidate

    if self.direction:
        return
    # Imported here rather than at module level, as in the original code.
    from fontFeatures.shaperLib.Shaper import _script_direction

    self.direction = _script_direction(self.script)
def normalize_unicode_buffer(self):
    """Store the NFC form of the buffer, then apply space/hyphen fix-ups.

    After recomposition, any item whose General_Category is "Zs" is turned
    into U+0020 when ``self.font.glyphForCodepoint(0x20, False)`` is truthy,
    and U+2011 is turned into U+2010 when the font call for U+2010 is truthy.
    """
    text = "".join(chr(item.codepoint) for item in self.buffer.items)
    recomposed = unicodedata.normalize("NFC", text)
    self.buffer.store_unicode(recomposed)

    # Some fix-ups from hb-ot-shape-normalize
    for item in self.buffer.items:
        is_space_separator = ucd_data(item.codepoint)["General_Category"] == "Zs"
        if is_space_separator:
            if self.font.glyphForCodepoint(0x20, False):
                item.codepoint = 0x20
                # Harfbuzz adjusts the width here, in _hb_ot_shape_fallback_spaces
        if item.codepoint == 0x2011:
            if self.font.glyphForCodepoint(0x2010, False):
                item.codepoint = 0x2010
def normalize_to_glyphs(self, font):
    """Build ``self.info``: one ``BufferInfo`` per character in ``self.characters``.

    Each record carries the character's index, the character itself, its
    codepoint, its UCD data and the glyph found for the codepoint in
    ``font.mapping``; ``unicode_props`` and ``mask`` start at 0 and
    ``isMark`` at False. A codepoint absent from ``font.mapping`` raises
    whatever that mapping raises on a missing key.
    """
    cmap = font.mapping
    records = []
    for index, char in enumerate(self.characters):
        codepoint = ord(char)
        record = {
            "position": index,
            "original": char,
            "unicode_props": 0,
            "ucd_data": youseedee.ucd_data(codepoint),
            "codepoint": codepoint,
            "glyph": cmap[codepoint],
            "isMark": False,
            "mask": 0,
        }
        records.append(BufferInfo(record))
    self.info = records
def _fallback_categorize(self):
    """Set ``self.category`` from the codepoint's Unicode General_Category.

    The category is a ``(kind, None)`` tuple where ``kind`` is "mark" for
    M* categories, "ligature" for "Ll", "base" for other L* categories and
    "unknown" for everything else, including a falsy codepoint. A missing
    General_Category is treated as "L".
    """
    if not self.codepoint:
        # Now what?
        self.category = ("unknown", None)
        return

    general = ucd_data(self.codepoint).get("General_Category", "L")
    major = general[0]
    if major == "M":
        kind = "mark"
    elif general == "Ll":
        # NOTE(review): lowercase letters are classified as ligatures here;
        # kept as-is, but confirm this is intentional.
        kind = "ligature"
    elif major == "L":
        kind = "base"
    else:
        kind = "unknown"
    self.category = (kind, None)
def normalize_unicode_buffer(self):
    """Decompose the buffer for Indic shaping and apply limited recomposition.

    Decomposition:
      * U+0931, U+09DC, U+09DD and U+0B94 are kept as they are.
      * The Sinhala split matras U+0DDA, U+0DDC, U+0DDD and U+0DDE become
        ``U+0DD9 + <matra>`` when ``self.would_substitute("pstf", ...)`` is
        truthy for the matra's glyph, and are otherwise kept as they are.
      * Everything else is replaced by its NFD decomposition.

    Recomposition: the only pair that is merged is U+09AF + U+09BC, which
    becomes U+09DF; marks are never used as the first element of a pair.
    General pairwise NFC recomposition is not performed (it was disabled in
    the code this replaces). The result is written back with
    ``self.buffer.store_unicode``.
    """
    keep_as_is = (0x0931, 0x09DC, 0x09DD, 0x0B94)
    sinhala_split_matras = (0x0DDA, 0x0DDC, 0x0DDD, 0x0DDE)

    decomposed = []
    for item in self.buffer.items:
        cp = item.codepoint
        if cp in keep_as_is:
            decomposed.append(cp)
        elif cp in sinhala_split_matras:
            # Sinhala split matras
            glyph = BufferItem.new_unicode(cp)
            glyph.map_to_glyph(self.buffer.font)
            if self.would_substitute("pstf", [glyph]):
                decomposed.extend([0x0DD9, cp])
            else:
                decomposed.append(cp)
        else:
            decomposed.extend(
                ord(ch) for ch in unicodedata.normalize("NFD", chr(cp))
            )

    # Now recompose
    pieces = []
    ix = 0
    count = len(decomposed)
    while ix < count:
        a = decomposed[ix]
        if ix + 1 == count:
            # Last codepoint: nothing left to pair it with.
            pieces.append(chr(a))
            break
        if ucd_data(a)["General_Category"][0] == "M":
            pieces.append(chr(a))
            ix += 1
        elif a == 0x09AF and decomposed[ix + 1] == 0x09BC:
            pieces.append(chr(0x09DF))
            ix += 2
        else:
            pieces.append(chr(a))
            ix += 1
    self.buffer.store_unicode("".join(pieces))
def assign_category(self, item):
    """Set ``item``'s categories from the USE_Category UCD property.

    ``syllabic_category`` receives the USE_Category value, or "X" when the
    property is absent; ``positional_category`` is always "x".
    """
    properties = ucd_data(item.codepoint)
    use_category = properties.get("USE_Category", "X")
    item.syllabic_category = use_category
    # Separate positional categories are not used, it's all in the syllabic_category
    item.positional_category = "x"
def final_reordering_syllable(self, start, end):
    """Run the final reordering pass over the syllable ``items[start:end]``.

    Steps performed:
      1. Virama glyphs that were both ligated and multiplied are marked as
         halants again ("H") and their ligated/multiplied flags cleared.
      2. The base consonant is located: the first item at or after the base
         position, adjusted for an unformed 'pref' candidate, for Malayalam
         below-base forms, and for trailing ZWJ / nukta / halant items.
      3. The "init" feature mask is set to True for every item in the
         syllable, then set to False on the first item when it is a
         pre-base matra at the start of a word.

    The structure appears to follow HarfBuzz's Indic final reordering;
    matra reordering itself is not implemented yet (see the XXX marker).

    Args:
        start: Index of the first item of the syllable.
        end: Index one past the last item of the syllable.
    """

    def cat(i):
        return self.buffer.items[i].syllabic_category

    def pos(i):
        return self.buffer.items[i].syllabic_position

    def swap(a, b):
        # Not called yet; kept for the unfinished matra reordering below.
        self.buffer.items[b], self.buffer.items[a] = (
            self.buffer.items[a],
            self.buffer.items[b],
        )

    def is_joiner(n):
        return cat(n) == "ZWJ" or cat(n) == "ZWNJ"

    def is_halant(n):
        return cat(n) == "H"

    def is_consonant(n):
        isc = cat(n)
        is_medial = isc == "CM"
        return isc in [
            "C", "CS", "Ra", "V", "PLACEHOLDER", "DOTTEDCIRCLE"
        ] or is_medial

    virama = self.config["virama"]
    virama_item = BufferItem.new_unicode(virama)
    virama_item.map_to_glyph(self.buffer.font)
    if virama_item.glyph != ".notdef":
        for i in range(start, end):
            candidate = self.buffer.items[i]
            if (
                candidate.glyph == virama_item.glyph
                and candidate.ligated
                and candidate.multiplied
            ):
                candidate.syllabic_category = "H"
                candidate.ligated = False
                candidate.multiplied = False

    try_pref = any(
        "pref" in item.feature_masks and item.feature_masks["pref"] == False
        for item in self.buffer.items
    )

    base = start
    while base < end:
        if pos(base) >= IndicPosition.BASE_C:
            if try_pref and base + 1 < end:
                for i in range(base + 1, end):
                    item = self.buffer.items[i]
                    if not item.feature_masks.get("pref", True):
                        if not (
                            item.substituted
                            and (item.ligated and not item.multiplied)
                        ):
                            # A 'pref' candidate that formed nothing: the
                            # base is around here.
                            base = i
                            while base < end and is_halant(base):
                                base = base + 1
                            # Write the attribute that pos() reads, so the
                            # checks below see the new base position.
                            self.buffer.items[
                                base
                            ].syllabic_position = IndicPosition.BASE_C
                            try_pref = False
                        break
            if self.buffer.script == "Malayalam":
                # Skip over unformed below-base forms.
                i = base + 1
                while i < end:
                    while i < end and is_joiner(i):
                        i = i + 1
                    if i == end or not is_halant(i):
                        break
                    i = i + 1  # Skip the halant.
                    while i < end and is_joiner(i):
                        i = i + 1
                    if (
                        i < end
                        and is_consonant(i)
                        and pos(i) == IndicPosition.BELOW_C
                    ):
                        base = i
                        self.buffer.items[
                            base
                        ].syllabic_position = IndicPosition.BASE_C
                    i = i + 1
            if start < base and pos(base) > IndicPosition.BASE_C:
                base = base - 1
            break
        base = base + 1

    # No base found: step back over a trailing ZWJ (the item just before
    # ``base``).
    if base == end and start < base and cat(base - 1) == "ZWJ":
        base = base - 1
    if base < end:
        while start < base and cat(base) in ["N", "H"]:
            base = base - 1

    # Reorder matras
    if start + 1 < end and start < base:
        new_pos = base - 1
        if base == end:
            new_pos = base - 2
        # XXX: new_pos is computed but not used yet; matra reordering is
        # not implemented.

    for i in range(start, end):
        self.buffer.items[i].feature_masks["init"] = True

    if pos(start) == IndicPosition.PRE_M:
        # NOTE(review): despite its name this is a straight copy of
        # unicode_map, not an inversion. If unicode_map maps codepoints to
        # glyph names, the glyph lookup below always falls back to 0 --
        # confirm the intended direction.
        reverse_map = {
            k: v for k, v in self.buffer.font.unicode_map.items()
        }
        if start == 0 or ucd_data(
            reverse_map.get(self.buffer.items[start - 1].glyph, 0)
        )["General_Category"] not in [
            "Cf", "Cn", "Co", "Cs", "Ll", "Lm", "Lo", "Lt", "Lu",
            "Mc", "Me", "Mn"
        ]:
            self.buffer.items[start].feature_masks["init"] = False
# Pretty-print the Unicode Character Database record for one character.
#
# The single command-line argument is either a literal character or a
# hexadecimal codepoint (optionally prefixed with "U+" or "0x").
import sys
from pprint import pprint

from youseedee import ucd_data

if len(sys.argv) < 2 or not sys.argv[1]:
    # Exit with a readable message instead of an IndexError/TypeError traceback.
    sys.exit("usage: %s CHARACTER_OR_HEX_CODEPOINT" % sys.argv[0])

char = sys.argv[1]
if len(char) > 1:
    # More than one character: interpret the argument as a hex codepoint.
    hex_digits = char[2:] if char[:2] in ("U+", "u+") else char
    try:
        codepoint = int(hex_digits, 16)
    except ValueError:
        sys.exit("not a single character or hexadecimal codepoint: %r" % char)
    pprint(ucd_data(codepoint))
else:
    pprint(ucd_data(ord(char)))