def check_bidi(label, check_ltr=False):
    """Validate *label* against Bidi rules 1-6 and return ``True`` on success.

    A label that contains no ``R``/``AL``/``AN`` codepoint is accepted
    immediately unless ``check_ltr`` is true, in which case the
    left-to-right rules are still enforced.

    Raises:
        IDNABidiError: a codepoint has no known directionality, or one of
            the rules is violated.
    """
    # Bidi rules should only be applied if string contains RTL characters
    contains_rtl = False
    for position, codepoint in enumerate(label, 1):
        bidi_class = unicodedata.bidirectional(codepoint)
        if bidi_class == '':
            # String likely comes from a newer version of Unicode
            raise IDNABidiError(
                'Unknown directionality in label {0} at position {1}'.format(repr(label), position))
        if bidi_class in ('R', 'AL', 'AN'):
            contains_rtl = True
            break
    if not (contains_rtl or check_ltr):
        return True

    # Bidi rule 1: the first codepoint fixes the direction of the label.
    first_class = unicodedata.bidirectional(label[0])
    if first_class in ('R', 'AL'):
        is_rtl_label = True
    elif first_class == 'L':
        is_rtl_label = False
    else:
        raise IDNABidiError(
            'First codepoint in label {0} must be directionality L, R or AL'.format(repr(label)))

    if is_rtl_label:
        # Bidi rule 2 (allowed classes) and rule 3 (valid final classes)
        allowed_classes = ('R', 'AL', 'AN', 'EN', 'ES', 'CS', 'ET', 'ON', 'BN', 'NSM')
        ending_classes = ('R', 'AL', 'EN', 'AN')
        invalid_message = 'Invalid direction for codepoint at position {0} in a right-to-left label'
    else:
        # Bidi rule 5 (allowed classes) and rule 6 (valid final classes)
        allowed_classes = ('L', 'EN', 'ES', 'CS', 'ET', 'ON', 'BN', 'NSM')
        ending_classes = ('L', 'EN')
        invalid_message = 'Invalid direction for codepoint at position {0} in a left-to-right label'

    ends_validly = False
    numeral_class = False
    for position, codepoint in enumerate(label, 1):
        bidi_class = unicodedata.bidirectional(codepoint)
        if bidi_class not in allowed_classes:
            raise IDNABidiError(invalid_message.format(position))

        # Trailing NSM codepoints do not change whether the ending is valid.
        if bidi_class in ending_classes:
            ends_validly = True
        elif bidi_class != 'NSM':
            ends_validly = False

        # Bidi rule 4: a right-to-left label may not mix AN and EN numerals.
        if is_rtl_label and bidi_class in ('AN', 'EN'):
            if not numeral_class:
                numeral_class = bidi_class
            elif numeral_class != bidi_class:
                raise IDNABidiError('Can not mix numeral types in a right-to-left label')

    if not ends_validly:
        raise IDNABidiError('Label ends with illegal codepoint directionality')
    return True
def _rtl(text, default=False): if not text: return default first_character = text[0] if bidirectional(first_character) in ['RLE', 'RLO', 'R', 'AL']: return True elif bidirectional(first_character) in ['LRE', 'LRO', 'L']: return False elif len(text)>1: return _rtl(text[1:]) return default
def _check_true_dir(self, text): is_rtl = False is_ltr = False quoted_text = False last_inline_html_char_pos = text.rfind(">") if last_inline_html_char_pos > -1: it_here = text[last_inline_html_char_pos+1:] else: it_here = text for ch in it_here: res = UD.bidirectional(ch) if ch == '"': quoted_text = not quoted_text elif not quoted_text and res in {'R', 'AL'}: is_rtl = True elif not quoted_text and res == 'L': is_ltr = True #print(text, it_here, is_rtl, is_ltr) if is_rtl: return 'rtl' elif is_ltr: return 'ltr' else: return 'auto'
def _character_direction(ch): ch_bidi = bidirectional(ch) if ch_bidi in ['L', 'LRE', 'LRO']: return 'LTR' if ch_bidi in ['R', 'RLE', 'RLO', 'AL']: return 'RTL' return None
def info(self, char):
    """Return a dict of display information about the character *char*.

    The result has the keys ``code``, ``name``, ``char``, ``example``,
    ``category``, ``bidi`` and ``unihan``.  Written for Python 2: the name is
    decoded from ASCII and ``unicode()`` is used for the Unihan text.

    Raises:
        UnassignedCharacter: if the general category of *char* is ``Cn``.
    """
    category = unicodedata.category(char)
    if category == 'Cn':
        raise UnassignedCharacter

    category_name = self.categories[category]
    bidi_name = self.bidis[unicodedata.bidirectional(char)]
    name = unicodedata.name(char, 'an unnamed character').decode('ascii')

    # Choose a printable sample: nothing for "C*" categories and line or
    # paragraph separators, a dotted-circle base for marks other than "Mc".
    major = category[0]
    if major == 'C' or category in ('Zp', 'Zl'):
        example = u''
    elif major == 'M' and category[1] != 'c':
        example = u'\N{DOTTED CIRCLE}' + char
    else:
        example = char

    haninfo = u''
    if 'CJK' in name and 'IDEOGRAPH' in name:
        haninfo = unicode(Unihan(char))
        if haninfo:
            haninfo = u'. ' + haninfo + u'.'

    return {
        'code': u'%04X' % ord(char),
        'name': name.title().replace('Cjk', 'CJK'),
        'char': char,
        'example': example,
        'category': category_name.lower(),
        'bidi': bidi_name,
        'unihan': haninfo,
    }
def fix_rtl(string):
    """Wrap *string* in U+200E marks when it contains an RTL character.

    A character counts as RTL when its bidi class is one of ``R``, ``AL``,
    ``RLE``, ``RLO`` or ``RLI``.  Strings without such a character are
    returned unchanged.
    """
    rtl_classes = ['R', 'AL', 'RLE', 'RLO', 'RLI']
    has_rtl = any(unicodedata.bidirectional(char) in rtl_classes for char in string)
    if not has_rtl:
        return string
    # Hint that we're already in an LTR context, and want to be afterwards
    return u'\u200e' + string + u'\u200e'
def rtlString(source, lang):
    """Reorder *source* for display when *lang* starts with ``ar`` or ``he``.

    For any other (or empty) *lang*, or an empty *source*, *source* is
    returned unchanged.  Otherwise the characters are grouped into runs and
    the runs are assembled into a new string.
    """
    if lang and source and lang[0:2] in set([u"ar",u"he"]):
        line = []           # finished runs, in output order
        lineInsertion = 0   # insertion index for finished runs; never changed below
        words = []          # characters of the run currently being collected
        rtl = True          # direction of the current run; starts as RTL
        for c in source:
            bidi = unicodedata.bidirectional(c)
            if rtl:
                if bidi == u'L':
                    # An LTR character ends the RTL run: flush it to the front.
                    if words:
                        line.insert(lineInsertion, u''.join(words))
                        words = []
                    rtl = False
                elif bidi in (u'R', u'NSM', u'AN'):
                    # Still inside the RTL run; the character is collected below.
                    pass
                else:
                    # Any other class: flush the run, then place this single
                    # character in front of it without starting a new run.
                    if words:
                        line.insert(lineInsertion, u''.join(words))
                        words = []
                    line.insert(lineInsertion, c)
                    continue
            else:
                if bidi == u'R' or bidi == u'AN':
                    # Back to RTL: the collected LTR run goes at the end.
                    if words:
                        line.append(u''.join(words))
                        words = []
                    rtl = True
            words.append(c)
        if words:
            # NOTE(review): a trailing run is only emitted while in RTL mode; a
            # trailing LTR run appears to be dropped - confirm this is intended.
            if rtl:
                line.insert(0, u''.join(words))
        return u''.join(line)
    else:
        return source
def lookup(self):
    """Look up all the external references we need.

    Does nothing when ``self.uniNumber`` is ``None``.  Otherwise it sets
    ``uniLetter``, ``isMath`` (only for math code points), ``uniName``,
    ``uniNameProcessed``, ``bidiType`` and ``uniRangeName`` on ``self``.
    If the code point cannot be converted to a character, a message is
    printed and the remaining attributes are left untouched.
    """
    if self.uniNumber is None:
        return
    try:
        self.uniLetter = unicodeToChar(self.uniNumber)
    except Exception:
        # Narrowed from a bare ``except:`` so that KeyboardInterrupt and
        # SystemExit are no longer swallowed here.
        print("GlyphName valueerror for %04X" % self.uniNumber)
        return
    if self.uniNumber in mathUniNumbers:
        self.isMath = True
    try:
        # self.uniName = unicodedata.name(self.uniLetter)
        self.uniName = unicodelist.get(self.uniNumber)
        if self.uniName is None:
            self.uniNameProcessed = ""
        else:
            self.uniNameProcessed = self.uniName
        self.bidiType = unicodedata.bidirectional(self.uniLetter)
    except ValueError:
        # Reset everything derived from the code point.
        self.uniName = None
        self.uniNameProcessed = ""
        self.uniLetter = None
        self.bidiType = None
    self.uniRangeName = getRangeName(self.uniNumber)
def get_base_direction(text):
    """Find the base direction of a text string according to the first
    character with strong bidi type.

    Returns ``0`` for LTR, ``1`` for RTL and ``-1`` for undefined (no strong
    characters found).
    """
    text = force_unicode(text)

    # First bidi class that belongs to the strong types, if any.
    first_strong = next(
        (bidi_type for bidi_type in map(bidirectional, text)
         if bidi_type in _strong_types),
        None,
    )
    if not first_strong:
        # Text composed of weak bidi characters.
        return -1
    return 1 if first_strong in _rtl_types else 0
def __init__(self, myData, headerData, comboData):
    """Build the window: wire the buttons, table model and combo box.

    After the widgets are connected, the headword of the combo model's
    current entry is inspected; if it contains an ``R`` or ``AL`` character
    the combo box is switched to right-to-left layout.
    """
    QtGui.QMainWindow.__init__(self)
    self.ui = Ui_MainWindow()
    self.ui.setupUi(self)
    self.ui.OKButton.clicked.connect(self.OKClicked)
    self.ui.CancelButton.clicked.connect(self.CancelClicked)

    self.__model = LinkerTable(myData, headerData)
    self.ui.tableView.setModel(self.__model)
    # Prepare the checkbox column
    row_count = self.__model.rowCount(self)
    for row in range(row_count):
        self.ui.tableView.openPersistentEditor(self.__model.index(row, 0))

    self.__combo_model = LinkerCombo(comboData)
    self.ui.targetLexCombo.setModel(self.__combo_model)
    self.ret_val = 0
    self.cols = 7
    self.ui.targetLexCombo.currentIndexChanged.connect(self.ComboClicked)
    self.ui.FilterCheckBox.clicked.connect(self.FilterClicked)
    self.ComboClicked()

    headword = self.__combo_model.getCurrentHPG().getHeadword()
    # Check for right to left data and set the combobox direction if needed
    headword_is_rtl = any(
        unicodedata.bidirectional(char) in (u'R', u'AL') for char in headword
    )
    if headword_is_rtl:
        self.ui.targetLexCombo.setLayoutDirection(QtCore.Qt.RightToLeft)
        self.__combo_model.setRTL(True)
def get_embedding_levels(text, storage, upper_is_rtl=False, debug=False):
    """Fill ``storage['chars']`` with one entry per character of *text*.

    Each entry is a dict with the keys ``ch``, ``level`` (taken from
    ``storage['base_level']``), ``type`` and ``orig`` (both the bidi class).
    On UCS-2 builds a surrogate is held back and joined with the following
    code unit.  With ``upper_is_rtl`` upper-case characters are typed ``R``.

    Returns ``True`` if at least one character ended up with type ``R``.
    NOTE(review): characters of class ``AL`` do not set the flag - confirm
    that this is intended.
    """
    prev_surrogate = False
    base_level = storage['base_level']
    has_rtl = False
    # preset the storage's chars
    for _ch in text:
        if _IS_UCS2 and (_SURROGATE_MIN <= ord(_ch) <= _SURROGATE_MAX):
            prev_surrogate = _ch
            continue
        elif prev_surrogate:
            _ch = prev_surrogate + _ch
            prev_surrogate = False
        if upper_is_rtl and _ch.isupper():
            bidi_type = 'R'
        else:
            try:
                bidi_type = bidirectional(_ch)
            except Exception:
                # Best effort: an unclassifiable unit gets no type.  Narrowed
                # from a bare ``except:`` so KeyboardInterrupt/SystemExit
                # still propagate.
                bidi_type = None
        has_rtl |= (bidi_type == 'R')
        storage['chars'].append({
            'ch': _ch,
            'level': base_level,
            'type': bidi_type,
            'orig': bidi_type,
        })
    if debug:
        debug_storage(storage, base_info=True)
    return has_rtl
def drawText(canvas, x, y, text, en = False, bold = False, size = 12):
    """Draw *text* right-aligned at *x*, with *y* measured from the page top.

    The text is scanned up to the first ``AL``/``AN``/``FA`` character; if
    one is found the text is fused and shaped.  If that happened, or an
    ``R``/``RLE``/``RLO`` character was seen before it, the text is passed
    through ``get_display``.  The font is ``BNazanin`` when *bold* is set,
    otherwise ``Nazanin``.  The *en* argument is not used.
    """
    needs_shaping = False
    needs_reordering = False
    for char in text:
        bidi_class = unicodedata.bidirectional(char)
        if bidi_class in ("AL", "AN", "FA"):
            needs_shaping = True
            needs_reordering = True
            break
        if bidi_class in ("R", "RLE", "RLO"):
            needs_reordering = True

    rendered = text
    if needs_shaping:
        rendered = a_process.shape(a_forms.fuse(rendered))
    if needs_reordering:
        rendered = get_display(rendered)

    canvas.setFont('BNazanin' if bold else 'Nazanin', size)
    canvas.drawRightString(x, canvas._pagesize[1] - y, rendered)
def find_bidi(self, el):
    """Get directionality from element text.

    Walks the children of *el* in order.  Child tags are searched
    recursively unless they are ``bdi``/``script``/``style``/``textarea`` or
    carry a ``dir`` value known to ``DIR_MAP``; special strings are skipped.
    Returns ``ct.SEL_DIR_LTR`` or ``ct.SEL_DIR_RTL`` for the first ``L`` or
    ``AL``/``R`` character found, or ``None`` if there is none.
    """
    excluded_tags = ('bdi', 'script', 'style', 'textarea')

    for child in self.get_children(el, tags=False):

        # Analyze child text nodes
        if self.is_tag(child):
            dir_value = util.lower(self.get_attribute_by_name(child, 'dir', ''))
            explicit_direction = DIR_MAP.get(dir_value, None)

            # Avoid analyzing certain elements specified in the specification.
            if self.get_tag(child) not in excluded_tags and explicit_direction is None:
                # Check directionality of this node's text
                found = self.find_bidi(child)
                if found is not None:
                    return found

            # Excluded, or direction could not be determined
            continue  # pragma: no cover

        # Skip `doctype` comments, etc.
        if self.is_special_string(child):
            continue

        # Analyze text nodes for directionality.
        for char in child:
            bidi_class = unicodedata.bidirectional(char)
            if bidi_class == 'L':
                return ct.SEL_DIR_LTR
            if bidi_class in ('AL', 'R'):
                return ct.SEL_DIR_RTL
    return None
def rtlString(source, lang):
    """Reorder *source* for display when *lang* starts with ``ar`` or ``he``.

    For any other (or empty) *lang*, or an empty/``None`` *source*, *source*
    is returned unchanged.  Otherwise the characters are grouped into runs
    and the runs are assembled into a new string.
    """
    # ``source`` is checked as well so that None is handed back instead of
    # failing in the loop below.
    if lang and source and lang[0:2] in {"ar","he"}:
        line = []           # finished runs, in output order
        lineInsertion = 0   # insertion index for finished runs; never changed below
        words = []          # characters of the run currently being collected
        rtl = True          # direction of the current run; starts as RTL
        for c in source:
            bidi = unicodedata.bidirectional(c)
            if rtl:
                if bidi == 'L':
                    # An LTR character ends the RTL run: flush it to the front.
                    if words:
                        line.insert(lineInsertion, ''.join(words))
                        words = []
                    rtl = False
                elif bidi in ('R', 'NSM', 'AN'):
                    # Still inside the RTL run; the character is collected below.
                    pass
                else:
                    # Any other class: flush the run, then place this single
                    # character in front of it without starting a new run.
                    if words:
                        line.insert(lineInsertion, ''.join(words))
                        words = []
                    line.insert(lineInsertion, c)
                    continue
            else:
                if bidi == 'R' or bidi == 'AN':
                    # Back to RTL: the collected LTR run goes at the end.
                    if words:
                        line.append(''.join(words))
                        words = []
                    rtl = True
            words.append(c)
        if words:
            # NOTE(review): a trailing run is only emitted while in RTL mode; a
            # trailing LTR run appears to be dropped - confirm this is intended.
            if rtl:
                line.insert(0, ''.join(words))
        return ''.join(line)
    else:
        return source
def _truncate_invalid_chars(value, length):
    '''Safety check: make sure we aren't truncating within the boundaries of a
    multibyte character. Also, append a left-to-right mark (U+200E) if the
    last character is RTL.

    ``value`` is converted with ``smart_str`` and cut to ``length`` (when
    ``length`` is truthy).  The result is then shortened one element at a
    time until it decodes as UTF-8 and, if it ends in an RTL character,
    leaves room for the mark.

    NOTE(review): presumably ``value`` is a UTF-8 byte string after
    ``smart_str`` (it is sliced and then ``.decode('utf8')``-ed) - confirm.
    NOTE(review): ``length - 3`` is evaluated even when ``length`` is falsy;
    a ``None`` length would raise there - verify against callers.
    '''
    value = smart_str(value)
    if length:
        value = value[:length]
    valid = False
    while not valid and len(value):
        try:
            test = value.decode('utf8')
            # check for RTL encoding without order marker terminator
            direction = bidirectional(test[-1])
            if direction in RTL_TYPES:
                # this is RTL, we need 3 bytes for the mark
                if len(value) > (length - 3):
                    # not enough room - keep chopping
                    raise ValueError('Not enough room to truncate')
                else:
                    test += u'\u200e' # left-to-right mark
                    return smart_str(test)
            else:
                valid = True
                del test
        except (UnicodeDecodeError, ValueError):
            # chop a letter off the end and try again
            value = value[:-1]
    return value
def print_Unicode_info(char, short):
    """Print Unicode properties of *char* to standard output.

    With a truthy *short* a single line is printed: the character, a tab,
    its name and its code point in upper-case hex.  Otherwise one labelled
    line per property is printed.
    """
    name = unicodedata.name(char, "UNKNOWN")
    dec_codepoint = ord(char)
    hex_codepoint = hex(dec_codepoint)
    lower = char.lower()
    upper = char.upper()
    category = unicodedata.category(char)
    bidi_class = unicodedata.bidirectional(char)
    mirrored = unicodedata.mirrored(char) == 1
    nfc = unicodedata.normalize("NFC", char)
    nfd = unicodedata.normalize("NFD", char)

    if short:
        hex_digits = str(hex_codepoint).upper().replace("0X", "")
        print(char + "\t" + name + " (U+" + hex_digits + ")")
        return

    report = (
        ("Name ", name),
        ("Character ", char),
        ("Dec Codepoint ", str(dec_codepoint)),
        ("Hex Codepoint ", str(hex_codepoint)),
        ("Lowercase ", lower),
        ("Uppercase ", upper),
        ("Category ", category),
        ("Bidirectional ", bidi_class),
        ("Mirrored ", str(mirrored)),
        ("NFC ", nfc),
        ("NFD ", nfd),
    )
    for label, value in report:
        print(label + value)
def lookup(self):
    """Look up all the external references we need.

    Does nothing when ``self.uniNumber`` is ``None``.  Otherwise it sets
    ``uniLetter``, ``isMath`` (only for math code points), ``uniName``,
    ``uniNameProcessed``, ``bidiType`` and ``uniRangeName`` on ``self``.
    If the code point cannot be converted to a character the method returns
    silently and the remaining attributes are left untouched.
    """
    if self.uniNumber is None:
        return
    try:
        self.uniLetter = unicodeToChar(self.uniNumber)
    except Exception:
        # Narrowed from a bare ``except:`` so that KeyboardInterrupt and
        # SystemExit are no longer swallowed here.
        # print("GlyphName value error for %04X" % self.uniNumber)
        return
    if self.uniNumber in mathUniNumbers:
        self.isMath = True
    try:
        self.uniName = unicodelist.get(self.uniNumber)
        if self.uniName is None:
            self.uniNameProcessed = ""
        else:
            self.uniNameProcessed = self.uniName
        # NOTE: this is still a dependency on the unicodedata module.
        # Would be nice to extract this data directly from the unicode data
        # but the algorithm is not trivial.
        self.bidiType = unicodedata.bidirectional(self.uniLetter)
    except ValueError:
        # Reset everything derived from the code point.
        self.uniName = None
        self.uniNameProcessed = ""
        self.uniLetter = None
        self.bidiType = None
    except Exception:
        # Any other failure is reported but does not abort the lookup.
        import traceback
        traceback.print_exc()
    self.uniRangeName = getRangeName(self.uniNumber)
def match_dir(self, el, directionality):
    """Check directionality.

    Returns whether the direction resolved for *el* equals *directionality*
    (compared against ``ct.SEL_DIR_LTR`` / ``ct.SEL_DIR_RTL``).  The direction
    comes from the element's ``dir`` attribute via ``DIR_MAP``; a mapped value
    of ``0`` is handled as "detect from content", and a missing value defers
    to the parent element (or to left-to-right at the root).
    """

    # If we have to match both left and right, we can't match either.
    if directionality & ct.SEL_DIR_LTR and directionality & ct.SEL_DIR_RTL:
        return False

    # Element has defined direction of left to right or right to left
    direction = DIR_MAP.get(util.lower(self.get_attribute_by_name(el, 'dir', '')), None)
    if direction not in (None, 0):
        return direction == directionality

    # Element is the document element (the root) and no direction assigned, assume left to right.
    is_root = self.match_root(el)
    if is_root and direction is None:
        return ct.SEL_DIR_LTR == directionality

    # If `input[type=telephone]` and no direction is assigned, assume left to right.
    name = self.get_tag(el)
    is_input = name == 'input'
    is_textarea = name == 'textarea'
    is_bdi = name == 'bdi'
    itype = util.lower(self.get_attribute_by_name(el, 'type', '')) if is_input else ''
    if is_input and itype == 'tel' and direction is None:
        return ct.SEL_DIR_LTR == directionality

    # Auto handling for text inputs
    if ((is_input and itype in ('text', 'search', 'tel', 'url', 'email')) or is_textarea) and direction == 0:
        if is_textarea:
            # A textarea's value is the concatenation of its content strings.
            value = []
            for node in el.contents:
                if self.is_content_string(node):
                    value.append(node)
            value = ''.join(value)
        else:
            value = self.get_attribute_by_name(el, 'value', '')
        if value:
            # The first L, AL or R character decides the direction.
            for c in value:
                bidi = unicodedata.bidirectional(c)
                if bidi in ('AL', 'R', 'L'):
                    direction = ct.SEL_DIR_LTR if bidi == 'L' else ct.SEL_DIR_RTL
                    return direction == directionality
            # Assume left to right
            return ct.SEL_DIR_LTR == directionality
        elif is_root:
            return ct.SEL_DIR_LTR == directionality
        # Empty value: fall back to the parent's direction.
        return self.match_dir(self.get_parent(el), directionality)

    # Auto handling for `bdi` and other non text inputs.
    if (is_bdi and direction is None) or direction == 0:
        direction = self.find_bidi(el)
        if direction is not None:
            return direction == directionality
        elif is_root:
            return ct.SEL_DIR_LTR == directionality
        return self.match_dir(self.get_parent(el), directionality)

    # Match parents direction
    return self.match_dir(self.get_parent(el), directionality)
def text_dir(c):
    """Classify a character as 'R'/'L'/''.

    ``L`` maps to ``'L'``, ``R`` and ``AL`` map to ``'R'``; every other bidi
    class gives the empty string.
    """
    bidi_class = unicodedata.bidirectional(c)
    if bidi_class == 'L':
        return 'L'
    return 'R' if bidi_class in ('R', 'AL') else ''
def ub(self,bidi):
    """
    Match a character with a given Unicode bidirectional class

    Consumes one item via ``self.rule_anything()`` and returns its
    ``(value, error)`` pair when the bidi class of the value's first
    character equals *bidi*.

    Raises:
        _MaybeParseError: if the class differs; the error's second slot is
            replaced by the expectation ``"bidi:" + bidi`` first.
    """
    import unicodedata
    x, e = self.rule_anything()
    # Compare by value: ``is`` tests object identity, which is not guaranteed
    # for equal strings and made valid classes such as 'AL' fail to match.
    if unicodedata.bidirectional(x[0]) == bidi:
        return x, e
    else:
        e[1] = expected("bidi:"+ bidi)
        raise _MaybeParseError(*e)
def get_embedding_levels(text, storage, upper_is_rtl=False, debug=False):
    """Get the paragraph base embedding level and direction, set the storage
    to the array of chars.

    Writes ``base_level`` (0 or 1) and ``base_dir`` (``'L'`` or ``'R'``) into
    *storage* and appends one dict per character to ``storage['chars']``.
    With ``upper_is_rtl`` upper-case characters are treated as type ``R``.
    """

    def classify(char):
        # treat upper as RTL ?
        if upper_is_rtl and char.isupper():
            return 'R'
        return bidirectional(char)

    # P2: the first character of type L, AL or R decides the level.
    base_level = None
    for char in text:
        first_type = classify(char)
        if first_type in ('AL', 'R'):
            base_level = 1
            break
        if first_type == 'L':
            base_level = 0
            break

    # P3: default to left-to-right when nothing decisive was found.
    if base_level is None:
        base_level = 0

    storage['base_level'] = base_level
    storage['base_dir'] = ('L', 'R')[base_level]

    # preset the storage's chars
    for char in text:
        char_type = classify(char)
        storage['chars'].append(
            {'ch': char, 'level': base_level, 'type': char_type, 'orig': char_type}
        )

    if debug:
        debug_storage(storage, base_info=True)
def categorize(lower, upper):
    """Place each character in the range of lower,upper into unicode_dict
    based on the directionality of the character.

    The range is half-open (``xrange(lower, upper)``).  Characters with an
    empty bidi class are filed under the key ``"UNK"``.
    """
    global unicode_dict
    for codepoint in xrange(lower, upper):
        char = unichr(codepoint)
        bidi_class = bidirectional(char) or "UNK"
        unicode_dict.setdefault(bidi_class, []).append(char)
def calc_paragraph_level(self, chars=None):
    """Applies P2 and P3.

    P2_ : In each paragraph, find the first character of type L, AL, or R
    while skipping over any characters between an isolate initiator and its
    matching PDI or, if it has no matching PDI, the end of the paragraph.

    P3_ : If a character is found in P2 and it is of type AL or R, then set
    the paragraph embedding level to one; otherwise, set it to zero.

    *chars* defaults to ``self.chars``; each item must provide a ``'ch'``
    key.  When no deciding character exists (including an empty sequence or
    text lying entirely inside isolates) the level for ``'L'`` is returned.

    .. _P2: http://www.unicode.org/reports/tr9/#P2
    .. _P3: http://www.unicode.org/reports/tr9/#P3
    """
    upper_is_rtl = self.upper_is_rtl
    if chars is None:
        chars = self.chars

    # Must exist before the loop: every iteration may ``continue`` (or there
    # may be no iteration at all), and the P3 check below reads it.
    base_level = None
    isolate_initiator_level = 0

    for ch in map(itemgetter('ch'), chars):
        bidi_type = bidirectional(ch)

        if bidi_type in ISOLATE_INITIATORS:
            isolate_initiator_level += 1
            continue

        if bidi_type == 'PDI' and isolate_initiator_level > 0:
            isolate_initiator_level -= 1
            continue

        # ignore everything from an isolate initiator till its matching PDI
        if isolate_initiator_level > 0:
            continue

        if upper_is_rtl and ch.isupper():
            bidi_type = 'R'

        base_level = PARAGRAPH_LEVELS.get(bidi_type)
        if base_level is not None:
            break

    if base_level is None:
        base_level = PARAGRAPH_LEVELS['L']
    return base_level
async def check_text(self, message):
    """Check the text of *message* and return the accumulated findings.

    One ``MessageTextBlacklisted`` is added per entry returned by
    ``self.storage.check_text``, and a ``MessageContainsRtl(3)`` is added if
    the text contains a character of bidi class ``AL``.  Returns ``None``
    when nothing was found.
    """
    text = message.message
    blacklisted = self.storage.check_text(text)

    # NOTE(review): the accumulator starts as None, so ``+=`` relies on the
    # finding types supporting addition to None - confirm.
    ret = None
    for txt, severity in blacklisted.items():
        ret += MessageTextBlacklisted(severity, txt)

    # "AL" is the Arabic Letter class
    if any(unicodedata.bidirectional(char) == "AL" for char in text):
        ret += MessageContainsRtl(3)
    return ret
def __init__(self, msg1, infotext, parent=None, monospaced=False):
    """Build and show the information dialog.

    ``msg1`` is used for the window title and the bold heading.  ``infotext``
    is either a single string, or an iterable whose items are strings
    (inserted as lines) or lists of rows (rendered as an embedded grid).
    ``monospaced`` applies a Monospace tag to the whole buffer; ``parent``,
    if given, becomes the dialog's transient parent.
    """
    self.xml = Glade(toplevel='infodialog')
    self.top = self.xml.toplevel
    self.top.set_icon(ICON)
    self.top.set_title("%s - Gramps" % msg1)
    label = self.xml.get_object('toplabel')
    label.set_text('<span weight="bold" size="larger">%s</span>' % msg1)
    label.set_use_markup(True)
    infoview = self.xml.get_object('infoview')
    infobuffer = Gtk.TextBuffer()
    infoview.set_buffer(infobuffer)
    if isinstance(infotext, str):
        infobuffer.set_text(infotext)
    else:
        for item in infotext:
            enditer = infobuffer.get_end_iter()
            if isinstance(item, str):
                infobuffer.insert(enditer, item + '\n')
            elif isinstance(item, list):
                grid = Gtk.Grid()
                grid.set_margin_start(6)
                grid.set_margin_end(6)
                grid.set_column_spacing(12)
                # The first character of the first cell decides the grid
                # direction.
                # NOTE(review): only class 'R' is tested, and an empty
                # list/row/cell would fail on the indexing - confirm.
                if unicodedata.bidirectional(item[0][0][0]) == 'R':
                    grid.set_direction(Gtk.TextDirection.RTL)
                for offset_y, row in enumerate(item):
                    for offset_x, col in enumerate(row):
                        cell = Gtk.Label(col)
                        cell.set_halign(Gtk.Align.END)
                        grid.attach(cell, offset_x, offset_y, 1, 1)
                grid.show_all()
                # Embed the grid at the end position captured above, then
                # terminate it with a newline at the new end of the buffer.
                anchor = infobuffer.create_child_anchor(enditer)
                infoview.add_child_at_anchor(grid, anchor)
                enditer = infobuffer.get_end_iter()
                infobuffer.insert(enditer, '\n')
    if monospaced:
        startiter, enditer = infobuffer.get_bounds()
        tag = infobuffer.create_tag(family="Monospace")
        infobuffer.apply_tag(tag, startiter, enditer)
    if parent:
        self.top.set_transient_for(parent)
    self.top.connect('response', self.destroy)
    self.top.show()
def paragraph_direction_mark(text):
    """
    Determine paragraph writing direction according to
    http://www.unicode.org/reports/tr9/#The_Paragraph_Level

    Returns either Unicode LTR mark or RTL mark.  The first character of
    class ``L``, ``AL`` or ``R`` decides; the LTR mark is the fallback.
    """
    mark_by_class = {'L': u'\u200E', 'AL': u'\u200F', 'R': u'\u200F'}
    for char in text:
        mark = mark_by_class.get(unicodedata.bidirectional(char))
        if mark is not None:
            return mark
    return u'\u200E'
def bad_unicode(wc):
    """Return ``False`` if the word ``wc[0]`` has a character without a bidi
    class, ``True`` otherwise.

    The word is converted with ``unicode()`` first if necessary.  On builds
    where ``sys.maxunicode`` is 0xffff a code unit in the range
    0xD800-0xDBFF is held back and joined with the following unit before
    the lookup.
    """
    word = wc[0]
    if not isinstance(word, unicode):
        word = unicode(word)

    narrow_build = sys.maxunicode == 0xffff
    pending_surrogate = False
    for unit in word:
        if narrow_build and (0xD800 <= ord(unit) <= 0xDBFF):
            pending_surrogate = unit
            continue
        if pending_surrogate:
            unit = pending_surrogate + unit
            pending_surrogate = False
        if unicodedata.bidirectional(unit) == '':
            return False
    return True
def overview(tree_item):
    """ Returns an overview of the character

    The character is taken from ``tree_item.obj`` and its properties are
    substituted positionally into ``TEMPLATE``.
    """
    char = tree_item.obj
    fields = (
        unicodedata.name(char, '<NO NAME AVAILABLE>'),
        char,
        unicodedata.decimal(char, ''),
        unicodedata.digit(char, ''),
        unicodedata.numeric(char, ''),
        unicodedata.category(char),
        unicodedata.bidirectional(char),
        unicodedata.combining(char),
        unicodedata.east_asian_width(char),
        unicodedata.mirrored(char),
        unicodedata.decomposition(char),
    )
    return TEMPLATE.format(*fields)
def paragraph_direction_mark(text):
    """
    Determine paragraph writing direction according to
    http://www.unicode.org/reports/tr9/#The_Paragraph_Level

    Returns either Unicode LTR mark or RTL mark.  The first character of
    class ``L``, ``AL`` or ``R`` decides; the LTR mark is the fallback.
    """
    deciding_classes = ('L', 'AL', 'R')
    first_strong = next(
        (bidi for bidi in map(unicodedata.bidirectional, text)
         if bidi in deciding_classes),
        'L',
    )
    return '\u200E' if first_strong == 'L' else '\u200F'
def obfuscation_machine(use_unicode=False, identifier_length=1):
    """
    A generator that returns short sequential combinations of lower and
    upper-case letters that will never repeat.

    If *use_unicode* is ``True``, use nonlatin cyrillic, arabic, and syriac
    letters instead of the usual ABCs.

    The *identifier_length* represents the length of the string to return
    using the aforementioned characters.  Every yielded name is prefixed
    with ``"_0x"``; names found in ``analyze.reserved_words`` are skipped.
    Once all permutations of one length are exhausted the length grows by
    one, so the generator never terminates.
    """
    # This generates a list of the letters a-z:
    lowercase = list(map(chr, range(97, 123)))
    # Same thing but ALL CAPS (range() excludes its stop value, so 91 is
    # needed to include 'Z'):
    uppercase = list(map(chr, range(65, 91)))
    # digits 0-9 (58 is needed to include '9')
    digit = list(map(chr, range(48, 58)))
    if use_unicode:
        # Python 3 lets us have some *real* fun:
        allowed_categories = ('LC', 'Ll', 'Lu', 'Lo')
        # All the fun characters start at 1580 (hehe):
        big_list = list(map(chr, range(1580, HIGHEST_UNICODE)))
        max_chars = 1000  # Ought to be enough for anybody :)
        combined = []
        # Characters already chosen; a duplicate in the pool would make
        # permutations() yield the same identifier twice.
        chosen = set()
        rtl_categories = ('AL', 'R')  # AL == Arabic, R == Any right-to-left
        last_orientation = 'L'        # L = Any left-to-right
        # Find a good mix of left-to-right and right-to-left characters
        while len(combined) < max_chars:
            char = choice(big_list)
            if char in chosen:
                continue
            if unicodedata.category(char) in allowed_categories:
                orientation = unicodedata.bidirectional(char)
                # Only accept a character whose orientation differs from
                # the previously examined one.
                if last_orientation in rtl_categories:
                    if orientation not in rtl_categories:
                        combined.append(char)
                        chosen.add(char)
                else:
                    if orientation in rtl_categories:
                        combined.append(char)
                        chosen.add(char)
                last_orientation = orientation
    else:
        combined = lowercase + uppercase + digit
    shuffle(combined)  # Randomize it all to keep things interesting
    while True:
        for perm in permutations(combined, identifier_length):
            perm = "_0x" + "".join(perm)
            if perm not in analyze.reserved_words:  # Can't replace reserved words
                yield perm
        identifier_length += 1
def UniqueUni(codepoint):
    """Print the character for *codepoint* and return its properties.

    *codepoint* is anything ``int()`` accepts.  The returned dict has the
    keys ``unicode`` (the character), ``bidir``, ``hexa`` (``U+`` followed by
    lower-case hex digits), ``name`` (a single space if unnamed), ``number``
    and ``cat``.
    """
    number = int(codepoint)
    letter = chr(number)
    print(str(letter))
    return {
        'unicode': u"" + letter + "",
        'bidir': unicodedata.bidirectional(letter),
        'hexa': hex(ord(letter)).replace("0x", "U+"),
        'name': unicodedata.name(letter, ' '),
        'number': number,
        'cat': unicodedata.category(letter),
    }
def char2info(ch):
    """Return a dict of Unicode properties of the character *ch*.

    Keys: ``ch``, ``name``, ``decimal``, ``digit``, ``numeric`` (each ``None``
    when unavailable), ``category``, ``bidirectional``, ``combining``,
    ``east_asian_width``, ``mirrored``, ``decomposition``, ``unicode`` (the
    code point) and ``unicode_hex``.
    """
    codepoint = ord(ch)
    return {
        'ch': ch,
        'name': U.name(ch, None),
        'decimal': U.decimal(ch, None),
        'digit': U.digit(ch, None),
        'numeric': U.numeric(ch, None),
        'category': U.category(ch),
        'bidirectional': U.bidirectional(ch),
        'combining': U.combining(ch),
        'east_asian_width': U.east_asian_width(ch),
        'mirrored': U.mirrored(ch),
        'decomposition': U.decomposition(ch),
        'unicode': codepoint,
        'unicode_hex': hex(codepoint),
    }
def is_right_to_left(text):
    r'''Check whether a text is right-to-left text or not

    :param text: The text to check
    :type text: string
    :rtype: boolean

    See: http://unicode.org/reports/tr9/#P2

    TR9> In each paragraph, find the first character of type L, AL, or R
    TR9> while skipping over any characters between an isolate initiator
    TR9> and its matching PDI or, if it has no matching PDI, the end of the
    TR9> paragraph

    Isolates may be nested, so the nesting depth is tracked and skipping
    only stops at the PDI that matches the outermost initiator.  A PDI
    without an open initiator is ignored.

    Examples (U+2066 is LRI, U+2067 is RLI, U+2069 is PDI, U+FDFC has
    class AL):

    >>> is_right_to_left('Hallo!')
    False

    >>> is_right_to_left('\ufdfc')
    True

    >>> is_right_to_left('\u2067\ufdfc\u2069')
    False

    >>> is_right_to_left('\u2067\ufdfc\u2069\ufdfc')
    True

    >>> is_right_to_left('a\u2067\ufdfc\u2069\ufdfc')
    False

    >>> is_right_to_left('\u2066a\u2069\ufdfc')
    True

    >>> is_right_to_left('\u2066a\u2066b\u2069\ufdfc\u2069')
    False
    '''
    isolate_depth = 0
    for char in text:
        bidi_cat = unicodedata.bidirectional(char)
        if bidi_cat in ('LRI', 'RLI', 'FSI'):
            isolate_depth += 1
        elif bidi_cat == 'PDI':
            if isolate_depth:
                isolate_depth -= 1
        elif isolate_depth == 0:
            if bidi_cat in ('AL', 'R'):
                return True
            if bidi_cat == 'L':
                return False
    return False
def test_compare_functions(self):
    """Compare ``unicodedata`` with ``unicodedb_5_2_0`` for every code point
    below 0x10000.

    For ``digit``, ``numeric`` and ``decimal`` a ``KeyError`` from the
    database counts as the default ``-1``.
    """

    def getX(fun, code):
        try:
            return getattr(unicodedb_5_2_0, fun)(code)
        except KeyError:
            return -1

    with_default = ('digit', 'numeric', 'decimal')
    without_default = ('category', 'bidirectional', 'decomposition',
                       'mirrored', 'combining')

    for code in range(0x10000):
        char = unichr(code)
        for prop in with_default:
            assert getattr(unicodedata, prop)(char, -1) == getX(prop, code)
        for prop in without_default:
            assert getattr(unicodedata, prop)(char) == getattr(unicodedb_5_2_0, prop)(code)
def obfuscation_machine(use_unicode=False, identifier_length=1):
    """
    A generator that returns short sequential combinations of lower and
    upper-case letters that will never repeat.

    If *use_unicode* is ``True``, use nonlatin cyrillic, arabic, and syriac
    letters instead of the usual ABCs.

    The *identifier_length* represents the length of the string to return
    using the aforementioned characters.  Names found in ``RESERVED_WORDS``
    are skipped.  Once all permutations of one length are exhausted the
    length grows by one, so the generator never terminates.
    """
    # This generates a list of the letters a-z:
    lowercase = list(map(chr, range(97, 123)))
    # Same thing but ALL CAPS (range() excludes its stop value, so 91 is
    # needed to include 'Z'):
    uppercase = list(map(chr, range(65, 91)))
    if use_unicode:
        # Python 3 lets us have some *real* fun:
        allowed_categories = ('LC', 'Ll', 'Lu', 'Lo')
        # All the fun characters start at 1580 (hehe):
        big_list = list(map(chr, range(1580, HIGHEST_UNICODE)))
        max_chars = 1000  # Ought to be enough for anybody :)
        combined = []
        # Characters already chosen; a duplicate in the pool would make
        # permutations() yield the same identifier twice.
        chosen = set()
        rtl_categories = ('AL', 'R')  # AL == Arabic, R == Any right-to-left
        last_orientation = 'L'        # L = Any left-to-right
        # Find a good mix of left-to-right and right-to-left characters
        while len(combined) < max_chars:
            char = choice(big_list)
            if char in chosen:
                continue
            if unicodedata.category(char) in allowed_categories:
                orientation = unicodedata.bidirectional(char)
                # Only accept a character whose orientation differs from
                # the previously examined one.
                if last_orientation in rtl_categories:
                    if orientation not in rtl_categories:
                        combined.append(char)
                        chosen.add(char)
                else:
                    if orientation in rtl_categories:
                        combined.append(char)
                        chosen.add(char)
                last_orientation = orientation
    else:
        combined = lowercase + uppercase
    shuffle(combined)  # Randomize it all to keep things interesting
    while True:
        for perm in permutations(combined, identifier_length):
            perm = "".join(perm)
            if perm not in RESERVED_WORDS:  # Can't replace reserved words
                yield perm
        identifier_length += 1
def data(self, index, role):
    """Return the model data for *index* under the given Qt *role*.

    * EditRole, column 0: always ``1``.
    * ForegroundRole: a brush - dark green for column 1, dark blue for
      column 4, red for columns 3 and 6 when their values differ in the
      row, black otherwise.
    * DisplayRole: the cell value (first tuple element for column 0),
      wrapped in ``QString`` when it is a ``str``.
    * TextAlignmentRole: right alignment when the cell (column > 0) starts
      with an ``R``/``AL`` character.

    Anything else yields ``None``.
    """
    row, col = index.row(), index.column()

    if role == QtCore.Qt.EditRole and col == 0:
        # default to 1 every time so the user can just double-click
        return 1

    if role == QtCore.Qt.ForegroundRole:
        colour = QtCore.Qt.black
        if row >= 0:
            if col == 1:
                colour = QtCore.Qt.darkGreen
            elif col == 4:
                colour = QtCore.Qt.darkBlue
            elif col in (3, 6):
                # If there is a mismatch in grammatical category color it red
                if self.__localData[row][3] != self.__localData[row][6]:
                    colour = QtCore.Qt.red
        return QtGui.QBrush(QtGui.QColor(colour))

    if role == QtCore.Qt.DisplayRole:
        cell = self.__localData[row][col]
        value = cell[0] if col == 0 else cell  # column 0 holds a tuple
        return QtCore.QString(value) if type(value) == str else value

    if role == QtCore.Qt.TextAlignmentRole:
        # Check if we have right to left data in a column, if so align it right
        if col > 0:
            first_char = self.__localData[row][col][0]
            if unicodedata.bidirectional(first_char) in ('R', 'AL'):
                return QtCore.Qt.AlignRight | QtCore.Qt.AlignCenter

    return None
def main():
    """Decode the hex byte values given on the command line as UTF-8 and
    print the Unicode properties of the resulting character.

    Any exception is reported as ``ERROR: ...`` instead of propagating;
    lines printed before the failure remain on the output.
    """
    with_default = (
        ('name: %s', unicodedata.name),
        ('decimal: %s', unicodedata.decimal),
        ('digit: %s', unicodedata.digit),
        ('numeric: %s', unicodedata.numeric),
    )
    without_default = (
        ('category: %s', unicodedata.category),
        ('bidirectional: %s', unicodedata.bidirectional),
        ('combining: %s', unicodedata.combining),
        ('east_asian_width: %s', unicodedata.east_asian_width),
        ('mirrored: %s', unicodedata.mirrored),
        ('decomposition: %s', unicodedata.decomposition),
    )
    try:
        raw = bytes(int(arg, 16) for arg in sys.argv[1:])
        c = raw.decode('utf8')
        print('gryph: %s' % c)
        print('codepoint: U+%x' % ord(c))
        for template, lookup in with_default:
            print(template % lookup(c, 'Unknown'))
        for template, lookup in without_default:
            print(template % lookup(c))
    except Exception as ex:
        print('ERROR: %s' % ex)
def get_base_level(text, upper_is_rtl=False):
    """Get the paragraph base embedding level. Returns 0 for LTR, 1 for RTL.

    `text` a unicode object.

    Set `upper_is_rtl` to True to treat upper case chars as strong 'R' for
    debugging (default: False).
    """
    pending_surrogate = False

    # P2: the first character of type L, AL or R decides.
    for unit in text:
        # surrogate in case of ucs2: hold it back and join it with the next unit
        if _IS_UCS2 and (_SURROGATE_MIN <= ord(unit) <= _SURROGATE_MAX):
            pending_surrogate = unit
            continue
        if pending_surrogate:
            unit = pending_surrogate + unit
            pending_surrogate = False

        # treat upper as RTL ?
        if upper_is_rtl and unit.isupper():
            return 1

        bidi_type = bidirectional(unit)
        if bidi_type in ('AL', 'R'):
            return 1
        if bidi_type == 'L':
            return 0

    # P3: nothing decisive found
    return 0
def prepare(self):
    """Setup the initial chars and their attributes

    Clears ``self.chars`` and appends one dict per item of
    ``self.iter_text()`` with the keys ``ch``, ``level`` (``None``), ``type``
    and ``orig`` (both the bidi class, or ``'R'`` for upper-case characters
    when ``self.upper_is_rtl`` is set).
    """
    treat_upper_as_rtl = self.upper_is_rtl
    self.chars.clear()

    for char in self.iter_text():
        char_type = 'R' if treat_upper_as_rtl and char.isupper() else bidirectional(char)
        entry = {'ch': char, 'level': None, 'type': char_type, 'orig': char_type}
        self.chars.append(entry)
def test_function_checksum(self):
    """Hash selected properties of every code point and compare the digest
    with ``self.expectedchecksum``.

    The numeric properties come from ``self.db``; the remaining ones are
    read from the ``unicodedata`` module.
    """
    digest = hashlib.sha1()  # nosec: B303
    db = self.db
    for codepoint in range(sys.maxunicode + 1):
        char = chr(codepoint)
        # Properties
        numeric_fields = [
            format(lookup(char, -1), ".12g")
            for lookup in (db.digit, db.numeric, db.decimal)
        ]
        other_fields = [
            unicodedata.category(char),
            unicodedata.bidirectional(char),
            unicodedata.decomposition(char),
            str(unicodedata.mirrored(char)),
            str(unicodedata.combining(char)),
        ]
        digest.update(''.join(numeric_fields + other_fields).encode("ascii"))
    self.assertEqual(digest.hexdigest(), self.expectedchecksum)
def format_message(text):
    """
    Convert facebook-style text to wordpress-style text

    Each line of *text* becomes a ``div`` built by ``div_with_direction``;
    blank lines become ``<br />``.  The direction is taken from the first
    non-blank character of the line: class ``L`` selects ``ltr``, classes
    ``R`` and ``AL`` select ``rtl``.  Any other class keeps the direction of
    the previous line, starting from ``DEFAULT_TEXT_DIRECTION``.
    """
    direction = DEFAULT_TEXT_DIRECTION
    divs = []
    for line in text.split('\n'):
        stripped = line.strip()
        if not stripped:
            divs.append('<br />')
            continue
        line_bidi = unicodedata.bidirectional(stripped[0])
        if line_bidi == 'L':
            direction = 'ltr'
        elif line_bidi in ('R', 'AL'):
            # 'AL' (Arabic letters) is a strong right-to-left class as well;
            # testing 'R' alone left Arabic lines in the previous direction.
            direction = 'rtl'
        divs.append(div_with_direction(line, direction))
    return '\n'.join(divs)
def display(text, right_to_left=False, return_log_pos=False, upper_is_rtl=False):
    """
    Returns `text` in display form. `right_to_left` determines the base
    direction. If `return_log_pos` is `True`, the original logical positions
    of the characters will also be returned, which is useful if you need to
    retain logical order but calculate display metrics.

    With `upper_is_rtl` upper-case characters are treated as type ``R``.
    When logical positions are requested the result is a list of
    ``(char, log_pos)`` pairs and mirroring is not applied.
    """
    if right_to_left:
        base_level, base_direction = 1, "R"
    else:
        base_level, base_direction = 0, "L"

    def describe(log_pos, char):
        char_type = "R" if upper_is_rtl and char.isupper() else bidirectional(char)
        return {
            "ch": char,
            "level": base_level,
            "type": char_type,
            "orig": char_type,
            "log_pos": log_pos,
        }

    storage = {
        "base_level": base_level,
        "base_dir": base_direction,
        "chars": [describe(log_pos, char) for log_pos, char in enumerate(text)],
        "runs": deque(),
    }

    for stage in (
        explicit_embed_and_overrides,
        resolve_weak_types,
        resolve_neutral_types,
        resolve_implicit_levels,
        reorder_resolved_levels,
    ):
        stage(storage)

    if return_log_pos:
        return [(entry["ch"], entry["log_pos"]) for entry in storage["chars"]]

    apply_mirroring(storage)
    return "".join([entry["ch"] for entry in storage["chars"]])
def __init__(self, symbol):
    """Collect the Unicode properties of *symbol* as attributes.

    Unavailable values default to ``'NO_NAME_FOUND'`` for the name and
    ``-1`` for ``decimal``/``digit``/``numeric``.  The four normalisation
    forms are stored as ``normalize_nfc``, ``normalize_nkfc``,
    ``normalize_nfd`` and ``normalize_nkfd``.  When
    ``Config.debug['unicode']`` is set, ``print_debug()`` is called.
    """
    self.symbol = symbol
    self.name = u.name(symbol, 'NO_NAME_FOUND')

    for prop in ('decimal', 'digit', 'numeric'):
        setattr(self, prop, getattr(u, prop)(self.symbol, -1))

    for prop in ('category', 'bidirectional', 'combining',
                 'east_asian_width', 'mirrored', 'decomposition'):
        setattr(self, prop, getattr(u, prop)(self.symbol))

    # attribute name -> normalisation form
    for attribute, form in (('normalize_nfc', 'NFC'),
                            ('normalize_nkfc', 'NFKC'),
                            ('normalize_nfd', 'NFD'),
                            ('normalize_nkfd', 'NFKD')):
        setattr(self, attribute, u.normalize(form, self.symbol))

    if Config.debug['unicode']:
        self.print_debug()
def test_compare_functions(self):
    """Compare `unicodedb_4_1_0` with the host `unicodedata` for every
    code point below 0x10000."""
    import unicodedata  # CPython implementation

    def expected(fun, code):
        # Code points listed in self.diff_numeric are expected to yield
        # the -1 fallback for 'numeric'; a KeyError from the database is
        # mapped to the same fallback.
        if fun == 'numeric' and code in self.diff_numeric:
            return -1
        try:
            return getattr(unicodedb_4_1_0, fun)(code)
        except KeyError:
            return -1

    with_default = ('digit', 'numeric', 'decimal')
    direct = ('category', 'bidirectional', 'decomposition',
              'mirrored', 'combining')

    for code in range(0x10000):
        char = unichr(code)
        for fun in with_default:
            assert getattr(unicodedata, fun)(char, -1) == expected(fun, code)
        for fun in direct:
            assert getattr(unicodedata, fun)(char) == getattr(unicodedb_4_1_0, fun)(code)
def test_ipy2_gh357(self):
    """https://github.com/IronLanguages/ironpython2/issues/357

    Checks the unicodedata properties reported for U+4E2D.
    """
    import unicodedata

    han = u'\u4e2d'

    if is_cli:
        expected_name = '<CJK IDEOGRAPH, FIRST>..<CJK IDEOGRAPH, LAST>'
    else:
        expected_name = 'CJK UNIFIED IDEOGRAPH-4E2D'
    self.assertEqual(unicodedata.name(han), expected_name)

    # Each numeric lookup raises ValueError without a default and
    # returns the supplied default otherwise.
    for lookup in (unicodedata.decimal, unicodedata.digit, unicodedata.numeric):
        self.assertRaises(ValueError, lookup, han)
        self.assertEqual(lookup(han, 0), 0)

    self.assertEqual(unicodedata.category(han), 'Lo')
    self.assertEqual(unicodedata.bidirectional(han), 'L')
    self.assertEqual(unicodedata.combining(han), 0)
    self.assertEqual(unicodedata.east_asian_width(han), 'W')
    self.assertEqual(unicodedata.mirrored(han), 0)
    self.assertEqual(unicodedata.decomposition(han), '')
def find_bidi(self, el):
    """Get directionality from element text.

    Walks the children of `el` in order and returns `ct.SEL_DIR_LTR` or
    `ct.SEL_DIR_RTL` for the first character whose bidirectional class is
    "L", or "AL"/"R" respectively.  Returns `None` when no such character
    is found.
    """
    excluded_tags = ("bdi", "script", "style", "textarea")

    for child in self.get_children(el, tags=False):
        if self.is_tag(child):
            # Elements excluded by tag name, or carrying a `dir` value
            # known to DIR_MAP, are not descended into.
            dir_value = util.lower(self.get_attribute_by_name(child, "dir", ""))
            explicit = DIR_MAP.get(dir_value, None)
            if self.get_tag(child) in excluded_tags or explicit is not None:
                continue  # pragma: no cover

            # Recurse into the element's own text.
            nested = self.find_bidi(child)
            if nested is not None:
                return nested

            # Direction could not be determined
            continue  # pragma: no cover

        # Skip `doctype` comments, etc.
        if self.is_special_string(child):
            continue

        # Scan the text node for the first strongly-typed character.
        for char in child:
            bidi_class = unicodedata.bidirectional(char)
            if bidi_class == "L":
                return ct.SEL_DIR_LTR
            if bidi_class in ("AL", "R"):
                return ct.SEL_DIR_RTL
    return None
def info(self, char):
    """Build a dictionary describing `char`.

    Raises `UnassignedCharacter` when the general category is 'Cn'.
    The category and bidi labels are read from `self.categories` and
    `self.bidis`.

    NOTE(review): `.decode('ascii')` is called on the result of
    `unicodedata.name`, so this presumably targets Python 2 -- confirm.
    """
    general_category = unicodedata.category(char)
    if general_category == 'Cn':
        raise UnassignedCharacter

    category_label = self.categories[general_category]
    bidi_label = self.bidis[unicodedata.bidirectional(char)]
    raw_name = unicodedata.name(char, 'an unnamed character').decode('ascii')

    major = general_category[0]
    if major == 'C' or general_category in ('Zp', 'Zl'):
        # No sample glyph for 'C*' categories and line/paragraph separators.
        sample = u''
    elif major == 'M' and general_category[1] != 'c':
        # Marks other than 'Mc' are shown on a dotted circle.
        sample = u'\N{DOTTED CIRCLE}' + char
    else:
        sample = char

    result = {}
    result['code'] = u'%04X' % ord(char)
    result['name'] = raw_name.title().replace('Cjk', 'CJK')
    result['char'] = char
    result['example'] = sample
    result['category'] = category_label.lower()
    result['bidi'] = bidi_label
    return result
def get_direction(char):
    """Return character direction.

    Looks up the bidirectional class of `char` via `ud.bidirectional` and
    maps it through the module-level `bidi` table.  Any ordinary failure
    (for example a class missing from the table, or an argument that is
    not a single character) yields the empty string.
    """
    try:
        return bidi[ud.bidirectional(char)]
    # Best-effort lookup, but catch `Exception` rather than everything so
    # that KeyboardInterrupt / SystemExit still propagate.
    except Exception:  # pylint: disable=broad-except
        return ''
def is_RandALCat(c: str) -> bool:
    """Return True if the bidirectional class of `c` is 'R' or 'AL'."""
    bidi_class = unicodedata.bidirectional(c)
    return bidi_class == 'R' or bidi_class == 'AL'
def test_against_unicodedata():
    '''
    Check against `unicodedata` or `unicodedata2` if available with the
    correct version of Unicode.

    Three passes are made over every code point:

    1. names, numeric properties, general category, bidi class, combining
       class, mirroring and decomposition against ``ucdf.unicodedata``;
    2. bidi class against ``ucdf.derivedbidiclass``;
    3. East Asian width against ``ucdf.eastasianwidth`` and
       ``ucdf.derivedeastasianwidth``.

    Raises ``Exception`` if no suitable `unicodedata` module is available.
    '''
    if unicodedata is None:
        raise Exception(
            'Packages unicodedata and unicodedata2 are not available with the necessary version of Unicode ({0}); many consistency tests were omitted'
            .format(mdl.UNICODE_VERSION))
    ucdf = mdl.UCDFiles()
    ud = ucdf.unicodedata
    for cp in range(0, 0x10FFFF + 1):
        c = chr(cp)
        if cp not in ud:
            continue
        entry = ud[cp]

        name = unicodedata.name(c, None)
        if name is None:
            # Handle missing names in unicodedata
            # Compare Table 4-13 in Unicode Standard
            # http://www.unicode.org/versions/Unicode9.0.0/ch04.pdf
            if 0x17000 <= cp <= 0x187EC:
                assert entry['Name'] == 'TANGUT IDEOGRAPH-{0:04X}'.format(cp)
            else:
                assert entry['Name'] == ''
        else:
            assert name == entry['Name']

        decimal = unicodedata.decimal(c, None)
        digit = unicodedata.digit(c, None)
        numeric = unicodedata.numeric(c, None)
        if decimal is not None:
            assert decimal == int(entry['Numeric_Value'])
            assert entry['Numeric_Type'] == 'Decimal'
            assert digit is not None
            # The original asserted `decimal is not None` a second time
            # here, which is always true inside this branch; the parallel
            # branches show the intended check is on `numeric`.
            assert numeric is not None
        elif digit is not None:
            assert digit == int(entry['Numeric_Value'])
            assert entry['Numeric_Type'] == 'Digit'
            assert decimal is None
            assert numeric is not None
        elif numeric is not None:
            try:
                num = float(entry['Numeric_Value'])
            except ValueError:
                # Values written as 'numerator/denominator'.
                if '/' in entry['Numeric_Value']:
                    numerator, denominator = entry['Numeric_Value'].split('/')
                    num = float(numerator) / float(denominator)
                else:
                    raise
            assert numeric == num
            assert entry['Numeric_Type'] == 'Numeric'
            assert digit is None
            assert decimal is None
        else:
            assert entry['Numeric_Value'] == 'NaN'
            assert entry['Numeric_Type'] == 'None'

        assert unicodedata.category(c) == entry['General_Category']
        assert unicodedata.bidirectional(c) == entry['Bidi_Class']
        assert unicodedata.combining(c) == int(
            entry['Canonical_Combining_Class'])
        assert unicodedata.mirrored(c) == entry['Bidi_Mirrored']

        if unicodedata.decomposition(c) == '':
            if entry['Name'].startswith('HANGUL SYLLABLE'):
                # The Hangul syllables lack decomposition mapping in
                # unicodedata, so calculate with a full decomposition
                # followed by a partial composition (Unicode Standard,
                # chapter 3.12)
                decomp = unicodedata.normalize('NFD', c)
                if len(decomp) == 3:
                    decomp = unicodedata.normalize('NFC',
                                                   decomp[:2]) + decomp[-1]
                decomp = tuple(ord(x) for x in decomp)
                assert decomp == entry['Decomposition_Mapping']
            else:
                assert entry['Decomposition_Mapping'] == (cp, )
        else:
            x = unicodedata.decomposition(c)
            # Drop a leading '<tag>' if present.
            if '<' in x:
                x = x.split('>', 1)[1].strip()
            x = tuple(int(y, 16) for y in x.split('\x20'))
            assert x == entry['Decomposition_Mapping']

    dbc = ucdf.derivedbidiclass
    for cp in range(0, 0x10FFFF + 1):
        c = chr(cp)
        # Only compare assigned code points, because unicodedata and
        # unicodedata2 lack correct defaults for unassigned
        if cp in dbc and cp in ud:
            assert unicodedata.bidirectional(c) == dbc[cp]['Bidi_Class']

    eaw = ucdf.eastasianwidth
    deaw = ucdf.derivedeastasianwidth
    for cp in range(0, 0x10FFFF + 1):
        c = chr(cp)
        # Only compare assigned code points, because unicodedata and
        # unicodedata2 lack correct defaults for unassigned
        if cp in eaw and cp in ud:
            assert unicodedata.east_asian_width(
                c) == eaw[cp]['East_Asian_Width']
        if cp in deaw and cp in ud:
            assert unicodedata.east_asian_width(
                c) == deaw[cp]['East_Asian_Width']
def detectStringDirection(s):
    """Return a signed direction score for `s`.

    Each character of bidirectional class 'L' adds one, each character of
    class 'R' or 'AL' subtracts one; all other classes are ignored.
    """
    weights = {'L': 1, 'R': -1, 'AL': -1}
    return sum(weights.get(unicodedata.bidirectional(ch), 0) for ch in s)
async def charinfo(self, *, data: str):
    """Shows information about one or several characters.

    'data' can either be a character, a unicode escape sequence, a unicode character name or a string.
    If 'data' is a string only a summary of each character's info will be displayed.
    """
    # Lower-case only for escape detection / name lookup.  The original
    # code lower-cased `data` itself, so an upper-case character (or a
    # string containing one) was reported as its lower-case counterpart.
    lowered = data.lower()
    if lowered.startswith('\\u'):
        # Let's interpret the unicode escape sequence
        hex_values = lowered.split('\\u')[1:]
        try:
            # chr() is inside the try: it rejects values above 0x10FFFF,
            # which previously escaped as an unhandled exception.
            data = ''.join(chr(int(val, 16)) for val in hex_values)
        except (ValueError, OverflowError):
            await self.bot.say('Invalid unicode escape sequence.')
            return
    elif len(data) > 1:
        # Maybe we've been given the character's name ?
        try:
            data = unicodedata.lookup(lowered)
        except KeyError:
            pass

    # Normalise the input
    data = unicodedata.normalize('NFC', data)
    url_fmt = '<http://unicode-table.com/en/{:X}>'
    if len(data) == 1:
        # Detailed info on the character
        entries = [
            ('Character', data),
            ('Name', unicodedata.name(data, 'None')),
            ('Code point', '{:04x}'.format(ord(data)))
        ]
        decomposition = unicodedata.decomposition(data)
        if decomposition != '':
            entries.append(('Decomposition', decomposition))

        combining = unicodedata.combining(data)
        if combining:
            entries.append(('Combining class', combining))

        entries.append(('Category', unicodedata.category(data)))
        bidirectional = unicodedata.bidirectional(data)
        entries.append(('Bidirectional', bidirectional if bidirectional != '' else 'None'))
        entries.append(('Mirrored', 'True' if unicodedata.mirrored(data) == 1 else 'False'))
        entries.append(('East asian width', unicodedata.east_asian_width(data)))
        entries.append(('Url', url_fmt.format(ord(data))))

        # Create the message's content and send it
        content = utils.indented_entry_to_str(entries)
        await self.bot.say_block(content)
    else:
        # Minimal info for each character
        entries = [
            '{} | `\\u{:04x}` | {} | {}'.format(char,
                                                ord(char),
                                                unicodedata.name(char, 'None'),
                                                url_fmt.format(ord(char)))
            for char in data
        ]
        content = '\n'.join(entries)
        await self.bot.say(content)
def check_bidi(label, check_ltr=False):
    """Validate `label` against the six Bidi rules checked below.

    The rules are only applied when the label contains a character of
    class R, AL or AN, or when `check_ltr` is true; otherwise the label
    is accepted immediately.

    Returns True on success and raises IDNABidiError on any violation.
    """
    # Scan up to the first right-to-left character; a character with no
    # directionality at all is rejected on the way.
    has_rtl = False
    for position, char in enumerate(label, 1):
        bidi_class = unicodedata.bidirectional(char)
        if bidi_class == "":
            # String likely comes from a newer version of Unicode
            raise IDNABidiError(
                "Unknown directionality in label {0} at position {1}".format(
                    repr(label), position))
        if bidi_class in ("R", "AL", "AN"):
            has_rtl = True
            break
    if not (has_rtl or check_ltr):
        return True

    # Bidi rule 1: the first character fixes the label direction.
    first_class = unicodedata.bidirectional(label[0])
    if first_class == "L":
        rtl = False
    elif first_class in ("R", "AL"):
        rtl = True
    else:
        raise IDNABidiError(
            "First codepoint in label {0} must be directionality L, R or AL".
            format(repr(label)))

    if rtl:
        # Bidi rules 2 and 3
        allowed = ("R", "AL", "AN", "EN", "ES", "CS", "ET", "ON", "BN", "NSM")
        enders = ("R", "AL", "EN", "AN")
        invalid_message = "Invalid direction for codepoint at position {0} in a right-to-left label"
    else:
        # Bidi rules 5 and 6
        allowed = ("L", "EN", "ES", "CS", "ET", "ON", "BN", "NSM")
        enders = ("L", "EN")
        invalid_message = "Invalid direction for codepoint at position {0} in a left-to-right label"

    ends_validly = False
    numeral_class = False
    for position, char in enumerate(label, 1):
        bidi_class = unicodedata.bidirectional(char)
        if bidi_class not in allowed:
            raise IDNABidiError(invalid_message.format(position))
        # Trailing NSM characters leave the ending state untouched.
        if bidi_class in enders:
            ends_validly = True
        elif bidi_class != "NSM":
            ends_validly = False
        # Bidi rule 4: only one kind of numeral in a right-to-left label.
        if rtl and bidi_class in ("AN", "EN"):
            if not numeral_class:
                numeral_class = bidi_class
            elif numeral_class != bidi_class:
                raise IDNABidiError(
                    "Can not mix numeral types in a right-to-left label")

    if not ends_validly:
        raise IDNABidiError("Label ends with illegal codepoint directionality")
    return True
def ShouldFail(domain):
    """Returns True for domains that we know are invalid, False otherwise.

    Checks, in order: presence of a dot, total punycode-encoded length,
    per-label length limits, hyphen placement, a leading combining mark,
    and the bidirectional constraints on each label.
    """
    if "." not in domain:
        return True

    labels = domain.split(".")
    encoded_domain = b".".join([label.encode("punycode") for label in labels])
    if len(encoded_domain) > 253:
        return True

    rtl_allowed = ("R", "AL", "AN", "EN", "ES", "CS", "ET", "ON", "BN", "NSM")
    ltr_allowed = ("L", "EN", "ES", "CS", "ET", "ON", "BN", "NSM")

    for label in labels:
        # Length requirements.
        if not label:
            return True
        if len(label) > 63:
            return True
        if len(label.encode("punycode")) > 59:
            return True

        # Domain labels must not start with a -, end with a -, or have both
        # their third and fourth characters be --.
        if label.startswith("-") or label.endswith("-"):
            return True
        if label[2:4] == "--":
            return True

        # A label may not begin with a mark (category M*).
        if unicodedata.category(label[0])[0] == "M":
            return True

        # Bidirectional checks (ensures that the label follows the "bidi
        # rule" for IDNA).  The first character decides the direction.
        lead = unicodedata.bidirectional(label[0])
        if lead in ("R", "AL"):
            allowed, enders, rtl = rtl_allowed, ("R", "AL", "EN", "AN"), True
        elif lead == "L":
            allowed, enders, rtl = ltr_allowed, ("L", "EN"), False
        else:
            return True

        classes = [unicodedata.bidirectional(ch) for ch in label]
        if any(cls not in allowed for cls in classes):
            return True
        # Right-to-left labels may not mix EN and AN numerals.
        if rtl and "EN" in classes and "AN" in classes:
            return True

        # Walking backwards over trailing NSM characters, the label must
        # end in one of the permitted classes.
        for cls in reversed(classes):
            if cls in enders:
                break
            if cls != "NSM":
                return True
    return False
def match_dir(self, el, directionality):
    """Check directionality.

    Returns whether `el` matches the requested `directionality` flag
    (compared against `ct.SEL_DIR_LTR` / `ct.SEL_DIR_RTL`).  When the
    element itself does not settle the question, the check recurses to
    `el.parent`; a root element with no decision is treated as
    left to right.

    NOTE(review): a `DIR_MAP` value of 0 is handled as the "auto" case
    below -- presumably `dir="auto"`; confirm against `DIR_MAP`.
    """

    # If we have to match both left and right, we can't match either.
    if directionality & ct.SEL_DIR_LTR and directionality & ct.SEL_DIR_RTL:
        return False

    # Element has defined direction of left to right or right to left
    direction = DIR_MAP.get(util.lower(el.attrs.get('dir', '')), None)
    if direction not in (None, 0):
        return direction == directionality

    # Element is the document element (the root) and no direction assigned, assume left to right.
    is_root = self.match_root(el)
    if is_root and direction is None:
        return ct.SEL_DIR_LTR == directionality

    # If `input[type=telephone]` and no direction is assigned, assume left to right.
    is_input = util.lower(el.name) == 'input'
    is_textarea = util.lower(el.name) == 'textarea'
    is_bdi = util.lower(el.name) == 'bdi'
    # The `type` attribute is only read for `input` elements.
    itype = util.lower(self.get_attribute_by_name(el, 'type', '')) if is_input else ''
    if is_input and itype == 'tel' and direction is None:
        return ct.SEL_DIR_LTR == directionality

    # Auto handling for text inputs
    if ((is_input and itype in ('text', 'search', 'tel', 'url', 'email')) or is_textarea) and direction == 0:
        if is_textarea:
            # A textarea's value is the concatenation of its non-special
            # string children.
            value = []
            for node in el.contents:
                if util.is_navigable_string(node) and not util.is_special_string(node):
                    value.append(node)
            value = ''.join(value)
        else:
            value = self.get_attribute_by_name(el, 'value', '')
        if value:
            # The first character of class AL, R or L decides.
            for c in value:
                bidi = unicodedata.bidirectional(c)
                if bidi in ('AL', 'R', 'L'):
                    direction = ct.SEL_DIR_LTR if bidi == 'L' else ct.SEL_DIR_RTL
                    return direction == directionality
            # Assume left to right
            return ct.SEL_DIR_LTR == directionality
        elif is_root:
            return ct.SEL_DIR_LTR == directionality
        # Empty value on a non-root element: defer to the parent.
        return self.match_dir(el.parent, directionality)

    # Auto handling for `bdi` and other non text inputs.
    if (is_bdi and direction is None) or direction == 0:
        direction = self.get_bidi(el)
        if direction is not None:
            return direction == directionality
        elif is_root:
            return ct.SEL_DIR_LTR == directionality
        return self.match_dir(el.parent, directionality)

    # Match parents direction
    return self.match_dir(el.parent, directionality)
# Test the unicode support! 👋 ᚴ=2 assert ᚴ*8 == 16 ᚴ="👋" c = ᚴ*3 assert c == '👋👋👋' import unicodedata assert unicodedata.category('a') == 'Ll' assert unicodedata.category('A') == 'Lu' assert unicodedata.name('a') == 'LATIN SMALL LETTER A' assert unicodedata.lookup('LATIN SMALL LETTER A') == 'a' assert unicodedata.bidirectional('a') == 'L' assert unicodedata.normalize('NFC', 'bla') == 'bla'
def check_bidi(label, check_ltr=False):
    """Validate `label` against the six Bidi rules checked below.

    Every character is first checked for a known directionality.  The
    rules themselves are only applied when the label contains a character
    of class R, AL or AN, or when `check_ltr` is true; otherwise the label
    is accepted.

    Returns True on success and raises IDNABidiError on any violation.
    """
    contains_rtl = False
    for position, char in enumerate(label, 1):
        bidi_class = unicodedata.bidirectional(char)
        if bidi_class == '':
            # String likely comes from a newer version of Unicode
            raise IDNABidiError(
                'Unknown directionality in label {0} at position {1}'.format(
                    repr(label), position))
        if bidi_class in ('R', 'AL', 'AN'):
            contains_rtl = True
    if not (contains_rtl or check_ltr):
        return True

    # Bidi rule 1: the first character fixes the label direction.
    first_class = unicodedata.bidirectional(label[0])
    if first_class == 'L':
        rtl = False
    elif first_class in ('R', 'AL'):
        rtl = True
    else:
        raise IDNABidiError(
            'First codepoint in label {0} must be directionality L, R or AL'.
            format(repr(label)))

    if rtl:
        # Bidi rules 2 and 3
        permitted = ('R', 'AL', 'AN', 'EN', 'ES', 'CS', 'ET', 'ON', 'BN', 'NSM')
        terminal = ('R', 'AL', 'EN', 'AN')
        invalid_message = 'Invalid direction for codepoint at position {0} in a right-to-left label'
    else:
        # Bidi rules 5 and 6
        permitted = ('L', 'EN', 'ES', 'CS', 'ET', 'ON', 'BN', 'NSM')
        terminal = ('L', 'EN')
        invalid_message = 'Invalid direction for codepoint at position {0} in a left-to-right label'

    good_ending = False
    seen_numeral = False
    for position, char in enumerate(label, 1):
        bidi_class = unicodedata.bidirectional(char)
        if bidi_class not in permitted:
            raise IDNABidiError(invalid_message.format(position))
        # Trailing NSM characters leave the ending state untouched.
        if bidi_class in terminal:
            good_ending = True
        elif bidi_class != 'NSM':
            good_ending = False
        # Bidi rule 4: only one kind of numeral in a right-to-left label.
        if rtl and bidi_class in ('AN', 'EN'):
            if not seen_numeral:
                seen_numeral = bidi_class
            elif seen_numeral != bidi_class:
                raise IDNABidiError(
                    'Can not mix numeral types in a right-to-left label')

    if not good_ending:
        raise IDNABidiError('Label ends with illegal codepoint directionality')
    return True
def in_table_d2(code):
    """Return True if the bidirectional class of `code` is "L"."""
    bidi_class = unicodedata.bidirectional(code)
    return bidi_class == "L"