Example #1
0
def check_bidi(label, check_ltr=False):
    """Validate the bidirectional properties of *label*.

    The numbered "Bidi rule" comments below follow the original author's
    numbering (presumably the IDNA Bidi rules of RFC 5893 -- confirm).

    :param label: the label to validate, as a text string.
    :param check_ltr: when true, the rules are enforced even if the label
        contains no right-to-left characters.
    :returns: ``True`` when the label passes (or needs no checking).
    :raises IDNABidiError: on the first violated rule, or when a codepoint
        scanned by the initial pass has no known bidi class.
    """
    # The rules only matter once an R, AL or AN codepoint is present.
    contains_rtl = False
    for position, char in enumerate(label, 1):
        bidi_class = unicodedata.bidirectional(char)
        if bidi_class == '':
            # String likely comes from a newer version of Unicode
            raise IDNABidiError('Unknown directionality in label {0} at position {1}'.format(repr(label), position))
        if bidi_class in ('R', 'AL', 'AN'):
            contains_rtl = True
            break
    if not (contains_rtl or check_ltr):
        return True

    # Bidi rule 1: the first codepoint decides the label direction.
    first_class = unicodedata.bidirectional(label[0])
    if first_class in ('R', 'AL'):
        rtl = True
    elif first_class == 'L':
        rtl = False
    else:
        raise IDNABidiError('First codepoint in label {0} must be directionality L, R or AL'.format(repr(label)))

    if rtl:
        # Bidi rules 2 and 3
        permitted = ('R', 'AL', 'AN', 'EN', 'ES', 'CS', 'ET', 'ON', 'BN', 'NSM')
        enders = ('R', 'AL', 'EN', 'AN')
        invalid_message = 'Invalid direction for codepoint at position {0} in a right-to-left label'
    else:
        # Bidi rules 5 and 6
        permitted = ('L', 'EN', 'ES', 'CS', 'ET', 'ON', 'BN', 'NSM')
        enders = ('L', 'EN')
        invalid_message = 'Invalid direction for codepoint at position {0} in a left-to-right label'

    ends_validly = False
    numeral_class = False
    for position, char in enumerate(label, 1):
        bidi_class = unicodedata.bidirectional(char)

        if bidi_class not in permitted:
            raise IDNABidiError(invalid_message.format(position))

        # Trailing NSM codepoints do not change whether the ending is valid.
        if bidi_class in enders:
            ends_validly = True
        elif bidi_class != 'NSM':
            ends_validly = False

        # Bidi rule 4: a right-to-left label may use only one numeral type.
        if rtl and bidi_class in ('AN', 'EN'):
            if not numeral_class:
                numeral_class = bidi_class
            elif numeral_class != bidi_class:
                raise IDNABidiError('Can not mix numeral types in a right-to-left label')

    if not ends_validly:
        raise IDNABidiError('Label ends with illegal codepoint directionality')

    return True
Example #2
0
def _rtl(text, default=False):
    """Report whether *text* starts out right-to-left.

    The first character whose bidi class is a strong or explicit direction
    decides the answer: ``RLE``, ``RLO``, ``R`` or ``AL`` mean right-to-left,
    ``LRE``, ``LRO`` or ``L`` mean left-to-right.

    :param text: the string to inspect; may be empty or ``None``.
    :param default: value returned when no deciding character is found.
    :returns: ``True`` for right-to-left, ``False`` for left-to-right,
        otherwise *default*.
    """
    if not text:
        return default
    # Iterate instead of recursing on ``text[1:]``: the recursive form lost
    # ``default`` after the first character, copied the string once per
    # character, and hit the recursion limit on long neutral prefixes.
    for character in text:
        bidi_class = bidirectional(character)
        if bidi_class in ('RLE', 'RLO', 'R', 'AL'):
            return True
        if bidi_class in ('LRE', 'LRO', 'L'):
            return False
    return default
Example #3
0
    def _check_true_dir(self, text):
        """Guess a direction value for *text*.

        Only the part of *text* after the last ``>`` is examined, and
        characters between double quotes are ignored.

        :returns: ``'rtl'`` if any examined character has bidi class R or AL,
            else ``'ltr'`` if any has class L, else ``'auto'``.
        """
        # rfind() gives -1 when there is no '>', so the slice starts at 0.
        tail = text[text.rfind(">") + 1:]

        inside_quotes = False
        seen = set()
        for char in tail:
            if char == '"':
                inside_quotes = not inside_quotes
                continue
            if inside_quotes:
                continue
            bidi_class = UD.bidirectional(char)
            if bidi_class in {'R', 'AL'}:
                seen.add('rtl')
            elif bidi_class == 'L':
                seen.add('ltr')

        # Right-to-left wins whenever both directions were seen.
        for candidate in ('rtl', 'ltr'):
            if candidate in seen:
                return candidate
        return 'auto'
Example #4
0
def _character_direction(ch):
    """Map a character's bidi class to ``'LTR'``, ``'RTL'`` or ``None``.

    ``L``, ``LRE`` and ``LRO`` count as left-to-right; ``R``, ``RLE``,
    ``RLO`` and ``AL`` count as right-to-left; anything else yields ``None``.
    """
    directions = {
        'L': 'LTR', 'LRE': 'LTR', 'LRO': 'LTR',
        'R': 'RTL', 'RLE': 'RTL', 'RLO': 'RTL', 'AL': 'RTL',
    }
    return directions.get(bidirectional(ch))
Example #5
0
    def info(self, char):
        """Collect display information about a single character.

        :param char: the character to describe (a one-character unicode
            string; this is Python 2 code).
        :returns: a dict with the keys ``code``, ``name``, ``char``,
            ``example``, ``category``, ``bidi`` and ``unihan``.
        :raises UnassignedCharacter: when the character's category is ``Cn``.
        """
        category = unicodedata.category(char)
        if category == 'Cn':
            raise UnassignedCharacter

        category_name = self.categories[category]
        bidi_name = self.bidis[unicodedata.bidirectional(char)]
        char_name = unicodedata.name(char, 'an unnamed character').decode('ascii')

        major, minor = category[0], category[1]
        if major == 'C' or category in ('Zp', 'Zl'):
            # "Other" categories and line/paragraph separators get no sample.
            sample = u''
        elif major == 'M' and minor != 'c':
            # Marks other than Mc are shown on a dotted circle.
            sample = u'\N{DOTTED CIRCLE}' + char
        else:
            sample = char

        han_details = u''
        if 'CJK' in char_name and 'IDEOGRAPH' in char_name:
            han_details = unicode(Unihan(char))
            if han_details:
                han_details = u'. ' + han_details + u'.'

        code_hex = u'%04X' % ord(char)
        display_name = char_name.title().replace('Cjk', 'CJK')
        return {
            'code': code_hex,
            'name': display_name,
            'char': char,
            'example': sample,
            'category': category_name.lower(),
            'bidi': bidi_name,
            'unihan': han_details,
        }
def fix_rtl(string):
  """Wrap *string* in left-to-right marks when it contains RTL characters.

  A character counts as RTL when its bidi class is R, AL, RLE, RLO or RLI.
  Strings without such characters are returned unchanged.
  """
  rtl_classes = ['R', 'AL', 'RLE', 'RLO', 'RLI']
  has_rtl = any(unicodedata.bidirectional(char) in rtl_classes for char in string)
  if not has_rtl:
    return string
  # U+200E on both sides: hint that the surrounding context is left-to-right.
  return u'\u200e' + string + u'\u200e'
Example #7
0
def rtlString(source, lang):
    """Rearrange *source* for display when *lang* is Arabic or Hebrew.

    :param source: the text to rearrange.
    :param lang: language code; only codes starting with ``ar`` or ``he``
        trigger any processing.
    :returns: the rearranged string, or *source* unchanged when *lang* or
        *source* is empty or the language is not ``ar``/``he``.
    """
    if lang and source and lang[0:2] in set([u"ar",u"he"]):
        line = []            # output pieces, joined at the end
        lineInsertion = 0    # insertion index for RTL pieces; never changed below
        words = []           # characters of the run currently being collected
        rtl = True           # scanning starts in right-to-left mode
        for c in source:
            bidi = unicodedata.bidirectional(c)
            if rtl:
                if bidi == u'L':
                    # A left-to-right character ends the RTL run: flush it to
                    # the front of the line and switch to LTR mode.
                    if words:
                        line.insert(lineInsertion, u''.join(words))
                        words = []
                    rtl = False
                elif bidi in (u'R', u'NSM', u'AN'):
                    # Stays part of the current RTL run (appended below).
                    # NOTE(review): 'AL' is not listed, so Arabic letters take
                    # the else branch and are inserted one at a time at the
                    # front -- confirm this is intended.
                    pass
                else:
                    # Any other class: flush the run, then put this character
                    # at the front of the line on its own.
                    if words:
                        line.insert(lineInsertion, u''.join(words))
                        words = []
                    line.insert(lineInsertion, c)
                    continue
            else:
                if bidi == u'R' or bidi == u'AN':
                    # Back to RTL: the collected LTR run goes to the end.
                    if words:
                        line.append(u''.join(words))
                        words = []
                    rtl = True
            words.append(c)
        if words:
            # NOTE(review): a run still pending in LTR mode is not added to
            # the line, so trailing left-to-right text is dropped -- confirm.
            if rtl:
                line.insert(0, u''.join(words))
        return u''.join(line)
    else:
        return source
Example #8
0
 def lookup(self):
     """Fill in the Unicode-derived attributes for ``self.uniNumber``.

     Does nothing when ``uniNumber`` is ``None``, and stops early (after
     printing a message) when the number cannot be converted to a character.
     Otherwise sets ``uniLetter``, ``uniName``, ``uniNameProcessed``,
     ``bidiType`` and ``uniRangeName``, plus ``isMath`` for numbers listed
     in ``mathUniNumbers``.
     """
     number = self.uniNumber
     if number is None:
         return
     try:
         self.uniLetter = unicodeToChar(number)
     except:
         print("GlyphName valueerror for %04X" % number)
         return
     if number in mathUniNumbers:
         self.isMath = True
     try:
         found_name = unicodelist.get(number)
         self.uniName = found_name
         # The processed name falls back to an empty string.
         self.uniNameProcessed = "" if found_name is None else found_name
         self.bidiType = unicodedata.bidirectional(self.uniLetter)
     except ValueError:
         # Reset everything derived from the character.
         self.uniName = None
         self.uniNameProcessed = ""
         self.uniLetter = None
         self.bidiType = None
     self.uniRangeName = getRangeName(number)
Example #9
0
def get_base_direction(text):
    """Return the base direction of *text*.

    The direction is taken from the first character whose bidi type is in
    ``_strong_types``.

    Returns ``1`` when that type is in ``_rtl_types``, ``0`` when it is not,
    and ``-1`` when the text has no such character.
    """
    text = force_unicode(text)

    # Lazily classify characters and stop at the first strong one.
    bidi_types = (bidirectional(char) for char in text)
    first_strong = next((kind for kind in bidi_types if kind in _strong_types), None)

    if not first_strong:
        # Text composed of weak bidi characters.
        return -1
    return 1 if first_strong in _rtl_types else 0
Example #10
0
 def __init__(self, myData, headerData, comboData):
     """Set up the linker window, its models and its signal handlers.

     :param myData: passed to ``LinkerTable`` as the table data.
     :param headerData: passed to ``LinkerTable`` as the header data.
     :param comboData: passed to ``LinkerCombo`` for the target combo box.
     """
     QtGui.QMainWindow.__init__(self)
     self.ui = Ui_MainWindow()
     self.ui.setupUi(self)
     self.ui.OKButton.clicked.connect(self.OKClicked)
     self.ui.CancelButton.clicked.connect(self.CancelClicked)
     self.__model = LinkerTable(myData, headerData)
     self.ui.tableView.setModel(self.__model)
     # Prepare the checkbox column
     for row in range(0, self.__model.rowCount(self)):
         # Keep an editor permanently open on column 0 of every row.
         self.ui.tableView.openPersistentEditor(self.__model.index(row, 0))
     self.__combo_model = LinkerCombo(comboData)
     self.ui.targetLexCombo.setModel(self.__combo_model)
     self.ret_val = 0
     self.cols = 7
     self.ui.targetLexCombo.currentIndexChanged.connect(self.ComboClicked)
     self.ui.FilterCheckBox.clicked.connect(self.FilterClicked)
     # Run the combo handler once by hand for the initial state.
     self.ComboClicked()

     myHPG = self.__combo_model.getCurrentHPG()
     myHeadword = myHPG.getHeadword()
     # Check for right to left data and set the combobox direction if needed
     for i in range(0, len(myHeadword)):
         # The first character of bidi class R or AL is enough to decide.
         if unicodedata.bidirectional(myHeadword[i]) in (u'R', u'AL'):
             self.ui.targetLexCombo.setLayoutDirection(QtCore.Qt.RightToLeft)
             self.__combo_model.setRTL(True)
             break
Example #11
0
def get_embedding_levels(text, storage, upper_is_rtl=False, debug=False):
    """Record one entry per character of *text* in ``storage['chars']``.

    Every entry holds the character, ``storage['base_level']`` as its level,
    and its bidi type (stored under both ``type`` and ``orig``).

    :param text: the text to classify.
    :param storage: dict providing ``base_level`` and a ``chars`` list.
    :param upper_is_rtl: when true, uppercase characters get type ``'R'``.
    :param debug: when true, ``debug_storage`` is called after the scan.
    :returns: ``True`` if at least one character was given type ``'R'``.
    """
    level = storage['base_level']
    pending_high = False
    found_rtl = False

    for piece in text:
        # On UCS2 builds a surrogate is held back and joined to the next unit.
        if _IS_UCS2 and (_SURROGATE_MIN <= ord(piece) <= _SURROGATE_MAX):
            pending_high = piece
            continue
        if pending_high:
            piece = pending_high + piece
            pending_high = False

        if upper_is_rtl and piece.isupper():
            kind = 'R'
        else:
            try:
                kind = bidirectional(piece)
            except:
                # Best effort: anything that cannot be classified gets None.
                kind = None

        # NOTE(review): only 'R' counts here; 'AL' (Arabic letters) does not
        # set the flag -- confirm that is intended.
        if kind == 'R':
            found_rtl = True

        entry = {'ch': piece, 'level': level, 'type': kind, 'orig': kind}
        storage['chars'].append(entry)

    if debug:
        debug_storage(storage, base_info=True)

    return found_rtl
Example #12
0
def drawText(canvas, x, y, text, en = False, bold = False, size = 12):
    """Draw *text* right-aligned on *canvas*, handling RTL text first.

    Text with an Arabic-class character is passed through ``a_forms.fuse``
    and ``a_process.shape``; text with any right-to-left character is then
    passed through ``get_display``.

    :param canvas: drawing target providing ``setFont``, ``drawRightString``
        and ``_pagesize``.
    :param x: horizontal position handed to ``drawRightString``.
    :param y: vertical offset, subtracted from ``canvas._pagesize[1]``.
    :param text: the string to draw.
    :param en: not used by this function.
    :param bold: selects the ``BNazanin`` font instead of ``Nazanin``.
    :param size: font size.
    """
    # NOTE(review): "FA" does not look like a bidi class that
    # unicodedata.bidirectional() returns -- confirm it can be dropped.
    arabic_classes = ("AL", "AN", "FA")
    rtl_classes = ("R", "RLE", "RLO")

    needs_shaping = False
    needs_reorder = False
    for char in text:
        bidi_class = unicodedata.bidirectional(char)
        if bidi_class in arabic_classes:
            # The first Arabic character settles both flags.
            needs_shaping = needs_reorder = True
            break
        if bidi_class in rtl_classes:
            needs_reorder = True

    rendered = text
    if needs_shaping:
        rendered = a_process.shape(a_forms.fuse(rendered))
    if needs_reorder:
        rendered = get_display(rendered)

    canvas.setFont('BNazanin' if bold else 'Nazanin', size)
    canvas.drawRightString(x, canvas._pagesize[1] - y, rendered)
    def find_bidi(self, el):
        """Return the direction of the first strong character under *el*.

        Children are visited in order. Child tags are searched recursively
        unless they are ``bdi``, ``script``, ``style`` or ``textarea``, or
        carry a ``dir`` attribute that ``DIR_MAP`` recognises. Special
        strings are skipped.

        :returns: ``ct.SEL_DIR_LTR`` or ``ct.SEL_DIR_RTL`` for the first
            character of bidi class L, or of class AL/R, respectively;
            ``None`` when no such character is found.
        """
        for child in self.get_children(el, tags=False):

            if self.is_tag(child):
                # Avoid analyzing certain elements specified in the specification.
                dir_value = util.lower(self.get_attribute_by_name(child, 'dir', ''))
                explicit_dir = DIR_MAP.get(dir_value, None)
                excluded = (
                    self.get_tag(child) in ('bdi', 'script', 'style', 'textarea') or
                    explicit_dir is not None
                )
                if not excluded:
                    # Check directionality of this child's own content.
                    found = self.find_bidi(child)
                    if found is not None:
                        return found
                # Excluded, or direction could not be determined.
                continue

            # Skip `doctype` comments, etc.
            if self.is_special_string(child):
                continue

            # Analyze text nodes for directionality.
            for char in child:
                bidi_class = unicodedata.bidirectional(char)
                if bidi_class == 'L':
                    return ct.SEL_DIR_LTR
                if bidi_class in ('AL', 'R'):
                    return ct.SEL_DIR_RTL
        return None
Example #14
0
def rtlString(source, lang):
    """Rearrange *source* for display when *lang* is Arabic or Hebrew.

    :param source: the text to rearrange; ``None`` or an empty string is
        returned unchanged.
    :param lang: language code; only codes starting with ``ar`` or ``he``
        trigger any processing.
    :returns: the rearranged string, or *source* unchanged when there is
        nothing to process.
    """
    # Guard on ``source`` as well as ``lang`` so that a missing text is
    # passed through instead of raising TypeError in the loop below.
    if lang and source and lang[0:2] in {"ar","he"}:
        line = []            # output pieces, joined at the end
        lineInsertion = 0    # insertion index for RTL pieces; never changed below
        words = []           # characters of the run currently being collected
        rtl = True           # scanning starts in right-to-left mode
        for c in source:
            bidi = unicodedata.bidirectional(c)
            if rtl:
                if bidi == 'L':
                    # A left-to-right character ends the RTL run: flush it to
                    # the front of the line and switch to LTR mode.
                    if words:
                        line.insert(lineInsertion, ''.join(words))
                        words = []
                    rtl = False
                elif bidi in ('R', 'NSM', 'AN'):
                    # Stays part of the current RTL run (appended below).
                    pass
                else:
                    # Any other class: flush the run, then put this character
                    # at the front of the line on its own.
                    if words:
                        line.insert(lineInsertion, ''.join(words))
                        words = []
                    line.insert(lineInsertion, c)
                    continue
            else:
                if bidi == 'R' or bidi == 'AN':
                    # Back to RTL: the collected LTR run goes to the end.
                    if words:
                        line.append(''.join(words))
                        words = []
                    rtl = True
            words.append(c)
        if words:
            # NOTE(review): a run still pending in LTR mode is not added to
            # the line, so trailing left-to-right text is dropped -- confirm.
            if rtl:
                line.insert(0, ''.join(words))
        return ''.join(line)
    else:
        return source
Example #15
0
    def _truncate_invalid_chars(value, length):
        '''Truncate *value* to *length* without splitting a UTF-8 sequence.

        The value is cut to *length* and then shortened one unit at a time
        until it decodes as UTF-8. If the last decoded character has a bidi
        class in ``RTL_TYPES``, a left-to-right mark (U+200E) is appended,
        provided the three bytes it needs still fit; otherwise the value is
        shortened further.

        A false *length* disables truncation entirely.
        '''
        value = smart_str(value)
        if not length:
            return value

        value = value[:length]
        while len(value):
            try:
                decoded = value.decode('utf8')

                # check for RTL encoding without order marker terminator
                if bidirectional(decoded[-1]) in RTL_TYPES:
                    # this is RTL, we need 3 bytes for the mark
                    if len(value) > (length - 3):
                        # not enough room - keep chopping
                        raise ValueError('Not enough room to truncate')
                    return smart_str(decoded + u'\u200e')
                # Decodes cleanly and does not end in RTL: done.
                break
            except (UnicodeDecodeError, ValueError):
                # chop a letter off the end and try again
                value = value[:-1]
        return value
Example #16
0
def print_Unicode_info(char, short):
    """Print Unicode properties of *char* to standard output.

    :param char: a single character.
    :param short: when true, print one line with the character, its name
        and its code point; otherwise print one labelled line per property.
    """
    name = unicodedata.name(char, "UNKNOWN")
    code = ord(char)
    hex_code = hex(code)
    is_mirrored = unicodedata.mirrored(char) == 1

    rows = [
        ("Name          ", name),
        ("Character     ", char),
        ("Dec Codepoint ", str(code)),
        ("Hex Codepoint ", str(hex_code)),
        ("Lowercase     ", char.lower()),
        ("Uppercase     ", char.upper()),
        ("Category      ", unicodedata.category(char)),
        ("Bidirectional ", unicodedata.bidirectional(char)),
        ("Mirrored      ", str(is_mirrored)),
        ("NFC           ", unicodedata.normalize("NFC", char)),
        ("NFD           ", unicodedata.normalize("NFD", char)),
    ]

    if short:
        # hex() gives "0x..."; upper-casing turns the prefix into "0X".
        bare_hex = str(hex_code).upper().replace("0X", "")
        print(char + "\t" + name + " (U+" + bare_hex + ")")
        return

    for label, value in rows:
        print(label + value)
Example #17
0
 def lookup(self):
     """Fill in the Unicode-derived attributes for ``self.uniNumber``.

     Does nothing when ``uniNumber`` is ``None``, and stops silently when
     the number cannot be converted to a character. Otherwise sets
     ``uniLetter``, ``uniName``, ``uniNameProcessed``, ``bidiType`` and
     ``uniRangeName``, plus ``isMath`` for numbers in ``mathUniNumbers``.
     """
     number = self.uniNumber
     if number is None:
         return
     try:
         self.uniLetter = unicodeToChar(number)
     except:
         # Conversion failed: leave the remaining attributes untouched.
         return
     if number in mathUniNumbers:
         self.isMath = True
     try:
         found_name = unicodelist.get(number)
         self.uniName = found_name
         # The processed name falls back to an empty string.
         self.uniNameProcessed = "" if found_name is None else found_name
         # NOTE: this is still a dependency on the unicodedata module.
         # It would be nice to extract this data directly from the Unicode
         # data, but the algorithm is not trivial.
         self.bidiType = unicodedata.bidirectional(self.uniLetter)
     except ValueError:
         # Reset everything derived from the character.
         self.uniName = None
         self.uniNameProcessed = ""
         self.uniLetter = None
         self.bidiType = None
     except:
         # Anything else is reported on stderr and otherwise ignored.
         import traceback
         traceback.print_exc()
     self.uniRangeName = getRangeName(number)
    def match_dir(self, el, directionality):
        """Check whether *el* has the requested directionality.

        The element's own ``dir`` attribute is used when ``DIR_MAP`` maps it
        to a concrete direction. A mapped value of ``0`` means the direction
        is derived from content; when nothing decides, the parent element is
        consulted and the root defaults to left to right.

        :param el: the element to test.
        :param directionality: ``ct.SEL_DIR_LTR`` or ``ct.SEL_DIR_RTL``.
        :returns: ``True`` when the resolved direction equals
            *directionality*; ``False`` when both flags are requested.
        """
        # If we have to match both left and right, we can't match either.
        wants_ltr = directionality & ct.SEL_DIR_LTR
        if wants_ltr and directionality & ct.SEL_DIR_RTL:
            return False

        # Element has defined direction of left to right or right to left
        dir_value = util.lower(self.get_attribute_by_name(el, 'dir', ''))
        direction = DIR_MAP.get(dir_value, None)
        if direction not in (None, 0):
            return direction == directionality

        # Element is the document element (the root) and no direction assigned, assume left to right.
        is_root = self.match_root(el)
        if direction is None and is_root:
            return ct.SEL_DIR_LTR == directionality

        tag = self.get_tag(el)
        is_input = tag == 'input'
        is_textarea = tag == 'textarea'
        is_bdi = tag == 'bdi'
        if is_input:
            input_type = util.lower(self.get_attribute_by_name(el, 'type', ''))
        else:
            input_type = ''

        # If `input[type=telephone]` and no direction is assigned, assume left to right.
        if is_input and input_type == 'tel' and direction is None:
            return ct.SEL_DIR_LTR == directionality

        derive_from_content = direction == 0
        is_text_control = (
            (is_input and input_type in ('text', 'search', 'tel', 'url', 'email')) or is_textarea
        )

        # Auto handling for text inputs
        if is_text_control and derive_from_content:
            if is_textarea:
                value = ''.join(node for node in el.contents if self.is_content_string(node))
            else:
                value = self.get_attribute_by_name(el, 'value', '')
            if value:
                for char in value:
                    bidi_class = unicodedata.bidirectional(char)
                    if bidi_class in ('AL', 'R', 'L'):
                        resolved = ct.SEL_DIR_LTR if bidi_class == 'L' else ct.SEL_DIR_RTL
                        return resolved == directionality
                # Assume left to right
                return ct.SEL_DIR_LTR == directionality
            if is_root:
                return ct.SEL_DIR_LTR == directionality
            return self.match_dir(self.get_parent(el), directionality)

        # Auto handling for `bdi` and other non text inputs.
        if (is_bdi and direction is None) or derive_from_content:
            found = self.find_bidi(el)
            if found is not None:
                return found == directionality
            if is_root:
                return ct.SEL_DIR_LTR == directionality
            return self.match_dir(self.get_parent(el), directionality)

        # Match parents direction
        return self.match_dir(self.get_parent(el), directionality)
Example #19
0
def text_dir(c):
    """Classify a character as ``'R'``, ``'L'`` or ``''``.

    Bidi class L gives ``'L'``; classes R and AL give ``'R'``; every other
    class gives the empty string.
    """
    bidi_class = unicodedata.bidirectional(c)
    if bidi_class == 'L':
        return 'L'
    return 'R' if bidi_class in ('R', 'AL') else ''
Example #20
0
 def ub(self,bidi):
     """
     Match a character with a given Unicode bidirectional class

     :param bidi: the bidi class name to match, e.g. ``'L'`` or ``'AL'``.
     :returns: the ``(value, error)`` pair from ``rule_anything()`` when the
         first character of the value has class *bidi*.
     :raises _MaybeParseError: when the class differs.
     """
     import unicodedata
     x, e = self.rule_anything()
     # Compare by value: ``is`` tests object identity, which is not
     # guaranteed for equal strings and fails for classes such as 'AL'.
     if unicodedata.bidirectional(x[0]) == bidi:
         return x, e
     else:
         e[1] = expected("bidi:"+ bidi)
         raise _MaybeParseError(*e)
Example #21
0
def get_embedding_levels(text, storage, upper_is_rtl=False, debug=False):
    """Determine the paragraph base level and record every character.

    Sets ``storage['base_level']`` (``0`` or ``1``) and
    ``storage['base_dir']`` (``'L'`` or ``'R'``), then appends one entry per
    character of *text* to ``storage['chars']``.

    :param text: the text to analyse.
    :param storage: dict with a ``chars`` list; updated in place.
    :param upper_is_rtl: when true, uppercase characters are treated as
        bidi type ``'R'``.
    :param debug: when true, ``debug_storage`` is called at the end.
    """
    def classify(char):
        # Uppercase characters can be forced to behave as right-to-left.
        if upper_is_rtl and char.isupper():
            return 'R'
        return bidirectional(char)

    # P2 and P3: the first strong character decides the level; the level is
    # 0 when there is none.
    level = 0
    for char in text:
        kind = classify(char)
        if kind in ('AL', 'R'):
            level = 1
            break
        if kind == 'L':
            break

    storage['base_level'] = level
    storage['base_dir'] = 'R' if level else 'L'

    # preset the storage's chars
    for char in text:
        kind = classify(char)
        entry = {'ch': char, 'level': level, 'type': kind, 'orig': kind}
        storage['chars'].append(entry)

    if debug:
        debug_storage(storage, base_info=True)
Example #22
0
def categorize(lower, upper):
    """Group the characters of a code point range by bidi class.

    Every character with a code point in ``[lower, upper)`` is appended to
    the list stored in the global ``unicode_dict`` under its bidi class.
    Characters whose class is the empty string are filed under ``"UNK"``.
    """
    global unicode_dict
    for codepoint in xrange(lower, upper):
        char = unichr(codepoint)
        bidi_class = bidirectional(char) or "UNK"
        unicode_dict.setdefault(bidi_class, []).append(char)
Example #23
0
    def calc_paragraph_level(self, chars=None):
        """Applies P2 and P3.

        P2_ :

        In each paragraph, find the first character of type L, AL, or R while
        skipping over any characters between an isolate initiator and its
        matching PDI or, if it has no matching PDI, the end of the paragraph.

        P3_ :

        If a character is found in P2 and it is of type AL or R, then set the
        paragraph embedding level to one; otherwise, set it to zero.

        :param chars: sequence of mappings with a ``'ch'`` entry; defaults
            to ``self.chars``.
        :returns: the level ``PARAGRAPH_LEVELS`` gives for the first deciding
            character, or ``PARAGRAPH_LEVELS['L']`` when there is none.

        .. _P2: http://www.unicode.org/reports/tr9/#P2
        .. _P3: http://www.unicode.org/reports/tr9/#P3
        """
        upper_is_rtl = self.upper_is_rtl
        isolate_initiator_level = 0

        if chars is None:
            chars = self.chars

        # Start undecided: with no characters, or when every character sits
        # inside an isolate, the loop never assigns a level.
        base_level = None

        getter = itemgetter('ch')
        for ch in map(getter, chars):
            bidi_type = bidirectional(ch)

            if bidi_type in ISOLATE_INITIATORS:
                isolate_initiator_level += 1
                continue

            if bidi_type == 'PDI' and isolate_initiator_level > 0:
                isolate_initiator_level -= 1
                continue

            # ignore everything from an isolate initiator to its matching PDI
            if isolate_initiator_level > 0:
                continue

            if upper_is_rtl and ch.isupper():
                bidi_type = 'R'

            base_level = PARAGRAPH_LEVELS.get(bidi_type)

            if base_level is not None:
                break

        if base_level is None:
            base_level = PARAGRAPH_LEVELS['L']

        return base_level
Example #24
0
 async def check_text(self, message):
     """Check the text of *message* and return the combined findings.

     One ``MessageTextBlacklisted`` is added for every entry returned by
     ``self.storage.check_text``, and a ``MessageContainsRtl(3)`` when the
     text contains a character of bidi class AL.

     :returns: the accumulated findings, or ``None`` when there are none.
     """
     text = message.message
     findings = self.storage.check_text(text)
     # NOTE(review): accumulation starts from None, so ``+=`` relies on the
     # finding types supporting addition to None -- confirm.
     verdict = None
     for fragment, severity in findings.items():
         verdict += MessageTextBlacklisted(severity, fragment)
     # "AL" is the Arabic Letter bidi class.
     if any(unicodedata.bidirectional(char) == "AL" for char in text):
         verdict += MessageContainsRtl(3)
     return verdict
Example #25
0
    def __init__(self, msg1, infotext, parent=None, monospaced=False):
        """Build and show the information dialog.

        :param msg1: heading; used in the window title and the bold top label.
        :param infotext: either a string placed in the text view as is, or
            an iterable whose string items are added as lines and whose list
            items are rendered as a grid of labels.
        :param parent: optional window the dialog is made transient for.
        :param monospaced: when true, the whole buffer uses a Monospace font.
        """
        self.xml = Glade(toplevel='infodialog')

        self.top = self.xml.toplevel
        self.top.set_icon(ICON)
        self.top.set_title("%s - Gramps" % msg1)

        label = self.xml.get_object('toplabel')
        # The text contains markup, so markup parsing is switched on below.
        label.set_text('<span weight="bold" size="larger">%s</span>' % msg1)
        label.set_use_markup(True)

        infoview = self.xml.get_object('infoview')
        infobuffer = Gtk.TextBuffer()
        infoview.set_buffer(infobuffer)

        if isinstance(infotext, str):
            infobuffer.set_text(infotext)
        else:
            for item in infotext:
                enditer = infobuffer.get_end_iter()
                if isinstance(item, str):
                    infobuffer.insert(enditer, item + '\n')
                elif isinstance(item, list):
                    # A list item is a table: one inner sequence per row.
                    grid = Gtk.Grid()
                    grid.set_margin_start(6)
                    grid.set_margin_end(6)
                    grid.set_column_spacing(12)
                    # Direction is taken from the first character of the
                    # first cell.
                    # NOTE(review): only class 'R' is tested, not 'AL', and
                    # an empty table or cell would raise IndexError -- confirm.
                    if unicodedata.bidirectional(item[0][0][0]) == 'R':
                        grid.set_direction(Gtk.TextDirection.RTL)
                    for offset_y, row in enumerate(item):
                        for offset_x, col in enumerate(row):
                            cell = Gtk.Label(col)
                            cell.set_halign(Gtk.Align.END)
                            grid.attach(cell, offset_x, offset_y, 1, 1)
                    grid.show_all()
                    # Embed the grid in the text view at the current end.
                    anchor = infobuffer.create_child_anchor(enditer)
                    infoview.add_child_at_anchor(grid, anchor)
                    enditer = infobuffer.get_end_iter()
                    infobuffer.insert(enditer, '\n')

        if monospaced:
            startiter, enditer = infobuffer.get_bounds()
            tag = infobuffer.create_tag(family="Monospace")
            infobuffer.apply_tag(tag, startiter, enditer)

        if parent:
            self.top.set_transient_for(parent)
        self.top.connect('response', self.destroy)
        self.top.show()
Example #26
0
def paragraph_direction_mark(text):
    """
    Pick the direction mark matching the first strong character of *text*,
    in the spirit of http://www.unicode.org/reports/tr9/#The_Paragraph_Level

    Returns the Unicode LTR mark (U+200E) when that character has bidi class
    L or when there is no strong character, and the RTL mark (U+200F) when
    it has class AL or R.
    """
    ltr_mark = u'\u200E'
    rtl_mark = u'\u200F'
    marks = {'L': ltr_mark, 'AL': rtl_mark, 'R': rtl_mark}

    for char in text:
        mark = marks.get(unicodedata.bidirectional(char))
        if mark is not None:
            return mark

    return ltr_mark
def bad_unicode(wc):
    """Check that every character of ``wc[0]`` has a bidi class.

    ``wc[0]`` is converted to ``unicode`` first when necessary.

    :returns: ``False`` as soon as a character has an empty bidi class,
        ``True`` otherwise.
    """
    word = wc[0]
    if not isinstance(word, unicode):
        word = unicode(word)

    pending_high = False
    for unit in word:
        # On narrow builds a high surrogate is held back and joined to the
        # following unit before the lookup.
        if sys.maxunicode == 0xffff and (0xD800 <= ord(unit) <= 0xDBFF):
            pending_high = unit
            continue
        if pending_high:
            unit = pending_high + unit
            pending_high = False
        if unicodedata.bidirectional(unit) == '':
            return False
    return True
Example #28
0
def overview(tree_item):
    """Return an overview of the character held in ``tree_item.obj``.

    The character's properties are substituted into ``TEMPLATE`` as
    positional fields, in the order listed below.
    """
    char = tree_item.obj
    render = TEMPLATE.format
    fields = (
        unicodedata.name(char, '<NO NAME AVAILABLE>'),
        char,
        unicodedata.decimal(char, ''),
        unicodedata.digit(char, ''),
        unicodedata.numeric(char, ''),
        unicodedata.category(char),
        unicodedata.bidirectional(char),
        unicodedata.combining(char),
        unicodedata.east_asian_width(char),
        unicodedata.mirrored(char),
        unicodedata.decomposition(char),
    )
    return render(*fields)
Example #29
0
def paragraph_direction_mark(text):
    """
    Pick the direction mark matching the first strong character of *text*,
    in the spirit of http://www.unicode.org/reports/tr9/#The_Paragraph_Level

    Returns the Unicode LTR mark (U+200E) when that character has bidi class
    L or when there is no strong character, and the RTL mark (U+200F) when
    it has class AL or R.
    """
    strong_classes = (
        bidi_class
        for bidi_class in map(unicodedata.bidirectional, text)
        if bidi_class in ('L', 'AL', 'R')
    )
    # Without any strong character the text counts as left to right.
    first_strong = next(strong_classes, 'L')
    return '\u200E' if first_strong == 'L' else '\u200F'
Example #30
0
def obfuscation_machine(use_unicode=False, identifier_length=1):
    """
    A generator that returns short sequential combinations of lower and
    upper-case letters and digits, each prefixed with ``_0x``.

    If *use_unicode* is ``True``, a random pool of non-Latin letters that
    alternates between left-to-right and right-to-left characters is used
    instead of the usual ABCs.

    The *identifier_length* is the number of pool characters in the first
    identifiers; it grows by one whenever all permutations of the current
    length have been produced. Identifiers found in
    ``analyze.reserved_words`` are skipped.
    """
    if use_unicode:
        # Python 3 lets us have some *real* fun:
        allowed_categories = ('LC', 'Ll', 'Lu', 'Lo')
        # All the fun characters start at 1580 (hehe):
        big_list = list(map(chr, range(1580, HIGHEST_UNICODE)))
        max_chars = 1000  # Ought to be enough for anybody :)
        combined = []
        # Track what is already in the pool: a character picked twice would
        # make permutations() yield the same identifier more than once.
        chosen = set()
        rtl_categories = ('AL', 'R')  # AL == Arabic, R == Any right-to-left
        last_orientation = 'L'  # L = Any left-to-right
        # Find a good mix of left-to-right and right-to-left characters
        while len(combined) < max_chars:
            char = choice(big_list)
            if char in chosen:
                continue
            if unicodedata.category(char) in allowed_categories:
                orientation = unicodedata.bidirectional(char)
                # Only accept a character whose orientation differs from
                # that of the previously examined letter.
                if (orientation in rtl_categories) != (last_orientation in rtl_categories):
                    combined.append(char)
                    chosen.add(char)
                last_orientation = orientation
    else:
        # range() excludes its end point, so add one to include 'z', 'Z'
        # and '9'.
        lowercase = [chr(code) for code in range(ord('a'), ord('z') + 1)]
        uppercase = [chr(code) for code in range(ord('A'), ord('Z') + 1)]
        digit = [chr(code) for code in range(ord('0'), ord('9') + 1)]
        combined = lowercase + uppercase + digit
    shuffle(combined)  # Randomize it all to keep things interesting
    while True:
        for perm in permutations(combined, identifier_length):
            perm = "_0x" + "".join(perm)
            if perm not in analyze.reserved_words:  # Can't replace reserved words
                yield perm
        identifier_length += 1
Example #31
0
def UniqueUni(codepoint):
    """Describe the character with the given code point.

    The character is printed, then returned together with its properties.

    :param codepoint: the code point, as an int or anything ``int()`` accepts.
    :returns: a dict with the keys ``unicode`` (the character), ``bidir``
        (bidi class), ``hexa`` (``U+`` followed by lowercase hex digits),
        ``name``, ``number`` and ``cat`` (general category).
    """
    number = int(codepoint)
    letter = chr(number)
    print(str(letter))

    bidi_class = unicodedata.bidirectional(letter)
    # Unnamed characters get a single space as their name.
    char_name = unicodedata.name(letter, ' ')
    hex_label = hex(ord(letter)).replace("0x", "U+")
    category = unicodedata.category(letter)

    return {
        'unicode': u"" + letter + "",
        'bidir': bidi_class,
        'hexa': hex_label,
        'name': char_name,
        'number': number,
        'cat': category,
    }
Example #32
0
def char2info(ch):
    """Return the Unicode properties of *ch* as a dict.

    The keys are ``ch``, ``name``, ``decimal``, ``digit``, ``numeric``,
    ``category``, ``bidirectional``, ``combining``, ``east_asian_width``,
    ``mirrored``, ``decomposition``, ``unicode`` (the code point) and
    ``unicode_hex``. Name and numeric values are ``None`` when the character
    has none.
    """
    codepoint = ord(ch)
    return {
        'ch': ch,
        'name': U.name(ch, None),
        'decimal': U.decimal(ch, None),
        'digit': U.digit(ch, None),
        'numeric': U.numeric(ch, None),
        'category': U.category(ch),
        'bidirectional': U.bidirectional(ch),
        'combining': U.combining(ch),
        'east_asian_width': U.east_asian_width(ch),
        'mirrored': U.mirrored(ch),
        'decomposition': U.decomposition(ch),
        'unicode': codepoint,
        'unicode_hex': hex(codepoint),
    }
def is_right_to_left(text):
    '''Check whether a text is right-to-left text or not

    :param text: The text to check
    :type text: string
    :rtype: boolean

    See: http://unicode.org/reports/tr9/#P2

    TR9> In each paragraph, find the first character of type L, AL, or R
    TR9> while skipping over any characters between an isolate initiator
    TR9> and its matching PDI or, if it has no matching PDI, the end of the
    TR9> paragraph

    Isolates may nest, so the *matching* PDI is found by counting: every
    isolate initiator (LRI, RLI, FSI) opens a level and every PDI closes
    the innermost open one.  Strong characters are only considered while
    no isolate is open.  A PDI with no open isolate is ignored.

    Examples (U+2068 is FSI, U+2069 is PDI):

    >>> is_right_to_left('Hallo!')
    False

    >>> is_right_to_left('﷼')
    True

    >>> is_right_to_left('\u2068﷼\u2069')
    False

    >>> is_right_to_left('\u2068﷼\u2069﷼')
    True

    >>> is_right_to_left('a\u2068﷼\u2069﷼')
    False

    >>> is_right_to_left('\u2068a\u2069\u2068﷼\u2069﷼')
    True

    A strong character that follows a nested isolate but is still inside
    the outer isolate must be skipped as well:

    >>> is_right_to_left('\u2068a\u2068b\u2069﷼\u2069')
    False
    '''
    isolate_depth = 0
    for char in text:
        bidi_cat = unicodedata.bidirectional(char)
        if bidi_cat in ('LRI', 'RLI', 'FSI'):
            isolate_depth += 1
        elif bidi_cat == 'PDI':
            # Close the innermost open isolate; a stray PDI changes nothing.
            if isolate_depth:
                isolate_depth -= 1
        elif not isolate_depth:
            if bidi_cat in ('AL', 'R'):
                return True
            if bidi_cat == 'L':
                return False
    # No strong character outside of isolates: treat as left-to-right.
    return False
Example #34
0
    def test_compare_functions(self):
        """Compare ``unicodedata`` with ``unicodedb_5_2_0`` for code points below 0x10000."""
        def getX(fun, code):
            # A KeyError from the reference database is mapped to -1, the
            # same default that is handed to unicodedata below.
            try:
                return getattr(unicodedb_5_2_0, fun)(code)
            except KeyError:
                return -1

        defaulted_props = ('digit', 'numeric', 'decimal')
        plain_props = ('category', 'bidirectional', 'decomposition',
                       'mirrored', 'combining')

        for code in range(0x10000):
            char = unichr(code)
            for prop in defaulted_props:
                assert getattr(unicodedata, prop)(char, -1) == getX(prop, code)
            for prop in plain_props:
                assert getattr(unicodedata, prop)(char) == getattr(unicodedb_5_2_0, prop)(code)
Example #35
0
    def test_compare_functions(self):
        """Check every code point below 0x10000 against the ``unicodedb_5_2_0`` tables."""
        def getX(fun, code):
            lookup = getattr(unicodedb_5_2_0, fun)
            try:
                return lookup(code)
            except KeyError:
                # Same value as the default passed to unicodedata below.
                return -1

        for code in range(0x10000):
            char = unichr(code)

            # Lookups that take an explicit default.
            for fun in ('digit', 'numeric', 'decimal'):
                actual = getattr(unicodedata, fun)(char, -1)
                assert actual == getX(fun, code)

            # Lookups compared directly, without a default.
            for fun in ('category', 'bidirectional', 'decomposition',
                        'mirrored', 'combining'):
                actual = getattr(unicodedata, fun)(char)
                assert actual == getattr(unicodedb_5_2_0, fun)(code)
Example #36
0
def obfuscation_machine(use_unicode=False, identifier_length=1):
    """
    A generator that returns short sequential combinations of lower and
    upper-case letters, skipping anything found in ``RESERVED_WORDS``.

    If *use_unicode* is ``True``, use non-Latin letters (e.g. Cyrillic,
    Arabic, and Syriac) instead of the usual ABCs.

    The *identifier_length* represents the length of the first strings to
    return using the aforementioned characters; once every permutation of
    that length has been produced the length grows by one.
    """
    # This generates a list of the letters a-z:
    lowercase = list(map(chr, range(97, 123)))
    # Same thing but ALL CAPS (A-Z; the stop value 91 is exclusive, so 'Z'
    # at code point 90 is included):
    uppercase = list(map(chr, range(65, 91)))
    if use_unicode:
        # Python 3 lets us have some *real* fun:
        allowed_categories = ('LC', 'Ll', 'Lu', 'Lo')
        # All the fun characters start at 1580 (hehe):
        big_list = list(map(chr, range(1580, HIGHEST_UNICODE)))
        max_chars = 1000 # Ought to be enough for anybody :)
        combined = []
        rtl_categories = ('AL', 'R') # AL == Arabic, R == Any right-to-left
        last_orientation = 'L'       # L = Any left-to-right
        # Find a good mix of left-to-right and right-to-left characters:
        # a candidate is only kept when its direction differs (RTL vs. not
        # RTL) from the previously examined letter.
        # NOTE(review): choice() can pick the same character more than
        # once, so *combined* may hold duplicates in this mode -- confirm
        # that repeated identifiers are acceptable to callers.
        while len(combined) < max_chars:
            char = choice(big_list)
            if unicodedata.category(char) in allowed_categories:
                orientation = unicodedata.bidirectional(char)
                if last_orientation in rtl_categories:
                    if orientation not in rtl_categories:
                        combined.append(char)
                else:
                    if orientation in rtl_categories:
                        combined.append(char)
                last_orientation = orientation
    else:
        combined = lowercase + uppercase
    shuffle(combined) # Randomize it all to keep things interesting
    while True:
        for perm in permutations(combined, identifier_length):
            perm = "".join(perm)
            if perm not in RESERVED_WORDS: # Can't replace reserved words
                yield perm
        identifier_length += 1
Example #37
0
 def data(self, index, role):
     """Return the value to use for the cell at *index* under *role*.

     Edit, foreground, display and text-alignment roles are handled; any
     role/column combination not handled below falls off the end of the
     method and therefore yields ``None``.
     """
     row = index.row()
     col = index.column()

     if role == QtCore.Qt.EditRole:
         if col == 0:
             #return self.__localData[row][col][0]
             return 1 # default to 1 every time so the user can just double-click

     if role == QtCore.Qt.ForegroundRole:
         # Black unless one of the column rules below picks another color.
         qColor = QtGui.QColor(QtCore.Qt.black)
         if row >= 0:
             if col == 1:
                 qColor = QtGui.QColor(QtCore.Qt.darkGreen)
             elif col == 4:
                 qColor = QtGui.QColor(QtCore.Qt.darkBlue)
             elif col == 3 or col == 6: # grammatical category columns
                 # If there is a mismatch in grammatical category color it red
                 if self.__localData[row][3] != self.__localData[row][6]:
                     qColor = QtGui.QColor(QtCore.Qt.red)
             qBrush = QtGui.QBrush(qColor)
             return qBrush

     if role == QtCore.Qt.DisplayRole:
         #if row == 0 and col == 0:
             #self.__localData[col][row].setChecked()
             #self.__localData[col][row].setData(QtCore.Qt.Unchecked, QtCore.Qt.CheckStateRole)
             #return

         # Column 0 stores a tuple; only its first element is displayed.
         if col == 0:
             value = self.__localData[row][col][0] # first part of the tuple
         else:
             value = self.__localData[row][col]

         # Plain ``str`` values are wrapped in a QString; anything else is
         # returned unchanged.
         if type(value) == str:
             return QtCore.QString(value)
         else:
             return value

     elif role == QtCore.Qt.TextAlignmentRole:

         # Check if we have right to left data in a column, if so align it right
         if col > 0 and unicodedata.bidirectional(\
           self.__localData[row][col][0]) in ('R', 'AL'): # element 0 of this cell's value (presumably the first character of its text -- confirm)
             return QtCore.Qt.AlignRight | QtCore.Qt.AlignCenter
Example #38
0
def main():
    """Print the Unicode properties of a character given as hex UTF-8 bytes.

    Each command-line argument is parsed as one hexadecimal byte; the bytes
    are decoded as UTF-8 and one property is printed per line.  Any failure
    is reported as a single ``ERROR:`` line, after whatever lines were
    already printed.
    """
    try:
        raw = bytes(int(arg, 16) for arg in sys.argv[1:])
        c = raw.decode('utf8')
        # Values are computed lazily, row by row, so a failure part-way
        # through still leaves the preceding lines on the output.
        rows = (
            ('gryph:            %s', lambda: c),
            ('codepoint:        U+%x', lambda: ord(c)),
            ('name:             %s', lambda: unicodedata.name(c, 'Unknown')),
            ('decimal:          %s', lambda: unicodedata.decimal(c, 'Unknown')),
            ('digit:            %s', lambda: unicodedata.digit(c, 'Unknown')),
            ('numeric:          %s', lambda: unicodedata.numeric(c, 'Unknown')),
            ('category:         %s', lambda: unicodedata.category(c)),
            ('bidirectional:    %s', lambda: unicodedata.bidirectional(c)),
            ('combining:        %s', lambda: unicodedata.combining(c)),
            ('east_asian_width: %s', lambda: unicodedata.east_asian_width(c)),
            ('mirrored:         %s', lambda: unicodedata.mirrored(c)),
            ('decomposition:    %s', lambda: unicodedata.decomposition(c)),
        )
        for template, compute in rows:
            print(template % compute())
    except Exception as ex:
        print('ERROR: %s' % ex)
Example #39
0
def main():
  """Decode hex byte arguments as UTF-8 and print the character's properties.

  ``sys.argv[1:]`` is read as a sequence of hexadecimal byte values.  One
  line is printed per property; if anything raises, the lines printed so
  far are followed by a single ``ERROR:`` line.
  """
  def show(template, lookup, *fallback):
    # Look the property up for the decoded character and print it at once.
    print(template % lookup(c, *fallback))

  try:
    c = bytes(int(x, 16) for x in sys.argv[1:]).decode('utf8')
    print('gryph:            %s' % c)
    print('codepoint:        U+%x' % ord(c))
    show('name:             %s', unicodedata.name, 'Unknown')
    show('decimal:          %s', unicodedata.decimal, 'Unknown')
    show('digit:            %s', unicodedata.digit, 'Unknown')
    show('numeric:          %s', unicodedata.numeric, 'Unknown')
    show('category:         %s', unicodedata.category)
    show('bidirectional:    %s', unicodedata.bidirectional)
    show('combining:        %s', unicodedata.combining)
    show('east_asian_width: %s', unicodedata.east_asian_width)
    show('mirrored:         %s', unicodedata.mirrored)
    show('decomposition:    %s', unicodedata.decomposition)
  except Exception as ex:
    print('ERROR: %s' % ex)
Example #40
0
def get_base_level(text, upper_is_rtl=False):
    """Return the paragraph base embedding level: 0 for LTR, 1 for RTL.

    `text` a unicode object.

    The level is decided by the first strong character (rule P2); when no
    strong character is found the level defaults to 0 (rule P3).

    Set `upper_is_rtl` to True to treat upper case chars as strong 'R'
    for debugging (default: False).

    """
    pending_surrogate = False

    # P2
    for _ch in text:
        # surrogate in case of ucs2: hold it back and glue it onto the
        # character that follows
        if _IS_UCS2 and (_SURROGATE_MIN <= ord(_ch) <= _SURROGATE_MAX):
            pending_surrogate = _ch
            continue
        if pending_surrogate:
            _ch = pending_surrogate + _ch
            pending_surrogate = False

        # treat upper as RTL ?
        if upper_is_rtl and _ch.isupper():
            return 1

        bidi_type = bidirectional(_ch)
        if bidi_type in ('AL', 'R'):
            return 1
        if bidi_type == 'L':
            return 0

    # P3
    return 0
Example #41
0
    def prepare(self):
        """Reset ``self.chars`` and refill it with one record per character.

        Each record holds the character (``ch``), an unset ``level`` and
        its bidirectional type under both ``type`` and ``orig``.  When
        ``self.upper_is_rtl`` is set, upper-case characters are recorded as
        type 'R' instead of their real type.
        """
        treat_upper_as_rtl = self.upper_is_rtl

        self.chars.clear()

        for ch in self.iter_text():
            forced_rtl = treat_upper_as_rtl and ch.isupper()
            bidi_type = 'R' if forced_rtl else bidirectional(ch)
            record = dict(ch=ch, level=None, type=bidi_type, orig=bidi_type)
            self.chars.append(record)
	def test_function_checksum(self):
		"""Hash per-character properties of every code point and compare the digest.

		The digit/numeric/decimal values come from ``self.db``; the other
		properties are read from the ``unicodedata`` module.  The SHA-1 hex
		digest must equal ``self.expectedchecksum``.
		"""
		digest = hashlib.sha1()  # nosec: B303

		for codepoint in range(sys.maxunicode + 1):
			char = chr(codepoint)
			# Properties
			numeric_fields = [
				format(lookup(char, -1), ".12g")
				for lookup in (self.db.digit, self.db.numeric, self.db.decimal)
			]
			other_fields = [
				unicodedata.category(char),
				unicodedata.bidirectional(char),
				unicodedata.decomposition(char),
				str(unicodedata.mirrored(char)),
				str(unicodedata.combining(char)),
			]
			digest.update(''.join(numeric_fields + other_fields).encode("ascii"))

		self.assertEqual(digest.hexdigest(), self.expectedchecksum)
Example #43
0
def format_message(text):
    """
    Convert facebook-style text to wordpress-style text

    The text is split on newlines.  Blank lines become ``<br />``; every
    other line is wrapped by ``div_with_direction`` using a direction taken
    from the bidirectional class of its first non-whitespace character:
    'L' selects ``ltr`` and 'R' or 'AL' (Arabic letters) selects ``rtl``.
    Any other class keeps the direction of the previous line, starting from
    ``DEFAULT_TEXT_DIRECTION``.
    """
    direction = DEFAULT_TEXT_DIRECTION
    divs = []
    for line in text.split('\n'):
        stripped = line.strip()
        if not stripped:
            divs.append('<br />')
            continue
        line_bidi = unicodedata.bidirectional(stripped[0])
        if line_bidi == 'L':
            direction = 'ltr'
        elif line_bidi in ('R', 'AL'):
            # 'AL' is the class of Arabic-script letters; they are strong
            # right-to-left characters just like class 'R'.
            direction = 'rtl'
        divs.append(div_with_direction(line, direction))
    return '\n'.join(divs)
Example #44
0
def display(text, right_to_left=False, return_log_pos=False,
            upper_is_rtl=False):
    """
    Return `text` in display form.

    `right_to_left` selects the base direction.  With `return_log_pos` set,
    a list of ``(character, logical position)`` pairs is returned instead of
    a string (and mirroring is not applied), which is useful if you need to
    retain logical order but calculate display metrics.  `upper_is_rtl`
    makes upper-case characters count as type "R".
    """
    if right_to_left:
        base_level, base_direction = 1, "R"
    else:
        base_level, base_direction = 0, "L"

    storage = dict(base_level=base_level, base_dir=base_direction,
                   chars=[], runs=deque())

    for log_pos, char in enumerate(text):
        forced_rtl = upper_is_rtl and char.isupper()
        bidi_type = "R" if forced_rtl else bidirectional(char)
        storage["chars"].append(dict(
            ch=char,
            level=base_level,
            type=bidi_type,
            orig=bidi_type,
            log_pos=log_pos,
        ))

    # Run the processing passes over the shared storage, in this order.
    for stage in (explicit_embed_and_overrides,
                  resolve_weak_types,
                  resolve_neutral_types,
                  resolve_implicit_levels,
                  reorder_resolved_levels):
        stage(storage)

    if return_log_pos:
        return [(entry["ch"], entry["log_pos"]) for entry in storage["chars"]]

    apply_mirroring(storage)
    return "".join(entry["ch"] for entry in storage["chars"])
Example #45
0
    def __init__(self, symbol):
        """Store *symbol* together with its Unicode properties and normal forms.

        Characters without a name get ``'NO_NAME_FOUND'``; missing decimal,
        digit and numeric values are stored as ``-1``.  When
        ``Config.debug['unicode']`` is truthy, ``print_debug`` is called.
        """
        self.symbol = symbol
        self.name = u.name(symbol, 'NO_NAME_FOUND')

        # Lookups that take -1 as their fallback value.
        for prop in ('decimal', 'digit', 'numeric'):
            setattr(self, prop, getattr(u, prop)(self.symbol, -1))

        # Lookups that take no fallback.
        for prop in ('category', 'bidirectional', 'combining',
                     'east_asian_width', 'mirrored', 'decomposition'):
            setattr(self, prop, getattr(u, prop)(self.symbol))

        # One attribute per normalization form.
        normal_forms = (
            ('normalize_nfc', 'NFC'),
            ('normalize_nkfc', 'NFKC'),
            ('normalize_nfd', 'NFD'),
            ('normalize_nkfd', 'NFKD'),
        )
        for attr, form in normal_forms:
            setattr(self, attr, u.normalize(form, self.symbol))

        if Config.debug['unicode']:
            self.print_debug()
Example #46
0
    def test_compare_functions(self):
        """Compare ``unicodedata`` with ``unicodedb_4_1_0`` for code points below 0x10000.

        Code points listed in ``self.diff_numeric`` are expected to have no
        numeric value (-1) regardless of what the reference database says.
        """
        import unicodedata # CPython implementation

        def getX(fun, code):
            known_difference = fun == 'numeric' and code in self.diff_numeric
            if known_difference:
                return -1
            try:
                return getattr(unicodedb_4_1_0, fun)(code)
            except KeyError:
                return -1

        defaulted_props = ('digit', 'numeric', 'decimal')
        plain_props = ('category', 'bidirectional', 'decomposition',
                       'mirrored', 'combining')

        for code in range(0x10000):
            char = unichr(code)
            for prop in defaulted_props:
                assert getattr(unicodedata, prop)(char, -1) == getX(prop, code)
            for prop in plain_props:
                assert getattr(unicodedata, prop)(char) == getattr(unicodedb_4_1_0, prop)(code)
Example #47
0
    def test_ipy2_gh357(self):
        """https://github.com/IronLanguages/ironpython2/issues/357"""

        import unicodedata

        han = u'\u4e2d'

        if is_cli:
            expected_name = '<CJK IDEOGRAPH, FIRST>..<CJK IDEOGRAPH, LAST>'
        else:
            expected_name = 'CJK UNIFIED IDEOGRAPH-4E2D'
        self.assertEqual(unicodedata.name(han), expected_name)

        # The character has no numeric values: each lookup raises without
        # a default and hands back the default when one is supplied.
        for lookup in (unicodedata.decimal, unicodedata.digit, unicodedata.numeric):
            self.assertRaises(ValueError, lookup, han)
            self.assertEqual(lookup(han, 0), 0)

        expected_properties = (
            (unicodedata.category, 'Lo'),
            (unicodedata.bidirectional, 'L'),
            (unicodedata.combining, 0),
            (unicodedata.east_asian_width, 'W'),
            (unicodedata.mirrored, 0),
            (unicodedata.decomposition, ''),
        )
        for lookup, expected in expected_properties:
            self.assertEqual(lookup(han), expected)
Example #48
0
    def find_bidi(self, el):
        """Get directionality from element text.

        Walks the children of *el* in order and returns the direction of
        the first character with a strong bidirectional class ("L", "R" or
        "AL"), descending into child tags.  Returns ``None`` when no such
        character is found.
        """
        excluded_tags = ("bdi", "script", "style", "textarea")
        strong_classes = ("AL", "R", "L")

        for node in self.get_children(el, tags=False):

            if not self.is_tag(node):
                # Skip `doctype` comments, etc.
                if self.is_special_string(node):
                    continue

                # Analyze text nodes for directionality.
                for c in node:
                    bidi = unicodedata.bidirectional(c)
                    if bidi not in strong_classes:
                        continue
                    if bidi == "L":
                        return ct.SEL_DIR_LTR
                    return ct.SEL_DIR_RTL
                continue

            # Child tag: avoid analyzing certain elements specified in the
            # specification, and any element carrying a recognized `dir`.
            dir_value = util.lower(self.get_attribute_by_name(node, "dir", ""))
            direction = DIR_MAP.get(dir_value, None)
            if self.get_tag(node) in excluded_tags or direction is not None:
                continue  # pragma: no cover

            # Check directionality of this node's text; when it cannot be
            # determined, move on to the next sibling.
            value = self.find_bidi(node)
            if value is not None:
                return value

        return None
Example #49
0
    def info(self, char):
        """Build a description dict for *char*.

        Raises ``UnassignedCharacter`` when the general category is 'Cn'.
        Category and bidi class are translated through ``self.categories``
        and ``self.bidis``.  The ``example`` entry is empty for 'C*', 'Zp'
        and 'Zl' categories, the character on a dotted circle for 'M*'
        categories other than 'Mc', and the character itself otherwise.
        """
        cat = unicodedata.category(char)
        if cat == 'Cn':
            raise UnassignedCharacter

        catname = self.categories[cat]
        bidi = self.bidis[unicodedata.bidirectional(char)]
        name = unicodedata.name(char, 'an unnamed character').decode('ascii')

        has_no_example = cat[0] == 'C' or cat in ('Zp', 'Zl')
        needs_base = cat[0] == 'M' and cat[1] != 'c'
        if has_no_example:
            example = u''
        else:
            example = u'\N{DOTTED CIRCLE}' + char if needs_base else char

        result = {}
        result['code'] = u'%04X' % ord(char)
        result['name'] = name.title().replace('Cjk', 'CJK')
        result['char'] = char
        result['example'] = example
        result['category'] = catname.lower()
        result['bidi'] = bidi
        return result
Example #50
0
def get_direction(char):
    """Return character direction.

    The bidirectional class of *char* is looked up in ``bidi``.  This is a
    best-effort helper: if the lookup fails for any ordinary reason an
    empty string is returned instead of raising.
    """
    try:
        return bidi[ud.bidirectional(char)]
    # Catch Exception rather than using a bare ``except`` so that
    # KeyboardInterrupt and SystemExit still propagate.
    except Exception:  # pylint: disable=broad-except
        return ''
Example #51
0
def is_RandALCat(c: str) -> bool:
    """Tell whether *c* has bidirectional class ``R`` or ``AL``."""
    bidi_class = unicodedata.bidirectional(c)
    return bidi_class == 'R' or bidi_class == 'AL'
Example #52
0
def test_against_unicodedata():
    '''
    Check against `unicodedata` or `unicodedata2` if available with the
    correct version of Unicode.

    Every code point from 0 to 0x10FFFF that is present in the parsed
    data is compared with the module: name, numeric values, general
    category, bidi class, combining class, mirroring and decomposition,
    followed by the derived bidi class and the East Asian width tables.

    Raises ``Exception`` when no suitable `unicodedata` module was found.
    '''
    if unicodedata is None:
        raise Exception(
            'Packages unicodedata and unicodedata2 are not available with the necessary version of Unicode ({0}); many consistency tests were omitted'
            .format(mdl.UNICODE_VERSION))
    ucdf = mdl.UCDFiles()

    # Per-code-point records, indexed by the integer code point.
    ud = ucdf.unicodedata
    for cp in range(0, 0x10FFFF + 1):
        c = chr(cp)
        if cp in ud:
            name = unicodedata.name(c, None)
            if name is None:
                # Handle missing names in unicodedata
                # Compare Table 4-13 in Unicode Standard
                # http://www.unicode.org/versions/Unicode9.0.0/ch04.pdf
                if 0x17000 <= cp <= 0x187EC:
                    assert ud[cp]['Name'] == 'TANGUT IDEOGRAPH-{0:04X}'.format(
                        cp)
                else:
                    assert ud[cp]['Name'] == ''
            else:
                assert name == ud[cp]['Name']
            decimal, digit, numeric = (unicodedata.decimal(c, None),
                                       unicodedata.digit(c, None),
                                       unicodedata.numeric(c, None))
            if any(x is not None for x in (decimal, digit, numeric)):
                # The branches go from the most specific numeric type to
                # the least specific one.
                if decimal is not None:
                    # A decimal value must come with a digit value.
                    assert decimal == int(ud[cp]['Numeric_Value']) and ud[cp][
                        'Numeric_Type'] == 'Decimal' and digit is not None and decimal is not None
                elif digit is not None:
                    # A digit without a decimal value must still have a
                    # numeric value.
                    assert digit == int(ud[cp]['Numeric_Value']) and ud[cp][
                        'Numeric_Type'] == 'Digit' and decimal is None and numeric is not None
                elif numeric is not None:
                    try:
                        num = float(ud[cp]['Numeric_Value'])
                    except ValueError:
                        # Values that float() rejects but that contain '/'
                        # are evaluated as numerator / denominator.
                        if '/' in ud[cp]['Numeric_Value']:
                            numerator, denominator = ud[cp][
                                'Numeric_Value'].split('/')
                            num = float(numerator) / float(denominator)
                        else:
                            raise
                    assert numeric == num and ud[cp][
                        'Numeric_Type'] == 'Numeric' and digit is None and decimal is None
                else:
                    raise Exception
            else:
                # No numeric value at all: the data must say so as well.
                assert ud[cp]['Numeric_Value'] == 'NaN' and ud[cp][
                    'Numeric_Type'] == 'None'
            assert unicodedata.category(c) == ud[cp]['General_Category']
            assert unicodedata.bidirectional(c) == ud[cp]['Bidi_Class']
            assert unicodedata.combining(c) == int(
                ud[cp]['Canonical_Combining_Class'])
            assert unicodedata.mirrored(c) == ud[cp]['Bidi_Mirrored']
            if unicodedata.decomposition(c) == '':
                if ud[cp]['Name'].startswith('HANGUL SYLLABLE'):
                    # The Hangul syllables lack decomposition mapping in
                    # unicodedata, so calculate with a full decomposition
                    # followed by a partial composition (Unicode Standard,
                    # chapter 3.12)
                    decomp = unicodedata.normalize('NFD', c)
                    if len(decomp) == 3:
                        decomp = unicodedata.normalize('NFC',
                                                       decomp[:2]) + decomp[-1]
                    decomp = tuple(ord(x) for x in decomp)
                    assert decomp == ud[cp]['Decomposition_Mapping']
                else:
                    # Without a decomposition the data is expected to map
                    # the code point to itself.
                    assert ud[cp]['Decomposition_Mapping'] == (cp, )
            else:
                x = unicodedata.decomposition(c)
                # Drop a leading "<...>" tag so that only the hexadecimal
                # code points remain.
                if '<' in x:
                    x = x.split('>', 1)[1].strip()
                x = tuple(int(y, 16) for y in x.split('\x20'))
                assert x == ud[cp]['Decomposition_Mapping']

    dbc = ucdf.derivedbidiclass
    for cp in range(0, 0x10FFFF + 1):
        c = chr(cp)
        # Only compare assigned code points, because unicodedata and
        # unicodedata2 lack correct defaults for unassigned
        if cp in dbc and cp in ud:
            assert unicodedata.bidirectional(c) == dbc[cp]['Bidi_Class']

    eaw = ucdf.eastasianwidth
    deaw = ucdf.derivedeastasianwidth
    for cp in range(0, 0x10FFFF + 1):
        c = chr(cp)
        # Only compare assigned code points, because unicodedata and
        # unicodedata2 lack correct defaults for unassigned
        if cp in eaw and cp in ud:
            assert unicodedata.east_asian_width(
                c) == eaw[cp]['East_Asian_Width']
        if cp in deaw and cp in ud:
            assert unicodedata.east_asian_width(
                c) == deaw[cp]['East_Asian_Width']
Example #53
0
def detectStringDirection(s):
    """Score the direction of *s*: +1 per 'L' character, -1 per 'R'/'AL' one.

    A positive result means left-to-right characters dominate, a negative
    one that right-to-left characters do; every other bidirectional class
    contributes nothing.
    """
    weights = {'L': 1, 'R': -1, 'AL': -1}
    return sum(weights.get(unicodedata.bidirectional(ch), 0) for ch in s)
Example #54
0
    async def charinfo(self, *, data: str):
        """Shows information about one or several characters.

        'data' can either be a character, a unicode escape sequence, a unicode character name or a string.
        If 'data' is a string only a summary of each character's info will be displayed.
        """
        # NOTE(review): lower-casing happens before anything else, so an
        # upper-case input character is reported as its lower-case form --
        # confirm this is intended.
        data = data.lower()

        if data.startswith('\\u'):
            # Let's interpret the unicode escape sequence
            # Every piece after a "\u" marker is read as one hexadecimal
            # code point.
            hex_values = data.split('\\u')[1:]
            try:
                code_points = [int(val, 16) for val in hex_values]
            except ValueError:
                await self.bot.say('Invalid unicode escape sequence.')
                return
            else:
                data = ''.join(chr(cp) for cp in code_points)
        elif len(data) > 1:
            # Maybe we've been given the character's name ?
            # If the lookup fails, the input is kept as a plain string.
            try:
                data = unicodedata.lookup(data)
            except KeyError:
                pass

        # Normalise the input
        data = unicodedata.normalize('NFC', data)
        url_fmt = '<http://unicode-table.com/en/{:X}>'

        if len(data) == 1:
            # Detailed info on the character
            entries = [
                ('Character', data),
                ('Name', unicodedata.name(data, 'None')),
                ('Code point', '{:04x}'.format(ord(data)))
            ]
            # Decomposition and combining class are only listed when the
            # character has them.
            decomposition = unicodedata.decomposition(data)
            if decomposition != '':
                entries.append(('Decomposition', decomposition))

            combining = unicodedata.combining(data)
            if combining:
                entries.append(('Combining class', combining))

            entries.append(('Category', unicodedata.category(data)))
            bidirectional = unicodedata.bidirectional(data)
            entries.append(('Bidirectional', bidirectional if bidirectional != '' else 'None'))
            entries.append(('Mirrored', 'True' if unicodedata.mirrored(data) == 1 else 'False'))
            entries.append(('East asian width', unicodedata.east_asian_width(data)))
            entries.append(('Url', url_fmt.format(ord(data))))

            # Create the message's content and send it
            content = utils.indented_entry_to_str(entries)
            await self.bot.say_block(content)
        else:
            # Minimal info for each character
            # One line per character: the character, its escape form, its
            # name and its URL.
            entries = []
            for char in data:
                entries.append('{} | `\\u{:04x}` | {} | {}'.format(char,
                                                                   ord(char),
                                                                   unicodedata.name(char, 'None'),
                                                                   url_fmt.format(ord(char))))
            content = '\n'.join(entries)
            await self.bot.say(content)
Example #55
0
def check_bidi(label, check_ltr=False):
    """Validate *label* against the bidi rules numbered 1-6 below.

    Returns ``True`` when the label passes, or when it contains no
    right-to-left character and *check_ltr* is false (the rules are then
    not applied at all).  Raises ``IDNABidiError`` on any violation or
    when a character has no known directionality.
    """
    # Bidi rules should only be applied if string contains RTL characters
    contains_rtl = False
    for position, char in enumerate(label, 1):
        bidi_class = unicodedata.bidirectional(char)
        if bidi_class == "":
            # String likely comes from a newer version of Unicode
            raise IDNABidiError(
                "Unknown directionality in label {0} at position {1}".format(
                    repr(label), position))
        if bidi_class in ("R", "AL", "AN"):
            contains_rtl = True
            break
    if not (contains_rtl or check_ltr):
        return True

    # Bidi rule 1: the first character decides the label's direction
    leading = unicodedata.bidirectional(label[0])
    if leading == "L":
        rtl = False
    elif leading in ("R", "AL"):
        rtl = True
    else:
        raise IDNABidiError(
            "First codepoint in label {0} must be directionality L, R or AL".
            format(repr(label)))

    if rtl:
        permitted = ("R", "AL", "AN", "EN", "ES", "CS", "ET", "ON", "BN",
                     "NSM")
        closers = ("R", "AL", "EN", "AN")
        complaint = "Invalid direction for codepoint at position {0} in a right-to-left label"
    else:
        permitted = ("L", "EN", "ES", "CS", "ET", "ON", "BN", "NSM")
        closers = ("L", "EN")
        complaint = "Invalid direction for codepoint at position {0} in a left-to-right label"

    ends_validly = False
    seen_number_class = False
    for position, char in enumerate(label, 1):
        bidi_class = unicodedata.bidirectional(char)

        # Bidi rule 2 (right-to-left) / rule 5 (left-to-right)
        if bidi_class not in permitted:
            raise IDNABidiError(complaint.format(position))

        # Bidi rule 3 (right-to-left) / rule 6 (left-to-right): trailing
        # NSM characters leave the verdict of the preceding one in place
        if bidi_class in closers:
            ends_validly = True
        elif bidi_class != "NSM":
            ends_validly = False

        # Bidi rule 4: only checked for right-to-left labels
        if rtl and bidi_class in ("AN", "EN"):
            if not seen_number_class:
                seen_number_class = bidi_class
            elif seen_number_class != bidi_class:
                raise IDNABidiError(
                    "Can not mix numeral types in a right-to-left label"
                )

    if not ends_validly:
        raise IDNABidiError("Label ends with illegal codepoint directionality")

    return True
Example #56
0
def ShouldFail(domain):
    """Returns True for domains that we know are invalid, False otherwise.

    A domain is known to be invalid when it has no dot, is too long once
    every label is punycode-encoded, or when any label breaks one of the
    length, hyphen, leading-mark or bidi checks below.
    """
    if "." not in domain:
        return True
    labels = domain.split(".")

    encoded = b".".join([label.encode("punycode") for label in labels])
    if len(encoded) > 253:
        return True

    rtl_permitted = ("R", "AL", "AN", "EN", "ES", "CS", "ET", "ON", "BN",
                     "NSM")
    ltr_permitted = ("L", "EN", "ES", "CS", "ET", "ON", "BN", "NSM")

    for label in labels:
        # Length requirements, on the label itself and on its punycode form.
        if len(label) == 0 or len(label) > 63:
            return True
        if len(label.encode("punycode")) > 59:
            return True

        # Domain labels must not start with a -, end with a -, or have both their
        # third and fourth characters be --.
        if label.startswith("-") or label.endswith("-"):
            return True
        if label[2:4] == "--":
            return True

        # A label must not begin with a character of a mark category.
        if unicodedata.category(label[0]).startswith("M"):
            return True

        # Bidirectional checks (ensures that the label follows the "bidi rule"
        # for IDNA)
        classes = [unicodedata.bidirectional(ch) for ch in label]
        leading = classes[0]
        if leading in ("R", "AL"):
            rtl = True
            permitted, closers = rtl_permitted, ("R", "AL", "EN", "AN")
        elif leading == "L":
            rtl = False
            permitted, closers = ltr_permitted, ("L", "EN")
        else:
            return True

        if any(cls not in permitted for cls in classes):
            return True
        # A right-to-left label must not mix EN and AN characters.
        if rtl and "EN" in classes and "AN" in classes:
            return True

        # Walking back from the end, only NSM characters may precede the
        # first character of a closing class.
        for cls in reversed(classes):
            if cls in closers:
                break
            if cls != "NSM":
                return True
    return False
Example #57
0
    def match_dir(self, el, directionality):
        """Check directionality.

        Returns whether the direction resolved for *el* equals
        *directionality*.  The direction comes from the element's `dir`
        attribute when that maps to a definite direction; otherwise it is
        derived from the element's text or value, or from its parent.
        """

        # If we have to match both left and right, we can't match either.
        if directionality & ct.SEL_DIR_LTR and directionality & ct.SEL_DIR_RTL:
            return False

        # Element has defined direction of left to right or right to left
        # (`None` means no usable `dir` value; 0 is handled as "auto" by
        # the branches further down).
        direction = DIR_MAP.get(util.lower(el.attrs.get('dir', '')), None)
        if direction not in (None, 0):
            return direction == directionality

        # Element is the document element (the root) and no direction assigned, assume left to right.
        is_root = self.match_root(el)
        if is_root and direction is None:
            return ct.SEL_DIR_LTR == directionality

        # If `input[type=telephone]` and no direction is assigned, assume left to right.
        is_input = util.lower(el.name) == 'input'
        is_textarea = util.lower(el.name) == 'textarea'
        is_bdi = util.lower(el.name) == 'bdi'
        itype = util.lower(self.get_attribute_by_name(el, 'type',
                                                      '')) if is_input else ''
        if is_input and itype == 'tel' and direction is None:
            return ct.SEL_DIR_LTR == directionality

        # Auto handling for text inputs
        if ((is_input and itype in ('text', 'search', 'tel', 'url', 'email'))
                or is_textarea) and direction == 0:
            if is_textarea:
                # Join the textarea's plain text children, leaving out
                # special strings.
                value = []
                for node in el.contents:
                    if util.is_navigable_string(
                            node) and not util.is_special_string(node):
                        value.append(node)
                value = ''.join(value)
            else:
                value = self.get_attribute_by_name(el, 'value', '')
            if value:
                # The first character with a strong class (AL, R or L)
                # decides the direction.
                for c in value:
                    bidi = unicodedata.bidirectional(c)
                    if bidi in ('AL', 'R', 'L'):
                        direction = ct.SEL_DIR_LTR if bidi == 'L' else ct.SEL_DIR_RTL
                        return direction == directionality
                # Assume left to right
                return ct.SEL_DIR_LTR == directionality
            elif is_root:
                return ct.SEL_DIR_LTR == directionality
            # Empty value on a non-root element: defer to the parent.
            return self.match_dir(el.parent, directionality)

        # Auto handling for `bdi` and other non text inputs.
        if (is_bdi and direction is None) or direction == 0:
            direction = self.get_bidi(el)
            if direction is not None:
                return direction == directionality
            elif is_root:
                return ct.SEL_DIR_LTR == directionality
            return self.match_dir(el.parent, directionality)

        # Match parents direction
        return self.match_dir(el.parent, directionality)
Example #58
0
# Test the unicode support! 👋
#
# The identifier below is the single non-ASCII letter U+16B4 (RUNIC LETTER
# KAUN); it checks that non-ASCII names work as variables.


ᚴ=2

assert ᚴ*8 == 16

# Rebind the same name to a string holding a non-BMP character.
ᚴ="👋"

c = ᚴ*3

assert c == '👋👋👋'

# Basic lookups through the unicodedata module.
import unicodedata
assert unicodedata.category('a') == 'Ll'
assert unicodedata.category('A') == 'Lu'
assert unicodedata.name('a') == 'LATIN SMALL LETTER A'
assert unicodedata.lookup('LATIN SMALL LETTER A') == 'a'
assert unicodedata.bidirectional('a') == 'L'
assert unicodedata.normalize('NFC', 'bla') == 'bla'
Example #59
0
def check_bidi(label, check_ltr=False):
    """Check ``label`` against the six Bidi rules and return True on success.

    The rule numbers in the comments below appear to follow RFC 5893
    section 2 (NOTE(review): confirm against the RFC).

    :param label: string whose characters are classified with
        ``unicodedata.bidirectional``.
    :param check_ltr: when false, a label without any ``R``, ``AL`` or
        ``AN`` character is accepted immediately; when true, the rules
        are applied to such labels too.
    :returns: ``True`` if no rule is violated.
    :raises IDNABidiError: if a character has no known bidirectional
        class, if the first character is not ``L``, ``R`` or ``AL``, if a
        character's class is not permitted for the label's direction, if
        ``AN`` and ``EN`` digits are mixed in a right-to-left label, or if
        the label does not end with a permitted class (trailing ``NSM``
        characters are ignored).

    Note: an empty ``label`` combined with ``check_ltr=True`` reaches
    ``label[0]`` and therefore raises ``IndexError``.
    """
    # Bidi rules should only be applied if string contains RTL characters.
    # The whole label is scanned (no early exit) so that an unclassified
    # character anywhere in it is reported.
    has_rtl = False
    for position, char in enumerate(label, 1):
        bidi_class = unicodedata.bidirectional(char)
        if bidi_class == '':
            # String likely comes from a newer version of Unicode
            raise IDNABidiError(
                'Unknown directionality in label {0} at position {1}'.format(
                    repr(label), position))
        if bidi_class in ('R', 'AL', 'AN'):
            has_rtl = True
    if not (has_rtl or check_ltr):
        return True

    # Bidi rule 1: the first character fixes the direction of the label.
    first_class = unicodedata.bidirectional(label[0])
    if first_class == 'L':
        is_rtl = False
    elif first_class in ('R', 'AL'):
        is_rtl = True
    else:
        raise IDNABidiError(
            'First codepoint in label {0} must be directionality L, R or AL'.
            format(repr(label)))

    # Pick the per-direction tables once instead of branching per character.
    if is_rtl:
        # Bidi rule 2 (permitted classes) and rule 3 (permitted endings)
        permitted = ('R', 'AL', 'AN', 'EN', 'ES', 'CS', 'ET', 'ON', 'BN',
                     'NSM')
        ending_classes = ('R', 'AL', 'EN', 'AN')
        invalid_template = (
            'Invalid direction for codepoint at position {0} in a right-to-left label'
        )
    else:
        # Bidi rule 5 (permitted classes) and rule 6 (permitted endings)
        permitted = ('L', 'EN', 'ES', 'CS', 'ET', 'ON', 'BN', 'NSM')
        ending_classes = ('L', 'EN')
        invalid_template = (
            'Invalid direction for codepoint at position {0} in a left-to-right label'
        )

    ends_ok = False
    digit_class = None  # first of 'AN' / 'EN' seen in a right-to-left label
    for position, char in enumerate(label, 1):
        bidi_class = unicodedata.bidirectional(char)

        if bidi_class not in permitted:
            raise IDNABidiError(invalid_template.format(position))

        # Track whether the most recent non-NSM character may end the label;
        # NSM characters leave the current verdict untouched.
        if bidi_class in ending_classes:
            ends_ok = True
        elif bidi_class != 'NSM':
            ends_ok = False

        # Bidi rule 4: only one kind of digit in a right-to-left label.
        if is_rtl and bidi_class in ('AN', 'EN'):
            if digit_class is None:
                digit_class = bidi_class
            elif digit_class != bidi_class:
                raise IDNABidiError(
                    'Can not mix numeral types in a right-to-left label')

    if not ends_ok:
        raise IDNABidiError('Label ends with illegal codepoint directionality')

    return True
def in_table_d2(code):
    """Return True when ``code`` has bidirectional class ``"L"``.

    ``code`` is a single character, classified with
    ``unicodedata.bidirectional``. The name suggests stringprep table D.2
    (NOTE(review): confirm against RFC 3454).
    """
    bidi_class = unicodedata.bidirectional(code)
    return bidi_class == "L"