Example #1
0
def _ucp_width(ucs, control_chars='guess'):
    '''Return the number of display columns a single code point occupies.

    :arg ucs: integer value of one unicode :term:`code point`
    :kwarg control_chars: policy for C0/C1 control codes (values below 32
        and values from 0x7f up to, but not including, 0xa0):

            :guess: (default) estimate a width.  Backspace (0x08), delete
                (0x7f) and clear delete (0x94) yield -1.  Escape (0x1b)
                also yields -1 for now, although that is only a rough
                approximation and is not guaranteed to stay that way.
                Every other control code yields 0.
            :strict: raise
                :exc:`~kitchen.text.exceptions.ControlCharError` as soon
                as a control code is seen

        Any value other than ``'strict'`` is handled like ``'guess'``.
    :raises ControlCharError: when :attr:`ucs` is a control code and
        :attr:`control_chars` is ``'strict'``
    :returns: the :term:`textual width` of the code point: -1, 0, 1 or 2

    .. note:: The result is a :term:`textual width`, not a count of
        characters or bytes.
    '''
    # 8-bit control codes: the C0 block, delete, and the C1 block
    if ucs < 32 or 0x7f <= ucs < 0xa0:
        if control_chars == 'strict':
            raise ControlCharError(
                _('_ucp_width does not understand how to'
                  ' assign a width value to control characters.'))
        # Backspace, delete and clear delete each erase one character.
        # Escape is lumped in with them: how many of the following
        # characters it swallows depends on whatever interprets the
        # sequence, so -1 is frequently wrong -- but so is any other value.
        if ucs in (0x08, 0x7f, 0x94, 0x1b):
            return -1
        # The remaining control codes take up no room
        return 0

    # Combining characters add nothing themselves; their width is accounted
    # for by the character they attach to
    if _interval_bisearch(ucs, _COMBINING):
        return 0

    # From here on ucs is neither a combining nor a C0/C1 control character:
    # it is either narrow (1 column) or wide (2 columns)
    if ucs == 0x303f:
        # The one narrow code point inside the CJK ... Yi range
        return 1
    if ucs in (0x2329, 0x232a):
        return 2

    # Inclusive (first, last) bounds of the wide ranges
    wide_ranges = (
        (0x1100, 0x115f),    # Hangul Jamo init. consonants
        (0x2e80, 0xa4cf),    # CJK ... Yi
        (0xac00, 0xd7a3),    # Hangul Syllables
        (0xf900, 0xfaff),    # CJK Compatibility Ideographs
        (0xfe10, 0xfe19),    # Vertical forms
        (0xfe30, 0xfe6f),    # CJK Compatibility Forms
        (0xff00, 0xff60),    # Fullwidth Forms
        (0xffe0, 0xffe6),
        (0x20000, 0x2fffd),
        (0x30000, 0x3fffd),
    )
    for first, last in wide_ranges:
        if first <= ucs <= last:
            return 2
    return 1
def process_control_chars(string, strategy='replace'):
    '''Find :term:`control characters` in a string and deal with them

    :arg string: string in which to look for :term:`control characters`
    :kwarg strategy: XML does not allow :term:`ASCII` :term:`control
        characters`, so the caller has to choose what happens to them.
        Valid options are:

        :replace: (default) Substitute each :term:`control character`
            according to the module's replace table (``"?"``)
        :ignore: Drop the characters from the output altogether
        :strict: Raise a :exc:`~kitchen.text.exceptions.ControlCharError`
            if any control character is found
    :raises TypeError: if :attr:`string` is not a unicode string.
    :raises ValueError: if the strategy is not one of replace, ignore, or
        strict.
    :raises kitchen.text.exceptions.ControlCharError: if the strategy is
        ``strict`` and a :term:`control character` is present in the
        :attr:`string`
    :returns: :class:`str` string with no :term:`control characters` in
        it.  A string that had none to begin with is returned as is.

    .. versionchanged:: kitchen 1.2.0, API: kitchen.text 2.2.0
        Strip out the C1 control characters in addition to the C0 control
        characters.
    '''
    if not isunicodestring(string):
        raise TypeError('process_control_char must have a unicode type'
                        ' (str) as the first argument.')
    if strategy not in ('replace', 'ignore', 'strict'):
        raise ValueError('The strategy argument to process_control_chars'
                         ' must be one of ignore, replace, or strict')

    # Fast path: the common case is a string without control chars, and
    # checking for them is cheaper than running a translation
    if _CONTROL_CHARS.isdisjoint(string):
        return string

    # At least one control char is present
    if strategy == 'strict':
        raise ControlCharError('ASCII control code present in string'
                               ' input')

    # Only 'replace' and 'ignore' can get this far
    control_table = (_REPLACE_TABLE if strategy == 'replace'
                     else _IGNORE_TABLE)
    return string.translate(control_table)
Example #3
0
def process_control_chars(string, strategy='replace'):
    '''Find :term:`control characters` in a string and deal with them

    :arg string: string in which to look for :term:`control characters`
    :kwarg strategy: XML does not allow :term:`ASCII` :term:`control
        characters`, so the caller has to choose what happens to them.
        Valid options are:

        :replace: (default) Substitute ``"?"`` for each
            :term:`control character`
        :ignore: Drop the characters from the output altogether
        :strict: Raise a :exc:`~kitchen.text.exceptions.ControlCharError`
            if any control character is found
    :raises TypeError: if :attr:`string` is not a unicode string.
    :raises ValueError: if the strategy is not one of replace, ignore, or
        strict.
    :raises kitchen.text.exceptions.ControlCharError: if the strategy is
        ``strict`` and a :term:`control character` is present in the
        :attr:`string`
    :returns: :class:`unicode` string with no :term:`control characters` in
        it.
    '''
    if not isinstance(string, unicode):
        raise TypeError(
            k.b_('process_control_char must have a unicode type as'
                 ' the first argument.'))

    if strategy == 'strict':
        # Nothing gets translated in strict mode: either the input is
        # clean and handed back untouched, or we refuse it
        present = frozenset(string)
        for char in _CONTROL_CHARS:
            if char in present:
                raise ControlCharError(
                    k.b_('ASCII control code present in string'
                         ' input'))
        return string

    # Mapping a code to None makes translate() delete that character
    if strategy == 'ignore':
        substitute = None
    elif strategy == 'replace':
        substitute = u'?'
    else:
        raise ValueError(
            k.b_('The strategy argument to process_control_chars'
                 ' must be one of ignore, replace, or strict'))

    # Every control code maps to the same substitute
    control_table = dict.fromkeys(_CONTROL_CODES, substitute)
    if not control_table:
        # No codes to translate, so the input goes back unchanged
        return string
    return string.translate(control_table)
Example #4
0
def process_control_chars(string, strategy='replace'):
    '''Find control characters in a string and deal with them

    :arg string: string in which to look for control characters
    :kwarg strategy: XML does not allow ASCII control characters, so the
        caller has to choose what happens to them.  Valid options are:
        :replace: (default) Substitute "?" for each control character
        :ignore: Drop the characters from the output altogether
        :strict: Raise an error if any control character is found
    :raises TypeError: if :attr:`string` is not a unicode string.
    :raises ValueError: if the strategy is not one of replace, ignore, or
        strict.
    :raises ControlCharError: if the strategy is strict and a control
        character is present in :attr:`string`
    :returns: unicode string with no control characters in it.
    '''
    if not isinstance(string, unicode):
        raise TypeError(
            _('process_control_char must have a unicode type as'
              ' the first argument.'))

    if strategy == 'strict':
        # Strict mode never rewrites the input; it only accepts or rejects
        found = frozenset(string)
        for char in _control_chars:
            if char in found:
                raise ControlCharError(
                    _('ASCII control code present in string'
                      ' input'))
        return string

    if strategy not in ('ignore', 'replace'):
        raise ValueError(
            _('The strategy argument to process_control_chars'
              ' must be one of ignore, replace, or strict'))

    # A None entry tells translate() to delete the character ('ignore');
    # 'replace' swaps in a question mark instead
    filler = None
    if strategy == 'replace':
        filler = u'?'
    control_table = dict.fromkeys(_control_codes, filler)

    if control_table:
        string = string.translate(control_table)

    return string