Exemple #1
0
def sort_key(s):
    """Return a collation key for *s*, loading pyuca lazily on first use.

    The Unicode Collation Algorithm (UCA)
    (http://www.unicode.org/reports/tr10/) is used for sorting utf-8 and
    unicode strings and for comparing utf-8 strings.

    Note:
        pyuca is a very memory-hungry module! It loads the whole
        "allkey.txt" file (~2mb!) into memory. That functionality is
        needed only when sort_key() is called as part of a sort() or
        when Utf8 strings are compared.

    So this is a lazy "sort_key" function which (ONLY ONCE, ON ITS FIRST
    CALL) tries to import pyuca and rebinds the module-level name
    ``sort_key`` to the real implementation. When pyuca cannot be loaded
    the replacement falls back to the lower-cased text.

    Args:
        s: value to build a key for; ``str`` instances are first passed
            through ``to_unicode(s, 'utf-8')``, anything else is used as is.

    Returns:
        The key computed for *s* by the implementation just installed.
    """
    global sort_key
    try:
        from gluon.contrib.pyuca import unicode_collator
        unicode_sort_key = unicode_collator.sort_key
    except Exception:
        # Best-effort fallback when pyuca is unavailable or fails to load.
        # ``Exception`` (not a bare ``except``) keeps KeyboardInterrupt and
        # SystemExit propagating instead of being silently swallowed.
        sort_key = lambda s: (
            to_unicode(s, 'utf-8') if isinstance(s, str) else s).lower()
    else:
        sort_key = lambda s: unicode_sort_key(
            to_unicode(s, 'utf-8') if isinstance(s, str) else s)
    return sort_key(s)
Exemple #2
0
def sort_key(s):
    """Return a collation key for *s*; pyuca is imported on the first call.

    The Unicode Collation Algorithm (UCA)
    (http://www.unicode.org/reports/tr10/) is used for sorting utf-8 and
    unicode strings and for comparing utf-8 strings.

    Note:
        pyuca is a very memory-hungry module! It loads the whole
        "allkey.txt" file (~2mb!) into memory. That functionality is
        needed only when sort_key() is called as part of a sort() or
        when Utf8 strings are compared.

    This function is therefore lazy: ONLY ONCE, ON ITS FIRST CALL, it
    tries to import pyuca and replaces the module-level ``sort_key`` with
    the real key function. If loading pyuca fails, the replacement is a
    plain lower-casing key instead.

    Args:
        s: value to build a key for; ``str`` instances are converted with
            ``to_unicode(s, 'utf-8')`` first, other values are used as is.

    Returns:
        The key computed for *s* by the freshly installed implementation.
    """
    global sort_key
    try:
        from gluon.contrib.pyuca import unicode_collator
        unicode_sort_key = unicode_collator.sort_key
    except Exception:
        # Deliberate best-effort: any failure to load pyuca selects the
        # simple fallback. Catching ``Exception`` rather than everything
        # lets KeyboardInterrupt / SystemExit pass through.
        sort_key = lambda s: (to_unicode(s, 'utf-8')
                              if isinstance(s, str) else s).lower()
    else:
        sort_key = lambda s: unicode_sort_key(
            to_unicode(s, 'utf-8') if isinstance(s, str) else s)
    return sort_key(s)
Exemple #3
0
    def __repr__(self):
        r'''Return a quoted, escaped representation that keeps utf-8 text as is.

        (A raw docstring is used so the examples below do not need doubled
        backslashes.)

        NOTE! This function is a clone of web2py:gluon.languages.utf_repl() function::

            utf8.__repr__() works same as str.repr() when processing ascii string
            >>> repr(Utf8('abc')) == repr(Utf8("abc")) == repr('abc') == repr("abc") == "'abc'"
            True
            >>> repr(Utf8('a"b"c')) == repr('a"b"c') == '\'a"b"c\''
            True
            >>> repr(Utf8("a'b'c")) == repr("a'b'c") == '"a\'b\'c"'
            True
            >>> repr(Utf8('a\'b"c')) == repr('a\'b"c') == repr(Utf8("a'b\"c")) == repr("a'b\"c") == '\'a\\\'b"c\''
            True
            >>> repr(Utf8('a\r\nb')) == repr('a\r\nb') == "'a\\r\\nb'" # Test for \r, \n
            True

        Unlike str.repr(), Utf8.__repr__() remains utf8 content when processing utf8 string::

            >>> repr(Utf8('中文字')) == repr(Utf8("中文字")) == "'中文字'" != repr('中文字')
            True
            >>> repr(Utf8('中"文"字')) == "'中\"文\"字'" != repr('中"文"字')
            True
            >>> repr(Utf8("中'文'字")) == '"中\'文\'字"' != repr("中'文'字")
            True
            >>> repr(Utf8('中\'文"字')) == repr(Utf8("中'文\"字")) == '\'中\\\'文"字\'' != repr('中\'文"字') == repr("中'文\"字")
            True
            >>> repr(Utf8('中\r\n文')) == "'中\\r\\n文'" != repr('中\r\n文') # Test for \r, \n
            True
        '''
        has_single = str.find(self, "'") >= 0
        has_double = str.find(self, '"') >= 0
        if has_single and not has_double:
            # only single quotes present: wrap in double quotes and use the
            # table that leaves the single quote untouched
            quote, table = '"', repr_escape_tab
        else:
            quote, table = "'", repr_escape_tab2
        escaped = to_unicode(self, 'utf-8').translate(table)
        return quote + to_native(escaped, 'utf-8') + quote
Exemple #4
0
def truncate(string, length, dots='...'):
    """Return *string* shortened to at most *length* characters.

    When the text is longer than *length* it is cut and *dots* is appended
    so that the result, suffix included, is *length* characters long.

    Args:
        string: text to shorten; converted with
            ``to_unicode(string, 'utf-8')`` before measuring
        length (int): max length of the result, in characters; values
            below zero are treated as zero
        dots (str or unicode): suffix appended when the string is cut; the
            suffix itself is cut when it is longer than *length*

    Returns:
        (utf8-str): original or cut string, as an ``Utf8`` instance
    """
    text = to_unicode(string, 'utf-8')
    dots = to_unicode(dots, 'utf-8') if isinstance(dots, str) else dots
    limit = max(length, 0)
    if len(text) > limit:
        # Keep the slice bound non-negative: a negative bound would count
        # from the end of the text and yield more than *length* characters.
        dots = dots[:limit]
        text = text[:limit - len(dots)] + dots
    return str.__new__(Utf8, text.encode('utf-8'))
Exemple #5
0
def ord(char):
    """Return the unicode code point of *char*.

    *char* is SUPPOSED to be a single utf-8 or unicode character; anything
    that is not already unicode is decoded as utf-8 first.
    """
    if not isinstance(char, unicodeT):
        char = to_unicode(char, 'utf-8')
    return __builtin__.ord(char)
Exemple #6
0
def truncate(string, length, dots='...'):
    """Return *string* cut down to at most *length* characters.

    Text that already fits is returned unchanged. Longer text is cut and
    *dots* is appended, the total (suffix included) being *length*
    characters.

    Args:
        string: text to shorten; converted with
            ``to_unicode(string, 'utf-8')`` before measuring
        length (int): max length of the result, in characters; negative
            values are treated as zero
        dots (str or unicode): suffix appended when the string is cut; it
            is shortened as well when it would not fit into *length*

    Returns:
        (utf8-str): original or cut string, as an ``Utf8`` instance
    """
    text = to_unicode(string, 'utf-8')
    dots = to_unicode(dots, 'utf-8') if isinstance(dots, str) else dots
    limit = max(length, 0)
    if len(text) > limit:
        # Without clamping, ``length < len(dots)`` gives a negative slice
        # bound, which slices from the end and overshoots *length*.
        suffix = dots[:limit]
        text = text[:limit - len(suffix)] + suffix
    return str.__new__(Utf8, text.encode('utf-8'))
Exemple #7
0
def ord(char):
    """Return the unicode id of the utf-8 or unicode character *char*.

    The argument is SUPPOSED to be exactly one character; non-unicode
    input is converted with ``to_unicode(char, 'utf-8')`` beforehand.
    """
    as_unicode = char if isinstance(char, unicodeT) else to_unicode(char, 'utf-8')
    return __builtin__.ord(as_unicode)
Exemple #8
0
 def __new__(cls, content='', codepage='utf-8'):
     """Build the instance from *content*, storing it as utf-8.

     Args:
         content: initial value. Unicode input is converted with
             ``to_native(content, 'utf-8')``.
         codepage: encoding of non-unicode *content*. For 'utf-8'/'utf8',
             or when *content* is already an instance of *cls*, the value
             is stored unchanged. Otherwise it is decoded from *codepage*
             and converted to utf-8.
     """
     if isinstance(content, unicodeT):
         data = to_native(content, 'utf-8')
     elif isinstance(content, cls) or codepage in ('utf-8', 'utf8'):
         data = content
     else:
         data = to_native(to_unicode(content, codepage), 'utf-8')
     return str.__new__(cls, data)
Exemple #9
0
 def __new__(cls, content='', codepage='utf-8'):
     """Create a new instance holding *content* as utf-8.

     Unicode *content* is converted with ``to_native(content, 'utf-8')``.
     Content that is already an instance of *cls*, or whose *codepage* is
     'utf-8'/'utf8', is stored as given. Anything else is decoded from
     *codepage* and converted to utf-8.
     """
     if isinstance(content, unicodeT):
         return str.__new__(cls, to_native(content, 'utf-8'))
     already_utf8 = codepage in ('utf-8', 'utf8') or isinstance(content, cls)
     if already_utf8:
         return str.__new__(cls, content)
     decoded = to_unicode(content, codepage)
     return str.__new__(cls, to_native(decoded, 'utf-8'))
Exemple #10
0
    def __repr__(self):
        r'''Return the quoted, escaped form of the string, keeping utf-8 text.

        (The docstring is raw so the examples below do not need doubled
        backslashes.)

        NOTE! This function is a clone of web2py:gluon.languages.utf_repl() function::

            utf8.__repr__() works same as str.repr() when processing ascii string
            >>> repr(Utf8('abc')) == repr(Utf8("abc")) == repr('abc') == repr("abc") == "'abc'"
            True
            >>> repr(Utf8('a"b"c')) == repr('a"b"c') == '\'a"b"c\''
            True
            >>> repr(Utf8("a'b'c")) == repr("a'b'c") == '"a\'b\'c"'
            True
            >>> repr(Utf8('a\'b"c')) == repr('a\'b"c') == repr(Utf8("a'b\"c")) == repr("a'b\"c") == '\'a\\\'b"c\''
            True
            >>> repr(Utf8('a\r\nb')) == repr('a\r\nb') == "'a\\r\\nb'" # Test for \r, \n
            True

        Unlike str.repr(), Utf8.__repr__() remains utf8 content when processing utf8 string::

            >>> repr(Utf8('中文字')) == repr(Utf8("中文字")) == "'中文字'" != repr('中文字')
            True
            >>> repr(Utf8('中"文"字')) == "'中\"文\"字'" != repr('中"文"字')
            True
            >>> repr(Utf8("中'文'字")) == '"中\'文\'字"' != repr("中'文'字")
            True
            >>> repr(Utf8('中\'文"字')) == repr(Utf8("中'文\"字")) == '\'中\\\'文"字\'' != repr('中\'文"字') == repr("中'文\"字")
            True
            >>> repr(Utf8('中\r\n文')) == "'中\\r\\n文'" != repr('中\r\n文') # Test for \r, \n
            True
        '''
        only_single_quotes = (
            str.find(self, "'") >= 0 and str.find(self, '"') < 0)
        decoded = to_unicode(self, 'utf-8')
        if not only_single_quotes:
            # default: single-quoted, translated with the second table
            escaped = decoded.translate(repr_escape_tab2)
            return "'" + to_native(escaped, 'utf-8') + "'"
        # only single quotes exist: double-quote the result instead
        escaped = decoded.translate(repr_escape_tab)
        return '"' + to_native(escaped, 'utf-8') + '"'
Exemple #11
0
def write_dict(filename, contents):
    """Write *contents* to *filename* as a Python dict literal.

    Nothing is written when *contents* has a '__corrupted__' key. The file
    starts with a utf-8 coding line and holds one ``key: value,`` entry
    per line, keys ordered by their lower-cased unicode form; keys and
    values are rendered with ``repr(Utf8(...))``.

    IOError/OSError are not propagated; a warning is logged when
    ``is_writable()`` is true. The file is always closed if it was opened.
    """
    if '__corrupted__' in contents:
        return

    def lowered(key):
        return to_unicode(key, 'utf-8').lower()

    fp = None
    try:
        fp = LockedFile(filename, 'w')
        fp.write('# -*- coding: utf-8 -*-\n{\n')
        for key in sorted(contents, key=lowered):
            entry = (repr(Utf8(key)), repr(Utf8(contents[key])))
            fp.write('%s: %s,\n' % entry)
        fp.write('}\n')
    except (IOError, OSError):
        if is_writable():
            logging.warning('Unable to write to file %s' % filename)
    finally:
        if fp:
            fp.close()
Exemple #12
0
def write_dict(filename, contents):
    """Dump the mapping *contents* into *filename* as dict source code.

    A mapping with a '__corrupted__' key is skipped entirely. Otherwise
    the file gets a utf-8 coding header, an opening brace, one
    ``repr(Utf8(key)): repr(Utf8(value)),`` line per item (keys sorted by
    their lower-cased unicode form) and a closing brace.

    IOError/OSError are swallowed; they are logged as a warning only when
    ``is_writable()`` returns true. An opened file is closed in all cases.
    """
    if '__corrupted__' in contents:
        return
    line_format = '%s: %s,\n'
    fp = None
    try:
        fp = LockedFile(filename, 'w')
        fp.write('# -*- coding: utf-8 -*-\n{\n')
        ordered = sorted(
            contents, key=lambda name: to_unicode(name, 'utf-8').lower())
        for name in ordered:
            shown_key = repr(Utf8(name))
            shown_value = repr(Utf8(contents[name]))
            fp.write(line_format % (shown_key, shown_value))
        fp.write('}\n')
    except (IOError, OSError):
        if is_writable():
            logging.warning('Unable to write to file %s' % filename)
        return
    finally:
        if fp:
            fp.close()
Exemple #13
0
 def sub_dict(m):
     """Expand one matched ``word(key or num)`` placeholder.

     *m* is a match object with the named groups ``w`` (the word,
     including an optional ``!``/``?`` prefix) and ``n`` (digits, or a key
     looked up in ``symbols``). Recognised forms::

         word(key or num)
         !word(key or num), !!word(key or num), !!!word(key or num)
         ?word1?word(key or num)
              ??word(key or num), ?word(key or num)
         ?word1?word?word0(key or num)
         ?word1?word?(key or num)
              ??word?word0(key or num)
         ?word1?word?(key or num)
              ??word?(key or num), ?word?(key or num)
     """
     token, count = m.group('w', 'n')
     lead = token[0]
     count = int(count) if count.isdigit() else symbols[count]
     if lead not in '!?':
         return self.plural(token, count)
     if lead == '?':
         # ?[word1]?word[?word0](key or num), ?[word1]?word(key or num) or ?word(key or num)
         first, sep, rest = token[1:].partition("?")
         if sep:
             for_one = first
         else:
             for_one, rest = "", first
         for_many, sep, for_zero = rest.partition("?")
         if not sep:
             for_zero = for_many
         num = int(count)
         if num == 1:
             return for_one
         if num == 0:
             return for_zero
         return for_many
     # one, two or three leading '!' select the case-changing function
     for prefix, fun in (('!!!', upper_fun), ('!!', title_fun), ('!', cap_fun)):
         if token.startswith(prefix):
             break
     s = fun(self.plural(token[len(prefix):], count))
     if PY2:
         return s
     return to_unicode(s)
Exemple #14
0
 def sub_dict(m):
     """Return the replacement text for one ``word(key or num)`` match.

     The match object *m* must provide the named groups ``w`` (word with
     its optional ``!``/``?`` prefix) and ``n`` (a number, or a key into
     ``symbols``). Handled forms::

         word(key or num)
         !word(key or num), !!word(key or num), !!!word(key or num)
         ?word1?word(key or num)
              ??word(key or num), ?word(key or num)
         ?word1?word?word0(key or num)
         ?word1?word?(key or num)
              ??word?word0(key or num)
         ?word1?word?(key or num)
              ??word?(key or num), ?word?(key or num)
     """
     raw, amount = m.group('w', 'n')
     first_char = raw[0]
     amount = int(amount) if amount.isdigit() else symbols[amount]
     if first_char not in '!?':
         return self.plural(raw, amount)
     if first_char == '?':
         # ?[word1]?word[?word0](key or num), ?[word1]?word(key or num) or ?word(key or num)
         head, sep, tail = raw[1:].partition("?")
         singular = head if sep else ""
         remainder = tail if sep else head
         plural_form, sep, zero_form = remainder.partition("?")
         if not sep:
             zero_form = plural_form
         num = int(amount)
         if num == 1:
             return singular
         return zero_form if num == 0 else plural_form
     # count the '!' markers (at most three) to pick the transformation
     if raw.startswith('!!!'):
         bangs = 3
     elif raw.startswith('!!'):
         bangs = 2
     else:
         bangs = 1
     fun = (cap_fun, title_fun, upper_fun)[bangs - 1]
     s = fun(self.plural(raw[bangs:], amount))
     return s if PY2 else to_unicode(s)
Exemple #15
0
| License: LGPLv3 (http://www.gnu.org/licenses/lgpl.html)
| Created by Vladyslav Kozlovskyy (Ukraine) <dbdevelop©gmail.com>
| for Web2py project

Utilities and a class for managing UTF-8 strings
------------------------------------------------
"""
from __future__ import print_function
from pydal._compat import builtin as __builtin__, unicodeT, iteritems, to_unicode, to_native

__all__ = ['Utf8']

# Translation tables (code point -> escape text) for unicode.translate().
# FIXME PY3
repr_escape_tab = {}
for i in range(1, 32):
    # every control character starts out as a \xNN escape
    repr_escape_tab[i] = to_unicode("\\" + "x%02x" % i)
# control characters with a short escape, plus the backslash itself
repr_escape_tab.update({
    7: u'\\a',
    8: u'\\b',
    9: u'\\t',
    10: u'\\n',
    11: u'\\v',
    12: u'\\f',
    13: u'\\r',
    ord('\\'): u'\\\\',
})
# the second table additionally escapes the single quote
repr_escape_tab2 = dict(repr_escape_tab)
repr_escape_tab2[ord('\'')] = u"\\'"


def sort_key(s):
    """Unicode Collation Algorithm (UCA) (http://www.unicode.org/reports/tr10/)
    is used for utf-8 and unicode strings sorting and for utf-8 strings
Exemple #16
0
def upper_fun(s):
    """Return *s* upper-cased: to_unicode(), ``.upper()``, then to_bytes()."""
    text = to_unicode(s)
    return to_bytes(text.upper())
Exemple #17
0
def title_fun(s):
    """Return *s* title-cased: to_unicode(), ``.title()``, then to_bytes()."""
    text = to_unicode(s)
    return to_bytes(text.title())
Exemple #18
0
def sort_function(x):
    """Return the lower-cased unicode form of *x* (decoded as utf-8)."""
    decoded = to_unicode(x, 'utf-8')
    return decoded.lower()
Exemple #19
0
 def __len__(self):
     """Return the length of the value after ``to_unicode(self, 'utf-8')``."""
     decoded = to_unicode(self, 'utf-8')
     return len(decoded)
Exemple #20
0
 def encode(self, *a, **b):
     """Return the string encoded as requested.

     Positional and keyword arguments are forwarded unchanged to the
     unicode ``encode`` call. On PY2, when the requested encoding is not
     'utf8', the value is converted with ``to_unicode(str(self))`` and
     re-encoded; in every other case ``str(self)`` is returned as is.
     """
     # The encoding may be given positionally, by keyword, or omitted;
     # indexing a[0] unconditionally would raise IndexError in the latter
     # two cases. An omitted encoding is treated as 'utf8'.
     encoding = a[0] if a else b.get('encoding', 'utf8')
     if PY2 and encoding != 'utf8':
         return to_unicode(str(self)).encode(*a, **b)
     return str(self)
Exemple #21
0
def cap_fun(s):
    """Return *s* capitalized: to_unicode(), ``.capitalize()``, then to_bytes()."""
    text = to_unicode(s)
    return to_bytes(text.capitalize())
Exemple #22
0
 def __len__(self):
     """Return the length measured on ``to_unicode(self, 'utf-8')``."""
     as_unicode = to_unicode(self, 'utf-8')
     return len(as_unicode)
Exemple #23
0
 def __getitem__(self, index):
     """Return the item at *index* of the decoded text as a new ``Utf8``.

     Indexing is applied to ``to_unicode(self, 'utf-8')``; the selected
     part is converted back with ``to_native(..., 'utf-8')``.
     """
     picked = to_unicode(self, 'utf-8')[index]
     return str.__new__(Utf8, to_native(picked, 'utf-8'))
Exemple #24
0
| License: LGPLv3 (http://www.gnu.org/licenses/lgpl.html)
| Created by Vladyslav Kozlovskyy (Ukraine) <dbdevelop©gmail.com>
| for Web2py project

Utilities and a class for managing UTF-8 strings
------------------------------------------------
"""
from __future__ import print_function
from pydal._compat import builtin as __builtin__, unicodeT, iteritems, to_unicode, to_native

__all__ = ['Utf8']

# Escape tables for unicode.translate(): code point -> replacement text.
# FIXME PY3
repr_escape_tab = {}
for i in range(1, 32):
    # default for control characters: a \xNN escape
    repr_escape_tab[i] = to_unicode("\\" + "x%02x" % i)
# short escapes override the defaults; the backslash is escaped as well
repr_escape_tab.update({
    7: u'\\a',
    8: u'\\b',
    9: u'\\t',
    10: u'\\n',
    11: u'\\v',
    12: u'\\f',
    13: u'\\r',
    ord('\\'): u'\\\\',
})
# same table, with the single quote escaped too
repr_escape_tab2 = dict(repr_escape_tab)
repr_escape_tab2[ord('\'')] = u"\\'"


def sort_key(s):
    """Unicode Collation Algorithm (UCA) (http://www.unicode.org/reports/tr10/)
    is used for utf-8 and unicode strings sorting and for utf-8 strings
Exemple #25
0
 def encode(self, *a, **b):
     """Encode the string with the given encoding arguments.

     All arguments are passed through to the unicode ``encode`` call. On
     PY2 an encoding other than 'utf8' goes through
     ``to_unicode(str(self)).encode(...)``; otherwise ``str(self)`` is
     returned unchanged.
     """
     # Read the encoding defensively: it can be positional, a keyword, or
     # missing, and a bare a[0] raises IndexError when no positional
     # argument was given. A missing encoding counts as 'utf8'.
     if a:
         encoding = a[0]
     else:
         encoding = b.get('encoding', 'utf8')
     if PY2 and encoding != 'utf8':
         return to_unicode(str(self)).encode(*a, **b)
     return str(self)
Exemple #26
0
def sort_function(x):
    """Decode *x* as utf-8 with to_unicode() and return it lower-cased."""
    as_unicode = to_unicode(x, 'utf-8')
    return as_unicode.lower()
Exemple #27
0
 def __getslice__(self, begin, end):
     """Return ``[begin:end]`` of the decoded text as a new ``Utf8``.

     The slice is taken on ``to_unicode(self, 'utf-8')`` and converted
     back with ``to_native(..., 'utf-8')``.
     """
     piece = to_unicode(self, 'utf-8')[begin:end]
     return str.__new__(Utf8, to_native(piece, 'utf-8'))
Exemple #28
0
def title_fun(s):
    """Title-case *s* via to_unicode() and return the to_bytes() result."""
    titled = to_unicode(s).title()
    return to_bytes(titled)
Exemple #29
0
def cap_fun(s):
    """Capitalize *s* via to_unicode() and return the to_bytes() result."""
    capitalized = to_unicode(s).capitalize()
    return to_bytes(capitalized)
Exemple #30
0
 def __getitem__(self, index):
     """Index the decoded text and return the result as a new ``Utf8``.

     *index* is applied to ``to_unicode(self, 'utf-8')``; the outcome is
     converted with ``to_native(..., 'utf-8')`` before being wrapped.
     """
     decoded = to_unicode(self, 'utf-8')
     native = to_native(decoded[index], 'utf-8')
     return str.__new__(Utf8, native)
Exemple #31
0
def upper_fun(s):
    """Upper-case *s* via to_unicode() and return the to_bytes() result."""
    uppered = to_unicode(s).upper()
    return to_bytes(uppered)
Exemple #32
0
 def __getslice__(self, begin, end):
     """Slice the decoded text and return the result as a new ``Utf8``.

     ``[begin:end]`` is applied to ``to_unicode(self, 'utf-8')``; the
     outcome is converted with ``to_native(..., 'utf-8')`` and wrapped.
     """
     decoded = to_unicode(self, 'utf-8')
     native = to_native(decoded[begin:end], 'utf-8')
     return str.__new__(Utf8, native)