Example #1
    def getdetails(self, text):
        chardetails = {}
        for character in text:
            chardetails[character] = {}
            chardetails[character]['Name'] = unicodedata.name(character)
            chardetails[character]['HTML Entity'] = str(ord(character))
            chardetails[character]['Code point'] = repr(character)
            try:
                chardetails[character]['Numeric Value'] = \
                        unicodedata.numeric(character)
            except:
                pass
            try:
                chardetails[character]['Decimal Value'] = \
                        unicodedata.decimal(character)
            except:
                pass
            try:
                chardetails[character]['Digit'] = unicodedata.digit(character)
            except:
                pass
            chardetails[character]['Alphabet'] = str(character.isalpha())
            chardetails[character]['Digit'] = str(character.isdigit())
            chardetails[character]['AlphaNumeric'] = str(character.isalnum())
            chardetails[character]['Canonical Decomposition'] = \
                    unicodedata.decomposition(character)

        chardetails['Characters'] = list(text)
        return chardetails
Example #2
def test_numeric_chars_contains_all_valid_unicode_numeric_and_digit_characters(
):
    set_numeric_hex = set(numeric_hex)
    set_numeric_chars = set(numeric_chars)
    set_digit_chars = set(digit_chars)
    set_decimal_chars = set(decimal_chars)
    for i in py23_range(0X110000):
        try:
            a = py23_unichr(i)
        except ValueError:
            break
        if a in set('0123456789'):
            continue
        if unicodedata.numeric(a, None) is not None:
            assert i in set_numeric_hex
            assert a in set_numeric_chars
        if unicodedata.digit(a, None) is not None:
            assert i in set_numeric_hex
            assert a in set_digit_chars
        if unicodedata.decimal(a, None) is not None:
            assert i in set_numeric_hex
            assert a in set_decimal_chars

    assert set_decimal_chars.isdisjoint(digits_no_decimals)
    assert set_digit_chars.issuperset(digits_no_decimals)

    assert set_decimal_chars.isdisjoint(numeric_no_decimals)
    assert set_numeric_chars.issuperset(numeric_no_decimals)
Example #3
def get_type_numeric(pred, db):
    cur = db.conn.cursor()
    types = {}
    obj_list = []
    # cur.execute("""select max(a.obj1), min(a.obj1), avg(a.obj1), percentile_disc(0.1) within group (order by a.obj1) as p10,
    # 	percentile_disc(0.9) within group (order by a.obj1) as p90
    # 	from (select cast(obj as bigint) as obj1
    # 	from spot_triples where pred=(%s) and obj_type='int') a""", [pred])
    query = "select obj from " + db.spot_tb + " where pred=(%s) and obj_type='int' "
    cur.execute(query, [pred])
    for row in cur:
        val = 0
        try:
            val = int(row[0])
        except ValueError:
            # print (pred, row)
            try:
                unicode_char_list = ''.join([
                    str(unicodedata.decimal(d, -1))
                    for d in row[0].decode('utf8')
                ])
                val = int(unicode_char_list)
            except Exception as e:
                print(pred, row[0], e)
        else:
            obj_list.append(abs(val))
    cur.close()
    types['max'] = max(obj_list)
    types['min'] = min(obj_list)
    types['avg'] = sum(obj_list) / float(len(obj_list))
    obj_list = np.asarray(obj_list)
    types['p10'] = np.percentile(obj_list, 10)
    types['p90'] = np.percentile(obj_list, 90)
    return types
Example #4
def conv(unicode_arabic_date):
    new_date = ''
    for d in unicode_arabic_date:
        if d != ':' and d != '/':
            new_date += str(unicodedata.decimal(d))
        elif d == ':':
            new_date += ':'
        elif d == '/':
            new_date += '/'
    return new_date
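
A minimal usage sketch for conv() above (the sample date string is an assumed input, not taken from the original project; conv() relies on unicodedata being imported in its own module):

# Hypothetical input: a date written with ARABIC-INDIC digits (U+0660..U+0669).
# unicodedata.decimal() maps each digit character to its value 0-9.
arabic_date = '\u0662\u0660/\u0660\u0661/\u0661\u0669'
print(conv(arabic_date))   # -> '20/01/19'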
Example #5
    def _explain_char(self, ch, further):
        try:
            name = unicodedata.name(ch)
        except ValueError:
            name = f'[U+{hex(ord(ch))[2:]}]'
        if not further:
            return name + f'({ch})'
        infos = {
            'category': unicodedata.category(ch),
            'direction': unicodedata.bidirectional(ch),
            'east asian width': unicodedata.east_asian_width(ch)
        }

        decomposition = unicodedata.decomposition(ch)
        if decomposition:
            infos['decomposition'] = decomposition

        try:
            infos['digit value'] = unicodedata.digit(ch)
        except ValueError:
            pass
        try:
            infos['decimal value'] = unicodedata.decimal(ch)
        except ValueError:
            pass
        try:
            infos['numeric value'] = unicodedata.numeric(ch)
        except ValueError:
            pass
        comb = unicodedata.combining(ch)
        if comb != 0:
            infos['combining class'] = str(comb)

        mirrored = unicodedata.mirrored(ch)
        if mirrored:
            infos['mirrored'] = 'yes'
        if hasattr(unicodedata, 'is_normalized'):
            forms = []
            for form in ('NFC', 'NFD', 'NFKC', 'NFKD'):
                if unicodedata.is_normalized(form, ch):
                    forms.append(form)
            if forms:
                infos['normalized'] = f'yes: {", ".join(forms)}'
            else:
                infos['normalized'] = 'no'
        else:
            infos['normalized'] = 'unavailable'

        info = ', '.join([f'{k}: {v}' for k, v in infos.items()])
        return f'{name}: {ch!r} ({info})'
Example #6
def overview(tree_item):
    """ Returns an overview of the character
    """
    char = tree_item.obj
    return TEMPLATE.format(unicodedata.name(char, '<NO NAME AVAILABLE>'), char,
                           unicodedata.decimal(char, ''),
                           unicodedata.digit(char, ''),
                           unicodedata.numeric(char, ''),
                           unicodedata.category(char),
                           unicodedata.bidirectional(char),
                           unicodedata.combining(char),
                           unicodedata.east_asian_width(char),
                           unicodedata.mirrored(char),
                           unicodedata.decomposition(char))
Example #7
def overview(tree_item):
    """ Returns an overview of the character
    """
    char = tree_item.obj
    return TEMPLATE.format(unicodedata.name(char, '<NO NAME AVAILABLE>'), 
                           char, 
                           unicodedata.decimal(char, ''),
                           unicodedata.digit(char, ''),
                           unicodedata.numeric(char, ''),
                           unicodedata.category(char),
                           unicodedata.bidirectional(char),
                           unicodedata.combining(char),
                           unicodedata.east_asian_width(char),
                           unicodedata.mirrored(char),
                           unicodedata.decomposition(char))                          
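
Both overview() snippets above format their output through a module-level TEMPLATE constant that is not part of the excerpt. A hypothetical placeholder, with one positional slot per argument passed to .format() (eleven in total), might look like this:

# Hypothetical TEMPLATE; the real constant is defined elsewhere in the project.
TEMPLATE = (
    "name:             {}\n"
    "glyph:            {}\n"
    "decimal:          {}\n"
    "digit:            {}\n"
    "numeric:          {}\n"
    "category:         {}\n"
    "bidirectional:    {}\n"
    "combining:        {}\n"
    "east asian width: {}\n"
    "mirrored:         {}\n"
    "decomposition:    {}\n"
)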
Example #8
def char2info(ch):
    name = U.name(ch, None)
    decimal = U.decimal(ch, None)
    digit = U.digit(ch, None)
    numeric = U.numeric(ch, None)

    category = U.category(ch)
    bidirectional = U.bidirectional(ch)
    combining = U.combining(ch)
    east_asian_width = U.east_asian_width(ch)
    mirrored = U.mirrored(ch)
    decomposition = U.decomposition(ch)

    unicode = ord(ch)
    unicode_hex = hex(unicode)
    return dict(locals())
Example #9
def main():
    kb_name = 'dbp_map'
    filein = './count_information/integer_per_pred_per_sub_'
    fileout = './count_information/avg_integer_per_pred_per_sub_'

    with open(filein + kb_name + '.csv') as fin:
        reader = csv.reader(fin)
        prev_pred = None
        prev_sub = None
        count_val = []
        bufferout = []
        for row in tqdm(reader):
            sub = row[0]
            pred = row[1]
            try:
                val = int(row[2])
            except ValueError:
                try:
                    unicode_char_list = ''.join([
                        str(unicodedata.decimal(d, -1))
                        for d in row[2].decode('utf8')
                    ])
                    val = int(unicode_char_list)
                except Exception as e:
                    print(sub, pred, row[2], e)
                    continue

            if sub == prev_sub and pred == prev_pred:
                count_val.append(abs(val))
                continue
            elif prev_sub is not None and prev_pred is not None:
                bufferout.append([
                    prev_sub, prev_pred,
                    int(sum(count_val) / len(count_val))
                ])
                prev_sub = sub
                prev_pred = pred
                count_val = [abs(val)]
            else:
                prev_sub = sub
                prev_pred = pred
                count_val.append(abs(val))
            if len(bufferout) == 1000:
                with open(fileout + kb_name + '.csv', 'a') as fout:
                    writer = csv.writer(fout, quoting=csv.QUOTE_MINIMAL)
                    writer.writerows(bufferout)
                bufferout = []
Example #10
    def test_compare_functions(self):
        def getX(fun, code):
            try:
                return getattr(unicodedb_5_2_0, fun)(code)
            except KeyError:
                return -1

        for code in range(0x10000):
            char = unichr(code)
            assert unicodedata.digit(char, -1) == getX('digit', code)
            assert unicodedata.numeric(char, -1) == getX('numeric', code)
            assert unicodedata.decimal(char, -1) == getX('decimal', code)
            assert unicodedata.category(char) == unicodedb_5_2_0.category(code)
            assert unicodedata.bidirectional(char) == unicodedb_5_2_0.bidirectional(code)
            assert unicodedata.decomposition(char) == unicodedb_5_2_0.decomposition(code)
            assert unicodedata.mirrored(char) == unicodedb_5_2_0.mirrored(code)
            assert unicodedata.combining(char) == unicodedb_5_2_0.combining(code)
Example #11
    def test_compare_functions(self):
        def getX(fun, code):
            try:
                return getattr(unicodedb_5_2_0, fun)(code)
            except KeyError:
                return -1

        for code in range(0x10000):
            char = unichr(code)
            assert unicodedata.digit(char, -1) == getX('digit', code)
            assert unicodedata.numeric(char, -1) == getX('numeric', code)
            assert unicodedata.decimal(char, -1) == getX('decimal', code)
            assert unicodedata.category(char) == unicodedb_5_2_0.category(code)
            assert unicodedata.bidirectional(char) == unicodedb_5_2_0.bidirectional(code)
            assert unicodedata.decomposition(char) == unicodedb_5_2_0.decomposition(code)
            assert unicodedata.mirrored(char) == unicodedb_5_2_0.mirrored(code)
            assert unicodedata.combining(char) == unicodedb_5_2_0.combining(code)
Example #12
def main():
    try:
        v = bytes(int(x, 16) for x in sys.argv[1:])
        c = v.decode('utf8')
        print('gryph:            %s' % c)
        print('codepoint:        U+%x' % ord(c))
        print('name:             %s' % unicodedata.name(c, 'Unknown'))
        print('decimal:          %s' % unicodedata.decimal(c, 'Unknown'))
        print('digit:            %s' % unicodedata.digit(c, 'Unknown'))
        print('numeric:          %s' % unicodedata.numeric(c, 'Unknown'))
        print('category:         %s' % unicodedata.category(c))
        print('bidirectional:    %s' % unicodedata.bidirectional(c))
        print('combining:        %s' % unicodedata.combining(c))
        print('east_asian_width: %s' % unicodedata.east_asian_width(c))
        print('mirrored:         %s' % unicodedata.mirrored(c))
        print('decomposition:    %s' % unicodedata.decomposition(c))
    except Exception as ex:
        print('ERROR: %s' % ex)
Example #13
def main():
  try:
    v = bytes(int(x, 16) for x in sys.argv[1:])
    c = v.decode('utf8')
    print('gryph:            %s' % c)
    print('codepoint:        U+%x' % ord(c))
    print('name:             %s' % unicodedata.name(c, 'Unknown'))
    print('decimal:          %s' % unicodedata.decimal(c, 'Unknown'))
    print('digit:            %s' % unicodedata.digit(c, 'Unknown'))
    print('numeric:          %s' % unicodedata.numeric(c, 'Unknown'))
    print('category:         %s' % unicodedata.category(c))
    print('bidirectional:    %s' % unicodedata.bidirectional(c))
    print('combining:        %s' % unicodedata.combining(c))
    print('east_asian_width: %s' % unicodedata.east_asian_width(c))
    print('mirrored:         %s' % unicodedata.mirrored(c))
    print('decomposition:    %s' % unicodedata.decomposition(c))
  except Exception as ex:
    print('ERROR: %s' % ex)
Example #14
    def __init__(self, symbol):

        self.symbol = symbol
        self.name = u.name(symbol, 'NO_NAME_FOUND')
        self.decimal = u.decimal(self.symbol, -1)
        self.digit = u.digit(self.symbol, -1)
        self.numeric = u.numeric(self.symbol, -1)
        self.category = u.category(self.symbol)
        self.bidirectional = u.bidirectional(self.symbol)
        self.combining = u.combining(self.symbol)
        self.east_asian_width = u.east_asian_width(self.symbol)
        self.mirrored = u.mirrored(self.symbol)
        self.decomposition = u.decomposition(self.symbol)
        self.normalize_nfc = u.normalize('NFC', self.symbol)
        self.normalize_nkfc = u.normalize('NFKC', self.symbol)
        self.normalize_nfd = u.normalize('NFD', self.symbol)
        self.normalize_nkfd = u.normalize('NFKD', self.symbol)

        if Config.debug['unicode']:
            self.print_debug()
Example #15
def analyze(text):
    results = []
    mecab = MeCab.Tagger('-Ounidic -d %s -r %s' %
                         (mecab_dicdir, os.path.join(dicrc_dir, 'dicrc')))

    # Split the text into phrase (bar) units
    text = text.strip()
    text = re.sub(phrase_split_chars_uni, ' ', text)
    text = text.encode('utf-8').replace('\r\n', '\n').replace('\n', ' ')
    text = text.split('===')
    text = map(lambda p: p.strip(), text)
    lyrics = map(lambda p: p.split(' '), text)

    # Analyze readings and accent patterns
    for i, phrases in enumerate(lyrics):
        temp = []
        for phrase in phrases:
            for word in mecab.parse(phrase).decode('utf-8').split('\n'):
                features = word.split('\t')
                if len(features) == 4:
                    atypes = []
                    acons = []
                    try:
                        # accent type
                        atypes = map(lambda n: unicodedata.decimal(n),
                                     features[2].split(','))
                        acons = features[3].split(',')
                    except TypeError:
                        # accent is unknown
                        pass

                    prono = features[0]  # reading
                    if not prono or not re.match(ok_chars, prono):
                        continue

                    if len(atypes) > 0:
                        prono = insert_accent(prono, atypes[0])
                    temp.append(prono)
            temp.append(' ')
        results.append({'lyric': text[i], 'phoneme': '/'.join(temp).rstrip()})
    return results
Example #16
    def test_compare_functions(self):
        import unicodedata # CPython implementation

        def getX(fun, code):
            if fun == 'numeric' and code in self.diff_numeric:
                return -1
            try:
                return getattr(unicodedb_4_1_0, fun)(code)
            except KeyError:
                return -1
        
        for code in range(0x10000):
            char = unichr(code)
            assert unicodedata.digit(char, -1) == getX('digit', code)
            assert unicodedata.numeric(char, -1) == getX('numeric', code)
            assert unicodedata.decimal(char, -1) == getX('decimal', code)
            assert unicodedata.category(char) == unicodedb_4_1_0.category(code)
            assert unicodedata.bidirectional(char) == unicodedb_4_1_0.bidirectional(code)
            assert unicodedata.decomposition(char) == unicodedb_4_1_0.decomposition(code)
            assert unicodedata.mirrored(char) == unicodedb_4_1_0.mirrored(code)
            assert unicodedata.combining(char) == unicodedb_4_1_0.combining(code)
Example #17
def update_quantity(request, food_id):
    food = get_object_or_404(FoodItems, id=food_id)
    try:
        selected_choice = request.POST['quantity']
    except (KeyError, food.DoesNotExist):
        # Redisplay the question voting form.
        return render(request, 'foods/detail.html', {
            'fooditems': food,
            'error_message': "You didn't select a quantity.",
        })
    else:
        food.quantity = selected_choice
        food.save()
        # Always return an HttpResponseRedirect after successfully dealing
        # with POST data. This prevents data from being posted twice if a
        # user hits the Back button.
        total_cost = '$' + str(decimal(food.quantity) * food.item_cost)
        return {
            'fooditems': food,
            'total_cost': total_cost,
        }
Example #18
    def test_compare_functions(self):
        import unicodedata # CPython implementation

        def getX(fun, code):
            if fun == 'numeric' and code in self.diff_numeric:
                return -1
            try:
                return getattr(unicodedb_4_1_0, fun)(code)
            except KeyError:
                return -1
        
        for code in range(0x10000):
            char = unichr(code)
            assert unicodedata.digit(char, -1) == getX('digit', code)
            assert unicodedata.numeric(char, -1) == getX('numeric', code)
            assert unicodedata.decimal(char, -1) == getX('decimal', code)
            assert unicodedata.category(char) == unicodedb_4_1_0.category(code)
            assert unicodedata.bidirectional(char) == unicodedb_4_1_0.bidirectional(code)
            assert unicodedata.decomposition(char) == unicodedb_4_1_0.decomposition(code)
            assert unicodedata.mirrored(char) == unicodedb_4_1_0.mirrored(code)
            assert unicodedata.combining(char) == unicodedb_4_1_0.combining(code)
Example #19
def analyze(text):
    results = []
    mecab = MeCab.Tagger('-Ounidic -d %s -r dicrc' % mecab_dicdir)

    # Split the text into phrase (bar) units
    text = text.strip()
    text = re.sub(phrase_split_chars_uni, ' ', text)
    text = text.encode('utf-8').replace('\r\n', '\n').replace('\n', ' ')
    text = text.split('===')
    text = map(lambda p: p.strip(), text)
    lyrics = map(lambda p: p.split(' '), text)

    # Analyze readings and accent patterns
    for i, phrases in enumerate(lyrics):
        temp = []
        for phrase in phrases:
            for word in mecab.parse(phrase).decode('utf-8').split('\n'):
                features = word.split('\t')
                if len(features) == 4:
                    atypes = []
                    acons = []
                    try:
                        # accent type
                        atypes = map(lambda n: unicodedata.decimal(n), features[2].split(','))
                        acons = features[3].split(',')
                    except TypeError:
                        # accent is unknown
                        pass

                    prono = features[0]  # reading
                    if not prono or not re.match(ok_chars, prono):
                        continue

                    if len(atypes) > 0:
                        prono = insert_accent(prono, atypes[0])
                    temp.append(prono)
            temp.append(' ')
        results.append({'lyric': text[i], 'phoneme': '/'.join(temp).rstrip()})
    return results
Example #20
    def test_ipy2_gh357(self):
        """https://github.com/IronLanguages/ironpython2/issues/357"""

        import unicodedata

        if is_cli:
            self.assertEqual(unicodedata.name(u'\u4e2d'), '<CJK IDEOGRAPH, FIRST>..<CJK IDEOGRAPH, LAST>')
        else:
            self.assertEqual(unicodedata.name(u'\u4e2d'), 'CJK UNIFIED IDEOGRAPH-4E2D')

        self.assertRaises(ValueError, unicodedata.decimal, u'\u4e2d')
        self.assertEqual(unicodedata.decimal(u'\u4e2d', 0), 0)
        self.assertRaises(ValueError, unicodedata.digit, u'\u4e2d')
        self.assertEqual(unicodedata.digit(u'\u4e2d', 0), 0)
        self.assertRaises(ValueError, unicodedata.numeric, u'\u4e2d')
        self.assertEqual(unicodedata.numeric(u'\u4e2d', 0), 0)
        self.assertEqual(unicodedata.category(u'\u4e2d'), 'Lo')
        self.assertEqual(unicodedata.bidirectional(u'\u4e2d'), 'L')
        self.assertEqual(unicodedata.combining(u'\u4e2d'), 0)
        self.assertEqual(unicodedata.east_asian_width(u'\u4e2d'), 'W')
        self.assertEqual(unicodedata.mirrored(u'\u4e2d'), 0)
        self.assertEqual(unicodedata.decomposition(u'\u4e2d'), '')
Example #21
    def test_ipy2_gh357(self):
        """https://github.com/IronLanguages/ironpython2/issues/357"""

        import unicodedata

        if is_cli:
            self.assertEqual(unicodedata.name(u'\u4e2d'), '<CJK IDEOGRAPH, FIRST>..<CJK IDEOGRAPH, LAST>')
        else:
            self.assertEqual(unicodedata.name(u'\u4e2d'), 'CJK UNIFIED IDEOGRAPH-4E2D')

        self.assertRaises(ValueError, unicodedata.decimal, u'\u4e2d')
        self.assertEqual(unicodedata.decimal(u'\u4e2d', 0), 0)
        self.assertRaises(ValueError, unicodedata.digit, u'\u4e2d')
        self.assertEqual(unicodedata.digit(u'\u4e2d', 0), 0)
        self.assertRaises(ValueError, unicodedata.numeric, u'\u4e2d')
        self.assertEqual(unicodedata.numeric(u'\u4e2d', 0), 0)
        self.assertEqual(unicodedata.category(u'\u4e2d'), 'Lo')
        self.assertEqual(unicodedata.bidirectional(u'\u4e2d'), 'L')
        self.assertEqual(unicodedata.combining(u'\u4e2d'), 0)
        self.assertEqual(unicodedata.east_asian_width(u'\u4e2d'), 'W')
        self.assertEqual(unicodedata.mirrored(u'\u4e2d'), 0)
        self.assertEqual(unicodedata.decomposition(u'\u4e2d'), '')
Example #22
print test_unicodedata()

# Some additional checks of the API:
print 'API:',

verify(unicodedata.digit(u'A',None) is None)
verify(unicodedata.digit(u'9') == 9)
verify(unicodedata.digit(u'\u215b',None) is None)
verify(unicodedata.digit(u'\u2468') == 9)

verify(unicodedata.numeric(u'A',None) is None)
verify(unicodedata.numeric(u'9') == 9)
verify(unicodedata.numeric(u'\u215b') == 0.125)
verify(unicodedata.numeric(u'\u2468') == 9.0)

verify(unicodedata.decimal(u'A',None) is None)
verify(unicodedata.decimal(u'9') == 9)
verify(unicodedata.decimal(u'\u215b',None) is None)
verify(unicodedata.decimal(u'\u2468',None) is None)

verify(unicodedata.category(u'\uFFFE') == 'Cn')
verify(unicodedata.category(u'a') == 'Ll')
verify(unicodedata.category(u'A') == 'Lu')

verify(unicodedata.bidirectional(u'\uFFFE') == '')
verify(unicodedata.bidirectional(u' ') == 'WS')
verify(unicodedata.bidirectional(u'A') == 'L')

verify(unicodedata.decomposition(u'\uFFFE') == '')
verify(unicodedata.decomposition(u'\u00bc') == '<fraction> 0031 2044 0034')
Example #23
    def decimal(self, default=None):
        return ud.decimal(self.char, default)
Example #24
    unicode_digit = defaultdict(list)
    unicode_decimal = defaultdict(list)

    for c in map(chr, range(sys.maxunicode + 1)):
        unicode_category[unicodedata.category(c)].append(c)

        if unicodedata.bidirectional(c):
            unicode_bidirectional[unicodedata.bidirectional(c)].append(c)

        if unicodedata.numeric(c, None) is not None:
            unicode_numeric[unicodedata.numeric(c)].append(c)

        if unicodedata.digit(c, None) is not None:
            unicode_digit[unicodedata.digit(c)].append(c)

        if unicodedata.decimal(c, None) is not None:
            unicode_decimal[unicodedata.decimal(c)].append(c)

    # get all punctuation
    punctuation = set()
    for class_name in unicode_category.keys():
        if class_name.startswith('P') or class_name.startswith('S'):
            print(class_name)
            for char in unicode_category[class_name]:
                punctuation.add(char)

    with open('punctuation_lookup.py', 'w', encoding='ascii') as f:
        f.write('PUNCTUATION = {\n')
        for p in sorted(punctuation):

            if p == '"':
Example #25
def ascii_char_smash(char):
    """Smash a single Unicode character into an ASCII representation.

    >>> ascii_char_smash(u"\N{KATAKANA LETTER SMALL A}")
    'a'
    >>> ascii_char_smash(u"\N{KATAKANA LETTER A}")
    'A'
    >>> ascii_char_smash(u"\N{KATAKANA LETTER KA}")
    'KA'
    >>> ascii_char_smash(u"\N{HIRAGANA LETTER SMALL A}")
    'a'
    >>> ascii_char_smash(u"\N{HIRAGANA LETTER A}")
    'A'
    >>> ascii_char_smash(u"\N{BOPOMOFO LETTER ANG}")
    'ANG'
    >>> ascii_char_smash(u"\N{LATIN CAPITAL LETTER H WITH STROKE}")
    'H'
    >>> ascii_char_smash(u"\N{LATIN SMALL LETTER LONG S}")
    's'
    >>> ascii_char_smash(u"\N{LATIN CAPITAL LETTER THORN}")
    'TH'
    >>> ascii_char_smash(u"\N{LATIN SMALL LETTER THORN}")
    'th'
    >>> ascii_char_smash(u"\N{LATIN CAPITAL LETTER I WITH OGONEK}")
    'I'
    >>> ascii_char_smash(u"\N{LATIN CAPITAL LETTER AE}")
    'AE'
    >>> ascii_char_smash(u"\N{LATIN CAPITAL LETTER A WITH DIAERESIS}")
    'Ae'
    >>> ascii_char_smash(u"\N{LATIN SMALL LETTER A WITH DIAERESIS}")
    'ae'
    >>> ascii_char_smash(u"\N{LATIN CAPITAL LETTER O WITH DIAERESIS}")
    'Oe'
    >>> ascii_char_smash(u"\N{LATIN SMALL LETTER O WITH DIAERESIS}")
    'oe'
    >>> ascii_char_smash(u"\N{LATIN CAPITAL LETTER U WITH DIAERESIS}")
    'Ue'
    >>> ascii_char_smash(u"\N{LATIN SMALL LETTER U WITH DIAERESIS}")
    'ue'
    >>> ascii_char_smash(u"\N{LATIN SMALL LETTER SHARP S}")
    'ss'

    Latin-1 and other symbols are lost

    >>> ascii_char_smash(u"\N{POUND SIGN}")
    ''

    Unless they also happen to be letters of some kind, such as greek

    >>> ascii_char_smash(u"\N{MICRO SIGN}")
    'mu'

    Fractions

    >>> ascii_char_smash(u"\N{VULGAR FRACTION ONE HALF}")
    '1/2'

    """
    mapping = {
        u"\N{LATIN CAPITAL LETTER AE}": "AE",
        u"\N{LATIN SMALL LETTER AE}": "ae",
        u"\N{LATIN CAPITAL LETTER A WITH DIAERESIS}": "Ae",
        u"\N{LATIN SMALL LETTER A WITH DIAERESIS}": "ae",
        u"\N{LATIN CAPITAL LETTER O WITH DIAERESIS}": "Oe",
        u"\N{LATIN SMALL LETTER O WITH DIAERESIS}": "oe",
        u"\N{LATIN CAPITAL LETTER U WITH DIAERESIS}": "Ue",
        u"\N{LATIN SMALL LETTER U WITH DIAERESIS}": "ue",
        u"\N{LATIN SMALL LETTER SHARP S}": "ss",
        u"\N{LATIN CAPITAL LETTER THORN}": "TH",
        u"\N{LATIN SMALL LETTER THORN}": "th",
        u"\N{FRACTION SLASH}": "/",
        u"\N{MULTIPLICATION SIGN}": "x",
        u"\N{KATAKANA-HIRAGANA DOUBLE HYPHEN}": "=",
    }

    # Pass through ASCII
    if ord(char) < 127:
        return char

    # Handle manual mappings
    if mapping.has_key(char):
        return mapping[char]

    # Regress to decomposed form and recurse if necessary.
    decomposed = unicodedata.normalize("NFKD", char)
    if decomposed != char:
        out = StringIO()
        for char in decomposed:
            out.write(ascii_char_smash(char))
        return out.getvalue()

    # Handle whitespace
    if char.isspace():
        return " "

    # Handle digits (wrap in str() so the function keeps returning strings)
    if char.isdigit():
        return str(unicodedata.digit(char))

    # Handle decimal (probably pointless given isdigit above)
    if char.isdecimal():
        return str(unicodedata.decimal(char))

    # Handle numerics, such as 1/2
    if char.isnumeric():
        formatted = "%f" % unicodedata.numeric(char)
        # Strip leading and trailing 0
        return formatted.strip("0")

    # Ignore unprintables, such as the accents we denormalized
    if not char.isalnum():
        return ""

    # Return modified latin characters as just the latin part.
    name = unicodedata.name(char)

    match = re.search("LATIN CAPITAL LIGATURE (\w+)", name)
    if match is not None:
        return match.group(1)

    match = re.search("LATIN SMALL LIGATURE (\w+)", name)
    if match is not None:
        return match.group(1).lower()

    match = re.search("(?:LETTER SMALL|SMALL LETTER) (\w+)", name)
    if match is not None:
        return match.group(1).lower()

    match = re.search("LETTER (\w+)", name)
    if match is not None:
        return match.group(1)

    # Something we can't represent. Return empty string.
    return ""
Example #26
import unicodedata

import tangled_up_in_unicode as unicode_data

if __name__ == "__main__":
    basic = [
        {
            "property": "Name",
            "standard": unicodedata.name,
            "new": unicode_data.name
        },
        {
            "property": "Decimal",
            "standard": lambda x: unicodedata.decimal(x, -1),
            "new": lambda x: unicode_data.decimal(x, -1),
        },
        {
            "property": "Digit",
            "standard": lambda x: unicodedata.digit(x, -1),
            "new": lambda x: unicode_data.digit(x, -1),
        },
        {
            "property": "Numeric",
            "standard": lambda x: unicodedata.numeric(x, -1.0),
            "new": lambda x: unicode_data.numeric(x, -1.0),
        },
        {
            "property": "Category",
            "standard": unicodedata.category,
            "new": unicode_data.category,
        },
Example #27
'''
The unicodedata module

The unicodedata module contains the properties of Unicode characters, such as
character category, decomposition data, and numeric values.
'''
import unicodedata

for char in [u'A', u'-', u'1', u'w']:
    print(char, '-> ', end='')
    print(repr(char), '-> ', end='')
    print(unicodedata.category(char), '-> ', end='')
    print(repr(unicodedata.decomposition(char)), '-> ', end='')
    print(unicodedata.decimal(char, None), '=> ', end='')
    print(unicodedata.numeric(char, None), end='')
    print()
Example #28
def ascii_char_smash(char):
    """Smash a single Unicode character into an ASCII representation.

    >>> ascii_char_smash(u"\N{KATAKANA LETTER SMALL A}")
    'a'
    >>> ascii_char_smash(u"\N{KATAKANA LETTER A}")
    'A'
    >>> ascii_char_smash(u"\N{KATAKANA LETTER KA}")
    'KA'
    >>> ascii_char_smash(u"\N{HIRAGANA LETTER SMALL A}")
    'a'
    >>> ascii_char_smash(u"\N{HIRAGANA LETTER A}")
    'A'
    >>> ascii_char_smash(u"\N{BOPOMOFO LETTER ANG}")
    'ANG'
    >>> ascii_char_smash(u"\N{LATIN CAPITAL LETTER H WITH STROKE}")
    'H'
    >>> ascii_char_smash(u"\N{LATIN SMALL LETTER LONG S}")
    's'
    >>> ascii_char_smash(u"\N{LATIN CAPITAL LETTER THORN}")
    'TH'
    >>> ascii_char_smash(u"\N{LATIN SMALL LETTER THORN}")
    'th'
    >>> ascii_char_smash(u"\N{LATIN CAPITAL LETTER I WITH OGONEK}")
    'I'
    >>> ascii_char_smash(u"\N{LATIN CAPITAL LETTER AE}")
    'AE'
    >>> ascii_char_smash(u"\N{LATIN CAPITAL LETTER A WITH DIAERESIS}")
    'Ae'
    >>> ascii_char_smash(u"\N{LATIN SMALL LETTER A WITH DIAERESIS}")
    'ae'
    >>> ascii_char_smash(u"\N{LATIN CAPITAL LETTER O WITH DIAERESIS}")
    'Oe'
    >>> ascii_char_smash(u"\N{LATIN SMALL LETTER O WITH DIAERESIS}")
    'oe'
    >>> ascii_char_smash(u"\N{LATIN CAPITAL LETTER U WITH DIAERESIS}")
    'Ue'
    >>> ascii_char_smash(u"\N{LATIN SMALL LETTER U WITH DIAERESIS}")
    'ue'
    >>> ascii_char_smash(u"\N{LATIN SMALL LETTER SHARP S}")
    'ss'

    Latin-1 and other symbols are lost

    >>> ascii_char_smash(u"\N{POUND SIGN}")
    ''

    Unless they also happen to be letters of some kind, such as greek

    >>> ascii_char_smash(u"\N{MICRO SIGN}")
    'mu'

    Fractions

    >>> ascii_char_smash(u"\N{VULGAR FRACTION ONE HALF}")
    '1/2'

    """
    mapping = {
        u"\N{LATIN CAPITAL LETTER AE}": "AE",
        u"\N{LATIN SMALL LETTER AE}": "ae",

        u"\N{LATIN CAPITAL LETTER A WITH DIAERESIS}": "Ae",
        u"\N{LATIN SMALL LETTER A WITH DIAERESIS}": "ae",

        u"\N{LATIN CAPITAL LETTER O WITH DIAERESIS}": "Oe",
        u"\N{LATIN SMALL LETTER O WITH DIAERESIS}": "oe",

        u"\N{LATIN CAPITAL LETTER U WITH DIAERESIS}": "Ue",
        u"\N{LATIN SMALL LETTER U WITH DIAERESIS}": "ue",

        u"\N{LATIN SMALL LETTER SHARP S}": "ss",

        u"\N{LATIN CAPITAL LETTER THORN}": "TH",
        u"\N{LATIN SMALL LETTER THORN}": "th",

        u"\N{FRACTION SLASH}": "/",
        u"\N{MULTIPLICATION SIGN}": "x",

        u"\N{KATAKANA-HIRAGANA DOUBLE HYPHEN}": "=",
        }

    # Pass through ASCII
    if ord(char) < 127:
        return char

    # Handle manual mappings
    if mapping.has_key(char):
        return mapping[char]

    # Regress to decomposed form and recurse if necessary.
    decomposed = unicodedata.normalize("NFKD", char)
    if decomposed != char:
        out = StringIO()
        for char in decomposed:
            out.write(ascii_char_smash(char))
        return out.getvalue()

    # Handle whitespace
    if char.isspace():
        return " "

    # Handle digits (wrap in str() so the function keeps returning strings)
    if char.isdigit():
        return str(unicodedata.digit(char))

    # Handle decimal (probably pointless given isdigit above)
    if char.isdecimal():
        return str(unicodedata.decimal(char))

    # Handle numerics, such as 1/2
    if char.isnumeric():
        formatted = "%f" % unicodedata.numeric(char)
        # Strip leading and trailing 0
        return formatted.strip("0")

    # Ignore unprintables, such as the accents we denormalized
    if not char.isalnum():
        return ""

    # Return modified latin characters as just the latin part.
    name = unicodedata.name(char)

    match = re.search("LATIN CAPITAL LIGATURE (\w+)", name)
    if match is not None:
        return match.group(1)

    match = re.search("LATIN SMALL LIGATURE (\w+)", name)
    if match is not None:
        return match.group(1).lower()

    match = re.search("(?:LETTER SMALL|SMALL LETTER) (\w+)", name)
    if match is not None:
        return match.group(1).lower()

    match = re.search("LETTER (\w+)", name)
    if match is not None:
        return match.group(1)

    # Something we can't represent. Return empty string.
    return ""
Example #29
print test_unicodedata()

# Some additional checks of the API:
print 'API:',

verify(unicodedata.digit(u'A',None) is None)
verify(unicodedata.digit(u'9') == 9)
verify(unicodedata.digit(u'\u215b',None) is None)
verify(unicodedata.digit(u'\u2468') == 9)

verify(unicodedata.numeric(u'A',None) is None)
verify(unicodedata.numeric(u'9') == 9)
verify(unicodedata.numeric(u'\u215b') == 0.125)
verify(unicodedata.numeric(u'\u2468') == 9.0)

verify(unicodedata.decimal(u'A',None) is None)
verify(unicodedata.decimal(u'9') == 9)
verify(unicodedata.decimal(u'\u215b',None) is None)
verify(unicodedata.decimal(u'\u2468',None) is None)

verify(unicodedata.category(u'\uFFFE') == 'Cn')
verify(unicodedata.category(u'a') == 'Ll')
verify(unicodedata.category(u'A') == 'Lu')

verify(unicodedata.bidirectional(u'\uFFFE') == '')
verify(unicodedata.bidirectional(u' ') == 'WS')
verify(unicodedata.bidirectional(u'A') == 'L')

verify(unicodedata.decomposition(u'\uFFFE') == '')
verify(unicodedata.decomposition(u'\u00bc') == '<fraction> 0031 2044 0034')
Example #30
    def decimal(self):
        """Return unicodedata.decimal."""
        try:
            return unicodedata.decimal(self.c)
        except ValueError:
            return None
Example #31
from natsort.compat.py23 import py23_unichr
from natsort.unicode_numeric_hex import numeric_hex

# Convert each hex into the literal Unicode character.
# Stop if a ValueError is raised in case of a narrow Unicode build.
# The extra check with unicodedata is in case this Python version
# does not support some characters.
numeric_chars = []
for a in numeric_hex:
    try:
        character = py23_unichr(a)
    except ValueError:  # pragma: no cover
        break
    if unicodedata.numeric(character, None) is None:
        continue  # pragma: no cover
    numeric_chars.append(character)

# The digit characters are a subset of the numerals.
digit_chars = [a for a in numeric_chars if unicodedata.digit(a, None) is not None]

# The decimal characters are a subset of the numerals
# (probably of the digits, but let's be safe).
decimal_chars = [a for a in numeric_chars if unicodedata.decimal(a, None) is not None]

# Create a single string with the above data.
decimals = "".join(decimal_chars)
digits = "".join(digit_chars)
numeric = "".join(numeric_chars)
digits_no_decimals = "".join([x for x in digits if x not in decimals])
numeric_no_decimals = "".join([x for x in numeric if x not in decimals])
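
The comments above describe containment: the digit characters and the decimal characters are each subsets of the numeric characters (and in practice the decimals are also digits). A minimal sketch checking this with unicodedata directly, reusing the sample characters from the verify() snippets elsewhere on this page:

import unicodedata

# '9' carries all three numeric properties.
assert unicodedata.decimal('9') == 9
assert unicodedata.digit('9') == 9
assert unicodedata.numeric('9') == 9.0

# U+2468 CIRCLED DIGIT NINE is a digit (and numeric) but not a decimal.
assert unicodedata.decimal('\u2468', None) is None
assert unicodedata.digit('\u2468') == 9
assert unicodedata.numeric('\u2468') == 9.0

# U+215B VULGAR FRACTION ONE EIGHTH is numeric only.
assert unicodedata.decimal('\u215b', None) is None
assert unicodedata.digit('\u215b', None) is None
assert unicodedata.numeric('\u215b') == 0.125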
Example #32
    def getdetails(self, text):
        """
        Gives details of all characters in the given string.

        :param text: The unicode string to be examined.
        :type text: str.
        :returns:  dictionary with details.

        ::

         >>> from chardetails import getInstance
         >>> a = getInstance()
         >>> a.getdetails(u"run")
         {'Characters': [u'r', u'u', u'n'],
         u'n': {'AlphaNumeric': 'True',
         'Alphabet': 'True',
         'Canonical Decomposition': '',
         'Code point': "u'n'",
         'Digit': 'False',
         'HTML Entity': '110',
         'Name': 'LATIN SMALL LETTER N'},
         u'r': {'AlphaNumeric': 'True',
         'Alphabet': 'True',
         'Canonical Decomposition': '',
         'Code point': "u'r'",
         'Digit': 'False',
         'HTML Entity': '114',
         'Name': 'LATIN SMALL LETTER R'},
         u'u': {'AlphaNumeric': 'True',
         'Alphabet': 'True',
         'Canonical Decomposition': '',
         'Code point': "u'u'",
         'Digit': 'False',
         'HTML Entity': '117',
         'Name': 'LATIN SMALL LETTER U'}}


        """
        chardetails = {}
        for character in text:
            chardetails[character] = {}
            chardetails[character]['Name'] = unicodedata.name(character)
            chardetails[character]['HTML Entity'] = str(ord(character))
            chardetails[character]['Code point'] = repr(character)
            try:
                chardetails[character]['Numeric Value'] = \
                    unicodedata.numeric(character)
            except:
                pass
            try:
                chardetails[character]['Decimal Value'] = \
                    unicodedata.decimal(character)
            except:
                pass
            try:
                chardetails[character]['Digit'] = unicodedata.digit(character)
            except:
                pass
            chardetails[character]['Alphabet'] = str(character.isalpha())
            chardetails[character]['Digit'] = str(character.isdigit())
            chardetails[character]['AlphaNumeric'] = str(character.isalnum())
            chardetails[character]['Canonical Decomposition'] = \
                unicodedata.decomposition(character)

        chardetails['Characters'] = list(text)
        return chardetails
Example #33
print test_unicodedata()

# Some additional checks of the API:
print "API:",

verify(unicodedata.digit(u"A", None) is None)
verify(unicodedata.digit(u"9") == 9)
verify(unicodedata.digit(u"\u215b", None) is None)
verify(unicodedata.digit(u"\u2468") == 9)

verify(unicodedata.numeric(u"A", None) is None)
verify(unicodedata.numeric(u"9") == 9)
verify(unicodedata.numeric(u"\u215b") == 0.125)
verify(unicodedata.numeric(u"\u2468") == 9.0)

verify(unicodedata.decimal(u"A", None) is None)
verify(unicodedata.decimal(u"9") == 9)
verify(unicodedata.decimal(u"\u215b", None) is None)
verify(unicodedata.decimal(u"\u2468", None) is None)

verify(unicodedata.category(u"\uFFFE") == "Cn")
verify(unicodedata.category(u"a") == "Ll")
verify(unicodedata.category(u"A") == "Lu")

verify(unicodedata.bidirectional(u"\uFFFE") == "")
verify(unicodedata.bidirectional(u" ") == "WS")
verify(unicodedata.bidirectional(u"A") == "L")

verify(unicodedata.decomposition(u"\uFFFE") == "")
verify(unicodedata.decomposition(u"\u00bc") == "<fraction> 0031 2044 0034")
Example #34
def test_against_unicodedata():
    '''
    Check against `unicodedata` or `unicodedata2` if available with the
    correct version of Unicode.
    '''
    if unicodedata is None:
        raise Exception(
            'Packages unicodedata and unicodedata2 are not available with the necessary version of Unicode ({0}); many consistency tests were omitted'
            .format(mdl.UNICODE_VERSION))
    ucdf = mdl.UCDFiles()

    ud = ucdf.unicodedata
    for cp in range(0, 0x10FFFF + 1):
        c = chr(cp)
        if cp in ud:
            name = unicodedata.name(c, None)
            if name is None:
                # Handle missing names in unicodedata
                # Compare Table 4-13 in Unicode Standard
                # http://www.unicode.org/versions/Unicode9.0.0/ch04.pdf
                if 0x17000 <= cp <= 0x187EC:
                    assert ud[cp]['Name'] == 'TANGUT IDEOGRAPH-{0:04X}'.format(
                        cp)
                else:
                    assert ud[cp]['Name'] == ''
            else:
                assert name == ud[cp]['Name']
            decimal, digit, numeric = (unicodedata.decimal(c, None),
                                       unicodedata.digit(c, None),
                                       unicodedata.numeric(c, None))
            if any(x is not None for x in (decimal, digit, numeric)):
                if decimal is not None:
                    assert decimal == int(ud[cp]['Numeric_Value']) and ud[cp][
                        'Numeric_Type'] == 'Decimal' and digit is not None and decimal is not None
                elif digit is not None:
                    assert digit == int(ud[cp]['Numeric_Value']) and ud[cp][
                        'Numeric_Type'] == 'Digit' and decimal is None and numeric is not None
                elif numeric is not None:
                    try:
                        num = float(ud[cp]['Numeric_Value'])
                    except ValueError:
                        if '/' in ud[cp]['Numeric_Value']:
                            numerator, denominator = ud[cp][
                                'Numeric_Value'].split('/')
                            num = float(numerator) / float(denominator)
                        else:
                            raise
                    assert numeric == num and ud[cp][
                        'Numeric_Type'] == 'Numeric' and digit is None and decimal is None
                else:
                    raise Exception
            else:
                assert ud[cp]['Numeric_Value'] == 'NaN' and ud[cp][
                    'Numeric_Type'] == 'None'
            assert unicodedata.category(c) == ud[cp]['General_Category']
            assert unicodedata.bidirectional(c) == ud[cp]['Bidi_Class']
            assert unicodedata.combining(c) == int(
                ud[cp]['Canonical_Combining_Class'])
            assert unicodedata.mirrored(c) == ud[cp]['Bidi_Mirrored']
            if unicodedata.decomposition(c) == '':
                if ud[cp]['Name'].startswith('HANGUL SYLLABLE'):
                    # The Hangul syllables lack decomposition mapping in
                    # unicodedata, so calculate with a full decomposition
                    # followed by a partial composition (Unicode Standard,
                    # chapter 3.12)
                    decomp = unicodedata.normalize('NFD', c)
                    if len(decomp) == 3:
                        decomp = unicodedata.normalize('NFC',
                                                       decomp[:2]) + decomp[-1]
                    decomp = tuple(ord(x) for x in decomp)
                    assert decomp == ud[cp]['Decomposition_Mapping']
                else:
                    assert ud[cp]['Decomposition_Mapping'] == (cp, )
            else:
                x = unicodedata.decomposition(c)
                if '<' in x:
                    x = x.split('>', 1)[1].strip()
                x = tuple(int(y, 16) for y in x.split('\x20'))
                assert x == ud[cp]['Decomposition_Mapping']

    dbc = ucdf.derivedbidiclass
    for cp in range(0, 0x10FFFF + 1):
        c = chr(cp)
        # Only compare assigned code points, because unicodedata and
        # unicodedata2 lack correct defaults for unassigned
        if cp in dbc and cp in ud:
            assert unicodedata.bidirectional(c) == dbc[cp]['Bidi_Class']

    eaw = ucdf.eastasianwidth
    deaw = ucdf.derivedeastasianwidth
    for cp in range(0, 0x10FFFF + 1):
        c = chr(cp)
        # Only compare assigned code points, because unicodedata and
        # unicodedata2 lack correct defaults for unassigned
        if cp in eaw and cp in ud:
            assert unicodedata.east_asian_width(
                c) == eaw[cp]['East_Asian_Width']
        if cp in deaw and cp in ud:
            assert unicodedata.east_asian_width(
                c) == deaw[cp]['East_Asian_Width']
Example #35
import unicodedata

print(unicodedata.lookup('LEFT CURLY BRACKET'))

print(unicodedata.name('/'))

print(unicodedata.decimal('9'))

#unicodedata.decimal('a')

print(unicodedata.category('A'))  # 'L'etter, 'u'ppercase

print(unicodedata.bidirectional('\u0660'))  # 'A'rabic, 'N'umber

from codecs import StreamWriter

from datetime import timedelta

d = timedelta(hours=1)
print((d.days, d.seconds, d.microseconds))
Example #36
# Stop if a ValueError is raised in case of a narrow Unicode build.
# The extra check with unicodedata is in case this Python version
# does not support some characters.
numeric_chars = []
for a in numeric_hex:
    try:
        character = chr(a)
    except ValueError:  # pragma: no cover
        break
    if unicodedata.numeric(character, None) is None:
        continue  # pragma: no cover
    numeric_chars.append(character)

# The digit characters are a subset of the numerals.
digit_chars = [
    a for a in numeric_chars if unicodedata.digit(a, None) is not None
]

# The decimal characters are a subset of the numerals
# (probably of the digits, but let's be safe).
decimal_chars = [
    a for a in numeric_chars if unicodedata.decimal(a, None) is not None
]

# Create a single string with the above data.
decimals = "".join(decimal_chars)
digits = "".join(digit_chars)
numeric = "".join(numeric_chars)
digits_no_decimals = "".join([x for x in digits if x not in decimals])
numeric_no_decimals = "".join([x for x in numeric if x not in decimals])
Example #37
def test_decimal_chars_contains_only_valid_unicode_decimal_characters():
    for a in decimal_chars:
        assert unicodedata.decimal(a, None) is not None
Example #38
""" Test script for the unicodedata module.
    Written by Marc-Andre Lemburg ([email protected]).
    (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
"""#"
from test_support import verify, verbose
import sha
encoding = 'utf-8'
def test_methods():
    h = sha.sha()
    for i in range(65536):
        char = unichr(i)
        data = [
            # Predicates (single char)
            char.isalnum() and u'1' or u'0',
            char.isalpha() and u'1' or u'0',
            char.isdecimal() and u'1' or u'0',
            char.isdigit() and u'1' or u'0',
            char.islower() and u'1' or u'0',
            char.isnumeric() and u'1' or u'0',
            char.isspace() and u'1' or u'0',
            char.istitle() and u'1' or u'0',
            char.isupper() and u'1' or u'0',
            # Predicates (multiple chars)
            (char + u'abc').isalnum() and u'1' or u'0',
            (char + u'abc').isalpha() and u'1' or u'0',
            (char + u'123').isdecimal() and u'1' or u'0',
            (char + u'123').isdigit() and u'1' or u'0',
            (char + u'abc').islower() and u'1' or u'0',
            (char + u'123').isnumeric() and u'1' or u'0',
            (char + u' \t').isspace() and u'1' or u'0',
Example #39
    try:
        l = py23_unichr(a)
    except ValueError:  # pragma: no cover
        break
    if unicodedata.numeric(l, None) is None:
        continue  # pragma: no cover
    numeric_chars.append(l)

# The digit characters are a subset of the numerals.
digit_chars = [a for a in numeric_chars
               if unicodedata.digit(a, None) is not None]

# The decimal characters are a subset of the numerals
# (probably of the digits, but let's be safe).
decimal_chars = [a for a in numeric_chars
                 if unicodedata.decimal(a, None) is not None]

# Create a single string with the above data.
decimals = ''.join(decimal_chars)
digits = ''.join(digit_chars)
numeric = ''.join(numeric_chars)
digits_no_decimals = ''.join([x for x in digits if x not in decimals])
numeric_no_decimals = ''.join([x for x in numeric if x not in decimals])

# Some code that can be used to create the above list of hex numbers.
if __name__ == '__main__':
    import textwrap
    from natsort.compat.py23 import py23_range

    hex_chars = []
    for i in py23_range(0X110000):
Example #40
import unicodedata


if __name__ == "__main__":
    s = "hello world, Lcoderfit"
    print(unicodedata.lookup('left curly bracket'))

    print(unicodedata.name('\\'))
    print(unicodedata.decimal("1"))
    print(unicodedata.digit("4"))
    print(unicodedata.numeric("9"))
    print(unicodedata.category("/"))

    print(unicodedata.bidirectional("b"))

    print(unicodedata.east_asian_width("b"))
    print(unicodedata.mirrored("{}"))
Example #41
print("bidirectional ok")

for category, cp in tests["categories"].items():
    assert category == unicodedata.category(chr(int(cp, 16)))

print("categories ok")

for comb, cp in tests["combinings"].items():
    assert int(comb) == unicodedata.combining(chr(int(cp, 16)))

print("combining ok")

for decimal, cp in tests["decimals"].items():
    if decimal:
        assert eval(decimal) == unicodedata.decimal(chr(int(cp, 16)))

print("decimals ok")

for decomp, cp in tests["decompositions"].items():
    assert decomp == unicodedata.decomposition(chr(int(cp, 16)))

print("decomposition ok")

for digit, cp in tests["digits"].items():
    if digit:
        assert eval(digit) == unicodedata.digit(chr(int(cp, 16)))

print("digits ok")

for name, cp in tests["names"].items():
Example #42
    ## Function
        1-unicodedata.lookup(name)
        2-unicodedata.name(chr[, default])
        3-unicodedata.decimal(chr[, default])
        4-unicodedata.digit(chr[, default])
        5-unicodedata.numeric(chr[, default])
        6-unicodedata.category(chr)
        7-unicodedata.bidirectional(chr)
        8-unicodedata.normalize(form, unistr)
'''
import unicodedata

print(unicodedata.lookup('LEFT CURLY BRACKET'))
print(unicodedata.lookup('RIGHT CURLY BRACKET'))
print(unicodedata.lookup('ASTERISK'))
#############################
print(unicodedata.name(u'/'))
print(unicodedata.name(u'|'))
print(unicodedata.name(u':'))
################################
print(unicodedata.decimal(u'9'))
print(unicodedata.decimal(u'5'))
####################################
print(unicodedata.decimal(u'0'))
print(unicodedata.decimal(u'1'))
###################################
print(unicodedata.category(u'A'))
print(unicodedata.category(u'b'))
###################################
print(unicodedata.bidirectional(u'\u0660'))
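
The function list in this example also mentions unicodedata.normalize(form, unistr), which the snippet above does not exercise. A minimal sketch of canonical decomposition and recomposition:

import unicodedata

s = '\u00e9'                                   # LATIN SMALL LETTER E WITH ACUTE
nfd = unicodedata.normalize('NFD', s)          # 'e' followed by COMBINING ACUTE ACCENT
print([hex(ord(c)) for c in nfd])              # ['0x65', '0x301']
print(unicodedata.normalize('NFC', nfd) == s)  # True: NFC recomposes the pair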
Example #43
# Test Unicode database APIs
import unicodedata

print 'Testing unicodedata module...',

assert unicodedata.digit(u'A',None) is None
assert unicodedata.digit(u'9') == 9
assert unicodedata.digit(u'\u215b',None) is None
assert unicodedata.digit(u'\u2468') == 9

assert unicodedata.numeric(u'A',None) is None
assert unicodedata.numeric(u'9') == 9
assert unicodedata.numeric(u'\u215b') == 0.125
assert unicodedata.numeric(u'\u2468') == 9.0

assert unicodedata.decimal(u'A',None) is None
assert unicodedata.decimal(u'9') == 9
assert unicodedata.decimal(u'\u215b',None) is None
assert unicodedata.decimal(u'\u2468',None) is None

assert unicodedata.category(u'\uFFFE') == 'Cn'
assert unicodedata.category(u'a') == 'Ll'
assert unicodedata.category(u'A') == 'Lu'

assert unicodedata.bidirectional(u'\uFFFE') == ''
assert unicodedata.bidirectional(u' ') == 'WS'
assert unicodedata.bidirectional(u'A') == 'L'

assert unicodedata.decomposition(u'\uFFFE') == ''
assert unicodedata.decomposition(u'\u00bc') == '<fraction> 0031 2044 0034'
Example #44
import unicodedata

for char in [u"A", u"-", u"1", u"\N{LATIN CAPITAL LETTER O WITH DIAERESIS}"]:
    print repr(char),
    print unicodedata.category(char),
    print repr(unicodedata.decomposition(char)),
    print unicodedata.decimal(char, None),
    print unicodedata.numeric(char, None)

## u'A' Lu '' None None
## u'-' Pd '' None None
## u'1' Nd '' 1 1.0
## u'Ö' Lu '004F 0308' None None

Example #45
def setUpModule():
    log = logging.getLogger('unicodedata')

    log.info('generating unicodedata CSV')
    with tempfile.NamedTemporaryFile(prefix='unicode-',
                                     suffix='.csv') as csvfile:
        c = csv.writer(csvfile, quoting=csv.QUOTE_ALL)
        for i in xrange(sys.maxunicode + 1):
            if i >= 5024 and i <= 5119:
                continue  # the Unicode Cherokee-Block is broken in Python 2.7 and Python 3.4 (maybe also 3.5)
            u = unichr(i)
            if unicodedata.category(u).startswith('C'):
                # [Cc]Other, Control
                # [Cf]Other, Format
                # [Cn]Other, Not Assigned
                # [Co]Other, Private Use
                # [Cs]Other, Surrogate
                continue
            row = (
                i,  # INT 0-1114111
                unicodedata.name(u,
                                 'UNICODE U+%08X' % i),  # VARCHAR(100) ASCII
                u,  # VARCHAR(1) UNICODE
                u.upper(),  # VARCHAR(1) UNICODE
                u.lower(),  # VARCHAR(1) UNICODE
                unicodedata.decimal(u, None),  # INT
                unicodedata.numeric(u, None),  # DOUBLE
                unicodedata.category(u),  # VARCHAR(3) ASCII
                unicodedata.bidirectional(u),  # VARCHAR(3) ASCII
                unicodedata.combining(u),  # VARCHAR(3) ASCII
                unicodedata.east_asian_width(u),  # VARCHAR(1) ASCII
                bool(unicodedata.mirrored(u)),  # BOOLEAN
                unicodedata.decomposition(u),  # VARCHAR(10) ASCII
                unicodedata.normalize('NFC', u),  # VARCHAR(3) UNICODE
                unicodedata.normalize('NFD', u),  # VARCHAR(3) UNICODE
                unicodedata.normalize('NFKC', u),  # VARCHAR(3) UNICODE
                unicodedata.normalize('NFKD', u),  # VARCHAR(3) UNICODE
            )
            c.writerow(utf8encoder(row))
        csvfile.flush()

        log.info('loading CSV')
        sql = '''
            DROP SCHEMA utest CASCADE;
            CREATE SCHEMA utest;
            CREATE TABLE unicodedata (
                codepoint INT NOT NULL,
                name VARCHAR(100) ASCII,
                uchar VARCHAR(1) UTF8,
                to_upper VARCHAR(1) UTF8,
                to_lower VARCHAR(1) UTF8,
                decimal_value INT,
                numeric_value INT,
                category VARCHAR(3) ASCII,
                bidirectional VARCHAR(3) ASCII,
                combining VARCHAR(10) ASCII,
                east_asian_width VARCHAR(2) ASCII,
                mirrored BOOLEAN,
                decomposition VARCHAR(100) ASCII,
                NFC VARCHAR(10) UTF8,
                NFD VARCHAR(10) UTF8,
                NFKC VARCHAR(20) UTF8,
                NFKD VARCHAR(20) UTF8
                );
            IMPORT INTO unicodedata
            FROM LOCAL CSV FILE '%s'
            ROW SEPARATOR = 'CRLF';
            ''' % os.path.join(os.getcwd(), csvfile.name)
        cmd = '''%(exaplus)s -c %(conn)s -u sys -P exasol
		        -no-config -autocommit ON -L -pipe''' % {
            'exaplus':
            os.environ.get(
                'EXAPLUS',
                '/usr/opt/EXASuite-4/EXASolution-4.2.9/bin/Console/exaplus'),
            'conn':
            udf.opts.server
        }
        env = os.environ.copy()
        env['PATH'] = '/usr/opt/jdk1.8.0_latest/bin:' + env['PATH']
        exaplus = subprocess.Popen(cmd.split(),
                                   env=env,
                                   stdin=subprocess.PIPE,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.STDOUT)
        out, _err = exaplus.communicate(sql)
    if exaplus.returncode != 0:
        log.critical('EXAplus error: %d', exaplus.returncode)
        log.error(out)
    else:
        log.debug(out)
Example #46
def check_food_qty_len(request):
    total_food_price = 0
    data = {}
    food_data = []
    pizzas = request.POST.getlist('pizzas', None)
    if len(pizzas) == 0:
        return False, 'Please select at least one pizza and quantity first.', data
    else:
        pizza_ins = Pizza.objects.filter(id__in=pizzas)
        for pizza in pizza_ins:
            temp_data = {}
            pizza_qty = 'pizzas_qty_' + str(pizza.id)
            input_pizza_qty = request.POST.get(pizza_qty, None)
            if input_pizza_qty is None or input_pizza_qty == '':
                return False, 'Pizza Quantity not given properly!', data
            temp_data['name'] = pizza.name
            temp_data['total_price'] = str(pizza.price *
                                           decimal(input_pizza_qty))
            temp_data['qty'] = input_pizza_qty
            food_data.append(temp_data)
            total_food_price += pizza.price * decimal(input_pizza_qty)

    gluten_cauliflower = request.POST.getlist('gluten_cauliflower', None)
    if len(gluten_cauliflower) != 0:
        gluten_cauliflower_ins = GlutenCauliflower.objects.filter(
            id__in=gluten_cauliflower)
        for gluten_cauliflower_data in gluten_cauliflower_ins:
            temp_data = {}
            gluten_cauliflower_qty = 'gluten_cauliflower_qty_' + str(
                gluten_cauliflower_data.id)
            input_gluten_cauliflower_qty = request.POST.get(
                gluten_cauliflower_qty, None)
            if input_gluten_cauliflower_qty is None or input_gluten_cauliflower_qty == '':
                return False, 'Gluten Free and Cauliflower Crust Quantity not given properly!', data
            temp_data['name'] = gluten_cauliflower_data.name
            temp_data['total_price'] = str(
                gluten_cauliflower_data.price *
                decimal(input_gluten_cauliflower_qty))
            temp_data['qty'] = input_gluten_cauliflower_qty
            food_data.append(temp_data)
            total_food_price += gluten_cauliflower_data.price * decimal(
                input_gluten_cauliflower_qty)

    # gluten_cauliflower_total_ins = GlutenCauliflower.objects.filter(id__in=pizzas).aggregate(Sum('price'))[
    #         'price__sum']
    #     total_price += gluten_cauliflower_total_ins

    wings_sauce = request.POST.getlist('wings_sauce', None)
    if len(wings_sauce) != 0:
        wings_sauce_ins = WingSauce.objects.filter(id__in=wings_sauce)
        for wings_sauce_data in wings_sauce_ins:
            temp_data = {}
            wings_sauce_qty = 'wings_sauce_qty_' + str(wings_sauce_data.id)
            input_wings_sauce_qty = request.POST.get(wings_sauce_qty, None)
            special_request = request.POST.get('wings_sauce_special_request',
                                               None)
            if input_wings_sauce_qty is None or input_wings_sauce_qty == '':
                return False, 'Wings Sauces Quantity not given properly!', data
            temp_data['name'] = wings_sauce_data.name
            temp_data['total_price'] = str(wings_sauce_data.price *
                                           decimal(input_wings_sauce_qty))
            temp_data['qty'] = input_wings_sauce_qty
            temp_data['special_request'] = special_request
            food_data.append(temp_data)
            total_food_price += wings_sauce_data.price * decimal(
                input_wings_sauce_qty)

    salad = request.POST.getlist('salad', None)
    if len(salad) != 0:
        salad_ins = Salad.objects.filter(id__in=salad)
        for salad_data in salad_ins:
            temp_data = {}
            salad_qty = 'salad_qty_' + str(salad_data.id)
            input_salad_qty = request.POST.get(salad_qty, None)
            special_request = request.POST.get('salad_special_request', None)
            if input_salad_qty is None or input_salad_qty == '':
                return False, 'Salads Quantity not given properly!', data
            temp_data['name'] = salad_data.name
            temp_data['total_price'] = str(salad_data.price *
                                           decimal(input_salad_qty))
            temp_data['qty'] = input_salad_qty
            temp_data['special_request'] = special_request
            food_data.append(temp_data)
            total_food_price += salad_data.price * decimal(input_salad_qty)

    salad_dressing = request.POST.getlist('salad_dressing', None)
    if len(salad_dressing) != 0:
        salad_dressing_ins = SaladDressing.objects.filter(
            id__in=salad_dressing)
        for salad_dressing_data in salad_dressing_ins:
            temp_data = {}
            salad_dressing_qty = 'salad_dressing_qty_' + str(
                salad_dressing_data.id)
            input_salad_dressing_qty = request.POST.get(
                salad_dressing_qty, None)
            if input_salad_dressing_qty is None or input_salad_dressing_qty == '':
                return False, 'Salad Dressings Quantity not given properly!', data
            temp_data['name'] = salad_dressing_data.name
            temp_data['total_price'] = str(salad_dressing_data.price *
                                           decimal(input_salad_dressing_qty))
            temp_data['qty'] = input_salad_dressing_qty
            food_data.append(temp_data)
            total_food_price += salad_dressing_data.price * decimal(
                input_salad_dressing_qty)

    dessert = request.POST.getlist('dessert', None)
    if len(dessert) != 0:
        dessert_ins = Dessert.objects.filter(id__in=dessert)
        for dessert_data in dessert_ins:
            temp_data = {}
            dessert_qty = 'dessert_qty_' + str(dessert_data.id)
            input_dessert_qty = request.POST.get(dessert_qty, None)
            special_request = request.POST.get('dessert_special_request', None)
            if input_dessert_qty is None or input_dessert_qty == '':
                return False, 'Desserts Quantity not given properly!', data
            temp_data['name'] = dessert_data.name
            temp_data['total_price'] = str(dessert_data.price *
                                           decimal(input_dessert_qty))
            temp_data['qty'] = input_dessert_qty
            temp_data['special_request'] = special_request
            food_data.append(temp_data)
            total_food_price += dessert_data.price * decimal(input_dessert_qty)

    bread = request.POST.getlist('bread', None)
    if len(bread) != 0:
        bread_ins = Bread.objects.filter(id__in=bread)
        for bread_data in bread_ins:
            temp_data = {}
            bread_qty = 'bread_qty_' + str(bread_data.id)
            input_bread_qty = request.POST.get(bread_qty, None)
            if input_bread_qty is None or input_bread_qty == '':
                return False, 'Breads Quantity not given properly!', data
            temp_data['name'] = bread_data.name
            temp_data['total_price'] = str(bread_data.price *
                                           decimal(input_bread_qty))
            temp_data['qty'] = input_bread_qty
            food_data.append(temp_data)
            total_food_price += bread_data.price * decimal(input_bread_qty)

    wing = request.POST.getlist('wing', None)
    if len(wing) != 0:
        wing_ins = Wing.objects.filter(id__in=wing)
        for wing_data in wing_ins:
            temp_data = {}
            wing_qty = 'wing_qty_' + str(wing_data.id)
            input_wing_qty = request.POST.get(wing_qty, None)
            if input_wing_qty is None or input_wing_qty == '':
                return False, 'Wings Quantity not given properly!', data
            temp_data['name'] = wing_data.name
            temp_data['total_price'] = str(wing_data.price *
                                           decimal(input_wing_qty))
            temp_data['qty'] = input_wing_qty
            food_data.append(temp_data)
            total_food_price += wing_data.price * decimal(input_wing_qty)

    # print('total price: ', total_price)
    # print('food data: ', food_data)
    data = {'food_data': food_data, 'total_food_price': total_food_price}

    return True, '', data
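

# A hypothetical caller (not part of the original project): a minimal sketch of
# how the (ok, error_message, data) return value might be consumed in a view.
# The view name place_order and the JsonResponse usage are assumptions for
# illustration only.
def place_order(request):
    from django.http import JsonResponse  # assumed available in this Django project
    ok, error, payload = check_food_qty_len(request)
    if not ok:
        return JsonResponse({'error': error}, status=400)
    return JsonResponse({'total': str(payload['total_food_price'])})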
'''
The unicodedata module

The unicodedata module provides access to the properties of Unicode characters,
such as the character category, decomposition data, and numeric values.
'''
import unicodedata

for char in [u'A', u'-', u'1', u'w']:
    print(char, '-> ', end='')
    print(repr(char), '-> ', end='')
    print(unicodedata.category(char), '-> ', end='')
    print(repr(unicodedata.decomposition(char)), '-> ', end='')
    print(unicodedata.decimal(char, None), '=> ', end='')
    print(unicodedata.numeric(char, None), end='')
    print()
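
# For reference (not part of the original snippet), the loop above prints roughly:
#   A -> 'A' -> Lu -> '' -> None => None
#   - -> '-' -> Pd -> '' -> None => None
#   1 -> '1' -> Nd -> '' -> 1 => 1.0
#   w -> 'w' -> Ll -> '' -> None => None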