Example #1
0
 def _tag_line(line):
     """Wrap *line* in Unicode language-tag characters when xml_lang is set.

     Returns the line unchanged when it is empty or no xml_lang is
     configured; otherwise brackets it with LANGUAGE TAG ... LANGUAGE TAG +
     CANCEL TAG, with the canonical language code transliterated into the
     Unicode TAG character block.
     """
     line = to_unicode(line)
     if not (line and xml_lang):
         return line
     # Offset from ASCII into the TAG character block (TAG 'A' - 'A').
     shift = uniord(u'\N{TAG LATIN CAPITAL LETTER A}') - uniord(u'A')
     pieces = []
     for ch in to_unicode(_canon_lang(xml_lang)):
         pieces.append(unichr(shift + uniord(ch)))
     tag = u''.join(pieces)
     return u'\N{LANGUAGE TAG}' + tag + line + u'\N{LANGUAGE TAG}\N{CANCEL TAG}'
Example #2
0
 def _js_escape_char(match):
     """Return the JavaScript string-literal escape for one matched character.

     *match* is a regex match whose named groups classify the character
     (pyquote / backslash / empty / close / ccdata / entityref); characters
     matching none of the groups are escaped numerically.  Raises ValueError
     for codepoints beyond U+10FFFF.
     """
     ch = match.group(0)
     groups = match.groupdict()
     if groups['pyquote']:
         # Delegate quote characters to Python's own escaping.
         return ch.encode('unicode-escape')
     if groups['backslash']:
         return r'\%s' % ch
     if groups['empty']:
         # Escape the slash so a literal '/>' cannot appear in the output.
         return r'\x2f>'
     if groups['close']:
         # Escape the slash of a closing '</'.
         return r'<\x2f'
     if groups['ccdata']:
         # Escape the first ']' of a ']]>' CDATA terminator.
         return r'\x5d]>'
     if groups['entityref']:
         return r'\x26'
     och = uniord(ch)
     if och > 0x10ffff:
         # Was `assert "..."[:0]` -- an always-failing assert that is silently
         # stripped under `python -O`; raise an explicit error instead.
         raise ValueError(
             "Codepoints outside the UTF-16 range are not supported.")
     if och > 0xffff:
         # do UTF-16 encoding for chars outside the BMP
         return r'\u%04.4x\u%04.4x' % (
             ((och - 0x10000) >> 10) | 0xd800,
             ((och - 0x10000) & 0x3ff) | 0xdc00)
     if och > 0xff:
         return r'\u%04.04x' % och
     return r'\x%02.2x' % och
Example #3
0
def xml_escape(s):
    '''
    Serialize a UTF-8 or Unicode string for use inside a UTF-8 XML or XHTML document.

    Characters not allowed in XML or XHTML documents are converted to the Unicode replacement character.

    Whitespace characters are encoded as numeric character references.
    '''
    def _piece(match):
        text = match.group(0)
        if match.group(1):
            # Markup-significant character: entity-escape it.
            return cgi.escape(text.encode('utf-8', 'replace'), quote = True)
        if match.group(2):
            # Encode as a numeric character reference.
            return '&#%d;' % uniord(text)
        # Everything else passes through UTF-8 encoded.
        return text.encode('utf-8', 'replace')

    cleaned = to_unicode(fix_xmlutf8(s))
    return ''.join([_piece(m) for m in _xml_escapes_re.finditer(cleaned)])
Example #4
0
def xml_escape(s):
    '''
    Serialize a UTF-8 or Unicode string for use inside a UTF-8 XML or XHTML document.

    Characters not allowed in XML or XHTML documents are converted to the Unicode replacement character.

    Whitespace characters are encoded as numeric character references.
    '''
    o = []
    # fix_xmlutf8 and to_unicode are module helpers (defined elsewhere);
    # presumably they strip XML-illegal characters and normalize to unicode
    # -- confirm against their definitions.
    for match in _xml_escapes_re.finditer(to_unicode(fix_xmlutf8(s))):
        if match.group(1):
            # Group 1: markup-significant characters -- entity-escaped.
            o.append(
                cgi.escape(match.group(0).encode('utf-8', 'replace'),
                           quote=True))
        elif match.group(2):
            # Group 2: emitted as a numeric character reference.
            o.append('&#%d;' % uniord(match.group(0)))
        else:
            # Everything else passes through UTF-8 encoded.
            o.append(match.group(0).encode('utf-8', 'replace'))
    return ''.join(o)
Example #5
0
 def _js_escape_char(match):
     """Return the JavaScript string-literal escape for one matched character.

     *match* is a regex match whose named groups classify the character
     (pyquote / backslash / empty / close / ccdata / entityref); characters
     matching none of the groups are escaped numerically.  Raises ValueError
     for codepoints beyond U+10FFFF.
     """
     ch = match.group(0)
     groups = match.groupdict()
     if groups['pyquote']:
         # Delegate quote characters to Python's own escaping.
         return ch.encode('unicode-escape')
     if groups['backslash']:
         return r'\%s' % ch
     if groups['empty']:
         # Escape the slash so a literal '/>' cannot appear in the output.
         return r'\x2f>'
     if groups['close']:
         # Escape the slash of a closing '</'.
         return r'<\x2f'
     if groups['ccdata']:
         # Escape the first ']' of a ']]>' CDATA terminator.
         return r'\x5d]>'
     if groups['entityref']:
         return r'\x26'
     och = uniord(ch)
     if och > 0x10ffff:
         # Was `assert "..."[:0]` -- an always-failing assert that is silently
         # stripped under `python -O`; raise an explicit error instead.
         raise ValueError(
             "Codepoints outside the UTF-16 range are not supported.")
     if och > 0xffff:
         # do UTF-16 encoding for chars outside the BMP
         return r'\u%04.4x\u%04.4x' % (((och - 0x10000) >> 10) | 0xd800,
                                       ((och - 0x10000) & 0x3ff) | 0xdc00)
     if och > 0xff:
         return r'\u%04.04x' % och
     return r'\x%02.2x' % och
Example #6
0
def _slow_init(confusables_file = None):
    '''
    Populate the module-global `confusables` and `confusemap` tables from the
    Unicode confusables.txt data.

    `confusables_file` may be an already-open file of confusables.txt lines;
    when None, a local copy is tried (script mode only) before falling back
    to fetching from unicode.org.  A re-entrant call while a build is in
    progress is a no-op.
    '''
    global confusables, confusemap, _init_in_progress
    if _init_in_progress:
        return
    _init_in_progress = True
    try:
        if confusables_file is None and __name__ == '__main__':
            try:
                confusables_file = file(os.path.join(os.path.split(sys.argv[0])[0], '../share/confusables.txt'), 'rb')
            except:
                # Best effort -- fall through to the network fetch below.
                pass
        if confusables_file is None:
            confusables_file = urllib.urlopen('http://www.unicode.org/reports/tr36/data/confusables.txt', 'rb')
        # Parse lines of the form "src ; dst ; class # comment", where src and
        # dst are space-separated hex codepoints; malformed lines are skipped.
        for line in confusables_file:
            line = line.decode('utf-8').rstrip()
            try:
                cmd, comment = [ field.strip() for field in line.split('#', 1) ]
                a, b, cclass = [ field.strip() for field in cmd.split(';') ]
                a, b = [ u''.join([ unichr(int(ch, 16)) for ch in field.split() ]) for field in (a, b) ]
                confusables[cclass][a] = confusables[cclass].get(a, set()) | set((b,))
            except:
                pass
        confusables_file.close()

        # Seed the 'all' class: for every codepoint, union its case variants,
        # NFKD decompositions and (when possible) IDNA round-trip, then expand
        # through the existing confusable sets to a fixed point.
        for chno in xrange(0, 0x110000):
            source = unichr(chno)
            sources = set([ source, unicodedata.normalize('NFKD', source),
                            source.upper(), unicodedata.normalize('NFKD', source.upper()),
                            source.title(), unicodedata.normalize('NFKD', source.title()),
                            source.lower(), unicodedata.normalize('NFKD', source.lower())])
            try:
                sources = sources | set([ source.encode('idna').decode('idna') ])
            except:
                # Not all codepoints survive IDNA encoding; ignore those.
                pass
            while True:
                sources2 = sources
                for source in sources:
                    sources2 = sources2 | confusables['all'].get(source, set([ source ]))
                if sources2 == sources:
                    break
                sources = sources2
            if len(sources) > 1:
                for source in sources:
                    if unilen(source) == 1:
                        confusables['all'][source] = sources

        # Fold every specific confusable class into the 'all' class, expanded
        # one level through the existing 'all' sets.
        for cclass in confusables:
            if cclass == 'all':
                continue
            for source in confusables[cclass]:
                dest = confusables[cclass][source]
                dest = dest | confusables['all'].get(source, set()) | set([ source ])
                dest2 = dest
                for d in dest2:
                    dest = dest | confusables['all'].get(d, set())
                for d in dest:
                    if unilen(d) == 1:
                        confusables['all'][d] = confusables['all'].get(d, set()) | dest

        # Three identical closure passes follow: each rebuilds 'all' (via the
        # temporary 'all2') by unioning case variants, NFKD forms and existing
        # confusable sets two levels deep.
        confusables['all2'] = {}

        for source in confusables['all']:
            sources = set([ source, unicodedata.normalize('NFKD', source),
                            source.upper(), unicodedata.normalize('NFKD', source.upper()),
                            source.title(), unicodedata.normalize('NFKD', source.title()),
                            source.lower(), unicodedata.normalize('NFKD', source.lower())])
            dest = set()
            for source2 in sources:
                dest = dest | confusables['all'].get(source2, set())
            sources = sources | dest
            dest = set()
            for source2 in sources:
                dest = dest | confusables['all'].get(source2, set())
            sources = sources | dest
            for source2 in sources:
                if unilen(source2) == 1:
                    confusables['all2'][source2] = sources

        confusables['all'] = confusables['all2']

        del confusables['all2']

        # Second closure pass (identical to the first).
        confusables['all2'] = {}

        for source in confusables['all']:
            sources = set([ source, unicodedata.normalize('NFKD', source),
                            source.upper(), unicodedata.normalize('NFKD', source.upper()),
                            source.title(), unicodedata.normalize('NFKD', source.title()),
                            source.lower(), unicodedata.normalize('NFKD', source.lower())])
            dest = set()
            for source2 in sources:
                dest = dest | confusables['all'].get(source2, set())
            sources = sources | dest
            dest = set()
            for source2 in sources:
                dest = dest | confusables['all'].get(source2, set())
            sources = sources | dest
            for source2 in sources:
                if unilen(source2) == 1:
                    confusables['all2'][source2] = sources

        confusables['all'] = confusables['all2']

        del confusables['all2']

        # Third closure pass (identical to the first).
        confusables['all2'] = {}

        for source in confusables['all']:
            sources = set([ source, unicodedata.normalize('NFKD', source),
                            source.upper(), unicodedata.normalize('NFKD', source.upper()),
                            source.title(), unicodedata.normalize('NFKD', source.title()),
                            source.lower(), unicodedata.normalize('NFKD', source.lower())])
            dest = set()
            for source2 in sources:
                dest = dest | confusables['all'].get(source2, set())
            sources = sources | dest
            dest = set()
            for source2 in sources:
                dest = dest | confusables['all'].get(source2, set())
            sources = sources | dest
            for source2 in sources:
                if unilen(source2) == 1:
                    confusables['all2'][source2] = sources

        confusables['all'] = confusables['all2']

        del confusables['all2']

        # Build the final character -> skeleton map, keeping only entries
        # where confusion-folding differs from plain NFKC-lowercasing (or the
        # character lies outside the BMP and changes at all).
        _confusemap = {}
        for chno in xrange(0, 0x110000):
            source = unichr(chno)
            dest = _confuse_internal(source, cclasses = [ 'all' ], asciify = True)
            dest2 = unicodedata.normalize('NFKC', source.lower())
            if dest != dest2 or (uniord(source) >= 0x10000 and source != dest):
                if dest != unicodedata.normalize('NFC', dest.lower()):
                    # Sanity check: the mapped value must itself be stable.
                    raise ValueError(repr(source) + ': ' + repr(dest))
                _confusemap[source] = dest
        confusemap = _confusemap
    finally:
        _init_in_progress = False
Example #7
0
def test():
    '''
    Small smoke test to make sure this module is not broken.
    '''
    # Basic astral-plane helpers: length, ordinal, chr round-trips.
    assert unilen(u'\U0010fffd') == 1
    assert len(u'Hello, world!') == unilen(u'Hello, world!')
    assert uniord(u'\U0010fffd') == 0x10fffd
    assert uniord(u'H') == ord(u'H')
    assert unichr(0x10fffd) == u'\U0010fffd'
    assert unichr(ord(u'H')) == u'H'
    assert [ ch for ch in unichars(u'H\U0010fffd') ] == [ 'H', u'\U0010fffd' ]
    # Homoglyph folding: visually-confusable strings map to the same skeleton.
    assert confuse(u'Hello, world!', [ 'all' ], True) == confuse(u'he110, w0r1d!', [ 'all' ], True)
    assert confuse(u'Sergey Brin') == confuse(u'5e\N{GREEK SMALL LETTER GAMMA}g\N{GREEK SMALL LETTER EPSILON}\N{GREEK SMALL LETTER UPSILON} \N{GREEK CAPITAL LETTER BETA}r1\N{GREEK CAPITAL LETTER NU}')
    assert confuse(u'He\N{LATIN SMALL LETTER SHARP S}e') == confuse(u'Hesse')
    assert confuse(u'\N{GREEK CAPITAL LETTER KAPPA}\N{GREEK SMALL LETTER NU}\N{GREEK SMALL LETTER OMEGA}\N{GREEK SMALL LETTER SIGMA}\N{GREEK SMALL LETTER OMICRON WITH TONOS}\N{GREEK SMALL LETTER FINAL SIGMA}') == confuse(u'Kv\N{OHM SIGN}co\N{COMBINING ACUTE ACCENT}c')
    # Dotted / dotless I family (Turkish casing corner cases).
    assert confuse(u'\N{LATIN SMALL LETTER I}') == confuse(u'\N{LATIN CAPITAL LETTER I}')
    assert confuse(u'\N{LATIN SMALL LETTER I}\N{COMBINING DOT ABOVE}') == confuse(u'\N{LATIN CAPITAL LETTER I WITH DOT ABOVE}')
    assert confuse(u'\N{LATIN SMALL LETTER DOTLESS I}') == confuse(u'\N{LATIN CAPITAL LETTER I}')
    assert confuse(u'\N{LATIN SMALL LETTER DOTLESS I}\N{COMBINING DOT ABOVE}') == confuse(u'\N{LATIN SMALL LETTER I}')
    assert confuse(u'\N{LATIN SMALL LETTER DOTLESS I}\N{COMBINING DOT ABOVE}') == confuse(u'\N{LATIN SMALL LETTER I}\N{COMBINING DOT ABOVE}')
    assert confuse(u'\N{LATIN SMALL LETTER DOTLESS I}\N{COMBINING DOT ABOVE}') == confuse(u'\N{LATIN CAPITAL LETTER I}\N{COMBINING DOT ABOVE}')
    assert confuse(u'\N{LATIN SMALL LETTER DOTLESS I}\N{COMBINING DOT ABOVE}') == confuse(u'\N{LATIN CAPITAL LETTER I WITH DOT ABOVE}')
    assert confuse(u'\N{LATIN CAPITAL LETTER I}\N{COMBINING DOT ABOVE}') == confuse(u'\N{LATIN CAPITAL LETTER I WITH DOT ABOVE}')
    assert confuse(u'\N{LATIN CAPITAL LETTER I WITH DOT ABOVE}') == confuse(u'\N{DIGIT ONE}\N{COMBINING DOT ABOVE}')
    assert confuse(u'\N{LATIN CAPITAL LETTER O WITH HORN}') == confuse(u'\N{DIGIT ZERO}\N{COMBINING HORN}')
    assert confuse(u'\N{LATIN SMALL LETTER O WITH HORN}') == confuse(u'\N{DIGIT ZERO}\N{COMBINING HORN}')
    assert confuse(u'\N{LATIN SMALL LETTER I}') == confuse(u'\N{LATIN SMALL LETTER I}\N{COMBINING DOT ABOVE}')
    assert confuse(u'\N{LATIN SMALL LETTER I}') == confuse(u'\N{LATIN CAPITAL LETTER I WITH DOT ABOVE}')
    assert confuse(u'\N{LATIN CAPITAL LETTER I WITH DOT ABOVE}') == confuse(u'\N{DIGIT ONE}')
    assert confuse(u'\N{LATIN CAPITAL LETTER O WITH HORN}') == confuse(u'\N{LATIN CAPITAL LETTER O}\N{SPACE}\N{COMBINING COMMA ABOVE}')
    assert confuse(u'\N{LATIN SMALL LETTER O WITH HORN}') == confuse(u'\N{LATIN CAPITAL LETTER O}\N{SPACE}\N{COMBINING COMMA ABOVE}')

    # NOTE: this is a weak bidi homograph for "i love you"
    assert confuse(u'ڸ٥ﻻ ﻉ√٥ﺎ ٱ') == confuse(u'\u06b8\u0665\ufefb \ufec9\u221a\u0665\ufe8e \u0671')
    assert confuse(u'\u06b8\u0665\u0644\u0627 \u0639\u221a\u0665\u0627 \u0671') == confuse(u'ڸ٥ﻻ ﻉ√٥ﺎ ٱ')
    # Digit homoglyph classes: each indented line below starts with a digit
    # followed by that digit's known confusables; the right-hand side repeats
    # each digit once per confusable (the dot strings encode the counts).
    assert (confuse(u''.join(u'''
    0\u09e6\u2080\u0d02\u0585\u3007\U0001d70a\U0001d5ae\u1d0f\U0001d40e\u1810\U0001d512\U0001d7f6\U0001d594\U0001d616\U0001d698\u0966\u101d\U0001d6f0\u039f\u041e\u0d20\U0001d7ec\uff4f\u0c66\U0001d630\U0001d428\U0001d72a\U0001d52c\uff2f\u0be6\u0030\u24c4\u0a66\u2134\U0001d6b6\U0001d7b8\u0ed0\U0001d79e\uff10\u03bf\u043e\u1040\U0001d442\U0001d744\U0001d546\U0001d5c8\U0001d64a\U0001d664\U0001d67e\u004f\U0001d7ce\u0e50\U0001d5e2\u0555\u2d54\u0c02\U0001d7d8\u0c82\U0001d45c\u24de\U0001d560\U0001d7e2\U0001d764\u0ae6\U0001d490\u0ce6\u0b66\U0001d4aa\u006f\u24ea\u2070\U0001d4de\U0001d476\u00ba\U0001d4f8\U0001d6d0\U0001d57a\U0001d5fc\U0001d77e
    1\U0001d408\U0001d456\U0001d591\U0001d610\u2112\u0399\u24db\u0322\u13a5\U0001d526\u03b9\U0001d43c\U0001d73e\U0001d5c5\U0001d644\u004c\U0001d7d9\U0001d4db\U0001d55a\U0001d75e\u2460\U0001d7ed\u006c\U0001d470\U0001d5f9\U0001d678\u24be\U0001d40b\U0001d58e\uff11\U0001d613\u0196\U0001d6a4\U0001d529\u0131\U0001d6b0\U0001d7b2\u2139\U0001d43f\u1fbe\u04c0\U0001d5c2\u0345\U0001d647\uff49\u24d8\U0001d55d\u13de\U0001d473\U0001d5f6\U0001d67b\u2081\U0001d704\u0406\U0001d48d\u2110\U0001d695\U0001d4d8\U0001d422\U0001d724\u0328\U0001d5ab\U0001d62a\uff2c\u00b9\u24c1\U0001d540\u2148\uff4c\U0001d7cf\u0456\U0001d5df\U0001d65e\u02e1\u2160\U0001d7e3\u0069\u216c\u2170\U0001d4f5\U0001d574\U0001d7f7\U0001d778\u217c\u2780\U0001d48a\U0001d50f\u2113\U0001d692\U0001d798\U0001d425\U0001d5a8\u2111\U0001d62d\u0031\U0001d4be\U0001d543\u0049\U0001d6ca\U0001d459\U0001d5dc\U0001d661\u0269\U0001d6ea\u2071\uff29\u2373\U0001d4f2\U0001d577\u0130
    2\u2781\u2461\u2082\U0001d7e4\U0001d7ee\U0001d7d0\uff12\U0001d7f8\u0032\U0001d7da\u00b2\u14bf
    3\u2083\u2782\U0001d409\u0408\U0001d48b\u148d\U0001d58f\U0001d611\u24d9\U0001d693\u0417\u0545\U0001d55b\U0001d423\U0001d4a5\U0001d527\U0001d5a9\u13ab\uff2a\u03f3\u00b3\u02b2\u0437\u0575\U0001d43d\u24bf\U0001d541\U0001d5c3\u004a\U0001d645\u0033\u2149\U0001d50d\uff4a\U0001d7d1\U0001d679\U0001d457\U0001d4d9\u0458\U0001d7db\U0001d5dd\u025c\U0001d65f\u2462\U0001d7e5\u13e7\u006a\uff13\U0001d7ef\U0001d471\U0001d4f3\U0001d575\U0001d4bf\U0001d5f7\U0001d7f9\U0001d62b
    4\u2783\u2084\U0001d7e6\u0034\u2463\u13ce\U0001d7f0\U0001d7d2\uff14\U0001d7fa\U0001d7dc\u2074
    5\U0001d598\U0001d600\U0001d682\u2085\u2784\u0053\u02e2\U0001d412\uff15\U0001d494\U0001d516\u0405\U0001d61a\U0001d69c\U0001d42c\U0001d4ae\U0001d530\uff33\U0001d5b2\u0035\U0001d634\u24c8\u01bd\u01bc\U0001d446\U0001d4c8\U0001d54a\U0001d5cc\U0001d64e\U0001d7d3\U0001d4e2\u13d5\u13da\U0001d7dd\u2464\U0001d460\u24e2\U0001d564\U0001d7e7\U0001d5e6\U0001d668\uff53\U0001d7f1\u0073\u2075\u0455\U0001d7fb\U0001d47a\U0001d4fc\u017f\U0001d57e
    6\u0431\u2785\u2086\U0001d7e8\u0036\u2076\u0411\U0001d7f2\U0001d7d4\uff16\u2465\U0001d7fc\U0001d7de
    7\u0037\u2087\u2786\U0001d7e9\u2466\u2077\U0001d7f3\U0001d7d5\uff17\U0001d7fd\U0001d7df
    8\U0001d7e0\u0b03\u0222\u2467\u2787\u09ea\u2088\U0001d7ea\uff18\u0223\U0001d7f4\U0001d7d6\u0038\u0a6a\u2078\U0001d7fe
    9\U0001d7e1\u2468\u2079\u0a67\u2089\u2788\U0001d7eb\u09ed\u0039\U0001d7f5\U0001d7d7\u0b68\uff19\U0001d7ff
    '''.split())) == confuse(
        u'0' * len('................................................................................')
        +
        u'1' * len('...............................................................................................................')
        +
        u'2' * len('.............')
        +
        u'3' * len('.........................................................')
        +
        u'4' * len('.............')
        +
        u'5' * len('....................................................')
        +
        u'6' * len('..............')
        +
        u'7' * len('............')
        +
        u'8' * len('.................')
        +
        u'9' * len('...............')
        ))
    pass
Example #8
0
 def _tag_line(line):
     # Wrap the line in Unicode language-tag characters when xml_lang is set;
     # empty lines and an unset xml_lang pass through unchanged.
     line = to_unicode(line)
     if line and xml_lang:
         # Transliterate the canonical language code into the Unicode TAG
         # block by offsetting each character from 'A' to TAG 'A'.
         tag = u''.join([ unichr(uniord(u'\N{TAG LATIN CAPITAL LETTER A}') - uniord(u'A') + uniord(ch)) for ch in to_unicode(_canon_lang(xml_lang)) ])
         line = u'\N{LANGUAGE TAG}' + tag + line + u'\N{LANGUAGE TAG}\N{CANCEL TAG}'
     return line
Example #9
0
def _slow_init(confusables_file=None):
    '''
    Populate the module-global `confusables` and `confusemap` tables from the
    Unicode confusables.txt data.

    `confusables_file` may be an already-open file of confusables.txt lines;
    when None, a local copy is tried (script mode only) before falling back
    to fetching from unicode.org.  A re-entrant call while a build is in
    progress is a no-op.
    '''
    global confusables, confusemap, _init_in_progress
    if _init_in_progress:
        return
    _init_in_progress = True
    try:
        if confusables_file is None and __name__ == '__main__':
            try:
                confusables_file = file(
                    os.path.join(
                        os.path.split(sys.argv[0])[0],
                        '../share/confusables.txt'), 'rb')
            except:
                # Best effort -- fall through to the network fetch below.
                pass
        if confusables_file is None:
            confusables_file = urllib.urlopen(
                'http://www.unicode.org/reports/tr36/data/confusables.txt',
                'rb')
        # Parse lines of the form "src ; dst ; class # comment", where src and
        # dst are space-separated hex codepoints; malformed lines are skipped.
        for line in confusables_file:
            line = line.decode('utf-8').rstrip()
            try:
                cmd, comment = [field.strip() for field in line.split('#', 1)]
                a, b, cclass = [field.strip() for field in cmd.split(';')]
                a, b = [
                    u''.join([unichr(int(ch, 16)) for ch in field.split()])
                    for field in (a, b)
                ]
                confusables[cclass][a] = confusables[cclass].get(
                    a, set()) | set((b, ))
            except:
                pass
        confusables_file.close()

        # Seed the 'all' class: for every codepoint, union its case variants,
        # NFKD decompositions and (when possible) IDNA round-trip, then expand
        # through the existing confusable sets to a fixed point.
        for chno in xrange(0, 0x110000):
            source = unichr(chno)
            sources = set([
                source,
                unicodedata.normalize('NFKD', source),
                source.upper(),
                unicodedata.normalize('NFKD', source.upper()),
                source.title(),
                unicodedata.normalize('NFKD', source.title()),
                source.lower(),
                unicodedata.normalize('NFKD', source.lower())
            ])
            try:
                sources = sources | set([source.encode('idna').decode('idna')])
            except:
                # Not all codepoints survive IDNA encoding; ignore those.
                pass
            while True:
                sources2 = sources
                for source in sources:
                    sources2 = sources2 | confusables['all'].get(
                        source, set([source]))
                if sources2 == sources:
                    break
                sources = sources2
            if len(sources) > 1:
                for source in sources:
                    if unilen(source) == 1:
                        confusables['all'][source] = sources

        # Fold every specific confusable class into the 'all' class, expanded
        # one level through the existing 'all' sets.
        for cclass in confusables:
            if cclass == 'all':
                continue
            for source in confusables[cclass]:
                dest = confusables[cclass][source]
                dest = dest | confusables['all'].get(source, set()) | set(
                    [source])
                dest2 = dest
                for d in dest2:
                    dest = dest | confusables['all'].get(d, set())
                for d in dest:
                    if unilen(d) == 1:
                        confusables['all'][d] = confusables['all'].get(
                            d, set()) | dest

        # Three identical closure passes follow: each rebuilds 'all' (via the
        # temporary 'all2') by unioning case variants, NFKD forms and existing
        # confusable sets two levels deep.
        confusables['all2'] = {}

        for source in confusables['all']:
            sources = set([
                source,
                unicodedata.normalize('NFKD', source),
                source.upper(),
                unicodedata.normalize('NFKD', source.upper()),
                source.title(),
                unicodedata.normalize('NFKD', source.title()),
                source.lower(),
                unicodedata.normalize('NFKD', source.lower())
            ])
            dest = set()
            for source2 in sources:
                dest = dest | confusables['all'].get(source2, set())
            sources = sources | dest
            dest = set()
            for source2 in sources:
                dest = dest | confusables['all'].get(source2, set())
            sources = sources | dest
            for source2 in sources:
                if unilen(source2) == 1:
                    confusables['all2'][source2] = sources

        confusables['all'] = confusables['all2']

        del confusables['all2']

        # Second closure pass (identical to the first).
        confusables['all2'] = {}

        for source in confusables['all']:
            sources = set([
                source,
                unicodedata.normalize('NFKD', source),
                source.upper(),
                unicodedata.normalize('NFKD', source.upper()),
                source.title(),
                unicodedata.normalize('NFKD', source.title()),
                source.lower(),
                unicodedata.normalize('NFKD', source.lower())
            ])
            dest = set()
            for source2 in sources:
                dest = dest | confusables['all'].get(source2, set())
            sources = sources | dest
            dest = set()
            for source2 in sources:
                dest = dest | confusables['all'].get(source2, set())
            sources = sources | dest
            for source2 in sources:
                if unilen(source2) == 1:
                    confusables['all2'][source2] = sources

        confusables['all'] = confusables['all2']

        del confusables['all2']

        # Third closure pass (identical to the first).
        confusables['all2'] = {}

        for source in confusables['all']:
            sources = set([
                source,
                unicodedata.normalize('NFKD', source),
                source.upper(),
                unicodedata.normalize('NFKD', source.upper()),
                source.title(),
                unicodedata.normalize('NFKD', source.title()),
                source.lower(),
                unicodedata.normalize('NFKD', source.lower())
            ])
            dest = set()
            for source2 in sources:
                dest = dest | confusables['all'].get(source2, set())
            sources = sources | dest
            dest = set()
            for source2 in sources:
                dest = dest | confusables['all'].get(source2, set())
            sources = sources | dest
            for source2 in sources:
                if unilen(source2) == 1:
                    confusables['all2'][source2] = sources

        confusables['all'] = confusables['all2']

        del confusables['all2']

        # Build the final character -> skeleton map, keeping only entries
        # where confusion-folding differs from plain NFKC-lowercasing (or the
        # character lies outside the BMP and changes at all).
        _confusemap = {}
        for chno in xrange(0, 0x110000):
            source = unichr(chno)
            dest = _confuse_internal(source, cclasses=['all'], asciify=True)
            dest2 = unicodedata.normalize('NFKC', source.lower())
            if dest != dest2 or (uniord(source) >= 0x10000 and source != dest):
                if dest != unicodedata.normalize('NFC', dest.lower()):
                    # Sanity check: the mapped value must itself be stable.
                    raise ValueError(repr(source) + ': ' + repr(dest))
                _confusemap[source] = dest
        confusemap = _confusemap
    finally:
        _init_in_progress = False
Example #10
0
def test():
    '''
    Small smoke test to make sure this module is not broken.
    '''
    # Basic astral-plane helpers: length, ordinal, chr round-trips.
    assert unilen(u'\U0010fffd') == 1
    assert len(u'Hello, world!') == unilen(u'Hello, world!')
    assert uniord(u'\U0010fffd') == 0x10fffd
    assert uniord(u'H') == ord(u'H')
    assert unichr(0x10fffd) == u'\U0010fffd'
    assert unichr(ord(u'H')) == u'H'
    assert [ch for ch in unichars(u'H\U0010fffd')] == ['H', u'\U0010fffd']
    # Homoglyph folding: visually-confusable strings map to the same skeleton.
    assert confuse(u'Hello, world!', ['all'],
                   True) == confuse(u'he110, w0r1d!', ['all'], True)
    assert confuse(u'Sergey Brin') == confuse(
        u'5e\N{GREEK SMALL LETTER GAMMA}g\N{GREEK SMALL LETTER EPSILON}\N{GREEK SMALL LETTER UPSILON} \N{GREEK CAPITAL LETTER BETA}r1\N{GREEK CAPITAL LETTER NU}'
    )
    assert confuse(u'He\N{LATIN SMALL LETTER SHARP S}e') == confuse(u'Hesse')
    assert confuse(
        u'\N{GREEK CAPITAL LETTER KAPPA}\N{GREEK SMALL LETTER NU}\N{GREEK SMALL LETTER OMEGA}\N{GREEK SMALL LETTER SIGMA}\N{GREEK SMALL LETTER OMICRON WITH TONOS}\N{GREEK SMALL LETTER FINAL SIGMA}'
    ) == confuse(u'Kv\N{OHM SIGN}co\N{COMBINING ACUTE ACCENT}c')
    # Dotted / dotless I family (Turkish casing corner cases).
    assert confuse(u'\N{LATIN SMALL LETTER I}') == confuse(
        u'\N{LATIN CAPITAL LETTER I}')
    assert confuse(u'\N{LATIN SMALL LETTER I}\N{COMBINING DOT ABOVE}'
                   ) == confuse(u'\N{LATIN CAPITAL LETTER I WITH DOT ABOVE}')
    assert confuse(u'\N{LATIN SMALL LETTER DOTLESS I}') == confuse(
        u'\N{LATIN CAPITAL LETTER I}')
    assert confuse(u'\N{LATIN SMALL LETTER DOTLESS I}\N{COMBINING DOT ABOVE}'
                   ) == confuse(u'\N{LATIN SMALL LETTER I}')
    assert confuse(
        u'\N{LATIN SMALL LETTER DOTLESS I}\N{COMBINING DOT ABOVE}') == confuse(
            u'\N{LATIN SMALL LETTER I}\N{COMBINING DOT ABOVE}')
    assert confuse(
        u'\N{LATIN SMALL LETTER DOTLESS I}\N{COMBINING DOT ABOVE}') == confuse(
            u'\N{LATIN CAPITAL LETTER I}\N{COMBINING DOT ABOVE}')
    assert confuse(u'\N{LATIN SMALL LETTER DOTLESS I}\N{COMBINING DOT ABOVE}'
                   ) == confuse(u'\N{LATIN CAPITAL LETTER I WITH DOT ABOVE}')
    assert confuse(u'\N{LATIN CAPITAL LETTER I}\N{COMBINING DOT ABOVE}'
                   ) == confuse(u'\N{LATIN CAPITAL LETTER I WITH DOT ABOVE}')
    assert confuse(u'\N{LATIN CAPITAL LETTER I WITH DOT ABOVE}') == confuse(
        u'\N{DIGIT ONE}\N{COMBINING DOT ABOVE}')
    assert confuse(u'\N{LATIN CAPITAL LETTER O WITH HORN}') == confuse(
        u'\N{DIGIT ZERO}\N{COMBINING HORN}')
    assert confuse(u'\N{LATIN SMALL LETTER O WITH HORN}') == confuse(
        u'\N{DIGIT ZERO}\N{COMBINING HORN}')
    assert confuse(u'\N{LATIN SMALL LETTER I}') == confuse(
        u'\N{LATIN SMALL LETTER I}\N{COMBINING DOT ABOVE}')
    assert confuse(u'\N{LATIN SMALL LETTER I}') == confuse(
        u'\N{LATIN CAPITAL LETTER I WITH DOT ABOVE}')
    assert confuse(u'\N{LATIN CAPITAL LETTER I WITH DOT ABOVE}') == confuse(
        u'\N{DIGIT ONE}')
    assert confuse(u'\N{LATIN CAPITAL LETTER O WITH HORN}') == confuse(
        u'\N{LATIN CAPITAL LETTER O}\N{SPACE}\N{COMBINING COMMA ABOVE}')
    assert confuse(u'\N{LATIN SMALL LETTER O WITH HORN}') == confuse(
        u'\N{LATIN CAPITAL LETTER O}\N{SPACE}\N{COMBINING COMMA ABOVE}')

    # NOTE: this is a weak bidi homograph for "i love you"
    assert confuse(u'ڸ٥ﻻ ﻉ√٥ﺎ ٱ') == confuse(
        u'\u06b8\u0665\ufefb \ufec9\u221a\u0665\ufe8e \u0671')
    assert confuse(u'\u06b8\u0665\u0644\u0627 \u0639\u221a\u0665\u0627 \u0671'
                   ) == confuse(u'ڸ٥ﻻ ﻉ√٥ﺎ ٱ')
    # Digit homoglyph classes: each indented line below starts with a digit
    # followed by that digit's known confusables; the right-hand side repeats
    # each digit once per confusable (the dot strings encode the counts).
    assert (confuse(u''.join(u'''
    0\u09e6\u2080\u0d02\u0585\u3007\U0001d70a\U0001d5ae\u1d0f\U0001d40e\u1810\U0001d512\U0001d7f6\U0001d594\U0001d616\U0001d698\u0966\u101d\U0001d6f0\u039f\u041e\u0d20\U0001d7ec\uff4f\u0c66\U0001d630\U0001d428\U0001d72a\U0001d52c\uff2f\u0be6\u0030\u24c4\u0a66\u2134\U0001d6b6\U0001d7b8\u0ed0\U0001d79e\uff10\u03bf\u043e\u1040\U0001d442\U0001d744\U0001d546\U0001d5c8\U0001d64a\U0001d664\U0001d67e\u004f\U0001d7ce\u0e50\U0001d5e2\u0555\u2d54\u0c02\U0001d7d8\u0c82\U0001d45c\u24de\U0001d560\U0001d7e2\U0001d764\u0ae6\U0001d490\u0ce6\u0b66\U0001d4aa\u006f\u24ea\u2070\U0001d4de\U0001d476\u00ba\U0001d4f8\U0001d6d0\U0001d57a\U0001d5fc\U0001d77e
    1\U0001d408\U0001d456\U0001d591\U0001d610\u2112\u0399\u24db\u0322\u13a5\U0001d526\u03b9\U0001d43c\U0001d73e\U0001d5c5\U0001d644\u004c\U0001d7d9\U0001d4db\U0001d55a\U0001d75e\u2460\U0001d7ed\u006c\U0001d470\U0001d5f9\U0001d678\u24be\U0001d40b\U0001d58e\uff11\U0001d613\u0196\U0001d6a4\U0001d529\u0131\U0001d6b0\U0001d7b2\u2139\U0001d43f\u1fbe\u04c0\U0001d5c2\u0345\U0001d647\uff49\u24d8\U0001d55d\u13de\U0001d473\U0001d5f6\U0001d67b\u2081\U0001d704\u0406\U0001d48d\u2110\U0001d695\U0001d4d8\U0001d422\U0001d724\u0328\U0001d5ab\U0001d62a\uff2c\u00b9\u24c1\U0001d540\u2148\uff4c\U0001d7cf\u0456\U0001d5df\U0001d65e\u02e1\u2160\U0001d7e3\u0069\u216c\u2170\U0001d4f5\U0001d574\U0001d7f7\U0001d778\u217c\u2780\U0001d48a\U0001d50f\u2113\U0001d692\U0001d798\U0001d425\U0001d5a8\u2111\U0001d62d\u0031\U0001d4be\U0001d543\u0049\U0001d6ca\U0001d459\U0001d5dc\U0001d661\u0269\U0001d6ea\u2071\uff29\u2373\U0001d4f2\U0001d577\u0130
    2\u2781\u2461\u2082\U0001d7e4\U0001d7ee\U0001d7d0\uff12\U0001d7f8\u0032\U0001d7da\u00b2\u14bf
    3\u2083\u2782\U0001d409\u0408\U0001d48b\u148d\U0001d58f\U0001d611\u24d9\U0001d693\u0417\u0545\U0001d55b\U0001d423\U0001d4a5\U0001d527\U0001d5a9\u13ab\uff2a\u03f3\u00b3\u02b2\u0437\u0575\U0001d43d\u24bf\U0001d541\U0001d5c3\u004a\U0001d645\u0033\u2149\U0001d50d\uff4a\U0001d7d1\U0001d679\U0001d457\U0001d4d9\u0458\U0001d7db\U0001d5dd\u025c\U0001d65f\u2462\U0001d7e5\u13e7\u006a\uff13\U0001d7ef\U0001d471\U0001d4f3\U0001d575\U0001d4bf\U0001d5f7\U0001d7f9\U0001d62b
    4\u2783\u2084\U0001d7e6\u0034\u2463\u13ce\U0001d7f0\U0001d7d2\uff14\U0001d7fa\U0001d7dc\u2074
    5\U0001d598\U0001d600\U0001d682\u2085\u2784\u0053\u02e2\U0001d412\uff15\U0001d494\U0001d516\u0405\U0001d61a\U0001d69c\U0001d42c\U0001d4ae\U0001d530\uff33\U0001d5b2\u0035\U0001d634\u24c8\u01bd\u01bc\U0001d446\U0001d4c8\U0001d54a\U0001d5cc\U0001d64e\U0001d7d3\U0001d4e2\u13d5\u13da\U0001d7dd\u2464\U0001d460\u24e2\U0001d564\U0001d7e7\U0001d5e6\U0001d668\uff53\U0001d7f1\u0073\u2075\u0455\U0001d7fb\U0001d47a\U0001d4fc\u017f\U0001d57e
    6\u0431\u2785\u2086\U0001d7e8\u0036\u2076\u0411\U0001d7f2\U0001d7d4\uff16\u2465\U0001d7fc\U0001d7de
    7\u0037\u2087\u2786\U0001d7e9\u2466\u2077\U0001d7f3\U0001d7d5\uff17\U0001d7fd\U0001d7df
    8\U0001d7e0\u0b03\u0222\u2467\u2787\u09ea\u2088\U0001d7ea\uff18\u0223\U0001d7f4\U0001d7d6\u0038\u0a6a\u2078\U0001d7fe
    9\U0001d7e1\u2468\u2079\u0a67\u2089\u2788\U0001d7eb\u09ed\u0039\U0001d7f5\U0001d7d7\u0b68\uff19\U0001d7ff
    '''.split())) == confuse(u'0' * len(
        '................................................................................'
    ) + u'1' * len(
        '...............................................................................................................'
    ) + u'2' * len('.............') + u'3' * len(
        '.........................................................'
    ) + u'4' * len('.............') + u'5' * len(
        '....................................................') +
                             u'6' * len('..............') +
                             u'7' * len('............') +
                             u'8' * len('.................') +
                             u'9' * len('...............')))
    pass