Example #1
0
def get_widechar_converter(excepts=None):
    r'''
    Build a converter that runs text through the module-level
    ``_fullwide_map`` translation table.

    The returned callable evaluates ``s.translate(table)``.  Code points the
    table does not cover are left unchanged by ``translate()``.

    :param excepts: optional iterable of characters that must be kept as
        they are.  Characters outside the table are never converted, so
        they are accepted and ignored.
    :return: a one-argument function returning the converted text.

    >>> f = get_widechar_converter(r'/\@')
    >>> s = 'wc是@厕所的意思.../'

    Applied to ``s`` as text, ``f`` gives ``ｗｃ是@厕所的意思．．．/``: the
    exempted ``@`` and ``/`` stay as they were.
    '''
    if excepts:
        table = list(_fullwide_map)
        size = len(table)
        for char in excepts:
            code = ord(char)
            # Anything past the end of the table is already untouched by
            # translate(); indexing it here would raise IndexError.
            if code < size:
                table[code] = char
        fm = UEMPTY.join(table)
    else:
        fm = _fullwide_map
    return lambda s: s.translate(fm)
Example #2
0
def get_widechar_converter(excepts=None):
    r'''
    Build a converter that runs text through the module-level
    ``_fullwide_map`` translation table.

    The returned callable evaluates ``s.translate(table)``.  Code points the
    table does not cover are left unchanged by ``translate()``.

    :param excepts: optional iterable of characters that must be kept as
        they are.  Characters outside the table are never converted, so
        they are accepted and ignored.
    :return: a one-argument function returning the converted text.

    >>> f = get_widechar_converter(r'/\@')
    >>> s = 'wc是@厕所的意思.../'
    >>> print(f(s) if py3k else f(s.decode('utf-8')).encode('utf-8'))
    ｗｃ是@厕所的意思．．．/
    '''
    if excepts:
        table = list(_fullwide_map)
        size = len(table)
        for char in excepts:
            code = ord(char)
            # Anything past the end of the table is already untouched by
            # translate(); indexing it here would raise IndexError.
            if code < size:
                table[code] = char
        fm = UEMPTY.join(table)
    else:
        fm = _fullwide_map
    return lambda s: s.translate(fm)
Example #3
0
def hzconvert(text, from_, to_, method='auto', chardict=None):
    '''
    Encode ``text`` to Shift-JIS, substituting characters the codec rejects.

    Only ``from_='gbk'``, ``to_='sjis'``, ``method='auto'`` is implemented.

    For every character the codec cannot encode, the ``gbk_to_sjis`` error
    handler registered here tries, in order:

    1. the replacement stored in ``chconv.Chinese2Kanji_Table`` (extended
       with the usable entries of ``chardict``);
    2. a random pick among the characters ``xpinyin`` lists for the
       character's pinyin, and for that pinyin minus its last character
       (presumably the tone mark -- confirm against ``xpinyin``), keeping
       only candidates that encode to Shift-JIS.  The pick is written back
       to the table so later occurrences reuse it;
    3. spaces, one per rejected character.

    Afterwards two reports are printed: the characters that were replaced
    by spaces and the characters that were guessed through pinyin.

    NOTE: ``chconv.Chinese2Kanji_Table`` itself is updated in place, so
    ``chardict`` entries and guessed replacements outlive this call.

    :param text: text to encode (its ``encode`` method is called).
    :param from_: must be ``'gbk'``.
    :param to_: must be ``'sjis'``.
    :param method: must be ``'auto'``.
    :param chardict: optional mapping of single source characters to single
        replacement characters; entries whose replacement cannot be encoded
        to Shift-JIS are skipped.  ``None`` means no extra entries.
    :return: the result of ``text.encode('SHIFT-JIS', errors='gbk_to_sjis')``.
    :raises AssertionError: for any unsupported argument combination.
    :raises UnicodeError: re-raised, after printing the offending slice,
        if encoding still fails.
    '''
    assert from_ == 'gbk' and to_ == 'sjis' and method == 'auto'

    from zhtools import chconv, xpinyin

    def sjis_encodable(candidate):
        # True when the module's encode() helper accepts the candidate.
        try:
            encode(candidate, 'SHIFT-JIS')
        except UnicodeEncodeError:
            return False
        return True

    def report(title, codes):
        # Print a heading followed by the collected code points as text.
        print(title)
        if py3k:
            print(''.join(chr(c) for c in codes))
        else:
            print(encode(UEMPTY.join(chr(c) for c in codes)))

    cdict = chconv.Chinese2Kanji_Table
    # chardict defaults to None: treat that as "no extra mappings" instead
    # of failing on None.items().
    for k, v in (chardict or {}).items():
        if sjis_encodable(v):
            cdict[ord(k)] = ord(v)

    xpy = xpinyin.Pinyin()
    guess_chars = set()
    except_chars = set()

    def gbk_to_sjis(exc):
        if not isinstance(exc, UnicodeEncodeError):
            raise exc
        newpos = exc.end
        char = exc.object[exc.start:exc.end]
        c = ord(char)
        if c in cdict:
            return chr(cdict[c]), newpos
        pinyin = xpy.get_pinyin(char)
        ok = []
        if pinyin:
            # Candidates for the full pinyin first, then for the pinyin
            # with its final character removed.
            for key in (pinyin, pinyin[:-1]):
                ok.extend(
                    newchar for newchar in xpy.py2hz(key)
                    if sjis_encodable(newchar)
                )
        if ok:
            newchar = random.choice(ok)
            # Remember the guess so the same character maps consistently.
            cdict[c] = ord(newchar)
            guess_chars.add(c)
            return newchar, newpos
        except_chars.add(c)
        return ' ' * (newpos - exc.start), newpos

    codecs.register_error('gbk_to_sjis', gbk_to_sjis)
    # from zhtools import langconv
    # text = langconv.Converter('zh-hant').convert(text)
    try:
        text = text.encode('SHIFT-JIS', errors='gbk_to_sjis')
    except UnicodeError as exc:
        char = exc.object[exc.start:exc.end]
        print(char)
        raise
    report('These chars cannot encode to shift-jis:', except_chars)
    report('These chars can be guessed by pinyin:', guess_chars)
    return text
Example #4
0
# -*- coding: utf-8 -*-

from portable import chr, to_unicode, UEMPTY, py3k

# Translation table indexed by ASCII code point.  Fullwidth forms exist only
# for the printable range U+0021..U+007E (shifted by 65248 == 0xFEE0 to
# U+FF01..U+FF5E); control characters and DEL have no fullwidth form and must
# map to themselves rather than to unrelated code points.
_fullwide_map = [
    chr(65248 + i) if 0x21 <= i <= 0x7E else chr(i)
    for i in range(128)
]
_fullwide_map[32] = to_unicode(' ')
_fullwide_map = UEMPTY.join(_fullwide_map)


def get_widechar_converter(excepts=None):
    r'''
    Build a converter that replaces ASCII characters with the entries of
    the module-level ``_fullwide_map`` table (their fullwidth forms).

    The returned callable evaluates ``s.translate(table)``.  Code points
    beyond the 128-entry table are left unchanged by ``translate()``.

    :param excepts: optional iterable of characters that must be kept as
        they are.  Characters outside the table are never converted, so
        they are accepted and ignored.
    :return: a one-argument function returning the converted text.

    >>> f = get_widechar_converter(r'/\@')
    >>> s = 'wc是@厕所的意思.../'
    >>> print(f(s) if py3k else f(s.decode('utf-8')).encode('utf-8'))
    ｗｃ是@厕所的意思．．．/
    '''
    if excepts:
        table = list(_fullwide_map)
        size = len(table)
        for char in excepts:
            code = ord(char)
            # Anything past the end of the table is already untouched by
            # translate(); indexing it here would raise IndexError.
            if code < size:
                table[code] = char
        fm = UEMPTY.join(table)
    else:
        fm = _fullwide_map
    return lambda s: s.translate(fm)


if __name__ == '__main__':
    # Executed as a script: run the doctests embedded in this module.
    from doctest import testmod
    testmod()
Example #5
0
def hzconvert(text, from_, to_, method='auto', chardict=None):
    '''
    Encode ``text`` to Shift-JIS, substituting characters the codec rejects.

    Only ``from_='gbk'``, ``to_='sjis'``, ``method='auto'`` is implemented.

    For every character the codec cannot encode, the ``gbk_to_sjis`` error
    handler registered here tries, in order:

    1. the replacement stored in ``chconv.Chinese2Kanji_Table`` (extended
       with the usable entries of ``chardict``);
    2. a random pick among the characters ``xpinyin`` lists for the
       character's pinyin, and for that pinyin minus its last character
       (presumably the tone mark -- confirm against ``xpinyin``), keeping
       only candidates that encode to Shift-JIS.  The pick is written back
       to the table so later occurrences reuse it;
    3. spaces, one per rejected character.

    Afterwards two reports are printed: the characters that were replaced
    by spaces and the characters that were guessed through pinyin.

    NOTE: ``chconv.Chinese2Kanji_Table`` itself is updated in place, so
    ``chardict`` entries and guessed replacements outlive this call.

    :param text: text to encode (its ``encode`` method is called).
    :param from_: must be ``'gbk'``.
    :param to_: must be ``'sjis'``.
    :param method: must be ``'auto'``.
    :param chardict: optional mapping of single source characters to single
        replacement characters; entries whose replacement cannot be encoded
        to Shift-JIS are skipped.  ``None`` means no extra entries.
    :return: the result of ``text.encode('SHIFT-JIS', errors='gbk_to_sjis')``.
    :raises AssertionError: for any unsupported argument combination.
    :raises UnicodeError: re-raised, after printing the offending slice,
        if encoding still fails.
    '''
    assert from_ == 'gbk' and to_ == 'sjis' and method == 'auto'

    from zhtools import chconv, xpinyin

    def sjis_encodable(candidate):
        # True when the module's encode() helper accepts the candidate.
        try:
            encode(candidate, 'SHIFT-JIS')
        except UnicodeEncodeError:
            return False
        return True

    def report(title, codes):
        # Print a heading followed by the collected code points as text.
        print(title)
        if py3k:
            print(''.join(chr(c) for c in codes))
        else:
            print(encode(UEMPTY.join(chr(c) for c in codes)))

    cdict = chconv.Chinese2Kanji_Table
    # chardict defaults to None: treat that as "no extra mappings" instead
    # of failing on None.items().
    for k, v in (chardict or {}).items():
        if sjis_encodable(v):
            cdict[ord(k)] = ord(v)

    xpy = xpinyin.Pinyin()
    guess_chars = set()
    except_chars = set()

    def gbk_to_sjis(exc):
        if not isinstance(exc, UnicodeEncodeError):
            raise exc
        newpos = exc.end
        char = exc.object[exc.start:exc.end]
        c = ord(char)
        if c in cdict:
            return chr(cdict[c]), newpos
        pinyin = xpy.get_pinyin(char)
        ok = []
        if pinyin:
            # Candidates for the full pinyin first, then for the pinyin
            # with its final character removed.
            for key in (pinyin, pinyin[:-1]):
                ok.extend(
                    newchar for newchar in xpy.py2hz(key)
                    if sjis_encodable(newchar)
                )
        if ok:
            newchar = random.choice(ok)
            # Remember the guess so the same character maps consistently.
            cdict[c] = ord(newchar)
            guess_chars.add(c)
            return newchar, newpos
        except_chars.add(c)
        return ' ' * (newpos - exc.start), newpos

    codecs.register_error('gbk_to_sjis', gbk_to_sjis)
    # from zhtools import langconv
    # text = langconv.Converter('zh-hant').convert(text)
    try:
        text = text.encode('SHIFT-JIS', errors='gbk_to_sjis')
    except UnicodeError as exc:
        char = exc.object[exc.start:exc.end]
        print(char)
        raise
    report('These chars cannot encode to shift-jis:', except_chars)
    report('These chars can be guessed by pinyin:', guess_chars)
    return text
Example #6
0
# -*- coding: utf-8 -*-


from portable import chr, to_unicode, UEMPTY, py3k

# Translation table indexed by ASCII code point.  Fullwidth forms exist only
# for the printable range U+0021..U+007E (shifted by 65248 == 0xFEE0 to
# U+FF01..U+FF5E); control characters and DEL have no fullwidth form and must
# map to themselves rather than to unrelated code points.
_fullwide_map = [
    chr(65248 + i) if 0x21 <= i <= 0x7E else chr(i)
    for i in range(128)
]
_fullwide_map[32] = to_unicode(' ')
_fullwide_map = UEMPTY.join(_fullwide_map)


def get_widechar_converter(excepts=None):
    r'''
    Build a converter that replaces ASCII characters with the entries of
    the module-level ``_fullwide_map`` table (their fullwidth forms).

    The returned callable evaluates ``s.translate(table)``.  Code points
    beyond the 128-entry table are left unchanged by ``translate()``.

    :param excepts: optional iterable of characters that must be kept as
        they are.  Characters outside the table are never converted, so
        they are accepted and ignored.
    :return: a one-argument function returning the converted text.

    >>> f = get_widechar_converter(r'/\@')
    >>> s = 'wc是@厕所的意思.../'
    >>> print(f(s) if py3k else f(s.decode('utf-8')).encode('utf-8'))
    ｗｃ是@厕所的意思．．．/
    '''
    if excepts:
        table = list(_fullwide_map)
        size = len(table)
        for char in excepts:
            code = ord(char)
            # Anything past the end of the table is already untouched by
            # translate(); indexing it here would raise IndexError.
            if code < size:
                table[code] = char
        fm = UEMPTY.join(table)
    else:
        fm = _fullwide_map
    return lambda s: s.translate(fm)


if __name__ == '__main__':
    # Executed as a script: run the doctests embedded in this module.
    from doctest import testmod
    testmod()