Beispiel #1
0
#: Whitespace code point intervals, written as ``(first, last)`` pairs.
#: Single-character entries repeat the same code point at both ends.
ITV_WHITESPACE_CHARS = utils.sorted_chain([
    (0x0009, 0x000D),  # TAB, LF, VT, FF, CR
    (0x0020, 0x0020),  # SPACE
    (0x0085, 0x0085),  # NEXT LINE
    (0x00A0, 0x00A0),  # NO-BREAK SPACE
    # U+00B7 (middle dot) is intentionally not listed here: it is handled
    # as a delimiter because it is common in Chinese news material.
    (0x1680, 0x1680),  # OGHAM SPACE MARK
    (0x180E, 0x180E),  # MONGOLIAN VOWEL SEPARATOR
    # Widened from (0x2000, 0x200D) so the range runs through U+200F.
    (0x2000, 0x200F),
    # Widened from (0x2028, 0x2029) so the range runs through U+202C.
    (0x2028, 0x202C),
    (0x202F, 0x202F),  # NARROW NO-BREAK SPACE
    # Widened from (0x205F, 0x2060) so the range runs through U+206F.
    (0x205F, 0x206F),
    (0x237D, 0x237D),  # SHOULDERED OPEN BOX
    (0x2420, 0x2420),  # SYMBOL FOR SPACE
    (0x2422, 0x2423),  # BLANK SYMBOL, OPEN BOX
    (0x3000, 0x3000),  # IDEOGRAPHIC SPACE
    (0xFEFF, 0xFEFF),  # ZERO WIDTH NO-BREAK SPACE (byte order mark)
])
Beispiel #2
0
"""
Consts for detecting English chars.
"""
from cnt.rulebase.const import utils

#: English letters as ``(first, last)`` code point pairs: the ASCII
#: alphabet plus its fullwidth counterparts.
ITV_ENGLISH_CHARS = utils.sorted_chain(
    # ASCII letters: A-Z, then a-z.
    [(0x0041, 0x005A), (0x0061, 0x007A)],
    # Fullwidth letters: U+FF21..U+FF3A (A-Z), then U+FF41..U+FF5A (a-z).
    [(0xFF21, 0xFF3A), (0xFF41, 0xFF5A)],
)
Beispiel #3
0
#: Chinese Chars, as ``(first, last)`` code point pairs.
#: Pulled from https://www.qqxiuzi.cn/zh/hanzi-unicode-bianma.php
#: Note that ``3007`` is a delimiter, hence it is not included here.
#:
#: The per-range labels below use the Unicode block names; check them against
#: the table linked above when updating.
#:
#: Range generation::
#:
#:  lines = '''copy paste the table here'''
#:  [l.split('\t') for l in lines.strip().split('\n')]
ITV_CHINESE_CHARS = utils.sorted_chain([
    (0x4E00, 0x9FA5),  # CJK Unified Ideographs (basic set)
    (0x9FA6, 0x9FEF),  # CJK Unified Ideographs (later additions)
    (0x3400, 0x4DB5),  # CJK Unified Ideographs Extension A
    (0x20000, 0x2A6D6),  # CJK Unified Ideographs Extension B
    (0x2A700, 0x2B734),  # CJK Unified Ideographs Extension C
    (0x2B740, 0x2B81D),  # CJK Unified Ideographs Extension D
    (0x2B820, 0x2CEA1),  # CJK Unified Ideographs Extension E
    (0x2CEB0, 0x2EBE0),  # CJK Unified Ideographs Extension F
    (0x2F00, 0x2FD5),  # Kangxi Radicals
    (0x2E80, 0x2EF3),  # CJK Radicals Supplement
    (0xF900, 0xFAD9),  # CJK Compatibility Ideographs
    (0x2F800, 0x2FA1D),  # CJK Compatibility Ideographs Supplement
    (0xE815, 0xE86F),  # Private Use Area range taken from the source table
    (0xE400, 0xE5E8),  # Private Use Area range taken from the source table
    (0xE600, 0xE6CF),  # Private Use Area range taken from the source table
    (0x31C0, 0x31E3),  # CJK Strokes
    (0x2FF0, 0x2FFB),  # Ideographic Description Characters
    (0x3105, 0x312F),  # Bopomofo
    (0x31A0, 0x31BA),  # Bopomofo Extended
])
Beispiel #4
0
"""
Consts for detecting digit chars.
"""
from cnt.rulebase.const import utils

#: Decimal digits as ``(first, last)`` code point pairs: ASCII digits plus
#: their fullwidth counterparts.
ITV_DIGITS = utils.sorted_chain(
    [(0x0030, 0x0039)],  # ASCII digits 0-9
    [(0xFF10, 0xFF19)],  # fullwidth digits, U+FF10..U+FF19
)
Beispiel #5
0
#: Delimiter code point intervals (punctuation and symbols), written as
#: ``(first, last)`` pairs and grouped by Unicode block.
ITV_DELIMITERS = utils.sorted_chain(
    # Printable ASCII characters other than space, digits and letters.
    [(0x0021, 0x002F), (0x003A, 0x0040), (0x005B, 0x0060), (0x007B, 0x007E)],
    # U+00B7 (middle dot): classified here instead of as whitespace.
    [(0x00B7, 0x00B7)],
    # General Punctuation (http://www.unicode.org/charts/PDF/U2000.pdf).
    # Narrowed from the whole block (0x2000, 0x206F) to these sub-ranges.
    [(0x2010, 0x2027), (0x202D, 0x202E), (0x2030, 0x205E)],
    # CJK punctuation (http://www.unicode.org/charts/PDF/U3000.pdf and
    # http://www.unicode.org/charts/PDF/UFE30.pdf).
    # Narrowed from (0x3000, 0x303F): U+3000 is left out of the first range.
    [(0x3001, 0x303F), (0xFE30, 0xFE4F)],
    # Halfwidth and Fullwidth Forms punctuation
    # (http://www.unicode.org/charts/PDF/UFF00.pdf).
    [
        (0xFF01, 0xFF0F), (0xFF1A, 0xFF20), (0xFF3B, 0xFF40),
        (0xFF5B, 0xFF64), (0xFFE0, 0xFFEE),
    ],
)