Beispiel #1
0
from hypothesis.internal.charmap import as_general_categories, categories
from hypothesis.internal.compat import int_to_byte

# True on Python 3.6 and newer.  Presumably gates handling of flags that are
# scoped to a regex sub-pattern -- confirm at the use sites.
HAS_SUBPATTERN_FLAGS = sys.version_info >= (3, 6)

# Everything yielded by categories(), collected into a set.
UNICODE_CATEGORIES = set(categories())

# The six ASCII whitespace characters: space, tab, LF, CR, FF and VT.
SPACE_CHARS = set(" \t\n\r\f\v")
# ASCII whitespace plus the control characters 0x1C-0x1F and 0x85.
UNICODE_SPACE_CHARS = SPACE_CHARS.union("\x1c\x1d\x1e\x1f\x85")
# Category-name sets.  as_general_categories presumably expands a major class
# such as "L" into its two-letter categories -- TODO confirm.
UNICODE_DIGIT_CATEGORIES = {"Nd"}
UNICODE_SPACE_CATEGORIES = set(as_general_categories("Z"))
UNICODE_LETTER_CATEGORIES = set(as_general_categories("L"))
UNICODE_WORD_CATEGORIES = set(as_general_categories(["L", "N"]))

# Classify every byte value by asking the regex engine itself instead of
# hard-coding the classes, so the sets agree with the running interpreter.
BYTES_ALL = set(map(int_to_byte, range(256)))
BYTES_DIGIT = set(filter(re.compile(b"\\d").match, BYTES_ALL))
BYTES_SPACE = set(filter(re.compile(b"\\s").match, BYTES_ALL))
BYTES_WORD = set(filter(re.compile(b"\\w").match, BYTES_ALL))
# Each sre category constant paired with the bytes it covers; a NOT_* entry is
# the complement of its positive counterpart within BYTES_ALL.
BYTES_LOOKUP = {
    sre.CATEGORY_DIGIT: BYTES_DIGIT,
    sre.CATEGORY_SPACE: BYTES_SPACE,
    sre.CATEGORY_WORD: BYTES_WORD,
    sre.CATEGORY_NOT_DIGIT: BYTES_ALL.difference(BYTES_DIGIT),
    sre.CATEGORY_NOT_SPACE: BYTES_ALL.difference(BYTES_SPACE),
    sre.CATEGORY_NOT_WORD: BYTES_ALL.difference(BYTES_WORD),
}

# A shared strategy wrapping st.builds(dict) under a fixed key.  Presumably
# lets every part of one generated example see the same group cache dict --
# confirm against the code that draws from it.
GROUP_CACHE_STRATEGY = st.shared(
    st.builds(dict), key="hypothesis.regex.group_cache"
)
# True on Python 3.6 and newer.  Presumably gates handling of flags that are
# scoped to a regex sub-pattern -- confirm at the use sites.
HAS_SUBPATTERN_FLAGS = sys.version_info >= (3, 6)


# Everything yielded by categories(), collected into a set.
UNICODE_CATEGORIES = set(categories())


# The six ASCII whitespace characters: space, tab, LF, CR, FF and VT.
SPACE_CHARS = set(u" \t\n\r\f\v")
# ASCII whitespace plus the control characters 0x1C-0x1F and 0x85.
UNICODE_SPACE_CHARS = SPACE_CHARS.union(u"\x1c\x1d\x1e\x1f\x85")
# Category-name sets.  as_general_categories presumably expands a major class
# such as "L" into its two-letter categories -- TODO confirm.
UNICODE_DIGIT_CATEGORIES = {"Nd"}
UNICODE_SPACE_CATEGORIES = set(as_general_categories("Z"))
UNICODE_LETTER_CATEGORIES = set(as_general_categories("L"))
UNICODE_WORD_CATEGORIES = set(as_general_categories(["L", "N"]))

# Classify every byte value by asking the regex engine itself instead of
# hard-coding the classes, so the sets agree with the running interpreter.
BYTES_ALL = set(map(int_to_byte, range(256)))
BYTES_DIGIT = set(filter(re.compile(b"\\d").match, BYTES_ALL))
BYTES_SPACE = set(filter(re.compile(b"\\s").match, BYTES_ALL))
BYTES_WORD = set(filter(re.compile(b"\\w").match, BYTES_ALL))
# Each sre category constant paired with the bytes it covers; a NOT_* entry is
# the complement of its positive counterpart within BYTES_ALL.
BYTES_LOOKUP = {
    sre.CATEGORY_DIGIT: BYTES_DIGIT,
    sre.CATEGORY_SPACE: BYTES_SPACE,
    sre.CATEGORY_WORD: BYTES_WORD,
    sre.CATEGORY_NOT_DIGIT: BYTES_ALL.difference(BYTES_DIGIT),
    sre.CATEGORY_NOT_SPACE: BYTES_ALL.difference(BYTES_SPACE),
    sre.CATEGORY_NOT_WORD: BYTES_ALL.difference(BYTES_WORD),
}

# Characters that Python 2's regex engine matches with \W ('not word') even
# though unicodedata.category(c) places them in one of the word categories
# listed above.
UNICODE_WEIRD_NONWORD_CHARS = set(
    u"\U00012432\U00012433\U00012456\U00012457"
)
Beispiel #3
0
# True on Python 3.6 and newer.  Presumably gates handling of flags that are
# scoped to a regex sub-pattern -- confirm at the use sites.
HAS_SUBPATTERN_FLAGS = sys.version_info >= (3, 6)


# Everything yielded by categories(), collected into a set.
UNICODE_CATEGORIES = set(categories())


# The six ASCII whitespace characters: space, tab, LF, CR, FF and VT.
SPACE_CHARS = set(u' \t\n\r\f\v')
# ASCII whitespace plus the control characters 0x1C-0x1F and 0x85.
UNICODE_SPACE_CHARS = SPACE_CHARS.union(u'\x1c\x1d\x1e\x1f\x85')
# Category-name sets.  as_general_categories presumably expands a major class
# such as 'L' into its two-letter categories -- TODO confirm.
UNICODE_DIGIT_CATEGORIES = {'Nd'}
UNICODE_SPACE_CATEGORIES = set(as_general_categories('Z'))
UNICODE_LETTER_CATEGORIES = set(as_general_categories('L'))
UNICODE_WORD_CATEGORIES = set(as_general_categories(['L', 'N']))

# Classify every byte value by asking the regex engine itself instead of
# hard-coding the classes, so the sets agree with the running interpreter.
BYTES_ALL = {int_to_byte(i) for i in range(256)}
BYTES_DIGIT = {b for b in BYTES_ALL if re.match(b'\\d', b)}
BYTES_SPACE = {b for b in BYTES_ALL if re.match(b'\\s', b)}
BYTES_WORD = {b for b in BYTES_ALL if re.match(b'\\w', b)}
# Each sre category constant paired with the bytes it covers; a NOT_* entry is
# the complement of its positive counterpart within BYTES_ALL.
BYTES_LOOKUP = {
    sre.CATEGORY_DIGIT: BYTES_DIGIT,
    sre.CATEGORY_SPACE: BYTES_SPACE,
    sre.CATEGORY_WORD: BYTES_WORD,
    sre.CATEGORY_NOT_DIGIT: BYTES_ALL.difference(BYTES_DIGIT),
    sre.CATEGORY_NOT_SPACE: BYTES_ALL.difference(BYTES_SPACE),
    sre.CATEGORY_NOT_WORD: BYTES_ALL.difference(BYTES_WORD),
}

# On Python < 3.4 (including 2.7), the following unicode chars are weird:
# they are matched by \W (meaning 'not word'), even though
# unicodedata.category(c) returns one of the word categories above.
# There's special handling for them below.
Beispiel #4
0
    'Cf', 'Cn', 'Co', 'LC', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu',
    'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe',
    'Pf', 'Pi', 'Po', 'Ps', 'Sc', 'Sk', 'Sm', 'So', 'Zl',
    'Zp', 'Zs',
])


# The six ASCII whitespace characters: space, tab, LF, CR, FF and VT.
SPACE_CHARS = set(u' \t\n\r\f\v')
# ASCII whitespace plus the control characters 0x1C-0x1F and 0x85.
UNICODE_SPACE_CHARS = SPACE_CHARS.union(u'\x1c\x1d\x1e\x1f\x85')
# Two-letter Unicode general-category names, grouped by character class.
UNICODE_DIGIT_CATEGORIES = {'Nd'}
UNICODE_SPACE_CATEGORIES = {'Zs', 'Zl', 'Zp'}
UNICODE_LETTER_CATEGORIES = {'LC', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu'}
# Word characters are the letters plus all three number categories.
UNICODE_WORD_CATEGORIES = UNICODE_LETTER_CATEGORIES | {'Nd', 'Nl', 'No'}

# Classify every byte value by asking the regex engine itself instead of
# hard-coding the classes, so the sets agree with the running interpreter.
BYTES_ALL = {int_to_byte(i) for i in range(256)}
BYTES_DIGIT = {b for b in BYTES_ALL if re.match(b'\\d', b)}
BYTES_SPACE = {b for b in BYTES_ALL if re.match(b'\\s', b)}
BYTES_WORD = {b for b in BYTES_ALL if re.match(b'\\w', b)}
# Each sre category constant paired with the bytes it covers; a NOT_* entry is
# the complement of its positive counterpart within BYTES_ALL.
BYTES_LOOKUP = {
    sre.CATEGORY_DIGIT: BYTES_DIGIT,
    sre.CATEGORY_SPACE: BYTES_SPACE,
    sre.CATEGORY_WORD: BYTES_WORD,
    sre.CATEGORY_NOT_DIGIT: BYTES_ALL.difference(BYTES_DIGIT),
    sre.CATEGORY_NOT_SPACE: BYTES_ALL.difference(BYTES_SPACE),
    sre.CATEGORY_NOT_WORD: BYTES_ALL.difference(BYTES_WORD),
}

# On Python < 3.4 (including 2.7), the following unicode chars are weird:
# they are matched by \W (meaning 'not word'), even though
# unicodedata.category(c) returns one of the word categories above.
# There's special handling for them below.