Example #1
0
def _compile(pattern, flags=0, kwargs=None):
    """Compile a regular expression to a PatternObject.

    Args:
        pattern: The pattern as a ``unicode`` or ``str`` string, or an
            already-compiled pattern object (returned unchanged).
        flags: Bitwise OR of the module's flag constants.
        kwargs: Mapping from named-list names to iterables of their values.
            ``None`` (the default) is treated as an empty mapping.

    Returns:
        The cached pattern object on a cache hit, or ``pattern`` itself when
        it is already a compiled pattern.

    Raises:
        TypeError: If ``pattern`` is neither a string nor a compiled pattern.
        ValueError: If flags are supplied together with a compiled pattern.
        error: If the pattern cannot be parsed, or a named list required by
            a previously seen pattern is missing from ``kwargs``.
    """
    # Use a fresh dict per call; a mutable default would be one object shared
    # by every call and it is handed on to _Info below.
    if kwargs is None:
        kwargs = {}

    # We won't bother to cache the pattern if we're debugging.
    debugging = (flags & DEBUG) != 0

    # What locale is this pattern using?
    locale_key = (type(pattern), pattern)
    if _locale_sensitive.get(locale_key, True) or (flags & LOCALE) != 0:
        # This pattern is, or might be, locale-sensitive.
        pattern_locale = _getlocale()[1]
    else:
        # This pattern is definitely not locale-sensitive.
        pattern_locale = None

    if not debugging:
        try:
            # Do we know what keyword arguments are needed?
            args_key = pattern, type(pattern), flags
            args_needed = _named_args[args_key]

            # Are we being provided with its required keyword arguments?
            args_supplied = set()
            if args_needed:
                for k, _ in args_needed:
                    try:
                        args_supplied.add((k, frozenset(kwargs[k])))
                    except KeyError:
                        # Not a KeyError, so it escapes the outer handler too.
                        raise error("missing named list: {!r}".format(k))

            args_supplied = frozenset(args_supplied)

            # Have we already seen this regular expression and named list?
            pattern_key = (pattern, type(pattern), flags, args_supplied,
              DEFAULT_VERSION, pattern_locale)
            return _cache[pattern_key]
        except KeyError:
            # It's a new pattern, or new named list for a known pattern.
            pass

    # Guess the encoding from the class of the pattern string.
    if isinstance(pattern, unicode):
        guess_encoding = UNICODE
    elif isinstance(pattern, str):
        guess_encoding = ASCII
    elif isinstance(pattern, _pattern_type):
        if flags:
            raise ValueError("cannot process flags argument with a compiled pattern")

        return pattern
    else:
        raise TypeError("first argument must be a string or compiled pattern")

    # Set the default version in the core code in case it has been changed.
    _regex_core.DEFAULT_VERSION = DEFAULT_VERSION

    caught_exception = None
    global_flags = flags

    # Parse repeatedly: each _UnscopedFlagSet restarts the parse with the
    # global flags recorded so far.
    while True:
        try:
            source = _Source(pattern)
            info = _Info(global_flags, source.char_type, kwargs)
            info.guess_encoding = guess_encoding
            source.ignore_space = bool(info.flags & VERBOSE)
            parsed = _parse_pattern(source, info)
            break
        except _UnscopedFlagSet:
            # Remember the global flags for the next attempt.
            global_flags = info.global_flags
        except error as e:
            caught_exception = e

        # Raise a fresh error from here, outside the except clause, rather
        # than letting the parser's own exception propagate.
        if caught_exception is not None:
            raise error(caught_exception.msg, caught_exception.pattern,
              caught_exception.pos)

    # NOTE(review): this listing stops after the parse loop, so a successful
    # parse falls off the end and returns None. The remaining compilation
    # steps appear to be missing from this excerpt - confirm against the
    # complete source before relying on it.
Example #2
0
def _compile(pattern, flags=0, kwargs=None):
    """Compile a regular expression to a PatternObject.

    Args:
        pattern: The pattern as ``str`` or ``bytes``, or an already-compiled
            pattern object (returned unchanged).
        flags: Bitwise OR of the module's flag constants.
        kwargs: Mapping from named-list names to iterables of their values.
            ``None`` (the default) is treated as an empty mapping.

    Returns:
        The compiled pattern object. Unless the DEBUG flag is set, it is
        looked up in, and stored into, the module-level cache.

    Raises:
        TypeError: If ``pattern`` is neither a string nor a compiled pattern.
        ValueError: If flags are supplied together with a compiled pattern,
            if the version or encoding flags conflict, or if UNICODE is used
            with a bytes pattern.
        error: If the pattern cannot be parsed, has an unbalanced
            parenthesis, or a named list required by a previously seen
            pattern is missing from ``kwargs``.
    """
    # Use a fresh dict per call; a mutable default would be one object shared
    # by every call and it is handed on to _Info below.
    if kwargs is None:
        kwargs = {}

    # We won't bother to cache the pattern if we're debugging.
    debugging = (flags & DEBUG) != 0

    # What locale is this pattern using?
    locale_key = (type(pattern), pattern)
    if _locale_sensitive.get(locale_key, True) or (flags & LOCALE) != 0:
        # This pattern is, or might be, locale-sensitive.
        pattern_locale = _getlocale()[1]
    else:
        # This pattern is definitely not locale-sensitive.
        pattern_locale = None

    if not debugging:
        try:
            # Do we know what keyword arguments are needed?
            args_key = pattern, type(pattern), flags
            args_needed = _named_args[args_key]

            # Are we being provided with its required keyword arguments?
            args_supplied = set()
            if args_needed:
                for k, _ in args_needed:
                    try:
                        args_supplied.add((k, frozenset(kwargs[k])))
                    except KeyError:
                        # Not a KeyError, so it escapes the outer handler too.
                        raise error("missing named list: {!r}".format(k))

            args_supplied = frozenset(args_supplied)

            # Have we already seen this regular expression and named list?
            pattern_key = (pattern, type(pattern), flags, args_supplied,
              DEFAULT_VERSION, pattern_locale)
            return _cache[pattern_key]
        except KeyError:
            # It's a new pattern, or new named list for a known pattern.
            pass

    # Guess the encoding from the class of the pattern string.
    if isinstance(pattern, str):
        guess_encoding = UNICODE
    elif isinstance(pattern, bytes):
        guess_encoding = ASCII
    elif isinstance(pattern, _pattern_type):
        if flags:
            raise ValueError("cannot process flags argument with a compiled pattern")

        return pattern
    else:
        raise TypeError("first argument must be a string or compiled pattern")

    # Set the default version in the core code in case it has been changed.
    _regex_core.DEFAULT_VERSION = DEFAULT_VERSION

    caught_exception = None
    global_flags = flags

    # Parse repeatedly: each _UnscopedFlagSet restarts the parse with the
    # global flags recorded so far.
    while True:
        try:
            source = _Source(pattern)
            info = _Info(global_flags, source.char_type, kwargs)
            info.guess_encoding = guess_encoding
            source.ignore_space = bool(info.flags & VERBOSE)
            parsed = _parse_pattern(source, info)
            break
        except _UnscopedFlagSet:
            # Remember the global flags for the next attempt.
            global_flags = info.global_flags
        except error as e:
            caught_exception = e

        # Raise a fresh error from here, outside the except clause, so it is
        # not implicitly chained to the parser's own exception.
        if caught_exception is not None:
            raise error(caught_exception.msg, caught_exception.pattern,
              caught_exception.pos)

    if not source.at_end():
        raise error("unbalanced parenthesis", pattern, source.pos)

    # Check the global flags for conflicts.
    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION
    if version not in (0, VERSION0, VERSION1):
        raise ValueError("VERSION0 and VERSION1 flags are mutually incompatible")

    if (info.flags & _ALL_ENCODINGS) not in (0, ASCII, LOCALE, UNICODE):
        raise ValueError("ASCII, LOCALE and UNICODE flags are mutually incompatible")

    if isinstance(pattern, bytes) and (info.flags & UNICODE):
        raise ValueError("cannot use UNICODE flag with a bytes pattern")

    # No explicit encoding flag: fall back to one based on the pattern type.
    if not (info.flags & _ALL_ENCODINGS):
        if isinstance(pattern, str):
            info.flags |= UNICODE
        else:
            info.flags |= ASCII

    reverse = bool(info.flags & REVERSE)
    fuzzy = isinstance(parsed, _Fuzzy)

    # Remember whether this pattern has an inline locale flag.
    _locale_sensitive[locale_key] = info.inline_locale

    # Fix the group references.
    try:
        parsed.fix_groups(pattern, reverse, False)
    except error as e:
        caught_exception = e

    # As above, re-raise as a fresh error outside the except clause.
    if caught_exception is not None:
        raise error(caught_exception.msg, caught_exception.pattern,
          caught_exception.pos)

    # Should we print the parsed pattern?
    if debugging:
        parsed.dump(indent=0, reverse=reverse)

    # Optimise the parsed pattern.
    parsed = parsed.optimise(info)
    parsed = parsed.pack_characters(info)

    # Get the required string.
    req_offset, req_chars, req_flags = _get_required_string(parsed, info.flags)

    # Build the named lists.
    named_lists = {}
    named_list_indexes = [None] * len(info.named_lists_used)
    args_needed = set()
    for key, index in info.named_lists_used.items():
        name, case_flags = key
        values = frozenset(kwargs[name])
        if case_flags:
            items = frozenset(_fold_case(info, v) for v in values)
        else:
            items = values
        # The original values are kept by name; the (possibly case-folded)
        # items are stored by index.
        named_lists[name] = values
        named_list_indexes[index] = items
        args_needed.add((name, values))

    # Check the features of the groups.
    _check_group_features(info, parsed)

    # Compile the parsed pattern. The result is a list of tuples.
    code = parsed.compile(reverse)

    # Is there a group call to the pattern as a whole?
    key = (0, reverse, fuzzy)
    ref = info.call_refs.get(key)
    if ref is not None:
        code = [(_OP.CALL_REF, ref)] + code + [(_OP.END, )]

    # Add the final 'success' opcode.
    code += [(_OP.SUCCESS, )]

    # Compile the additional copies of the groups that we need.
    for group, rev, fuz in info.additional_groups:
        code += group.compile(rev, fuz)

    # Flatten the code into a list of ints.
    code = _flatten_code(code)

    if not parsed.has_simple_start():
        # Get the first set, if possible.
        try:
            fs_code = _compile_firstset(info, parsed.get_firstset(reverse))
            fs_code = _flatten_code(fs_code)
            code = fs_code + code
        except _FirstSetError:
            # No first set is available; the code is used without one.
            pass

    # The named capture groups, inverted to map group_index values to names.
    index_group = {v: n for n, v in info.group_index.items()}

    # Create the PatternObject.
    #
    # Local flags like IGNORECASE affect the code generation, but aren't needed
    # by the PatternObject itself. Conversely, global flags like LOCALE _don't_
    # affect the code generation but _are_ needed by the PatternObject.
    compiled_pattern = _regex.compile(pattern, info.flags | version, code,
      info.group_index, index_group, named_lists, named_list_indexes,
      req_offset, req_chars, req_flags, info.group_count)

    # Do we need to reduce the size of the cache?
    if len(_cache) >= _MAXCACHE:
        with _cache_lock:
            _shrink_cache(_cache, _named_args, _locale_sensitive, _MAXCACHE)

    if not debugging:
        # Only patterns that ended up with the LOCALE flag are keyed by locale.
        if (info.flags & LOCALE) == 0:
            pattern_locale = None

        args_needed = frozenset(args_needed)

        # Store this regular expression and named list.
        pattern_key = (pattern, type(pattern), flags, args_needed,
          DEFAULT_VERSION, pattern_locale)
        _cache[pattern_key] = compiled_pattern

        # Store what keyword arguments are needed.
        _named_args[args_key] = args_needed

    return compiled_pattern