コード例 #1
0
def _compile_replacement_helper(pattern, template):
    """Compile a replacement template into a list of literals and group items.

    The result is memoised in ``_replacement_cache`` under the pattern's
    source, the pattern's flags and the template itself. When the cache has
    reached ``_MAXREPCACHE`` entries it is emptied before the new entry is
    added.

    Returns a list in which each run of literal characters has been joined
    into a single ``str`` (for a ``str`` template) or ``bytes`` (otherwise),
    interleaved with the group items produced by ``_compile_replacement``.
    """
    # NOTE: the _regex module calls this function.

    cache_key = (pattern.pattern, pattern.flags, template)
    cached = _replacement_cache.get(cache_key)
    if cached is not None:
        return cached

    # Cache full? Throw everything away and start again.
    if len(_replacement_cache) >= _MAXREPCACHE:
        _replacement_cache.clear()

    is_unicode = isinstance(template, str)
    source = _Source(template)

    def make_string(char_codes):
        # Literal runs are gathered as character codes; turn a run back into
        # the same kind of string as the template.
        if is_unicode:
            return "".join(map(chr, char_codes))
        return bytes(char_codes)

    parts = []
    pending = []
    ch = source.get()
    while ch:
        if ch != "\\":
            pending.append(ord(ch))
        else:
            # '_compile_replacement' hands back either an int group reference
            # or a string literal. The items come as a sequence so that a
            # 2-character literal (an invalid escape sequence) can be
            # returned.
            is_group, items = _compile_replacement(source, pattern, is_unicode)
            if not is_group:
                pending.extend(items)
            else:
                # Emit the literal collected so far ahead of the group.
                if pending:
                    parts.append(make_string(pending))
                    pending = []
                parts.extend(items)
        ch = source.get()

    # Emit whatever literal text trails the last group.
    if pending:
        parts.append(make_string(pending))

    _replacement_cache[cache_key] = parts

    return parts
コード例 #2
0
def _compile(pattern, flags, ignore_unused, kwargs):
    """Compile a regular expression to a PatternObject.

    Compiled patterns are cached (unless the DEBUG flag is set) under a key
    built from the pattern, its type, the flags, the named lists supplied,
    ``DEFAULT_VERSION`` and the locale in use.

    Args:
        pattern: A ``str`` or ``bytes`` pattern, or an already-compiled
            ``Pattern`` (which is returned unchanged).
        flags: The flag bits passed by the caller.
        ignore_unused: If true, keyword arguments that the pattern does not
            use as named lists are silently ignored.
        kwargs: Mapping of named-list name to an iterable of its values.

    Returns:
        The compiled pattern: either a cached one, ``pattern`` itself if it
        was already compiled, or the new object built by ``_regex.compile``.

    Raises:
        error: If a named list required by a known pattern is missing, if the
            pattern fails to parse, or if text is left over after parsing
            ("unbalanced parenthesis").
        ValueError: If flags are given with a compiled pattern, if the
            version or encoding flags conflict, if UNICODE is used with a
            bytes pattern, or if there is an unused keyword argument and
            ``ignore_unused`` is false.
        TypeError: If ``pattern`` is not a string or a compiled pattern.
    """

    global DEFAULT_VERSION
    try:
        from regex import DEFAULT_VERSION
    except ImportError:
        pass

    # We won't bother to cache the pattern if we're debugging.
    debugging = (flags & DEBUG) != 0

    # What locale is this pattern using?
    locale_key = (type(pattern), pattern)
    if _locale_sensitive.get(locale_key, True) or (flags & LOCALE) != 0:
        # This pattern is, or might be, locale-sensitive.
        pattern_locale = _getpreferredencoding()
    else:
        # This pattern is definitely not locale-sensitive.
        pattern_locale = None

    if not debugging:
        try:
            # Do we know what keyword arguments are needed?
            args_key = pattern, type(pattern), flags
            args_needed = _named_args[args_key]

            # Are we being provided with its required keyword arguments?
            args_supplied = set()
            if args_needed:
                for k, v in args_needed:
                    try:
                        args_supplied.add((k, frozenset(kwargs[k])))
                    except KeyError:
                        raise error("missing named list: {!r}".format(k))

            # Any unused keyword arguments, possibly resulting from a typo?
            # This has to be checked here as well as after a fresh compile,
            # otherwise a stray keyword argument would be rejected the first
            # time a pattern is compiled but silently accepted on every
            # later call that is served from the cache.
            if not ignore_unused:
                unused_kwargs = set(kwargs) - {k for k, v in args_needed}
                if unused_kwargs:
                    any_one = next(iter(unused_kwargs))
                    raise ValueError(
                        'unused keyword argument {!a}'.format(any_one))

            args_supplied = frozenset(args_supplied)

            # Have we already seen this regular expression and named list?
            pattern_key = (pattern, type(pattern), flags, args_supplied,
                           DEFAULT_VERSION, pattern_locale)
            return _cache[pattern_key]
        except KeyError:
            # It's a new pattern, or new named list for a known pattern.
            pass

    # Guess the encoding from the class of the pattern string.
    if isinstance(pattern, str):
        guess_encoding = UNICODE
    elif isinstance(pattern, bytes):
        guess_encoding = ASCII
    elif isinstance(pattern, Pattern):
        if flags:
            raise ValueError(
                "cannot process flags argument with a compiled pattern")

        return pattern
    else:
        raise TypeError("first argument must be a string or compiled pattern")

    # Set the default version in the core code in case it has been changed.
    _regex_core.DEFAULT_VERSION = DEFAULT_VERSION

    global_flags = flags

    # Parse the pattern, starting again with the updated global flags each
    # time the parser signals _UnscopedFlagSet.
    while True:
        caught_exception = None
        try:
            source = _Source(pattern)
            info = _Info(global_flags, source.char_type, kwargs)
            info.guess_encoding = guess_encoding
            source.ignore_space = bool(info.flags & VERBOSE)
            parsed = _parse_pattern(source, info)
            break
        except _UnscopedFlagSet:
            # Remember the global flags for the next attempt.
            global_flags = info.global_flags
        except error as e:
            caught_exception = e

        # The replacement error is raised outside the handler, so it carries
        # no implicit exception context from the original one.
        if caught_exception:
            raise error(caught_exception.msg, caught_exception.pattern,
                        caught_exception.pos)

    if not source.at_end():
        raise error("unbalanced parenthesis", pattern, source.pos)

    # Check the global flags for conflicts.
    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION
    if version not in (0, VERSION0, VERSION1):
        raise ValueError(
            "VERSION0 and VERSION1 flags are mutually incompatible")

    if (info.flags & _ALL_ENCODINGS) not in (0, ASCII, LOCALE, UNICODE):
        raise ValueError(
            "ASCII, LOCALE and UNICODE flags are mutually incompatible")

    if isinstance(pattern, bytes) and (info.flags & UNICODE):
        raise ValueError("cannot use UNICODE flag with a bytes pattern")

    # No explicit encoding flag: fall back on the type of the pattern.
    if not (info.flags & _ALL_ENCODINGS):
        if isinstance(pattern, str):
            info.flags |= UNICODE
        else:
            info.flags |= ASCII

    reverse = bool(info.flags & REVERSE)
    fuzzy = isinstance(parsed, _Fuzzy)

    # Remember whether this pattern has an inline locale flag.
    _locale_sensitive[locale_key] = info.inline_locale

    # Fix the group references.
    caught_exception = None
    try:
        parsed.fix_groups(pattern, reverse, False)
    except error as e:
        caught_exception = e

    # As above, re-raise outside the handler.
    if caught_exception:
        raise error(caught_exception.msg, caught_exception.pattern,
                    caught_exception.pos)

    # Should we print the parsed pattern?
    if debugging:
        parsed.dump(indent=0, reverse=reverse)

    # Optimise the parsed pattern.
    parsed = parsed.optimise(info, reverse)
    parsed = parsed.pack_characters(info)

    # Get the required string.
    req_offset, req_chars, req_flags = _get_required_string(parsed, info.flags)

    # Build the named lists.
    named_lists = {}
    named_list_indexes = [None] * len(info.named_lists_used)
    args_needed = set()
    for key, index in info.named_lists_used.items():
        name, case_flags = key
        values = frozenset(kwargs[name])
        if case_flags:
            items = frozenset(_fold_case(info, v) for v in values)
        else:
            items = values
        named_lists[name] = values
        named_list_indexes[index] = items
        args_needed.add((name, values))

    # Any unused keyword arguments, possibly resulting from a typo?
    unused_kwargs = set(kwargs) - set(named_lists)
    if unused_kwargs and not ignore_unused:
        any_one = next(iter(unused_kwargs))
        raise ValueError('unused keyword argument {!a}'.format(any_one))

    # Check the features of the groups.
    _check_group_features(info, parsed)

    # Compile the parsed pattern. The result is a list of tuples.
    code = parsed.compile(reverse)

    # Is there a group call to the pattern as a whole?
    key = (0, reverse, fuzzy)
    ref = info.call_refs.get(key)
    if ref is not None:
        code = [(_OP.CALL_REF, ref)] + code + [(_OP.END, )]

    # Add the final 'success' opcode.
    code += [(_OP.SUCCESS, )]

    # Compile the additional copies of the groups that we need.
    for group, rev, fuz in info.additional_groups:
        code += group.compile(rev, fuz)

    # Flatten the code into a list of ints.
    code = _flatten_code(code)

    if not parsed.has_simple_start():
        # Get the first set, if possible.
        try:
            fs_code = _compile_firstset(info, parsed.get_firstset(reverse))
            fs_code = _flatten_code(fs_code)
            code = fs_code + code
        except _FirstSetError:
            pass

    # The named capture groups, mapped from group index back to name.
    index_group = {v: n for n, v in info.group_index.items()}

    # Create the PatternObject.
    #
    # Local flags like IGNORECASE affect the code generation, but aren't needed
    # by the PatternObject itself. Conversely, global flags like LOCALE _don't_
    # affect the code generation but _are_ needed by the PatternObject.
    compiled_pattern = _regex.compile(pattern, info.flags | version, code,
                                      info.group_index, index_group,
                                      named_lists, named_list_indexes,
                                      req_offset, req_chars, req_flags,
                                      info.group_count)

    # Do we need to reduce the size of the cache?
    if len(_cache) >= _MAXCACHE:
        with _cache_lock:
            _shrink_cache(_cache, _named_args, _locale_sensitive, _MAXCACHE)

    if not debugging:
        # Only a pattern with the LOCALE flag is keyed on the locale.
        if (info.flags & LOCALE) == 0:
            pattern_locale = None

        args_needed = frozenset(args_needed)

        # Store this regular expression and named list.
        pattern_key = (pattern, type(pattern), flags, args_needed,
                       DEFAULT_VERSION, pattern_locale)
        _cache[pattern_key] = compiled_pattern

        # Store what keyword arguments are needed.
        _named_args[args_key] = args_needed

    return compiled_pattern