def _compile_replacement_helper(pattern, template):
    """Compile a replacement template, using the replacement cache.

    This function is called by the _regex module.

    The template is scanned character by character.  Runs of literal
    characters are collected and emitted as a single str (for a str
    template) or bytes (for a bytes template) item; each backslash escape
    is handed to '_compile_replacement', which yields either group
    reference items or more literal character codes.

    Returns the list of compiled items.  The same list object is stored in
    '_replacement_cache' and returned again on later calls with the same
    pattern, flags and template.
    """
    cache_key = (pattern.pattern, pattern.flags, template)
    cached = _replacement_cache.get(cache_key)
    if cached is not None:
        # An empty list is a valid cached result, hence the 'is not None'.
        return cached

    # The cache is emptied wholesale once it has reached its limit.
    if len(_replacement_cache) >= _MAXREPCACHE:
        _replacement_cache.clear()

    is_unicode = isinstance(template, str)
    source = _Source(template)

    def make_string(char_codes):
        # Build a literal of the same type as the template.
        if is_unicode:
            return "".join(chr(c) for c in char_codes)
        return bytes(char_codes)

    compiled = []
    pending = []  # Character codes of the literal currently being built.

    def flush_literal():
        # Emit the pending literal, if there is one, and start a new one.
        if pending:
            compiled.append(make_string(pending))
            pending.clear()

    ch = source.get()
    while ch:
        if ch != "\\":
            pending.append(ord(ch))
        else:
            # '_compile_replacement' will return either an int group
            # reference or a string literal. It returns items (plural) in
            # order to handle a 2-character literal (an invalid escape
            # sequence).
            is_group, items = _compile_replacement(source, pattern,
              is_unicode)
            if is_group:
                # The literal text seen so far must precede the group.
                flush_literal()
                compiled.extend(items)
            else:
                pending.extend(items)

        ch = source.get()

    # Emit whatever literal text follows the last group reference.
    flush_literal()

    _replacement_cache[cache_key] = compiled

    return compiled
def _compile(pattern, flags, ignore_unused, kwargs):
    """Compile a regular expression to a PatternObject.

    Args:
        pattern: The pattern as a str or bytes, or an already compiled
            Pattern (which is returned unchanged, provided flags is 0).
        flags: The flags to compile the pattern with.
        ignore_unused: If true, keyword arguments that do not name a named
            list used by the pattern are tolerated instead of rejected.
        kwargs: Mapping of named list name to an iterable of its values.

    Returns:
        The compiled pattern, either freshly built by '_regex.compile' or
        taken from the cache.

    Raises:
        error: If the pattern cannot be parsed, or a named list required by
            a previously seen pattern is not supplied.
        ValueError: If flags are given with a compiled pattern, the flags
            conflict, or an unused keyword argument is supplied while
            ignore_unused is false.
        TypeError: If pattern is not a string or a compiled pattern.
    """
    global DEFAULT_VERSION
    try:
        from regex import DEFAULT_VERSION
    except ImportError:
        pass

    # We won't bother to cache the pattern if we're debugging.
    debugging = (flags & DEBUG) != 0

    # What locale is this pattern using?
    locale_key = (type(pattern), pattern)
    if _locale_sensitive.get(locale_key, True) or (flags & LOCALE) != 0:
        # This pattern is, or might be, locale-sensitive.
        pattern_locale = _getpreferredencoding()
    else:
        # This pattern is definitely not locale-sensitive.
        pattern_locale = None

    def reject_unused_kwargs(used_names):
        # Any unused keyword arguments, possibly resulting from a typo?
        if ignore_unused:
            return

        unused_kwargs = set(kwargs) - set(used_names)
        if unused_kwargs:
            any_one = next(iter(unused_kwargs))
            raise ValueError('unused keyword argument {!a}'.format(any_one))

    if not debugging:
        try:
            # Do we know what keyword arguments are needed?
            args_key = pattern, type(pattern), flags
            args_needed = _named_args[args_key]

            # Are we being provided with its required keyword arguments?
            args_supplied = set()
            if args_needed:
                for k, _ in args_needed:
                    try:
                        args_supplied.add((k, frozenset(kwargs[k])))
                    except KeyError:
                        raise error("missing named list: {!r}".format(k))

            # Apply the same unused-argument check as a full compile does,
            # so that the outcome doesn't depend on whether the pattern
            # happens to be in the cache. At this point 'args_supplied'
            # holds exactly one entry per required named list.
            reject_unused_kwargs(k for k, _ in args_supplied)

            args_supplied = frozenset(args_supplied)

            # Have we already seen this regular expression and named list?
            pattern_key = (pattern, type(pattern), flags, args_supplied,
              DEFAULT_VERSION, pattern_locale)
            return _cache[pattern_key]
        except KeyError:
            # It's a new pattern, or new named list for a known pattern.
            pass

    # Guess the encoding from the class of the pattern string.
    if isinstance(pattern, str):
        guess_encoding = UNICODE
    elif isinstance(pattern, bytes):
        guess_encoding = ASCII
    elif isinstance(pattern, Pattern):
        if flags:
            raise ValueError(
              "cannot process flags argument with a compiled pattern")

        return pattern
    else:
        raise TypeError("first argument must be a string or compiled pattern")

    # Set the default version in the core code in case it has been changed.
    _regex_core.DEFAULT_VERSION = DEFAULT_VERSION

    global_flags = flags

    # Parsing is retried whenever the parser reports, via _UnscopedFlagSet,
    # that the global flags have changed.
    while True:
        caught_exception = None
        try:
            source = _Source(pattern)
            info = _Info(global_flags, source.char_type, kwargs)
            info.guess_encoding = guess_encoding
            source.ignore_space = bool(info.flags & VERBOSE)
            parsed = _parse_pattern(source, info)
            break
        except _UnscopedFlagSet:
            # Remember the global flags for the next attempt.
            global_flags = info.global_flags
        except error as e:
            caught_exception = e

        # The error is re-raised outside the 'except' block, as a new
        # exception built from the caught one's details.
        if caught_exception:
            raise error(caught_exception.msg, caught_exception.pattern,
              caught_exception.pos)

    if not source.at_end():
        raise error("unbalanced parenthesis", pattern, source.pos)

    # Check the global flags for conflicts.
    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION
    if version not in (0, VERSION0, VERSION1):
        raise ValueError(
          "VERSION0 and VERSION1 flags are mutually incompatible")

    if (info.flags & _ALL_ENCODINGS) not in (0, ASCII, LOCALE, UNICODE):
        raise ValueError(
          "ASCII, LOCALE and UNICODE flags are mutually incompatible")

    if isinstance(pattern, bytes) and (info.flags & UNICODE):
        raise ValueError("cannot use UNICODE flag with a bytes pattern")

    # No explicit encoding flag: fall back on the type of the pattern.
    if not (info.flags & _ALL_ENCODINGS):
        if isinstance(pattern, str):
            info.flags |= UNICODE
        else:
            info.flags |= ASCII

    reverse = bool(info.flags & REVERSE)
    fuzzy = isinstance(parsed, _Fuzzy)

    # Remember whether this pattern has an inline locale flag.
    _locale_sensitive[locale_key] = info.inline_locale

    # Fix the group references.
    caught_exception = None
    try:
        parsed.fix_groups(pattern, reverse, False)
    except error as e:
        caught_exception = e

    if caught_exception:
        raise error(caught_exception.msg, caught_exception.pattern,
          caught_exception.pos)

    # Should we print the parsed pattern?
    if flags & DEBUG:
        parsed.dump(indent=0, reverse=reverse)

    # Optimise the parsed pattern.
    parsed = parsed.optimise(info, reverse)
    parsed = parsed.pack_characters(info)

    # Get the required string.
    req_offset, req_chars, req_flags = _get_required_string(parsed,
      info.flags)

    # Build the named lists.
    named_lists = {}
    named_list_indexes = [None] * len(info.named_lists_used)
    args_needed = set()
    for key, index in info.named_lists_used.items():
        name, case_flags = key
        values = frozenset(kwargs[name])
        if case_flags:
            items = frozenset(_fold_case(info, v) for v in values)
        else:
            items = values
        named_lists[name] = values
        named_list_indexes[index] = items
        args_needed.add((name, values))

    # Any unused keyword arguments, possibly resulting from a typo?
    reject_unused_kwargs(named_lists)

    # Check the features of the groups.
    _check_group_features(info, parsed)

    # Compile the parsed pattern. The result is a list of tuples.
    code = parsed.compile(reverse)

    # Is there a group call to the pattern as a whole?
    key = (0, reverse, fuzzy)
    ref = info.call_refs.get(key)
    if ref is not None:
        code = [(_OP.CALL_REF, ref)] + code + [(_OP.END, )]

    # Add the final 'success' opcode.
    code += [(_OP.SUCCESS, )]

    # Compile the additional copies of the groups that we need.
    for group, rev, fuz in info.additional_groups:
        code += group.compile(rev, fuz)

    # Flatten the code into a list of ints.
    code = _flatten_code(code)

    if not parsed.has_simple_start():
        # Get the first set, if possible.
        try:
            fs_code = _compile_firstset(info, parsed.get_firstset(reverse))
            fs_code = _flatten_code(fs_code)
            code = fs_code + code
        except _FirstSetError:
            pass

    # The named capture groups.
    index_group = {v: n for n, v in info.group_index.items()}

    # Create the PatternObject.
    #
    # Local flags like IGNORECASE affect the code generation, but aren't needed
    # by the PatternObject itself. Conversely, global flags like LOCALE _don't_
    # affect the code generation but _are_ needed by the PatternObject.
    compiled_pattern = _regex.compile(pattern, info.flags | version, code,
      info.group_index, index_group, named_lists, named_list_indexes,
      req_offset, req_chars, req_flags, info.group_count)

    # Do we need to reduce the size of the cache?
    if len(_cache) >= _MAXCACHE:
        with _cache_lock:
            _shrink_cache(_cache, _named_args, _locale_sensitive, _MAXCACHE)

    if not debugging:
        # The locale only forms part of the cache key if the pattern
        # actually ended up with the LOCALE flag.
        if (info.flags & LOCALE) == 0:
            pattern_locale = None

        args_needed = frozenset(args_needed)

        # Store this regular expression and named list.
        pattern_key = (pattern, type(pattern), flags, args_needed,
          DEFAULT_VERSION, pattern_locale)
        _cache[pattern_key] = compiled_pattern

        # Store what keyword arguments are needed.
        _named_args[args_key] = args_needed

    return compiled_pattern