def compile_regexp_to_noncapturing(pattern, flags=0):
    """
    Convert all grouping parentheses in the given regexp pattern to
    non-capturing groups, and return the compiled result.  E.g. the
    pattern 'ab(c(x+)(z*))?d' is compiled as 'ab(?:c(?:x+)(?:z*))?d'.

    :type pattern: str
    :param flags: regex flags passed through to the compiler.
    :return: the compiled pattern object.
    :raises ValueError: if *pattern* contains a back-reference (it would
        need the capturing group that this function removes).
    """
    def convert_regexp_to_noncapturing_parsed(parsed_pattern):
        res_data = []
        for key, value in parsed_pattern.data:
            if key == sre_constants.SUBPATTERN:
                index, subpattern = value
                # Recurse into the subpattern; a group index of None makes
                # the group non-capturing.  (Bug fix: the original called
                # the undefined name 'convert_regexp_to_noncapturing',
                # raising NameError on the first nested group.)
                value = (None, convert_regexp_to_noncapturing_parsed(subpattern))
            elif key == sre_constants.GROUPREF:
                raise ValueError('Regular expressions with back-references are not supported: {0}'.format(pattern))
            res_data.append((key, value))
        parsed_pattern.data = res_data
        # The only remaining group is group 0 (the whole match).
        parsed_pattern.pattern.groups = 1
        parsed_pattern.pattern.groupdict = {}
        return parsed_pattern

    # Bug fix: honor the 'flags' parameter (previously ignored).
    return sre_compile.compile(convert_regexp_to_noncapturing_parsed(sre_parse.parse(pattern)), flags=flags)
def __init__(self, lexicons, init_state=None, flags=0):
    """Build one compiled scanner per start state.

    *lexicons* maps a start-state name to a list of (phrase, action)
    pairs; each lexicon is combined into one branched pattern whose
    capturing group N identifies phrase N (groups number from 1).
    """
    # All the regexp magic below is copied from re.Scanner from
    # the standard library.
    import sre_compile
    import sre_parse
    from sre_constants import BRANCH, SUBPATTERN
    if init_state is None:
        init_state = State()  # presumably a project-defined state class -- defined elsewhere
    if not hasattr(init_state, 'start'):
        init_state.start = None
    self.init_state = init_state
    self.lexicons = lexicons
    self.scanners = {}
    # NOTE: .iteritems() makes this Python-2-only.
    for start, lexicon in lexicons.iteritems():
        # combine phrases into a compound pattern
        p, a = [], []
        s = sre_parse.Pattern()
        s.flags = flags
        for phrase, action in lexicon:
            p.append(sre_parse.SubPattern(s, [
                (SUBPATTERN, (len(p)+1, sre_parse.parse(phrase, flags))),
            ]))
            a.append(action)
        p = sre_parse.SubPattern(s, [(BRANCH, (None, p))])
        # NOTE(review): groups is set from the BRANCH SubPattern's length
        # (always 1 here), not the phrase count -- looks suspect, but
        # left as-is; confirm against the sibling implementations that
        # use len(p) + 1 before the reassignment.
        s.groups = len(p)
        # Store the bound match method plus the parallel action list.
        self.scanners[start] = sre_compile.compile(p).match, a
def __init__(self, lexicon, flags=FLAGS):
    """Compile the token lexicon into one branched scanner pattern.

    Capturing group idx+1 of a match maps back to self.actions[idx+1]
    (slot 0 is unused because match.lastindex counts from 1).
    """
    self.actions = [None]
    # combine phrases into a compound pattern
    s = sre_parse.Pattern()
    s.flags = flags
    p = []
    # The original carried a local generator re-implementing enumerate()
    # for Python 2.2 compatibility; it shadowed the builtin and is no
    # longer needed, so it (and a no-op "except: raise") was removed.
    for idx, token in enumerate(lexicon):
        phrase = token.pattern
        subpattern = sre_parse.SubPattern(
            s, [(SUBPATTERN, (idx + 1, sre_parse.parse(phrase, flags)))])
        p.append(subpattern)
        self.actions.append(token)
    s.groups = len(p) + 1  # NOTE(guido): Added to make SRE validation work
    p = sre_parse.SubPattern(s, [(BRANCH, (None, p))])
    self.scanner = sre_compile.compile(p)
def _compile(*key):
    # Internal: look up (pattern, flags) in the module cache, compiling
    # and caching on a miss.  The DEBUG flag bypasses the cache entirely.
    # NOTE: Python-2-only raise syntax below; this block predates Py3.
    pattern, flags = key
    bypass_cache = flags & DEBUG
    if not bypass_cache:
        # Key on the pattern's type too, so equal str/unicode patterns
        # do not collide in the cache.
        cachekey = (type(key[0]),) + key
        p = _cache.get(cachekey)
        if p is not None:
            return p
    if isinstance(pattern, _pattern_type):
        # Already compiled: flags cannot be re-applied.
        if flags:
            raise ValueError("Cannot process flags argument with a compiled pattern")
        return pattern
    else:
        if not sre_compile.isstring(pattern):
            raise TypeError, "first argument must be string or compiled pattern"
    try:
        p = sre_compile.compile(pattern, flags)
    except error as v:
        raise error, v
    if not bypass_cache:
        if len(_cache) >= _MAXCACHE:
            _cache.clear()  # crude eviction: drop the whole cache when full
        _cache[cachekey] = p
    return p
def _compile(pattern, flags):
    """Internal: return a compiled pattern for (pattern, flags), consulting
    and maintaining the module-level cache."""
    if isinstance(flags, RegexFlag):
        flags = flags.value
    cache_key = (type(pattern), pattern, flags)
    cached = _cache.get(cache_key)
    if cached is not None:
        return cached
    if isinstance(pattern, Pattern):
        if flags:
            raise ValueError(
                "cannot process flags argument with a compiled pattern")
        return pattern
    if not sre_compile.isstring(pattern):
        raise TypeError("first argument must be string or compiled pattern")
    compiled = sre_compile.compile(pattern, flags)
    if not (flags & DEBUG):
        if len(_cache) >= _MAXCACHE:
            # Evict the oldest entry (dicts preserve insertion order, so
            # the first key is the oldest).
            try:
                del _cache[next(iter(_cache))]
            except (StopIteration, RuntimeError, KeyError):
                pass
        _cache[cache_key] = compiled
    return compiled
def _compile(*key):
    """Internal: compile *key* = (pattern, flags), caching the result in
    re._cache keyed additionally on the pattern's taint."""
    pattern, flags = key
    taint = _get_taint(pattern)
    if taint is not None:
        taint = tuple(taint)  # sets are not hashable
    cachekey = (type(pattern), key, taint)
    cached = re._cache.get(cachekey)
    if cached is not None:
        return cached
    if isinstance(pattern, re._pattern_type):
        if flags:
            raise ValueError("Cannot process flags argument with"
                             " a compiled pattern")
        return pattern
    if not sre_compile.isstring(pattern):
        raise TypeError("first argument must be string or compiled"
                        " pattern")
    compiled = sre_compile.compile(pattern, flags)
    if len(re._cache) >= re._MAXCACHE:
        re._cache.clear()  # flush everything when the cache is full
    re._cache[cachekey] = compiled
    return compiled
def _compile(pattern, flags):
    # internal: compile pattern
    # Cache entries are (compiled, locale) pairs: a LOCALE-dependent
    # pattern is only valid for the LC_CTYPE locale it was compiled
    # under, so a cached entry is reused only while the locale matches.
    try:
        p, loc = _cache[type(pattern), pattern, flags]
        if loc is None or loc == _locale.setlocale(_locale.LC_CTYPE):
            return p
    except KeyError:
        pass
    if isinstance(pattern, _pattern_type):
        if flags:
            raise ValueError(
                "cannot process flags argument with a compiled pattern")
        return pattern
    if not sre_compile.isstring(pattern):
        raise TypeError("first argument must be string or compiled pattern")
    p = sre_compile.compile(pattern, flags)
    if not (flags & DEBUG):
        if len(_cache) >= _MAXCACHE:
            _cache.clear()
        if p.flags & LOCALE:
            if not _locale:
                # locale module unavailable: do not cache locale-dependent
                # patterns at all.
                return p
            loc = _locale.setlocale(_locale.LC_CTYPE)
        else:
            loc = None
        _cache[type(pattern), pattern, flags] = p, loc
    return p
def __init__(self, runtimePath) : self.availRuleNames = [] basePath = os.path.join(runtimePath, "rules") ruleFiles = os.listdir(basePath) rulePattern = sre_compile.compile("^(.*)\.py$") for eachRuleFile in ruleFiles : if os.path.isfile(os.path.join(basePath, eachRuleFile)) : ruleMatch = rulePattern.match(eachRuleFile) if ruleMatch != None and eachRuleFile.find("__init__") == -1 : ruleName = ruleMatch.group(1) self.availRuleNames.append(ruleName) self.availRuleCount = len(self.availRuleNames) self.availRuleModules = {} self.loadedRule = [] self.rules = [] self.preprocessRules = [] self.functionNameRules = [] self.functionScopeRules = [] self.typeNameRules = [] self.typeScopeRules = [] self.lineRules = [] self.fileEndRules = [] self.fileStartRules = [] self.projectRules = [] self.rollBackImporter = None
def Sub(self, pattern, repl, s):
    """Replace every match of *pattern* in *s* with *repl*, caching the
    compiled regexp in self._regexp_compile_cache.

    Example: Sub(r'\\d+', 'OOOO', 'a1234a') -> 'aOOOOa'
    """
    cache = self._regexp_compile_cache
    if pattern not in cache:
        cache[pattern] = sre_compile.compile(pattern)
    return cache[pattern].sub(repl, s)
def Match(self, pattern, s):
    """Matches the string with the pattern, caching the compiled regexp."""
    # The cache lookup stays inlined here (mirroring Search) because the
    # original found factoring it out noticeably expensive.
    try:
        compiled = self._regexp_compile_cache[pattern]
    except KeyError:
        compiled = self._regexp_compile_cache[pattern] = sre_compile.compile(pattern)
    return compiled.match(s)
def _compile_typed(text_bytes_type, pattern, flags):
    """Internal: compile *pattern* with *flags*.

    An already-compiled pattern is returned unchanged; combining it with
    flags is an error.  *text_bytes_type* only participates in the
    caller's cache key and is unused here.
    """
    if not isinstance(pattern, _pattern_type):
        if not sre_compile.isstring(pattern):
            raise TypeError("first argument must be string or compiled pattern")
        return sre_compile.compile(pattern, flags)
    if flags:
        raise ValueError(
            "Cannot process flags argument with a compiled pattern")
    return pattern
def __init__(self, lexicon, flags=0):
    """Combine the lexicon's (phrase, action) pairs into one branched
    pattern -- one capturing group per phrase -- and compile it."""
    from sre_constants import BRANCH, SUBPATTERN
    self.lexicon = lexicon
    state = sre_parse.Pattern()
    state.flags = flags
    branches = []
    for phrase, action in lexicon:
        group_number = len(branches) + 1  # group N identifies phrase N
        branches.append(sre_parse.SubPattern(
            state, [(SUBPATTERN, (group_number, sre_parse.parse(phrase, flags)))]))
    state.groups = len(branches) + 1  # + 1 accounts for group 0
    combined = sre_parse.SubPattern(state, [(BRANCH, (None, branches))])
    self.scanner = sre_compile.compile(combined)
def __init__(self, lexicon):
    """Compile *lexicon* -- (phrase, action) pairs -- into one branched
    scanner pattern with one capturing group per phrase."""
    from sre_constants import BRANCH, SUBPATTERN
    self.lexicon = lexicon
    # combine phrases into a compound pattern
    p = []
    s = sre_parse.Pattern()
    for phrase, action in lexicon:
        # Group numbers start at 1 (group 0 is the whole match), so the
        # original (len(p), ...) numbering produced an invalid group 0
        # for the first phrase and shifted every lastindex lookup.
        p.append(sre_parse.SubPattern(
            s, [(SUBPATTERN, (len(p) + 1, sre_parse.parse(phrase)))]))
    # Declare the group count (including group 0) before wrapping in the
    # BRANCH node; the original measured the single-item branch instead.
    s.groups = len(p) + 1
    p = sre_parse.SubPattern(s, [(BRANCH, (None, p))])
    self.scanner = sre_compile.compile(p)
def _compile(*key):
    # internal: compile pattern
    # NOTE(review): this fragment neither stores the newly compiled
    # pattern in _cache nor returns it -- it appears truncated here.
    # Python-2-only except/raise syntax below.
    p = _cache.get(key)
    if p is not None:
        return p
    pattern, flags = key
    if type(pattern) not in sre_compile.STRING_TYPES:
        # Assumed to be an already-compiled pattern: pass it through.
        return pattern
    try:
        p = sre_compile.compile(pattern, flags)
    except error, v:
        raise error, v # invalid expression
def _get_group_pattern(self,flags):
    """Compile self.lexicon into one branched pattern with a capturing
    group per phrase and return the compiled object."""
    state = sre_parse.Pattern()
    state.flags = flags
    phrase_subpatterns = []
    for phrase, action in self.lexicon:
        gid = len(phrase_subpatterns) + 1  # 1-based group numbers
        phrase_subpatterns.append(sre_parse.SubPattern(state, [
            (SUBPATTERN, (gid, sre_parse.parse(phrase, flags))),
        ]))
    state.groups = len(phrase_subpatterns) + 1
    branch = sre_parse.SubPattern(
        state, [(BRANCH, (None, phrase_subpatterns))])
    return sre_compile.compile(branch)
def __init__(self, lexicon, flags=0):
    """Compile *lexicon* -- (phrase, action) pairs -- into one branched
    scanner pattern; capturing group N corresponds to phrase N."""
    self.lexicon = lexicon
    state = sre_parse.Pattern()
    state.flags = flags
    phrase_patterns = []
    for phrase, action in lexicon:
        group_number = len(phrase_patterns) + 1
        phrase_patterns.append(sre_parse.SubPattern(state, [
            (SUBPATTERN, (group_number, sre_parse.parse(phrase, flags))),
        ]))
    state.groups = len(phrase_patterns) + 1  # includes group 0
    branched = sre_parse.SubPattern(
        state, [(BRANCH, (None, phrase_patterns))])
    self.scanner = sre_compile.compile(branched)
def _compile(*key):
    # Internal: compile (pattern, flags).  Python-2-only raise syntax.
    # NOTE(review): appears truncated -- the compiled pattern is never
    # cached or returned on success.
    p = _cache.get(key)
    if (p is not None):
        return p
    (pattern, flags,) = key
    if (type(pattern) is _pattern_type):
        # Already compiled: return as-is.
        return pattern
    if (type(pattern) not in sre_compile.STRING_TYPES):
        raise TypeError, 'first argument must be string or compiled pattern'
    try:
        p = sre_compile.compile(pattern, flags)
    except error, v:
        raise error, v
def __init__(self, regexp, negative=False, **property_names):
    """
    Create a new C{RegexpTokenizer} from a given regular expression.

    @type regexp: C{string} or C{SRE_Pattern}
    @param regexp: The regular expression used to tokenized texts.
        Unless C{negative} is true, this regular expression
        specifies the form of a single word type; so the list of
        tokens generated by tokenization includes all non-overlapping
        substrings that match C{regexp}
    @type negative: C{boolean}
    @param negative: An optional parameter that inverts the
        meaning of C{regexp}.  In particular, if C{negative} is true,
        then C{regexp} is taken to specify the form of word
        separators (and not word types); so the list of tokens
        generated by tokenization includes all substrings that occur
        I{between} matches of the regular expression.
    @type property_names: C{dict}
    @param property_names: A dictionary that can be used to override
        the default property names.  Each entry maps from a
        default property name to a new property name.
    """
    assert chktype(1, regexp, str)
    AbstractTokenizer.__init__(self, **property_names)
    if hasattr(regexp, "pattern"):
        regexp = regexp.pattern  # accept a precompiled pattern object
    self._negative = bool(negative)
    # Replace any grouping parentheses with non-grouping ones.  We
    # need to do this, because the list returned by re.sub will
    # contain an element corresponding to every set of grouping
    # parentheses.  We must not touch escaped parentheses, and
    # need to handle the case of escaped escapes (e.g. "\\(").
    # We also need to handle nested parentheses, which means our
    # regexp contexts must be zero-width.  There are also issues with
    # parenthesis appearing in bracketed contexts, hence we've
    # operated on the intermediate parse structure from sre_parse.
    parsed = sre_parse.parse(regexp)
    parsed = _remove_group_identifiers(parsed)
    # Add grouping parentheses around the regexp; this will allow
    # us to access the material that was split on.
    # Need to set the Pattern to expect a single group
    pattern = sre_parse.Pattern()
    pattern.groups += 1
    grouped = sre_parse.SubPattern(pattern)
    grouped.append((sre_constants.SUBPATTERN, (1, parsed)))
    self._regexp = sre_compile.compile(grouped, re.UNICODE)
def _compile(*key):
    # internal: compile pattern
    # NOTE(review): fragment appears truncated -- the freshly compiled
    # pattern is neither cached nor returned.  Python-2-only syntax.
    p = _cache.get(key)
    if p is not None:
        return p
    pattern, flags = key
    if type(pattern) is _pattern_type:
        return pattern
    if type(pattern) not in sre_compile.STRING_TYPES:
        raise TypeError, "first argument must be string or compiled pattern"
    try:
        p = sre_compile.compile(pattern, flags)
    except error, v:
        raise error, v # invalid expression
def _compile(*key):
    # internal: compile pattern
    # NOTE(review): fragment appears truncated -- the compiled pattern is
    # never cached or returned on success.  Python-2-only syntax.
    p = _cache.get(key)
    if p is not None:
        return p
    pattern, flags = key
    if isinstance(pattern, _pattern_type):
        return pattern
    if not sre_compile.isstring(pattern):
        raise TypeError, "first argument must be string or compiled pattern"
    try:
        p = sre_compile.compile(pattern, flags)
    except error, v:
        raise error, v # invalid expression
def __init__(self, lexicon, flags=0):
    """Build a scanner: each phrase gets its own capturing group via
    opengroup/closegroup so match.lastindex identifies the rule."""
    from sre_constants import BRANCH, SUBPATTERN
    self.lexicon = lexicon
    # combine phrases into a compound pattern
    state = sre_parse.Pattern()
    state.flags = flags
    subpatterns = []
    for phrase, action in lexicon:
        gid = state.opengroup()
        subpatterns.append(sre_parse.SubPattern(state, [
            # operand layout: (group, add_flags, del_flags, parsed phrase)
            (SUBPATTERN, (gid, 0, 0, sre_parse.parse(phrase, flags))),
        ]))
        state.closegroup(gid, subpatterns[-1])
    combined = sre_parse.SubPattern(state, [(BRANCH, (None, subpatterns))])
    self.scanner = sre_compile.compile(combined)
def _compile(regexp):
    """Strip capturing-group identifiers from *regexp*, wrap the whole
    expression in one capturing group, and compile it."""
    parsed = _remove_group_identifiers(sre_parse.parse(regexp))
    # Add grouping parentheses around the regexp; this will allow
    # us to access the material that was split on.
    state = sre_parse.Pattern()
    state.groups += 1  # reserve the single outer group added below
    grouped = sre_parse.SubPattern(state)
    grouped.append((sre_constants.SUBPATTERN, (1, parsed)))
    return sre_compile.compile(
        grouped, re.UNICODE | re.MULTILINE | re.DOTALL)
def ReplaceAll(pattern, rep, s):
    """Replaces instances of pattern in a string with a replacement.

    The compiled regex is kept in a cache shared by Match and Search.

    Args:
      pattern: regex pattern
      rep: replacement text
      s: search string

    Returns:
      string with replacements made (or original string if no replacements)
    """
    try:
        compiled = _regexp_compile_cache[pattern]
    except KeyError:
        compiled = _regexp_compile_cache[pattern] = sre_compile.compile(pattern)
    return compiled.sub(rep, s)
def build_scanner(lexicon, flags=0):
    """Compile *lexicon* -- (phrase, action) pairs -- into one scanner
    pattern whose capturing group N corresponds to phrase N."""
    import sre_parse
    import sre_compile
    from sre_constants import BRANCH, SUBPATTERN
    # combine phrases into a compound pattern
    state = sre_parse.Pattern()
    state.flags = flags
    parts = []
    for phrase, action in lexicon:
        parts.append(sre_parse.SubPattern(state, [
            (SUBPATTERN, (len(parts) + 1, sre_parse.parse(phrase, flags))),
        ]))
    state.groups = len(parts) + 1  # includes group 0
    return sre_compile.compile(
        sre_parse.SubPattern(state, [(BRANCH, (None, parts))]))
def _compile(*key):
    # internal: compile pattern
    # NOTE(review): appears truncated -- no caching/return of p after
    # compilation.  Python-2-only raise syntax.
    cachekey = (type(key[0]),) + key
    p = _cache.get(cachekey)
    if p is not None:
        return p
    pattern, flags = key
    if isinstance(pattern, _pattern_type):
        if flags:
            raise ValueError('Cannot process flags argument with a compiled pattern')
        return pattern
    if not sre_compile.isstring(pattern):
        raise TypeError, "first argument must be string or compiled pattern"
    try:
        p = sre_compile.compile(pattern, flags)
    except error, v:
        raise error, v # invalid expression
def __init__(self, lexicon, flags=FLAGS):
    """Compile the token lexicon into one branched pattern; capturing
    group idx+1 maps a match back to self.actions[idx+1]."""
    self.actions = [None]  # slot 0 unused: lastindex counts from 1
    state = sre_parse.Pattern()
    state.flags = flags
    subpatterns = []
    for idx, token in enumerate(lexicon):
        phrase = token.pattern
        try:
            sub = sre_parse.SubPattern(state, [
                (SUBPATTERN, (idx + 1, sre_parse.parse(phrase, flags)))])
        except sre_constants.error:
            raise
        subpatterns.append(sub)
        self.actions.append(token)
    branch = sre_parse.SubPattern(state, [(BRANCH, (None, subpatterns))])
    self.scanner = sre_compile.compile(branch)
def _compile(pattern, flags):
    """Internal: compiled-pattern cache front-end for re.compile()."""
    cache_key = (type(pattern), pattern, flags)
    try:
        return _cache[cache_key]
    except KeyError:
        pass
    if isinstance(pattern, _pattern_type):
        if flags:
            raise ValueError(
                "Cannot process flags argument with a compiled pattern")
        return pattern
    if not sre_compile.isstring(pattern):
        raise TypeError("first argument must be string or compiled pattern")
    compiled = sre_compile.compile(pattern, flags)
    if len(_cache) >= _MAXCACHE:
        _cache.clear()  # crude eviction: drop everything when full
    _cache[cache_key] = compiled
    return compiled
def __init__(self, lexicon, flags=0):
    """Build a scanner from (phrase, action) pairs; each phrase gets its
    own capturing group so match.lastindex selects the action."""
    from sre_constants import BRANCH, SUBPATTERN
    if isinstance(flags, RegexFlag):
        flags = flags.value  # sre internals want the plain int value
    self.lexicon = lexicon
    # combine phrases into a compound pattern
    state = sre_parse.Pattern()
    state.flags = flags
    parts = []
    for phrase, action in lexicon:
        gid = state.opengroup()
        parts.append(sre_parse.SubPattern(state, [
            # operand layout: (group, add_flags, del_flags, parsed)
            (SUBPATTERN, (gid, 0, 0, sre_parse.parse(phrase, flags))),
        ]))
        state.closegroup(gid, parts[-1])
    alternation = sre_parse.SubPattern(state, [(BRANCH, (None, parts))])
    self.scanner = sre_compile.compile(alternation)
def _compile(*key):
    """Internal: compile (pattern, flags), memoized in _cache keyed on
    (type, pattern, flags)."""
    pattern, flags = key
    cachekey = (type(pattern),) + key
    cached = _cache.get(cachekey)
    if cached is not None:
        return cached
    if isinstance(pattern, _pattern_type):
        if flags:
            raise ValueError("Cannot process flags argument with a compiled pattern")
        return pattern
    if not sre_compile.isstring(pattern):
        raise TypeError("first argument must be string or compiled pattern")
    compiled = sre_compile.compile(pattern, flags)
    if len(_cache) >= _MAXCACHE:
        _cache.clear()  # flush when full
    _cache[cachekey] = compiled
    return compiled
def _compile(*key):
    """Internal: compile (pattern, flags) with caching."""
    cachekey = (type(key[0]),) + key
    hit = _cache.get(cachekey)
    if hit is not None:
        return hit
    pattern, flags = key
    if isinstance(pattern, _pattern_type):
        # Pre-compiled patterns pass straight through; extra flags are
        # meaningless at this point.
        if flags:
            raise ValueError(
                "Cannot process flags argument with a compiled pattern")
        return pattern
    if not sre_compile.isstring(pattern):
        raise TypeError("first argument must be string or compiled pattern")
    result = sre_compile.compile(pattern, flags)
    if len(_cache) >= _MAXCACHE:
        _cache.clear()
    _cache[cachekey] = result
    return result
def __init__(self, lexicon, flags=FLAGS):
    """Compile the token lexicon into a single alternation; capturing
    group i+1 maps a match back to self.actions[i+1]."""
    self.actions = [None]  # index 0 unused; lastindex counts from 1
    state = sre_parse.Pattern()
    state.flags = flags
    branches = []
    group_number = 0
    for token in lexicon:
        group_number += 1
        try:
            branch = sre_parse.SubPattern(
                state,
                [(SUBPATTERN, (group_number,
                               sre_parse.parse(token.pattern, flags)))])
        except sre_constants.error:
            raise
        branches.append(branch)
        self.actions.append(token)
    combined = sre_parse.SubPattern(state, [(BRANCH, (None, branches))])
    self.scanner = sre_compile.compile(combined)
def _compile(*key):
    # Internal: compile (pattern, flags), caching by (type, pattern, flags).
    # Python-2-only re-raise syntax below.
    cachekey = (type(key[0]),) + key
    p = _cache.get(cachekey)
    if p is not None:
        return p
    pattern, flags = key
    if isinstance(pattern, _pattern_type):
        # Already compiled: pass through unchanged.
        return pattern
    if not sre_compile.isstring(pattern):
        raise TypeError, 'first argument must be string or compiled pattern'
    try:
        p = sre_compile.compile(pattern, flags)
    except error as v:
        raise error, v
    if len(_cache) >= _MAXCACHE:
        _cache.clear()  # simple eviction: flush everything when full
    _cache[cachekey] = p
    return p
def _compile(pattern, flags):
    """Internal: memoized regex compilation; DEBUG bypasses the cache."""
    key = (type(pattern), pattern, flags)
    if not flags & DEBUG:
        cached = _cache.get(key)
        if cached is not None:
            return cached
    if isinstance(pattern, _pattern_type):
        if flags:
            raise ValueError('Cannot process flags argument with a compiled pattern')
        return pattern
    if not sre_compile.isstring(pattern):
        raise TypeError('first argument must be string or compiled pattern')
    compiled = sre_compile.compile(pattern, flags)
    if not flags & DEBUG:
        if len(_cache) >= _MAXCACHE:
            _cache.clear()
        _cache[key] = compiled
    return compiled
def _compile(pattern, flags):
    """Internal: compile pattern, consulting the module cache first;
    results are cached unless DEBUG is set."""
    key = (type(pattern), pattern, flags)
    hit = _cache.get(key)
    if hit is not None:
        return hit
    if isinstance(pattern, _pattern_type):
        if flags:
            raise ValueError(
                "cannot process flags argument with a compiled pattern")
        return pattern
    if not sre_compile.isstring(pattern):
        raise TypeError("first argument must be string or compiled pattern")
    compiled = sre_compile.compile(pattern, flags)
    if not (flags & DEBUG):
        if len(_cache) >= _MAXCACHE:
            _cache.clear()
        _cache[key] = compiled
    return compiled
def _compile(*key):
    # Internal: compile (pattern, flags) with caching keyed on
    # (type, pattern, flags).  Python-2-only re-raise syntax below.
    cachekey = (type(key[0]), ) + key
    p = _cache.get(cachekey)
    if p is not None:
        return p
    pattern, flags = key
    if isinstance(pattern, _pattern_type):
        # Already compiled: pass through.
        return pattern
    if not sre_compile.isstring(pattern):
        raise TypeError, 'first argument must be string or compiled pattern'
    try:
        p = sre_compile.compile(pattern, flags)
    except error as v:
        raise error, v
    if len(_cache) >= _MAXCACHE:
        _cache.clear()  # flush when full
    _cache[cachekey] = p
    return p
def _compile(*key):
    # internal: compile pattern
    # NOTE(review): appears truncated -- the compiled pattern is never
    # cached or returned on success.  Python-2-only except syntax.
    pattern, flags = key
    bypass_cache = flags & DEBUG
    if not bypass_cache:
        cachekey = (type(key[0]), ) + key
        p = _cache.get(cachekey)
        if p is not None:
            return p
    if isinstance(pattern, _pattern_type):
        if flags:
            raise ValueError('Cannot process flags \
argument with a compiled pattern')
        return pattern
    if not sre_compile.isstring(pattern):
        raise TypeError("first argument must be string or compiled pattern")
    try:
        p = sre_compile.compile(pattern, flags)
    except error, v:
        raise error(v) # invalid expression
def _compile(pattern, flags):
    """Internal: compile with caching unless DEBUG is set."""
    cache_enabled = not (flags & DEBUG)
    if cache_enabled:
        try:
            return _cache[(type(pattern), pattern, flags)]
        except KeyError:
            pass
    if isinstance(pattern, _pattern_type):
        if flags:
            raise ValueError(
                'Cannot process flags argument with a compiled pattern')
        return pattern
    if not sre_compile.isstring(pattern):
        raise TypeError('first argument must be string or compiled pattern')
    compiled = sre_compile.compile(pattern, flags)
    if cache_enabled:
        if len(_cache) >= _MAXCACHE:
            _cache.clear()
        _cache[(type(pattern), pattern, flags)] = compiled
    return compiled
def make_scanner(lexicon, flags=FLAGS): actions = [None] # Combine phrases into a compound pattern s = sre_parse.Pattern() s.flags = flags charpatterns = {} p = [] idx = 0 for token in lexicon: if token.pattern in (r'\[', r'{', r'"'): charpatterns[token.pattern[-1]] = token idx += 1 phrase = token.pattern try: subpattern = sre_parse.SubPattern( s, [(SUBPATTERN, (idx, sre_parse.parse(phrase, flags)))]) except sre_constants.error: raise p.append(subpattern) actions.append(token) s.groups = len(p) + 1 # NOTE(guido): Added to make SRE validation work p = sre_parse.SubPattern(s, [(BRANCH, (None, p))]) scanner = sre_compile.compile(p).scanner def _scan_once(string, idx=0, context=None): try: action = charpatterns[string[idx]] except KeyError: pass except IndexError: raise StopIteration else: return action((string, idx + 1), context) m = scanner(string, idx).match() if m is None or m.end() == idx: raise StopIteration return actions[m.lastindex](m, context) return _scan_once
def _compile(*key):
    # internal: compile pattern
    # Cache entries are (compiled, locale) pairs; locale-sensitive
    # patterns are only reused while LC_CTYPE is unchanged.
    # NOTE(review): appears truncated -- the success path never caches or
    # returns p.  Python-2-only raise syntax.
    pattern, flags = key
    bypass_cache = flags & DEBUG
    if not bypass_cache:
        cachekey = (type(key[0]),) + key
        try:
            p, loc = _cache[cachekey]
            if loc is None or loc == _locale.setlocale(_locale.LC_CTYPE):
                return p
        except KeyError:
            pass
    if isinstance(pattern, _pattern_type):
        if flags:
            raise ValueError('Cannot process flags argument with a compiled pattern')
        return pattern
    if not sre_compile.isstring(pattern):
        raise TypeError, "first argument must be string or compiled pattern"
    try:
        p = sre_compile.compile(pattern, flags)
    except error, v:
        raise error, v # invalid expression
def expand_sub(string, template, debug=0, mode='all'):
    """
    Given a regular expression and a replacement string, generate
    expansions of the regular expression and for each one return it and
    its transformation as applied by the replacement string.

    string: regular expression to expand
    template: transformation to apply to each regular expression
    mode: can take 3 values
        all: return all possible shortest strings that the regular
            expression would match
        first: return the first string that all would return
        random: return one random string that the regular expression
            would match
    """
    pattern = sre_parse.parse(string, flags=sre_parse.SRE_FLAG_VERBOSE)
    pattern.mode = mode  # stash the mode on the parse tree for _iterate
    template = sre_parse.parse_template(template, sre_compile.compile(pattern))
    if debug:
        print(pattern)
        print(template)
    # _iterate and MatchObj are project helpers defined elsewhere.
    for s in _iterate(pattern, pattern.data, MatchObj(pattern, "")):
        # NOTE(review): the meaning of 'patient' is not visible from this
        # fragment -- presumably consumed by expand_template/_iterate.
        s.patient = 0
        yield (s.string, sre_parse.expand_template(template, s))
def compile_regexp_to_noncapturing(pattern, flags=0):
    """
    Compile the regexp pattern after switching all grouping parentheses
    in the given regexp pattern to non-capturing groups.

    :type pattern: str
    :rtype: str
    """
    def _denumber(parsed):
        # Rewrite the parse tree in place: drop every group index (a None
        # index makes a group non-capturing) and reject back-references,
        # which would need the groups we just removed.
        rewritten = []
        for op, operand in parsed.data:
            if op == sre_constants.SUBPATTERN:
                group_index, inner = operand
                operand = (None, _denumber(inner))
            elif op == sre_constants.GROUPREF:
                raise ValueError('Regular expressions with back-references are not supported: {0}'.format(pattern))
            rewritten.append((op, operand))
        parsed.data = rewritten
        parsed.pattern.groups = 1  # only group 0 (the whole match) remains
        parsed.pattern.groupdict = {}
        return parsed

    return sre_compile.compile(_denumber(sre_parse.parse(pattern)), flags=flags)
def _compile(pattern, flags):
    # internal: compile pattern
    if isinstance(flags, RegexFlag):
        flags = flags.value
    try:
        return _cache[type(pattern), pattern, flags]
    except KeyError:
        pass
    if isinstance(pattern, Pattern):
        if flags:
            # Error message intentionally left in Chinese (runtime string).
            raise ValueError("无法使用编译模式处理标志参数")
        return pattern
    if not sre_compile.isstring(pattern):
        raise TypeError("第一个参数必须是字符串或编译模式")
    p = sre_compile.compile(pattern, flags)
    if not (flags & DEBUG):
        if len(_cache) >= _MAXCACHE:
            # Drop the oldest item (dicts preserve insertion order).
            try:
                del _cache[next(iter(_cache))]
            except (StopIteration, RuntimeError, KeyError):
                pass
        _cache[type(pattern), pattern, flags] = p
    return p
def _compile(pattern, flags):
    """Internal: compile *pattern* with *flags*, memoized in _cache.

    The cache is an ordered mapping; when full, the oldest entry is
    evicted via popitem(last=False).
    """
    if isinstance(flags, RegexFlag):
        flags = flags.value
    cache_key = (type(pattern), pattern, flags)
    cached = _cache.get(cache_key)
    if cached is not None:
        return cached
    if isinstance(pattern, Pattern):
        if flags:
            raise ValueError(
                "cannot process flags argument with a compiled pattern")
        return pattern
    if not sre_compile.isstring(pattern):
        raise TypeError("first argument must be string or compiled pattern")
    compiled = sre_compile.compile(pattern, flags)
    if not (flags & DEBUG):
        if len(_cache) >= _MAXCACHE:
            try:
                _cache.popitem(last=False)  # evict the oldest entry
            except KeyError:
                pass
        _cache[cache_key] = compiled
    return compiled
def _compile(pattern, flags):
    """Internal: compile pattern with caching.

    NOTE: the cache is keyed on a formatted string rather than the usual
    (type, pattern, flags) tuple as a workaround for Brython, where the
    tuple key misbehaved.
    """
    cachekey = "%s:%s:%s" % (type(pattern), pattern, flags)
    try:
        return _cache[cachekey]
    except KeyError:
        pass
    if isinstance(pattern, _pattern_type):
        if flags:
            raise ValueError(
                "Cannot process flags argument with a compiled pattern")
        return pattern
    if not sre_compile.isstring(pattern):
        raise TypeError("first argument must be string or compiled pattern")
    compiled = sre_compile.compile(pattern, flags)
    if len(_cache) >= _MAXCACHE:
        _cache.clear()
    _cache[cachekey] = compiled
    return compiled
def _compile(*key):
    # Internal: compile (pattern, flags) with locale-aware caching.
    # Cache entries are (compiled, locale) pairs; locale-dependent
    # patterns are revalidated against the current LC_CTYPE on lookup.
    # Python-2-only re-raise syntax below.
    pattern, flags = key
    bypass_cache = flags & DEBUG
    if not bypass_cache:
        cachekey = (type(key[0]),) + key
        try:
            p, loc = _cache[cachekey]
            if loc is None or loc == _locale.setlocale(_locale.LC_CTYPE):
                return p
        except KeyError:
            pass
    if isinstance(pattern, _pattern_type):
        if flags:
            raise ValueError('Cannot process flags argument with a compiled pattern')
        return pattern
    if not sre_compile.isstring(pattern):
        raise TypeError, 'first argument must be string or compiled pattern'
    try:
        p = sre_compile.compile(pattern, flags)
    except error as v:
        raise error, v
    if not bypass_cache:
        if len(_cache) >= _MAXCACHE:
            _cache.clear()
        if p.flags & LOCALE:
            if not _locale:
                # locale module unavailable: skip caching entirely.
                return p
            loc = _locale.setlocale(_locale.LC_CTYPE)
        else:
            loc = None
        _cache[cachekey] = (p, loc)
    return p
def _compile(*key):
    # Internal: compile (pattern, flags); cache keyed on the raw key tuple.
    # Python-2-only raise syntax below.
    p = _cache.get(key)
    if p is not None:
        return p
    (pattern, flags) = key
    if type(pattern) is _pattern_type:
        return pattern
    if type(pattern) not in sre_compile.STRING_TYPES:
        raise TypeError, 'first argument must be string or compiled pattern'
    try:
        p = sre_compile.compile(pattern, flags)
    except error:
        # NOTE(review): v is set to None before the re-raise, so the
        # original error detail is discarded (decompiler-looking artifact).
        v = None
        raise error, v
    if len(_cache) >= _MAXCACHE:
        _cache.clear()
    _cache[key] = p
    return p
def FindAll(pattern, s):
    """Searches the string for the pattern, caching the compiled regexp."""
    compiled = _regexp_compile_cache.get(pattern)
    if compiled is None:
        compiled = _regexp_compile_cache[pattern] = sre_compile.compile(pattern)
    return compiled.findall(s)
class Wiki2XHTML(Translator):
    '''Simple conversion from DotClear wiki2xhtml markup to HTML

    .. Warning::
        Behaviour is quite different than the original DotClear parser
        and a few elements has been left unimplemented.
    '''
    # NOTE(review): Translator, p_inline, html_entities, html_special_chars,
    # MazNode and node2html are defined elsewhere in this module.

    #? first space of each line
    _first_space = sre_compile.compile(r'(?:^|(?<=\n))(?:[ ]|(?=\n))')
    #? first '> ' sequence of each line, the space being optional
    _first_gt_space = sre_compile.compile(r'(?:^|(?<=\n))>(?:[ ]|(?=\n))')
    #? double or more LF
    _block_separator = sre_compile.compile(r'\n\n+(?=\S)')
    #? separate list prefix (# | *) and list value
    _fragment_list = sre_compile.compile(
        r'(?P<type>[*#]+) \s* (?P<value> (?: (?:(?:.|\n)+?) (?:(?=\n[#*])|$) ) | (?:.+\n)$ )',
        sre_compile.SRE_FLAG_MULTILINE | sre_compile.SRE_FLAG_VERBOSE)
    #? use to beautify ``<li>\n\s*value`` to ``<li>value``
    _li_trim = sre_compile.compile(r'(?<=<li>)\s+|(?<![>])(?=\n)\s+?(?=</li>)')
    #? non-word
    _non_word = sre_compile.compile(r'\W+')

    @staticmethod
    def escape(string, entities=False):
        '''Escape special HTML characters

        Replace characters with a special signifiance in HTML by their
        HTML entity equivalent.  If the optional argument `entities` is
        set, will use an entities table build from
        :var:`htmlentitydefs.entitydefs`.
        '''
        tr = html_entities if entities else html_special_chars
        return u''.join(c in tr and tr[c] or c for c in string)

    ##### blocks

    @staticmethod
    def b_hr(match):
        return u'<hr />\n'

    def b_p(self, match):
        return u'<p>%s</p>\n' % p_inline.sub(self.inlines,
                                             match.group('p')).strip()

    def b_xmp(self, match):
        return u'<pre class="xmp">%s</pre>\n' % self.escape(
            match.group('xmp')).strip()

    def b_pre(self, match):
        # Strip the one-space indent that marks preformatted lines.
        return u'<pre>%s</pre>\n' % p_inline.sub(
            self.inlines,
            self._first_space.sub('', match.group(match.lastgroup))).rstrip()

    def b_special(self, match):
        assert match.group('macro') == 'html'
        return u'<div class="macro %s">%s</div>\n' % (self.escape(
            match.group('macro')), match.group('special').strip())

    def b_head(self, match):
        # More leading markers mean a *smaller* heading level.
        return u'<h%(n)u>%(value)s</h%(n)u>\n' % {
            'n': 6 - len(match.group('head_level')),
            'value': p_inline.sub(self.inlines, match.group('head_value'))
        }

    def b_blockquote(self, match):
        return u'<blockquote><p>%s</p></blockquote>\n' % u'</p>\n<p>'.join(
            self._block_separator.split(
                p_inline.sub(
                    self.inlines,
                    self._first_gt_space.sub('', match.group(
                        match.lastgroup))))).rstrip()

    def b_list(self, match):
        ltprev = ''
        #? TODO: i'd like to do it without the nodes tree
        root = node = MazNode('div')
        for m in self._fragment_list.finditer(match.group()):
            ltcurr, value = m.groups()
            # Walk up for levels removed, down for levels added.
            for prev, curr in itertools.dropwhile(
                    lambda x: not cmp(x[0], x[1]),
                    itertools.izip_longest(ltprev, ltcurr)):
                if prev:
                    node = node.parent
                    if node.name == 'li':
                        node = node.parent
                if curr:
                    if node.child and node.child.name == 'li':
                        node = node.child.prev
                    node += MazNode('%sl' % (curr == '#' and 'o' or 'u', ))
                    node = node.child.prev
            node += (MazNode('li') +
                     MazNode(value=p_inline.sub(self.inlines, value)))
            ltprev = ltcurr
        # FIXME: there is a bug in MazNode when descending from a higher level than the root
        root.child.parent = root.child
        return self._li_trim.sub('', node2html(root.child))

    @staticmethod
    def b_nl(match):
        return ''

    ##### inlines

    def i_code(self, match):
        return u'<tt class="code">%s</tt>' % p_inline.sub(
            self.inlines, match.group(match.lastgroup))

    def i_em(self, match):
        return u'<em>%s</em>' % p_inline.sub(self.inlines,
                                             match.group(match.lastgroup))

    def i_strong(self, match):
        return u'<strong>%s</strong>' % p_inline.sub(
            self.inlines, match.group(match.lastgroup))

    def i_del(self, match):
        return u'<del>%s</del>' % p_inline.sub(self.inlines,
                                               match.group(match.lastgroup))

    def i_ins(self, match):
        return u'<ins>%s</ins>' % p_inline.sub(self.inlines,
                                               match.group(match.lastgroup))

    @staticmethod
    def i_br(match):
        return u'<br />'

    def i_anchor(self, match):
        return u'<a name="%s"></a>' % self._non_word.sub(
            '-', match.group('anchor'))

    def i_acronym(self, match):
        return u'<acronym%s>%s</acronym>' % (
            u' title="%s"' % self.escape(match.group('acronym_title').strip())
            if match.group('acronym_title') else '',
            p_inline.sub(self.inlines, match.group('acronym_value')).strip())

    def i_a(self, match):
        href = urlparse.urlsplit(match.group('a_href'))
        link = [u'<a href="%s"' % match.group('a_href')]
        if match.group('a_title'):
            link.append(u' title="%s"' % self.escape(match.group('a_title')))
        if match.group('a_lang'):
            link.append(u' hreflang="%s"' % self.escape(match.group('a_lang')))
        if href.scheme:
            # TODO: make a handle for the external using the hostname
            link.append(u' class="external"')
        link.append(u'>%s</a>' % (
            p_inline.sub(self.inlines, match.group('a_value')) \
                if match.group('a_value') \
                else self.escape(match.group('a_href'))
        ))
        return ''.join(link)

    def i_uri(self, match):
        return u'<a href="%s" class="external">%s</a>' % (match.group(
            match.lastgroup), self.escape(match.group(match.lastgroup)))

    def i_img(self, match):
        link = [u'<img src="%s"' % match.group('img_src')]
        if match.group('img_alt'):
            link.append(u'alt="%s"' % self.escape(match.group('img_alt')))
        if match.group('img_desc'):
            link.append(u'longdesc="%s"' % self.escape(match.group('img_desc')))
        if match.group('img_align'):
            align = match.group('img_align').strip().lower()[0]
            if align in 'lg':
                #? align left
                link.append('style="float:left; margin: 0 1em 1em 0;"')
            elif align in 'cm':
                #? align center
                link.append('style="display:block; margin:0 auto;"')
            elif align in 'rd':
                #? align right
                link.append('style="float:right; margin: 0 0 1em 1em;"')
            else:
                self.warn(match,
                          'unknown alignment %r' % match.group('img_align'))
        link.append('/>')
        return ' '.join(link)

    def i_cite(self, match):
        r = ['<q']
        if match.group('cite_lang'):
            r.append(u' lang="%s"' % self.escape(match.group('cite_lang')))
        if match.group('cite_cite'):
            # FIXME? use urlencode, not escape
            r.append(u' cite="%s"' % self.escape(match.group('cite_cite')))
        r.append(u'>%s</q>' % p_inline.sub(self.inlines,
                                           match.group('cite_value')).strip())
        return ''.join(r)
def search(pattern, s):
    """Search *s* for *pattern*, memoizing the compiled regexp."""
    compiled = _regexp_compile_cache.get(pattern)
    if compiled is None:
        # First use of this pattern: compile once and remember it.
        compiled = sre_compile.compile(pattern)
        _regexp_compile_cache[pattern] = compiled
    return compiled.search(s)
strings. """ __date__ = '28 June 2018' __author__ = ('Rohit Sehgal <*****@*****.**>') from typing import Any from importlib import import_module import re import os import builtins import traceback import sre_compile Match = type(sre_compile.compile('', 0).match('')) # cf. Lib/re.py#L263 class REResolver: """ This class defines how the values from the jackson file will be translated to in memory json (having secrets) file. For now it supports translations from either environment variables or from other python functions. Which are specified in MATCH_REGEX The key value have to be declared like: - env.<ENVIRONMENT_VAR_NAME> - !foo.bar.baz: the python notation of calling method from other python file. """
def __init__(self, rules, flags=0):
    """Combine every rule into one alternation, each rule in its own group."""
    grouped = ['(%s)' % rule for rule in rules]
    combined = '(%s)' % '|'.join(grouped)
    self.scanner = sre_compile.compile(combined, flags)
def __init__(self, pattern, flags=0, charset=CHARSET, max_count=None,
             relaxed=False):
    """Parse *pattern* and precompute everything needed to walk its matches.

    pattern   -- a regex string or an already-parsed sre_parse.SubPattern.
    flags     -- re.* flags; IGNORECASE/UNICODE/LOCALE are rejected below.
    charset   -- alphabet used when expanding wildcard-style constructs.
    max_count -- cap on repetition counts (defaults to MAX_REPEAT_COUNT).
    relaxed   -- if true, lookarounds contribute nothing instead of raising.

    Raises ParseError for the unsupported flags listed above.
    """
    # If the RE module cannot compile it, we give up quickly
    if not isinstance(pattern, sre_parse.SubPattern):
        pattern = sre_parse.parse(pattern, flags)
    self.matcher = sre_compile.compile(pattern, flags)
    # Without DOTALL, '.' never matches a newline, so drop newline from the
    # expansion alphabet as well.
    if not flags & re.DOTALL:
        charset = "".join(c for c in charset if c != "\n")
    self.charset = charset
    self.relaxed = relaxed
    # Group-name -> group-index mapping, straight from the compiled pattern.
    self.named_group_lookup = self.matcher.groupindex
    flags |= DEFAULT_RE_FLAGS  # https://github.com/google/sre_yield/issues/3
    # NOTE(review): these checks run *after* DEFAULT_RE_FLAGS is OR'd in, so
    # any of these flags contributed by DEFAULT_RE_FLAGS would be rejected
    # too -- confirm that is intended.
    if flags & re.IGNORECASE:
        raise ParseError(
            'Flag "i" not supported. https://github.com/google/sre_yield/issues/4'
        )
    elif flags & re.UNICODE:
        raise ParseError(
            'Flag "u" not supported. https://github.com/google/sre_yield/issues/3'
        )
    elif flags & re.LOCALE:
        raise ParseError(
            'Flag "l" not supported. https://github.com/google/sre_yield/issues/5'
        )
    if max_count is None:
        self.max_count = MAX_REPEAT_COUNT
    else:
        self.max_count = max_count
    self.has_groupref = False
    # Configure the parser backends
    # Dispatch table: one handler per sre opcode met while walking the
    # parse tree in sub_values() below.
    self.backends = {
        sre_constants.LITERAL: lambda y: [chr(y)],
        sre_constants.RANGE: lambda l, h: [chr(c) for c in range(l, h + 1)],
        sre_constants.SUBPATTERN: self.maybe_save,
        sre_constants.BRANCH: self.branch_values,
        sre_constants.MIN_REPEAT: self.max_repeat_values,
        sre_constants.MAX_REPEAT: self.max_repeat_values,
        sre_constants.AT: self.nothing_added,
        # Lookarounds raise by default (overridden below in relaxed mode).
        sre_constants.ASSERT: self.lookaround_parse_error,
        sre_constants.ASSERT_NOT: self.lookaround_parse_error,
        # '.' is handled as a negated (empty) character class.
        sre_constants.ANY:
        lambda _: self.in_values(((sre_constants.NEGATE, ), )),
        sre_constants.IN: self.in_values,
        sre_constants.NOT_LITERAL: self.not_literal,
        sre_constants.CATEGORY: self.category,
        sre_constants.GROUPREF: self.groupref,
    }
    if self.relaxed:
        # Relaxed mode: treat lookarounds as adding nothing to the output.
        self.backends.update({
            sre_constants.ASSERT: self.nothing_added,
            sre_constants.ASSERT_NOT: self.nothing_added,
        })
    self.state = STATE_START
    # Now build a generator that knows all possible patterns
    self.raw = self.sub_values(pattern)
    # Configure this class instance to know about that result
    self.length = self.raw.__len__()
def test_no_pattern(self): import sre_compile, sre_parse sre_pattern = sre_compile.compile( sre_parse.SubPattern(sre_parse.Pattern())) assert sre_pattern.scanner('s') is not None
def test_create_basic(self):
    """Round-trip a simple bounded-repeat pattern through create_subpattern."""
    parsed = sre_parse.parse(r"a{2}")
    rebuilt = create_subpattern(parsed.data)
    # The reconstructed subpattern must still be compilable.
    sre_compile.compile(rebuilt)
    assert _val_eq(parsed, rebuilt)
r'(?P<pre> (?:(?<=\n\n)|^) (?:^[ ].+(\n|$))+ )', #? list, ordered and unordered Matches them whole, separate items are parsed later. The list *must* start with a single bullet. r'(?P<list>^[ \t]*([*][^*\#]|[\#][^\#*]).*$(\n[ \t]*[*\#]+.*$)*)', #? head r'^\s*(?P<head>(?P<head_level>!{1,4})(?P<head_value>.*?))\s*$', #? hr separator r'(?P<hr>^\s*----\s*$)', #? citation r'^(?P<blockquote>>(.*) ([\#]!(\s+.*)?$)?(.|\n)+?)(?:^[^>]|^$)', #? paragraph r'(?:(?<=(\n\n)(?![/*#!]|----))|^) (?P<p> ^(?:(?:.+\n(?!\n))*(?:.+$)) \n?)', #? empty line r'(?P<nl>^\s*$ )', ) p_block = sre_compile.compile('|'.join(RULES_BLOCK), RULES_FLAGS) RULES_INLINE = ( #? URLs (starting with an url scheme like HTTP) #TODO: r'(?P<url>(^|(?<=\s|[.,:;!?()/=]))(?P<escaped_url>~)?(?P<url_target> (?P<url_proto>https?|ftps?|ircs?|nntp|news|mailto|telnet|file):\S+?)($|(?=\s|[,.:;!?()](\s|$))))', r'(?P<uri>[a-zA-Z]+:/{,3}[%s]+/?[%s]*|%s)' %( r'A-Za-z0-9\-\.', r'\%\;\/\?\:\@\&\=\+\$\,\[\]A-Za-z0-9\-_\.\!\~\*\'\(\)\w#', r'\%\;\/\?\:\@\&\=\+\$\,\[\]A-Za-z0-9\-_\.\!\~\*\'\(\)' ), #? image r'(?P<img>\050\050 (?P<img_src>%(word)s) (?: \| (?P<img_alt>%(word)s) (?: \| (?P<img_align>%(word)s) (?: \| (?P<img_desc>%(word)s))?)?)? \051\051)' % {'word': r'(?: (?<![^\\](?=\|)) .)+'}, #? escaped character r'(?P<escape>[\\] (?P<escaped_char>\S) )', #? emphasis r"(?:'' (?P<em>.+?) (?<![\\])'')",
def compile(regexp, flags=0):
    """Compile *regexp* after translating *flags* via get_flags()."""
    effective_flags = get_flags(flags)
    return sre_compile.compile(regexp, effective_flags)
alphanum = _alphanum for i, c in enumerate(pattern): if c not in alphanum: if c == "\000": s[i] = "\\000" else: s[i] = "\\" + c return pattern[:0].join(s) # -------------------------------------------------------------------- # internals _cache = {} _cache_repl = {} _pattern_type = type(sre_compile.compile("", 0)) _MAXCACHE = 100 def _compile(*key): # internal: compile pattern cachekey = (type(key[0]),) + key p = _cache.get(cachekey) if p is not None: return p pattern, flags = key if isinstance(pattern, _pattern_type): if flags: raise ValueError('Cannot process flags argument with a compiled pattern') return pattern if not sre_compile.isstring(pattern):
for i in b'()[]{}?*+-|^$\\.&~# \t\n\r\v\f' } def escape(pattern): """ Escape special characters in a string. """ if isinstance(pattern, str): return pattern.translate(_special_chars_map) else: pattern = str(pattern, 'latin1') return pattern.translate(_special_chars_map).encode('latin1') Pattern = type(sre_compile.compile('', 0)) Match = type(sre_compile.compile('', 0).match('')) # -------------------------------------------------------------------- # internals _cache = {} # ordered! _MAXCACHE = 512 def _compile(pattern, flags): # internal: compile pattern if isinstance(flags, RegexFlag): flags = flags.value try:
# Python 2 smoke test / micro-benchmark for the sre engine internals.
import sre_compile
import sre_constants

# Compile a pattern with one capturing group directly via sre_compile.
r = sre_compile.compile("a(b+)c", 0)
print r.match("")    # empty string cannot match -> None
print r.match("ac")  # "b+" needs at least one 'b' -> None
print r.match("abc").groups()

# Time 100k match()+groups() calls, printing progress every 10k iterations.
for i in xrange(100000):
    r.match("abbc").groups()
    if i % 10000 == 0:
        print i

def identity(o):
    # No-op fixup callback for _optimize_charset below.
    return o

# Exercise the private charset optimizer on a wide non-ASCII range.
charset = [(sre_constants.RANGE, (128, 65535))]
print sre_compile._optimize_charset(charset, identity)
u'(?:<p>(?: |\\s|<br \\/>)*?</p>\\s*)+\\Z', u'(?<!\\\\)([aAbBcdDeEfFgGhHiIjlLmMnNoOPrsStTUuwWyYzZ])', u'\\\\(.)', '((^|[^%])(%%)*%[sy])', '(?P<year>\\d{4})-(?P<month>\\d{1,2})-(?P<day>\\d{1,2})$', '(?P<hour>\\d{1,2}):(?P<minute>\\d{1,2})(?::(?P<second>\\d{1,2})(?:\\.(?P<microsecond>\\d{1,6})\\d{0,6})?)?', '(?P<year>\\d{4})-(?P<month>\\d{1,2})-(?P<day>\\d{1,2})[T ](?P<hour>\\d{1,2}):(?P<minute>\\d{1,2})(?::(?P<second>\\d{1,2})(?:\\.(?P<microsecond>\\d{1,6})\\d{0,6})?)?(?P<tzinfo>Z|[+-]\\d{2}(?::?\\d{2})?)?$', '\\?|[-+]?[.\\w]+$', u'(?:W/)?"((?:\\\\.|[^"])*)"', u'^\\w{3}, (?P<day>\\d{2}) (?P<mon>\\w{3}) (?P<year>\\d{4}) (?P<hour>\\d{2}):(?P<min>\\d{2}):(?P<sec>\\d{2}) GMT$', u'^\\w{6,9}, (?P<day>\\d{2})-(?P<mon>\\w{3})-(?P<year>\\d{2}) (?P<hour>\\d{2}):(?P<min>\\d{2}):(?P<sec>\\d{2}) GMT$', u'^\\w{3} (?P<mon>\\w{3}) (?P<day>[ \\d]\\d) (?P<hour>\\d{2}):(?P<min>\\d{2}):(?P<sec>\\d{2}) (?P<year>\\d{4})$', u'\\s*,\\s*', '^From ', '[ \\(\\)<>@,;:\\\\"/\\[\\]\\?=]', u'(\\{\\%.*?\\%\\}|\\{\\{.*?\\}\\}|\\{\\#.*?\\#\\})', u'\n^(?P<constant>(?:\\_\\("[^"\\\\]*(?:\\\\.[^"\\\\]*)*"\\)|\\_\\(\'[^\'\\\\]*(?:\\\\.[^\'\\\\]*)*\'\\)|"[^"\\\\]*(?:\\\\.[^"\\\\]*)*"|\'[^\'\\\\]*(?:\\\\.[^\'\\\\]*)*\'))|\n^(?P<var>[\\w\\.]+|[-+\\.]?\\d[\\d\\.e]*)|\n (?:\\s*\\|\\s*\n (?P<filter_name>\\w+)\n (?:\\:\n (?:\n (?P<constant_arg>(?:\\_\\("[^"\\\\]*(?:\\\\.[^"\\\\]*)*"\\)|\\_\\(\'[^\'\\\\]*(?:\\\\.[^\'\\\\]*)*\'\\)|"[^"\\\\]*(?:\\\\.[^"\\\\]*)*"|\'[^\'\\\\]*(?:\\\\.[^\'\\\\]*)*\'))|\n (?P<var_arg>[\\w\\.]+|[-+\\.]?\\d[\\d\\.e]*)\n )\n )?\n )', u'(?:(\\w+)=)?(.+)', u'API|TOKEN|KEY|SECRET|PASS|PROFANITIES_LIST|SIGNATURE', '\\s*#?\\s*$', '[_a-z]\\w*\\.py$', u'.*; charset=([\\w\\d-]+);?', '[ \\(\\)<>@,;:\\\\"/\\[\\]\\?=]', u'\\s+', u'^[\\w.@+-]+$', ] for pattern in patterns: sre_compile.compile(pattern, 0)