def test():
    """Smoke test: parse two regex sources and convert them back to expressions.

    Uses Python 2 ``reload`` to pick up edits to the module under test.
    Results are not asserted; this is an interactive/visual check.
    """
    import ClearMap.Utils.InverseRegularExpression as ire
    import sre_parse as sre
    reload(ire)  # py2 builtin; re-import after editing the module
    source = '/test/test_(?P<row>\d{4})_(?P<col>\d{3}).tif'
    p = sre.parse(source)
    ire.patternToExpression(p)
    reload(ire)
    source = r'/test/test_(?:\d)_(?P<col>\d{3})_[7-9][.](?=col)tif$'
    p = sre.parse(source)
    ire.patternToExpression(p)
def doReversing(p):
    """Parse regex source `p` and return the project-specific reversal of it.

    `dbg` and `reverse` are defined elsewhere in this project.
    """
    # p = 'ab*de.gh+i{10}'
    # p = re.compile()
    dbg("Pattern:" + p)
    pattern = sre_parse.parse(p, 0)
    out = reverse(pattern)
    return out
def charclass_runner(pat):
    """Cross-check regexlint's character-class expansion against sre_parse.

    Builds the set of matching character codes two ways and asserts they
    agree (order-sensitively, except for negated classes where only the
    membership is compared).
    """
    r = Regex().get_parse_tree(pat)
    regexlint_version = r.children[0].matching_character_codes
    sre_parsed = sre_parse.parse(pat)
    print(sre_parsed)
    # A bare literal parses to (LITERAL, int); a class parses to (IN, [...]).
    if isinstance(sre_parsed[0][1], int):
        sre_chars = sre_parsed
    else:
        sre_chars = sre_parsed[0][1]
    print('inner', sre_chars)
    golden = list(expand_sre_in(sre_chars))
    order_matters = True
    try:
        # Negated forms: complement over the byte range, order irrelevant.
        if (sre_parsed[0][0] == sre_constants.NOT_LITERAL or
                sre_parsed[0][1][0][0] == sre_constants.NEGATE):
            golden = [i for i in range(256) if i not in golden]
            order_matters = False
    except TypeError:
        # sre_parsed[0][1] was an int and cannot be indexed — not negated.
        pass
    print('sre_parse', golden)
    print('regexlint', regexlint_version)
    if order_matters:
        assert golden == regexlint_version
    else:
        print('extra:', sorted(set(regexlint_version) - set(golden)))
        print('missing:', sorted(set(golden) - set(regexlint_version)))
        assert sorted(golden) == sorted(regexlint_version)
def reverse_group_map(re_str):
    """Map capture-group indices to their names for a regex source string.

    Parameters
    ----------
    re_str : str
        Regular expression source.

    Returns
    -------
    dict
        ``{group_index: group_name}`` — the inverse of ``Pattern.groupindex``.
        Empty when the pattern has no named groups.
    """
    # The original also computed an sre_parse AST and a copy of groupindex,
    # neither of which was used; both unused locals are removed here.
    compiled = re.compile(re_str)
    return {index: group for group, index in compiled.groupindex.items()}
def __init__(self, pattern, flags=0):
    """Store `pattern` and its sre parse tree.

    Raises ErrorUnparseable (project exception) when sre_parse rejects the
    pattern. Python 2 syntax (`except error, e`, `e.message`).
    """
    self.pattern = pattern
    try:
        self.parsed_pattern = sre_parse.parse(pattern, flags)
    except error, e:
        raise ErrorUnparseable(
            'Invalid regex %s failed: %s' % (pattern, e.message))
def compile_regexp_to_noncapturing(pattern, flags=0):
    """
    Compile the given regexp pattern after converting all of its grouping
    parentheses to non-capturing groups, and return the compiled pattern.

    E.g. the pattern ``'ab(c(x+)(z*))?d'`` is compiled as if it were
    ``'ab(?:c(?:x+)(?:z*))?d'``.

    :type pattern: str
    :param flags: regex flags, forwarded to the compiler
    :raises ValueError: if the pattern contains back-references
    """
    def convert_regexp_to_noncapturing_parsed(parsed_pattern):
        res_data = []
        for key, value in parsed_pattern.data:
            if key == sre_constants.SUBPATTERN:
                index, subpattern = value
                # Bug fix: the original recursed via the undefined name
                # `convert_regexp_to_noncapturing`; recurse with this helper.
                value = (None, convert_regexp_to_noncapturing_parsed(subpattern))
            elif key == sre_constants.GROUPREF:
                raise ValueError('Regular expressions with back-references are not supported: {0}'.format(pattern))
            res_data.append((key, value))
        parsed_pattern.data = res_data
        # Erase capture-group bookkeeping so the compiled pattern has none.
        parsed_pattern.pattern.groups = 1
        parsed_pattern.pattern.groupdict = {}
        return parsed_pattern
    # Also forward `flags` (the original accepted but silently dropped them).
    return sre_compile.compile(convert_regexp_to_noncapturing_parsed(sre_parse.parse(pattern)), flags=flags)
def compile(p, flags=0):
    # internal: convert pattern list to internal format.
    # Accepts either a pattern string or an already-parsed SubPattern.
    # NOTE(review): relies on pre-3.8 sre internals (`p.pattern` attribute,
    # renamed to `p.state` in CPython 3.8) — confirm target Python version.
    if type(p) in STRING_TYPES:
        import sre_parse
        pattern = p
        p = sre_parse.parse(p, flags)
    else:
        pattern = None
    code = _code(p, flags)
    # print code
    # FIXME: <fl> get rid of this limitation!
    assert p.pattern.groups <= 100,\
        "sorry, but this version only supports 100 named groups"
    # map in either direction
    groupindex = p.pattern.groupdict
    indexgroup = [None] * p.pattern.groups
    for k, i in groupindex.items():
        indexgroup[i] = k
    return _sre.compile(
        pattern, flags, code,
        p.pattern.groups - 1,
        groupindex, indexgroup
        )
def __init__(self, lexicons, init_state=None, flags=0):
    """Build one re.Scanner-style matcher per lexicon state.

    `lexicons` maps a start state to a list of (phrase, action) pairs;
    Python 2 code (`iteritems`, old sre_parse.Pattern API).
    """
    # All the regexp magic below is copied from re.Scanner from
    # the standard library.
    import sre_compile
    import sre_parse
    from sre_constants import BRANCH, SUBPATTERN
    if init_state is None:
        init_state = State()
    if not hasattr(init_state, 'start'):
        init_state.start = None
    self.init_state = init_state
    self.lexicons = lexicons
    self.scanners = {}
    for start, lexicon in lexicons.iteritems():
        # combine phrases into a compound pattern
        p, a = [], []
        s = sre_parse.Pattern()
        s.flags = flags
        for phrase, action in lexicon:
            p.append(sre_parse.SubPattern(s, [
                (SUBPATTERN, (len(p) + 1, sre_parse.parse(phrase, flags))),
                ]))
            a.append(action)
        p = sre_parse.SubPattern(s, [(BRANCH, (None, p))])
        # NOTE(review): sibling scanners in this corpus use `len(p) + 1` and
        # set groups BEFORE wrapping in the BRANCH SubPattern — here `p` has
        # already been rebound; verify this was intentional.
        s.groups = len(p)
        self.scanners[start] = sre_compile.compile(p).match, a
def regex_slice(expr, start, end):
    """
    Get a slice of a regex by calling regex_index on each index.

    Note that this can return expressions that are overly general: for
    example, it can mix characters from both branches of a regex. Being
    more specific than that would take more work.

    >>> regex_slice('test', 0, 1)
    't'
    >>> regex_slice('t?est', 0, 2)
    '[te][es]'
    >>> regex_slice('mo+', 3, 8)
    'ooooo'
    """
    if start < 0 or end < 0:
        raise NotImplementedError("Can't take negative slices of a regex yet")
    result = ''
    for index in range(start, end):
        # All sub-expressions that can occupy this position.
        choices = _regex_index_pattern(parse(expr), index)
        if len(choices) == 0:
            # Index past the end of everything the regex can match.
            return None
        elif len(choices) == 1:
            regex = unparse(choices[0])
            result += regex
        else:
            # Multiple alternatives: re-serialize as a branch, grouping
            # only when the result actually contains an alternation.
            regex = round_trip(unparse(('branch', (None, choices))))
            if '|' in regex:
                result += '(%s)' % (regex,)
            else:
                result += regex
    return result
def clean_pattern(pattern):
    """
    Cleans URL patterns

    * pattern => token
    * '2' => ('literal', 50)
    * '2|3' => ('in', [('literal', 50), ('literal', 51)])

    Python 2 code (`unichr`). Collapses every non-literal regex construct
    into a single '*' wildcard; appends '$' when the pattern is end-anchored.
    """
    star = '*'
    parsed = sre_parse.parse(pattern)
    literals = []
    for token in parsed:
        if token[0] == LITERAL:
            # URL-quote the literal character.
            character = quote(unichr(token[1]).encode('utf8'))
            literals.append(character)
        elif token[0] == AT:
            # Anchors contribute nothing positionally.
            pass
        elif literals[-1:] != [star]:
            # Any other construct becomes '*', avoiding consecutive stars.
            literals.append(star)
    rule = '/' + ''.join(literals)
    if parsed and not rule.endswith(star):
        if parsed[-1] == (AT, AT_END):
            rule += '$'
        else:
            rule += star
    return rule
def __init__(self, lexicon, flags=FLAGS): self.actions = [None] # combine phrases into a compound pattern s = sre_parse.Pattern() s.flags = flags p = [] # NOTE(kgibbs): These lines must be added to make this file work under # Python 2.2, which is commonly used at Google. def enumerate(obj): i = -1 for item in obj: i += 1 yield i, item # NOTE(kgibbs): End changes. for idx, token in enumerate(lexicon): phrase = token.pattern try: subpattern = sre_parse.SubPattern(s, [(SUBPATTERN, (idx + 1, sre_parse.parse(phrase, flags)))]) except sre_constants.error: raise p.append(subpattern) self.actions.append(token) s.groups = len(p)+1 # NOTE(guido): Added to make SRE validation work p = sre_parse.SubPattern(s, [(BRANCH, (None, p))]) self.scanner = sre_compile.compile(p)
def isFileExpression(source):
    """Checks if filename is a regular expression denoting a file list

    Arguments:
        source (str): source file name

    Returns:
        bool: True if source is true regular expression with at least one non-literal

    Note:
        The any character '.' is not treated as a non-literal because of possible
        filename extensions
    """
    if not isinstance(source, basestring):
        return False;
    if isFile(source):
        # An existing plain file is never treated as an expression.
        return False;
    else:
        #searchRegex = re.compile('.*\\\\d\{(?P<digit>\d)\}.*').search
        #m = searchRegex(source);
        #if m is None:
        #    return False;
        #else:
        #    return True;
        #parse regular expression
        p = sre_parse.parse(source);
        for l in p:
            #note: allow for a filname.ext patterns although this is strictly a regular expression which should be denoted as filename\.ext
            # NOTE(review): comparing opcodes to lowercase strings only works
            # on old Pythons where sre opcodes were strings — confirm.
            if l[0] != 'literal' and l[0] != 'any':
                return True;
        return False;
def re_replace_literals(text, mapping):
    """Raises NotImplementedError or re.error

    Parses `text` as a regex and rebuilds it with literal characters
    substituted through `mapping` (see project helper `_construct_regexp`).
    Python 2 code (`unicode` check).
    """
    assert isinstance(text, unicode)
    pattern = sre_parse.parse(text)
    return _construct_regexp(pattern, mapping)
def charclass_runner(pat):
    """Python 2 variant: compare regexlint's character-class expansion with
    sre_parse's, asserting equality (unordered for negated classes)."""
    r = Regex().get_parse_tree(pat)
    regexlint_version = r.children[0].matching_character_codes
    sre_parsed = sre_parse.parse(pat)
    print sre_parsed
    # Bare literal => (op, int); character class => (IN, [...]).
    if isinstance(sre_parsed[0][1], int):
        sre_chars = sre_parsed
    else:
        sre_chars = sre_parsed[0][1]
    golden = list(expand_sre_in(sre_chars))
    order_matters = True
    try:
        # Negated forms: take the complement over bytes 0..255.
        if (sre_parsed[0][0] == 'not_literal' or
                sre_parsed[0][1][0][0] == 'negate'):
            golden = [i for i in range(256) if i not in golden]
            order_matters = False
    except TypeError:
        pass
    print golden
    print regexlint_version
    if order_matters:
        assert golden == regexlint_version
    else:
        assert sorted(golden) == sorted(regexlint_version)
def compile(p, flags=0):
    # internal: convert pattern list to internal format.
    # NOTE(review): uses pre-3.8 sre internals (`p.pattern`; renamed to
    # `p.state` in CPython 3.8).
    if isstring(p):
        pattern = p
        p = sre_parse.parse(p, flags)
    else:
        pattern = None
    code = _code(p, flags)
    # print code
    # XXX: <fl> get rid of this limitation!
    if p.pattern.groups > 100:
        raise AssertionError(
            "sorry, but this version only supports 100 named groups"
            )
    # map in either direction
    groupindex = p.pattern.groupdict
    indexgroup = [None] * p.pattern.groups
    for k, i in groupindex.items():
        indexgroup[i] = k
    return _sre.compile(
        pattern, flags | p.pattern.flags, code,
        p.pattern.groups - 1,
        groupindex, indexgroup
        )
def compile(p, flags=0):
    # internal: convert pattern list to internal format
    # (CPython 3.8+ variant: parse state lives on `p.state`).
    if isstring(p):
        pattern = p
        p = sre_parse.parse(p, flags)
    else:
        pattern = None
    code = _code(p, flags)
    if flags & SRE_FLAG_DEBUG:
        print()
        dis(code)
    # map in either direction
    groupindex = p.state.groupdict
    indexgroup = [None] * p.state.groups
    for k, i in groupindex.items():
        indexgroup[i] = k
    return _sre.compile(
        pattern, flags | p.state.flags, code,
        p.state.groups - 1,
        groupindex, tuple(indexgroup)
        )
def regex_score(regex, search_string):
    """
    Returns a closeness score of how well the regex matches the string.
    Will return -1 if it doesn't match.
    """
    match = re.search(regex, search_string)
    if match:
        # Base score is the longest distance between regex, match,
        # and search_string
        regex_match_dist = levenshtein_distance(
            regex.pattern.lower(), match.group(0).lower())
        match_string_dist = levenshtein_distance(
            match.group(0).lower(), search_string.lower())
        score = max(regex_match_dist, match_string_dist)
        # Adjust score: Special anchors slightly reduce distance
        for opcode, argument in sre_parse.parse(regex.pattern):
            if str(opcode) == 'AT':
                # Bug fix: the original condition was
                # `str(argument) == 'AT_BEGINNING' or 'AT_END'`, which is
                # always truthy ('AT_END' is a non-empty string), so every
                # anchor got the 1-edit discount and AT_BOUNDARY got 3.
                if str(argument) in ('AT_BEGINNING', 'AT_END'):
                    # ^ or $, adjust 1 edit
                    score -= 1
                if str(argument) == 'AT_BOUNDARY':
                    # all other anchors reduce 2 edits
                    score -= 2
        return score if score >= 0 else 0
    else:
        return -1
def ipermute(p):
    r"""Generate permutations (returns an iterable rather than an array or list)

    >>> list(ipermute(r'[A-Z]\d'))
    ['A0', 'B0', ..., 'Z9']
    """
    # Materialize the parsed token stream, then hand it to the permuter.
    return permute_toks(list(sre_parse.parse(p)))
def base_regex_strategy(regex, parsed=None):
    """Build the Hypothesis strategy for a compiled regex.

    `parsed` may be supplied to avoid re-parsing; otherwise the pattern
    source is parsed here.
    """
    if parsed is None:
        parsed = sre.parse(regex.pattern)
    return clear_cache_after_draw(_strategy(
        parsed,
        Context(flags=regex.flags),
        regex.pattern
    ))
def test_re_inverse(self):
    """Exercise make_match_string / make_nonmatch_string repeatedly on a
    pattern mixing alternation, classes and repeats (randomized helpers,
    so results are not asserted here)."""
    import sre_parse
    RE = r'(firstleft|)somestring(\s.*|) \S(a|b) [fgh]+ {2,3}R(\S)'
    print(sre_parse.parse(RE))
    for i in range(20):
        ms = re_inverse.make_match_string(RE)
    for i in range(20):
        ms = re_inverse.make_nonmatch_string(RE)
def base_regex_strategy(regex, parsed=None):
    """Build the Hypothesis strategy for a compiled regex.

    Parses with the regex's own flags; the final argument tells the
    strategy whether the pattern is text (vs bytes).
    """
    if parsed is None:
        parsed = sre_parse.parse(regex.pattern, flags=regex.flags)
    return clear_cache_after_draw(
        _strategy(
            parsed,
            Context(flags=regex.flags),
            isinstance(regex.pattern, text_type)
        )
    )
def __init__(self, pattern, flag = 0, escape = None):
    """Split `pattern` into named 'begin' / 'middle' / 'end' sub-patterns.

    The pattern must define a 'begin' group; 'end' requires 'middle'.
    Python 2 code (`iteritems`). `MatchPattern`, IGNORE_CASE and INF_WIDTH
    are project-defined.
    """
    self.name = None
    self.group = None
    self.flag = flag
    self.lexer = None
    self.escape = escape
    scf = 0
    if flag & self.IGNORE_CASE:
        scf |= sre_compile.SRE_FLAG_IGNORECASE
    cpattern = re.compile(pattern, scf)
    groupidx = cpattern.groupindex
    self.parsed = sre_parse.parse(pattern, scf)
    self.begin = None
    self.middle = None
    self.end = None
    self.exact = False
    # Locate the named groups in the parse tree by matching group indices.
    for (op, (gidx, val)) in self.parsed:
        for (gname, idx) in groupidx.iteritems():
            if op == sre_constants.SUBPATTERN and gidx == idx:
                if gname == "begin":
                    self.begin = MatchPattern(self, val, (flag & self.IGNORE_CASE) == self.IGNORE_CASE)
                elif gname == "middle":
                    self.middle = MatchPattern(self, val, (flag & self.IGNORE_CASE) == self.IGNORE_CASE)
                elif gname == "end":
                    self.end = MatchPattern(self, val, (flag & self.IGNORE_CASE) == self.IGNORE_CASE)
    if self.begin is None:
        raise Exception("Need to define 'begin' group!")
    if self.end is not None and self.middle is None:
        raise Exception("Need to define 'middle' group when 'end' defined!")
    if self.middle is None and self.end is None:
        # Begin-only pattern: it is "exact" when purely literal.
        _ex = True
        for (_op, _val) in self.begin._pattern:
            if _op != sre_constants.LITERAL:
                _ex = False
                break
        self.exact = _ex
    # Unbounded-width begin/end groups are rejected; an unbounded middle
    # switches this matcher into "infinity" (streaming) mode.
    if self.begin.maxWidth() >= self.INF_WIDTH:
        raise Exception("Begin group don't support ifinity match!")
    if self.end and self.end.maxWidth() >= self.INF_WIDTH:
        raise Exception("End group don't support infinity match!");
    if self.middle and self.middle.maxWidth() >= self.INF_WIDTH:
        self.infinity = True
        self.middle.infinity = True
    else:
        self.infinity = False
def expressionToPattern(expression):
    """Convert regular expression to a parsed pattern for manipulation

    Arguments:
        expression (str): regular expression

    Returns:
        object: parsed pattern
    """
    parsed = sre.parse(expression)
    return parsed
def colorize_re(regexp):
    r"""
    @return: The HTML code for a colorized version of the pattern for the
    given SRE regular expression.  If C{colorize_re} can't figure out how
    to colorize the regexp, then it will simply return the (uncolorized)
    pattern, with C{'&'}, C{'<'}, and C{'>'} escaped as HTML entities.
    The colorized expression includes spans with the following css classes:
      - X{re}: The entire regular expression.
      - X{re-char}: Special characters (such as C{'.'}, C{'\('}), character
        categories (such as C{'\w'}), and locations (such as C{'\b'}).
      - X{re-op}: Operators (such as C{'*'} and C{'|'}).
      - X{re-group}: Grouping constructs (such as C{'(...)'}).
      - X{re-ref} References (such as C{'\1'})
    @rtype: C{string}
    @param regexp: The regular expression to colorize.
    @type regexp: C{SRE_Pattern} or C{string}
    @raise sre_constants.error: If regexp is not a valid regular expression.
    """
    # Python 2 code: byte strings are decoded with backslashreplace before
    # parsing; unicode patterns are parsed directly.
    if isinstance(regexp, str):
        pat = decode_with_backslashreplace(regexp)
        tree = sre_parse.parse(pat)
    elif isinstance(regexp, unicode):
        tree = sre_parse.parse(regexp)
    elif hasattr(regexp, 'pattern') and hasattr(regexp, 'flags'):
        # Compiled pattern object: reuse its source and flags.
        if isinstance(regexp.pattern, str):
            pat = decode_with_backslashreplace(regexp.pattern)
            tree = sre_parse.parse(pat, regexp.flags)
        elif isinstance(regexp.pattern, unicode):
            tree = sre_parse.parse(regexp.pattern, regexp.flags)
        else:
            raise TypeError("Bad regexp object -- pattern is not a string")
    else:
        raise TypeError("Expected a regexp or a string")
    return ('<span class="%s">%s</span>' %
            (RE_TAG, _colorize_re(tree, 1)))
def make_pattern(rules, flags=0):
    """Compile a rules to single branch with groups."""
    # Each rule becomes one numbered capture group inside a single BRANCH;
    # `Pattern`/`SubPattern`/`parse`/`sre_compile` are imported at module
    # level from the sre machinery.
    pattern = Pattern()
    pattern.flags = flags
    # Pre-size group bookkeeping: group 0 plus one group per rule.
    pattern.subpatterns = [None] * (len(rules) + 1)
    return sre_compile(SubPattern(pattern, [
        (BRANCH, (None, [SubPattern(pattern, [
            (SUBPATTERN, (group, parse(regex, flags, pattern))),
        ]) for group, (regex, _) in enumerate(rules, 1)]))
    ]))
def regex_pieces(regex):
    """
    Separates a regex into independent pieces.

    >>> regex_pieces('[abc]de+')
    ['[abc]', 'd', 'e+']
    """
    # Re-serialize each parsed token on its own.
    return [unparse([token]) for token in parse(regex)]
def make_nonmatch_string(regexp, flags=0):
    """Given a string that is a regular expression, return a string (perhaps
    with some randomness) that is certain to NOT produce a match.
    """
    # The trailing True asks the project helper for a NON-matching string.
    s = _make_match_string_from_pattern(sre_parse.parse(regexp, get_flags(flags)), True)
    if __debug__:
        # Sanity check (stripped under -O): the result must not match.
        cre = compile(regexp, flags)
        if cre.match(s):
            raise GeneratorError("'%s' matches '%s'" % (s, regexp))
    return s
def __init__(self, rules, flags=0):
    """Build an sre scanner from (name, regex) rules.

    Each rule becomes one numbered capture group in a single BRANCH
    pattern; `self.rules[i]` is the name for group i+1.
    """
    pattern = Pattern()
    pattern.flags = flags
    # Group 0 plus one capture group per rule.
    pattern.groups = len(rules) + 1
    self.rules = [name for name, _ in rules]
    self._scanner = sre_compile(SubPattern(pattern, [
        (BRANCH, (None, [SubPattern(pattern, [
            (SUBPATTERN, (group, parse(regex, flags, pattern))),
        ]) for group, (_, regex) in enumerate(rules, 1)]))
    ])).scanner
def __init__(self, lexicon, flags=0):
    """re.Scanner-style constructor: combine (phrase, action) pairs into a
    single branched pattern, one capture group per phrase."""
    from sre_constants import BRANCH, SUBPATTERN
    self.lexicon = lexicon
    p = []
    s = sre_parse.Pattern()
    s.flags = flags
    for (phrase, action) in lexicon:
        p.append(sre_parse.SubPattern(s, [
            (SUBPATTERN, (len(p) + 1, sre_parse.parse(phrase, flags))),
        ]))
    s.groups = len(p) + 1
    p = sre_parse.SubPattern(s, [(BRANCH, (None, p))])
    self.scanner = sre_compile.compile(p)
def _make_url_form(regexp): cre = re.compile(regexp, re.I) # Build reverse format from re parse tree. indexmap = dict([(v,k) for k,v in cre.groupindex.items()]) collect = [] for op, val in sre_parse.parse(regexp, re.I): if op is sre_parse.LITERAL: collect.append(chr(val)) elif op is sre_parse.SUBPATTERN: name = indexmap[val[0]] collect.append(r'%%(%s)s' % name) return cre, "".join(collect)
def compile(self):
    """Parse and compile self.pattern, populating group bookkeeping and
    the final regex object on self.

    NOTE(review): uses pre-3.8 sre internals (`.pattern` attribute on the
    parsed SubPattern) and a project-local `_sre_` debug module.
    """
    self.subpattern = sre_parse.parse(self.pattern, self.flags)
    self.code = sre_compile._code(self.subpattern, self.flags)
    # groups=0, groupindex={}, indexgroup=[None]
    self.groupindex = self.subpattern.pattern.groupdict
    self.indexgroup = [None] * self.subpattern.pattern.groups
    for k, i in self.groupindex.items():
        self.indexgroup[i] = k
    # Debug builds route through the project's _sre_ shim.
    module = _sre_ if self.debug else _sre
    self.regex = getattr(module, 'compile')(
        self.pattern, self.flags | self.subpattern.pattern.flags,
        self.code,
        self.subpattern.pattern.groups - 1,
        self.groupindex, tuple(self.indexgroup))
    # NOTE(review): `dump`/`dis` accessed without calling — presumably
    # properties with side effects; confirm.
    self.dump
    print('-' * 76)
    self.dis
def __init__(self, lexicon, flags=FLAGS): self.actions = [None] # combine phrases into a compound pattern s = sre_parse.Pattern() s.flags = flags p = [] for idx, token in enumerate(lexicon): phrase = token.pattern try: subpattern = sre_parse.SubPattern(s, [(SUBPATTERN, (idx + 1, sre_parse.parse(phrase, flags)))]) except sre_constants.error: raise p.append(subpattern) self.actions.append(token) p = sre_parse.SubPattern(s, [(BRANCH, (None, p))]) self.scanner = sre_compile.compile(p)
def __init__(self, lexicon, flags=0):
    """re.Scanner-style constructor for Python 3.6+ sre internals
    (SUBPATTERN carries (gid, add_flags, del_flags, subpattern) and groups
    are registered via opengroup/closegroup)."""
    from sre_constants import BRANCH, SUBPATTERN
    if isinstance(flags, RegexFlag):
        flags = flags.value
    self.lexicon = lexicon
    # Combine phrases into a compound pattern.
    p = []
    s = sre_parse.Pattern()
    s.flags = flags
    for phrase, action in lexicon:
        gid = s.opengroup()
        p.append(
            sre_parse.SubPattern(s, [
                (SUBPATTERN, (gid, 0, 0, sre_parse.parse(phrase, flags))),
            ]))
        s.closegroup(gid, p[-1])
    p = sre_parse.SubPattern(s, [(BRANCH, (None, p))])
    self.scanner = sre_compile.compile(p)
def __init__(self, rules, flags=0):
    """Build a scanner from (name, regex) rules, tracking which internal
    group range each rule occupies.

    NOTE(review): the original source was collapsed to one line; the
    nesting of the `_og`/opengroup monkeypatch relative to the version
    check is reconstructed here — confirm against upstream.
    """
    pattern = Pattern()
    pattern.flags = flags
    if sys.version_info < (3, 0):
        # Python 2 sre does not grow `groups` automatically here.
        pattern.groups = len(rules) + 1
    _og = pattern.opengroup
    # Prefix nested group names with "<rule name>\x00" so the same group
    # name can appear in several rules without colliding. `name` is bound
    # late, picking up the current rule in the loop below.
    pattern.opengroup = lambda n: _og(n and '%s\x00%s' % (name, n) or n)
    self.rules = []
    subpatterns = []
    for group, (name, regex) in enumerate(rules, 1):
        last_group = pattern.groups - 1
        subpatterns.append(
            SubPattern(pattern, [(SUBPATTERN, (group, parse(regex, flags, pattern))), ]))
        # Record the span of group numbers consumed by this rule.
        self.rules.append((name, last_group, pattern.groups - 1))
    self._scanner = sre_compile(
        SubPattern(pattern, [(BRANCH, (None, subpatterns))])).scanner
def compile(p, flags=0):
    # Internal sre compiler entry point: accepts a pattern string or an
    # already-parsed SubPattern.
    # NOTE(review): pre-3.8 internals (`p.pattern`, renamed `p.state` later).
    if isstring(p):
        import sre_parse
        pattern = p
        p = sre_parse.parse(p, flags)
    else:
        pattern = None
    code = _code(p, flags)
    if p.pattern.groups > 100:
        raise AssertionError(
            'sorry, but this version only supports 100 named groups')
    # Map group names and indices in both directions.
    groupindex = p.pattern.groupdict
    indexgroup = [None] * p.pattern.groups
    for k, i in groupindex.items():
        indexgroup[i] = k
    return _sre.compile(pattern, flags, code, p.pattern.groups - 1,
                        groupindex, indexgroup)
def __init__(self, tokens, ignore=()):
    """Build a lexer from TokenDefs.

    Validates every token pattern (compilable, non-zero width), checks the
    `ignore` names exist, then sorts tokens by priority/length and builds
    the match machinery.

    :raises LexError: on an uncompilable or zero-width pattern, or an
        unknown ignore name.
    """
    assert all(isinstance(t, TokenDef) for t in tokens), tokens

    self.ignore = ignore
    self.newline_char = '\n'
    tokens = list(tokens)

    # Sanitization
    for t in tokens:
        regexp = t.pattern.to_regexp()  # hoisted: used twice below
        try:
            re.compile(regexp)
        except Exception:
            # Bug fix: was a bare `except:`, which would also swallow
            # KeyboardInterrupt/SystemExit.
            raise LexError("Cannot compile token %s: %s" % (t.name, t.pattern))

        width = sre_parse.parse(regexp).getwidth()
        if width[0] == 0:
            raise LexError(
                "Lexer does not allow zero-width tokens. (%s: %s)" % (t.name, t.pattern))

    token_names = {t.name for t in tokens}
    for t in ignore:
        if t not in token_names:
            raise LexError(
                "Token '%s' was marked to ignore but it is not defined!" % t)

    # Init
    self.newline_types = [
        t.name for t in tokens if _regexp_has_newline(t.pattern.to_regexp())
    ]
    self.ignore_types = [t for t in ignore]

    # Higher priority first; ties broken by longer pattern value.
    tokens.sort(key=lambda x: (x.pattern.priority, len(x.pattern.value)),
                reverse=True)

    tokens, self.callback = _create_unless(tokens)
    assert all(self.callback.values())

    self.tokens = tokens
    self.mres = build_mres(tokens)
def _match_pattern(compiled_regex, pattern, orig_smtstr, pos, endpos=None):
    """Symbolically match `pattern` against a symbolic string slice.

    Returns the project match object (or None), with pos/endpos/re/string
    and group bookkeeping filled in to mimic `re.Match`.
    """
    space = orig_smtstr.statespace
    parsed_pattern = parse(pattern, compiled_regex.flags)
    smtstr = _slice_match_area(orig_smtstr, pos, endpos)
    match = _internal_match_patterns(space, parsed_pattern, compiled_regex.flags, smtstr, pos)
    if match is not None:
        match.pos = pos
        match.endpos = endpos if endpos is not None else len(orig_smtstr)
        match.re = compiled_regex
        match.string = orig_smtstr
        # fill None in unmatched groups:
        while len(match._groups) < compiled_regex.groups + 1:
            match._groups.append(None)
        # Link up any named groups:
        for name, num in compiled_regex.groupindex.items():
            (_, start, end) = match._groups[num]
            match._groups[num] = (name, start, end)
    return match
def parse(cls, s):
    """Extract literal keyword runs from regex source `s`.

    Consecutive literal characters are joined into keywords; any
    non-literal construct terminates the current keyword. Backslashes are
    tracked via `quoted` so an escaped backslash pair contributes a single
    '\\'. NOTE(review): comparing the opcode to the lowercase string
    "literal" only works on old Pythons where sre opcodes were strings —
    confirm target version.
    """
    keywords = []
    current = []
    quoted = False
    for t, x in sre_parse.parse(s):
        if t == "literal":
            if x == 92:  # backslash
                if quoted:
                    current += ["\\"]
                    quoted = False
                else:
                    quoted = True
            else:
                current += [chr(x)]
        elif current:
            # Non-literal token: flush the keyword accumulated so far.
            keywords += ["".join(current)]
            current = []
    if current:
        keywords += ["".join(current)]
    return keywords
def compile(p, flags=0):
    # Internal sre compiler entry point (decompiled-style formatting).
    # NOTE(review): pre-3.8 internals (`p.pattern`).
    if (type(p) in STRING_TYPES):
        import sre_parse
        pattern = p
        p = sre_parse.parse(p, flags)
    else:
        pattern = None
    code = _code(p, flags)
    assert (p.pattern.groups <= 100), 'sorry, but this version only supports 100 named groups'
    # Map group names and indices in both directions.
    groupindex = p.pattern.groupdict
    indexgroup = ([None] * p.pattern.groups)
    for (
        k,
        i,
    ) in groupindex.items():
        indexgroup[i] = k
    return _sre.compile(pattern, flags, code, (p.pattern.groups - 1), groupindex, indexgroup)
def make_scanner(lexicon, flags=FLAGS): actions = [None] # Combine phrases into a compound pattern s = sre_parse.Pattern() s.flags = flags charpatterns = {} p = [] idx = 0 for token in lexicon: if token.pattern in (r'\[', r'{', r'"'): charpatterns[token.pattern[-1]] = token idx += 1 phrase = token.pattern try: subpattern = sre_parse.SubPattern( s, [(SUBPATTERN, (idx, sre_parse.parse(phrase, flags)))]) except sre_constants.error: raise p.append(subpattern) actions.append(token) s.groups = len(p) + 1 # NOTE(guido): Added to make SRE validation work p = sre_parse.SubPattern(s, [(BRANCH, (None, p))]) scanner = sre_compile.compile(p).scanner def _scan_once(string, idx=0, context=None): try: action = charpatterns[string[idx]] except KeyError: pass except IndexError: raise StopIteration else: return action((string, idx + 1), context) m = scanner(string, idx).match() if m is None or m.end() == idx: raise StopIteration return actions[m.lastindex](m, context) return _scan_once
def get_brackets_values(pattern):
    """Expand a bracket expression (possibly using POSIX classes such as
    ``[:digit:]``) into the list of characters it matches, each encoded
    with ``unicode_escape``. A single-literal pattern yields a one-element
    list."""
    replace_perl_regex_dict = {
        '[:alnum:]': 'a-zA-Z0-9',
        '[:alpha:]': 'a-zA-Z',
        '[:ascii:]': '\\x00-\\x7f',
        '[:blank:]': ' \\t',
        '[:cntrl:]': '\\x00-\\x1f\\x7f',
        '[:digit:]': '0-9',
        '[:graph:]': '\\x21-\\x7e',
        '[:lower:]': 'a-z',
        '[:print:]': '\\x20-\\x7e',
        '[:punct:]': '!"\\#$%&\'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~',
        '[:space:]': ' \\t\\r\\n\\v\\f',
        '[:upper:]': 'A-Z',
        '[:word:]': 'A-Za-z0-9_',
        '[:xdigit:]': 'A-Fa-f0-9'
    }

    def escaped(code):
        # Render a code point in the same unicode-escaped form throughout.
        return chr(code).encode('unicode_escape').decode("utf-8")

    # Rewrite POSIX classes into plain ranges before parsing.
    for perl_class, replacement in replace_perl_regex_dict.items():
        pattern = pattern.replace(perl_class, replacement)

    data = sre_parse.parse(pattern).data
    head = data[0][1]
    if type(head) is int:
        # Bare literal, e.g. 'x'.
        return [escaped(head)]

    matches = []
    for element in head:
        if type(element) is int:
            return [escaped(element)]
        if sre_parse.LITERAL == element[0]:
            matches.append(escaped(element[1]))
        elif sre_parse.RANGE == element[0]:
            lo, hi = element[1]
            matches += [escaped(code) for code in range(lo, 1 + hi)]
    return matches
def compile(p, flags=0):
    # internal: convert pattern list to internal format.
    # NOTE(review): pre-3.8 internals (`p.pattern`, later `p.state`).
    if isstring(p):
        pattern = p
        p = sre_parse.parse(p, flags)
    else:
        pattern = None
    code = _code(p, flags)
    # print(code)
    # map in either direction
    groupindex = p.pattern.groupdict
    indexgroup = [None] * p.pattern.groups
    for k, i in list(groupindex.items()):
        indexgroup[i] = k
    return _sre.compile(pattern, flags | p.pattern.flags, code,
                        p.pattern.groups - 1, groupindex, indexgroup)
def _compile_hook(pattern: str, flags: int) -> "AtherisPatternProxy":
    """Overrides re._compile.

    Generates (and caches, per pattern) a string that matches `pattern`,
    then returns the compiled pattern wrapped in an AtherisPatternProxy
    carrying that generated example.
    """
    generated = ""
    if pattern not in pattern_gen_map:
        pat = sre_parse.parse(pattern)
        generated = gen_match(pat)

        # Check that the pattern actually matches
        check_pattern = pattern
        try:
            # Convert our pattern to a string if necessary
            check_pattern = pattern.decode("utf-8")  # type: ignore
        except AttributeError:
            # Already a string
            pass
        except Exception as e:  # pylint: disable=broad-except
            # Not sure what went wrong.
            sys.stderr.write(f"Could not convert the pattern {pattern} to a " +
                             f"utf-8 string: {e}\n")

        try:
            if original_compile_func(check_pattern, flags).search(generated) is None:
                # Bug fix: the second fragment was a plain string containing
                # the undefined placeholder `{_pattern}`; interpolate the
                # actual `pattern` via an f-string.
                sys.stderr.write(f"ERROR: generated match '{generated}' did not " +
                                 f"match the RegEx pattern '{pattern}'!\n")
        except Exception as e:  # pylint: disable=broad-except
            sys.stderr.write("Could not check the generated match against the " +
                             f"RegEx pattern: {e}\n")
        pattern_gen_map[pattern] = generated
    else:
        generated = pattern_gen_map[pattern]

    # Create the `re.Pattern` object. We will wrap this in a proxy later on.
    re_object = original_compile_func(pattern, flags)

    # Return the wrapped `re.Pattern` object.
    return AtherisPatternProxy(re_object, generated)
def _match_pattern(
    compiled_regex: re.Pattern,
    pattern: str,
    orig_smtstr: SymbolicStr,
    pos: int,
    endpos: Optional[int] = None,
) -> Optional[_Match]:
    """Symbolically match `pattern` against `orig_smtstr` starting at `pos`.

    Returns a project _Match, or None when no match is possible.
    """
    if pos == 0:
        # Remove some meaningless empty matchers for match/fullmatch:
        pattern = pattern.lstrip("^")
        while pattern.startswith(r"\A"):
            pattern = pattern[2:]
    space = orig_smtstr.statespace
    parsed_pattern = parse(pattern, compiled_regex.flags)
    smtstr = _slice_tail(orig_smtstr, endpos)
    matchpart = _internal_match_patterns(
        space, parsed_pattern, compiled_regex.flags, smtstr, pos
    )
    if matchpart is None:
        return None
    return _Match(matchpart._groups, pos, endpos, compiled_regex, orig_smtstr)
def expand_sub(string, template, debug=0, mode='all'):
    """
    Given a regular expression and a replacement string, generate expansions
    of the regular expression and for each one return it and its
    transformation as applied by the replacement string.

    string : regular expression to expand
    template : transformation to apply to each regular expression
    mode : can take 3 values
        all : return all possible shortest strings that the regular
              expression would match
        first : return the first string that all would return
        random : return one random string that the regular expression
                 would match
    """
    pattern = sre_parse.parse(string, flags=sre_parse.SRE_FLAG_VERBOSE)
    # `mode` rides along on the pattern object for the project's _iterate.
    pattern.mode = mode
    template = sre_parse.parse_template(template, pattern)
    if debug:
        print(pattern)
        print(template)
    for s in _iterate(pattern, pattern.data, MatchObj(pattern, "")):
        s.patient = 0
        yield (s.string, sre_parse.expand_template(template, s))
def regex_len(regex):
    """
    Returns a tuple of the minimum and maximum possible length string that a
    regex will match. Returns MAXREPEAT if a match can be very or infinitely
    long.

    >>> regex_len('test')
    (4, 4)
    >>> regex_len('t.st')
    (4, 4)
    >>> regex_len('.*')
    (0, MAXREPEAT)
    >>> regex_len('fo?o')
    (2, 3)
    >>> regex_len('mo{2,7}')
    (3, 8)
    >>> regex_len('(foo)+')
    (3, MAXREPEAT)
    >>> regex_len('s?e?q?u?e?n?c?e?')
    (0, 8)
    """
    # Delegates to the project's recursive walker over the sre parse tree.
    return _regex_len_pattern(parse(regex))
def regex_index(regex, index):
    """
    Index into a regex, returning a smaller regex of the things that match
    in that position.

    >>> regex_index('test', 0)
    't'
    >>> regex_index('t?est', 0)
    '[te]'
    >>> regex_index('fa(la){2,}', 2)
    'l'
    >>> regex_index('fa(la){2,}', 6)
    'l'
    >>> regex_index('.*', 99)
    '.'
    """
    choices = _regex_index_pattern(parse(regex), index)
    if len(choices) == 0:
        # No possible match reaches this position.
        raise IndexError
    elif len(choices) == 1:
        return unparse(choices[0])
    else:
        # Several alternatives: re-serialize as a branch and normalize.
        return round_trip(unparse((BRANCH, (None, choices))))
def compile_regexp_to_noncapturing(pattern, flags=0):
    """
    Compile the regexp pattern after switching all grouping parentheses
    in the given regexp pattern to non-capturing groups.

    :type pattern: str
    :rtype: str
    """
    # NOTE(review): relies on pre-3.8 sre internals (`.pattern` attribute on
    # the parsed SubPattern; renamed `.state` in CPython 3.8).
    def convert_regexp_to_noncapturing_parsed(parsed_pattern):
        res_data = []
        for key, value in parsed_pattern.data:
            if key == sre_constants.SUBPATTERN:
                # Drop the group index so the group no longer captures.
                index, subpattern = value
                value = (None, convert_regexp_to_noncapturing_parsed(subpattern))
            elif key == sre_constants.GROUPREF:
                raise ValueError('Regular expressions with back-references are not supported: {0}'.format(pattern))
            res_data.append((key, value))
        parsed_pattern.data = res_data
        # Erase group bookkeeping entirely.
        parsed_pattern.pattern.groups = 1
        parsed_pattern.pattern.groupdict = {}
        return parsed_pattern
    return sre_compile.compile(convert_regexp_to_noncapturing_parsed(sre_parse.parse(pattern)), flags=flags)
def ranking(item): """ Sorting function """ regex = item['pattern'] + '$' regex_max_width = int(sre_parse.parse(regex).getwidth()[1]) # Capture group should not impact length l = re.sub(r'[()]', '', item['pattern']) # two character specifier should not impact length l = re.sub(r'\\(\w)', '\1', l) length = len(l) # "\d" and "\w" placed before "." a = re.sub(r'[\\]', u"\U0010FFFD", item['pattern']) # "atf" before "(atf)" with "at." last a = re.sub(r'[(]', '', a) # Make sure regex symbols after letters or numbers alphabetical = re.sub(r'[.?*)]', u"\U0010FFFF", a) # patterns with infinite wildcards like \d+ or .* if regex_max_width >= int(sre_constants.MAXREPEAT): # in this case longer string more specific length = -length return (regex_max_width, length, alphabetical)
def compile(p, flags=0):
    # internal: convert pattern list to internal format.
    # Accepts either pattern source or an already-parsed SubPattern.
    if isstring(p):
        source = p
        p = sre_parse.parse(p, flags)
    else:
        source = None
    code = _code(p, flags)
    if flags & SRE_FLAG_DEBUG:
        print()
        dis(code)
    # Build the two-way mapping between group names and indices.
    state = p.state
    groupindex = state.groupdict
    indexgroup = [None] * state.groups
    for name, number in groupindex.items():
        indexgroup[number] = name
    return _sre.compile(source, flags | state.flags, code,
                        state.groups - 1, groupindex, tuple(indexgroup))
def transform_format_string_into_regex(self):
    # 3. Convert the mangled format string into a regex object
    # Transforming our format string into a regular expression,
    # substituting {{ ... }} with regex named groups, so that param_stream
    # matched against this expression yields a dict of params with values.
    # \1 keeps the leading whitespace; \2 is the parameter name.
    param_match = r'\1["\']?(?P<\2>(?:(?<=\').+?(?=\')|(?<=").+?(?=")|{.+?}|.+?))["\']?'
    reg = re.sub(r'(\s*)' + self._snippets['optional'],
                 r'(?:' + param_match + r')?',
                 self._format)
    reg = re.sub(r'(\s*)' + self._snippets['required'],
                 param_match,
                 reg)
    # Inspect the parsed token stream to decide whether anchors are needed.
    reg_tokens = parse(reg, flags=re.DOTALL)
    # Add a beginning anchor if none exists
    if not search_regex_tokens(
            ((AT, AT_BEGINNING), (AT, AT_BEGINNING_STRING)),
            reg_tokens):
        reg = r'^\s*' + reg
    # Add an ending anchor if none exists
    if not search_regex_tokens(
            ((AT, AT_END), (AT, AT_END_STRING)),
            reg_tokens, backwards=True):
        reg = reg + r'\s*$'
    return re.compile(reg, re.DOTALL)
def group_names(expression): """Returns the names of groups in the regular expression Arguments --------- expression : str The regular expression. Returns ------- names : list of str The group names in the regular expression sorted according to appearance. """ #parse regular expression for name expressions p = sre.parse(expression) gd = p.pattern.groupdict names = list(gd.keys()) #sort according to appearance order = np.argsort(list(gd.values())) names = [names[o] for o in order] return names
def regex(regex):
    """Return strategy that generates strings that match given regex.

    Regex can be either a string or compiled regex (through `re.compile()`).

    You can use regex flags (such as `re.IGNORECASE`, `re.DOTALL` or
    `re.UNICODE`) to control generation. Flags can be passed either in
    compiled regex (specify flags in call to `re.compile()`) or inside
    pattern with (?iLmsux) group.

    Some tricky regular expressions are partly supported or not supported
    at all. "^" and "$" do not affect generation. Positive
    lookahead/lookbehind groups are considered normal groups. Negative
    lookahead/lookbehind groups do not do anything. Ternary regex groups
    ('(?(name)yes-pattern|no-pattern)') are not supported at all.
    """
    if not hasattr(regex, 'pattern'):
        regex = re.compile(regex)
    pattern = regex.pattern
    flags = regex.flags
    codes = sre.parse(pattern)
    # Final .filter guards against over-generation by the strategy.
    return _strategy(codes, Context(flags=flags)).filter(regex.match)
def __init__(self, pattern, flags=0, charset=CHARSET, max_count=None):
    """Parse *pattern* and wire up the per-opcode value generators.

    Raises ParseError for the i/u/l flags, which this generator cannot
    honor; any pattern the stdlib re module rejects fails immediately.
    """
    # Fail fast: if the RE module cannot compile it, we give up quickly.
    self.matcher = re.compile(r'(?:%s)\Z' % pattern, flags)
    # Without DOTALL, '.' never matches a newline, so remove it from the
    # candidate character set.
    if not flags & re.DOTALL:
        charset = ''.join(c for c in charset if c != '\n')
    self.charset = charset
    self.named_group_lookup = self.matcher.groupindex
    # Checked in order; only the first unsupported flag present is reported
    # (mirrors the original if/elif chain).
    unsupported_flags = (
        (re.IGNORECASE,
         'Flag "i" not supported. https://github.com/google/sre_yield/issues/4'),
        (re.UNICODE,
         'Flag "u" not supported. https://github.com/google/sre_yield/issues/3'),
        (re.LOCALE,
         'Flag "l" not supported. https://github.com/google/sre_yield/issues/5'),
    )
    for flag, message in unsupported_flags:
        if flags & flag:
            raise ParseError(message)
    self.max_count = MAX_REPEAT_COUNT if max_count is None else max_count
    self.has_groupref = False

    # Configure the parser backends: one handler per sre opcode.
    self.backends = {
        sre_constants.LITERAL: lambda y: [chr(y)],
        sre_constants.RANGE: lambda l, h: [chr(c) for c in xrange(l, h + 1)],
        sre_constants.SUBPATTERN: self.maybe_save,
        sre_constants.BRANCH: self.branch_values,
        sre_constants.MIN_REPEAT: self.max_repeat_values,
        sre_constants.MAX_REPEAT: self.max_repeat_values,
        # Anchors and lookaround assertions contribute no characters.
        sre_constants.AT: self.empty_list,
        sre_constants.ASSERT: self.empty_list,
        sre_constants.ASSERT_NOT: self.empty_list,
        # '.' is treated as a negated-nothing character class.
        sre_constants.ANY:
            lambda _: self.in_values(((sre_constants.NEGATE, ), )),
        sre_constants.IN: self.in_values,
        sre_constants.NOT_LITERAL: self.not_literal,
        sre_constants.CATEGORY: self.category,
        sre_constants.GROUPREF: self.groupref,
    }
    # Now build a generator that knows all possible patterns.
    parsed = sre_parse.parse(pattern, flags)
    self.raw = self.sub_values(parsed)
    # Configure this class instance to know about that result.
    self.length = self.raw.__len__()
def max_width(self):
    """Upper bound (in characters) on what this terminal's regexp can match."""
    tree = sre_parse.parse(self.to_regexp())
    # getwidth() returns (min_width, max_width); only the maximum matters here.
    _, widest = tree.getwidth()
    return widest
def get_extracted_param_value(self):
    """
    Match command against the format string and extract paramters
    from the command string.

    :rtype: ``dict``
    """
    result = {}
    param_stream = self._param_stream
    # As there's a lot of questions about using regular expressions,
    # I'll try to be thorough when documenting this code.
    # I'll split the whole convoluted regex into snippets to make it
    # a bit more readable (hopefully).
    snippets = dict()
    # Formats for keys and values: key is a non-spaced string,
    # value is anything in quotes or curly braces, or a single word.
    snippets['key'] = r'\s*(\S+?)\s*'
    snippets['value'] = r'""|\'\'|"(.+?)"|\'(.+?)\'|({.+?})|(\S+)'
    # Extended value: also matches unquoted text (caution).
    snippets['ext_value'] = r'""|\'\'|"(.+?)"|\'(.+?)\'|({.+?})|(.+?)'
    # Key-value pair:
    snippets['pairs'] = r'(?:^|\s+){key}=({value})'.format(**snippets)
    # End of string: multiple space-separated key-value pairs:
    snippets['ending'] = r'.*?(({pairs}\s*)*)$'.format(**snippets)
    # Default value in optional parameters:
    snippets['default'] = r'\s*=\s*(?:{ext_value})\s*'.format(**snippets)
    # Optional parameter (has a default value):
    snippets[
        'optional'] = '{{' + snippets['key'] + snippets['default'] + '}}'
    # Required parameter (no default value):
    snippets['required'] = '{{' + snippets['key'] + '}}'
    # 1. Matching the arbitrary key-value pairs at the end of the command
    # to support extra parameters (not specified in the format string),
    # and cutting them from the command string afterwards.
    ending_pairs = re.match(snippets['ending'], param_stream, re.DOTALL)
    has_ending_pairs = ending_pairs and ending_pairs.group(1)
    if has_ending_pairs:
        kv_pairs = re.findall(snippets['pairs'],
                              ending_pairs.group(1), re.DOTALL)
        param_stream = param_stream.replace(ending_pairs.group(1), '')
    # Pad with spaces so the '(\s*)' prefixes in the generated regex can
    # always match, even at the very start/end of the stream.
    param_stream = " %s " % (param_stream)
    # 2. Matching optional parameters (with default values).
    optional = re.findall(snippets['optional'], self._format, re.DOTALL)
    # Transforming our format string into a regular expression,
    # substituting {{ ... }} with regex named groups, so that param_stream
    # matched against this expression yields a dict of params with values.
    param_match = r'\1["\']?(?P<\2>(?:(?<=\').+?(?=\')|(?<=").+?(?=")|{.+?}|.+?))["\']?'
    reg = re.sub(r'(\s*)' + snippets['optional'],
                 r'(?:' + param_match + r')?',
                 self._format)
    reg = re.sub(r'(\s*)' + snippets['required'], param_match, reg)
    # Inspect the parsed token stream to see whether the user's format
    # string already anchors the pattern at either end.
    reg_tokens = parse(reg, flags=re.DOTALL)
    # Add a beginning anchor if none exists
    if not search_regex_tokens(
            ((AT, AT_BEGINNING), (AT, AT_BEGINNING_STRING)), reg_tokens):
        reg = r'^\s*' + reg
    # Add an ending anchor if none exists
    if not search_regex_tokens(
            ((AT, AT_END), (AT, AT_END_STRING)), reg_tokens, backwards=True):
        reg = reg + r'\s*$'
    # 3. Matching the command against our regex to get the param values
    matched_stream = re.match(reg, param_stream, re.DOTALL)
    if not matched_stream:
        # If no match is found we throw since this indicates provided user string (command)
        # didn't match the provided format string
        raise ParseException(
            'Command "%s" doesn\'t match format string "%s"' %
            (self._param_stream, self._format))
    # Compiling results from the steps 1-3.
    if matched_stream:
        result = matched_stream.groupdict()
    # Fill in defaults for optional params that did not capture a value.
    # NOTE(review): ''.join(param[1:]) collapses the alternated capture
    # groups of the 'default' snippet (at most one is non-empty).
    for param in optional:
        matched_value = result[param[0]] if matched_stream else None
        matched_result = matched_value or ''.join(param[1:])
        if matched_result is not None:
            result[param[0]] = matched_result
    # Merge in the trailing key=value pairs stripped off in step 1.
    if has_ending_pairs:
        for pair in kv_pairs:
            result[pair[0]] = ''.join(pair[2:])
    if self._format and not (self._param_stream.strip() or
                             any(result.values())):
        raise ParseException(
            'No value supplied and no default value found.')
    return result
def re_compile(expr):
    """NOT_RPYTHON"""
    # NOTE: the docstring above is a PyPy translator marker; keep it as-is.
    # Parse once, then feed the tree to the (private) sre bytecode compiler.
    tree = sre_parse.parse(expr)
    compiled_code = sre_compile._code(tree, 0)
    group_count = tree.pattern.groups
    return (compiled_code, group_count)
def get_regexp_width(regexp):
    """Return the (min, max) match width of ``regexp`` in characters.

    :param regexp: a regular expression string.
    :returns: tuple ``(min_width, max_width)`` as reported by
        ``sre_parse``; ``max_width`` can be ``sre_constants.MAXREPEAT``
        for unbounded patterns such as ``'a+'``.
    :raises ValueError: if ``regexp`` is not a valid regular expression.
    """
    try:
        return sre_parse.parse(regexp).getwidth()
    except sre_constants.error as e:
        # Chain the underlying sre error so the actual syntax problem
        # isn't silently discarded (the old code raised a bare ValueError
        # carrying only the pattern text).
        raise ValueError(regexp) from e
def base_regex_strategy(regex, parsed=None):
    """Build the raw generation strategy for *regex*.

    Parses the pattern lazily when *parsed* is not supplied, then wraps
    the resulting strategy so caches are cleared after each draw.
    """
    if parsed is None:
        parsed = sre_parse.parse(regex.pattern, flags=regex.flags)
    # str pattern => generate text; bytes pattern => generate binary.
    wants_text = isinstance(regex.pattern, str)
    strategy = _strategy(parsed, Context(flags=regex.flags), wants_text)
    return clear_cache_after_draw(strategy)
def PrintRegex(pat):
    """Parse *pat* with sre_parse and pretty-print its tree, bracketed."""
    parsed = sre_parse.parse(pat)
    print('\t\t[')
    PrintTree(parsed)
    print('\t\t]')