def __init__(self, lexicon, flags=0): """ Create a parser from a given lexicon. :param lexicon: Lexicon in the form of list, each entry with: (<regex>, lambda scanner, token: Tok(<kind>, <value>))) :param flags: Extra flags for parsing. """ import sre_parse import sre_compile from sre_constants import BRANCH, SUBPATTERN self.lexicon = lexicon # combine phrases into a compound pattern p = [] s = sre_parse.Pattern() s.flags = flags for phrase, action in lexicon: p.append( sre_parse.SubPattern(s, [ (SUBPATTERN, (len(p) + 1, sre_parse.parse(phrase, flags))), ])) s.groups = len(p) + 1 p = sre_parse.SubPattern(s, [(BRANCH, (None, p))]) self.scanner = sre_compile.compile(p, re.MULTILINE)
def __init__(self, lexicon, flags=FLAGS):
    self.actions = [None]
    # combine phrases into a compound pattern
    s = sre_parse.Pattern()
    s.flags = flags
    p = []
    # NOTE(kgibbs): These lines must be added to make this file work under
    # Python 2.2, which is commonly used at Google.
    def enumerate(obj):
        i = -1
        for item in obj:
            i += 1
            yield i, item
    # NOTE(kgibbs): End changes.
    for idx, token in enumerate(lexicon):
        phrase = token.pattern
        try:
            subpattern = sre_parse.SubPattern(
                s, [(SUBPATTERN, (idx + 1, sre_parse.parse(phrase, flags)))])
        except sre_constants.error:
            raise
        p.append(subpattern)
        self.actions.append(token)
    s.groups = len(p) + 1  # NOTE(guido): Added to make SRE validation work
    p = sre_parse.SubPattern(s, [(BRANCH, (None, p))])
    self.scanner = sre_compile.compile(p)
def record_matches(split, text):
    """Record which clauses match."""
    matches = frozenset()
    for clause in split:
        p = sre_parse.Pattern()
        scanner = sre_compile.compile(
            sre_parse.SubPattern(p, clause)).scanner(text)
        if scanner.match():
            matches |= {tuple(clause)}
    return matches
def __init__(self, lexicon, flags=0):
    from sre_constants import BRANCH, SUBPATTERN
    self.lexicon = lexicon
    p = []
    s = sre_parse.Pattern()
    s.flags = flags
    for phrase, action in lexicon:
        p.append(sre_parse.SubPattern(s, [
            (SUBPATTERN, (len(p) + 1, sre_parse.parse(phrase, flags))),
        ]))
    s.groups = len(p) + 1
    p = sre_parse.SubPattern(s, [(BRANCH, (None, p))])
    self.scanner = sre_compile.compile(p)
def __init__(self, lexicon, flags=0):
    self.lexicon = lexicon
    # combine phrases into a compound pattern
    p = []
    s = sre_parse.Pattern()
    s.flags = flags
    for phrase, action in lexicon:
        p.append(sre_parse.SubPattern(s, [
            (SUBPATTERN, (len(p) + 1, sre_parse.parse(phrase, flags))),
        ]))
    s.groups = len(p) + 1
    p = sre_parse.SubPattern(s, [(BRANCH, (None, p))])
    self.scanner = sre_compile.compile(p)
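# For context (hedged: based on CPython sources), the __init__ above matches
# the undocumented re.Scanner class from older CPython (roughly 2.x through
# 3.5); later versions switched to opengroup()/closegroup(), as in the
# variants below.  Using the stdlib class directly looks like this:
import re

scanner = re.Scanner([
    (r'\d+', lambda s, tok: ('NUMBER', int(tok))),
    (r'[a-zA-Z_]\w*', lambda s, tok: ('NAME', tok)),
    (r'[=+\-*/]', lambda s, tok: ('OP', tok)),
    (r'\s+', None),  # None action: match and discard
])
tokens, remainder = scanner.scan('x = 42')
# tokens == [('NAME', 'x'), ('OP', '='), ('NUMBER', 42)], remainder == ''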
def __init__(self, lexicon):
    from sre_constants import BRANCH, SUBPATTERN
    self.lexicon = lexicon
    # combine phrases into a compound pattern
    p = []
    s = sre_parse.Pattern()
    for phrase, action in lexicon:
        # group numbers are 1-based (group 0 is the whole match), so the
        # first lexicon entry must become group 1, not group 0
        p.append(sre_parse.SubPattern(s, [
            (SUBPATTERN, (len(p) + 1, sre_parse.parse(phrase))),
        ]))
    # declare the group count before p is rebound to the BRANCH wrapper,
    # otherwise len(p) would be 1 and SRE validation would fail
    s.groups = len(p) + 1
    p = sre_parse.SubPattern(s, [(BRANCH, (None, p))])
    self.scanner = sre_compile.compile(p)
def __init__(self, regexp, negative=False, **property_names):
    """
    Create a new C{RegexpTokenizer} from a given regular expression.

    @type regexp: C{string} or C{SRE_Pattern}
    @param regexp: The regular expression used to tokenize texts.
        Unless C{negative} is true, this regular expression specifies
        the form of a single word type; so the list of tokens generated
        by tokenization includes all non-overlapping substrings that
        match C{regexp}.
    @type negative: C{boolean}
    @param negative: An optional parameter that inverts the meaning of
        C{regexp}.  In particular, if C{negative} is true, then C{regexp}
        is taken to specify the form of word separators (and not word
        types); so the list of tokens generated by tokenization includes
        all substrings that occur I{between} matches of the regular
        expression.
    @type property_names: C{dict}
    @param property_names: A dictionary that can be used to override the
        default property names.  Each entry maps from a default property
        name to a new property name.
    """
    assert chktype(1, regexp, str)
    AbstractTokenizer.__init__(self, **property_names)
    if hasattr(regexp, 'pattern'):
        regexp = regexp.pattern
    self._negative = bool(negative)

    # Replace any grouping parentheses with non-grouping ones.  We
    # need to do this, because the list returned by re.split will
    # contain an element corresponding to every set of grouping
    # parentheses.  We must not touch escaped parentheses, and
    # need to handle the case of escaped escapes (e.g. "\\(").
    # We also need to handle nested parentheses, which means our
    # regexp contexts must be zero-width.  There are also issues with
    # parentheses appearing in bracketed contexts, hence we've
    # operated on the intermediate parse structure from sre_parse.
    parsed = sre_parse.parse(regexp)
    parsed = _remove_group_identifiers(parsed)

    # Add grouping parentheses around the regexp; this will allow
    # us to access the material that was split on.
    # Need to set the Pattern to expect a single group.
    pattern = sre_parse.Pattern()
    pattern.groups += 1
    grouped = sre_parse.SubPattern(pattern)
    grouped.append((sre_constants.SUBPATTERN, (1, parsed)))

    self._regexp = sre_compile.compile(grouped, re.UNICODE)
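# _remove_group_identifiers is referenced above but not defined in this
# snippet.  A hypothetical sketch of what such a helper could look like
# follows (the real NLTK implementation may differ); it assumes the older
# sre_parse where SUBPATTERN carries a (group, item) 2-tuple, not the
# 4-tuple used since Python 3.6.
import sre_constants
import sre_parse

def _remove_group_identifiers(node):
    # Blank out SUBPATTERN group numbers so every capturing group
    # behaves like a non-capturing (?:...) group.
    if isinstance(node, sre_parse.SubPattern):
        node.data = [_remove_group_identifiers(item) for item in node.data]
        return node
    if isinstance(node, tuple):
        if node[0] is sre_constants.SUBPATTERN:
            group, item = node[1]  # older 2-tuple form (assumption)
            return (node[0], (None, _remove_group_identifiers(item)))
        # recurse elementwise; opcodes and literals pass through unchanged
        return tuple(_remove_group_identifiers(item) for item in node)
    if isinstance(node, list):
        return [_remove_group_identifiers(item) for item in node]
    return node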
def __init__(self, lexicon, flags=0):
    from sre_constants import BRANCH, SUBPATTERN
    self.lexicon = lexicon
    # combine phrases into a compound pattern
    p = []
    s = sre_parse.Pattern()
    s.flags = flags
    for phrase, action in lexicon:
        gid = s.opengroup()
        # since Python 3.6, SUBPATTERN carries a 4-tuple:
        # (group, add_flags, del_flags, subpattern)
        p.append(sre_parse.SubPattern(s, [
            (SUBPATTERN, (gid, 0, 0, sre_parse.parse(phrase, flags))),
        ]))
        s.closegroup(gid, p[-1])
    p = sre_parse.SubPattern(s, [(BRANCH, (None, p))])
    self.scanner = sre_compile.compile(p)
def _get_group_pattern(self, flags):
    # combine phrases into a compound pattern
    patterns = []
    sub_pattern = sre_parse.Pattern()
    sub_pattern.flags = flags
    for phrase, action in self.lexicon:
        patterns.append(sre_parse.SubPattern(sub_pattern, [
            (SUBPATTERN, (len(patterns) + 1, sre_parse.parse(phrase, flags))),
        ]))
    # sub_pattern.groups = len(patterns) + 1
    group_pattern = sre_parse.SubPattern(
        sub_pattern, [(BRANCH, (None, patterns))])
    return sre_compile.compile(group_pattern)
def _compile(regexp):
    parsed = sre_parse.parse(regexp)
    parsed = _remove_group_identifiers(parsed)

    # Add grouping parentheses around the regexp; this will allow
    # us to access the material that was split on.
    # Need to set the Pattern to expect a single group.
    pattern = sre_parse.Pattern()
    pattern.groups += 1
    grouped = sre_parse.SubPattern(pattern)
    grouped.append((sre_constants.SUBPATTERN, (1, parsed)))

    return sre_compile.compile(
        grouped, re.UNICODE | re.MULTILINE | re.DOTALL)
def _scan_init(self):
    p = []
    try:
        state = sre_parse.State()
    except AttributeError:  # Python < 3.8.0
        state = sre_parse.Pattern()
    for name, regx in self.LEXICON:
        group = state.opengroup(name)
        re = sre_parse.parse(regx)
        data = [(SUBPATTERN, (group, 0, 0, re))]
        p.append(sre_parse.SubPattern(state, data))
        state.closegroup(group, p[-1])
    data = [(BRANCH, (None, p))]
    return state, data
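# Usage sketch (hedged: LEXICON, like the class around _scan_init, is an
# assumption).  Compiling a (state, data) pair built this way yields a
# scanner whose matches report the lexicon entry through m.lastgroup,
# because opengroup(name) registers a named group per entry:
import sre_compile
import sre_parse
from sre_constants import BRANCH, SUBPATTERN

LEXICON = [('NUMBER', r'\d+'), ('NAME', r'[a-zA-Z_]\w*')]

try:
    state = sre_parse.State()
except AttributeError:  # Python < 3.8.0
    state = sre_parse.Pattern()
p = []
for name, regx in LEXICON:
    group = state.opengroup(name)
    p.append(sre_parse.SubPattern(
        state, [(SUBPATTERN, (group, 0, 0, sre_parse.parse(regx)))]))
    state.closegroup(group, p[-1])
pattern = sre_parse.SubPattern(state, [(BRANCH, (None, p))])
m = sre_compile.compile(pattern).scanner('foo42').match()
print(m.lastgroup, m.group())  # -> NAME foo42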
# (Python 2 snippet: note the print statements and comma-style raise.)
def __init__(self, lexicon, flags=(VERBOSE | MULTILINE | DOTALL), verify=True):
    self.actions = [None]
    # combine phrases into a compound pattern
    s = sre_parse.Pattern()
    s.flags = flags
    p = []
    for idx, token in enumerate(lexicon):
        phrase = token.pattern
        try:
            subpattern = sre_parse.SubPattern(
                s, [(SUBPATTERN, (idx + 1, sre_parse.parse(phrase, flags)))])
        except sre_constants.error:
            print "Can't parse %s" % (token.__name__,)
            raise
        token.regex = re.compile(phrase, flags)
        p.append(subpattern)
        self.actions.append(token)
    p = sre_parse.SubPattern(s, [(BRANCH, (None, p))])
    self.scanner = sre_compile.compile(p)
    if verify:
        for token in lexicon:
            example = token.example
            if example is None:
                continue

            def dead(string, i, j):
                print token.__name__, i, j
                print '--- PATTERN ---'
                print token.pattern
                print '--- PARSED EXAMPLE ---'
                print string[:i]
                print '--- UNMATCHED CHUNK ---'
                print repr(string[i:j])
                raise ValueError, "Token %s can not be verified" % token.__name__

            s = Scanner([token, InsignificantWhitespace], verify=False)
            try:
                for m in s.iterscan(example, dead=dead):
                    pass
            except:
                print example
                raise
def build_scanner(lexicon, flags=0):
    import sre_parse
    import sre_compile
    from sre_constants import BRANCH, SUBPATTERN

    # combine phrases into a compound pattern
    p = []
    s = sre_parse.Pattern()
    s.flags = flags
    for phrase, action in lexicon:
        p.append(sre_parse.SubPattern(s, [
            (SUBPATTERN, (len(p) + 1, sre_parse.parse(phrase, flags))),
        ]))
    s.groups = len(p) + 1
    p = sre_parse.SubPattern(s, [(BRANCH, (None, p))])
    scanner = sre_compile.compile(p)
    return scanner
def __init__(self, lexicon, flags=FLAGS):
    self.actions = [None]
    # combine phrases into a compound pattern
    s = sre_parse.Pattern()
    s.flags = flags
    p = []
    for idx, token in enumerate(lexicon):
        phrase = token.pattern
        try:
            subpattern = sre_parse.SubPattern(
                s, [(SUBPATTERN, (idx + 1, sre_parse.parse(phrase, flags)))])
        except sre_constants.error:
            raise
        p.append(subpattern)
        self.actions.append(token)
    p = sre_parse.SubPattern(s, [(BRANCH, (None, p))])
    self.scanner = sre_compile.compile(p)
def __init__(self, lexicon, flags=0):
    from sre_constants import BRANCH, SUBPATTERN
    if isinstance(flags, RegexFlag):
        flags = flags.value
    self.lexicon = lexicon
    # combine phrases into a compound pattern
    p = []
    s = sre_parse.Pattern()
    s.flags = flags
    for phrase, action in lexicon:
        gid = s.opengroup()
        p.append(sre_parse.SubPattern(s, [
            (SUBPATTERN, (gid, 0, 0, sre_parse.parse(phrase, flags))),
        ]))
        s.closegroup(gid, p[-1])
    p = sre_parse.SubPattern(s, [(BRANCH, (None, p))])
    self.scanner = sre_compile.compile(p)
def make_scanner(lexicon, flags=FLAGS):
    actions = [None]
    # Combine phrases into a compound pattern
    s = sre_parse.Pattern()
    s.flags = flags
    charpatterns = {}
    p = []
    idx = 0
    for token in lexicon:
        if token.pattern in (r'\[', r'{', r'"'):
            charpatterns[token.pattern[-1]] = token
        idx += 1
        phrase = token.pattern
        try:
            subpattern = sre_parse.SubPattern(
                s, [(SUBPATTERN, (idx, sre_parse.parse(phrase, flags)))])
        except sre_constants.error:
            raise
        p.append(subpattern)
        actions.append(token)
    s.groups = len(p) + 1  # NOTE(guido): Added to make SRE validation work
    p = sre_parse.SubPattern(s, [(BRANCH, (None, p))])
    scanner = sre_compile.compile(p).scanner

    def _scan_once(string, idx=0, context=None):
        try:
            action = charpatterns[string[idx]]
        except KeyError:
            pass
        except IndexError:
            raise StopIteration
        else:
            return action((string, idx + 1), context)
        m = scanner(string, idx).match()
        if m is None or m.end() == idx:
            raise StopIteration
        return actions[m.lastindex](m, context)

    return _scan_once
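# Usage sketch (hedged: FLAGS and the token protocol are assumptions drawn
# from this snippet's context, and sre_parse.Pattern restricts it to
# Python < 3.8).  A token is any callable taking (match, context) that
# carries a .pattern attribute; _scan_once dispatches to it via m.lastindex:
def number(m, context):
    return int(m.group(0))
number.pattern = r'-?\d+'

scan_once = make_scanner([number], flags=0)
print(scan_once('42 is the answer'))  # -> 42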
def test_no_pattern(self):
    import sre_compile, sre_parse
    sre_pattern = sre_compile.compile(
        sre_parse.SubPattern(sre_parse.Pattern()))
    assert sre_pattern.scanner('s') is not None
def as_sre_class(self):
    import sre_parse, sre_compile
    p = sre_parse.Pattern()
    p.flags = sre_parse.SRE_FLAG_UNICODE
    p.str = self.as_re_class()
    return sre_compile.compile(self._as_sre_subpattern(p))
import sre_parse

# This is the string tokenizer.PseudoToken:
pattern = '[ \\f\\t]*((\\\\\\r?\\n|\\Z|#[^\\r\\n]*|([uUbB]?[rR]?\'\'\'|[uUbB]?[rR]?"""))|((\\d+[jJ]|((\\d+\\.\\d*|\\.\\d+)([eE][-+]?\\d+)?|\\d+[eE][-+]?\\d+)[jJ])|((\\d+\\.\\d*|\\.\\d+)([eE][-+]?\\d+)?|\\d+[eE][-+]?\\d+)|(0[xX][\\da-fA-F]+[lL]?|0[bB][01]+[lL]?|(0[oO][0-7]+)|(0[0-7]*)[lL]?|[1-9]\\d*[lL]?))|((\\*\\*=?|>>=?|<<=?|<>|!=|//=?|[+\\-*/%&|^=<>]=?|~)|[][(){}]|(\\r?\\n|[:;.,`@]))|([uUbB]?[rR]?\'[^\\n\'\\\\]*(?:\\\\.[^\\n\'\\\\]*)*(\'|\\\\\\r?\\n)|[uUbB]?[rR]?"[^\\n"\\\\]*(?:\\\\.[^\\n"\\\\]*)*("|\\\\\\r?\\n))|[a-zA-Z_]\\w*)'

# Python 2 benchmark loop (xrange; use range on Python 3): re-parse the
# pattern 600 times.
for i in xrange(600):
    p = sre_parse.Pattern()
    p.flags = 0
    p.str = pattern
    sre_parse._parse_sub(sre_parse.Tokenizer(pattern), p, 0)