class WordStart(_PositionToken):
    r"""Zero-width token that succeeds only at the start of a word.

    A position qualifies when the next character is one of ``wordChars``
    (default= ``printables``) and either the previous character is not one
    of ``wordChars`` or the ``^`` anchor of the compiled pattern matches.
    To emulate the ``\b`` behavior of regular expressions, use
    ``WordStart(alphanums)``.

    No input is consumed: a successful parse yields an empty result that
    starts and ends at the current position.
    """

    __slots__ = []
    Config = append_config(_PositionToken, "word_chars")

    def __init__(self, wordChars=printables):
        """
        :param wordChars: characters that are considered part of a word
        """
        super(WordStart, self).__init__()
        # lookbehind part: exactly one character that is NOT a word character
        before = CharsNotIn(wordChars, exact=1).__regex__()[1]
        # lookahead part: one character that IS a word character
        after = Char(wordChars).__regex__()[1]
        pattern = f"(?:(?<={before})|^)(?={after})"
        self.set_config(
            regex=regex_compile(pattern),
            word_chars="".join(sorted(set(wordChars))),
        )
        self.streamlined = True

    def parseImpl(self, string, start, doActions=True):
        """Return an empty, zero-width result at ``start`` or raise ParseException."""
        if not self.parser_config.regex.match(string, start):
            raise ParseException(self, start, string)
        return ParseResults(self, start, start, [])

    def min_length(self):
        """A position token never consumes input."""
        return 0

    def __regex__(self):
        """Return the ``(precedence, pattern)`` pair for this token."""
        pattern = self.parser_config.regex.pattern
        return "+", pattern
class Optional(Many):
    """Match the given expression zero or one time.

    Parameters:
     - expr - expression that may be present at most once
     - default (optional) - value to be returned if the optional expression
       is not found.
    """

    __slots__ = []
    Config = append_config(Many, "defaultValue")

    def __init__(self, expr, default=None):
        Many.__init__(self, expr, stopOn=None, min_match=0, max_match=1)
        # the default is stored list-wrapped so it can be used directly as
        # the token list of the fallback result
        self.set_config(defaultValue=listwrap(default))

    def parseImpl(self, string, start, doActions=True):
        """Parse ``expr``; on failure return a zero-width result holding the default."""
        try:
            inner = self.expr._parse(string, start, doActions)
            return ParseResults(self, inner.start, inner.end, [inner])
        except ParseException:
            fallback = self.parser_config.defaultValue
            return ParseResults(self, start, start, fallback)

    def __str__(self):
        name = self.parser_name
        if name:
            return name
        return f"[{text(self.expr)}]"
class Token(ParserElement):
    """Base class for the token parser elements in this module.

    Declares the ``match`` and ``regex`` configuration entries on top of
    those of ``ParserElement``.
    """

    __slots__ = []
    Config = append_config(ParserElement, "match", "regex")

    def __init__(self):
        super(Token, self).__init__()
        # instances are flagged as already streamlined at construction time
        self.streamlined = True
class Char(Token):
    """Token that matches exactly one character out of a given character set."""

    __slots__ = []
    Config = append_config(Token, "charset")

    def __init__(self, charset, asKeyword=False, excludeChars=None):
        r"""
        Represent one character in a given charset

        :param charset: the characters that are allowed to match
        :param asKeyword: if True, wrap the pattern in ``\b`` word boundaries
        :param excludeChars: characters to remove from ``charset``
        """
        Token.__init__(self)
        if excludeChars:
            charset = set(charset) - set(excludeChars)
        regex = regex_range(charset)
        if asKeyword:
            # FIX: the original formatted `self` here, whose __str__ reads the
            # regex config that has not been set yet; the character-class
            # pattern is what must be wrapped in word boundaries.
            regex = r"\b%s\b" % regex
        self.set_config(
            regex=regex_compile(regex),
            charset="".join(sorted(set(charset))),
        )

    def parseImpl(self, string, start, doActions=True):
        """Return a one-token result for the matched character, or raise ParseException."""
        found = self.parser_config.regex.match(string, start)
        if found:
            return ParseResults(self, start, found.end(), [found.group()])
        raise ParseException(self, start, string)

    def min_length(self):
        """Exactly one character is consumed on success."""
        return 1

    def __regex__(self):
        """Return the ``(precedence, pattern)`` pair for this token."""
        return "*", self.parser_config.regex.pattern

    def __str__(self):
        return self.parser_config.regex.pattern
class Keyword(Token):
    """Token to exactly match a specified string as a keyword, that is,
    it must not be immediately followed by an identifier character.

    Compare with `Literal`:

     - ``Literal("if")`` will match the leading ``'if'`` in ``'ifAndOnlyIf'``.
     - ``Keyword("if")`` will not; it will only match the leading ``'if'``
       in ``'if x=1'``, or ``'if(y==2)'``

    Optional constructor arguments, in addition to the keyword string:

     - ``ident_chars`` is a string of characters that would be valid
       identifier characters; when omitted, the engine's ``keyword_chars``
       are used
     - ``caseless`` allows case-insensitive matching; the instance is then
       re-classed as `CaselessKeyword`.
    """

    __slots__ = []
    Config = append_config(Token, "ident_chars")

    def __init__(self, match, ident_chars=None, caseless=None):
        Token.__init__(self)

        if ident_chars is not None:
            # normalize to a sorted string of unique characters
            ident_chars = "".join(sorted(set(ident_chars)))
        else:
            ident_chars = self.engine.keyword_chars

        pattern = regex_caseless(match) if caseless else re.escape(match)
        # the keyword must be followed by end-of-input or a non-identifier char
        not_followed_by_ident = f"($|(?!{regex_range(ident_chars)}))"

        self.set_config(
            ident_chars=ident_chars,
            match=match,
            regex=regex_compile(pattern + not_followed_by_ident),
        )
        self.parser_name = match
        if caseless:
            self.__class__ = CaselessKeyword

    def parseImpl(self, string, start, doActions=True):
        """Return the configured keyword text as the single token, or raise ParseException."""
        found = self.parser_config.regex.match(string, start)
        if not found:
            raise ParseException(self, start, string)
        return ParseResults(self, start, found.end(), [self.parser_config.match])

    def _min_length(self):
        keyword = self.parser_config.match
        return len(keyword)

    def __regex__(self):
        """Return the ``(precedence, pattern)`` pair for this token."""
        pattern = self.parser_config.regex.pattern
        return "+", pattern
class CloseMatch(Token):
    """
    A variation on `Literal` which matches "close" matches,
    that is, strings with at most 'n' mismatching characters.
    `CloseMatch` takes parameters:

     - ``match_string`` - string to be matched
     - ``maxMismatches`` - (``default=1``) maximum number of
       mismatches allowed to count as a match

    The results from a successful parse will contain the matched text
    from the input string and the following named results:

     - ``mismatches`` - a list of the positions within the
       match_string where mismatches were found
     - ``original`` - the original match_string used to compare
       against the input string

    If ``mismatches`` is an empty list, then the match was an exact
    match.
    """

    __slots__ = []
    Config = append_config(Token, "maxMismatches")

    def __init__(self, match_string, maxMismatches=1):
        super(CloseMatch, self).__init__()
        self.parser_name = match_string
        self.set_config(match=match_string, maxMismatches=maxMismatches)

    def parseImpl(self, string, start, doActions=True):
        """Compare the input at ``start`` with the match string, character by character.

        :return: ParseResults holding the matched input text, with the
                 ``original`` and ``mismatches`` named results attached
        :raises ParseException: if the input is too short, or more than
                 ``maxMismatches`` characters differ
        """
        match = self.parser_config.match
        max_mismatches = self.parser_config.maxMismatches

        # FIX: the end of the match is an absolute position in `string`.
        # The original used the offset inside the match string, which is
        # only correct when parsing starts at position 0.
        end = start + len(match)
        if end > len(string):
            raise ParseException(self, start, string)

        mismatches = []
        for offset, (src, mat) in enumerate(zip(string[start:end], match)):
            if src != mat:
                mismatches.append(offset)
                if len(mismatches) > max_mismatches:
                    raise ParseException(self, start, string)

        results = ParseResults(self, start, end, [string[start:end]])
        results["original"] = match
        results["mismatches"] = mismatches
        return results
class Combine(TokenConverter):
    """
    Converter that joins all matching tokens into a single string.

    The joining itself is delegated to the ``_combine`` parse action that is
    appended at construction time.
    """

    # NOTE(review): another ``Combine`` class is defined further down in this
    # file and would shadow this one at import time - confirm which is intended.

    __slots__ = []
    Config = append_config(TokenConverter, "separator")

    def __init__(self, expr, separator=""):
        """
        :param expr: expression whose tokens are concatenated
        :param separator: text placed between the concatenated tokens
        """
        inner = expr.streamline()
        super(Combine, self).__init__(inner)
        self.set_config(separator=separator)
        self.parseAction.append(_combine)
        self.streamlined = True
class WordEnd(_PositionToken):
    r"""Matches if the current position is at the end of a Word, and is
    not followed by any character in a given set of ``wordChars``
    (default= ``printables``). To emulate the ``\b`` behavior of
    regular expressions, use ``WordEnd(alphanums)``. ``WordEnd``
    will also match at the end of the string being parsed.
    """

    __slots__ = []
    Config = append_config(_PositionToken, "word_chars")

    def __init__(self, wordChars=printables):
        """
        :param wordChars: characters that are considered part of a word
        """
        super(WordEnd, self).__init__()
        self.engine = PLAIN_ENGINE
        self.set_config(
            word_chars="".join(sorted(set(wordChars))),
            regex=regex_compile(
                f"(?<={Char(wordChars).__regex__()[1]})({(~Char(wordChars)).__regex__()[1]}|$)"
            ),
        )

    def copy(self):
        """Copy this token, pinning the copy to ``PLAIN_ENGINE`` as well."""
        output = _PositionToken.copy(self)
        output.engine = PLAIN_ENGINE
        return output

    def min_length(self):
        """A position token never consumes input."""
        return 0

    def parseImpl(self, string, start, doActions=True):
        """Return an empty, zero-width result if ``start`` is a word end.

        :raises ParseException: if the character at ``start`` is a word
                 character, or there is no word character right before it
        """
        word_chars = self.parser_config.word_chars
        instrlen = len(string)
        if instrlen > 0 and start < instrlen:
            # FIX: at start == 0 there is no preceding character. The
            # original evaluated string[-1] (the LAST character of the
            # input), which could report a word end at position 0.
            if (
                string[start] in word_chars
                or start == 0
                or string[start - 1] not in word_chars
            ):
                raise ParseException(self, start, string)
        # at or beyond the end of the input this always succeeds
        return ParseResults(self, start, start, [])

    def __regex__(self):
        """Return the ``(precedence, pattern)`` pair for this token."""
        return "+", self.parser_config.regex.pattern
class Combine(TokenConverter):
    """
    Converter that joins all matching tokens into a single string.

    The wrapped expression is parsed and its result is rendered with
    ``asString`` using the configured separator; that string becomes the
    only token of the returned result.
    """

    Config = append_config(TokenConverter, "separator")

    def __init__(self, expr, separator=""):
        """
        :param expr: expression whose tokens are concatenated
        :param separator: text placed between the concatenated tokens
        """
        inner = expr.streamline()
        super(Combine, self).__init__(inner)
        self.set_config(separator=separator)

    def parseImpl(self, string, start, doActions=True):
        """Parse the wrapped expression and collapse its result to one string token."""
        inner = self.expr.parseImpl(string, start, doActions=doActions)
        joined = inner.asString(sep=self.parser_config.separator)
        return ParseResults(self, start, inner.end, [joined])

    def streamline(self):
        """Return a streamlined equivalent of this element.

        ``self`` is returned (and flagged) when the wrapped expression is
        already in streamlined form; otherwise a new ``Combine`` is built
        around the simplified expression.
        """
        if self.streamlined:
            return self

        simplified = self.expr.streamline()
        if simplified is not self.expr:
            return Combine(simplified, self.parser_config.separator)

        self.streamlined = True
        return self

    def expecting(self):
        """Map every key expected by the wrapped expression to ``[self]``."""
        output = OrderedDict()
        for key in self.expr.expecting().keys():
            output[key] = [self]
        return output

    def min_length(self):
        inner = self.expr
        return inner.min_length()

    def __regex__(self):
        inner = self.expr
        return inner.__regex__()
class SkipTo(ParseEnhancement):
    """Token for skipping over all undefined text until the matched expression is found."""

    __slots__ = []
    Config = append_config(ParseEnhancement, "include", "fail", "ignore")

    def __init__(self, expr, include=False, ignore=None, failOn=None):
        """
        :param expr: target expression marking the end of the data to be skipped
        :param include: if True, the target expression is also parsed
               (the skipped text and target expression are returned as a 2-element list).
        :param ignore: used to define grammars (typically quoted strings and
               comments) that might contain false matches to the target expression
        :param failOn: define expressions that are not allowed to be
               included in the skipped test; if found before the target
               expression is found, the SkipTo is not a match
        """
        ParseEnhancement.__init__(self, expr)
        self.set_config(
            include=include, fail=engine.CURRENT.normalize(failOn), ignore=ignore
        )
        self.parser_name = str(self)

    def min_length(self):
        """The target may be found immediately, so nothing has to be skipped."""
        return 0

    def parseImpl(self, string, start, doActions=True):
        """Scan forward from ``start`` until the target expression matches.

        :return: ParseResults whose tokens are the skipped text (when not
                 empty) and, if ``include`` is set, the result of the target
                 expression. Without ``include`` the result ends where the
                 target begins.
        :raises ParseException: if the end of the input is passed without
                 the target expression matching
        """
        instrlen = len(string)
        fail = self.parser_config.fail
        ignore = self.parser_config.ignore

        loc = start
        while loc <= instrlen:
            if fail:
                # break if failOn expression matches
                try:
                    fail._parse(string, loc)
                    before_end = loc
                    break
                except Exception:
                    # FIX: was a bare `except:`, which also swallowed
                    # KeyboardInterrupt and SystemExit. Any ordinary error
                    # still just means "failOn did not match here".
                    pass

            if ignore:
                # advance past ignore expressions
                while 1:
                    try:
                        loc = ignore._parse(string, loc).end
                    except ParseException:
                        break
            try:
                before_end = loc
                # probe only; parse actions run in the `include` re-parse below
                loc = self.expr._parse(string, loc, doActions=False).end
            except ParseException:
                # no match, advance loc in string
                loc += 1
            else:
                # matched skipto expr, done
                break
        else:
            # ran off the end of the input string without matching skipto expr, fail
            raise ParseException(self, start, string)

        # build up return values
        end = loc
        skiptext = string[start:before_end]
        skip_result = []
        if skiptext:
            skip_result.append(skiptext)

        if self.parser_config.include:
            end_result = self.expr._parse(string, before_end, doActions)
            skip_result.append(end_result)
            return ParseResults(self, start, end, skip_result)
        else:
            return ParseResults(self, start, before_end, skip_result)
class MatchAll(ParseExpression):
    """
    Requires all given `ParseExpression` s to be found, but in
    any order. Expressions may be separated by whitespace.

    May be constructed using the ``'&'`` operator.
    """

    __slots__ = []
    Config = append_config(ParseExpression, "min_match", "max_match")

    def __init__(self, exprs):
        """
        :param exprs: The expressions to be matched. For a `Many`, its own
               ``min_match``/``max_match`` are used; every other expression
               must match exactly once.
        """
        super(MatchAll, self).__init__(exprs)
        self.set_config(
            min_match=[
                e.parser_config.min_match if isinstance(e, Many) else 1 for e in exprs
            ],
            max_match=[
                e.parser_config.max_match if isinstance(e, Many) else 1 for e in exprs
            ],
        )

    def streamline(self):
        if self.streamlined:
            return self
        return super(MatchAll, self).streamline()

    def _min_length(self):
        # TODO: MAY BE TOO CONSERVATIVE, WE MAY BE ABLE TO PROVE self CAN CONSUME A CHARACTER
        return min(e.min_length() for e in self.exprs)

    def parseImpl(self, string, start, doActions=True):
        """Match the sub-expressions in whatever order the input presents them.

        A first pass discovers the order in which the expressions match;
        a second pass re-parses them in that order (passing ``doActions``)
        to build the result.

        :raises ParseException: if an expression matched fewer times than its
                 minimum, or a required expression was not found at all
        """
        end = start
        matchOrder = []
        # expressions that may still match, with their (min, max) bounds
        todo = list(zip(
            self.exprs, self.parser_config.min_match, self.parser_config.max_match
        ))
        # count[i] is the number of matches so far for todo[i]
        count = [0] * len(self.exprs)

        while todo:
            for i, (c, (e, mi, ma)) in enumerate(zip(count, todo)):
                try:
                    loc = e._parse(string, end).end
                    if loc == end:
                        # zero-width match makes no progress; try the next one
                        continue
                    end = loc
                    c2 = count[i] = c + 1
                    if c2 >= ma:
                        # reached its maximum, it may not match again
                        del todo[i]
                        del count[i]
                    matchOrder.append(e)
                    # lists may have changed; restart the scan
                    break
                except ParseException:
                    continue
            else:
                # nothing in todo made progress
                break

        for c, (e, mi, ma) in zip(count, todo):
            if c < mi:
                # FIX: arguments were passed as (string, start, message),
                # unlike every other ParseException in this file, which is
                # built as (expr, loc, string, msg=...)
                raise ParseException(
                    self,
                    start,
                    string,
                    msg="Missing minimum (%i) more required elements (%s)" % (mi, e),
                )

        found = set(id(m) for m in matchOrder)
        missing = [
            e
            for e, mi in zip(self.exprs, self.parser_config.min_match)
            if id(e) not in found and mi > 0
        ]
        if missing:
            missing = ", ".join(text(e) for e in missing)
            raise ParseException(
                self,
                start,
                string,
                msg="Missing one or more required elements (%s)" % missing,
            )

        # add any unmatched Optionals, in case they have default values defined
        matchOrder += [e for e in self.exprs if id(e) not in found]

        results = []
        end = start
        for e in matchOrder:
            result = e._parse(string, end, doActions)
            end = result.end
            results.append(result)

        return ParseResults(self, results[0].start, results[-1].end, results)

    def __str__(self):
        if self.parser_name:
            return self.parser_name

        return "{" + " & ".join(text(e) for e in self.exprs) + "}"
class Many(ParseEnhancement):
    """Match an expression repeatedly, between ``min_match`` and ``max_match`` times."""

    __slots__ = []
    Config = append_config(ParseEnhancement, "min_match", "max_match", "end")

    def __init__(self, expr, stopOn=None, min_match=0, max_match=MAX_INT, exact=None):
        """
        MATCH expr SOME NUMBER OF TIMES (OR UNTIL stopOn IS REACHED)
        :param expr: THE EXPRESSION TO MATCH
        :param stopOn: THE PATTERN TO INDICATE STOP MATCHING (NOT REQUIRED IN PATTERN, JUST A QUICK STOP)
        :param min_match: MINIMUM MATCHES REQUIRED FOR SUCCESS (-1 IS INVALID)
        :param max_match: MAXIMUM MATCH REQUIRED FOR SUCCESS (-1 IS INVALID)
        :param exact: IF GIVEN, USED AS BOTH min_match AND max_match
        """
        ParseEnhancement.__init__(self, expr)
        if exact is not None:
            min_match = exact
            max_match = exact

        self.set_config(min_match=min_match, max_match=max_match)
        self.stopOn(stopOn)

    def stopOn(self, ender):
        """Set the stop pattern (compiled from ``ender``'s regex) and return ``self``."""
        if ender:
            end = self.engine.normalize(ender)
            self.set_config(end=regex_compile(end.__regex__()[1]))
        return self

    def _min_length(self):
        if self.parser_config.min_match == 0:
            return 0
        return self.expr.min_length()

    def parseImpl(self, string, start, doActions=True):
        """Match ``expr`` repeatedly until it fails, the stopper matches, the
        maximum is reached or the input is exhausted.

        :return: ParseResults holding the non-empty results of each repetition
        :raises ParseException: if the number of repetitions is outside
                 ``[min_match, max_match]``
        """
        min_match = self.parser_config.min_match
        max_match = self.parser_config.max_match
        stopper = self.parser_config.end

        acc = []
        end = start
        count = 0
        try:
            while end < len(string) and count < max_match:
                if stopper:
                    end = self.engine.skip(string, end)
                    if stopper.match(string, end):
                        if min_match <= count:
                            break
                        else:
                            raise ParseException(
                                self, end, string, msg="found stopper too soon"
                            )
                result = self.expr._parse(string, end, doActions)
                end = result.end
                if result:
                    acc.append(result)
                count += 1
        except ParseException:
            # FIX: the original built a ParseException("Not correct amount of
            # matches") here without raising it. That statement had no effect
            # and has been removed; an out-of-range count is reported below.
            pass

        if count:
            if count < min_match or max_match < count:
                raise ParseException(
                    self,
                    acc[0].start,
                    string,
                    msg=f"Expecting between {min_match} and {max_match} of {self.expr}",
                )
            else:
                # NOTE(review): `acc` only holds truthy results while `count`
                # counts every repetition, so acc[0] may not exist - confirm
                # whether a falsy result is possible here.
                return ParseResults(self, acc[0].start, acc[-1].end, acc)
        else:
            if not min_match:
                return ParseResults(self, start, start, [])
            else:
                raise ParseException(
                    self, start, string, msg=f"Expecting at least {min_match} of {self}",
                )

    def streamline(self):
        """Return a simplified equivalent of this element.

        An un-annotated exact repetition of 0 becomes ``Empty()`` and an exact
        repetition of 1 becomes the (streamlined) inner expression.
        """
        if self.streamlined:
            return self
        expr = self.expr.streamline()
        if (
            self.parser_config.min_match == self.parser_config.max_match
            and not self.is_annotated()
        ):
            if self.parser_config.min_match == 0:
                return Empty()
            elif self.parser_config.min_match == 1:
                return expr

        if self.expr is expr:
            self.streamlined = True
            return self
        if expr.is_annotated() or not isinstance(expr, Empty):
            output = self.copy()
            output.expr = expr
            output.streamlined = True
            return output
        # repeating a plain Empty is still Empty
        return Empty()

    def __regex__(self):
        """Return ``(precedence, pattern)``: the inner pattern followed by a
        repetition suffix and, when configured, the stop pattern."""
        end = self.parser_config.end.pattern if self.parser_config.end else None
        prec, regex = self.expr.__regex__()
        regex = regex_iso(prec, regex, "*")

        if self.parser_config.max_match == MAX_INT:
            if self.parser_config.min_match == 0:
                suffix = "*"
            elif self.parser_config.min_match == 1:
                suffix = "+"
            else:
                suffix = "{" + text(self.parser_config.min_match) + ",}"
        elif self.parser_config.min_match == self.parser_config.max_match:
            if self.parser_config.min_match == 1:
                suffix = ""
            else:
                suffix = "{" + text(self.parser_config.min_match) + "}"
        else:
            suffix = (
                "{"
                + text(self.parser_config.min_match)
                + ","
                + text(self.parser_config.max_match)
                + "}"
            )

        if end:
            return "+", regex + suffix + end
        else:
            return "*", regex + suffix

    def __call__(self, name):
        """Assign a token name; reports an error if the inner expression
        already carries the same name."""
        if not name:
            return self

        for e in [self.expr]:
            if isinstance(e, ParserElement) and e.token_name == name:
                Log.error(
                    "can not set token name, already set in one of the other"
                    " expressions"
                )

        return ParseEnhancement.__call__(self, name)

    def __str__(self):
        if self.parser_name:
            return self.parser_name
        return f"{self.__class__.__name__}:({self.expr})"
class White(Token):
    """Special matching class for matching whitespace.  Normally,
    whitespace is ignored by mo_parsing grammars.  This class is included
    when some whitespace structures are significant.  Define with
    a string containing the whitespace characters to be matched; default
    is ``" \\t\\r\\n"``.  Also takes optional ``min``,
    ``max``, and ``exact`` arguments, as defined for the
    `Word` class.
    """

    # Display names used to build ``parser_name``.
    # FIX: the Unicode entries were written as "u\00A0", "u\2000", ... which
    # is the letter "u" followed by an octal escape, so the real whitespace
    # characters were never present as keys.
    whiteStrs = {
        " ": "<SP>",
        "\t": "<TAB>",
        "\n": "<LF>",
        "\r": "<CR>",
        "\f": "<FF>",
        "\u00A0": "<NBSP>",
        "\u1680": "<OGHAM_SPACE_MARK>",
        "\u180E": "<MONGOLIAN_VOWEL_SEPARATOR>",
        "\u2000": "<EN_QUAD>",
        "\u2001": "<EM_QUAD>",
        "\u2002": "<EN_SPACE>",
        "\u2003": "<EM_SPACE>",
        "\u2004": "<THREE-PER-EM_SPACE>",
        "\u2005": "<FOUR-PER-EM_SPACE>",
        "\u2006": "<SIX-PER-EM_SPACE>",
        "\u2007": "<FIGURE_SPACE>",
        "\u2008": "<PUNCTUATION_SPACE>",
        "\u2009": "<THIN_SPACE>",
        "\u200A": "<HAIR_SPACE>",
        "\u200B": "<ZERO_WIDTH_SPACE>",
        "\u202F": "<NNBSP>",
        "\u205F": "<MMSP>",
        "\u3000": "<IDEOGRAPHIC_SPACE>",
    }

    __slots__ = []
    Config = append_config(Token, "min_len", "max_len", "white_chars")

    def __init__(self, ws=" \t\r\n", min=1, max=0, exact=0):
        """
        :param ws: the whitespace characters to match
        :param min: minimum number of characters required
        :param max: maximum number of characters to consume (0 = no limit)
        :param exact: if > 0, used as both ``min`` and ``max``
        """
        # construct under an engine whose ignorable whitespace excludes the
        # characters this token is meant to match
        with Engine(white="".join(c for c in self.engine.white_chars if c not in ws)) as e:
            super(White, self).__init__()
            self.set_config(lock_engine=e)

        white_chars = "".join(sorted(set(ws)))
        self.parser_name = "|".join(White.whiteStrs[c] for c in white_chars)

        max = max if max > 0 else MAX_INT
        if exact > 0:
            max = exact
            min = exact
        self.set_config(min_len=min, max_len=max, white_chars=white_chars)

    def parseImpl(self, string, start, doActions=True):
        """Consume a run of the configured whitespace characters.

        :raises ParseException: if no whitespace character is found at
                 ``start`` or the run is shorter than ``min_len``
        """
        white_chars = self.parser_config.white_chars
        # at end of input there is nothing to match; report it as a parse
        # failure instead of letting string[start] raise IndexError
        if start >= len(string) or string[start] not in white_chars:
            raise ParseException(self, start, string)

        end = start + 1
        maxloc = min(start + self.parser_config.max_len, len(string))
        while end < maxloc and string[end] in white_chars:
            end += 1

        if end - start < self.parser_config.min_len:
            raise ParseException(self, end, string)

        # NOTE(review): the matched text is passed as a bare string, while the
        # other tokens in this file pass a list of tokens - confirm intended.
        return ParseResults(self, start, end, string[start:end])
class CharsNotIn(Token):
    """Token for matching words composed of characters *not* in a given
    set (will include whitespace in matched characters if not listed in
    the provided exclusion set - see example). Defined with string
    containing all disallowed characters, and an optional minimum,
    maximum, and/or exact length.  The default value for ``min`` is
    1 (a minimum value < 1 is not valid); the default values for
    ``max`` and ``exact`` are 0, meaning no maximum or exact
    length restriction.
    """

    __slots__ = []
    Config = append_config(Token, "min_len", "max_len", "not_chars")

    def __init__(self, notChars, min=1, max=0, exact=0):
        """
        :param notChars: characters that must NOT appear in the match
        :param min: minimum length of the match (must be >= 1)
        :param max: maximum length of the match (0 = no limit)
        :param exact: if non-zero, used as both ``min`` and ``max``
        :raises ValueError: if ``min`` is less than 1
        """
        Token.__init__(self)
        not_chars = "".join(sorted(set(notChars)))

        if min < 1:
            raise ValueError(
                "cannot specify a minimum length < 1; use "
                "Optional(CharsNotIn()) if zero-length char group is permitted"
            )

        max = max if max > 0 else MAX_INT
        if exact:
            min = exact
            max = exact

        # build the negated character class from what regex_range returns:
        # the single-character form is wrapped here, the multi-character form
        # only has its first character replaced by "[^"
        if len(notChars) == 1:
            regex = "[^" + regex_range(notChars) + "]"
        else:
            regex = "[^" + regex_range(notChars)[1:]

        # FIX: the repetition bounds were joined with ":" ("{2:}", "{2:5}"),
        # which is not a regex quantifier; the separator is ","
        if not max or max == MAX_INT:
            if min == 0:
                suffix = "*"
            elif min == 1:
                suffix = "+"
            else:
                suffix = "{" + str(min) + ",}"
        elif min == 1 and max == 1:
            suffix = ""
        else:
            suffix = "{" + str(min) + "," + str(max) + "}"

        self.set_config(
            regex=regex_compile(regex + suffix),
            min_len=min,
            max_len=max,
            not_chars=not_chars,
        )
        self.parser_name = text(self)

    def parseImpl(self, string, start, doActions=True):
        """Return the matched run as a single token, or raise ParseException."""
        found = self.parser_config.regex.match(string, start)
        if found:
            return ParseResults(self, start, found.end(), [found.group()])

        raise ParseException(self, start, string)

    def min_length(self):
        return self.parser_config.min_len

    def __regex__(self):
        """Return the ``(precedence, pattern)`` pair for this token."""
        return "*", self.parser_config.regex.pattern

    def __str__(self):
        return self.parser_config.regex.pattern
class QuotedString(Token):
    r"""
    Token for matching strings that are delimited by quoting characters.

    Constructor parameters:

     - quoteChar - string of one or more characters that opens the quoted string
     - escChar - character used to escape quotes, typically backslash
       (default= ``None``)
     - escQuote - special quote sequence that stands for an embedded quote
       (such as SQL's ``""`` to escape an embedded ``"``) (default= ``None``)
     - multiline - boolean indicating whether quotes can span multiple lines
       (default= ``False``)
     - unquoteResults - boolean indicating whether the quotes are stripped
       from the matched text (default= ``True``)
     - endQuoteChar - string of one or more characters that closes the quoted
       string (default= ``None`` => same as quoteChar)
     - convertWhitespaceEscapes - convert escaped whitespace (``'\t'``,
       ``'\n'``, etc.) to actual whitespace (default= ``True``)
    """

    __slots__ = []
    Config = append_config(
        Token,
        "quote_char",
        "end_quote_char",
        "esc_char",
        "esc_quote",
        "multiline",
        "unquoteResults",
        "convertWhitespaceEscapes",
        "escCharReplacePattern",
    )

    def __init__(
        self,
        quoteChar,
        escChar=None,
        escQuote=None,
        multiline=False,
        unquoteResults=True,
        endQuoteChar=None,
        convertWhitespaceEscapes=True,
    ):
        super(QuotedString, self).__init__()

        # surrounding whitespace can not be part of a delimiter
        quoteChar = quoteChar.strip()
        if not quoteChar:
            warnings.warn(
                "quoteChar cannot be the empty string", SyntaxWarning, stacklevel=2
            )
            raise SyntaxError()

        if endQuoteChar is not None:
            endQuoteChar = endQuoteChar.strip()
            if not endQuoteChar:
                warnings.warn(
                    "endQuoteChar cannot be the empty string",
                    SyntaxWarning,
                    stacklevel=2,
                )
                raise SyntaxError()
        else:
            # same delimiter on both ends
            endQuoteChar = quoteChar

        self.set_config(
            quote_char=quoteChar,
            end_quote_char=endQuoteChar,
            esc_char=escChar,
            esc_quote=escQuote,
            unquoteResults=unquoteResults,
            convertWhitespaceEscapes=convertWhitespaceEscapes,
        )

        # TODO: FIX THIS MESS. WE SHOULD BE ABLE TO CONSTRUCT REGEX FROM ParserElements
        # `excluded` collects what a plain body character may not start with,
        # `included` collects the escape sequences that are allowed instead
        included = Empty()
        excluded = Literal(self.parser_config.end_quote_char)

        if not multiline:
            excluded |= Char("\r\n")
        if escQuote:
            included |= Literal(escQuote)
        if escChar:
            excluded |= Literal(self.parser_config.esc_char)
            included = included | escChar + Char(printables)
            self.set_config(
                escCharReplacePattern=re.escape(self.parser_config.esc_char) + "(.)"
            )

        body = ((~excluded + AnyChar()) | included)[0:]
        grammar = (
            Literal(quoteChar) + body + Literal(self.parser_config.end_quote_char)
        )
        _, pattern = grammar.__regex__()

        self.set_config(multiline=multiline, regex=regex_compile(pattern))
        self.parser_name = text(self)

    def parseImpl(self, string, start, doActions=True):
        """Match a quoted string at ``start`` and return its (optionally unquoted) text."""
        config = self.parser_config
        found = config.regex.match(string, start)
        if not found:
            raise ParseException(self, start, string)

        end = found.end()
        ret = found.group()

        if config.unquoteResults:
            # strip off quotes
            ret = ret[len(config.quote_char):-len(config.end_quote_char)]

            if isinstance(ret, text):
                # replace escaped whitespace
                if "\\" in ret and config.convertWhitespaceEscapes:
                    for escaped, actual in (
                        (r"\t", "\t"),
                        (r"\n", "\n"),
                        (r"\f", "\f"),
                        (r"\r", "\r"),
                    ):
                        ret = ret.replace(escaped, actual)

                # replace escaped characters
                if config.esc_char:
                    ret = re.sub(config.escCharReplacePattern, r"\g<1>", ret)

                # replace escaped quotes
                if config.esc_quote:
                    ret = ret.replace(config.esc_quote, config.end_quote_char)

        return ParseResults(self, start, end, [ret])

    def min_length(self):
        return 2

    def __str__(self):
        try:
            return super(QuotedString, self).__str__()
        except Exception:
            pass

        return "quoted string, starting with %s ending with %s" % (
            self.parser_config.quote_char,
            self.parser_config.end_quote_char,
        )
class And(ParseExpression):
    """
    Requires all given `ParseExpression` s to be found in the given order.
    Expressions may be separated by whitespace.
    May be constructed using the ``'+'`` operator.
    May also be constructed using the ``'-'`` operator, which will
    suppress backtracking.
    """

    __slots__ = []
    Config = append_config(ParseExpression, "engine")

    class SyntaxErrorGuard(Empty):
        """Marker element: a failure in any expression after it is raised
        as a ``ParseSyntaxException`` by ``And.parseImpl``."""

        def __init__(self, *args, **kwargs):
            with Engine(""):
                super(And.SyntaxErrorGuard, self).__init__(*args, **kwargs)
                self.parser_name = "-"

    def __init__(self, exprs, engine):
        """
        :param exprs: the expressions to match, in order; an ``Ellipsis``
               entry is replaced (in place, in the given list) by a
               ``SkipTo`` the expression that follows it
        :param engine: the engine used to skip between the expressions
        """
        if exprs and Ellipsis in exprs:
            last_index = len(exprs) - 1
            expanded = []
            for i, expr in enumerate(exprs):
                if expr is not Ellipsis:
                    expanded.append(expr)
                    continue
                if i >= last_index:
                    raise Exception("cannot construct And with sequence ending in ...")
                target = (Empty() + exprs[i + 1]).exprs[-1]
                expanded.append(SkipTo(target)("_skipped"))
            exprs[:] = expanded
        super(And, self).__init__(exprs)
        self.set_config(engine=engine)

    def streamline(self):
        """Return a simplified equivalent: pending skips are resolved and
        un-annotated nested ``And``s using the same engine are flattened."""
        if self.streamlined:
            return self
        if not self.exprs:
            return Empty(self.parser_name)
        if len(self.exprs) == 1 and not self.is_annotated():
            return self.exprs[0].streamline()

        def ends_with_pending_skip(e):
            return (
                isinstance(e, ParseExpression)
                and e.exprs
                and isinstance(e.exprs[-1], _PendingSkip)
            )

        # collapse any _PendingSkip's
        same = True
        exprs = self.exprs
        if any(ends_with_pending_skip(e) for e in exprs[:-1]):
            same = False
            for i, e in enumerate(exprs[:-1]):
                if ends_with_pending_skip(e):
                    # the skip absorbs the expression that follows it
                    merged = e.exprs[-1] + exprs[i + 1]
                    e.exprs[-1] = merged
                    e.streamlined = False
                    exprs[i + 1] = None

        # streamline INDIVIDUAL EXPRESSIONS
        acc = []
        for e in exprs:
            if e is None:
                continue
            f = e.streamline()
            same = same and f is e
            if (
                not f.is_annotated()
                and isinstance(f, And)
                and f.parser_config.engine is self.parser_config.engine
            ):
                # flatten the nested sequence into this one
                same = False
                acc.extend(f.exprs)
            else:
                acc.append(f)

        if same:
            self.streamlined = True
            return self

        output = self.copy()
        output.exprs = acc
        output.streamlined = True
        return output

    def expecting(self):
        """Collect the keys expected by the leading expressions, each mapped
        to ``[self]``; stops after the first expression that must consume input."""
        if not self.exprs:
            return {}

        acc = OrderedDict()
        for e in self.exprs:
            expect = e.expecting()
            if not expect:
                return {}
            for key in expect.keys():
                acc[key] = [self]
            if e.min_length():
                break
        return acc

    def _min_length(self):
        total = sum(e.min_length() for e in self.exprs)
        return total

    def parseImpl(self, string, start, doActions=True):
        """Parse every expression in order, skipping with the configured
        engine between them.

        :raises ParseException: if an expression fails before a
                 ``SyntaxErrorGuard`` was passed
        :raises ParseSyntaxException: if an expression fails after one
        """
        past_guard = False
        end = index = start
        acc = []
        for expr in self.exprs:
            if end > index:
                index = self.parser_config.engine.skip(string, end)
            if isinstance(expr, And.SyntaxErrorGuard):
                past_guard = True
                continue
            try:
                result = expr._parse(string, index, doActions)
                end = result.end
                acc.append(result)
            except ParseException as pe:
                if not past_guard:
                    raise
                raise ParseSyntaxException(pe.expr, pe.loc, pe.string)

        return ParseResults(self, start, end, acc)

    def __add__(self, other):
        if other is Ellipsis:
            return _PendingSkip(self)

        sequence = And([self, engine.CURRENT.normalize(other)], engine.CURRENT)
        return sequence.streamline()

    def checkRecursion(self, seen=empty_tuple):
        """Check the leading expressions for recursion, up to and including
        the first one that must consume input."""
        path = seen + (self,)
        for e in self.exprs:
            e.checkRecursion(path)
            if e.min_length():
                return

    def __regex__(self):
        """Return ``(precedence, pattern)``: the isolated sub-patterns concatenated."""
        parts = [regex_iso(*e.__regex__(), "+") for e in self.exprs]
        return "+", "".join(parts)

    def __str__(self):
        name = self.parser_name
        if name:
            return name
        return "{" + " + ".join(text(e) for e in self.exprs) + "}"
class Word(Token):
    """Token for matching words composed of allowed character sets.

    Defined with a string containing all allowed initial characters, an
    optional string containing allowed body characters (if omitted,
    defaults to the initial character set), and an optional minimum,
    maximum, and/or exact length.  The default value for ``min`` is
    1 (a minimum value < 1 is not valid); when ``max`` is not given there
    is no maximum, and ``exact`` (default 0 = not used) sets both.
    An optional ``excludeChars`` parameter can list characters that are
    removed from both character sets; useful to define a word of all
    printables except for one or two characters, for instance.

    `srange` is useful for defining custom character set strings
    for defining ``Word`` expressions, using range notation from
    regular expression character sets.

    A common mistake is to use `Word` to match a specific literal
    string, as in ``Word("Address")``. Remember that `Word`
    uses the string argument to define *sets* of matchable characters.
    This expression would match "Add", "AAA", "dAred", or any other word
    made up of the characters 'A', 'd', 'r', 'e', and 's'. To match an
    exact literal string, use `Literal` or `Keyword`.

    mo_parsing includes helper strings for building Words:

     - `alphas`
     - `nums`
     - `alphanums`
     - `hexnums`
     - `alphas8bit` (alphabetic characters in ASCII range 128-255
       - accented, tilded, umlauted, etc.)
     - `punc8bit` (non-alphabetic characters in ASCII range
       128-255 - currency, symbols, superscripts, diacriticals, etc.)
     - `printables` (any non-whitespace character)
    """

    __slots__ = []
    Config = append_config(Token, "min")

    def __init__(
        self,
        init_chars,
        body_chars=None,
        min=1,
        max=None,
        exact=0,
        asKeyword=False,
        excludeChars=None,
    ):
        Token.__init__(self)

        if body_chars is None:
            body_chars = init_chars
        if exact:
            min = max = exact
        if min < 1:
            raise ValueError(
                "cannot specify a minimum length < 1; use Optional(Word()) if"
                " zero-length word is permitted"
            )

        # the word is described with Char repetitions, then rendered as a regex
        first = Char(init_chars, excludeChars=excludeChars)
        if body_chars == init_chars:
            grammar = first[min:max]
        else:
            rest = Char(body_chars, excludeChars=excludeChars)
            # the first character is matched separately, so one less remains
            upper = None if (max is None or max == MAX_INT) else max - 1
            grammar = first + rest[min - 1:upper]
        _, regexp = grammar.__regex__()

        if asKeyword:
            regexp = r"\b" + regexp + r"\b"

        self.set_config(regex=regex_compile(regexp), min=min)

    def parseImpl(self, string, start, doActions=True):
        """Return the matched word as a single token, or raise ParseException."""
        found = self.parser_config.regex.match(string, start)
        if not found:
            raise ParseException(self, start, string)
        return ParseResults(self, start, found.end(), [found.group()])

    def min_length(self):
        return self.parser_config.min

    def __regex__(self):
        """Return the ``(precedence, pattern)`` pair for this token."""
        pattern = self.parser_config.regex.pattern
        return "+", pattern

    def __str__(self):
        name = self.parser_name
        if name:
            return name
        return f"W:({self.parser_config.regex.pattern})"
class PrecededBy(ParseEnhancement):
    """Lookbehind matching of the given parse expression.
    ``PrecededBy`` does not advance the parsing position within the
    input string, it only verifies that the specified parse expression
    matches prior to the current position.  The returned result is
    zero-width; the lookbehind match is kept inside it, re-classed as an
    ``Annotation``, so that results names defined on the expression are
    preserved.

    Parameters:

     - expr - expression that must match prior to the current parse
       location
     - retreat - (default= ``None``) - accepted for compatibility.
       NOTE(review): this value is never read by the constructor; the
       lookbehind distance is derived from ``expr.min_length()`` - confirm
       whether it should be honoured.

    For a Literal, Keyword, Char or position token the lookbehind distance is
    fixed ("exact").  For any other expression, every distance from the
    expression's minimum length back to the start of the input is tried.
    """

    __slots__ = []
    Config = append_config(ParseEnhancement, "retreat", "exact")

    def __init__(self, expr, retreat=None):
        super(PrecededBy, self).__init__(expr)
        expr = self.expr = self.expr.leaveWhitespace()

        if isinstance(expr, (Literal, Keyword, Char)):
            self.set_config(retreat=expr.min_length(), exact=True)
        elif isinstance(expr, (Word, CharsNotIn)):
            self.set_config(retreat=expr.min_length(), exact=False)
        elif isinstance(expr, _PositionToken):
            self.set_config(retreat=0, exact=True)
        else:
            self.set_config(retreat=expr.min_length(), exact=False)

    def parseImpl(self, string, start=0, doActions=True):
        """Verify that ``expr`` matches the text ending at ``start``.

        :return: zero-width ParseResults at ``start`` wrapping the lookbehind
                 match
        :raises ParseException: if no lookbehind distance produces a match
        """
        if self.parser_config.exact:
            loc = start - self.parser_config.retreat
            if loc < 0:
                raise ParseException(self, start, string)
            ret = self.expr._parse(string, loc)
        else:
            # variable-length lookbehind: the match must end exactly at `start`,
            # so parse a truncated copy of the input and require StringEnd
            test_expr = self.expr + StringEnd()
            instring_slice = string[:start]
            last_cause = ParseException(self, start, string)

            with self.engine.backup():
                for offset in range(self.parser_config.retreat, start + 1):
                    try:
                        ret = test_expr._parse(instring_slice, start - offset)
                        break
                    except ParseException as cause:
                        last_cause = cause
                else:
                    raise last_cause
        # return empty list of tokens, but preserve any defined results names
        ret.__class__ = Annotation
        return ParseResults(self, start, start, [ret])

    def __regex__(self):
        """Return ``(precedence, pattern)`` as a regex lookbehind.

        :raises NotImplementedError: for a variable-length lookbehind
        """
        if self.parser_config.exact:
            return "*", f"(?<={self.expr.__regex__()[1]})"
        # FIX: was `raise NotImplemented()`. NotImplemented is a constant,
        # not an exception class, and calling it raises TypeError.
        raise NotImplementedError(
            "PrecededBy with a variable-length lookbehind has no regex form"
        )
class Regex(Token):
    r"""Token for matching strings that match a given regular
    expression. Defined with string specifying the regular expression in
    a form recognized by the stdlib Python  `re module <https://docs.python.org/3/library/re.html>`_.
    If the given regex contains named groups (defined using ``(?P<name>...)``),
    these will be preserved as named parse results.
    """

    __slots__ = []
    Config = append_config(Token, "flags")

    def __new__(cls, pattern, flags=0, asGroupList=False, asMatch=False):
        """Create an instance of the variant class selected by
        ``asGroupList`` / ``asMatch`` (``asGroupList`` takes precedence)."""
        if asGroupList:
            return object.__new__(_RegExAsGroup)
        elif asMatch:
            return object.__new__(_RegExAsMatch)
        else:
            return object.__new__(cls)

    def __init__(self, pattern, flags=0, asGroupList=False, asMatch=False):
        """A string ``pattern`` is compiled with ``re.compile(pattern, flags)``;
        an already compiled pattern object is used as-is. See the Python
        `re module <https://docs.python.org/3/library/re.html>`_ module for an
        explanation of the acceptable patterns and flags.
        """
        Token.__init__(self)

        if isinstance(pattern, text):
            if not pattern:
                warnings.warn(
                    "null string passed to Regex; use Empty() instead",
                    SyntaxWarning,
                    stacklevel=2,
                )

            try:
                self.set_config(flags=flags, regex=re.compile(pattern, flags))
            except re.error as cause:
                # FIX: was `sre_constants.error`; that module is deprecated
                # since Python 3.11 and `re.error` is the same exception class
                Log.error(
                    "invalid pattern {{pattern}} passed to Regex",
                    pattern=pattern,
                    cause=cause,
                )
        elif isinstance(pattern, regex_type):
            self.set_config(flags=flags, regex=pattern)
        else:
            Log.error(
                "Regex may only be constructed with a string or a compiled RE object"
            )

        self.parser_name = text(self)

    def parseImpl(self, string, start, doActions=True):
        """Return the whole match as a single token, with every named group
        attached as a named result; raise ParseException if there is no match."""
        found = self.parser_config.regex.match(string, start)
        if found:
            ret = ParseResults(self, start, found.end(), [found.group()])
            d = found.groupdict()
            if d:
                for k, v in d.items():
                    ret[k] = v
            return ret

        raise ParseException(self, start, string)

    def min_length(self):
        return 0

    def __regex__(self):
        """Return the ``(precedence, pattern)`` pair for this token."""
        return "|", self.parser_config.regex.pattern

    def __str__(self):
        return self.parser_config.regex.pattern

    def sub(self, repl):
        r"""
        Return Regex with an attached parse action to transform the parsed
        result as if called using `re.sub(expr, repl, string) <https://docs.python.org/3/library/re.html#re.sub>`_.

        Example::

            make_html = Regex(r"(\w+):(.*?):").sub(r"<\1>\2</\1>")
            print(make_html.transformString("h1:main title:"))
            # prints "<h1>main title</h1>"
        """

        def pa(tokens):
            # apply the substitution to the matched text
            return self.parser_config.regex.sub(repl, tokens[0])

        return self.addParseAction(pa)