def locatedExpr(expr):
    """Helper to decorate a returned token with its starting and ending
    locations in the input string.

    This helper adds the following results names:

    - locn_start = location where matched expression begins
    - locn_end = location where matched expression ends
    - value = the actual parsed results

    Be careful if the input text contains ``<TAB>`` characters; you may
    want to call `ParserElement.parseWithTabs`.

    Example::

        wd = Word(alphas)
        for match in locatedExpr(wd).searchString("ljsdf123lksdjjf123lkkjj1222"):
            print(match)

    prints::

        [[0, 'ljsdf', 5]]
        [[8, 'lksdjjf', 15]]
        [[18, 'lkkjj', 23]]
    """
    locator = Empty().addParseAction(lambda t, l, s: l)
    return Group(
        locator("locn_start") + Group(expr)("value") + locator("locn_end")
    )
def originalTextFor(expr, asString=True):
    """Helper to return the original, untokenized text for a given
    expression. Useful to restore the parsed fields of an HTML start
    tag into the raw tag text itself, or to revert separate tokens with
    intervening whitespace back to the original matching input text. By
    default, returns a string containing the original parsed text.

    If the optional ``asString`` argument is passed as ``False``, then
    the return value is a `ParseResults` containing any results names
    that were originally matched, and a single token containing the
    original matched text from the input string. So if the expression
    passed to `originalTextFor` contains expressions with defined
    results names, you must set ``asString`` to ``False`` if you want
    to preserve those results name values.

    Example::

        src = "this is test <b> bold <i>text</i> </b> normal text "
        for tag in ("b", "i"):
            opener, closer = makeHTMLTags(tag)
            patt = originalTextFor(opener + SkipTo(closer) + closer)
            print(patt.searchString(src)[0])

    prints::

        ['<b> bold <i>text</i> </b>']
        ['<i>text</i>']
    """
    locMarker = Empty().addParseAction(lambda t, l, s: l)
    matchExpr = (
        locMarker("_original_start") + Group(expr) + locMarker("_original_end")
    )
    matchExpr = matchExpr.addParseAction(extractText)
    return matchExpr
def makeHTMLTags(tagStr, suppress_LT=Suppress("<"), suppress_GT=Suppress(">")):
    """Helper to construct opening and closing tag expressions for HTML,
    given a tag name. Matches tags in either upper or lower case,
    attributes with namespaces and with quoted or unquoted values.
    """
    if isinstance(tagStr, text):
        resname = tagStr
        tagStr = Keyword(tagStr, caseless=True)
    else:
        resname = tagStr.parser_name

    tagAttrName = Word(alphas, alphanums + "_-:")
    tagAttrValue = quotedString.addParseAction(removeQuotes) | Word(
        printables, exclude=">"
    )
    simpler_name = "".join(resname.replace(":", " ").title().split())

    openTag = (
        (
            suppress_LT
            + tagStr("tag")
            + OpenDict(ZeroOrMore(Group(
                tagAttrName.addParseAction(downcaseTokens)
                + Optional(Suppress("=") + tagAttrValue)
            )))
            + Optional(
                "/", default=[False]
            )("empty").addParseAction(lambda t, l, s: t[0] == "/")
            + suppress_GT
        )
        .set_token_name("start" + simpler_name)
        .set_parser_name("<%s>" % resname)
    )
    closeTag = (
        Combine(Literal("</") + tagStr + ">")
        .set_token_name("end" + simpler_name)
        .set_parser_name("</%s>" % resname)
    )

    # openTag.tag = resname
    # closeTag.tag = resname
    # openTag.tag_body = SkipTo(closeTag)
    return openTag, closeTag
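# A minimal usage sketch for makeHTMLTags (hedged: assumes pyparsing-style
# searchString semantics, and that attribute names become lowercased results
# names via downcaseTokens, as the implementation above suggests):
#
#   a_open, a_close = makeHTMLTags("a")
#   link = a_open + SkipTo(a_close)("body") + a_close
#   for t in link.searchString('<a href="https://example.com">example</a>'):
#       print(t.href, t.body)   # -> https://example.com example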
def dictOf(key, value):
    """Helper to easily and clearly define a dictionary by specifying
    the respective patterns for the key and value. Takes care of
    defining the `Dict`, `ZeroOrMore`, and `Group` tokens in the proper
    order. The key pattern can include delimiting markers or
    punctuation, as long as they are suppressed, thereby leaving the
    significant key text. The value pattern can include named results,
    so that the `Dict` results can include named token fields.

    Example::

        text = "shape: SQUARE posn: upper left color: light blue texture: burlap"
        data_word = Word(alphas)
        label = data_word + FollowedBy(':')
        attr_expr = (label + Suppress(':')
                     + OneOrMore(data_word, stopOn=label).addParseAction(' '.join))
        print(OneOrMore(attr_expr).parseString(text))

        attr_label = label
        attr_value = (Suppress(':')
                      + OneOrMore(data_word, stopOn=label).addParseAction(' '.join))

        # similar to Dict, but simpler call format
        result = dictOf(attr_label, attr_value).parseString(text)
        print(result)
        print(result['shape'])
        print(result.shape)  # object attribute access works too
        print(result)

    prints::

        [['shape', 'SQUARE'], ['posn', 'upper left'], ['color', 'light blue'], ['texture', 'burlap']]
        - color: light blue
        - posn: upper left
        - shape: SQUARE
        - texture: burlap
        SQUARE
        SQUARE
        {'color': 'light blue', 'shape': 'SQUARE', 'posn': 'upper left', 'texture': 'burlap'}
    """
    return Dict(OneOrMore(Group(key + value)))
def indentedBlock(blockStatementExpr, indent=True):
    """Helper method for defining space-delimited indentation blocks,
    such as those used to define block statements in Python source code.

    Parameters:

    - blockStatementExpr - expression defining syntax of statement that
      is repeated within the indented block
    - indent - boolean indicating whether block must be indented beyond
      the current level; set to False for block of left-most statements
      (default= ``True``)

    Indentation state is kept on the shared ``_indent_stack``, so
    multiple ``indentedBlock`` expressions within a single grammar use
    a common stack. A valid block must contain at least one
    ``blockStatement``.
    """
    blockStatementExpr.engine.add_ignore("\\" + LineEnd())

    PEER = Forward()
    DEDENT = Forward()

    def _reset_stack(p=None, l=None, s=None, ex=None):
        oldCol, oldPeer, oldDedent = _indent_stack.pop()
        PEER << oldPeer
        DEDENT << oldDedent

    def peer_stack(expectedCol):
        def output(t, l, s):
            if l >= len(s):
                return
            curCol = col(l, s)
            if curCol != expectedCol:
                if curCol > expectedCol:
                    raise ParseException(t.type, l, s, "illegal nesting")
                raise ParseException(t.type, l, s, "not a peer entry")
        return output

    def dedent_stack(expectedCol):
        def output(t, l, s):
            if l >= len(s):
                return
            curCol = col(l, s)
            if curCol not in (i for i, _, _ in _indent_stack):
                raise ParseException(t.type, l, s, "not an unindent")
            if curCol < _indent_stack[-1][0]:
                oldCol, oldPeer, oldDedent = _indent_stack.pop()
                PEER << oldPeer
                DEDENT << oldDedent
        return output

    def indent_stack(t, l, s):
        curCol = col(l, s)
        if curCol > _indent_stack[-1][0]:
            PEER << Empty().addParseAction(peer_stack(curCol))
            DEDENT << Empty().addParseAction(dedent_stack(curCol))
            _indent_stack.append((curCol, PEER, DEDENT))
        else:
            raise ParseException(t.type, l, s, "not a subentry")

    def nodent_stack(t, l, s):
        curCol = col(l, s)
        if curCol == _indent_stack[-1][0]:
            PEER << Empty().addParseAction(peer_stack(curCol))
            DEDENT << Empty().addParseAction(dedent_stack(curCol))
            _indent_stack.append((curCol, PEER, DEDENT))
        else:
            raise ParseException(t.type, l, s, "not a subentry")

    NL = OneOrMore(LineEnd().suppress())
    INDENT = Empty().addParseAction(indent_stack)
    NODENT = Empty().addParseAction(nodent_stack)

    if indent:
        smExpr = Group(
            Optional(NL)
            + INDENT
            + OneOrMore(PEER + Group(blockStatementExpr) + Optional(NL))
            + DEDENT
        )
    else:
        smExpr = Group(
            Optional(NL)
            + NODENT
            + OneOrMore(PEER + Group(blockStatementExpr) + Optional(NL))
            + DEDENT
        )

    return smExpr.setFailAction(_reset_stack).set_parser_name("indented block")
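# A minimal usage sketch for indentedBlock (hedged: assumes the shared
# _indent_stack has been seeded with a base entry for column 1, as the
# pops and appends above imply):
#
#   stmt = Forward()
#   suite = indentedBlock(stmt)
#   func_def = Keyword("def") + Word(alphas) + ":" + suite
#   stmt << (func_def | Word(alphas))
#   func_def.parseString("def f:\n    a\n    b")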
def nestedExpr(opener="(", closer=")", content=None, ignoreExpr=quotedString):
    """Helper method for defining nested lists enclosed in opening and
    closing delimiters ("(" and ")" are the default).

    Parameters:

    - opener - opening character for a nested list (default= ``"("``);
      can also be a mo_parsing expression
    - closer - closing character for a nested list (default= ``")"``);
      can also be a mo_parsing expression
    - content - expression for items within the nested lists
      (default= ``None``)
    - ignoreExpr - expression for ignoring opening and closing
      delimiters (default= `quotedString`)

    If an expression is not provided for the content argument, the
    nested expression will capture all whitespace-delimited content
    between delimiters as a list of separate values.

    Use the ``ignoreExpr`` argument to define expressions that may
    contain opening or closing characters that should not be treated as
    opening or closing characters for nesting, such as quotedString or
    a comment expression. Specify multiple expressions using an
    `Or` or `MatchFirst`. The default is `quotedString`, but if no
    expressions are to be ignored, then pass ``None`` for this argument.
    """
    if opener == closer:
        raise ValueError("opening and closing strings cannot be the same")
    if content is None:
        if not isinstance(opener, text) or not isinstance(closer, text):
            raise ValueError(
                "opening and closing arguments must be strings if no content"
                " expression is given"
            )
        ignore_chars = engine.CURRENT.white_chars
        with Engine(""):

            def scrub(t):
                return t[0].strip()

            if len(opener) == 1 and len(closer) == 1:
                if ignoreExpr is not None:
                    content = Combine(OneOrMore(
                        ~ignoreExpr
                        + CharsNotIn(
                            opener + closer + "".join(ignore_chars), exact=1
                        )
                    )).addParseAction(scrub)
                else:
                    content = Empty() + CharsNotIn(
                        opener + closer + "".join(ignore_chars)
                    ).addParseAction(scrub)
            else:
                if ignoreExpr is not None:
                    content = Combine(OneOrMore(
                        ~ignoreExpr
                        + ~Literal(opener)
                        + ~Literal(closer)
                        + CharsNotIn(ignore_chars, exact=1)
                    )).addParseAction(scrub)
                else:
                    content = Combine(OneOrMore(
                        ~Literal(opener)
                        + ~Literal(closer)
                        + CharsNotIn(ignore_chars, exact=1)
                    )).addParseAction(scrub)

    ret = Forward()
    if ignoreExpr is not None:
        ret <<= Group(
            Suppress(opener)
            + ZeroOrMore(ignoreExpr | ret | content)
            + Suppress(closer)
        )
    else:
        ret <<= Group(
            Suppress(opener) + ZeroOrMore(ret | content) + Suppress(closer)
        )
    ret.set_parser_name("nested %s%s expression" % (opener, closer))
    return ret
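# A minimal usage sketch for nestedExpr (hedged: assumes
# pyparsing-compatible parseString results):
#
#   data = nestedExpr().parseString("(a (b c) (d (e f)))")
#   # -> [['a', ['b', 'c'], ['d', ['e', 'f']]]]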
def countFieldParseAction(t, l, s):
    # parse action for the leading count field: once the count n is known,
    # rebind arrayExpr (a Forward in the enclosing scope) to match exactly
    # n copies of expr; return [] so the count itself is dropped
    n = t[0]
    arrayExpr << Group(Many(expr, exact=n))
    return []
def infixNotation(baseExpr, spec, lpar=Suppress("("), rpar=Suppress(")")):
    """
    :param baseExpr: expression representing the most basic element for
       the nested expression
    :param spec: list of tuples, one for each operator precedence level
       in the expression grammar; each tuple is of the form ``(opExpr,
       numTerms, rightLeftAssoc, parseAction)``, where:

       - opExpr is the mo_parsing expression for the operator; may also
         be a string, which will be converted to a Literal; if numTerms
         is 3, opExpr is a tuple of two expressions, for the two
         operators separating the 3 terms
       - numTerms is the number of terms for this operator (must be 1,
         2, or 3)
       - rightLeftAssoc is the indicator whether the operator is right
         or left associative, using the mo_parsing-defined constants
         ``RIGHT_ASSOC`` and ``LEFT_ASSOC``.
       - parseAction is the parse action to be associated with
         expressions matching this operator expression (the parse
         action tuple member may be omitted); if the parse action is
         passed a tuple or list of functions, this is equivalent to
         calling ``setParseAction(*fn)``
         (:class:`ParserElement.addParseAction`)

    :param lpar: expression for matching left-parentheses
       (default= ``Suppress('(')``)
    :param rpar: expression for matching right-parentheses
       (default= ``Suppress(')')``)
    :return: ParserElement
    """
    all_op = {}

    def norm(op):
        output = all_op.get(id(op))
        if output:
            return output

        def record_self(tok):
            return ParseResults(tok.type, [tok.type.parser_name])

        output = engine.CURRENT.normalize(op)
        is_suppressed = isinstance(output, Suppress)
        if is_suppressed:
            output = output.expr
        output = output.addParseAction(record_self)
        all_op[id(op)] = is_suppressed, output
        return is_suppressed, output

    opList = []
    """
    SCRUBBED LIST OF OPERATORS
    * expr - used exclusively for ParseResult(expr, [...]), not used to match
    * op - used to match
    * arity - same
    * assoc - same
    * parse_actions - same
    """

    for operDef in spec:
        op, arity, assoc, rest = operDef[0], operDef[1], operDef[2], operDef[3:]
        parse_actions = (
            list(map(wrap_parse_action, listwrap(rest[0]))) if rest else []
        )
        if arity == 1:
            is_suppressed, op = norm(op)
            if assoc == RIGHT_ASSOC:
                opList.append((
                    Group(baseExpr + op),
                    op,
                    is_suppressed,
                    arity,
                    assoc,
                    parse_actions,
                ))
            else:
                opList.append((
                    Group(op + baseExpr),
                    op,
                    is_suppressed,
                    arity,
                    assoc,
                    parse_actions,
                ))
        elif arity == 2:
            is_suppressed, op = norm(op)
            opList.append((
                Group(baseExpr + op + baseExpr),
                op,
                is_suppressed,
                arity,
                assoc,
                parse_actions,
            ))
        elif arity == 3:
            is_suppressed, op = zip(norm(op[0]), norm(op[1]))
            opList.append((
                Group(baseExpr + op[0] + baseExpr + op[1] + baseExpr),
                op,
                is_suppressed,
                arity,
                assoc,
                parse_actions,
            ))
    opList = tuple(opList)

    def record_op(op):
        def output(tokens):
            return ParseResults(NO_PARSER, [(tokens, op)])
        return output

    prefix_ops = MatchFirst([
        op.addParseAction(record_op(op))
        for expr, op, is_suppressed, arity, assoc, pa in opList
        if arity == 1 and assoc == RIGHT_ASSOC
    ])
    suffix_ops = MatchFirst([
        op.addParseAction(record_op(op))
        for expr, op, is_suppressed, arity, assoc, pa in opList
        if arity == 1 and assoc == LEFT_ASSOC
    ])
    ops = Or([
        opPart.addParseAction(record_op(opPart))
        for expr, op, is_suppressed, arity, assoc, pa in opList
        if arity > 1
        for opPart in (op if isinstance(op, tuple) else [op])
    ])

    def make_tree(tokens, loc, string):
        flat_tokens = list(tokens)
        num = len(opList)
        op_index = 0
        while len(flat_tokens) > 1 and op_index < num:
            expr, op, is_suppressed, arity, assoc, parse_actions = opList[op_index]
            if arity == 1:
                if assoc == RIGHT_ASSOC:
                    # PREFIX OPERATOR (e.g. -3): scan right-to-left
                    todo = list(reversed(list(enumerate(flat_tokens[:-1]))))
                    for i, (r, o) in todo:
                        if o == op:
                            if is_suppressed:
                                result = ParseResults(expr, (flat_tokens[i + 1][0],))
                            else:
                                result = ParseResults(expr, (r, flat_tokens[i + 1][0]))
                            break
                    else:
                        op_index += 1
                        continue
                else:
                    # SUFFIX OPERATOR (e.g. 3!): scan left-to-right
                    todo = list(enumerate(flat_tokens[1:]))
                    for i, (r, o) in todo:
                        if o == op:
                            if is_suppressed:
                                result = ParseResults(expr, (flat_tokens[i][0],))
                            else:
                                result = ParseResults(expr, (flat_tokens[i][0], r))
                            break
                    else:
                        op_index += 1
                        continue
            elif arity == 2:
                todo = list(enumerate(flat_tokens[1:-1]))
                if assoc == RIGHT_ASSOC:
                    todo = list(reversed(todo))
                for i, (r, o) in todo:
                    if o == op:
                        if is_suppressed:
                            result = ParseResults(
                                expr, (flat_tokens[i][0], flat_tokens[i + 2][0])
                            )
                        else:
                            result = ParseResults(
                                expr, (flat_tokens[i][0], r, flat_tokens[i + 2][0])
                            )
                        break
                else:
                    op_index += 1
                    continue
            else:  # arity == 3
                todo = list(enumerate(flat_tokens[1:-3]))
                if assoc == RIGHT_ASSOC:
                    todo = list(reversed(todo))
                for i, (r0, o0) in todo:
                    if o0 == op[0]:
                        r1, o1 = flat_tokens[i + 3]
                        if o1 == op[1]:
                            seq = [
                                flat_tokens[i][0],
                                flat_tokens[i + 2][0],
                                flat_tokens[i + 4][0],
                            ]
                            s0, s1 = is_suppressed
                            if not s1:
                                seq.insert(2, r1)
                            if not s0:
                                seq.insert(1, r0)
                            result = ParseResults(expr, seq)
                            break
                else:
                    op_index += 1
                    continue

            for p in parse_actions:
                result = p(result, -1, string)
            offset = (0, 2, 3, 5)[arity]
            flat_tokens[i : i + offset] = [(result, (expr,))]
            op_index = 0

        return flat_tokens[0][0]

    flat = Forward()
    iso = lpar.suppress() + flat + rpar.suppress()
    atom = (baseExpr | iso).addParseAction(record_op(baseExpr))
    modified = ZeroOrMore(prefix_ops) + atom + ZeroOrMore(suffix_ops)
    flat << (modified + ZeroOrMore(ops + modified)).addParseAction(make_tree)

    return flat
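# A minimal usage sketch for infixNotation (hedged: assumes nums and oneOf are
# available as in pyparsing, and uses this module's RIGHT_ASSOC/LEFT_ASSOC
# constants):
#
#   integer = Word(nums).addParseAction(lambda t: int(t[0]))
#   arith = infixNotation(integer, [
#       ("-", 1, RIGHT_ASSOC),           # unary minus
#       (oneOf("* /"), 2, LEFT_ASSOC),   # * and / bind tighter than + and -
#       (oneOf("+ -"), 2, LEFT_ASSOC),
#   ])
#   arith.parseString("9 + 2 * -3")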
def countFieldParseAction(t, l, s):
    # parse action for the leading count field: once the count n is known,
    # rebind arrayExpr (a Forward in the enclosing scope) to match exactly
    # n copies of expr, or an empty expression when n == 0; return [] so
    # the count itself is dropped
    n = t[0]
    arrayExpr << (n and Group(And([expr] * n)) or Group(empty))
    return []
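# For context, a sketch of the countedArray helper that this parse action
# serves (hypothetical reconstruction following pyparsing's countedArray;
# arrayExpr and expr are the enclosing-scope names used above):
#
#   def countedArray(expr, intExpr=None):
#       arrayExpr = Forward()
#       intExpr = intExpr or Word(nums).addParseAction(lambda t, l, s: int(t[0]))
#       return intExpr.addParseAction(countFieldParseAction) + arrayExpr
#
#   countedArray(Word(alphas)).parseString("2 ab cd")  # matches "ab cd" as a group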
_escapedPunc = Word(
    _bslash, r"\[]-*.$+^?()~ ", exact=2
).addParseAction(lambda t, l, s: t[0][1])
_escapedHexChar = Regex(r"\\0?[xX][0-9a-fA-F]+").addParseAction(lambda t: unichr(int(
    t[0].lstrip('\\').lstrip('0').lstrip('xX'), 16
)))
_escapedOctChar = Regex(r"\\0[0-7]+").addParseAction(lambda t, l, s: unichr(int(
    t[0][1:], 8
)))
_singleChar = (
    _escapedPunc | _escapedHexChar | _escapedOctChar | CharsNotIn(r"\]", exact=1)
)
_charRange = Group(_singleChar + Suppress("-") + _singleChar)
_reBracketExpr = (
    Literal("[")
    + Optional("^").set_token_name("negate")
    + Group(OneOrMore(_charRange | _singleChar)).set_token_name("body")
    + "]"
)


def srange(s):
    r"""Helper to easily define string ranges for use in Word
    construction. Borrows syntax from regexp '[]' string range
    definitions::

        srange("[0-9]") -> "0123456789"
        srange("[a-z]") -> "abcdefghijklmnopqrstuvwxyz"
        srange("[a-z$_]") -> "abcdefghijklmnopqrstuvwxyz$_"

    The input string must be enclosed in []'s, and the returned string
    is the expanded character set joined into a single string. The
    values enclosed in the []'s may be:

    - a single character
    - an escaped character with a leading backslash (such as ``\-`` or ``\]``)
    - an escaped hex character with a leading ``'\x'``
      (``\x21``, which is a ``'!'`` character)
    - an escaped octal character with a leading ``'\0'``
      (``\041``, which is a ``'!'`` character)
    - a range of any of the above, separated by a dash (``'a-z'``, etc.)
    - any combination of the above (``'aeiouy'``, ``'a-zA-Z0-9_$'``, etc.)
    """
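# A minimal usage sketch for srange (hedged: assumes a pyparsing-compatible
# Word(initChars, bodyChars) signature):
#
#   ident = Word(srange("[a-zA-Z_]"), srange("[a-zA-Z0-9_]"))
#   ident.parseString("parse_me2")   # -> ['parse_me2']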
escapedChar = (
    ~macro + Combine("\\" + AnyChar())
).addParseAction(lambda t: Literal(t.value()[1]))
plainChar = Char(exclude=r"\]").addParseAction(lambda t: Literal(t.value()))

escapedHexChar = Combine(
    (Literal("\\0x") | Literal("\\x") | Literal("\\X"))  # looking up literals is faster
    + OneOrMore(Char(hexnums))
).addParseAction(hex_to_char)

escapedOctChar = Combine(
    Literal("\\0") + OneOrMore(Char("01234567"))
).addParseAction(lambda t: Literal(unichr(int(t.value()[2:], 8))))

singleChar = escapedHexChar | escapedOctChar | escapedChar | plainChar

charRange = Group(singleChar("min") + "-" + singleChar("max")).addParseAction(to_range)

brackets = (
    "["
    + Optional("^")("negate")
    + OneOrMore(Group(charRange | singleChar | macro)("body"))
    + "]"
).addParseAction(to_bracket)

#########################################################################################
# REGEX
regex = Forward()

line_start = Literal("^").addParseAction(lambda: LineStart())
line_end = Literal("$").addParseAction(lambda: LineEnd())
word_edge = Literal("\\b").addParseAction(lambda: NotAny(any_wordchar))