def copyTokenToRepeater(t, l, s):
    """Parse action that loads the matched tokens into the repeater ``rep``.

    ``rep`` is taken from the enclosing scope. Depending on what was matched:

    - no tokens: ``rep`` is set to ``Empty()``
    - exactly one token: ``rep`` is set to that token
    - several tokens: the tokens are flattened with ``_flatten`` and ``rep``
      is set to an ``And`` of one ``Literal`` per flattened token

    :param t: the matched tokens
    :param l: unused; part of the three-argument parse action signature
    :param s: unused; part of the three-argument parse action signature
    """
    if not t:
        rep << Empty()
        return

    if len(t) == 1:
        rep << t[0]
        return

    # more than one token: match the same flattened sequence of literals
    rep << And(Literal(piece) for piece in _flatten(t))
def makeHTMLTags(tagStr, suppress_LT=Suppress("<"), suppress_GT=Suppress(">")):
    """Build the opening-tag and closing-tag expressions for one HTML tag.

    :param tagStr: the tag name as text (wrapped in a caseless ``Keyword``),
        or an existing expression whose ``parser_name`` is used as the tag name
    :param suppress_LT: expression consumed for the leading ``<`` of the
        opening tag
    :param suppress_GT: expression consumed for the trailing ``>`` of the
        opening tag
    :return: ``(openTag, closeTag)`` pair of expressions

    The opening tag accepts any number of attributes. Attribute names may
    contain ``_``, ``-`` and ``:`` (namespaces); values may be quoted or
    unquoted, and the value is optional. A trailing ``/`` is reported under
    the ``empty`` name as a boolean.
    """
    if not isinstance(tagStr, text):
        resname = tagStr.parser_name
    else:
        resname = tagStr
        tagStr = Keyword(tagStr, caseless=True)

    tagAttrName = Word(alphas, alphanums + "_-:")
    tagAttrValue = quotedString.addParseAction(removeQuotes) | Word(
        printables, exclude=">"
    )
    # "ns:tag" -> "NsTag", used to build the token names below
    simpler_name = "".join(resname.replace(":", " ").title().split())

    # one attribute: name, optionally followed by "=" value
    attribute = Group(
        tagAttrName.addParseAction(downcaseTokens)
        + Optional(Suppress("=") + tagAttrValue)
    )
    # optional "/" before ">"; converted to True/False
    self_closing = Optional("/", default=[False])("empty").addParseAction(
        lambda t, l, s: t[0] == "/"
    )

    open_body = (
        suppress_LT
        + tagStr("tag")
        + OpenDict(ZeroOrMore(attribute))
        + self_closing
        + suppress_GT
    )
    openTag = open_body.set_token_name("start" + simpler_name).set_parser_name(
        "<%s>" % resname
    )

    close_body = Combine(Literal("</") + tagStr + ">")
    closeTag = close_body.set_token_name("end" + simpler_name).set_parser_name(
        "</%s>" % resname
    )

    return openTag, closeTag
def QuotedString(
    quote_char,
    esc_char=None,
    esc_quote=None,
    multiline=False,
    unquote_results=True,
    end_quote_char="",
    convert_whitespace_escape=True,
):
    r"""
    Build an expression matching a string delimited by quoting characters.

    Parameters:

     - quote_char - string of one or more characters that opens the quoted
       string (surrounding whitespace is stripped)
     - esc_char - character used to escape the following character, typically
       backslash (default= ``None``)
     - esc_quote - special sequence standing for an embedded quote (such as
       SQL's ``""`` to embed a ``"``) (default= ``None``)
     - multiline - whether the quoted string may span multiple lines
       (default= ``False``)
     - unquote_results - whether the delimiters are removed, and escapes
       resolved, in the returned text (default= ``True``)
     - end_quote_char - string of one or more characters that closes the
       quoted string (default= ``""`` => same as quote_char)
     - convert_whitespace_escape - convert escaped whitespace (``'\t'``,
       ``'\n'``, etc.) to actual whitespace; only applied when
       ``unquote_results`` is set (default= ``True``)
    """
    quote_char = quote_char.strip()
    end_quote_char = end_quote_char.strip() or quote_char
    if not quote_char:
        Log.error("quote_char cannot be the empty string")
    if not end_quote_char:
        Log.error("end_quote_char cannot be the empty string")

    # what a plain (unescaped) body character must NOT start with
    excluded = Literal(end_quote_char)
    if not multiline:
        anychar = Char(exclude="\n")
        excluded |= Char("\r\n")
    else:
        anychar = AnyChar()

    # alternatives that are allowed even though `excluded` would reject them
    included = ~Literal(end_quote_char) + anychar
    if esc_quote:
        included = Literal(esc_quote) | included
    if esc_char:
        excluded |= Literal(esc_char)
        included = (esc_char + Char(printables)) | included
        esc_char_replace_pattern = re.escape(esc_char) + "(.)"

    body = ((~excluded + anychar) | included)[0:]
    _, pattern = (Literal(quote_char) + body).__regex__()
    # IMPORTANT: THE end_quote_char IS OUTSIDE THE Regex BECAUSE OF PATHOLOGICAL BACKTRACKING
    output = Combine(Regex(pattern) + Literal(end_quote_char))

    # escaped-whitespace literals and their replacements, applied in this order
    whitespace_escapes = (
        (r"\t", "\t"),
        (r"\n", "\n"),
        (r"\f", "\f"),
        (r"\r", "\r"),
    )

    def post_parse(tokens):
        ret = tokens[0]
        if not unquote_results:
            return ParseResults(tokens.type, tokens.start, tokens.end, [ret])

        # strip off quotes
        ret = ret[len(quote_char) : -len(end_quote_char)]

        if isinstance(ret, text):
            # replace escaped whitespace
            if convert_whitespace_escape and "\\" in ret:
                for escaped, actual in whitespace_escapes:
                    ret = ret.replace(escaped, actual)

            # replace escaped characters
            if esc_char:
                ret = re.sub(esc_char_replace_pattern, r"\g<1>", ret)

            # replace escaped quotes
            if esc_quote:
                ret = ret.replace(esc_quote, end_quote_char)

        return ParseResults(tokens.type, tokens.start, tokens.end, [ret])

    return output.addParseAction(post_parse).streamline()
def nestedExpr(opener="(", closer=")", content=None, ignoreExpr=quotedString):
    """Build an expression for nested lists enclosed in opening and closing
    delimiters (``"("`` and ``")"`` by default).

    Parameters:
     - opener - opening delimiter of a nested list (default= ``"("``); can
       also be a mo_parsing expression
     - closer - closing delimiter of a nested list (default= ``")"``); can
       also be a mo_parsing expression
     - content - expression for the items inside the nested lists
       (default= ``None``)
     - ignoreExpr - expression whose matches may contain the delimiters
       without opening or closing a nesting level (default= `quotedString`)

    When ``content`` is not given, ``opener`` and ``closer`` must both be
    strings, and a default content expression is built that captures runs of
    characters that are neither delimiters nor the current engine's
    whitespace characters.

    Use ``ignoreExpr`` for things like quoted strings or comments that may
    contain delimiter characters. Combine several such expressions with an
    `Or` or `MatchFirst`. Pass ``None`` if nothing should be ignored.

    Raises ``ValueError`` if ``opener`` equals ``closer``, or if ``content``
    is omitted while ``opener``/``closer`` are not both strings.
    """
    if opener == closer:
        raise ValueError("opening and closing strings cannot be the same")

    if content is None:
        both_text = isinstance(opener, text) and isinstance(closer, text)
        if not both_text:
            raise ValueError(
                "opening and closing arguments must be strings if no content expression"
                " is given"
            )

        ignore_chars = engine.CURRENT.white_chars
        # build the default content under an engine with no whitespace chars
        with Engine(""):

            def scrub(t):
                return t[0].strip()

            if len(opener) == 1 and len(closer) == 1:
                # single-character delimiters: exclude them by character class
                stop_chars = opener + closer + "".join(ignore_chars)
                if ignoreExpr is None:
                    content = Empty + CharsNotIn(stop_chars).addParseAction(scrub)
                else:
                    content = Combine(OneOrMore(
                        ~ignoreExpr + CharsNotIn(stop_chars, exact=1)
                    )).addParseAction(scrub)
            elif ignoreExpr is None:
                # multi-character delimiters: exclude them by negative lookahead
                content = Combine(OneOrMore(
                    ~Literal(opener)
                    + ~Literal(closer)
                    + CharsNotIn(ignore_chars, exact=1)
                )).addParseAction(scrub)
            else:
                content = Combine(OneOrMore(
                    ~ignoreExpr
                    + ~Literal(opener)
                    + ~Literal(closer)
                    + CharsNotIn(ignore_chars, exact=1)
                )).addParseAction(scrub)

    ret = Forward()
    if ignoreExpr is None:
        item = ret | content
    else:
        item = ignoreExpr | ret | content
    ret <<= Group(Suppress(opener) + ZeroOrMore(item) + Suppress(closer))
    ret.set_parser_name("nested %s%s expression" % (opener, closer))
    return ret
dblQuotedString = Combine( # 0 1 2 3 4 5 # 012345678901234567890123456789012345678901234567890123456789 Regex(r'"(?:[^"\n\r\\]|(?:"")|(?:\\(?:[^x]|x[0-9a-fA-F]+)))*') + '"' ).set_parser_name("string enclosed in double quotes") sglQuotedString = Combine( Regex(r"'(?:[^'\n\r\\]|(?:'')|(?:\\(?:[^x]|x[0-9a-fA-F]+)))*") + "'" ).set_parser_name("string enclosed in single quotes") quotedString = Combine( Regex(r'"(?:[^"\n\r\\]|(?:"")|(?:\\(?:[^x]|x[0-9a-fA-F]+)))*') + '"' | Regex(r"'(?:[^'\n\r\\]|(?:'')|(?:\\(?:[^x]|x[0-9a-fA-F]+)))*") + "'" ).set_parser_name("quotedString using single or double quotes") unicodeString = Combine( Literal("u") + quotedString ).set_parser_name("unicode string literal") def countedArray(expr, intExpr=None): """Helper to define a counted list of expressions. This helper defines a pattern of the form:: integer expr expr expr... where the leading integer tells how many expr expressions follow. The matched tokens returns the array of expr tokens as a list - the leading count token is suppressed. If ``intExpr`` is specified, it should be a mo_parsing expression
from mo_parsing.utils import regex_range

# import later
And, Or, MatchFirst = [None] * 3

# Quoted-string expressions. In each one the closing quote is a separate
# literal appended after the Regex, and the pair is wrapped in Combine.
# The body accepts: any character other than the quote, CR, LF or backslash;
# a doubled quote; or a backslash followed by one character or by "x" and
# hex digits.
dblQuotedString = Combine(
    Regex(r'"(?:[^"\n\r\\]|(?:"")|(?:\\(?:[^x]|x[0-9a-fA-F]+)))*') + '"'
).set_parser_name("string enclosed in double quotes")

sglQuotedString = Combine(
    Regex(r"'(?:[^'\n\r\\]|(?:'')|(?:\\(?:[^x]|x[0-9a-fA-F]+)))*") + "'"
).set_parser_name("string enclosed in single quotes")

quotedString = Combine(
    Regex(r'"(?:[^"\n\r\\]|(?:"")|(?:\\(?:[^x]|x[0-9a-fA-F]+)))*') + '"'
    | Regex(r"'(?:[^'\n\r\\]|(?:'')|(?:\\(?:[^x]|x[0-9a-fA-F]+)))*") + "'"
).set_parser_name("quotedString using single or double quotes")

unicodeString = Combine(
    Literal("u") + quotedString
).set_parser_name("unicode string literal")


def delimitedList(expr, separator=",", combine=False):
    """
    PARSE DELIMITED LIST OF expr

    :param expr: expression for one item of the list
    :param separator: expression (or string) found between items
        (default= ``","``)
    :param combine: if true, wrap items and separators in a single
        ``Combine``; otherwise the separators are wrapped in ``Suppress``
        (default= ``False``)
    :return: expression matching one or more ``expr`` separated by
        ``separator``

    Example::

        delimitedList(Word(alphas)).parseString("aa,bb,cc") # -> ['aa', 'bb', 'cc']
        delimitedList(Word(hexnums), separator=':', combine=True).parseString("AA:BB:CC:DD:EE") # -> ['AA:BB:CC:DD:EE']
    """
    if combine:
        return Combine(expr + ZeroOrMore(separator + expr))
    else:
        return expr + ZeroOrMore(Suppress(separator) + expr)
def infixNotation(baseExpr, spec, lpar=Suppress(Literal("(")), rpar=Suppress(Literal(")"))):
    """
    Build an expression for operands combined with prefix, suffix, binary and
    ternary operators, with optional parenthesised sub-expressions.

    The input is first matched as a flat sequence of operands and operators;
    a parse action (``make_tree``) then folds that sequence into nested
    results, trying the operators in the order they appear in ``spec``.

    :param baseExpr: expression representing the most basic operand of the
        nested expression
    :param spec: list of tuples, one for each operator precedence level in the
        expression grammar; each tuple is of the form
        ``(opExpr, numTerms, rightLeftAssoc, parseAction)``, where:
        - opExpr is the mo_parsing expression for the operator; may also be a
          string, which will be converted to a Literal; if numTerms is 3,
          opExpr is a tuple of two expressions, for the two operators
          separating the 3 terms
        - numTerms is the number of terms for this operator (must be 1, 2, or 3)
        - rightLeftAssoc is the indicator whether the operator is right or
          left associative, using the mo_parsing-defined constants
          ``RIGHT_ASSOC`` and ``LEFT_ASSOC``.
        - parseAction (optional) is the parse action, or list of parse
          actions, to be associated with expressions matching this operator
    :param lpar: expression for matching left-parentheses
        (default= ``Suppress('(')``)
    :param rpar: expression for matching right-parentheses
        (default= ``Suppress(')')``)
    :return: ParserElement
    """
    # cache of normalized operators, keyed by id() of the operator as given
    all_op = {}

    def norm(op):
        # Return (is_suppressed, operator_expression) for one operator from spec.
        if op == None:
            op = _no_op
        output = all_op.get(id(op))
        if output:
            return output

        def record_self(tok):
            # NOTE(review): the ParseResults built here is discarded (there is
            # no `return`) - confirm whether that is intended.
            ParseResults(tok.type, tok.start, tok.end, [tok.type.parser_name])

        output = engine.CURRENT.normalize(op)
        # unwrap Suppress, but remember it so make_tree can drop the operator
        is_suppressed = isinstance(output, Suppress)
        if is_suppressed:
            output = output.expr
        output = output.addParseAction(record_self)
        all_op[id(op)] = is_suppressed, output
        return is_suppressed, output

    opList = []
    """ SCRUBBED LIST OF OPERATORS * expr - used exclusively for ParseResult(expr, [...]), not used to match * op - used to match * arity - same * assoc - same * parse_actions - same """
    # each opList entry is (expr, op, is_suppressed, arity, assoc, parse_actions)
    for operDef in spec:
        # the 4th element of the tuple (parse action) is optional
        op, arity, assoc, rest = operDef[0], operDef[1], operDef[2], operDef[3:]
        parse_actions = list(map(wrap_parse_action, listwrap(rest[0]))) if rest else []
        if arity == 1:
            is_suppressed, op = norm(op)
            # NOTE(review): make_tree treats RIGHT_ASSOC unary operators as
            # prefix, yet the Group here puts the operand first - confirm.
            if assoc == RIGHT_ASSOC:
                opList.append((
                    Group(baseExpr + op),
                    op,
                    is_suppressed,
                    arity,
                    assoc,
                    parse_actions,
                ))
            else:
                opList.append((
                    Group(op + baseExpr),
                    op,
                    is_suppressed,
                    arity,
                    assoc,
                    parse_actions,
                ))
        elif arity == 2:
            is_suppressed, op = norm(op)
            opList.append((
                Group(baseExpr + op + baseExpr),
                op,
                is_suppressed,
                arity,
                assoc,
                parse_actions,
            ))
        elif arity == 3:
            # two operators: is_suppressed and op both become 2-tuples
            is_suppressed, op = zip(norm(op[0]), norm(op[1]))
            opList.append((
                Group(baseExpr + op[0] + baseExpr + op[1] + baseExpr),
                op,
                is_suppressed,
                arity,
                assoc,
                parse_actions,
            ))
    opList = tuple(opList)

    def record_op(op):
        # Make a parse action that pairs the matched tokens with `op`, so that
        # make_tree can tell which expression produced each flat token.
        def output(tokens):
            return ParseResults(NO_PARSER, tokens.start, tokens.end, [(tokens, op)])

        return output

    # unary operators with RIGHT_ASSOC are matched before the operand
    prefix_ops = MatchFirst([
        op.addParseAction(record_op(op))
        for expr, op, is_suppressed, arity, assoc, pa in opList
        if arity == 1 and assoc == RIGHT_ASSOC
    ])
    # unary operators with LEFT_ASSOC are matched after the operand
    suffix_ops = MatchFirst([
        op.addParseAction(record_op(op))
        for expr, op, is_suppressed, arity, assoc, pa in opList
        if arity == 1 and assoc == LEFT_ASSOC
    ])
    # every distinct operator part of the binary and ternary operators
    ops = Or([
        opPart.addParseAction(record_op(opPart))
        for opPart in set(
            opPart
            for expr, op, is_suppressed, arity, assoc, pa in opList
            if arity > 1
            for opPart in (op if isinstance(op, tuple) else [op])
        )
    ])

    def make_tree(tokens, loc, string):
        # Fold the flat sequence of (result, op) pairs into one nested result.
        # Operators are tried in opList order; after every reduction the scan
        # restarts from the first operator.
        flat_tokens = list(tokens)
        num = len(opList)
        op_index = 0
        while len(flat_tokens) > 1 and op_index < num:
            expr, op, is_suppressed, arity, assoc, parse_actions = opList[op_index]
            if arity == 1:
                if assoc == RIGHT_ASSOC:
                    # PREFIX OPERATOR -3
                    # scan right-to-left; i is the operator, i + 1 its operand
                    todo = list(reversed(list(enumerate(flat_tokens[:-1]))))
                    for i, (r, o) in todo:
                        if o == op:
                            tok = flat_tokens[i + 1][0]
                            if is_suppressed:
                                result = ParseResults(expr, tok.start, tok.end, (tok, ))
                            else:
                                result = ParseResults(expr, r.start, tok.end, (r, tok))
                            break
                    else:
                        # this operator does not occur; try the next one
                        op_index += 1
                        continue
                else:
                    # SUFFIX OPERATOR 3!
                    # the slice starts at 1, so i is the operand and i + 1 the operator
                    todo = list(enumerate(flat_tokens[1:]))
                    for i, (r, o) in todo:
                        if o == op:
                            tok = flat_tokens[i][0]
                            if is_suppressed:
                                result = ParseResults(expr, tok.start, tok.end, (tok, ))
                            else:
                                result = ParseResults(expr, tok.start, r.end, (
                                    tok,
                                    r,
                                ))
                            break
                    else:
                        # this operator does not occur; try the next one
                        op_index += 1
                        continue
            elif arity == 2:
                # operands at i and i + 2, operator at i + 1
                todo = list(enumerate(flat_tokens[1:-1]))
                if assoc == RIGHT_ASSOC:
                    # right-associative: reduce the right-most occurrence first
                    todo = list(reversed(todo))
                for i, (r, o) in todo:
                    if o == op:
                        if is_suppressed:
                            result = ParseResults(
                                expr,
                                flat_tokens[i][0].start,
                                flat_tokens[i + 2][0].end,
                                (flat_tokens[i][0], flat_tokens[i + 2][0]),
                            )
                        else:
                            result = ParseResults(
                                expr,
                                flat_tokens[i][0].start,
                                flat_tokens[i + 2][0].end,
                                (flat_tokens[i][0], r, flat_tokens[i + 2][0]),
                            )
                        break
                else:
                    # this operator does not occur; try the next one
                    op_index += 1
                    continue
            else:  # arity==3
                # operands at i, i + 2, i + 4; operators at i + 1 and i + 3
                todo = list(enumerate(flat_tokens[1:-3]))
                if assoc == RIGHT_ASSOC:
                    todo = list(reversed(todo))
                for i, (r0, o0) in todo:
                    if o0 == op[0]:
                        r1, o1 = flat_tokens[i + 3]
                        if o1 == op[1]:
                            seq = [
                                flat_tokens[i][0],
                                flat_tokens[i + 2][0],
                                flat_tokens[i + 4][0],
                            ]
                            s0, s1 = is_suppressed
                            # insert the later operator first so that index 1
                            # is still the right place for the earlier one
                            if not s1:
                                seq.insert(2, r1)
                            if not s0:
                                seq.insert(1, r0)
                            result = ParseResults(expr, seq[0].start, seq[-1].end, seq)
                            break
                else:
                    # this operator pair does not occur; try the next one
                    op_index += 1
                    continue
            # a reduction happened: run the operator's parse actions on it
            for p in parse_actions:
                result = p(result, -1, string)
            # number of flat tokens consumed by a reduction, indexed by arity
            offset = (0, 2, 3, 5)[arity]
            flat_tokens[i:i + offset] = [(result, (expr, ))]
            # restart from the first operator in opList
            op_index = 0
        result = flat_tokens[0][0]
        result.end = tokens.end
        return result

    flat = Forward()
    # parenthesised sub-expression; recursion goes through `flat`
    iso = lpar.suppress() + flat + rpar.suppress()
    atom = (baseExpr | iso).addParseAction(record_op(baseExpr))
    modified = ZeroOrMore(prefix_ops) + atom + ZeroOrMore(suffix_ops)
    flat << (modified + ZeroOrMore(ops + modified)
             ).addParseAction(make_tree).streamline()
    return flat.streamline()
def hex_to_char(t):
    """Turn a matched hex escape into a ``Literal`` for the character it names.

    The matched text (``t.value()``) is lower-cased, the part after the first
    ``x`` is read as a base-16 number (up to the next ``x``, if any), and the
    character with that code point is wrapped in a ``Literal``.
    """
    matched = t.value().lower()
    hex_digits = matched.split("x")[1]
    code_point = int(hex_digits, 16)
    return Literal(unichr(code_point))
elif mode in "*?": return ZeroOrMore(operand) elif mode in "+?": return OneOrMore(operand) elif mode == "?": return Optional(operand) else: Log.error("not expected") PLAIN_ENGINE.use() ######################################################################################### # SQUARE BRACKETS any_whitechar = Literal("\\s").addParseAction(lambda: Char(whitespace)) not_whitechar = Literal("\\S").addParseAction(lambda: Char(exclude=whitespace)) any_wordchar = Literal("\\w").addParseAction(lambda: Char(alphanums + "_")) not_wordchar = Literal("\\W").addParseAction(lambda: Char(exclude=alphanums + "_")) any_digitchar = Literal("\\d").addParseAction(lambda: Char(nums)) not_digitchar = Literal("\\D").addParseAction(lambda: Char(exclude=nums)) bs_char = Literal("\\\\").addParseAction(lambda: Literal("\\")) tab_char = Literal("\\t").addParseAction(lambda: Literal("\t")) CR = Literal("\\n").addParseAction(lambda: Literal("\n")) LF = Literal("\\r").addParseAction(lambda: Literal("\r")) any_char = Literal(".").addParseAction(lambda: AnyChar()) macro = ( any_whitechar | any_wordchar | any_digitchar
elif mode in "*?": return ZeroOrMore(operand) elif mode in "+?": return OneOrMore(operand) elif mode == "?": return Optional(operand) else: Log.error("not expected") PLAIN_ENGINE.use() ######################################################################################### # SQUARE BRACKETS any_whitechar = Literal("\\s").addParseAction(lambda: Char(whitespace)) not_whitechar = Literal("\\S").addParseAction(lambda: Char(exclude=whitespace)) any_wordchar = Literal("\\w").addParseAction(lambda: Char(alphanums + "_")) not_wordchar = Literal("\\W").addParseAction( lambda: Char(exclude=alphanums + "_")) any_digitchar = Literal("\\d").addParseAction(lambda: Char(nums)) not_digitchar = Literal("\\D").addParseAction(lambda: Char(exclude=nums)) bs_char = Literal("\\\\").addParseAction(lambda: Literal("\\")) tab_char = Literal("\\t").addParseAction(lambda: Literal("\t")) CR = Literal("\\n").addParseAction(lambda: Literal("\n")) LF = Literal("\\r").addParseAction(lambda: Literal("\r")) any_char = Literal(".").addParseAction(lambda: AnyChar()) macro = (any_whitechar | any_wordchar | any_digitchar