def makeHTMLTags(tagStr, suppress_LT=Suppress("<"), suppress_GT=Suppress(">")):
    """Build the (opening, closing) pair of tag expressions for one HTML tag.

    ``tagStr`` is either a tag name given as text, which is matched as a
    caseless keyword, or an existing expression, in which case its
    ``parser_name`` is used as the label. Tags match in upper or lower
    case; attributes may carry namespaces and may have a quoted value, an
    unquoted value, or no value at all. The opening tag stores, under the
    name ``empty``, whether a ``/`` preceded the closing bracket.

    ``suppress_LT`` and ``suppress_GT`` are the expressions placed at the
    start and end of the opening tag.

    Returns the tuple ``(openTag, closeTag)``.
    """
    if isinstance(tagStr, text):
        label, tag_expr = tagStr, Keyword(tagStr, caseless=True)
    else:
        label, tag_expr = tagStr.parser_name, tagStr

    attr_name = Word(alphas, alphanums + "_-:")
    quoted_value = quotedString.addParseAction(removeQuotes)
    attr_value = quoted_value | Word(printables, exclude=">")

    # "ns:tag" -> "NsTag"; used as the suffix of the start/end token names
    words = label.replace(":", " ").title().split()
    camel = "".join(words)

    # one attribute: a name (run through downcaseTokens) and an optional "=value"
    attribute = Group(
        attr_name.addParseAction(downcaseTokens)
        + Optional(Suppress("=") + attr_value)
    )
    # optional "/" before the closing bracket; the action turns it into a boolean
    slash_flag = Optional("/", default=[False])("empty").addParseAction(
        lambda t, l, s: t[0] == "/"
    )

    open_body = (
        suppress_LT
        + tag_expr("tag")
        + OpenDict(ZeroOrMore(attribute))
        + slash_flag
        + suppress_GT
    )
    openTag = open_body.set_token_name("start" + camel).set_parser_name(
        "<%s>" % label
    )

    close_body = Combine(Literal("</") + tag_expr + ">")
    closeTag = close_body.set_token_name("end" + camel).set_parser_name(
        "</%s>" % label
    )

    return openTag, closeTag
def countedArray(expr, intExpr=None):
    """Build an expression for a length-prefixed list of ``expr``.

    The pattern has the form::

        integer expr expr expr...

    The leading integer says how many ``expr`` matches follow. The count
    token itself is dropped from the output; the matched ``expr`` tokens
    are returned as one grouped list.

    ``intExpr``, when given, replaces the default decimal count; it should
    be a mo_parsing expression that produces an integer value.

    Example::

        countedArray(Word(alphas)).parseString('2 ab cd ef')  # -> ['ab', 'cd']

        # the leading count is written in binary here,
        # so '10' announces an array of 2 values
        binaryConstant = Word('01').addParseAction(lambda t: int(t[0], 2))
        countedArray(Word(alphas), intExpr=binaryConstant).parseString('10 ab cd ef')  # -> ['ab', 'cd']
    """
    if intExpr is None:
        length_expr = Word(nums).addParseAction(lambda t: int(t[0]))
    else:
        length_expr = intExpr

    # placeholder that is bound once the count has been parsed
    counted_items = Forward()

    def bind_item_count(t, l, s):
        # point the placeholder at exactly t[0] repetitions of expr,
        # and emit nothing for the count token itself
        counted_items << Group(Many(expr, exact=t[0]))
        return []

    length_expr = length_expr.set_parser_name("arrayLen")
    length_expr = length_expr.addParseAction(bind_item_count, callDuringTry=True)

    sequence = length_expr + counted_items
    return sequence.set_parser_name("(len) " + text(expr) + "...")
# NOTE(review): this span opens partway through an expression whose beginning is
# not visible here, and its layout was reconstructed from a flattened line -
# confirm the indentation against the full file.
            + INDENT
            + OneOrMore(PEER + Group(blockStatementExpr) + Optional(NL))
            + DEDENT
        )
    else:
        smExpr = Group(
            Optional(NL)
            + NODENT
            + OneOrMore(PEER + Group(blockStatementExpr) + Optional(NL))
            + DEDENT
        )
    # _reset_stack is registered as the fail action of the block expression
    return smExpr.setFailAction(_reset_stack).set_parser_name("indented block")


# Generic open/close tag pair: instead of a fixed tag name, makeHTMLTags is
# given a Word expression (alphas, then alphanums + "_:") named "any tag".
anyOpenTag, anyCloseTag = makeHTMLTags(
    Word(alphas, alphanums + "_:").set_parser_name("any tag")
)

# Entity name -> replacement character:
# gt -> ">", lt -> "<", amp -> "&", nbsp -> " ", quot -> '"', apos -> "'"
_htmlEntityMap = dict(zip("gt lt amp nbsp quot apos".split(), "><& \"'"))
# Regex of the form "&name;" where name is one of the keys above; the name is
# captured in the named group "entity".
commonHTMLEntity = Regex(
    "&(?P<entity>" + "|".join(_htmlEntityMap.keys()) + ");"
).set_parser_name("common HTML entity")


def replaceHTMLEntity(t):
    """Helper parser action to replace common HTML entities with their special characters.

    Looks up ``t.entity`` in ``_htmlEntityMap`` and returns the mapped
    character, or ``None`` when the name is not in the map.
    """
    return _htmlEntityMap.get(t.entity)


# it's easy to get these comment structures wrong - they're very common, so may as well make them available
cStyleComment = Combine(
    Regex(r"/\*(?:[^*]|\*(?!/))*") + "*/"
# NOTE(review): this span opens partway through a function whose beginning is
# not visible here, and its layout was reconstructed from a flattened line -
# confirm the indentation against the full file.
    else:
        # opener, then any mix of nested groups and content, then closer;
        # the opener and closer are suppressed from the grouped result
        ret <<= Group(Suppress(opener) + ZeroOrMore(ret | content) + Suppress(closer))
    ret.set_parser_name("nested %s%s expression" % (opener, closer))
    return ret


# convenience constants for positional expressions
empty = Empty().set_parser_name("empty")
lineStart = LineStart().set_parser_name("lineStart")
lineEnd = LineEnd().set_parser_name("lineEnd")
stringStart = StringStart().set_parser_name("stringStart")
stringEnd = StringEnd().set_parser_name("stringEnd")

# Exactly two characters: one from _bslash (presumably the backslash - confirm
# where it is defined) and one from the punctuation set; the parse action
# keeps only the second character.
_escapedPunc = Word(
    _bslash, r"\[]-*.$+^?()~ ", exact=2
).addParseAction(lambda t, l, s: t[0][1])
# Hex escape such as \x41 or \0x41: the action strips the leading backslash,
# then any leading zeros, then the x/X marker, and converts what is left from
# base 16 to a character.
_escapedHexChar = (
    Regex(r"\\0?[xX][0-9a-fA-F]+").addParseAction(lambda t: unichr(int(
        t[0].lstrip('\\').lstrip('0').lstrip('xX'), 16
    )))
)
# Octal escape such as \0101: the action drops the backslash (t[0][1:]) and
# converts the remaining digits from base 8 to a character.
_escapedOctChar = Regex(r"\\0[0-7]+").addParseAction(lambda t, l, s: unichr(int(
    t[0][1:], 8
)))
# One character of a bracket expression: the escape forms are tried first,
# then any single character other than "\" or "]".
_singleChar = (
    _escapedPunc | _escapedHexChar | _escapedOctChar | CharsNotIn(r"\]", exact=1)
)
# "a-z" style range: two single characters around a suppressed "-"
_charRange = Group(_singleChar + Suppress("-") + _singleChar)
_reBracketExpr = (
    Literal("[")
brackets = ( "[" + Optional("^")("negate") + OneOrMore(Group(charRange | singleChar | macro)("body")) + "]" ).addParseAction(to_bracket) ######################################################################################### # REGEX regex = Forward() line_start = Literal("^").addParseAction(lambda: LineStart()) line_end = Literal("$").addParseAction(lambda: LineEnd()) word_edge = Literal("\\b").addParseAction(lambda: NotAny(any_wordchar)) simple_char = Word( printables, exclude=r".^$*+{}[]\|()" ).addParseAction(lambda t: Literal(t.value())) esc_char = ("\\" + AnyChar()).addParseAction(lambda t: Literal(t.value()[1])) with Engine(): # ALLOW SPACES IN THE RANGE repetition = ( Word(nums)("exact") + "}" | Word(nums)("min") + "," + Word(nums)("max") + "}" | Word(nums)("min") + "," + "}" | "," + Word(nums)("max") + "}" ) repetition = Group( "{" + repetition | (Literal("*?") | Literal("+?") | Char("*+?"))("mode") )
# Character range: two single characters stored as "min" and "max" around a
# "-"; the grouped tokens are handed to to_range.
charRange = Group(singleChar("min") + "-" + singleChar("max")).addParseAction(to_range)

# Bracket expression: "[", an optional "^" stored as "negate", one or more
# grouped members (range, single character or macro) each stored as "body",
# then "]"; the matched tokens are handed to to_bracket.
brackets = ("[" + Optional("^")("negate") + OneOrMore(Group(charRange | singleChar | macro)("body")) + "]").addParseAction(to_bracket)

#########################################################################################
# REGEX
# Forward placeholder for the full regex expression; it is not bound within
# this span.
regex = Forward()

# The anchors below do not return text: each parse action returns a parser
# element (LineStart / LineEnd / NotAny) in place of the matched token.
line_start = Literal("^").addParseAction(lambda: LineStart())
line_end = Literal("$").addParseAction(lambda: LineEnd())
word_edge = Literal("\\b").addParseAction(lambda: NotAny(any_wordchar))
# A run of printable characters, excluding the ones listed in `exclude`,
# becomes a Literal of the matched value.
simple_char = Word(printables, exclude=r".^$*+{}[]\|()").addParseAction(lambda t: Literal(t.value()))
# Backslash plus any character: the action builds a Literal from index 1 of
# the matched value (the character after the backslash, assuming t.value()
# is the two-character string - TODO confirm).
esc_char = ("\\" + AnyChar()).addParseAction(lambda t: Literal(t.value()[1]))

# NOTE(review): the extent of this `with` block was reconstructed from a
# flattened line. Only the range alternatives are placed under Engine() here,
# following the comment below - confirm against the full file.
with Engine():
    # ALLOW SPACES IN THE RANGE
    # Alternatives for the part after "{": "n}", "m,n}", "m,}" and ",n}"
    repetition = (Word(nums)("exact") + "}"
                  | Word(nums)("min") + "," + Word(nums)("max") + "}"
                  | Word(nums)("min") + "," + "}"
                  | "," + Word(nums)("max") + "}")

# "+" binds tighter than "|", so this reads: ("{" + range) OR a mode token,
# where the mode is "*?", "+?" or a single character from "*+?" (stored as
# "mode").
repetition = Group("{" + repetition | (Literal("*?") | Literal("+?") | Char("*+?"))("mode"))

# Single "(" character
LB = Char("(")