Example #1
def locatedExpr(expr):
    """Helper to decorate a returned token with its starting and ending
    locations in the input string.

    This helper adds the following results names:

     - locn_start = location where matched expression begins
     - locn_end = location where matched expression ends
     - value = the actual parsed results

    Be careful if the input text contains ``<TAB>`` characters; you
    may want to call `ParserElement.parseWithTabs` first.

    Example::

        wd = Word(alphas)
        for match in locatedExpr(wd).searchString("ljsdf123lksdjjf123lkkjj1222"):
            print(match)

    prints::

        [[0, 'ljsdf', 5]]
        [[8, 'lksdjjf', 15]]
        [[18, 'lkkjj', 23]]
    """
    locator = Empty().addParseAction(lambda t, l, s: l)
    return Group(locator("locn_start") + Group(expr)("value") + locator("locn_end"))
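The attached names make it easy to pull the pieces out of each match (a
sketch, assuming pyparsing-style named access on the grouped result)::

    wd = Word(alphas)
    for match in locatedExpr(wd).searchString("ljsdf123lksdjjf"):
        located = match[0]
        # named results attached by the helper
        print(located["locn_start"], located["value"], located["locn_end"])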
Example #2
def originalTextFor(expr, asString=True):
    """Helper to return the original, untokenized text for a given
    expression.  Useful to restore the parsed fields of an HTML start
    tag into the raw tag text itself, or to revert separate tokens with
    intervening whitespace back to the original matching input text. By
    default, returns a string containing the original parsed text.

    If the optional ``asString`` argument is passed as
    ``False``, then the return value is
    a `ParseResults` containing any results names that
    were originally matched, and a single token containing the original
    matched text from the input string.  So if the expression passed to
    `originalTextFor` contains expressions with defined
    results names, you must set ``asString`` to ``False`` if you
    want to preserve those results name values.

    Example::

        src = "this is test <b> bold <i>text</i> </b> normal text "
        for tag in ("b", "i"):
            opener, closer = makeHTMLTags(tag)
            patt = originalTextFor(opener + SkipTo(closer) + closer)
            print(patt.searchString(src)[0])

    prints::

        ['<b> bold <i>text</i> </b>']
        ['<i>text</i>']
    """
    locMarker = Empty().addParseAction(lambda t, l, s: l)
    matchExpr = locMarker("_original_start") + Group(expr) + locMarker("_original_end")
    matchExpr = matchExpr.addParseAction(extractText)
    return matchExpr
Example #3
def makeHTMLTags(tagStr, suppress_LT=Suppress("<"), suppress_GT=Suppress(">")):
    """Helper to construct opening and closing tag expressions for HTML,
    given a tag name. Matches tags in either upper or lower case,
    attributes with namespaces and with quoted or unquoted values.
    """
    if isinstance(tagStr, text):
        resname = tagStr
        tagStr = Keyword(tagStr, caseless=True)
    else:
        resname = tagStr.parser_name

    tagAttrName = Word(alphas, alphanums + "_-:")
    tagAttrValue = quotedString.addParseAction(removeQuotes) | Word(
        printables, exclude=">"
    )
    simpler_name = "".join(resname.replace(":", " ").title().split())

    openTag = (
        (
            suppress_LT
            + tagStr("tag")
            + OpenDict(ZeroOrMore(Group(
                tagAttrName.addParseAction(downcaseTokens)
                + Optional(Suppress("=") + tagAttrValue)
            )))
            + Optional(
                "/", default=[False]
            )("empty").addParseAction(lambda t, l, s: t[0] == "/")
            + suppress_GT
        )
        .set_token_name("start" + simpler_name)
        .set_parser_name("<%s>" % resname)
    )

    closeTag = (
        Combine(Literal("</") + tagStr + ">")
        .set_token_name("end" + simpler_name)
        .set_parser_name("</%s>" % resname)
    )

    # openTag.tag = resname
    # closeTag.tag = resname
    # openTag.tag_body = SkipTo(closeTag)

    return openTag, closeTag
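A usage sketch, following the pyparsing original this helper derives from
(``SkipTo`` and ``searchString`` assumed in scope; the URL is illustrative)::

    text = '<td>More info at the <a href="https://example.com">wiki</a> page</td>'
    a, a_end = makeHTMLTags("A")
    link_expr = a + SkipTo(a_end)("link_text") + a_end
    for link in link_expr.searchString(text):
        # tag attributes such as href are also available as named results
        print(link.link_text, "->", link.href)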
Example #4
def dictOf(key, value):
    """Helper to easily and clearly define a dictionary by specifying
    the respective patterns for the key and value.  Takes care of
    defining the `Dict`, `ZeroOrMore`, and
    `Group` tokens in the proper order.  The key pattern
    can include delimiting markers or punctuation, as long as they are
    suppressed, thereby leaving the significant key text.  The value
    pattern can include named results, so that the `Dict` results
    can include named token fields.

    Example::

        text = "shape: SQUARE posn: upper left color: light blue texture: burlap"
        attr_expr = (label + Suppress(':') + OneOrMore(data_word, stopOn=label).addParseAction(' '.join))
        print(OneOrMore(attr_expr).parseString(text))

        attr_label = label
        attr_value = Suppress(':') + OneOrMore(data_word, stopOn=label).addParseAction(' '.join)

        # similar to Dict, but simpler call format
        result = dictOf(attr_label, attr_value).parseString(text)
        print(result)
        print(result['shape'])
        print(result.shape)  # object attribute access works too
        print(result)

    prints::

        [['shape', 'SQUARE'], ['posn', 'upper left'], ['color', 'light blue'], ['texture', 'burlap']]
        - color: light blue
        - posn: upper left
        - shape: SQUARE
        - texture: burlap
        SQUARE
        SQUARE
        {'color': 'light blue', 'shape': 'SQUARE', 'posn': 'upper left', 'texture': 'burlap'}
    """
    return Dict(OneOrMore(Group(key + value)))
Example #5
def indentedBlock(blockStatementExpr, indent=True):
    """Helper method for defining space-delimited indentation blocks,
    such as those used to define block statements in Python source code.

    Parameters:

     - blockStatementExpr - expression defining the syntax of a statement
       that is repeated within the indented block
     - indent - boolean indicating whether the block must be indented
       beyond the current level; set to False for a block of left-most
       statements (default= ``True``)

    Indentation state is tracked on the module-level ``_indent_stack``;
    multiple indented-block expressions within a single grammar share it.

    A valid block must contain at least one ``blockStatement``.
    """
    blockStatementExpr.engine.add_ignore("\\" + LineEnd())

    PEER = Forward()
    DEDENT = Forward()

    def _reset_stack(p=None, l=None, s=None, ex=None):
        oldCol, oldPeer, oldDedent = _indent_stack.pop()
        PEER << oldPeer
        DEDENT << oldDedent

    def peer_stack(expectedCol):
        def output(t, l, s):
            if l >= len(s):
                return
            curCol = col(l, s)
            if curCol != expectedCol:
                if curCol > expectedCol:
                    raise ParseException(t.type, l, s, "illegal nesting")
                raise ParseException(t.type, l, s, "not a peer entry")

        return output

    def dedent_stack(expectedCol):
        def output(t, l, s):
            if l >= len(s):
                return
            curCol = col(l, s)
            if curCol not in (i for i, _, _ in _indent_stack):
                raise ParseException(t.type, l, s, "not an unindent")
            if curCol < _indent_stack[-1][0]:
                oldCol, oldPeer, oldDedent = _indent_stack.pop()
                PEER << oldPeer
                DEDENT << oldDedent

        return output

    def indent_stack(t, l, s):
        curCol = col(l, s)
        if curCol > _indent_stack[-1][0]:
            PEER << Empty().addParseAction(peer_stack(curCol))
            DEDENT << Empty().addParseAction(dedent_stack(curCol))
            _indent_stack.append((curCol, PEER, DEDENT))
        else:
            raise ParseException(t.type, l, s, "not a subentry")

    def nodent_stack(t, l, s):
        curCol = col(l, s)
        if curCol == _indent_stack[-1][0]:
            PEER << Empty().addParseAction(peer_stack(curCol))
            DEDENT << Empty().addParseAction(dedent_stack(curCol))
            _indent_stack.append((curCol, PEER, DEDENT))
        else:
            raise ParseException(t.type, l, s, "not a subentry")

    NL = OneOrMore(LineEnd().suppress())
    INDENT = Empty().addParseAction(indent_stack)
    NODENT = Empty().addParseAction(nodent_stack)

    if indent:
        smExpr = Group(
            Optional(NL)
            + INDENT
            + OneOrMore(PEER + Group(blockStatementExpr) + Optional(NL))
            + DEDENT
        )
    else:
        smExpr = Group(
            Optional(NL)
            + NODENT
            + OneOrMore(PEER + Group(blockStatementExpr) + Optional(NL))
            + DEDENT
        )
    return smExpr.setFailAction(_reset_stack).set_parser_name("indented block")
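A minimal grammar sketch (hypothetical; assumes ``Forward``, ``Group``,
``Optional``, ``Word``, and ``alphas`` are in scope)::

    # a statement is a word, optionally followed by an indented block
    # of nested statements, as in a Python-like outline
    stmt = Forward()
    suite = indentedBlock(stmt)
    stmt << Group(Word(alphas) + Optional(suite))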
Example #6
def nestedExpr(opener="(", closer=")", content=None, ignoreExpr=quotedString):
    """Helper method for defining nested lists enclosed in opening and
    closing delimiters ("(" and ")" are the default).

    Parameters:
     - opener - opening character for a nested list
       (default= ``"("``); can also be a mo_parsing expression
     - closer - closing character for a nested list
       (default= ``")"``); can also be a mo_parsing expression
     - content - expression for items within the nested lists
       (default= ``None``)
     - ignoreExpr - expression for ignoring opening and closing
       delimiters (default= `quotedString`)

    If an expression is not provided for the content argument, the
    nested expression will capture all whitespace-delimited content
    between delimiters as a list of separate values.

    Use the ``ignoreExpr`` argument to define expressions that may
    contain opening or closing characters that should not be treated as
    opening or closing characters for nesting, such as quotedString or
    a comment expression.  Specify multiple expressions using an
    `Or` or `MatchFirst`. The default is
    `quotedString`, but if no expressions are to be ignored, then
    pass ``None`` for this argument.

    """
    if opener == closer:
        raise ValueError("opening and closing strings cannot be the same")
    if content is None:
        if not isinstance(opener, text) or not isinstance(closer, text):
            raise ValueError(
                "opening and closing arguments must be strings if no content expression"
                " is given"
            )

        ignore_chars = engine.CURRENT.white_chars
        with Engine(""):

            def scrub(t):
                return t[0].strip()

            if len(opener) == 1 and len(closer) == 1:
                if ignoreExpr is not None:
                    content = Combine(OneOrMore(
                        ~ignoreExpr
                        + CharsNotIn(opener + closer + "".join(ignore_chars), exact=1,)
                    )).addParseAction(scrub)
                else:
                    content = Empty() + CharsNotIn(
                        opener + closer + "".join(ignore_chars)
                    ).addParseAction(scrub)
            else:
                if ignoreExpr is not None:
                    content = Combine(OneOrMore(
                        ~ignoreExpr
                        + ~Literal(opener)
                        + ~Literal(closer)
                        + CharsNotIn(ignore_chars, exact=1)
                    )).addParseAction(scrub)
                else:
                    content = Combine(OneOrMore(
                        ~Literal(opener)
                        + ~Literal(closer)
                        + CharsNotIn(ignore_chars, exact=1)
                    )).addParseAction(scrub)
    ret = Forward()
    if ignoreExpr is not None:
        ret <<= Group(
            Suppress(opener) + ZeroOrMore(ignoreExpr | ret | content) + Suppress(closer)
        )
    else:
        ret <<= Group(Suppress(opener) + ZeroOrMore(ret | content) + Suppress(closer))
    ret.set_parser_name("nested %s%s expression" % (opener, closer))
    return ret
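For example (a sketch; the output shown is what pyparsing's reference
implementation produces)::

    data = "(a (b c) (d (e f)))"
    print(nestedExpr().parseString(data))
    # -> [['a', ['b', 'c'], ['d', ['e', 'f']]]]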
Example #7
def countFieldParseAction(t, l, s):
    # parse action for a counted-array field: the leading integer token
    # determines how many copies of expr the deferred arrayExpr must match
    n = t[0]
    arrayExpr << Group(Many(expr, exact=n))
    return []
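A sketch of how such an action is typically wired up, mirroring pyparsing's
``countedArray`` helper (the wrapper and its names are assumptions)::

    def countedArray(expr):
        arrayExpr = Forward()

        def countFieldParseAction(t, l, s):
            n = t[0]
            arrayExpr << Group(Many(expr, exact=n))
            return []  # consume the count token itself

        intExpr = Word(nums).addParseAction(lambda t, l, s: int(t[0]))
        intExpr = intExpr.addParseAction(countFieldParseAction)
        return intExpr + arrayExpr

    # countedArray(Word(alphas)).parseString("2 ab cd") -> [['ab', 'cd']]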
Example #8
def infixNotation(baseExpr, spec, lpar=Suppress("("), rpar=Suppress(")")):
    """
    :param baseExpr: expression representing the most basic operand of
       the nested expression grammar
    :param spec: list of tuples, one for each operator precedence level
       in the expression grammar; each tuple is of the form ``(opExpr,
       numTerms, rightLeftAssoc, parseAction)``, where:

       - opExpr is the mo_parsing expression for the operator; may also
         be a string, which will be converted to a Literal; if numTerms
         is 3, opExpr is a tuple of two expressions, for the two
         operators separating the 3 terms
       - numTerms is the number of terms for this operator (must be 1,
         2, or 3)
       - rightLeftAssoc is the indicator whether the operator is right
         or left associative, using the mo_parsing-defined constants
         ``RIGHT_ASSOC`` and ``LEFT_ASSOC``.
       - parseAction is the parse action to be associated with
         expressions matching this operator expression (the parse action
         tuple member may be omitted); if the parse action is passed
         a tuple or list of functions, this is equivalent to calling
         ``addParseAction(*fn)``
         (see `ParserElement.addParseAction`)
    :param lpar: expression for matching left-parentheses
       (default= ``Suppress('(')``)
    :param rpar: expression for matching right-parentheses
       (default= ``Suppress(')')``)
    :return: ParserElement
    """

    all_op = {}

    def norm(op):
        output = all_op.get(id(op))
        if output:
            return output

        def record_self(tok):
            return ParseResults(tok.type, [tok.type.parser_name])

        output = engine.CURRENT.normalize(op)
        is_suppressed = isinstance(output, Suppress)
        if is_suppressed:
            output = output.expr
        output = output.addParseAction(record_self)
        all_op[id(op)] = is_suppressed, output
        return is_suppressed, output

    opList = []
    """
    SCRUBBED LIST OF OPERATORS
    * expr - used exclusively for ParseResult(expr, [...]), not used to match
    * op - used to match 
    * arity - same
    * assoc - same
    * parse_actions - same
    """

    for operDef in spec:
        op, arity, assoc, rest = operDef[0], operDef[1], operDef[2], operDef[3:]
        parse_actions = list(map(wrap_parse_action, listwrap(rest[0]))) if rest else []
        if arity == 1:
            is_suppressed, op = norm(op)
            if assoc == RIGHT_ASSOC:
                opList.append((
                    Group(baseExpr + op),
                    op,
                    is_suppressed,
                    arity,
                    assoc,
                    parse_actions,
                ))
            else:
                opList.append((
                    Group(op + baseExpr),
                    op,
                    is_suppressed,
                    arity,
                    assoc,
                    parse_actions,
                ))
        elif arity == 2:
            is_suppressed, op = norm(op)
            opList.append((
                Group(baseExpr + op + baseExpr),
                op,
                is_suppressed,
                arity,
                assoc,
                parse_actions,
            ))
        elif arity == 3:
            is_suppressed, op = zip(norm(op[0]), norm(op[1]))
            opList.append((
                Group(baseExpr + op[0] + baseExpr + op[1] + baseExpr),
                op,
                is_suppressed,
                arity,
                assoc,
                parse_actions,
            ))
    opList = tuple(opList)

    def record_op(op):
        def output(tokens):
            return ParseResults(NO_PARSER, [(tokens, op)])

        return output

    prefix_ops = MatchFirst([
        op.addParseAction(record_op(op))
        for expr, op, is_suppressed, arity, assoc, pa in opList
        if arity == 1 and assoc == RIGHT_ASSOC
    ])
    suffix_ops = MatchFirst([
        op.addParseAction(record_op(op))
        for expr, op, is_suppressed, arity, assoc, pa in opList
        if arity == 1 and assoc == LEFT_ASSOC
    ])
    ops = Or([
        opPart.addParseAction(record_op(opPart))
        for expr, op, is_suppressed, arity, assoc, pa in opList
        if arity > 1
        for opPart in (op if isinstance(op, tuple) else [op])
    ])

    def make_tree(tokens, loc, string):
        flat_tokens = list(tokens)
        num = len(opList)
        op_index = 0
        while len(flat_tokens) > 1 and op_index < num:
            expr, op, is_suppressed, arity, assoc, parse_actions = opList[op_index]
            if arity == 1:
                if assoc == RIGHT_ASSOC:
                    # PREFIX OPERATOR -3
                    todo = list(reversed(list(enumerate(flat_tokens[:-1]))))
                    for i, (r, o) in todo:
                        if o == op:
                            if is_suppressed:
                                result = ParseResults(expr, (flat_tokens[i + 1][0],))
                            else:
                                result = ParseResults(expr, (r, flat_tokens[i + 1][0]))
                            break
                    else:
                        op_index += 1
                        continue
                else:
                    # SUFFIX OPERATOR 3!
                    todo = list(enumerate(flat_tokens[1:]))
                    for i, (r, o) in todo:
                        if o == op:
                            if is_suppressed:
                                result = ParseResults(expr, (flat_tokens[i][0],))
                            else:
                                result = ParseResults(expr, (flat_tokens[i][0], r,))
                            break
                    else:
                        op_index += 1
                        continue
            elif arity == 2:
                todo = list(enumerate(flat_tokens[1:-1]))
                if assoc == RIGHT_ASSOC:
                    todo = list(reversed(todo))

                for i, (r, o) in todo:
                    if o == op:
                        if is_suppressed:
                            result = ParseResults(
                                expr, (flat_tokens[i][0], flat_tokens[i + 2][0])
                            )
                        else:
                            result = ParseResults(
                                expr, (flat_tokens[i][0], r, flat_tokens[i + 2][0])
                            )
                        break
                else:
                    op_index += 1
                    continue

            else:  # arity==3
                todo = list(enumerate(flat_tokens[1:-3]))
                if assoc == RIGHT_ASSOC:
                    todo = list(reversed(todo))

                for i, (r0, o0) in todo:
                    if o0 == op[0]:
                        r1, o1 = flat_tokens[i + 3]
                        if o1 == op[1]:
                            seq = [
                                flat_tokens[i][0],
                                flat_tokens[i + 2][0],
                                flat_tokens[i + 4][0],
                            ]
                            s0, s1 = is_suppressed
                            if not s1:
                                seq.insert(2, r1)
                            if not s0:
                                seq.insert(1, r0)

                            result = ParseResults(expr, seq)
                            break
                else:
                    op_index += 1
                    continue

            for p in parse_actions:
                result = p(result, -1, string)
            offset = (0, 2, 3, 5)[arity]
            flat_tokens[i : i + offset] = [(result, (expr,))]
            op_index = 0

        return flat_tokens[0][0]

    flat = Forward()
    iso = lpar.suppress() + flat + rpar.suppress()
    atom = (baseExpr | iso).addParseAction(record_op(baseExpr))
    modified = ZeroOrMore(prefix_ops) + atom + ZeroOrMore(suffix_ops)
    flat << (modified + ZeroOrMore(ops + modified)).addParseAction(make_tree)

    return flat
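A usage sketch (assuming ``Word``, ``nums``, ``alphas``, ``Literal``, and the
module's ``RIGHT_ASSOC``/``LEFT_ASSOC`` constants are in scope)::

    integer = Word(nums)
    varname = Word(alphas)

    arith_expr = infixNotation(
        integer | varname,
        [
            ("-", 1, RIGHT_ASSOC),                         # unary minus
            (Literal("*") | Literal("/"), 2, LEFT_ASSOC),  # multiplicative
            (Literal("+") | Literal("-"), 2, LEFT_ASSOC),  # additive
        ],
    )

    print(arith_expr.parseString("9 + 2 * 3"))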
Example #9
def countFieldParseAction(t, l, s):
    # the same parse action in pyparsing-style form: when the leading
    # count is zero, an empty Group is substituted instead
    n = t[0]
    arrayExpr << (n and Group(And([expr] * n)) or Group(empty))
    return []
Example #10
_bslash = chr(92)  # backslash constant used by the expressions below
_escapedPunc = Word(
    _bslash, r"\[]-*.$+^?()~ ", exact=2
).addParseAction(lambda t, l, s: t[0][1])
_escapedHexChar = (
    Regex(r"\\0?[xX][0-9a-fA-F]+").addParseAction(lambda t: unichr(int(
        t[0].lstrip('\\').lstrip('0').lstrip('xX'), 16
    )))
)
_escapedOctChar = Regex(r"\\0[0-7]+").addParseAction(lambda t, l, s: unichr(int(
    t[0][1:], 8
)))
_singleChar = (
    _escapedPunc | _escapedHexChar | _escapedOctChar | CharsNotIn(r"\]", exact=1)
)
_charRange = Group(_singleChar + Suppress("-") + _singleChar)
_reBracketExpr = (
    Literal("[")
    + Optional("^").set_token_name("negate")
    + Group(OneOrMore(_charRange | _singleChar)).set_token_name("body")
    + "]"
)


def srange(s):
    r"""Helper to easily define string ranges for use in Word
    construction. Borrows syntax from regexp '[]' string range
    definitions::

        srange("[0-9]")   -> "0123456789"
        srange("[a-z]")   -> "abcdefghijklmnopqrstuvwxyz"
        srange("[a-z$_]") -> "abcdefghijklmnopqrstuvwxyz$_"

    The input string must be enclosed in []'s, and the returned string
    is the expanded character set joined into a single string. The
    values enclosed in the []'s may be single characters, escaped
    characters (``\-``, ``\]``, ``\x21``, ``\041``), ranges such as
    ``a-z``, or any combination of these.
    """
    # body reconstructed along the lines of pyparsing's reference
    # implementation, using the _reBracketExpr defined above
    _expanded = (
        lambda p: p if not isinstance(p, ParseResults)
        else "".join(unichr(c) for c in range(ord(p[0]), ord(p[1]) + 1))
    )
    try:
        return "".join(_expanded(part) for part in _reBracketExpr.parseString(s).body)
    except Exception:
        return ""
Example #11
# fragment from a regex-to-parser translator; macro, hex_to_char, to_range,
# to_bracket, and any_wordchar are defined earlier in the module
escapedChar = (
    ~macro + Combine("\\" + AnyChar())
).addParseAction(lambda t: Literal(t.value()[1]))
plainChar = Char(exclude=r"\]").addParseAction(lambda t: Literal(t.value()))

escapedHexChar = Combine(
    (Literal("\\0x") | Literal("\\x") | Literal("\\X"))  # lookup literals is faster
    + OneOrMore(Char(hexnums))
).addParseAction(hex_to_char)

escapedOctChar = Combine(
    Literal("\\0") + OneOrMore(Char("01234567"))
).addParseAction(lambda t: Literal(unichr(int(t.value()[2:], 8))))

singleChar = escapedHexChar | escapedOctChar | escapedChar | plainChar

charRange = Group(singleChar("min") + "-" + singleChar("max")).addParseAction(to_range)

brackets = (
    "["
    + Optional("^")("negate")
    + OneOrMore(Group(charRange | singleChar | macro)("body"))
    + "]"
).addParseAction(to_bracket)

#########################################################################################
# REGEX
regex = Forward()

line_start = Literal("^").addParseAction(lambda: LineStart())
line_end = Literal("$").addParseAction(lambda: LineEnd())
word_edge = Literal("\\b").addParseAction(lambda: NotAny(any_wordchar))