Python append_config Examples, mo_parsing.utils.append_config Python Examples

Example #1

0

Show file

File: tokens.py Project: astrojams1/cleanstreets

class WordStart(_PositionToken):
    r"""Zero-width token that succeeds only at the start of a word.

    A position qualifies when the next character belongs to ``wordChars``
    (default ``printables``) and the position is either matched by the
    ``^`` anchor or directly preceded by a character outside ``wordChars``.
    ``WordStart(alphanums)`` approximates the leading edge of the
    regular-expression ``\b`` anchor.
    """

    __slots__ = []
    Config = append_config(_PositionToken, "word_chars")

    def __init__(self, wordChars=printables):
        super(WordStart, self).__init__()
        # Two halves of the anchor: a look-behind for one non-word character
        # (or ``^``) and a look-ahead for one word character.
        preceding = CharsNotIn(wordChars, exact=1).__regex__()[1]
        following = Char(wordChars).__regex__()[1]
        anchor = regex_compile(f"(?:(?<={preceding})|^)(?={following})")
        unique_chars = "".join(sorted(set(wordChars)))
        self.set_config(regex=anchor, word_chars=unique_chars)
        self.streamlined = True

    def parseImpl(self, string, start, doActions=True):
        """Return an empty, zero-width result at ``start`` or raise ParseException."""
        if not self.parser_config.regex.match(string, start):
            raise ParseException(self, start, string)
        return ParseResults(self, start, start, [])

    def min_length(self):
        """A position token never consumes input."""
        return 0

    def __regex__(self):
        """Return the precedence marker and the compiled pattern's source."""
        compiled = self.parser_config.regex
        return "+", compiled.pattern

Example #2

0

Show file

class Optional(Many):
    """Match the wrapped expression at most once.

    Parameters:
     - expr - expression that may match zero or one time
     - default (optional) - value to be returned if the optional expression is not found.
    """

    __slots__ = []
    Config = append_config(Many, "defaultValue")

    def __init__(self, expr, default=None):
        Many.__init__(self, expr, stopOn=None, min_match=0, max_match=1)
        fallback = listwrap(default)
        self.set_config(defaultValue=fallback)

    def parseImpl(self, string, start, doActions=True):
        """Parse ``expr`` once; on failure succeed with the default and consume nothing."""
        try:
            inner = self.expr._parse(string, start, doActions)
            return ParseResults(self, inner.start, inner.end, [inner])
        except ParseException:
            fallback = self.parser_config.defaultValue
            return ParseResults(self, start, start, fallback)

    def __str__(self):
        # an explicit parser name wins over the bracketed rendering
        return self.parser_name or f"[{text(self.expr)}]"

Example #3

0

Show file

File: tokens.py Project: astrojams1/cleanstreets

class Token(ParserElement):
    """Common base for the token parsers in this file (Char, Keyword, White, ...)."""

    __slots__ = []
    # presumably extends ParserElement's Config with "match" and "regex"
    # fields -- confirm against append_config
    Config = append_config(ParserElement, "match", "regex")

    def __init__(self):
        ParserElement.__init__(self)
        # flag the instance as already streamlined at construction time
        self.streamlined = True

Example #4

0

Show file

File: tokens.py Project: astrojams1/cleanstreets

class Char(Token):
    """Token that matches exactly one character drawn from a character set."""

    __slots__ = []
    Config = append_config(Token, "charset")

    def __init__(self, charset, asKeyword=False, excludeChars=None):
        r"""
        Represent one character in a given charset

        :param charset: iterable of the characters that are accepted
        :param asKeyword: when true, wrap the pattern in ``\b`` word boundaries
        :param excludeChars: optional characters to remove from ``charset``
        """
        Token.__init__(self)
        if excludeChars:
            charset = set(charset) - set(excludeChars)
        regex = regex_range(charset)
        if asKeyword:
            # Wrap the pattern string itself, not ``self``: __str__ reads the
            # regex config, which is only assigned by set_config() below.
            regex = r"\b%s\b" % regex
        self.set_config(
            regex=regex_compile(regex),
            charset="".join(sorted(set(charset))),
        )

    def parseImpl(self, string, start, doActions=True):
        """Match the compiled pattern at ``start``; raise ParseException on failure."""
        found = self.parser_config.regex.match(string, start)
        if found:
            return ParseResults(self, start, found.end(), [found.group()])

        raise ParseException(self, start, string)

    def min_length(self):
        """A successful match always consumes one character."""
        return 1

    def __regex__(self):
        """Return the precedence marker and the pattern source."""
        return "*", self.parser_config.regex.pattern

    def __str__(self):
        return self.parser_config.regex.pattern

Example #5

0

Show file

File: tokens.py Project: astrojams1/cleanstreets

class Keyword(Token):
    """Token that matches a string only when it stands alone as a keyword,
    i.e. it is not immediately followed by another identifier character.
    Compare with `Literal`:

     - ``Literal("if")`` matches the leading ``'if'`` in ``'ifAndOnlyIf'``.
     - ``Keyword("if")`` does not; it only matches the leading ``'if'`` in
       ``'if x=1'`` or ``'if(y==2)'``.

    Optional constructor arguments besides the keyword string:

     - ``ident_chars`` - characters considered part of an identifier; when
       omitted the engine's ``keyword_chars`` are used
     - ``caseless`` - request case-insensitive matching; the instance is
       then re-classed as `CaselessKeyword`

    For case-insensitive matching, use `CaselessKeyword`.
    """

    __slots__ = []
    Config = append_config(Token, "ident_chars")

    def __init__(self, match, ident_chars=None, caseless=None):
        Token.__init__(self)
        ident_chars = (
            self.engine.keyword_chars
            if ident_chars is None
            else "".join(sorted(set(ident_chars)))
        )

        pattern = regex_caseless(match) if caseless else re.escape(match)

        # the keyword must be followed by end-of-input or a non-identifier char
        non_word = f"($|(?!{regex_range(ident_chars)}))"
        self.set_config(
            ident_chars=ident_chars,
            match=match,
            regex=regex_compile(pattern + non_word),
        )

        self.parser_name = match
        if caseless:
            self.__class__ = CaselessKeyword

    def parseImpl(self, string, start, doActions=True):
        """Return the configured keyword text on a match, else raise ParseException."""
        hit = self.parser_config.regex.match(string, start)
        if not hit:
            raise ParseException(self, start, string)
        keyword = self.parser_config.match
        return ParseResults(self, start, hit.end(), [keyword])

    def _min_length(self):
        keyword = self.parser_config.match
        return len(keyword)

    def __regex__(self):
        compiled = self.parser_config.regex
        return "+", compiled.pattern

Example #6

0

Show file

File: tokens.py Project: astrojams1/cleanstreets

class CloseMatch(Token):
    """
    A variation on `Literal` which matches "close" matches,
    that is, strings with at most 'n' mismatching characters.
    `CloseMatch` takes parameters:

     - ``match_string`` - string to be matched
     - ``maxMismatches`` - (``default=1``) maximum number of
       mismatches allowed to count as a match

    The results from a successful parse will contain the matched text
    from the input string and the following named results:

     - ``mismatches`` - a list of the positions within the
       match_string where mismatches were found
     - ``original`` - the original match_string used to compare
       against the input string

    If ``mismatches`` is an empty list, then the match was an exact
    match.
    """

    __slots__ = []
    Config = append_config(Token, "maxMismatches")

    def __init__(self, match_string, maxMismatches=1):
        super(CloseMatch, self).__init__()
        self.parser_name = match_string
        self.set_config(match=match_string, maxMismatches=maxMismatches)

    def parseImpl(self, string, start, doActions=True):
        """Compare the input at ``start`` with the match string, character by character.

        :return: ParseResults spanning ``start`` to ``start + len(match)``,
          carrying the ``original`` and ``mismatches`` named results
        :raises ParseException: if the input is too short or more than
          ``maxMismatches`` characters differ
        """
        match = self.parser_config.match
        # absolute end of the candidate text; offsets inside the match string
        # are relative and must not be used as input positions
        end = start + len(match)
        if end > len(string):
            raise ParseException(self, start, string)

        max_mismatches = self.parser_config.maxMismatches
        mismatches = []
        for offset, (src, mat) in enumerate(zip(string[start:end], match)):
            if src != mat:
                mismatches.append(offset)
                if len(mismatches) > max_mismatches:
                    raise ParseException(self, start, string)

        results = ParseResults(self, start, end, [string[start:end]])
        results["original"] = match
        results["mismatches"] = mismatches
        return results

Example #7

0

Show file

class Combine(TokenConverter):
    """
    Token converter that joins all matched tokens into one string.
    """

    __slots__ = []
    Config = append_config(TokenConverter, "separator")

    def __init__(self, expr, separator=""):
        # wrap the streamlined form of the expression
        inner = expr.streamline()
        super(Combine, self).__init__(inner)
        self.set_config(separator=separator)
        # the joining itself is delegated to the _combine parse action
        self.parseAction.append(_combine)
        self.streamlined = True

Example #8

0

Show file

File: tokens.py Project: astrojams1/cleanstreets

class WordEnd(_PositionToken):
    r"""Matches if the current position is at the end of a Word, and is
    not followed by any character in a given set of ``wordChars``
    (default= ``printables``). To emulate the ``\b`` behavior of
    regular expressions, use ``WordEnd(alphanums)``. ``WordEnd``
    will also match at the end of the string being parsed.
    """

    __slots__ = []
    Config = append_config(_PositionToken, "word_chars")

    def __init__(self, wordChars=printables):
        super(WordEnd, self).__init__()
        self.engine = PLAIN_ENGINE
        self.set_config(
            word_chars="".join(sorted(set(wordChars))),
            regex=regex_compile(
                f"(?<={Char(wordChars).__regex__()[1]})({(~Char(wordChars)).__regex__()[1]}|$)"
            ),
        )

    def copy(self):
        """Copy the token, re-applying PLAIN_ENGINE to the copy."""
        output = _PositionToken.copy(self)
        output.engine = PLAIN_ENGINE
        return output

    def min_length(self):
        """A position token never consumes input."""
        return 0

    def parseImpl(self, string, start, doActions=True):
        """Succeed with a zero-width result when ``start`` is a word end.

        Inside the text this requires a word character just before ``start``
        and a non-word character at ``start``. A position at or past the end
        of the text always succeeds.

        :raises ParseException: if ``start`` is not a word end
        """
        word_chars = self.parser_config.word_chars
        instrlen = len(string)
        if instrlen > 0 and start < instrlen:
            # start == 0 has no preceding character; without this guard
            # string[start - 1] would wrap around to the last character
            if (
                start == 0
                or string[start] in word_chars
                or string[start - 1] not in word_chars
            ):
                raise ParseException(self, start, string)
        return ParseResults(self, start, start, [])

    def __regex__(self):
        return "+", self.parser_config.regex.pattern

Example #9

0

Show file

class Combine(TokenConverter):
    """
    Token converter that joins all matched tokens into one string.
    """

    Config = append_config(TokenConverter, "separator")

    def __init__(self, expr, separator=""):
        inner = expr.streamline()
        super(Combine, self).__init__(inner)
        self.set_config(separator=separator)

    def parseImpl(self, string, start, doActions=True):
        """Parse the wrapped expression and return its tokens joined by the separator."""
        inner_result = self.expr.parseImpl(string, start, doActions=doActions)
        end = inner_result.end
        joined = inner_result.asString(sep=self.parser_config.separator)
        return ParseResults(self, start, end, [joined])

    def streamline(self):
        """Return self when nothing changes, else a new Combine over the streamlined expr."""
        if self.streamlined:
            return self

        simplified = self.expr.streamline()
        if simplified is not self.expr:
            return Combine(simplified, self.parser_config.separator)
        self.streamlined = True
        return self

    def expecting(self):
        """Map every key the wrapped expression expects to a list holding this parser."""
        expected = OrderedDict()
        for key in self.expr.expecting().keys():
            expected[key] = [self]
        return expected

    def min_length(self):
        inner = self.expr
        return inner.min_length()

    def __regex__(self):
        inner = self.expr
        return inner.__regex__()

Example #10

0

Show file

class SkipTo(ParseEnhancement):
    """Token for skipping over all undefined text until the matched expression is found."""

    __slots__ = []
    Config = append_config(ParseEnhancement, "include", "fail", "ignore")

    def __init__(self, expr, include=False, ignore=None, failOn=None):
        """
        :param expr: target expression marking the end of the data to be skipped
        :param include: if True, the target expression is also parsed
          (the skipped text and target expression are returned as a 2-element list).
        :param ignore: used to define grammars (typically quoted strings and
          comments) that might contain false matches to the target expression
        :param failOn: define expressions that are not allowed to be
          included in the skipped text; scanning stops at the first position
          where this expression matches
        """
        ParseEnhancement.__init__(self, expr)
        self.set_config(include=include,
                        fail=engine.CURRENT.normalize(failOn),
                        ignore=ignore)
        self.parser_name = str(self)

    def min_length(self):
        """The target may match immediately, so nothing has to be skipped."""
        return 0

    def parseImpl(self, string, start, doActions=True):
        """Scan forward from ``start`` until the target (or failOn) expression matches.

        :return: ParseResults holding the skipped text (when non-empty) and,
          if ``include`` is set, the parsed target expression
        :raises ParseException: if the end of ``string`` is reached without
          the target or failOn expression matching
        """
        instrlen = len(string)
        fail = self.parser_config.fail
        ignore = self.parser_config.ignore

        loc = start
        while loc <= instrlen:
            if fail:
                # Stop scanning if the failOn expression matches here. Only a
                # parse failure means "no match"; anything else (including
                # KeyboardInterrupt) must propagate, as in the handlers below.
                try:
                    fail._parse(string, loc)
                except ParseException:
                    pass
                else:
                    before_end = loc
                    break

            if ignore:
                # advance past ignore expressions
                while True:
                    try:
                        loc = ignore._parse(string, loc).end
                    except ParseException:
                        break
            try:
                before_end = loc
                # probe only: parse actions run later, in the include branch
                loc = self.expr._parse(string, loc, doActions=False).end
            except ParseException:
                # no match, advance loc in string
                loc += 1
            else:
                # matched skipto expr, done
                break

        else:
            # ran off the end of the input string without matching skipto expr, fail
            raise ParseException(self, start, string)

        # build up return values
        end = loc
        skiptext = string[start:before_end]
        skip_result = []
        if skiptext:
            skip_result.append(skiptext)

        if self.parser_config.include:
            end_result = self.expr._parse(string, before_end, doActions)
            skip_result.append(end_result)
            return ParseResults(self, start, end, skip_result)
        else:
            return ParseResults(self, start, before_end, skip_result)

Example #11

0

Show file

class MatchAll(ParseExpression):
    """
    Requires all given `ParseExpression` s to be found, but in
    any order. Expressions may be separated by whitespace.

    May be constructed using the ``'&'`` operator.
    """

    __slots__ = []
    Config = append_config(ParseExpression, "min_match", "max_match")

    def __init__(self, exprs):
        """
        :param exprs: The expressions to be matched. A `Many` contributes its
          own configured min/max repetition counts; any other expression
          counts as exactly one required match.
        """
        super(MatchAll, self).__init__(exprs)
        self.set_config(
            min_match=[
                e.parser_config.min_match if isinstance(e, Many) else 1 for e in exprs
            ],
            max_match=[
                e.parser_config.max_match if isinstance(e, Many) else 1 for e in exprs
            ],
        )

    def streamline(self):
        if self.streamlined:
            return self
        return super(MatchAll, self).streamline()

    def _min_length(self):
        # TODO: MAY BE TOO CONSERVATIVE, WE MAY BE ABLE TO PROVE self CAN CONSUME A CHARACTER
        return min(e.min_length() for e in self.exprs)

    def parseImpl(self, string, start, doActions=True):
        """Match the expressions in whatever order the input presents them.

        A first pass discovers the match order; a second pass re-parses in
        that order with ``doActions`` to build the results.

        :raises ParseException: if a required expression is missing or
          matched fewer times than its minimum
        """
        end = start
        matchOrder = []
        todo = list(zip(
            self.exprs, self.parser_config.min_match, self.parser_config.max_match
        ))
        count = [0] * len(self.exprs)

        # pass 1: repeatedly take the first pending expression that consumes input
        while todo:
            for i, (c, (e, mi, ma)) in enumerate(zip(count, todo)):
                try:
                    loc = e._parse(string, end).end
                    if loc == end:
                        # zero-width match: makes no progress, try the next one
                        continue
                    end = loc
                    c2 = count[i] = c + 1
                    if c2 >= ma:
                        # reached its maximum; safe to delete while iterating
                        # because the loop is left immediately below
                        del todo[i]
                        del count[i]
                    matchOrder.append(e)
                    break
                except ParseException:
                    continue
            else:
                # nothing pending could consume more input
                break

        for c, (e, mi, ma) in zip(count, todo):
            if c < mi:
                # same (expr, start, string, msg=...) argument order as every
                # other ParseException raised in this module
                raise ParseException(
                    self,
                    start,
                    string,
                    msg="Missing minimum (%i) more required elements (%s)" % (mi, e),
                )

        found = {id(m) for m in matchOrder}
        missing = [
            e
            for e, mi in zip(self.exprs, self.parser_config.min_match)
            if id(e) not in found and mi > 0
        ]
        if missing:
            missing = ", ".join(text(e) for e in missing)
            raise ParseException(
                self,
                start,
                string,
                msg="Missing one or more required elements (%s)" % missing,
            )

        # add any unmatched Optionals, in case they have default values defined
        matchOrder += [e for e in self.exprs if id(e) not in found]

        # pass 2: re-parse in the discovered order, this time honouring doActions
        results = []
        end = start
        for e in matchOrder:
            result = e._parse(string, end, doActions)
            end = result.end
            results.append(result)

        return ParseResults(self, results[0].start, results[-1].end, results)

    def __str__(self):
        if self.parser_name:
            return self.parser_name

        return "{" + " & ".join(text(e) for e in self.exprs) + "}"

Example #12

0

Show file

class Many(ParseEnhancement):
    """Repeat an expression between ``min_match`` and ``max_match`` times."""

    __slots__ = []
    Config = append_config(ParseEnhancement, "min_match", "max_match", "end")

    def __init__(self,
                 expr,
                 stopOn=None,
                 min_match=0,
                 max_match=MAX_INT,
                 exact=None):
        """
        Match expr some number of times (or until stopOn is reached).

        :param expr: the expression to match
        :param stopOn: the pattern that indicates matching should stop (not
          required in the pattern, just a quick stop)
        :param min_match: minimum matches required for success (-1 is invalid)
        :param max_match: maximum matches allowed for success (-1 is invalid)
        :param exact: when given, overrides both min_match and max_match
        """
        ParseEnhancement.__init__(self, expr)
        if exact is not None:
            min_match = exact
            max_match = exact

        self.set_config(min_match=min_match, max_match=max_match)
        self.stopOn(stopOn)

    def stopOn(self, ender):
        """Store the compiled regex of ``ender`` as the stop pattern; return self."""
        if ender:
            end = self.engine.normalize(ender)
            self.set_config(end=regex_compile(end.__regex__()[1]))
        return self

    def _min_length(self):
        if self.parser_config.min_match == 0:
            return 0
        return self.expr.min_length()

    def parseImpl(self, string, start, doActions=True):
        """Collect consecutive matches of ``expr`` and validate their count.

        :return: ParseResults over the collected matches, or an empty
          zero-width result when nothing matched and ``min_match`` is 0
        :raises ParseException: if fewer than ``min_match`` matches were found
        """
        acc = []
        end = start
        min_match = self.parser_config.min_match
        max_match = self.parser_config.max_match
        stopper = self.parser_config.end
        count = 0
        try:
            while end < len(string) and count < max_match:
                if stopper:
                    end = self.engine.skip(string, end)
                    if stopper.match(string, end):
                        if min_match <= count:
                            break
                        # caught by the handler below; the count checks that
                        # follow decide the error finally reported
                        raise ParseException(self,
                                             end,
                                             string,
                                             msg="found stopper too soon")
                result = self.expr._parse(string, end, doActions)
                end = result.end
                if result:
                    acc.append(result)
                    count += 1
        except ParseException:
            # A failed attempt just ends the repetition; whether the number
            # of matches is acceptable is decided by the checks below.
            pass

        if count:
            if count < min_match or max_match < count:
                raise ParseException(
                    self,
                    acc[0].start,
                    string,
                    msg=(
                        f"Expecting between {min_match} and"
                        f" {max_match} of {self.expr}"),
                )
            return ParseResults(self, acc[0].start, acc[-1].end, acc)

        if not min_match:
            return ParseResults(self, start, start, [])
        raise ParseException(
            self,
            start,
            string,
            msg=f"Expecting at least {min_match} of {self}",
        )

    def streamline(self):
        """Return a simplified equivalent of this parser (possibly self)."""
        if self.streamlined:
            return self
        expr = self.expr.streamline()
        if (self.parser_config.min_match == self.parser_config.max_match
                and not self.is_annotated()):
            # exactly-zero repeats collapse to Empty, exactly-one to the expr
            if self.parser_config.min_match == 0:
                return Empty()
            elif self.parser_config.min_match == 1:
                return expr

        if self.expr is expr:
            self.streamlined = True
            return self
        if expr.is_annotated() or not isinstance(expr, Empty):
            output = self.copy()
            output.expr = expr
            output.streamlined = True
            return output
        return Empty()

    def __regex__(self):
        """Return the precedence marker and a regex with the repetition suffix applied."""
        end = self.parser_config.end.pattern if self.parser_config.end else None
        prec, regex = self.expr.__regex__()
        regex = regex_iso(prec, regex, "*")

        if self.parser_config.max_match == MAX_INT:
            if self.parser_config.min_match == 0:
                suffix = "*"
            elif self.parser_config.min_match == 1:
                suffix = "+"
            else:
                suffix = "{" + text(self.parser_config.min_match) + ",}"
        elif self.parser_config.min_match == self.parser_config.max_match:
            if self.parser_config.min_match == 1:
                suffix = ""
            else:
                suffix = "{" + text(self.parser_config.min_match) + "}"
        else:
            suffix = ("{" + text(self.parser_config.min_match) + "," +
                      text(self.parser_config.max_match) + "}")

        if end:
            return "+", regex + suffix + end
        else:
            return "*", regex + suffix

    def __call__(self, name):
        """Name this parser's results; log an error if ``expr`` already uses ``name``."""
        if not name:
            return self

        inner = self.expr
        if isinstance(inner, ParserElement) and inner.token_name == name:
            Log.error(
                "can not set token name, already set in one of the other"
                " expressions")

        return ParseEnhancement.__call__(self, name)

    def __str__(self):
        if self.parser_name:
            return self.parser_name
        return f"{self.__class__.__name__}:({self.expr})"

Example #13

0

Show file

File: tokens.py Project: astrojams1/cleanstreets

class White(Token):
    """Special matching class for matching whitespace.  Normally,
    whitespace is ignored by mo_parsing grammars.  This class is included
    when some whitespace structures are significant.  Define with
    a string containing the whitespace characters to be matched; default
    is ``" \\t\\r\\n"``.  Also takes optional ``min``,
    ``max``, and ``exact`` arguments, as defined for the
    `Word` class.
    """

    # Display names used to build parser_name. The Unicode entries need the
    # ``\uXXXX`` escape; writing ``"u\00A0"`` yields the letter "u" followed
    # by an octal escape, which never equals the intended character.
    whiteStrs = {
        " ": "<SP>",
        "\t": "<TAB>",
        "\n": "<LF>",
        "\r": "<CR>",
        "\f": "<FF>",
        "\u00A0": "<NBSP>",
        "\u1680": "<OGHAM_SPACE_MARK>",
        "\u180E": "<MONGOLIAN_VOWEL_SEPARATOR>",
        "\u2000": "<EN_QUAD>",
        "\u2001": "<EM_QUAD>",
        "\u2002": "<EN_SPACE>",
        "\u2003": "<EM_SPACE>",
        "\u2004": "<THREE-PER-EM_SPACE>",
        "\u2005": "<FOUR-PER-EM_SPACE>",
        "\u2006": "<SIX-PER-EM_SPACE>",
        "\u2007": "<FIGURE_SPACE>",
        "\u2008": "<PUNCTUATION_SPACE>",
        "\u2009": "<THIN_SPACE>",
        "\u200A": "<HAIR_SPACE>",
        "\u200B": "<ZERO_WIDTH_SPACE>",
        "\u202F": "<NNBSP>",
        "\u205F": "<MMSP>",
        "\u3000": "<IDEOGRAPHIC_SPACE>",
    }

    __slots__ = []
    Config = append_config(Token, "min_len", "max_len", "white_chars")

    def __init__(self, ws=" \t\r\n", min=1, max=0, exact=0):
        """
        :param ws: the whitespace characters this token matches
        :param min: minimum run length required
        :param max: maximum run length consumed; 0 means no limit
        :param exact: when positive, overrides both ``min`` and ``max``
        """
        # initialise under an engine whose whitespace set excludes the
        # characters this token has to match itself
        with Engine(white="".join(c for c in self.engine.white_chars
                                  if c not in ws)) as e:
            super(White, self).__init__()
            self.set_config(lock_engine=e)
        white_chars = "".join(sorted(set(ws)))
        self.parser_name = "|".join(White.whiteStrs[c] for c in white_chars)

        max = max if max > 0 else MAX_INT
        if exact > 0:
            max = exact
            min = exact
        self.set_config(min_len=min, max_len=max, white_chars=white_chars)

    def parseImpl(self, string, start, doActions=True):
        """Consume a run of the configured whitespace characters at ``start``.

        :raises ParseException: if ``start`` is past the end of ``string``,
          the character there is not configured whitespace, or the run is
          shorter than ``min_len``
        """
        white_chars = self.parser_config.white_chars
        # bounds check first: indexing past the end would raise IndexError
        # instead of reporting an ordinary parse failure
        if start >= len(string) or string[start] not in white_chars:
            raise ParseException(self, start, string)
        end = start + 1
        maxloc = min(start + self.parser_config.max_len, len(string))
        while end < maxloc and string[end] in white_chars:
            end += 1

        if end - start < self.parser_config.min_len:
            raise ParseException(self, end, string)

        # NOTE(review): sibling tokens pass a one-element list here; this one
        # passes the bare string -- confirm that ParseResults accepts both.
        return ParseResults(self, start, end, string[start:end])

Example #14

0

Show file

File: tokens.py Project: astrojams1/cleanstreets

class CharsNotIn(Token):
    """Token for matching words composed of characters *not* in a given
    set (will include whitespace in matched characters if not listed in
    the provided exclusion set - see example). Defined with string
    containing all disallowed characters, and an optional minimum,
    maximum, and/or exact length.  The default value for ``min`` is
    1 (a minimum value < 1 is not valid); the default values for
    ``max`` and ``exact`` are 0, meaning no maximum or exact
    length restriction.
    """

    __slots__ = []
    Config = append_config(Token, "min_len", "max_len", "not_chars")

    def __init__(self, notChars, min=1, max=0, exact=0):
        """
        :param notChars: the characters that must NOT appear in the match
        :param min: minimum match length, at least 1
        :param max: maximum match length; 0 means no limit
        :param exact: when non-zero, overrides both ``min`` and ``max``
        :raises ValueError: if ``min`` is less than 1
        """
        Token.__init__(self)
        not_chars = "".join(sorted(set(notChars)))

        if min < 1:
            raise ValueError(
                "cannot specify a minimum length < 1; use "
                "Optional(CharsNotIn()) if zero-length char group is permitted"
            )

        max = max if max > 0 else MAX_INT
        if exact:
            min = exact
            max = exact

        # Negate the character class. The branch is chosen on the
        # de-duplicated characters so that input like "aa" takes the
        # single-character path. The slicing assumes regex_range returns a
        # bare character for one char and a "[...]" class otherwise, as the
        # two branches imply -- TODO confirm against regex_range.
        if len(not_chars) == 1:
            regex = "[^" + regex_range(not_chars) + "]"
        else:
            regex = "[^" + regex_range(not_chars)[1:]

        # regex repetition bounds are comma separated: {m,} and {m,n}
        if max == MAX_INT:
            if min == 1:
                suffix = "+"
            else:
                suffix = "{" + str(min) + ",}"
        elif min == 1 and max == 1:
            suffix = ""
        else:
            suffix = "{" + str(min) + "," + str(max) + "}"

        self.set_config(
            regex=regex_compile(regex + suffix),
            min_len=min,
            max_len=max,
            not_chars=not_chars,
        )
        self.parser_name = text(self)

    def parseImpl(self, string, start, doActions=True):
        """Match the compiled pattern at ``start``; raise ParseException on failure."""
        found = self.parser_config.regex.match(string, start)
        if found:
            return ParseResults(self, start, found.end(), [found.group()])

        raise ParseException(self, start, string)

    def min_length(self):
        return self.parser_config.min_len

    def __regex__(self):
        return "*", self.parser_config.regex.pattern

    def __str__(self):
        return self.parser_config.regex.pattern

Example #15

0

Show file

File: tokens.py Project: astrojams1/cleanstreets

class QuotedString(Token):
    r"""
    Token for matching strings that are delimited by quoting characters.

    Defined with the following parameters:

        - quoteChar - string of one or more characters defining the
          quote delimiting string
        - escChar - character to escape quotes, typically backslash
          (default= ``None``)
        - escQuote - special quote sequence to escape an embedded quote
          string (such as SQL's ``""`` to escape an embedded ``"``)
          (default= ``None``)
        - multiline - boolean indicating whether quotes can span
          multiple lines (default= ``False``)
        - unquoteResults - boolean indicating whether the matched text
          should be unquoted (default= ``True``)
        - endQuoteChar - string of one or more characters defining the
          end of the quote delimited string (default= ``None``  => same as
          quoteChar)
        - convertWhitespaceEscapes - convert escaped whitespace
          (``'\t'``, ``'\n'``, etc.) to actual whitespace
          (default= ``True``)

    Raises ``SyntaxError`` (after emitting a ``SyntaxWarning``) when
    ``quoteChar`` or ``endQuoteChar`` is empty once whitespace is stripped.
    """
    __slots__ = []
    Config = append_config(
        Token,
        "quote_char",
        "end_quote_char",
        "esc_char",
        "esc_quote",
        "multiline",
        "unquoteResults",
        "convertWhitespaceEscapes",
        "escCharReplacePattern",
    )

    def __init__(
        self,
        quoteChar,
        escChar=None,
        escQuote=None,
        multiline=False,
        unquoteResults=True,
        endQuoteChar=None,
        convertWhitespaceEscapes=True,
    ):
        super(QuotedString, self).__init__()

        # remove white space from quote chars - wont work anyway
        quoteChar = quoteChar.strip()
        if not quoteChar:
            warnings.warn("quoteChar cannot be the empty string",
                          SyntaxWarning,
                          stacklevel=2)
            raise SyntaxError()

        # the closing delimiter defaults to the opening one
        if endQuoteChar is None:
            endQuoteChar = quoteChar
        else:
            endQuoteChar = endQuoteChar.strip()
            if not endQuoteChar:
                warnings.warn(
                    "endQuoteChar cannot be the empty string",
                    SyntaxWarning,
                    stacklevel=2,
                )
                raise SyntaxError()

        # store the options first: the grammar below reads them back from
        # self.parser_config
        self.set_config(
            quote_char=quoteChar,
            end_quote_char=endQuoteChar,
            esc_char=escChar,
            esc_quote=escQuote,
            unquoteResults=unquoteResults,
            convertWhitespaceEscapes=convertWhitespaceEscapes,
        )
        # TODO: FIX THIS MESS. WE SHOULD BE ABLE TO CONSTRUCT REGEX FROM ParserElements
        # "included": sequences accepted inside the quotes as a unit;
        # "excluded": what a single ordinary body character must not start with
        included = Empty()
        excluded = Literal(self.parser_config.end_quote_char)

        if not multiline:
            # single-line strings may not contain CR or LF
            excluded |= Char("\r\n")
        if escQuote:
            included |= Literal(escQuote)
        if escChar:
            excluded |= Literal(self.parser_config.esc_char)
            included = included | escChar + Char(printables)
            # pattern used by parseImpl to drop the escape character and keep
            # the character that follows it
            self.set_config(
                escCharReplacePattern=re.escape(self.parser_config.esc_char) +
                "(.)")

        # open quote, any number of (ordinary char | included sequence),
        # close quote -- converted to a single regex; the precedence value
        # is not used
        prec, pattern = (
            Literal(quoteChar) + ((~excluded + AnyChar()) | included)[0:] +
            Literal(self.parser_config.end_quote_char)).__regex__()

        self.set_config(multiline=multiline, regex=regex_compile(pattern))

        self.parser_name = text(self)

    def parseImpl(self, string, start, doActions=True):
        """Match a quoted string at ``start``.

        Returns a one-token ParseResults: the raw matched text, or the
        unquoted and unescaped text when ``unquoteResults`` is set.
        Raises ParseException when the regex does not match at ``start``.
        """
        found = self.parser_config.regex.match(string, start)
        if not found:
            raise ParseException(self, start, string)

        end = found.end()
        ret = found.group()

        if self.parser_config.unquoteResults:

            # strip off quotes
            ret = ret[len(self.parser_config.quote_char
                          ):-len(self.parser_config.end_quote_char)]

            if isinstance(ret, text):
                # replace escaped whitespace
                if "\\" in ret and self.parser_config.convertWhitespaceEscapes:
                    ws_map = {
                        r"\t": "\t",
                        r"\n": "\n",
                        r"\f": "\f",
                        r"\r": "\r",
                    }
                    for wslit, wschar in ws_map.items():
                        ret = ret.replace(wslit, wschar)

                # replace escaped characters
                if self.parser_config.esc_char:
                    ret = re.sub(self.parser_config.escCharReplacePattern,
                                 r"\g<1>", ret)

                # replace escaped quotes
                if self.parser_config.esc_quote:
                    ret = ret.replace(self.parser_config.esc_quote,
                                      self.parser_config.end_quote_char)

        return ParseResults(self, start, end, [ret])

    def min_length(self):
        # fixed value: one character for each of the two delimiters
        return 2

    def __str__(self):
        # prefer the inherited rendering; fall back to a description built
        # from the two delimiters if it raises
        try:
            return super(QuotedString, self).__str__()
        except Exception:
            pass

        return "quoted string, starting with %s ending with %s" % (
            self.parser_config.quote_char,
            self.parser_config.end_quote_char,
        )

Example #16

0

Show file

class And(ParseExpression):
    """
    Requires all given `ParseExpression` s to be found in the given order.
    Expressions may be separated by whitespace.
    May be constructed using the ``'+'`` operator.
    May also be constructed using the ``'-'`` operator, which will
    suppress backtracking.
    """

    __slots__ = []
    # extends the parent config with an "engine" entry (set in __init__)
    Config = append_config(ParseExpression, "engine")

    class SyntaxErrorGuard(Empty):
        """Marker element named "-".

        ``And.parseImpl`` does not parse this element; once it has been
        passed, a ``ParseException`` from a later element is re-raised as a
        ``ParseSyntaxException``.
        """

        def __init__(self, *args, **kwargs):
            # constructed inside an ``Engine("")`` context
            # NOTE(review): presumably an engine that skips no whitespace -- confirm
            with Engine(""):
                super(And.SyntaxErrorGuard, self).__init__(*args, **kwargs)
                self.parser_name = "-"

    def __init__(self, exprs, engine):
        """Build a sequence from ``exprs`` and record ``engine`` in the config.

        Every ``Ellipsis`` found in ``exprs`` is replaced by a ``SkipTo`` of
        the element that follows it, with results name ``"_skipped"``.
        The replacement is written back into the caller's list
        (``exprs[:] = tmp``).

        Raises ``Exception`` when ``Ellipsis`` is the last element.
        """
        if exprs and Ellipsis in exprs:
            tmp = []
            for i, expr in enumerate(exprs):
                if expr is Ellipsis:
                    if i < len(exprs) - 1:
                        # run the next element through ``+`` and take the last
                        # member of the result
                        # NOTE(review): presumably to normalize plain values
                        # into parser elements -- confirm
                        skipto_arg = (Empty() + exprs[i + 1]).exprs[-1]
                        tmp.append(SkipTo(skipto_arg)("_skipped"))
                    else:
                        raise Exception(
                            "cannot construct And with sequence ending in ..."
                        )
                else:
                    tmp.append(expr)
            exprs[:] = tmp
        super(And, self).__init__(exprs)
        self.set_config(engine=engine)

    def streamline(self):
        """Return a simplified, equivalent parser.

        - already streamlined: returns ``self``
        - no sub-expressions: returns ``Empty(self.parser_name)``
        - exactly one sub-expression and this ``And`` is not annotated:
          returns that sub-expression, streamlined
        - otherwise each sub-expression is streamlined, and un-annotated
          nested ``And`` s that use the same engine object are flattened
          into this one

        Returns ``self`` when nothing changed, else a copy holding the new
        sub-expression list.
        """
        if self.streamlined:
            return self
        if not self.exprs:
            return Empty(self.parser_name)
        if len(self.exprs) == 1 and not self.is_annotated():
            return self.exprs[0].streamline()

        # collapse any _PendingSkip's
        same = True
        exprs = self.exprs
        if any(
            isinstance(e, ParseExpression)
            and e.exprs
            and isinstance(e.exprs[-1], _PendingSkip)
            for e in exprs[:-1]
        ):
            same = False
            for i, e in enumerate(exprs[:-1]):
                if (
                    isinstance(e, ParseExpression)
                    and e.exprs
                    and isinstance(e.exprs[-1], _PendingSkip)
                ):
                    # attach the following element to the trailing pending
                    # skip, then blank out its slot; both lists are mutated
                    # in place
                    ee = e.exprs[-1] + exprs[i + 1]
                    e.exprs[-1] = ee
                    e.streamlined = False
                    exprs[i + 1] = None

        # streamline INDIVIDUAL EXPRESSIONS
        acc = []
        for e in exprs:
            if e is None:
                # slot consumed by the pending-skip collapse above
                continue
            f = e.streamline()
            same = same and f is e
            if f.is_annotated():
                # annotated expressions are kept whole, never flattened
                acc.append(f)
            elif isinstance(f, And) and f.parser_config.engine is self.parser_config.engine:
                # nested And using the very same engine object: splice its
                # members into this sequence
                same = False
                acc.extend(f.exprs)
            else:
                acc.append(f)

        if same:
            self.streamlined = True
            return self

        output = self.copy()
        output.exprs = acc
        output.streamlined = True
        return output

    def expecting(self):
        """Return a mapping of expected keys to ``[self]``.

        Keys are collected from the leading sub-expressions, stopping after
        the first one whose ``min_length()`` is non-zero. Returns ``{}``
        when there are no sub-expressions or when any examined
        sub-expression reports nothing expected.
        """
        if not self.exprs:
            return {}

        acc = OrderedDict()
        for e in self.exprs:
            expect = e.expecting()
            if not expect:
                return {}
            for k in expect.keys():
                acc[k] = [self]
            if e.min_length():
                break
        return acc

    def _min_length(self):
        """Return the sum of the sub-expressions' ``min_length()`` values."""
        return sum(e.min_length() for e in self.exprs)

    def parseImpl(self, string, start, doActions=True):
        """Parse every sub-expression in order, starting at ``start``.

        Returns a ``ParseResults`` spanning ``start`` to the end of the last
        matched element. A ``ParseException`` from an element is re-raised
        unchanged, or as a ``ParseSyntaxException`` when a
        ``SyntaxErrorGuard`` has already been passed.
        """
        # every element is parsed with the caller's ``doActions`` value
        encountered_syntax_error = False
        end = index = start
        acc = []
        for i, expr in enumerate(self.exprs):
            if end > index:
                # the previous element consumed input: let the engine skip
                # ahead from where it ended before parsing the next element
                index = self.parser_config.engine.skip(string, end)
            if isinstance(expr, And.SyntaxErrorGuard):
                # marker only: failures after this point are syntax errors
                encountered_syntax_error = True
                continue
            try:
                result = expr._parse(string, index, doActions)
                end = result.end
                acc.append(result)
            except ParseException as pe:
                if encountered_syntax_error:
                    raise ParseSyntaxException(pe.expr, pe.loc, pe.string)
                else:
                    raise pe

        return ParseResults(self, start, end, acc)

    def __add__(self, other):
        """Implement ``self + other``.

        ``self + ...`` returns a ``_PendingSkip`` wrapping ``self``.
        Otherwise ``other`` is normalized by the current engine and a
        streamlined ``And`` of the two is returned.
        """
        if other is Ellipsis:
            return _PendingSkip(self)

        return And([self, engine.CURRENT.normalize(other)], engine.CURRENT).streamline()

    def checkRecursion(self, seen=empty_tuple):
        """Run ``checkRecursion`` on the leading sub-expressions.

        ``self`` is appended to ``seen`` for the nested calls. The walk
        stops after the first sub-expression with a non-zero
        ``min_length()``.
        """
        subRecCheckList = seen + (self,)
        for e in self.exprs:
            e.checkRecursion(subRecCheckList)
            if e.min_length():
                return

    def __regex__(self):
        """Return ``("+", pattern)``.

        ``pattern`` is the concatenation of each sub-expression's regex
        after it has been passed through ``regex_iso(..., "+")``.
        """
        return "+", "".join(regex_iso(*e.__regex__(), "+") for e in self.exprs)

    def __str__(self):
        """Return the parser name if set, else ``{a + b + ...}``."""
        if self.parser_name:
            return self.parser_name

        return "{" + " + ".join(text(e) for e in self.exprs) + "}"

Example #17

0

Show file

File: tokens.py Project: astrojams1/cleanstreets

class Word(Token):
    """Token for matching words composed of allowed character sets.

    A ``Word`` is defined by a string of allowed initial characters, an
    optional string of allowed body characters (defaults to the initial
    character set when omitted), and optional minimum, maximum and/or
    exact lengths.  ``min`` defaults to 1 (a minimum < 1 is rejected with
    ``ValueError``); ``max`` defaults to ``None`` (no maximum) and
    ``exact`` defaults to 0 (no exact length).  A non-zero ``exact``
    overrides both ``min`` and ``max``.  The optional ``excludeChars``
    parameter lists characters to remove from the given character sets;
    useful to define a word of all printables except for one or two
    characters, for instance.  ``asKeyword`` wraps the generated pattern
    in ``\\b`` word-boundary assertions.

    `srange` is useful for defining custom character set strings
    for defining ``Word`` expressions, using range notation from
    regular expression character sets.

    A common mistake is to use `Word` to match a specific literal
    string, as in ``Word("Address")``. Remember that `Word`
    uses the string argument to define *sets* of matchable characters.
    This expression would match "Add", "AAA", "dAred", or any other word
    made up of the characters 'A', 'd', 'r', 'e', and 's'. To match an
    exact literal string, use `Literal` or `Keyword`.

    mo_parsing includes helper strings for building Words:

     - `alphas`
     - `nums`
     - `alphanums`
     - `hexnums`
     - `alphas8bit` (alphabetic characters in ASCII range 128-255
       - accented, tilded, umlauted, etc.)
     - `punc8bit` (non-alphabetic characters in ASCII range
       128-255 - currency, symbols, superscripts, diacriticals, etc.)
     - `printables` (any non-whitespace character)

    """

    __slots__ = []
    Config = append_config(Token, "min")

    def __init__(
        self,
        init_chars,
        body_chars=None,
        min=1,
        max=None,
        exact=0,
        asKeyword=False,
        excludeChars=None,
    ):
        Token.__init__(self)

        if body_chars is None:
            body_chars = init_chars
        if exact:
            min = max = exact

        if min < 1:
            raise ValueError(
                "cannot specify a minimum length < 1; use Optional(Word()) if"
                " zero-length word is permitted")

        # the word is matched with a regex derived from Char expressions
        leading = Char(init_chars, excludeChars=excludeChars)
        if body_chars == init_chars:
            # a single character class repeated min..max times
            _, regexp = leading[min:max].__regex__()
        else:
            # one leading character followed by the body repetition
            body = Char(body_chars, excludeChars=excludeChars)
            if max is None or max == MAX_INT:
                trailing = body[min - 1:]
            else:
                trailing = body[min - 1:max - 1]
            _, regexp = (leading + trailing).__regex__()

        if asKeyword:
            regexp = rf"\b{regexp}\b"

        self.set_config(regex=regex_compile(regexp), min=min)

    def parseImpl(self, string, start, doActions=True):
        """Match the word pattern at ``start``.

        Returns a ``ParseResults`` holding the matched text, or raises
        ``ParseException`` when the pattern does not match.
        """
        found = self.parser_config.regex.match(string, start)
        if not found:
            raise ParseException(self, start, string)

        return ParseResults(self, start, found.end(), [found.group()])

    def min_length(self):
        """Return the configured minimum length."""
        return self.parser_config.min

    def __regex__(self):
        """Return ``("+", pattern)`` for the compiled word pattern."""
        return "+", self.parser_config.regex.pattern

    def __str__(self):
        """Return the parser name if set, else ``W:(<pattern>)``."""
        return self.parser_name or f"W:({self.parser_config.regex.pattern})"

Example #18

0

Show file

class PrecededBy(ParseEnhancement):
    """Lookbehind matching of the given parse expression.
    ``PrecededBy`` does not advance the parsing position within the
    input string, it only verifies that the specified parse expression
    matches prior to the current position.  The returned result starts
    and ends at the current position; the lookbehind match is carried
    inside it re-typed as an ``Annotation``, so that any results names
    defined on the given expression are preserved.

    Parameters:

     - expr - expression that must match prior to the current parse
       location
     - retreat - (default= ``None``) accepted for signature
       compatibility

    NOTE(review): the ``retreat`` argument is never read by ``__init__``;
    the configured retreat is always derived from ``expr``. For
    ``Literal``, ``Keyword``, ``Char`` and position tokens the lookbehind
    distance is exact. For every other expression the configured retreat
    (``expr.min_length()``) is the *smallest* offset tried, and larger
    offsets are tried up to the start of the string. Confirm whether the
    argument should be honoured.
    """

    __slots__ = []
    Config = append_config(ParseEnhancement, "retreat", "exact")

    def __init__(self, expr, retreat=None):
        """Wrap ``expr`` (with whitespace skipping disabled) as a lookbehind.

        Stores two config entries: ``retreat`` (a distance to look back)
        and ``exact`` (whether that distance is the only one to try).
        """
        super(PrecededBy, self).__init__(expr)
        expr = self.expr = self.expr.leaveWhitespace()

        if isinstance(expr, (Literal, Keyword, Char)):
            self.set_config(retreat=expr.min_length(), exact=True)
        elif isinstance(expr, (Word, CharsNotIn)):
            self.set_config(retreat=expr.min_length(), exact=False)
        elif isinstance(expr, _PositionToken):
            # position tokens are checked at the current position
            self.set_config(retreat=0, exact=True)
        else:
            self.set_config(retreat=expr.min_length(), exact=False)

    def parseImpl(self, string, start=0, doActions=True):
        """Verify that the wrapped expression matches just before ``start``.

        Returns a zero-width ``ParseResults`` at ``start`` that contains
        the lookbehind match.

        Raises ``ParseException`` when no lookbehind position matches, or
        when an exact lookbehind would begin before the start of the string.
        """
        if self.parser_config.exact:
            loc = start - self.parser_config.retreat
            if loc < 0:
                raise ParseException(self, start, string)
            ret = self.expr._parse(string, loc)
        else:
            # try successively longer lookbehinds, from the configured
            # retreat up to the whole prefix; the match must end exactly at
            # ``start``, which is enforced by parsing only ``string[:start]``
            # and requiring StringEnd() after the expression
            test_expr = self.expr + StringEnd()
            instring_slice = string[:start]
            last_cause = ParseException(self, start, string)

            with self.engine.backup():
                for offset in range(self.parser_config.retreat, start + 1):
                    try:
                        ret = test_expr._parse(instring_slice, start - offset)
                        break
                    except ParseException as cause:
                        last_cause = cause
                else:
                    # no offset matched (or the range was empty)
                    raise last_cause

        # re-type the lookbehind result so it travels as an annotation of
        # the zero-width result returned below
        ret.__class__ = Annotation
        return ParseResults(self, start, start, [ret])

    def __regex__(self):
        """Return ``("*", "(?<=...)")`` for exact lookbehinds.

        Raises ``NotImplementedError`` for non-exact lookbehinds.
        """
        if self.parser_config.exact:
            return "*", f"(?<={self.expr.__regex__()[1]})"
        # ``NotImplemented`` is a non-callable constant; the exception type
        # is ``NotImplementedError``
        raise NotImplementedError(
            "PrecededBy has a regex form only when the lookbehind length is exact"
        )

Example #19

0

Show file

File: tokens.py Project: astrojams1/cleanstreets

class Regex(Token):
    r"""Token that matches text using a regular expression.

    The expression is given either as a pattern string in the syntax of
    the stdlib Python `re module <https://docs.python.org/3/library/re.html>`_
    or as an already compiled pattern object. Named groups (defined using
    ``(?P<name>...)``) are preserved as named parse results.
    """
    __slots__ = []
    Config = append_config(Token, "flags")

    def __new__(cls, pattern, flags=0, asGroupList=False, asMatch=False):
        # choose the concrete class from the requested result style;
        # ``asGroupList`` takes priority over ``asMatch``
        if asGroupList:
            concrete = _RegExAsGroup
        elif asMatch:
            concrete = _RegExAsMatch
        else:
            concrete = cls
        return object.__new__(concrete)

    def __init__(self, pattern, flags=0, asGroupList=False, asMatch=False):
        """Configure the token from ``pattern``.

        A string ``pattern`` is compiled with ``re.compile(pattern, flags)``;
        see the Python
        `re module <https://docs.python.org/3/library/re.html>`_ for the
        accepted patterns and flags. A compiled pattern object is stored
        as given. An empty pattern string emits a ``SyntaxWarning``; an
        invalid pattern or any other argument type is reported through
        ``Log.error``.
        """
        Token.__init__(self)

        if isinstance(pattern, regex_type):
            self.set_config(flags=flags, regex=pattern)
        elif not isinstance(pattern, text):
            Log.error(
                "Regex may only be constructed with a string or a compiled RE object"
            )
        else:
            if not pattern:
                warnings.warn(
                    "null string passed to Regex; use Empty() instead",
                    SyntaxWarning,
                    stacklevel=2,
                )

            try:
                compiled = re.compile(pattern, flags)
                self.set_config(flags=flags, regex=compiled)
            except sre_constants.error as cause:
                Log.error(
                    "invalid pattern {{pattern}} passed to Regex",
                    pattern=pattern,
                    cause=cause,
                )

        self.parser_name = text(self)

    def parseImpl(self, string, start, doActions=True):
        """Match the pattern at ``start``.

        Returns a ``ParseResults`` holding the whole match, with every
        named group also stored under its name. Raises ``ParseException``
        when the pattern does not match.
        """
        found = self.parser_config.regex.match(string, start)
        if not found:
            raise ParseException(self, start, string)

        ret = ParseResults(self, start, found.end(), [found.group()])
        for name, value in found.groupdict().items():
            ret[name] = value
        return ret

    def min_length(self):
        """Return 0 as the minimum match length."""
        return 0

    def __regex__(self):
        """Return ``("|", pattern)`` for the configured pattern."""
        return "|", self.parser_config.regex.pattern

    def __str__(self):
        """Return the pattern text."""
        return self.parser_config.regex.pattern

    def sub(self, repl):
        r"""
        Attach a parse action that rewrites the first parsed token as if by
        `re.sub(expr, repl, string) <https://docs.python.org/3/library/re.html#re.sub>`_,
        and return the result of ``addParseAction``.

        Example::

            make_html = Regex(r"(\w+):(.*?):").sub(r"<\1>\2</\1>")
            print(make_html.transformString("h1:main title:"))
            # prints "<h1>main title</h1>"
        """
        def pa(tokens):
            matched = tokens[0]
            return self.parser_config.regex.sub(repl, matched)

        return self.addParseAction(pa)