Ejemplo n.º 1
0
    def __regex__(self):
        """
        Build the regex fragment that skips whitespace and ignored expressions.

        Returns a 2-tuple ``(tag, pattern)``. The tuple has the same shape as
        the values this method unpacks from each ignored expression's
        ``__regex__()`` into ``regex_iso``; the tag is ``"*"`` when only
        whitespace is skipped and ``"+"`` when ignored expressions are
        involved. NOTE(review): the exact meaning of the tag is defined by
        ``regex_iso`` -- confirm there.
        """
        white = regex_range(self.white_chars)
        # With no whitespace characters ``white`` is empty; appending "*" to
        # nothing would leave a dangling quantifier, which is a regex syntax
        # error. Only emit the repetition when there is something to repeat.
        skip_white = white + "*" if white else ""

        if not self.ignore_list:
            return "*", skip_white

        ignored = "|".join(
            regex_iso(*i.__regex__(), "|") for i in self.ignore_list)
        # Any number of (whitespace, ignored-expression) pairs, then trailing
        # whitespace.
        return "+", f"(?:{skip_white}(?:{ignored}))*{skip_white}"
Ejemplo n.º 2
0
    def __init__(self, notChars, min=1, max=0, exact=0):
        """
        Match a run of characters, none of which appear in ``notChars``.

        Parameters:

         - notChars - the characters that must NOT appear in the match
         - min - minimum run length (default 1); must be at least 1
         - max - maximum run length; 0 (default) or less means unbounded
         - exact - if non-zero, overrides both ``min`` and ``max``

        Raises ``ValueError`` when ``min`` is less than 1.
        """
        Token.__init__(self)
        not_chars = "".join(sorted(set(notChars)))

        if min < 1:
            raise ValueError(
                "cannot specify a minimum length < 1; use "
                "Optional(CharsNotIn()) if zero-length char group is permitted"
            )

        max = max if max > 0 else MAX_INT
        if exact:
            min = exact
            max = exact

        # NOTE(review): this assumes regex_range() returns a bare (escaped)
        # character for a single char and a bracketed "[...]" class
        # otherwise, so dropping its first character and prefixing "[^"
        # negates the class -- confirm against regex_range.
        if len(notChars) == 1:
            regex = "[^" + regex_range(notChars) + "]"
        else:
            regex = "[^" + regex_range(notChars)[1:]

        # Regex repetition bounds are written "{m,n}" / "{m,}"; any other
        # separator makes the braces match as literal text.
        if not max or max == MAX_INT:
            if min == 0:
                suffix = "*"
            elif min == 1:
                suffix = "+"
            else:
                suffix = "{" + str(min) + ",}"
        elif min == 1 and max == 1:
            suffix = ""
        else:
            suffix = "{" + str(min) + "," + str(max) + "}"

        self.set_config(
            regex=regex_compile(regex + suffix),
            min_len=min,
            max_len=max,
            not_chars=not_chars,
        )
        self.parser_name = text(self)
Ejemplo n.º 3
0
 def __init__(self, charset, asKeyword=False, excludeChars=None):
     """
     Represent one character in a given charset

     Parameters:

      - charset - the characters that are allowed to match
      - asKeyword - (default= ``False``) - when true, the character class
        is wrapped in ``\\b`` word-boundary assertions
      - excludeChars - (default= ``None``) - characters to remove from
        ``charset`` before building the pattern
     """
     Token.__init__(self)
     if excludeChars:
         charset = set(charset) - set(excludeChars)
     regex = regex_range(charset)
     if asKeyword:
         # Wrap the character-class pattern itself, not the parser object.
         regex = r"\b%s\b" % regex
     self.set_config(
         regex=regex_compile(regex),
         charset="".join(sorted(set(charset))),
     )
Ejemplo n.º 4
0
    def __init__(self, match, ident_chars=None, caseless=None):
        """
        Match the literal ``match`` only when it is not immediately followed
        by an identifier character.

        Parameters:

         - match - the keyword text
         - ident_chars - characters that may not follow the keyword; when
           ``None`` the engine's ``keyword_chars`` are used
         - caseless - when true, match without regard to case and switch the
           instance's class to ``CaselessKeyword``
        """
        Token.__init__(self)
        if ident_chars is not None:
            # Normalise to a sorted string of unique characters.
            ident_chars = "".join(sorted(set(ident_chars)))
        else:
            ident_chars = self.engine.keyword_chars

        head = regex_caseless(match) if caseless else re.escape(match)
        # Succeeds at end of input, or where the next character is not one
        # of the identifier characters.
        tail = "($|(?!" + regex_range(ident_chars) + "))"

        self.set_config(
            ident_chars=ident_chars,
            match=match,
            regex=regex_compile(head + tail),
        )

        self.parser_name = match
        if caseless:
            self.__class__ = CaselessKeyword
Ejemplo n.º 5
0
 def test_regex_range(self):
     """
     For every code point in 9..3999, the pattern that regex_range()
     builds for that single character must match the character itself.
     """
     for code_point in range(9, 4000):
         char = unichr(code_point)
         matched = re.match(regex_range(char), char)
         self.assertTrue(bool(matched))
Ejemplo n.º 6
0
def oneOf(strs, caseless=False, asKeyword=False):
    """Build a parser that matches any one of a set of literal alternatives.

    Unless ``asKeyword`` is set, the alternatives are de-duplicated and
    reordered so that a shorter choice never masks a longer one that starts
    with it, regardless of the input order.

    Parameters:

     - strs - a string of space-delimited literals, or a collection of
       string literals
     - caseless - (default= ``False``) - treat all literals as caseless
     - asKeyword - (default=``False``) - enforce Keyword-style matching on the
       generated expressions

    Returns ``NoMatch()`` when there are no alternatives, a ``MatchFirst`` of
    per-symbol expressions when ``caseless`` or ``asKeyword`` is set, and
    otherwise a single ``Regex`` alternation.
    """
    if isinstance(caseless, text):
        # A string in the second position usually means the choices were
        # passed as separate positional arguments.
        warnings.warn(
            "More than one string argument passed to oneOf, pass "
            "choices as a list or space-delimited string",
            stacklevel=2,
        )

    if caseless:
        def same(a, b):
            return a.upper() == b.upper()

        def hides(a, b):
            return b.upper().startswith(a.upper())

        element_class = CaselessKeyword if asKeyword else CaselessLiteral
    else:
        def same(a, b):
            return a == b

        def hides(a, b):
            return b.startswith(a)

        element_class = Keyword if asKeyword else Literal

    if isinstance(strs, text):
        symbols = strs.split()
    elif isinstance(strs, Iterable):
        symbols = list(strs)
    else:
        symbols = []
        warnings.warn(
            "Invalid argument to oneOf, expected string or iterable",
            SyntaxWarning,
            stacklevel=2,
        )
    if not symbols:
        return NoMatch()

    if not asKeyword:
        # Drop duplicates and hoist any later symbol that the current one
        # would mask in front of it; the position is re-examined after
        # every change and only advances once it is stable.
        pos = 0
        while pos < len(symbols) - 1:
            current = symbols[pos]
            for idx, candidate in enumerate(symbols[pos + 1:], pos + 1):
                if same(candidate, current):
                    del symbols[idx]
                    break
                if hides(current, candidate):
                    symbols.insert(pos, symbols.pop(idx))
                    break
            else:
                pos += 1

    if caseless or asKeyword:
        choices = MatchFirst(element_class(sym) for sym in symbols)
        return choices.set_parser_name(" | ".join(symbols)).streamline()

    # Plain case-sensitive literals: fold everything into one regex.
    single_chars = []
    longer = []
    for sym in symbols:
        if len(sym) == 1:
            single_chars.append(sym)
        else:
            longer.append(sym)
    # Longest first; the sort is stable, so equal lengths keep their order.
    longer.sort(key=len, reverse=True)

    alternatives = [re.escape(sym) for sym in longer]
    if single_chars:
        # All one-character choices collapse into a single character class.
        alternatives.append(regex_range("".join(single_chars)))
    regex = "|".join(alternatives)

    return Regex(regex).set_parser_name(" | ".join(symbols)).streamline()