Example #1
    def __init__(self, lexicon, flags=0):
        """
        Create a parser from a given lexicon.

        :param lexicon:     Lexicon in the form of a list; each entry has the form:
                            (<regex>, lambda scanner, token: Tok(<kind>, <value>))
        :param flags:       Extra flags for parsing.
        """

        import re
        import sre_parse
        import sre_compile
        from sre_constants import BRANCH, SUBPATTERN

        self.lexicon = lexicon

        # combine phrases into a compound pattern
        p = []
        s = sre_parse.Pattern()
        s.flags = flags
        for phrase, action in lexicon:
            p.append(
                sre_parse.SubPattern(s, [
                    (SUBPATTERN, (len(p) + 1, sre_parse.parse(phrase, flags))),
                ]))

        s.groups = len(p) + 1
        p = sre_parse.SubPattern(s, [(BRANCH, (None, p))])
        self.scanner = sre_compile.compile(p, re.MULTILINE)
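A minimal usage sketch for the class this __init__ belongs to. The class name Scanner is an assumption (the code mirrors the classic re.Scanner recipe from the standard library), as is reading results off via lastindex:

lexicon = [
    (r'\d+',    lambda scanner, token: ('NUMBER', token)),
    (r'[a-z]+', lambda scanner, token: ('WORD', token)),
    (r'\s+',    None),  # skip whitespace
]
sc = Scanner(lexicon)                       # hypothetical class name
m = sc.scanner.scanner('hello 42').match()  # SRE scanner over the input
print(m.lastindex, m.group())               # 2 hello -> 2nd lexicon entry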
Example #2
    def __init__(self, lexicon, flags=FLAGS):
        self.actions = [None]
        # combine phrases into a compound pattern
        s = sre_parse.Pattern()
        s.flags = flags
        p = []

        # NOTE(kgibbs): These lines must be added to make this file work under
        # Python 2.2, which is commonly used at Google.
        def enumerate(obj):
            i = -1
            for item in obj:
                i += 1
                yield i, item

        # NOTE(kgibbs): End changes.
        for idx, token in enumerate(lexicon):
            phrase = token.pattern
            try:
                subpattern = sre_parse.SubPattern(
                    s,
                    [(SUBPATTERN, (idx + 1, sre_parse.parse(phrase, flags)))])
            except sre_constants.error:
                raise
            p.append(subpattern)
            self.actions.append(token)

        s.groups = len(p) + 1  # NOTE(guido): Added to make SRE validation work
        p = sre_parse.SubPattern(s, [(BRANCH, (None, p))])
        self.scanner = sre_compile.compile(p)
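The lexicon entries in this variant are not (regex, action) pairs but objects exposing a .pattern attribute (note the token.pattern lookup above). The snippet does not show how the stored actions are invoked, so the class below is only a hypothetical token shape that satisfies the loop:

class Number(object):   # hypothetical token class
    pattern = r'\d+'    # consumed via token.pattern above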
Example #3
import sre_compile
import sre_parse

def record_matches(split, text):
    """Record which clauses match."""
    matches = frozenset()
    for clause in split:
        p = sre_parse.Pattern()
        scanner = sre_compile.compile(
            sre_parse.SubPattern(p, clause)).scanner(text)
        if scanner.match():
            matches |= {tuple(clause)}

    return matches
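A hedged usage sketch: each clause is assumed to be the item list of a parsed pattern. Literal-only patterns are used because the frozenset union above needs hashable items, and e.g. character-class nodes contain lists. (sre_parse.Pattern was renamed to State in Python 3.8, so record_matches as written targets older interpreters.)

clauses = [list(sre_parse.parse('foo')), list(sre_parse.parse('bar'))]
print(record_matches(clauses, 'barista'))
# roughly: frozenset({((LITERAL, 98), (LITERAL, 97), (LITERAL, 114))})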
Example #4
    def __init__(self, lexicon, flags=0):
        from sre_constants import BRANCH, SUBPATTERN
        self.lexicon = lexicon
        p = []
        s = sre_parse.Pattern()
        s.flags = flags
        for phrase, action in lexicon:
            p.append(sre_parse.SubPattern(s, [
                (SUBPATTERN, (len(p) + 1, sre_parse.parse(phrase, flags))),
            ]))

        s.groups = len(p) + 1
        p = sre_parse.SubPattern(s, [(BRANCH, (None, p))])
        self.scanner = sre_compile.compile(p)
Example #5
 def __init__(self, lexicon, flags=0):
     self.lexicon = lexicon
     # combine phrases into a compound pattern
     p = []
     s = sre_parse.Pattern()
     s.flags = flags
     for phrase, action in lexicon:
         p.append(sre_parse.SubPattern(s, [
             (SUBPATTERN, (len(p)+1, sre_parse.parse(phrase, flags))),
             ]))
     s.groups = len(p)+1
     p = sre_parse.SubPattern(s, [(BRANCH, (None, p))])
     self.scanner = sre_compile.compile(p)
Example #6
 def __init__(self, lexicon):
     from sre_constants import BRANCH, SUBPATTERN
     self.lexicon = lexicon
     # combine phrases into a compound pattern
     p = []
     s = sre_parse.Pattern()
     for phrase, action in lexicon:
         p.append(
             sre_parse.SubPattern(s, [
                 (SUBPATTERN, (len(p) + 1, sre_parse.parse(phrase))),
             ]))
     s.groups = len(p) + 1
     p = sre_parse.SubPattern(s, [(BRANCH, (None, p))])
     self.scanner = sre_compile.compile(p)
Example #7
    def __init__(self, regexp, negative=False, **property_names):
        """
        Create a new C{RegexpTokenizer} from a given regular expression.
        
        @type regexp: C{string} or C{SRE_Pattern}
        @param regexp: The regular expression used to tokenize texts.
            Unless C{negative} is true, this regular expression
            specifies the form of a single word type; so the list of
            tokens generated by tokenization includes all non-overlapping
            substrings that match C{regexp}.
        @type negative: C{boolean}
        @param negative: An optional parameter that inverts the
            meaning of C{regexp}.  In particular, if C{negative} is
            true, then C{regexp} is taken to specify the form of word
            separators (and not word types); so the list of tokens
            generated by tokenization includes all substrings that
            occur I{between} matches of the regular expression.
        @type property_names: C{dict}
        @param property_names: A dictionary that can be used to override
            the default property names.  Each entry maps from a
            default property name to a new property name.
        """
        assert chktype(1, regexp, str)

        AbstractTokenizer.__init__(self, **property_names)

        if hasattr(regexp, 'pattern'): regexp = regexp.pattern
        self._negative = bool(negative)

        # Replace any grouping parentheses with non-grouping ones.  We
        # need to do this, because the list returned by re.split will
        # contain an element corresponding to every set of grouping
        # parentheses.  We must not touch escaped parentheses, and
        # need to handle the case of escaped escapes (e.g. "\\(").
        # We also need to handle nested parentheses, which means our
        # regexp contexts must be zero-width.  There are also issues with
        # parentheses appearing in bracketed contexts, hence we
        # operate on the intermediate parse structure from sre_parse.
        parsed = sre_parse.parse(regexp)
        parsed = _remove_group_identifiers(parsed)

        # Add grouping parentheses around the regexp; this will allow
        # us to access the material that was split on.
        # Need to set the Pattern to expect a single group
        pattern = sre_parse.Pattern()
        pattern.groups += 1
        grouped = sre_parse.SubPattern(pattern)
        grouped.append((sre_constants.SUBPATTERN, (1, parsed)))

        self._regexp = sre_compile.compile(grouped, re.UNICODE)
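Examples #7 and #10 both call a helper _remove_group_identifiers that neither snippet includes. A plausible reconstruction, inferred from the comments and call sites, and matching the pre-3.6 two-tuple SUBPATTERN layout these snippets use (Python 3.6+ widened it to a four-tuple, as in Examples #8 and #15):

import sre_constants
import sre_parse

def _remove_group_identifiers(parsed):
    # Walk the sre_parse tree in place, turning every capturing
    # SUBPATTERN into a non-capturing one by nulling its group id;
    # sre_compile emits no MARK opcodes when the group is None.
    for i, (op, av) in enumerate(parsed):
        if op is sre_constants.SUBPATTERN:
            group, sub = av  # pre-3.6 layout, matching the snippets here
            parsed[i] = (op, (None, _remove_group_identifiers(sub)))
        elif op is sre_constants.BRANCH:
            for alternative in av[1]:
                _remove_group_identifiers(alternative)
        elif op in (sre_constants.MAX_REPEAT, sre_constants.MIN_REPEAT):
            _remove_group_identifiers(av[2])
    return parsed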
Example #8
 def __init__(self, lexicon, flags=0):
     from sre_constants import BRANCH, SUBPATTERN
     self.lexicon = lexicon
     # combine phrases into a compound pattern
     p = []
     s = sre_parse.Pattern()
     s.flags = flags
     for phrase, action in lexicon:
         gid = s.opengroup()
         p.append(sre_parse.SubPattern(s, [
             (SUBPATTERN, (gid, 0, 0, sre_parse.parse(phrase, flags))),
             ]))
         s.closegroup(gid, p[-1])
     p = sre_parse.SubPattern(s, [(BRANCH, (None, p))])
     self.scanner = sre_compile.compile(p)
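The (gid, 0, 0, ...) four-tuple here reflects a change in CPython 3.6, where SUBPATTERN nodes gained add_flags/del_flags slots for scoped inline flags such as (?i:...); the earlier examples that write (len(p) + 1, sre_parse.parse(phrase, flags)) use the older two-tuple layout. A quick check of which layout an interpreter uses:

import sre_parse

op, av = sre_parse.parse('(a)')[0]
print(len(av))  # 4 on Python 3.6+: (group, add_flags, del_flags, subpattern)
                # 2 before 3.6:     (group, subpattern)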
Example #9
 def _get_group_pattern(self, flags):
     # combine phrases into a compound pattern
     patterns = []
     sub_pattern = sre_parse.Pattern()
     sub_pattern.flags = flags
     for phrase, action in self.lexicon:
         patterns.append(
             sre_parse.SubPattern(sub_pattern, [
                 (SUBPATTERN,
                  (len(patterns) + 1, sre_parse.parse(phrase, flags))),
             ]))
     #sub_pattern.groups = len(patterns) + 1
     group_pattern = sre_parse.SubPattern(sub_pattern,
                                          [(BRANCH, (None, patterns))])
     return sre_compile.compile(group_pattern)
Example #10

def _compile(regexp):

    parsed = sre_parse.parse(regexp)
    parsed = _remove_group_identifiers(parsed)

    # Add grouping parentheses around the regexp; this will allow
    # us to access the material that was split on.
    # Need to set the Pattern to expect a single group

    pattern = sre_parse.Pattern()
    pattern.groups += 1
    grouped = sre_parse.SubPattern(pattern)
    grouped.append((sre_constants.SUBPATTERN, (1, parsed)))

    return sre_compile.compile(grouped, re.UNICODE | re.MULTILINE | re.DOTALL)
Example #11
 def _scan_init(self):
     p = []
     try:
         state = sre_parse.State()
     except AttributeError:
         # python < 3.8.0
         state = sre_parse.Pattern()
     for name, regx in self.LEXICON:
         group = state.opengroup(name)
         re = sre_parse.parse(regx)
         data = [(SUBPATTERN, (group, 0, 0, re))]
         p.append(sre_parse.SubPattern(state, data))
         state.closegroup(group, p[-1])
     data = [(BRANCH, (None, p))]
     return state, data
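Example #11 only builds the (state, data) pair; compiling and driving it is left to the caller. A hedged sketch of a companion method (_scan_tokens and its text parameter are hypothetical names; sre_parse and sre_compile are assumed to be imported at module level, just as the snippet itself assumes):

 def _scan_tokens(self, text):
     # Compile the BRANCH built by _scan_init and yield (name, lexeme)
     # pairs; m.lastgroup works because each group was opened by name
     # via state.opengroup(name) above.
     state, data = self._scan_init()
     scanner = sre_compile.compile(sre_parse.SubPattern(state, data))
     for m in iter(scanner.scanner(text).match, None):
         yield m.lastgroup, m.group()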
Example #12
    def __init__(self,
                 lexicon,
                 flags=(VERBOSE | MULTILINE | DOTALL),
                 verify=True):
        self.actions = [None]
        # combine phrases into a compound pattern
        s = sre_parse.Pattern()
        s.flags = flags
        p = []
        for idx, token in enumerate(lexicon):
            phrase = token.pattern
            try:
                subpattern = sre_parse.SubPattern(
                    s,
                    [(SUBPATTERN, (idx + 1, sre_parse.parse(phrase, flags)))])
            except sre_constants.error:
                print "Can't parse %s" % (token.__name__, )
                raise
            token.regex = re.compile(phrase, flags)
            p.append(subpattern)
            self.actions.append(token)

        p = sre_parse.SubPattern(s, [(BRANCH, (None, p))])
        self.scanner = sre_compile.compile(p)

        if verify:
            for token in lexicon:
                example = token.example
                if example is None:
                    continue

                def dead(string, i, j):
                    print token.__name__, i, j
                    print '--- PATTERN ---'
                    print token.pattern
                    print '--- PARSED EXAMPLE ---'
                    print string[:i]
                    print '--- UNMATCHED CHUNK ---'
                    print repr(string[i:j])
                    raise ValueError, "Token %s can not be verified" % token.__name__

                s = Scanner([token, InsignificantWhitespace], verify=False)
                try:
                    for m in s.iterscan(example, dead=dead):
                        pass
                except:
                    print example
                    raise
Example #13
def build_scanner(lexicon, flags=0):
    import sre_parse
    import sre_compile
    from sre_constants import BRANCH, SUBPATTERN
    # combine phrases into a compound pattern
    p = []
    s = sre_parse.Pattern()
    s.flags = flags
    for phrase, action in lexicon:
        p.append(
            sre_parse.SubPattern(s, [
                (SUBPATTERN, (len(p) + 1, sre_parse.parse(phrase, flags))),
            ]))
    s.groups = len(p) + 1
    p = sre_parse.SubPattern(s, [(BRANCH, (None, p))])
    scanner = sre_compile.compile(p)
    return scanner
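A hypothetical call, mirroring the sketch under Example #1:

scanner = build_scanner([(r'\d+', None), (r'[a-z]+', None)])
print(scanner.scanner('abc123').match().lastindex)  # 2 -> [a-z]+ matched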
Example #14
    def __init__(self, lexicon, flags=FLAGS):
        self.actions = [None]
        # combine phrases into a compound pattern
        s = sre_parse.Pattern()
        s.flags = flags
        p = []
        for idx, token in enumerate(lexicon):
            phrase = token.pattern
            try:
                subpattern = sre_parse.SubPattern(s,
                    [(SUBPATTERN, (idx + 1, sre_parse.parse(phrase, flags)))])
            except sre_constants.error:
                raise
            p.append(subpattern)
            self.actions.append(token)

        p = sre_parse.SubPattern(s, [(BRANCH, (None, p))])
        self.scanner = sre_compile.compile(p)
Example #15
File: re.py  Project: ybay/yZhPy
 def __init__(self, lexicon, flags=0):
     from sre_constants import BRANCH, SUBPATTERN
     if isinstance(flags, RegexFlag):
         flags = flags.value
     self.lexicon = lexicon
     # combine phrases into a compound pattern
     p = []
     s = sre_parse.Pattern()
     s.flags = flags
     for phrase, action in lexicon:
         gid = s.opengroup()
         p.append(
             sre_parse.SubPattern(s, [
                 (SUBPATTERN, (gid, 0, 0, sre_parse.parse(phrase, flags))),
             ]))
         s.closegroup(gid, p[-1])
     p = sre_parse.SubPattern(s, [(BRANCH, (None, p))])
     self.scanner = sre_compile.compile(p)
Example #16
def make_scanner(lexicon, flags=FLAGS):
    actions = [None]
    # Combine phrases into a compound pattern
    s = sre_parse.Pattern()
    s.flags = flags
    charpatterns = {}
    p = []
    idx = 0
    for token in lexicon:
        if token.pattern in (r'\[', r'{', r'"'):
            charpatterns[token.pattern[-1]] = token
        idx += 1
        phrase = token.pattern
        try:
            subpattern = sre_parse.SubPattern(
                s, [(SUBPATTERN, (idx, sre_parse.parse(phrase, flags)))])
        except sre_constants.error:
            raise
        p.append(subpattern)
        actions.append(token)

    s.groups = len(p) + 1  # NOTE(guido): Added to make SRE validation work
    p = sre_parse.SubPattern(s, [(BRANCH, (None, p))])
    scanner = sre_compile.compile(p).scanner

    def _scan_once(string, idx=0, context=None):
        try:
            action = charpatterns[string[idx]]
        except KeyError:
            pass
        except IndexError:
            raise StopIteration
        else:
            return action((string, idx + 1), context)

        m = scanner(string, idx).match()
        if m is None or m.end() == idx:
            raise StopIteration
        return actions[m.lastindex](m, context)

    return _scan_once
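A hedged usage sketch for make_scanner. The Number token below is a hypothetical stand-in (the [, {, " fast path suggests the real lexicon held JSON tokens), and FLAGS is assumed to be defined at module level as in Examples #12 and #14:

class Number(object):
    pattern = r'-?\d+'
    def __call__(self, m, context):
        # invoked as actions[m.lastindex](m, context) above
        return int(m.group()), m.end()

scan_once = make_scanner([Number()])
print(scan_once('123', 0))  # (123, 3)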
Example #17
 def test_no_pattern(self):
     import sre_compile, sre_parse
     sre_pattern = sre_compile.compile(
         sre_parse.SubPattern(sre_parse.Pattern()))
     assert sre_pattern.scanner('s') is not None
Example #18
 def as_sre_class(self):
     import sre_parse, sre_compile
     p = sre_parse.Pattern()
     p.flags = sre_parse.SRE_FLAG_UNICODE
     p.str = self.as_re_class()
     return sre_compile.compile(self._as_sre_subpattern(p))
Example #19
import sre_parse

# This is the string tokenizer.PseudoToken:
pattern = '[ \\f\\t]*((\\\\\\r?\\n|\\Z|#[^\\r\\n]*|([uUbB]?[rR]?\'\'\'|[uUbB]?[rR]?"""))|((\\d+[jJ]|((\\d+\\.\\d*|\\.\\d+)([eE][-+]?\\d+)?|\\d+[eE][-+]?\\d+)[jJ])|((\\d+\\.\\d*|\\.\\d+)([eE][-+]?\\d+)?|\\d+[eE][-+]?\\d+)|(0[xX][\\da-fA-F]+[lL]?|0[bB][01]+[lL]?|(0[oO][0-7]+)|(0[0-7]*)[lL]?|[1-9]\\d*[lL]?))|((\\*\\*=?|>>=?|<<=?|<>|!=|//=?|[+\\-*/%&|^=<>]=?|~)|[][(){}]|(\\r?\\n|[:;.,`@]))|([uUbB]?[rR]?\'[^\\n\'\\\\]*(?:\\\\.[^\\n\'\\\\]*)*(\'|\\\\\\r?\\n)|[uUbB]?[rR]?"[^\\n"\\\\]*(?:\\\\.[^\\n"\\\\]*)*("|\\\\\\r?\\n))|[a-zA-Z_]\\w*)'

# Python 2 micro-benchmark: re-parse the tokenizer pattern 600 times.
for i in xrange(600):
    p = sre_parse.Pattern()
    p.flags = 0
    p.str = pattern
    sre_parse._parse_sub(sre_parse.Tokenizer(pattern), p, 0)