def test_sequence(self):
    """Check ``Sequence`` on a success, a failure, and a non-zero offset."""
    def hi_lo_bingo():
        # Build a fresh expression for every assertion, just as the inline
        # constructor calls did.
        return Sequence(Regex('hi*'), Literal('lo'), Regex('.ingo'))

    matched = hi_lo_bingo().match('hiiiilobingo1234')
    len_eq(matched, 12)  # succeed

    unmatched = hi_lo_bingo().match('hiiiilobing')
    len_eq(unmatched, None)  # don't

    offset_match = Sequence(Regex('hi*')).match('>hiiii', 1)
    len_eq(offset_match, 5)  # non-0 pos
def test_sequence(self):
    """Check ``Sequence`` on a success, a raised failure, and an offset."""
    passing = Sequence(Regex('hi*'), Literal('lo'), Regex('.ingo'))
    len_eq(passing.match('hiiiilobingo1234'), 12)  # succeed

    # The truncated input must raise rather than return a node.
    failing = Sequence(Regex('hi*'), Literal('lo'), Regex('.ingo'))
    assert_raises(ParseError, failing.match, 'hiiiilobing')  # don't

    offset_only = Sequence(Regex('hi*'))
    len_eq(offset_only.match('>hiiii', 1), 5)  # non-0 pos
def test_zero_or_more(self):
    """Check ``ZeroOrMore`` with zero repeats, several, and a zero-width member."""
    # (input text, expected match length): zero, then more.
    for text, expected_length in (('', 0), ('bbb', 3)):
        len_eq(ZeroOrMore(Literal('b')).match(text), expected_length)

    # Validate the next test: the bare anchor matches with zero length.
    start_anchor = Regex('^')
    len_eq(start_anchor.match(''), 0)

    # Try to make it loop infinitely using a zero-length contained
    # expression:
    repeated_anchor = ZeroOrMore(Regex('^'))
    len_eq(repeated_anchor.match(''), 0)
def test_all_of(self):
    """``AllOf`` should return its own node, wrapping the last child."""
    text = 'a'
    first_member = Literal('a', name='lit_a')
    last_member = Regex('A', ignore_case=True, name='reg_a')
    expr = AllOf(first_member, last_member, name='all_of')

    actual = expr.match(text)

    # Only the final member's node is expected as a child.
    last_child = Node('reg_a', text, 0, 1)
    expected = Node('all_of', text, 0, 1, children=[last_child])
    eq_(actual, expected)
def test_lookahead(self):
    """Check that a ``&"a"`` lookahead gates the following regex.

    Input not starting with ``a`` must raise ``ParseError``; matching input
    must yield a zero-width lookahead node followed by the regex node.
    """
    grammar = Grammar(r'''starts_with_a = &"a" ~"[a-z]+"''')

    self.assertRaises(ParseError, grammar.parse, 'burp')

    s = 'arp'
    parsed = grammar.parse(s)
    lookahead_node = Node(Lookahead(Literal('a')), s, 0, 0)
    word_node = Node(Regex(r'[a-z]+'), s, 0, 3)
    expected = Node(grammar['starts_with_a'], s, 0, 3,
                    children=[lookahead_node, word_node])
    self.assertEqual(parsed, expected)
def test_one_or_more(self):
    """Check ``OneOrMore``: default and custom minimums, plus a zero-width member."""
    single = OneOrMore(Literal('b')).match('b')
    len_eq(single, 1)  # one

    several = OneOrMore(Literal('b')).match('bbb')
    len_eq(several, 3)  # more

    at_least_three = OneOrMore(Literal('b'), min=3).match('bbb')
    len_eq(at_least_three, 3)  # with custom min; success

    too_few = OneOrMore(Literal('b'), min=3)
    assert_raises(ParseError, too_few.match, 'bb')  # with custom min; failure

    zero_width = OneOrMore(Regex('^')).match('bb')
    len_eq(zero_width, 0)  # attempt infinite loop
def visit_regex(self, node, regex):
    """Build a ``Regex`` expression from a visited regex rule.

    ``regex`` must unpack into exactly four items; only the second (a
    literal expression) and the third (the flags node) are used.
    """
    _tilde, literal_expr, flag_node, _trailing = regex

    # Flag letters are matched case-insensitively.
    flag_letters = flag_node.text.upper()

    # Pull the string back out of the Literal object.
    pattern = literal_expr.literal

    options = {
        'ignore_case': 'I' in flag_letters,
        'locale': 'L' in flag_letters,
        'multiline': 'M' in flag_letters,
        'dot_all': 'S' in flag_letters,
        'unicode': 'U' in flag_letters,
        'verbose': 'X' in flag_letters,
    }
    return Regex(pattern, **options)
def test_use_regex_library():
    """Check that ``use_regex_library=True`` enables ``\\p{L}`` patterns.

    A grammar whose only rule is a Unicode-letter regex is parsed against a
    four-letter Cyrillic word, and the whole word is expected to match.
    """
    grammar = Grammar(r'''
    unicode_word = ~"[\p{L}]*"
    ''', use_regex_library=True)

    # The Cyrillic word for "Test" (U+0422 U+0435 U+0441 U+0442). The literal
    # previously held the UTF-8 bytes of this word mis-decoded as a
    # single-byte encoding, which contradicted the expected ``end=4`` below.
    # Escapes keep the literal safe from a repeat of that encoding damage.
    text = '\u0422\u0435\u0441\u0442'

    expected = RegexNode(
        expr=Regex(pattern=r'[\p{L}]*', use_regex_library=True),
        full_text=text,
        start=0,
        end=4)
    result = grammar.parse(text=text)
    eq_(result, expected)
def visit_regex(self, regex, children):
    """Build a ``Regex`` expression from a visited regex rule.

    ``children`` must unpack into exactly four items; only the second (a
    literal expression) and the third (the flags node) are used.
    """
    _, literal_expr, flag_node, _ = children

    # Flag letters are matched case-insensitively.
    flag_letters = flag_node.text.upper()

    # NOTE(review): the ``str`` keyword looks like it may be an automated
    # rename of ``unicode``; confirm it matches ``Regex``'s signature.
    options = {
        'ignore_case': 'I' in flag_letters,
        'locale': 'L' in flag_letters,
        'multiline': 'M' in flag_letters,
        'dot_all': 'S' in flag_letters,
        'str': 'U' in flag_letters,
        'verbose': 'X' in flag_letters,
    }

    # Pull the string back out of the Literal object.
    return Regex(literal_expr.literal, **options)
def visit_regex(self, regex, xxx_todo_changeme11):
    """Build a ``Regex`` expression from a visited regex rule.

    ``xxx_todo_changeme11`` must unpack into exactly four items; only the
    second (a literal expression) and the third (the flags node) are used.
    """
    # NOTE(review): the parameter name appears to be a placeholder left by an
    # automated conversion; it is kept so the signature is unchanged.
    _tilde, literal_expr, flag_node, _trailing = xxx_todo_changeme11

    # Flag letters are matched case-insensitively.
    flag_letters = flag_node.text.upper()

    # Pull the string back out of the Literal object.
    pattern = literal_expr.literal

    # NOTE(review): the ``str`` keyword may likewise be a converted
    # ``unicode``; confirm it matches ``Regex``'s signature.
    return Regex(
        pattern,
        ignore_case='I' in flag_letters,
        locale='L' in flag_letters,
        multiline='M' in flag_letters,
        dot_all='S' in flag_letters,
        str='U' in flag_letters,
        verbose='X' in flag_letters,
    )
def _expressions_from_rules(self, rule_syntax):
    """Bootstrap the rule language and parse ``rule_syntax`` with it.

    A minimal set of expressions describing the grammar-definition syntax
    is built by hand, used to parse ``rule_syntax``, and the resulting tree
    is handed to a ``RuleVisitor``, whose result is returned. (The original
    docstring describes that result as a 2-tuple: a dict of rule names
    pointing to their expressions, then the top-level expression for the
    first rule.)
    """
    # Leaf patterns.
    ws_expr = Regex(r'\s+', name='ws')
    blank_expr = Regex(r'[ \t]+', name='_')
    label_expr = Regex(r'[a-zA-Z_][a-zA-Z_0-9]*', name='label')
    quantifier_expr = Regex(r'[*+?]', name='quantifier')
    # This pattern supports empty literals. TODO: A problem?
    literal_expr = Regex(r'u?r?"[^"\\]*(?:\\.[^"\\]*)*"',
                         ignore_case=True,
                         dot_all=True,
                         name='literal')
    eol_expr = Regex(r'[\r\n$]', name='eol')  # TODO: Support $.

    # Atoms and terms.
    regex_flags = Regex('[ilmsux]*', ignore_case=True)
    regex_expr = Sequence(Literal('~'), literal_expr, regex_flags,
                          name='regex')
    atom_expr = OneOf(label_expr, literal_expr, regex_expr, name='atom')
    quantified_expr = Sequence(atom_expr, quantifier_expr, name='quantified')
    term_expr = OneOf(quantified_expr, atom_expr, name='term')
    another_term_expr = Sequence(blank_expr, term_expr, name='another_term')

    # Multi-term right-hand sides: plain sequences, "/" and "&" chains.
    sequence_expr = Sequence(term_expr, OneOrMore(another_term_expr),
                             name='sequence')
    or_term_expr = Sequence(blank_expr, Literal('/'), another_term_expr,
                            name='or_term')
    ored_expr = Sequence(term_expr, OneOrMore(or_term_expr), name='ored')
    and_term_expr = Sequence(blank_expr, Literal('&'), another_term_expr,
                             name='and_term')
    anded_expr = Sequence(term_expr, OneOrMore(and_term_expr), name='anded')
    poly_term_expr = OneOf(anded_expr, ored_expr, sequence_expr,
                           name='poly_term')
    rhs_expr = OneOf(poly_term_expr, term_expr, name='rhs')

    # Whole rules.
    rule_expr = Sequence(Optional(ws_expr),
                         label_expr,
                         Optional(blank_expr),
                         Literal('='),
                         Optional(blank_expr),
                         rhs_expr,
                         Optional(blank_expr),
                         eol_expr,
                         name='rule')
    rules_expr = Sequence(OneOrMore(rule_expr), Optional(ws_expr),
                          name='rules')

    # Parse the (possibly more extensive) rule syntax with the hand-built
    # expressions, then turn the parse tree into a map of expressions.
    return RuleVisitor().visit(rules_expr.parse(rule_syntax))
def _expressions_from_rules(self, rule_syntax, custom_rules): """Return the rules for parsing the grammar definition syntax. Return a 2-tuple: a dict of rule names pointing to their expressions, and then the top-level expression for the first rule. """ # Hard-code enough of the rules to parse the grammar that describes the # grammar description language, to bootstrap: comment = Regex(r'#[^\r\n]*', name='comment') meaninglessness = OneOf(Regex(r'\s+'), comment, name='meaninglessness') _ = ZeroOrMore(meaninglessness, name='_') equals = Sequence(Literal('='), _, name='equals') label = Sequence(Regex(r'[a-zA-Z_][a-zA-Z_0-9]*'), _, name='label') reference = Sequence(label, Not(equals), name='reference') quantifier = Sequence(Regex(r'[*+?]'), _, name='quantifier') # This pattern supports empty literals. TODO: A problem? spaceless_literal = Regex(r'u?r?"[^"\\]*(?:\\.[^"\\]*)*"', ignore_case=True, dot_all=True, name='spaceless_literal') literal = Sequence(spaceless_literal, _, name='literal') regex = Sequence(Literal('~'), literal, Regex('[ilmsuxa]*', ignore_case=True), _, name='regex') atom = OneOf(reference, literal, regex, name='atom') quantified = Sequence(atom, quantifier, name='quantified') term = OneOf(quantified, atom, name='term') not_term = Sequence(Literal('!'), term, _, name='not_term') term.members = (not_term, ) + term.members sequence = Sequence(term, OneOrMore(term), name='sequence') or_term = Sequence(Literal('/'), _, term, name='or_term') ored = Sequence(term, OneOrMore(or_term), name='ored') expression = OneOf(ored, sequence, term, name='expression') rule = Sequence(label, equals, expression, name='rule') rules = Sequence(_, OneOrMore(rule), name='rules') # Use those hard-coded rules to parse the (more extensive) rule syntax. # (For example, unless I start using parentheses in the rule language # definition itself, I should never have to hard-code expressions for # those above.) 
rule_tree = rules.parse(rule_syntax) # Turn the parse tree into a map of expressions: return RuleVisitor().visit(rule_tree)
def test_all_of(self):
    """Check ``AllOf`` when every member matches and when one does not."""
    both_match = AllOf(Literal('0'), Regex('..'))
    len_eq(both_match.match('01'), 2)  # match

    second_fails = AllOf(Literal('0'), Regex('.2'))
    len_eq(second_fails.match('01'), None)  # don't
def test_not(self):
    """Check ``Not`` on text its member cannot match and on text it can."""
    # (input text, expected match length): match, then don't.
    for text, expected_length in (('', 0), ('Hi', None)):
        len_eq(Not(Regex('.')).match(text), expected_length)
def test_not(self):
    """Check ``Not``: zero-length success, and ``ParseError`` when the member matches."""
    on_empty = Not(Regex('.')).match('')
    len_eq(on_empty, 0)  # match

    negated = Not(Regex('.'))
    assert_raises(ParseError, negated.match, 'Hi')  # don't
def test_one_or_more(self):
    """Check ``OneOrMore``: default and custom minimums, plus a zero-width member."""
    single = OneOrMore(Literal('b')).match('b')
    len_eq(single, 1)  # one

    several = OneOrMore(Literal('b')).match('bbb')
    len_eq(several, 3)  # more

    at_least_three = OneOrMore(Literal('b'), min=3).match('bbb')
    len_eq(at_least_three, 3)  # with custom min; success

    too_few = OneOrMore(Literal('b'), min=3).match('bb')
    len_eq(too_few, None)  # with custom min; failure

    zero_width = OneOrMore(Regex('^')).match('bb')
    len_eq(zero_width, 0)  # attempt infinite loop
def test_regex(self):
    """Check ``Literal``/``Regex`` matching, a raised failure, and ``ignore_case``."""
    from_offset = Literal('hello').match('ehello', 1)
    len_eq(from_offset, 5)  # simple

    starred = Regex('hello*').match('hellooo')
    len_eq(starred, 7)  # *

    non_matching = Regex('hello*')
    assert_raises(ParseError, non_matching.match, 'goodbye')  # no match

    case_blind = Regex('hello', ignore_case=True).match('HELLO')
    len_eq(case_blind, 5)
"""Stick a :class:`LazyReference` in the tree as a placeholder. We resolve them all later. """ return LazyReference(label) def visit_regex(self, regex, (tilde, literal, flags, _)): """Return a ``Regex`` expression.""" flags = flags.text.upper() pattern = literal.literal # Pull the string back out of the Literal # object. return Regex(pattern, ignore_case='I' in flags, locale='L' in flags, multiline='M' in flags, dot_all='S' in flags, unicode='U' in flags, verbose='X' in flags) def visit_spaceless_literal(self, spaceless_literal, visited_children): """Turn a string literal into a ``Literal`` that recognizes it.""" # Piggyback on Python's string support so we can have backslash # escaping and niceties like \n, \t, etc. # string.decode('string_escape') would have been a lower-level # possibility. return Literal(ast.literal_eval(spaceless_literal.text)) def visit_literal(self, literal, (spaceless_literal, _)): """Pick just the literal out of a literal-and-junk combo.""" return spaceless_literal
def test_regex(self):
    """Check ``Literal``/``Regex`` matching, a failed match, and ``ignore_case``."""
    from_offset = Literal('hello').match('ehello', 1)
    len_eq(from_offset, 5)  # simple

    starred = Regex('hello*').match('hellooo')
    len_eq(starred, 7)  # *

    missed = Regex('hello*').match('goodbye')
    len_eq(missed, None)  # no match

    case_blind = Regex('hello', ignore_case=True).match('HELLO')
    len_eq(case_blind, 5)