def test_no_flatten_or(self):
    '''
    Clear the configuration, enable `flatten`, and check both the printed
    form of the resulting matcher tree and the parse of a short input.
    '''
    nested = Join(Or('b', 'c'))
    matcher = Or('a', nested)
    matcher.config.clear().flatten()
    parser = matcher.get_parse()

    # the tree is expected to print as a two-way Or
    description = str(parser.matcher)
    assert description == "Or(Literal, Transform)", description

    parsed = parser('abcd')
    assert parsed == ['a'], parsed
def SingleLineString(quote='"', escape='\\', exclude='\n'):
    '''
    Like `String`, but will not match across multiple lines.

    `quote` delimits the string, `exclude` lists the characters that may
    not appear in the body and, when `escape` is not empty, an escaped
    quote is also accepted as body content (the escape itself is dropped).
    '''
    delimiter = Literal(quote)
    # ordinary content: anything except the delimiter or an excluded char
    body = AnyBut(Or(delimiter, Any(exclude)))
    if escape:
        escaped_delimiter = And(Drop(escape), delimiter)
        body = Or(body, escaped_delimiter)
    repeated = Repeat(body, add_=True)
    return And(Drop(delimiter), repeated, Drop(delimiter))
def SkipString(quote='"', escape='\\', ignore='\n', empty='', join=__add__):
    '''
    Like `String`, matching across multiple lines, but will silently
    drop newlines.

    `quote` delimits the string and `ignore` lists the characters that are
    matched but dropped from the result.  When `escape` is not empty an
    escaped quote is accepted as content (the escape itself is dropped).
    `empty` and `join` are handed to `Repeat` as its `reduce` argument.
    '''
    delimiter = Literal(quote)
    # ordinary content: anything except the delimiter or an ignored char
    body = AnyBut(Or(delimiter, Any(ignore)))
    if escape:
        escaped_delimiter = And(Drop(escape), delimiter)
        body = Or(body, escaped_delimiter)
    # ignored characters are consumed, but contribute nothing
    body = Or(body, Drop(Any(ignore)))
    repeated = Repeat(body, reduce=(empty, join))
    return And(Drop(delimiter), repeated, Drop(delimiter))
def SingleLineString(quote='"', escape='\\', exclude='\n', empty='',
                     join=__add__):
    '''
    Like `String`, but will not match across multiple lines.

    `quote` delimits the string and `exclude` lists the characters that
    may not appear in the body.  When `escape` is not empty an escaped
    quote is accepted as content (the escape itself is dropped).  `empty`
    and `join` are handed to `Repeat` as its `reduce` argument.
    '''
    delimiter = Literal(quote)
    # ordinary content: anything except the delimiter or an excluded char
    body = AnyBut(Or(delimiter, Any(exclude)))
    if escape:
        escaped_delimiter = And(Drop(escape), delimiter)
        body = Or(body, escaped_delimiter)
    repeated = Repeat(body, reduce=(empty, join))
    return And(Drop(delimiter), repeated, Drop(delimiter))
def test_bline(self):
    '''
    Test a simple example: letters introduce numbers in an indented block.
    '''
    #basicConfig(level=DEBUG)

    number = Token(Digit())
    letter = Token(Letter())

    # the simplest whitespace grammar i can think of - lines are either
    # numbers (which are single, simple statements) or letters (which
    # mark the start of a new, indented block).
    block = Delayed()
    line = Or(BLine(number),
              BLine(letter) & block) > list
    # and a block is simply a collection of lines, as above
    block += Block(line[1:])

    program = Trace(line[1:])
    # The input must be line- and indentation-structured for the nested
    # result below to be possible.
    # NOTE(review): the line breaks and indents were reconstructed from the
    # expected result, assuming block_policy=1 means one extra space per
    # nesting level - confirm against the original test data.
    text = ('1\n'
            '2\n'
            'a\n'
            ' 3\n'
            ' b\n'
            '  4\n'
            '  5\n'
            ' 6\n')
    program.config.default_line_aware(block_policy=1)
    parser = program.get_parse_string()
    result = parser(text)
    assert result == [['1'],
                      ['2'],
                      ['a', ['3'],
                            ['b', ['4'],
                                  ['5']],
                            ['6']]], result
def test_continued_explicit(self):
    '''
    Parse nested, indented blocks using lines built by a
    `ContinuedLineFactory` and the `explicit` block policy.
    '''
    number = Token(Digit())
    letter = Token(Letter())

    block = Delayed()
    bline = ContinuedLineFactory(r'x')
    # a line is either a number, or a letter followed by a nested block
    line = Or(bline(number),
              bline(letter) & block) > list
    block += Block(line[1:])

    program = Trace(line[1:])
    # The input must be line- and indentation-structured for the nested
    # result below to be possible.
    # NOTE(review): the line breaks and indents were reconstructed from the
    # expected result; the original indent widths are unknown - confirm
    # against the original test data.
    text = ('1\n'
            '2\n'
            'a\n'
            ' 3\n'
            ' b\n'
            '  4\n'
            '  5\n'
            ' 6\n')
    program.config.lines(block_policy=explicit)
    parser = program.get_parse_string()
    result = parser(text)
    assert result == [['1'],
                      ['2'],
                      ['a', ['3'],
                            ['b', ['4'],
                                  ['5']],
                            ['6']]], result
def test_explicit(self):
    '''
    Parse nested, indented blocks using `Line` and the `explicit` block
    policy.
    '''
    #basicConfig(level=DEBUG)

    number = Token(Digit())
    letter = Token(Letter())

    block = Delayed()
    # a line is either a number, or a letter followed by a nested block
    line = Or(Line(number),
              Line(letter) & block) > list
    block += Block(line[1:])

    program = Trace(line[1:])
    # The input must be line- and indentation-structured for the nested
    # result below to be possible.
    # NOTE(review): the line breaks and indents were reconstructed from the
    # expected result; the original indent widths are unknown - confirm
    # against the original test data.
    text = ('1\n'
            '2\n'
            'a\n'
            ' 3\n'
            ' b\n'
            '  4\n'
            '  5\n'
            ' 6\n')
    program.config.lines(block_policy=explicit)
    parser = program.get_parse_string()
    result = parser(text)
    assert result == [['1'],
                      ['2'],
                      ['a', ['3'],
                            ['b', ['4'],
                                  ['5']],
                            ['6']]], result
def UnsignedEFloat(decimal='.', exponent='eE'):
    '''
    As `UnsignedEReal`, but must contain a decimal or exponent.  This
    will match real values that are not integers.
    '''
    # a real value followed by an exponent marker and a signed integer...
    with_exponent = Join(UnsignedReal(decimal),
                         Any(exponent),
                         SignedInteger())
    # ...or a float with no exponent at all
    without_exponent = UnsignedFloat(decimal)
    return Or(with_exponent, without_exponent)
def __init__(self):
    '''
    Build the matchers for escaped characters and for the `s` / `S`
    range shortcuts, then initialise the base alphabet to cover
    `chr(0)` to `chr(maxunicode)`.
    '''
    # imported here rather than at module level, as in the original code
    from lepl.matchers.core import Any, Literal
    from lepl.matchers.combine import Or
    from lepl.matchers.derived import Drop, Map
    from lepl.regexp.core import Character

    def mkhex(char, n):
        # `char` (dropped) followed by hex digits, converted to a character
        digits = Any('0123456789abcdefABCDEF')[n, ...]
        return (Drop(Any(char)) + digits) >> (lambda x: chr(int(x, 16)))

    def mkchr(char, chars, invert=False):
        # each character becomes a single-point interval
        intervals = [(c, c) for c in chars]
        if invert:
            # this delays call to invert until after creation of self
            func = lambda _: Character(self.invert(intervals), self)
        else:
            func = lambda _: Character(intervals, self)
        return Map(Literal(char), func)

    whitespace = mkchr('s', _WHITESPACE)
    not_whitespace = mkchr('S', _WHITESPACE, invert=True)
    range_matcher = Or(whitespace, not_whitespace)

    escaped = Any(ILLEGAL) | mkhex('x', 2) | mkhex('u', 4) | mkhex('U', 8)

    first_char = chr(0)
    last_char = chr(maxunicode)
    super(UnicodeAlphabet, self).__init__(first_char, last_char,
                                          escaped=escaped,
                                          range=range_matcher)
def UnsignedReal(decimal='.'):
    '''
    Match a sequence of digits that may include a decimal point.  This
    will match both integer and float values.
    '''
    # optional leading digits, a decimal point, then digits
    with_fraction = Join(Optional(UnsignedInteger()),
                         Any(decimal),
                         UnsignedInteger())
    # digits, optionally followed by a bare decimal point
    without_fraction = Join(UnsignedInteger(), Optional(Any(decimal)))
    return Or(with_fraction, without_fraction)
def and_(a, b):
    '''
    Add space only in the case when both consume something.
    '''
    # NOTE(review): presumably Consumer(x, False) matches x only when it
    # consumes nothing - confirm against the Consumer definition.
    with_separator = And(Consumer(a), separator, Consumer(b))
    first_consumes = And(Consumer(a), Consumer(b, False))
    second_consumes = And(Consumer(a, False), Consumer(b))
    neither_consumes = And(Consumer(a, False), Consumer(b, False))
    return Or(with_separator,
              first_consumes,
              second_consumes,
              neither_consumes)
def UnsignedFloat(decimal='.'):
    '''
    Match a sequence of digits that must include a decimal point.  This
    will match real values that are not integers.
    '''
    # optional leading digits, a decimal point, then digits
    digits_after_point = Join(Optional(UnsignedInteger()),
                              Any(decimal),
                              UnsignedInteger())
    # digits followed by a (required) trailing decimal point
    trailing_point = Join(UnsignedInteger(), Any(decimal))
    return Or(digits_after_point, trailing_point)
def Literals(*matchers):
    '''
    A series of literals, joined with `Or`.
    '''
    # I considered implementing this by extending Literal() itself, but
    # that would have meant putting "Or-like" functionality in Literal,
    # and I felt it better to keep the base matchers reasonably orthogonal.
    alternatives = [Literal(text) for text in matchers]
    return Or(*alternatives)
def test_all(self):
    '''
    Match '1.e3' against a float with an exponent or a plain float, using
    the default configuration, and check both results in order.
    '''
    with_exponent = Join(UnsignedFloat(), Any('eE'), SignedInteger())
    plain = UnsignedFloat()
    either = Or(with_exponent, plain)

    # Notes recorded against the configurations that were tried:
    #   config.default()               - wrong order (the one in use)
    #   config.compile_to_dfa()        - gives 1.e3 only
    #   config.compile_to_nfa()        - wrong order
    #   config.no_compile_to_regexp()  - ok
    #   config.clear()                 - ok
    either.config.default()

    expected = [['1.e3'], ['1.']]
    self.assert_direct('1.e3', either, expected)
def String(quote='"', escape='\\'):
    '''
    Match a string with quotes that can be escaped.  This will match across
    newlines (see `SingleLineString` for an alternative).

    `quote` delimits the string; when `escape` is not empty an escaped
    quote is accepted as content (the escape itself is dropped).
    '''
    delimiter = Literal(quote)
    body = AnyBut(delimiter)
    if escape:
        # the escaped delimiter is tried before ordinary content
        escaped_delimiter = And(Drop(escape), delimiter)
        body = Or(escaped_delimiter, body)
    repeated = Repeat(body, add_=True)
    return And(Drop(delimiter), repeated, Drop(delimiter))
def SignedEFloat(decimal='.', exponent='eE'):
    '''
    As `SignedEReal`, but must contain a decimal or exponent.  This
    will match real values that are not integers.
    '''
    if decimal != '.' or exponent != 'eE':
        # non-default characters: compose the matcher from its parts
        with_exponent = Join(SignedReal(decimal),
                             Any(exponent),
                             SignedInteger())
        return Or(with_exponent, SignedFloat(decimal))
    # hack to faster direct implementation for now
    return NfaRegexp(
        r'[\+\-]?(?:[0-9]*\.[0-9]+(?:[eE][\+\-]?[0-9]+)?|[0-9]+\.(?:[eE][\+\-]?[0-9]+)?|[0-9]+[eE][\+\-]?[0-9]+)'
    )
def __init__(self, clean_html=True):
    '''
    Store the `clean_html` flag, the punctuation and context-exception
    tables, and build the LEPL matchers used to find real numbers.
    '''
    self.clean_html = clean_html
    self._punctuation = '!"#&\'()*+,.;<=>?@[\\]^_`{|}~'

    # context exception word sets
    self._lctx_1_exceptions = set('/ :'.split())
    self._rctx_1_exceptions = set('/ : th am pm hour hours %'.split())
    self._lctx_2_exceptions = set('discount redeem voucher'.split())
    self._rctx_2_exceptions = set('discount redeem voucher'.split())

    # LEPL Real Number Matchers (w/thousands)
    comma_groups = Join(Drop(','), Add(Digit()[3]))[:]
    groups_with_fraction = Join(comma_groups, Any('.'), UnsignedInteger())
    groups_without_fraction = Join(comma_groups, Optional(Any('.')))
    thousands = Or(groups_with_fraction, groups_without_fraction)

    grouped_number = Join(UnsignedInteger(), thousands)
    real = Or(grouped_number, UnsignedReal()) >> float

    # any run of input that is not itself a real number
    filler = Join(Star(AnyBut(real)))
    self._real_partition_matcher = Star(And(filler, real, filler))

    # reals separated by (dropped) whitespace, commas or dashes
    separator = Drop(Star(Or(Whitespace(), Any(',-'))))
    self._real_simple_matcher = real[:, separator]
def String(quote='"', escape='\\', empty='', join=__add__):
    '''
    Match a string with quotes that can be escaped.  This will match across
    newlines (see `SingleLineString` for an alternative).

    More generally, a string is a grouping of results.  Setting `empty` and
    `join` correctly will allow this matcher to work with a variety of types.
    '''
    delimiter = Literal(quote)
    body = AnyBut(delimiter)
    if escape:
        # the escaped delimiter is tried before ordinary content
        escaped_delimiter = And(Drop(escape), delimiter)
        body = Or(escaped_delimiter, body)
    repeated = Repeat(body, reduce=(empty, join))
    return And(Drop(delimiter), repeated, Drop(delimiter))
def simple_grammar(self):
    '''
    Test a simple example: letters introduce numbers in an indented block.
    '''
    #basicConfig(level=DEBUG)

    digit_token = Token(Digit())
    letter_token = Token(Letter())

    # the simplest whitespace grammar i can think of - lines are either
    # numbers (which are single, simple statements) or letters (which
    # mark the start of a new, indented block).
    block = Delayed()
    statement = Line(digit_token)
    block_header = Line(letter_token) & block
    line = Or(statement, block_header) > list

    # and a block is simply a collection of lines, as above
    block += Block(line[1:])

    program = Trace(line[1:])
    program.config.lines(block_policy=1)
    return program
def and_(matcher_a, matcher_b):
    '''
    Combine two matchers.
    '''
    (requireda, optionala) = non_optional_copy(matcher_a)
    (requiredb, optionalb) = non_optional_copy(matcher_b)

    if not optionala and not optionalb:
        return And(matcher_a, separator, matcher_b)

    # one alternative per optional side, each keeping the separator
    # attached to the optional part
    alternatives = []
    if optionala:
        alternatives.append(
            And(Optional(And(requireda, separator)), requiredb))
    if optionalb:
        alternatives.append(
            And(requireda, Optional(And(separator, requiredb))))
    matcher = Or(*alternatives)

    if optionala and optionalb:
        # making this explicit allows chaining (we can detect it
        # when called again in a tree of "ands")
        matcher = Optional(matcher)
    return matcher
def test_simple(self):
    '''
    Match 'a' against three alternatives and check the expected results.
    '''
    matcher = Or(Any('x'), Any('a'), Any())
    expected = [['a'], ['a']]
    self.assert_direct('a', matcher, expected)
def UnsignedFloat(decimal='.'):
    '''Match a sequence of digits that may include a decimal point.'''
    # optional leading digits, a decimal point, then digits
    with_fraction = Join(Optional(UnsignedInteger()),
                         Any(decimal),
                         UnsignedInteger())
    # digits, optionally followed by a bare decimal point
    without_fraction = Join(UnsignedInteger(), Optional(Any(decimal)))
    return Or(with_fraction, without_fraction)
def Newline():
    '''Match newline (Unix) or carriage return newline (Windows)'''
    unix = Literal('\n')
    windows = Literal('\r\n')
    return Or(unix, windows)
def test_nfa(self): first = Join(UnsignedFloat(), Any('eE'), SignedInteger()) second = UnsignedFloat() all = Or(first, second) all.config.clear().compile_to_nfa() m = all.get_parse()