def clone_and(use, original, *matchers): ''' We can convert an And only if all the sub-matchers have possible regular expressions, and even then we must tag the result unless an add transform is present. ''' wrapper = original.wrapper.functions add_reqd = True if wrapper: if wrapper[0] is add: wrapper = wrapper[1:] add_reqd = False else: raise Unsuitable try: # combine all (use, regexps) = \ RegexpContainer.to_regexps(use, matchers, have_add=None) # if we have regexp sub-expressions, join them regexp = Sequence(alphabet_, *regexps) log.debug(format('And: cloning {0}', regexp)) return RegexpContainer.build(original, regexp, alphabet_, regexp_type, use, add_reqd=add_reqd, wrapper=wrapper) except Unsuitable: # combine contiguous matchers where possible if add_reqd: raise def unpack(matcher): original = RegexpContainer.to_matcher(matcher) try: return (original, RegexpContainer.to_regexps(use, [matcher], have_add=None)[1][0]) except Unsuitable: return (original, None) output = [] (regexps, originals) = ([], []) for (matcher, regexp) in [unpack(matcher) for matcher in matchers]: if regexp: regexps.append(regexp) originals.append(matcher) else: if len(regexps) > 1: # combine regexps output.append( regexp_type(Sequence(alphabet_, *regexps), alphabet_)) else: output.extend(originals) output.append(matcher) (regexps, originals) = ([], []) if len(regexps) > 1: output.append( regexp_type(Sequence(alphabet_, *regexps), alphabet_)) else: output.extend(originals) merged = And(*output) return merged.compose(original.wrapper)
def and_(a, b): ''' Add space only in the case when both consume something. ''' return Or(And(Consumer(a), separator, Consumer(b)), And(Consumer(a), Consumer(b, False)), And(Consumer(a, False), Consumer(b)), And(Consumer(a, False), Consumer(b, False)))
def SkipTo(matcher, include=True): ''' Consume everything up to (and including, if include is True, as it is by default) the matcher. Returns all the skipped data, joined. ''' if include: return Add(And(Star(AnyBut(matcher)), matcher)) else: return And(Add(Star(AnyBut(matcher))), Lookahead(matcher))
def SingleLineString(quote='"', escape='\\', exclude='\n'): ''' Like `String`, but will not match across multiple lines. ''' q = Literal(quote) content = AnyBut(Or(q, Any(exclude))) if escape: content = Or(content, And(Drop(escape), q)) content = Repeat(content, add_=True) return And(Drop(q), content, Drop(q))
def String(quote='"', escape='\\'): ''' Match a string with quotes that can be escaped. This will match across newlines (see `SingleLineString` for an alternative). ''' q = Literal(quote) content = AnyBut(q) if escape: content = Or(And(Drop(escape), q), content) content = Repeat(content, add_=True) return And(Drop(q), content, Drop(q))
def SkipString(quote='"', escape='\\', ignore='\n', empty='', join=__add__): ''' Like `String`, matching across multiple lines, but will silently drop newlines. ''' q = Literal(quote) content = AnyBut(Or(q, Any(ignore))) if escape: content = Or(content, And(Drop(escape), q)) content = Or(content, Drop(Any(ignore))) content = Repeat(content, reduce=(empty, join)) return And(Drop(q), content, Drop(q))
def SingleLineString(quote='"', escape='\\', exclude='\n', empty='', join=__add__): ''' Like `String`, but will not match across multiple lines. ''' q = Literal(quote) content = AnyBut(Or(q, Any(exclude))) if escape: content = Or(content, And(Drop(escape), q)) content = Repeat(content, reduce=(empty, join)) return And(Drop(q), content, Drop(q))
def String(quote='"', escape='\\', empty='', join=__add__): ''' Match a string with quotes that can be escaped. This will match across newlines (see `SingleLineString` for an alternative). More generally, a string is a grouping of results. Setting `empty` and `join` correctly will allow this matcher to work with a variety of types. ''' q = Literal(quote) content = AnyBut(q) if escape: content = Or(And(Drop(escape), q), content) content = Repeat(content, reduce=(empty, join)) return And(Drop(q), content, Drop(q))
def UnsignedEFloat(decimal='.', exponent='eE'): ''' Match an `UnsignedFloat` followed by an optional exponent (e+02 etc). ''' return Join(UnsignedFloat(decimal), Optional(And(Any(exponent), SignedInteger())))
def UnsignedEReal(decimal='.', exponent='eE'): ''' Match an `UnsignedReal` followed by an optional exponent (e+02 etc). This will match both integer and float values. ''' return Join(UnsignedReal(decimal), Optional(And(Any(exponent), SignedInteger())))
def _match(self, stream_in): ''' Pull indent and call the policy and update the global value, then evaluate the contents. ''' # detect a nested call key = s_key(stream_in) if key in self.__streams: self._debug('Avoided left recursive call to Block.') return self.__streams.add(key) try: ((tokens, token_stream), _) = s_next(stream_in) (indent, _) = s_line(token_stream, True) if START not in tokens: raise StopIteration current = self.__monitor.indent policy = self.policy(current, indent) generator = And(*self.lines)._match(stream_in) while True: self.__monitor.push_level(policy) try: results = yield generator finally: self.__monitor.pop_level() yield results finally: self.__streams.remove(key)
def _replacements(self, separator): ''' Require the separator on each `And`. ''' # Handle circular dependencies from lepl.matchers.combine import And return (lambda a, b: And(a, separator, b), self._repeat(separator))
def __init__(self): super(TokenNamespace, self).__init__({ ADD: lambda a, b: Add(And(a, b)), AND: And, OR: Or, APPLY: Apply, APPLY_RAW: lambda a, b: Apply(a, b, raw=True), NOT: Drop, KARGS: KApply, RAISE: lambda a, b: KApply(a, raise_error(b)), REPEAT: RepeatWrapper, FIRST: First, MAP: Map, REDUCE: None, })
def AnyBut(exclude=None): ''' Match any character except those specified (or, if a matcher is used as the exclude, if the matcher fails). The argument should be a list of tokens (or a string of suitable characters) to exclude, or a matcher. If omitted all tokens are accepted. ''' return And(~Lookahead(coerce_(exclude, Any)), Any())
def repeat(m, st=0, sp=None, d=0, s=None, a=False, r=None): ''' Wrap `Repeat` to adapt the separator. ''' if s is None: s = separator elif not a: s = And(separator, s, separator) return RepeatWrapper(m, st, sp, d, s, a, r)
def and_(matcher_a, matcher_b): ''' Combine two matchers. ''' (requireda, optionala) = non_optional_copy(matcher_a) (requiredb, optionalb) = non_optional_copy(matcher_b) if not (optionala or optionalb): return And(matcher_a, separator, matcher_b) else: matcher = Or(*filter((lambda x: x is not None), [ And(Optional(And(requireda, separator)), requiredb ) if optionala else None, And(requireda, Optional(And(separator, requiredb)) ) if optionalb else None ])) if optionala and optionalb: # making this explicit allows chaining (we can detect it # when called again in a tree of "ands") matcher = Optional(matcher) return matcher
def __init__(self): # Handle circular dependencies from lepl.matchers.error import raise_error from lepl.matchers.derived import Space, Add, Apply, KApply, Drop, Map from lepl.matchers.combine import And, Or, First super(OperatorNamespace, self).__init__({ SPACE_OPT: lambda a, b: And(a, Space()[0:, ...], b), SPACE_REQ: lambda a, b: And(a, Space()[1:, ...], b), ADD: lambda a, b: Add(And(a, b)), AND: And, OR: Or, APPLY: Apply, APPLY_RAW: lambda a, b: Apply(a, b, raw=True), NOT: Drop, KARGS: KApply, RAISE: lambda a, b: KApply(a, raise_error(b)), REPEAT: RepeatWrapper, FIRST: First, MAP: Map, REDUCE: None, })
def Word(chars=NfaRegexp('[^%s]' % whitespace), body=None): ''' Match a sequence of non-space characters, joining them together. chars and body, if given as strings, define possible characters to use for the first and rest of the characters in the word, respectively. If body is not given, then chars is used for the entire word. They can also specify matchers, which typically should match only a single character. So ``Word(Upper(), Lower())`` would match names that being with an upper case letter, for example, while ``Word(AnyBut(Space()))`` (the default) matches any sequence of non-space characters. ''' chars = coerce_(chars, Any) body = chars if body is None else coerce_(body, Any) return Add(And(chars, Star(body)))
def Repeat(matcher, start=0, stop=None, algorithm=DEPTH_FIRST, separator=None, add_=False): ''' This is called by the [] operator. It repeats the given matcher between start and stop number of times (inclusive). If ``add`` is true then the results are joined with `Add`. If ``separator`` is given then each repetition is separated by that matcher. ''' first = coerce_(matcher) if separator is None: rest = first else: rest = And(coerce_(separator, Regexp), first) if start is None: start = 0 assert_type('The start index for Repeat or [...]', start, int) assert_type('The stop index for Repeat or [...]', stop, int, none_ok=True) assert_type('The algorithm/increment for Repeat or [...]', algorithm, str) if start < 0: raise ValueError('Repeat or [...] cannot have a negative start.') if stop is not None and stop < start: raise ValueError('Repeat or [...] must have a stop ' 'value greater than or equal to the start.') if 'dbgn'.find(algorithm) == -1: raise ValueError('Repeat or [...] must have a step (algorithm) ' 'of d, b, g or n.') add_ = Add if add_ else Identity return {DEPTH_FIRST: add_(DepthFirst(first=first, start=start, stop=stop, rest=rest)), BREADTH_FIRST: add_(BreadthFirst(first=first, start=start, stop=stop, rest=rest)), GREEDY: add_(OrderByResultCount(BreadthFirst(first=first, start=start, stop=stop, rest=rest))), NON_GREEDY: add_(OrderByResultCount(BreadthFirst(first=first, start=start, stop=stop, rest=rest), False)) }[algorithm]
def __init__(self, clean_html=True): self.clean_html = clean_html self._punctuation = '!"#&\'()*+,.;<=>?@[\\]^_`{|}~' self._lctx_1_exceptions = set('/ :'.split()) self._lctx_2_exceptions = set('discount redeem voucher'.split()) self._rctx_1_exceptions = set('/ : th am pm hour hours %'.split()) self._rctx_2_exceptions = set('discount redeem voucher'.split()) # LEPL Real Number Matchers (w/thousands) _comma_three_digits = Join(Drop(','), Add(Digit()[3]))[:] _thousand_group = Or( Join(_comma_three_digits, Any('.'), UnsignedInteger()), Join(_comma_three_digits, Optional(Any('.')))) _real = Or(Join(UnsignedInteger(), _thousand_group), UnsignedReal()) >> float _any = Join(Star(AnyBut(_real))) self._real_partition_matcher = Star(And(_any, _real, _any)) self._real_simple_matcher = _real[:, Drop( Star(Or(Whitespace(), Any(',-'))) )]
def _match(self, stream_in): ''' Pull indent and call the policy and update the global value, then evaluate the contents. ''' # detect a nested call (_line_no, _line_off, char_off, _desc, _text) = stream_in.location if char_off in self.__streams: self._debug('Avoided left recursive call to Block.') return self.__streams.add(char_off) try: (indent, _stream) = yield self.indent._match(stream_in) current = self.__monitor.indent self.__monitor.push_level(self.policy(current, indent)) # this flags we have pushed and need to pop self.__monitor = None generator = And(*self.lines)._match(stream_in) while True: yield (yield generator) finally: self.__streams.remove(char_off)
def __build_matcher(self, stream_in): ''' Build a matcher that, when it is evaluated, will return the matcher results for the columns. We base this on `And`, but need to force the correct streams. ''' def force_out(replacement): ''' Generate a transformer function that replaces the stream_out. ''' def replace_out(_stream, matcher): (results, _stream_out) = matcher() return (results, replacement) return replace_out # left and right are the indices for the column # matchers is the list of matchers that will be joined by And # previous is the "column before", which must be modified so that # it returns the correct stream_out for the next matcher right, matchers, previous = 0, [], Empty() columns = list(zip(self.indices, self.matchers)) if self.skip: # this takes the entire stream_in and applies it to skip columns.append(((0, None), Drop(self.skip))) else: # this takes everything to the right of the previous column columns.append((None, Empty())) for (col, matcher) in columns: try: (left, right) = col except TypeError: left = right right = None if col is None else right + col matchers.append(Transform(previous, force_out(stream_in[left:right]))) previous = matcher matchers.append(previous) return And(*matchers)
def SignedInteger(): '''Match a sequence of digits with an optional initial sign.''' return Add(And(Optional(Any('+-')), UnsignedInteger()))
def test_simple(self): #basicConfig(level=DEBUG) self.assert_join([1], Any(), [[1]]) self.assert_join([1,2], And(Any(), Any()), [[1, 2]]) self.assert_join([1,2,3], And(Any(), Any()), [[1, 2]]) self.assert_join([1], And(Any(), Any()), [])
def Join(*matchers): ''' Combine many matchers together with Add(And(...)). It can be used indirectly by placing ``+`` between matchers. ''' return Add(And(*matchers))
def clone_and(use, original, *matchers): ''' We can convert an And only if all the sub-matchers have possible regular expressions, and even then we must tag the result unless an add transform is present. ''' if hasattr(original, 'wrapper'): wrapper = original.wrapper.functions else: wrapper = None add_reqd = True if wrapper: if wrapper[0] is add: wrapper = wrapper[1:] add_reqd = False else: raise Unsuitable try: # combine all (use, regexps) = \ RegexpContainer.to_regexps(use, matchers, have_add=None) # if we have regexp sub-expressions, join them regexp = Sequence(alphabet_, *regexps) log.debug(fmt('And: cloning {0}', regexp)) return RegexpContainer.build(original, regexp, alphabet_, regexp_type, use, add_reqd=add_reqd, wrapper=wrapper) except Unsuitable: # combine contiguous matchers where possible if add_reqd: raise def unpack(matcher): original = RegexpContainer.to_matcher(matcher) try: return (original, RegexpContainer.to_regexps(use, [matcher], have_add=None)[1][0]) except Unsuitable: return (original, None) output = [] (regexps, originals) = ([], []) for (matcher, regexp) in [unpack(matcher) for matcher in matchers]: if regexp: regexps.append(regexp) originals.append(matcher) else: if len(regexps) > 1: # combine regexps output.append( regexp_type(Sequence(alphabet_, *regexps), alphabet_)) else: output.extend(originals) output.append(matcher) (regexps, originals) = ([], []) if len(regexps) > 1: output.append( regexp_type(Sequence(alphabet_, *regexps), alphabet_)) else: output.extend(originals) merged = And(*output) return merged.compose(original.wrapper)
def Repeat(matcher, start=0, stop=None, limit=None, algorithm=DEPTH_FIRST, separator=None, add_=False, reduce=None): ''' This is called by the [] operator. It repeats the given matcher between `start` and `stop` number of times (inclusive). If `limit` is given it is an upper limit on the number of different results returned on backtracking. `algorithm` selects the repeat algorithm to use. If `separator` is given then each repetition is separated by that matcher. If `add_` is true then the results are joined with `Add` (once all results are obtained). If `reduce` is given it should be a pair (zero, join) where `join(results, next)` is used to accumulate results and `zero` is the initial value of `results`. This is implemented via `Reduce`. `reduce` and `add_` cannot be given together. ''' first = coerce_(matcher) if separator is None: rest = first else: rest = And(coerce_(separator, Regexp), first) if start is None: start = 0 # allow duck typing (mutable values - IntVar etc) # assert_type('The start index for Repeat or [...]', start, int) # assert_type('The stop index for Repeat or [...]', stop, int, none_ok=True) # assert_type('The limit value (step index) for Repeat or [...]', limit, int, none_ok=True) # assert_type('The algorithm (step index) for Repeat or [...]', algorithm, str) # if start < 0: # raise ValueError('Repeat or [...] cannot have a negative start.') # if stop is not None and stop < start: # raise ValueError('Repeat or [...] must have a stop ' # 'value greater than or equal to the start.') # if 'dbgn'.find(algorithm) == -1: # raise ValueError('Repeat or [...] must have a step (algorithm) ' # 'of d, b, g or n.') if add_ and reduce: raise ValueError('Repeat cannot apply both add_ and reduce') elif add_: process = Add elif reduce: process = lambda r: Reduce(r, reduce[0], reduce[1]) else: process = Identity matcher = { DEPTH_FIRST: process(DepthFirst(first=first, start=start, stop=stop, rest=rest)), BREADTH_FIRST: process(BreadthFirst(first=first, start=start, stop=stop, rest=rest)), GREEDY: process( OrderByResultCount( BreadthFirst(first=first, start=start, stop=stop, rest=rest))), NON_GREEDY: process( OrderByResultCount( BreadthFirst(first=first, start=start, stop=stop, rest=rest), False)) }[algorithm] if limit is not None: matcher = Limit(matcher, count=limit) return matcher