def __init__(self, matcher, tokens, alphabet, discard,
             t_regexp=None, s_regexp=None):
    '''
    matcher is the head of the original matcher graph, which will be
    called with a tokenised stream.

    tokens is the set of `Token` instances that define the lexer.

    alphabet is the alphabet for which the regexps are defined.

    discard is the regular expression for spaces (which are silently
    dropped if no token can be matched).

    t_regexp and s_regexp are internally compiled state, used in
    cloning, and should not be provided by non-cloning callers.
    '''
    super(Lexer, self).__init__(TOKENS, TokenNamespace)
    if t_regexp is None:
        # de-duplicate tokens by id to reduce work for the regexp compiler
        by_id = {}
        for tok in tokens:
            tok.compile(alphabet)
            self._debug(fmt('Token: {0}', tok))
            by_id[tok.id_] = tok
        pairs = [(tok.id_, tok.regexp)
                 for tok in by_id.values() if tok.regexp is not None]
        t_regexp = Compiler.multiple(alphabet, pairs).dfa()
    if s_regexp is None and discard is not None:
        s_regexp = Compiler.single(alphabet, discard).dfa()
    self._arg(matcher=matcher)
    self._arg(tokens=tokens)
    self._arg(alphabet=alphabet)
    self._arg(discard=discard)
    self._karg(t_regexp=t_regexp)
    self._karg(s_regexp=s_regexp)
def binary_parser(*regexps):
    '''
    Parse a set of binary regular expressions, returning the associated
    Regexp.
    '''
    labelled = []
    for (label, text) in regexps:
        parsed = __compiled_binary_parser(text)
        labelled.append(Labelled(BINARY, label, *parsed))
    return Compiler.multiple(BINARY, labelled)
def test_match_7(self):
    '''
    The class excluding SOL, EOL and 'a' matches the digit but stops
    before the EOL marker (left in the remaining stream).
    '''
    line_alphabet = LineAwareAlphabet(UnicodeAlphabet.instance(),
                                      make_str_parser)
    matcher = Compiler.single(line_alphabet, '[^(*SOL)(*EOL)a]*').dfa()
    outcome = list(matcher.match([str('1'), EOL]))
    expected = [[str('label')], [str('1')], [EOL]]
    assert outcome == expected, outcome
def do_test(self, pattern, target, dfa_result, nfa_result, parser_factory):
    '''
    Compile pattern for a line-aware alphabet and check both the DFA
    and NFA matchers against the expected results.

    dfa_result is either None (no match expected) or an (a, b, c)
    triple; nfa_result is a list of such triples.
    '''
    alphabet = LineAwareAlphabet(UnicodeAlphabet.instance(), parser_factory)
    compiler = Compiler.single(alphabet, pattern)
    # force evaluation of the expression (may raise for a bad pattern)
    str(compiler.expression)
    factory = LineAwareStreamFactory(alphabet)
    target = factory.from_string(target)
    dfa = compiler.dfa()
    result = dfa.match(target)
    if result:
        (a, b, c) = result
        (p, q, r) = dfa_result
        assert a == p, result
        assert b == q, result
        assert_str(repr(c), r)
    else:
        # identity comparison for None (PEP 8); == None was non-idiomatic
        assert dfa_result is None, dfa_result
    nfa = compiler.nfa()
    result = list(nfa.match(target))
    assert len(result) == len(nfa_result), result
    for ((a, b, c), (p, q, r)) in zip(result, nfa_result):
        assert a == p, result
        assert b == q, result
        assert_str(repr(c), r)
def do_test(self, pattern, target, dfa_result, nfa_result, parser_factory):
    '''
    Compile pattern for a line-aware alphabet and check both the DFA
    and NFA matchers against the expected results.

    dfa_result is either None (no match expected) or an (a, b, c)
    triple; nfa_result is a list of such triples.
    '''
    alphabet = LineAwareAlphabet(UnicodeAlphabet.instance(), parser_factory)
    compiler = Compiler.single(alphabet, pattern)
    # force evaluation of the expression (may raise for a bad pattern)
    str(compiler.expression)
    factory = LineAwareStreamFactory(alphabet)
    target = factory.from_string(target)
    dfa = compiler.dfa()
    result = dfa.match(target)
    if result:
        (a, b, c) = result
        (p, q, r) = dfa_result
        assert a == p, result
        assert b == q, result
        assert_str(repr(c), r)
    else:
        # identity comparison for None (PEP 8); == None was non-idiomatic
        assert dfa_result is None, dfa_result
    nfa = compiler.nfa()
    result = list(nfa.match(target))
    assert len(result) == len(nfa_result), result
    for ((a, b, c), (p, q, r)) in zip(result, nfa_result):
        assert a == p, result
        assert b == q, result
        assert_str(repr(c), r)
def _compile(self):
    '''
    Compile the matcher, caching the compiled NFA match function so
    repeated calls do not recompile.
    '''
    if self.__cached_matcher is None:
        compiled = Compiler.single(self.alphabet, self.regexp)
        self.__cached_matcher = compiled.nfa().match
    return self.__cached_matcher
def do_test(self, pattern, target, dfa_result, nfa_result):
    '''
    Compile pattern over the Unicode alphabet and verify both the NFA
    and DFA matchers against the expected results.
    '''
    compiler = Compiler.single(UnicodeAlphabet.instance(), pattern)
    expression_text = str(compiler.expression)
    # NOTE(review): 'format' here looks like the project helper (cf. 'fmt'
    # used elsewhere), not the builtin — confirm against the imports
    assert expression_text == format('(?P<label>{0!s})', pattern), \
        expression_text
    nfa_matches = list(compiler.nfa().match(target))
    assert nfa_matches == nfa_result, nfa_matches
    dfa_match = compiler.dfa().match(target)
    assert dfa_match == dfa_result, dfa_match
def do_test(self, pattern, target, dfa_result, nfa_result):
    '''
    Compile pattern over the Unicode alphabet and verify both the NFA
    and DFA matchers over a StringHelper-wrapped stream.
    '''
    compiler = Compiler.single(UnicodeAlphabet.instance(), pattern)
    expression_text = str(compiler.expression)
    assert expression_text == fmt('(?P<label>{0!s})', pattern), \
        expression_text
    # a fresh stream is built for each matcher, as in the original test
    nfa_matches = list(compiler.nfa().match((0, StringHelper(target))))
    assert nfa_matches == nfa_result, nfa_matches
    dfa_match = compiler.dfa().match((0, StringHelper(target)))
    assert dfa_match == dfa_result, dfa_match
def __init__(self, matcher, tokens, alphabet, discard,
             t_regexp=None, s_regexp=None):
    '''
    matcher is the head of the original matcher graph, which will be
    called with a tokenised stream.

    tokens is the set of `Token` instances that define the lexer.

    alphabet is the alphabet for which the regexps are defined.

    discard is the regular expression for spaces (which are silently
    dropped if no token can be matched).

    t_regexp and s_regexp are internally compiled state, used in
    cloning, and should not be provided by non-cloning callers.
    '''
    super(Lexer, self).__init__(TOKENS, TokenNamespace)
    if t_regexp is None:
        # compile each token's regexp, keeping one token per id
        unique = {}
        for token in tokens:
            token.compile(alphabet)
            self._debug(fmt('Token: {0}', token))
            # this just reduces the work for the regexp compiler
            unique[token.id_] = token
        # build a single DFA over all distinct token regexps
        t_regexp = Compiler.multiple(
            alphabet,
            [(t.id_, t.regexp)
             for t in unique.values() if t.regexp is not None]).dfa()
    if s_regexp is None and discard is not None:
        s_regexp = Compiler.single(alphabet, discard).dfa()
    # record constructor arguments — presumably used by the graph/cloning
    # machinery; verify against the base class
    self._arg(matcher=matcher)
    self._arg(tokens=tokens)
    self._arg(alphabet=alphabet)
    self._arg(discard=discard)
    self._karg(t_regexp=t_regexp)
    self._karg(s_regexp=s_regexp)
def binary_single_parser(label, text):
    '''
    Parse a binary regular expression, returning the associated Regexp.
    '''
    parsed = __compiled_binary_parser(text)
    labelled = Labelled(BINARY, label, *parsed)
    return Compiler.single(BINARY, labelled)
def _test_parser(regexp):
    '''
    Compile the given regular expression over the Unicode alphabet.
    '''
    compiled = Compiler.single(UNICODE, regexp)
    return compiled
def test_match_2(self):
    '''
    A single [^a] matches just the first character of '123a'.
    '''
    line_alphabet = LineAwareAlphabet(UnicodeAlphabet.instance(),
                                      make_str_parser)
    matcher = Compiler.single(line_alphabet, '[^a]').nfa()
    outcome = list(matcher.match(str('123a')))
    expected = [(str('label'), str('1'), str('23a'))]
    assert outcome == expected, outcome
def test_match_5(self):
    '''
    [^a]* over a stream containing SOL consumes up to (not including)
    the 'a'.
    '''
    line_alphabet = LineAwareAlphabet(UnicodeAlphabet.instance(),
                                      make_str_parser)
    matcher = Compiler.single(line_alphabet, '[^a]*').dfa()
    outcome = list(matcher.match([SOL, str('1'), str('a')]))
    expected = [[str('label')], [SOL, str('1')], [str('a')]]
    assert outcome == expected, outcome
def test_match_3(self):
    '''
    [^a]* matches the leading digits of '123a', leaving the 'a'.
    '''
    line_alphabet = LineAwareAlphabet(UnicodeAlphabet.instance(),
                                      make_str_parser)
    matcher = Compiler.single(line_alphabet, '[^a]*').dfa()
    outcome = list(matcher.match(str('123a')))
    expected = [[str('label')], str('123'), str('a')]
    assert outcome == expected, outcome