def test_shunting_yard_postfix_re():
    """The shunting-yard algorithm emits the regexp tokens in postfix order."""
    expected = [
        "1", "2", ".", "?",
        "3", "4", ".", "5", ".", "*", "?", ".",
        "6", "7", ".", "+", ".",
        "8", ".", "9", ".",
    ]
    tokens = tokenizer_re("(12)?(345)*?(67)+89")
    obtained = list(shunting_yard_postfix(tokens, MAP_OPERATORS_RE))
    assert obtained == expected
def test_rpn_deque_ast():
    """Building an AST through an RpnDequeAst output while shunting-yarding."""
    ast = Ast()
    sink = RpnDequeAst(map_operators=MAP_OPERATORS_RE, ast=ast)
    result = shunting_yard_postfix(
        tokenizer_re("(a?b)*?c+d"),
        MAP_OPERATORS_RE,
        output=sink
    )
    assert num_vertices(ast) == 11
    assert num_edges(ast) == 10
    # The output deque must contain exactly the AST root.
    [root] = result
    assert root == 10
    # Smoke-test: rendering the AST must not raise.
    from pybgl.graphviz import graph_to_html
    graph_to_html(ast)
def test_tokenizer_re_implicit():
    """The tokenizer inserts the implicit concatenation operator '.'."""
    cases = {
        "123?(4|5)*67": "1.2.3?.(4|5)*.6.7",
        "(1?2)*?3+4": "(1?.2)*?.3+.4",
        "a\\dx": "a.\\d.x",
        "a\\d+x": "a.\\d+.x",
        "a[0-9]x": "a.[0-9].x",
        "a[0-9]+x": "a.[0-9]+.x",
        "a{1,2}+x": "a{1,2}+.x",
    }
    for regexp, expected in cases.items():
        assert "".join(tokenizer_re(regexp)) == expected
def test_tokenizer_re_classes():
    """Character classes like [0-9] are kept as single tokens."""
    # Explicit concatenation (cat=None disables the implicit '.').
    explicit_cases = [
        ("a.[0-9].b", ["a", ".", "[0-9]", ".", "b"]),
        ("a.[^0-9].b", ["a", ".", "[^0-9]", ".", "b"]),
        ("a.[(].b", ["a", ".", "[(]", ".", "b"]),
        ("[a-z].[0-9]", ["[a-z]", ".", "[0-9]"]),
    ]
    for regexp, expected in explicit_cases:
        assert list(tokenizer_re(regexp, cat=None)) == expected
    # Implicit concatenation (default behavior).
    implicit_cases = [
        ("a[0-9]b", ["a", ".", "[0-9]", ".", "b"]),
        ("a[^0-9]b", ["a", ".", "[^0-9]", ".", "b"]),
        ("a[(]b", ["a", ".", "[(]", ".", "b"]),
        ("[a-z][0-9]", ["[a-z]", ".", "[0-9]"]),
    ]
    for regexp, expected in implicit_cases:
        assert list(tokenizer_re(regexp)) == expected
def test_tokenizer_re_char_repetitions():
    """Repetition ranges like {1,3} are kept as single tokens."""
    # Explicit concatenation (cat=None disables the implicit '.').
    explicit_cases = [
        ("x{1,3}.y", ["x", "{1,3}", ".", "y"]),
        ("x{3}.y", ["x", "{3}", ".", "y"]),
        ("x{3,}.y", ["x", "{3,}", ".", "y"]),
    ]
    for regexp, expected in explicit_cases:
        assert list(tokenizer_re(regexp, cat=None)) == expected
    # Implicit concatenation (default behavior).
    implicit_cases = [
        ("x{1,3}y", ["x", "{1,3}", ".", "y"]),
        ("x{3}y", ["x", "{3}", ".", "y"]),
        ("x{3,}y", ["x", "{3,}", ".", "y"]),
    ]
    for regexp, expected in implicit_cases:
        assert list(tokenizer_re(regexp)) == expected
def test_tokenizer_re_escape_sequence():
    """Escape sequences like \\d are kept as single tokens."""
    # Explicit concatenation (cat=None disables the implicit '.').
    explicit_cases = [
        ("a.\\d.b", ["a", ".", "\\d", ".", "b"]),
        ("a.\\s.b", ["a", ".", "\\s", ".", "b"]),
        ("a.\\w.b", ["a", ".", "\\w", ".", "b"]),
    ]
    for regexp, expected in explicit_cases:
        assert list(tokenizer_re(regexp, cat=None)) == expected
    # Implicit concatenation (default behavior).
    implicit_cases = [
        ("a\\db", ["a", ".", "\\d", ".", "b"]),
        ("a\\sb", ["a", ".", "\\s", ".", "b"]),
        ("a\\wb", ["a", ".", "\\w", ".", "b"]),
    ]
    for regexp, expected in implicit_cases:
        assert list(tokenizer_re(regexp)) == expected
def thompson_compile_nfa(expression: str, whole_alphabet=None) -> tuple:
    """
    Compile a regular expression into a NFA using Thompson's construction.

    Args:
        expression (str): The regular expression to compile.
        whole_alphabet: The alphabet used to expand negated classes
            (e.g. ``[^0-9]``) and escape sequences. Defaults to
            ``DEFAULT_ALPHABET``.

    Returns:
        A ``(nfa, q0, f)`` tuple, where ``nfa`` is the resulting NFA,
        ``q0`` its initial state, and ``f`` its final state.
        (The original annotation ``-> Nfa`` was wrong: every branch,
        including the empty-expression one, returns a 3-tuple.)
    """
    if not expression:
        # The empty regexp matches only the empty word: a single state
        # that is both initial (0) and final.
        g = Nfa(1)
        set_final(0, g)
        return (g, 0, 0)
    if whole_alphabet is None:
        whole_alphabet = DEFAULT_ALPHABET
    # Tokenize with explicit concatenation so the shunting-yard sees '.'.
    expression = list(tokenizer_re(expression, cat="."))

    class ThompsonShuntingYardVisitor(DefaultShuntingYardVisitor):
        """Builds NFA fragments as tokens are pushed to the postfix output."""
        def __init__(self):
            self.cur_id = 0
            # Stack of (nfa, q0, f) fragments, combined as operators arrive.
            self.nfas = deque()

        def on_push_output(self, a):
            if a in {".", "|"}:
                # Binary operator: pop the two topmost fragments and combine.
                (nfa2, q02, f2) = self.nfas.pop()
                (nfa1, q01, f1) = self.nfas.pop()
                f = concatenation if a == "." else alternation
                (nfa1, q01, f1) = f(nfa1, q01, f1, nfa2, q02, f2)
            elif a in {"?", "*", "+"}:
                # Unary repetition operator: pop the topmost fragment.
                (nfa1, q01, f1) = self.nfas.pop()
                f = (
                    zero_or_one if a == "?"
                    else zero_or_more if a == "*"
                    else one_or_more
                )
                (nfa1, q01, f1) = f(nfa1, q01, f1)
            elif a[0] == "{":
                # Bounded repetition, e.g. "{2,5}" or "{3,}".
                (nfa1, q01, f1) = self.nfas.pop()
                (m, n) = parse_repetition(a)
                (nfa1, q01, f1) = repetition_range(nfa1, q01, f1, m, n)
            elif a[0] == "[":
                # Character class, e.g. "[0-9]" or "[^abc]".
                chars = parse_bracket(a, whole_alphabet)
                (nfa1, q01, f1) = bracket(chars)
            elif a[0] == "\\":
                # Escape sequence, e.g. "\\d", expanded over the alphabet.
                chars = parse_escaped(a, whole_alphabet)
                (nfa1, q01, f1) = bracket(chars)
            else:
                # Plain literal character.
                (nfa1, q01, f1) = literal(a)
            self.nfas.append((nfa1, q01, f1))

    vis = ThompsonShuntingYardVisitor()
    shunting_yard_postfix(expression, map_operators=MAP_OPERATORS_RE, vis=vis)
    # A well-formed regexp must reduce to exactly one NFA fragment.
    assert len(vis.nfas) == 1
    (nfa, q0, f) = vis.nfas.pop()
    return (nfa, q0, f)
def test_tokenizer_re_parenthesis():
    """Parentheses are emitted as standalone tokens."""
    # Explicit concatenation (cat=None disables the implicit '.').
    obtained = list(tokenizer_re("(a.b.c)+", cat=None))
    assert obtained == ["(", "a", ".", "b", ".", "c", ")", "+"]
    # Implicit concatenation (default behavior).
    obtained = list(tokenizer_re("(abc)+"))
    assert obtained == ["(", "a", ".", "b", ".", "c", ")", "+"]
def test_shunting_yard_ast():
    """shunting_yard_ast builds the AST of a regexp and returns its root."""
    tokens = tokenizer_re("(a?b)*?c+d")
    (ast, root) = shunting_yard_ast(tokens, MAP_OPERATORS_RE)
    assert num_vertices(ast) == 11
    assert num_edges(ast) == 10
    assert root == 10
def test_tokenizer_re_explicit():
    """With cat=None, only the operators present in the regexp are emitted."""
    expected = [
        "11", ".", "2", ".",
        "(", "3", "+", ".", "4", "*", ".", "5", "?", ")",
    ]
    obtained = list(tokenizer_re("11.2.(3+.4*.5?)", cat=None))
    assert obtained == expected