def test_dfa_nfa_intersection():
    """Intersect a minimal DFA built from a regex with a hand-built NFA."""
    nfa1 = NondeterministicFiniteAutomaton()
    q0 = finite_automaton.State(0)
    q1 = finite_automaton.State(1)
    q2 = finite_automaton.State(2)
    q3 = finite_automaton.State(3)
    # (source, symbol, target) triples, registered in this order
    edges = [
        (q0, symb_a, q1),
        (q0, symb_c, q2),
        (q1, symb_a, q1),
        (q1, symb_b, q2),
        (q2, symb_a, q0),
        (q0, symb_c, q3),
        (q3, symb_a, q1),
    ]
    for source, symbol, target in edges:
        nfa1.add_transition(source, symbol, target)
    nfa1.add_start_state(q0)
    nfa1.add_final_state(q1)

    pattern = ("((((b.a)|(((a|(b.c))|(c.a)).(((b.c))*." +
               "(b.a)))))*.(((a|(b.c))|(c.a)).((b.c))*))")
    dfa = Regex(pattern).to_epsilon_nfa().to_deterministic().minimize()
    dnfa = dfa.get_intersection(nfa1)

    accepted_words = [
        [symb_a],
        [symb_c, symb_a],
        [symb_a, symb_b, symb_a, symb_a],
    ]
    rejected_words = [
        [symb_c, symb_b],
        [symb_a, symb_a],
        [symb_b],
    ]
    for word in accepted_words:
        assert dnfa.accepts(word)
    for word in rejected_words:
        assert not dnfa.accepts(word)
def test_intersection(self):
    """ Tests CFG intersection with a regex and with its epsilon-NFA """
    regex = Regex("a*b*")
    enfa = regex.to_epsilon_nfa()
    symb_a, symb_b = Symbol("a"), Symbol("b")
    self.assertTrue(enfa.accepts([symb_a, symb_a, symb_b, symb_b]))
    self.assertFalse(enfa.accepts([symb_b, symb_b, symb_a]))

    ter_a, ter_b = Terminal("a"), Terminal("b")
    var_s = Variable("S")
    cfg = CFG(
        productions={
            Production(var_s, [ter_a, var_s, ter_b]),
            Production(var_s, [ter_b, var_s, ter_a]),
            Production(var_s, []),
        },
        start_symbol=var_s,
    )
    balanced = [ter_a, ter_a, ter_b, ter_b]
    unbalanced = [ter_a, ter_a, ter_b]
    self.assertTrue(cfg.contains(balanced))
    self.assertFalse(cfg.contains(unbalanced))

    # The same expectations hold for the regex and for the automaton
    for other in (regex, enfa):
        cfg_i = cfg.intersection(other)
        self.assertTrue(cfg_i.contains(balanced))
        self.assertFalse(cfg_i.contains(unbalanced))
        self.assertTrue(cfg_i.contains([]))
def from_regex_file(path: str):
    """Build the result of ``from_dfa`` from the regex on a file's first line.

    The first line of the file at ``path`` is parsed as a ``Regex``,
    converted to an epsilon-NFA, determinized and minimized. The minimal
    DFA is handed to ``from_dfa`` and that call's result is returned.

    Parameters
    ----------
    path : str
        Path of the file whose first line holds the regex.
    """
    # The context manager closes the file even when reading raises
    with open(path) as file:
        first_line = file.readline()
    dfa: DeterministicFiniteAutomaton = (
        Regex(first_line).to_epsilon_nfa().to_deterministic().minimize()
    )
    return from_dfa(dfa)
def test_creation(self):
    """Exercise the ways of constructing a RecursiveAutomaton."""
    # S -> a S b | a b
    enfa = Regex("a S b | a b").to_epsilon_nfa()
    dfa = enfa.minimize()
    box = Box(dfa, Symbol("S"))
    rsa_1 = RecursiveAutomaton({Symbol("S")}, Symbol("S"), {box})
    self.assertEqual(rsa_1.get_number_of_boxes(), 1)
    self.assertEqual(box, rsa_1.get_box(Symbol("S")))
    self.assertEqual(rsa_1.labels, {Symbol("S")})
    self.assertEqual(rsa_1.initial_label, Symbol("S"))

    # Building the same automaton step by step must give an equal object
    rsa_2 = RecursiveAutomaton()
    rsa_2.add_box(box)
    rsa_2.change_initial_label(Symbol("S"))
    self.assertEqual(rsa_2, rsa_1)

    # Checking to add a start label
    rsa_3 = RecursiveAutomaton(set(), Symbol("S"), {box})
    self.assertEqual(rsa_3.labels, {Symbol("S")})

    # Label "v" is declared without a box. assertRaises fails the test when
    # no ValueError is raised, unlike a try/except that passes either way.
    with self.assertRaises(ValueError):
        RecursiveAutomaton({Symbol("S"), Symbol("v")}, Symbol("S"), {box})
def test_is_equivalent_to(self):
    """Automata built from two different regexes must not compare equal."""
    start_label = Symbol("S")
    # S -> a* b*
    star_rsa = RecursiveAutomaton.from_regex(Regex("a* b*"), start_label)
    # S -> a+ b+
    plus_rsa = RecursiveAutomaton.from_regex(Regex("a a* b b*"), start_label)
    self.assertNotEqual(star_rsa, plus_rsa)
def test_regex_to_epsilon_nfa():
    """Convert "(a|a a b)*" to an epsilon-NFA and probe its size and words."""
    enfa = Regex("(a|a a b)*").to_epsilon_nfa()
    assert len(enfa.states) == 12
    assert enfa.accepts([symb_a])
    assert enfa.accepts([symb_a, symb_a, symb_b, epsilon])
    assert not enfa.accepts([symb_c])
    assert enfa.accepts([epsilon])
    assert not enfa.accepts([symb_b, symb_a])
def test_dfa_intersection():
    """Intersect two minimal DFAs built from regexes and probe the result."""
    left = DFA.min_dfa_from_regex(Regex("(a |a b c)*"))
    right = DFA.min_dfa_from_regex(Regex("(a* | (a | b)*) c"))
    new_dfa = left.get_intersection(right)

    abc = [symb_a, symb_b, symb_c]
    assert new_dfa.accepts(abc)
    assert new_dfa.accepts([symb_a] + abc)
    assert not new_dfa.accepts(abc + abc)
def test_from_regex(self):
    """from_regex must equal an automaton assembled by hand from one box."""
    start_label = Symbol("S")
    # S -> a*
    built = RecursiveAutomaton.from_regex(Regex("a*"), start_label)

    minimal = Regex("a*").to_epsilon_nfa().minimize()
    manual_box = Box(minimal, start_label)
    manual = RecursiveAutomaton({start_label}, start_label, {manual_box})
    self.assertEqual(built, manual)
def from_text(cls, text: List[str]):
    """Build an instance from lines of the form ``HEAD BODY``.

    Each line is split on its first space into a head and an optional
    regex body; ``eps`` in the body is rewritten to ``epsilon`` before
    parsing. Every body becomes a minimal DFA, and the states of all DFAs
    are numbered consecutively to index one boolean matrix per edge symbol.
    The head of the first line is used as the start symbol, and a line
    without a body additionally records an empty production for its head.
    """
    start_symbol = None
    eps_productions = []
    productions_with_dfa = []
    for line in text:
        pieces = line.strip().split(' ', 1)
        body_pieces = pieces[1:]
        regex = Regex(' '.join(body_pieces).replace('eps', 'epsilon'))
        head = Variable(pieces[0])
        if start_symbol is None:
            start_symbol = head
        if not body_pieces:
            eps_productions.append(Production(head, []))
        minimal_dfa = regex.to_epsilon_nfa().to_deterministic().minimize()
        productions_with_dfa.append((head, minimal_dfa))

    import wrappers.GraphWrapper
    rfa_graph = wrappers.GraphWrapper.empty()
    rfa_graph.matrix_size = sum(
        len(dfa.states) for _, dfa in productions_with_dfa)
    rfa_graph.vertices = set()
    empty_matrix = Matrix.sparse(types.BOOL,
                                 rfa_graph.matrix_size,
                                 rfa_graph.matrix_size)

    head_by_start_final_pair = {}
    next_index = 0
    for head, dfa in productions_with_dfa:
        transitions = dfa._transition_function._transitions
        # Give this DFA's states the next free block of global indices
        num_by_state = {
            state: number
            for number, state in enumerate(dfa.states, start=next_index)
        }
        next_index += len(num_by_state)
        rfa_graph.vertices.update(num_by_state.values())

        for start_state in dfa.start_states:
            rfa_graph.start_states.add(num_by_state[start_state])
        for final_state in dfa.final_states:
            final_index = num_by_state[final_state]
            rfa_graph.final_states.add(final_index)
            pair = (num_by_state[dfa.start_state], final_index)
            head_by_start_final_pair[pair] = head.value

        for state_from, outgoing in transitions.items():
            for edge_symb, state_to in outgoing.items():
                matrix = rfa_graph.label_to_bool_matrix.setdefault(
                    edge_symb, empty_matrix.dup())
                matrix[num_by_state[state_from],
                       num_by_state[state_to]] = True

    return cls(rfa_graph, head_by_start_final_pair, eps_productions,
               start_symbol)
def __init__(self, regex):
    """Compile ``regex`` to a minimal DFA and index its states.

    Stores the DFA, a state -> integer mapping, the number of states, and
    the integer indices of the start and final states.
    """
    minimal_dfa = Regex(regex).to_epsilon_nfa().to_deterministic().minimize()
    self.dfa = minimal_dfa
    self.vert_dict = {
        state: index for index, state in enumerate(minimal_dfa.states)
    }
    self.num_vert = len(self.vert_dict)
    index_of = self.vert_dict
    self.start_states = [index_of[st] for st in minimal_dfa.start_states]
    self.final_states = [index_of[st] for st in minimal_dfa.final_states]
def test_simple_regex():
    """Each basic regex must accept its paired sample word."""
    cases = [
        ("a", "a"),
        ("a b", "ab"),
        ("a | b", "b"),
        ("a*", "aaa"),
    ]
    for pattern, word in cases:
        regex = DifferentiableRegex(Regex(pattern))
        assert regex.accepts(word)
def read_grammar_with_regex(cls, name):
    """Read a grammar whose production bodies are regexes from a file.

    Every non-blank line is split on whitespace: the first token is the
    production head and the remaining tokens, joined by single spaces, are
    parsed as a ``Regex``. The head of the first production becomes the
    start symbol. Blank lines are skipped.

    Parameters
    ----------
    name : str
        Path of the grammar file.

    Returns
    -------
    cfg : CFG
        Grammar assembled from the productions, variables and terminals
        returned by ``CFGrammar.read_production_regex`` for each line.
    """
    next_id = 0  # counter threaded through read_production_regex
    terminals, variables, productions = set(), set(), set()
    start_symb = None
    with open(name, 'r') as file:
        productions_txt = file.readlines()
    for production_txt in productions_txt:
        tokens = production_txt.split()
        if not tokens:
            # A blank line has no head; indexing tokens[0] would raise
            continue
        head = Variable(tokens[0])
        body = ' '.join(tokens[1:])
        if start_symb is None:
            start_symb = head
        new_productions, new_variables, new_terminals, next_id = \
            CFGrammar.read_production_regex(head, Regex(body), next_id)
        productions |= new_productions
        variables |= new_variables
        terminals |= new_terminals
    return CFG(variables, terminals, start_symb, productions)
def from_cfg(cls, cfg: CFG):
    """ Build a recursive automaton out of a context-free grammar

    The grammar text is first passed through
    ``remove_repetition_of_nonterminals_from_productions``; each resulting
    line ``head -> body`` then yields one box whose DFA is the minimized
    automaton of ``body`` read as a regex. An empty body is replaced by the
    textual notation of epsilon.

    Parameters
    -----------
    cfg : :class:`~pyformlang.cfg.CFG`
        The context-free grammar

    Returns
    -----------
    rsa : :class:`~pyformlang.rsa.RecursiveAutomaton`
        The recursive automaton built from the grammar
    """
    initial_label = to_symbol(cfg.start_symbol)
    normalized_text = remove_repetition_of_nonterminals_from_productions(
        cfg.to_text())
    epsilon_text = Epsilon().to_text()

    labels = set()
    boxes = set()
    for line in normalized_text.splitlines():
        head_text, body_text = line.split(" -> ")
        labels.add(to_symbol(head_text))
        regex_text = epsilon_text if body_text == "" else body_text
        box_dfa = Regex(regex_text).to_epsilon_nfa().minimize()
        boxes.add(Box(box_dfa, to_symbol(head_text)))
    return RecursiveAutomaton(labels, initial_label, boxes)
def read_cfg(cls, text, start_symbol=cfg.Variable("S"),
             contains_regexes=False, track_variables=False):
    """Parse grammar text, optionally expanding regex production bodies.

    With ``track_variables`` set, the first space-separated token of every
    line is registered as a variable up front, and variables reported by
    ``_read_line`` are discarded. With ``contains_regexes`` set, a line
    whose body is longer than one character and which contains one of
    ``* | + ?`` is parsed as a Python regex and expanded through
    ``_create_cfg_from_regex``; every other non-blank line goes to
    ``_read_line``.
    """
    regex_markers = ['*', '|', '+', '?', ]
    variables, productions, terminals = set(), set(), set()
    lines = text.splitlines()

    if track_variables:
        variables.update(
            cfg.Variable(line.strip().split(' ', 1)[0]) for line in lines)

    for line in lines:
        # Condition order matters: the second test guards the [1] index
        is_regex_line = (
            contains_regexes
            and len(line.split()) > 1
            and len(line.strip().split(' ', 1)[1]) > 1
            and any(symb in line for symb in regex_markers)
        )
        if not is_regex_line:
            stripped = line.strip()
            if not stripped:
                continue
            # When tracking, variables found by _read_line go to a throwaway set
            sink = set() if track_variables else variables
            cls._read_line(stripped, productions, terminals, sink)
            continue

        raw_head, *raw_body = line.strip().split(' ', 1)
        regex = Regex.from_python_regex(' '.join(raw_body))
        head = cfg.Variable(raw_head)
        regex_cfg = cls._create_cfg_from_regex(head, regex, track_variables)
        terminals.update(regex_cfg.terminals)
        productions.update(regex_cfg.productions)
        variables.update(regex_cfg.variables)

    return cls(variables=variables, terminals=terminals,
               productions=productions, start_symbol=start_symbol)
def test_comp_regex_redundant():
    """Redundant spellings of a* must each accept every sample word."""
    patterns = ['a *', 'a | a * | a | a *', 'a * * * * *', '(a | ) *']
    words = ['', 'a', 'aaa', 'aaaaaaaa']
    for pattern in patterns:
        compiled = DifferentiableRegex(Regex(pattern))
        for word in words:
            assert compiled.accepts(word)
def _create_cfg_from_regex(cls, head: Variable, regex: Regex,
                           variables=None) -> CFG:
    """Build a right-linear CFG, rooted at ``head``, from ``regex``.

    The regex is compiled to a minimal DFA. Every DFA state gets a fresh
    variable named ``<state>:<counter>``; ``head`` derives the variable of
    each start state, every transition ``p --x--> q`` yields the
    production ``P -> x Q``, and every final state derives the empty word.

    Parameters
    ----------
    head : Variable
        Start symbol of the produced grammar.
    regex : Regex
        Regex describing the production body.
    variables : optional
        When truthy, an edge symbol is a variable iff its value is in this
        collection; otherwise iff its value is upper-case.

    Returns
    -------
    CFG
        The grammar; if no production was generated at all, a grammar
        holding only ``head -> epsilon``.
    """
    dfa = regex.to_epsilon_nfa().to_deterministic().minimize()
    transitions = dfa._transition_function._transitions
    state_to_var: Dict[State, Variable] = {}
    productions, terms, grammar_vars = set(), set(), set()

    for state in dfa.states:
        state_to_var[state] = Variable(f'{state}:{cls.__var_state_counter}')
        cls.__var_state_counter += 1
    grammar_vars.update(state_to_var.values())

    for start_state in dfa.start_states:
        productions.add(Production(head, [state_to_var[start_state]]))

    for state_from, outgoing in transitions.items():
        for edge_symb, state_to in outgoing.items():
            value = edge_symb.value
            if variables:
                is_variable = value in variables
            else:
                is_variable = value.isupper()
            if is_variable:
                symbol = Variable(value)
                grammar_vars.add(symbol)
            else:
                symbol = Terminal(value)
                terms.add(symbol)
            productions.add(Production(state_to_var[state_from],
                                       [symbol, state_to_var[state_to]]))

    # Emit the epsilon production for every final state, not only for those
    # that are the target of a transition: a final start state without an
    # incoming edge must still derive the empty word.
    for final_state in dfa.final_states:
        productions.add(Production(state_to_var[final_state], []))

    if not productions:
        return CFG(grammar_vars, terms, head, {Production(head, [])})
    return CFG(grammar_vars, terms, head, productions)
def from_str(st, py=True):
    """Compile a regex string into a ``Regexp`` over integer vertices.

    ``st`` is parsed as a Python regex when ``py`` is true and as a
    ``Regex`` otherwise. The minimal DFA is passed through
    ``SimpleGraph.dfa_normalize_states``; its transitions are collected as
    ``(source, label, target)`` triples with integer endpoints, and the
    vertex count is the largest endpoint plus one.
    """
    parsed = Regex.from_python_regex(st) if py else Regex(st)
    minimal = parsed.to_epsilon_nfa().to_deterministic().minimize()
    dfa, states_map = SimpleGraph.dfa_normalize_states(minimal)

    edges = []
    largest_vertex = 0
    for source, outgoing in dfa.to_dict().items():
        for symbol, target in outgoing.items():
            src_index = int(str(source))
            dst_index = int(str(target))
            largest_vertex = max(largest_vertex, src_index, dst_index)
            edges.append((src_index, str(symbol), dst_index))
    return Regexp(largest_vertex + 1, edges, dfa, states_map)
def from_regex(self, filename):
    """Fill this object from the minimal DFA of a regex stored in a file.

    The whole file is read, trailing whitespace is stripped, and the text
    is compiled to a minimal DFA. States are renumbered from 0, each
    transition sets one cell in the matrix returned by ``get_by_label``
    for its label, and start/final vertices are recorded.

    Parameters
    ----------
    filename : str
        Path of the file containing the regex.
    """
    # The context manager closes the file; it was previously left open
    with open(filename) as input_file:
        regex_text = input_file.read().rstrip()
    dfa = Regex(regex_text).to_epsilon_nfa().to_deterministic().minimize()
    self.n_vertices = len(dfa.states)

    state_renumeration = {
        state: index for index, state in enumerate(dfa.states)
    }
    for fro, label, to in dfa._transition_function.get_edges():
        matrix = self.get_by_label(str(label))
        matrix[state_renumeration[fro], state_renumeration[to]] = True

    self.start_vertices.add(state_renumeration[dfa.start_state])
    for state in dfa.final_states:
        self.final_vertices.add(state_renumeration[state])
def str_to_graph(s):
    """Compile a regex to an integer-labelled graph description.

    The regex is turned into a minimized automaton and exported to
    networkx; node labels are converted to integers with
    ``ordering="sorted"``.

    Parameters
    ----------
    s : str
        Regex text.

    Returns
    -------
    tuple
        ``(labels, start_states, final_states)``: the ``'label'`` edge
        attributes of the integer graph, and the sorted integer ids of the
        start and final states.
    """
    automaton = Regex(s).to_epsilon_nfa().minimize()
    graph = automaton.to_networkx()
    int_graph = nx.convert_node_labels_to_integers(graph, ordering="sorted")

    # Sort each node set once and pair them positionally; re-sorting the
    # integer nodes on every loop iteration was accidentally quadratic.
    node_to_int = dict(zip(sorted(graph.nodes), sorted(int_graph.nodes)))

    start_states = sorted(node_to_int[st] for st in automaton.start_states)
    final_states = sorted(node_to_int[st] for st in automaton.final_states)
    labels = nx.get_edge_attributes(int_graph, 'label')
    return labels, start_states, final_states
def test_comp_regex():
    """A composite regex accepts the listed words and rejects the others."""
    compiled = DifferentiableRegex(Regex('(a * b | c) * d'))
    expectations = [
        (['cd', 'aaabd', 'aabcd', 'abbd'], True),
        (['dd', 'aaaad', 'ababc', 'q'], False),
    ]
    for words, should_accept in expectations:
        for word in words:
            if should_accept:
                assert compiled.accepts(word)
            else:
                assert not compiled.accepts(word)
def regex_to_pda_graph(regex, first_node_number):
    """Convert a regex into a relabelled ``nx.DiGraph``.

    The regex is compiled to a minimized automaton and exported to
    networkx. Nodes whose ``'label'`` attribute is falsy are removed, the
    remaining nodes are renumbered in sorted order starting at
    ``first_node_number``, and every edge label is wrapped in a
    one-element list.
    """
    automaton: EpsilonNFA = Regex(regex).to_epsilon_nfa().minimize()
    multi_graph: nx.MultiDiGraph = automaton.to_networkx()

    # Collect first, remove afterwards: the node view cannot shrink mid-iteration
    unlabeled = [
        node for node in multi_graph.nodes
        if not multi_graph.nodes[node]['label']
    ]
    for node in unlabeled:
        multi_graph.remove_node(node)

    renumbering = {
        node: number
        for number, node in enumerate(sorted(multi_graph.nodes),
                                      start=first_node_number)
    }
    relabeled = nx.relabel_nodes(multi_graph, renumbering)

    for edge in relabeled.edges:
        attributes = relabeled.edges[edge]
        attributes['label'] = [attributes['label']]
    return nx.DiGraph(relabeled)
def test_regex_to_min_dfa():
    """Build the minimal DFA of "((b|a) a b)*" and probe size and words."""
    dfa = DFA.min_dfa_from_regex(Regex("((b|a) a b)*"))
    assert len(dfa.states) == 3

    aab = [symb_a, symb_a, symb_b]
    accepted_words = [
        [symb_b, symb_a, symb_b],
        aab + aab,
    ]
    rejected_words = [
        aab + [symb_b],
        aab + [epsilon],
        [symb_c],
        [epsilon],
        [symb_b, symb_a],
    ]
    for word in accepted_words:
        assert dfa.accepts(word)
    for word in rejected_words:
        assert not dfa.accepts(word)
def from_text(cls, text: List[str], use_python_regexes_if_necessary=False,
              variables=None):
    """Build an instance from grammar lines of the form ``HEAD BODY``.

    Blank lines are skipped. A body containing any of ``| . ? + -`` is
    treated as a regex and expanded through ``_create_cfg_from_regex``; it
    is parsed as a Python regex only when it contains ``-`` and
    ``use_python_regexes_if_necessary`` is set. Any other body is split on
    single spaces into symbols: ``eps`` is dropped, and a symbol is a
    variable if it is in ``variables`` (when that is truthy) or otherwise
    if it contains an upper-case letter. The head of the first non-blank
    line becomes the start symbol.
    """
    regex_markers = ['|', '.', '?', '+', '-']
    all_vars, all_terms, all_prods = set(), set(), set()
    start_var = None

    for line in text:
        stripped = line.strip()
        if not stripped:
            continue
        raw_head, *raw_body = stripped.split(' ', 1)

        if raw_body and any(mark in raw_body[0] for mark in regex_markers):
            body_text = raw_body[0]
            if '-' in body_text and use_python_regexes_if_necessary:
                regex = Regex.from_python_regex(body_text)
            else:
                regex = Regex(body_text)
            head = Variable(raw_head)
            if start_var is None:
                start_var = head
            regex_cfg = cls._create_cfg_from_regex(head, regex, variables)
            all_vars.update(regex_cfg.variables)
            all_terms.update(regex_cfg.terminals)
            all_prods.update(regex_cfg.productions)
            continue

        elements = raw_body[0].split(' ') if raw_body else ''
        if start_var is None:
            start_var = Variable(raw_head)
        head = Variable(raw_head)
        all_vars.add(head)

        body = []
        for element in elements:
            if element == 'eps':
                continue
            if variables:
                is_variable = element in variables
            else:
                is_variable = any(letter.isupper() for letter in element)
            if is_variable:
                symbol = Variable(element)
                all_vars.add(symbol)
            else:
                symbol = Terminal(element)
                all_terms.add(symbol)
            body.append(symbol)
        all_prods.add(Production(head, body))

    return cls(CFG(all_vars, all_terms, start_var, all_prods))
def test_from_cfg(self):
    """from_cfg must agree with automata and boxes built from regexes."""
    sym_s, sym_v = Symbol("S"), Symbol("V")

    # g1: S -> a S b | a b
    from_grammar = RecursiveAutomaton.from_cfg(
        CFG.from_text("S -> a S b | a b"))
    from_pattern = RecursiveAutomaton.from_regex(
        Regex("a S b | a b"), sym_s)
    self.assertEqual(from_grammar, from_pattern)

    # g2: S -> a V b
    #     V -> c S d | c d
    two_box_rsa = RecursiveAutomaton.from_cfg(
        CFG.from_text("S -> a V b\nV -> c S d | c d"))
    self.assertEqual(two_box_rsa.get_number_of_boxes(), 2)
    self.assertEqual(two_box_rsa.labels, {sym_s, sym_v})

    dfa_s = Regex("a V b").to_epsilon_nfa().minimize()
    self.assertEqual(two_box_rsa.get_box(sym_s), Box(dfa_s, sym_s))

    dfa_v = Regex("c S d | c d").to_epsilon_nfa().minimize()
    self.assertEqual(two_box_rsa.get_box(sym_v), Box(dfa_v, sym_v))
def test_get_repr(self):
    """A regex rebuilt from its own str() must give an equal minimal DFA."""
    original = Regex("a*.(b|c)epsilon")
    rebuilt = Regex(str(original))
    dfa_original, dfa_rebuilt = (
        regex.to_epsilon_nfa().to_deterministic().minimize()
        for regex in (original, rebuilt)
    )
    self.assertEqual(dfa_original, dfa_rebuilt)
def test_intersection_empty(self):
    """Intersecting a CFG with the empty regex must give a falsy result."""
    empty_regex = Regex("")
    ter_a, ter_b = Terminal("a"), Terminal("b")
    var_s = Variable("S")
    rules = {
        Production(var_s, [ter_a, var_s, ter_b]),
        Production(var_s, [ter_b, var_s, ter_a]),
        Production(var_s, []),
    }
    grammar = CFG(productions=rules, start_symbol=var_s)
    self.assertFalse(grammar & empty_regex)
def parse_regex(self, file_path):
    """Re-initialise this object from the regex stored in a file.

    ``__init__`` is called first with no arguments. The file content, with
    trailing whitespace stripped, is compiled to a minimal DFA; its states
    are numbered from 0 and one boolean matrix per input symbol records
    the transitions.

    Parameters
    ----------
    file_path : str
        Path of the file containing the regex.

    Returns
    -------
    self
        This object, to allow chaining.
    """
    self.__init__()

    # Read the regex; the context manager closes the file even on error
    with open(file_path, 'r') as regex_file:
        regex = Regex(regex_file.read().rstrip())

    # Regex to DFA conversion and vertices count init
    dfa = regex.to_epsilon_nfa().to_deterministic().minimize()
    self.vertices_count = len(dfa.states)

    # States enumeration
    states = {state: index for index, state in enumerate(dfa._states)}

    # Init label_matrices: one sparse boolean matrix per label, created
    # the first time a transition with that label is seen
    for source in dfa._states:
        for label in dfa._input_symbols:
            for target in dfa._transition_function(source, label):
                if label not in self.label_matrices:
                    self.label_matrices[label] = Matrix.sparse(
                        BOOL, self.vertices_count, self.vertices_count)
                self.label_matrices[label][states[source],
                                           states[target]] = True

    # Init start and terminal states
    self.start_vertices.add(states[dfa.start_state])
    for state in dfa._final_states:
        self.terminal_vertices.add(states[state])
    return self
def read_cfgrammar(name):
    """Read a grammar file in which some productions are given as regexes.

    A line containing ``regexp`` has that marker removed and is split on
    ``" -> "``; the right-hand side is parsed as a ``Regex`` and converted
    to a CFG starting at the left-hand side. All other lines are collected
    and parsed together with ``CFG.from_text``. The regex grammars are then
    merged into that grammar, which keeps its own start symbol.

    Parameters
    ----------
    name : str
        Path of the grammar file.

    Returns
    -------
    CFG
        The merged grammar.
    """
    plain_lines = []
    cfg_from_regex = []
    with open(name, 'r') as file:
        for line in file:
            if 'regexp' not in line:
                plain_lines.append(line)
                continue
            parts = line.replace('regexp', "").split(" -> ")
            head = parts[0]
            # Strip only the line terminator so that a final line without a
            # trailing newline does not lose its last character
            regex = Regex(parts[1].rstrip('\n'))
            cfg_from_regex.append(regex.to_cfg(starting_symbol=head))

    cfg = CFG.from_text(''.join(plain_lines))
    for regex_cfg in cfg_from_regex:
        cfg = CFG(cfg.variables.union(regex_cfg.variables),
                  cfg.terminals.union(regex_cfg.terminals),
                  cfg.start_symbol,
                  cfg.productions.union(regex_cfg.productions))
    return cfg
def rsm_from_text(source: str, start_symbol: Variable = Variable("S")) -> RSM:
    """Build a Recursive State Machine [1]_ from its textual description.

    Every line containing ``" -> "`` contributes one box: the left side is
    the box variable and the right side is a regex compiled to a minimal
    DFA. ``epsilon`` and ``eps`` are rewritten to ``$``, and an empty right
    side is treated as ``$``. Other lines are ignored.

    Parameters
    ----------
    source : str
        The text with which the Recursive State Machine will be created.

    start_symbol : Variable
        Start symbol of a Recursive State Machine.

    Examples
    --------
    >>> import cfpq_data
    >>> rsm = cfpq_data.rsm_from_text("S -> (a S* b S*)*")
    >>> [rsm.contains(word) for word in ["", "ab", "aabb"]]
    [True, True, True]

    Returns
    -------
    rsm : RSM
        Recursive State Machine.

    References
    ----------
    .. [1] Alur R., Etessami K., Yannakakis M. (2001) Analysis of Recursive
       State Machines. In: Berry G., Comon H., Finkel A. (eds) Computer Aided
       Verification. CAV 2001. Lecture Notes in Computer Science, vol 2102.
       Springer, Berlin, Heidelberg.
       https://doi.org/10.1007/3-540-44585-4_18
    """
    boxes = []
    for line in source.splitlines():
        if " -> " in line:
            head, body = line.split(" -> ")
            # An empty body falls back to the epsilon marker "$"
            regex_text = body.replace("epsilon", "$").replace("eps", "$") or "$"
            minimal_dfa = (
                Regex(regex_text).to_epsilon_nfa().to_deterministic().minimize()
            )
            boxes.append((Variable(head), minimal_dfa))
    return RSM(start_symbol, boxes)
def from_text(text, start_symbol=Variable("S")):
    """Build a CFG from lines of the form ``HEAD -> REGEX``.

    Each line is split on ``' -> '``. In the stripped body every ``?`` is
    replaced by ``|`` followed by ``EPS_SYM``; the body is then parsed as a
    ``Regex`` and converted to productions for the head by
    ``Grammar_Wrapper.regex_to_production``.
    """
    collected = set()
    for line in text.splitlines():
        parts = line.split(' -> ')
        head = Variable(parts[0])
        body_text = parts[1].strip().replace('?', f'|{EPS_SYM}')
        collected |= Grammar_Wrapper.regex_to_production(
            Regex(body_text), head)
    return CFG(start_symbol=start_symbol, productions=collected)