def test_profiling_conversions():
    """Run the CFG -> PDA -> CFG round trip three times in a row."""
    start = Variable("S")
    sym_a, sym_b, sym_c = Terminal("a"), Terminal("b"), Terminal("c")
    rules = {
        Production(start, [sym_a, start, sym_b]),
        Production(start, [sym_c]),
    }
    grammar = CFG(productions=rules, start_symbol=start)
    # Every pass converts the grammar produced by the previous pass.
    for _ in range(3):
        grammar = grammar.to_pda().to_final_state().to_empty_stack().to_cfg()
def test_reverse(self):
    """Check sizes and membership of the grammar returned by ``reverse``."""
    start = Variable("S")
    sym_a = Terminal("a")
    sym_b = Terminal("b")
    rules = {
        Production(start, [sym_a, start, sym_b]),
        Production(start, []),
    }
    reversed_cfg = CFG({start}, {sym_a, sym_b}, start, rules).reverse()

    self.assertEqual(len(reversed_cfg.variables), 1)
    self.assertEqual(len(reversed_cfg.terminals), 2)
    self.assertEqual(len(reversed_cfg.productions), 2)
    self.assertFalse(reversed_cfg.is_empty())
    mirrored_word = [sym_b, sym_b, sym_a, sym_a]
    self.assertTrue(reversed_cfg.contains(mirrored_word))
def test_union(self):
    """Check sizes and membership of the union of a grammar with itself."""
    start = Variable("S")
    sym_a = Terminal("a")
    sym_b = Terminal("b")
    rules = {
        Production(start, [sym_a, start, sym_b]),
        Production(start, []),
    }
    grammar = CFG({start}, {sym_a, sym_b}, start, rules)
    united = grammar.union(grammar)

    self.assertEqual(len(united.variables), 3)
    self.assertEqual(len(united.terminals), 2)
    self.assertEqual(len(united.productions), 6)
    self.assertFalse(united.is_empty())
    balanced_word = [sym_a, sym_a, sym_b, sym_b]
    self.assertTrue(united.contains(balanced_word))
def read_production_regex(cls, head, regex, id, case_sens=True):
    """Turn a regular expression into right-linear productions for ``head``.

    The regex is converted with ``to_epsilon_nfa().minimize()``. Each
    automaton state gets a fresh variable named ``Id<counter>,<state>``.
    The emitted productions are:

    * ``head -> V_start`` for every start state,
    * ``V_from -> x V_to`` for every transition labelled ``x``,
    * ``V_final -> (empty)`` for every final state.

    :param head: head used for the entry productions
    :param regex: object providing ``to_epsilon_nfa()``
    :param id: first counter value used to name state variables
    :param case_sens: when true, upper-case labels become variables and
        lower-case or digit labels become terminals; when false every
        label other than ``'eps'`` becomes a terminal
    :return: ``(productions, variables, terminals, id)`` where ``id`` has
        been advanced by the number of automaton states
    :raises ValueError: if ``case_sens`` is true and a label is neither
        upper-case, lower-case nor numeric
    """
    # The parameter name shadows the builtin; work on a local alias.
    next_id = id
    terminals, variables, productions = set(), set(), set()

    enfa = regex.to_epsilon_nfa().minimize()
    if len(enfa.states) == 0:
        variables.add(head)
        productions.add(Production(head, [Epsilon()]))
        return productions, variables, terminals, next_id

    var_by_state = {}
    for state in enfa.states:
        var_by_state[state] = Variable(f'Id{next_id},{state}')
        next_id += 1

    for start_st in enfa.start_states:
        productions.add(Production(head, [var_by_state[start_st]]))

    for st_from, symb, st_to in enfa._transition_function:
        value = symb.value
        new_body = []
        if value == 'eps':
            new_body.append(Epsilon())
        elif value.isupper() and case_sens:
            variable = Variable(value)
            new_body.append(variable)
            variables.add(variable)
        elif value.isdigit() or value.islower() or not case_sens:
            terminal = Terminal(value)
            new_body.append(terminal)
            # Terminals belong in the terminal set; they used to be added
            # to ``variables``, leaving the returned ``terminals`` empty.
            terminals.add(terminal)
        else:
            raise ValueError(
                f'Symbol "{value}" should be either lower or upper case')
        new_body.append(var_by_state[st_to])
        productions.add(Production(var_by_state[st_from], new_body))

    # Every final state must be able to stop, including one that no
    # transition leads to (for instance a start state that is also final).
    for final_st in enfa.final_states:
        productions.add(Production(var_by_state[final_st], []))

    return productions, variables, terminals, next_id
def test_substitution(self):
    """Check the grammar obtained by substituting a terminal with a grammar."""
    start = Variable("S")
    sym_a = Terminal("a")
    sym_b = Terminal("b")
    rules = {
        Production(start, [sym_a, start, sym_b]),
        Production(start, []),
    }
    grammar = CFG({start}, {sym_a, sym_b}, start, rules)
    substituted = grammar.substitute({sym_a: grammar})

    self.assertEqual(len(substituted.variables), 2)
    self.assertEqual(len(substituted.terminals), 2)
    self.assertEqual(len(substituted.productions), 4)
    self.assertFalse(substituted.is_empty())
    word = [sym_a, sym_b, sym_a, sym_b, sym_b, sym_b]
    self.assertTrue(substituted.contains(word))
def test_nullable_object(self):
    """Check that ``get_nullable_symbols`` reports all three variables."""
    start = Variable("S")
    nt_a = Variable("A")
    nt_b = Variable("B")
    sym_a = Terminal("a")
    sym_b = Terminal("b")
    rules = {
        Production(start, [nt_a, nt_b]),
        Production(nt_a, [sym_a, nt_a, nt_a]),
        Production(nt_a, [Epsilon()]),
        Production(nt_b, [sym_b, nt_b, nt_b]),
        Production(nt_b, [Epsilon()]),
    }
    grammar = CFG({nt_a, nt_b, start}, {sym_a, sym_b}, start, rules)
    nullable = grammar.get_nullable_symbols()
    self.assertEqual(nullable, {nt_a, nt_b, start})
def from_grammar_file(path, python_regex=False, nonterms_upper=True):
    """Build a CFG from a text file holding one rule per line.

    Each non-blank line is ``HEAD BODY``, split on the first space. The
    head of the first non-blank line becomes the start symbol. A body
    longer than one character that contains any of ``?+*|`` is treated as
    a regular expression: every ``?`` is rewritten to ``| eps`` and the
    result is handed to ``GrammarAlgos.prod_from_regex``. Any other body
    is split on single spaces; ``eps`` becomes ``Epsilon``, upper-case
    tokens become variables and everything else becomes a terminal.

    Blank lines are skipped (they used to raise ``IndexError``). An
    empty file yields a CFG with no productions and start symbol ``None``.

    :param path: path of the grammar file
    :param python_regex: forwarded to ``GrammarAlgos.prod_from_regex``
    :param nonterms_upper: forwarded to ``GrammarAlgos.prod_from_regex``;
        plain (non-regex) bodies always classify tokens by ``isupper``
    :return: the resulting ``CFG``
    :raises IndexError: if a non-blank line has a head but no body
    """

    def plain_body(text):
        # Convert a space-separated body into a list of grammar objects.
        symbols = []
        for token in text.split(' '):
            if token == 'eps':
                symbols.append(Epsilon())
            elif token.isupper():
                symbols.append(Variable(token))
            else:
                symbols.append(Terminal(token))
        return symbols

    start_symbol = None
    productions = set()
    with open(path, 'r') as grammar_file:
        for line in grammar_file:
            rule = line.strip().split(' ', 1)
            if not rule[0]:
                # Blank line: nothing to parse.
                continue
            head = Variable(rule[0])
            if start_symbol is None:
                start_symbol = head
            raw_body = rule[1]
            if any(symb in raw_body for symb in '?+*|') and len(raw_body) > 1:
                regex_body = raw_body.replace('?', '| eps')
                productions |= GrammarAlgos.prod_from_regex(
                    head, regex_body, python_regex, nonterms_upper)
            else:
                productions.add(Production(head, plain_body(raw_body)))
    return CFG(start_symbol=start_symbol, productions=productions)
def test_derivation_empty(self):
    """Check the rightmost derivation of the empty word for ``S -> epsilon``."""
    start = Variable("S")
    grammar = CFG(
        productions=[Production(start, [Epsilon()])],
        start_symbol=start,
    )
    tree = grammar.get_cnf_parse_tree([])
    steps = tree.get_rightmost_derivation()
    self.assertEqual([[start], []], steps)
def production_to_pretty(production: Production):
    """Return a new production with head and body mapped by ``cfg_obj_to_pretty``."""
    pretty_head = cfg_obj_to_pretty(production.head)
    pretty_body = [cfg_obj_to_pretty(symbol) for symbol in production.body]
    # The third positional argument of Production is passed as False.
    return Production(pretty_head, pretty_body, False)
def read_grammar(cls, name):
    """Read a grammar from a text file with one production per line.

    Each non-blank line is a whitespace-separated list ``HEAD SYM SYM ...``.
    The head of the first non-blank line becomes the start symbol. Body
    tokens that are upper-case become variables; all others become
    terminals. A line holding only a head yields a production with an
    empty body.

    Blank lines are skipped (they used to raise ``ValueError`` while
    unpacking), and every head is recorded in the variable set.

    :param name: path of the grammar file
    :return: the resulting ``CFG``
    """
    terminals, variables, productions = set(), set(), set()
    start_symb = None
    with open(name, 'r') as file:
        for production_txt in file:
            tokens = production_txt.split()
            if not tokens:
                # Blank line: nothing to parse.
                continue
            head, *body = tokens
            head_var = Variable(head)
            variables.add(head_var)
            if start_symb is None:
                start_symb = head_var
            body_cfg = []
            for letter in body:
                if letter.isupper():
                    variable = Variable(letter)
                    variables.add(variable)
                    body_cfg.append(variable)
                else:
                    terminal = Terminal(letter)
                    terminals.add(terminal)
                    body_cfg.append(terminal)
            productions.add(Production(head_var, body_cfg))
    return CFG(variables, terminals, start_symb, productions)
def to_cnf(cfg):
    """Return a normal-form grammar for ``cfg`` that keeps the empty word.

    If ``cfg`` does not generate epsilon, the result is simply
    ``cfg.to_normal_form()``. Otherwise a new start symbol is created by
    appending ``'`` to the normal form's start symbol name, and the result
    contains every normal-form production, a copy of each start-symbol
    production headed by the new start symbol, and an empty production
    for the new start symbol.

    :param cfg: grammar providing ``generate_epsilon`` and ``to_normal_form``
    :return: the resulting grammar
    """
    generates_epsilon = cfg.generate_epsilon()
    normal = cfg.to_normal_form()
    if not generates_epsilon:
        return normal

    # Keep priming the name until it does not clash with an existing
    # variable; a clash would merge unrelated productions.
    start_name = normal.start_symbol.value + "'"
    new_start_symbol = Variable(start_name)
    while new_start_symbol in normal.variables:
        start_name += "'"
        new_start_symbol = Variable(start_name)

    res = CFG(variables=normal.variables,
              terminals=normal.terminals,
              start_symbol=new_start_symbol)
    res.variables.add(new_start_symbol)
    # Add the empty production to the result directly. The normal-form
    # grammar is left untouched because the library may cache or share it
    # (NOTE(review): confirm against the CFG implementation in use).
    res.productions.add(Production(new_start_symbol, []))
    for production in normal.productions:
        if production.head == normal.start_symbol:
            res.productions.add(
                Production(new_start_symbol, production.body))
        res.productions.add(production)
    return res
def test_useless_removal(self):
    """Check the sizes of a grammar after ``remove_useless_symbols``."""
    start = Variable("S")
    nt_a = Variable("A")
    nt_b = Variable("B")
    sym_a = Terminal("a")
    sym_b = Terminal("b")
    rules = {
        Production(start, [nt_a, nt_b]),
        Production(start, [sym_a]),
        Production(nt_a, [sym_b]),
    }
    cfg = CFG({nt_a, nt_b, start}, {sym_a, sym_b}, start, rules)
    cleaned = cfg.remove_useless_symbols()

    self.assertEqual(len(cleaned.variables), 1)
    self.assertEqual(len(cleaned.terminals), 1)
    self.assertEqual(len(cleaned.productions), 1)
    # The emptiness check is made on the original grammar.
    self.assertFalse(cfg.is_empty())
def test_finite(self):
    """Check ``is_finite`` before and after adding the production ``A -> S``."""
    sym_a = Terminal("a")
    sym_b = Terminal("b")
    start = Variable("S")
    nt_a = Variable("A")
    nt_b = Variable("B")
    rules = {
        Production(start, [nt_a, nt_b]),
        Production(nt_a, [sym_a]),
        Production(nt_b, [sym_b]),
    }
    finite_cfg = CFG(productions=rules, start_symbol=start)
    self.assertTrue(finite_cfg.is_finite())

    # Extend the same set object in place, then build a second grammar.
    rules.add(Production(nt_a, [start]))
    infinite_cfg = CFG(productions=rules, start_symbol=start)
    self.assertFalse(infinite_cfg.is_finite())
def test_conversions(self):
    """Check membership after one CFG -> PDA -> CFG round trip."""
    sym_a = Terminal("a")
    sym_b = Terminal("b")
    sym_c = Terminal("c")
    start = Variable("S")
    rules = {
        Production(start, [sym_a, start, sym_b]),
        Production(start, [sym_c]),
    }
    grammar = CFG(productions=rules, start_symbol=start)
    grammar = grammar.to_pda().to_final_state().to_empty_stack().to_cfg()

    accepted = (
        [sym_c],
        [sym_a, sym_c, sym_b],
        [sym_a, sym_a, sym_c, sym_b, sym_b],
    )
    rejected = (
        [sym_b, sym_c, sym_a],
        [sym_b, sym_b, sym_c, sym_a, sym_a],
    )
    for word in accepted:
        self.assertTrue(grammar.contains(word))
    for word in rejected:
        self.assertFalse(grammar.contains(word))
def test_pos_closure(self):
    """Check sizes and membership of the grammar from ``get_positive_closure``."""
    start = Variable("S")
    sym_a = Terminal("a")
    sym_b = Terminal("b")
    sym_c = Terminal("c")
    rules = {
        Production(start, [sym_a, start, sym_b]),
        Production(start, [sym_c]),
    }
    # Only "a" and "b" are listed as terminals when building the grammar.
    grammar = CFG({start}, {sym_a, sym_b}, start, rules)
    closure = grammar.get_positive_closure()

    self.assertEqual(len(closure.variables), 3)
    self.assertEqual(len(closure.terminals), 3)
    self.assertEqual(len(closure.productions), 6)
    self.assertFalse(closure.is_empty())
    self.assertFalse(closure.contains([]))
    two_words = [sym_a, sym_a, sym_c, sym_b, sym_b, sym_a, sym_c, sym_b]
    self.assertTrue(closure.contains(two_words))
def regex_to_grammar_productions(regex, head, var_dict, terminal_dict):
    """Translate a regex into right-linear productions rooted at ``head``.

    The regex is converted with ``to_epsilon_nfa().minimize()``. Each
    automaton state gets a fresh variable named
    ``<head.value>#REGEX#<get_new_var_num()>``. The emitted productions are:

    * ``head -> V_start`` for every start state,
    * ``V_from -> x V_to`` for every transition labelled ``x``,
    * ``V_final -> (empty)`` for every final state.

    Start and final productions are now emitted independently of the
    transitions. Previously an automaton without transitions produced no
    productions at all, and a final state that no transition leads to
    never received its empty production.

    :param regex: object providing ``to_epsilon_nfa()``
    :param head: variable the regex is attached to
    :param var_dict: mapping from transition symbol to grammar variable
    :param terminal_dict: mapping from transition symbol to grammar terminal
    :return: set of productions
    :raises ValueError: if a transition symbol is in neither mapping and
        is not ``EPS_SYM``
    """
    enfa = regex.to_epsilon_nfa().minimize()
    transitions = enfa._transition_function._transitions

    # One fresh, uniquely named variable per automaton state.
    state_vars = {}
    for state in enfa.states:
        state_vars[state] = Variable(
            '%s#REGEX#%s' % (head.value, get_new_var_num()))

    production_set = set()
    for start_state in enfa.start_states:
        production_set.add(Production(head, [state_vars[start_state]]))
    for final_state in enfa.final_states:
        production_set.add(Production(state_vars[final_state], []))

    for head_state in transitions:
        outgoing = transitions[head_state]
        for sym in outgoing:
            body_state = outgoing[sym]
            if sym in var_dict:
                first = var_dict[sym]
            elif sym in terminal_dict:
                first = terminal_dict[sym]
            elif sym == EPS_SYM:
                first = Epsilon()
            else:
                raise ValueError(
                    f'Symbol "{sym}" is not defined as '
                    f'a terminal or a variable')
            production_set.add(
                Production(state_vars[head_state],
                           [first, state_vars[body_state]]))
    return production_set
def __init__(self, cfg: CFG, is_reduced: bool = False):
    """Initialise from ``cfg``, normalising it first unless ``is_reduced``.

    When ``is_reduced`` is false:

    * if the start symbol occurs in some production body, a new start
      symbol named ``S'`` (with extra primes until the name is unused) is
      introduced together with the production ``new_start -> old_start``;
    * the grammar is converted with ``to_normal_form()``;
    * if the grammar generated epsilon before conversion, an empty
      production for the start symbol is added.

    The caller's ``cfg`` and the normal-form grammar are no longer
    modified: the previous code changed them through private attributes.

    :param cfg: source grammar
    :param is_reduced: when true, ``cfg`` is used as-is
    """
    productions = cfg.productions
    if not is_reduced:
        old_start = cfg.start_symbol
        if any(old_start in production.body
               for production in cfg.productions):
            # Compare Variable objects rather than a raw string, so the
            # check does not rely on Variable/str hash compatibility.
            new_start_name = "S'"
            while Variable(new_start_name) in cfg.variables:
                new_start_name += "'"
            new_start = Variable(new_start_name)
            # Build a fresh grammar instead of editing the caller's one.
            cfg = CFG(
                set(cfg.variables) | {new_start},
                set(cfg.terminals),
                new_start,
                set(cfg.productions) | {Production(new_start, [old_start])},
            )
        generate_epsilon = cfg.generate_epsilon()
        cfg = cfg.to_normal_form()
        # Copy before adding, so the normal-form grammar stays untouched.
        productions = set(cfg.productions)
        if generate_epsilon:
            productions.add(Production(cfg.start_symbol, []))
    super().__init__(cfg.variables, cfg.terminals, cfg.start_symbol,
                     productions)
def test_creation(self):
    """Check equality, string form and hash of productions."""
    base = Production(Variable("S0"), [Terminal("S1"), Variable("a")])
    twin = Production(Variable("S0"), [Terminal("S1"), Variable("a")])
    others = (
        Production(Variable("S0'"), [Terminal("S1"), Variable("a")]),
        Production(Variable("S0"), [Terminal("S2"), Variable("a")]),
        Production(Variable("S0"), [Terminal("S2"), Variable("b")]),
    )
    # The same comparisons are made on the objects, their str and their hash.
    for view in (lambda production: production, str, hash):
        self.assertEqual(view(base), view(twin))
        for other in others:
            self.assertNotEqual(view(base), view(other))
    self.assertIn(" -> ", str(base))
def test_generation_words2(self):
    """Check that ``get_words`` yields exactly the three expected words."""
    sym_a = Terminal("a")
    start = Variable("S")
    first = Variable("S1")
    second = Variable("S2")
    rules = {
        Production(start, [first, sym_a]),
        Production(first, [second, sym_a]),
        Production(first, []),
        Production(second, []),
        Production(start, []),
    }
    grammar = CFG(productions=rules, start_symbol=start)
    words = list(grammar.get_words())

    for expected in ([], [sym_a], [sym_a, sym_a]):
        self.assertIn(expected, words)
    self.assertEqual(len(words), 3)
def read_grammar(cls, name):
    """Read a grammar whose alternatives may be regular expressions.

    Each non-blank line is ``HEAD SEP TOK TOK ... | TOK ...``: the first
    token is the head, the second token is discarded, and the remaining
    tokens are split into alternatives on ``|``. The head of the first
    non-blank line becomes the start symbol.

    An alternative in which every token contains ``*`` is treated as a
    regex: its first token is parsed with ``Regex.from_python_regex`` and
    expanded by ``CFGrammar.read_production_regex`` (with ``case_sens``
    set to false). Any other alternative is read token by token:
    ``epsilon`` becomes ``Epsilon``, upper-case tokens become variables
    and everything else becomes a terminal.

    Blank lines are skipped (they used to raise ``ValueError`` while
    unpacking). The regex branch now receives ``Variable(head)`` like the
    plain branch, instead of the raw string, and heads are recorded in
    the variable set.

    :param name: path of the grammar file
    :return: the resulting ``CFG``
    :raises ValueError: if a non-blank line has fewer than two tokens
    """
    next_id = 0  # running counter for regex state variables
    terminals, variables, productions = set(), set(), set()
    start_symb = None
    with open(name, 'r') as file:
        for production_txt in file:
            tokens = production_txt.split()
            if not tokens:
                # Blank line: nothing to parse.
                continue
            head, _, *body_full = tokens
            head_var = Variable(head)
            variables.add(head_var)
            if start_symb is None:
                start_symb = head_var
            bodies = [
                list(group)
                for is_separator, group in groupby(
                    body_full, lambda token: token == "|")
                if not is_separator
            ]
            for body in bodies:
                if all('*' in value for value in body):
                    # NOTE(review): only body[0] is used for a regex
                    # alternative, as in the previous implementation.
                    new_productions, new_variables, new_terminals, next_id = \
                        CFGrammar.read_production_regex(
                            head_var,
                            Regex.from_python_regex(body[0]),
                            next_id,
                            False)
                    productions |= new_productions
                    variables |= new_variables
                    terminals |= new_terminals
                    continue
                body_cfg = []
                for letter in body:
                    if letter == "epsilon":
                        body_cfg.append(Epsilon())
                    elif letter.isupper():
                        non_terminal = Variable(letter)
                        variables.add(non_terminal)
                        body_cfg.append(non_terminal)
                    else:
                        terminal = Terminal(letter)
                        terminals.add(terminal)
                        body_cfg.append(terminal)
                productions.add(Production(head_var, body_cfg))
    return CFG(variables, terminals, start_symb, productions)
def test_get_rightmost_derivation(self):
    """Check the rightmost derivation of ``a a b`` step by step."""
    sym_a = Terminal("a")
    sym_b = Terminal("b")
    start = Variable("S")
    nt_a = Variable("A")
    nt_b = Variable("B")
    nt_c = Variable("C")
    rules = [
        Production(start, [nt_c, nt_b]),
        Production(nt_c, [nt_a, nt_a]),
        Production(nt_a, [sym_a]),
        Production(nt_b, [sym_b]),
    ]
    grammar = CFG(productions=rules, start_symbol=start)
    tree = grammar.get_cnf_parse_tree([sym_a, sym_a, sym_b])
    steps = tree.get_rightmost_derivation()

    expected_steps = [
        [start],
        [nt_c, nt_b],
        [nt_c, sym_b],
        [nt_a, nt_a, sym_b],
        [nt_a, sym_a, sym_b],
        [sym_a, sym_a, sym_b],
    ]
    self.assertEqual(steps, expected_steps)
def test_remove_epsilon(self):
    """Check the sizes of a grammar after ``remove_epsilon``."""
    start = Variable("S")
    nt_a = Variable("A")
    nt_b = Variable("B")
    sym_a = Terminal("a")
    sym_b = Terminal("b")
    rules = {
        Production(start, [nt_a, nt_b]),
        Production(nt_a, [sym_a, nt_a, nt_a]),
        Production(nt_a, [Epsilon()]),
        Production(nt_b, [sym_b, nt_b, nt_b]),
        Production(nt_b, []),
    }
    cfg = CFG({nt_a, nt_b, start}, {sym_a, sym_b}, start, rules)
    without_eps = cfg.remove_epsilon()

    self.assertEqual(len(without_eps.variables), 3)
    self.assertEqual(len(without_eps.terminals), 2)
    self.assertEqual(len(set(without_eps.productions)), 9)
    self.assertEqual(len(without_eps.get_nullable_symbols()), 0)
    # The emptiness check is made on the original grammar.
    self.assertFalse(cfg.is_empty())
def cnf(cfgrammar):
    """Return a normal-form grammar for ``cfgrammar`` that keeps the empty word.

    If ``cfgrammar`` does not generate epsilon, the result is simply
    ``cfgrammar.to_normal_form()``. Otherwise a new start symbol is
    created by appending ``'`` to the normal form's start symbol name, and
    the result contains every normal-form production, a copy of each
    start-symbol production headed by the new start symbol, and an empty
    production for the new start symbol.

    :param cfgrammar: grammar providing ``generate_epsilon`` and
        ``to_normal_form``
    :return: the resulting grammar
    """
    generates_epsilon = cfgrammar.generate_epsilon()
    normal = cfgrammar.to_normal_form()
    if not generates_epsilon:
        return normal

    # Keep priming the name until it does not clash with an existing
    # variable; a clash would merge unrelated productions.
    symbol_name = normal.start_symbol.value + "'"
    new_symbol = Variable(symbol_name)
    while new_symbol in normal.variables:
        symbol_name += "'"
        new_symbol = Variable(symbol_name)

    output = CFG(variables=normal.variables,
                 start_symbol=new_symbol,
                 terminals=normal.terminals)
    output.variables.add(new_symbol)
    # Add the empty production to the result directly. The normal-form
    # grammar is left untouched because the library may cache or share it
    # (NOTE(review): confirm against the CFG implementation in use).
    output.productions.add(Production(new_symbol, []))
    for production in normal.productions:
        if normal.start_symbol == production.head:
            output.productions.add(Production(new_symbol, production.body))
        output.productions.add(production)
    return output
def test_membership(self):
    """Check ``contains`` on several grammars, including plain-string symbols."""
    # pylint: disable=too-many-locals
    useless = Variable("USELESS")
    start = Variable("S")
    nt_b = Variable("B")
    sym_a = Terminal("a")
    sym_b = Terminal("b")
    sym_c = Terminal("c")

    nested = Production(start, [sym_a, start, nt_b])
    first_rules = {
        nested,
        Production(useless, [sym_a, start, nt_b]),
        Production(start, [useless]),
        Production(nt_b, [sym_b]),
        Production(useless, []),
    }
    grammar = CFG({useless, start}, {sym_a, sym_b}, start, first_rules)
    members = (
        [Epsilon()],
        [sym_a, sym_b],
        [sym_a, sym_a, sym_b, sym_b],
        [sym_a, sym_a, sym_a, sym_b, sym_b, sym_b],
    )
    non_members = (
        [sym_a, sym_b, sym_b],
        [sym_a, sym_b, sym_c, sym_b],
        [sym_a, sym_a, sym_a, sym_b, sym_b],
    )
    for word in members:
        self.assertTrue(grammar.contains(word))
    for word in non_members:
        self.assertFalse(grammar.contains(word))

    # Second grammar: S -> a S B | c.
    second_rules = {nested, Production(start, [sym_c])}
    grammar = CFG({start}, {sym_a, sym_b, sym_c}, start, second_rules)
    self.assertFalse(grammar.contains([Epsilon()]))

    # Third grammar, built once from objects and once from plain strings.
    nt_a = Variable("A")
    third_rules = {
        Production(start, [nt_a, nt_b]),
        Production(nt_a, [nt_a, nt_b]),
        Production(nt_a, [sym_a]),
        Production(nt_b, [sym_b]),
    }
    typed = CFG({nt_a, nt_b, start}, {sym_a, sym_b}, start, third_rules)
    self.assertTrue(typed.contains([sym_a, sym_b, sym_b]))
    untyped = CFG({"A", "B", "S"}, {"a", "b"}, "S", third_rules)
    self.assertTrue(untyped.contains(["a", "b", "b"]))
def regex_to_grammar_productions(regex, head):
    """Translate a regex into right-linear productions rooted at ``head``.

    The regex is converted with ``to_epsilon_nfa().minimize()``. Each
    automaton state gets a fresh variable named
    ``<head.value>#REGEX#<get_new_var_num()>``. The emitted productions are:

    * ``head -> V_start`` for every start state,
    * ``V_from -> x V_to`` for every transition labelled ``x``,
    * ``V_final -> (empty)`` for every final state.

    A label equal to ``EPS_SYM`` becomes ``Epsilon``, an upper-case label
    becomes a variable and any other label becomes a terminal.

    Start and final productions are now emitted independently of the
    transitions. Previously an automaton without transitions produced no
    productions at all, and a final state that no transition leads to
    never received its empty production. Variables and terminals are now
    built from the label's ``value`` rather than from the symbol object.

    :param regex: object providing ``to_epsilon_nfa()``
    :param head: variable the regex is attached to
    :return: set of productions
    """
    enfa = regex.to_epsilon_nfa().minimize()
    transitions = enfa._transition_function._transitions

    # One fresh, uniquely named variable per automaton state.
    state_vars = {}
    for state in enfa.states:
        state_vars[state] = Variable(
            '%s#REGEX#%s' % (head.value, get_new_var_num()))

    production_set = set()
    for start_state in enfa.start_states:
        production_set.add(Production(head, [state_vars[start_state]]))
    for final_state in enfa.final_states:
        production_set.add(Production(state_vars[final_state], []))

    for head_state in transitions:
        outgoing = transitions[head_state]
        for sym in outgoing:
            body_state = outgoing[sym]
            label = sym.value
            if label == EPS_SYM:
                first = Epsilon()
            elif label.isupper():
                first = Variable(label)
            else:
                first = Terminal(label)
            production_set.add(
                Production(state_vars[head_state],
                           [first, state_vars[body_state]]))
    return production_set
def from_text(cls, text: List[str]):
    """Build an instance from grammar lines of the form ``HEAD REGEX``.

    Each line is split on its first space; ``eps`` in the body is replaced
    by ``epsilon`` before the body is parsed as a ``Regex``. The first head
    becomes the start symbol, and a line without a body also records an
    empty production. Every line's regex is turned into a minimised DFA.
    All DFA states are numbered consecutively, and every transition sets
    one cell in the boolean matrix stored under its label in the graph.

    :param text: grammar lines
    :return: ``cls(graph, head_by_start_final_pair, eps_productions, start)``
    """
    first_head = None
    empty_rules = []
    automata = []
    for raw_line in text:
        pieces = raw_line.strip().split(' ', 1)
        head_name, body_parts = pieces[0], pieces[1:]
        regex = Regex(' '.join(body_parts).replace('eps', 'epsilon'))
        head = Variable(head_name)
        if first_head is None:
            first_head = head
        if not body_parts:
            empty_rules.append(Production(head, []))
        automaton = regex.to_epsilon_nfa().to_deterministic().minimize()
        automata.append((head, automaton))

    import wrappers.GraphWrapper
    rfa_graph = wrappers.GraphWrapper.empty()
    rfa_graph.matrix_size = sum(
        len(automaton.states) for _, automaton in automata)
    rfa_graph.vertices = set()
    empty_matrix = Matrix.sparse(types.BOOL, rfa_graph.matrix_size,
                                 rfa_graph.matrix_size)

    head_by_start_final_pair = {}
    next_index = 0
    for head, automaton in automata:
        transitions = automaton._transition_function._transitions
        # Give every state of this automaton the next free global index.
        index_of = {
            state: number
            for number, state in enumerate(automaton.states,
                                           start=next_index)
        }
        next_index += len(index_of)
        rfa_graph.vertices.update(index_of.values())

        for start_state in automaton.start_states:
            rfa_graph.start_states.add(index_of[start_state])
        for final_state in automaton.final_states:
            final_index = index_of[final_state]
            rfa_graph.final_states.add(final_index)
            pair = (index_of[automaton.start_state], final_index)
            head_by_start_final_pair[pair] = head.value

        for state_from in transitions:
            for edge_symb in transitions[state_from]:
                state_to = transitions[state_from][edge_symb]
                matrix = rfa_graph.label_to_bool_matrix.setdefault(
                    edge_symb, empty_matrix.dup())
                matrix[index_of[state_from], index_of[state_to]] = True

    return cls(rfa_graph, head_by_start_final_pair, empty_rules, first_head)
def prod_from_regex(head, regex, python_regex=False, nonterms_upper=True):
    """Translate a regex string into right-linear productions for ``head``.

    The string is parsed with ``Regex.from_python_regex`` when
    ``python_regex`` is true and with ``Regex`` otherwise, then converted
    with ``to_epsilon_nfa().minimize()``. Each automaton state gets a
    variable ``State<n>``, where ``n`` comes from the module-level
    ``state_counter``, incremented once per state. The emitted
    productions are:

    * ``head -> V_start`` for every start state,
    * ``V_from -> x V_to`` for every transition labelled ``x``,
    * ``V_final -> (empty)`` for every final state.

    Empty productions are now emitted for every final state. Previously
    only final states reached by some transition got one, so a start
    state that is also final (as for ``a | eps``) lost the empty word.

    :param head: variable the regex is attached to
    :param regex: regex source string
    :param python_regex: choose the Python-regex parser
    :param nonterms_upper: when true, upper-case labels become variables;
        otherwise every label other than ``'eps'`` becomes a terminal
    :return: set of productions
    """
    global state_counter

    if python_regex:
        regex = Regex.from_python_regex(regex)
    else:
        regex = Regex(regex)
    enfa = regex.to_epsilon_nfa().minimize()

    state_to_var = {}
    for state in enfa.states:
        state_counter += 1
        state_to_var[state] = Variable(f'State{state_counter}')

    production_set = set()
    for start_state in enfa.start_states:
        production_set.add(Production(head, [state_to_var[start_state]]))
    for final_state in enfa.final_states:
        production_set.add(Production(state_to_var[final_state], []))

    for head_state, transition in enfa.to_dict().items():
        for symbol, body_state in transition.items():
            label = symbol.value
            if label == 'eps':
                first = Epsilon()
            elif nonterms_upper and label.isupper():
                first = Variable(label)
            else:
                first = Terminal(label)
            production_set.add(
                Production(state_to_var[head_state],
                           [first, state_to_var[body_state]]))
    return production_set
def test_intersection(self):
    """Check the intersection of a grammar with the regex ``a*b*``."""
    regex = Regex("a*b*")
    automaton = regex.to_epsilon_nfa().to_deterministic()
    in_a = Symbol("a")
    in_b = Symbol("b")
    self.assertTrue(automaton.accepts([in_a, in_a, in_b, in_b]))
    self.assertFalse(automaton.accepts([in_b, in_b, in_a]))

    sym_a = Terminal("a")
    sym_b = Terminal("b")
    start = Variable("S")
    rules = {
        Production(start, [sym_a, start, sym_b]),
        Production(start, [sym_b, start, sym_a]),
        Production(start, []),
    }
    grammar = CFG(productions=rules, start_symbol=start)
    balanced = [sym_a, sym_a, sym_b, sym_b]
    unbalanced = [sym_a, sym_a, sym_b]
    self.assertTrue(grammar.contains(balanced))
    self.assertFalse(grammar.contains(unbalanced))

    intersected = grammar.intersection(regex)
    self.assertTrue(intersected.contains(balanced))
    self.assertFalse(intersected.contains(unbalanced))
    self.assertTrue(intersected.contains([]))
def __init__(self, start_symbol=None, productions=None):
    """Build the normal form of the given grammar and index its pair rules.

    A ``CFG`` is created from the arguments and converted with
    ``to_normal_form()``. If the source grammar generates epsilon, an
    empty production for the start symbol is added to the normal form.
    ``pair_productions`` collects the productions whose body has exactly
    two symbols.

    :param start_symbol: start symbol of the source grammar
    :param productions: productions of the source grammar
    """
    source = CFG(start_symbol=start_symbol, productions=productions)
    normal = source.to_normal_form()
    if source.generate_epsilon():
        # Needed for language preservation.
        empty_rule = Production(normal._start_symbol, [])
        normal._productions.add(empty_rule)
    self.pair_productions = {
        rule for rule in normal._productions if len(rule.body) == 2
    }
    super(GrammarCNF, self).__init__(
        start_symbol=normal._start_symbol,
        productions=normal._productions,
    )
def regex_to_production(regex, head):
    """Translate a regex into right-linear productions rooted at ``head``.

    The regex is converted with ``to_epsilon_nfa().minimize()``. Each
    automaton state gets a fresh variable named
    ``<head.value>#REGEX#<get_new_var_num()>``. The emitted productions are:

    * ``head -> V_start`` for every start state,
    * ``V_from -> x V_to`` for every transition labelled ``x``,
    * ``V_final -> (empty)`` for every final state.

    A label equal to ``EPS_SYM`` becomes ``Epsilon``, an upper-case label
    becomes a variable and any other label becomes a terminal.

    Start and final productions are now emitted independently of the
    transitions. Previously an automaton without transitions produced no
    productions at all, and a final state that no transition leads to
    never received its empty production. Variables and terminals are now
    built from the label's ``value`` rather than from the symbol object.

    :param regex: object providing ``to_epsilon_nfa()``
    :param head: variable the regex is attached to
    :return: set of productions
    """
    enfa = regex.to_epsilon_nfa().minimize()
    transitions = enfa._transition_function._transitions

    # One fresh, uniquely named variable per automaton state.
    state_vars = {}
    for state in enfa.states:
        state_vars[state] = Variable(
            '%s#REGEX#%s' % (head.value, get_new_var_num()))

    production_set = set()
    for start_state in enfa.start_states:
        production_set.add(Production(head, [state_vars[start_state]]))
    for final_state in enfa.final_states:
        production_set.add(Production(state_vars[final_state], []))

    for head_state in transitions:
        outgoing = transitions[head_state]
        for symbol in outgoing:
            body_state = outgoing[symbol]
            label = symbol.value
            if label == EPS_SYM:
                first = Epsilon()
            elif label.isupper():
                first = Variable(label)
            else:
                first = Terminal(label)
            production_set.add(
                Production(state_vars[head_state],
                           [first, state_vars[body_state]]))
    return production_set