def __init__(self, X=None, D=None, acsiom=None, P=None):
    """
    Build a grammar.

    X      : iterable of terminal symbols (optional)
    D      : iterable of nonterminal symbols (optional)
    acsiom : start symbol (axiom) of the grammar (optional)
    P      : mapping of production rules (optional)
    """
    # FIX: the old defaults X={}, D={}, P={} were mutable objects shared
    # between every call; None sentinels give identical behavior for
    # explicit arguments without that hazard.
    self.X = Set()
    self.D = Set()
    self.add_terminal_symbols(X if X is not None else {})
    self.add_nonterminal_symbols(D if D is not None else {})
    self.set_acsiom(acsiom)
    self.set_prod_rules(P if P is not None else {})
def toNFA(self):
    """
    Convert this (regular) Grammar to an NFA.

    1. rule A -> aB converts to δ[Ā, a] = B̄
    2. rule A -> a  converts to δ[Ā, a] = qf
    3. the barred start state is accepting if there is a rule S -> ε
    """
    import NFA
    # one barred state per nonterminal, plus the single final state qf
    Q = Set(f"{A}̄" for A in self.N).union(Set("qf"))
    q0 = f"{self.S}̄"
    δ = NFA.δ(Q=Q, Σ=self.Σ)
    # FIX: the accepting set previously hard-coded the literal "S̄" and
    # looked up self.P["S"]; that only worked when the start symbol was
    # actually named "S". Use self.S instead.
    F = Set(f"{self.S}̄", "qf") if "ε" in self.P[self.S] else Set("qf")
    M = NFA(Q, self.Σ, δ, q0, F)
    for A in self.N:
        for rule in self.P[A]:
            if len(rule) == 2:
                # A -> aB
                a, B = rule
                δ[f"{A}̄", a].add(f"{B}̄")
            elif len(rule) == 1 and rule != "ε":
                # A -> a
                a = rule
                δ[f"{A}̄", a].add("qf")
    return M
def remove_ε(self):
    """Remove ε-rules (A → ε) from the grammar.

    For every remaining rule option, also adds every variant obtainable by
    erasing any subset of the nullable nonterminals (self.Nε).  If some
    rule of the start symbol consists entirely of nullable symbols, a
    fresh start symbol S' with S' → ε | S is introduced.

    Returns a new CFG without ε-rules (except possibly S' → ε).
    """
    import CFG
    import re
    Nε = self.Nε  # nullable nonterminals (those that can derive ε)
    N = self.N
    S = self.S
    P = CFG.P()
    for A in self.N:
        # drop the explicit ε option from A's rules
        P[A] = self.P[A] - Set("ε")
        for opt in P[A]:
            # for each subset of nullable symbols, add the variant of the
            # option with those symbols (and ε) erased
            # NOTE(review): erasure uses a regex character class built from
            # the subset, which assumes single-character symbol names —
            # confirm against the project's symbol conventions.
            for subset in all_subsets(Nε):
                new_opt = re.sub(f"[ε{''.join(subset)}]", "", str(opt))
                if new_opt != "":
                    P[A].add(new_opt)
    # if the start symbol can derive ε, add a fresh start symbol S'
    for opt in self.P[self.S]:
        if all(X in Nε for X in opt):
            N |= (Set("S'"))
            S = "S'"
            P["S'"] = f"ε | {self.S}"
            break
    G = CFG(N, self.Σ, P, S)
    return G
def __init__(self, worker_num=10, chunk_size=10000, log_interval=600, data_dir='data', log_dir='log'):
    """
    Set up the crawler: caches, work queues, parser, and worker threads.

    worker_num   : number of scraper threads to create
    chunk_size   : number of results per output chunk
    log_interval : seconds between periodic log timer firings
    data_dir     : directory for scraped data (created if missing)
    log_dir      : directory for log output (created if missing)
    """
    self.chunk_size = chunk_size
    self.log_interval = log_interval
    self.urls = Queue()          # frontier of URLs to scrape
    self.results = Queue()       # scraped results awaiting the writer
    self.url_cache = Set()       # URLs already seen
    self.name_cache = Set()      # page names already seen
    self.black_urls = Set()      # URLs that failed / were rejected
    self.black_cache = Dict()
    self.chunk_num = 0
    self.parser = HtmlParser(home='https://baike.baidu.com')
    self.last = 0
    self.state = 1
    # FIX: `if not exists: mkdir` was racy (TOCTOU) and failed if a parent
    # directory was missing; makedirs(exist_ok=True) is atomic in intent.
    os.makedirs(data_dir, exist_ok=True)
    os.makedirs(log_dir, exist_ok=True)
    self.data_dir = data_dir
    self.log_dir = log_dir
    self.writer = Thread(target=self._write)
    self.logger = Timer(log_interval, self._log)
    self.spiders = [Thread(target=self._scrap) for _ in range(worker_num)]
def _reduce(self, imax=None):
    """Minimize the DFA by iteratively refining groups of states.

    imax : optional cap on the number of refinement iterations
           (None means refine until the fixpoint is reached).

    Returns a new DFA whose states are the final group labels
    (roman numerals produced by utils.roman).
    """
    from utils import roman
    import DFA

    def init(i):
        """ split into two groups I - non terminal II - terminal """
        # group I = non-accepting states, group II = accepting states
        groups = {
            qi: roman(1) if qi not in self.F else roman(2)
            for qi in self.Q
        }
        δ = DFA.δ(Q=self.Q, Σ=self.Σ)
        for qi in (self.Q - self.F) + self.F:
            for a in self.Σ:
                # record which *group* each transition lands in
                δ[qi, a] = groups.get(self.δ[qi, a])
        if imax is None or i < imax:
            return step(i + 1, groups, δ)
        return groups, δ

    def step(i, groups, δ):
        # one refinement pass: states in the same group that hit the same
        # pattern of target groups stay together; others are split off
        new_groups = {}
        force_move = 0
        numeratd_patterns = []
        # NOTE(review): this loop variable shadows the recursion-depth
        # parameter `i`; the recursive call below therefore receives the
        # loop's final value, not depth+1 — confirm this is intended.
        for i in range(len(Set(groups.values()))):
            for qi, val in groups.items():
                target = δ.group()[qi]
                if val == roman(i + 1):
                    if target not in numeratd_patterns:
                        numeratd_patterns.append(target)
                    index = numeratd_patterns.index(target)
                    new_groups[qi] = roman(index + 1 + force_move)
            # shift the numbering so the next group's patterns get fresh labels
            force_move += len(numeratd_patterns)
            numeratd_patterns = []
        new_δ = DFA.δ()
        for qi in self.Q:
            for a in self.Σ:
                new_δ[qi, a] = new_groups.get(self.δ[qi, a])
        # recurse until both the grouping and its transitions stabilize
        # (or the iteration cap imax is reached)
        if (imax is None or i < imax) and groups != new_groups and δ != new_δ:
            return step(i + 1, new_groups, new_δ)
        return new_groups, new_δ

    groups, new_δ = init(0)
    # assemble the minimized automaton over the group labels
    Q = Set(groups.values())
    q0 = groups[self.q0]
    δ = DFA.δ()
    for (qi, a), target in new_δ.items():
        δ[groups[qi], a] = target
    F = Set(groups[qf] for qf in self.F)
    return DFA(Q, self.Σ, δ, q0, F)
def test_grammar():
    """Smoke test: build a small regular grammar and convert it to an NFA."""
    productions = G.P({
        "S": "aA | bC | a | ε",
        "A": "bB | aA | b | c",
        "B": "aB | bC | aC | cA | c",
        "C": "a | b | aA | bB"
    })
    grammar = G(Set("S", "A", "C", "B"), Set("a", "b", "c"), productions, "S")
    automaton = grammar.toNFA()
def _canonize(self):
    """Rename the automaton's states to canonical letters A, B, C, …
    in the order they are reached from the initial state.

    Returns a new DFA over the renamed states.
    """
    import DFA
    # one fresh letter per state
    Q = Set(chr(ord('A') + i) for i in range(len(self.Q)))
    # map original states to letters in BFS-from-q0 order
    # NOTE(review): if self.δ contains transitions from states that are NOT
    # reachable from q0, letterMapping[qi] below raises KeyError — presumably
    # δ only ever holds reachable states; confirm.
    letterMapping = dict(zip(self.δ.reachables(self.q0), Q))
    δ = DFA.δ()
    for (qi, a), target in self.δ.items():
        δ[letterMapping[qi], a] = letterMapping[target]
    q0 = letterMapping[self.q0]
    F = Set(letterMapping[qf] for qf in self.F)
    return DFA(Q, self.Σ, δ, q0, F)
def toTopDownAnalyzer(self):
    """Build a top-down (LL-style) pushdown analyzer for this grammar.

    Expansion moves rewrite a stack nonterminal by each of its rules;
    comparison moves pop a stack terminal that matches the input symbol.
    """
    import PDA
    grammar = self.remove_left_recursion()
    transitions = PDA.δ()
    analyzer = TopDownAnalyzer(
        Set("q"), grammar.Σ, grammar.N.union(grammar.Σ),
        transitions, "q", grammar.S, Set())
    # expansion: δ(q, ε, A) ∋ (q, α) for every rule A → α
    for nonterminal in grammar.N:
        for production in grammar.P[nonterminal]:
            transitions["q", "ε", nonterminal].add(("q", production))
    # comparison: δ(q, a, a) ∋ (q, ε) for every terminal a
    for terminal in grammar.Σ:
        transitions["q", terminal, terminal].add(("q", "ε"))
    return analyzer
def __setitem__(self, key, value):
    """Store production rules for a nonterminal.

    A string value like "aA | b" is split on "|", stripped, and wrapped
    into Rule objects; a Set value is stored as-is.  Other types are
    silently ignored (matching the original contract).
    """
    if isinstance(value, str):
        rules = Set(Rule(option.strip()) for option in value.split("|"))
        super().__setitem__(key, rules)
    elif isinstance(value, Set):
        super().__setitem__(key, value)
def __setitem__(self, key, val):
    """Store transitions, normalizing keys and targets to strings.

    A 2-tuple key (state, symbol) stores Set(val) directly.  A longer key
    (state, sym1, sym2, …) paired with a matching-length val fans out into
    one 2-key assignment per (symN, valN) pair.
    """
    if len(key) == 2:
        pair = tuple(map(str, key))
        super().__setitem__(pair, Set(map(str, val)))
    elif len(key) > 2 and len(val) == len(key) - 1:
        source = key[0]
        for symbol, target in zip(key[1:], val):
            self.__setitem__((source, symbol), target)
def do():
    # One fixpoint iteration: extend N[i-1] with every nonterminal A having
    # a production p that, after erasing ∅ and all symbols already in
    # N[i-1], consists only of lowercase (terminal) symbols.
    # NOTE(review): the erasure uses a regex character class built from
    # N[i-1], which assumes single-character symbol names — confirm.
    nonlocal i, N
    i = i + 1
    N[i] = N[i - 1].union(
        Set(A for A in self.N for p in self.P[A]
            if re.sub("[∅" + "".join(N[i - 1]) + "]", "", str(p)).islower()))
def do():
    # One reachability iteration: extend V[i-1] with every symbol (terminal
    # or nonterminal) that occurs in some production of a nonterminal
    # already in V[i-1] (nonterminals are recognized by isupper()).
    nonlocal i, V
    i = i + 1
    V[i] = V[i - 1].union(
        Set(sym for sym in self.N.union(self.Σ)
            if any(sym in rule
                   for A in V[i - 1] if A.isupper()
                   for rule in self.P[A])))
def calc_potentials():
    # Map each nonterminal A to the set of nonterminals that can appear
    # as the first symbol of one of A's rules (left-recursion candidates).
    potentials = {A: Set() for A in N}
    for A in N:
        for rule in P[A]:
            head = rule[0]
            if head in N:
                potentials[A].add(head)
    return potentials
def remove_left_recursion(self):
    """Remove direct and indirect left recursion from the grammar.

    Classic algorithm: for each nonterminal A (in order) and each earlier-
    or-equal B, first substitute B's rules into rules A → Bγ (indirect
    case), then eliminate direct recursion A → Aα by introducing A'.

    Returns a new CFG without left recursion.
    """
    def calc_potentials():
        # A ↦ set of nonterminals that can start one of A's rules
        potentials = {}
        for A in N:
            potentials.setdefault(A, Set())
            for rule in P[A]:
                if rule[0] in N:
                    potentials[A].add(rule[0])
        return potentials

    N = self.N.copy()
    P = self.P.copy()
    for i, A in enumerate(N):
        for B in N[:i + 1]:
            # only rewrite when A actually has a rule starting with B
            if not B in calc_potentials()[A]:
                continue
            if A == B:
                # direct recursion: A → Aα | β  becomes
                # A → β | βA'  and  A' → α | αA'
                α = [rule for rule in P[A] if rule.startswith(B)]
                β = [rule for rule in P[A] if not rule.startswith(B)]
                N = Set(f"{A}'").union(N)
                P[f"{A}'"] |= Set(Rule(rule[1:]) for rule in α) | Set(
                    Rule(rule[1:] + f"{A}'") for rule in α)
                P[A] = Set(Rule(rule) for rule in β) | Set(
                    Rule(rule + f"{A}'") for rule in β)
            else:
                # indirect case: substitute each rule of B into A → Bγ
                α = [rule for rule in P[A] if rule.startswith(B)]
                β = [rule for rule in P[A] if not rule.startswith(B)]
                P[A] = Set(Rule(rule) for rule in β) | Set(
                    Rule(ruleB + rule[1:]) for rule in α for ruleB in P[B])
    return CFG(N, self.Σ.copy(), P, self.S)
def toNFA(self):
    """Convert this ε-automaton into an equivalent NFA by removing ε-steps.

    For each state qi and non-ε symbol a the new transition set is the
    ε-closure of (moves on a from qi, plus moves on a from qi's ε-closure).
    The initial state becomes accepting if its ε-closure already reaches
    an accepting state.
    """
    import NFA
    δ = NFA.δ()
    for qi in self.Q:
        for a in self.Σ:
            if a == "ε":
                continue
            # direct moves on a
            step1 = self.δ.get((qi, a), Set())
            # moves on a after first taking ε-steps from qi
            step2 = Set()
            for s in self.δ.Dε(qi):
                step2 |= self.δ.get((s, a), Set())
            # close the combined result under ε-steps
            step3 = Set()
            for s in step1 | step2:
                step3 |= self.δ.Dε(s)
            δ[qi, a] = tuple(step3)
    # ε is no longer part of the alphabet
    Σ = self.Σ - Set("ε")
    # NOTE(review): "intercept" is presumably set intersection — confirm in
    # the Set class.
    F = (self.F if self.δ.Dε(self.q0).intercept(self.F) == Set() else self.F.union(Set(self.q0)))
    return NFA(self.Q, Σ, δ, self.q0, F)
def toBottomUpAnalyzer(self):
    """Build a bottom-up (shift-reduce) pushdown analyzer for this grammar.

    Reduce moves pop a rule's right-hand side and push its left-hand side;
    shift moves push the current input terminal; the accept move fires when
    the stack holds exactly the bottom marker ⊥ plus the start symbol.
    """
    import PDA
    # L = self.remove_left_recursion()
    L = self
    ô = PDA.ô()
    M = BottomUpAnalyzer(Set("q", "r"), L.Σ, L.N | L.Σ | Set("⊥"), ô, "q", "⊥", Set("r"))
    # reduce: ô(q, ε, α) ∋ (q, A) for every rule A → α
    for A in L.N:
        for rule in L.P[A]:
            ô["q", "ε", rule].add(("q", A))
    # shift: ô(q, a, ε) ∋ (q, a) for every terminal a
    for a in L.Σ:
        ô["q", a, "ε"].add(("q", a))
    # accept: ⊥ plus the start symbol on the stack
    # FIX: previously hard-coded "⊥S", which was wrong whenever the start
    # symbol is not literally named "S" (the top-down analyzer uses L.S).
    ô["q", "ε", f"⊥{L.S}"].add(("r", "ε"))
    return M
def Dε(self, q0):
    """Return the ε-closure of state q0: q0 itself plus every state
    reachable through ε-transitions alone (BFS)."""
    from collections import deque
    # FIX: the old implementation used list.pop(0) as a queue, which is
    # O(n) per dequeue; deque.popleft() is O(1). Visit order is unchanged.
    closure = Set()
    closure.add(q0)
    queue = deque([q0])
    while queue:
        q = queue.popleft()
        for target in self.__getitem__((q, "ε")):
            if target not in closure:
                closure.add(target)
                queue.append(target)
    return closure
def __init__(self, Q, Σ, δ, q0, F):
    """
    Q : set of states
    Σ : finite alphabet
    δ : Q × Σ → Q transition function
    q0 : q0 ∈ Q initial state
    F : F ⊆ Q set of accepting states
    """
    # The parent (NFA-style) constructor expects a *set* of initial states.
    super().__init__(Q, Σ, δ, Set(q0), F)
    # A DFA has exactly one initial state: overwrite whatever the parent
    # stored with the single state, normalized to str.
    # NOTE(review): presumably the parent normalizes Q/δ keys to str too —
    # confirm, otherwise lookups on q0 could mismatch.
    self.q0 = str(q0)
def test_dfa():
    """Smoke test: minimize a small 7-state DFA and render its diagram."""
    δ = DFA.δ()
    automaton = DFA(Set(1, 2, 3, 4, 5, 6, 7), Set("a", "b"), δ, 1, Set(3, 5, 6))
    # transition table; "-" marks a dead/undefined move
    table = {
        (1, "a"): 2, (1, "b"): "-",
        (2, "a"): 3, (2, "b"): 4,
        (3, "a"): 6, (3, "b"): 5,
        (4, "a"): 3, (4, "b"): 2,
        (5, "a"): 6, (5, "b"): 3,
        (6, "a"): 2, (6, "b"): "-",
        (7, "a"): 6, (7, "b"): 1,
    }
    for (state, symbol), target in table.items():
        δ[state, symbol] = target
    minimized = automaton.minimize()
    minimized.diagram()
def remove_primitive_rules(self):
    """Remove all primitive (unit) rules of type A → B where A, B ∈ N.

    Each nonterminal A receives every non-primitive rule of every
    nonterminal reachable from A through unit rules (self.Nx(A)).
    """
    import CFG
    P = CFG.P()
    for A in self.N:
        collected = Set()
        for B in self.Nx(A):
            for rule in self.P[B]:
                if not self.isprimitive(rule):
                    collected.add(rule)
        P[A] = collected
    return CFG(self.N, self.Σ, P, self.S)
def Nx(self, x):
    """Return the unit-rule closure of x: x plus every symbol reachable
    from it through primitive rules, computed as a fixpoint."""
    current = Set(x)
    while True:
        extension = Set(rule for A in current for rule in self.P[A]
                        if self.isprimitive(rule))
        updated = current.union(extension)
        if updated == current:
            return updated
        current = updated
def reachables(self, q0):
    """Return the set of all states reachable from q0 (q0 included),
    in BFS discovery order."""
    from collections import deque
    # FIX: the old implementation used list.pop(0) as a queue (O(n) per
    # dequeue); deque.popleft() is O(1). Discovery/insertion order — which
    # callers such as _canonize rely on — is unchanged.
    seen = Set()
    seen.add(q0)
    frontier = deque([q0])
    while frontier:
        q = frontier.popleft()
        for a in self.Σ:
            # δ lookup returns None for missing transitions
            target = self.__getitem__((q, a))
            if target is not None and target not in seen:
                seen.add(target)
                frontier.append(target)
    return seen
def resolve(A, B):
    # Rewrite each of A's current rules exactly once (GNF normalization):
    # - a rule already starting with a terminal gets its remaining lowercase
    #   symbols marked with a macron so they act as nonterminal placeholders;
    # - a rule starting with B has B expanded with each of B's rules;
    # - anything else is kept as-is.
    for _ in range(len(P[A])):
        rule = list(P[A].pop(0))
        if rule[0].islower():
            for i, c in enumerate(rule[1:]):
                if c.islower() and len(c) == 1:
                    rule[i + 1] = f"{c}̄"
            rule = "".join(rule)
            P[A].add(Rule(rule))
        elif rule[0] == B:
            rules = Set()
            for rule1 in P[B]:
                rules.add(Rule(rule1 + "".join(rule[1:])))
            P[A] |= rules
        else:
            # NOTE(review): `rule` is still a list here (never re-joined), so
            # Rule receives a list — confirm Rule accepts a sequence.
            P[A].add(Rule(rule))
def V(self):
    """ get all reachable symbols """
    # fixpoint: start from the start symbol and repeatedly add every symbol
    # occurring in a production of an already-reachable nonterminal
    current = Set(self.S)
    while True:
        expanded = current.union(
            Set(sym for sym in self.N.union(self.Σ)
                if any(sym in rule
                       for A in current if A.isupper()
                       for rule in self.P[A])))
        if expanded == current:
            return expanded
        current = expanded
def Ne(self):
    """ get all normalised nonterminals """
    import re
    # fixpoint: a nonterminal is normalised once some production, with ∅ and
    # already-normalised symbols erased, is entirely lowercase (terminal)
    current = Set()
    while True:
        grown = current.union(
            Set(A for A in self.N for p in self.P[A]
                if re.sub("[∅" + "".join(current) + "]", "", str(p)).islower()))
        if grown == current:
            return grown
        current = grown
def CYK_parser(self, w):
    """CYK chart parser for the word w.

    Returns an n×n table C where C[i, j] (0-based, i ≤ j) holds the set of
    nonterminals that derive the substring w[i..j].  The word is accepted
    iff the start symbol is in C[0, n-1].
    """
    n = len(w)
    # chart of Sets, one per substring span
    C = np.empty((n, n), dtype=Set)
    for i in range(n):
        for j in range(n):
            C[i, j] = Set()
    # base case: nonterminals whose production directly matches a substring
    for d in self.P:
        for i in range(n):
            # for k in range(0, n-i, -1):
            for k in range(n-i):
                if self.check_prod_rule((d, w[i:i+k+1])):
                    C[i, i+k].add(d)
    # induction over span length m (indices here are 1-based and converted
    # to the chart's 0-based cells with the -1 offsets below)
    for m in range(2, n+1):
        for i in range(1, n-m+2):
            j = i + m - 1
            for rule in self.get_nonterm_prod_rules():
                # try every split point k of the span [i, j]
                for k in range(i, j):
                    lrule, rrule = rule[1][0], rule[1][1]
                    if lrule in C[i-1, k-1] and rrule in C[k, j-1]:
                        C[i-1, j-1].add(rule[0])
    return C
def step(i, groups, δ):
    # One refinement pass of the minimization: states in the same group
    # whose transitions hit the same pattern of target groups stay together;
    # the rest are split into fresh groups.
    new_groups = {}
    force_move = 0
    numeratd_patterns = []
    # NOTE(review): this loop variable shadows the recursion-depth parameter
    # `i`; the recursive call below therefore passes the loop's final value,
    # not depth+1 — confirm whether that is intended.
    for i in range(len(Set(groups.values()))):
        for qi, val in groups.items():
            target = δ.group()[qi]
            if val == roman(i + 1):
                if target not in numeratd_patterns:
                    numeratd_patterns.append(target)
                index = numeratd_patterns.index(target)
                new_groups[qi] = roman(index + 1 + force_move)
        # shift numbering so the next group's patterns get fresh labels
        force_move += len(numeratd_patterns)
        numeratd_patterns = []
    new_δ = DFA.δ()
    for qi in self.Q:
        for a in self.Σ:
            new_δ[qi, a] = new_groups.get(self.δ[qi, a])
    # recurse until both grouping and transitions stabilize (or imax is hit)
    if (imax is None or i < imax) and groups != new_groups and δ != new_δ:
        return step(i + 1, new_groups, new_δ)
    return new_groups, new_δ
def Nε(self):
    """ get all states that can turn into ε """
    import re
    # fixpoint: a nonterminal is nullable once some production, with ∅, ε
    # and already-nullable symbols erased, becomes the empty string
    current = Set()
    while True:
        grown = current.union(
            Set(A for A in self.N for p in self.P[A]
                if re.sub("[∅ε" + "".join(current) + "]", "", str(p)) == ""))
        if grown == current:
            return grown
        current = grown
def toGNF(self):
    """Convert the grammar to Greibach normal form:
    each rule must be of format A → aB1B2B3...Bn
    (a ∈ Σ, B1,B2,B3,...,Bn ∈ N)
    """
    import CFG
    # left recursion must be eliminated before Greibach normalization
    G = self.remove_left_recursion()
    N = Set(reversed(G.N.copy()))
    P = G.P.copy()

    def resolve(A, B):
        # Rewrite each of A's current rules exactly once:
        # - a rule already starting with a terminal gets its remaining
        #   lowercase symbols marked with a macron (placeholder nonterminals);
        # - a rule starting with B has B expanded with each of B's rules;
        # - anything else is kept as-is.
        for _ in range(len(P[A])):
            rule = list(P[A].pop(0))
            if rule[0].islower():
                for i, c in enumerate(rule[1:]):
                    if c.islower() and len(c) == 1:
                        rule[i + 1] = f"{c}̄"
                rule = "".join(rule)
                P[A].add(Rule(rule))
            elif rule[0] == B:
                rules = Set()
                for rule1 in P[B]:
                    rules.add(Rule(rule1 + "".join(rule[1:])))
                P[A] |= rules
            else:
                # NOTE(review): `rule` is still a list here (never re-joined);
                # confirm Rule accepts a sequence.
                P[A].add(Rule(rule))

    # process nonterminals in reversed order, substituting earlier ones
    for i, A in enumerate(N):
        for B in N[:i + 1]:
            resolve(A, B)
    return CFG(N, self.Σ, P, self.S)
def turn_to_HomskyForm(gramm):
    """Convert a grammar to Chomsky ("Homsky") normal form in five passes:
    1) break long rules, 2) remove ε-rules, 3) remove chain (unit) rules,
    4) remove useless symbols, 5) replace terminals inside binary rules.

    NOTE(review): despite the comment below, no copy is made — the grammar
    is modified in place and the same object is returned; confirm callers
    expect that.
    """
    # creating new grammar as copy of argument in Homsky form:
    new_grammar = gramm
    #1) delete long rules:
    for j in range(len(new_grammar.P)):
        keys_list = list(new_grammar.P)
        rules = new_grammar.P[keys_list[j]]
        for m in range(len(rules)):
            if type(rules[m]) == tuple and len(rules[m]) > 2:
                k = len(rules[m])
                # chain the tail of the long rule through fresh nonterminals
                for i in range(1, k-2):
                    newNonTerminal = chr(65+j) + str(i+m)
                    new_grammar.D.add(newNonTerminal)
                    new_grammar.D.add(chr(65+j) + str(i+1+m))
                    # NOTE(review): the right-hand side uses str(i+1) while
                    # the fresh name uses str(i+m); for m > 0 the chain may
                    # not link up — confirm.
                    new_grammar.P[newNonTerminal] = [(rules[m][i], chr(65+j) + str(i+1))]
                newlastNonTerminal = chr(65+j) + str(k-2+m)
                new_grammar.P[newlastNonTerminal] = [((rules[m][k-2]), (rules[m][k-1]))]
                new_grammar.D.add(newlastNonTerminal)
                rules.append((rules[m][0], chr(65+j) + str(1+m)))
                rules.remove(rules[m])
    # 2) delete epsilon-rules:
    # to find rules A => eps:
    S = Set() #set of espilon non-Terms
    for element in new_grammar.P.copy():
        for rule in new_grammar.P[element]:
            if rule == 'eps':
                S.add(element)
    s = S
    # NOTE(review): `s` and `S` alias the same object, so `s == S` is always
    # true and this closure loop runs exactly once; a .copy() was probably
    # intended — confirm.
    while True:
        S = s
        for element in new_grammar.P.copy():
            for rule in new_grammar.P[element]:
                if type(rule) == tuple:
                    if rule[0] in S and rule[1] in S:
                        s.add(element)
                else:
                    if rule in S:
                        s.add(element)
        if s == S:
            break
    #now Eliminate them!
    new_P = new_grammar.P
    for element in new_P.copy():
        rules = new_P[element]
        # NOTE(review): `rules` is appended to while being iterated, so the
        # newly added shortened rules are themselves revisited — confirm
        # this cascading behavior is intended.
        for rule in rules:
            if type(rule) == tuple:
                for symbol in rule:
                    if symbol in S and len(rule) > 1:
                        temp = list(rule)
                        temp.remove(symbol)
                        new_rule = tuple(temp)
                        rules.append(new_rule)
        delete_reps = set(rules)
        new_P[element] = list(delete_reps)
    for element in new_P.copy():
        if 'eps' in new_P[element]:
            new_P[element].remove('eps')
            if new_P[element] == []:
                del new_P[element]
    new_grammar.P = new_P
    # # # # 3) delete the chain prod rules:
    # # # to find unit pairs:
    def unit_pairs_set(D, P):
        # collect pairs (A, B) such that A derives B through unit rules
        the_set = list((i, i) for i in D)
        for element in P:
            for rule in P[element]:
                if rule in D:
                    for item in the_set:
                        if item[1] == element and (item[0], rule[0]) not in the_set:
                            if type(rule) == tuple:
                                the_set.append((item[0], rule[0]))
                            else:
                                the_set.append((item[0], rule))
        return the_set
    pairs_set = unit_pairs_set(new_grammar.D, new_grammar.P)
    # print(pairs_set)
    # inline each unit pair: replace A → B by all of B's rules
    for pair in pairs_set:
        if pair[0] != pair[1]:
            # tupl = list()
            # tupl.append(pair[1])
            # tupl = tuple(tupl)
            # print(pair)
            new_grammar.P[pair[0]].remove(pair[1])
            new_grammar.P[pair[0]] = new_grammar.P[pair[0]] + new_grammar.P[pair[1]]
            new_grammar.P[pair[0]] = list(set(new_grammar.P[pair[0]]))
    # unwrap 1-tuples left over from the inlining
    for element in new_grammar.P:
        for rule in new_grammar.P[element]:
            if type(rule) == tuple and len(rule) == 1:
                new_grammar.P[element][new_grammar.P[element].index(rule)] = rule[0]
    # # #4) delete useless elems:
    # # # # # delete non-generating non-terms
    set_of_generatings = set()
    set_of_generatings.add(new_grammar.acsiom)
    for element in new_grammar.P:
        for rule in new_grammar.P[element]:
            if type(rule) == tuple:
                # print(rule)
                if rule[0] not in new_grammar.D and rule[1] not in new_grammar.D:
                    set_of_generatings.add(element)
            else:
                if rule not in new_grammar.D:
                    set_of_generatings.add(element)
    # NOTE(review): as above, `s` aliases `set_of_generatings`, so this
    # fixpoint loop exits after a single pass — confirm.
    while True:
        s = set_of_generatings
        for element in new_grammar.P:
            for rule in new_grammar.P[element]:
                if type(rule) == tuple:
                    if (rule[0] in set_of_generatings or rule[0] in new_grammar.X) and (rule[1] in set_of_generatings or rule[1] in new_grammar.X):
                        s.add(element)
                else:
                    if rule in set_of_generatings:
                        s.add(element)
        if s == set_of_generatings:
            break
    #delete ureachable non-Terms
    #algorithm of search
    found_elements = Set()
    found_elements.add(new_grammar.acsiom)
    # NOTE(review): `found_elements` is mutated while being iterated; if Set
    # subclasses the built-in set this raises RuntimeError in CPython —
    # confirm Set's iteration semantics.
    for item in found_elements:
        if item in new_grammar.P:
            for rule in new_grammar.P[item]:
                if type(rule) == tuple:
                    for term in rule:
                        if term in new_grammar.D:
                            found_elements.add(term)
                else:
                    if rule in new_grammar.D:
                        # NOTE(review): adds `term` (a leftover from an
                        # earlier iteration, possibly unbound) instead of
                        # `rule` — looks like a bug; confirm.
                        found_elements.add(term)
    #delete them all!
    for element in new_grammar.P:
        for rule in new_grammar.P[element]:
            if type(rule) == tuple:
                for term in rule:
                    if term not in set_of_generatings and term in new_grammar.D:
                        new_list = list(rule)
                        new_list.remove(term)
                        new_grammar.P[element].remove(rule)
                        # NOTE(review): only new_list[0] is re-appended even
                        # when more than one symbol remains — confirm.
                        if not(len(new_list) == 1 and new_list[0] == element):
                            new_grammar.P[element].append(new_list[0])
            else:
                if rule not in set_of_generatings and rule in new_grammar.D:
                    new_grammar.P[element].remove(rule)
    for element in new_grammar.P.copy():
        if element not in found_elements:
            del new_grammar.P[element]
    # # #last shtrikh:
    # 5) replace terminals inside binary rules with fresh Z-nonterminals
    for item in new_grammar.X:
        S1 = 'Z' + str(new_grammar.X.index(item))
        for element in new_grammar.P.copy():
            for rule in new_grammar.P[element]:
                if type(rule) == tuple and item in rule and len(rule) > 1:
                    new_list = list(rule)
                    for i in range(2):
                        if rule[i] == item:
                            new_list[i] = S1
                    new_grammar.D.add(S1)
                    new_grammar.P[element][new_grammar.P[element].index(rule)] = tuple(new_list)
                    new_grammar.P[S1] = [item]
    return new_grammar