def __init__(self, X=None, D=None, acsiom=None, P=None):
    """Initialize the grammar.

    X      : terminal symbols to register
    D      : nonterminal symbols to register
    acsiom : start symbol (axiom)
    P      : production rules

    FIX: the defaults were shared mutable dicts (``X={}`` ...); they are
    now created per call. Passing ``{}`` explicitly still behaves the same.
    """
    self.X = Set()
    self.D = Set()
    self.add_terminal_symbols({} if X is None else X)
    self.add_nonterminal_symbols({} if D is None else D)
    self.set_acsiom(acsiom)
    self.set_prod_rules({} if P is None else P)
def toNFA(self):
    """
    convert Grammar to NFA
    1. rule A -> aB convert to δ[Ā, a] = B̄
    2. rule A -> a  convert to δ[Ā, a] = qf
    3. S̄ ∈ F if there is a rule S -> ε
    """
    import NFA
    Q = Set(f"{A}̄" for A in self.N).union(Set("qf"))
    q0 = f"{self.S}̄"
    δ = NFA.δ(Q=Q, Σ=self.Σ)
    # BUG FIX: the start symbol was hard-coded as "S" (both in the final
    # state "S̄" and in the P["S"] lookup); use self.S so grammars with a
    # differently named axiom convert correctly.
    F = Set(q0, "qf") if "ε" in self.P[self.S] else Set("qf")
    M = NFA(Q, self.Σ, δ, q0, F)
    for A in self.N:
        for rule in self.P[A]:
            if len(rule) == 2:
                # rule of shape aB: terminal then nonterminal
                a, B = rule
                δ[f"{A}̄", a].add(f"{B}̄")
            elif len(rule) == 1 and rule != "ε":
                # rule of shape a: terminal only, go to the final state
                a = rule
                δ[f"{A}̄", a].add("qf")
    return M
def remove_ε(self):
    """Return an equivalent grammar without ε-rules.

    For every production, add variants with each subset of erasable
    nonterminals (Nε) deleted; if the start symbol is erasable, introduce
    a fresh axiom S' with S' → ε | S.
    """
    import CFG
    import re
    Nε = self.Nε
    N = self.N
    S = self.S
    P = CFG.P()
    for A in self.N:
        P[A] = self.P[A] - Set("ε")
        # BUG FIX: iterate over a snapshot — the loop body adds new options
        # to P[A], and mutating a set while iterating it is invalid.
        for opt in list(P[A]):
            for subset in all_subsets(Nε):
                new_opt = re.sub(f"[ε{''.join(subset)}]", "", str(opt))
                if new_opt != "":
                    P[A].add(new_opt)
    for opt in self.P[self.S]:
        # if some S-production consists only of erasable symbols,
        # the language contains ε: add a fresh axiom S'
        if all(X in Nε for X in opt):
            N |= (Set("S'"))
            S = "S'"
            P["S'"] = f"ε | {self.S}"
            break
    G = CFG(N, self.Σ, P, S)
    return G
def __init__(self, worker_num=10, chunk_size=10000, log_interval=600,
             data_dir='data', log_dir='log'):
    """Crawler coordinator: spider threads scrape, a writer thread flushes
    results in chunks, and a timer logs progress periodically.

    worker_num   : number of scraper threads
    chunk_size   : results per output chunk
    log_interval : seconds between progress log runs
    data_dir     : directory for scraped data (created if missing)
    log_dir      : directory for log output (created if missing)
    """
    self.chunk_size = chunk_size
    self.log_interval = log_interval
    self.urls = Queue()        # frontier of URLs to visit
    self.results = Queue()     # scraped items awaiting the writer
    self.url_cache = Set()     # URLs already seen
    self.name_cache = Set()
    self.black_urls = Set()    # rejected / failed URLs
    self.black_cache = Dict()
    self.chunk_num = 0
    self.parser = HtmlParser(home='https://baike.baidu.com')
    self.last = 0
    self.state = 1
    # FIX: os.makedirs(..., exist_ok=True) avoids the exists()/mkdir()
    # race (TOCTOU) and also creates missing parent directories.
    os.makedirs(data_dir, exist_ok=True)
    os.makedirs(log_dir, exist_ok=True)
    self.data_dir = data_dir
    self.log_dir = log_dir
    self.writer = Thread(target=self._write)
    self.logger = Timer(log_interval, self._log)
    self.spiders = [Thread(target=self._scrap) for _ in range(worker_num)]
def _reduce(self, imax=None):
    """Minimize the DFA by iterative partition refinement.

    Starts from the accepting / non-accepting split (init) and refines
    groups until a fixpoint (or until imax iterations), then rebuilds a
    DFA over the group labels.
    """
    from utils import roman
    import DFA

    def init(i):
        """ split into two groups I - non terminal II - terminal """
        groups = {
            qi: roman(1) if qi not in self.F else roman(2)
            for qi in self.Q
        }
        δ = DFA.δ(Q=self.Q, Σ=self.Σ)
        for qi in (self.Q - self.F) + self.F:
            for a in self.Σ:
                δ[qi, a] = groups.get(self.δ[qi, a])
        if imax is None or i < imax:
            return step(i + 1, groups, δ)
        return groups, δ

    def step(i, groups, δ):
        new_groups = {}
        force_move = 0
        numeratd_patterns = []
        # BUG FIX: this loop used `i` as its variable, shadowing the
        # recursion-depth parameter, so the imax bound below compared a
        # group index instead of the iteration count.
        for g in range(len(Set(groups.values()))):
            for qi, val in groups.items():
                target = δ.group()[qi]
                if val == roman(g + 1):
                    if target not in numeratd_patterns:
                        numeratd_patterns.append(target)
                    index = numeratd_patterns.index(target)
                    new_groups[qi] = roman(index + 1 + force_move)
            # offset the next group's numbering past this group's patterns
            force_move += len(numeratd_patterns)
            numeratd_patterns = []
        new_δ = DFA.δ()
        for qi in self.Q:
            for a in self.Σ:
                new_δ[qi, a] = new_groups.get(self.δ[qi, a])
        if (imax is None or i < imax) and groups != new_groups and δ != new_δ:
            return step(i + 1, new_groups, new_δ)
        return new_groups, new_δ

    groups, new_δ = init(0)
    Q = Set(groups.values())
    q0 = groups[self.q0]
    δ = DFA.δ()
    for (qi, a), target in new_δ.items():
        δ[groups[qi], a] = target
    F = Set(groups[qf] for qf in self.F)
    return DFA(Q, self.Σ, δ, q0, F)
def test_grammar():
    """Smoke test: build a small regular grammar and convert it to an NFA."""
    rules = G.P({
        "S": "aA | bC | a | ε",
        "A": "bB | aA | b | c",
        "B": "aB | bC | aC | cA | c",
        "C": "a | b | aA | bB",
    })
    grammar = G(Set("S", "A", "C", "B"), Set("a", "b", "c"), rules, "S")
    automaton = grammar.toNFA()
def _canonize(self):
    """Relabel the states with canonical letters A, B, C, ... assigned in
    the order states are reached from q0."""
    import DFA
    fresh_names = Set(chr(ord('A') + k) for k in range(len(self.Q)))
    rename = dict(zip(self.δ.reachables(self.q0), fresh_names))
    new_δ = DFA.δ()
    for (state, symbol), target in self.δ.items():
        new_δ[rename[state], symbol] = rename[target]
    new_q0 = rename[self.q0]
    new_F = Set(rename[qf] for qf in self.F)
    return DFA(fresh_names, self.Σ, new_δ, new_q0, new_F)
def toTopDownAnalyzer(self):
    """Build a top-down (LL) pushdown analyzer for this grammar."""
    import PDA
    grammar = self.remove_left_recursion()
    delta = PDA.δ()
    analyzer = TopDownAnalyzer(
        Set("q"), grammar.Σ, grammar.N.union(grammar.Σ), delta,
        "q", grammar.S, Set())
    # expand: δ[q, ε, A] ∋ (q, α) for every production A → α
    for nonterminal in grammar.N:
        for production in grammar.P[nonterminal]:
            delta["q", "ε", nonterminal].add(("q", production))
    # match: δ[q, a, a] ∋ (q, ε) for every terminal a
    for terminal in grammar.Σ:
        delta["q", terminal, terminal].add(("q", "ε"))
    return analyzer
def __setitem__(self, key, val):
    """Store transition entries.

    A 2-tuple key (state, symbol) maps directly to Set(val); a longer key
    (state, s1, s2, ...) paired with (v1, v2, ...) fans out into one
    entry per symbol. Other shapes are silently ignored.
    """
    if len(key) == 2:
        normalized_key = tuple(map(str, key))
        super().__setitem__(normalized_key, Set(map(str, val)))
    elif len(key) > 2 and len(key) == len(val) + 1:
        state = key[0]
        for symbol, target in zip(key[1:], val):
            self[state, symbol] = target
def __setitem__(self, key, value):
    """Accept either an 'a | b | c'-style string (split into Rules) or a
    ready-made Set of rules; other value types are silently ignored."""
    if isinstance(value, str):
        alternatives = (part.strip() for part in value.split("|"))
        super().__setitem__(key, Set(Rule(alt) for alt in alternatives))
    elif isinstance(value, Set):
        super().__setitem__(key, value)
def do():
    # One fixpoint iteration: N[i] = N[i-1] ∪ {A | some production of A
    # becomes all-lowercase (i.e. only terminals) after deleting ∅ and the
    # already-accepted nonterminals}.
    # NOTE(review): fragment — relies on the enclosing scope for i, N, self
    # and re; the regex char class assumes single-character nonterminal names.
    nonlocal i, N
    i = i + 1
    N[i] = N[i - 1].union(
        Set(A for A in self.N for p in self.P[A]
            if re.sub("[∅" + "".join(N[i - 1]) + "]", "", str(p)).islower()))
def do():
    # One fixpoint iteration: V[i] = V[i-1] ∪ {symbols that occur in some
    # production of an uppercase (nonterminal) member of V[i-1]}.
    # NOTE(review): fragment — i, V and self come from the enclosing scope.
    nonlocal i, V
    i = i + 1
    V[i] = V[i - 1].union(
        Set(sym for sym in self.N.union(self.Σ)
            if any(sym in rule for A in V[i - 1] if A.isupper()
                   for rule in self.P[A])))
def calc_potentials():
    # Map each nonterminal A to the set of nonterminals that can start one
    # of its productions — the candidates for (direct or indirect) left
    # recursion.
    # NOTE(review): fragment — N and P come from the enclosing scope.
    potentials = {}
    for A in N:
        potentials.setdefault(A, Set())
        for rule in P[A]:
            if rule[0] in N:
                potentials[A].add(rule[0])
    return potentials
def remove_left_recursion(self):
    """Return an equivalent grammar without left recursion (Paull's
    algorithm): substitute earlier nonterminals into A's productions, then
    split direct recursion A → Aα | β into A → β | βA' and A' → α | αA'.
    """

    def calc_potentials():
        # nonterminals that can start a production of each A
        potentials = {}
        for A in N:
            potentials.setdefault(A, Set())
            for rule in P[A]:
                if rule[0] in N:
                    potentials[A].add(rule[0])
        return potentials

    N = self.N.copy()
    P = self.P.copy()
    # NOTE(review): enumerate(N) iterates the original Set object while N is
    # rebound inside the loop; N[:i + 1] then slices the *new* N — presumably
    # the project Set preserves insertion order with new items prepended.
    for i, A in enumerate(N):
        for B in N[:i + 1]:
            if not B in calc_potentials()[A]:
                continue
            if A == B:
                # direct recursion: A → Aα | β  ⇒  A → β | βA', A' → α | αA'
                α = [rule for rule in P[A] if rule.startswith(B)]
                β = [rule for rule in P[A] if not rule.startswith(B)]
                N = Set(f"{A}'").union(N)
                # |= on a possibly missing key — assumes P defaults to Set()
                P[f"{A}'"] |= Set(Rule(rule[1:]) for rule in α) | Set(
                    Rule(rule[1:] + f"{A}'") for rule in α)
                P[A] = Set(Rule(rule) for rule in β) | Set(
                    Rule(rule + f"{A}'") for rule in β)
            else:
                # indirect: replace leading B with each of B's productions
                α = [rule for rule in P[A] if rule.startswith(B)]
                β = [rule for rule in P[A] if not rule.startswith(B)]
                P[A] = Set(Rule(rule) for rule in β) | Set(
                    Rule(ruleB + rule[1:]) for rule in α for ruleB in P[B])
    return CFG(N, self.Σ.copy(), P, self.S)
def toNFA(self):
    """Convert this ε-NFA to an NFA by eliminating ε-transitions.

    For every (state, symbol): take the direct moves plus moves reachable
    after following ε-transitions first, then close the result under ε.
    """
    import NFA
    δ = NFA.δ()
    for state in self.Q:
        for symbol in self.Σ:
            if symbol == "ε":
                continue
            direct = self.δ.get((state, symbol), Set())
            via_ε = Set()
            for closed in self.δ.Dε(state):
                via_ε |= self.δ.get((closed, symbol), Set())
            closure = Set()
            for reached in direct | via_ε:
                closure |= self.δ.Dε(reached)
            δ[state, symbol] = tuple(closure)
    Σ = self.Σ - Set("ε")
    # q0 becomes accepting when its ε-closure touches a final state
    if self.δ.Dε(self.q0).intercept(self.F) == Set():
        F = self.F
    else:
        F = self.F.union(Set(self.q0))
    return NFA(self.Q, Σ, δ, self.q0, F)
def test_product():
    """Print the Cartesian product of two small sets."""
    left = {1, 2}
    right = {'a', 'b', 'c'}
    cartesian = Set.product(left, right)
    print("Set 1: ", left)
    print("Set 2: ", right)
    print("Cartesian product: ", cartesian)
def toBottomUpAnalyzer(self):
    """Build a bottom-up (shift-reduce) pushdown analyzer for this grammar."""
    import PDA
    # L = self.remove_left_recursion()
    L = self
    # FIX: the transition table was named with the mis-encoded "ô"; the PDA
    # module exposes δ (cf. toTopDownAnalyzer).
    δ = PDA.δ()
    M = BottomUpAnalyzer(Set("q", "r"), L.Σ, L.N | L.Σ | Set("⊥"), δ,
                         "q", "⊥", Set("r"))
    # reduce: δ[q, ε, α] ∋ (q, A) for every production A → α
    for A in L.N:
        for rule in L.P[A]:
            δ["q", "ε", rule].add(("q", A))
    # shift: δ[q, a, ε] ∋ (q, a)
    for a in L.Σ:
        δ["q", a, "ε"].add(("q", a))
    # accept: with ⊥ followed by the start symbol on the stack, go to r.
    # FIX: the start symbol was hard-coded as "S"; use L.S.
    δ["q", "ε", f"⊥{L.S}"].add(("r", "ε"))
    return M
def Dε(self, q0):
    """Return the ε-closure of q0: all states reachable via ε-transitions
    only (q0 itself included)."""
    closure = Set()
    pending = [q0]
    while pending:
        current = pending.pop(0)
        closure.add(current)
        for successor in self[current, "ε"]:
            if successor not in closure:
                closure.add(successor)
                pending.append(successor)
    return closure
def __init__(self, Q, Σ, δ, q0, F):
    """
    Q : set of states
    Σ : finite alphabet
    δ : Q × Σ → Q transition function
    q0 : q0 ∈ Q initial state
    F : F ⊆ Q set of accepting states
    """
    # The base class takes a *set* of initial states; a DFA has exactly
    # one, so wrap it — and keep the scalar form (as str) for direct use.
    super().__init__(Q, Σ, δ, Set(q0), F)
    self.q0 = str(q0)
def test_dfa():
    """Minimize a hand-written 7-state DFA and render the result.

    "-" marks a missing transition; state 7 is unreachable on purpose.
    """
    δ = DFA.δ()
    automaton = DFA(Set(1, 2, 3, 4, 5, 6, 7), Set("a", "b"), δ, 1,
                    Set(3, 5, 6))
    transitions = {
        (1, "a"): 2, (1, "b"): "-",
        (2, "a"): 3, (2, "b"): 4,
        (3, "a"): 6, (3, "b"): 5,
        (4, "a"): 3, (4, "b"): 2,
        (5, "a"): 6, (5, "b"): 3,
        (6, "a"): 2, (6, "b"): "-",
        (7, "a"): 6, (7, "b"): 1,
    }
    for (state, symbol), target in transitions.items():
        δ[state, symbol] = target
    # automaton.table()
    minimized = automaton.minimize()
    minimized.diagram()
def remove_primitive_rules(self):
    """
    remove all rules of type A → B where A,B ∈ N
    """
    import CFG
    P = CFG.P()
    for A in self.N:
        # Nx(A): every nonterminal reachable from A through unit rules;
        # A inherits all of their non-unit productions.
        replacement = Set()
        for B in self.Nx(A):
            for rule in self.P[B]:
                if not self.isprimitive(rule):
                    replacement.add(rule)
        P[A] = replacement
    return CFG(self.N, self.Σ, P, self.S)
def Nx(self, x):
    """Fixpoint of nonterminals reachable from x through primitive
    (unit) rules."""
    history = {0: Set(x)}
    level = 0
    while True:
        level += 1
        history[level] = history[level - 1].union(
            Set(rule for A in history[level - 1] for rule in self.P[A]
                if self.isprimitive(rule)))
        if history[level] == history[level - 1]:
            return history[level]
def reachables(self, q0):
    """
    return a list of all reachable qi states from the state q0
    """
    seen = Set()
    frontier = [q0]
    while frontier:
        state = frontier.pop(0)
        seen.add(state)
        for symbol in self.Σ:
            successor = self[state, symbol]
            # None means no transition is defined for (state, symbol)
            if successor is not None and successor not in seen:
                seen.add(successor)
                frontier.append(successor)
    return seen
def resolve(A, B):
    # Rewrite every production of A toward Greibach form:
    #   - leading lowercase (terminal): keep it, but mark each later lone
    #     lowercase char c as the barred nonterminal "c̄" (combining macron)
    #   - leading B: substitute each of B's productions for the B
    #   - anything else: keep the rule unchanged
    # NOTE(review): fragment — P and Rule come from the enclosing scope, and
    # P[A].pop(0) assumes the project Set is ordered with positional pop.
    for _ in range(len(P[A])):
        rule = list(P[A].pop(0))
        if rule[0].islower():
            for i, c in enumerate(rule[1:]):
                if c.islower() and len(c) == 1:
                    rule[i + 1] = f"{c}̄"
            rule = "".join(rule)
            P[A].add(Rule(rule))
        elif rule[0] == B:
            rules = Set()
            for rule1 in P[B]:
                rules.add(Rule(rule1 + "".join(rule[1:])))
            P[A] |= rules
        else:
            # NOTE(review): here `rule` is still a list of characters —
            # verify Rule accepts a list, unlike the str in the first branch
            P[A].add(Rule(rule))
def V(self):
    """
    get all reachable symbols
    """
    history = {0: Set(self.S)}
    level = 0
    while True:
        level += 1
        history[level] = history[level - 1].union(
            Set(sym for sym in self.N.union(self.Σ)
                if any(sym in rule for A in history[level - 1]
                       if A.isupper() for rule in self.P[A])))
        if history[level] == history[level - 1]:
            return history[level]
def CYK_parser(self, w):
    """Cocke–Younger–Kasami chart parse of the word w.

    Returns an n×n table C (numpy array of Sets) where C[i, j] collects
    the nonterminals deriving the substring w[i..j] (0-based, inclusive).
    w is in the language iff self.S ∈ C[0, n-1].
    """
    n = len(w)
    C = np.empty((n, n), dtype=Set)
    for i in range(n):
        for j in range(n):
            C[i, j] = Set()
    # Base case: mark C[i, i+k] for every production d that directly yields
    # the substring w[i:i+k+1].
    # NOTE(review): classic CYK only fills length-1 substrings here (k == 0);
    # larger k presumably supports multi-character terminal rules — confirm
    # against check_prod_rule's semantics.
    for d in self.P:
        for i in range(n):
            # for k in range(0, n-i, -1):
            for k in range(n-i):
                if self.check_prod_rule((d, w[i:i+k+1])):
                    C[i, i+k].add(d)
    # Induction (1-based arithmetic): for each span length m and start i,
    # try every split k and every binary rule A → LR with L deriving the
    # left part and R the right part.
    for m in range(2, n+1):
        for i in range(1, n-m+2):
            j = i + m - 1
            for rule in self.get_nonterm_prod_rules():
                for k in range(i, j):
                    lrule, rrule = rule[1][0], rule[1][1]
                    if lrule in C[i-1, k-1] and rrule in C[k, j-1]:
                        C[i-1, j-1].add(rule[0])
    return C
def step(i, groups, δ):
    # One partition-refinement pass (i is the recursion depth, bounded by
    # imax from the enclosing scope).
    # NOTE(review): fragment — imax, self, DFA, roman and Set come from the
    # enclosing scope.
    new_groups = {}
    force_move = 0
    numeratd_patterns = []
    # BUG FIX: the loop variable used to be `i`, shadowing the recursion
    # depth, so the imax bound below compared a group index instead.
    for g in range(len(Set(groups.values()))):
        for qi, val in groups.items():
            target = δ.group()[qi]
            if val == roman(g + 1):
                if target not in numeratd_patterns:
                    numeratd_patterns.append(target)
                index = numeratd_patterns.index(target)
                new_groups[qi] = roman(index + 1 + force_move)
        # offset the next group's numbering past this group's patterns
        force_move += len(numeratd_patterns)
        numeratd_patterns = []
    new_δ = DFA.δ()
    for qi in self.Q:
        for a in self.Σ:
            new_δ[qi, a] = new_groups.get(self.δ[qi, a])
    if (imax is None or i < imax) and groups != new_groups and δ != new_δ:
        return step(i + 1, new_groups, new_δ)
    return new_groups, new_δ
def Nε(self):
    """
    get all states that can turn into ε
    """
    import re
    history = {0: Set()}
    level = 0
    while True:
        level += 1
        erasable = "".join(history[level - 1])
        # A is erasable if some production vanishes after deleting ∅, ε
        # and the already-known erasable nonterminals
        history[level] = history[level - 1].union(
            Set(A for A in self.N for p in self.P[A]
                if re.sub("[∅ε" + erasable + "]", "", str(p)) == ""))
        if history[level] == history[level - 1]:
            return history[level]
def Ne(self):
    """
    get all normalised nonterminals
    """
    import re
    history = {0: Set()}
    level = 0
    while True:
        level += 1
        known = "".join(history[level - 1])
        # A is normalised if some production is all-terminals (lowercase)
        # after deleting ∅ and the already-normalised nonterminals
        history[level] = history[level - 1].union(
            Set(A for A in self.N for p in self.P[A]
                if re.sub("[∅" + known + "]", "", str(p)).islower()))
        if history[level] == history[level - 1]:
            return history[level]
def toGNF(self):
    """
    each rule must be of format A → aB1B2B3...Bn
    (a ∈ Σ, B1,B2,B3,...,Bn ∈ N)
    """
    import CFG
    G = self.remove_left_recursion()
    # process nonterminals in reverse order so substitutions terminate
    N = Set(reversed(G.N.copy()))
    P = G.P.copy()

    def resolve(A, B):
        # Rewrite A's productions toward Greibach form: keep terminal-leading
        # rules (marking later lone lowercase chars c as "c̄"), substitute
        # P[B] where a rule starts with B, keep the rest unchanged.
        for _ in range(len(P[A])):
            rule = list(P[A].pop(0))
            if rule[0].islower():
                for i, c in enumerate(rule[1:]):
                    if c.islower() and len(c) == 1:
                        rule[i + 1] = f"{c}̄"
                rule = "".join(rule)
                P[A].add(Rule(rule))
            elif rule[0] == B:
                rules = Set()
                for rule1 in P[B]:
                    rules.add(Rule(rule1 + "".join(rule[1:])))
                P[A] |= rules
            else:
                # NOTE(review): rule is a list here — verify Rule accepts it
                P[A].add(Rule(rule))

    for i, A in enumerate(N):
        for B in N[:i + 1]:
            resolve(A, B)
    # NOTE(review): the fresh "c̄" nonterminals are never added to N and get
    # no productions of their own — confirm downstream code tolerates this.
    return CFG(N, self.Σ, P, self.S)