def convert_hybrid(grammar):
    '''
    Convert rules in the form of [A -> 'b' C], where the rhs has both
    non-terminals and terminals, into rules in the form of
    [A -> B C] & [B -> 'b'] with a dummy non-terminal B
    '''
    rules = grammar.productions()
    new_rules = []
    for rule in rules:
        lhs = rule.lhs()
        rhs = rule.rhs()
        # check for hybrid rules
        if rule.is_lexical() and len(rhs) > 1:
            new_rhs = []
            for item in rule.rhs():
                if is_terminal(item):
                    new_sym = Nonterminal(item)
                    new_rhs.append(new_sym)
                    # add new lexical rule with dummy lhs nonterminal
                    new_rules.append(Production(new_sym, (item,)))
                else:
                    new_rhs.append(item)
            # add converted mixed rule with only non-terminals on rhs
            new_rules.append(Production(lhs, tuple(new_rhs)))
        else:
            new_rules.append(rule)
    new_grammar = CFG(grammar.start(), new_rules)
    return new_grammar
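# Usage sketch (added example, not part of the original code): a minimal, hypothetical
# demo of convert_hybrid. It assumes NLTK is installed and supplies the names the
# function above relies on. The mixed rule VP -> 'chased' NP should come out as
# VP -> chased NP plus the dummy lexical rule chased -> 'chased'.
from nltk.grammar import CFG, Nonterminal, Production, is_terminal

_demo_hybrid_grammar = CFG.fromstring("""
    S -> NP VP
    VP -> 'chased' NP
    NP -> 'dogs' | 'cats'
""")
for _prod in convert_hybrid(_demo_hybrid_grammar).productions():
    print(_prod)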
def exploreCFG(cfg, length_limit):
    """
    Generate strings with the CFG, without ever allowing an intermediate
    expression to exceed the length_limit.

    Note that not all strings with length <= length_limit that can be
    generated will be generated. For example, with S -> 1S | e and
    length_limit = 2, we won't be able to generate "11" because in the
    sequence S -> 1S -> 11S -> 11, 11S is too long and we would stop
    exploring there.
    """
    cfg = CFG(cfg.start(), [splitProdRhs(prod) for prod in cfg.productions()])
    finished = set()  # Expressions with no nonterminals left
    visited = set()  # Expressions with nonterminals that have already been explored
    to_explore = [(cfg.start(),)]
    while to_explore:
        expr = to_explore.pop()
        if expr in visited or len(expr) > length_limit:
            continue
        for i in range(len(expr)):
            if isinstance(expr[i], Nonterminal):
                break
        else:
            finished.add(expr)
            continue
        visited.add(expr)
        for prod in cfg.productions(lhs=expr[i]):
            to_explore.append(expr[:i] + prod.rhs() + expr[i + 1:])
    return finished
def remove_unary_rules(grammar):
    """Remove unary nonterminal productions A -> B"""
    result = []
    unary = []
    fake_rules = []
    removed_rules = []
    for rule in grammar.productions():
        if len(rule) == 1 and rule.is_nonlexical():
            unary.append(rule)
        else:
            result.append(rule)
    while unary:
        rule = unary.pop(0)
        removed_rules.append(rule)
        for item in grammar.productions(lhs=rule.rhs()[0]):
            new_rule = Production(rule.lhs(), item.rhs())
            if len(new_rule) != 1 or new_rule.is_lexical():
                result.append(new_rule)
                fake_rules.append(new_rule)
            else:
                unary.append(new_rule)
    n_grammar = CFG(grammar.start(), result)
    return n_grammar, grammar
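# Usage sketch (added example, not part of the original code): a hypothetical demo of
# remove_unary_rules. It assumes NLTK is installed and supplies the names the function
# above relies on. The unary rule S -> VP is replaced by lifting VP's expansions onto
# S, giving S -> V NP and S -> 'sleeps'.
from nltk.grammar import CFG, Production

_demo_unary_grammar = CFG.fromstring("""
    S -> VP
    VP -> V NP | 'sleeps'
    V -> 'sees'
    NP -> 'her'
""")
_new_grammar, _old_grammar = remove_unary_rules(_demo_unary_grammar)
for _prod in _new_grammar.productions():
    print(_prod)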
def cfg(self, include_edgelabels=True):
    sents = self.parsed_sents(include_edgelabels)
    tiger_prods = set(prod for sent in sents for prod in sent.productions())
    cfg = CFG(Nonterminal(TigerCorpusReader.GRAMMAR_START), list(tiger_prods))
    return cfg
def generate_grammar_and_parsers(parsed_sents):
    # From sentences, extract the parse tree and transform each tree into a list of
    # CFG productions; generate a set containing all the productions (without repetitions)
    tbank_productions_with_repet = [
        production for parsed_sent in parsed_sents
        for production in parsed_sent.productions()
    ]
    tbank_productions = set(tbank_productions_with_repet)  # exclude repetitions
    print("Num. of unique productions read:", len(tbank_productions))

    # Build a CFG from the productions
    print("\nBuilding a CFG...")
    cfg_grammar = CFG(Nonterminal('S'), tbank_productions)  # a CFG
    print(cfg_grammar, end="\n\n")
    # CFG - An Earley parser
    cfg_earley_parser = EarleyChartParser(cfg_grammar, trace=3)

    # Build a PCFG from the productions
    print("Building a PCFG...")
    pcfg_grammar = induce_pcfg(
        Nonterminal('S'),
        tbank_productions_with_repet)  # a PCFG, here repetitions are needed!
    print(pcfg_grammar, end="\n\n")
    # Allocate a bottom-up chart parser for PCFG; see: http://www.nltk.org/_modules/nltk/parse/pchart.html
    pcfg_pchart_parser = InsideChartParser(pcfg_grammar)

    return cfg_earley_parser, pcfg_pchart_parser  # return both parsers
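# Usage sketch (added example, not part of the original code): a hypothetical demo of
# generate_grammar_and_parsers on a small slice of the Penn Treebank sample. It assumes
# NLTK and its 'treebank' corpus are installed, and supplies the imports the function
# above relies on.
from nltk.corpus import treebank
from nltk.grammar import CFG, Nonterminal, induce_pcfg
from nltk.parse import EarleyChartParser
from nltk.parse.pchart import InsideChartParser

_earley_parser, _pchart_parser = generate_grammar_and_parsers(treebank.parsed_sents()[:10])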
def demo2():
    from nltk import Nonterminal, Production, CFG

    nonterminals = 'S VP NP PP P N Name V Det'
    (S, VP, NP, PP, P, N, Name, V, Det) = [Nonterminal(s) for s in nonterminals.split()]
    productions = (
        # Syntactic Productions
        Production(S, [NP, VP]),
        Production(NP, [Det, N]),
        Production(NP, [NP, PP]),
        Production(VP, [VP, PP]),
        Production(VP, [V, NP, PP]),
        Production(VP, [V, NP]),
        Production(PP, [P, NP]),
        Production(PP, []),
        Production(PP, ['up', 'over', NP]),
        # Lexical Productions
        Production(NP, ['I']),
        Production(Det, ['the']),
        Production(Det, ['a']),
        Production(N, ['man']),
        Production(V, ['saw']),
        Production(P, ['in']),
        Production(P, ['with']),
        Production(N, ['park']),
        Production(N, ['dog']),
        Production(N, ['statue']),
        Production(Det, ['my']),
    )
    grammar = CFG(S, productions)

    text = 'I saw a man in the park'.split()
    d = CFGDemo(grammar, text)
    d.mainloop()
def create_taskgrammar(grammar, task, encoders):
    logger.info('Creating specific grammar for task %s' % task)
    productions = grammar.productions(Nonterminal(task))
    start_token = Nonterminal('S')
    new_productions = []

    for start_production in productions:
        first_token = start_production.rhs()[0]
        if is_nonterminal(first_token) and first_token.symbol().endswith('_TASK'):
            for new_start_production in grammar.productions(first_token):
                new_productions.append(Production(start_token, new_start_production.rhs()))
        else:
            new_productions.append(Production(start_token, start_production.rhs()))

    for production in grammar.productions():
        for new_production in new_productions:
            if production.lhs() in new_production.rhs() and production not in new_productions:
                if production.lhs().symbol() == 'ENCODERS':
                    # Use encoders only for types of features in the dataset
                    if len(encoders) > 0:
                        new_productions.append(Production(production.lhs(), [Nonterminal(e) for e in encoders]))
                    else:
                        new_productions.append(Production(production.lhs(), ['E']))
                else:
                    new_productions.append(production)

    task_grammar = CFG(start_token, new_productions)

    with open(TASK_GRAMMAR_PATH, 'w') as fout:
        fout.write('\n'.join([str(x) for x in task_grammar.productions()]))

    return task_grammar
def binarize(grammar):
    """Binarize grammar by introducing new nonterminals"""
    result = []

    for rule in grammar.productions():
        if len(rule.rhs()) > 2:
            # this rule needs to be broken down
            left_side = rule.lhs()
            symbol_names = [
                tsym.symbol() if not isinstance(tsym, str) else '@' + tsym
                for tsym in rule.rhs()
            ]
            for k in range(1, len(rule.rhs()) - 1):
                new_rhs_name = rule.lhs().symbol() + '|<' + '-'.join(symbol_names[k:]) + '>'
                new_sym = Nonterminal(new_rhs_name)
                new_production = Production(left_side, (rule.rhs()[k - 1], new_sym))
                left_side = new_sym
                result.append(new_production)
            last_prd = Production(left_side, rule.rhs()[-2:])
            result.append(last_prd)
        else:
            result.append(rule)

    n_grammar = CFG(grammar.start(), result)
    return n_grammar
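# Usage sketch (added example, not part of the original code): a hypothetical demo of
# binarize. It assumes NLTK is installed and supplies the names the function above
# relies on. The ternary rule VP -> V NP PP is split into VP -> V VP|<NP-PP> and
# VP|<NP-PP> -> NP PP.
from nltk.grammar import CFG, Nonterminal, Production

_demo_ternary_grammar = CFG.fromstring("""
    S -> NP VP
    VP -> V NP PP
    NP -> 'she'
    V -> 'put'
    PP -> 'away'
""")
for _prod in binarize(_demo_ternary_grammar).productions():
    print(_prod)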
def convert_unit(grammar):
    '''
    Convert unitary rules in the form of [A -> B], where the rhs has one
    non-terminal, by eliminating intermediate unitary rules and promoting the
    final lexical rule, e.g. [B -> 'b'] => [A -> 'b'], or stop at an
    intermediate rule with only non-terminals on the rhs, e.g. [B -> C D] => [A -> C D]
    '''
    rules = grammar.productions()
    new_rules = []
    unit_rules = []
    for rule in rules:
        # check for unit rules
        if rule.is_nonlexical() and len(rule) == 1:
            unit_rules.append(rule)
        else:
            new_rules.append(rule)
    # follow each unit rule and find the final terminal
    while unit_rules:
        rule = unit_rules.pop(0)
        lhs = rule.lhs()
        rhs = rule.rhs()
        # find rules that can derive the rhs to something else
        for cascade_rule in grammar.productions(lhs=rhs[0]):
            temp_rule = Production(lhs, cascade_rule.rhs())
            if cascade_rule.is_lexical() or len(cascade_rule) > 1:
                new_rules.append(temp_rule)
            else:
                unit_rules.append(temp_rule)
    new_grammar = CFG(grammar.start(), new_rules)
    return new_grammar
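# Usage sketch (added example, not part of the original code): a hypothetical demo of
# convert_unit. It assumes NLTK is installed and supplies the names the function above
# relies on. The chain S -> A, A -> B, B -> 'x' collapses so that S, A and B each
# rewrite directly to 'x'.
from nltk.grammar import CFG, Production

_demo_chain_grammar = CFG.fromstring("""
    S -> A
    A -> B
    B -> 'x'
""")
for _prod in convert_unit(_demo_chain_grammar).productions():
    print(_prod)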
def buildFromTreebank(self):
    """
    Build a Context-Free-Grammar based on UPenn treebank
    """
    tbank_productions = set()
    for sent in treebank.parsed_sents():
        for production in sent.productions():
            if production.is_lexical():
                new_rhs = [str(production._lhs)]
                production = Production(production._lhs, new_rhs)
            tbank_productions.add(production)
    tbank_grammar = CFG(Nonterminal('S'), list(tbank_productions))
    return tbank_grammar
def __init__(self, parent, cfg=None, set_cfg_callback=None):
    self._parent = parent
    if cfg is not None:
        self._cfg = cfg
    else:
        self._cfg = CFG(Nonterminal('S'), [])
    self._set_cfg_callback = set_cfg_callback

    self._highlight_matching_nonterminals = 1

    # Create the top-level window.
    self._top = Toplevel(parent)
    self._init_bindings()

    self._init_startframe()
    self._startframe.pack(side='top', fill='x', expand=0)
    self._init_prodframe()
    self._prodframe.pack(side='top', fill='both', expand=1)
    self._init_buttons()
    self._buttonframe.pack(side='bottom', fill='x', expand=0)

    self._textwidget.focus()
def remove_mixing(grammar):
    result = []
    for rule in grammar.productions():
        if len(rule.rhs()) == 2 and (isinstance(rule.rhs()[0], str)
                                     or isinstance(rule.rhs()[1], str)):
            new_rhs = []
            for k in range(2):
                if isinstance(rule.rhs()[k], str):
                    new_sym = Nonterminal('$' + rule.rhs()[k])
                    new_production = Production(new_sym, (rule.rhs()[k],))
                    result.append(new_production)
                    new_rhs.append(new_sym)
                else:
                    new_rhs.append(rule.rhs()[k])
            new_production = Production(rule.lhs(), new_rhs)
            result.append(new_production)
        else:
            result.append(rule)
    n_grammar = CFG(grammar.start(), result)
    return n_grammar
def app():
    """
    Create a shift reduce parser app, using a simple grammar and text.
    """
    from nltk.grammar import Nonterminal, Production, CFG

    nonterminals = "S VP NP PP P N Name V Det"
    (S, VP, NP, PP, P, N, Name, V, Det) = [Nonterminal(s) for s in nonterminals.split()]

    productions = (
        # Syntactic Productions
        Production(S, [NP, VP]),
        Production(NP, [Det, N]),
        Production(NP, [NP, PP]),
        Production(VP, [VP, PP]),
        Production(VP, [V, NP, PP]),
        Production(VP, [V, NP]),
        Production(PP, [P, NP]),
        # Lexical Productions
        Production(NP, ["I"]),
        Production(Det, ["the"]),
        Production(Det, ["a"]),
        Production(N, ["man"]),
        Production(V, ["saw"]),
        Production(P, ["in"]),
        Production(P, ["with"]),
        Production(N, ["park"]),
        Production(N, ["dog"]),
        Production(N, ["statue"]),
        Production(Det, ["my"]),
    )
    grammar = CFG(S, productions)

    # tokenize the sentence
    sent = "my dog saw a man in the park with a statue".split()

    ShiftReduceApp(grammar, sent).mainloop()
def convert_long(grammar):
    '''
    Convert non-binary rules in the form of [A -> B C D], where the rhs has
    more than 2 non-terminals, into binarised rules in the form of
    [A -> B_C D] & [B_C -> B C] with a dummy non-terminal B_C
    '''
    rules = grammar.productions()
    new_rules = []
    long_rules = []
    for rule in rules:
        if len(rule.rhs()) > 2:
            long_rules.append(rule)
        else:
            new_rules.append(rule)
    while long_rules:
        rule = long_rules.pop(0)
        lhs = rule.lhs()
        rhs = rule.rhs()
        new_rhs = []
        for i in range(0, len(rhs) - 1, 2):
            new_sym = Nonterminal(f"{rhs[i].symbol()}_{rhs[i + 1].symbol()}")
            new_rules.append(Production(new_sym, (rhs[i], rhs[i + 1])))
            new_rhs.append(new_sym)
        # case: odd number of non-terminals on rhs
        if len(rhs) % 2 == 1:
            new_rhs.append(rhs[-1])
        new_rule = Production(lhs, tuple(new_rhs))
        # continue binarisation if rhs still has more than 2 non-terminals
        if len(new_rhs) > 2:
            long_rules.append(new_rule)
        else:
            new_rules.append(new_rule)
    new_grammar = CFG(grammar.start(), new_rules)
    return new_grammar
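# Usage sketch (added example, not part of the original code): a hypothetical demo of
# convert_long. It assumes NLTK is installed and supplies the names the function above
# relies on. S -> A B C D is binarised pairwise into S -> A_B C_D, A_B -> A B and
# C_D -> C D.
from nltk.grammar import CFG, Nonterminal, Production

_demo_wide_grammar = CFG.fromstring("""
    S -> A B C D
    A -> 'a'
    B -> 'b'
    C -> 'c'
    D -> 'd'
""")
for _prod in convert_long(_demo_wide_grammar).productions():
    print(_prod)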
def create_completegrammar(primitives):
    base_grammar = load_grammar(BASE_GRAMMAR_PATH)
    new_productions = []

    for production in base_grammar.productions():
        primitive_type = production.lhs().symbol()
        if primitive_type in primitives:
            new_rhs_list = []
            for token in production.rhs():
                if isinstance(token, str) and token.startswith('primitive_'):
                    new_rhs_list.append(primitives[primitive_type])
                else:
                    new_rhs_list.append([token])
            for new_rhs in itertools.product(*new_rhs_list):
                new_productions.append(Production(production.lhs(), new_rhs))
        else:
            new_productions.append(production)

    complete_grammar = CFG(Nonterminal('S'), new_productions)

    with open(COMPLETE_GRAMMAR_PATH, 'w') as fout:
        fout.write('\n'.join([str(x) for x in complete_grammar.productions()]))

    return complete_grammar
def extract_simple_cfg(n):
    rules = extract_simple_productions(n)
    rules = list(set(rules))
    return CFG(Nonterminal("S"), sort_rules(rules))
def guess(self, verbose=None):
    """
    Makes a guess based on the next observation. Updates self._curr_guess.

    :rtype: CFG
    :returns: The next guess
    """
    if verbose is not None:
        self._verbose = verbose

    sentence = Sentence(next(self._text))
    self._num_steps += 1
    self._log("String {}: {}".format(self._num_steps, sentence))
    if sentence in self._data:
        self._log("String already seen")
        return self._curr_guess

    # Info from previous guess
    num_contexts = len(self._contexts)
    num_subs = len(self._substrings)
    if self._curr_guess is not None:
        num_nts = len(set(p.lhs() for p in self._curr_guess.productions())) - 1
    else:
        num_nts = 0

    total_timer = Timer()
    total_timer.start()

    # Update data and terminals
    words = sentence.get_words()
    self._data.add(sentence)
    self._terminals.update(set(words))

    # Update contexts
    self._log("Updating contexts...")
    inds = range(0, len(words) + 1)
    contexts = [Context(words[:i], words[j:]) for i in inds for j in inds[i:]]
    self._contexts.update(ContextSet(contexts))
    self._log("{} new contexts added".format(len(self._contexts) - num_contexts))

    # Update substrings
    self._log("Updating substrings...")
    is_new_sentence = True
    if self._curr_guess_parser is not None:
        try:
            parses = self._curr_guess_parser.parse(words)
            is_new_sentence = len(list(parses)) == 0
        except Exception:
            is_new_sentence = True
    if is_new_sentence:
        subs = [Sentence(words[i:j]) for i in inds for j in inds[i:]]
        self._substrings.update(SentenceSet(subs))
        self._log("{} new substrings added".format(len(self._substrings) - num_subs))
    else:
        self._log("Sentence already generated by current guess")

    # Construct the nonterminals
    self._log("Constructing nonterminals...")
    kernels = set()
    for i in range(1, self._k + 1):
        subsets = [SentenceSet(j) for j in combinations(self._substrings, i)]
        kernels.update(subsets)
    for kernel in kernels:
        if kernel not in self._nonterminals:
            nt_name = self._new_name()
            contexts = self._oracle.restr_right_triangle(kernel, self._contexts)
            nt = Nonterminal(nt_name)
            self._nonterminals[kernel] = nt
            self._nt_contexts[nt] = contexts

    # Get a set of nonterminals with unique contexts
    self._log("Removing equivalent nonterminals...")
    context_nts = {con: nt for nt, con in self._nt_contexts.items()}
    self._log("{} nonterminals removed".format(len(kernels) - len(context_nts)))
    self._log("{} new nonterminals constructed".format(len(context_nts) - num_nts))

    # Construct the rules
    self._log("Constructing rules...")
    self._productions = set()
    timer = Timer()

    # Lexical rules
    timer.start()
    for t in self._terminals:
        t_kernel = SentenceSet([Sentence([t])])
        t_nt = self._nonterminals[t_kernel]
        t_contexts = self._nt_contexts[t_nt]
        for contexts, nt in context_nts.items():
            rule = Production(nt, [t])
            if rule in self._productions:
                continue
            if rule in self._eliminated_rules:
                continue
            if contexts.issubset(t_contexts):
                self._productions.add(rule)
            else:
                self._eliminated_rules.add(rule)
    timer.stop()
    num_lex = len(self._productions)
    self._log("{} lexical rules ({:.2f} secs)".format(num_lex, timer.elapsed()))

    # Binary rules
    timer.reset()
    timer.start()
    for kernel_l in self._nonterminals:
        for kernel_r in self._nonterminals:
            kernel_rhs = kernel_l + kernel_r
            sents_rhs = list(kernel_rhs.intersection(self._substrings))
            inds = range(len(sents_rhs) // self._k + 1)
            kers_rhs = [sents_rhs[self._k * i:self._k * (i + 1)] for i in inds]
            kers_rhs = [SentenceSet(k) for k in kers_rhs if len(k) > 0]
            nts_rhs = [self._nonterminals[k] for k in kers_rhs]
            contexts_nts_rhs = [self._nt_contexts[nt] for nt in nts_rhs]
            if len(contexts_nts_rhs) > 0:
                contexts_rhs = contexts_nts_rhs[0].intersection(*contexts_nts_rhs)
            else:
                contexts_rhs = self._contexts

            # Membership queries
            new_strs_rhs = kernel_rhs.difference(SentenceSet(sents_rhs))
            new_contexts_rhs = self._oracle.restr_right_triangle(new_strs_rhs, contexts_rhs)
            contexts_rhs.intersection_update(new_contexts_rhs)

            # Building the rules
            for contexts, nt in context_nts.items():
                nt_l = context_nts[self._nt_contexts[self._nonterminals[kernel_l]]]
                nt_r = context_nts[self._nt_contexts[self._nonterminals[kernel_r]]]
                rule = Production(nt, [nt_l, nt_r])
                if rule in self._productions:
                    continue
                if rule in self._eliminated_rules:
                    continue
                if contexts.issubset(contexts_rhs):
                    self._productions.add(rule)
                else:
                    self._eliminated_rules.add(rule)
    timer.stop()
    num_bin = len(self._productions) - num_lex
    self._log("{} binary rules ({:.2f} secs)".format(num_bin, timer.elapsed()))

    # Start rules
    timer.reset()
    timer.start()
    for contexts, nt in context_nts.items():
        rule = Production(self._start_symbol, [nt])
        if rule in self._productions:
            continue
        if rule in self._eliminated_rules:
            continue
        if Context([], []) in contexts:
            self._productions.add(rule)
        else:
            self._eliminated_rules.add(rule)
    timer.stop()
    num_start = len(self._productions) - num_lex - num_bin
    self._log("{} start rules ({:.2f} secs)".format(num_start, timer.elapsed()))

    # Construct the grammar
    self._curr_guess = CFG(self._start_symbol, self._productions)
    self._curr_guess_parser = ChartParser(self._curr_guess)

    total_timer.stop()
    elapsed = total_timer.elapsed()
    num_rules = len(self._curr_guess.productions())
    self._log("Constructed grammar with {} rules ({:.2f} secs)".format(num_rules, elapsed))
    return self._curr_guess
def _apply(self, *e):
    productions = self._parse_productions()
    start = Nonterminal(self._start.get())
    cfg = CFG(start, productions)
    if self._set_cfg_callback is not None:
        self._set_cfg_callback(cfg)
from nltk import tree
from nltk.grammar import CFG, Nonterminal

s = ''
prod = []
prod_cnf = []

print('Building tree from parsed sentences')
with open('parsed_sentences.txt') as f:
    sentences = list(f) + ['']

for line in sentences:
    line = line.strip()
    if len(line) > 0:
        if line[0] != '#':
            s += line
    elif len(s) > 0:
        t = tree.Tree.fromstring(s)
        prod += t.productions()
        t.chomsky_normal_form()
        t.collapse_unary(collapsePOS=True)
        prod_cnf += t.productions()
        s = ''

prod = set(prod)
prod_cnf = set(prod_cnf)

print('Writing CFG to file with %d productions' % len(prod))
grammar = CFG(Nonterminal('ROOT'), prod)
with open('grammar.cfg', 'w') as f:
    f.write('\n'.join([str(p) for p in grammar.productions()]))

print('Writing CFG (CNF) to file with %d productions' % len(prod_cnf))
grammar_cnf = CFG(Nonterminal('ROOT'), prod_cnf)
with open('grammar_cnf.cfg', 'w') as f:
    f.write('\n'.join([str(p) for p in grammar_cnf.productions()]))
import pickle

from tqdm import tqdm
from nltk.corpus import brown
from nltk.grammar import CFG, Nonterminal
from nltk.parse.corenlp import CoreNLPParser


def is_rhs_terminal(prod):
    rhs = prod.rhs()
    return len(rhs) == 1 and isinstance(rhs[0], str)


parser = CoreNLPParser(url="http://localhost:9000")
sentences = brown.sents()

# FILTER SHORT AND LONG SENTENCES
filter_sentences = []
for sentence in tqdm(sentences):
    nb_words = number_of_words(sentence)
    if nb_words >= 5 and nb_words <= 10:
        filter_sentences.append(sentence)

# PARSE SENTENCES
productions = []
for sentence in tqdm(filter_sentences):
    parse_tree = next(iter(parser.parse(sentence)))
    productions += parse_tree.productions()
unique_productions = list(set(productions))

# REMOVE TERMINAL SYMBOLS
productions_wo_term = []
for prod in unique_productions:
    if not is_rhs_terminal(prod):
        productions_wo_term.append(prod)

grammar = CFG(start=Nonterminal("ROOT"), productions=productions_wo_term)
pickle.dump(grammar, open("brown_grammar.pickle", "wb"))
for sent in sentences:
    for p in parser.parse(sent):
        p.draw()

from nltk.corpus import treebank

print(treebank.parsed_sents()[0])
print(treebank.parsed_sents()[1])

from nltk.grammar import CFG, Nonterminal
from nltk.parse import BottomUpChartParser

prods = list({
    production
    for sent in treebank.parsed_sents()
    for production in sent.productions()
})
t_grammar = CFG(Nonterminal('S'), prods)

sents = [
    'Mr. Vinken is chairman .'.split(),
    'Stocks rose .'.split(),
    'Alan introduced a plan .'.split()
]
t_parser = BottomUpChartParser(t_grammar)

parses = 0
for s in sents[:1]:
    for p in t_parser.parse(s):
        if parses < 5:
            print(p)
        parses += 1
def parse(text):
    """
    Parse some text.
    """
    '''
    # extract new words and numbers
    words = set([match.group(0) for match in re.finditer(r"[a-zA-Z]+", text)])
    numbers = set([match.group(0) for match in re.finditer(r"\d+", text)])
    '''
    numbers = set([match.group(0) for match in re.finditer(r"\d+", text)])
    coordinates = set([match.group(0) for match in re.finditer(r"\(\d+,\d+\)", text)])

    relations = [
        "segitiga", "kotak", "titik", "garis", "poligon", "negara", "kota", "provinsi"
    ]
    fields = ["nama", "ibukota", "geom", "id", "id_ibukota"]

    class Relation:
        def __init__(self, name, attrs, geom):
            self.name = name
            self.attrs = attrs
            self.geom = geom

    # segitiga: id, nama, geom
    # kotak: id, nama, geom
    # titik: id, nama, geom
    # garis: id, nama, geom
    # poligon: id, nama, geom
    # negara: id, nama, id_ibukota, geom
    # provinsi: id, nama, id_ibukota, geom
    # kota: id, nama, geom

    # Make a local copy of productions
    lproductions = list(productions)

    # Add a production for every word and number
    lproductions.extend([literal_production("NUMBER", number) for number in numbers])
    lproductions.extend([literal_production("RELATION", relation) for relation in relations])
    lproductions.extend([literal_production("VALUE", value) for value in values])
    lproductions.extend([literal_production("FIELD", field) for field in fields])
    lproductions.extend([literal_production("COOR", coor) for coor in coordinates])

    key = "VALUE"
    lhs = Nonterminal(key)
    lproductions.extend([Production(lhs, ["bengawan", "solo"])])

    # Make a local copy of the grammar with extra productions
    lgrammar = CFG(grammar.start(), lproductions)

    # Load grammar into a parser
    parser = nltk.RecursiveDescentParser(lgrammar)

    tokens = text.split()
    return parser.parse(tokens)