def count_rules(self, tree):
    """
    For each level in the given tree, increment the corresponding count
    in the dictionary.
    :param tree: the given tree
    :return: void
    """
    lhs = tree.label()
    if self.root == '':
        self.root = lhs
    # if this tree's children are subtrees, count the rules in each of those too
    if len(tree) > 1:
        rhs = Nonterminal(tree[0].label() + " " + tree[1].label())
        self.count_rules(tree[0])
        self.count_rules(tree[1])
    # if this tree has one child, and it's a subtree, count the rules in there too
    elif isinstance(tree[0], Tree):
        rhs = Nonterminal(tree[0].label())
        self.count_rules(tree[0])
    # if this tree's child is a leaf, no recursion is necessary
    else:
        rhs = tree[0]
    self.grammar_counts.setdefault(lhs, {}).setdefault(rhs, 0)
    self.grammar_counts[lhs][rhs] += 1
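# Hypothetical harness for count_rules, a minimal sketch only: the original
# class is not shown, so we assume it carries an empty `root` string and a
# nested-dict `grammar_counts`, and we bind the standalone function above as
# a method.
from nltk import Tree, Nonterminal

class RuleCounter:
    def __init__(self):
        self.root = ''
        self.grammar_counts = {}

    count_rules = count_rules  # bind the function defined above as a method

counter = RuleCounter()
counter.count_rules(Tree.fromstring("(S (NP (DT the) (NN dog)) (VP (VBD barked)))"))
print(counter.root)            # S
print(counter.grammar_counts)  # per-LHS counts of each observed RHS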
def chunker(parsedData):
    """
    Extract the grammar rules from the parsed input text and assign each
    rule the probability of it occurring in the parsed text.
    """
    tags_words = treebank.tagged_words()
    # This is the list where all the rules will be stored, for
    # construction of the PCFG
    rules = []
    # Extract the rules from the training data
    for sent in parsedData:
        for production in sent.productions():
            rules.append(production)
    # Add the lexical rules
    for word, tag in tags_words:
        # For each tagged word, create a tree containing that lexical
        # rule, so that it can be added to the list of rules
        t = Tree.fromstring("(" + tag + " " + word + ")")
        for production in t.productions():
            rules.append(production)
    # All of the syntactic rules and all of the lexical rules have now
    # been extracted from the training data; induce the PCFG from them
    rules_prob = nltk.grammar.induce_pcfg(Nonterminal('S'), rules)
    return rules_prob
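# Sketch of how chunker might be driven (the slice size and test sentence
# are illustrative assumptions; ViterbiParser comes from nltk.parse):
from nltk.corpus import treebank
from nltk.parse import ViterbiParser

grammar = chunker(treebank.parsed_sents()[:50])
parser = ViterbiParser(grammar)
for parse in parser.parse("the company said it will buy the unit".split()):
    print(parse)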
def yoda_translation(root: Tree):
    """
    Provides translation from the Italian language into Yoda-speak
    using a transfer approach.
    :param root: the syntactic tree to be translated
    """
    current_index = [
        index for index in root.treepositions()
        if isinstance(root[index], Tree)
        and root[index].label() in [Nonterminal("VP"), Nonterminal("AUX")]
        and len(root[index]) == 1][0]
    parent_index = get_parent(current_index)
    nodes_to_be_moved = []
    while root[parent_index].label() == Nonterminal("VP"):
        index_to_be_moved = get_right_child(parent_index)
        nodes_to_be_moved.append(root[index_to_be_moved])
        root[index_to_be_moved] = Tree("ε", [])
        current_index = parent_index
        parent_index = get_parent(current_index)
    nodes_to_be_moved.reverse()
    for node in nodes_to_be_moved:
        root = Tree('Yoda Translation', [node, root])
    root.draw()
def update_complete_chart(chart, tokens, grammar, trace=False):
    """Updates the non-diagonal elements of the chart.

    Arguments:
    ----------
    chart (list): List of lists containing the chart cells
    tokens (list): List of words in the input sentence
    grammar (nltk.grammar.CFG): Grammar whose production rules are used
    trace (bool): Whether to print trace output (currently unused)
    """
    index = dict((p.rhs(), p.lhs()) for p in grammar.productions())
    num_tokens = len(tokens)
    for span in range(2, num_tokens + 1):
        for start in range(num_tokens + 1 - span):
            end = start + span
            temp_categories, temp_rules, temp_mids = [], [], []
            for mid in range(start + 1, end):
                nt1s, nt2s = chart[start][mid], chart[mid][end]
                if len(nt1s) != 0 and len(nt2s) != 0:
                    for nt1 in nt1s[0]:
                        for nt2 in nt2s[0]:
                            if nt1 and nt2 and (nt1, nt2) in index:
                                p = Production(
                                    index[(nt1, nt2)],
                                    (Nonterminal(nt1), Nonterminal(nt2)))
                                temp_rules.append(f'{p._lhs} -> {p._rhs}')
                                temp_categories.append(index[(nt1, nt2)])
                                # record the split point for this entry, so the
                                # backpointer is not just the last mid tried
                                temp_mids.append(mid)
            chart[start][end] = [(temp_categories[i], temp_rules[i], temp_mids[i])
                                 for i in range(len(temp_rules))]
    return chart
def _generate_production(self, t):
    arr = []
    for child in t:
        if isinstance(child, str):
            arr.append(child)
        else:
            arr.append(Nonterminal(child.label()))
    return Production(Nonterminal(t.label()), tuple(arr))
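# Standalone illustration of the same idea (the tree here is hypothetical;
# Tree, Production and Nonterminal come from nltk):
from nltk import Tree, Production, Nonterminal

t = Tree.fromstring("(S (NP (DT the) (NN dog)) (VP (VBD barked)))")
# The top level of the tree corresponds to the production S -> NP VP
print(Production(Nonterminal(t.label()),
                 tuple(Nonterminal(c.label()) for c in t)))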
def translate_it_yo(tree):
    SUBJ = [Nonterminal("PRON"), Nonterminal("NP"), Nonterminal("N")]
    VERB = [Nonterminal("VP")]
    yoda_tree = Tree("Yoda", [])
    for i in range(len(tree)):
        if tree[i].label() in SUBJ:
            yoda_tree.insert(1, tree[i])
        if tree[i].label() in VERB:
            V = tree[i][0]
            X = tree[i][1]
            yoda_tree.insert(0, X)
            yoda_tree.insert(2, V)
    return yoda_tree
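# Toy run of translate_it_yo (the Italian parse below is hypothetical; the
# subtrees are built with Nonterminal labels because the membership tests
# above compare against Nonterminal objects, not strings):
from nltk import Tree, Nonterminal

it_tree = Tree("S", [
    Tree(Nonterminal("PRON"), ["io"]),
    Tree(Nonterminal("VP"), [Tree("V", ["mangio"]),
                             Tree("NP", ["la", "mela"])]),
])
print(translate_it_yo(it_tree))  # (Yoda (NP la mela) (PRON io) (V mangio))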
def create_grammar(x_train):
    productions = []
    for x in x_train:
        for tree in treebank.parsed_sents(x):
            # tree.collapse_unary(collapsePOS = True)
            tree.chomsky_normal_form()
            productions += tree.productions()
    S = Nonterminal('S')
    # Add an <UNK> rewrite for every POS tag so unseen words can be parsed
    for w in ['CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS',
              'MD', 'NN', 'NNS', 'NNP', 'NNPS', 'PDT', 'POS', 'PRP', 'PRP$',
              'RB', 'RBR', 'RBS', 'RP', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN',
              'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB', 'NP']:
        productions.append(Production(Nonterminal(w), ('<UNK>',)))
    grammar = create_pcfg(S, productions)
    return grammar
def test_grammar_general():
    np.random.seed(0)
    txtgram = "S -> S '+' F [0.2] | F [0.8] \n"
    txtgram += "F -> 'x' [0.5] | 'y' [0.5]"
    grammar = GeneratorGrammar(txtgram)
    sample = grammar.generate_one()
    assert sample[0] == ['y'] and sample[1] == 0.4 and sample[2] == '11'
    assert grammar.count_trees(Nonterminal("S"), 5) == 30
    assert grammar.count_coverage(Nonterminal("S"), 2) == 0.8
    assert "".join(grammar.code_to_expression('0101')[0]) == "x+y"
def pcfg_learn1(treebank, n):
    productions = []
    for i in range(n):
        for tree in treebank.parsed_sents()[:i + 1]:
            prod_gen = tree_to_productions(tree, "BOT")
            tree_to_append = next(prod_gen)[0]
            while tree_to_append:
                if tree_to_append.lhs() == Nonterminal('NP'):
                    productions.append(tree_to_append)
                try:
                    tree_to_append = next(prod_gen)[0]
                except StopIteration:
                    tree_to_append = False
    productions, dist = get_productions(productions)
    return PCFG(Nonterminal('NP'), productions), dist
def to_pcfg(sequences, sections):
    sequences = [s[s >= 0] for s in sequences]
    trees = [Tree.fromstring(to_tree(s, sections)) for s in sequences]
    # [t.collapse_unary(collapsePOS = False) for t in trees]
    # [t.chomsky_normal_form(horzMarkov = 2) for t in trees]
    prods = [p for t in trees for p in t.productions()]
    print(induce_pcfg(Nonterminal('S'), prods))
def test1():
    nt1 = Nonterminal('NP')
    nt2 = Nonterminal('VP')
    print(nt1.symbol())
    S, NP, VP, PP = nonterminals('S, NP, VP, PP')
    N, V, P, DT = nonterminals('N, V, P, DT')
    prod1 = Production(S, [NP, VP])
    prod2 = Production(NP, [DT, NP])
    print(prod1.lhs())
    print(prod1.rhs())
    print(prod1 == Production(S, [NP, VP]))
    print(prod1 == prod2)
def get_bigram_and_deep_syntax_feature(review, speller, stop_words, ps, preprocess):
    res = ""
    productions = []
    parser = CoreNLPParser(url='http://localhost:9500')
    for sentence in re.split(r"[.!?]", review):
        try:
            tree = next(parser.raw_parse(sentence))
            # Optimize by converting to Chomsky normal form
            tree.collapse_unary(collapsePOS=False)
            tree.chomsky_normal_form(horzMarkov=2)
            productions += tree.productions()
        except StopIteration:
            # End of review reached
            break
    S = Nonterminal('S')
    grammar = induce_pcfg(S, productions)
    count = 0
    for line in str(grammar).split("\n"):
        if count == 0:
            # Skip the grammar header line
            count += 1
            continue
        elif "'" in line:
            res += re.sub(r"[(->) `\'\"\[\d\]]", "", line) + " "
    res += bipos.get_bigrams_and_unigrams_of_sentence(
        bow.sanitize_sentence(review, speller, stop_words, ps, preprocess))
    return res
def train_grammar(unknown_words=[], nb_reduced_production=6000):
    productions = []
    for item in train:
        for tree in treebank.parsed_sents(item):
            # perform optional tree transformations, e.g.:
            tree.collapse_unary(collapsePOS=False)  # Collapse unary chains A-B-C into A-B+C
            tree.chomsky_normal_form(horzMarkov=2)  # Binarize A->(B,C,D) into A->B,C+D and C+D->C,D
            productions += tree.productions()

    counter = collections.Counter(productions)
    n_comms = [item for item, count in counter.most_common(nb_reduced_production)
               for i in range(count)]

    # Add unknown words and the terminal rules back into the reduced production set
    unknown_words_prods = []
    for p in productions:
        if isinstance(p._rhs[0], str):
            unknown_words_prods.append(p)
            for u in unknown_words:
                new_prod = Production(p._lhs, [u])
                unknown_words_prods.append(new_prod)

    n_comms += unknown_words_prods
    S = Nonterminal('S')
    grammar = induce_pcfg(S, n_comms)
    return grammar
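# Hypothetical driver for train_grammar: `train` is the module-level list of
# treebank file ids the function iterates over, and the unknown word below is
# purely illustrative (collections, Production, Nonterminal and induce_pcfg
# are assumed imported as in the function above):
from nltk.corpus import treebank

train = treebank.fileids()[:160]
grammar = train_grammar(unknown_words=['blockchain'], nb_reduced_production=6000)
print(grammar.productions()[:5])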
def pcfg(train_idx=None, smoothing=None):
    """
    productions = []
    item = treebank._fileids[0]
    print("ITEM\n\n", item, "\n\n")
    for tree in treebank.parsed_sents(item)[:3]:
        # perform optional tree transformations, e.g.:
        tree.collapse_unary(collapsePOS=False)
        tree.chomsky_normal_form(horzMarkov=2)
        productions += tree.productions()
    """
    if train_idx is None:
        train_idx = (len(treebank.fileids()) * 3) // 4
    productions = []
    for item in treebank.fileids()[0:train_idx]:
        for tree in treebank.parsed_sents(item):
            tree.collapse_unary(collapsePOS=False)  # Remove unary production rules
            tree.chomsky_normal_form(horzMarkov=2)  # Convert to Chomsky normal form,
            # i.e., A->(B,C,D) becomes A->(B,E), E->(C,D)
            productions += tree.productions()
    S = Nonterminal('S')
    if smoothing is None:
        grammar = learn_pcfg(S, productions)
    elif smoothing == 'L1':
        grammar = smoothing_pcfg(S, productions)
    with open('grammar.pkl', 'wb') as f:
        pickle.dump(grammar, f)
    return grammar
def main(config):
    grammar_string = parse_induced_grammar(config.grammar)
    if config.output:
        with open(config.output, 'w') as f:
            f.write(grammar_string)
    grammar = PCFG.fromstring(grammar_string)
    grammar._start = Nonterminal('TOP')  # Not sure whether this is allowed or breaks things
    if config.textfile:
        # Create a directory for the parse trees if it does not already exist
        if not os.path.exists(config.output_parse):
            os.makedirs(config.output_parse)
        parser = ViterbiParser(grammar)
        with open(config.textfile, 'r') as f:
            lines = f.read().splitlines()
        for i, line in enumerate(lines):
            if i == config.number_parses:
                break
            print(f"Parsing sentence {i + 1}")
            sent = line.split()
            for t in parser.parse(sent):
                TreeView(t)._cframe.print_to_file(f"{config.output_parse}/tree_{i}")
def get_grammar(cls, train_trees, starting_symb='SENT'):
    """
    This method returns the grammar computed from the training set.

    Inputs:
    -------
    train_trees (list): List of trees to perform training
    starting_symb (str): The root symbol
    """
    productions = []
    # Chomsky Normal Form
    for tree in train_trees:
        # Remove unary rules
        treetransforms.collapse_unary(tree)
        # Transform to CNF
        treetransforms.chomsky_normal_form(tree, horzMarkov=2)
        # Compute the productions and store them
        productions += tree.productions()
    # Define the root symbol
    SENT = Nonterminal(starting_symb)
    # Compute the grammar using PCFG induction
    grammar = induce_pcfg(SENT, productions)
    grammar.chomsky_normal_form()
    return grammar
def accuracy(train_rules, test_rules, prob_thresh):
    """
    Gives the NP production rules which are exclusive to one set of rules.
    """
    NP = Nonterminal('NP')
    trainAmount = 0
    testAmount = 0
    rules_train = []
    for rule in train_rules.productions():
        if NP == rule.lhs():  # and rule.lhs() != 'NNP' and rule.lhs() != 'NNPS':
            trainAmount += 1
            rules_train.append(rule.rhs())
    rules_test = []
    for rule in test_rules.productions():
        if NP == rule.lhs():  # and rule.lhs() != 'NNP' and rule.lhs() != 'NNPS':
            testAmount += 1
            rules_test.append(rule.rhs())
    rulesExclusivelyInTrain = 0
    for train_rule in rules_train:
        if train_rule not in rules_test:
            rulesExclusivelyInTrain += 1
    rulesExclusivelyInTest = 0
    for test_rule in rules_test:
        if test_rule not in rules_train:
            rulesExclusivelyInTest += 1
    return rulesExclusivelyInTrain, rulesExclusivelyInTest, trainAmount, testAmount
def create_pcfg_from_treebank(pickle_it=False, log_it=False, filename="treebank", full=False):
    """
    Creates a PCFG from the Penn Treebank dataset using induce_pcfg
    Optional pickling of this PCFG in pickled-vars/
    """
    tb = ptb if full else treebank
    productions = []
    flat_trees = 0
    for item in tb.fileids():  # Goes through all trees
        for tree in tb.parsed_sents(item):
            if tree.height() == 2:  # Gets rid of flat trees
                flat_trees += 1
                continue
            # print(" ".join(tree.leaves()))  # This should print the sentences
            # perform optional tree transformations, e.g.:
            # tree.collapse_unary(collapsePOS = False)  # Collapse branches A-B-C into A-B+C
            # tree.chomsky_normal_form(horzMarkov = 2)  # Binarize A->(B,C,D) into A->B,C+D and C+D->C,D
            productions += tree.productions()
    print("%s flat trees purged" % flat_trees)
    S = Nonterminal('S')
    grammar = induce_pcfg(S, productions)
    if pickle_it:
        pickle.dump(grammar, open("%s%s-grammar.p" % (var_dir, filename), "wb"))
    if log_it:
        save_grammar_cleartext(grammar, filename)
        save_lexicon_cleartext(grammar, filename)
    return grammar
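# Example call (hedged: `ptb`, `var_dir`, and the save_*_cleartext helpers
# are assumed to be defined elsewhere in this module):
grammar = create_pcfg_from_treebank(pickle_it=False, log_it=False,
                                    filename="treebank", full=False)
print(len(grammar.productions()))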
def test_weird_indices():
    cfg = CFGHelper(sample_grammar)
    indexes = [1, 1, 1, 1, 1, 1, 1, 1, 1]
    assert cfg.indexes_to_tokens(indexes) == ['y']
    indexes = [1]
    assert cfg.indexes_to_tokens(indexes) == [Nonterminal('V')]
def build_context_free_grammar(self, data):
    productions = []
    for tree in [Tree.fromstring(t) for t in data]:
        tree.collapse_unary(collapsePOS=False)
        tree.chomsky_normal_form(horzMarkov=2)
        productions += tree.productions()
    starting_state = Nonterminal('SENT')
    grammar = induce_pcfg(starting_state, productions)
    return grammar
def build_candidate_tree(score, back, words):
    li = 0
    ri = len(words)
    tagi = Nonterminal('NEW_ROOT')
    if tagi not in back[li][ri]:
        return None
    tree_string = '(' + str(tagi) + ' ' + build_tree(back, li, ri, tagi, words, "")
    candidate_tree = Tree.fromstring(tree_string)
    return candidate_tree
def compose_children(self):
    """
    Combine all valid left and right children for the current location in the matrix.
    """
    for l_symbol, l_info in self.matrix[self.i][self.k].items():
        l_rhs = Nonterminal(l_symbol)
        for r_symbol, r_info in self.matrix[self.k][self.j].items():
            r_rhs = Nonterminal(r_symbol)
            # check the subtrees in [i][k] and [k][j] to see if they form a valid rhs
            potential_rules = [p for p in self.grammar.productions(rhs=l_rhs)
                               if p.rhs()[1] == r_rhs]
            for potential_rule in sorted(potential_rules, key=lambda x: x.prob()):
                new_lhs = potential_rule.lhs().symbol()
                new_tree = Tree(new_lhs, [l_info[1], r_info[1]])
                new_prob = log(potential_rule.prob()) + l_info[0] + r_info[0]
                if (new_lhs not in self.matrix[self.i][self.j]
                        or new_prob > self.matrix[self.i][self.j][new_lhs][0]):
                    self.matrix[self.i][self.j][new_lhs] = (new_prob, new_tree)
def create_pcfg(self, trees):
    productions = []
    for tree in trees:
        tree.collapse_unary(collapsePOS=True)
        tree.chomsky_normal_form(horzMarkov=2)
        productions += tree.productions()
    S = Nonterminal('SENT')
    grammar = induce_pcfg(S, productions)
    return grammar
def _setprob(self, tree, prod_probs):
    if tree.prob() is not None:
        return
    # Get the probability of the CFG production at this node.
    lhs = Nonterminal(tree.label())
    rhs = []
    for child in tree:
        if isinstance(child, Tree):
            rhs.append(Nonterminal(child.label()))
        else:
            rhs.append(child)
    prob = prod_probs[lhs, tuple(rhs)]
    # Multiply in the probabilities of the children.
    for child in tree:
        if isinstance(child, Tree):
            self._setprob(child, prod_probs)
            prob *= child.prob()
    tree.set_prob(prob)
def induce(trees: Iterable) -> FancyPCFG:
    productions = []
    for tree in trees:
        # tree.pretty_print()
        # perform optional tree transformations, e.g.:
        # tree.collapse_unary(collapsePOS = False)  # Collapse branches A-B-C into A-B+C
        # tree.chomsky_normal_form(horzMarkov = 2)  # Binarize A->(B,C,D) into A->B,C+D and C+D->C,D
        productions += tree.productions()
    S = Nonterminal('S')
    grammar = induce_pcfg(S, productions)
    return FancyPCFG.fromCFG(grammar)
def update_grammar(productions, unknown):
    lis = pos_tagger.tag(unknown)
    for i in range(len(lis)):
        pos = nonterminals(lis[i][1])[0]
        production_ = Production(pos, [unknown[i]])
        productions.append(production_)
        print(production_, "added to productions")
    S = Nonterminal('SENT')
    grammar = induce_pcfg(S, productions)
    return grammar
def gen_sql_stmt_from_grammar(self, start_, num_stmts=None,
                              table_name="table_name", columns_name="columns_names"):
    grammar = CFG.fromstring(
        self.get_sql_select_stml_grammar(table_name, columns_name, COMMON_VALUES))
    sql_select_stmts = []
    for stmt in generate(grammar, start=Nonterminal(start_), n=num_stmts):
        sql_select_stmts.append(''.join(stmt))
    return sql_select_stmts
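# Standalone sketch of the underlying nltk.parse.generate.generate call,
# using a toy SELECT grammar (the table and column names are placeholders,
# not the grammar the method above builds):
from nltk import CFG, Nonterminal
from nltk.parse.generate import generate

toy = CFG.fromstring("""
STMT -> 'SELECT ' COLS ' FROM my_table'
COLS -> '*' | 'col_a' | 'col_b'
""")
for stmt in generate(toy, start=Nonterminal('STMT'), n=3):
    print(''.join(stmt))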
def getGrammar():
    fileid = treebank.fileids()
    trainfiles = fileid[:160]
    # testfiles = fileid[0.8*len(fileid):]
    productions = []
    for item in trainfiles:
        for tree in treebank.parsed_sents(item):
            # perform optional tree transformations, e.g.:
            tree.collapse_unary(collapsePOS=False)  # Collapse branches A-B-C into A-B+C
            tree.chomsky_normal_form(horzMarkov=2)  # Binarize A->(B,C,D) into A->B,C+D and C+D->C,D
            productions += tree.productions()

    list_prod = list(set(productions))

    # Collect the lexical rules and the preterminals (POS tags) they use
    token_rule = [p for p in list_prod if p.is_lexical()]
    list_token_rule = list(set(p.lhs() for p in token_rule))
    corr_list_token_rule = [tag for tag in list_token_rule if str(tag).isalpha()]

    # Give every alphabetic POS tag a rewrite to the placeholder terminal
    # 'UNK', so that words unseen in training can still be parsed
    a = []
    for tok in corr_list_token_rule:
        lhs2 = nltk.grammar.Nonterminal(str(tok))
        a.append(nltk.grammar.Production(lhs2, ['UNK']))
    token_rule.extend(a)
    list_prod.extend(a)

    S = Nonterminal('S')
    grammar = induce_pcfg(S, list_prod)
    return grammar
def nonterm_generation_suite(cfg):
    # Data
    expansions = defaultdict(list)
    # Run tests
    # TODO expand all nonterminals
    for x in range(5):
        expansions[Nonterminal('ROOT')].append(
            cfg.nltk_expand(Nonterminal('ROOT')))
    # Print results
    print('Symbols in cfg:\n')
    pprint.pprint(cfg.productions.keys())
    print('\nExpansions:\n')
    for symbol in expansions:
        if symbol == 'S':
            print('Root:', symbol, '\n')
        else:
            print('Nonroot:', symbol)
            pprint.pprint(expansions[symbol])
            print('\n')
def main():
    # print(nltk.corpus.treebank.parsed_sents('wsj_0001.mrg')[0])
    # nltk.corpus.treebank.parsed_sents('wsj_0001.mrg')[0].draw()
    print("Induce PCFG grammar from treebank data:")
    productions = []
    print(len(treebank.fileids()))
    for item in treebank.fileids():  # Goes through all trees
        for tree in treebank.parsed_sents(item):
            # perform optional tree transformations, e.g.:
            tree.collapse_unary(collapsePOS=False)  # Collapse branches A-B-C into A-B+C
            tree.chomsky_normal_form(horzMarkov=2)  # Binarize A->(B,C,D) into A->B,C+D and C+D->C,D
            productions += tree.productions()

    S = Nonterminal('S')
    grammar = induce_pcfg(S, productions)
    # print(grammar)  # This is a PCFG
    # pickle.dump(grammar, open("tbank-grammar.p", "wb"))
    # t = time.time()
    # grammar = pickle.load(open("tbank-grammar.p", "rb"))
    # textf = open("lexicon.txt", "w")
    # textf.write(str(reduce(lambda a, b: a + "\n" + b,
    #     list(filter(lambda x: "'" in x, str(grammar).split("\n"))))))
    # textf.close()
    # print(time.time() - t)

    parser = ViterbiParser(grammar)
    # pickle.dump(parser, open("cky-parser.p", "wb"))
    # parser = pickle.load(open("cky-parser.p", "rb"))
    parser.trace(0)
    sent = "John will join the board"
    tokens = sent.split()
    try:
        grammar.check_coverage(tokens)
        print("All words covered")
        parses = parser.parse_all(tokens)
        if parses:
            print(len(parses))
            print(parses[0].label())
            # parses[0].draw()
            p = reduce(lambda a, b: a + b.prob(),
                       filter(lambda x: x.label() == 'S', parses), 0.0)
        else:
            p = 0
        print("Probability:", p)
    except ValueError:
        print("Some words not covered")
def build(self, examples=tuple()):
    """
    :param examples: tuple or list of nltk Trees
    :return:
    """
    allproductions = []
    for example in examples:
        q = example
        t = self.grammarify(q)
        t = Tree("S", [t])
        productions = t.productions()
        allproductions += productions
    pcfg = nltk.induce_pcfg(Nonterminal("S"), allproductions)
    return pcfg
import nltk
from nltk import Nonterminal, nonterminals, Production, CFG

nonterminal1 = Nonterminal('NP')
nonterminal2 = Nonterminal('VP')
nonterminal3 = Nonterminal('PP')
print(nonterminal1.symbol())
print(nonterminal2.symbol())
print(nonterminal3.symbol())
print(nonterminal1 == nonterminal2)
print(nonterminal2 == nonterminal3)
print(nonterminal1 == nonterminal3)

S, NP, VP, PP = nonterminals('S, NP, VP, PP')
N, V, P, DT = nonterminals('N, V, P, DT')
production1 = Production(S, [NP, VP])
production2 = Production(NP, [DT, NP])
production3 = Production(VP, [V, NP, NP, PP])
print(production1.lhs())
print(production1.rhs())
print(production3.lhs())
print(production3.rhs())
print(production3 == Production(VP, [V, NP, NP, PP]))
print(production2 == production3)
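# Follow-on sketch: the same Production objects can be handed to induce_pcfg
# to build a toy PCFG (rule weights simply reflect how often each production
# appears in the list; duplicating production1 is illustrative):
from nltk import induce_pcfg

toy_pcfg = induce_pcfg(S, [production1, production2, production3, production1])
print(toy_pcfg)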