def get_grammar(cls, train_trees, starting_symb='SENT'):
    """
    This method returns the grammar computed from the training set.

    Inputs:
    -------
    train_trees (list): List of trees to perform training
    starting_symb (str): The root symbol
    """
    productions = []
    # Chomsky Normal Form
    for tree in train_trees:
        # Remove unary rules
        treetransforms.collapse_unary(tree)
        # Transform to CNF
        treetransforms.chomsky_normal_form(tree, horzMarkov=2)
        # Compute the productions and store them
        productions += tree.productions()
    # Define the root symbol
    SENT = Nonterminal(starting_symb)
    # Compute the grammar using PCFG induction
    grammar = induce_pcfg(SENT, productions)
    grammar.chomsky_normal_form()
    return grammar
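# Hedged usage sketch for get_grammar above. The owning class GrammarTrainer
# (assumed to expose get_grammar as a @classmethod) and the treebank slice are
# hypothetical; treetransforms, Nonterminal and induce_pcfg are assumed to be
# imported from nltk, as the method body implies.
from nltk.corpus import treebank

train_trees = [t.copy(deep=True) for t in treebank.parsed_sents()[:50]]  # copies: the CNF transforms mutate in place
grammar = GrammarTrainer.get_grammar(train_trees, starting_symb='S')     # hypothetical class
print(grammar.start())  # -> S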
def _induce_pcfg(self, corpus):
    """
    Induce a PCFG from the corpus, using first-level lexicalization
    and verb groups.
    """
    prods = sum((lexicalize(t, grup="grup.verb").productions()
                 for t in lemmatized_sents(corpus.corpus)), [])
    S = nltk.Nonterminal('sentence')
    return nltk.induce_pcfg(S, prods)
def get_bigram_and_deep_syntax_feature(review, speller, stop_words, ps, preprocess):
    res = ""
    productions = []
    parser = CoreNLPParser(url='http://localhost:9500')
    for sentence in re.split(r"[.!?]", review):
        try:
            tree = next(parser.raw_parse(sentence))
            # Optimize by creating Chomsky normal form
            tree.collapse_unary(collapsePOS=False)
            tree.chomsky_normal_form(horzMarkov=2)
            productions += tree.productions()
        except StopIteration:
            # End of review reached
            break
    S = Nonterminal('S')
    grammar = induce_pcfg(S, productions)
    count = 0
    for line in str(grammar).split("\n"):
        if count == 0:
            count += 1
            continue
        elif "'" in line:
            res += re.sub(r"[(->) `\'\"\[\d\]]", "", line) + " "
    res += bipos.get_bigrams_and_unigrams_of_sentence(
        bow.sanitize_sentence(review, speller, stop_words, ps, preprocess))
    return res
def create_pcfg_from_treebank(pickle_it=False, log_it=False, filename="treebank", full=False):
    """
    Creates a PCFG from the Penn Treebank dataset using induce_pcfg
    Optional pickling of this PCFG in pickled-vars/
    """
    if full:
        tb = ptb
    else:
        tb = treebank
    productions = []
    flat_trees = 0
    for item in tb.fileids():  # Goes through all trees
        for tree in tb.parsed_sents(item):
            if tree.height() == 2:  # Gets rid of flat trees
                # print("####Tree not collected#####")
                flat_trees += 1
                continue
            # print(" ".join(tree.leaves()))  # This should print the sentences
            # perform optional tree transformations, e.g.:
            # tree.collapse_unary(collapsePOS=False)  # Remove branches A-B-C into A-B+C
            # tree.chomsky_normal_form(horzMarkov=2)  # Remove A->(B,C,D) into A->B,C+D->D
            productions += tree.productions()
    print("%s flat trees purged" % flat_trees)
    S = Nonterminal('S')
    grammar = induce_pcfg(S, productions)
    if pickle_it:
        pickle.dump(grammar, open("%s%s-grammar.p" % (var_dir, filename), "wb"))
    if log_it:
        save_grammar_cleartext(grammar, filename)
        save_lexicon_cleartext(grammar, filename)
    return grammar
def pcfg_reverse(word):
    s = build_tree(word, 0)
    tree = nltk.Tree.fromstring(s)
    productions = tree.productions()
    for p in productions:
        ##################################################
        # !!! THIS IS WHERE THE MAGIC HAPPENS !!!        #
        if len(p._rhs) > 1:                              #
            p._rhs = (p._rhs[1], p._rhs[0])              #
        ##################################################
        pass
    grammar = nltk.induce_pcfg(nltk.Nonterminal("N0"), productions)
    # print(grammar)  # UNCOMMENT FOR A FUN TIME!
    parser = nltk.pchart.InsideChartParser(grammar)
    # Shuffle to generate 1000 possible words; only the correct
    # solution will be parseable with our grammar!
    for i in range(1000):
        cand = random.sample(word, len(word))
        # print(cand)  # UNCOMMENT FOR A FUN TIME!
        for parse in parser.parse(cand):
            if parse._ProbabilisticMixIn__prob > 0:
                # print("number of tries: {}".format(i))  # UNCOMMENT!
                return "".join(cand)
    return "no reverse found, try again"
def pcfg_demo():
    """
    A demonstration showing how C{WeightedGrammar}s can be created and used.
    """
    from nltk.corpus import treebank
    from nltk import treetransforms
    from nltk import induce_pcfg
    from nltk.parse import pchart

    pcfg_prods = toy_pcfg1.productions()
    pcfg_prod = pcfg_prods[2]
    print('A PCFG production:', repr(pcfg_prod))
    print('    pcfg_prod.lhs()  =>', repr(pcfg_prod.lhs()))
    print('    pcfg_prod.rhs()  =>', repr(pcfg_prod.rhs()))
    print('    pcfg_prod.prob() =>', repr(pcfg_prod.prob()))
    print()

    grammar = toy_pcfg2
    print('A PCFG grammar:', repr(grammar))
    print('    grammar.start()       =>', repr(grammar.start()))
    print('    grammar.productions() =>', end=' ')
    # string.replace(...) is used to line-wrap the output.
    print(repr(grammar.productions()).replace(',', ',\n' + ' ' * 26))
    print()

    print('Coverage of input words by a grammar:')
    print(grammar.covers(['a', 'boy']))
    print(grammar.covers(['a', 'girl']))

    # extract productions from the treebank trees and induce the PCFG
    print("Induce PCFG grammar from treebank data:")
    productions = []
    for item in treebank.items[:2]:
        for tree in treebank.parsed_sents(item):
            # perform optional tree transformations, e.g.:
            tree.collapse_unary(collapsePOS=False)
            tree.chomsky_normal_form(horzMarkov=2)
            productions += tree.productions()
    S = Nonterminal('S')
    grammar = induce_pcfg(S, productions)
    print(grammar)
    print()

    print("Parse sentence using induced grammar:")
    parser = pchart.InsideChartParser(grammar)
    parser.trace(3)
    # doesn't work as tokens are different:
    # sent = treebank.tokenized('wsj_0001.mrg')[0]
    sent = treebank.parsed_sents('wsj_0001.mrg')[0].leaves()
    print(sent)
    for parse in parser.nbest_parse(sent):
        print(parse)
def _induce_pcfg(self, corpus):
    """
    Induce a PCFG from the corpus, using first-level lexicalization.
    """
    # stemmer = SpanishStemmer()
    S = nltk.Nonterminal('sentence')
    arboles = []
    for tree in corpus.corpus.parsed_sents():
        # Transform the trees to get the rules in Chomsky Normal Form.
        # tree.collapse_unary(collapsePOS=True, collapseRoot=True)
        # tree.chomsky_normal_form(horzMarkov=2)
        arboles.append(tree.copy())
    # Parent-annotate the level just above the leaves with the leaf's lemma.
    productions = []
    for arbol in arboles:
        for t in arbol.treepositions('leaves'):
            arbol[t] = arbol[t].lower()
            t_p = tuple(x[1] for x in enumerate(t) if x[0] != len(t) - 1)
            arbol[t_p].set_label(arbol[t_p].label() + "#" + corpus.obtener_lema(arbol[t]))
        productions.extend(arbol.productions())
    return nltk.induce_pcfg(S, productions)
def create_grammar() -> PCFG:
    # 21,763 productions with word terminals
    #  8,028 productions with pos terminals
    #  6,275 productions with nonterminals without digits
    #  5,402 productions with nonterminals without punctuation
    #  2,972 productions with nonterminals without suffixes
    #    707 nonterminals
    #    190 nonterminals without digit labels
    #    180 nonterminals without punctuation
    #     63 nonterminals without suffixes
    productions = []
    start_symbol = Nonterminal('S')
    for tree in nltk.corpus.treebank.parsed_sents():
        for production in tree.productions():
            if not valid_nonterminal(production.lhs()):
                continue
            if isinstance(production.rhs()[0], Nonterminal):
                lhs = simplify_nonterminal(production.lhs())
                rhs = tuple(simplify_nonterminal(t) for t in production.rhs()
                            if valid_nonterminal(t))
                productions.append(Production(lhs, rhs))
            else:
                simplified = simplify_nonterminal(production.lhs())
                productions.append(Production(simplified, (simplified.symbol(),)))
    grammar = nltk.induce_pcfg(start_symbol, productions)
    # print(grammar.productions())
    print(len(grammar.productions()))
    nonterminals = set(prod.lhs() for prod in grammar.productions())
    print(sorted(nonterminals))
    print(len(nonterminals))
    return grammar
def induce_weights(self, sentences):
    if self.grammar is None:
        raise Exception("Need to call induce_structure first")
    sentences = [[c for c in s] for s in sentences]
    log_prob_last = 0
    log_prob_curr = float('inf')
    while abs(log_prob_last - log_prob_curr) > 0.0001:
        log_prob_last = log_prob_curr
        parser = ViterbiParser(self.grammar)
        productions = []
        log_prob_curr = 0
        for i, sent in enumerate(sentences):
            print("parsing sentence %i of %i" % (i, len(sentences)))
            found = False
            for tree in parser.parse(sent):
                found = True
                log_prob_curr += tree.logprob()
                productions += tree.productions()
            if not found:
                print(sent)
                raise Exception("Unable to parse sentence")
        # print("last log prob", log_prob_last)
        print("curr log prob", log_prob_curr)
        self.grammar = nltk.induce_pcfg(self.start, productions)
def learn_PCFG(text_set, start_token):
    s = Nonterminal(start_token)
    production_list = []
    for sent in text_set:
        production_list += sent.productions()
    return induce_pcfg(s, production_list)
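# Hedged usage sketch for learn_PCFG above, assuming Nonterminal and
# induce_pcfg come from nltk; the treebank slice is illustrative.
from nltk import Nonterminal, induce_pcfg
from nltk.corpus import treebank

grammar = learn_PCFG(treebank.parsed_sents()[:10], 'S')
print(grammar.start())             # -> S
print(len(grammar.productions()))  # number of distinct weighted rules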
def generate_pcfg_productions(self, questionbank):
    productions = []
    with io.open(questionbank, 'r', encoding='utf8') as f:
        for line in f:
            line = line.strip()
            sent_text = nltk.sent_tokenize(line)
            for sentence in sent_text:
                # print(sentence)
                ss = self.parser.raw_parse_sents((sentence,))
                for k in ss:
                    for s in k:
                        buf = "%s" % s
                        buf = six.text_type(buf)
                        s1 = Tree.fromstring(buf)
                        # get rid of the ROOT
                        for node in s1:
                            if node.label() == 'ROOT':
                                continue
                            else:
                                s1 = node
                                break
                        s1.chomsky_normal_form(horzMarkov=2)
                        pdc = []
                        for p in s1.productions():
                            # drop the lexical productions
                            if not p.is_lexical():
                                pdc.append(p)
                        productions += pdc
    S = Nonterminal('S')
    self.grammar = induce_pcfg(S, productions)
def _induce_pcfg(self, corpus):
    """
    Induce a PCFG grammar from the corpus (treebank), handling UNK words.
    """
    unk_words = {k for k, v in self.wordfrecs.items() if v == 1}
    productions = []
    for tree in corpus.corpus.parsed_sents():
        # Transform the trees to get the rules in Chomsky Normal Form.
        # tree.collapse_unary(collapsePOS=True, collapseRoot=True)
        # tree.chomsky_normal_form(horzMarkov=2)
        # Lowercase all the leaves.
        for t in tree.treepositions('leaves'):
            tree[t] = tree[t].lower()
        productions += tree.productions()
    # Rewrite lexical rules for hapax words as X -> 'UNK'.
    new_productions = []
    for pr in productions:
        if len(pr.rhs()) == 1 and pr.rhs()[0] in unk_words:
            new_pr = nltk.grammar.Production(pr.lhs(), ['UNK'])
            new_productions.append(new_pr)
        else:
            new_productions.append(pr)
    S = nltk.Nonterminal('sentence')
    return nltk.induce_pcfg(S, new_productions)
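# The UNK rewriting above only pays off if test input gets the same treatment.
# Hedged sketch of the matching test-time step (the helper name and the
# vocabulary/grammar variables are hypothetical):
def replace_oov(tokens, known_words):
    # Mirror training: lowercase, and map words unseen in training to 'UNK'.
    return [t.lower() if t.lower() in known_words else 'UNK' for t in tokens]

# parser = nltk.ViterbiParser(grammar)
# trees = parser.parse(replace_oov("El gato come pescado".split(), vocabulary))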
def fit_pcfg(self, X):
    if self.fitted_pcfg:
        raise ValueError("PCFG.pcfg already fitted")
    productions = []
    for sentence in X:
        # nltk format
        t = nltk.tree.Tree.fromstring(sentence, remove_empty_top_bracketing=True)
        # Chomsky normal form
        self.chomkysation(t)
        # rule extraction
        rules = self.extract_rules(t, lexical=False)
        productions.extend(rules)
    start = nltk.Nonterminal('SENT')
    self.pcfg_ = nltk.induce_pcfg(start, productions)
    self.pcfg_.chomsky_normal_form(flexible=False)
    # collect tokens
    for prod in self.pcfg_._productions:
        for token in prod._rhs:
            if not token == 'SENT':
                self.non_terminals.append(token)
    self.non_terminals.insert(0, start)
    # build token-to-index mapping
    self.pos2index = {}
    for i, token in enumerate(self.non_terminals):
        self.pos2index[token] = i
    self.fitted_pcfg = True
def train_grammar(unknown_words=[], nb_reduced_production=6000):
    productions = []
    for item in train:
        for tree in treebank.parsed_sents(item):
            # perform optional tree transformations, e.g.:
            tree.collapse_unary(collapsePOS=False)  # Remove branches A-B-C into A-B+C
            tree.chomsky_normal_form(horzMarkov=2)  # Remove A->(B,C,D) into A->B,C+D->D
            # tree_prods = tree.productions()
            productions += tree.productions()
    counter = collections.Counter(productions)
    n_comms = [item for item, count in counter.most_common(nb_reduced_production)
               for i in range(count)]
    # Add unknown words and terminal rules back into the reduced production set
    unknown_words_prods = []
    for p in productions:
        if isinstance(p._rhs[0], str):
            unknown_words_prods.append(p)
            for u in unknown_words:
                rhs = [u]
                lhs = p._lhs
                new_prod = Production(lhs, rhs)
                unknown_words_prods.append(new_prod)
    n_comms += unknown_words_prods
    S = Nonterminal('S')
    grammar = induce_pcfg(S, n_comms)
    return grammar
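# Hedged usage sketch for train_grammar above (train, treebank and the other
# imports are assumed from the surrounding module; the OOV token is made up):
grammar = train_grammar(unknown_words=['covfefe'], nb_reduced_production=6000)
print(len(grammar.productions()))
grammar.check_coverage(['covfefe'])  # raises ValueError only if still uncovered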
def to_pcfg(sequences, sections):
    sequences = [s[s >= 0] for s in sequences]
    trees = [Tree.fromstring(to_tree(s, sections)) for s in sequences]
    # [t.collapse_unary(collapsePOS=False) for t in trees]
    # [t.chomsky_normal_form(horzMarkov=2) for t in trees]
    prods = [p for t in trees for p in t.productions()]
    print(induce_pcfg(Nonterminal('S'), prods))
def generate_grammar(f, s):
    productions = []
    for line in f:
        tree = Tree.fromstring(line)
        for production in tree.productions():
            productions += [production]
    return nltk.induce_pcfg(s, productions)
def get_parser(training_trees):
    training_prods = sum([list(tree_to_productions(t)) for t in training_trees], list())
    pos_rules = [Production(Nonterminal(lhs), ["$" + rhs])
                 for lhs, rhs in GRAMMAR_TO_POS.items()]
    training_prods += pos_rules
    training_pcfg = induce_pcfg(Nonterminal("S"), training_prods)
    parser = ViterbiParser(training_pcfg)
    return parser, training_pcfg
def _induce_pcfg(self, corpus):
    """
    Induce a PCFG from the corpus.
    """
    prods = sum((t.productions() for t in corpus.corpus.parsed_sents()), [])
    S = nltk.Nonterminal('sentence')
    grammar = nltk.induce_pcfg(S, prods)
    return grammar
def _train_rules_grammar(self):
    print("training grammar")
    self._grammar = nltk.induce_pcfg(
        nltk.Nonterminal('TOP'),
        reduce(lambda a, b: a + b, map(lambda t: t.productions(), self._treebank)))
    if RUN_MODE == PURE_CKY_M and UNKOWN_MODE:
        add_unknowns(self._grammar)
    print("finished grammar training")
def build_context_free_grammar(self, data):
    productions = []
    for tree in [Tree.fromstring(tree) for tree in data]:
        tree.collapse_unary(collapsePOS=False)
        tree.chomsky_normal_form(horzMarkov=2)
        productions += tree.productions()
    starting_state = Nonterminal('SENT')
    grammar = induce_pcfg(starting_state, productions)
    return grammar
def create_pcfg(self, trees):
    productions = []
    for tree in trees:
        tree.collapse_unary(collapsePOS=True)
        tree.chomsky_normal_form(horzMarkov=2)
        productions += tree.productions()
    S = Nonterminal('SENT')
    grammar = induce_pcfg(S, productions)
    return grammar
def induce(trees: Iterable) -> FancyPCFG:
    productions = []
    for tree in trees:
        # tree.pretty_print()
        # perform optional tree transformations, e.g.:
        # tree.collapse_unary(collapsePOS=False)  # Remove branches A-B-C into A-B+C
        # tree.chomsky_normal_form(horzMarkov=2)  # Remove A->(B,C,D) into A->B,C+D->D
        productions += tree.productions()
    S = Nonterminal('S')
    grammar = induce_pcfg(S, productions)
    return FancyPCFG.fromCFG(grammar)
def gen_sample_data_grammars(data, root, grammar_format):
    '''
    Generate a cfg/pcfg (grammar_format) from a list of nltk.Tree data
    '''
    grammars = []
    for _data in data:
        for production in _data.productions():
            grammars.append(production)
    if grammar_format == constants.APP_DEFAULT_PCFG_FORMAT:
        grammars = nltk.induce_pcfg(root, grammars)
    return grammars
def induce_structure(self, sentences):
    sentences = [[c for c in s] for s in sentences]
    start_symbols = set()
    productions = []
    prod_table = {}
    # group all digits together
    digit_terminals = set([str(i) for i in range(10)])
    # unary rules
    terminals = set()
    for s in sentences:
        terminals.update(s)
    for t in terminals:
        if t in digit_terminals:
            nt = nltk.Nonterminal("Digit")
        else:
            nt = nltk.Nonterminal("Unary%s" % self.gen_nt())
        p = Production(nt, [t])
        productions.append(p)
        prod_table[tuple(p.rhs())] = p.lhs()
    sentences = self.apply_unary_prod(sentences, prod_table)
    while len(sentences) > 0:
        if self.has_recursion(sentences):
            p = self.generate_recursive_prod(sentences)
        else:
            p = self.generate_most_frequent_prod(sentences)
        productions.append(p)
        prod_table[tuple(p.rhs())] = p.lhs()
        sentences = self.update_with_prod(sentences, prod_table)
        new_sentences = []
        for s in sentences:
            if len(s) == 1:
                start_symbols.add(s[0])
            else:
                new_sentences.append(s)
        sentences = new_sentences
    # generate the start productions
    for symbol in start_symbols:
        for p in productions:
            if p.lhs() == symbol:
                productions.append(Production(self.start, p.rhs()))
    self.grammar = nltk.induce_pcfg(self.start, productions)
def _induce_pcfg(self, corpus):
    """
    Induce a PCFG grammar from the corpus (treebank), handling UNK words.
    """
    def induce_unk(tree):
        for prod in tree.productions():
            if prod.is_lexical() and self.wordfrecs[prod.rhs()[0]] == 1:
                yield nltk.Production(prod.lhs(), ["UNK"])
            else:
                yield prod
    prods = sum((list(induce_unk(t)) for t in corpus.corpus.parsed_sents()), [])
    S = nltk.Nonterminal('sentence')
    return nltk.induce_pcfg(S, prods)
def getGrammar():
    fileid = treebank.fileids()
    trainfiles = fileid[:160]
    # testfiles = fileid[0.8 * len(fileid):]
    productions = []
    for item in trainfiles:
        for tree in treebank.parsed_sents(item):
            # perform optional tree transformations, e.g.:
            tree.collapse_unary(collapsePOS=False)   # Remove branches A-B-C into A-B+C
            tree.chomsky_normal_form(horzMarkov=2)   # Remove A->(B,C,D) into A->B,C+D->D
            productions += tree.productions()
    lhs_prod = [p.lhs() for p in productions]
    rhs_prod = [p.rhs() for p in productions]
    set_prod = set(productions)
    list_prod = list(set_prod)
    token_rule = []
    for ele in list_prod:
        if ele.is_lexical():
            token_rule.append(ele)
    set_token_rule = set(p.lhs() for p in token_rule)
    list_token_rule = list(set_token_rule)
    corr_list_token_rule = []
    for word in list_token_rule:
        if str(word).isalpha():
            corr_list_token_rule.append(word)
            continue
    # print(corr_list_token_rule)
    import nltk
    a = []
    for tok in corr_list_token_rule:
        # lhs = nltk.grammar.Nonterminal('UNK')
        lhs = 'UNK'
        rhs = [u'UNK']
        UNK_production = nltk.grammar.Production(lhs, rhs)
        lhs2 = nltk.grammar.Nonterminal(str(tok))
        a.append(nltk.grammar.Production(lhs2, [lhs]))
    token_rule.extend(a)
    list_prod.extend(a)
    S = Nonterminal('S')
    grammar = induce_pcfg(S, list_prod)
    return grammar
def update_grammar(productions, unknown):
    lis = pos_tagger.tag(unknown)
    for i in range(len(lis)):
        pos = nonterminals(lis[i][1])[0]
        production_ = Production(pos, [unknown[i]])
        productions.append(production_)
        print(production_, "added to productions")
    S = Nonterminal('SENT')
    grammar = induce_pcfg(S, productions)
    return grammar
def build(self, examples=tuple()):
    """
    :param examples: tuple or list of nltk Trees
    :return:
    """
    allproductions = []
    for example in examples:
        q = example
        t = self.grammarify(q)
        t = Tree("S", [t])
        productions = t.productions()
        allproductions += productions
    pcfg = nltk.induce_pcfg(Nonterminal("S"), allproductions)
    return pcfg
def createGrammar(self, userMessages, ctx):
    parser = CoreNLPParser(url='http://localhost:9000')
    parse_trees = []
    for message in userMessages:
        tokenized = nltk.sent_tokenize(message)
        for sentence in tokenized:
            parse_trees.append(list(parser.raw_parse(sentence))[0])
    grammar_rules = set()
    for tree in parse_trees:
        for production in tree.productions():
            grammar_rules.add(production)
    start = nltk.Nonterminal('S')
    grammar = nltk.induce_pcfg(start, grammar_rules)
    return ' '.join(self.generate_sentence(grammar))
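# generate_sentence is not shown above. A minimal sketch of one way to sample
# a sentence from an induced PCFG (note: the recursion is not guaranteed to
# terminate for strongly recursive grammars):
import random
from nltk import Nonterminal

def sample_from_pcfg(grammar, symbol=None):
    # Expand nonterminals recursively, picking each production with its
    # attached probability; terminals are emitted as-is.
    if symbol is None:
        symbol = grammar.start()
    if not isinstance(symbol, Nonterminal):
        return [symbol]
    prods = grammar.productions(lhs=symbol)
    if not prods:
        return [str(symbol)]  # dead-end nonterminal: emit its name
    r, acc = random.random(), 0.0
    for prod in prods:
        acc += prod.prob()
        if r <= acc:
            break
    return [w for sym in prod.rhs() for w in sample_from_pcfg(grammar, sym)]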
def main():
    # print(nltk.corpus.treebank.parsed_sents('wsj_0001.mrg')[0])
    # nltk.corpus.treebank.parsed_sents('wsj_0001.mrg')[0].draw()
    # print("Induce PCFG grammar from treebank data:")
    productions = []
    print(len(treebank.fileids()))
    for item in treebank.fileids():  # Goes through all trees
        for tree in treebank.parsed_sents(item):
            # perform optional tree transformations, e.g.:
            tree.collapse_unary(collapsePOS=False)  # Remove branches A-B-C into A-B+C
            tree.chomsky_normal_form(horzMarkov=2)  # Remove A->(B,C,D) into A->B,C+D->D
            productions += tree.productions()
    # print(type(productions[0]))
    S = Nonterminal('S')
    grammar = induce_pcfg(S, productions)
    # print(grammar)  # This is a PCFG
    # pickle.dump(grammar, open("tbank-grammar.p", "wb"))
    # t = time.time()
    # grammar = pickle.load(open("tbank-grammar.p", "rb"))
    # textf = open("lexicon.txt", "w")
    # n = textf.write(str(reduce(lambda a, b: a + "\n" + b,
    #                            list(filter(lambda x: "'" in x, str(grammar).split("\n"))))))
    # textf.close()
    # print(time.time() - t)
    parser = ViterbiParser(grammar)
    # pickle.dump(parser, open("cky-parser.p", "wb"))
    # parser = pickle.load(open("cky-parser.p", "rb"))
    parser.trace(0)
    sent = "John will join the board"
    tokens = sent.split()
    try:
        grammar.check_coverage(tokens)
        print("All words covered")
        parses = parser.parse_all(tokens)
        if parses:
            lp = len(parses)
            print(lp)
            print(parses[0].label())
            # parses[0].draw()
            p = reduce(lambda a, b: a + b.prob(),
                       list(filter(lambda x: x.label() == 'S', parses)), 0.0)
        else:
            p = 0
        print("Probability:", p)
    except ValueError:  # raised by check_coverage for uncovered words
        print("Some words not covered")
def test_pcfg():
    # trees = ['(S (0 1 2) (0 2 3))', '(S (0 1 2) (0 4 5))', '(S (1 1 2) (1 2 3))']
    # trees = ['(S (0 1 2) (3 4 5 6))', '(S (0 1 2) (7 8) (3 4 5 6))', '(S (0 1 2) (9 4) (10 6))']
    trees = [
        '(S (0 1 2) (3 4 5 6))',
        '(S (0 1 2) (7 8) (3 4 5 6))',
        '(S (0 1 2) (3 4 6))'
    ]
    trees = [Tree.fromstring(t) for t in trees]
    print(trees)
    prods = [p for t in trees for p in t.productions()]
    print(prods)
    grammar = induce_pcfg(Nonterminal('S'), prods)
    print(grammar)
    sequences = [t.leaves() for t in trees]
    description_length(grammar, sequences)
def __init__(self, training_set, vocabulary):
    """
    Initializer for the OmissionDetector.
    Trains a PCFG and an HMM to aid in omission detection.

    Args:
        training_set -- list of parsed sentences
        vocabulary -- set of known words
    """
    prods = flatten([tree.productions() for tree in training_set])
    pcfg = induce_pcfg(Nonterminal("S"), prods)
    self._grammar = InvertedGrammar(pcfg)
    tagged_training_set = [tagged_sent(s) for s in training_set]
    self._hmm = NgramHMM(TEST_N, vocabulary)
    self._hmm.train(tagged_training_set)
def pcfg_train(trees, vocab):
    # Write a function pcfg_train() that takes as its input a collection
    # of nltk.tree.Tree objects. For example, it might be passed some
    # portion of nltk.corpus.treebank.parsed_sents(). This function
    # should return an nltk.PCFG object.
    all_productions = []
    for t in trees:
        for p in t.productions():
            all_productions.append(nltk.Production(p.lhs(), p.rhs()))
    pcfg = nltk.induce_pcfg(nltk.Nonterminal('S'), all_productions)
    return pcfg
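# Hedged usage sketch for pcfg_train above; the treebank slice and the test
# sentence are illustrative, and parsing may be slow on an un-binarized grammar.
import nltk
from nltk.corpus import treebank

grammar = pcfg_train(treebank.parsed_sents()[:100], vocab=None)
parser = nltk.ViterbiParser(grammar)
for tree in parser.parse("Pierre Vinken will join the board".split()):
    print(tree.prob())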
def ex7():
    """
    Using NLTK's own library to generate the probabilities.
    """
    productions = [
        p for tree in treebank.parsed_sents() for p in tree.productions()
    ]
    pcfg = induce_pcfg(Nonterminal("S"), productions)
    # print(pcfg.productions())
    parser = pchart.InsideChartParser(pcfg, beam_size=800)
    for sent in sentences1:
        parsed = list(parser.parse(sent.split()))
        print("Parsing sent: {}".format(sent))
        print(parsed[0])
def parse(sentence: str, save: bool = True, index: str = str(uuid.uuid4())) -> dict:
    results = dict()
    sentence = preprocess(sentence)
    print(f'Sentence: {sentence}')
    results['speech_text'] = sentence

    tokenized = list(pos_parser.tokenize(sentence))
    print(f'Tokenized: {tokenized}')
    print(f'Total words: {len(tokenized)}\n')

    tagged = list(pos_parser.tag(tokenized))
    print(f'Tagged: {tagged}\n')
    results['pos_tagged'] = tagged

    ne_tags = ne_chunk(tagged)
    entities = [(tag.label(), ' '.join(t[0] for t in tag))
                for tag in ne_tags if hasattr(tag, 'label')]
    entities.extend(tag_entities(tagged))
    print(f'Entities: {entities}\n')
    results['named_entities'] = entities

    parsed = next(pos_parser.raw_parse(sentence))
    print('Grammar')
    parsed.pretty_print()

    root = Nonterminal('S')
    grammar = induce_pcfg(root, parsed.productions())
    print(grammar, '\n')
    productions = [(str(prod._lhs), [str(t) for t in prod._rhs], prod.prob())
                   for prod in grammar._productions]
    results['productions'] = productions

    probabilities = [prod.prob() for prod in grammar._productions]
    pcfg = reduce(mul, probabilities)
    print(pcfg, '\n')
    results['pcfg'] = pcfg

    if save:
        results['tree_bin'] = save_image(parsed, index)
    return results
def pcfgParse(sentence):
    productions = list()
    root = nltk.Nonterminal('S')
    for tree in nltk.corpus.treebank.parsed_sents():
        productions += tree.productions()
    grammar = nltk.induce_pcfg(root, productions)
    PCFGParser = nltk.ViterbiParser(grammar)
    s_sent = sentence.split()
    parsed_sent = PCFGParser.parse(s_sent)
    for p in parsed_sent:
        print(p)
def getParser():
    """
    :return: A Viterbi Parser
    """
    productions = []
    S = nltk.Nonterminal('S')
    for tree in train_corpus:
        productions += tree.productions()
    grammar = nltk.induce_pcfg(S, productions)
    for p in islice(grammar.productions(), 50):
        print(p)
    return nltk.ViterbiParser(grammar)
def construct_grammar(f, S):
    productions = []
    for line in f:
        tree = Tree.fromstring(line)
        # convert the tree to Chomsky normal form
        # tree.collapse_unary(collapsePOS=True, collapseRoot=True)
        # tree.chomsky_normal_form(horzMarkov=2)
        # tree.set_label('TOP')
        for production in tree.productions():
            if len(production.rhs()) == 1 and isinstance(production.rhs()[0], nltk.Nonterminal):
                print(production.rhs())
            productions += [production]
    with open('productions.txt', 'w') as f:
        f.write(str(productions))
    return nltk.induce_pcfg(S, productions)
def PCFG_Section():
    toy_pcfg1 = PCFG.fromstring("""
        S -> NP VP [1.0]
        NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15]
        Det -> 'the' [0.8] | 'my' [0.2]
        N -> 'man' [0.5] | 'telescope' [0.5]
        VP -> VP PP [0.1] | V NP [0.7] | V [0.2]
        V -> 'ate' [0.35] | 'saw' [0.65]
        PP -> P NP [1.0]
        P -> 'with' [0.61] | 'under' [0.39]
        """)

    pcfg_prods = toy_pcfg1.productions()
    pcfg_prod = pcfg_prods[2]
    print('A PCFG production:', pcfg_prod)
    print('pcfg_prod.lhs()  =>', pcfg_prod.lhs())
    print('pcfg_prod.rhs()  =>', pcfg_prod.rhs())
    print('pcfg_prod.prob() =>', pcfg_prod.prob())

    # extract productions from the treebank trees and induce the PCFG
    print("Induce PCFG grammar from treebank data:")
    productions = []
    for item in treebank.fileids()[:2]:
        for tree in treebank.parsed_sents(item):
            # print(" ".join(tree.leaves()))
            # perform optional tree transformations, e.g.:
            # tree.collapse_unary(collapsePOS=False)  # Remove branches A-B-C into A-B+C
            # tree.chomsky_normal_form(horzMarkov=2)  # Remove A->(B,C,D) into A->B,C+D->D
            prods = tree.productions()
            productions += prods
    S = Nonterminal('S')
    grammar = induce_pcfg(S, productions)
    # print(grammar)  # This is a PCFG

    ### Parsing section below ###
    print("\nParse sentence using induced grammar:")
    parser = pchart.InsideChartParser(grammar)
    parser.trace(1)
    # parse the tokens of the first treebank sentence; the parser returns
    # probabilistic trees, so .prob() is available on each parse
    sent = treebank.parsed_sents('wsj_0001.mrg')[0].leaves()
    for parse in parser.parse(sent):
        print(parse.prob())
def _induce_pcfg(self, corpus):
    """
    Induce a PCFG from the corpus.
    """
    productions = []
    S = nltk.Nonterminal('sentence')
    for tree in corpus.corpus.parsed_sents():
        # Transform the trees to get the rules in Chomsky Normal Form.
        # tree.collapse_unary(collapsePOS=True, collapseRoot=True)
        # tree.chomsky_normal_form(horzMarkov=2)
        # Lowercase every leaf of the tree.
        for t in tree.treepositions('leaves'):
            tree[t] = tree[t].lower()
        productions += tree.productions()
    return nltk.induce_pcfg(S, productions)
def _induce_pcfg(self, corpus):
    """
    Induce a PCFG from the corpus, using first-level lexicalization
    and verb groups.
    """
    S = nltk.Nonterminal('sentence')
    # Parent-annotate the level just above the leaves with the leaf's lemma.
    productions = []
    for arbol in corpus.corpus.parsed_sents():
        for t in arbol.treepositions('leaves'):
            arbol[t] = arbol[t].lower()
            t_p = tuple(x[1] for x in enumerate(t) if x[0] != len(t) - 1)
            arbol[t_p].set_label(arbol[t_p].label() + "#" + corpus.obtener_lema(arbol[t]))
            verbo = corpus.obtener_lema(arbol[t])
            # If the lemma looks like a verb (infinitive ending), also annotate
            # the enclosing "grup.verb" node with it.
            if verbo.endswith(("ar", "er", "ir",)) and "_" not in verbo:
                t_p2 = tuple(x[1] for x in enumerate(t_p) if x[0] != len(t_p) - 1)
                if arbol[t_p2].label() == "grup.verb":
                    arbol[t_p2].set_label(arbol[t_p2].label() + "#" + verbo)
        productions.extend(arbol.productions())
    return nltk.induce_pcfg(S, productions)
def train_pcfg():
    print('training grammar')
    productions = []
    # print(len(treebank.fileids()))
    trees = []
    # up to 199; fewer files gives a shorter grammar and quicker training
    for fileid in treebank.fileids()[0:20]:
        for tree in treebank.parsed_sents(fileid):
            # perform optional tree transformations, e.g.:
            # Remove branches A->B->C into A->B+C so we can avoid infinite
            # productions
            tree.collapse_unary(collapsePOS=False)
            # Remove A->(B,C,D) into A->B,C+D->D (binarization req'd by CKY parser).
            # Horizontal and vertical Markovization remember parents and siblings
            # in the tree; this gives a performance boost but makes the grammar
            # HUGE, and using it would require a tag-forgetting method:
            # tree.chomsky_normal_form(horzMarkov=0, vertMarkov=0)
            tree.chomsky_normal_form()
            productions += tree.productions()
    S = nltk.Nonterminal('S')
    grammar = nltk.induce_pcfg(S, productions)
    print("grammar trained!")
    return grammar
path = 'ancora/ancora-3.0.1es/'
corpus = ancora.AncoraCorpusReader(path)

t = corpus.parsed_sents()[0]
t.draw()
t.productions()

prods = []
for t in corpus.parsed_sents():
    prods += t.productions()
# print(prods)

S = nltk.Nonterminal('sentence')
grammar = nltk.induce_pcfg(S, prods)
# prods2 = grammar.productions(lhs=nltk.Nonterminal('ncms000'))
# print(prods2)

print("===============================================================")
print("===============================================================")

parser = nltk.ViterbiParser(grammar)
for tree in parser.parse("El gato come pescado crudo .".split()):
    print(tree)
    tree.draw()
    print(tree.prob())
print('done!')

# merge productions
Production += Production_singleCharWord

#
# inducing a PCFG from the productions
#
print('\n\nInducing PCFG from the productions occurring in the treebank...')
W = nltk.Nonterminal('W')
baseline_grammar = nltk.induce_pcfg(W, Production)
print('done!')

path_grammar = '../working_data/baseline.grammar.pickle'
print('\nSaving the induced grammar to', path_grammar, '...')
f = open(path_grammar, 'wb')
pickle.dump(baseline_grammar, f)
f.close()
def PCFGlearning(dataset, start):
    production_list = []
    S = Nonterminal(start)
    for sent in dataset:
        production_list += sent.productions()
    return induce_pcfg(S, production_list)
# normalize the c-structures
for t in grammar_used:
    t.chomsky_normal_form()

tbank_productions2 = list(treebank.sents())
test_part = tbank_productions2[int(len(tbank_productions) * 0.8):]

# productions
productions = []
for t in grammar_used:
    productions += Tree.productions(t)

# induce the PCFG
S = nltk.Nonterminal("S")
grammar = nltk.induce_pcfg(S, productions)
prod = grammar.productions()

# helper function to get the probability of a production
def findProb(lhsa, rhsa, prod):
    for p in prod:
        if p.lhs() == lhsa and p.rhs() == rhsa:
            return p.prob()

def CKY(words, grammar):
    nonterms = set()
    for g in grammar.productions():
        nonterms.add(g.lhs())
    triples = []
    lenwords = len(words)
from nltk import induce_pcfg
from nltk import treetransforms
from nltk.corpus import treebank
from nltk.grammar import Nonterminal
from nltk.parse import pchart

productions = []
for tree in treebank.parsed_sents():
    # perform optional tree transformations, e.g.:
    tree.collapse_unary(collapsePOS=False)  # Remove branches A-B-C into A-B+C
    tree.chomsky_normal_form(horzMarkov=2)  # Remove A->(B,C,D) into A->B,C+D->D
    productions += tree.productions()

# Print the grammar
S = Nonterminal('S')
grammar = induce_pcfg(S, productions)
print(grammar)
from nltk.treetransforms import chomsky_normal_form

'''
tbank_productions = set(production for sent in treebank.parsed_sents()
                        for production in sent.productions())
'''

treebank_prods = []
for i in range(199):  # for all found sets of fileids
    tbstuff = treebank._fileids[i]  # get a bunch of 'em
    for tree in treebank.parsed_sents(tbstuff):
        tree.chomsky_normal_form()
        treebank_prods += tree.productions()

tTCpcfg = nltk.induce_pcfg(Nonterminal('S'), list(treebank_prods))  # induce pcfg
# PTCpcfg = nltk.induce_pcfg(tbank_grammar)
# treetransforms: chomsky_normal_form
print("done! You have your WeightedGrammar")
# tree.chomsky_normal_form(horzMarkov=2)

# srules is the set of non-lexical rules with duplicates removed
srules = list(set(drules))
print(len(srules))

# slex is the set of lexical rules with duplicates removed
slex = list(set(plex))
print(len(slex))

# create a non-probabilistic parser
grammar = ContextFreeGrammar(Nonterminal('S'), srules + slex)
parser = nltk.parse.chart.ChartParser(grammar)

# create a probabilistic parser
wgrammar = nltk.induce_pcfg(Nonterminal('S'), plex + drules)
wparser = nltk.parse.viterbi.ViterbiParser(wgrammar)

# try out the probabilistic parser on a few sentences
t = "that is very interesting .".split(' ')
trees = wparser.nbest_parse(t, n=10)
print(trees)

u = "from the beginning to the end , it was an entertaining game .".split(' ')
trees = wparser.nbest_parse(u, n=10)
print(trees)

v = "the workers dumped sacks into a bin .".split(' ')
trees = wparser.nbest_parse(v, n=10)
print(trees)
print(trees[0].productions())