def generate_grammar_and_parsers(parsed_sents):
    """Build two parsers from a list of parsed sentences (nltk Trees).

    From each parse tree the CFG productions are extracted; a plain CFG
    (deduplicated productions) feeds an Earley chart parser, and the full
    production list (with repetitions, needed for probability estimation)
    feeds a PCFG used by a bottom-up inside chart parser.

    parsed_sents -- iterable of nltk.Tree parse trees.
    Returns: (cfg_earley_parser, pcfg_pchart_parser).
    """
    # Collect every production from every tree; repetitions are kept here
    # because induce_pcfg estimates probabilities from the counts.
    tbank_productions_with_repet = [
        production
        for parsed_sent in parsed_sents
        for production in parsed_sent.productions()
    ]
    tbank_productions = set(tbank_productions_with_repet)  # exclude repetitions
    print("Num. of unique productions read:", len(tbank_productions))

    # Build a CFG from the unique productions
    # (typo fix: the message used to read "Buinding").
    print("\nBuilding a CFG...")
    cfg_grammar = CFG(Nonterminal('S'), tbank_productions)  # a CFG
    print(cfg_grammar, end="\n\n")
    # CFG - an Earley parser (trace=3 prints the chart steps)
    cfg_earley_parser = EarleyChartParser(cfg_grammar, trace=3)

    # Build a PCFG from the productions; here repetitions are needed!
    print("Building a PCFG...")
    pcfg_grammar = induce_pcfg(Nonterminal('S'), tbank_productions_with_repet)
    print(pcfg_grammar, end="\n\n")
    # Bottom-up chart parser for PCFG;
    # see: http://www.nltk.org/_modules/nltk/parse/pchart.html
    pcfg_pchart_parser = InsideChartParser(pcfg_grammar)

    return cfg_earley_parser, pcfg_pchart_parser  # return both parsers
def pcfg(self, include_edgelabels=True):
    """Induce a PCFG from this corpus' parse trees.

    include_edgelabels -- forwarded to parsed_sents(); controls whether
    edge labels are kept on the tree nodes.
    Returns the PCFG induced over the set of unique productions, rooted
    at TigerCorpusReader.GRAMMAR_START.
    """
    trees = self.parsed_sents(include_edgelabels)
    unique_prods = set()
    for tree in trees:
        unique_prods.update(tree.productions())
    start_symbol = Nonterminal(TigerCorpusReader.GRAMMAR_START)
    return induce_pcfg(start_symbol, list(unique_prods))
def __init__(self):
    """Load the Floresta treebank and induce a PCFG from its first two files.

    Keeps all parsed sentences on self.sentences; the trees of the first
    two corpus files are tag-simplified, binarized (CNF) and their
    productions used to induce a PCFG with start symbol 'np'.
    """
    self.sentences = floresta.parsed_sents()
    productions = []
    for fileid in floresta.fileids()[:2]:
        for t in floresta.parsed_sents(fileid):
            # Simplify the tags, binarize, then harvest the productions.
            t = self.simpifly_tree_tag(t)
            t.chomsky_normal_form()
            productions += t.productions()
    # (A large block of commented-out experimental code that iterated
    # self.sentences was removed here.)
    print(productions)
    np = nltk.Nonterminal('np')
    grammar = induce_pcfg(np, productions)
    # NOTE(review): the induced grammar is only printed, never stored on
    # self -- confirm whether it should be kept as an attribute.
    print(grammar)
def induce_grammar(sents):
    """Induce a PCFG with start symbol 'S' from a list of parse trees.

    Each tree is binarized (Chomsky normal form) and its unary chains are
    collapsed IN PLACE before its productions are collected.

    sents -- iterable of nltk.Tree parse trees (mutated in place).
    Returns the induced PCFG.
    """
    productions = []
    for sent in sents:
        sent.chomsky_normal_form()
        sent.collapse_unary(collapsePOS=True, collapseRoot=True)
        productions.extend(sent.productions())
    # NOTE(review): the original declared an unused '*UNKNOWN*' constant
    # and a comment promising UNK-rules that were never implemented;
    # unknown words will therefore fail to parse under this grammar.
    grammar = induce_pcfg(Nonterminal('S'), productions)
    return grammar
def getGrammar(sentence):
    """
    Constructs an ad-hoc split head DMV grammar for the given sentence.

    @param sentence: Input sentence as a list of tokens
    @return: NLTK grammar with weighted productions.
    """
    # Python 2 / old-NLTK code (xrange, WeightedProduction).
    productions = []
    # Deterministic skeleton: one set of split-head DMV nonterminals per
    # token -- Y = head constituent, L/R = left/right half, L1/R1 = half
    # with at least one dependent, LP/RP = landing sites for arguments.
    for i, head in enumerate(sentence):
        S = Nonterminal("S")
        Y_head = Nonterminal("Y_" + head)
        L_head = Nonterminal("L_" + head)
        R_head = Nonterminal("R_" + head)
        L1_head = Nonterminal("L1_" + head)
        R1_head = Nonterminal("R1_" + head)
        LP_head = Nonterminal("LP_" + head)
        RP_head = Nonterminal("RP_" + head)
        productions.append(Production(S, [Y_head]))
        productions.append(Production(Y_head, [L_head, R_head]))
        # "_l"/"_r" terminals are the split halves of the token itself.
        productions.append(Production(L_head, [head + "_l"]))
        productions.append(Production(R_head, [head + "_r"]))
        productions.append(Production(L_head, [L1_head]))
        productions.append(Production(R_head, [R1_head]))
        productions.append(Production(LP_head, [head + "_l"]))
        productions.append(Production(RP_head, [head + "_r"]))
        productions.append(Production(LP_head, [L1_head]))
        productions.append(Production(RP_head, [R1_head]))
    grammar = induce_pcfg(Nonterminal("S"), productions)
    # Weighted attachment rules: left arguments are the tokens before the
    # head, right arguments the tokens after it; probabilities come from
    # the external model_wrapper.
    for i, head in enumerate(sentence):
        L1_head = Nonterminal("L1_" + head)
        R1_head = Nonterminal("R1_" + head)
        LP_head = Nonterminal("LP_" + head)
        RP_head = Nonterminal("RP_" + head)
        for j in xrange(0, i):
            arg = sentence[j]
            prob = model_wrapper.getProb(head, arg, direction="left")
            # NOTE(review): appends to the grammar's internal production
            # list, bypassing induce_pcfg's normalization -- assumes
            # productions() returns the mutable list (true for the old
            # NLTK WeightedGrammar API); confirm before porting.
            grammar.productions().append(
                WeightedProduction(L1_head, [Nonterminal("Y_" + sentence[j]), LP_head], prob=prob)
            )
        for j in xrange(i + 1, len(sentence)):
            arg = sentence[j]
            prob = model_wrapper.getProb(head, arg, direction="right")
            grammar.productions().append(
                WeightedProduction(R1_head, [RP_head, Nonterminal("Y_" + sentence[j])], prob=prob)
            )
    return grammar
def set_grammar(self, productions):
    """
    Add the grammar rules from MBMA to the parser.

    Transforms all rules to Chomsky Normal Form, induces a weighted
    (probabilistic) context-free grammar on the basis of these rules and
    indexes the right-hand sides of the productions.

    Args:
        - productions (list): a list of :class:`nltk.Production` instances
    """
    # Local import keeps the fix self-contained; nltk is already a
    # dependency of this module (induce_pcfg).
    from nltk import Nonterminal
    cnf_prods = []
    for p in productions:
        # transform each production of MBMA into CNF
        cnf_prods.extend(prod_to_chomsky_normal_form(p))
    # BUG FIX: induce_pcfg expects a Nonterminal start symbol; the
    # original passed the plain string 'S', which never compares equal
    # to any production's lhs and breaks grammar.start()-based checks.
    self._local_grammar = induce_pcfg(Nonterminal('S'), cnf_prods)
    self._local_productions = self._build_productions(self._local_grammar)
    self.initialize = _initialize(self._local_grammar)
def induce_grammar(train):
    """Induces a PCFG from the given set of Penn Treebank sentences.

    Args:
        train (Any): Iterable of treebank file ids whose parsed sentences
            provide the training trees.

    Returns:
        PCFG: A PCFG grammar instance with start symbol 'S'.
    """
    collected = []
    for fileid in train:
        for parse in treebank.parsed_sents(fileid):
            # Optional tree transformations, applied in place:
            # collapse unary chains A-B-C into A-B+C, then binarize
            # A->(B,C,D) into A->B,C+D->D with a Markov window of 2.
            parse.collapse_unary(collapsePOS=False)
            parse.chomsky_normal_form(horzMarkov=2)
            collected.extend(parse.productions())
    return induce_pcfg(Nonterminal('S'), collected)
def learn_trees(trees, collapse=True, markov_order=None):
    """
    Given a list of parsed sentences, return the maximum likelihood PCFG
    for those sentences (start symbol 'S').

    If 'collapse' is True, unary productions are collapsed in place
    (POS-level unaries are kept) before the grammar is learned.

    NOTE(review): `markov_order` is currently ignored -- the Chomsky
    normal form / Markov-smoothing step has been disabled (the code was
    commented out), so productions of length > 2 are NOT reduced.
    """
    productions = []
    for tree in trees:
        if collapse:
            tree.collapse_unary(collapsePOS=False)
        productions += tree.productions()
    return grammar.induce_pcfg(Nonterminal('S'), productions)
def __init__(self, parsed_sents, start='sentence', horzMarkov=None):
    """
    parsed_sents -- list of training trees.
    start -- label used as the PCFG start symbol (note: this parameter,
        not the root label of the trees, determines the start symbol).
    horzMarkov -- horizontal Markov window for binarization (None = all).
    """
    self.start = N(start)
    self.horzMarkov = horzMarkov

    # Repeated productions are kept on purpose: induce_pcfg estimates
    # probabilities from their counts.
    collected_rules = []
    for tree in parsed_sents:
        # Work on an unlexicalized deep copy so the caller's tree is untouched.
        skeleton = unlexicalize(tree.copy(deep=True))
        skeleton.set_label(start)
        skeleton.chomsky_normal_form(horzMarkov=horzMarkov)
        # Unary chains are collapsed including the root and POS nodes.
        skeleton.collapse_unary(collapsePOS=True, collapseRoot=True)
        collected_rules.extend(skeleton.productions())

    self.pcfg = induce_pcfg(self.start, collected_rules)
    self._probabilistic_productions = self.pcfg.productions()
    self._parser = CKYParser(self.pcfg)
def fill_missing_words(grammar: PCFG, missing_words: Set[str]):
    """Extend `grammar` so out-of-vocabulary words can still be parsed.

    Adds an UNK nonterminal expanding to every missing word
    (UNK -> word1 | word2 | ... | wordN) and, for each lhs that already
    rewrites to a terminal somewhere, one extra rule lhs -> UNK.

    grammar -- the PCFG to extend (not mutated).
    missing_words -- the out-of-vocabulary words.
    Returns a newly induced PCFG with the same start symbol.
    """
    if not missing_words:
        # BUG FIX: without any UNK -> word expansions, adding X -> UNK
        # rules would leave UNK with no terminal rewrite and break the
        # grammar; with nothing to add, return the grammar unchanged.
        return grammar
    unknown = Nonterminal('UNK')
    unk_rules = [
        Production(unknown, [missing_word]) for missing_word in missing_words
    ]
    # Add UNK as a possibility to all rules with strings in the right hand side
    corrected_rules: List[Nonterminal] = []
    rule: ProbabilisticProduction
    for rule in grammar.productions():
        # right hand side has a string somewhere
        if any(isinstance(element, str) for element in rule.rhs()):
            # this lhs has already been given an UNK alternative
            if rule.lhs() in corrected_rules:
                continue
            unk_rules.append(Production(rule.lhs(), [unknown]))
            corrected_rules.append(rule.lhs())
    return induce_pcfg(grammar.start(), grammar.productions() + unk_rules)
def induce_grammar(productions: List[Production]):
    """Induce a PCFG over the given productions, rooted at 'S'."""
    return induce_pcfg(Nonterminal("S"), productions)
######################### print "Combining extended lexicon with the training trees and " print "building grammar." #trainTrees currently has the productions in a list of lists #this turns them into a list trainProds = reduce(lambda x,y: x+y,\ map(lambda x: x.productions(), trainTrees)) parserProds = trainProds+extraLexicals #remove duplicates print "Removing duplicates from productions." parserProds = list(set(parserProds)) for each in parserProds: print each parser = ViterbiParser(induce_pcfg(Nonterminal("S"),parserProds)) #I've got this thing somewhat working. I'm tired of running it over # and over. I think it's taking nearly 10 minutes. # PICKLE AND EXPORT TIME prsFilename = "hw4_vitParser.pkl" print "Saving parser to " + prsFilename dump(parser, open(prsFilename, "w")) dump(testTrees, open("hw4_testTrees.pkl","w"))
def learn_pcfg(trees):
    """Induce a PCFG with start symbol 'S' from the given parse trees."""
    all_rules = [rule for tree in trees for rule in tree.productions()]
    return induce_pcfg(Nonterminal('S'), all_rules)
# Filter treebank productions, right-binarize them, and print the induced
# PCFG as "LHS -> RHS prob" lines under a ROOT -> S start rule.
productions = list(treebank_productions)
output = set()
# Markers that disqualify a production: traces, punctuation, rare tags.
# NOTE(review): the original tested "'" twice; the duplicate is dropped
# (one occurrence was possibly meant to be the Treebank backquote '`').
SKIP_MARKERS = ('NONE', "'", ':', ',', 'PDT', 'FW')
for p in productions:
    p = str(p)
    if any(marker in p for marker in SKIP_MARKERS):
        continue
    lhs = remove_dash(p.split()[0])
    rhs = [remove_dash(token) for token in p.split()[2:]]
    temp_rhs = rhs[:]
    if len(temp_rhs) == 1:
        output.add(Production(Nonterminal(lhs), [Nonterminal(rhs[0])]))
    else:
        # Right-binarize A -> B C D ... into A -> B C-D..., C-D... -> C D...
        # BUG FIX: the original performed only a single binarization step
        # (its trailing lhs/temp_rhs reassignments were dead code), leaving
        # every synthesized "X-Y" nonterminal without an expansion; loop
        # until the remainder is a single pair.
        while len(temp_rhs) > 1:
            rhs_head = "-".join(temp_rhs[1:])
            output.add(
                Production(Nonterminal(lhs),
                           [Nonterminal(temp_rhs[0]), Nonterminal(rhs_head)]))
            lhs = rhs_head
            temp_rhs = temp_rhs[1:]
grammar = induce_pcfg(Nonterminal('S'), list(output))
print('ROOT -> S 1')
for g in grammar.productions():
    # Reformat "LHS -> RHS [p]" into "LHS -> RHS p".
    temp = str(g).split('[')
    print(temp[0], temp[1].strip(']'))
# NOTE(review): this excerpt starts mid-loop -- `sent` and
# `tbank_productions` are bound by code outside this chunk; the
# indentation below is a best-effort reconstruction. Python 2 script.
sent.chomsky_normal_form()
for production in sent.productions():
    tbank_productions.append(production)

# To avoid unknown-token/word problems, also add every lexical
# production (including those from the test set).
for word, tag in treebank.tagged_words():
    t = Tree.fromstring("(" + tag + " " + word + ")")
    for production in t.productions():
        tbank_productions.append(production)

print tbank_productions[2]

# Automatically build the grammar (in particular, estimate the rule
# probabilities) from the production list tbank_productions.
tbank_grammar = induce_pcfg(Nonterminal('S'), tbank_productions)
print tbank_grammar

# PARSING
parser = ViterbiParser(tbank_grammar)
s = time.time()
# parse the second raw test sentence
for t in parser.parse(raw_test_set[1]):
    print(t)
# elapsed parsing time
s = time.time() - s
# gold standard for the second test sentence
print test_set[1]
def learning_CNF_probabilities(self):
    """Induce a PCFG (start symbol 'SENT') from the stored CNF rules.

    Stores the resulting weighted production list -- not the PCFG object
    itself -- on self.grammar.
    """
    start_symbol = Nonterminal('SENT')
    weighted_grammar = induce_pcfg(start_symbol, self.CNF_rules)
    self.grammar = weighted_grammar.productions()
def extract_simple_pcfg(n):
    """Induce a PCFG (start 'S') from the simple productions for `n`,
    returning it with its rules sorted."""
    simple_rules = extract_simple_productions(n)
    induced = grammar.induce_pcfg(Nonterminal("S"), simple_rules)
    ordered = sort_rules(induced.productions())
    return PCFG(induced.start(), ordered)
def trees_to_pcfg(trees):
    """Induce a PCFG (start symbol 'S') from the given trees' productions."""
    return induce_pcfg(Nonterminal('S'), trees_to_productions(trees))
ta /= len(list_tag_val) # armazena o resultado r = {'lp':lp, 'lr': lr, 'f1':f1, 'ta':ta} resultados.append(r) else: print("Sentença com mais de 18 palavras.") except Exception: print("Árvore mal formada.") # realiza o calculo da media para cada metrica media_lp = sum(item['lp'] for item in resultados)/len(resultados) media_lr = sum(item['lr'] for item in resultados)/len(resultados) media_f1 = sum(item['f1'] for item in resultados)/len(resultados) media_ta = sum(item['ta'] for item in resultados)/len(resultados) print("media_lp",media_lp,"media_lr",media_lr,"media_f1",media_f1,"media_ta",media_ta) # extrai as arvores da base de dados floresta, com suas respectivas tags filter_errors(floresta.parsed_sents()) roots = [] ROOT = Nonterminal('ROOT') # nao-terminal representado o simbolo inicial da gramatica initial_symbols = list(set(initial_symbols)) # remover repetidos for t in initial_symbols: roots += [Production(ROOT,[t])] # unificar a gramatica para apenas um simbolo inicial productions += roots productions += [Production(Nonterminal("n"), ["UNK"])] # regra para palavras desconhecidas (substantivo) pcfg = induce_pcfg(ROOT, productions) # cria a PCFG informando o simbolo inicial e as regras do_cky(pcfg) # aplica o algoritmo CKY (ViterbiParser)
B -> B D [.5] | C [.5]
C -> 'a' [.1] | 'b' [0.9]
D -> 'b' [1.0]
""")
# NOTE(review): the opening of the grammar-string call above (e.g.
# PCFG.fromstring(""" ...) lies outside this excerpt; no comment may
# precede the closing quotes, as it would become part of the string.
grammar.productions()

# In[ ]:

# Induce a grammar from parsed treebank sentences.
from nltk import Nonterminal

productions = []
for fileid in treebank.fileids()[:2]:
    for t in treebank.parsed_sents(fileid):
        productions += t.productions()
grammar = induce_pcfg(Nonterminal('S'), productions)

# In[ ]:

print(grammar)

# In[ ]:

# Two sample PP rules, sorted.
sorted(grammar.productions(lhs=Nonterminal('PP')))[:2]

# In[ ]:

# Two sample NNP rules, sorted.
sorted(grammar.productions(lhs=Nonterminal('NNP')))[:2]

# In[ ]: