def CKY_parser():
    '''
    Given the PCFG, use the built-in CKY (Viterbi) parser to get each
    sentence's most probable parse.

    Parses every sentence of wsj_1964.mrg whose vocabulary is fully
    covered by the induced grammar, printing the best parse; sentences
    containing words outside the grammar are counted and skipped.
    '''
    PCFG_grammar = make_PCFG_grammar()

    # Utilize the ViterbiParser given the PCFG grammar induction rules.
    parser = ViterbiParser(PCFG_grammar)

    # Sample sentence parse.
    sentences = treebank.parsed_sents('wsj_1964.mrg')
    skipped_sentences = 0

    for sentence in sentences:
        words = sentence.leaves()
        try:
            # check_coverage raises ValueError for out-of-grammar words;
            # catching only that (not a bare except) keeps real bugs visible.
            PCFG_grammar.check_coverage(words)
        except ValueError:
            skipped_sentences += 1
            continue
        for parse in parser.parse(words):
            print(parse)

    print("Total skipped sentences:", skipped_sentences)
def evaluate_sentence(sentence: str, grammar: PCFG):
    """Parse every prefix of *sentence*'s POS-tag sequence with a Viterbi
    parser and print the probability of each parse.

    :param sentence: whitespace-separated raw sentence
                     (annotation fixed: ``string`` is the stdlib module,
                     not a type — the correct annotation is ``str``)
    :param grammar: induced PCFG whose terminals are POS tags
    """
    tokens = sentence.split()
    print(tokens, flush=True)
    # The grammar's terminals are POS tags, so we parse the tag sequence,
    # not the words themselves.
    pos = [tag for _word, tag in pos_tag(tokens)]
    print(pos, flush=True)
    parser = ViterbiParser(grammar, trace=0)
    # accumulate yields each growing prefix: "t1", "t1 t2", "t1 t2 t3", ...
    for line in accumulate(pos, lambda total, token: total + ' ' + token):
        prefix = line.split()
        print(prefix)
        print([tree.prob() for tree in list(parser.parse(prefix))], flush=True)
def main():
    """Read a sentence from the user, POS-tag it, parse it, and append
    the results to the CSV at ``data_file_path``.

    Tagger and PCFG are loaded from pickle caches (``mytagger.pkl``,
    ``mypcfg.pickle``) when present, otherwise trained/induced and saved.
    """
    data = pd.read_csv(data_file_path)
    data = data.drop(columns=["Unnamed: 0"])
    # Take input from user and save text, tokenized text.
    (sentence, sentence_tokens) = readsentence()

    if os.path.exists('mytagger.pkl'):
        # Try to open a previously saved tagger; `with` guarantees the
        # handle is closed (the original leaked handles on error).
        with open('mytagger.pkl', 'rb') as tagger_file:
            mytagger = load(tagger_file)
    else:
        # No such tagger is found, so train and save it.
        mytagger = traintagger()
        with open('mytagger.pkl', 'wb') as tagger_file:
            dump(mytagger, tagger_file, -1)

    tagged_tokens = mytagger.tag(sentence_tokens)
    print(tagged_tokens)

    if os.path.exists('mypcfg.pickle'):
        # Try to open a previously saved PCFG.
        with open('mypcfg.pickle', 'rb') as pcfg_file:
            mypcfg = load(pcfg_file)
    else:
        # No such PCFG exists, so induce and save it.
        mypcfg = buildpcfg()
        with open('mypcfg.pickle', 'wb') as pcfg_file:
            dump(mypcfg, pcfg_file)

    try:
        tree = sequence_matching(tagged_tokens)
        print("Sequence matching was used")
    except Exception:
        # Fall back to full parsing when sequence matching fails.
        # NOTE(review): parser.parse() is given (word, tag) tuples here —
        # ViterbiParser normally expects a token list; confirm upstream.
        parser = ViterbiParser(mypcfg)
        tree = parser.parse(tagged_tokens)
        print("Viterbi parser was used")
    finally:
        if not isinstance(tree, Tree):
            # parse() returned an iterator; print its first tree,
            # or an error message if the input couldn't be parsed.
            Tree.pretty_print(next(tree))
        else:
            print(tree)

    df2 = {'sentence': sentence, 'sentence tokens': sentence_tokens, 'tagged tokens': tagged_tokens}
    # DataFrame.append was removed in pandas 2.0; pd.concat is the
    # supported equivalent.
    data = pd.concat([data, pd.DataFrame([df2])], ignore_index=True)
    data.to_csv(data_file_path)
    print("Previous data:")
    print(data)
def parse_sentence(self, sent):
    """
    Parse sent using the induced grammar and save the parses to pcfg.txt.

    :param sent: list of token strings to parse
    :return: None. Save parsing results to pcfg.txt
    :raises ValueError: if no grammar has been induced yet
    """
    import contextlib

    if self.grammar is None:
        raise ValueError("PCFG hasn't been induced yet.")
    # other parser option(s): e.g., parser = pchart.InsideChartParser(self.grammar)
    parser = ViterbiParser(self.grammar)
    parser.trace(3)  # http://www.nltk.org/api/nltk.parse.html
    # Redirect the parser's trace/parse output to the file for the
    # duration of the parse only.  The original reassigned sys.stdout
    # and never restored or closed it, silencing all later printing.
    with open('pcfg.txt', 'w') as out, contextlib.redirect_stdout(out):
        for parse in parser.parse(sent):
            print(parse)
            # Visualize the tree.  draw() returns None, so calling it as
            # a statement avoids printing a spurious "None" line.
            parse.draw()
def perplexity():
    '''
    Given the PCFG and the parser used, run the parser on the rest of
    the treebank and calculate the perplexity of the model over the
    testing sentences.
    '''
    import math

    PCFG_grammar = make_PCFG_grammar()
    parser = ViterbiParser(PCFG_grammar)
    all_p = []
    skipped_sentence = 0

    for item in treebank.fileids()[1964:]:
        for tree in treebank.parsed_sents(item):
            words = tree.leaves()
            try:
                # Raises ValueError for words the grammar doesn't cover.
                PCFG_grammar.check_coverage(words)
            except ValueError:
                skipped_sentence += 1
                continue
            for parse in parser.parse(words):
                # ProbabilisticTree exposes the parse probability directly;
                # no need to regex-scrape it out of str(parse) as the
                # original did (which was both lossy and fragile).
                all_p.append(parse.prob())

    N = len(all_p)
    if N == 0:
        # Guard against ZeroDivisionError when nothing parsed.
        print("No sentences parsed; perplexity undefined.")
        return

    # Perplexity = (prod 1/p_i)^(1/N).  Computed in log space so the
    # running product of 1/p_i cannot overflow for long test sets.
    perplexity = math.exp(-sum(math.log(p) for p in all_p) / N)

    print("Perplexity:", perplexity)
    print("All parse probabilities:", all_p)
    print("Skipped sentences:", skipped_sentence)
    print("PCFG grammar:", PCFG_grammar)
def do_cky(grammar):
    # Run the Viterbi (CKY) parser over the test set and report averaged
    # labeled precision / recall / F1 and tagging accuracy.
    # Relies on module-level `test` (held-out trees) and `posx` (visit
    # counter mutated by make_tree_evaluation) — TODO confirm both.
    global test
    global posx
    # Initialize the parser with the grammar (PCFG).
    viterbi = ViterbiParser(grammar)
    resultados = []  # one metrics dict per evaluated sentence
    for t in test[:1]:  # for each sentence of the test set (NOTE(review): sliced to the first one only — debug leftover? confirm)
        try:
            sent = t.leaves()  # get the sentence tokens
            if len(sent) <= 18:  # filter: sentences up to 18 words (punctuation included)
                ssent = []
                for s in sent:
                    # Check grammar coverage word by word.
                    try:
                        grammar.check_coverage([s])
                        ssent.append(s)
                    except ValueError:
                        # Unknown word: substitute the UNK token.
                        ssent.append("UNK")
                saida = []
                for i in viterbi.parse(ssent):  # parse the test sentence
                    saida.append(i)
                # Lists for the two evaluation strands: the non-terminals
                # spanning the terminals, and the non-terminals (tags)
                # deriving each word.
                list_eval_val = []
                list_eval_test = []
                list_tag_val = []
                list_tag_test = []
                posx = 0
                # Evaluate the parser's output tree.
                make_tree_evaluation(saida[0][0],list_eval_test,list_tag_test,0)
                posx = 0
                # Evaluate the gold tree from the test set.
                make_tree_evaluation(t,list_eval_val,list_tag_val, 0)
                # Sort both lists by visit order (tuple field 3).
                list_eval_test.sort(key=lambda tup: tup[3])
                list_eval_val.sort(key=lambda tup: tup[3])
                # Number of matching constituents.
                acertos = len(set(list_eval_test).intersection(set(list_eval_val)))
                # Labeled precision.
                lp = acertos/len(list_eval_test)
                # Labeled recall.
                lr = acertos/len(list_eval_val)
                # F1 (harmonic mean; 0 when either component is 0).
                f1 = 0
                if lp > 0 and lr > 0:
                    f1 = 2*lp*lr/(lp+lr)
                # Tagging accuracy: positionwise agreement of tag lists.
                ta = 0
                ta = len([i for i, j in zip(list_tag_test, list_tag_val) if i == j])
                ta /= len(list_tag_val)
                # Store this sentence's result.
                r = {'lp':lp, 'lr': lr, 'f1':f1, 'ta':ta}
                resultados.append(r)
            else:
                print("Sentença com mais de 18 palavras.")
        except Exception:
            # Malformed tree / failed parse — skip the sentence.
            print("Árvore mal formada.")
    # Compute the mean of each metric.
    # NOTE(review): ZeroDivisionError if resultados is empty — confirm callers.
    media_lp = sum(item['lp'] for item in resultados)/len(resultados)
    media_lr = sum(item['lr'] for item in resultados)/len(resultados)
    media_f1 = sum(item['f1'] for item in resultados)/len(resultados)
    media_ta = sum(item['ta'] for item in resultados)/len(resultados)
    print("media_lp",media_lp,"media_lr",media_lr,"media_f1",media_f1,"media_ta",media_ta)
# Collect the production rules of the current tree.
for production in t.productions():
    tbank_productions.append(production)
print(tbank_productions[2])

# Automatically induce the grammar (in particular, estimate the rule
# probabilities) from the tbank_productions production-rule list.
tbank_grammar = induce_pcfg(Nonterminal('S'), tbank_productions)
print(tbank_grammar)

# PARSING
parser = ViterbiParser(tbank_grammar)
s = time.time()
# Parse the second raw test sentence.
for t in parser.parse(raw_test_set[1]):
    print(t)
# Measure parsing time.
s = time.time() - s
# Gold standard from the second dataset entry.
print(test_set[1])

# NOTE(review): this fragment originally mixed Python 2 `print x`
# statements with Python 3 `print()` calls and so could not run under
# either interpreter; all prints are normalized to Python 3 above.
# The original trailing (unterminated) Indonesian docstring held
# assignment instructions, translated here: build a function to measure
# the parser's accuracy.  Accuracy is of two kinds: exact match, and
# partial match (average recall and precision).  Research how to compute
# recall and precision for parsing from a valid reference, then, with
# both accuracy functions built, run the scenario below:
from nltk import induce_pcfg
from nltk.parse import pchart
from nltk.parse import ViterbiParser
from nltk.treetransforms import *
from nltk import *

# Induce a PCFG from the first 90% of each treebank file's sentences.
productions = []
for item in treebank.fileids():
    parsed = treebank.parsed_sents(item)
    # BUG FIX: the original computed the split from len(item) — the
    # length of the file NAME string — not the number of sentences.
    length = int(len(parsed) * 0.9)
    for tree in parsed[:length]:
        # CNF preprocessing required by chart/CKY parsing.
        tree.collapse_unary(collapsePOS=False)
        tree.chomsky_normal_form(horzMarkov=2)
        productions += tree.productions()

S = Nonterminal('S')
grammar = induce_pcfg(S, productions)
parser = pchart.InsideChartParser(grammar)
# BUG FIX: the original passed the undefined name `grammar2` here.
parserv = ViterbiParser(grammar)

# Evaluate both parsers on the held-out last 10% of each file.
for item in treebank.fileids():
    parsed = treebank.parsed_sents(item)
    start = int(len(parsed) * 0.9)
    for tree in parsed[start:]:
        sent = tree.leaves()
        print(tree.pos())
        for parse in parser.parse(sent):
            print(parse)
        for parse in parserv.parse(sent):
            print(parse)
# Earley chart parsing of the token sequence.
parser = nltk.parse.EarleyChartParser(grammar)
for t in parser.parse(tokens):
    print(t)

# In[ ]:

### CYK parser gets the most probable parse
from nltk.parse import ViterbiParser
parser = ViterbiParser(grammar)
parser.trace(3)  # verbose trace of the Viterbi/CKY chart filling
# NOTE(review): parse_all() already returns a list, so the list() wrapper
# is redundant (the original comment claiming generator conversion was
# wrong — parse(), below, is the method that returns an iterator).
parsed_sent = list(parser.parse_all(tokens))
parsed_sent[0].draw()  # visualize best parse (IndexError if no parse found)
for t in parsed_sent:
    print(t)

# In[ ]:

### CYK parser gets the most probable parse
# Near-duplicate of the cell above, but using parse(), which returns an
# iterator — here list() genuinely materializes it.
from nltk.parse import ViterbiParser
parser = ViterbiParser(grammar)
parser.trace(3)
parsed_sent = list(parser.parse(tokens))  # convert iterator to list
parsed_sent[0].draw()  # visualize best parse (IndexError if no parse found)
for t in parsed_sent:
    print(t)

# In[ ]: