def main():
    """Induce a PCFG from the Penn Treebank sample and Viterbi-parse a sentence.

    Prints the number of treebank files, then for the test sentence prints
    the number of parses found, the root label of the best parse, and the
    total probability mass of all parses rooted at S.
    """
    print(len(treebank.fileids()))

    # Collect CNF-transformed productions from every parsed sentence.
    # (The original commented out this initialization, which would raise
    # NameError at `productions += ...`.)
    productions = []
    for item in treebank.fileids():
        for tree in treebank.parsed_sents(item):
            # Remove branches A-B-C into A-B+C
            tree.collapse_unary(collapsePOS=False)
            # Remove A->(B,C,D) into A->B,C+D->D (2nd-order horizontal markovization)
            tree.chomsky_normal_form(horzMarkov=2)
            productions += tree.productions()

    S = Nonterminal('S')
    grammar = induce_pcfg(S, productions)  # This is a PCFG

    parser = ViterbiParser(grammar)
    parser.trace(0)

    sent = "John will join the board"
    tokens = sent.split()
    try:
        # check_coverage raises ValueError when a token has no lexical
        # production; narrowed from the original bare `except:` so real
        # parser bugs are no longer silently swallowed.
        grammar.check_coverage(tokens)
    except ValueError:
        print("Some words not covered")
        return

    print("All words covered")
    parses = parser.parse_all(tokens)
    if parses:
        lp = len(parses)
        print(lp)
        print(parses[0].label())
        # Total probability of the parses whose root label is S.
        p = reduce(lambda a, b: a + b.prob(),
                   list(filter(lambda x: x.label() == 'S', parses)), 0.0)
    else:
        p = 0
    print("Probability:", p)
def test():
    """A test to check if the changes I made have the intended effect.

    Parses a toy sentence with the Viterbi parser and prints the parses.
    Uses ``parse_all`` because ``nbest_parse`` was removed in NLTK 3.
    """
    import nltk
    from nltk.parse import ViterbiParser

    sent = 'I saw the man with my telescope'
    tokens = sent.split()
    grammar = nltk.toy_pcfg1
    parser = ViterbiParser(grammar)
    parser.trace(3)
    parses = parser.parse_all(tokens)
    print(parses)
def test():
    """A test to check if the changes I made have the intended effect.

    Parses a toy sentence with the Viterbi parser and prints the parses.
    ``nbest_parse`` no longer exists in NLTK 3; ``parse_all`` is the
    supported replacement.
    """
    import nltk
    from nltk.parse import ViterbiParser

    sent = 'I saw the man with my telescope'
    tokens = sent.split()
    grammar = nltk.toy_pcfg1
    parser = ViterbiParser(grammar)
    parser.trace(3)
    parses = parser.parse_all(tokens)
    print(parses)
def parse_sentence(self, sent): """ Parse sent using induced grammar Visualize the most likely parse tree for sent :return: None. Save parsing results to pcfg.txt """ if self.grammar is None: raise ValueError("PCFG hasn't been induced yet.") # other parser option(s): e.g., parser = pchart.InsideChartParser(self.grammar) parser = ViterbiParser(self.grammar) parser.trace(3) # http://www.nltk.org/api/nltk.parse.html sys.stdout = open('pcfg.txt', 'w') parses = parser.parse(sent) for parse in parses: print(parse) # visualize the tree: print(parse.draw())
def Parser_Section():
    """Parse a demo sentence with the Viterbi parser, reporting the time
    taken and the summed probability of all parses found."""
    demos = [('I saw John through the telescope', toy_pcfg1)]
    sent, grammar = demos[0]
    # print(grammar)

    # Tokenize the sentence.
    tokens = sent.split()

    viterbi = ViterbiParser(grammar)
    viterbi.trace(0)  # Use this to change verbosity

    started = time.time()
    found = viterbi.parse_all(tokens)
    print("Time:", time.time() - started)

    if found:
        lp = len(found)
        p = reduce(lambda acc, t: acc + t.prob(), found, 0.0)
    else:
        p = 0
    print("Probability:", p)
def demo():
    """
    A demonstration of the probabilistic parsers.  The user is prompted to
    select which demo to run, and how many parses should be found; and then
    each parser is run on the same demo, and a summary of the results are
    displayed.
    """
    import sys, time

    from nltk import tokenize
    from nltk.parse import ViterbiParser
    from nltk.grammar import toy_pcfg1, toy_pcfg2

    # Define two demos. Each demo has a sentence and a grammar.
    demos = [
        ('I saw the man with my telescope', toy_pcfg1),
        ('the boy saw Jack with Bob under the table with a telescope', toy_pcfg2),
    ]

    # Ask the user which demo they want to use.
    print()
    for i in range(len(demos)):
        print('%3s: %s' % (i + 1, demos[i][0]))
        print(' %r' % demos[i][1])
    print()
    print('Which demo (%d-%d)? ' % (1, len(demos)), end=' ')
    try:
        snum = int(sys.stdin.readline().strip()) - 1
        sent, grammar = demos[snum]
    except (ValueError, IndexError):
        # Narrowed from a bare except: int() raises ValueError on bad input
        # and demos[snum] raises IndexError on an out-of-range choice.
        print('Bad sentence number')
        return

    # Tokenize the sentence.
    tokens = sent.split()

    parser = ViterbiParser(grammar)
    all_parses = {}

    print('\nsent: %s\nparser: %s\ngrammar: %s' % (sent, parser, grammar))
    parser.trace(3)
    t = time.time()
    parses = parser.parse_all(tokens)
    # Renamed from `time` so the time module is not shadowed.
    elapsed = time.time() - t
    average = (reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
               if parses else 0)
    num_parses = len(parses)
    for p in parses:
        all_parses[p.freeze()] = 1

    # Print some summary statistics
    print()
    print('Time (secs) # Parses Average P(parse)')
    print('-----------------------------------------')
    print('%11.4f%11d%19.14f' % (elapsed, num_parses, average))
    parses = all_parses.keys()
    if parses:
        p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
    else:
        p = 0
    print('------------------------------------------')
    print('%11s%11d%19.14f' % ('n/a', len(parses), p))

    # Ask the user if we should draw the parses.
    print()
    print('Draw parses (y/n)? ', end=' ')
    if sys.stdin.readline().strip().lower().startswith('y'):
        from nltk.draw.tree import draw_trees
        print(' please wait...')
        draw_trees(*parses)

    # Ask the user if we should print the parses.
    print()
    print('Print parses (y/n)? ', end=' ')
    if sys.stdin.readline().strip().lower().startswith('y'):
        for parse in parses:
            print(parse)
def demo():
    """
    A demonstration of the probabilistic parsers.  The user is prompted to
    select which demo to run, and how many parses should be found; and then
    each parser is run on the same demo, and a summary of the results are
    displayed.

    Ported from Python 2 (print statements) to Python 3, and from the
    removed ``nbest_parse`` API to ``parse_all``.
    """
    import sys, time
    import nltk
    from nltk import tokenize
    from nltk.parse import ViterbiParser

    # Define two demos. Each demo has a sentence and a grammar.
    demos = [('I saw the man with my telescope', nltk.toy_pcfg1),
             ('the boy saw Jack with Bob under the table with a telescope',
              nltk.toy_pcfg2)]

    # Ask the user which demo they want to use.
    print()
    for i in range(len(demos)):
        print('%3s: %s' % (i + 1, demos[i][0]))
        print(' %r' % demos[i][1])
    print()
    print('Which demo (%d-%d)? ' % (1, len(demos)), end=' ')
    try:
        snum = int(sys.stdin.readline().strip()) - 1
        sent, grammar = demos[snum]
    except (ValueError, IndexError):
        # Narrowed from a bare except: bad int -> ValueError,
        # out-of-range choice -> IndexError.
        print('Bad sentence number')
        return

    # Tokenize the sentence.
    tokens = sent.split()

    parser = ViterbiParser(grammar)
    all_parses = {}

    print('\nsent: %s\nparser: %s\ngrammar: %s' % (sent, parser, grammar))
    parser.trace(3)
    t = time.time()
    # nbest_parse() was removed in NLTK 3; parse_all() is the replacement.
    parses = parser.parse_all(tokens)
    elapsed = time.time() - t  # renamed so the time module is not shadowed
    if parses:
        average = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
    else:
        average = 0
    num_parses = len(parses)
    for p in parses:
        all_parses[p.freeze()] = 1

    # Print some summary statistics
    print()
    print('Time (secs) # Parses Average P(parse)')
    print('-----------------------------------------')
    print('%11.4f%11d%19.14f' % (elapsed, num_parses, average))
    parses = all_parses.keys()
    if parses:
        p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
    else:
        p = 0
    print('------------------------------------------')
    print('%11s%11d%19.14f' % ('n/a', len(parses), p))

    # Ask the user if we should draw the parses.
    print()
    print('Draw parses (y/n)? ', end=' ')
    if sys.stdin.readline().strip().lower().startswith('y'):
        from nltk.draw.tree import draw_trees
        print(' please wait...')
        draw_trees(*parses)

    # Ask the user if we should print the parses.
    print()
    print('Print parses (y/n)? ', end=' ')
    if sys.stdin.readline().strip().lower().startswith('y'):
        for parse in parses:
            print(parse)
def create_viterbi_parser(grammar, pickle_it=False, filename="viterbi"):
    """Build a ViterbiParser for ``grammar``, optionally pickling it.

    :param grammar: the PCFG the parser will use.
    :param pickle_it: if True, dump the parser to ``<var_dir><filename>-parser.p``.
    :param filename: basename for the pickle file.
    :return: the configured ViterbiParser (tracing disabled).
    """
    parser = ViterbiParser(grammar)
    parser.trace(0)
    if pickle_it:
        # Use a context manager so the file handle is closed even if
        # pickling fails (the original left the handle open).
        with open("%s%s-parser.p" % (var_dir, filename), "wb") as fh:
            pickle.dump(parser, fh)
    return parser
def demo():
    """
    Parse every matched robot command with a PCFG learned for the current
    scene, then render an example parse tree to a PostScript file per
    sentence.

    Fixes over the original: the Python-2-only ``unicode()`` call is
    replaced by decoding at the I/O boundary, the commands file is closed
    via a context manager, and the scene-number padding no longer leaves
    ``sc`` undefined for ``max_scene >= 10000``.
    """
    import sys, time
    from nltk import tokenize
    from nltk.parse import ViterbiParser
    from nltk.grammar import toy_pcfg1, toy_pcfg2
    from nltk.draw.tree import draw_trees
    from nltk import Tree
    from nltk.draw.util import CanvasFrame
    from nltk.draw import TreeWidget

    max_scene = 1
    # Zero-pad the scene number to five digits (replaces the if/elif chain,
    # which produced the same widths but failed above 9999).
    sc = str(max_scene).zfill(5)
    g = 'grammar_' + sc + '.txt'
    learned_pcfg = load('/home/omari/Dropbox/robot_modified/AR/grammar/' + g)
    grammar = learned_pcfg

    # Decode the commands file as UTF-8 at the I/O boundary instead of the
    # Python-2-only unicode() call, and close it deterministically.
    with open('/home/omari/Dropbox/robot_modified/AR/hypotheses/matched_commands.txt',
              'r', encoding='utf-8') as file1:
        g1 = [i for i in file1.readlines()]

    for line in g1:
        # Lines look like "<scene>-<sent_num>-<sentence>".
        sent = line.split('\n')[0].split('-')[-1]
        scene = line.split('\n')[0].split('-')[0]
        sent_num = line.split('\n')[0].split('-')[1]
        print(line)
        if scene == '239' and sent_num == '0':
            continue

        # Tokenize the sentence.
        tokens = sent.split()

        parser = ViterbiParser(grammar)
        all_parses = {}
        parser.trace(3)
        parses = parser.parse_all(tokens)
        average = (reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
                   if parses else 0)
        num_parses = len(parses)
        for p in parses:
            all_parses[p.freeze()] = 1

        parses = all_parses.keys()
        if parses:
            p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
        else:
            p = 0

        # Render a fixed example tree for this sentence to a .ps file.
        cf = CanvasFrame()
        t = Tree.fromstring(
            '(S (CH_POS_PREPOST move) (PRE_POST (PRE (the the) '
            '(_entity (F_HSV green) (F_SHAPE sphere))) '
            '(PREPOST_connect (to to) (the the)) '
            '(POST (_F_POS (F_POS (_bottom_left (bottom bottom) (left left)))) '
            '(corner corner))))')
        tc = TreeWidget(cf.canvas(), t, draggable=1,
                        node_font=('helvetica', -14),
                        leaf_font=('helvetica', -12),
                        roof_fill='white', roof_color='black',
                        leaf_color='green4', node_color='blue4')
        cf.add_widget(tc, 10, 10)
        cf.print_to_file('/home/omari/Dropbox/robot_modified/trees/scene-' +
                         scene + '-' + sent_num + '.ps')
        cf.destroy()
############ create PCFG from the productions ####### from nltk import Nonterminal from nltk import induce_pcfg S = Nonterminal('SENT') grammar = induce_pcfg(S, productions) print(grammar) ######### Parser with CYK dynamic algorithm ######## from nltk.parse import pchart from nltk.parse import ViterbiParser from nltk.treetransforms import un_chomsky_normal_form parser = ViterbiParser(grammar) parser.trace(2) parses_bank = [] test_file = open(args.test_dir, 'wb') test_output_file = open(args.output_dir, 'wb') for i in range(valid_idx, test_idx + 1): # take the leaves of each tree of testset and store # them in the test file tokens = treebank[i][0].leaves() sentence = u" ".join(tokens) test_file.write((sentence + u"\n").encode('utf-8')) print 'parsing :', sentence # we will use lexicon knowledge to replace the
def demo():
    """
    A demonstration of the probabilistic parsers.  The user is prompted to
    select which demo to run, and how many parses should be found; and then
    each parser is run on the same demo, and a summary of the results are
    displayed.
    """
    import sys
    import time

    from nltk import tokenize
    from nltk.grammar import PCFG
    from nltk.parse import ViterbiParser

    toy_pcfg1 = PCFG.fromstring(
        """
    S -> NP VP [1.0]
    NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15]
    Det -> 'the' [0.8] | 'my' [0.2]
    N -> 'man' [0.5] | 'telescope' [0.5]
    VP -> VP PP [0.1] | V NP [0.7] | V [0.2]
    V -> 'ate' [0.35] | 'saw' [0.65]
    PP -> P NP [1.0]
    P -> 'with' [0.61] | 'under' [0.39]
    """
    )

    toy_pcfg2 = PCFG.fromstring(
        """
    S -> NP VP [1.0]
    VP -> V NP [.59]
    VP -> V [.40]
    VP -> VP PP [.01]
    NP -> Det N [.41]
    NP -> Name [.28]
    NP -> NP PP [.31]
    PP -> P NP [1.0]
    V -> 'saw' [.21]
    V -> 'ate' [.51]
    V -> 'ran' [.28]
    N -> 'boy' [.11]
    N -> 'cookie' [.12]
    N -> 'table' [.13]
    N -> 'telescope' [.14]
    N -> 'hill' [.5]
    Name -> 'Jack' [.52]
    Name -> 'Bob' [.48]
    P -> 'with' [.61]
    P -> 'under' [.39]
    Det -> 'the' [.41]
    Det -> 'a' [.31]
    Det -> 'my' [.28]
    """
    )

    # Define two demos. Each demo has a sentence and a grammar.
    demos = [
        ("I saw the man with my telescope", toy_pcfg1),
        ("the boy saw Jack with Bob under the table with a telescope", toy_pcfg2),
    ]

    # Ask the user which demo they want to use.
    print()
    for i in range(len(demos)):
        print(f"{i + 1:>3}: {demos[i][0]}")
        print(" %r" % demos[i][1])
    print()
    print("Which demo (%d-%d)? " % (1, len(demos)), end=" ")
    try:
        snum = int(sys.stdin.readline().strip()) - 1
        sent, grammar = demos[snum]
    except (ValueError, IndexError):
        # Narrowed from a bare except: bad int -> ValueError,
        # out-of-range choice -> IndexError.
        print("Bad sentence number")
        return

    # Tokenize the sentence.
    tokens = sent.split()

    parser = ViterbiParser(grammar)
    all_parses = {}

    print(f"\nsent: {sent}\nparser: {parser}\ngrammar: {grammar}")
    parser.trace(3)
    t = time.time()
    parses = parser.parse_all(tokens)
    # Renamed from `time` so the time module is not shadowed.
    elapsed = time.time() - t
    average = (reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
               if parses else 0)
    num_parses = len(parses)
    for p in parses:
        all_parses[p.freeze()] = 1

    # Print some summary statistics
    print()
    print("Time (secs) # Parses Average P(parse)")
    print("-----------------------------------------")
    print("%11.4f%11d%19.14f" % (elapsed, num_parses, average))
    parses = all_parses.keys()
    if parses:
        p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
    else:
        p = 0
    print("------------------------------------------")
    print("%11s%11d%19.14f" % ("n/a", len(parses), p))

    # Ask the user if we should draw the parses.
    print()
    print("Draw parses (y/n)? ", end=" ")
    if sys.stdin.readline().strip().lower().startswith("y"):
        from nltk.draw.tree import draw_trees

        print(" please wait...")
        draw_trees(*parses)

    # Ask the user if we should print the parses.
    print()
    print("Print parses (y/n)? ", end=" ")
    if sys.stdin.readline().strip().lower().startswith("y"):
        for parse in parses:
            print(parse)
def demo():
    """
    A demonstration of the probabilistic parsers.  The user is prompted to
    select which demo to run, and how many parses should be found; and then
    each parser is run on the same demo, and a summary of the results are
    displayed.
    """
    import sys, time

    from nltk import tokenize
    from nltk.parse import ViterbiParser
    from nltk.grammar import toy_pcfg1, toy_pcfg2

    # Define the demos. Each demo has a sentence and a grammar.
    # NOTE(review): `learned_pcfg` is defined elsewhere in this module.
    demos = [('حرك الكرة الخضراء في أسفل الزاوية اليسرى', learned_pcfg),
             ('حرك الكرة', learned_pcfg),
             ('take the green pyramid and put it in the top left corner', learned_pcfg),
             ('move the pink triangle on top of the black square', learned_pcfg),
             ('move the red block and place it on top of the blue block that is on top of a green block', learned_pcfg),
             ('move the green block on top of the blue block', learned_pcfg)]

    # Ask the user which demo they want to use.
    print()
    for i in range(len(demos)):
        print('%3s: %s' % (i + 1, demos[i][0]))
    print()
    print('Which demo (%d-%d)? ' % (1, len(demos)), end=' ')
    try:
        snum = int(sys.stdin.readline().strip()) - 1
        sent, grammar = demos[snum]
    except (ValueError, IndexError):
        # Narrowed from a bare except: bad int -> ValueError,
        # out-of-range choice -> IndexError.
        print('Bad sentence number')
        return

    # Tokenize the sentence.
    tokens = sent.split()
    print(grammar)

    parser = ViterbiParser(grammar)
    all_parses = {}
    parser.trace(3)
    t = time.time()
    parses = parser.parse_all(tokens)
    # Renamed from `time` so the time module is not shadowed.
    elapsed = time.time() - t
    average = (reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
               if parses else 0)
    num_parses = len(parses)
    for p in parses:
        all_parses[p.freeze()] = 1

    # Print some summary statistics
    print()
    print('Time (secs) # Parses Average P(parse)')
    print('-----------------------------------------')
    print('%11.4f%11d%19.14f' % (elapsed, num_parses, average))
    parses = all_parses.keys()
    if parses:
        p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
    else:
        p = 0
    print('------------------------------------------')
    print('%11s%11d%19.14f' % ('n/a', len(parses), p))

    # Ask the user if we should draw the parses.
    print()
    print('Draw parses (y/n)? ', end=' ')
    if sys.stdin.readline().strip().lower().startswith('y'):
        from nltk.draw.tree import draw_trees
        print(' please wait...')
        draw_trees(*parses)

    # Ask the user if we should print the parses.
    print()
    print('Print parses (y/n)? ', end=' ')
    if sys.stdin.readline().strip().lower().startswith('y'):
        for parse in parses:
            print(parse)
def parseAllTestXmls(fileList, grammar, allTestSolutionsDict, verbose=False, displayTrees=False):
    """Viterbi-parse the interval strings of each test file and score the
    resulting trees against the known solution trees.

    :param fileList: paths of the test files to parse.
    :param grammar: induced PCFG used by the ViterbiParser.
    :param allTestSolutionsDict: maps filepath -> solution tree string.
    :param verbose: print intermediate interval lists while parsing.
    :param displayTrees: pop up drawings of successful parses.
    :return: (totalCorrect, totalCorrectNonN, totalProductions, totalLeaves,
              parseTreeStrings). The no-parse early exit now returns the same
              5-tuple shape (the original returned a bare (0, 0), which broke
              callers that unpack five values).
    """
    testPitchLists = []
    testIntervalLists = []
    totalCorrect = 0
    totalCorrectNonN = 0
    totalProductions = 0
    totalLeaves = 0
    parseTreeStrings = {}
    for filepath in fileList:
        curPitchList = getPitchListFromFile(filepath)
        testPitchLists.append(curPitchList)
        testIntervalLists.append(getIntervalStringsFromPitchList(curPitchList, verbose))
        if verbose:
            print(testIntervalLists[-1])
        # (The original also did `print(tree)` here with `tree` undefined,
        # which raised NameError whenever verbose was set — removed.)
        parser = ViterbiParser(grammar)
        parser.trace(0)
        # Pre-bind so a parse failure below cannot leave `parses` undefined.
        parses = []
        try:
            parses = parser.parse_all(testIntervalLists[-1])
        except Exception as errorMsg:
            print("error parsing file " + filepath)
            print(errorMsg)
        numTrees = len(parses)
        if numTrees > 0 and displayTrees == True:
            from nltk.draw.tree import draw_trees
            draw_trees(*parses)
        if numTrees == 0:
            print("Couldn't find a valid parse, this is bad, very very bad")
            return 0, 0, totalProductions, totalLeaves, parseTreeStrings
        numCorrect = 0
        numCorrectNonN = 0
        bottomCorrect = 0
        bottomCorrectNonN = 0
        solutionTree = None
        try:
            solutionTreeStr = allTestSolutionsDict[filepath]
            solutionTree = Tree.fromstring(solutionTreeStr)
        except Exception as errorMsg:
            print("couldn't find solution for file " + filepath)
            print(errorMsg)
        if solutionTree != None and solutionTree != '':
            parseTreeStrings[filepath] = str(parses[0])
            numCorrect, numCorrectNonN = validate_tree.compareTrees(solutionTree, parses[0])
            numProductions = len(solutionTree.productions())
            totalProductions += numProductions
            # compareTreesBottomUp may relabel nodes, so snapshot afterwards.
            bottomCorrect, bottomCorrectNonN = validate_tree.compareTreesBottomUp(solutionTree, parses[0])
            parseTreeStrings[filepath + '_afterComparison'] = str(parses[0])
            totalLeaves += len(solutionTree.leaves())
        totalCorrect += bottomCorrect
        totalCorrect += numCorrect
        totalCorrectNonN += numCorrectNonN
        totalCorrectNonN += bottomCorrectNonN
    return totalCorrect, totalCorrectNonN, totalProductions, totalLeaves, parseTreeStrings
print(t) ### try RandomChartParser, UnsortedChartParser, LongestChartParser # In[ ]: parser = nltk.parse.EarleyChartParser(grammar) for t in parser.parse(tokens): print(t) # In[ ]: ### CYK parser gets the most probable parse from nltk.parse import ViterbiParser parser = ViterbiParser(grammar) parser.trace(3) parsed_sent = list(parser.parse_all(tokens)) # to convert generator to list parsed_sent[0].draw() for t in parsed_sent: print(t) # In[ ]: ### CYK parser gets the most probable parse from nltk.parse import ViterbiParser parser = ViterbiParser(grammar) parser.trace(3) parsed_sent = list(parser.parse(tokens)) # to convert generator to list parsed_sent[0].draw() for t in parsed_sent:
def demo():
    """
    A demonstration of the probabilistic parsers.  The user is prompted to
    select which demo to run, and how many parses should be found; and then
    each parser is run on the same demo, and a summary of the results are
    displayed.
    """
    import sys
    import time

    from nltk import tokenize
    from nltk.grammar import toy_pcfg1, toy_pcfg2
    from nltk.parse import ViterbiParser

    # Define two demos. Each demo has a sentence and a grammar.
    demos = [
        ("I saw the man with my telescope", toy_pcfg1),
        ("the boy saw Jack with Bob under the table with a telescope", toy_pcfg2),
    ]

    # Ask the user which demo they want to use.
    print()
    for i in range(len(demos)):
        print(f"{i + 1:>3}: {demos[i][0]}")
        print(" %r" % demos[i][1])
    print()
    print("Which demo (%d-%d)? " % (1, len(demos)), end=" ")
    try:
        snum = int(sys.stdin.readline().strip()) - 1
        sent, grammar = demos[snum]
    except (ValueError, IndexError):
        # Narrowed from a bare except: bad int -> ValueError,
        # out-of-range choice -> IndexError.
        print("Bad sentence number")
        return

    # Tokenize the sentence.
    tokens = sent.split()

    parser = ViterbiParser(grammar)
    all_parses = {}

    print(f"\nsent: {sent}\nparser: {parser}\ngrammar: {grammar}")
    parser.trace(3)
    t = time.time()
    parses = parser.parse_all(tokens)
    # Renamed from `time` so the time module is not shadowed.
    elapsed = time.time() - t
    average = (reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
               if parses else 0)
    num_parses = len(parses)
    for p in parses:
        all_parses[p.freeze()] = 1

    # Print some summary statistics
    print()
    print("Time (secs) # Parses Average P(parse)")
    print("-----------------------------------------")
    print("%11.4f%11d%19.14f" % (elapsed, num_parses, average))
    parses = all_parses.keys()
    if parses:
        p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
    else:
        p = 0
    print("------------------------------------------")
    print("%11s%11d%19.14f" % ("n/a", len(parses), p))

    # Ask the user if we should draw the parses.
    print()
    print("Draw parses (y/n)? ", end=" ")
    if sys.stdin.readline().strip().lower().startswith("y"):
        from nltk.draw.tree import draw_trees

        print(" please wait...")
        draw_trees(*parses)

    # Ask the user if we should print the parses.
    print()
    print("Print parses (y/n)? ", end=" ")
    if sys.stdin.readline().strip().lower().startswith("y"):
        for parse in parses:
            print(parse)