def test_sentences(grammar):
    for t in test:
        print("Processing: " + str(t))
        reference = list(treebank.tagged_words(t))
        tokens = list(treebank.words(t))

        print("fixing grammar.....")
        # Checks if grammar covers all words in the sentence and adds them
        # to the grammar if necessary
        fixed_grammar = get_fixed_grammer(grammar, tokens)
        print("fixed grammar")

        print("Building Parser....")
        parser = ViterbiParser(fixed_grammar)

        print("Parsing...")
        # Gets list of all possible trees, the most likely tree is at index 0
        start = time.time()
        parses = parser.parse_all(tokens)
        print("Time")
        print(time.time() - start)  # elapsed seconds

        # Getting POS tags from parser tree
        leafs = parses[0].pos()

        # Calculating accuracy of Parser results
        correct_tags = 0.0
        for i in range(len(leafs)):
            if leafs[i] == reference[i]:
                correct_tags += 1.0

        print(str(correct_tags / len(leafs)))
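# The function above assumes a `get_fixed_grammer` helper that is not shown.
# A minimal hypothetical sketch, based only on the comment at its call site:
# it covers unseen tokens by adding them as lexical productions. The NN
# fallback tag and the re-induction step are assumptions, not original code.
from nltk import induce_pcfg
from nltk.grammar import Nonterminal, Production

def get_fixed_grammer(grammar, tokens):
    # Collect every terminal the grammar already knows.
    covered = {sym for prod in grammar.productions()
               for sym in prod.rhs() if isinstance(sym, str)}
    # Rebuild the production list, adding one NN rule per uncovered token.
    productions = [Production(p.lhs(), p.rhs()) for p in grammar.productions()]
    for word in tokens:
        if word not in covered:
            productions.append(Production(Nonterminal('NN'), [word]))
    # Re-induce so the rule probabilities stay normalized.
    return induce_pcfg(grammar.start(), productions)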
def main():
    # print(nltk.corpus.treebank.parsed_sents('wsj_0001.mrg')[0])
    # nltk.corpus.treebank.parsed_sents('wsj_0001.mrg')[0].draw()
    # print("Induce PCFG grammar from treebank data:")

    productions = []
    print(len(treebank.fileids()))
    for item in treebank.fileids():  # Goes through all trees
        for tree in treebank.parsed_sents(item):
            # perform optional tree transformations, e.g.:
            tree.collapse_unary(collapsePOS=False)   # Remove branches A-B-C into A-B+C
            tree.chomsky_normal_form(horzMarkov=2)   # Remove A->(B,C,D) into A->B,C+D->D
            productions += tree.productions()

    # print(type(productions[0]))

    S = Nonterminal('S')
    grammar = induce_pcfg(S, productions)
    # print(grammar)  # This is a PCFG
    # pickle.dump(grammar, open("tbank-grammar.p", "wb"))
    # t = time.time()
    # grammar = pickle.load(open("tbank-grammar.p", "rb"))
    # textf = open("lexicon.txt", "w")
    # n = textf.write(str(reduce(lambda a, b: a + "\n" + b, list(filter(lambda x: "'" in x, str(grammar).split("\n"))))))
    # textf.close()
    # print(time.time()-t)

    parser = ViterbiParser(grammar)
    # pickle.dump(parser, open("cky-parser.p", "wb"))
    # parser = pickle.load(open("cky-parser.p", "rb"))
    parser.trace(0)

    sent = "John will join the board"
    tokens = sent.split()
    try:
        grammar.check_coverage(tokens)
        print("All words covered")
        parses = parser.parse_all(tokens)
        if parses:
            lp = len(parses)
            print(lp)
            print(parses[0].label())
            # parses[0].draw()
            p = reduce(lambda a, b: a + b.prob(),
                       list(filter(lambda x: x.label() == 'S', parses)), 0.0)
        else:
            p = 0
        print("Probability:", p)
    except ValueError:  # check_coverage raises ValueError for missing words
        print("Some words not covered")
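# The commented-out pickle lines in main() hint at a caching pattern worth
# making explicit: save the induced PCFG once, then reload it on later runs
# instead of re-inducing it from the whole treebank. A minimal round-trip
# sketch (the file name comes from the comments above):
import pickle

with open("tbank-grammar.p", "wb") as f:
    pickle.dump(grammar, f)

with open("tbank-grammar.p", "rb") as f:
    grammar = pickle.load(f)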
def Parser_Section():
    demos = [('I saw John through the telescope', toy_pcfg1)]
    sent, grammar = demos[0]
    # print(grammar)

    # Tokenize the sentence.
    tokens = sent.split()

    parser = ViterbiParser(grammar)
    parser.trace(0)  # Use this to change verbosity

    t = time.time()
    parses = parser.parse_all(tokens)
    print("Time:", time.time() - t)

    if parses:
        lp = len(parses)
        p = reduce(lambda a, b: a + b.prob(), parses, 0.0)
    else:
        p = 0
    print("Probability:", p)
def demo():
    """
    A demonstration of the probabilistic parsers.  The user is
    prompted to select which demo to run, and how many parses should
    be found; and then each parser is run on the same demo, and a
    summary of the results are displayed.
    """
    import sys, time
    from functools import reduce

    from nltk import tokenize
    from nltk.parse import ViterbiParser
    from nltk.grammar import toy_pcfg1, toy_pcfg2

    # Define two demos.  Each demo has a sentence and a grammar.
    demos = [
        ('I saw the man with my telescope', toy_pcfg1),
        ('the boy saw Jack with Bob under the table with a telescope', toy_pcfg2),
    ]

    # Ask the user which demo they want to use.
    print()
    for i in range(len(demos)):
        print('%3s: %s' % (i + 1, demos[i][0]))
        print('     %r' % demos[i][1])
    print()
    print('Which demo (%d-%d)? ' % (1, len(demos)), end=' ')
    try:
        snum = int(sys.stdin.readline().strip()) - 1
        sent, grammar = demos[snum]
    except (ValueError, IndexError):
        print('Bad sentence number')
        return

    # Tokenize the sentence.
    tokens = sent.split()

    parser = ViterbiParser(grammar)
    all_parses = {}

    print('\nsent: %s\nparser: %s\ngrammar: %s' % (sent, parser, grammar))
    parser.trace(3)
    t = time.time()
    parses = parser.parse_all(tokens)
    elapsed = time.time() - t  # do not shadow the `time` module
    average = (reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
               if parses else 0)
    num_parses = len(parses)
    for p in parses:
        all_parses[p.freeze()] = 1

    # Print some summary statistics
    print()
    print('Time (secs)   # Parses   Average P(parse)')
    print('-----------------------------------------')
    print('%11.4f%11d%19.14f' % (elapsed, num_parses, average))
    parses = list(all_parses.keys())
    if parses:
        p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
    else:
        p = 0
    print('------------------------------------------')
    print('%11s%11d%19.14f' % ('n/a', len(parses), p))

    # Ask the user if we should draw the parses.
    print()
    print('Draw parses (y/n)? ', end=' ')
    if sys.stdin.readline().strip().lower().startswith('y'):
        from nltk.draw.tree import draw_trees
        print('  please wait...')
        draw_trees(*parses)

    # Ask the user if we should print the parses.
    print()
    print('Print parses (y/n)? ', end=' ')
    if sys.stdin.readline().strip().lower().startswith('y'):
        for parse in parses:
            print(parse)
def demo():
    """
    A demonstration of the probabilistic parsers.  The user is
    prompted to select which demo to run, and how many parses should
    be found; and then each parser is run on the same demo, and a
    summary of the results are displayed.
    """
    import sys, time
    from functools import reduce

    from nltk import tokenize, Tree
    from nltk.parse import ViterbiParser
    from nltk.grammar import toy_pcfg1, toy_pcfg2
    from nltk.draw.tree import draw_trees
    from nltk.draw.util import CanvasFrame
    from nltk.draw import TreeWidget

    # Define two demos.  Each demo has a sentence and a grammar.
    # demos = [('move the green sphere to the bottom left corner', learned_pcfg),
    #          ('move the green ball over the red block', learned_pcfg),
    #          ('take the green pyramid and put it in the top left corner', learned_pcfg),
    #          ('put the green pyramid on the red block', learned_pcfg),
    #          ('move the red cylinder and place it on top of the blue cylinder that is on top of a green cylinder', learned_pcfg)]

    # Ask the user which demo they want to use.
    # print()
    # for i in range(len(demos)):
    #     print('%3s: %s' % (i+1, demos[i][0]))
    #     print('     %r' % demos[i][1])
    # print()
    # print('Which demo (%d-%d)? ' % (1, len(demos)), end=' ')
    # try:
    #     snum = int(sys.stdin.readline().strip())-1
    #     sent, grammar = demos[snum]
    # except:
    #     print('Bad sentence number')
    #     return

    # Zero-pad the scene number to five digits, e.g. 1 -> '00001'.
    max_scene = 1
    sc = str(max_scene).zfill(5)

    g = 'grammar_' + sc + '.txt'
    # load() comes from the surrounding project
    learned_pcfg = load('/home/omari/Dropbox/robot_modified/AR/grammar/' + g)
    grammar = learned_pcfg

    file1 = open('/home/omari/Dropbox/robot_modified/AR/hypotheses/matched_commands.txt', 'r')
    g1 = [i for i in file1.readlines()]

    for line in g1:
        # (the original called the Python 2 builtin unicode(); in Python 3
        # the line is already a str, so no conversion is needed)
        sent = line.split('\n')[0].split('-')[-1]
        scene = line.split('\n')[0].split('-')[0]
        sent_num = line.split('\n')[0].split('-')[1]
        print(line)
        if scene == '239' and sent_num == '0':
            continue

        # Tokenize the sentence.
        tokens = sent.split()

        parser = ViterbiParser(grammar)
        all_parses = {}

        # print('\nsent: %s\nparser: %s\ngrammar: %s' % (sent, parser, grammar))
        parser.trace(3)
        parses = parser.parse_all(tokens)
        average = (reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
                   if parses else 0)
        num_parses = len(parses)
        for p in parses:
            all_parses[p.freeze()] = 1

        # Print some summary statistics
        # print()
        # print('Time (secs)   # Parses   Average P(parse)')
        # print('-----------------------------------------')
        # print('%11.4f%11d%19.14f' % (time, num_parses, average))
        parses = list(all_parses.keys())
        if parses:
            p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
        else:
            p = 0
        # print('------------------------------------------')
        # print('%11s%11d%19.14f' % ('n/a', len(parses), p))

        # Ask the user if we should draw the parses.
        # print()
        # print('Draw parses (y/n)? ', end=' ')
        # if sys.stdin.readline().strip().lower().startswith('y'):
        #     print('  please wait...')
        #     draw_trees(*parses)

        cf = CanvasFrame()
        # t = Tree(parses)
        t = Tree.fromstring('(S (CH_POS_PREPOST move) (PRE_POST (PRE (the the) (_entity (F_HSV green) (F_SHAPE sphere))) (PREPOST_connect (to to) (the the)) (POST (_F_POS (F_POS (_bottom_left (bottom bottom) (left left)))) (corner corner))))')
        tc = TreeWidget(cf.canvas(), t, draggable=1,
                        node_font=('helvetica', -14),
                        leaf_font=('helvetica', -12),
                        roof_fill='white', roof_color='black',
                        leaf_color='green4', node_color='blue4')
        cf.add_widget(tc, 10, 10)
        # tc = TreeWidget(cf.canvas(), t)
        # cf.add_widget(tc, 10, 10)  # (10,10) offsets
        cf.print_to_file('/home/omari/Dropbox/robot_modified/trees/scene-' + scene + '-' + sent_num + '.ps')
        cf.destroy()
# (fragment: the indented block below is the body of a loop over the test
# sentences; the closing lines run after the loop)
    # them in the test file
    tokens = treebank[i][0].leaves()
    sentence = u" ".join(tokens)
    test_file.write((sentence + u"\n").encode('utf-8'))
    print('parsing :', sentence)

    # we will use lexicon knowledge to replace the
    # unknown word in order to do a parsing with large corpus
    # of unknown words
    unknowns = check_unknown_words(tokens, grammar)
    if len(unknowns) > 0:
        grammar = update_grammar(productions, unknowns)

    parser = ViterbiParser(grammar)
    parser.trace(2)
    parses = parser.parse_all(tokens)
    if len(parses) > 0:
        parse = parses[0]
    else:
        parse = ""
    test_output_file.write(" ".join(parse.__str__().replace("\n", '').split()))
    test_output_file.write('\n')
    parses_bank.append(parse)

test_file.close()
test_output_file.close()

if args.run_in_shell != "True":
    sys.exit(0)
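# The fragment above calls check_unknown_words() and update_grammar(), which
# are not shown. A plausible sketch based on the surrounding comments (names
# and signatures come from the call sites; the NN fallback tag is an
# assumption, as in the get_fixed_grammer sketch earlier):
from nltk import induce_pcfg
from nltk.grammar import Nonterminal, Production

def check_unknown_words(tokens, grammar):
    # Return the tokens that no lexical production in the grammar covers.
    known = {sym for prod in grammar.productions()
             for sym in prod.rhs() if isinstance(sym, str)}
    return [tok for tok in tokens if tok not in known]

def update_grammar(productions, unknowns):
    # Re-induce the PCFG with each unknown word added under a guessed tag.
    extended = list(productions) + [Production(Nonterminal('NN'), [w]) for w in unknowns]
    return induce_pcfg(Nonterminal('S'), extended)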
def demo():
    """
    A demonstration of the probabilistic parsers.  The user is
    prompted to select which demo to run, and how many parses should
    be found; and then each parser is run on the same demo, and a
    summary of the results are displayed.
    """
    import sys
    import time
    from functools import reduce

    from nltk import tokenize
    from nltk.grammar import PCFG
    from nltk.parse import ViterbiParser

    toy_pcfg1 = PCFG.fromstring(
        """
        S -> NP VP [1.0]
        NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15]
        Det -> 'the' [0.8] | 'my' [0.2]
        N -> 'man' [0.5] | 'telescope' [0.5]
        VP -> VP PP [0.1] | V NP [0.7] | V [0.2]
        V -> 'ate' [0.35] | 'saw' [0.65]
        PP -> P NP [1.0]
        P -> 'with' [0.61] | 'under' [0.39]
        """
    )

    toy_pcfg2 = PCFG.fromstring(
        """
        S -> NP VP [1.0]
        VP -> V NP [.59]
        VP -> V [.40]
        VP -> VP PP [.01]
        NP -> Det N [.41]
        NP -> Name [.28]
        NP -> NP PP [.31]
        PP -> P NP [1.0]
        V -> 'saw' [.21]
        V -> 'ate' [.51]
        V -> 'ran' [.28]
        N -> 'boy' [.11]
        N -> 'cookie' [.12]
        N -> 'table' [.13]
        N -> 'telescope' [.14]
        N -> 'hill' [.5]
        Name -> 'Jack' [.52]
        Name -> 'Bob' [.48]
        P -> 'with' [.61]
        P -> 'under' [.39]
        Det -> 'the' [.41]
        Det -> 'a' [.31]
        Det -> 'my' [.28]
        """
    )

    # Define two demos.  Each demo has a sentence and a grammar.
    demos = [
        ("I saw the man with my telescope", toy_pcfg1),
        ("the boy saw Jack with Bob under the table with a telescope", toy_pcfg2),
    ]

    # Ask the user which demo they want to use.
    print()
    for i in range(len(demos)):
        print(f"{i + 1:>3}: {demos[i][0]}")
        print("     %r" % demos[i][1])
    print()
    print("Which demo (%d-%d)? " % (1, len(demos)), end=" ")
    try:
        snum = int(sys.stdin.readline().strip()) - 1
        sent, grammar = demos[snum]
    except (ValueError, IndexError):
        print("Bad sentence number")
        return

    # Tokenize the sentence.
    tokens = sent.split()

    parser = ViterbiParser(grammar)
    all_parses = {}

    print(f"\nsent: {sent}\nparser: {parser}\ngrammar: {grammar}")
    parser.trace(3)
    t = time.time()
    parses = parser.parse_all(tokens)
    elapsed = time.time() - t  # do not shadow the `time` module
    average = (reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
               if parses else 0)
    num_parses = len(parses)
    for p in parses:
        all_parses[p.freeze()] = 1

    # Print some summary statistics
    print()
    print("Time (secs)   # Parses   Average P(parse)")
    print("-----------------------------------------")
    print("%11.4f%11d%19.14f" % (elapsed, num_parses, average))
    parses = list(all_parses.keys())
    if parses:
        p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
    else:
        p = 0
    print("------------------------------------------")
    print("%11s%11d%19.14f" % ("n/a", len(parses), p))

    # Ask the user if we should draw the parses.
    print()
    print("Draw parses (y/n)? ", end=" ")
    if sys.stdin.readline().strip().lower().startswith("y"):
        from nltk.draw.tree import draw_trees

        print("  please wait...")
        draw_trees(*parses)

    # Ask the user if we should print the parses.
    print()
    print("Print parses (y/n)? ", end=" ")
    if sys.stdin.readline().strip().lower().startswith("y"):
        for parse in parses:
            print(parse)
# Tokenize the sentence.
# tokens = sent.split()
tokens = sys.argv[1:]

# Define a list of parsers.  We'll use all parsers.
parser = ViterbiParser(grammar)

print('Coverage of input words by a grammar:\n')
change_words = []
for i, ele in enumerate(tokens):
    try:
        grammar.check_coverage([ele])
    except ValueError:  # check_coverage raises ValueError for missing words
        print("%s is not covered by the grammar. Replacing it with 'UNK'\n" % ele)
        change_words.append(tokens[i])
        tokens[i] = 'UNK'

trees = parser.parse_all(tokens)
for tree in trees:
    pass

# Substitute the original words back into the tree string, one 'UNK' at a time.
UNK_str = trees[0].__str__()
answer = UNK_str
for i in change_words:
    answer = answer.replace("UNK", i, 1)

print("\nTree is:\n\n")
print(answer)
# (assumed imports for this fragment; music_grammar, pcfg_generate,
# score_from_tree and validate_tree are modules of the surrounding project)
import argparse, copy, json, os, re
from os.path import basename
from xml.etree.ElementTree import ElementTree
from music21 import converter
from nltk import CFG, Nonterminal, ParentedTree, Tree, induce_pcfg
from nltk.parse import ChartParser, ViterbiParser
from nltk.probability import DictionaryProbDist
import music_grammar, pcfg_generate, score_from_tree, validate_tree

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("directory", help="Directory that contains melody files")
    parser.add_argument("solutions", help="Directory that contains the solution files")
    parser.add_argument("-g", "--grammar", help="The file that specifies a saved grammar; this grammar will be used instead of training a new one")
    parser.add_argument("-f", "--folds", help="number of folds desired")
    parser.add_argument("-o", "--outfile", help="The file that the grammar will be saved in")
    parser.add_argument("-t", "--type", help="The type of solutions file we're using, either 'PR' or 'TS' for Prolongational Reduction or Time-span Tree, respectively")
    parser.add_argument("-v", "--verbose", help="increase output verbosity")
    args = parser.parse_args()
    print(args)
    if args.verbose is None or args.verbose == 'False':
        args.verbose = False
    elif args.verbose == 'True':
        args.verbose = True

    # If the grammar is specified, then the "directory" folder will be used as a test set.
    # If the grammar is not specified, it will use 20% of the files in "directory" as a
    # test set, and the rest as a training set to create the grammar.
    # If folds are specified, then it will split the files in "directory" into numFolds
    # groups, applying training and testing, and then adding up the percentages overall.
    allProductions = []
    stats = True
    if args.grammar is not None and args.grammar != '':
        if not os.path.isfile(args.grammar):
            return
        f = open(args.grammar, 'r')
        for line in f.readlines():
            allProductions.append(line)
        S = Nonterminal('S')
        smallTrees = music_grammar.collectProductions(allProductions, args.verbose)
        trainedGrammar = induce_pcfg(S, smallTrees)
        print(trainedGrammar)

        np_productions = trainedGrammar.productions(Nonterminal('4'))
        rhs_probs = {}
        for pr in np_productions:
            rhs_probs[pr.rhs()] = pr.prob()
        np_probDist = DictionaryProbDist(rhs_probs)

        # Used this code for generating specific figures:
        exampleTree = Tree.fromstring('(S (N (N (5 5)) (N (m4 -4))) (N (6 6)))')
        # exampleTree.draw()
        exampleTreeToCompare = Tree.fromstring('(S (N (5 5)) (N (N (m4 -4)) (N (6 6))))')
        # exampleTreeToCompare.draw()
        validate_tree.compareTreesBottomUp(exampleTree, exampleTreeToCompare)
        # exampleTreeToCompare.draw()

        for i in range(100):
            rightHand = np_probDist.generate()
            print(rightHand)
            print(len(rightHand))

        generatedTree = pcfg_generate.generate(trainedGrammar)
        print('resulting tree: ')
        generatedTreeObj = Tree.fromstring(generatedTree)
        print(generatedTreeObj)
        print(str(generatedTreeObj.leaves()))
        print('\n\n')
        embellishedTree = pcfg_generate.expandAllLeaves(trainedGrammar, generatedTreeObj)
        print(embellishedTree)
        print(str(Tree.fromstring(str(embellishedTree)).leaves()))

        fileToTest = "./MusicXml/72_Preludes 1 La fille aux cheveux de lin.xml"
        musicXmlTest = converter.parse(fileToTest)
        curPitchList = music_grammar.getPitchListFromFile(fileToTest)
        intervalList = music_grammar.getIntervalStringsFromPitchList(curPitchList)

        with open(music_grammar.musicGrammarFilename, 'r') as f:
            musicGrammarString = f.read()
        musicGrammar = CFG.fromstring(musicGrammarString)
        parser = ChartParser(musicGrammar, trace=2)
        # parses = parser.parse(intervalList)
        print(intervalList)
        print('num intervals is: ' + str(len(intervalList)))
        # numTrees = sum(1 for _ in parses)
        # print('numTrees is: ' + str(numTrees))
        # return

        # this is for the musical examples
        trainedParser = ViterbiParser(trainedGrammar)
        parses = trainedParser.parse_all(intervalList)
        bestParse = parses[0]
        # bestParse.draw()
        treeType = bestParse.convert(ParentedTree)
        parse_depth = 0
        depth = score_from_tree.get_total_depth_of_tree_obj(bestParse, parse_depth)
        print('depth is : ' + str(depth))
        print('builtin height is : ' + str(bestParse.height()))
        print(bestParse)
        bestParse.draw()
        # score_from_tree.get_tree_obj_to_negative_depth(bestParse, 2, parse_depth)
        # prunedBestParse, removedLeaves, leafIndex = score_from_tree.remove_embellishment_rules_from_tree_below_depth(bestParse, {}, depth - 2, 0, 0)
        prunedBestParse, removedLeaves, leafIndex, maxSubtreeDepth = score_from_tree.remove_embellishment_rules_from_tree_negative_depth(bestParse, {}, 3, 0, 0)
        print(prunedBestParse)
        # for s in parentedBestParse.subtrees(lambda t: t.height() > parse_depth - 3):
        #     treepos = parentedBestParse.treeposition(s)
        #     parentedBestParse.remove(treepos)
        prunedBestParse.draw()
        score_from_tree.get_melody_from_parse_tree(bestParse, removedLeaves, musicXmlTest)

        PR_fileToTest = "./MusicXml/PR/PR-39_Andante C dur.xml"
        ET = ElementTree()
        ET.parse(PR_fileToTest)
        root = ET.getroot()
        rootName = args.type.lower()
        topXml = root.find(rootName)
        depth = 0
        depth = score_from_tree.get_total_depth_of_tree(topXml, depth, rootName)
        print('depth is ' + str(depth))
        musicXmlTest.show()
        for d in reversed(range(0, depth - 1)):
            pitch_refs = score_from_tree.gather_note_refs_of_depth(topXml, [], rootName, d, 0)
            pitch_refs.sort(key=music_grammar.pitchRefToNum)
            melody_of_depth = score_from_tree.pitch_refs_to_notes(pitch_refs, musicXmlTest)
            melody_of_depth.show()
            print(pitch_refs)

        # examples with 3-child nodes:
        # './MusicXml/MSC-166.xml', './MusicXml/MSC-103.xml', './MusicXml/37_Sonate fur Klavier Nr.48 C dur Op.30-1 Mov.1.xml', './MusicXml/MSC-211.xml'
        # wrong but i like it: './MusicXml/MSC-238.xml'
        # like: ./MusicXml/MSC-141.xml fold 2
        # filesToTest = ['./MusicXml/MSC-238.xml', './MusicXml/39_Andante C dur.xml']  # fold 1
        # filesToTest = ['./MusicXml/MSC-224.xml', './MusicXml/MSC-141.xml']  # fold 2
        # filesToTest = ["./MusicXml/03_Bagatelle 'Fur Elise' WoO.59.xml"]  # fold 3
        # filesToTest = ['./MusicXml/MSC-111.xml']
        # filesToTest = ['./MusicXml/MSC-108.xml', './MusicXml/01_Waltz in E flat Grande Valse Brillante Op.18.xml', './MusicXml/MSC-231.xml', './MusicXml/37_Sonate fur Klavier Nr.48 C dur Op.30-1 Mov.1.xml', './MusicXml/59_Schwanengesang No.1 Op.72-4 D.957-4 Standchen.xml']  # fold 4
        # filesToTest = ['./MusicXml/MSC-111.xml', './MusicXml/MSC-108.xml', './MusicXml/01_Waltz in E flat Grande Valse Brillante Op.18.xml', './MusicXml/MSC-231.xml', './MusicXml/59_Schwanengesang No.1 Op.72-4 D.957-4 Standchen.xml']  # PR
        # filesToTest = ['./MusicXml/80_Symphonie Nr.40 g moll KV.550 1.Satz.xml', './MusicXml/31_Sinfonie Nr.9 d moll Op.125 4.Satz An die Freude.xml']  # fold 0 PR
        # filesToTest = ['./MusicXml/34_Water Music in D major HWV 349 No.11 Alla Hornpipe.xml', './MusicXml/02_Moments Musicaux.xml']  # fold 0 20%
        # filesToTest = ['./MusicXml/84_Toccata und Fuge d moll BWV565.xml']  # fold 1
        # filesToTest = ['./MusicXml/33_Swan Lake Op.20 No.9 Finale.xml', './MusicXml/40_Alpengluhen Op.193.xml']  # fold 3 PR
        # filesToTest = ['./MusicXml/57_Waves of the Danube.xml']  # , './MusicXml/60_Ma Vlast Moldau.xml']  # fold 3 PR < 20%
        filesToTest = ['./MusicXml/02_Moments Musicaux.xml']  # fold 4 ts

        # gather the solution trees first; parseAllTestXmls needs them
        solutionTreeDictForTestSet, testProductions = music_grammar.getAllProductions(
            args.directory, args.type, filesToTest, args.type, args.verbose)
        totalCorrect, totalCorrectNonN, totalProductions, totalLeaves, parseTreeStrings = \
            music_grammar.parseAllTestXmls(filesToTest, trainedGrammar,
                                           solutionTreeDictForTestSet, args.verbose, False)

        parseFilename = "fold4_" + args.type + "_parsedTestSet.txt"
        parseFile = open(parseFilename, 'r')
        parses = json.load(parseFile)

        for filename, solutionTree in parseTreeStrings.items():
            if "_afterComparison" in filename:
                continue
            treeSolution = solutionTreeDictForTestSet[filename]
            print(filename)
            treeSolutionObj = Tree.fromstring(treeSolution)
            treeSolutionObj.draw()
            parseTreeNoProbabilities = removeProbability(str(parseTreeStrings[filename]))
            parseTreeObj = Tree.fromstring(parseTreeNoProbabilities)
            parseTreeObj.draw()
            parseAfterCompNoProbabilities = removeProbability(str(parseTreeStrings[filename + '_afterComparison']))
            parseAfterCompObj = Tree.fromstring(parseAfterCompNoProbabilities)
            parseAfterCompObj.draw()

        percentageCorrect = -1
        percentageCorrectNonN = -1
        percentageLeaves = -1
        if totalProductions > 0:
            percentageCorrect = totalCorrect / totalProductions
            percentageCorrectNonN = totalCorrectNonN / totalProductions
            percentageLeaves = totalLeaves / totalProductions
        print("results:\ntotalCorrect: " + str(totalCorrect) +
              "\npercentageCorrect: " + str(percentageCorrect) +
              "\ntotalCorrectNonN: " + str(totalCorrectNonN) +
              "\npercentageCorrectNonN: " + str(percentageCorrectNonN) +
              "\ntotalProductions: " + str(totalProductions) +
              "\ntotalLeaves: " + str(totalLeaves) +
              "\npercentageLeaves: " + str(percentageLeaves) + "\n")
        # finish this case
        return

    if stats == True:
        totalCorrect = 0
        totalCorrectNonN = 0
        totalProductions = 0
        totalLeaves = 0
        # small ones: ./MusicXml/MSC-103.xml, ./MusicXml/24_Orphee aux Enfers Overture.xml,
        # ./MusicXml/MSC-211.xml, ./MusicXml/39_Andante C dur.xml,
        # ./MusicXml/01_Waltz in E flat Grande Valse Brillante Op.18.xml
        # ./MusicXml/MSC-224.xml
        # pretty good one: ['./MusicXml/57_Waves of the Danube.xml', './MusicXml/MSC-107.xml', './MusicXml/59_Schwanengesang No.1 Op.72-4 D.957-4 Standchen.xml', './MusicXml/MSC-231.xml']
        goodOnesTS = ['./MusicXml/57_Waves of the Danube.xml', './MusicXml/MSC-107.xml', './MusicXml/59_Schwanengesang No.1 Op.72-4 D.957-4 Standchen.xml', './MusicXml/MSC-231.xml']
        goodOnesPR = ['./MusicXml/02_Moments Musicaux.xml', "./MusicXml/95_12 Variationen uber ein franzosisches Lied 'Ah,vous dirai-je, maman' C dur K.265 K6.300e.xml"]
        # music_grammar.getAllProductionsHarmonicGrammar(args.directory, args.type, [goodOnesPR[0]], args.type, "MINOR", args.verbose)
        # if stats == True:
        #     return
        num_skip = 0
        for fold in range(int(args.folds)):
            bestSolutionFiles = []
            worstSolutionFile = ""
            bestPercentage = .25
            worstPercentage = .2

            # get parses from file
            parseFilename = "fold" + str(fold) + "_" + args.type + "_parsedTestSet.txt"
            parseFile = open(parseFilename, 'r')
            parses = json.load(parseFile)

            # get solutions from file
            solutionsFilename = "fold" + str(fold) + "_" + args.type + "_testSolutions.txt"
            solutionsFile = open(solutionsFilename, 'r')
            solutions = json.load(solutionsFile)

            foldLeaves = 0
            foldProductions = 0
            foldCorrect = 0
            foldCorrectNonN = 0
            for filename, solutionTree in solutions.items():
                if parses[filename] is not None and parses[filename] != '':
                    solutionTreeObj = Tree.fromstring(solutionTree)
                    parseStr = parses[filename]
                    probabilisticPart = re.findall(r'(\(p=[^()]*\))', parseStr)
                    indexOfProbPart = parseStr.index(probabilisticPart[0])
                    parseTreeObj = Tree.fromstring(parseStr[:indexOfProbPart])

                    # here's where we look at example reductions in musical scores
                    curMusicXml = converter.parse(filename)
                    if ((len(curMusicXml.flat.notes) >= 15 and len(curMusicXml.flat.notes) < 20)
                            or filename == './MusicXml/03_Bagatelle \'Fur Elise\' WoO.59.xml'):
                        print(filename)
                        if filename != './MusicXml/03_Bagatelle \'Fur Elise\' WoO.59.xml':
                            continue
                        # if args.type == 'PR':
                        solutionFilename = args.type + "-" + basename(filename)
                        solutionFilepath = args.directory + '/' + args.type + '/' + solutionFilename
                        # else:
                        #     solutionFilename = args.type + "-" + basename(filename)[4:]
                        #     solutionFilepath = args.directory + '/' + args.type + '/' + solutionFilename[:-4] + '_1' + solutionFilename[-4:]
                        ET = ElementTree()
                        ET.parse(solutionFilepath)
                        root = ET.getroot()
                        rootName = args.type.lower()
                        topXml = root.find(rootName)
                        # topXml.show()
                        if num_skip > 0:
                            num_skip -= 1
                            continue
                        parseTreeObj.draw()
                        # score_from_tree.print_reductions_for_parse_tree(parseTreeObj, curMusicXml)
                        depth = 0
                        depth = score_from_tree.get_total_depth_of_tree(topXml, depth, rootName)
                        print('depth is ' + str(depth))
                        curMusicXml.show()
                        for d in reversed(range(0, depth - 1)):
                            pitch_refs = score_from_tree.gather_note_refs_of_depth(topXml, [], rootName, d, 0)
                            pitch_refs.sort(key=music_grammar.pitchRefToNum)
                            melody_of_depth = score_from_tree.pitch_refs_to_notes(pitch_refs, curMusicXml)
                            melody_of_depth.show()
                            print(pitch_refs)
                        continue

                    parseTreeObjAfterComparison = copy.deepcopy(parseTreeObj)
                    numProductions = len(solutionTreeObj.productions())
                    foldProductions += numProductions
                    bottomCorrect, bottomCorrectNonN = validate_tree.compareTreesBottomUp(solutionTreeObj, parseTreeObjAfterComparison)
                    if bottomCorrect / numProductions > worstPercentage:  # and bottomCorrect / numProductions < bestPercentage:
                        bestSolutionFiles.append(filename)
                        if filename in goodOnesPR and False:
                            print(filename)
                            print(parseTreeObj.leaves())
                            solutionTreeObj.draw()
                            parseTreeObj.draw()
                            parseTreeObjAfterComparison.draw()
                        # bestPercentage = bottomCorrect / numProductions
                    # if bottomCorrect / numProductions < worstPercentage:
                    #     worstSolutionFile = filename
                    #     worstPercentage = bottomCorrect / numProductions
                    foldLeaves = len(solutionTreeObj.leaves())
                    foldCorrect += bottomCorrect
                    foldCorrectNonN += bottomCorrectNonN

            totalProductions += foldProductions
            totalLeaves += foldLeaves
            totalCorrect += foldCorrect
            totalCorrectNonN += foldCorrectNonN

            foldPercentageCorrect = -1
            foldPercentageCorrectNonN = -1
            foldPercentageLeaves = -1
            if foldProductions > 0:
                foldPercentageCorrect = foldCorrect / foldProductions
                foldPercentageCorrectNonN = foldCorrectNonN / foldProductions
                foldPercentageLeaves = foldLeaves / foldProductions
            print("Fold number " + str(fold) + " results:\nfoldCorrect: " + str(foldCorrect) +
                  "\nfoldPercentageCorrect: " + str(foldPercentageCorrect) +
                  "\nfoldCorrectNonN: " + str(foldCorrectNonN) +
                  "\nfoldPercentageCorrectNonN: " + str(foldPercentageCorrectNonN) +
                  "\nfoldProductions: " + str(foldProductions) +
                  "\nfoldLeaves: " + str(foldLeaves) +
                  "\nfoldPercentageLeaves: " + str(foldPercentageLeaves))
            print("Best: " + str(bestSolutionFiles) + ', ' + str(bestPercentage))
            print("Worst: " + worstSolutionFile + ', ' + str(worstPercentage) + "\n")

        percentageCorrect = -1
        percentageCorrectNonN = -1
        percentageLeaves = -1
        if totalProductions > 0:
            percentageCorrect = totalCorrect / totalProductions
            percentageCorrectNonN = totalCorrectNonN / totalProductions
            percentageLeaves = totalLeaves / totalProductions
        print("results:\ntotalCorrect: " + str(totalCorrect) +
              "\npercentageCorrect: " + str(percentageCorrect) +
              "\ntotalCorrectNonN: " + str(totalCorrectNonN) +
              "\npercentageCorrectNonN: " + str(percentageCorrectNonN) +
              "\ntotalProductions: " + str(totalProductions) +
              "\ntotalLeaves: " + str(totalLeaves) +
              "\npercentageLeaves: " + str(percentageLeaves) + "\n")
        # finish this case
        return

    # cross-validate
    crossVal(args)
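# main() above calls removeProbability(), which is not shown. Judging from the
# regex used in the same function, a stringified ProbabilisticTree ends in a
# "(p=...)" annotation; this hypothetical sketch strips it so that
# Tree.fromstring() can parse the remainder:
import re

def removeProbability(treeStr):
    # Drop the trailing "(p=...)" annotation, if present.
    match = re.search(r'\(p=[^()]*\)', treeStr)
    return treeStr[:match.start()] if match else treeStr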
def parseAllTestXmls(fileList, grammar, allTestSolutionsDict, verbose=False, displayTrees=False):
    testPitchLists = []
    testIntervalLists = []
    totalCorrect = 0
    totalCorrectNonN = 0
    totalProductions = 0
    totalLeaves = 0
    parseTreeStrings = {}
    for filepath in fileList:
        curPitchList = getPitchListFromFile(filepath)
        testPitchLists.append(curPitchList)
        testIntervalLists.append(getIntervalStringsFromPitchList(curPitchList, verbose))
        if verbose:
            print(testIntervalLists[-1])
        listLen = len(testIntervalLists[-1])

        parser = ViterbiParser(grammar)
        if verbose:
            parser.trace(0)  # 3
        else:
            parser.trace(0)
        parses = []  # stays empty if parsing raises
        try:
            parses = parser.parse_all(testIntervalLists[-1])
        except Exception as errorMsg:
            print("error parsing file " + filepath)
            print(errorMsg)
        numTrees = sum(1 for _ in parses)
        if numTrees > 0 and displayTrees == True:
            from nltk.draw.tree import draw_trees
            draw_trees(*parses)
        if numTrees == 0:
            print("Couldn't find a valid parse, this is bad, very very bad")
            # return the same five-value shape the callers unpack
            return 0, 0, 0, 0, {}

        numCorrect = 0
        numCorrectNonN = 0
        bottomCorrect = 0
        bottomCorrectNonN = 0
        solutionTree = None
        try:
            solutionTreeStr = allTestSolutionsDict[filepath]
            solutionTree = Tree.fromstring(solutionTreeStr)
        except Exception as errorMsg:
            print("couldn't find solution for file " + filepath)
            print(errorMsg)
        if solutionTree is not None and solutionTree != '':
            parseTreeStrings[filepath] = str(parses[0])
            numCorrect, numCorrectNonN = validate_tree.compareTrees(solutionTree, parses[0])
            numProductions = len(solutionTree.productions())
            totalProductions += numProductions
            # solutionTree.draw()
            # parses[0].draw()
            bottomCorrect, bottomCorrectNonN = validate_tree.compareTreesBottomUp(solutionTree, parses[0])
            parseTreeStrings[filepath + '_afterComparison'] = str(parses[0])
            totalLeaves += len(solutionTree.leaves())
            # parses[0].draw()
            totalCorrect += bottomCorrect
            totalCorrect += numCorrect
            totalCorrectNonN += numCorrectNonN
            totalCorrectNonN += bottomCorrectNonN
    return totalCorrect, totalCorrectNonN, totalProductions, totalLeaves, parseTreeStrings
### try RandomChartParser, UnsortedChartParser, LongestChartParser

# In[ ]:

parser = nltk.parse.EarleyChartParser(grammar)
for t in parser.parse(tokens):
    print(t)

# In[ ]:

### CYK parser gets the most probable parse
from nltk.parse import ViterbiParser

parser = ViterbiParser(grammar)
parser.trace(3)
parsed_sent = parser.parse_all(tokens)  # parse_all() already returns a list
parsed_sent[0].draw()
for t in parsed_sent:
    print(t)

# In[ ]:

### CYK parser gets the most probable parse
from nltk.parse import ViterbiParser

parser = ViterbiParser(grammar)
parser.trace(3)
parsed_sent = list(parser.parse(tokens))  # parse() returns an iterator, so convert it
parsed_sent[0].draw()
for t in parsed_sent:
    print(t)
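# The difference between the last two cells is easy to mix up: parse_all()
# already returns a list, while parse() returns a lazy iterator that must be
# materialized before indexing. A tiny self-contained check (the one-rule
# grammar below is just a placeholder):
from nltk.grammar import PCFG
from nltk.parse import ViterbiParser

toy = PCFG.fromstring("S -> 'a' [1.0]")
toy_parser = ViterbiParser(toy)

print(toy_parser.parse_all(['a']))    # plain list of trees
print(list(toy_parser.parse(['a'])))  # iterator, materialized here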
class MyViterbi:
    # init
    # create the object
    # param: void
    # return: void
    def __init__(self):
        self.wordToTags = defaultdict(set)
        convertedTaggedWords = [(w, nltk.tag.mapping.map_tag('en-ptb', 'universal', t))
                                for w, t in treebank.tagged_words()]
        for word, tag in convertedTaggedWords:
            self.wordToTags[word].add(tag)

        productions = list()
        S = nltk.Nonterminal('S')
        for tree in treebank.parsed_sents():
            productions += tree.productions()

        # create the grammar
        pcfg = nltk.induce_pcfg(S, productions)
        # print(pcfg)
        self.viterb = ViterbiParser(pcfg)
        self.mostRecentTree = None

        # pos tags (Penn Treebank; RB/RBR/RBS are the adverb tags, which the
        # original listed as the nonexistent PR/PBR/PBS)
        self.validPosTags = {
            "CC", "CD", "DT", "EX", "FW", "IN", "JJ", "JJR", "JJS", "LS",
            "MD", "NN", "NNS", "NNP", "NNPS", "PDT", "POS", "PRP", "PRP$",
            "RB", "RBR", "RBS", "RP", "SYM", "TO", "UH",
            "VB", "VBZ", "VBP", "VBD", "VBG",
            "WDT", "WP", "WP$", "WRB",
            ".", ",", ":", "(", ")",
        }

        # chunk tags
        self.validChunkTags = {"NP", "PP", "VP", "ADVP", "ADJP",
                               "SBAR", "PRT", "INTJ", "PNP"}

        # IOB tags
        self.validIOBTags = {"I-", "O-", "B-"}

        # relation tags
        self.relationTags = {"SBJ", "OBJ", "PRD", "TMP", "CLR",
                             "LOC", "DIR", "EXT", "PRP"}

        # anchor tags
        self.anchorTags = {"A1", "P1"}

    # parse
    # returns a parse tree corresponding to the given string
    #
    # param:
    #   x : the string to be parsed
    # return:
    #   the parse tree corresponding to x
    def parse(self, x):
        tokenizedSent = nltk.word_tokenize(x)
        trees = self.viterb.parse_all(tokenizedSent)
        # save the first one and then return it
        self.mostRecentTree = trees[0]
        return self.mostRecentTree

    # lastparse_label
    # returns all subtrees that have the given label for the root for the last
    # generated tree
    # param:
    #   x : the label
    # return:
    #   a list of all subtrees that have x as the label of the root
    def lastparse_label(self, x):
        # see if a previous tree exists
        if self.mostRecentTree is None:
            raise RuntimeError("No previous tree exists")
        # see if it is a POS tag
        if x not in self.validPosTags:
            # if not, see if it is a chunk tag
            stringParts = x.split("-")
            if len(stringParts) == 2 and stringParts[1] not in self.relationTags:
                raise RuntimeError("Invalid relation label")
            if stringParts[0] not in self.validChunkTags:
                raise RuntimeError("Invalid tag")
        return [subtree for subtree in self.mostRecentTree.subtrees(lambda t: t.label() == x)]

    def lastparse_phrase(self, x):
        # find all subtrees of a certain type
        # see if a previous tree exists
        if self.mostRecentTree is None:
            raise RuntimeError("No previous tree exists")
        if x not in self.validChunkTags:
            raise RuntimeError("not a valid type of chunk")
        return [subtree for subtree in self.mostRecentTree.subtrees(lambda t: x in t.label())]

    # lastparse_height
    # returns the height of the tree that was just generated
    #
    # return: the height of the tree
    def lastparse_height(self):
        # see if a previous tree exists
        if self.mostRecentTree is None:
            raise RuntimeError("No previous tree exists")
        return self.mostRecentTree.height()

    # wordsFromChunks
    # helper function for taking the trees given and turning them into lists of words
    def wordsFromChunks(self, label, alternateMode=False):
        chunks = self.lastparse_phrase(label) if alternateMode else self.lastparse_label(label)
        returnList = list()
        for chunk in chunks:
            temp = chunk.pos()
            returnList.append([word for word, pos in temp])
        return returnList

    # lastparse_nounphrase
    # returns all noun phrases of the most recently generated tree
    # return:
    #   all noun phrases
    def lastparse_nounphrase(self):
        return self.wordsFromChunks("NP", True)

    # lastparse_verbphrase
    # returns all verb phrases of the most recently generated tree
    # return:
    #   all verb phrases
    def lastparse_verbphrase(self):
        return self.wordsFromChunks("VP", True)

    # lastparse_verbs
    # returns all verbs of the most recently generated tree
    # return:
    #   all verbs
    def lastparse_verbs(self):
        result = []
        verbList = ['VB', 'VBZ', 'VBP', 'VBD', 'VBG']
        for tag in verbList:
            for words in self.wordsFromChunks(tag):
                result.append(words)
        return result

    # lastparse_nouns
    # returns all nouns of the most recently generated tree
    # return:
    #   all nouns
    def lastparse_nouns(self):
        result = []
        nounList = ['NN', 'NNS', 'NNP', 'NNPS']
        for tag in nounList:
            for words in self.wordsFromChunks(tag):
                result.append(words)
        return result

    def parse_with_substitution(self, x):
        tokenizedSent = nltk.word_tokenize(x)
        posTags = nltk.pos_tag(tokenizedSent, tagset='universal')
        fixedSentence = list()
        for word, tag in posTags:
            if tag in self.wordToTags[word]:
                fixedSentence.append(word)
            else:
                # substitute a treebank word that can carry the required tag
                for candidate, tags in self.wordToTags.items():
                    if tag in tags:
                        fixedSentence.append(candidate)
                        break
        print(fixedSentence)
        trees = self.viterb.parse_all(fixedSentence)
        # save the first one and then return it
        self.mostRecentTree = trees[0]
        return self.mostRecentTree
def demo():
    """
    A demonstration of the probabilistic parsers.  The user is
    prompted to select which demo to run, and how many parses should
    be found; and then each parser is run on the same demo, and a
    summary of the results are displayed.
    """
    import sys
    import time
    from functools import reduce

    from nltk import tokenize
    from nltk.grammar import toy_pcfg1, toy_pcfg2
    from nltk.parse import ViterbiParser

    # Define two demos.  Each demo has a sentence and a grammar.
    demos = [
        ("I saw the man with my telescope", toy_pcfg1),
        ("the boy saw Jack with Bob under the table with a telescope", toy_pcfg2),
    ]

    # Ask the user which demo they want to use.
    print()
    for i in range(len(demos)):
        print(f"{i + 1:>3}: {demos[i][0]}")
        print("     %r" % demos[i][1])
    print()
    print("Which demo (%d-%d)? " % (1, len(demos)), end=" ")
    try:
        snum = int(sys.stdin.readline().strip()) - 1
        sent, grammar = demos[snum]
    except (ValueError, IndexError):
        print("Bad sentence number")
        return

    # Tokenize the sentence.
    tokens = sent.split()

    parser = ViterbiParser(grammar)
    all_parses = {}

    print(f"\nsent: {sent}\nparser: {parser}\ngrammar: {grammar}")
    parser.trace(3)
    t = time.time()
    parses = parser.parse_all(tokens)
    elapsed = time.time() - t  # do not shadow the `time` module
    average = (reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
               if parses else 0)
    num_parses = len(parses)
    for p in parses:
        all_parses[p.freeze()] = 1

    # Print some summary statistics
    print()
    print("Time (secs)   # Parses   Average P(parse)")
    print("-----------------------------------------")
    print("%11.4f%11d%19.14f" % (elapsed, num_parses, average))
    parses = list(all_parses.keys())
    if parses:
        p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
    else:
        p = 0
    print("------------------------------------------")
    print("%11s%11d%19.14f" % ("n/a", len(parses), p))

    # Ask the user if we should draw the parses.
    print()
    print("Draw parses (y/n)? ", end=" ")
    if sys.stdin.readline().strip().lower().startswith("y"):
        from nltk.draw.tree import draw_trees

        print("  please wait...")
        draw_trees(*parses)

    # Ask the user if we should print the parses.
    print()
    print("Print parses (y/n)? ", end=" ")
    if sys.stdin.readline().strip().lower().startswith("y"):
        for parse in parses:
            print(parse)