Example 1
def test_sentences(grammar):
    # `test` is assumed to be a module-level list of treebank file IDs
    for t in test:
        print("Processing: " + str(t))
        reference = list(treebank.tagged_words(t))

        tokens = list(treebank.words(t))

        print("fixing grammar.....")
        # Checks if grammar covers all words in the sentence and adds them to the grammar if necessary
        fixed_grammar = get_fixed_grammer(grammar, tokens)

        print("fixed grammar")
        print("Building Parser....")
        parser = ViterbiParser(fixed_grammar)

        print("Parsing...")
        #Gets list of all possible trees, the most likely tree is at index 0
        start = time.time()
        parses = parser.parse_all(tokens)
        print("Time")
        print(time.time() - start)

        #Getting POS tags from parser tree
        leafs = parses[0].pos()

        #Calculating accuracy of Parser results
        correct_tags = 0.0
        for i in range(len(leafs)):
            if leafs[i] == reference[i]:
                correct_tags += 1.0


        print(str(correct_tags/len(leafs)))
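The helper `get_fixed_grammer` is not shown anywhere in this example. A minimal sketch of what it might do, assuming uncovered tokens are added as noun productions and the PCFG is re-induced from the combined production list (both assumptions; this also flattens the original rule weights rather than re-normalizing them):

from nltk import Nonterminal, induce_pcfg
from nltk.grammar import Production

def get_fixed_grammer(grammar, tokens):
    # Hypothetical sketch: collect tokens the grammar cannot generate.
    # CFG.check_coverage raises ValueError for uncovered words.
    missing = []
    for token in set(tokens):
        try:
            grammar.check_coverage([token])
        except ValueError:
            missing.append(token)
    if not missing:
        return grammar
    # Add each missing token as an NN production (assumed fallback tag)
    # and re-induce the PCFG over all productions.
    productions = list(grammar.productions())
    productions += [Production(Nonterminal('NN'), [t]) for t in missing]
    return induce_pcfg(grammar.start(), productions)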
Example 2
def main():
    # print(nltk.corpus.treebank.parsed_sents('wsj_0001.mrg')[0])
    # nltk.corpus.treebank.parsed_sents('wsj_0001.mrg')[0].draw()

    # print("Induce PCFG grammar from treebank data:")
    #
    productions = []
    print(len(treebank.fileids()))
    for item in treebank.fileids():  # go through all trees
        for tree in treebank.parsed_sents(item):
            # perform optional tree transformations, e.g.:
            tree.collapse_unary(collapsePOS=False)  # collapse unary chains A-B-C into A-B+C
            tree.chomsky_normal_form(horzMarkov=2)  # binarize, e.g. A -> B C D into A -> B C+D, C+D -> C D
            productions += tree.productions()
    # #
    # # print(type(productions[0]))
    # #
    S = Nonterminal('S')
    grammar = induce_pcfg(S, productions)
    # # # print(grammar)    # This is a PCFG
    # pickle.dump(grammar, open("tbank-grammar.p", "wb"))
    # t = time.time()
    # grammar = pickle.load(open("tbank-grammar.p", "rb"))
    # textf = open("lexicon.txt", "w")
    # n = textf.write(str(reduce(lambda a, b: a + "\n" + b, list(filter(lambda x: "'" in x, str(grammar).split("\n"))))))
    # textf.close()
    # print(time.time()-t)
    parser = ViterbiParser(grammar)
    # pickle.dump(parser, open("cky-parser.p", "wb"))
    # parser = pickle.load(open("cky-parser.p", "rb"))
    parser.trace(0)
    sent = "John will join the board"
    tokens = sent.split()

    try:
        grammar.check_coverage(tokens)
        print("All words covered")
        parses = parser.parse_all(tokens)
        if parses:
            lp = len(parses)
            print(lp)
            print(parses[0].label())
            # parses[0].draw()
            p = sum((t.prob() for t in parses if t.label() == 'S'), 0.0)
        else:
            p = 0

        print("Probability:", p)
    except ValueError:
        print("Some words not covered")
Example 3
def Parser_Section():
    demos = [('I saw John through the telescope', toy_pcfg1)]
    sent, grammar = demos[0]
    # print(grammar)

    # Tokenize the sentence.
    tokens = sent.split()
    parser = ViterbiParser(grammar)

    parser.trace(0) # Use this to change verbosity
    t = time.time()
    parses = parser.parse_all(tokens)
    print("Time:", time.time()-t)

    if parses:
        lp = len(parses)
        p = sum(parse.prob() for parse in parses)
    else:
        p = 0

    print("Probability:", p)
Example 4
def demo():
    """
    A demonstration of the probabilistic parsers.  The user is
    prompted to select which demo to run, and how many parses should
    be found; and then each parser is run on the same demo, and a
    summary of the results are displayed.
    """
    import sys, time
    from functools import reduce

    from nltk import tokenize
    from nltk.parse import ViterbiParser
    from nltk.grammar import toy_pcfg1, toy_pcfg2

    # Define two demos.  Each demo has a sentence and a grammar.
    demos = [
        ('I saw the man with my telescope', toy_pcfg1),
        ('the boy saw Jack with Bob under the table with a telescope',
         toy_pcfg2),
    ]

    # Ask the user which demo they want to use.
    print()
    for i in range(len(demos)):
        print('%3s: %s' % (i + 1, demos[i][0]))
        print('     %r' % demos[i][1])
        print()
    print('Which demo (%d-%d)? ' % (1, len(demos)), end=' ')
    try:
        snum = int(sys.stdin.readline().strip()) - 1
        sent, grammar = demos[snum]
    except (ValueError, IndexError):
        print('Bad sentence number')
        return

    # Tokenize the sentence.
    tokens = sent.split()

    parser = ViterbiParser(grammar)
    all_parses = {}

    print('\nsent: %s\nparser: %s\ngrammar: %s' % (sent, parser, grammar))
    parser.trace(3)
    t = time.time()
    parses = parser.parse_all(tokens)
    elapsed = time.time() - t
    average = (reduce(lambda a, b: a + b.prob(), parses, 0) /
               len(parses) if parses else 0)
    num_parses = len(parses)
    for p in parses:
        all_parses[p.freeze()] = 1

    # Print some summary statistics
    print()
    print('Time (secs)   # Parses   Average P(parse)')
    print('-----------------------------------------')
    print('%11.4f%11d%19.14f' % (elapsed, num_parses, average))
    parses = all_parses.keys()
    if parses:
        p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
    else:
        p = 0
    print('------------------------------------------')
    print('%11s%11d%19.14f' % ('n/a', len(parses), p))

    # Ask the user if we should draw the parses.
    print()
    print('Draw parses (y/n)? ', end=' ')
    if sys.stdin.readline().strip().lower().startswith('y'):
        from nltk.draw.tree import draw_trees

        print('  please wait...')
        draw_trees(*parses)

    # Ask the user if we should print the parses.
    print()
    print('Print parses (y/n)? ', end=' ')
    if sys.stdin.readline().strip().lower().startswith('y'):
        for parse in parses:
            print(parse)
Example 5
def demo():
    """
    A demonstration of the probabilistic parsers.  The user is
    prompted to select which demo to run, and how many parses should
    be found; and then each parser is run on the same demo, and a
    summary of the results are displayed.
    """
    import sys, time
    from nltk import tokenize
    from nltk.parse import ViterbiParser
    from nltk.grammar import toy_pcfg1, toy_pcfg2
    from nltk.draw.tree import draw_trees
    from nltk import Tree
    from nltk.draw.util import CanvasFrame
    from nltk.draw import TreeWidget

    # Define two demos.  Each demo has a sentence and a grammar.
    # demos = [('move the green sphere to the bottom left corner', learned_pcfg),
    #          ('move the green ball over the red block', learned_pcfg),
    #          ('take the green pyramid and put it in the top left corner', learned_pcfg),
    #           ('put the green pyramid on the red block', learned_pcfg),
    #           ('move the red cylinder and place it on top of the blue cylinder that is on top of a green cylinder', learned_pcfg),]

    # Ask the user which demo they want to use.
    # print()
    # for i in range(len(demos)):
    #     print('%3s: %s' % (i+1, demos[i][0]))
    #     print('     %r' % demos[i][1])
    #     print()
    # print('Which demo (%d-%d)? ' % (1, len(demos)), end=' ')
    # try:
    #     snum = int(sys.stdin.readline().strip())-1
    #     sent, grammar = demos[snum]
    # except:
    #     print('Bad sentence number')
    #     return

    max_scene = 1

    # zero-pad the scene number to five digits
    sc = str(max_scene).zfill(5)

    g = 'grammar_'+sc+'.txt'
    learned_pcfg = load('/home/omari/Dropbox/robot_modified/AR/grammar/'+g)  # `load` is a project-specific helper
    grammar = learned_pcfg

    with open('/home/omari/Dropbox/robot_modified/AR/hypotheses/matched_commands.txt', 'r') as file1:
        g1 = file1.readlines()
    for line in g1:
        # each line is assumed to look like "<scene>-<sent_num>-<sentence>"
        fields = line.rstrip('\n').split('-')
        scene, sent_num, sent = fields[0], fields[1], fields[-1]
        print(line)
        if scene == '239' and sent_num == '0':
            continue


        # Tokenize the sentence.
        tokens = sent.split()

        parser = ViterbiParser(grammar)
        all_parses = {}

        # print('\nsent: %s\nparser: %s\ngrammar: %s' % (sent,parser,grammar))
        parser.trace(3)
        parses = parser.parse_all(tokens)
        average = (reduce(lambda a,b:a+b.prob(), parses, 0)/len(parses)
                   if parses else 0)
        num_parses = len(parses)
        for p in parses:
            all_parses[p.freeze()] = 1

        # Print some summary statistics
        # print()
        # print('Time (secs)   # Parses   Average P(parse)')
        # print('-----------------------------------------')
        # print('%11.4f%11d%19.14f' % (time, num_parses, average))
        parses = all_parses.keys()
        if parses:
            p = reduce(lambda a,b:a+b.prob(), parses, 0)/len(parses)
        else: p = 0
        # print('------------------------------------------')
        # print('%11s%11d%19.14f' % ('n/a', len(parses), p))

        # Ask the user if we should draw the parses.
        # print()
        # print('Draw parses (y/n)? ', end=' ')
        # if sys.stdin.readline().strip().lower().startswith('y'):

        #     print('  please wait...')
        # draw_trees(*parses)

        cf = CanvasFrame()
        # t = Tree(parses)
        t = Tree.fromstring('(S  (CH_POS_PREPOST move)  (PRE_POST    (PRE      (the the)      (_entity (F_HSV green) (F_SHAPE sphere)))    (PREPOST_connect (to to) (the the))    (POST      (_F_POS (F_POS (_bottom_left (bottom bottom) (left left)))) (corner corner))))')

        tc = TreeWidget(cf.canvas(), t, draggable=1,
                        node_font=('helvetica', -14),
                        leaf_font=('helvetica', -12),
                        roof_fill='white', roof_color='black',
                        leaf_color='green4', node_color='blue4')
        cf.add_widget(tc,10,10)

        # tc = TreeWidget(cf.canvas(),t)
        # cf.add_widget(tc,10,10) # (10,10) offsets
        cf.print_to_file('/home/omari/Dropbox/robot_modified/trees/scene-'+scene+'-'+sent_num+'.ps')
        cf.destroy()
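`Tree.fromstring`, used above to build the widget's tree, parses the standard bracketed tree notation; a minimal illustration:

from nltk import Tree

t = Tree.fromstring('(S (NP I) (VP (V saw) (NP him)))')
print(t.leaves())  # ['I', 'saw', 'him']
print(t.label())   # S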
Example 6
def demo():
    """
    A demonstration of the probabilistic parsers.  The user is
    prompted to select which demo to run, and how many parses should
    be found; and then each parser is run on the same demo, and a
    summary of the results are displayed.
    """
    import sys, time
    from functools import reduce

    from nltk import tokenize
    from nltk.parse import ViterbiParser
    from nltk.grammar import toy_pcfg1, toy_pcfg2

    # Define two demos.  Each demo has a sentence and a grammar.
    demos = [
        ('I saw the man with my telescope', toy_pcfg1),
        ('the boy saw Jack with Bob under the table with a telescope', toy_pcfg2),
    ]

    # Ask the user which demo they want to use.
    print()
    for i in range(len(demos)):
        print('%3s: %s' % (i + 1, demos[i][0]))
        print('     %r' % demos[i][1])
        print()
    print('Which demo (%d-%d)? ' % (1, len(demos)), end=' ')
    try:
        snum = int(sys.stdin.readline().strip()) - 1
        sent, grammar = demos[snum]
    except (ValueError, IndexError):
        print('Bad sentence number')
        return

    # Tokenize the sentence.
    tokens = sent.split()

    parser = ViterbiParser(grammar)
    all_parses = {}

    print('\nsent: %s\nparser: %s\ngrammar: %s' % (sent, parser, grammar))
    parser.trace(3)
    t = time.time()
    parses = parser.parse_all(tokens)
    elapsed = time.time() - t
    average = (
        reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses) if parses else 0
    )
    num_parses = len(parses)
    for p in parses:
        all_parses[p.freeze()] = 1

    # Print some summary statistics
    print()
    print('Time (secs)   # Parses   Average P(parse)')
    print('-----------------------------------------')
    print('%11.4f%11d%19.14f' % (elapsed, num_parses, average))
    parses = all_parses.keys()
    if parses:
        p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
    else:
        p = 0
    print('------------------------------------------')
    print('%11s%11d%19.14f' % ('n/a', len(parses), p))

    # Ask the user if we should draw the parses.
    print()
    print('Draw parses (y/n)? ', end=' ')
    if sys.stdin.readline().strip().lower().startswith('y'):
        from nltk.draw.tree import draw_trees

        print('  please wait...')
        draw_trees(*parses)

    # Ask the user if we should print the parses.
    print()
    print('Print parses (y/n)? ', end=' ')
    if sys.stdin.readline().strip().lower().startswith('y'):
        for parse in parses:
            print(parse)
Example 7
    # them in the test file
    tokens = treebank[i][0].leaves()
    sentence = u" ".join(tokens)
    test_file.write((sentence + u"\n").encode('utf-8'))  # assumes the file was opened in binary mode

    print('parsing :', sentence)
    # we will use lexicon knowledge to replace the
    # unknown word in order to do a parsing with large corpus
    # of unknown words
    unknowns = check_unknown_words(tokens, grammar)
    if len(unknowns) > 0:
        grammar = update_grammar(productions, unknowns)
        parser = ViterbiParser(grammar)
        parser.trace(2)

    parses = parser.parse_all(tokens)
    if len(parses) > 0:
        parse = parses[0]
    else:
        parse = ""
    test_output_file.write(" ".join(parse.__str__().replace("\n", '').split()))
    test_output_file.write('\n')

    parses_bank.append(parse)

test_file.close()
test_output_file.close()

if args.run_in_shell != "True":
    sys.exit(0)
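`check_unknown_words` and `update_grammar` are project-specific helpers that are not shown. A plausible sketch of the first, inferred from how it is used above (an assumption):

def check_unknown_words(tokens, grammar):
    # Collect tokens the grammar cannot generate; PCFG.check_coverage
    # raises ValueError when a word has no lexical rule.
    unknowns = []
    for token in tokens:
        try:
            grammar.check_coverage([token])
        except ValueError:
            unknowns.append(token)
    return unknowns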
Example 8
def demo():
    """
    A demonstration of the probabilistic parsers.  The user is
    prompted to select which demo to run, and how many parses should
    be found; and then each parser is run on the same demo, and a
    summary of the results are displayed.
    """
    import sys
    import time
    from functools import reduce

    from nltk import tokenize
    from nltk.grammar import PCFG
    from nltk.parse import ViterbiParser

    toy_pcfg1 = PCFG.fromstring("""
    S -> NP VP [1.0]
    NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15]
    Det -> 'the' [0.8] | 'my' [0.2]
    N -> 'man' [0.5] | 'telescope' [0.5]
    VP -> VP PP [0.1] | V NP [0.7] | V [0.2]
    V -> 'ate' [0.35] | 'saw' [0.65]
    PP -> P NP [1.0]
    P -> 'with' [0.61] | 'under' [0.39]
    """)

    toy_pcfg2 = PCFG.fromstring("""
    S    -> NP VP         [1.0]
    VP   -> V NP          [.59]
    VP   -> V             [.40]
    VP   -> VP PP         [.01]
    NP   -> Det N         [.41]
    NP   -> Name          [.28]
    NP   -> NP PP         [.31]
    PP   -> P NP          [1.0]
    V    -> 'saw'         [.21]
    V    -> 'ate'         [.51]
    V    -> 'ran'         [.28]
    N    -> 'boy'         [.11]
    N    -> 'cookie'      [.12]
    N    -> 'table'       [.13]
    N    -> 'telescope'   [.14]
    N    -> 'hill'        [.5]
    Name -> 'Jack'        [.52]
    Name -> 'Bob'         [.48]
    P    -> 'with'        [.61]
    P    -> 'under'       [.39]
    Det  -> 'the'         [.41]
    Det  -> 'a'           [.31]
    Det  -> 'my'          [.28]
    """)

    # Define two demos.  Each demo has a sentence and a grammar.
    demos = [
        ("I saw the man with my telescope", toy_pcfg1),
        ("the boy saw Jack with Bob under the table with a telescope",
         toy_pcfg2),
    ]

    # Ask the user which demo they want to use.
    print()
    for i in range(len(demos)):
        print(f"{i + 1:>3}: {demos[i][0]}")
        print("     %r" % demos[i][1])
        print()
    print("Which demo (%d-%d)? " % (1, len(demos)), end=" ")
    try:
        snum = int(sys.stdin.readline().strip()) - 1
        sent, grammar = demos[snum]
    except (ValueError, IndexError):
        print("Bad sentence number")
        return

    # Tokenize the sentence.
    tokens = sent.split()

    parser = ViterbiParser(grammar)
    all_parses = {}

    print(f"\nsent: {sent}\nparser: {parser}\ngrammar: {grammar}")
    parser.trace(3)
    t = time.time()
    parses = parser.parse_all(tokens)
    elapsed = time.time() - t
    average = (reduce(lambda a, b: a + b.prob(), parses, 0) /
               len(parses) if parses else 0)
    num_parses = len(parses)
    for p in parses:
        all_parses[p.freeze()] = 1

    # Print some summary statistics
    print()
    print("Time (secs)   # Parses   Average P(parse)")
    print("-----------------------------------------")
    print("%11.4f%11d%19.14f" % (time, num_parses, average))
    parses = all_parses.keys()
    if parses:
        p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
    else:
        p = 0
    print("------------------------------------------")
    print("%11s%11d%19.14f" % ("n/a", len(parses), p))

    # Ask the user if we should draw the parses.
    print()
    print("Draw parses (y/n)? ", end=" ")
    if sys.stdin.readline().strip().lower().startswith("y"):
        from nltk.draw.tree import draw_trees

        print("  please wait...")
        draw_trees(*parses)

    # Ask the user if we should print the parses.
    print()
    print("Print parses (y/n)? ", end=" ")
    if sys.stdin.readline().strip().lower().startswith("y"):
        for parse in parses:
            print(parse)
Example 9
# Tokenize the sentence.
#tokens = sent.split()
tokens = sys.argv[1:]

# Define a list of parsers.  We'll use all parsers.
parser = ViterbiParser(grammar)

print('Coverage of input words by a grammar:\n')
change_words = []
for i, ele in enumerate(tokens):
    try:
        grammar.check_coverage([ele])
    except ValueError:
        print("%s is not covered by the grammar. Replacing it with 'UNK'\n" %
              ele)
        change_words.append(tokens[i])
        tokens[i] = 'UNK'
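# Note: replacing out-of-vocabulary words with 'UNK' only works if the
# grammar itself has productions generating the 'UNK' terminal (e.g. a
# rule like N -> 'UNK'); otherwise the parse below will still fail.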

trees = parser.parse_all(tokens)

# the most probable parse comes first
UNK_str = str(trees[0])
answer = UNK_str

for i in change_words:
    answer = answer.replace("UNK", i, 1)
print("\nTree is:\n\n")
print(answer)
Example 10
def main():
	parser = argparse.ArgumentParser()
	parser.add_argument("directory", help="Directory that contains melody files")
	parser.add_argument("solutions", help="Directory that contains the solution files")
	parser.add_argument("-g", "--grammar", help="The file that specifies a saved grammar, this grammar will be used instead of training a new one")
	parser.add_argument("-f", "--folds", help="number of folds desired")
	parser.add_argument("-o", "--outfile", help="The file that the grammar will be saved in")
	parser.add_argument("-t", "--type", help="The type of solutions file we're using, either 'PR' or 'TS' for Prolongational Reduction or Time-span Tree, respectively")
	parser.add_argument("-v", "--verbose", help="increase output verbosity")
	args = parser.parse_args()
	print(args)


	if args.verbose is None or args.verbose == 'False':
		args.verbose = False
	elif args.verbose == 'True':
		args.verbose = True

	#If the grammar is specified, then the "directory" folder will be used as a test set
	#If the grammar is not specified, it will use 20% of the files in "directory" as a test set, and the rest as a training set to create the grammar
	#If folds are specified, then it will split the files in "directory" into numFolds groups, applying training and testing, and then adding up the percentages overall
	allProductions = []
	stats = True
	if args.grammar:
		if not os.path.isfile(args.grammar):
			return
		f = open(args.grammar, 'r')
		for line in f.readlines():
			allProductions.append(line)
		S = Nonterminal('S')
		smallTrees = music_grammar.collectProductions(allProductions, args.verbose)
		trainedGrammar = induce_pcfg(S, smallTrees)
		print(trainedGrammar)
		np_productions = trainedGrammar.productions(Nonterminal('4'))
		rhs_probs = {}
		for pr in np_productions:
			rhs_probs[pr.rhs()] = pr.prob()
		# DictionaryProbDist samples right-hand sides in proportion to rule probability
		np_probDist = DictionaryProbDist(rhs_probs)


		#Used this code for generating specific figures:
		exampleTree = Tree.fromstring('(S (N (N (5 5)) (N (m4 -4))) (N (6 6)))')
		#exampleTree.draw()
		exampleTreeToCompare = Tree.fromstring('(S (N (5 5)) (N (N (m4 -4)) (N (6 6))))')
		#exampleTreeToCompare.draw()
		validate_tree.compareTreesBottomUp(exampleTree, exampleTreeToCompare)
		#exampleTreeToCompare.draw()


		for i in range(100):
			rightHand = np_probDist.generate()
			print(rightHand)
			print(len(rightHand))

		generatedTree = pcfg_generate.generate(trainedGrammar)
		print('resulting tree: ')
		generatedTreeObj = Tree.fromstring(generatedTree)
		print(generatedTreeObj)
		print(str(generatedTreeObj.leaves()))
		print('\n\n')
		embellishedTree = pcfg_generate.expandAllLeaves(trainedGrammar, generatedTreeObj)
		print(embellishedTree)

		print(str(Tree.fromstring(str(embellishedTree)).leaves()))

		fileToTest = "./MusicXml/72_Preludes 1 La fille aux cheveux de lin.xml"
		musicXmlTest = converter.parse(fileToTest)

		curPitchList = music_grammar.getPitchListFromFile(fileToTest)

		intervalList = music_grammar.getIntervalStringsFromPitchList(curPitchList)


		with open(music_grammar.musicGrammarFilename, 'r') as f:
			musicGrammarString = f.read()
		musicGrammar = CFG.fromstring(musicGrammarString)
		parser = ChartParser(musicGrammar, trace=2)
		#parses = parser.parse(intervalList)

		print(intervalList)
		print('num intervals is: ' + str(len(intervalList)))
		#numTrees = sum(1 for _ in parses)
		#print('numTrees is: ' + str(numTrees))
		#return

		#this is for the musical examples

		trainedParser = ViterbiParser(trainedGrammar)
		parses = trainedParser.parse_all(intervalList)
		bestParse = parses[0]
		#bestParse.draw()
		treeType = bestParse.convert(ParentedTree)
		parse_depth = 0
		depth = score_from_tree.get_total_depth_of_tree_obj(bestParse, parse_depth)
		print('depth is : ' + str(depth))
		print('builtin height is : ' + str(bestParse.height()))
		print(bestParse)
		bestParse.draw()
		#score_from_tree.get_tree_obj_to_negative_depth(bestParse, 2, parse_depth)

		#prunedBestParse, removedLeaves, leafIndex= score_from_tree.remove_embellishment_rules_from_tree_below_depth(bestParse, {}, depth - 2, 0, 0)
		prunedBestParse, removedLeaves, leafIndex, maxSubtreeDepth = score_from_tree.remove_embellishment_rules_from_tree_negative_depth(bestParse, {}, 3, 0, 0)

		print(prunedBestParse)
		#for s in parentedBestParse.subtrees(lambda t: t.height() > parse_depth - 3):
			#treepos = parentedBestParse.treeposition(s)
			#parentedBestParse.remove(treepos)
		prunedBestParse.draw()
		score_from_tree.get_melody_from_parse_tree(bestParse, removedLeaves, musicXmlTest)

		PR_fileToTest = "./MusicXml/PR/PR-39_Andante C dur.xml"
		ET = ElementTree()
		ET.parse(PR_fileToTest)
		root = ET.getroot()
		rootName = args.type.lower()
		topXml = root.find(rootName)
		depth = 0
		depth = score_from_tree.get_total_depth_of_tree(topXml, depth, rootName)
		print('depth is ' + str(depth))
		musicXmlTest.show()
		for d in reversed(range(0, depth - 1)):
			pitch_refs = score_from_tree.gather_note_refs_of_depth(topXml, [], rootName, d, 0)
			pitch_refs.sort(key=music_grammar.pitchRefToNum)
			melody_of_depth = score_from_tree.pitch_refs_to_notes(pitch_refs, musicXmlTest)
			melody_of_depth.show()
			print(pitch_refs)


		#examples with 3-child nodes
		#, './MusicXml/MSC-166.xml', './MusicXml/MSC-103.xml', './MusicXml/37_Sonate fur Klavier Nr.48 C dur Op.30-1 Mov.1.xml', './MusicXml/MSC-211.xml'
		#wrong buti like it , './MusicXml/MSC-238.xml'
		#like: ./MusicXml/MSC-141.xml fold 2,
		#filesToTest = ['./MusicXml/MSC-238.xml', './MusicXml/39_Andante C dur.xml']#fold 1
		#filesToTest = ['./MusicXml/MSC-224.xml', './MusicXml/MSC-141.xml']# fold 2
		#filesToTest = ["./MusicXml/03_Bagatelle 'Fur Elise' WoO.59.xml"]#fold 3
		#filesToTest = ['./MusicXml/MSC-111.xml'] #['./MusicXml/MSC-108.xml', './MusicXml/01_Waltz in E flat Grande Valse Brillante Op.18.xml', './MusicXml/MSC-231.xml', './MusicXml/37_Sonate fur Klavier Nr.48 C dur Op.30-1 Mov.1.xml', './MusicXml/59_Schwanengesang No.1 Op.72-4 D.957-4 Standchen.xml']#fold4
		#filesToTest = ['./MusicXml/MSC-111.xml', './MusicXml/MSC-108.xml', './MusicXml/01_Waltz in E flat Grande Valse Brillante Op.18.xml', './MusicXml/MSC-231.xml', './MusicXml/59_Schwanengesang No.1 Op.72-4 D.957-4 Standchen.xml']

		#PR
		#filesToTest = ['./MusicXml/80_Symphonie Nr.40 g moll KV.550 1.Satz.xml', './MusicXml/31_Sinfonie Nr.9 d moll Op.125 4.Satz An die Freude.xml']# fold 0 PR
		#filesToTest = ['./MusicXml/34_Water Music in D major HWV 349 No.11 Alla Hornpipe.xml', './MusicXml/02_Moments Musicaux.xml']#fold 0 20%
		#filesToTest = ['./MusicXml/84_Toccata und Fuge d moll BWV565.xml'] #fold 1
		#filesToTest = ['./MusicXml/33_Swan Lake Op.20 No.9 Finale.xml', './MusicXml/40_Alpengluhen Op.193.xml']# fold 3 Pr
		#filesToTest =  ['./MusicXml/57_Waves of the Danube.xml']#, './MusicXml/60_Ma Vlast Moldau.xml']# fold 3 PR < 20%

		filesToTest = ['./MusicXml/02_Moments Musicaux.xml']#fold 4 ts
		solutionTreeDictForTestSet, testProductions = music_grammar.getAllProductions(args.directory, args.type, filesToTest, args.type, args.verbose)
		totalCorrect, totalCorrectNonN, totalProductions, totalLeaves, parseTreeStrings = music_grammar.parseAllTestXmls(filesToTest, trainedGrammar, solutionTreeDictForTestSet, args.verbose, False)

		parseFilename = "fold4_" + args.type + "_parsedTestSet.txt"
		parseFile = open(parseFilename, 'r')
		parses = json.load(parseFile)

		for filename, solutionTree in parseTreeStrings.items():
			if "_afterComparison" in filename:
				continue
			treeSolution = solutionTreeDictForTestSet[filename]
			percentageCorrect = -1
			print(filename)
			treeSolutionObj = Tree.fromstring(treeSolution)
			treeSolutionObj.draw()
			parseTreeNoProbabilities = removeProbability(str(parseTreeStrings[filename]))
			parseTreeObj = Tree.fromstring(parseTreeNoProbabilities)
			parseTreeObj.draw()
			parseAfterCompNoProbabilities = removeProbability(str(parseTreeStrings[filename+'_afterComparison']))
			parseAfterCompObj = Tree.fromstring(parseAfterCompNoProbabilities)
			parseAfterCompObj.draw()
		percentageCorrectNonN = -1
		percentageLeaves = -1
		if totalProductions > 0:
			percentageCorrect = totalCorrect / totalProductions
			percentageCorrectNonN = totalCorrectNonN / totalProductions
			percentageLeaves = totalLeaves / totalProductions
		print("results:\ntotalCorrect: " + str(totalCorrect) + "\npercentageCorrect: " + str(percentageCorrect) + "\ntotalCorrectNonN: " + str(totalCorrectNonN) + "\npercentageCorrectNonN: " + str(percentageCorrectNonN) + "\ntotalProductions: " + str(totalProductions) + "\ntotalLeaves: " + str(totalLeaves) + "\npercentageLeavess: " + str(percentageLeaves) + "\n")
		#finish this case
		return

	if stats:
		totalCorrect = 0
		totalCorrectNonN = 0
		totalProductions = 0
		totalLeaves = 0
		#./MusicXml/MSC-103.xml, ./MusicXml/24_Orphee aux Enfers Overture.xml, ./MusicXml/MSC-211.xml, ./MusicXml/39_Andante C dur.xml,./MusicXml/01_Waltz in E flat Grande Valse Brillante Op.18.xml
		#small ones
		#./MusicXml/MSC-224.xml
		#pretty good one:  ['./MusicXml/57_Waves of the Danube.xml', './MusicXml/MSC-107.xml','./MusicXml/59_Schwanengesang No.1 Op.72-4 D.957-4 Standchen.xml', './MusicXml/MSC-231.xml']
		goodOnesTS = ['./MusicXml/57_Waves of the Danube.xml', './MusicXml/MSC-107.xml','./MusicXml/59_Schwanengesang No.1 Op.72-4 D.957-4 Standchen.xml', './MusicXml/MSC-231.xml']
		goodOnesPR = ['./MusicXml/02_Moments Musicaux.xml',"./MusicXml/95_12 Variationen uber ein franzosisches Lied 'Ah,vous dirai-je, maman' C dur K.265 K6.300e.xml"]

		#music_grammar.getAllProductionsHarmonicGrammar(args.directory, args.type, [goodOnesPR[0]], args.type, "MINOR", args.verbose)
		#if stats == True:
		#	return

		num_skip = 0
		for fold in range(int(args.folds)):
			bestSolutionFiles = []
			worstSolutionFile = ""
			bestPercentage = .25
			worstPercentage = .2
			#get parses from file
			parseFilename = "fold" + str(fold) + "_" + args.type + "_parsedTestSet.txt"
			parseFile = open(parseFilename, 'r')
			parses = json.load(parseFile)
			#get solutions from file
			solutionsFilename = "fold" + str(fold) + "_" + args.type + "_testSolutions.txt"
			solutionsFile = open(solutionsFilename, 'r')
			solutions = json.load(solutionsFile)

			foldLeaves = 0
			foldProductions = 0
			foldCorrect = 0
			foldCorrectNonN = 0
			for filename, solutionTree in solutions.items():
				if parses[filename] is not None and parses[filename] != '':
					solutionTreeObj = Tree.fromstring(solutionTree)
					parseStr = parses[filename]
					probabilisticPart = re.findall(r'(\(p=[^()]*\))', parseStr)
					indexOfProbPart = parseStr.index(probabilisticPart[0])
					parseTreeObj = Tree.fromstring(parseStr[:indexOfProbPart])

					#here's where we look at example reductions in musical scores
					curMusicXml = converter.parse(filename)
					if len(curMusicXml.flat.notes) >= 15 and len(curMusicXml.flat.notes) < 20 or filename == './MusicXml/03_Bagatelle \'Fur Elise\' WoO.59.xml':
						print(filename)
						if filename != './MusicXml/03_Bagatelle \'Fur Elise\' WoO.59.xml':
							continue
						#if args.type == 'PR':
						solutionFilename = args.type + "-" + basename(filename)
						solutionFilepath = args.directory + '/' + args.type + '/' + solutionFilename
						#else:

						#	solutionFilename = args.type + "-" + basename(filename)[4:]
						#	solutionFilepath = args.directory + '/' + args.type + '/' + solutionFilename[:-4] + '_1' + solutionFilename[-4:]

						ET = ElementTree()
						ET.parse(solutionFilepath)
						root = ET.getroot()
						rootName = args.type.lower()
						topXml = root.find(rootName)
						#topXml.show()
						if num_skip > 0:
							num_skip -= 1
							continue
						parseTreeObj.draw()
						#score_from_tree.print_reductions_for_parse_tree(parseTreeObj, curMusicXml)
						depth = 0
						depth = score_from_tree.get_total_depth_of_tree(topXml, depth, rootName)
						print('depth is ' + str(depth))
						curMusicXml.show()
						for d in reversed(range(0, depth - 1)):
							pitch_refs = score_from_tree.gather_note_refs_of_depth(topXml, [], rootName, d, 0)
							pitch_refs.sort(key=music_grammar.pitchRefToNum)
							melody_of_depth = score_from_tree.pitch_refs_to_notes(pitch_refs, curMusicXml)
							melody_of_depth.show()
							print(pitch_refs)

					continue
					parseTreeObjAfterComparison = copy.deepcopy(parseTreeObj)
					numProductions = len(solutionTreeObj.productions())
					foldProductions += numProductions
					bottomCorrect, bottomCorrectNonN = validate_tree.compareTreesBottomUp(solutionTreeObj, parseTreeObjAfterComparison)

					if bottomCorrect / numProductions > worstPercentage:# and bottomCorrect / numProductions < bestPercentage:
						bestSolutionFiles.append(filename)
						if filename in goodOnesPR and False:
							print(filename)
							print(parseTreeObj.leaves())
							solutionTreeObj.draw()
							parseTreeObj.draw()
							parseTreeObjAfterComparison.draw()
						#bestPercentage = bottomCorrect / numProductions

					#if bottomCorrect / numProductions < worstPercentage:
					#	worstSolutionFile = filename
					#	worstPercentage = bottomCorrect / numProductions
					foldLeaves = len(solutionTreeObj.leaves())

					foldCorrect += bottomCorrect
					foldCorrectNonN += bottomCorrectNonN
			totalProductions += foldProductions
			totalLeaves += foldLeaves
			totalCorrect += foldCorrect
			totalCorrectNonN += foldCorrectNonN
			foldPercentageCorrect = -1
			foldPercentageCorrectNonN = -1
			foldPercentageLeaves = -1
			if foldProductions > 0:
				foldPercentageCorrect = foldCorrect / foldProductions
				foldPercentageCorrectNonN = foldCorrectNonN / foldProductions
				foldPercentageLeaves = foldLeaves / foldProductions
			print("Fold number " + str(fold) + " results:\nfoldCorrect: " + str(foldCorrect) + "\nfoldPercentageCorrect: " + str(foldPercentageCorrect) + "\nfoldCorrectNonN: " + str(foldCorrectNonN) + "\nfoldPercentageCorrectNonN: " + str(foldPercentageCorrectNonN) + "\nfoldProductions: " + str(foldProductions) + "\nfoldLeaves: " + str(foldLeaves) + "\nfoldPercentageLeaves: " + str(foldPercentageLeaves))
			print("Best: " + str(bestSolutionFiles) + ', ' + str(bestPercentage))
			print("Worst: " + worstSolutionFile + ', ' + str(worstPercentage)+ "\n")
		percentageCorrect = -1
		percentageCorrectNonN = -1
		percentageLeaves = -1
		if totalProductions > 0:
			percentageCorrect = totalCorrect / totalProductions
			percentageCorrectNonN = totalCorrectNonN / totalProductions
			percentageLeaves = totalLeaves / totalProductions
		print("results:\ntotalCorrect: " + str(totalCorrect) + "\npercentageCorrect: " + str(percentageCorrect) + "\ntotalCorrectNonN: " + str(totalCorrectNonN) + "\npercentageCorrectNonN: " + str(percentageCorrectNonN) + "\ntotalProductions: " + str(totalProductions) + "\ntotalLeaves: " + str(totalLeaves) + "\npercentageLeavess: " + str(percentageLeaves) + "\n")
		#finish this case
		return

	#cross-validate
	crossVal(args)
Example 11
def parseAllTestXmls(fileList, grammar, allTestSolutionsDict, verbose=False, displayTrees=False):
	testPitchLists = []
	testIntervalLists = []
	totalCorrect = 0
	totalCorrectNonN = 0
	totalProductions = 0
	totalLeaves = 0
	parseTreeStrings = {}
	for filepath in fileList:
		curPitchList = getPitchListFromFile(filepath)
		testPitchLists.append(curPitchList)
		testIntervalLists.append(getIntervalStringsFromPitchList(curPitchList, verbose))
		if verbose:
			print(testIntervalLists[-1])
		listLen = len(testIntervalLists[-1])
		if verbose:
			print(listLen)
		parser = ViterbiParser(grammar)
		parser.trace(0)  # set to 3 for detailed tracing
		parses = []
		try:
			parses = parser.parse_all(testIntervalLists[-1])
		except Exception as errorMsg:
			print("error parsing file " + filepath)
			print(errorMsg)
		numTrees = sum(1 for _ in parses)
		if numTrees > 0 and displayTrees == True:
			from nltk.draw.tree import draw_trees
			draw_trees(*parses)
		if numTrees == 0:
			print("Couldn't find a valid parse, this is bad, very very bad")
			return totalCorrect, totalCorrectNonN, totalProductions, totalLeaves, parseTreeStrings
		numCorrect = 0
		numCorrectNonN = 0
		bottomCorrect = 0
		bottomCorrectNonN = 0
		solutionTree = None
		try:
			solutionTreeStr = allTestSolutionsDict[filepath]
			solutionTree = Tree.fromstring(solutionTreeStr)
		except Exception as errorMsg:
			print("couldn't find solution for file " + filepath)
			print(errorMsg)
		if solutionTree is not None and solutionTree != '':
			parseTreeStrings[filepath] = str(parses[0])
			numCorrect, numCorrectNonN = validate_tree.compareTrees(solutionTree, parses[0])
			numProductions = len(solutionTree.productions())
			totalProductions += numProductions

			#solutionTree.draw()
			#parses[0].draw()
			bottomCorrect, bottomCorrectNonN = validate_tree.compareTreesBottomUp(solutionTree, parses[0])
			parseTreeStrings[filepath+'_afterComparison'] = str(parses[0])
			totalLeaves += len(solutionTree.leaves())
			#parses[0].draw()

		totalCorrect += bottomCorrect
		totalCorrect += numCorrect
		totalCorrectNonN += numCorrectNonN
		totalCorrectNonN += bottomCorrectNonN
	return totalCorrect, totalCorrectNonN, totalProductions, totalLeaves, parseTreeStrings
Example 12
### try RandomChartParser, UnsortedChartParser, LongestChartParser

# In[ ]:

parser = nltk.parse.EarleyChartParser(grammar)
for t in parser.parse(tokens):
    print(t)
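
Following the suggestion above, the alternative probabilistic chart parsers live in `nltk.parse.pchart`; a quick sketch of trying each one on the same grammar and tokens:

from nltk.parse.pchart import (
    LongestChartParser,
    RandomChartParser,
    UnsortedChartParser,
)

# each of these requires a PCFG, like ViterbiParser below
for parser_class in (RandomChartParser, UnsortedChartParser, LongestChartParser):
    p = parser_class(grammar)
    for t in p.parse(tokens):
        print(parser_class.__name__, t)
        break  # show only the first parse found by each strategy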

# In[ ]:

### CYK parser gets the most probable parse
from nltk.parse import ViterbiParser

parser = ViterbiParser(grammar)
parser.trace(3)
parsed_sent = parser.parse_all(tokens)  # parse_all already returns a list
parsed_sent[0].draw()
for t in parsed_sent:
    print(t)

# In[ ]:

### CYK parser gets the most probable parse
from nltk.parse import ViterbiParser

parser = ViterbiParser(grammar)
parser.trace(3)
parsed_sent = list(parser.parse(tokens))  # to convert generator to list
parsed_sent[0].draw()
for t in parsed_sent:
    print(t)
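
A note on the two cells above: `ViterbiParser.parse()` returns an iterator while `parse_all()` returns a list, and since the Viterbi algorithm keeps only the single most probable tree, both yield at most one parse:

best = next(parser.parse(tokens), None)  # the most probable tree, or None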
Example 13
class MyViterbi:

    # __init__
    # Builds a PCFG from the Penn Treebank sample and sets up the tag
    # inventories used by the lastparse_* helpers.
    def __init__(self):
        self.wordToTags = defaultdict(set)
        convertedTaggedWords = [(w,nltk.tag.mapping.map_tag('en-ptb', 'universal', t)) for w,t in treebank.tagged_words()]
        for word, tag in convertedTaggedWords:
            self.wordToTags[word].add(tag)

        productions = list()
        S = nltk.Nonterminal('S')
        for tree in treebank.parsed_sents():
            productions += tree.productions()
        # create the grammar
        pcfg = nltk.induce_pcfg(S, productions)
        # print(pcfg)
        self.viterb = ViterbiParser(pcfg)
        self.mostRecentTree = None
        self.validPosTags = set()
        self.validChunkTags = set()
        self.validIOBTags = set()
        self.relationTags = set()
        self.anchorTags = set()

        # Penn Treebank POS tags
        self.validPosTags.update({
            "CC", "CD", "DT", "EX", "FW", "IN", "JJ", "JJR", "JJS", "LS",
            "MD", "NN", "NNS", "NNP", "NNPS", "PDT", "POS", "PRP", "PRP$",
            "RB", "RBR", "RBS", "RP", "SYM", "TO", "UH",
            "VB", "VBZ", "VBP", "VBD", "VBG",
            "WDT", "WP", "WP$", "WRB",
            ".", ",", ":", "(", ")",
        })

        # chunk tags
        self.validChunkTags.update({
            "NP", "PP", "VP", "ADVP", "ADJP", "SBAR", "PRT", "INTJ", "PNP",
        })

        # IOB prefixes
        self.validIOBTags.update({"I-", "O-", "B-"})

        # relation tags
        self.relationTags.update({
            "SBJ", "OBJ", "PRD", "TMP", "CLR", "LOC", "DIR", "EXT", "PRP",
        })

        # anchor tags
        self.anchorTags.update({"A1", "P1"})


    # parse
    # returns a parse tree corresponding to the given string
    #
    # param:
    #   x : the string to be parsed
    # return:
    #   the parse tree corresponding to x
    def parse(self, x):
        tokenizedSent = nltk.word_tokenize(x)
        trees = self.viterb.parse_all(tokenizedSent)
        # save the first one and then return it
        self.mostRecentTree = trees[0]
        return self.mostRecentTree


    # lastparse_label
    # returns all subtrees that has the given label for the root for the last
    # generated tree
    # param:
    #   x : the label
    # return:
    #   a list of all subtrees that have x as the label of the root
    def lastparse_label(self, x):
        # see if a previous tree exists
        if self.mostRecentTree is None:
            raise RuntimeError("No previous tree exists")
        # see if it is a POS tag
        if x not in self.validPosTags:
            # if not see if it is a chunk tag
            stringParts = x.split("-")
            if len(stringParts) == 2 and stringParts[1] not in self.relationTags:
                raise RuntimeError("Invalid relation label")
            if stringParts[0] not in self.validChunkTags:
                raise RuntimeError("Invalid tag")
        return [subtree for subtree in self.mostRecentTree.subtrees(lambda t: t.label() == x)]

    def lastparse_phrase(self, x):
        # find all subtrees of a certain type
        # see if a previous tree exists
        if self.mostRecentTree is None:
            raise RuntimeError("No previous tree exists")
        if x not in self.validChunkTags:
            raise RuntimeError("not a valid type of chunk")
        return [subtree for subtree in self.mostRecentTree.subtrees(lambda t: x in t.label())]

    # lastparse_height
    # returns the height of the tree that was just generated
    #
    # return: the height of the tree
    def lastparse_height(self):
        # see if a previous tree exists
        if self.mostRecentTree is None:
            raise RuntimeError("No previous tree exists")
        return self.mostRecentTree.height()

    # wordsFromChunks
    # helper function for taking the trees given and turning them into a lists of words
    def wordsFromChunks(self, label, alternateMode = False):
        chunks = self.lastparse_phrase(label) if alternateMode else self.lastparse_label(label)
        returnList = list()
        for chunk in chunks:
            temp = chunk.pos()
            returnList.append([word for word, pos in temp])
        return returnList

    # lastparse_nounphrase
    # returns all noun phrases of the most recently generated tree
    # return:
    #   all noun phrases
    def lastparse_nounphrase(self):
        return self.wordsFromChunks("NP", True)

    # lastparse_verbphrase
    # returns all verb phrases of the most recently generated tree
    # return:
    #   all verb phrases
    def lastparse_verbphrase(self):
        return self.wordsFromChunks("VP", True)

    # lastparse_verbs
    # returns all verbs of the most recently generated tree
    # return:
    #   all verbs
    def lastparse_verbs(self):
        result = []
        for tag in ['VB', 'VBZ', 'VBP', 'VBD', 'VBG']:
            result.extend(self.wordsFromChunks(tag))

        return result

    # lastparse_nouns
    # returns all nouns of the most recently generated tree
    # return:
    #   all nouns
    def lastparse_nouns(self):
        result = []
        for tag in ['NN', 'NNS', 'NNP', 'NNPS']:
            result.extend(self.wordsFromChunks(tag))

        return result

    def parse_with_substitution(self, x):
        tokenizedSent = nltk.word_tokenize(x)
        posTags = nltk.pos_tag(tokenizedSent, tagset='universal')
        fixedSentence = list()
        for word, tag in posTags:
            if tag in self.wordToTags[word]:
                fixedSentence.append(word)
            else:
                # substitute a known word that can carry the same tag
                for candidate, tags in self.wordToTags.items():
                    if tag in tags:
                        fixedSentence.append(candidate)
                        break

        print(fixedSentence)

        trees = self.viterb.parse_all(fixedSentence)
        # save the first one and then return it
        self.mostRecentTree = trees[0]
        return self.mostRecentTree
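The class assumes module-level imports (`nltk`, `treebank`, `ViterbiParser`, `defaultdict`). A hypothetical usage sketch; note that inducing the PCFG over the full treebank sample in `__init__` is slow, and the sentence is illustrative:

from collections import defaultdict

import nltk
from nltk.corpus import treebank
from nltk.parse import ViterbiParser

mv = MyViterbi()                 # builds the PCFG; may take a while
tree = mv.parse("the old man saw a dog")
print(mv.lastparse_height())     # height of the best parse tree
print(mv.lastparse_nouns())      # nouns found in the parse, grouped per chunk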
Example 14
def demo():
    """
    A demonstration of the probabilistic parsers.  The user is
    prompted to select which demo to run, and how many parses should
    be found; and then each parser is run on the same demo, and a
    summary of the results are displayed.
    """
    import sys
    import time
    from functools import reduce

    from nltk import tokenize
    from nltk.grammar import toy_pcfg1, toy_pcfg2
    from nltk.parse import ViterbiParser

    # Define two demos.  Each demo has a sentence and a grammar.
    demos = [
        ("I saw the man with my telescope", toy_pcfg1),
        ("the boy saw Jack with Bob under the table with a telescope",
         toy_pcfg2),
    ]

    # Ask the user which demo they want to use.
    print()
    for i in range(len(demos)):
        print(f"{i + 1:>3}: {demos[i][0]}")
        print("     %r" % demos[i][1])
        print()
    print("Which demo (%d-%d)? " % (1, len(demos)), end=" ")
    try:
        snum = int(sys.stdin.readline().strip()) - 1
        sent, grammar = demos[snum]
    except (ValueError, IndexError):
        print("Bad sentence number")
        return

    # Tokenize the sentence.
    tokens = sent.split()

    parser = ViterbiParser(grammar)
    all_parses = {}

    print(f"\nsent: {sent}\nparser: {parser}\ngrammar: {grammar}")
    parser.trace(3)
    t = time.time()
    parses = parser.parse_all(tokens)
    elapsed = time.time() - t
    average = (reduce(lambda a, b: a + b.prob(), parses, 0) /
               len(parses) if parses else 0)
    num_parses = len(parses)
    for p in parses:
        all_parses[p.freeze()] = 1

    # Print some summary statistics
    print()
    print("Time (secs)   # Parses   Average P(parse)")
    print("-----------------------------------------")
    print("%11.4f%11d%19.14f" % (time, num_parses, average))
    parses = all_parses.keys()
    if parses:
        p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
    else:
        p = 0
    print("------------------------------------------")
    print("%11s%11d%19.14f" % ("n/a", len(parses), p))

    # Ask the user if we should draw the parses.
    print()
    print("Draw parses (y/n)? ", end=" ")
    if sys.stdin.readline().strip().lower().startswith("y"):
        from nltk.draw.tree import draw_trees

        print("  please wait...")
        draw_trees(*parses)

    # Ask the user if we should print the parses.
    print()
    print("Print parses (y/n)? ", end=" ")
    if sys.stdin.readline().strip().lower().startswith("y"):
        for parse in parses:
            print(parse)