Beispiel #1
0
def pcfg_demo():
    """
    A demonstration showing how C{WeightedGrammar}s can be created and used.
    """

    from nltk.corpus import treebank
    from nltk import treetransforms
    from nltk import induce_pcfg
    from nltk.parse import pchart

    pcfg_prods = toy_pcfg1.productions()

    pcfg_prod = pcfg_prods[2]
    print 'A PCFG production:', ` pcfg_prod `
    print '    pcfg_prod.lhs()  =>', ` pcfg_prod.lhs() `
    print '    pcfg_prod.rhs()  =>', ` pcfg_prod.rhs() `
    print '    pcfg_prod.prob() =>', ` pcfg_prod.prob() `
    print

    grammar = toy_pcfg2
    print 'A PCFG grammar:', ` grammar `
    print '    grammar.start()       =>', ` grammar.start() `
    print '    grammar.productions() =>',
    # Use string.replace(...) is to line-wrap the output.
    print ` grammar.productions() `.replace(',', ',\n' + ' ' * 26)
    print

    print 'Coverage of input words by a grammar:'
    print grammar.covers(['a', 'boy'])
    print grammar.covers(['a', 'girl'])

    # extract productions from three trees and induce the PCFG
    print "Induce PCFG grammar from treebank data:"

    productions = []
    for item in treebank.items[:2]:
        for tree in treebank.parsed_sents(item):
            # perform optional tree transformations, e.g.:
            tree.collapse_unary(collapsePOS=False)
            tree.chomsky_normal_form(horzMarkov=2)

            productions += tree.productions()

    S = Nonterminal('S')
    grammar = induce_pcfg(S, productions)
    print grammar
    print

    print "Parse sentence using induced grammar:"

    parser = pchart.InsideChartParser(grammar)
    parser.trace(3)

    # doesn't work as tokens are different:
    #sent = treebank.tokenized('wsj_0001.mrg')[0]

    sent = treebank.parsed_sents('wsj_0001.mrg')[0].leaves()
    print sent
    for parse in parser.nbest_parse(sent):
        print parse
Beispiel #2
0
def pcfg_demo():
    """
    A demonstration showing how a ``PCFG`` can be induced and used.

    Productions are collected from the first three parsed sentences of
    the first treebank file, a PCFG rooted at ``S`` is induced from them,
    and the first sentence of that file is parsed with an
    ``InsideChartParser`` built on the induced grammar.  The grammar, the
    sentence tokens and every parse produced are printed.
    """

    from nltk.corpus import treebank
    from nltk.grammar import Nonterminal
    from nltk import induce_pcfg
    from nltk.parse import pchart

    # extract productions from three trees and induce the PCFG
    print("Induce PCFG grammar from treebank data:")

    productions = []
    # Use the public accessor instead of the reader's private ``_fileids``.
    item = treebank.fileids()[0]
    for tree in treebank.parsed_sents(item)[:3]:
        # perform optional tree transformations, e.g.:
        tree.collapse_unary(collapsePOS=False)
        tree.chomsky_normal_form(horzMarkov=2)

        productions += tree.productions()

    # The grammar must actually be induced here: the parser below is
    # constructed from it, and leaving it undefined raised a NameError.
    S = Nonterminal('S')
    grammar = induce_pcfg(S, productions)
    print(grammar)
    print()

    print("Parse sentence using induced grammar:")

    parser = pchart.InsideChartParser(grammar)
    parser.trace(3)

    # doesn't work as tokens are different:
    # sent = treebank.tokenized('wsj_0001.mrg')[0]

    # Parse the leaves of the first tree the grammar was induced from.
    sent = treebank.parsed_sents(item)[0].leaves()
    print(sent)
    for parse in parser.parse(sent):
        print(parse)
Beispiel #3
0
def ex7():
    """
    Using NLTK's own library to generate the probabilities.

    Induces a PCFG rooted at ``S`` from the productions of every parsed
    treebank sentence, then parses each sentence in ``sentences1``
    (split on whitespace) with a beam-limited ``InsideChartParser`` and
    prints the first parse produced.  A sentence for which the parser
    yields nothing is reported instead of raising ``IndexError``.
    """
    productions = [
        p for tree in treebank.parsed_sents() for p in tree.productions()
    ]
    pcfg = induce_pcfg(Nonterminal("S"), productions)
    parser = pchart.InsideChartParser(pcfg, beam_size=800)

    for sent in sentences1:
        # Only the first parse is shown; ``None`` marks "no parse at all".
        first_parse = next(iter(parser.parse(sent.split())), None)
        print("Parsing sent: {}".format(sent))
        if first_parse is None:
            print("No parse found")
        else:
            print(first_parse)
def PCFG_Section():
    """
    Demonstrate PCFG basics with NLTK.

    Prints the parts of one production of a hand-written toy PCFG,
    induces a PCFG rooted at ``S`` from the trees of the first two
    treebank files, and parses the first sentence of ``wsj_0001.mrg``
    with an ``InsideChartParser`` over the induced grammar, printing the
    probability of the first parse the parser yields.
    """
    toy_pcfg1 = PCFG.fromstring("""
        S -> NP VP [1.0]
        NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15]
        Det -> 'the' [0.8] | 'my' [0.2]
        N -> 'man' [0.5] | 'telescope' [0.5]
        VP -> VP PP [0.1] | V NP [0.7] | V [0.2]
        V -> 'ate' [0.35] | 'saw' [0.65]
        PP -> P NP [1.0]
        P -> 'with' [0.61] | 'under' [0.39]
    """)

    pcfg_prods = toy_pcfg1.productions()

    pcfg_prod = pcfg_prods[2]
    print('A PCFG production:', pcfg_prod)
    print('pcfg_prod.lhs()  =>', pcfg_prod.lhs())
    print('pcfg_prod.rhs()  =>', pcfg_prod.rhs())
    print('pcfg_prod.prob() =>', pcfg_prod.prob())

    # extract productions from the trees of the first two treebank files
    # and induce the PCFG
    print("Induce PCFG grammar from treebank data:")

    productions = []
    for item in treebank.fileids()[:2]:
        for tree in treebank.parsed_sents(item):
            # Optional transformations (left disabled here):
            #   tree.collapse_unary(collapsePOS=False)
            #   tree.chomsky_normal_form(horzMarkov=2)
            productions += tree.productions()

    S = Nonterminal('S')
    grammar = induce_pcfg(S, productions)

    ### Parsing section below ###

    print("\nParse sentence using induced grammar:")

    parser = pchart.InsideChartParser(grammar)
    parser.trace(1)

    # A tree read from the treebank is a plain Tree without prob(); the
    # probability has to come from a parse produced by the PCFG parser,
    # so feed the sentence's tokens through it.
    tokens = treebank.parsed_sents('wsj_0001.mrg')[0].leaves()
    # NOTE(review): pchart is documented to return parses sorted by
    # descending probability, so the first one should be the best; confirm.
    first_parse = next(iter(parser.parse(tokens)), None)
    if first_parse is None:
        print("No parse found")
    else:
        print(first_parse.prob())
Beispiel #5
0
def ex8():
    """
    Parse sentences that do not occur in the training corpus.

    Three legal-text sentences are tokenized with the Treebank word
    tokenizer and parsed with a beam-limited ``InsideChartParser`` over a
    PCFG induced from all treebank productions; the token list and the
    list of parses are printed for each sentence.
    """
    legal_sentences = [
        "If there is any conflict between the terms in the General Terms and the Additional Terms, then the Additional Terms govern .",
        "You may have additional rights under the law .",
        "We do not seek to limit those rights where it is prohibited to do so by law ."
    ]
    # One token list per sentence (sentences2).
    token_lists = []
    for text in legal_sentences:
        token_lists.append(TreebankWordTokenizer().tokenize(text))

    # Flatten the productions of every treebank tree into one list.
    rules = []
    for tree in treebank.parsed_sents():
        rules.extend(tree.productions())

    induced = induce_pcfg(Nonterminal("S"), rules)
    chart_parser = pchart.InsideChartParser(induced, beam_size=500)

    for tokens in token_lists:
        print("Parsing sent: {}".format(tokens))
        results = list(chart_parser.parse(tokens))
        print(results)
Beispiel #6
0
def demo(choice=None, draw_parses=None, print_parses=None):
    """
    A demonstration of the probabilistic parsers.  One of two built-in
    sentence/grammar demos is selected, every parser is run on that
    sentence, and a summary of the results is displayed.

    :param choice: zero-based index of the demo to run.  When ``None``
        the demos are listed and a one-based number is read from stdin.
    :param draw_parses: whether to draw the parses found.  When ``None``
        the user is asked on stdin.
    :param print_parses: whether to print the parses found.  When
        ``None`` the user is asked on stdin.
    :return: ``None``.  If the demo selection is invalid,
        ``'Bad sentence number'`` is printed and the function returns
        early.
    """
    import sys
    import time
    from nltk.parse import pchart

    # Define two demos.  Each demo has a sentence and a grammar.
    toy_pcfg1 = PCFG.fromstring("""
    S -> NP VP [1.0]
    NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15]
    Det -> 'the' [0.8] | 'my' [0.2]
    N -> 'man' [0.5] | 'telescope' [0.5]
    VP -> VP PP [0.1] | V NP [0.7] | V [0.2]
    V -> 'ate' [0.35] | 'saw' [0.65]
    PP -> P NP [1.0]
    P -> 'with' [0.61] | 'under' [0.39]
    """)

    toy_pcfg2 = PCFG.fromstring("""
    S    -> NP VP         [1.0]
    VP   -> V NP          [.59]
    VP   -> V             [.40]
    VP   -> VP PP         [.01]
    NP   -> Det N         [.41]
    NP   -> Name          [.28]
    NP   -> NP PP         [.31]
    PP   -> P NP          [1.0]
    V    -> 'saw'         [.21]
    V    -> 'ate'         [.51]
    V    -> 'ran'         [.28]
    N    -> 'boy'         [.11]
    N    -> 'cookie'      [.12]
    N    -> 'table'       [.13]
    N    -> 'telescope'   [.14]
    N    -> 'hill'        [.5]
    Name -> 'Jack'        [.52]
    Name -> 'Bob'         [.48]
    P    -> 'with'        [.61]
    P    -> 'under'       [.39]
    Det  -> 'the'         [.41]
    Det  -> 'a'           [.31]
    Det  -> 'my'          [.28]
    """)

    demos = [('I saw John with my telescope', toy_pcfg1),
             ('the boy saw Jack with Bob under the table with a telescope',
              toy_pcfg2)]

    prompt_for_choice = choice is None
    if prompt_for_choice:
        # Ask the user which demo they want to use.
        print()
        for number, (text, pcfg) in enumerate(demos, 1):
            print('%3s: %s' % (number, text))
            print('     %r' % pcfg)
            print()
        print('Which demo (%d-%d)? ' % (1, len(demos)), end=' ')
    try:
        if prompt_for_choice:
            # Non-numeric input raises ValueError, handled below like any
            # other bad selection instead of crashing the demo.
            choice = int(sys.stdin.readline().strip()) - 1
        sent, grammar = demos[choice]
    except (ValueError, IndexError, TypeError):
        print('Bad sentence number')
        return

    # Tokenize the sentence.
    tokens = sent.split()

    # Define a list of parsers.  We'll use all parsers.
    parsers = [
        pchart.InsideChartParser(grammar),
        pchart.RandomChartParser(grammar),
        pchart.UnsortedChartParser(grammar),
        pchart.LongestChartParser(grammar),
        pchart.InsideChartParser(grammar,
                                 beam_size=len(tokens) + 1)  # was BeamParser
    ]

    # Run the parsers on the tokenized sentence.
    times = []
    average_p = []
    num_parses = []
    # Keyed by frozen parse so identical parses from different parsers
    # are kept once; a dict is used so first-seen order is retained.
    all_parses = {}
    for parser in parsers:
        print('\ns: %s\nparser: %s\ngrammar: %s' % (sent, parser, grammar))
        parser.trace(3)
        t = time.time()
        parses = list(parser.parse(tokens))
        times.append(time.time() - t)
        # sum() replaces reduce(), which is not a builtin on Python 3.
        mean_prob = (sum(tree.prob() for tree in parses) / len(parses)
                     if parses else 0)
        average_p.append(mean_prob)
        num_parses.append(len(parses))
        for tree in parses:
            all_parses[tree.freeze()] = 1

    # Print some summary statistics
    print()
    print(
        '       Parser      Beam | Time (secs)   # Parses   Average P(parse)')
    print(
        '------------------------+------------------------------------------')
    for parser, secs, count, mean_prob in zip(parsers, times, num_parses,
                                              average_p):
        print('%18s %4d |%11.4f%11d%19.14f' %
              (parser.__class__.__name__, parser.beam_size, secs, count,
               mean_prob))
    parses = list(all_parses)
    p = sum(tree.prob() for tree in parses) / len(parses) if parses else 0
    print(
        '------------------------+------------------------------------------')
    print('%18s      |%11s%11d%19.14f' %
          ('(All Parses)', 'n/a', len(parses), p))

    if draw_parses is None:
        # Ask the user if we should draw the parses.
        print()
        print('Draw parses (y/n)? ', end=' ')
        draw_parses = sys.stdin.readline().strip().lower().startswith('y')
    if draw_parses:
        from nltk.draw.tree import draw_trees
        print('  please wait...')
        draw_trees(*parses)

    if print_parses is None:
        # Ask the user if we should print the parses.
        print()
        print('Print parses (y/n)? ', end=' ')
        print_parses = sys.stdin.readline().strip().lower().startswith('y')
    if print_parses:
        for parse in parses:
            print(parse)
Beispiel #7
0
def main(sentences, grammarfile, pcfg_grammar, algo, output,
         to_keeps, percent_discard, beam=0):
    """
    Parse every sentence with the selected parser, then build pruned
    grammars from the resulting parse forest.

    :param sentences: iterable of sentences; each is split on whitespace.
    :param grammarfile: path of the grammar, loaded through
        ``nltk.data.load("file:<grammarfile>")``.
    :param pcfg_grammar: path of a text file holding the PCFG source that
        is handed to ``nltk.parse_pcfg``.
    :param algo: one of ``"viterbi"``, ``"inside"``, ``"random"``,
        ``"longest"``, ``"unsorted"`` (PCFG parsers) or ``"chart"``
        (Earley chart parser over ``grammarfile``).
    :param output: passed through to ``create_pruned_grammar``.
    :param to_keeps: iterable of "keep" values; combined with every value
        of ``percent_discard``.
    :param percent_discard: iterable of "discard" values.
    :param beam: ``beam_size`` given to the pchart parsers.
    :return: ``1`` if ``algo`` is not recognized or a sentence cannot be
        parsed; otherwise the sum of the values returned by
        ``create_pruned_grammar``.
    """
    grammar = nltk.data.load("file:%s" % (grammarfile))
    chart_parser = ChartParser(grammar, strategy=EARLEY_STRATEGY, trace=0)

    # The context manager closes the file even if reading fails.
    with open(pcfg_grammar) as f:
        pcfgrammar = f.read()

    # pchart parsers that share one construction signature.
    beam_parsers = {
        "inside": pchart.InsideChartParser,
        "random": pchart.RandomChartParser,
        "longest": pchart.LongestChartParser,
        "unsorted": pchart.UnsortedChartParser,
    }

    pcfg_parser = None
    if algo == "viterbi":
        pcfg_parser = nltk.ViterbiParser(nltk.parse_pcfg(pcfgrammar))
    elif algo in beam_parsers:
        pcfg_parser = beam_parsers[algo](nltk.parse_pcfg(pcfgrammar),
                                         beam_size=beam)
    elif algo != "chart":
        print("unrecognized algorithm: %s" % (algo))
        return 1

    # "chart" uses the Earley parser; every other algorithm uses the PCFG one.
    active_parser = chart_parser if algo == "chart" else pcfg_parser

    forest = []
    for sentence in sentences:
        parsed_sent = sentence.split()
        print("parsed_sent: %s" % (parsed_sent,))
        start = datetime.now()
        trees = active_parser.nbest_parse(parsed_sent)
        elapsed = datetime.now() - start

        print("parsing time elapsed: %s" % (elapsed))
        # timedelta.microseconds is only the sub-second component; fold
        # in days and seconds so runs of a second or more are reported
        # correctly.
        total_us = ((elapsed.days * 86400 + elapsed.seconds) * 10 ** 6
                    + elapsed.microseconds)
        print("parsing time elapsed: %d us" % (total_us))

        if not trees:
            print("failed to parse: %s" % (sentence))
            return 1
        forest.append(trees)

    # Shuffle a copy so the grammar's own production list is left intact.
    all_productions = list(grammar.productions())
    # The second shuffle adds nothing statistically; it is kept so runs
    # with a seeded RNG consume the same random state as before.
    random.shuffle(all_productions)
    random.shuffle(all_productions)

    status = 0
    for keep in to_keeps:
        for discard in percent_discard:
            status += create_pruned_grammar(forest, all_productions, keep,
                                            discard, output)
    return status
Beispiel #8
0
def demo():
    """
    A demonstration of the probabilistic parsers.  The user is
    prompted to select which demo to run, and how many parses should
    be found; and then each parser is run on the same demo, and a
    summary of the results are displayed.
    """
    import sys, time
    from nltk import tokenize, toy_pcfg1, toy_pcfg2
    from nltk.parse import pchart

    # Define two demos.  Each demo has a sentence and a grammar.
    demos = [('I saw John with my telescope', toy_pcfg1),
             ('the boy saw Jack with Bob under the table with a telescope',
              toy_pcfg2)]

    # Ask the user which demo they want to use.
    print
    for i in range(len(demos)):
        print '%3s: %s' % (i + 1, demos[i][0])
        print '     %r' % demos[i][1]
        print
    print 'Which demo (%d-%d)? ' % (1, len(demos)),
    try:
        snum = int(sys.stdin.readline().strip()) - 1
        sent, grammar = demos[snum]
    except:
        print 'Bad sentence number'
        return

    # Tokenize the sentence.
    tokens = sent.split()

    # Define a list of parsers.  We'll use all parsers.
    parsers = [
        pchart.InsideChartParser(grammar),
        pchart.RandomChartParser(grammar),
        pchart.UnsortedChartParser(grammar),
        pchart.LongestChartParser(grammar),
        pchart.InsideChartParser(grammar,
                                 beam_size=len(tokens) + 1)  # was BeamParser
    ]

    # Run the parsers on the tokenized sentence.
    times = []
    average_p = []
    num_parses = []
    all_parses = {}
    for parser in parsers:
        print '\ns: %s\nparser: %s\ngrammar: %s' % (sent, parser, grammar)
        parser.trace(3)
        t = time.time()
        parses = parser.nbest_parse(tokens)
        times.append(time.time() - t)
        if parses:
            p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
        else:
            p = 0
        average_p.append(p)
        num_parses.append(len(parses))
        for p in parses:
            all_parses[p.freeze()] = 1

    # Print some summary statistics
    print
    print '       Parser      Beam | Time (secs)   # Parses   Average P(parse)'
    print '------------------------+------------------------------------------'
    for i in range(len(parsers)):
        print '%18s %4d |%11.4f%11d%19.14f' % (parsers[i].__class__.__name__,
                                               parsers[i].beam_size, times[i],
                                               num_parses[i], average_p[i])
    parses = all_parses.keys()
    if parses: p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
    else: p = 0
    print '------------------------+------------------------------------------'
    print '%18s      |%11s%11d%19.14f' % ('(All Parses)', 'n/a', len(parses),
                                          p)

    # Ask the user if we should draw the parses.
    print
    print 'Draw parses (y/n)? ',
    if sys.stdin.readline().strip().lower().startswith('y'):
        from nltk.draw.tree import draw_trees
        print '  please wait...'
        draw_trees(*parses)

    # Ask the user if we should print the parses.
    print
    print 'Print parses (y/n)? ',
    if sys.stdin.readline().strip().lower().startswith('y'):
        for parse in parses:
            print parse
# Induce a PCFG with start symbol S from the productions held in
# treeProductions (defined outside this excerpt).
S = nltk.Nonterminal("S")
grammar = nltk.induce_pcfg(S, treeProductions)

### Extracting PCFG to a text file
#grammar_PCFG = str(grammar)
#file = open('/Users/mayapetranova/Documents/QMUL/NLP/assignment_2/6/PCFG.txt', 'w')
#file.write(grammar_PCFG)
#file.close()

########################
## Q u e s t i o n 6b ##
########################

# Parse a whitespace-tokenized test sentence with an inside chart parser
# over the induced grammar and print every parse it yields.
sentence = "show me the meals on the flight from Phoenix".split()
parser = pchart.InsideChartParser(grammar)
for tp in parser.parse(sentence):
    print(tp)

## For Drawing the CFG trees, uncomment lines 59-80
#q6s1 = Tree.fromstring('(S(IVP(IVerb show)(NP (Pronoun me))(NP (Det the) (Nominal (Noun meals)))(PP(Preposition on)(NP(Det the)(Nominal(Nominal (Noun flight))(PP(Preposition from)(NP (Proper_Noun Phoenix))))))))')
#TreeView(q6s1)._cframe.print_to_file('q6s1.ps')
#q6s2 = Tree.fromstring('(S(IVP(IVerb show)(NP (Pronoun me))(NP(Det the)(Nominal(Nominal (Noun meals))(PP(Preposition on)(NP (Det the) (Nominal (Noun flight))))))(PP (Preposition from) (NP (Proper_Noun Phoenix)))))')
#TreeView(q6s2)._cframe.print_to_file('q6s2.ps')
#q6s3 = Tree.fromstring('(S(IVP(IVerb show)(NP (Pronoun me))(NP(NP (Det the) (Nominal (Noun meals)))(PP(Preposition on)(NP (Det the) (Nominal (Noun flight)))))(PP (Preposition from) (NP (Proper_Noun Phoenix)))))')
#TreeView(q6s3)._cframe.print_to_file('q6s3.ps')
#q6s4 = Tree.fromstring('(S(IVP(IVerb show)(NP (Pronoun me))(NP (Det the) (Nominal (Noun meals)))(PP(Preposition on)(NP(NP (Det the) (Nominal (Noun flight)))(PP (Preposition from) (NP (Proper_Noun Phoenix)))))))')
#TreeView(q6s4)._cframe.print_to_file('q6s4.ps')
#q6s5 = Tree.fromstring('(S(IVP(IVerb show)(NP (Pronoun me))(NP(Det the)(Nominal(Nominal(Nominal (Noun meals))(PP(Preposition on)(NP (Det the) (Nominal (Noun flight)))))(PP (Preposition from) (NP (Proper_Noun Phoenix)))))))')
#TreeView(q6s5)._cframe.print_to_file('q6s5.ps')
#q6s6 = Tree.fromstring('(S(IVP(IVerb show)(NP (Pronoun me))(NP(Det the)(Nominal(Nominal (Noun meals))(PP(Preposition on)(NP(Det the)(Nominal(Nominal (Noun flight))(PP(Preposition from)(NP (Proper_Noun Phoenix))))))))))')
Beispiel #10
0
    TEN -> 'eighty'                     [0.125]
    TEN -> 'ninety'                     [0.125]

    CD -> 'zero'                        [0.1]
    CD -> 'one'                         [0.1]
    CD -> 'two'                         [0.1]
    CD -> 'three'                       [0.1]
    CD -> 'four'                        [0.1]
    CD -> 'five'                        [0.1]
    CD -> 'six'                         [0.1]
    CD -> 'seven'                       [0.1]
    CD -> 'eight'                       [0.1]
    CD -> 'nine'                        [0.1]
""")

# Module-level inside chart parser built over GRAMMAR.
# NOTE(review): GRAMMAR is defined outside this excerpt, presumably by the
# grammar literal that ends just above; confirm.
PARSER = pchart.InsideChartParser(GRAMMAR)

# Module-level WordNetLemmatizer instance.
LEMMATIZER = WordNetLemmatizer()

def add(one, two):
    """
    Apply ``+`` to the two operands and return the result.
    """
    total = one + two
    return total

def subtract(one, two):
    """
    Apply ``-`` to the two operands (``one`` minus ``two``) and return
    the result.
    """
    difference = one - two
    return difference